Intro

Leggo i dati preprocessati e le funzioni per calcolare l’errore:

# Load the preprocessed objects used throughout the script (`X`, `tweets`
# and, according to the surrounding text, the error-scoring helpers).
# The former `rm(list = ls())` was removed: wiping the caller's workspace is
# a side effect a script should not have, and nothing below relies on it.
load("dati_FunScore.Rdata")

# Replace the column names
# [1] "Conteggi.\\?"          "Conteggi.\\!"
# [3] "Conteggi.@"            "Conteggi.#"
# [5] "Conteggi.(€|euro)"     "Conteggi.(\\$|dollar)"
# with names free of special characters (simbolo1, simbolo2, ...).
contgg <- grep("Conteggi\\.", names(X))
# seq_along() yields an empty sequence when no column matches, whereas
# 1:length() would yield c(1, 0).
names(X)[contgg] <- paste0("simbolo", seq_along(contgg))

# Turn the response into a numeric 0/1 variable.
tweets$soggettivo <- tweets$soggettivo * 1

Creo il dataset di test per usarlo in seguito e riduco tweets al solo training set:

# Seed the random number generator (no random draw happens in this chunk).
set.seed(1)

# Rows whose label is missing form the test set; labelled rows are the
# training set. The mask is computed once, before `tweets` is overwritten.
is_unlabelled <- is.na(tweets$soggettivo)

test <- tweets[is_unlabelled, ]
dim(test)

## [1] 1935    8

Xtest <- X[is_unlabelled, ]
dim(Xtest)

## [1]  1935 15286

X <- X[!is_unlabelled, ]
dim(X)

## [1]  4513 15286

tweets <- tweets[!is_unlabelled, ]
dim(tweets)

## [1] 4513    8

Riduzione della DTM

# The first three columns are the sentiment scores and the character count;
# the first eight names are listed to show where the term columns begin.
names(X)[1:8]

## [1] "sentPOS"                        "sentNEG"                       
## [3] "nchars"                         "♩♪♬"                           
## [5] "\U0001f60d\U0001f618✌"          "\u270b\U0001f60f\U0001f612"    
## [7] "\U0001f60d\U0001f60d\U0001f60d" "\U0001f602\U0001f602\U0001f602"

# Size of the training matrix before rare terms are removed.
dim(X)

## [1]  4513 15286

Molte parole compaiono pochissime volte; ad esempio, tengo solo quelle che sono presenti almeno 20 volte.

# Distribution of the column totals over the training rows, excluding the
# first three (non-term) columns.
table(colSums(X[, -seq_len(3)]))

## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
## 3316 7976 1572  701  405  281  167  133   91   77   58   52   44   39   35 
##   15   16   17   18   19   20   21   22   23   24   25   26   27   28   29 
##   27   19   21   10   16   11   10   14   16   13   10   16    6    7   12 
##   30   31   32   33   34   35   36   37   38   39   40   41   42   43   44 
##    1    2    5    5    8    8    7    1    4    2    2    2    4    1    1 
##   45   46   48   49   50   51   52   54   55   57   58   59   60   61   62 
##    2    4    1    2    2    1    1    2    2    2    1    1    1    2    1 
##   63   64   66   67   68   69   72   73   77   78   79   80   87   90   91 
##    1    1    2    2    2    1    2    2    1    2    1    1    1    1    1 
##   94   98  100  101  109  115  117  121  123  127  130  145  150  156  172 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
##  174  175  179  258  286  764  793  970  972 1070 1476 1492 1825 3823 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1

# Keep only the term columns whose total count over the training rows
# reaches `min_count`; the first three columns are always retained.
min_count <- 20
dtm <- X[, -(1:3), drop = FALSE]
# drop = FALSE keeps a data frame (and the term's own column name) even when
# a single term survives the filter.
dtm <- dtm[, colSums(dtm) >= min_count, drop = FALSE]
X <- cbind(X[, 1:3], dtm)
rm(dtm)
dim(X)

## [1] 4513  246

Albero di Classificazione per Task 1

# Fit a classification tree for the subjectivity label using every column of
# X as a predictor, then print the complexity-parameter table.
library(rpart)
modrp <- rpart(formula = tweets$soggettivo ~ ., data = X, method = "class")
printcp(modrp)

## 
## Classification tree:
## rpart(formula = tweets$soggettivo ~ ., data = X, method = "class")
## 
## Variables actually used in tree construction:
## [1] persia    sentNEG   simbolo2  simbolo4  wwwurlwww
## 
## Root node error: 1276/4513 = 0.28274
## 
## n= 4513 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.043103      0   1.00000 1.00000 0.023709
## 2 0.036834      2   0.91379 0.96082 0.023419
## 3 0.019592      3   0.87696 0.87696 0.022735
## 4 0.018809      4   0.85737 0.86991 0.022673
## 5 0.010000      5   0.83856 0.83856 0.022391

Tabella di confusione

# Training-set confusion table: true labels (rows) against the classes
# assigned by rpart itself (columns).
table(tweets$soggettivo,predict(modrp,type = "class"))

##    
##        0    1
##   0  592  684
##   1  386 2851

Threshold:

# Fitted training-set probabilities (second column of the prediction matrix).
prob_tree <- predict(modrp)[, 2]
# Threshold = quantile of those probabilities at the level given by the share
# of 0 labels in the training set.
thr <- quantile(prob_tree, probs = 1 - mean(tweets$soggettivo))
thr

## 28.27388% 
## 0.6028571

predetti <- prob_tree > thr

Tabella di confusione con threshold

# Cross-tabulate the true labels against the thresholded predictions
# (`predetti`). The previous call tabulated rpart's own class assignment,
# which ignores `thr` and merely repeats the earlier confusion table.
# NOTE: the output printed below predates this fix and must be regenerated.
table(tweets$soggettivo, predetti)

##    
##        0    1
##   0  592  684
##   1  386 2851

Calcolo punteggio su training set (ottimistico)

Scelgo una soglia, non lascio decidere a rpart

# Score for the label-1 class, using the thresholded predictions.
F_soggettivo <- F_class(true = tweets$soggettivo, predetti)

##    recall precision         F 
## 0.8155700 0.8288854 0.8221738

# Score for the label-0 class: both truth and predictions are flipped.
F_obj <- F_class(true = tweets$soggettivo == 0, !predetti)

##    recall precision         F 
## 0.5728840 0.5504518 0.5614439

# Average of the two per-class scores.
(F_obj + F_soggettivo) / 2

## [1] 0.6918089

Previsioni su Test set

# Classify the test rows with the threshold computed on the training set.
yhat <- predict(modrp, newdata = Xtest)[, 2] > thr
# write.table() does not create missing directories, so make sure the output
# folder exists before writing.
if (!dir.exists("previsioni")) {
  dir.create("previsioni", recursive = TRUE)
}
write.table(yhat, file = "previsioni/predicted_task1_tree.txt",
            row.names = FALSE, col.names = FALSE)

Massima estensione e Potatura dell’albero

Massima estensione

# Grow the tree with cp = 0 (no complexity penalty stops the splitting) and
# xval = 10 cross-validations, then print the complexity-parameter table.
modrp.ext <- rpart(formula = tweets$soggettivo ~ ., data = X, method = "class",
                   xval = 10, cp = 0)
printcp(modrp.ext)

## 
## Classification tree:
## rpart(formula = tweets$soggettivo ~ ., data = X, method = "class", 
##     xval = 10, cp = 0)
## 
## Variables actually used in tree construction:
##  [1] @addthis      buon          ciao          crisi         dopo         
##  [6] governo       governo_monti grillo        italia        lega         
## [11] manovra       mario_monti   ministri      nchars        non          
## [16] oggi          persia        piazzapulita  più           politica     
## [21] premier       roma          senato        sentNEG       sentPOS      
## [26] simbolo1      simbolo2      simbolo3      simbolo4      stampa       
## [31] vita          voti          wwwurlwww     @youtube     
## 
## Root node error: 1276/4513 = 0.28274
## 
## n= 4513 
## 
##            CP nsplit rel error  xerror     xstd
## 1  0.04310345      0   1.00000 1.00000 0.023709
## 2  0.03683386      2   0.91379 0.99451 0.023669
## 3  0.01959248      3   0.87696 0.88558 0.022809
## 4  0.01880878      4   0.85737 0.87461 0.022714
## 5  0.00822884      5   0.83856 0.84013 0.022406
## 6  0.00626959      7   0.82210 0.84404 0.022441
## 7  0.00548589      8   0.81583 0.83386 0.022348
## 8  0.00313480      9   0.81034 0.82837 0.022297
## 9  0.00293887     11   0.80408 0.83934 0.022398
## 10 0.00235110     17   0.78448 0.83621 0.022370
## 11 0.00195925     22   0.77194 0.83699 0.022377
## 12 0.00182863     24   0.76803 0.84326 0.022434
## 13 0.00156740     28   0.75940 0.84326 0.022434
## 14 0.00125392     42   0.73197 0.84639 0.022463
## 15 0.00117555     53   0.71317 0.85502 0.022541
## 16 0.00104493     58   0.70690 0.85345 0.022527
## 17 0.00097962     63   0.70141 0.85815 0.022569
## 18 0.00078370     71   0.69357 0.85893 0.022576
## 19 0.00058777     88   0.67947 0.87147 0.022687
## 20 0.00052247     97   0.67398 0.87461 0.022714
## 21 0.00044783    103   0.67085 0.88088 0.022769
## 22 0.00039185    129   0.65909 0.88401 0.022796
## 23 0.00031348    131   0.65831 0.88480 0.022802
## 24 0.00026123    136   0.65674 0.88323 0.022789
## 25 0.00019592    139   0.65596 0.88323 0.022789
## 26 0.00000000    143   0.65517 0.90047 0.022935

# Draw the structure of the fully grown tree (no text() labels are added).
plot(modrp.ext)

# Visualise the cross-validation results stored in the fitted object.
plotcp(modrp.ext)

Tabella di confusione

# Training-set confusion table for the fully grown tree: true labels (rows)
# against the classes assigned by rpart (columns).
table(tweets$soggettivo,predict(modrp.ext,type = "class"))

##    
##        0    1
##   0  705  571
##   1  265 2972

Calcolo punteggio su training set (ottimistico)

# Fitted training-set probabilities from the fully grown tree
# (second column of the prediction matrix).
prob_ext <- predict(modrp.ext)[, 2]
# Threshold = quantile of those probabilities at the level given by the share
# of 0 labels in the training set.
thr <- quantile(prob_ext, probs = 1 - mean(tweets$soggettivo))
thr

## 28.27388% 
## 0.6666667

predetti <- prob_ext > thr
# Score for the label-1 class, using the thresholded predictions.
F_soggettivo <- F_class(true = tweets$soggettivo, predetti)

##    recall precision         F 
## 0.8541860 0.8640625 0.8590959

# Score for the label-0 class: both truth and predictions are flipped.
F_obj <- F_class(true = tweets$soggettivo == 0, !predetti)

##    recall precision         F 
## 0.6590909 0.6405179 0.6496717

# Average of the two per-class scores.
(F_obj + F_soggettivo) / 2

## [1] 0.7543838

Previsioni su Test set

# Classify the test rows with the fully grown tree and the threshold computed
# on the training set.
yhat <- predict(modrp.ext, newdata = Xtest)[, 2] > thr
# write.table() does not create missing directories, so make sure the output
# folder exists before writing.
if (!dir.exists("previsioni")) {
  dir.create("previsioni", recursive = TRUE)
}
write.table(yhat, file = "previsioni/predicted_task1_tree_max.txt",
            row.names = FALSE, col.names = FALSE)

Potatura

# Cut the fully grown tree back at complexity parameter 0.005 and print the
# resulting complexity-parameter table.
modrp.prune <- prune(modrp.ext, cp = 0.005)
printcp(modrp.prune)

## 
## Classification tree:
## rpart(formula = tweets$soggettivo ~ ., data = X, method = "class", 
##     xval = 10, cp = 0)
## 
## Variables actually used in tree construction:
## [1] buon      non       oggi      persia    sentNEG   sentPOS   simbolo2 
## [8] simbolo4  wwwurlwww
## 
## Root node error: 1276/4513 = 0.28274
## 
## n= 4513 
## 
##          CP nsplit rel error  xerror     xstd
## 1 0.0431034      0   1.00000 1.00000 0.023709
## 2 0.0368339      2   0.91379 0.99451 0.023669
## 3 0.0195925      3   0.87696 0.88558 0.022809
## 4 0.0188088      4   0.85737 0.87461 0.022714
## 5 0.0082288      5   0.83856 0.84013 0.022406
## 6 0.0062696      7   0.82210 0.84404 0.022441
## 7 0.0054859      8   0.81583 0.83386 0.022348
## 8 0.0050000      9   0.81034 0.82837 0.022297

# Draw the structure of the pruned tree (no text() labels are added).
plot(modrp.prune)

# Visualise the cross-validation results stored in the pruned object.
plotcp(modrp.prune)

Calcolo punteggio su training set (ottimistico)

# Fitted training-set probabilities from the pruned tree
# (second column of the prediction matrix).
prob_prune <- predict(modrp.prune)[, 2]
# Threshold = quantile of those probabilities at the level given by the share
# of 0 labels in the training set.
thr <- quantile(prob_prune, probs = 1 - mean(tweets$soggettivo))
thr

## 28.27388% 
##  0.744186

predetti <- prob_prune > thr
# Score for the label-1 class, using the thresholded predictions.
F_soggettivo <- F_class(true = tweets$soggettivo, predetti)

##    recall precision         F 
## 0.8183503 0.8291080 0.8236940

# Score for the label-0 class: both truth and predictions are flipped.
F_obj <- F_class(true = tweets$soggettivo == 0, !predetti)

##    recall precision         F 
## 0.5721003 0.5538695 0.5628373

# Average of the two per-class scores.
(F_obj + F_soggettivo) / 2

## [1] 0.6932657

Previsioni su Test set

# Classify the test rows with the pruned tree and the threshold computed on
# the training set.
yhat <- predict(modrp.prune, newdata = Xtest)[, 2] > thr
# write.table() does not create missing directories, so make sure the output
# folder exists before writing.
if (!dir.exists("previsioni")) {
  dir.create("previsioni", recursive = TRUE)
}
write.table(yhat, file = "previsioni/predicted_task1_tree_prune.txt",
            row.names = FALSE, col.names = FALSE)

Esempio Analisi con albero di classificazione

Livio Finos

09/06/2016

Intro

Riduzione della DTM

Albero di Classificazione per Task 1

Calcolo punteggio su training set (ottimistico)

Previsioni su Test set

Massima estensione e Potatura dell’albero

Massima estensione

Calcolo punteggio su training set (ottimistico)

Previsioni su Test set

Potatura

Calcolo punteggio su training set (ottimistico)

Previsioni su Test set