Intro

alcuni codici di esempio per generare la matrice di predittori a partire dai tweets

# Downloaded from http://46.51.205.81/dump_tweets/any_ita/
# numerals = "no.loss" keeps number-like fields that cannot be converted to
# double without losing accuracy (such as the long tweet ids) as text.
train <- read.csv("dataset/train.csv", numerals = "no.loss")
str(train)

## 'data.frame':    4513 obs. of  6 variables:
##  $ id        : Factor w/ 4499 levels "125485104863780865",..: 3137 940 3216 1167 2416 2173 3496 1693 3947 3856 ...
##  $ tweet     : Factor w/ 4494 levels ":')   ❤  ❤  ❤   :')",..: 1579 914 157 2328 94 3090 1503 784 3173 2514 ...
##  $ soggettivo: logi  TRUE TRUE FALSE FALSE TRUE FALSE ...
##  $ positivo  : logi  FALSE FALSE FALSE FALSE TRUE FALSE ...
##  $ negativo  : logi  TRUE TRUE FALSE FALSE FALSE FALSE ...
##  $ ironico   : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...

# Unlabelled test set, read with the same options used for the training set.
test <- read.csv("dataset/test.csv", numerals = "no.loss")
str(test)

## 'data.frame':    1935 obs. of  2 variables:
##  $ id   : Factor w/ 1930 levels "122449983151669248",..: 862 1240 1506 460 1464 1823 602 1288 1750 185 ...
##  $ tweet: Factor w/ 1929 levels "08 aprile 2011 - 08 aprile 2012. La mia vita è cambiata completamente. #unannodiFreaks &lt;3",..: 610 512 685 507 679 1782 1133 1423 918 1178 ...

# Column names of the training set.
colnames(train)

## [1] "id"         "tweet"      "soggettivo" "positivo"   "negativo"  
## [6] "ironico"

# The test set has no labels: add the four target columns filled with NA so
# that it has the same columns as the training set and the two can be stacked.
test <- data.frame(
  test,
  soggettivo = NA, positivo = NA, negativo = NA, ironico = NA
)

# One data set holding the training rows followed by the test rows.
tweets <- rbind(train, test)
# The second column (the tweet text) is called "TEXT" from here on.
names(tweets)[2] <- "TEXT"

# The separate copies are no longer needed.
rm(train, test)

Numero di caratteri per tweet

# Number of characters of each tweet. nchar() is vectorised, so the whole
# column is processed in one call; as.character() converts the column first in
# case it was read as a factor. The result is a plain, unnamed integer vector.
nchars <- nchar(as.character(tweets$TEXT))
# Compare tweet length between subjective and non-subjective tweets.
boxplot(nchars ~ tweets$soggettivo, col = 2:3)

t.test(nchars ~ tweets$soggettivo)

## 
##  Welch Two Sample t-test
## 
## data:  nchars by tweets$soggettivo
## t = -0.67863, df = 2440.6, p-value = 0.4974
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.866498  1.392550
## sample estimates:
## mean in group FALSE  mean in group TRUE 
##            100.6152            101.3522

# Tweet length versus the "positivo" label: plot, then Welch two-sample t-test.
boxplot(nchars ~ tweets$positivo, col = 2:3)

t.test(nchars ~ tweets$positivo)

## 
##  Welch Two Sample t-test
## 
## data:  nchars by tweets$positivo
## t = 4.8761, df = 2141.3, p-value = 1.162e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  3.385346 7.940279
## sample estimates:
## mean in group FALSE  mean in group TRUE 
##           102.76498            97.10217

# Tweet length versus the "negativo" label: plot, then Welch two-sample t-test.
boxplot(nchars ~ tweets$negativo, col = 2:3)

t.test(nchars ~ tweets$negativo)

## 
##  Welch Two Sample t-test
## 
## data:  nchars by tweets$negativo
## t = -10.925, df = 4457.9, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -12.619791  -8.779747
## sample estimates:
## mean in group FALSE  mean in group TRUE 
##            96.39968           107.09945

pare essere interessante, lo teniamo come predittore

Normalizzazione del testo

Si possono seguire molte strade, ovviamente.

Trovate un'accurata review qui (chi vuole approfondire poi condivida anche): http://sentiment.christopherpotts.net/

library(TextWiller)

## Loading required package: stringr

## Loading required package: twitteR

## Loading required package: RCurl

## Loading required package: bitops

## Loading required package: SnowballC

# Convert the texts to UTF-8, then keep a copy of them as they are before any
# normalisation step is applied.
tweets$TEXT <- iconv(tweets$TEXT, to = "UTF-8")
tweets$TEXTorig <- tweets$TEXT

Gestione emoticons

# Normalise emoticons with normalizzaemote(), then count how many tweets
# contain the string "EMOTE" afterwards.
tweets$TEXT <- normalizzaemote(tweets$TEXT)
sum(grepl("EMOTE", tweets$TEXT))

## [1] 414

normalizzo i testi

# Drop the ellipsis character, then normalise the texts. The patterns passed
# as contaStringhe are the ones whose counts are read back afterwards from the
# "counts" attribute of the result.
tweets$TEXT <- gsub("…", "", tweets$TEXT)
tweets$TEXT <- normalizzaTesti(
  tweets$TEXT,
  contaStringhe = c("\\?", "\\!", "@", "#", "(\u20AC|euro)", "(\\$|dollar)")
)

Salvo i conteggi delle parole specificate come matrice a parte

# Pattern counts attached to the normalised texts, stored as a data frame.
conteggi_caratteri <- as.data.frame(attributes(tweets$TEXT)$counts)

faccio a mano alcuni preprocessamenti che possono essere sfuggiti, elimino a questo punto i # degli hashtag (questi sono già stati contati sopra). Mi aiuto anche con l’analisi degli n-grammi (vedi codice commentato)

############ search for the most frequent n-grams
# install.packages("tau")
# require(tau)
#
# bigrams <- textcnt(tweets$TEXT,method="string",n=2L,split="[[:blank:]]")
# sort(bigrams,decreasing=TRUE)[1:10]
# sort(bigrams[grep("^stadio",names(bigrams))],decreasing=TRUE)[1:10]

# trigrams <- textcnt(tweets$TEXT,method="string",n=3L,split="[[:blank:]]")
# sort(trigrams,decreasing=TRUE)[1:10]


# Strip the hashtag marker, then remove stop words.
tweets$TEXT <- gsub("#", "", tweets$TEXT)
tweets$TEXT <- removeStopwords(tweets$TEXT, stopwords = c(itastopwords, "…"))

# Restore the accent on a standalone "piu" and repair a garbled "perché".
tweets$TEXT <- gsub("( |^)piu( |$)", " più ", tweets$TEXT)
tweets$TEXT <- gsub("perch쎩", "perché", tweets$TEXT)
# tweets$TEXT <- gsub("tweet not available", "tweet_not_available", tweets$TEXT)
# Join frequent two-word expressions into single tokens.
tweets$TEXT <- gsub("mario monti", "mario_monti", tweets$TEXT)
tweets$TEXT <- gsub("governo monti", "governo_monti", tweets$TEXT)
tweets$TEXT <- gsub("professor monti", "professor_monti", tweets$TEXT)

Crea Document Term Matrix

(= una riga per tweet, una colonna per ogni parola)

library(tm)

## Loading required package: NLP

corpus <- Corpus(VectorSource(tweets$TEXT))
# Additional stop-word list loaded with TextWiller.
data(itastopwords)
# Document-term matrix: one row per tweet, one column per term.
# wordLengths = c(2, Inf) keeps every term of at least two characters. The
# former `minWordLength = 2` is not a control option of current tm releases,
# so it was silently ignored and the default minimum length of 3 was applied;
# any dimensions recorded with the old setting no longer match.
dtm <- as.matrix(DocumentTermMatrix(
  corpus,
  control = list(
    stemming = FALSE, stopwords = itastopwords,
    wordLengths = c(2, Inf), removeNumbers = TRUE,
    removePunctuation = FALSE, bounds = list(local = c(1, Inf))
  )
)) # dictionary=

# tweets=tweets[ids,]

Aggiungo alla dtm i conteggi delle parole generati in fase di normalizzazione

# Append the pattern counts as extra columns of the document-term matrix.
dtm <- cbind(dtm, conteggi_caratteri)

controllo che non ci siano colonne costantemente 0.

# Columns whose total is zero (none are expected).
which(colSums(dtm) == 0)

## named integer(0)

ok.

Dimensione della dtm

# Number of rows (tweets) and of columns (predictors).
c(nrow(dtm), ncol(dtm))

## [1]  6448 15283

Assegna sentiment ai tw

lo useremo come predittore in futuro

# Sentiment score of each tweet, stored both as `sent` and as a column.
sent <- sentiment(tweets$TEXT)
tweets$sent <- sent
# Share of tweets for each score; exclude = NULL also lists missing values.
prop.table(table(tweets$sent, exclude = NULL))

## 
##        -1         0         1      <NA> 
## 0.3070720 0.3334367 0.3594913 0.0000000

# Bar chart of the sentiment score frequencies.
barplot(table(tweets$sent), col = 2:4)

legato a soggettivo?

# Sentiment score by "soggettivo" label, with row-wise proportions.
tab <- table(tweets$sent, tweets$soggettivo)
prop.table(tab, margin = 1)

##     
##          FALSE      TRUE
##   -1 0.2226225 0.7773775
##   0  0.3811075 0.6188925
##   1  0.2402516 0.7597484

# Plot the table and test independence between score and label.
mosaicplot(tab, col = 1:2)

chisq.test(tab)

## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 112.13, df = 2, p-value < 2.2e-16

legato a positivo?

# Sentiment score by "positivo" label, with row-wise proportions.
tab <- table(tweets$sent, tweets$positivo)
prop.table(tab, margin = 1)

##     
##          FALSE      TRUE
##   -1 0.8602305 0.1397695
##   0  0.7706840 0.2293160
##   1  0.5308176 0.4691824

# Plot the table and test independence between score and label.
mosaicplot(tab, col = 1:2)

chisq.test(tab)

## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 430.52, df = 2, p-value < 2.2e-16

legato a negativo?

# Sentiment score by "negativo" label, with row-wise proportions.
tab <- table(tweets$sent, tweets$negativo)
prop.table(tab, margin = 1)

##     
##          FALSE      TRUE
##   -1 0.3609510 0.6390490
##   0  0.6071661 0.3928339
##   1  0.6786164 0.3213836

# Plot the table and test independence between score and label.
mosaicplot(tab, col = 1:2)

chisq.test(tab)

## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 327.1, df = 2, p-value < 2.2e-16

legato a ironico?

# Sentiment score by "ironico" label, with row-wise proportions.
tab <- table(tweets$sent, tweets$ironico)
prop.table(tab, margin = 1)

##     
##          FALSE      TRUE
##   -1 0.8674352 0.1325648
##   0  0.8794788 0.1205212
##   1  0.8748428 0.1251572

# Plot the table and test independence between score and label.
mosaicplot(tab, col = 1:2)

chisq.test(tab)

## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 0.97197, df = 2, p-value = 0.6151

Creo il dataset dei predittori

# Predictors: two sentiment indicators, the tweet length and the dtm columns.
X <- cbind(
  sentPOS = tweets$sent == 1,
  sentNEG = tweets$sent == -1,
  nchars = nchars,
  dtm
)

Calcolo punteggi

faremo sempre riferimento al regolamento ufficiale:

http://clic.humnet.unipi.it/proceedings/Proceedings-EVALITA-2014.pdf (vedi “Overview of the Evalita 2014 SENTIment POLarity Classification Task” a pg 50)

alcune funzioni utili

è utile definire le funzioni che calcolano i punteggi finali:

# Precision for the class coded as 1: among the cases predicted as 1, the
# share whose true label is 1.
#   true, predicted: vectors of the same length, compared against 1.
# Returns a single number; NaN when no case is predicted as 1.
precision <- function(true, predicted) {
  flagged <- predicted == 1
  hits <- true[flagged] == 1
  sum(hits) / sum(flagged)
}

# Recall for the class coded as 1: among the cases whose true label is 1, the
# share that is predicted as 1.
#   true, predicted: vectors of the same length, compared against 1.
# Returns a single number; NaN when no case has true label 1.
recall <- function(true, predicted) {
  actual <- true == 1
  found <- predicted[actual] == 1
  sum(found) / sum(actual)
}

# F-score (harmonic mean of precision and recall) for the class coded as 1.
#   true, predicted: vectors of the same length, compared against 1 by
#                    recall() and precision().
#   verbatim:        if TRUE, print recall, precision and F before returning.
# Returns the F-score as a single number. When it is undefined (no case
# predicted as 1, no case with true label 1, or no true positive) the score is
# 0 instead of NaN, so that averages of several F-scores remain usable.
F_class <- function(true, predicted, verbatim = TRUE) {
  rec <- recall(true, predicted)
  pre <- precision(true, predicted)
  Fsc <- 2 * (pre * rec) / (pre + rec)
  # 0/0 in precision, in recall or in the harmonic mean yields NaN.
  if (is.nan(Fsc)) {
    Fsc <- 0
  }
  if (verbatim) print(c(recall = rec, precision = pre, F = Fsc))
  Fsc
}

Task1: subjectivity classification

Punteggio finale task 1: (F_obj+F_soggettivo)/2

(F_class( true==1,predetto==1) +F_class( true==0,predetto==0))/2

Task2: polarity classification

Punteggio finale task 2: ((F_neg_0+F_neg_1)/2+(F_pos_0+F_pos_1)/2)/2

per il modello che prevede positivo: F_pos_1=F_class( true==1,predetto==1), F_pos_0=F_class( true==0,predetto==0)

Uguale per il modello che prevede negativo: F_neg_1=F_class( true==1,predetto==1), F_neg_0=F_class( true==0,predetto==0)

Task3: irony classification

come task 1

salvo i dataset creati

salvo due data.frame e le funzioni per calcolare i punteggi finali:

tweets che contiene i testi originali ed anche le classificazioni dei task 1, 2 e 3.
X la matrice dei predittori: dtm + sentiment + nchar

# Store the two data sets and the scoring functions in a single file.
save(tweets, X, precision, recall, F_class, file = "dati_FunScore.Rdata")

buon lavoro!!

Preprocessing tweets

Livio Finos

05/03/2016