# analysis.R (from a fork of meefen/twitter-hashtag-analytics)
setwd("~/src/r/twitter-analytics/twitter-hashtag-analytics")
source("get_tweets.R")
source("munge_tweets.R")
source("utilities.R")
# get tweets
df <- GetTweetsBySearch('#LAK13')
names(df)
head(df, 3)
# removing odd characters
df <- RemoveOddChars(df)
# extract user info and add to df
df <- ExtractUserInfo(df)
# how many unique Twitter accounts in the sample
length(unique(df$screenName))
# Anonymize users by creating random identifiers for each user; the retweet-ratio and
# network sections below use the anonymised rand.df, so run this step rather than skipping it
rand.df <- AnonymizeUsers(df)
# Count table
countsSortSubset <- GetTweetCountTable(df, "screenName")
countRTSortSubset <- GetTweetCountTable(df, "rt")
countToSortSubset <- GetTweetCountTable(df, "to")
# combine counts into one data frame
counts1 <- merge(countsSortSubset, countRTSortSubset, by = "user", all.x=TRUE)
counts <- merge(counts1, countToSortSubset, by = "user", all.x = TRUE)
colnames(counts) <- c("user", "tweets", "rt", "to") # merge order above is tweets, then rt, then to
counts[is.na(counts)] <- 0
counts$tweets <- as.numeric(counts$tweets)
counts$to <- as.numeric(counts$to)
counts$rt <- as.numeric(counts$rt)
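# For reference only: a rough sketch of the kind of helper assumed above. The real
# GetTweetCountTable() lives in utilities.R and may differ; this just shows the expected
# shape of its output (a "user" column plus a count column, ready for merging).
SketchCountTable <- function(data, col) {
  tab <- sort(table(data[[col]]), decreasing = TRUE)
  data.frame(user = names(tab), counts = as.integer(tab), stringsAsFactors = FALSE)
}
# head(SketchCountTable(df, "screenName"))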
# create a Cleveland dot plot of tweet, reply, and retweet counts per Twitter account
# (each variable gets its own colour)
library(ggplot2)
library(reshape2)
require(grid) # provides unit(), used for the plot margins further down
counts.melt <- melt(counts, id.vars = c("user"))
ggplot(counts.melt, aes(x = user, y = value, color = variable)) +
  geom_point() + geom_line() + coord_flip() +
  ggtitle("Tweets, replies, and retweets per account")
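# Optional readability tweak (not in the original analysis): order the accounts by their
# total activity so the flipped axis lists the busiest accounts first, then re-plot.
counts.melt$user <- reorder(counts.melt$user, counts.melt$value, FUN = sum)
ggplot(counts.melt, aes(x = user, y = value, color = variable)) +
  geom_point() + coord_flip() +
  ggtitle("Tweets, replies, and retweets per account (ordered by total activity)")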
# calculate the number of followers of each Twitter account
# extract the usernames from the non-anonymised dataset
users <- unique(df$screenName)
users <- sapply(users, as.character)
# make a data frame for further manipulation
users.df <- data.frame(users = users, followers = "", stringsAsFactors = FALSE)
# loop to populate users.df$followers with a follower count obtained from the Twitter API
for (i in 1:nrow(users.df)) {
  # skip a user if their account is protected or some other error occurs
  result <- try(getUser(users.df$users[i])$followersCount, silent = FALSE)
  if (inherits(result, "try-error")) next
  # store the follower count returned above (avoids a second API call per user)
  users.df$followers[i] <- result
  # pause for 60 s between iterations to avoid exceeding the Twitter API request limit;
  # with many users this takes a long time, so it is a good idea to let it run overnight
  print('Sleeping for 60 seconds...')
  Sys.sleep(60)
}
# merge the follower counts with the per-author tweet counts built earlier
followerCounts <- merge(counts, users.df, by.x = "user", by.y = "users")
# convert the values to numeric for further analysis
followerCounts$followers <- as.numeric(followerCounts$followers)
followerCounts$tweets <- as.numeric(followerCounts$tweets)
# plot number of messages sent against number of followers, sized by retweet count
ggplot(data = followerCounts, aes(tweets, followers)) +
  geom_text(aes(label = user, size = rt)) +
  scale_size(range = c(3, 10)) +
  scale_x_log10(breaks = c(10, 20, 40, 60, 80, 100)) +
  scale_y_log10(breaks = c(10, 100, seq(1000, 7000, 1000))) +
  xlab("Number of Messages") +
  ylab("Number of Followers") +
  theme_bw() +
  theme(axis.title.x = element_text(vjust = -0.5, size = 14)) +
  theme(axis.title.y = element_text(size = 14, angle = 90)) +
  theme(plot.margin = unit(c(1, 1, 2, 2), "lines"))
# Ratio of retweets to tweets for some more insights
#
# make table with counts of tweets per person
t <- as.data.frame(table(rand.df$randuser))
# make table with counts of retweets per person
rt <- as.data.frame(table(rand.df$rt.rand))
# combine tweet count and retweet count per person
t.rt <- merge(t,rt,by="Var1")
# create a new column with the ratio of retweets to original tweets
t.rt["ratio"] <- t.rt$Freq.y / t.rt$Freq.x
# sort it to put names in order by ratio
sort.t.rt <- t.rt[order(t.rt$ratio),]
# exclude those with 2 tweets or less
sort.t.rt.subset <- subset(sort.t.rt,sort.t.rt$Freq.x>2)
#
# drop unused levels leftover from subsetting
sort.t.rt.subset.drop <- droplevels(sort.t.rt.subset)
# plot nicely ordered retweet/tweet ratios for people with more than 2 tweets
ggplot(sort.t.rt.subset.drop, aes(reorder(Var1, ratio), ratio)) +
  xlab("Author") +
  ylab("Ratio of messages retweeted by others to original messages") +
  geom_point() +
  coord_flip() +
  theme_bw() +
  theme(axis.title.x = element_text(vjust = -0.5, size = 14)) +
  theme(axis.title.y = element_text(size = 14, angle = 90)) +
  theme(plot.margin = unit(c(1, 1, 2, 2), "lines"))
# extract tweeter-retweeted pairs
rt <- data.frame(user=rand.df$randuser, rt=rand.df$rt.rand)
# omit pairs with NA and get only unique pairs
rt.u <- na.omit(unique(rt))
# begin social network analysis plotting
require(igraph)
require(sna)
degree <- sna::degree # both igraph and sna define degree(); make sure we use sna's version below
g <- graph.data.frame(rt.u, directed = F)
g <- as.undirected(g)
g.adj <- as.matrix(get.adjacency(g)) # dense adjacency matrix for the sna functions below
# plot network graph
gplot(g.adj, usearrows = FALSE,
      vertex.col = "grey", vertex.border = "black",
      displaylabels = FALSE, edge.lwd = 0.01,
      edge.col = "grey30", vertex.cex = 0.5)
# get some basic network attributes
gden(g.adj) # density
grecip(g.adj) # reciprocity
gtrans(g.adj) # transitivity
centralization(g.adj, degree) # degree centralisation
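# Optional cross-check (an assumption about your igraph version, not part of the original
# analysis): recent igraph releases can compute the same statistics directly on the graph.
igraph::edge_density(g)                  # density
igraph::reciprocity(g)                   # reciprocity
igraph::transitivity(g, type = "global") # transitivity
igraph::centr_degree(g)$centralization   # degree centralisation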
# calculate Univariate Conditional Uniform Graph Tests
# density
print(cug.gden <- cug.test(g.adj, gden))
plot(cug.gden)
range(cug.gden$rep.stat)
# reciprocity
print(cug.recip <- cug.test(g.adj, grecip))
plot(cug.recip)
range(cug.recip$rep.stat)
# transitivity
print(cug.gtrans <- cug.test(g.adj, gtrans))
plot(cug.gtrans)
range(cug.gtrans$rep.stat)
# centralisation
print(cug.cent <- cug.test(g.adj, centralization, FUN.args = list(FUN = degree)))
# find out how many communities exist in the network using the walktrap
g.wc <- walktrap.community(g, steps = 1000, modularity=TRUE)
plot(as.dendrogram(g.wc))
length(g.wc) # number of communities found
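# A quick look at the communities themselves, using igraph's standard accessors
# (assuming a reasonably recent igraph version):
sizes(g.wc)            # number of members in each community
head(membership(g.wc)) # community assignment for the first few users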
# some basic and widely-used text mining techniques to identify the issues
# that captured the attention of the people tweeting with this hashtag
require(tm)
a <- Corpus(VectorSource(df$text)) # create corpus object
a <- tm_map(a, tolower) # convert all text to lower case
a <- tm_map(a, removePunctuation)
a <- tm_map(a, removeNumbers)
a <- tm_map(a, removeWords, stopwords("english")) # edit this stopword list and repeat this step a few times to remove high-frequency, context-specific words with no semantic value
require(rJava) # needed for the stemming function
library(Snowball) # also needed for stemming (on newer setups the SnowballC package is the drop-in replacement)
a <- tm_map(a, stemDocument, language = "english") # stem words to their root form
a.dtm <- TermDocumentMatrix(a, control = list(minWordLength = 3)) # create a term document matrix, keeping only tokens longer than three characters, since shorter tokens are very hard to interpret
inspect(a.dtm[1:10,1:10]) # have a quick look at the term document matrix
findFreqTerms(a.dtm, lowfreq=30) # look at common words (here, those appearing at least 30 times); it is worth adding high-frequency, context-specific words (e.g. the hashtag itself, "panel", "session") to the stopword list and re-making the dtm
findAssocs(a.dtm, 'science', 0.30) # find words associated with a common word and the strength of the association; repeat for each of the ten most frequent words
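# To avoid repeating findAssocs() by hand for each high-frequency word, a small sketch
# that loops over whatever findFreqTerms() returned (thresholds as above, tune to taste):
freq.terms <- findFreqTerms(a.dtm, lowfreq = 30)
assocs <- lapply(freq.terms, function(w) findAssocs(a.dtm, w, 0.30))
names(assocs) <- freq.terms
# assocs$science   # e.g. inspect the associations for one term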
# investigate the URLs contained in the Twitter messages
require(stringr)
require(ggplot2)
require(grid)
df$link <- sapply(df$text, function(tweet) str_extract(tweet, "http[[:print:]]{16}")) # create a new field with the link contained in each tweet, keeping just 16 characters after "http" so we capture only the shortened URL
link.freq <- sort(table(df$link)) # frequency of each link, in rank order
countlink <- data.frame(URL = names(link.freq), N = as.integer(link.freq), stringsAsFactors = FALSE)
countlinkSub <- subset(countlink, N > 2) # subset of just the links appearing more than twice
# plot to see distribution of links
ggplot(countlinkSub, aes(reorder(URL, N), N)) +
  xlab("URL") +
  ylab("Number of messages containing the URL") +
  geom_point() +
  coord_flip() +
  theme_bw() +
  theme(axis.title.x = element_text(vjust = -0.5, size = 14)) +
  theme(axis.title.y = element_text(size = 14, angle = 90)) +
  theme(plot.margin = unit(c(1, 1, 2, 2), "lines"))
barplot(sort(table(df$link))) # quick base-graphics look at the link frequencies
# or to use ggplot2, read on...
require(plyr) # for count()
countlink <- data.frame(link = na.omit(df$link), stringsAsFactors = FALSE)
qplot(countlink$link, geom = "bar") + coord_flip() # but not sorted, so let's keep going...
links <- count(countlink, "link") # a data frame with two named columns (link, freq) for sorting
ggplot(links, aes(reorder(link, freq), freq)) + geom_bar(stat = "identity") + coord_flip() # and here it is in order
links2 <- subset(links, links$freq > 2) # subset of just the links appearing more than twice
links2$link <- factor(links2$link, levels = links2$link) # re-level the factor so the plot keeps the sort order
ggplot(links2, aes(reorder(link, freq), freq)) + geom_bar(stat = "identity") + coord_flip() # plot nicely sorted
# This is based on Jeffrey Breen's excellent tutorial at http://jeffreybreen.wordpress.com/2011/07/04/twitter-text-mining-r-slides/
# download the sentiment word lists from http://www.cs.uic.edu/~liub/FBS/opinion-lexicon-English.rar, un-rar them, and put them somewhere logical on your computer (then adjust the paths below to match)
hu.liu.pos = scan('F:/My Documents/My Papers/conferences/SAA2010/opinion-lexicon-English/positive-words.txt', what = 'character', comment.char = ';') # load positive sentiment word list
hu.liu.neg = scan('F:/My Documents/My Papers/conferences/SAA2010/opinion-lexicon-English/negative-words.txt', what = 'character', comment.char = ';') # load negative sentiment word list
pos.words = c(hu.liu.pos)
neg.words = c(hu.liu.neg)
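# Optionally extend the lexicons with domain-specific terms before scoring, e.g.
# (the words below are purely illustrative, not part of the published lists):
# pos.words <- c(pos.words, "insightful")
# neg.words <- c(neg.words, "glitch")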
score.sentiment = function(sentences, pos.words, neg.words, .progress = 'none')
{
  require(plyr)
  require(stringr)
  # we got a vector of sentences. plyr will handle a list
  # or a vector as an "l" for us
  # we want a simple array ("a") of scores back, so we use
  # "l" + "a" + "ply" = "laply":
  scores = laply(sentences, function(sentence, pos.words, neg.words) {
    # clean up sentences with R's regex-driven global substitute, gsub():
    sentence = gsub('[[:punct:]]', '', sentence)
    sentence = gsub('[[:cntrl:]]', '', sentence)
    sentence = gsub('\\d+', '', sentence)
    # and convert to lower case:
    sentence = tolower(sentence)
    # split into words. str_split is in the stringr package
    word.list = str_split(sentence, '\\s+')
    # sometimes a list() is one level of hierarchy too much
    words = unlist(word.list)
    # compare our words to the dictionaries of positive & negative terms
    pos.matches = match(words, pos.words)
    neg.matches = match(words, neg.words)
    # match() returns the position of the matched term or NA
    # we just want a TRUE/FALSE:
    pos.matches = !is.na(pos.matches)
    neg.matches = !is.na(neg.matches)
    # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
    score = sum(pos.matches) - sum(neg.matches)
    return(score)
  }, pos.words, neg.words, .progress = .progress)
  scores.df = data.frame(score = scores, text = sentences)
  return(scores.df)
}
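# Quick sanity check of the scoring function on two toy sentences (illustrative only):
# "love" and "great" appear in the positive list and "terrible" and "awful" in the
# negative one, so the scores should come out as +2 and -2.
score.sentiment(c("I love this great conference",
                  "what a terrible, awful session"),
                pos.words, neg.words)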
require(plyr)
aaa.text <- as.character(df$text) # pull out just the text of the tweets; the df returned by GetTweetsBySearch at the start already holds it
length(aaa.text) # check how many tweets; make sure it agrees with the original sample size
head(aaa.text, 5) # check a sample of the content; see that it looks as expected, no weird characters, etc.
aaa.scores <- score.sentiment(aaa.text, pos.words, neg.words, .progress = 'text') # get scores for the tweet text
# create a histogram of sentiment scores
ggplot(aaa.scores, aes(x = score)) +
  geom_histogram(binwidth = 1) +
  xlab("Sentiment score") +
  ylab("Frequency") +
  theme_bw() +
  theme(axis.title.x = element_text(vjust = -0.5, size = 14)) +
  theme(axis.title.y = element_text(size = 14, angle = 90, vjust = -0.25)) +
  theme(plot.margin = unit(c(1, 1, 2, 2), "lines"))
aaa.pos <- subset(aaa.scores,aaa.scores$score >= 2) # get tweets with only very +ve scores
aaa.neg <- subset(aaa.scores,aaa.scores$score <= -2) # get tweets with only very -ve scores
# Now create subset based on tweets with certain words, such as the high frequency words identified in the text mining. eg. science
scien <- subset(aaa.scores, regexpr("scien", aaa.scores$text) > 0) # extract tweets containing only 'scien'
# plot a histogram of sentiment scores for this token
ggplot(scien, aes(x = score)) +
  geom_histogram(binwidth = 1) +
  xlab("Sentiment score for the token 'scien'") +
  ylab("Frequency") +
  theme_bw() +
  theme(axis.title.x = element_text(vjust = -0.5, size = 14)) +
  theme(axis.title.y = element_text(size = 14, angle = 90, vjust = -0.25)) +
  theme(plot.margin = unit(c(1, 1, 2, 2), "lines"))
# repeat this block with different high frequency words
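# A sketch of that "repeat for other words" step: score distributions for several tokens
# at once. The tokens listed here are placeholders; substitute the high-frequency words
# found by findFreqTerms() above.
tokens <- c("scien", "learn", "data") # illustrative tokens only
token.scores <- do.call(rbind, lapply(tokens, function(tok) {
  hits <- subset(aaa.scores, regexpr(tok, aaa.scores$text) > 0)
  if (nrow(hits) > 0) cbind(hits, token = tok) else NULL
}))
ggplot(token.scores, aes(x = score)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~ token) +
  xlab("Sentiment score") + ylab("Frequency") +
  theme_bw()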
a.dtm.sp <- removeSparseTerms(a.dtm, sparse = 0.989) # iterate over the sparse value so the dtm doesn't get too small; for example: 0.990 -> 88 terms, 0.989 -> 67, 0.985 -> 37, 0.98 -> 23, 0.95 -> 6
a.dtm.sp.df <- as.data.frame(as.matrix(a.dtm.sp)) # convert the term document matrix to a data frame
nrow(a.dtm.sp.df) # check how many terms we're left with after removing sparse terms
# this analysis is based on http://www.statmethods.net/advstats/cluster.html
# scale and transpose data for cluster analysis
a.dtm.sp.df.sc.t <- t(scale(a.dtm.sp.df))
require(pvclust)
fit <- pvclust(a.dtm.sp.df.sc.t, method.hclust = "average", method.dist = "correlation", nboot = 10) # the bootstrapping can take hours with larger nboot values; reduce nboot for a quicker result
plot(fit, cex = 1.5, cex.pv = 1.2, col.pv = c(1,0,0), main="", xlab="", sub="") # draw the dendrogram
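# Optionally highlight the clusters that pvclust considers well supported
# (approximately-unbiased p-value above the alpha threshold) on the dendrogram just drawn:
pvrect(fit, alpha = 0.95)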
require(slam)
a.dtm.sp.t <- t(a.dtm.sp) # transpose document term matrix, necessary for the next steps using mean term frequency-inverse document frequency (tf-idf) to select the vocabulary for topic modeling
summary(col_sums(a.dtm.sp.t)) # check median...
term_tfidf <- tapply(a.dtm.sp.t$v/row_sums(a.dtm.sp.t)[a.dtm.sp.t$i], a.dtm.sp.t$j,mean) * log2(nDocs(a.dtm.sp.t)/col_sums(a.dtm.sp.t>0)) # calculate tf-idf values
summary(term_tfidf) # check the median and use a value near it as the threshold on the next line
a.dtm.sp.t.tdif <- a.dtm.sp.t[, term_tfidf >= 1.0] # keep only terms with a mean tf-idf at or above the threshold (roughly the median here)
a.dtm.sp.t.tdif <- a.dtm.sp.t.tdif[row_sums(a.dtm.sp.t.tdif) > 0, ] # drop any documents left with no terms after the filtering
summary(col_sums(a.dtm.sp.t.tdif)) # have a look
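# Toy illustration of the tf-idf weighting used above (not part of the analysis):
# a term with relative frequency 0.10 in the documents it appears in, occurring in
# 1 of 4 documents, gets mean tf-idf = 0.10 * log2(4/1) = 0.2
0.10 * log2(4 / 1)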
# Before generating the topic model and analysing its output, we need to decide how many topics the model should use.
# Here's a loop over different topic numbers that fits a model for each, records its log likelihood, and plots the values so we can pick the best one.
# The best number of topics is the one with the highest log likelihood.
require(topicmodels)
best.model <- lapply(seq(2, 50, by = 1), function(d){LDA(a.dtm.sp.t.tdif, d)}) # this will make a topic model for every number of topics between 2 and 50... it will take some time!
best.model.logLik <- as.data.frame(as.matrix(lapply(best.model, logLik))) # this will produce a list of logLiks for each model...
# plot the distribution of log likelihoods by number of topics
best.model.logLik.df <- data.frame(topics=c(2:50), LL = as.numeric(as.matrix(best.model.logLik)))
ggplot(best.model.logLik.df, aes(x = topics, y = LL)) +
  xlab("Number of topics") +
  ylab("Log likelihood of the model") +
  geom_line() +
  theme_bw() +
  theme(axis.title.x = element_text(vjust = -0.5, size = 14)) +
  theme(axis.title.y = element_text(size = 14, angle = 90, vjust = -0.25)) +
  theme(plot.margin = unit(c(1, 1, 2, 2), "lines")) # plot nicely
ggsave(file = "model_LL_per_topic_number.pdf") # export the plot to a PDF file
# it's not easy to see exactly which topic number has the highest LL, so let's look at the data...
best.model.logLik.df.sort <- best.model.logLik.df[order(-best.model.logLik.df$LL), ] # sort to find out which number of topics has the highest loglik, in this case 23 topics.
best.model.logLik.df.sort # have a look to see what's at the top of the list, the one with the highest score
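# Pull out the best number of topics programmatically; this should agree with the
# value hard-coded on the next line (23 for this particular dataset).
best.k <- best.model.logLik.df$topics[which.max(best.model.logLik.df$LL)]
best.k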
lda <- LDA(a.dtm.sp.t.tdif,23) # generate a LDA model with 23 topics, as found to be optimum
get_terms(lda, 5) # get keywords for each topic, just for a quick look
get_topics(lda, 5) # gets topic numbers per document
lda_topics<-get_topics(lda, 5)
beta <- lda@beta # create object containing parameters of the word distribution for each topic
gamma <- lda@gamma # create object containing posterior topic distribution for each document
terms <- lda@terms # create object containing terms (words) that can be used to line up with beta and gamma
colnames(beta) <- terms # puts the terms (or words) as the column names for the topic weights.
id <- t(apply(beta, 1, order)) # for each topic, order the term indices by beta value (ascending)
beta_ranked <- lapply(1:nrow(id), function(i) beta[i, id[i, ]]) # per-topic table of words ranked by beta value (most important words last); useful for determining the most important words per topic
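# For example, the ten highest-weighted words for the first topic (vector names are the
# terms, values are their log weights); compare with get_terms(lda, 5) above.
tail(beta_ranked[[1]], 10)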