-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathperplecity.R
74 lines (60 loc) · 2.58 KB
/
perplecity.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
require(readr)
require(stringr)
require(tm)
# Build the reference three-gram model from the tab-separated source file.
# Columns 3 and 5 of the raw file are discarded; of what remains, column X1
# is rescaled so its values sum to 1 and columns X2, X4 and X6 are joined
# with single spaces into the three-gram text.
# NOTE(review): X1 is presumably a raw occurrence count -- confirm against
# the layout of 3grams-3.txt.
tbl <- read_tsv("/labs/3grams-3.txt", col_names = FALSE)
tbl <- tbl[, -c(3, 5)]
names(tbl)[1] <- "prob"
tbl[["prob"]] <- tbl[["prob"]] / sum(tbl[["prob"]])
tbl[["three_grams"]] <- paste(tbl[["X2"]], tbl[["X4"]], tbl[["X6"]])

# Keep only the two columns the rest of the script uses, then cache to disk.
tbl <- tbl[, c("prob", "three_grams")]
saveRDS(tbl, "/labs/3grams.RDS")
# Load corpora 15, 16 and 17; each file is read without a header row and its
# columns are labelled links / categories / titles / texts.
# NOTE(review): the path is built as "/labs" + id + "corpus.csv" with no
# separator after "/labs" (e.g. "/labs15corpus.csv"), unlike the other
# "/labs/..." paths in this script -- confirm this is intended.
dfs <- lapply(
  paste0("/labs", 15:17, "corpus.csv"),
  read.table,
  header = FALSE,
  col.names = c("links", "categories", "titles", "texts"),
  stringsAsFactors = FALSE
)

# The first corpus is used as the politics set and the second as the
# economics set; the third is loaded but not referenced again in this script.
df_politics <- dfs[[1L]]
df_economics <- dfs[[2L]]
#' Split one sentence into overlapping three-grams
#'
#' Removes punctuation, splits the sentence on single spaces and drops empty
#' tokens. When at least three tokens remain, the token count is appended as
#' one line to the counts file and every run of three consecutive tokens is
#' returned.
#'
#' @param x Sentence text.
#' @param topic Label used to build the default counts file path.
#' @param counts_path File the token count is appended to. Defaults to
#'   "/labs/words<topic>.txt".
#' @return A character vector of three-grams (tokens joined by single
#'   spaces), or `NULL` (invisibly) when the sentence has fewer than three
#'   tokens, in which case nothing is written.
get_3grams <- function(x, topic,
                       counts_path = paste0("/labs/words", topic, ".txt")) {
  stripped <- gsub("[[:punct:]]", "", x, perl = TRUE)
  tokens <- unlist(strsplit(stripped, " ", fixed = TRUE))
  # Consecutive, leading or trailing spaces yield empty tokens; discard them.
  tokens <- tokens[tokens != ""]

  n_tokens <- length(tokens)
  if (n_tokens < 3L) {
    return(invisible(NULL))
  }

  # Side effect: the per-sentence token count is accumulated on disk, one
  # number per line, for later use as the perplexity normaliser.
  write(n_tokens, counts_path, append = TRUE)

  starts <- seq_len(n_tokens - 2L)
  paste(tokens[starts], tokens[starts + 1L], tokens[starts + 2L])
}
#' Turn one document into the three-grams of its sentences
#'
#' The text is split into sentences on literal periods. Each sentence is
#' trimmed, has em dashes removed, is trimmed again, and has every run of
#' whitespace collapsed to a single space. Sentences that end up empty are
#' dropped and the rest are passed to `get_3grams()`.
#'
#' @param x Document text.
#' @param topic Topic label forwarded to `get_3grams()`.
#' @return A character vector with the three-grams of all sentences, or
#'   `NULL` when no sentence produced any.
process_text <- function(x, topic) {
  sentences <- unlist(str_split(x, "[.]"))
  sentences <- gsub("—", "", str_trim(sentences), perl = TRUE)

  # Trim again before collapsing: removing a dash can expose new leading or
  # trailing whitespace. This is written as a plain call on purpose -- piping
  # into gsub(pattern =, repl =, str_trim(.)) makes magrittr insert the
  # untrimmed input as an extra first argument, so the trimmed value lands
  # in `ignore.case` and is never used.
  sentences <- gsub("\\s+", " ", str_trim(sentences))
  sentences <- sentences[sentences != ""]

  unlist(lapply(sentences, get_3grams, topic = topic))
}
# get_3grams() only ever appends to "/labs/words<topic>.txt", so counts left
# behind by an earlier run would be added to this run's totals. Remove the
# files before regenerating them (unlink() is a no-op for missing files).
unlink(paste0("/labs/words", c("politics", "economics"), ".txt"))

# Extract the three-grams of every politics document, then trim each one and
# collapse whitespace runs. The normalisation is a plain call rather than a
# pipe step: with `%>% gsub(pattern =, repl =, str_trim(.))` magrittr also
# inserts the untrimmed input as first argument, so the trimmed value would
# be bound to `ignore.case` and ignored.
three_grams_politics <- unlist(
  lapply(df_politics$texts, process_text, topic = "politics")
)
three_grams_politics <- gsub("\\s+", " ", str_trim(three_grams_politics))

# Same processing for the economics documents.
three_grams_economics <- unlist(
  lapply(df_economics$texts, process_text, topic = "economics")
)
three_grams_economics <- gsub("\\s+", " ", str_trim(three_grams_economics))
# Reload the cached reference model (columns `prob` and `three_grams`).
model <- readRDS("/labs/3grams.RDS")

# For each topic: tabulate how often every three-gram occurs, label the two
# resulting columns three_grams / freq, and join with the model on
# `three_grams`. merge() keeps only three-grams present on both sides, so
# corpus three-grams missing from the model are dropped here.
politics_dictionary <- merge(
  setNames(
    as.data.frame(table(three_grams_politics)),
    c("three_grams", "freq")
  ),
  model,
  by = "three_grams"
)

economics_dictionary <- merge(
  setNames(
    as.data.frame(table(three_grams_economics)),
    c("three_grams", "freq")
  ),
  model,
  by = "three_grams"
)
#' Perplexity of a topic's three-grams under the reference model
#'
#' Computes `2^(-sum(freq * log2(prob)) / words_count)`, where `words_count`
#' is the sum of all numbers stored in the topic's counts file.
#'
#' @param x Data frame with one row per three-gram and numeric columns
#'   `prob` and `freq`.
#' @param topic Label used to build the default counts file path.
#' @param model Not used by the computation; kept so existing calls that
#'   pass it keep working.
#' @param counts_path File holding one word count per line, without a header
#'   line. Defaults to "/labs/words<topic>.txt".
#' @return A single numeric perplexity value.
perplexity <- function(x, topic, model,
                       counts_path = paste0("/labs/words", topic, ".txt")) {
  log_sum <- log2(x$prob) * x$freq

  # The counts file has no header, so every line must be read as data.
  # read_tsv() with its default col_names = TRUE would consume the first
  # count as a column name and silently leave it out of the total.
  words_count <- sum(scan(counts_path, what = numeric(), quiet = TRUE))

  l <- sum(log_sum) / words_count
  2^(-l)
}
# Score each topic's dictionary and store both results in one .RData file.
politics_perplexity <- perplexity(
  x = politics_dictionary,
  topic = "politics",
  model = model
)
economics_perplexity <- perplexity(
  x = economics_dictionary,
  topic = "economics",
  model = model
)
save(
  list = c("politics_perplexity", "economics_perplexity"),
  file = "/labs/perplexity.RData"
)