Commit

ML-DTM code and corpus

Elaine Zosa committed Jul 8, 2019
1 parent b7b6ef7 commit 156d009
Showing 2 changed files with 447 additions and 0 deletions.
189 changes: 189 additions & 0 deletions corpus.py
@@ -0,0 +1,189 @@
import os
import numpy as np
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter
from datetime import date
import calendar
import tarfile
import random

exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
stopwords_yle = set(stopwords.words('finnish') +
                    ['saada', 'tehdä', 'jo', 'sanoa', 'voi', 'tulla', 'muun', 'myös', 'jälkeen', 'pitää',
                     'vuosi', 'mennä', 'vielä', '000']).union(
    stopwords.words('swedish') +
    ['också', 'få', 'vilja', 'ho', 'säga', 'fyra', 'gå', 'få', 'in', 'göra', 'år', 'komma', 'måste', 'hava',
     'http', 'föra', 'taga', 'enligt', 'kunna', 'bliva', 'mången', 'böra', 'andraga', 'fjol', 'mycken', 'del', '000'])

stopwords_denews = set(stopwords.words('english') +
                       ['according', 'mr', 'said', 'could', 'would', 'today', 'should', 'shall']).union(
    stopwords.words('german') +
    ['dass', 'fuer', 'sei', 'ueber', 'sagte', 'sollen', 'wollen', 'heute', 'seien', 'wuerden', 'mehr'])

def clean_yle_doc(doc):
    # drop short tokens, strip punctuation, lowercase and remove Finnish/Swedish stopwords
    clean_short = " ".join([tok for tok in doc if len(tok) > 2])
    clean_punc = ''.join(ch for ch in clean_short if ch not in exclude)
    clean_stop = " ".join([i for i in clean_punc.lower().split() if i not in stopwords_yle])
    return clean_stop

def clean_denews_doc(doc):
    # drop XML-like tags, strip punctuation, lowercase, remove stopwords and short tokens, then lemmatize
    clean_xml = " ".join([line for line in doc if line[0] != "<"])
    clean_punc = ''.join(ch for ch in clean_xml if ch not in exclude)
    clean_stop = " ".join([i for i in clean_punc.lower().split() if i not in stopwords_denews and len(i) > 2])
    clean_doc = " ".join(lemma.lemmatize(word) for word in clean_stop.split())
    clean_doc = " ".join(word for word in clean_doc.split())
    return clean_doc

def getKey(item):
    return item[1]

def compute_frequency_scores(documents):
    # rank each language's vocabulary by corpus frequency, highest first
    languages = list(documents.keys())
    scores = {}
    for lang in languages:
        articles = [d for docs in documents[lang] for d in docs]
        tokens = [token for art in articles for token in art]
        counts = Counter(tokens)
        tuples = [(key, counts[key]) for key in counts.keys()]
        sorted_tuples = sorted(tuples, key=getKey, reverse=True)
        scores[lang] = sorted_tuples
    return scores

def prune_vocabulary(documents, vocab_len=2000):
    # keep only the vocab_len most frequent terms per language and rebuild the dictionary
    term_scores = compute_frequency_scores(documents)
    languages = list(documents.keys())
    time_slices = len(documents[languages[0]])
    dictionary = {lang: set() for lang in languages}
    for lang in languages:
        # set of top-ranked terms for fast membership checks
        valid_tokens = {term[0] for term in term_scores[lang][:vocab_len]}
        for t in range(time_slices):
            n_docs = len(documents[lang][t])
            for d in range(n_docs):
                doc = documents[lang][t][d]
                pruned_doc = [w for w in doc if w in valid_tokens and len(w) > 2]
                documents[lang][t][d] = pruned_doc
                dictionary[lang].update(pruned_doc)
    for lang in languages:
        dictionary[lang] = list(dictionary[lang])
    return documents, dictionary

def add_months(sourcedate, months):
    # shift sourcedate forward by the given number of months, clamping the day to the month length
    month = sourcedate.month - 1 + months
    year = sourcedate.year + month // 12
    month = month % 12 + 1
    day = min(sourcedate.day, calendar.monthrange(year, month)[1])
    return date(year, month, day)

def get_yle_corpus(n_timeslices):
    print("Getting YLE corpus for", n_timeslices, "time slices")
    yle_filepath = "/wrk/users/zosa/codes/pimlico_store/yle_preprocess3/main/lemmatize/lemmas/data/"
    print("Reading lemmatized articles from ", yle_filepath)
    articles = {}
    tar_files = os.listdir(yle_filepath)
    for tar_file in tar_files:
        tar = tarfile.open(yle_filepath + "/" + tar_file, "r")
        for member in tar.getmembers():
            f = tar.extractfile(member)
            if f is not None:
                filename = member.name
                print("Filename: ", filename)
                text = f.read().decode('utf-8')
                lines = text.split("|DatePublished ")
                for art in lines:
                    if len(art) > 0:
                        a = art.split("|")
                        date_pub = a[0]
                        art_no = a[1].split()[1]
                        text = a[2]
                        lang = "fi" if "fi" in filename else "sv"
                        if art_no not in articles.keys():
                            articles[art_no] = {}
                        d = date_pub.split("-")
                        articles[art_no]['date'] = d[0] + d[1]
                        articles[art_no][lang] = text
    # keep only articles published within the requested number of months from the start date
    start_date = date(year=2012, month=1, day=1)
    end_date = add_months(start_date, n_timeslices - 1)
    if end_date.month < 10:
        end_date_str = str(end_date.year) + "0" + str(end_date.month)
    else:
        end_date_str = str(end_date.year) + str(end_date.month)
    end_date_int = int(end_date_str)
    languages = ['fi', 'sv']
    documents = {lang: [] for lang in languages}
    dictionary = {lang: set() for lang in languages}
    timestamps = []
    keys = list(articles.keys())
    for k in keys:
        art = articles[k]
        if int(art['date']) <= end_date_int:
            for lang in languages:
                doc = art[lang]
                clean_doc = clean_yle_doc(doc.split()).split()
                documents[lang].append(clean_doc)
                dictionary[lang].update(clean_doc)
            timestamps.append(art['date'])
    dictionary = {lang: list(dictionary[lang]) for lang in languages}
    unique_timestamps = list(set(timestamps))
    unique_timestamps.sort()
    # group documents into monthly time slices
    documents = {lang: np.array(documents[lang]) for lang in languages}
    timestamps = np.array(timestamps)
    documents_sliced = {lang: [] for lang in languages}
    for t in unique_timestamps:
        for lang in languages:
            docs_t = documents[lang][timestamps == t]
            documents_sliced[lang].append(docs_t)
    print("time slices: ", len(unique_timestamps))
    return documents_sliced, unique_timestamps, dictionary


def sample_yle_articles(documents, max_doc):
    # randomly sample at most max_doc articles per time slice, using the same indexes for every language
    print("Sampling", max_doc, "articles for each time slice")
    languages = list(documents.keys())
    lang1 = languages[0]
    documents = {lang: np.array(documents[lang]) for lang in languages}
    documents_sampled = {lang: [] for lang in languages}
    timeslices = len(documents[lang1])
    for t in range(timeslices):
        n_docs = len(documents[lang1][t])
        if n_docs > max_doc:
            random_indexes = random.sample(range(n_docs), max_doc)
            for lang in languages:
                random_docs = documents[lang][t][random_indexes]
                documents_sampled[lang].append(random_docs)
        else:
            for lang in languages:
                documents_sampled[lang].append(documents[lang][t])
    return documents_sampled

def get_denews_corpus(path):
    print("Getting DE-News corpus from: ", path)
    filenames = os.listdir(path)
    filenames.sort()
    languages = ['english', 'german']
    documents = {lang: [] for lang in languages}
    timestamps = {lang: [] for lang in languages}
    dictionary = {lang: set() for lang in languages}
    for f in filenames:
        text = open(path + "/" + f, 'r').read().split()
        # articles are delimited by "<DOC" markers in the tokenised text
        index_start = list(np.where(np.array(text) == "<DOC")[0])
        lang = "english" if "en.txt" in f else "german"
        for i in range(len(index_start) - 1):
            start_art = index_start[i] + 2
            end_art = index_start[i + 1]
            article = clean_denews_doc(text[start_art:end_art]).split()
            documents[lang].append(article)
            # timestamp built from the date fields of the filename
            timestamp = float(f.split("-")[-3] + f.split("-")[-2])
            timestamps[lang].append(timestamp)
            dictionary[lang].update(set(article))
    dictionary = {lang: list(dictionary[lang]) for lang in languages}
    unique_timestamps = list(set(timestamps[languages[0]]))
    unique_timestamps.sort()
    documents = {lang: np.array(documents[lang]) for lang in languages}
    timestamps = {lang: np.array(timestamps[lang]) for lang in languages}
    documents_sliced = {lang: [] for lang in languages}
    for t in unique_timestamps:
        for lang in languages:
            docs_t = documents[lang][timestamps[lang] == t]
            documents_sliced[lang].append(docs_t)
    print("time slices: ", len(unique_timestamps))
    return documents_sliced, unique_timestamps, dictionary
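
For reference, a minimal sketch of how these loaders might be combined; the DE-News path, slice count, and sampling cap below are illustrative placeholders (not values from this commit), and get_yle_corpus additionally assumes the hard-coded YLE data path above is reachable and that the NLTK stopwords/wordnet data are installed.

# hypothetical driver script, assuming corpus.py is importable
from corpus import get_denews_corpus, get_yle_corpus, sample_yle_articles, prune_vocabulary

# DE-News: documents sliced by timestamp per language, plus a per-language dictionary
docs, timestamps, dictionary = get_denews_corpus("/path/to/de-news")  # placeholder path

# YLE: read 12 monthly slices, cap each slice at 1000 articles, keep the 2000 most frequent terms
yle_docs, yle_timestamps, yle_dict = get_yle_corpus(n_timeslices=12)
yle_docs = sample_yle_articles(yle_docs, max_doc=1000)
yle_docs, yle_dict = prune_vocabulary(yle_docs, vocab_len=2000)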