-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimport_data.py
87 lines (68 loc) · 2.88 KB
/
import_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
import pandas as pd
import numpy as np
import nltk
import os
from os import listdir
from os.path import isfile, join, isdir
import json
import pickle
def load_train(input_dir):
# lists of dictionaries for dataframes
dicts = []
# for every candidate in problem1 or 2
for direc in listdir(input_dir):
if "candidate" in (direc):
# print(direc)
# for every file in the candidate directory, read and add every txt to lst
for fin in listdir(input_dir + "/" + direc):
f_txt = open(input_dir + "/" + direc + "/" + fin, "r+", encoding="utf-8")
f = f_txt.read()
f_txt.close()
dict1 = {"text": f,
"author": str(direc), "length": len(f)}
dicts.append(dict1)
train = pd.DataFrame(dicts, columns=["text", "author", "length"])
return train
def load_test(input_dir):
test_dicts = []
# convert json file into dataframe
with open(input_dir + "/" + "ground-truth.json") as data:
data = json.loads(data.read())
for el in data.get("ground_truth"):
dict1 = {"text": el.get("unknown-text"),
"author": el.get("true-author"), "length": 0}
test_dicts.append(dict1)
test = pd.DataFrame(test_dicts, columns=["text", "author", "length"])
# read and insert txt files in text row
for i, row in enumerate(test.itertuples()):
for fin in listdir(input_dir + "/" + "unknown"):
if str(fin) == str(row[1]):
with open(input_dir + "/" + "unknown" + "/" + fin, "r", encoding="utf-8") as txt:
test.loc[i, "text"] = txt.read()
test.loc[i, "length"] = len(test.loc[i, "text"])
return test
def merge_data(df1, df2):
frames = [df1, df2]
df = pd.concat(frames)
df.reset_index(drop=True)
return df
def load_pickle(filename):
with open(filename, 'rb') as f:
df = pickle.load(f)
return df
train1 = load_train("pan18-cross-domain-authorship-attribution-training-dataset-2017-12-02/problem00001")
test1 = load_test("pan18-cross-domain-authorship-attribution-training-dataset-2017-12-02/problem00001")
train2 = load_train("pan18-cross-domain-authorship-attribution-training-dataset-2017-12-02/problem00002")
test2 = load_test("pan18-cross-domain-authorship-attribution-training-dataset-2017-12-02/problem00002")
set1 = merge_data(train1, test1)
# print("full set 1: ", set1, set1.columns)
train1.to_pickle("data/train1.pkl")
test1.to_pickle("data/test1.pkl")
train2.to_pickle("data/train2.pkl")
test2.to_pickle("data/test2.pkl")
train1_df = load_pickle('data/train1.pkl')
test1_df = load_pickle('data/test1.pkl')
train2_df = load_pickle('data/train2.pkl')
test2_df = load_pickle('data/test2.pkl')