forked from guneykan/gcn_text_categorization
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpreprocess.py
42 lines (35 loc) · 1.92 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
from sys import argv
from preprocessors.build_adjacency import build_adjacency
from preprocessors.build_node_features import build_node_features
from preprocessors.clean_data import clean_data
from preprocessors.configs import PreProcessingConfigs
from preprocessors.prepare_words import prepare_words
from preprocessors.shuffle_data import shuffle_data
def create_preprocessing_cfg() -> PreProcessingConfigs:
conf = PreProcessingConfigs()
conf.data_sets = ['20ng', 'R8', 'R52', 'ohsumed', 'mr', 'cora', 'citeseer', 'pubmed']
conf.data_set_extension = '.txt'
conf.corpus_dir = 'data/corpus/'
conf.corpus_meta_dir = 'data/corpus/meta/'
conf.corpus_cleaned_dir = 'data/corpus.cleaned/'
conf.corpus_shuffled_dir = 'data/corpus.shuffled/'
conf.corpus_shuffled_split_index_dir = 'data/corpus.shuffled/split_index/'
conf.corpus_shuffled_meta_dir = 'data/corpus.shuffled/meta/'
conf.corpus_shuffled_vocab_dir = 'data/corpus.shuffled/vocabulary/'
conf.corpus_shuffled_word_vectors_dir = 'data/corpus.shuffled/word_vectors/'
conf.corpus_shuffled_adjacency_dir = 'data/corpus.shuffled/adjacency/'
conf.corpus_shuffled_node_features_dir = 'data/corpus.shuffled/node_features/'
conf.build()
return conf
def preprocess(ds: str, preprocessing_cfg: PreProcessingConfigs): # Start pre-processing
clean_data(ds_name=ds, rare_count=5, cfg=preprocessing_cfg)
shuffle_data(ds_name=ds, cfg=preprocessing_cfg)
prepare_words(ds_name=ds, cfg=preprocessing_cfg)
build_node_features(ds_name=ds, validation_ratio=0.10, use_predefined_word_vectors=False, cfg=preprocessing_cfg)
build_adjacency(ds_name=ds, cfg=preprocessing_cfg)
if __name__ == '__main__':
prep_cfg = create_preprocessing_cfg()
if len(argv) < 2:
raise Exception("Dataset name cannot be left blank. Must be one of datasets:%r." % prep_cfg.data_sets)
ds_name = argv[1]
preprocess(ds=ds_name, preprocessing_cfg=prep_cfg)