diff --git a/train_pred_trankit.py b/train_pred_trankit.py
new file mode 100644
index 0000000..089fc38
--- /dev/null
+++ b/train_pred_trankit.py
@@ -0,0 +1,325 @@
+# pip install trankit
+# https://trankit.readthedocs.io/en/stable/training.html
+
+# An example of using trankit to train a custom model from pre-tokenized CoNLL-U files that include multiword tokens.
+
+import trankit, re, os, sys, json
+from trankit.iterators.tagger_iterators import TaggerDataset
+
+# pipeline loading utilities
+from trankit import Pipeline
+from trankit.utils import CoNLL
+from trankit.utils.base_utils import get_ud_score, get_ud_performance_table
+
+
+# Example values (set from sys.argv in __main__):
+# res_folder = 'trankit_res'
+# train_path = 'trankit/conllus/train_parser_en_gum.conllu'
+# dev_path = 'trankit/conllus/test_frontend.conllu'
+# dev_raw = 'trankit/conllus/dev_raw.txt'
+# train_raw = 'trankit/conllus/train_raw.txt'
+# pred_fpath = 'trankit/pred_test_lem.conllu'
+
+# epoch = 100
+# epoch_tok = 40
+
+# CoNLL-U column indices
+ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC = range(10)
+
+
+def train_deprel_feat(res_folder, epoch):
+    # initialize a trainer for the posdep task
+    trainer_dep = trankit.TPipeline(
+        training_config={
+            'max_epoch': epoch,
+            'category': 'customized',  # pipeline category
+            'task': 'posdep',  # task name
+            'save_dir': res_folder,  # directory for saving the trained model
+            'train_conllu_fpath': train_path,  # CoNLL-U annotations file for training
+            'dev_conllu_fpath': dev_path,  # CoNLL-U annotations file for development
+            'embedding': 'xlm-roberta-large'
+        }
+    )
+
+    # start training
+    trainer_dep.train()
+    return trainer_dep
+
+
+def train_lemma(res_folder, epoch):
+    # initialize a trainer for the lemmatization task
+    trainer = trankit.TPipeline(
+        training_config={
+            'max_epoch': epoch,
+            'category': 'customized',  # pipeline category
+            'task': 'lemmatize',  # task name
+            'save_dir': res_folder,  # directory for saving the trained model
+            'train_conllu_fpath': train_path,  # CoNLL-U annotations file for training
+            'dev_conllu_fpath': dev_path,  # CoNLL-U annotations file for development
+            'embedding': 'xlm-roberta-large'
+        }
+    )
+    # start training
+    trainer.train()
+
+
+def get_raw_file(conllu_path, raw_path):
+    # extract the raw text ("# text = ..." lines) from a CoNLL-U file
+    txt = open(conllu_path).read()
+    txt_pattern = re.compile(r"# text =.+")
+    res = '\n'.join([l[9:] for l in re.findall(txt_pattern, txt)])
+    if raw_path:
+        with open(raw_path, 'w') as f:
+            f.write(res)
+
+
+def train_tok(res_folder, epoch_tok):
+    """A tokenizer is required to build a pipeline for parsing."""
+    get_raw_file(train_path, train_raw)
+    get_raw_file(dev_path, dev_raw)
+
+    # initialize a trainer for the tokenization task
+    trainer_tok = trankit.TPipeline(
+        training_config={
+            'max_epoch': epoch_tok,
+            'category': 'customized',  # pipeline category
+            'task': 'tokenize',  # task name
+            'save_dir': res_folder,  # directory for saving the trained model
+            'train_txt_fpath': train_raw,  # raw text file for training
+            'train_conllu_fpath': train_path,  # CoNLL-U annotations file for training
+            'dev_txt_fpath': dev_raw,  # raw text file for development
+            'dev_conllu_fpath': dev_path,  # CoNLL-U annotations file for development
+            'embedding': 'xlm-roberta-large'
+        }
+    )
+    # start training
+    trainer_tok.train()
+
+
+def test_deprel(trainer, test_path, name='test_dep'):
+    # evaluate a trained posdep model on a test set;
+    # trainer should be a TPipeline instance for posdep, not for lemma
+    test_set = TaggerDataset(
+        config=trainer._config,
+        input_conllu=test_path,
+        gold_conllu=test_path,
+        evaluate=False
+    )
+
+    test_set.numberize()
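+    # number of evaluation batches: integer division plus one extra batch
+    # when len(test_set) is not a multiple of batch_size (ceiling division)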
+    test_batch_num = len(test_set) // trainer._config.batch_size + (len(test_set) % trainer._config.batch_size != 0)
+    result = trainer._eval_posdep(data_set=test_set, batch_num=test_batch_num,
+                                  name=name, epoch=-1)
+    print(trankit.utils.base_utils.get_ud_performance_table(result[0]))
+    return result[0]
+
+
+def check_pipe(model_dir):
+    # verify that all components of the customized pipeline are in place
+    trankit.verify_customized_pipeline(
+        category='customized',  # pipeline category
+        save_dir=model_dir,  # directory used for saving models in the previous steps
+        embedding_name='xlm-roberta-large'  # embedding used for training the customized pipeline; by default it is `xlm-roberta-base`
+    )
+
+
+def get_toklist(fpath):
+    # list of token forms per sentence
+    conllu_ls = CoNLL.load_conll(open(fpath), ignore_gapping=True)
+    res = []
+    for sent in conllu_ls:
+        res.append([l[1] for l in sent])
+    return res
+
+
+def pred_trankit(pipe, to_parse_path, parsed_path, task='posdep'):
+    # pipe = Pipeline(lang='customized', cache_dir=model_dir)
+    # build the token list (with ids and multiword-token expansions) from the input CoNLL-U file
+    conll_list = CoNLL.load_conll(open(to_parse_path), ignore_gapping=True)
+
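+    # Structure built below for a sentence containing a multiword token, e.g.
+    # "don't" spanning ids 2-3 (the 'expanded' entries mirror trankit's expanded-token format):
+    #   [{'id': 1, 'text': 'I'},
+    #    {'id': (2, 3), 'text': "don't",
+    #     'expanded': [{'id': 2, 'text': 'do'}, {'id': 3, 'text': "n't"}]},
+    #    {'id': 4, 'text': 'know'}]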
+    tok_ls = []
+    expand_end = -1
+    for sent in conll_list:
+        sent_info = []
+        ldict = {}
+        for l in sent:
+            if '-' in l[0]:
+                # multiword token line, e.g. "2-3": remember the range and collect the expanded words
+                ldict['id'] = (int(l[0].split('-')[0]), int(l[0].split('-')[1]))
+                ldict['text'] = l[1]
+                expand_end = ldict['id'][1]
+                ldict['expanded'] = []
+            elif expand_end > 0 and int(l[0]) <= expand_end:
+                ldict['expanded'].append({'id': int(l[0]), 'text': l[1]})
+
+                if int(l[0]) == expand_end:
+                    # reset
+                    expand_end = -1
+                    sent_info.append(ldict)
+                    ldict = {}
+            else:
+                sent_info.append({'id': int(l[0]), 'text': l[1]})
+        tok_ls.append(sent_info)
+
+    # predict lemma / upos + xpos + feats + head id + deprel depending on the task
+    if task == 'posdep':
+        res_dict = pipe.posdep_withID(tok_ls)
+
+        doc_conll = CoNLL.convert_dict([s['tokens'] for s in res_dict['sentences']], use_expand=True)
+        conll_string = CoNLL.conll_as_string(doc_conll)
+        with open(parsed_path, 'w') as outfile:
+            outfile.write(conll_string)
+
+    if task == 'lemmatize':
+        res_dict = pipe.lemmatize_withID(tok_ls)
+
+        doc_conll = CoNLL.convert_dict([s['tokens'] for s in res_dict['sentences']], use_expand=True)
+        conll_string = CoNLL.conll_as_string(doc_conll)
+        with open(parsed_path, 'w') as outfile:
+            outfile.write(conll_string)
+
+
+def train_trankit(res_folder, epoch, epoch_tok):
+    #train_tok(res_folder, epoch_tok)
+    train_deprel_feat(res_folder, epoch)
+    train_lemma(res_folder, epoch)
+    train_tok(res_folder, epoch_tok)
+    check_pipe(res_folder)
+
+
+def eval_parsed(parsed_path, gold_path):
+    score = get_ud_score(parsed_path, gold_path)
+    print(get_ud_performance_table(score))
+    return score
+
+
+def save_score(score, score_dir, res_folder, cv_idx, name='test', newfile=True):
+    metric_ls = ["Tokens", "Sentences", "Words", "UPOS", "XPOS", "UFeats", "AllTags", "Lemmas", "UAS", "LAS",
+                 "CLAS", "MLAS", "BLEX"]
+    mode = 'w' if newfile else 'a'
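+    # The f1-score TSV gets a header row ("Metrics" followed by the metric names) and
+    # then one row per cross-validation fold ("cv0", "cv1", ...) holding each metric's F1.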
+    # f1 score
+    with open(os.path.join(score_dir, name + '_trankit_f1score.tsv'), mode) as f:
+        if mode == 'w':
+            f.write('\t'.join(["Metrics"] + metric_ls) + '\n')
+
+        f.write('\t'.join([f"cv{cv_idx}"] + ["{}".format(score[metric].f1) for metric in metric_ls]) + "\n")
+
+    # more detailed scores
+    res = {}
+    for metric in metric_ls:
+        res[metric] = {
+            'precision': score[metric].precision,
+            'recall': score[metric].recall,
+            'f1': score[metric].f1}
+        if score[metric].aligned_accuracy is not None:
+            res[metric]['aligned_accuracy'] = score[metric].aligned_accuracy
+
+    with open(os.path.join(res_folder, name + '_score.json'), 'w') as f1:
+        json.dump(res, f1, indent=4)
+
+
+def posdep(res_folder, epoch, test_path, name='test_dep'):
+    print(name)
+    trainer = train_deprel_feat(res_folder, epoch)
+    score = test_deprel(trainer, test_path, name=name)
+    return score
+
+
+def copy_lemma_file(lemma_path, posdep_path):
+    # TODO: it may be better to combine the lemma and posdep outputs in the backend, where the UPOS is copied
+    print('copy lemma:', lemma_path)
+    lemma_txt = open(lemma_path).read().strip()
+    _, tmp = lemma_txt.split("sent_id ", 1)
+    lemmas = [t.split('\n') for t in ("# sent_id " + tmp).split('\n\n') if t]
+
+    lemma_dict = {}
+    for conllu in lemmas:
+        # every sentence begins with # sent_id
+        # TODO: look up the sent_id keyword instead of relying on the line index
+        key = conllu[0].split('=')[1].strip()
+        lemma_dict[key] = [line for line in conllu[1:] if line[0] != '#']
+
+    posdep_txt = open(posdep_path).read().strip()
+    # preamble holds everything before the first "# sent_id" comment; its trailing "# " is dropped below
+    preamble, tmp = posdep_txt.split("sent_id ", 1)
+    deprel = [t.split('\n') for t in ("# sent_id " + tmp).split('\n\n') if t]
+
+    posdep_dict = {}
+    for conllu in deprel:
+        key = conllu[0].split('=')[1].strip()
+        posdep_dict[key] = conllu[1:]
+
+    for key, conll in posdep_dict.items():
+        begin = 0
+        for l, line in enumerate(conll):
+            if line[0] != '#':
+                info = line.split('\t')
+                info_tag = lemma_dict[key][l - begin].split('\t')
+                # copy the predicted lemma into the LEMMA column of the posdep output
+                info[LEMMA] = info_tag[LEMMA]
+                posdep_dict[key][l] = '\t'.join(info)
+            else:
+                begin += 1
+        posdep_dict[key] = '\n'.join(posdep_dict[key])
+
+    to_write = preamble[:-2] + '\n\n'.join([f'# sent_id = {k}\n' + val for k, val in posdep_dict.items()]) + '\n\n'
+    with open(os.path.join(os.path.dirname(posdep_path), 'combined_parsed.conllu'), 'w') as f:
+        f.write(to_write)
+
+
+if __name__ == '__main__':
+    # pred_fpath = 'trankit/pred_test_lem.conllu'
+    if len(sys.argv) < 8:
+        print(len(sys.argv))
+        print("Usage: train_pred_trankit.py project_folder data_folder score_dir to_parse_path epoch epoch_tok cv_idx", file=sys.stderr)
+        sys.exit(-1)
+
+    # e.g. python3 train_pred_trankit.py trankit_res conllus scores conllus/test1000.conllu 100 40 0
+
+    # set parameters
+    res_folder = sys.argv[1]  # os.path.join(sys.argv[1], 'trankit_res')
+
+    train_path = os.path.join(sys.argv[2], 'train.conllu')
+    dev_path = os.path.join(sys.argv[2], 'dev.conllu')
+    dev_raw = os.path.join(sys.argv[2], 'dev_raw.txt')
+    train_raw = os.path.join(sys.argv[2], 'train_raw.txt')
+
+    score_dir = sys.argv[3]
+
+    to_parse_path = sys.argv[4]  # os.path.join(sys.argv[3], 'test1000.conllu')
+    epoch = int(sys.argv[5])
+    epoch_tok = int(sys.argv[6])
+    cv_idx = int(sys.argv[7])
+
+    # train & predict
+    lemma = True  # set to False to train and evaluate the posdep model only
+    if lemma:
+        train_trankit(res_folder, epoch, epoch_tok)
+        p = Pipeline(lang='customized', cache_dir=res_folder, embedding='xlm-roberta-large')
+
+        for task in ['posdep', 'lemmatize']:
+            print('==== pred for task ', task)
+            parsed_path = os.path.join(res_folder, f'parsed_{task}_test1000.conllu')
+            print(parsed_path)
+            print(to_parse_path)
+            pred_trankit(p, to_parse_path, parsed_path, task=task)
+
+            score = eval_parsed(parsed_path, to_parse_path)
+
+            new_fscore = (cv_idx == 0)
+            save_score(score, score_dir, res_folder, cv_idx, name=f'test_{task}', newfile=new_fscore)
+
+        # if False:
+        #     copy_lemma_file(
+        #         os.path.join(res_folder, 'parsed_lemmatize_test1000.conllu'),
+        #         os.path.join(res_folder, 'parsed_posdep_test1000.conllu')
+        #     )
+    else:
+        posdep(res_folder, epoch, to_parse_path, name='test')
diff --git a/trankit/adapter_transformers/file_utils.py b/trankit/adapter_transformers/file_utils.py
index e9f27e5..88aef17 100644
--- a/trankit/adapter_transformers/file_utils.py
+++ b/trankit/adapter_transformers/file_utils.py
@@ -252,7 +252,7 @@ def cached_path(
         # URL, so get it from the cache (downloading if necessary)
         output_path = get_from_cache(
             url_or_filename,
-            cache_dir=cache_dir,
+            cache_dir= "cache/xlm_roberta_model",
             force_download=force_download,
             proxies=proxies,
             resume_download=resume_download,
diff --git a/trankit/models/lemma_model.py b/trankit/models/lemma_model.py
index 29c73f3..28eecd0 100644
--- a/trankit/models/lemma_model.py
+++ b/trankit/models/lemma_model.py
@@ -383,7 +383,7 @@ def train(self):
                 type(token[ID]) == tuple and len(token[ID]) == 2)])
             dev_preds = self.trainer.predict_dict(
                 [[token[TEXT], token[UPOS]] for sentence in self.dev_batch.doc for token in sentence if
-                 not (type(token[ID]) == tuple and len(token[ID]) == 2)])
+                 not (type(token[ID]) == tuple and len(token[ID]) == 2) and LEMMA in token.keys()])
             self.dev_batch.doc = set_lemma(self.dev_batch.doc, dev_preds, training_mode=True)
             CoNLL.dict2conll(self.dev_batch.doc, self.system_pred_file)
             dev_f = get_ud_score(self.system_pred_file, self.gold_file)['Lemmas'].f1
diff --git a/trankit/pipeline.py b/trankit/pipeline.py
index 9a30f8c..6c69d70 100644
--- a/trankit/pipeline.py
+++ b/trankit/pipeline.py
@@ -666,6 +666,25 @@ def _tokenize_doc(self, in_doc):  # assuming input is a document
         torch.cuda.empty_cache()
         return doc
 
+    def posdep_withID(self, input, is_sent=False):
+        # input format: {ID: tok_id, TEXT: tok_text, EXPANDED: expanded word list if multiword token}
+        if is_sent:
+            if self.auto_mode:
+                self._detect_lang_and_switch(text=' '.join([tok[TEXT] for tok in input]))
+
+            # input = [{ID: k_w[0], TEXT: k_w[1]} for k_w in input]
+            return {TOKENS: self._posdep_sent(in_sent=input), LANG: self.active_lang}
+
+        else:
+            # switch to detected lang if auto mode is on
+            if self.auto_mode:
+                self._detect_lang_and_switch(text='\n'.join([' '.join([tok[TEXT] for tok in sent]) for sent in input]))
+
+            input = [{ID: sid + 1, TOKENS: sent} for sid, sent in
+                     enumerate(input)]
+
+            return {SENTENCES: self._posdep_doc(in_doc=input), LANG: self.active_lang}
+
     def posdep(self, input, is_sent=False):
         if is_sent:
             assert is_string(input) or is_list_strings(
@@ -860,6 +879,26 @@ def _posdep_doc(self, in_doc):  # assuming input is a document
         torch.cuda.empty_cache()
         return tagged_doc
 
+    def lemmatize_withID(self, input, is_sent=False):
+        # input format: list of (ID, text); ID can be a tuple for a multiword token
+        if is_sent:
+            if self.auto_mode:
+                self._detect_lang_and_switch(text=' '.join([tok[TEXT] for tok in input]))
+
+            # input = [{ID: k_w[0], TEXT: k_w[1]} for k_w in input]
+            return {TOKENS: self._lemmatize_sent(in_sent=input, obmit_tag=True), LANG: self.active_lang}
+
+        else:
+            # switch to detected lang if auto mode is on
+            if self.auto_mode:
+                self._detect_lang_and_switch(text='\n'.join([' '.join([tok[TEXT] for tok in sent]) for sent in input]))
+
+            input = [{ID: sid + 1, TOKENS: sent} for sid, sent in
+                     enumerate(input)]
+
+            return {SENTENCES: self._lemmatize_doc(in_doc=input, obmit_tag=True), LANG: self.active_lang}
+
+
     def lemmatize(self, input, is_sent=False):
         if is_sent:
             assert is_string(input) or is_list_strings(
diff --git a/trankit/tpipeline.py b/trankit/tpipeline.py
index 63fee30..f7c91c1 100644
--- a/trankit/tpipeline.py
+++ b/trankit/tpipeline.py
@@ -155,6 +155,7 @@ def _set_up_config(self, training_config):
             treebank_name = 'UD_Japanese-like' # use this special name to note that text is not split by spaces, similar to Japanese language.
         else:
             treebank_name = lang2treebank.get(self._lang, 'UD_{}-New'.format(self._lang))
+            tbname2training_id[treebank_name] = tbname2training_id[lang2treebank[self._lang]]
         lang2treebank[self._lang] = treebank_name
         treebank2lang[treebank_name] = self._lang
diff --git a/trankit/utils/conll.py b/trankit/utils/conll.py
index 28862b9..cf86fc0 100644
--- a/trankit/utils/conll.py
+++ b/trankit/utils/conll.py
@@ -113,8 +113,9 @@ def conll2dict(input_file=None, input_str=None, ignore_gapping=True):
         return doc_dict
 
     @staticmethod
-    def convert_dict(doc_dict):
-        """ Convert the dictionary format input data to the CoNLL-U format output data. This is the reverse function of
+    def convert_dict(doc_dict, use_expand=False):
+        """Modified: added the use_expand option so multiword (expanded) tokens in the pipeline.py results are written out.
+        Convert the dictionary format input data to the CoNLL-U format output data. This is the reverse function of
         `convert_conll`.
         Input: dictionary format data, which is a list of list of dictionaries for each token in each sentence in the data.
         Output: CoNLL-U format data, which is a list of list of list for each token in each sentence in the data.
@@ -125,6 +126,11 @@ def convert_dict(doc_dict):
             for token_dict in sent_dict:
                 token_conll = CoNLL.convert_token_dict(token_dict)
                 sent_conll.append(token_conll)
+
+                if type(token_dict[ID]) == tuple and use_expand:
+                    for tok_dict in token_dict[EXPANDED]:
+                        tok_conll = CoNLL.convert_token_dict(tok_dict)
+                        sent_conll.append(tok_conll)
             doc_conll.append(sent_conll)
         return doc_conll