diff --git a/api.py b/api.py index 99d752c..e7b6337 100644 --- a/api.py +++ b/api.py @@ -1,7 +1,7 @@ -#!/usr/bin/python +#!/usr/local/bin/python2.7 # -*- coding: utf-8 -*- -from lib.tokenize_rf import MultiColumnLabelEncoder, DataFrameSelector, lambda_underscore +#from lib.tokenize_rf import MultiColumnLabelEncoder, DataFrameSelector, lambda_underscore #Example call on localhost: #http://localhost/coptic-nlp/api.py?data=%E2%B2%81%CF%A5%E2%B2%A5%E2%B2%B1%E2%B2%A7%E2%B2%99%20%E2%B2%9B%CF%AD%E2%B2%93%E2%B2%A1%E2%B2%A3%E2%B2%B1%E2%B2%99%E2%B2%89&lb=line @@ -17,7 +17,10 @@ if "lb" in storage: line = storage.getvalue("lb") else: - line = "noline" + if "" in processed: + processed = processed.replace("","\n") print(processed.strip()) elif format == "sgml_no_parse": print("Content-Type: text/sgml; charset=UTF-8\n") @@ -37,12 +42,12 @@ if "|" in data: processed = nlp_coptic(data, lb=line=="line", parse_only=False, do_tok=True, do_norm=True, do_tag=True, do_lemma=True, do_lang=True, - do_milestone=True, do_parse=True, sgml_mode="sgml", + do_milestone=True, do_parse=("no_parse" not in format), sgml_mode="sgml", tok_mode="from_pipes", old_tokenizer=False) else: processed = nlp_coptic(data, lb=line=="line", parse_only=False, do_tok=True, do_norm=True, do_tag=True, do_lemma=True, do_lang=True, - do_milestone=True, do_parse=True, sgml_mode="sgml", + do_milestone=True, do_parse=("no_parse" not in format), sgml_mode="sgml", tok_mode="auto", old_tokenizer=False) print(processed.strip() + "\n") elif format != "conll": diff --git a/coptic_nlp.py b/coptic_nlp.py index ab7f2eb..7c64d8a 100644 --- a/coptic_nlp.py +++ b/coptic_nlp.py @@ -390,7 +390,7 @@ def download_requirements(tt_ok=True, malt_ok=True): def nlp_coptic(input_data, lb=False, parse_only=False, do_tok=True, do_norm=True, do_mwe=True, do_tag=True, do_lemma=True, do_lang=True, do_milestone=True, do_parse=True, sgml_mode="sgml", tok_mode="auto", old_tokenizer=False, sent_tag=None, - preloaded=None, pos_spans=False, merge_parse=False, detokenize=0): + preloaded=None, pos_spans=False, merge_parse=False, detokenize=0, segment_merged=False, gold_parse=""): data = input_data.replace("\t","") data = data.replace("\r","") @@ -399,7 +399,7 @@ def nlp_coptic(input_data, lb=False, parse_only=False, do_tok=True, do_norm=True stk = preloaded else: stk = StackedTokenizer(pipes=sgml_mode != "sgml", lines=lb, tokenized=tok_mode=="from_pipes", - detok=detokenize, segment_merged=opts.segment_merged) + detok=detokenize, segment_merged=segment_merged) if do_milestone: data = binarize(data) @@ -453,7 +453,7 @@ def nlp_coptic(input_data, lb=False, parse_only=False, do_tok=True, do_norm=True do_tag = True elif resp.lower() == "a": sys.exit(0) - if do_tag: + if do_tag and not pos_spans: tag = [tt_path+'tree-tagger', tt_path+'coptic_fine.par', '-token','-lemma','-no-unknown', '-sgml' ,'tempfilename'] #no -token tagged = exec_via_temp(norms,tag) tagged = re.sub('\r','',tagged) @@ -464,13 +464,20 @@ def nlp_coptic(input_data, lb=False, parse_only=False, do_tok=True, do_norm=True tagged = input_data if PY3: tagged = input_data.encode("utf8") # Handle non-UTF-8 when calling TT from subprocess in Python 3 - conllized = conllize(tagged,tag="PUNCT",element=sent_tag, no_zero=True) # NB element is present it supercedes the POS tag - deped = DepEdit(io.open(data_dir + "add_ud_and_flat_morph.ini",encoding="utf8"),options=type('', (), {"quiet":True})()) - depedited = deped.run_depedit(conllized.split("\n")) - parse_coptic = ['java','-mx512m','-jar',"maltparser-1.8.jar",'-c','coptic','-i','tempfilename','-m','parse'] - parsed = exec_via_temp(depedited,parse_coptic,parser_path) - deped = DepEdit(io.open(data_dir + "parser_postprocess_nodom.ini",encoding="utf8"),options=type('', (), {"quiet":True})()) - depedited = deped.run_depedit(parsed.split("\n")) + if gold_parse == "": + conllized = conllize(tagged,tag="PUNCT",element=sent_tag, no_zero=True) # NB element is present it supercedes the POS tag + deped = DepEdit(io.open(data_dir + "add_ud_and_flat_morph.ini",encoding="utf8"),options=type('', (), {"quiet":True})()) + depedited = deped.run_depedit(conllized.split("\n")) + parse_coptic = ['java','-mx512m','-jar',"maltparser-1.8.jar",'-c','coptic','-i','tempfilename','-m','parse'] + parsed = exec_via_temp(depedited,parse_coptic,parser_path) + deped = DepEdit(io.open(data_dir + "parser_postprocess_nodom.ini",encoding="utf8"),options=type('', (), {"quiet":True})()) + depedited = deped.run_depedit(parsed.split("\n")) + else: # A cached gold parse has been specified + depedited = gold_parse + norm_count = len(re.findall(r'(\n|^)[0-9]+\t',depedited)) + input_norms = input_data.count(" norm=") + if norm_count != input_norms: + raise IOError("Mismatch in word count: " + str(norm_count) + " in gold parse but " + str(input_norms) + " in SGML file\n") if parse_only: # Output parse in conll format return depedited elif merge_parse: # Insert parse into input SGML as attributes of @@ -718,7 +725,8 @@ def nlp_coptic(input_data, lb=False, parse_only=False, do_tok=True, do_norm=True do_norm=opts.norm, do_mwe=opts.multiword, do_tag=opts.tag, do_lemma=opts.lemma, do_lang=opts.etym, do_milestone=opts.unary, do_parse=opts.parse, sgml_mode=opts.outmode, tok_mode="auto", old_tokenizer=old_tokenizer, sent_tag=opts.sent, preloaded=stk, - pos_spans=opts.pos_spans, merge_parse=opts.merge_parse, detokenize=opts.detokenize) + pos_spans=opts.pos_spans, merge_parse=opts.merge_parse, detokenize=opts.detokenize, + segment_merged=opts.segment_merged) if opts.outmode == "sgml": processed = reorder(processed.strip().split("\n"),add_fixed_meta=add_fixed_meta) diff --git a/index.py b/index.py index 7bdc2b3..bb88c98 100644 --- a/index.py +++ b/index.py @@ -4,7 +4,6 @@ import cgitb cgitb.enable() -from lib.tokenize_rf import MultiColumnLabelEncoder, DataFrameSelector, lambda_underscore from nlp_form import make_nlp_form print("Content-Type: text/html\n\n\n") diff --git a/lib/stacked_tokenizer.py b/lib/stacked_tokenizer.py index c7394b8..be50e6c 100644 --- a/lib/stacked_tokenizer.py +++ b/lib/stacked_tokenizer.py @@ -32,7 +32,7 @@ sys.path.append("lib") -class BoundGroup(): +class BoundGroup: # Static list of characters that are removed from norm text (basic normalization) orig_chars = ["̈", "", "̄", "̀", "̣", "`", "̅", "̈", "̂", "︤", "︥", "︦", "⳿", "~", "\n", "[", "]", "̇", "᷍"] @@ -324,6 +324,12 @@ def serialize(groups,pipes=False,group_sep="_",tok_sep="|",segment_merged=False) return out_text +def adjust_theta(tokenization): + """Post-edit pre-tokenization in 'from pipes' mode to account for theta boundaries""" + tokenization = tokenization.replace("ⲑ|","ⲧ|ϩ").replace("ⲑ-","ⲧ-ϩ") + return tokenization + + class StackedTokenizer: def __init__(self,lines=False,pipes=False,tokenized=False,no_morphs=False,detok=0,segment_merged=False,model="cop"): @@ -426,6 +432,7 @@ def analyze(self,data): for g in grps: # plain_tokenization = g.norm.replace("□","|").replace("■","-") plain_tokenization = g.pretokenization + plain_tokenization = adjust_theta(plain_tokenization) g.orig = g.orig.replace("□", "").replace("■", "") g.norm = g.norm.replace("□", "").replace("■", "") # g.dirty = g.dirty.replace("□","").replace("■","") diff --git a/nlp_form.html b/nlp_form.html index 8547987..3476076 100644 --- a/nlp_form.html +++ b/nlp_form.html @@ -40,14 +40,51 @@

Input:


Output:

- +
Use old finite state tokenizer +
Use old finite state tokenizer - Less accurate, provided for reproducing older results.
+ Less accurate, provided for reproducing older results. Not compatible with detokenization.

Re-merge bound groups + + + + + Regularizes bound group spaces if input does not follow Layton's guidelines
+ (a.k.a. 'Laytonization'; increases accuracy on Till-segmented text and OCR) +
+

+ +
SGML pipeline
    @@ -160,6 +197,31 @@

    Output:

    if (document.querySelector('input[name="sgml_mode"]:checked').value == "pipes"){ disable_checkboxes(true); } + + function toggle_laytonize(laytonize_on){ + if (laytonize_on){ + document.getElementById("old_tok").checked = false; + } + else{ + document.getElementById("detokenize").checked = false; + } + document.getElementById("norm").disabled = laytonize_on; + if (document.getElementById("detokenize").checked){ + document.getElementById("laytonize1").disabled = false; + document.getElementById("laytonize1").checked = true; + document.getElementById("laytonize2").disabled = false; + document.getElementById("segment_merged").disabled = false; + document.getElementById("segment_merged").checked = true; + } + else{ + document.getElementById("laytonize1").disabled = true; + document.getElementById("laytonize1").checked = false; + document.getElementById("laytonize2").disabled = true; + document.getElementById("laytonize2").checked = false; + document.getElementById("segment_merged").disabled = true; + document.getElementById("segment_merged").checked = false; + } + } diff --git a/nlp_form.py b/nlp_form.py index 292abf3..0f6a4e4 100644 --- a/nlp_form.py +++ b/nlp_form.py @@ -66,6 +66,8 @@ def make_nlp_form(access_level, mode): do_lemma = True do_tag = True do_parse = True + detok = 0 + segment_merged = False do_tok = True do_norm = True do_mwe = True @@ -85,6 +87,13 @@ def make_nlp_form(access_level, mode): do_parse = form.getvalue("parse") is not None do_norm = form.getvalue("norm") is not None do_mwe = form.getvalue("mwe") is not None + if form.getvalue("laytonize") == "aggressive": + detok = 2 + elif form.getvalue("laytonize") == "conservative": + detok = 1 + else: + detok = 0 + segment_merged = form.getvalue("segment_merged") is not None do_tok = form.getvalue("tok") is not None do_lang = form.getvalue("lang") is not None if sgml_mode == "pipes": @@ -95,7 +104,8 @@ def make_nlp_form(access_level, mode): else: processed = nlp_coptic(data,lb=lb=="line",parse_only=False,do_tok=do_tok,do_norm=do_norm,do_mwe=do_mwe, do_tag=do_tag, do_lemma=do_lemma,do_lang=do_lang,do_milestone=do_milestone, - do_parse=do_parse, sgml_mode=sgml_mode,tok_mode=tok_mode,old_tokenizer=old_tok) + do_parse=do_parse, sgml_mode=sgml_mode,tok_mode=tok_mode,old_tokenizer=old_tok, + detokenize=detok, segment_merged=segment_merged) processed = processed.strip() ### @@ -115,6 +125,10 @@ def make_nlp_form(access_level, mode): noline_checked = ' checked="checked"' if lb else "" tok_checked = ' checked="checked"' if do_tok else "" old_checked = ' checked="checked"' if old_tok else "" + segment_merged_checked = ' checked="checked"' if segment_merged else "" + detokenize_checked = ' checked="checked"' if detok > 0 else "" + laytonize_conservative_checked = ' checked="checked"' if detok == 1 else "" + laytonize_aggressive_checked = ' checked="checked"' if detok == 2 else "" auto_checked = ' checked="checked"' if tok_mode == "auto" else "" pipes_checked = ' checked="checked"' if tok_mode == "from_pipes" else "" norm_checked = ' checked="checked"' if do_norm else "" @@ -154,6 +168,10 @@ def make_nlp_form(access_level, mode): template = template.replace("**old_checked**", old_checked) template = template.replace("**milestone_checked**", milestone_checked) template = template.replace("**tok_checked**", tok_checked) + template = template.replace("**detokenize_checked**", detokenize_checked) + template = template.replace("**laytonize_conservative_checked**", laytonize_conservative_checked) + template = template.replace("**laytonize_aggressive_checked**", laytonize_aggressive_checked) + template = template.replace("**segment_merged_checked**", segment_merged_checked) template = template.replace("**auto_checked**", auto_checked) template = template.replace("**pipes_checked**", pipes_checked) template = template.replace("**norm_checked**", norm_checked)