Skip to content

Commit

Permalink
Merge pull request #18 from CopticScriptorium/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
amir-zeldes authored Jun 17, 2019
2 parents 96e9077 + 5705743 commit a35476b
Show file tree
Hide file tree
Showing 6 changed files with 121 additions and 22 deletions.
17 changes: 11 additions & 6 deletions api.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/python
#!/usr/local/bin/python2.7
# -*- coding: utf-8 -*-

from lib.tokenize_rf import MultiColumnLabelEncoder, DataFrameSelector, lambda_underscore
#from lib.tokenize_rf import MultiColumnLabelEncoder, DataFrameSelector, lambda_underscore

#Example call on localhost:
#http://localhost/coptic-nlp/api.py?data=%E2%B2%81%CF%A5%E2%B2%A5%E2%B2%B1%E2%B2%A7%E2%B2%99%20%E2%B2%9B%CF%AD%E2%B2%93%E2%B2%A1%E2%B2%A3%E2%B2%B1%E2%B2%99%E2%B2%89&lb=line
Expand All @@ -17,7 +17,10 @@
if "lb" in storage:
line = storage.getvalue("lb")
else:
line = "noline"
if "<lb" in data:
line = "noline"
else:
line = "line"

if "format" in storage:
format = storage.getvalue("format")
Expand All @@ -29,20 +32,22 @@

if format == "pipes":
print("Content-Type: text/plain; charset=UTF-8\n")
processed = nlp_coptic(data,line,sgml_mode="pipes",do_tok=True)
processed = nlp_coptic(data,lb=line=="line",sgml_mode="pipes",do_tok=True)
if "</lb>" in processed:
processed = processed.replace("</lb>","</lb>\n")
print(processed.strip())
elif format == "sgml_no_parse":
print("Content-Type: text/sgml; charset=UTF-8\n")
# secure call, note that htaccess prevents this running without authentication
if "|" in data:
processed = nlp_coptic(data, lb=line=="line", parse_only=False, do_tok=True,
do_norm=True, do_tag=True, do_lemma=True, do_lang=True,
do_milestone=True, do_parse=True, sgml_mode="sgml",
do_milestone=True, do_parse=("no_parse" not in format), sgml_mode="sgml",
tok_mode="from_pipes", old_tokenizer=False)
else:
processed = nlp_coptic(data, lb=line=="line", parse_only=False, do_tok=True,
do_norm=True, do_tag=True, do_lemma=True, do_lang=True,
do_milestone=True, do_parse=True, sgml_mode="sgml",
do_milestone=True, do_parse=("no_parse" not in format), sgml_mode="sgml",
tok_mode="auto", old_tokenizer=False)
print(processed.strip() + "\n")
elif format != "conll":
Expand Down
30 changes: 19 additions & 11 deletions coptic_nlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,7 @@ def download_requirements(tt_ok=True, malt_ok=True):

def nlp_coptic(input_data, lb=False, parse_only=False, do_tok=True, do_norm=True, do_mwe=True, do_tag=True, do_lemma=True, do_lang=True,
do_milestone=True, do_parse=True, sgml_mode="sgml", tok_mode="auto", old_tokenizer=False, sent_tag=None,
preloaded=None, pos_spans=False, merge_parse=False, detokenize=0):
preloaded=None, pos_spans=False, merge_parse=False, detokenize=0, segment_merged=False, gold_parse=""):

data = input_data.replace("\t","")
data = data.replace("\r","")
Expand All @@ -399,7 +399,7 @@ def nlp_coptic(input_data, lb=False, parse_only=False, do_tok=True, do_norm=True
stk = preloaded
else:
stk = StackedTokenizer(pipes=sgml_mode != "sgml", lines=lb, tokenized=tok_mode=="from_pipes",
detok=detokenize, segment_merged=opts.segment_merged)
detok=detokenize, segment_merged=segment_merged)

if do_milestone:
data = binarize(data)
Expand Down Expand Up @@ -453,7 +453,7 @@ def nlp_coptic(input_data, lb=False, parse_only=False, do_tok=True, do_norm=True
do_tag = True
elif resp.lower() == "a":
sys.exit(0)
if do_tag:
if do_tag and not pos_spans:
tag = [tt_path+'tree-tagger', tt_path+'coptic_fine.par', '-token','-lemma','-no-unknown', '-sgml' ,'tempfilename'] #no -token
tagged = exec_via_temp(norms,tag)
tagged = re.sub('\r','',tagged)
Expand All @@ -464,13 +464,20 @@ def nlp_coptic(input_data, lb=False, parse_only=False, do_tok=True, do_norm=True
tagged = input_data
if PY3:
tagged = input_data.encode("utf8") # Handle non-UTF-8 when calling TT from subprocess in Python 3
	conllized = conllize(tagged,tag="PUNCT",element=sent_tag, no_zero=True) # NB if element is present it supersedes the POS tag
deped = DepEdit(io.open(data_dir + "add_ud_and_flat_morph.ini",encoding="utf8"),options=type('', (), {"quiet":True})())
depedited = deped.run_depedit(conllized.split("\n"))
parse_coptic = ['java','-mx512m','-jar',"maltparser-1.8.jar",'-c','coptic','-i','tempfilename','-m','parse']
parsed = exec_via_temp(depedited,parse_coptic,parser_path)
deped = DepEdit(io.open(data_dir + "parser_postprocess_nodom.ini",encoding="utf8"),options=type('', (), {"quiet":True})())
depedited = deped.run_depedit(parsed.split("\n"))
if gold_parse == "":
		conllized = conllize(tagged,tag="PUNCT",element=sent_tag, no_zero=True) # NB if element is present it supersedes the POS tag
deped = DepEdit(io.open(data_dir + "add_ud_and_flat_morph.ini",encoding="utf8"),options=type('', (), {"quiet":True})())
depedited = deped.run_depedit(conllized.split("\n"))
parse_coptic = ['java','-mx512m','-jar',"maltparser-1.8.jar",'-c','coptic','-i','tempfilename','-m','parse']
parsed = exec_via_temp(depedited,parse_coptic,parser_path)
deped = DepEdit(io.open(data_dir + "parser_postprocess_nodom.ini",encoding="utf8"),options=type('', (), {"quiet":True})())
depedited = deped.run_depedit(parsed.split("\n"))
else: # A cached gold parse has been specified
depedited = gold_parse
norm_count = len(re.findall(r'(\n|^)[0-9]+\t',depedited))
input_norms = input_data.count(" norm=")
if norm_count != input_norms:
raise IOError("Mismatch in word count: " + str(norm_count) + " in gold parse but " + str(input_norms) + " in SGML file\n")
if parse_only: # Output parse in conll format
return depedited
elif merge_parse: # Insert parse into input SGML as attributes of <norm>
Expand Down Expand Up @@ -718,7 +725,8 @@ def nlp_coptic(input_data, lb=False, parse_only=False, do_tok=True, do_norm=True
do_norm=opts.norm, do_mwe=opts.multiword, do_tag=opts.tag, do_lemma=opts.lemma,
do_lang=opts.etym, do_milestone=opts.unary, do_parse=opts.parse, sgml_mode=opts.outmode,
tok_mode="auto", old_tokenizer=old_tokenizer, sent_tag=opts.sent, preloaded=stk,
pos_spans=opts.pos_spans, merge_parse=opts.merge_parse, detokenize=opts.detokenize)
pos_spans=opts.pos_spans, merge_parse=opts.merge_parse, detokenize=opts.detokenize,
segment_merged=opts.segment_merged)

if opts.outmode == "sgml":
processed = reorder(processed.strip().split("\n"),add_fixed_meta=add_fixed_meta)
Expand Down
1 change: 0 additions & 1 deletion index.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import cgitb
cgitb.enable()

from lib.tokenize_rf import MultiColumnLabelEncoder, DataFrameSelector, lambda_underscore
from nlp_form import make_nlp_form

print("Content-Type: text/html\n\n\n")
Expand Down
9 changes: 8 additions & 1 deletion lib/stacked_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@

sys.path.append("lib")

class BoundGroup():
class BoundGroup:

# Static list of characters that are removed from norm text (basic normalization)
orig_chars = ["̈", "", "̄", "̀", "̣", "`", "̅", "̈", "̂", "︤", "︥", "︦", "⳿", "~", "\n", "[", "]", "̇", "᷍"]
Expand Down Expand Up @@ -324,6 +324,12 @@ def serialize(groups,pipes=False,group_sep="_",tok_sep="|",segment_merged=False)
return out_text


def adjust_theta(tokenization):
    """Post-edit pre-tokenization in 'from pipes' mode to account for theta boundaries"""
    # A theta straddling a token (|) or morph (-) boundary represents an
    # underlying /t/ + /h/; rewrite it so each side keeps its own consonant.
    for fused, expanded in (("ⲑ|", "ⲧ|ϩ"), ("ⲑ-", "ⲧ-ϩ")):
        tokenization = tokenization.replace(fused, expanded)
    return tokenization


class StackedTokenizer:

def __init__(self,lines=False,pipes=False,tokenized=False,no_morphs=False,detok=0,segment_merged=False,model="cop"):
Expand Down Expand Up @@ -426,6 +432,7 @@ def analyze(self,data):
for g in grps:
# plain_tokenization = g.norm.replace("□","|").replace("■","-")
plain_tokenization = g.pretokenization
plain_tokenization = adjust_theta(plain_tokenization)
g.orig = g.orig.replace("□", "").replace("■", "")
g.norm = g.norm.replace("□", "").replace("■", "")
# g.dirty = g.dirty.replace("□","").replace("■","")
Expand Down
66 changes: 64 additions & 2 deletions nlp_form.html
Original file line number Diff line number Diff line change
Expand Up @@ -40,14 +40,51 @@ <h3 class="nlp_title">Input:</h3>
<br/>
<h3 class="nlp_title">Output:</h3>
<table>
<tr><td colspan="2" style="padding-bottom: 10px"><input type="checkbox" name="old_tok" value="old_tok"**old_checked**>Use old finite state tokenizer
<tr><td colspan="2" style="padding-bottom: 10px"><input type="checkbox" id="old_tok" name="old_tok" value="old_tok" onclick="toggle_laytonize(false);"**old_checked**>Use old finite state tokenizer
<a href="#" class="tooltip2">
<i class="fa fa-info-circle" style="display: inline-block"></i>
<span>
<img class="callout" src="img/callout.gif" />
Less accurate, provided for reproducing older results.<br/>
Less accurate, provided for reproducing older results. Not compatible with detokenization.<br/>
</span>
</a></input><br/></td></tr>
<tr><td colspan="2" style="padding-bottom: 10px"><input type="checkbox" id="detokenize" name="detokenize" value="detokenize" onclick="toggle_laytonize(true);"**detokenize_checked**>Re-merge bound groups
<a href="#" class="tooltip2">
<i class="fa fa-info-circle" style="display: inline-block"></i>
<span>
<img class="callout" src="img/callout.gif" />
Regularizes bound group spaces if input does not follow Layton's guidelines<br/>
(a.k.a. 'Laytonization'; increases accuracy on Till-segmented text and OCR)
</span>
</a></input><br/>
<ul>
<input type="radio" id="laytonize1" name="laytonize" value="conservative"**laytonize_conservative_checked**>Conservative merging<a href="#" class="tooltip2">
<i class="fa fa-info-circle" style="display: inline-block"></i>
<span>
<img class="callout" src="img/callout.gif" />
Only re-bind items known to appear unbound in other segmentations <br/>(e.g. well edited text following Till)<br/>
<div style="font-family: Antinoou; text-align:right; margin-bottom: 0px; font-weight: bold"><br/>ϩⲙ ⲡⲏⲓ --&gt; ϩⲙ|ⲡ|ⲏⲓ</div>
</span>
</a></input><br/>
<input type="radio" id="laytonize2" name="laytonize" value="aggressive"**laytonize_aggressive_checked**>Aggressive merging<a href="#" class="tooltip2">
<i class="fa fa-info-circle" style="display: inline-block"></i>
<span>
<img class="callout" src="img/callout.gif" />
Re-bind all items that are unlikely to appear unbound <br/>(better for messy data/OCR)<br/>
<div style="font-family: Antinoou; text-align:right; margin-bottom: 0px; font-weight: bold"><br/>ⲁ ϥⲥⲱⲧⲙ --&gt; ⲁ|ϥ|ⲥⲱⲧⲙ</div>
</span>
</a></input><br/>
<input type="checkbox" id="segment_merged" name="segment_merged" value="segment_merged"**segment_merged_checked**>Segment at merge point
<a href="#" class="tooltip2">
<i class="fa fa-info-circle" style="display: inline-block"></i>
<span>
<img class="callout" src="img/callout.gif" />
If bound groups are merged, assume a morpheme boundary <br/>
(recommended if base segmentation is reliable)
</span>
</a></input><br/>
</ul>
</td></tr>
<tr><td>
<input type="radio" name="sgml_mode" value="sgml" onclick="disable_checkboxes(false);"**sgml_checked**>SGML pipeline</input><br/>
<ul>
Expand Down Expand Up @@ -160,6 +197,31 @@ <h3 class="nlp_title">Output:</h3>
if (document.querySelector('input[name="sgml_mode"]:checked').value == "pipes"){
disable_checkboxes(true);
}

// Keep the 'old finite state tokenizer' and 'Re-merge bound groups'
// (Laytonization) checkboxes mutually exclusive, and enable/disable the
// Laytonization sub-options to match the detokenize checkbox state.
// @param laytonize_on - true when invoked from the detokenize checkbox,
//                       false when invoked from the old_tok checkbox.
function toggle_laytonize(laytonize_on){
	if (laytonize_on){
		// Detokenize was toggled: old tokenizer is incompatible, clear it.
		document.getElementById("old_tok").checked = false;
	}
	else{
		// Old tokenizer was toggled: detokenization is incompatible, clear it.
		document.getElementById("detokenize").checked = false;
	}
	// NOTE(review): 'norm' is disabled whenever this runs with laytonize_on,
	// even if the detokenize box ended up unchecked — confirm intended.
	document.getElementById("norm").disabled = laytonize_on;
	if (document.getElementById("detokenize").checked){
		// Detokenization active: expose sub-options with defaults
		// (conservative merging + segment-at-merge-point preselected).
		document.getElementById("laytonize1").disabled = false;
		document.getElementById("laytonize1").checked = true;
		document.getElementById("laytonize2").disabled = false;
		document.getElementById("segment_merged").disabled = false;
		document.getElementById("segment_merged").checked = true;
	}
	else{
		// Detokenization off: grey out and clear all sub-options so stale
		// selections are not submitted with the form.
		document.getElementById("laytonize1").disabled = true;
		document.getElementById("laytonize1").checked = false;
		document.getElementById("laytonize2").disabled = true;
		document.getElementById("laytonize2").checked = false;
		document.getElementById("segment_merged").disabled = true;
		document.getElementById("segment_merged").checked = false;
	}
}
</script>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js"></script>
<script src="https://cdn.jsdelivr.net/bxslider/4.2.12/jquery.bxslider.min.js"></script>
Expand Down
20 changes: 19 additions & 1 deletion nlp_form.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ def make_nlp_form(access_level, mode):
do_lemma = True
do_tag = True
do_parse = True
detok = 0
segment_merged = False
do_tok = True
do_norm = True
do_mwe = True
Expand All @@ -85,6 +87,13 @@ def make_nlp_form(access_level, mode):
do_parse = form.getvalue("parse") is not None
do_norm = form.getvalue("norm") is not None
do_mwe = form.getvalue("mwe") is not None
if form.getvalue("laytonize") == "aggressive":
detok = 2
elif form.getvalue("laytonize") == "conservative":
detok = 1
else:
detok = 0
segment_merged = form.getvalue("segment_merged") is not None
do_tok = form.getvalue("tok") is not None
do_lang = form.getvalue("lang") is not None
if sgml_mode == "pipes":
Expand All @@ -95,7 +104,8 @@ def make_nlp_form(access_level, mode):
else:
processed = nlp_coptic(data,lb=lb=="line",parse_only=False,do_tok=do_tok,do_norm=do_norm,do_mwe=do_mwe,
do_tag=do_tag, do_lemma=do_lemma,do_lang=do_lang,do_milestone=do_milestone,
do_parse=do_parse, sgml_mode=sgml_mode,tok_mode=tok_mode,old_tokenizer=old_tok)
do_parse=do_parse, sgml_mode=sgml_mode,tok_mode=tok_mode,old_tokenizer=old_tok,
detokenize=detok, segment_merged=segment_merged)
processed = processed.strip()

###
Expand All @@ -115,6 +125,10 @@ def make_nlp_form(access_level, mode):
noline_checked = ' checked="checked"' if lb else ""
tok_checked = ' checked="checked"' if do_tok else ""
old_checked = ' checked="checked"' if old_tok else ""
segment_merged_checked = ' checked="checked"' if segment_merged else ""
detokenize_checked = ' checked="checked"' if detok > 0 else ""
laytonize_conservative_checked = ' checked="checked"' if detok == 1 else ""
laytonize_aggressive_checked = ' checked="checked"' if detok == 2 else ""
auto_checked = ' checked="checked"' if tok_mode == "auto" else ""
pipes_checked = ' checked="checked"' if tok_mode == "from_pipes" else ""
norm_checked = ' checked="checked"' if do_norm else ""
Expand Down Expand Up @@ -154,6 +168,10 @@ def make_nlp_form(access_level, mode):
template = template.replace("**old_checked**", old_checked)
template = template.replace("**milestone_checked**", milestone_checked)
template = template.replace("**tok_checked**", tok_checked)
template = template.replace("**detokenize_checked**", detokenize_checked)
template = template.replace("**laytonize_conservative_checked**", laytonize_conservative_checked)
template = template.replace("**laytonize_aggressive_checked**", laytonize_aggressive_checked)
template = template.replace("**segment_merged_checked**", segment_merged_checked)
template = template.replace("**auto_checked**", auto_checked)
template = template.replace("**pipes_checked**", pipes_checked)
template = template.replace("**norm_checked**", norm_checked)
Expand Down

0 comments on commit a35476b

Please sign in to comment.