From 857edaf3844daedee3cd7f7e90f0f0f06e483058 Mon Sep 17 00:00:00 2001
From: lauren-lizzy-levine
Date: Sun, 28 Apr 2024 14:49:03 -0400
Subject: [PATCH 1/3] adding warning for entity span not being a constituent

---
 _build/utils/repair_tsv.py | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/_build/utils/repair_tsv.py b/_build/utils/repair_tsv.py
index ffc587a23..8d0704f8e 100644
--- a/_build/utils/repair_tsv.py
+++ b/_build/utils/repair_tsv.py
@@ -550,7 +550,7 @@ def fix_genitive_s(tsv_path, xml_path, warn_only=True, outdir=None, string_input
 
     ### end genitive s fix
 
-def adjust_edges(webanno_tsv, parsed_lines, ent_mappings, single_tok_mappings, single_coref_type=False):
+def adjust_edges(webanno_tsv, parsed_lines, ent_mappings, single_tok_mappings, filename, single_coref_type=False):
     """
     Fix webanno TSV in several ways:
 
     * All edges pointing back from a pronoun receive type 'ana'
@@ -900,6 +900,35 @@ def get_min(toks):
             if start != end:
                 closer_lists[end].append(str(group) + ")")
 
+        # Validate that entity spans are constituents
+        outside_head = []
+        token_indexes = []
+        token_heads = []
+        for token in ent["toks"]:
+            token_indexes.append(token[0])
+            token_heads.append(token[1])
+        for i in range(len(ent["toks"])):
+            outside_head.append(0)
+            if token_heads[i] not in token_indexes and token_heads[i] is not None:
+                outside_head[i] = 1
+        if sum(outside_head) > 1:
+            print("WARN: entity " + str(e_id) + " is not a constituent: ")
+            print("\tDocument: " + filename.split("/")[-1])
+            ent_str = ""
+            for j, tok in enumerate(ent["toks"]):
+                if outside_head[j] == 1:
+                    ent_str += "[[" + tok[4] + "]]" + " "
+                else:
+                    ent_str += tok[4] + " "
+            ent_str = ent_str[:-1]
+            print("\tEntity Span: " + ent_str)
+            sent_text = ""
+            for tok in parsed_lines:
+                if tok["token_id"].split("-")[0] == ent["sid"]:
+                    sent_text += tok["token"] + " "
+            sent_text = sent_text[:-1]
+            print("\tSentence Context: " + sent_text + "\n")
+
     conllua_data = []
     for i in range(len(tokens)):
         if i+1 in opener_lists or i+1 in closer_lists:
@@ -1247,7 +1276,7 @@ def fix_file(filename, tt_file, outdir, genitive_s=False):
         output += "\n".join(out_lines) + "\n"
 
     parsed_lines, entity_mappings, single_tok_mappings = fix_genitive_s(output, tt_file, warn_only=True, string_input=True)
-    output, conllua_data, centering_transitions, group_saliences = adjust_edges(output, parsed_lines, entity_mappings, single_tok_mappings)
+    output, conllua_data, centering_transitions, group_saliences = adjust_edges(output, parsed_lines, entity_mappings, single_tok_mappings, filename)
 
     centering_doc_data = defaultdict(lambda: "no-ent")
     # Set missing transitions

From 73b3cf8312be650f22efd7628ef35fb4c261a804 Mon Sep 17 00:00:00 2001
From: lauren-lizzy-levine
Date: Sat, 4 May 2024 00:36:29 -0400
Subject: [PATCH 2/3] reformatting warning

---
 _build/utils/repair_tsv.py | 32 ++++++++++++++++++++++----------
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/_build/utils/repair_tsv.py b/_build/utils/repair_tsv.py
index 8d0704f8e..4d2cf9d1e 100644
--- a/_build/utils/repair_tsv.py
+++ b/_build/utils/repair_tsv.py
@@ -912,22 +912,34 @@ def get_min(toks):
             if token_heads[i] not in token_indexes and token_heads[i] is not None:
                 outside_head[i] = 1
         if sum(outside_head) > 1:
-            print("WARN: entity " + str(e_id) + " is not a constituent: ")
-            print("\tDocument: " + filename.split("/")[-1])
+            #print("\tDocument: " + filename.split("/")[-1])
             ent_str = ""
             for j, tok in enumerate(ent["toks"]):
                 if outside_head[j] == 1:
-                    ent_str += "[[" + tok[4] + "]]" + " "
+                    ent_str += "*" + tok[4] + "*" + " "
                 else:
                     ent_str += tok[4] + " "
-            ent_str = ent_str[:-1]
-            print("\tEntity Span: " + ent_str)
-            sent_text = ""
-            for tok in parsed_lines:
+            ent_str = "[" + ent_str[:-1] + "]"
+            #print("\tEntity Span: " + ent_str)
+            start_context = parsed_lines[max(ent["start"] - 5, 0):ent["start"]]
+            end_context = parsed_lines[ent["end"] + 1:min(ent["end"] + 6, len(parsed_lines))]
+            combined_text = ""
+            for tok in start_context:
                 if tok["token_id"].split("-")[0] == ent["sid"]:
-                    sent_text += tok["token"] + " "
-            sent_text = sent_text[:-1]
-            print("\tSentence Context: " + sent_text + "\n")
+                    combined_text += tok["token"] + " "
+            combined_text += ent_str + " "
+            for tok in end_context:
+                if tok["token_id"].split("-")[0] == ent["sid"]:
+                    combined_text += tok["token"] + " "
+            combined_text = combined_text[:-1]
+            #print("Combined:", combined_text)
+            print("WARN: non-constituent entity (" + filename.split("/")[-1] + "): " + combined_text)
+            #sent_text = ""
+            #for tok in parsed_lines:
+            #    if tok["token_id"].split("-")[0] == ent["sid"]:
+            #        sent_text += tok["token"] + " "
+            #sent_text = sent_text[:-1]
+            #print("\tSentence Context: " + sent_text + "\n")
 
     conllua_data = []
     for i in range(len(tokens)):

From bf5ea70434ca94ee871ae05a0df3b8e42cc006f9 Mon Sep 17 00:00:00 2001
From: lauren-lizzy-levine
Date: Sat, 4 May 2024 10:41:01 -0400
Subject: [PATCH 3/3] adding date filtering for entity warnings

---
 _build/utils/repair_tsv.py | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/_build/utils/repair_tsv.py b/_build/utils/repair_tsv.py
index 4d2cf9d1e..d2dd5d491 100644
--- a/_build/utils/repair_tsv.py
+++ b/_build/utils/repair_tsv.py
@@ -901,7 +901,7 @@ def get_min(toks):
                 closer_lists[end].append(str(group) + ")")
 
         # Validate that entity spans are constituents
-        outside_head = []
+        outside_head = []  # tally of which tokens have head outside of the entity span
         token_indexes = []
         token_heads = []
         for token in ent["toks"]:
@@ -911,8 +911,16 @@ def get_min(toks):
             outside_head.append(0)
             if token_heads[i] not in token_indexes and token_heads[i] is not None:
                 outside_head[i] = 1
-        if sum(outside_head) > 1:
-            #print("\tDocument: " + filename.split("/")[-1])
+
+        # skip dates
+        skip = False
+        months = ["january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"]
+        # first token is month, second token is 4 digit number
+        if len(ent["toks"]) == 2 and ent["toks"][0][4].lower() in months \
+                and len(ent["toks"][1][4]) == 4 and ent["toks"][1][2] == "CD":
+            skip = True
+
+        if sum(outside_head) > 1 and not skip:
             ent_str = ""
             for j, tok in enumerate(ent["toks"]):
                 if outside_head[j] == 1:
@@ -920,7 +928,7 @@ def get_min(toks):
                 else:
                     ent_str += tok[4] + " "
             ent_str = "[" + ent_str[:-1] + "]"
-            #print("\tEntity Span: " + ent_str)
+
             start_context = parsed_lines[max(ent["start"] - 5, 0):ent["start"]]
             end_context = parsed_lines[ent["end"] + 1:min(ent["end"] + 6, len(parsed_lines))]
             combined_text = ""
             for tok in start_context:
@@ -932,14 +940,8 @@ def get_min(toks):
                 if tok["token_id"].split("-")[0] == ent["sid"]:
                     combined_text += tok["token"] + " "
             combined_text = combined_text[:-1]
-            #print("Combined:", combined_text)
+
             print("WARN: non-constituent entity (" + filename.split("/")[-1] + "): " + combined_text)
-            #sent_text = ""
-            #for tok in parsed_lines:
-            #    if tok["token_id"].split("-")[0] == ent["sid"]:
-            #        sent_text += tok["token"] + " "
-            #sent_text = sent_text[:-1]
-            #print("\tSentence Context: " + sent_text + "\n")
 
     conllua_data = []
     for i in range(len(tokens)):
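
Reviewer note: the check these three patches converge on treats an entity span as a dependency constituent when at most one of its tokens has a head outside the span, so the warning fires on sum(outside_head) > 1; PATCH 3 then whitelists two-token "Month YYYY" dates, which evidently trip the check in these parses. Below is a minimal standalone sketch of that logic. The is_constituent helper and the toy (index, head) pairs are illustrative only; the real code walks the (index, head, ..., form) tuples in ent["toks"], where a head of None marks the root.

    # Sketch of the non-constituent check added in these patches (toy data).
    def is_constituent(span_tokens):
        """span_tokens: list of (token_index, head_index) pairs; head None = root."""
        indexes = {idx for idx, _ in span_tokens}
        # Count tokens whose head lies outside the span itself
        outside = sum(1 for _, head in span_tokens
                      if head is not None and head not in indexes)
        # A constituent has at most one escaping head: its own head,
        # or none at all if the span contains the sentence root
        return outside <= 1

    # "old red barn" (heads: old->barn, red->barn, barn->outside): one escape
    print(is_constituent([(2, 4), (3, 4), (4, 1)]))   # True
    # "barn . The" (all three heads point elsewhere): three escapes, warn
    print(is_constituent([(4, 1), (5, 1), (6, 8)]))   # False

A span that crosses a phrase boundary yields two or more escaping heads, which is exactly the condition the repair script reports with surrounding context.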