From 857edaf3844daedee3cd7f7e90f0f0f06e483058 Mon Sep 17 00:00:00 2001
From: lauren-lizzy-levine
Date: Sun, 28 Apr 2024 14:49:03 -0400
Subject: [PATCH 1/3] adding warning for entity span not being a constituent

---
 _build/utils/repair_tsv.py | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/_build/utils/repair_tsv.py b/_build/utils/repair_tsv.py
index ffc587a23..8d0704f8e 100644
--- a/_build/utils/repair_tsv.py
+++ b/_build/utils/repair_tsv.py
@@ -550,7 +550,7 @@ def fix_genitive_s(tsv_path, xml_path, warn_only=True, outdir=None, string_input
 
     ### end genitive s fix
 
-def adjust_edges(webanno_tsv, parsed_lines, ent_mappings, single_tok_mappings, single_coref_type=False):
+def adjust_edges(webanno_tsv, parsed_lines, ent_mappings, single_tok_mappings, filename, single_coref_type=False):
     """
     Fix webanno TSV in several ways:
 
     * All edges pointing back from a pronoun receive type 'ana'
@@ -900,6 +900,35 @@ def get_min(toks):
             if start != end:
                 closer_lists[end].append(str(group) + ")")
 
+        # Validate that entity spans are constituents
+        outside_head = []
+        token_indexes = []
+        token_heads = []
+        for token in ent["toks"]:
+            token_indexes.append(token[0])
+            token_heads.append(token[1])
+        for i in range(len(ent["toks"])):
+            outside_head.append(0)
+            if token_heads[i] not in token_indexes and token_heads[i] is not None:
+                outside_head[i] = 1
+        if sum(outside_head) > 1:
+            print("WARN: entity " + str(e_id) + " is not a constituent: ")
+            print("\tDocument: " + filename.split("/")[-1])
+            ent_str = ""
+            for j, tok in enumerate(ent["toks"]):
+                if outside_head[j] == 1:
+                    ent_str += "[[" + tok[4] + "]]" + " "
+                else:
+                    ent_str += tok[4] + " "
+            ent_str = ent_str[:-1]
+            print("\tEntity Span: " + ent_str)
+            sent_text = ""
+            for tok in parsed_lines:
+                if tok["token_id"].split("-")[0] == ent["sid"]:
+                    sent_text += tok["token"] + " "
+            sent_text = sent_text[:-1]
+            print("\tSentence Context: " + sent_text + "\n")
+
     conllua_data = []
     for i in range(len(tokens)):
         if i+1 in opener_lists or i+1 in closer_lists:
@@ -1247,7 +1276,7 @@ def fix_file(filename, tt_file, outdir, genitive_s=False):
         output += "\n".join(out_lines) + "\n"
 
     parsed_lines, entity_mappings, single_tok_mappings = fix_genitive_s(output, tt_file, warn_only=True, string_input=True)
-    output, conllua_data, centering_transitions, group_saliences = adjust_edges(output, parsed_lines, entity_mappings, single_tok_mappings)
+    output, conllua_data, centering_transitions, group_saliences = adjust_edges(output, parsed_lines, entity_mappings, single_tok_mappings, filename)
 
     centering_doc_data = defaultdict(lambda: "no-ent")
     # Set missing transitions

From 73b3cf8312be650f22efd7628ef35fb4c261a804 Mon Sep 17 00:00:00 2001
From: lauren-lizzy-levine
Date: Sat, 4 May 2024 00:36:29 -0400
Subject: [PATCH 2/3] reformatting warning

---
 _build/utils/repair_tsv.py | 32 ++++++++++++++++++++++----------
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/_build/utils/repair_tsv.py b/_build/utils/repair_tsv.py
index 8d0704f8e..4d2cf9d1e 100644
--- a/_build/utils/repair_tsv.py
+++ b/_build/utils/repair_tsv.py
@@ -912,22 +912,34 @@ def get_min(toks):
             if token_heads[i] not in token_indexes and token_heads[i] is not None:
                 outside_head[i] = 1
         if sum(outside_head) > 1:
-            print("WARN: entity " + str(e_id) + " is not a constituent: ")
-            print("\tDocument: " + filename.split("/")[-1])
+            #print("\tDocument: " + filename.split("/")[-1])
             ent_str = ""
             for j, tok in enumerate(ent["toks"]):
                 if outside_head[j] == 1:
-                    ent_str += "[[" + tok[4] + "]]" + " "
+                    ent_str += "*" + tok[4] + "*" + " "
                 else:
                     ent_str += tok[4] + " "
-            ent_str = ent_str[:-1]
-            print("\tEntity Span: " + ent_str)
-            sent_text = ""
-            for tok in parsed_lines:
+            ent_str = "[" + ent_str[:-1] + "]"
+            #print("\tEntity Span: " + ent_str)
+            start_context = parsed_lines[max(ent["start"] - 5, 0):ent["start"]]
+            end_context = parsed_lines[ent["end"] + 1:min(ent["end"] + 6, len(parsed_lines))]
+            combined_text = ""
+            for tok in start_context:
                 if tok["token_id"].split("-")[0] == ent["sid"]:
-                    sent_text += tok["token"] + " "
-            sent_text = sent_text[:-1]
-            print("\tSentence Context: " + sent_text + "\n")
+                    combined_text += tok["token"] + " "
+            combined_text += ent_str + " "
+            for tok in end_context:
+                if tok["token_id"].split("-")[0] == ent["sid"]:
+                    combined_text += tok["token"] + " "
+            combined_text = combined_text[:-1]
+            #print("Combined:", combined_text)
+            print("WARN: non-constituent entity (" + filename.split("/")[-1] + "): " + combined_text)
+            #sent_text = ""
+            #for tok in parsed_lines:
+            #    if tok["token_id"].split("-")[0] == ent["sid"]:
+            #        sent_text += tok["token"] + " "
+            #sent_text = sent_text[:-1]
+            #print("\tSentence Context: " + sent_text + "\n")
 
     conllua_data = []
     for i in range(len(tokens)):

From bf5ea70434ca94ee871ae05a0df3b8e42cc006f9 Mon Sep 17 00:00:00 2001
From: lauren-lizzy-levine
Date: Sat, 4 May 2024 10:41:01 -0400
Subject: [PATCH 3/3] adding date filtering for entity warnings

---
 _build/utils/repair_tsv.py | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/_build/utils/repair_tsv.py b/_build/utils/repair_tsv.py
index 4d2cf9d1e..d2dd5d491 100644
--- a/_build/utils/repair_tsv.py
+++ b/_build/utils/repair_tsv.py
@@ -901,7 +901,7 @@ def get_min(toks):
                 closer_lists[end].append(str(group) + ")")
 
         # Validate that entity spans are constituents
-        outside_head = []
+        outside_head = []  # tally of which tokens have head outside of the entity span
         token_indexes = []
         token_heads = []
         for token in ent["toks"]:
@@ -911,8 +911,16 @@ def get_min(toks):
             outside_head.append(0)
             if token_heads[i] not in token_indexes and token_heads[i] is not None:
                 outside_head[i] = 1
-        if sum(outside_head) > 1:
-            #print("\tDocument: " + filename.split("/")[-1])
+
+        # skip dates
+        skip = False
+        months = ["january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"]
+        # first token is month, second token is 4 digit number
+        if len(ent["toks"]) == 2 and ent["toks"][0][4].lower() in months \
+                and len(ent["toks"][1][4]) == 4 and ent["toks"][1][2] == "CD":
+            skip = True
+
+        if sum(outside_head) > 1 and not skip:
             ent_str = ""
             for j, tok in enumerate(ent["toks"]):
                 if outside_head[j] == 1:
@@ -920,7 +928,7 @@ def get_min(toks):
                 else:
                     ent_str += tok[4] + " "
             ent_str = "[" + ent_str[:-1] + "]"
-            #print("\tEntity Span: " + ent_str)
+
             start_context = parsed_lines[max(ent["start"] - 5, 0):ent["start"]]
             end_context = parsed_lines[ent["end"] + 1:min(ent["end"] + 6, len(parsed_lines))]
             combined_text = ""
             for tok in start_context:
@@ -932,14 +940,8 @@ def get_min(toks):
                 if tok["token_id"].split("-")[0] == ent["sid"]:
                     combined_text += tok["token"] + " "
             combined_text = combined_text[:-1]
-            #print("Combined:", combined_text)
+
             print("WARN: non-constituent entity (" + filename.split("/")[-1] + "): " + combined_text)
-            #sent_text = ""
-            #for tok in parsed_lines:
-            #    if tok["token_id"].split("-")[0] == ent["sid"]:
-            #        sent_text += tok["token"] + " "
-            #sent_text = sent_text[:-1]
-            #print("\tSentence Context: " + sent_text + "\n")
 
     conllua_data = []
     for i in range(len(tokens)):
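
Reviewer note: the check these three patches converge on treats an entity span as a dependency constituent when at most one of its tokens has a head outside the span, so the warning fires on sum(outside_head) > 1; PATCH 3 then whitelists two-token "Month YYYY" dates, which evidently trip the check in these parses. Below is a minimal standalone sketch of that logic. The is_constituent helper and the toy (index, head) pairs are illustrative only; the real code walks the (index, head, ..., form) tuples in ent["toks"], where a head of None marks the root.

    # Sketch of the non-constituent check added in these patches (toy data).
    def is_constituent(span_tokens):
        """span_tokens: list of (token_index, head_index) pairs; head None = root."""
        indexes = {idx for idx, _ in span_tokens}
        # Count tokens whose head lies outside the span itself
        outside = sum(1 for _, head in span_tokens
                      if head is not None and head not in indexes)
        # A constituent has at most one escaping head: its own head,
        # or none at all if the span contains the sentence root
        return outside <= 1

    # "old red barn" (heads: old->barn, red->barn, barn->outside): one escape
    print(is_constituent([(2, 4), (3, 4), (4, 1)]))   # True
    # "barn . The" (all three heads point elsewhere): three escapes, warn
    print(is_constituent([(4, 1), (5, 1), (6, 8)]))   # False

A span that crosses a phrase boundary yields two or more escaping heads, which is exactly the condition the repair script reports with surrounding context.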