Skip to content

Commit

Permalink
Add more comments to processor code
Browse files Browse the repository at this point in the history
  • Loading branch information
bgyori committed Jul 23, 2024
1 parent 4a1393c commit 09fe17a
Showing 1 changed file with 45 additions and 6 deletions.
51 changes: 45 additions & 6 deletions indra/sources/evex/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,17 @@ def __init__(self, relations_table, articles_table, standoff_index):
self.standoff_cache = {}

def process_statements(self):
    """Convert every row of the EVEX relations table into INDRA Statements.

    The Statements produced for each row by ``process_row`` are added to
    ``self.statements``. A progress bar is displayed while iterating.
    """
    relation_rows = self.relations_table.itertuples()
    n_rows = len(self.relations_table)
    progress = tqdm.tqdm(relation_rows, total=n_rows,
                         desc='Processing Evex relations')
    for relation in progress:
        self.statements += self.process_row(relation)

def process_row(self, row):
"""Process a row in the relations table into INDRA Statements."""

# First, we determine the statement type and create the subject/object
# agents.
pol_idx = 1 if row.refined_polarity == 'Negative' else 0
stmt_types = type_indra_mappings.get(row.refined_type)
if not stmt_types:
Expand All @@ -47,19 +51,27 @@ def process_row(self, row):
obj_agent = get_standard_agent('EGID:%s' % target_id,
db_refs={'EGID': target_id})

# We now figure out what articles provide evidence for this relation
article_keys = self.article_lookup.get(row.general_event_id)
stmts = []
for article_prefix, article_id in article_keys:
# These text refs are known based on info we have independent of
# standoff availability
text_refs = {article_prefix: article_id}
pmid = article_id if article_prefix == 'PMID' else None

# We now find the standoff for the given relation and gather
# evidence info for it if possible.
standoff = self.get_standoff_for_event(article_prefix, article_id)
if not standoff:
evidence_info = [{}]
else:
evidence_info = find_evidence_info(standoff, source_id,
target_id, row.refined_type,
row.refined_polarity)
# For each article, it's possible that multiple evidences are
# available for the relation so we create separate Statements
# (each with a single Evidence) here.
for ev_info in evidence_info:
annotations = {
'evex_relation_type': row.refined_type,
Expand All @@ -76,12 +88,17 @@ def process_row(self, row):
text_refs=text_refs,
text=ev_info.get('text'),
annotations=annotations)

# We can set the raw Agent text which is specific to this
# given evidence.
subj = copy.deepcopy(subj_agent)
obj = copy.deepcopy(obj_agent)
if ev_info.get('subj_text'):
subj.db_refs['TEXT'] = ev_info.get('subj_text')
if ev_info.get('obj_text'):
obj.db_refs['TEXT'] = ev_info.get('obj_text')

# Finally, create the Statement object
if stmt_type == Complex:
stmt = Complex([subj, obj], evidence=[ev])
else:
Expand All @@ -91,6 +108,7 @@ def process_row(self, row):
return stmts

def get_standoff_for_event(self, article_prefix, article_id):
"""Based on article info, return a standoff object of annotations."""
key = (
'pmc' if article_prefix == 'PMCID' else 'pubmed',
article_id[3:] if article_prefix == 'PMCID' else article_id
Expand Down Expand Up @@ -201,6 +219,7 @@ def find_evidence_info(standoff, source_id, target_id, event_type,


def get_regulation_info(standoff, regulation, source_uid, target_uid):
"""Gather specific evidence info from a regulation in a standoff."""
text = standoff.get_sentence_for_offset(regulation.event.start)
subj = standoff.elements[source_uid]
subj_text = subj.text
Expand Down Expand Up @@ -318,19 +337,30 @@ def process_annotations(ann_file):
elements = {}
reader = csv.reader(ann_file, delimiter='\t', quotechar=None)
for row in reader:
# The first element is always the UID
uid = row[0]
assert len(row) == 2 or len(row) == 3
# If the row has 3 elements, then the last one is a value
value = row[2] if len(row) == 3 else None
# The second element can have multiple space-separated parts
parts = row[1].split()
# If this is an entity of some type
if parts[0] in {'GGP', 'Entity'}:
entity = Entity(uid, parts[0], int(parts[1]), int(parts[2]), value)
elements[uid] = entity
# These represent entity references like Entrez IDs
elif parts[0] == 'Reference':
ref_ns, ref_id = parts[2].split(':', maxsplit=1)
elements[parts[1]].references[ref_ns] = ref_id
# These are various event types, we enumerate them explicitly in
# the standoff_event_types variable to make sure it's not some
# other type of row.
elif parts[0] in standoff_event_types:
event = Event(uid, parts[0], int(parts[1]), int(parts[2]), value)
elements[uid] = event
# These are confidence values associated with regulations but also
# other things like Negation. An additional complication is that it
# can either represent a numerical or a qualitative confidence level.
elif parts[0] == 'Confidence':
# Negation confidence
if isinstance(parts[1], Negation):
Expand All @@ -341,12 +371,16 @@ def process_annotations(ann_file):
# Regulation confidence level
else:
elements[parts[1]].confidence_level = parts[2]
# Represents a negation for a regulation
elif parts[0] == 'Negation':
elements[uid] = Negation(uid)
elements[parts[1]].negation = elements[uid]
# Represents a speculation for a regulation
elif parts[0] == 'Speculation':
elements[uid] = Speculation(uid)
elements[parts[1]].speculation = elements[uid]
# The remainder of cases are regulations. These are either basic
# regulations or special cases like subunit-complex relations.
elif len(row) == 2:
if ':' in parts[0]:
event_type, parent_id = parts[0].split(':')
Expand All @@ -360,6 +394,8 @@ def process_annotations(ann_file):
else:
assert False, row

# The row contains a series of arguments for the regulation that
# need to be parsed out in parts
arguments = {}
for element in parts[1:]:
role, arg_uid = element.split(':')
Expand All @@ -368,6 +404,9 @@ def process_annotations(ann_file):
# placeholder for these elements that can be resolved later
element_obj = elements.get(arg_uid, Unresolved(arg_uid))

# There are argument types that there are more than one of,
# e.g., Theme for Binding so we need to sometimes turn
# these into lists.
if role in arguments:
if not isinstance(arguments[role], list):
arguments[role] = [arguments[role]]
Expand All @@ -377,7 +416,7 @@ def process_annotations(ann_file):
regulation = Regulation(uid, event, arguments)
elements[uid] = regulation
else:
print(row)
logger.error('Could not process standoff file row: %s' % row)
break

# We now need to resolve Unresolved regulation references. At this point
Expand Down Expand Up @@ -517,6 +556,11 @@ def paths_to_entrez_id(self, entrez_id):
return paths


@dataclass
class Unresolved:
    """Placeholder for a standoff element referenced before it is available.

    Used as a stand-in when an argument's UID is not yet present among the
    parsed elements, so that the reference can be resolved later.
    """
    # UID of the element this placeholder stands in for
    uid: str


def add_subgraph(g, obj):
"""Recursively build up a graph of standoff objects."""
label = '{ID | %s} | {event_type | %s}' % (obj.uid, obj.event.get_type())
Expand All @@ -542,11 +586,6 @@ def add_subgraph(g, obj):
g.add_edge(obj.uid, vv.uid, label=k)


@dataclass
class Unresolved:
uid: str


# The set of event types used in the standoff format
standoff_event_types = {
'Binding',
Expand Down

0 comments on commit 09fe17a

Please sign in to comment.