Make this repo up-to-date/runnable #7
Annotations
15 errors and 1 warning
/work/graph-db/extractor/src/biocyc/base_data_file_parser.py#L14
class BaseDataFileParser(BaseParser):
"""
Base parser for BioCyc .dat files.
"""
- def __init__(self, base_data_dir: str, biocyc_dbname, tar_file, datafile_name, entity_name, attr_names:dict, rel_names:dict,
- db_link_sources: dict=None):
+
+ def __init__(
+ self,
+ base_data_dir: str,
+ biocyc_dbname,
+ tar_file,
+ datafile_name,
+ entity_name,
+ attr_names: dict,
+ rel_names: dict,
+ db_link_sources: dict = None,
+ ):
"""
:param base_data_dir: the base data directory, i.e. the parent folder of 'download'
:param biocyc_dbname: BioCyc database name, e.g. DB_ECOCYC, DB_HUMANCYC
:param tar_file: the tar file downloaded from the BioCyc website
:param datafile_name: the data file name to process (in tar_file), e.g. genes.dat
|
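For orientation, a concrete parser hands these arguments to the base class roughly as follows (a minimal sketch; every value is illustrative, and the exact shape of the attr_names/rel_names maps is defined by the subclasses and biocyc_utils):
parser = BaseDataFileParser(
    base_data_dir='/data',       # parent folder of 'download'
    biocyc_dbname='EcoCyc',      # e.g. DB_ECOCYC
    tar_file='ecoli.tar.gz',     # archive downloaded from the BioCyc site
    datafile_name='genes.dat',   # the .dat file to parse inside the tar
    entity_name='Gene',
    attr_names={},               # .dat attribute -> node property map (empty here for brevity)
    rel_names={},                # .dat attribute -> relationship type map
)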
/work/graph-db/extractor/src/biocyc/base_data_file_parser.py#L30
"""
BaseParser.__init__(self, DB_BIOCYC.lower(), base_data_dir)
self.input_zip = os.path.join(self.download_dir, tar_file)
self.db_output_dir = os.path.join(self.output_dir, biocyc_dbname.lower())
self.datafile = datafile_name
- self.node_labels = [NODE_BIOCYC, 'db_' + biocyc_dbname, entity_name]
+ self.node_labels = [NODE_BIOCYC, 'db_' + biocyc_dbname, entity_name]
self.entity_name = entity_name
self.attr_name_map = attr_names
self.rel_name_map = rel_names
self.db_link_sources = db_link_sources
self.attrs = []
self.version = ''
self.logger = logging.getLogger(__name__)
- def create_synonym_rels(self)->bool:
+ def create_synonym_rels(self) -> bool:
return False
- def get_db_version(self, tar:TarFile):
+ def get_db_version(self, tar: TarFile):
"""
Find the latest data version in the tar file; a tar file sometimes contains multiple data versions.
:param tar: the opened TarFile to scan
:return: the latest version string
"""
|
/work/graph-db/extractor/src/biocyc/base_data_file_parser.py#L63
with tarfile.open(self.input_zip, mode='r:gz') as tar:
if not self.version:
self.version = self.get_db_version(tar)
self.logger.info(f'Database file version: "{self.version}"')
for tarinfo in tar:
- if tarinfo.name.endswith('/'+ self.datafile) and self.version in tarinfo.name:
+ if (
+ tarinfo.name.endswith('/' + self.datafile)
+ and self.version in tarinfo.name
+ ):
self.logger.info('Parse ' + tarinfo.name)
utf8reader = codecs.getreader('ISO-8859-1')
f = utf8reader(tar.extractfile(tarinfo.name))
nodes = []
node = None
prev_line_is_comment = False
for line in f:
line = biocyc_utils.cleanhtml(line)
- node, prev_line_is_comment = self.parse_line(line, node, nodes, prev_line_is_comment)
+ node, prev_line_is_comment = self.parse_line(
+ line, node, nodes, prev_line_is_comment
+ )
return nodes
def parse_line(self, line, node, nodes, prev_line_is_comment):
try:
if line.startswith(UNIQUE_ID):
node = NodeData(self.node_labels.copy(), PROP_BIOCYC_ID)
nodes.append(node)
# add data source property
node.add_attribute(PROP_DATA_SOURCE, DB_BIOCYC, "str")
- if node and PROP_COMMENT in self.attr_name_map and prev_line_is_comment and line.startswith('/'):
+ if (
+ node
+ and PROP_COMMENT in self.attr_name_map
+ and prev_line_is_comment
+ and line.startswith('/')
+ ):
line = line[1:].strip()
node.add_attribute(PROP_COMMENT, line, 'str')
elif node:
attr, val = biocyc_utils.get_attr_val_from_line(line)
if attr:
|
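For context, parse_line consumes BioCyc attribute-value records; a genes.dat entry looks roughly like the illustrative snippet below (a '/' prefix continues a COMMENT onto the next line, and '//' ends a record):
UNIQUE-ID - EG10001
COMMON-NAME - thrL
COMMENT - thr operon leader peptide;
/this slash-prefixed line continues the comment above.
//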
/work/graph-db/extractor/src/biocyc/base_data_file_parser.py#L95
# reset comment
prev_line_is_comment = False
else:
prev_line_is_comment = True
if attr in self.attr_name_map:
- prop_name, data_type = biocyc_utils.get_property_name_type(attr, self.attr_name_map)
+ prop_name, data_type = biocyc_utils.get_property_name_type(
+ attr, self.attr_name_map
+ )
node.add_attribute(prop_name, val, data_type)
if attr == UNIQUE_ID:
node.add_attribute(PROP_ID, val, data_type)
if attr in self.rel_name_map:
# some rel could also be an attribute, e.g. types
|
/work/graph-db/extractor/src/biocyc/base_data_file_parser.py#L107
tokens = val.split(' ')
if len(tokens) > 1:
db_name = tokens[0].lstrip('(')
reference_id = tokens[1].strip(')').strip('"')
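# Illustrative: a DBLINKS value such as '(UNIPROT "P0A7B8" NIL ...)' splits
# into db_name 'UNIPROT' and reference_id 'P0A7B8' (parens/quotes stripped).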
add_prefix = tokens[1]
- self.add_dblink(node, db_name, reference_id, )
+ self.add_dblink(
+ node,
+ db_name,
+ reference_id,
+ )
else:
rel_type = self.rel_name_map.get(attr)
node.add_edge_type(rel_type, val)
except Exception as ex:
self.logger.error('Failed to parse line: %s (%s)', line, ex)
return node, prev_line_is_comment
- def add_dblink(self, node:NodeData, db_name, reference_id):
+ def add_dblink(self, node: NodeData, db_name, reference_id):
link_node = NodeData(NODE_DBLINK, PROP_REF_ID)
if reference_id.startswith(db_name):
- reference_id = reference_id[len(db_name)+1:] # remove db prefix
+ reference_id = reference_id[len(db_name) + 1 :] # remove db prefix
link_node.update_attribute(PROP_REF_ID, reference_id)
link_node.update_attribute(PROP_DB_NAME, db_name)
node.add_edge(node, link_node, REL_DBLINKS)
def create_indexes(self, database: Database):
- database.create_index(self.entity_name, PROP_ID, f"index_{self.entity_name.lower}_id")
- database.create_index(self.entity_name, PROP_BIOCYC_ID, f"index_{self.entity_name.lower}_biocycid")
- database.create_index(self.entity_name, PROP_NAME, f"index_{self.entity_name.lower}_name")
-
- def update_nodes_in_graphdb(self, nodes:[], database:Database, etl_load_id: str):
+ database.create_index(
+ self.entity_name, PROP_ID, f"index_{self.entity_name.lower()}_id"
+ )
+ database.create_index(
+ self.entity_name, PROP_BIOCYC_ID, f"index_{self.entity_name.lower()}_biocycid"
+ )
+ database.create_index(
+ self.entity_name, PROP_NAME, f"index_{self.entity_name.lower()}_name"
+ )
+
+ def update_nodes_in_graphdb(self, nodes: list, database: Database, etl_load_id: str):
"""
Load or update nodes in the knowledge graph. This can also be called for the initial load.
:param nodes: list of nodes
:param database: neo4j Database
:param etl_load_id: id that (virtually) links a node to an EtlLoad node.
|
/work/graph-db/extractor/src/biocyc/base_data_file_parser.py#L143
self.logger.info('Update nodes: ' + ':'.join(self.node_labels))
rows = []
for node in nodes:
rows.append(node.to_dict())
attrs = self.attrs + [PROP_ID, PROP_DATA_SOURCE]
- query = get_update_nodes_query(NODE_BIOCYC, PROP_BIOCYC_ID, attrs, self.node_labels, etl_load_id=etl_load_id, return_node_count=True)
+ query = get_update_nodes_query(
+ NODE_BIOCYC,
+ PROP_BIOCYC_ID,
+ attrs,
+ self.node_labels,
+ etl_load_id=etl_load_id,
+ return_node_count=True,
+ )
return database.load_data_from_rows(query, rows, return_node_count=True)
- def add_edges_to_graphdb(self, nodes:[], database:Database, etl_load_id):
+ def add_edges_to_graphdb(self, nodes: list, database: Database, etl_load_id):
no_of_created_nodes = 0
no_of_updated_nodes = 0
no_of_created_relations = 0
no_of_updated_relations = 0
entity_rel_dict = dict()
|
/work/graph-db/extractor/src/biocyc/base_data_file_parser.py#L159
for node in nodes:
if self.create_synonym_rels():
id = node.get_attribute(PROP_BIOCYC_ID)
synonyms = node.get_synonym_set()
for syn in synonyms:
- synonym_list.append({PROP_BIOCYC_ID:id, PROP_NAME: syn})
+ synonym_list.append({PROP_BIOCYC_ID: id, PROP_NAME: syn})
for edge in node.edges:
from_id = edge.source.get_attribute(edge.source.id_attr)
to_id = edge.dest.get_attribute(edge.dest.id_attr)
rel = edge.label
if rel == REL_DBLINKS:
|
/work/graph-db/extractor/src/biocyc/base_data_file_parser.py#L171
continue
db_name = edge.dest.get_attribute(PROP_DB_NAME)
if db_name in self.db_link_sources:
if db_name not in db_link_dict:
db_link_dict[db_name] = []
- db_link_dict[db_name].append({'from_id': from_id, 'to_id': to_id})
+ db_link_dict[db_name].append(
+ {'from_id': from_id, 'to_id': to_id}
+ )
else:
if rel not in entity_rel_dict:
entity_rel_dict[rel] = []
entity_rel_dict[rel].append({'from_id': from_id, 'to_id': to_id})
if synonym_list:
self.logger.info('Add synonyms')
- query = get_create_synonym_relationships_query(NODE_BIOCYC, PROP_BIOCYC_ID, PROP_BIOCYC_ID, PROP_NAME, [], etl_load_id=etl_load_id, return_node_count=True)
+ query = get_create_synonym_relationships_query(
+ NODE_BIOCYC,
+ PROP_BIOCYC_ID,
+ PROP_BIOCYC_ID,
+ PROP_NAME,
+ [],
+ etl_load_id=etl_load_id,
+ return_node_count=True,
+ )
self.logger.debug(query)
- node_count, result_counters = database.load_data_from_rows(query, synonym_list, return_node_count=True)
+ node_count, result_counters = database.load_data_from_rows(
+ query, synonym_list, return_node_count=True
+ )
no_of_created_nodes += result_counters.nodes_created
- no_of_updated_nodes += (node_count - result_counters.nodes_created)
+ no_of_updated_nodes += node_count - result_counters.nodes_created
for rel in entity_rel_dict.keys():
self.logger.info('Add relationship ' + rel)
- query = get_create_relationships_query(NODE_BIOCYC, PROP_BIOCYC_ID, 'from_id',
- NODE_BIOCYC, PROP_BIOCYC_ID, 'to_id', rel, etl_load_id=etl_load_id, return_node_count=True)
+ query = get_create_relationships_query(
+ NODE_BIOCYC,
+ PROP_BIOCYC_ID,
+ 'from_id',
+ NODE_BIOCYC,
+ PROP_BIOCYC_ID,
+ 'to_id',
+ rel,
+ etl_load_id=etl_load_id,
+ return_node_count=True,
+ )
self.logger.debug(query)
- node_count, result_counters = database.load_data_from_rows(query, entity_rel_dict[rel], return_node_count=True)
+ node_count, result_counters = database.load_data_from_rows(
+ query, entity_rel_dict[rel], return_node_count=True
+ )
no_of_created_relations += result_counters.relationships_created
- no_of_updated_relations += (node_count - result_counters.relationships_created)
-
- _no_of_created_relations, _no_of_updated_relations = self.add_dblinks_to_graphdb(db_link_dict, database, etl_load_id)
+ no_of_updated_relations += (
+ node_count - result_counters.relationships_created
+ )
+
+ (
+ _no_of_created_relations,
+ _no_of_updated_relations,
+ ) = self.add_dblinks_to_graphdb(db_link_dict, database, etl_load_id)
no_of_created_relations += _no_of_created_relations
no_of_updated_relations += _no_of_updated_relations
- return no_of_created_nodes, no_of_updated_nodes, no_of_created_relations, no_of_updated_relations
-
- def add_dblinks_to_graphdb(self, db_link_dict:dict, database:Database, etl_load_id):
+ return (
+ no_of_created_nodes,
+ no_of_updated_nodes,
+ no_of_created_relations,
+ no_of_updated_relations,
+ )
+
+ def add_dblinks_to_graphdb(
+ self, db_link_dict: dict, database: Database, etl_load_id
+ ):
no_of_created_relations = 0
no_of_updated_relations = 0
for db_name in db_link_dict.keys():
- self.logger.info('Add DB Link relationship to ' + db_name )
+ self.logger.info('Add DB Link relationship to ' + db_name)
dest_label = 'db_' + db_name
rel = db_name.upper() + '_LINK'
- query = get_create_relationships_query(NODE_BIOCYC, PROP_BIOCYC_ID, 'from_id',
- dest_label, PROP_ID, 'to_id', rel, etl_load_id=etl_load_id, return_node_count=True)
+ query = get_create_relationships_query(
+ NODE_BIOCYC,
+ PROP_BIOCYC_ID,
+ 'from_id',
+ dest_label,
+ PROP_ID,
+ 'to_id',
+ rel,
+ etl_load_id=etl_load_id,
+ return_node_count=True,
+ )
self.logger.debug(query)
- node_count, result_counters = database.load_data_from_rows(query, db_link_dict[db_name], return_node_count=True)
+ node_count, result_counters = database.load_data_from_rows(
+ query, db_link_dict[db_name], return_node_count=True
+ )
no_of_created_relations += result_counters.relationships_created
- no_of_updated_relations += (node_count - result_counters.relationships_created)
+ no_of_updated_relations += (
+ node_count - result_counters.relationships_created
+ )
return no_of_created_relations, no_of_updated_relations
- def write_entity_data_files(self, nodes:[]):
+ def write_entity_data_files(self, nodes: list):
os.makedirs(self.db_output_dir, 0o777, True)
self.logger.info(f'Writing {self.entity_name} files')
- with open(os.path.join(self.db_output_dir, self.entity_name.lower() + '.tsv'), 'w') as f:
+ with open(
+ os.path.join(self.db_output_dir, self.entity_name.lower() + '.tsv'), 'w'
+ ) as f:
attrs = [PROP_ID] + [PROP_DATA_SOURCE] + self.attrs
f.write('\t'.join(attrs) + '\n')
f.writelines(NodeData.get_entity_data_rows(nodes, attrs))
-
|
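The written file is a plain TSV with a header row of the selected attribute names followed by one row per node; with hypothetical columns (PROP_ID and PROP_DATA_SOURCE resolve to constants not shown in this excerpt) it would look like:
eid	data_source	name
EG10001	BioCyc	thrL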
/work/graph-db/extractor/src/common/liquibase_utils.py#L29
class ChangeLog:
def __init__(self, author: str, change_id_prefix: str):
if not change_id_prefix:
- raise ValueError('The argument change_id_prefix must not be null or empty string')
+ raise ValueError(
+ 'The argument change_id_prefix must not be None or an empty string'
+ )
try:
int(change_id_prefix.split('-')[1])
except Exception:
- raise ValueError('The argument change_id_prefix must be the JIRA card number; e.g LL-1234')
+ raise ValueError(
+ 'The argument change_id_prefix must be the JIRA card number, e.g. LL-1234'
+ )
self.author = author
self.id_prefix = change_id_prefix
self.file_prefix = f'jira-{change_id_prefix}-'
self.logger = logging.getLogger(__name__)
|
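The prefix check simply parses the digits after the first dash, so (illustrative):
int('LL-1234'.split('-')[1])   # ok: returns 1234, prefix accepted
int('LL-abc'.split('-')[1])    # raises ValueError, prefix rejected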
/work/graph-db/extractor/src/common/liquibase_utils.py#L70
def create_changelog_str(self):
template = get_template(sql_template)
# liquibase doesn't like the `<` character
self.cypher = self.cypher.replace('<', '&lt;')
- return template.render(change_id=self.id, author=self.author, change_comment=self.comment, cypher_query=self.cypher)
+ return template.render(
+ change_id=self.id,
+ author=self.author,
+ change_comment=self.comment,
+ cypher_query=self.cypher,
+ )
class CustomChangeSet(ChangeSet):
- def __init__(self, id, author, comment, cypher,
- filename:str,
- handler="edu.ucsd.sbrg.FileQueryHandler",
- filetype='TSV',
- startrow=1):
+ def __init__(
+ self,
+ id,
+ author,
+ comment,
+ cypher,
+ filename: str,
+ handler="edu.ucsd.sbrg.FileQueryHandler",
+ filetype='TSV',
+ startrow=1,
+ ):
ChangeSet.__init__(self, id, author, comment, cypher)
self.handler = handler
self.filename = filename.replace('.tsv', '.zip')
self.filetype = filetype
self.start_at = startrow
def create_changelog_str(self):
template = get_template(custom_template)
- return template.render(change_id=self.id, change_comment=self.comment, author=self.author,
- handler_class=self.handler, cypher_query=self.cypher, data_file=self.filename,
- start_at=self.start_at, file_type=self.filetype, params=CUSTOM_PARAMS)
+ return template.render(
+ change_id=self.id,
+ change_comment=self.comment,
+ author=self.author,
+ handler_class=self.handler,
+ cypher_query=self.cypher,
+ data_file=self.filename,
+ start_at=self.start_at,
+ file_type=self.filetype,
+ params=CUSTOM_PARAMS,
+ )
def generate_sql_changelog_file(id, author, comment, cypher, outfile):
changeset = ChangeSet(id, author, comment, cypher)
temp = get_changelog_template()
|
/work/graph-db/extractor/src/common/liquibase_utils.py#L102
if __name__ == '__main__':
cypher = 'match(n:Gene)-[r]-(:Gene) where r.score < 0.4 delete r;'
comment = 'Remove ecocyc-plus string relationships with 0.4 threshold. After the update, create ecocyc-plus-10012021.dump file'
- outfile = os.path.join('../../../migration/liquibase/ecocyc-plus/ecocyc-plus changelog-0010.xml')
- generate_sql_changelog_file('LL-3702 cut string rels with threshold', 'robin cai',
- comment,
- cypher, outfile)
+ outfile = os.path.join(
+ '../../../migration/liquibase/ecocyc-plus/ecocyc-plus changelog-0010.xml'
+ )
+ generate_sql_changelog_file(
+ 'LL-3702 cut string rels with threshold', 'robin cai', comment, cypher, outfile
+ )
|
/work/graph-db/extractor/src/ncbi/ncbi_taxonomy_liquibase.py#L6
from common.query_builder import *
from ncbi.ncbi_taxonomy_parser import *
# reference to this directory
directory = os.path.realpath(os.path.dirname(__file__))
+
class NcbiTaxonomyChangeLog(ChangeLog):
def __init__(self, author: str, change_id_prefix: str):
super().__init__(author, change_id_prefix)
self.date_tag = datetime.today().strftime('%m%d%Y')
|
/work/graph-db/extractor/src/ncbi/ncbi_taxonomy_liquibase.py#L26
def load_ncbi_taxonomy_nodes(self):
id = f'NCBI taxonomy data on date {self.date_tag}'
if self.id_prefix:
id = f'{self.id_prefix} {id}'
comment = 'Load NCBI taxonomy nodes'
- query = get_create_update_nodes_query(NODE_TAXONOMY, PROP_ID, NODE_ATTRS, [NODE_NCBI], datasource='NCBI Taxonomy')
- changeset = CustomChangeSet(id, self.author, comment, query, f'{self.file_prefix}{NCBI_TAXONOMY_FILE}')
+ query = get_create_update_nodes_query(
+ NODE_TAXONOMY, PROP_ID, NODE_ATTRS, [NODE_NCBI], datasource='NCBI Taxonomy'
+ )
+ changeset = CustomChangeSet(
+ id, self.author, comment, query, f'{self.file_prefix}{NCBI_TAXONOMY_FILE}'
+ )
self.change_sets.append(changeset)
def load_ncbi_taxonomy_synonym_rels(self):
id = f'load NCBI taxonomy synonym relationship on date {self.date_tag}'
if self.id_prefix:
id = f'{self.id_prefix} {id}'
comment = 'Load NCBI gene taxonomy relationship'
- query = get_create_synonym_relationships_query(NODE_TAXONOMY, PROP_ID, PROP_ID, PROP_NAME, [PROP_TYPE])
- changeset = CustomChangeSet(id, self.author, comment, query, f'{self.file_prefix}{NCBI_TAXONOMY_SYNONYM_FILE}')
+ query = get_create_synonym_relationships_query(
+ NODE_TAXONOMY, PROP_ID, PROP_ID, PROP_NAME, [PROP_TYPE]
+ )
+ changeset = CustomChangeSet(
+ id,
+ self.author,
+ comment,
+ query,
+ f'{self.file_prefix}{NCBI_TAXONOMY_SYNONYM_FILE}',
+ )
self.change_sets.append(changeset)
def load_ncbi_taxonomy_parent_rels(self):
id = f'create relationship between taxonomy and parent nodes on date {self.date_tag}'
if self.id_prefix:
|
/work/graph-db/extractor/src/ncbi/ncbi_taxonomy_liquibase.py#L48
comment = 'Taxonomy relationship with parent'
query = """
CALL apoc.periodic.iterate(
'MATCH (n:Taxonomy), (m:Taxonomy) WHERE m.prop = n.parent_id RETURN n, m',
'MERGE (n)-[:HAS_PARENT]->(m)', {batchSize:5000})
- """.replace('prop', PROP_ID)
+ """.replace(
+ 'prop', PROP_ID
+ )
changeset = ChangeSet(id, self.author, comment, query)
self.change_sets.append(changeset)
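After the .replace call, 'prop' in the template becomes the actual id property; assuming PROP_ID is 'eid' (a guess, the constant is defined elsewhere), the batched query reads:
CALL apoc.periodic.iterate(
    'MATCH (n:Taxonomy), (m:Taxonomy) WHERE m.eid = n.parent_id RETURN n, m',
    'MERGE (n)-[:HAS_PARENT]->(m)', {batchSize:5000})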
def set_species_id(self):
id = f'set species_id for taxonomy nodes on date {self.date_tag}'
|
/work/graph-db/extractor/src/ncbi/ncbi_taxonomy_liquibase.py#L63
changeset = ChangeSet(id, self.author, comment, query)
self.change_sets.append(changeset)
def create_indexes(self):
queries = []
- queries.append(get_create_constraint_query(NODE_TAXONOMY, PROP_ID, 'constraint_taxonomy_id') + ';')
- queries.append(get_create_constraint_query(NODE_SYNONYM, PROP_NAME, 'constraint_synonym_name') + ';')
- queries.append(get_create_index_query(NODE_TAXONOMY, PROP_NAME, 'index_taxonomy_name') + ';')
- queries.append(get_create_index_query(NODE_TAXONOMY, 'species_id', 'index_taxonomy_speciesid') + ';')
+ queries.append(
+ get_create_constraint_query(
+ NODE_TAXONOMY, PROP_ID, 'constraint_taxonomy_id'
+ )
+ + ';'
+ )
+ queries.append(
+ get_create_constraint_query(
+ NODE_SYNONYM, PROP_NAME, 'constraint_synonym_name'
+ )
+ + ';'
+ )
+ queries.append(
+ get_create_index_query(NODE_TAXONOMY, PROP_NAME, 'index_taxonomy_name')
+ + ';'
+ )
+ queries.append(
+ get_create_index_query(
+ NODE_TAXONOMY, 'species_id', 'index_taxonomy_speciesid'
+ )
+ + ';'
+ )
return queries
def add_index_change_set(self):
id = f'create NCBI taxonomy constraints on date {self.date_tag}'
if self.id_prefix:
|
triage
Node.js 16 actions are deprecated. Please update the following actions to use Node.js 20: actions/labeler@v4. For more information see: https://github.blog/changelog/2023-09-22-github-actions-transitioning-from-node-16-to-node-20/.
|
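A likely fix for the deprecation warning, sketched below; the workflow path and surrounding keys are assumptions, and note that actions/labeler v5 changed its configuration format relative to v4:
# .github/workflows/labeler.yml (hypothetical path)
jobs:
  labeler:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      pull-requests: write
    steps:
      - uses: actions/labeler@v5   # v5 runs on Node.js 20; v4 is on deprecated Node.js 16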