Skip to content

Make this repo up to date/runnable #6

Make this repo up to date/runnable

Make this repo up to date/runnable #6

GitHub Actions / Black failed Apr 3, 2024 in 0s

15 errors

Black found 15 errors


Check failure on line 45 in /work/graph-db/extractor/src/common/

See this annotation in the file changed.

@github-actions github-actions / Black


 class ChangeLog:
     def __init__(self, author: str, change_id_prefix: str):
         if not change_id_prefix:
-            raise ValueError('The argument change_id_prefix must not be null or empty string')
+            raise ValueError(
+                'The argument change_id_prefix must not be null or empty string'
+            )
         except Exception:
-            raise ValueError('The argument change_id_prefix must be the JIRA card number; e.g LL-1234')
+            raise ValueError(
+                'The argument change_id_prefix must be the JIRA card number; e.g LL-1234'
+            )
         self.author = author
         self.id_prefix = change_id_prefix
         self.file_prefix = f'jira-{change_id_prefix}-'
         self.logger = logging.getLogger(__name__)

Check failure on line 100 in /work/graph-db/extractor/src/common/

See this annotation in the file changed.

@github-actions github-actions / Black


     def create_changelog_str(self):
         template = get_template(sql_template)
         # liquibase doesn't like the `<` character
         self.cypher = self.cypher.replace('<', '&lt;')
-        return template.render(,, change_comment=self.comment, cypher_query=self.cypher)
+        return template.render(
+  ,
+  ,
+            change_comment=self.comment,
+            cypher_query=self.cypher,
+        )
 class CustomChangeSet(ChangeSet):
-    def __init__(self, id, author, comment, cypher,
-                 filename:str,
-                 handler="edu.ucsd.sbrg.FileQueryHandler",
-                 filetype='TSV',
-                 startrow=1):
+    def __init__(
+        self,
+        id,
+        author,
+        comment,
+        cypher,
+        filename: str,
+        handler="edu.ucsd.sbrg.FileQueryHandler",
+        filetype='TSV',
+        startrow=1,
+    ):
         ChangeSet.__init__(self, id, author, comment, cypher)
         self.handler = handler
         self.filename = filename.replace('.tsv', '.zip')
         self.filetype = filetype
         self.start_at = startrow
     def create_changelog_str(self):
         template = get_template(custom_template)
-        return template.render(, change_comment=self.comment,,
-                               handler_class=self.handler, cypher_query=self.cypher, data_file=self.filename,
-                               start_at=self.start_at, file_type=self.filetype, params=CUSTOM_PARAMS)
+        return template.render(
+  ,
+            change_comment=self.comment,
+  ,
+            handler_class=self.handler,
+            cypher_query=self.cypher,
+            data_file=self.filename,
+            start_at=self.start_at,
+            file_type=self.filetype,
+            params=CUSTOM_PARAMS,
+        )
 def generate_sql_changelog_file(id, author, comment, cypher, outfile):
     changeset = ChangeSet(id, author, comment, cypher)
     temp = get_changelog_template()

Check failure on line 111 in /work/graph-db/extractor/src/common/

See this annotation in the file changed.

@github-actions github-actions / Black


 if __name__ == '__main__':
     cypher = 'match(n:Gene)-[r]-(:Gene) where r.score < 0.4 delete r;'
     comment = 'Remove ecocyc-plus string relationships with 0.4 threshold. After the update, create ecocyc-plus-10012021.dump file'
-    outfile = os.path.join('../../../migration/liquibase/ecocyc-plus/ecocyc-plus changelog-0010.xml')
-    generate_sql_changelog_file('LL-3702 cut string rels with threshold', 'robin cai',
-                                comment,
-                                cypher, outfile)
+    outfile = os.path.join(
+        '../../../migration/liquibase/ecocyc-plus/ecocyc-plus changelog-0010.xml'
+    )
+    generate_sql_changelog_file(
+        'LL-3702 cut string rels with threshold', 'robin cai', comment, cypher, outfile
+    )

Check failure on line 26 in /work/graph-db/extractor/src/biocyc/

See this annotation in the file changed.

@github-actions github-actions / Black


 class BaseDataFileParser(BaseParser):
     Base parser for Biocyc .dat files.
-    def __init__(self, base_data_dir: str, biocyc_dbname, tar_file, datafile_name, entity_name, attr_names:dict, rel_names:dict,
-                 db_link_sources: dict=None):
+    def __init__(
+        self,
+        base_data_dir: str,
+        biocyc_dbname,
+        tar_file,
+        datafile_name,
+        entity_name,
+        attr_names: dict,
+        rel_names: dict,
+        db_link_sources: dict = None,
+    ):
         :param base_data_dir: the data file base directory, that is the parent folder for 'download'
         :param biocyc_dbname: biocyc database name, eg. DB_ECOCYC, DB_HUMANCYC
         :param tar_file: tar file downloaded from biocyc website
         :param datafile_name: the data file name to process (in tar_file), e.g. genes.dat

Check failure on line 53 in /work/graph-db/extractor/src/biocyc/

See this annotation in the file changed.

@github-actions github-actions / Black


         BaseParser.__init__(self, DB_BIOCYC.lower(), base_data_dir)
         self.input_zip = os.path.join(self.download_dir, tar_file)
         self.db_output_dir = os.path.join(self.output_dir, biocyc_dbname.lower())
         self.datafile = datafile_name
-        self.node_labels = [NODE_BIOCYC, 'db_' + biocyc_dbname,  entity_name]
+        self.node_labels = [NODE_BIOCYC, 'db_' + biocyc_dbname, entity_name]
         self.entity_name = entity_name
         self.attr_name_map = attr_names
         self.rel_name_map = rel_names
         self.db_link_sources = db_link_sources
         self.attrs = []
         self.version = ''
         self.logger = logging.getLogger(__name__)
-    def create_synonym_rels(self)->bool:
+    def create_synonym_rels(self) -> bool:
         return False
-    def get_db_version(self, tar:TarFile):
+    def get_db_version(self, tar: TarFile):
         find the latest version of data in the tar file.  Sometimes a tar file has multiple version data.
         :param tar:

Check failure on line 94 in /work/graph-db/extractor/src/biocyc/

See this annotation in the file changed.

@github-actions github-actions / Black


         with tarfile.open(self.input_zip, mode='r:gz') as tar:
             if not self.version:
                 self.version = self.get_db_version(tar)
             self.logger.info(f'Database file version: "{self.version}"')
             for tarinfo in tar:
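-                if tarinfo.name.endswith('/'+ self.datafile) and self.version in tarinfo.name:
+                if (
+                    tarinfo.name.endswith('/' + self.datafile)
+                    and self.version in tarinfo.name
+                ):
                     self.logger.info('Parse ' + tarinfo.name)
                     utf8reader = codecs.getreader('ISO-8859-1')
                     f = utf8reader(tar.extractfile(tarinfo.name))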
                     nodes = []
                     node = None
                     prev_line_is_comment = False
                     for line in f:
                         line = biocyc_utils.cleanhtml(line)
-                        node, prev_line_is_comment = self.parse_line(line, node, nodes, prev_line_is_comment)
+                        node, prev_line_is_comment = self.parse_line(
+                            line, node, nodes, prev_line_is_comment
+                        )
                     return nodes
     def parse_line(self, line, node, nodes, prev_line_is_comment):
             if line.startswith(UNIQUE_ID):
                 node = NodeData(self.node_labels.copy(), PROP_BIOCYC_ID)
                 # add data source property
                 node.add_attribute(PROP_DATA_SOURCE, DB_BIOCYC, "str")
-            if node and PROP_COMMENT in self.attr_name_map and prev_line_is_comment and line.startswith('/'):
+            if (
+                node
+                and PROP_COMMENT in self.attr_name_map
+                and prev_line_is_comment
+                and line.startswith('/')
+            ):
                 line = line[1:].strip()
                 node.add_attribute(PROP_COMMENT, line, 'str')
             elif node:
                 attr, val = biocyc_utils.get_attr_val_from_line(line)
                 if attr:

Check failure on line 106 in /work/graph-db/extractor/src/biocyc/

See this annotation in the file changed.

@github-actions github-actions / Black


                         # reset comment
                         prev_line_is_comment = False
                         prev_line_is_comment = True
                     if attr in self.attr_name_map:
-                        prop_name, data_type = biocyc_utils.get_property_name_type(attr, self.attr_name_map)
+                        prop_name, data_type = biocyc_utils.get_property_name_type(
+                            attr, self.attr_name_map
+                        )
                         node.add_attribute(prop_name, val, data_type)
                         if attr == UNIQUE_ID:
                             node.add_attribute(PROP_ID, val, data_type)
                     if attr in self.rel_name_map:
                         # some rel could also be an attribute, e.g. types

Check failure on line 139 in /work/graph-db/extractor/src/biocyc/

See this annotation in the file changed.

@github-actions github-actions / Black


                             tokens = val.split(' ')
                             if len(tokens) > 1:
                                 db_name = tokens[0].lstrip('(')
                                 reference_id = tokens[1].strip(')').strip('"')
                                 add_prefix = tokens[1]
-                                self.add_dblink(node, db_name, reference_id, )
+                                self.add_dblink(
+                                    node,
+                                    db_name,
+                                    reference_id,
+                                )
                             rel_type = self.rel_name_map.get(attr)
                             node.add_edge_type(rel_type, val)
         except Exception as ex:
             self.logger.error('line:', line)
         return node, prev_line_is_comment
-    def add_dblink(self, node:NodeData, db_name, reference_id):
+    def add_dblink(self, node: NodeData, db_name, reference_id):
         link_node = NodeData(NODE_DBLINK, PROP_REF_ID)
         if reference_id.startswith(db_name):
-            reference_id = reference_id[len(db_name)+1:]  # remove db prefix
+            reference_id = reference_id[len(db_name) + 1 :]  # remove db prefix
         link_node.update_attribute(PROP_REF_ID, reference_id)
         link_node.update_attribute(PROP_DB_NAME, db_name)
         node.add_edge(node, link_node, REL_DBLINKS)
     def create_indexes(self, database: Database):
-        database.create_index(self.entity_name, PROP_ID, f"index_{self.entity_name.lower}_id")
-        database.create_index(self.entity_name, PROP_BIOCYC_ID, f"index_{self.entity_name.lower}_biocycid")
-        database.create_index(self.entity_name, PROP_NAME, f"index_{self.entity_name.lower}_name")
-    def update_nodes_in_graphdb(self, nodes:[], database:Database, etl_load_id: str):
+        database.create_index(
+            self.entity_name, PROP_ID, f"index_{self.entity_name.lower}_id"
+        )
+        database.create_index(
+            self.entity_name, PROP_BIOCYC_ID, f"index_{self.entity_name.lower}_biocycid"
+        )
+        database.create_index(
+            self.entity_name, PROP_NAME, f"index_{self.entity_name.lower}_name"
+        )
+    def update_nodes_in_graphdb(self, nodes: [], database: Database, etl_load_id: str):
         Load or update nodes in KG. This can also be called for initial loading.
         :param nodes: list of nodes
         :param database: neo4j Database
         :param etl_load: Id that (virtually) links a node to an EtlLoad node.

Check failure on line 157 in /work/graph-db/extractor/src/biocyc/

See this annotation in the file changed.

@github-actions github-actions / Black

         self.logger.info('Update nodes: ' + ':'.join(self.node_labels))
         rows = []
         for node in nodes:
         attrs = self.attrs + [PROP_ID, PROP_DATA_SOURCE]
-        query = get_update_nodes_query(NODE_BIOCYC, PROP_BIOCYC_ID, attrs, self.node_labels, etl_load_id=etl_load_id, return_node_count=True)
+        query = get_update_nodes_query(
+            NODE_BIOCYC,
+            PROP_BIOCYC_ID,
+            attrs,
+            self.node_labels,
+            etl_load_id=etl_load_id,
+            return_node_count=True,
+        )
         return database.load_data_from_rows(query, rows, return_node_count=True)
-    def add_edges_to_graphdb(self, nodes:[], database:Database, etl_load_id):
+    def add_edges_to_graphdb(self, nodes: [], database: Database, etl_load_id):
         no_of_created_nodes = 0
         no_of_updated_nodes = 0
         no_of_created_relations = 0
         no_of_updated_relations = 0
         entity_rel_dict = dict()

Check failure on line 170 in /work/graph-db/extractor/src/biocyc/

See this annotation in the file changed.

@github-actions github-actions / Black


         for node in nodes:
             if self.create_synonym_rels():
                 id = node.get_attribute(PROP_BIOCYC_ID)
                 synonyms = node.get_synonym_set()
                 for syn in synonyms:
-                    synonym_list.append({PROP_BIOCYC_ID:id, PROP_NAME: syn})
+                    synonym_list.append({PROP_BIOCYC_ID: id, PROP_NAME: syn})
             for edge in node.edges:
                 from_id = edge.source.get_attribute(edge.source.id_attr)
                 to_id = edge.dest.get_attribute(edge.dest.id_attr)
                 rel = edge.label
                 if rel == REL_DBLINKS:

Check failure on line 229 in /work/graph-db/extractor/src/biocyc/

See this annotation in the file changed.

@github-actions github-actions / Black


                     db_name = edge.dest.get_attribute(PROP_DB_NAME)
                     if db_name in self.db_link_sources:
                         if db_name not in db_link_dict:
                             db_link_dict[db_name] = []
-                        db_link_dict[db_name].append({'from_id': from_id, 'to_id': to_id})
+                        db_link_dict[db_name].append(
+                            {'from_id': from_id, 'to_id': to_id}
+                        )
                     if rel not in entity_rel_dict:
                         entity_rel_dict[rel] = []
                     entity_rel_dict[rel].append({'from_id': from_id, 'to_id': to_id})
         if synonym_list:
             self.logger.info('Add synonyms')
-            query = get_create_synonym_relationships_query(NODE_BIOCYC, PROP_BIOCYC_ID, PROP_BIOCYC_ID, PROP_NAME, [], etl_load_id=etl_load_id, return_node_count=True)
+            query = get_create_synonym_relationships_query(
+                NODE_BIOCYC,
+                PROP_BIOCYC_ID,
+                PROP_BIOCYC_ID,
+                PROP_NAME,
+                [],
+                etl_load_id=etl_load_id,
+                return_node_count=True,
+            )
-            node_count, result_counters = database.load_data_from_rows(query, synonym_list, return_node_count=True)
+            node_count, result_counters = database.load_data_from_rows(
+                query, synonym_list, return_node_count=True
+            )
             no_of_created_nodes += result_counters.nodes_created
-            no_of_updated_nodes += (node_count - result_counters.nodes_created)
+            no_of_updated_nodes += node_count - result_counters.nodes_created
         for rel in entity_rel_dict.keys():
             self.logger.info('Add relationship ' + rel)
-            query = get_create_relationships_query(NODE_BIOCYC, PROP_BIOCYC_ID, 'from_id',
-                                                              NODE_BIOCYC, PROP_BIOCYC_ID, 'to_id', rel, etl_load_id=etl_load_id, return_node_count=True)
+            query = get_create_relationships_query(
+                NODE_BIOCYC,
+                PROP_BIOCYC_ID,
+                'from_id',
+                NODE_BIOCYC,
+                PROP_BIOCYC_ID,
+                'to_id',
+                rel,
+                etl_load_id=etl_load_id,
+                return_node_count=True,
+            )
-            node_count, result_counters = database.load_data_from_rows(query, entity_rel_dict[rel], return_node_count=True)
+            node_count, result_counters = database.load_data_from_rows(
+                query, entity_rel_dict[rel], return_node_count=True
+            )
             no_of_created_relations += result_counters.relationships_created
-            no_of_updated_relations += (node_count - result_counters.relationships_created)
-        _no_of_created_relations, _no_of_updated_relations = self.add_dblinks_to_graphdb(db_link_dict, database, etl_load_id)
+            no_of_updated_relations += (
+                node_count - result_counters.relationships_created
+            )
+        (
+            _no_of_created_relations,
+            _no_of_updated_relations,
+        ) = self.add_dblinks_to_graphdb(db_link_dict, database, etl_load_id)
         no_of_created_relations += _no_of_created_relations
         no_of_updated_relations += _no_of_updated_relations
-        return no_of_created_nodes, no_of_updated_nodes, no_of_created_relations, no_of_updated_relations
-    def add_dblinks_to_graphdb(self, db_link_dict:dict, database:Database, etl_load_id):
+        return (
+            no_of_created_nodes,
+            no_of_updated_nodes,
+            no_of_created_relations,
+            no_of_updated_relations,
+        )
+    def add_dblinks_to_graphdb(
+        self, db_link_dict: dict, database: Database, etl_load_id
+    ):
         no_of_created_relations = 0
         no_of_updated_relations = 0
         for db_name in db_link_dict.keys():
-            self.logger.info('Add DB Link relationship to ' + db_name )
+            self.logger.info('Add DB Link relationship to ' + db_name)
             dest_label = 'db_' + db_name
             rel = db_name.upper() + '_LINK'
-            query = get_create_relationships_query(NODE_BIOCYC, PROP_BIOCYC_ID, 'from_id',
-                                                              dest_label, PROP_ID, 'to_id', rel, etl_load_id=etl_load_id, return_node_count=True)
+            query = get_create_relationships_query(
+                NODE_BIOCYC,
+                PROP_BIOCYC_ID,
+                'from_id',
+                dest_label,
+                PROP_ID,
+                'to_id',
+                rel,
+                etl_load_id=etl_load_id,
+                return_node_count=True,
+            )
-            node_count, result_counters = database.load_data_from_rows(query, db_link_dict[db_name], return_node_count=True)
+            node_count, result_counters = database.load_data_from_rows(
+                query, db_link_dict[db_name], return_node_count=True
+            )
             no_of_created_relations += result_counters.relationships_created
-            no_of_updated_relations += (node_count - result_counters.relationships_created)
+            no_of_updated_relations += (
+                node_count - result_counters.relationships_created
+            )
         return no_of_created_relations, no_of_updated_relations
-    def write_entity_data_files(self, nodes:[]):
+    def write_entity_data_files(self, nodes: []):
         os.makedirs(self.db_output_dir, 0o777, True)
         self.logger.info(f'Writing {self.entity_name} files')
-        with open(os.path.join(self.db_output_dir, self.entity_name.lower() + '.tsv'), 'w') as f:
+        with open(
+            os.path.join(self.db_output_dir, self.entity_name.lower() + '.tsv'), 'w'
+        ) as f:
             attrs = [PROP_ID] + [PROP_DATA_SOURCE] + self.attrs
             f.write('\t'.join(attrs) + '\n')
             f.writelines(NodeData.get_entity_data_rows(nodes, attrs))

Check failure on line 16 in /work/graph-db/extractor/src/ncbi/

See this annotation in the file changed.

@github-actions github-actions / Black


 from common.query_builder import *
 from ncbi.ncbi_taxonomy_parser import *
 # reference to this directory
 directory = os.path.realpath(os.path.dirname(__file__))
 class NcbiTaxonomyChangeLog(ChangeLog):
     def __init__(self, author: str, change_id_prefix: str):
         super().__init__(author, change_id_prefix)
         self.date_tag ='%m%d%Y')

Check failure on line 47 in /work/graph-db/extractor/src/ncbi/

See this annotation in the file changed.

@github-actions github-actions / Black


     def load_ncbi_taxonomy_nodes(self):
         id = f'NCBI taxonomy data on date {self.date_tag}'
         if self.id_prefix:
             id = f'{self.id_prefix} {id}'
         comment = 'Load NCBI taxonomy nodes'
-        query = get_create_update_nodes_query(NODE_TAXONOMY, PROP_ID, NODE_ATTRS, [NODE_NCBI], datasource='NCBI Taxonomy')
-        changeset = CustomChangeSet(id, self.author, comment, query, f'{self.file_prefix}{NCBI_TAXONOMY_FILE}')
+        query = get_create_update_nodes_query(
+            NODE_TAXONOMY, PROP_ID, NODE_ATTRS, [NODE_NCBI], datasource='NCBI Taxonomy'
+        )
+        changeset = CustomChangeSet(
+            id, self.author, comment, query, f'{self.file_prefix}{NCBI_TAXONOMY_FILE}'
+        )
     def load_ncbi_taxonomy_synonym_rels(self):
         id = f'load NCBI taxonomy synonym relationship on date {self.date_tag}'
         if self.id_prefix:
             id = f'{self.id_prefix} {id}'
         comment = 'Load NCBI gene taxonomy relationship'
-        query = get_create_synonym_relationships_query(NODE_TAXONOMY, PROP_ID, PROP_ID, PROP_NAME, [PROP_TYPE])
-        changeset = CustomChangeSet(id, self.author, comment, query, f'{self.file_prefix}{NCBI_TAXONOMY_SYNONYM_FILE}')
+        query = get_create_synonym_relationships_query(
+            NODE_TAXONOMY, PROP_ID, PROP_ID, PROP_NAME, [PROP_TYPE]
+        )
+        changeset = CustomChangeSet(
+            id,
+            self.author,
+            comment,
+            query,
+            f'{self.file_prefix}{NCBI_TAXONOMY_SYNONYM_FILE}',
+        )
     def load_ncbi_taxonomy_parent_rels(self):
         id = f'create relationship between taxonomy and parent nodes on date {self.date_tag}'
         if self.id_prefix:

Check failure on line 59 in /work/graph-db/extractor/src/ncbi/

See this annotation in the file changed.

@github-actions github-actions / Black


         comment = 'Taxonomy relationship with parent'
         query = """
         CALL apoc.periodic.iterate(
         'MATCH (n:Taxonomy), (m:Taxonomy) WHERE m.prop = n.parent_id RETURN n, m',
         'MERGE (n)-[:HAS_PARENT]->(m)', {batchSize:5000})
-        """.replace('prop', PROP_ID)
+        """.replace(
+            'prop', PROP_ID
+        )
         changeset = ChangeSet(id, self.author, comment, query)
     def set_species_id(self):
         id = f'set species_id for taxonomy nodes on date {self.date_tag}'

Check failure on line 77 in /work/graph-db/extractor/src/ncbi/

See this annotation in the file changed.

@github-actions github-actions / Black


         changeset = ChangeSet(id, self.author, comment, query)
     def create_indexes(self):
         queries = []
-        queries.append(get_create_constraint_query(NODE_TAXONOMY, PROP_ID, 'constraint_taxonomy_id') + ';')
-        queries.append(get_create_constraint_query(NODE_SYNONYM, PROP_NAME, 'constraint_synonym_name') + ';')
-        queries.append(get_create_index_query(NODE_TAXONOMY, PROP_NAME, 'index_taxonomy_name') + ';')
-        queries.append(get_create_index_query(NODE_TAXONOMY, 'species_id', 'index_taxonomy_speciesid') + ';')
+        queries.append(
+            get_create_constraint_query(
+                NODE_TAXONOMY, PROP_ID, 'constraint_taxonomy_id'
+            )
+            + ';'
+        )
+        queries.append(
+            get_create_constraint_query(
+                NODE_SYNONYM, PROP_NAME, 'constraint_synonym_name'
+            )
+            + ';'
+        )
+        queries.append(
+            get_create_index_query(NODE_TAXONOMY, PROP_NAME, 'index_taxonomy_name')
+            + ';'
+        )
+        queries.append(
+            get_create_index_query(
+                NODE_TAXONOMY, 'species_id', 'index_taxonomy_speciesid'
+            )
+            + ';'
+        )
         return queries
     def add_index_change_set(self):
         id = f'create NCBI taxonomy constraints on date {self.date_tag}'
         if self.id_prefix: