Skip to content

Commit

Permalink
Merge pull request #599 from monarch-initiative/issue-649-SO-terms-fo…
Browse files Browse the repository at this point in the history
…r-hgnc

Populate type for HGNC & Alliance gene nodes
  • Loading branch information
kevinschaper authored Sep 10, 2024
2 parents a668fa6 + ee40eef commit 24f9de3
Show file tree
Hide file tree
Showing 17 changed files with 171 additions and 79 deletions.
2 changes: 2 additions & 0 deletions docs/Sources/alliance.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ __**Biolink captured**__
* source
* synonyms
* xref
* type (["SO:0001217"])


## [Gene to Phenotype](#gene_to_phenotype)

Expand Down
3 changes: 3 additions & 0 deletions docs/Sources/hgnc.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ The HGNC is responsible for approving unique symbols and names for human loci, i

This ingest uses HGNC's "complete set" download file, which only contains associations between publications and genes that are denoted in some way in the publication. We have chosen to use a consistent high-level term for 'publication' (IAO:0000311) because the publications being referenced are a heterogeneous mix of publication types.

SO terms to populate the type are taken from the Alliance genome HGNC BGI files, provided by RGD.

__**Biolink Captured**__

* biolink:Gene
Expand All @@ -24,6 +26,7 @@ __**Biolink Captured**__
* omim id
* in_taxon (["NCBITaxon:9606"])
* provided_by (["infores:hgnc"])
* type (["SO:0001217"])

## Citation

Expand Down
13 changes: 12 additions & 1 deletion scripts/after_download.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,18 @@
#!/bin/sh

# Set ZCAT to gzcat if gzcat is available, otherwise fall back to zcat.
# The redirect is written as "> /dev/null 2>&1" rather than "&>" because this
# script runs under /bin/sh: "&>" is a bash extension, and a POSIX shell such
# as dash parses it as "run in background" followed by an empty redirect, so
# the test would always succeed and gzcat would be chosen even when missing.
if command -v gzcat > /dev/null 2>&1
then
  ZCAT=gzcat
else
  ZCAT=zcat
fi

# Make a simple text file of all the gene IDs in Alliance
zcat data/alliance/BGI_*.gz | jq '.data[].basicGeneticEntity.primaryId' | pigz > data/alliance/alliance_gene_ids.txt.gz
${ZCAT} data/alliance/BGI_*.gz | jq '.data[].basicGeneticEntity.primaryId' | pigz > data/alliance/alliance_gene_ids.txt.gz

# Make a two column tsv of human gene IDs and SO terms
${ZCAT} data/alliance/BGI_HUMAN.json.gz | jq -r '.data[] | "\(.basicGeneticEntity.primaryId)\t\(.soTermId)"' > data/hgnc/hgnc_so_terms.tsv

# Make an id, name map of DDPHENO terms
sqlite3 -cmd ".mode tabs" -cmd ".headers on" data/dictybase/ddpheno.db "select subject as id, value as name from rdfs_label_statement where predicate = 'rdfs:label' and subject like 'DDPHENO:%'" > data/dictybase/ddpheno.tsv
Expand Down
17 changes: 11 additions & 6 deletions src/monarch_ingest/cli_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import csv
import gc
import os
import sys
import tarfile
Expand Down Expand Up @@ -407,12 +406,13 @@ def apply_closure(
)
sh.mv(database, f"{output_dir}/")


def load_sqlite():
sh.bash("scripts/load_sqlite.sh")


def load_solr():
sh.bash("scripts/load_solr.sh", _out=sys.stdout, _err=sys.stderr)
sh.bash("scripts/load_solr.sh", _out=sys.stdout, _err=sys.stderr)


def load_jsonl():
Expand Down Expand Up @@ -446,26 +446,31 @@ def slot_is_multi_valued(slot_name: str) -> bool:
mv_node_replacement = ", ".join([f"string_split({col}, '|') as {col}" for col in mv_node_columns])
mv_edge_replacement = ", ".join([f"string_split({col}, '|') as {col}" for col in mv_edge_columns])

db.sql(f"""
db.sql(
f"""
copy (
select nodes.* replace (ancestors as category, {mv_node_replacement})
from nodes
join class_ancestor_df on category = classname
) to 'output/monarch-kg_nodes.jsonl' (FORMAT JSON);
""")
"""
)

db.sql(f"""
db.sql(
f"""
copy (
select edges.* replace (ancestors as category, {mv_edge_replacement}),
from edges
join class_ancestor_df on category = classname
) to 'output/monarch-kg_edges.jsonl' (FORMAT JSON);
""")
"""
)


def export_tsv():
export()


def do_prepare_release(dir: str = OUTPUT_DIR):

compressed_artifacts = [
Expand Down
4 changes: 4 additions & 0 deletions src/monarch_ingest/download.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@
url: https://fms.alliancegenome.org/download/GENECROSSREFERENCE_COMBINED.tsv.gz
local_name: data/alliance/GENECROSSREFERENCE_COMBINED.tsv.gz
tag: alliance_gene
-
url: https://fms.alliancegenome.org/download/BGI_HUMAN.json.gz
local_name: data/alliance/BGI_HUMAN.json.gz
tag: hgnc_gene
-
url: https://fms.alliancegenome.org/download/BGI_MGI.json.gz
local_name: data/alliance/BGI_MGI.json.gz
Expand Down
3 changes: 1 addition & 2 deletions src/monarch_ingest/ingests/alliance/gene.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,7 @@
symbol=row["symbol"],
name=row["symbol"],
full_name=row["name"].replace("\r", ""), # Replacement to remove stray carriage returns in XenBase files
# No place in the schema for gene type (SO term) right now
# type=row["soTermId"],
type=[row["soTermId"]],
in_taxon=[in_taxon],
in_taxon_label=in_taxon_label,
provided_by=[source],
Expand Down
1 change: 1 addition & 0 deletions src/monarch_ingest/ingests/alliance/gene.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ node_properties:
- 'provided_by'
- 'name'
- 'symbol'
- 'type'
- 'full_name'
- 'description'
- 'in_taxon'
Expand Down
4 changes: 2 additions & 2 deletions src/monarch_ingest/ingests/alliance/gene_to_phenotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@

koza_app = get_koza_app("alliance_gene_to_phenotype")

while (row := koza_app.get_row()) is not None:
gene_ids = koza_app.get_map("alliance-gene")

gene_ids = koza_app.get_map("alliance-gene")
while (row := koza_app.get_row()) is not None:

if len(row["phenotypeTermIdentifiers"]) == 0:
logger.debug("Phenotype ingest record has 0 phenotype terms: " + str(row))
Expand Down
3 changes: 3 additions & 0 deletions src/monarch_ingest/ingests/hgnc/gene.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

koza_app = get_koza_app("hgnc_gene")

so_term_map = koza_app.get_map("hgnc-so-terms")

while (row := koza_app.get_row()) is not None:

xref_list = []
Expand Down Expand Up @@ -31,6 +33,7 @@
symbol=row["symbol"],
name=row["symbol"],
full_name=row["name"],
type=[so_term_map[row['hgnc_id']]['so_term_id']] if row['hgnc_id'] in so_term_map else None,
xref=xref_list,
synonym=synonyms_list,
in_taxon=[in_taxon],
Expand Down
4 changes: 4 additions & 0 deletions src/monarch_ingest/ingests/hgnc/gene.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ delimiter: '\t'

global_table: './src/monarch_ingest/translation_table.yaml'

depends_on:
- './src/monarch_ingest/maps/hgnc-so-terms.yaml'

columns:
- hgnc_id
- symbol
Expand Down Expand Up @@ -73,6 +76,7 @@ node_properties:
- 'full_name'
- 'in_taxon'
- 'in_taxon_label'
- 'type'
- 'xref'
- 'synonym'
- 'provided_by'
Expand Down
5 changes: 3 additions & 2 deletions src/monarch_ingest/ingests/hpoa/disease_to_phenotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
)
from monarch_ingest.ingests.hpoa.hpoa_utils import phenotype_frequency_to_hpo_term, Frequency


def get_primary_knowledge_source(disease_id: str) -> str:
if disease_id.startswith("OMIM"):
return "infores:omim"
Expand Down Expand Up @@ -90,7 +91,7 @@ def get_primary_knowledge_source(disease_id: str) -> str:
# don't populate the reference with the database_id / disease id
publications = [p for p in publications if not p == row["database_id"]]

primary_knowledge_source = get_primary_knowledge_source(disease_id )
primary_knowledge_source = get_primary_knowledge_source(disease_id)

# Association/Edge
association = DiseaseToPhenotypicFeatureAssociation(
Expand All @@ -108,7 +109,7 @@ def get_primary_knowledge_source(disease_id: str) -> str:
frequency_qualifier=frequency.frequency_qualifier if frequency.frequency_qualifier else None,
has_count=frequency.has_count,
has_total=frequency.has_total,
aggregator_knowledge_source=["infores:monarchinitiative","infores:hpo-annotations"],
aggregator_knowledge_source=["infores:monarchinitiative", "infores:hpo-annotations"],
primary_knowledge_source=primary_knowledge_source,
knowledge_level=KnowledgeLevelEnum.knowledge_assertion,
agent_type=AgentTypeEnum.manual_agent,
Expand Down
4 changes: 3 additions & 1 deletion src/monarch_ingest/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,9 +164,11 @@ def solr():
def export():
export_tsv()


@typer_app.command()
def prepare_release():
do_prepare_release();
do_prepare_release()


@typer_app.command()
def release(
Expand Down
22 changes: 22 additions & 0 deletions src/monarch_ingest/maps/hgnc-so-terms.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
name: 'hgnc-so-terms'

metadata:
description: 'Mapping file to look up SO terms (aka type) for HGNC genes, generated from Alliance BGI files and provided by RGD'

delimiter: '\t'

files:
- './data/hgnc/hgnc_so_terms.tsv'

header: 'none'

columns:
- 'gene_id'
- 'so_term_id'

key: 'gene_id'

values:
- 'gene_id'
- 'so_term_id'

Loading

0 comments on commit 24f9de3

Please sign in to comment.