Skip to content

Commit

Permalink
Add old symbols and implement client
Browse files Browse the repository at this point in the history
  • Loading branch information
bgyori committed Jun 15, 2024
1 parent b3e9023 commit 3a3256d
Show file tree
Hide file tree
Showing 2 changed files with 156 additions and 0 deletions.
142 changes: 142 additions & 0 deletions indra/databases/rgd_client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
"""A client for accessing RGD rat gene data."""

from collections import defaultdict
from typing import List, Union

from indra.util import read_unicode_csv
from indra.resources import get_resource_path


def get_id_from_name(name: str) -> Union[str, None]:
"""Return an RGD ID from an RGD gene symbol.
Parameters
----------
name :
The RGD gene symbol whose ID will be returned.
Returns
-------
:
The RGD ID (without prefix) or None if not available.
"""
return rgd_name_to_id.get(name)


def get_name_from_id(rgd_id: str) -> Union[str, None]:
"""Return the RGD gene symbol for a given RGD ID.
Parameters
----------
rgd_id :
The RGD ID (without prefix) whose symbol will be returned.
Returns
-------
:
The RGD symbol for the given ID or None if not available.
"""
if rgd_id and rgd_id.startswith('RGD:'):
rgd_id = rgd_id[4:]
return rgd_id_to_name.get(rgd_id)


def get_synonyms(rgd_id: str) -> List[str]:
"""Return the synonyms for an RGD ID.
Parameters
----------
rgd_id :
An RGD ID, without prefix.
Returns
-------
:
The list of synonyms corresponding to the RGD ID, or an empty list
if not available.
"""
if rgd_id and rgd_id.startswith('RGD:'):
rgd_id = rgd_id[4:]
return rgd_synonyms.get(rgd_id, [])


def get_id_from_name_synonym(name_synonym: str) -> Union[None, str, List[str]]:
"""Return an RGD ID from an RGD gene symbol or synonym.
If the given name or synonym is the official symbol of a gene, its
ID is returned. If the input is a synonym, it can correspond to
one or more genes. If there is a single gene whose synonym matches
the input, the ID is returned as a string. If multiple genes share
the given synonym, their IDs are returned in a list. If the input
doesn't match any names or synonyms, None is returned.
Parameters
----------
name_synonym :
The RGD gene symbol or synonym whose ID will be returned.
Returns
-------
:
The RGD ID (without prefix) of a single gene, a list of RGD IDs,
or None.
"""
rgd_id = rgd_name_to_id.get(name_synonym)
if rgd_id:
return rgd_id
rgd_ids = rgd_synonyms_reverse.get(name_synonym)
if rgd_ids:
if len(rgd_ids) == 1:
return rgd_ids[0]
else:
return rgd_ids
return None


def get_ensembl_id(rgd_id: str) -> Union[str, None]:
"""Return the Ensembl ID for an RGD ID.
Parameters
----------
rgd_id :
An RGD ID, without prefix.
Returns
-------
:
A list of Ensembl IDs corresponding to the RGD ID,
or None if not available.
"""
return rgd_id_to_ensembl.get(rgd_id)


def _read_rgd():
fname = get_resource_path('rgd_entries.tsv')
rgd_id_to_name = {}
rgd_name_to_id = {}
rgd_synonyms = {}
rgd_synonyms_reverse = defaultdict(list)
rgd_id_to_ensembl = {}
ensemble_id_to_rgd = {}
for rgd_id, name, synonyms_str, ensembl_id in \
read_unicode_csv(fname, '\t'):
if name:
rgd_id_to_name[rgd_id] = name
rgd_name_to_id[name] = rgd_id
if synonyms_str:
synonyms = synonyms_str.split(';')
rgd_synonyms[rgd_id] = synonyms
for synonym in synonyms:
rgd_synonyms_reverse[synonym].append(rgd_id)
if ensembl_id:
ensemble_ids = ensembl_id.split(';')
rgd_id_to_ensembl[rgd_id] = ensemble_ids
for ensemble_id in ensemble_ids:
ensemble_id_to_rgd[ensemble_id] = rgd_id

return rgd_id_to_name, rgd_name_to_id, rgd_synonyms, \
dict(rgd_synonyms_reverse), rgd_id_to_ensembl, ensemble_id_to_rgd


rgd_id_to_name, rgd_name_to_id, rgd_synonyms, rgd_synonyms_reverse, \
rgd_id_to_ensembl, ensemble_id_to_rgd = _read_rgd()
14 changes: 14 additions & 0 deletions indra/resources/update_resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -985,6 +985,20 @@ def update_ec_code():
PyOboClient.update_by_prefix("ec-code", indra_prefix='eccode')


def update_rgd():
url = "https://download.rgd.mcw.edu/data_release/GENES_RAT.txt"
df = pandas.read_csv(url, sep='\t', comment='#', dtype=str)
df = df[['GENE_RGD_ID', 'SYMBOL', 'OLD_SYMBOL', 'ENSEMBL_ID']]
df.rename(columns={'GENE_RGD_ID': 'ID',
'SYMBOL': 'symbol',
'OLD_SYMBOL': 'synonyms',
'ENSEMBL_ID': 'ensembl'},
inplace=True)
df['ID'] = df['ID'].astype(str)
fname = os.path.join(path, 'rgd_entries.tsv')
df.to_csv(fname, index=None, sep='\t')


def update_mgi():
# This provides basic ID and naming resources
url = "http://www.informatics.jax.org/downloads/reports/MRK_List2.rpt"
Expand Down

0 comments on commit 3a3256d

Please sign in to comment.