cognoma · dhimmel · Oct 9, 2016 · Oct 6, 2016 · Oct 7, 2016 · Oct 7, 2016
diff --git a/1.download.py b/1.download.py
@@ -0,0 +1,48 @@
+import os
+import collections
+import json
+import ftplib
+from datetime import datetime
+from urllib.request import urlretrieve
+
+host = 'ftp.ncbi.nih.gov'
+ftp_root = 'ftp://{}/'.format(host)
+
+def ncbi_ftp_download(ftp_paths, directory):
+    """
+    Download files from the NCBI FTP server. Returns a dictionary with datetime
+    information.
+    """
+    for ftp_path in ftp_paths:
+        url = ftp_root + ftp_path
+        path = os.path.join(directory, ftp_path.split('/')[-1])
+        urlretrieve(url, path)
+
+    versions = collections.OrderedDict()
+    versions['retrieved'] = datetime.utcnow().isoformat()
+
+    with ftplib.FTP(host) as ftp:
+        ftp.login()
+        for ftp_path in ftp_paths:
+            modified = ftp.sendcmd('MDTM ' + ftp_path)
+            _, modified = modified.split(' ')
+            modified = datetime.strptime(modified, '%Y%m%d%H%M%S')
+            versions[ftp_path] = modified.isoformat()
+
+    return versions
+
+if __name__ == '__main__':
+    # Where to download files
+    directory = 'download'
+
+    # Specific files to download
+    ftp_paths = [
+        'gene/DATA/gene_history.gz',
+        'gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz',
+    ]
+    versions = ncbi_ftp_download(ftp_paths, directory)
+
+    # Save version info as JSON
+    path = os.path.join('download', 'versions.json')
+    with open(path, 'w') as write_file:
+        json.dump(versions, write_file, indent=2)
diff --git a/2.process.py b/2.process.py
@@ -0,0 +1,146 @@
+import os
+import collections
+
+import pandas
+
+def create_history_df(path):
+    """
+    Process `gene_history.gz` for Project Cognoma. Returns a dataframe which
+    maps old GeneIDs to new GeneIDs. GeneIDs are not mapped to themselves, so
+    make sure not to filter all genes without an old GeneID when using this
+    dataset.
+    """
+
+    renamer = collections.OrderedDict([
+        ('Discontinued_GeneID', 'old_entrez_gene_id'),
+        ('GeneID', 'new_entrez_gene_id'),
+        ('Discontinue_Date', 'date'),
+        ('#tax_id', 'tax_id'),
+    ])
+
+    history_df = (
+        pandas.read_table(path, compression='gzip', na_values='-')
+        [list(renamer)]
+        .rename(columns=renamer)
+        .query("tax_id == 9606")
+        .drop(['tax_id'], axis='columns')
+        .dropna(subset=['new_entrez_gene_id'])
+        .sort_values('old_entrez_gene_id')
+    )
+    history_df.new_entrez_gene_id = history_df.new_entrez_gene_id.astype(int)
+
+    return history_df
+
+def create_gene_df(path):
+    """
+    Process `Homo_sapiens.gene_info.gz` for Project Cognoma. Filters genes to
+    homo sapiens with `tax_id == 9606` to remove Neanderthals et al.
+    """
+
+    renamer = collections.OrderedDict([
+        ('GeneID', 'entrez_gene_id'),
+        ('Symbol', 'symbol'),
+        ('description', 'description'),
+        ('chromosome', 'chromosome'),
+        ('type_of_gene', 'gene_type'),
+        ('Synonyms', 'synonyms'),
+        ('Other_designations', 'aliases'),
+        ('#tax_id', 'tax_id'),
+    ])
+
+    gene_df = (pandas.read_table(path, compression='gzip', na_values='-')
+        [list(renamer)]
+        .rename(columns=renamer)
+        .query("tax_id == 9606")
+        .drop(['tax_id'], axis='columns')
+        .sort_values('entrez_gene_id')
+    )
+
+    return gene_df
+
+def tidy_split(df, column, sep='|', keep=False):
+    """
+    Split the values of a column and expand so the new DataFrame has one split
+    value per row. Filters rows where the column is missing.
+
+    Params
+    ------
+    df : pandas.DataFrame
+        dataframe with the column to split and expand
+    column : str
+        the column to split and expand
+    sep : str
+        the string used to split the column's values
+    keep : bool
+        whether to retain the presplit value as it's own row
+
+    Returns
+    -------
+    pandas.DataFrame
+        Returns a dataframe with the same columns as `df`.
+    """
+    indexes = list()
+    new_values = list()
+    df = df.dropna(subset=[column])
+    for i, presplit in enumerate(df[column].astype(str)):
+        values = presplit.split(sep)
+        if keep and len(values) > 1:
+            indexes.append(i)
+            new_values.append(presplit)
+        for value in values:
+            indexes.append(i)
+            new_values.append(value)
+    new_df = df.iloc[indexes, :].copy()
+    new_df[column] = new_values
+    return new_df
+
+def get_chr_symbol_map(gene_df):
+    """
+    Create a dataframe for mapping genes to Entrez where all is known is the
+    chromosome and symbol. Symbol-chromosome pairs are unique. All approved
+    symbols should map and the majority of synonyms should also map. Only
+    synonyms that are ambigious within a chromosome are removed. 
+    """
+    primary_df = (gene_df
+        [['entrez_gene_id', 'chromosome', 'symbol']]
+        .pipe(tidy_split, column='chromosome', keep=True)
+    )
+
+    synonym_df = (gene_df
+        [['entrez_gene_id', 'chromosome', 'synonyms']]
+        .rename(columns={'synonyms': 'symbol'})
+        .pipe(tidy_split, column='symbol', keep=False)
+        .pipe(tidy_split, column='chromosome', keep=True)
+        .drop_duplicates(['chromosome', 'symbol'], keep=False)
+    )
+
+    map_df = (primary_df
+        .append(synonym_df)
+        .drop_duplicates(subset=['chromosome', 'symbol'], keep='first')
+        [['symbol', 'chromosome', 'entrez_gene_id']]
+        .sort_values(['symbol', 'chromosome'])
+    )
+
+    return map_df
+
+
+if __name__ == '__main__':
+
+    # History mapper
+    path = os.path.join('download', 'gene_history.gz')
+    history_df = create_history_df(path)
+
+    path = os.path.join('data', 'updater.tsv')
+    history_df.to_csv(path, index=False, sep='\t')
+
+    # Genes data
+    path = os.path.join('download', 'Homo_sapiens.gene_info.gz')
+    gene_df = create_gene_df(path)
+
+    path = os.path.join('data', 'genes.tsv')
+    gene_df.to_csv(path, index=False, sep='\t')
+
+    # Chromosome-Symbol Map
+    map_df = get_chr_symbol_map(gene_df)
+    path = os.path.join('data', 'chromosome-symbol-mapper.tsv')
+    map_df.to_csv(path, index=False, sep='\t')
diff --git a/README.md b/README.md
@@ -0,0 +1,30 @@
+# Genes for Project Cognoma
+
+This repository creates the set of genes to be used in Project Cognoma. The human subset of Entrez Gene is the basis of Cognoma genes. All genes in Cognoma should be converted to Entrez GeneIDs (using a preferred variable name of `entrez_gene_id`).
+
+When encountering genes in Project Cognoma, identify which of the following approach should be applied:
+
++ If the input genes are only in symbols, open an issue to discuss mapping options.
++ If the input genes contain chromosome and symbol information, use [`chromosome-symbol-map.tsv`](data/chromosome-symbol-map.tsv) to map the genes to Entrez GeneIDs.
++ If the genes are already encoded as Entrez GeneIDs, update the Gene_IDs to their most recent versions using [`updater.tsv`](data/updater.tsv) and remove GeneIDs that are not in [`genes.tsv`](data/genes.tsv).
+
+## Downloads and data
+
+The raw (downloaded) data is stored in the [`download`](download) directory. [`versions.json`](donwload/versions.json) contains timestamps for the raw data. The raw data is tracked since the Entrez Gene FTP site doesn't version and archive files.
+
+Created data is stored in the [`data`](data) directory. Applications should use the processed data rather than the raw data, if possible. Applications are strongly encouraged to use versioned (commit-hash-containing) links when accessing data from this repository.
+
+## Execution
+
+Use the following commands to run the analysis, inside the environment specified by [`environment.yml`](environment.yml):
+
+```sh
+# To run the entire analysis
+python 1.download.py
+python 2.process.py
+
+# To run just the data processing
+python 2.process.py
+```
+
+In general, we don't anticipate redownloading the data frequently. If you submit a pull request to create additional datasets, please do not execute `1.download.py`.