Merge pull request #84 from PNNL-CompBio/data-trigger

Data trigger file reorg
PNNL-CompBio · Apr 22, 2024 · 8268aac · 8268aac
2 parents f098a95 + a98eaa6
commit 8268aac
Show file tree

Hide file tree

Showing 96 changed files with 6,498 additions and 302 deletions.
diff --git a/MASV_classAndSource.csv b/MASV_classAndSource.csv
diff --git a/buildV3db.sh b/buildV3db.sh
diff --git a/build_script.py b/build_script.py
@@ -0,0 +1,174 @@
+'''
+Build script moved to Python for better extensibility and interoperability.
+
+'''
+
+
+import os
+import subprocess
+import pandas as pd
+import argparse
+
+
+def collectFiles(data_dir='https://raw.githubusercontent.com/PNNL-CompBio/srpAnalytics/main/data',filename='srp_build_files.csv'):
+    '''
+    read the build manifest and return it as a pandas DataFrame
+
+    data_dir: URL or local directory holding the manifest; trailing slashes
+        are ignored
+    filename: name of the manifest csv inside data_dir
+
+    returns the DataFrame produced by pandas.read_csv (not a dictionary);
+    any error raised by pandas.read_csv while fetching or parsing the
+    manifest propagates to the caller
+    '''
+    # strip trailing slashes so 'dir/' and 'dir' resolve to the same path
+    # instead of producing 'dir//file'
+    manifest = data_dir.rstrip('/')+'/'+filename
+    return pd.read_csv(manifest)
+
+
+def fitCurveFiles(morpho_behavior_tuples):
+    '''
+    placeholder for recomputing curve fits; no work is performed yet
+
+    morpho_behavior_tuples: list of morphology/behavior pairs (currently unused)
+    '''
+    return None
+
+
+
+def combineFiles(location_list,ftype):
+    '''
+    helper function to combine files of one type and drop duplicate rows
+
+    location_list: table with a `location` column; each entry is a path or
+        URL readable by pandas.read_csv
+    ftype: one of 'bmd', 'dose' or 'fit'; selects which columns are kept
+
+    returns a DataFrame restricted to the required columns for ftype with
+    exact duplicate rows removed; when location_list has no rows an empty
+    DataFrame with those columns is returned
+
+    raises KeyError if ftype is unknown or a file lacks a required column
+    '''
+    # NOTE: BMD10_Flag / BMD50_Flag are currently not part of the bmd columns
+    required_columns = {'bmd':['Chemical_ID','End_Point','Model','BMD10','BMD50',"Min_Dose","Max_Dose",\
+                                "AUC_Norm","DataQC_Flag","BMD_Analysis_Flag"],
+                          'dose':['Chemical_ID',"End_Point","Dose","Response","CI_Lo","CI_Hi"],\
+                          'fit':['Chemical_ID',"End_Point","X_vals","Y_vals"]}
+
+    print('concatenating '+ftype)
+    columns = required_columns[ftype]
+    dflist = [pd.read_csv(loc)[columns] for loc in location_list.location]
+    if not dflist:
+        # pd.concat raises ValueError on an empty list, so hand back an
+        # empty table with the expected schema instead
+        return pd.DataFrame(columns=columns)
+
+    return pd.concat(dflist).drop_duplicates()
+
+def runSampMap(is_sample=False,drcfiles=None,smap='',cid='',\
+               emap='',cclass='',ctfile='',fses='',desfile=''):
+    '''
+    run sample mapping by calling sampleChemMapping/mapSamplesToChems.R
+
+    is_sample: when True pass --sample together with --drcFiles
+    drcfiles: list of dose-response file names, joined with commas; when
+        is_sample is False and the list is non-empty, --chemical is passed
+        instead; when it is empty neither mode flag nor --drcFiles is passed
+    smap, cid, emap, cclass, ctfile, fses, desfile: strings forwarded as
+        --sampMap/--sampId, --chemId, --epMap, --chemClass, --compToxFile,
+        --sampleFiles and --chemDesc
+
+    the command line is printed before it is run; a failure to launch or a
+    non-zero exit status is reported on stdout and does not raise
+    '''
+    drcfiles = [] if drcfiles is None else drcfiles
+
+    args = ['Rscript','sampleChemMapping/mapSamplesToChems.R']
+    if is_sample:
+        args += ['--sample','--drcFiles='+','.join(drcfiles)]
+    elif len(drcfiles)>0:
+        args += ['--chemical','--drcFiles='+','.join(drcfiles)]
+
+    # NOTE(review): --sampId and --sampMap both receive smap; this function
+    # has no separate sample-id parameter - confirm that is intended
+    args += ['--sampId='+smap,'--chemId='+cid,'--epMap='+emap,'--chemClass='+cclass,
+             '--compToxFile='+ctfile,'--sampleFiles='+fses,'--chemDesc='+desfile,
+             '--sampMap='+smap]
+
+    print(' '.join(args))
+    # an argument list (no shell) keeps URLs and file names intact even when
+    # they contain characters the shell would interpret
+    try:
+        status = subprocess.run(args,check=False).returncode
+    except OSError as err:
+        print('unable to launch Rscript: '+str(err))
+        return
+    if status != 0:
+        print('mapSamplesToChems.R exited with status '+str(status))
+
+def runExposome(chem_id_file=None):
+    '''
+    placeholder for the exposome data pull
+
+    chem_id_file: chemical id file; accepted but not used yet, and optional
+        so the function can be called with or without it
+
+    the command string below is only built, never executed, and nothing is
+    returned
+    '''
+    ##TODO: make this work with chemical id file as argument
+    # NOTE(review): cmd is not run anywhere in this function - confirm
+    # whether the exposome script is meant to be launched here
+    cmd = 'Rscript exposome/exposome_summary_stats.R'
+
+def runExpression(gex,chem):
+    '''
+    parse expression data by running zfExp/parseGexData.R through the shell
+
+    gex: string placed as the first argument after the script name
+    chem: string placed as the second argument
+
+    the command is printed and then handed to os.system; its exit status
+    is ignored
+    '''
+    command = ' '.join(['Rscript zfExp/parseGexData.R',gex,chem])
+    print(command)
+    os.system(command)
+
+def runSchemaCheck(dbfiles=None):
+    '''
+    placeholder for schema checking
+
+    dbfiles: optional list of files to validate; accepted but not used yet
+
+    the command string below is only built, never executed, and nothing is
+    returned
+    '''
+    ##TODO: make this work with files as arguments
+    # NOTE(review): cmd is not run anywhere in this function - confirm
+    # whether the schema check is meant to be launched here
+    cmd = 'python dbSchema/main.py'
+
+def _firstLocation(df,name):
+    '''
+    return the location of the first row of the file table whose name
+    equals `name`; raises IndexError when there is no such row
+    '''
+    matches = df.loc[df.name==name].location
+    if matches.empty:
+        raise IndexError('no entry named '+name+' in the build file list')
+    return matches.iloc[0]
+
+
+def main():
+    '''
+    this wrapping script is placed into every docker image to pull the files
+    from the repo and initiate the appropriate call to the underlying code.
+
+    command line flags (all optional, may be combined):
+      --bmd     re-run benchmark dose calculation, then sample mapping
+      --samps   re-run sample-chem mapping
+      --expo    re-run exposome sample collection
+      --geneEx  re-run gene expression generation
+
+    raises IndexError when a required named entry is missing from the file list
+    '''
+    ##parse the command line first so --help and bad flags fail before any download
+    # the text is passed as description=; as the first positional argument it would replace the program name
+    parser = argparse.ArgumentParser(description='Pull files from github list of files and call appropriate command')
+    parser.add_argument('--bmd', dest='bmd',action='store_true', default=False, help='Re-run benchmark dose calculation and dependent commands')
+    parser.add_argument('--samps', dest='samps', action='store_true', default=False, help='Re run sample-chem mapping')
+    parser.add_argument('--expo', dest='expo', action='store_true', default=False, help='Re run exposome sample collection')
+    parser.add_argument('--geneEx', dest='geneEx', action='store_true', default=False, help='Re run gene expression generation')
+    args = parser.parse_args()
+
+    df = collectFiles()
+
+    ####
+    # file parsing - collects all files we might need for the tool below
+    ####
+    ##first find the morphology and behavior pairs for chemical sources
+    chemdf = df.loc[df.sample_type=='chemical']
+    morph = chemdf.loc[chemdf.data_type=='morphology']
+    beh = chemdf.loc[chemdf.data_type=='behavior']
+    tupes = [[morph.loc[morph.name==n].location,beh.loc[beh.name==n].location] for n in morph.name]
+
+    ##now map sample information
+    # NOTE(review): sid is looked up but never forwarded; runSampMap is given
+    # smap for both --sampId and --sampMap - confirm that is intended
+    sid = _firstLocation(df,'sampId')
+    cid = _firstLocation(df,'chemId')
+    cclass = _firstLocation(df,'class1')
+    emap = _firstLocation(df,'endpointMap')
+    fses = ','.join(list(df.loc[df.data_type=='sample'].location))
+    ctfile = _firstLocation(df,'compTox')
+    desfile = _firstLocation(df,'chemdesc')
+    smap = _firstLocation(df,'sampMap')
+    gex1 = ','.join(list(df.loc[df.data_type=='expression'].location))
+    # NOTE(review): ginfo is looked up but not forwarded because
+    # runExpression accepts only (gex, chem) - confirm whether the R script needs it
+    ginfo = _firstLocation(df,'geneInfo')
+
+    ###now we can call individual commands
+    ##call bmdrc on all morphology/behavior pairs for sample sources
+    if args.bmd:
+        print("Re-running benchmark dose collection")
+        fitCurveFiles(tupes)
+
+    if args.bmd or args.samps: ### need to rerun samples if we have created new bmds
+        #add chemical BMDS, fits, curves to existing data
+        chemfiles=[]
+        sampfiles=[]
+        for st in ['chemical','extract']:
+            for dt in ['bmd','fit','dose']:
+                # single combined mask rather than chaining .loc with a mask
+                # computed on the unfiltered table
+                selected = df.loc[(df.sample_type==st)&(df.data_type==dt)]
+                fdf = combineFiles(selected,dt)
+                fname = 'tmp_'+st+'_'+dt+'.csv'
+                fdf.to_csv(fname,index=False)
+                if st=='chemical':
+                    chemfiles.append(fname)
+                else:
+                    sampfiles.append(fname)
+        runSampMap(True,sampfiles,smap,cid,emap,cclass,ctfile,fses,desfile)
+        runSampMap(False,chemfiles,smap,cid,emap,cclass,ctfile,fses,desfile)
+        runSampMap(False,[],smap,cid,emap,cclass,ctfile,fses,desfile)
+
+        ##now we run validation
+        runSchemaCheck()
+    if args.expo:
+        runExposome(cid)
+        runSchemaCheck()
+    if args.geneEx:
+        # the expression step reads chemicals.csv, so run the mapping first
+        # when that file is not present yet
+        if not os.path.exists("chemicals.csv"):
+            runSampMap(False,[],smap,cid,emap,cclass,ctfile,fses,desfile)
+
+        runExpression(gex1,'chemicals.csv')
+        runSchemaCheck()
+
+
+# run the build only when executed as a script, so importing this module
+# (e.g. to reuse combineFiles) does not start a build
+if __name__=='__main__':
+    main()