Skip to content

Commit

Permalink
Merge pull request #84 from PNNL-CompBio/data-trigger
Browse files Browse the repository at this point in the history
Data trigger file reorg
  • Loading branch information
sgosline authored Apr 22, 2024
2 parents f098a95 + a98eaa6 commit 8268aac
Show file tree
Hide file tree
Showing 96 changed files with 6,498 additions and 302 deletions.
1,554 changes: 1,554 additions & 0 deletions MASV_classAndSource.csv

Large diffs are not rendered by default.

138 changes: 0 additions & 138 deletions buildV3db.sh

This file was deleted.

174 changes: 174 additions & 0 deletions build_script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
'''
Build script moved to Python for better extensibility and interoperability.
'''


import os
import subprocess
import pandas as pd
import argparse


def collectFiles(data_dir='https://raw.githubusercontent.com/PNNL-CompBio/srpAnalytics/main/data',filename='srp_build_files.csv'):
    '''
    read the build file listing stored at data_dir/filename and return it as
    a pandas DataFrame; the listing is re-read on every call so updates to the
    build file are picked up
    '''
    listing_path = '/'.join([data_dir, filename])
    return pd.read_csv(listing_path)


def fitCurveFiles(morpho_behavior_tuples):
    '''
    placeholder for computing new curve fits

    morpho_behavior_tuples: list of tuples of morpho/behavior pairs
    not implemented yet: the argument is ignored and None is returned
    '''
    return None



def combineFiles(location_list,ftype):
    '''
    helper function to combine duplicates

    reads every file named in location_list.location, keeps only the columns
    required for ftype, stacks the results and removes duplicated rows

    location_list: table with a `location` column of paths/URLs that
        pandas.read_csv can open
    ftype: 'bmd', 'dose' or 'fit'; selects the set of required columns
    returns: DataFrame holding only the required columns, duplicate rows
        removed; an empty DataFrame with those columns when location_list
        has no rows
    raises: KeyError when ftype is not a known type or a file lacks one of
        the required columns
    '''
    # BMD10_Flag and BMD50_Flag are deliberately not part of the 'bmd' set
    required_columns = {
        'bmd': ['Chemical_ID', 'End_Point', 'Model', 'BMD10', 'BMD50',
                'Min_Dose', 'Max_Dose', 'AUC_Norm', 'DataQC_Flag',
                'BMD_Analysis_Flag'],
        'dose': ['Chemical_ID', 'End_Point', 'Dose', 'Response', 'CI_Lo', 'CI_Hi'],
        'fit': ['Chemical_ID', 'End_Point', 'X_vals', 'Y_vals'],
    }
    columns = required_columns[ftype]

    print('concatenating '+ftype)
    frames = [pd.read_csv(loc)[columns] for loc in location_list.location]
    if not frames:
        # pd.concat raises ValueError when given nothing to concatenate, so a
        # sample type with no files of this kind yields an empty table instead
        return pd.DataFrame(columns=columns)

    return pd.concat(frames).drop_duplicates()

def runSampMap(is_sample=False, drcfiles=None, smap='', cid='',
               emap='', cclass='', ctfile='', fses='', desfile=''):
    '''
    run sample mapping

    builds the command line for sampleChemMapping/mapSamplesToChems.R, prints
    it and runs it through the shell

    is_sample: when true the script is called with --sample and --drcFiles
    drcfiles: list of dose-response files, joined with commas for --drcFiles;
        when is_sample is false and this list is non-empty the script is
        called with --chemical and --drcFiles, when it is empty neither the
        mode flag nor --drcFiles is passed
    smap: value passed as both --sampId and --sampMap
    cid: value passed as --chemId
    emap: value passed as --epMap
    cclass: value passed as --chemClass
    ctfile: value passed as --compToxFile
    fses: value passed as --sampleFiles
    desfile: value passed as --chemDesc
    returns: None; the exit status of the command is not inspected
    '''
    # None instead of a shared mutable [] default
    drcfiles = [] if drcfiles is None else drcfiles

    parts = ['Rscript sampleChemMapping/mapSamplesToChems.R']
    if is_sample:
        parts += ['--sample', '--drcFiles='+','.join(drcfiles)]
    elif len(drcfiles) > 0:
        parts += ['--chemical', '--drcFiles='+','.join(drcfiles)]

    # options shared by all three invocations
    # NOTE(review): --sampId is filled from smap, the same value as --sampMap;
    # confirm whether a separate sample-id file was intended here
    parts += ['--sampId='+smap,
              '--chemId='+cid,
              '--epMap='+emap,
              '--chemClass='+cclass,
              '--compToxFile='+ctfile,
              '--sampleFiles='+fses,
              '--chemDesc='+desfile,
              '--sampMap='+smap]

    # NOTE: values are placed on the shell command line unquoted
    cmd = ' '.join(parts)
    print(cmd)
    os.system(cmd)

def runExposome(chem_id_file=None):
    '''
    run exposome data pull

    prints and runs exposome/exposome_summary_stats.R through the shell

    chem_id_file: chemical id file; accepted but not yet forwarded to the
        R script (see TODO below), so it may be omitted
    returns: None; the exit status of the command is not inspected
    '''
    ##TODO: make this work with chemical id file as argument
    cmd = 'Rscript exposome/exposome_summary_stats.R'
    print(cmd)
    # the command used to be built and then dropped; execute it like the
    # other run* helpers do
    os.system(cmd)

def runExpression(gex,chem):
    '''
    run expression parsing: print and execute zfExp/parseGexData.R with gex
    and chem as its two positional arguments
    '''
    pieces = ['Rscript zfExp/parseGexData.R', gex, chem]
    cmd = ' '.join(pieces)
    print(cmd)
    os.system(cmd)

def runSchemaCheck(dbfiles=None):
    '''
    run schema checking

    prints and runs dbSchema/main.py through the shell

    dbfiles: list of files to check; accepted but not yet forwarded to the
        schema checker (see TODO below)
    returns: None; the exit status of the command is not inspected
    '''
    ##TODO: make this work with files as arguments
    cmd = 'python dbSchema/main.py'
    print(cmd)
    # the command used to be built and then dropped; execute it like the
    # other run* helpers do
    os.system(cmd)

def _firstLocation(df, name):
    '''
    return the location of the first row of df whose name column equals name;
    raises ValueError when the build file list has no such row
    '''
    matches = list(df.loc[df.name == name].location)
    if not matches:
        raise ValueError("build file list has no entry named '" + name + "'")
    return matches[0]


def main():
    '''
    this wrapping script is placed into every docker image to pull the files
    from the repo and initiate the appropriate call to the underlying code.

    command line flags (all optional, may be combined):
      --bmd     re-run benchmark dose calculation and the sample-chem mapping
      --samps   re-run sample-chem mapping
      --expo    re-run exposome sample collection
      --geneEx  re-run gene expression generation
    every selected step is followed by a schema check
    '''
    ##parse the command line first so --help and bad flags do not need the file list
    parser = argparse.ArgumentParser('Pull files from github list of files and call appropriate command')
    parser.add_argument('--bmd', dest='bmd', action='store_true', default=False, help='Re-run benchmark dose calculation and dependent commands')
    parser.add_argument('--samps', dest='samps', action='store_true', default=False, help='Re run sample-chem mapping')
    parser.add_argument('--expo', dest='expo', action='store_true', default=False, help='Re run exposome sample collection')
    parser.add_argument('--geneEx', dest='geneEx', action='store_true', default=False, help='Re run gene expression generation')
    args = parser.parse_args()

    df = collectFiles()

    ####
    # file parsing - collects all files we might need for the tool below
    ####
    ##first find the morphology and behavior pairs for chemical sources
    chemdf = df.loc[df.sample_type == 'chemical']
    morph = chemdf.loc[chemdf.data_type == 'morphology']
    beh = chemdf.loc[chemdf.data_type == 'behavior']
    tupes = [(morph.loc[morph.name == n].location, beh.loc[beh.name == n].location)
             for n in morph.name]

    ##now map sample information
    # NOTE(review): sid is looked up but never handed to runSampMap, whose
    # --sampId option is filled from its smap argument; confirm the wiring
    sid = _firstLocation(df, 'sampId')
    cid = _firstLocation(df, 'chemId')
    cclass = _firstLocation(df, 'class1')
    emap = _firstLocation(df, 'endpointMap')
    fses = ','.join(list(df.loc[df.data_type == 'sample'].location))
    ctfile = _firstLocation(df, 'compTox')
    desfile = _firstLocation(df, 'chemdesc')
    smap = _firstLocation(df, 'sampMap')
    gex1 = ','.join(list(df.loc[df.data_type == 'expression'].location))
    # NOTE(review): the geneInfo location is not consumed yet; runExpression
    # takes only the expression files and the chemical file
    ginfo = _firstLocation(df, 'geneInfo')

    ###now we can call individual commands
    ##call bmdrc on all morphology/behavior pairs for sample sources
    if args.bmd:
        print("Re-running benchmark dose collection")
        fitCurveFiles(tupes)

    if args.bmd or args.samps:  ### need to rerun samples if we have created new bmds
        #add chemical BMDS, fits, curves to existing data
        chemfiles = []
        sampfiles = []
        for st in ['chemical', 'extract']:
            for dt in ['bmd', 'fit', 'dose']:
                fdf = combineFiles(df.loc[(df.sample_type == st) & (df.data_type == dt)], dt)
                fname = 'tmp_'+st+'_'+dt+'.csv'
                fdf.to_csv(fname, index=False)
                if st == 'chemical':
                    chemfiles.append(fname)
                else:
                    sampfiles.append(fname)
        runSampMap(True, sampfiles, smap, cid, emap, cclass, ctfile, fses, desfile)
        runSampMap(False, chemfiles, smap, cid, emap, cclass, ctfile, fses, desfile)
        runSampMap(False, [], smap, cid, emap, cclass, ctfile, fses, desfile)

        ##now we run validation
        runSchemaCheck()

    if args.expo:
        runExposome(cid)
        runSchemaCheck()

    if args.geneEx:
        # the expression step reads chemicals.csv; regenerate it through the
        # sample mapping when it is not present
        if not os.path.exists("chemicals.csv"):
            runSampMap(False, [], smap, cid, emap, cclass, ctfile, fses, desfile)

        runExpression(gex1, 'chemicals.csv')
        runSchemaCheck()


if __name__ == '__main__':
    main()
Loading

0 comments on commit 8268aac

Please sign in to comment.