Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Data trigger file reorg #84

Merged
merged 3 commits into from
Apr 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,554 changes: 1,554 additions & 0 deletions MASV_classAndSource.csv

Large diffs are not rendered by default.

138 changes: 0 additions & 138 deletions buildV3db.sh

This file was deleted.

174 changes: 174 additions & 0 deletions build_script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
'''
Build script moved to python for better extendability and interoperability.

'''


import os
import subprocess
import pandas as pd
import argparse


def collectFiles(data_dir='https://raw.githubusercontent.com/PNNL-CompBio/srpAnalytics/main/data',filename='srp_build_files.csv'):
'''
every time the build file is updated, this script will collect the files and return
a dictionary of files to be fed into each module
'''
df = pd.read_csv(data_dir+'/'+filename)
return df


def fitCurveFiles(morpho_behavior_tuples):
'''
get new curve fits, list of tuples of morpho/behavior pairs
'''



def combineFiles(location_list,ftype):
'''
helper function to combine duplicates
'''
dflist=[]
required_columns = {'bmd':['Chemical_ID','End_Point','Model','BMD10','BMD50',"Min_Dose","Max_Dose",\
"AUC_Norm","DataQC_Flag","BMD_Analysis_Flag"],#,"BMD10_Flag","BMD50_Flag{"),
'dose':['Chemical_ID',"End_Point","Dose","Response","CI_Lo","CI_Hi"],\
'fit':['Chemical_ID',"End_Point","X_vals","Y_vals"]}

print('concatenating '+ftype)
for loc in location_list.location:
f = pd.read_csv(loc)[required_columns[ftype]]
dflist.append(f)
fulldf=pd.concat(dflist)
fulldf = fulldf.drop_duplicates()


return fulldf.drop_duplicates()

def runSampMap(is_sample=False,drcfiles=[],smap='',cid='',\
emap='',cclass='',ctfile='',fses='',desfile=''):
'''
run sample mapping
'''
if is_sample:
cmd = "Rscript sampleChemMapping/mapSamplesToChems.R --sample --drcFiles="+','.join(drcfiles)+\
' --sampId='+smap+' --chemId='+cid+' --epMap='+emap+' --chemClass='+cclass+\
' --compToxFile='+ctfile+' --sampleFiles='+fses+' --chemDesc='+desfile+\
' --sampMap='+smap
elif len(drcfiles)>0:
cmd = "Rscript sampleChemMapping/mapSamplesToChems.R --chemical --drcFiles="+','.join(drcfiles)+\
' --sampId='+smap+' --chemId='+cid+' --epMap='+emap+' --chemClass='+cclass+\
' --compToxFile='+ctfile+' --sampleFiles='+fses+' --chemDesc='+desfile+\
' --sampMap='+smap
else:
cmd = "Rscript sampleChemMapping/mapSamplesToChems.R --sampId="+smap+' --chemId='+cid+\
' --epMap='+emap+' --chemClass='+cclass+\
' --compToxFile='+ctfile+' --sampleFiles='+fses+' --chemDesc='+desfile+\
' --sampMap='+smap

print(cmd)
os.system(cmd)

def runExposome(chem_id_file):
'''
run exposome data pull
'''
##TODO: make this work with chemical id file as argument
cmd = 'Rscript exposome/exposome_summary_stats.R'

def runExpression(gex,chem):
'''
run expression parsing
'''
cmd = 'Rscript zfExp/parseGexData.R '+gex+' '+chem
print(cmd)
os.system(cmd)

def runSchemaCheck(dbfiles=[]):
'''
run schema checking
'''
##TODO: make this work with files as arguments
cmd = 'python dbSchema/main.py'

def main():
'''
this wrapping script is placed into every docker image to pull the files
from the repo and initiate the appropriate call to the underlying code.
'''
df = collectFiles()


####
# file parsing - collects all files we might need for the tool below
####
##first find the morphology and behavior pairs for chemical sources
chemdf = df.loc[df.sample_type=='chemical']
morph = chemdf.loc[chemdf.data_type=='morphology']
beh = chemdf.loc[chemdf.data_type=='behavior']
tupes =[]
for n in morph.name:
tupes.append([morph.loc[morph.name==n].location,beh.loc[beh.name==n].location])


##now map sample information
sid = list(df.loc[df.name=='sampId'].location)[0]
cid = list(df.loc[df.name=='chemId'].location)[0]
cclass = list(df.loc[df.name=='class1'].location)[0]
emap = list(df.loc[df.name=='endpointMap'].location)[0]
fses = ','.join(list(df.loc[df.data_type=='sample'].location))
ctfile = list(df.loc[df.name=='compTox'].location)[0]
desfile = list(df.loc[df.name=='chemdesc'].location)[0]
smap = list(df.loc[df.name=='sampMap'].location)[0]
gex1 = ','.join(list(df.loc[df.data_type=='expression'].location))
ginfo = list(df.loc[df.name=='geneInfo'].location)[0]


###now we can call individiual commands
parser = argparse.ArgumentParser('Pull files from github list of files and call appropriate command')
parser.add_argument('--bmd', dest='bmd',action='store_true', default=False, help='Re-run benchmark dose calculation and dependent commands')
parser.add_argument('--samps', dest='samps', action='store_true', default=False, help='Re run sample-chem mapping')
parser.add_argument('--expo', dest='expo', action='store_true', default=False, help='Re run exposome sample collection')
parser.add_argument('--geneEx', dest='geneEx', action='store_true', default=False, help='Re run gene expression generation')


args = parser.parse_args()

##call bmdrc on all morphology/behavior pairs for sample sources
if args.bmd:
print("Re-running benchmark dose collection")
newbmds,newfits,newdoses =[],[],[]
fitCurveFilesls()

if args.bmd or args.samps: ### need to rerun samples if we have created new bmds
#add chemical BMDS, fits, curves to existing data
chemfiles=[]
sampfiles=[]
for st in ['chemical','extract']:
for dt in ['bmd','fit','dose']:
fdf = combineFiles(df.loc[df.sample_type==st].loc[df.data_type==dt],dt)
fname = 'tmp_'+st+'_'+dt+'.csv'
fdf.to_csv(fname,index=False)
if st=='chemical':
chemfiles.append(fname)
else:
sampfiles.append(fname)
runSampMap(True,sampfiles,smap,cid,emap,cclass,ctfile,fses,desfile)
runSampMap(False,chemfiles,smap,cid,emap,cclass,ctfile,fses,desfile)
runSampMap(False,[],smap,cid,emap,cclass,ctfile,fses,desfile)

##now we run validation
runSchemaCheck()
if args.expo:
runExposome()
runSchemaCheck()
if args.geneEx:
if not os.path.exists("chemicals.csv"):
runSampMap(False,[],smap,cid,emap,cclass,ctfile,fses,desfile)

runExpresion(gene1,'chemicals.csv',ginfo)
runSchemaCheck()


main()
Loading
Loading