-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #84 from PNNL-CompBio/data-trigger
Data trigger file reorg
- Loading branch information
Showing
96 changed files
with
6,498 additions
and
302 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,174 @@ | ||
''' | ||
Build script moved to python for better extendability and interoperability. | ||
''' | ||
|
||
|
||
import os | ||
import subprocess | ||
import pandas as pd | ||
import argparse | ||
|
||
|
||
def collectFiles(data_dir='https://raw.githubusercontent.com/PNNL-CompBio/srpAnalytics/main/data',filename='srp_build_files.csv'): | ||
''' | ||
every time the build file is updated, this script will collect the files and return | ||
a dictionary of files to be fed into each module | ||
''' | ||
df = pd.read_csv(data_dir+'/'+filename) | ||
return df | ||
|
||
|
||
def fitCurveFiles(morpho_behavior_tuples): | ||
''' | ||
get new curve fits, list of tuples of morpho/behavior pairs | ||
''' | ||
|
||
|
||
|
||
def combineFiles(location_list,ftype): | ||
''' | ||
helper function to combine duplicates | ||
''' | ||
dflist=[] | ||
required_columns = {'bmd':['Chemical_ID','End_Point','Model','BMD10','BMD50',"Min_Dose","Max_Dose",\ | ||
"AUC_Norm","DataQC_Flag","BMD_Analysis_Flag"],#,"BMD10_Flag","BMD50_Flag{"), | ||
'dose':['Chemical_ID',"End_Point","Dose","Response","CI_Lo","CI_Hi"],\ | ||
'fit':['Chemical_ID',"End_Point","X_vals","Y_vals"]} | ||
|
||
print('concatenating '+ftype) | ||
for loc in location_list.location: | ||
f = pd.read_csv(loc)[required_columns[ftype]] | ||
dflist.append(f) | ||
fulldf=pd.concat(dflist) | ||
fulldf = fulldf.drop_duplicates() | ||
|
||
|
||
return fulldf.drop_duplicates() | ||
|
||
def runSampMap(is_sample=False,drcfiles=[],smap='',cid='',\ | ||
emap='',cclass='',ctfile='',fses='',desfile=''): | ||
''' | ||
run sample mapping | ||
''' | ||
if is_sample: | ||
cmd = "Rscript sampleChemMapping/mapSamplesToChems.R --sample --drcFiles="+','.join(drcfiles)+\ | ||
' --sampId='+smap+' --chemId='+cid+' --epMap='+emap+' --chemClass='+cclass+\ | ||
' --compToxFile='+ctfile+' --sampleFiles='+fses+' --chemDesc='+desfile+\ | ||
' --sampMap='+smap | ||
elif len(drcfiles)>0: | ||
cmd = "Rscript sampleChemMapping/mapSamplesToChems.R --chemical --drcFiles="+','.join(drcfiles)+\ | ||
' --sampId='+smap+' --chemId='+cid+' --epMap='+emap+' --chemClass='+cclass+\ | ||
' --compToxFile='+ctfile+' --sampleFiles='+fses+' --chemDesc='+desfile+\ | ||
' --sampMap='+smap | ||
else: | ||
cmd = "Rscript sampleChemMapping/mapSamplesToChems.R --sampId="+smap+' --chemId='+cid+\ | ||
' --epMap='+emap+' --chemClass='+cclass+\ | ||
' --compToxFile='+ctfile+' --sampleFiles='+fses+' --chemDesc='+desfile+\ | ||
' --sampMap='+smap | ||
|
||
print(cmd) | ||
os.system(cmd) | ||
|
||
def runExposome(chem_id_file): | ||
''' | ||
run exposome data pull | ||
''' | ||
##TODO: make this work with chemical id file as argument | ||
cmd = 'Rscript exposome/exposome_summary_stats.R' | ||
|
||
def runExpression(gex,chem): | ||
''' | ||
run expression parsing | ||
''' | ||
cmd = 'Rscript zfExp/parseGexData.R '+gex+' '+chem | ||
print(cmd) | ||
os.system(cmd) | ||
|
||
def runSchemaCheck(dbfiles=[]): | ||
''' | ||
run schema checking | ||
''' | ||
##TODO: make this work with files as arguments | ||
cmd = 'python dbSchema/main.py' | ||
|
||
def main(): | ||
''' | ||
this wrapping script is placed into every docker image to pull the files | ||
from the repo and initiate the appropriate call to the underlying code. | ||
''' | ||
df = collectFiles() | ||
|
||
|
||
#### | ||
# file parsing - collects all files we might need for the tool below | ||
#### | ||
##first find the morphology and behavior pairs for chemical sources | ||
chemdf = df.loc[df.sample_type=='chemical'] | ||
morph = chemdf.loc[chemdf.data_type=='morphology'] | ||
beh = chemdf.loc[chemdf.data_type=='behavior'] | ||
tupes =[] | ||
for n in morph.name: | ||
tupes.append([morph.loc[morph.name==n].location,beh.loc[beh.name==n].location]) | ||
|
||
|
||
##now map sample information | ||
sid = list(df.loc[df.name=='sampId'].location)[0] | ||
cid = list(df.loc[df.name=='chemId'].location)[0] | ||
cclass = list(df.loc[df.name=='class1'].location)[0] | ||
emap = list(df.loc[df.name=='endpointMap'].location)[0] | ||
fses = ','.join(list(df.loc[df.data_type=='sample'].location)) | ||
ctfile = list(df.loc[df.name=='compTox'].location)[0] | ||
desfile = list(df.loc[df.name=='chemdesc'].location)[0] | ||
smap = list(df.loc[df.name=='sampMap'].location)[0] | ||
gex1 = ','.join(list(df.loc[df.data_type=='expression'].location)) | ||
ginfo = list(df.loc[df.name=='geneInfo'].location)[0] | ||
|
||
|
||
###now we can call individiual commands | ||
parser = argparse.ArgumentParser('Pull files from github list of files and call appropriate command') | ||
parser.add_argument('--bmd', dest='bmd',action='store_true', default=False, help='Re-run benchmark dose calculation and dependent commands') | ||
parser.add_argument('--samps', dest='samps', action='store_true', default=False, help='Re run sample-chem mapping') | ||
parser.add_argument('--expo', dest='expo', action='store_true', default=False, help='Re run exposome sample collection') | ||
parser.add_argument('--geneEx', dest='geneEx', action='store_true', default=False, help='Re run gene expression generation') | ||
|
||
|
||
args = parser.parse_args() | ||
|
||
##call bmdrc on all morphology/behavior pairs for sample sources | ||
if args.bmd: | ||
print("Re-running benchmark dose collection") | ||
newbmds,newfits,newdoses =[],[],[] | ||
fitCurveFilesls() | ||
|
||
if args.bmd or args.samps: ### need to rerun samples if we have created new bmds | ||
#add chemical BMDS, fits, curves to existing data | ||
chemfiles=[] | ||
sampfiles=[] | ||
for st in ['chemical','extract']: | ||
for dt in ['bmd','fit','dose']: | ||
fdf = combineFiles(df.loc[df.sample_type==st].loc[df.data_type==dt],dt) | ||
fname = 'tmp_'+st+'_'+dt+'.csv' | ||
fdf.to_csv(fname,index=False) | ||
if st=='chemical': | ||
chemfiles.append(fname) | ||
else: | ||
sampfiles.append(fname) | ||
runSampMap(True,sampfiles,smap,cid,emap,cclass,ctfile,fses,desfile) | ||
runSampMap(False,chemfiles,smap,cid,emap,cclass,ctfile,fses,desfile) | ||
runSampMap(False,[],smap,cid,emap,cclass,ctfile,fses,desfile) | ||
|
||
##now we run validation | ||
runSchemaCheck() | ||
if args.expo: | ||
runExposome() | ||
runSchemaCheck() | ||
if args.geneEx: | ||
if not os.path.exists("chemicals.csv"): | ||
runSampMap(False,[],smap,cid,emap,cclass,ctfile,fses,desfile) | ||
|
||
runExpresion(gene1,'chemicals.csv',ginfo) | ||
runSchemaCheck() | ||
|
||
|
||
main() |
Oops, something went wrong.