diff --git a/Snakefile b/Snakefile
index a86b6ac..43c9552 100644
--- a/Snakefile
+++ b/Snakefile
@@ -161,8 +161,29 @@ rule all:
 
 # shell:
 # """
-# python3.6 -u scripts/parquet_to_tsv.py --parquet {input} --outname {output} 1>> {log.out} 2>> {log.err}
+# python -u scripts/parquet_to_tsv.py --parquet {input} --outname {output} 1>> {log.out} 2>> {log.err}
 # """
+#
+rule txt_to_pq:
+    input:
+        "data/{dataset}.txt"
+
+    output:
+        "data/{dataset}.pq"
+
+    resources:
+        mem_mb=lambda wildcards, attempt: attempt * 40000,
+# mem_mb=lambda wildcards, attempt: attempt * 120000,
+
+        time_min=lambda wildcards, attempt: attempt * 60
+    log:
+        out="job_output/txt2pq_{dataset}.out",
+        err="job_output/txt2pq_{dataset}.err"
+
+    shell:
+        """
+        python -u scripts/parquet_to_tsv.py --parquet {output} --outname {input} --reverse 1>> {log.out} 2>> {log.err}
+        """
 
 rule tsv_to_pq:
     input:
@@ -182,7 +203,7 @@ rule tsv_to_pq:
 
     shell:
         """
-        python3.6 -u scripts/parquet_to_tsv.py --parquet {output} --outname {input} --reverse 1>> {log.out} 2>> {log.err}
+        python -u scripts/parquet_to_tsv.py --parquet {output} --outname {input} --reverse 1>> {log.out} 2>> {log.err}
         """
 
 rule pq_to_tsv:
@@ -203,7 +224,7 @@ rule pq_to_tsv:
 
     shell:
         """
-        python3.6 -u scripts/parquet_to_tsv.py --parquet {input} --outname {output} 1>> {log.out} 2>> {log.err}
+        python -u scripts/parquet_to_tsv.py --parquet {input} --outname {output} 1>> {log.out} 2>> {log.err}
         """
 
 rule FDR_anova:
@@ -233,8 +254,8 @@ rule FDR_anova:
 
     shell:
         """
-        python3.6 -u scripts/final_FDRs_anova.py --dataname {wildcards.dataset} --suffix {params.suffix} --all_datanames {params.all_datanames} 1>> {log.out} 2>> {log.err}
-        python3.6 -u scripts/final_FDRs_anova_factor.py --dataname {wildcards.dataset} --suffix {params.suffix} --all_datanames {params.all_datanames} 1>> {log.out} 2>> {log.err}
+        python -u scripts/final_FDRs_anova.py --dataname {wildcards.dataset} --suffix {params.suffix} --all_datanames {params.all_datanames} 1>> {log.out} 2>> {log.err}
+        python -u scripts/final_FDRs_anova_factor.py --dataname {wildcards.dataset} --suffix {params.suffix} --all_datanames {params.all_datanames} 1>> {log.out} 2>> {log.err}
         """
 
 
@@ -259,7 +280,7 @@ rule significance:
 
     shell:
         """
-        python3.6 -u scripts/significant_genes.py --dataname {wildcards.dataset} --z_col {params.z_col} --pinning_S {wildcards.pinS} --pinning_z {wildcards.pinz} --lower_bound {wildcards.bound} {params.unfilt} 1>> {log.out} 2>> {log.err}
+        python -u scripts/significant_genes.py --dataname {wildcards.dataset} --z_col {params.z_col} --pinning_S {wildcards.pinS} --pinning_z {wildcards.pinz} --lower_bound {wildcards.bound} {params.unfilt} 1>> {log.out} 2>> {log.err}
         """
 
 rule FDR_mz:
@@ -285,7 +306,7 @@ rule FDR_mz:
 
     shell:
         """
-        python3.6 -u scripts/final_FDRs_mz.py --dataname {wildcards.dataset} --suffix {params.suffix} --all_datanames {params.all_datanames} 1>> {log.out} 2>> {log.err}
+        python -u scripts/final_FDRs_mz.py --dataname {wildcards.dataset} --suffix {params.suffix} --all_datanames {params.all_datanames} 1>> {log.out} 2>> {log.err}
         """
 
 rule anova:
@@ -342,7 +363,7 @@ rule rijk_zscore:
 
     shell:
         """
-        python3.6 -u scripts/rijk_zscore.py {params.ver} --pinning_S {wildcards.pinS} --pinning_z {wildcards.pinz} --dataname {wildcards.dataset} --parquet {input} --lower_bound {wildcards.bound} {params.verbose} {params.light} {params.unfilt} 1>> {log.out} 2>> {log.err}
+        python -u scripts/rijk_zscore.py {params.ver} --pinning_S {wildcards.pinS} --pinning_z {wildcards.pinz} --dataname {wildcards.dataset} --parquet {input} --lower_bound {wildcards.bound} {params.verbose} {params.light} {params.unfilt} 1>> {log.out} 2>> {log.err}
         """
 
 rule SVD_zscore:
@@ -370,7 +391,7 @@ rule SVD_zscore:
 
     shell:
         """
-        python3.6 -u scripts/SVD_zscore.py {params.ver} --svd_type normdonor --pinning_S {wildcards.pinS} --pinning_z {wildcards.pinz} --dataname {wildcards.dataset} --lower_bound {wildcards.bound} {params.verbose} {params.light} {params.unfilt} 1>> {log.out} 2>> {log.err}
+        python -u scripts/SVD_zscore.py {params.ver} --svd_type normdonor --pinning_S {wildcards.pinS} --pinning_z {wildcards.pinz} --dataname {wildcards.dataset} --lower_bound {wildcards.bound} {params.verbose} {params.light} {params.unfilt} 1>> {log.out} 2>> {log.err}
         """
 
 rule perm_pval:
@@ -395,10 +416,10 @@ rule perm_pval:
 
     shell:
         """
-        python3.6 -u scripts/perm_pvals.py --suffix {params.suffix} --dataname {wildcards.dataset} --num_perms {params.num_perms} --z_col scZ 1>> {log.out} 2>> {log.err}
-        python3.6 -u scripts/perm_pvals.py --z_col svd_z0 --suffix {params.suffix} --dataname {wildcards.dataset} --num_perms {params.num_perms} 1>> {log.out} 2>> {log.err}
-        python3.6 -u scripts/perm_pvals.py --z_col svd_z1 --suffix {params.suffix} --dataname {wildcards.dataset} --num_perms {params.num_perms} 1>> {log.out} 2>> {log.err}
-        python3.6 -u scripts/perm_pvals.py --z_col svd_z2 --suffix {params.suffix} --dataname {wildcards.dataset} --num_perms {params.num_perms} 1>> {log.out} 2>> {log.err}
+        python -u scripts/perm_pvals.py --suffix {params.suffix} --dataname {wildcards.dataset} --num_perms {params.num_perms} --z_col scZ 1>> {log.out} 2>> {log.err}
+        python -u scripts/perm_pvals.py --z_col svd_z0 --suffix {params.suffix} --dataname {wildcards.dataset} --num_perms {params.num_perms} 1>> {log.out} 2>> {log.err}
+        python -u scripts/perm_pvals.py --z_col svd_z1 --suffix {params.suffix} --dataname {wildcards.dataset} --num_perms {params.num_perms} 1>> {log.out} 2>> {log.err}
+        python -u scripts/perm_pvals.py --z_col svd_z2 --suffix {params.suffix} --dataname {wildcards.dataset} --num_perms {params.num_perms} 1>> {log.out} 2>> {log.err}
         """
 
 
@@ -424,7 +445,7 @@ rule var_adj_perm_pval:
 
     shell:
         """
-        python3.6 -u scripts/variance_adjusted_permutations.py --suffix {params.suffix} --dataname {wildcards.dataset} --num_perms {params.num_perms} 1>> {log.out} 2>> {log.err}
+        python -u scripts/variance_adjusted_permutations.py --suffix {params.suffix} --dataname {wildcards.dataset} --num_perms {params.num_perms} 1>> {log.out} 2>> {log.err}
         """
 
 
@@ -454,7 +475,7 @@ rule var_adj_perm_pval_bytiss:
 
     shell:
         """
-        python3.6 -u scripts/variance_adjusted_permutations_bytiss.py --suffix {params.suffix} --dataname {wildcards.dataset} --num_perms {params.num_perms} --group_col {wildcards.group} --sub_col {wildcards.sub} 1>> {log.out} 2>> {log.err}
+        python -u scripts/variance_adjusted_permutations_bytiss.py --suffix {params.suffix} --dataname {wildcards.dataset} --num_perms {params.num_perms} --group_col {wildcards.group} --sub_col {wildcards.sub} 1>> {log.out} 2>> {log.err}
         """
 
 
@@ -499,5 +520,5 @@ rule summary:
         num_perms = num_perms
     shell:
         """
-        python3.6 -u scripts/final_summary.py --group_col {wildcards.group} --sub_col {wildcards.sub} --suffix {params.suffix} --dataname {wildcards.dataset} --num_perms {params.num_perms} 1>> {log.out} 2>> {log.err}
+        python -u scripts/final_summary.py --group_col {wildcards.group} --sub_col {wildcards.sub} --suffix {params.suffix} --dataname {wildcards.dataset} --num_perms {params.num_perms} 1>> {log.out} 2>> {log.err}
         """
diff --git a/scripts/rijk_zscore.py b/scripts/rijk_zscore.py
index ca3c410..4773180 100644
--- a/scripts/rijk_zscore.py
+++ b/scripts/rijk_zscore.py
@@ -113,7 +113,18 @@ def main():
     args = get_args()
     outpath = "scripts/output/rijk_zscore/"
 
-    df = pd.read_parquet(args.parquet,columns=["juncPosR1A","geneR1A_uniq","juncPosR1B","numReads","cell","splice_ann","tissue","compartment","free_annotation","refName_newR1","called","chrR1A","exon_annR1A","exon_annR1B","strand"])
+    required_columns = ["juncPosR1A","geneR1A_uniq","juncPosR1B","numReads","cell","refName_newR1","chrR1A"]
+    try:
+        df = pd.read_parquet(args.parquet,columns=["juncPosR1A","geneR1A_uniq","juncPosR1B","numReads","cell","splice_ann","tissue","compartment","free_annotation","refName_newR1","called","chrR1A","exon_annR1A","exon_annR1B","strand"])
+    except Exception:
+        # Column-restricted read failed (presumably optional annotation columns are absent); retry with every column. Not a bare except, so Ctrl-C / SystemExit still propagate.
+        df = pd.read_parquet(args.parquet)
+
+    for rc in required_columns:
+        if rc not in df.columns:
+            raise RuntimeError("required column '{}' is missing".format(rc))
+
+    df["refName"] = df["chrR1A"] + "_" + df["geneR1A_uniq"] + "_" + df["juncPosR1A"].astype(str) + "_" + df["juncPosR1B"].astype(str)
 
 
     if "splice_ann" not in df.columns:
@@ -396,7 +407,9 @@ def main():
 
 # df.drop(["denom_sq","num_ann","num_unann","num","temp","temp_mag"],axis=1,inplace=True)
 # df.drop(["negz_B_unann","negz_B_ann","negz_B","denom_sq","num_ann","num_unann","num","temp","temp_mag"],axis=1,inplace=True)
-    df["ontology"] = df["tissue"] + df["compartment"] + df["free_annotation"]
+
+    if ("tissue" in df.columns) and ("compartment" in df.columns) and ("free_annotation" in df.columns):
+        df["ontology"] = df["tissue"] + df["compartment"] + df["free_annotation"]
 # df["ontology_gene"] = df["ontology"] + df["geneR1A_uniq"]
 
     suff = ""