Skip to content

Commit

Permalink
Allow the whole dataframe to be read in when not all expected columns are present
Browse files Browse the repository at this point in the history
  • Loading branch information
Julia Eve Olivieri committed Sep 3, 2021
1 parent 4ef35aa commit ef3d837
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 18 deletions.
53 changes: 37 additions & 16 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,29 @@ rule all:

# shell:
# """
# python3.6 -u scripts/parquet_to_tsv.py --parquet {input} --outname {output} 1>> {log.out} 2>> {log.err}
# python -u scripts/parquet_to_tsv.py --parquet {input} --outname {output} 1>> {log.out} 2>> {log.err}
# """
#
# Convert a plain-text table (data/{dataset}.txt) into Parquet (data/{dataset}.pq).
# Mirrors the tsv_to_pq rule: the shared scripts/parquet_to_tsv.py converter is
# invoked with --reverse, so the rule's *input* (.txt) is passed as --outname and
# the *output* (.pq) as --parquet — the flag names are from the forward direction.
rule txt_to_pq:
    input:
        "data/{dataset}.txt"

    output:
        "data/{dataset}.pq"

    resources:
        # Retry with proportionally more memory/time on each scheduler attempt.
        mem_mb=lambda wildcards, attempt: attempt * 40000,
        # mem_mb=lambda wildcards, attempt: attempt * 120000,

        time_min=lambda wildcards, attempt: attempt * 60
    log:
        out="job_output/txt2pq_{dataset}.out",
        err="job_output/txt2pq_{dataset}.err"

    shell:
        """
        python -u scripts/parquet_to_tsv.py --parquet {output} --outname {input} --reverse 1>> {log.out} 2>> {log.err}
        """

rule tsv_to_pq:
input:
Expand All @@ -182,7 +203,7 @@ rule tsv_to_pq:

shell:
"""
python3.6 -u scripts/parquet_to_tsv.py --parquet {output} --outname {input} --reverse 1>> {log.out} 2>> {log.err}
python -u scripts/parquet_to_tsv.py --parquet {output} --outname {input} --reverse 1>> {log.out} 2>> {log.err}
"""

rule pq_to_tsv:
Expand All @@ -203,7 +224,7 @@ rule pq_to_tsv:

shell:
"""
python3.6 -u scripts/parquet_to_tsv.py --parquet {input} --outname {output} 1>> {log.out} 2>> {log.err}
python -u scripts/parquet_to_tsv.py --parquet {input} --outname {output} 1>> {log.out} 2>> {log.err}
"""

rule FDR_anova:
Expand Down Expand Up @@ -233,8 +254,8 @@ rule FDR_anova:

shell:
"""
python3.6 -u scripts/final_FDRs_anova.py --dataname {wildcards.dataset} --suffix {params.suffix} --all_datanames {params.all_datanames} 1>> {log.out} 2>> {log.err}
python3.6 -u scripts/final_FDRs_anova_factor.py --dataname {wildcards.dataset} --suffix {params.suffix} --all_datanames {params.all_datanames} 1>> {log.out} 2>> {log.err}
python -u scripts/final_FDRs_anova.py --dataname {wildcards.dataset} --suffix {params.suffix} --all_datanames {params.all_datanames} 1>> {log.out} 2>> {log.err}
python -u scripts/final_FDRs_anova_factor.py --dataname {wildcards.dataset} --suffix {params.suffix} --all_datanames {params.all_datanames} 1>> {log.out} 2>> {log.err}
"""

Expand All @@ -259,7 +280,7 @@ rule significance:

shell:
"""
python3.6 -u scripts/significant_genes.py --dataname {wildcards.dataset} --z_col {params.z_col} --pinning_S {wildcards.pinS} --pinning_z {wildcards.pinz} --lower_bound {wildcards.bound} {params.unfilt} 1>> {log.out} 2>> {log.err}
python -u scripts/significant_genes.py --dataname {wildcards.dataset} --z_col {params.z_col} --pinning_S {wildcards.pinS} --pinning_z {wildcards.pinz} --lower_bound {wildcards.bound} {params.unfilt} 1>> {log.out} 2>> {log.err}
"""

rule FDR_mz:
Expand All @@ -285,7 +306,7 @@ rule FDR_mz:

shell:
"""
python3.6 -u scripts/final_FDRs_mz.py --dataname {wildcards.dataset} --suffix {params.suffix} --all_datanames {params.all_datanames} 1>> {log.out} 2>> {log.err}
python -u scripts/final_FDRs_mz.py --dataname {wildcards.dataset} --suffix {params.suffix} --all_datanames {params.all_datanames} 1>> {log.out} 2>> {log.err}
"""

rule anova:
Expand Down Expand Up @@ -342,7 +363,7 @@ rule rijk_zscore:

shell:
"""
python3.6 -u scripts/rijk_zscore.py {params.ver} --pinning_S {wildcards.pinS} --pinning_z {wildcards.pinz} --dataname {wildcards.dataset} --parquet {input} --lower_bound {wildcards.bound} {params.verbose} {params.light} {params.unfilt} 1>> {log.out} 2>> {log.err}
python -u scripts/rijk_zscore.py {params.ver} --pinning_S {wildcards.pinS} --pinning_z {wildcards.pinz} --dataname {wildcards.dataset} --parquet {input} --lower_bound {wildcards.bound} {params.verbose} {params.light} {params.unfilt} 1>> {log.out} 2>> {log.err}
"""

rule SVD_zscore:
Expand Down Expand Up @@ -370,7 +391,7 @@ rule SVD_zscore:

shell:
"""
python3.6 -u scripts/SVD_zscore.py {params.ver} --svd_type normdonor --pinning_S {wildcards.pinS} --pinning_z {wildcards.pinz} --dataname {wildcards.dataset} --lower_bound {wildcards.bound} {params.verbose} {params.light} {params.unfilt} 1>> {log.out} 2>> {log.err}
python -u scripts/SVD_zscore.py {params.ver} --svd_type normdonor --pinning_S {wildcards.pinS} --pinning_z {wildcards.pinz} --dataname {wildcards.dataset} --lower_bound {wildcards.bound} {params.verbose} {params.light} {params.unfilt} 1>> {log.out} 2>> {log.err}
"""

rule perm_pval:
Expand All @@ -395,10 +416,10 @@ rule perm_pval:

shell:
"""
python3.6 -u scripts/perm_pvals.py --suffix {params.suffix} --dataname {wildcards.dataset} --num_perms {params.num_perms} --z_col scZ 1>> {log.out} 2>> {log.err}
python3.6 -u scripts/perm_pvals.py --z_col svd_z0 --suffix {params.suffix} --dataname {wildcards.dataset} --num_perms {params.num_perms} 1>> {log.out} 2>> {log.err}
python3.6 -u scripts/perm_pvals.py --z_col svd_z1 --suffix {params.suffix} --dataname {wildcards.dataset} --num_perms {params.num_perms} 1>> {log.out} 2>> {log.err}
python3.6 -u scripts/perm_pvals.py --z_col svd_z2 --suffix {params.suffix} --dataname {wildcards.dataset} --num_perms {params.num_perms} 1>> {log.out} 2>> {log.err}
python -u scripts/perm_pvals.py --suffix {params.suffix} --dataname {wildcards.dataset} --num_perms {params.num_perms} --z_col scZ 1>> {log.out} 2>> {log.err}
python -u scripts/perm_pvals.py --z_col svd_z0 --suffix {params.suffix} --dataname {wildcards.dataset} --num_perms {params.num_perms} 1>> {log.out} 2>> {log.err}
python -u scripts/perm_pvals.py --z_col svd_z1 --suffix {params.suffix} --dataname {wildcards.dataset} --num_perms {params.num_perms} 1>> {log.out} 2>> {log.err}
python -u scripts/perm_pvals.py --z_col svd_z2 --suffix {params.suffix} --dataname {wildcards.dataset} --num_perms {params.num_perms} 1>> {log.out} 2>> {log.err}
"""

Expand All @@ -424,7 +445,7 @@ rule var_adj_perm_pval:

shell:
"""
python3.6 -u scripts/variance_adjusted_permutations.py --suffix {params.suffix} --dataname {wildcards.dataset} --num_perms {params.num_perms} 1>> {log.out} 2>> {log.err}
python -u scripts/variance_adjusted_permutations.py --suffix {params.suffix} --dataname {wildcards.dataset} --num_perms {params.num_perms} 1>> {log.out} 2>> {log.err}
"""
Expand Down Expand Up @@ -454,7 +475,7 @@ rule var_adj_perm_pval_bytiss:

shell:
"""
python3.6 -u scripts/variance_adjusted_permutations_bytiss.py --suffix {params.suffix} --dataname {wildcards.dataset} --num_perms {params.num_perms} --group_col {wildcards.group} --sub_col {wildcards.sub} 1>> {log.out} 2>> {log.err}
python -u scripts/variance_adjusted_permutations_bytiss.py --suffix {params.suffix} --dataname {wildcards.dataset} --num_perms {params.num_perms} --group_col {wildcards.group} --sub_col {wildcards.sub} 1>> {log.out} 2>> {log.err}
"""
Expand Down Expand Up @@ -499,5 +520,5 @@ rule summary:
num_perms = num_perms
shell:
"""
python3.6 -u scripts/final_summary.py --group_col {wildcards.group} --sub_col {wildcards.sub} --suffix {params.suffix} --dataname {wildcards.dataset} --num_perms {params.num_perms} 1>> {log.out} 2>> {log.err}
python -u scripts/final_summary.py --group_col {wildcards.group} --sub_col {wildcards.sub} --suffix {params.suffix} --dataname {wildcards.dataset} --num_perms {params.num_perms} 1>> {log.out} 2>> {log.err}
"""
16 changes: 14 additions & 2 deletions scripts/rijk_zscore.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,17 @@ def main():
args = get_args()
outpath = "scripts/output/rijk_zscore/"

df = pd.read_parquet(args.parquet,columns=["juncPosR1A","geneR1A_uniq","juncPosR1B","numReads","cell","splice_ann","tissue","compartment","free_annotation","refName_newR1","called","chrR1A","exon_annR1A","exon_annR1B","strand"])
# Columns the downstream z-score computation cannot run without; the remaining
# columns in the fast-path read below (splice_ann, tissue, compartment, ...)
# are optional and guarded for individually later in the script.
required_columns = ["juncPosR1A","geneR1A_uniq","juncPosR1B","numReads","cell","refName_newR1","chrR1A"]
try:
    # Fast path: read only the columns we use, which is much cheaper on wide files.
    df = pd.read_parquet(args.parquet,columns=["juncPosR1A","geneR1A_uniq","juncPosR1B","numReads","cell","splice_ann","tissue","compartment","free_annotation","refName_newR1","called","chrR1A","exon_annR1A","exon_annR1B","strand"])
except Exception:
    # Some of the optional columns are absent from this file (the column-selected
    # read raises in that case) — fall back to reading the whole dataframe.
    # NOTE: was a bare `except:`, which would also swallow KeyboardInterrupt/SystemExit.
    df = pd.read_parquet(args.parquet)

# Whatever path we took, the required columns must be present; fail loudly if not.
for rc in required_columns:
    if rc not in df.columns:
        raise RuntimeError("required column '{}' is missing".format(rc))

# Unique junction identifier: chromosome + gene + both junction positions.
df["refName"] = df["chrR1A"] + "_" + df["geneR1A_uniq"] + "_" + df["juncPosR1A"].astype(str) + "_" + df["juncPosR1B"].astype(str)
if "splice_ann" not in df.columns:
Expand Down Expand Up @@ -396,7 +406,9 @@ def main():
# df.drop(["denom_sq","num_ann","num_unann","num","temp","temp_mag"],axis=1,inplace=True)
# df.drop(["negz_B_unann","negz_B_ann","negz_B","denom_sq","num_ann","num_unann","num","temp","temp_mag"],axis=1,inplace=True)

df["ontology"] = df["tissue"] + df["compartment"] + df["free_annotation"]

# "ontology" (cell-type label = tissue + compartment + free annotation) can only
# be built when all three metadata columns were present in the input parquet;
# otherwise it is skipped entirely.
# NOTE(review): downstream code that groups by "ontology" presumably tolerates
# the column being absent — confirm against the rest of this script.
if ("tissue" in df.columns) and ("compartment" in df.columns) and ("free_annotation" in df.columns):
    df["ontology"] = df["tissue"] + df["compartment"] + df["free_annotation"]

# df["ontology_gene"] = df["ontology"] + df["geneR1A_uniq"]
suff = ""
Expand Down

0 comments on commit ef3d837

Please sign in to comment.