From 79a716a018f14dfc4ee2b0b231efe398e4d2d924 Mon Sep 17 00:00:00 2001 From: fuentesazahara Date: Fri, 18 Aug 2023 13:14:11 +0200 Subject: [PATCH 1/2] fixed possible bug on calc_splizvd.py line 257 I am running this pipeline from the sicilian_called_splice_juncs.tsv file which the sicilian pipeline delivers (https://github.com/salzman-lab/SICILIAN). When I do not run it with the "light" mode, and since I have the column "missing_domains" present in the input table, it raises an error on line 261 because there is no column named posStart_group or posEnd_group; they have not been created earlier in this script. However, there are two columns called posA_group and posB_group created upstream in this script using the start and end junction positions. I do not know if I am right with this change, but I think it is possible that the code is meant to refer to these columns, which already exist. --- bin/calc_splizvd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/calc_splizvd.py b/bin/calc_splizvd.py index bd0be1f..3a7945f 100755 --- a/bin/calc_splizvd.py +++ b/bin/calc_splizvd.py @@ -254,7 +254,7 @@ def main(): df["max_rank_don"] = df["posB_group"].map(df.groupby("posB_group")["rank_don"].max()) # add domain columns - letters = ["Start", "End"] + letters = ["A", "B"] for let in letters: if domain_breakdown: From 2925e9832e3398c35834aa2aaecc99958313e11f Mon Sep 17 00:00:00 2001 From: Azahara Maria Fuentes Trillo Date: Mon, 21 Aug 2023 14:41:39 +0200 Subject: [PATCH 2/2] made changes more general --- bin/calc_splizvd.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/bin/calc_splizvd.py b/bin/calc_splizvd.py index 3a7945f..efa19c6 100755 --- a/bin/calc_splizvd.py +++ b/bin/calc_splizvd.py @@ -231,17 +231,17 @@ def main(): split_dict = {True : ["ann"], False : ["unann"]} # remove constitutive splicing - df["posA_group"] = df["juncStart"].astype(str) + df["gene"] - df["posB_group"] = df["juncEnd"].astype(str) + df["gene"] + 
df["posStart_group"] = df["juncStart"].astype(str) + df["gene"] + df["posEnd_group"] = df["juncEnd"].astype(str) + df["gene"] - df["rank_acc"] = df.groupby("posA_group")["juncEnd"].rank(method="dense") - df["rank_don"] = df.groupby("posB_group")["juncStart"].rank(method="dense") + df["rank_acc"] = df.groupby("posStart_group")["juncEnd"].rank(method="dense") + df["rank_don"] = df.groupby("posEnd_group")["juncStart"].rank(method="dense") # remove "almost consistutive splicing" if args.rank_quant > 0: - let_dict2 = {"A" : "acc", "B" : "don"} + let_dict2 = {"Start" : "acc", "End" : "don"} # threshold ranks for each donor and acceptor - for let in ["A","B"]: + for let in ["Start","End"]: df["bottom_{}_quant".format(let_dict2[let])] = df["pos{}_group".format(let)].map(df.groupby("pos{}_group".format(let))["rank_{}".format(let_dict2[let])].quantile(q=args.rank_quant)) df["top_{}_quant".format(let_dict2[let])] = df["pos{}_group".format(let)].map(df.groupby("pos{}_group".format(let))["rank_{}".format(let_dict2[let])].quantile(q=1 - args.rank_quant)) df["rank_{}".format(let_dict2[let])] = df[["bottom_{}_quant".format(let_dict2[let]),"rank_{}".format(let_dict2[let])]].max(axis=1) @@ -250,11 +250,11 @@ def main(): # start ranks at 1 (in case 1 is removed by quantiling) df["rank_{}".format(let_dict2[let])] = df["rank_{}".format(let_dict2[let])] - df["bottom_{}_quant".format(let_dict2[let])] + 1 - df["max_rank_acc"] = df["posA_group"].map(df.groupby("posA_group")["rank_acc"].max()) - df["max_rank_don"] = df["posB_group"].map(df.groupby("posB_group")["rank_don"].max()) + df["max_rank_acc"] = df["posStart_group"].map(df.groupby("posStart_group")["rank_acc"].max()) + df["max_rank_don"] = df["posEnd_group"].map(df.groupby("posEnd_group")["rank_don"].max()) # add domain columns - letters = ["A", "B"] + letters = ["Start", "End"] for let in letters: if domain_breakdown: