From 2a7b9d50a6792fea82ca6f1d812f4870883efe45 Mon Sep 17 00:00:00 2001 From: Bede Constantinides Date: Tue, 11 Jun 2024 13:51:21 +0100 Subject: [PATCH] Add plot subcommand --- pyproject.toml | 3 ++ src/primaschema/cli.py | 15 ++++++++- src/primaschema/lib.py | 70 +++++++++++++++++++++++++++++++++++++++++- test/test_all.py | 5 +++ 4 files changed, 91 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d811945..65ace1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,12 +24,15 @@ classifiers = [ "Operating System :: MacOS", ] dependencies = [ + "altair>=5.3.0", "biopython>=1.80", "defopt==6.4.0", "jsonschema", "linkml", + "natsort>=8.4.0", "pandas>=1.5.3", "pyyaml", + "vl-convert-python>=1.4.0" ] [project.scripts] diff --git a/src/primaschema/cli.py b/src/primaschema/cli.py index b2fca9e..3520f3f 100644 --- a/src/primaschema/cli.py +++ b/src/primaschema/cli.py @@ -14,7 +14,7 @@ def hash_bed(bed_path: Path): """ Generate a bed file checksum - :arg ref_path: Path of bed file + :arg bed_path: Path of bed file """ hex_digest = lib.hash_bed(bed_path) print("BED checksum:", file=sys.stderr) @@ -154,6 +154,18 @@ def print_intervals(bed_path: Path): print(f"{chrom}\t{interval[0]}\t{interval[1]}\t{name}") +def plot(bed_path: Path): + """ + Plot amplicon and primer positions given a 7 column primer.bed file + Requires primers named {scheme-name}_{amplicon-number}… + Plots a row per amplicon per reference chromosome + Supported out_path extensions: html (interactive), pdf, png, svg + + :arg bed_path: Path of primer.bed file + """ + lib.plot(bed_path) + + def main(): defopt.run( { @@ -169,6 +181,7 @@ def main(): "7to6": seven_to_six, "show-non-ref-alts": show_non_ref_alts, "intervals": print_intervals, + "plot": plot, }, no_negated_flags=True, strict_kwonly=False, diff --git a/src/primaschema/lib.py b/src/primaschema/lib.py index 0f99bd6..c344f96 100644 --- a/src/primaschema/lib.py +++ b/src/primaschema/lib.py @@ -12,10 +12,15 @@ from typing import Literal import jsonschema -import pandas as pd import yaml + +import altair as alt +import pandas as pd + +from natsort import natsorted from Bio import SeqIO + # from linkml.generators.pydanticgen import PydanticGenerator from linkml.generators.pythongen import PythonGenerator from linkml_runtime.utils.schemaview import SchemaView @@ -491,3 +496,66 @@ def compute_intervals(bed_path: Path) -> dict[str, dict[str, (int, int)]]: prev_start, prev_end = intervals.get(primer_name, (sys.maxsize, -1)) intervals[primer_name] = (min(prev_start, start_pos), max(prev_end, end_pos)) return all_intervals + + +def plot(bed_path: Path, out_path: Path = Path("plot.html")) -> None: + """ + Plot amplicon and primer positions from a 7 column primer.bed file + Requires primers to be named {scheme-name}_{amplicon-number}… + Plots one row per reference chromosome + Supported out_path extensions: html (interactive), pdf, png, svg + """ + bed_df = parse_primer_bed(bed_path) + bed_df["amplicon"] = bed_df["name"].str.split("_").str[1] + print(pd.concat([bed_df.head(4), bed_df.tail(4)])) + amp_df = ( + bed_df.groupby(["chrom", "amplicon"]) + .agg(min_start=("chromStart", "min"), max_end=("chromEnd", "max")) + .reset_index() + ) + amp_df["is_amplicon"] = True + sorted_amplicons = natsorted(bed_df["amplicon"].unique()) + print(amp_df) + + bed_df["is_amplicon"] = False + amp_df = amp_df.rename(columns={"min_start": "chromStart", "max_end": "chromEnd"}) + + combined_df = pd.concat([bed_df, amp_df], ignore_index=True) + + primer_marks = ( + alt.Chart(combined_df) + .transform_filter(alt.datum.is_amplicon == False) # noqa + .mark_line(size=15) + .encode( + x="chromStart:Q", + x2="chromEnd:Q", + y=alt.Y("amplicon:O", sort=sorted_amplicons, scale=alt.Scale(padding=0)), + color=alt.Color("strand:N").scale(scheme="set2"), + tooltip=["name:N", "chromStart:Q", "chromEnd:Q"], + ) + .properties( + width=1000, + ) + ) + + amplicon_marks = ( + alt.Chart(combined_df) + .transform_filter(alt.datum.is_amplicon == True) # noqa + .mark_rule(size=2) + .encode( + x="chromStart:Q", + x2="chromEnd:Q", + y=alt.Y("amplicon:O", sort=sorted_amplicons, scale=alt.Scale(padding=0)), + tooltip=["amplicon:N", "chromStart:Q", "chromEnd:Q"], + ) + .properties( + width=1000, + ) + ) + + combined_chart = alt.layer(primer_marks, amplicon_marks).facet( + # row="chrom:O" + row=alt.Row("chrom:O", header=alt.Header(labelOrient="top"), title="") + ) + + combined_chart.interactive().save(str(out_path)) diff --git a/test/test_all.py b/test/test_all.py index d0be88f..0937844 100644 --- a/test/test_all.py +++ b/test/test_all.py @@ -170,3 +170,8 @@ def test_print_intervals(): ) assert """MN908947.3\t29452\t29854\tSARS-CoV-2_99\n""" in run_cmd.stdout + + +def test_plot_single_chrom_ref(): + lib.plot(data_dir / "primer-schemes/schemes/sars-cov-2/artic/v4.1/primer.bed") + run("rm -rf plot.html", cwd="./")