diff --git a/.github/actions/setup-rust/action.yml b/.github/actions/setup-rust/action.yml new file mode 100644 index 0000000..5fb1bcd --- /dev/null +++ b/.github/actions/setup-rust/action.yml @@ -0,0 +1,22 @@ +name: 'Setup Rust' +description: 'Setup rust toolchain' +inputs: + cache: + description: 'Cache' + required: false + default: false +runs: + using: "composite" + steps: + - name: Install Rust + uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + + - name: Cache rust build + uses: Swatinem/rust-cache@v2 + if: ${{ inputs.cache == 'true' }} + with: + workspaces: "precellar -> target" \ No newline at end of file diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 0000000..eea1fe0 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,103 @@ +name: build-documentation + +on: + workflow_run: + workflows: [test-python-package] + types: + - completed + +jobs: + build_docs: + runs-on: ubuntu-latest + if: ${{ github.event.workflow_run.conclusion == 'success' }} + steps: + - name: Checkout code + uses: nschloe/action-cached-lfs-checkout@v1 + with: + ref: ${{ github.event.workflow_run.head_sha }} + + - uses: actions/setup-python@v5 + name: Install Python + with: + python-version: '3.10' + + - name: Install dependency + run: | + sudo apt-get install -y pandoc jq + sudo pip install --upgrade pip + pip install --user sphinx==7.* pydata-sphinx-theme==0.15.* pandoc nbsphinx \ + Pygments==2.16.* sphinx-autodoc-typehints myst-parser \ + markupsafe==2.1.* sphinx-plotly-directive + + - name: Download wheel files from artifacts + id: download-artifact + uses: dawidd6/action-download-artifact@v2 + with: + workflow: test_python.yml + commit: ${{ github.event.workflow_run.head_sha }} + name: wheel-files + path: wheel_files + + - name: Install wheel files + run: pip install --user wheel_files/*.whl + + - name: Build doc + run: sphinx-build ${GITHUB_WORKSPACE}/docs _build/html + + - name: Get 
precellar version + id: get_version + run: | + VERSION_NUMBER=$(python -c "import precellar;print('.'.join(precellar.__version__.split('.')[:2]))") + echo $VERSION_NUMBER + echo "VERSION=$VERSION_NUMBER" >> $GITHUB_ENV + IS_DEV=$(python -c "import precellar;print('dev' in precellar.__version__)") + echo $IS_DEV + BRANCH_NAME=${{ github.event.workflow_run.head_branch }} + if [[ $IS_DEV == "True" && $BRANCH_NAME == "main" ]]; then + echo "DEPLOY_DEV=true" >> $GITHUB_ENV + elif [[ $BRANCH_NAME =~ ^v[0-9]+ || $BRANCH_NAME == "main" ]]; then + echo "DEPLOY_VERSION=true" >> $GITHUB_ENV + fi + + - name: Deploy 🚀 + uses: JamesIves/github-pages-deploy-action@v4 + if: ${{ env.DEPLOY_DEV == 'true' }} + with: + single-commit: true + branch: gh-pages + folder: _build/html + clean: true + target-folder: /version/dev/ + + - name: Deploy (version) 🚀 + uses: JamesIves/github-pages-deploy-action@v4 + if: ${{ env.DEPLOY_VERSION == 'true' }} + with: + single-commit: true + branch: gh-pages + folder: _build/html + clean: true + target-folder: /version/${{ env.VERSION }}/ + + - name: Fetch JSON and Get Preferred Version + run: | + #JSON=$(cat ${GITHUB_WORKSPACE}/docs/_static/versions.json) + JSON=$(curl -s "https://raw.githubusercontent.com/kaizhang/SnapATAC2/main/docs/_static/versions.json") + VERSION=$(echo "$JSON" | jq -r '.[] | select(.preferred == true) | .version') + echo "PREFERRED_VERSION=$VERSION" >> $GITHUB_ENV + echo "Preferred version is $VERSION" + + - name: Checkout code from gh-pages branch into folder + uses: actions/checkout@v2 + with: + ref: 'gh-pages' + path: 'gh-pages-folder' + + - name: Deploy (preferred version) + uses: JamesIves/github-pages-deploy-action@v4 + with: + single-commit: true + branch: gh-pages + folder: gh-pages-folder/version/${{ env.PREFERRED_VERSION }} + clean: true + clean-exclude: version \ No newline at end of file diff --git a/.github/workflows/test_python.yml b/.github/workflows/test_python.yml new file mode 100644 index 0000000..84c1544 --- 
/dev/null +++ b/.github/workflows/test_python.yml @@ -0,0 +1,46 @@ +name: test-python-package + +on: [push, pull_request] + +jobs: + build-and-test: + outputs: + VERSION: ${{ steps.get-version.outputs.VERSION }} + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: nschloe/action-cached-lfs-checkout@v1 + + - uses: actions/setup-python@v4 + name: Install Python + with: + python-version: '3.10' + + - uses: ./.github/actions/setup-rust + with: + cache: true + + - name: Install dependencies + run: | + sudo pip install --upgrade pip + pip install --user pytest hypothesis==6.72.4 wheel + + - name: Build wheel files + run: | + cd ${GITHUB_WORKSPACE}/python + mkdir ${GITHUB_WORKSPACE}/wheel_files + pip wheel . --wheel-dir ${GITHUB_WORKSPACE}/wheel_files + pip install --user ${GITHUB_WORKSPACE}/wheel_files/precellar*.whl + + - name: Get precellar version + id: get-version + run: | + VERSION_NUMBER=$(python -c "import precellar;print(precellar.__version__)") + echo $VERSION_NUMBER + echo "VERSION=$VERSION_NUMBER" >> $GITHUB_OUTPUT + + - name: Upload wheel files as artifacts + uses: actions/upload-artifact@v4 + with: + name: wheel-files + path: ./wheel_files/precellar*.whl \ No newline at end of file diff --git a/docs/_static/css/custom.css b/docs/_static/css/custom.css new file mode 100644 index 0000000..c7bef62 --- /dev/null +++ b/docs/_static/css/custom.css @@ -0,0 +1,15 @@ +:root { + /* Sidebar styles */ + --pst-sidebar-secondary: 15rem; +} + +/* Main page overview cards */ + +.bd-page-width { + max-width: 98rem; +} + +/* Dark theme tweaking */ +html[data-theme=dark] .sd-card img[src*='.svg'] { + filter: invert(0.82) brightness(0.8) contrast(1.2); +} \ No newline at end of file diff --git a/docs/_static/versions.json b/docs/_static/versions.json new file mode 100644 index 0000000..a292dde --- /dev/null +++ b/docs/_static/versions.json @@ -0,0 +1,18 @@ +[ + { + "name": "dev", + "version": "dev", + "url": "https://kzhang.org/SnapATAC2/version/dev/" + }, + { + "name": "2.7 (stable)", + "version": "2.7", + "preferred": true, + "url": 
"https://kzhang.org/SnapATAC2/version/2.7/" + }, + { + "name": "2.6", + "version": "2.6", + "url": "https://kzhang.org/SnapATAC2/version/2.6/" + } +] \ No newline at end of file diff --git a/docs/_templates/autosummary/class.rst b/docs/_templates/autosummary/class.rst new file mode 100644 index 0000000..3307d7e --- /dev/null +++ b/docs/_templates/autosummary/class.rst @@ -0,0 +1,31 @@ +{{ fullname | escape | underline}} + +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }} + + {% block attributes %} + {% if attributes %} + .. rubric:: Attributes + + .. autosummary:: + :toctree: . + {% for item in attributes %} + ~{{ fullname }}.{{ item }} + {%- endfor %} + {% endif %} + {% endblock %} + + {% block methods %} + {% if methods %} + .. rubric:: Methods + + .. autosummary:: + :toctree: . + {% for item in methods %} + {%- if item != '__init__' %} + ~{{ fullname }}.{{ item }} + {%- endif -%} + {%- endfor %} + {% endif %} + {% endblock %} \ No newline at end of file diff --git a/docs/_templates/layout.html b/docs/_templates/layout.html new file mode 100644 index 0000000..b553042 --- /dev/null +++ b/docs/_templates/layout.html @@ -0,0 +1,14 @@ +{% extends "!layout.html" %} + +{%- block extrahead %} +{{ super() }} + + + +{% endblock %} \ No newline at end of file diff --git a/docs/api.rst b/docs/api.rst new file mode 100644 index 0000000..6101d2e --- /dev/null +++ b/docs/api.rst @@ -0,0 +1,17 @@ +============= +API reference +============= + +This page gives an overview of all public precellar objects, functions and +methods. + +.. currentmodule:: precellar + +Backed AnnData objects +~~~~~~~~~~~~~~~~~~~~~~ + +.. 
autosummary:: + :toctree: _autosummary + + align + diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..83645b9 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,197 @@ +# -- Path setup -------------------------------------------------------------- + +import re +import sys +import warnings +import os +import subprocess + +import precellar + +# -- Software version -------------------------------------------------------- + +# The short X.Y version (including .devXXXX, rcX, b1 suffixes if present) +version = re.sub(r'(\d+\.\d+)\.\d+(.*)', r'\1\2', precellar.__version__) +version = re.sub(r'(\.dev\d+).*?$', r'\1', version) + +# The full version, including alpha/beta/rc tags. +release = precellar.__version__ + +# pyData/Sphinx-Theme version switcher; "dev" matches both PEP 440 ".devN" and Cargo-style "-dev" versions +if "dev" in version: + switcher_version = "dev" +else: + switcher_version = f"{version}" + +print(f'Building documentation for precellar {release} (short version: {version}, switcher version: {switcher_version})') + +# -- Project information ----------------------------------------------------- + +project = 'precellar' +copyright = '2024-2024, Regulatory Genomics Lab, Westlake University' +author = 'Kai Zhang' + +# -- General configuration --------------------------------------------------- + +suppress_warnings = ['ref.citation'] +default_role = 'code' +add_function_parentheses = False + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. 
+extensions = [ + "nbsphinx", + "myst_parser", + "sphinx.ext.autodoc", + "sphinx.ext.intersphinx", + "sphinx.ext.autosummary", + "sphinx.ext.coverage", + "sphinx.ext.mathjax", + "sphinx.ext.napoleon", + "sphinx.ext.linkcode", + "sphinx_autodoc_typehints", + "sphinx_plotly_directive", +] + +source_suffix = { + '.rst': 'restructuredtext', + '.txt': 'markdown', + '.md': 'markdown', +} + +myst_enable_extensions = [ + "amsmath", + #"colon_fence", + #"deflist", + "dollarmath", + #"fieldlist", + #"html_admonition", + #"html_image", + #"linkify", + #"replacements", + #"smartquotes", + #"strikethrough", + #"substitution", + #"tasklist", +] + +# Generate the API documentation when building +autosummary_generate = True +autodoc_member_order = 'bysource' +# autodoc_default_flags = ['members'] +napoleon_google_docstring = False +napoleon_numpy_docstring = True +napoleon_include_init_with_doc = False +napoleon_use_rtype = True # having a separate entry generally helps readability +napoleon_use_param = True +napoleon_custom_sections = [('Params', 'Parameters')] +todo_include_todos = False + +intersphinx_mapping = { + "numpy": ("https://numpy.org/doc/stable/", None), + "python": ("https://docs.python.org/3", None), +} + +smv_branch_whitelist = r'main' # Include all branches + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. 
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + + +# -- Options for HTML output ------------------------------------------------- + +html_theme = 'pydata_sphinx_theme' +html_show_sphinx = False +html_show_sourcelink = False +html_static_path = ['_static'] +html_css_files = [ + 'css/custom.css', +] + +html_theme_options = { + "logo": { + "text": "precellar", + "image_dark": "_static/logo-dark.svg", + "alt_text": "precellar", + }, + + "github_url": "https://github.com/kaizhang/SnapATAC2", + "external_links": [ + ], + "header_links_before_dropdown": 6, + + "navbar_center": ["version-switcher", "navbar-nav"], + "navbar_end": ["theme-switcher", "navbar-icon-links"], + "navbar_align": "left", + "show_version_warning_banner": switcher_version == "dev", + + "switcher": { + "version_match": switcher_version, + "json_url": "https://raw.githubusercontent.com/kaizhang/SnapATAC2/main/docs/_static/versions.json", + }, +} + +commit = subprocess.check_output(['git', 'rev-parse', 'HEAD']).strip().decode('ascii') +code_url = f"https://github.com/kaizhang/SnapATAC2/blob/{commit}" + +# based on numpy doc/source/conf.py +def linkcode_resolve(domain, info): + """ + Determine the URL corresponding to Python object + """ + import inspect + + if domain != "py": + return None + + modname = info["module"] + fullname = info["fullname"] + + submod = sys.modules.get(modname) + if submod is None: + return None + + obj = submod + for part in fullname.split("."): + try: + with warnings.catch_warnings(): + # Accessing deprecated objects will generate noisy warnings + warnings.simplefilter("ignore", FutureWarning) + obj = getattr(obj, part) + except AttributeError: + return None + + try: + fn = inspect.getsourcefile(inspect.unwrap(obj)) + except TypeError: + try: # property + fn = inspect.getsourcefile(inspect.unwrap(obj.fget)) + except (AttributeError, TypeError): + fn = None + if not fn: + return None + + try: + source, lineno = inspect.getsourcelines(obj) + except TypeError: + try: # 
property + source, lineno = inspect.getsourcelines(obj.fget) + except (AttributeError, TypeError): + lineno = None + except OSError: + lineno = None + + if lineno: + linespec = f"#L{lineno}-L{lineno + len(source) - 1}" + else: + linespec = "" + + fn = os.path.relpath(fn, start=os.path.dirname(precellar.__file__)) + + return f"{code_url}/snapatac2-python/python/snapatac2/{fn}{linespec}" \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..680032a --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,8 @@ +precellar: Single-cell genomics data preprocessing tools +======================================================== + +.. toctree:: + :maxdepth: 3 + :hidden: + + api \ No newline at end of file diff --git a/precellar/Cargo.toml b/precellar/Cargo.toml index 8f47d6c..d5991a4 100644 --- a/precellar/Cargo.toml +++ b/precellar/Cargo.toml @@ -6,7 +6,8 @@ edition = "2021" [dependencies] anyhow = "1.0" bed-utils = "0.5.1" -bwa = { git = "https://github.com/regulatory-genomics/bwa-rust.git", rev = "69d482501956039588f94ce9f87367d7ae8f19af" } +#bwa = { git = "https://github.com/regulatory-genomics/bwa-rust.git", rev = "69d482501956039588f94ce9f87367d7ae8f19af" } +bwa-mem2 = { git = "https://github.com/regulatory-genomics/bwa-mem2-rust.git", rev = "27fdac5869958b82d8125be2d84ac2a51acd098c" } bstr = "1.0" cached-path = "0.6" either = "1.13" diff --git a/precellar/src/align.rs b/precellar/src/align.rs index a468bfb..6c99e81 100644 --- a/precellar/src/align.rs +++ b/precellar/src/align.rs @@ -14,7 +14,7 @@ use noodles::sam::alignment::{ Record, record_buf::RecordBuf, record::data::field::tag::Tag, }; use noodles::sam::alignment::record_buf::data::field::value::Value; -use bwa::BurrowsWheelerAligner; +use bwa_mem2::BurrowsWheelerAligner; use log::info; use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator}; use either::Either; @@ -24,9 +24,9 @@ pub trait Alinger { fn header(&self) -> sam::Header; - fn 
align_reads(&self, records: &mut [fastq::Record]) -> impl ExactSizeIterator; + fn align_reads(&mut self, records: &mut [fastq::Record]) -> impl ExactSizeIterator; - fn align_read_pairs(&self, records: &mut [(fastq::Record, fastq::Record)]) -> + fn align_read_pairs(&mut self, records: &mut [(fastq::Record, fastq::Record)]) -> impl ExactSizeIterator; } @@ -39,11 +39,11 @@ impl Alinger for BurrowsWheelerAligner { self.get_sam_header() } - fn align_reads(&self, records: &mut [fastq::Record]) -> impl ExactSizeIterator { + fn align_reads(&mut self, records: &mut [fastq::Record]) -> impl ExactSizeIterator { self.align_reads(records) } - fn align_read_pairs(&self, records: &mut [(fastq::Record, fastq::Record)]) -> + fn align_read_pairs(&mut self, records: &mut [(fastq::Record, fastq::Record)]) -> impl ExactSizeIterator { self.align_read_pairs(records) } @@ -188,7 +188,7 @@ impl FastqProcessor { }) } - pub fn gen_raw_alignments(&self) -> + pub fn gen_raw_alignments(&mut self) -> impl Iterator, Vec<(sam::Record, sam::Record)>>> + '_ { let fq_records = self.gen_raw_fastq_records(); @@ -498,7 +498,7 @@ impl<'a, R: std::io::Read> Iterator for NameCollatedRecords<'a, R> { #[cfg(test)] mod tests { - use bwa::{AlignerOpts, FMIndex, PairedEndStats}; + use bwa_mem2::{AlignerOpts, FMIndex, PairedEndStats}; use super::*; diff --git a/python/Cargo.toml b/python/Cargo.toml index 7c2e7e4..f185e77 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "precellar-py" -version = "0.1.0" +version = "0.1.0-dev" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -10,7 +10,8 @@ crate-type = ["cdylib"] [dependencies] anyhow = "1.0" -bwa = { git = "https://github.com/regulatory-genomics/bwa-rust.git", rev = "69d482501956039588f94ce9f87367d7ae8f19af" } +#bwa = { git = "https://github.com/regulatory-genomics/bwa-rust.git", rev = "69d482501956039588f94ce9f87367d7ae8f19af" } +bwa-mem2 = { git = 
"https://github.com/regulatory-genomics/bwa-mem2-rust.git", rev = "27fdac5869958b82d8125be2d84ac2a51acd098c" } bstr = "1.0" either = "1.13" itertools = "0.13" diff --git a/python/src/lib.rs b/python/src/lib.rs index d72588e..27cbdd1 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -1,7 +1,7 @@ mod utils; use std::{collections::HashMap, path::PathBuf, str::FromStr}; -use bwa::{AlignerOpts, BurrowsWheelerAligner, FMIndex, PairedEndStats}; +use bwa_mem2::{AlignerOpts, BurrowsWheelerAligner, FMIndex, PairedEndStats}; use either::Either; use pyo3::prelude::*; use anyhow::Result; @@ -23,6 +23,41 @@ use tikv_jemallocator::Jemalloc; #[global_allocator] static GLOBAL: Jemalloc = Jemalloc; +/// Align fastq reads to the reference genome and generate unique fragments. +/// +/// Parameters +/// ---------- +/// +/// seqspec: Path +/// File path to the sequencing specification, see https://github.com/pachterlab/seqspec. +/// genome_index: Path +/// File path to the genome index. +/// modality: str +/// The modality of the sequencing data, e.g., "rna" or "atac". +/// output_bam: Path | None +/// File path to the output bam file. If None, the bam file will not be generated. +/// output_fragment: Path | None +/// File path to the output fragment file. If None, the fragment file will not be generated. +/// mito_dna: list[str] +/// List of mitochondrial DNA names. +/// shift_left: int +/// The number of bases to shift the left end of the fragment. +/// shift_right: int +/// The number of bases to shift the right end of the fragment. +/// compression: str | None +/// The compression algorithm to use for the output fragment file. +/// If None, the compression algorithm will be inferred from the file extension. +/// compression_level: int | None +/// The compression level to use for the output fragment file. +/// temp_dir: Path | None +/// The temporary directory to use. +/// num_threads: int +/// The number of threads to use. 
+/// +/// Returns +/// ------- +/// dict +/// A dictionary containing the QC metrics of the alignment and fragment generation. #[pyfunction] #[pyo3( signature = ( @@ -60,7 +95,7 @@ fn align( let spec = SeqSpec::from_path(&seqspec).unwrap(); let aligner = BurrowsWheelerAligner::new( FMIndex::read(genome_index).unwrap(), - AlignerOpts::default().set_n_threads(num_threads as usize), + AlignerOpts::default().with_n_threads(num_threads as usize), PairedEndStats::default() ); let header = aligner.header(); @@ -129,6 +164,7 @@ fn align( output, *, mito_dna=vec!["chrM".to_owned(), "M".to_owned()], + chunk_size=50000000, compression=None, compression_level=None, temp_dir=None, @@ -141,6 +177,7 @@ fn make_fragment( input: PathBuf, output : PathBuf, mito_dna: Vec, + chunk_size: usize, compression: Option<&str>, compression_level: Option, temp_dir: Option, @@ -163,7 +200,7 @@ fn make_fragment( align_qc.update(&x.0, &header); align_qc.update(&x.1, &header); x - }).chunks(5000000); + }).chunks(chunk_size); let alignments = chunks.into_iter().map(|chunk| Either::Right(chunk.collect_vec())); let compression = compression.map(|x| Compression::from_str(x).unwrap()) @@ -197,6 +234,8 @@ fn precellar(m: &Bound<'_, PyModule>) -> PyResult<()> { .filter_level(log::LevelFilter::Info) .try_init().unwrap(); + m.add("__version__", env!("CARGO_PKG_VERSION"))?; + m.add_function(wrap_pyfunction!(align, m)?)?; m.add_function(wrap_pyfunction!(make_fragment, m)?)?; diff --git a/python/src/utils.rs b/python/src/utils.rs index dae81a7..6c2e9f6 100644 --- a/python/src/utils.rs +++ b/python/src/utils.rs @@ -7,19 +7,13 @@ use regex::Regex; #[pyfunction] #[pyo3( - signature = ( - in_fq, - out_fq, - *, - regex, - out_barcode=None, - left_add=0, - right_add=0, - compression=None, - compression_level=None, - num_threads=8, + signature = (in_fq, out_fq, + *, regex, out_barcode=None, left_add=0, right_add=0, + compression=None, compression_level=None, num_threads=8, ), - text_signature = "(in_fq, out_fq, *, 
regex, out_barcode=None, left_add=0, right_add=0, + compression=None, compression_level=None, num_threads=8)", )] fn strip_barcode_from_fastq( py: Python<'_>,