From bdfdd8763b61e64ec4dcc56da52099b5274b6173 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Tue, 2 Jul 2024 09:20:40 +0200 Subject: [PATCH] update to viash 0.9 --- .github/workflows/build.yml | 119 +----------- .github/workflows/test.yml | 106 +---------- _viash.yaml | 34 ++-- nextflow.config | 2 + scripts/script.R | 48 +++++ src/common/create_component/config.vsh.yaml | 4 +- src/common/create_component/script.py | 4 +- .../sync_test_resources/config.vsh.yaml | 4 +- .../ground_truth/config.vsh.yaml | 25 +-- .../mean_across_celltypes/config.vsh.yaml | 25 +-- .../mean_across_compounds/config.vsh.yaml | 4 +- .../mean_outcome/config.vsh.yaml | 25 +-- src/control_methods/sample/config.vsh.yaml | 25 +-- src/control_methods/zeros/config.vsh.yaml | 23 +-- src/methods/jn_ap_op2/config.vsh.yaml | 63 +++--- src/methods/lgc_ensemble/config.vsh.yaml | 95 ++++----- .../lgc_ensemble_direct/config.vsh.yaml | 103 +++++----- .../lgc_ensemble_predict/config.vsh.yaml | 69 +++---- .../lgc_ensemble_prepare/config.vsh.yaml | 115 +++++------ .../lgc_ensemble_train/config.vsh.yaml | 95 ++++----- .../config.vsh.yaml | 75 ++++---- src/methods/pyboost/config.vsh.yaml | 73 +++---- src/methods/scape/config.vsh.yaml | 129 ++++++------- .../transformer_ensemble/config.vsh.yaml | 91 ++++----- .../mean_rowwise_correlation/config.vsh.yaml | 101 +++++----- .../mean_rowwise_error/config.vsh.yaml | 73 +++---- .../add_uns_metadata/config.vsh.yaml | 107 +++++------ src/process_dataset/bootstrap/config.vsh.yaml | 109 +++++------ .../compute_pseudobulk/config.vsh.yaml | 51 ++--- .../convert_h5ad_to_parquet/config.vsh.yaml | 83 ++++---- .../config.vsh.yaml | 147 +++++++------- .../filter_obs/config.vsh.yaml | 51 ++--- .../filter_vars/config.vsh.yaml | 51 ++--- .../generate_id_map/config.vsh.yaml | 51 ++--- src/process_dataset/run_limma/config.vsh.yaml | 95 ++++----- src/workflows/process_dataset/config.vsh.yaml | 88 ++++----- src/workflows/run_benchmark/config.vsh.yaml | 180 +++++++++--------- .../run_stability_analysis/config.vsh.yaml | 168 ++++++++-------- 38 files changed, 1304 insertions(+), 1407 deletions(-) create mode 100644 scripts/script.R diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index cb1323d6..f5bc8988 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,120 +1,21 @@ -name: build +name: Build on: push: branches: [ 'main' ] workflow_dispatch: inputs: - target_branch: - description: 'Branch to deploy to. If not specified, `build-${BRANCH_NAME}` will be used.' - required: false version: - description: 'Version name to use for the build. If not specified, `build-${BRANCH_NAME}` will be used.' + description: | + The version of the project to build. Example: `1.0.3`. + + If not provided, a development build with a version name + based on the branch name will be built. Otherwise, a release + build with the provided version will be built. 
required: false jobs: - # phase 1 - list: - runs-on: ubuntu-latest - - outputs: - target_branch: ${{ steps.defaults.outputs.target_branch }} - version: ${{ steps.defaults.outputs.version }} - component_matrix: ${{ steps.set_matrix.outputs.matrix }} - - steps: - - uses: actions/checkout@v4 - - - uses: viash-io/viash-actions/setup@v5 - - - name: Determine version tag from branch name - id: defaults - run: | - BRANCH_NAME=$(echo $GITHUB_REF | sed 's/refs\/heads\///') - - VERSION=${{ github.event.inputs.version }} - if [ -z "$VERSION" ]; then - VERSION="build-$BRANCH_NAME" - fi - echo "version=$VERSION" >> $GITHUB_OUTPUT - - TARGET_BRANCH=${{ github.event.inputs.target_branch }} - if [ -z "$TARGET_BRANCH" ]; then - TARGET_BRANCH="build-$BRANCH_NAME" - fi - echo "target_branch=$TARGET_BRANCH" >> $GITHUB_OUTPUT - - - name: Remove target folder from .gitignore - run: | - # allow publishing the target folder - sed -i '/^target.*/d' .gitignore - - - uses: viash-io/viash-actions/ns-build@v5 - with: - config_mod: .functionality.version := '${{ steps.defaults.outputs.version }}' - parallel: true - - - name: Deploy to target branch - uses: peaceiris/actions-gh-pages@v4 - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - publish_dir: . - publish_branch: ${{ steps.defaults.outputs.target_branch }} - - - id: ns_list - uses: viash-io/viash-actions/ns-list@v5 - with: - platform: docker - src: src - format: json - - - id: set_matrix - run: | - echo "matrix=$(jq -c '[ .[] | - { - "name": (.functionality.namespace + "/" + .functionality.name), - "dir": .info.config | capture("^(?.*\/)").dir - } - ]' ${{ steps.ns_list.outputs.output_file }} )" >> $GITHUB_OUTPUT - - # phase 2 build: - needs: list - - runs-on: ubuntu-latest - - strategy: - fail-fast: false - matrix: - component: ${{ fromJson(needs.list.outputs.component_matrix) }} - - steps: - # Remove unnecessary files to free up space. Otherwise, we get 'no space left on device.' 
- - uses: data-intuitive/reclaim-the-bytes@v2 - - - uses: actions/checkout@v4 - - - uses: viash-io/viash-actions/setup@v5 - - - name: Build container - uses: viash-io/viash-actions/ns-build@v5 - with: - config_mod: .functionality.version := '${{ needs.list.outputs.version }}' - platform: docker - src: ${{ matrix.component.dir }} - setup: build - - - name: Login to container registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ secrets.GTHB_USER }} - password: ${{ secrets.GTHB_PAT }} - - - name: Push container - uses: viash-io/viash-actions/ns-build@v5 - with: - config_mod: .functionality.version := '${{ needs.list.outputs.version }}' - platform: docker - src: ${{ matrix.component.dir }} - setup: push \ No newline at end of file + uses: openproblems-bio/actions/.github/workflows/build.yml@main + with: + version: ${{ github.event.inputs.version }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d1892c8a..87537860 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,107 +1,9 @@ -name: test +name: Test on: - pull_request: push: - branches: [ '**' ] + pull_request: jobs: - run_ci_check_job: - runs-on: ubuntu-latest - outputs: - run_ci: ${{ steps.github_cli.outputs.check }} - steps: - - name: 'Check if branch has an existing pull request and the trigger was a push' - id: github_cli - run: | - pull_request=$(gh pr list -R ${{ github.repository }} -H ${{ github.ref_name }} --json url --state open --limit 1 | jq '.[0].url') - # If the branch has a PR and this run was triggered by a push event, do not run - if [[ "$pull_request" != "null" && "$GITHUB_REF_NAME" != "main" && "${{ github.event_name == 'push' }}" == "true" && "${{ !contains(github.event.head_commit.message, 'ci force') }}" == "true" ]]; then - echo "check=false" >> $GITHUB_OUTPUT - else - echo "check=true" >> $GITHUB_OUTPUT - fi - env: - GITHUB_TOKEN: ${{ secrets.GTHB_PAT }} - - # phase 1 - list: - needs: run_ci_check_job - env: - s3_bucket: s3://openproblems-bio/public/neurips-2023-competition/workflow-resources/ - runs-on: ubuntu-latest - if: ${{ needs.run_ci_check_job.outputs.run_ci == 'true' }} - - outputs: - matrix: ${{ steps.set_matrix.outputs.matrix }} - cache_key: ${{ steps.cache.outputs.cache_key }} - - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - uses: viash-io/viash-actions/setup@v5 - - - uses: viash-io/viash-actions/project/sync-and-cache-s3@v5 - id: cache - with: - s3_bucket: $s3_bucket - dest_path: resources - cache_key_prefix: resources__ - - - id: ns_list - uses: viash-io/viash-actions/ns-list@v5 - with: - platform: docker - format: json - - - id: ns_list_filtered - uses: viash-io/viash-actions/project/detect-changed-components@v5 - with: - input_file: "${{ steps.ns_list.outputs.output_file }}" - - - id: set_matrix - run: | - echo "matrix=$(jq -c '[ .[] | - { - "name": (.functionality.namespace + "/" + .functionality.name), - "config": .info.config - } - ]' ${{ steps.ns_list_filtered.outputs.output_file }} )" >> $GITHUB_OUTPUT - - # phase 2 - viash_test: - needs: list - if: ${{ needs.list.outputs.matrix != '[]' && needs.list.outputs.matrix != '' }} - runs-on: ubuntu-latest - - strategy: - fail-fast: false - matrix: - component: ${{ fromJson(needs.list.outputs.matrix) }} - - steps: - # Remove unnecessary files to free up space. Otherwise, we get 'no space left on device.' 
- - uses: data-intuitive/reclaim-the-bytes@v2 - - - uses: actions/checkout@v4 - - - uses: viash-io/viash-actions/setup@v5 - - # use cache - - name: Cache resources data - uses: actions/cache@v4 - timeout-minutes: 10 - with: - path: resources - key: ${{ needs.list.outputs.cache_key }} - - - name: Run test - timeout-minutes: 30 - run: | - VIASH_TEMP=$RUNNER_TEMP/viash viash test \ - "${{ matrix.component.config }}" \ - --cpus 2 \ - --memory "5gb" - + build: + uses: openproblems-bio/actions/.github/workflows/test.yml@main diff --git a/_viash.yaml b/_viash.yaml index 5106f43c..c0d17ca7 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -1,15 +1,25 @@ -viash_version: 0.8.6 +name: task_perturbation_prediction +version: 1.1.0 -source: src -target: target +# package metadata +description: | + Predicting how small molecules change gene expression in different cell types. +license: MIT +keywords: [single-cell, perturbation prediction, perturbation, openproblems, benchmark] +links: + issue_tracker: https://github.com/openproblems-bio/task_perturbation_prediction/issues + repository: https://github.com/openproblems-bio/task_perturbation_prediction + docker_registry: ghcr.io +# technical settings +organization: openproblems-bio +viash_version: 0.9.0-RC6 +info: + test_resources: + - type: s3 + path: s3://openproblems-data/resources/perturbation_prediction + dest: resources + +# set default labels config_mods: | - .functionality.version := 'dev' - .functionality.arguments[.multiple == true].multiple_sep := ';' - .platforms[.type == 'docker'].target_registry := 'ghcr.io' - .platforms[.type == 'docker'].target_organization := 'openproblems-bio/task_perturbation_prediction' - .platforms[.type == 'docker'].target_image_source := 'https://github.com/openproblems-bio/task_perturbation_prediction' - .platforms[.type == "nextflow"].directives.tag := "$id" - .platforms[.type == "nextflow"].auto.simplifyOutput := false - .platforms[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" } - .platforms[.type == "nextflow"].config.script := "process.errorStrategy = 'ignore'" \ No newline at end of file + .runners[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" } \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 6402ebf2..a3fd6d7f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1 +1,3 @@ process.container = 'nextflow/bash:latest' + +process.errorStrategy = "ignore" \ No newline at end of file diff --git a/scripts/script.R b/scripts/script.R new file mode 100644 index 00000000..19dd2c26 --- /dev/null +++ b/scripts/script.R @@ -0,0 +1,48 @@ +library(tidyverse) + +# aws s3 sync s3://openproblems-data/resources/perturbation_prediction/results output/benchmark_results + +df <- yaml::read_yaml("output/benchmark_results/kaggle_2024-06-02_22-27-09/score_uns.yaml") %>% + map_dfr(as.data.frame) %>% + as_tibble + +df %>% filter(metric_ids == "mean_rowwise_rmse") %>% arrange(metric_values) %>% select(method_id, metric_values) + +# method_id metric_values +# +# 1 ground_truth 0 +# 2 nn_retraining_with_pseudolabels 1.29 +# 3 
scape                              1.31
#  4 pyboost                            1.32
#  5 jn_ap_op2                          1.34
#  6 lgc_ensemble                       1.41
#  7 mean_across_compounds              1.47
#  8 transformer_ensemble               1.55
#  9 zeros                              1.57
# 10 mean_outcome                       1.57
# 11 mean_across_celltypes              2.50
# 12 sample                             3.02

#######

df <- yaml::read_yaml("output/benchmark_results/run_2024-06-02_22-27-09/score_uns.yaml") %>%
  map_dfr(as.data.frame) %>%
  as_tibble

df %>% filter(metric_ids == "mean_rowwise_rmse") %>% arrange(metric_values) %>% select(method_id, metric_values)

# # A tibble: 12 × 2
#    method_id                       metric_values
#    <chr>                                   <dbl>
#  1 ground_truth                            0
#  2 nn_retraining_with_pseudolabels         0.757
#  3 scape                                   0.775
#  4 pyboost                                 0.795
#  5 lgc_ensemble                            0.802
#  6 mean_across_celltypes                   0.892
#  7 jn_ap_op2                               0.894
#  8 transformer_ensemble                    0.897
#  9 mean_outcome                            0.899
# 10 zeros                                   0.918
# 11 mean_across_compounds                   0.943
# 12 sample                                  1.36
\ No newline at end of file
diff --git a/src/common/create_component/config.vsh.yaml b/src/common/create_component/config.vsh.yaml
index 9a214832..9193f89d 100644
--- a/src/common/create_component/config.vsh.yaml
+++ b/src/common/create_component/config.vsh.yaml
@@ -44,13 +44,15 @@ functionality:
     - type: python_script
       path: script.py
     - path: read_and_merge_yaml.py
-platforms:
+engines:
   - type: docker
     image: python:3.10-slim
     setup:
       - type: python
         pypi: ruamel.yaml
   - type: native
+runners:
+  - type: executable
   - type: nextflow
diff --git a/src/common/create_component/script.py b/src/common/create_component/script.py
index 65aaad9a..f6e6ceee 100644
--- a/src/common/create_component/script.py
+++ b/src/common/create_component/script.py
@@ -60,7 +60,9 @@ def create_config(par, component_type, pretty_name, script_path) -> str:
   |  # This platform allows running the component natively
   |  - type: native
   |  # Allows turning the component into a Nextflow module / pipeline.
-  |  - type: nextflow
+  |runners:
+  |  - type: executable
+  |  - type: nextflow
   |    directives:
   |      label: [midtime,midmem,midcpu]
   |'''
diff --git a/src/common/sync_test_resources/config.vsh.yaml b/src/common/sync_test_resources/config.vsh.yaml
index 017f1dfe..5189fe43 100644
--- a/src/common/sync_test_resources/config.vsh.yaml
+++ b/src/common/sync_test_resources/config.vsh.yaml
@@ -34,8 +34,10 @@ functionality:
   resources:
     - type: bash_script
       path: script.sh
-platforms:
+engines:
   - type: docker
     image: "amazon/aws-cli:2.7.12"
   - type: native
+runners:
+  - type: executable
   - type: nextflow
diff --git a/src/control_methods/ground_truth/config.vsh.yaml b/src/control_methods/ground_truth/config.vsh.yaml
index 2542457b..46b6ebbb 100644
--- a/src/control_methods/ground_truth/config.vsh.yaml
+++ b/src/control_methods/ground_truth/config.vsh.yaml
@@ -1,16 +1,15 @@
 __merge__: ../../api/comp_control_method.yaml
-functionality:
-  name: ground_truth
-  info:
-    label: Ground truth
-    summary: "Returns the ground truth predictions."
-    description: |
-      The identity function that returns the ground-truth information as the output.
-    preferred_normalization: counts
-  resources:
-    - type: r_script
-      path: script.R
+name: ground_truth
+info:
+  label: Ground truth
+  summary: "Returns the ground truth predictions."
+  description: |
+    The identity function that returns the ground-truth information as the output. 
+ preferred_normalization: counts +resources: + - type: r_script + path: script.R +engines: - type: docker image: ghcr.io/openproblems-bio/base_r:1.0.4 setup: @@ -18,6 +17,8 @@ platforms: cran: [ arrow, dplyr ] - type: python packages: [ fastparquet ] +runners: + - type: executable - type: nextflow directives: label: [ midtime, highmem, highcpu ] \ No newline at end of file diff --git a/src/control_methods/mean_across_celltypes/config.vsh.yaml b/src/control_methods/mean_across_celltypes/config.vsh.yaml index a5102739..9ae27c44 100644 --- a/src/control_methods/mean_across_celltypes/config.vsh.yaml +++ b/src/control_methods/mean_across_celltypes/config.vsh.yaml @@ -1,21 +1,22 @@ __merge__: ../../api/comp_control_method.yaml -functionality: - name: mean_across_celltypes - info: - label: Mean per cell type and gene - summary: Baseline method that returns mean of cell type's outcomes - description: | - Baseline method that predicts for a cell type the mean of its outcomes of all compounds. - resources: - - type: python_script - path: script.py - - path: ../../utils/anndata_to_dataframe.py -platforms: +name: mean_across_celltypes +info: + label: Mean per cell type and gene + summary: Baseline method that returns mean of cell type's outcomes + description: | + Baseline method that predicts for a cell type the mean of its outcomes of all compounds. +resources: + - type: python_script + path: script.py + - path: ../../utils/anndata_to_dataframe.py +engines: - type: docker image: ghcr.io/openproblems-bio/base_python:1.0.4 setup: - type: python packages: [ fastparquet ] +runners: + - type: executable - type: nextflow directives: label: [ midtime, highmem, highcpu ] \ No newline at end of file diff --git a/src/control_methods/mean_across_compounds/config.vsh.yaml b/src/control_methods/mean_across_compounds/config.vsh.yaml index 79c6a2cc..bfc71b1c 100644 --- a/src/control_methods/mean_across_compounds/config.vsh.yaml +++ b/src/control_methods/mean_across_compounds/config.vsh.yaml @@ -10,12 +10,14 @@ functionality: - type: python_script path: script.py - path: ../../utils/anndata_to_dataframe.py -platforms: +engines: - type: docker image: ghcr.io/openproblems-bio/base_python:1.0.4 setup: - type: python packages: [ fastparquet ] +runners: + - type: executable - type: nextflow directives: label: [ midtime, highmem, highcpu ] \ No newline at end of file diff --git a/src/control_methods/mean_outcome/config.vsh.yaml b/src/control_methods/mean_outcome/config.vsh.yaml index 1f5a74cc..5f4eff95 100644 --- a/src/control_methods/mean_outcome/config.vsh.yaml +++ b/src/control_methods/mean_outcome/config.vsh.yaml @@ -1,21 +1,22 @@ __merge__: ../../api/comp_control_method.yaml -functionality: - name: mean_outcome - info: - label: Mean per gene - summary: Baseline method that returns mean of gene's outcomes - description: | - Baseline method that predicts for a gene the mean of its outcomes of all samples. - resources: - - type: python_script - path: script.py - - path: ../../utils/anndata_to_dataframe.py -platforms: +name: mean_outcome +info: + label: Mean per gene + summary: Baseline method that returns mean of gene's outcomes + description: | + Baseline method that predicts for a gene the mean of its outcomes of all samples. 
+resources: + - type: python_script + path: script.py + - path: ../../utils/anndata_to_dataframe.py +engines: - type: docker image: ghcr.io/openproblems-bio/base_python:1.0.4 setup: - type: python packages: [ fastparquet ] +runners: + - type: executable - type: nextflow directives: label: [ midtime, highmem, highcpu ] \ No newline at end of file diff --git a/src/control_methods/sample/config.vsh.yaml b/src/control_methods/sample/config.vsh.yaml index fcc5363a..aa580802 100644 --- a/src/control_methods/sample/config.vsh.yaml +++ b/src/control_methods/sample/config.vsh.yaml @@ -1,16 +1,15 @@ __merge__: ../../api/comp_control_method.yaml -functionality: - name: sample - info: - label: Sample - summary: Sample predictions from the training data - description: | - This method samples the training data to generate predictions. - preferred_normalization: counts - resources: - - type: r_script - path: script.R -platforms: +name: sample +info: + label: Sample + summary: Sample predictions from the training data + description: | + This method samples the training data to generate predictions. + preferred_normalization: counts +resources: + - type: r_script + path: script.R +engines: - type: docker image: ghcr.io/openproblems-bio/base_r:1.0.4 setup: @@ -18,6 +17,8 @@ platforms: cran: [ arrow, dplyr ] - type: python packages: [ fastparquet ] +runners: + - type: executable - type: nextflow directives: label: [ midtime, highmem, highcpu ] \ No newline at end of file diff --git a/src/control_methods/zeros/config.vsh.yaml b/src/control_methods/zeros/config.vsh.yaml index 7f5be374..3cdc25c2 100644 --- a/src/control_methods/zeros/config.vsh.yaml +++ b/src/control_methods/zeros/config.vsh.yaml @@ -1,20 +1,21 @@ __merge__: ../../api/comp_control_method.yaml -functionality: - name: zeros - info: - label: Zeros - summary: Baseline method that predicts all zeros - description: | - Baseline method that predicts all zeros. - resources: - - type: python_script - path: script.py -platforms: +name: zeros +info: + label: Zeros + summary: Baseline method that predicts all zeros + description: | + Baseline method that predicts all zeros. +resources: + - type: python_script + path: script.py +engines: - type: docker image: ghcr.io/openproblems-bio/base_python:1.0.4 setup: - type: python packages: [ fastparquet ] +runners: + - type: executable - type: nextflow directives: label: [ midtime, highmem, highcpu ] \ No newline at end of file diff --git a/src/methods/jn_ap_op2/config.vsh.yaml b/src/methods/jn_ap_op2/config.vsh.yaml index 1da6eb5b..35c3b060 100644 --- a/src/methods/jn_ap_op2/config.vsh.yaml +++ b/src/methods/jn_ap_op2/config.vsh.yaml @@ -1,36 +1,35 @@ __merge__: ../../api/comp_method.yaml -functionality: - name: jn_ap_op2 - info: - label: JN-AP-OP2 - neurips2023_rank: 20 - summary: "Deep learning architecture composed of 2 modules: a sample-centric MLP and a gene-centric MLP" - description: | - We first encode each sample using leave-one-out encoder based on compound and cell type. This produces X with the dimension of n_samples, n_genes, n_encode, - where n_encode is 2. Then, X is passed to a MLP1 sample-wise with input of n_samples, n_genes*n_encode, which outputs the same dimension data. - The purpose of this MLP is to learn inter-gene relationships. Then, we group the output of MLP1 with X (original encoded data) and feed it - to MLP2 which receives n_smaples*n_genes, (n_encode + n_encode) and results n_samples*n_genes. This MLP2 trains on each (compound, cell_type, gene) - combination. 
This is to overcome the underdetermination problem due to lack of sufficient (compound, cell_type) samples.
-    documentation_url: https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/discussion/461159
-    repository_url: https://github.com/AntoinePassemiers/Open-Challenges-Single-Cell-Perturbations
-  arguments:
-    - type: integer
-      name: --n_replica
-      default: 10
-      info:
-        test_default: 1
-    - type: string
-      name: --submission_names
-      multiple: true
-      default: [dl40, dl200]
-      info:
-        test_default: [dl40]
-  resources:
-    - type: python_script
-      path: script.py
-    - path: helper.py
+name: jn_ap_op2
+info:
+  label: JN-AP-OP2
+  neurips2023_rank: 20
+  summary: "Deep learning architecture composed of 2 modules: a sample-centric MLP and a gene-centric MLP"
+  description: |
+    We first encode each sample using a leave-one-out encoder based on compound and cell type. This produces X with dimensions n_samples, n_genes, n_encode,
+    where n_encode is 2. Then, X is passed sample-wise to MLP1, which takes an input of n_samples, n_genes*n_encode and outputs data of the same dimensions.
+    The purpose of this MLP is to learn inter-gene relationships. Then, we group the output of MLP1 with X (the original encoded data) and feed it
+    to MLP2, which receives n_samples*n_genes, (n_encode + n_encode) and outputs n_samples*n_genes. This MLP2 trains on each (compound, cell_type, gene)
+    combination. This is to overcome the underdetermination problem due to the lack of sufficient (compound, cell_type) samples.
+  documentation_url: https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/discussion/461159
+  repository_url: https://github.com/AntoinePassemiers/Open-Challenges-Single-Cell-Perturbations
+arguments:
+  - type: integer
+    name: --n_replica
+    default: 10
+    info:
+      test_default: 1
+  - type: string
+    name: --submission_names
+    multiple: true
+    default: [dl40, dl200]
+    info:
+      test_default: [dl40]
+resources:
+  - type: python_script
+    path: script.py
+  - path: helper.py
-platforms:
+engines:
   - type: docker
     image: ghcr.io/openproblems-bio/base_pytorch_nvidia:1.0.4
     setup:
@@ -41,6 +40,8 @@
       - category_encoders
   - type: native
+runners:
+  - type: executable
   - type: nextflow
     directives:
       label: [ hightime, midmem, highcpu, gpu ]
diff --git a/src/methods/lgc_ensemble/config.vsh.yaml b/src/methods/lgc_ensemble/config.vsh.yaml
index b36a8aa4..2190b10b 100644
--- a/src/methods/lgc_ensemble/config.vsh.yaml
+++ b/src/methods/lgc_ensemble/config.vsh.yaml
@@ -1,55 +1,56 @@
 __merge__: ../../api/wf_method.yaml
-functionality:
-  name: lgc_ensemble
-  info:
-    label: LSTM-GRU-CNN Ensemble
-    neurips2023_rank: 1
-    summary: An ensemble of LSTM, GRU, and 1D CNN models
-    description: |
-      An ensemble of LSTM, GRU, and 1D CNN models with a variety of input features derived from ChemBERTa embeddings,
-      one-hot encoding of cell type/small molecule pairs, and various statistical measures of target gene expression.
-      The models were trained with a combination of MSE, MAE, LogCosh, and BCE loss functions to improve their
-      robustness and predictive performance. The approach also included data augmentation techniques to ensure
-      generalization and account for noise in the data. 
- documentation_url: https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/discussion/459258 - repository_url: https://github.com/Jean-KOUAGOU/1st-place-solution-single-cell-pbs/tree/main +name: lgc_ensemble +info: + label: LSTM-GRU-CNN Ensemble + neurips2023_rank: 1 + summary: An ensemble of LSTM, GRU, and 1D CNN models + description: | + An ensemble of LSTM, GRU, and 1D CNN models with a variety of input features derived from ChemBERTa embeddings, + one-hot encoding of cell type/small molecule pairs, and various statistical measures of target gene expression. + The models were trained with a combination of MSE, MAE, LogCosh, and BCE loss functions to improve their + robustness and predictive performance. The approach also included data augmentation techniques to ensure + generalization and account for noise in the data. + documentation_url: https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/discussion/459258 + repository_url: https://github.com/Jean-KOUAGOU/1st-place-solution-single-cell-pbs/tree/main - arguments: - - name: --epochs - type: integer - default: 250 - description: "Number of epochs to train." - info: - test_default: 1 - - name: --kf_n_splits - type: integer - default: 5 - description: "Number of splits for KFold." - info: - test_default: 2 - - name: --schemes - type: string - default: [initial, light, heavy] - multiple: true - info: - test_default: [initial, light] - - name: --models - type: string - default: [LSTM, GRU, Conv] - multiple: true - info: - test_default: [LSTM, GRU] +arguments: + - name: --epochs + type: integer + default: 250 + description: "Number of epochs to train." + info: + test_default: 1 + - name: --kf_n_splits + type: integer + default: 5 + description: "Number of splits for KFold." + info: + test_default: 2 + - name: --schemes + type: string + default: [initial, light, heavy] + multiple: true + info: + test_default: [initial, light] + - name: --models + type: string + default: [LSTM, GRU, Conv] + multiple: true + info: + test_default: [LSTM, GRU] - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf - dependencies: - - name: methods/lgc_ensemble_prepare - - name: methods/lgc_ensemble_train - - name: methods/lgc_ensemble_predict +dependencies: + - name: methods/lgc_ensemble_prepare + - name: methods/lgc_ensemble_train + - name: methods/lgc_ensemble_predict platforms: +runners: + - type: executable - type: nextflow diff --git a/src/methods/lgc_ensemble_direct/config.vsh.yaml b/src/methods/lgc_ensemble_direct/config.vsh.yaml index 5a6345e4..b5f76229 100644 --- a/src/methods/lgc_ensemble_direct/config.vsh.yaml +++ b/src/methods/lgc_ensemble_direct/config.vsh.yaml @@ -1,59 +1,58 @@ __merge__: ../../api/comp_method.yaml -functionality: - name: lgc_ensemble_direct - info: - label: LSTM-GRU-CNN Ensemble - neurips2023_rank: 1 - summary: An ensemble of LSTM, GRU, and 1D CNN models - description: | - An ensemble of LSTM, GRU, and 1D CNN models with a variety of input features derived from ChemBERTa embeddings, - one-hot encoding of cell type/small molecule pairs, and various statistical measures of target gene expression. - The models were trained with a combination of MSE, MAE, LogCosh, and BCE loss functions to improve their - robustness and predictive performance. The approach also included data augmentation techniques to ensure - generalization and account for noise in the data. 
- documentation_url: https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/discussion/459258 - repository_url: https://github.com/Jean-KOUAGOU/1st-place-solution-single-cell-pbs/tree/main +name: lgc_ensemble_direct +info: + label: LSTM-GRU-CNN Ensemble + neurips2023_rank: 1 + summary: An ensemble of LSTM, GRU, and 1D CNN models + description: | + An ensemble of LSTM, GRU, and 1D CNN models with a variety of input features derived from ChemBERTa embeddings, + one-hot encoding of cell type/small molecule pairs, and various statistical measures of target gene expression. + The models were trained with a combination of MSE, MAE, LogCosh, and BCE loss functions to improve their + robustness and predictive performance. The approach also included data augmentation techniques to ensure + generalization and account for noise in the data. + documentation_url: https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/discussion/459258 + repository_url: https://github.com/Jean-KOUAGOU/1st-place-solution-single-cell-pbs/tree/main - arguments: - - name: --epochs - type: integer - default: 250 - description: "Number of epochs to train." - info: - test_default: 1 - - name: --kf_n_splits - type: integer - default: 5 - description: "Number of splits for KFold." - info: - test_default: 2 - - name: --schemes - type: string - default: [initial, light, heavy] - multiple: true - info: - test_default: [initial, light] - - name: --models - type: string - default: [LSTM, GRU, Conv] - multiple: true - info: - test_default: [LSTM, GRU] +arguments: + - name: --epochs + type: integer + default: 250 + description: "Number of epochs to train." + info: + test_default: 1 + - name: --kf_n_splits + type: integer + default: 5 + description: "Number of splits for KFold." 
+ info: + test_default: 2 + - name: --schemes + type: string + default: [initial, light, heavy] + multiple: true + info: + test_default: [initial, light] + - name: --models + type: string + default: [LSTM, GRU, Conv] + multiple: true + info: + test_default: [LSTM, GRU] - resources: - - type: python_script - path: script.py - - path: ../lgc_ensemble_helpers/helper_classes.py - - path: ../lgc_ensemble_helpers/helper_functions.py - - path: ../lgc_ensemble_helpers/models.py - - path: ../lgc_ensemble_helpers/predict.py - - path: ../lgc_ensemble_helpers/prepare_data.py - - path: ../lgc_ensemble_helpers/train.py - - path: ../lgc_ensemble_helpers/divisor_finder.py - - path: ../../utils/anndata_to_dataframe.py +resources: + - type: python_script + path: script.py + - path: ../lgc_ensemble_helpers/helper_classes.py + - path: ../lgc_ensemble_helpers/helper_functions.py + - path: ../lgc_ensemble_helpers/models.py + - path: ../lgc_ensemble_helpers/predict.py + - path: ../lgc_ensemble_helpers/prepare_data.py + - path: ../lgc_ensemble_helpers/train.py + - path: ../lgc_ensemble_helpers/divisor_finder.py + - path: ../../utils/anndata_to_dataframe.py -platforms: +engines: - type: docker image: ghcr.io/openproblems-bio/base_pytorch_nvidia:1.0.4 run_args: ["--shm-size=2g"] @@ -75,6 +74,8 @@ platforms: - torch-summary - type: native +runners: + - type: executable - type: nextflow directives: label: [hightime, veryhighmem, highcpu, highsharedmem, highgpu] diff --git a/src/methods/lgc_ensemble_predict/config.vsh.yaml b/src/methods/lgc_ensemble_predict/config.vsh.yaml index cf2f3393..904057ab 100644 --- a/src/methods/lgc_ensemble_predict/config.vsh.yaml +++ b/src/methods/lgc_ensemble_predict/config.vsh.yaml @@ -1,38 +1,37 @@ -functionality: - name: lgc_ensemble_predict - namespace: methods - arguments: - - name: --train_data_aug_dir - type: file - required: true - direction: input - - name: --model_files - type: file - required: true - direction: input - example: model.pt - multiple: true - - name: --id_map - type: file - required: true - direction: input - - name: --output - type: file - required: true - direction: output - resources: - - type: python_script - path: script.py - - path: ../lgc_ensemble_helpers/helper_classes.py - - path: ../lgc_ensemble_helpers/helper_functions.py - - path: ../lgc_ensemble_helpers/models.py - - path: ../lgc_ensemble_helpers/predict.py - - path: ../lgc_ensemble_helpers/prepare_data.py - - path: ../lgc_ensemble_helpers/train.py - - path: ../lgc_ensemble_helpers/divisor_finder.py - - path: ../../utils/anndata_to_dataframe.py +name: lgc_ensemble_predict +namespace: methods +arguments: + - name: --train_data_aug_dir + type: file + required: true + direction: input + - name: --model_files + type: file + required: true + direction: input + example: model.pt + multiple: true + - name: --id_map + type: file + required: true + direction: input + - name: --output + type: file + required: true + direction: output +resources: + - type: python_script + path: script.py + - path: ../lgc_ensemble_helpers/helper_classes.py + - path: ../lgc_ensemble_helpers/helper_functions.py + - path: ../lgc_ensemble_helpers/models.py + - path: ../lgc_ensemble_helpers/predict.py + - path: ../lgc_ensemble_helpers/prepare_data.py + - path: ../lgc_ensemble_helpers/train.py + - path: ../lgc_ensemble_helpers/divisor_finder.py + - path: ../../utils/anndata_to_dataframe.py -platforms: +engines: - type: docker image: ghcr.io/openproblems-bio/base_pytorch_nvidia:1.0.4 run_args: ["--shm-size=2g"] @@ -54,6 +53,8 @@ 
platforms: - torch-summary - type: native +runners: + - type: executable - type: nextflow directives: label: [hightime, veryhighmem, highcpu, highsharedmem, gpu] diff --git a/src/methods/lgc_ensemble_prepare/config.vsh.yaml b/src/methods/lgc_ensemble_prepare/config.vsh.yaml index 141beda3..4c6e2b27 100644 --- a/src/methods/lgc_ensemble_prepare/config.vsh.yaml +++ b/src/methods/lgc_ensemble_prepare/config.vsh.yaml @@ -1,61 +1,60 @@ -functionality: - name: lgc_ensemble_prepare - namespace: methods - arguments: - - name: --de_train_h5ad - type: file - required: false - direction: input - - name: --id_map - type: file - required: true - direction: input - - name: --layer - type: string - direction: input - default: clipped_sign_log10_pval - description: Which layer to use for prediction. - - name: --train_data_aug_dir - type: file - required: true - direction: output - - name: --epochs - type: integer - default: 250 - description: "Number of epochs to train." - info: - test_default: 1 - - name: --kf_n_splits - type: integer - default: 5 - description: "Number of splits for KFold." - info: - test_default: 2 - - name: --schemes - type: string - default: [initial, light, heavy] - multiple: true - info: - test_default: [initial, light] - - name: --models - type: string - default: [LSTM, GRU, Conv] - multiple: true - info: - test_default: [LSTM, GRU] - resources: - - type: python_script - path: script.py - - path: ../lgc_ensemble_helpers/helper_classes.py - - path: ../lgc_ensemble_helpers/helper_functions.py - - path: ../lgc_ensemble_helpers/models.py - - path: ../lgc_ensemble_helpers/predict.py - - path: ../lgc_ensemble_helpers/prepare_data.py - - path: ../lgc_ensemble_helpers/train.py - - path: ../lgc_ensemble_helpers/divisor_finder.py - - path: ../../utils/anndata_to_dataframe.py +name: lgc_ensemble_prepare +namespace: methods +arguments: + - name: --de_train_h5ad + type: file + required: false + direction: input + - name: --id_map + type: file + required: true + direction: input + - name: --layer + type: string + direction: input + default: clipped_sign_log10_pval + description: Which layer to use for prediction. + - name: --train_data_aug_dir + type: file + required: true + direction: output + - name: --epochs + type: integer + default: 250 + description: "Number of epochs to train." + info: + test_default: 1 + - name: --kf_n_splits + type: integer + default: 5 + description: "Number of splits for KFold." 
+ info: + test_default: 2 + - name: --schemes + type: string + default: [initial, light, heavy] + multiple: true + info: + test_default: [initial, light] + - name: --models + type: string + default: [LSTM, GRU, Conv] + multiple: true + info: + test_default: [LSTM, GRU] +resources: + - type: python_script + path: script.py + - path: ../lgc_ensemble_helpers/helper_classes.py + - path: ../lgc_ensemble_helpers/helper_functions.py + - path: ../lgc_ensemble_helpers/models.py + - path: ../lgc_ensemble_helpers/predict.py + - path: ../lgc_ensemble_helpers/prepare_data.py + - path: ../lgc_ensemble_helpers/train.py + - path: ../lgc_ensemble_helpers/divisor_finder.py + - path: ../../utils/anndata_to_dataframe.py -platforms: +engines: - type: docker image: ghcr.io/openproblems-bio/base_pytorch_nvidia:1.0.4 run_args: ["--shm-size=2g"] @@ -77,6 +76,8 @@ platforms: - torch-summary - type: native +runners: + - type: executable - type: nextflow directives: label: [hightime, veryhighmem, highcpu] diff --git a/src/methods/lgc_ensemble_train/config.vsh.yaml b/src/methods/lgc_ensemble_train/config.vsh.yaml index bb64ac65..2486ba3c 100644 --- a/src/methods/lgc_ensemble_train/config.vsh.yaml +++ b/src/methods/lgc_ensemble_train/config.vsh.yaml @@ -1,51 +1,50 @@ -functionality: - name: lgc_ensemble_train - namespace: methods - arguments: - - name: --train_data_aug_dir - type: file - required: true - direction: input - - name: --scheme - type: string - required: true - direction: input - description: "The scheme to use for training." - choices: [initial, light, heavy] - - name: --model - type: string - required: true - direction: input - description: "The name of the model to train." - choices: [LSTM, GRU, Conv] - - name: --fold - type: integer - required: true - direction: input - description: "The fold to train." - - name: --model_file - type: file - required: true - direction: output - example: model.pt - - name: --log_file - type: file - required: true - direction: output - example: log.json - resources: - - type: python_script - path: script.py - - path: ../lgc_ensemble_helpers/helper_classes.py - - path: ../lgc_ensemble_helpers/helper_functions.py - - path: ../lgc_ensemble_helpers/models.py - - path: ../lgc_ensemble_helpers/predict.py - - path: ../lgc_ensemble_helpers/prepare_data.py - - path: ../lgc_ensemble_helpers/train.py - - path: ../lgc_ensemble_helpers/divisor_finder.py - - path: ../../utils/anndata_to_dataframe.py +name: lgc_ensemble_train +namespace: methods +arguments: + - name: --train_data_aug_dir + type: file + required: true + direction: input + - name: --scheme + type: string + required: true + direction: input + description: "The scheme to use for training." + choices: [initial, light, heavy] + - name: --model + type: string + required: true + direction: input + description: "The name of the model to train." + choices: [LSTM, GRU, Conv] + - name: --fold + type: integer + required: true + direction: input + description: "The fold to train." 
+ - name: --model_file + type: file + required: true + direction: output + example: model.pt + - name: --log_file + type: file + required: true + direction: output + example: log.json +resources: + - type: python_script + path: script.py + - path: ../lgc_ensemble_helpers/helper_classes.py + - path: ../lgc_ensemble_helpers/helper_functions.py + - path: ../lgc_ensemble_helpers/models.py + - path: ../lgc_ensemble_helpers/predict.py + - path: ../lgc_ensemble_helpers/prepare_data.py + - path: ../lgc_ensemble_helpers/train.py + - path: ../lgc_ensemble_helpers/divisor_finder.py + - path: ../../utils/anndata_to_dataframe.py -platforms: +engines: - type: docker image: ghcr.io/openproblems-bio/base_pytorch_nvidia:1.0.4 run_args: ["--shm-size=2g"] @@ -67,6 +66,8 @@ platforms: - torch-summary - type: native +runners: + - type: executable - type: nextflow directives: label: [hightime, veryhighmem, highcpu, highsharedmem, gpu] diff --git a/src/methods/nn_retraining_with_pseudolabels/config.vsh.yaml b/src/methods/nn_retraining_with_pseudolabels/config.vsh.yaml index 70c4a341..461ba922 100644 --- a/src/methods/nn_retraining_with_pseudolabels/config.vsh.yaml +++ b/src/methods/nn_retraining_with_pseudolabels/config.vsh.yaml @@ -1,45 +1,44 @@ __merge__: ../../api/comp_method.yaml -functionality: - name: nn_retraining_with_pseudolabels - info: - label: NN retraining with pseudolabels - neurips2023_rank: 3 - summary: Neural networks with pseudolabeling and ensemble modelling - description: | - The prediction system is two staged, so I publish two versions of the notebook. - The first stage predicts pseudolabels. To be honest, if I stopped on this version, I would not be the third. - The predicted pseudolabels on all test data (255 rows) are added to training in the second stage. - - **Stage 1 preparing pseudolabels**: The main part of this system is a neural network. Every neural network and its environment was optimized by optuna. Hyperparameters that have been optimized: - a dropout value, a number of neurons in particular layers, an output dimension of an embedding layer, a number of epochs, a learning rate, a batch size, a number of dimension of truncated singular value decomposition. - The optimization was done on custom 4-folds cross validation. In order to avoid overfitting to cross validation by optuna I applied 2 repeats for every fold and took an average. Generally, the more, the better. The optuna's criterion was MRRMSE. - Finally, 7 models were ensembled. Optuna was applied again to determine best weights of linear combination. The prediction of test set is the pseudolabels now and will be used in second stage. - - **Stage 2 retraining with pseudolabels**: The pseudolabels (255 rows) were added to the training dataset. I applied 20 models with optimized parameters in different experiments for a model diversity. - Optuna selected optimal weights for the linear combination of the prediction again. - Models had high variance, so every model was trained 10 times on all dataset and the median of prediction is taken as a final prediction. The prediction was additionally clipped to colwise min and max. 
-    reference: null
-    documentation_url: https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/discussion/458750
-    repository_url: https://github.com/okon2000/single_cell_perturbations
+name: nn_retraining_with_pseudolabels
+info:
+  label: NN retraining with pseudolabels
+  neurips2023_rank: 3
+  summary: Neural networks with pseudolabeling and ensemble modelling
+  description: |
+    The prediction system is two-staged, so I publish two versions of the notebook.
+    The first stage predicts pseudolabels. To be honest, if I had stopped at this version, I would not have placed third.
+    The pseudolabels predicted for all test data (255 rows) are added to the training set in the second stage.
+
+    **Stage 1 (preparing pseudolabels)**: The main part of this system is a neural network. Every neural network and its environment was optimized by Optuna. The hyperparameters that were optimized:
+    the dropout value, the number of neurons in particular layers, the output dimension of the embedding layer, the number of epochs, the learning rate, the batch size, and the number of dimensions of the truncated singular value decomposition.
+    The optimization was done on a custom 4-fold cross-validation. To avoid Optuna overfitting to the cross-validation, I applied 2 repeats for every fold and took the average. Generally, the more, the better. Optuna's criterion was MRRMSE.
+    Finally, 7 models were ensembled, and Optuna was applied again to determine the best weights for their linear combination. The predictions on the test set are now the pseudolabels and will be used in the second stage.
+
+    **Stage 2 (retraining with pseudolabels)**: The pseudolabels (255 rows) were added to the training dataset. I applied 20 models with parameters optimized in different experiments for model diversity.
+    Optuna again selected optimal weights for the linear combination of the predictions.
+    The models had high variance, so every model was trained 10 times on the full dataset, and the median prediction is taken as the final prediction. The prediction was additionally clipped to the column-wise min and max.
+  reference: null
+  documentation_url: https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/discussion/458750
+  repository_url: https://github.com/okon2000/single_cell_perturbations

-  arguments:
-    - type: integer
-      name: --reps
-      default: 10
-      description: "Number of repetitions to train the model."
-      info:
-        # use only 1 rep during unit tests
-        test_default: 1
+arguments:
+  - type: integer
+    name: --reps
+    default: 10
+    description: "Number of repetitions to train the model." 
+ info: + # use only 1 rep during unit tests + test_default: 1 - resources: - - type: python_script - path: script.py - - path: notebook_264.py - - path: notebook_266.py - - path: ../../utils/anndata_to_dataframe.py +resources: + - type: python_script + path: script.py + - path: notebook_264.py + - path: notebook_266.py + - path: ../../utils/anndata_to_dataframe.py -platforms: +engines: - type: docker image: nvcr.io/nvidia/tensorflow:24.03-tf2-py3 setup: @@ -59,6 +58,8 @@ platforms: - fastparquet - type: native +runners: + - type: executable - type: nextflow directives: label: [ hightime, midmem, highcpu, gpu, midsharedmem ] diff --git a/src/methods/pyboost/config.vsh.yaml b/src/methods/pyboost/config.vsh.yaml index 85432847..a6556c5e 100644 --- a/src/methods/pyboost/config.vsh.yaml +++ b/src/methods/pyboost/config.vsh.yaml @@ -1,43 +1,42 @@ __merge__: ../../api/comp_method.yaml -functionality: - name: pyboost - info: - label: Py-boost - neurips2023_rank: 18 - summary: "Py-boost predicting t-scores" - description: | - An ensemble of four models was considered: - - * Py-boost (a ridge regression-based recommender system) - * ExtraTrees (a decision tree ensemble with target-encoded features) - * a k-nearest neighbors recommender system - * a ridge regression model +name: pyboost +info: + label: Py-boost + neurips2023_rank: 18 + summary: "Py-boost predicting t-scores" + description: | + An ensemble of four models was considered: + + * Py-boost (a ridge regression-based recommender system) + * ExtraTrees (a decision tree ensemble with target-encoded features) + * a k-nearest neighbors recommender system + * a ridge regression model - Each model offered distinct strengths and weaknesses: ExtraTrees and - knn were unable to extrapolate beyond the training data, while ridge - regression provided extrapolation capability. To enhance model performance, - data augmentation techniques were used, including averaging differential - expressions for compound mixtures and adjusting cell counts to reduce biases. + Each model offered distinct strengths and weaknesses: ExtraTrees and + knn were unable to extrapolate beyond the training data, while ridge + regression provided extrapolation capability. To enhance model performance, + data augmentation techniques were used, including averaging differential + expressions for compound mixtures and adjusting cell counts to reduce biases. - In the end, only the py-boost model is used for generating predictions. - documentation_url: https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/discussion/458661 - repository_url: https://github.com/Ambros-M/Single-Cell-Perturbations-2023 - arguments: - - type: string - name: --predictor_names - multiple: true - choices: [py_boost, ridge_recommender, knn_recommender, predict_extratrees] - default: [py_boost] - description: Which predictor(s) to use. - info: - test_default: [knn_recommender] - resources: - - type: python_script - path: script.py - - path: helper.py - - path: ../../utils/anndata_to_dataframe.py -platforms: + In the end, only the py-boost model is used for generating predictions. + documentation_url: https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/discussion/458661 + repository_url: https://github.com/Ambros-M/Single-Cell-Perturbations-2023 +arguments: + - type: string + name: --predictor_names + multiple: true + choices: [py_boost, ridge_recommender, knn_recommender, predict_extratrees] + default: [py_boost] + description: Which predictor(s) to use. 
+    info:
+      test_default: [knn_recommender]
+resources:
+  - type: python_script
+    path: script.py
+  - path: helper.py
+  - path: ../../utils/anndata_to_dataframe.py
-platforms:
+engines:
  - type: docker
    image: ghcr.io/openproblems-bio/base_pytorch_nvidia:1.0.4
    setup:
@@ -46,6 +45,8 @@ platforms:
      - colorama
      - py-boost==0.4.3
  - type: native
+runners:
+  - type: executable
  - type: nextflow
    directives:
      label: [midtime,midmem,midcpu,gpu]
\ No newline at end of file
diff --git a/src/methods/scape/config.vsh.yaml b/src/methods/scape/config.vsh.yaml
index fea2b014..9f774699 100644
--- a/src/methods/scape/config.vsh.yaml
+++ b/src/methods/scape/config.vsh.yaml
@@ -1,68 +1,67 @@
 __merge__: ../../api/comp_method.yaml
-functionality:
-  name: scape
-  info:
-    label: ScAPE
-    neurips2023_rank: 16
-    summary: Neural network model for drug effect prediction
-    description: |
-      ScAPE is utilises a neural network (NN) model to estimate drug effects on gene expression in
-      peripheral blood mononuclear cells (PBMCs). The model took drug and cell features as input,
-      with these features primarily derived from the median of signed log-pvalues and log fold-changes
-      grouped by drug and cell type. The NN was trained using a leave-one-drug-out cross-validation
-      strategy, focusing on NK cells as a representative cell type due to their similarity to B cells
-      and Myeloid cells in principal component analysis. Model performance was evaluated by comparing
-      its predictions against two baselines: predicting zero effect and predicting the median
-      log-pvalue for each drug. The final submission combined predictions from models trained on
-      different gene and drug subsets, aiming to enhance overall prediction accuracy.
-    reference: pablormier2023scape
-    documentation_url: https://docs.google.com/document/d/1w0GIJ8VoQx3HEJNmLXoU-Y_STB-h5-bXusL80_6EVuU/edit
-    repository_url: https://github.com/scapeML/scape
-  arguments:
-    - type: string
-      name: --cell
-      description: Pre-defined cell type held for pre-training.
-      required: false
-      default: NK cells
-    - type: integer
-      name: --epochs
-      description: Number of epochs for coarse training.
-      default: 300
-      info:
-        test_default: 2
-    - type: integer
-      name: --epochs_enhanced
-      description: Number of epochs for enhanced training.
-      default: 800
-      info:
-        test_default: 2
-    - type: integer
-      name: --n_genes
-      description: The number of genes for coarse training.
-      default: 64
-      info:
-        test_default: 10
-    - type: integer
-      name: --n_genes_enhanced
-      description: The number of genes for enhanced training.
-      default: 256
-      info:
-        test_default: 10
-    - type: integer
-      name: --n_drugs
-      description: The number of drugs to consider for coarse training. If none, all drugs are considered.
-      info:
-        test_default: 5
-    - type: integer
-      name: --min_n_top_drugs
-      description: The minimum number of top drugs to consider.
-      default: 50
-      info:
-        test_default: 0
-  resources:
-    - type: python_script
-      path: script.py
+name: scape
+info:
+  label: ScAPE
+  neurips2023_rank: 16
+  summary: Neural network model for drug effect prediction
+  description: |
+    ScAPE utilises a neural network (NN) model to estimate drug effects on gene expression in
+    peripheral blood mononuclear cells (PBMCs). The model took drug and cell features as input,
+    with these features primarily derived from the median of signed log-pvalues and log fold-changes
+    grouped by drug and cell type. 
The NN was trained using a leave-one-drug-out cross-validation + strategy, focusing on NK cells as a representative cell type due to their similarity to B cells + and Myeloid cells in principal component analysis. Model performance was evaluated by comparing + its predictions against two baselines: predicting zero effect and predicting the median + log-pvalue for each drug. The final submission combined predictions from models trained on + different gene and drug subsets, aiming to enhance overall prediction accuracy. + reference: pablormier2023scape + documentation_url: https://docs.google.com/document/d/1w0GIJ8VoQx3HEJNmLXoU-Y_STB-h5-bXusL80_6EVuU/edit + repository_url: https://github.com/scapeML/scape +arguments: + - type: string + name: --cell + description: Pre-defined cell type held for pre-training. + required: false + default: NK cells + - type: integer + name: --epochs + description: Number of epochs for coarse training. + default: 300 + info: + test_default: 2 + - type: integer + name: --epochs_enhanced + description: Number of epochs for enhanced training. + default: 800 + info: + test_default: 2 + - type: integer + name: --n_genes + description: The number of genes for coarse training. + default: 64 + info: + test_default: 10 + - type: integer + name: --n_genes_enhanced + description: The number of genes for enhanced training. + default: 256 + info: + test_default: 10 + - type: integer + name: --n_drugs + description: The number of drugs to consider for coarse training. If none, all drugs are considered. + info: + test_default: 5 + - type: integer + name: --min_n_top_drugs + description: The minimum number of top drugs to consider. + default: 50 + info: + test_default: 0 +resources: + - type: python_script + path: script.py +engines: - type: docker image: nvcr.io/nvidia/tensorflow:24.03-tf2-py3 setup: @@ -83,6 +82,8 @@ platforms: - scikit-learn~=1.2.2 - fastparquet~=2023.10.1 - git+https://github.com/scapeML/scape.git +runners: + - type: executable - type: nextflow directives: label: [ hightime, highmem, highcpu, gpu, midsharedmem ] \ No newline at end of file diff --git a/src/methods/transformer_ensemble/config.vsh.yaml b/src/methods/transformer_ensemble/config.vsh.yaml index 057f06fb..7d529f01 100644 --- a/src/methods/transformer_ensemble/config.vsh.yaml +++ b/src/methods/transformer_ensemble/config.vsh.yaml @@ -1,50 +1,49 @@ __merge__: ../../api/comp_method.yaml -functionality: - name: transformer_ensemble - info: - label: Transformer Ensemble - neurips2023_rank: 2 - summary: An ensemble of four transformer models, trained on diverse feature sets, with a cluster-based sampling strategy and robust validation for optimal performance. - description: | - This method employs an ensemble of four transformer models, - each with different weights and trained on slightly varying feature sets. - The feature engineering process involved one-hot encoding of categorical labels, - target encoding using mean and standard deviation, and enriching the feature set - with the standard deviation of target variables. Additionally, the dataset was - carefully examined to ensure data cleanliness. A sophisticated sampling strategy - based on K-Means clustering was employed to partition the data into training and - validation sets, ensuring a representative distribution. The model architecture - leveraged sparse and dense feature encoding, along with a transformer for effective - learning. 
- documentation_url: https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/discussion/458738 - repository_url: https://github.com/Eliorkalfon/single_cell_pb - arguments: - - name: --num_train_epochs - type: integer - default: 20000 - description: "Number of training epochs." - info: - test_default: 10 - - name: --d_model - type: integer - default: 128 - description: "Dimensionality of the model." - - name: --batch_size - type: integer - default: 32 - description: "Batch size." - - name: --early_stopping - type: integer - default: 5000 - description: "Number of epochs to wait for early stopping." - resources: - - type: python_script - path: script.py - - path: models.py - - path: utils.py - - path: train.py -platforms: +name: transformer_ensemble +info: + label: Transformer Ensemble + neurips2023_rank: 2 + summary: An ensemble of four transformer models, trained on diverse feature sets, with a cluster-based sampling strategy and robust validation for optimal performance. + description: | + This method employs an ensemble of four transformer models, + each with different weights and trained on slightly varying feature sets. + The feature engineering process involved one-hot encoding of categorical labels, + target encoding using mean and standard deviation, and enriching the feature set + with the standard deviation of target variables. Additionally, the dataset was + carefully examined to ensure data cleanliness. A sophisticated sampling strategy + based on K-Means clustering was employed to partition the data into training and + validation sets, ensuring a representative distribution. The model architecture + leveraged sparse and dense feature encoding, along with a transformer for effective + learning. + documentation_url: https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/discussion/458738 + repository_url: https://github.com/Eliorkalfon/single_cell_pb +arguments: + - name: --num_train_epochs + type: integer + default: 20000 + description: "Number of training epochs." + info: + test_default: 10 + - name: --d_model + type: integer + default: 128 + description: "Dimensionality of the model." + - name: --batch_size + type: integer + default: 32 + description: "Batch size." + - name: --early_stopping + type: integer + default: 5000 + description: "Number of epochs to wait for early stopping." +resources: + - type: python_script + path: script.py + - path: models.py + - path: utils.py + - path: train.py +engines: - type: docker image: ghcr.io/openproblems-bio/base_pytorch_nvidia:1.0.4 setup: @@ -59,6 +58,8 @@ platforms: - matplotlib~=3.5.0 - PyYAML~=6.0.1 - lion-pytorch +runners: + - type: executable - type: nextflow directives: label: [ midtime, veryhighmem, highcpu, gpu ] diff --git a/src/metrics/mean_rowwise_correlation/config.vsh.yaml b/src/metrics/mean_rowwise_correlation/config.vsh.yaml index 19fcbb05..926d39a1 100644 --- a/src/metrics/mean_rowwise_correlation/config.vsh.yaml +++ b/src/metrics/mean_rowwise_correlation/config.vsh.yaml @@ -1,65 +1,66 @@ __merge__: ../../api/comp_metric.yaml -functionality: - name: mean_rowwise_correlation - info: - metrics: - - name: mean_rowwise_pearson - label: Mean Rowwise Pearson - summary: The mean of Pearson correlations per row (perturbation). - description: | - The **Mean Pearson Correlation** is computed as follows: +name: mean_rowwise_correlation +info: + metrics: + - name: mean_rowwise_pearson + label: Mean Rowwise Pearson + summary: The mean of Pearson correlations per row (perturbation). 
+      description: |
+        The **Mean Pearson Correlation** is computed as follows:

-          $$
-          \textrm{Mean-Pearson} = \frac{1}{R}\sum_{i=1}^R\frac{\textrm{Cov}(\mathbf{y}_i, \mathbf{\hat{y}}_i)}{\textrm{Var}(\mathbf{y}_i) \cdot \textrm{Var}(\mathbf{\hat{y}}_i)}
-          $$
+        $$
+        \textrm{Mean-Pearson} = \frac{1}{R}\sum_{i=1}^R\frac{\textrm{Cov}(\mathbf{y}_i, \mathbf{\hat{y}}_i)}{\sqrt{\textrm{Var}(\mathbf{y}_i) \cdot \textrm{Var}(\mathbf{\hat{y}}_i)}}
+        $$

-          where $(R)$ is the number of scored rows, and $(\mathbf{y}_i)$ and $(\mathbf{\hat{y}}_i)$ are the actual and predicted values, respectively, for row $(i)$.
-        repository_url: null
-        documentation_url: null
-        min: -1
-        max: 1
-        maximize: true
-      - name: mean_rowwise_spearman
-        label: Mean Rowwise Spearman
-        summary: The mean of Spearman correlations per row (perturbation).
-        description: |
-          The **Mean Spearman Correlation** is computed as follows:
+        where $(R)$ is the number of scored rows, and $(\mathbf{y}_i)$ and $(\mathbf{\hat{y}}_i)$ are the actual and predicted values, respectively, for row $(i)$.
+      repository_url: null
+      documentation_url: null
+      min: -1
+      max: 1
+      maximize: true
+    - name: mean_rowwise_spearman
+      label: Mean Rowwise Spearman
+      summary: The mean of Spearman correlations per row (perturbation).
+      description: |
+        The **Mean Spearman Correlation** is computed as follows:

-          $$
-          \textrm{Mean-Pearson} = \frac{1}{R}\sum_{i=1}^R\frac{\textrm{Cov}(\mathbf{r}_i, \mathbf{\hat{r}}_i)}{\textrm{Var}(\mathbf{r}_i) \cdot \textrm{Var}(\mathbf{\hat{r}}_i)}
-          $$
+        $$
+        \textrm{Mean-Spearman} = \frac{1}{R}\sum_{i=1}^R\frac{\textrm{Cov}(\mathbf{r}_i, \mathbf{\hat{r}}_i)}{\sqrt{\textrm{Var}(\mathbf{r}_i) \cdot \textrm{Var}(\mathbf{\hat{r}}_i)}}
+        $$

-          where $(R)$ is the number of scored rows, and $(\mathbf{r}_i)$ and $(\mathbf{\hat{r}}_i)$ are the ranks of the actual and predicted values, respectively, for row $(i)$.
-        repository_url: null
-        documentation_url: null
-        min: -1
-        max: 1
-        maximize: true
-      - name: mean_rowwise_cosine
-        label: Mean Rowwise Cosine
-        summary: The mean of cosine similarities per row (perturbation).
-        description: |
-          The **Mean Cosine Similarity** is computed as follows:
+        where $(R)$ is the number of scored rows, and $(\mathbf{r}_i)$ and $(\mathbf{\hat{r}}_i)$ are the ranks of the actual and predicted values, respectively, for row $(i)$.
+      repository_url: null
+      documentation_url: null
+      min: -1
+      max: 1
+      maximize: true
+    - name: mean_rowwise_cosine
+      label: Mean Rowwise Cosine
+      summary: The mean of cosine similarities per row (perturbation).
+      description: |
+        The **Mean Cosine Similarity** is computed as follows:

-          $$
-          \textrm{Mean-Cosine} = \frac{1}{R}\sum_{i=1}^R\frac{\mathbf{y}_i\cdot \mathbf{\hat{y}}_i}{\|\mathbf{y}_i\| \|\mathbf{\hat{y}}_i\|}
-          $$
+        $$
+        \textrm{Mean-Cosine} = \frac{1}{R}\sum_{i=1}^R\frac{\mathbf{y}_i\cdot \mathbf{\hat{y}}_i}{\|\mathbf{y}_i\| \|\mathbf{\hat{y}}_i\|}
+        $$

-          where $(R)$ is the number of scored rows, and $(\mathbf{y}_i)$ and $(\mathbf{\hat{y}}_i)$ are the actual and predicted values, respectively, for row $(i)$.
-        repository_url: null
-        documentation_url: null
-        min: -1
-        max: 1
-        maximize: true
-      resources:
-        - type: r_script
-          path: script.R
-platforms:
+        where $(R)$ is the number of scored rows, and $(\mathbf{y}_i)$ and $(\mathbf{\hat{y}}_i)$ are the actual and predicted values, respectively, for row $(i)$.
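The scoring component itself ships as an R script (script.R, using proxyC), but the three formulas above are straightforward to sanity-check in Python. A minimal sketch, assuming `y` and `y_hat` are `(R, n)` NumPy arrays of observed and predicted values; the function name is hypothetical:

```python
# Illustrative reimplementation of the three row-wise metrics defined above.
import numpy as np
from scipy.stats import pearsonr, spearmanr

def mean_rowwise_correlations(y, y_hat):
    # Correlation per row (perturbation), then averaged over rows.
    pearson = np.mean([pearsonr(a, b)[0] for a, b in zip(y, y_hat)])
    spearman = np.mean([spearmanr(a, b)[0] for a, b in zip(y, y_hat)])
    # Cosine similarity per row, then averaged over rows.
    cosine = np.mean(np.sum(y * y_hat, axis=1) /
                     (np.linalg.norm(y, axis=1) * np.linalg.norm(y_hat, axis=1)))
    return {"mean_rowwise_pearson": pearson,
            "mean_rowwise_spearman": spearman,
            "mean_rowwise_cosine": cosine}
```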
+      repository_url: null
+      documentation_url: null
+      min: -1
+      max: 1
+      maximize: true
+resources:
+  - type: r_script
+    path: script.R
+engines:
   - type: docker
     image: ghcr.io/openproblems-bio/base_r:1.0.4
     setup:
       - type: r
         packages: proxyC
+runners:
+  - type: executable
   - type: nextflow
     directives:
       label: [ midtime, highmem, highcpu ]
\ No newline at end of file
diff --git a/src/metrics/mean_rowwise_error/config.vsh.yaml b/src/metrics/mean_rowwise_error/config.vsh.yaml
index 15d8317d..d1589a1a 100644
--- a/src/metrics/mean_rowwise_error/config.vsh.yaml
+++ b/src/metrics/mean_rowwise_error/config.vsh.yaml
@@ -1,49 +1,50 @@
 __merge__: ../../api/comp_metric.yaml
-functionality:
-  name: mean_rowwise_error
-  info:
-    metrics:
-      - name: mean_rowwise_rmse
-        label: Mean Rowwise RMSE
-        summary: The mean of the root mean squared error (RMSE) of each row in the matrix.
-        description: |
-          We use the **Mean Rowwise Root Mean Squared Error** to score submissions, computed as follows:
-
-          $$
-          \textrm{MRRMSE} = \frac{1}{R}\sum_{i=1}^R\left(\frac{1}{n} \sum_{j=1}^{n} (y_{ij} - \widehat{y}_{ij})^2\right)^{1/2}
-          $$
-
-          where $(R)$ is the number of scored rows, and $(y_{ij})$ and $(\widehat{y}_{ij})$ are the actual and predicted values, respectively, for row $(i)$ and column $(j)$, and $(n)$ bis the number of columns.
-        repository_url: null
-        documentation_url: null
-        min: 0
-        max: "+inf"
-        maximize: false
-      - name: mean_rowwise_mae
-        label: Mean Rowwise MAE
-        summary: The mean of the absolute error (MAE) of each row in the matrix.
-        description: |
-          We use the **Mean Rowwise Absolute Error** to score submissions, computed as follows:
-
-          $$
-          \textrm{MRMAE} = \frac{1}{R}\sum_{i=1}^R\left(\frac{1}{n} \sum_{j=1}^{n} |y_{ij} - \widehat{y}_{ij}|\right)
-          $$
-
-          where $(R)$ is the number of scored rows, and $(y_{ij})$ and $(\widehat{y}_{ij})$ are the actual and predicted values, respectively, for row $(i)$ and column $(j)$, and $(n)$ bis the number of columns.
+name: mean_rowwise_error
+info:
+  metrics:
+    - name: mean_rowwise_rmse
+      label: Mean Rowwise RMSE
+      summary: The mean of the root mean squared error (RMSE) of each row in the matrix.
+      description: |
+        We use the **Mean Rowwise Root Mean Squared Error** to score submissions, computed as follows:
+
+        $$
+        \textrm{MRRMSE} = \frac{1}{R}\sum_{i=1}^R\left(\frac{1}{n} \sum_{j=1}^{n} (y_{ij} - \widehat{y}_{ij})^2\right)^{1/2}
+        $$
+
+        where $(R)$ is the number of scored rows, and $(y_{ij})$ and $(\widehat{y}_{ij})$ are the actual and predicted values, respectively, for row $(i)$ and column $(j)$, and $(n)$ is the number of columns.
+      repository_url: null
+      documentation_url: null
+      min: 0
+      max: "+inf"
+      maximize: false
+    - name: mean_rowwise_mae
+      label: Mean Rowwise MAE
+      summary: The mean of the absolute error (MAE) of each row in the matrix.
+      description: |
+        We use the **Mean Rowwise Absolute Error** to score submissions, computed as follows:
+
+        $$
+        \textrm{MRMAE} = \frac{1}{R}\sum_{i=1}^R\left(\frac{1}{n} \sum_{j=1}^{n} |y_{ij} - \widehat{y}_{ij}|\right)
+        $$
+
+        where $(R)$ is the number of scored rows, and $(y_{ij})$ and $(\widehat{y}_{ij})$ are the actual and predicted values, respectively, for row $(i)$ and column $(j)$, and $(n)$ is the number of columns.
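As with the correlation metrics, the shipped implementation is an R script, but the two error metrics defined in this file reduce to a couple of lines of NumPy each. A sketch for reference (function names hypothetical):

```python
# Mean Rowwise RMSE and Mean Rowwise MAE, as defined above.
import numpy as np

def mrrmse(y, y_hat):
    # Root-mean-square error per row, then averaged over rows.
    return np.mean(np.sqrt(np.mean((y - y_hat) ** 2, axis=1)))

def mrmae(y, y_hat):
    # Mean absolute error per row, then averaged over rows.
    return np.mean(np.mean(np.abs(y - y_hat), axis=1))
```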
- repository_url: null - documentation_url: null - min: 0 - max: "+inf" - maximize: false - resources: - - type: r_script - path: script.R -platforms: + repository_url: null + documentation_url: null + min: 0 + max: "+inf" + maximize: false +resources: + - type: r_script + path: script.R +engines: - type: docker image: ghcr.io/openproblems-bio/base_r:1.0.4 setup: - type: r packages: proxyC +runners: + - type: executable - type: nextflow directives: label: [ midtime, highmem, highcpu ] \ No newline at end of file diff --git a/src/process_dataset/add_uns_metadata/config.vsh.yaml b/src/process_dataset/add_uns_metadata/config.vsh.yaml index 6244a417..2a9a95f6 100644 --- a/src/process_dataset/add_uns_metadata/config.vsh.yaml +++ b/src/process_dataset/add_uns_metadata/config.vsh.yaml @@ -1,58 +1,59 @@ -functionality: - name: add_uns_metadata - namespace: "process_dataset" - info: - type: process_dataset - type_info: - label: Add metadata - summary: Add metadata to the pseudobulked data - description: | - Add metadata to the pseudobulked single-cell dataset for the perturbation regression task. - arguments: - - name: --input - type: file - required: true - direction: input - example: resources/neurips-2023-raw/pseudobulk_cleaned.h5ad - - name: "--dataset_id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: true - - name: --output - type: file - required: true - direction: output - example: resources/neurips-2023-data/pseudobulk_uns.h5ad - resources: - - type: python_script - path: script.py -platforms: +name: add_uns_metadata +namespace: "process_dataset" +info: + type: process_dataset + type_info: + label: Add metadata + summary: Add metadata to the pseudobulked data + description: | + Add metadata to the pseudobulked single-cell dataset for the perturbation regression task. +arguments: + - name: --input + type: file + required: true + direction: input + example: resources/neurips-2023-raw/pseudobulk_cleaned.h5ad + - name: "--dataset_id" + type: string + description: Unique identifier of the dataset. + required: true + - name: "--dataset_name" + type: string + description: Nicely formatted name. + required: true + - name: "--dataset_url" + type: string + description: Link to the original source of the dataset. + required: false + - name: "--dataset_reference" + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: "--dataset_summary" + type: string + description: Short description of the dataset. + required: true + - name: "--dataset_description" + type: string + description: Long description of the dataset. + required: true + - name: "--dataset_organism" + type: string + description: The organism of the dataset. 
+ required: true + - name: --output + type: file + required: true + direction: output + example: resources/neurips-2023-data/pseudobulk_uns.h5ad +resources: + - type: python_script + path: script.py +engines: - type: docker image: ghcr.io/openproblems-bio/base_python:1.0.4 +runners: + - type: executable - type: nextflow directives: label: [ midtime, midmem, lowcpu ] diff --git a/src/process_dataset/bootstrap/config.vsh.yaml b/src/process_dataset/bootstrap/config.vsh.yaml index 391ddca6..70c8794c 100644 --- a/src/process_dataset/bootstrap/config.vsh.yaml +++ b/src/process_dataset/bootstrap/config.vsh.yaml @@ -1,61 +1,62 @@ -functionality: - name: bootstrap - namespace: "process_dataset" - info: - type: process_dataset - type_info: - label: Bootstrap - summary: Bootstrap a dataset - description: | - This component bootstraps a dataset. - argument_groups: - - name: Inputs - arguments: - - name: --input +name: bootstrap +namespace: "process_dataset" +info: + type: process_dataset + type_info: + label: Bootstrap + summary: Bootstrap a dataset + description: | + This component bootstraps a dataset. +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + required: true + direction: input + example: resources/neurips-2023-raw/sc_counts_reannotated_with_counts.h5ad + - name: Outputs + arguments: + - name: --output type: file required: true - direction: input - example: resources/neurips-2023-raw/sc_counts_reannotated_with_counts.h5ad - - name: Outputs - arguments: - - name: --output - type: file - required: true - direction: output - example: sc_counts_bootstrap.h5ad - - name: Sampling parameters - description: Parameters for sampling the bootstraps. - arguments: - - name: --bootstrap_obs - type: boolean - default: true - description: Whether to sample observations. - - name: --obs_fraction - type: double - default: 1 - description: Fraction of the obs of the sc_counts to include in each bootstrap. - - name: --obs_replace - type: boolean - default: true - description: Whether to sample with replacement. - - name: --bootstrap_var - type: boolean - default: false - description: Whether to sample variables. - - name: --var_fraction - type: double - default: 1 - description: Fraction of the var of the sc_counts to include in each bootstrap. - - name: --var_replace - type: boolean - default: true - description: Whether to sample with replacement. - resources: - - type: python_script - path: script.py -platforms: + direction: output + example: sc_counts_bootstrap.h5ad + - name: Sampling parameters + description: Parameters for sampling the bootstraps. + arguments: + - name: --bootstrap_obs + type: boolean + default: true + description: Whether to sample observations. + - name: --obs_fraction + type: double + default: 1 + description: Fraction of the obs of the sc_counts to include in each bootstrap. + - name: --obs_replace + type: boolean + default: true + description: Whether to sample with replacement. + - name: --bootstrap_var + type: boolean + default: false + description: Whether to sample variables. + - name: --var_fraction + type: double + default: 1 + description: Fraction of the var of the sc_counts to include in each bootstrap. + - name: --var_replace + type: boolean + default: true + description: Whether to sample with replacement. 
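The sampling semantics these arguments describe can be sketched as follows. The actual logic lives in script.py; this AnnData/NumPy version is only illustrative, and details such as the rounding of fractions are assumptions:

```python
# Hypothetical sketch of one bootstrap draw over observations and variables.
import anndata as ad
import numpy as np

def bootstrap_sample(adata, bootstrap_obs=True, obs_fraction=1.0, obs_replace=True,
                     bootstrap_var=False, var_fraction=1.0, var_replace=True, seed=0):
    rng = np.random.default_rng(seed)
    obs_idx = (rng.choice(adata.n_obs, int(adata.n_obs * obs_fraction), replace=obs_replace)
               if bootstrap_obs else np.arange(adata.n_obs))
    var_idx = (rng.choice(adata.n_vars, int(adata.n_vars * var_fraction), replace=var_replace)
               if bootstrap_var else np.arange(adata.n_vars))
    return adata[obs_idx, :][:, var_idx].copy()
```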
+resources: + - type: python_script + path: script.py +engines: - type: docker image: ghcr.io/openproblems-bio/base_python:1.0.4 +runners: + - type: executable - type: nextflow directives: label: [ midtime, highmem, midcpu ] diff --git a/src/process_dataset/compute_pseudobulk/config.vsh.yaml b/src/process_dataset/compute_pseudobulk/config.vsh.yaml index d8cd9aaf..23c61213 100644 --- a/src/process_dataset/compute_pseudobulk/config.vsh.yaml +++ b/src/process_dataset/compute_pseudobulk/config.vsh.yaml @@ -1,33 +1,34 @@ -functionality: - name: compute_pseudobulk - namespace: "process_dataset" - info: - type: process_dataset - type_info: - label: Pseudobulk - summary: Compute pseudobulk data - description: | - Compute pseudobulk data for the perturbation regression task. - arguments: - - name: --input - type: file - required: true - direction: input - example: resources/neurips-2023-raw/sc_counts.h5ad - - name: --output - type: file - required: true - direction: output - example: resources/neurips-2023-data/pseudobulk.h5ad - resources: - - type: python_script - path: script.py -platforms: +name: compute_pseudobulk +namespace: "process_dataset" +info: + type: process_dataset + type_info: + label: Pseudobulk + summary: Compute pseudobulk data + description: | + Compute pseudobulk data for the perturbation regression task. +arguments: + - name: --input + type: file + required: true + direction: input + example: resources/neurips-2023-raw/sc_counts.h5ad + - name: --output + type: file + required: true + direction: output + example: resources/neurips-2023-data/pseudobulk.h5ad +resources: + - type: python_script + path: script.py +engines: - type: docker image: ghcr.io/openproblems-bio/base_python:1.0.4 setup: - type: python packages: [ pyarrow ] +runners: + - type: executable - type: nextflow directives: label: [ midtime, midmem, lowcpu ] diff --git a/src/process_dataset/convert_h5ad_to_parquet/config.vsh.yaml b/src/process_dataset/convert_h5ad_to_parquet/config.vsh.yaml index 2a703694..952bdcec 100644 --- a/src/process_dataset/convert_h5ad_to_parquet/config.vsh.yaml +++ b/src/process_dataset/convert_h5ad_to_parquet/config.vsh.yaml @@ -1,49 +1,50 @@ -functionality: - name: convert_h5ad_to_parquet - namespace: "process_dataset" - info: - type: process_dataset - type_info: - label: Split dataset - summary: Split dataset into training and test parquet files - description: | - Split dataset into training and test parquet files. - arguments: - - name: --input_train - type: file - required: true - direction: input - example: resources/neurips-2023-data/de_train.h5ad - - name: --input_test - type: file - required: true - direction: input - example: resources/neurips-2023-data/de_test.h5ad - - name: --output_train - type: file - required: true - direction: output - example: resources/neurips-2023-data/de_train.parquet - - name: --output_test - type: file - required: true - direction: output - example: resources/neurips-2023-data/de_test.parquet - - name: --output_id_map - type: file - required: true - direction: output - example: resources/neurips-2023-data/id_map.csv - resources: - - type: python_script - path: script.py - - path: ../../utils/anndata_to_dataframe.py -platforms: +name: convert_h5ad_to_parquet +namespace: "process_dataset" +info: + type: process_dataset + type_info: + label: Split dataset + summary: Split dataset into training and test parquet files + description: | + Split dataset into training and test parquet files. 
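Conceptually, the conversion whose arguments follow flattens an AnnData layer into a wide data frame and writes it to parquet. The real component delegates to the anndata_to_dataframe.py helper listed in its resources; the sketch below assumes a dense layer and a hypothetical column layout:

```python
# Rough sketch of an h5ad -> parquet conversion for the training split.
import anndata as ad
import pandas as pd

adata = ad.read_h5ad("de_train.h5ad")
# One column per gene, one row per observation; assumes the layer is dense
# and uses the benchmark's default layer name.
values = pd.DataFrame(adata.layers["clipped_sign_log10_pval"],
                      columns=adata.var_names)
df = pd.concat([adata.obs.reset_index(drop=True), values], axis=1)
df.to_parquet("de_train.parquet", engine="fastparquet")
```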
+arguments: + - name: --input_train + type: file + required: true + direction: input + example: resources/neurips-2023-data/de_train.h5ad + - name: --input_test + type: file + required: true + direction: input + example: resources/neurips-2023-data/de_test.h5ad + - name: --output_train + type: file + required: true + direction: output + example: resources/neurips-2023-data/de_train.parquet + - name: --output_test + type: file + required: true + direction: output + example: resources/neurips-2023-data/de_test.parquet + - name: --output_id_map + type: file + required: true + direction: output + example: resources/neurips-2023-data/id_map.csv +resources: + - type: python_script + path: script.py + - path: ../../utils/anndata_to_dataframe.py +engines: - type: docker image: ghcr.io/openproblems-bio/base_python:1.0.4 setup: - type: python packages: [ fastparquet, anndata, pandas ] +runners: + - type: executable - type: nextflow directives: label: [ midtime, midmem, lowcpu ] diff --git a/src/process_dataset/convert_kaggle_h5ad_to_parquet/config.vsh.yaml b/src/process_dataset/convert_kaggle_h5ad_to_parquet/config.vsh.yaml index 858a7060..80f48205 100644 --- a/src/process_dataset/convert_kaggle_h5ad_to_parquet/config.vsh.yaml +++ b/src/process_dataset/convert_kaggle_h5ad_to_parquet/config.vsh.yaml @@ -1,87 +1,88 @@ -functionality: - name: convert_kaggle_h5ad_to_parquet - namespace: "process_dataset" - info: - type: process_dataset - type_info: - label: H5AD to Parquet - summary: Convert Kaggle h5ad to parquet - description: | - Convert dataset from h5ad files into training and test parquet files. - argument_groups: - - name: Inputs - arguments: - - name: --input_train +name: convert_kaggle_h5ad_to_parquet +namespace: "process_dataset" +info: + type: process_dataset + type_info: + label: H5AD to Parquet + summary: Convert Kaggle h5ad to parquet + description: | + Convert dataset from h5ad files into training and test parquet files. +argument_groups: + - name: Inputs + arguments: + - name: --input_train + type: file + required: true + direction: input + example: resources/neurips-2023-kaggle/12_de_by_cell_type_train.h5ad + - name: --input_test + type: file + required: true + direction: input + example: resources/neurips-2023-kaggle/12_de_by_cell_type_test.h5ad + - name: --input_single_cell_h5ad + type: file + required: true + direction: input + example: resources/neurips-2023-raw/sc_counts.h5ad + - name: Metadata + arguments: + - name: "--dataset_id" + type: string + description: Unique identifier of the dataset. + required: true + - name: "--dataset_name" + type: string + description: Nicely formatted name. + required: true + - name: "--dataset_url" + type: string + description: Link to the original source of the dataset. + required: false + - name: "--dataset_reference" + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: "--dataset_summary" + type: string + description: Short description of the dataset. + required: true + - name: "--dataset_description" + type: string + description: Long description of the dataset. + required: true + - name: "--dataset_organism" + type: string + description: The organism of the dataset. 
+ required: true + - name: Outputs + arguments: + - name: --output_train_h5ad type: file required: true - direction: input - example: resources/neurips-2023-kaggle/12_de_by_cell_type_train.h5ad - - name: --input_test + direction: output + example: resources/neurips-2023-kaggle/de_train.h5ad + - name: --output_test_h5ad type: file required: true - direction: input - example: resources/neurips-2023-kaggle/12_de_by_cell_type_test.h5ad - - name: --input_single_cell_h5ad + direction: output + example: resources/neurips-2023-kaggle/de_test.h5ad + - name: --output_id_map type: file required: true - direction: input - example: resources/neurips-2023-raw/sc_counts.h5ad - - name: Metadata - arguments: - - name: "--dataset_id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: true - - name: Outputs - arguments: - - name: --output_train_h5ad - type: file - required: true - direction: output - example: resources/neurips-2023-kaggle/de_train.h5ad - - name: --output_test_h5ad - type: file - required: true - direction: output - example: resources/neurips-2023-kaggle/de_test.h5ad - - name: --output_id_map - type: file - required: true - direction: output - example: resources/neurips-2023-kaggle/id_map.csv - resources: - - type: python_script - path: script.py -platforms: + direction: output + example: resources/neurips-2023-kaggle/id_map.csv +resources: + - type: python_script + path: script.py +engines: - type: docker image: ghcr.io/openproblems-bio/base_python:1.0.4 setup: - type: python packages: [ fastparquet, anndata, pandas ] +runners: + - type: executable - type: nextflow directives: label: [ midtime, midmem, lowcpu ] diff --git a/src/process_dataset/filter_obs/config.vsh.yaml b/src/process_dataset/filter_obs/config.vsh.yaml index 831ce031..d47e17a4 100644 --- a/src/process_dataset/filter_obs/config.vsh.yaml +++ b/src/process_dataset/filter_obs/config.vsh.yaml @@ -1,33 +1,34 @@ -functionality: - name: filter_obs - namespace: "process_dataset" - info: - type: process_dataset - type_info: - label: Filter observations - summary: Remove low-quality observations from the dataset - description: | - This task removes low-quality observations from the dataset. - arguments: - - name: --input - type: file - required: true - direction: input - example: resources/neurips-2023-raw/sc_counts.h5ad - - name: --output - type: file - required: true - direction: output - example: resources/neurips-2023-data/sc_counts_cleaned.h5ad - resources: - - type: r_script - path: script.R -platforms: +name: filter_obs +namespace: "process_dataset" +info: + type: process_dataset + type_info: + label: Filter observations + summary: Remove low-quality observations from the dataset + description: | + This task removes low-quality observations from the dataset. 
+arguments: + - name: --input + type: file + required: true + direction: input + example: resources/neurips-2023-raw/sc_counts.h5ad + - name: --output + type: file + required: true + direction: output + example: resources/neurips-2023-data/sc_counts_cleaned.h5ad +resources: + - type: r_script + path: script.R +engines: - type: docker image: ghcr.io/openproblems-bio/base_r:1.0.4 setup: - type: r cran: [ dplyr, tidyr, purrr, tibble ] +runners: + - type: executable - type: nextflow directives: label: [ midtime, midmem, lowcpu ] diff --git a/src/process_dataset/filter_vars/config.vsh.yaml b/src/process_dataset/filter_vars/config.vsh.yaml index 56260b55..c6eb6ea3 100644 --- a/src/process_dataset/filter_vars/config.vsh.yaml +++ b/src/process_dataset/filter_vars/config.vsh.yaml @@ -1,33 +1,34 @@ -functionality: - name: filter_vars - namespace: "process_dataset" - info: - type: process_dataset - type_info: - label: Filter variables - summary: Remove low-quality variables from the dataset - description: | - This task removes low-quality variables from the dataset. - arguments: - - name: --input - type: file - required: true - direction: input - example: resources/neurips-2023-raw/pseudobulk.h5ad - - name: --output - type: file - required: true - direction: output - example: resources/neurips-2023-data/pseudobulk_cleaned.h5ad - resources: - - type: r_script - path: script.R -platforms: +name: filter_vars +namespace: "process_dataset" +info: + type: process_dataset + type_info: + label: Filter variables + summary: Remove low-quality variables from the dataset + description: | + This task removes low-quality variables from the dataset. +arguments: + - name: --input + type: file + required: true + direction: input + example: resources/neurips-2023-raw/pseudobulk.h5ad + - name: --output + type: file + required: true + direction: output + example: resources/neurips-2023-data/pseudobulk_cleaned.h5ad +resources: + - type: r_script + path: script.R +engines: - type: docker image: ghcr.io/openproblems-bio/base_r:1.0.4 setup: - type: r cran: [ edgeR, limma, dplyr, tidyr, purrr, tibble ] +runners: + - type: executable - type: nextflow directives: label: [ midtime, midmem, lowcpu ] diff --git a/src/process_dataset/generate_id_map/config.vsh.yaml b/src/process_dataset/generate_id_map/config.vsh.yaml index 7669b606..97a9af95 100644 --- a/src/process_dataset/generate_id_map/config.vsh.yaml +++ b/src/process_dataset/generate_id_map/config.vsh.yaml @@ -1,33 +1,34 @@ -functionality: - name: generate_id_map - namespace: "process_dataset" - info: - type: process_dataset - type_info: - label: Generate ID map - summary: Generate the ID map file for competitors - description: | - This task generates the ID map file for competitors. - arguments: - - name: --de_test_h5ad - type: file - required: true - direction: input - example: resources/neurips-2023-data/de_test.h5ad - - name: --id_map - type: file - required: true - direction: output - example: resources/neurips-2023-data/id_map.csv - resources: - - type: python_script - path: script.py -platforms: +name: generate_id_map +namespace: "process_dataset" +info: + type: process_dataset + type_info: + label: Generate ID map + summary: Generate the ID map file for competitors + description: | + This task generates the ID map file for competitors. 
+arguments: + - name: --de_test_h5ad + type: file + required: true + direction: input + example: resources/neurips-2023-data/de_test.h5ad + - name: --id_map + type: file + required: true + direction: output + example: resources/neurips-2023-data/id_map.csv +resources: + - type: python_script + path: script.py +engines: - type: docker image: ghcr.io/openproblems-bio/base_python:1.0.4 setup: - type: python packages: [ anndata ] +runners: + - type: executable - type: nextflow directives: label: [ midtime, midmem, lowcpu ] diff --git a/src/process_dataset/run_limma/config.vsh.yaml b/src/process_dataset/run_limma/config.vsh.yaml index cd84c852..2730a0be 100644 --- a/src/process_dataset/run_limma/config.vsh.yaml +++ b/src/process_dataset/run_limma/config.vsh.yaml @@ -1,55 +1,56 @@ -functionality: - name: run_limma - namespace: "process_dataset" - info: - type: process_dataset - type_info: - label: Limma - summary: Run limma - description: | - Run limma for the perturbation regression task. - arguments: - - name: --input - type: file - required: true - direction: input - example: resources/neurips-2023-data/pseudobulk_cleaned.h5ad - - name: --input_splits - type: string - multiple: true - description: The splits to use for the limma fitting - example: [ train, control, public_test, private_test ] - - name: --output - type: file - required: true - direction: output - example: resources/neurips-2023-data/de.h5ad - - name: --output_splits - type: string - multiple: true - description: The splits to use for DE analysis - - name: --de_sig_cutoff - type: double - required: false - default: 0.05 - - name: --clipping_cutoff - type: double - required: false - default: 0.0001 - description: Clip the log p-values between log10(clip) and -log10(clip) - - name: --control_compound - type: string - required: false - default: "Dimethyl Sulfoxide" - resources: - - type: r_script - path: script.R -platforms: +name: run_limma +namespace: "process_dataset" +info: + type: process_dataset + type_info: + label: Limma + summary: Run limma + description: | + Run limma for the perturbation regression task. 
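The layer name `clipped_sign_log10_pval` used throughout this benchmark hints at what this component produces: signed, clipped log10 p-values, with the clip controlled by the `--clipping_cutoff` argument defined below. An illustrative NumPy sketch (the function name is hypothetical; the actual transformation is implemented in the component's R script):

```python
# Signed -log10(p) scores, clamped to [log10(clip), -log10(clip)].
import numpy as np

def clipped_sign_log10_pval(pval, logfc, clip=1e-4):
    # Clip p-values to [clip, 1] before the log transform so the scores
    # cannot exceed -log10(clip) in magnitude (4 for the default 1e-4).
    return -np.log10(np.clip(pval, clip, 1.0)) * np.sign(logfc)
```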
+arguments: + - name: --input + type: file + required: true + direction: input + example: resources/neurips-2023-data/pseudobulk_cleaned.h5ad + - name: --input_splits + type: string + multiple: true + description: The splits to use for the limma fitting + example: [ train, control, public_test, private_test ] + - name: --output + type: file + required: true + direction: output + example: resources/neurips-2023-data/de.h5ad + - name: --output_splits + type: string + multiple: true + description: The splits to use for DE analysis + - name: --de_sig_cutoff + type: double + required: false + default: 0.05 + - name: --clipping_cutoff + type: double + required: false + default: 0.0001 + description: Clip the log p-values between log10(clip) and -log10(clip) + - name: --control_compound + type: string + required: false + default: "Dimethyl Sulfoxide" +resources: + - type: r_script + path: script.R +engines: - type: docker image: ghcr.io/openproblems-bio/base_r:1.0.4 setup: - type: r bioc: [ edgeR, limma, dplyr, tidyr, purrr, tibble, furrr, future ] +runners: + - type: executable - type: nextflow directives: label: [ midtime, midmem, highcpu ] diff --git a/src/workflows/process_dataset/config.vsh.yaml b/src/workflows/process_dataset/config.vsh.yaml index 39c5332b..3d30a762 100644 --- a/src/workflows/process_dataset/config.vsh.yaml +++ b/src/workflows/process_dataset/config.vsh.yaml @@ -1,48 +1,48 @@ __merge__: ../../api/comp_process_dataset.yaml -functionality: - name: process_dataset - namespace: "workflows" - arguments: - - name: "--dataset_id" - type: string - description: Unique identifier of the dataset. - required: true - - name: "--dataset_name" - type: string - description: Nicely formatted name. - required: true - - name: "--dataset_url" - type: string - description: Link to the original source of the dataset. - required: false - - name: "--dataset_reference" - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: "--dataset_summary" - type: string - description: Short description of the dataset. - required: true - - name: "--dataset_description" - type: string - description: Long description of the dataset. - required: true - - name: "--dataset_organism" - type: string - description: The organism of the dataset. - required: true - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - dependencies: - - name: process_dataset/compute_pseudobulk - - name: process_dataset/filter_obs - - name: process_dataset/filter_vars - - name: process_dataset/add_uns_metadata - - name: process_dataset/run_limma - - name: process_dataset/generate_id_map -platforms: +name: process_dataset +namespace: "workflows" +arguments: + - name: "--dataset_id" + type: string + description: Unique identifier of the dataset. + required: true + - name: "--dataset_name" + type: string + description: Nicely formatted name. + required: true + - name: "--dataset_url" + type: string + description: Link to the original source of the dataset. + required: false + - name: "--dataset_reference" + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: "--dataset_summary" + type: string + description: Short description of the dataset. + required: true + - name: "--dataset_description" + type: string + description: Long description of the dataset. + required: true + - name: "--dataset_organism" + type: string + description: The organism of the dataset. 
+ required: true +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf +dependencies: + - name: process_dataset/compute_pseudobulk + - name: process_dataset/filter_obs + - name: process_dataset/filter_vars + - name: process_dataset/add_uns_metadata + - name: process_dataset/run_limma + - name: process_dataset/generate_id_map +runners: + - type: executable - type: nextflow directives: label: [ midtime, midmem, lowcpu ] diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index 56171aba..0523f5dd 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -1,93 +1,93 @@ -functionality: - name: "run_benchmark" - namespace: "workflows" - argument_groups: - - name: Inputs - arguments: - - name: "--de_train_h5ad" - __merge__: ../../api/file_de_train_h5ad.yaml - required: true - direction: input - - name: "--de_test_h5ad" - __merge__: ../../api/file_de_test_h5ad.yaml - required: true - direction: input - - name: "--id_map" - __merge__: ../../api/file_id_map.yaml - required: true - direction: input - - name: --layer - type: string - direction: input - default: clipped_sign_log10_pval - description: Which layer to use for prediction and evaluation. - - name: Outputs - arguments: - - name: "--scores" - type: file - required: true - direction: output - description: A yaml file containing the scores of each of the methods - default: score_uns.yaml - - name: "--method_configs" - type: file - required: true - direction: output - default: method_configs.yaml - - name: "--metric_configs" - type: file - required: true - direction: output - default: metric_configs.yaml - - name: "--dataset_uns" - type: file - required: true - direction: output - default: dataset_uns.yaml - - name: "--task_info" - type: file - required: true - direction: output - default: task_info.yaml - - name: Arguments - arguments: - - name: "--method_ids" - type: string - multiple: true - description: A list of method ids to run. If not specified, all methods will be run. - - name: "--metric_ids" - type: string - multiple: true - description: A list of metric ids to run. If not specified, all metric will be run. 
-  resources:
-    - type: nextflow_script
-      path: main.nf
-      entrypoint: run_wf
-    - type: file
-      path: "../../api/task_info.yaml"
-  dependencies:
-    - name: common/extract_metadata
-      repository: openproblemsv2
-    - name: control_methods/zeros
-    - name: control_methods/sample
-    - name: control_methods/ground_truth
-    - name: control_methods/mean_outcome
-    - name: control_methods/mean_across_celltypes
-    - name: control_methods/mean_across_compounds
-    - name: methods/nn_retraining_with_pseudolabels
-    - name: methods/scape
-    - name: methods/jn_ap_op2
-    - name: methods/lgc_ensemble
-    - name: methods/transformer_ensemble
-    - name: methods/pyboost
-    - name: metrics/mean_rowwise_error
-    - name: metrics/mean_rowwise_correlation
-  repositories:
-    - name: openproblemsv2
-      type: github
-      repo: openproblems-bio/openproblems-v2
-      tag: main_build
-platforms:
+name: "run_benchmark"
+namespace: "workflows"
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: "--de_train_h5ad"
+        __merge__: ../../api/file_de_train_h5ad.yaml
+        required: true
+        direction: input
+      - name: "--de_test_h5ad"
+        __merge__: ../../api/file_de_test_h5ad.yaml
+        required: true
+        direction: input
+      - name: "--id_map"
+        __merge__: ../../api/file_id_map.yaml
+        required: true
+        direction: input
+      - name: --layer
+        type: string
+        direction: input
+        default: clipped_sign_log10_pval
+        description: Which layer to use for prediction and evaluation.
+  - name: Outputs
+    arguments:
+      - name: "--scores"
+        type: file
+        required: true
+        direction: output
+        description: A YAML file containing the scores of each of the methods.
+        default: score_uns.yaml
+      - name: "--method_configs"
+        type: file
+        required: true
+        direction: output
+        default: method_configs.yaml
+      - name: "--metric_configs"
+        type: file
+        required: true
+        direction: output
+        default: metric_configs.yaml
+      - name: "--dataset_uns"
+        type: file
+        required: true
+        direction: output
+        default: dataset_uns.yaml
+      - name: "--task_info"
+        type: file
+        required: true
+        direction: output
+        default: task_info.yaml
+  - name: Arguments
+    arguments:
+      - name: "--method_ids"
+        type: string
+        multiple: true
+        description: A list of method ids to run. If not specified, all methods will be run.
+      - name: "--metric_ids"
+        type: string
+        multiple: true
+        description: A list of metric ids to run. If not specified, all metrics will be run.
+resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: "../../api/task_info.yaml" +dependencies: + - name: common/extract_metadata + repository: openproblemsv2 + - name: control_methods/zeros + - name: control_methods/sample + - name: control_methods/ground_truth + - name: control_methods/mean_outcome + - name: control_methods/mean_across_celltypes + - name: control_methods/mean_across_compounds + - name: methods/nn_retraining_with_pseudolabels + - name: methods/scape + - name: methods/jn_ap_op2 + - name: methods/lgc_ensemble + - name: methods/transformer_ensemble + - name: methods/pyboost + - name: metrics/mean_rowwise_error + - name: metrics/mean_rowwise_correlation +repositories: + - name: openproblemsv2 + type: github + repo: openproblems-bio/openproblems-v2 + tag: main_build +runners: + - type: executable - type: nextflow config: script: | diff --git a/src/workflows/run_stability_analysis/config.vsh.yaml b/src/workflows/run_stability_analysis/config.vsh.yaml index 05a7d6fa..3e50d3f6 100644 --- a/src/workflows/run_stability_analysis/config.vsh.yaml +++ b/src/workflows/run_stability_analysis/config.vsh.yaml @@ -1,87 +1,87 @@ -functionality: - name: "run_stability_analysis" - namespace: "workflows" - argument_groups: - - name: Inputs - arguments: - - name: --sc_counts - __merge__: ../../api/file_sc_counts.yaml - required: true - direction: input - - name: "--id" - type: string - description: Unique identifier of the dataset. - required: true - - name: --layer - type: string - direction: input - default: clipped_sign_log10_pval - description: Which layer to use for prediction and evaluation. - - name: Bootstrapping arguments - description: Define the sampling strategy for the stability analysis. - arguments: - - name: --bootstrap_num_replicates - type: integer - default: 10 - description: Number of bootstrap replicates to run. - - name: --bootstrap_obs - type: boolean - default: true - description: Whether to sample observations. - - name: --bootstrap_obs_fraction - type: double - default: 1 - description: Fraction of the obs of the sc_counts to include in each bootstrap. - - name: --bootstrap_obs_replace - type: boolean - default: true - description: Whether to sample with replacement. - - name: --bootstrap_var - type: boolean - default: false - description: Whether to sample variables. - - name: --bootstrap_var_fraction - type: double - default: 1 - description: Fraction of the var of the sc_counts to include in each bootstrap. - - name: --bootstrap_var_replace - type: boolean - default: true - description: Whether to sample with replacement. - - name: Outputs - arguments: - - name: "--scores" - type: file - required: true - direction: output - description: A yaml file containing the scores of each of the methods - default: stability_uns.yaml - - name: Arguments - arguments: - - name: "--method_ids" - type: string - multiple: true - description: A list of method ids to run. If not specified, all methods will be run. - - name: "--metric_ids" - type: string - multiple: true - description: A list of metric ids to run. If not specified, all metric will be run. 
-  resources:
-    - type: nextflow_script
-      path: main.nf
-      entrypoint: run_wf
-    - type: file
-      path: "../../api/task_info.yaml"
-  dependencies:
-    - name: process_dataset/bootstrap
-    - name: workflows/process_dataset
-    - name: workflows/run_benchmark
-  repositories:
-    - name: openproblemsv2
-      type: github
-      repo: openproblems-bio/openproblems-v2
-      tag: main_build
-platforms:
+name: "run_stability_analysis"
+namespace: "workflows"
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: --sc_counts
+        __merge__: ../../api/file_sc_counts.yaml
+        required: true
+        direction: input
+      - name: "--id"
+        type: string
+        description: Unique identifier of the dataset.
+        required: true
+      - name: --layer
+        type: string
+        direction: input
+        default: clipped_sign_log10_pval
+        description: Which layer to use for prediction and evaluation.
+  - name: Bootstrapping arguments
+    description: Define the sampling strategy for the stability analysis.
+    arguments:
+      - name: --bootstrap_num_replicates
+        type: integer
+        default: 10
+        description: Number of bootstrap replicates to run.
+      - name: --bootstrap_obs
+        type: boolean
+        default: true
+        description: Whether to sample observations.
+      - name: --bootstrap_obs_fraction
+        type: double
+        default: 1
+        description: Fraction of the obs of the sc_counts to include in each bootstrap.
+      - name: --bootstrap_obs_replace
+        type: boolean
+        default: true
+        description: Whether to sample with replacement.
+      - name: --bootstrap_var
+        type: boolean
+        default: false
+        description: Whether to sample variables.
+      - name: --bootstrap_var_fraction
+        type: double
+        default: 1
+        description: Fraction of the var of the sc_counts to include in each bootstrap.
+      - name: --bootstrap_var_replace
+        type: boolean
+        default: true
+        description: Whether to sample with replacement.
+  - name: Outputs
+    arguments:
+      - name: "--scores"
+        type: file
+        required: true
+        direction: output
+        description: A YAML file containing the scores of each of the methods.
+        default: stability_uns.yaml
+  - name: Arguments
+    arguments:
+      - name: "--method_ids"
+        type: string
+        multiple: true
+        description: A list of method ids to run. If not specified, all methods will be run.
+      - name: "--metric_ids"
+        type: string
+        multiple: true
+        description: A list of metric ids to run. If not specified, all metrics will be run.
+resources:
+  - type: nextflow_script
+    path: main.nf
+    entrypoint: run_wf
+  - type: file
+    path: "../../api/task_info.yaml"
+dependencies:
+  - name: process_dataset/bootstrap
+  - name: workflows/process_dataset
+  - name: workflows/run_benchmark
+repositories:
+  - name: openproblemsv2
+    type: github
+    repo: openproblems-bio/openproblems-v2
+    tag: main_build
+runners:
+  - type: executable
   - type: nextflow
     config:
       script: |
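Downstream, the per-replicate scores written to stability_uns.yaml can be summarised by looking at the spread of each metric across bootstrap replicates. The exact schema of that file is not specified in this patch, so the keys used below (`method_id`, `metric_ids`, `metric_values`) are assumptions based on the score_uns.yaml convention:

```python
# Hypothetical summary of stability across bootstrap replicates.
import yaml
import pandas as pd

with open("stability_uns.yaml") as f:
    records = yaml.safe_load(f)

df = pd.DataFrame([
    {"method_id": r["method_id"], "metric": m, "value": v}
    for r in records
    for m, v in zip(r["metric_ids"], r["metric_values"])
])
# A stable method shows a small standard deviation across replicates.
print(df.groupby(["method_id", "metric"])["value"].agg(["mean", "std"]))
```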