fix viash 0.9 refactoring (#79)

* fix viash 0.9 refactoring * rename (sc|de)_(train|test)_h5ad to $1_$2 * fix components * fix scape
openproblems-bio · Oct 31, 2024 · 2fa4446 · 2fa4446
1 parent cb4543d
commit 2fa4446
Show file tree

Hide file tree

Showing 49 changed files with 396 additions and 342 deletions.
diff --git a/README.md b/README.md
@@ -139,8 +139,8 @@ perturbation responses in different biological contexts.
 flowchart LR
   file_sc_counts("Single Cell Counts")
   comp_process_dataset[/"Process dataset"/]
-  file_de_train_h5ad("DE train")
-  file_de_test_h5ad("DE test")
+  file_de_train("DE train")
+  file_de_test("DE test")
   file_id_map("ID Map")
   comp_control_method[/"Control Method"/]
   comp_method[/"Method"/]
@@ -149,13 +149,13 @@ flowchart LR
   file_model("Model")
   file_score("Score")
   file_sc_counts---comp_process_dataset
-  comp_process_dataset-->file_de_train_h5ad
-  comp_process_dataset-->file_de_test_h5ad
+  comp_process_dataset-->file_de_train
+  comp_process_dataset-->file_de_test
   comp_process_dataset-->file_id_map
-  file_de_train_h5ad---comp_control_method
-  file_de_train_h5ad---comp_method
-  file_de_test_h5ad---comp_control_method
-  file_de_test_h5ad---comp_metric
+  file_de_train---comp_control_method
+  file_de_train---comp_method
+  file_de_test---comp_control_method
+  file_de_test---comp_metric
   file_id_map---comp_control_method
   file_id_map---comp_method
   comp_control_method-->file_prediction
@@ -224,8 +224,8 @@ Arguments:
 | Name              | Type   | Description                                                                                                         |
 |:------------------|:-------|:--------------------------------------------------------------------------------------------------------------------|
 | `--sc_counts`     | `file` | Anndata with the counts of the whole dataset.                                                                       |
-| `--de_train_h5ad` | `file` | (*Output*) Differential expression results for training. Default: `de_train.h5ad`.                                  |
-| `--de_test_h5ad`  | `file` | (*Output*) Differential expression results for testing. Default: `de_test.h5ad`.                                    |
+| `--de_train`      | `file` | (*Output*) Differential expression results for training. Default: `de_train.h5ad`.                                  |
+| `--de_test`       | `file` | (*Output*) Differential expression results for testing. Default: `de_test.h5ad`.                                    |
 | `--id_map`        | `file` | (*Output*) File indicates the order of de_test, the cell types and the small molecule names. Default: `id_map.csv`. |
 
 </div>
@@ -371,8 +371,8 @@ Arguments:
 
 | Name              | Type     | Description                                                                         |
 |:------------------|:---------|:------------------------------------------------------------------------------------|
-| `--de_train_h5ad` | `file`   | (*Optional*) Differential expression results for training.                          |
-| `--de_test_h5ad`  | `file`   | Differential expression results for testing.                                        |
+| `--de_train`      | `file`   | (*Optional*) Differential expression results for training.                          |
+| `--de_test`       | `file`   | Differential expression results for testing.                                        |
 | `--id_map`        | `file`   | File indicates the order of de_test, the cell types and the small molecule names.   |
 | `--layer`         | `string` | (*Optional*) Which layer to use for prediction. Default: `clipped_sign_log10_pval`. |
 | `--output`        | `file`   | (*Output*) Differential Gene Expression prediction.                                 |
@@ -392,7 +392,7 @@ Arguments:
 
 | Name              | Type     | Description                                                                                                         |
 |:------------------|:---------|:--------------------------------------------------------------------------------------------------------------------|
-| `--de_train_h5ad` | `file`   | (*Optional*) Differential expression results for training.                                                          |
+| `--de_train`      | `file`   | (*Optional*) Differential expression results for training.                                                          |
 | `--id_map`        | `file`   | File indicates the order of de_test, the cell types and the small molecule names.                                   |
 | `--layer`         | `string` | (*Optional*) Which layer to use for prediction. Default: `clipped_sign_log10_pval`.                                 |
 | `--output`        | `file`   | (*Output*) Differential Gene Expression prediction.                                                                 |
@@ -413,7 +413,7 @@ Arguments:
 
 | Name                 | Type     | Description                                                                                   |
 |:---------------------|:---------|:----------------------------------------------------------------------------------------------|
-| `--de_test_h5ad`     | `file`   | Differential expression results for testing.                                                  |
+| `--de_test`          | `file`   | Differential expression results for testing.                                                  |
 | `--de_test_layer`    | `string` | (*Optional*) In which layer to find the DE data. Default: `clipped_sign_log10_pval`.          |
 | `--prediction`       | `file`   | Differential Gene Expression prediction.                                                      |
 | `--prediction_layer` | `string` | (*Optional*) In which layer to find the predicted DE data. Default: `prediction`.             |

diff --git a/common b/common
diff --git a/scripts/create_resources/neurips-2023-data.sh b/scripts/create_resources/neurips-2023-data.sh
@@ -42,15 +42,15 @@ nextflow run \
 
 echo ">> Run method"
 viash run src/control_methods/mean_across_compounds/config.vsh.yaml -- \
-  --de_train_h5ad "$OUT/de_train.h5ad" \
-  --de_test_h5ad "$OUT/de_test.h5ad" \
+  --de_train "$OUT/de_train.h5ad" \
+  --de_test "$OUT/de_test.h5ad" \
   --id_map "$OUT/id_map.csv" \
   --output "$OUT/prediction.h5ad"
 
 echo ">> Run metric"
 viash run src/metrics/mean_rowwise_error/config.vsh.yaml -- \
   --prediction "$OUT/prediction.h5ad" \
-  --de_test_h5ad "$OUT/de_test.h5ad" \
+  --de_test "$OUT/de_test.h5ad" \
   --output "$OUT/score.h5ad"
 
 echo ">> Uploading results to S3"

diff --git a/scripts/create_resources/neurips-2023-kaggle.sh b/scripts/create_resources/neurips-2023-kaggle.sh
@@ -35,21 +35,21 @@ viash run src/process_dataset/convert_kaggle_h5ad_to_parquet/config.vsh.yaml --
 
 echo ">> Run method"
 viash run src/control_methods/mean_across_compounds/config.vsh.yaml -- \
-  --de_train_h5ad "$OUT/de_train.h5ad" \
-  --de_test_h5ad "$OUT/de_test.h5ad" \
+  --de_train "$OUT/de_train.h5ad" \
+  --de_test "$OUT/de_test.h5ad" \
   --id_map "$OUT/id_map.csv" \
   --output "$OUT/prediction.h5ad"
 
 echo ">> Run metric"
 viash run src/metrics/mean_rowwise_error/config.vsh.yaml -- \
   --prediction "$OUT/prediction.h5ad" \
-  --de_test_h5ad "$OUT/de_test.h5ad" \
+  --de_test "$OUT/de_test.h5ad" \
   --output "$OUT/score.h5ad"
 
 cat > "$OUT/state.yaml" <<'EOF'
 id: neurips-2023-kaggle
-de_train_h5ad: !file de_train.h5ad
-de_test_h5ad: !file de_test.h5ad
+de_train: !file de_train.h5ad
+de_test: !file de_test.h5ad
 id_map: !file id_map.csv
 EOF
 

diff --git a/scripts/run_benchmark/run_full_local.sh b/scripts/run_benchmark/run_full_local.sh
@@ -25,13 +25,13 @@ publish_dir="resources/results/${RUN_ID}"
 cat > /tmp/params.yaml << HERE
 param_list:
   - id: neurips-2023-data
-    de_train_h5ad: "$resources_dir/neurips-2023-data/de_train.h5ad"
-    de_test_h5ad: "$resources_dir/neurips-2023-data/de_test.h5ad"
+    de_train: "$resources_dir/neurips-2023-data/de_train.h5ad"
+    de_test: "$resources_dir/neurips-2023-data/de_test.h5ad"
     id_map: "$resources_dir/neurips-2023-data/id_map.csv"
     layer: clipped_sign_log10_pval
   - id: neurips-2023-kaggle
-    de_train_h5ad: "$resources_dir/neurips-2023-kaggle/de_train.h5ad"
-    de_test_h5ad: "$resources_dir/neurips-2023-kaggle/de_test.h5ad"
+    de_train: "$resources_dir/neurips-2023-kaggle/de_train.h5ad"
+    de_test: "$resources_dir/neurips-2023-kaggle/de_test.h5ad"
     id_map: "$resources_dir/neurips-2023-kaggle/id_map.csv"
     layer: sign_log10_pval
 output_state: "state.yaml"

diff --git a/scripts/run_benchmark/run_full_seqeracloud.sh b/scripts/run_benchmark/run_full_seqeracloud.sh
@@ -17,13 +17,13 @@ publish_dir="s3://openproblems-data/resources/task_perturbation_prediction/resul
 cat > /tmp/params.yaml << HERE
 param_list:
   - id: neurips-2023-data
-    de_train_h5ad: "$resources_dir/neurips-2023-data/de_train.h5ad"
-    de_test_h5ad: "$resources_dir/neurips-2023-data/de_test.h5ad"
+    de_train: "$resources_dir/neurips-2023-data/de_train.h5ad"
+    de_test: "$resources_dir/neurips-2023-data/de_test.h5ad"
     id_map: "$resources_dir/neurips-2023-data/id_map.csv"
     layer: clipped_sign_log10_pval
   # - id: neurips-2023-kaggle
-  #   de_train_h5ad: "$resources_dir/neurips-2023-kaggle/de_train.h5ad"
-  #   de_test_h5ad: "$resources_dir/neurips-2023-kaggle/de_test.h5ad"
+  #   de_train: "$resources_dir/neurips-2023-kaggle/de_train.h5ad"
+  #   de_test: "$resources_dir/neurips-2023-kaggle/de_test.h5ad"
   #   id_map: "$resources_dir/neurips-2023-kaggle/id_map.csv"
   #   layer: sign_log10_pval
 output_state: "state.yaml"

diff --git a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml
@@ -7,11 +7,11 @@ info:
     description: |
       A control method to serve as a quality control for the perturbation prediction benchmark.
 arguments:
-  - name: --de_train_h5ad
+  - name: --de_train
     __merge__: file_de_train.yaml
     required: false
     direction: input
-  - name: --de_test_h5ad
+  - name: --de_test
     __merge__: file_de_test.yaml
     required: true
     direction: input

diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml
@@ -7,7 +7,7 @@ info:
     description: |
       A metric to compare a perturbation prediction to the ground truth.
 arguments:
-  - name: --de_test_h5ad
+  - name: --de_test
     __merge__: file_de_test.yaml
     required: true
     direction: input
@@ -37,6 +37,8 @@ arguments:
     description: |
       How to resolve difference in genes between the two datasets.
 test_resources:
+  - type: python_script
+    path: /common/component_tests/check_config.py
   - type: python_script
     path: /common/component_tests/run_and_check_output.py
   - path: /resources/datasets/neurips-2023-data

diff --git a/src/api/comp_process_dataset.yaml b/src/api/comp_process_dataset.yaml
@@ -11,12 +11,12 @@ arguments:
     __merge__: file_sc_counts.yaml
     required: true
     direction: input
-  - name: --de_train_h5ad
+  - name: --de_train
     __merge__: file_de_train.yaml
     required: true
     direction: output
     default: de_train.h5ad
-  - name: --de_test_h5ad
+  - name: --de_test
     __merge__: file_de_test.yaml
     required: true
     direction: output
@@ -26,12 +26,12 @@ arguments:
     required: true
     direction: output
     default: id_map.csv
-  - name: --sc_train_h5ad
+  - name: --sc_train
     type: file
     required: false
     direction: output
     default: sc_train.h5ad
-  - name: --sc_test_h5ad
+  - name: --sc_test
     type: file
     required: false
     direction: output

diff --git a/src/api/wf_method.yaml b/src/api/wf_method.yaml
@@ -7,7 +7,7 @@ info:
     description: |
       A method for predicting the perturbation response of small molecules on certain cell types.
 arguments:
-  - name: --de_train_h5ad
+  - name: --de_train
     __merge__: file_de_train.yaml
     required: false
     direction: input
@@ -29,3 +29,6 @@ arguments:
     direction: output
     required: false
     must_exist: false
+test_resources:
+  # - type: python_script
+  #   path: /common/component_tests/check_config.py
diff --git a/src/control_methods/ground_truth/script.R b/src/control_methods/ground_truth/script.R
@@ -3,27 +3,27 @@ library(dplyr, warn.conflicts = FALSE)
 
 ## VIASH START
 par <- list(
-  de_train_h5ad = "resources/datasets/neurips-2023-data/de_train.h5ad",
-  de_test_h5ad = "resources/datasets/neurips-2023-data/de_test.h5ad",
+  de_train = "resources/datasets/neurips-2023-data/de_train.h5ad",
+  de_test = "resources/datasets/neurips-2023-data/de_test.h5ad",
   layer = "clipped_sign_log10_pval",
   id_map = "resources/datasets/neurips-2023-data/id_map.csv",
   output = "resources/datasets/neurips-2023-data/output_identity.h5ad"
 )
 ## VIASH END
 
 # read data
-de_test_h5ad <- anndata::read_h5ad(par$de_test_h5ad)
+de_test <- anndata::read_h5ad(par$de_test)
 
 # remove unneeded columns
 output <- anndata::AnnData(
   layers = list(
-    prediction = de_test_h5ad$layers[[par$layer]]
+    prediction = de_test$layers[[par$layer]]
   ),
-  obs = de_test_h5ad$obs[, c()],
-  var = de_test_h5ad$var[, c()],
+  obs = de_test$obs[, c()],
+  var = de_test$var[, c()],
   uns = list(
-    dataset_id = de_test_h5ad$uns$dataset_id,
-    method_id = meta$functionality_name
+    dataset_id = de_test$uns$dataset_id,
+    method_id = meta$name
   )
 )
 

diff --git a/src/control_methods/mean_across_celltypes/script.py b/src/control_methods/mean_across_celltypes/script.py
@@ -4,8 +4,8 @@
 
 ## VIASH START
 par = {
-  "de_train_h5ad": "resources/datasets/neurips-2023-data/de_train.h5ad",
-  "de_test_h5ad": "resources/datasets/neurips-2023-data/de_test.h5ad",
+  "de_train": "resources/datasets/neurips-2023-data/de_train.h5ad",
+  "de_test": "resources/datasets/neurips-2023-data/de_test.h5ad",
   "layer": "clipped_sign_log10_pval",
   "id_map": "resources/datasets/neurips-2023-data/id_map.csv",
   "output": "resources/datasets/neurips-2023-data/output_mean.h5ad",
@@ -15,13 +15,13 @@
 sys.path.append(meta["resources_dir"])
 from anndata_to_dataframe import anndata_to_dataframe
 
-de_train_h5ad = ad.read_h5ad(par["de_train_h5ad"])
+de_train = ad.read_h5ad(par["de_train"])
 id_map = pd.read_csv(par["id_map"])
-gene_names = list(de_train_h5ad.var_names)
-de_train = anndata_to_dataframe(de_train_h5ad, par["layer"])
+gene_names = list(de_train.var_names)
+de_train_df = anndata_to_dataframe(de_train, par["layer"])
 
 # compute mean celltype
-mean_celltype = de_train.groupby("cell_type")[gene_names].mean()
+mean_celltype = de_train_df.groupby("cell_type")[gene_names].mean()
 mean_celltype = mean_celltype.loc[id_map.cell_type]
 
 # write output
@@ -32,8 +32,8 @@
     obs=pd.DataFrame(index=id_map["id"]),
     var=pd.DataFrame(index=gene_names),
     uns={
-      "dataset_id": de_train_h5ad.uns["dataset_id"],
-      "method_id": meta["functionality_name"]
+      "dataset_id": de_train.uns["dataset_id"],
+      "method_id": meta["name"]
     }
 )
 output.write_h5ad(par["output"], compression="gzip")
diff --git a/src/control_methods/mean_across_compounds/script.py b/src/control_methods/mean_across_compounds/script.py
@@ -4,8 +4,8 @@
 
 ## VIASH START
 par = {
-  "de_train_h5ad": "resources/datasets/neurips-2023-data/de_train.h5ad",
-  "de_test_h5ad": "resources/datasets/neurips-2023-data/de_test.h5ad",
+  "de_train": "resources/datasets/neurips-2023-data/de_train.h5ad",
+  "de_test": "resources/datasets/neurips-2023-data/de_test.h5ad",
   "layer": "clipped_sign_log10_pval",
   "id_map": "resources/datasets/neurips-2023-data/id_map.csv",
   "output": "resources/datasets/neurips-2023-data/output_mean.h5ad",
@@ -15,12 +15,12 @@
 sys.path.append(meta["resources_dir"])
 from anndata_to_dataframe import anndata_to_dataframe
 
-de_train_h5ad = ad.read_h5ad(par["de_train_h5ad"])
+de_train = ad.read_h5ad(par["de_train"])
 id_map = pd.read_csv(par["id_map"])
-gene_names = list(de_train_h5ad.var_names)
-de_train = anndata_to_dataframe(de_train_h5ad, par["layer"])
+gene_names = list(de_train.var_names)
+de_train_df = anndata_to_dataframe(de_train, par["layer"])
 
-mean_compound = de_train.groupby("sm_name")[gene_names].mean()
+mean_compound = de_train_df.groupby("sm_name")[gene_names].mean()
 mean_compound = mean_compound.loc[id_map.sm_name]
 
 # write output
@@ -31,8 +31,8 @@
     obs=pd.DataFrame(index=id_map["id"]),
     var=pd.DataFrame(index=gene_names),
     uns={
-      "dataset_id": de_train_h5ad.uns["dataset_id"],
-      "method_id": meta["functionality_name"]
+      "dataset_id": de_train.uns["dataset_id"],
+      "method_id": meta["name"]
     }
 )
 output.write_h5ad(par["output"], compression="gzip")
diff --git a/src/control_methods/mean_outcome/script.py b/src/control_methods/mean_outcome/script.py
@@ -5,8 +5,8 @@
 
 ## VIASH START
 par = {
-  "de_train_h5ad": "resources/datasets/neurips-2023-data/de_train.h5ad",
-  "de_test_h5ad": "resources/datasets/neurips-2023-data/de_test.h5ad",
+  "de_train": "resources/datasets/neurips-2023-data/de_train.h5ad",
+  "de_test": "resources/datasets/neurips-2023-data/de_test.h5ad",
   "layer": "clipped_sign_log10_pval",
   "id_map": "resources/datasets/neurips-2023-data/id_map.csv",
   "output": "resources/datasets/neurips-2023-data/output_mean.h5ad",
@@ -16,12 +16,12 @@
 sys.path.append(meta["resources_dir"])
 from anndata_to_dataframe import anndata_to_dataframe
 
-de_train_h5ad = ad.read_h5ad(par["de_train_h5ad"])
+de_train = ad.read_h5ad(par["de_train"])
 id_map = pd.read_csv(par["id_map"])
-gene_names = list(de_train_h5ad.var_names)
-de_train = anndata_to_dataframe(de_train_h5ad, par["layer"])
+gene_names = list(de_train.var_names)
+de_train_df = anndata_to_dataframe(de_train, par["layer"])
 
-mean_pred = de_train[gene_names].mean(axis=0)
+mean_pred = de_train_df[gene_names].mean(axis=0)
 
 # write output
 output = ad.AnnData(
@@ -31,8 +31,8 @@
     obs=pd.DataFrame(index=id_map["id"]),
     var=pd.DataFrame(index=gene_names),
     uns={
-      "dataset_id": de_train_h5ad.uns["dataset_id"],
-      "method_id": meta["functionality_name"]
+      "dataset_id": de_train.uns["dataset_id"],
+      "method_id": meta["name"]
     }
 )
 output.write_h5ad(par["output"], compression="gzip")