From 2fa44462b1e7d530bad703c4a20ed22b49d3705e Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Thu, 31 Oct 2024 05:34:44 +0100 Subject: [PATCH] fix viash 0.9 refactoring (#79) * fix viash 0.9 refactoring * rename (sc|de)_(train|test)_h5ad to $1_$2 * fix components * fix scape --- README.md | 28 +++++++------- common | 2 +- scripts/create_resources/neurips-2023-data.sh | 6 +-- .../create_resources/neurips-2023-kaggle.sh | 10 ++--- scripts/run_benchmark/run_full_local.sh | 8 ++-- scripts/run_benchmark/run_full_seqeracloud.sh | 8 ++-- src/api/comp_control_method.yaml | 4 +- src/api/comp_metric.yaml | 4 +- src/api/comp_process_dataset.yaml | 8 ++-- src/api/wf_method.yaml | 5 ++- src/control_methods/ground_truth/script.R | 16 ++++---- .../mean_across_celltypes/script.py | 16 ++++---- .../mean_across_compounds/script.py | 16 ++++---- src/control_methods/mean_outcome/script.py | 16 ++++---- src/control_methods/sample/script.R | 18 ++++----- src/control_methods/zeros/script.py | 12 +++--- src/methods/jn_ap_op2/config.vsh.yaml | 19 +++++----- src/methods/jn_ap_op2/script.py | 14 +++---- src/methods/lgc_ensemble/config.vsh.yaml | 19 +++++----- src/methods/lgc_ensemble/main.nf | 2 +- src/methods/lgc_ensemble/test.sh | 2 +- .../lgc_ensemble_direct/config.vsh.yaml | 19 +++++----- src/methods/lgc_ensemble_helpers/predict.py | 12 +++--- .../lgc_ensemble_helpers/prepare_data.py | 18 ++++----- src/methods/lgc_ensemble_helpers/train.py | 18 ++++----- src/methods/lgc_ensemble_predict/script.py | 2 +- .../lgc_ensemble_prepare/config.vsh.yaml | 2 +- src/methods/lgc_ensemble_prepare/script.py | 34 ++++++++--------- .../config.vsh.yaml | 34 ++++++++--------- .../nn_retraining_with_pseudolabels/script.py | 22 +++++------ src/methods/pyboost/config.vsh.yaml | 37 ++++++++++--------- src/methods/pyboost/script.py | 22 +++++------ src/methods/scape/config.vsh.yaml | 31 ++++++++-------- src/methods/scape/script.py | 23 ++++++------ .../transformer_ensemble/config.vsh.yaml | 30 ++++++++------- src/methods/transformer_ensemble/script.py | 20 +++++----- src/methods/transformer_ensemble/utils.py | 36 +++++++++--------- .../mean_rowwise_correlation/config.vsh.yaml | 37 ++++++++++++++++--- src/metrics/mean_rowwise_correlation/script.R | 4 +- .../mean_rowwise_error/config.vsh.yaml | 28 +++++++++++--- src/metrics/mean_rowwise_error/script.R | 4 +- .../generate_id_map/config.vsh.yaml | 2 +- src/process_dataset/generate_id_map/script.py | 6 +-- src/process_dataset/split_sc/config.vsh.yaml | 4 +- src/process_dataset/split_sc/script.py | 8 ++-- src/workflows/process_dataset/main.nf | 18 ++++----- src/workflows/run_benchmark/config.vsh.yaml | 4 +- src/workflows/run_benchmark/main.nf | 22 +++++------ src/workflows/run_stability_analysis/main.nf | 8 ++-- 49 files changed, 396 insertions(+), 342 deletions(-) diff --git a/README.md b/README.md index 1e35a52f..b0d95ddb 100644 --- a/README.md +++ b/README.md @@ -139,8 +139,8 @@ perturbation responses in difference biological contexts. 
flowchart LR file_sc_counts("Single Cell Counts") comp_process_dataset[/"Process dataset"/] - file_de_train_h5ad("DE train") - file_de_test_h5ad("DE test") + file_de_train("DE train") + file_de_test("DE test") file_id_map("ID Map") comp_control_method[/"Control Method"/] comp_method[/"Method"/] @@ -149,13 +149,13 @@ flowchart LR file_model("Model") file_score("Score") file_sc_counts---comp_process_dataset - comp_process_dataset-->file_de_train_h5ad - comp_process_dataset-->file_de_test_h5ad + comp_process_dataset-->file_de_train + comp_process_dataset-->file_de_test comp_process_dataset-->file_id_map - file_de_train_h5ad---comp_control_method - file_de_train_h5ad---comp_method - file_de_test_h5ad---comp_control_method - file_de_test_h5ad---comp_metric + file_de_train---comp_control_method + file_de_train---comp_method + file_de_test---comp_control_method + file_de_test---comp_metric file_id_map---comp_control_method file_id_map---comp_method comp_control_method-->file_prediction @@ -224,8 +224,8 @@ Arguments: | Name | Type | Description | |:------------------|:-------|:--------------------------------------------------------------------------------------------------------------------| | `--sc_counts` | `file` | Anndata with the counts of the whole dataset. | -| `--de_train_h5ad` | `file` | (*Output*) Differential expression results for training. Default: `de_train.h5ad`. | -| `--de_test_h5ad` | `file` | (*Output*) Differential expression results for testing. Default: `de_test.h5ad`. | +| `--de_train` | `file` | (*Output*) Differential expression results for training. Default: `de_train.h5ad`. | +| `--de_test` | `file` | (*Output*) Differential expression results for testing. Default: `de_test.h5ad`. | | `--id_map` | `file` | (*Output*) File indicates the order of de_test, the cell types and the small molecule names. Default: `id_map.csv`. | @@ -371,8 +371,8 @@ Arguments: | Name | Type | Description | |:------------------|:---------|:------------------------------------------------------------------------------------| -| `--de_train_h5ad` | `file` | (*Optional*) Differential expression results for training. | -| `--de_test_h5ad` | `file` | Differential expression results for testing. | +| `--de_train` | `file` | (*Optional*) Differential expression results for training. | +| `--de_test` | `file` | Differential expression results for testing. | | `--id_map` | `file` | File indicates the order of de_test, the cell types and the small molecule names. | | `--layer` | `string` | (*Optional*) Which layer to use for prediction. Default: `clipped_sign_log10_pval`. | | `--output` | `file` | (*Output*) Differential Gene Expression prediction. | @@ -392,7 +392,7 @@ Arguments: | Name | Type | Description | |:------------------|:---------|:--------------------------------------------------------------------------------------------------------------------| -| `--de_train_h5ad` | `file` | (*Optional*) Differential expression results for training. | +| `--de_train` | `file` | (*Optional*) Differential expression results for training. | | `--id_map` | `file` | File indicates the order of de_test, the cell types and the small molecule names. | | `--layer` | `string` | (*Optional*) Which layer to use for prediction. Default: `clipped_sign_log10_pval`. | | `--output` | `file` | (*Output*) Differential Gene Expression prediction. 
| @@ -413,7 +413,7 @@ Arguments: | Name | Type | Description | |:---------------------|:---------|:----------------------------------------------------------------------------------------------| -| `--de_test_h5ad` | `file` | Differential expression results for testing. | +| `--de_test` | `file` | Differential expression results for testing. | | `--de_test_layer` | `string` | (*Optional*) In which layer to find the DE data. Default: `clipped_sign_log10_pval`. | | `--prediction` | `file` | Differential Gene Expression prediction. | | `--prediction_layer` | `string` | (*Optional*) In which layer to find the predicted DE data. Default: `prediction`. | diff --git a/common b/common index e64f472b..65e05af6 160000 --- a/common +++ b/common @@ -1 +1 @@ -Subproject commit e64f472b37f1bdbd383640098708ecf5c9f7fd7e +Subproject commit 65e05af68a11ee87853fcf7a3c6b579001f21abe diff --git a/scripts/create_resources/neurips-2023-data.sh b/scripts/create_resources/neurips-2023-data.sh index 4ee05d39..8cee066b 100755 --- a/scripts/create_resources/neurips-2023-data.sh +++ b/scripts/create_resources/neurips-2023-data.sh @@ -42,15 +42,15 @@ nextflow run \ echo ">> Run method" viash run src/control_methods/mean_across_compounds/config.vsh.yaml -- \ - --de_train_h5ad "$OUT/de_train.h5ad" \ - --de_test_h5ad "$OUT/de_test.h5ad" \ + --de_train "$OUT/de_train.h5ad" \ + --de_test "$OUT/de_test.h5ad" \ --id_map "$OUT/id_map.csv" \ --output "$OUT/prediction.h5ad" echo ">> Run metric" viash run src/metrics/mean_rowwise_error/config.vsh.yaml -- \ --prediction "$OUT/prediction.h5ad" \ - --de_test_h5ad "$OUT/de_test.h5ad" \ + --de_test "$OUT/de_test.h5ad" \ --output "$OUT/score.h5ad" echo ">> Uploading results to S3" diff --git a/scripts/create_resources/neurips-2023-kaggle.sh b/scripts/create_resources/neurips-2023-kaggle.sh index 55425960..6f9443d6 100755 --- a/scripts/create_resources/neurips-2023-kaggle.sh +++ b/scripts/create_resources/neurips-2023-kaggle.sh @@ -35,21 +35,21 @@ viash run src/process_dataset/convert_kaggle_h5ad_to_parquet/config.vsh.yaml -- echo ">> Run method" viash run src/control_methods/mean_across_compounds/config.vsh.yaml -- \ - --de_train_h5ad "$OUT/de_train.h5ad" \ - --de_test_h5ad "$OUT/de_test.h5ad" \ + --de_train "$OUT/de_train.h5ad" \ + --de_test "$OUT/de_test.h5ad" \ --id_map "$OUT/id_map.csv" \ --output "$OUT/prediction.h5ad" echo ">> Run metric" viash run src/metrics/mean_rowwise_error/config.vsh.yaml -- \ --prediction "$OUT/prediction.h5ad" \ - --de_test_h5ad "$OUT/de_test.h5ad" \ + --de_test "$OUT/de_test.h5ad" \ --output "$OUT/score.h5ad" cat > "$OUT/state.yaml" <<'EOF' id: neurips-2023-kaggle -de_train_h5ad: !file de_train.h5ad -de_test_h5ad: !file de_test.h5ad +de_train: !file de_train.h5ad +de_test: !file de_test.h5ad id_map: !file id_map.csv EOF diff --git a/scripts/run_benchmark/run_full_local.sh b/scripts/run_benchmark/run_full_local.sh index 70e939a7..37e1df38 100755 --- a/scripts/run_benchmark/run_full_local.sh +++ b/scripts/run_benchmark/run_full_local.sh @@ -25,13 +25,13 @@ publish_dir="resources/results/${RUN_ID}" cat > /tmp/params.yaml << HERE param_list: - id: neurips-2023-data - de_train_h5ad: "$resources_dir/neurips-2023-data/de_train.h5ad" - de_test_h5ad: "$resources_dir/neurips-2023-data/de_test.h5ad" + de_train: "$resources_dir/neurips-2023-data/de_train.h5ad" + de_test: "$resources_dir/neurips-2023-data/de_test.h5ad" id_map: "$resources_dir/neurips-2023-data/id_map.csv" layer: clipped_sign_log10_pval - id: neurips-2023-kaggle - de_train_h5ad: 
"$resources_dir/neurips-2023-kaggle/de_train.h5ad" - de_test_h5ad: "$resources_dir/neurips-2023-kaggle/de_test.h5ad" + de_train: "$resources_dir/neurips-2023-kaggle/de_train.h5ad" + de_test: "$resources_dir/neurips-2023-kaggle/de_test.h5ad" id_map: "$resources_dir/neurips-2023-kaggle/id_map.csv" layer: sign_log10_pval output_state: "state.yaml" diff --git a/scripts/run_benchmark/run_full_seqeracloud.sh b/scripts/run_benchmark/run_full_seqeracloud.sh index 064f9480..20148c56 100755 --- a/scripts/run_benchmark/run_full_seqeracloud.sh +++ b/scripts/run_benchmark/run_full_seqeracloud.sh @@ -17,13 +17,13 @@ publish_dir="s3://openproblems-data/resources/task_perturbation_prediction/resul cat > /tmp/params.yaml << HERE param_list: - id: neurips-2023-data - de_train_h5ad: "$resources_dir/neurips-2023-data/de_train.h5ad" - de_test_h5ad: "$resources_dir/neurips-2023-data/de_test.h5ad" + de_train: "$resources_dir/neurips-2023-data/de_train.h5ad" + de_test: "$resources_dir/neurips-2023-data/de_test.h5ad" id_map: "$resources_dir/neurips-2023-data/id_map.csv" layer: clipped_sign_log10_pval # - id: neurips-2023-kaggle - # de_train_h5ad: "$resources_dir/neurips-2023-kaggle/de_train.h5ad" - # de_test_h5ad: "$resources_dir/neurips-2023-kaggle/de_test.h5ad" + # de_train: "$resources_dir/neurips-2023-kaggle/de_train.h5ad" + # de_test: "$resources_dir/neurips-2023-kaggle/de_test.h5ad" # id_map: "$resources_dir/neurips-2023-kaggle/id_map.csv" # layer: sign_log10_pval output_state: "state.yaml" diff --git a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml index 9e41f985..a65c9b67 100644 --- a/src/api/comp_control_method.yaml +++ b/src/api/comp_control_method.yaml @@ -7,11 +7,11 @@ info: description: | A control method to serve as a quality control for the perturbation prediction benchmark. arguments: - - name: --de_train_h5ad + - name: --de_train __merge__: file_de_train.yaml required: false direction: input - - name: --de_test_h5ad + - name: --de_test __merge__: file_de_test.yaml required: true direction: input diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml index 7b8c9db4..a636a744 100644 --- a/src/api/comp_metric.yaml +++ b/src/api/comp_metric.yaml @@ -7,7 +7,7 @@ info: description: | A metric to compare a perturbation prediction to the ground truth. arguments: - - name: --de_test_h5ad + - name: --de_test __merge__: file_de_test.yaml required: true direction: input @@ -37,6 +37,8 @@ arguments: description: | How to resolve difference in genes between the two datasets. 
test_resources: + - type: python_script + path: /common/component_tests/check_config.py - type: python_script path: /common/component_tests/run_and_check_output.py - path: /resources/datasets/neurips-2023-data diff --git a/src/api/comp_process_dataset.yaml b/src/api/comp_process_dataset.yaml index 485044fd..ecfe2bb2 100644 --- a/src/api/comp_process_dataset.yaml +++ b/src/api/comp_process_dataset.yaml @@ -11,12 +11,12 @@ arguments: __merge__: file_sc_counts.yaml required: true direction: input - - name: --de_train_h5ad + - name: --de_train __merge__: file_de_train.yaml required: true direction: output default: de_train.h5ad - - name: --de_test_h5ad + - name: --de_test __merge__: file_de_test.yaml required: true direction: output @@ -26,12 +26,12 @@ arguments: required: true direction: output default: id_map.csv - - name: --sc_train_h5ad + - name: --sc_train type: file required: false direction: output default: sc_train.h5ad - - name: --sc_test_h5ad + - name: --sc_test type: file required: false direction: output diff --git a/src/api/wf_method.yaml b/src/api/wf_method.yaml index 612a73c6..b12659ec 100644 --- a/src/api/wf_method.yaml +++ b/src/api/wf_method.yaml @@ -7,7 +7,7 @@ info: description: | A method for predicting the perturbation response of small molecules on certain cell types. arguments: - - name: --de_train_h5ad + - name: --de_train __merge__: file_de_train.yaml required: false direction: input @@ -29,3 +29,6 @@ arguments: direction: output required: false must_exist: false +test_resources: + # - type: python_script + # path: /common/component_tests/check_config.py diff --git a/src/control_methods/ground_truth/script.R b/src/control_methods/ground_truth/script.R index 276db2f6..39715a36 100644 --- a/src/control_methods/ground_truth/script.R +++ b/src/control_methods/ground_truth/script.R @@ -3,8 +3,8 @@ library(dplyr, warn.conflicts = FALSE) ## VIASH START par <- list( - de_train_h5ad = "resources/datasets/neurips-2023-data/de_train.h5ad", - de_test_h5ad = "resources/datasets/neurips-2023-data/de_test.h5ad", + de_train = "resources/datasets/neurips-2023-data/de_train.h5ad", + de_test = "resources/datasets/neurips-2023-data/de_test.h5ad", layer = "clipped_sign_log10_pval", id_map = "resources/datasets/neurips-2023-data/id_map.csv", output = "resources/datasets/neurips-2023-data/output_identity.h5ad" @@ -12,18 +12,18 @@ par <- list( ## VIASH END # read data -de_test_h5ad <- anndata::read_h5ad(par$de_test_h5ad) +de_test <- anndata::read_h5ad(par$de_test) # remove unneeded columns output <- anndata::AnnData( layers = list( - prediction = de_test_h5ad$layers[[par$layer]] + prediction = de_test$layers[[par$layer]] ), - obs = de_test_h5ad$obs[, c()], - var = de_test_h5ad$var[, c()], + obs = de_test$obs[, c()], + var = de_test$var[, c()], uns = list( - dataset_id = de_test_h5ad$uns$dataset_id, - method_id = meta$functionality_name + dataset_id = de_test$uns$dataset_id, + method_id = meta$name ) ) diff --git a/src/control_methods/mean_across_celltypes/script.py b/src/control_methods/mean_across_celltypes/script.py index addd2ab3..a1b768cb 100644 --- a/src/control_methods/mean_across_celltypes/script.py +++ b/src/control_methods/mean_across_celltypes/script.py @@ -4,8 +4,8 @@ ## VIASH START par = { - "de_train_h5ad": "resources/datasets/neurips-2023-data/de_train.h5ad", - "de_test_h5ad": "resources/datasets/neurips-2023-data/de_test.h5ad", + "de_train": "resources/datasets/neurips-2023-data/de_train.h5ad", + "de_test": "resources/datasets/neurips-2023-data/de_test.h5ad", "layer": 
"clipped_sign_log10_pval", "id_map": "resources/datasets/neurips-2023-data/id_map.csv", "output": "resources/datasets/neurips-2023-data/output_mean.h5ad", @@ -15,13 +15,13 @@ sys.path.append(meta["resources_dir"]) from anndata_to_dataframe import anndata_to_dataframe -de_train_h5ad = ad.read_h5ad(par["de_train_h5ad"]) +de_train = ad.read_h5ad(par["de_train"]) id_map = pd.read_csv(par["id_map"]) -gene_names = list(de_train_h5ad.var_names) -de_train = anndata_to_dataframe(de_train_h5ad, par["layer"]) +gene_names = list(de_train.var_names) +de_train_df = anndata_to_dataframe(de_train, par["layer"]) # compute mean celltype -mean_celltype = de_train.groupby("cell_type")[gene_names].mean() +mean_celltype = de_train_df.groupby("cell_type")[gene_names].mean() mean_celltype = mean_celltype.loc[id_map.cell_type] # write output @@ -32,8 +32,8 @@ obs=pd.DataFrame(index=id_map["id"]), var=pd.DataFrame(index=gene_names), uns={ - "dataset_id": de_train_h5ad.uns["dataset_id"], - "method_id": meta["functionality_name"] + "dataset_id": de_train.uns["dataset_id"], + "method_id": meta["name"] } ) output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file diff --git a/src/control_methods/mean_across_compounds/script.py b/src/control_methods/mean_across_compounds/script.py index 6fd0500d..2bd00799 100644 --- a/src/control_methods/mean_across_compounds/script.py +++ b/src/control_methods/mean_across_compounds/script.py @@ -4,8 +4,8 @@ ## VIASH START par = { - "de_train_h5ad": "resources/datasets/neurips-2023-data/de_train.h5ad", - "de_test_h5ad": "resources/datasets/neurips-2023-data/de_test.h5ad", + "de_train": "resources/datasets/neurips-2023-data/de_train.h5ad", + "de_test": "resources/datasets/neurips-2023-data/de_test.h5ad", "layer": "clipped_sign_log10_pval", "id_map": "resources/datasets/neurips-2023-data/id_map.csv", "output": "resources/datasets/neurips-2023-data/output_mean.h5ad", @@ -15,12 +15,12 @@ sys.path.append(meta["resources_dir"]) from anndata_to_dataframe import anndata_to_dataframe -de_train_h5ad = ad.read_h5ad(par["de_train_h5ad"]) +de_train = ad.read_h5ad(par["de_train"]) id_map = pd.read_csv(par["id_map"]) -gene_names = list(de_train_h5ad.var_names) -de_train = anndata_to_dataframe(de_train_h5ad, par["layer"]) +gene_names = list(de_train.var_names) +de_train_df = anndata_to_dataframe(de_train, par["layer"]) -mean_compound = de_train.groupby("sm_name")[gene_names].mean() +mean_compound = de_train_df.groupby("sm_name")[gene_names].mean() mean_compound = mean_compound.loc[id_map.sm_name] # write output @@ -31,8 +31,8 @@ obs=pd.DataFrame(index=id_map["id"]), var=pd.DataFrame(index=gene_names), uns={ - "dataset_id": de_train_h5ad.uns["dataset_id"], - "method_id": meta["functionality_name"] + "dataset_id": de_train.uns["dataset_id"], + "method_id": meta["name"] } ) output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file diff --git a/src/control_methods/mean_outcome/script.py b/src/control_methods/mean_outcome/script.py index ec2354c8..ee7a1462 100644 --- a/src/control_methods/mean_outcome/script.py +++ b/src/control_methods/mean_outcome/script.py @@ -5,8 +5,8 @@ ## VIASH START par = { - "de_train_h5ad": "resources/datasets/neurips-2023-data/de_train.h5ad", - "de_test_h5ad": "resources/datasets/neurips-2023-data/de_test.h5ad", + "de_train": "resources/datasets/neurips-2023-data/de_train.h5ad", + "de_test": "resources/datasets/neurips-2023-data/de_test.h5ad", "layer": "clipped_sign_log10_pval", "id_map": 
"resources/datasets/neurips-2023-data/id_map.csv", "output": "resources/datasets/neurips-2023-data/output_mean.h5ad", @@ -16,12 +16,12 @@ sys.path.append(meta["resources_dir"]) from anndata_to_dataframe import anndata_to_dataframe -de_train_h5ad = ad.read_h5ad(par["de_train_h5ad"]) +de_train = ad.read_h5ad(par["de_train"]) id_map = pd.read_csv(par["id_map"]) -gene_names = list(de_train_h5ad.var_names) -de_train = anndata_to_dataframe(de_train_h5ad, par["layer"]) +gene_names = list(de_train.var_names) +de_train_df = anndata_to_dataframe(de_train, par["layer"]) -mean_pred = de_train[gene_names].mean(axis=0) +mean_pred = de_train_df[gene_names].mean(axis=0) # write output output = ad.AnnData( @@ -31,8 +31,8 @@ obs=pd.DataFrame(index=id_map["id"]), var=pd.DataFrame(index=gene_names), uns={ - "dataset_id": de_train_h5ad.uns["dataset_id"], - "method_id": meta["functionality_name"] + "dataset_id": de_train.uns["dataset_id"], + "method_id": meta["name"] } ) output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file diff --git a/src/control_methods/sample/script.R b/src/control_methods/sample/script.R index 738b7f24..896a421c 100644 --- a/src/control_methods/sample/script.R +++ b/src/control_methods/sample/script.R @@ -2,25 +2,25 @@ requireNamespace("anndata", quietly = TRUE) ## VIASH START par <- list( - de_train_h5ad = "resources/datasets/neurips-2023-data/de_train.h5ad", - de_test_h5ad = "resources/datasets/neurips-2023-data/de_test.h5ad", + de_train = "resources/datasets/neurips-2023-data/de_train.h5ad", + de_test = "resources/datasets/neurips-2023-data/de_test.h5ad", layer = "clipped_sign_log10_pval", id_map = "resources/datasets/neurips-2023-data/id_map.csv", output = "resources/datasets/neurips-2023-data/output_identity.h5ad" ) meta <- list( - functionality_name = "sample" + name = "sample" ) ## VIASH END # read data -de_train_h5ad <- anndata::read_h5ad(par$de_train_h5ad) +de_train <- anndata::read_h5ad(par$de_train) id_map <- read.csv(par$id_map) # get gene names -gene_names <- de_train_h5ad$var_names +gene_names <- de_train$var_names -input_layer <- de_train_h5ad$layers[[par$layer]] +input_layer <- de_train$layers[[par$layer]] prediction <- sapply(gene_names, function(gene_name) { sample(input_layer[,gene_name], size = nrow(id_map), replace = TRUE) @@ -31,11 +31,11 @@ rownames(prediction) <- id_map$id # remove unneeded columns output <- anndata::AnnData( layers = list(prediction = prediction), - var = de_train_h5ad$var[, c()], + var = de_train$var[, c()], shape = c(nrow(id_map), length(gene_names)), uns = list( - dataset_id = de_train_h5ad$uns$dataset_id, - method_id = meta$functionality_name + dataset_id = de_train$uns$dataset_id, + method_id = meta$name ) ) diff --git a/src/control_methods/zeros/script.py b/src/control_methods/zeros/script.py index 67d982a4..e01b1019 100644 --- a/src/control_methods/zeros/script.py +++ b/src/control_methods/zeros/script.py @@ -4,17 +4,17 @@ ## VIASH START par = { - "de_train_h5ad": "resources/datasets/neurips-2023-data/de_train.h5ad", - "de_test_h5ad": "resources/datasets/neurips-2023-data/de_test.h5ad", + "de_train": "resources/datasets/neurips-2023-data/de_train.h5ad", + "de_test": "resources/datasets/neurips-2023-data/de_test.h5ad", "layer": "clipped_sign_log10_pval", "id_map": "resources/datasets/neurips-2023-data/id_map.csv", "output": "resources/datasets/neurips-2023-data/output_mean.h5ad", } ## VIASH END -de_train_h5ad = ad.read_h5ad(par["de_train_h5ad"]) +de_train = ad.read_h5ad(par["de_train"]) id_map = 
pd.read_csv(par["id_map"])
 
-gene_names = list(de_train_h5ad.var_names)
+gene_names = list(de_train.var_names)
 
 prediction = np.zeros((id_map.shape[0], len(gene_names)))
 
@@ -24,8 +24,8 @@
 obs=pd.DataFrame(index=id_map["id"]),
 var=pd.DataFrame(index=gene_names),
 uns={
-        "dataset_id": de_train_h5ad.uns["dataset_id"],
-        "method_id": meta["functionality_name"]
+        "dataset_id": de_train.uns["dataset_id"],
+        "method_id": meta["name"]
 }
 )
 output.write_h5ad(par["output"], compression="gzip")
\ No newline at end of file
diff --git a/src/methods/jn_ap_op2/config.vsh.yaml b/src/methods/jn_ap_op2/config.vsh.yaml
index 91c1c19c..140ea44e 100644
--- a/src/methods/jn_ap_op2/config.vsh.yaml
+++ b/src/methods/jn_ap_op2/config.vsh.yaml
@@ -2,17 +2,18 @@ __merge__: ../../api/comp_method.yaml
 name: jn_ap_op2
 label: JN-AP-OP2
+summary: "Deep learning architecture composed of 2 modules: a sample-centric MLP and a gene-centric MLP"
+description: |
+  We first encode each sample using a leave-one-out encoder based on compound and cell type. This produces X with the dimension of n_samples, n_genes, n_encode,
+  where n_encode is 2. Then, X is passed to MLP1 sample-wise with input of n_samples, n_genes*n_encode, which outputs data of the same dimension.
+  The purpose of this MLP is to learn inter-gene relationships. Then, we group the output of MLP1 with X (the original encoded data) and feed it
+  to MLP2, which receives n_samples*n_genes, (n_encode + n_encode) and outputs n_samples*n_genes. This MLP2 trains on each (compound, cell_type, gene)
+  combination. This is to overcome the underdetermination problem due to the lack of sufficient (compound, cell_type) samples.
+links:
+  documentation: https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/discussion/461159
+  repository: https://github.com/AntoinePassemiers/Open-Challenges-Single-Cell-Perturbations
 info:
   neurips2023_rank: 20
-  summary: "Deep learning architecture composed of 2 modules: a sample-centric MLP and a gene-centric MLP"
-  description: |
-    We first encode each sample using leave-one-out encoder based on compound and cell type. This produces X with the dimension of n_samples, n_genes, n_encode,
-    where n_encode is 2. Then, X is passed to a MLP1 sample-wise with input of n_samples, n_genes*n_encode, which outputs the same dimension data.
-    The purpose of this MLP is to learn inter-gene relationships. Then, we group the output of MLP1 with X (original encoded data) and feed it
-    to MLP2 which receives n_smaples*n_genes, (n_encode + n_encode) and results n_samples*n_genes. This MLP2 trains on each (compound, cell_type, gene)
-    combination. This is to overcome the underdetermination problem due to lack of sufficient (compound, cell_type) samples. 
- documentation_url: https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/discussion/461159 - repository_url: https://github.com/AntoinePassemiers/Open-Challenges-Single-Cell-Perturbations arguments: - type: integer name: --n_replica diff --git a/src/methods/jn_ap_op2/script.py b/src/methods/jn_ap_op2/script.py index 1529cbef..ac334425 100644 --- a/src/methods/jn_ap_op2/script.py +++ b/src/methods/jn_ap_op2/script.py @@ -29,10 +29,10 @@ from helper import plant_seed, MultiOutputTargetEncoder, train print('Reading input files', flush=True) -de_train_h5ad = ad.read_h5ad(par["de_train_h5ad"]) +de_train = ad.read_h5ad(par["de_train"]) id_map = pd.read_csv(par["id_map"]) -gene_names = list(de_train_h5ad.var_names) +gene_names = list(de_train.var_names) print('Preprocess data', flush=True) SEED = 0xCAFE @@ -56,10 +56,10 @@ print('Data location', flush=True) # Data location -cell_types = de_train_h5ad.obs['cell_type'].astype(str) -sm_names = de_train_h5ad.obs['sm_name'].astype(str) +cell_types = de_train.obs['cell_type'].astype(str) +sm_names = de_train.obs['sm_name'].astype(str) -data = de_train_h5ad.layers[par["layer"]] +data = de_train.layers[par["layer"]] print('Train model', flush=True) # ... train model ... @@ -115,8 +115,8 @@ obs=pd.DataFrame(index=id_map["id"]), var=pd.DataFrame(index=gene_names), uns={ - "dataset_id": de_train_h5ad.uns["dataset_id"], - "method_id": meta["functionality_name"] + "dataset_id": de_train.uns["dataset_id"], + "method_id": meta["name"] } ) diff --git a/src/methods/lgc_ensemble/config.vsh.yaml b/src/methods/lgc_ensemble/config.vsh.yaml index 5bdb4204..bdcf9e25 100644 --- a/src/methods/lgc_ensemble/config.vsh.yaml +++ b/src/methods/lgc_ensemble/config.vsh.yaml @@ -2,17 +2,18 @@ __merge__: ../../api/wf_method.yaml name: lgc_ensemble label: LSTM-GRU-CNN Ensemble +summary: An ensemble of LSTM, GRU, and 1D CNN models +description: | + An ensemble of LSTM, GRU, and 1D CNN models with a variety of input features derived from ChemBERTa embeddings, + one-hot encoding of cell type/small molecule pairs, and various statistical measures of target gene expression. + The models were trained with a combination of MSE, MAE, LogCosh, and BCE loss functions to improve their + robustness and predictive performance. The approach also included data augmentation techniques to ensure + generalization and account for noise in the data. +links: + documentation: https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/discussion/459258 + repository: https://github.com/Jean-KOUAGOU/1st-place-solution-single-cell-pbs/tree/main info: neurips2023_rank: 1 - summary: An ensemble of LSTM, GRU, and 1D CNN models - description: | - An ensemble of LSTM, GRU, and 1D CNN models with a variety of input features derived from ChemBERTa embeddings, - one-hot encoding of cell type/small molecule pairs, and various statistical measures of target gene expression. - The models were trained with a combination of MSE, MAE, LogCosh, and BCE loss functions to improve their - robustness and predictive performance. The approach also included data augmentation techniques to ensure - generalization and account for noise in the data. 
- documentation_url: https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/discussion/459258 - repository_url: https://github.com/Jean-KOUAGOU/1st-place-solution-single-cell-pbs/tree/main arguments: - name: --epochs diff --git a/src/methods/lgc_ensemble/main.nf b/src/methods/lgc_ensemble/main.nf index 37026a86..023b1a5e 100644 --- a/src/methods/lgc_ensemble/main.nf +++ b/src/methods/lgc_ensemble/main.nf @@ -9,7 +9,7 @@ workflow run_wf { | lgc_ensemble_prepare.run( fromState: [ - "de_train_h5ad", + "de_train", "id_map", "layer", "epochs", diff --git a/src/methods/lgc_ensemble/test.sh b/src/methods/lgc_ensemble/test.sh index 69a9995b..e74a625e 100755 --- a/src/methods/lgc_ensemble/test.sh +++ b/src/methods/lgc_ensemble/test.sh @@ -4,7 +4,7 @@ nextflow run . \ -main-script target/nextflow/methods/lgc_ensemble/main.nf \ -profile docker \ -resume \ - --de_train_h5ad resources/datasets/neurips-2023-data/de_train.h5ad \ + --de_train resources/datasets/neurips-2023-data/de_train.h5ad \ --id_map resources/datasets/neurips-2023-data/id_map.csv \ --layer clipped_sign_log10_pval \ --epochs 2 \ diff --git a/src/methods/lgc_ensemble_direct/config.vsh.yaml b/src/methods/lgc_ensemble_direct/config.vsh.yaml index 34e27f37..b76b27a1 100644 --- a/src/methods/lgc_ensemble_direct/config.vsh.yaml +++ b/src/methods/lgc_ensemble_direct/config.vsh.yaml @@ -2,17 +2,18 @@ __merge__: ../../api/comp_method.yaml name: lgc_ensemble_direct label: LSTM-GRU-CNN Ensemble +summary: An ensemble of LSTM, GRU, and 1D CNN models +description: | + An ensemble of LSTM, GRU, and 1D CNN models with a variety of input features derived from ChemBERTa embeddings, + one-hot encoding of cell type/small molecule pairs, and various statistical measures of target gene expression. + The models were trained with a combination of MSE, MAE, LogCosh, and BCE loss functions to improve their + robustness and predictive performance. The approach also included data augmentation techniques to ensure + generalization and account for noise in the data. +links: + documentation: https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/discussion/459258 + repository: https://github.com/Jean-KOUAGOU/1st-place-solution-single-cell-pbs/tree/main info: neurips2023_rank: 1 - summary: An ensemble of LSTM, GRU, and 1D CNN models - description: | - An ensemble of LSTM, GRU, and 1D CNN models with a variety of input features derived from ChemBERTa embeddings, - one-hot encoding of cell type/small molecule pairs, and various statistical measures of target gene expression. - The models were trained with a combination of MSE, MAE, LogCosh, and BCE loss functions to improve their - robustness and predictive performance. The approach also included data augmentation techniques to ensure - generalization and account for noise in the data. 
- documentation_url: https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/discussion/459258 - repository_url: https://github.com/Jean-KOUAGOU/1st-place-solution-single-cell-pbs/tree/main arguments: - name: --epochs diff --git a/src/methods/lgc_ensemble_helpers/predict.py b/src/methods/lgc_ensemble_helpers/predict.py index 288d2b48..ebe6087f 100644 --- a/src/methods/lgc_ensemble_helpers/predict.py +++ b/src/methods/lgc_ensemble_helpers/predict.py @@ -5,9 +5,9 @@ from helper_functions import combine_features, load_trained_models, average_prediction, weighted_average_prediction def read_data(par): - de_train_h5ad = ad.read_h5ad(par["de_train_h5ad"]) + de_train = ad.read_h5ad(par["de_train"]) id_map = pd.read_csv(par["id_map"]) - return de_train_h5ad, id_map + return de_train, id_map def predict(par, meta, paths): test_config = { @@ -19,8 +19,8 @@ def predict(par, meta, paths): ## Read train, test and sample submission data # train data is needed for columns print("\nReading data...") # de_train, id_map, sample_submission = read_data(par) - de_train_h5ad, id_map = read_data(par) - gene_names = list(de_train_h5ad.var_names) + de_train, id_map = read_data(par) + gene_names = list(de_train.var_names) ## Build input features mean_cell_type = pd.read_csv(f'{paths["train_data_aug_dir"]}/mean_cell_type.csv') @@ -123,8 +123,8 @@ def predict(par, meta, paths): obs=pd.DataFrame(index=id_map["id"]), var=pd.DataFrame(index=gene_names), uns={ - "dataset_id": de_train_h5ad.uns["dataset_id"], - "method_id": meta["functionality_name"] + "dataset_id": de_train.uns["dataset_id"], + "method_id": meta["name"] } ) print(output) diff --git a/src/methods/lgc_ensemble_helpers/prepare_data.py b/src/methods/lgc_ensemble_helpers/prepare_data.py index c511a851..11079d0e 100644 --- a/src/methods/lgc_ensemble_helpers/prepare_data.py +++ b/src/methods/lgc_ensemble_helpers/prepare_data.py @@ -8,13 +8,13 @@ def prepare_data(par, paths): seed_everything() ## Read data print("\nPreparing data...") - de_train_h5ad = ad.read_h5ad(par["de_train_h5ad"]) - de_train = anndata_to_dataframe(de_train_h5ad, par["layer"]) - de_train = de_train.drop(columns=['split']) + de_train = ad.read_h5ad(par["de_train"]) + de_train_df = anndata_to_dataframe(de_train, par["layer"]) + de_train_df = de_train_df.drop(columns=['split']) id_map = pd.read_csv(par["id_map"]) ## Create data augmentation - de_cell_type = de_train.iloc[:, [0] + list(range(5, de_train.shape[1]))] - de_sm_name = de_train.iloc[:, [1] + list(range(5, de_train.shape[1]))] + de_cell_type = de_train_df.iloc[:, [0] + list(range(5, de_train_df.shape[1]))] + de_sm_name = de_train_df.iloc[:, [1] + list(range(5, de_train_df.shape[1]))] mean_cell_type = de_cell_type.groupby('cell_type').mean().reset_index() mean_sm_name = de_sm_name.groupby('sm_name').mean().reset_index() std_cell_type = de_cell_type.groupby('cell_type').std().reset_index() @@ -22,7 +22,7 @@ def prepare_data(par, paths): std_sm_name = std_sm_name.fillna(0) cell_types = de_cell_type.groupby('cell_type').quantile(0.1).reset_index()['cell_type'] # This is just to get cell types in the right order for the next line quantiles_cell_type = pd.concat([pd.DataFrame(cell_types)]+[de_cell_type.groupby('cell_type')[col]\ - .quantile([0.25, 0.50, 0.75], interpolation='linear').unstack().reset_index(drop=True) for col in list(de_train.columns)[5:]], axis=1) + .quantile([0.25, 0.50, 0.75], interpolation='linear').unstack().reset_index(drop=True) for col in list(de_train_df.columns)[5:]], axis=1) ## Save data 
augmentation features print(paths["train_data_aug_dir"]) if not os.path.exists(paths["train_data_aug_dir"]): @@ -34,10 +34,10 @@ def prepare_data(par, paths): std_sm_name.to_csv(f'{paths["train_data_aug_dir"]}/std_sm_name.csv', index=False) quantiles_cell_type.to_csv(f'{paths["train_data_aug_dir"]}/quantiles_cell_type.csv', index=False) ## Create one hot encoding features - one_hot_encode(de_train[["cell_type", "sm_name"]], id_map[["cell_type", "sm_name"]], out_dir=paths["train_data_aug_dir"]) + one_hot_encode(de_train_df[["cell_type", "sm_name"]], id_map[["cell_type", "sm_name"]], out_dir=paths["train_data_aug_dir"]) ## Prepare ChemBERTa features - save_ChemBERTa_features(de_train["SMILES"].tolist(), out_dir=paths["train_data_aug_dir"], on_train_data=True) - sm_name2smiles = {smname:smiles for smname, smiles in zip(de_train['sm_name'], de_train['SMILES'])} + save_ChemBERTa_features(de_train_df["SMILES"].tolist(), out_dir=paths["train_data_aug_dir"], on_train_data=True) + sm_name2smiles = {smname:smiles for smname, smiles in zip(de_train_df['sm_name'], de_train_df['SMILES'])} test_smiles = list(map(sm_name2smiles.get, id_map['sm_name'].values)) save_ChemBERTa_features(test_smiles, out_dir=paths["train_data_aug_dir"], on_train_data=False) print("### Done.") diff --git a/src/methods/lgc_ensemble_helpers/train.py b/src/methods/lgc_ensemble_helpers/train.py index 1d6ae5a5..991fc3fb 100644 --- a/src/methods/lgc_ensemble_helpers/train.py +++ b/src/methods/lgc_ensemble_helpers/train.py @@ -12,12 +12,12 @@ def train(par, paths): "KF_N_SPLITS": par["kf_n_splits"], } print("\nRead data and build features...") - de_train_h5ad = ad.read_h5ad(par["de_train_h5ad"]) - de_train = anndata_to_dataframe(de_train_h5ad, par["layer"]) - de_train = de_train.drop(columns=['split']) + de_train = ad.read_h5ad(par["de_train"]) + de_train_df = anndata_to_dataframe(de_train, par["layer"]) + de_train_df = de_train_df.drop(columns=['split']) ylist = ['cell_type','sm_name','sm_lincs_id','SMILES','control'] one_hot_train = pd.DataFrame(np.load(f'{paths["train_data_aug_dir"]}/one_hot_train.npy')) - y = de_train.drop(columns=ylist) + y = de_train_df.drop(columns=ylist) mean_cell_type = pd.read_csv(f'{paths["train_data_aug_dir"]}/mean_cell_type.csv') std_cell_type = pd.read_csv(f'{paths["train_data_aug_dir"]}/std_cell_type.csv') mean_sm_name = pd.read_csv(f'{paths["train_data_aug_dir"]}/mean_sm_name.csv') @@ -26,11 +26,11 @@ def train(par, paths): train_chem_feat = np.load(f'{paths["train_data_aug_dir"]}/chemberta_train.npy') train_chem_feat_mean = np.load(f'{paths["train_data_aug_dir"]}/chemberta_train_mean.npy') X_vec = combine_features([mean_cell_type, std_cell_type, mean_sm_name, std_sm_name],\ - [train_chem_feat, train_chem_feat_mean], de_train, one_hot_train) + [train_chem_feat, train_chem_feat_mean], de_train_df, one_hot_train) X_vec_light = combine_features([mean_cell_type,mean_sm_name],\ - [train_chem_feat, train_chem_feat_mean], de_train, one_hot_train) + [train_chem_feat, train_chem_feat_mean], de_train_df, one_hot_train) X_vec_heavy = combine_features([quantiles_df,mean_cell_type,mean_sm_name],\ - [train_chem_feat,train_chem_feat_mean], de_train, one_hot_train, quantiles_df) + [train_chem_feat,train_chem_feat_mean], de_train_df, one_hot_train, quantiles_df) ## Start training print(f"Mean cell type:{mean_cell_type.shape}") print(f"Std cell type:{std_cell_type.shape}") @@ -42,9 +42,9 @@ def train(par, paths): print(f"X_vec:{X_vec.shape}") print(f"X_vec_light:{X_vec_light.shape}") 
print(f"X_vec_heavy:{X_vec_heavy.shape}") - print(f"de_train:{de_train.shape}") + print(f"de_train:{de_train_df.shape}") print(f"Y:{y.shape}") - cell_types_sm_names = de_train[['cell_type', 'sm_name']] + cell_types_sm_names = de_train_df[['cell_type', 'sm_name']] print("\nTraining starting...") train_validate(X_vec, X_vec_light, X_vec_heavy, y, cell_types_sm_names, train_config, par, paths) print("\nDone.") diff --git a/src/methods/lgc_ensemble_predict/script.py b/src/methods/lgc_ensemble_predict/script.py index 3a03591f..3c32b96e 100644 --- a/src/methods/lgc_ensemble_predict/script.py +++ b/src/methods/lgc_ensemble_predict/script.py @@ -145,7 +145,7 @@ df_sub.reset_index(drop=True, inplace=True) # write output -method_id = meta["functionality_name"].replace("_predict", "") +method_id = meta["name"].replace("_predict", "") output = ad.AnnData( layers={"prediction": df_sub.to_numpy()}, obs=pd.DataFrame(index=id_map["id"]), diff --git a/src/methods/lgc_ensemble_prepare/config.vsh.yaml b/src/methods/lgc_ensemble_prepare/config.vsh.yaml index 60e56b0e..ee4ba12f 100644 --- a/src/methods/lgc_ensemble_prepare/config.vsh.yaml +++ b/src/methods/lgc_ensemble_prepare/config.vsh.yaml @@ -1,7 +1,7 @@ name: lgc_ensemble_prepare namespace: methods arguments: - - name: --de_train_h5ad + - name: --de_train type: file required: false direction: input diff --git a/src/methods/lgc_ensemble_prepare/script.py b/src/methods/lgc_ensemble_prepare/script.py index 7e778627..2cd28373 100644 --- a/src/methods/lgc_ensemble_prepare/script.py +++ b/src/methods/lgc_ensemble_prepare/script.py @@ -13,7 +13,7 @@ ## VIASH START par = { - "de_train_h5ad": "resources/datasets/neurips-2023-data/de_train.h5ad", + "de_train": "resources/datasets/neurips-2023-data/de_train.h5ad", "id_map": "resources/datasets/neurips-2023-data/id_map.csv", "layer": "clipped_sign_log10_pval", "epochs": 10, @@ -46,16 +46,16 @@ ## Read data print("\nPreparing data...", flush=True) -de_train_h5ad = ad.read_h5ad(par["de_train_h5ad"]) -de_train = anndata_to_dataframe(de_train_h5ad, par["layer"]) -de_train = de_train.drop(columns=['split']) +de_train = ad.read_h5ad(par["de_train"]) +de_train_df = anndata_to_dataframe(de_train, par["layer"]) +de_train_df = de_train_df.drop(columns=['split']) id_map = pd.read_csv(par["id_map"]) -gene_names = list(de_train_h5ad.var_names) +gene_names = list(de_train.var_names) print("Create data augmentation", flush=True) -de_cell_type = de_train.iloc[:, [0] + list(range(5, de_train.shape[1]))] -de_sm_name = de_train.iloc[:, [1] + list(range(5, de_train.shape[1]))] +de_cell_type = de_train_df.iloc[:, [0] + list(range(5, de_train_df.shape[1]))] +de_sm_name = de_train_df.iloc[:, [1] + list(range(5, de_train_df.shape[1]))] mean_cell_type = de_cell_type.groupby('cell_type').mean().reset_index() mean_sm_name = de_sm_name.groupby('sm_name').mean().reset_index() std_cell_type = de_cell_type.groupby('cell_type').std().reset_index() @@ -66,7 +66,7 @@ [pd.DataFrame(cell_types)] + [ de_cell_type.groupby('cell_type')[col].quantile([0.25, 0.50, 0.75], interpolation='linear').unstack().reset_index(drop=True) - for col in list(de_train.columns)[5:] + for col in list(de_train_df.columns)[5:] ], axis=1 ) @@ -81,12 +81,12 @@ json.dump(gene_names, f) print("Create one hot encoding features", flush=True) -one_hot_train, _ = one_hot_encode(de_train[["cell_type", "sm_name"]], id_map[["cell_type", "sm_name"]], out_dir=par["train_data_aug_dir"]) +one_hot_train, _ = one_hot_encode(de_train_df[["cell_type", "sm_name"]], id_map[["cell_type", 
"sm_name"]], out_dir=par["train_data_aug_dir"]) one_hot_train = pd.DataFrame(one_hot_train) print("Prepare ChemBERTa features", flush=True) -train_chem_feat, train_chem_feat_mean = save_ChemBERTa_features(de_train["SMILES"].tolist(), out_dir=par["train_data_aug_dir"], on_train_data=True) -sm_name2smiles = {smname:smiles for smname, smiles in zip(de_train['sm_name'], de_train['SMILES'])} +train_chem_feat, train_chem_feat_mean = save_ChemBERTa_features(de_train_df["SMILES"].tolist(), out_dir=par["train_data_aug_dir"], on_train_data=True) +sm_name2smiles = {smname:smiles for smname, smiles in zip(de_train_df['sm_name'], de_train_df['SMILES'])} test_smiles = list(map(sm_name2smiles.get, id_map['sm_name'].values)) _, _ = save_ChemBERTa_features(test_smiles, out_dir=par["train_data_aug_dir"], on_train_data=False) @@ -94,35 +94,35 @@ # interpreted from src/methods/lgc_ensemble/train.py ## Prepare cross-validation -cell_types_sm_names = de_train[['cell_type', 'sm_name']] +cell_types_sm_names = de_train_df[['cell_type', 'sm_name']] cell_types_sm_names.to_csv(f'{par["train_data_aug_dir"]}/cell_types_sm_names.csv', index=False) print("Store Xs and y", flush=True) X_vec = combine_features( [mean_cell_type, std_cell_type, mean_sm_name, std_sm_name], [train_chem_feat, train_chem_feat_mean], - de_train, + de_train_df, one_hot_train ) np.save(f'{par["train_data_aug_dir"]}/X_vec_initial.npy', X_vec) X_vec_light = combine_features( [mean_cell_type, mean_sm_name], [train_chem_feat, train_chem_feat_mean], - de_train, + de_train_df, one_hot_train ) np.save(f'{par["train_data_aug_dir"]}/X_vec_light.npy', X_vec_light) X_vec_heavy = combine_features( [quantiles_cell_type, mean_cell_type, mean_sm_name], [train_chem_feat,train_chem_feat_mean], - de_train, + de_train_df, one_hot_train, quantiles_cell_type ) np.save(f'{par["train_data_aug_dir"]}/X_vec_heavy.npy', X_vec_heavy) ylist = ['cell_type','sm_name','sm_lincs_id','SMILES','control'] -y = de_train.drop(columns=ylist) +y = de_train_df.drop(columns=ylist) np.save(f'{par["train_data_aug_dir"]}/y.npy', y.values) print("Store config and shapes", flush=True) @@ -133,7 +133,7 @@ "KF_N_SPLITS": par["kf_n_splits"], "SCHEMES": par["schemes"], "MODELS": par["models"], - "DATASET_ID": de_train_h5ad.uns["dataset_id"], + "DATASET_ID": de_train.uns["dataset_id"], } with open(f'{par["train_data_aug_dir"]}/config.json', 'w') as file: json.dump(config, file) diff --git a/src/methods/nn_retraining_with_pseudolabels/config.vsh.yaml b/src/methods/nn_retraining_with_pseudolabels/config.vsh.yaml index 58f15e88..16c404be 100644 --- a/src/methods/nn_retraining_with_pseudolabels/config.vsh.yaml +++ b/src/methods/nn_retraining_with_pseudolabels/config.vsh.yaml @@ -2,25 +2,25 @@ __merge__: ../../api/comp_method.yaml name: nn_retraining_with_pseudolabels label: NN retraining with pseudolabels +summary: Neural networks with pseudolabeling and ensemble modelling +description: | + The prediction system is two staged, so I publish two versions of the notebook. + The first stage predicts pseudolabels. To be honest, if I stopped on this version, I would not be the third. + The predicted pseudolabels on all test data (255 rows) are added to training in the second stage. + + **Stage 1 preparing pseudolabels**: The main part of this system is a neural network. Every neural network and its environment was optimized by optuna. 
Hyperparameters that have been optimized:
+  a dropout value, a number of neurons in particular layers, an output dimension of an embedding layer, a number of epochs, a learning rate, a batch size, and the number of dimensions of the truncated singular value decomposition.
+  The optimization was done on a custom 4-fold cross-validation. To avoid Optuna overfitting to the cross-validation, I applied 2 repeats for every fold and took the average. Generally, the more, the better. Optuna's criterion was MRRMSE.
+  Finally, 7 models were ensembled. Optuna was applied again to determine the best weights of the linear combination. The predictions on the test set now serve as pseudolabels and are used in the second stage.
+
+  **Stage 2 retraining with pseudolabels**: The pseudolabels (255 rows) were added to the training dataset. I applied 20 models with optimized parameters in different experiments for model diversity.
+  Optuna selected optimal weights for the linear combination of the predictions again.
+  Models had high variance, so every model was trained 10 times on the full dataset and the median prediction is taken as the final prediction. The prediction was additionally clipped to the column-wise min and max.
+links:
+  documentation: https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/discussion/458750
+  repository: https://github.com/okon2000/single_cell_perturbations
 info:
   neurips2023_rank: 3
-  summary: Neural networks with pseudolabeling and ensemble modelling
-  description: |
-    The prediction system is two staged, so I publish two versions of the notebook.
-    The first stage predicts pseudolabels. To be honest, if I stopped on this version, I would not be the third.
-    The predicted pseudolabels on all test data (255 rows) are added to training in the second stage.
-
-    **Stage 1 preparing pseudolabels**: The main part of this system is a neural network. Every neural network and its environment was optimized by optuna. Hyperparameters that have been optimized:
-    a dropout value, a number of neurons in particular layers, an output dimension of an embedding layer, a number of epochs, a learning rate, a batch size, a number of dimension of truncated singular value decomposition.
-    The optimization was done on custom 4-folds cross validation. In order to avoid overfitting to cross validation by optuna I applied 2 repeats for every fold and took an average. Generally, the more, the better. The optuna's criterion was MRRMSE.
-    Finally, 7 models were ensembled. Optuna was applied again to determine best weights of linear combination. The prediction of test set is the pseudolabels now and will be used in second stage.
-
-    **Stage 2 retraining with pseudolabels**: The pseudolabels (255 rows) were added to the training dataset. I applied 20 models with optimized parameters in different experiments for a model diversity.
-    Optuna selected optimal weights for the linear combination of the prediction again.
-    Models had high variance, so every model was trained 10 times on all dataset and the median of prediction is taken as a final prediction. The prediction was additionally clipped to colwise min and max. 
- reference: null - documentation_url: https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/discussion/458750 - repository_url: https://github.com/okon2000/single_cell_perturbations arguments: - type: integer diff --git a/src/methods/nn_retraining_with_pseudolabels/script.py b/src/methods/nn_retraining_with_pseudolabels/script.py index 9842f6d2..3c3d5962 100644 --- a/src/methods/nn_retraining_with_pseudolabels/script.py +++ b/src/methods/nn_retraining_with_pseudolabels/script.py @@ -20,7 +20,7 @@ ## VIASH START par = { - "de_train_h5ad": "resources/datasets/neurips-2023-data/de_train.h5ad", + "de_train": "resources/datasets/neurips-2023-data/de_train.h5ad", "id_map": "resources/datasets/neurips-2023-data/id_map.csv", "layer": "clipped_sign_log10_pval", "output": "output.h5ad", @@ -37,23 +37,23 @@ from notebook_266 import run_notebook_266 # load train data -de_train_h5ad = ad.read_h5ad(par["de_train_h5ad"]) -train_df = anndata_to_dataframe(de_train_h5ad, par["layer"]) +de_train = ad.read_h5ad(par["de_train"]) +de_train_df = anndata_to_dataframe(de_train, par["layer"]) -train_df = train_df.sample(frac=1.0, random_state=42) -train_df = train_df.reset_index(drop=True) +de_train_df = de_train_df.sample(frac=1.0, random_state=42) +de_train_df = de_train_df.reset_index(drop=True) # load test data id_map = pd.read_csv(par["id_map"]) # determine gene names -gene_names = list(de_train_h5ad.var_names) +gene_names = list(de_train.var_names) # clean up train data -train_df = train_df.loc[:, ["cell_type", "sm_name"] + gene_names] +de_train_df = de_train_df.loc[:, ["cell_type", "sm_name"] + gene_names] # run notebook 264 -pseudolabel = run_notebook_264(train_df, id_map, gene_names, par["reps"]) +pseudolabel = run_notebook_264(de_train_df, id_map, gene_names, par["reps"]) # add metadata pseudolabel = pd.concat( @@ -61,7 +61,7 @@ ) # run notebook 266 -df = run_notebook_266(train_df, id_map, pseudolabel, gene_names, par["reps"]) +df = run_notebook_266(de_train_df, id_map, pseudolabel, gene_names, par["reps"]) print('Write output to file', flush=True) @@ -70,8 +70,8 @@ obs=pd.DataFrame(index=id_map["id"]), var=pd.DataFrame(index=gene_names), uns={ - "dataset_id": de_train_h5ad.uns["dataset_id"], - "method_id": meta["functionality_name"] + "dataset_id": de_train.uns["dataset_id"], + "method_id": meta["name"] } ) diff --git a/src/methods/pyboost/config.vsh.yaml b/src/methods/pyboost/config.vsh.yaml index fa5bea39..47038681 100644 --- a/src/methods/pyboost/config.vsh.yaml +++ b/src/methods/pyboost/config.vsh.yaml @@ -2,26 +2,27 @@ __merge__: ../../api/comp_method.yaml name: pyboost label: Py-boost -info: - neurips2023_rank: 18 - summary: "Py-boost predicting t-scores" - description: | - An ensemble of four models was considered: - - * Py-boost (a ridge regression-based recommender system) - * ExtraTrees (a decision tree ensemble with target-encoded features) - * a k-nearest neighbors recommender system - * a ridge regression model +summary: "Py-boost predicting t-scores" +description: | + An ensemble of four models was considered: + + * Py-boost (a ridge regression-based recommender system) + * ExtraTrees (a decision tree ensemble with target-encoded features) + * a k-nearest neighbors recommender system + * a ridge regression model - Each model offered distinct strengths and weaknesses: ExtraTrees and - knn were unable to extrapolate beyond the training data, while ridge - regression provided extrapolation capability. 
To enhance model performance, - data augmentation techniques were used, including averaging differential - expressions for compound mixtures and adjusting cell counts to reduce biases. + Each model offered distinct strengths and weaknesses: ExtraTrees and + knn were unable to extrapolate beyond the training data, while ridge + regression provided extrapolation capability. To enhance model performance, + data augmentation techniques were used, including averaging differential + expressions for compound mixtures and adjusting cell counts to reduce biases. - In the end, only the py-boost model is used for generating predictions. - documentation_url: https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/discussion/458661 - repository_url: https://github.com/Ambros-M/Single-Cell-Perturbations-2023 + In the end, only the py-boost model is used for generating predictions. +links: + documentation: https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/discussion/458661 + repository: https://github.com/Ambros-M/Single-Cell-Perturbations-2023 +info: + neurips2023_rank: 18 arguments: - type: string name: --predictor_names diff --git a/src/methods/pyboost/script.py b/src/methods/pyboost/script.py index c1abdeb0..95ca62ef 100644 --- a/src/methods/pyboost/script.py +++ b/src/methods/pyboost/script.py @@ -13,7 +13,7 @@ ## VIASH START par = dict( - de_train_h5ad = "resources/datasets/neurips-2023-data/de_train.h5ad", + de_train = "resources/datasets/neurips-2023-data/de_train.h5ad", layer = "clipped_sign_log10_pval", id_map = "resources/datasets/neurips-2023-data/id_map.csv", predictor_names = ["py_boost"], @@ -29,28 +29,28 @@ from helper import predictors print("Loading data\n", flush=True) -de_train_h5ad = ad.read_h5ad(par["de_train_h5ad"]) -de_train = anndata_to_dataframe(de_train_h5ad, par["layer"]) -adata_obs = de_train_h5ad.uns["single_cell_obs"] +de_train = ad.read_h5ad(par["de_train"]) +de_train_df = anndata_to_dataframe(de_train, par["layer"]) +adata_obs = de_train.uns["single_cell_obs"] id_map = pd.read_csv(par['id_map'], index_col = 0) # display(id_map) # 18211 genes -genes = de_train_h5ad.var_names -de_train_indexed = de_train.set_index(['cell_type', 'sm_name'])[genes] +genes = de_train.var_names +de_train_indexed = de_train_df.set_index(['cell_type', 'sm_name'])[genes] # All 146 sm_names -sm_names = sorted(de_train.sm_name.unique()) +sm_names = sorted(de_train_df.sm_name.unique()) # Determine the 17 compounds (including the two control compounds) with data for almost all cell types -train_sm_names = de_train.query("cell_type == 'B cells'").sm_name.sort_values().values +train_sm_names = de_train_df.query("cell_type == 'B cells'").sm_name.sort_values().values # The other 129 sm_names test_sm_names = [sm for sm in sm_names if sm not in train_sm_names] # The three control sm_names controls3 = ['Dabrafenib', 'Belinostat', 'Dimethyl Sulfoxide'] # All 6 cell types -cell_types = list(de_train_h5ad.obs.cell_type.cat.categories) +cell_types = list(de_train.obs.cell_type.cat.categories) test_cell_types = list(id_map.cell_type.unique()) train_cell_types = [ct for ct in cell_types if not ct in test_cell_types] @@ -94,8 +94,8 @@ obs=pd.DataFrame(index=id_map.index), var=pd.DataFrame(index=genes), uns={ - "dataset_id": de_train_h5ad.uns["dataset_id"], - "method_id": meta["functionality_name"] + "dataset_id": de_train.uns["dataset_id"], + "method_id": meta["name"] } ) diff --git a/src/methods/scape/config.vsh.yaml b/src/methods/scape/config.vsh.yaml index 
14222f55..4766130b 100644
--- a/src/methods/scape/config.vsh.yaml
+++ b/src/methods/scape/config.vsh.yaml
@@ -1,22 +1,22 @@
 __merge__: ../../api/comp_method.yaml
 name: scape
 label: ScAPE
+summary: Neural network model for drug effect prediction
+description: |
+  ScAPE utilises a neural network (NN) model to estimate drug effects on gene expression in
+  peripheral blood mononuclear cells (PBMCs). The model took drug and cell features as input,
+  with these features primarily derived from the median of signed log-pvalues and log fold-changes
+  grouped by drug and cell type. The NN was trained using a leave-one-drug-out cross-validation
+  strategy, focusing on NK cells as a representative cell type due to their similarity to B cells
+  and Myeloid cells in principal component analysis. Model performance was evaluated by comparing
+  its predictions against two baselines: predicting zero effect and predicting the median
+  log-pvalue for each drug. The final submission combined predictions from models trained on
+  different gene and drug subsets, aiming to enhance overall prediction accuracy.
+links:
+  documentation: https://docs.google.com/document/d/1w0GIJ8VoQx3HEJNmLXoU-Y_STB-h5-bXusL80_6EVuU/edit
+  repository: https://github.com/scapeML/scape
 info:
   neurips2023_rank: 16
-  summary: Neural network model for drug effect prediction
-  description: |
-    ScAPE is utilises a neural network (NN) model to estimate drug effects on gene expression in
-    peripheral blood mononuclear cells (PBMCs). The model took drug and cell features as input,
-    with these features primarily derived from the median of signed log-pvalues and log fold-changes
-    grouped by drug and cell type. The NN was trained using a leave-one-drug-out cross-validation
-    strategy, focusing on NK cells as a representative cell type due to their similarity to B cells
-    and Myeloid cells in principal component analysis. Model performance was evaluated by comparing
-    its predictions against two baselines: predicting zero effect and predicting the median
-    log-pvalue for each drug. The final submission combined predictions from models trained on
-    different gene and drug subsets, aiming to enhance overall prediction accuracy. 
diff --git a/src/methods/scape/script.py b/src/methods/scape/script.py
index 6f824000..34cd295f 100644
--- a/src/methods/scape/script.py
+++ b/src/methods/scape/script.py
@@ -1,4 +1,5 @@
-import sys, fastparquet, shutil
+import sys
+import shutil
 import pandas as pd
 import anndata as ad
 import numpy as np
@@ -13,7 +14,7 @@
 ## VIASH START
 par = dict(
-    de_train_h5ad = "resources/datasets/neurips-2023-data/de_train.h5ad",
+    de_train = "resources/datasets/neurips-2023-data/de_train.h5ad",
     id_map = "resources/datasets/neurips-2023-data/id_map.csv",
     output = "output.h5ad",
     output_model = None,
@@ -34,17 +35,17 @@
 )
 ## VIASH END
-def write_predictions(df_submission_data, par, meta, de_train_h5ad, id_map):
+def write_predictions(df_submission_data, par, meta, de_train, id_map):
     # Write the files
     print('Write output to file', flush=True)
-    genes = list(de_train_h5ad.var_names)
+    genes = list(de_train.var_names)
     output = ad.AnnData(
         layers={"prediction": df_submission_data[genes].to_numpy()},
         obs=pd.DataFrame(index=id_map["id"]),
         var=pd.DataFrame(index=genes),
         uns={
-            "dataset_id": de_train_h5ad.uns["dataset_id"],
-            "method_id": meta["functionality_name"]
+            "dataset_id": de_train.uns["dataset_id"],
+            "method_id": meta["name"]
         }
     )
@@ -62,7 +63,7 @@
     atexit.register(lambda: shutil.rmtree(model_dir))
 # load log pvals
-de_train_h5ad = ad.read_h5ad(par["de_train_h5ad"])
+de_train = ad.read_h5ad(par["de_train"])
 # construct data frames
 def get_df(adata, layer):
@@ -74,8 +75,8 @@ def get_df(adata, layer):
         axis=1
     ).set_index(['cell_type', 'sm_name'])
-df_de = get_df(de_train_h5ad, par["layer"])
-df_lfc = get_df(de_train_h5ad, "logFC")
+df_de = get_df(de_train, par["layer"])
+df_lfc = get_df(de_train, "logFC")
 # Make sure rows/columns are in the same order
@@ -152,7 +153,7 @@ def confirm_celltype(df_de, cell, sm_name=None):
     if len(top_drugs) == 0:
         # df_focus is not computed, just return the original submission
         df_submission_data = df_sub_ix.join(df_sub).reset_index(drop=True)
-        write_predictions(df_submission_data, par, meta, de_train_h5ad, id_map)
+        write_predictions(df_submission_data, par, meta, de_train, id_map)
         sys.exit(0)
 df_de_c = df_de[df_de.index.get_level_values("sm_name").isin(top_drugs)]
@@ -191,4 +192,4 @@
 df_submission = 0.80 * df_focus + 0.20 * df_sub
 df_submission_data = df_sub_ix.join(df_submission).reset_index(drop=True)
-write_predictions(df_submission_data, par, meta, de_train_h5ad, id_map)
\ No newline at end of file
+write_predictions(df_submission_data, par, meta, de_train, id_map)
\ No newline at end of file
diff --git a/src/methods/transformer_ensemble/config.vsh.yaml b/src/methods/transformer_ensemble/config.vsh.yaml
index e77173ea..bf8e8b80 100644
--- a/src/methods/transformer_ensemble/config.vsh.yaml
+++ b/src/methods/transformer_ensemble/config.vsh.yaml
@@ -2,22 +2,24 @@ __merge__: ../../api/comp_method.yaml
 name: transformer_ensemble
 label: Transformer Ensemble
+summary: An ensemble of four transformer models, trained on diverse feature sets, with a cluster-based sampling strategy and robust validation for optimal performance.
+description: |
+  This method employs an ensemble of four transformer models,
+  each with different weights and trained on slightly varying feature sets.
+  The feature engineering process involved one-hot encoding of categorical labels,
+  target encoding using mean and standard deviation, and enriching the feature set
+  with the standard deviation of target variables. Additionally, the dataset was
+  carefully examined to ensure data cleanliness. A sophisticated sampling strategy
+  based on K-Means clustering was employed to partition the data into training and
+  validation sets, ensuring a representative distribution. The model architecture
+  leveraged sparse and dense feature encoding, along with a transformer for effective
+  learning.
+links:
+  documentation: https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/discussion/458738
+  repository: https://github.com/Eliorkalfon/single_cell_pb
+references: {}
 info:
   neurips2023_rank: 2
-  summary: An ensemble of four transformer models, trained on diverse feature sets, with a cluster-based sampling strategy and robust validation for optimal performance.
-  description: |
-    This method employs an ensemble of four transformer models,
-    each with different weights and trained on slightly varying feature sets.
-    The feature engineering process involved one-hot encoding of categorical labels,
-    target encoding using mean and standard deviation, and enriching the feature set
-    with the standard deviation of target variables. Additionally, the dataset was
-    carefully examined to ensure data cleanliness. A sophisticated sampling strategy
-    based on K-Means clustering was employed to partition the data into training and
-    validation sets, ensuring a representative distribution. The model architecture
-    leveraged sparse and dense feature encoding, along with a transformer for effective
-    learning.
-  documentation_url: https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/discussion/458738
-  repository_url: https://github.com/Eliorkalfon/single_cell_pb
 arguments:
   - name: --num_train_epochs
     type: integer
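The one-hot plus mean/std target encoding described above is implemented in the patched `utils.py` further below. As a compact, self-contained sketch of the same idea (toy data; `transform` is used here instead of the groupby-then-merge pattern of the actual code):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)

# Toy stand-in for the training table: categorical labels plus per-gene targets.
obs = pd.DataFrame({
    "cell_type": rng.choice(["B cells", "NK cells"], size=8),
    "sm_name": rng.choice(["Belinostat", "Dabrafenib"], size=8),
})
targets = pd.DataFrame(rng.normal(size=(8, 3)), columns=["g1", "g2", "g3"])

# One-hot encoding of the categorical labels.
one_hot = pd.get_dummies(obs, columns=["cell_type", "sm_name"])

# Target encoding: per-label mean and std of the targets, broadcast back to rows;
# fillna(0) mirrors the std handling in the patched utils.py below.
joined = pd.concat([obs, targets], axis=1)
mean_ct = joined.groupby("cell_type")[list(targets.columns)].transform("mean")
std_ct = joined.groupby("cell_type")[list(targets.columns)].transform("std").fillna(0)

features = pd.concat(
    [one_hot, mean_ct.add_prefix("mean_"), std_ct.add_prefix("std_")], axis=1
)
```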
diff --git a/src/methods/transformer_ensemble/script.py b/src/methods/transformer_ensemble/script.py
index f6368f10..5871ff3d 100644
--- a/src/methods/transformer_ensemble/script.py
+++ b/src/methods/transformer_ensemble/script.py
@@ -8,7 +8,7 @@
 ## VIASH START
 par = {
-    "de_train_h5ad": "resources/neurips-2023-kaggle/de_train.h5ad",
+    "de_train": "resources/neurips-2023-kaggle/de_train.h5ad",
     "id_map": "resources/neurips-2023-kaggle/id_map.csv",
     "output": "output/prediction.h5ad",
     "output_model": "output/model/",
@@ -33,17 +33,17 @@
 os.makedirs(par["output_model"], exist_ok=True)
 # read data
-de_train_h5ad = ad.read_h5ad(par["de_train_h5ad"])
+de_train = ad.read_h5ad(par["de_train"])
 id_map = pd.read_csv(par["id_map"])
 # convert .obs categoricals to string for ease of use
-for col in de_train_h5ad.obs.select_dtypes(include=["category"]).columns:
-    de_train_h5ad.obs[col] = de_train_h5ad.obs[col].astype(str)
+for col in de_train.obs.select_dtypes(include=["category"]).columns:
+    de_train.obs[col] = de_train.obs[col].astype(str)
 # reset index
-de_train_h5ad.obs.reset_index(drop=True, inplace=True)
+de_train.obs.reset_index(drop=True, inplace=True)
 # determine other variables
-gene_names = list(de_train_h5ad.var_names)
+gene_names = list(de_train.var_names)
 n_components = len(gene_names)
 # train and predict models
@@ -89,14 +89,14 @@
 print(f"> Prepare augmented data", flush=True)
 if argset["mean_std"] == "mean_std":
     one_hot_encode_features, targets, one_hot_test = prepare_augmented_data(
-        de_train_h5ad=de_train_h5ad,
+        de_train=de_train,
         id_map=id_map,
         layer=par["layer"],
         uncommon=argset["uncommon"],
     )
 elif argset["mean_std"] == "mean":
     one_hot_encode_features, targets, one_hot_test = prepare_augmented_data_mean_only(
-        de_train_h5ad=de_train_h5ad,
+        de_train=de_train,
         id_map=id_map,
         layer=par["layer"],
     )
@@ -180,8 +180,8 @@
     obs=pd.DataFrame(index=id_map["id"]),
     var=pd.DataFrame(index=gene_names),
     uns={
-        "dataset_id": de_train_h5ad.uns["dataset_id"],
-        "method_id": meta["functionality_name"]
+        "dataset_id": de_train.uns["dataset_id"],
+        "method_id": meta["name"]
     }
 )
diff --git a/src/methods/transformer_ensemble/utils.py b/src/methods/transformer_ensemble/utils.py
index 5b4a7a0b..cfc9d3cd 100644
--- a/src/methods/transformer_ensemble/utils.py
+++ b/src/methods/transformer_ensemble/utils.py
@@ -22,33 +22,33 @@ def reduce_labels(Y, n_components):
 def prepare_augmented_data(
-    de_train_h5ad,
+    de_train,
     id_map,
     layer,
     uncommon=False
 ):
     xlist = ['cell_type', 'sm_name']
     y = pd.DataFrame(
-        de_train_h5ad.layers[layer],
-        columns=de_train_h5ad.var_names,
-        index=de_train_h5ad.obs.index
+        de_train.layers[layer],
+        columns=de_train.var_names,
+        index=de_train.obs.index
     )
     # Combine train and test data for one-hot encoding
-    combined_data = pd.concat([de_train_h5ad.obs[xlist], id_map[xlist]])
+    combined_data = pd.concat([de_train.obs[xlist], id_map[xlist]])
     dum_data = pd.get_dummies(combined_data, columns=xlist)
     # Split the combined data back into train and test
-    train = dum_data.iloc[:de_train_h5ad.n_obs]
-    test = dum_data.iloc[de_train_h5ad.n_obs:]
+    train = dum_data.iloc[:de_train.n_obs]
+    test = dum_data.iloc[de_train.n_obs:]
     if uncommon:
         uncommon = [f for f in train if f not in test]
         X = train.drop(columns=uncommon)
     X = train
-    de_cell_type = pd.concat([de_train_h5ad.obs[['cell_type']], y], axis=1)
-    de_sm_name = pd.concat([de_train_h5ad.obs[['sm_name']], y], axis=1)
+    de_cell_type = pd.concat([de_train.obs[['cell_type']], y], axis=1)
+    de_sm_name = pd.concat([de_train.obs[['sm_name']], y], axis=1)
     mean_cell_type = de_cell_type.groupby('cell_type').mean().reset_index()
     std_cell_type = de_cell_type.groupby('cell_type').std().reset_index().fillna(0)
@@ -111,30 +111,30 @@ def prepare_augmented_data(
 def prepare_augmented_data_mean_only(
-    de_train_h5ad,
+    de_train,
     layer,
     id_map
 ):
     xlist = ['cell_type', 'sm_name']
     y = pd.DataFrame(
-        de_train_h5ad.layers[layer],
-        columns=de_train_h5ad.var_names,
-        index=de_train_h5ad.obs.index
+        de_train.layers[layer],
+        columns=de_train.var_names,
+        index=de_train.obs.index
     )
     # Combine train and test data for one-hot encoding
-    combined_data = pd.concat([de_train_h5ad.obs[xlist], id_map[xlist]])
+    combined_data = pd.concat([de_train.obs[xlist], id_map[xlist]])
     dum_data = pd.get_dummies(combined_data, columns=xlist)
     # Split the combined data back into train and test
-    train = dum_data.iloc[:de_train_h5ad.n_obs]
-    test = dum_data.iloc[de_train_h5ad.n_obs:]
+    train = dum_data.iloc[:de_train.n_obs]
+    test = dum_data.iloc[de_train.n_obs:]
     # uncommon = [f for f in train if f not in test]
     # X = train.drop(columns=uncommon)
     X = train
-    de_cell_type = pd.concat([de_train_h5ad.obs[['cell_type']], y], axis=1)
-    de_sm_name = pd.concat([de_train_h5ad.obs[['sm_name']], y], axis=1)
+    de_cell_type = pd.concat([de_train.obs[['cell_type']], y], axis=1)
+    de_sm_name = pd.concat([de_train.obs[['sm_name']], y], axis=1)
     mean_cell_type = de_cell_type.groupby('cell_type').mean().reset_index()
     mean_sm_name = de_sm_name.groupby('sm_name').mean().reset_index()
     rows = []
diff --git a/src/metrics/mean_rowwise_correlation/config.vsh.yaml b/src/metrics/mean_rowwise_correlation/config.vsh.yaml
index faeec66a..b1598603 100644
--- a/src/metrics/mean_rowwise_correlation/config.vsh.yaml
+++ b/src/metrics/mean_rowwise_correlation/config.vsh.yaml
@@ -13,11 +13,20 @@ info:
       $$
       where $(R)$ is the number of scored rows, and $(\mathbf{y}_i)$ and $(\mathbf{\hat{y}}_i)$ are the actual and predicted values, respectively, for row $(i)$.
-      repository_url: null
-      documentation_url: null
       min: -1
       max: 1
       maximize: true
+      references:
+        bibtex: |
+          @inproceedings{slazata2024benchmark,
+            title = {A benchmark for prediction of transcriptomic responses to chemical perturbations across cell types},
+            author = {Artur Szałata and Andrew Benz and Robrecht Cannoodt and Mauricio Cortes and Jason Fong and Sunil Kuppasani and Richard Lieberman and Tianyu Liu and Javier A. Mas-Rosario and Rico Meinl and Jalil Nourisa and Jared Tumiel and Tin M. Tunjic and Mengbo Wang and Noah Weber and Hongyu Zhao and Benedict Anchang and Fabian J Theis and Malte D Luecken and Daniel B Burkhardt},
+            booktitle = {The Thirty-eighth Conference on Neural Information Processing Systems Datasets and Benchmarks Track},
+            year = {2024},
+            url = {https://openreview.net/forum?id=WTI4RJYSVm}
+          }
+      links: {}
+
     - name: mean_rowwise_spearman
       label: Mean Rowwise Spearman
      summary: The mean of Spearman correlations per row (perturbation).
       $$
       where $(R)$ is the number of scored rows, and $(\mathbf{r}_i)$ and $(\mathbf{\hat{r}}_i)$ are the ranks of the actual and predicted values, respectively, for row $(i)$.
-      repository_url: null
-      documentation_url: null
       min: -1
       max: 1
       maximize: true
+      references:
+        bibtex: |
+          @inproceedings{slazata2024benchmark,
+            title = {A benchmark for prediction of transcriptomic responses to chemical perturbations across cell types},
+            author = {Artur Szałata and Andrew Benz and Robrecht Cannoodt and Mauricio Cortes and Jason Fong and Sunil Kuppasani and Richard Lieberman and Tianyu Liu and Javier A. Mas-Rosario and Rico Meinl and Jalil Nourisa and Jared Tumiel and Tin M. Tunjic and Mengbo Wang and Noah Weber and Hongyu Zhao and Benedict Anchang and Fabian J Theis and Malte D Luecken and Daniel B Burkhardt},
+            booktitle = {The Thirty-eighth Conference on Neural Information Processing Systems Datasets and Benchmarks Track},
+            year = {2024},
+            url = {https://openreview.net/forum?id=WTI4RJYSVm}
+          }
+      links: {}
     - name: mean_rowwise_cosine
       label: Mean Rowwise Cosine
       summary: The mean of cosine similarities per row (perturbation).
       $$
       where $(R)$ is the number of scored rows, and $(\mathbf{y}_i)$ and $(\mathbf{\hat{y}}_i)$ are the actual and predicted values, respectively, for row $(i)$.
-      repository_url: null
-      documentation_url: null
       min: -1
       max: 1
       maximize: true
+      references:
+        bibtex: |
+          @inproceedings{slazata2024benchmark,
+            title = {A benchmark for prediction of transcriptomic responses to chemical perturbations across cell types},
+            author = {Artur Szałata and Andrew Benz and Robrecht Cannoodt and Mauricio Cortes and Jason Fong and Sunil Kuppasani and Richard Lieberman and Tianyu Liu and Javier A. Mas-Rosario and Rico Meinl and Jalil Nourisa and Jared Tumiel and Tin M. Tunjic and Mengbo Wang and Noah Weber and Hongyu Zhao and Benedict Anchang and Fabian J Theis and Malte D Luecken and Daniel B Burkhardt},
+            booktitle = {The Thirty-eighth Conference on Neural Information Processing Systems Datasets and Benchmarks Track},
+            year = {2024},
+            url = {https://openreview.net/forum?id=WTI4RJYSVm}
+          }
+      links: {}
 resources:
   - type: r_script
     path: script.R
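For reference, the three correlation-based metrics defined in this config reduce to a few lines of NumPy. The actual components are R scripts (see the diff below); this is an illustrative sketch only, assuming dense matrices `y` (ground truth) and `y_hat` (prediction) whose rows are perturbations:

```python
import numpy as np
from scipy.stats import rankdata

def mean_rowwise_cosine(y, y_hat):
    # Cosine similarity per row, averaged over the R rows.
    num = (y * y_hat).sum(axis=1)
    den = np.linalg.norm(y, axis=1) * np.linalg.norm(y_hat, axis=1)
    return float(np.mean(num / den))

def mean_rowwise_pearson(y, y_hat):
    # Pearson per row equals the cosine similarity of the row-centered vectors.
    return mean_rowwise_cosine(
        y - y.mean(axis=1, keepdims=True),
        y_hat - y_hat.mean(axis=1, keepdims=True),
    )

def mean_rowwise_spearman(y, y_hat):
    # Spearman per row equals Pearson computed on the within-row ranks.
    ranks = lambda m: np.apply_along_axis(rankdata, 1, m)
    return mean_rowwise_pearson(ranks(y), ranks(y_hat))
```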
diff --git a/src/metrics/mean_rowwise_correlation/script.R b/src/metrics/mean_rowwise_correlation/script.R
index 5048d072..78c8c8d2 100644
--- a/src/metrics/mean_rowwise_correlation/script.R
+++ b/src/metrics/mean_rowwise_correlation/script.R
@@ -3,7 +3,7 @@ library(rlang)
 ## VIASH START
 par <- list(
-  de_test_h5ad = "resources/datasets/neurips-2023-data/de_test.h5ad",
+  de_test = "resources/datasets/neurips-2023-data/de_test.h5ad",
   de_test_layer = "clipped_sign_log10_pval",
   prediction = "resources/datasets/neurips-2023-data/prediction.h5ad",
   prediction_layer = "prediction",
@@ -13,7 +13,7 @@
 ## VIASH END
 cat("Load data\n")
-de_test <- read_h5ad(par$de_test_h5ad)
+de_test <- read_h5ad(par$de_test)
 cat("de_test: "); print(de_test)
 prediction <- read_h5ad(par$prediction)
 cat("prediction: "); print(prediction)
diff --git a/src/metrics/mean_rowwise_error/config.vsh.yaml b/src/metrics/mean_rowwise_error/config.vsh.yaml
index dc385eb6..82683fa9 100644
--- a/src/metrics/mean_rowwise_error/config.vsh.yaml
+++ b/src/metrics/mean_rowwise_error/config.vsh.yaml
@@ -13,11 +13,19 @@ info:
       $$
       where $(R)$ is the number of scored rows, and $(y_{ij})$ and $(\widehat{y}_{ij})$ are the actual and predicted values, respectively, for row $(i)$ and column $(j)$, and $(n)$ is the number of columns.
-      repository_url: null
-      documentation_url: null
       min: 0
-      max: "+inf"
+      max: +.inf
       maximize: false
+      references:
+        bibtex: |
+          @inproceedings{slazata2024benchmark,
+            title = {A benchmark for prediction of transcriptomic responses to chemical perturbations across cell types},
+            author = {Artur Szałata and Andrew Benz and Robrecht Cannoodt and Mauricio Cortes and Jason Fong and Sunil Kuppasani and Richard Lieberman and Tianyu Liu and Javier A. Mas-Rosario and Rico Meinl and Jalil Nourisa and Jared Tumiel and Tin M. Tunjic and Mengbo Wang and Noah Weber and Hongyu Zhao and Benedict Anchang and Fabian J Theis and Malte D Luecken and Daniel B Burkhardt},
+            booktitle = {The Thirty-eighth Conference on Neural Information Processing Systems Datasets and Benchmarks Track},
+            year = {2024},
+            url = {https://openreview.net/forum?id=WTI4RJYSVm}
+          }
+      links: {}
     - name: mean_rowwise_mae
       label: Mean Rowwise MAE
       summary: The mean of the absolute error (MAE) of each row in the matrix.
       $$
       where $(R)$ is the number of scored rows, and $(y_{ij})$ and $(\widehat{y}_{ij})$ are the actual and predicted values, respectively, for row $(i)$ and column $(j)$, and $(n)$ is the number of columns.
-      repository_url: null
-      documentation_url: null
       min: 0
-      max: "+inf"
+      max: +.inf
       maximize: false
+      references:
+        bibtex: |
+          @inproceedings{slazata2024benchmark,
+            title = {A benchmark for prediction of transcriptomic responses to chemical perturbations across cell types},
+            author = {Artur Szałata and Andrew Benz and Robrecht Cannoodt and Mauricio Cortes and Jason Fong and Sunil Kuppasani and Richard Lieberman and Tianyu Liu and Javier A. Mas-Rosario and Rico Meinl and Jalil Nourisa and Jared Tumiel and Tin M. Tunjic and Mengbo Wang and Noah Weber and Hongyu Zhao and Benedict Anchang and Fabian J Theis and Malte D Luecken and Daniel B Burkhardt},
+            booktitle = {The Thirty-eighth Conference on Neural Information Processing Systems Datasets and Benchmarks Track},
+            year = {2024},
+            url = {https://openreview.net/forum?id=WTI4RJYSVm}
+          }
+      links: {}
 resources:
   - type: r_script
     path: script.R
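The two error-based metrics in this config translate just as directly. Again, the real components are the R scripts diffed below; this NumPy sketch is illustrative only, with the same `y` / `y_hat` conventions as above:

```python
import numpy as np

def mean_rowwise_rmse(y, y_hat):
    # RMSE per row over the n columns (genes), averaged over the R rows (MRRMSE).
    return float(np.mean(np.sqrt(((y - y_hat) ** 2).mean(axis=1))))

def mean_rowwise_mae(y, y_hat):
    # MAE per row over the n columns, averaged over the R rows.
    return float(np.mean(np.abs(y - y_hat).mean(axis=1)))
```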
diff --git a/src/metrics/mean_rowwise_error/script.R b/src/metrics/mean_rowwise_error/script.R
index fb94bf69..59848b73 100644
--- a/src/metrics/mean_rowwise_error/script.R
+++ b/src/metrics/mean_rowwise_error/script.R
@@ -2,7 +2,7 @@ library(anndata)
 ## VIASH START
 par <- list(
-  de_test_h5ad = "resources/datasets/neurips-2023-data/de_test.h5ad",
+  de_test = "resources/datasets/neurips-2023-data/de_test.h5ad",
   de_test_layer = "clipped_sign_log10_pval",
   prediction = "resources/datasets/neurips-2023-data/prediction.h5ad",
   prediction_layer = "prediction",
@@ -12,7 +12,7 @@
 ## VIASH END
 cat("Load data\n")
-de_test <- read_h5ad(par$de_test_h5ad)
+de_test <- read_h5ad(par$de_test)
 cat("de_test: "); print(de_test)
 prediction <- read_h5ad(par$prediction)
 cat("prediction: "); print(prediction)
diff --git a/src/process_dataset/generate_id_map/config.vsh.yaml b/src/process_dataset/generate_id_map/config.vsh.yaml
index c90eec91..0c06ce8f 100644
--- a/src/process_dataset/generate_id_map/config.vsh.yaml
+++ b/src/process_dataset/generate_id_map/config.vsh.yaml
@@ -8,7 +8,7 @@ info:
   description: |
     This task generates the ID map file for competitors.
 arguments:
-  - name: --de_test_h5ad
+  - name: --de_test
     type: file
     required: true
     direction: input
diff --git a/src/process_dataset/generate_id_map/script.py b/src/process_dataset/generate_id_map/script.py
index 79303562..db46d272 100644
--- a/src/process_dataset/generate_id_map/script.py
+++ b/src/process_dataset/generate_id_map/script.py
@@ -2,16 +2,16 @@
 ## VIASH START
 par = {
-    "de_test_h5ad": "resources/datasets/neurips-2023-data/de_test.h5ad",
+    "de_test": "resources/datasets/neurips-2023-data/de_test.h5ad",
     "id_map": "resources/datasets/neurips-2023-data/id_map.csv",
 }
 ## VIASH END
 print(">> Load dataset", flush=True)
-de_test_h5ad = ad.read_h5ad(par["de_test_h5ad"])
+de_test = ad.read_h5ad(par["de_test"])
 print(">> Generate id_map file", flush=True)
-id_map = de_test_h5ad.obs[["sm_name", "cell_type"]]
+id_map = de_test.obs[["sm_name", "cell_type"]]
 id_map.reset_index(drop=True, inplace=True)
 id_map.reset_index(names="id", inplace=True)
diff --git a/src/process_dataset/split_sc/config.vsh.yaml b/src/process_dataset/split_sc/config.vsh.yaml
index bcae3f92..cdd80d70 100644
--- a/src/process_dataset/split_sc/config.vsh.yaml
+++ b/src/process_dataset/split_sc/config.vsh.yaml
@@ -18,12 +18,12 @@ arguments:
     required: true
     direction: input
     example: resources/neurips-2023-data/pseudobulk_cleaned.h5ad
-  - name: --sc_train_h5ad
+  - name: --sc_train
     type: file
     required: true
     direction: output
     example: sc_train.h5ad
-  - name: --sc_test_h5ad
+  - name: --sc_test
     type: file
     required: true
     direction: output
diff --git a/src/process_dataset/split_sc/script.py b/src/process_dataset/split_sc/script.py
index 71a1e998..f2c440c1 100644
--- a/src/process_dataset/split_sc/script.py
+++ b/src/process_dataset/split_sc/script.py
@@ -6,8 +6,8 @@
 par = {
     'filtered_sc_counts': 'resources/neurips-2023-data/sc_counts_cleaned.h5ad',
     'pseudobulk_filtered_with_uns': 'resources/neurips-2023-data/pseudobulk_cleaned.h5ad',
-    'sc_train_h5ad': 'sc_train.h5ad',
-    'sc_test_h5ad': 'sc_test.h5ad'
+    'sc_train': 'sc_train.h5ad',
+    'sc_test': 'sc_test.h5ad'
 }
 ## VIASH END
@@ -55,5 +55,5 @@
     filtered_sc_counts.obs[col] = filtered_sc_counts.obs[col].astype("category")
 print(">> Save sc dataset into splits", flush=True)
-filtered_sc_counts[filtered_sc_counts.obs["split"] == "train"].write_h5ad(par["sc_train_h5ad"], compression="gzip")
-filtered_sc_counts[filtered_sc_counts.obs["split"] == "test"].write_h5ad(par["sc_test_h5ad"], compression="gzip")
+filtered_sc_counts[filtered_sc_counts.obs["split"] == "train"].write_h5ad(par["sc_train"], compression="gzip")
+filtered_sc_counts[filtered_sc_counts.obs["split"] == "test"].write_h5ad(par["sc_test"], compression="gzip")
diff --git a/src/workflows/process_dataset/main.nf b/src/workflows/process_dataset/main.nf
index 09449697..b4093bb8 100644
--- a/src/workflows/process_dataset/main.nf
+++ b/src/workflows/process_dataset/main.nf
@@ -40,8 +40,8 @@ workflow run_wf {
         pseudobulk_filtered_with_uns: "pseudobulk_filtered_with_uns"
       ],
       toState: [
-        sc_train_h5ad: "sc_train_h5ad",
-        sc_test_h5ad: "sc_test_h5ad"
+        sc_train: "sc_train",
+        sc_test: "sc_test"
       ]
     )
@@ -54,7 +54,7 @@
          output_splits: ["train", "control", "public_test"]
        ]
      },
-      toState: [de_train_h5ad: "output"]
+      toState: [de_train: "output"]
     )
     | run_limma.run(
@@ -66,20 +66,20 @@
          output_splits: ["private_test"]
        ]
      },
-      toState: [de_test_h5ad: "output"]
+      toState: [de_test: "output"]
     )
     | generate_id_map.run(
-      fromState: [de_test_h5ad: "de_test_h5ad"],
+      fromState: [de_test: "de_test"],
      toState: [id_map: "id_map"]
    )
"id_map"] ) | setState([ - "de_train_h5ad", - "de_test_h5ad", + "de_train", + "de_test", "id_map", - "sc_train_h5ad", - "sc_test_h5ad", + "sc_train", + "sc_test", "pseudobulk_filtered_with_uns" ]) diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index 382e3296..9bcc6624 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -3,11 +3,11 @@ namespace: "workflows" argument_groups: - name: Inputs arguments: - - name: "--de_train_h5ad" + - name: "--de_train" __merge__: ../../api/file_de_train.yaml required: true direction: input - - name: "--de_test_h5ad" + - name: "--de_test" __merge__: ../../api/file_de_test.yaml required: true direction: input diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index d6580ec5..6d12f5ea 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -45,20 +45,20 @@ workflow run_wf { metrics: metrics, methodFromState: { id, state, comp -> def new_args = [ - de_train_h5ad: state.de_train_h5ad, + de_train: state.de_train, id_map: state.id_map, layer: state.layer, output: 'predictions/$id.$key.output.h5ad', output_model: null ] - if (comp.config.functionality.info.type == "control_method") { - new_args.de_test_h5ad = state.de_test_h5ad + if (comp.config.info.type == "control_method") { + new_args.de_test = state.de_test } new_args }, methodToState: ["prediction": "output"], metricFromState: [ - de_test_h5ad: "de_test_h5ad", + de_test: "de_test", de_test_layer: "layer", prediction: "prediction" ], @@ -80,7 +80,7 @@ workflow run_wf { // create dataset, method and metric metadata files metadata_ch = input_ch | create_metadata_files( - datasetFromState: [input: "de_train_h5ad"], + datasetFromState: [input: "de_train"], methods: methods, metrics: metrics, meta: meta @@ -126,10 +126,10 @@ def run_benchmark_fun(args) { // add the key prefix to the method and metric names if (keyPrefix && keyPrefix != "") { methods_ = methods.collect{ method -> - method.run(key: keyPrefix + method.config.functionality.name) + method.run(key: keyPrefix + method.config.name) } metrics_ = metrics.collect{ metric -> - metric.run(key: keyPrefix + metric.config.functionality.name) + metric.run(key: keyPrefix + metric.config.name) } } @@ -142,10 +142,10 @@ def run_benchmark_fun(args) { | runEach( components: methods_, filter: { id, state, comp -> - !state.method_ids || state.method_ids.contains(comp.config.functionality.name) + !state.method_ids || state.method_ids.contains(comp.config.name) }, id: { id, state, comp -> - id + "." + comp.config.functionality.name + id + "." + comp.config.name }, fromState: methodFromState, toState: methodToState, @@ -156,10 +156,10 @@ def run_benchmark_fun(args) { | runEach( components: metrics_, filter: { id, state, comp -> - !state.metric_ids || state.metric_ids.contains(comp.config.functionality.name) + !state.metric_ids || state.metric_ids.contains(comp.config.name) }, id: { id, state, comp -> - id + "." + comp.config.functionality.name + id + "." 
+        id + "." + comp.config.name
       },
       fromState: metricFromState,
       toState: metricToState,
diff --git a/src/workflows/run_stability_analysis/main.nf b/src/workflows/run_stability_analysis/main.nf
index 19b6bc17..a4f4fd1a 100644
--- a/src/workflows/run_stability_analysis/main.nf
+++ b/src/workflows/run_stability_analysis/main.nf
@@ -57,16 +57,16 @@ workflow run_wf {
        ]
      },
      toState: [
-        de_test_h5ad: "de_test_h5ad",
-        de_train_h5ad: "de_train_h5ad",
+        de_test: "de_test",
+        de_train: "de_train",
        id_map: "id_map"
      ]
    )
    | run_benchmark.run(
      fromState: [
-        de_train_h5ad: "de_train_h5ad",
-        de_test_h5ad: "de_test_h5ad",
+        de_train: "de_train",
+        de_test: "de_test",
        id_map: "id_map",
        method_ids: "method_ids",
        metric_ids: "metric_ids",