From af3cf2e96c741a2c031b61e0d0d9553acb61a93e Mon Sep 17 00:00:00 2001 From: f641l Date: Wed, 22 Apr 2026 18:06:31 +0200 Subject: [PATCH 01/22] changes many files --- README.md | 2 +- scripts/create_resources/resources.sh | 2 +- scripts/create_resources/test_resources.sh | 27 +++--- scripts/run_benchmark/run_full_local.sh | 2 +- scripts/run_benchmark/run_full_seqeracloud.sh | 2 +- src/api/comp_data_processor.yaml | 2 +- .../process_dataset/config.vsh.yaml | 9 +- .../config/config_default.json | 1 + src/data_processors/process_dataset/script.py | 85 +++++++++---------- src/methods/cellpose/config.vsh.yaml | 8 +- .../process_datasets/config.vsh.yaml | 4 +- src/workflows/process_datasets/main.nf | 4 +- 12 files changed, 70 insertions(+), 78 deletions(-) create mode 100644 src/data_processors/process_dataset/config/config_default.json diff --git a/README.md b/README.md index ccf6db4..9cde50f 100644 --- a/README.md +++ b/README.md @@ -176,7 +176,7 @@ Arguments: | `--input_sp` | `file` | An unprocessed spatial imaging dataset stored as a zarr file. | | `--input_sc` | `file` | An unprocessed dataset as output by a dataset loader. | | `--output_spatial_dataset` | `file` | (*Output*) A spatial transcriptomics dataset, preprocessed for this benchmark. | -| `--output_scrnaseq_reference` | `file` | (*Output*) A single-cell reference dataset, preprocessed for this benchmark. | +| `--output_scrnaseq` | `file` | (*Output*) A single-cell dataset, preprocessed for this benchmark. 
| diff --git a/scripts/create_resources/resources.sh b/scripts/create_resources/resources.sh index 57f4d68..4ba5075 100755 --- a/scripts/create_resources/resources.sh +++ b/scripts/create_resources/resources.sh @@ -18,7 +18,7 @@ cat > /tmp/params.yaml << 'HERE' input_states: s3://openproblems-data/resources/datasets/**/state.yaml rename_keys: 'input:output_dataset' output_state: '$id/state.yaml' -settings: '{"output_train": "$id/train.h5ad", "output_test": "$id/test.h5ad", "output_solution": "$id/solution.h5ad"}' +settings: '{"output_scrnaseq": "$id/output_scrnaseq.h5ad"}' publish_dir: s3://openproblems-data/resources/task_template/datasets/ HERE diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh index 9cb372a..26074a9 100755 --- a/scripts/create_resources/test_resources.sh +++ b/scripts/create_resources/test_resources.sh @@ -13,23 +13,22 @@ cd "$REPO_ROOT" set -e -RAW_DATA=resources_test/common -DATASET_DIR=resources_test/task_template +RAW_DATA=resources_test/task_spatial_segmentation +DATASET_DIR=resources_test/task_spatial_segmentation mkdir -p $DATASET_DIR # process dataset viash run src/data_processors/process_dataset/config.vsh.yaml -- \ - --input $RAW_DATA/cxg_mouse_pancreas_atlas/dataset.h5ad \ - --output_train $DATASET_DIR/cxg_mouse_pancreas_atlas/train.h5ad \ - --output_test $DATASET_DIR/cxg_mouse_pancreas_atlas/test.h5ad \ - --output_solution $DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad + --input_sp $RAW_DATA/mouse_brain_combined/common_ist.zarr \ + --input_sc $RAW_DATA/mouse_brain_combined/common_scrnaseq.h5ad \ + --output_spatial_dataset $DATASET_DIR/output_spatial_dataset.zarr \ + --output_scrnaseq $DATASET_DIR/mouse_brain_combined/output_scrnaseq.h5ad # run one method viash run src/methods/logistic_regression/config.vsh.yaml -- \ - --input_train $DATASET_DIR/cxg_mouse_pancreas_atlas/train.h5ad \ - --input_test $DATASET_DIR/cxg_mouse_pancreas_atlas/test.h5ad \ - --output 
$DATASET_DIR/cxg_mouse_pancreas_atlas/prediction.h5ad + --input $DATASET_DIR/mouse_brain_combined/common_ist.zarr \ + --output $DATASET_DIR/mouse_brain_combined/prediction.h5ad # run one metric viash run src/metrics/accuracy/config.vsh.yaml -- \ @@ -38,12 +37,10 @@ viash run src/metrics/accuracy/config.vsh.yaml -- \ --output $DATASET_DIR/cxg_mouse_pancreas_atlas/score.h5ad # write manual state.yaml. this is not actually necessary but you never know it might be useful -cat > $DATASET_DIR/cxg_mouse_pancreas_atlas/state.yaml << HERE -id: cxg_mouse_pancreas_atlas -train: !file train.h5ad -test: !file test.h5ad -solution: !file solution.h5ad -prediction: !file prediction.h5ad +cat > $DATASET_DIR/mouse_brain_combined/state.yaml << HERE +id: mouse_brain_combined +processed: !file output_scrnaseq.h5ad +segmentation: !file prediction.h5ad score: !file score.h5ad HERE diff --git a/scripts/run_benchmark/run_full_local.sh b/scripts/run_benchmark/run_full_local.sh index f8c1585..4b1aa11 100755 --- a/scripts/run_benchmark/run_full_local.sh +++ b/scripts/run_benchmark/run_full_local.sh @@ -31,7 +31,7 @@ publish_dir="resources/results/${RUN_ID}" # write the parameters to file cat > /tmp/params.yaml << HERE input_states: resources/datasets/**/state.yaml -rename_keys: 'input_train:output_train;input_test:output_test;input_solution:output_solution' +rename_keys: 'input_scrnaseq:output_scrnaseq' output_state: "state.yaml" publish_dir: "$publish_dir" HERE diff --git a/scripts/run_benchmark/run_full_seqeracloud.sh b/scripts/run_benchmark/run_full_seqeracloud.sh index 87d133c..83f37b2 100755 --- a/scripts/run_benchmark/run_full_seqeracloud.sh +++ b/scripts/run_benchmark/run_full_seqeracloud.sh @@ -23,7 +23,7 @@ publish_dir="s3://openproblems-data/resources/task_template/results/${RUN_ID}" # write the parameters to file cat > /tmp/params.yaml << HERE input_states: s3://openproblems-data/resources/task_template/datasets/**/state.yaml -rename_keys: 
'input_train:output_train;input_test:output_test;input_solution:output_solution' +rename_keys: 'input_scrnaseq:output_scrnaseq' output_state: "state.yaml" publish_dir: "$publish_dir" HERE diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml index 22c77aa..9134d64 100644 --- a/src/api/comp_data_processor.yaml +++ b/src/api/comp_data_processor.yaml @@ -23,7 +23,7 @@ argument_groups: __merge__: file_spatial_dataset.yaml direction: output required: true - - name: "--output_scrnaseq_reference" + - name: "--output_scrnaseq" __merge__: file_scrnaseq_reference.yaml direction: output required: true diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml index 0047ae1..92cdd12 100644 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -3,9 +3,8 @@ name: process_dataset arguments: - name: "--method" type: "string" - description: "The process method to assign train/test." - choices: ["batch", "random"] - default: "batch" + description: "The spatial technology data type." + choices: ["xenium"] - name: "--obs_label" type: "string" description: "Which .obs slot to use as label." @@ -18,6 +17,10 @@ arguments: type: "integer" description: "A seed for the subsampling." example: 123 + - name: "--conf" + type: "string" + description: "Config file in json format for data processing parameters." 
+ default: "config/config_default.json" resources: - type: python_script path: script.py diff --git a/src/data_processors/process_dataset/config/config_default.json b/src/data_processors/process_dataset/config/config_default.json new file mode 100644 index 0000000..8d52b6c --- /dev/null +++ b/src/data_processors/process_dataset/config/config_default.json @@ -0,0 +1 @@ +{"span": 1.0, "n_top_genes": 3000} \ No newline at end of file diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index 7cca2bd..97dbe78 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -2,14 +2,16 @@ import random import numpy as np import anndata as ad +import scanpy as sc import openproblems as op +import json ## VIASH START par = { 'input_sp': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr', 'input_sc': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad', - 'output_spatial_dataset': 'output_spatial_dataset.zarr', - 'output_scrnaseq_reference': 'output_scrnaseq_reference.h5ad', + 'output_spatial_dataset': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_spatial_dataset.zarr', + 'output_scrnaseq': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq.h5ad', } meta = { 'resources_dir': 'target/executable/data_processors/process_dataset', @@ -19,7 +21,6 @@ # import helper functions sys.path.append(meta['resources_dir']) -from subset_h5ad_by_format import subset_h5ad_by_format config = op.project.read_viash_config(meta["config"]) @@ -29,54 +30,44 @@ random.seed(par["seed"]) print(">> Load data", flush=True) -adata = ad.read_h5ad(par["input"]) -print("input:", adata) +adata = ad.read_h5ad(par["input_sc"]) +print("input_sc:", adata) -print(f">> Process data using {par['method']} method") -if par["method"] == "batch": - batch_info = adata.obs[par["obs_batch"]] - 
batch_categories = batch_info.dtype.categories - test_batches = random.sample(list(batch_categories), 1) - is_test = [ x in test_batches for x in batch_info ] -elif par["method"] == "random": - train_ix = np.random.choice(adata.n_obs, round(adata.n_obs * 0.8), replace=False) - is_test = [ not x in train_ix for x in range(0, adata.n_obs) ] +print(f">> Process {par['method']} data") -# subset the different adatas -print(">> Figuring which data needs to be copied to which output file", flush=True) -# use par arguments to look for label and batch value in different slots -slot_mapping = { - "obs": { - "label": par["obs_label"], - "batch": par["obs_batch"], - } -} +if par['config']: + print(f">> Perform standard data preprocessing") + with open(par['config'], "r") as f: + config = json.load(f) + + # Add config to params + for key, value in config.items(): + setattr(par, key, value) -print(">> Creating train data", flush=True) -output_train = subset_h5ad_by_format( - adata[[not x for x in is_test]], - config, - "output_train", - slot_mapping -) + adata.layers["counts"] = adata.X.copy() + + sc.pp.normalize_total(adata) + sc.pp.log1p(adata) + adata.layers['normlog'] = adata.X + + sc.pp.highly_variable_genes( + adata, + flavor="seurat_v3", + layer="counts", + span=par['span'], + n_top_genes=par['n_top_genes'] + ) -print(">> Creating test data", flush=True) -output_test = subset_h5ad_by_format( - adata[is_test], - config, - "output_test", - slot_mapping -) + adata.var.sort_values("means") + sc.pp.scale(adata, zero_center=False) + adata.layers['normlogscale'] = adata.X + + adata.X = adata.layers['counts'] -print(">> Creating solution data", flush=True) -output_solution = subset_h5ad_by_format( - adata[is_test], - config, - "output_solution", - slot_mapping -) + # cell area normalization + sc.pp.calculate_qc_metrics(adata, inplace=True) + for x in ['transcript_counts', 'n_genes_by_counts']: + adata.obs[f'canorm_{x}'] = adata.obs[f'{x}'] / adata.obs['cell_area'] print(">> 
Writing data", flush=True) -output_train.write_h5ad(par["output_train"]) -output_test.write_h5ad(par["output_test"]) -output_solution.write_h5ad(par["output_solution"]) +adata.write_h5ad(par["output_scrnaseq"]) diff --git a/src/methods/cellpose/config.vsh.yaml b/src/methods/cellpose/config.vsh.yaml index 46be884..47c6cec 100644 --- a/src/methods/cellpose/config.vsh.yaml +++ b/src/methods/cellpose/config.vsh.yaml @@ -1,11 +1,11 @@ name: cellpose label: "Cellpose" # TODO: update the summary, description and links -summary: "Output of the segmantation methot cellpose" -description: "Output of the segmantation methot cellpose" +summary: "Cellpose-SAM: cell and nucleus segmentation with superhuman generalization." +description: "cellpose is an anatomical segmentation algorithm written in Python 3." links: # these should point to the documentation of the method - documentation: "https://github.com/openproblems-bio/task_ist_preprocessing" - repository: "https://github.com/openproblems-bio/task_ist_preprocessing" + documentation: "https://cellpose.readthedocs.io/en/latest/" + repository: "https://github.com/mouseland/cellpose" references: doi: "10.1038/s41592-020-01018-x" diff --git a/src/workflows/process_datasets/config.vsh.yaml b/src/workflows/process_datasets/config.vsh.yaml index c71286a..fe3b9d4 100644 --- a/src/workflows/process_datasets/config.vsh.yaml +++ b/src/workflows/process_datasets/config.vsh.yaml @@ -18,8 +18,8 @@ argument_groups: __merge__: /src/api/file_spatial_dataset.yaml direction: output required: true - - name: "--output_scrnaseq_reference" - __merge__: /src/api/file_scrnaseq_reference.yaml + - name: "--output_scrnaseq" + __merge__: /src/api/file_scrnaseq.yaml direction: output required: true diff --git a/src/workflows/process_datasets/main.nf b/src/workflows/process_datasets/main.nf index 947a8f1..226e861 100644 --- a/src/workflows/process_datasets/main.nf +++ b/src/workflows/process_datasets/main.nf @@ -45,12 +45,12 @@ workflow run_wf { ], toState: 
[ output_spatial_dataset: "output_spatial_dataset", - output_scrnaseq_reference: "output_scrnaseq_reference" + output_scrnaseq: "output_scrnaseq" ] ) // only output the files for which an output file was specified - | setState(["output_spatial_dataset", "output_scrnaseq_reference"]) + | setState(["output_spatial_dataset", "output_scrnaseq"]) emit: output_ch From ac9c237ca2ae09ed80d7d4a801d9626919bb3483 Mon Sep 17 00:00:00 2001 From: f641l Date: Thu, 23 Apr 2026 20:02:44 +0200 Subject: [PATCH 02/22] bugfix --- README.md | 8 ++++---- src/api/comp_control_method.yaml | 2 +- src/api/comp_data_processor.yaml | 2 +- src/api/comp_metric.yaml | 2 +- .../{file_scrnaseq_reference.yaml => file_scrnaseq.yaml} | 0 src/data_processors/process_dataset/config.vsh.yaml | 4 +++- src/workflows/run_benchmark/config.vsh.yaml | 2 +- 7 files changed, 11 insertions(+), 9 deletions(-) rename src/api/{file_scrnaseq_reference.yaml => file_scrnaseq.yaml} (100%) diff --git a/README.md b/README.md index 9cde50f..26ed0b1 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ flowchart TB file_common_ist("Common iST Dataset") comp_data_processor[/"Data processor"/] file_spatial_dataset("Raw iST Dataset") - file_scrnaseq_reference("scRNA-seq Reference") + file_scrnaseq("scRNA-seq Reference") comp_control_method[/"Control Method"/] comp_method[/"Method"/] comp_metric[/"Metric"/] @@ -48,11 +48,11 @@ flowchart TB file_common_scrnaseq("Common SC Dataset") file_common_ist---comp_data_processor comp_data_processor-->file_spatial_dataset - comp_data_processor-->file_scrnaseq_reference + comp_data_processor-->file_scrnaseq file_spatial_dataset---comp_control_method file_spatial_dataset---comp_method - file_scrnaseq_reference---comp_control_method - file_scrnaseq_reference---comp_metric + file_scrnaseq---comp_control_method + file_scrnaseq---comp_metric comp_control_method-->file_prediction comp_method-->file_prediction comp_metric-->file_score diff --git a/src/api/comp_control_method.yaml 
b/src/api/comp_control_method.yaml index 3f4fa2e..694f004 100644 --- a/src/api/comp_control_method.yaml +++ b/src/api/comp_control_method.yaml @@ -17,7 +17,7 @@ arguments: required: true direction: input - name: "--input_scrnaseq_reference" - __merge__: file_scrnaseq_reference.yaml + __merge__: file_scrnaseq.yaml direction: input required: true - name: --output diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml index 9134d64..137cd12 100644 --- a/src/api/comp_data_processor.yaml +++ b/src/api/comp_data_processor.yaml @@ -24,7 +24,7 @@ argument_groups: direction: output required: true - name: "--output_scrnaseq" - __merge__: file_scrnaseq_reference.yaml + __merge__: file_scrnaseq.yaml direction: output required: true test_resources: diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml index a7470e9..e2d21e6 100644 --- a/src/api/comp_metric.yaml +++ b/src/api/comp_metric.yaml @@ -12,7 +12,7 @@ arguments: direction: input required: true - name: "--input_scrnaseq_reference" - __merge__: file_scrnaseq_reference.yaml + __merge__: file_scrnaseq.yaml direction: input required: true - name: "--output" diff --git a/src/api/file_scrnaseq_reference.yaml b/src/api/file_scrnaseq.yaml similarity index 100% rename from src/api/file_scrnaseq_reference.yaml rename to src/api/file_scrnaseq.yaml diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml index 92cdd12..0aa574c 100644 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -1,5 +1,7 @@ __merge__: ../../api/comp_data_processor.yaml + name: process_dataset + arguments: - name: "--method" type: "string" @@ -21,10 +23,10 @@ arguments: type: "string" description: "Config file in json format for data processing parameters." 
default: "config/config_default.json" + resources: - type: python_script path: script.py - - path: /common/helper_functions/subset_h5ad_by_format.py engines: - type: docker diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index 4ab5f83..dd7f49b 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -9,7 +9,7 @@ argument_groups: direction: output required: true - name: "--input_scrnaseq_reference" - __merge__: /src/api/file_scrnaseq_reference.yaml + __merge__: /src/api/file_scrnaseq.yaml direction: output required: true - name: Outputs From cb6235df4b464ea334e99c473b5520478d5d6336 Mon Sep 17 00:00:00 2001 From: f641l Date: Thu, 23 Apr 2026 20:36:27 +0200 Subject: [PATCH 03/22] bugfix for data_process in config.vsh.yaml --- src/data_processors/process_dataset/config.vsh.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml index 0aa574c..ba70f3a 100644 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -19,7 +19,7 @@ arguments: type: "integer" description: "A seed for the subsampling." example: 123 - - name: "--conf" + - name: "--config" type: "string" description: "Config file in json format for data processing parameters." 
default: "config/config_default.json" From 4e9b8f218fa918f17df50e93fbf78983767fa945 Mon Sep 17 00:00:00 2001 From: f641l Date: Thu, 23 Apr 2026 20:58:29 +0200 Subject: [PATCH 04/22] bugfix for data_process in config.vsh.yaml and script.py --- src/data_processors/process_dataset/config.vsh.yaml | 6 ++++-- src/data_processors/process_dataset/script.py | 4 ++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml index ba70f3a..c4fc3c6 100644 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -20,9 +20,11 @@ arguments: description: "A seed for the subsampling." example: 123 - name: "--config" - type: "string" + type: file description: "Config file in json format for data processing parameters." - default: "config/config_default.json" + required: true + direction: input + example: config/config_default.json resources: - type: python_script diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index 97dbe78..e0954ce 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -12,7 +12,11 @@ 'input_sc': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad', 'output_spatial_dataset': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_spatial_dataset.zarr', 'output_scrnaseq': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq.h5ad', + 'method': 'xenium', + 'seed': 123, + 'config': 'config/config_default.json' } + meta = { 'resources_dir': 'target/executable/data_processors/process_dataset', 'config': 'target/executable/data_processors/process_dataset/.config.vsh.yaml' From ae452afb82cf2a0ce475be92d1bb56c92ded7eeb Mon Sep 17 00:00:00 2001 From: f641l Date: Thu, 23 Apr 2026 21:11:45 +0200 Subject: [PATCH 05/22] 
bugfix for data_process in script.py --- src/data_processors/process_dataset/script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index e0954ce..d5e16f6 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -14,7 +14,7 @@ 'output_scrnaseq': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq.h5ad', 'method': 'xenium', 'seed': 123, - 'config': 'config/config_default.json' + 'config': 'task_spatial_segmentation/src/data_processors/process_dataset/config/config_default.json' } meta = { From 30d308b082be7a3bcff8618f0a5da9fc6ac6f8de Mon Sep 17 00:00:00 2001 From: f641l Date: Thu, 23 Apr 2026 21:16:39 +0200 Subject: [PATCH 06/22] bugfix for data_process in config.vsh.yml --- src/data_processors/process_dataset/config.vsh.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml index c4fc3c6..64444ab 100644 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -20,10 +20,8 @@ arguments: description: "A seed for the subsampling." example: 123 - name: "--config" - type: file + type: "string" description: "Config file in json format for data processing parameters." 
- required: true - direction: input example: config/config_default.json resources: From bc0de4e6399e5fe01706306b5697c7788c724780 Mon Sep 17 00:00:00 2001 From: f641l Date: Thu, 23 Apr 2026 21:26:16 +0200 Subject: [PATCH 07/22] bugfix for data_process in script.py --- src/data_processors/process_dataset/script.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index d5e16f6..7792c9b 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -5,6 +5,7 @@ import scanpy as sc import openproblems as op import json +import shutil ## VIASH START par = { @@ -75,3 +76,6 @@ print(">> Writing data", flush=True) adata.write_h5ad(par["output_scrnaseq"]) + +print(">> Writing spatial data", flush=True) +shutil.copy(par["input_sp"], par["output_spatial_dataset"]) \ No newline at end of file From 4e158a57f2c26b9782e984e6d66d99539dd3d5b5 Mon Sep 17 00:00:00 2001 From: f641l Date: Thu, 23 Apr 2026 21:32:41 +0200 Subject: [PATCH 08/22] bugfix for data_process in script.py --- src/data_processors/process_dataset/script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index 7792c9b..fb5ee5a 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -78,4 +78,4 @@ adata.write_h5ad(par["output_scrnaseq"]) print(">> Writing spatial data", flush=True) -shutil.copy(par["input_sp"], par["output_spatial_dataset"]) \ No newline at end of file +shutil.copytree(par["input_sp"], par["output_spatial_dataset"]) \ No newline at end of file From cb7d5b4291a569c1c27165a1451e47fa3830f509 Mon Sep 17 00:00:00 2001 From: f641l Date: Thu, 23 Apr 2026 21:37:56 +0200 Subject: [PATCH 09/22] bugfix for data_process in script.py --- src/data_processors/process_dataset/script.py | 1 + 1 file 
changed, 1 insertion(+) diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index fb5ee5a..7a48f6d 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -4,6 +4,7 @@ import anndata as ad import scanpy as sc import openproblems as op +import spatialdata as sd import json import shutil From cd1f3d2305b656f0a063103d868e5b5c29c3f112 Mon Sep 17 00:00:00 2001 From: f641l Date: Thu, 23 Apr 2026 22:04:59 +0200 Subject: [PATCH 10/22] changing docker containter in config.vsh.yaml --- src/data_processors/process_dataset/config.vsh.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml index 64444ab..d1dcd00 100644 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -32,6 +32,17 @@ engines: - type: docker image: openproblems/base_python:1 + - type: docker + #image: openproblems/base_pytorch_nvidia:1 # TODO: ideally get gpu image to work + image: openproblems/base_python:1 + setup: + - type: python + packages: scikit-learn + __merge__: + - /src/base/setup_spatialdata_partial.yaml + - type: native + + runners: - type: executable - type: nextflow From 59ed4c16a52758e6490033fc46e1bbadbf48a0bb Mon Sep 17 00:00:00 2001 From: f641l Date: Thu, 23 Apr 2026 22:09:54 +0200 Subject: [PATCH 11/22] changing docker containter in config.vsh.yaml --- src/data_processors/process_dataset/config.vsh.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml index d1dcd00..25700eb 100644 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -29,9 +29,6 @@ resources: path: script.py engines: - - type: docker - image: 
openproblems/base_python:1 - - type: docker #image: openproblems/base_pytorch_nvidia:1 # TODO: ideally get gpu image to work image: openproblems/base_python:1 From c5dfac01afb90b305fbfae097c65c80becc421ed Mon Sep 17 00:00:00 2001 From: f641l Date: Fri, 24 Apr 2026 14:35:15 +0200 Subject: [PATCH 12/22] comment out output_spatial_dataset --- README.md | 2 +- scripts/create_resources/test_resources.sh | 2 +- src/api/comp_data_processor.yaml | 8 ++++---- src/data_processors/process_dataset/script.py | 7 ++----- src/workflows/process_datasets/config.vsh.yaml | 8 ++++---- src/workflows/process_datasets/main.nf | 7 +++++-- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 26ed0b1..7827c92 100644 --- a/README.md +++ b/README.md @@ -175,7 +175,7 @@ Arguments: |:---|:---|:---| | `--input_sp` | `file` | An unprocessed spatial imaging dataset stored as a zarr file. | | `--input_sc` | `file` | An unprocessed dataset as output by a dataset loader. | -| `--output_spatial_dataset` | `file` | (*Output*) A spatial transcriptomics dataset, preprocessed for this benchmark. | + | `--output_scrnaseq` | `file` | (*Output*) A single-cell dataset, preprocessed for this benchmark. 
| diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh index 26074a9..b9b99de 100755 --- a/scripts/create_resources/test_resources.sh +++ b/scripts/create_resources/test_resources.sh @@ -22,7 +22,7 @@ mkdir -p $DATASET_DIR viash run src/data_processors/process_dataset/config.vsh.yaml -- \ --input_sp $RAW_DATA/mouse_brain_combined/common_ist.zarr \ --input_sc $RAW_DATA/mouse_brain_combined/common_scrnaseq.h5ad \ - --output_spatial_dataset $DATASET_DIR/output_spatial_dataset.zarr \ + # --output_spatial_dataset $DATASET_DIR/output_spatial_dataset.zarr \ --output_scrnaseq $DATASET_DIR/mouse_brain_combined/output_scrnaseq.h5ad # run one method diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml index 137cd12..50a1597 100644 --- a/src/api/comp_data_processor.yaml +++ b/src/api/comp_data_processor.yaml @@ -19,10 +19,10 @@ argument_groups: direction: input - name: Outputs arguments: - - name: "--output_spatial_dataset" - __merge__: file_spatial_dataset.yaml - direction: output - required: true + # - name: "--output_spatial_dataset" + # __merge__: file_spatial_dataset.yaml + # direction: output + # required: true - name: "--output_scrnaseq" __merge__: file_scrnaseq.yaml direction: output diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index 7a48f6d..cd3025e 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -12,7 +12,7 @@ par = { 'input_sp': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr', 'input_sc': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad', - 'output_spatial_dataset': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_spatial_dataset.zarr', + #'output_spatial_dataset': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_spatial_dataset.zarr', 'output_scrnaseq': 
'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq.h5ad', 'method': 'xenium', 'seed': 123, @@ -76,7 +76,4 @@ adata.obs[f'canorm_{x}'] = adata.obs[f'{x}'] / adata.obs['cell_area'] print(">> Writing data", flush=True) -adata.write_h5ad(par["output_scrnaseq"]) - -print(">> Writing spatial data", flush=True) -shutil.copytree(par["input_sp"], par["output_spatial_dataset"]) \ No newline at end of file +adata.write_h5ad(par["output_scrnaseq"]) \ No newline at end of file diff --git a/src/workflows/process_datasets/config.vsh.yaml b/src/workflows/process_datasets/config.vsh.yaml index fe3b9d4..127a9e1 100644 --- a/src/workflows/process_datasets/config.vsh.yaml +++ b/src/workflows/process_datasets/config.vsh.yaml @@ -14,10 +14,10 @@ argument_groups: direction: input - name: Outputs arguments: - - name: "--output_spatial_dataset" - __merge__: /src/api/file_spatial_dataset.yaml - direction: output - required: true + # - name: "--output_spatial_dataset" + # __merge__: /src/api/file_spatial_dataset.yaml + # direction: output + # required: true - name: "--output_scrnaseq" __merge__: /src/api/file_scrnaseq.yaml direction: output diff --git a/src/workflows/process_datasets/main.nf b/src/workflows/process_datasets/main.nf index 226e861..2be995d 100644 --- a/src/workflows/process_datasets/main.nf +++ b/src/workflows/process_datasets/main.nf @@ -44,13 +44,16 @@ workflow run_wf { "input_sc": "input_sc" ], toState: [ - output_spatial_dataset: "output_spatial_dataset", + // output_spatial_dataset: "output_spatial_dataset", output_scrnaseq: "output_scrnaseq" ] ) // only output the files for which an output file was specified - | setState(["output_spatial_dataset", "output_scrnaseq"]) + | setState([ + // "output_spatial_dataset", + "output_scrnaseq" + ]) emit: output_ch From 97c42d1530c4c1b0f54b21c8a7618c865b162850 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 27 Apr 2026 14:12:04 +0200 Subject: [PATCH 13/22] Revert "comment out 
output_spatial_dataset" This reverts commit c5dfac01afb90b305fbfae097c65c80becc421ed. --- README.md | 2 +- scripts/create_resources/test_resources.sh | 2 +- src/api/comp_data_processor.yaml | 8 ++++---- src/data_processors/process_dataset/script.py | 7 +++++-- src/workflows/process_datasets/config.vsh.yaml | 8 ++++---- src/workflows/process_datasets/main.nf | 7 ++----- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 7827c92..26ed0b1 100644 --- a/README.md +++ b/README.md @@ -175,7 +175,7 @@ Arguments: |:---|:---|:---| | `--input_sp` | `file` | An unprocessed spatial imaging dataset stored as a zarr file. | | `--input_sc` | `file` | An unprocessed dataset as output by a dataset loader. | - +| `--output_spatial_dataset` | `file` | (*Output*) A spatial transcriptomics dataset, preprocessed for this benchmark. | | `--output_scrnaseq` | `file` | (*Output*) A single-cell dataset, preprocessed for this benchmark. | diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh index b9b99de..26074a9 100755 --- a/scripts/create_resources/test_resources.sh +++ b/scripts/create_resources/test_resources.sh @@ -22,7 +22,7 @@ mkdir -p $DATASET_DIR viash run src/data_processors/process_dataset/config.vsh.yaml -- \ --input_sp $RAW_DATA/mouse_brain_combined/common_ist.zarr \ --input_sc $RAW_DATA/mouse_brain_combined/common_scrnaseq.h5ad \ - # --output_spatial_dataset $DATASET_DIR/output_spatial_dataset.zarr \ + --output_spatial_dataset $DATASET_DIR/output_spatial_dataset.zarr \ --output_scrnaseq $DATASET_DIR/mouse_brain_combined/output_scrnaseq.h5ad # run one method diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml index 50a1597..137cd12 100644 --- a/src/api/comp_data_processor.yaml +++ b/src/api/comp_data_processor.yaml @@ -19,10 +19,10 @@ argument_groups: direction: input - name: Outputs arguments: - # - name: "--output_spatial_dataset" - # __merge__: 
file_spatial_dataset.yaml - # direction: output - # required: true + - name: "--output_spatial_dataset" + __merge__: file_spatial_dataset.yaml + direction: output + required: true - name: "--output_scrnaseq" __merge__: file_scrnaseq.yaml direction: output diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index cd3025e..7a48f6d 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -12,7 +12,7 @@ par = { 'input_sp': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr', 'input_sc': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad', - #'output_spatial_dataset': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_spatial_dataset.zarr', + 'output_spatial_dataset': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_spatial_dataset.zarr', 'output_scrnaseq': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq.h5ad', 'method': 'xenium', 'seed': 123, @@ -76,4 +76,7 @@ adata.obs[f'canorm_{x}'] = adata.obs[f'{x}'] / adata.obs['cell_area'] print(">> Writing data", flush=True) -adata.write_h5ad(par["output_scrnaseq"]) \ No newline at end of file +adata.write_h5ad(par["output_scrnaseq"]) + +print(">> Writing spatial data", flush=True) +shutil.copytree(par["input_sp"], par["output_spatial_dataset"]) \ No newline at end of file diff --git a/src/workflows/process_datasets/config.vsh.yaml b/src/workflows/process_datasets/config.vsh.yaml index 127a9e1..fe3b9d4 100644 --- a/src/workflows/process_datasets/config.vsh.yaml +++ b/src/workflows/process_datasets/config.vsh.yaml @@ -14,10 +14,10 @@ argument_groups: direction: input - name: Outputs arguments: - # - name: "--output_spatial_dataset" - # __merge__: /src/api/file_spatial_dataset.yaml - # direction: output - # required: true + - name: "--output_spatial_dataset" + __merge__: 
/src/api/file_spatial_dataset.yaml + direction: output + required: true - name: "--output_scrnaseq" __merge__: /src/api/file_scrnaseq.yaml direction: output diff --git a/src/workflows/process_datasets/main.nf b/src/workflows/process_datasets/main.nf index 2be995d..226e861 100644 --- a/src/workflows/process_datasets/main.nf +++ b/src/workflows/process_datasets/main.nf @@ -44,16 +44,13 @@ workflow run_wf { "input_sc": "input_sc" ], toState: [ - // output_spatial_dataset: "output_spatial_dataset", + output_spatial_dataset: "output_spatial_dataset", output_scrnaseq: "output_scrnaseq" ] ) // only output the files for which an output file was specified - | setState([ - // "output_spatial_dataset", - "output_scrnaseq" - ]) + | setState(["output_spatial_dataset", "output_scrnaseq"]) emit: output_ch From 00589e79f3a92a93e6db9de7296cbacdb38f56a7 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 27 Apr 2026 14:51:57 +0200 Subject: [PATCH 14/22] update project config --- _viash.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/_viash.yaml b/_viash.yaml index 31ad320..a0130fe 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -11,8 +11,8 @@ license: MIT keywords: [single-cell, openproblems, benchmark] # Step 3: Update the `task_template` to the name of the task from step 1. 
links: - issue_tracker: https://github.com/openproblems-bio/task_template/issues - repository: https://github.com/openproblems-bio/task_template + issue_tracker: https://github.com/openproblems-bio/task_spatial_segmentation/issues + repository: https://github.com/openproblems-bio/task_spatial_segmentation docker_registry: ghcr.io From 9022aa35a9c06528b274ea82eaf1f070125361a4 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 27 Apr 2026 14:52:03 +0200 Subject: [PATCH 15/22] update readme --- README.md | 284 +++++++++++------------------------------------------- 1 file changed, 54 insertions(+), 230 deletions(-) diff --git a/README.md b/README.md index 26ed0b1..1526580 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ A one sentence summary of purpose and methodology. Used for creating an overview tables. Repository: -[openproblems-bio/task_template](https://github.com/openproblems-bio/task_template) +[openproblems-bio/task_spatial_segmentation](https://github.com/openproblems-bio/task_spatial_segmentation) ## Description @@ -28,34 +28,34 @@ should convince readers of the significance and relevance of your task. 
## Authors & contributors -| Name | Roles | Linkedin | Twitter | Email | Github | Orcid | -|:---|:---|:---|:---|:---|:---|:---| -| John Doe | author, maintainer | johndoe | johndoe | john@doe.me | johndoe | 0000-0000-0000-0000 | +| name | roles | +|:---------|:-------------------| +| John Doe | author, maintainer | ## API ``` mermaid flowchart TB - file_common_ist("Common iST Dataset") - comp_data_processor[/"Data processor"/] - file_spatial_dataset("Raw iST Dataset") - file_scrnaseq("scRNA-seq Reference") - comp_control_method[/"Control Method"/] - comp_method[/"Method"/] - comp_metric[/"Metric"/] - file_prediction("Predicted data") - file_score("Score") - file_common_scrnaseq("Common SC Dataset") + file_common_ist("Common iST Dataset") + comp_data_processor[/"Data processor"/] + file_scrnaseq_reference("scRNA-seq Reference") + file_spatial_dataset("Raw iST Dataset") + comp_control_method[/"Control Method"/] + comp_metric[/"Metric"/] + comp_method[/"Method"/] + file_prediction("Predicted data") + file_score("Score") + file_common_scrnaseq("Common SC Dataset") file_common_ist---comp_data_processor + comp_data_processor-->file_scrnaseq_reference comp_data_processor-->file_spatial_dataset - comp_data_processor-->file_scrnaseq + file_scrnaseq_reference---comp_control_method + file_scrnaseq_reference---comp_metric file_spatial_dataset---comp_control_method file_spatial_dataset---comp_method - file_scrnaseq---comp_control_method - file_scrnaseq---comp_metric comp_control_method-->file_prediction - comp_method-->file_prediction comp_metric-->file_score + comp_method-->file_prediction file_prediction---comp_metric file_common_scrnaseq---comp_data_processor ``` @@ -76,91 +76,12 @@ Format:
- SpatialData object - images: 'image', 'image_3D', 'he_image' - labels: 'cell_labels', 'nucleus_labels' - points: 'transcripts' - shapes: 'cell_boundaries', 'nucleus_boundaries' - tables: 'metadata' - coordinate_systems: 'global' -
Data structure:
-*images* - -| Name | Description | -|:-----------|:------------------------------------| -| `image` | The raw image data. | -| `image_3D` | (*Optional*) The raw 3D image data. | -| `he_image` | (*Optional*) H&E image data. | - -*labels* - -| Name | Description | -|:-----------------|:---------------------------------------| -| `cell_labels` | (*Optional*) Cell segmentation labels. | -| `nucleus_labels` | (*Optional*) Cell segmentation labels. | - -*points* - -`transcripts`: Point cloud data of transcripts. - -| Column | Type | Description | -|:---|:---|:---| -| `x` | `float` | x-coordinate of the point. | -| `y` | `float` | y-coordinate of the point. | -| `z` | `float` | (*Optional*) z-coordinate of the point. | -| `feature_name` | `categorical` | Name of the feature. | -| `cell_id` | `integer` | (*Optional*) Unique identifier of the cell. | -| `nucleus_id` | `integer` | (*Optional*) Unique identifier of the nucleus. | -| `cell_type` | `string` | (*Optional*) Cell type of the cell. | -| `qv` | `float` | (*Optional*) Quality value of the point. | -| `transcript_id` | `long` | Unique identifier of the transcript. | -| `overlaps_nucleus` | `boolean` | (*Optional*) Whether the point overlaps with a nucleus. | - -*shapes* - -`cell_boundaries`: Cell boundaries. - -| Column | Type | Description | -|:-----------|:---------|:-------------------------------| -| `geometry` | `object` | Geometry of the cell boundary. | - -`nucleus_boundaries`: Nucleus boundaries. - -| Column | Type | Description | -|:-----------|:---------|:----------------------------------| -| `geometry` | `object` | Geometry of the nucleus boundary. | - -*tables* - -`metadata`: Metadata of spatial dataset. - -| Slot | Type | Description | -|:---|:---|:---| -| `obs["cell_id"]` | `string` | A unique identifier for the cell. | -| `var["gene_ids"]` | `string` | Unique identifier for the gene. | -| `var["feature_types"]` | `string` | Type of the feature. 
| -| `obsm["spatial"]` | `double` | Spatial coordinates of the cell. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["dataset_name"]` | `string` | A human-readable name for the dataset. | -| `uns["dataset_url"]` | `string` | Link to the original source of the dataset. | -| `uns["dataset_reference"]` | `string` | Bibtex reference of the paper in which the dataset was published. | -| `uns["dataset_summary"]` | `string` | Short description of the dataset. | -| `uns["dataset_description"]` | `string` | Long description of the dataset. | -| `uns["dataset_organism"]` | `string` | The organism of the sample in the dataset. | -| `uns["segmentation_id"]` | `string` | A unique identifier for the segmentation. | - -*coordinate_systems* - -| Name | Description | -|:---------|:------------------------------------| -| `global` | Coordinate system of the replicate. | -
## Component type: Data processor @@ -176,110 +97,7 @@ Arguments: | `--input_sp` | `file` | An unprocessed spatial imaging dataset stored as a zarr file. | | `--input_sc` | `file` | An unprocessed dataset as output by a dataset loader. | | `--output_spatial_dataset` | `file` | (*Output*) A spatial transcriptomics dataset, preprocessed for this benchmark. | -| `--output_scrnaseq` | `file` | (*Output*) A single-cell dataset, preprocessed for this benchmark. | - - - -## File format: Raw iST Dataset - -A spatial transcriptomics dataset, preprocessed for this benchmark. - -Example file: -`resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr` - -Description: - -This dataset contains preprocessed images, labels, points, shapes, and -tables for spatial transcriptomics data. - -Format: - -
- - SpatialData object - images: 'image', 'image_3D', 'he_image' - labels: 'cell_labels', 'nucleus_labels' - points: 'transcripts' - shapes: 'cell_boundaries', 'nucleus_boundaries' - tables: 'metadata' - coordinate_systems: 'global' - -
- -Data structure: - -
- -*images* - -| Name | Description | -|:-----------|:------------------------------------| -| `image` | The raw image data. | -| `image_3D` | (*Optional*) The raw 3D image data. | -| `he_image` | (*Optional*) H&E image data. | - -*labels* - -| Name | Description | -|:-----------------|:---------------------------------------| -| `cell_labels` | (*Optional*) Cell segmentation labels. | -| `nucleus_labels` | (*Optional*) Cell segmentation labels. | - -*points* - -`transcripts`: Point cloud data of transcripts. - -| Column | Type | Description | -|:---|:---|:---| -| `x` | `float` | x-coordinate of the point. | -| `y` | `float` | y-coordinate of the point. | -| `z` | `float` | (*Optional*) z-coordinate of the point. | -| `feature_name` | `categorical` | Name of the feature. | -| `cell_id` | `integer` | (*Optional*) Unique identifier of the cell. | -| `nucleus_id` | `integer` | (*Optional*) Unique identifier of the nucleus. | -| `cell_type` | `string` | (*Optional*) Cell type of the cell. | -| `qv` | `float` | (*Optional*) Quality value of the point. | -| `transcript_id` | `long` | Unique identifier of the transcript. | -| `overlaps_nucleus` | `boolean` | (*Optional*) Whether the point overlaps with a nucleus. | - -*shapes* - -`cell_boundaries`: Cell boundaries. - -| Column | Type | Description | -|:-----------|:---------|:-------------------------------| -| `geometry` | `object` | Geometry of the cell boundary. | - -`nucleus_boundaries`: Nucleus boundaries. - -| Column | Type | Description | -|:-----------|:---------|:----------------------------------| -| `geometry` | `object` | Geometry of the nucleus boundary. | - -*tables* - -`metadata`: Metadata of spatial dataset. - -| Slot | Type | Description | -|:---|:---|:---| -| `obs["cell_id"]` | `string` | A unique identifier for the cell. | -| `var["gene_ids"]` | `string` | Unique identifier for the gene. | -| `var["feature_types"]` | `string` | Type of the feature. 
| -| `obsm["spatial"]` | `double` | Spatial coordinates of the cell. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["dataset_name"]` | `string` | A human-readable name for the dataset. | -| `uns["dataset_url"]` | `string` | Link to the original source of the dataset. | -| `uns["dataset_reference"]` | `string` | Bibtex reference of the paper in which the dataset was published. | -| `uns["dataset_summary"]` | `string` | Short description of the dataset. | -| `uns["dataset_description"]` | `string` | Long description of the dataset. | -| `uns["dataset_organism"]` | `string` | The organism of the sample in the dataset. | -| `uns["segmentation_id"]` | `string` | A unique identifier for the segmentation. | - -*coordinate_systems* - -| Name | Description | -|:---------|:------------------------------------| -| `global` | Coordinate system of the replicate. | +| `--output_scrnaseq` | `file` | (*Output*) A single-cell reference dataset, preprocessed for this benchmark. |
@@ -288,7 +106,7 @@ Data structure: A single-cell reference dataset, preprocessed for this benchmark. Example file: -`resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad` +`resources_test/task_spatial_segmentation/mouse_brain_combined/scrnaseq_reference.h5ad` Description: @@ -364,6 +182,30 @@ Data structure: +## File format: Raw iST Dataset + +A spatial transcriptomics dataset, preprocessed for this benchmark. + +Example file: +`resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.zarr` + +Description: + +This dataset contains preprocessed images, labels, points, shapes, and +tables for spatial transcriptomics data. + +Format: + +
+ +
+ +Data structure: + +
+ +
+ ## Component type: Control Method Quality control methods for verifying the pipeline. @@ -380,9 +222,9 @@ Arguments: -## Component type: Method +## Component type: Metric -A method. +A task template metric. Arguments: @@ -390,14 +232,15 @@ Arguments: | Name | Type | Description | |:---|:---|:---| -| `--input` | `file` | A spatial transcriptomics dataset, preprocessed for this benchmark. | -| `--output` | `file` | (*Output*) A predicted dataset as output by a method. | +| `--input_prediction` | `file` | A predicted dataset as output by a method. | +| `--input_scrnaseq_reference` | `file` | A single-cell reference dataset, preprocessed for this benchmark. | +| `--output` | `file` | (*Output*) File indicating the score of a metric. | -## Component type: Metric +## Component type: Method -A task template metric. +A method. Arguments: @@ -405,9 +248,8 @@ Arguments: | Name | Type | Description | |:---|:---|:---| -| `--input_prediction` | `file` | A predicted dataset as output by a method. | -| `--input_scrnaseq_reference` | `file` | A single-cell reference dataset, preprocessed for this benchmark. | -| `--output` | `file` | (*Output*) File indicating the score of a metric. | +| `--input` | `file` | A spatial transcriptomics dataset, preprocessed for this benchmark. | +| `--output` | `file` | (*Output*) A predicted dataset as output by a method. | @@ -422,31 +264,12 @@ Format:
- SpatialData object - labels: 'segmentation' - tables: 'table' -
Data structure:
-*labels* - -| Name | Description | -|:---------------|:--------------------------| -| `segmentation` | Segmentation of the data. | - -*tables* - -`table`: AnnData table. - -| Slot | Type | Description | -|:-----------------|:---------|:------------| -| `obs["cell_id"]` | `string` | Cell ID. | -| `obs["region"]` | `string` | Region. | -
## File format: Score @@ -562,3 +385,4 @@ Data structure: | `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | + From 13368a70fbaeb8678002ce862001f5e6f37743be Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 27 Apr 2026 14:52:19 +0200 Subject: [PATCH 16/22] update helper scripts --- scripts/create_resources/resources.sh | 2 +- scripts/create_resources/test_resources.sh | 42 ++++++++++--------- scripts/run_benchmark/run_full_local.sh | 2 +- scripts/run_benchmark/run_full_seqeracloud.sh | 2 +- 4 files changed, 26 insertions(+), 22 deletions(-) diff --git a/scripts/create_resources/resources.sh b/scripts/create_resources/resources.sh index 4ba5075..52ee226 100755 --- a/scripts/create_resources/resources.sh +++ b/scripts/create_resources/resources.sh @@ -18,7 +18,7 @@ cat > /tmp/params.yaml << 'HERE' input_states: s3://openproblems-data/resources/datasets/**/state.yaml rename_keys: 'input:output_dataset' output_state: '$id/state.yaml' -settings: '{"output_scrnaseq": "$id/output_scrnaseq.h5ad"}' +settings: '{"output_spatial_dataset": "$id/output_spatial_dataset.zarr", "output_scrnaseq": "$id/output_scrnaseq.h5ad"}' publish_dir: s3://openproblems-data/resources/task_template/datasets/ HERE diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh index 26074a9..774b1f8 100755 --- a/scripts/create_resources/test_resources.sh +++ b/scripts/create_resources/test_resources.sh @@ -13,38 +13,42 @@ cd "$REPO_ROOT" set -e -RAW_DATA=resources_test/task_spatial_segmentation -DATASET_DIR=resources_test/task_spatial_segmentation +DATASET_ID=mouse_brain_combined + +RAW_DATA=resources_test/common +DATASET_DIR=resources_test/task_spatial_segmentation/$DATASET_ID mkdir -p $DATASET_DIR # process dataset viash run src/data_processors/process_dataset/config.vsh.yaml -- \ - --input_sp $RAW_DATA/mouse_brain_combined/common_ist.zarr \ - --input_sc 
$RAW_DATA/mouse_brain_combined/common_scrnaseq.h5ad \ - --output_spatial_dataset $DATASET_DIR/output_spatial_dataset.zarr \ - --output_scrnaseq $DATASET_DIR/mouse_brain_combined/output_scrnaseq.h5ad + --input_sp $RAW_DATA/2023_10x_mouse_brain_xenium_rep1/dataset.zarr \ + --input_sc $RAW_DATA/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad \ + --output_spatial_dataset $DATASET_DIR/spatial_dataset.zarr \ + --output_scrnaseq_reference $DATASET_DIR/scrnaseq_reference.h5ad # run one method -viash run src/methods/logistic_regression/config.vsh.yaml -- \ - --input $DATASET_DIR/mouse_brain_combined/common_ist.zarr \ - --output $DATASET_DIR/mouse_brain_combined/prediction.h5ad +viash run src/methods/cellpose/config.vsh.yaml -- \ + --input $DATASET_DIR/spatial_dataset.zarr \ + --output $DATASET_DIR/prediction.h5ad # run one metric -viash run src/metrics/accuracy/config.vsh.yaml -- \ - --input_prediction $DATASET_DIR/cxg_mouse_pancreas_atlas/prediction.h5ad \ - --input_solution $DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad \ - --output $DATASET_DIR/cxg_mouse_pancreas_atlas/score.h5ad +# TODO: implement this! +# viash run src/metrics/ari/config.vsh.yaml -- \ +# --input_prediction $DATASET_DIR/prediction.h5ad \ +# --input_scrnaseq_reference $DATASET_DIR/scrnaseq_reference.h5ad \ +# --output $DATASET_DIR/score.h5ad # write manual state.yaml. 
this is not actually necessary but you never know it might be useful -cat > $DATASET_DIR/mouse_brain_combined/state.yaml << HERE -id: mouse_brain_combined -processed: !file output_scrnaseq.h5ad -segmentation: !file prediction.h5ad -score: !file score.h5ad +cat > $DATASET_DIR/state.yaml << HERE +id: $DATASET_ID +spatial_dataset: spatial_dataset.zarr +scrnaseq_reference: scrnaseq_reference.h5ad +prediction: prediction.h5ad +score: score.h5ad HERE # only run this if you have access to the openproblems-data bucket aws s3 sync --profile op \ - "$DATASET_DIR" s3://openproblems-data/resources_test/task_template \ + "$DATASET_DIR" s3://openproblems-data/resources_test/task_spatial_segmentation/mouse_brain_combined/ \ --delete --dryrun diff --git a/scripts/run_benchmark/run_full_local.sh b/scripts/run_benchmark/run_full_local.sh index 4b1aa11..26bba56 100755 --- a/scripts/run_benchmark/run_full_local.sh +++ b/scripts/run_benchmark/run_full_local.sh @@ -31,7 +31,7 @@ publish_dir="resources/results/${RUN_ID}" # write the parameters to file cat > /tmp/params.yaml << HERE input_states: resources/datasets/**/state.yaml -rename_keys: 'input_scrnaseq:output_scrnaseq' +rename_keys: 'input_spatial_dataset:output_spatial_dataset,input_scrnaseq_reference:output_scrnaseq_reference' output_state: "state.yaml" publish_dir: "$publish_dir" HERE diff --git a/scripts/run_benchmark/run_full_seqeracloud.sh b/scripts/run_benchmark/run_full_seqeracloud.sh index 83f37b2..3c31e74 100755 --- a/scripts/run_benchmark/run_full_seqeracloud.sh +++ b/scripts/run_benchmark/run_full_seqeracloud.sh @@ -23,7 +23,7 @@ publish_dir="s3://openproblems-data/resources/task_template/results/${RUN_ID}" # write the parameters to file cat > /tmp/params.yaml << HERE input_states: s3://openproblems-data/resources/task_template/datasets/**/state.yaml -rename_keys: 'input_scrnaseq:output_scrnaseq' +rename_keys: 'input_spatial_dataset:output_spatial_dataset,input_scrnaseq_reference:output_scrnaseq_reference' output_state: 
"state.yaml" publish_dir: "$publish_dir" HERE From 4f275a894ec71a5ca6400b99a716d96ce1ad3a8d Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 27 Apr 2026 14:52:30 +0200 Subject: [PATCH 17/22] update data processor --- src/api/comp_control_method.yaml | 2 +- src/api/comp_data_processor.yaml | 4 +- src/api/comp_metric.yaml | 2 +- src/api/file_scrnaseq.yaml | 9 - src/api/file_scrnaseq_reference.yaml | 259 ++++++++++++++++++ src/api/file_spatial_dataset.yaml | 169 +++++++++++- .../process_dataset/config.vsh.yaml | 25 +- .../config/config_default.json | 1 - src/data_processors/process_dataset/script.py | 77 ++---- .../process_datasets/config.vsh.yaml | 4 +- src/workflows/process_datasets/main.nf | 4 +- src/workflows/run_benchmark/config.vsh.yaml | 2 +- 12 files changed, 465 insertions(+), 93 deletions(-) delete mode 100644 src/api/file_scrnaseq.yaml create mode 100644 src/api/file_scrnaseq_reference.yaml delete mode 100644 src/data_processors/process_dataset/config/config_default.json diff --git a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml index 694f004..3f4fa2e 100644 --- a/src/api/comp_control_method.yaml +++ b/src/api/comp_control_method.yaml @@ -17,7 +17,7 @@ arguments: required: true direction: input - name: "--input_scrnaseq_reference" - __merge__: file_scrnaseq.yaml + __merge__: file_scrnaseq_reference.yaml direction: input required: true - name: --output diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml index 137cd12..22c77aa 100644 --- a/src/api/comp_data_processor.yaml +++ b/src/api/comp_data_processor.yaml @@ -23,8 +23,8 @@ argument_groups: __merge__: file_spatial_dataset.yaml direction: output required: true - - name: "--output_scrnaseq" - __merge__: file_scrnaseq.yaml + - name: "--output_scrnaseq_reference" + __merge__: file_scrnaseq_reference.yaml direction: output required: true test_resources: diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml index e2d21e6..a7470e9 100644 
--- a/src/api/comp_metric.yaml +++ b/src/api/comp_metric.yaml @@ -12,7 +12,7 @@ arguments: direction: input required: true - name: "--input_scrnaseq_reference" - __merge__: file_scrnaseq.yaml + __merge__: file_scrnaseq_reference.yaml direction: input required: true - name: "--output" diff --git a/src/api/file_scrnaseq.yaml b/src/api/file_scrnaseq.yaml deleted file mode 100644 index 06d8491..0000000 --- a/src/api/file_scrnaseq.yaml +++ /dev/null @@ -1,9 +0,0 @@ -type: file -example: "resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad" -# TODO: revert to the original example once file exists -# example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.h5ad" -label: "scRNA-seq Reference" -summary: A single-cell reference dataset, preprocessed for this benchmark. -description: | - This dataset contains preprocessed counts and metadata for single-cell RNA-seq data. -__merge__: file_common_scrnaseq.yaml \ No newline at end of file diff --git a/src/api/file_scrnaseq_reference.yaml b/src/api/file_scrnaseq_reference.yaml new file mode 100644 index 0000000..e214add --- /dev/null +++ b/src/api/file_scrnaseq_reference.yaml @@ -0,0 +1,259 @@ +type: file +example: "resources_test/task_spatial_segmentation/mouse_brain_combined/scrnaseq_reference.h5ad" +# TODO: revert to the original example once file exists +# example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.h5ad" +label: "scRNA-seq Reference" +summary: A single-cell reference dataset, preprocessed for this benchmark. +description: | + This dataset contains preprocessed counts and metadata for single-cell RNA-seq data. 
+info: + format: + type: h5ad + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: integer + name: normalized + description: Normalized expression values + required: true + obs: + - type: string + name: cell_type + description: Classification of the cell type based on its characteristics and function within the tissue or organism. + required: true + + - type: string + name: cell_type_level2 + description: Classification of the cell type based on its characteristics and function within the tissue or organism. + required: false + + - type: string + name: cell_type_level3 + description: Classification of the cell type based on its characteristics and function within the tissue or organism. + required: false + + - type: string + name: cell_type_level4 + description: Classification of the cell type based on its characteristics and function within the tissue or organism. + required: false + + - type: string + name: dataset_id + description: Identifier for the dataset from which the cell data is derived, useful for tracking and referencing purposes. + required: false + + - type: string + name: assay + description: Type of assay used to generate the cell data, indicating the methodology or technique employed. + required: false + + - type: string + name: assay_ontology_term_id + description: Experimental Factor Ontology (`EFO:`) term identifier for the assay, providing a standardized reference to the assay type. + required: false + + - type: string + name: cell_type_ontology_term_id + description: Cell Ontology (`CL:`) term identifier for the cell type, offering a standardized reference to the specific cell classification. + required: false + + - type: string + name: development_stage + description: Stage of development of the organism or tissue from which the cell is derived, indicating its maturity or developmental phase. 
+ required: false + + - type: string + name: development_stage_ontology_term_id + description: | + Ontology term identifier for the developmental stage, providing a standardized reference to the organism's developmental phase. + + If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Developmental Stages (`HsapDv:`) ontology is used. + If the organism is mouse (`organism_ontology_term_id == 'NCBITaxon:10090'`), then the Mouse Developmental Stages (`MmusDv:`) ontology is used. + Otherwise, the Uberon (`UBERON:`) ontology is used. + required: false + + - type: string + name: disease + description: Information on any disease or pathological condition associated with the cell or donor. + required: false + + - type: string + name: disease_ontology_term_id + description: | + Ontology term identifier for the disease, enabling standardized disease classification and referencing. + + Must be a term from the Mondo Disease Ontology (`MONDO:`) ontology term, or `PATO:0000461` from the Phenotype And Trait Ontology (`PATO:`). + required: false + + - type: string + name: donor_id + description: Identifier for the donor from whom the cell sample is obtained. + required: false + + - type: boolean + name: is_primary_data + description: Indicates whether the data is primary (directly obtained from experiments) or has been computationally derived from other primary data. + required: false + + - type: string + name: organism + description: Organism from which the cell sample is obtained. + required: false + + - type: string + name: organism_ontology_term_id + description: | + Ontology term identifier for the organism, providing a standardized reference for the organism. + + Must be a term from the NCBI Taxonomy Ontology (`NCBITaxon:`) which is a child of `NCBITaxon:33208`. 
+ required: false + + - type: string + name: self_reported_ethnicity + description: Ethnicity of the donor as self-reported, relevant for studies considering genetic diversity and population-specific traits. + required: false + + - type: string + name: self_reported_ethnicity_ontology_term_id + description: | + Ontology term identifier for the self-reported ethnicity, providing a standardized reference for ethnic classifications. + + If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Ancestry Ontology (`HANCESTRO:`) is used. + required: false + + - type: string + name: sex + description: Biological sex of the donor or source organism, crucial for studies involving sex-specific traits or conditions. + required: false + + - type: string + name: sex_ontology_term_id + description: Ontology term identifier for the biological sex, ensuring standardized classification of sex. Only `PATO:0000383`, `PATO:0000384` and `PATO:0001340` are allowed. + required: false + + - type: string + name: suspension_type + description: Type of suspension or medium in which the cells were stored or processed, important for understanding cell handling and conditions. + required: false + + - type: string + name: tissue + description: Specific tissue from which the cells were derived, key for context and specificity in cell studies. + required: false + + - type: string + name: tissue_ontology_term_id + description: | + Ontology term identifier for the tissue, providing a standardized reference for the tissue type. + + For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). + For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. 
+ required: false + + - type: string + name: tissue_general + description: General category or classification of the tissue, useful for broader grouping and comparison of cell data. + required: false + + - type: string + name: tissue_general_ontology_term_id + description: | + Ontology term identifier for the general tissue category, aiding in standardizing and grouping tissue types. + + For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). + For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. + required: false + + - type: string + name: batch + description: A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. + required: false + + - type: integer + name: soma_joinid + description: If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the cell. + required: false + var: + - type: string + name: feature_id + description: Unique identifier for the feature, usually a ENSEMBL gene id. + # TODO: make this required once openproblems_v1 dataloader supports it + required: false + + - type: string + name: feature_name + description: A human-readable name for the feature, usually a gene symbol. + # TODO: make this required once the dataloader supports it + required: true + + - type: integer + name: soma_joinid + description: If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the feature. + required: false + + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + + - type: double + name: hvg_score + description: A score for the feature indicating how highly variable it is. + required: true + + obsp: + - type: double + name: knn_distances + description: K nearest neighbors distance matrix. 
+ required: true + + - type: double + name: knn_connectivities + description: K nearest neighbors connectivities matrix. + required: true + + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. + required: true + + varm: + - type: double + name: pca_loadings + description: The PCA loadings matrix. + required: true + + uns: + - type: string + name: dataset_id + description: A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived. + required: true + - name: dataset_name + type: string + description: A human-readable name for the dataset. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + multiple: true + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + multiple: true diff --git a/src/api/file_spatial_dataset.yaml b/src/api/file_spatial_dataset.yaml index 5668a3f..41e3c31 100644 --- a/src/api/file_spatial_dataset.yaml +++ b/src/api/file_spatial_dataset.yaml @@ -1,9 +1,174 @@ type: file -example: "resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr" +example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.zarr" # TODO: revert to the original example once file exists # example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.zarr" label: "Raw iST Dataset" summary: A spatial transcriptomics dataset, preprocessed for this benchmark. 
description: | This dataset contains preprocessed images, labels, points, shapes, and tables for spatial transcriptomics data. -__merge__: file_common_ist.yaml +info: + format: + type: spatialdata_zarr + images: + - type: object + name: image + description: The raw image data + required: true + - type: object + name: image_3D + description: The raw 3D image data + required: false + - type: object + name: he_image + description: H&E image data + required: false + labels: + - type: object + name: "cell_labels" + description: Cell segmentation labels + required: false + - type: object + name: "nucleus_labels" + description: Nucleus segmentation labels + required: false + # - type: datatree + # name: "{segm}_3D" + # description: Custom segmentation of the 3D data + # required: false + # - type: datatree + # name: "expert_segm_{patch}" + # description: Expert segmentation of a patch of the data + # required: false + # - type: DataTree[zyx] + # name: "expert_segm_{patch}_3D" + # description: Expert segmentation of a 3D patch of the data + # required: false + points: + - type: dataframe + name: transcripts + description: Point cloud data of transcripts + required: true + columns: + - type: float + name: "x" + required: true + description: x-coordinate of the point + - type: float + name: "y" + required: true + description: y-coordinate of the point + - type: float + name: "z" + required: false + description: z-coordinate of the point + - type: categorical + name: feature_name + required: true + description: Name of the feature + - type: integer + name: "cell_id" + required: false + description: Unique identifier of the cell + - type: integer + name: "nucleus_id" + required: false + description: Unique identifier of the nucleus + - type: string + name: "cell_type" + required: false + description: Cell type of the cell + - type: float + name: qv + required: false + description: Quality value of the point + - type: long + name: transcript_id + required: true + description: 
Unique identifier of the transcript + - type: boolean + name: overlaps_nucleus + required: false + description: Whether the point overlaps with a nucleus + shapes: + - type: dataframe + name: "cell_boundaries" + description: Cell boundaries + required: false + columns: + - type: object + name: "geometry" + required: true + description: Geometry of the cell boundary + - type: dataframe + name: "nucleus_boundaries" + description: Nucleus boundaries + required: false + columns: + - type: object + name: "geometry" + required: true + description: Geometry of the nucleus boundary + tables: + - type: anndata + name: "metadata" + description: Metadata of spatial dataset + required: true + uns: + - type: string + name: dataset_id + required: true + description: A unique identifier for the dataset + - type: string + name: dataset_name + required: true + description: A human-readable name for the dataset + - type: string + name: dataset_url + required: true + description: Link to the original source of the dataset + - type: string + name: dataset_reference + required: true + description: Bibtex reference of the paper in which the dataset was published + - type: string + name: dataset_summary + required: true + description: Short description of the dataset + - type: string + name: dataset_description + required: true + description: Long description of the dataset + - type: string + name: dataset_organism + required: true + description: The organism of the sample in the dataset + - type: string + name: segmentation_id + required: true + multiple: true + description: A unique identifier for the segmentation + obs: + - type: string + name: cell_id + required: true + description: A unique identifier for the cell + var: + - type: string + name: gene_ids + required: true + description: Unique identifier for the gene + - type: string + name: feature_types + required: true + description: Type of the feature + obsm: + - type: double + name: spatial + required: true + description: 
Spatial coordinates of the cell + coordinate_systems: + - type: object + name: global + description: Coordinate system of the replicate + required: true + diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml index 25700eb..58bf840 100644 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -3,26 +3,18 @@ __merge__: ../../api/comp_data_processor.yaml name: process_dataset arguments: - - name: "--method" - type: "string" - description: "The spatial technology data type." - choices: ["xenium"] - - name: "--obs_label" - type: "string" - description: "Which .obs slot to use as label." - default: "cell_type" - - name: "--obs_batch" - type: "string" - description: "Which .obs slot to use as batch covariate." - default: "batch" - name: "--seed" type: "integer" description: "A seed for the subsampling." example: 123 - - name: "--config" - type: "string" - description: "Config file in json format for data processing parameters." - example: config/config_default.json + - name: "--span" + type: double + description: The fraction of the data (cells) used when estimating the variance in the loess model fit if flavor='seurat_v3'. + default: 0.3 + - name: "--n_top_genes" + type: integer + description: Number of highly-variable genes to keep. Mandatory if flavor='seurat_v3'. 
+ default: 3000 resources: - type: python_script @@ -39,7 +31,6 @@ engines: - /src/base/setup_spatialdata_partial.yaml - type: native - runners: - type: executable - type: nextflow diff --git a/src/data_processors/process_dataset/config/config_default.json b/src/data_processors/process_dataset/config/config_default.json deleted file mode 100644 index 8d52b6c..0000000 --- a/src/data_processors/process_dataset/config/config_default.json +++ /dev/null @@ -1 +0,0 @@ -{"span": 1.0, "n_top_genes": 3000} \ No newline at end of file diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index 7a48f6d..4eeb6de 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -1,82 +1,49 @@ -import sys import random -import numpy as np import anndata as ad -import scanpy as sc -import openproblems as op import spatialdata as sd -import json +import os import shutil ## VIASH START par = { - 'input_sp': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr', - 'input_sc': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad', + 'input_sp': 'resources_test/common/2023_10x_mouse_brain_xenium_rep1/dataset.zarr', + 'input_sc': 'resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad', 'output_spatial_dataset': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_spatial_dataset.zarr', - 'output_scrnaseq': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq.h5ad', - 'method': 'xenium', + 'output_scrnaseq_reference': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq_reference.h5ad', + 'span': 0.3, 'seed': 123, - 'config': 'task_spatial_segmentation/src/data_processors/process_dataset/config/config_default.json' -} - -meta = { - 'resources_dir': 'target/executable/data_processors/process_dataset', - 'config': 
'target/executable/data_processors/process_dataset/.config.vsh.yaml' + 'n_top_genes': 3000 } ## VIASH END -# import helper functions -sys.path.append(meta['resources_dir']) - -config = op.project.read_viash_config(meta["config"]) - # set seed if need be if par["seed"]: print(f">> Setting seed to {par['seed']}") random.seed(par["seed"]) print(">> Load data", flush=True) -adata = ad.read_h5ad(par["input_sc"]) -print("input_sc:", adata) +sc_data = ad.read_h5ad(par["input_sc"]) -print(f">> Process {par['method']} data") +print(">> Processing sc_data", flush=True) -if par['config']: - print(f">> Perform standard data preprocessing") - with open(par['config'], "r") as f: - config = json.load(f) +# TODO: process the single-cell dataset - # Add config to params - for key, value in config.items(): - setattr(par, key, value) +print(f"single cell data: {sc_data}") - adata.layers["counts"] = adata.X.copy() - - sc.pp.normalize_total(adata) - sc.pp.log1p(adata) - adata.layers['normlog'] = adata.X - - sc.pp.highly_variable_genes( - adata, - flavor="seurat_v3", - layer="counts", - span=par['span'], - n_top_genes=par['n_top_genes'] - ) +print(">> Writing data", flush=True) +sc_data.write_h5ad(par["output_scrnaseq_reference"], compression="gzip") - adata.var.sort_values("means") - sc.pp.scale(adata, zero_center=False) - adata.layers['normlogscale'] = adata.X - - adata.X = adata.layers['counts'] +# read input_sp +print(">> Read spatial data", flush=True) +sp_data = sd.read_zarr(par["input_sp"]) - # cell area normalization - sc.pp.calculate_qc_metrics(adata, inplace=True) - for x in ['transcript_counts', 'n_genes_by_counts']: - adata.obs[f'canorm_{x}'] = adata.obs[f'{x}'] / adata.obs['cell_area'] +print(">> Processing spatial data", flush=True) +# TODO: process the spatial dataset -print(">> Writing data", flush=True) -adata.write_h5ad(par["output_scrnaseq"]) +print(f"spatial data: {sp_data}") print(">> Writing spatial data", flush=True) -shutil.copytree(par["input_sp"], 
par["output_spatial_dataset"]) \ No newline at end of file +# remove directory if it exists +if os.path.exists(par["output_spatial_dataset"]): + shutil.rmtree(par["output_spatial_dataset"]) +sp_data.write(par["output_spatial_dataset"], overwrite=True) diff --git a/src/workflows/process_datasets/config.vsh.yaml b/src/workflows/process_datasets/config.vsh.yaml index fe3b9d4..c71286a 100644 --- a/src/workflows/process_datasets/config.vsh.yaml +++ b/src/workflows/process_datasets/config.vsh.yaml @@ -18,8 +18,8 @@ argument_groups: __merge__: /src/api/file_spatial_dataset.yaml direction: output required: true - - name: "--output_scrnaseq" - __merge__: /src/api/file_scrnaseq.yaml + - name: "--output_scrnaseq_reference" + __merge__: /src/api/file_scrnaseq_reference.yaml direction: output required: true diff --git a/src/workflows/process_datasets/main.nf b/src/workflows/process_datasets/main.nf index 226e861..947a8f1 100644 --- a/src/workflows/process_datasets/main.nf +++ b/src/workflows/process_datasets/main.nf @@ -45,12 +45,12 @@ workflow run_wf { ], toState: [ output_spatial_dataset: "output_spatial_dataset", - output_scrnaseq: "output_scrnaseq" + output_scrnaseq_reference: "output_scrnaseq_reference" ] ) // only output the files for which an output file was specified - | setState(["output_spatial_dataset", "output_scrnaseq"]) + | setState(["output_spatial_dataset", "output_scrnaseq_reference"]) emit: output_ch diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index dd7f49b..4ab5f83 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -9,7 +9,7 @@ argument_groups: direction: output required: true - name: "--input_scrnaseq_reference" - __merge__: /src/api/file_scrnaseq.yaml + __merge__: /src/api/file_scrnaseq_reference.yaml direction: output required: true - name: Outputs From 6e6a2772dcd562780dd00df4d2994e3c81fd6ed4 Mon Sep 17 00:00:00 2001 From: f641l Date: Mon, 
27 Apr 2026 19:33:24 +0200 Subject: [PATCH 18/22] change to file_scranseq_reference.yaml --- src/api/file_scrnaseq_reference.yaml | 175 ++------------------------- 1 file changed, 13 insertions(+), 162 deletions(-) diff --git a/src/api/file_scrnaseq_reference.yaml b/src/api/file_scrnaseq_reference.yaml index e214add..9b855fd 100644 --- a/src/api/file_scrnaseq_reference.yaml +++ b/src/api/file_scrnaseq_reference.yaml @@ -14,167 +14,28 @@ info: name: counts description: Raw counts required: true - - type: integer + + - type: double name: normalized description: Normalized expression values required: true + + - type: double + name: normalized_log + description: Log1p normalized expression values + required: true + + - type: double + name: normalized_log_scaled + description: Log1p normalized expression values scaled to unit variance and zero mean + required: true + obs: - type: string name: cell_type description: Classification of the cell type based on its characteristics and function within the tissue or organism. required: true - - - type: string - name: cell_type_level2 - description: Classification of the cell type based on its characteristics and function within the tissue or organism. - required: false - - - type: string - name: cell_type_level3 - description: Classification of the cell type based on its characteristics and function within the tissue or organism. - required: false - - - type: string - name: cell_type_level4 - description: Classification of the cell type based on its characteristics and function within the tissue or organism. - required: false - - - type: string - name: dataset_id - description: Identifier for the dataset from which the cell data is derived, useful for tracking and referencing purposes. - required: false - - - type: string - name: assay - description: Type of assay used to generate the cell data, indicating the methodology or technique employed. 
- required: false - - - type: string - name: assay_ontology_term_id - description: Experimental Factor Ontology (`EFO:`) term identifier for the assay, providing a standardized reference to the assay type. - required: false - - - type: string - name: cell_type_ontology_term_id - description: Cell Ontology (`CL:`) term identifier for the cell type, offering a standardized reference to the specific cell classification. - required: false - - - type: string - name: development_stage - description: Stage of development of the organism or tissue from which the cell is derived, indicating its maturity or developmental phase. - required: false - - - type: string - name: development_stage_ontology_term_id - description: | - Ontology term identifier for the developmental stage, providing a standardized reference to the organism's developmental phase. - - If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Developmental Stages (`HsapDv:`) ontology is used. - If the organism is mouse (`organism_ontology_term_id == 'NCBITaxon:10090'`), then the Mouse Developmental Stages (`MmusDv:`) ontology is used. - Otherwise, the Uberon (`UBERON:`) ontology is used. - required: false - - - type: string - name: disease - description: Information on any disease or pathological condition associated with the cell or donor. - required: false - - - type: string - name: disease_ontology_term_id - description: | - Ontology term identifier for the disease, enabling standardized disease classification and referencing. - - Must be a term from the Mondo Disease Ontology (`MONDO:`) ontology term, or `PATO:0000461` from the Phenotype And Trait Ontology (`PATO:`). - required: false - - - type: string - name: donor_id - description: Identifier for the donor from whom the cell sample is obtained. 
- required: false - - type: boolean - name: is_primary_data - description: Indicates whether the data is primary (directly obtained from experiments) or has been computationally derived from other primary data. - required: false - - - type: string - name: organism - description: Organism from which the cell sample is obtained. - required: false - - - type: string - name: organism_ontology_term_id - description: | - Ontology term identifier for the organism, providing a standardized reference for the organism. - - Must be a term from the NCBI Taxonomy Ontology (`NCBITaxon:`) which is a child of `NCBITaxon:33208`. - required: false - - - type: string - name: self_reported_ethnicity - description: Ethnicity of the donor as self-reported, relevant for studies considering genetic diversity and population-specific traits. - required: false - - - type: string - name: self_reported_ethnicity_ontology_term_id - description: | - Ontology term identifier for the self-reported ethnicity, providing a standardized reference for ethnic classifications. - - If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Ancestry Ontology (`HANCESTRO:`) is used. - required: false - - - type: string - name: sex - description: Biological sex of the donor or source organism, crucial for studies involving sex-specific traits or conditions. - required: false - - - type: string - name: sex_ontology_term_id - description: Ontology term identifier for the biological sex, ensuring standardized classification of sex. Only `PATO:0000383`, `PATO:0000384` and `PATO:0001340` are allowed. - required: false - - - type: string - name: suspension_type - description: Type of suspension or medium in which the cells were stored or processed, important for understanding cell handling and conditions. - required: false - - - type: string - name: tissue - description: Specific tissue from which the cells were derived, key for context and specificity in cell studies. 
- required: false - - - type: string - name: tissue_ontology_term_id - description: | - Ontology term identifier for the tissue, providing a standardized reference for the tissue type. - - For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). - For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. - required: false - - - type: string - name: tissue_general - description: General category or classification of the tissue, useful for broader grouping and comparison of cell data. - required: false - - - type: string - name: tissue_general_ontology_term_id - description: | - Ontology term identifier for the general tissue category, aiding in standardizing and grouping tissue types. - - For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). - For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. - required: false - - - type: string - name: batch - description: A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. - required: false - - - type: integer - name: soma_joinid - description: If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the cell. - required: false var: - type: string name: feature_id @@ -188,21 +49,11 @@ info: # TODO: make this required once the dataloader supports it required: true - - type: integer - name: soma_joinid - description: If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the feature. 
- required: false - - type: boolean name: hvg description: Whether or not the feature is considered to be a 'highly variable gene' required: true - - type: double - name: hvg_score - description: A score for the feature indicating how highly variable it is. - required: true - obsp: - type: double name: knn_distances From 292fe3f11c0521a539b11501498bbe1ae1d1f7c3 Mon Sep 17 00:00:00 2001 From: f641l Date: Mon, 27 Apr 2026 19:33:53 +0200 Subject: [PATCH 19/22] change to script.py --- src/data_processors/process_dataset/script.py | 43 ++++++++++++++++++- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index 4eeb6de..6722e4e 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -2,6 +2,7 @@ import anndata as ad import spatialdata as sd import os +import scanpy as sc import shutil ## VIASH START @@ -25,8 +26,35 @@ sc_data = ad.read_h5ad(par["input_sc"]) print(">> Processing sc_data", flush=True) +if "counts" not in sc_data.layers and sc_data.X != None: + print(">> Save raw counts in .layer", flush=True) + sc_data.layers["counts"] = sc_data.X.copy() + +if "normalized" not in sc_data.layers: + print(">> Perform standard normalization", flush=True) + normalized = sc.pp.normalize_total(sc_data.layers["counts"]) + sc_data.layers["normalized"] = normalized.copy() -# TODO: process the single-cell dataset +if "normalized_log" not in sc_data.layers: + print(">> Perform log1p normalization", flush=True) + normalized_log = sc.pp.log1p(sc_data.layers["normalized"]) + sc_data.layers['normalized_log'] = normalized_log.copy() + +if "normalized_log_scaled" not in sc_data.layers: + print(">> Perform 0 mean and standard variance normalization", flush=True) + normalized_log_scaled = sc.pp.scale(sc_data.layers["normalized"]) + sc_data.layers['normalized_log_scaled'] = normalized_log_scaled.copy() + +if "hvg" not in 
sc_data.var: + print(">> Compute highly variable genes", flush=True) + sc.pp.highly_variable_genes( + sc_data, + flavor="seurat_v3", + layer="counts", + span=par['span'], + n_top_genes=par['n_top_genes'] + ) + sc_data.var.rename(columns={"highly_variable": "hvg"}, inplace=True) print(f"single cell data: {sc_data}") @@ -38,7 +66,16 @@ sp_data = sd.read_zarr(par["input_sp"]) print(">> Processing spatial data", flush=True) -# TODO: process the spatial dataset +sp_data_table = sp_data.tables['table'] + +if "cell_area" not in sp_data_table.obs: + print(">> Perform scanpy qc for cell area", flush=True) + sc.pp.calculate_qc_metrics(sp_data_table, inplace=True) + +for x in ["transcript_counts", "n_genes_by_counts"]: + if f"ca_normalized_{x}" not in sp_data_table.obs and x in sp_data_table.obs: + print(f">> Perform cell area normalization for {x}", flush=True) + sp_data_table.obs[f'ca_normalized_{x}'] = sp_data_table.obs[f"{x}"] / sp_data_table.obs["cell_area"] print(f"spatial data: {sp_data}") @@ -47,3 +84,5 @@ if os.path.exists(par["output_spatial_dataset"]): shutil.rmtree(par["output_spatial_dataset"]) sp_data.write(par["output_spatial_dataset"], overwrite=True) + +# %% From aefc78e48a9ae011eaeff2bee01e9f48d6dff82d Mon Sep 17 00:00:00 2001 From: f641l Date: Tue, 28 Apr 2026 10:39:25 +0200 Subject: [PATCH 20/22] adding ari scripts --- src/metrics/accuracy/script.py | 47 ----------------- src/metrics/{accuracy => ari}/config.vsh.yaml | 0 src/metrics/ari/script.py | 50 +++++++++++++++++++ 3 files changed, 50 insertions(+), 47 deletions(-) delete mode 100644 src/metrics/accuracy/script.py rename src/metrics/{accuracy => ari}/config.vsh.yaml (100%) create mode 100644 src/metrics/ari/script.py diff --git a/src/metrics/accuracy/script.py b/src/metrics/accuracy/script.py deleted file mode 100644 index 054e809..0000000 --- a/src/metrics/accuracy/script.py +++ /dev/null @@ -1,47 +0,0 @@ -import anndata as ad -import numpy as np -import sklearn.preprocessing - -## VIASH START -# 
Note: this section is auto-generated by viash at runtime. To edit it, make changes -# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. -par = { - 'input_solution': 'resources_test/task_template/cxg_mouse_pancreas_atlas/solution.h5ad', - 'input_prediction': 'resources_test/task_template/cxg_mouse_pancreas_atlas/prediction.h5ad', - 'output': 'output.h5ad' -} -meta = { - 'name': 'accuracy' -} -## VIASH END - -print('Reading input files', flush=True) -input_solution = ad.read_h5ad(par['input_solution']) -input_prediction = ad.read_h5ad(par['input_prediction']) - -assert (input_prediction.obs_names == input_solution.obs_names).all(), "obs_names not the same in prediction and solution inputs" - -print("Encode labels", flush=True) -cats = list(input_solution.obs["label"].dtype.categories) + list(input_prediction.obs["label_pred"].dtype.categories) -encoder = sklearn.preprocessing.LabelEncoder().fit(cats) -input_solution.obs["label"] = encoder.transform(input_solution.obs["label"]) -input_prediction.obs["label_pred"] = encoder.transform(input_prediction.obs["label_pred"]) - - -print('Compute metrics', flush=True) -# metric_ids and metric_values can have length > 1 -# but should be of equal length -uns_metric_ids = [ 'accuracy' ] -uns_metric_values = np.mean(input_solution.obs["label"] == input_prediction.obs["label_pred"]) - -print("Write output AnnData to file", flush=True) -output = ad.AnnData( - uns={ - 'dataset_id': input_prediction.uns['dataset_id'], - 'normalization_id': input_prediction.uns['normalization_id'], - 'method_id': input_prediction.uns['method_id'], - 'metric_ids': uns_metric_ids, - 'metric_values': uns_metric_values - } -) -output.write_h5ad(par['output'], compression='gzip') diff --git a/src/metrics/accuracy/config.vsh.yaml b/src/metrics/ari/config.vsh.yaml similarity index 100% rename from src/metrics/accuracy/config.vsh.yaml rename to src/metrics/ari/config.vsh.yaml diff --git a/src/metrics/ari/script.py 
b/src/metrics/ari/script.py new file mode 100644 index 0000000..12c6801 --- /dev/null +++ b/src/metrics/ari/script.py @@ -0,0 +1,50 @@ +import anndata as ad +import scanpy as sc +from sklearn.metrics import adjusted_rand_score + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. +par = { + 'input_solution': 'resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad', + 'input_prediction': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq_reference.h5ad', + 'output': 'output.h5ad', + 'label': None +} +meta = { + 'name': 'ari' +} +## VIASH END + +print('>> Reading input files', flush=True) +input_solution = ad.read_h5ad(par['input_solution']) +input_prediction = ad.read_h5ad(par['input_prediction']) + +assert (input_prediction.obs_names == input_solution.obs_names).all(), "obs_names not the same in prediction and solution inputs" + +if not par['label']: + print('>> Postprocessing for metric', flush=True) + seed= 123 + sc.pp.neighbors(input_solution, n_neighbors=20, random_state=seed) + sc.tl.umap(input_solution, min_dist=0.1, spread=1.2, random_state=seed) + sc.tl.leiden(input_solution, resolution=1.0, key_added='leiden', random_state=seed) + + sc.pp.neighbors(input_prediction, n_neighbors=20, random_state=seed) + sc.tl.umap(input_prediction, min_dist=0.1, spread=1.2, random_state=seed) + sc.tl.leiden(input_prediction, resolution=1.0, key_added='leiden', random_state=seed) + +print('>> Compute metrics', flush=True) +uns_metric_ids = [ 'ari' ] +uns_metric_values = adjusted_rand_score(input_solution.obs["label"], input_prediction.obs["label_pred"]) + +print(">> Write output AnnData to file", flush=True) +output = ad.AnnData( + uns={ + 'dataset_id': input_prediction.uns['dataset_id'], + 'normalization_id': input_prediction.uns['normalization_id'], + 'method_id': input_prediction.uns['method_id'], + 
'metric_ids': uns_metric_ids, + 'metric_values': uns_metric_values + } +) +output.write_h5ad(par['output'], compression='gzip') From d8589e57ba50f0a2dba936daaab30fd32fee8b29 Mon Sep 17 00:00:00 2001 From: f641l Date: Tue, 28 Apr 2026 12:11:14 +0200 Subject: [PATCH 21/22] changes to ari scripts, test_resource.sh, config.vsh.yml, script.py --- scripts/create_resources/test_resources.sh | 10 ++--- src/metrics/ari/config.vsh.yaml | 45 ++++++++++++++++------ src/metrics/ari/script.py | 37 ++++++++++-------- 3 files changed, 61 insertions(+), 31 deletions(-) diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh index 774b1f8..26228b0 100755 --- a/scripts/create_resources/test_resources.sh +++ b/scripts/create_resources/test_resources.sh @@ -33,11 +33,11 @@ viash run src/methods/cellpose/config.vsh.yaml -- \ --output $DATASET_DIR/prediction.h5ad # run one metric -# TODO: implement this! -# viash run src/metrics/ari/config.vsh.yaml -- \ -# --input_prediction $DATASET_DIR/prediction.h5ad \ -# --input_scrnaseq_reference $DATASET_DIR/scrnaseq_reference.h5ad \ -# --output $DATASET_DIR/score.h5ad +# TODO files need to be changed +viash run src/metrics/ari/config.vsh.yaml -- \ + --input_scrnaseq_reference $RAW_DATA/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad \ + --input_prediction $DATASET_DIR/output_scrnaseq_reference.h5ad \ + --output $DATASET_DIR/score.h5ad # write manual state.yaml. this is not actually necessary but you never know it might be useful cat > $DATASET_DIR/state.yaml << HERE diff --git a/src/metrics/ari/config.vsh.yaml b/src/metrics/ari/config.vsh.yaml index ac197bd..0575150 100644 --- a/src/metrics/ari/config.vsh.yaml +++ b/src/metrics/ari/config.vsh.yaml @@ -8,26 +8,27 @@ __merge__: ../../api/comp_metric.yaml # A unique identifier for your component (required). # Can contain only lowercase letters or underscores. 
-name: accuracy +name: ari # Metadata for your component info: metrics: # A unique identifier for your metric (required). # Can contain only lowercase letters or underscores. - - name: accuracy + - name: ari # A relatively short label, used when rendering visualisarions (required) - label: Accuracy + label: ARI # A one sentence summary of how this metric works (required). Used when # rendering summary tables. - summary: "The percentage of correctly predicted labels." + summary: "Adjusted Rand index to measure the similarity between two data clusterings." # A multi-line description of how this component works (required). Used # when rendering reference documentation. description: | - The percentage of correctly predicted labels. + The adjusted Rand index (ARI) measures the similarity between two data clusterings by counting pairs of points that are assigned consistently, corrected for chance agreement. + An ARI of 1 indicates that the two clusterings are identical, while a value of 0 indicates agreement no better than random cluster assignment. # A reference key from the bibtex library at src/common/library.bib (required). references: - doi: 10.48550/arXiv.2008.05756 + doi: 10.1080/01621459.1971.10482356 # The minimum possible value for this metric (required) min: 0 # The maximum possible value for this metric (required) @@ -36,11 +37,31 @@ info: maximize: true # Component-specific parameters (optional) -# arguments: -# - name: "--n_neighbors" -# type: "integer" -# default: 5 -# description: Number of neighbors to use. +arguments: + - name: "--label" + type: "string" + default: "leiden" + description: Label to be used to perform ARI. + - name: "--n_neighbors" + type: "integer" + default: 20 + description: Number of neighbors to use for nearest neighbors distance matrix. + - name: "--min_dist" + type: "double" + default: 0.1 + description: Effective minimum distance to use for UMAP. 
+ - name: "--spread" + type: "double" + default: 1.2 + description: The effective scale of embedded points to use for UMAP. + - name: "--resolution" + type: "double" + default: 1.0 + description: The resolution to use for leiden clustering. + - name: "--seed" + type: "integer" + default: 123 + description: Seed. # Resources required to run the component resources: @@ -60,6 +81,8 @@ engines: setup: - type: python packages: scikit-learn + - type: python + packages: leidenalg runners: # This platform allows running the component natively diff --git a/src/metrics/ari/script.py b/src/metrics/ari/script.py index 12c6801..7a09ec7 100644 --- a/src/metrics/ari/script.py +++ b/src/metrics/ari/script.py @@ -1,3 +1,4 @@ + import anndata as ad import scanpy as sc from sklearn.metrics import adjusted_rand_score @@ -6,10 +7,15 @@ # Note: this section is auto-generated by viash at runtime. To edit it, make changes # in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. par = { - 'input_solution': 'resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad', + 'input_scrnaseq_reference': 'resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad', 'input_prediction': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq_reference.h5ad', - 'output': 'output.h5ad', - 'label': None + 'output': 'score.h5ad', + 'label': 'leiden', + 'n_neighbors': 20, + 'min_dist': 0.1, + 'spread': 1.2, + 'resolution': 1.0, + 'seed': 123 } meta = { 'name': 'ari' @@ -17,34 +23,35 @@ ## VIASH END print('>> Reading input files', flush=True) -input_solution = ad.read_h5ad(par['input_solution']) +input_scrnaseq_reference = ad.read_h5ad(par['input_scrnaseq_reference']) input_prediction = ad.read_h5ad(par['input_prediction']) -assert (input_prediction.obs_names == input_solution.obs_names).all(), "obs_names not the same in prediction and solution inputs" +assert (input_prediction.obs_names == input_scrnaseq_reference.obs_names).all(), "obs_names 
not the same in prediction and solution inputs" -if not par['label']: +if par['label'] == 'leiden' : print('>> Postprocessing for metric', flush=True) - seed= 123 - sc.pp.neighbors(input_solution, n_neighbors=20, random_state=seed) - sc.tl.umap(input_solution, min_dist=0.1, spread=1.2, random_state=seed) - sc.tl.leiden(input_solution, resolution=1.0, key_added='leiden', random_state=seed) + sc.pp.neighbors(input_scrnaseq_reference, n_neighbors=par['n_neighbors'], random_state=par['seed']) + sc.tl.umap(input_scrnaseq_reference, min_dist=par['min_dist'], spread=par['spread'], random_state=par['seed']) + sc.tl.leiden(input_scrnaseq_reference, resolution=par['resolution'], key_added='leiden', random_state=par['seed']) - sc.pp.neighbors(input_prediction, n_neighbors=20, random_state=seed) - sc.tl.umap(input_prediction, min_dist=0.1, spread=1.2, random_state=seed) - sc.tl.leiden(input_prediction, resolution=1.0, key_added='leiden', random_state=seed) + sc.pp.neighbors(input_prediction, n_neighbors=par['n_neighbors'], random_state=par['seed']) + sc.tl.umap(input_prediction, min_dist=par['min_dist'], spread=par['spread'], random_state=par['seed']) + sc.tl.leiden(input_prediction, resolution=par['resolution'], key_added='leiden', random_state=par['seed']) print('>> Compute metrics', flush=True) uns_metric_ids = [ 'ari' ] -uns_metric_values = adjusted_rand_score(input_solution.obs["label"], input_prediction.obs["label_pred"]) +uns_metric_values = adjusted_rand_score(input_scrnaseq_reference.obs[par['label']], input_prediction.obs[par['label']]) print(">> Write output AnnData to file", flush=True) output = ad.AnnData( uns={ 'dataset_id': input_prediction.uns['dataset_id'], 'normalization_id': input_prediction.uns['normalization_id'], - 'method_id': input_prediction.uns['method_id'], + # 'method_id': input_prediction.uns['method_id'], #TODO 'metric_ids': uns_metric_ids, 'metric_values': uns_metric_values } ) output.write_h5ad(par['output'], compression='gzip') + +# %% From 
6e8c759f97af321e7fad17b9d2446979d19775d3 Mon Sep 17 00:00:00 2001 From: f641l Date: Tue, 28 Apr 2026 16:43:44 +0200 Subject: [PATCH 22/22] adding new files and changes to ari scripts --- .../random_labels/config.vsh.yaml | 27 ++++++++++ src/control_methods/random_labels/script.py | 39 ++++++++++++++ src/data_processors/leiden/config.vsh.yaml | 52 +++++++++++++++++++ src/data_processors/leiden/script.py | 43 +++++++++++++++ src/metrics/ari/config.vsh.yaml | 20 ------- src/metrics/ari/script.py | 27 ++-------- 6 files changed, 166 insertions(+), 42 deletions(-) create mode 100644 src/control_methods/random_labels/config.vsh.yaml create mode 100644 src/control_methods/random_labels/script.py create mode 100644 src/data_processors/leiden/config.vsh.yaml create mode 100644 src/data_processors/leiden/script.py diff --git a/src/control_methods/random_labels/config.vsh.yaml b/src/control_methods/random_labels/config.vsh.yaml new file mode 100644 index 0000000..b7386ef --- /dev/null +++ b/src/control_methods/random_labels/config.vsh.yaml @@ -0,0 +1,27 @@ +# Base component API configuration +__merge__: ../../api/comp_control_method.yaml + +# Component configuration +name: "random_labels" +label: Random Labels +summary: "Negative control by randomly generating labels." +description: "This method serves as a negative control, where random labels are generated for the data." 
+info: + preferred_normalization: counts + variants: + random_features: + +# Script configuration +resources: + - type: python_script + path: script.py + +# Platform configuration +engines: + - type: docker + image: openproblems/base_python:1.0.0 +runners: + - type: executable + - type: nextflow + directives: + label: [lowtime, lowmem, lowcpu] \ No newline at end of file diff --git a/src/control_methods/random_labels/script.py b/src/control_methods/random_labels/script.py new file mode 100644 index 0000000..5e091f5 --- /dev/null +++ b/src/control_methods/random_labels/script.py @@ -0,0 +1,39 @@ + +import anndata as ad +import random +import pandas as pd + +## VIASH START +par = { + "input": "resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq_reference.h5ad", + "output": "resources_test/task_spatial_segmentation/mouse_brain_combined/reference_prediction.h5ad", + "seed": 123, + "label": "cell_type" +} +meta = { + "name": "random_labels", +} +## VIASH END + +if par["seed"]: + print(f">> Setting seed to {par['seed']}") + random.seed(par["seed"]) + +print("Load input data", flush=True) +input = ad.read_h5ad(par["input"]) + +print("Create random labels", flush=True) +input.obs[par["label"]] = [random.randint(1, 10) for _ in range(input.n_obs)] + +print("Create output AnnData", flush=True) +output = ad.AnnData( + obs=pd.DataFrame(input.obs[par["label"]]), + uns={ + "dataset_id": input.uns["dataset_id"], + "normalization_id": input.uns["normalization_id"], + "method_id": meta["name"], + }, +) + +print("Write output to file", flush=True) +output.write_h5ad(par["output"], compression="gzip") diff --git a/src/data_processors/leiden/config.vsh.yaml b/src/data_processors/leiden/config.vsh.yaml new file mode 100644 index 0000000..dcced68 --- /dev/null +++ b/src/data_processors/leiden/config.vsh.yaml @@ -0,0 +1,52 @@ +__merge__: ../../api/comp_data_processor.yaml + +name: process_dataset + +arguments: + - name: "--label" + type: "string" + default: 
"cell_type" + description: Label added to anndata for prediction. + - name: "--n_neighbors" + type: "integer" + default: 20 + description: Number of neighbors to use for nearest neighbors distance matrix. + - name: "--min_dist" + type: "double" + default: 0.1 + description: Effective minimum distance to use for UMAP. + - name: "--spread" + type: "double" + default: 1.2 + description: The effective scale of embedded points to use for UMAP. + - name: "--resolution" + type: "double" + default: 1.0 + description: The resolution to use for leiden clustering. + - name: "--seed" + type: "integer" + default: 123 + description: Seed. + +resources: + - type: python_script + path: script.py + +engines: + - type: docker + #image: openproblems/base_pytorch_nvidia:1 # TODO: ideally get gpu image to work + image: openproblems/base_python:1 + setup: + - type: python + packages: scikit-learn + - type: python + packages: leidenalg + __merge__: + - /src/base/setup_spatialdata_partial.yaml + - type: native + +runners: + - type: executable + - type: nextflow + directives: + label: [highmem, midcpu, midtime] \ No newline at end of file diff --git a/src/data_processors/leiden/script.py b/src/data_processors/leiden/script.py new file mode 100644 index 0000000..bb0b0dc --- /dev/null +++ b/src/data_processors/leiden/script.py @@ -0,0 +1,43 @@ + +import random +import anndata as ad +import scanpy as sc +import pandas as pd + +## VIASH START +par = { + 'input': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq_reference.h5ad', + 'output': 'resources_test/task_spatial_segmentation/mouse_brain_combined/method_prediction.h5ad', + 'label': 'cell_type', + 'n_neighbors': 20, + 'min_dist': 0.1, + 'spread': 1.2, + 'resolution': 1.0, + 'seed': 123 +} +## VIASH END + +# set seed if need be +if par["seed"]: + print(f">> Setting seed to {par['seed']}") + random.seed(par["seed"]) + +print('>> Reading input files', flush=True) +input = ad.read_h5ad(par['input']) + +print('>> 
Perform Leiden clustering', flush=True) +sc.pp.neighbors(input, n_neighbors=par['n_neighbors'], random_state=par['seed']) +sc.tl.umap(input, min_dist=par['min_dist'], spread=par['spread'], random_state=par['seed']) +sc.tl.leiden(input, resolution=par['resolution'], key_added=par["label"], random_state=par['seed']) + +print(">> Write output AnnData to file", flush=True) +output = ad.AnnData( + obs=pd.DataFrame(input.obs[par["label"]]), + uns={ + "dataset_id": input.uns["dataset_id"], + "normalization_id": input.uns["normalization_id"], + #"method_id": input.uns["method_id"], #TODO + }, +) + +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/metrics/ari/config.vsh.yaml b/src/metrics/ari/config.vsh.yaml index 0575150..c3fb1b0 100644 --- a/src/metrics/ari/config.vsh.yaml +++ b/src/metrics/ari/config.vsh.yaml @@ -42,26 +42,6 @@ arguments: type: "string" default: "leiden" description: Label to be used to perform ARI. - - name: "--n_neighbors" - type: "integer" - default: 20 - description: Number of neighbors to use for nearest neighbors distance matrix. - - name: "--min_dist" - type: "double" - default: 0.1 - description: Effective minimum distance to use for UMAP. - - name: "--spread" - type: "double" - default: 1.2 - description: The effective scale of embedded points to use for UMAP. - - name: "--resolution" - type: "double" - default: 1.0 - description: The resolution to use for leiden clustering. - - name: "--seed" - type: "integer" - default: 123 - description: Seed. # Resources required to run the component resources: diff --git a/src/metrics/ari/script.py b/src/metrics/ari/script.py index 7a09ec7..b2a547d 100644 --- a/src/metrics/ari/script.py +++ b/src/metrics/ari/script.py @@ -7,15 +7,10 @@ # Note: this section is auto-generated by viash at runtime. To edit it, make changes # in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. 
par = { - 'input_scrnaseq_reference': 'resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad', - 'input_prediction': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq_reference.h5ad', - 'output': 'score.h5ad', - 'label': 'leiden', - 'n_neighbors': 20, - 'min_dist': 0.1, - 'spread': 1.2, - 'resolution': 1.0, - 'seed': 123 + 'input_scrnaseq_reference': 'resources_test/task_spatial_segmentation/mouse_brain_combined/reference_prediction.h5ad', + 'input_prediction': 'resources_test/task_spatial_segmentation/mouse_brain_combined/method_prediction.h5ad', + 'output': 'resources_test/task_spatial_segmentation/mouse_brain_combined/score.h5ad', + 'label': 'cell_type' } meta = { 'name': 'ari' @@ -27,17 +22,7 @@ input_prediction = ad.read_h5ad(par['input_prediction']) assert (input_prediction.obs_names == input_scrnaseq_reference.obs_names).all(), "obs_names not the same in prediction and solution inputs" - -if par['label'] == 'leiden' : - print('>> Postprocessing for metric', flush=True) - sc.pp.neighbors(input_scrnaseq_reference, n_neighbors=par['n_neighbors'], random_state=par['seed']) - sc.tl.umap(input_scrnaseq_reference, min_dist=par['min_dist'], spread=par['spread'], random_state=par['seed']) - sc.tl.leiden(input_scrnaseq_reference, resolution=par['resolution'], key_added='leiden', random_state=par['seed']) - - sc.pp.neighbors(input_prediction, n_neighbors=par['n_neighbors'], random_state=par['seed']) - sc.tl.umap(input_prediction, min_dist=par['min_dist'], spread=par['spread'], random_state=par['seed']) - sc.tl.leiden(input_prediction, resolution=par['resolution'], key_added='leiden', random_state=par['seed']) - + print('>> Compute metrics', flush=True) uns_metric_ids = [ 'ari' ] uns_metric_values = adjusted_rand_score(input_scrnaseq_reference.obs[par['label']], input_prediction.obs[par['label']]) @@ -53,5 +38,3 @@ } ) output.write_h5ad(par['output'], compression='gzip') - -# %%