From af3cf2e96c741a2c031b61e0d0d9553acb61a93e Mon Sep 17 00:00:00 2001 From: f641l Date: Wed, 22 Apr 2026 18:06:31 +0200 Subject: [PATCH 01/22] changes many files --- README.md | 2 +- scripts/create_resources/resources.sh | 2 +- scripts/create_resources/test_resources.sh | 27 +++--- scripts/run_benchmark/run_full_local.sh | 2 +- scripts/run_benchmark/run_full_seqeracloud.sh | 2 +- src/api/comp_data_processor.yaml | 2 +- .../process_dataset/config.vsh.yaml | 9 +- .../config/config_default.json | 1 + src/data_processors/process_dataset/script.py | 85 +++++++++---------- src/methods/cellpose/config.vsh.yaml | 8 +- .../process_datasets/config.vsh.yaml | 4 +- src/workflows/process_datasets/main.nf | 4 +- 12 files changed, 70 insertions(+), 78 deletions(-) create mode 100644 src/data_processors/process_dataset/config/config_default.json diff --git a/README.md b/README.md index ccf6db4..9cde50f 100644 --- a/README.md +++ b/README.md @@ -176,7 +176,7 @@ Arguments: | `--input_sp` | `file` | An unprocessed spatial imaging dataset stored as a zarr file. | | `--input_sc` | `file` | An unprocessed dataset as output by a dataset loader. | | `--output_spatial_dataset` | `file` | (*Output*) A spatial transcriptomics dataset, preprocessed for this benchmark. | -| `--output_scrnaseq_reference` | `file` | (*Output*) A single-cell reference dataset, preprocessed for this benchmark. | +| `--output_scrnaseq` | `file` | (*Output*) A single-cell dataset, preprocessed for this benchmark. 
| diff --git a/scripts/create_resources/resources.sh b/scripts/create_resources/resources.sh index 57f4d68..4ba5075 100755 --- a/scripts/create_resources/resources.sh +++ b/scripts/create_resources/resources.sh @@ -18,7 +18,7 @@ cat > /tmp/params.yaml << 'HERE' input_states: s3://openproblems-data/resources/datasets/**/state.yaml rename_keys: 'input:output_dataset' output_state: '$id/state.yaml' -settings: '{"output_train": "$id/train.h5ad", "output_test": "$id/test.h5ad", "output_solution": "$id/solution.h5ad"}' +settings: '{"output_scrnaseq": "$id/output_scrnaseq.h5ad"}' publish_dir: s3://openproblems-data/resources/task_template/datasets/ HERE diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh index 9cb372a..26074a9 100755 --- a/scripts/create_resources/test_resources.sh +++ b/scripts/create_resources/test_resources.sh @@ -13,23 +13,22 @@ cd "$REPO_ROOT" set -e -RAW_DATA=resources_test/common -DATASET_DIR=resources_test/task_template +RAW_DATA=resources_test/task_spatial_segmentation +DATASET_DIR=resources_test/task_spatial_segmentation mkdir -p $DATASET_DIR # process dataset viash run src/data_processors/process_dataset/config.vsh.yaml -- \ - --input $RAW_DATA/cxg_mouse_pancreas_atlas/dataset.h5ad \ - --output_train $DATASET_DIR/cxg_mouse_pancreas_atlas/train.h5ad \ - --output_test $DATASET_DIR/cxg_mouse_pancreas_atlas/test.h5ad \ - --output_solution $DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad + --input_sp $RAW_DATA/mouse_brain_combined/common_ist.zarr \ + --input_sc $RAW_DATA/mouse_brain_combined/common_scrnaseq.h5ad \ + --output_spatial_dataset $DATASET_DIR/output_spatial_dataset.zarr \ + --output_scrnaseq $DATASET_DIR/mouse_brain_combined/output_scrnaseq.h5ad # run one method viash run src/methods/logistic_regression/config.vsh.yaml -- \ - --input_train $DATASET_DIR/cxg_mouse_pancreas_atlas/train.h5ad \ - --input_test $DATASET_DIR/cxg_mouse_pancreas_atlas/test.h5ad \ - --output 
$DATASET_DIR/cxg_mouse_pancreas_atlas/prediction.h5ad + --input $DATASET_DIR/mouse_brain_combined/common_ist.zarr \ + --output $DATASET_DIR/mouse_brain_combined/prediction.h5ad # run one metric viash run src/metrics/accuracy/config.vsh.yaml -- \ @@ -38,12 +37,10 @@ viash run src/metrics/accuracy/config.vsh.yaml -- \ --output $DATASET_DIR/cxg_mouse_pancreas_atlas/score.h5ad # write manual state.yaml. this is not actually necessary but you never know it might be useful -cat > $DATASET_DIR/cxg_mouse_pancreas_atlas/state.yaml << HERE -id: cxg_mouse_pancreas_atlas -train: !file train.h5ad -test: !file test.h5ad -solution: !file solution.h5ad -prediction: !file prediction.h5ad +cat > $DATASET_DIR/mouse_brain_combined/state.yaml << HERE +id: mouse_brain_combined +processed: !file output_scrnaseq.h5ad +segmentation: !file prediction.h5ad score: !file score.h5ad HERE diff --git a/scripts/run_benchmark/run_full_local.sh b/scripts/run_benchmark/run_full_local.sh index f8c1585..4b1aa11 100755 --- a/scripts/run_benchmark/run_full_local.sh +++ b/scripts/run_benchmark/run_full_local.sh @@ -31,7 +31,7 @@ publish_dir="resources/results/${RUN_ID}" # write the parameters to file cat > /tmp/params.yaml << HERE input_states: resources/datasets/**/state.yaml -rename_keys: 'input_train:output_train;input_test:output_test;input_solution:output_solution' +rename_keys: 'input_scrnaseq:output_scrnaseq' output_state: "state.yaml" publish_dir: "$publish_dir" HERE diff --git a/scripts/run_benchmark/run_full_seqeracloud.sh b/scripts/run_benchmark/run_full_seqeracloud.sh index 87d133c..83f37b2 100755 --- a/scripts/run_benchmark/run_full_seqeracloud.sh +++ b/scripts/run_benchmark/run_full_seqeracloud.sh @@ -23,7 +23,7 @@ publish_dir="s3://openproblems-data/resources/task_template/results/${RUN_ID}" # write the parameters to file cat > /tmp/params.yaml << HERE input_states: s3://openproblems-data/resources/task_template/datasets/**/state.yaml -rename_keys: 
'input_train:output_train;input_test:output_test;input_solution:output_solution' +rename_keys: 'input_scrnaseq:output_scrnaseq' output_state: "state.yaml" publish_dir: "$publish_dir" HERE diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml index 22c77aa..9134d64 100644 --- a/src/api/comp_data_processor.yaml +++ b/src/api/comp_data_processor.yaml @@ -23,7 +23,7 @@ argument_groups: __merge__: file_spatial_dataset.yaml direction: output required: true - - name: "--output_scrnaseq_reference" + - name: "--output_scrnaseq" __merge__: file_scrnaseq_reference.yaml direction: output required: true diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml index 0047ae1..92cdd12 100644 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -3,9 +3,8 @@ name: process_dataset arguments: - name: "--method" type: "string" - description: "The process method to assign train/test." - choices: ["batch", "random"] - default: "batch" + description: "The spatial technology data type." + choices: ["xenium"] - name: "--obs_label" type: "string" description: "Which .obs slot to use as label." @@ -18,6 +17,10 @@ arguments: type: "integer" description: "A seed for the subsampling." example: 123 + - name: "--conf" + type: "string" + description: "Config file in json format for data processing parameters." 
+ default: "config/config_default.json" resources: - type: python_script path: script.py diff --git a/src/data_processors/process_dataset/config/config_default.json b/src/data_processors/process_dataset/config/config_default.json new file mode 100644 index 0000000..8d52b6c --- /dev/null +++ b/src/data_processors/process_dataset/config/config_default.json @@ -0,0 +1 @@ +{"span": 1.0, "n_top_genes": 3000} \ No newline at end of file diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index 7cca2bd..97dbe78 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -2,14 +2,16 @@ import random import numpy as np import anndata as ad +import scanpy as sc import openproblems as op +import json ## VIASH START par = { 'input_sp': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr', 'input_sc': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad', - 'output_spatial_dataset': 'output_spatial_dataset.zarr', - 'output_scrnaseq_reference': 'output_scrnaseq_reference.h5ad', + 'output_spatial_dataset': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_spatial_dataset.zarr', + 'output_scrnaseq': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq.h5ad', } meta = { 'resources_dir': 'target/executable/data_processors/process_dataset', @@ -19,7 +21,6 @@ # import helper functions sys.path.append(meta['resources_dir']) -from subset_h5ad_by_format import subset_h5ad_by_format config = op.project.read_viash_config(meta["config"]) @@ -29,54 +30,44 @@ random.seed(par["seed"]) print(">> Load data", flush=True) -adata = ad.read_h5ad(par["input"]) -print("input:", adata) +adata = ad.read_h5ad(par["input_sc"]) +print("input_sc:", adata) -print(f">> Process data using {par['method']} method") -if par["method"] == "batch": - batch_info = adata.obs[par["obs_batch"]] - 
batch_categories = batch_info.dtype.categories - test_batches = random.sample(list(batch_categories), 1) - is_test = [ x in test_batches for x in batch_info ] -elif par["method"] == "random": - train_ix = np.random.choice(adata.n_obs, round(adata.n_obs * 0.8), replace=False) - is_test = [ not x in train_ix for x in range(0, adata.n_obs) ] +print(f">> Process {par['method']} data") -# subset the different adatas -print(">> Figuring which data needs to be copied to which output file", flush=True) -# use par arguments to look for label and batch value in different slots -slot_mapping = { - "obs": { - "label": par["obs_label"], - "batch": par["obs_batch"], - } -} +if par['config']: + print(f">> Perform standard data preprocessing") + with open(par['config'], "r") as f: + config = json.load(f) + + # Add config to params + for key, value in config.items(): + setattr(par, key, value) -print(">> Creating train data", flush=True) -output_train = subset_h5ad_by_format( - adata[[not x for x in is_test]], - config, - "output_train", - slot_mapping -) + adata.layers["counts"] = adata.X.copy() + + sc.pp.normalize_total(adata) + sc.pp.log1p(adata) + adata.layers['normlog'] = adata.X + + sc.pp.highly_variable_genes( + adata, + flavor="seurat_v3", + layer="counts", + span=par['span'], + n_top_genes=par['n_top_genes'] + ) -print(">> Creating test data", flush=True) -output_test = subset_h5ad_by_format( - adata[is_test], - config, - "output_test", - slot_mapping -) + adata.var.sort_values("means") + sc.pp.scale(adata, zero_center=False) + adata.layers['normlogscale'] = adata.X + + adata.X = adata.layers['counts'] -print(">> Creating solution data", flush=True) -output_solution = subset_h5ad_by_format( - adata[is_test], - config, - "output_solution", - slot_mapping -) + # cell area normalization + sc.pp.calculate_qc_metrics(adata, inplace=True) + for x in ['transcript_counts', 'n_genes_by_counts']: + adata.obs[f'canorm_{x}'] = adata.obs[f'{x}'] / adata.obs['cell_area'] print(">> 
Writing data", flush=True) -output_train.write_h5ad(par["output_train"]) -output_test.write_h5ad(par["output_test"]) -output_solution.write_h5ad(par["output_solution"]) +adata.write_h5ad(par["output_scrnaseq"]) diff --git a/src/methods/cellpose/config.vsh.yaml b/src/methods/cellpose/config.vsh.yaml index 46be884..47c6cec 100644 --- a/src/methods/cellpose/config.vsh.yaml +++ b/src/methods/cellpose/config.vsh.yaml @@ -1,11 +1,11 @@ name: cellpose label: "Cellpose" # TODO: update the summary, description and links -summary: "Output of the segmantation methot cellpose" -description: "Output of the segmantation methot cellpose" +summary: "Cellpose-SAM: cell and nucleus segmentation with superhuman generalization." +description: "cellpose is an anatomical segmentation algorithm written in Python 3." links: # these should point to the documentation of the method - documentation: "https://github.com/openproblems-bio/task_ist_preprocessing" - repository: "https://github.com/openproblems-bio/task_ist_preprocessing" + documentation: "https://cellpose.readthedocs.io/en/latest/" + repository: "https://github.com/mouseland/cellpose" references: doi: "10.1038/s41592-020-01018-x" diff --git a/src/workflows/process_datasets/config.vsh.yaml b/src/workflows/process_datasets/config.vsh.yaml index c71286a..fe3b9d4 100644 --- a/src/workflows/process_datasets/config.vsh.yaml +++ b/src/workflows/process_datasets/config.vsh.yaml @@ -18,8 +18,8 @@ argument_groups: __merge__: /src/api/file_spatial_dataset.yaml direction: output required: true - - name: "--output_scrnaseq_reference" - __merge__: /src/api/file_scrnaseq_reference.yaml + - name: "--output_scrnaseq" + __merge__: /src/api/file_scrnaseq.yaml direction: output required: true diff --git a/src/workflows/process_datasets/main.nf b/src/workflows/process_datasets/main.nf index 947a8f1..226e861 100644 --- a/src/workflows/process_datasets/main.nf +++ b/src/workflows/process_datasets/main.nf @@ -45,12 +45,12 @@ workflow run_wf { ], toState: 
[ output_spatial_dataset: "output_spatial_dataset", - output_scrnaseq_reference: "output_scrnaseq_reference" + output_scrnaseq: "output_scrnaseq" ] ) // only output the files for which an output file was specified - | setState(["output_spatial_dataset", "output_scrnaseq_reference"]) + | setState(["output_spatial_dataset", "output_scrnaseq"]) emit: output_ch From ac9c237ca2ae09ed80d7d4a801d9626919bb3483 Mon Sep 17 00:00:00 2001 From: f641l Date: Thu, 23 Apr 2026 20:02:44 +0200 Subject: [PATCH 02/22] bugfix --- README.md | 8 ++++---- src/api/comp_control_method.yaml | 2 +- src/api/comp_data_processor.yaml | 2 +- src/api/comp_metric.yaml | 2 +- .../{file_scrnaseq_reference.yaml => file_scrnaseq.yaml} | 0 src/data_processors/process_dataset/config.vsh.yaml | 4 +++- src/workflows/run_benchmark/config.vsh.yaml | 2 +- 7 files changed, 11 insertions(+), 9 deletions(-) rename src/api/{file_scrnaseq_reference.yaml => file_scrnaseq.yaml} (100%) diff --git a/README.md b/README.md index 9cde50f..26ed0b1 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ flowchart TB file_common_ist("Common iST Dataset") comp_data_processor[/"Data processor"/] file_spatial_dataset("Raw iST Dataset") - file_scrnaseq_reference("scRNA-seq Reference") + file_scrnaseq("scRNA-seq Reference") comp_control_method[/"Control Method"/] comp_method[/"Method"/] comp_metric[/"Metric"/] @@ -48,11 +48,11 @@ flowchart TB file_common_scrnaseq("Common SC Dataset") file_common_ist---comp_data_processor comp_data_processor-->file_spatial_dataset - comp_data_processor-->file_scrnaseq_reference + comp_data_processor-->file_scrnaseq file_spatial_dataset---comp_control_method file_spatial_dataset---comp_method - file_scrnaseq_reference---comp_control_method - file_scrnaseq_reference---comp_metric + file_scrnaseq---comp_control_method + file_scrnaseq---comp_metric comp_control_method-->file_prediction comp_method-->file_prediction comp_metric-->file_score diff --git a/src/api/comp_control_method.yaml 
b/src/api/comp_control_method.yaml index 3f4fa2e..694f004 100644 --- a/src/api/comp_control_method.yaml +++ b/src/api/comp_control_method.yaml @@ -17,7 +17,7 @@ arguments: required: true direction: input - name: "--input_scrnaseq_reference" - __merge__: file_scrnaseq_reference.yaml + __merge__: file_scrnaseq.yaml direction: input required: true - name: --output diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml index 9134d64..137cd12 100644 --- a/src/api/comp_data_processor.yaml +++ b/src/api/comp_data_processor.yaml @@ -24,7 +24,7 @@ argument_groups: direction: output required: true - name: "--output_scrnaseq" - __merge__: file_scrnaseq_reference.yaml + __merge__: file_scrnaseq.yaml direction: output required: true test_resources: diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml index a7470e9..e2d21e6 100644 --- a/src/api/comp_metric.yaml +++ b/src/api/comp_metric.yaml @@ -12,7 +12,7 @@ arguments: direction: input required: true - name: "--input_scrnaseq_reference" - __merge__: file_scrnaseq_reference.yaml + __merge__: file_scrnaseq.yaml direction: input required: true - name: "--output" diff --git a/src/api/file_scrnaseq_reference.yaml b/src/api/file_scrnaseq.yaml similarity index 100% rename from src/api/file_scrnaseq_reference.yaml rename to src/api/file_scrnaseq.yaml diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml index 92cdd12..0aa574c 100644 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -1,5 +1,7 @@ __merge__: ../../api/comp_data_processor.yaml + name: process_dataset + arguments: - name: "--method" type: "string" @@ -21,10 +23,10 @@ arguments: type: "string" description: "Config file in json format for data processing parameters." 
default: "config/config_default.json" + resources: - type: python_script path: script.py - - path: /common/helper_functions/subset_h5ad_by_format.py engines: - type: docker diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index 4ab5f83..dd7f49b 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -9,7 +9,7 @@ argument_groups: direction: output required: true - name: "--input_scrnaseq_reference" - __merge__: /src/api/file_scrnaseq_reference.yaml + __merge__: /src/api/file_scrnaseq.yaml direction: output required: true - name: Outputs From cb6235df4b464ea334e99c473b5520478d5d6336 Mon Sep 17 00:00:00 2001 From: f641l Date: Thu, 23 Apr 2026 20:36:27 +0200 Subject: [PATCH 03/22] bugfix for data_process in config.vsh.yaml --- src/data_processors/process_dataset/config.vsh.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml index 0aa574c..ba70f3a 100644 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -19,7 +19,7 @@ arguments: type: "integer" description: "A seed for the subsampling." example: 123 - - name: "--conf" + - name: "--config" type: "string" description: "Config file in json format for data processing parameters." 
default: "config/config_default.json" From 4e9b8f218fa918f17df50e93fbf78983767fa945 Mon Sep 17 00:00:00 2001 From: f641l Date: Thu, 23 Apr 2026 20:58:29 +0200 Subject: [PATCH 04/22] bugfix for data_process in config.vsh.yaml and script.py --- src/data_processors/process_dataset/config.vsh.yaml | 6 ++++-- src/data_processors/process_dataset/script.py | 4 ++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml index ba70f3a..c4fc3c6 100644 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -20,9 +20,11 @@ arguments: description: "A seed for the subsampling." example: 123 - name: "--config" - type: "string" + type: file description: "Config file in json format for data processing parameters." - default: "config/config_default.json" + required: true + direction: input + example: config/config_default.json resources: - type: python_script diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index 97dbe78..e0954ce 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -12,7 +12,11 @@ 'input_sc': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad', 'output_spatial_dataset': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_spatial_dataset.zarr', 'output_scrnaseq': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq.h5ad', + 'method': 'xenium', + 'seed': 123, + 'config': 'config/config_default.json' } + meta = { 'resources_dir': 'target/executable/data_processors/process_dataset', 'config': 'target/executable/data_processors/process_dataset/.config.vsh.yaml' From ae452afb82cf2a0ce475be92d1bb56c92ded7eeb Mon Sep 17 00:00:00 2001 From: f641l Date: Thu, 23 Apr 2026 21:11:45 +0200 Subject: [PATCH 05/22] 
bugfix for data_process in script.py --- src/data_processors/process_dataset/script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index e0954ce..d5e16f6 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -14,7 +14,7 @@ 'output_scrnaseq': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq.h5ad', 'method': 'xenium', 'seed': 123, - 'config': 'config/config_default.json' + 'config': 'task_spatial_segmentation/src/data_processors/process_dataset/config/config_default.json' } meta = { From 30d308b082be7a3bcff8618f0a5da9fc6ac6f8de Mon Sep 17 00:00:00 2001 From: f641l Date: Thu, 23 Apr 2026 21:16:39 +0200 Subject: [PATCH 06/22] bugfix for data_process in config.vsh.yml --- src/data_processors/process_dataset/config.vsh.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml index c4fc3c6..64444ab 100644 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -20,10 +20,8 @@ arguments: description: "A seed for the subsampling." example: 123 - name: "--config" - type: file + type: "string" description: "Config file in json format for data processing parameters." 
- required: true - direction: input example: config/config_default.json resources: From bc0de4e6399e5fe01706306b5697c7788c724780 Mon Sep 17 00:00:00 2001 From: f641l Date: Thu, 23 Apr 2026 21:26:16 +0200 Subject: [PATCH 07/22] bugfix for data_process in script.py --- src/data_processors/process_dataset/script.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index d5e16f6..7792c9b 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -5,6 +5,7 @@ import scanpy as sc import openproblems as op import json +import shutil ## VIASH START par = { @@ -75,3 +76,6 @@ print(">> Writing data", flush=True) adata.write_h5ad(par["output_scrnaseq"]) + +print(">> Writing spatial data", flush=True) +shutil.copy(par["input_sp"], par["output_spatial_dataset"]) \ No newline at end of file From 4e158a57f2c26b9782e984e6d66d99539dd3d5b5 Mon Sep 17 00:00:00 2001 From: f641l Date: Thu, 23 Apr 2026 21:32:41 +0200 Subject: [PATCH 08/22] bugfix for data_process in script.py --- src/data_processors/process_dataset/script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index 7792c9b..fb5ee5a 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -78,4 +78,4 @@ adata.write_h5ad(par["output_scrnaseq"]) print(">> Writing spatial data", flush=True) -shutil.copy(par["input_sp"], par["output_spatial_dataset"]) \ No newline at end of file +shutil.copytree(par["input_sp"], par["output_spatial_dataset"]) \ No newline at end of file From cb7d5b4291a569c1c27165a1451e47fa3830f509 Mon Sep 17 00:00:00 2001 From: f641l Date: Thu, 23 Apr 2026 21:37:56 +0200 Subject: [PATCH 09/22] bugfix for data_process in script.py --- src/data_processors/process_dataset/script.py | 1 + 1 file 
changed, 1 insertion(+) diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index fb5ee5a..7a48f6d 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -4,6 +4,7 @@ import anndata as ad import scanpy as sc import openproblems as op +import spatialdata as sd import json import shutil From cd1f3d2305b656f0a063103d868e5b5c29c3f112 Mon Sep 17 00:00:00 2001 From: f641l Date: Thu, 23 Apr 2026 22:04:59 +0200 Subject: [PATCH 10/22] changing docker containter in config.vsh.yaml --- src/data_processors/process_dataset/config.vsh.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml index 64444ab..d1dcd00 100644 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -32,6 +32,17 @@ engines: - type: docker image: openproblems/base_python:1 + - type: docker + #image: openproblems/base_pytorch_nvidia:1 # TODO: ideally get gpu image to work + image: openproblems/base_python:1 + setup: + - type: python + packages: scikit-learn + __merge__: + - /src/base/setup_spatialdata_partial.yaml + - type: native + + runners: - type: executable - type: nextflow From 59ed4c16a52758e6490033fc46e1bbadbf48a0bb Mon Sep 17 00:00:00 2001 From: f641l Date: Thu, 23 Apr 2026 22:09:54 +0200 Subject: [PATCH 11/22] changing docker containter in config.vsh.yaml --- src/data_processors/process_dataset/config.vsh.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml index d1dcd00..25700eb 100644 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -29,9 +29,6 @@ resources: path: script.py engines: - - type: docker - image: 
openproblems/base_python:1 - - type: docker #image: openproblems/base_pytorch_nvidia:1 # TODO: ideally get gpu image to work image: openproblems/base_python:1 From c5dfac01afb90b305fbfae097c65c80becc421ed Mon Sep 17 00:00:00 2001 From: f641l Date: Fri, 24 Apr 2026 14:35:15 +0200 Subject: [PATCH 12/22] comment out output_spatial_dataset --- README.md | 2 +- scripts/create_resources/test_resources.sh | 2 +- src/api/comp_data_processor.yaml | 8 ++++---- src/data_processors/process_dataset/script.py | 7 ++----- src/workflows/process_datasets/config.vsh.yaml | 8 ++++---- src/workflows/process_datasets/main.nf | 7 +++++-- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 26ed0b1..7827c92 100644 --- a/README.md +++ b/README.md @@ -175,7 +175,7 @@ Arguments: |:---|:---|:---| | `--input_sp` | `file` | An unprocessed spatial imaging dataset stored as a zarr file. | | `--input_sc` | `file` | An unprocessed dataset as output by a dataset loader. | -| `--output_spatial_dataset` | `file` | (*Output*) A spatial transcriptomics dataset, preprocessed for this benchmark. | + | `--output_scrnaseq` | `file` | (*Output*) A single-cell dataset, preprocessed for this benchmark. 
| diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh index 26074a9..b9b99de 100755 --- a/scripts/create_resources/test_resources.sh +++ b/scripts/create_resources/test_resources.sh @@ -22,7 +22,7 @@ mkdir -p $DATASET_DIR viash run src/data_processors/process_dataset/config.vsh.yaml -- \ --input_sp $RAW_DATA/mouse_brain_combined/common_ist.zarr \ --input_sc $RAW_DATA/mouse_brain_combined/common_scrnaseq.h5ad \ - --output_spatial_dataset $DATASET_DIR/output_spatial_dataset.zarr \ + # --output_spatial_dataset $DATASET_DIR/output_spatial_dataset.zarr \ --output_scrnaseq $DATASET_DIR/mouse_brain_combined/output_scrnaseq.h5ad # run one method diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml index 137cd12..50a1597 100644 --- a/src/api/comp_data_processor.yaml +++ b/src/api/comp_data_processor.yaml @@ -19,10 +19,10 @@ argument_groups: direction: input - name: Outputs arguments: - - name: "--output_spatial_dataset" - __merge__: file_spatial_dataset.yaml - direction: output - required: true + # - name: "--output_spatial_dataset" + # __merge__: file_spatial_dataset.yaml + # direction: output + # required: true - name: "--output_scrnaseq" __merge__: file_scrnaseq.yaml direction: output diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index 7a48f6d..cd3025e 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -12,7 +12,7 @@ par = { 'input_sp': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr', 'input_sc': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad', - 'output_spatial_dataset': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_spatial_dataset.zarr', + #'output_spatial_dataset': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_spatial_dataset.zarr', 'output_scrnaseq': 
'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq.h5ad', 'method': 'xenium', 'seed': 123, @@ -76,7 +76,4 @@ adata.obs[f'canorm_{x}'] = adata.obs[f'{x}'] / adata.obs['cell_area'] print(">> Writing data", flush=True) -adata.write_h5ad(par["output_scrnaseq"]) - -print(">> Writing spatial data", flush=True) -shutil.copytree(par["input_sp"], par["output_spatial_dataset"]) \ No newline at end of file +adata.write_h5ad(par["output_scrnaseq"]) \ No newline at end of file diff --git a/src/workflows/process_datasets/config.vsh.yaml b/src/workflows/process_datasets/config.vsh.yaml index fe3b9d4..127a9e1 100644 --- a/src/workflows/process_datasets/config.vsh.yaml +++ b/src/workflows/process_datasets/config.vsh.yaml @@ -14,10 +14,10 @@ argument_groups: direction: input - name: Outputs arguments: - - name: "--output_spatial_dataset" - __merge__: /src/api/file_spatial_dataset.yaml - direction: output - required: true + # - name: "--output_spatial_dataset" + # __merge__: /src/api/file_spatial_dataset.yaml + # direction: output + # required: true - name: "--output_scrnaseq" __merge__: /src/api/file_scrnaseq.yaml direction: output diff --git a/src/workflows/process_datasets/main.nf b/src/workflows/process_datasets/main.nf index 226e861..2be995d 100644 --- a/src/workflows/process_datasets/main.nf +++ b/src/workflows/process_datasets/main.nf @@ -44,13 +44,16 @@ workflow run_wf { "input_sc": "input_sc" ], toState: [ - output_spatial_dataset: "output_spatial_dataset", + // output_spatial_dataset: "output_spatial_dataset", output_scrnaseq: "output_scrnaseq" ] ) // only output the files for which an output file was specified - | setState(["output_spatial_dataset", "output_scrnaseq"]) + | setState([ + // "output_spatial_dataset", + "output_scrnaseq" + ]) emit: output_ch From 97c42d1530c4c1b0f54b21c8a7618c865b162850 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 27 Apr 2026 14:12:04 +0200 Subject: [PATCH 13/22] Revert "comment out 
output_spatial_dataset" This reverts commit c5dfac01afb90b305fbfae097c65c80becc421ed. --- README.md | 2 +- scripts/create_resources/test_resources.sh | 2 +- src/api/comp_data_processor.yaml | 8 ++++---- src/data_processors/process_dataset/script.py | 7 +++++-- src/workflows/process_datasets/config.vsh.yaml | 8 ++++---- src/workflows/process_datasets/main.nf | 7 ++----- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 7827c92..26ed0b1 100644 --- a/README.md +++ b/README.md @@ -175,7 +175,7 @@ Arguments: |:---|:---|:---| | `--input_sp` | `file` | An unprocessed spatial imaging dataset stored as a zarr file. | | `--input_sc` | `file` | An unprocessed dataset as output by a dataset loader. | - +| `--output_spatial_dataset` | `file` | (*Output*) A spatial transcriptomics dataset, preprocessed for this benchmark. | | `--output_scrnaseq` | `file` | (*Output*) A single-cell dataset, preprocessed for this benchmark. | diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh index b9b99de..26074a9 100755 --- a/scripts/create_resources/test_resources.sh +++ b/scripts/create_resources/test_resources.sh @@ -22,7 +22,7 @@ mkdir -p $DATASET_DIR viash run src/data_processors/process_dataset/config.vsh.yaml -- \ --input_sp $RAW_DATA/mouse_brain_combined/common_ist.zarr \ --input_sc $RAW_DATA/mouse_brain_combined/common_scrnaseq.h5ad \ - # --output_spatial_dataset $DATASET_DIR/output_spatial_dataset.zarr \ + --output_spatial_dataset $DATASET_DIR/output_spatial_dataset.zarr \ --output_scrnaseq $DATASET_DIR/mouse_brain_combined/output_scrnaseq.h5ad # run one method diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml index 50a1597..137cd12 100644 --- a/src/api/comp_data_processor.yaml +++ b/src/api/comp_data_processor.yaml @@ -19,10 +19,10 @@ argument_groups: direction: input - name: Outputs arguments: - # - name: "--output_spatial_dataset" - # __merge__: 
file_spatial_dataset.yaml - # direction: output - # required: true + - name: "--output_spatial_dataset" + __merge__: file_spatial_dataset.yaml + direction: output + required: true - name: "--output_scrnaseq" __merge__: file_scrnaseq.yaml direction: output diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index cd3025e..7a48f6d 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -12,7 +12,7 @@ par = { 'input_sp': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr', 'input_sc': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad', - #'output_spatial_dataset': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_spatial_dataset.zarr', + 'output_spatial_dataset': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_spatial_dataset.zarr', 'output_scrnaseq': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq.h5ad', 'method': 'xenium', 'seed': 123, @@ -76,4 +76,7 @@ adata.obs[f'canorm_{x}'] = adata.obs[f'{x}'] / adata.obs['cell_area'] print(">> Writing data", flush=True) -adata.write_h5ad(par["output_scrnaseq"]) \ No newline at end of file +adata.write_h5ad(par["output_scrnaseq"]) + +print(">> Writing spatial data", flush=True) +shutil.copytree(par["input_sp"], par["output_spatial_dataset"]) \ No newline at end of file diff --git a/src/workflows/process_datasets/config.vsh.yaml b/src/workflows/process_datasets/config.vsh.yaml index 127a9e1..fe3b9d4 100644 --- a/src/workflows/process_datasets/config.vsh.yaml +++ b/src/workflows/process_datasets/config.vsh.yaml @@ -14,10 +14,10 @@ argument_groups: direction: input - name: Outputs arguments: - # - name: "--output_spatial_dataset" - # __merge__: /src/api/file_spatial_dataset.yaml - # direction: output - # required: true + - name: "--output_spatial_dataset" + __merge__: 
/src/api/file_spatial_dataset.yaml + direction: output + required: true - name: "--output_scrnaseq" __merge__: /src/api/file_scrnaseq.yaml direction: output diff --git a/src/workflows/process_datasets/main.nf b/src/workflows/process_datasets/main.nf index 2be995d..226e861 100644 --- a/src/workflows/process_datasets/main.nf +++ b/src/workflows/process_datasets/main.nf @@ -44,16 +44,13 @@ workflow run_wf { "input_sc": "input_sc" ], toState: [ - // output_spatial_dataset: "output_spatial_dataset", + output_spatial_dataset: "output_spatial_dataset", output_scrnaseq: "output_scrnaseq" ] ) // only output the files for which an output file was specified - | setState([ - // "output_spatial_dataset", - "output_scrnaseq" - ]) + | setState(["output_spatial_dataset", "output_scrnaseq"]) emit: output_ch From 00589e79f3a92a93e6db9de7296cbacdb38f56a7 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 27 Apr 2026 14:51:57 +0200 Subject: [PATCH 14/22] update project config --- _viash.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/_viash.yaml b/_viash.yaml index 31ad320..a0130fe 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -11,8 +11,8 @@ license: MIT keywords: [single-cell, openproblems, benchmark] # Step 3: Update the `task_template` to the name of the task from step 1. 
links: - issue_tracker: https://github.com/openproblems-bio/task_template/issues - repository: https://github.com/openproblems-bio/task_template + issue_tracker: https://github.com/openproblems-bio/task_spatial_segmentation/issues + repository: https://github.com/openproblems-bio/task_spatial_segmentation docker_registry: ghcr.io From 9022aa35a9c06528b274ea82eaf1f070125361a4 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 27 Apr 2026 14:52:03 +0200 Subject: [PATCH 15/22] update readme --- README.md | 284 +++++++++++------------------------------------------- 1 file changed, 54 insertions(+), 230 deletions(-) diff --git a/README.md b/README.md index 26ed0b1..1526580 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ A one sentence summary of purpose and methodology. Used for creating an overview tables. Repository: -[openproblems-bio/task_template](https://github.com/openproblems-bio/task_template) +[openproblems-bio/task_spatial_segmentation](https://github.com/openproblems-bio/task_spatial_segmentation) ## Description @@ -28,34 +28,34 @@ should convince readers of the significance and relevance of your task. 
## Authors & contributors -| Name | Roles | Linkedin | Twitter | Email | Github | Orcid | -|:---|:---|:---|:---|:---|:---|:---| -| John Doe | author, maintainer | johndoe | johndoe | john@doe.me | johndoe | 0000-0000-0000-0000 | +| name | roles | +|:---------|:-------------------| +| John Doe | author, maintainer | ## API ``` mermaid flowchart TB - file_common_ist("Common iST Dataset") - comp_data_processor[/"Data processor"/] - file_spatial_dataset("Raw iST Dataset") - file_scrnaseq("scRNA-seq Reference") - comp_control_method[/"Control Method"/] - comp_method[/"Method"/] - comp_metric[/"Metric"/] - file_prediction("Predicted data") - file_score("Score") - file_common_scrnaseq("Common SC Dataset") + file_common_ist("Common iST Dataset") + comp_data_processor[/"Data processor"/] + file_scrnaseq_reference("scRNA-seq Reference") + file_spatial_dataset("Raw iST Dataset") + comp_control_method[/"Control Method"/] + comp_metric[/"Metric"/] + comp_method[/"Method"/] + file_prediction("Predicted data") + file_score("Score") + file_common_scrnaseq("Common SC Dataset") file_common_ist---comp_data_processor + comp_data_processor-->file_scrnaseq_reference comp_data_processor-->file_spatial_dataset - comp_data_processor-->file_scrnaseq + file_scrnaseq_reference---comp_control_method + file_scrnaseq_reference---comp_metric file_spatial_dataset---comp_control_method file_spatial_dataset---comp_method - file_scrnaseq---comp_control_method - file_scrnaseq---comp_metric comp_control_method-->file_prediction - comp_method-->file_prediction comp_metric-->file_score + comp_method-->file_prediction file_prediction---comp_metric file_common_scrnaseq---comp_data_processor ``` @@ -76,91 +76,12 @@ Format:
- SpatialData object - images: 'image', 'image_3D', 'he_image' - labels: 'cell_labels', 'nucleus_labels' - points: 'transcripts' - shapes: 'cell_boundaries', 'nucleus_boundaries' - tables: 'metadata' - coordinate_systems: 'global' -
Data structure:
-*images* - -| Name | Description | -|:-----------|:------------------------------------| -| `image` | The raw image data. | -| `image_3D` | (*Optional*) The raw 3D image data. | -| `he_image` | (*Optional*) H&E image data. | - -*labels* - -| Name | Description | -|:-----------------|:---------------------------------------| -| `cell_labels` | (*Optional*) Cell segmentation labels. | -| `nucleus_labels` | (*Optional*) Cell segmentation labels. | - -*points* - -`transcripts`: Point cloud data of transcripts. - -| Column | Type | Description | -|:---|:---|:---| -| `x` | `float` | x-coordinate of the point. | -| `y` | `float` | y-coordinate of the point. | -| `z` | `float` | (*Optional*) z-coordinate of the point. | -| `feature_name` | `categorical` | Name of the feature. | -| `cell_id` | `integer` | (*Optional*) Unique identifier of the cell. | -| `nucleus_id` | `integer` | (*Optional*) Unique identifier of the nucleus. | -| `cell_type` | `string` | (*Optional*) Cell type of the cell. | -| `qv` | `float` | (*Optional*) Quality value of the point. | -| `transcript_id` | `long` | Unique identifier of the transcript. | -| `overlaps_nucleus` | `boolean` | (*Optional*) Whether the point overlaps with a nucleus. | - -*shapes* - -`cell_boundaries`: Cell boundaries. - -| Column | Type | Description | -|:-----------|:---------|:-------------------------------| -| `geometry` | `object` | Geometry of the cell boundary. | - -`nucleus_boundaries`: Nucleus boundaries. - -| Column | Type | Description | -|:-----------|:---------|:----------------------------------| -| `geometry` | `object` | Geometry of the nucleus boundary. | - -*tables* - -`metadata`: Metadata of spatial dataset. - -| Slot | Type | Description | -|:---|:---|:---| -| `obs["cell_id"]` | `string` | A unique identifier for the cell. | -| `var["gene_ids"]` | `string` | Unique identifier for the gene. | -| `var["feature_types"]` | `string` | Type of the feature. 
| -| `obsm["spatial"]` | `double` | Spatial coordinates of the cell. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["dataset_name"]` | `string` | A human-readable name for the dataset. | -| `uns["dataset_url"]` | `string` | Link to the original source of the dataset. | -| `uns["dataset_reference"]` | `string` | Bibtex reference of the paper in which the dataset was published. | -| `uns["dataset_summary"]` | `string` | Short description of the dataset. | -| `uns["dataset_description"]` | `string` | Long description of the dataset. | -| `uns["dataset_organism"]` | `string` | The organism of the sample in the dataset. | -| `uns["segmentation_id"]` | `string` | A unique identifier for the segmentation. | - -*coordinate_systems* - -| Name | Description | -|:---------|:------------------------------------| -| `global` | Coordinate system of the replicate. | -
## Component type: Data processor @@ -176,110 +97,7 @@ Arguments: | `--input_sp` | `file` | An unprocessed spatial imaging dataset stored as a zarr file. | | `--input_sc` | `file` | An unprocessed dataset as output by a dataset loader. | | `--output_spatial_dataset` | `file` | (*Output*) A spatial transcriptomics dataset, preprocessed for this benchmark. | -| `--output_scrnaseq` | `file` | (*Output*) A single-cell dataset, preprocessed for this benchmark. | - - - -## File format: Raw iST Dataset - -A spatial transcriptomics dataset, preprocessed for this benchmark. - -Example file: -`resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr` - -Description: - -This dataset contains preprocessed images, labels, points, shapes, and -tables for spatial transcriptomics data. - -Format: - -
- - SpatialData object - images: 'image', 'image_3D', 'he_image' - labels: 'cell_labels', 'nucleus_labels' - points: 'transcripts' - shapes: 'cell_boundaries', 'nucleus_boundaries' - tables: 'metadata' - coordinate_systems: 'global' - -
- -Data structure: - -
- -*images* - -| Name | Description | -|:-----------|:------------------------------------| -| `image` | The raw image data. | -| `image_3D` | (*Optional*) The raw 3D image data. | -| `he_image` | (*Optional*) H&E image data. | - -*labels* - -| Name | Description | -|:-----------------|:---------------------------------------| -| `cell_labels` | (*Optional*) Cell segmentation labels. | -| `nucleus_labels` | (*Optional*) Cell segmentation labels. | - -*points* - -`transcripts`: Point cloud data of transcripts. - -| Column | Type | Description | -|:---|:---|:---| -| `x` | `float` | x-coordinate of the point. | -| `y` | `float` | y-coordinate of the point. | -| `z` | `float` | (*Optional*) z-coordinate of the point. | -| `feature_name` | `categorical` | Name of the feature. | -| `cell_id` | `integer` | (*Optional*) Unique identifier of the cell. | -| `nucleus_id` | `integer` | (*Optional*) Unique identifier of the nucleus. | -| `cell_type` | `string` | (*Optional*) Cell type of the cell. | -| `qv` | `float` | (*Optional*) Quality value of the point. | -| `transcript_id` | `long` | Unique identifier of the transcript. | -| `overlaps_nucleus` | `boolean` | (*Optional*) Whether the point overlaps with a nucleus. | - -*shapes* - -`cell_boundaries`: Cell boundaries. - -| Column | Type | Description | -|:-----------|:---------|:-------------------------------| -| `geometry` | `object` | Geometry of the cell boundary. | - -`nucleus_boundaries`: Nucleus boundaries. - -| Column | Type | Description | -|:-----------|:---------|:----------------------------------| -| `geometry` | `object` | Geometry of the nucleus boundary. | - -*tables* - -`metadata`: Metadata of spatial dataset. - -| Slot | Type | Description | -|:---|:---|:---| -| `obs["cell_id"]` | `string` | A unique identifier for the cell. | -| `var["gene_ids"]` | `string` | Unique identifier for the gene. | -| `var["feature_types"]` | `string` | Type of the feature. 
| -| `obsm["spatial"]` | `double` | Spatial coordinates of the cell. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["dataset_name"]` | `string` | A human-readable name for the dataset. | -| `uns["dataset_url"]` | `string` | Link to the original source of the dataset. | -| `uns["dataset_reference"]` | `string` | Bibtex reference of the paper in which the dataset was published. | -| `uns["dataset_summary"]` | `string` | Short description of the dataset. | -| `uns["dataset_description"]` | `string` | Long description of the dataset. | -| `uns["dataset_organism"]` | `string` | The organism of the sample in the dataset. | -| `uns["segmentation_id"]` | `string` | A unique identifier for the segmentation. | - -*coordinate_systems* - -| Name | Description | -|:---------|:------------------------------------| -| `global` | Coordinate system of the replicate. | +| `--output_scrnaseq` | `file` | (*Output*) A single-cell reference dataset, preprocessed for this benchmark. |
@@ -288,7 +106,7 @@ Data structure: A single-cell reference dataset, preprocessed for this benchmark. Example file: -`resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad` +`resources_test/task_spatial_segmentation/mouse_brain_combined/scrnaseq_reference.h5ad` Description: @@ -364,6 +182,30 @@ Data structure: +## File format: Raw iST Dataset + +A spatial transcriptomics dataset, preprocessed for this benchmark. + +Example file: +`resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.zarr` + +Description: + +This dataset contains preprocessed images, labels, points, shapes, and +tables for spatial transcriptomics data. + +Format: + +
+ +
+ +Data structure: + +
+ +
+ ## Component type: Control Method Quality control methods for verifying the pipeline. @@ -380,9 +222,9 @@ Arguments: -## Component type: Method +## Component type: Metric -A method. +A task template metric. Arguments: @@ -390,14 +232,15 @@ Arguments: | Name | Type | Description | |:---|:---|:---| -| `--input` | `file` | A spatial transcriptomics dataset, preprocessed for this benchmark. | -| `--output` | `file` | (*Output*) A predicted dataset as output by a method. | +| `--input_prediction` | `file` | A predicted dataset as output by a method. | +| `--input_scrnaseq_reference` | `file` | A single-cell reference dataset, preprocessed for this benchmark. | +| `--output` | `file` | (*Output*) File indicating the score of a metric. | -## Component type: Metric +## Component type: Method -A task template metric. +A method. Arguments: @@ -405,9 +248,8 @@ Arguments: | Name | Type | Description | |:---|:---|:---| -| `--input_prediction` | `file` | A predicted dataset as output by a method. | -| `--input_scrnaseq_reference` | `file` | A single-cell reference dataset, preprocessed for this benchmark. | -| `--output` | `file` | (*Output*) File indicating the score of a metric. | +| `--input` | `file` | A spatial transcriptomics dataset, preprocessed for this benchmark. | +| `--output` | `file` | (*Output*) A predicted dataset as output by a method. | @@ -422,31 +264,12 @@ Format:
- SpatialData object - labels: 'segmentation' - tables: 'table' -
Data structure:
-*labels* - -| Name | Description | -|:---------------|:--------------------------| -| `segmentation` | Segmentation of the data. | - -*tables* - -`table`: AnnData table. - -| Slot | Type | Description | -|:-----------------|:---------|:------------| -| `obs["cell_id"]` | `string` | Cell ID. | -| `obs["region"]` | `string` | Region. | -
## File format: Score @@ -562,3 +385,4 @@ Data structure: | `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | + From 13368a70fbaeb8678002ce862001f5e6f37743be Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 27 Apr 2026 14:52:19 +0200 Subject: [PATCH 16/22] update helper scripts --- scripts/create_resources/resources.sh | 2 +- scripts/create_resources/test_resources.sh | 42 ++++++++++--------- scripts/run_benchmark/run_full_local.sh | 2 +- scripts/run_benchmark/run_full_seqeracloud.sh | 2 +- 4 files changed, 26 insertions(+), 22 deletions(-) diff --git a/scripts/create_resources/resources.sh b/scripts/create_resources/resources.sh index 4ba5075..52ee226 100755 --- a/scripts/create_resources/resources.sh +++ b/scripts/create_resources/resources.sh @@ -18,7 +18,7 @@ cat > /tmp/params.yaml << 'HERE' input_states: s3://openproblems-data/resources/datasets/**/state.yaml rename_keys: 'input:output_dataset' output_state: '$id/state.yaml' -settings: '{"output_scrnaseq": "$id/output_scrnaseq.h5ad"}' +settings: '{"output_spatial_dataset": "$id/output_spatial_dataset.zarr", "output_scrnaseq": "$id/output_scrnaseq.h5ad"}' publish_dir: s3://openproblems-data/resources/task_template/datasets/ HERE diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh index 26074a9..774b1f8 100755 --- a/scripts/create_resources/test_resources.sh +++ b/scripts/create_resources/test_resources.sh @@ -13,38 +13,42 @@ cd "$REPO_ROOT" set -e -RAW_DATA=resources_test/task_spatial_segmentation -DATASET_DIR=resources_test/task_spatial_segmentation +DATASET_ID=mouse_brain_combined + +RAW_DATA=resources_test/common +DATASET_DIR=resources_test/task_spatial_segmentation/$DATASET_ID mkdir -p $DATASET_DIR # process dataset viash run src/data_processors/process_dataset/config.vsh.yaml -- \ - --input_sp $RAW_DATA/mouse_brain_combined/common_ist.zarr \ - --input_sc 
$RAW_DATA/mouse_brain_combined/common_scrnaseq.h5ad \ - --output_spatial_dataset $DATASET_DIR/output_spatial_dataset.zarr \ - --output_scrnaseq $DATASET_DIR/mouse_brain_combined/output_scrnaseq.h5ad + --input_sp $RAW_DATA/2023_10x_mouse_brain_xenium_rep1/dataset.zarr \ + --input_sc $RAW_DATA/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad \ + --output_spatial_dataset $DATASET_DIR/spatial_dataset.zarr \ + --output_scrnaseq_reference $DATASET_DIR/scrnaseq_reference.h5ad # run one method -viash run src/methods/logistic_regression/config.vsh.yaml -- \ - --input $DATASET_DIR/mouse_brain_combined/common_ist.zarr \ - --output $DATASET_DIR/mouse_brain_combined/prediction.h5ad +viash run src/methods/cellpose/config.vsh.yaml -- \ + --input $DATASET_DIR/spatial_dataset.zarr \ + --output $DATASET_DIR/prediction.h5ad # run one metric -viash run src/metrics/accuracy/config.vsh.yaml -- \ - --input_prediction $DATASET_DIR/cxg_mouse_pancreas_atlas/prediction.h5ad \ - --input_solution $DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad \ - --output $DATASET_DIR/cxg_mouse_pancreas_atlas/score.h5ad +# TODO: implement this! +# viash run src/metrics/ari/config.vsh.yaml -- \ +# --input_prediction $DATASET_DIR/prediction.h5ad \ +# --input_scrnaseq_reference $DATASET_DIR/scrnaseq_reference.h5ad \ +# --output $DATASET_DIR/score.h5ad # write manual state.yaml. 
this is not actually necessary but you never know it might be useful -cat > $DATASET_DIR/mouse_brain_combined/state.yaml << HERE -id: mouse_brain_combined -processed: !file output_scrnaseq.h5ad -segmentation: !file prediction.h5ad -score: !file score.h5ad +cat > $DATASET_DIR/state.yaml << HERE +id: $DATASET_ID +spatial_dataset: spatial_dataset.zarr +scrnaseq_reference: scrnaseq_reference.h5ad +prediction: prediction.h5ad +score: score.h5ad HERE # only run this if you have access to the openproblems-data bucket aws s3 sync --profile op \ - "$DATASET_DIR" s3://openproblems-data/resources_test/task_template \ + "$DATASET_DIR" s3://openproblems-data/resources_test/task_spatial_segmentation/mouse_brain_combined/ \ --delete --dryrun diff --git a/scripts/run_benchmark/run_full_local.sh b/scripts/run_benchmark/run_full_local.sh index 4b1aa11..26bba56 100755 --- a/scripts/run_benchmark/run_full_local.sh +++ b/scripts/run_benchmark/run_full_local.sh @@ -31,7 +31,7 @@ publish_dir="resources/results/${RUN_ID}" # write the parameters to file cat > /tmp/params.yaml << HERE input_states: resources/datasets/**/state.yaml -rename_keys: 'input_scrnaseq:output_scrnaseq' +rename_keys: 'input_spatial_dataset:output_spatial_dataset,input_scrnaseq_reference:output_scrnaseq_reference' output_state: "state.yaml" publish_dir: "$publish_dir" HERE diff --git a/scripts/run_benchmark/run_full_seqeracloud.sh b/scripts/run_benchmark/run_full_seqeracloud.sh index 83f37b2..3c31e74 100755 --- a/scripts/run_benchmark/run_full_seqeracloud.sh +++ b/scripts/run_benchmark/run_full_seqeracloud.sh @@ -23,7 +23,7 @@ publish_dir="s3://openproblems-data/resources/task_template/results/${RUN_ID}" # write the parameters to file cat > /tmp/params.yaml << HERE input_states: s3://openproblems-data/resources/task_template/datasets/**/state.yaml -rename_keys: 'input_scrnaseq:output_scrnaseq' +rename_keys: 'input_spatial_dataset:output_spatial_dataset,input_scrnaseq_reference:output_scrnaseq_reference' output_state: 
"state.yaml" publish_dir: "$publish_dir" HERE From 4f275a894ec71a5ca6400b99a716d96ce1ad3a8d Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 27 Apr 2026 14:52:30 +0200 Subject: [PATCH 17/22] update data processor --- src/api/comp_control_method.yaml | 2 +- src/api/comp_data_processor.yaml | 4 +- src/api/comp_metric.yaml | 2 +- src/api/file_scrnaseq.yaml | 9 - src/api/file_scrnaseq_reference.yaml | 259 ++++++++++++++++++ src/api/file_spatial_dataset.yaml | 169 +++++++++++- .../process_dataset/config.vsh.yaml | 25 +- .../config/config_default.json | 1 - src/data_processors/process_dataset/script.py | 77 ++---- .../process_datasets/config.vsh.yaml | 4 +- src/workflows/process_datasets/main.nf | 4 +- src/workflows/run_benchmark/config.vsh.yaml | 2 +- 12 files changed, 465 insertions(+), 93 deletions(-) delete mode 100644 src/api/file_scrnaseq.yaml create mode 100644 src/api/file_scrnaseq_reference.yaml delete mode 100644 src/data_processors/process_dataset/config/config_default.json diff --git a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml index 694f004..3f4fa2e 100644 --- a/src/api/comp_control_method.yaml +++ b/src/api/comp_control_method.yaml @@ -17,7 +17,7 @@ arguments: required: true direction: input - name: "--input_scrnaseq_reference" - __merge__: file_scrnaseq.yaml + __merge__: file_scrnaseq_reference.yaml direction: input required: true - name: --output diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml index 137cd12..22c77aa 100644 --- a/src/api/comp_data_processor.yaml +++ b/src/api/comp_data_processor.yaml @@ -23,8 +23,8 @@ argument_groups: __merge__: file_spatial_dataset.yaml direction: output required: true - - name: "--output_scrnaseq" - __merge__: file_scrnaseq.yaml + - name: "--output_scrnaseq_reference" + __merge__: file_scrnaseq_reference.yaml direction: output required: true test_resources: diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml index e2d21e6..a7470e9 100644 
--- a/src/api/comp_metric.yaml +++ b/src/api/comp_metric.yaml @@ -12,7 +12,7 @@ arguments: direction: input required: true - name: "--input_scrnaseq_reference" - __merge__: file_scrnaseq.yaml + __merge__: file_scrnaseq_reference.yaml direction: input required: true - name: "--output" diff --git a/src/api/file_scrnaseq.yaml b/src/api/file_scrnaseq.yaml deleted file mode 100644 index 06d8491..0000000 --- a/src/api/file_scrnaseq.yaml +++ /dev/null @@ -1,9 +0,0 @@ -type: file -example: "resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad" -# TODO: revert to the original example once file exists -# example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.h5ad" -label: "scRNA-seq Reference" -summary: A single-cell reference dataset, preprocessed for this benchmark. -description: | - This dataset contains preprocessed counts and metadata for single-cell RNA-seq data. -__merge__: file_common_scrnaseq.yaml \ No newline at end of file diff --git a/src/api/file_scrnaseq_reference.yaml b/src/api/file_scrnaseq_reference.yaml new file mode 100644 index 0000000..e214add --- /dev/null +++ b/src/api/file_scrnaseq_reference.yaml @@ -0,0 +1,259 @@ +type: file +example: "resources_test/task_spatial_segmentation/mouse_brain_combined/scrnaseq_reference.h5ad" +# TODO: revert to the original example once file exists +# example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.h5ad" +label: "scRNA-seq Reference" +summary: A single-cell reference dataset, preprocessed for this benchmark. +description: | + This dataset contains preprocessed counts and metadata for single-cell RNA-seq data. 
+info: + format: + type: h5ad + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: integer + name: normalized + description: Normalized expression values + required: true + obs: + - type: string + name: cell_type + description: Classification of the cell type based on its characteristics and function within the tissue or organism. + required: true + + - type: string + name: cell_type_level2 + description: Classification of the cell type based on its characteristics and function within the tissue or organism. + required: false + + - type: string + name: cell_type_level3 + description: Classification of the cell type based on its characteristics and function within the tissue or organism. + required: false + + - type: string + name: cell_type_level4 + description: Classification of the cell type based on its characteristics and function within the tissue or organism. + required: false + + - type: string + name: dataset_id + description: Identifier for the dataset from which the cell data is derived, useful for tracking and referencing purposes. + required: false + + - type: string + name: assay + description: Type of assay used to generate the cell data, indicating the methodology or technique employed. + required: false + + - type: string + name: assay_ontology_term_id + description: Experimental Factor Ontology (`EFO:`) term identifier for the assay, providing a standardized reference to the assay type. + required: false + + - type: string + name: cell_type_ontology_term_id + description: Cell Ontology (`CL:`) term identifier for the cell type, offering a standardized reference to the specific cell classification. + required: false + + - type: string + name: development_stage + description: Stage of development of the organism or tissue from which the cell is derived, indicating its maturity or developmental phase. 
+ required: false + + - type: string + name: development_stage_ontology_term_id + description: | + Ontology term identifier for the developmental stage, providing a standardized reference to the organism's developmental phase. + + If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Developmental Stages (`HsapDv:`) ontology is used. + If the organism is mouse (`organism_ontology_term_id == 'NCBITaxon:10090'`), then the Mouse Developmental Stages (`MmusDv:`) ontology is used. + Otherwise, the Uberon (`UBERON:`) ontology is used. + required: false + + - type: string + name: disease + description: Information on any disease or pathological condition associated with the cell or donor. + required: false + + - type: string + name: disease_ontology_term_id + description: | + Ontology term identifier for the disease, enabling standardized disease classification and referencing. + + Must be a term from the Mondo Disease Ontology (`MONDO:`) ontology term, or `PATO:0000461` from the Phenotype And Trait Ontology (`PATO:`). + required: false + + - type: string + name: donor_id + description: Identifier for the donor from whom the cell sample is obtained. + required: false + + - type: boolean + name: is_primary_data + description: Indicates whether the data is primary (directly obtained from experiments) or has been computationally derived from other primary data. + required: false + + - type: string + name: organism + description: Organism from which the cell sample is obtained. + required: false + + - type: string + name: organism_ontology_term_id + description: | + Ontology term identifier for the organism, providing a standardized reference for the organism. + + Must be a term from the NCBI Taxonomy Ontology (`NCBITaxon:`) which is a child of `NCBITaxon:33208`. 
+ required: false + + - type: string + name: self_reported_ethnicity + description: Ethnicity of the donor as self-reported, relevant for studies considering genetic diversity and population-specific traits. + required: false + + - type: string + name: self_reported_ethnicity_ontology_term_id + description: | + Ontology term identifier for the self-reported ethnicity, providing a standardized reference for ethnic classifications. + + If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Ancestry Ontology (`HANCESTRO:`) is used. + required: false + + - type: string + name: sex + description: Biological sex of the donor or source organism, crucial for studies involving sex-specific traits or conditions. + required: false + + - type: string + name: sex_ontology_term_id + description: Ontology term identifier for the biological sex, ensuring standardized classification of sex. Only `PATO:0000383`, `PATO:0000384` and `PATO:0001340` are allowed. + required: false + + - type: string + name: suspension_type + description: Type of suspension or medium in which the cells were stored or processed, important for understanding cell handling and conditions. + required: false + + - type: string + name: tissue + description: Specific tissue from which the cells were derived, key for context and specificity in cell studies. + required: false + + - type: string + name: tissue_ontology_term_id + description: | + Ontology term identifier for the tissue, providing a standardized reference for the tissue type. + + For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). + For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. 
+ required: false + + - type: string + name: tissue_general + description: General category or classification of the tissue, useful for broader grouping and comparison of cell data. + required: false + + - type: string + name: tissue_general_ontology_term_id + description: | + Ontology term identifier for the general tissue category, aiding in standardizing and grouping tissue types. + + For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). + For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. + required: false + + - type: string + name: batch + description: A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. + required: false + + - type: integer + name: soma_joinid + description: If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the cell. + required: false + var: + - type: string + name: feature_id + description: Unique identifier for the feature, usually a ENSEMBL gene id. + # TODO: make this required once openproblems_v1 dataloader supports it + required: false + + - type: string + name: feature_name + description: A human-readable name for the feature, usually a gene symbol. + # TODO: make this required once the dataloader supports it + required: true + + - type: integer + name: soma_joinid + description: If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the feature. + required: false + + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + + - type: double + name: hvg_score + description: A score for the feature indicating how highly variable it is. + required: true + + obsp: + - type: double + name: knn_distances + description: K nearest neighbors distance matrix. 
+ required: true + + - type: double + name: knn_connectivities + description: K nearest neighbors connectivities matrix. + required: true + + obsm: + - type: double + name: X_pca + description: The resulting PCA embedding. + required: true + + varm: + - type: double + name: pca_loadings + description: The PCA loadings matrix. + required: true + + uns: + - type: string + name: dataset_id + description: A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived. + required: true + - name: dataset_name + type: string + description: A human-readable name for the dataset. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + multiple: true + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + multiple: true diff --git a/src/api/file_spatial_dataset.yaml b/src/api/file_spatial_dataset.yaml index 5668a3f..41e3c31 100644 --- a/src/api/file_spatial_dataset.yaml +++ b/src/api/file_spatial_dataset.yaml @@ -1,9 +1,174 @@ type: file -example: "resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr" +example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.zarr" # TODO: revert to the original example once file exists # example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.zarr" label: "Raw iST Dataset" summary: A spatial transcriptomics dataset, preprocessed for this benchmark. 
description: | This dataset contains preprocessed images, labels, points, shapes, and tables for spatial transcriptomics data. -__merge__: file_common_ist.yaml +info: + format: + type: spatialdata_zarr + images: + - type: object + name: image + description: The raw image data + required: true + - type: object + name: image_3D + description: The raw 3D image data + required: false + - type: object + name: he_image + description: H&E image data + required: false + labels: + - type: object + name: "cell_labels" + description: Cell segmentation labels + required: false + - type: object + name: "nucleus_labels" + description: Nucleus segmentation labels + required: false + # - type: datatree + # name: "{segm}_3D" + # description: Custom segmentation of the 3D data + # required: false + # - type: datatree + # name: "expert_segm_{patch}" + # description: Expert segmentation of a patch of the data + # required: false + # - type: DataTree[zyx] + # name: "expert_segm_{patch}_3D" + # description: Expert segmentation of a 3D patch of the data + # required: false + points: + - type: dataframe + name: transcripts + description: Point cloud data of transcripts + required: true + columns: + - type: float + name: "x" + required: true + description: x-coordinate of the point + - type: float + name: "y" + required: true + description: y-coordinate of the point + - type: float + name: "z" + required: false + description: z-coordinate of the point + - type: categorical + name: feature_name + required: true + description: Name of the feature + - type: integer + name: "cell_id" + required: false + description: Unique identifier of the cell + - type: integer + name: "nucleus_id" + required: false + description: Unique identifier of the nucleus + - type: string + name: "cell_type" + required: false + description: Cell type of the cell + - type: float + name: qv + required: false + description: Quality value of the point + - type: long + name: transcript_id + required: true + description: 
Unique identifier of the transcript + - type: boolean + name: overlaps_nucleus + required: false + description: Whether the point overlaps with a nucleus + shapes: + - type: dataframe + name: "cell_boundaries" + description: Cell boundaries + required: false + columns: + - type: object + name: "geometry" + required: true + description: Geometry of the cell boundary + - type: dataframe + name: "nucleus_boundaries" + description: Nucleus boundaries + required: false + columns: + - type: object + name: "geometry" + required: true + description: Geometry of the nucleus boundary + tables: + - type: anndata + name: "metadata" + description: Metadata of spatial dataset + required: true + uns: + - type: string + name: dataset_id + required: true + description: A unique identifier for the dataset + - type: string + name: dataset_name + required: true + description: A human-readable name for the dataset + - type: string + name: dataset_url + required: true + description: Link to the original source of the dataset + - type: string + name: dataset_reference + required: true + description: Bibtex reference of the paper in which the dataset was published + - type: string + name: dataset_summary + required: true + description: Short description of the dataset + - type: string + name: dataset_description + required: true + description: Long description of the dataset + - type: string + name: dataset_organism + required: true + description: The organism of the sample in the dataset + - type: string + name: segmentation_id + required: true + multiple: true + description: A unique identifier for the segmentation + obs: + - type: string + name: cell_id + required: true + description: A unique identifier for the cell + var: + - type: string + name: gene_ids + required: true + description: Unique identifier for the gene + - type: string + name: feature_types + required: true + description: Type of the feature + obsm: + - type: double + name: spatial + required: true + description: 
Spatial coordinates of the cell + coordinate_systems: + - type: object + name: global + description: Coordinate system of the replicate + required: true + diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml index 25700eb..58bf840 100644 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -3,26 +3,18 @@ __merge__: ../../api/comp_data_processor.yaml name: process_dataset arguments: - - name: "--method" - type: "string" - description: "The spatial technology data type." - choices: ["xenium"] - - name: "--obs_label" - type: "string" - description: "Which .obs slot to use as label." - default: "cell_type" - - name: "--obs_batch" - type: "string" - description: "Which .obs slot to use as batch covariate." - default: "batch" - name: "--seed" type: "integer" description: "A seed for the subsampling." example: 123 - - name: "--config" - type: "string" - description: "Config file in json format for data processing parameters." - example: config/config_default.json + - name: "--span" + type: double + description: The fraction of the data (cells) used when estimating the variance in the loess model fit if flavor='seurat_v3'. + default: 0.3 + - name: "--n_top_genes" + type: integer + description: Number of highly-variable genes to keep. Mandatory if flavor='seurat_v3'. 
+ default: 3000 resources: - type: python_script @@ -39,7 +31,6 @@ engines: - /src/base/setup_spatialdata_partial.yaml - type: native - runners: - type: executable - type: nextflow diff --git a/src/data_processors/process_dataset/config/config_default.json b/src/data_processors/process_dataset/config/config_default.json deleted file mode 100644 index 8d52b6c..0000000 --- a/src/data_processors/process_dataset/config/config_default.json +++ /dev/null @@ -1 +0,0 @@ -{"span": 1.0, "n_top_genes": 3000} \ No newline at end of file diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index 7a48f6d..4eeb6de 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -1,82 +1,49 @@ -import sys import random -import numpy as np import anndata as ad -import scanpy as sc -import openproblems as op import spatialdata as sd -import json +import os import shutil ## VIASH START par = { - 'input_sp': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_ist.zarr', - 'input_sc': 'resources_test/task_spatial_segmentation/mouse_brain_combined/common_scrnaseq.h5ad', + 'input_sp': 'resources_test/common/2023_10x_mouse_brain_xenium_rep1/dataset.zarr', + 'input_sc': 'resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad', 'output_spatial_dataset': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_spatial_dataset.zarr', - 'output_scrnaseq': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq.h5ad', - 'method': 'xenium', + 'output_scrnaseq_reference': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq_reference.h5ad', + 'span': 0.3, 'seed': 123, - 'config': 'task_spatial_segmentation/src/data_processors/process_dataset/config/config_default.json' -} - -meta = { - 'resources_dir': 'target/executable/data_processors/process_dataset', - 'config': 
'target/executable/data_processors/process_dataset/.config.vsh.yaml' + 'n_top_genes': 3000 } ## VIASH END -# import helper functions -sys.path.append(meta['resources_dir']) - -config = op.project.read_viash_config(meta["config"]) - # set seed if need be if par["seed"]: print(f">> Setting seed to {par['seed']}") random.seed(par["seed"]) print(">> Load data", flush=True) -adata = ad.read_h5ad(par["input_sc"]) -print("input_sc:", adata) +sc_data = ad.read_h5ad(par["input_sc"]) -print(f">> Process {par['method']} data") +print(">> Processing sc_data", flush=True) -if par['config']: - print(f">> Perform standard data preprocessing") - with open(par['config'], "r") as f: - config = json.load(f) +# TODO: process the single-cell dataset - # Add config to params - for key, value in config.items(): - setattr(par, key, value) +print(f"single cell data: {sc_data}") - adata.layers["counts"] = adata.X.copy() - - sc.pp.normalize_total(adata) - sc.pp.log1p(adata) - adata.layers['normlog'] = adata.X - - sc.pp.highly_variable_genes( - adata, - flavor="seurat_v3", - layer="counts", - span=par['span'], - n_top_genes=par['n_top_genes'] - ) +print(">> Writing data", flush=True) +sc_data.write_h5ad(par["output_scrnaseq_reference"], compression="gzip") - adata.var.sort_values("means") - sc.pp.scale(adata, zero_center=False) - adata.layers['normlogscale'] = adata.X - - adata.X = adata.layers['counts'] +# read input_sp +print(">> Read spatial data", flush=True) +sp_data = sd.read_zarr(par["input_sp"]) - # cell area normalization - sc.pp.calculate_qc_metrics(adata, inplace=True) - for x in ['transcript_counts', 'n_genes_by_counts']: - adata.obs[f'canorm_{x}'] = adata.obs[f'{x}'] / adata.obs['cell_area'] +print(">> Processing spatial data", flush=True) +# TODO: process the spatial dataset -print(">> Writing data", flush=True) -adata.write_h5ad(par["output_scrnaseq"]) +print(f"spatial data: {sp_data}") print(">> Writing spatial data", flush=True) -shutil.copytree(par["input_sp"], 
par["output_spatial_dataset"]) \ No newline at end of file +# remove directory if it exists +if os.path.exists(par["output_spatial_dataset"]): + shutil.rmtree(par["output_spatial_dataset"]) +sp_data.write(par["output_spatial_dataset"], overwrite=True) diff --git a/src/workflows/process_datasets/config.vsh.yaml b/src/workflows/process_datasets/config.vsh.yaml index fe3b9d4..c71286a 100644 --- a/src/workflows/process_datasets/config.vsh.yaml +++ b/src/workflows/process_datasets/config.vsh.yaml @@ -18,8 +18,8 @@ argument_groups: __merge__: /src/api/file_spatial_dataset.yaml direction: output required: true - - name: "--output_scrnaseq" - __merge__: /src/api/file_scrnaseq.yaml + - name: "--output_scrnaseq_reference" + __merge__: /src/api/file_scrnaseq_reference.yaml direction: output required: true diff --git a/src/workflows/process_datasets/main.nf b/src/workflows/process_datasets/main.nf index 226e861..947a8f1 100644 --- a/src/workflows/process_datasets/main.nf +++ b/src/workflows/process_datasets/main.nf @@ -45,12 +45,12 @@ workflow run_wf { ], toState: [ output_spatial_dataset: "output_spatial_dataset", - output_scrnaseq: "output_scrnaseq" + output_scrnaseq_reference: "output_scrnaseq_reference" ] ) // only output the files for which an output file was specified - | setState(["output_spatial_dataset", "output_scrnaseq"]) + | setState(["output_spatial_dataset", "output_scrnaseq_reference"]) emit: output_ch diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index dd7f49b..4ab5f83 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -9,7 +9,7 @@ argument_groups: direction: output required: true - name: "--input_scrnaseq_reference" - __merge__: /src/api/file_scrnaseq.yaml + __merge__: /src/api/file_scrnaseq_reference.yaml direction: output required: true - name: Outputs From 6e6a2772dcd562780dd00df4d2994e3c81fd6ed4 Mon Sep 17 00:00:00 2001 From: f641l Date: Mon, 
27 Apr 2026 19:33:24 +0200 Subject: [PATCH 18/22] change to file_scranseq_reference.yaml --- src/api/file_scrnaseq_reference.yaml | 175 ++------------------------- 1 file changed, 13 insertions(+), 162 deletions(-) diff --git a/src/api/file_scrnaseq_reference.yaml b/src/api/file_scrnaseq_reference.yaml index e214add..9b855fd 100644 --- a/src/api/file_scrnaseq_reference.yaml +++ b/src/api/file_scrnaseq_reference.yaml @@ -14,167 +14,28 @@ info: name: counts description: Raw counts required: true - - type: integer + + - type: double name: normalized description: Normalized expression values required: true + + - type: double + name: normalized_log + description: Log1p normalized expression values + required: true + + - type: double + name: normalized_log_scaled + description: Log1p normalized expression values scaled to unit variance and zero mean + required: true + obs: - type: string name: cell_type description: Classification of the cell type based on its characteristics and function within the tissue or organism. required: true - - - type: string - name: cell_type_level2 - description: Classification of the cell type based on its characteristics and function within the tissue or organism. - required: false - - - type: string - name: cell_type_level3 - description: Classification of the cell type based on its characteristics and function within the tissue or organism. - required: false - - - type: string - name: cell_type_level4 - description: Classification of the cell type based on its characteristics and function within the tissue or organism. - required: false - - - type: string - name: dataset_id - description: Identifier for the dataset from which the cell data is derived, useful for tracking and referencing purposes. - required: false - - - type: string - name: assay - description: Type of assay used to generate the cell data, indicating the methodology or technique employed. 
- required: false - - - type: string - name: assay_ontology_term_id - description: Experimental Factor Ontology (`EFO:`) term identifier for the assay, providing a standardized reference to the assay type. - required: false - - - type: string - name: cell_type_ontology_term_id - description: Cell Ontology (`CL:`) term identifier for the cell type, offering a standardized reference to the specific cell classification. - required: false - - - type: string - name: development_stage - description: Stage of development of the organism or tissue from which the cell is derived, indicating its maturity or developmental phase. - required: false - - - type: string - name: development_stage_ontology_term_id - description: | - Ontology term identifier for the developmental stage, providing a standardized reference to the organism's developmental phase. - - If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Developmental Stages (`HsapDv:`) ontology is used. - If the organism is mouse (`organism_ontology_term_id == 'NCBITaxon:10090'`), then the Mouse Developmental Stages (`MmusDv:`) ontology is used. - Otherwise, the Uberon (`UBERON:`) ontology is used. - required: false - - - type: string - name: disease - description: Information on any disease or pathological condition associated with the cell or donor. - required: false - - - type: string - name: disease_ontology_term_id - description: | - Ontology term identifier for the disease, enabling standardized disease classification and referencing. - - Must be a term from the Mondo Disease Ontology (`MONDO:`) ontology term, or `PATO:0000461` from the Phenotype And Trait Ontology (`PATO:`). - required: false - - - type: string - name: donor_id - description: Identifier for the donor from whom the cell sample is obtained. 
- required: false - - type: boolean - name: is_primary_data - description: Indicates whether the data is primary (directly obtained from experiments) or has been computationally derived from other primary data. - required: false - - - type: string - name: organism - description: Organism from which the cell sample is obtained. - required: false - - - type: string - name: organism_ontology_term_id - description: | - Ontology term identifier for the organism, providing a standardized reference for the organism. - - Must be a term from the NCBI Taxonomy Ontology (`NCBITaxon:`) which is a child of `NCBITaxon:33208`. - required: false - - - type: string - name: self_reported_ethnicity - description: Ethnicity of the donor as self-reported, relevant for studies considering genetic diversity and population-specific traits. - required: false - - - type: string - name: self_reported_ethnicity_ontology_term_id - description: | - Ontology term identifier for the self-reported ethnicity, providing a standardized reference for ethnic classifications. - - If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Ancestry Ontology (`HANCESTRO:`) is used. - required: false - - - type: string - name: sex - description: Biological sex of the donor or source organism, crucial for studies involving sex-specific traits or conditions. - required: false - - - type: string - name: sex_ontology_term_id - description: Ontology term identifier for the biological sex, ensuring standardized classification of sex. Only `PATO:0000383`, `PATO:0000384` and `PATO:0001340` are allowed. - required: false - - - type: string - name: suspension_type - description: Type of suspension or medium in which the cells were stored or processed, important for understanding cell handling and conditions. - required: false - - - type: string - name: tissue - description: Specific tissue from which the cells were derived, key for context and specificity in cell studies. 
- required: false - - - type: string - name: tissue_ontology_term_id - description: | - Ontology term identifier for the tissue, providing a standardized reference for the tissue type. - - For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). - For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. - required: false - - - type: string - name: tissue_general - description: General category or classification of the tissue, useful for broader grouping and comparison of cell data. - required: false - - - type: string - name: tissue_general_ontology_term_id - description: | - Ontology term identifier for the general tissue category, aiding in standardizing and grouping tissue types. - - For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). - For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. - required: false - - - type: string - name: batch - description: A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. - required: false - - - type: integer - name: soma_joinid - description: If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the cell. - required: false var: - type: string name: feature_id @@ -188,21 +49,11 @@ info: # TODO: make this required once the dataloader supports it required: true - - type: integer - name: soma_joinid - description: If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the feature. 
- required: false - - type: boolean name: hvg description: Whether or not the feature is considered to be a 'highly variable gene' required: true - - type: double - name: hvg_score - description: A score for the feature indicating how highly variable it is. - required: true - obsp: - type: double name: knn_distances From 292fe3f11c0521a539b11501498bbe1ae1d1f7c3 Mon Sep 17 00:00:00 2001 From: f641l Date: Mon, 27 Apr 2026 19:33:53 +0200 Subject: [PATCH 19/22] change to script.py --- src/data_processors/process_dataset/script.py | 43 ++++++++++++++++++- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index 4eeb6de..6722e4e 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -2,6 +2,7 @@ import anndata as ad import spatialdata as sd import os +import scanpy as sc import shutil ## VIASH START @@ -25,8 +26,35 @@ sc_data = ad.read_h5ad(par["input_sc"]) print(">> Processing sc_data", flush=True) +if "counts" not in sc_data.layers and sc_data.X != None: + print(">> Save raw counts in .layer", flush=True) + sc_data.layers["counts"] = sc_data.X.copy() + +if "normalized" not in sc_data.layers: + print(">> Perform standard normalization", flush=True) + normalized = sc.pp.normalize_total(sc_data.layers["counts"]) + sc_data.layers["normalized"] = normalized.copy() -# TODO: process the single-cell dataset +if "normalized_log" not in sc_data.layers: + print(">> Perform log1p normalization", flush=True) + normalized_log = sc.pp.log1p(sc_data.layers["normalized"]) + sc_data.layers['normalized_log'] = normalized_log.copy() + +if "normalized_log_scaled" not in sc_data.layers: + print(">> Perform 0 mean and standard variance normalization", flush=True) + normalized_log_scaled = sc.pp.scale(sc_data.layers["normalized"]) + sc_data.layers['normalized_log_scaled'] = normalized_log_scaled.copy() + +if "hvg" not in 
sc_data.var: + print(">> Compute highly variable genes", flush=True) + sc.pp.highly_variable_genes( + sc_data, + flavor="seurat_v3", + layer="counts", + span=par['span'], + n_top_genes=par['n_top_genes'] + ) + sc_data.var.rename(columns={"highly_variable": "hvg"}, inplace=True) print(f"single cell data: {sc_data}") @@ -38,7 +66,16 @@ sp_data = sd.read_zarr(par["input_sp"]) print(">> Processing spatial data", flush=True) -# TODO: process the spatial dataset +sp_data_table = sp_data.tables['table'] + +if "cell_area" not in sp_data_table.obs: + print(">> Perform scanpy qc for cell area", flush=True) + sc.pp.calculate_qc_metrics(sp_data_table, inplace=True) + +for x in ["transcript_counts", "n_genes_by_counts"]: + if f"ca_normalized_{x}" not in sp_data_table.obs and x in sp_data_table.obs: + print(f">> Perform cell area normalization for {x}", flush=True) + sp_data_table.obs[f'ca_normalized_{x}'] = sp_data_table.obs[f"{x}"] / sp_data_table.obs["cell_area"] print(f"spatial data: {sp_data}") @@ -47,3 +84,5 @@ if os.path.exists(par["output_spatial_dataset"]): shutil.rmtree(par["output_spatial_dataset"]) sp_data.write(par["output_spatial_dataset"], overwrite=True) + +# %% From aefc78e48a9ae011eaeff2bee01e9f48d6dff82d Mon Sep 17 00:00:00 2001 From: f641l Date: Tue, 28 Apr 2026 10:39:25 +0200 Subject: [PATCH 20/22] adding ari scripts --- src/metrics/accuracy/script.py | 47 ----------------- src/metrics/{accuracy => ari}/config.vsh.yaml | 0 src/metrics/ari/script.py | 50 +++++++++++++++++++ 3 files changed, 50 insertions(+), 47 deletions(-) delete mode 100644 src/metrics/accuracy/script.py rename src/metrics/{accuracy => ari}/config.vsh.yaml (100%) create mode 100644 src/metrics/ari/script.py diff --git a/src/metrics/accuracy/script.py b/src/metrics/accuracy/script.py deleted file mode 100644 index 054e809..0000000 --- a/src/metrics/accuracy/script.py +++ /dev/null @@ -1,47 +0,0 @@ -import anndata as ad -import numpy as np -import sklearn.preprocessing - -## VIASH START -# 
Note: this section is auto-generated by viash at runtime. To edit it, make changes -# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. -par = { - 'input_solution': 'resources_test/task_template/cxg_mouse_pancreas_atlas/solution.h5ad', - 'input_prediction': 'resources_test/task_template/cxg_mouse_pancreas_atlas/prediction.h5ad', - 'output': 'output.h5ad' -} -meta = { - 'name': 'accuracy' -} -## VIASH END - -print('Reading input files', flush=True) -input_solution = ad.read_h5ad(par['input_solution']) -input_prediction = ad.read_h5ad(par['input_prediction']) - -assert (input_prediction.obs_names == input_solution.obs_names).all(), "obs_names not the same in prediction and solution inputs" - -print("Encode labels", flush=True) -cats = list(input_solution.obs["label"].dtype.categories) + list(input_prediction.obs["label_pred"].dtype.categories) -encoder = sklearn.preprocessing.LabelEncoder().fit(cats) -input_solution.obs["label"] = encoder.transform(input_solution.obs["label"]) -input_prediction.obs["label_pred"] = encoder.transform(input_prediction.obs["label_pred"]) - - -print('Compute metrics', flush=True) -# metric_ids and metric_values can have length > 1 -# but should be of equal length -uns_metric_ids = [ 'accuracy' ] -uns_metric_values = np.mean(input_solution.obs["label"] == input_prediction.obs["label_pred"]) - -print("Write output AnnData to file", flush=True) -output = ad.AnnData( - uns={ - 'dataset_id': input_prediction.uns['dataset_id'], - 'normalization_id': input_prediction.uns['normalization_id'], - 'method_id': input_prediction.uns['method_id'], - 'metric_ids': uns_metric_ids, - 'metric_values': uns_metric_values - } -) -output.write_h5ad(par['output'], compression='gzip') diff --git a/src/metrics/accuracy/config.vsh.yaml b/src/metrics/ari/config.vsh.yaml similarity index 100% rename from src/metrics/accuracy/config.vsh.yaml rename to src/metrics/ari/config.vsh.yaml diff --git a/src/metrics/ari/script.py 
b/src/metrics/ari/script.py new file mode 100644 index 0000000..12c6801 --- /dev/null +++ b/src/metrics/ari/script.py @@ -0,0 +1,50 @@ +import anndata as ad +import scanpy as sc +from sklearn.metrics import adjusted_rand_score + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. +par = { + 'input_solution': 'resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad', + 'input_prediction': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq_reference.h5ad', + 'output': 'output.h5ad', + 'label': None +} +meta = { + 'name': 'ari' +} +## VIASH END + +print('>> Reading input files', flush=True) +input_solution = ad.read_h5ad(par['input_solution']) +input_prediction = ad.read_h5ad(par['input_prediction']) + +assert (input_prediction.obs_names == input_solution.obs_names).all(), "obs_names not the same in prediction and solution inputs" + +if not par['label']: + print('>> Postprocessing for metric', flush=True) + seed= 123 + sc.pp.neighbors(input_solution, n_neighbors=20, random_state=seed) + sc.tl.umap(input_solution, min_dist=0.1, spread=1.2, random_state=seed) + sc.tl.leiden(input_solution, resolution=1.0, key_added='leiden', random_state=seed) + + sc.pp.neighbors(input_prediction, n_neighbors=20, random_state=seed) + sc.tl.umap(input_prediction, min_dist=0.1, spread=1.2, random_state=seed) + sc.tl.leiden(input_prediction, resolution=1.0, key_added='leiden', random_state=seed) + +print('>> Compute metrics', flush=True) +uns_metric_ids = [ 'ari' ] +uns_metric_values = adjusted_rand_score(input_solution.obs["label"], input_prediction.obs["label_pred"]) + +print(">> Write output AnnData to file", flush=True) +output = ad.AnnData( + uns={ + 'dataset_id': input_prediction.uns['dataset_id'], + 'normalization_id': input_prediction.uns['normalization_id'], + 'method_id': input_prediction.uns['method_id'], + 
'metric_ids': uns_metric_ids, + 'metric_values': uns_metric_values + } +) +output.write_h5ad(par['output'], compression='gzip') From d8589e57ba50f0a2dba936daaab30fd32fee8b29 Mon Sep 17 00:00:00 2001 From: f641l Date: Tue, 28 Apr 2026 12:11:14 +0200 Subject: [PATCH 21/22] changes to ari scripts, test_resource.sh, config.vsh.yml, script.py --- scripts/create_resources/test_resources.sh | 10 ++--- src/metrics/ari/config.vsh.yaml | 45 ++++++++++++++++------ src/metrics/ari/script.py | 37 ++++++++++-------- 3 files changed, 61 insertions(+), 31 deletions(-) diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh index 774b1f8..26228b0 100755 --- a/scripts/create_resources/test_resources.sh +++ b/scripts/create_resources/test_resources.sh @@ -33,11 +33,11 @@ viash run src/methods/cellpose/config.vsh.yaml -- \ --output $DATASET_DIR/prediction.h5ad # run one metric -# TODO: implement this! -# viash run src/metrics/ari/config.vsh.yaml -- \ -# --input_prediction $DATASET_DIR/prediction.h5ad \ -# --input_scrnaseq_reference $DATASET_DIR/scrnaseq_reference.h5ad \ -# --output $DATASET_DIR/score.h5ad +# TODO files need to be changed +viash run src/metrics/ari/config.vsh.yaml -- \ + --input_scrnaseq_reference $RAW_DATA/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad \ + --input_prediction $DATASET_DIR/output_scrnaseq_reference.h5ad \ + --output $DATASET_DIR/score.h5ad # write manual state.yaml. this is not actually necessary but you never know it might be useful cat > $DATASET_DIR/state.yaml << HERE diff --git a/src/metrics/ari/config.vsh.yaml b/src/metrics/ari/config.vsh.yaml index ac197bd..0575150 100644 --- a/src/metrics/ari/config.vsh.yaml +++ b/src/metrics/ari/config.vsh.yaml @@ -8,26 +8,27 @@ __merge__: ../../api/comp_metric.yaml # A unique identifier for your component (required). # Can contain only lowercase letters or underscores. 
-name: accuracy +name: ari # Metadata for your component info: metrics: # A unique identifier for your metric (required). # Can contain only lowercase letters or underscores. - - name: accuracy + - name: ari # A relatively short label, used when rendering visualisarions (required) - label: Accuracy + label: ARI # A one sentence summary of how this metric works (required). Used when # rendering summary tables. - summary: "The percentage of correctly predicted labels." + summary: "Adjusted Rand index to measure the similarity between two data clusterings." # A multi-line description of how this component works (required). Used # when rendering reference documentation. description: | - The percentage of correctly predicted labels. + The adjusted Rand index (ARI) measures the similarity between two data clusterings by counting pairs of points that are assigned consistently, corrected for chance agreement. + An ARI of 1 indicates that the two clusterings are identical, while a value of 0 indicates agreement no better than random cluster assignment. # A reference key from the bibtex library at src/common/library.bib (required). references: - doi: 10.48550/arXiv.2008.05756 + doi: 10.1080/01621459.1971.10482356 # The minimum possible value for this metric (required) min: 0 # The maximum possible value for this metric (required) @@ -36,11 +37,31 @@ info: maximize: true # Component-specific parameters (optional) -# arguments: -# - name: "--n_neighbors" -# type: "integer" -# default: 5 -# description: Number of neighbors to use. +arguments: + - name: "--label" + type: "string" + default: "leiden" + description: Label to be used to perform ARI. + - name: "--n_neighbors" + type: "integer" + default: 20 + description: Number of neighbors to use for nearest neighbors distance matrix. + - name: "--min_dist" + type: "double" + default: 0.1 + description: Effective minimum distance to use for UMAP. 
+ - name: "--spread" + type: "double" + default: 1.2 + description: The effective scale of embedded points to use for UMAP. + - name: "--resolution" + type: "double" + default: 1.0 + description: The resolution to use for leiden clustering. + - name: "--seed" + type: "integer" + default: 123 + description: Seed. # Resources required to run the component resources: @@ -60,6 +81,8 @@ engines: setup: - type: python packages: scikit-learn + - type: python + packages: leidenalg runners: # This platform allows running the component natively diff --git a/src/metrics/ari/script.py b/src/metrics/ari/script.py index 12c6801..7a09ec7 100644 --- a/src/metrics/ari/script.py +++ b/src/metrics/ari/script.py @@ -1,3 +1,4 @@ + import anndata as ad import scanpy as sc from sklearn.metrics import adjusted_rand_score @@ -6,10 +7,15 @@ # Note: this section is auto-generated by viash at runtime. To edit it, make changes # in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. par = { - 'input_solution': 'resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad', + 'input_scrnaseq_reference': 'resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad', 'input_prediction': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq_reference.h5ad', - 'output': 'output.h5ad', - 'label': None + 'output': 'score.h5ad', + 'label': 'leiden', + 'n_neighbors': 20, + 'min_dist': 0.1, + 'spread': 1.2, + 'resolution': 1.0, + 'seed': 123 } meta = { 'name': 'ari' @@ -17,34 +23,35 @@ ## VIASH END print('>> Reading input files', flush=True) -input_solution = ad.read_h5ad(par['input_solution']) +input_scrnaseq_reference = ad.read_h5ad(par['input_scrnaseq_reference']) input_prediction = ad.read_h5ad(par['input_prediction']) -assert (input_prediction.obs_names == input_solution.obs_names).all(), "obs_names not the same in prediction and solution inputs" +assert (input_prediction.obs_names == input_scrnaseq_reference.obs_names).all(), "obs_names 
not the same in prediction and solution inputs" -if not par['label']: +if par['label'] == 'leiden' : print('>> Postprocessing for metric', flush=True) - seed= 123 - sc.pp.neighbors(input_solution, n_neighbors=20, random_state=seed) - sc.tl.umap(input_solution, min_dist=0.1, spread=1.2, random_state=seed) - sc.tl.leiden(input_solution, resolution=1.0, key_added='leiden', random_state=seed) + sc.pp.neighbors(input_scrnaseq_reference, n_neighbors=par['n_neighbors'], random_state=par['seed']) + sc.tl.umap(input_scrnaseq_reference, min_dist=par['min_dist'], spread=par['spread'], random_state=par['seed']) + sc.tl.leiden(input_scrnaseq_reference, resolution=par['resolution'], key_added='leiden', random_state=par['seed']) - sc.pp.neighbors(input_prediction, n_neighbors=20, random_state=seed) - sc.tl.umap(input_prediction, min_dist=0.1, spread=1.2, random_state=seed) - sc.tl.leiden(input_prediction, resolution=1.0, key_added='leiden', random_state=seed) + sc.pp.neighbors(input_prediction, n_neighbors=par['n_neighbors'], random_state=par['seed']) + sc.tl.umap(input_prediction, min_dist=par['min_dist'], spread=par['spread'], random_state=par['seed']) + sc.tl.leiden(input_prediction, resolution=par['resolution'], key_added='leiden', random_state=par['seed']) print('>> Compute metrics', flush=True) uns_metric_ids = [ 'ari' ] -uns_metric_values = adjusted_rand_score(input_solution.obs["label"], input_prediction.obs["label_pred"]) +uns_metric_values = adjusted_rand_score(input_scrnaseq_reference.obs[par['label']], input_prediction.obs[par['label']]) print(">> Write output AnnData to file", flush=True) output = ad.AnnData( uns={ 'dataset_id': input_prediction.uns['dataset_id'], 'normalization_id': input_prediction.uns['normalization_id'], - 'method_id': input_prediction.uns['method_id'], + # 'method_id': input_prediction.uns['method_id'], #TODO 'metric_ids': uns_metric_ids, 'metric_values': uns_metric_values } ) output.write_h5ad(par['output'], compression='gzip') + +# %% From 
6e8c759f97af321e7fad17b9d2446979d19775d3 Mon Sep 17 00:00:00 2001 From: f641l Date: Tue, 28 Apr 2026 16:43:44 +0200 Subject: [PATCH 22/22] adding new files and changes to ari scripts --- .../random_labels/config.vsh.yaml | 27 ++++++++++ src/control_methods/random_labels/script.py | 39 ++++++++++++++ src/data_processors/leiden/config.vsh.yaml | 52 +++++++++++++++++++ src/data_processors/leiden/script.py | 43 +++++++++++++++ src/metrics/ari/config.vsh.yaml | 20 ------- src/metrics/ari/script.py | 27 ++-------- 6 files changed, 166 insertions(+), 42 deletions(-) create mode 100644 src/control_methods/random_labels/config.vsh.yaml create mode 100644 src/control_methods/random_labels/script.py create mode 100644 src/data_processors/leiden/config.vsh.yaml create mode 100644 src/data_processors/leiden/script.py diff --git a/src/control_methods/random_labels/config.vsh.yaml b/src/control_methods/random_labels/config.vsh.yaml new file mode 100644 index 0000000..b7386ef --- /dev/null +++ b/src/control_methods/random_labels/config.vsh.yaml @@ -0,0 +1,27 @@ +# Base component API configuration +__merge__: ../../api/comp_control_method.yaml + +# Component configuration +name: "random_labels" +label: Random Labels +summary: "Negative control by randomly generating labels." +description: "This method serves as a negative control, where random labels are generated for the data." 
+info: + preferred_normalization: counts + variants: + random_features: + +# Script configuration +resources: + - type: python_script + path: script.py + +# Platform configuration +engines: + - type: docker + image: openproblems/base_python:1.0.0 +runners: + - type: executable + - type: nextflow + directives: + label: [lowtime, lowmem, lowcpu] \ No newline at end of file diff --git a/src/control_methods/random_labels/script.py b/src/control_methods/random_labels/script.py new file mode 100644 index 0000000..5e091f5 --- /dev/null +++ b/src/control_methods/random_labels/script.py @@ -0,0 +1,39 @@ + +import anndata as ad +import random +import pandas as pd + +## VIASH START +par = { + "input": "resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq_reference.h5ad", + "output": "resources_test/task_spatial_segmentation/mouse_brain_combined/reference_prediction.h5ad", + "seed": 123, + "label": "cell_type" +} +meta = { + "name": "random_labels", +} +## VIASH END + +if par["seed"]: + print(f">> Setting seed to {par['seed']}") + random.seed(par["seed"]) + +print("Load input data", flush=True) +input = ad.read_h5ad(par["input"]) + +print("Create random labels", flush=True) +input.obs[par["label"]] = [random.randint(1, 10) for _ in range(input.n_obs)] + +print("Create output AnnData", flush=True) +output = ad.AnnData( + obs=pd.DataFrame(input.obs[par["label"]]), + uns={ + "dataset_id": input.uns["dataset_id"], + "normalization_id": input.uns["normalization_id"], + "method_id": meta["name"], + }, +) + +print("Write output to file", flush=True) +output.write_h5ad(par["output"], compression="gzip") diff --git a/src/data_processors/leiden/config.vsh.yaml b/src/data_processors/leiden/config.vsh.yaml new file mode 100644 index 0000000..dcced68 --- /dev/null +++ b/src/data_processors/leiden/config.vsh.yaml @@ -0,0 +1,52 @@ +__merge__: ../../api/comp_data_processor.yaml + +name: process_dataset + +arguments: + - name: "--label" + type: "string" + default: 
"cell_type" + description: Label added to anndata for prediction. + - name: "--n_neighbors" + type: "integer" + default: 20 + description: Number of neighbors to use for nearest neighbors distance matrix. + - name: "--min_dist" + type: "double" + default: 0.1 + description: Effective minimum distance to use for UMAP. + - name: "--spread" + type: "double" + default: 1.2 + description: The effective scale of embedded points to use for UMAP. + - name: "--resolution" + type: "double" + default: 1.0 + description: The resolution to use for leiden clustering. + - name: "--seed" + type: "integer" + default: 123 + description: Seed. + +resources: + - type: python_script + path: script.py + +engines: + - type: docker + #image: openproblems/base_pytorch_nvidia:1 # TODO: ideally get gpu image to work + image: openproblems/base_python:1 + setup: + - type: python + packages: scikit-learn + - type: python + packages: leidenalg + __merge__: + - /src/base/setup_spatialdata_partial.yaml + - type: native + +runners: + - type: executable + - type: nextflow + directives: + label: [highmem, midcpu, midtime] \ No newline at end of file diff --git a/src/data_processors/leiden/script.py b/src/data_processors/leiden/script.py new file mode 100644 index 0000000..bb0b0dc --- /dev/null +++ b/src/data_processors/leiden/script.py @@ -0,0 +1,43 @@ + +import random +import anndata as ad +import scanpy as sc +import pandas as pd + +## VIASH START +par = { + 'input': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq_reference.h5ad', + 'output': 'resources_test/task_spatial_segmentation/mouse_brain_combined/method_prediction.h5ad', + 'label': 'cell_type', + 'n_neighbors': 20, + 'min_dist': 0.1, + 'spread': 1.2, + 'resolution': 1.0, + 'seed': 123 +} +## VIASH END + +# set seed if need be +if par["seed"]: + print(f">> Setting seed to {par['seed']}") + random.seed(par["seed"]) + +print('>> Reading input files', flush=True) +input = ad.read_h5ad(par['input']) + +print('>> 
Perform Leiden clustering', flush=True) +sc.pp.neighbors(input, n_neighbors=par['n_neighbors'], random_state=par['seed']) +sc.tl.umap(input, min_dist=par['min_dist'], spread=par['spread'], random_state=par['seed']) +sc.tl.leiden(input, resolution=par['resolution'], key_added=par["label"], random_state=par['seed']) + +print(">> Write output AnnData to file", flush=True) +output = ad.AnnData( + obs=pd.DataFrame(input.obs[par["label"]]), + uns={ + "dataset_id": input.uns["dataset_id"], + "normalization_id": input.uns["normalization_id"], + #"method_id": input.uns["method_id"], #TODO + }, +) + +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/metrics/ari/config.vsh.yaml b/src/metrics/ari/config.vsh.yaml index 0575150..c3fb1b0 100644 --- a/src/metrics/ari/config.vsh.yaml +++ b/src/metrics/ari/config.vsh.yaml @@ -42,26 +42,6 @@ arguments: type: "string" default: "leiden" description: Label to be used to perform ARI. - - name: "--n_neighbors" - type: "integer" - default: 20 - description: Number of neighbors to use for nearest neighbors distance matrix. - - name: "--min_dist" - type: "double" - default: 0.1 - description: Effective minimum distance to use for UMAP. - - name: "--spread" - type: "double" - default: 1.2 - description: The effective scale of embedded points to use for UMAP. - - name: "--resolution" - type: "double" - default: 1.0 - description: The resolution to use for leiden clustering. - - name: "--seed" - type: "integer" - default: 123 - description: Seed. # Resources required to run the component resources: diff --git a/src/metrics/ari/script.py b/src/metrics/ari/script.py index 7a09ec7..b2a547d 100644 --- a/src/metrics/ari/script.py +++ b/src/metrics/ari/script.py @@ -7,15 +7,10 @@ # Note: this section is auto-generated by viash at runtime. To edit it, make changes # in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. 
par = { - 'input_scrnaseq_reference': 'resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad', - 'input_prediction': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq_reference.h5ad', - 'output': 'score.h5ad', - 'label': 'leiden', - 'n_neighbors': 20, - 'min_dist': 0.1, - 'spread': 1.2, - 'resolution': 1.0, - 'seed': 123 + 'input_scrnaseq_reference': 'resources_test/task_spatial_segmentation/mouse_brain_combined/reference_prediction.h5ad', + 'input_prediction': 'resources_test/task_spatial_segmentation/mouse_brain_combined/method_prediction.h5ad', + 'output': 'resources_test/task_spatial_segmentation/mouse_brain_combined/score.h5ad', + 'label': 'cell_type' } meta = { 'name': 'ari' @@ -27,17 +22,7 @@ input_prediction = ad.read_h5ad(par['input_prediction']) assert (input_prediction.obs_names == input_scrnaseq_reference.obs_names).all(), "obs_names not the same in prediction and solution inputs" - -if par['label'] == 'leiden' : - print('>> Postprocessing for metric', flush=True) - sc.pp.neighbors(input_scrnaseq_reference, n_neighbors=par['n_neighbors'], random_state=par['seed']) - sc.tl.umap(input_scrnaseq_reference, min_dist=par['min_dist'], spread=par['spread'], random_state=par['seed']) - sc.tl.leiden(input_scrnaseq_reference, resolution=par['resolution'], key_added='leiden', random_state=par['seed']) - - sc.pp.neighbors(input_prediction, n_neighbors=par['n_neighbors'], random_state=par['seed']) - sc.tl.umap(input_prediction, min_dist=par['min_dist'], spread=par['spread'], random_state=par['seed']) - sc.tl.leiden(input_prediction, resolution=par['resolution'], key_added='leiden', random_state=par['seed']) - + print('>> Compute metrics', flush=True) uns_metric_ids = [ 'ari' ] uns_metric_values = adjusted_rand_score(input_scrnaseq_reference.obs[par['label']], input_prediction.obs[par['label']]) @@ -53,5 +38,3 @@ } ) output.write_h5ad(par['output'], compression='gzip') - -# %%