Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions scripts/create_resources/test_resources.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,11 @@ viash run src/methods/cellpose/config.vsh.yaml -- \
--output $DATASET_DIR/prediction.h5ad

# run one metric
# TODO: implement this!
# viash run src/metrics/ari/config.vsh.yaml -- \
# --input_prediction $DATASET_DIR/prediction.h5ad \
# --input_scrnaseq_reference $DATASET_DIR/scrnaseq_reference.h5ad \
# --output $DATASET_DIR/score.h5ad
# TODO: the input files below are placeholders — update them once the final prediction/reference outputs are produced
viash run src/metrics/ari/config.vsh.yaml -- \
--input_scrnaseq_reference $RAW_DATA/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad \
--input_prediction $DATASET_DIR/output_scrnaseq_reference.h5ad \
--output $DATASET_DIR/score.h5ad

# write manual state.yaml. this is not actually necessary but you never know it might be useful
cat > $DATASET_DIR/state.yaml << HERE
Expand Down
27 changes: 27 additions & 0 deletions src/control_methods/random_labels/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Base component API configuration
__merge__: ../../api/comp_control_method.yaml

# Component configuration
name: "random_labels"
label: Random Labels
summary: "Negative control by randomly generating labels."
description: "This method serves as a negative control, where random labels are generated for the data."
info:
  preferred_normalization: counts
  variants:
    # NOTE(review): variant key is "random_features" but the component is
    # "random_labels" — looks like a copy-paste leftover; confirm intended name.
    random_features:

# Script configuration
resources:
  - type: python_script
    path: script.py

# Platform configuration
engines:
  - type: docker
    image: openproblems/base_python:1.0.0
runners:
  - type: executable
  - type: nextflow
    directives:
      # Resource labels for the nextflow runner.
      label: [lowtime, lowmem, lowcpu]
39 changes: 39 additions & 0 deletions src/control_methods/random_labels/script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@

import anndata as ad
import random
import pandas as pd

## VIASH START
par = {
    "input": "resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq_reference.h5ad",
    "output": "resources_test/task_spatial_segmentation/mouse_brain_combined/reference_prediction.h5ad",
    "seed": 123,
    "label": "cell_type"
}
meta = {
    "name": "random_labels",
}
## VIASH END

# Negative-control method: assign a random label (1-10) to every observation
# so downstream metrics have a chance-level baseline to compare against.

# Explicit None check: a seed of 0 is falsy, so the previous truthiness test
# would silently skip seeding and make runs non-reproducible.
if par["seed"] is not None:
    print(f">> Setting seed to {par['seed']}")
    random.seed(par["seed"])

print("Load input data", flush=True)
adata = ad.read_h5ad(par["input"])

print("Create random labels", flush=True)
# One uniformly random label per observation; the column name is configurable.
adata.obs[par["label"]] = [random.randint(1, 10) for _ in range(adata.n_obs)]

print("Create output AnnData", flush=True)
# Emit only the prediction column plus the bookkeeping metadata that the
# metric components read from `uns`.
output = ad.AnnData(
    obs=pd.DataFrame(adata.obs[par["label"]]),
    uns={
        "dataset_id": adata.uns["dataset_id"],
        "normalization_id": adata.uns["normalization_id"],
        "method_id": meta["name"],
    },
)

print("Write output to file", flush=True)
output.write_h5ad(par["output"], compression="gzip")
52 changes: 52 additions & 0 deletions src/data_processors/leiden/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Base component API configuration
__merge__: ../../api/comp_data_processor.yaml

# NOTE(review): this component lives in src/data_processors/leiden but is
# named "process_dataset" — likely a copy-paste leftover; confirm the
# intended component name (duplicate names collide in a viash namespace).
name: process_dataset

# Component-specific parameters; defaults mirror the script's test values.
arguments:
  - name: "--label"
    type: "string"
    default: "cell_type"
    description: Label added to anndata for prediction.
  - name: "--n_neighbors"
    type: "integer"
    default: 20
    description: Number of neighbors to use for nearest neighbors distance matrix.
  - name: "--min_dist"
    type: "double"
    default: 0.1
    description: Effective minimum distance to use for UMAP.
  - name: "--spread"
    type: "double"
    default: 1.2
    description: The effective scale of embedded points to use for UMAP.
  - name: "--resolution"
    type: "double"
    default: 1.0
    description: The resolution to use for leiden clustering.
  - name: "--seed"
    type: "integer"
    default: 123
    description: Seed.

# Script configuration
resources:
  - type: python_script
    path: script.py

# Engine configuration
engines:
  - type: docker
    #image: openproblems/base_pytorch_nvidia:1 # TODO: ideally get gpu image to work
    image: openproblems/base_python:1
    setup:
      - type: python
        packages: scikit-learn
      - type: python
        packages: leidenalg
    __merge__:
      - /src/base/setup_spatialdata_partial.yaml
  - type: native

runners:
  - type: executable
  - type: nextflow
    directives:
      # Resource labels for the nextflow runner.
      label: [highmem, midcpu, midtime]
43 changes: 43 additions & 0 deletions src/data_processors/leiden/script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@

import random
import anndata as ad
import scanpy as sc
import pandas as pd

## VIASH START
par = {
    'input': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq_reference.h5ad',
    'output': 'resources_test/task_spatial_segmentation/mouse_brain_combined/method_prediction.h5ad',
    'label': 'cell_type',
    'n_neighbors': 20,
    'min_dist': 0.1,
    'spread': 1.2,
    'resolution': 1.0,
    'seed': 123
}
## VIASH END

# Data processor: Leiden-cluster the input and emit only the cluster labels
# plus the bookkeeping metadata needed by downstream metric components.

# Explicit None check: a seed of 0 is falsy, so the previous truthiness test
# would silently skip seeding and make runs non-reproducible.
if par["seed"] is not None:
    print(f">> Setting seed to {par['seed']}")
    random.seed(par["seed"])

print('>> Reading input files', flush=True)
adata = ad.read_h5ad(par['input'])

print('>> Perform Leiden clustering', flush=True)
# Leiden clusters on the neighbour graph built by `pp.neighbors`. The UMAP
# step does not feed into the clustering and its embedding is not exported;
# it is kept because min_dist/spread are part of the component's declared
# interface — TODO(review): confirm whether it can be dropped.
sc.pp.neighbors(adata, n_neighbors=par['n_neighbors'], random_state=par['seed'])
sc.tl.umap(adata, min_dist=par['min_dist'], spread=par['spread'], random_state=par['seed'])
sc.tl.leiden(adata, resolution=par['resolution'], key_added=par["label"], random_state=par['seed'])

print(">> Write output AnnData to file", flush=True)
# Emit only the label column plus the `uns` metadata the metrics read.
output = ad.AnnData(
    obs=pd.DataFrame(adata.obs[par["label"]]),
    uns={
        "dataset_id": adata.uns["dataset_id"],
        "normalization_id": adata.uns["normalization_id"],
        # "method_id": adata.uns["method_id"],  # TODO: set once upstream provides it
    },
)

output.write_h5ad(par['output'], compression='gzip')
47 changes: 0 additions & 47 deletions src/metrics/accuracy/script.py

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -8,26 +8,27 @@ __merge__: ../../api/comp_metric.yaml

# A unique identifier for your component (required).
# Can contain only lowercase letters or underscores.
name: accuracy
name: ari

# Metadata for your component
info:
metrics:
# A unique identifier for your metric (required).
# Can contain only lowercase letters or underscores.
- name: accuracy
- name: ari
# A relatively short label, used when rendering visualisarions (required)
label: Accuracy
label: ARI
# A one sentence summary of how this metric works (required). Used when
# rendering summary tables.
summary: "The percentage of correctly predicted labels."
summary: "Adjusted Rand index to measure the similarity between two data clusterings."
# A multi-line description of how this component works (required). Used
# when rendering reference documentation.
description: |
The percentage of correctly predicted labels.
The Rand index is the accuracy of determining if a link belongs within a cluster or not.
The Rand index has a value between 0 and 1, with 0 indicating that the two data clusterings do not agree on any pair of points and 1 indicating that the data clusterings are exactly the same.
# A reference key from the bibtex library at src/common/library.bib (required).
references:
doi: 10.48550/arXiv.2008.05756
doi: 10.1080/01621459.1971.10482356
# The minimum possible value for this metric (required)
min: 0
# The maximum possible value for this metric (required)
Expand All @@ -36,11 +37,11 @@ info:
maximize: true

# Component-specific parameters (optional)
# arguments:
# - name: "--n_neighbors"
# type: "integer"
# default: 5
# description: Number of neighbors to use.
arguments:
- name: "--label"
type: "string"
default: "leiden"
description: Label to be used to perform ARI.

# Resources required to run the component
resources:
Expand All @@ -60,6 +61,8 @@ engines:
setup:
- type: python
packages: scikit-learn
- type: python
packages: leidenalg

runners:
# This platform allows running the component natively
Expand Down
40 changes: 40 additions & 0 deletions src/metrics/ari/script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@

import anndata as ad
from sklearn.metrics import adjusted_rand_score

## VIASH START
# Note: this section is auto-generated by viash at runtime. To edit it, make changes
# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
par = {
    'input_scrnaseq_reference': 'resources_test/task_spatial_segmentation/mouse_brain_combined/reference_prediction.h5ad',
    'input_prediction': 'resources_test/task_spatial_segmentation/mouse_brain_combined/method_prediction.h5ad',
    'output': 'resources_test/task_spatial_segmentation/mouse_brain_combined/score.h5ad',
    'label': 'cell_type'
}
meta = {
    'name': 'ari'
}
## VIASH END

# Metric component: compare the predicted labeling against the reference
# labeling with the Adjusted Rand Index and write a score AnnData.

print('>> Reading input files', flush=True)
input_scrnaseq_reference = ad.read_h5ad(par['input_scrnaseq_reference'])
input_prediction = ad.read_h5ad(par['input_prediction'])

# Observations must be aligned one-to-one before the labelings are compared.
assert (input_prediction.obs_names == input_scrnaseq_reference.obs_names).all(), "obs_names not the same in prediction and solution inputs"

print('>> Compute metrics', flush=True)
# Keep metric_ids and metric_values as parallel lists (one entry per metric);
# previously metric_values was a bare scalar while metric_ids was a list.
uns_metric_ids = ['ari']
uns_metric_values = [
    adjusted_rand_score(
        input_scrnaseq_reference.obs[par['label']],
        input_prediction.obs[par['label']],
    )
]

print(">> Write output AnnData to file", flush=True)
output = ad.AnnData(
    uns={
        'dataset_id': input_prediction.uns['dataset_id'],
        'normalization_id': input_prediction.uns['normalization_id'],
        # 'method_id': input_prediction.uns['method_id'],  # TODO: enable once producers set it
        'metric_ids': uns_metric_ids,
        'metric_values': uns_metric_values
    }
)
output.write_h5ad(par['output'], compression='gzip')
Loading