diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh index b11d437..6bd985c 100755 --- a/scripts/create_resources/test_resources.sh +++ b/scripts/create_resources/test_resources.sh @@ -40,11 +40,11 @@ viash run src/methods/cellpose/config.vsh.yaml -- \ --output $DATASET_DIR/prediction.h5ad # run one metric -# TODO: implement this! -# viash run src/metrics/ari/config.vsh.yaml -- \ -# --input_prediction $DATASET_DIR/prediction.h5ad \ -# --input_scrnaseq_reference $DATASET_DIR/scrnaseq_reference.h5ad \ -# --output $DATASET_DIR/score.h5ad +# TODO files need to be changed +viash run src/metrics/ari/config.vsh.yaml -- \ + --input_scrnaseq_reference $RAW_DATA/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad \ + --input_prediction $DATASET_DIR/output_scrnaseq_reference.h5ad \ + --output $DATASET_DIR/score.h5ad # write manual state.yaml. this is not actually necessary but you never know it might be useful cat > $DATASET_DIR/state.yaml << HERE diff --git a/src/control_methods/random_labels/config.vsh.yaml b/src/control_methods/random_labels/config.vsh.yaml new file mode 100644 index 0000000..b7386ef --- /dev/null +++ b/src/control_methods/random_labels/config.vsh.yaml @@ -0,0 +1,27 @@ +# Base component API configuration +__merge__: ../../api/comp_control_method.yaml + +# Component configuration +name: "random_labels" +label: Random Labels +summary: "Negative control by randomly generating labels." +description: "This method serves as a negative control, where random labels are generated for the data." 
+info: + preferred_normalization: counts + variants: + random_features: + +# Script configuration +resources: + - type: python_script + path: script.py + +# Platform configuration +engines: + - type: docker + image: openproblems/base_python:1.0.0 +runners: + - type: executable + - type: nextflow + directives: + label: [lowtime, lowmem, lowcpu] \ No newline at end of file diff --git a/src/control_methods/random_labels/script.py b/src/control_methods/random_labels/script.py new file mode 100644 index 0000000..5e091f5 --- /dev/null +++ b/src/control_methods/random_labels/script.py @@ -0,0 +1,39 @@ + +import anndata as ad +import random +import pandas as pd + +## VIASH START +par = { + "input": "resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq_reference.h5ad", + "output": "resources_test/task_spatial_segmentation/mouse_brain_combined/reference_prediction.h5ad", + "seed": 123, + "label": "cell_type" +} +meta = { + "name": "random_labels", +} +## VIASH END + +if par["seed"]: + print(f">> Setting seed to {par['seed']}") + random.seed(par["seed"]) + +print("Load input data", flush=True) +input = ad.read_h5ad(par["input"]) + +print("Create random labels", flush=True) +input.obs[par["label"]] = [random.randint(1, 10) for _ in range(input.n_obs)] + +print("Create output AnnData", flush=True) +output = ad.AnnData( + obs=pd.DataFrame(input.obs[par["label"]]), + uns={ + "dataset_id": input.uns["dataset_id"], + "normalization_id": input.uns["normalization_id"], + "method_id": meta["name"], + }, +) + +print("Write output to file", flush=True) +output.write_h5ad(par["output"], compression="gzip") diff --git a/src/data_processors/leiden/config.vsh.yaml b/src/data_processors/leiden/config.vsh.yaml new file mode 100644 index 0000000..dcced68 --- /dev/null +++ b/src/data_processors/leiden/config.vsh.yaml @@ -0,0 +1,52 @@ +__merge__: ../../api/comp_data_processor.yaml + +name: process_dataset + +arguments: + - name: "--label" + type: "string" + default: 
"cell_type" + description: Label added to anndata for prediction. + - name: "--n_neighbors" + type: "integer" + default: 20 + description: Number of neighbors to use for nearest neighbors distance matrix. + - name: "--min_dist" + type: "double" + default: 0.1 + description: Effective minimum distance to use for UMAP. + - name: "--spread" + type: "double" + default: 1.2 + description: The effective scale of embedded points to use for UMAP. + - name: "--resolution" + type: "double" + default: 1.0 + description: The resolution to use for leiden clustering. + - name: "--seed" + type: "integer" + default: 123 + description: Seed. + +resources: + - type: python_script + path: script.py + +engines: + - type: docker + #image: openproblems/base_pytorch_nvidia:1 # TODO: ideally get gpu image to work + image: openproblems/base_python:1 + setup: + - type: python + packages: scikit-learn + - type: python + packages: leidenalg + __merge__: + - /src/base/setup_spatialdata_partial.yaml + - type: native + +runners: + - type: executable + - type: nextflow + directives: + label: [highmem, midcpu, midtime] \ No newline at end of file diff --git a/src/data_processors/leiden/script.py b/src/data_processors/leiden/script.py new file mode 100644 index 0000000..bb0b0dc --- /dev/null +++ b/src/data_processors/leiden/script.py @@ -0,0 +1,43 @@ + +import random +import anndata as ad +import scanpy as sc +import pandas as pd + +## VIASH START +par = { + 'input': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq_reference.h5ad', + 'output': 'resources_test/task_spatial_segmentation/mouse_brain_combined/method_prediction.h5ad', + 'label': 'cell_type', + 'n_neighbors': 20, + 'min_dist': 0.1, + 'spread': 1.2, + 'resolution': 1.0, + 'seed': 123 +} +## VIASH END + +# set seed if need be +if par["seed"]: + print(f">> Setting seed to {par['seed']}") + random.seed(par["seed"]) + +print('>> Reading input files', flush=True) +input = ad.read_h5ad(par['input']) + +print('>> 
Perform Leiden clustering', flush=True) +sc.pp.neighbors(input, n_neighbors=par['n_neighbors'], random_state=par['seed']) +sc.tl.umap(input, min_dist=par['min_dist'], spread=par['spread'], random_state=par['seed']) +sc.tl.leiden(input, resolution=par['resolution'], key_added=par["label"], random_state=par['seed']) + +print(">> Write output AnnData to file", flush=True) +output = ad.AnnData( + obs=pd.DataFrame(input.obs[par["label"]]), + uns={ + "dataset_id": input.uns["dataset_id"], + "normalization_id": input.uns["normalization_id"], + #"method_id": input.uns["method_id"], #TODO + }, +) + +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/metrics/accuracy/script.py b/src/metrics/accuracy/script.py deleted file mode 100644 index 054e809..0000000 --- a/src/metrics/accuracy/script.py +++ /dev/null @@ -1,47 +0,0 @@ -import anndata as ad -import numpy as np -import sklearn.preprocessing - -## VIASH START -# Note: this section is auto-generated by viash at runtime. To edit it, make changes -# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. 
-par = { - 'input_solution': 'resources_test/task_template/cxg_mouse_pancreas_atlas/solution.h5ad', - 'input_prediction': 'resources_test/task_template/cxg_mouse_pancreas_atlas/prediction.h5ad', - 'output': 'output.h5ad' -} -meta = { - 'name': 'accuracy' -} -## VIASH END - -print('Reading input files', flush=True) -input_solution = ad.read_h5ad(par['input_solution']) -input_prediction = ad.read_h5ad(par['input_prediction']) - -assert (input_prediction.obs_names == input_solution.obs_names).all(), "obs_names not the same in prediction and solution inputs" - -print("Encode labels", flush=True) -cats = list(input_solution.obs["label"].dtype.categories) + list(input_prediction.obs["label_pred"].dtype.categories) -encoder = sklearn.preprocessing.LabelEncoder().fit(cats) -input_solution.obs["label"] = encoder.transform(input_solution.obs["label"]) -input_prediction.obs["label_pred"] = encoder.transform(input_prediction.obs["label_pred"]) - - -print('Compute metrics', flush=True) -# metric_ids and metric_values can have length > 1 -# but should be of equal length -uns_metric_ids = [ 'accuracy' ] -uns_metric_values = np.mean(input_solution.obs["label"] == input_prediction.obs["label_pred"]) - -print("Write output AnnData to file", flush=True) -output = ad.AnnData( - uns={ - 'dataset_id': input_prediction.uns['dataset_id'], - 'normalization_id': input_prediction.uns['normalization_id'], - 'method_id': input_prediction.uns['method_id'], - 'metric_ids': uns_metric_ids, - 'metric_values': uns_metric_values - } -) -output.write_h5ad(par['output'], compression='gzip') diff --git a/src/metrics/accuracy/config.vsh.yaml b/src/metrics/ari/config.vsh.yaml similarity index 75% rename from src/metrics/accuracy/config.vsh.yaml rename to src/metrics/ari/config.vsh.yaml index ac197bd..c3fb1b0 100644 --- a/src/metrics/accuracy/config.vsh.yaml +++ b/src/metrics/ari/config.vsh.yaml @@ -8,26 +8,27 @@ __merge__: ../../api/comp_metric.yaml # A unique identifier for your component (required). 
# Can contain only lowercase letters or underscores. -name: accuracy +name: ari # Metadata for your component info: metrics: # A unique identifier for your metric (required). # Can contain only lowercase letters or underscores. - - name: accuracy + - name: ari # A relatively short label, used when rendering visualisarions (required) - label: Accuracy + label: ARI # A one sentence summary of how this metric works (required). Used when # rendering summary tables. - summary: "The percentage of correctly predicted labels." + summary: "Adjusted Rand index to measure the similarity between two data clusterings." # A multi-line description of how this component works (required). Used # when rendering reference documentation. description: | - The percentage of correctly predicted labels. + The adjusted Rand index (ARI) is the Rand index corrected for chance agreement between two clusterings. + A value of 1 indicates that the two clusterings are identical, while a value close to 0 indicates that their agreement is no better than expected from random labelling. # A reference key from the bibtex library at src/common/library.bib (required). references: - doi: 10.48550/arXiv.2008.05756 + doi: 10.1080/01621459.1971.10482356 # The minimum possible value for this metric (required) min: 0 # The maximum possible value for this metric (required) @@ -36,11 +37,11 @@ info: maximize: true # Component-specific parameters (optional) -# arguments: -# - name: "--n_neighbors" -# type: "integer" -# default: 5 -# description: Number of neighbors to use. +arguments: + - name: "--label" + type: "string" + default: "leiden" + description: The obs column containing the cluster labels used to compute the ARI. 
# Resources required to run the component resources: @@ -60,6 +61,8 @@ engines: setup: - type: python packages: scikit-learn + - type: python + packages: leidenalg runners: # This platform allows running the component natively diff --git a/src/metrics/ari/script.py b/src/metrics/ari/script.py new file mode 100644 index 0000000..b2a547d --- /dev/null +++ b/src/metrics/ari/script.py @@ -0,0 +1,40 @@ + +import anndata as ad +import scanpy as sc +from sklearn.metrics import adjusted_rand_score + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. +par = { + 'input_scrnaseq_reference': 'resources_test/task_spatial_segmentation/mouse_brain_combined/reference_prediction.h5ad', + 'input_prediction': 'resources_test/task_spatial_segmentation/mouse_brain_combined/method_prediction.h5ad', + 'output': 'resources_test/task_spatial_segmentation/mouse_brain_combined/score.h5ad', + 'label': 'cell_type' +} +meta = { + 'name': 'ari' +} +## VIASH END + +print('>> Reading input files', flush=True) +input_scrnaseq_reference = ad.read_h5ad(par['input_scrnaseq_reference']) +input_prediction = ad.read_h5ad(par['input_prediction']) + +assert (input_prediction.obs_names == input_scrnaseq_reference.obs_names).all(), "obs_names not the same in prediction and solution inputs" + +print('>> Compute metrics', flush=True) +uns_metric_ids = [ 'ari' ] +uns_metric_values = adjusted_rand_score(input_scrnaseq_reference.obs[par['label']], input_prediction.obs[par['label']]) + +print(">> Write output AnnData to file", flush=True) +output = ad.AnnData( + uns={ + 'dataset_id': input_prediction.uns['dataset_id'], + 'normalization_id': input_prediction.uns['normalization_id'], + # 'method_id': input_prediction.uns['method_id'], #TODO + 'metric_ids': uns_metric_ids, + 'metric_values': uns_metric_values + } +) +output.write_h5ad(par['output'], compression='gzip')