diff --git a/examples/0.download_example_dataset.ipynb b/examples/0.download_example_dataset.ipynb new file mode 100644 index 0000000..e213919 --- /dev/null +++ b/examples/0.download_example_dataset.ipynb @@ -0,0 +1,1705 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b013fcef", + "metadata": {}, + "source": [ + "# Download JUMP pilot plate data from AWS S3 bucket for example training" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6c8de60f", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "from urllib.parse import urlparse\n", + "\n", + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "from virtual_stain_flow.datasets.example.cpjump1_manifest import get_manifest\n", + "from virtual_stain_flow.datasets.example.arrange_as_wide import arrange_manifest_channels" + ] + }, + { + "cell_type": "markdown", + "id": "c879dacc", + "metadata": {}, + "source": [ + "## Pathing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3baf91c", + "metadata": {}, + "outputs": [], + "source": [ + "DATA_DOWNLOAD_DIR = Path(\"/PATH/TO/WHERE/YOU/WANT/TO/DOWNLOAD/CPJUMP1\")\n", + "DATA_DOWNLOAD_DIR.mkdir(exist_ok=True, parents=True)" + ] + }, + { + "cell_type": "markdown", + "id": "63c8d005", + "metadata": {}, + "source": [ + "## S3 download helpers" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "6d789a28", + "metadata": {}, + "outputs": [], + "source": [ + "def _parse_s3_url(url):\n", + " parsed = urlparse(url)\n", + " if parsed.scheme != \"s3\":\n", + " raise ValueError(f\"Expected s3:// URL, got: {url}\")\n", + " return parsed.netloc, parsed.path.lstrip(\"/\")\n", + "\n", + "def download_wide_manifest_channels(\n", + " wide_manifest,\n", + " dest_dir,\n", + " channel_columns=None,\n", + " overwrite=False,\n", + "):\n", + " \"\"\"\n", + " Download S3 TIFFs for each channel and write a local file_index.csv with paths.\n", + " 
\"\"\"\n", + " if channel_columns is None:\n", + " channel_columns = [\"LZ_BF\", \"BF\", \"HZ_BF\", \"DNA\", \"Mito\", \"AGP\", \"ER\", \"RNA\"]\n", + " dest_dir = Path(dest_dir)\n", + " dest_dir.mkdir(parents=True, exist_ok=True)\n", + " try:\n", + " import boto3\n", + " from botocore import UNSIGNED\n", + " from botocore.config import Config\n", + " except ImportError as exc:\n", + " raise ImportError(\n", + " \"boto3 is required for S3 downloads. Install with: pip install boto3\"\n", + " ) from exc\n", + " s3 = boto3.client(\"s3\", config=Config(signature_version=UNSIGNED))\n", + " local_rows = []\n", + " for row_idx, row in wide_manifest.iterrows():\n", + " prefix_parts = []\n", + " for key in [\"Metadata_Plate\", \"Metadata_Well\", \"Metadata_Site\"]:\n", + " if key in wide_manifest.columns:\n", + " prefix_parts.append(str(row[key]))\n", + " prefix = \"_\".join(prefix_parts) if prefix_parts else f\"row_{row_idx}\"\n", + " local_row = {}\n", + " for channel in channel_columns:\n", + " url = row[channel] if channel in wide_manifest.columns else None\n", + " if pd.isna(url):\n", + " local_row[channel] = None\n", + " continue\n", + " bucket, key = _parse_s3_url(url)\n", + " suffix = Path(key).suffix or \".tif\"\n", + " local_path = dest_dir / f\"{prefix}_{channel}{suffix}\"\n", + " if overwrite or not local_path.exists():\n", + " s3.download_file(bucket, key, str(local_path))\n", + " local_row[channel] = str(local_path)\n", + " local_rows.append(local_row)\n", + " file_index = pd.DataFrame(local_rows, columns=channel_columns)\n", + " file_index.to_csv(dest_dir / \"file_index.csv\", index=False)\n", + " return file_index\n" + ] + }, + { + "cell_type": "markdown", + "id": "8e5e7baf", + "metadata": {}, + "source": [ + "## Retrieve compound manifest" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "3452cbb4", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": 
"index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Metadata_Well", + "rawType": "object", + "type": "string" + }, + { + "name": "broad_sample", + "rawType": "object", + "type": "string" + }, + { + "name": "solvent", + "rawType": "object", + "type": "string" + }, + { + "name": "InChIKey", + "rawType": "object", + "type": "string" + }, + { + "name": "pert_iname", + "rawType": "object", + "type": "string" + }, + { + "name": "pubchem_cid", + "rawType": "float64", + "type": "float" + }, + { + "name": "target", + "rawType": "object", + "type": "string" + }, + { + "name": "target_list", + "rawType": "object", + "type": "string" + }, + { + "name": "pert_type", + "rawType": "object", + "type": "string" + }, + { + "name": "control_type", + "rawType": "object", + "type": "unknown" + }, + { + "name": "smiles", + "rawType": "object", + "type": "string" + }, + { + "name": "Batch", + "rawType": "object", + "type": "string" + }, + { + "name": "Plate_Map_Name", + "rawType": "object", + "type": "string" + }, + { + "name": "Metadata_Plate", + "rawType": "object", + "type": "string" + }, + { + "name": "Perturbation", + "rawType": "object", + "type": "string" + }, + { + "name": "Cell_type", + "rawType": "object", + "type": "string" + }, + { + "name": "Time", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Density", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Antibiotics", + "rawType": "object", + "type": "string" + }, + { + "name": "Cell_line", + "rawType": "object", + "type": "string" + }, + { + "name": "Time_delay", + "rawType": "object", + "type": "string" + }, + { + "name": "Times_imaged", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Anomaly", + "rawType": "object", + "type": "string" + }, + { + "name": "Number_of_images", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Metadata_Site", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Metadata_ChannelName", + "rawType": "object", + 
"type": "string" + }, + { + "name": "Metadata_PlaneID", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Metadata_PositionZ", + "rawType": "float64", + "type": "float" + }, + { + "name": "Metadata_FileUrl", + "rawType": "object", + "type": "string" + }, + { + "name": "Metadata_Filename", + "rawType": "object", + "type": "string" + } + ], + "ref": "6934d718-fb51-496b-99ce-57f0607bb381", + "rows": [ + [ + "0", + "A01", + "BRD-A86665761-001-01-1", + "DMSO", + "TZDUHAJSIBHXDL-UHFFFAOYSA-N", + "gabapentin-enacarbil", + "9883933.0", + "CACNB4", + "CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1F|CACNA1G|CACNA1H|CACNA1I|CACNA1S|CACNA2D1|CACNA2D2|CACNA2D3|CACNA2D4|CACNB1|CACNB2|CACNB3|CACNB4|CACNG1|CACNG2|CACNG3|CACNG4|CACNG5|CACNG6|CACNG7|CACNG8", + "trt", + null, + "CC(C)C(=O)OC(C)OC(=O)NCC1(CC(O)=O)CCCCC1", + "2020_11_04_CPJUMP1", + "JUMP-Target-1_compound_platemap", + "BR00116991", + "compound", + "A549", + "24", + "100", + "absent", + "Parental", + "Day0", + "1", + "WGA", + "27648", + "1", + "DNA", + "1", + "-2e-06", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116991__2020-11-05T19_51_35-Measurement1/Images/r01c01f01p01-ch5sk1fk1fl1.tiff", + "r01c01f01p01-ch5sk1fk1fl1.tiff" + ], + [ + "1", + "A01", + "BRD-A86665761-001-01-1", + "DMSO", + "TZDUHAJSIBHXDL-UHFFFAOYSA-N", + "gabapentin-enacarbil", + "9883933.0", + "CACNB4", + "CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1F|CACNA1G|CACNA1H|CACNA1I|CACNA1S|CACNA2D1|CACNA2D2|CACNA2D3|CACNA2D4|CACNB1|CACNB2|CACNB3|CACNB4|CACNG1|CACNG2|CACNG3|CACNG4|CACNG5|CACNG6|CACNG7|CACNG8", + "trt", + null, + "CC(C)C(=O)OC(C)OC(=O)NCC1(CC(O)=O)CCCCC1", + "2020_11_04_CPJUMP1", + "JUMP-Target-1_compound_platemap", + "BR00116991", + "compound", + "A549", + "24", + "100", + "absent", + "Parental", + "Day0", + "1", + "WGA", + "27648", + "2", + "DNA", + "1", + "-2e-06", + 
"s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116991__2020-11-05T19_51_35-Measurement1/Images/r01c01f02p01-ch5sk1fk1fl1.tiff", + "r01c01f02p01-ch5sk1fk1fl1.tiff" + ], + [ + "2", + "A01", + "BRD-A86665761-001-01-1", + "DMSO", + "TZDUHAJSIBHXDL-UHFFFAOYSA-N", + "gabapentin-enacarbil", + "9883933.0", + "CACNB4", + "CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1F|CACNA1G|CACNA1H|CACNA1I|CACNA1S|CACNA2D1|CACNA2D2|CACNA2D3|CACNA2D4|CACNB1|CACNB2|CACNB3|CACNB4|CACNG1|CACNG2|CACNG3|CACNG4|CACNG5|CACNG6|CACNG7|CACNG8", + "trt", + null, + "CC(C)C(=O)OC(C)OC(=O)NCC1(CC(O)=O)CCCCC1", + "2020_11_04_CPJUMP1", + "JUMP-Target-1_compound_platemap", + "BR00116991", + "compound", + "A549", + "24", + "100", + "absent", + "Parental", + "Day0", + "1", + "WGA", + "27648", + "3", + "DNA", + "1", + "-2e-06", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116991__2020-11-05T19_51_35-Measurement1/Images/r01c01f03p01-ch5sk1fk1fl1.tiff", + "r01c01f03p01-ch5sk1fk1fl1.tiff" + ], + [ + "3", + "A01", + "BRD-A86665761-001-01-1", + "DMSO", + "TZDUHAJSIBHXDL-UHFFFAOYSA-N", + "gabapentin-enacarbil", + "9883933.0", + "CACNB4", + "CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1F|CACNA1G|CACNA1H|CACNA1I|CACNA1S|CACNA2D1|CACNA2D2|CACNA2D3|CACNA2D4|CACNB1|CACNB2|CACNB3|CACNB4|CACNG1|CACNG2|CACNG3|CACNG4|CACNG5|CACNG6|CACNG7|CACNG8", + "trt", + null, + "CC(C)C(=O)OC(C)OC(=O)NCC1(CC(O)=O)CCCCC1", + "2020_11_04_CPJUMP1", + "JUMP-Target-1_compound_platemap", + "BR00116991", + "compound", + "A549", + "24", + "100", + "absent", + "Parental", + "Day0", + "1", + "WGA", + "27648", + "4", + "DNA", + "1", + "-2e-06", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116991__2020-11-05T19_51_35-Measurement1/Images/r01c01f04p01-ch5sk1fk1fl1.tiff", + "r01c01f04p01-ch5sk1fk1fl1.tiff" + ], + [ + "4", + "A01", + "BRD-A86665761-001-01-1", + "DMSO", + "TZDUHAJSIBHXDL-UHFFFAOYSA-N", + 
"gabapentin-enacarbil", + "9883933.0", + "CACNB4", + "CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1F|CACNA1G|CACNA1H|CACNA1I|CACNA1S|CACNA2D1|CACNA2D2|CACNA2D3|CACNA2D4|CACNB1|CACNB2|CACNB3|CACNB4|CACNG1|CACNG2|CACNG3|CACNG4|CACNG5|CACNG6|CACNG7|CACNG8", + "trt", + null, + "CC(C)C(=O)OC(C)OC(=O)NCC1(CC(O)=O)CCCCC1", + "2020_11_04_CPJUMP1", + "JUMP-Target-1_compound_platemap", + "BR00116991", + "compound", + "A549", + "24", + "100", + "absent", + "Parental", + "Day0", + "1", + "WGA", + "27648", + "5", + "DNA", + "1", + "-2e-06", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00116991__2020-11-05T19_51_35-Measurement1/Images/r01c01f05p01-ch5sk1fk1fl1.tiff", + "r01c01f05p01-ch5sk1fk1fl1.tiff" + ] + ], + "shape": { + "columns": 30, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Metadata_Wellbroad_samplesolventInChIKeypert_inamepubchem_cidtargettarget_listpert_typecontrol_type...Time_delayTimes_imagedAnomalyNumber_of_imagesMetadata_SiteMetadata_ChannelNameMetadata_PlaneIDMetadata_PositionZMetadata_FileUrlMetadata_Filename
0A01BRD-A86665761-001-01-1DMSOTZDUHAJSIBHXDL-UHFFFAOYSA-Ngabapentin-enacarbil9883933.0CACNB4CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1...trtNaN...Day01WGA276481DNA1-0.000002s3://cellpainting-gallery/cpg0000-jump-pilot/s...r01c01f01p01-ch5sk1fk1fl1.tiff
1A01BRD-A86665761-001-01-1DMSOTZDUHAJSIBHXDL-UHFFFAOYSA-Ngabapentin-enacarbil9883933.0CACNB4CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1...trtNaN...Day01WGA276482DNA1-0.000002s3://cellpainting-gallery/cpg0000-jump-pilot/s...r01c01f02p01-ch5sk1fk1fl1.tiff
2A01BRD-A86665761-001-01-1DMSOTZDUHAJSIBHXDL-UHFFFAOYSA-Ngabapentin-enacarbil9883933.0CACNB4CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1...trtNaN...Day01WGA276483DNA1-0.000002s3://cellpainting-gallery/cpg0000-jump-pilot/s...r01c01f03p01-ch5sk1fk1fl1.tiff
3A01BRD-A86665761-001-01-1DMSOTZDUHAJSIBHXDL-UHFFFAOYSA-Ngabapentin-enacarbil9883933.0CACNB4CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1...trtNaN...Day01WGA276484DNA1-0.000002s3://cellpainting-gallery/cpg0000-jump-pilot/s...r01c01f04p01-ch5sk1fk1fl1.tiff
4A01BRD-A86665761-001-01-1DMSOTZDUHAJSIBHXDL-UHFFFAOYSA-Ngabapentin-enacarbil9883933.0CACNB4CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1...trtNaN...Day01WGA276485DNA1-0.000002s3://cellpainting-gallery/cpg0000-jump-pilot/s...r01c01f05p01-ch5sk1fk1fl1.tiff
\n", + "

5 rows × 30 columns

\n", + "
" + ], + "text/plain": [ + " Metadata_Well broad_sample solvent InChIKey \\\n", + "0 A01 BRD-A86665761-001-01-1 DMSO TZDUHAJSIBHXDL-UHFFFAOYSA-N \n", + "1 A01 BRD-A86665761-001-01-1 DMSO TZDUHAJSIBHXDL-UHFFFAOYSA-N \n", + "2 A01 BRD-A86665761-001-01-1 DMSO TZDUHAJSIBHXDL-UHFFFAOYSA-N \n", + "3 A01 BRD-A86665761-001-01-1 DMSO TZDUHAJSIBHXDL-UHFFFAOYSA-N \n", + "4 A01 BRD-A86665761-001-01-1 DMSO TZDUHAJSIBHXDL-UHFFFAOYSA-N \n", + "\n", + " pert_iname pubchem_cid target \\\n", + "0 gabapentin-enacarbil 9883933.0 CACNB4 \n", + "1 gabapentin-enacarbil 9883933.0 CACNB4 \n", + "2 gabapentin-enacarbil 9883933.0 CACNB4 \n", + "3 gabapentin-enacarbil 9883933.0 CACNB4 \n", + "4 gabapentin-enacarbil 9883933.0 CACNB4 \n", + "\n", + " target_list pert_type control_type \\\n", + "0 CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1... trt NaN \n", + "1 CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1... trt NaN \n", + "2 CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1... trt NaN \n", + "3 CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1... trt NaN \n", + "4 CACNA1A|CACNA1B|CACNA1C|CACNA1D|CACNA1E|CACNA1... trt NaN \n", + "\n", + " ... Time_delay Times_imaged Anomaly Number_of_images Metadata_Site \\\n", + "0 ... Day0 1 WGA 27648 1 \n", + "1 ... Day0 1 WGA 27648 2 \n", + "2 ... Day0 1 WGA 27648 3 \n", + "3 ... Day0 1 WGA 27648 4 \n", + "4 ... Day0 1 WGA 27648 5 \n", + "\n", + " Metadata_ChannelName Metadata_PlaneID Metadata_PositionZ \\\n", + "0 DNA 1 -0.000002 \n", + "1 DNA 1 -0.000002 \n", + "2 DNA 1 -0.000002 \n", + "3 DNA 1 -0.000002 \n", + "4 DNA 1 -0.000002 \n", + "\n", + " Metadata_FileUrl \\\n", + "0 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "1 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "2 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "3 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "4 s3://cellpainting-gallery/cpg0000-jump-pilot/s... 
\n", + "\n", + " Metadata_Filename \n", + "0 r01c01f01p01-ch5sk1fk1fl1.tiff \n", + "1 r01c01f02p01-ch5sk1fk1fl1.tiff \n", + "2 r01c01f03p01-ch5sk1fk1fl1.tiff \n", + "3 r01c01f04p01-ch5sk1fk1fl1.tiff \n", + "4 r01c01f05p01-ch5sk1fk1fl1.tiff \n", + "\n", + "[5 rows x 30 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "MANIFEST = get_manifest()\n", + "MANIFEST.head()" + ] + }, + { + "cell_type": "markdown", + "id": "c62e8252", + "metadata": {}, + "source": [ + "## Filter manifest\n", + "For the sake of demoing training, we restrict the manifest to negative-control (`negcon`) A549 cells at timepoint 48 with no recorded anomaly" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "82d77177", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Metadata_Well", + "rawType": "object", + "type": "string" + }, + { + "name": "broad_sample", + "rawType": "object", + "type": "unknown" + }, + { + "name": "solvent", + "rawType": "object", + "type": "string" + }, + { + "name": "InChIKey", + "rawType": "object", + "type": "string" + }, + { + "name": "pert_iname", + "rawType": "object", + "type": "string" + }, + { + "name": "pubchem_cid", + "rawType": "float64", + "type": "float" + }, + { + "name": "target", + "rawType": "object", + "type": "unknown" + }, + { + "name": "target_list", + "rawType": "object", + "type": "unknown" + }, + { + "name": "pert_type", + "rawType": "object", + "type": "string" + }, + { + "name": "control_type", + "rawType": "object", + "type": "string" + }, + { + "name": "smiles", + "rawType": "object", + "type": "string" + }, + { + "name": "Batch", + "rawType": "object", + "type": "string" + }, + { + "name": "Plate_Map_Name", + "rawType": "object", + "type": "string" + }, + { + "name": "Metadata_Plate", + "rawType": "object", + "type": "string" + }, + { + 
"name": "Perturbation", + "rawType": "object", + "type": "string" + }, + { + "name": "Cell_type", + "rawType": "object", + "type": "string" + }, + { + "name": "Time", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Density", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Antibiotics", + "rawType": "object", + "type": "string" + }, + { + "name": "Cell_line", + "rawType": "object", + "type": "string" + }, + { + "name": "Time_delay", + "rawType": "object", + "type": "string" + }, + { + "name": "Times_imaged", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Anomaly", + "rawType": "object", + "type": "string" + }, + { + "name": "Number_of_images", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Metadata_Site", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Metadata_ChannelName", + "rawType": "object", + "type": "string" + }, + { + "name": "Metadata_PlaneID", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Metadata_PositionZ", + "rawType": "float64", + "type": "float" + }, + { + "name": "Metadata_FileUrl", + "rawType": "object", + "type": "string" + }, + { + "name": "Metadata_Filename", + "rawType": "object", + "type": "string" + } + ], + "ref": "c4146cc9-71a6-40cc-ab80-c53a50b5f1e3", + "rows": [ + [ + "2312", + "A02", + null, + "DMSO", + "IAZDPXIOMUYVGZ-UHFFFAOYSA-N", + "DMSO", + "679.0", + null, + null, + "control", + "negcon", + "CS(=O)C", + "2020_11_04_CPJUMP1", + "JUMP-Target-1_compound_platemap", + "BR00117008", + "compound", + "A549", + "48", + "80", + "absent", + "Parental", + "Day0", + "1", + "none", + "27648", + "1", + "DNA", + "1", + "-2e-06", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f01p01-ch5sk1fk1fl1.tiff", + "r01c02f01p01-ch5sk1fk1fl1.tiff" + ], + [ + "2313", + "A02", + null, + "DMSO", + "IAZDPXIOMUYVGZ-UHFFFAOYSA-N", + "DMSO", + "679.0", + null, + null, + "control", 
+ "negcon", + "CS(=O)C", + "2020_11_04_CPJUMP1", + "JUMP-Target-1_compound_platemap", + "BR00117008", + "compound", + "A549", + "48", + "80", + "absent", + "Parental", + "Day0", + "1", + "none", + "27648", + "2", + "DNA", + "1", + "-2e-06", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f02p01-ch5sk1fk1fl1.tiff", + "r01c02f02p01-ch5sk1fk1fl1.tiff" + ], + [ + "2314", + "A02", + null, + "DMSO", + "IAZDPXIOMUYVGZ-UHFFFAOYSA-N", + "DMSO", + "679.0", + null, + null, + "control", + "negcon", + "CS(=O)C", + "2020_11_04_CPJUMP1", + "JUMP-Target-1_compound_platemap", + "BR00117008", + "compound", + "A549", + "48", + "80", + "absent", + "Parental", + "Day0", + "1", + "none", + "27648", + "3", + "DNA", + "1", + "-2e-06", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f03p01-ch5sk1fk1fl1.tiff", + "r01c02f03p01-ch5sk1fk1fl1.tiff" + ], + [ + "2315", + "A02", + null, + "DMSO", + "IAZDPXIOMUYVGZ-UHFFFAOYSA-N", + "DMSO", + "679.0", + null, + null, + "control", + "negcon", + "CS(=O)C", + "2020_11_04_CPJUMP1", + "JUMP-Target-1_compound_platemap", + "BR00117008", + "compound", + "A549", + "48", + "80", + "absent", + "Parental", + "Day0", + "1", + "none", + "27648", + "4", + "DNA", + "1", + "-2e-06", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f04p01-ch5sk1fk1fl1.tiff", + "r01c02f04p01-ch5sk1fk1fl1.tiff" + ], + [ + "2316", + "A02", + null, + "DMSO", + "IAZDPXIOMUYVGZ-UHFFFAOYSA-N", + "DMSO", + "679.0", + null, + null, + "control", + "negcon", + "CS(=O)C", + "2020_11_04_CPJUMP1", + "JUMP-Target-1_compound_platemap", + "BR00117008", + "compound", + "A549", + "48", + "80", + "absent", + "Parental", + "Day0", + "1", + "none", + "27648", + "5", + "DNA", + "1", + "-2e-06", + 
"s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f05p01-ch5sk1fk1fl1.tiff", + "r01c02f05p01-ch5sk1fk1fl1.tiff" + ] + ], + "shape": { + "columns": 30, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Metadata_Wellbroad_samplesolventInChIKeypert_inamepubchem_cidtargettarget_listpert_typecontrol_type...Time_delayTimes_imagedAnomalyNumber_of_imagesMetadata_SiteMetadata_ChannelNameMetadata_PlaneIDMetadata_PositionZMetadata_FileUrlMetadata_Filename
2312A02NaNDMSOIAZDPXIOMUYVGZ-UHFFFAOYSA-NDMSO679.0NaNNaNcontrolnegcon...Day01none276481DNA1-0.000002s3://cellpainting-gallery/cpg0000-jump-pilot/s...r01c02f01p01-ch5sk1fk1fl1.tiff
2313A02NaNDMSOIAZDPXIOMUYVGZ-UHFFFAOYSA-NDMSO679.0NaNNaNcontrolnegcon...Day01none276482DNA1-0.000002s3://cellpainting-gallery/cpg0000-jump-pilot/s...r01c02f02p01-ch5sk1fk1fl1.tiff
2314A02NaNDMSOIAZDPXIOMUYVGZ-UHFFFAOYSA-NDMSO679.0NaNNaNcontrolnegcon...Day01none276483DNA1-0.000002s3://cellpainting-gallery/cpg0000-jump-pilot/s...r01c02f03p01-ch5sk1fk1fl1.tiff
2315A02NaNDMSOIAZDPXIOMUYVGZ-UHFFFAOYSA-NDMSO679.0NaNNaNcontrolnegcon...Day01none276484DNA1-0.000002s3://cellpainting-gallery/cpg0000-jump-pilot/s...r01c02f04p01-ch5sk1fk1fl1.tiff
2316A02NaNDMSOIAZDPXIOMUYVGZ-UHFFFAOYSA-NDMSO679.0NaNNaNcontrolnegcon...Day01none276485DNA1-0.000002s3://cellpainting-gallery/cpg0000-jump-pilot/s...r01c02f05p01-ch5sk1fk1fl1.tiff
\n", + "

5 rows × 30 columns

\n", + "
" + ], + "text/plain": [ + " Metadata_Well broad_sample solvent InChIKey \\\n", + "2312 A02 NaN DMSO IAZDPXIOMUYVGZ-UHFFFAOYSA-N \n", + "2313 A02 NaN DMSO IAZDPXIOMUYVGZ-UHFFFAOYSA-N \n", + "2314 A02 NaN DMSO IAZDPXIOMUYVGZ-UHFFFAOYSA-N \n", + "2315 A02 NaN DMSO IAZDPXIOMUYVGZ-UHFFFAOYSA-N \n", + "2316 A02 NaN DMSO IAZDPXIOMUYVGZ-UHFFFAOYSA-N \n", + "\n", + " pert_iname pubchem_cid target target_list pert_type control_type ... \\\n", + "2312 DMSO 679.0 NaN NaN control negcon ... \n", + "2313 DMSO 679.0 NaN NaN control negcon ... \n", + "2314 DMSO 679.0 NaN NaN control negcon ... \n", + "2315 DMSO 679.0 NaN NaN control negcon ... \n", + "2316 DMSO 679.0 NaN NaN control negcon ... \n", + "\n", + " Time_delay Times_imaged Anomaly Number_of_images Metadata_Site \\\n", + "2312 Day0 1 none 27648 1 \n", + "2313 Day0 1 none 27648 2 \n", + "2314 Day0 1 none 27648 3 \n", + "2315 Day0 1 none 27648 4 \n", + "2316 Day0 1 none 27648 5 \n", + "\n", + " Metadata_ChannelName Metadata_PlaneID Metadata_PositionZ \\\n", + "2312 DNA 1 -0.000002 \n", + "2313 DNA 1 -0.000002 \n", + "2314 DNA 1 -0.000002 \n", + "2315 DNA 1 -0.000002 \n", + "2316 DNA 1 -0.000002 \n", + "\n", + " Metadata_FileUrl \\\n", + "2312 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "2313 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "2314 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "2315 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "2316 s3://cellpainting-gallery/cpg0000-jump-pilot/s... 
\n", + "\n", + " Metadata_Filename \n", + "2312 r01c02f01p01-ch5sk1fk1fl1.tiff \n", + "2313 r01c02f02p01-ch5sk1fk1fl1.tiff \n", + "2314 r01c02f03p01-ch5sk1fk1fl1.tiff \n", + "2315 r01c02f04p01-ch5sk1fk1fl1.tiff \n", + "2316 r01c02f05p01-ch5sk1fk1fl1.tiff \n", + "\n", + "[5 rows x 30 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "negcon_a549_48_manifest = MANIFEST[\n", + " (MANIFEST[\"Cell_type\"] == \"A549\") &\n", + " (MANIFEST[\"Anomaly\"] == \"none\") &\n", + " (MANIFEST[\"control_type\"] == 'negcon') &\n", + " (MANIFEST[\"Time\"] == 48)\n", + "]\n", + "negcon_a549_48_manifest.head()" + ] + }, + { + "cell_type": "markdown", + "id": "6b4431f0", + "metadata": {}, + "source": [ + "## Arrange the manifest as wide: the format anticipated by virtual stain flow datasets and expected by the download helper" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "952e2717", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Metadata_Plate", + "rawType": "object", + "type": "string" + }, + { + "name": "Metadata_Well", + "rawType": "object", + "type": "string" + }, + { + "name": "Metadata_Site", + "rawType": "int64", + "type": "integer" + }, + { + "name": "LZ_BF", + "rawType": "object", + "type": "string" + }, + { + "name": "BF", + "rawType": "object", + "type": "string" + }, + { + "name": "HZ_BF", + "rawType": "object", + "type": "string" + }, + { + "name": "DNA", + "rawType": "object", + "type": "string" + }, + { + "name": "Mito", + "rawType": "object", + "type": "string" + }, + { + "name": "AGP", + "rawType": "object", + "type": "string" + }, + { + "name": "ER", + "rawType": "object", + "type": "string" + }, + { + "name": "RNA", + "rawType": "object", + "type": "string" + } + ], + "ref": 
"a6710032-e5c6-41b0-a361-af52c8cb0633", + "rows": [ + [ + "0", + "BR00117008", + "A02", + "1", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f01p01-ch7sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f01p01-ch8sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f01p01-ch6sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f01p01-ch5sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f01p01-ch1sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f01p01-ch2sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f01p01-ch4sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f01p01-ch3sk1fk1fl1.tiff" + ], + [ + "1", + "BR00117008", + "A02", + "2", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f02p01-ch7sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f02p01-ch8sk1fk1fl1.tiff", + 
"s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f02p01-ch6sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f02p01-ch5sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f02p01-ch1sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f02p01-ch2sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f02p01-ch4sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f02p01-ch3sk1fk1fl1.tiff" + ], + [ + "2", + "BR00117008", + "A02", + "3", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f03p01-ch7sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f03p01-ch8sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f03p01-ch6sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f03p01-ch5sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f03p01-ch1sk1fk1fl1.tiff", + 
"s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f03p01-ch2sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f03p01-ch4sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f03p01-ch3sk1fk1fl1.tiff" + ], + [ + "3", + "BR00117008", + "A02", + "4", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f04p01-ch7sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f04p01-ch8sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f04p01-ch6sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f04p01-ch5sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f04p01-ch1sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f04p01-ch2sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f04p01-ch4sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f04p01-ch3sk1fk1fl1.tiff" + ], + [ + "4", + 
"BR00117008", + "A02", + "5", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f05p01-ch7sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f05p01-ch8sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f05p01-ch6sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f05p01-ch5sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f05p01-ch1sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f05p01-ch2sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f05p01-ch4sk1fk1fl1.tiff", + "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/images/2020_11_04_CPJUMP1/images/BR00117008__2020-11-09T00_55_40-Measurement1/Images/r01c02f05p01-ch3sk1fk1fl1.tiff" + ] + ], + "shape": { + "columns": 11, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Metadata_ChannelNameMetadata_PlateMetadata_WellMetadata_SiteLZ_BFBFHZ_BFDNAMitoAGPERRNA
0BR00117008A021s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...
1BR00117008A022s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...
2BR00117008A023s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...
3BR00117008A024s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...
4BR00117008A025s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...s3://cellpainting-gallery/cpg0000-jump-pilot/s...
\n", + "
" + ], + "text/plain": [ + "Metadata_ChannelName Metadata_Plate Metadata_Well Metadata_Site \\\n", + "0 BR00117008 A02 1 \n", + "1 BR00117008 A02 2 \n", + "2 BR00117008 A02 3 \n", + "3 BR00117008 A02 4 \n", + "4 BR00117008 A02 5 \n", + "\n", + "Metadata_ChannelName LZ_BF \\\n", + "0 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "1 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "2 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "3 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "4 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "\n", + "Metadata_ChannelName BF \\\n", + "0 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "1 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "2 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "3 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "4 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "\n", + "Metadata_ChannelName HZ_BF \\\n", + "0 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "1 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "2 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "3 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "4 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "\n", + "Metadata_ChannelName DNA \\\n", + "0 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "1 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "2 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "3 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "4 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "\n", + "Metadata_ChannelName Mito \\\n", + "0 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "1 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "2 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "3 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "4 s3://cellpainting-gallery/cpg0000-jump-pilot/s... 
\n", + "\n", + "Metadata_ChannelName AGP \\\n", + "0 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "1 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "2 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "3 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "4 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "\n", + "Metadata_ChannelName ER \\\n", + "0 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "1 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "2 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "3 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "4 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "\n", + "Metadata_ChannelName RNA \n", + "0 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "1 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "2 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "3 s3://cellpainting-gallery/cpg0000-jump-pilot/s... \n", + "4 s3://cellpainting-gallery/cpg0000-jump-pilot/s... 
" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wide_manifest = arrange_manifest_channels(negcon_a549_48_manifest)\n", + "wide_manifest.head()" + ] + }, + { + "cell_type": "markdown", + "id": "d62c00e5", + "metadata": {}, + "source": [ + "## Data split" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "848fd839", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train plates: 9, Test plates: 3\n", + "Train samples: 6974, Test samples: 1728\n" + ] + } + ], + "source": [ + "a549_data_dir = DATA_DOWNLOAD_DIR / \"cpjump1_a549_48h\"\n", + "a549_data_dir.mkdir(exist_ok=True, parents=True)\n", + "\n", + "# Get unique plates\n", + "unique_plates = wide_manifest['Metadata_Plate'].unique()\n", + "\n", + "# Split plates into train (75%) and test (25%) with seed\n", + "train_plates, test_plates = train_test_split(\n", + " unique_plates, \n", + " test_size=0.25, \n", + " random_state=42\n", + ")\n", + "\n", + "# Create train and test manifests based on plate split\n", + "train_manifest_wide = wide_manifest[wide_manifest['Metadata_Plate'].isin(train_plates)]\n", + "test_manifest_wide = wide_manifest[wide_manifest['Metadata_Plate'].isin(test_plates)]\n", + "\n", + "print(f\"Train plates: {len(train_plates)}, Test plates: {len(test_plates)}\")\n", + "print(f\"Train samples: {len(train_manifest_wide)}, Test samples: {len(test_manifest_wide)}\")\n", + "\n", + "negcon_a549_48_manifest.to_csv(a549_data_dir / \"raw_manifest.csv\", index=False)\n", + "train_manifest_wide.to_csv(a549_data_dir/ \"train_manifest.csv\", index=False)\n", + "test_manifest_wide.to_csv(a549_data_dir / \"test_manifest.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "id": "70d2ff65", + "metadata": {}, + "source": [ + "## Download all data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "010575e5", + "metadata": {}, + "outputs": [], + "source": 
[ + "_ = download_wide_manifest_channels(\n", + " train_manifest_wide,\n", + " dest_dir = a549_data_dir / \"train\" \n", + ")\n", + "_ = download_wide_manifest_channels(\n", + " test_manifest_wide,\n", + " dest_dir = a549_data_dir / \"test\" \n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "virtual_stain_flow", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/nbconverted/0.download_example_dataset.py b/examples/nbconverted/0.download_example_dataset.py new file mode 100644 index 0000000..3fc359c --- /dev/null +++ b/examples/nbconverted/0.download_example_dataset.py @@ -0,0 +1,162 @@ +#!/usr/bin/env python +# coding: utf-8 + +# # Download JUMP pilot plate data from AWS S3 bucket for example training + +# In[1]: + + +from pathlib import Path +from urllib.parse import urlparse + +import pandas as pd +from sklearn.model_selection import train_test_split + +from virtual_stain_flow.datasets.example.cpjump1_manifest import get_manifest +from virtual_stain_flow.datasets.example.arrange_as_wide import arrange_manifest_channels + + +# ## Pathing + +# In[ ]: + + +DATA_DOWNLOAD_DIR = Path("/PATH/TO/WHERE/YOU/WANT/TO/DOWNLOAD/CPJUMP1") +DATA_DOWNLOAD_DIR.mkdir(exist_ok=True, parents=True) + + +# ## S3 download helpers + +# In[3]: + + +def _parse_s3_url(url): + parsed = urlparse(url) + if parsed.scheme != "s3": + raise ValueError(f"Expected s3:// URL, got: {url}") + return parsed.netloc, parsed.path.lstrip("/") + +def download_wide_manifest_channels( + wide_manifest, + dest_dir, + channel_columns=None, + overwrite=False, +): + """ + Download S3 TIFFs for each channel and write a local file_index.csv with paths. 
+ """ + if channel_columns is None: + channel_columns = ["LZ_BF", "BF", "HZ_BF", "DNA", "Mito", "AGP", "ER", "RNA"] + dest_dir = Path(dest_dir) + dest_dir.mkdir(parents=True, exist_ok=True) + try: + import boto3 + from botocore import UNSIGNED + from botocore.config import Config + except ImportError as exc: + raise ImportError( + "boto3 is required for S3 downloads. Install with: pip install boto3" + ) from exc + s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED)) + local_rows = [] + for row_idx, row in wide_manifest.iterrows(): + prefix_parts = [] + for key in ["Metadata_Plate", "Metadata_Well", "Metadata_Site"]: + if key in wide_manifest.columns: + prefix_parts.append(str(row[key])) + prefix = "_".join(prefix_parts) if prefix_parts else f"row_{row_idx}" + local_row = {} + for channel in channel_columns: + url = row[channel] if channel in wide_manifest.columns else None + if pd.isna(url): + local_row[channel] = None + continue + bucket, key = _parse_s3_url(url) + suffix = Path(key).suffix or ".tif" + local_path = dest_dir / f"{prefix}_{channel}{suffix}" + if overwrite or not local_path.exists(): + s3.download_file(bucket, key, str(local_path)) + local_row[channel] = str(local_path) + local_rows.append(local_row) + file_index = pd.DataFrame(local_rows, columns=channel_columns) + file_index.to_csv(dest_dir / "file_index.csv", index=False) + return file_index + + +# ## Retrieve compound manifest + +# In[4]: + + +MANIFEST = get_manifest() +MANIFEST.head() + + +# ## Filter manifest +# For the sake of demoing training, here we restrict the timepoint to 48 h and select negative-control (untreated) A549 cells with no annotated anomaly + +# In[5]: + + +negcon_a549_48_manifest = MANIFEST[ + (MANIFEST["Cell_type"] == "A549") & + (MANIFEST["Anomaly"] == "none") & + (MANIFEST["control_type"] == 'negcon') & + (MANIFEST["Time"] == 48) +] +negcon_a549_48_manifest.head() + + +# ## Arrange as wide is the anticipated format in virtual stain flow datasets and also the format the download helper expects this
format + +# In[6]: + + +wide_manifest = arrange_manifest_channels(negcon_a549_48_manifest) +wide_manifest.head() + + +# ## Data split + +# In[7]: + + +a549_data_dir = DATA_DOWNLOAD_DIR / "cpjump1_a549_48h" +a549_data_dir.mkdir(exist_ok=True, parents=True) + +# Get unique plates +unique_plates = wide_manifest['Metadata_Plate'].unique() + +# Split plates into train (75%) and test (25%) with seed +train_plates, test_plates = train_test_split( + unique_plates, + test_size=0.25, + random_state=42 +) + +# Create train and test manifests based on plate split +train_manifest_wide = wide_manifest[wide_manifest['Metadata_Plate'].isin(train_plates)] +test_manifest_wide = wide_manifest[wide_manifest['Metadata_Plate'].isin(test_plates)] + +print(f"Train plates: {len(train_plates)}, Test plates: {len(test_plates)}") +print(f"Train samples: {len(train_manifest_wide)}, Test samples: {len(test_manifest_wide)}") + +negcon_a549_48_manifest.to_csv(a549_data_dir / "raw_manifest.csv", index=False) +train_manifest_wide.to_csv(a549_data_dir/ "train_manifest.csv", index=False) +test_manifest_wide.to_csv(a549_data_dir / "test_manifest.csv", index=False) + + +# ## Download all data + +# In[ ]: + + +_ = download_wide_manifest_channels( + train_manifest_wide, + dest_dir = a549_data_dir / "train" +) +_ = download_wide_manifest_channels( + test_manifest_wide, + dest_dir = a549_data_dir / "test" +) + diff --git a/pyproject.toml b/pyproject.toml index 1dfdd8a..2f7b3dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,7 @@ dependencies = [ "notebook", "tifffile", "pandera[pandas]", + "boto3" ] [project.optional-dependencies] diff --git a/src/virtual_stain_flow/datasets/example/arrange_as_wide.py b/src/virtual_stain_flow/datasets/example/arrange_as_wide.py new file mode 100644 index 0000000..f150bf3 --- /dev/null +++ b/src/virtual_stain_flow/datasets/example/arrange_as_wide.py @@ -0,0 +1,32 @@ +""" +Helper utility specifically to support the example CPJUMP1 dataset + pivoting and 
arranging as file index. +""" + +import pandas as pd + + +def arrange_manifest_channels(manifest): + """ + Return a wide dataframe with one row per plate/well/site and URL columns per channel. + """ + required_channels = ["LZ_BF", "BF", "HZ_BF", "DNA", "Mito", "AGP", "ER", "RNA"] + keys = ["Metadata_Plate", "Metadata_Well", "Metadata_Site"] + filtered = manifest[manifest["Metadata_ChannelName"].isin(required_channels)].copy() + filtered["Metadata_ChannelName"] = filtered["Metadata_ChannelName"].astype( + pd.CategoricalDtype(categories=required_channels, ordered=True) + ) + filtered = filtered.sort_values(keys + ["Metadata_ChannelName"]) + wide = ( + filtered.pivot_table( + index=keys, + columns="Metadata_ChannelName", + values="Metadata_FileUrl", + aggfunc="first", + observed=False, + ) + .reindex(columns=required_channels) + .reset_index() + ) + + return wide diff --git a/src/virtual_stain_flow/datasets/example/cpjump1_manifest.py b/src/virtual_stain_flow/datasets/example/cpjump1_manifest.py new file mode 100644 index 0000000..cd45a8b --- /dev/null +++ b/src/virtual_stain_flow/datasets/example/cpjump1_manifest.py @@ -0,0 +1,144 @@ +"""Build an enriched image manifest for CPJUMP1 dataset access. + +Only compound perturbations (no CRISPR or ORF) are included, which is +appropriate for virtual staining experiments. +""" + +from __future__ import annotations + +import argparse +import sys +from typing import Optional + +import pandas as pd + +# Most recent commit ref as of Mar 25 2026. 
# Commit of the WayScience/JUMP-single-cell repository that every remote table
# below is read from.
REPO_REF = "6ea3958c3809cd04ac95b63138937dd64a7c4c12"
REPO_BASE = f"https://github.com/WayScience/JUMP-single-cell/raw/{REPO_REF}/"

# Image-level table; only the columns listed here are loaded from the parquet.
IMAGE_MANIFEST_URL = f"{REPO_BASE}0.download_data/data/2020_11_04_CPJUMP1_all_plates.parquet"
IMAGE_MANIFEST_COLUMNS = [
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_Site",
    "Metadata_ChannelName",
    "Metadata_PlaneID",
    "Metadata_PositionZ",
    "Metadata_FileUrl",
    "Metadata_Filename",
]

# Tab-separated reference tables joined onto the image table.
EXPERIMENT_METADATA_URL = f"{REPO_BASE}reference_plate_data/experiment-metadata.tsv"
COMPOUND_PLATEMAP_URL = f"{REPO_BASE}reference_plate_data/JUMP-Target-1_compound_platemap.txt"
COMPOUND_METADATA_URL = f"{REPO_BASE}reference_plate_data/JUMP-Target-1_compound_metadata_targets.tsv"

__all__ = ["build_manifest", "get_manifest", "main"]

# Process-wide cache filled by the first get_manifest() call.
_MANIFEST_CACHE: Optional[pd.DataFrame] = None


def build_manifest() -> pd.DataFrame:
    """
    Build the enriched CPJUMP1 compound manifest from the remote tables.

    Reads the image manifest (parquet) plus the experiment metadata, compound
    platemap and compound metadata (tab-separated) over the network, keeps
    only plates whose ``Perturbation`` is ``"compound"`` and whose ``Batch``
    does not end in ``"_DL"``, and inner-joins everything into one frame.

    Returns:
        Image rows of the retained plates, each carrying the experiment,
        platemap and compound-metadata columns next to the image columns.

    Raises:
        pandas.errors.MergeError: If a join violates the cardinality declared
            through its ``validate`` argument.
    """
    image_manifest = pd.read_parquet(IMAGE_MANIFEST_URL, columns=IMAGE_MANIFEST_COLUMNS)

    experiment_meta = pd.read_csv(EXPERIMENT_METADATA_URL, delimiter="\t").rename(
        columns={"Assay_Plate_Barcode": "Metadata_Plate"}
    )
    is_compound = experiment_meta["Perturbation"] == "compound"
    # Exclude the DL batch, which is essentially a duplicate in the context of
    # image data access. na=False keeps the mask strictly boolean, so a row
    # with a missing Batch is retained instead of making `~` raise.
    is_dl_batch = experiment_meta["Batch"].str.endswith("_DL", na=False)
    experiment_meta = experiment_meta[is_compound & ~is_dl_batch]

    compound_platemap = pd.merge(
        pd.read_csv(COMPOUND_PLATEMAP_URL, delimiter="\t"),
        pd.read_csv(COMPOUND_METADATA_URL, delimiter="\t"),
        on="broad_sample",
        how="left",
        validate="many_to_one",
    ).rename(columns={"well_position": "Metadata_Well"})

    image_manifest_compound = pd.merge(
        experiment_meta,
        image_manifest,
        on="Metadata_Plate",
        how="inner",
        validate="one_to_many",  # one plate id should map to many image rows
    )

    return pd.merge(
        compound_platemap,
        image_manifest_compound,
        on="Metadata_Well",
        how="inner",
        # All the plates share the same well map, so one well should map to
        # many image rows.
        validate="one_to_many",
    )


def get_manifest() -> pd.DataFrame:
    """
    Return the manifest, building it once and caching it for later calls.

    The first call runs ``build_manifest`` (network reads); later calls reuse
    the cached frame. Each call hands back an independent copy, so in-place
    edits made by one caller (added columns, dropped rows, renames) cannot
    leak into the cache or into frames returned to other callers.

    Returns:
        A copy of the cached enriched manifest.
    """
    global _MANIFEST_CACHE
    if _MANIFEST_CACHE is None:
        _MANIFEST_CACHE = build_manifest()
    return _MANIFEST_CACHE.copy()


def _write_manifest(df: pd.DataFrame, output: str, fmt: str) -> None:
    """
    Write ``df`` to ``output`` without the index, as CSV or Parquet.

    Args:
        df: Frame to serialise.
        output: Destination path.
        fmt: ``"csv"`` or ``"parquet"``.

    Raises:
        ValueError: If ``fmt`` is neither ``"csv"`` nor ``"parquet"``.
    """
    if fmt == "csv":
        df.to_csv(output, index=False)
    elif fmt == "parquet":
        df.to_parquet(output, index=False)
    else:
        raise ValueError(f"Unsupported format: {fmt}")


def main(argv: Optional[list[str]] = None) -> int:
    """
    Command-line interface for building and outputting the CPJUMP1 manifest.

    With no options, prints a row/column summary followed by the first
    ``--head`` rows. ``--stdout`` streams the full manifest as CSV to stdout;
    ``--output`` writes it to a file in the format chosen by ``--format``.
    When both are given, ``--stdout`` takes precedence and no file is written.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        Process exit code, always ``0`` when the function returns.
    """
    parser = argparse.ArgumentParser(description="Build CPJUMP1 enriched manifest.")
    parser.add_argument(
        "--output",
        help="Write manifest to a file (CSV or Parquet).",
    )
    parser.add_argument(
        "--format",
        choices=["csv", "parquet"],
        default="csv",
        help="Output file format when using --output (default: csv).",
    )
    parser.add_argument(
        "--stdout",
        action="store_true",
        help="Write manifest to stdout as CSV.",
    )
    parser.add_argument(
        "--head",
        type=int,
        default=5,
        help="Rows to display when no output is specified (default: 5).",
    )
    args = parser.parse_args(argv)

    manifest = get_manifest()

    if args.stdout:
        manifest.to_csv(sys.stdout, index=False)
        return 0

    if args.output:
        _write_manifest(manifest, args.output, args.format)
        return 0

    print(f"Rows: {len(manifest):,} | Columns: {len(manifest.columns)}")
    print(manifest.head(args.head).to_string(index=False))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())