diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f44de138..bf065d53 100755 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -343,4 +343,52 @@ jobs: sgosline/novartis:latest sgosline/novartis:${{ github.ref_name }} push: true + platforms: linux/amd64 + + build-cnf: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - name: Build and push cnf + uses: docker/build-push-action@v3 + with: + file: ./coderbuild/docker/Dockerfile.cnf + tags: | + sgosline/cnf:latest + sgosline/cnf:${{ github.ref_name }} + push: true + platforms: linux/amd64 + + build-phosphosites: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - name: Build and push phosphosites + uses: docker/build-push-action@v3 + with: + file: ./coderbuild/docker/Dockerfile.phosphosites + tags: | + sgosline/phosphosites:latest + sgosline/phosphosites:${{ github.ref_name }} + push: true platforms: linux/amd64 \ No newline at end of file diff --git a/coderbuild/build_all.py b/coderbuild/build_all.py index 067dd5dc..8e6c50dd 100644 --- a/coderbuild/build_all.py +++ b/coderbuild/build_all.py @@ -41,7 +41,7 @@ def main(): parser.add_argument('--figshare', action='store_true', help="Upload all local data to Figshare. FIGSHARE_TOKEN must be set in local environment.") parser.add_argument('--all',dest='all',default=False,action='store_true', help="Run all data build commands. This includes docker, samples, omics, drugs, exp arguments. This does not run the validate or figshare commands") parser.add_argument('--high_mem',dest='high_mem',default=False,action='store_true',help = "If you have 32 or more CPUs, this option is recommended. It will run many code portions in parallel. If you don't have enough memory, this will cause a run failure.") - parser.add_argument('--dataset',dest='datasets',default='broad_sanger,beataml,pancreatic,bladder,sarcoma,liver,novartis,colorectal,mpnst,hcmi',help='Datasets to process. Defaults to all available.') + parser.add_argument('--dataset',dest='datasets',default='broad_sanger,beataml,pancreatic,bladder,sarcoma,liver,novartis,colorectal,mpnst,cnf,hcmi',help='Datasets to process. Defaults to all available.') parser.add_argument('--version', type=str, required=False, help='Version number for the Figshare upload title (e.g., "0.1.29"). This is required for Figshare upload. 
This must be a higher version than previously published versions.')
     parser.add_argument('--github-username', type=str, required=False, help='GitHub username for the repository.')
     parser.add_argument('--github-email', type=str, required=False, help='GitHub email for the repository.')
@@ -65,7 +65,7 @@ def run_docker_cmd(cmd_arr,filename):
     print('running...'+filename)
     env = os.environ.copy()
     if 'SYNAPSE_AUTH_TOKEN' not in env.keys():
-        print('You need to set the SYNAPSE_AUTH_TOKEN to acess the MPNST, beatAML, bladder, pancreatic, liver, or sarcoma datasets')
+        print('You need to set the SYNAPSE_AUTH_TOKEN to access the MPNST, beatAML, bladder, pancreatic, liver, sarcoma, or cnf datasets')
         docker_run = ['docker','run','--rm','-v',env['PWD']+'/local/:/tmp/','--platform=linux/amd64']
     else:
         docker_run = ['docker','run','--rm','-v',env['PWD']+'/local/:/tmp/','-e','SYNAPSE_AUTH_TOKEN='+env['SYNAPSE_AUTH_TOKEN'],'--platform=linux/amd64']
@@ -113,13 +113,17 @@ def process_docker(datasets):
         'colorectal': ['colorectal'],
         'cptac': ['cptac'],
         'genes': ['genes'],
+        'phosphosites': ['phosphosites'],
         'upload': ['upload'],
         'liver': ['liver'],
-        'novartis': ['novartis']
+        'novartis': ['novartis'],
+        'cnf': ['cnf']
     }

     # Collect container names to build based on the datasets provided. Always build genes and upload.
     datasets_to_build = ['genes', 'upload']
+    if 'cnf' in datasets:
+        datasets_to_build.append('phosphosites')
     for dataset in datasets:
         datasets_to_build.extend(dataset_map.get(dataset, []))

@@ -188,21 +192,35 @@ def process_samples(executor, datasets):
             last_sample_future = executor.submit(run_docker_cmd, [di, 'bash', 'build_samples.sh', sf], f'{da} samples')
             sf = f'/tmp/{da}_samples.csv'

-def process_omics(executor, datasets,high_mem):
+def process_phosphosites(executor):
+    '''
+    Build the phosphosites reference file if it does not exist.
+    Caller must ensure genes.csv exists first. Returns a Future (or None).
+    '''
+    if not os.path.exists('local/phosphosites.csv'):
+        return executor.submit(
+            run_docker_cmd,
+            ['phosphosites', 'bash', 'build_phosphosites.sh', '/tmp/genes.csv'],
+            'phosphosites file',
+        )
+    return None
+
+def process_omics(executor, datasets, high_mem):
     '''
     Build all omics files concurrently
     '''
     last_omics_future = None
     for da in datasets:
         di = 'broad_sanger_omics' if da == 'broad_sanger' else da
-        #Run all at once:
+        omics_cmd = [di, 'bash', 'build_omics.sh', '/tmp/genes.csv', f'/tmp/{da}_samples.csv']
+        if da == 'cnf':
+            omics_cmd.append('/tmp/phosphosites.csv')
         if high_mem:
-            executor.submit(run_docker_cmd, [di, 'bash', 'build_omics.sh', '/tmp/genes.csv', f'/tmp/{da}_samples.csv'], f'{da} omics')
-        #Run one at a time.
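+            # For cnf, omics_cmd (built above) also carries /tmp/phosphosites.csv,
+            # so its build_omics.sh receives the phosphosite reference alongside
+            # genes.csv and the samples file.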
+            executor.submit(run_docker_cmd, omics_cmd, f'{da} omics')
         else:
             if last_omics_future:
-                last_omics_future.result()
-            last_omics_future = executor.submit(run_docker_cmd, [di, 'bash', 'build_omics.sh', '/tmp/genes.csv', f'/tmp/{da}_samples.csv'], f'{da} omics')
+                last_omics_future.result()
+            last_omics_future = executor.submit(run_docker_cmd, omics_cmd, f'{da} omics')

 def process_experiments(executor, datasets, high_mem):
     '''
@@ -247,8 +265,9 @@ def process_misc(executor, datasets, high_mem):

 def process_genes(executor):
-    if not os.path.exists('/tmp/genes.csv'):
-        executor.submit(run_docker_cmd,['genes','bash','build_genes.sh'],'gene file')
+    if not os.path.exists('local/genes.csv'):
+        return executor.submit(run_docker_cmd,['genes','bash','build_genes.sh'],'gene file')
+    return None

 def run_docker_upload_cmd(cmd_arr, all_files_dir, name, version):
@@ -320,9 +339,9 @@ def get_latest_commit_hash(owner, repo, branch='main'):
     # Error handling for required tokens
     if args.figshare and not figshare_token:
         raise ValueError("FIGSHARE_TOKEN environment variable is not set.")
-    if any(dataset in args.datasets for dataset in ['beataml', 'mpnst', 'bladder', 'pancreatic','sarcoma','liver','novartis','colorectal']) and not synapse_auth_token:
+    if any(dataset in args.datasets for dataset in ['beataml', 'mpnst', 'bladder', 'pancreatic','sarcoma','liver','novartis','colorectal','cnf']) and not synapse_auth_token:
         if args.docker or args.samples or args.omics or args.drugs or args.exp or args.all: # Token only required if building data, not upload or validate.
-            raise ValueError("SYNAPSE_AUTH_TOKEN is required for accessing MPNST, beatAML, bladder, pancreatic, liver, novartis, colorectal or sarcoma datasets.")
+            raise ValueError("SYNAPSE_AUTH_TOKEN is required for accessing MPNST, beatAML, bladder, pancreatic, liver, novartis, colorectal, sarcoma, or cnf datasets.")

     ######
     ### Begin Pipeline
@@ -342,25 +361,34 @@ def get_latest_commit_hash(owner, repo, branch='main'):
         print("Docker image generation completed")

-    ### Build Drugs files, Samples files, and Genes file. These two steps are run in Parallel. 
+    ### Build Drugs files, Samples files, and Genes file. These steps are run in parallel.
     ### Within each step, sequential running is required.
     with ThreadPoolExecutor() as executor:
         if args.samples or args.all:
             sample_thread = executor.submit(process_samples,executor, datasets)
         if args.drugs or args.all:
             drug_thread = executor.submit(process_drugs,executor, datasets)
+
+        # Genes must finish before phosphosites can start (phosphosites reads genes.csv).
+        # Run genes now and wait; then submit phosphosites (which can overlap with samples/drugs).
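+        # Blocking on the genes future here stalls only this coordinating
+        # thread; the samples/drugs futures submitted above keep running
+        # in the executor in the meantime.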
         if args.samples or args.omics or args.exp or args.all:
-            gene_thread = executor.submit(process_genes,executor)
-
-        # Wait for both processes to complete before proceeding to omics and experiments
+            genes_future = process_genes(executor)
+            if genes_future is not None:
+                genes_future.result()
+
+        phosphosite_future = None
+        if (args.omics or args.all) and 'cnf' in datasets:
+            phosphosite_future = process_phosphosites(executor)
+
+        # Wait for all remaining tasks to complete before proceeding to omics and experiments
         if args.drugs or args.all:
             drug_thread.result()
-        if args.samples or args.all:##need to wait for samples for all of these
+        if args.samples or args.all:
             sample_thread.result()
-        if args.samples or args.omics or args.exp or args.all:
-            gene_thread.result()
-
-        print("All samples, drugs files, and genes file completed or skipped")
+        if phosphosite_future is not None:
+            phosphosite_future.result()
+
+        print("All samples, drugs, genes, and phosphosites files completed or skipped")

     ### At this point in the pipeline, all samples and drugs files have been created. There are no blockers to proceed.
@@ -399,7 +427,7 @@ def get_latest_commit_hash(owner, repo, branch='main'):

     # if args.figshare or args.validate:
     # FigShare File Prefixes:
-    prefixes = ['beataml', 'hcmi', 'cptac', 'pancreatic', 'bladder','sarcoma', 'genes', 'drugs', 'liver','novartis','colorectal','mpnst']
+    prefixes = ['beataml', 'hcmi', 'cptac', 'pancreatic', 'bladder', 'sarcoma', 'genes', 'drugs', 'liver', 'novartis', 'colorectal', 'mpnst', 'cnf', 'phosphosites']
     broad_sanger_datasets = ["ccle","ctrpv2","fimm","gdscv1","gdscv2","gcsi","prism","nci60"]
     if "broad_sanger" in datasets:
         prefixes.extend(broad_sanger_datasets)
diff --git a/coderbuild/build_dataset.py b/coderbuild/build_dataset.py
index 7c5ed10c..de5085d0 100644
--- a/coderbuild/build_dataset.py
+++ b/coderbuild/build_dataset.py
@@ -45,11 +45,13 @@ def process_docker(dataset,validate):
         'cptac': ['cptac'],
         'sarcoma': ['sarcoma'],
         'genes': ['genes'],
+        'phosphosites': ['phosphosites'],
         'upload': ['upload'],
-        'colorectal': ['colorectal'], 
+        'colorectal': ['colorectal'],
         'bladder': ['bladder'],
         'liver': ['liver'],
-        'novartis': ['novartis']
+        'novartis': ['novartis'],
+        'cnf': ['cnf']
     }

     # Collect container names to build based on the dataset provided. Always build 'genes'.
@@ -57,7 +59,10 @@ def process_docker(dataset,validate):
         datasets_to_build.append('upload')
-
+    # phosphosites container is required when building cnf
+    if dataset == 'cnf':
+        datasets_to_build.append('phosphosites')
+
     datasets_to_build.extend(dataset_map.get(dataset, []))

     compose_command = ['docker', 'compose', '-f', compose_file, 'build'] + datasets_to_build
@@ -79,10 +84,26 @@ def process_docker(dataset,validate):

 def process_genes(executor):
     '''
-    Build the genes file if it does not exist.
+    Build the genes file if it does not exist. Returns a Future (or None if already built).
     '''
     if not os.path.exists('local/genes.csv'):
-        executor.submit(run_docker_cmd, ['genes', 'bash', 'build_genes.sh'], 'genes file')
+        return executor.submit(run_docker_cmd, ['genes', 'bash', 'build_genes.sh'], 'genes file')
+    return None
+
+
+def process_phosphosites(executor):
+    '''
+    Build the phosphosites reference file if it does not exist.
+    Only needed when the cnf dataset is being built. Returns a Future (or None).
+    Caller must ensure genes.csv exists before calling this.
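+    Safe to call repeatedly: once local/phosphosites.csv exists this is a no-op.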
+ ''' + if not os.path.exists('local/phosphosites.csv'): + return executor.submit( + run_docker_cmd, + ['phosphosites', 'bash', 'build_phosphosites.sh', '/tmp/genes.csv'], + 'phosphosites file', + ) + return None def process_samples(executor, dataset, use_prev_dataset, should_continue): ''' @@ -133,7 +154,8 @@ def process_omics(executor, dataset, should_continue): 'bladder': ['copy_number', 'mutations', 'transcriptomics'], 'colorectal':['copy_number', 'mutations', 'transcriptomics'], 'novartis':['copy_number', 'mutations', 'transcriptomics'], - 'liver':['copy_number', 'mutations', 'transcriptomics','proteomics'] + 'liver':['copy_number', 'mutations', 'transcriptomics','proteomics'], + 'cnf': ['transcriptomics', 'proteomics', 'phosphoproteomics'], } expected_omics = dataset_omics_files.get(dataset, []) @@ -167,7 +189,10 @@ def process_omics(executor, dataset, should_continue): di = 'broad_sanger_omics' if dataset == 'broad_sanger' else dataset filename = f'{dataset} omics' - executor.submit(run_docker_cmd, [di, 'bash', 'build_omics.sh', '/tmp/genes.csv', f'/tmp/{dataset}_samples.csv'], filename) + omics_cmd = [di, 'bash', 'build_omics.sh', '/tmp/genes.csv', f'/tmp/{dataset}_samples.csv'] + if dataset == 'cnf': + omics_cmd.append('/tmp/phosphosites.csv') + executor.submit(run_docker_cmd, omics_cmd, filename) def process_experiments(executor, dataset, should_continue): @@ -245,6 +270,8 @@ def run_schema_checker(dataset): ''' # Prepare the directory with the built files prefixes = ['genes', dataset] + if dataset == 'cnf': + prefixes.append('phosphosites') datasets = [dataset] broad_sanger_datasets = ["ccle","ctrpv2","fimm","gdscv1","gdscv2","gcsi","prism","nci60"] all_files_dir = 'all_files_dir' @@ -292,16 +319,25 @@ def main(): if args.build: # Use ThreadPoolExecutor for parallel execution with ThreadPoolExecutor() as executor: - # Always build genes file - process_genes(executor) + # Genes must finish before phosphosites can start (phosphosites reads genes.csv) + genes_future = process_genes(executor) + if genes_future is not None: + genes_future.result() + + # Now safe to start phosphosites (genes.csv is present) + phosphosites_future = None + if args.dataset == 'cnf': + phosphosites_future = process_phosphosites(executor) - # Build samples and drugs + # Build samples and drugs in parallel while phosphosites runs samples_future = executor.submit(process_samples, executor, args.dataset, args.use_prev_dataset, args.should_continue) drugs_future = executor.submit(process_drugs, executor, args.dataset, args.use_prev_dataset, args.should_continue) samples_future.result() drugs_future.result() - + if phosphosites_future is not None: + phosphosites_future.result() + print("Samples and Drugs Files Completed.") with ThreadPoolExecutor() as executor: diff --git a/coderbuild/cnf/01-samples-cnf.py b/coderbuild/cnf/01-samples-cnf.py new file mode 100644 index 00000000..41bb14ea --- /dev/null +++ b/coderbuild/cnf/01-samples-cnf.py @@ -0,0 +1,372 @@ +#!/usr/bin/env python3 +""" +01-samples-cnf.py: generate cnf_samples.csv for the cNF organoid drug screen. + +Specimens are gathered from four sources: + + 1. Drug-screen file index on Synapse (syn51301431, dataType='drug screen') + 2. Global proteomics matrix (env CNF_GLOBAL_PROT_SYN_ID, default syn74815895) + 3. RNA discovery file (env CNF_RNA_DISCOVERY_SYN_ID, default syn71333780). 
This is the corrected-abundance file containing every RNA condition
+     processed for the project (organoid, tumor tissue, drug-treated
+     organoid, skin), used here for sample discovery only — NOT to be
+     confused with CNF_RNA_COHORT_SYN_IDS, the per-cohort TPM matrices
+     that build_omics.py reads for the actual TPM data (and which may be
+     filtered to organoids only).
+  4. Normal Skin Synapse folder (env CNF_NORMAL_SKIN_SYN_ID, default syn74284682)
+
+Each specimen is classified as one of:
+  - "organoid"          → model_type = "patient derived organoid"
+  - "treated_organoid"  → model_type = "patient derived organoid"
+                          (drug-treated organoid; canonical ID preserves the
+                          treatment label, e.g. NF0021_T1_Onalespid_1uM)
+  - "tumor_tissue"      → model_type = "tumor"
+  - "normal_skin"       → model_type = "normal_tissue"
+
+Tissue and organoid samples for the same tumor are kept as separate rows with
+distinct improve_sample_ids — they have different biology (fresh tumor vs.
+cultured organoid) and the schema model_types differ. Treated and untreated
+organoids of the same tumor are also separate rows because their canonical
+IDs differ (treated samples preserve the treatment label).
+
+Cohort assignment is intentionally omitted. The upstream omics data is
+batch-corrected and per-modality cohort membership disagrees for some
+specimens, so a single per-sample cohort label cannot be assigned cleanly.
+The `other_id_source` field carries a uniform project label.
+
+Sample IDs continue from the previous samples file: max(improve_sample_id) + 1.
+
+Usage:
+    python 01-samples-cnf.py <prev_samples> [--output /tmp/cnf_samples.csv]
+"""
+
+import argparse
+import logging
+import os
+import re
+import sys
+
+import pandas as pd
+import synapseclient
+
+from cnf_utils import (
+    MODEL_TYPE_MAP,
+    classify_specimen,
+    patient_from_specimen,
+)
+
+
+# ----------------------------------------------------------------------------
+# Constants
+# ----------------------------------------------------------------------------
+DRUG_SCREEN_TABLE = "syn51301431"
+GLOBAL_PROT_SYN_ID = os.environ.get("CNF_GLOBAL_PROT_SYN_ID", "syn74815895")
+# RNA file used for SPECIMEN DISCOVERY only — needs to contain all
+# conditions (organoid + tissue + treated + skin). The corrected-abundance
+# file at syn71333780 fits this (the column is named correctedAbundance
+# but stores TPM values).
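+# Illustrative override (the ID below is a placeholder, not a real file):
+#   export CNF_RNA_DISCOVERY_SYN_ID=syn00000000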
+RNA_DISCOVERY_SYN_ID = os.environ.get("CNF_RNA_DISCOVERY_SYN_ID", "syn71333780") +NORMAL_SKIN_SYN_ID = os.environ.get("CNF_NORMAL_SKIN_SYN_ID", "syn74284682") + +DEFAULT_OUTPUT = "/tmp/cnf_samples.csv" + +CANCER_TYPE = "Cutaneous Neurofibroma" +SPECIES = "Homo sapiens (Human)" +OTHER_ID_SOURCE = "NF1_cNF_Project" + + +# ---------------------------------------------------------------------------- +# Helpers +# ---------------------------------------------------------------------------- +def configure_logging(): + logging.basicConfig( + format="[%(asctime)s] [%(levelname)s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO, + ) + + +def get_starting_id(prev_samples_path: str) -> int: + """Read previous samples file and return max(improve_sample_id) + 1.""" + if not os.path.exists(prev_samples_path): + logging.warning("Previous samples file %s not found; starting at 1.", + prev_samples_path) + return 1 + + df = pd.read_csv(prev_samples_path) + if "improve_sample_id" not in df.columns or df.empty: + logging.warning("No improve_sample_id column or empty file; starting at 1.") + return 1 + + max_id = int(pd.to_numeric(df["improve_sample_id"], errors="coerce").max()) + logging.info("Previous samples max improve_sample_id = %d", max_id) + return max_id + 1 + + +def merge_specimen_dicts(*dicts: dict[str, str]) -> dict[str, str]: + """Union per-source dicts; warn on type disagreements (first-seen wins).""" + merged: dict[str, str] = {} + for d in dicts: + for spec, stype in d.items(): + if spec in merged and merged[spec] != stype: + logging.warning( + "Type conflict for %s: %s (kept) vs %s (ignored)", + spec, merged[spec], stype, + ) + continue + merged.setdefault(spec, stype) + return merged + + +# ---------------------------------------------------------------------------- +# Source-specific specimen extraction +# ---------------------------------------------------------------------------- +def specimens_from_drug_screen(syn) -> dict[str, str]: + """Pull canonical_id → sample_type for drug-screen specimens.""" + query = ( + "select id, individualID, specimenID " + f"from {DRUG_SCREEN_TABLE} " + "where dataType='drug screen'" + ) + logging.info("Querying %s for drug-screen specimens", DRUG_SCREEN_TABLE) + table = syn.tableQuery(query).asDataFrame() + raw = table["specimenID"].dropna().astype(str).unique().tolist() + out: dict[str, str] = {} + for r in raw: + result = classify_specimen(r) + if result: + cid, stype = result + out[cid] = stype + by_type = _count_by_type(out) + logging.info(" Drug screen: %s", by_type) + return out + + +def _count_by_type(d: dict[str, str]) -> dict[str, int]: + out: dict[str, int] = {} + for stype in d.values(): + out[stype] = out.get(stype, 0) + 1 + return out + + +def _detect_specimen_column(df: pd.DataFrame) -> str: + """Return the name of the specimen column in an omics file.""" + candidates = ["Specimen", "specimen", "specimen_norm", "sample_id", + "specimenID", "Sample"] + for c in candidates: + if c in df.columns: + return c + raise KeyError( + f"No specimen column among {candidates} in columns {list(df.columns)[:15]}" + ) + + +def _read_synapse_table(syn, syn_id: str) -> pd.DataFrame: + """Fetch a Synapse file and read it as CSV/TSV.""" + logging.info(" Fetching %s", syn_id) + entity = syn.get(syn_id) + path = entity.path + if path.endswith(".tsv") or path.endswith(".tsv.gz"): + return pd.read_csv(path, sep="\t") + return pd.read_csv(path) + + +def specimens_from_proteomics(syn) -> dict[str, str]: + """Pull canonical_id → sample_type from the 
global proteomics matrix."""
+    if not GLOBAL_PROT_SYN_ID:
+        logging.warning("CNF_GLOBAL_PROT_SYN_ID not set; skipping proteomics")
+        return {}
+
+    logging.info("Reading proteomics specimens")
+    try:
+        df = _read_synapse_table(syn, GLOBAL_PROT_SYN_ID)
+        col = _detect_specimen_column(df)
+        raw = df[col].dropna().astype(str).unique().tolist()
+        out: dict[str, str] = {}
+        for r in raw:
+            result = classify_specimen(r)
+            if result:
+                cid, stype = result
+                out[cid] = stype
+        logging.info("  Proteomics: %s", _count_by_type(out))
+        return out
+    except Exception as exc:
+        logging.warning("  Proteomics read failed: %s", exc)
+        return {}
+
+
+def specimens_from_rna_discovery(syn) -> dict[str, str]:
+    """Discover RNA specimens from a comprehensive long-format RNA file.
+
+    Reads CNF_RNA_DISCOVERY_SYN_ID (default syn71333780) — the corrected-
+    abundance RNA file that contains every condition processed for the
+    project: organoid + tumor tissue + drug-treated organoid + skin.
+
+    This is intentionally a different source from the per-cohort TPM
+    matrices read by build_omics.py (CNF_RNA_COHORT_SYN_IDS, gene-level
+    TPM that may be filtered to organoids only). Sample discovery and data
+    ingestion have different requirements: discovery wants the maximum
+    sample list; ingestion wants TPM values.
+
+    Specimens are passed through classify_specimen, so drug-treated
+    samples like NF0021_T1_Onalespid.1uM get caught regardless of their
+    condition_norm value.
+    """
+    if not RNA_DISCOVERY_SYN_ID:
+        logging.warning("CNF_RNA_DISCOVERY_SYN_ID not set; skipping RNA discovery")
+        return {}
+
+    logging.info("Reading RNA discovery file (%s)", RNA_DISCOVERY_SYN_ID)
+    try:
+        df = _read_synapse_table(syn, RNA_DISCOVERY_SYN_ID)
+        col = _detect_specimen_column(df)
+
+        raw = df[col].dropna().astype(str).unique().tolist()
+        out: dict[str, str] = {}
+        for r in raw:
+            result = classify_specimen(r)
+            if result:
+                cid, stype = result
+                out[cid] = stype
+        logging.info("  RNA discovery: %s", _count_by_type(out))
+        return out
+    except Exception as exc:
+        logging.warning("  RNA discovery read failed: %s", exc)
+        return {}
+
+
+def specimens_from_normal_skin_tree(syn) -> dict[str, str]:
+    """Discover skin specimens by listing the Normal Skin Synapse folder.
+
+    The "Normal Skin" tree contains one immediate child per patient (folders
+    named NF0017, NF0018, ...) plus, in some cases, top-level skin FASTQ
+    files like NF0035-SKIN-1-04-16-2025_*.fastq.gz. Both forms encode the
+    patient ID at the start, so we match `^NF\\d{4}` against each child name
+    and emit one canonical NFxxxx_skin entry per unique patient.
+
+    This is intentionally one API call deep — it avoids walking the whole
+    WGS tree (which has hundreds of empty lane-stub folders). For most
+    cases this captures every patient with a matched normal.
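+    (Patients whose skin files sit deeper than one folder level would be
+    missed; picking those up would need a recursive walk rather than a
+    single getChildren call.)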
+ """ + if not NORMAL_SKIN_SYN_ID: + logging.warning("CNF_NORMAL_SKIN_SYN_ID not set; skipping Normal Skin tree") + return {} + + logging.info("Listing Normal Skin tree (%s)", NORMAL_SKIN_SYN_ID) + out: dict[str, str] = {} + try: + for child in syn.getChildren(NORMAL_SKIN_SYN_ID): + name = str(child.get("name", "")) + m = re.match(r"^(NF\d{4})", name) + if m: + patient = m.group(1) + out[f"{patient}_skin"] = "normal_skin" + logging.info(" Normal Skin tree: %s", _count_by_type(out)) + except Exception as exc: + logging.warning(" Normal Skin tree listing failed: %s", exc) + return out + + +# ---------------------------------------------------------------------------- +# Sample table construction +# ---------------------------------------------------------------------------- +def build_samples_table(specimens: dict[str, str], start_id: int) -> pd.DataFrame: + """Build the schema-compliant Sample table from {canonical_id: sample_type}. + + Sort order: organoid → treated_organoid → tumor_tissue → normal_skin + (each by ID). Drug-modelable untreated organoids get the lowest + improve_sample_ids — useful for downstream filtering. + """ + sort_priority = { + "organoid": 0, + "treated_organoid": 1, + "tumor_tissue": 2, + "normal_skin": 3, + } + sorted_specimens = sorted( + specimens.items(), + key=lambda kv: (sort_priority.get(kv[1], 99), kv[0]), + ) + + rows = [] + next_id = start_id + for specimen, stype in sorted_specimens: + patient = patient_from_specimen(specimen) + rows.append({ + "improve_sample_id": next_id, + "other_id": specimen, + "other_id_source": OTHER_ID_SOURCE, + "common_name": specimen, + "cancer_type": CANCER_TYPE, + "other_names": patient, + "species": SPECIES, + "model_type": MODEL_TYPE_MAP[stype], + }) + next_id += 1 + + df = pd.DataFrame(rows) + logging.info("Built %d sample rows; ID range %d–%d", + len(df), start_id, next_id - 1) + return df + + +# ---------------------------------------------------------------------------- +# Main +# ---------------------------------------------------------------------------- +def main(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "prev_samples", + help="Path to previous samples CSV (used to find max improve_sample_id)", + ) + parser.add_argument("--output", default=DEFAULT_OUTPUT, + help=f"Output CSV path (default: {DEFAULT_OUTPUT})") + args = parser.parse_args() + + configure_logging() + + syn = synapseclient.Synapse() + syn.login() + + # Pull specimens from each source as canonical_id → sample_type dicts + drug_d = specimens_from_drug_screen(syn) + prot_d = specimens_from_proteomics(syn) + rna_d = specimens_from_rna_discovery(syn) + skin_d = specimens_from_normal_skin_tree(syn) + + all_specimens = merge_specimen_dicts(drug_d, prot_d, rna_d, skin_d) + logging.info( + "Union: %d total specimens — %s", + len(all_specimens), _count_by_type(all_specimens), + ) + + # Per-source uniqueness diagnostics + drug_set, prot_set, rna_set, skin_set = ( + set(drug_d), set(prot_d), set(rna_d), set(skin_d), + ) + only_drug = drug_set - prot_set - rna_set - skin_set + only_prot = prot_set - drug_set - rna_set - skin_set + only_rna = rna_set - drug_set - prot_set - skin_set + only_skin = skin_set - drug_set - prot_set - rna_set + if only_drug: + logging.info(" Only in drug screen: %s", sorted(only_drug)) + if only_prot: + logging.info(" Only in proteomics: %s", sorted(only_prot)) + if only_rna: + logging.info(" Only in RNA: %s", sorted(only_rna)) + if only_skin: + logging.info(" Only in Normal Skin tree: %s", 
sorted(only_skin))
+
+    if not all_specimens:
+        logging.error("No specimens found from any source; aborting")
+        sys.exit(1)
+
+    start_id = get_starting_id(args.prev_samples)
+    samples = build_samples_table(all_specimens, start_id)
+
+    os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
+    samples.to_csv(args.output, index=False)
+    logging.info("Wrote %s (%d rows)", args.output, len(samples))
+
+
+if __name__ == "__main__":
+    sys.exit(main() or 0)
diff --git a/coderbuild/cnf/02-omics-cnf.py b/coderbuild/cnf/02-omics-cnf.py
new file mode 100644
index 00000000..bdbe0fac
--- /dev/null
+++ b/coderbuild/cnf/02-omics-cnf.py
@@ -0,0 +1,389 @@
+#!/usr/bin/env python3
+"""
+02-omics-cnf.py: generate cnf_transcriptomics.csv, cnf_proteomics.csv,
+                 and cnf_phosphoproteomics.csv.
+
+For the cNF dataset:
+
+* Transcriptomics: raw gene-level TPM from per-cohort salmon.merged.gene_tpm.tsv
+  files (wide matrix: rows = genes, columns = specimens). Default cohorts:
+      syn66352931 — cNF_Cohort_1 (NF0017–NF0023)
+      syn70765053 — cNF_Cohort_2 (NF0025, NF0027, NF0035)
+  Override via env var CNF_RNA_COHORT_SYN_IDS (comma-separated Synapse IDs).
+
+* Proteomics: batch-corrected global proteomics from syn74815895
+  (correctedAbundance column). Override via env var CNF_GLOBAL_PROT_SYN_ID.
+
+* Phosphoproteomics: batch-corrected phospho-proteomics from syn70078415
+  (correctedAbundance column). Mapped to phosphosite_ids via a phosphosites.csv
+  reference file. Override via env var CNF_PHOSPHO_SYN_ID.
+
+* Protocol optimization samples are excluded from RNA via DROP_SAMPLE_SUBSTRINGS.
+
+Specimens are canonicalized via cnf_utils so that values like
+"NF0018.T1.organoid" resolve to the canonical IDs in cnf_samples.csv.
+Source rows whose specimen doesn't match any entry in cnf_samples.csv are
+dropped silently via the inner join to sample_map.
+
+Usage:
+    python 02-omics-cnf.py <genes> <samples> <phosphosites>
+        [--out_rna /tmp/cnf_transcriptomics.csv]
+        [--out_prot /tmp/cnf_proteomics.csv]
+        [--out_phospho /tmp/cnf_phosphoproteomics.csv]
+"""
+
+import argparse
+import logging
+import os
+import re
+import sys
+
+import pandas as pd
+import synapseclient
+
+from cnf_utils import canonicalize_specimen_column
+
+
+# ----------------------------------------------------------------------------
+# Defaults / Synapse IDs
+# ----------------------------------------------------------------------------
+_rna_env = os.environ.get("CNF_RNA_COHORT_SYN_IDS", "")
+RNA_TPM_COHORT_SYN_IDS: list[str] = (
+    [s.strip() for s in _rna_env.split(",") if s.strip()]
+    if _rna_env
+    else ["syn66352931", "syn70765053"]
+)
+
+GLOBAL_PROT_SYN_ID = os.environ.get("CNF_GLOBAL_PROT_SYN_ID", "syn74815895")
+PHOSPHO_SYN_ID = os.environ.get("CNF_PHOSPHO_SYN_ID", "syn70078415")
+
+# Protocol-optimization samples to drop from RNA.
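+# Matching is substring-based against each column's basename stem (see
+# _drop_protocol_cols below), so these entries need not be full column names.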
+DROP_SAMPLE_SUBSTRINGS: list[str] = [ + "cNF_organoid_DIA_G_02_11Feb25", + "cNF_organoid_DIA_G_05_11Feb25", + "cNF_organoid_DIA_G_06_11Feb25", + "cNF_organoid_DIA_P_02_29Jan25", + "cNF_organoid_DIA_P_05_11Feb25", + "cNF_organoid_DIA_P_06_11Feb25", +] + +STUDY = "cnf" +DEFAULT_OUT_RNA = "/tmp/cnf_transcriptomics.csv" +DEFAULT_OUT_PROT = "/tmp/cnf_proteomics.csv" +DEFAULT_OUT_PHOSPHO = "/tmp/cnf_phosphoproteomics.csv" + + +def configure_logging(): + logging.basicConfig( + format="[%(asctime)s] [%(levelname)s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO, + ) + + +# ---------------------------------------------------------------------------- +# Reference data loaders +# ---------------------------------------------------------------------------- +def load_gene_map(genes_path: str) -> pd.DataFrame: + """Load genes.csv. Required columns: gene_symbol, entrez_id.""" + df = pd.read_csv(genes_path) + needed = {"gene_symbol", "entrez_id"} + missing = needed - set(df.columns) + if missing: + raise KeyError(f"genes file missing columns: {missing}") + df = df.dropna(subset=["entrez_id", "gene_symbol"]).copy() + df["entrez_id"] = df["entrez_id"].astype(int) + df = df.drop_duplicates(subset="gene_symbol") + logging.info("Loaded %d gene_symbol → entrez_id mappings", len(df)) + return df[["gene_symbol", "entrez_id"]] + + +def load_sample_map(samples_path: str) -> pd.DataFrame: + """Load cnf_samples.csv; return (other_id, improve_sample_id) frame.""" + df = pd.read_csv(samples_path) + df = df[["other_id", "improve_sample_id"]].drop_duplicates() + df["improve_sample_id"] = df["improve_sample_id"].astype(int) + logging.info("Loaded %d sample mappings", len(df)) + return df + + +def load_phosphosite_map(phosphosites_path: str) -> pd.DataFrame: + """Load phosphosites.csv; return (other_id, phosphosite_id) frame.""" + df = pd.read_csv(phosphosites_path) + needed = {"other_id", "phosphosite_id"} + missing = needed - set(df.columns) + if missing: + raise KeyError(f"phosphosites file missing columns: {missing}") + df = df[["other_id", "phosphosite_id"]].drop_duplicates() + df["phosphosite_id"] = df["phosphosite_id"].astype(int) + logging.info("Loaded %d phosphosite mappings", len(df)) + return df + + +def fetch_synapse_table(syn, syn_id: str) -> pd.DataFrame: + """Download a Synapse file and read it as CSV/TSV.""" + if not syn_id: + raise ValueError("Synapse ID is empty") + logging.info("Fetching %s from Synapse", syn_id) + entity = syn.get(syn_id) + path = entity.path + if path.endswith(".tsv") or path.endswith(".tsv.gz"): + return pd.read_csv(path, sep="\t") + return pd.read_csv(path) + + +# ---------------------------------------------------------------------------- +# Shared helpers +# ---------------------------------------------------------------------------- +def _col_stem(col: str) -> str: + """Return the basename without extension for a path-like column name.""" + base = re.sub(r"^.*[/\\]", "", col) + return re.sub(r"\.[^.]+$", "", base) + + +def _drop_protocol_cols(cols: list[str]) -> list[str]: + """Remove columns whose stem matches any DROP_SAMPLE_SUBSTRINGS entry.""" + kept = [] + for c in cols: + stem = _col_stem(c) + if any(sub in stem for sub in DROP_SAMPLE_SUBSTRINGS): + logging.info(" Dropping protocol optimization sample: %s", stem) + else: + kept.append(c) + return kept + + +def _normalize_col(df: pd.DataFrame, candidates: list[str], target: str) -> pd.DataFrame: + """Rename the first matching candidate column to target.""" + for c in candidates: + if c in df.columns: + 
return df.rename(columns={c: target}) + raise KeyError( + f"None of {candidates} found in columns {list(df.columns)[:20]}" + ) + + +# ---------------------------------------------------------------------------- +# Transcriptomics +# ---------------------------------------------------------------------------- +def fetch_wide_tpm_cohort(syn, syn_id: str) -> pd.DataFrame: + """Fetch a wide salmon.merged.gene_tpm.tsv and return long-format DataFrame. + + nf-core salmon output: gene_id (Ensembl), gene_name (symbol), then one + column per specimen. Returns (gene_symbol, specimen, transcriptomics, source). + """ + logging.info("Fetching cohort TPM matrix %s", syn_id) + entity = syn.get(syn_id) + path = entity.path + df = pd.read_csv(path, sep="\t") + + gene_col = None + for cname in ["gene_name", "gene_symbol", "Symbol", "GeneSymbol", "gene"]: + if cname in df.columns: + gene_col = cname + break + if gene_col is None: + gene_col = df.columns[0] + logging.warning( + "%s: no gene_name column found; using %s — most rows may fail the gene_map join", + syn_id, gene_col, + ) + + meta_cols = {"gene_id", "gene_name", "gene_symbol", "Symbol", + "GeneSymbol", "gene", "transcript_id"} + sample_cols = [c for c in df.columns if c not in meta_cols] + sample_cols = _drop_protocol_cols(sample_cols) + + if not sample_cols: + raise ValueError( + f"{syn_id}: no specimen columns found after excluding metadata " + f"and protocol optimization samples" + ) + + long = ( + df[[gene_col] + sample_cols] + .rename(columns={gene_col: "gene_symbol"}) + .melt(id_vars="gene_symbol", var_name="specimen", value_name="transcriptomics") + ) + long["source"] = "Synapse" + logging.info( + " %s: %d genes × %d specimens → %d long rows", + syn_id, df[gene_col].nunique(), len(sample_cols), len(long), + ) + return long + + +def build_transcriptomics( + syn, gene_map: pd.DataFrame, sample_map: pd.DataFrame +) -> pd.DataFrame: + """Build the long-format transcriptomics table from per-cohort TPM matrices.""" + if not RNA_TPM_COHORT_SYN_IDS: + logging.error("RNA_TPM_COHORT_SYN_IDS is empty; skipping transcriptomics.") + return pd.DataFrame( + columns=["entrez_id", "improve_sample_id", "source", "study", "transcriptomics"] + ) + + cohort_frames = [] + for syn_id in RNA_TPM_COHORT_SYN_IDS: + try: + cohort_frames.append(fetch_wide_tpm_cohort(syn, syn_id)) + except Exception as exc: + logging.warning("Skipping RNA cohort %s: %s", syn_id, exc) + + if not cohort_frames: + logging.error("All RNA cohort fetches failed; returning empty transcriptomics.") + return pd.DataFrame( + columns=["entrez_id", "improve_sample_id", "source", "study", "transcriptomics"] + ) + + raw = pd.concat(cohort_frames, ignore_index=True) + logging.info("Combined %d RNA cohorts → %d total rows", len(cohort_frames), len(raw)) + + n_before = len(raw) + raw = canonicalize_specimen_column(raw, "specimen") + logging.info( + "Transcriptomics: canonicalized %d → %d rows (%d dropped as unrecognized specimens)", + n_before, len(raw), n_before - len(raw), + ) + + df = ( + raw.merge(gene_map, on="gene_symbol", how="inner") + .merge(sample_map, left_on="specimen", right_on="other_id", how="inner") + ) + df["study"] = STUDY + df["transcriptomics"] = pd.to_numeric(df["transcriptomics"], errors="coerce") + df = df.dropna(subset=["transcriptomics"]) + df = df[["entrez_id", "improve_sample_id", "source", "study", "transcriptomics"]] + + logging.info( + "Built transcriptomics table: %d rows (%d samples × %d genes)", + len(df), df["improve_sample_id"].nunique(), df["entrez_id"].nunique(), + ) 
+ return df + + +# ---------------------------------------------------------------------------- +# Proteomics +# ---------------------------------------------------------------------------- +def build_proteomics( + syn, gene_map: pd.DataFrame, sample_map: pd.DataFrame +) -> pd.DataFrame: + """Build the long-format proteomics table from the batch-corrected Synapse file.""" + raw = fetch_synapse_table(syn, GLOBAL_PROT_SYN_ID) + raw = _normalize_col(raw, + ["Specimen", "specimenID", "specimen_id", "specimen", "specimen_norm", "sample_id"], + "specimen") + raw = _normalize_col(raw, + ["gene_symbol", "Gene", "GeneSymbol", "Symbol", "gene", "feature_id"], + "gene_symbol") + + n_before = len(raw) + raw = canonicalize_specimen_column(raw, "specimen") + logging.info( + "Proteomics: canonicalized %d → %d rows (%d dropped as unrecognized specimens)", + n_before, len(raw), n_before - len(raw), + ) + + raw = _normalize_col(raw, + ["correctedAbundance", "log_ratio", "logRatio", "proteomics", "value"], + "proteomics") + + df = ( + raw.merge(gene_map, on="gene_symbol", how="inner") + .merge(sample_map, left_on="specimen", right_on="other_id", how="inner") + ) + df["source"] = "Synapse" + df["study"] = STUDY + df["proteomics"] = pd.to_numeric(df["proteomics"], errors="coerce") + df = df.dropna(subset=["proteomics"]) + df = df[["entrez_id", "improve_sample_id", "source", "study", "proteomics"]] + + logging.info( + "Built proteomics table: %d rows (%d samples × %d genes)", + len(df), df["improve_sample_id"].nunique(), df["entrez_id"].nunique(), + ) + return df + + +# ---------------------------------------------------------------------------- +# Phosphoproteomics +# ---------------------------------------------------------------------------- +def build_phosphoproteomics( + syn, phosphosite_map: pd.DataFrame, sample_map: pd.DataFrame +) -> pd.DataFrame: + """Build the long-format phosphoproteomics table from the batch-corrected Synapse file.""" + raw = fetch_synapse_table(syn, PHOSPHO_SYN_ID) + raw = _normalize_col(raw, + ["Specimen", "specimenID", "specimen_id", "specimen"], + "specimen") + raw = _normalize_col(raw, + ["site", "Site", "phosphosite", "feature_id"], + "site") + + n_before = len(raw) + raw = canonicalize_specimen_column(raw, "specimen") + logging.info( + "Phosphoproteomics: canonicalized %d → %d rows (%d dropped as unrecognized specimens)", + n_before, len(raw), n_before - len(raw), + ) + + raw = _normalize_col(raw, + ["correctedAbundance", "log_ratio", "logRatio", "phosphoproteomics", "value"], + "phosphoproteomics") + + df = ( + raw.merge(phosphosite_map, left_on="site", right_on="other_id", how="inner") + .merge(sample_map, left_on="specimen", right_on="other_id", how="inner") + ) + df["source"] = "Synapse" + df["study"] = STUDY + df["phosphoproteomics"] = pd.to_numeric(df["phosphoproteomics"], errors="coerce") + df = df.dropna(subset=["phosphoproteomics"]) + df = df[["phosphosite_id", "improve_sample_id", "source", "study", "phosphoproteomics"]] + + logging.info( + "Built phosphoproteomics table: %d rows (%d samples × %d sites)", + len(df), df["improve_sample_id"].nunique(), df["phosphosite_id"].nunique(), + ) + return df + + +# ---------------------------------------------------------------------------- +# Main +# ---------------------------------------------------------------------------- +def main(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("genes", help="Path to genes.csv (gene_symbol → entrez_id)") + parser.add_argument("samples", help="Path to 
cnf_samples.csv from build_samples") + parser.add_argument("phosphosites", help="Path to phosphosites.csv (other_id → phosphosite_id)") + parser.add_argument("--out_rna", default=DEFAULT_OUT_RNA) + parser.add_argument("--out_prot", default=DEFAULT_OUT_PROT) + parser.add_argument("--out_phospho", default=DEFAULT_OUT_PHOSPHO) + args = parser.parse_args() + + configure_logging() + + gene_map = load_gene_map(args.genes) + sample_map = load_sample_map(args.samples) + phosphosite_map = load_phosphosite_map(args.phosphosites) + + syn = synapseclient.Synapse() + syn.login() + + rna = build_transcriptomics(syn, gene_map, sample_map) + prot = build_proteomics(syn, gene_map, sample_map) + phospho = build_phosphoproteomics(syn, phosphosite_map, sample_map) + + for path in (args.out_rna, args.out_prot, args.out_phospho): + os.makedirs(os.path.dirname(path) or ".", exist_ok=True) + + rna.to_csv(args.out_rna, index=False) + prot.to_csv(args.out_prot, index=False) + phospho.to_csv(args.out_phospho, index=False) + logging.info("Wrote %s (%d rows)", args.out_rna, len(rna)) + logging.info("Wrote %s (%d rows)", args.out_prot, len(prot)) + logging.info("Wrote %s (%d rows)", args.out_phospho, len(phospho)) + + +if __name__ == "__main__": + sys.exit(main() or 0) diff --git a/coderbuild/cnf/03-drugs-cnf.py b/coderbuild/cnf/03-drugs-cnf.py new file mode 100644 index 00000000..b9fe0120 --- /dev/null +++ b/coderbuild/cnf/03-drugs-cnf.py @@ -0,0 +1,531 @@ +#!/usr/bin/env python3 +""" +03-drugs-cnf.py: generate cnf_drugs.tsv and cnf_drug_descriptors.tsv. + +Pulls unique drug names from the cNF drug screen on Synapse, then calls the +standard coderdata utilities: + + coderbuild/utils/pubchem_retrieval.py + -> cnf_drugs.tsv + + coderbuild/utils/build_drug_descriptor_table.py + -> cnf_drug_descriptors.tsv + +Important implementation note: + pubchem_retrieval.py does not expose a command-line interface in the + provided version. It defines update_dataframe_and_write_tsv(), but running + `python pubchem_retrieval.py --input ... --output ...` exits 0 without + doing any work. Therefore this script imports pubchem_retrieval.py directly + and calls update_dataframe_and_write_tsv(). + +The descriptor builder is still run as a subprocess. It may call R/biomaRt +internally, which can fail when Ensembl is unavailable. To keep the drugs file +produced even when descriptors fail, this script: + + 1. Retries external/remote work with exponential backoff. + 2. Sets a generous timeout. + 3. Supports --skip-descriptors. + 4. Supports --only-descriptors for retrying descriptor generation later. 
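+
+A typical recovery flow, for illustration:
+    python 03-drugs-cnf.py --skip-descriptors    # drugs table only
+    python 03-drugs-cnf.py --only-descriptors    # descriptors once deps are back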
+
+Usage:
+    python 03-drugs-cnf.py [--prev_drugs prev1.tsv,prev2.tsv]
+        [--out_drugs /tmp/cnf_drugs.tsv]
+        [--out_desc /tmp/cnf_drug_descriptors.tsv]
+        [--skip-descriptors]
+        [--only-descriptors]
+        [--retries 3]
+        [--timeout 1800]
+"""
+
+import argparse
+import importlib.util
+import logging
+import os
+import signal
+import subprocess
+import sys
+import tempfile
+import time
+import pandas as pd
+import synapseclient
+
+
+# ----------------------------------------------------------------------------
+# Constants
+# ----------------------------------------------------------------------------
+
+DRUG_SCREEN_TABLE = "syn51301431"
+
+PUBCHEM_SCRIPT = os.environ.get(
+    "CODERDATA_PUBCHEM_SCRIPT",
+    "/coderbuild/utils/pubchem_retrieval.py",
+)
+
+DESCRIPTOR_SCRIPT = os.environ.get(
+    "CODERDATA_DESCRIPTOR_SCRIPT",
+    "/coderbuild/utils/build_drug_descriptor_table.py",
+)
+
+DEFAULT_OUT_DRUGS = "/tmp/cnf_drugs.tsv"
+DEFAULT_OUT_DESC = "/tmp/cnf_drug_descriptors.tsv"
+
+DEFAULT_RETRIES = 3
+DEFAULT_TIMEOUT = 1800  # seconds per attempt
+RETRY_BACKOFF_S = 30  # 30s, 60s, 120s, ...
+
+
+# ----------------------------------------------------------------------------
+# Logging
+# ----------------------------------------------------------------------------
+
+def configure_logging() -> None:
+    logging.basicConfig(
+        format="[%(asctime)s] [%(levelname)s] %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+        level=logging.INFO,
+    )
+
+
+# ----------------------------------------------------------------------------
+# Generic subprocess wrapper with retry
+# ----------------------------------------------------------------------------
+
+def run_with_retries(
+    cmd: list[str],
+    retries: int,
+    timeout: int,
+    label: str,
+) -> bool:
+    """
+    Run a subprocess, retrying on failure with exponential backoff.
+
+    Returns True on success, False if all retries are exhausted.
+    """
+    for attempt in range(1, retries + 1):
+        logging.info("[%s] attempt %d/%d", label, attempt, retries)
+
+        try:
+            subprocess.run(cmd, check=True, timeout=timeout)
+            logging.info("[%s] succeeded on attempt %d", label, attempt)
+            return True
+
+        except subprocess.TimeoutExpired:
+            logging.warning(
+                "[%s] timed out after %ds on attempt %d",
+                label,
+                timeout,
+                attempt,
+            )
+
+        except subprocess.CalledProcessError as exc:
+            logging.warning(
+                "[%s] failed with exit %d on attempt %d",
+                label,
+                exc.returncode,
+                attempt,
+            )
+
+        except Exception as exc:
+            logging.warning(
+                "[%s] unexpected error on attempt %d: %s",
+                label,
+                attempt,
+                exc,
+            )
+
+        if attempt < retries:
+            backoff = RETRY_BACKOFF_S * (2 ** (attempt - 1))
+            logging.info("[%s] sleeping %ds before retry", label, backoff)
+            time.sleep(backoff)
+
+    logging.error("[%s] exhausted %d retries", label, retries)
+    return False
+
+
+# ----------------------------------------------------------------------------
+# PubChem utility loading/calling
+# ----------------------------------------------------------------------------
+
+def load_pubchem_module(path: str):
+    """
+    Load pubchem_retrieval.py as a Python module.
+
+    The provided pubchem_retrieval.py has no command-line entry point, so it
+    must be imported and called directly.
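+
+    Raises FileNotFoundError, ImportError, or AttributeError so the caller
+    sees a clear failure instead of a silent no-op.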
+ """ + if not os.path.exists(path): + raise FileNotFoundError(f"PubChem retrieval script not found: {path}") + + spec = importlib.util.spec_from_file_location( + "coderdata_pubchem_retrieval", + path, + ) + + if spec is None or spec.loader is None: + raise ImportError(f"Could not create import spec for {path}") + + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + if not hasattr(module, "update_dataframe_and_write_tsv"): + raise AttributeError( + f"{path} does not define update_dataframe_and_write_tsv()" + ) + + return module + + +def call_pubchem_retrieval( + names: list[str], + prev_drug_files: list[str], + out_drugs: str, + retries: int, + timeout: int, +) -> bool: + """ + Generate the drugs TSV by calling pubchem_retrieval.update_dataframe_and_write_tsv(). + + This intentionally does not use subprocess because the supplied + pubchem_retrieval.py has no argparse/main block and exits 0 without writing + anything when run as a script. + """ + prev_arg = ",".join(prev_drug_files) if prev_drug_files else None + + for attempt in range(1, retries + 1): + logging.info("[pubchem_retrieval] attempt %d/%d", attempt, retries) + + ignore_chems = None + + try: + pubchem = load_pubchem_module(PUBCHEM_SCRIPT) + + with tempfile.NamedTemporaryFile( + "w", + suffix="_cnf_ignore_chems.txt", + delete=False, + ) as tmp_ignore: + ignore_chems = tmp_ignore.name + + df = pubchem.update_dataframe_and_write_tsv( + unique_names=names, + output_filename=out_drugs, + ignore_chems=ignore_chems, + batch_size=1, + isname=True, + time_limit=timeout, + prev_drug_filepaths=prev_arg, + restrict_to_raw_names=names, + ) + + # pubchem_retrieval.py sets signal.alarm(time_limit), but does not + # clear it. Clear it here so the descriptor step is not interrupted. + try: + signal.alarm(0) + except Exception: + pass + + if getattr(pubchem, "should_continue", True) is False: + raise TimeoutError( + f"pubchem_retrieval reached its time limit of {timeout}s" + ) + + if not os.path.exists(out_drugs): + raise FileNotFoundError( + f"pubchem_retrieval returned but did not create {out_drugs}" + ) + + if os.path.getsize(out_drugs) == 0: + raise RuntimeError( + f"pubchem_retrieval created {out_drugs}, but it is empty" + ) + + n_rows = len(df) if df is not None else "unknown" + + logging.info( + "[pubchem_retrieval] succeeded on attempt %d; wrote %s rows to %s", + attempt, + n_rows, + out_drugs, + ) + + return True + + except Exception as exc: + try: + signal.alarm(0) + except Exception: + pass + + logging.warning( + "[pubchem_retrieval] failed on attempt %d/%d: %s", + attempt, + retries, + exc, + ) + + if attempt < retries: + backoff = RETRY_BACKOFF_S * (2 ** (attempt - 1)) + logging.info( + "[pubchem_retrieval] sleeping %ds before retry", + backoff, + ) + time.sleep(backoff) + + finally: + if ignore_chems and os.path.exists(ignore_chems): + try: + os.unlink(ignore_chems) + except OSError: + pass + + logging.error("[pubchem_retrieval] exhausted %d retries", retries) + return False + + +# ---------------------------------------------------------------------------- +# Drug name collection +# ---------------------------------------------------------------------------- + +def collect_drug_names(syn) -> list[str]: + """ + Pull every cNF drug-screen file from Synapse and collect unique drug names. 
+ """ + query = ( + "select id, individualID, specimenID " + f"from {DRUG_SCREEN_TABLE} " + "where dataType='drug screen'" + ) + + files = syn.tableQuery(query).asDataFrame() + + logging.info("Reading %d drug-screen files for drug names", len(files)) + + names: set[str] = set() + + for _, row in files.iterrows(): + try: + entity = syn.get(row["id"]) + df = pd.read_csv(entity.path) + + except Exception as exc: + logging.warning("Skipping file %s: %s", row["id"], exc) + continue + + if "Drug" not in df.columns: + logging.warning("File %s has no 'Drug' column; skipping", row["id"]) + continue + + names.update( + df["Drug"] + .dropna() + .astype(str) + .str.strip() + .tolist() + ) + + names = sorted(n for n in names if n and n.lower() != "nan") + + logging.info("Collected %d unique drug names", len(names)) + + return names + + +# ---------------------------------------------------------------------------- +# Descriptor utility +# ---------------------------------------------------------------------------- + +def call_descriptor_table( + drug_file: str, + out_desc: str, + retries: int, + timeout: int, +) -> bool: + """ + Run the descriptor table builder as a subprocess. + + This function assumes build_drug_descriptor_table.py has a CLI accepting + --input and --output. + """ + cmd = [ + "python", + DESCRIPTOR_SCRIPT, + "--input", + drug_file, + "--output", + out_desc, + ] + + ok = run_with_retries( + cmd, + retries, + timeout, + "descriptor_table", + ) + + if ok and not os.path.exists(out_desc): + logging.error( + "descriptor_table exited successfully but did not create %s", + out_desc, + ) + return False + + return ok + + +# ---------------------------------------------------------------------------- +# Main +# ---------------------------------------------------------------------------- + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + + parser.add_argument( + "--prev_drugs", + default="", + help="Comma-delimited list of existing drug TSV files for ID reuse.", + ) + + parser.add_argument( + "--out_drugs", + default=DEFAULT_OUT_DRUGS, + help=f"Output drugs TSV. Default: {DEFAULT_OUT_DRUGS}", + ) + + parser.add_argument( + "--out_desc", + default=DEFAULT_OUT_DESC, + help=f"Output drug descriptors TSV. Default: {DEFAULT_OUT_DESC}", + ) + + parser.add_argument( + "--skip-descriptors", + action="store_true", + help=( + "Skip the descriptor step entirely. Use when the descriptor " + "utility or its biomaRt/Ensembl dependency is failing. The drugs " + "table is still produced." + ), + ) + + parser.add_argument( + "--only-descriptors", + action="store_true", + help=( + "Only run the descriptor step, expecting --out_drugs to already " + "exist. Useful for retrying after a previous descriptor failure." + ), + ) + + parser.add_argument( + "--retries", + type=int, + default=DEFAULT_RETRIES, + help=f"Number of retry attempts. Default: {DEFAULT_RETRIES}", + ) + + parser.add_argument( + "--timeout", + type=int, + default=DEFAULT_TIMEOUT, + help=f"Timeout in seconds per attempt. 
Default: {DEFAULT_TIMEOUT}", + ) + + args = parser.parse_args() + + configure_logging() + + if args.skip_descriptors and args.only_descriptors: + logging.error("Cannot combine --skip-descriptors with --only-descriptors") + return 2 + + for path in (args.out_drugs, args.out_desc): + os.makedirs(os.path.dirname(path) or ".", exist_ok=True) + + # ------------------------------------------------------------------------ + # Drugs step + # ------------------------------------------------------------------------ + + if not args.only_descriptors: + syn = synapseclient.Synapse() + syn.login() + + names = collect_drug_names(syn) + + if not names: + logging.error("No drug names found; aborting") + return 1 + + prev = [ + p.strip() + for p in args.prev_drugs.split(",") + if p.strip() + ] + + ok = call_pubchem_retrieval( + names=names, + prev_drug_files=prev, + out_drugs=args.out_drugs, + retries=args.retries, + timeout=args.timeout, + ) + + if not ok: + logging.error("pubchem_retrieval failed; aborting") + return 1 + + if not os.path.exists(args.out_drugs): + logging.error( + "pubchem_retrieval did not produce %s", + args.out_drugs, + ) + return 1 + + logging.info("Wrote %s", args.out_drugs) + + # ------------------------------------------------------------------------ + # Descriptor step + # ------------------------------------------------------------------------ + + if args.skip_descriptors: + logging.warning( + "Skipping descriptor step. Re-run with --only-descriptors to " + "produce %s when descriptor dependencies are reachable.", + args.out_desc, + ) + return 0 + + if not os.path.exists(args.out_drugs): + logging.error( + "Drugs file %s missing; cannot run descriptors", + args.out_drugs, + ) + return 1 + + ok = call_descriptor_table( + drug_file=args.out_drugs, + out_desc=args.out_desc, + retries=args.retries, + timeout=args.timeout, + ) + + if not ok: + logging.error( + "Descriptor step failed after %d attempts. The drugs file %s is " + "complete; re-run with --only-descriptors when descriptor " + "dependencies are reachable to produce %s.", + args.retries, + args.out_drugs, + args.out_desc, + ) + + # Exit 0 because the primary drugs table was produced. Descriptors can + # be retried later. Use --skip-descriptors to suppress this explicitly. + return 0 + + logging.info("Wrote %s", args.out_desc) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/coderbuild/cnf/04-experiments-cnf.py b/coderbuild/cnf/04-experiments-cnf.py new file mode 100644 index 00000000..fe1e216b --- /dev/null +++ b/coderbuild/cnf/04-experiments-cnf.py @@ -0,0 +1,478 @@ +#!/usr/bin/env python3 +""" +04-experiments-cnf.py: generate cnf_experiments.tsv. + +Discovers every drug-screen viability file under three Synapse parent folders, +attaches a specimen ID by reading the file's `specimenID` annotation (with a +filename-pattern fallback), then per (specimen, drug): + + * Multi-concentration measurements are run through the standard + coderbuild/utils/fit_curve.py utility, producing fit_auc / fit_ic50 / + fit_einf / fit_hs / fit_r2. + * Single-concentration measurements at 1 μM are recorded with + metric = 'uM_viability' and value = Viability_percentage / 100. + Schema dependency: 'uM_viability' must be in the ResponseMetric enum. + See schema_patch.md. + +Specimen attribution policy: + 1. Trust the file's `specimenID` annotation if present. + 2. Fall back to parsing the filename (e.g. "NF0017_T2_Viabilities.csv" + → "NF0017_T2"). + 3. 
Run the result through cnf_utils.classify_specimen as a safety net
+     (handles dot-separators, suffixes, case variants). Files whose
+     specimen can't be classified are skipped with a warning.
+
+Dual-mapping: occasionally a single viability file's results should be
+attributed to two specimens. This is handled via SPECIMEN_DUAL_MAPPINGS at
+the top of the script; add new entries as more cases come up.
+
+Usage:
+    python 04-experiments-cnf.py
+        [--output /tmp/cnf_experiments.tsv]
+        [--parents syn51301414,syn51301420,syn51301426]
+"""

+import argparse
+import logging
+import os
+import re
+import subprocess
+import sys
+import tempfile
+
+import pandas as pd
+import synapseclient
+
+from cnf_utils import classify_specimen
+
+
+# ----------------------------------------------------------------------------
+# Constants
+# ----------------------------------------------------------------------------
+# Synapse folders to walk for drug-screen viability files
+DEFAULT_DRUG_SCREEN_PARENTS = [
+    "syn51301414",
+    "syn51301420",
+    "syn51301426",
+]
+
+TIME = 120
+TIME_UNIT = "hours"
+STUDY = "cnf"
+SOURCE = "Synapse"
+SINGLE_DOSE_UM = 1.0
+SINGLE_DOSE_METRIC = "uM_viability"
+
+FIT_CURVE_SCRIPT = os.environ.get(
+    "CODERDATA_FIT_CURVE_SCRIPT", "/coderbuild/utils/fit_curve.py")
+
+DEFAULT_OUTPUT = "/tmp/cnf_experiments.tsv"
+
+# Regex to recover specimen from filenames like "NF0017_T2_Viabilities.csv"
+# or "NF0021_T1_Onalespid_1uM_Viabilities.csv"
+FILENAME_SPECIMEN_RE = re.compile(
+    r"^(NF\d{4}(?:_T\d+)?(?:_[A-Za-z0-9._-]+?)?)"          # specimen prefix
+    r"_(?:Viabilities|viabilities|Viability|drug_screen)"  # known label
+    r"\.(?:csv|tsv|txt)$",
+    re.IGNORECASE,
+)
+
+# ----------------------------------------------------------------------------
+# DUAL-MAPPING TABLE
+# ----------------------------------------------------------------------------
+# Maps a canonical specimen to any additional specimens that should receive
+# the same drug-response rows. Add new entries as more cases come up.
+#
+# Known case: NF0021_T1_Onalespid_1uM was screened against the same drug
+# panel as NF0021_T1 and should be available under both labels.
+SPECIMEN_DUAL_MAPPINGS: dict[str, list[str]] = {
+    "NF0021_T1_Onalespid_1uM": ["NF0021_T1"],
+}
+
+
+def configure_logging():
+    logging.basicConfig(
+        format="[%(asctime)s] [%(levelname)s] %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+        level=logging.INFO,
+    )
+
+
+# ----------------------------------------------------------------------------
+# Specimen attribution
+# ----------------------------------------------------------------------------
+def specimen_from_annotation(syn, file_id: str) -> str | None:
+    """Return the file's specimenID annotation, or None if missing."""
+    try:
+        anno = syn.getAnnotations(file_id)
+    except Exception as exc:
+        logging.warning("getAnnotations failed for %s: %s", file_id, exc)
+        return None
+    val = anno.get("specimenID")
+    if isinstance(val, list):
+        val = val[0] if val else None
+    if val is None:
+        return None
+    s = str(val).strip()
+    return s or None
+
+
+def specimen_from_filename(name: str) -> str | None:
+    """Parse 'NF0017_T2_Viabilities.csv' → 'NF0017_T2'."""
+    m = FILENAME_SPECIMEN_RE.match(name)
+    if m:
+        return m.group(1)
+    return None
+
+
+def resolve_specimen(syn, file_id: str, file_name: str) -> str | None:
+    """Resolve a file to a canonical specimen ID, trying annotation first.
+
+    Returns canonical_id (e.g. "NF0017_T2") or None if both annotation and
+    filename parsing fail or the result doesn't classify as a known sample
+    type.
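+
+    Example (assuming cnf_utils normalizes dot-separators): an annotation
+    of "NF0017.T2" resolves to the canonical "NF0017_T2".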
+ """ + raw = specimen_from_annotation(syn, file_id) + if not raw: + raw = specimen_from_filename(file_name) + if not raw: + return None + + # Safety-net classification — handles dot-separators, suffixes, case + result = classify_specimen(raw) + if result is None: + # Annotation or filename produced something that doesn't fit any + # known sample pattern. + logging.warning( + "Could not classify specimen '%s' (file %s, %s); skipping", + raw, file_id, file_name, + ) + return None + return result[0] + + +def expand_dual_mappings(specimen: str) -> list[str]: + """Return [specimen] + any additional specimens it's dual-mapped to.""" + return [specimen] + SPECIMEN_DUAL_MAPPINGS.get(specimen, []) + + +# ---------------------------------------------------------------------------- +# Synapse walking +# ---------------------------------------------------------------------------- +def walk_for_viability_files(syn, parent_id: str) -> list[dict]: + """Recursively yield File entities under a Synapse folder. + + Returns a list of {id, name} dicts for files whose name suggests they + contain viability data (drug-screen CSV/TSV). + """ + try: + import synapseutils + except ImportError: + logging.error("synapseutils not available; cannot walk %s", parent_id) + return [] + + files = [] + for dirpath, _subfolders, file_list in synapseutils.walk(syn, parent_id): + for fname, fid in file_list: + # Filter to plausible viability files. Accept anything ending in + # csv/tsv that has 'viab' or 'drug' in the name; fall back to + # 'csv' if neither matches but the file is small enough. + lower = fname.lower() + if not lower.endswith((".csv", ".tsv", ".txt")): + continue + if any(tok in lower for tok in ("viab", "drug", "screen")): + files.append({"id": fid, "name": fname, + "parent_path": dirpath}) + # Don't auto-grab arbitrary CSVs — we only want drug screens. + return files + + +def discover_drug_files(syn, parents: list[str]) -> pd.DataFrame: + """Walk all parent folders and dedupe by Synapse ID.""" + all_files = [] + for parent in parents: + logging.info("Walking %s ...", parent) + files = walk_for_viability_files(syn, parent) + logging.info(" found %d candidate viability files", len(files)) + all_files.extend(files) + + if not all_files: + return pd.DataFrame(columns=["id", "name"]) + + df = pd.DataFrame(all_files).drop_duplicates(subset="id").reset_index(drop=True) + logging.info("Discovered %d unique viability files across %d parents", + len(df), len(parents)) + return df + + +# ---------------------------------------------------------------------------- +# Read and combine drug-screen files +# ---------------------------------------------------------------------------- +def pull_screen_files(syn, parents: list[str]) -> pd.DataFrame: + """Walk parents, read each viability CSV, attach canonical specimen. + + Files whose specimen can't be resolved are skipped with a warning. + Files attributed to a dual-mapped specimen produce duplicate rows for + every additional specimen in the mapping. 
+ """ + files_df = discover_drug_files(syn, parents) + if files_df.empty: + raise RuntimeError("No drug-screen files found under any parent") + + frames = [] + for _, row in files_df.iterrows(): + fid, fname = row["id"], row["name"] + + canonical = resolve_specimen(syn, fid, fname) + if not canonical: + continue + + try: + df = pd.read_csv(syn.get(fid).path) + except Exception as exc: + logging.warning("Skipping file %s: %s", fid, exc) + continue + + # Verify required columns + required = {"Drug", "Concentration_uM", "Viability_percentage"} + missing = required - set(df.columns) + if missing: + logging.warning("File %s missing %s; skipping", + fname, sorted(missing)) + continue + + # Attribute to canonical + any dual-mapped specimens + for specimen in expand_dual_mappings(canonical): + attributed = df.copy() + attributed["specimen"] = specimen + attributed["source_file"] = fid + frames.append(attributed) + + if not frames: + raise RuntimeError("No drug-screen files could be read successfully") + + combined = pd.concat(frames, ignore_index=True) + n_specimens = combined["specimen"].nunique() + n_drugs = combined["Drug"].nunique() + logging.info( + "Combined %d drug-screen rows across %d specimens × %d drugs", + len(combined), n_specimens, n_drugs, + ) + return combined + + +# ---------------------------------------------------------------------------- +# I/O helpers +# ---------------------------------------------------------------------------- +def load_sample_map(samples_path: str) -> dict[str, int]: + df = pd.read_csv(samples_path) + df["improve_sample_id"] = df["improve_sample_id"].astype(int) + return dict(zip(df["other_id"].astype(str), df["improve_sample_id"])) + + +def load_drug_map(drugs_path: str) -> dict[str, str]: + df = pd.read_csv(drugs_path, sep="\t") + if "chem_name" not in df.columns or "improve_drug_id" not in df.columns: + raise KeyError( + "drugs file missing required columns " + f"(have: {list(df.columns)})" + ) + mapping = { + str(name).strip().lower(): str(did) + for name, did in zip(df["chem_name"], df["improve_drug_id"]) + } + logging.info("Loaded %d drug name → improve_drug_id mappings", len(mapping)) + return mapping + + +def lookup_drug_id(name, drug_map: dict[str, str]) -> str | None: + if pd.isna(name): + return None + return drug_map.get(str(name).strip().lower()) + + +# ---------------------------------------------------------------------------- +# Single vs multi dose split +# ---------------------------------------------------------------------------- +def split_single_vs_multi(combined: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: + """Per (specimen, drug), separate single-dose vs multi-dose.""" + counts = ( + combined.groupby(["specimen", "Drug"])["Concentration_uM"] + .nunique() + .reset_index(name="n_conc") + ) + multi_keys = counts[counts["n_conc"] > 1] + single_keys = counts[counts["n_conc"] == 1] + + multi = combined.merge(multi_keys[["specimen", "Drug"]], + on=["specimen", "Drug"], how="inner") + single = combined.merge(single_keys[["specimen", "Drug"]], + on=["specimen", "Drug"], how="inner") + # Single-dose rows must be at the canonical 1 μM concentration. 
+    single = single[single["Concentration_uM"] == SINGLE_DOSE_UM].copy()
+
+    logging.info("Multi-dose rows: %d (across %d sample-drug pairs)",
+                 len(multi), len(multi_keys))
+    logging.info("Single-dose rows: %d (across %d sample-drug pairs at %g μM)",
+                 len(single), len(single_keys), SINGLE_DOSE_UM)
+    return multi, single
+
+
+# ----------------------------------------------------------------------------
+# Curve fitting (multi-dose) and single-dose formatting
+# ----------------------------------------------------------------------------
+def run_curve_fit(multi: pd.DataFrame, sample_map, drug_map) -> pd.DataFrame:
+    if multi.empty:
+        return pd.DataFrame()
+
+    multi = multi.copy()
+    multi["improve_sample_id"] = multi["specimen"].map(sample_map)
+    multi["improve_drug_id"] = multi["Drug"].apply(
+        lambda n: lookup_drug_id(n, drug_map))
+
+    n_drop = (multi["improve_sample_id"].isna() |
+              multi["improve_drug_id"].isna()).sum()
+    if n_drop:
+        logging.warning("Dropping %d multi-dose rows with unmapped sample/drug",
+                        n_drop)
+    multi = multi.dropna(subset=["improve_sample_id", "improve_drug_id"])
+    if multi.empty:
+        return pd.DataFrame()
+
+    # fit_curve.py expects DOSE in μM and GROWTH as viability percentage;
+    # the tiny offset keeps any zero concentrations off exactly 0 for the fit.
+    multi["DOSE"] = multi["Concentration_uM"].astype(float) + 1e-4
+    multi["GROWTH"] = multi["Viability_percentage"].astype(float)
+    multi["time"] = TIME
+    multi["time_unit"] = TIME_UNIT
+    multi["study"] = STUDY
+    multi["source"] = SOURCE
+
+    cols = ["DOSE", "GROWTH", "study", "source", "improve_sample_id",
+            "Drug", "time", "time_unit"]
+    multi_for_fit = multi[cols].copy()
+    multi_for_fit["improve_sample_id"] = (
+        multi_for_fit["improve_sample_id"].astype(int))
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        in_path = os.path.join(tmpdir, "drug_response.tsv")
+        out_prefix = os.path.join(tmpdir, "cnf_curve")
+        multi_for_fit.to_csv(in_path, sep="\t", index=False)
+
+        cmd = ["python", FIT_CURVE_SCRIPT,
+               "--input", in_path, "--output", out_prefix]
+        logging.info("Running fit_curve: %s", " ".join(cmd))
+        subprocess.run(cmd, check=True)
+
+        # fit_curve.py appends '.0' to the output prefix; fall back to the
+        # bare prefix in case that behavior changes.
+        out_path = out_prefix + ".0"
+        if not os.path.exists(out_path):
+            alt = out_prefix
+            if os.path.exists(alt):
+                out_path = alt
+            else:
+                raise FileNotFoundError(
+                    f"fit_curve.py did not produce output at {out_path}"
+                )
+        fitted = pd.read_csv(out_path, sep="\t")
+
+    # fit_curve.py renames Drug → improve_drug_id, so the column is present
+    # but holds the raw drug name, not the SMI_ ID. Always remap through drug_map.
+ if "improve_drug_id" in fitted.columns: + name_col = "improve_drug_id" + elif "Drug" in fitted.columns: + name_col = "Drug" + else: + raise KeyError( + f"fit_curve output has neither improve_drug_id nor Drug " + f"column (have: {list(fitted.columns)})" + ) + fitted["improve_drug_id"] = fitted[name_col].apply( + lambda n: lookup_drug_id(n, drug_map)) + + fitted = fitted.dropna(subset=["improve_drug_id"]) + n_metrics = (fitted["dose_response_metric"].nunique() + if "dose_response_metric" in fitted.columns else 0) + logging.info("Curve-fit produced %d rows across %d metrics", + len(fitted), n_metrics) + return fitted + + +def format_single_dose(single: pd.DataFrame, sample_map, drug_map) -> pd.DataFrame: + if single.empty: + return pd.DataFrame() + + df = single.copy() + df["improve_sample_id"] = df["specimen"].map(sample_map) + df["improve_drug_id"] = df["Drug"].apply( + lambda n: lookup_drug_id(n, drug_map)) + + n_drop = (df["improve_sample_id"].isna() | + df["improve_drug_id"].isna()).sum() + if n_drop: + logging.warning("Dropping %d single-dose rows with unmapped sample/drug", + n_drop) + df = df.dropna(subset=["improve_sample_id", "improve_drug_id"]).copy() + + df["dose_response_metric"] = SINGLE_DOSE_METRIC + df["dose_response_value"] = df["Viability_percentage"].astype(float) / 100.0 + df["time"] = TIME + df["time_unit"] = TIME_UNIT + df["study"] = STUDY + df["source"] = SOURCE + df["improve_sample_id"] = df["improve_sample_id"].astype(int) + + cols = ["source", "improve_sample_id", "improve_drug_id", "study", + "time", "time_unit", "dose_response_metric", "dose_response_value"] + df = df[cols] + logging.info("Formatted %d single-dose rows", len(df)) + return df + + +# ---------------------------------------------------------------------------- +# Main +# ---------------------------------------------------------------------------- +def main(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("samples", help="cnf_samples.csv from build_samples") + parser.add_argument("drugs", help="cnf_drugs.tsv from build_drugs") + parser.add_argument("--output", default=DEFAULT_OUTPUT) + parser.add_argument( + "--parents", default=",".join(DEFAULT_DRUG_SCREEN_PARENTS), + help="Comma-delimited Synapse folder IDs to walk for viability files", + ) + args = parser.parse_args() + + configure_logging() + + parents = [p.strip() for p in args.parents.split(",") if p.strip()] + if not parents: + logging.error("No parent folders provided") + sys.exit(2) + + sample_map = load_sample_map(args.samples) + drug_map = load_drug_map(args.drugs) + + syn = synapseclient.Synapse() + syn.login() + + combined = pull_screen_files(syn, parents) + multi, single = split_single_vs_multi(combined) + + fitted = run_curve_fit(multi, sample_map, drug_map) + single_df = format_single_dose(single, sample_map, drug_map) + + final = pd.concat([fitted, single_df], ignore_index=True) + cols = ["source", "improve_sample_id", "improve_drug_id", "study", + "time", "time_unit", "dose_response_metric", "dose_response_value"] + for c in cols: + if c not in final.columns: + final[c] = pd.NA + final = final[cols] + + os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True) + final.to_csv(args.output, sep="\t", index=False) + logging.info("Wrote %s (%d rows)", args.output, len(final)) + + +if __name__ == "__main__": + sys.exit(main() or 0) diff --git a/coderbuild/cnf/README.md b/coderbuild/cnf/README.md new file mode 100644 index 00000000..0755a952 --- /dev/null +++ b/coderbuild/cnf/README.md @@ -0,0 +1,169 
@@
+# Cutaneous Neurofibroma (cnf) Organoid Drug Screen
+
+A coderdata dataset of cutaneous neurofibroma (cNF) patient-derived organoids
+with drug-response measurements and matched multi-omic profiling.
+
+## Overview
+
+Cutaneous neurofibromas are benign nerve-sheath tumors that develop in patients
+with neurofibromatosis type 1 (NF1). This dataset contains drug-screen
+viability measurements for 238 small-molecule compounds across 23 cNF tumor
+specimens from 10 patients, paired with bulk RNA-seq, global LC-MS/MS
+proteomics, and batch-corrected phosphoproteomics for many of the same
+specimens.
+
+| Property | Value |
+|---|---|
+| Cancer type | Cutaneous Neurofibroma |
+| Model type | patient derived organoid |
+| Patients | 10 |
+| Specimens | 23 (untreated organoids; additional tissue and treated organoid samples also present) |
+| Drugs | 238 |
+| Modalities included | Transcriptomics (TPM), Proteomics (log ratio), Phosphoproteomics (log ratio) |
+
+## Multi-tumor patient structure
+
+Each patient (`NF0017`, `NF0018`, ...) contributed up to three independent
+tumor specimens, named `NFxxxx_T1`, `NFxxxx_T2`, `NFxxxx_T3`. This dataset
+treats each specimen as an independent sample (its own `improve_sample_id`).
+The patient ID is preserved in the `other_names` column so downstream models
+can group specimens by patient when needed (e.g. patient-level holdout; a
+grouping sketch appears at the end of this README).
+
+```
+improve_sample_id  other_id   common_name  other_names  other_id_source
+2401               NF0019_T1  NF0019_T1    NF0019       NF1_cNF_Project
+2402               NF0019_T2  NF0019_T2    NF0019       NF1_cNF_Project
+2403               NF0019_T3  NF0019_T3    NF0019       NF1_cNF_Project
+```
+
+The sample table also includes matched normal skin (`NFxxxx_skin`), primary
+tumor tissue (`NFxxxx_Tx_tissue`), and drug-treated organoids
+(`NFxxxx_Tx_<treatment>`). Each gets its own `improve_sample_id` and
+`model_type`. Organoids (untreated) receive the lowest IDs, sorted by
+specimen ID, to make drug-modelable samples easy to filter.
+
+
+## Drug-response metrics
+
+The cNF screen mixes two designs:
+
+- **Multi-dose drugs**: a subset of drugs were screened at multiple
+  concentrations. These are run through `coderbuild/utils/fit_curve.py` and
+  contribute the standard schema metrics (`fit_auc`, `fit_ic50`, `fit_einf`,
+  `fit_hs`, `fit_r2`).
+
+- **Single-dose drugs**: remaining drugs were screened at a single 1 μM
+  concentration. These are recorded with `dose_response_metric = uM_viability`
+  and `dose_response_value = Viability_percentage / 100` (range 0–1, where
+  1.0 = full viability and 0.0 = full kill).
+
+`uM_viability` is a value in the `ResponseMetric` enum, added in
+`schema/coderdata.yaml`.
+
+## SPECIMEN_DUAL_MAPPINGS edge case
+
+One viability file covers the treated organoid `NF0021_T1_Onalespid_1uM` but
+its drug-response measurements should also be attributed to the untreated
+`NF0021_T1`. The `SPECIMEN_DUAL_MAPPINGS` dict in `04-experiments-cnf.py`
+handles this by duplicating the rows for every additional specimen listed:
+
+```python
+SPECIMEN_DUAL_MAPPINGS = {
+    "NF0021_T1_Onalespid_1uM": ["NF0021_T1"],
+}
+```
+
+Add new entries here if additional treated organoids need the same dual
+attribution. Both specimens must already exist in `cnf_samples.csv` or the
+duplicate rows will be dropped during the sample-map join.
+
+## Protocol optimization samples
+
+Six RNA columns in the cohort TPM matrices correspond to protocol optimization
+runs rather than real specimens. They are excluded in `02-omics-cnf.py` before
+the wide-to-long melt.
+Column stems matching any of these substrings are dropped:
+
+```python
+DROP_SAMPLE_SUBSTRINGS = [
+    "cNF_organoid_DIA_G_02_11Feb25",
+    "cNF_organoid_DIA_G_05_11Feb25",
+    "cNF_organoid_DIA_G_06_11Feb25",
+    "cNF_organoid_DIA_P_02_29Jan25",
+    "cNF_organoid_DIA_P_05_11Feb25",
+    "cNF_organoid_DIA_P_06_11Feb25",
+]
+```
+
+These are matched on the column stem (basename without extension), so both
+plain paths and filename columns are handled correctly. If new protocol
+optimization runs appear in future cohort data, add their column stems to
+this list.
+
+## Phosphosites dependency and known unmatched sites
+
+`02-omics-cnf.py` requires `phosphosites.csv` as a positional argument.
+The phosphosites reference must be built before the cNF omics step runs.
+`build_dataset.py` and `build_all.py` enforce this sequencing.
+
+The phosphoproteomics output file (`cnf_phosphoproteomics.csv`) is produced
+by an inner join between the raw Synapse phospho data and `phosphosites.csv`.
+Sites that don't appear in the reference are silently dropped. As of the
+current release, **12 sites** in the raw cNF phospho data (syn70078415) are
+permanently unmatched because their gene symbols (`BAP18`, `C11orf96`,
+`C14orf93`, `C1orf21`, `C2orf49`, etc.) are absent from the phosphosites
+reference. The remaining 2,974 of 2,986 unique sites (99.6%) are present in
+the output.
+
+## Data sources
+
+| Modality | Synapse ID | Notes |
+|---|---|---|
+| Drug-screen file index | `syn51301431` (table) | `dataType='drug screen'` |
+| Drug-screen file parents | `syn51301414`, `syn51301420`, `syn51301426` | One folder per cohort |
+| RNA-seq TPM (Cohort 1) | `syn66352931` | `salmon.merged.gene_tpm.tsv` |
+| RNA-seq TPM (Cohort 2) | `syn70765053` | `salmon.merged.gene_tpm.tsv` |
+| Global proteomics | `syn74815895` | Batch-corrected; `correctedAbundance` column |
+| Phospho-proteomics | `syn70078415` | Batch-corrected; `correctedAbundance` column; mapped via `phosphosites.csv` |
+| Sample discovery (RNA) | `syn71333780` | All-conditions RNA file for sample enumeration only |
+| Normal Skin folder | `syn74284682` | One child per patient for skin sample discovery |
+
+## Build pipeline
+
+The build follows coderdata conventions: four shell scripts wrapping Python
+implementations, plus shared utilities, packaged in a Docker image.
+
+```
+coderbuild/cnf/
+├── 01-samples-cnf.py       # mint improve_sample_ids, write cnf_samples.csv
+├── build_samples.sh
+├── 02-omics-cnf.py         # write cnf_transcriptomics.csv, cnf_proteomics.csv,
+├── build_omics.sh          #   cnf_phosphoproteomics.csv
+├── 03-drugs-cnf.py         # call pubchem_retrieval + descriptor utility
+├── build_drugs.sh
+├── 04-experiments-cnf.py   # split single/multi-dose, run fit_curve, write cnf_experiments.tsv
+├── build_exp.sh
+├── cnf_utils.py            # shared specimen canonicalization utilities
+├── requirements.txt
+└── README.md (this file)
+
+coderbuild/docker/
+└── Dockerfile.cnf
+```
+
+### Local build
+
+```bash
+# From repository root
+python coderbuild/build_dataset.py --build --validate --dataset cnf
+```
+
+In the full coderdata build, this dataset is orchestrated by `build_dataset.py`
+/ `build_all.py`, which ensures `genes.csv`, `samples.csv`, `drugs.tsv`, and
+`phosphosites.csv` are all present before starting the cNF containers.
+
+
+## What's intentionally not included
+
+- **Mutations** - not yet generated for this dataset.
+- **Copy number** - not yet generated for this dataset.
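+
+## Example: grouping specimens by patient
+
+A minimal pandas sketch of the patient-level grouping mentioned above. The
+path and column names (`other_names`, `improve_sample_id`) are taken from the
+sample table shown earlier; the snippet is illustrative only and not part of
+the build itself.
+
+```python
+import pandas as pd
+
+samples = pd.read_csv("/tmp/cnf_samples.csv")
+
+# other_names carries the NFxxxx patient ID, so a leave-one-patient-out
+# split is a single groupby over that column.
+by_patient = samples.groupby("other_names")["improve_sample_id"].apply(list)
+
+for patient, sample_ids in by_patient.items():
+    # hold out every specimen from this patient together
+    print(patient, sample_ids)
+```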
+
diff --git a/coderbuild/cnf/build_drugs.sh b/coderbuild/cnf/build_drugs.sh
new file mode 100644
index 00000000..9402f178
--- /dev/null
+++ b/coderbuild/cnf/build_drugs.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+# build_drugs.sh - wraps 03-drugs-cnf.py per coderdata convention.
+#
+# Usage:
+#   build_drugs.sh <prev_drug_files>
+#
+# <prev_drug_files> is a comma-delimited list of drug TSV files from
+# previous datasets in the build sequence. Their improve_drug_id values
+# get reused when the canonical SMILES match.
+
+set -euo pipefail
+
+PREV_DRUGS="${1:-}"
+
+python 03-drugs-cnf.py \
+    --prev_drugs "$PREV_DRUGS" \
+    --out_drugs /tmp/cnf_drugs.tsv \
+    --out_desc /tmp/cnf_drug_descriptors.tsv
+
+python build_drug_desc.py --drugtable /tmp/cnf_drugs.tsv --desctable /tmp/cnf_drug_descriptors.tsv.gz
diff --git a/coderbuild/cnf/build_exp.sh b/coderbuild/cnf/build_exp.sh
new file mode 100644
index 00000000..18c8b107
--- /dev/null
+++ b/coderbuild/cnf/build_exp.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+# build_exp.sh - wraps 04-experiments-cnf.py per coderdata convention.
+#
+# Usage:
+#   build_exp.sh <samples> <drugs>
+
+set -euo pipefail
+
+SAMPLES="${1:?Usage: build_exp.sh <samples> <drugs>}"
+DRUGS="${2:?Usage: build_exp.sh <samples> <drugs>}"
+
+python 04-experiments-cnf.py "$SAMPLES" "$DRUGS" --output /tmp/cnf_experiments.tsv
\ No newline at end of file
diff --git a/coderbuild/cnf/build_omics.sh b/coderbuild/cnf/build_omics.sh
new file mode 100644
index 00000000..7db0d062
--- /dev/null
+++ b/coderbuild/cnf/build_omics.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+# build_omics.sh - wraps 02-omics-cnf.py per coderdata convention.
+#
+# Usage:
+#   build_omics.sh <genes> <samples> <phosphosites>
+#
+# - <genes>: genes.csv with gene_symbol → entrez_id mappings
+# - <samples>: cnf_samples.csv produced by build_samples.sh
+# - <phosphosites>: phosphosites.csv (other_id → phosphosite_id)
+
+set -euo pipefail
+
+GENES="${1:?Usage: build_omics.sh <genes> <samples> <phosphosites>}"
+SAMPLES="${2:?Usage: build_omics.sh <genes> <samples> <phosphosites>}"
+PHOSPHOSITES="${3:?Usage: build_omics.sh <genes> <samples> <phosphosites>}"
+
+python 02-omics-cnf.py "$GENES" "$SAMPLES" "$PHOSPHOSITES" \
+    --out_rna /tmp/cnf_transcriptomics.csv \
+    --out_prot /tmp/cnf_proteomics.csv \
+    --out_phospho /tmp/cnf_phosphoproteomics.csv
\ No newline at end of file
diff --git a/coderbuild/cnf/build_samples.sh b/coderbuild/cnf/build_samples.sh
new file mode 100644
index 00000000..37af2162
--- /dev/null
+++ b/coderbuild/cnf/build_samples.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+# build_samples.sh - wraps 01-samples-cnf.py per coderdata convention.
+#
+# Usage:
+#   build_samples.sh <prev_samples>
+#
+# The previous samples file is the latest samples CSV from the coderdata
+# build (with the highest improve_sample_id used so far). 01-samples-cnf.py
+# reads it, finds the max ID, and increments from there.
+
+set -euo pipefail
+
+PREV_SAMPLES="${1:?Usage: build_samples.sh <prev_samples>}"
+
+python 01-samples-cnf.py "$PREV_SAMPLES" --output /tmp/cnf_samples.csv
\ No newline at end of file
diff --git a/coderbuild/cnf/cnf_utils.py b/coderbuild/cnf/cnf_utils.py
new file mode 100644
index 00000000..10ec38d2
--- /dev/null
+++ b/coderbuild/cnf/cnf_utils.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+"""
+cnf_utils.py: shared specimen classification utilities for the cNF build scripts.
+
+Used by 01-samples-cnf.py, 02-omics-cnf.py, and 04-experiments-cnf.py to
+canonicalize specimen strings so omics data, drug measurements, and the sample
+table all use the same improve_sample_id keys.
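+
+Worked examples (doctest-style; the outputs follow from the regexes below):
+
+    >>> classify_specimen("NF0017.T1.organoid")
+    ('NF0017_T1', 'organoid')
+    >>> classify_specimen("NF0021.T1.Onalespid.1uM")
+    ('NF0021_T1_Onalespid_1uM', 'treated_organoid')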
+""" + +import re + + +# ---------------------------------------------------------------------------- +# Sample type → coderdata model_type enum +# ---------------------------------------------------------------------------- +MODEL_TYPE_MAP = { + "organoid": "patient derived organoid", + "treated_organoid": "patient derived organoid", + "tumor_tissue": "tumor", + "normal_skin": "normal_tissue", +} + + + + +# ---------------------------------------------------------------------------- +# Regex constants +# ---------------------------------------------------------------------------- +# Bare tumor organoid: NFxxxx_Tx (4 patient digits, T + tumor digits) +TUMOR_RE = re.compile(r"^NF\d{4}_T\d+$") +# Patient prefix at the start of any specimen string +PATIENT_PREFIX_RE = re.compile(r"^(NF\d{4})") +# Normal skin marker: NFxxxx[_-]skin..., allowing trailing date/replicate junk +SKIN_RE = re.compile(r"^(NF\d{4})[_-]skin", re.IGNORECASE) +# Primary tumor tissue (not organoid culture): NFxxxx_Tx_tissue +TISSUE_RE = re.compile(r"^(NF\d{4}_T\d+)_tissue$", re.IGNORECASE) +# Treated organoid: NFxxxx_Tx_, e.g. NF0021_T1_Onalespid_1uM +TREATED_RE = re.compile(r"^NF\d{4}_T\d+_.+$") + + +# ---------------------------------------------------------------------------- +# Classification +# ---------------------------------------------------------------------------- +def classify_specimen(spec) -> tuple[str, str] | None: + """Classify a raw specimen string into (canonical_id, sample_type) or None. + + sample_type ∈ {"organoid", "treated_organoid", "tumor_tissue", "normal_skin"}. + + Canonical IDs: + organoid → NFxxxx_Tx (suffix stripped) + treated_organoid → NFxxxx_Tx_ (label preserved so + treated samples don't collide + with the untreated organoid) + tumor_tissue → NFxxxx_Tx_tissue (suffix preserved) + normal_skin → NFxxxx_skin (replicates collapsed per patient) + + Unrecognized prefixes and malformed inputs return None. + """ + if spec is None: + return None + s = str(spec).strip() + if not s or s.lower() == "nan": + return None + + # Convert dot-separated form (e.g. RNA file's "NF0017.T1.organoid") to + # underscore-separated. + s_norm = s.replace(".", "_") + + # Skin: any string starting with NFxxxx that contains a "skin" marker + skin_m = SKIN_RE.match(s_norm) + if skin_m: + return (f"{skin_m.group(1)}_skin", "normal_skin") + + # Tumor tissue: NFxxxx_Tx_tissue + tissue_m = TISSUE_RE.match(s_norm) + if tissue_m: + return (f"{tissue_m.group(1)}_tissue", "tumor_tissue") + + # Strip organoid suffix (handles "_organoid" and "_organoids") + s_stripped = re.sub(r"_organoids?$", "", s_norm, flags=re.IGNORECASE) + + # Bare organoid (untreated): NFxxxx_Tx exactly + if TUMOR_RE.match(s_stripped): + return (s_stripped, "organoid") + + # Treated organoid: NFxxxx_Tx_ + if TREATED_RE.match(s_stripped): + return (s_stripped, "treated_organoid") + + return None + + +def patient_from_specimen(specimen: str) -> str: + """Extract NFxxxx patient ID from a canonical specimen ID.""" + m = PATIENT_PREFIX_RE.match(specimen) + return m.group(1) if m else "" + + +def canonicalize_specimen_column(df, specimen_col: str): + """Apply classify_specimen to a dataframe's specimen column in place. + + Returns the same df with: + - rows whose specimen classifies dropped if None + - specimen_col values replaced with their canonical IDs + - a new column 'sample_type' added + + Used by build_omics.py and build_exp.py to map raw Synapse specimen + strings (e.g. 
"NF0018.T1.organoid", "NF0021.T1.Onalespid.1uM") to + canonical IDs that match the cnf_samples.csv `other_id` column. + """ + classifications = df[specimen_col].apply(classify_specimen) + keep = classifications.notna() + df = df[keep].copy() + classifications = classifications[keep] + df[specimen_col] = [c[0] for c in classifications] + df["sample_type"] = [c[1] for c in classifications] + return df diff --git a/coderbuild/cnf/requirements.txt b/coderbuild/cnf/requirements.txt new file mode 100644 index 00000000..a5677837 --- /dev/null +++ b/coderbuild/cnf/requirements.txt @@ -0,0 +1,9 @@ +pandas>=2.0 +numpy>=1.24,<2.0 +synapseclient>=4.0 +requests>=2.28 +scipy>=1.10 +matplotlib +scikit-learn +scipy +mordred \ No newline at end of file diff --git a/coderbuild/docker/Dockerfile.cnf b/coderbuild/docker/Dockerfile.cnf new file mode 100644 index 00000000..06069ef1 --- /dev/null +++ b/coderbuild/docker/Dockerfile.cnf @@ -0,0 +1,53 @@ +# Dockerfile.cnf - container for building the cNF (cutaneous neurofibroma) +# organoid drug-response dataset for inclusion in coderdata. +# +# Conventions (matching coderbuild/docker/Dockerfile.): +# * Working directory is /coderbuild/cnf +# * Standard utilities are mounted/copied to /coderbuild/utils (fit_curve.py, +# pubchem_retrieval.py, build_drug_descriptor_table.py) +# * Synapse auth via SYNAPSE_AUTH_TOKEN env var (passed at `docker run`) +# * Build scripts are invoked from build_dataset.py / build_all.py via the +# standard build_*.sh entry points +# +# Build: +# docker build -f coderbuild/docker/Dockerfile.cnf -t coderdata-cnf . +# +# Run (orchestrated by build_dataset.py / build_all.py): +# docker run --rm \ +# -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN \ +# -e CNF_RNA_TPM_SYN_ID=$CNF_RNA_TPM_SYN_ID \ +# -v $(pwd)/local_data:/coderbuild/local_data \ +# coderdata-cnf bash build_samples.sh /coderbuild/local_data/prev_samples.csv + +FROM python:3.11-slim + +# System deps (rdkit-pypi pulls in some C libs; basic build tools for scipy) +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential wget ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# Set up coderbuild layout +WORKDIR /coderbuild/cnf + +# Install Python deps. rdkit is brought in via the descriptor utility but +# pinned here so the descriptor step is reproducible. 
+COPY coderbuild/cnf/requirements.txt /coderbuild/cnf/requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt rdkit-pypi
+
+# Copy the cNF build scripts
+COPY coderbuild/cnf/01-samples-cnf.py /coderbuild/cnf/
+COPY coderbuild/cnf/02-omics-cnf.py /coderbuild/cnf/
+COPY coderbuild/cnf/03-drugs-cnf.py /coderbuild/cnf/
+COPY coderbuild/cnf/04-experiments-cnf.py /coderbuild/cnf/
+COPY coderbuild/cnf/build_samples.sh /coderbuild/cnf/
+COPY coderbuild/cnf/build_omics.sh /coderbuild/cnf/
+COPY coderbuild/cnf/build_drugs.sh /coderbuild/cnf/
+COPY coderbuild/cnf/build_exp.sh /coderbuild/cnf/
+COPY coderbuild/cnf/cnf_utils.py /coderbuild/cnf/
+COPY coderbuild/utils/build_drug_desc.py /coderbuild/cnf/
+
+RUN chmod +x /coderbuild/cnf/*.sh
+
+# Standard coderdata utilities (fit_curve.py, pubchem_retrieval.py, etc.)
+COPY coderbuild/utils /coderbuild/utils
+
diff --git a/coderbuild/docker/Dockerfile.phosphosites b/coderbuild/docker/Dockerfile.phosphosites
new file mode 100644
index 00000000..a45b9702
--- /dev/null
+++ b/coderbuild/docker/Dockerfile.phosphosites
@@ -0,0 +1,14 @@
+FROM python:3.11-slim
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /coderbuild/phosphosites
+
+RUN pip install --no-cache-dir "pandas>=2.0" "synapseclient>=4.0" "openpyxl>=3.1"
+
+COPY coderbuild/phosphosites/00-buildPhosphositeFile.py /coderbuild/phosphosites/
+COPY coderbuild/phosphosites/build_phosphosites.sh /coderbuild/phosphosites/
+
+RUN chmod +x /coderbuild/phosphosites/build_phosphosites.sh
diff --git a/coderbuild/docker/docker-compose.yml b/coderbuild/docker/docker-compose.yml
index 51eb56ee..0a87e0e5 100644
--- a/coderbuild/docker/docker-compose.yml
+++ b/coderbuild/docker/docker-compose.yml
@@ -96,6 +96,15 @@ services:
     platform: linux/amd64
     image: genes:latest

+  phosphosites:
+    build:
+      context: ../../
+      dockerfile: coderbuild/docker/Dockerfile.phosphosites
+      args:
+        HTTPS_PROXY: ${HTTPS_PROXY}
+    platform: linux/amd64
+    image: phosphosites:latest
+
   upload:
     build:
       context: ../../
@@ -130,4 +139,13 @@ services:
       args:
         HTTPS_PROXY: ${HTTPS_PROXY}
       platform: linux/amd64
-    image: novartis:latest
\ No newline at end of file
+    image: novartis:latest
+
+  cnf:
+    build:
+      context: ../../
+      dockerfile: coderbuild/docker/Dockerfile.cnf
+      args:
+        HTTPS_PROXY: ${HTTPS_PROXY}
+    platform: linux/amd64
+    image: cnf:latest
\ No newline at end of file
diff --git a/coderbuild/genes/00-buildGeneFile.R b/coderbuild/genes/00-buildGeneFile.R
index 5c113a94..75d8c31f 100755
--- a/coderbuild/genes/00-buildGeneFile.R
+++ b/coderbuild/genes/00-buildGeneFile.R
@@ -1,57 +1,145 @@
-##this uses the Hg38 database to seed the gene name mappings that we might want to use
-
-#if(!require('org.Hs.eg.db')){
-#  BiocManager::install('org.Hs.eg.db')
-library(org.Hs.eg.db)
-library(biomaRt)
-#}
-
-library(dplyr)
-##get entrez ids to symbol
-entrez<-as.data.frame(org.Hs.egALIAS2EG)
-
-sym <- as.data.frame(org.Hs.egSYMBOL)
-
-##get entriz ids to ensembl
-ens<-as.data.frame(org.Hs.egENSEMBL2EG)
-
-##get transcript ids as well
-enst<-as.data.frame(org.Hs.egENSEMBLTRANS)
-
- #now we can filter by protein coding using biomart
-ensembl <- useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl")
-
-tab <- getBM(attributes=c('ensembl_gene_id'),filters='biotype', values=c('protein_coding'),mart=ensembl)
-
-
-joined.df<-entrez|>
-  left_join(sym)|>
-  dplyr::rename(entrez_id='gene_id',gene_symbol='symbol',other_id='alias_symbol',gene_symbol='symbol')%>%
-  mutate(other_id_source='entrez_alias')
-
-##now get aliases from ensembl
-edf <- sym|>
-  inner_join(ens)|>
-  dplyr::rename(entrez_id='gene_id',gene_symbol='symbol',other_id='ensembl_id')%>%
-  mutate(other_id_source='ensembl_gene')
-
-
-tdf<-sym|>
-  inner_join(enst)|>
-  dplyr::rename(entrez_id='gene_id',gene_symbol='symbol',other_id='trans_id')|>
-  subset(entrez_id%in%edf$entrez_id)|>
-#  subset(gene_symbol%in%ed.df$gene_symbol)|>
-  dplyr::mutate(other_id_source='ensembl_transcript')
-
-
-prots<-subset(edf,other_id%in%tab$ensembl_gene_id)
-
-full.df<-rbind(joined.df,edf,tdf)|>
-  subset(entrez_id%in%prots$entrez_id)|>
-  distinct()
-
-#save to file and version
-write.table(full.df,'/tmp/genes.csv',sep=',',row.names=F,quote=T)
-
-##store this file somewhere!
-
+## Build /tmp/genes.csv — entrez_id, gene_symbol, other_id, other_id_source.
+##
+## Uses org.Hs.eg.db for the local mappings (entrez ↔ symbol, ensembl gene,
+## ensembl transcript) and biomaRt to filter to protein-coding genes.
+##
+## biomaRt's call to Ensembl frequently fails with mirror unavailability.
+## To handle this, the Ensembl call is wrapped in a retry loop:
+##   - cycles through all four mirror choices ('www', 'useast', 'asia', 'uswest')
+##   - each cycle waits longer than the last (exponential backoff)
+##   - up to MAX_CYCLES total cycles before giving up
+##
+## If every mirror remains unreachable, the script falls back to a local
+## protein-coding proxy derived from org.Hs.eg.db's Ensembl mapping (keeping
+## only entrez IDs that have ENSG IDs in `ens`, which drops most
+## pseudogenes/RNAs). The fallback is documented in the code so the lower
+## coverage is visible.
+
+suppressPackageStartupMessages({
+  library(org.Hs.eg.db)
+  library(biomaRt)
+  library(dplyr)
+})
+
+# ---- Local mappings (no network) -------------------------------------------
+entrez <- as.data.frame(org.Hs.egALIAS2EG)
+sym    <- as.data.frame(org.Hs.egSYMBOL)
+ens    <- as.data.frame(org.Hs.egENSEMBL2EG)
+enst   <- as.data.frame(org.Hs.egENSEMBLTRANS)
+
+
+# ---- biomaRt with retry / mirror cycling -----------------------------------
+#
+# Strategy:
+#   For up to MAX_CYCLES, try each mirror in MIRRORS in order.
+#   On a successful useEnsembl() + getBM() call, return the result.
+#   On any failure, sleep for the current backoff and try the next mirror.
+#   Backoff doubles each cycle (30s → 60s → 120s → ...).
+#
+# Total sleep at MAX_CYCLES=5 is 7.5 minutes (30+60+120+240s); with per-attempt
+# network timeouts the worst case is roughly half an hour, which has
+# historically been enough for transient Ensembl outages to clear.
+ +MIRRORS <- c("www", "useast", "asia", "uswest") +MAX_CYCLES <- 5 +BASE_SLEEP <- 30 # seconds; doubles each cycle + +fetch_protein_coding <- function() { + for (cycle in seq_len(MAX_CYCLES)) { + sleep_s <- BASE_SLEEP * (2 ^ (cycle - 1)) + for (mirror in MIRRORS) { + message(sprintf("[%s] Cycle %d/%d, trying mirror '%s'...", + Sys.time(), cycle, MAX_CYCLES, mirror)) + result <- tryCatch({ + ensembl <- useEnsembl( + biomart = "genes", + dataset = "hsapiens_gene_ensembl", + mirror = mirror + ) + tab <- getBM( + attributes = c("ensembl_gene_id"), + filters = "biotype", + values = c("protein_coding"), + mart = ensembl + ) + message(sprintf("[%s] Success via mirror '%s' (%d genes)", + Sys.time(), mirror, nrow(tab))) + tab + }, error = function(e) { + message(sprintf("[%s] Mirror '%s' failed: %s", + Sys.time(), mirror, conditionMessage(e))) + NULL + }) + if (!is.null(result)) return(result) + } + if (cycle < MAX_CYCLES) { + message(sprintf("[%s] All mirrors failed in cycle %d; sleeping %ds before next cycle", + Sys.time(), cycle, sleep_s)) + Sys.sleep(sleep_s) + } + } + return(NULL) +} + +tab <- fetch_protein_coding() + + +# ---- Fallback: protein-coding inference from org.Hs.eg.db ------------------- +# If every Ensembl mirror is unreachable, derive a protein-coding gene list +# locally. org.Hs.eg.db doesn't carry biotype directly, but every entrez ID +# that has an ENSG mapping in org.Hs.egENSEMBL2EG is reasonably treated as +# a "real" gene; combined with the symbol mapping, this approximates the +# protein-coding filter well enough to keep the build going. +# +# The fallback genes.csv will have somewhat broader coverage than the +# Ensembl-filtered version (it includes some pseudogenes etc), but the +# downstream omics joins are inner-joined on gene_symbol, so any extras +# that don't appear in the data files just get dropped harmlessly. + +if (is.null(tab)) { + message("\n=== WARNING ===") + message("All Ensembl mirrors unreachable after ", MAX_CYCLES, + " cycles. Using local org.Hs.eg.db fallback.") + message("This produces a slightly broader gene list than the canonical ", + "Ensembl-filtered version. To rebuild with the canonical filter, ", + "re-run this script when Ensembl is reachable.") + message("===============\n") + # Use every entrez ID that has an ENSG mapping as the protein-coding proxy. 
+  tab <- data.frame(ensembl_gene_id = unique(ens$ensembl_id),
+                    stringsAsFactors = FALSE)
+}
+
+
+# ---- Assemble joined alias / ensembl gene / ensembl transcript table -------
+joined.df <- entrez |>
+  left_join(sym, by = "gene_id") |>
+  dplyr::rename(entrez_id = "gene_id",
+                gene_symbol = "symbol",
+                other_id = "alias_symbol") |>
+  mutate(other_id_source = "entrez_alias")
+
+edf <- sym |>
+  inner_join(ens, by = "gene_id") |>
+  dplyr::rename(entrez_id = "gene_id",
+                gene_symbol = "symbol",
+                other_id = "ensembl_id") |>
+  mutate(other_id_source = "ensembl_gene")
+
+tdf <- sym |>
+  inner_join(enst, by = "gene_id") |>
+  dplyr::rename(entrez_id = "gene_id",
+                gene_symbol = "symbol",
+                other_id = "trans_id") |>
+  subset(entrez_id %in% edf$entrez_id) |>
+  dplyr::mutate(other_id_source = "ensembl_transcript")
+
+prots <- subset(edf, other_id %in% tab$ensembl_gene_id)
+
+full.df <- rbind(joined.df, edf, tdf) |>
+  subset(entrez_id %in% prots$entrez_id) |>
+  distinct()
+
+
+# ---- Write ------------------------------------------------------------------
+out_path <- "/tmp/genes.csv"
+write.table(full.df, out_path, sep = ",", row.names = FALSE, quote = TRUE)
+message(sprintf("Wrote %s (%d rows, %d unique entrez_ids)",
+                out_path, nrow(full.df), length(unique(full.df$entrez_id))))
\ No newline at end of file
diff --git a/coderbuild/phosphosites/00-buildPhosphositeFile.py b/coderbuild/phosphosites/00-buildPhosphositeFile.py
new file mode 100644
index 00000000..09292dec
--- /dev/null
+++ b/coderbuild/phosphosites/00-buildPhosphositeFile.py
@@ -0,0 +1,589 @@
+#!/usr/bin/env python3
+"""
+00-buildPhosphositeFile.py: build the phosphosites reference CSV from three sources:
+
+  1. Ochoa et al. (2020) Nat Biotechnol 38, 365-373: ~116k curated human
+     phosphosites from Supplementary Table S3 (Springer static content).
+     https://static-content.springer.com/esm/art%3A10.1038%2Fs41587-019-0344-3/
+     MediaObjects/41587_2019_344_MOESM4_ESM.xlsx
+
+  2. UniProt PTM annotations: all reviewed human proteins with phospho
+     modifications from the UniProt REST API (~41k sites, ~12k unique to UniProt).
+
+  3. Synapse supplement(s): raw phosphoproteomics CSV/TSV files on Synapse
+     whose site column is parsed to add experiment-specific sites not in the
+     databases above (--synapse_supplements syn70078415 ...).
+
+Sources 1 and 2 are always fetched. Source 3 is optional but recommended.
+
+UniProt accession -> gene_symbol: UniProt REST API (TSV, accession+gene_names).
+gene_symbol -> entrez_id: genes.csv.
+
+Site notation: <gene_symbol>-<residue><position><mod>
+  e.g. AAAS-S495s (s=phosphoserine, t=phosphothreonine, y=phosphotyrosine)
+
+Output columns:
+  phosphosite_id, entrez_id, gene_symbol, residue, position, modification, other_id
+
+Usage:
+  python 00-buildPhosphositeFile.py <genes.csv>
+    [--out /tmp/phosphosites.csv]
+    [--prev /tmp/prev_phosphosites.csv]
+    [--supplement site_file.csv ...]
+    [--site_col site]
+    [--synapse_supplements syn70078415 ...]
+ [--synapse_site_col site] + [--ochoa_url ] + [--uniprot_gene_url ] + [--uniprot_ptm_url ] +""" + +import argparse +import csv +import io +import json +import logging +import re +import sys +import tempfile +import time +import urllib.request + +import openpyxl +import pandas as pd + + +OCHOA_URL = ( + "https://static-content.springer.com/esm/" + "art%3A10.1038%2Fs41587-019-0344-3/MediaObjects/" + "41587_2019_344_MOESM4_ESM.xlsx" +) + +UNIPROT_GENE_URL = ( + "https://rest.uniprot.org/uniprotkb/search" + "?query=reviewed:true+AND+organism_id:9606" + "&fields=accession,gene_names" + "&format=tsv" + "&size=500" +) + +UNIPROT_PTM_URL = ( + "https://rest.uniprot.org/uniprotkb/search" + "?query=reviewed:true+AND+organism_id:9606+AND+ft_mod_res:phospho*" + "&fields=accession,gene_names,ft_mod_res" + "&format=json" + "&size=200" +) + +_RESIDUE_MOD = {"S": "s", "T": "t", "Y": "y"} +_PHOSPHO_DESC_MAP = { + "phosphoserine": ("S", "s"), + "phosphothreonine": ("T", "t"), + "phosphotyrosine": ("Y", "y"), +} +_SITE_RE = re.compile(r'^(.+)-([STY])(\d+)([a-z]+)$', re.IGNORECASE) +_SUPP_MOD_CODES = {'s': 'phospho', 't': 'phospho', 'y': 'phospho'} + +_HEADERS = { + "User-Agent": ( + "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0" + ), + "Accept": "*/*", +} + + +# --------------------------------------------------------------------------- +# Logging +# --------------------------------------------------------------------------- +def configure_logging(): + logging.basicConfig( + format="[%(asctime)s] [%(levelname)s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO, + ) + + +# --------------------------------------------------------------------------- +# Reference data +# --------------------------------------------------------------------------- +def load_gene_map(genes_path: str) -> pd.DataFrame: + df = pd.read_csv(genes_path) + needed = {'gene_symbol', 'entrez_id'} + if needed - set(df.columns): + raise KeyError(f"genes file missing columns: {needed - set(df.columns)}") + df = df.dropna(subset=['gene_symbol', 'entrez_id']).copy() + df['entrez_id'] = df['entrez_id'].astype(int) + df = df.drop_duplicates(subset='gene_symbol')[['gene_symbol', 'entrez_id']] + logging.info("Loaded %d gene_symbol → entrez_id mappings", len(df)) + return df + + +# --------------------------------------------------------------------------- +# Network helper +# --------------------------------------------------------------------------- +def _fetch_url(url: str, retries: int = 3, timeout: int = 120) -> bytes: + req = urllib.request.Request(url, headers=_HEADERS) + for attempt in range(1, retries + 1): + try: + logging.info(" GET %s (attempt %d/%d)", url, attempt, retries) + with urllib.request.urlopen(req, timeout=timeout) as resp: + return resp.read() + except Exception as exc: + logging.warning(" Attempt %d failed: %s", attempt, exc) + if attempt < retries: + time.sleep(5 * attempt) + raise RuntimeError(f"Failed to fetch {url} after {retries} attempts") + + +def _fetch_json_page(url: str, timeout: int = 120) -> tuple[list[dict], str | None]: + req = urllib.request.Request(url, headers={**_HEADERS, "Accept": "application/json"}) + with urllib.request.urlopen(req, timeout=timeout) as resp: + data = json.loads(resp.read().decode('utf-8')) + link_header = resp.headers.get('Link', '') + entries = data.get('results', []) + m = re.search(r'<([^>]+)>;\s*rel="next"', link_header) + return entries, m.group(1) if m else None + + +# 
--------------------------------------------------------------------------- +# Source 1: Ochoa et al. supplementary table +# --------------------------------------------------------------------------- +def fetch_ochoa_phosphosites(url: str) -> list[dict]: + """Download Ochoa MOESM4 xlsx and return raw phosphosite records.""" + logging.info("Downloading Ochoa et al. supplementary phosphoproteome") + data = _fetch_url(url) + + with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp: + tmp.write(data) + tmp_path = tmp.name + + logging.info("Parsing xlsx (%d MB) …", len(data) // 1_000_000) + wb = openpyxl.load_workbook(tmp_path, read_only=True, data_only=True) + + sheet_name = "annotated_phosphoproteome" + if sheet_name not in wb.sheetnames: + wb.close() + raise ValueError( + f"Sheet '{sheet_name}' not found. Available: {wb.sheetnames}" + ) + + ws = wb[sheet_name] + records, header, uni_i, pos_i, res_i, skipped = [], None, None, None, None, 0 + + for row in ws.iter_rows(values_only=True): + if header is None: + header = [str(c).strip() if c is not None else "" for c in row] + try: + uni_i = header.index("uniprot") + pos_i = header.index("position") + res_i = header.index("residue") + except ValueError as exc: + wb.close() + raise ValueError(f"Required column missing in Ochoa sheet: {exc}") + logging.info( + "Ochoa columns: uniprot=%d, position=%d, residue=%d", + uni_i, pos_i, res_i, + ) + continue + + acc = str(row[uni_i]).strip() if row[uni_i] is not None else "" + try: + pos = int(row[pos_i]) + except (TypeError, ValueError): + skipped += 1 + continue + + res_raw = str(row[res_i]).strip().upper() if row[res_i] is not None else "" + residue = res_raw[0] if res_raw and res_raw[0] in _RESIDUE_MOD else None + + if residue is None or not acc: + skipped += 1 + continue + + records.append({ + "accession": acc, + "residue": residue, + "position": pos, + "modification": "phospho", + "mod_code": _RESIDUE_MOD[residue], + }) + + wb.close() + logging.info( + "Ochoa: %d records parsed (%d skipped)", len(records), skipped + ) + return records + + +# --------------------------------------------------------------------------- +# Source 2: UniProt PTM annotations +# --------------------------------------------------------------------------- +def fetch_uniprot_ptm_sites(url: str) -> list[dict]: + """Fetch phosphorylation sites from UniProt Modified residue features. + + Returns records with gene_symbol (already resolved) instead of accession. 
+ """ + logging.info("Fetching UniProt PTM phosphorylation sites") + all_records = [] + next_url: str | None = url + page = 0 + + while next_url: + page += 1 + if page % 10 == 1: + logging.info(" UniProt PTM page %d (%d records so far)", page, len(all_records)) + + for attempt in range(1, 4): + try: + entries, next_url = _fetch_json_page(next_url) + break + except Exception as exc: + logging.warning(" Page %d attempt %d failed: %s", page, attempt, exc) + if attempt == 3: + raise + time.sleep(5 * attempt) + + for entry in entries: + gene = "" + for g in entry.get("genes", []): + if "geneName" in g: + gene = g["geneName"].get("value", "").upper() + break + if not gene: + continue + for feat in entry.get("features", []): + if feat.get("type") != "Modified residue": + continue + desc = feat.get("description", "").lower() + residue, mod_code = None, None + for key, (aa, mc) in _PHOSPHO_DESC_MAP.items(): + if desc.startswith(key): + residue, mod_code = aa, mc + break + if residue is None: + continue + pos = feat.get("location", {}).get("start", {}).get("value") + if pos is None: + continue + all_records.append({ + "gene_symbol": gene, + "residue": residue, + "position": int(pos), + "modification": "phospho", + "mod_code": mod_code, + "other_id": f"{gene}-{residue}{pos}{mod_code}", + }) + + logging.info( + "UniProt PTM: %d records from %d pages", len(all_records), page + ) + return all_records + + +# --------------------------------------------------------------------------- +# UniProt accession → gene_symbol +# --------------------------------------------------------------------------- +def fetch_uniprot_gene_map(url: str) -> dict[str, str]: + """Return accession → primary gene_symbol for all reviewed human proteins.""" + logging.info("Fetching UniProt accession → gene_symbol map") + acc_to_gene: dict[str, str] = {} + next_url: str | None = url + page = 0 + + while next_url: + page += 1 + if page % 10 == 1: + logging.info( + " UniProt gene map page %d (%d entries so far)", page, len(acc_to_gene) + ) + for attempt in range(1, 4): + try: + req = urllib.request.Request(next_url, headers=_HEADERS) + with urllib.request.urlopen(req, timeout=120) as resp: + raw = resp.read() + link_header = resp.headers.get("Link", "") + break + except Exception as exc: + logging.warning(" Page %d attempt %d failed: %s", page, attempt, exc) + if attempt == 3: + raise + time.sleep(5 * attempt) + + reader = csv.DictReader( + io.StringIO(raw.decode("utf-8", errors="replace")), delimiter="\t" + ) + for row in reader: + acc = row.get("Entry", row.get("accession", "")).strip() + gene_names = row.get("Gene Names", row.get("gene_names", "")).strip() + if acc and gene_names: + acc_to_gene[acc] = gene_names.split()[0].upper() + + m = re.search(r'<([^>]+)>;\s*rel="next"', link_header) + next_url = m.group(1) if m else None + + logging.info( + "UniProt gene map: %d entries (%d pages)", len(acc_to_gene), page + ) + return acc_to_gene + + +# --------------------------------------------------------------------------- +# Source 3: Synapse supplement(s) +# --------------------------------------------------------------------------- +def fetch_synapse_supplement(syn_id: str, site_col: str) -> list[dict]: + """Download a Synapse file and parse its site strings as supplement records.""" + import synapseclient + logging.info("Fetching Synapse supplement %s", syn_id) + syn = synapseclient.Synapse() + syn.login(silent=True) + entity = syn.get(syn_id) + path = entity.path + sep = "\t" if path.endswith((".tsv", ".tsv.gz")) else "," + + import 
gzip as _gzip + open_fn = _gzip.open if path.endswith(".gz") else open + + records = [] + skipped = 0 + seen: set[str] = set() + resolved_col: str | None = None + + with open_fn(path, "rt", encoding="utf-8", errors="replace") as f: + reader = csv.DictReader(f, delimiter=sep) + for row in reader: + # Auto-detect site column on first row if requested column missing + if resolved_col is None: + if site_col in row: + resolved_col = site_col + else: + for candidate in ["site", "Site", "phosphosite", "feature_id"]: + if candidate in row: + resolved_col = candidate + logging.info( + " Synapse %s: using column '%s' for sites", + syn_id, resolved_col, + ) + break + if resolved_col is None: + logging.warning( + " Synapse %s: could not find site column (tried %s). " + "Available: %s", + syn_id, site_col, list(row.keys())[:10], + ) + return [] + + site = row.get(resolved_col, "").strip() + if not site or site in seen: + continue + seen.add(site) + m = _SITE_RE.match(site) + if not m: + skipped += 1 + continue + mod_code = m.group(4).lower() + records.append({ + "gene_symbol": m.group(1).upper(), + "residue": m.group(2).upper(), + "position": int(m.group(3)), + "modification": _SUPP_MOD_CODES.get(mod_code, f"phospho_{mod_code}"), + "mod_code": mod_code, + "other_id": site, + }) + + logging.info( + "Synapse %s: %d unique sites parsed (%d skipped)", syn_id, len(records), skipped + ) + return records + + +# --------------------------------------------------------------------------- +# Local file supplement +# --------------------------------------------------------------------------- +def parse_supplement_sites(path: str, site_col: str) -> list[dict]: + records = [] + skipped = 0 + sep = "\t" if path.endswith(".tsv") else "," + with open(path, newline="", encoding="utf-8") as f: + reader = csv.DictReader(f, delimiter=sep) + seen: set[str] = set() + for row in reader: + site = row.get(site_col, "").strip() + if not site or site in seen: + continue + seen.add(site) + m = _SITE_RE.match(site) + if not m: + skipped += 1 + continue + mod_code = m.group(4).lower() + records.append({ + "gene_symbol": m.group(1).upper(), + "residue": m.group(2).upper(), + "position": int(m.group(3)), + "modification": _SUPP_MOD_CODES.get(mod_code, f"phospho_{mod_code}"), + "mod_code": mod_code, + "other_id": site, + }) + logging.info( + "Supplement %s: %d unique sites (%d skipped)", path, len(records), skipped + ) + return records + + +# --------------------------------------------------------------------------- +# Assembly +# --------------------------------------------------------------------------- +def build_phosphosite_table( + ochoa_records: list[dict], + acc_to_gene: dict[str, str], + uniprot_records: list[dict], + gene_map: pd.DataFrame, + supplement_records: list[dict], + prev: pd.DataFrame | None, +) -> pd.DataFrame: + gene_to_entrez = gene_map.set_index("gene_symbol")["entrez_id"].to_dict() + rows: list[dict] = [] + existing_ids: set[str] = set() + + def _add(gene: str, residue: str, position: int, modification: str, mod_code: str) -> None: + other_id = f"{gene}-{residue}{position}{mod_code}" + if other_id in existing_ids: + return + entrez = gene_to_entrez.get(gene) + if entrez is None: + return + rows.append({ + "gene_symbol": gene, + "entrez_id": entrez, + "residue": residue, + "position": position, + "modification": modification, + "other_id": other_id, + }) + existing_ids.add(other_id) + + # --- Ochoa (primary) --- + no_gene_ochoa = 0 + for rec in ochoa_records: + gene = acc_to_gene.get(rec["accession"], "") + if 
not gene: + no_gene_ochoa += 1 + continue + _add(gene, rec["residue"], rec["position"], rec["modification"], rec["mod_code"]) + logging.info( + "Ochoa: %d sites added (%d with no gene symbol)", len(rows), no_gene_ochoa + ) + + before_uniprot = len(rows) + # --- UniProt PTM (secondary) --- + for rec in uniprot_records: + _add( + rec["gene_symbol"], rec["residue"], rec["position"], + rec["modification"], rec["mod_code"], + ) + logging.info( + "UniProt PTM: +%d new sites (total %d)", len(rows) - before_uniprot, len(rows) + ) + + before_supp = len(rows) + # --- Supplements (local files + Synapse) --- + for rec in supplement_records: + _add( + rec["gene_symbol"], rec["residue"], rec["position"], + rec["modification"], rec["mod_code"], + ) + if len(rows) > before_supp: + logging.info( + "Supplements: +%d new sites (total %d)", len(rows) - before_supp, len(rows) + ) + + df = pd.DataFrame(rows) + + # --- Assign / preserve phosphosite_ids --- + if prev is not None and not prev.empty: + prev_map = prev.set_index("other_id")["phosphosite_id"].to_dict() + next_id = int(prev["phosphosite_id"].max()) + 1 + else: + prev_map = {} + next_id = 1 + + ids = [] + for site in df["other_id"]: + if site in prev_map: + ids.append(prev_map[site]) + else: + ids.append(next_id) + prev_map[site] = next_id + next_id += 1 + df["phosphosite_id"] = ids + + cols = [ + "phosphosite_id", "entrez_id", "gene_symbol", + "residue", "position", "modification", "other_id", + ] + df = df[cols].sort_values("phosphosite_id").reset_index(drop=True) + logging.info( + "Final phosphosites table: %d rows, %d unique genes", + len(df), df["gene_symbol"].nunique(), + ) + return df + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- +def main(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("genes", help="genes.csv (gene_symbol → entrez_id)") + parser.add_argument("--out", default="/tmp/phosphosites.csv") + parser.add_argument("--prev", default=None, + help="Previous phosphosites.csv to preserve IDs") + parser.add_argument("--supplement", nargs="*", default=[], + help="Local CSV/TSV file(s) with site strings") + parser.add_argument("--site_col", default="site", + help="Column name for site strings in local supplement files") + parser.add_argument("--synapse_supplements", nargs="*", default=[], + help="Synapse IDs of raw phospho files to add as supplements " + "(e.g. 
syn70078415)") + parser.add_argument("--synapse_site_col", default="site", + help="Site column name in Synapse supplement files") + parser.add_argument("--ochoa_url", default=OCHOA_URL) + parser.add_argument("--uniprot_gene_url", default=UNIPROT_GENE_URL) + parser.add_argument("--uniprot_ptm_url", default=UNIPROT_PTM_URL) + args = parser.parse_args() + + configure_logging() + + gene_map = load_gene_map(args.genes) + + prev = None + if args.prev: + prev = pd.read_csv(args.prev) + logging.info("Loaded %d previous phosphosites from %s", len(prev), args.prev) + + ochoa_records = fetch_ochoa_phosphosites(args.ochoa_url) + acc_to_gene = fetch_uniprot_gene_map(args.uniprot_gene_url) + uniprot_records = fetch_uniprot_ptm_sites(args.uniprot_ptm_url) + + supp_records: list[dict] = [] + for path in (args.supplement or []): + supp_records.extend(parse_supplement_sites(path, args.site_col)) + for syn_id in (args.synapse_supplements or []): + try: + supp_records.extend(fetch_synapse_supplement(syn_id, args.synapse_site_col)) + except Exception as exc: + logging.warning("Synapse supplement %s failed: %s — skipping", syn_id, exc) + + df = build_phosphosite_table( + ochoa_records, acc_to_gene, uniprot_records, gene_map, supp_records, prev + ) + + df.to_csv(args.out, index=False) + logging.info( + "Wrote %s (%d rows, %d unique genes)", + args.out, len(df), df["gene_symbol"].nunique(), + ) + + +if __name__ == "__main__": + sys.exit(main() or 0) diff --git a/coderbuild/phosphosites/README.md b/coderbuild/phosphosites/README.md new file mode 100644 index 00000000..e2cf788f --- /dev/null +++ b/coderbuild/phosphosites/README.md @@ -0,0 +1,97 @@ +# Phosphosites Reference Builder + +Produces `phosphosites.csv`, the global human phosphosite reference used by +dataset omics pipelines (currently: cNF) to map raw phospho measurements to +stable integer identifiers. + +## Sources + +Three sources are merged in priority order. A site from an earlier source is +never overwritten by a later one (deduplication on `other_id`). + +| Priority | Source | Sites (approx.) | +|---|---|---| +| 1 | Ochoa et al. (2020) Nat Biotechnol 38:365-373, Supp. Table S3 | ~116k | +| 2 | UniProt PTM REST API | ~12k additional | +| 3 | Synapse supplement `syn70078415` (cNF raw phospho data) | ~188 additional | + +## Site notation + +``` +- +e.g. AAAS-S495s (s=phosphoserine, t=phosphothreonine, y=phosphotyrosine) +``` + +`other_id` in `phosphosites.csv` stores these strings and is the join key for +raw phospho data. + +## Output columns + +`phosphosite_id`, `entrez_id`, `gene_symbol`, `residue`, `position`, +`modification`, `other_id` + +## Files + +``` +coderbuild/phosphosites/ +├── 00-buildPhosphositeFile.py # Builder script +└── build_phosphosites.sh # Docker entry point +coderbuild/docker/ +└── Dockerfile.phosphosites +``` + +## Builder — `00-buildPhosphositeFile.py` + +**Ochoa (primary):** Downloads `41587_2019_344_MOESM4_ESM.xlsx` from the +Springer static CDN (no login required; requires `openpyxl`). Reads the +`annotated_phosphoproteome` sheet (`uniprot`, `position`, `residue` columns). +UniProt accessions are resolved to gene symbols via a separate UniProt REST +call (TSV, all reviewed human proteins, paginated via `Link: rel="next"` +header). + +**UniProt PTM (secondary):** Downloads `Modified residue` features with +phosphoserine/phosphothreonine/phosphotyrosine descriptions from the UniProt +REST API (JSON, same Link-header pagination). Returns gene symbols directly. 
diff --git a/coderbuild/phosphosites/README.md b/coderbuild/phosphosites/README.md
new file mode 100644
index 00000000..e2cf788f
--- /dev/null
+++ b/coderbuild/phosphosites/README.md
@@ -0,0 +1,97 @@
+# Phosphosites Reference Builder
+
+Produces `phosphosites.csv`, the global human phosphosite reference used by
+dataset omics pipelines (currently: cNF) to map raw phospho measurements to
+stable integer identifiers.
+
+## Sources
+
+Three sources are merged in priority order. A site from an earlier source is
+never overwritten by a later one (deduplication on `other_id`).
+
+| Priority | Source | Sites (approx.) |
+|---|---|---|
+| 1 | Ochoa et al. (2020) Nat Biotechnol 38:365-373, Supp. Table S3 | ~116k |
+| 2 | UniProt PTM REST API | ~12k additional |
+| 3 | Synapse supplement `syn70078415` (cNF raw phospho data) | ~188 additional |
+
+## Site notation
+
+```
+<GENE>-<Residue><Position><mod>
+e.g. AAAS-S495s (s=phosphoserine, t=phosphothreonine, y=phosphotyrosine)
+```
+
+`other_id` in `phosphosites.csv` stores these strings and is the join key for
+raw phospho data.
+
+## Output columns
+
+`phosphosite_id`, `entrez_id`, `gene_symbol`, `residue`, `position`,
+`modification`, `other_id`
+
+## Files
+
+```
+coderbuild/phosphosites/
+├── 00-buildPhosphositeFile.py   # Builder script
+└── build_phosphosites.sh        # Docker entry point
+coderbuild/docker/
+└── Dockerfile.phosphosites
+```
+
+## Builder — `00-buildPhosphositeFile.py`
+
+**Ochoa (primary):** Downloads `41587_2019_344_MOESM4_ESM.xlsx` from the
+Springer static CDN (no login required; requires `openpyxl`). Reads the
+`annotated_phosphoproteome` sheet (`uniprot`, `position`, `residue` columns).
+UniProt accessions are resolved to gene symbols via a separate UniProt REST
+call (TSV, all reviewed human proteins, paginated via the `Link: rel="next"`
+header).
+
+**UniProt PTM (secondary):** Downloads `Modified residue` features with
+phosphoserine/phosphothreonine/phosphotyrosine descriptions from the UniProt
+REST API (JSON, same Link-header pagination). Returns gene symbols directly.
+
+**Synapse supplement (`--synapse_supplements`):** Downloads a raw phospho
+file from Synapse, auto-detects the site column (`site`, `Site`,
+`phosphosite`, or `feature_id`), and appends any `GENE-ResPositionmod` sites
+not already in the reference. Requires `SYNAPSE_AUTH_TOKEN`. Failures are
+logged as warnings and do not abort the build.
+
+### Key arguments
+
+| Argument | Default | Description |
+|---|---|---|
+| `genes` (positional) | — | Path to `genes.csv` |
+| `--out` | `/tmp/phosphosites.csv` | Output path |
+| `--prev` | None | Previous CSV to preserve `phosphosite_id` values across builds |
+| `--synapse_supplements` | None | Synapse IDs of raw phospho files (space-separated) |
+| `--synapse_site_col` | `site` | Site column name in Synapse supplement files |
+| `--supplement` | None | Local CSV/TSV file(s) with site strings |
+| `--site_col` | `site` | Site column name in local supplement files |
+
+## Build sequencing
+
+Phosphosites must run **after** `genes.csv` is ready. `build_dataset.py` and
+`build_all.py` wait on the genes future before submitting the phosphosites
+container.
+
+## ID stability
+
+Pass `--prev <previous phosphosites.csv>` to preserve IDs across rebuilds.
+New sites receive IDs starting from `max(prev.phosphosite_id) + 1` — same
+pattern as `improve_sample_id` and `improve_drug_id`.
+
+## Local run (debugging only; this step is already built into `build_all.py`)
+
+```bash
+pip install pandas synapseclient openpyxl
+
+export SYNAPSE_AUTH_TOKEN=<your token>
+python coderbuild/phosphosites/00-buildPhosphositeFile.py \
+  local/genes.csv \
+  --out local/phosphosites.csv \
+  --synapse_supplements syn70078415 \
+  --synapse_site_col site
+```
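The README's site grammar is simple enough to pin down with a regex. A hypothetical parser (an illustration of the notation, not the script's actual implementation; the greedy gene match is an assumption):

```python
import re

# Assumed grammar from the README: <GENE>-<Residue><Position><mod>,
# e.g. "AAAS-S495s". The gene part is matched greedily so hyphenated
# gene symbols (e.g. "NKX2-1") still parse correctly.
SITE_RE = re.compile(r"^(?P<gene>.+)-(?P<residue>[STY])(?P<position>\d+)(?P<mod>[sty])$")

def parse_site(site: str) -> dict | None:
    """Split a GENE-ResPositionmod string into its components, or return None."""
    m = SITE_RE.match(site.strip())
    if not m:
        return None
    return {
        "gene_symbol": m.group("gene"),
        "residue": m.group("residue"),
        "position": int(m.group("position")),
        "mod_code": m.group("mod"),
    }

assert parse_site("AAAS-S495s") == {
    "gene_symbol": "AAAS", "residue": "S", "position": 495, "mod_code": "s",
}
assert parse_site("not-a-site") is None
```

Anchoring the residue/position/mod suffix to the end of the string is what keeps hyphenated gene symbols intact under the greedy match.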
diff --git a/coderbuild/phosphosites/build_phosphosites.sh b/coderbuild/phosphosites/build_phosphosites.sh
new file mode 100644
index 00000000..4662546d
--- /dev/null
+++ b/coderbuild/phosphosites/build_phosphosites.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+# build_phosphosites.sh: build the global phosphosites reference CSV.
+#
+# Usage:
+#   build_phosphosites.sh <genes.csv> [<previous phosphosites.csv>]
+#
+# Sources (in priority order):
+#   1. Ochoa et al. (2020) Nat Biotechnol Supplementary Table S3 (~116k sites)
+#   2. UniProt PTM annotations (~12k additional sites unique to UniProt)
+#   3. Synapse supplement: syn70078415 (cNF raw phospho — captures experiment-
+#      specific sites not yet in either database)
+
+set -euo pipefail
+
+GENES="${1:?Usage: build_phosphosites.sh <genes.csv> [<previous phosphosites.csv>]}"
+PREV="${2:-}"
+
+PREV_ARG=""
+if [ -n "$PREV" ] && [ -f "$PREV" ]; then
+    PREV_ARG="--prev $PREV"
+fi
+
+python /coderbuild/phosphosites/00-buildPhosphositeFile.py \
+    "$GENES" \
+    --out /tmp/phosphosites.csv \
+    --synapse_supplements syn70078415 \
+    --synapse_site_col site \
+    $PREV_ARG
+
+echo "Phosphosites built from Ochoa + UniProt PTM + Synapse supplement."
diff --git a/schema/coderdata.yaml b/schema/coderdata.yaml
index e434b47a..258fd6d5 100755
--- a/schema/coderdata.yaml
+++ b/schema/coderdata.yaml
@@ -13,6 +13,10 @@ slots:
   entrez_id:
     description: Gene id according to NCBI
     range: integer
+  phosphosite_id:
+    description: Unique identifier for a phosphorylation site (gene + residue + position)
+    identifier: true
+    range: integer
   improve_sample_id:
     description: Unique sample identifier for this dataset, generated by this package
     identifier: true
@@ -49,6 +53,24 @@ classes:
       gene_symbol:
         required: true
        description: HGNC Gene symbol or synomym
+  Phosphosite:
+    description: Reference table of phosphorylation sites; each row is a unique gene + residue + position combination.
+    slots:
+      - phosphosite_id
+      - entrez_id
+      - other_id
+      - other_id_source
+    attributes:
+      gene_symbol:
+        required: true
+        description: HGNC gene symbol
+      residue:
+        description: Phosphorylated amino acid residue (S, T, or Y)
+      position:
+        range: integer
+        description: Position of the residue in the canonical protein sequence
+      modification:
+        description: Type of modification (e.g. phospho)
   Sample:
     description: Unique samples identified for each experiment, used to cross reference omics data to drug data.
     slots:
@@ -122,6 +144,16 @@ classes:
       proteomics:
         range: float
         description: Log-normalized log ratio of proteomics measurements of sample at baseline.
+  Phosphoproteomics:
+    slots:
+      - phosphosite_id
+      - improve_sample_id
+      - source
+      - study
+    attributes:
+      phosphoproteomics:
+        range: float
+        description: Log-normalized, batch-corrected phosphoproteomics abundance value (correctedAbundance).
   Copy Number:
     slots:
       - entrez_id
@@ -157,7 +189,7 @@ classes:
       - time_unit
     attributes:
       dose_response_metric:
-        range: CurveMetric
+        range: ResponseMetric
         description: Metric by which dose response value is measured
       dose_response_value:
         description: Value of metric
@@ -218,7 +250,7 @@ enums:
     auc:
       description: Area under the curve measured by integrating data points
     dss:
-      description: I believe this is the drug sensitivity score
+      description: Drug sensitivity score.
    mrecist:
       description: For PDX data this value should be either Progressive Disease, Stable Disease, Partial Response or Complete Response.
     published_auc:
@@ -233,6 +265,13 @@
     tgi:
       description: tumor growth inhibitiojn
     kulgap:
       description: kl divergence based metric
+    uM_viability:
+      description:
+        Single-dose viability fraction at 1 μM. Values near 1.0 indicate no
+        effect (full viability); values near 0.0 indicate full kill. Values
+        above 1.0 are possible when treated cells proliferate beyond the
+        untreated control. Used for organoid drug screens where dose-response
+        curves were not collected for that drug.
   CellPerturbation:
     permissible_values:
       gene_ko:
@@ -248,6 +287,7 @@
       xenograft derived organoid:
       patient derived organoid:
       ex vivo:
+      normal_tissue:
   Variant:
     permissible_values:
       3'UTR:
diff --git a/schema/expected_files.yaml b/schema/expected_files.yaml
index 3de46533..f0ccf49e 100644
--- a/schema/expected_files.yaml
+++ b/schema/expected_files.yaml
@@ -294,4 +294,29 @@ datasets:
     - target_class: Drug
       file: /tmp/novartis_drugs.tsv
     - target_class: Drug Descriptor
-      file: /tmp/novartis_drug_descriptors.tsv
\ No newline at end of file
+      file: /tmp/novartis_drug_descriptors.tsv
+
+
+  cnf:
+    - target_class: Sample
+      file: /tmp/cnf_samples.csv
+    - target_class: Transcriptomics
+      file: /tmp/cnf_transcriptomics.csv
+    - target_class: Proteomics
+      file: /tmp/cnf_proteomics.csv
+    - target_class: Phosphoproteomics
+      file: /tmp/cnf_phosphoproteomics.csv
+    # - target_class: Mutations
+    #   file: /tmp/cnf_mutations.csv
+    # - target_class: Copy Number
+    #   file: /tmp/cnf_copy_number.csv
+    - target_class: Experiments
+      file: /tmp/cnf_experiments.tsv
+    - target_class: Drug
+      file: /tmp/cnf_drugs.tsv
+    - target_class: Drug Descriptor
+      file: /tmp/cnf_drug_descriptors.tsv
+
+  phosphosites:
+    - target_class: Phosphosite
+      file: /tmp/phosphosites.csv
\ No newline at end of file
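To show how the new `Phosphosite` and `Phosphoproteomics` classes fit together, here is a hypothetical sketch of the downstream join a dataset pipeline would perform. The raw-frame column names and the `source`/`study` values are invented; only the output columns follow the schema above:

```python
import pandas as pd

# Phosphosite reference as built by this PR (subset of columns).
ref = pd.DataFrame({
    "phosphosite_id": [1, 2],
    "other_id": ["AAAS-S495s", "TP53-S15s"],
})

# Hypothetical raw cNF phospho measurements, one row per site/sample pair.
raw = pd.DataFrame({
    "site": ["AAAS-S495s", "TP53-S15s", "UNKNOWN-S1s"],
    "improve_sample_id": [101, 101, 102],
    "correctedAbundance": [0.42, -1.17, 2.00],
})

# Inner join on the site string: unmatched sites drop out, matched ones
# pick up the stable integer phosphosite_id required by the schema.
phospho = (
    raw.merge(ref, left_on="site", right_on="other_id", how="inner")
       .rename(columns={"correctedAbundance": "phosphoproteomics"})
       .assign(source="cNF", study="cNF")
       [["phosphosite_id", "improve_sample_id", "phosphoproteomics",
         "source", "study"]]
)
print(phospho)  # two rows; UNKNOWN-S1s was not in the reference
```

An inner join makes dropped (unmapped) sites explicit in row counts, matching the builder's philosophy of logging how many sites each source contributes.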