diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f44de138..bf065d53 100755 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -343,4 +343,52 @@ jobs: sgosline/novartis:latest sgosline/novartis:${{ github.ref_name }} push: true + platforms: linux/amd64 + + build-cnf: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - name: Build and push cnf + uses: docker/build-push-action@v3 + with: + file: ./coderbuild/docker/Dockerfile.cnf + tags: | + sgosline/cnf:latest + sgosline/cnf:${{ github.ref_name }} + push: true + platforms: linux/amd64 + + build-phosphosites: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - name: Build and push phosphosites + uses: docker/build-push-action@v3 + with: + file: ./coderbuild/docker/Dockerfile.phosphosites + tags: | + sgosline/phosphosites:latest + sgosline/phosphosites:${{ github.ref_name }} + push: true platforms: linux/amd64 \ No newline at end of file diff --git a/coderbuild/build_all.py b/coderbuild/build_all.py index 067dd5dc..8e6c50dd 100644 --- a/coderbuild/build_all.py +++ b/coderbuild/build_all.py @@ -41,7 +41,7 @@ def main(): parser.add_argument('--figshare', action='store_true', help="Upload all local data to Figshare. FIGSHARE_TOKEN must be set in local environment.") parser.add_argument('--all',dest='all',default=False,action='store_true', help="Run all data build commands. This includes docker, samples, omics, drugs, exp arguments. This does not run the validate or figshare commands") parser.add_argument('--high_mem',dest='high_mem',default=False,action='store_true',help = "If you have 32 or more CPUs, this option is recommended. It will run many code portions in parallel. If you don't have enough memory, this will cause a run failure.") - parser.add_argument('--dataset',dest='datasets',default='broad_sanger,beataml,pancreatic,bladder,sarcoma,liver,novartis,colorectal,mpnst,hcmi',help='Datasets to process. Defaults to all available.') + parser.add_argument('--dataset',dest='datasets',default='broad_sanger,beataml,pancreatic,bladder,sarcoma,liver,novartis,colorectal,mpnst,cnf,hcmi',help='Datasets to process. Defaults to all available.') parser.add_argument('--version', type=str, required=False, help='Version number for the Figshare upload title (e.g., "0.1.29"). This is required for Figshare upload. 
This must be a higher version than previously published versions.')
     parser.add_argument('--github-username', type=str, required=False, help='GitHub username for the repository.')
     parser.add_argument('--github-email', type=str, required=False, help='GitHub email for the repository.')
@@ -65,7 +65,7 @@ def run_docker_cmd(cmd_arr,filename):
     print('running...'+filename)
     env = os.environ.copy()
     if 'SYNAPSE_AUTH_TOKEN' not in env.keys():
-        print('You need to set the SYNAPSE_AUTH_TOKEN to acess the MPNST, beatAML, bladder, pancreatic, liver, or sarcoma datasets')
+        print('You need to set the SYNAPSE_AUTH_TOKEN to access the MPNST, beatAML, bladder, pancreatic, liver, sarcoma, or cnf datasets')
         docker_run = ['docker','run','--rm','-v',env['PWD']+'/local/:/tmp/','--platform=linux/amd64']
     else:
         docker_run = ['docker','run','--rm','-v',env['PWD']+'/local/:/tmp/','-e','SYNAPSE_AUTH_TOKEN='+env['SYNAPSE_AUTH_TOKEN'],'--platform=linux/amd64']
@@ -113,13 +113,17 @@ def process_docker(datasets):
         'colorectal': ['colorectal'],
         'cptac': ['cptac'],
         'genes': ['genes'],
+        'phosphosites': ['phosphosites'],
         'upload': ['upload'],
         'liver': ['liver'],
-        'novartis': ['novartis']
+        'novartis': ['novartis'],
+        'cnf': ['cnf']
     }

     # Collect container names to build based on the datasets provided. Always build genes and upload.
     datasets_to_build = ['genes', 'upload']
+    if 'cnf' in datasets:
+        datasets_to_build.append('phosphosites')
     for dataset in datasets:
         datasets_to_build.extend(dataset_map.get(dataset, []))

@@ -188,21 +192,35 @@ def process_samples(executor, datasets):
             last_sample_future = executor.submit(run_docker_cmd, [di, 'bash', 'build_samples.sh', sf], f'{da} samples')
             sf = f'/tmp/{da}_samples.csv'

-def process_omics(executor, datasets,high_mem):
+def process_phosphosites(executor):
+    '''
+    Build the phosphosites reference file if it does not exist.
+    Caller must ensure genes.csv exists first. Returns a Future (or None).
+    '''
+    if not os.path.exists('local/phosphosites.csv'):
+        return executor.submit(
+            run_docker_cmd,
+            ['phosphosites', 'bash', 'build_phosphosites.sh', '/tmp/genes.csv'],
+            'phosphosites file',
+        )
+    return None
+
+def process_omics(executor, datasets, high_mem):
     '''
     Build all omics files concurrently
     '''
     last_omics_future = None
     for da in datasets:
         di = 'broad_sanger_omics' if da == 'broad_sanger' else da
-        #Run all at once:
+        omics_cmd = [di, 'bash', 'build_omics.sh', '/tmp/genes.csv', f'/tmp/{da}_samples.csv']
+        if da == 'cnf':
+            omics_cmd.append('/tmp/phosphosites.csv')
         if high_mem:
-            executor.submit(run_docker_cmd, [di, 'bash', 'build_omics.sh', '/tmp/genes.csv', f'/tmp/{da}_samples.csv'], f'{da} omics')
-        #Run one at a time.
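+            # For cnf, omics_cmd (built above) also carries /tmp/phosphosites.csv,
+            # so its build_omics.sh receives the phosphosite reference alongside
+            # genes.csv and the samples file.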
+            executor.submit(run_docker_cmd, omics_cmd, f'{da} omics')
         else:
             if last_omics_future:
-                last_omics_future.result()
-            last_omics_future = executor.submit(run_docker_cmd, [di, 'bash', 'build_omics.sh', '/tmp/genes.csv', f'/tmp/{da}_samples.csv'], f'{da} omics')
+                last_omics_future.result()
+            last_omics_future = executor.submit(run_docker_cmd, omics_cmd, f'{da} omics')

 def process_experiments(executor, datasets, high_mem):
     '''
@@ -247,8 +265,9 @@ def process_misc(executor, datasets, high_mem):

 def process_genes(executor):
-    if not os.path.exists('/tmp/genes.csv'):
-        executor.submit(run_docker_cmd,['genes','bash','build_genes.sh'],'gene file')
+    if not os.path.exists('local/genes.csv'):
+        return executor.submit(run_docker_cmd,['genes','bash','build_genes.sh'],'gene file')
+    return None

 def run_docker_upload_cmd(cmd_arr, all_files_dir, name, version):
@@ -320,9 +339,9 @@ def get_latest_commit_hash(owner, repo, branch='main'):
     # Error handling for required tokens
     if args.figshare and not figshare_token:
         raise ValueError("FIGSHARE_TOKEN environment variable is not set.")
-    if any(dataset in args.datasets for dataset in ['beataml', 'mpnst', 'bladder', 'pancreatic','sarcoma','liver','novartis','colorectal']) and not synapse_auth_token:
+    if any(dataset in args.datasets for dataset in ['beataml', 'mpnst', 'bladder', 'pancreatic','sarcoma','liver','novartis','colorectal','cnf']) and not synapse_auth_token:
         if args.docker or args.samples or args.omics or args.drugs or args.exp or args.all: # Token only required if building data, not upload or validate.
-            raise ValueError("SYNAPSE_AUTH_TOKEN is required for accessing MPNST, beatAML, bladder, pancreatic, liver, novartis, colorectal or sarcoma datasets.")
+            raise ValueError("SYNAPSE_AUTH_TOKEN is required for accessing MPNST, beatAML, bladder, pancreatic, liver, novartis, colorectal, sarcoma, or cnf datasets.")

     ######
     ### Begin Pipeline
@@ -342,25 +361,34 @@ def get_latest_commit_hash(owner, repo, branch='main'):
         print("Docker image generation completed")

-    ### Build Drugs files, Samples files, and Genes file. These two steps are run in Parallel. 
+    ### Build Drugs files, Samples files, and Genes file. These steps are run in parallel.
     ### Within each step, sequential running is required.
     with ThreadPoolExecutor() as executor:
         if args.samples or args.all:
             sample_thread = executor.submit(process_samples,executor, datasets)
         if args.drugs or args.all:
             drug_thread = executor.submit(process_drugs,executor, datasets)
+
+        # Genes must finish before phosphosites can start (phosphosites reads genes.csv).
+        # Run genes now and wait; then submit phosphosites (which can overlap with samples/drugs).
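+        # Blocking on the genes future here stalls only this coordinating
+        # thread; the samples/drugs futures submitted above keep running
+        # in the executor in the meantime.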
         if args.samples or args.omics or args.exp or args.all:
-            gene_thread = executor.submit(process_genes,executor)
-
-        # Wait for both processes to complete before proceeding to omics and experiments
+            genes_future = process_genes(executor)
+            if genes_future is not None:
+                genes_future.result()
+
+        phosphosite_future = None
+        if (args.omics or args.all) and 'cnf' in datasets:
+            phosphosite_future = process_phosphosites(executor)
+
+        # Wait for all remaining tasks to complete before proceeding to omics and experiments
         if args.drugs or args.all:
             drug_thread.result()
-        if args.samples or args.all:##need to wait for samples for all of these
+        if args.samples or args.all:
             sample_thread.result()
-        if args.samples or args.omics or args.exp or args.all:
-            gene_thread.result()
-
-        print("All samples, drugs files, and genes file completed or skipped")
+        if phosphosite_future is not None:
+            phosphosite_future.result()
+
+        print("All samples, drugs, genes, and phosphosites files completed or skipped")

     ### At this point in the pipeline, all samples and drugs files have been created. There are no blockers to proceed.
@@ -399,7 +427,7 @@ def get_latest_commit_hash(owner, repo, branch='main'):

     # if args.figshare or args.validate:
     # FigShare File Prefixes:
-    prefixes = ['beataml', 'hcmi', 'cptac', 'pancreatic', 'bladder','sarcoma', 'genes', 'drugs', 'liver','novartis','colorectal','mpnst']
+    prefixes = ['beataml', 'hcmi', 'cptac', 'pancreatic', 'bladder', 'sarcoma', 'genes', 'drugs', 'liver', 'novartis', 'colorectal', 'mpnst', 'cnf', 'phosphosites']
     broad_sanger_datasets = ["ccle","ctrpv2","fimm","gdscv1","gdscv2","gcsi","prism","nci60"]
     if "broad_sanger" in datasets:
         prefixes.extend(broad_sanger_datasets)
diff --git a/coderbuild/build_dataset.py b/coderbuild/build_dataset.py
index 7c5ed10c..de5085d0 100644
--- a/coderbuild/build_dataset.py
+++ b/coderbuild/build_dataset.py
@@ -45,11 +45,13 @@ def process_docker(dataset,validate):
         'cptac': ['cptac'],
         'sarcoma': ['sarcoma'],
         'genes': ['genes'],
+        'phosphosites': ['phosphosites'],
         'upload': ['upload'],
-        'colorectal': ['colorectal'], 
+        'colorectal': ['colorectal'],
         'bladder': ['bladder'],
         'liver': ['liver'],
-        'novartis': ['novartis']
+        'novartis': ['novartis'],
+        'cnf': ['cnf']
     }

     # Collect container names to build based on the dataset provided. Always build 'genes'.
@@ -57,7 +59,10 @@ def process_docker(dataset,validate):
         datasets_to_build.append('upload')
-
+    # phosphosites container is required when building cnf
+    if dataset == 'cnf':
+        datasets_to_build.append('phosphosites')
+
     datasets_to_build.extend(dataset_map.get(dataset, []))

     compose_command = ['docker', 'compose', '-f', compose_file, 'build'] + datasets_to_build
@@ -79,10 +84,26 @@ def process_docker(dataset,validate):

 def process_genes(executor):
     '''
-    Build the genes file if it does not exist.
+    Build the genes file if it does not exist. Returns a Future (or None if already built).
     '''
     if not os.path.exists('local/genes.csv'):
-        executor.submit(run_docker_cmd, ['genes', 'bash', 'build_genes.sh'], 'genes file')
+        return executor.submit(run_docker_cmd, ['genes', 'bash', 'build_genes.sh'], 'genes file')
+    return None
+
+
+def process_phosphosites(executor):
+    '''
+    Build the phosphosites reference file if it does not exist.
+    Only needed when the cnf dataset is being built. Returns a Future (or None).
+    Caller must ensure genes.csv exists before calling this.
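+    Safe to call repeatedly: once local/phosphosites.csv exists this is a no-op.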
+ ''' + if not os.path.exists('local/phosphosites.csv'): + return executor.submit( + run_docker_cmd, + ['phosphosites', 'bash', 'build_phosphosites.sh', '/tmp/genes.csv'], + 'phosphosites file', + ) + return None def process_samples(executor, dataset, use_prev_dataset, should_continue): ''' @@ -133,7 +154,8 @@ def process_omics(executor, dataset, should_continue): 'bladder': ['copy_number', 'mutations', 'transcriptomics'], 'colorectal':['copy_number', 'mutations', 'transcriptomics'], 'novartis':['copy_number', 'mutations', 'transcriptomics'], - 'liver':['copy_number', 'mutations', 'transcriptomics','proteomics'] + 'liver':['copy_number', 'mutations', 'transcriptomics','proteomics'], + 'cnf': ['transcriptomics', 'proteomics', 'phosphoproteomics'], } expected_omics = dataset_omics_files.get(dataset, []) @@ -167,7 +189,10 @@ def process_omics(executor, dataset, should_continue): di = 'broad_sanger_omics' if dataset == 'broad_sanger' else dataset filename = f'{dataset} omics' - executor.submit(run_docker_cmd, [di, 'bash', 'build_omics.sh', '/tmp/genes.csv', f'/tmp/{dataset}_samples.csv'], filename) + omics_cmd = [di, 'bash', 'build_omics.sh', '/tmp/genes.csv', f'/tmp/{dataset}_samples.csv'] + if dataset == 'cnf': + omics_cmd.append('/tmp/phosphosites.csv') + executor.submit(run_docker_cmd, omics_cmd, filename) def process_experiments(executor, dataset, should_continue): @@ -245,6 +270,8 @@ def run_schema_checker(dataset): ''' # Prepare the directory with the built files prefixes = ['genes', dataset] + if dataset == 'cnf': + prefixes.append('phosphosites') datasets = [dataset] broad_sanger_datasets = ["ccle","ctrpv2","fimm","gdscv1","gdscv2","gcsi","prism","nci60"] all_files_dir = 'all_files_dir' @@ -292,16 +319,25 @@ def main(): if args.build: # Use ThreadPoolExecutor for parallel execution with ThreadPoolExecutor() as executor: - # Always build genes file - process_genes(executor) + # Genes must finish before phosphosites can start (phosphosites reads genes.csv) + genes_future = process_genes(executor) + if genes_future is not None: + genes_future.result() + + # Now safe to start phosphosites (genes.csv is present) + phosphosites_future = None + if args.dataset == 'cnf': + phosphosites_future = process_phosphosites(executor) - # Build samples and drugs + # Build samples and drugs in parallel while phosphosites runs samples_future = executor.submit(process_samples, executor, args.dataset, args.use_prev_dataset, args.should_continue) drugs_future = executor.submit(process_drugs, executor, args.dataset, args.use_prev_dataset, args.should_continue) samples_future.result() drugs_future.result() - + if phosphosites_future is not None: + phosphosites_future.result() + print("Samples and Drugs Files Completed.") with ThreadPoolExecutor() as executor: diff --git a/coderbuild/cnf/01-samples-cnf.py b/coderbuild/cnf/01-samples-cnf.py new file mode 100644 index 00000000..41bb14ea --- /dev/null +++ b/coderbuild/cnf/01-samples-cnf.py @@ -0,0 +1,372 @@ +#!/usr/bin/env python3 +""" +01-samples-cnf.py: generate cnf_samples.csv for the cNF organoid drug screen. + +Specimens are gathered from four sources: + + 1. Drug-screen file index on Synapse (syn51301431, dataType='drug screen') + 2. Global proteomics matrix (env CNF_GLOBAL_PROT_SYN_ID, default syn74815895) + 3. RNA discovery file (env CNF_RNA_DISCOVERY_SYN_ID, default syn71333780). 
This is the corrected-abundance file containing every RNA condition
+     processed for the project (organoid, tumor tissue, drug-treated
+     organoid, skin), used here for sample discovery only — NOT to be
+     confused with CNF_RNA_COHORT_SYN_IDS, the per-cohort TPM matrices
+     that build_omics.py reads for the actual TPM data (and which may be
+     filtered to organoids only).
+  4. Normal Skin Synapse folder (env CNF_NORMAL_SKIN_SYN_ID, default syn74284682)
+
+Each specimen is classified as one of:
+  - "organoid"          → model_type = "patient derived organoid"
+  - "treated_organoid"  → model_type = "patient derived organoid"
+                          (drug-treated organoid; canonical ID preserves the
+                          treatment label, e.g. NF0021_T1_Onalespid_1uM)
+  - "tumor_tissue"      → model_type = "tumor"
+  - "normal_skin"       → model_type = "normal_tissue"
+
+Tissue and organoid samples for the same tumor are kept as separate rows with
+distinct improve_sample_ids — they have different biology (fresh tumor vs.
+cultured organoid) and the schema model_types differ. Treated and untreated
+organoids of the same tumor are also separate rows because their canonical
+IDs differ (treated samples preserve the treatment label).
+
+Cohort assignment is intentionally omitted. The upstream omics data is
+batch-corrected and per-modality cohort membership disagrees for some
+specimens, so a single per-sample cohort label cannot be assigned cleanly.
+The `other_id_source` field carries a uniform project label.
+
+Sample IDs continue from the previous samples file: max(improve_sample_id) + 1.
+
+Usage:
+    python 01-samples-cnf.py <prev_samples> [--output /tmp/cnf_samples.csv]
+"""
+
+import argparse
+import logging
+import os
+import re
+import sys
+
+import pandas as pd
+import synapseclient
+
+from cnf_utils import (
+    MODEL_TYPE_MAP,
+    classify_specimen,
+    patient_from_specimen,
+)
+
+
+# ----------------------------------------------------------------------------
+# Constants
+# ----------------------------------------------------------------------------
+DRUG_SCREEN_TABLE = "syn51301431"
+GLOBAL_PROT_SYN_ID = os.environ.get("CNF_GLOBAL_PROT_SYN_ID", "syn74815895")
+# RNA file used for SPECIMEN DISCOVERY only — needs to contain all
+# conditions (organoid + tissue + treated + skin). The corrected-abundance
+# file at syn71333780 fits this (the column is named correctedAbundance
+# but stores TPM values).
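+# Illustrative override (the ID below is a placeholder, not a real file):
+#   export CNF_RNA_DISCOVERY_SYN_ID=syn00000000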
+RNA_DISCOVERY_SYN_ID = os.environ.get("CNF_RNA_DISCOVERY_SYN_ID", "syn71333780") +NORMAL_SKIN_SYN_ID = os.environ.get("CNF_NORMAL_SKIN_SYN_ID", "syn74284682") + +DEFAULT_OUTPUT = "/tmp/cnf_samples.csv" + +CANCER_TYPE = "Cutaneous Neurofibroma" +SPECIES = "Homo sapiens (Human)" +OTHER_ID_SOURCE = "NF1_cNF_Project" + + +# ---------------------------------------------------------------------------- +# Helpers +# ---------------------------------------------------------------------------- +def configure_logging(): + logging.basicConfig( + format="[%(asctime)s] [%(levelname)s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO, + ) + + +def get_starting_id(prev_samples_path: str) -> int: + """Read previous samples file and return max(improve_sample_id) + 1.""" + if not os.path.exists(prev_samples_path): + logging.warning("Previous samples file %s not found; starting at 1.", + prev_samples_path) + return 1 + + df = pd.read_csv(prev_samples_path) + if "improve_sample_id" not in df.columns or df.empty: + logging.warning("No improve_sample_id column or empty file; starting at 1.") + return 1 + + max_id = int(pd.to_numeric(df["improve_sample_id"], errors="coerce").max()) + logging.info("Previous samples max improve_sample_id = %d", max_id) + return max_id + 1 + + +def merge_specimen_dicts(*dicts: dict[str, str]) -> dict[str, str]: + """Union per-source dicts; warn on type disagreements (first-seen wins).""" + merged: dict[str, str] = {} + for d in dicts: + for spec, stype in d.items(): + if spec in merged and merged[spec] != stype: + logging.warning( + "Type conflict for %s: %s (kept) vs %s (ignored)", + spec, merged[spec], stype, + ) + continue + merged.setdefault(spec, stype) + return merged + + +# ---------------------------------------------------------------------------- +# Source-specific specimen extraction +# ---------------------------------------------------------------------------- +def specimens_from_drug_screen(syn) -> dict[str, str]: + """Pull canonical_id → sample_type for drug-screen specimens.""" + query = ( + "select id, individualID, specimenID " + f"from {DRUG_SCREEN_TABLE} " + "where dataType='drug screen'" + ) + logging.info("Querying %s for drug-screen specimens", DRUG_SCREEN_TABLE) + table = syn.tableQuery(query).asDataFrame() + raw = table["specimenID"].dropna().astype(str).unique().tolist() + out: dict[str, str] = {} + for r in raw: + result = classify_specimen(r) + if result: + cid, stype = result + out[cid] = stype + by_type = _count_by_type(out) + logging.info(" Drug screen: %s", by_type) + return out + + +def _count_by_type(d: dict[str, str]) -> dict[str, int]: + out: dict[str, int] = {} + for stype in d.values(): + out[stype] = out.get(stype, 0) + 1 + return out + + +def _detect_specimen_column(df: pd.DataFrame) -> str: + """Return the name of the specimen column in an omics file.""" + candidates = ["Specimen", "specimen", "specimen_norm", "sample_id", + "specimenID", "Sample"] + for c in candidates: + if c in df.columns: + return c + raise KeyError( + f"No specimen column among {candidates} in columns {list(df.columns)[:15]}" + ) + + +def _read_synapse_table(syn, syn_id: str) -> pd.DataFrame: + """Fetch a Synapse file and read it as CSV/TSV.""" + logging.info(" Fetching %s", syn_id) + entity = syn.get(syn_id) + path = entity.path + if path.endswith(".tsv") or path.endswith(".tsv.gz"): + return pd.read_csv(path, sep="\t") + return pd.read_csv(path) + + +def specimens_from_proteomics(syn) -> dict[str, str]: + """Pull canonical_id → sample_type from the 
global proteomics matrix."""
+    if not GLOBAL_PROT_SYN_ID:
+        logging.warning("CNF_GLOBAL_PROT_SYN_ID not set; skipping proteomics")
+        return {}
+
+    logging.info("Reading proteomics specimens")
+    try:
+        df = _read_synapse_table(syn, GLOBAL_PROT_SYN_ID)
+        col = _detect_specimen_column(df)
+        raw = df[col].dropna().astype(str).unique().tolist()
+        out: dict[str, str] = {}
+        for r in raw:
+            result = classify_specimen(r)
+            if result:
+                cid, stype = result
+                out[cid] = stype
+        logging.info("  Proteomics: %s", _count_by_type(out))
+        return out
+    except Exception as exc:
+        logging.warning("  Proteomics read failed: %s", exc)
+        return {}
+
+
+def specimens_from_rna_discovery(syn) -> dict[str, str]:
+    """Discover RNA specimens from a comprehensive long-format RNA file.
+
+    Reads CNF_RNA_DISCOVERY_SYN_ID (default syn71333780) — the corrected-
+    abundance RNA file that contains every condition processed for the
+    project: organoid + tumor tissue + drug-treated organoid + skin.
+
+    This is intentionally a different source from the per-cohort TPM
+    matrices read by build_omics.py (CNF_RNA_COHORT_SYN_IDS, gene-level
+    TPM that may be filtered to organoids only). Sample discovery and data
+    ingestion have different requirements: discovery wants the maximum
+    sample list; ingestion wants TPM values.
+
+    Specimens are passed through classify_specimen, so drug-treated
+    samples like NF0021_T1_Onalespid.1uM get caught regardless of their
+    condition_norm value.
+    """
+    if not RNA_DISCOVERY_SYN_ID:
+        logging.warning("CNF_RNA_DISCOVERY_SYN_ID not set; skipping RNA discovery")
+        return {}
+
+    logging.info("Reading RNA discovery file (%s)", RNA_DISCOVERY_SYN_ID)
+    try:
+        df = _read_synapse_table(syn, RNA_DISCOVERY_SYN_ID)
+        col = _detect_specimen_column(df)
+
+        raw = df[col].dropna().astype(str).unique().tolist()
+        out: dict[str, str] = {}
+        for r in raw:
+            result = classify_specimen(r)
+            if result:
+                cid, stype = result
+                out[cid] = stype
+        logging.info("  RNA discovery: %s", _count_by_type(out))
+        return out
+    except Exception as exc:
+        logging.warning("  RNA discovery read failed: %s", exc)
+        return {}
+
+
+def specimens_from_normal_skin_tree(syn) -> dict[str, str]:
+    """Discover skin specimens by listing the Normal Skin Synapse folder.
+
+    The "Normal Skin" tree contains one immediate child per patient (folders
+    named NF0017, NF0018, ...) plus, in some cases, top-level skin FASTQ
+    files like NF0035-SKIN-1-04-16-2025_*.fastq.gz. Both forms encode the
+    patient ID at the start, so we match `^NF\\d{4}` against each child name
+    and emit one canonical NFxxxx_skin entry per unique patient.
+
+    This is intentionally one API call deep — it avoids walking the whole
+    WGS tree (which has hundreds of empty lane-stub folders). For most
+    cases this captures every patient with a matched normal.
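+    (Patients whose skin files sit deeper than one folder level would be
+    missed; picking those up would need a recursive walk rather than a
+    single getChildren call.)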
+ """ + if not NORMAL_SKIN_SYN_ID: + logging.warning("CNF_NORMAL_SKIN_SYN_ID not set; skipping Normal Skin tree") + return {} + + logging.info("Listing Normal Skin tree (%s)", NORMAL_SKIN_SYN_ID) + out: dict[str, str] = {} + try: + for child in syn.getChildren(NORMAL_SKIN_SYN_ID): + name = str(child.get("name", "")) + m = re.match(r"^(NF\d{4})", name) + if m: + patient = m.group(1) + out[f"{patient}_skin"] = "normal_skin" + logging.info(" Normal Skin tree: %s", _count_by_type(out)) + except Exception as exc: + logging.warning(" Normal Skin tree listing failed: %s", exc) + return out + + +# ---------------------------------------------------------------------------- +# Sample table construction +# ---------------------------------------------------------------------------- +def build_samples_table(specimens: dict[str, str], start_id: int) -> pd.DataFrame: + """Build the schema-compliant Sample table from {canonical_id: sample_type}. + + Sort order: organoid → treated_organoid → tumor_tissue → normal_skin + (each by ID). Drug-modelable untreated organoids get the lowest + improve_sample_ids — useful for downstream filtering. + """ + sort_priority = { + "organoid": 0, + "treated_organoid": 1, + "tumor_tissue": 2, + "normal_skin": 3, + } + sorted_specimens = sorted( + specimens.items(), + key=lambda kv: (sort_priority.get(kv[1], 99), kv[0]), + ) + + rows = [] + next_id = start_id + for specimen, stype in sorted_specimens: + patient = patient_from_specimen(specimen) + rows.append({ + "improve_sample_id": next_id, + "other_id": specimen, + "other_id_source": OTHER_ID_SOURCE, + "common_name": specimen, + "cancer_type": CANCER_TYPE, + "other_names": patient, + "species": SPECIES, + "model_type": MODEL_TYPE_MAP[stype], + }) + next_id += 1 + + df = pd.DataFrame(rows) + logging.info("Built %d sample rows; ID range %d–%d", + len(df), start_id, next_id - 1) + return df + + +# ---------------------------------------------------------------------------- +# Main +# ---------------------------------------------------------------------------- +def main(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "prev_samples", + help="Path to previous samples CSV (used to find max improve_sample_id)", + ) + parser.add_argument("--output", default=DEFAULT_OUTPUT, + help=f"Output CSV path (default: {DEFAULT_OUTPUT})") + args = parser.parse_args() + + configure_logging() + + syn = synapseclient.Synapse() + syn.login() + + # Pull specimens from each source as canonical_id → sample_type dicts + drug_d = specimens_from_drug_screen(syn) + prot_d = specimens_from_proteomics(syn) + rna_d = specimens_from_rna_discovery(syn) + skin_d = specimens_from_normal_skin_tree(syn) + + all_specimens = merge_specimen_dicts(drug_d, prot_d, rna_d, skin_d) + logging.info( + "Union: %d total specimens — %s", + len(all_specimens), _count_by_type(all_specimens), + ) + + # Per-source uniqueness diagnostics + drug_set, prot_set, rna_set, skin_set = ( + set(drug_d), set(prot_d), set(rna_d), set(skin_d), + ) + only_drug = drug_set - prot_set - rna_set - skin_set + only_prot = prot_set - drug_set - rna_set - skin_set + only_rna = rna_set - drug_set - prot_set - skin_set + only_skin = skin_set - drug_set - prot_set - rna_set + if only_drug: + logging.info(" Only in drug screen: %s", sorted(only_drug)) + if only_prot: + logging.info(" Only in proteomics: %s", sorted(only_prot)) + if only_rna: + logging.info(" Only in RNA: %s", sorted(only_rna)) + if only_skin: + logging.info(" Only in Normal Skin tree: %s", 
sorted(only_skin))
+
+    if not all_specimens:
+        logging.error("No specimens found from any source; aborting")
+        sys.exit(1)
+
+    start_id = get_starting_id(args.prev_samples)
+    samples = build_samples_table(all_specimens, start_id)
+
+    os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
+    samples.to_csv(args.output, index=False)
+    logging.info("Wrote %s (%d rows)", args.output, len(samples))
+
+
+if __name__ == "__main__":
+    sys.exit(main() or 0)
diff --git a/coderbuild/cnf/02-omics-cnf.py b/coderbuild/cnf/02-omics-cnf.py
new file mode 100644
index 00000000..bdbe0fac
--- /dev/null
+++ b/coderbuild/cnf/02-omics-cnf.py
@@ -0,0 +1,389 @@
+#!/usr/bin/env python3
+"""
+02-omics-cnf.py: generate cnf_transcriptomics.csv, cnf_proteomics.csv,
+                 and cnf_phosphoproteomics.csv.
+
+For the cNF dataset:
+
+* Transcriptomics: raw gene-level TPM from per-cohort salmon.merged.gene_tpm.tsv
+  files (wide matrix: rows = genes, columns = specimens). Default cohorts:
+      syn66352931 — cNF_Cohort_1 (NF0017–NF0023)
+      syn70765053 — cNF_Cohort_2 (NF0025, NF0027, NF0035)
+  Override via env var CNF_RNA_COHORT_SYN_IDS (comma-separated Synapse IDs).
+
+* Proteomics: batch-corrected global proteomics from syn74815895
+  (correctedAbundance column). Override via env var CNF_GLOBAL_PROT_SYN_ID.
+
+* Phosphoproteomics: batch-corrected phospho-proteomics from syn70078415
+  (correctedAbundance column). Mapped to phosphosite_ids via a phosphosites.csv
+  reference file. Override via env var CNF_PHOSPHO_SYN_ID.
+
+* Protocol optimization samples are excluded from RNA via DROP_SAMPLE_SUBSTRINGS.
+
+Specimens are canonicalized via cnf_utils so that values like
+"NF0018.T1.organoid" resolve to the canonical IDs in cnf_samples.csv.
+Source rows whose specimen doesn't match any entry in cnf_samples.csv are
+dropped silently via the inner join to sample_map.
+
+Usage:
+    python 02-omics-cnf.py <genes> <samples> <phosphosites>
+        [--out_rna /tmp/cnf_transcriptomics.csv]
+        [--out_prot /tmp/cnf_proteomics.csv]
+        [--out_phospho /tmp/cnf_phosphoproteomics.csv]
+"""
+
+import argparse
+import logging
+import os
+import re
+import sys
+
+import pandas as pd
+import synapseclient
+
+from cnf_utils import canonicalize_specimen_column
+
+
+# ----------------------------------------------------------------------------
+# Defaults / Synapse IDs
+# ----------------------------------------------------------------------------
+_rna_env = os.environ.get("CNF_RNA_COHORT_SYN_IDS", "")
+RNA_TPM_COHORT_SYN_IDS: list[str] = (
+    [s.strip() for s in _rna_env.split(",") if s.strip()]
+    if _rna_env
+    else ["syn66352931", "syn70765053"]
+)
+
+GLOBAL_PROT_SYN_ID = os.environ.get("CNF_GLOBAL_PROT_SYN_ID", "syn74815895")
+PHOSPHO_SYN_ID = os.environ.get("CNF_PHOSPHO_SYN_ID", "syn70078415")
+
+# Protocol-optimization samples to drop from RNA.
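+# Matching is substring-based against each column's basename stem (see
+# _drop_protocol_cols below), so these entries need not be full column names.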
+DROP_SAMPLE_SUBSTRINGS: list[str] = [ + "cNF_organoid_DIA_G_02_11Feb25", + "cNF_organoid_DIA_G_05_11Feb25", + "cNF_organoid_DIA_G_06_11Feb25", + "cNF_organoid_DIA_P_02_29Jan25", + "cNF_organoid_DIA_P_05_11Feb25", + "cNF_organoid_DIA_P_06_11Feb25", +] + +STUDY = "cnf" +DEFAULT_OUT_RNA = "/tmp/cnf_transcriptomics.csv" +DEFAULT_OUT_PROT = "/tmp/cnf_proteomics.csv" +DEFAULT_OUT_PHOSPHO = "/tmp/cnf_phosphoproteomics.csv" + + +def configure_logging(): + logging.basicConfig( + format="[%(asctime)s] [%(levelname)s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO, + ) + + +# ---------------------------------------------------------------------------- +# Reference data loaders +# ---------------------------------------------------------------------------- +def load_gene_map(genes_path: str) -> pd.DataFrame: + """Load genes.csv. Required columns: gene_symbol, entrez_id.""" + df = pd.read_csv(genes_path) + needed = {"gene_symbol", "entrez_id"} + missing = needed - set(df.columns) + if missing: + raise KeyError(f"genes file missing columns: {missing}") + df = df.dropna(subset=["entrez_id", "gene_symbol"]).copy() + df["entrez_id"] = df["entrez_id"].astype(int) + df = df.drop_duplicates(subset="gene_symbol") + logging.info("Loaded %d gene_symbol → entrez_id mappings", len(df)) + return df[["gene_symbol", "entrez_id"]] + + +def load_sample_map(samples_path: str) -> pd.DataFrame: + """Load cnf_samples.csv; return (other_id, improve_sample_id) frame.""" + df = pd.read_csv(samples_path) + df = df[["other_id", "improve_sample_id"]].drop_duplicates() + df["improve_sample_id"] = df["improve_sample_id"].astype(int) + logging.info("Loaded %d sample mappings", len(df)) + return df + + +def load_phosphosite_map(phosphosites_path: str) -> pd.DataFrame: + """Load phosphosites.csv; return (other_id, phosphosite_id) frame.""" + df = pd.read_csv(phosphosites_path) + needed = {"other_id", "phosphosite_id"} + missing = needed - set(df.columns) + if missing: + raise KeyError(f"phosphosites file missing columns: {missing}") + df = df[["other_id", "phosphosite_id"]].drop_duplicates() + df["phosphosite_id"] = df["phosphosite_id"].astype(int) + logging.info("Loaded %d phosphosite mappings", len(df)) + return df + + +def fetch_synapse_table(syn, syn_id: str) -> pd.DataFrame: + """Download a Synapse file and read it as CSV/TSV.""" + if not syn_id: + raise ValueError("Synapse ID is empty") + logging.info("Fetching %s from Synapse", syn_id) + entity = syn.get(syn_id) + path = entity.path + if path.endswith(".tsv") or path.endswith(".tsv.gz"): + return pd.read_csv(path, sep="\t") + return pd.read_csv(path) + + +# ---------------------------------------------------------------------------- +# Shared helpers +# ---------------------------------------------------------------------------- +def _col_stem(col: str) -> str: + """Return the basename without extension for a path-like column name.""" + base = re.sub(r"^.*[/\\]", "", col) + return re.sub(r"\.[^.]+$", "", base) + + +def _drop_protocol_cols(cols: list[str]) -> list[str]: + """Remove columns whose stem matches any DROP_SAMPLE_SUBSTRINGS entry.""" + kept = [] + for c in cols: + stem = _col_stem(c) + if any(sub in stem for sub in DROP_SAMPLE_SUBSTRINGS): + logging.info(" Dropping protocol optimization sample: %s", stem) + else: + kept.append(c) + return kept + + +def _normalize_col(df: pd.DataFrame, candidates: list[str], target: str) -> pd.DataFrame: + """Rename the first matching candidate column to target.""" + for c in candidates: + if c in df.columns: + 
return df.rename(columns={c: target}) + raise KeyError( + f"None of {candidates} found in columns {list(df.columns)[:20]}" + ) + + +# ---------------------------------------------------------------------------- +# Transcriptomics +# ---------------------------------------------------------------------------- +def fetch_wide_tpm_cohort(syn, syn_id: str) -> pd.DataFrame: + """Fetch a wide salmon.merged.gene_tpm.tsv and return long-format DataFrame. + + nf-core salmon output: gene_id (Ensembl), gene_name (symbol), then one + column per specimen. Returns (gene_symbol, specimen, transcriptomics, source). + """ + logging.info("Fetching cohort TPM matrix %s", syn_id) + entity = syn.get(syn_id) + path = entity.path + df = pd.read_csv(path, sep="\t") + + gene_col = None + for cname in ["gene_name", "gene_symbol", "Symbol", "GeneSymbol", "gene"]: + if cname in df.columns: + gene_col = cname + break + if gene_col is None: + gene_col = df.columns[0] + logging.warning( + "%s: no gene_name column found; using %s — most rows may fail the gene_map join", + syn_id, gene_col, + ) + + meta_cols = {"gene_id", "gene_name", "gene_symbol", "Symbol", + "GeneSymbol", "gene", "transcript_id"} + sample_cols = [c for c in df.columns if c not in meta_cols] + sample_cols = _drop_protocol_cols(sample_cols) + + if not sample_cols: + raise ValueError( + f"{syn_id}: no specimen columns found after excluding metadata " + f"and protocol optimization samples" + ) + + long = ( + df[[gene_col] + sample_cols] + .rename(columns={gene_col: "gene_symbol"}) + .melt(id_vars="gene_symbol", var_name="specimen", value_name="transcriptomics") + ) + long["source"] = "Synapse" + logging.info( + " %s: %d genes × %d specimens → %d long rows", + syn_id, df[gene_col].nunique(), len(sample_cols), len(long), + ) + return long + + +def build_transcriptomics( + syn, gene_map: pd.DataFrame, sample_map: pd.DataFrame +) -> pd.DataFrame: + """Build the long-format transcriptomics table from per-cohort TPM matrices.""" + if not RNA_TPM_COHORT_SYN_IDS: + logging.error("RNA_TPM_COHORT_SYN_IDS is empty; skipping transcriptomics.") + return pd.DataFrame( + columns=["entrez_id", "improve_sample_id", "source", "study", "transcriptomics"] + ) + + cohort_frames = [] + for syn_id in RNA_TPM_COHORT_SYN_IDS: + try: + cohort_frames.append(fetch_wide_tpm_cohort(syn, syn_id)) + except Exception as exc: + logging.warning("Skipping RNA cohort %s: %s", syn_id, exc) + + if not cohort_frames: + logging.error("All RNA cohort fetches failed; returning empty transcriptomics.") + return pd.DataFrame( + columns=["entrez_id", "improve_sample_id", "source", "study", "transcriptomics"] + ) + + raw = pd.concat(cohort_frames, ignore_index=True) + logging.info("Combined %d RNA cohorts → %d total rows", len(cohort_frames), len(raw)) + + n_before = len(raw) + raw = canonicalize_specimen_column(raw, "specimen") + logging.info( + "Transcriptomics: canonicalized %d → %d rows (%d dropped as unrecognized specimens)", + n_before, len(raw), n_before - len(raw), + ) + + df = ( + raw.merge(gene_map, on="gene_symbol", how="inner") + .merge(sample_map, left_on="specimen", right_on="other_id", how="inner") + ) + df["study"] = STUDY + df["transcriptomics"] = pd.to_numeric(df["transcriptomics"], errors="coerce") + df = df.dropna(subset=["transcriptomics"]) + df = df[["entrez_id", "improve_sample_id", "source", "study", "transcriptomics"]] + + logging.info( + "Built transcriptomics table: %d rows (%d samples × %d genes)", + len(df), df["improve_sample_id"].nunique(), df["entrez_id"].nunique(), + ) 
+ return df + + +# ---------------------------------------------------------------------------- +# Proteomics +# ---------------------------------------------------------------------------- +def build_proteomics( + syn, gene_map: pd.DataFrame, sample_map: pd.DataFrame +) -> pd.DataFrame: + """Build the long-format proteomics table from the batch-corrected Synapse file.""" + raw = fetch_synapse_table(syn, GLOBAL_PROT_SYN_ID) + raw = _normalize_col(raw, + ["Specimen", "specimenID", "specimen_id", "specimen", "specimen_norm", "sample_id"], + "specimen") + raw = _normalize_col(raw, + ["gene_symbol", "Gene", "GeneSymbol", "Symbol", "gene", "feature_id"], + "gene_symbol") + + n_before = len(raw) + raw = canonicalize_specimen_column(raw, "specimen") + logging.info( + "Proteomics: canonicalized %d → %d rows (%d dropped as unrecognized specimens)", + n_before, len(raw), n_before - len(raw), + ) + + raw = _normalize_col(raw, + ["correctedAbundance", "log_ratio", "logRatio", "proteomics", "value"], + "proteomics") + + df = ( + raw.merge(gene_map, on="gene_symbol", how="inner") + .merge(sample_map, left_on="specimen", right_on="other_id", how="inner") + ) + df["source"] = "Synapse" + df["study"] = STUDY + df["proteomics"] = pd.to_numeric(df["proteomics"], errors="coerce") + df = df.dropna(subset=["proteomics"]) + df = df[["entrez_id", "improve_sample_id", "source", "study", "proteomics"]] + + logging.info( + "Built proteomics table: %d rows (%d samples × %d genes)", + len(df), df["improve_sample_id"].nunique(), df["entrez_id"].nunique(), + ) + return df + + +# ---------------------------------------------------------------------------- +# Phosphoproteomics +# ---------------------------------------------------------------------------- +def build_phosphoproteomics( + syn, phosphosite_map: pd.DataFrame, sample_map: pd.DataFrame +) -> pd.DataFrame: + """Build the long-format phosphoproteomics table from the batch-corrected Synapse file.""" + raw = fetch_synapse_table(syn, PHOSPHO_SYN_ID) + raw = _normalize_col(raw, + ["Specimen", "specimenID", "specimen_id", "specimen"], + "specimen") + raw = _normalize_col(raw, + ["site", "Site", "phosphosite", "feature_id"], + "site") + + n_before = len(raw) + raw = canonicalize_specimen_column(raw, "specimen") + logging.info( + "Phosphoproteomics: canonicalized %d → %d rows (%d dropped as unrecognized specimens)", + n_before, len(raw), n_before - len(raw), + ) + + raw = _normalize_col(raw, + ["correctedAbundance", "log_ratio", "logRatio", "phosphoproteomics", "value"], + "phosphoproteomics") + + df = ( + raw.merge(phosphosite_map, left_on="site", right_on="other_id", how="inner") + .merge(sample_map, left_on="specimen", right_on="other_id", how="inner") + ) + df["source"] = "Synapse" + df["study"] = STUDY + df["phosphoproteomics"] = pd.to_numeric(df["phosphoproteomics"], errors="coerce") + df = df.dropna(subset=["phosphoproteomics"]) + df = df[["phosphosite_id", "improve_sample_id", "source", "study", "phosphoproteomics"]] + + logging.info( + "Built phosphoproteomics table: %d rows (%d samples × %d sites)", + len(df), df["improve_sample_id"].nunique(), df["phosphosite_id"].nunique(), + ) + return df + + +# ---------------------------------------------------------------------------- +# Main +# ---------------------------------------------------------------------------- +def main(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("genes", help="Path to genes.csv (gene_symbol → entrez_id)") + parser.add_argument("samples", help="Path to 
cnf_samples.csv from build_samples") + parser.add_argument("phosphosites", help="Path to phosphosites.csv (other_id → phosphosite_id)") + parser.add_argument("--out_rna", default=DEFAULT_OUT_RNA) + parser.add_argument("--out_prot", default=DEFAULT_OUT_PROT) + parser.add_argument("--out_phospho", default=DEFAULT_OUT_PHOSPHO) + args = parser.parse_args() + + configure_logging() + + gene_map = load_gene_map(args.genes) + sample_map = load_sample_map(args.samples) + phosphosite_map = load_phosphosite_map(args.phosphosites) + + syn = synapseclient.Synapse() + syn.login() + + rna = build_transcriptomics(syn, gene_map, sample_map) + prot = build_proteomics(syn, gene_map, sample_map) + phospho = build_phosphoproteomics(syn, phosphosite_map, sample_map) + + for path in (args.out_rna, args.out_prot, args.out_phospho): + os.makedirs(os.path.dirname(path) or ".", exist_ok=True) + + rna.to_csv(args.out_rna, index=False) + prot.to_csv(args.out_prot, index=False) + phospho.to_csv(args.out_phospho, index=False) + logging.info("Wrote %s (%d rows)", args.out_rna, len(rna)) + logging.info("Wrote %s (%d rows)", args.out_prot, len(prot)) + logging.info("Wrote %s (%d rows)", args.out_phospho, len(phospho)) + + +if __name__ == "__main__": + sys.exit(main() or 0) diff --git a/coderbuild/cnf/03-drugs-cnf.py b/coderbuild/cnf/03-drugs-cnf.py new file mode 100644 index 00000000..b9fe0120 --- /dev/null +++ b/coderbuild/cnf/03-drugs-cnf.py @@ -0,0 +1,531 @@ +#!/usr/bin/env python3 +""" +03-drugs-cnf.py: generate cnf_drugs.tsv and cnf_drug_descriptors.tsv. + +Pulls unique drug names from the cNF drug screen on Synapse, then calls the +standard coderdata utilities: + + coderbuild/utils/pubchem_retrieval.py + -> cnf_drugs.tsv + + coderbuild/utils/build_drug_descriptor_table.py + -> cnf_drug_descriptors.tsv + +Important implementation note: + pubchem_retrieval.py does not expose a command-line interface in the + provided version. It defines update_dataframe_and_write_tsv(), but running + `python pubchem_retrieval.py --input ... --output ...` exits 0 without + doing any work. Therefore this script imports pubchem_retrieval.py directly + and calls update_dataframe_and_write_tsv(). + +The descriptor builder is still run as a subprocess. It may call R/biomaRt +internally, which can fail when Ensembl is unavailable. To keep the drugs file +produced even when descriptors fail, this script: + + 1. Retries external/remote work with exponential backoff. + 2. Sets a generous timeout. + 3. Supports --skip-descriptors. + 4. Supports --only-descriptors for retrying descriptor generation later. 
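+
+A typical recovery flow, for illustration:
+    python 03-drugs-cnf.py --skip-descriptors    # drugs table only
+    python 03-drugs-cnf.py --only-descriptors    # descriptors once deps are back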
+
+Usage:
+    python 03-drugs-cnf.py [--prev_drugs prev1.tsv,prev2.tsv]
+        [--out_drugs /tmp/cnf_drugs.tsv]
+        [--out_desc /tmp/cnf_drug_descriptors.tsv]
+        [--skip-descriptors]
+        [--only-descriptors]
+        [--retries 3]
+        [--timeout 1800]
+"""
+
+import argparse
+import importlib.util
+import logging
+import os
+import signal
+import subprocess
+import sys
+import tempfile
+import time
+import pandas as pd
+import synapseclient
+
+
+# ----------------------------------------------------------------------------
+# Constants
+# ----------------------------------------------------------------------------
+
+DRUG_SCREEN_TABLE = "syn51301431"
+
+PUBCHEM_SCRIPT = os.environ.get(
+    "CODERDATA_PUBCHEM_SCRIPT",
+    "/coderbuild/utils/pubchem_retrieval.py",
+)
+
+DESCRIPTOR_SCRIPT = os.environ.get(
+    "CODERDATA_DESCRIPTOR_SCRIPT",
+    "/coderbuild/utils/build_drug_descriptor_table.py",
+)
+
+DEFAULT_OUT_DRUGS = "/tmp/cnf_drugs.tsv"
+DEFAULT_OUT_DESC = "/tmp/cnf_drug_descriptors.tsv"
+
+DEFAULT_RETRIES = 3
+DEFAULT_TIMEOUT = 1800  # seconds per attempt
+RETRY_BACKOFF_S = 30  # 30s, 60s, 120s, ...
+
+
+# ----------------------------------------------------------------------------
+# Logging
+# ----------------------------------------------------------------------------
+
+def configure_logging() -> None:
+    logging.basicConfig(
+        format="[%(asctime)s] [%(levelname)s] %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+        level=logging.INFO,
+    )
+
+
+# ----------------------------------------------------------------------------
+# Generic subprocess wrapper with retry
+# ----------------------------------------------------------------------------
+
+def run_with_retries(
+    cmd: list[str],
+    retries: int,
+    timeout: int,
+    label: str,
+) -> bool:
+    """
+    Run a subprocess, retrying on failure with exponential backoff.
+
+    Returns True on success, False if all retries are exhausted.
+    """
+    for attempt in range(1, retries + 1):
+        logging.info("[%s] attempt %d/%d", label, attempt, retries)
+
+        try:
+            subprocess.run(cmd, check=True, timeout=timeout)
+            logging.info("[%s] succeeded on attempt %d", label, attempt)
+            return True
+
+        except subprocess.TimeoutExpired:
+            logging.warning(
+                "[%s] timed out after %ds on attempt %d",
+                label,
+                timeout,
+                attempt,
+            )
+
+        except subprocess.CalledProcessError as exc:
+            logging.warning(
+                "[%s] failed with exit %d on attempt %d",
+                label,
+                exc.returncode,
+                attempt,
+            )
+
+        except Exception as exc:
+            logging.warning(
+                "[%s] unexpected error on attempt %d: %s",
+                label,
+                attempt,
+                exc,
+            )
+
+        if attempt < retries:
+            backoff = RETRY_BACKOFF_S * (2 ** (attempt - 1))
+            logging.info("[%s] sleeping %ds before retry", label, backoff)
+            time.sleep(backoff)
+
+    logging.error("[%s] exhausted %d retries", label, retries)
+    return False
+
+
+# ----------------------------------------------------------------------------
+# PubChem utility loading/calling
+# ----------------------------------------------------------------------------
+
+def load_pubchem_module(path: str):
+    """
+    Load pubchem_retrieval.py as a Python module.
+
+    The provided pubchem_retrieval.py has no command-line entry point, so it
+    must be imported and called directly.
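+
+    Raises FileNotFoundError, ImportError, or AttributeError so the caller
+    sees a clear failure instead of a silent no-op.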
+ """ + if not os.path.exists(path): + raise FileNotFoundError(f"PubChem retrieval script not found: {path}") + + spec = importlib.util.spec_from_file_location( + "coderdata_pubchem_retrieval", + path, + ) + + if spec is None or spec.loader is None: + raise ImportError(f"Could not create import spec for {path}") + + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + if not hasattr(module, "update_dataframe_and_write_tsv"): + raise AttributeError( + f"{path} does not define update_dataframe_and_write_tsv()" + ) + + return module + + +def call_pubchem_retrieval( + names: list[str], + prev_drug_files: list[str], + out_drugs: str, + retries: int, + timeout: int, +) -> bool: + """ + Generate the drugs TSV by calling pubchem_retrieval.update_dataframe_and_write_tsv(). + + This intentionally does not use subprocess because the supplied + pubchem_retrieval.py has no argparse/main block and exits 0 without writing + anything when run as a script. + """ + prev_arg = ",".join(prev_drug_files) if prev_drug_files else None + + for attempt in range(1, retries + 1): + logging.info("[pubchem_retrieval] attempt %d/%d", attempt, retries) + + ignore_chems = None + + try: + pubchem = load_pubchem_module(PUBCHEM_SCRIPT) + + with tempfile.NamedTemporaryFile( + "w", + suffix="_cnf_ignore_chems.txt", + delete=False, + ) as tmp_ignore: + ignore_chems = tmp_ignore.name + + df = pubchem.update_dataframe_and_write_tsv( + unique_names=names, + output_filename=out_drugs, + ignore_chems=ignore_chems, + batch_size=1, + isname=True, + time_limit=timeout, + prev_drug_filepaths=prev_arg, + restrict_to_raw_names=names, + ) + + # pubchem_retrieval.py sets signal.alarm(time_limit), but does not + # clear it. Clear it here so the descriptor step is not interrupted. + try: + signal.alarm(0) + except Exception: + pass + + if getattr(pubchem, "should_continue", True) is False: + raise TimeoutError( + f"pubchem_retrieval reached its time limit of {timeout}s" + ) + + if not os.path.exists(out_drugs): + raise FileNotFoundError( + f"pubchem_retrieval returned but did not create {out_drugs}" + ) + + if os.path.getsize(out_drugs) == 0: + raise RuntimeError( + f"pubchem_retrieval created {out_drugs}, but it is empty" + ) + + n_rows = len(df) if df is not None else "unknown" + + logging.info( + "[pubchem_retrieval] succeeded on attempt %d; wrote %s rows to %s", + attempt, + n_rows, + out_drugs, + ) + + return True + + except Exception as exc: + try: + signal.alarm(0) + except Exception: + pass + + logging.warning( + "[pubchem_retrieval] failed on attempt %d/%d: %s", + attempt, + retries, + exc, + ) + + if attempt < retries: + backoff = RETRY_BACKOFF_S * (2 ** (attempt - 1)) + logging.info( + "[pubchem_retrieval] sleeping %ds before retry", + backoff, + ) + time.sleep(backoff) + + finally: + if ignore_chems and os.path.exists(ignore_chems): + try: + os.unlink(ignore_chems) + except OSError: + pass + + logging.error("[pubchem_retrieval] exhausted %d retries", retries) + return False + + +# ---------------------------------------------------------------------------- +# Drug name collection +# ---------------------------------------------------------------------------- + +def collect_drug_names(syn) -> list[str]: + """ + Pull every cNF drug-screen file from Synapse and collect unique drug names. 
+ """ + query = ( + "select id, individualID, specimenID " + f"from {DRUG_SCREEN_TABLE} " + "where dataType='drug screen'" + ) + + files = syn.tableQuery(query).asDataFrame() + + logging.info("Reading %d drug-screen files for drug names", len(files)) + + names: set[str] = set() + + for _, row in files.iterrows(): + try: + entity = syn.get(row["id"]) + df = pd.read_csv(entity.path) + + except Exception as exc: + logging.warning("Skipping file %s: %s", row["id"], exc) + continue + + if "Drug" not in df.columns: + logging.warning("File %s has no 'Drug' column; skipping", row["id"]) + continue + + names.update( + df["Drug"] + .dropna() + .astype(str) + .str.strip() + .tolist() + ) + + names = sorted(n for n in names if n and n.lower() != "nan") + + logging.info("Collected %d unique drug names", len(names)) + + return names + + +# ---------------------------------------------------------------------------- +# Descriptor utility +# ---------------------------------------------------------------------------- + +def call_descriptor_table( + drug_file: str, + out_desc: str, + retries: int, + timeout: int, +) -> bool: + """ + Run the descriptor table builder as a subprocess. + + This function assumes build_drug_descriptor_table.py has a CLI accepting + --input and --output. + """ + cmd = [ + "python", + DESCRIPTOR_SCRIPT, + "--input", + drug_file, + "--output", + out_desc, + ] + + ok = run_with_retries( + cmd, + retries, + timeout, + "descriptor_table", + ) + + if ok and not os.path.exists(out_desc): + logging.error( + "descriptor_table exited successfully but did not create %s", + out_desc, + ) + return False + + return ok + + +# ---------------------------------------------------------------------------- +# Main +# ---------------------------------------------------------------------------- + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + + parser.add_argument( + "--prev_drugs", + default="", + help="Comma-delimited list of existing drug TSV files for ID reuse.", + ) + + parser.add_argument( + "--out_drugs", + default=DEFAULT_OUT_DRUGS, + help=f"Output drugs TSV. Default: {DEFAULT_OUT_DRUGS}", + ) + + parser.add_argument( + "--out_desc", + default=DEFAULT_OUT_DESC, + help=f"Output drug descriptors TSV. Default: {DEFAULT_OUT_DESC}", + ) + + parser.add_argument( + "--skip-descriptors", + action="store_true", + help=( + "Skip the descriptor step entirely. Use when the descriptor " + "utility or its biomaRt/Ensembl dependency is failing. The drugs " + "table is still produced." + ), + ) + + parser.add_argument( + "--only-descriptors", + action="store_true", + help=( + "Only run the descriptor step, expecting --out_drugs to already " + "exist. Useful for retrying after a previous descriptor failure." + ), + ) + + parser.add_argument( + "--retries", + type=int, + default=DEFAULT_RETRIES, + help=f"Number of retry attempts. Default: {DEFAULT_RETRIES}", + ) + + parser.add_argument( + "--timeout", + type=int, + default=DEFAULT_TIMEOUT, + help=f"Timeout in seconds per attempt. 
Default: {DEFAULT_TIMEOUT}", + ) + + args = parser.parse_args() + + configure_logging() + + if args.skip_descriptors and args.only_descriptors: + logging.error("Cannot combine --skip-descriptors with --only-descriptors") + return 2 + + for path in (args.out_drugs, args.out_desc): + os.makedirs(os.path.dirname(path) or ".", exist_ok=True) + + # ------------------------------------------------------------------------ + # Drugs step + # ------------------------------------------------------------------------ + + if not args.only_descriptors: + syn = synapseclient.Synapse() + syn.login() + + names = collect_drug_names(syn) + + if not names: + logging.error("No drug names found; aborting") + return 1 + + prev = [ + p.strip() + for p in args.prev_drugs.split(",") + if p.strip() + ] + + ok = call_pubchem_retrieval( + names=names, + prev_drug_files=prev, + out_drugs=args.out_drugs, + retries=args.retries, + timeout=args.timeout, + ) + + if not ok: + logging.error("pubchem_retrieval failed; aborting") + return 1 + + if not os.path.exists(args.out_drugs): + logging.error( + "pubchem_retrieval did not produce %s", + args.out_drugs, + ) + return 1 + + logging.info("Wrote %s", args.out_drugs) + + # ------------------------------------------------------------------------ + # Descriptor step + # ------------------------------------------------------------------------ + + if args.skip_descriptors: + logging.warning( + "Skipping descriptor step. Re-run with --only-descriptors to " + "produce %s when descriptor dependencies are reachable.", + args.out_desc, + ) + return 0 + + if not os.path.exists(args.out_drugs): + logging.error( + "Drugs file %s missing; cannot run descriptors", + args.out_drugs, + ) + return 1 + + ok = call_descriptor_table( + drug_file=args.out_drugs, + out_desc=args.out_desc, + retries=args.retries, + timeout=args.timeout, + ) + + if not ok: + logging.error( + "Descriptor step failed after %d attempts. The drugs file %s is " + "complete; re-run with --only-descriptors when descriptor " + "dependencies are reachable to produce %s.", + args.retries, + args.out_drugs, + args.out_desc, + ) + + # Exit 0 because the primary drugs table was produced. Descriptors can + # be retried later. Use --skip-descriptors to suppress this explicitly. + return 0 + + logging.info("Wrote %s", args.out_desc) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/coderbuild/cnf/04-experiments-cnf.py b/coderbuild/cnf/04-experiments-cnf.py new file mode 100644 index 00000000..fe1e216b --- /dev/null +++ b/coderbuild/cnf/04-experiments-cnf.py @@ -0,0 +1,478 @@ +#!/usr/bin/env python3 +""" +04-experiments-cnf.py: generate cnf_experiments.tsv. + +Discovers every drug-screen viability file under three Synapse parent folders, +attaches a specimen ID by reading the file's `specimenID` annotation (with a +filename-pattern fallback), then per (specimen, drug): + + * Multi-concentration measurements are run through the standard + coderbuild/utils/fit_curve.py utility, producing fit_auc / fit_ic50 / + fit_einf / fit_hs / fit_r2. + * Single-concentration measurements at 1 μM are recorded with + metric = 'uM_viability' and value = Viability_percentage / 100. + Schema dependency: 'uM_viability' must be in the ResponseMetric enum. + See schema_patch.md. + +Specimen attribution policy: + 1. Trust the file's `specimenID` annotation if present. + 2. Fall back to parsing the filename (e.g. "NF0017_T2_Viabilities.csv" + → "NF0017_T2"). + 3. 
Run the result through cnf_utils.classify_specimen as a safety net
+     (handles dot-separators, suffixes, case variants). Files whose
+     specimen can't be classified are skipped with a warning.
+
+Dual-mapping: occasionally a single viability file's results should be
+attributed to two specimens. This is handled via SPECIMEN_DUAL_MAPPINGS at
+the top of the script; add new entries as more cases come up.
+
+Usage:
+    python 04-experiments-cnf.py
+        [--output /tmp/cnf_experiments.tsv]
+        [--parents syn51301414,syn51301420,syn51301426]
+"""

+import argparse
+import logging
+import os
+import re
+import subprocess
+import sys
+import tempfile
+
+import pandas as pd
+import synapseclient
+
+from cnf_utils import classify_specimen
+
+
+# ----------------------------------------------------------------------------
+# Constants
+# ----------------------------------------------------------------------------
+# Synapse folders to walk for drug-screen viability files
+DEFAULT_DRUG_SCREEN_PARENTS = [
+    "syn51301414",
+    "syn51301420",
+    "syn51301426",
+]
+
+TIME = 120
+TIME_UNIT = "hours"
+STUDY = "cnf"
+SOURCE = "Synapse"
+SINGLE_DOSE_UM = 1.0
+SINGLE_DOSE_METRIC = "uM_viability"
+
+FIT_CURVE_SCRIPT = os.environ.get(
+    "CODERDATA_FIT_CURVE_SCRIPT", "/coderbuild/utils/fit_curve.py")
+
+DEFAULT_OUTPUT = "/tmp/cnf_experiments.tsv"
+
+# Regex to recover specimen from filenames like "NF0017_T2_Viabilities.csv"
+# or "NF0021_T1_Onalespid_1uM_Viabilities.csv"
+FILENAME_SPECIMEN_RE = re.compile(
+    r"^(NF\d{4}(?:_T\d+)?(?:_[A-Za-z0-9._-]+?)?)"          # specimen prefix
+    r"_(?:Viabilities|viabilities|Viability|drug_screen)"  # known label
+    r"\.(?:csv|tsv|txt)$",
+    re.IGNORECASE,
+)
+
+# ----------------------------------------------------------------------------
+# DUAL-MAPPING TABLE
+# ----------------------------------------------------------------------------
+# Maps a canonical specimen to any additional specimens that should receive
+# the same drug-response rows. Add new entries as more cases come up.
+#
+# Known case: NF0021_T1_Onalespid_1uM was screened against the same drug
+# panel as NF0021_T1 and should be available under both labels.
+SPECIMEN_DUAL_MAPPINGS: dict[str, list[str]] = {
+    "NF0021_T1_Onalespid_1uM": ["NF0021_T1"],
+}
+
+
+def configure_logging():
+    logging.basicConfig(
+        format="[%(asctime)s] [%(levelname)s] %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+        level=logging.INFO,
+    )
+
+
+# ----------------------------------------------------------------------------
+# Specimen attribution
+# ----------------------------------------------------------------------------
+def specimen_from_annotation(syn, file_id: str) -> str | None:
+    """Return the file's specimenID annotation, or None if missing."""
+    try:
+        anno = syn.getAnnotations(file_id)
+    except Exception as exc:
+        logging.warning("getAnnotations failed for %s: %s", file_id, exc)
+        return None
+    val = anno.get("specimenID")
+    if isinstance(val, list):
+        val = val[0] if val else None
+    if val is None:
+        return None
+    s = str(val).strip()
+    return s or None
+
+
+def specimen_from_filename(name: str) -> str | None:
+    """Parse 'NF0017_T2_Viabilities.csv' → 'NF0017_T2'."""
+    m = FILENAME_SPECIMEN_RE.match(name)
+    if m:
+        return m.group(1)
+    return None
+
+
+def resolve_specimen(syn, file_id: str, file_name: str) -> str | None:
+    """Resolve a file to a canonical specimen ID, trying annotation first.
+
+    Returns canonical_id (e.g. "NF0017_T2") or None if both annotation and
+    filename parsing fail or the result doesn't classify as a known sample
+    type.
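+
+    Example (assuming cnf_utils normalizes dot-separators): an annotation
+    of "NF0017.T2" resolves to the canonical "NF0017_T2".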
+ """ + raw = specimen_from_annotation(syn, file_id) + if not raw: + raw = specimen_from_filename(file_name) + if not raw: + return None + + # Safety-net classification — handles dot-separators, suffixes, case + result = classify_specimen(raw) + if result is None: + # Annotation or filename produced something that doesn't fit any + # known sample pattern. + logging.warning( + "Could not classify specimen '%s' (file %s, %s); skipping", + raw, file_id, file_name, + ) + return None + return result[0] + + +def expand_dual_mappings(specimen: str) -> list[str]: + """Return [specimen] + any additional specimens it's dual-mapped to.""" + return [specimen] + SPECIMEN_DUAL_MAPPINGS.get(specimen, []) + + +# ---------------------------------------------------------------------------- +# Synapse walking +# ---------------------------------------------------------------------------- +def walk_for_viability_files(syn, parent_id: str) -> list[dict]: + """Recursively yield File entities under a Synapse folder. + + Returns a list of {id, name} dicts for files whose name suggests they + contain viability data (drug-screen CSV/TSV). + """ + try: + import synapseutils + except ImportError: + logging.error("synapseutils not available; cannot walk %s", parent_id) + return [] + + files = [] + for dirpath, _subfolders, file_list in synapseutils.walk(syn, parent_id): + for fname, fid in file_list: + # Filter to plausible viability files. Accept anything ending in + # csv/tsv that has 'viab' or 'drug' in the name; fall back to + # 'csv' if neither matches but the file is small enough. + lower = fname.lower() + if not lower.endswith((".csv", ".tsv", ".txt")): + continue + if any(tok in lower for tok in ("viab", "drug", "screen")): + files.append({"id": fid, "name": fname, + "parent_path": dirpath}) + # Don't auto-grab arbitrary CSVs — we only want drug screens. + return files + + +def discover_drug_files(syn, parents: list[str]) -> pd.DataFrame: + """Walk all parent folders and dedupe by Synapse ID.""" + all_files = [] + for parent in parents: + logging.info("Walking %s ...", parent) + files = walk_for_viability_files(syn, parent) + logging.info(" found %d candidate viability files", len(files)) + all_files.extend(files) + + if not all_files: + return pd.DataFrame(columns=["id", "name"]) + + df = pd.DataFrame(all_files).drop_duplicates(subset="id").reset_index(drop=True) + logging.info("Discovered %d unique viability files across %d parents", + len(df), len(parents)) + return df + + +# ---------------------------------------------------------------------------- +# Read and combine drug-screen files +# ---------------------------------------------------------------------------- +def pull_screen_files(syn, parents: list[str]) -> pd.DataFrame: + """Walk parents, read each viability CSV, attach canonical specimen. + + Files whose specimen can't be resolved are skipped with a warning. + Files attributed to a dual-mapped specimen produce duplicate rows for + every additional specimen in the mapping. 
+ """ + files_df = discover_drug_files(syn, parents) + if files_df.empty: + raise RuntimeError("No drug-screen files found under any parent") + + frames = [] + for _, row in files_df.iterrows(): + fid, fname = row["id"], row["name"] + + canonical = resolve_specimen(syn, fid, fname) + if not canonical: + continue + + try: + df = pd.read_csv(syn.get(fid).path) + except Exception as exc: + logging.warning("Skipping file %s: %s", fid, exc) + continue + + # Verify required columns + required = {"Drug", "Concentration_uM", "Viability_percentage"} + missing = required - set(df.columns) + if missing: + logging.warning("File %s missing %s; skipping", + fname, sorted(missing)) + continue + + # Attribute to canonical + any dual-mapped specimens + for specimen in expand_dual_mappings(canonical): + attributed = df.copy() + attributed["specimen"] = specimen + attributed["source_file"] = fid + frames.append(attributed) + + if not frames: + raise RuntimeError("No drug-screen files could be read successfully") + + combined = pd.concat(frames, ignore_index=True) + n_specimens = combined["specimen"].nunique() + n_drugs = combined["Drug"].nunique() + logging.info( + "Combined %d drug-screen rows across %d specimens × %d drugs", + len(combined), n_specimens, n_drugs, + ) + return combined + + +# ---------------------------------------------------------------------------- +# I/O helpers +# ---------------------------------------------------------------------------- +def load_sample_map(samples_path: str) -> dict[str, int]: + df = pd.read_csv(samples_path) + df["improve_sample_id"] = df["improve_sample_id"].astype(int) + return dict(zip(df["other_id"].astype(str), df["improve_sample_id"])) + + +def load_drug_map(drugs_path: str) -> dict[str, str]: + df = pd.read_csv(drugs_path, sep="\t") + if "chem_name" not in df.columns or "improve_drug_id" not in df.columns: + raise KeyError( + "drugs file missing required columns " + f"(have: {list(df.columns)})" + ) + mapping = { + str(name).strip().lower(): str(did) + for name, did in zip(df["chem_name"], df["improve_drug_id"]) + } + logging.info("Loaded %d drug name → improve_drug_id mappings", len(mapping)) + return mapping + + +def lookup_drug_id(name, drug_map: dict[str, str]) -> str | None: + if pd.isna(name): + return None + return drug_map.get(str(name).strip().lower()) + + +# ---------------------------------------------------------------------------- +# Single vs multi dose split +# ---------------------------------------------------------------------------- +def split_single_vs_multi(combined: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: + """Per (specimen, drug), separate single-dose vs multi-dose.""" + counts = ( + combined.groupby(["specimen", "Drug"])["Concentration_uM"] + .nunique() + .reset_index(name="n_conc") + ) + multi_keys = counts[counts["n_conc"] > 1] + single_keys = counts[counts["n_conc"] == 1] + + multi = combined.merge(multi_keys[["specimen", "Drug"]], + on=["specimen", "Drug"], how="inner") + single = combined.merge(single_keys[["specimen", "Drug"]], + on=["specimen", "Drug"], how="inner") + # Single-dose rows must be at the canonical 1 μM concentration. 
+    single = single[single["Concentration_uM"] == SINGLE_DOSE_UM].copy()
+
+    logging.info("Multi-dose rows: %d (across %d sample-drug pairs)",
+                 len(multi), len(multi_keys))
+    logging.info("Single-dose rows: %d (across %d sample-drug pairs at %g μM)",
+                 len(single), len(single_keys), SINGLE_DOSE_UM)
+    return multi, single
+
+
+# ----------------------------------------------------------------------------
+# Curve fitting (multi-dose) and single-dose formatting
+# ----------------------------------------------------------------------------
+def run_curve_fit(multi: pd.DataFrame, sample_map, drug_map) -> pd.DataFrame:
+    if multi.empty:
+        return pd.DataFrame()
+
+    multi = multi.copy()
+    multi["improve_sample_id"] = multi["specimen"].map(sample_map)
+    multi["improve_drug_id"] = multi["Drug"].apply(
+        lambda n: lookup_drug_id(n, drug_map))
+
+    n_drop = (multi["improve_sample_id"].isna() |
+              multi["improve_drug_id"].isna()).sum()
+    if n_drop:
+        logging.warning("Dropping %d multi-dose rows with unmapped sample/drug",
+                        n_drop)
+    multi = multi.dropna(subset=["improve_sample_id", "improve_drug_id"])
+    if multi.empty:
+        return pd.DataFrame()
+
+    # fit_curve.py expects DOSE in μM and GROWTH as viability percentage;
+    # the tiny offset keeps any zero concentrations off exactly 0 for the fit.
+    multi["DOSE"] = multi["Concentration_uM"].astype(float) + 1e-4
+    multi["GROWTH"] = multi["Viability_percentage"].astype(float)
+    multi["time"] = TIME
+    multi["time_unit"] = TIME_UNIT
+    multi["study"] = STUDY
+    multi["source"] = SOURCE
+
+    cols = ["DOSE", "GROWTH", "study", "source", "improve_sample_id",
+            "Drug", "time", "time_unit"]
+    multi_for_fit = multi[cols].copy()
+    multi_for_fit["improve_sample_id"] = (
+        multi_for_fit["improve_sample_id"].astype(int))
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        in_path = os.path.join(tmpdir, "drug_response.tsv")
+        out_prefix = os.path.join(tmpdir, "cnf_curve")
+        multi_for_fit.to_csv(in_path, sep="\t", index=False)
+
+        cmd = ["python", FIT_CURVE_SCRIPT,
+               "--input", in_path, "--output", out_prefix]
+        logging.info("Running fit_curve: %s", " ".join(cmd))
+        subprocess.run(cmd, check=True)
+
+        # fit_curve.py appends '.0' to the output prefix; fall back to the
+        # bare prefix in case that behavior changes.
+        out_path = out_prefix + ".0"
+        if not os.path.exists(out_path):
+            alt = out_prefix
+            if os.path.exists(alt):
+                out_path = alt
+            else:
+                raise FileNotFoundError(
+                    f"fit_curve.py did not produce output at {out_path}"
+                )
+        fitted = pd.read_csv(out_path, sep="\t")
+
+    # fit_curve.py renames Drug → improve_drug_id, so the column is present
+    # but holds the raw drug name, not the SMI_ ID. Always remap through drug_map.
+ if "improve_drug_id" in fitted.columns: + name_col = "improve_drug_id" + elif "Drug" in fitted.columns: + name_col = "Drug" + else: + raise KeyError( + f"fit_curve output has neither improve_drug_id nor Drug " + f"column (have: {list(fitted.columns)})" + ) + fitted["improve_drug_id"] = fitted[name_col].apply( + lambda n: lookup_drug_id(n, drug_map)) + + fitted = fitted.dropna(subset=["improve_drug_id"]) + n_metrics = (fitted["dose_response_metric"].nunique() + if "dose_response_metric" in fitted.columns else 0) + logging.info("Curve-fit produced %d rows across %d metrics", + len(fitted), n_metrics) + return fitted + + +def format_single_dose(single: pd.DataFrame, sample_map, drug_map) -> pd.DataFrame: + if single.empty: + return pd.DataFrame() + + df = single.copy() + df["improve_sample_id"] = df["specimen"].map(sample_map) + df["improve_drug_id"] = df["Drug"].apply( + lambda n: lookup_drug_id(n, drug_map)) + + n_drop = (df["improve_sample_id"].isna() | + df["improve_drug_id"].isna()).sum() + if n_drop: + logging.warning("Dropping %d single-dose rows with unmapped sample/drug", + n_drop) + df = df.dropna(subset=["improve_sample_id", "improve_drug_id"]).copy() + + df["dose_response_metric"] = SINGLE_DOSE_METRIC + df["dose_response_value"] = df["Viability_percentage"].astype(float) / 100.0 + df["time"] = TIME + df["time_unit"] = TIME_UNIT + df["study"] = STUDY + df["source"] = SOURCE + df["improve_sample_id"] = df["improve_sample_id"].astype(int) + + cols = ["source", "improve_sample_id", "improve_drug_id", "study", + "time", "time_unit", "dose_response_metric", "dose_response_value"] + df = df[cols] + logging.info("Formatted %d single-dose rows", len(df)) + return df + + +# ---------------------------------------------------------------------------- +# Main +# ---------------------------------------------------------------------------- +def main(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("samples", help="cnf_samples.csv from build_samples") + parser.add_argument("drugs", help="cnf_drugs.tsv from build_drugs") + parser.add_argument("--output", default=DEFAULT_OUTPUT) + parser.add_argument( + "--parents", default=",".join(DEFAULT_DRUG_SCREEN_PARENTS), + help="Comma-delimited Synapse folder IDs to walk for viability files", + ) + args = parser.parse_args() + + configure_logging() + + parents = [p.strip() for p in args.parents.split(",") if p.strip()] + if not parents: + logging.error("No parent folders provided") + sys.exit(2) + + sample_map = load_sample_map(args.samples) + drug_map = load_drug_map(args.drugs) + + syn = synapseclient.Synapse() + syn.login() + + combined = pull_screen_files(syn, parents) + multi, single = split_single_vs_multi(combined) + + fitted = run_curve_fit(multi, sample_map, drug_map) + single_df = format_single_dose(single, sample_map, drug_map) + + final = pd.concat([fitted, single_df], ignore_index=True) + cols = ["source", "improve_sample_id", "improve_drug_id", "study", + "time", "time_unit", "dose_response_metric", "dose_response_value"] + for c in cols: + if c not in final.columns: + final[c] = pd.NA + final = final[cols] + + os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True) + final.to_csv(args.output, sep="\t", index=False) + logging.info("Wrote %s (%d rows)", args.output, len(final)) + + +if __name__ == "__main__": + sys.exit(main() or 0) diff --git a/coderbuild/cnf/README.md b/coderbuild/cnf/README.md new file mode 100644 index 00000000..0755a952 --- /dev/null +++ b/coderbuild/cnf/README.md @@ -0,0 +1,169 
@@
+# Cutaneous Neurofibroma (cnf) Organoid Drug Screen
+
+A coderdata dataset of cutaneous neurofibroma (cNF) patient-derived organoids
+with drug-response measurements and matched multi-omic profiling.
+
+## Overview
+
+Cutaneous neurofibromas are benign nerve-sheath tumors that develop in patients
+with neurofibromatosis type 1 (NF1). This dataset contains drug-screen
+viability measurements for 238 small-molecule compounds across 23 cNF tumor
+specimens from 10 patients, paired with bulk RNA-seq, global LC-MS/MS
+proteomics, and batch-corrected phosphoproteomics for many of the same
+specimens.
+
+| Property | Value |
+|---|---|
+| Cancer type | Cutaneous Neurofibroma |
+| Model type | patient derived organoid |
+| Patients | 10 |
+| Specimens | 23 (untreated organoids; additional tissue and treated organoid samples also present) |
+| Drugs | 238 |
+| Modalities included | Transcriptomics (TPM), Proteomics (log ratio), Phosphoproteomics (log ratio) |
+
+## Multi-tumor patient structure
+
+Each patient (`NF0017`, `NF0018`, ...) contributed up to three independent
+tumor specimens, named `NFxxxx_T1`, `NFxxxx_T2`, `NFxxxx_T3`. This dataset
+treats each specimen as an independent sample (its own `improve_sample_id`).
+The patient ID is preserved in the `other_names` column so downstream models
+can group specimens by patient when needed (e.g. patient-level holdout; a
+grouping sketch appears at the end of this README).
+
+```
+improve_sample_id  other_id   common_name  other_names  other_id_source
+2401               NF0019_T1  NF0019_T1    NF0019       NF1_cNF_Project
+2402               NF0019_T2  NF0019_T2    NF0019       NF1_cNF_Project
+2403               NF0019_T3  NF0019_T3    NF0019       NF1_cNF_Project
+```
+
+The sample table also includes matched normal skin (`NFxxxx_skin`), primary
+tumor tissue (`NFxxxx_Tx_tissue`), and drug-treated organoids
+(`NFxxxx_Tx_<treatment>`). Each gets its own `improve_sample_id` and
+`model_type`. Organoids (untreated) receive the lowest IDs, sorted by
+specimen ID, to make drug-modelable samples easy to filter.
+
+
+## Drug-response metrics
+
+The cNF screen mixes two designs:
+
+- **Multi-dose drugs**: a subset of drugs were screened at multiple
+  concentrations. These are run through `coderbuild/utils/fit_curve.py` and
+  contribute the standard schema metrics (`fit_auc`, `fit_ic50`, `fit_einf`,
+  `fit_hs`, `fit_r2`).
+
+- **Single-dose drugs**: remaining drugs were screened at a single 1 μM
+  concentration. These are recorded with `dose_response_metric = uM_viability`
+  and `dose_response_value = Viability_percentage / 100` (range 0–1, where
+  1.0 = full viability and 0.0 = full kill).
+
+`uM_viability` is a value in the `ResponseMetric` enum, added in
+`schema/coderdata.yaml`.
+
+## SPECIMEN_DUAL_MAPPINGS edge case
+
+One viability file covers the treated organoid `NF0021_T1_Onalespid_1uM` but
+its drug-response measurements should also be attributed to the untreated
+`NF0021_T1`. The `SPECIMEN_DUAL_MAPPINGS` dict in `04-experiments-cnf.py`
+handles this by duplicating the rows for every additional specimen listed:
+
+```python
+SPECIMEN_DUAL_MAPPINGS = {
+    "NF0021_T1_Onalespid_1uM": ["NF0021_T1"],
+}
+```
+
+Add new entries here if additional treated organoids need the same dual
+attribution. Both specimens must already exist in `cnf_samples.csv` or the
+duplicate rows will be dropped during the sample-map join.
+
+## Protocol optimization samples
+
+Six RNA columns in the cohort TPM matrices correspond to protocol optimization
+runs rather than real specimens. They are excluded in `02-omics-cnf.py` before
+the wide-to-long melt.
+Column stems matching any of these substrings are dropped:
+
+```python
+DROP_SAMPLE_SUBSTRINGS = [
+    "cNF_organoid_DIA_G_02_11Feb25",
+    "cNF_organoid_DIA_G_05_11Feb25",
+    "cNF_organoid_DIA_G_06_11Feb25",
+    "cNF_organoid_DIA_P_02_29Jan25",
+    "cNF_organoid_DIA_P_05_11Feb25",
+    "cNF_organoid_DIA_P_06_11Feb25",
+]
+```
+
+These are matched on the column stem (basename without extension), so both
+plain paths and filename columns are handled correctly. If new protocol
+optimization runs appear in future cohort data, add their column stems to
+this list.
+
+## Phosphosites dependency and known unmatched sites
+
+`02-omics-cnf.py` requires `phosphosites.csv` as a positional argument.
+The phosphosites reference must be built before the cNF omics step runs.
+`build_dataset.py` and `build_all.py` enforce this sequencing.
+
+The phosphoproteomics output file (`cnf_phosphoproteomics.csv`) is produced
+by an inner join between the raw Synapse phospho data and `phosphosites.csv`.
+Sites that don't appear in the reference are silently dropped. As of the
+current release, **12 sites** in the raw cNF phospho data (syn70078415) are
+permanently unmatched because their gene symbols (`BAP18`, `C11orf96`,
+`C14orf93`, `C1orf21`, `C2orf49`, etc.) are absent from the phosphosites
+reference. The remaining 2,974 of 2,986 unique sites (99.6%) are present in
+the output.
+
+## Data sources
+
+| Modality | Synapse ID | Notes |
+|---|---|---|
+| Drug-screen file index | `syn51301431` (table) | `dataType='drug screen'` |
+| Drug-screen file parents | `syn51301414`, `syn51301420`, `syn51301426` | One folder per cohort |
+| RNA-seq TPM (Cohort 1) | `syn66352931` | `salmon.merged.gene_tpm.tsv` |
+| RNA-seq TPM (Cohort 2) | `syn70765053` | `salmon.merged.gene_tpm.tsv` |
+| Global proteomics | `syn74815895` | Batch-corrected; `correctedAbundance` column |
+| Phospho-proteomics | `syn70078415` | Batch-corrected; `correctedAbundance` column; mapped via `phosphosites.csv` |
+| Sample discovery (RNA) | `syn71333780` | All-conditions RNA file for sample enumeration only |
+| Normal Skin folder | `syn74284682` | One child per patient for skin sample discovery |
+
+## Build pipeline
+
+The build follows coderdata conventions: four shell scripts wrapping Python
+implementations, plus shared utilities, packaged in a Docker image.
+
+```
+coderbuild/cnf/
+├── 01-samples-cnf.py       # mint improve_sample_ids, write cnf_samples.csv
+├── build_samples.sh
+├── 02-omics-cnf.py         # write cnf_transcriptomics.csv, cnf_proteomics.csv,
+├── build_omics.sh          #   cnf_phosphoproteomics.csv
+├── 03-drugs-cnf.py         # call pubchem_retrieval + descriptor utility
+├── build_drugs.sh
+├── 04-experiments-cnf.py   # split single/multi-dose, run fit_curve, write cnf_experiments.tsv
+├── build_exp.sh
+├── cnf_utils.py            # shared specimen canonicalization utilities
+├── requirements.txt
+└── README.md (this file)
+
+coderbuild/docker/
+└── Dockerfile.cnf
+```
+
+### Local build
+
+```bash
+# From repository root
+python coderbuild/build_dataset.py --build --validate --dataset cnf
+```
+
+In the full coderdata build, this dataset is orchestrated by `build_dataset.py`
+/ `build_all.py`, which ensures `genes.csv`, `samples.csv`, `drugs.tsv`, and
+`phosphosites.csv` are all present before starting the cNF containers.
+
+
+## What's intentionally not included
+
+- **Mutations** - not yet generated for this dataset.
+- **Copy number** - not yet generated for this dataset.
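+
+## Example: grouping specimens by patient
+
+A minimal pandas sketch of the patient-level grouping mentioned above. The
+path and column names (`other_names`, `improve_sample_id`) are taken from the
+sample table shown earlier; the snippet is illustrative only and not part of
+the build itself.
+
+```python
+import pandas as pd
+
+samples = pd.read_csv("/tmp/cnf_samples.csv")
+
+# other_names carries the NFxxxx patient ID, so a leave-one-patient-out
+# split is a single groupby over that column.
+by_patient = samples.groupby("other_names")["improve_sample_id"].apply(list)
+
+for patient, sample_ids in by_patient.items():
+    # hold out every specimen from this patient together
+    print(patient, sample_ids)
+```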
+
diff --git a/coderbuild/cnf/build_drugs.sh b/coderbuild/cnf/build_drugs.sh
new file mode 100644
index 00000000..9402f178
--- /dev/null
+++ b/coderbuild/cnf/build_drugs.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+# build_drugs.sh - wraps 03-drugs-cnf.py per coderdata convention.
+#
+# Usage:
+#   build_drugs.sh <prev_drug_files>
+#
+# <prev_drug_files> is a comma-delimited list of drug TSV files from
+# previous datasets in the build sequence. Their improve_drug_id values
+# get reused when the canonical SMILES match.
+
+set -euo pipefail
+
+PREV_DRUGS="${1:-}"
+
+python 03-drugs-cnf.py \
+    --prev_drugs "$PREV_DRUGS" \
+    --out_drugs /tmp/cnf_drugs.tsv \
+    --out_desc /tmp/cnf_drug_descriptors.tsv
+
+python build_drug_desc.py --drugtable /tmp/cnf_drugs.tsv --desctable /tmp/cnf_drug_descriptors.tsv.gz
diff --git a/coderbuild/cnf/build_exp.sh b/coderbuild/cnf/build_exp.sh
new file mode 100644
index 00000000..18c8b107
--- /dev/null
+++ b/coderbuild/cnf/build_exp.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+# build_exp.sh - wraps 04-experiments-cnf.py per coderdata convention.
+#
+# Usage:
+#   build_exp.sh <samples> <drugs>
+
+set -euo pipefail
+
+SAMPLES="${1:?Usage: build_exp.sh <samples> <drugs>}"
+DRUGS="${2:?Usage: build_exp.sh <samples> <drugs>}"
+
+python 04-experiments-cnf.py "$SAMPLES" "$DRUGS" --output /tmp/cnf_experiments.tsv
\ No newline at end of file
diff --git a/coderbuild/cnf/build_omics.sh b/coderbuild/cnf/build_omics.sh
new file mode 100644
index 00000000..7db0d062
--- /dev/null
+++ b/coderbuild/cnf/build_omics.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+# build_omics.sh - wraps 02-omics-cnf.py per coderdata convention.
+#
+# Usage:
+#   build_omics.sh <genes> <samples> <phosphosites>
+#
+# - <genes>: genes.csv with gene_symbol → entrez_id mappings
+# - <samples>: cnf_samples.csv produced by build_samples.sh
+# - <phosphosites>: phosphosites.csv (other_id → phosphosite_id)
+
+set -euo pipefail
+
+GENES="${1:?Usage: build_omics.sh <genes> <samples> <phosphosites>}"
+SAMPLES="${2:?Usage: build_omics.sh <genes> <samples> <phosphosites>}"
+PHOSPHOSITES="${3:?Usage: build_omics.sh <genes> <samples> <phosphosites>}"
+
+python 02-omics-cnf.py "$GENES" "$SAMPLES" "$PHOSPHOSITES" \
+    --out_rna /tmp/cnf_transcriptomics.csv \
+    --out_prot /tmp/cnf_proteomics.csv \
+    --out_phospho /tmp/cnf_phosphoproteomics.csv
\ No newline at end of file
diff --git a/coderbuild/cnf/build_samples.sh b/coderbuild/cnf/build_samples.sh
new file mode 100644
index 00000000..37af2162
--- /dev/null
+++ b/coderbuild/cnf/build_samples.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+# build_samples.sh - wraps 01-samples-cnf.py per coderdata convention.
+#
+# Usage:
+#   build_samples.sh <prev_samples>
+#
+# The previous samples file is the latest samples CSV from the coderdata
+# build (with the highest improve_sample_id used so far). 01-samples-cnf.py
+# reads it, finds the max ID, and increments from there.
+
+set -euo pipefail
+
+PREV_SAMPLES="${1:?Usage: build_samples.sh <prev_samples>}"
+
+python 01-samples-cnf.py "$PREV_SAMPLES" --output /tmp/cnf_samples.csv
\ No newline at end of file
diff --git a/coderbuild/cnf/cnf_utils.py b/coderbuild/cnf/cnf_utils.py
new file mode 100644
index 00000000..10ec38d2
--- /dev/null
+++ b/coderbuild/cnf/cnf_utils.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+"""
+cnf_utils.py: shared specimen classification utilities for the cNF build scripts.
+
+Used by 01-samples-cnf.py, 02-omics-cnf.py, and 04-experiments-cnf.py to
+canonicalize specimen strings so omics data, drug measurements, and the sample
+table all use the same improve_sample_id keys.
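+
+Worked examples (doctest-style; the outputs follow from the regexes below):
+
+    >>> classify_specimen("NF0017.T1.organoid")
+    ('NF0017_T1', 'organoid')
+    >>> classify_specimen("NF0021.T1.Onalespid.1uM")
+    ('NF0021_T1_Onalespid_1uM', 'treated_organoid')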
+""" + +import re + + +# ---------------------------------------------------------------------------- +# Sample type → coderdata model_type enum +# ---------------------------------------------------------------------------- +MODEL_TYPE_MAP = { + "organoid": "patient derived organoid", + "treated_organoid": "patient derived organoid", + "tumor_tissue": "tumor", + "normal_skin": "normal_tissue", +} + + + + +# ---------------------------------------------------------------------------- +# Regex constants +# ---------------------------------------------------------------------------- +# Bare tumor organoid: NFxxxx_Tx (4 patient digits, T + tumor digits) +TUMOR_RE = re.compile(r"^NF\d{4}_T\d+$") +# Patient prefix at the start of any specimen string +PATIENT_PREFIX_RE = re.compile(r"^(NF\d{4})") +# Normal skin marker: NFxxxx[_-]skin..., allowing trailing date/replicate junk +SKIN_RE = re.compile(r"^(NF\d{4})[_-]skin", re.IGNORECASE) +# Primary tumor tissue (not organoid culture): NFxxxx_Tx_tissue +TISSUE_RE = re.compile(r"^(NF\d{4}_T\d+)_tissue$", re.IGNORECASE) +# Treated organoid: NFxxxx_Tx_, e.g. NF0021_T1_Onalespid_1uM +TREATED_RE = re.compile(r"^NF\d{4}_T\d+_.+$") + + +# ---------------------------------------------------------------------------- +# Classification +# ---------------------------------------------------------------------------- +def classify_specimen(spec) -> tuple[str, str] | None: + """Classify a raw specimen string into (canonical_id, sample_type) or None. + + sample_type ∈ {"organoid", "treated_organoid", "tumor_tissue", "normal_skin"}. + + Canonical IDs: + organoid → NFxxxx_Tx (suffix stripped) + treated_organoid → NFxxxx_Tx_ (label preserved so + treated samples don't collide + with the untreated organoid) + tumor_tissue → NFxxxx_Tx_tissue (suffix preserved) + normal_skin → NFxxxx_skin (replicates collapsed per patient) + + Unrecognized prefixes and malformed inputs return None. + """ + if spec is None: + return None + s = str(spec).strip() + if not s or s.lower() == "nan": + return None + + # Convert dot-separated form (e.g. RNA file's "NF0017.T1.organoid") to + # underscore-separated. + s_norm = s.replace(".", "_") + + # Skin: any string starting with NFxxxx that contains a "skin" marker + skin_m = SKIN_RE.match(s_norm) + if skin_m: + return (f"{skin_m.group(1)}_skin", "normal_skin") + + # Tumor tissue: NFxxxx_Tx_tissue + tissue_m = TISSUE_RE.match(s_norm) + if tissue_m: + return (f"{tissue_m.group(1)}_tissue", "tumor_tissue") + + # Strip organoid suffix (handles "_organoid" and "_organoids") + s_stripped = re.sub(r"_organoids?$", "", s_norm, flags=re.IGNORECASE) + + # Bare organoid (untreated): NFxxxx_Tx exactly + if TUMOR_RE.match(s_stripped): + return (s_stripped, "organoid") + + # Treated organoid: NFxxxx_Tx_ + if TREATED_RE.match(s_stripped): + return (s_stripped, "treated_organoid") + + return None + + +def patient_from_specimen(specimen: str) -> str: + """Extract NFxxxx patient ID from a canonical specimen ID.""" + m = PATIENT_PREFIX_RE.match(specimen) + return m.group(1) if m else "" + + +def canonicalize_specimen_column(df, specimen_col: str): + """Apply classify_specimen to a dataframe's specimen column in place. + + Returns the same df with: + - rows whose specimen classifies dropped if None + - specimen_col values replaced with their canonical IDs + - a new column 'sample_type' added + + Used by build_omics.py and build_exp.py to map raw Synapse specimen + strings (e.g. 
"NF0018.T1.organoid", "NF0021.T1.Onalespid.1uM") to + canonical IDs that match the cnf_samples.csv `other_id` column. + """ + classifications = df[specimen_col].apply(classify_specimen) + keep = classifications.notna() + df = df[keep].copy() + classifications = classifications[keep] + df[specimen_col] = [c[0] for c in classifications] + df["sample_type"] = [c[1] for c in classifications] + return df diff --git a/coderbuild/cnf/requirements.txt b/coderbuild/cnf/requirements.txt new file mode 100644 index 00000000..a5677837 --- /dev/null +++ b/coderbuild/cnf/requirements.txt @@ -0,0 +1,9 @@ +pandas>=2.0 +numpy>=1.24,<2.0 +synapseclient>=4.0 +requests>=2.28 +scipy>=1.10 +matplotlib +scikit-learn +scipy +mordred \ No newline at end of file diff --git a/coderbuild/docker/Dockerfile.cnf b/coderbuild/docker/Dockerfile.cnf new file mode 100644 index 00000000..06069ef1 --- /dev/null +++ b/coderbuild/docker/Dockerfile.cnf @@ -0,0 +1,53 @@ +# Dockerfile.cnf - container for building the cNF (cutaneous neurofibroma) +# organoid drug-response dataset for inclusion in coderdata. +# +# Conventions (matching coderbuild/docker/Dockerfile.): +# * Working directory is /coderbuild/cnf +# * Standard utilities are mounted/copied to /coderbuild/utils (fit_curve.py, +# pubchem_retrieval.py, build_drug_descriptor_table.py) +# * Synapse auth via SYNAPSE_AUTH_TOKEN env var (passed at `docker run`) +# * Build scripts are invoked from build_dataset.py / build_all.py via the +# standard build_*.sh entry points +# +# Build: +# docker build -f coderbuild/docker/Dockerfile.cnf -t coderdata-cnf . +# +# Run (orchestrated by build_dataset.py / build_all.py): +# docker run --rm \ +# -e SYNAPSE_AUTH_TOKEN=$SYNAPSE_AUTH_TOKEN \ +# -e CNF_RNA_TPM_SYN_ID=$CNF_RNA_TPM_SYN_ID \ +# -v $(pwd)/local_data:/coderbuild/local_data \ +# coderdata-cnf bash build_samples.sh /coderbuild/local_data/prev_samples.csv + +FROM python:3.11-slim + +# System deps (rdkit-pypi pulls in some C libs; basic build tools for scipy) +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential wget ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# Set up coderbuild layout +WORKDIR /coderbuild/cnf + +# Install Python deps. rdkit is brought in via the descriptor utility but +# pinned here so the descriptor step is reproducible. 
+COPY coderbuild/cnf/requirements.txt /coderbuild/cnf/requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt rdkit-pypi
+
+# Copy the cNF build scripts
+COPY coderbuild/cnf/01-samples-cnf.py /coderbuild/cnf/
+COPY coderbuild/cnf/02-omics-cnf.py /coderbuild/cnf/
+COPY coderbuild/cnf/03-drugs-cnf.py /coderbuild/cnf/
+COPY coderbuild/cnf/04-experiments-cnf.py /coderbuild/cnf/
+COPY coderbuild/cnf/build_samples.sh /coderbuild/cnf/
+COPY coderbuild/cnf/build_omics.sh /coderbuild/cnf/
+COPY coderbuild/cnf/build_drugs.sh /coderbuild/cnf/
+COPY coderbuild/cnf/build_exp.sh /coderbuild/cnf/
+COPY coderbuild/cnf/cnf_utils.py /coderbuild/cnf/
+COPY coderbuild/utils/build_drug_desc.py /coderbuild/cnf/
+
+RUN chmod +x /coderbuild/cnf/*.sh
+
+# Standard coderdata utilities (fit_curve.py, pubchem_retrieval.py, etc.)
+COPY coderbuild/utils /coderbuild/utils
+
diff --git a/coderbuild/docker/Dockerfile.phosphosites b/coderbuild/docker/Dockerfile.phosphosites
new file mode 100644
index 00000000..a45b9702
--- /dev/null
+++ b/coderbuild/docker/Dockerfile.phosphosites
@@ -0,0 +1,14 @@
+FROM python:3.11-slim
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /coderbuild/phosphosites
+
+RUN pip install --no-cache-dir "pandas>=2.0" "synapseclient>=4.0" "openpyxl>=3.1"
+
+COPY coderbuild/phosphosites/00-buildPhosphositeFile.py /coderbuild/phosphosites/
+COPY coderbuild/phosphosites/build_phosphosites.sh /coderbuild/phosphosites/
+
+RUN chmod +x /coderbuild/phosphosites/build_phosphosites.sh
diff --git a/coderbuild/docker/docker-compose.yml b/coderbuild/docker/docker-compose.yml
index 51eb56ee..0a87e0e5 100644
--- a/coderbuild/docker/docker-compose.yml
+++ b/coderbuild/docker/docker-compose.yml
@@ -96,6 +96,15 @@ services:
     platform: linux/amd64
     image: genes:latest

+  phosphosites:
+    build:
+      context: ../../
+      dockerfile: coderbuild/docker/Dockerfile.phosphosites
+      args:
+        HTTPS_PROXY: ${HTTPS_PROXY}
+    platform: linux/amd64
+    image: phosphosites:latest
+
   upload:
     build:
       context: ../../
@@ -130,4 +139,13 @@ services:
       args:
         HTTPS_PROXY: ${HTTPS_PROXY}
       platform: linux/amd64
-    image: novartis:latest
\ No newline at end of file
+    image: novartis:latest
+
+  cnf:
+    build:
+      context: ../../
+      dockerfile: coderbuild/docker/Dockerfile.cnf
+      args:
+        HTTPS_PROXY: ${HTTPS_PROXY}
+    platform: linux/amd64
+    image: cnf:latest
\ No newline at end of file
diff --git a/coderbuild/genes/00-buildGeneFile.R b/coderbuild/genes/00-buildGeneFile.R
index 5c113a94..75d8c31f 100755
--- a/coderbuild/genes/00-buildGeneFile.R
+++ b/coderbuild/genes/00-buildGeneFile.R
@@ -1,57 +1,145 @@
-##this uses the Hg38 database to seed the gene name mappings that we might want to use
-
-#if(!require('org.Hs.eg.db')){
-#  BiocManager::install('org.Hs.eg.db')
-library(org.Hs.eg.db)
-library(biomaRt)
-#}
-
-library(dplyr)
-##get entrez ids to symbol
-entrez<-as.data.frame(org.Hs.egALIAS2EG)
-
-sym <- as.data.frame(org.Hs.egSYMBOL)
-
-##get entriz ids to ensembl
-ens<-as.data.frame(org.Hs.egENSEMBL2EG)
-
-##get transcript ids as well
-enst<-as.data.frame(org.Hs.egENSEMBLTRANS)
-
- #now we can filter by protein coding using biomart
-ensembl <- useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl")
-
-tab <- getBM(attributes=c('ensembl_gene_id'),filters='biotype', values=c('protein_coding'),mart=ensembl)
-
-
-joined.df<-entrez|>
-  left_join(sym)|>
-  dplyr::rename(entrez_id='gene_id',gene_symbol='symbol',other_id='alias_symbol',gene_symbol='symbol')%>%
-  mutate(other_id_source='entrez_alias')
-
-##now get aliases from ensembl
-edf <- sym|>
-  inner_join(ens)|>
-  dplyr::rename(entrez_id='gene_id',gene_symbol='symbol',other_id='ensembl_id')%>%
-  mutate(other_id_source='ensembl_gene')
-
-
-tdf<-sym|>
-  inner_join(enst)|>
-  dplyr::rename(entrez_id='gene_id',gene_symbol='symbol',other_id='trans_id')|>
-  subset(entrez_id%in%edf$entrez_id)|>
-#  subset(gene_symbol%in%ed.df$gene_symbol)|>
-  dplyr::mutate(other_id_source='ensembl_transcript')
-
-
-prots<-subset(edf,other_id%in%tab$ensembl_gene_id)
-
-full.df<-rbind(joined.df,edf,tdf)|>
-  subset(entrez_id%in%prots$entrez_id)|>
-  distinct()
-
-#save to file and version
-write.table(full.df,'/tmp/genes.csv',sep=',',row.names=F,quote=T)
-
-##store this file somewhere!
-
+## Build /tmp/genes.csv — entrez_id, gene_symbol, other_id, other_id_source.
+##
+## Uses org.Hs.eg.db for the local mappings (entrez ↔ symbol, ensembl gene,
+## ensembl transcript) and biomaRt to filter to protein-coding genes.
+##
+## biomaRt's call to Ensembl frequently fails with mirror unavailability.
+## To handle this, the Ensembl call is wrapped in a retry loop:
+##   - cycles through all four mirror choices ('www', 'useast', 'asia', 'uswest')
+##   - each cycle waits longer than the last (exponential backoff)
+##   - up to MAX_CYCLES total cycles before giving up
+##
+## If every mirror remains unreachable, the script falls back to a local
+## protein-coding proxy derived from org.Hs.eg.db's Ensembl mapping (keeping
+## only entrez IDs that have ENSG IDs in `ens`, which drops most
+## pseudogenes/RNAs). The fallback is documented in the code so the lower
+## coverage is visible.
+
+suppressPackageStartupMessages({
+  library(org.Hs.eg.db)
+  library(biomaRt)
+  library(dplyr)
+})
+
+# ---- Local mappings (no network) -------------------------------------------
+entrez <- as.data.frame(org.Hs.egALIAS2EG)
+sym    <- as.data.frame(org.Hs.egSYMBOL)
+ens    <- as.data.frame(org.Hs.egENSEMBL2EG)
+enst   <- as.data.frame(org.Hs.egENSEMBLTRANS)
+
+
+# ---- biomaRt with retry / mirror cycling -----------------------------------
+#
+# Strategy:
+#   For up to MAX_CYCLES, try each mirror in MIRRORS in order.
+#   On a successful useEnsembl() + getBM() call, return the result.
+#   On any failure, sleep for the current backoff and try the next mirror.
+#   Backoff doubles each cycle (30s → 60s → 120s → ...).
+#
+# Total sleep at MAX_CYCLES=5 is 7.5 minutes (30+60+120+240s); with per-attempt
+# network timeouts the worst case is roughly half an hour, which has
+# historically been enough for transient Ensembl outages to clear.
+ +MIRRORS <- c("www", "useast", "asia", "uswest") +MAX_CYCLES <- 5 +BASE_SLEEP <- 30 # seconds; doubles each cycle + +fetch_protein_coding <- function() { + for (cycle in seq_len(MAX_CYCLES)) { + sleep_s <- BASE_SLEEP * (2 ^ (cycle - 1)) + for (mirror in MIRRORS) { + message(sprintf("[%s] Cycle %d/%d, trying mirror '%s'...", + Sys.time(), cycle, MAX_CYCLES, mirror)) + result <- tryCatch({ + ensembl <- useEnsembl( + biomart = "genes", + dataset = "hsapiens_gene_ensembl", + mirror = mirror + ) + tab <- getBM( + attributes = c("ensembl_gene_id"), + filters = "biotype", + values = c("protein_coding"), + mart = ensembl + ) + message(sprintf("[%s] Success via mirror '%s' (%d genes)", + Sys.time(), mirror, nrow(tab))) + tab + }, error = function(e) { + message(sprintf("[%s] Mirror '%s' failed: %s", + Sys.time(), mirror, conditionMessage(e))) + NULL + }) + if (!is.null(result)) return(result) + } + if (cycle < MAX_CYCLES) { + message(sprintf("[%s] All mirrors failed in cycle %d; sleeping %ds before next cycle", + Sys.time(), cycle, sleep_s)) + Sys.sleep(sleep_s) + } + } + return(NULL) +} + +tab <- fetch_protein_coding() + + +# ---- Fallback: protein-coding inference from org.Hs.eg.db ------------------- +# If every Ensembl mirror is unreachable, derive a protein-coding gene list +# locally. org.Hs.eg.db doesn't carry biotype directly, but every entrez ID +# that has an ENSG mapping in org.Hs.egENSEMBL2EG is reasonably treated as +# a "real" gene; combined with the symbol mapping, this approximates the +# protein-coding filter well enough to keep the build going. +# +# The fallback genes.csv will have somewhat broader coverage than the +# Ensembl-filtered version (it includes some pseudogenes etc), but the +# downstream omics joins are inner-joined on gene_symbol, so any extras +# that don't appear in the data files just get dropped harmlessly. + +if (is.null(tab)) { + message("\n=== WARNING ===") + message("All Ensembl mirrors unreachable after ", MAX_CYCLES, + " cycles. Using local org.Hs.eg.db fallback.") + message("This produces a slightly broader gene list than the canonical ", + "Ensembl-filtered version. To rebuild with the canonical filter, ", + "re-run this script when Ensembl is reachable.") + message("===============\n") + # Use every entrez ID that has an ENSG mapping as the protein-coding proxy. 
+  tab <- data.frame(ensembl_gene_id = unique(ens$ensembl_id),
+                    stringsAsFactors = FALSE)
+}
+
+
+# ---- Assemble joined alias / ensembl gene / ensembl transcript table -------
+joined.df <- entrez |>
+  left_join(sym, by = "gene_id") |>
+  dplyr::rename(entrez_id = "gene_id",
+                gene_symbol = "symbol",
+                other_id = "alias_symbol") |>
+  mutate(other_id_source = "entrez_alias")
+
+edf <- sym |>
+  inner_join(ens, by = "gene_id") |>
+  dplyr::rename(entrez_id = "gene_id",
+                gene_symbol = "symbol",
+                other_id = "ensembl_id") |>
+  mutate(other_id_source = "ensembl_gene")
+
+tdf <- sym |>
+  inner_join(enst, by = "gene_id") |>
+  dplyr::rename(entrez_id = "gene_id",
+                gene_symbol = "symbol",
+                other_id = "trans_id") |>
+  subset(entrez_id %in% edf$entrez_id) |>
+  dplyr::mutate(other_id_source = "ensembl_transcript")
+
+prots <- subset(edf, other_id %in% tab$ensembl_gene_id)
+
+full.df <- rbind(joined.df, edf, tdf) |>
+  subset(entrez_id %in% prots$entrez_id) |>
+  distinct()
+
+
+# ---- Write ------------------------------------------------------------------
+out_path <- "/tmp/genes.csv"
+write.table(full.df, out_path, sep = ",", row.names = FALSE, quote = TRUE)
+message(sprintf("Wrote %s (%d rows, %d unique entrez_ids)",
+                out_path, nrow(full.df), length(unique(full.df$entrez_id))))
\ No newline at end of file
diff --git a/coderbuild/phosphosites/00-buildPhosphositeFile.py b/coderbuild/phosphosites/00-buildPhosphositeFile.py
new file mode 100644
index 00000000..09292dec
--- /dev/null
+++ b/coderbuild/phosphosites/00-buildPhosphositeFile.py
@@ -0,0 +1,589 @@
+#!/usr/bin/env python3
+"""
+00-buildPhosphositeFile.py: build the phosphosites reference CSV from three sources:
+
+  1. Ochoa et al. (2020) Nat Biotechnol 38, 365-373: ~116k curated human
+     phosphosites from Supplementary Table S3 (Springer static content).
+     https://static-content.springer.com/esm/art%3A10.1038%2Fs41587-019-0344-3/
+     MediaObjects/41587_2019_344_MOESM4_ESM.xlsx
+
+  2. UniProt PTM annotations: all reviewed human proteins with phospho
+     modifications from the UniProt REST API (~41k sites, ~12k unique to UniProt).
+
+  3. Synapse supplement(s): raw phosphoproteomics CSV/TSV files on Synapse
+     whose site column is parsed to add experiment-specific sites not in the
+     databases above (--synapse_supplements syn70078415 ...).
+
+Sources 1 and 2 are always fetched. Source 3 is optional but recommended.
+
+UniProt accession -> gene_symbol: UniProt REST API (TSV, accession+gene_names).
+gene_symbol -> entrez_id: genes.csv.
+
+Site notation: <gene_symbol>-<residue><position><mod>
+  e.g. AAAS-S495s (s=phosphoserine, t=phosphothreonine, y=phosphotyrosine)
+
+Output columns:
+  phosphosite_id, entrez_id, gene_symbol, residue, position, modification, other_id
+
+Usage:
+  python 00-buildPhosphositeFile.py <genes.csv>
+    [--out /tmp/phosphosites.csv]
+    [--prev /tmp/prev_phosphosites.csv]
+    [--supplement site_file.csv ...]
+    [--site_col site]
+    [--synapse_supplements syn70078415 ...]
+ [--synapse_site_col site] + [--ochoa_url ] + [--uniprot_gene_url ] + [--uniprot_ptm_url ] +""" + +import argparse +import csv +import io +import json +import logging +import re +import sys +import tempfile +import time +import urllib.request + +import openpyxl +import pandas as pd + + +OCHOA_URL = ( + "https://static-content.springer.com/esm/" + "art%3A10.1038%2Fs41587-019-0344-3/MediaObjects/" + "41587_2019_344_MOESM4_ESM.xlsx" +) + +UNIPROT_GENE_URL = ( + "https://rest.uniprot.org/uniprotkb/search" + "?query=reviewed:true+AND+organism_id:9606" + "&fields=accession,gene_names" + "&format=tsv" + "&size=500" +) + +UNIPROT_PTM_URL = ( + "https://rest.uniprot.org/uniprotkb/search" + "?query=reviewed:true+AND+organism_id:9606+AND+ft_mod_res:phospho*" + "&fields=accession,gene_names,ft_mod_res" + "&format=json" + "&size=200" +) + +_RESIDUE_MOD = {"S": "s", "T": "t", "Y": "y"} +_PHOSPHO_DESC_MAP = { + "phosphoserine": ("S", "s"), + "phosphothreonine": ("T", "t"), + "phosphotyrosine": ("Y", "y"), +} +_SITE_RE = re.compile(r'^(.+)-([STY])(\d+)([a-z]+)$', re.IGNORECASE) +_SUPP_MOD_CODES = {'s': 'phospho', 't': 'phospho', 'y': 'phospho'} + +_HEADERS = { + "User-Agent": ( + "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0" + ), + "Accept": "*/*", +} + + +# --------------------------------------------------------------------------- +# Logging +# --------------------------------------------------------------------------- +def configure_logging(): + logging.basicConfig( + format="[%(asctime)s] [%(levelname)s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO, + ) + + +# --------------------------------------------------------------------------- +# Reference data +# --------------------------------------------------------------------------- +def load_gene_map(genes_path: str) -> pd.DataFrame: + df = pd.read_csv(genes_path) + needed = {'gene_symbol', 'entrez_id'} + if needed - set(df.columns): + raise KeyError(f"genes file missing columns: {needed - set(df.columns)}") + df = df.dropna(subset=['gene_symbol', 'entrez_id']).copy() + df['entrez_id'] = df['entrez_id'].astype(int) + df = df.drop_duplicates(subset='gene_symbol')[['gene_symbol', 'entrez_id']] + logging.info("Loaded %d gene_symbol → entrez_id mappings", len(df)) + return df + + +# --------------------------------------------------------------------------- +# Network helper +# --------------------------------------------------------------------------- +def _fetch_url(url: str, retries: int = 3, timeout: int = 120) -> bytes: + req = urllib.request.Request(url, headers=_HEADERS) + for attempt in range(1, retries + 1): + try: + logging.info(" GET %s (attempt %d/%d)", url, attempt, retries) + with urllib.request.urlopen(req, timeout=timeout) as resp: + return resp.read() + except Exception as exc: + logging.warning(" Attempt %d failed: %s", attempt, exc) + if attempt < retries: + time.sleep(5 * attempt) + raise RuntimeError(f"Failed to fetch {url} after {retries} attempts") + + +def _fetch_json_page(url: str, timeout: int = 120) -> tuple[list[dict], str | None]: + req = urllib.request.Request(url, headers={**_HEADERS, "Accept": "application/json"}) + with urllib.request.urlopen(req, timeout=timeout) as resp: + data = json.loads(resp.read().decode('utf-8')) + link_header = resp.headers.get('Link', '') + entries = data.get('results', []) + m = re.search(r'<([^>]+)>;\s*rel="next"', link_header) + return entries, m.group(1) if m else None + + +# 
--------------------------------------------------------------------------- +# Source 1: Ochoa et al. supplementary table +# --------------------------------------------------------------------------- +def fetch_ochoa_phosphosites(url: str) -> list[dict]: + """Download Ochoa MOESM4 xlsx and return raw phosphosite records.""" + logging.info("Downloading Ochoa et al. supplementary phosphoproteome") + data = _fetch_url(url) + + with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp: + tmp.write(data) + tmp_path = tmp.name + + logging.info("Parsing xlsx (%d MB) …", len(data) // 1_000_000) + wb = openpyxl.load_workbook(tmp_path, read_only=True, data_only=True) + + sheet_name = "annotated_phosphoproteome" + if sheet_name not in wb.sheetnames: + wb.close() + raise ValueError( + f"Sheet '{sheet_name}' not found. Available: {wb.sheetnames}" + ) + + ws = wb[sheet_name] + records, header, uni_i, pos_i, res_i, skipped = [], None, None, None, None, 0 + + for row in ws.iter_rows(values_only=True): + if header is None: + header = [str(c).strip() if c is not None else "" for c in row] + try: + uni_i = header.index("uniprot") + pos_i = header.index("position") + res_i = header.index("residue") + except ValueError as exc: + wb.close() + raise ValueError(f"Required column missing in Ochoa sheet: {exc}") + logging.info( + "Ochoa columns: uniprot=%d, position=%d, residue=%d", + uni_i, pos_i, res_i, + ) + continue + + acc = str(row[uni_i]).strip() if row[uni_i] is not None else "" + try: + pos = int(row[pos_i]) + except (TypeError, ValueError): + skipped += 1 + continue + + res_raw = str(row[res_i]).strip().upper() if row[res_i] is not None else "" + residue = res_raw[0] if res_raw and res_raw[0] in _RESIDUE_MOD else None + + if residue is None or not acc: + skipped += 1 + continue + + records.append({ + "accession": acc, + "residue": residue, + "position": pos, + "modification": "phospho", + "mod_code": _RESIDUE_MOD[residue], + }) + + wb.close() + logging.info( + "Ochoa: %d records parsed (%d skipped)", len(records), skipped + ) + return records + + +# --------------------------------------------------------------------------- +# Source 2: UniProt PTM annotations +# --------------------------------------------------------------------------- +def fetch_uniprot_ptm_sites(url: str) -> list[dict]: + """Fetch phosphorylation sites from UniProt Modified residue features. + + Returns records with gene_symbol (already resolved) instead of accession. 
+ """ + logging.info("Fetching UniProt PTM phosphorylation sites") + all_records = [] + next_url: str | None = url + page = 0 + + while next_url: + page += 1 + if page % 10 == 1: + logging.info(" UniProt PTM page %d (%d records so far)", page, len(all_records)) + + for attempt in range(1, 4): + try: + entries, next_url = _fetch_json_page(next_url) + break + except Exception as exc: + logging.warning(" Page %d attempt %d failed: %s", page, attempt, exc) + if attempt == 3: + raise + time.sleep(5 * attempt) + + for entry in entries: + gene = "" + for g in entry.get("genes", []): + if "geneName" in g: + gene = g["geneName"].get("value", "").upper() + break + if not gene: + continue + for feat in entry.get("features", []): + if feat.get("type") != "Modified residue": + continue + desc = feat.get("description", "").lower() + residue, mod_code = None, None + for key, (aa, mc) in _PHOSPHO_DESC_MAP.items(): + if desc.startswith(key): + residue, mod_code = aa, mc + break + if residue is None: + continue + pos = feat.get("location", {}).get("start", {}).get("value") + if pos is None: + continue + all_records.append({ + "gene_symbol": gene, + "residue": residue, + "position": int(pos), + "modification": "phospho", + "mod_code": mod_code, + "other_id": f"{gene}-{residue}{pos}{mod_code}", + }) + + logging.info( + "UniProt PTM: %d records from %d pages", len(all_records), page + ) + return all_records + + +# --------------------------------------------------------------------------- +# UniProt accession → gene_symbol +# --------------------------------------------------------------------------- +def fetch_uniprot_gene_map(url: str) -> dict[str, str]: + """Return accession → primary gene_symbol for all reviewed human proteins.""" + logging.info("Fetching UniProt accession → gene_symbol map") + acc_to_gene: dict[str, str] = {} + next_url: str | None = url + page = 0 + + while next_url: + page += 1 + if page % 10 == 1: + logging.info( + " UniProt gene map page %d (%d entries so far)", page, len(acc_to_gene) + ) + for attempt in range(1, 4): + try: + req = urllib.request.Request(next_url, headers=_HEADERS) + with urllib.request.urlopen(req, timeout=120) as resp: + raw = resp.read() + link_header = resp.headers.get("Link", "") + break + except Exception as exc: + logging.warning(" Page %d attempt %d failed: %s", page, attempt, exc) + if attempt == 3: + raise + time.sleep(5 * attempt) + + reader = csv.DictReader( + io.StringIO(raw.decode("utf-8", errors="replace")), delimiter="\t" + ) + for row in reader: + acc = row.get("Entry", row.get("accession", "")).strip() + gene_names = row.get("Gene Names", row.get("gene_names", "")).strip() + if acc and gene_names: + acc_to_gene[acc] = gene_names.split()[0].upper() + + m = re.search(r'<([^>]+)>;\s*rel="next"', link_header) + next_url = m.group(1) if m else None + + logging.info( + "UniProt gene map: %d entries (%d pages)", len(acc_to_gene), page + ) + return acc_to_gene + + +# --------------------------------------------------------------------------- +# Source 3: Synapse supplement(s) +# --------------------------------------------------------------------------- +def fetch_synapse_supplement(syn_id: str, site_col: str) -> list[dict]: + """Download a Synapse file and parse its site strings as supplement records.""" + import synapseclient + logging.info("Fetching Synapse supplement %s", syn_id) + syn = synapseclient.Synapse() + syn.login(silent=True) + entity = syn.get(syn_id) + path = entity.path + sep = "\t" if path.endswith((".tsv", ".tsv.gz")) else "," + + import 
gzip as _gzip + open_fn = _gzip.open if path.endswith(".gz") else open + + records = [] + skipped = 0 + seen: set[str] = set() + resolved_col: str | None = None + + with open_fn(path, "rt", encoding="utf-8", errors="replace") as f: + reader = csv.DictReader(f, delimiter=sep) + for row in reader: + # Auto-detect site column on first row if requested column missing + if resolved_col is None: + if site_col in row: + resolved_col = site_col + else: + for candidate in ["site", "Site", "phosphosite", "feature_id"]: + if candidate in row: + resolved_col = candidate + logging.info( + " Synapse %s: using column '%s' for sites", + syn_id, resolved_col, + ) + break + if resolved_col is None: + logging.warning( + " Synapse %s: could not find site column (tried %s). " + "Available: %s", + syn_id, site_col, list(row.keys())[:10], + ) + return [] + + site = row.get(resolved_col, "").strip() + if not site or site in seen: + continue + seen.add(site) + m = _SITE_RE.match(site) + if not m: + skipped += 1 + continue + mod_code = m.group(4).lower() + records.append({ + "gene_symbol": m.group(1).upper(), + "residue": m.group(2).upper(), + "position": int(m.group(3)), + "modification": _SUPP_MOD_CODES.get(mod_code, f"phospho_{mod_code}"), + "mod_code": mod_code, + "other_id": site, + }) + + logging.info( + "Synapse %s: %d unique sites parsed (%d skipped)", syn_id, len(records), skipped + ) + return records + + +# --------------------------------------------------------------------------- +# Local file supplement +# --------------------------------------------------------------------------- +def parse_supplement_sites(path: str, site_col: str) -> list[dict]: + records = [] + skipped = 0 + sep = "\t" if path.endswith(".tsv") else "," + with open(path, newline="", encoding="utf-8") as f: + reader = csv.DictReader(f, delimiter=sep) + seen: set[str] = set() + for row in reader: + site = row.get(site_col, "").strip() + if not site or site in seen: + continue + seen.add(site) + m = _SITE_RE.match(site) + if not m: + skipped += 1 + continue + mod_code = m.group(4).lower() + records.append({ + "gene_symbol": m.group(1).upper(), + "residue": m.group(2).upper(), + "position": int(m.group(3)), + "modification": _SUPP_MOD_CODES.get(mod_code, f"phospho_{mod_code}"), + "mod_code": mod_code, + "other_id": site, + }) + logging.info( + "Supplement %s: %d unique sites (%d skipped)", path, len(records), skipped + ) + return records + + +# --------------------------------------------------------------------------- +# Assembly +# --------------------------------------------------------------------------- +def build_phosphosite_table( + ochoa_records: list[dict], + acc_to_gene: dict[str, str], + uniprot_records: list[dict], + gene_map: pd.DataFrame, + supplement_records: list[dict], + prev: pd.DataFrame | None, +) -> pd.DataFrame: + gene_to_entrez = gene_map.set_index("gene_symbol")["entrez_id"].to_dict() + rows: list[dict] = [] + existing_ids: set[str] = set() + + def _add(gene: str, residue: str, position: int, modification: str, mod_code: str) -> None: + other_id = f"{gene}-{residue}{position}{mod_code}" + if other_id in existing_ids: + return + entrez = gene_to_entrez.get(gene) + if entrez is None: + return + rows.append({ + "gene_symbol": gene, + "entrez_id": entrez, + "residue": residue, + "position": position, + "modification": modification, + "other_id": other_id, + }) + existing_ids.add(other_id) + + # --- Ochoa (primary) --- + no_gene_ochoa = 0 + for rec in ochoa_records: + gene = acc_to_gene.get(rec["accession"], "") + if 
not gene: + no_gene_ochoa += 1 + continue + _add(gene, rec["residue"], rec["position"], rec["modification"], rec["mod_code"]) + logging.info( + "Ochoa: %d sites added (%d with no gene symbol)", len(rows), no_gene_ochoa + ) + + before_uniprot = len(rows) + # --- UniProt PTM (secondary) --- + for rec in uniprot_records: + _add( + rec["gene_symbol"], rec["residue"], rec["position"], + rec["modification"], rec["mod_code"], + ) + logging.info( + "UniProt PTM: +%d new sites (total %d)", len(rows) - before_uniprot, len(rows) + ) + + before_supp = len(rows) + # --- Supplements (local files + Synapse) --- + for rec in supplement_records: + _add( + rec["gene_symbol"], rec["residue"], rec["position"], + rec["modification"], rec["mod_code"], + ) + if len(rows) > before_supp: + logging.info( + "Supplements: +%d new sites (total %d)", len(rows) - before_supp, len(rows) + ) + + df = pd.DataFrame(rows) + + # --- Assign / preserve phosphosite_ids --- + if prev is not None and not prev.empty: + prev_map = prev.set_index("other_id")["phosphosite_id"].to_dict() + next_id = int(prev["phosphosite_id"].max()) + 1 + else: + prev_map = {} + next_id = 1 + + ids = [] + for site in df["other_id"]: + if site in prev_map: + ids.append(prev_map[site]) + else: + ids.append(next_id) + prev_map[site] = next_id + next_id += 1 + df["phosphosite_id"] = ids + + cols = [ + "phosphosite_id", "entrez_id", "gene_symbol", + "residue", "position", "modification", "other_id", + ] + df = df[cols].sort_values("phosphosite_id").reset_index(drop=True) + logging.info( + "Final phosphosites table: %d rows, %d unique genes", + len(df), df["gene_symbol"].nunique(), + ) + return df + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- +def main(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("genes", help="genes.csv (gene_symbol → entrez_id)") + parser.add_argument("--out", default="/tmp/phosphosites.csv") + parser.add_argument("--prev", default=None, + help="Previous phosphosites.csv to preserve IDs") + parser.add_argument("--supplement", nargs="*", default=[], + help="Local CSV/TSV file(s) with site strings") + parser.add_argument("--site_col", default="site", + help="Column name for site strings in local supplement files") + parser.add_argument("--synapse_supplements", nargs="*", default=[], + help="Synapse IDs of raw phospho files to add as supplements " + "(e.g. 
syn70078415)") + parser.add_argument("--synapse_site_col", default="site", + help="Site column name in Synapse supplement files") + parser.add_argument("--ochoa_url", default=OCHOA_URL) + parser.add_argument("--uniprot_gene_url", default=UNIPROT_GENE_URL) + parser.add_argument("--uniprot_ptm_url", default=UNIPROT_PTM_URL) + args = parser.parse_args() + + configure_logging() + + gene_map = load_gene_map(args.genes) + + prev = None + if args.prev: + prev = pd.read_csv(args.prev) + logging.info("Loaded %d previous phosphosites from %s", len(prev), args.prev) + + ochoa_records = fetch_ochoa_phosphosites(args.ochoa_url) + acc_to_gene = fetch_uniprot_gene_map(args.uniprot_gene_url) + uniprot_records = fetch_uniprot_ptm_sites(args.uniprot_ptm_url) + + supp_records: list[dict] = [] + for path in (args.supplement or []): + supp_records.extend(parse_supplement_sites(path, args.site_col)) + for syn_id in (args.synapse_supplements or []): + try: + supp_records.extend(fetch_synapse_supplement(syn_id, args.synapse_site_col)) + except Exception as exc: + logging.warning("Synapse supplement %s failed: %s — skipping", syn_id, exc) + + df = build_phosphosite_table( + ochoa_records, acc_to_gene, uniprot_records, gene_map, supp_records, prev + ) + + df.to_csv(args.out, index=False) + logging.info( + "Wrote %s (%d rows, %d unique genes)", + args.out, len(df), df["gene_symbol"].nunique(), + ) + + +if __name__ == "__main__": + sys.exit(main() or 0) diff --git a/coderbuild/phosphosites/README.md b/coderbuild/phosphosites/README.md new file mode 100644 index 00000000..e2cf788f --- /dev/null +++ b/coderbuild/phosphosites/README.md @@ -0,0 +1,97 @@ +# Phosphosites Reference Builder + +Produces `phosphosites.csv`, the global human phosphosite reference used by +dataset omics pipelines (currently: cNF) to map raw phospho measurements to +stable integer identifiers. + +## Sources + +Three sources are merged in priority order. A site from an earlier source is +never overwritten by a later one (deduplication on `other_id`). + +| Priority | Source | Sites (approx.) | +|---|---|---| +| 1 | Ochoa et al. (2020) Nat Biotechnol 38:365-373, Supp. Table S3 | ~116k | +| 2 | UniProt PTM REST API | ~12k additional | +| 3 | Synapse supplement `syn70078415` (cNF raw phospho data) | ~188 additional | + +## Site notation + +``` +- +e.g. AAAS-S495s (s=phosphoserine, t=phosphothreonine, y=phosphotyrosine) +``` + +`other_id` in `phosphosites.csv` stores these strings and is the join key for +raw phospho data. + +## Output columns + +`phosphosite_id`, `entrez_id`, `gene_symbol`, `residue`, `position`, +`modification`, `other_id` + +## Files + +``` +coderbuild/phosphosites/ +├── 00-buildPhosphositeFile.py # Builder script +└── build_phosphosites.sh # Docker entry point +coderbuild/docker/ +└── Dockerfile.phosphosites +``` + +## Builder — `00-buildPhosphositeFile.py` + +**Ochoa (primary):** Downloads `41587_2019_344_MOESM4_ESM.xlsx` from the +Springer static CDN (no login required; requires `openpyxl`). Reads the +`annotated_phosphoproteome` sheet (`uniprot`, `position`, `residue` columns). +UniProt accessions are resolved to gene symbols via a separate UniProt REST +call (TSV, all reviewed human proteins, paginated via `Link: rel="next"` +header). + +**UniProt PTM (secondary):** Downloads `Modified residue` features with +phosphoserine/phosphothreonine/phosphotyrosine descriptions from the UniProt +REST API (JSON, same Link-header pagination). Returns gene symbols directly. 
diff --git a/coderbuild/phosphosites/README.md b/coderbuild/phosphosites/README.md
new file mode 100644
index 00000000..e2cf788f
--- /dev/null
+++ b/coderbuild/phosphosites/README.md
@@ -0,0 +1,97 @@
+# Phosphosites Reference Builder
+
+Produces `phosphosites.csv`, the global human phosphosite reference used by
+dataset omics pipelines (currently: cNF) to map raw phospho measurements to
+stable integer identifiers.
+
+## Sources
+
+Three sources are merged in priority order. A site from an earlier source is
+never overwritten by a later one (deduplication on `other_id`).
+
+| Priority | Source | Sites (approx.) |
+|---|---|---|
+| 1 | Ochoa et al. (2020) Nat Biotechnol 38:365-373, Supp. Table S3 | ~116k |
+| 2 | UniProt PTM REST API | ~12k additional |
+| 3 | Synapse supplement `syn70078415` (cNF raw phospho data) | ~188 additional |
+
+## Site notation
+
+```
+<GENE>-<Residue><Position><mod>
+e.g. AAAS-S495s (s=phosphoserine, t=phosphothreonine, y=phosphotyrosine)
+```
+
+`other_id` in `phosphosites.csv` stores these strings and is the join key for
+raw phospho data.
+
+## Output columns
+
+`phosphosite_id`, `entrez_id`, `gene_symbol`, `residue`, `position`,
+`modification`, `other_id`
+
+## Files
+
+```
+coderbuild/phosphosites/
+├── 00-buildPhosphositeFile.py   # Builder script
+└── build_phosphosites.sh        # Docker entry point
+coderbuild/docker/
+└── Dockerfile.phosphosites
+```
+
+## Builder — `00-buildPhosphositeFile.py`
+
+**Ochoa (primary):** Downloads `41587_2019_344_MOESM4_ESM.xlsx` from the
+Springer static CDN (no login required; requires `openpyxl`). Reads the
+`annotated_phosphoproteome` sheet (`uniprot`, `position`, `residue` columns).
+UniProt accessions are resolved to gene symbols via a separate UniProt REST
+call (TSV, all reviewed human proteins, paginated via the `Link: rel="next"`
+header).
+
+**UniProt PTM (secondary):** Downloads `Modified residue` features with
+phosphoserine/phosphothreonine/phosphotyrosine descriptions from the UniProt
+REST API (JSON, same Link-header pagination). Returns gene symbols directly.
+
+**Synapse supplement (`--synapse_supplements`):** Downloads a raw phospho
+file from Synapse, auto-detects the site column (`site`, `Site`,
+`phosphosite`, or `feature_id`), and appends any `GENE-ResPositionmod` sites
+not already in the reference. Requires `SYNAPSE_AUTH_TOKEN`. Failures are
+logged as warnings and do not abort the build.
+
+### Key arguments
+
+| Argument | Default | Description |
+|---|---|---|
+| `genes` (positional) | — | Path to `genes.csv` |
+| `--out` | `/tmp/phosphosites.csv` | Output path |
+| `--prev` | None | Previous CSV to preserve `phosphosite_id` values across builds |
+| `--synapse_supplements` | None | Synapse IDs of raw phospho files (space-separated) |
+| `--synapse_site_col` | `site` | Site column name in Synapse supplement files |
+| `--supplement` | None | Local CSV/TSV file(s) with site strings |
+| `--site_col` | `site` | Site column name in local supplement files |
+
+## Build sequencing
+
+Phosphosites must run **after** `genes.csv` is ready. `build_dataset.py` and
+`build_all.py` wait on the genes future before submitting the phosphosites
+container.
+
+## ID stability
+
+Pass `--prev <previous phosphosites.csv>` to preserve IDs across rebuilds.
+New sites receive IDs starting from `max(prev.phosphosite_id) + 1` — same
+pattern as `improve_sample_id` and `improve_drug_id`.
+
+## Local run (debugging only; this step is already built into `build_all.py`)
+
+```bash
+pip install pandas synapseclient openpyxl
+
+export SYNAPSE_AUTH_TOKEN=<your token>
+python coderbuild/phosphosites/00-buildPhosphositeFile.py \
+  local/genes.csv \
+  --out local/phosphosites.csv \
+  --synapse_supplements syn70078415 \
+  --synapse_site_col site
+```
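The README's site grammar is simple enough to pin down with a regex. A hypothetical parser (an illustration of the notation, not the script's actual implementation; the greedy gene match is an assumption):

```python
import re

# Assumed grammar from the README: <GENE>-<Residue><Position><mod>,
# e.g. "AAAS-S495s". The gene part is matched greedily so hyphenated
# gene symbols (e.g. "NKX2-1") still parse correctly.
SITE_RE = re.compile(r"^(?P<gene>.+)-(?P<residue>[STY])(?P<position>\d+)(?P<mod>[sty])$")

def parse_site(site: str) -> dict | None:
    """Split a GENE-ResPositionmod string into its components, or return None."""
    m = SITE_RE.match(site.strip())
    if not m:
        return None
    return {
        "gene_symbol": m.group("gene"),
        "residue": m.group("residue"),
        "position": int(m.group("position")),
        "mod_code": m.group("mod"),
    }

assert parse_site("AAAS-S495s") == {
    "gene_symbol": "AAAS", "residue": "S", "position": 495, "mod_code": "s",
}
assert parse_site("not-a-site") is None
```

Anchoring the residue/position/mod suffix to the end of the string is what keeps hyphenated gene symbols intact under the greedy match.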
diff --git a/coderbuild/phosphosites/build_phosphosites.sh b/coderbuild/phosphosites/build_phosphosites.sh
new file mode 100644
index 00000000..4662546d
--- /dev/null
+++ b/coderbuild/phosphosites/build_phosphosites.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+# build_phosphosites.sh: build the global phosphosites reference CSV.
+#
+# Usage:
+#   build_phosphosites.sh <genes.csv> [<previous phosphosites.csv>]
+#
+# Sources (in priority order):
+#   1. Ochoa et al. (2020) Nat Biotechnol Supplementary Table S3 (~116k sites)
+#   2. UniProt PTM annotations (~12k additional sites unique to UniProt)
+#   3. Synapse supplement: syn70078415 (cNF raw phospho — captures experiment-
+#      specific sites not yet in either database)
+
+set -euo pipefail
+
+GENES="${1:?Usage: build_phosphosites.sh <genes.csv> [<previous phosphosites.csv>]}"
+PREV="${2:-}"
+
+PREV_ARG=""
+if [ -n "$PREV" ] && [ -f "$PREV" ]; then
+    PREV_ARG="--prev $PREV"
+fi
+
+python /coderbuild/phosphosites/00-buildPhosphositeFile.py \
+    "$GENES" \
+    --out /tmp/phosphosites.csv \
+    --synapse_supplements syn70078415 \
+    --synapse_site_col site \
+    $PREV_ARG
+
+echo "Phosphosites built from Ochoa + UniProt PTM + Synapse supplement."
diff --git a/schema/coderdata.yaml b/schema/coderdata.yaml
index e434b47a..258fd6d5 100755
--- a/schema/coderdata.yaml
+++ b/schema/coderdata.yaml
@@ -13,6 +13,10 @@ slots:
   entrez_id:
     description: Gene id according to NCBI
     range: integer
+  phosphosite_id:
+    description: Unique identifier for a phosphorylation site (gene + residue + position)
+    identifier: true
+    range: integer
   improve_sample_id:
     description: Unique sample identifier for this dataset, generated by this package
     identifier: true
@@ -49,6 +53,24 @@ classes:
       gene_symbol:
         required: true
        description: HGNC Gene symbol or synomym
+  Phosphosite:
+    description: Reference table of phosphorylation sites; each row is a unique gene + residue + position combination.
+    slots:
+      - phosphosite_id
+      - entrez_id
+      - other_id
+      - other_id_source
+    attributes:
+      gene_symbol:
+        required: true
+        description: HGNC gene symbol
+      residue:
+        description: Phosphorylated amino acid residue (S, T, or Y)
+      position:
+        range: integer
+        description: Position of the residue in the canonical protein sequence
+      modification:
+        description: Type of modification (e.g. phospho)
   Sample:
     description: Unique samples identified for each experiment, used to cross reference omics data to drug data.
     slots:
@@ -122,6 +144,16 @@ classes:
       proteomics:
         range: float
         description: Log-normalized log ratio of proteomics measurements of sample at baseline.
+  Phosphoproteomics:
+    slots:
+      - phosphosite_id
+      - improve_sample_id
+      - source
+      - study
+    attributes:
+      phosphoproteomics:
+        range: float
+        description: Log-normalized, batch-corrected phosphoproteomics abundance value (correctedAbundance).
   Copy Number:
     slots:
       - entrez_id
@@ -157,7 +189,7 @@ classes:
       - time_unit
     attributes:
       dose_response_metric:
-        range: CurveMetric
+        range: ResponseMetric
         description: Metric by which dose response value is measured
       dose_response_value:
         description: Value of metric
@@ -218,7 +250,7 @@ enums:
     auc:
       description: Area under the curve measured by integrating data points
     dss:
-      description: I believe this is the drug sensitivity score
+      description: Drug sensitivity score.
    mrecist:
       description: For PDX data this value should be either Progressive Disease, Stable Disease, Partial Response or Complete Response.
     published_auc:
@@ -233,6 +265,13 @@
     tgi:
       description: tumor growth inhibitiojn
     kulgap:
       description: kl divergence based metric
+    uM_viability:
+      description:
+        Single-dose viability fraction at 1 μM. Values near 1.0 indicate no
+        effect (full viability); values near 0.0 indicate full kill. Values
+        above 1.0 are possible when treated cells proliferate beyond the
+        untreated control. Used for organoid drug screens where dose-response
+        curves were not collected for that drug.
   CellPerturbation:
     permissible_values:
       gene_ko:
@@ -248,6 +287,7 @@
       xenograft derived organoid:
       patient derived organoid:
       ex vivo:
+      normal_tissue:
   Variant:
     permissible_values:
       3'UTR:
diff --git a/schema/expected_files.yaml b/schema/expected_files.yaml
index 3de46533..f0ccf49e 100644
--- a/schema/expected_files.yaml
+++ b/schema/expected_files.yaml
@@ -294,4 +294,29 @@ datasets:
     - target_class: Drug
       file: /tmp/novartis_drugs.tsv
     - target_class: Drug Descriptor
-      file: /tmp/novartis_drug_descriptors.tsv
\ No newline at end of file
+      file: /tmp/novartis_drug_descriptors.tsv
+
+
+  cnf:
+    - target_class: Sample
+      file: /tmp/cnf_samples.csv
+    - target_class: Transcriptomics
+      file: /tmp/cnf_transcriptomics.csv
+    - target_class: Proteomics
+      file: /tmp/cnf_proteomics.csv
+    - target_class: Phosphoproteomics
+      file: /tmp/cnf_phosphoproteomics.csv
+    # - target_class: Mutations
+    #   file: /tmp/cnf_mutations.csv
+    # - target_class: Copy Number
+    #   file: /tmp/cnf_copy_number.csv
+    - target_class: Experiments
+      file: /tmp/cnf_experiments.tsv
+    - target_class: Drug
+      file: /tmp/cnf_drugs.tsv
+    - target_class: Drug Descriptor
+      file: /tmp/cnf_drug_descriptors.tsv
+
+  phosphosites:
+    - target_class: Phosphosite
+      file: /tmp/phosphosites.csv
\ No newline at end of file
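To show how the new `Phosphosite` and `Phosphoproteomics` classes fit together, here is a hypothetical sketch of the downstream join a dataset pipeline would perform. The raw-frame column names and the `source`/`study` values are invented; only the output columns follow the schema above:

```python
import pandas as pd

# Phosphosite reference as built by this PR (subset of columns).
ref = pd.DataFrame({
    "phosphosite_id": [1, 2],
    "other_id": ["AAAS-S495s", "TP53-S15s"],
})

# Hypothetical raw cNF phospho measurements, one row per site/sample pair.
raw = pd.DataFrame({
    "site": ["AAAS-S495s", "TP53-S15s", "UNKNOWN-S1s"],
    "improve_sample_id": [101, 101, 102],
    "correctedAbundance": [0.42, -1.17, 2.00],
})

# Inner join on the site string: unmatched sites drop out, matched ones
# pick up the stable integer phosphosite_id required by the schema.
phospho = (
    raw.merge(ref, left_on="site", right_on="other_id", how="inner")
       .rename(columns={"correctedAbundance": "phosphoproteomics"})
       .assign(source="cNF", study="cNF")
       [["phosphosite_id", "improve_sample_id", "phosphoproteomics",
         "source", "study"]]
)
print(phospho)  # two rows; UNKNOWN-S1s was not in the reference
```

An inner join makes dropped (unmapped) sites explicit in row counts, matching the builder's philosophy of logging how many sites each source contributes.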