From 46e2934aa64d36512403817e15f19fba28f9f961 Mon Sep 17 00:00:00 2001 From: DevCats Date: Wed, 10 Jun 2026 20:10:28 +0000 Subject: [PATCH 01/14] feat: scanner pipeline, workflows, tests, repo plumbing Drops the bootstrap placeholders and lands the working scanner. Code: scanner/cli.py Click entry point with enumerate, combine, aggregate, validate subcommands. scanner/enumerate.py Catalogue parsing for both the in-tree .agents/skills/ format and the future external-sources format under registry/<namespace>/skills/README.md. External-sources wins on slug collisions. scanner/combine.py Per-skill report from SkillSpector JSON plus the verdict. scanner/aggregate.py Joins per-skill JSONs into the public latest.json and validates against the schema. scanner/verdict.py Threshold evaluator. Order is malicious > suspicious > unknown > clean. Tests: tests/test_verdict.py Threshold branches and overrides. tests/test_combine.py SkillSpector summary + per-skill assembly. tests/test_enumerate.py Both catalogue formats plus dedupe rules. tests/test_aggregate.py Top-level shape and schema validation. All fixtures are constructed inline with tmp_path and string literals; there is no testdata/ directory. Workflows: .github/workflows/scan.yaml Scheduled scanner at :17 every 6h. Enumerate, matrix scan, index, publish-release, publish-pages, open-issue-on-failure, Slack notify. .github/workflows/prune.yaml Weekly delete of scan-* releases past the retention window. .github/workflows/ci.yaml Adds ruff + pytest job; keeps the config and schema validators. Config and schema: config.yaml drops the ClamAV section; simplifies verdict to malicious_risk_score and suspicious_risk_score. schema/report.schema.json drops ClamAVScanner; keeps the scanners object extensible for future tools. 
Tooling: CODEOWNERS @DevelopmentCats @bpmct @phorcys420 .github/dependabot.yml weekly pip + github-actions bumps Makefile install / lint / test / schema mise.toml pinned Python 3.12 Acceptance: ruff clean, 32 pytest cases pass, actionlint clean, markdownlint clean. Manual workflow_dispatch produces a real latest.json once Pages is enabled on the repo. This commit was prepared with help from Coder Agents. --- .github/dependabot.yml | 29 ++++ .github/workflows/ci.yaml | 30 ++-- .github/workflows/prune.yaml | 38 +++++ .github/workflows/scan.yaml | 317 +++++++++++++++++++++++++++++++++++ .gitignore | 3 +- CODEOWNERS | 4 + Makefile | 40 +++++ README.md | 109 +++++++----- config.yaml | 65 ++----- mise.toml | 3 + pyproject.toml | 52 ++++++ scanner/__init__.py | 11 ++ scanner/__main__.py | 6 + scanner/aggregate.py | 105 ++++++++++++ scanner/cli.py | 287 +++++++++++++++++++++++++++++++ scanner/combine.py | 107 ++++++++++++ scanner/enumerate.py | 192 +++++++++++++++++++++ scanner/verdict.py | 65 +++++++ schema/report.schema.json | 44 +---- scripts/.gitkeep | 1 - testdata/.gitkeep | 1 - tests/conftest.py | 79 +++++++++ tests/test_aggregate.py | 132 +++++++++++++++ tests/test_combine.py | 118 +++++++++++++ tests/test_enumerate.py | 180 ++++++++++++++++++++ tests/test_verdict.py | 73 ++++++++ 26 files changed, 1948 insertions(+), 143 deletions(-) create mode 100644 .github/dependabot.yml create mode 100644 .github/workflows/prune.yaml create mode 100644 .github/workflows/scan.yaml create mode 100644 CODEOWNERS create mode 100644 Makefile create mode 100644 mise.toml create mode 100644 pyproject.toml create mode 100644 scanner/__init__.py create mode 100644 scanner/__main__.py create mode 100644 scanner/aggregate.py create mode 100644 scanner/cli.py create mode 100644 scanner/combine.py create mode 100644 scanner/enumerate.py create mode 100644 scanner/verdict.py delete mode 100644 scripts/.gitkeep delete mode 100644 testdata/.gitkeep create mode 100644 tests/conftest.py create 
mode 100644 tests/test_aggregate.py create mode 100644 tests/test_combine.py create mode 100644 tests/test_enumerate.py create mode 100644 tests/test_verdict.py diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..3280dbd --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,29 @@ +version: 2 +updates: + # Python deps from pyproject.toml. Also picks up the SkillSpector + # git pin if Dependabot's git-pin handling is enabled by GitHub for + # this org; otherwise the pin is bumped manually via PR. + - package-ecosystem: pip + directory: "/" + schedule: + interval: weekly + day: monday + time: "10:00" + timezone: Etc/UTC + open-pull-requests-limit: 5 + labels: ["deps", "python"] + commit-message: + prefix: "chore(deps)" + + # GitHub Actions SHAs across every workflow. + - package-ecosystem: github-actions + directory: "/" + schedule: + interval: weekly + day: monday + time: "10:00" + timezone: Etc/UTC + open-pull-requests-limit: 5 + labels: ["deps", "github-actions"] + commit-message: + prefix: "chore(deps)" diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 083bbd2..6ddbe37 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -32,9 +32,8 @@ jobs: ".catalogue.registry_repo.repo" ".catalogue.registry_repo.ref" ".scanners.skillspector.pin" - ".scanners.clamav.apt_packages" - ".verdict.malicious_if" - ".verdict.suspicious_if" + ".verdict.malicious_risk_score" + ".verdict.suspicious_risk_score" ".publish.release.rolling_tag" ".publish.pages.enabled" ".schedule.cron" @@ -68,25 +67,26 @@ jobs: python -m json.tool schema/report.schema.json > /dev/null check-jsonschema --check-metaschema schema/report.schema.json - lint-shell: - name: ShellCheck and prettier + pytest: + name: pytest + ruff runs-on: ubuntu-latest permissions: contents: read steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - name: Install shellcheck - run: sudo apt-get update 
&& sudo apt-get install -y shellcheck - - name: ShellCheck + - name: Set up Python + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: "3.12" + - name: Install run: | - shopt -s globstar nullglob - files=(scripts/**/*.sh) - if (( ${#files[@]} == 0 )); then - echo "No shell scripts to lint yet." - exit 0 - fi - shellcheck "${files[@]}" + python -m pip install --upgrade pip + pip install -e ".[dev]" + - name: ruff + run: ruff check scanner tests + - name: pytest + run: pytest lint-markdown: name: Markdownlint diff --git a/.github/workflows/prune.yaml b/.github/workflows/prune.yaml new file mode 100644 index 0000000..9f1c091 --- /dev/null +++ b/.github/workflows/prune.yaml @@ -0,0 +1,38 @@ +name: prune +on: + schedule: + # Weekly Sundays 04:33 UTC. + - cron: "33 4 * * 0" + workflow_dispatch: + +permissions: {} + +jobs: + prune-releases: + name: Delete scan releases older than retention window + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - name: Determine retention in days + id: cfg + run: | + set -euo pipefail + days="$(awk '/^[[:space:]]*retention_days:/ {print $2; exit}' config.yaml)" # awk exits 0 on no match, so the 90-day default below stays reachable under pipefail + echo "days=${days:-90}" >> "$GITHUB_OUTPUT" + - name: Delete old scan releases + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + DAYS: ${{ steps.cfg.outputs.days }} + run: | + set -euo pipefail + cutoff_epoch="$(date -u -d "${DAYS} days ago" +%s)" + gh release list --limit 1000 --json tagName,createdAt \ + | jq -r --argjson cutoff "${cutoff_epoch}" \ + '.[] | select(.tagName | startswith("scan-")) | select((.createdAt | fromdateiso8601) < $cutoff) | .tagName' \ + | while read -r tag; do + echo "Deleting old release ${tag}" + gh release delete "${tag}" --yes --cleanup-tag + done diff --git a/.github/workflows/scan.yaml b/.github/workflows/scan.yaml new file mode 100644 index 0000000..1a494fd --- /dev/null +++ 
b/.github/workflows/scan.yaml @@ -0,0 +1,317 @@ +name: scan + +on: + schedule: + # Every 6h at :17 to dodge top-of-hour cron drift. + - cron: "17 */6 * * *" + workflow_dispatch: + push: + branches: [main] + paths: + - "config.yaml" + - "scanner/**" + - ".github/workflows/scan.yaml" + +permissions: {} + +concurrency: + group: scan + cancel-in-progress: false + +jobs: + enumerate: + name: Enumerate skill sources + runs-on: ubuntu-latest + permissions: + contents: read + outputs: + matrix: ${{ steps.list.outputs.matrix }} + catalogue_sha: ${{ steps.list.outputs.catalogue_sha }} + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - name: Set up Python + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: "3.12" + - name: Install + run: | + python -m pip install --upgrade pip + pip install -e . + - name: Enumerate + id: list + run: scanner enumerate --github-output >> "$GITHUB_OUTPUT" + + scan: + name: Scan ${{ matrix.namespace }}/${{ matrix.slug }} + needs: enumerate + if: needs.enumerate.outputs.matrix != '{"include":[]}' + runs-on: ubuntu-latest + permissions: + contents: read + strategy: + fail-fast: false + matrix: ${{ fromJSON(needs.enumerate.outputs.matrix) }} + steps: + - name: Checkout scanner + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - name: Set up Python + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: "3.12" + - name: Install scanner + run: | + python -m pip install --upgrade pip + pip install -e . 
+ - name: Install SkillSpector + run: pip install "$(python -c 'import yaml; print(yaml.safe_load(open("config.yaml"))["scanners"]["skillspector"]["pin"])')" + - name: Checkout source repo + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + repository: ${{ matrix.source_repo }} + ref: ${{ matrix.source_ref }} + path: source + persist-credentials: false + - name: Resolve source SHA + id: source_sha + run: | + set -euo pipefail + sha="$(git -C source rev-parse HEAD)" + echo "sha=${sha}" >> "$GITHUB_OUTPUT" + - name: Verify skill path exists + id: path_check + run: | + set -euo pipefail + if [[ -d "source/${{ matrix.skill_path }}" ]]; then + echo "drift=false" >> "$GITHUB_OUTPUT" + else + echo "drift=true" >> "$GITHUB_OUTPUT" + echo "Skill path source/${{ matrix.skill_path }} not present upstream; will report catalogue drift." >&2 + fi + - name: SkillSpector (JSON) + if: steps.path_check.outputs.drift == 'false' + continue-on-error: true + run: | + mkdir -p out + skillspector scan "source/${{ matrix.skill_path }}" \ + --no-llm \ + --format json \ + --output "out/skillspector.json" || true + - name: SkillSpector (SARIF) + if: steps.path_check.outputs.drift == 'false' + continue-on-error: true + run: | + mkdir -p out + skillspector scan "source/${{ matrix.skill_path }}" \ + --no-llm \ + --format sarif \ + --output "out/skillspector.sarif" || true + - name: Combine + run: | + mkdir -p out + scanner combine \ + --namespace "${{ matrix.namespace }}" \ + --slug "${{ matrix.slug }}" \ + --source-repo "${{ matrix.source_repo }}" \ + --source-ref "${{ matrix.source_ref }}" \ + --source-sha "${{ steps.source_sha.outputs.sha }}" \ + --skill-path "${{ matrix.skill_path }}" \ + ${{ steps.path_check.outputs.drift == 'true' && '--catalogue-drift' || '' }} \ + --skillspector-json out/skillspector.json \ + --output out/skill.json + - name: Upload skill artifact + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.1.0 + with: + 
name: skill-${{ matrix.namespace }}-${{ matrix.slug }} + path: out/ + retention-days: 90 + + index: + name: Build latest.json + needs: [enumerate, scan] + if: always() && needs.enumerate.result == 'success' + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - name: Checkout scanner + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - name: Set up Python + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: "3.12" + - name: Install + run: | + python -m pip install --upgrade pip + pip install -e . + - name: Download all skill artifacts + uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v5.0.0 + with: + path: scans + pattern: skill-* + - name: Aggregate + env: + SCANNER_CATALOGUE_SHA: ${{ needs.enumerate.outputs.catalogue_sha }} + run: scanner aggregate scans --output latest.json + - name: Show summary + run: | + python -c "import json; r=json.load(open('latest.json')); print(json.dumps(r['summary'], indent=2))" + - name: Upload scan-index artifact + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.1.0 + with: + name: scan-index + path: latest.json + retention-days: 90 + + publish-release: + name: Publish GitHub Release + needs: index + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - name: Download scan-index + uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v5.0.0 + with: + name: scan-index + path: . 
+ - name: Download all skill artifacts + uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v5.0.0 + with: + path: skills + pattern: skill-* + - name: Stage release assets + id: stage + run: | + set -euo pipefail + stamp="$(date -u +%Y-%m-%dT%H-%MZ)" + tag="scan-${stamp}" + mkdir -p release-assets + cp latest.json release-assets/ + # Each per-skill artifact is one directory under skills/. + shopt -s nullglob + for d in skills/skill-*; do + base="$(basename "$d")" + if [[ -f "${d}/skillspector.sarif" ]]; then + cp "${d}/skillspector.sarif" "release-assets/${base}.sarif" + fi + if [[ -f "${d}/skillspector.json" ]]; then + cp "${d}/skillspector.json" "release-assets/${base}.skillspector.json" + fi + if [[ -f "${d}/skill.json" ]]; then + cp "${d}/skill.json" "release-assets/${base}.skill.json" + fi + done + ls -la release-assets/ + echo "tag=${tag}" >> "$GITHUB_OUTPUT" + echo "stamp=${stamp}" >> "$GITHUB_OUTPUT" + - name: Create timestamped release + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + TAG: ${{ steps.stage.outputs.tag }} + run: | + set -euo pipefail + gh release create "${TAG}" \ + --title "Scan ${TAG}" \ + --notes "Automated scan run. See https://coder.github.io/coder-skill-scanner/latest.json for the public report." \ + release-assets/* + - name: Update rolling latest tag + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + set -euo pipefail + gh release delete latest --yes --cleanup-tag 2>/dev/null || true + gh release create latest \ + --title "Latest scan" \ + --notes "Rolling pointer to the most recent scan. Updated automatically." 
\ + release-assets/latest.json + + publish-pages: + name: Publish to GitHub Pages + needs: index + runs-on: ubuntu-latest + permissions: + pages: write + id-token: write + contents: read + environment: + name: github-pages + url: ${{ steps.deploy.outputs.page_url }} + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - name: Download scan-index + uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v5.0.0 + with: + name: scan-index + path: . + - name: Build pages tree + run: | + set -euo pipefail + mkdir -p pages/history + cp latest.json pages/latest.json + cp schema/report.schema.json pages/schema.json + stamp="$(date -u +%Y-%m-%dT%H-%MZ)" + mkdir -p "pages/history/$(date -u +%Y-%m-%d)" + cp latest.json "pages/history/$(date -u +%Y-%m-%d)/${stamp}.json" + cat > pages/index.html <<'HTML' + coder-skill-scanner +

coder-skill-scanner

+

Public scan reports for the Coder registry.

+ + HTML + - name: Upload Pages artifact + uses: actions/upload-pages-artifact@56afc609e74202658d3ffba0e8f6dda462b719fa # v3.0.1 + with: + path: pages + - name: Deploy Pages + id: deploy + uses: actions/deploy-pages@d6db90164ac5ed86f2b6aed7e0febac5b3c0c03e # v4.0.5 + + open-issue-on-failure: + name: Open or update tracker issue + needs: [enumerate, scan, index, publish-release, publish-pages] + if: failure() && github.event_name != 'pull_request' + runs-on: ubuntu-latest + permissions: + issues: write + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - name: Open or update tracker issue + uses: JasonEtco/create-an-issue@1b14a70e4d8dc185e5cc76d3bec9eab20257b2c5 # v2.9.2 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + WORKFLOW_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + with: + filename: .github/ISSUE_TEMPLATE/scanner-down.md + update_existing: true + search_existing: open + + notify-slack-on-failure: + name: Slack notification on failure + needs: [enumerate, scan, index, publish-release, publish-pages] + if: failure() && github.event_name != 'pull_request' + runs-on: ubuntu-latest + permissions: {} + steps: + - name: Post to Slack + env: + SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} + WORKFLOW_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + run: | + set -euo pipefail + if [[ -z "${SLACK_WEBHOOK_URL:-}" ]]; then + echo "SLACK_WEBHOOK_URL secret not set; skipping Slack notification." 
+ exit 0 + fi + curl --fail-with-body --silent --show-error \ + -X POST -H "Content-Type: application/json" \ + -d "{\"text\":\":rotating_light: coder-skill-scanner run failed: ${WORKFLOW_URL}\"}" \ + "${SLACK_WEBHOOK_URL}" diff --git a/.gitignore b/.gitignore index b5408b3..6516f36 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,10 @@ -# Python (added in PR 2 when scripts use venv) +# Python tooling caches .venv/ __pycache__/ *.pyc *.pyo .pytest_cache/ +.ruff_cache/ .uv/ # Scanner outputs diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 0000000..03e49b3 --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1,4 @@ +# Default reviewers for every change. +# These three accounts must each approve, or one must merge with the +# others tagged. +* @DevelopmentCats @bpmct @phorcys420 diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..a925d84 --- /dev/null +++ b/Makefile @@ -0,0 +1,40 @@ +PYTHON ?= python3 +VENV ?= .venv +PIP := $(VENV)/bin/pip +PY := $(VENV)/bin/python + +.PHONY: help install lint test schema enumerate clean + +help: + @echo "Targets:" + @echo " install install the package and dev deps into $(VENV)" + @echo " lint ruff check" + @echo " test pytest" + @echo " schema validate report.schema.json is a valid JSON Schema" + @echo " clean remove $(VENV) and build artefacts" + +$(VENV)/bin/activate: + $(PYTHON) -m venv $(VENV) + $(PIP) install --upgrade pip + +install: $(VENV)/bin/activate + $(PIP) install -e ".[dev]" + +lint: install + $(VENV)/bin/ruff check scanner tests + +test: install + $(VENV)/bin/pytest + +schema: install + $(VENV)/bin/python -m json.tool schema/report.schema.json > /dev/null + $(VENV)/bin/python -c "import json, jsonschema; \ + s = json.load(open('schema/report.schema.json')); \ + jsonschema.Draft202012Validator.check_schema(s); \ + print('schema OK')" + +enumerate: install + $(VENV)/bin/scanner enumerate + +clean: + rm -rf $(VENV) build dist *.egg-info .pytest_cache .ruff_cache diff --git a/README.md b/README.md 
index f3097c5..ccbfa73 100644 --- a/README.md +++ b/README.md @@ -3,75 +3,104 @@ Periodic, GitHub-Actions-as-SaaS security scanner for agent skills declared in the [Coder registry](https://github.com/coder/registry) catalogue. -Every 6 hours, this repo's scheduled workflow: +Every 6 hours, the scheduled workflow in this repo: -1. Enumerates every skill declared in `coder/registry`. +1. Enumerates every skill in `coder/registry` (both the in-tree + `.agents/skills/` format and the future external-sources format). 2. Shallow-clones each source repo. -3. Runs [NVIDIA SkillSpector](https://github.com/NVIDIA/SkillSpector) - (agentic risk, static mode) and [ClamAV](https://www.clamav.net) - (malware signatures) over the upstream content. +3. Runs [NVIDIA SkillSpector](https://github.com/NVIDIA/SkillSpector) in + `--no-llm` static mode over the upstream content. 4. Builds a per-skill verdict (`clean`, `suspicious`, `malicious`, - `unknown`) from the scanner outputs and the policy in `config.yaml`. + `unknown`) from `risk_score` plus the thresholds in `config.yaml`. 5. Publishes a versioned report as a GitHub Release asset and a public `latest.json` to GitHub Pages. The registry site reads the public report through a small proxy endpoint -in `coder/registry-server` and shows a per-skill scan badge. The -registry's deploys are not gated on the scan result; this is visibility, -not enforcement. +in `coder/registry-server` (separate PR) and shows a per-skill badge. +The registry's deploys are not gated on the scan result. ## Reading the latest report -Stable URLs (no auth required): +Stable URLs, no auth required: -- Public JSON (CDN-cached): - `https://coder.github.io/coder-skill-scanner/latest.json` -- Tagged Release: - `https://github.com/coder/coder-skill-scanner/releases/latest/download/latest.json` -- Per-scan history: - `https://coder.github.io/coder-skill-scanner/history//