diff --git a/.github/workflows/firecracker-e2e.yml b/.github/workflows/firecracker-e2e.yml new file mode 100644 index 0000000..b1afb4b --- /dev/null +++ b/.github/workflows/firecracker-e2e.yml @@ -0,0 +1,201 @@ +name: Firecracker snapshot e2e + +# Real-microVM validation of the Firecracker snapshot driver (tools/firecracker): +# boot -> NIC over TAP -> warmup -> snapshot -> restore -> resume -> consume, and +# assert the snapshot-consumed impacted set is byte-identical to the local-driver +# (cold) set (RFC §5.3 correctness canary). +# +# Manual (`workflow_dispatch`) because it needs /dev/kvm and boots a VM. GitHub's +# hosted Linux runners expose /dev/kvm; if a runner ever lacks it the job fails +# fast with a clear message. Use the `runner` input to target a beefier/self- +# hosted host. This is the path the Raspberry Pi dev host cannot finish: its +# 16 KB-page kernel freezes guest userspace after restore (see +# tools/firecracker/README.md). A mainline-kernel x86_64 runner does not. +on: + workflow_dispatch: + inputs: + runner: + description: 'Runner label (needs Linux + /dev/kvm)' + default: 'ubuntu-latest' + type: string + packages: + description: 'Synthetic workspace size (packages)' + default: '300' + type: string + mem_mib: + description: 'Guest memory (MiB)' + default: '4096' + type: string + +permissions: + contents: read + +jobs: + firecracker-e2e: + runs-on: ${{ inputs.runner }} + env: + ARCH: x86_64 + KERNEL_VER: '6.1.128' + # Big artifacts (multi-GB rootfs, copied per record/consume) live on /mnt, + # which has far more free space than / on GitHub-hosted runners. 
+ IMG: /mnt/fc/image + WS: /mnt/fc/ws + STORE: /mnt/fc/store + FC_OUT: /mnt/fc/impacted.fc.txt + SIZE_MB: '4096' + TAP: fc-tap0 + GUEST_ADDR: root@172.16.0.2 + FC_VER: v1.16.0 + steps: + - uses: actions/checkout@v4 + + - name: Setup Java JDK + uses: actions/setup-java@v4 + with: + distribution: 'temurin' + java-version: '21' + + - name: Setup Go environment + uses: actions/setup-go@v5 + with: + go-version: '^1.21' + + - name: Setup Bazelisk + run: go install github.com/bazelbuild/bazelisk@latest + + - name: Install host tooling (squashfs-tools, e2fsprogs, iproute2) + run: | + sudo apt-get update + sudo apt-get install -y --no-install-recommends squashfs-tools e2fsprogs iproute2 + + - name: Prepare /mnt scratch (more free space than /) + run: | + sudo mkdir -p /mnt/fc && sudo chown "$USER" /mnt/fc + df -h / /mnt + + - name: Verify /dev/kvm + run: | + if [ ! -e /dev/kvm ]; then + echo "::error::/dev/kvm not present on this runner; pick a runner with nested virtualization." >&2 + exit 1 + fi + sudo chmod 666 /dev/kvm + ls -l /dev/kvm + + - name: Build bazel-diff (deploy jar + launcher) + run: | + ~/go/bin/bazelisk build //cli:bazel-diff_deploy.jar + ~/go/bin/bazelisk run //:bazel-diff --script_path=/tmp/bazel_diff + ls -l bazel-bin/cli/bazel-diff_deploy.jar /tmp/bazel_diff + + - name: Build + unit-test the orchestrator + working-directory: tools/firecracker + run: | + go vet ./... && go vet -tags fcintegration ./... + go test ./... + go build -o /tmp/bazel-diff-snap . + + - name: Resolve a real bazel binary to bake into the guest + id: bazel + run: | + ~/go/bin/bazelisk version >/dev/null + # bazelisk caches the real bazel under ~/.cache/bazelisk/downloads/... 
+ BAZEL_REAL=$(find "$HOME/.cache/bazelisk/downloads" -type f \( -name bazel -o -name 'bazel-*' \) -perm -u+x 2>/dev/null | grep -vE '\.(sha256|json)$' | head -1) + test -n "$BAZEL_REAL" || { echo "::error::could not locate the bazelisk-downloaded bazel binary" >&2; exit 1; } + echo "bin=$BAZEL_REAL" >> "$GITHUB_OUTPUT" + # Also make it the host bazel for the local-driver leg. + sudo install -m755 "$BAZEL_REAL" /usr/local/bin/bazel + echo "baking guest bazel: $BAZEL_REAL" + + - name: Generate synthetic workspace (two revisions) + id: ws + run: | + python3 tools/firecracker/bench/gen_project.py \ + --out "$WS" --packages "${{ inputs.packages }}" --targets-per-package 4 --git > /tmp/gen.json + cat /tmp/gen.json + echo "base=$(python3 -c 'import json;print(json.load(open("/tmp/gen.json"))["base_sha"])')" >> "$GITHUB_OUTPUT" + echo "target=$(python3 -c 'import json;print(json.load(open("/tmp/gen.json"))["target_sha"])')" >> "$GITHUB_OUTPUT" + + - name: Generate guest ssh keypair + run: ssh-keygen -t ed25519 -N '' -f /tmp/fc_guest -C fc-guest + + - name: Download Firecracker + run: | + curl -fsSL -o /tmp/fc.tgz \ + "https://github.com/firecracker-microvm/firecracker/releases/download/${FC_VER}/firecracker-${FC_VER}-${ARCH}.tgz" + tar -C /tmp -xzf /tmp/fc.tgz + install -m755 "/tmp/release-${FC_VER}-${ARCH}/firecracker-${FC_VER}-${ARCH}" /tmp/firecracker + /tmp/firecracker --version | head -1 + + - name: Build guest image (kernel + rootfs.base.ext4) + run: | + sudo -E env \ + ARCH="$ARCH" KERNEL_VER="$KERNEL_VER" OUT="$IMG" SIZE_MB="$SIZE_MB" \ + BAZEL_DIFF_JAR="$PWD/bazel-bin/cli/bazel-diff_deploy.jar" \ + BAZEL_BIN="${{ steps.bazel.outputs.bin }}" \ + WORKSPACE_SRC="$WS" SSH_PUBKEY=/tmp/fc_guest.pub \ + tools/firecracker/bench/build_guest_image.sh + sudo chown "$USER" "$IMG/rootfs.base.ext4" + ls -l "$IMG" + + - name: Set up host TAP + run: sudo tools/firecracker/bench/setup_tap.sh "$TAP" 172.16.0.1 30 + + - name: Local-driver consume (cold baseline) + run: | + 
/tmp/bazel-diff-snap record --driver local --store /tmp/localstore \ + --workspace "$WS" --base-sha "${{ steps.ws.outputs.base }}" \ + --bazel /usr/local/bin/bazel --bazel-diff /tmp/bazel_diff + /tmp/bazel-diff-snap consume --driver local --store /tmp/localstore \ + --workspace "$WS" --target-sha "${{ steps.ws.outputs.target }}" \ + --bazel /usr/local/bin/bazel --bazel-diff /tmp/bazel_diff \ + --out /tmp/impacted.local.txt + sort -o /tmp/impacted.local.txt /tmp/impacted.local.txt + echo "local impacted: $(wc -l < /tmp/impacted.local.txt) targets" + + - name: Firecracker-driver record/consume (real microVM) + working-directory: tools/firecracker + env: + FC_BIN: /tmp/firecracker + FC_KERNEL: ${{ env.IMG }}/vmlinux-${{ env.KERNEL_VER }} + FC_STORE: ${{ env.STORE }} + FC_GUEST_ADDR: ${{ env.GUEST_ADDR }} + FC_GUEST_KEY: /tmp/fc_guest + FC_TAP: ${{ env.TAP }} + FC_WORKSPACE: /work + FC_BASE_SHA: ${{ steps.ws.outputs.base }} + FC_TARGET_SHA: ${{ steps.ws.outputs.target }} + FC_BAZEL: /usr/local/bin/bazel + FC_BAZEL_DIFF: bazel-diff + FC_OUT: ${{ env.FC_OUT }} + FC_MEM_MIB: ${{ inputs.mem_mib }} + FC_VCPUS: '2' + run: | + mkdir -p "$FC_STORE" + go test -tags fcintegration -run TestFirecrackerRecordConsume -v -timeout 25m ./... + sort -o "$FC_OUT" "$FC_OUT" + echo "firecracker impacted: $(wc -l < "$FC_OUT") targets" + + - name: Assert snapshot-consumed == cold (RFC §5.3 correctness canary) + run: | + test -s "$FC_OUT" || { echo "::error::firecracker impacted set is empty" >&2; exit 1; } + if ! 
diff -u /tmp/impacted.local.txt "$FC_OUT"; then + echo "::error::snapshot-consumed impacted set differs from the cold/local set" >&2 + exit 1 + fi + echo "PASS: firecracker-consumed impacted set is byte-identical to the cold/local set ($(wc -l < "$FC_OUT") targets)" + + - name: Upload impacted sets + uses: actions/upload-artifact@v4 + if: always() + with: + name: firecracker-e2e-impacted + path: | + /tmp/impacted.local.txt + /mnt/fc/impacted.fc.txt + /tmp/gen.json + if-no-files-found: warn + + - name: Tear down TAP + if: always() + run: sudo ip link del "$TAP" 2>/dev/null || true diff --git a/.gitignore b/.gitignore index 9f0d593..082d29b 100644 --- a/.gitignore +++ b/.gitignore @@ -35,3 +35,9 @@ coverage-html/ # so this is a precise ignore list, not a blanket .claude/ ignore. .claude/settings.local.json .claude/worktrees/ + +# Firecracker benchmark harness outputs +.bench-results/ + +# Go orchestrator build output (go build . in tools/firecracker) +/tools/firecracker/firecracker diff --git a/cli/BUILD b/cli/BUILD index c1a0cf7..0c58aba 100644 --- a/cli/BUILD +++ b/cli/BUILD @@ -102,6 +102,30 @@ kt_jvm_test( runtime_deps = [":cli-test-lib"], ) +kt_jvm_test( + name = "FingerprintInteractorTest", + test_class = "com.bazel_diff.interactor.FingerprintInteractorTest", + runtime_deps = [":cli-test-lib"], +) + +kt_jvm_test( + name = "FingerprintGathererTest", + test_class = "com.bazel_diff.cli.FingerprintGathererTest", + runtime_deps = [":cli-test-lib"], +) + +kt_jvm_test( + name = "FingerprintCommandTest", + test_class = "com.bazel_diff.cli.FingerprintCommandTest", + runtime_deps = [":cli-test-lib"], +) + +kt_jvm_test( + name = "WarmupCommandTest", + test_class = "com.bazel_diff.cli.WarmupCommandTest", + runtime_deps = [":cli-test-lib"], +) + kt_jvm_test( name = "BazelRuleTest", test_class = "com.bazel_diff.bazel.BazelRuleTest", diff --git a/cli/src/main/kotlin/com/bazel_diff/cli/BazelDiff.kt b/cli/src/main/kotlin/com/bazel_diff/cli/BazelDiff.kt index 8f7e992..bd1afdb 
100644 --- a/cli/src/main/kotlin/com/bazel_diff/cli/BazelDiff.kt +++ b/cli/src/main/kotlin/com/bazel_diff/cli/BazelDiff.kt @@ -7,7 +7,12 @@ import picocli.CommandLine.Spec @CommandLine.Command( name = "bazel-diff", description = ["Writes to a file the impacted targets between two Bazel graph JSON files"], - subcommands = [GenerateHashesCommand::class, GetImpactedTargetsCommand::class], + subcommands = + [ + GenerateHashesCommand::class, + GetImpactedTargetsCommand::class, + WarmupCommand::class, + FingerprintCommand::class], mixinStandardHelpOptions = true, versionProvider = VersionProvider::class, ) diff --git a/cli/src/main/kotlin/com/bazel_diff/cli/FingerprintCommand.kt b/cli/src/main/kotlin/com/bazel_diff/cli/FingerprintCommand.kt new file mode 100644 index 0000000..012762a --- /dev/null +++ b/cli/src/main/kotlin/com/bazel_diff/cli/FingerprintCommand.kt @@ -0,0 +1,154 @@ +package com.bazel_diff.cli + +import com.bazel_diff.cli.converter.CommaSeparatedValueConverter +import com.bazel_diff.cli.converter.NormalisingPathConverter +import com.bazel_diff.cli.converter.OptionsConverter +import com.bazel_diff.interactor.FingerprintInteractor +import com.google.gson.GsonBuilder +import java.io.File +import java.nio.file.Path +import java.util.concurrent.Callable +import picocli.CommandLine + +/** + * Computes the Firecracker snapshot cache key ("fingerprint") for the current workspace + flag set + * and writes it as JSON. Used at record time to tag a snapshot and at consume time to validate a + * candidate snapshot before trusting it. See `docs/firecracker-snapshots.md` §5. + * + * Pure metadata: this command does not run `bazel query` and is cheap. The only external call is + * `bazel version`. 
+ */ +@CommandLine.Command( + name = "fingerprint", + mixinStandardHelpOptions = true, + description = + [ + "Computes the snapshot cache key over the inputs that affect the build graph " + + "(bazel version, MODULE.bazel.lock, .bazelrc, bazel-diff version, flag set) and " + + "writes it as JSON. Used to decide whether a Firecracker snapshot is safe to consume."], + versionProvider = VersionProvider::class) +class FingerprintCommand : Callable { + @CommandLine.ParentCommand private lateinit var parent: BazelDiff + + @CommandLine.Option( + names = ["-w", "--workspacePath"], + description = ["Path to Bazel workspace directory."], + scope = CommandLine.ScopeType.INHERIT, + required = true, + converter = [NormalisingPathConverter::class]) + lateinit var workspacePath: Path + + @CommandLine.Option( + names = ["-b", "--bazelPath"], + description = + [ + "Path to Bazel binary. If not specified, the Bazel binary available in PATH will be used."], + scope = CommandLine.ScopeType.INHERIT, + defaultValue = "bazel", + ) + lateinit var bazelPath: Path + + // --- query-affecting flags (must match the corresponding generate-hashes flags) --- + + @CommandLine.Option( + names = ["-so", "--bazelStartupOptions"], + converter = [OptionsConverter::class], + description = + ["Bazel client startup options (must match the consuming generate-hashes run)."]) + var bazelStartupOptions: List = emptyList() + + @CommandLine.Option( + names = ["-co", "--bazelCommandOptions"], + converter = [OptionsConverter::class], + description = ["Bazel command options for `bazel query`."]) + var bazelCommandOptions: List = emptyList() + + @CommandLine.Option( + names = ["--cqueryCommandOptions"], + converter = [OptionsConverter::class], + description = ["Bazel command options for `bazel cquery`."]) + var cqueryCommandOptions: List = emptyList() + + @CommandLine.Option(names = ["--useCquery"], negatable = true) var useCquery = false + + @CommandLine.Option(names = ["--cqueryExpression"]) var cqueryExpression: 
String? = null + + @CommandLine.Option(names = ["--includeTargetType"], negatable = true) + var includeTargetType = false + + @CommandLine.Option(names = ["-tt", "--targetType"], split = ",") + var targetType: Set? = null + + @CommandLine.Option( + names = ["--fineGrainedHashExternalRepos"], converter = [CommaSeparatedValueConverter::class]) + var fineGrainedHashExternalRepos: Set = emptySet() + + @CommandLine.Option( + names = ["--ignoredRuleHashingAttributes"], converter = [CommaSeparatedValueConverter::class]) + var ignoredRuleHashingAttributes: Set = emptySet() + + @CommandLine.Option(names = ["--excludeExternalTargets"], negatable = true) + var excludeExternalTargets = false + + @CommandLine.Option( + names = ["-k", "--keep_going"], + negatable = true, + defaultValue = "true", + fallbackValue = "true") + var keepGoing = true + + @CommandLine.Option( + names = ["-o", "--output"], + description = ["Path to write the fingerprint JSON. Defaults to STDOUT."], + defaultValue = CommandLine.Parameters.NULL_VALUE) + var outputPath: File? 
= null + + override fun call(): Int { + val flags = + FingerprintGatherer.canonicalizeFlags( + bazelStartupOptions = bazelStartupOptions, + bazelCommandOptions = bazelCommandOptions, + cqueryCommandOptions = cqueryCommandOptions, + useCquery = useCquery, + cqueryExpression = cqueryExpression, + includeTargetType = includeTargetType, + targetType = targetType, + fineGrainedHashExternalRepos = fineGrainedHashExternalRepos, + ignoredRuleHashingAttributes = ignoredRuleHashingAttributes, + excludeExternalTargets = excludeExternalTargets, + keepGoing = keepGoing, + ) + val inputs = + FingerprintGatherer.gather( + workspacePath = workspacePath, + bazelPath = bazelPath, + bazelDiffVersion = VersionProvider().version.firstOrNull() ?: "unknown", + flags = flags, + ) + + val result = FingerprintInteractor().compute(inputs) + val json = renderFingerprintJson(result, inputs.flags) + + val out = outputPath + if (out != null && out.path != "-") { + out.writeText(json + "\n") + } else { + println(json) + } + return CommandLine.ExitCode.OK + } +} + +/** Renders the fingerprint result + flag set as pretty JSON. Shared with [WarmupCommand]. 
*/ +internal fun renderFingerprintJson( + result: com.bazel_diff.interactor.FingerprintResult, + flags: Map, +): String { + val gson = GsonBuilder().disableHtmlEscaping().setPrettyPrinting().create() + return gson.toJson( + linkedMapOf( + "fingerprint" to result.fingerprint, + "components" to result.components, + "flags" to flags.toSortedMap(), + )) +} diff --git a/cli/src/main/kotlin/com/bazel_diff/cli/FingerprintGatherer.kt b/cli/src/main/kotlin/com/bazel_diff/cli/FingerprintGatherer.kt new file mode 100644 index 0000000..fdfdf0a --- /dev/null +++ b/cli/src/main/kotlin/com/bazel_diff/cli/FingerprintGatherer.kt @@ -0,0 +1,113 @@ +package com.bazel_diff.cli + +import com.bazel_diff.interactor.FingerprintInputs +import java.nio.file.Files +import java.nio.file.Path +import java.util.concurrent.TimeUnit + +/** + * Gathers the environment-specific [FingerprintInputs] (reads `MODULE.bazel.lock`, `.bazelrc` + + * imports, runs `bazel version`, reads bazel-diff's version) and canonicalizes the query-affecting + * flag set. + * + * Shared by [FingerprintCommand] and [WarmupCommand] so the cache key is computed identically on + * the record and consume sides. The pure hashing lives in + * [com.bazel_diff.interactor.FingerprintInteractor]; this object is the IO around it. + */ +object FingerprintGatherer { + + /** Canonicalize the flag set into stable key -> value strings. Lists are joined, sets sorted. 
*/ + fun canonicalizeFlags( + bazelStartupOptions: List, + bazelCommandOptions: List, + cqueryCommandOptions: List, + useCquery: Boolean, + cqueryExpression: String?, + includeTargetType: Boolean, + targetType: Set?, + fineGrainedHashExternalRepos: Set, + ignoredRuleHashingAttributes: Set, + excludeExternalTargets: Boolean, + keepGoing: Boolean, + ): Map { + fun list(xs: List) = xs.joinToString(" ") + fun set(xs: Set) = xs.toSortedSet().joinToString(",") + return linkedMapOf( + "bazelStartupOptions" to list(bazelStartupOptions), + "bazelCommandOptions" to list(bazelCommandOptions), + "cqueryCommandOptions" to list(cqueryCommandOptions), + "useCquery" to useCquery.toString(), + "cqueryExpression" to (cqueryExpression ?: ""), + "includeTargetType" to includeTargetType.toString(), + "targetType" to (targetType?.let { set(it) } ?: ""), + "fineGrainedHashExternalRepos" to set(fineGrainedHashExternalRepos), + "ignoredRuleHashingAttributes" to set(ignoredRuleHashingAttributes), + "excludeExternalTargets" to excludeExternalTargets.toString(), + "keepGoing" to keepGoing.toString(), + ) + } + + fun gather( + workspacePath: Path, + bazelPath: Path, + bazelDiffVersion: String, + flags: Map, + ): FingerprintInputs = + FingerprintInputs( + bazelDiffVersion = bazelDiffVersion, + bazelVersion = readBazelVersion(workspacePath, bazelPath), + moduleLockContent = readOptional(workspacePath.resolve("MODULE.bazel.lock")), + bazelrcContents = readBazelrcs(workspacePath), + flags = flags, + ) + + private fun readOptional(path: Path): ByteArray? = + if (Files.isRegularFile(path)) Files.readAllBytes(path) else null + + /** Read `.bazelrc` plus any files it `import`s / `try-import`s (best-effort, one level deep). 
*/ + private fun readBazelrcs(workspacePath: Path): Map { + val result = LinkedHashMap() + val root = workspacePath.resolve(".bazelrc") + readOptional(root)?.let { bytes -> + result[".bazelrc"] = bytes + String(bytes).lineSequence().forEach { line -> + val trimmed = line.trim() + if (trimmed.startsWith("import ") || trimmed.startsWith("try-import ")) { + val raw = trimmed.substringAfter(' ').trim() + val resolved = raw.replace("%workspace%", workspacePath.toString()) + val p = Path.of(resolved) + val abs = if (p.isAbsolute) p else workspacePath.resolve(resolved) + readOptional(abs)?.let { imported -> + result[workspacePath.relativize(abs).toString()] = imported + } + } + } + } + return result + } + + private fun readBazelVersion(workspacePath: Path, bazelPath: Path): String { + return try { + val proc = + ProcessBuilder(bazelPath.toString(), "version") + .directory(workspacePath.toFile()) + .redirectErrorStream(false) + .start() + val output = proc.inputStream.bufferedReader().readText() + proc.waitFor(60, TimeUnit.SECONDS) + output + .lineSequence() + .firstOrNull { it.startsWith("Build label: ") } + ?.removePrefix("Build label: ") + ?.trim() + ?: output + .lineSequence() + .firstOrNull { it.startsWith("bazel ") } + ?.removePrefix("bazel ") + ?.trim() + ?: "unknown" + } catch (e: Exception) { + "unknown" + } + } +} diff --git a/cli/src/main/kotlin/com/bazel_diff/cli/GenerateHashesCommand.kt b/cli/src/main/kotlin/com/bazel_diff/cli/GenerateHashesCommand.kt index 3e18544..a342f55 100644 --- a/cli/src/main/kotlin/com/bazel_diff/cli/GenerateHashesCommand.kt +++ b/cli/src/main/kotlin/com/bazel_diff/cli/GenerateHashesCommand.kt @@ -20,7 +20,7 @@ import picocli.CommandLine description = ["Writes to a file the SHA256 hashes for each Bazel Target in the provided workspace."], versionProvider = VersionProvider::class) -class GenerateHashesCommand : Callable { +open class GenerateHashesCommand : Callable { @CommandLine.ParentCommand private lateinit var parent: BazelDiff 
@CommandLine.Option( diff --git a/cli/src/main/kotlin/com/bazel_diff/cli/WarmupCommand.kt b/cli/src/main/kotlin/com/bazel_diff/cli/WarmupCommand.kt new file mode 100644 index 0000000..e800ea8 --- /dev/null +++ b/cli/src/main/kotlin/com/bazel_diff/cli/WarmupCommand.kt @@ -0,0 +1,92 @@ +package com.bazel_diff.cli + +import com.bazel_diff.interactor.FingerprintInteractor +import java.io.File +import picocli.CommandLine + +/** + * The Firecracker **record-side entrypoint**. See `docs/firecracker-snapshots.md` §4.1. + * + * `warmup` is `generate-hashes` for the base revision plus two metadata side effects: + * 1. writes the base hashes to a known path (`--base-hashes`, default `/snap/base_hashes.json`), + * 2. writes the fingerprint file (`--fingerprint-output`, default `/snap/fingerprint.json`). + * + * Crucially, it exits `0` **only** once `bazel query` has completed and the server is warm and + * quiesced — the host watches for this clean exit as the "safe to snapshot" signal. + * + * It extends [GenerateHashesCommand] so it inherits the exact same query-affecting flags; warmup is + * deliberately *generate-hashes plus metadata*, never a divergent query path. That guarantees the + * base hashes baked into the snapshot are byte-identical to what a cold `generate-hashes` would + * produce, and that the fingerprint reflects the flags actually used. + */ +@CommandLine.Command( + name = "warmup", + mixinStandardHelpOptions = true, + description = + [ + "Record-side entrypoint for Firecracker snapshots: runs generate-hashes for the base " + + "revision, writes base hashes + fingerprint to known paths, and exits 0 only once " + + "the Bazel server is warm (the host's 'safe to snapshot' signal)."], + versionProvider = VersionProvider::class) +class WarmupCommand : GenerateHashesCommand() { + + @CommandLine.Option( + names = ["--base-hashes"], + description = ["Path to write the base hashes JSON. 
Default: /snap/base_hashes.json"], + defaultValue = "/snap/base_hashes.json") + lateinit var baseHashesPath: File + + @CommandLine.Option( + names = ["--fingerprint-output"], + description = ["Path to write the fingerprint JSON. Default: /snap/fingerprint.json"], + defaultValue = "/snap/fingerprint.json") + lateinit var fingerprintOutputPath: File + + override fun call(): Int { + // Route generate-hashes' output to the known base-hashes path. + outputPath = baseHashesPath + baseHashesPath.parentFile?.mkdirs() + + val genResult = super.call() + if (genResult != CommandLine.ExitCode.OK) { + // Do not write the fingerprint or signal "safe to snapshot" on a failed warmup. + return genResult + } + + writeFingerprint() + return CommandLine.ExitCode.OK + } + + /** + * Computes the fingerprint over the current flag set + workspace and writes it to + * [fingerprintOutputPath]. Split out of [call] so it is unit-testable without the bazel-backed + * `generate-hashes` run that [call] performs via `super.call()`. 
+ */ + fun writeFingerprint() { + val flags = + FingerprintGatherer.canonicalizeFlags( + bazelStartupOptions = bazelStartupOptions, + bazelCommandOptions = bazelCommandOptions, + cqueryCommandOptions = cqueryCommandOptions, + useCquery = useCquery, + cqueryExpression = cqueryExpression, + includeTargetType = includeTargetType, + targetType = targetType, + fineGrainedHashExternalRepos = fineGrainedHashExternalRepos, + ignoredRuleHashingAttributes = ignoredRuleHashingAttributes, + excludeExternalTargets = excludeExternalTargets, + keepGoing = keepGoing, + ) + val inputs = + FingerprintGatherer.gather( + workspacePath = workspacePath, + bazelPath = bazelPath, + bazelDiffVersion = VersionProvider().version.firstOrNull() ?: "unknown", + flags = flags, + ) + val result = FingerprintInteractor().compute(inputs) + + fingerprintOutputPath.parentFile?.mkdirs() + fingerprintOutputPath.writeText(renderFingerprintJson(result, inputs.flags) + "\n") + } +} diff --git a/cli/src/main/kotlin/com/bazel_diff/interactor/FingerprintInteractor.kt b/cli/src/main/kotlin/com/bazel_diff/interactor/FingerprintInteractor.kt new file mode 100644 index 0000000..feff991 --- /dev/null +++ b/cli/src/main/kotlin/com/bazel_diff/interactor/FingerprintInteractor.kt @@ -0,0 +1,96 @@ +package com.bazel_diff.interactor + +import com.bazel_diff.extensions.toHexString +import com.bazel_diff.hash.safePutBytes +import com.bazel_diff.hash.sha256 + +/** + * Inputs to the snapshot cache key (a.k.a. the "fingerprint"). See `docs/firecracker-snapshots.md` + * §5.2. + * + * A Firecracker snapshot may only be consumed when the consuming environment matches the recording + * environment on everything that could change the build graph `generate-hashes` produces. These are + * exactly those inputs. + */ +data class FingerprintInputs( + /** bazel-diff's own version (from [com.bazel_diff.cli.VersionProvider]). */ + val bazelDiffVersion: String, + /** The `bazel version` "Build label" string of the bazel binary used. 
*/ + val bazelVersion: String, + /** Raw bytes of `MODULE.bazel.lock`, or null if the workspace has no lockfile. */ + val moduleLockContent: ByteArray?, + /** Map of bazelrc path (relative to workspace) -> raw contents, including imported rc files. */ + val bazelrcContents: Map, + /** + * The canonicalized flag set that affects what `generate-hashes` queries or how it hashes. Keys + * are stable flag identifiers; values are their stringified settings. See + * [com.bazel_diff.cli.FingerprintGatherer.canonicalizeFlags]. + */ + val flags: Map, +) + +/** + * Result of computing a fingerprint: the overall key plus per-component sub-hashes for debugging. + */ +data class FingerprintResult( + val fingerprint: String, + val components: Map, +) + +/** + * Computes the snapshot cache key from [FingerprintInputs]. + * + * Pure and deterministic: identical inputs always yield an identical fingerprint, independent of + * map iteration order. This is the part of the Firecracker design that is fully unit-testable with + * no VM and no Bazel server (RFC Phase 1). + */ +class FingerprintInteractor { + fun compute(inputs: FingerprintInputs): FingerprintResult { + val components = LinkedHashMap() + + components["bazelDiffVersion"] = + sha256 { safePutBytes(inputs.bazelDiffVersion.toByteArray()) }.toHexString() + components["bazelVersion"] = + sha256 { safePutBytes(inputs.bazelVersion.toByteArray()) }.toHexString() + components["moduleLock"] = + sha256 { + // Tag presence so a missing lockfile never collides with a present-but-empty one. + val lock = inputs.moduleLockContent + if (lock == null) { + putBytes("absent".toByteArray()) + } else { + putBytes("present".toByteArray()) + putBytes(lock) + } + } + .toHexString() + components["bazelrc"] = + sha256 { + // Sort by path so iteration order never changes the result. 
+ inputs.bazelrcContents.toSortedMap().forEach { (path, content) -> + putBytes(path.toByteArray()) + safePutBytes(content) + } + } + .toHexString() + components["flags"] = + sha256 { + inputs.flags.toSortedMap().forEach { (k, v) -> + putBytes(k.toByteArray()) + putBytes(v.toByteArray()) + } + } + .toHexString() + + val fingerprint = + sha256 { + components.toSortedMap().forEach { (k, v) -> + putBytes(k.toByteArray()) + putBytes(v.toByteArray()) + } + } + .toHexString() + + return FingerprintResult(fingerprint, components) + } +} diff --git a/cli/src/test/kotlin/com/bazel_diff/cli/FingerprintCommandTest.kt b/cli/src/test/kotlin/com/bazel_diff/cli/FingerprintCommandTest.kt new file mode 100644 index 0000000..3d0434e --- /dev/null +++ b/cli/src/test/kotlin/com/bazel_diff/cli/FingerprintCommandTest.kt @@ -0,0 +1,65 @@ +package com.bazel_diff.cli + +import assertk.assertThat +import assertk.assertions.contains +import assertk.assertions.isEqualTo +import java.io.File +import org.junit.Rule +import org.junit.Test +import org.junit.rules.TemporaryFolder +import picocli.CommandLine + +class FingerprintCommandTest { + @get:Rule val temp: TemporaryFolder = TemporaryFolder() + + private fun fakeBazel(label: String): File = + File(temp.root, "bazel").apply { + writeText("#!/bin/sh\necho '$label'\n") + setExecutable(true) + } + + private fun command(ws: File): FingerprintCommand = + FingerprintCommand().apply { + workspacePath = ws.toPath() + bazelPath = fakeBazel("Build label: 8.5.1").toPath() + } + + @Test + fun writesFingerprintJsonToFile() { + val ws = temp.newFolder("ws") + File(ws, ".bazelrc").writeText("common --x") + val out = File(temp.root, "fp.json") + val cmd = command(ws).apply { outputPath = out } + + assertThat(cmd.call()).isEqualTo(CommandLine.ExitCode.OK) + val json = out.readText() + assertThat(json).contains("\"fingerprint\"") + assertThat(json).contains("\"flags\"") + assertThat(json).contains("\"components\"") + } + + @Test + fun 
fingerprintChangesWhenAFlagChanges() { + val ws = temp.newFolder("ws2") + val a = File(temp.root, "a.json") + val b = File(temp.root, "b.json") + command(ws).apply { outputPath = a }.call() + command(ws).apply { + outputPath = b + useCquery = true + } + .call() + // different flag set -> different fingerprint line + val fpA = a.readLines().first { it.contains("\"fingerprint\"") } + val fpB = b.readLines().first { it.contains("\"fingerprint\"") } + assert(fpA != fpB) { "fingerprint should change when --useCquery changes" } + } + + @Test + fun dashOutputDoesNotCreateAFileNamedDash() { + val ws = temp.newFolder("ws3") + val cmd = command(ws).apply { outputPath = File("-") } + assertThat(cmd.call()).isEqualTo(CommandLine.ExitCode.OK) + assertThat(File("-").exists()).isEqualTo(false) + } +} diff --git a/cli/src/test/kotlin/com/bazel_diff/cli/FingerprintGathererTest.kt b/cli/src/test/kotlin/com/bazel_diff/cli/FingerprintGathererTest.kt new file mode 100644 index 0000000..4091e6a --- /dev/null +++ b/cli/src/test/kotlin/com/bazel_diff/cli/FingerprintGathererTest.kt @@ -0,0 +1,87 @@ +package com.bazel_diff.cli + +import assertk.assertThat +import assertk.assertions.containsAll +import assertk.assertions.isEqualTo +import java.io.File +import org.junit.Rule +import org.junit.Test +import org.junit.rules.TemporaryFolder + +class FingerprintGathererTest { + @get:Rule val temp: TemporaryFolder = TemporaryFolder() + + @Test + fun canonicalizeFlagsSortsSetsAndJoinsLists() { + val flags = + FingerprintGatherer.canonicalizeFlags( + bazelStartupOptions = listOf("--a", "--b"), + bazelCommandOptions = listOf("--c"), + cqueryCommandOptions = emptyList(), + useCquery = true, + cqueryExpression = "deps(//...)", + includeTargetType = false, + targetType = setOf("Rule", "GeneratedFile"), + fineGrainedHashExternalRepos = setOf("maven", "abc"), + ignoredRuleHashingAttributes = emptySet(), + excludeExternalTargets = true, + keepGoing = false, + ) + 
assertThat(flags["bazelStartupOptions"]).isEqualTo("--a --b") + assertThat(flags["bazelCommandOptions"]).isEqualTo("--c") + assertThat(flags["useCquery"]).isEqualTo("true") + assertThat(flags["cqueryExpression"]).isEqualTo("deps(//...)") + // sets are sorted + comma-joined for determinism + assertThat(flags["targetType"]).isEqualTo("GeneratedFile,Rule") + assertThat(flags["fineGrainedHashExternalRepos"]).isEqualTo("abc,maven") + assertThat(flags["excludeExternalTargets"]).isEqualTo("true") + assertThat(flags["keepGoing"]).isEqualTo("false") + } + + @Test + fun canonicalizeFlagsIsOrderIndependentForSets() { + fun build(tt: Set) = + FingerprintGatherer.canonicalizeFlags( + emptyList(), emptyList(), emptyList(), false, null, false, tt, + emptySet(), emptySet(), false, true) + assertThat(build(linkedSetOf("a", "b", "c"))).isEqualTo(build(linkedSetOf("c", "a", "b"))) + } + + private fun fakeBazel(label: String): File = + File(temp.root, "bazel").apply { + writeText("#!/bin/sh\necho '$label'\n") + setExecutable(true) + } + + @Test + fun gatherReadsLockBazelrcImportsAndBazelVersion() { + val ws = temp.newFolder("ws") + File(ws, "MODULE.bazel.lock").writeText("lockbytes") + // .bazelrc that imports another rc via %workspace% + File(ws, ".bazelrc").writeText("common --x\nimport %workspace%/ci.bazelrc\n") + File(ws, "ci.bazelrc").writeText("build --y") + val bazel = fakeBazel("Build label: 8.5.1") + + val flags = mapOf("useCquery" to "false") + val inputs = + FingerprintGatherer.gather(ws.toPath(), bazel.toPath(), "26.0.1", flags) + + assertThat(inputs.bazelVersion).isEqualTo("8.5.1") + assertThat(inputs.bazelDiffVersion).isEqualTo("26.0.1") + assertThat(String(inputs.moduleLockContent!!)).isEqualTo("lockbytes") + assertThat(inputs.flags).isEqualTo(flags) + // both the root rc and the imported rc are captured + assertThat(inputs.bazelrcContents.keys).containsAll(".bazelrc", "ci.bazelrc") + } + + @Test + fun gatherHandlesMissingFilesAndUnknownBazel() { + val ws = 
temp.newFolder("empty-ws") // no MODULE.bazel.lock, no .bazelrc + val missingBazel = File(temp.root, "no-bazel").toPath() + val inputs = FingerprintGatherer.gather(ws.toPath(), missingBazel, "26.0.1", emptyMap()) + assertThat(inputs.moduleLockContent).isEqualTo(null) + assertThat(inputs.bazelrcContents.isEmpty()).isEqualTo(true) + // a bazel binary that can't run yields the "unknown" sentinel, not a crash + assertThat(inputs.bazelVersion).isEqualTo("unknown") + } +} diff --git a/cli/src/test/kotlin/com/bazel_diff/cli/WarmupCommandTest.kt b/cli/src/test/kotlin/com/bazel_diff/cli/WarmupCommandTest.kt new file mode 100644 index 0000000..fbcbd10 --- /dev/null +++ b/cli/src/test/kotlin/com/bazel_diff/cli/WarmupCommandTest.kt @@ -0,0 +1,76 @@ +package com.bazel_diff.cli + +import assertk.assertThat +import assertk.assertions.contains +import assertk.assertions.isEqualTo +import java.io.File +import org.junit.Rule +import org.junit.Test +import org.junit.rules.TemporaryFolder + +class WarmupCommandTest { + @get:Rule val temp: TemporaryFolder = TemporaryFolder() + + private fun fakeBazel(label: String): File = + File(temp.root, "bazel").apply { + writeText("#!/bin/sh\necho '$label'\n") + setExecutable(true) + } + + @Test + fun writeFingerprintEmitsJsonReflectingFlags() { + val ws = temp.newFolder("ws") + File(ws, ".bazelrc").writeText("common --x") + val fpOut = File(temp.newFolder("snap"), "fingerprint.json") + + val cmd = + WarmupCommand().apply { + workspacePath = ws.toPath() + bazelPath = fakeBazel("Build label: 8.5.1").toPath() + fingerprintOutputPath = fpOut + } + cmd.writeFingerprint() + + val json = fpOut.readText() + assertThat(json).contains("\"fingerprint\"") + assertThat(json).contains("\"flags\"") + // the flag set the fingerprint covers is rendered, reflecting defaults + assertThat(json).contains("useCquery") + } + + @Test + fun writeFingerprintCreatesParentDirs() { + val ws = temp.newFolder("ws2") + // nested, not-yet-existing output dir must be created + 
val fpOut = File(temp.root, "nested/dir/fingerprint.json") + WarmupCommand() + .apply { + workspacePath = ws.toPath() + bazelPath = fakeBazel("Build label: 8.5.1").toPath() + fingerprintOutputPath = fpOut + } + .writeFingerprint() + assertThat(fpOut.exists()).isEqualTo(true) + } + + @Test + fun fingerprintChangesWithFlags() { + val ws = temp.newFolder("ws3") + val a = File(temp.root, "a.json") + val b = File(temp.root, "b.json") + fun run(out: File, cquery: Boolean) = + WarmupCommand() + .apply { + workspacePath = ws.toPath() + bazelPath = fakeBazel("Build label: 8.5.1").toPath() + fingerprintOutputPath = out + useCquery = cquery + } + .writeFingerprint() + run(a, false) + run(b, true) + val fpA = a.readLines().first { it.contains("\"fingerprint\"") } + val fpB = b.readLines().first { it.contains("\"fingerprint\"") } + assert(fpA != fpB) { "fingerprint must change when --useCquery changes" } + } +} diff --git a/cli/src/test/kotlin/com/bazel_diff/interactor/FingerprintInteractorTest.kt b/cli/src/test/kotlin/com/bazel_diff/interactor/FingerprintInteractorTest.kt new file mode 100644 index 0000000..83a21ee --- /dev/null +++ b/cli/src/test/kotlin/com/bazel_diff/interactor/FingerprintInteractorTest.kt @@ -0,0 +1,110 @@ +package com.bazel_diff.interactor + +import assertk.assertThat +import assertk.assertions.isEqualTo +import assertk.assertions.isNotEqualTo +import org.junit.Test + +class FingerprintInteractorTest { + private val interactor = FingerprintInteractor() + + private fun inputs( + bazelDiffVersion: String = "26.0.1", + bazelVersion: String = "8.5.1", + moduleLockContent: ByteArray? 
= "lock-contents".toByteArray(), + bazelrcContents: Map = mapOf(".bazelrc" to "common --x".toByteArray()), + flags: Map = mapOf("useCquery" to "false"), + ) = + FingerprintInputs( + bazelDiffVersion = bazelDiffVersion, + bazelVersion = bazelVersion, + moduleLockContent = moduleLockContent, + bazelrcContents = bazelrcContents, + flags = flags, + ) + + @Test + fun deterministic_sameInputsSameFingerprint() { + val a = interactor.compute(inputs()) + val b = interactor.compute(inputs()) + assertThat(a.fingerprint).isEqualTo(b.fingerprint) + } + + @Test + fun flagMapOrderDoesNotMatter() { + val a = + interactor.compute( + inputs(flags = linkedMapOf("useCquery" to "true", "keepGoing" to "false"))) + val b = + interactor.compute( + inputs(flags = linkedMapOf("keepGoing" to "false", "useCquery" to "true"))) + assertThat(a.fingerprint).isEqualTo(b.fingerprint) + } + + @Test + fun bazelrcOrderDoesNotMatter() { + val a = + interactor.compute( + inputs( + bazelrcContents = + linkedMapOf( + ".bazelrc" to "a".toByteArray(), "ci.bazelrc" to "b".toByteArray()))) + val b = + interactor.compute( + inputs( + bazelrcContents = + linkedMapOf( + "ci.bazelrc" to "b".toByteArray(), ".bazelrc" to "a".toByteArray()))) + assertThat(a.fingerprint).isEqualTo(b.fingerprint) + } + + @Test + fun bazelVersionChangeChangesFingerprint() { + val a = interactor.compute(inputs(bazelVersion = "8.5.1")) + val b = interactor.compute(inputs(bazelVersion = "8.6.0")) + assertThat(a.fingerprint).isNotEqualTo(b.fingerprint) + } + + @Test + fun bazelDiffVersionChangeChangesFingerprint() { + val a = interactor.compute(inputs(bazelDiffVersion = "26.0.1")) + val b = interactor.compute(inputs(bazelDiffVersion = "27.0.0")) + assertThat(a.fingerprint).isNotEqualTo(b.fingerprint) + } + + @Test + fun moduleLockChangeChangesFingerprint() { + val a = interactor.compute(inputs(moduleLockContent = "v1".toByteArray())) + val b = interactor.compute(inputs(moduleLockContent = "v2".toByteArray())) + 
assertThat(a.fingerprint).isNotEqualTo(b.fingerprint) + } + + @Test + fun missingModuleLockIsDistinctFromEmpty() { + val absent = interactor.compute(inputs(moduleLockContent = null)) + val empty = interactor.compute(inputs(moduleLockContent = ByteArray(0))) + // null (no lockfile) and a present-but-empty lockfile must not collide. + assertThat(absent.components["moduleLock"]).isNotEqualTo(empty.components["moduleLock"]) + } + + @Test + fun bazelrcChangeChangesFingerprint() { + val a = interactor.compute(inputs(bazelrcContents = mapOf(".bazelrc" to "x".toByteArray()))) + val b = interactor.compute(inputs(bazelrcContents = mapOf(".bazelrc" to "y".toByteArray()))) + assertThat(a.fingerprint).isNotEqualTo(b.fingerprint) + } + + @Test + fun flagChangeChangesFingerprint() { + val a = interactor.compute(inputs(flags = mapOf("useCquery" to "false"))) + val b = interactor.compute(inputs(flags = mapOf("useCquery" to "true"))) + assertThat(a.fingerprint).isNotEqualTo(b.fingerprint) + } + + @Test + fun exposesPerComponentHashes() { + val r = interactor.compute(inputs()) + assertThat(r.components.keys) + .isEqualTo(setOf("bazelDiffVersion", "bazelVersion", "moduleLock", "bazelrc", "flags")) + } +} diff --git a/tools/firecracker/README.md b/tools/firecracker/README.md new file mode 100644 index 0000000..fbc4fd8 --- /dev/null +++ b/tools/firecracker/README.md @@ -0,0 +1,219 @@ +# Firecracker snapshot harness for bazel-diff + +Implementation + benchmarking harness for the Firecracker snapshot design +([`docs/firecracker-snapshots.md`](../../docs/firecracker-snapshots.md), PR #376). + +The goal: **instant starts** of bazel-diff on large monorepos by restoring a +microVM whose Bazel server already has the build graph loaded and external repos +fetched, so the PR-time path only re-analyses changed packages. 
+ +This directory contains everything needed to build, validate, and run that: + +| Piece | What it is | Runs on | +| --- | --- | --- | +| `bench/gen_project.py` | Synthetic large-Bazel-project generator (no external toolchains) | anywhere | +| `bench/bench.py` | Cold-vs-warm analysis-time benchmark (the addressable win) | anywhere | +| `bazel-diff fingerprint` / `warmup` | CLI hooks (Kotlin, in `//cli`) | anywhere | +| `bazel-diff-snap` (this Go module) | `record` / `consume` orchestrator | Linux+KVM (real), anywhere (local driver) | + +## Why the cold-vs-warm benchmark proves the win + +A restored snapshot ≈ a **warm** Bazel server (graph loaded, externals fetched). +So the per-PR analysis-time a snapshot can save is bounded by: + +``` +win = cold `generate-hashes` - warm `generate-hashes` +``` + +`bench.py` measures both on a generated workspace and asserts the warm output is +**byte-identical** to the cold output (the RFC's core correctness claim). On a +~15k-target synthetic graph this is already ~70% faster warm; on a real monorepo +(bzlmod resolution + minutes of cold start) the absolute win is far larger. + +### Run the benchmark (any platform) + +```bash +# 1. generate a large workspace with two revisions to diff +python3 bench/gen_project.py --out /tmp/bigproj --packages 3000 \ + --targets-per-package 4 --git +# -> prints {"base_sha": "...", "target_sha": "...", ...} + +# 2. build bazel-diff +bazel run //:bazel-diff --script_path=/tmp/bazel_diff + +# 3. benchmark cold vs warm +python3 bench/bench.py \ + --workspace /tmp/bigproj \ + --base-sha <base_sha> --target-sha <target_sha> \ + --bazel "$(which bazel)" --bazel-diff /tmp/bazel_diff \ + --iterations 3 --json /tmp/report.json +``` + +`gen_project.py` builds a **layered** genrule DAG: packages are partitioned into +`--layers` bands and a package depends on a few packages in the previous band. 
+This gives real depth + width and real source files for `SourceFileHasher` to +hash — all with **zero** external toolchains, so the cold path actually +re-analyses on every run and the benchmark is reproducible. Layering bounds the +graph *depth* (default 40), which matters at scale: bazel-diff hashes +transitively with a per-level dependency-path set, so a single N-deep chain would +cost O(N²) memory — a layered, modular graph (like a real monorepo) does not. + +### Run on Linux at scale (Docker) + +The whole flow above also runs in a Linux container — the actual CI target OS — +via [`bench/run_docker_bench.sh`](bench/run_docker_bench.sh): + +```bash +bazel build //cli:bazel-diff_deploy.jar +(cd tools/firecracker && GOOS=linux GOARCH=arm64 go build -o /tmp/bazel-diff-snap-linux-arm64 .) +ARCH=arm64 SNAP=/tmp/bazel-diff-snap-linux-arm64 \ + tools/firecracker/bench/run_docker_bench.sh 11500 2 # ~150k targets, 2 iters +# results land in .bench-results/ (report.json, target_count.txt, impacted.txt) +``` + +The image bundles a JDK, bazelisk, git, the bazel-diff fat jar, and the Go +orchestrator. It does **not** run Firecracker itself — that needs `/dev/kvm`, +which Docker-for-Mac does not expose; real microVM record/consume runs on the +self-hosted Linux+KVM host. + +## CLI hooks (`//cli`) + +Two picocli subcommands implement RFC §4 (Phase 1, pure Kotlin, unit-tested): + +- **`bazel-diff fingerprint`** — computes the snapshot cache key over the inputs + that affect the build graph (bazel version, `MODULE.bazel.lock`, `.bazelrc`, + bazel-diff version, flag set) and writes it as JSON. Used to decide whether a + snapshot is safe to consume. +- **`bazel-diff warmup`** — the record-side entrypoint: runs `generate-hashes` + for the base revision, writes `base_hashes.json` + `fingerprint.json` to known + paths, and exits `0` only once the server is warm (the host's "safe to + snapshot" signal). 
It *extends* `generate-hashes`, so the baked base hashes are + byte-identical to a cold run. + +## Orchestrator (`bazel-diff-snap`, this Go module) + +Dependency-free (stdlib only): the Firecracker REST API is spoken over a unix +socket with `net/http`, so the tool builds as a static CI binary with no module +downloads. *(This is a deliberate deviation from the RFC's `firecracker-go-sdk` +suggestion — the API surface we need is small and a zero-dependency static binary +is simpler to ship to CI.)* + +```bash +go build -o bazel-diff-snap . # native +GOOS=linux GOARCH=amd64 go build . # static Linux binary for CI +go test ./... # pure logic + API client, runs anywhere +``` + +### Drivers + +- **`--driver local`** (default) — runs `warmup` / `generate-hashes` / + `get-impacted-targets` directly on the host, no VM. Exercises the full + store + fingerprint + resolve pipeline; works on macOS. This is what the unit + tests and local end-to-end runs use. +- **`--driver firecracker`** — boots/snapshots/restores a real microVM via the + Firecracker API. Requires Linux + `/dev/kvm`, a prepared kernel + rootfs base + image (`bench/build_guest_image.sh`), and a host TAP (`bench/setup_tap.sh`) so + the orchestrator can ssh into the guest. This is the production path (RFC §6). + + The guest gets a virtio-net device backed by the host TAP; the orchestrator + bakes a static `ip=` directive into the kernel cmdline so the guest is + reachable with no DHCP. Addressing defaults match the guest image's + `fcnet-setup.sh` MAC→IP convention (MAC `06:00:AC:10:00:02` → `172.16.0.2/30`). + TAP creation needs `CAP_NET_ADMIN`, so it is the operator's job (run + `setup_tap.sh` once); the orchestrator stays privilege-free and only checks the + TAP exists before a restore. Relevant flags: + + ``` + --kernel guest kernel (rootfs.base.ext4 must sit beside it) + --guest-addr guest ssh address, e.g. root@172.16.0.2 + --tap-device host TAP backing the guest NIC, e.g. 
fc-tap0 + --guest-ip / --host-ip / --netmask / --guest-mac static TAP addressing + --guest-key ssh identity trusted by the guest + --vcpus / --mem-mib guest sizing + ``` + +### Building the guest image + +```bash +# 1. host TAP (once per host; needs root) +sudo tools/firecracker/bench/setup_tap.sh # fc-tap0, host 172.16.0.1/30 + +# 2. kernel + rootfs.base.ext4 with JDK + bazel + git + bazel-diff + workspace +sudo -E OUT=/tmp/fc-image \ + BAZEL_DIFF_JAR=bazel-bin/cli/bazel-diff_deploy.jar \ + BAZEL_BIN=$(which bazelisk) WORKSPACE_SRC=/tmp/bigproj \ + SSH_PUBKEY=~/.ssh/fc_guest.pub \ + tools/firecracker/bench/build_guest_image.sh +``` + +The image switches sshd from systemd socket-activation to a standalone +always-running daemon — socket-activated sshd does not service connections +reliably after a snapshot restore. + +### record / consume + +```bash +# RECORD a snapshot for a base revision (on merge to master / nightly) +bazel-diff-snap record \ + --workspace /path/to/repo --base-sha <base_sha> \ + --store /snapshots --bazel "$(which bazel)" --bazel-diff /tmp/bazel_diff + +# CONSUME on a PR: resolve a compatible snapshot, restore, diff +bazel-diff-snap consume \ + --workspace /path/to/repo --target-sha <target_sha> \ + --store /snapshots --out impacted.txt \ + --bazel "$(which bazel)" --bazel-diff /tmp/bazel_diff +# exit 0 = wrote impacted.txt | exit 2 = no compatible snapshot, run cold path | exit 1 = error +``` + +`consume` is **fail-safe** (RFC §5.2): it fingerprints the *target* environment +and only uses a snapshot whose fingerprint matches **and** whose base SHA is an +ancestor of the target (nearest ancestor wins, to minimise re-analysis). Any +mismatch → exit `2` so the caller runs the existing cold +[`bazel-diff-example.sh`](../../bazel-diff-example.sh) path. A stale snapshot is +never silently trusted. 
+ +### Snapshot store layout + +``` +<store>/<fingerprint>/<base_sha>/ + mem_file guest memory image (diff snapshot) [firecracker driver] + vmstate Firecracker microVM state [firecracker driver] + rootfs.backing frozen read-only disk image [firecracker driver] + base_hashes.json produced by `bazel-diff warmup` + fingerprint.json the cache key + flag set + metadata.json fingerprint, baseSHA, versions, created-at +``` + +## Status + +- [x] Synthetic generator + cold/warm benchmark — validates the analysis-time win +- [x] `fingerprint` + `warmup` CLI hooks (unit-tested) +- [x] Go orchestrator: store/resolve/fingerprint-match/API client (unit-tested), + `local` driver end-to-end +- [x] `firecracker` driver: networking (TAP-backed NIC + static `ip=`) wired into + record/consume, restore-time TAP precondition, unit-tested API client +- [x] Guest image + TAP build scripts (`bench/build_guest_image.sh`, + `bench/setup_tap.sh`) +- [x] Real-microVM canary as a gated Go integration test (`fc_integration_test.go`, + build tag `fcintegration`) — runs `fcDriver.record` + `consume` end-to-end +- [x] One-click CI workflow that builds the image, boots a real microVM, and + asserts snapshot-consumed == cold: + [`.github/workflows/firecracker-e2e.yml`](../../.github/workflows/firecracker-e2e.yml) + (`workflow_dispatch`; runs on a mainline-kernel x86_64 runner with `/dev/kvm`) + +### Validation notes (aarch64 / KVM) + +The driver was exercised on real Firecracker + `/dev/kvm`. **What works:** microVM +boot, the full API sequence incl. `PUT /network-interfaces`, ssh into the guest +over the TAP, `PATCH /vm` pause, `PUT /snapshot/create`, and `PUT /snapshot/load` ++ resume — devices reconnect and the restored guest's *kernel* networking is live +(host↔guest ping and the TCP handshake to sshd both succeed). 
// readBazelLabel runs `bazel version` with workspace as the working directory
// and returns the value of the "Build label: " line, for bookkeeping in
// snapshot metadata.
//
// If the command cannot be started or exits non-zero, the error from exec is
// returned together with an empty label. If the command succeeds but prints no
// "Build label: " line, it returns "" and a nil error, so callers must treat an
// empty label as "unknown" rather than as a failure.
func readBazelLabel(bazel, workspace string) (string, error) {
	const labelPrefix = "Build label: "

	cmd := exec.Command(bazel, "version")
	cmd.Dir = workspace
	out, err := cmd.Output()
	if err != nil {
		return "", err
	}
	for _, line := range strings.Split(string(out), "\n") {
		if !strings.HasPrefix(line, labelPrefix) {
			continue
		}
		// TrimSpace also drops a trailing "\r" left by CRLF output.
		return strings.TrimSpace(line[len(labelPrefix):]), nil
	}
	return "", nil
}
// writeFakeBazel creates an executable /bin/sh stub named "bazel" inside dir.
// When invoked with "version" as its first argument the stub prints
// versionOutput (via a quoted heredoc, so it is emitted verbatim); for any
// other invocation it prints nothing. The path of the stub is returned, and
// the test is failed immediately if the file cannot be written.
func writeFakeBazel(t *testing.T, dir, versionOutput string) string {
	t.Helper()

	const (
		scriptHead = "#!/bin/sh\nif [ \"$1\" = version ]; then cat <<'EOF'\n"
		scriptTail = "\nEOF\nfi\n"
	)

	stubPath := filepath.Join(dir, "bazel")
	script := []byte(scriptHead + versionOutput + scriptTail)
	err := os.WriteFile(stubPath, script, 0o755)
	if err != nil {
		t.Fatal(err)
	}
	return stubPath
}
+FROM eclipse-temurin:21-jdk + +ARG BAZELISK_ARCH=arm64 +ARG BAZELISK_VERSION=v1.20.0 + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + git python3 bash curl ca-certificates \ + && rm -rf /var/lib/apt/lists/* \ + && curl -fsSL -o /usr/local/bin/bazelisk \ + "https://github.com/bazelbuild/bazelisk/releases/download/${BAZELISK_VERSION}/bazelisk-linux-${BAZELISK_ARCH}" \ + && chmod +x /usr/local/bin/bazelisk \ + && ln -sf /usr/local/bin/bazelisk /usr/local/bin/bazel + +# bazel-diff as a portable fat jar + a wrapper on PATH. +COPY bazel-diff.jar /opt/bazel-diff.jar +# Large repos need real heap for transitive hashing; default is ~25% of RAM. +RUN printf '#!/bin/sh\nexec java %s -jar /opt/bazel-diff.jar "$@"\n' "${BAZEL_DIFF_JVM_ARGS:--Xmx4g}" \ + > /usr/local/bin/bazel-diff \ + && chmod +x /usr/local/bin/bazel-diff + +# The Go orchestrator (linux binary) + the bench scripts. +COPY bazel-diff-snap /usr/local/bin/bazel-diff-snap +COPY gen_project.py bench.py run_in_container.sh /opt/bench/ +RUN chmod +x /usr/local/bin/bazel-diff-snap /opt/bench/run_in_container.sh + +WORKDIR /work +ENTRYPOINT ["/opt/bench/run_in_container.sh"] diff --git a/tools/firecracker/bench/bench.py b/tools/firecracker/bench/bench.py new file mode 100644 index 0000000..5f39cc7 --- /dev/null +++ b/tools/firecracker/bench/bench.py @@ -0,0 +1,207 @@ +#!/usr/bin/env python3 +"""Cold-vs-warm analysis-time benchmark for the Firecracker snapshot RFC. + +This measures the *addressable* win of a Firecracker snapshot without needing +Firecracker itself. The insight: a restored snapshot gives you a Bazel server +whose Skyframe graph is already loaded and whose external repos are already +fetched — i.e. a **warm** server. 
So: + + snapshot-restore consume ~= warm `generate-hashes` + restore overhead + +and the per-PR win a snapshot can capture is bounded by: + + win = cold `generate-hashes` - warm `generate-hashes` + +The cold number is what bazel-diff costs today on every PR (fresh server, +full graph load). The warm number is what it would cost if the server were +already warm — exactly the snapshot consume path. This script measures both +on a real generated workspace and reports the delta. + +It also asserts the warm output is byte-identical to the cold output, which is +the RFC's core correctness claim (an incorrect affected set is worse than none). + +Usage: + python3 bench.py \ + --workspace /tmp/benchbig \ + --base-sha --target-sha \ + --bazel /usr/local/bin/bazel \ + --bazel-diff /tmp/bazel_diff \ + --iterations 3 \ + [--json out.json] +""" + +from __future__ import annotations + +import argparse +import json +import statistics +import subprocess +import sys +import time +from pathlib import Path + + +def _run(cmd: list[str], cwd: str | None = None) -> None: + # Capture stderr so a failing bazel/bazel-diff surfaces its diagnostics + # instead of failing silently. 
def _generate_hashes(bazel_diff: str, bazel: str, workspace: str, out: str) -> float:
    """Run ``bazel-diff generate-hashes`` once and return how long it took.

    Args:
        bazel_diff: path to the bazel-diff launcher.
        bazel: path to the bazel binary, passed through as ``-b``.
        workspace: workspace directory, passed through as ``-w``.
        out: file the hashes JSON is written to.

    Returns:
        Elapsed time of the command in seconds.

    Raises:
        RuntimeError: propagated from ``_run`` when the command exits non-zero.
    """
    cmd = [bazel_diff, "generate-hashes", "-w", workspace, "-b", bazel, out]
    # perf_counter is a monotonic, high-resolution clock; time.time() is the
    # wall clock and can jump (NTP, manual changes), which would corrupt a
    # measured interval.
    started = time.perf_counter()
    _run(cmd)
    return time.perf_counter() - started
def main(argv: list[str]) -> int:
    """Parse the command line, run the benchmark, and report the result.

    Args:
        argv: command-line arguments, without the program name.

    Returns:
        0 when the warm output equalled the cold output in every iteration,
        1 otherwise.

    Exits through argparse (status 2) on invalid arguments, including an
    ``--iterations`` value below 1.
    """
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument("--workspace", required=True)
    p.add_argument("--base-sha", required=True)
    p.add_argument("--target-sha", required=True)
    p.add_argument("--bazel", default="bazel")
    p.add_argument("--bazel-diff", required=True)
    p.add_argument("--iterations", type=int, default=3)
    p.add_argument("--json", help="write full report JSON here")
    args = p.parse_args(argv)

    # The report takes medians over the per-iteration samples; with no
    # iterations there are no samples and the median raises. Reject this up
    # front with a usage error instead of a traceback, before touching disk.
    if args.iterations < 1:
        p.error("--iterations must be >= 1")

    # Scratch outputs live next to the workspace, not inside it.
    tmp = Path(args.workspace).parent / "_bench_tmp"
    tmp.mkdir(exist_ok=True)

    print("Running cold-vs-warm benchmark...", file=sys.stderr)
    report = benchmark(
        workspace=args.workspace,
        base_sha=args.base_sha,
        target_sha=args.target_sha,
        bazel=args.bazel,
        bazel_diff=args.bazel_diff,
        iterations=args.iterations,
        tmp=tmp,
    )

    print("\n=== Analysis-time win (median over "
          f"{report['iterations']} iters) ===")
    print(f" cold consume (today): {report['cold_consume_s']}s")
    print(f" warm consume (snapshot ~=): {report['warm_consume_s']}s")
    print(f" win: {report['win_s']}s "
          f"({report['win_pct']}% faster)")
    print(f" correctness (warm == cold): "
          f"{'OK' if report['correctness_warm_equals_cold'] else 'FAILED'}")

    if args.json:
        Path(args.json).write_text(json.dumps(report, indent=2))
        print(f"\nwrote {args.json}")

    # A correctness mismatch is a failure even though timings were collected.
    if not report["correctness_warm_equals_cold"]:
        return 1
    return 0
+# +# OUT=/tmp/fc-image \ +# BAZEL_DIFF_JAR=bazel-bin/cli/bazel-diff_deploy.jar \ +# BAZEL_BIN=$(which bazelisk) \ +# WORKSPACE_SRC=/tmp/fcbench \ +# SSH_PUBKEY=~/.ssh/fc_guest.pub \ +# tools/firecracker/bench/build_guest_image.sh +set -euo pipefail + +ARCH="${ARCH:-aarch64}" +KERNEL_VER="${KERNEL_VER:-6.1.128}" +CI_BASE="${CI_BASE:-https://s3.amazonaws.com/spec.ccfc.min/firecracker-ci/v1.12/${ARCH}}" +OUT="${OUT:-/tmp/fc-image}" +SIZE_MB="${SIZE_MB:-6144}" +JDK_PKG="${JDK_PKG:-openjdk-21-jdk-headless}" + +BAZEL_DIFF_JAR="${BAZEL_DIFF_JAR:?set BAZEL_DIFF_JAR to the bazel-diff_deploy.jar}" +BAZEL_BIN="${BAZEL_BIN:?set BAZEL_BIN to a bazel/bazelisk binary to bake in}" +WORKSPACE_SRC="${WORKSPACE_SRC:?set WORKSPACE_SRC to the git workspace to bake in}" +SSH_PUBKEY="${SSH_PUBKEY:?set SSH_PUBKEY to the public key the guest should trust}" + +if [[ $EUID -ne 0 ]]; then echo "run as root (chroot + mke2fs): sudo -E $0" >&2; exit 1; fi + +mkdir -p "$OUT" +KERNEL="$OUT/vmlinux-$KERNEL_VER" +ROOTFS_IMG="$OUT/rootfs.base.ext4" +WORK="$OUT/rootfs.work" + +echo ">> [1/6] fetch kernel + base rootfs" +[[ -f "$KERNEL" ]] || curl -fsSL -o "$KERNEL" "$CI_BASE/vmlinux-$KERNEL_VER" +SQUASHFS="$OUT/ubuntu.squashfs" +[[ -f "$SQUASHFS" ]] || curl -fsSL -o "$SQUASHFS" "$CI_BASE/ubuntu-24.04.squashfs" + +echo ">> [2/6] extract base rootfs" +rm -rf "$WORK" +unsquashfs -q -d "$WORK" "$SQUASHFS" + +echo ">> [3/6] inject ssh key + snap dir" +install -d -m 700 "$WORK/root/.ssh" +install -m 600 "$SSH_PUBKEY" "$WORK/root/.ssh/authorized_keys" +install -d -m 755 "$WORK/snap" +# Generate guest host keys at build time so the first ssh doesn't race sshd. +ssh-keygen -A -f "$WORK" >/dev/null 2>&1 || chroot "$WORK" ssh-keygen -A +# Standalone always-on sshd: socket-activated sshd (the Ubuntu default) does not +# reliably service connections after a snapshot restore. Enable ssh.service and +# disable ssh.socket so the daemon is already listening when the VM resumes. 
+if [[ -e "$WORK/lib/systemd/system/ssh.service" ]]; then + mkdir -p "$WORK/etc/systemd/system/multi-user.target.wants" + ln -sf /lib/systemd/system/ssh.service \ + "$WORK/etc/systemd/system/multi-user.target.wants/ssh.service" + rm -f "$WORK/etc/systemd/system/sockets.target.wants/ssh.socket" + rm -f "$WORK/etc/systemd/system/ssh.service.requires/ssh.socket" +fi + +echo ">> [4/6] install JDK + git into the chroot" +mount --bind /dev "$WORK/dev"; mount -t proc proc "$WORK/proc"; mount -t sysfs sys "$WORK/sys" +# devpts is needed by package postinst scripts (e.g. the JDK invokes java, which +# calls posix_openpt); without it dpkg --configure fails on minimized bases. +mkdir -p "$WORK/dev/pts"; mount -t devpts devpts "$WORK/dev/pts" +cp /etc/resolv.conf "$WORK/etc/resolv.conf" +trap 'umount -l "$WORK/dev/pts" "$WORK/dev" "$WORK/proc" "$WORK/sys" 2>/dev/null || true' EXIT +chroot "$WORK" /bin/bash -euxc " + export DEBIAN_FRONTEND=noninteractive + # Some hosts' chroot /tmp is missing/unwritable for the unprivileged _apt + # sandbox user, which makes apt fail to stage temp files ('Couldn't create + # temporary file ...') and then report repos as unsigned. Ensure a sticky + # world-writable /tmp and run apt as root (sandbox off) to be host-agnostic. + install -d -m 1777 /tmp + # Minimized base images can lack apt's spool/log dirs; recreate them so + # downloads and dpkg configuration (which writes apt logs) succeed. + install -d /var/cache/apt/archives/partial /var/lib/apt/lists/partial /var/log/apt + # ...and lack /usr/share/man/manN, which the JDK's update-alternatives needs + # to create man symlinks (else 'error creating symbolic link .../java.1.gz'). 
+ mkdir -p /usr/share/man/man1 /usr/share/man/man2 /usr/share/man/man3 \ + /usr/share/man/man4 /usr/share/man/man5 /usr/share/man/man6 \ + /usr/share/man/man7 /usr/share/man/man8 + apt-get -o APT::Sandbox::User=root update -qq + apt-get -o APT::Sandbox::User=root install -y --no-install-recommends $JDK_PKG git ca-certificates >/dev/null + apt-get clean && rm -rf /var/lib/apt/lists/* +" + +echo ">> [5/6] bake bazel + bazel-diff + workspace" +install -m 755 "$BAZEL_BIN" "$WORK/usr/local/bin/bazel" +install -d -m 755 "$WORK/opt/bazel-diff" +install -m 644 "$BAZEL_DIFF_JAR" "$WORK/opt/bazel-diff/bazel-diff_deploy.jar" +cat > "$WORK/usr/local/bin/bazel-diff" <<'EOF' +#!/bin/sh +exec java -jar /opt/bazel-diff/bazel-diff_deploy.jar "$@" +EOF +chmod 755 "$WORK/usr/local/bin/bazel-diff" +# Bake the workspace (git repo) under /work. record/consume `git checkout` here. +# Own it as root (the guest runs commands as root) so git doesn't reject the repo +# with "detected dubious ownership" (exit 128) when the source uid differs. +rm -rf "$WORK/work" +cp -a "$WORKSPACE_SRC" "$WORK/work" +chown -R 0:0 "$WORK/work" + +echo ">> [6/6] build ext4 image ($SIZE_MB MiB) at $ROOTFS_IMG" +umount -l "$WORK/dev/pts" "$WORK/dev" "$WORK/proc" "$WORK/sys" 2>/dev/null || true +trap - EXIT +rm -f "$ROOTFS_IMG" +mke2fs -q -F -L rootfs -t ext4 -d "$WORK" "$ROOTFS_IMG" "${SIZE_MB}M" + +echo +echo "done:" +echo " kernel : $KERNEL" +echo " rootfs : $ROOTFS_IMG (baseRootfs convention: rootfs.base.ext4 next to kernel)" +echo " guest workspace: /work guest snap dir: /snap ssh: root@172.16.0.2" diff --git a/tools/firecracker/bench/gen_project.py b/tools/firecracker/bench/gen_project.py new file mode 100644 index 0000000..0ca06eb --- /dev/null +++ b/tools/firecracker/bench/gen_project.py @@ -0,0 +1,255 @@ +#!/usr/bin/env python3 +"""Synthetic Bazel workspace generator for bazel-diff benchmarking. 
+ +Produces a large, self-contained Bazel workspace that uses **only native +genrule + source-file targets** — no external toolchains, no network, no +rules_* downloads. The point is to stress the exact thing the Firecracker +snapshot RFC targets: the cost of `bazel query deps(//...)` (Skyframe graph +load + package analysis) on a large graph. + +The graph is a layered DAG: package ``i`` depends on a few lower-indexed +packages, giving both depth (long dependency chains) and width (many +packages). Every package contains real source files so that bazel-diff's +SourceFileHasher has content to hash, plus a chain of genrules that consume +those sources and the outputs of upstream packages. + +Optionally (``--git``) the generator initialises a git repo with two commits: +a ``base`` commit and a ``change`` commit that mutates a single source file +deep in the graph. The two SHAs are what the benchmark / orchestrator diffs. + +Example: + + python3 gen_project.py --out /tmp/bigproj --packages 2000 \ + --targets-per-package 4 --git + + # prints JSON: {"workspace": "...", "base_sha": "...", "target_sha": "..."} +""" + +from __future__ import annotations + +import argparse +import json +import os +import random +import subprocess +import sys +from pathlib import Path + +# Bazel version the generated workspace pins. Kept in sync with the repo root +# .bazelversion so a single bazelisk install serves both. +DEFAULT_BAZEL_VERSION = "8.5.1" + + +def _layer_of(i: int, packages: int, layers: int) -> int: + return (i * layers) // packages + + +def _upstream_deps(i: int, packages: int, layers: int, fanin: int) -> list[int]: + """Pick a deterministic set of upstream packages for package ``i``. + + Packages are partitioned into ``layers`` contiguous bands; package ``i`` (in + layer ``k``) depends on ``fanin`` packages drawn from layer ``k-1``. 
This + keeps the dependency-graph **depth bounded by ``layers``** (a realistic, + modular monorepo) instead of forming one long chain — important because + bazel-diff hashes transitively with a per-level dependency-path set, so an + N-deep chain costs O(N^2) memory and an N-frame recursion. + + Always acyclic (deps live in a strictly-lower layer) and deterministic. + """ + k = _layer_of(i, packages, layers) + if k == 0: + return [] + lo = ((k - 1) * packages) // layers # first index of previous layer + hi = (k * packages) // layers # first index of this layer (exclusive bound) + span = hi - lo + if span <= 0: + return [] + deps = set() + for j in range(fanin): + deps.add(lo + (i * 2654435761 + j * 40503) % span) + return sorted(deps) + + +def _write(path: Path, content: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content) + + +def gen_workspace( + out: Path, + packages: int, + targets_per_package: int, + sources_per_package: int, + fanin: int, + layers: int, + bazel_version: str, +) -> dict: + out.mkdir(parents=True, exist_ok=True) + + # --- workspace-level files (bzlmod, no external deps) --- + _write(out / ".bazelversion", bazel_version + "\n") + _write( + out / "MODULE.bazel", + 'module(name = "bench", version = "0.0.0")\n', + ) + _write( + out / ".bazelrc", + # Keep analysis honest and reproducible; no remote cache so each cold + # run actually re-analyses. + "common --lockfile_mode=off\n" + "common --color=no\n", + ) + _write(out / "BUILD.bazel", '# root package (intentionally empty)\n') + _write(out / ".gitignore", "/bazel-*\n") + + total_targets = 0 + # Remember a deep source file to mutate for the "change" commit. 
+ deep_source: Path | None = None + + for i in range(packages): + pkg_dir = out / "pkg" / f"p{i:05d}" + rel_pkg = f"//pkg/p{i:05d}" + + # source files + src_labels = [] + for s in range(sources_per_package): + fname = f"src_{s}.txt" + _write(pkg_dir / fname, f"package {i} source {s}\n") + src_labels.append(f'"{fname}"') + + deps = _upstream_deps(i, packages, layers, fanin) + + lines = [ + 'package(default_visibility = ["//visibility:public"])', + "", + ] + + # genrule chain within the package + prev_out_label = None + for t in range(targets_per_package): + name = f"g{t}" + srcs = list(src_labels) + if prev_out_label is not None: + srcs.append(f'"{prev_out_label}"') + elif deps: + # first genrule pulls in upstream packages' final outputs + for d in deps: + srcs.append(f'"//pkg/p{d:05d}:g{targets_per_package - 1}"') + out_file = f"out_{t}.txt" + lines += [ + "genrule(", + f' name = "{name}",', + f" srcs = [{', '.join(srcs)}],", + f' outs = ["{out_file}"],', + ' cmd = "cat $(SRCS) > $@",', + ")", + "", + ] + prev_out_label = name + total_targets += 1 + + # a filegroup aggregating everything (extra graph edges) + lines += [ + "filegroup(", + ' name = "all_srcs",', + f" srcs = [{', '.join(src_labels)}],", + ")", + "", + ] + total_targets += 1 + + _write(pkg_dir / "BUILD.bazel", "\n".join(lines)) + + # remember a file roughly 75% deep to mutate later + if i == int(packages * 0.75): + deep_source = pkg_dir / "src_0.txt" + + if deep_source is None: + deep_source = out / "pkg" / "p00000" / "src_0.txt" + + return { + "workspace": str(out), + "packages": packages, + "rule_targets": total_targets, + "deep_source": str(deep_source.relative_to(out)), + "bazel_version": bazel_version, + } + + +def _git(out: Path, *args: str) -> str: + return subprocess.check_output( + ["git", "-C", str(out), *args], text=True + ).strip() + + +def init_git(out: Path, deep_source_rel: str) -> dict: + if (out / ".git").exists(): + raise SystemExit(f"{out} already has a .git; refusing to re-init") + 
_git(out, "init", "-q") + _git(out, "config", "user.email", "bench@bazel-diff.local") + _git(out, "config", "user.name", "bazel-diff bench") + _git(out, "add", "-A") + _git(out, "commit", "-q", "-m", "base") + base_sha = _git(out, "rev-parse", "HEAD") + + # mutate one deep source file -> minimal, realistic PR-sized change + deep = out / deep_source_rel + with deep.open("a") as f: + f.write("mutated for change commit\n") + _git(out, "add", "-A") + _git(out, "commit", "-q", "-m", "change") + target_sha = _git(out, "rev-parse", "HEAD") + + return {"base_sha": base_sha, "target_sha": target_sha} + + +def main(argv: list[str]) -> int: + p = argparse.ArgumentParser(description=__doc__) + p.add_argument("--out", required=True, help="output workspace directory") + p.add_argument("--packages", type=int, default=2000) + p.add_argument("--targets-per-package", type=int, default=4) + p.add_argument("--sources-per-package", type=int, default=3) + p.add_argument( + "--fanin", + type=int, + default=4, + help="upstream packages each package depends on (from the previous layer)", + ) + p.add_argument( + "--layers", + type=int, + default=40, + help="number of dependency layers; bounds graph depth (and recursion depth)", + ) + p.add_argument("--bazel-version", default=DEFAULT_BAZEL_VERSION) + p.add_argument( + "--git", + action="store_true", + help="git init + create base/change commits", + ) + p.add_argument("--seed", type=int, default=0) + args = p.parse_args(argv) + + random.seed(args.seed) + out = Path(args.out).resolve() + + meta = gen_workspace( + out, + packages=args.packages, + targets_per_package=args.targets_per_package, + sources_per_package=args.sources_per_package, + fanin=args.fanin, + layers=args.layers, + bazel_version=args.bazel_version, + ) + + if args.git: + meta.update(init_git(out, meta["deep_source"])) + + json.dump(meta, sys.stdout, indent=2) + sys.stdout.write("\n") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv[1:])) diff --git 
a/tools/firecracker/bench/run_docker_bench.sh b/tools/firecracker/bench/run_docker_bench.sh new file mode 100755 index 0000000..5373861 --- /dev/null +++ b/tools/firecracker/bench/run_docker_bench.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash +# Host-side driver: stages artifacts, builds the Linux benchmark image, and runs +# the cold-vs-warm benchmark + local-driver orchestrator inside a container. +# +# Prereqs (run from repo root): +# bazel build //cli:bazel-diff_deploy.jar +# (cd tools/firecracker && GOOS=linux GOARCH="$ARCH" go build -o /tmp/bazel-diff-snap-linux .) +# +# Usage: +# tools/firecracker/bench/run_docker_bench.sh [PKGS] [ITERS] +set -euo pipefail + +PKGS=${1:-11500} +ITERS=${2:-2} +ARCH=${ARCH:-arm64} # docker host arch (arm64 on Apple Silicon) +REPO_ROOT=$(cd "$(dirname "$0")/../../.." && pwd) +BENCH_DIR="$REPO_ROOT/tools/firecracker/bench" + +JAR="$REPO_ROOT/bazel-bin/cli/bazel-diff_deploy.jar" +SNAP="${SNAP:-/tmp/bazel-diff-snap-linux-$ARCH}" +[ -f "$JAR" ] || { echo "missing $JAR — run: bazel build //cli:bazel-diff_deploy.jar"; exit 1; } +[ -f "$SNAP" ] || { echo "missing $SNAP — cross-compile the go binary first"; exit 1; } + +STAGE=$(mktemp -d) +trap 'rm -rf "$STAGE"' EXIT +cp "$JAR" "$STAGE/bazel-diff.jar" +cp "$SNAP" "$STAGE/bazel-diff-snap" +cp "$BENCH_DIR/Dockerfile" "$BENCH_DIR/gen_project.py" \ + "$BENCH_DIR/bench.py" "$BENCH_DIR/run_in_container.sh" "$STAGE/" + +RESULTS=${RESULTS:-"$REPO_ROOT/.bench-results"} +mkdir -p "$RESULTS" + +echo "=== building image (arch=$ARCH) ===" +docker build --build-arg BAZELISK_ARCH="$ARCH" -t bazel-diff-bench "$STAGE" + +echo "=== running benchmark: PKGS=$PKGS ITERS=$ITERS ===" +docker run --rm \ + -e PKGS="$PKGS" -e ITERS="$ITERS" \ + -v "$RESULTS:/results" \ + bazel-diff-bench + +echo "=== results in $RESULTS ===" +cat "$RESULTS/target_count.txt" 2>/dev/null || true +cat "$RESULTS/report.json" 2>/dev/null || true diff --git a/tools/firecracker/bench/run_in_container.sh 
b/tools/firecracker/bench/run_in_container.sh new file mode 100755 index 0000000..64e35ce --- /dev/null +++ b/tools/firecracker/bench/run_in_container.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash +# Entrypoint executed inside the Linux benchmark container. +# +# 1. generates a large Bazel project (PKGS packages, two git revisions) +# 2. pre-warms bazel (download toolchain + fetch BCR deps + populate repo cache) +# so those one-time costs are NOT counted as analysis time +# 3. runs the cold-vs-warm benchmark (the addressable snapshot win) +# 4. runs the local-driver orchestrator record/consume end-to-end +# +# Outputs land in /results (bind-mounted from the host). +set -euo pipefail + +PKGS=${PKGS:-11500} +TPP=${TPP:-4} +ITERS=${ITERS:-2} +WS=/work/proj +RESULTS=/results +mkdir -p "$RESULTS" + +echo "=== generating project: ${PKGS} packages x ${TPP} targets/pkg ===" +python3 /opt/bench/gen_project.py --out "$WS" --packages "$PKGS" \ + --targets-per-package "$TPP" --git | tee "$RESULTS/gen.json" + +BASE=$(python3 -c "import json;print(json.load(open('$RESULTS/gen.json'))['base_sha'])") +TARGET=$(python3 -c "import json;print(json.load(open('$RESULTS/gen.json'))['target_sha'])") + +echo "=== pre-warm bazel (toolchain + BCR fetch + repo cache) ===" +( cd "$WS" && bazel version >/dev/null ) +TGTS=$( cd "$WS" && bazel query 'deps(//...:all-targets)' 2>/dev/null | wc -l ) +echo "TARGET_COUNT=$TGTS" | tee "$RESULTS/target_count.txt" + +echo "=== cold-vs-warm benchmark (${ITERS} iters) ===" +python3 /opt/bench/bench.py \ + --workspace "$WS" --base-sha "$BASE" --target-sha "$TARGET" \ + --bazel "$(command -v bazel)" --bazel-diff /usr/local/bin/bazel-diff \ + --iterations "$ITERS" --json "$RESULTS/report.json" + +echo "=== orchestrator: local driver record + consume ===" +bazel-diff-snap record --workspace "$WS" --base-sha "$BASE" \ + --store /work/store --bazel "$(command -v bazel)" --bazel-diff bazel-diff +bazel-diff-snap consume --workspace "$WS" --target-sha "$TARGET" \ + 
--store /work/store --out "$RESULTS/impacted.txt" \ + --bazel "$(command -v bazel)" --bazel-diff bazel-diff +echo "impacted_targets=$(wc -l < "$RESULTS/impacted.txt")" | tee "$RESULTS/impacted_count.txt" + +echo "=== done; results in $RESULTS ===" diff --git a/tools/firecracker/bench/setup_tap.sh b/tools/firecracker/bench/setup_tap.sh new file mode 100755 index 0000000..7c0b676 --- /dev/null +++ b/tools/firecracker/bench/setup_tap.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash +# Set up the host TAP that backs the guest NIC for the firecracker driver. +# +# The orchestrator (`bazel-diff-snap --driver firecracker`) is deliberately +# privilege-free: it references the TAP by name and checks it exists, but never +# creates it (that needs CAP_NET_ADMIN). This script is the operator/CI step that +# creates it. Run it once per host before record/consume; it is idempotent. +# +# sudo tools/firecracker/bench/setup_tap.sh [TAP] [HOST_IP] [MASK_BITS] +# +# Defaults match the bazel-diff-snap defaults and the guest image's +# fcnet-setup.sh MAC->IP convention (MAC 06:00:AC:10:00:02 => guest 172.16.0.2): +# TAP=fc-tap0 HOST_IP=172.16.0.1 MASK_BITS=30 (guest is .2 on the /30) +set -euo pipefail + +TAP="${1:-fc-tap0}" +HOST_IP="${2:-172.16.0.1}" +MASK_BITS="${3:-30}" +USER_NAME="${SUDO_USER:-$(id -un)}" + +if [[ $EUID -ne 0 ]]; then + echo "must run as root (TAP creation needs CAP_NET_ADMIN): sudo $0 $*" >&2 + exit 1 +fi + +# Idempotent: recreate cleanly so re-runs don't accumulate stale addresses. +if ip link show "$TAP" >/dev/null 2>&1; then + ip link del "$TAP" +fi +ip tuntap add dev "$TAP" mode tap user "$USER_NAME" +ip addr add "${HOST_IP}/${MASK_BITS}" dev "$TAP" +ip link set "$TAP" up + +# Allow the guest to reach the outside world if the host forwards (optional; +# harmless if the guest needs no egress). Not enabled by default. 
+echo "TAP $TAP up: host ${HOST_IP}/${MASK_BITS}, owned by $USER_NAME" +ip addr show "$TAP" diff --git a/tools/firecracker/driver.go b/tools/firecracker/driver.go new file mode 100644 index 0000000..4b29a0c --- /dev/null +++ b/tools/firecracker/driver.go @@ -0,0 +1,103 @@ +package main + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" +) + +// recordRequest is everything a driver needs to produce a snapshot for a base SHA. +type recordRequest struct { + Workspace string + BaseSHA string + Bazel string + BazelDiff string + Flags []string + Entry entry +} + +// consumeRequest is everything a driver needs to compute impacted targets for a +// target SHA against a recorded snapshot. +type consumeRequest struct { + Workspace string + TargetSHA string + Bazel string + BazelDiff string + Flags []string + Entry entry + Out string +} + +// driver abstracts *where* the warm Bazel server lives. The firecracker driver +// runs warmup/consume inside a microVM and snapshots it; the local driver runs +// them directly on the host (no snapshotting — useful for testing the +// orchestration and as a portable cold/warm proxy). +type driver interface { + name() string + record(recordRequest) error + consume(consumeRequest) error +} + +// localDriver runs everything on the host. It does NOT snapshot — record simply +// runs `bazel-diff warmup` to bake base_hashes.json + fingerprint.json into the +// store entry, and consume runs generate-hashes + get-impacted-targets directly. +// This is the path that runs anywhere (incl. macOS) and underpins the unit +// tests and the cold/warm proxy benchmark. +type localDriver struct { + // runner lets tests stub command execution. nil => real exec. + runner func(name string, args ...string) error +} + +func (localDriver) name() string { return "local" } + +func (d localDriver) exec(name string, args ...string) error { + if d.runner != nil { + return d.runner(name, args...) + } + cmd := exec.Command(name, args...) 
+ cmd.Stdout = os.Stderr // diagnostics go to stderr; stdout stays clean + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + return fmt.Errorf("%s %s: %w", name, strings.Join(args, " "), err) + } + return nil +} + +func (d localDriver) record(r recordRequest) error { + git := gitClient{workspace: r.Workspace} + if err := git.checkout(r.BaseSHA); err != nil { + return err + } + args := []string{ + "warmup", + "-w", r.Workspace, + "-b", r.Bazel, + "--base-hashes", r.Entry.baseHashes(), + "--fingerprint-output", filepath.Join(r.Entry.Dir, fingerprintName), + } + args = append(args, r.Flags...) + return d.exec(r.BazelDiff, args...) +} + +func (d localDriver) consume(r consumeRequest) error { + git := gitClient{workspace: r.Workspace} + if err := git.checkout(r.TargetSHA); err != nil { + return err + } + targetHashes := filepath.Join(filepath.Dir(r.Out), "target_hashes.json") + genArgs := []string{"generate-hashes", "-w", r.Workspace, "-b", r.Bazel, targetHashes} + genArgs = append(genArgs, r.Flags...) + if err := d.exec(r.BazelDiff, genArgs...); err != nil { + return err + } + return d.exec(r.BazelDiff, + "get-impacted-targets", + "-w", r.Workspace, + "-b", r.Bazel, + "-sh", r.Entry.baseHashes(), + "-fh", targetHashes, + "-o", r.Out, + ) +} diff --git a/tools/firecracker/driver_firecracker.go b/tools/firecracker/driver_firecracker.go new file mode 100644 index 0000000..2297d31 --- /dev/null +++ b/tools/firecracker/driver_firecracker.go @@ -0,0 +1,355 @@ +package main + +import ( + "fmt" + "io" + "os" + "os/exec" + "path/filepath" + "strings" + "time" +) + +// guestRunner executes commands and moves files in/out of the running microVM. +// Implementations are infrastructure-specific (SSH over a TAP/vsock network); +// abstracting them keeps fcDriver testable and lets operators swap transports. 
+type guestRunner interface { + exec(command string) error + copyOut(guestPath, hostPath string) error +} + +// fcDriver records and consumes snapshots using Firecracker microVMs. +// +// REQUIRES Linux + KVM (/dev/kvm) on the host, a prepared kernel + rootfs base +// image (bazel + JDK + git + the bazel-diff binary + the workspace baked in), +// and a configured guestRunner. None of that is exercisable on macOS; this is +// the path that runs on the self-hosted CI host (RFC §6). +type fcDriver struct { + firecrackerBin string // path to the `firecracker` binary + socketPath string // API unix socket + kernelImage string // guest kernel + vcpus int + memMib int + guest guestRunner + net netConfig + + // guestSnapDir is where warmup writes base_hashes.json / fingerprint.json + // inside the guest (matches the CLI defaults under /snap). + guestSnapDir string + + // pollInterval is how often waitForGuest retries the guest; 0 => 2s default. + // Overridable so tests don't sleep seconds. + pollInterval time.Duration +} + +// netConfig is the guest networking the driver attaches: a virtio-net device +// backed by a host TAP, with a static point-to-point address pair so the host +// can ssh in without DHCP. The TAP is owned by CI/operator setup (see +// bench/setup_tap.sh) — the driver only references it by name and bakes the +// guest-side address into the kernel `ip=` boot arg. Zero value => no network +// (the device-less boot used by the boot smoke test). +type netConfig struct { + tapDevice string // host TAP name, e.g. "fc-tap0" + guestIP string // guest address, e.g. "172.16.0.2" + hostIP string // host/gateway address, e.g. "172.16.0.1" + netmask string // e.g. 
"255.255.255.252" + guestMAC string // stable MAC so the guest's NIC name survives restore +} + +func (n netConfig) enabled() bool { return n.tapDevice != "" } + +// bootArg renders the kernel `ip=` directive that statically configures eth0 in +// the guest at boot, so it is reachable over the TAP with no DHCP server. +// Format: ip=::::::. +func (n netConfig) bootArg() string { + if !n.enabled() { + return "" + } + return fmt.Sprintf("ip=%s::%s:%s::eth0:off", n.guestIP, n.hostIP, n.netmask) +} + +func (fcDriver) name() string { return "firecracker" } + +// bootArgs is the guest kernel command line. It boots from the root virtio-block +// device (the rootfs drive is added as the first /dev/vda; Firecracker does not +// synthesize a `root=` arg, so we must pass it). When networking is configured +// the `ip=` directive is appended so the restored guest comes up addressable. +// Because boot args are captured in the snapshot, the address baked here is what +// consume-time ssh must target. +func (d fcDriver) bootArgs() string { + args := "console=ttyS0 reboot=k panic=1 pci=off root=/dev/vda rw" + if na := d.net.bootArg(); na != "" { + args += " " + na + } + return args +} + +// boot launches the firecracker process against socketPath and returns it so the +// caller can tear it down. The process is detached from our stdout. +func (d fcDriver) boot() (*exec.Cmd, error) { + _ = os.Remove(d.socketPath) + cmd := exec.Command(d.firecrackerBin, "--api-sock", d.socketPath) + cmd.Stdout = os.Stderr + cmd.Stderr = os.Stderr + if err := cmd.Start(); err != nil { + return nil, fmt.Errorf("starting firecracker: %w", err) + } + // Wait for the API socket to appear. 
+ deadline := time.Now().Add(10 * time.Second) + for time.Now().Before(deadline) { + if _, err := os.Stat(d.socketPath); err == nil { + return cmd, nil + } + time.Sleep(50 * time.Millisecond) + } + _ = cmd.Process.Kill() + return nil, fmt.Errorf("firecracker API socket %s never appeared", d.socketPath) +} + +func teardown(cmd *exec.Cmd) { + if cmd != nil && cmd.Process != nil { + _ = cmd.Process.Kill() + _, _ = cmd.Process.Wait() + } +} + +// record boots a fresh VM on a read-write copy of the rootfs, warms it, copies +// the base hashes + fingerprint out to the store entry, then pauses and writes a +// full snapshot. The frozen rootfs becomes the consume-time backing image. +func (d fcDriver) record(r recordRequest) error { + rootfs := r.Entry.rootfs() + if err := copyFile(d.baseRootfs(), rootfs); err != nil { + return fmt.Errorf("preparing rootfs: %w", err) + } + + proc, err := d.boot() + if err != nil { + return err + } + defer teardown(proc) + c := newFCClient(d.socketPath) + + if err := c.setMachineConfig(machineConfig{VCPUCount: d.vcpus, MemSizeMib: d.memMib}); err != nil { + return err + } + if err := c.setBootSource(bootSource{ + KernelImagePath: d.kernelImage, + BootArgs: d.bootArgs(), + }); err != nil { + return err + } + if err := c.addDrive(drive{ + DriveID: "rootfs", PathOnHost: rootfs, IsRootDevice: true, IsReadOnly: false, + }); err != nil { + return err + } + // Attach the guest NIC before boot. Net devices cannot be hot-added, and the + // device config is captured in the snapshot so consume reconnects to a TAP of + // the same name. Skipped only by the device-less boot smoke test. + if d.net.enabled() { + if err := c.addNetworkInterface(networkInterface{ + IfaceID: "eth0", HostDevName: d.net.tapDevice, GuestMAC: d.net.guestMAC, + }); err != nil { + return err + } + } + if err := c.instanceStart(); err != nil { + return err + } + // Wait for the freshly-booted guest's sshd before issuing commands. 
+ if err := d.waitForGuest(120 * time.Second); err != nil { + return fmt.Errorf("guest unreachable after boot: %w", err) + } + + if err := d.guest.exec(d.warmupCommand(r)); err != nil { + return fmt.Errorf("guest warmup: %w", err) + } + // Copy the artifacts out for host-side bookkeeping + cold-fallback reuse. + if err := d.guest.copyOut(filepath.Join(d.guestSnapDir, baseHashesName), r.Entry.baseHashes()); err != nil { + return err + } + _ = d.guest.copyOut(filepath.Join(d.guestSnapDir, fingerprintName), + filepath.Join(r.Entry.Dir, fingerprintName)) + + if err := c.pause(); err != nil { + return err + } + if err := c.createSnapshot(snapshotCreate{ + SnapshotType: "Full", + SnapshotPath: r.Entry.vmstate(), + MemFilePath: r.Entry.memFile(), + }); err != nil { + return err + } + return nil +} + +// consume restores the snapshot on a copy-on-write overlay, checks out the +// target revision in the warm guest, and runs generate-hashes + +// get-impacted-targets, copying the impacted list out to r.Out. +func (d fcDriver) consume(r consumeRequest) error { + // On restore Firecracker reconnects the snapshotted virtio-net device to a + // host TAP with the same name it had at record time. If that TAP is missing + // the restore fails, so check the precondition up front with a clear error. 
+ if d.net.enabled() { + if err := ensureTapExists(d.net.tapDevice); err != nil { + return err + } + } + + overlay := r.Out + ".rootfs.overlay" + if err := copyFile(r.Entry.rootfs(), overlay); err != nil { + return fmt.Errorf("preparing COW overlay: %w", err) + } + defer os.Remove(overlay) + + proc, err := d.boot() + if err != nil { + return err + } + defer teardown(proc) + c := newFCClient(d.socketPath) + + if err := c.loadSnapshot(snapshotLoad{ + SnapshotPath: r.Entry.vmstate(), + MemBackend: memBackend{BackendType: "File", BackendPath: r.Entry.memFile()}, + ResumeVM: true, + }); err != nil { + return fmt.Errorf("loading snapshot: %w", err) + } + // The resumed guest's sshd may take a moment to service connections. + if err := d.waitForGuest(60 * time.Second); err != nil { + return fmt.Errorf("guest unreachable after restore: %w", err) + } + + if err := d.guest.exec(d.consumeScript(r)); err != nil { + return fmt.Errorf("guest consume: %w", err) + } + return d.guest.copyOut(d.guestImpactedPath(), r.Out) +} + +// ensureTapExists verifies the host TAP is present before a restore. TAP setup +// needs CAP_NET_ADMIN and is the operator/CI's responsibility (bench/setup_tap.sh), +// so the driver only checks for it rather than creating it — keeping the tool +// privilege-free. A network device appears as /sys/class/net/. +func ensureTapExists(tap string) error { + if _, err := os.Stat(filepath.Join("/sys/class/net", tap)); err != nil { + return fmt.Errorf("host TAP %q not found (set it up first, e.g. bench/setup_tap.sh): %w", tap, err) + } + return nil +} + +// waitForGuest polls the guest over the configured transport until a trivial +// command succeeds, so callers don't race the guest's boot (record) or its +// snapshot-resume (consume) before sshd is serving connections. 
+func (d fcDriver) waitForGuest(timeout time.Duration) error { + interval := d.pollInterval + if interval == 0 { + interval = 2 * time.Second + } + deadline := time.Now().Add(timeout) + var last error + for time.Now().Before(deadline) { + if last = d.guest.exec("true"); last == nil { + return nil + } + time.Sleep(interval) + } + return fmt.Errorf("not reachable within %s: %w", timeout, last) +} + +// warmupCommand is the shell run in the guest at record time: check out the base +// revision, then `bazel-diff warmup` to bake base_hashes.json + fingerprint.json. +// The checkout mirrors localDriver.record (and consume's target checkout) so the +// baked hashes are for the base SHA regardless of the image's baked revision. +func (d fcDriver) warmupCommand(r recordRequest) string { + return strings.Join([]string{ + fmt.Sprintf("git -C %s checkout --force --quiet %s", r.Workspace, r.BaseSHA), + fmt.Sprintf("bazel-diff warmup -w %s -b %s --base-hashes %s --fingerprint-output %s %s", + r.Workspace, r.Bazel, + filepath.Join(d.guestSnapDir, baseHashesName), + filepath.Join(d.guestSnapDir, fingerprintName), + strings.Join(r.Flags, " ")), + }, " && ") +} + +func (d fcDriver) guestImpactedPath() string { + return filepath.Join(d.guestSnapDir, "impacted.txt") +} + +// consumeScript is the shell run in the restored guest: check out the target +// revision, generate-hashes for it, then get-impacted-targets against the baked +// base hashes, writing the impacted list to the in-guest path copied out after. 
+func (d fcDriver) consumeScript(r consumeRequest) string { + guestTarget := filepath.Join(d.guestSnapDir, "target_hashes.json") + return strings.Join([]string{ + fmt.Sprintf("git -C %s checkout --force --quiet %s", r.Workspace, r.TargetSHA), + fmt.Sprintf("bazel-diff generate-hashes -w %s -b %s %s %s", + r.Workspace, r.Bazel, guestTarget, strings.Join(r.Flags, " ")), + fmt.Sprintf("bazel-diff get-impacted-targets -w %s -b %s -sh %s -fh %s -o %s", + r.Workspace, r.Bazel, + filepath.Join(d.guestSnapDir, baseHashesName), guestTarget, d.guestImpactedPath()), + }, " && ") +} + +func (d fcDriver) baseRootfs() string { + // The base rootfs lives next to the kernel by convention. + return filepath.Join(filepath.Dir(d.kernelImage), "rootfs.base.ext4") +} + +func copyFile(src, dst string) error { + in, err := os.Open(src) + if err != nil { + return err + } + defer in.Close() + if err := os.MkdirAll(filepath.Dir(dst), 0o755); err != nil { + return err + } + out, err := os.Create(dst) + if err != nil { + return err + } + defer out.Close() + if _, err := io.Copy(out, in); err != nil { + return err + } + return out.Close() +} + +// sshGuest is the default guestRunner: it shells out to `ssh`/`scp`. Operators +// configure the address + identity for their CI network. +type sshGuest struct { + addr string // user@host + identity string // path to ssh private key, optional + sshOpts []string +} + +func (g sshGuest) base(tool string) []string { + args := []string{} + if g.identity != "" { + args = append(args, "-i", g.identity) + } + args = append(args, g.sshOpts...) + _ = tool + return args +} + +func (g sshGuest) exec(command string) error { + args := append(g.base("ssh"), g.addr, command) + cmd := exec.Command("ssh", args...) 
+ cmd.Stdout = os.Stderr + cmd.Stderr = os.Stderr + return cmd.Run() +} + +func (g sshGuest) copyOut(guestPath, hostPath string) error { + if err := os.MkdirAll(filepath.Dir(hostPath), 0o755); err != nil { + return err + } + args := append(g.base("scp"), g.addr+":"+guestPath, hostPath) + cmd := exec.Command("scp", args...) + cmd.Stdout = os.Stderr + cmd.Stderr = os.Stderr + return cmd.Run() +} diff --git a/tools/firecracker/driver_firecracker_test.go b/tools/firecracker/driver_firecracker_test.go new file mode 100644 index 0000000..3cd4249 --- /dev/null +++ b/tools/firecracker/driver_firecracker_test.go @@ -0,0 +1,214 @@ +package main + +import ( + "os" + "path/filepath" + "strings" + "testing" + "time" +) + +func TestNetConfigBootArg(t *testing.T) { + // Disabled (zero value) => no ip= directive, and bootArgs is the bare cmdline. + var off netConfig + if off.enabled() { + t.Fatal("zero-value netConfig should be disabled") + } + if off.bootArg() != "" { + t.Fatalf("disabled bootArg should be empty, got %q", off.bootArg()) + } + if got := (fcDriver{}).bootArgs(); strings.Contains(got, "ip=") { + t.Fatalf("device-less bootArgs should have no ip=, got %q", got) + } + // Must boot from the rootfs block device (Firecracker doesn't synthesize root=). + if got := (fcDriver{}).bootArgs(); !strings.Contains(got, "root=/dev/vda") { + t.Fatalf("bootArgs must specify the root device, got %q", got) + } + + // Enabled => point-to-point ip= directive baked into the kernel cmdline. 
+ on := netConfig{ + tapDevice: "fc-tap0", + guestIP: "172.16.0.2", + hostIP: "172.16.0.1", + netmask: "255.255.255.252", + guestMAC: "06:00:AC:10:00:02", + } + if !on.enabled() { + t.Fatal("configured netConfig should be enabled") + } + want := "ip=172.16.0.2::172.16.0.1:255.255.255.252::eth0:off" + if on.bootArg() != want { + t.Fatalf("bootArg: want %q, got %q", want, on.bootArg()) + } + full := fcDriver{net: on}.bootArgs() + if !strings.Contains(full, "console=ttyS0") || !strings.Contains(full, "root=/dev/vda") || !strings.HasSuffix(full, want) { + t.Fatalf("bootArgs should keep base flags and append ip=, got %q", full) + } +} + +func TestEnsureTapExists(t *testing.T) { + // A name that cannot exist under /sys/class/net should fail with guidance. + err := ensureTapExists("definitely-not-a-real-tap-xyz") + if err == nil { + t.Fatal("expected error for missing TAP") + } + if !strings.Contains(err.Error(), "not found") { + t.Fatalf("error should mention the missing TAP, got %v", err) + } + + // "lo" exists as /sys/class/net/lo on Linux, so the check passes for it. + // Only assert that on a host that actually has the sysfs entry (not macOS). + if _, statErr := os.Stat("/sys/class/net/lo"); statErr == nil { + if err := ensureTapExists("lo"); err != nil { + t.Fatalf("ensureTapExists(lo) should pass when /sys/class/net/lo exists, got %v", err) + } + } +} + +// fakeGuest is a guestRunner that fails its first failUntil exec() calls, then +// succeeds, recording every command. copyOut always succeeds and is recorded. 
+type fakeGuest struct { + failUntil int + calls int + execCmds []string + copies [][2]string + execErr error +} + +func (g *fakeGuest) exec(command string) error { + g.calls++ + g.execCmds = append(g.execCmds, command) + if g.calls <= g.failUntil { + if g.execErr != nil { + return g.execErr + } + return errFakeGuest + } + return nil +} + +func (g *fakeGuest) copyOut(guestPath, hostPath string) error { + g.copies = append(g.copies, [2]string{guestPath, hostPath}) + return nil +} + +var errFakeGuest = &fakeErr{"guest not ready"} + +type fakeErr struct{ s string } + +func (e *fakeErr) Error() string { return e.s } + +func TestWaitForGuestRetriesThenSucceeds(t *testing.T) { + g := &fakeGuest{failUntil: 2} + d := fcDriver{guest: g, pollInterval: time.Millisecond} + if err := d.waitForGuest(5 * time.Second); err != nil { + t.Fatalf("expected success after retries, got %v", err) + } + if g.calls != 3 { + t.Fatalf("expected 3 attempts (2 fail + 1 ok), got %d", g.calls) + } +} + +func TestWaitForGuestTimesOut(t *testing.T) { + g := &fakeGuest{failUntil: 1 << 30} // always fail + d := fcDriver{guest: g, pollInterval: time.Millisecond} + err := d.waitForGuest(20 * time.Millisecond) + if err == nil { + t.Fatal("expected timeout error") + } + if !strings.Contains(err.Error(), "not reachable") { + t.Fatalf("error should mention unreachable, got %v", err) + } +} + +func TestWarmupCommand(t *testing.T) { + d := fcDriver{guestSnapDir: "/snap"} + cmd := d.warmupCommand(recordRequest{ + Workspace: "/work", BaseSHA: "abc", Bazel: "bazel", Flags: []string{"--useCquery"}, + }) + for _, want := range []string{ + "git -C /work checkout --force --quiet abc", + "bazel-diff warmup -w /work -b bazel", + "--base-hashes /snap/base_hashes.json", + "--fingerprint-output /snap/fingerprint.json", + "--useCquery", + " && ", + } { + if !strings.Contains(cmd, want) { + t.Fatalf("warmupCommand missing %q in:\n%s", want, cmd) + } + } +} + +func TestConsumeScript(t *testing.T) { + d := 
fcDriver{guestSnapDir: "/snap"} + s := d.consumeScript(consumeRequest{Workspace: "/work", TargetSHA: "def", Bazel: "bazel"}) + for _, want := range []string{ + "git -C /work checkout --force --quiet def", + "bazel-diff generate-hashes -w /work -b bazel /snap/target_hashes.json", + "get-impacted-targets -w /work -b bazel -sh /snap/base_hashes.json -fh /snap/target_hashes.json -o /snap/impacted.txt", + } { + if !strings.Contains(s, want) { + t.Fatalf("consumeScript missing %q in:\n%s", want, s) + } + } +} + +func TestBaseRootfs(t *testing.T) { + d := fcDriver{kernelImage: "/img/vmlinux-6.1.128"} + if got := d.baseRootfs(); got != "/img/rootfs.base.ext4" { + t.Fatalf("baseRootfs: want /img/rootfs.base.ext4, got %s", got) + } +} + +func TestCopyFile(t *testing.T) { + dir := t.TempDir() + src := dir + "/src" + if err := os.WriteFile(src, []byte("hello"), 0o644); err != nil { + t.Fatal(err) + } + dst := dir + "/nested/dir/dst" // parent dirs must be created + if err := copyFile(src, dst); err != nil { + t.Fatal(err) + } + b, err := os.ReadFile(dst) + if err != nil || string(b) != "hello" { + t.Fatalf("copyFile result wrong: %q err=%v", string(b), err) + } + if err := copyFile(dir+"/missing", dst); err == nil { + t.Fatal("copyFile of a missing source should error") + } +} + +func TestSSHGuestBaseArgs(t *testing.T) { + g := sshGuest{addr: "root@h", identity: "/k", sshOpts: []string{"-o", "BatchMode=yes"}} + args := g.base("ssh") + joined := strings.Join(args, " ") + if !strings.Contains(joined, "-i /k") || !strings.Contains(joined, "BatchMode=yes") { + t.Fatalf("base args missing identity/opts: %v", args) + } + // No identity => no -i flag. 
+ if strings.Contains(strings.Join(sshGuest{addr: "x"}.base("ssh"), " "), "-i") { + t.Fatal("base args should omit -i when identity is empty") + } +} + +func TestTeardownNilSafe(t *testing.T) { + teardown(nil) // must not panic on a nil cmd / nil process +} + +func TestFCDriverName(t *testing.T) { + if (fcDriver{}).name() != "firecracker" { + t.Fatal("fcDriver.name() should be 'firecracker'") + } +} + +func TestBootMissingBinary(t *testing.T) { + d := fcDriver{ + firecrackerBin: "/definitely/not/a/firecracker/binary", + socketPath: filepath.Join(t.TempDir(), "fc.sock"), + } + if _, err := d.boot(); err == nil { + t.Fatal("boot should error when the firecracker binary is missing") + } +} diff --git a/tools/firecracker/driver_test.go b/tools/firecracker/driver_test.go new file mode 100644 index 0000000..20daa13 --- /dev/null +++ b/tools/firecracker/driver_test.go @@ -0,0 +1,93 @@ +package main + +import ( + "os" + "path/filepath" + "testing" +) + +func TestLocalDriverRecordInvokesWarmup(t *testing.T) { + ws, shas := makeLinearRepo(t, 2) + store := t.TempDir() + e, err := newEntry(store, "fp", shas[0]) + if err != nil { + t.Fatal(err) + } + + var calls [][]string + d := localDriver{runner: func(name string, args ...string) error { + calls = append(calls, append([]string{name}, args...)) + return nil + }} + err = d.record(recordRequest{ + Workspace: ws, BaseSHA: shas[0], Bazel: "bazel", BazelDiff: "bazel-diff", + Flags: []string{"--useCquery"}, Entry: e, + }) + if err != nil { + t.Fatal(err) + } + if len(calls) != 1 { + t.Fatalf("want 1 warmup call, got %d: %v", len(calls), calls) + } + got := calls[0] + assertContains(t, got, "warmup") + assertContains(t, got, "--base-hashes") + assertContains(t, got, e.baseHashes()) + assertContains(t, got, "--fingerprint-output") + assertContains(t, got, "--useCquery") +} + +func TestLocalDriverConsumeChain(t *testing.T) { + ws, shas := makeLinearRepo(t, 2) + store := t.TempDir() + e, _ := newEntry(store, "fp", shas[0]) + out := 
filepath.Join(t.TempDir(), "impacted.txt") + + var calls [][]string + d := localDriver{runner: func(name string, args ...string) error { + calls = append(calls, append([]string{name}, args...)) + return nil + }} + err := d.consume(consumeRequest{ + Workspace: ws, TargetSHA: shas[1], Bazel: "bazel", BazelDiff: "bazel-diff", + Entry: e, Out: out, + }) + if err != nil { + t.Fatal(err) + } + if len(calls) != 2 { + t.Fatalf("want generate-hashes + get-impacted-targets (2 calls), got %d: %v", len(calls), calls) + } + assertContains(t, calls[0], "generate-hashes") + assertContains(t, calls[1], "get-impacted-targets") + assertContains(t, calls[1], "-sh") + assertContains(t, calls[1], e.baseHashes()) + assertContains(t, calls[1], out) +} + +func TestComputeFingerprintParsesOutput(t *testing.T) { + dir := t.TempDir() + script := filepath.Join(dir, "fake-bazel-diff") + // A fake bazel-diff that prints a fingerprint JSON regardless of args. + body := "#!/bin/sh\ncat <<'EOF'\n{\"fingerprint\":\"abc123\",\"flags\":{\"useCquery\":\"false\"}}\nEOF\n" + if err := os.WriteFile(script, []byte(body), 0o755); err != nil { + t.Fatal(err) + } + fp, err := computeFingerprint(script, dir, "bazel", nil) + if err != nil { + t.Fatal(err) + } + if fp != "abc123" { + t.Fatalf("want abc123, got %q", fp) + } +} + +func assertContains(t *testing.T, hay []string, needle string) { + t.Helper() + for _, s := range hay { + if s == needle { + return + } + } + t.Fatalf("expected %q in %v", needle, hay) +} diff --git a/tools/firecracker/fc_integration_test.go b/tools/firecracker/fc_integration_test.go new file mode 100644 index 0000000..726ba98 --- /dev/null +++ b/tools/firecracker/fc_integration_test.go @@ -0,0 +1,127 @@ +//go:build fcintegration + +// Real-microVM end-to-end canary for the firecracker driver (RFC §5.3). 
+// +// Unlike the rest of the suite (pure logic + a fake API socket), this test boots +// an actual Firecracker microVM, so it is gated behind the `fcintegration` build +// tag and a set of env vars. It runs on a Linux + /dev/kvm host with a guest +// image built by bench/build_guest_image.sh and a TAP from bench/setup_tap.sh: +// +// FC_BIN path to the firecracker binary +// FC_KERNEL guest kernel (rootfs.base.ext4 must sit beside it) +// FC_STORE empty dir for the snapshot store +// FC_GUEST_ADDR root@172.16.0.2 +// FC_GUEST_KEY ssh identity trusted by the guest +// FC_TAP host TAP name (e.g. fc-tap0) +// FC_WORKSPACE in-guest workspace path (e.g. /work) +// FC_BASE_SHA / FC_TARGET_SHA revisions baked into the guest repo +// +// It exercises the exact code the unit tests cannot: fcDriver.record (boot → +// NIC over TAP → warmup → pause → snapshot) and fcDriver.consume (restore → +// resume → git checkout → generate-hashes → get-impacted-targets → scp out), +// then asserts a non-empty impacted set. Build/run: +// +// go test -tags fcintegration -run TestFirecrackerRecordConsume -v ./... 
+package main + +import ( + "os" + "path/filepath" + "strconv" + "testing" +) + +func envOrSkip(t *testing.T, key string) string { + t.Helper() + v := os.Getenv(key) + if v == "" { + t.Skipf("set %s to run the firecracker integration canary", key) + } + return v +} + +func TestFirecrackerRecordConsume(t *testing.T) { + fcBin := envOrSkip(t, "FC_BIN") + kernel := envOrSkip(t, "FC_KERNEL") + store := envOrSkip(t, "FC_STORE") + guestAddr := envOrSkip(t, "FC_GUEST_ADDR") + guestKey := envOrSkip(t, "FC_GUEST_KEY") + tap := envOrSkip(t, "FC_TAP") + workspace := envOrSkip(t, "FC_WORKSPACE") + baseSHA := envOrSkip(t, "FC_BASE_SHA") + targetSHA := envOrSkip(t, "FC_TARGET_SHA") + + bazel := getenvDefault("FC_BAZEL", "bazel") + bazelDiff := getenvDefault("FC_BAZEL_DIFF", "bazel-diff") + + d := fcDriver{ + firecrackerBin: fcBin, + socketPath: filepath.Join(t.TempDir(), "fc.sock"), + kernelImage: kernel, + vcpus: atoiDefault("FC_VCPUS", 2), + memMib: atoiDefault("FC_MEM_MIB", 2048), + guestSnapDir: "/snap", + guest: sshGuest{addr: guestAddr, identity: guestKey, sshOpts: noHostKeyChecking}, + net: netConfig{ + tapDevice: tap, + guestIP: "172.16.0.2", + hostIP: "172.16.0.1", + netmask: "255.255.255.252", + guestMAC: "06:00:AC:10:00:02", + }, + } + + e, err := newEntry(store, "itfp", baseSHA) + if err != nil { + t.Fatal(err) + } + + if err := d.record(recordRequest{ + Workspace: workspace, BaseSHA: baseSHA, Bazel: bazel, BazelDiff: bazelDiff, Entry: e, + }); err != nil { + t.Fatalf("record: %v", err) + } + if _, err := os.Stat(e.baseHashes()); err != nil { + t.Fatalf("record did not produce base hashes: %v", err) + } + + // FC_OUT lets the workflow capture the impacted set on a stable path so it can + // diff it against the local driver's output (the RFC §5.3 correctness canary). 
+ out := getenvDefault("FC_OUT", filepath.Join(t.TempDir(), "impacted.txt")) + if err := d.consume(consumeRequest{ + Workspace: workspace, TargetSHA: targetSHA, Bazel: bazel, BazelDiff: bazelDiff, Entry: e, Out: out, + }); err != nil { + t.Fatalf("consume: %v", err) + } + b, err := os.ReadFile(out) + if err != nil || len(b) == 0 { + t.Fatalf("consume produced no impacted targets (err=%v, len=%d)", err, len(b)) + } + t.Logf("impacted targets:\n%s", b) +} + +var noHostKeyChecking = []string{ + "-o", "StrictHostKeyChecking=no", + "-o", "UserKnownHostsFile=/dev/null", + "-o", "LogLevel=ERROR", + // Bound per-attempt connect time and never prompt, so the driver's + // readiness polling fails fast and retries instead of hanging. + "-o", "ConnectTimeout=10", + "-o", "BatchMode=yes", +} + +func getenvDefault(key, def string) string { + if v := os.Getenv(key); v != "" { + return v + } + return def +} + +func atoiDefault(key string, def int) int { + if v := os.Getenv(key); v != "" { + if n, err := strconv.Atoi(v); err == nil { + return n + } + } + return def +} diff --git a/tools/firecracker/fcapi.go b/tools/firecracker/fcapi.go new file mode 100644 index 0000000..e0776f8 --- /dev/null +++ b/tools/firecracker/fcapi.go @@ -0,0 +1,152 @@ +package main + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net" + "net/http" + "time" +) + +// fcClient is a minimal Firecracker REST API client speaking HTTP over the +// microVM's unix socket. Only the calls the orchestrator needs are implemented. +// See https://github.com/firecracker-microvm/firecracker/blob/main/src/firecracker/swagger/firecracker.yaml +type fcClient struct { + http *http.Client +} + +// newFCClient dials the Firecracker API unix socket at socketPath. The http +// transport ignores the URL host and always connects to that socket. 
+func newFCClient(socketPath string) *fcClient { + return &fcClient{ + http: &http.Client{ + // Generous: most calls are instant over the unix socket, but + // /snapshot/create and /snapshot/load dump or load the guest's full + // memory to/from disk and legitimately take well over a minute for a + // multi-GB VM. A short timeout aborts the snapshot mid-write. + Timeout: 15 * time.Minute, + Transport: &http.Transport{ + DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) { + var d net.Dialer + return d.DialContext(ctx, "unix", socketPath) + }, + }, + }, + } +} + +func (c *fcClient) do(method, path string, body any) error { + var rdr io.Reader + if body != nil { + b, err := json.Marshal(body) + if err != nil { + return err + } + rdr = bytes.NewReader(b) + } + // Host is ignored (unix socket) but must be a valid URL. + req, err := http.NewRequest(method, "http://localhost"+path, rdr) + if err != nil { + return err + } + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Accept", "application/json") + resp, err := c.http.Do(req) + if err != nil { + return fmt.Errorf("%s %s: %w", method, path, err) + } + defer resp.Body.Close() + if resp.StatusCode >= 300 { + msg, _ := io.ReadAll(resp.Body) + return fmt.Errorf("%s %s: status %d: %s", method, path, resp.StatusCode, string(msg)) + } + return nil +} + +// --- request payloads (subset of the Firecracker API schema) --- + +type machineConfig struct { + VCPUCount int `json:"vcpu_count"` + MemSizeMib int `json:"mem_size_mib"` + SMT bool `json:"smt,omitempty"` +} + +type bootSource struct { + KernelImagePath string `json:"kernel_image_path"` + BootArgs string `json:"boot_args,omitempty"` +} + +type drive struct { + DriveID string `json:"drive_id"` + PathOnHost string `json:"path_on_host"` + IsRootDevice bool `json:"is_root_device"` + IsReadOnly bool `json:"is_read_only"` +} + +// networkInterface attaches a virtio-net device backed by a host TAP. 
The guest +// configures the interface itself (here via the kernel `ip=` boot arg); the host +// side is the pre-existing TAP named HostDevName. On snapshot restore Firecracker +// reconnects the device to a TAP of the same name, so the TAP must exist at both +// record and consume time — see fcDriver. +type networkInterface struct { + IfaceID string `json:"iface_id"` + HostDevName string `json:"host_dev_name"` + GuestMAC string `json:"guest_mac,omitempty"` +} + +type action struct { + ActionType string `json:"action_type"` +} + +type snapshotCreate struct { + SnapshotType string `json:"snapshot_type"` // "Full" or "Diff" + SnapshotPath string `json:"snapshot_path"` + MemFilePath string `json:"mem_file_path"` +} + +type snapshotLoad struct { + SnapshotPath string `json:"snapshot_path"` + MemBackend memBackend `json:"mem_backend"` + EnableDiffSnapshots bool `json:"enable_diff_snapshots,omitempty"` + ResumeVM bool `json:"resume_vm"` +} + +type memBackend struct { + BackendType string `json:"backend_type"` // "File" or "Uffd" + BackendPath string `json:"backend_path"` +} + +type vmState struct { + State string `json:"state"` // "Paused" or "Resumed" +} + +func (c *fcClient) setMachineConfig(cfg machineConfig) error { + return c.do(http.MethodPut, "/machine-config", cfg) +} +func (c *fcClient) setBootSource(b bootSource) error { + return c.do(http.MethodPut, "/boot-source", b) +} +func (c *fcClient) addDrive(d drive) error { + return c.do(http.MethodPut, "/drives/"+d.DriveID, d) +} +func (c *fcClient) addNetworkInterface(n networkInterface) error { + return c.do(http.MethodPut, "/network-interfaces/"+n.IfaceID, n) +} +func (c *fcClient) instanceStart() error { + return c.do(http.MethodPut, "/actions", action{ActionType: "InstanceStart"}) +} +func (c *fcClient) pause() error { + return c.do(http.MethodPatch, "/vm", vmState{State: "Paused"}) +} +func (c *fcClient) resume() error { + return c.do(http.MethodPatch, "/vm", vmState{State: "Resumed"}) +} +func (c *fcClient) 
createSnapshot(s snapshotCreate) error { + return c.do(http.MethodPut, "/snapshot/create", s) +} +func (c *fcClient) loadSnapshot(s snapshotLoad) error { + return c.do(http.MethodPut, "/snapshot/load", s) +} diff --git a/tools/firecracker/fcapi_test.go b/tools/firecracker/fcapi_test.go new file mode 100644 index 0000000..0eb9160 --- /dev/null +++ b/tools/firecracker/fcapi_test.go @@ -0,0 +1,131 @@ +package main + +import ( + "encoding/json" + "io" + "net" + "net/http" + "path/filepath" + "sync" + "testing" +) + +type recordedReq struct { + method string + path string + body map[string]any +} + +// fakeFirecracker serves the API over a unix socket and records requests. +func fakeFirecracker(t *testing.T) (socket string, reqs *[]recordedReq) { + t.Helper() + socket = filepath.Join(t.TempDir(), "fc.sock") + ln, err := net.Listen("unix", socket) + if err != nil { + t.Fatal(err) + } + var mu sync.Mutex + recorded := &[]recordedReq{} + h := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + var body map[string]any + if b, _ := io.ReadAll(r.Body); len(b) > 0 { + _ = json.Unmarshal(b, &body) + } + mu.Lock() + *recorded = append(*recorded, recordedReq{r.Method, r.URL.Path, body}) + mu.Unlock() + w.WriteHeader(http.StatusNoContent) + }) + srv := &http.Server{Handler: h} + go srv.Serve(ln) + t.Cleanup(func() { srv.Close() }) + return socket, recorded +} + +func TestFCClientCalls(t *testing.T) { + socket, reqs := fakeFirecracker(t) + c := newFCClient(socket) + + if err := c.setMachineConfig(machineConfig{VCPUCount: 2, MemSizeMib: 4096}); err != nil { + t.Fatal(err) + } + if err := c.setBootSource(bootSource{KernelImagePath: "/k", BootArgs: "x"}); err != nil { + t.Fatal(err) + } + if err := c.addDrive(drive{DriveID: "rootfs", PathOnHost: "/r", IsRootDevice: true}); err != nil { + t.Fatal(err) + } + if err := c.addNetworkInterface(networkInterface{IfaceID: "eth0", HostDevName: "fc-tap0", GuestMAC: "06:00:AC:10:00:02"}); err != nil { + t.Fatal(err) + } + if err := 
c.instanceStart(); err != nil { + t.Fatal(err) + } + if err := c.pause(); err != nil { + t.Fatal(err) + } + if err := c.resume(); err != nil { + t.Fatal(err) + } + if err := c.createSnapshot(snapshotCreate{SnapshotType: "Full", SnapshotPath: "/v", MemFilePath: "/m"}); err != nil { + t.Fatal(err) + } + if err := c.loadSnapshot(snapshotLoad{SnapshotPath: "/v", MemBackend: memBackend{BackendType: "File", BackendPath: "/m"}, ResumeVM: true}); err != nil { + t.Fatal(err) + } + + want := []struct { + method, path string + }{ + {"PUT", "/machine-config"}, + {"PUT", "/boot-source"}, + {"PUT", "/drives/rootfs"}, + {"PUT", "/network-interfaces/eth0"}, + {"PUT", "/actions"}, + {"PATCH", "/vm"}, // pause + {"PATCH", "/vm"}, // resume + {"PUT", "/snapshot/create"}, + {"PUT", "/snapshot/load"}, + } + if len(*reqs) != len(want) { + t.Fatalf("want %d requests, got %d: %+v", len(want), len(*reqs), *reqs) + } + for i, w := range want { + got := (*reqs)[i] + if got.method != w.method || got.path != w.path { + t.Fatalf("req %d: want %s %s, got %s %s", i, w.method, w.path, got.method, got.path) + } + } + + // Spot-check a body: drive payload should carry is_root_device. + if v, _ := (*reqs)[2].body["is_root_device"].(bool); !v { + t.Fatalf("addDrive body missing is_root_device: %+v", (*reqs)[2].body) + } + // network interface payload should carry the host TAP name. + if (*reqs)[3].body["host_dev_name"] != "fc-tap0" { + t.Fatalf("addNetworkInterface body wrong: %+v", (*reqs)[3].body) + } + // instanceStart action type. 
+ if (*reqs)[4].body["action_type"] != "InstanceStart" { + t.Fatalf("instanceStart action_type wrong: %+v", (*reqs)[4].body) + } +} + +func TestFCClientErrorStatus(t *testing.T) { + socket := filepath.Join(t.TempDir(), "fc.sock") + ln, err := net.Listen("unix", socket) + if err != nil { + t.Fatal(err) + } + srv := &http.Server{Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusBadRequest) + io.WriteString(w, `{"fault_message":"bad"}`) + })} + go srv.Serve(ln) + t.Cleanup(func() { srv.Close() }) + + c := newFCClient(socket) + if err := c.instanceStart(); err == nil { + t.Fatal("expected error on 400 status") + } +} diff --git a/tools/firecracker/fingerprint.go b/tools/firecracker/fingerprint.go new file mode 100644 index 0000000..d588a3d --- /dev/null +++ b/tools/firecracker/fingerprint.go @@ -0,0 +1,38 @@ +package main + +import ( + "bytes" + "encoding/json" + "fmt" + "os/exec" +) + +// fingerprintJSON is the subset of `bazel-diff fingerprint` output we consume. +type fingerprintJSON struct { + Fingerprint string `json:"fingerprint"` + Flags map[string]string `json:"flags"` +} + +// computeFingerprint runs `bazel-diff fingerprint` for the given workspace and +// flag set and returns the cache key. The flag set passed here MUST match the +// flags used for generate-hashes/warmup, or the key will (correctly) differ. +func computeFingerprint(bazelDiff, workspace, bazel string, flags []string) (string, error) { + // No -o: the fingerprint command writes JSON to stdout by default. + args := []string{"fingerprint", "-w", workspace, "-b", bazel} + args = append(args, flags...) + cmd := exec.Command(bazelDiff, args...) 
+ var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + if err := cmd.Run(); err != nil { + return "", fmt.Errorf("bazel-diff fingerprint failed: %w\n%s", err, stderr.String()) + } + var parsed fingerprintJSON + if err := json.Unmarshal(stdout.Bytes(), &parsed); err != nil { + return "", fmt.Errorf("parsing fingerprint JSON: %w", err) + } + if parsed.Fingerprint == "" { + return "", fmt.Errorf("fingerprint output had empty fingerprint: %s", stdout.String()) + } + return parsed.Fingerprint, nil +} diff --git a/tools/firecracker/fingerprint_test.go b/tools/firecracker/fingerprint_test.go new file mode 100644 index 0000000..77a48c1 --- /dev/null +++ b/tools/firecracker/fingerprint_test.go @@ -0,0 +1,40 @@ +package main + +import ( + "os" + "path/filepath" + "testing" +) + +func writeScript(t *testing.T, dir, body string) string { + t.Helper() + p := filepath.Join(dir, "bazel-diff") + if err := os.WriteFile(p, []byte("#!/bin/sh\n"+body+"\n"), 0o755); err != nil { + t.Fatal(err) + } + return p +} + +func TestComputeFingerprintNonZeroExit(t *testing.T) { + dir := t.TempDir() + bd := writeScript(t, dir, "echo boom >&2; exit 1") + if _, err := computeFingerprint(bd, dir, "bazel", nil); err == nil { + t.Fatal("expected error on non-zero exit") + } +} + +func TestComputeFingerprintBadJSON(t *testing.T) { + dir := t.TempDir() + bd := writeScript(t, dir, "echo 'not json'") + if _, err := computeFingerprint(bd, dir, "bazel", nil); err == nil { + t.Fatal("expected error on unparseable JSON") + } +} + +func TestComputeFingerprintEmptyFingerprint(t *testing.T) { + dir := t.TempDir() + bd := writeScript(t, dir, `echo '{"fingerprint":""}'`) + if _, err := computeFingerprint(bd, dir, "bazel", nil); err == nil { + t.Fatal("expected error when fingerprint is empty") + } +} diff --git a/tools/firecracker/git.go b/tools/firecracker/git.go new file mode 100644 index 0000000..caec9f6 --- /dev/null +++ b/tools/firecracker/git.go @@ -0,0 +1,57 @@ +package main + 
+import ( + "fmt" + "os/exec" + "strconv" + "strings" +) + +// gitClient runs git against a fixed workspace directory. +type gitClient struct { + workspace string +} + +func (g gitClient) run(args ...string) (string, error) { + cmd := exec.Command("git", append([]string{"-C", g.workspace}, args...)...) + out, err := cmd.Output() + if err != nil { + return "", fmt.Errorf("git %s: %w", strings.Join(args, " "), err) + } + return strings.TrimSpace(string(out)), nil +} + +// isAncestor reports whether ancestor is an ancestor of (or equal to) descendant. +func (g gitClient) isAncestor(ancestor, descendant string) (bool, error) { + cmd := exec.Command("git", "-C", g.workspace, + "merge-base", "--is-ancestor", ancestor, descendant) + err := cmd.Run() + if err == nil { + return true, nil + } + // Exit code 1 == "not an ancestor"; anything else is a real error. + if exit, ok := err.(*exec.ExitError); ok && exit.ExitCode() == 1 { + return false, nil + } + return false, fmt.Errorf("git merge-base --is-ancestor %s %s: %w", ancestor, descendant, err) +} + +// distance returns the number of commits in from..to (commits reachable from +// `to` but not from `from`). Smaller means a nearer ancestor. +func (g gitClient) distance(from, to string) (int, error) { + out, err := g.run("rev-list", "--count", from+".."+to) + if err != nil { + return 0, err + } + n, err := strconv.Atoi(out) + if err != nil { + return 0, fmt.Errorf("parsing rev-list count %q: %w", out, err) + } + return n, nil +} + +// checkout checks out the given revision, discarding local changes. 
+func (g gitClient) checkout(sha string) error { + _, err := g.run("checkout", "--force", "--quiet", sha) + return err +} diff --git a/tools/firecracker/go.mod b/tools/firecracker/go.mod new file mode 100644 index 0000000..76f2cc1 --- /dev/null +++ b/tools/firecracker/go.mod @@ -0,0 +1,7 @@ +module github.com/Tinder/bazel-diff/tools/firecracker + +go 1.21 + +// Intentionally dependency-free (stdlib only): the Firecracker REST API is +// spoken over a unix socket with net/http, so the orchestrator builds as a +// static CI binary with no module downloads. See README.md. diff --git a/tools/firecracker/main.go b/tools/firecracker/main.go new file mode 100644 index 0000000..d684d18 --- /dev/null +++ b/tools/firecracker/main.go @@ -0,0 +1,279 @@ +// Command bazel-diff-snap orchestrates Firecracker microVM snapshots to give +// instant starts of bazel-diff on large monorepos. See docs/firecracker-snapshots.md. +// +// bazel-diff-snap record --workspace

--base-sha --store

[opts] +// bazel-diff-snap consume --workspace

--target-sha --store

--out [opts] +// +// `record` warms a Bazel server (via `bazel-diff warmup`) and snapshots it, +// keyed by fingerprint + base SHA. `consume` resolves a compatible snapshot +// (fail-safe: cold fallback on fingerprint mismatch or no ancestor), restores +// it, and computes impacted targets against the baked-in base hashes. +package main + +import ( + "flag" + "fmt" + "os" + "runtime" + "strings" + "time" +) + +// exit codes +const ( + exitOK = 0 + exitError = 1 + exitColdFallback = 2 // consume found no compatible snapshot; caller runs cold path +) + +func main() { + if len(os.Args) < 2 { + usage() + os.Exit(exitError) + } + var err error + switch os.Args[1] { + case "record": + err = runRecord(os.Args[2:]) + case "consume": + os.Exit(runConsume(os.Args[2:])) + case "-h", "--help", "help": + usage() + return + default: + fmt.Fprintf(os.Stderr, "unknown command %q\n\n", os.Args[1]) + usage() + os.Exit(exitError) + } + if err != nil { + fmt.Fprintln(os.Stderr, "error:", err) + os.Exit(exitError) + } +} + +func usage() { + fmt.Fprint(os.Stderr, `bazel-diff-snap — Firecracker snapshot orchestration for bazel-diff + +usage: + bazel-diff-snap record --workspace

--base-sha --store

[opts] + bazel-diff-snap consume --workspace

--target-sha --store

--out [opts] + +common opts: + --bazel bazel binary (default "bazel") + --bazel-diff bazel-diff binary (default "bazel-diff") + --driver local|firecracker (default "local") + --flag extra bazel-diff flag, repeatable (must match record/consume) + +firecracker driver opts: + --kernel guest kernel image (required) + --guest-addr guest ssh address (required) + --tap-device host TAP backing the guest NIC (required; see bench/setup_tap.sh) + --guest-ip / --host-ip / --netmask / --guest-mac static TAP addressing + --guest-key ssh identity file + --vcpus / --mem-mib guest sizing (default 4 vCPU / 8192 MiB) + +consume exit codes: 0 ok, 1 error, 2 no compatible snapshot (run cold path) +`)
}

// commonFlags holds the options that the record and consume subcommands have
// in common. registerCommon binds every field to a command-line flag.
type commonFlags struct {
	workspace string    // --workspace: Bazel workspace (git repo)
	store     string    // --store: snapshot store directory
	bazel     string    // --bazel: bazel binary
	bazelDiff string    // --bazel-diff: bazel-diff binary
	driver    string    // --driver: "local" or "firecracker"
	flags     multiFlag // --flag: extra bazel-diff flags (repeatable)

	// Settings consumed only by the firecracker driver.
	firecrackerBin string // --firecracker-bin
	socket         string // --socket: firecracker API socket
	kernel         string // --kernel: guest kernel image
	vcpus          int    // --vcpus
	memMib         int    // --mem-mib
	guestSnapDir   string // --guest-snap-dir: in-guest snapshot dir
	guestAddr      string // --guest-addr: guest ssh address
	guestKey       string // --guest-key: guest ssh identity file

	// Guest networking for the firecracker driver: the host TAP device and the
	// static addresses used on it.
	tapDevice string // --tap-device
	guestIP   string // --guest-ip
	hostIP    string // --host-ip
	netmask   string // --netmask
	guestMAC  string // --guest-mac
}

// registerCommon declares the shared flags on fs and points each of them at the
// matching field of c. Defaults are written into c at registration time.
func registerCommon(fs *flag.FlagSet, c *commonFlags) {
	// All string-valued flags are declared in one table and registered in a loop.
	stringFlags := []struct {
		dst              *string
		name, def, usage string
	}{
		{&c.workspace, "workspace", "", "path to the Bazel workspace (git repo)"},
		{&c.store, "store", "", "snapshot store directory"},
		{&c.bazel, "bazel", "bazel", "bazel binary"},
		{&c.bazelDiff, "bazel-diff", "bazel-diff", "bazel-diff binary"},
		{&c.driver, "driver", "local", "driver: local | firecracker"},
		{&c.firecrackerBin, "firecracker-bin", "firecracker", "firecracker binary"},
		{&c.socket, "socket", "/tmp/bazel-diff-fc.sock", "firecracker API socket"},
		{&c.kernel, "kernel", "", "guest kernel image (firecracker driver)"},
		{&c.guestSnapDir, "guest-snap-dir", "/snap", "in-guest snapshot dir"},
		{&c.guestAddr, "guest-addr", "", "guest ssh address user@host (firecracker driver)"},
		{&c.guestKey, "guest-key", "", "guest ssh identity file (firecracker driver)"},
		{&c.tapDevice, "tap-device", "", "host TAP backing the guest NIC (firecracker driver)"},
		{&c.guestIP, "guest-ip", "172.16.0.2", "guest IP on the TAP subnet (firecracker driver)"},
		{&c.hostIP, "host-ip", "172.16.0.1", "host/gateway IP on the TAP subnet (firecracker driver)"},
		{&c.netmask, "netmask", "255.255.255.252", "TAP subnet mask (firecracker driver)"},
		{&c.guestMAC, "guest-mac", "06:00:AC:10:00:02", "guest NIC MAC (stable across restore)"},
	}
	for _, f := range stringFlags {
		fs.StringVar(f.dst, f.name, f.def, f.usage)
	}

	fs.IntVar(&c.vcpus, "vcpus", 4, "guest vCPUs (firecracker driver)")
	fs.IntVar(&c.memMib, "mem-mib", 8192, "guest memory MiB (firecracker driver)")
	fs.Var(&c.flags, "flag", "extra bazel-diff flag (repeatable)")
}

// makeDriver builds the driver selected by c.driver.
//
// "local" and the empty string select the local driver. "firecracker" is only
// accepted on a Linux host where /dev/kvm can be stat'ed and where --kernel,
// --guest-addr and --tap-device were supplied; the checks run in that order and
// the first failure is returned. Any other driver name is an error.
func (c commonFlags) makeDriver() (driver, error) {
	if c.driver == "" || c.driver == "local" {
		return localDriver{}, nil
	}
	if c.driver != "firecracker" {
		return nil, fmt.Errorf("unknown driver %q", c.driver)
	}

	// Host prerequisites are checked before the per-invocation options.
	if host := runtime.GOOS; host != "linux" {
		return nil, fmt.Errorf("firecracker driver requires Linux (host is %s)", host)
	}
	if _, statErr := os.Stat("/dev/kvm"); statErr != nil {
		return nil, fmt.Errorf("firecracker driver requires /dev/kvm: %w", statErr)
	}
	if !(c.kernel != "" && c.guestAddr != "") {
		return nil, fmt.Errorf("firecracker driver requires --kernel and --guest-addr")
	}
	if c.tapDevice == "" {
		return nil, fmt.Errorf("firecracker driver requires --tap-device (the guest is driven over ssh, which needs a NIC)")
	}

	guest := sshGuest{addr: c.guestAddr, identity: c.guestKey}
	network := netConfig{
		tapDevice: c.tapDevice,
		guestIP:   c.guestIP,
		hostIP:    c.hostIP,
		netmask:   c.netmask,
		guestMAC:  c.guestMAC,
	}
	return fcDriver{
		firecrackerBin: c.firecrackerBin,
		socketPath:     c.socket,
		kernelImage:    c.kernel,
		vcpus:          c.vcpus,
		memMib:         c.memMib,
		guestSnapDir:   c.guestSnapDir,
		guest:          guest,
		net:            network,
	}, nil
}

// runRecord implements the record subcommand: it checks out --base-sha in the
// workspace, fingerprints that revision, has the selected driver record a
// snapshot into the store entry for (fingerprint, base SHA), and finally writes
// the entry's metadata file.
//
// --workspace, --store and --base-sha are mandatory. The flag set uses
// flag.ExitOnError, so a malformed command line terminates the process inside
// fs.Parse rather than returning here.
func runRecord(args []string) error {
	var (
		c       commonFlags
		baseSHA string
	)
	fs := flag.NewFlagSet("record", flag.ExitOnError)
	registerCommon(fs, &c)
	fs.StringVar(&baseSHA, "base-sha", "", "base revision to snapshot")
	if err := fs.Parse(args); err != nil {
		return err
	}
	for _, required := range []string{c.workspace, c.store, baseSHA} {
		if required == "" {
			return fmt.Errorf("record requires --workspace, --store, --base-sha")
		}
	}

	storeDir, err := mustAbs(c.store)
	if err != nil {
		return err
	}
	drv, err := c.makeDriver()
	if err != nil {
		return err
	}

	// The fingerprint must describe the base revision, so check it out first.
	repo := gitClient{workspace: c.workspace}
	if err := repo.checkout(baseSHA); err != nil {
		return err
	}
	fingerprint, err := computeFingerprint(c.bazelDiff, c.workspace, c.bazel, c.flags)
	if err != nil {
		return err
	}
	fmt.Fprintf(os.Stderr, "record: fingerprint=%s base=%s driver=%s\n", fingerprint, baseSHA, drv.name())

	snap, err := newEntry(storeDir, fingerprint, baseSHA)
	if err != nil {
		return err
	}
	req := recordRequest{
		Workspace: c.workspace,
		BaseSHA:   baseSHA,
		Bazel:     c.bazel,
		BazelDiff: c.bazelDiff,
		Flags:     c.flags,
		Entry:     snap,
	}
	if err := drv.record(req); err != nil {
		return err
	}

	// NOTE(review): the readBazelLabel error is dropped, so the recorded version
	// may be empty; presumably the field is informational only — confirm.
	bazelVer, _ := readBazelLabel(c.bazel, c.workspace)
	meta := metadata{Fingerprint: fingerprint, BazelVersion: bazelVer}
	if err := writeMetadata(snap, meta, time.Now()); err != nil {
		return err
	}
	fmt.Fprintf(os.Stderr, "record: wrote snapshot to %s\n", snap.Dir)
	return nil
}

// runConsume implements the consume subcommand and returns the process exit
// code: exitOK when the impacted targets were written to --out,
// exitColdFallback when the store holds no compatible snapshot for the target,
// and exitError for every other failure.
//
// --workspace, --store, --target-sha and --out are mandatory.
func runConsume(args []string) int {
	// fail reports err on stderr and yields the generic error exit code.
	fail := func(err error) int {
		fmt.Fprintln(os.Stderr, "error:", err)
		return exitError
	}

	var (
		c         commonFlags
		targetSHA string
		out       string
	)
	fs := flag.NewFlagSet("consume", flag.ExitOnError)
	registerCommon(fs, &c)
	fs.StringVar(&targetSHA, "target-sha", "", "target revision to analyse")
	fs.StringVar(&out, "out", "", "path to write impacted targets")
	if err := fs.Parse(args); err != nil {
		return exitError
	}
	for _, required := range []string{c.workspace, c.store, targetSHA, out} {
		if required == "" {
			fmt.Fprintln(os.Stderr, "consume requires --workspace, --store, --target-sha, --out")
			return exitError
		}
	}

	storeDir, err := mustAbs(c.store)
	if err != nil {
		return fail(err)
	}
	drv, err := c.makeDriver()
	if err != nil {
		return fail(err)
	}

	// The fingerprint is taken from the target revision; only snapshots recorded
	// under the same fingerprint are candidates.
	repo := gitClient{workspace: c.workspace}
	if err := repo.checkout(targetSHA); err != nil {
		return fail(err)
	}
	fingerprint, err := computeFingerprint(c.bazelDiff, c.workspace, c.bazel, c.flags)
	if err != nil {
		return fail(err)
	}
	snap, err := resolve(storeDir, fingerprint, targetSHA, repo)
	if err != nil {
		return fail(err)
	}
	if snap == nil {
		// Nothing usable in the store: signal the caller to run the cold path.
		fmt.Fprintf(os.Stderr, "consume: no compatible snapshot for fingerprint=%s target=%s — cold fallback\n", fingerprint, targetSHA)
		return exitColdFallback
	}
	fmt.Fprintf(os.Stderr, "consume: using snapshot base=%s (fingerprint=%s) driver=%s\n", snap.BaseSHA, fingerprint, drv.name())

	req := consumeRequest{
		Workspace: c.workspace,
		TargetSHA: targetSHA,
		Bazel:     c.bazel,
		BazelDiff: c.bazelDiff,
		Flags:     c.flags,
		Entry:     *snap,
		Out:       out,
	}
	if err := drv.consume(req); err != nil {
		return fail(err)
	}
	fmt.Fprintf(os.Stderr, "consume: wrote impacted targets to %s\n", out)
	return exitOK
}

// multiFlag collects a repeatable string flag.
+type multiFlag []string + +func (m *multiFlag) String() string { return strings.Join(*m, " ") } +func (m *multiFlag) Set(v string) error { *m = append(*m, v); return nil } diff --git a/tools/firecracker/main_test.go b/tools/firecracker/main_test.go new file mode 100644 index 0000000..0f57a93 --- /dev/null +++ b/tools/firecracker/main_test.go @@ -0,0 +1,141 @@ +package main + +import ( + "os" + "path/filepath" + "strings" + "testing" +) + +// fakeBazelDiff is a stand-in `bazel-diff` for exercising the local driver end to +// end: it emits a fixed fingerprint and writes the output files each subcommand +// is asked to produce, so runRecord/runConsume run without a real Bazel. +const fakeBazelDiff = `#!/bin/sh +sub="$1"; shift +case "$sub" in + fingerprint) + echo '{"fingerprint":"FAKEFP","flags":{}}' ;; + warmup) + bh=""; fp="" + while [ "$#" -gt 0 ]; do + case "$1" in + --base-hashes) bh="$2"; shift 2 ;; + --fingerprint-output) fp="$2"; shift 2 ;; + *) shift ;; + esac + done + [ -n "$bh" ] && echo '{}' > "$bh" + [ -n "$fp" ] && echo '{"fingerprint":"FAKEFP"}' > "$fp" ;; + generate-hashes) + out="" + while [ "$#" -gt 0 ]; do + case "$1" in -w|-b) shift 2 ;; *) out="$1"; shift ;; esac + done + [ -n "$out" ] && echo '{"//x:y":"h"}' > "$out" ;; + get-impacted-targets) + out="" + while [ "$#" -gt 0 ]; do + case "$1" in -o) out="$2"; shift 2 ;; *) shift ;; esac + done + [ -n "$out" ] && printf '//x:y\n//a:b\n' > "$out" ;; +esac +` + +func writeFakeBazelDiff(t *testing.T, dir string) string { + t.Helper() + p := filepath.Join(dir, "bazel-diff") + if err := os.WriteFile(p, []byte(fakeBazelDiff), 0o755); err != nil { + t.Fatal(err) + } + return p +} + +func TestRunRecordThenConsumeLocalDriver(t *testing.T) { + ws, shas := makeLinearRepo(t, 3) // base=shas[0], target=shas[2] + dir := t.TempDir() + bd := writeFakeBazelDiff(t, dir) + bazel := writeFakeBazel(t, dir, "Build label: 8.5.1") + store := filepath.Join(dir, "store") + + if err := runRecord([]string{ + "--workspace", 
ws, "--base-sha", shas[0], "--store", store, + "--bazel", bazel, "--bazel-diff", bd, + }); err != nil { + t.Fatalf("runRecord: %v", err) + } + // record must have produced the store entry with base hashes + metadata. + e := entry{Dir: entryDir(store, "FAKEFP", shas[0])} + if _, err := os.Stat(e.baseHashes()); err != nil { + t.Fatalf("record did not write base hashes: %v", err) + } + + out := filepath.Join(dir, "impacted.txt") + if code := runConsume([]string{ + "--workspace", ws, "--target-sha", shas[2], "--store", store, + "--bazel", bazel, "--bazel-diff", bd, "--out", out, + }); code != exitOK { + t.Fatalf("runConsume code=%d, want %d", code, exitOK) + } + b, err := os.ReadFile(out) + if err != nil || !strings.Contains(string(b), "//x:y") { + t.Fatalf("consume did not write impacted targets: %q err=%v", string(b), err) + } +} + +func TestRunConsumeColdFallback(t *testing.T) { + ws, shas := makeLinearRepo(t, 2) + dir := t.TempDir() + bd := writeFakeBazelDiff(t, dir) + bazel := writeFakeBazel(t, dir, "Build label: 8.5.1") + // Empty store => no compatible snapshot => exit 2 (cold fallback). 
+ code := runConsume([]string{ + "--workspace", ws, "--target-sha", shas[1], "--store", filepath.Join(dir, "store"), + "--bazel", bazel, "--bazel-diff", bd, "--out", filepath.Join(dir, "o.txt"), + }) + if code != exitColdFallback { + t.Fatalf("expected cold-fallback exit %d, got %d", exitColdFallback, code) + } +} + +func TestRunRecordMissingArgs(t *testing.T) { + if err := runRecord([]string{"--workspace", "/x"}); err == nil { + t.Fatal("runRecord should require --store and --base-sha") + } +} + +func TestRunConsumeMissingArgs(t *testing.T) { + if code := runConsume([]string{"--workspace", "/x"}); code != exitError { + t.Fatalf("runConsume with missing args should be exitError, got %d", code) + } +} + +func TestMakeDriver(t *testing.T) { + if d, err := (commonFlags{driver: "local"}).makeDriver(); err != nil || d.name() != "local" { + t.Fatalf("local driver: d=%v err=%v", d, err) + } + if d, err := (commonFlags{driver: ""}).makeDriver(); err != nil || d.name() != "local" { + t.Fatalf("empty driver should default to local: d=%v err=%v", d, err) + } + if _, err := (commonFlags{driver: "nope"}).makeDriver(); err == nil { + t.Fatal("unknown driver should error") + } + // firecracker without the required host/config always errors (no kvm/kernel/tap). 
+ if _, err := (commonFlags{driver: "firecracker"}).makeDriver(); err == nil { + t.Fatal("firecracker driver without config should error") + } +} + +func TestUsageDoesNotPanic(t *testing.T) { + usage() // writes help to stderr; just exercise it +} + +func TestMultiFlag(t *testing.T) { + var m multiFlag + if err := m.Set("a"); err != nil { + t.Fatal(err) + } + _ = m.Set("b") + if len(m) != 2 || m.String() != "a b" { + t.Fatalf("multiFlag wrong: %v / %q", m, m.String()) + } +} diff --git a/tools/firecracker/resolve.go b/tools/firecracker/resolve.go new file mode 100644 index 0000000..cefd3bb --- /dev/null +++ b/tools/firecracker/resolve.go @@ -0,0 +1,43 @@ +package main + +import "sort" + +// candidate is a stored snapshot base SHA that is an ancestor of the target SHA, +// together with how many commits separate it from the target. +type candidate struct { + BaseSHA string + Distance int // git rev-list --count .. +} + +// pickNearestAncestor chooses the snapshot whose base SHA is the *nearest* +// ancestor of the target (fewest commits between base and target), so the +// warm server has to re-analyse the least. Ties are broken by lexically +// smallest SHA so the choice is deterministic. +// +// Returns nil when there are no candidates, which the caller treats as a +// cold-fallback signal. +func pickNearestAncestor(candidates []candidate) *candidate { + if len(candidates) == 0 { + return nil + } + best := candidates[0] + for _, c := range candidates[1:] { + if c.Distance < best.Distance || + (c.Distance == best.Distance && c.BaseSHA < best.BaseSHA) { + best = c + } + } + return &best +} + +// sortCandidates returns candidates ordered nearest-first (for display/logging). +func sortCandidates(candidates []candidate) []candidate { + out := append([]candidate(nil), candidates...) 
+ sort.Slice(out, func(i, j int) bool { + if out[i].Distance != out[j].Distance { + return out[i].Distance < out[j].Distance + } + return out[i].BaseSHA < out[j].BaseSHA + }) + return out +} diff --git a/tools/firecracker/resolve_test.go b/tools/firecracker/resolve_test.go new file mode 100644 index 0000000..2a541c5 --- /dev/null +++ b/tools/firecracker/resolve_test.go @@ -0,0 +1,45 @@ +package main + +import "testing" + +func TestPickNearestAncestor(t *testing.T) { + tests := []struct { + name string + in []candidate + want string // "" means nil + }{ + {"empty", nil, ""}, + {"single", []candidate{{"a", 5}}, "a"}, + {"nearest wins", []candidate{{"far", 10}, {"near", 2}, {"mid", 5}}, "near"}, + {"tie broken by sha", []candidate{{"zzz", 3}, {"aaa", 3}}, "aaa"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := pickNearestAncestor(tt.in) + if tt.want == "" { + if got != nil { + t.Fatalf("want nil, got %+v", got) + } + return + } + if got == nil || got.BaseSHA != tt.want { + t.Fatalf("want %q, got %+v", tt.want, got) + } + }) + } +} + +func TestSortCandidates(t *testing.T) { + in := []candidate{{"b", 5}, {"a", 2}, {"c", 2}} + got := sortCandidates(in) + want := []string{"a", "c", "b"} // distance asc, then sha asc + for i, c := range got { + if c.BaseSHA != want[i] { + t.Fatalf("at %d: want %q got %q (%+v)", i, want[i], c.BaseSHA, got) + } + } + // input not mutated + if in[0].BaseSHA != "b" { + t.Fatalf("input was mutated: %+v", in) + } +} diff --git a/tools/firecracker/store.go b/tools/firecracker/store.go new file mode 100644 index 0000000..74ebd82 --- /dev/null +++ b/tools/firecracker/store.go @@ -0,0 +1,141 @@ +package main + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "time" +) + +// Snapshot store layout (see docs/firecracker-snapshots.md §7): +// +// /// +// mem_file guest memory image (diff snapshot) +// vmstate Firecracker microVM state +// rootfs.backing frozen read-only disk image +// base_hashes.json 
//	                    produced by `bazel-diff warmup`
//	metadata.json       fingerprint, baseSHA, versions, created-at

// Names of the files kept inside every snapshot entry directory.
const (
	memFileName     = "mem_file"
	vmstateName     = "vmstate"
	rootfsName      = "rootfs.backing"
	baseHashesName  = "base_hashes.json"
	metadataName    = "metadata.json"
	fingerprintName = "fingerprint.json"
)

// metadata is the JSON written next to each snapshot.
type metadata struct {
	Fingerprint     string `json:"fingerprint"`
	BaseSHA         string `json:"base_sha"`
	BazelVersion    string `json:"bazel_version"`
	BazelDiffVer    string `json:"bazel_diff_version"`
	CreatedAtUnix   int64  `json:"created_at_unix"`
	CreatedAtString string `json:"created_at"`
}

// entry is a resolved store location for a (fingerprint, baseSHA) pair.
type entry struct {
	Dir     string // directory that holds the snapshot files
	BaseSHA string // base revision the snapshot was recorded at
}

// The accessors below return the path of one snapshot file inside e.Dir.
func (e entry) memFile() string    { return filepath.Join(e.Dir, memFileName) }
func (e entry) vmstate() string    { return filepath.Join(e.Dir, vmstateName) }
func (e entry) rootfs() string     { return filepath.Join(e.Dir, rootfsName) }
func (e entry) baseHashes() string { return filepath.Join(e.Dir, baseHashesName) }
func (e entry) metadata() string   { return filepath.Join(e.Dir, metadataName) }

// entryDir returns the directory for a (fingerprint, baseSHA) snapshot:
// store/fingerprint/baseSHA. It performs no validation and touches no files.
func entryDir(store, fingerprint, baseSHA string) string {
	return filepath.Join(store, fingerprint, baseSHA)
}

// isSinglePathElement reports whether name can be used as exactly one directory
// level: non-empty, not "." or "..", and free of path separators.
func isSinglePathElement(name string) bool {
	if name == "" || name == "." || name == ".." {
		return false
	}
	for i := 0; i < len(name); i++ {
		if os.IsPathSeparator(name[i]) {
			return false
		}
	}
	return true
}

// newEntry builds and creates the directory for a snapshot to be recorded.
//
// fingerprint and baseSHA must each be a single path element. Anything else
// (empty, ".", "..", or containing a separator such as a ref like
// "origin/main") would make filepath.Join collapse, nest or escape the
// store/fingerprint/baseSHA layout, and listBaseSHAs — which only looks at the
// direct children of the fingerprint directory — could never find the
// snapshot again. Such input is rejected before anything is created.
//
// On success the directory exists (mode 0755, parents included) and the
// returned entry carries it together with baseSHA.
func newEntry(store, fingerprint, baseSHA string) (entry, error) {
	components := [...]struct{ what, value string }{
		{"fingerprint", fingerprint},
		{"base SHA", baseSHA},
	}
	for _, c := range components {
		if !isSinglePathElement(c.value) {
			return entry{}, fmt.Errorf("invalid %s %q: must be a single path element", c.what, c.value)
		}
	}

	dir := entryDir(store, fingerprint, baseSHA)
	if err := os.MkdirAll(dir, 0o755); err != nil {
		return entry{}, err
	}
	return entry{Dir: dir, BaseSHA: baseSHA}, nil
}

// writeMetadata writes m as indented JSON (with a trailing newline, mode 0644)
// to the entry's metadata file.
//
// m.BaseSHA is overwritten with e.BaseSHA, and both creation-time fields are
// derived from now: Unix seconds and an RFC 3339 timestamp in UTC.
func writeMetadata(e entry, m metadata, now time.Time) error {
	m.BaseSHA = e.BaseSHA
	m.CreatedAtUnix = now.Unix()
	m.CreatedAtString = now.UTC().Format(time.RFC3339)

	b, err := json.MarshalIndent(m, "", "  ")
	if err != nil {
		return err
	}
	return os.WriteFile(e.metadata(), append(b, '\n'), 0o644)
}

// listBaseSHAs returns the base SHAs that have a complete snapshot for the
// given fingerprint. A snapshot is "complete" only if its base_hashes.json
// exists — a half-written record entry is ignored.
//
// A fingerprint directory that does not exist yields (nil, nil); any other
// error from reading it is returned. Only direct sub-directories are
// considered, and each one's name is reported as the base SHA.
func listBaseSHAs(store, fingerprint string) ([]string, error) {
	dir := filepath.Join(store, fingerprint)
	ents, err := os.ReadDir(dir)
	if os.IsNotExist(err) {
		return nil, nil
	}
	if err != nil {
		return nil, err
	}

	var shas []string
	for _, ent := range ents {
		if !ent.IsDir() {
			continue
		}
		// A failed Stat of the marker file (for any reason) means "not complete".
		marker := filepath.Join(dir, ent.Name(), baseHashesName)
		if _, err := os.Stat(marker); err == nil {
			shas = append(shas, ent.Name())
		}
	}
	return shas, nil
}

// resolve picks the best snapshot to consume for a target revision:
//   - same fingerprint (caller already computed it),
//   - base SHA is an ancestor of the target,
//   - nearest such ancestor.
//
// Returns (nil, nil) when no compatible snapshot exists — the caller falls
// back to a cold run. This is the fail-safe from RFC §5.2.
+func resolve(store, fingerprint, targetSHA string, git gitClient) (*entry, error) { + shas, err := listBaseSHAs(store, fingerprint) + if err != nil { + return nil, err + } + var candidates []candidate + for _, sha := range shas { + ok, err := git.isAncestor(sha, targetSHA) + if err != nil { + return nil, err + } + if !ok { + continue + } + dist, err := git.distance(sha, targetSHA) + if err != nil { + return nil, err + } + candidates = append(candidates, candidate{BaseSHA: sha, Distance: dist}) + } + best := pickNearestAncestor(candidates) + if best == nil { + return nil, nil + } + return &entry{Dir: entryDir(store, fingerprint, best.BaseSHA), BaseSHA: best.BaseSHA}, nil +} + +func mustAbs(p string) (string, error) { + abs, err := filepath.Abs(p) + if err != nil { + return "", fmt.Errorf("resolving path %q: %w", p, err) + } + return abs, nil +} diff --git a/tools/firecracker/store_test.go b/tools/firecracker/store_test.go new file mode 100644 index 0000000..d0951b7 --- /dev/null +++ b/tools/firecracker/store_test.go @@ -0,0 +1,195 @@ +package main + +import ( + "encoding/json" + "os" + "os/exec" + "path/filepath" + "testing" + "time" +) + +// makeLinearRepo creates a git repo with n commits and returns workspace + SHAs +// in chronological order (shas[0] is oldest). +func makeLinearRepo(t *testing.T, n int) (string, []string) { + t.Helper() + ws := t.TempDir() + run := func(args ...string) string { + cmd := exec.Command("git", append([]string{"-C", ws}, args...)...) 
+ out, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("git %v: %v\n%s", args, err, out) + } + return string(out) + } + run("init", "-q") + run("config", "user.email", "t@t.local") + run("config", "user.name", "t") + var shas []string + for i := 0; i < n; i++ { + f := filepath.Join(ws, "f.txt") + if err := os.WriteFile(f, []byte{byte('0' + i)}, 0o644); err != nil { + t.Fatal(err) + } + run("add", "-A") + run("commit", "-q", "-m", "c") + sha := run("rev-parse", "HEAD") + shas = append(shas, trim(sha)) + } + return ws, shas +} + +func trim(s string) string { + for len(s) > 0 && (s[len(s)-1] == '\n' || s[len(s)-1] == ' ' || s[len(s)-1] == '\r') { + s = s[:len(s)-1] + } + return s +} + +// writeSnapshot creates a complete store entry (with base_hashes.json marker). +func writeSnapshot(t *testing.T, store, fp, baseSHA string) { + t.Helper() + dir := entryDir(store, fp, baseSHA) + if err := os.MkdirAll(dir, 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(dir, baseHashesName), []byte("{}"), 0o644); err != nil { + t.Fatal(err) + } +} + +func TestNewEntryAndPaths(t *testing.T) { + store := t.TempDir() + e, err := newEntry(store, "fp1", "sha1") + if err != nil { + t.Fatal(err) + } + if _, err := os.Stat(e.Dir); err != nil { + t.Fatalf("newEntry should create the dir: %v", err) + } + checks := map[string]string{ + e.memFile(): memFileName, + e.vmstate(): vmstateName, + e.rootfs(): rootfsName, + e.baseHashes(): baseHashesName, + e.metadata(): metadataName, + } + for got, suffix := range checks { + if filepath.Base(got) != suffix { + t.Fatalf("path %q should end in %q", got, suffix) + } + if filepath.Dir(got) != e.Dir { + t.Fatalf("path %q should live under the entry dir %q", got, e.Dir) + } + } +} + +func TestWriteMetadata(t *testing.T) { + store := t.TempDir() + e, _ := newEntry(store, "fp1", "sha1") + now := time.Unix(1700000000, 0) + if err := writeMetadata(e, metadata{Fingerprint: "fp1", BazelVersion: "8.5.1"}, now); err != nil { + 
t.Fatal(err) + } + b, err := os.ReadFile(e.metadata()) + if err != nil { + t.Fatal(err) + } + var m metadata + if err := json.Unmarshal(b, &m); err != nil { + t.Fatal(err) + } + if m.Fingerprint != "fp1" || m.BaseSHA != "sha1" || m.BazelVersion != "8.5.1" { + t.Fatalf("metadata fields wrong: %+v", m) + } + if m.CreatedAtUnix != 1700000000 || m.CreatedAtString == "" { + t.Fatalf("metadata timestamps wrong: %+v", m) + } +} + +func TestMustAbs(t *testing.T) { + abs, err := mustAbs("relative/path") + if err != nil { + t.Fatal(err) + } + if !filepath.IsAbs(abs) { + t.Fatalf("mustAbs should return an absolute path, got %q", abs) + } + already := "/already/abs" + if got, _ := mustAbs(already); got != already { + t.Fatalf("mustAbs of an absolute path should be unchanged, got %q", got) + } +} + +func TestResolveNearestAncestor(t *testing.T) { + ws, shas := makeLinearRepo(t, 5) // shas[0..4], 4 is newest + store := t.TempDir() + fp := "fp1" + git := gitClient{workspace: ws} + + // Snapshots at commit 0 and commit 2; target is commit 4. + writeSnapshot(t, store, fp, shas[0]) + writeSnapshot(t, store, fp, shas[2]) + + e, err := resolve(store, fp, shas[4], git) + if err != nil { + t.Fatal(err) + } + if e == nil { + t.Fatal("expected a snapshot, got nil") + } + if e.BaseSHA != shas[2] { + t.Fatalf("want nearest ancestor %s, got %s", shas[2], e.BaseSHA) + } +} + +func TestResolveNoCompatibleFingerprint(t *testing.T) { + ws, shas := makeLinearRepo(t, 3) + store := t.TempDir() + git := gitClient{workspace: ws} + writeSnapshot(t, store, "fpA", shas[0]) + + // Different fingerprint => no candidates => cold fallback (nil, nil). 
+ e, err := resolve(store, "fpB", shas[2], git) + if err != nil { + t.Fatal(err) + } + if e != nil { + t.Fatalf("expected nil (cold fallback), got %+v", e) + } +} + +func TestResolveIgnoresNonAncestor(t *testing.T) { + ws, shas := makeLinearRepo(t, 4) + store := t.TempDir() + fp := "fp1" + git := gitClient{workspace: ws} + + // Snapshot at the NEWEST commit; target is an OLDER commit -> not an ancestor. + writeSnapshot(t, store, fp, shas[3]) + e, err := resolve(store, fp, shas[1], git) + if err != nil { + t.Fatal(err) + } + if e != nil { + t.Fatalf("expected nil (newer snapshot is not an ancestor of older target), got %+v", e) + } +} + +func TestListBaseSHAsIgnoresIncomplete(t *testing.T) { + store := t.TempDir() + fp := "fp1" + // complete + writeSnapshot(t, store, fp, "complete") + // incomplete: dir exists but no base_hashes.json + if err := os.MkdirAll(entryDir(store, fp, "incomplete"), 0o755); err != nil { + t.Fatal(err) + } + got, err := listBaseSHAs(store, fp) + if err != nil { + t.Fatal(err) + } + if len(got) != 1 || got[0] != "complete" { + t.Fatalf("want [complete], got %v", got) + } +}