diff --git a/actions/setup/js/apply_samples.cjs b/actions/setup/js/apply_samples.cjs
new file mode 100644
index 00000000000..0a3dc7844cf
--- /dev/null
+++ b/actions/setup/js/apply_samples.cjs
@@ -0,0 +1,363 @@
+#!/usr/bin/env node
+// @ts-check
+/// <reference types="@actions/github-script" />
+
+// apply_samples.cjs
+//
+// Deterministic replay driver for `gh aw compile --use-samples`.
+//
+// Reads `GH_AW_SAMPLES` (a JSON array of `{tool, arguments, sidecars}`
+// entries produced by the compiler), spawns the safe-outputs MCP server
+// (`safe_outputs_mcp_server.cjs`) as a child process, sends one JSON-RPC
+// `tools/call` per sample over stdio, and writes a synthetic `agent-stdio.log`
+// so downstream log-parsing / failure-handling steps continue to work.
+//
+// For samples whose tool is `create_pull_request` or `push_to_pull_request_branch`
+// and whose sidecars include `patch`, the driver pre-stages a branch and commits
+// the patch into the workspace BEFORE invoking the MCP tool. This lets the
+// real `create_pull_request` MCP handler (which derives a git diff against the
+// base branch) produce a meaningful transport payload.
+//
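+// Env contract:
+// GH_AW_SAMPLES — JSON array of replay entries; unset, blank, or a
+// literal `null` is treated as "no samples" (warning,
+// clean exit)
+// GH_AW_AGENT_STDIO_LOG — path the synthetic stdio log is appended to
+// (optional; nothing is written when unset)
+// GH_AW_SAFE_OUTPUTS_CONFIG_PATH — path to the MCP server's config.json (inherited by
+// the spawned server)
+// GH_AW_SAFE_OUTPUTS — path to the MCP server's outputs.jsonl (inherited by
+// the spawned server)
+// GITHUB_WORKSPACE — git working directory for pre-staging (optional;
+// falls back to cwd)
+// GH_AW_CUSTOM_BASE_BRANCH, GITHUB_BASE_REF, GITHUB_REF_NAME
+// — base branch for pre-staging; the first non-empty
+// value wins, default "main"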
+
+require("./shim.cjs");
+
+const { spawn } = require("child_process");
+const fs = require("fs");
+const path = require("path");
+const os = require("os");
+const { getErrorMessage } = require("./error_helpers.cjs");
+
+// Base branch that pre-staged sample branches are created from. Resolved once,
+// when this module is loaded, from the first non-empty of
+// GH_AW_CUSTOM_BASE_BRANCH, GITHUB_BASE_REF and GITHUB_REF_NAME, falling back
+// to "main". Changing those env vars after the module has been required has
+// no effect on this value.
+const DEFAULT_BASE_BRANCH = process.env.GH_AW_CUSTOM_BASE_BRANCH || process.env.GITHUB_BASE_REF || process.env.GITHUB_REF_NAME || "main";
+// Tools for which main() calls preStagePatch() before replaying the sample.
+const PATCH_SIDECAR_TOOLS = new Set(["create_pull_request", "push_to_pull_request_branch"]);
+
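+/**
+ * One replay entry from GH_AW_SAMPLES.
+ * @typedef {Object} SampleEntry
+ * @property {string} tool - MCP tool name, sent as `params.name`
+ * @property {Record<string, any>} arguments - tool arguments, sent as `params.arguments`
+ * @property {Record<string, any>} [sidecars] - driver-only extras (e.g. `patch`); not sent to the server
+ */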
+
+/**
+ * Read and validate the replay entries in the GH_AW_SAMPLES env var.
+ *
+ * An unset/blank variable, or a literal JSON `null` (older compilers emitted
+ * it for workflows with --use-samples but no `samples:` entries), yields an
+ * empty array plus a warning so the workflow can still complete cleanly.
+ *
+ * @returns {SampleEntry[]} the parsed entries, in declaration order
+ * @throws {Error} when the value is not valid JSON, is not an array, or an
+ *   entry lacks a string `tool` or a plain-object `arguments`
+ */
+function loadSamples() {
+  const raw = process.env.GH_AW_SAMPLES;
+  if (!raw || !raw.trim()) {
+    core.warning("apply_samples: GH_AW_SAMPLES is empty — no samples to replay.");
+    return [];
+  }
+
+  let parsed;
+  try {
+    parsed = JSON.parse(raw);
+  } catch (err) {
+    throw new Error(`apply_samples: failed to parse GH_AW_SAMPLES as JSON: ${getErrorMessage(err)}`);
+  }
+
+  // Tolerate a literal JSON `null` payload and treat it as "no samples".
+  if (parsed === null) {
+    core.warning("apply_samples: GH_AW_SAMPLES is null — treating as no samples to replay.");
+    return [];
+  }
+  if (!Array.isArray(parsed)) {
+    throw new Error("apply_samples: GH_AW_SAMPLES must be a JSON array");
+  }
+
+  for (const [i, entry] of parsed.entries()) {
+    if (!entry || typeof entry !== "object" || typeof entry.tool !== "string") {
+      throw new Error(`apply_samples: entry ${i} is missing a string "tool" field`);
+    }
+    // `typeof [] === "object"`, so arrays must be excluded explicitly: the
+    // value is forwarded verbatim as the tools/call `arguments` object.
+    const args = entry.arguments;
+    if (!args || typeof args !== "object" || Array.isArray(args)) {
+      throw new Error(`apply_samples: entry ${i} (tool=${entry.tool}) is missing an "arguments" object`);
+    }
+  }
+  return parsed;
+}
+
+/**
+ * Run a git subcommand synchronously and return its stdout.
+ *
+ * @param {string[]} args - arguments passed to `git` (no shell is involved)
+ * @param {string} cwd - working directory for the command
+ * @returns {string} captured stdout, decoded as UTF-8
+ * @throws {Error} when git cannot be started, is killed by a signal, or exits
+ *   with a non-zero status
+ */
+function runGit(args, cwd) {
+  const { spawnSync } = require("child_process");
+  const result = spawnSync("git", args, { cwd, encoding: "utf8" });
+  const command = `git ${args.join(" ")}`;
+  // spawnSync reports launch failures (git not installed, bad cwd, ...) via
+  // `error`; status and stderr are null then, so report the real cause.
+  if (result.error) {
+    throw new Error(`${command} could not be executed: ${result.error.message}`);
+  }
+  if (result.status !== 0) {
+    // A signal-terminated process has a null status; name the signal instead.
+    const outcome = result.signal ? `signal ${result.signal}` : `exit ${result.status}`;
+    throw new Error(`${command} failed (${outcome}): ${result.stderr || result.stdout}`);
+  }
+  return result.stdout;
+}
+
+/**
+ * Make sure `git commit` has an author identity available in CI.
+ *
+ * For user.email and then user.name: read the setting, and only when that
+ * read fails write a placeholder value into the repository config.
+ * @param {string} cwd - repository working directory
+ */
+function ensureGitIdentity(cwd) {
+  const placeholders = [
+    ["user.email", "gh-aw-samples@github.com"],
+    ["user.name", "gh-aw samples"],
+  ];
+  for (const [key, fallback] of placeholders) {
+    try {
+      runGit(["config", key], cwd);
+    } catch {
+      runGit(["config", key, fallback], cwd);
+    }
+  }
+}
+
+/**
+ * Pre-stage a branch + commit for a sample that carries a `patch` sidecar.
+ *
+ * Does nothing when the entry has no non-blank string `sidecars.patch`.
+ * Otherwise it checks out DEFAULT_BASE_BRANCH (staying on the current HEAD
+ * with a warning if that fails), creates or re-uses the sample branch,
+ * applies the patch (plain apply first, `--3way` as a fallback), stages
+ * everything and commits. The workspace is left on the sample branch.
+ *
+ * Mutates `entry.arguments.branch` to the branch that was actually used:
+ * the trimmed requested name, or `gh-aw-sample-<index + 1>` when none is given.
+ *
+ * @param {SampleEntry} entry
+ * @param {number} index - zero-based position of the entry in the samples list
+ * @param {string} workspace - git working directory
+ * @throws {Error} when a git step fails or the patch applies neither way
+ */
+function preStagePatch(entry, index, workspace) {
+  const patch = entry.sidecars && entry.sidecars.patch;
+  if (typeof patch !== "string" || !patch.trim()) {
+    return;
+  }
+  const requested = typeof entry.arguments.branch === "string" ? entry.arguments.branch.trim() : "";
+  const branch = requested || `gh-aw-sample-${index + 1}`;
+  entry.arguments.branch = branch;
+
+  ensureGitIdentity(workspace);
+
+  // Start from the base branch so the diff is meaningful. Tolerate the case
+  // where the base ref doesn't exist locally — fall back to HEAD.
+  try {
+    runGit(["checkout", DEFAULT_BASE_BRANCH], workspace);
+  } catch (err) {
+    core.warning(`apply_samples: could not check out base branch ${DEFAULT_BASE_BRANCH}: ${getErrorMessage(err)}; staying on current HEAD`);
+  }
+
+  // Create the branch (or check it out if it already exists from a previous sample).
+  try {
+    runGit(["checkout", "-b", branch], workspace);
+  } catch {
+    runGit(["checkout", branch], workspace);
+  }
+
+  // Stage the patch in a private, uniquely named temp directory so concurrent
+  // runs cannot clobber each other's file, and always remove it afterwards.
+  const stagingDir = fs.mkdtempSync(path.join(os.tmpdir(), "gh-aw-sample-"));
+  const tmpPatch = path.join(stagingDir, `gh-aw-sample-${index + 1}.patch`);
+  try {
+    fs.writeFileSync(tmpPatch, patch.endsWith("\n") ? patch : patch + "\n");
+    try {
+      runGit(["apply", "--whitespace=nowarn", tmpPatch], workspace);
+    } catch (plainErr) {
+      // Fall back to --3way for patches that don't apply cleanly (uncommon
+      // but possible for synthetic samples). If that fails too, report both
+      // errors: the first one usually explains the real problem.
+      try {
+        runGit(["apply", "--3way", "--whitespace=nowarn", tmpPatch], workspace);
+      } catch (threeWayErr) {
+        throw new Error(`apply_samples: patch for sample ${index + 1} (tool=${entry.tool}) did not apply: ${getErrorMessage(plainErr)}; --3way fallback: ${getErrorMessage(threeWayErr)}`);
+      }
+    }
+  } finally {
+    fs.rmSync(stagingDir, { recursive: true, force: true });
+  }
+
+  runGit(["add", "-A"], workspace);
+  runGit(["commit", "-m", `gh-aw sample ${index + 1}: ${entry.tool}`, "--allow-empty"], workspace);
+}
+
+/**
+ * Send a single JSON-RPC request to the MCP server child process and resolve
+ * with the parsed response that answers it.
+ *
+ * Lines that are valid JSON but are not the reply to this request
+ * (notifications, server-initiated requests, replies carrying another id) are
+ * skipped. No timeout is applied: the promise settles only when a matching
+ * reply arrives, a malformed line is read, or the server closes stdout.
+ *
+ * @param {import("child_process").ChildProcess} child - unused; kept for call-site compatibility
+ * @param {NodeJS.WritableStream} stdin - the server's stdin
+ * @param {{ id: number | string }} request - JSON-RPC request; its `id` is matched against replies
+ * @param {AsyncIterableIterator<string>} responseIterator - iterator over the server's stdout lines
+ * @returns {Promise<any>} the parsed JSON-RPC response object
+ * @throws {Error} when stdout ends before a reply arrives or a line is not valid JSON
+ */
+async function sendJsonRpc(child, stdin, request, responseIterator) {
+  stdin.write(JSON.stringify(request) + "\n");
+  for (;;) {
+    const { value, done } = await responseIterator.next();
+    if (done) {
+      throw new Error(`apply_samples: MCP server closed stdout before responding to request id=${request.id}`);
+    }
+    let message;
+    try {
+      message = JSON.parse(value);
+    } catch {
+      throw new Error(`apply_samples: MCP server wrote a non-JSON line while request id=${request.id} was pending: ${value}`);
+    }
+    if (!message || typeof message !== "object" || "method" in message) {
+      // Notification or server-initiated request, not a reply.
+      continue;
+    }
+    // JSON-RPC 2.0 uses `id: null` on an error reply when the server could not
+    // determine the request id; accept that too rather than waiting forever.
+    if (message.id === request.id || (message.id === null && message.error)) {
+      return message;
+    }
+  }
+}
+
+/**
+ * Turn the MCP server's stdout into an async iterator of line strings.
+ *
+ * Chunks are decoded as UTF-8 with a streaming decoder, so a multi-byte
+ * character split across two chunks is reassembled instead of being turned
+ * into replacement characters. Each line is trimmed; blank lines are skipped.
+ * A final unterminated line is yielded when the stream ends.
+ *
+ * @param {NodeJS.ReadableStream} stdout - async-iterable stream of Buffer or string chunks
+ * @returns {AsyncGenerator<string, void, undefined>}
+ */
+async function* lineIterator(stdout) {
+  const decoder = new TextDecoder("utf-8");
+  let buffer = "";
+  for await (const chunk of stdout) {
+    // Streams with an encoding set already deliver strings; only raw bytes
+    // need the stateful decoder.
+    buffer += typeof chunk === "string" ? chunk : decoder.decode(chunk, { stream: true });
+    let newlineIdx;
+    while ((newlineIdx = buffer.indexOf("\n")) !== -1) {
+      const line = buffer.slice(0, newlineIdx).trim();
+      buffer = buffer.slice(newlineIdx + 1);
+      if (line) {
+        yield line;
+      }
+    }
+  }
+  // Flush any bytes the decoder is still holding (an incomplete trailing sequence).
+  buffer += decoder.decode();
+  const tail = buffer.trim();
+  if (tail) {
+    yield tail;
+  }
+}
+
+/**
+ * Find safe_outputs_mcp_server.cjs on disk.
+ *
+ * Search order: next to this driver (__dirname), then, when RUNNER_TEMP is
+ * set, ${RUNNER_TEMP}/gh-aw/actions/ followed by
+ * ${RUNNER_TEMP}/gh-aw/safeoutputs/. The first existing path wins.
+ * @returns {string} path of the server script
+ * @throws {Error} when none of the locations contains the script
+ */
+function resolveMcpServerPath() {
+  const scriptName = "safe_outputs_mcp_server.cjs";
+  const searchPaths = [path.join(__dirname, scriptName)];
+
+  const runnerTemp = process.env.RUNNER_TEMP;
+  if (runnerTemp) {
+    searchPaths.push(path.join(runnerTemp, "gh-aw", "actions", scriptName));
+    searchPaths.push(path.join(runnerTemp, "gh-aw", "safeoutputs", scriptName));
+  }
+
+  const found = searchPaths.find(location => fs.existsSync(location));
+  if (found === undefined) {
+    throw new Error(`apply_samples: could not locate safe_outputs_mcp_server.cjs. Looked in: ${searchPaths.join(", ")}`);
+  }
+  return found;
+}
+
+/**
+ * Append the replay summary and a synthetic `terminal_reason: completed`
+ * result record to the engine stdio log, so downstream parsers /
+ * handle_agent_failure see the replay as a successful agent run.
+ *
+ * Nothing is written when `logPath` is empty. The parent directory is created
+ * on a best-effort basis; a failure there is ignored and surfaces, if at all,
+ * from the append itself.
+ * @param {string} logPath - file to append to
+ * @param {number} sampleCount - number of samples replayed (reported as num_turns)
+ */
+function writeSyntheticStdioLog(logPath, sampleCount) {
+  if (!logPath) {
+    return;
+  }
+
+  try {
+    fs.mkdirSync(path.dirname(logPath), { recursive: true });
+  } catch {
+    /* ignore */
+  }
+
+  const summary = `gh-aw samples replay: ${sampleCount} MCP tools/call invocation(s) completed deterministically.`;
+  const marker = JSON.stringify({
+    type: "result",
+    subtype: "success",
+    terminal_reason: "completed",
+    num_turns: sampleCount,
+    driver: "apply_samples",
+  });
+  fs.appendFileSync(logPath, `${summary}\n${marker}\n`);
+}
+
+/**
+ * Driver entry point: load the samples, pre-stage patch sidecars, replay each
+ * sample as one MCP `tools/call` against a freshly spawned safe-outputs
+ * server, and append the synthetic stdio marker.
+ *
+ * Tool-level failures (JSON-RPC `error` or `result.isError`) are collected so
+ * every sample is attempted, then reported together. Protocol-level failures
+ * (handshake error, server closing stdout, server process error) abort the
+ * replay immediately; in that case the synthetic log is not written.
+ *
+ * @returns {Promise<void>}
+ * @throws {Error} on invalid samples, pre-staging failure, protocol failure,
+ *   or when at least one sample failed
+ */
+async function main() {
+  const samples = loadSamples();
+  const workspace = process.env.GITHUB_WORKSPACE || process.cwd();
+  const logPath = process.env.GH_AW_AGENT_STDIO_LOG || "";
+
+  // Pre-stage branches/patches.
+  samples.forEach((sample, i) => {
+    if (PATCH_SIDECAR_TOOLS.has(sample.tool)) {
+      preStagePatch(sample, i, workspace);
+    }
+  });
+
+  if (samples.length === 0) {
+    core.info("apply_samples: nothing to replay; exiting cleanly.");
+    writeSyntheticStdioLog(logPath, 0);
+    return;
+  }
+
+  const serverPath = resolveMcpServerPath();
+  core.info(`apply_samples: spawning MCP server ${serverPath}`);
+  const child = spawn(process.execPath, [serverPath], {
+    stdio: ["pipe", "pipe", "inherit"],
+    env: process.env,
+  });
+
+  // An 'error' event without a listener is thrown as an uncaught exception.
+  // Record the first process/stdin error (spawn failure, EPIPE after a server
+  // crash) so it can be reported alongside the protocol error it causes.
+  /** @type {unknown} */
+  let childError = null;
+  const rememberChildError = (/** @type {unknown} */ err) => {
+    if (childError === null) {
+      childError = err;
+    }
+  };
+  child.on("error", rememberChildError);
+  child.stdin.on("error", rememberChildError);
+
+  const stdoutIter = lineIterator(child.stdout);
+  let nextId = 1;
+  const failures = [];
+
+  try {
+    // Initialize handshake.
+    const initRsp = await sendJsonRpc(
+      child,
+      child.stdin,
+      {
+        jsonrpc: "2.0",
+        id: nextId++,
+        method: "initialize",
+        params: {
+          protocolVersion: "2025-06-18",
+          capabilities: {},
+          clientInfo: { name: "apply_samples", version: "1.0.0" },
+        },
+      },
+      stdoutIter
+    );
+    if (initRsp.error) {
+      throw new Error(`MCP initialize failed: ${JSON.stringify(initRsp.error)}`);
+    }
+
+    // Send one tools/call per sample.
+    for (const [i, sample] of samples.entries()) {
+      const callRsp = await sendJsonRpc(
+        child,
+        child.stdin,
+        {
+          jsonrpc: "2.0",
+          id: nextId++,
+          method: "tools/call",
+          params: { name: sample.tool, arguments: sample.arguments },
+        },
+        stdoutIter
+      );
+      if (callRsp.error) {
+        failures.push(`sample[${i}] (tool=${sample.tool}): ${JSON.stringify(callRsp.error)}`);
+        continue;
+      }
+      const result = callRsp.result;
+      if (result && result.isError) {
+        const text = result.content && result.content[0] && result.content[0].text;
+        failures.push(`sample[${i}] (tool=${sample.tool}): ${text || JSON.stringify(result)}`);
+      } else {
+        core.info(`apply_samples: sample[${i}] (tool=${sample.tool}) ok`);
+      }
+    }
+  } catch (err) {
+    if (childError !== null) {
+      throw new Error(`apply_samples: MCP server process error: ${getErrorMessage(childError)}; ${getErrorMessage(err)}`);
+    }
+    throw err;
+  } finally {
+    try {
+      child.stdin.end();
+    } catch {
+      /* ignore */
+    }
+    // Give the server up to 2s to exit cleanly. If it is already gone (crash
+    // or failed spawn), 'exit' will never fire again, so don't wait for it.
+    if (child.exitCode === null && child.signalCode === null) {
+      await new Promise(resolve => {
+        const timer = setTimeout(() => {
+          try {
+            child.kill("SIGTERM");
+          } catch {
+            /* ignore */
+          }
+          resolve(undefined);
+        }, 2000);
+        child.once("exit", () => {
+          clearTimeout(timer);
+          resolve(undefined);
+        });
+      });
+    }
+  }
+
+  writeSyntheticStdioLog(logPath, samples.length);
+
+  if (failures.length > 0) {
+    throw new Error(`apply_samples: ${failures.length} sample(s) failed:\n - ${failures.join("\n - ")}`);
+  }
+  core.info(`apply_samples: ${samples.length} sample(s) replayed successfully.`);
+}
+
+// Run the driver only when this file is executed directly; when it is
+// required (e.g. from tests) just expose the helpers below.
+if (require.main === module) {
+  main().catch(err => {
+    // Prefer the stack trace when there is one so the failure is diagnosable.
+    const detail = err && err.stack ? err.stack : String(err);
+    core.setFailed(detail);
+  });
+}
+
+module.exports = { main, loadSamples, preStagePatch, resolveMcpServerPath };
diff --git a/actions/setup/js/apply_samples.test.cjs b/actions/setup/js/apply_samples.test.cjs
new file mode 100644
index 00000000000..9b2963bdead
--- /dev/null
+++ b/actions/setup/js/apply_samples.test.cjs
@@ -0,0 +1,283 @@
+// @ts-check
+//
+// apply_samples.test.cjs
+//
+// Smoke test for the deterministic samples replay driver. Spawns the
+// driver as a subprocess (so it actually launches the real MCP server) and
+// asserts that:
+// - the driver exits 0
+// - the MCP server appends the expected JSONL entry to GH_AW_SAFE_OUTPUTS
+// - the synthetic agent-stdio log includes a `terminal_reason: completed` marker
+//
+// Tests intentionally use the simplest safe-output tool (`create_issue`) so we
+// do not need to set up a git working tree for patch sidecars.
+
+import { describe, it, expect, beforeAll } from "vitest";
+import { spawnSync } from "child_process";
+import { createRequire } from "module";
+import fs from "fs";
+import path from "path";
+import os from "os";
+import { fileURLToPath } from "url";
+
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+
+const driverPath = path.join(__dirname, "apply_samples.cjs");
+const require = createRequire(import.meta.url);
+
+/**
+ * Create a fresh, uniquely named directory under the OS temp dir.
+ * @param {string} prefix - leading part of the directory name
+ * @returns {string} path of the created directory
+ */
+function makeTempDir(prefix) {
+  const template = path.join(os.tmpdir(), prefix);
+  return fs.mkdtempSync(template);
+}
+
+/**
+ * Run git synchronously in `cwd` and return its stdout.
+ * @param {string[]} args - arguments passed to `git`
+ * @param {string} cwd - working directory
+ * @returns {string} captured stdout
+ * @throws {Error} when git cannot be started or exits with a non-zero status
+ */
+function git(args, cwd) {
+  const r = spawnSync("git", args, { cwd, encoding: "utf8" });
+  // A launch failure (git missing, bad cwd) leaves status/stderr null; report
+  // the underlying error instead of "failed: null".
+  if (r.error) {
+    throw new Error(`git ${args.join(" ")} could not be executed: ${r.error.message}`);
+  }
+  if (r.status !== 0) {
+    throw new Error(`git ${args.join(" ")} failed: ${r.stderr || r.stdout}`);
+  }
+  return r.stdout;
+}
+
+/**
+ * Initialise a throwaway git repository in `dir` with a local identity and a
+ * single "seed" commit containing README.md.
+ * @param {string} dir - existing directory to turn into a repository
+ * @param {string} defaultBranch - name of the initial branch
+ */
+function initRepo(dir, defaultBranch) {
+  const setup = [
+    ["init", "-q", "-b", defaultBranch],
+    ["config", "user.email", "ghaw-test@example.com"],
+    ["config", "user.name", "ghaw test"],
+  ];
+  for (const args of setup) {
+    git(args, dir);
+  }
+
+  const readme = path.join(dir, "README.md");
+  fs.writeFileSync(readme, "# seed\n");
+
+  git(["add", "."], dir);
+  git(["commit", "-q", "-m", "seed"], dir);
+}
+
+// End-to-end smoke tests. Each case runs the driver as a child process with
+// its inputs supplied through environment variables, then inspects the exit
+// status and the files the driver (and the MCP server it spawns) wrote.
+// The cases share one temp directory, config file and outputs file.
+describe.sequential("apply_samples.cjs", () => {
+ let tempDir;
+ let configPath;
+ let outputsPath;
+ let logPath;
+
+ beforeAll(() => {
+ tempDir = makeTempDir("gh-aw-apply-samples-");
+ configPath = path.join(tempDir, "config.json");
+ outputsPath = path.join(tempDir, "outputs.jsonl");
+ logPath = path.join(tempDir, "agent-stdio.log");
+
+ // Minimal safe-outputs config enabling only the `create_issue` tool. The
+ // bootstrap loader keys off the snake-case keys present here.
+ fs.writeFileSync(
+ configPath,
+ JSON.stringify({
+ create_issue: { max: 1 },
+ })
+ );
+ });
+
+ it("replays a create_issue sample through the real MCP server and emits a completed marker", () => {
+ const samples = [
+ {
+ tool: "create_issue",
+ arguments: {
+ title: "Deterministic sample issue",
+ body: "This issue was emitted by the apply_samples driver during a unit test.",
+ },
+ },
+ ];
+
+ const result = spawnSync(process.execPath, [driverPath], {
+ env: {
+ ...process.env,
+ GH_AW_SAMPLES: JSON.stringify(samples),
+ GH_AW_SAFE_OUTPUTS_CONFIG_PATH: configPath,
+ GH_AW_SAFE_OUTPUTS: outputsPath,
+ GH_AW_AGENT_STDIO_LOG: logPath,
+ },
+ encoding: "utf8",
+ timeout: 15000,
+ });
+
+ if (result.status !== 0) {
+ // Surface stderr so failures are diagnosable in CI.
+ throw new Error(`driver exited with status ${result.status}\nstderr:\n${result.stderr}\nstdout:\n${result.stdout}`);
+ }
+
+ // The server must have appended at least one JSONL record for the sample.
+ expect(fs.existsSync(outputsPath)).toBe(true);
+ const outputLines = fs
+ .readFileSync(outputsPath, "utf8")
+ .split("\n")
+ .filter(line => line.trim().length > 0);
+ expect(outputLines.length).toBeGreaterThanOrEqual(1);
+
+ const firstEntry = JSON.parse(outputLines[0]);
+ expect(firstEntry.type).toBe("create_issue");
+ expect(firstEntry.title).toBe("Deterministic sample issue");
+
+ // The driver must have written the synthetic completion marker.
+ expect(fs.existsSync(logPath)).toBe(true);
+ const logText = fs.readFileSync(logPath, "utf8");
+ expect(logText).toContain("terminal_reason");
+ expect(logText).toContain("completed");
+ });
+
+ it("exits cleanly when GH_AW_SAMPLES is empty", () => {
+ // Uses its own log file so the assertion cannot be satisfied by the
+ // marker written in the previous test.
+ const result = spawnSync(process.execPath, [driverPath], {
+ env: {
+ ...process.env,
+ GH_AW_SAMPLES: "[]",
+ GH_AW_SAFE_OUTPUTS_CONFIG_PATH: configPath,
+ GH_AW_SAFE_OUTPUTS: outputsPath,
+ GH_AW_AGENT_STDIO_LOG: path.join(tempDir, "empty-log.log"),
+ },
+ encoding: "utf8",
+ timeout: 10000,
+ });
+
+ expect(result.status).toBe(0);
+ const logText = fs.readFileSync(path.join(tempDir, "empty-log.log"), "utf8");
+ expect(logText).toContain("terminal_reason");
+ });
+
+ // Defense in depth: an older compiler that marshaled a nil Go slice would
+ // emit `null` into GH_AW_SAMPLES. Newer drivers must tolerate that and
+ // treat it as "no samples", not crash with `must be a JSON array`.
+ it("exits cleanly when GH_AW_SAMPLES is the literal `null`", () => {
+ // Shadows the suite-level logPath on purpose: this case needs its own file.
+ const logPath = path.join(tempDir, "null-log.log");
+ const result = spawnSync(process.execPath, [driverPath], {
+ env: {
+ ...process.env,
+ GH_AW_SAMPLES: "null",
+ GH_AW_SAFE_OUTPUTS_CONFIG_PATH: configPath,
+ GH_AW_SAFE_OUTPUTS: outputsPath,
+ GH_AW_AGENT_STDIO_LOG: logPath,
+ },
+ encoding: "utf8",
+ timeout: 10000,
+ });
+
+ if (result.status !== 0) {
+ throw new Error(`driver exited with status ${result.status}\nstderr:\n${result.stderr}\nstdout:\n${result.stdout}`);
+ }
+ // The assertion expects the driver's warning text on stderr.
+ expect(result.stderr).toContain("GH_AW_SAMPLES is null");
+ const logText = fs.readFileSync(logPath, "utf8");
+ expect(logText).toContain("terminal_reason");
+ });
+});
+
+// Unit tests for preStagePatch, run in-process against throwaway git
+// repositories created with initRepo().
+describe("apply_samples.cjs preStagePatch (create_pull_request / push_to_pull_request_branch)", () => {
+ // Load the module under test directly so we can drive preStagePatch in
+ // isolation against a real, throwaway git working tree. This is the
+ // critical code path that turns a `patch` sidecar on a sample entry into
+ // a real branch + commit that the downstream MCP `create_pull_request`
+ // handler (which derives a git diff) can act on.
+ const { preStagePatch } = require("./apply_samples.cjs");
+
+ /**
+ * Build a unified diff that adds a brand-new file. Synthetic but realistic.
+ * @param {string} filePath - repository-relative path of the file to add
+ * @param {string} contents - file contents; one "+" line is emitted per line
+ * @returns {string} newline-terminated diff text
+ */
+ function newFileDiff(filePath, contents) {
+ const lines = contents.split("\n");
+ // Strip trailing empty element produced by a terminating "\n" so the
+ // hunk header line count matches what git apply expects.
+ if (lines[lines.length - 1] === "") lines.pop();
+ const body = lines.map(l => "+" + l).join("\n");
+ return `diff --git a/${filePath} b/${filePath}\n` + `new file mode 100644\n` + `index 0000000..1111111\n` + `--- /dev/null\n` + `+++ b/${filePath}\n` + `@@ -0,0 +1,${lines.length} @@\n` + body + "\n";
+ }
+
+ it("checks out the requested branch and commits the patch on it (create_pull_request)", () => {
+ const workspace = makeTempDir("gh-aw-prestage-cpr-");
+ initRepo(workspace, "main");
+
+ const branchName = "feat/gh-aw-sample-branch";
+ const fileToAdd = "sample-feature.txt";
+ const fileBody = "hello from a deterministic sample\nsecond line\n";
+ const entry = {
+ tool: "create_pull_request",
+ arguments: {
+ title: "Sample PR",
+ body: "Sample PR body",
+ branch: branchName,
+ },
+ sidecars: { patch: newFileDiff(fileToAdd, fileBody) },
+ };
+
+ // NOTE: apply_samples.cjs resolves its base branch into a module-level
+ // constant when it is required (above), so setting
+ // GH_AW_CUSTOM_BASE_BRANCH here does not change which ref preStagePatch
+ // checks out. The test still passes when that ref is missing from the
+ // fresh repo, because preStagePatch then warns and stays on the current
+ // HEAD ("main"). The save/restore below only keeps the env var unchanged
+ // for other tests.
+ const prev = process.env.GH_AW_CUSTOM_BASE_BRANCH;
+ process.env.GH_AW_CUSTOM_BASE_BRANCH = "main";
+ try {
+ preStagePatch(entry, 0, workspace);
+ } finally {
+ if (prev === undefined) delete process.env.GH_AW_CUSTOM_BASE_BRANCH;
+ else process.env.GH_AW_CUSTOM_BASE_BRANCH = prev;
+ }
+
+ // 1. Branch name on the entry is preserved (driver must forward it to MCP).
+ expect(entry.arguments.branch).toBe(branchName);
+
+ // 2. The named branch exists in the working repo.
+ const branches = git(["branch", "--list", branchName], workspace).trim();
+ expect(branches).toContain(branchName);
+
+ // 3. Current HEAD is that branch.
+ const head = git(["rev-parse", "--abbrev-ref", "HEAD"], workspace).trim();
+ expect(head).toBe(branchName);
+
+ // 4. The patch was applied AND committed (not just sitting in the worktree).
+ const status = git(["status", "--porcelain"], workspace).trim();
+ expect(status).toBe("");
+ expect(fs.existsSync(path.join(workspace, fileToAdd))).toBe(true);
+ expect(fs.readFileSync(path.join(workspace, fileToAdd), "utf8")).toBe(fileBody);
+
+ // 5. The commit message identifies the sample so failures are diagnosable.
+ const lastMsg = git(["log", "-1", "--pretty=%s"], workspace).trim();
+ expect(lastMsg).toMatch(/gh-aw sample 1: create_pull_request/);
+
+ // 6. The new file shows up as a real diff against the base branch — this is
+ // precisely what the downstream MCP create_pull_request handler will read.
+ const diff = git(["diff", "main..." + branchName, "--", fileToAdd], workspace);
+ expect(diff).toContain("+hello from a deterministic sample");
+ });
+
+ it("defaults the branch name to gh-aw-sample- when none is supplied", () => {
+ const workspace = makeTempDir("gh-aw-prestage-default-");
+ initRepo(workspace, "main");
+
+ const entry = {
+ tool: "push_to_pull_request_branch",
+ arguments: {
+ body: "Sample push body",
+ // branch intentionally omitted — driver should synthesize one.
+ },
+ sidecars: { patch: newFileDiff("push-feature.txt", "from push sample\n") },
+ };
+
+ // As in the previous test, this assignment happens after the module was
+ // required and therefore does not affect the base branch it resolved.
+ const prev = process.env.GH_AW_CUSTOM_BASE_BRANCH;
+ process.env.GH_AW_CUSTOM_BASE_BRANCH = "main";
+ try {
+ preStagePatch(entry, 2, workspace);
+ } finally {
+ if (prev === undefined) delete process.env.GH_AW_CUSTOM_BASE_BRANCH;
+ else process.env.GH_AW_CUSTOM_BASE_BRANCH = prev;
+ }
+
+ // Index in preStagePatch is zero-based; the default uses i+1 → "gh-aw-sample-3".
+ expect(entry.arguments.branch).toBe("gh-aw-sample-3");
+ const head = git(["rev-parse", "--abbrev-ref", "HEAD"], workspace).trim();
+ expect(head).toBe("gh-aw-sample-3");
+ expect(fs.existsSync(path.join(workspace, "push-feature.txt"))).toBe(true);
+ });
+
+ it("is a no-op when the sample tool isn't in the patch-sidecar set", () => {
+ // We assert this at the driver level (PATCH_SIDECAR_TOOLS gate in main()),
+ // but preStagePatch itself should also be a no-op when called with an
+ // entry that has no patch sidecar — protecting against misuse.
+ // What is exercised here is the missing-sidecar early return:
+ // preStagePatch does not look at `entry.tool` to decide whether to act.
+ const workspace = makeTempDir("gh-aw-prestage-noop-");
+ initRepo(workspace, "main");
+
+ const entry = {
+ tool: "create_issue",
+ arguments: { title: "x", body: "y" },
+ };
+ preStagePatch(entry, 0, workspace);
+
+ // Still on main, no extra commits, no new files.
+ expect(git(["rev-parse", "--abbrev-ref", "HEAD"], workspace).trim()).toBe("main");
+ const log = git(["log", "--pretty=%s"], workspace).trim().split("\n");
+ expect(log).toEqual(["seed"]);
+ });
+});
diff --git a/actions/setup/js/safe_outputs_mcp_server.cjs b/actions/setup/js/safe_outputs_mcp_server.cjs
index aca4f49cd93..3e1c2c54457 100644
--- a/actions/setup/js/safe_outputs_mcp_server.cjs
+++ b/actions/setup/js/safe_outputs_mcp_server.cjs
@@ -1,4 +1,5 @@
// @ts-check
+///
// Safe Outputs MCP Server Module
//
@@ -12,6 +13,11 @@
// const server = require("./safe_outputs_mcp_server.cjs");
// server.startSafeOutputsServer();
+// Load core/context shim so handlers that reference `core.*` (e.g.
+// create_pull_request.cjs) work when this file is spawned directly as a
+// child process (e.g. by apply_samples.cjs) outside the github-script runtime.
+require("./shim.cjs");
+
const { createServer, registerTool, normalizeTool, start } = require("./mcp_server_core.cjs");
const { createAppendFunction } = require("./safe_outputs_append.cjs");
const { createHandlers } = require("./safe_outputs_handlers.cjs");
diff --git a/cmd/gh-aw/main.go b/cmd/gh-aw/main.go
index 895fdb2beed..27748d7cee5 100644
--- a/cmd/gh-aw/main.go
+++ b/cmd/gh-aw/main.go
@@ -304,6 +304,7 @@ Examples:
priorManifestFile, _ := cmd.Flags().GetString("prior-manifest-file")
ghes, _ := cmd.Flags().GetBool("ghes")
verbose, _ := cmd.Flags().GetBool("verbose")
+ useSamples, _ := cmd.Flags().GetBool("use-samples")
if err := validateEngine(engineOverride); err != nil {
return err
}
@@ -364,6 +365,7 @@ Examples:
ValidateImages: validateImages,
PriorManifestFile: priorManifestFile,
GHESCompat: ghes,
+ UseSamples: useSamples,
}
if _, err := cli.CompileWorkflows(cmd.Context(), config); err != nil {
// Return error as-is without additional formatting
@@ -703,6 +705,8 @@ Use "` + string(constants.CLIExtensionPrefix) + ` help all" to show help for all
compileCmd.Flags().Bool("strict", false, "Override frontmatter to enforce strict mode validation for all workflows (enforces action pinning, network config, safe-outputs, refuses write permissions and deprecated fields). Note: Workflows default to strict mode unless frontmatter sets strict: false")
compileCmd.Flags().Bool("trial", false, "Enable trial mode compilation (modifies workflows for trial execution)")
compileCmd.Flags().String("logical-repo", "", "Repository to simulate workflow execution against (for trial mode)")
+ compileCmd.Flags().Bool("use-samples", false, "Hidden: replace the agentic 'Execute coding agent' step with a deterministic driver that replays the workflow's safe-outputs `samples` frontmatter entries through the safe-outputs MCP server. Used to make end-to-end tests deterministic.")
+ _ = compileCmd.Flags().MarkHidden("use-samples")
compileCmd.Flags().Bool("dependabot", false, "Generate dependency manifests (package.json, requirements.txt, go.mod) and Dependabot config when dependencies are detected")
compileCmd.Flags().Bool("force", false, "Force overwrite of existing dependency files (e.g., dependabot.yml)")
compileCmd.Flags().Bool("refresh-stop-time", false, "Force regeneration of stop-after times instead of preserving existing values from lock files")
diff --git a/pkg/cli/compile_compiler_setup.go b/pkg/cli/compile_compiler_setup.go
index bdc672aea13..f5853e752ec 100644
--- a/pkg/cli/compile_compiler_setup.go
+++ b/pkg/cli/compile_compiler_setup.go
@@ -147,6 +147,12 @@ func configureCompilerFlags(compiler *workflow.Compiler, config CompileConfig) {
}
}
+ // Replace the agentic step with a deterministic samples replay driver when requested (hidden feature).
+ if config.UseSamples {
+ compileCompilerSetupLog.Print("Enabling --use-samples: agentic step will be replaced by a deterministic replay driver")
+ compiler.SetUseSamples(true)
+ }
+
// Set refresh stop time flag
compiler.SetRefreshStopTime(config.RefreshStopTime)
if config.RefreshStopTime {
diff --git a/pkg/cli/compile_config.go b/pkg/cli/compile_config.go
index 24206c2d583..901c3ccb27e 100644
--- a/pkg/cli/compile_config.go
+++ b/pkg/cli/compile_config.go
@@ -13,6 +13,7 @@ type CompileConfig struct {
Purge bool // Remove orphaned lock files
TrialMode bool // Enable trial mode (suppress safe outputs)
TrialLogicalRepoSlug string // Target repository for trial mode
+ UseSamples bool // Hidden: replace agentic step with a deterministic samples replay driver
Strict bool // Enable strict mode validation
Dependabot bool // Generate Dependabot manifests for npm dependencies
ForceOverwrite bool // Force overwrite of existing files (dependabot.yml)
diff --git a/pkg/parser/schemas/main_workflow_schema.json b/pkg/parser/schemas/main_workflow_schema.json
index 4cc557a9934..bfc22f569de 100644
--- a/pkg/parser/schemas/main_workflow_schema.json
+++ b/pkg/parser/schemas/main_workflow_schema.json
@@ -4402,6 +4402,22 @@
"type": "boolean",
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
+ },
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
}
},
"additionalProperties": false
@@ -4992,6 +5008,22 @@
"type": "boolean",
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
+ },
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
}
},
"additionalProperties": false,
@@ -5070,6 +5102,22 @@
"type": "boolean",
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
+ },
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
}
},
"additionalProperties": false
@@ -5125,6 +5173,22 @@
"type": "boolean",
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
+ },
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
}
},
"additionalProperties": false
@@ -5266,6 +5330,22 @@
"type": "boolean",
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
+ },
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
}
},
"additionalProperties": false,
@@ -5386,6 +5466,22 @@
"type": "boolean",
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
+ },
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
}
},
"additionalProperties": false
@@ -5436,6 +5532,22 @@
"type": "boolean",
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
+ },
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
}
},
"additionalProperties": false,
@@ -5566,6 +5678,22 @@
"type": "boolean",
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
+ },
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
}
},
"additionalProperties": false,
@@ -5665,6 +5793,22 @@
"type": "boolean",
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
+ },
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
}
},
"additionalProperties": false,
@@ -5750,6 +5894,22 @@
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
},
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
+ },
"github-token": {
"$ref": "#/$defs/github_token",
"description": "GitHub token to use for this specific output type. Overrides global github-token if specified."
@@ -5822,6 +5982,22 @@
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
},
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
+ },
"state-reason": {
"type": "string",
"enum": ["completed", "not_planned", "duplicate"],
@@ -5908,6 +6084,22 @@
"type": "boolean",
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
+ },
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
}
},
"additionalProperties": false,
@@ -5989,6 +6181,22 @@
"type": "boolean",
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
+ },
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
}
},
"additionalProperties": false,
@@ -6105,6 +6313,22 @@
"type": "boolean",
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
+ },
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
}
},
"additionalProperties": false,
@@ -6480,6 +6704,22 @@
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
},
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
+ },
"allow-workflows": {
"type": "boolean",
"description": "When true, adds workflows: write to the GitHub App token permissions. Required when allowed-files targets .github/workflows/ paths. Requires safe-outputs.github-app to be configured because the workflows permission is a GitHub App-only permission and cannot be granted via GITHUB_TOKEN.",
@@ -6559,6 +6799,22 @@
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
},
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
+ },
"required-labels": {
"type": "array",
"items": {
@@ -6652,6 +6908,22 @@
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
},
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
+ },
"required-labels": {
"type": "array",
"items": {
@@ -6723,6 +6995,22 @@
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
},
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
+ },
"required-labels": {
"type": "array",
"items": {
@@ -6789,6 +7077,22 @@
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
},
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
+ },
"required-labels": {
"type": "array",
"items": {
@@ -6853,6 +7157,22 @@
"type": "boolean",
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
+ },
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
}
},
"additionalProperties": false
@@ -6892,6 +7212,22 @@
"type": "boolean",
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
+ },
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
}
},
"additionalProperties": false
@@ -6936,6 +7272,22 @@
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
},
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
+ },
"github-app": {
"$ref": "#/$defs/github_app",
"description": "GitHub App credentials for minting an installation access token scoped to checks:write for this handler. When set, a short-lived token is minted before the handler runs and revoked afterwards."
@@ -7045,6 +7397,22 @@
"type": "boolean",
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
+ },
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
}
},
"additionalProperties": false
@@ -7130,6 +7498,22 @@
"type": "boolean",
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
+ },
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
}
},
"additionalProperties": false
@@ -7239,6 +7623,22 @@
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
},
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
+ },
"required-labels": {
"type": "array",
"items": {
@@ -7313,6 +7713,22 @@
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
},
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
+ },
"required-labels": {
"type": "array",
"items": {
@@ -7420,6 +7836,22 @@
"type": "boolean",
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
+ },
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
}
},
"additionalProperties": false
@@ -7494,6 +7926,22 @@
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
},
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
+ },
"required-labels": {
"type": "array",
"items": {
@@ -7573,6 +8021,22 @@
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
},
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
+ },
"required-labels": {
"type": "array",
"items": {
@@ -7664,6 +8128,22 @@
"type": "boolean",
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
+ },
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
}
},
"additionalProperties": false
@@ -7738,6 +8218,22 @@
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
},
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
+ },
"required-labels": {
"type": "array",
"items": {
@@ -7826,6 +8322,22 @@
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
},
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
+ },
"required-labels": {
"type": "array",
"items": {
@@ -8024,6 +8536,22 @@
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
},
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
+ },
"github-token-for-extra-empty-commit": {
"type": "string",
"description": "Token used to push an empty commit after pushing changes to trigger CI events. Works around the GITHUB_TOKEN limitation where pushes don't trigger workflow runs. Defaults to the magic secret GH_AW_CI_TRIGGER_TOKEN if set in the repository. Use a secret expression (e.g. '${{ secrets.CI_TOKEN }}') for a custom token, or 'app' for GitHub App auth."
@@ -8209,6 +8737,22 @@
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
},
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
+ },
"required-labels": {
"type": "array",
"items": {
@@ -8283,6 +8827,22 @@
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
},
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
+ },
"required-labels": {
"type": "array",
"items": {
@@ -8355,6 +8915,22 @@
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
},
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
+ },
"required-labels": {
"type": "array",
"items": {
@@ -8420,6 +8996,22 @@
"type": "boolean",
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
+ },
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
}
},
"required": ["workflows"],
@@ -8576,6 +9168,22 @@
"type": "boolean",
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
+ },
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
}
},
"required": ["workflows"],
@@ -8640,6 +9248,22 @@
"type": "boolean",
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
+ },
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
}
},
"additionalProperties": false
@@ -8702,6 +9326,22 @@
"type": "boolean",
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
+ },
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
}
},
"additionalProperties": false
@@ -8752,6 +9392,22 @@
"type": "boolean",
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
+ },
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
}
},
"additionalProperties": false
@@ -8817,6 +9473,22 @@
"type": "boolean",
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
+ },
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
}
},
"additionalProperties": false
@@ -8969,6 +9641,22 @@
"type": "boolean",
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
+ },
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
}
},
"additionalProperties": false
@@ -9662,6 +10350,22 @@
"type": "boolean",
"description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)",
"examples": [true, false]
+ },
+ "samples": {
+ "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.",
+ "oneOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "additionalProperties": true
+ }
+ },
+ {
+ "type": "object",
+ "additionalProperties": true
+ }
+ ]
}
},
"additionalProperties": false
diff --git a/pkg/workflow/compiler_types.go b/pkg/workflow/compiler_types.go
index 9c0639c4977..4ea0310b547 100644
--- a/pkg/workflow/compiler_types.go
+++ b/pkg/workflow/compiler_types.go
@@ -70,6 +70,7 @@ type Compiler struct {
forceStaged bool // If true, force all safe-outputs into staged mode
trialMode bool // If true, suppress safe outputs for trial mode execution
trialLogicalRepoSlug string // If set in trial mode, the logical repository to checkout
+ useSamples bool // If true, replace the agentic step with a deterministic samples replay driver (hidden feature)
refreshStopTime bool // If true, regenerate stop-after times instead of preserving existing ones
forceRefreshActionPins bool // If true, clear action cache and resolve all actions from GitHub API
failFast bool // If true, stop at first validation error instead of collecting all errors
@@ -203,6 +204,14 @@ func (c *Compiler) SetTrialLogicalRepoSlug(repo string) {
c.trialLogicalRepoSlug = repo
}
+// SetUseSamples toggles the hidden `gh aw compile --use-samples` mode by
+// storing the flag on the compiler's useSamples field. When that field is
+// true the agentic step is replaced with a deterministic replay driver that
+// feeds `samples` entries to the safe-outputs MCP server via real
+// `tools/call` JSON-RPC.
+func (c *Compiler) SetUseSamples(use bool) {
+ c.useSamples = use
+}
+
// SetStrictMode configures whether to enable strict validation mode
func (c *Compiler) SetStrictMode(strict bool) {
c.strictMode = strict
@@ -446,6 +455,7 @@ type WorkflowData struct {
WorkflowID string // workflow identifier derived from markdown filename (basename without extension)
TrialMode bool // whether the workflow is running in trial mode
TrialLogicalRepo string // target repository slug for trial mode (owner/repo)
+ UseSamples bool // whether the agentic step should be replaced by a deterministic samples replay driver (hidden feature)
FrontmatterName string // name field from frontmatter (for code scanning alert driver default)
FrontmatterEmoji string // emoji field from frontmatter (for display in footers and UI)
FrontmatterYAML string // raw frontmatter YAML content (rendered as comment in lock file for reference)
@@ -627,6 +637,8 @@ type BaseSafeOutputConfig struct {
GitHubApp *GitHubAppConfig `yaml:"github-app,omitempty"` // GitHub App credentials for minting a per-handler installation access token
Staged bool `yaml:"staged,omitempty"` // If true, emit step summary messages instead of making GitHub API calls for this specific output type
NormalizeClosingKeywords *bool `yaml:"normalize-closing-keywords,omitempty"` // When true for this output type, strip backticks from recognized issue-closing keywords in body fields.
+ // Samples carries deterministic replay samples for the hidden `gh aw compile --use-samples` flag. Each entry is the JSON object passed to the corresponding MCP tool's `tools/call` arguments. Sample-only sidecar fields (e.g. `patch` for create_pull_request) are stripped before the call and used by the replay driver.
+ Samples []map[string]any `yaml:"samples,omitempty"`
}
// SafeOutputsConfig holds configuration for automatic output routes
diff --git a/pkg/workflow/compiler_validators.go b/pkg/workflow/compiler_validators.go
index 316ba6a8b27..d1286dcd4bb 100644
--- a/pkg/workflow/compiler_validators.go
+++ b/pkg/workflow/compiler_validators.go
@@ -152,6 +152,7 @@ func (c *Compiler) validateCoreToolConfiguration(workflowData *WorkflowData, mar
{logMessage: "Validating sandbox configuration", validateFn: func() error { return validateSandboxConfig(workflowData) }},
{logMessage: "Validating safe-outputs target fields", validateFn: func() error { return validateSafeOutputsTarget(workflowData.SafeOutputs) }},
{logMessage: "Validating safe-outputs max fields", validateFn: func() error { return validateSafeOutputsMax(workflowData.SafeOutputs) }},
+ {logMessage: "Validating safe-outputs samples entries against MCP tool schemas", validateFn: func() error { return validateSafeOutputsSamples(workflowData.SafeOutputs) }},
{logMessage: "Validating safe-outputs allowed-domains", validateFn: func() error { return c.validateSafeOutputsAllowedDomains(workflowData.SafeOutputs) }},
{logMessage: "Validating safe-outputs merge-pull-request", validateFn: func() error { return validateSafeOutputsMergePullRequest(workflowData.SafeOutputs) }},
{logMessage: "Validating safe-outputs needs declarations", validateFn: func() error { return validateSafeOutputsNeeds(workflowData) }},
diff --git a/pkg/workflow/compiler_yaml_ai_execution.go b/pkg/workflow/compiler_yaml_ai_execution.go
index b3b6826b74a..48473d9c883 100644
--- a/pkg/workflow/compiler_yaml_ai_execution.go
+++ b/pkg/workflow/compiler_yaml_ai_execution.go
@@ -9,6 +9,14 @@ import (
// generateEngineExecutionSteps generates the GitHub Actions steps for executing the AI engine
func (c *Compiler) generateEngineExecutionSteps(yaml *strings.Builder, data *WorkflowData, engine CodingAgentEngine, logFile string) {
+ // --use-samples (hidden) replaces the agent step with a deterministic driver
+ // that replays the workflow's safe-outputs `samples` frontmatter entries
+ // through the safe-outputs MCP server. The engine is never invoked.
+ if data.UseSamples {
+ compilerYamlLog.Printf("Replacing engine execution with samples replay driver: engine=%s", engine.GetID())
+ c.generateSamplesReplayStep(yaml, data, logFile)
+ return
+ }
steps := engine.GetExecutionSteps(data, logFile)
compilerYamlLog.Printf("Generating engine execution steps: engine=%s, steps=%d", engine.GetID(), len(steps))
diff --git a/pkg/workflow/safe_outputs_config.go b/pkg/workflow/safe_outputs_config.go
index 58dc548537a..3899b7c28e2 100644
--- a/pkg/workflow/safe_outputs_config.go
+++ b/pkg/workflow/safe_outputs_config.go
@@ -693,6 +693,14 @@ func (c *Compiler) extractSafeOutputsConfig(frontmatter map[string]any) *SafeOut
}
}
+ // Force-disable threat detection when --use-samples is active: the replay driver
+ // emits synthetic outputs solely for deterministic end-to-end tests, and running
+ // an LLM-backed detection pass would defeat that determinism.
+ if config != nil && c.useSamples && config.ThreatDetection != nil {
+ safeOutputsConfigLog.Print("Disabling threat-detection because --use-samples is set")
+ config.ThreatDetection = nil
+ }
+
if config != nil {
safeOutputsConfigLog.Print("Successfully extracted safe-outputs configuration")
} else {
@@ -755,6 +763,48 @@ func (c *Compiler) parseBaseSafeOutputConfig(configMap map[string]any, config *B
config.Staged = stagedBool
}
}
+
+ // Parse samples list (hidden feature: deterministic replay samples for --use-samples).
+ // Accepts either a YAML list of objects, or a single object that is auto-wrapped
+ // into a one-element list. The JSON schema rejects scalar/string shapes so we
+ // don't need a defensive YAML-string branch here.
+ if samples, exists := configMap["samples"]; exists {
+ parsed := parseSamplesValue(samples)
+ if len(parsed) > 0 {
+ safeOutputsConfigLog.Printf("Parsed %d samples entries", len(parsed))
+ config.Samples = parsed
+ }
+ }
+}
+
+// parseSamplesValue normalizes a `samples` frontmatter value into a list of
+// objects. Accepted shapes:
+//   - list of mappings: each mapping becomes one entry, in order
+//   - single mapping: wrapped into a one-element list
+//
+// A mapping may be a map[string]any or a map[string]string; the latter is
+// copied into a fresh map[string]any. Both shapes accept both mapping types,
+// so a single-object value is treated exactly like a one-element list.
+//
+// List items that are not mappings are skipped silently, so a list with no
+// usable items yields an empty, non-nil slice. Any other top-level shape
+// returns nil. Schema validation is expected to reject those shapes
+// upstream, which is why no error is reported here.
+func parseSamplesValue(samples any) []map[string]any {
+	// asObject converts one candidate value into a sample object and reports
+	// false when the value is not a supported mapping type.
+	asObject := func(candidate any) (map[string]any, bool) {
+		switch m := candidate.(type) {
+		case map[string]any:
+			return m, true
+		case map[string]string:
+			converted := make(map[string]any, len(m))
+			for k, s := range m {
+				converted[k] = s
+			}
+			return converted, true
+		default:
+			return nil, false
+		}
+	}
+
+	if list, ok := samples.([]any); ok {
+		out := make([]map[string]any, 0, len(list))
+		for _, item := range list {
+			if obj, ok := asObject(item); ok {
+				out = append(out, obj)
+			}
+		}
+		return out
+	}
+	if obj, ok := asObject(samples); ok {
+		return []map[string]any{obj}
+	}
+	return nil
}
// SafeOutputStepConfig holds configuration for building a single safe output step
diff --git a/pkg/workflow/samples_replay.go b/pkg/workflow/samples_replay.go
new file mode 100644
index 00000000000..28f16afe1c4
--- /dev/null
+++ b/pkg/workflow/samples_replay.go
@@ -0,0 +1,112 @@
+package workflow
+
+import (
+ "encoding/json"
+ "fmt"
+ "sort"
+ "strings"
+)
+
+// SampleEntry is the per-call payload consumed by apply_samples.cjs.
+// Each entry corresponds to a single MCP `tools/call` invocation. Entries are
+// JSON-encoded as a list into the GH_AW_SAMPLES env var of the replay step.
+type SampleEntry struct {
+ // Tool is the snake_case MCP tool name (e.g. "create_pull_request").
+ Tool string `json:"tool"`
+ // Arguments are passed verbatim as the MCP `tools/call` arguments.
+ // Sample sidecar fields (e.g. `patch`) have already been stripped.
+ Arguments map[string]any `json:"arguments"`
+ // Sidecars carries fields stripped from Arguments that need out-of-band
+ // pre-staging by the driver (e.g. `patch` for create_pull_request).
+ // The key is omitted from the JSON when there are no sidecars (omitempty).
+ Sidecars map[string]any `json:"sidecars,omitempty"`
+}
+
+// collectSampleEntries flattens every `samples` entry found on the
+// safe-outputs handlers into the sequence of calls the replay driver sends to
+// the MCP server. Handlers are visited in sorted order of their
+// safeOutputFieldMapping key so the compiled YAML is stable across runs.
+//
+// For each sample, keys listed in sampleSidecarFields for the handler's tool
+// are moved into Sidecars; every other key is kept in Arguments. Sidecars
+// stays nil when a sample has no sidecar keys. A nil config yields nil.
+func collectSampleEntries(config *SafeOutputsConfig) []SampleEntry {
+	if config == nil {
+		return nil
+	}
+
+	// Map iteration order is random, so sort the handler field names first.
+	orderedFields := make([]string, 0, len(safeOutputFieldMapping))
+	for name := range safeOutputFieldMapping {
+		orderedFields = append(orderedFields, name)
+	}
+	sort.Strings(orderedFields)
+
+	var collected []SampleEntry
+	for _, field := range orderedFields {
+		handler := extractBaseSafeOutputConfig(config, field)
+		if handler == nil || len(handler.Samples) == 0 {
+			continue
+		}
+		tool := safeOutputFieldMapping[field]
+		isSidecar := sampleSidecarFields[tool]
+		for _, raw := range handler.Samples {
+			entry := SampleEntry{
+				Tool:      tool,
+				Arguments: make(map[string]any, len(raw)),
+			}
+			for key, value := range raw {
+				if !isSidecar[key] {
+					entry.Arguments[key] = value
+					continue
+				}
+				// Allocate the sidecar map lazily so it is omitted when unused.
+				if entry.Sidecars == nil {
+					entry.Sidecars = make(map[string]any)
+				}
+				entry.Sidecars[key] = value
+			}
+			collected = append(collected, entry)
+		}
+	}
+	return collected
+}
+
+// generateSamplesReplayStep emits the YAML that replaces the agentic
+// `Execute coding agent` step when the hidden `gh aw compile --use-samples`
+// flag is used. The emitted step runs apply_samples.cjs, the driver that
+// spawns the safe-outputs MCP server over stdio and feeds it a `tools/call`
+// for every collected sample, after pre-staging branches/patches for samples
+// that carry them.
+//
+// Parameters:
+//   - yaml: builder the step definition is appended to
+//   - data: workflow data; its SafeOutputs config is the source of the samples
+//   - logFile: value written into the step's GH_AW_AGENT_STDIO_LOG env var
+func (c *Compiler) generateSamplesReplayStep(yaml *strings.Builder, data *WorkflowData, logFile string) {
+ entries := collectSampleEntries(data.SafeOutputs)
+ compilerYamlLog.Printf("Generating samples replay step: entries=%d", len(entries))
+
+ // Normalize a nil slice to an empty slice so json.Marshal emits "[]" not "null".
+ // The driver rejects anything that isn't a JSON array; emitting "null" here
+ // would crash the replay step with `GH_AW_SAMPLES must be a JSON array` for
+ // workflows that opt into --use-samples but configure no samples (or whose
+ // configured samples all live on disabled handlers).
+ if entries == nil {
+ entries = []SampleEntry{}
+ }
+
+ // Serialize entries to JSON for the driver. Always emit valid JSON even when
+ // empty so the driver can produce a clear `no samples configured` message
+ // rather than crashing on an empty env var.
+ payload, err := json.Marshal(entries)
+ if err != nil {
+ // Should never happen for map[string]any payloads; fall back to empty
+ // array so the workflow still compiles and the driver reports cleanly.
+ compilerYamlLog.Printf("Warning: failed to marshal samples entries: %v", err)
+ payload = []byte("[]")
+ }
+
+ // The step is emitted with the id `agentic_execution`.
+ yaml.WriteString(" - name: Replay safe-outputs samples (deterministic)\n")
+ yaml.WriteString(" id: agentic_execution\n")
+ yaml.WriteString(" env:\n")
+ // The JSON payload is embedded as a YAML literal block scalar; every payload
+ // line is written with the block's indentation prefix.
+ yaml.WriteString(" GH_AW_SAMPLES: |\n")
+ for line := range strings.SplitSeq(string(payload), "\n") {
+ fmt.Fprintf(yaml, " %s\n", line)
+ }
+ fmt.Fprintf(yaml, " GH_AW_AGENT_STDIO_LOG: %s\n", logFile)
+ yaml.WriteString(" GH_AW_SAFE_OUTPUTS_CONFIG_PATH: ${{ runner.temp }}/gh-aw/safeoutputs/config.json\n")
+ yaml.WriteString(" GH_AW_SAFE_OUTPUTS: ${{ runner.temp }}/gh-aw/safeoutputs/outputs.jsonl\n")
+ // The script creates the stdio log's parent directory before invoking the driver.
+ yaml.WriteString(" run: |\n")
+ yaml.WriteString(" set -euo pipefail\n")
+ yaml.WriteString(" mkdir -p \"$(dirname \"$GH_AW_AGENT_STDIO_LOG\")\"\n")
+ yaml.WriteString(" node \"${{ runner.temp }}/gh-aw/actions/apply_samples.cjs\"\n")
+}
diff --git a/pkg/workflow/samples_replay_test.go b/pkg/workflow/samples_replay_test.go
new file mode 100644
index 00000000000..ac8232fd825
--- /dev/null
+++ b/pkg/workflow/samples_replay_test.go
@@ -0,0 +1,341 @@
+//go:build integration
+
+package workflow
+
+import (
+ "encoding/json"
+ "os"
+ "strings"
+ "testing"
+)
+
+// TestUseSamplesReplacesAgentStep verifies that compiling with
+// SetUseSamples(true) replaces the engine `Execute coding agent` step
+// with the deterministic `Replay safe-outputs samples` step driven by
+// apply_samples.cjs. A companion subtest checks that a default compile of
+// the same fixture emits neither the replay step nor the driver.
+func TestUseSamplesReplacesAgentStep(t *testing.T) {
+ const md = `---
+on:
+ workflow_dispatch:
+permissions: read-all
+engine:
+ id: claude
+safe-outputs:
+ create-issue:
+ samples:
+ - title: "Deterministic test issue"
+ body: "Issue body emitted by gh-aw samples replay."
+---
+
+Trivial workflow whose only job is to be compiled with --use-samples.
+`
+
+ // Write the fixture to a temporary markdown file shared by both subtests.
+ tmpFile, err := os.CreateTemp("", "use-samples-*.md")
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer os.Remove(tmpFile.Name())
+ if _, err := tmpFile.WriteString(md); err != nil {
+ t.Fatal(err)
+ }
+ tmpFile.Close()
+
+ // Without SetUseSamples the lock file must not mention the replay step or driver.
+ t.Run("Default Mode", func(t *testing.T) {
+ compiler := NewCompiler()
+ if err := compiler.CompileWorkflow(tmpFile.Name()); err != nil {
+ t.Fatalf("compile failed: %v", err)
+ }
+ // The lock file is read from next to the source: <name>.md -> <name>.lock.yml.
+ lockPath := strings.TrimSuffix(tmpFile.Name(), ".md") + ".lock.yml"
+ defer os.Remove(lockPath)
+ b, err := os.ReadFile(lockPath)
+ if err != nil {
+ t.Fatalf("read lock: %v", err)
+ }
+ lockContent := string(b)
+ if strings.Contains(lockContent, "Replay safe-outputs samples") {
+ t.Error("Did not expect samples replay step in default mode")
+ }
+ if strings.Contains(lockContent, "apply_samples.cjs") {
+ t.Error("Did not expect apply_samples driver in default mode")
+ }
+ })
+
+ // With SetUseSamples(true) the replay step, its env and the sample payload must appear.
+ t.Run("Use Samples Mode", func(t *testing.T) {
+ compiler := NewCompiler()
+ compiler.SetUseSamples(true)
+ if err := compiler.CompileWorkflow(tmpFile.Name()); err != nil {
+ t.Fatalf("compile failed: %v", err)
+ }
+ // The flag must also be reflected on the parsed WorkflowData.
+ workflowData, err := compiler.ParseWorkflowFile(tmpFile.Name())
+ if err != nil {
+ t.Fatalf("ParseWorkflowFile failed: %v", err)
+ }
+ if !workflowData.UseSamples {
+ t.Fatal("Expected workflowData.UseSamples to be true after SetUseSamples(true)")
+ }
+ lockPath := strings.TrimSuffix(tmpFile.Name(), ".md") + ".lock.yml"
+ defer os.Remove(lockPath)
+ b, err := os.ReadFile(lockPath)
+ if err != nil {
+ t.Fatalf("read lock: %v", err)
+ }
+ lockContent := string(b)
+ if !strings.Contains(lockContent, "Replay safe-outputs samples (deterministic)") {
+ t.Error("Expected `Replay safe-outputs samples (deterministic)` step in lock file")
+ }
+ if !strings.Contains(lockContent, "apply_samples.cjs") {
+ t.Error("Expected lock file to invoke apply_samples.cjs driver")
+ }
+ if !strings.Contains(lockContent, "GH_AW_SAMPLES:") {
+ t.Error("Expected GH_AW_SAMPLES env var in lock file")
+ }
+ if !strings.Contains(lockContent, `"tool":"create_issue"`) {
+ t.Error("Expected JSON-encoded create_issue tool entry in lock file")
+ }
+ if !strings.Contains(lockContent, "Deterministic test issue") {
+ t.Error("Expected sample title in lock file")
+ }
+ if !strings.Contains(lockContent, "id: agentic_execution") {
+ t.Error("Expected id: agentic_execution on the replay step")
+ }
+ // Threat detection must be force-disabled under --use-samples so the
+ // deterministic replay isn't perturbed by an LLM-backed detection job.
+ if strings.Contains(lockContent, "\n detection:\n") {
+ t.Error("Expected no `detection:` job under --use-samples")
+ }
+ })
+}
+
+// TestUseSamplesCreatePullRequestWithPatch is the end-to-end smoke test for
+// the create-pull-request + patch sidecar flow. It compiles a workflow whose
+// only safe-output is `create-pull-request` with a `samples` entry carrying
+// a `patch` sidecar, then inspects the generated lock.yml to verify that:
+//
+// 1. The agentic step is replaced by the deterministic replay step
+// 2. GH_AW_SAMPLES contains a JSON-encoded create_pull_request entry
+// 3. The patch is partitioned into `sidecars`, NOT into `arguments`
+// (the MCP server's create_pull_request handler must NOT receive `patch`
+// as a tool argument — it derives the diff from the working tree)
+// 4. The branch name and other PR fields land in `arguments`
+// 5. The actual diff payload is preserved verbatim in the lock file
+// (so the driver can `git apply` it at replay time)
+// 6. No `detection:` job is emitted
+func TestUseSamplesCreatePullRequestWithPatch(t *testing.T) {
+ const patch = "diff --git a/sample.txt b/sample.txt\nnew file mode 100644\nindex 0000000..1111111\n--- /dev/null\n+++ b/sample.txt\n@@ -0,0 +1 @@\n+hello from gh-aw samples\n"
+
+ // The patch is embedded under the `patch: |` YAML block scalar via indentBlock.
+ md := `---
+on:
+ workflow_dispatch:
+permissions: read-all
+engine:
+ id: claude
+safe-outputs:
+ create-pull-request:
+ samples:
+ - title: "Sample PR from gh-aw"
+ body: "PR body emitted by samples replay."
+ branch: "feat/gh-aw-sample-pr"
+ patch: |
+` + indentBlock(patch, " ") + `---
+
+Trivial workflow exercising create-pull-request via --use-samples.
+`
+
+ tmpFile, err := os.CreateTemp("", "use-samples-cpr-*.md")
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer os.Remove(tmpFile.Name())
+ if _, err := tmpFile.WriteString(md); err != nil {
+ t.Fatal(err)
+ }
+ tmpFile.Close()
+
+ compiler := NewCompiler()
+ compiler.SetUseSamples(true)
+ if err := compiler.CompileWorkflow(tmpFile.Name()); err != nil {
+ t.Fatalf("compile failed: %v", err)
+ }
+ // The lock file is read from next to the source: <name>.md -> <name>.lock.yml.
+ lockPath := strings.TrimSuffix(tmpFile.Name(), ".md") + ".lock.yml"
+ defer os.Remove(lockPath)
+ b, err := os.ReadFile(lockPath)
+ if err != nil {
+ t.Fatalf("read lock: %v", err)
+ }
+ lock := string(b)
+
+ // 1. Agentic step replaced
+ if !strings.Contains(lock, "Replay safe-outputs samples (deterministic)") {
+ t.Error("Expected `Replay safe-outputs samples (deterministic)` step in lock file")
+ }
+ if !strings.Contains(lock, "apply_samples.cjs") {
+ t.Error("Expected lock file to invoke apply_samples.cjs driver")
+ }
+
+ // 2. GH_AW_SAMPLES contains a create_pull_request entry
+ if !strings.Contains(lock, "GH_AW_SAMPLES:") {
+ t.Fatal("Expected GH_AW_SAMPLES env var in lock file")
+ }
+ if !strings.Contains(lock, `"tool":"create_pull_request"`) {
+ t.Error("Expected JSON-encoded create_pull_request tool entry in lock file")
+ }
+
+ // Extract the GH_AW_SAMPLES JSON block from the YAML for structural assertions.
+ samplesJSON := extractGHAWSamplesJSON(t, lock)
+ var entries []map[string]any
+ if err := json.Unmarshal([]byte(samplesJSON), &entries); err != nil {
+ t.Fatalf("failed to parse GH_AW_SAMPLES JSON: %v\nRaw:\n%s", err, samplesJSON)
+ }
+ if len(entries) != 1 {
+ t.Fatalf("expected exactly one sample entry, got %d", len(entries))
+ }
+ entry := entries[0]
+
+ // 3. Patch is in sidecars, NOT in arguments
+ // Failed type assertions leave args/sidecars nil, which the checks below report.
+ args, _ := entry["arguments"].(map[string]any)
+ sidecars, _ := entry["sidecars"].(map[string]any)
+ if args == nil {
+ t.Fatal("expected entry.arguments to be an object")
+ }
+ if _, hasPatchInArgs := args["patch"]; hasPatchInArgs {
+ t.Error("patch must be stripped from arguments — MCP create_pull_request handler must not receive it")
+ }
+ if sidecars == nil {
+ t.Fatal("expected entry.sidecars to be present (patch should land here)")
+ }
+ gotPatch, _ := sidecars["patch"].(string)
+ if gotPatch == "" {
+ t.Fatal("expected sidecars.patch to be a non-empty string")
+ }
+
+ // 4. PR fields preserved in arguments
+ if args["title"] != "Sample PR from gh-aw" {
+ t.Errorf("arguments.title = %q, want %q", args["title"], "Sample PR from gh-aw")
+ }
+ if args["body"] != "PR body emitted by samples replay." {
+ t.Errorf("arguments.body = %q, want %q", args["body"], "PR body emitted by samples replay.")
+ }
+ if args["branch"] != "feat/gh-aw-sample-pr" {
+ t.Errorf("arguments.branch = %q, want %q", args["branch"], "feat/gh-aw-sample-pr")
+ }
+
+ // 5. Patch payload preserved verbatim
+ if !strings.Contains(gotPatch, "diff --git a/sample.txt b/sample.txt") {
+ t.Errorf("sidecars.patch missing diff header; got: %q", gotPatch)
+ }
+ if !strings.Contains(gotPatch, "+hello from gh-aw samples") {
+ t.Errorf("sidecars.patch missing payload line; got: %q", gotPatch)
+ }
+
+ // 6. No detection job
+ if strings.Contains(lock, "\n detection:\n") {
+ t.Error("Expected no `detection:` job under --use-samples")
+ }
+}
+
+// indentBlock returns s with prefix prepended to each of its lines and a
+// single trailing newline. Trailing newlines of s are trimmed before the
+// split, so an empty s yields prefix followed by one newline. The test
+// fixtures use it to nest a multi-line patch under a YAML block scalar.
+func indentBlock(s, prefix string) string {
+	var indented strings.Builder
+	for _, row := range strings.Split(strings.TrimRight(s, "\n"), "\n") {
+		indented.WriteString(prefix)
+		indented.WriteString(row)
+		indented.WriteString("\n")
+	}
+	return indented.String()
+}
+
+// extractGHAWSamplesJSON returns the JSON text stored in the GH_AW_SAMPLES
+// literal block scalar of a compiled lock file. It locates the first
+// `GH_AW_SAMPLES: |` marker, takes the leading-space indentation of the line
+// that follows, and collects every subsequent line sharing that indentation
+// (with the indentation removed) until a line without it is reached. The
+// result is whitespace-trimmed. The test fails fatally when the marker is
+// missing or the block is malformed. Hand-rolled to avoid a YAML parser for
+// a tightly-controlled emit format.
+func extractGHAWSamplesJSON(t *testing.T, lock string) string {
+	t.Helper()
+	const marker = "GH_AW_SAMPLES: |\n"
+	_, rest, found := strings.Cut(lock, marker)
+	if !found {
+		t.Fatalf("could not find %q in lock file", marker)
+	}
+	// The first content line defines the block's indentation.
+	firstLine, _, hasNewline := strings.Cut(rest, "\n")
+	if !hasNewline {
+		t.Fatal("malformed GH_AW_SAMPLES block: no newline after first line")
+	}
+	indent := strings.TrimSuffix(firstLine, strings.TrimLeft(firstLine, " "))
+	if indent == "" {
+		t.Fatal("malformed GH_AW_SAMPLES block: expected indented content")
+	}
+	// Stop at the first line that drops the block indentation
+	// (i.e. the next YAML key like GH_AW_AGENT_STDIO_LOG).
+	var block strings.Builder
+	for _, row := range strings.Split(rest, "\n") {
+		content, indented := strings.CutPrefix(row, indent)
+		if !indented {
+			break
+		}
+		block.WriteString(content)
+		block.WriteString("\n")
+	}
+	return strings.TrimSpace(block.String())
+}
+
+// TestUseSamplesEmitsEmptyArrayWhenNoSamplesConfigured guards against a
+// regression where compiling with --use-samples but no `samples:` entries on
+// any enabled handler caused json.Marshal of a nil Go slice to emit the
+// literal string "null" into GH_AW_SAMPLES, which the driver rightly
+// rejected with `GH_AW_SAMPLES must be a JSON array`. The compiler must
+// emit "[]" instead so the driver can exit cleanly with `no samples to
+// replay`.
+func TestUseSamplesEmitsEmptyArrayWhenNoSamplesConfigured(t *testing.T) {
+ // Workflow opts into --use-samples and configures safe-outputs but has
+ // no `samples:` entries on the create-issue handler.
+ const md = `---
+on:
+ workflow_dispatch:
+permissions: read-all
+engine:
+ id: claude
+safe-outputs:
+ create-issue:
+ title-prefix: "[no-samples] "
+---
+
+Workflow with safe-outputs but no samples — should still compile and
+emit a valid empty-array GH_AW_SAMPLES under --use-samples.
+`
+
+ tmpFile, err := os.CreateTemp("", "use-samples-empty-*.md")
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer os.Remove(tmpFile.Name())
+ if _, err := tmpFile.WriteString(md); err != nil {
+ t.Fatal(err)
+ }
+ tmpFile.Close()
+
+ compiler := NewCompiler()
+ compiler.SetUseSamples(true)
+ if err := compiler.CompileWorkflow(tmpFile.Name()); err != nil {
+ t.Fatalf("compile failed: %v", err)
+ }
+ // The lock file is read from next to the source: <name>.md -> <name>.lock.yml.
+ lockPath := strings.TrimSuffix(tmpFile.Name(), ".md") + ".lock.yml"
+ defer os.Remove(lockPath)
+ b, err := os.ReadFile(lockPath)
+ if err != nil {
+ t.Fatalf("read lock: %v", err)
+ }
+ lock := string(b)
+
+ // Must still emit the replay step.
+ if !strings.Contains(lock, "Replay safe-outputs samples (deterministic)") {
+ t.Fatal("Expected replay step in lock file even with no samples configured")
+ }
+
+ // The "null" check runs first so the original regression gets its own message;
+ // the exact-match check then pins the expected empty array.
+ samplesJSON := extractGHAWSamplesJSON(t, lock)
+ if samplesJSON == "null" {
+ t.Fatalf("GH_AW_SAMPLES must not be the literal `null` (driver would reject it); got %q", samplesJSON)
+ }
+ if samplesJSON != "[]" {
+ t.Fatalf("GH_AW_SAMPLES = %q, want %q", samplesJSON, "[]")
+ }
+}
diff --git a/pkg/workflow/samples_threat_detection_test.go b/pkg/workflow/samples_threat_detection_test.go
new file mode 100644
index 00000000000..ba7c082d2be
--- /dev/null
+++ b/pkg/workflow/samples_threat_detection_test.go
@@ -0,0 +1,63 @@
+package workflow
+
+import "testing"
+
+// TestExtractSafeOutputsConfig_UseSamplesDisablesThreatDetection checks that
+// extractSafeOutputsConfig leaves ThreatDetection unset whenever the compiler
+// runs with use-samples enabled, so the deterministic replay is not perturbed
+// by an LLM-backed detection job. Without use-samples it must stay populated.
+func TestExtractSafeOutputsConfig_UseSamplesDisablesThreatDetection(t *testing.T) {
+	// implicit never mentions threat-detection, so the default behaviour applies.
+	implicit := map[string]any{
+		"safe-outputs": map[string]any{
+			"create-issue": map[string]any{
+				"samples": []any{
+					map[string]any{"title": "x", "body": "y"},
+				},
+			},
+		},
+	}
+	// explicit additionally asks for threat-detection in the frontmatter.
+	explicit := map[string]any{
+		"safe-outputs": map[string]any{
+			"threat-detection": true,
+			"create-issue": map[string]any{
+				"samples": []any{
+					map[string]any{"title": "x", "body": "y"},
+				},
+			},
+		},
+	}
+
+	// Every case runs extractSafeOutputsConfig on a fresh compiler.
+	cases := []struct {
+		name        string
+		frontmatter map[string]any
+		useSamples  bool
+		failure     string // t.Fatal message when ThreatDetection is in the wrong state
+	}{
+		{"default mode applies threat-detection", implicit, false,
+			"expected default threat-detection to be applied in default mode"},
+		{"use-samples disables threat-detection (default)", implicit, true,
+			"expected threat-detection to be force-disabled under --use-samples"},
+		{"use-samples disables threat-detection (explicit true)", explicit, true,
+			"expected explicit threat-detection: true to be force-disabled under --use-samples"},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			c := NewCompiler()
+			if tc.useSamples {
+				c.SetUseSamples(true)
+			}
+			cfg := c.extractSafeOutputsConfig(tc.frontmatter)
+			if cfg == nil {
+				t.Fatal("expected non-nil SafeOutputsConfig")
+			}
+			// ThreatDetection must be set exactly when use-samples is off.
+			if detectionSet := cfg.ThreatDetection != nil; detectionSet == tc.useSamples {
+				t.Fatal(tc.failure)
+			}
+		})
+	}
+}
diff --git a/pkg/workflow/samples_validation.go b/pkg/workflow/samples_validation.go
new file mode 100644
index 00000000000..b04219aa23c
--- /dev/null
+++ b/pkg/workflow/samples_validation.go
@@ -0,0 +1,164 @@
+package workflow
+
+import (
+ "encoding/json"
+ "fmt"
+ "reflect"
+ "sort"
+ "strings"
+ "sync"
+
+ "github.com/santhosh-tekuri/jsonschema/v6"
+)
+
+// sampleSidecarFields maps an MCP tool name to the set of sample keys that
+// are sidecars for that tool. A sidecar is recognized inside a `samples`
+// entry but is NOT passed in the tool's `tools/call` arguments: it is
+// stripped from the sample before schema validation and consumed by the
+// replay driver instead (e.g. to pre-stage a branch + patch on disk).
+// A tool that does not appear in this map has no sidecar fields, so its
+// samples are validated exactly as written.
+var sampleSidecarFields = map[string]map[string]bool{
+	// `patch` is what the driver commits to the workspace before calling the tool.
+	"create_pull_request":         {"patch": true},
+	"push_to_pull_request_branch": {"patch": true},
+}
+
+// Cache of the per-tool jsonschema.Schema values parsed from the embedded
+// safe_outputs_tools.json; getCompiledToolSchemas fills it lazily on first use.
+var (
+	compiledToolSchemas     map[string]*jsonschema.Schema // tool name -> compiled inputSchema
+	compiledToolSchemasErr  error                         // failure recorded by the one-time build, if any
+	compiledToolSchemasOnce sync.Once                     // guards the one-time build
+)
+
+// getCompiledToolSchemas returns the compiled inputSchema of each tool in the embedded
+// safe_outputs_tools.json, keyed by tool name. The work runs once; its result and error are cached.
+func getCompiledToolSchemas() (map[string]*jsonschema.Schema, error) {
+	compiledToolSchemasOnce.Do(func() {
+		compiledToolSchemas, compiledToolSchemasErr = func() (map[string]*jsonschema.Schema, error) {
+			var manifest []struct {
+				Name        string          `json:"name"`
+				InputSchema json.RawMessage `json:"inputSchema"`
+			}
+			if err := json.Unmarshal([]byte(safeOutputsToolsJSONContent), &manifest); err != nil {
+				return nil, fmt.Errorf("failed to parse safe_outputs_tools.json for samples validation: %w", err)
+			}
+			byName := make(map[string]*jsonschema.Schema, len(manifest))
+			for _, tool := range manifest {
+				if len(tool.InputSchema) == 0 {
+					continue // no inputSchema declared: the tool gets no entry in the map
+				}
+				var doc any
+				if err := json.Unmarshal(tool.InputSchema, &doc); err != nil {
+					return nil, fmt.Errorf("failed to parse inputSchema for tool %q: %w", tool.Name, err)
+				}
+				schemaCompiler := jsonschema.NewCompiler()
+				resourceURL := fmt.Sprintf("inmem://safe-outputs-tools/%s.json", tool.Name)
+				if err := schemaCompiler.AddResource(resourceURL, doc); err != nil {
+					return nil, fmt.Errorf("failed to add schema resource for tool %q: %w", tool.Name, err)
+				}
+				compiled, err := schemaCompiler.Compile(resourceURL)
+				if err != nil {
+					return nil, fmt.Errorf("failed to compile inputSchema for tool %q: %w", tool.Name, err)
+				}
+				byName[tool.Name] = compiled
+			}
+			return byName, nil
+		}()
+	})
+	return compiledToolSchemas, compiledToolSchemasErr
+}
+
+// validateSafeOutputsSamples checks the `samples` entries of every configured
+// safe-output handler against the inputSchema of the MCP tool it maps to.
+// Handlers that are unset or have no samples are skipped, and sidecar fields
+// (e.g. `patch`) are stripped before validation. A nil config is accepted.
+// Only the first failure is returned; handlers are visited in sorted
+// struct-field-name order so the reported error is stable.
+func validateSafeOutputsSamples(config *SafeOutputsConfig) error {
+	if config == nil {
+		return nil
+	}
+
+	// Ranging over the mapping directly would give a random order, hence the sorted key list.
+	ordered := make([]string, 0, len(safeOutputFieldMapping))
+	for name := range safeOutputFieldMapping {
+		ordered = append(ordered, name)
+	}
+	sort.Strings(ordered)
+
+	for _, name := range ordered {
+		handler := extractBaseSafeOutputConfig(config, name)
+		if handler != nil && len(handler.Samples) > 0 {
+			if err := validateSamplesForTool(safeOutputFieldMapping[name], handler.Samples); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+// extractBaseSafeOutputConfig resolves the SafeOutputsConfig field named
+// fieldName and returns a pointer to the BaseSafeOutputConfig embedded in the
+// struct that field points to. The result is nil when the field cannot be
+// resolved, is nil, does not point to a struct, or when that struct has no
+// addressable BaseSafeOutputConfig field.
+func extractBaseSafeOutputConfig(config *SafeOutputsConfig, fieldName string) *BaseSafeOutputConfig {
+	ptr, found := safeOutputPointerFieldValue(config, fieldName)
+	if !found || ptr.IsNil() {
+		return nil
+	}
+	target := ptr.Elem()
+	if target.Kind() != reflect.Struct {
+		return nil
+	}
+	embedded := target.FieldByName("BaseSafeOutputConfig")
+	if embedded.IsValid() && embedded.CanAddr() {
+		base, _ := embedded.Addr().Interface().(*BaseSafeOutputConfig)
+		return base
+	}
+	return nil
+}
+
+// validateSamplesForTool checks every sample against toolName's compiled
+// inputSchema after removing that tool's sidecar fields. The first failure is
+// returned, prefixed with the YAML path safe-outputs.<key>.samples[<index>].
+func validateSamplesForTool(toolName string, samples []map[string]any) error {
+	schemas, err := getCompiledToolSchemas()
+	if err != nil {
+		return err
+	}
+	yamlKey := toolDisplayKey(toolName)
+	toolSchema, known := schemas[toolName]
+	if !known {
+		return fmt.Errorf("samples: no MCP tool schema found for %q (yaml key %q). Available tools come from pkg/workflow/js/safe_outputs_tools.json", toolName, yamlKey)
+	}
+	for idx := range samples {
+		args := stripSidecarFields(samples[idx], sampleSidecarFields[toolName])
+		if verr := toolSchema.Validate(args); verr != nil {
+			return fmt.Errorf("safe-outputs.%s.samples[%d]: %w", yamlKey, idx, verr)
+		}
+	}
+	return nil
+}
+
+// stripSidecarFields builds a new map holding every entry of sample whose key
+// is not flagged true in sidecars. sample itself is never modified, and a
+// fresh map is returned even when sidecars is nil or empty. The copy is
+// shallow: nested values are shared with the input.
+func stripSidecarFields(sample map[string]any, sidecars map[string]bool) map[string]any {
+	kept := make(map[string]any, len(sample))
+	for key, value := range sample {
+		if !sidecars[key] {
+			kept[key] = value
+		}
+	}
+	return kept
+}
+
+// toolDisplayKey turns a snake_case MCP tool name into the hyphenated YAML
+// frontmatter key by replacing every "_" with "-" (e.g. "create_pull_request" -> "create-pull-request").
+func toolDisplayKey(toolName string) string {
+	return strings.Join(strings.Split(toolName, "_"), "-")
+}
diff --git a/pkg/workflow/samples_validation_test.go b/pkg/workflow/samples_validation_test.go
new file mode 100644
index 00000000000..394d35d665d
--- /dev/null
+++ b/pkg/workflow/samples_validation_test.go
@@ -0,0 +1,170 @@
+package workflow
+
+import (
+ "strings"
+ "testing"
+)
+
+// TestValidateSafeOutputsSamples_Valid is the happy path for strict schema
+// validation of samples entries: a create_issue sample (title/body only, no
+// sidecars) and a create_pull_request sample whose `patch` sidecar has to be
+// stripped before validation must both pass.
+func TestValidateSafeOutputsSamples_Valid(t *testing.T) {
+	issueSample := map[string]any{
+		"title": "Sample issue",
+		"body":  "Sample body",
+	}
+	prSample := map[string]any{
+		"title":  "Sample PR",
+		"body":   "Sample PR body",
+		"branch": "gh-aw-sample-pr",
+		// patch is a sidecar: it is removed before the schema check, so it
+		// must NOT cause an `additionalProperties` failure.
+		"patch": "diff --git a/foo b/foo\nnew file mode 100644\n--- /dev/null\n+++ b/foo\n@@ -0,0 +1 @@\n+hi\n",
+	}
+
+	// Each handler carries exactly one sample.
+	cfg := &SafeOutputsConfig{
+		CreateIssues: &CreateIssuesConfig{
+			BaseSafeOutputConfig: BaseSafeOutputConfig{
+				Samples: []map[string]any{issueSample},
+			},
+		},
+		CreatePullRequests: &CreatePullRequestsConfig{
+			BaseSafeOutputConfig: BaseSafeOutputConfig{
+				Samples: []map[string]any{prSample},
+			},
+		},
+	}
+	if err := validateSafeOutputsSamples(cfg); err != nil {
+		t.Fatalf("expected no validation error, got: %v", err)
+	}
+}
+
+// TestValidateSafeOutputsSamples_MissingRequired checks that a create_issue
+// sample lacking the required `title` is rejected, and that the error names
+// both the YAML handler key and the index of the offending sample.
+func TestValidateSafeOutputsSamples_MissingRequired(t *testing.T) {
+	untitled := map[string]any{"body": "Body without title"} // title intentionally missing
+	cfg := &SafeOutputsConfig{
+		CreateIssues: &CreateIssuesConfig{
+			BaseSafeOutputConfig: BaseSafeOutputConfig{
+				Samples: []map[string]any{untitled},
+			},
+		},
+	}
+
+	err := validateSafeOutputsSamples(cfg)
+	if err == nil {
+		t.Fatal("expected validation error for missing title, got nil")
+	}
+	got := err.Error()
+	for _, want := range []struct{ fragment, complaint string }{
+		{"create-issue", "expected error to reference the YAML key `create-issue`, got: %s"},
+		{"samples[0]", "expected error to reference `samples[0]`, got: %s"},
+	} {
+		if !strings.Contains(got, want.fragment) {
+			t.Errorf(want.complaint, got)
+		}
+	}
+}
+
+// TestValidateSafeOutputsSamples_SidecarStripped checks that a
+// create_pull_request sample made of the schema-required fields PLUS the
+// `patch` sidecar validates cleanly, i.e. the sidecar is stripped first.
+func TestValidateSafeOutputsSamples_SidecarStripped(t *testing.T) {
+	prWithPatch := map[string]any{
+		"title":  "PR",
+		"body":   "PR body",
+		"branch": "gh-aw-x",
+		"patch":  "diff --git a/x b/x\n",
+	}
+	cfg := &SafeOutputsConfig{
+		CreatePullRequests: &CreatePullRequestsConfig{
+			BaseSafeOutputConfig: BaseSafeOutputConfig{
+				Samples: []map[string]any{prWithPatch},
+			},
+		},
+	}
+	err := validateSafeOutputsSamples(cfg)
+	if err != nil {
+		t.Fatalf("expected sidecar to be stripped and validation to pass, got: %v", err)
+	}
+}
+
+// TestCollectSampleEntries_DeterministicOrdering checks that repeated calls to
+// collectSampleEntries yield the tools in the same order, sorted by
+// SafeOutputsConfig field name, so that compiled YAML is deterministic.
+func TestCollectSampleEntries_DeterministicOrdering(t *testing.T) {
+	// One sample on each of two handlers, so exactly two entries are expected.
+	issues := &CreateIssuesConfig{
+		BaseSafeOutputConfig: BaseSafeOutputConfig{
+			Samples: []map[string]any{{"title": "A", "body": "A"}},
+		},
+	}
+	comments := &AddCommentsConfig{
+		BaseSafeOutputConfig: BaseSafeOutputConfig{
+			Samples: []map[string]any{{"body": "comment-A"}},
+		},
+	}
+	cfg := &SafeOutputsConfig{CreateIssues: issues, AddComments: comments}
+
+	first := collectSampleEntries(cfg)
+	second := collectSampleEntries(cfg)
+	if len(first) != 2 {
+		t.Fatalf("expected 2 entries, got %d", len(first))
+	}
+
+	// Only the Tool of each entry is compared between the two runs.
+	stable := first[0].Tool == second[0].Tool && first[1].Tool == second[1].Tool
+	if !stable {
+		t.Errorf("expected deterministic ordering across runs, got first=%v second=%v", first, second)
+	}
+
+	// Sorted by struct field name: AddComments < CreateIssues.
+	if first[0].Tool != "add_comment" {
+		t.Errorf("expected first entry tool to be add_comment (alphabetical struct field order), got %q", first[0].Tool)
+	}
+	if first[1].Tool != "create_issue" {
+		t.Errorf("expected second entry tool to be create_issue, got %q", first[1].Tool)
+	}
+}
+
+// TestCollectSampleEntries_SidecarPartitioning checks that sidecar fields end
+// up in Sidecars and not in Arguments, so the driver knows what to pre-stage.
+func TestCollectSampleEntries_SidecarPartitioning(t *testing.T) {
+	sample := map[string]any{
+		"title":  "PR",
+		"body":   "Body",
+		"branch": "br",
+		"patch":  "diff --git a/x b/x\n",
+	}
+	cfg := &SafeOutputsConfig{
+		CreatePullRequests: &CreatePullRequestsConfig{
+			BaseSafeOutputConfig: BaseSafeOutputConfig{Samples: []map[string]any{sample}},
+		},
+	}
+
+	entries := collectSampleEntries(cfg)
+	if n := len(entries); n != 1 {
+		t.Fatalf("expected 1 entry, got %d", n)
+	}
+	entry := entries[0]
+	if entry.Tool != "create_pull_request" {
+		t.Errorf("expected tool create_pull_request, got %q", entry.Tool)
+	}
+	args := entry.Arguments
+	if _, leaked := args["patch"]; leaked {
+		t.Error("expected patch to be stripped from Arguments")
+	}
+	if !(args["title"] == "PR" && args["body"] == "Body" && args["branch"] == "br") {
+		t.Errorf("expected title/body/branch to remain in Arguments, got %#v", args)
+	}
+	if entry.Sidecars == nil {
+		t.Fatal("expected Sidecars to be non-nil")
+	}
+	patch, isString := entry.Sidecars["patch"].(string)
+	if !isString || !strings.HasPrefix(patch, "diff --git") {
+		t.Errorf("expected patch to be present in Sidecars as a git diff string, got %#v", entry.Sidecars["patch"])
+	}
+}
diff --git a/pkg/workflow/workflow_builder.go b/pkg/workflow/workflow_builder.go
index 762de84d3ea..2a87ff848a3 100644
--- a/pkg/workflow/workflow_builder.go
+++ b/pkg/workflow/workflow_builder.go
@@ -67,6 +67,7 @@ func (c *Compiler) buildInitialWorkflowData(
ToolsStartupTimeout: toolsResult.toolsStartupTimeout,
TrialMode: c.trialMode,
TrialLogicalRepo: c.trialLogicalRepoSlug,
+ UseSamples: c.useSamples,
StrictMode: c.strictMode,
AllowActionRefs: c.allowActionRefs,
ValidateAWFConfig: !c.skipValidation,