diff --git a/actions/setup/js/apply_samples.cjs b/actions/setup/js/apply_samples.cjs new file mode 100644 index 00000000000..0a3dc7844cf --- /dev/null +++ b/actions/setup/js/apply_samples.cjs @@ -0,0 +1,363 @@ +#!/usr/bin/env node +// @ts-check +/// + +// apply_samples.cjs +// +// Deterministic replay driver for `gh aw compile --use-samples`. +// +// Reads `GH_AW_SAMPLES` (a JSON array of `{tool, arguments, sidecars}` +// entries produced by the compiler), spawns the safe-outputs MCP server +// (`safe_outputs_mcp_server.cjs`) as a child process, sends one JSON-RPC +// `tools/call` per sample over stdio, and writes a synthetic `agent-stdio.log` +// so downstream log-parsing / failure-handling steps continue to work. +// +// For samples whose tool is `create_pull_request` or `push_to_pull_request_branch` +// and whose sidecars include `patch`, the driver pre-stages a branch and commits +// the patch into the workspace BEFORE invoking the MCP tool. This lets the +// real `create_pull_request` MCP handler (which derives a git diff against the +// base branch) produce a meaningful transport payload. 
/**
 * Read and validate the replay entries from the GH_AW_SAMPLES env var.
 *
 * An unset/blank variable and a literal JSON `null` are both treated as
 * "no samples" (a warning is logged and an empty array is returned) so the
 * workflow can still complete cleanly. Anything else must be a JSON array of
 * objects that each carry a string `tool` and a plain-object `arguments`.
 *
 * @returns {SampleEntry[]} the parsed entries, in declaration order
 * @throws {Error} when the payload is not valid JSON, is not an array, or an
 *   entry is missing `tool` / `arguments`
 */
function loadSamples() {
  const raw = process.env.GH_AW_SAMPLES;
  if (!raw || !raw.trim()) {
    core.warning("apply_samples: GH_AW_SAMPLES is empty — no samples to replay.");
    return [];
  }

  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (err) {
    throw new Error(`apply_samples: failed to parse GH_AW_SAMPLES as JSON: ${getErrorMessage(err)}`);
  }

  // Tolerate a literal JSON `null` payload (older compiler emitted it for
  // workflows with --use-samples but no `samples:` entries). Treat as empty.
  if (parsed === null) {
    core.warning("apply_samples: GH_AW_SAMPLES is null — treating as no samples to replay.");
    return [];
  }
  if (!Array.isArray(parsed)) {
    throw new Error("apply_samples: GH_AW_SAMPLES must be a JSON array");
  }

  for (const [i, entry] of parsed.entries()) {
    if (!entry || typeof entry !== "object" || typeof entry.tool !== "string") {
      throw new Error(`apply_samples: entry ${i} is missing a string "tool" field`);
    }
    const args = entry.arguments;
    // `typeof [] === "object"`, so arrays must be excluded explicitly: the
    // value is forwarded verbatim as the `arguments` of a tools/call request.
    if (!args || typeof args !== "object" || Array.isArray(args)) {
      throw new Error(`apply_samples: entry ${i} (tool=${entry.tool}) is missing an "arguments" object`);
    }
  }
  return parsed;
}

/**
 * Run a git subcommand synchronously and return its stdout.
 *
 * @param {string[]} args - arguments passed to `git` (no shell is involved)
 * @param {string} cwd - working directory for the command
 * @returns {string} captured stdout, decoded as UTF-8
 * @throws {Error} when git cannot be launched, is killed by a signal, or
 *   exits with a non-zero status
 */
function runGit(args, cwd) {
  const { spawnSync } = require("child_process");
  const result = spawnSync("git", args, { cwd, encoding: "utf8" });

  // `result.error` is set when the process could not be spawned at all (for
  // example git is not on PATH); in that case `status` is null and stderr is
  // empty, so report the underlying error instead of "exit null".
  if (result.error) {
    throw new Error(`git ${args.join(" ")} could not be run: ${getErrorMessage(result.error)}`);
  }
  if (result.status !== 0) {
    const outcome = result.signal ? `signal ${result.signal}` : `exit ${result.status}`;
    throw new Error(`git ${args.join(" ")} failed (${outcome}): ${result.stderr || result.stdout}`);
  }
  return result.stdout;
}

/**
 * Ensure git user.email / user.name are configured so commits succeed in CI.
 * A key is only written when reading it fails, so an existing identity is
 * never overwritten.
 *
 * @param {string} cwd - git working directory to configure
 */
function ensureGitIdentity(cwd) {
  const fallbacks = [
    ["user.email", "gh-aw-samples@github.com"],
    ["user.name", "gh-aw samples"],
  ];
  for (const [key, value] of fallbacks) {
    try {
      runGit(["config", key], cwd);
    } catch {
      // Reading the key failed (git exits non-zero for an unset key); install
      // the fallback. If this write fails too, the error propagates.
      runGit(["config", key, value], cwd);
    }
  }
}
/**
 * Pick the base branch to start a sample branch from. The environment is
 * consulted at call time (same precedence as DEFAULT_BASE_BRANCH) so callers
 * that adjust it after this module was loaded are honoured; the load-time
 * DEFAULT_BASE_BRANCH is the final fallback.
 * @returns {string}
 */
function resolveSampleBaseBranch() {
  return process.env.GH_AW_CUSTOM_BASE_BRANCH || process.env.GITHUB_BASE_REF || process.env.GITHUB_REF_NAME || DEFAULT_BASE_BRANCH;
}

/**
 * Pre-stage a branch + patch for samples whose tool reads the workspace diff.
 *
 * Does nothing when the entry has no non-blank string `sidecars.patch`.
 * Otherwise it checks out the base branch (best effort), creates or re-uses
 * the sample branch, applies the patch and commits the result.
 * Mutates `entry.arguments.branch` to the actual checked-out branch
 * (`gh-aw-sample-<index + 1>` when the sample did not name one).
 *
 * @param {SampleEntry} entry - sample to stage; `arguments.branch` is rewritten
 * @param {number} index - zero-based position of the sample (used in names/messages)
 * @param {string} workspace - git working directory
 * @throws {Error} when the branch name starts with "-" or a git command fails
 */
function preStagePatch(entry, index, workspace) {
  const patch = entry.sidecars && entry.sidecars.patch;
  if (typeof patch !== "string" || !patch.trim()) {
    return;
  }

  const requested = typeof entry.arguments.branch === "string" ? entry.arguments.branch.trim() : "";
  const branch = requested || `gh-aw-sample-${index + 1}`;
  // The name is passed to git as a bare argument; a leading dash would be
  // parsed as an option rather than a branch name.
  if (branch.startsWith("-")) {
    throw new Error(`apply_samples: sample[${index}] branch name must not start with "-": ${branch}`);
  }
  entry.arguments.branch = branch;

  ensureGitIdentity(workspace);

  // Start from the base branch so the diff is meaningful. Tolerate the case
  // where the base ref doesn't exist locally — fall back to HEAD.
  const baseBranch = resolveSampleBaseBranch();
  try {
    runGit(["checkout", baseBranch], workspace);
  } catch (err) {
    core.warning(`apply_samples: could not check out base branch ${baseBranch}: ${getErrorMessage(err)}; staying on current HEAD`);
  }

  // Create the branch (or check it out if it already exists from a previous sample).
  try {
    runGit(["checkout", "-b", branch], workspace);
  } catch {
    runGit(["checkout", branch], workspace);
  }

  // Write the patch into a private temp directory (unique per call, so
  // concurrent or repeated runs cannot collide) and always remove it again.
  const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "gh-aw-sample-"));
  const tmpPatch = path.join(tmpDir, `gh-aw-sample-${index + 1}.patch`);
  try {
    fs.writeFileSync(tmpPatch, patch.endsWith("\n") ? patch : `${patch}\n`);
    try {
      runGit(["apply", "--whitespace=nowarn", tmpPatch], workspace);
    } catch {
      // Fall back to --3way for patches that don't apply cleanly on top of
      // the working tree (uncommon but possible for synthetic samples).
      runGit(["apply", "--3way", "--whitespace=nowarn", tmpPatch], workspace);
    }
  } finally {
    fs.rmSync(tmpDir, { recursive: true, force: true });
  }

  runGit(["add", "-A"], workspace);
  // --allow-empty keeps the commit from failing when the patch changed nothing.
  runGit(["commit", "-m", `gh-aw sample ${index + 1}: ${entry.tool}`, "--allow-empty"], workspace);
}
/**
 * Send a single JSON-RPC request to the MCP server child process and resolve
 * with the parsed JSON response for that request.
 *
 * Lines read from the server that are not valid JSON, or that are JSON-RPC
 * messages for a different id (for example notifications), are logged and
 * skipped. An error response with `id: null` is returned as the answer,
 * because that is how a server reports a request it could not parse. When the
 * request carries no id, the first parsable message is returned.
 *
 * No timeout is applied: the promise stays pending until the server answers
 * or closes its stdout.
 *
 * @param {import("child_process").ChildProcess} child - unused; kept for the existing call signature
 * @param {NodeJS.WritableStream} stdin - the server's stdin
 * @param {object} request - JSON-RPC request object
 * @param {AsyncIterableIterator<string>} responseIterator - line iterator over the server's stdout
 * @returns {Promise<any>} the parsed JSON-RPC response
 * @throws {Error} when stdout ends before a response arrives
 */
async function sendJsonRpc(child, stdin, request, responseIterator) {
  stdin.write(JSON.stringify(request) + "\n");
  const expectsId = request.id !== undefined && request.id !== null;

  for (;;) {
    const { value, done } = await responseIterator.next();
    if (done) {
      throw new Error(`apply_samples: MCP server closed stdout before responding to request id=${request.id}`);
    }

    let message;
    try {
      message = JSON.parse(value);
    } catch {
      core.warning(`apply_samples: ignoring non-JSON line on MCP server stdout: ${String(value).slice(0, 200)}`);
      continue;
    }

    if (!expectsId) {
      return message;
    }
    if (message && typeof message === "object") {
      if (message.id === request.id) {
        return message;
      }
      if (message.id === null && message.error) {
        return message;
      }
    }
    core.info(`apply_samples: skipping MCP message that does not answer request id=${request.id}`);
  }
}

/**
 * Turn the MCP server's stdout into an async iterator of trimmed, non-empty
 * line strings. Any unterminated trailing text is yielded when the stream ends.
 *
 * Binary chunks are decoded with a streaming UTF-8 decoder so a multi-byte
 * character that straddles two chunks is not corrupted.
 *
 * @param {NodeJS.ReadableStream | AsyncIterable<Buffer | string> | Iterable<Buffer | string>} stdout
 * @returns {AsyncGenerator<string>}
 */
async function* lineIterator(stdout) {
  const decoder = new TextDecoder("utf-8");
  let buffer = "";
  for await (const chunk of stdout) {
    buffer += typeof chunk === "string" ? chunk : decoder.decode(chunk, { stream: true });
    let newlineIdx;
    while ((newlineIdx = buffer.indexOf("\n")) !== -1) {
      const line = buffer.slice(0, newlineIdx).trim();
      buffer = buffer.slice(newlineIdx + 1);
      if (line) {
        yield line;
      }
    }
  }
  // Flush whatever the decoder is still holding back.
  buffer += decoder.decode();
  const tail = buffer.trim();
  if (tail) {
    yield tail;
  }
}

/**
 * Locate the safe_outputs_mcp_server.cjs script. Candidates are checked in
 * order: next to this driver (__dirname), then
 * ${RUNNER_TEMP}/gh-aw/actions/ and ${RUNNER_TEMP}/gh-aw/safeoutputs/ when
 * RUNNER_TEMP is set.
 * @returns {string} path of the first candidate that exists
 * @throws {Error} when none of the candidates exists
 */
function resolveMcpServerPath() {
  const scriptName = "safe_outputs_mcp_server.cjs";
  const runnerTemp = process.env.RUNNER_TEMP;
  const candidates = [
    path.join(__dirname, scriptName),
    runnerTemp ? path.join(runnerTemp, "gh-aw", "actions", scriptName) : null,
    runnerTemp ? path.join(runnerTemp, "gh-aw", "safeoutputs", scriptName) : null,
  ].filter(/** @returns {p is string} */ p => typeof p === "string");

  const found = candidates.find(candidate => fs.existsSync(candidate));
  if (found) {
    return found;
  }
  throw new Error(`apply_samples: could not locate safe_outputs_mcp_server.cjs. Looked in: ${candidates.join(", ")}`);
}

/**
 * Append a synthetic terminal_reason: completed marker to the engine stdio log
 * so downstream parsers / handle_agent_failure recognize the replay as a
 * successful agent run. Does nothing when `logPath` is empty.
 * @param {string} logPath - file to append to; parent directories are created if possible
 * @param {number} sampleCount - number of replayed samples, reported as `num_turns`
 */
function writeSyntheticStdioLog(logPath, sampleCount) {
  if (!logPath) return;
  try {
    fs.mkdirSync(path.dirname(logPath), { recursive: true });
  } catch {
    /* best effort: if the directory is unusable the append below throws */
  }
  const marker = JSON.stringify({
    type: "result",
    subtype: "success",
    terminal_reason: "completed",
    num_turns: sampleCount,
    driver: "apply_samples",
  });
  const lines = [`gh-aw samples replay: ${sampleCount} MCP tools/call invocation(s) completed deterministically.`, marker, ""];
  fs.appendFileSync(logPath, lines.join("\n"));
}

/**
 * Replay every sample from GH_AW_SAMPLES through the safe-outputs MCP server.
 *
 * Steps: load the samples; pre-stage a git branch for each patch-carrying
 * sample; spawn the server; perform the `initialize` handshake; send one
 * `tools/call` per sample; shut the server down; append the synthetic stdio
 * log marker; finally fail if any sample reported an error.
 *
 * @returns {Promise<void>}
 * @throws {Error} when the server cannot be started or stops responding, the
 *   handshake fails, or at least one sample is rejected by the server
 */
async function main() {
  const samples = loadSamples();
  const workspace = process.env.GITHUB_WORKSPACE || process.cwd();
  const logPath = process.env.GH_AW_AGENT_STDIO_LOG || "";

  if (samples.length === 0) {
    core.info("apply_samples: nothing to replay; exiting cleanly.");
    writeSyntheticStdioLog(logPath, 0);
    return;
  }

  // Pre-stage branches/patches.
  // NOTE(review): every patch sample is staged before the first tools/call, so
  // the workspace is left on the last staged branch for all calls. Confirm the
  // MCP handlers use `arguments.branch` rather than the current HEAD when more
  // than one patch sample is declared.
  for (const [i, sample] of samples.entries()) {
    if (PATCH_SIDECAR_TOOLS.has(sample.tool)) {
      preStagePatch(sample, i, workspace);
    }
  }

  const serverPath = resolveMcpServerPath();
  core.info(`apply_samples: spawning MCP server ${serverPath}`);
  const child = spawn(process.execPath, [serverPath], {
    stdio: ["pipe", "pipe", "inherit"],
    env: process.env,
  });

  // A failed spawn is reported through the "error" event; without a listener
  // Node turns it into an uncaught exception. Remember it for reporting below.
  /** @type {Error | null} */
  let spawnError = null;
  child.once("error", err => {
    spawnError = err;
  });
  // Writing to a server that already died raises EPIPE on stdin; log it here
  // and let the closed stdout surface the failure through sendJsonRpc.
  child.stdin.on("error", err => {
    core.warning(`apply_samples: MCP server stdin error: ${getErrorMessage(err)}`);
  });

  const stdoutIter = lineIterator(child.stdout);
  let nextId = 1;
  const failures = [];

  try {
    // Initialize handshake.
    const initRsp = await sendJsonRpc(
      child,
      child.stdin,
      {
        jsonrpc: "2.0",
        id: nextId++,
        method: "initialize",
        params: {
          protocolVersion: "2025-06-18",
          capabilities: {},
          clientInfo: { name: "apply_samples", version: "1.0.0" },
        },
      },
      stdoutIter
    );
    if (initRsp.error) {
      throw new Error(`MCP initialize failed: ${JSON.stringify(initRsp.error)}`);
    }

    // Send one tools/call per sample. Failures are collected so that every
    // sample is attempted before the run is reported as failed.
    for (const [i, sample] of samples.entries()) {
      const callRsp = await sendJsonRpc(
        child,
        child.stdin,
        {
          jsonrpc: "2.0",
          id: nextId++,
          method: "tools/call",
          params: { name: sample.tool, arguments: sample.arguments },
        },
        stdoutIter
      );
      if (callRsp.error) {
        failures.push(`sample[${i}] (tool=${sample.tool}): ${JSON.stringify(callRsp.error)}`);
        continue;
      }
      const result = callRsp.result;
      if (result && result.isError) {
        const text = result.content && result.content[0] && result.content[0].text;
        failures.push(`sample[${i}] (tool=${sample.tool}): ${text || JSON.stringify(result)}`);
      } else {
        core.info(`apply_samples: sample[${i}] (tool=${sample.tool}) ok`);
      }
    }
  } catch (err) {
    if (spawnError) {
      throw new Error(`apply_samples: failed to start MCP server ${serverPath}: ${getErrorMessage(spawnError)}`);
    }
    throw err;
  } finally {
    try {
      child.stdin.end();
    } catch {
      /* ignore */
    }
    // Give the server up to 2s to exit cleanly.
    await new Promise(resolve => {
      // Nothing to wait for when the server never started or already exited;
      // "exit" would not fire again and the timer would only add a delay.
      if (spawnError || child.exitCode !== null || child.signalCode !== null) {
        resolve(undefined);
        return;
      }
      const timer = setTimeout(() => {
        try {
          child.kill("SIGTERM");
        } catch {
          /* ignore */
        }
        resolve(undefined);
      }, 2000);
      child.once("exit", () => {
        clearTimeout(timer);
        resolve(undefined);
      });
    });
  }

  writeSyntheticStdioLog(logPath, samples.length);

  if (failures.length > 0) {
    throw new Error(`apply_samples: ${failures.length} sample(s) failed:\n - ${failures.join("\n - ")}`);
  }
  core.info(`apply_samples: ${samples.length} sample(s) replayed successfully.`);
}
// @ts-check
//
// apply_samples.test.cjs
//
// Tests for the deterministic samples replay driver.
//
// The first suite spawns the driver as a subprocess (so it actually launches
// the real MCP server) and asserts that:
//   - the driver exits 0
//   - the MCP server appends the expected JSONL entry to GH_AW_SAFE_OUTPUTS
//   - the synthetic agent-stdio log includes a `terminal_reason: completed` marker
// It intentionally uses the simplest safe-output tool (`create_issue`) so no
// git working tree is needed.
//
// The second suite calls `preStagePatch` directly against throwaway git
// repositories to cover the patch-sidecar path.

import { describe, it, expect, beforeAll, afterAll } from "vitest";
import { spawnSync } from "child_process";
import { createRequire } from "module";
import fs from "fs";
import path from "path";
import os from "os";
import { fileURLToPath } from "url";

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

const driverPath = path.join(__dirname, "apply_samples.cjs");
const require = createRequire(import.meta.url);

/** Create a fresh temp directory whose name starts with `prefix`. */
function makeTempDir(prefix) {
  return fs.mkdtempSync(path.join(os.tmpdir(), prefix));
}

/** Remove a temp directory created by makeTempDir; a missing path is ignored. */
function removeTempDir(dir) {
  if (dir) {
    fs.rmSync(dir, { recursive: true, force: true });
  }
}

/** Run git in `cwd`, returning stdout and throwing on a non-zero exit. */
function git(args, cwd) {
  const r = spawnSync("git", args, { cwd, encoding: "utf8" });
  if (r.status !== 0) {
    throw new Error(`git ${args.join(" ")} failed: ${r.stderr || r.stdout}`);
  }
  return r.stdout;
}

/** Initialise a repository in `dir` with one seed commit on `defaultBranch`. */
function initRepo(dir, defaultBranch) {
  git(["init", "-q", "-b", defaultBranch], dir);
  git(["config", "user.email", "ghaw-test@example.com"], dir);
  git(["config", "user.name", "ghaw test"], dir);
  fs.writeFileSync(path.join(dir, "README.md"), "# seed\n");
  git(["add", "."], dir);
  git(["commit", "-q", "-m", "seed"], dir);
}

/**
 * Run `fn` with GH_AW_CUSTOM_BASE_BRANCH set to `branch`, restoring the
 * previous value (or absence) afterwards, and return whatever `fn` returns.
 */
function withBaseBranch(branch, fn) {
  const prev = process.env.GH_AW_CUSTOM_BASE_BRANCH;
  process.env.GH_AW_CUSTOM_BASE_BRANCH = branch;
  try {
    return fn();
  } finally {
    if (prev === undefined) delete process.env.GH_AW_CUSTOM_BASE_BRANCH;
    else process.env.GH_AW_CUSTOM_BASE_BRANCH = prev;
  }
}

describe.sequential("apply_samples.cjs", () => {
  let tempDir;
  let configPath;
  let outputsPath;
  let logPath;

  /** Spawn the driver with the shared config/outputs paths plus `extraEnv`. */
  function runDriver(extraEnv, timeout) {
    return spawnSync(process.execPath, [driverPath], {
      env: {
        ...process.env,
        GH_AW_SAFE_OUTPUTS_CONFIG_PATH: configPath,
        GH_AW_SAFE_OUTPUTS: outputsPath,
        ...extraEnv,
      },
      encoding: "utf8",
      timeout,
    });
  }

  /** Fail with the captured output so a non-zero exit is diagnosable in CI. */
  function expectCleanExit(result) {
    if (result.status !== 0) {
      throw new Error(`driver exited with status ${result.status}\nstderr:\n${result.stderr}\nstdout:\n${result.stdout}`);
    }
  }

  beforeAll(() => {
    tempDir = makeTempDir("gh-aw-apply-samples-");
    configPath = path.join(tempDir, "config.json");
    outputsPath = path.join(tempDir, "outputs.jsonl");
    logPath = path.join(tempDir, "agent-stdio.log");

    // Minimal safe-outputs config enabling only the `create_issue` tool. The
    // bootstrap loader keys off the snake-case keys present here.
    fs.writeFileSync(
      configPath,
      JSON.stringify({
        create_issue: { max: 1 },
      })
    );
  });

  afterAll(() => {
    removeTempDir(tempDir);
  });

  it("replays a create_issue sample through the real MCP server and emits a completed marker", () => {
    const samples = [
      {
        tool: "create_issue",
        arguments: {
          title: "Deterministic sample issue",
          body: "This issue was emitted by the apply_samples driver during a unit test.",
        },
      },
    ];

    const result = runDriver({ GH_AW_SAMPLES: JSON.stringify(samples), GH_AW_AGENT_STDIO_LOG: logPath }, 15000);
    expectCleanExit(result);

    expect(fs.existsSync(outputsPath)).toBe(true);
    const outputLines = fs
      .readFileSync(outputsPath, "utf8")
      .split("\n")
      .filter(line => line.trim().length > 0);
    expect(outputLines.length).toBeGreaterThanOrEqual(1);

    const firstEntry = JSON.parse(outputLines[0]);
    expect(firstEntry.type).toBe("create_issue");
    expect(firstEntry.title).toBe("Deterministic sample issue");

    expect(fs.existsSync(logPath)).toBe(true);
    const logText = fs.readFileSync(logPath, "utf8");
    expect(logText).toContain("terminal_reason");
    expect(logText).toContain("completed");
  });

  it("exits cleanly when GH_AW_SAMPLES is empty", () => {
    const emptyLogPath = path.join(tempDir, "empty-log.log");
    const result = runDriver({ GH_AW_SAMPLES: "[]", GH_AW_AGENT_STDIO_LOG: emptyLogPath }, 10000);

    expect(result.status).toBe(0);
    const logText = fs.readFileSync(emptyLogPath, "utf8");
    expect(logText).toContain("terminal_reason");
  });

  // Defense in depth: an older compiler that marshaled a nil Go slice would
  // emit `null` into GH_AW_SAMPLES. Newer drivers must tolerate that and
  // treat it as "no samples", not crash with `must be a JSON array`.
  it("exits cleanly when GH_AW_SAMPLES is the literal `null`", () => {
    const nullLogPath = path.join(tempDir, "null-log.log");
    const result = runDriver({ GH_AW_SAMPLES: "null", GH_AW_AGENT_STDIO_LOG: nullLogPath }, 10000);

    expectCleanExit(result);
    expect(result.stderr).toContain("GH_AW_SAMPLES is null");
    const logText = fs.readFileSync(nullLogPath, "utf8");
    expect(logText).toContain("terminal_reason");
  });
});

describe("apply_samples.cjs preStagePatch (create_pull_request / push_to_pull_request_branch)", () => {
  // Load the module under test directly so we can drive preStagePatch in
  // isolation against a real, throwaway git working tree. This is the
  // critical code path that turns a `patch` sidecar on a sample entry into
  // a real branch + commit that the downstream MCP `create_pull_request`
  // handler (which derives a git diff) can act on.
  //
  // GH_AW_CUSTOM_BASE_BRANCH is pinned to "main" both while the module is
  // loaded and around every call, so the pin holds whether the driver reads
  // the variable once at load time or on each call, and an ambient
  // GITHUB_BASE_REF / GITHUB_REF_NAME from CI cannot leak into the test.
  const { preStagePatch } = withBaseBranch("main", () => require("./apply_samples.cjs"));

  /** Workspaces created by the tests below; removed once the suite finishes. */
  const workspaces = [];

  /** Create a throwaway repository on `main` and register it for cleanup. */
  function makeWorkspace(prefix) {
    const workspace = makeTempDir(prefix);
    workspaces.push(workspace);
    initRepo(workspace, "main");
    return workspace;
  }

  afterAll(() => {
    workspaces.forEach(removeTempDir);
  });

  /**
   * Build a unified diff that adds a brand-new file. Synthetic but realistic.
   */
  function newFileDiff(filePath, contents) {
    const lines = contents.split("\n");
    // Strip trailing empty element produced by a terminating "\n" so the
    // hunk header line count matches what git apply expects.
    if (lines[lines.length - 1] === "") lines.pop();
    const body = lines.map(l => "+" + l).join("\n");
    return `diff --git a/${filePath} b/${filePath}\n` + `new file mode 100644\n` + `index 0000000..1111111\n` + `--- /dev/null\n` + `+++ b/${filePath}\n` + `@@ -0,0 +1,${lines.length} @@\n` + body + "\n";
  }

  it("checks out the requested branch and commits the patch on it (create_pull_request)", () => {
    const workspace = makeWorkspace("gh-aw-prestage-cpr-");

    const branchName = "feat/gh-aw-sample-branch";
    const fileToAdd = "sample-feature.txt";
    const fileBody = "hello from a deterministic sample\nsecond line\n";
    const entry = {
      tool: "create_pull_request",
      arguments: {
        title: "Sample PR",
        body: "Sample PR body",
        branch: branchName,
      },
      sidecars: { patch: newFileDiff(fileToAdd, fileBody) },
    };

    withBaseBranch("main", () => preStagePatch(entry, 0, workspace));

    // 1. Branch name on the entry is preserved (driver must forward it to MCP).
    expect(entry.arguments.branch).toBe(branchName);

    // 2. The named branch exists in the working repo.
    const branches = git(["branch", "--list", branchName], workspace).trim();
    expect(branches).toContain(branchName);

    // 3. Current HEAD is that branch.
    const head = git(["rev-parse", "--abbrev-ref", "HEAD"], workspace).trim();
    expect(head).toBe(branchName);

    // 4. The patch was applied AND committed (not just sitting in the worktree).
    const status = git(["status", "--porcelain"], workspace).trim();
    expect(status).toBe("");
    expect(fs.existsSync(path.join(workspace, fileToAdd))).toBe(true);
    expect(fs.readFileSync(path.join(workspace, fileToAdd), "utf8")).toBe(fileBody);

    // 5. The commit message identifies the sample so failures are diagnosable.
    const lastMsg = git(["log", "-1", "--pretty=%s"], workspace).trim();
    expect(lastMsg).toMatch(/gh-aw sample 1: create_pull_request/);

    // 6. The new file shows up as a real diff against the base branch — this is
    //    precisely what the downstream MCP create_pull_request handler will read.
    const diff = git(["diff", "main..." + branchName, "--", fileToAdd], workspace);
    expect(diff).toContain("+hello from a deterministic sample");
  });

  it("defaults the branch name to gh-aw-sample-<n> when none is supplied", () => {
    const workspace = makeWorkspace("gh-aw-prestage-default-");

    const entry = {
      tool: "push_to_pull_request_branch",
      arguments: {
        body: "Sample push body",
        // branch intentionally omitted — driver should synthesize one.
      },
      sidecars: { patch: newFileDiff("push-feature.txt", "from push sample\n") },
    };

    withBaseBranch("main", () => preStagePatch(entry, 2, workspace));

    // Index in preStagePatch is zero-based; the default uses i+1 → "gh-aw-sample-3".
    expect(entry.arguments.branch).toBe("gh-aw-sample-3");
    const head = git(["rev-parse", "--abbrev-ref", "HEAD"], workspace).trim();
    expect(head).toBe("gh-aw-sample-3");
    expect(fs.existsSync(path.join(workspace, "push-feature.txt"))).toBe(true);
  });

  it("is a no-op when the entry has no patch sidecar", () => {
    // The tool-name gate (PATCH_SIDECAR_TOOLS) lives in the driver's main();
    // preStagePatch itself must additionally do nothing when it is handed an
    // entry without a patch sidecar — protecting against misuse.
    const workspace = makeWorkspace("gh-aw-prestage-noop-");

    const entry = {
      tool: "create_issue",
      arguments: { title: "x", body: "y" },
    };
    preStagePatch(entry, 0, workspace);

    // Still on main, no extra commits, no new files.
    expect(git(["rev-parse", "--abbrev-ref", "HEAD"], workspace).trim()).toBe("main");
    const log = git(["log", "--pretty=%s"], workspace).trim().split("\n");
    expect(log).toEqual(["seed"]);
  });
});
+require("./shim.cjs"); + const { createServer, registerTool, normalizeTool, start } = require("./mcp_server_core.cjs"); const { createAppendFunction } = require("./safe_outputs_append.cjs"); const { createHandlers } = require("./safe_outputs_handlers.cjs"); diff --git a/cmd/gh-aw/main.go b/cmd/gh-aw/main.go index 895fdb2beed..27748d7cee5 100644 --- a/cmd/gh-aw/main.go +++ b/cmd/gh-aw/main.go @@ -304,6 +304,7 @@ Examples: priorManifestFile, _ := cmd.Flags().GetString("prior-manifest-file") ghes, _ := cmd.Flags().GetBool("ghes") verbose, _ := cmd.Flags().GetBool("verbose") + useSamples, _ := cmd.Flags().GetBool("use-samples") if err := validateEngine(engineOverride); err != nil { return err } @@ -364,6 +365,7 @@ Examples: ValidateImages: validateImages, PriorManifestFile: priorManifestFile, GHESCompat: ghes, + UseSamples: useSamples, } if _, err := cli.CompileWorkflows(cmd.Context(), config); err != nil { // Return error as-is without additional formatting @@ -703,6 +705,8 @@ Use "` + string(constants.CLIExtensionPrefix) + ` help all" to show help for all compileCmd.Flags().Bool("strict", false, "Override frontmatter to enforce strict mode validation for all workflows (enforces action pinning, network config, safe-outputs, refuses write permissions and deprecated fields). Note: Workflows default to strict mode unless frontmatter sets strict: false") compileCmd.Flags().Bool("trial", false, "Enable trial mode compilation (modifies workflows for trial execution)") compileCmd.Flags().String("logical-repo", "", "Repository to simulate workflow execution against (for trial mode)") + compileCmd.Flags().Bool("use-samples", false, "Hidden: replace the agentic 'Execute coding agent' step with a deterministic driver that replays the workflow's safe-outputs `samples` frontmatter entries through the safe-outputs MCP server. 
Used to make end-to-end tests deterministic.") + _ = compileCmd.Flags().MarkHidden("use-samples") compileCmd.Flags().Bool("dependabot", false, "Generate dependency manifests (package.json, requirements.txt, go.mod) and Dependabot config when dependencies are detected") compileCmd.Flags().Bool("force", false, "Force overwrite of existing dependency files (e.g., dependabot.yml)") compileCmd.Flags().Bool("refresh-stop-time", false, "Force regeneration of stop-after times instead of preserving existing values from lock files") diff --git a/pkg/cli/compile_compiler_setup.go b/pkg/cli/compile_compiler_setup.go index bdc672aea13..f5853e752ec 100644 --- a/pkg/cli/compile_compiler_setup.go +++ b/pkg/cli/compile_compiler_setup.go @@ -147,6 +147,12 @@ func configureCompilerFlags(compiler *workflow.Compiler, config CompileConfig) { } } + // Replace the agentic step with a deterministic samples replay driver when requested (hidden feature). + if config.UseSamples { + compileCompilerSetupLog.Print("Enabling --use-samples: agentic step will be replaced by a deterministic replay driver") + compiler.SetUseSamples(true) + } + // Set refresh stop time flag compiler.SetRefreshStopTime(config.RefreshStopTime) if config.RefreshStopTime { diff --git a/pkg/cli/compile_config.go b/pkg/cli/compile_config.go index 24206c2d583..901c3ccb27e 100644 --- a/pkg/cli/compile_config.go +++ b/pkg/cli/compile_config.go @@ -13,6 +13,7 @@ type CompileConfig struct { Purge bool // Remove orphaned lock files TrialMode bool // Enable trial mode (suppress safe outputs) TrialLogicalRepoSlug string // Target repository for trial mode + UseSamples bool // Hidden: replace agentic step with a deterministic samples replay driver Strict bool // Enable strict mode validation Dependabot bool // Generate Dependabot manifests for npm dependencies ForceOverwrite bool // Force overwrite of existing files (dependabot.yml) diff --git a/pkg/parser/schemas/main_workflow_schema.json 
b/pkg/parser/schemas/main_workflow_schema.json index 4cc557a9934..bfc22f569de 100644 --- a/pkg/parser/schemas/main_workflow_schema.json +++ b/pkg/parser/schemas/main_workflow_schema.json @@ -4402,6 +4402,22 @@ "type": "boolean", "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] + }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] } }, "additionalProperties": false @@ -4992,6 +5008,22 @@ "type": "boolean", "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] + }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. 
Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] } }, "additionalProperties": false, @@ -5070,6 +5102,22 @@ "type": "boolean", "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] + }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] } }, "additionalProperties": false @@ -5125,6 +5173,22 @@ "type": "boolean", "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] + }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. 
Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] } }, "additionalProperties": false @@ -5266,6 +5330,22 @@ "type": "boolean", "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] + }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] } }, "additionalProperties": false, @@ -5386,6 +5466,22 @@ "type": "boolean", "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] + }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. 
Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] } }, "additionalProperties": false @@ -5436,6 +5532,22 @@ "type": "boolean", "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] + }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] } }, "additionalProperties": false, @@ -5566,6 +5678,22 @@ "type": "boolean", "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] + }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. 
Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] } }, "additionalProperties": false, @@ -5665,6 +5793,22 @@ "type": "boolean", "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] + }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] } }, "additionalProperties": false, @@ -5750,6 +5894,22 @@ "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. 
Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] + }, "github-token": { "$ref": "#/$defs/github_token", "description": "GitHub token to use for this specific output type. Overrides global github-token if specified." @@ -5822,6 +5982,22 @@ "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] + }, "state-reason": { "type": "string", "enum": ["completed", "not_planned", "duplicate"], @@ -5908,6 +6084,22 @@ "type": "boolean", "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] + }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. 
Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] } }, "additionalProperties": false, @@ -5989,6 +6181,22 @@ "type": "boolean", "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] + }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] } }, "additionalProperties": false, @@ -6105,6 +6313,22 @@ "type": "boolean", "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] + }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. 
Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] } }, "additionalProperties": false, @@ -6480,6 +6704,22 @@ "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] + }, "allow-workflows": { "type": "boolean", "description": "When true, adds workflows: write to the GitHub App token permissions. Required when allowed-files targets .github/workflows/ paths. 
Requires safe-outputs.github-app to be configured because the workflows permission is a GitHub App-only permission and cannot be granted via GITHUB_TOKEN.", @@ -6559,6 +6799,22 @@ "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] + }, "required-labels": { "type": "array", "items": { @@ -6652,6 +6908,22 @@ "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. 
Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] + }, "required-labels": { "type": "array", "items": { @@ -6723,6 +6995,22 @@ "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] + }, "required-labels": { "type": "array", "items": { @@ -6789,6 +7077,22 @@ "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. 
Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] + }, "required-labels": { "type": "array", "items": { @@ -6853,6 +7157,22 @@ "type": "boolean", "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] + }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] } }, "additionalProperties": false @@ -6892,6 +7212,22 @@ "type": "boolean", "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] + }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. 
Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] } }, "additionalProperties": false @@ -6936,6 +7272,22 @@ "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] + }, "github-app": { "$ref": "#/$defs/github_app", "description": "GitHub App credentials for minting an installation access token scoped to checks:write for this handler. When set, a short-lived token is minted before the handler runs and revoked afterwards." @@ -7045,6 +7397,22 @@ "type": "boolean", "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] + }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. 
Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] } }, "additionalProperties": false @@ -7130,6 +7498,22 @@ "type": "boolean", "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] + }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] } }, "additionalProperties": false @@ -7239,6 +7623,22 @@ "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. 
Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] + }, "required-labels": { "type": "array", "items": { @@ -7313,6 +7713,22 @@ "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] + }, "required-labels": { "type": "array", "items": { @@ -7420,6 +7836,22 @@ "type": "boolean", "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] + }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. 
Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] } }, "additionalProperties": false @@ -7494,6 +7926,22 @@ "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] + }, "required-labels": { "type": "array", "items": { @@ -7573,6 +8021,22 @@ "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. 
Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] + }, "required-labels": { "type": "array", "items": { @@ -7664,6 +8128,22 @@ "type": "boolean", "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] + }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] } }, "additionalProperties": false @@ -7738,6 +8218,22 @@ "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. 
Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] + }, "required-labels": { "type": "array", "items": { @@ -7826,6 +8322,22 @@ "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] + }, "required-labels": { "type": "array", "items": { @@ -8024,6 +8536,22 @@ "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. 
Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] + }, "github-token-for-extra-empty-commit": { "type": "string", "description": "Token used to push an empty commit after pushing changes to trigger CI events. Works around the GITHUB_TOKEN limitation where pushes don't trigger workflow runs. Defaults to the magic secret GH_AW_CI_TRIGGER_TOKEN if set in the repository. Use a secret expression (e.g. '${{ secrets.CI_TOKEN }}') for a custom token, or 'app' for GitHub App auth." @@ -8209,6 +8737,22 @@ "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. 
Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] + }, "required-labels": { "type": "array", "items": { @@ -8283,6 +8827,22 @@ "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] + }, "required-labels": { "type": "array", "items": { @@ -8355,6 +8915,22 @@ "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. 
Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] + }, "required-labels": { "type": "array", "items": { @@ -8420,6 +8996,22 @@ "type": "boolean", "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] + }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] } }, "required": ["workflows"], @@ -8576,6 +9168,22 @@ "type": "boolean", "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] + }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. 
Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] } }, "required": ["workflows"], @@ -8640,6 +9248,22 @@ "type": "boolean", "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] + }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] } }, "additionalProperties": false @@ -8702,6 +9326,22 @@ "type": "boolean", "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] + }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. 
Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] } }, "additionalProperties": false @@ -8752,6 +9392,22 @@ "type": "boolean", "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] + }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] } }, "additionalProperties": false @@ -8817,6 +9473,22 @@ "type": "boolean", "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] + }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. 
Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] } }, "additionalProperties": false @@ -8969,6 +9641,22 @@ "type": "boolean", "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] + }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] } }, "additionalProperties": false @@ -9662,6 +10350,22 @@ "type": "boolean", "description": "If true, emit step summary messages instead of making GitHub API calls for this specific output type (preview mode)", "examples": [true, false] + }, + "samples": { + "description": "Internal hidden feature. Optional list of declarative sample payloads that exercise this safe-output handler. Used by the hidden `gh aw compile --use-samples` flag to replace the agentic step with a deterministic replay through the safe-outputs MCP server. 
Each entry should conform to the corresponding MCP tool inputSchema; recognized sidecar keys (currently `patch` for create-pull-request and push-to-pull-request-branch) are stripped before schema validation and consumed by the replay driver.", + "oneOf": [ + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": true + } + }, + { + "type": "object", + "additionalProperties": true + } + ] } }, "additionalProperties": false diff --git a/pkg/workflow/compiler_types.go b/pkg/workflow/compiler_types.go index 9c0639c4977..4ea0310b547 100644 --- a/pkg/workflow/compiler_types.go +++ b/pkg/workflow/compiler_types.go @@ -70,6 +70,7 @@ type Compiler struct { forceStaged bool // If true, force all safe-outputs into staged mode trialMode bool // If true, suppress safe outputs for trial mode execution trialLogicalRepoSlug string // If set in trial mode, the logical repository to checkout + useSamples bool // If true, replace the agentic step with a deterministic samples replay driver (hidden feature) refreshStopTime bool // If true, regenerate stop-after times instead of preserving existing ones forceRefreshActionPins bool // If true, clear action cache and resolve all actions from GitHub API failFast bool // If true, stop at first validation error instead of collecting all errors @@ -203,6 +204,14 @@ func (c *Compiler) SetTrialLogicalRepoSlug(repo string) { c.trialLogicalRepoSlug = repo } +// SetUseSamples configures whether to replace the agentic step with a +// deterministic replay driver that feeds `samples` entries to the safe-outputs +// MCP server via real `tools/call` JSON-RPC. Hidden feature used by +// `gh aw compile --use-samples`. 
+func (c *Compiler) SetUseSamples(use bool) { + c.useSamples = use +} + // SetStrictMode configures whether to enable strict validation mode func (c *Compiler) SetStrictMode(strict bool) { c.strictMode = strict @@ -446,6 +455,7 @@ type WorkflowData struct { WorkflowID string // workflow identifier derived from markdown filename (basename without extension) TrialMode bool // whether the workflow is running in trial mode TrialLogicalRepo string // target repository slug for trial mode (owner/repo) + UseSamples bool // whether the agentic step should be replaced by a deterministic samples replay driver (hidden feature) FrontmatterName string // name field from frontmatter (for code scanning alert driver default) FrontmatterEmoji string // emoji field from frontmatter (for display in footers and UI) FrontmatterYAML string // raw frontmatter YAML content (rendered as comment in lock file for reference) @@ -627,6 +637,8 @@ type BaseSafeOutputConfig struct { GitHubApp *GitHubAppConfig `yaml:"github-app,omitempty"` // GitHub App credentials for minting a per-handler installation access token Staged bool `yaml:"staged,omitempty"` // If true, emit step summary messages instead of making GitHub API calls for this specific output type NormalizeClosingKeywords *bool `yaml:"normalize-closing-keywords,omitempty"` // When true for this output type, strip backticks from recognized issue-closing keywords in body fields. + // Samples carries deterministic replay samples for the hidden `gh aw compile --use-samples` flag. Each entry is the JSON object passed to the corresponding MCP tool's `tools/call` arguments. Sample-only sidecar fields (e.g. `patch` for create_pull_request) are stripped before the call and used by the replay driver. 
+ Samples []map[string]any `yaml:"samples,omitempty"` } // SafeOutputsConfig holds configuration for automatic output routes diff --git a/pkg/workflow/compiler_validators.go b/pkg/workflow/compiler_validators.go index 316ba6a8b27..d1286dcd4bb 100644 --- a/pkg/workflow/compiler_validators.go +++ b/pkg/workflow/compiler_validators.go @@ -152,6 +152,7 @@ func (c *Compiler) validateCoreToolConfiguration(workflowData *WorkflowData, mar {logMessage: "Validating sandbox configuration", validateFn: func() error { return validateSandboxConfig(workflowData) }}, {logMessage: "Validating safe-outputs target fields", validateFn: func() error { return validateSafeOutputsTarget(workflowData.SafeOutputs) }}, {logMessage: "Validating safe-outputs max fields", validateFn: func() error { return validateSafeOutputsMax(workflowData.SafeOutputs) }}, + {logMessage: "Validating safe-outputs samples entries against MCP tool schemas", validateFn: func() error { return validateSafeOutputsSamples(workflowData.SafeOutputs) }}, {logMessage: "Validating safe-outputs allowed-domains", validateFn: func() error { return c.validateSafeOutputsAllowedDomains(workflowData.SafeOutputs) }}, {logMessage: "Validating safe-outputs merge-pull-request", validateFn: func() error { return validateSafeOutputsMergePullRequest(workflowData.SafeOutputs) }}, {logMessage: "Validating safe-outputs needs declarations", validateFn: func() error { return validateSafeOutputsNeeds(workflowData) }}, diff --git a/pkg/workflow/compiler_yaml_ai_execution.go b/pkg/workflow/compiler_yaml_ai_execution.go index b3b6826b74a..48473d9c883 100644 --- a/pkg/workflow/compiler_yaml_ai_execution.go +++ b/pkg/workflow/compiler_yaml_ai_execution.go @@ -9,6 +9,14 @@ import ( // generateEngineExecutionSteps generates the GitHub Actions steps for executing the AI engine func (c *Compiler) generateEngineExecutionSteps(yaml *strings.Builder, data *WorkflowData, engine CodingAgentEngine, logFile string) { + // --use-samples (hidden) replaces the 
agent step with a deterministic driver + // that replays the workflow's safe-outputs `samples` frontmatter entries + // through the safe-outputs MCP server. The engine is never invoked. + if data.UseSamples { + compilerYamlLog.Printf("Replacing engine execution with samples replay driver: engine=%s", engine.GetID()) + c.generateSamplesReplayStep(yaml, data, logFile) + return + } steps := engine.GetExecutionSteps(data, logFile) compilerYamlLog.Printf("Generating engine execution steps: engine=%s, steps=%d", engine.GetID(), len(steps)) diff --git a/pkg/workflow/safe_outputs_config.go b/pkg/workflow/safe_outputs_config.go index 58dc548537a..3899b7c28e2 100644 --- a/pkg/workflow/safe_outputs_config.go +++ b/pkg/workflow/safe_outputs_config.go @@ -693,6 +693,14 @@ func (c *Compiler) extractSafeOutputsConfig(frontmatter map[string]any) *SafeOut } } + // Force-disable threat detection when --use-samples is active: the replay driver + // emits synthetic outputs solely for deterministic end-to-end tests, and running + // an LLM-backed detection pass would defeat that determinism. + if config != nil && c.useSamples && config.ThreatDetection != nil { + safeOutputsConfigLog.Print("Disabling threat-detection because --use-samples is set") + config.ThreatDetection = nil + } + if config != nil { safeOutputsConfigLog.Print("Successfully extracted safe-outputs configuration") } else { @@ -755,6 +763,48 @@ func (c *Compiler) parseBaseSafeOutputConfig(configMap map[string]any, config *B config.Staged = stagedBool } } + + // Parse samples list (hidden feature: deterministic replay samples for --use-samples). + // Accepts either a YAML list of objects, or a single object that is auto-wrapped + // into a one-element list. The JSON schema rejects scalar/string shapes so we + // don't need a defensive YAML-string branch here. 
+ if samples, exists := configMap["samples"]; exists { + parsed := parseSamplesValue(samples) + if len(parsed) > 0 { + safeOutputsConfigLog.Printf("Parsed %d samples entries", len(parsed)) + config.Samples = parsed + } + } +} + +// parseSamplesValue normalizes a `samples` frontmatter value into a list of +// objects. Accepted shapes: +// - YAML list of mappings: returned as-is +// - single YAML mapping: wrapped into a one-element list +// +// Any other shape returns an empty slice — schema validation rejects those +// shapes upstream and we keep this parser strict to match. +func parseSamplesValue(samples any) []map[string]any { + switch v := samples.(type) { + case []any: + out := make([]map[string]any, 0, len(v)) + for _, item := range v { + if m, ok := item.(map[string]any); ok { + out = append(out, m) + } else if mStr, ok := item.(map[string]string); ok { + converted := make(map[string]any, len(mStr)) + for k, s := range mStr { + converted[k] = s + } + out = append(out, converted) + } + } + return out + case map[string]any: + return []map[string]any{v} + default: + return nil + } } // SafeOutputStepConfig holds configuration for building a single safe output step diff --git a/pkg/workflow/samples_replay.go b/pkg/workflow/samples_replay.go new file mode 100644 index 00000000000..28f16afe1c4 --- /dev/null +++ b/pkg/workflow/samples_replay.go @@ -0,0 +1,112 @@ +package workflow + +import ( + "encoding/json" + "fmt" + "sort" + "strings" +) + +// SampleEntry is the per-call payload consumed by apply_samples.cjs. +// Each entry corresponds to a single MCP `tools/call` invocation. +type SampleEntry struct { + // Tool is the snake_case MCP tool name (e.g. "create_pull_request"). + Tool string `json:"tool"` + // Arguments are passed verbatim as the MCP `tools/call` arguments. + // Sample sidecar fields (e.g. `patch`) have already been stripped. 
+ Arguments map[string]any `json:"arguments"` + // Sidecars carries fields stripped from Arguments that need out-of-band + // pre-staging by the driver (e.g. `patch` for create_pull_request). + Sidecars map[string]any `json:"sidecars,omitempty"` +} + +// collectSampleEntries walks the safe-outputs config and flattens every +// configured `samples` entry into the order they will be sent to the MCP +// server. Iteration order is deterministic (sorted by struct field name) so +// that compiled YAML is stable across runs. +func collectSampleEntries(config *SafeOutputsConfig) []SampleEntry { + if config == nil { + return nil + } + + fieldNames := make([]string, 0, len(safeOutputFieldMapping)) + for fieldName := range safeOutputFieldMapping { + fieldNames = append(fieldNames, fieldName) + } + sort.Strings(fieldNames) + + var entries []SampleEntry + for _, fieldName := range fieldNames { + toolName := safeOutputFieldMapping[fieldName] + base := extractBaseSafeOutputConfig(config, fieldName) + if base == nil || len(base.Samples) == 0 { + continue + } + sidecarKeys := sampleSidecarFields[toolName] + for _, sample := range base.Samples { + args := make(map[string]any, len(sample)) + var sidecars map[string]any + for k, v := range sample { + if sidecarKeys[k] { + if sidecars == nil { + sidecars = make(map[string]any) + } + sidecars[k] = v + continue + } + args[k] = v + } + entries = append(entries, SampleEntry{ + Tool: toolName, + Arguments: args, + Sidecars: sidecars, + }) + } + } + return entries +} + +// generateSamplesReplayStep emits the YAML that replaces the agentic +// `Execute coding agent` step when the hidden `gh aw compile --use-samples` +// flag is used. It spawns the safe-outputs MCP server over stdio and feeds it +// a `tools/call` for every collected sample, after pre-staging branches/patches +// for samples that carry them. 
+func (c *Compiler) generateSamplesReplayStep(yaml *strings.Builder, data *WorkflowData, logFile string) { + entries := collectSampleEntries(data.SafeOutputs) + compilerYamlLog.Printf("Generating samples replay step: entries=%d", len(entries)) + + // Normalize a nil slice to an empty slice so json.Marshal emits "[]" not "null". + // The driver rejects anything that isn't a JSON array; emitting "null" here + // would crash the replay step with `GH_AW_SAMPLES must be a JSON array` for + // workflows that opt into --use-samples but configure no samples (or whose + // configured samples all live on disabled handlers). + if entries == nil { + entries = []SampleEntry{} + } + + // Serialize entries to JSON for the driver. Always emit valid JSON even when + // empty so the driver can produce a clear `no samples configured` message + // rather than crashing on an empty env var. + payload, err := json.Marshal(entries) + if err != nil { + // Should never happen for map[string]any payloads; fall back to empty + // array so the workflow still compiles and the driver reports cleanly. 
+ compilerYamlLog.Printf("Warning: failed to marshal samples entries: %v", err) + payload = []byte("[]") + } + + yaml.WriteString(" - name: Replay safe-outputs samples (deterministic)\n") + yaml.WriteString(" id: agentic_execution\n") + yaml.WriteString(" env:\n") + yaml.WriteString(" GH_AW_SAMPLES: |\n") + for line := range strings.SplitSeq(string(payload), "\n") { + fmt.Fprintf(yaml, " %s\n", line) + } + fmt.Fprintf(yaml, " GH_AW_AGENT_STDIO_LOG: %s\n", logFile) + yaml.WriteString(" GH_AW_SAFE_OUTPUTS_CONFIG_PATH: ${{ runner.temp }}/gh-aw/safeoutputs/config.json\n") + yaml.WriteString(" GH_AW_SAFE_OUTPUTS: ${{ runner.temp }}/gh-aw/safeoutputs/outputs.jsonl\n") + yaml.WriteString(" run: |\n") + yaml.WriteString(" set -euo pipefail\n") + yaml.WriteString(" mkdir -p \"$(dirname \"$GH_AW_AGENT_STDIO_LOG\")\"\n") + yaml.WriteString(" node \"${{ runner.temp }}/gh-aw/actions/apply_samples.cjs\"\n") +} diff --git a/pkg/workflow/samples_replay_test.go b/pkg/workflow/samples_replay_test.go new file mode 100644 index 00000000000..ac8232fd825 --- /dev/null +++ b/pkg/workflow/samples_replay_test.go @@ -0,0 +1,341 @@ +//go:build integration + +package workflow + +import ( + "encoding/json" + "os" + "strings" + "testing" +) + +// TestUseSamplesReplacesAgentStep verifies that compiling with +// SetUseSamples(true) replaces the engine `Execute coding agent` step +// with the deterministic `Replay safe-outputs samples` step driven by +// apply_samples.cjs. +func TestUseSamplesReplacesAgentStep(t *testing.T) { + const md = `--- +on: + workflow_dispatch: +permissions: read-all +engine: + id: claude +safe-outputs: + create-issue: + samples: + - title: "Deterministic test issue" + body: "Issue body emitted by gh-aw samples replay." +--- + +Trivial workflow whose only job is to be compiled with --use-samples. 
+` + + tmpFile, err := os.CreateTemp("", "use-samples-*.md") + if err != nil { + t.Fatal(err) + } + defer os.Remove(tmpFile.Name()) + if _, err := tmpFile.WriteString(md); err != nil { + t.Fatal(err) + } + tmpFile.Close() + + t.Run("Default Mode", func(t *testing.T) { + compiler := NewCompiler() + if err := compiler.CompileWorkflow(tmpFile.Name()); err != nil { + t.Fatalf("compile failed: %v", err) + } + lockPath := strings.TrimSuffix(tmpFile.Name(), ".md") + ".lock.yml" + defer os.Remove(lockPath) + b, err := os.ReadFile(lockPath) + if err != nil { + t.Fatalf("read lock: %v", err) + } + lockContent := string(b) + if strings.Contains(lockContent, "Replay safe-outputs samples") { + t.Error("Did not expect samples replay step in default mode") + } + if strings.Contains(lockContent, "apply_samples.cjs") { + t.Error("Did not expect apply_samples driver in default mode") + } + }) + + t.Run("Use Samples Mode", func(t *testing.T) { + compiler := NewCompiler() + compiler.SetUseSamples(true) + if err := compiler.CompileWorkflow(tmpFile.Name()); err != nil { + t.Fatalf("compile failed: %v", err) + } + workflowData, err := compiler.ParseWorkflowFile(tmpFile.Name()) + if err != nil { + t.Fatalf("ParseWorkflowFile failed: %v", err) + } + if !workflowData.UseSamples { + t.Fatal("Expected workflowData.UseSamples to be true after SetUseSamples(true)") + } + lockPath := strings.TrimSuffix(tmpFile.Name(), ".md") + ".lock.yml" + defer os.Remove(lockPath) + b, err := os.ReadFile(lockPath) + if err != nil { + t.Fatalf("read lock: %v", err) + } + lockContent := string(b) + if !strings.Contains(lockContent, "Replay safe-outputs samples (deterministic)") { + t.Error("Expected `Replay safe-outputs samples (deterministic)` step in lock file") + } + if !strings.Contains(lockContent, "apply_samples.cjs") { + t.Error("Expected lock file to invoke apply_samples.cjs driver") + } + if !strings.Contains(lockContent, "GH_AW_SAMPLES:") { + t.Error("Expected GH_AW_SAMPLES env var in lock file") + } + 
if !strings.Contains(lockContent, `"tool":"create_issue"`) { + t.Error("Expected JSON-encoded create_issue tool entry in lock file") + } + if !strings.Contains(lockContent, "Deterministic test issue") { + t.Error("Expected sample title in lock file") + } + if !strings.Contains(lockContent, "id: agentic_execution") { + t.Error("Expected id: agentic_execution on the replay step") + } + // Threat detection must be force-disabled under --use-samples so the + // deterministic replay isn't perturbed by an LLM-backed detection job. + if strings.Contains(lockContent, "\n detection:\n") { + t.Error("Expected no `detection:` job under --use-samples") + } + }) +} + +// TestUseSamplesCreatePullRequestWithPatch is the end-to-end smoke test for +// the create-pull-request + patch sidecar flow. It compiles a workflow whose +// only safe-output is `create-pull-request` with a `samples` entry carrying +// a `patch` sidecar, then inspects the generated lock.yml to verify that: +// +// 1. The agentic step is replaced by the deterministic replay step +// 2. GH_AW_SAMPLES contains a JSON-encoded create_pull_request entry +// 3. The patch is partitioned into `sidecars`, NOT into `arguments` +// (the MCP server's create_pull_request handler must NOT receive `patch` +// as a tool argument — it derives the diff from the working tree) +// 4. The branch name and other PR fields land in `arguments` +// 5. The actual diff payload is preserved verbatim in the lock file +// (so the driver can `git apply` it at replay time) +// 6. 
No `detection:` job is emitted +func TestUseSamplesCreatePullRequestWithPatch(t *testing.T) { + const patch = "diff --git a/sample.txt b/sample.txt\nnew file mode 100644\nindex 0000000..1111111\n--- /dev/null\n+++ b/sample.txt\n@@ -0,0 +1 @@\n+hello from gh-aw samples\n" + + md := `--- +on: + workflow_dispatch: +permissions: read-all +engine: + id: claude +safe-outputs: + create-pull-request: + samples: + - title: "Sample PR from gh-aw" + body: "PR body emitted by samples replay." + branch: "feat/gh-aw-sample-pr" + patch: | +` + indentBlock(patch, " ") + `--- + +Trivial workflow exercising create-pull-request via --use-samples. +` + + tmpFile, err := os.CreateTemp("", "use-samples-cpr-*.md") + if err != nil { + t.Fatal(err) + } + defer os.Remove(tmpFile.Name()) + if _, err := tmpFile.WriteString(md); err != nil { + t.Fatal(err) + } + tmpFile.Close() + + compiler := NewCompiler() + compiler.SetUseSamples(true) + if err := compiler.CompileWorkflow(tmpFile.Name()); err != nil { + t.Fatalf("compile failed: %v", err) + } + lockPath := strings.TrimSuffix(tmpFile.Name(), ".md") + ".lock.yml" + defer os.Remove(lockPath) + b, err := os.ReadFile(lockPath) + if err != nil { + t.Fatalf("read lock: %v", err) + } + lock := string(b) + + // 1. Agentic step replaced + if !strings.Contains(lock, "Replay safe-outputs samples (deterministic)") { + t.Error("Expected `Replay safe-outputs samples (deterministic)` step in lock file") + } + if !strings.Contains(lock, "apply_samples.cjs") { + t.Error("Expected lock file to invoke apply_samples.cjs driver") + } + + // 2. GH_AW_SAMPLES contains a create_pull_request entry + if !strings.Contains(lock, "GH_AW_SAMPLES:") { + t.Fatal("Expected GH_AW_SAMPLES env var in lock file") + } + if !strings.Contains(lock, `"tool":"create_pull_request"`) { + t.Error("Expected JSON-encoded create_pull_request tool entry in lock file") + } + + // Extract the GH_AW_SAMPLES JSON block from the YAML for structural assertions. 
+ samplesJSON := extractGHAWSamplesJSON(t, lock) + var entries []map[string]any + if err := json.Unmarshal([]byte(samplesJSON), &entries); err != nil { + t.Fatalf("failed to parse GH_AW_SAMPLES JSON: %v\nRaw:\n%s", err, samplesJSON) + } + if len(entries) != 1 { + t.Fatalf("expected exactly one sample entry, got %d", len(entries)) + } + entry := entries[0] + + // 3. Patch is in sidecars, NOT in arguments + args, _ := entry["arguments"].(map[string]any) + sidecars, _ := entry["sidecars"].(map[string]any) + if args == nil { + t.Fatal("expected entry.arguments to be an object") + } + if _, hasPatchInArgs := args["patch"]; hasPatchInArgs { + t.Error("patch must be stripped from arguments — MCP create_pull_request handler must not receive it") + } + if sidecars == nil { + t.Fatal("expected entry.sidecars to be present (patch should land here)") + } + gotPatch, _ := sidecars["patch"].(string) + if gotPatch == "" { + t.Fatal("expected sidecars.patch to be a non-empty string") + } + + // 4. PR fields preserved in arguments + if args["title"] != "Sample PR from gh-aw" { + t.Errorf("arguments.title = %q, want %q", args["title"], "Sample PR from gh-aw") + } + if args["body"] != "PR body emitted by samples replay." { + t.Errorf("arguments.body = %q, want %q", args["body"], "PR body emitted by samples replay.") + } + if args["branch"] != "feat/gh-aw-sample-pr" { + t.Errorf("arguments.branch = %q, want %q", args["branch"], "feat/gh-aw-sample-pr") + } + + // 5. Patch payload preserved verbatim + if !strings.Contains(gotPatch, "diff --git a/sample.txt b/sample.txt") { + t.Errorf("sidecars.patch missing diff header; got: %q", gotPatch) + } + if !strings.Contains(gotPatch, "+hello from gh-aw samples") { + t.Errorf("sidecars.patch missing payload line; got: %q", gotPatch) + } + + // 6. No detection job + if strings.Contains(lock, "\n detection:\n") { + t.Error("Expected no `detection:` job under --use-samples") + } +} + +// indentBlock prefixes every line of s with prefix. 
Used to embed a multi-line +// patch under a YAML block scalar in the test fixture. +func indentBlock(s, prefix string) string { + lines := strings.Split(strings.TrimRight(s, "\n"), "\n") + for i, line := range lines { + lines[i] = prefix + line + } + return strings.Join(lines, "\n") + "\n" +} + +// extractGHAWSamplesJSON pulls the literal block scalar value of GH_AW_SAMPLES +// out of the compiled YAML and returns the unindented JSON text. This avoids +// pulling in a full YAML parser for what is a tightly-controlled emit format. +func extractGHAWSamplesJSON(t *testing.T, lock string) string { + t.Helper() + const marker = "GH_AW_SAMPLES: |\n" + start := strings.Index(lock, marker) + if start < 0 { + t.Fatalf("could not find %q in lock file", marker) + } + start += len(marker) + // Determine indentation from the first content line. + rest := lock[start:] + firstNL := strings.Index(rest, "\n") + if firstNL < 0 { + t.Fatal("malformed GH_AW_SAMPLES block: no newline after first line") + } + firstLine := rest[:firstNL] + indent := firstLine[:len(firstLine)-len(strings.TrimLeft(firstLine, " "))] + if indent == "" { + t.Fatal("malformed GH_AW_SAMPLES block: expected indented content") + } + // Collect lines until we hit one that no longer starts with the same indent + // (i.e. the next YAML key like GH_AW_AGENT_STDIO_LOG). + var out strings.Builder + for _, line := range strings.Split(rest, "\n") { + if !strings.HasPrefix(line, indent) { + break + } + out.WriteString(strings.TrimPrefix(line, indent)) + out.WriteString("\n") + } + return strings.TrimSpace(out.String()) +} + +// TestUseSamplesEmitsEmptyArrayWhenNoSamplesConfigured guards against a +// regression where compiling with --use-samples but no `samples:` entries on +// any enabled handler caused json.Marshal of a nil Go slice to emit the +// literal string "null" into GH_AW_SAMPLES, which the driver rightly +// rejected with `GH_AW_SAMPLES must be a JSON array`. 
The compiler must +// emit "[]" instead so the driver can exit cleanly with `no samples to +// replay`. +func TestUseSamplesEmitsEmptyArrayWhenNoSamplesConfigured(t *testing.T) { + // Workflow opts into --use-samples and configures safe-outputs but has + // no `samples:` entries on the create-issue handler. + const md = `--- +on: + workflow_dispatch: +permissions: read-all +engine: + id: claude +safe-outputs: + create-issue: + title-prefix: "[no-samples] " +--- + +Workflow with safe-outputs but no samples — should still compile and +emit a valid empty-array GH_AW_SAMPLES under --use-samples. +` + + tmpFile, err := os.CreateTemp("", "use-samples-empty-*.md") + if err != nil { + t.Fatal(err) + } + defer os.Remove(tmpFile.Name()) + if _, err := tmpFile.WriteString(md); err != nil { + t.Fatal(err) + } + tmpFile.Close() + + compiler := NewCompiler() + compiler.SetUseSamples(true) + if err := compiler.CompileWorkflow(tmpFile.Name()); err != nil { + t.Fatalf("compile failed: %v", err) + } + lockPath := strings.TrimSuffix(tmpFile.Name(), ".md") + ".lock.yml" + defer os.Remove(lockPath) + b, err := os.ReadFile(lockPath) + if err != nil { + t.Fatalf("read lock: %v", err) + } + lock := string(b) + + // Must still emit the replay step. 
+ if !strings.Contains(lock, "Replay safe-outputs samples (deterministic)") { + t.Fatal("Expected replay step in lock file even with no samples configured") + } + + samplesJSON := extractGHAWSamplesJSON(t, lock) + if samplesJSON == "null" { + t.Fatalf("GH_AW_SAMPLES must not be the literal `null` (driver would reject it); got %q", samplesJSON) + } + if samplesJSON != "[]" { + t.Fatalf("GH_AW_SAMPLES = %q, want %q", samplesJSON, "[]") + } +} diff --git a/pkg/workflow/samples_threat_detection_test.go b/pkg/workflow/samples_threat_detection_test.go new file mode 100644 index 00000000000..ba7c082d2be --- /dev/null +++ b/pkg/workflow/samples_threat_detection_test.go @@ -0,0 +1,63 @@ +package workflow + +import "testing" + +// TestExtractSafeOutputsConfig_UseSamplesDisablesThreatDetection verifies +// that --use-samples force-disables threat detection so the deterministic +// replay isn't perturbed by an LLM-backed detection job. +func TestExtractSafeOutputsConfig_UseSamplesDisablesThreatDetection(t *testing.T) { + frontmatter := map[string]any{ + "safe-outputs": map[string]any{ + "create-issue": map[string]any{ + "samples": []any{ + map[string]any{"title": "x", "body": "y"}, + }, + }, + }, + } + + t.Run("default mode applies threat-detection", func(t *testing.T) { + c := NewCompiler() + cfg := c.extractSafeOutputsConfig(frontmatter) + if cfg == nil { + t.Fatal("expected non-nil SafeOutputsConfig") + } + if cfg.ThreatDetection == nil { + t.Fatal("expected default threat-detection to be applied in default mode") + } + }) + + t.Run("use-samples disables threat-detection (default)", func(t *testing.T) { + c := NewCompiler() + c.SetUseSamples(true) + cfg := c.extractSafeOutputsConfig(frontmatter) + if cfg == nil { + t.Fatal("expected non-nil SafeOutputsConfig") + } + if cfg.ThreatDetection != nil { + t.Fatal("expected threat-detection to be force-disabled under --use-samples") + } + }) + + t.Run("use-samples disables threat-detection (explicit true)", func(t *testing.T) { + 
fm := map[string]any{ + "safe-outputs": map[string]any{ + "threat-detection": true, + "create-issue": map[string]any{ + "samples": []any{ + map[string]any{"title": "x", "body": "y"}, + }, + }, + }, + } + c := NewCompiler() + c.SetUseSamples(true) + cfg := c.extractSafeOutputsConfig(fm) + if cfg == nil { + t.Fatal("expected non-nil SafeOutputsConfig") + } + if cfg.ThreatDetection != nil { + t.Fatal("expected explicit threat-detection: true to be force-disabled under --use-samples") + } + }) +} diff --git a/pkg/workflow/samples_validation.go b/pkg/workflow/samples_validation.go new file mode 100644 index 00000000000..b04219aa23c --- /dev/null +++ b/pkg/workflow/samples_validation.go @@ -0,0 +1,164 @@ +package workflow + +import ( + "encoding/json" + "fmt" + "reflect" + "sort" + "strings" + "sync" + + "github.com/santhosh-tekuri/jsonschema/v6" +) + +// sampleSidecarFields lists fields recognized inside a `samples` entry +// that are NOT passed to the MCP tool's `tools/call` arguments. They are stripped +// from the sample before schema validation and consumed by the replay driver +// (e.g. to pre-stage a branch + patch on disk). +var sampleSidecarFields = map[string]map[string]bool{ + "create_pull_request": { + "patch": true, + }, + "push_to_pull_request_branch": { + "patch": true, + }, +} + +// compiledToolSchemas caches the per-tool jsonschema.Schema parsed from the +// embedded safe_outputs_tools.json. Compiled lazily on first use. 
+var ( + compiledToolSchemasOnce sync.Once + compiledToolSchemas map[string]*jsonschema.Schema + compiledToolSchemasErr error +) + +func getCompiledToolSchemas() (map[string]*jsonschema.Schema, error) { + compiledToolSchemasOnce.Do(func() { + var tools []struct { + Name string `json:"name"` + InputSchema json.RawMessage `json:"inputSchema"` + } + if err := json.Unmarshal([]byte(safeOutputsToolsJSONContent), &tools); err != nil { + compiledToolSchemasErr = fmt.Errorf("failed to parse safe_outputs_tools.json for samples validation: %w", err) + return + } + out := make(map[string]*jsonschema.Schema, len(tools)) + for _, t := range tools { + if len(t.InputSchema) == 0 { + continue + } + var schemaDoc any + if err := json.Unmarshal(t.InputSchema, &schemaDoc); err != nil { + compiledToolSchemasErr = fmt.Errorf("failed to parse inputSchema for tool %q: %w", t.Name, err) + return + } + compiler := jsonschema.NewCompiler() + schemaURL := fmt.Sprintf("inmem://safe-outputs-tools/%s.json", t.Name) + if err := compiler.AddResource(schemaURL, schemaDoc); err != nil { + compiledToolSchemasErr = fmt.Errorf("failed to add schema resource for tool %q: %w", t.Name, err) + return + } + schema, err := compiler.Compile(schemaURL) + if err != nil { + compiledToolSchemasErr = fmt.Errorf("failed to compile inputSchema for tool %q: %w", t.Name, err) + return + } + out[t.Name] = schema + } + compiledToolSchemas = out + }) + return compiledToolSchemas, compiledToolSchemasErr +} + +// validateSafeOutputsSamples validates every `samples` entry on every +// enabled safe-output handler against the corresponding MCP tool's inputSchema. +// Sample sidecar fields (e.g. `patch`) are stripped before validation. Returns +// the first error encountered; iteration order is deterministic (sorted by +// struct field name) so error messages are stable. 
+func validateSafeOutputsSamples(config *SafeOutputsConfig) error { + if config == nil { + return nil + } + + fieldNames := make([]string, 0, len(safeOutputFieldMapping)) + for fieldName := range safeOutputFieldMapping { + fieldNames = append(fieldNames, fieldName) + } + sort.Strings(fieldNames) + + for _, fieldName := range fieldNames { + toolName := safeOutputFieldMapping[fieldName] + base := extractBaseSafeOutputConfig(config, fieldName) + if base == nil || len(base.Samples) == 0 { + continue + } + if err := validateSamplesForTool(toolName, base.Samples); err != nil { + return err + } + } + return nil +} + +// extractBaseSafeOutputConfig returns the embedded BaseSafeOutputConfig of the +// non-nil safe-output config at SafeOutputsConfig., or nil if the +// field is unset or the struct does not embed BaseSafeOutputConfig. +func extractBaseSafeOutputConfig(config *SafeOutputsConfig, fieldName string) *BaseSafeOutputConfig { + field, ok := safeOutputPointerFieldValue(config, fieldName) + if !ok || field.IsNil() { + return nil + } + elem := field.Elem() + if elem.Kind() != reflect.Struct { + return nil + } + baseField := elem.FieldByName("BaseSafeOutputConfig") + if !baseField.IsValid() || !baseField.CanAddr() { + return nil + } + if base, ok := baseField.Addr().Interface().(*BaseSafeOutputConfig); ok { + return base + } + return nil +} + +// validateSamplesForTool validates each sample against the named MCP tool's +// inputSchema after stripping recognized sidecar fields. +func validateSamplesForTool(toolName string, samples []map[string]any) error { + schemas, err := getCompiledToolSchemas() + if err != nil { + return err + } + schema, found := schemas[toolName] + if !found { + return fmt.Errorf("samples: no MCP tool schema found for %q (yaml key %q). 
Available tools come from pkg/workflow/js/safe_outputs_tools.json", toolName, toolDisplayKey(toolName)) + } + displayKey := toolDisplayKey(toolName) + sidecars := sampleSidecarFields[toolName] + for i, sample := range samples { + stripped := stripSidecarFields(sample, sidecars) + if err := schema.Validate(stripped); err != nil { + return fmt.Errorf("safe-outputs.%s.samples[%d]: %w", displayKey, i, err) + } + } + return nil +} + +// stripSidecarFields returns a shallow copy of sample with sidecar keys removed. +// The original map is never modified, even when no sidecars are configured — +// callers may mutate the returned map without affecting the caller's input. +func stripSidecarFields(sample map[string]any, sidecars map[string]bool) map[string]any { + out := make(map[string]any, len(sample)) + for k, v := range sample { + if sidecars[k] { + continue + } + out[k] = v + } + return out +} + +// toolDisplayKey converts a snake_case MCP tool name into the hyphenated YAML +// frontmatter key (e.g. "create_pull_request" -> "create-pull-request"). +func toolDisplayKey(toolName string) string { + return strings.ReplaceAll(toolName, "_", "-") +} diff --git a/pkg/workflow/samples_validation_test.go b/pkg/workflow/samples_validation_test.go new file mode 100644 index 00000000000..394d35d665d --- /dev/null +++ b/pkg/workflow/samples_validation_test.go @@ -0,0 +1,170 @@ +package workflow + +import ( + "strings" + "testing" +) + +// TestValidateSafeOutputsSamples_Valid covers the happy path for the +// strict schema validation of samples entries. We use create_issue (no +// sidecars, just title/body) and create_pull_request (with the `patch` sidecar +// that must be stripped before validation). 
+func TestValidateSafeOutputsSamples_Valid(t *testing.T) { + cfg := &SafeOutputsConfig{ + CreateIssues: &CreateIssuesConfig{ + BaseSafeOutputConfig: BaseSafeOutputConfig{ + Samples: []map[string]any{ + { + "title": "Sample issue", + "body": "Sample body", + }, + }, + }, + }, + CreatePullRequests: &CreatePullRequestsConfig{ + BaseSafeOutputConfig: BaseSafeOutputConfig{ + Samples: []map[string]any{ + { + "title": "Sample PR", + "body": "Sample PR body", + "branch": "gh-aw-sample-pr", + // patch is a sidecar — must be stripped before validation + // and must NOT cause an `additionalProperties` failure. + "patch": "diff --git a/foo b/foo\nnew file mode 100644\n--- /dev/null\n+++ b/foo\n@@ -0,0 +1 @@\n+hi\n", + }, + }, + }, + }, + } + if err := validateSafeOutputsSamples(cfg); err != nil { + t.Fatalf("expected no validation error, got: %v", err) + } +} + +// TestValidateSafeOutputsSamples_MissingRequired verifies that omitting a +// required field (title) surfaces a stable, parseable error. +func TestValidateSafeOutputsSamples_MissingRequired(t *testing.T) { + cfg := &SafeOutputsConfig{ + CreateIssues: &CreateIssuesConfig{ + BaseSafeOutputConfig: BaseSafeOutputConfig{ + Samples: []map[string]any{ + { + // title intentionally missing + "body": "Body without title", + }, + }, + }, + }, + } + err := validateSafeOutputsSamples(cfg) + if err == nil { + t.Fatal("expected validation error for missing title, got nil") + } + msg := err.Error() + if !strings.Contains(msg, "create-issue") { + t.Errorf("expected error to reference the YAML key `create-issue`, got: %s", msg) + } + if !strings.Contains(msg, "samples[0]") { + t.Errorf("expected error to reference `samples[0]`, got: %s", msg) + } +} + +// TestValidateSafeOutputsSamples_SidecarStripped verifies that the `patch` +// sidecar is stripped before validation, so a create_pull_request sample with +// only the schema-required fields PLUS a patch validates cleanly. 
+func TestValidateSafeOutputsSamples_SidecarStripped(t *testing.T) { + cfg := &SafeOutputsConfig{ + CreatePullRequests: &CreatePullRequestsConfig{ + BaseSafeOutputConfig: BaseSafeOutputConfig{ + Samples: []map[string]any{ + { + "title": "PR", + "body": "PR body", + "branch": "gh-aw-x", + "patch": "diff --git a/x b/x\n", + }, + }, + }, + }, + } + if err := validateSafeOutputsSamples(cfg); err != nil { + t.Fatalf("expected sidecar to be stripped and validation to pass, got: %v", err) + } +} + +// TestCollectSampleEntries_DeterministicOrdering verifies that entries are +// emitted in a stable order across runs (sorted by SafeOutputsConfig field name) +// so that compiled YAML is deterministic. +func TestCollectSampleEntries_DeterministicOrdering(t *testing.T) { + cfg := &SafeOutputsConfig{ + CreateIssues: &CreateIssuesConfig{ + BaseSafeOutputConfig: BaseSafeOutputConfig{ + Samples: []map[string]any{ + {"title": "A", "body": "A"}, + }, + }, + }, + AddComments: &AddCommentsConfig{ + BaseSafeOutputConfig: BaseSafeOutputConfig{ + Samples: []map[string]any{ + {"body": "comment-A"}, + }, + }, + }, + } + first := collectSampleEntries(cfg) + second := collectSampleEntries(cfg) + + if len(first) != 2 { + t.Fatalf("expected 2 entries, got %d", len(first)) + } + if first[0].Tool != second[0].Tool || first[1].Tool != second[1].Tool { + t.Errorf("expected deterministic ordering across runs, got first=%v second=%v", first, second) + } + // Sorted by struct field name: AddComments < CreateIssues. + if first[0].Tool != "add_comment" { + t.Errorf("expected first entry tool to be add_comment (alphabetical struct field order), got %q", first[0].Tool) + } + if first[1].Tool != "create_issue" { + t.Errorf("expected second entry tool to be create_issue, got %q", first[1].Tool) + } +} + +// TestCollectSampleEntries_SidecarPartitioning verifies that sidecar fields +// land in Sidecars (not Arguments) so the driver knows what to pre-stage. 
+func TestCollectSampleEntries_SidecarPartitioning(t *testing.T) { + cfg := &SafeOutputsConfig{ + CreatePullRequests: &CreatePullRequestsConfig{ + BaseSafeOutputConfig: BaseSafeOutputConfig{ + Samples: []map[string]any{ + { + "title": "PR", + "body": "Body", + "branch": "br", + "patch": "diff --git a/x b/x\n", + }, + }, + }, + }, + } + entries := collectSampleEntries(cfg) + if len(entries) != 1 { + t.Fatalf("expected 1 entry, got %d", len(entries)) + } + e := entries[0] + if e.Tool != "create_pull_request" { + t.Errorf("expected tool create_pull_request, got %q", e.Tool) + } + if _, hasPatchInArgs := e.Arguments["patch"]; hasPatchInArgs { + t.Error("expected patch to be stripped from Arguments") + } + if e.Arguments["title"] != "PR" || e.Arguments["body"] != "Body" || e.Arguments["branch"] != "br" { + t.Errorf("expected title/body/branch to remain in Arguments, got %#v", e.Arguments) + } + if e.Sidecars == nil { + t.Fatal("expected Sidecars to be non-nil") + } + if patch, ok := e.Sidecars["patch"].(string); !ok || !strings.HasPrefix(patch, "diff --git") { + t.Errorf("expected patch to be present in Sidecars as a git diff string, got %#v", e.Sidecars["patch"]) + } +} diff --git a/pkg/workflow/workflow_builder.go b/pkg/workflow/workflow_builder.go index 762de84d3ea..2a87ff848a3 100644 --- a/pkg/workflow/workflow_builder.go +++ b/pkg/workflow/workflow_builder.go @@ -67,6 +67,7 @@ func (c *Compiler) buildInitialWorkflowData( ToolsStartupTimeout: toolsResult.toolsStartupTimeout, TrialMode: c.trialMode, TrialLogicalRepo: c.trialLogicalRepoSlug, + UseSamples: c.useSamples, StrictMode: c.strictMode, AllowActionRefs: c.allowActionRefs, ValidateAWFConfig: !c.skipValidation,