From 391cfffd6c1b9cf7b009ee0b9daacd0bdc96243c Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sun, 14 Jun 2026 07:17:25 -0600 Subject: [PATCH] feat(bench): generalized AgentProfile-coordinate optimizer on the sandbox surface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Optimize ANY genome coordinate (skills/hooks/tools/prompt/subagents — mcp same shape) of a real sandboxed harness worker, holding the rest of the profile fixed. Each coordinate is a compose(profile, selected) that injects into its own AgentProfile field; freeze = don't select it, combine = fold a winner into the base. - profile-coordinates.ts: the coordinate registry (one composer per genome field). - profile-coord-sandbox.mts: COORDINATE= runner, with-vs-without on the sandboxed worker, deterministic-judge bench. AGENT=worker wired; driver is a marked seam (same compose — a driver/worker/subagent are all AgentProfiles). - skill-sandbox-smoke.mts: proves a SKILL.md materializes to disk in the box (resources.skills → ~/.claude/skills/<name>/SKILL.md, verified by the in-box agent). - coding-skills/ + eops-skills/: real agent-under-test skills (not prompt text). Runs the skills lever on the sandbox surface (EOPS = banded agentic judge; HumanEval = deterministic checker).
--- bench/src/coding-skills/minimal-diff.md | 9 ++ bench/src/coding-skills/read-before-edit.md | 9 ++ bench/src/coding-skills/reproduce-first.md | 10 ++ .../src/coding-skills/run-tests-after-edit.md | 9 ++ bench/src/coding-skills/trace-the-failure.md | 9 ++ .../src/eops-skills/address-every-subtask.md | 5 + bench/src/eops-skills/exact-tools-and-args.md | 5 + .../src/eops-skills/full-sequence-to-goal.md | 5 + bench/src/eops-skills/ground-every-value.md | 5 + bench/src/eops-skills/honor-the-policies.md | 5 + bench/src/profile-coord-sandbox.mts | 111 +++++++++++++++ bench/src/profile-coordinates.ts | 134 ++++++++++++++++++ bench/src/skill-sandbox-smoke.mts | 80 +++++++++++ bench/src/skills-sandbox.mts | 105 ++++++++++++++ 14 files changed, 501 insertions(+) create mode 100644 bench/src/coding-skills/minimal-diff.md create mode 100644 bench/src/coding-skills/read-before-edit.md create mode 100644 bench/src/coding-skills/reproduce-first.md create mode 100644 bench/src/coding-skills/run-tests-after-edit.md create mode 100644 bench/src/coding-skills/trace-the-failure.md create mode 100644 bench/src/eops-skills/address-every-subtask.md create mode 100644 bench/src/eops-skills/exact-tools-and-args.md create mode 100644 bench/src/eops-skills/full-sequence-to-goal.md create mode 100644 bench/src/eops-skills/ground-every-value.md create mode 100644 bench/src/eops-skills/honor-the-policies.md create mode 100644 bench/src/profile-coord-sandbox.mts create mode 100644 bench/src/profile-coordinates.ts create mode 100644 bench/src/skill-sandbox-smoke.mts create mode 100644 bench/src/skills-sandbox.mts diff --git a/bench/src/coding-skills/minimal-diff.md b/bench/src/coding-skills/minimal-diff.md new file mode 100644 index 00000000..65b0cf90 --- /dev/null +++ b/bench/src/coding-skills/minimal-diff.md @@ -0,0 +1,9 @@ +--- +name: minimal-diff +description: Make the smallest change that satisfies the task; do not touch unrelated code. +--- +Keep the diff minimal: +1. 
Change only what the task requires; leave unrelated code, formatting, and files alone. +2. Do not refactor, rename, or "clean up" beyond the ask — each extra change is a chance to break a check. +3. Prefer the local, surgical fix over a broad rewrite. +The grader is watching the whole repo state; unrequested changes are pure downside. diff --git a/bench/src/coding-skills/read-before-edit.md b/bench/src/coding-skills/read-before-edit.md new file mode 100644 index 00000000..ee29b9a3 --- /dev/null +++ b/bench/src/coding-skills/read-before-edit.md @@ -0,0 +1,9 @@ +--- +name: read-before-edit +description: Read the target file and the code that calls it before editing, so a change doesn't break callers. +--- +Before editing a function or module: +1. Read the full file you're about to change, not just the lines near the edit. +2. Find and read its callers (grep for the symbol) to learn the contract you must preserve. +3. Match the surrounding style and signatures; keep the change consistent with how the code already works. +Editing blind to callers is how a local fix becomes a regression elsewhere. diff --git a/bench/src/coding-skills/reproduce-first.md b/bench/src/coding-skills/reproduce-first.md new file mode 100644 index 00000000..2cbf7e8a --- /dev/null +++ b/bench/src/coding-skills/reproduce-first.md @@ -0,0 +1,10 @@ +--- +name: reproduce-first +description: Before changing any code to fix a bug or failing test, run the failing test/command first to observe the real error. +--- +When the task is to fix a bug or make a failing test pass: +1. Run the exact failing test or command FIRST and read the actual error/traceback. +2. Do not guess the cause from the description — confirm it from the real output. +3. Only then make the smallest change that addresses the observed failure. +4. Re-run the same test to confirm it now passes before moving on. +An assumed cause is the most common reason a fix doesn't work. 
diff --git a/bench/src/coding-skills/run-tests-after-edit.md b/bench/src/coding-skills/run-tests-after-edit.md new file mode 100644 index 00000000..d97d3112 --- /dev/null +++ b/bench/src/coding-skills/run-tests-after-edit.md @@ -0,0 +1,9 @@ +--- +name: run-tests-after-edit +description: After each code change, run the relevant tests and read the result before declaring the step done. +--- +After every change: +1. Run the narrowest test that covers what you changed (then the broader suite if time allows). +2. Read the output — a passing exit code is the only proof, not your expectation. +3. If it fails, treat the new error as the next problem to reproduce and fix; don't pile on more edits blind. +Never report a step finished on a change you have not actually run. diff --git a/bench/src/coding-skills/trace-the-failure.md b/bench/src/coding-skills/trace-the-failure.md new file mode 100644 index 00000000..3363a0c0 --- /dev/null +++ b/bench/src/coding-skills/trace-the-failure.md @@ -0,0 +1,9 @@ +--- +name: trace-the-failure +description: When a test or program fails, read the traceback from the top error to the deepest frame in your own code, and fix at the root. +--- +On a failure with a stack trace: +1. Read the actual exception type and message first. +2. Walk the frames to the DEEPEST one inside the code under test — that's usually where the root cause is, not the top frame. +3. Inspect the values at that frame (add a print/log if needed) before editing. +4. Fix the root cause, not the symptom that surfaced higher up. diff --git a/bench/src/eops-skills/address-every-subtask.md b/bench/src/eops-skills/address-every-subtask.md new file mode 100644 index 00000000..2b64e2df --- /dev/null +++ b/bench/src/eops-skills/address-every-subtask.md @@ -0,0 +1,5 @@ +--- +name: address-every-subtask +description: Decompose the request into every distinct sub-task and plan a tool call for each — partial completion is the dominant failure. 
+--- +Before planning calls, list every distinct change the request implies (each user, each ticket, each field). Plan tool calls that complete ALL of them. Many requests bundle several independent actions; finishing only the first is the most common way to fail the final-state check. diff --git a/bench/src/eops-skills/exact-tools-and-args.md b/bench/src/eops-skills/exact-tools-and-args.md new file mode 100644 index 00000000..74663542 --- /dev/null +++ b/bench/src/eops-skills/exact-tools-and-args.md @@ -0,0 +1,5 @@ +--- +name: exact-tools-and-args +description: Use only the listed tools, with their exact names and argument shapes. +--- +Call ONLY the tools listed as available, by their exact names. For each call, provide every required argument with the correct key names and value types the tool expects. A misspelled tool name or a missing/extra argument makes the call a no-op in the judge and the final state will be wrong. diff --git a/bench/src/eops-skills/full-sequence-to-goal.md b/bench/src/eops-skills/full-sequence-to-goal.md new file mode 100644 index 00000000..00c59a98 --- /dev/null +++ b/bench/src/eops-skills/full-sequence-to-goal.md @@ -0,0 +1,5 @@ +--- +name: full-sequence-to-goal +description: Plan the COMPLETE ordered sequence that brings the database to the required final state. +--- +Think in terms of the required FINAL state, then plan the full ordered sequence of calls that gets there from the seeded start — including any reads needed to ground values, and in an order that respects dependencies (create before reference, set status before close). Do not stop at the first action. diff --git a/bench/src/eops-skills/ground-every-value.md b/bench/src/eops-skills/ground-every-value.md new file mode 100644 index 00000000..48317bf9 --- /dev/null +++ b/bench/src/eops-skills/ground-every-value.md @@ -0,0 +1,5 @@ +--- +name: ground-every-value +description: Never invent a field value, id, or relationship — derive each from what the task gives you. 
+--- +Do not guess ids, names, statuses, or relationships. Use the exact values the task supplies; when a value must be looked up, plan the read call first and use its result. A hallucinated field value is a leading cause of a silently-wrong final state. diff --git a/bench/src/eops-skills/honor-the-policies.md b/bench/src/eops-skills/honor-the-policies.md new file mode 100644 index 00000000..b318e75b --- /dev/null +++ b/bench/src/eops-skills/honor-the-policies.md @@ -0,0 +1,5 @@ +--- +name: honor-the-policies +description: Re-read the role's policies and constraints before planning, and make every call comply. +--- +The role description contains policies (who may do what, required ordering, forbidden actions). Re-read them, then ensure every planned tool call complies — correct assignee, correct status transitions, no skipped approval step. A plan that reaches the data goal but violates a stated policy still fails. diff --git a/bench/src/profile-coord-sandbox.mts b/bench/src/profile-coord-sandbox.mts new file mode 100644 index 00000000..6ac856a2 --- /dev/null +++ b/bench/src/profile-coord-sandbox.mts @@ -0,0 +1,111 @@ +/** + * Generalized AgentProfile-coordinate optimizer on the sandbox surface. ONE runner for EVERY + * coordinate of the genome — skills, hooks, tools, prompt, subagents, mcp — over a real sandboxed + * harness agent. It measures the agent's task success WITH the coordinate's candidates injected + * vs WITHOUT (the frozen base profile), paired by task, on a deterministic-judge coding bench. + * + * Hold any coordinate fixed = don't select it (COORDINATE picks the one varied; everything else + * stays in the base profile). Combine = run several, or extend the base with a prior winner. 
/**
 * Generalized AgentProfile-coordinate optimizer on the sandbox surface. ONE runner for EVERY
 * coordinate of the genome — skills, hooks, tools, prompt, subagents, mcp — over a real sandboxed
 * harness agent. It measures the agent's task success WITH the coordinate's candidates injected
 * vs WITHOUT (the frozen base profile), paired by task, on a deterministic-judge coding bench.
 *
 * Hold any coordinate fixed = don't select it (COORDINATE picks the one varied; everything else
 * stays in the base profile). Combine = run several, or extend the base with a prior winner.
 *
 * Applies to ANY agent in the supervisor flow: AGENT=worker injects the profile into the
 * sandboxed worker (wired); AGENT=driver injects it into the driver/steer agent (same compose,
 * the seam is marked below) — because a driver, a worker, and a subagent are all AgentProfiles.
 *
 *   COORDINATE=skills BENCH=humaneval N=8 WORKER_MODEL=gpt-4.1 \
 *     dotenvx run -f …/.env.keys -- tsx src/profile-coord-sandbox.mts
 */
import type { AgentProfile } from '@tangle-network/sandbox'
import { Sandbox } from '@tangle-network/sandbox'
import { ADAPTERS } from './adapters'
import { type Arm, randomArm, runExperiment, sandboxAgentRun } from './experiment'
import { getCoordinate } from './profile-coordinates'

/** Returns env var `k`; throws when it is unset or empty. */
const must = (k: string): string => {
  const v = process.env[k]
  if (!v) throw new Error(`env ${k} is required`)
  return v
}

/**
 * Reads env var `k` as a non-negative integer; `fallback` applies only when the variable is unset.
 * Throws on anything `Number()` cannot turn into a non-negative integer, so a typo such as
 * `N=abc` fails here instead of reaching the experiment as `NaN`.
 */
const envInt = (k: string, fallback: number): number => {
  const raw = process.env[k]
  if (raw === undefined) return fallback
  const v = Number(raw)
  if (!Number.isInteger(v) || v < 0) {
    throw new Error(`env ${k} must be a non-negative integer (got ${JSON.stringify(raw)})`)
  }
  return v
}

/** Parses the optional PROFILE_JSON override; it must be a JSON object (unset/empty ⇒ `{}`). */
const parseProfileOverride = (raw: string | undefined): Partial<AgentProfile> => {
  if (!raw) return {}
  let parsed: unknown
  try {
    parsed = JSON.parse(raw)
  } catch (e: unknown) {
    throw new Error(`env PROFILE_JSON is not valid JSON: ${e instanceof Error ? e.message : String(e)}`)
  }
  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    throw new Error('env PROFILE_JSON must be a JSON object')
  }
  // NOTE(review): only the top-level shape is checked; the fields are trusted to match AgentProfile.
  return parsed as Partial<AgentProfile>
}

/**
 * Runs the selected coordinate's experiment twice — first with the frozen base profile, then
 * with every candidate of the coordinate composed in — and prints the resolved counts and the
 * delta to stderr.
 *
 * Env knobs read here: COORDINATE, BENCH, WORKER_MODEL, WORKER_PROVIDER, ROUTER_BASE,
 * TANGLE_API_KEY (required), BACKEND, AGENT, ROUNDS, N, CONCURRENCY, INFRA_RETRIES, IDS,
 * PROFILE_JSON, SANDBOX_BASE_URL.
 *
 * @throws on an unknown BENCH, a missing TANGLE_API_KEY, a malformed numeric/JSON knob, a
 *   coordinate with no candidates, or AGENT other than `worker`.
 */
async function main(): Promise<void> {
  const coordinate = getCoordinate(process.env.COORDINATE ?? 'skills')
  const make = ADAPTERS[process.env.BENCH ?? 'humaneval']
  if (!make) throw new Error(`unknown BENCH=${process.env.BENCH} (have: ${Object.keys(ADAPTERS).join(', ')})`)
  const adapter = make()
  const model = process.env.WORKER_MODEL ?? 'gpt-4.1'
  const routerBaseUrl = process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1'
  const routerKey = must('TANGLE_API_KEY')
  const backendType = (process.env.BACKEND as 'opencode' | 'claude-code' | undefined) ?? 'opencode'
  const agentTarget = process.env.AGENT ?? 'worker' // worker (wired) | driver (seam below)
  const rounds = envInt('ROUNDS', 1)
  const n = envInt('N', 8)
  const concurrency = envInt('CONCURRENCY', 3)
  const infraRetries = envInt('INFRA_RETRIES', 2)
  // Tolerate "a, b" and trailing commas; an all-empty list means "no explicit ids".
  const idList = (process.env.IDS ?? '')
    .split(',')
    .map((id) => id.trim())
    .filter((id) => id.length > 0)
  const ids = idList.length > 0 ? idList : undefined

  // The frozen base genome. Everything NOT under optimization lives here, untouched. Extend it
  // (PROFILE_JSON) to carry a prior winner from another coordinate — that is how coordinates
  // combine: each run freezes the others by leaving them in the base.
  const baseProfile: AgentProfile = {
    name: `${coordinate.name}-base`,
    ...parseProfileOverride(process.env.PROFILE_JSON),
  }
  const candidates = coordinate.candidates()
  // compose() returns the base unchanged for an empty selection, so both arms would be the
  // same profile and the comparison meaningless — stop before spending sandbox time on it.
  if (candidates.length === 0) {
    throw new Error(`coordinate ${coordinate.name} has no candidates — the WITH and WITHOUT arms would be identical`)
  }
  const withProfile = coordinate.compose(baseProfile, candidates)

  console.error(
    `=== PROFILE-COORD · coordinate=${coordinate.name} · agent=${agentTarget} · bench=${adapter.name} · ` +
      `backend=${backendType} · model=${model} · n=${n} ===\n` +
      ` candidates injected (held against the frozen base): ${candidates.join(', ')}\n`,
  )
  if (agentTarget !== 'worker') {
    throw new Error(`AGENT=${agentTarget} not yet wired — the compose is identical, but routing a profile into the driver/steer agent is the next seam (createExecutor backend for the driver). Run AGENT=worker.`)
  }

  const client = new Sandbox({
    baseUrl: process.env.SANDBOX_BASE_URL ?? 'https://sandbox.tangle.tools',
    apiKey: routerKey,
    timeoutMs: 1_200_000,
  } as never)

  const control: [Arm, ...Arm[]] = [randomArm('solve')]
  // One experiment run; the profile is the only thing that differs between the two arms.
  const run = (profile: AgentProfile) =>
    runExperiment({
      adapter,
      sandboxClient: client,
      agentRun: sandboxAgentRun({
        model,
        routerBaseUrl,
        backendType,
        ...(process.env.WORKER_PROVIDER ? { provider: process.env.WORKER_PROVIDER } : {}),
        profile,
      }),
      arms: control,
      model,
      rounds,
      n,
      ...(ids ? { ids } : {}),
      concurrency,
      ...(adapter.output ? { output: adapter.output } : {}),
      infraRetries,
    })

  console.error(`[arm: WITHOUT ${coordinate.name}] (frozen base) running…`)
  const without = await run(baseProfile)
  console.error(` without resolved: ${without.arms[0]?.resolved ?? 0}/${without.n}\n`)

  console.error(`[arm: WITH ${coordinate.name}] running…`)
  const withC = await run(withProfile)
  console.error(` with resolved: ${withC.arms[0]?.resolved ?? 0}/${withC.n}\n`)

  const a = without.arms[0]?.resolved ?? 0
  const b = withC.arms[0]?.resolved ?? 0
  const pct = (x: number, nn: number) => (nn > 0 ? `${((x / nn) * 100).toFixed(1)}%` : 'n/a')
  console.error(`${'='.repeat(72)}\n${coordinate.name.toUpperCase()} COORDINATE (sandboxed ${backendType} ${agentTarget}, ${adapter.name}):`)
  console.error(` WITHOUT (base): ${a}/${without.n} (${pct(a, without.n)})`)
  console.error(` WITH : ${b}/${withC.n} (${pct(b, withC.n)})`)
  console.error(` delta : ${b - a > 0 ? '+' : ''}${b - a} instances`)
}

main().catch((e: unknown) => {
  console.error(`profile-coord-sandbox: ${e instanceof Error ? (e.stack ?? e.message) : String(e)}`)
  process.exit(1)
})
/**
 * The AgentProfile genome as independently-optimizable coordinates. Every field the harness lets
 * you define — prompt, skills, subagents, hooks, tools, mcp — is a coordinate: a way to inject a
 * set of named candidates into a profile while holding every OTHER field fixed. This is the one
 * abstraction behind "improve any part of the agent, freeze the rest, combine freely":
 *
 *   compose(baseProfile, selected) = { ...baseProfile, <field>: inject(selected) }
 *
 * Freeze a coordinate ⇒ never select it (it stays as-is in the base profile). Optimize one ⇒
 * vary its selection. Combine ⇒ compose several coordinates' composers in sequence. Because a
 * subagent is ITSELF an AgentProfile, the same coordinates apply recursively to any node in the
 * supervisor flow (root driver, worker, sub-worker) — you point a coordinate at that node's
 * base profile.
 */
import { readFileSync, readdirSync } from 'node:fs'
import { join } from 'node:path'
import type { AgentProfile } from '@tangle-network/sandbox'
import { defineInlineResource } from '@tangle-network/sandbox'

export interface ProfileCoordinate {
  /** Coordinate id (the COORDINATE= knob value). */
  readonly name: string
  /** Candidate names the optimizer screens (one per independently-testable unit). */
  candidates(): readonly string[]
  /** Inject the selected candidates into THIS field of the profile, holding all others fixed.
   *  Empty selection ⇒ the base profile unchanged (the frozen/baseline arm). */
  compose(base: AgentProfile, selected: readonly string[]): AgentProfile
}

/** Resolves `p` against this module's directory. */
const here = (p: string) => join(import.meta.dirname, p)

/**
 * Own-key lookup in a name → definition table. A plain `table[key]` also finds inherited
 * members such as `toString` or `constructor`, which would pass an "is it defined?" check.
 */
function lookup<T>(table: Record<string, T>, key: string): T | undefined {
  return Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined
}

// ── skills: SKILL.md packages materialized to disk (resources.skills) ───────────────
/**
 * Skills coordinate: one candidate per `*.md` file in `dir` (default: `SKILLS_DIR`, else
 * `coding-skills`, resolved next to this module). The directory is re-listed on every call.
 */
function skillsCoordinate(dir = here(process.env.SKILLS_DIR ?? 'coding-skills')): ProfileCoordinate {
  // Sort by filename first (as before), then strip the extension.
  const names = () =>
    readdirSync(dir)
      .filter((f) => f.endsWith('.md'))
      .sort()
      .map((f) => f.replace(/\.md$/, ''))
  return {
    name: 'skills',
    candidates: names,
    compose: (base, selected) => {
      if (!selected.length) return base
      // Only names that are actual candidates may be turned into a path under `dir`.
      const known = new Set(names())
      const refs = selected.map((n) => {
        if (!known.has(n)) throw new Error(`unknown skill ${n}`)
        return defineInlineResource(n, readFileSync(join(dir, `${n}.md`), 'utf8'))
      })
      return { ...base, resources: { ...base.resources, skills: [...(base.resources?.skills ?? []), ...refs] } }
    },
  }
}

// ── hooks: shell commands the harness fires on lifecycle events (enforced, not advisory) ──
const hookDefs: Record<string, { event: string; matcher?: string; command: string }> = {
  'lint-before-edit': { event: 'PreToolUse', matcher: 'Edit|Write', command: 'ruff check . 2>/dev/null || true' },
  'tests-after-edit': { event: 'PostToolUse', matcher: 'Edit|Write', command: 'python -m pytest -q 2>/dev/null | tail -5 || true' },
  'no-print-debugging': { event: 'PreToolUse', matcher: 'Edit|Write', command: 'true' },
}
/** Hooks coordinate: appends each selected hook to the list for its lifecycle event. */
function hooksCoordinate(): ProfileCoordinate {
  return {
    name: 'hooks',
    candidates: () => Object.keys(hookDefs),
    compose: (base, selected) => {
      if (!selected.length) return base
      const hooks: Record<string, unknown[]> = { ...(base.hooks ?? {}) }
      for (const n of selected) {
        const d = lookup(hookDefs, n)
        if (!d) throw new Error(`unknown hook ${n}`)
        hooks[d.event] = [...(hooks[d.event] ?? []), { command: d.command, ...(d.matcher ? { matcher: d.matcher } : {}) }]
      }
      return { ...base, hooks: hooks as AgentProfile['hooks'] }
    },
  }
}

// ── tools: enable named harness tools ───────────────────────────────────────────────
const toolCandidates: readonly string[] = ['webfetch', 'websearch', 'bash', 'edit', 'read', 'grep']
/** Tools coordinate: sets each selected tool name to `true` on top of the base's tool map. */
function toolsCoordinate(): ProfileCoordinate {
  return {
    name: 'tools',
    candidates: () => toolCandidates,
    compose: (base, selected) => {
      if (!selected.length) return base
      const tools: Record<string, boolean> = { ...(base.tools ?? {}) }
      for (const n of selected) tools[n] = true
      return { ...base, tools }
    },
  }
}

// ── prompt: extra instruction lines appended to the active system prompt ─────────────
const instructionDefs: Record<string, string> = {
  'be-surgical': 'Make the smallest change that satisfies the task; do not touch unrelated code.',
  'check-examples': 'Before finalizing, re-read the examples in the docstring and confirm your output matches them exactly.',
  'edge-cases': 'Enumerate boundary inputs (empty, zero, negative, max) and make sure your solution handles each.',
}
/** Prompt coordinate: appends the selected instruction lines after the base's instructions. */
function promptCoordinate(): ProfileCoordinate {
  return {
    name: 'prompt',
    candidates: () => Object.keys(instructionDefs),
    compose: (base, selected) => {
      if (!selected.length) return base
      // Reject unknown names instead of appending `undefined` to the instruction list.
      const extra = selected.map((n) => {
        const line = lookup(instructionDefs, n)
        if (line === undefined) throw new Error(`unknown instruction ${n}`)
        return line
      })
      const instructions = [...(base.prompt?.instructions ?? []), ...extra]
      return { ...base, prompt: { ...base.prompt, instructions } }
    },
  }
}

// ── subagents: helper agents the root can delegate to (each is itself a mini-profile) ──
const subagentDefs: Record<string, { description: string; prompt: string; tools?: Record<string, boolean> }> = {
  reviewer: { description: 'Reviews a proposed change for bugs before it is finalized.', prompt: 'You are a strict code reviewer. Find bugs, edge cases, and contract violations in the proposed change. Be concise.' },
  tester: { description: 'Writes and runs a focused test for the change.', prompt: 'You write the minimal test that would catch a regression in this change, run it, and report pass/fail.' },
}
/** Subagents coordinate: adds (or overwrites by name) the selected helper agents on the base. */
function subagentsCoordinate(): ProfileCoordinate {
  return {
    name: 'subagents',
    candidates: () => Object.keys(subagentDefs),
    compose: (base, selected) => {
      if (!selected.length) return base
      const subagents: Record<string, unknown> = { ...(base.subagents ?? {}) }
      for (const n of selected) {
        const def = lookup(subagentDefs, n)
        // Reject unknown names instead of storing `undefined` as a subagent.
        if (!def) throw new Error(`unknown subagent ${n}`)
        subagents[n] = def
      }
      return { ...base, subagents: subagents as AgentProfile['subagents'] }
    },
  }
}

const REGISTRY: Record<string, () => ProfileCoordinate> = {
  skills: () => skillsCoordinate(),
  hooks: hooksCoordinate,
  tools: toolsCoordinate,
  prompt: promptCoordinate,
  subagents: subagentsCoordinate,
}

/**
 * Builds the coordinate registered under `name`.
 * @throws when `name` is not a registered coordinate; the message lists the known names.
 */
export function getCoordinate(name: string): ProfileCoordinate {
  const make = lookup(REGISTRY, name)
  if (!make) throw new Error(`unknown coordinate ${name} (have: ${Object.keys(REGISTRY).join(', ')})`)
  return make()
}

/** Names of every registered coordinate, in registration order. */
export const coordinateNames = (): string[] => Object.keys(REGISTRY)
/**
 * Proof that a SKILL.md from an AgentProfile actually lands on disk inside the sandbox where
 * the coding harness (opencode) loads it — the "are we on the right surface" check before any
 * benchmark. Creates one opencode box with resources.skills=[reproduce-first], then asks the
 * in-box agent (streamPrompt) to list the skill directories and report the skills it can see.
 *
 * Run: dotenvx run -f ~/company/devops/secrets/.env.keys -- npx tsx src/skill-sandbox-smoke.mts
 */
import { Sandbox, defineInlineResource } from '@tangle-network/sandbox'

/** Returns env var `k`; throws when it is unset or empty. */
const must = (k: string): string => {
  const v = process.env[k]
  if (!v) throw new Error(`env ${k} required`)
  return v
}

// The single skill under test, as the full SKILL.md text (front matter + body).
const skillMd = [
  '---',
  'name: reproduce-first',
  'description: Reproduce the failing test before changing any code.',
  '---',
  '',
  'When fixing a bug: run the failing test FIRST to observe the real error, make the smallest',
  'change that turns it green, then re-run to confirm.',
].join('\n')

/**
 * Creates the box, waits for it to run, streams one prompt to the in-box agent, and exits with
 * 0 (skill name and `SKILL.md` both appear in the stream) or 2 (they do not). Any thrown error
 * is reported by the entry-point handler below with exit code 1.
 *
 * The box is deleted in a `finally`, so a failed `waitFor`, a stream error, or the 220 s abort
 * no longer leaves it behind.
 */
async function main(): Promise<void> {
  const client = new Sandbox({
    baseUrl: process.env.SANDBOX_BASE_URL ?? 'https://sandbox.tangle.tools',
    apiKey: must('TANGLE_API_KEY'),
    timeoutMs: 600_000,
  } as never)

  console.error('[smoke] creating opencode box with resources.skills=[reproduce-first]…')
  const box: Record<string, (...args: never[]) => unknown> & { id?: string } = (await client.create({
    backend: {
      type: 'opencode',
      model: {
        provider: process.env.WORKER_PROVIDER ?? 'openai',
        model: process.env.WORKER_MODEL ?? 'gpt-4.1',
        baseUrl: process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1',
      },
      profile: { name: 'skill-smoke', resources: { skills: [defineInlineResource('reproduce-first', skillMd)] } },
    },
  } as never)) as never

  let out = ''
  try {
    console.error('[smoke] box id:', box.id, '— waiting for running…')
    await box.waitFor('running' as never, { timeoutMs: 180_000 } as never)
    console.error('[smoke] box running; asking the IN-BOX agent to report its skills (streamPrompt/SSE — the bench path)…')

    const prompt =
      'Run this exact shell command and paste its full output verbatim:\n' +
      '`ls -la ~/.claude/skills ~/.config/opencode/skills 2>/dev/null; echo "--FIND--"; find / -name SKILL.md 2>/dev/null | head -20; echo DONE`\n' +
      'Then tell me the names of the skills available to you.'
    const ac = new AbortController()
    // Hard cap on the stream: abort after 220 s.
    const timer = setTimeout(() => ac.abort(), 220_000)
    try {
      for await (const ev of box.streamPrompt(prompt as never, { signal: ac.signal } as never) as AsyncGenerator<unknown>) {
        out += (typeof ev === 'string' ? ev : JSON.stringify(ev)) + '\n'
      }
    } finally {
      clearTimeout(timer)
    }
    console.error(`[smoke] stream chars: ${out.length}`)
    console.error('[smoke] tail of stream:\n' + out.slice(-1500))
  } finally {
    try {
      await box.delete?.()
    } catch {
      /* best-effort cleanup */
    }
  }

  // NOTE(review): the prompt itself contains "SKILL.md"; if stream events echo the prompt or the
  // shell command, the second test is satisfied without any file on disk — confirm against a
  // real event stream and tighten if so.
  const landed = /reproduce-first/i.test(out) && /SKILL\.md/i.test(out)
  console.error(`\n[smoke] VERDICT: ${landed ? 'PASS — skill materialized on disk in the box (right surface)' : 'FAIL — skill NOT found on disk; check the resources.skills path/shape'}`)
  // Exit only after cleanup: process.exit() does not wait for pending async work.
  process.exit(landed ? 0 : 2)
}

main().catch((e: unknown) => {
  console.error('[smoke] FAILED:', e instanceof Error ? (e.stack ?? e.message) : e)
  process.exit(1)
})
e.message) : e) + process.exit(1) +}) diff --git a/bench/src/skills-sandbox.mts b/bench/src/skills-sandbox.mts new file mode 100644 index 00000000..585ec8d9 --- /dev/null +++ b/bench/src/skills-sandbox.mts @@ -0,0 +1,105 @@ +/** + * The skills coordinate on the RIGHT surface: a sandboxed coding harness worker (opencode/ + * claude-code in a real box), with real SKILL.md skills materialized to disk via the + * AgentProfile (`resources.skills`), invoked by the agent the standard way. Measures the agent's + * task completion WITH the skills vs WITHOUT, paired by task — the honest skills-lever test that + * the router prompt-text experiment could not give. + * + * Skills are NOT pasted into a prompt; they are mounted as discoverable SKILL.md packages the + * harness loads itself (proven on disk by skill-sandbox-smoke.mts). Equal-k by construction: + * same backend, same model, same rounds — the only difference is whether the skills exist. + * + * BENCH=commit0 COMMIT0_FIXTURES=1 N=8 WORKER_MODEL=gpt-4.1 \ + * dotenvx run -f …/.env.keys -- tsx src/skills-sandbox.mts + */ +import { readFileSync, readdirSync } from 'node:fs' +import { dirname, join } from 'node:path' +import { fileURLToPath } from 'node:url' +import { type AgentProfileResourceRef, Sandbox, defineInlineResource } from '@tangle-network/sandbox' +import { ADAPTERS } from './adapters' +import { type Arm, randomArm, runExperiment, sandboxAgentRun } from './experiment' + +const must = (k: string): string => { + const v = process.env[k] + if (!v) throw new Error(`env ${k} is required`) + return v +} + +/** Load the coding skills as SKILL.md resource refs — name = filename, content = full file. 
/**
 * Reads env var `k` as a non-negative integer; `fallback` applies only when the variable is unset.
 * Throws on anything `Number()` cannot turn into a non-negative integer, so a typo such as
 * `N=abc` fails here instead of reaching the experiment as `NaN`.
 */
const envInt = (k: string, fallback: number): number => {
  const raw = process.env[k]
  if (raw === undefined) return fallback
  const v = Number(raw)
  if (!Number.isInteger(v) || v < 0) {
    throw new Error(`env ${k} must be a non-negative integer (got ${JSON.stringify(raw)})`)
  }
  return v
}

/**
 * Load the coding skills as SKILL.md resource refs — name = filename without `.md`,
 * content = the full file read as UTF-8. Files are taken in sorted filename order.
 *
 * @param dir directory to scan (non-recursive) for `*.md` files
 * @throws when `dir` contains no `.md` file
 */
function loadSkillResources(dir: string): AgentProfileResourceRef[] {
  const files = readdirSync(dir).filter((f) => f.endsWith('.md')).sort()
  if (files.length === 0) throw new Error(`no skills in ${dir}`)
  return files.map((f) => defineInlineResource(f.replace(/\.md$/, ''), readFileSync(join(dir, f), 'utf8')))
}

/**
 * Runs the bench twice with identical settings — first with a profile that has no skills, then
 * with the `coding-skills` directory mounted as `resources.skills` — and prints the resolved
 * counts and the delta to stderr.
 *
 * Env knobs read here: BENCH, WORKER_MODEL, WORKER_PROVIDER, ROUTER_BASE, TANGLE_API_KEY
 * (required), BACKEND, ROUNDS, N, CONCURRENCY, INFRA_RETRIES, IDS, SANDBOX_BASE_URL.
 *
 * @throws on an unknown BENCH, a missing TANGLE_API_KEY, a malformed numeric knob, or an empty
 *   skills directory.
 */
async function main(): Promise<void> {
  const make = ADAPTERS[process.env.BENCH ?? 'commit0']
  if (!make) throw new Error(`unknown BENCH=${process.env.BENCH} (have: ${Object.keys(ADAPTERS).join(', ')})`)
  const adapter = make()
  const model = process.env.WORKER_MODEL ?? 'gpt-4.1'
  const routerBaseUrl = process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1'
  const routerKey = must('TANGLE_API_KEY')
  const backendType = (process.env.BACKEND as 'opencode' | 'claude-code' | undefined) ?? 'opencode'
  const rounds = envInt('ROUNDS', 1)
  const n = envInt('N', 8)
  const concurrency = envInt('CONCURRENCY', 3)
  const infraRetries = envInt('INFRA_RETRIES', 2)
  // Tolerate "a, b" and trailing commas; an all-empty list means "no explicit ids".
  const idList = (process.env.IDS ?? '')
    .split(',')
    .map((id) => id.trim())
    .filter((id) => id.length > 0)
  const ids = idList.length > 0 ? idList : undefined

  const skillsDir = join(dirname(fileURLToPath(import.meta.url)), 'coding-skills')
  const skills = loadSkillResources(skillsDir)
  console.error(
    `=== SKILLS-ON-SANDBOX · bench=${adapter.name} · backend=${backendType} · model=${model} · n=${n} · rounds=${rounds} ===\n` +
      ` agent-under-test skills (materialized to disk in the box): ${skills.map((s) => (s.kind === 'inline' ? s.name : s.path)).join(', ')}\n`,
  )

  const client = new Sandbox({
    baseUrl: process.env.SANDBOX_BASE_URL ?? 'https://sandbox.tangle.tools',
    apiKey: routerKey,
    timeoutMs: 1_200_000,
  } as never)

  const control: [Arm, ...Arm[]] = [randomArm('solve')]
  const run = (withSkills: boolean) =>
    runExperiment({
      adapter,
      sandboxClient: client,
      // The ONE difference between the two arms: resources.skills present or absent.
      agentRun: sandboxAgentRun({
        model,
        routerBaseUrl,
        backendType,
        ...(process.env.WORKER_PROVIDER ? { provider: process.env.WORKER_PROVIDER } : {}),
        profile: withSkills ? { name: 'skills-worker', resources: { skills } } : { name: 'no-skills-worker' },
      }),
      arms: control,
      model,
      rounds,
      n,
      ...(ids ? { ids } : {}),
      concurrency,
      ...(adapter.output ? { output: adapter.output } : {}),
      infraRetries,
    })

  // Run WITHOUT first (baseline), then WITH, using the same n/ids settings for both arms.
  console.error('[arm: NO skills] running…')
  const without = await run(false)
  console.error(` no-skills resolved: ${without.arms[0]?.resolved ?? 0}/${without.n}\n`)

  console.error('[arm: WITH skills] running…')
  const withS = await run(true)
  console.error(` with-skills resolved: ${withS.arms[0]?.resolved ?? 0}/${withS.n}\n`)

  const a = without.arms[0]?.resolved ?? 0
  const b = withS.arms[0]?.resolved ?? 0
  const pct = (x: number, nn: number) => (nn > 0 ? `${((x / nn) * 100).toFixed(1)}%` : 'n/a')
  console.error(`${'='.repeat(72)}\nSKILLS LEVER (sandboxed ${backendType} worker, ${adapter.name}):`)
  console.error(` no-skills : ${a}/${without.n} (${pct(a, without.n)})`)
  console.error(` with-skills: ${b}/${withS.n} (${pct(b, withS.n)})`)
  console.error(` delta : ${b - a > 0 ? '+' : ''}${b - a} instances (${pct(b, withS.n)} vs ${pct(a, without.n)})`)
}

main().catch((e: unknown) => {
  console.error(`skills-sandbox: ${e instanceof Error ? (e.stack ?? e.message) : String(e)}`)
  process.exit(1)
})