From 391cfffd6c1b9cf7b009ee0b9daacd0bdc96243c Mon Sep 17 00:00:00 2001
From: Drew Stone <hello@tangle.tools>
Date: Sun, 14 Jun 2026 07:17:25 -0600
Subject: [PATCH] feat(bench): generalized AgentProfile-coordinate optimizer on
 the sandbox surface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Optimize ANY genome coordinate (skills/hooks/tools/prompt/subagents — mcp same shape)
of a real sandboxed harness worker, holding the rest of the profile fixed. Each
coordinate is a compose(profile, selected) that injects into its own AgentProfile
field; freeze = don't select it, combine = fold a winner into the base.

- profile-coordinates.ts: the coordinate registry (one composer per genome field).
- profile-coord-sandbox.mts: COORDINATE= runner, with-vs-without on the sandboxed
  worker, deterministic-judge bench. AGENT=worker wired; driver is a marked seam
  (same compose — a driver/worker/subagent are all AgentProfiles).
- skill-sandbox-smoke.mts: proves a SKILL.md materializes to disk in the box
  (resources.skills → ~/.claude/skills/<id>/SKILL.md, verified by the in-box agent).
- coding-skills/ + eops-skills/: real agent-under-test skills (not prompt text).

Runs the skills lever on the sandbox surface (EOPS = banded agentic judge; HumanEval
= deterministic checker).
---
 bench/src/coding-skills/minimal-diff.md       |   9 ++
 bench/src/coding-skills/read-before-edit.md   |   9 ++
 bench/src/coding-skills/reproduce-first.md    |  10 ++
 .../src/coding-skills/run-tests-after-edit.md |   9 ++
 bench/src/coding-skills/trace-the-failure.md  |   9 ++
 .../src/eops-skills/address-every-subtask.md  |   5 +
 bench/src/eops-skills/exact-tools-and-args.md |   5 +
 .../src/eops-skills/full-sequence-to-goal.md  |   5 +
 bench/src/eops-skills/ground-every-value.md   |   5 +
 bench/src/eops-skills/honor-the-policies.md   |   5 +
 bench/src/profile-coord-sandbox.mts           | 111 +++++++++++++++
 bench/src/profile-coordinates.ts              | 134 ++++++++++++++++++
 bench/src/skill-sandbox-smoke.mts             |  80 +++++++++++
 bench/src/skills-sandbox.mts                  | 105 ++++++++++++++
 14 files changed, 501 insertions(+)
 create mode 100644 bench/src/coding-skills/minimal-diff.md
 create mode 100644 bench/src/coding-skills/read-before-edit.md
 create mode 100644 bench/src/coding-skills/reproduce-first.md
 create mode 100644 bench/src/coding-skills/run-tests-after-edit.md
 create mode 100644 bench/src/coding-skills/trace-the-failure.md
 create mode 100644 bench/src/eops-skills/address-every-subtask.md
 create mode 100644 bench/src/eops-skills/exact-tools-and-args.md
 create mode 100644 bench/src/eops-skills/full-sequence-to-goal.md
 create mode 100644 bench/src/eops-skills/ground-every-value.md
 create mode 100644 bench/src/eops-skills/honor-the-policies.md
 create mode 100644 bench/src/profile-coord-sandbox.mts
 create mode 100644 bench/src/profile-coordinates.ts
 create mode 100644 bench/src/skill-sandbox-smoke.mts
 create mode 100644 bench/src/skills-sandbox.mts
diff --git a/bench/src/coding-skills/minimal-diff.md b/bench/src/coding-skills/minimal-diff.md
new file mode 100644
index 00000000..65b0cf90
--- /dev/null
+++ b/bench/src/coding-skills/minimal-diff.md
@@ -0,0 +1,9 @@
+---
+name: minimal-diff
+description: Make the smallest change that satisfies the task; do not touch unrelated code.
+---
+Keep the diff minimal:
+1. Change only what the task requires; leave unrelated code, formatting, and files alone.
+2. Do not refactor, rename, or "clean up" beyond the ask — each extra change is a chance to break a check.
+3. Prefer the local, surgical fix over a broad rewrite.
+The grader is watching the whole repo state; unrequested changes are pure downside.
diff --git a/bench/src/coding-skills/read-before-edit.md b/bench/src/coding-skills/read-before-edit.md
new file mode 100644
index 00000000..ee29b9a3
--- /dev/null
+++ b/bench/src/coding-skills/read-before-edit.md
@@ -0,0 +1,9 @@
+---
+name: read-before-edit
+description: Read the target file and the code that calls it before editing, so a change doesn't break callers.
+---
+Before editing a function or module:
+1. Read the full file you're about to change, not just the lines near the edit.
+2. Find and read its callers (grep for the symbol) to learn the contract you must preserve.
+3. Match the surrounding style and signatures; keep the change consistent with how the code already works.
+Editing blind to callers is how a local fix becomes a regression elsewhere.
diff --git a/bench/src/coding-skills/reproduce-first.md b/bench/src/coding-skills/reproduce-first.md
new file mode 100644
index 00000000..2cbf7e8a
--- /dev/null
+++ b/bench/src/coding-skills/reproduce-first.md
@@ -0,0 +1,10 @@
+---
+name: reproduce-first
+description: Before changing any code to fix a bug or failing test, run the failing test/command first to observe the real error.
+---
+When the task is to fix a bug or make a failing test pass:
+1. Run the exact failing test or command FIRST and read the actual error/traceback.
+2. Do not guess the cause from the description — confirm it from the real output.
+3. Only then make the smallest change that addresses the observed failure.
+4. Re-run the same test to confirm it now passes before moving on.
+An assumed cause is the most common reason a fix doesn't work.
diff --git a/bench/src/coding-skills/run-tests-after-edit.md b/bench/src/coding-skills/run-tests-after-edit.md
new file mode 100644
index 00000000..d97d3112
--- /dev/null
+++ b/bench/src/coding-skills/run-tests-after-edit.md
@@ -0,0 +1,9 @@
+---
+name: run-tests-after-edit
+description: After each code change, run the relevant tests and read the result before declaring the step done.
+---
+After every change:
+1. Run the narrowest test that covers what you changed (then the broader suite if time allows).
+2. Read the output — only a passing exit code is proof; your expectation is not.
+3. If it fails, treat the new error as the next problem to reproduce and fix; don't pile on more edits blind.
+Never report a step finished on a change you have not actually run.
diff --git a/bench/src/coding-skills/trace-the-failure.md b/bench/src/coding-skills/trace-the-failure.md
new file mode 100644
index 00000000..3363a0c0
--- /dev/null
+++ b/bench/src/coding-skills/trace-the-failure.md
@@ -0,0 +1,9 @@
+---
+name: trace-the-failure
+description: When a test or program fails, read the traceback from the top error to the deepest frame in your own code, and fix at the root.
+---
+On a failure with a stack trace:
+1. Read the actual exception type and message first.
+2. Walk the frames to the DEEPEST one inside the code under test — that's usually where the root cause is, not the top frame.
+3. Inspect the values at that frame (add a print/log if needed) before editing.
+4. Fix the root cause, not the symptom that surfaced higher up.
diff --git a/bench/src/eops-skills/address-every-subtask.md b/bench/src/eops-skills/address-every-subtask.md
new file mode 100644
index 00000000..2b64e2df
--- /dev/null
+++ b/bench/src/eops-skills/address-every-subtask.md
@@ -0,0 +1,5 @@
+---
+name: address-every-subtask
+description: Decompose the request into every distinct sub-task and plan a tool call for each — partial completion is the dominant failure.
+---
+Before planning calls, list every distinct change the request implies (each user, each ticket, each field). Plan tool calls that complete ALL of them. Many requests bundle several independent actions; finishing only the first is the most common way to fail the final-state check.
diff --git a/bench/src/eops-skills/exact-tools-and-args.md b/bench/src/eops-skills/exact-tools-and-args.md
new file mode 100644
index 00000000..74663542
--- /dev/null
+++ b/bench/src/eops-skills/exact-tools-and-args.md
@@ -0,0 +1,5 @@
+---
+name: exact-tools-and-args
+description: Use only the listed tools, with their exact names and argument shapes.
+---
+Call ONLY the tools listed as available, by their exact names. For each call, provide every required argument with the correct key names and value types the tool expects. A misspelled tool name or a missing/extra argument makes the call a no-op in the judge and the final state will be wrong.
diff --git a/bench/src/eops-skills/full-sequence-to-goal.md b/bench/src/eops-skills/full-sequence-to-goal.md
new file mode 100644
index 00000000..00c59a98
--- /dev/null
+++ b/bench/src/eops-skills/full-sequence-to-goal.md
@@ -0,0 +1,5 @@
+---
+name: full-sequence-to-goal
+description: Plan the COMPLETE ordered sequence that brings the database to the required final state.
+---
+Think in terms of the required FINAL state, then plan the full ordered sequence of calls that gets there from the seeded start — including any reads needed to ground values, and in an order that respects dependencies (create before reference, set status before close). Do not stop at the first action.
diff --git a/bench/src/eops-skills/ground-every-value.md b/bench/src/eops-skills/ground-every-value.md
new file mode 100644
index 00000000..48317bf9
--- /dev/null
+++ b/bench/src/eops-skills/ground-every-value.md
@@ -0,0 +1,5 @@
+---
+name: ground-every-value
+description: Never invent a field value, id, or relationship — derive each from what the task gives you.
+---
+Do not guess ids, names, statuses, or relationships. Use the exact values the task supplies; when a value must be looked up, plan the read call first and use its result. A hallucinated field value is a leading cause of a silently-wrong final state.
diff --git a/bench/src/eops-skills/honor-the-policies.md b/bench/src/eops-skills/honor-the-policies.md
new file mode 100644
index 00000000..b318e75b
--- /dev/null
+++ b/bench/src/eops-skills/honor-the-policies.md
@@ -0,0 +1,5 @@
+---
+name: honor-the-policies
+description: Re-read the role's policies and constraints before planning, and make every call comply.
+---
+The role description contains policies (who may do what, required ordering, forbidden actions). Re-read them, then ensure every planned tool call complies — correct assignee, correct status transitions, no skipped approval step. A plan that reaches the data goal but violates a stated policy still fails.
diff --git a/bench/src/profile-coord-sandbox.mts b/bench/src/profile-coord-sandbox.mts
new file mode 100644
index 00000000..6ac856a2
--- /dev/null
+++ b/bench/src/profile-coord-sandbox.mts
@@ -0,0 +1,111 @@
+/**
+ * Generalized AgentProfile-coordinate optimizer on the sandbox surface. ONE runner for EVERY
+ * coordinate of the genome — skills, hooks, tools, prompt, subagents (mcp: same shape, not yet registered) — over a real sandboxed
+ * harness agent. It measures the agent's task success WITH the coordinate's candidates injected
+ * vs WITHOUT (the frozen base profile), paired by task, on a deterministic-judge coding bench.
+ *
+ * Hold any coordinate fixed = don't select it (COORDINATE picks the one varied; everything else
+ * stays in the base profile). Combine = run several, or extend the base with a prior winner.
+ *
+ * Applies to ANY agent in the supervisor flow: AGENT=worker injects the profile into the
+ * sandboxed worker (wired); AGENT=driver injects it into the driver/steer agent (same compose,
+ * the seam is marked below) — because a driver, a worker, and a subagent are all AgentProfiles.
+ *
+ *   COORDINATE=skills BENCH=humaneval N=8 WORKER_MODEL=gpt-4.1 \
+ *     dotenvx run -f …/.env.keys -- tsx src/profile-coord-sandbox.mts
+ */
+import type { AgentProfile } from '@tangle-network/sandbox'
+import { Sandbox } from '@tangle-network/sandbox'
+import { ADAPTERS } from './adapters'
+import { type Arm, randomArm, runExperiment, sandboxAgentRun } from './experiment'
+import { getCoordinate } from './profile-coordinates'
+
+const must = (k: string): string => {
+  const v = process.env[k]
+  if (!v) throw new Error(`env ${k} is required`)
+  return v
+}
+
+async function main(): Promise<void> {
+  const coordinate = getCoordinate(process.env.COORDINATE ?? 'skills')
+  const make = ADAPTERS[process.env.BENCH ?? 'humaneval']
+  if (!make) throw new Error(`unknown BENCH=${process.env.BENCH} (have: ${Object.keys(ADAPTERS).join(', ')})`)
+  const adapter = make()
+  const model = process.env.WORKER_MODEL ?? 'gpt-4.1'
+  const routerBaseUrl = process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1'
+  const routerKey = must('TANGLE_API_KEY')
+  const backendType = (process.env.BACKEND as 'opencode' | 'claude-code' | undefined) ?? 'opencode'
+  const agentTarget = process.env.AGENT ?? 'worker' // worker (wired) | driver (seam below)
+  const rounds = Number(process.env.ROUNDS ?? 1)
+  const n = Number(process.env.N ?? 8)
+  const concurrency = Number(process.env.CONCURRENCY ?? 3)
+  const ids = process.env.IDS ? process.env.IDS.split(',') : undefined
+
+  // The frozen base genome. Everything NOT under optimization lives here, untouched. Extend it
+  // (PROFILE_JSON) to carry a prior winner from another coordinate — that is how coordinates
+  // combine: each run freezes the others by leaving them in the base.
+  const baseProfile: AgentProfile = {
+    name: `${coordinate.name}-base`,
+    ...(process.env.PROFILE_JSON ? (JSON.parse(process.env.PROFILE_JSON) as AgentProfile) : {}),
+  }
+  const candidates = coordinate.candidates()
+  const withProfile = coordinate.compose(baseProfile, candidates)
+
+  console.error(
+    `=== PROFILE-COORD · coordinate=${coordinate.name} · agent=${agentTarget} · bench=${adapter.name} · ` +
+      `backend=${backendType} · model=${model} · n=${n} ===\n` +
+      `  candidates injected (held against the frozen base): ${candidates.join(', ')}\n`,
+  )
+  if (agentTarget !== 'worker') {
+    throw new Error(`AGENT=${agentTarget} not yet wired — the compose is identical, but routing a profile into the driver/steer agent is the next seam (createExecutor backend for the driver). Run AGENT=worker.`)
+  }
+
+  const client = new Sandbox({
+    baseUrl: process.env.SANDBOX_BASE_URL ?? 'https://sandbox.tangle.tools',
+    apiKey: routerKey,
+    timeoutMs: 1_200_000,
+  } as never)
+
+  const control: [Arm, ...Arm[]] = [randomArm('solve')]
+  const run = (profile: AgentProfile) =>
+    runExperiment({
+      adapter,
+      sandboxClient: client,
+      agentRun: sandboxAgentRun({
+        model,
+        routerBaseUrl,
+        backendType,
+        ...(process.env.WORKER_PROVIDER ? { provider: process.env.WORKER_PROVIDER } : {}),
+        profile,
+      }),
+      arms: control,
+      model,
+      rounds,
+      n,
+      ...(ids ? { ids } : {}),
+      concurrency,
+      ...(adapter.output ? { output: adapter.output } : {}),
+      infraRetries: Number(process.env.INFRA_RETRIES ?? 2),
+    })
+
+  console.error(`[arm: WITHOUT ${coordinate.name}] (frozen base) running…`)
+  const without = await run(baseProfile)
+  console.error(`  without resolved: ${without.arms[0]?.resolved ?? 0}/${without.n}\n`)
+
+  console.error(`[arm: WITH ${coordinate.name}] running…`)
+  const withC = await run(withProfile)
+  console.error(`  with resolved: ${withC.arms[0]?.resolved ?? 0}/${withC.n}\n`)
+
+  const a = without.arms[0]?.resolved ?? 0
+  const b = withC.arms[0]?.resolved ?? 0
+  const pct = (x: number, nn: number) => (nn > 0 ? `${((x / nn) * 100).toFixed(1)}%` : 'n/a')
+  console.error(`${'='.repeat(72)}\n${coordinate.name.toUpperCase()} COORDINATE (sandboxed ${backendType} ${agentTarget}, ${adapter.name}):`)
+  console.error(`  WITHOUT (base): ${a}/${without.n} (${pct(a, without.n)})`)
+  console.error(`  WITH         : ${b}/${withC.n} (${pct(b, withC.n)})`)
+  console.error(`  delta        : ${b - a > 0 ? '+' : ''}${b - a} instances`)
+}
+
+main().catch((e) => {
+  console.error(`profile-coord-sandbox: ${e instanceof Error ? (e.stack ?? e.message) : String(e)}`)
+  process.exit(1)
+})
diff --git a/bench/src/profile-coordinates.ts b/bench/src/profile-coordinates.ts
new file mode 100644
index 00000000..7727cc8c
--- /dev/null
+++ b/bench/src/profile-coordinates.ts
@@ -0,0 +1,134 @@
+/**
+ * The AgentProfile genome as independently-optimizable coordinates. Every field the harness lets
+ * you define — prompt, skills, subagents, hooks, tools, mcp — is a coordinate: a way to inject a
+ * set of named candidates into a profile while holding every OTHER field fixed. This is the one
+ * abstraction behind "improve any part of the agent, freeze the rest, combine freely":
+ *
+ *   compose(baseProfile, selected) = { ...baseProfile, <thisField>: inject(selected) }
+ *
+ * Freeze a coordinate ⇒ never select it (it stays as-is in the base profile). Optimize one ⇒
+ * vary its selection. Combine ⇒ compose several coordinates' composers in sequence. Because a
+ * subagent is ITSELF an AgentProfile, the same coordinates apply recursively to any node in the
+ * supervisor flow (root driver, worker, sub-worker) — you point a coordinate at that node's
+ * base profile.
+ */
+import { readFileSync, readdirSync } from 'node:fs'
+import { join } from 'node:path'
+import type { AgentProfile } from '@tangle-network/sandbox'
+import { defineInlineResource } from '@tangle-network/sandbox'
+
+export interface ProfileCoordinate {
+  /** Coordinate id (the COORDINATE= knob value). */
+  readonly name: string
+  /** Candidate names the optimizer screens (one per independently-testable unit). */
+  candidates(): readonly string[]
+  /** Inject the selected candidates into THIS field of the profile, holding all others fixed.
+   *  Empty selection ⇒ the base profile unchanged (the frozen/baseline arm). */
+  compose(base: AgentProfile, selected: readonly string[]): AgentProfile
+}
+
+const here = (p: string) => join(import.meta.dirname, p)
+
+// ── skills: SKILL.md packages materialized to disk (resources.skills) ───────────────
+function skillsCoordinate(dir = here(process.env.SKILLS_DIR ?? 'coding-skills')): ProfileCoordinate {
+  const files = () => readdirSync(dir).filter((f) => f.endsWith('.md')).sort()
+  return {
+    name: 'skills',
+    candidates: () => files().map((f) => f.replace(/\.md$/, '')),
+    compose: (base, selected) => {
+      if (!selected.length) return base
+      const refs = selected.map((n) => defineInlineResource(n, readFileSync(join(dir, `${n}.md`), 'utf8')))
+      return { ...base, resources: { ...base.resources, skills: [...(base.resources?.skills ?? []), ...refs] } }
+    },
+  }
+}
+
+// ── hooks: shell commands the harness fires on lifecycle events (enforced, not advisory). NOTE(review): 'no-print-debugging' runs `true`, a no-op placeholder ──
+const hookDefs: Record<string, { event: string; command: string; matcher?: string }> = {
+  'lint-before-edit': { event: 'PreToolUse', matcher: 'Edit|Write', command: 'ruff check . 2>/dev/null || true' },
+  'tests-after-edit': { event: 'PostToolUse', matcher: 'Edit|Write', command: 'python -m pytest -q 2>/dev/null | tail -5 || true' },
+  'no-print-debugging': { event: 'PreToolUse', matcher: 'Edit|Write', command: 'true' },
+}
+function hooksCoordinate(): ProfileCoordinate {
+  return {
+    name: 'hooks',
+    candidates: () => Object.keys(hookDefs),
+    compose: (base, selected) => {
+      if (!selected.length) return base
+      const hooks: Record<string, { command: string; matcher?: string }[]> = { ...(base.hooks ?? {}) }
+      for (const n of selected) {
+        const d = hookDefs[n]
+        if (!d) throw new Error(`unknown hook ${n}`)
+        hooks[d.event] = [...(hooks[d.event] ?? []), { command: d.command, ...(d.matcher ? { matcher: d.matcher } : {}) }]
+      }
+      return { ...base, hooks: hooks as AgentProfile['hooks'] }
+    },
+  }
+}
+
+// ── tools: enable/disable named harness tools ───────────────────────────────────────
+const toolCandidates = ['webfetch', 'websearch', 'bash', 'edit', 'read', 'grep']
+function toolsCoordinate(): ProfileCoordinate {
+  return {
+    name: 'tools',
+    candidates: () => toolCandidates,
+    compose: (base, selected) => {
+      if (!selected.length) return base
+      const tools: Record<string, boolean> = { ...(base.tools ?? {}) }
+      for (const n of selected) tools[n] = true
+      return { ...base, tools }
+    },
+  }
+}
+
+// ── prompt: extra instruction lines appended to the active system prompt ─────────────
+const instructionDefs: Record<string, string> = {
+  'be-surgical': 'Make the smallest change that satisfies the task; do not touch unrelated code.',
+  'check-examples': 'Before finalizing, re-read the examples in the docstring and confirm your output matches them exactly.',
+  'edge-cases': 'Enumerate boundary inputs (empty, zero, negative, max) and make sure your solution handles each.',
+}
+function promptCoordinate(): ProfileCoordinate {
+  // Unknown names throw (like the hooks coordinate) instead of appending `undefined`; an empty selection returns `base` as-is.
+  const resolve = (n: string): string => {
+    const text = instructionDefs[n]
+    if (text === undefined) throw new Error(`unknown instruction ${n}`)
+    return text
+  }
+  const compose: ProfileCoordinate['compose'] = (base, selected) =>
+    selected.length ? { ...base, prompt: { ...base.prompt, instructions: [...(base.prompt?.instructions ?? []), ...selected.map(resolve)] } } : base
+  return { name: 'prompt', candidates: () => Object.keys(instructionDefs), compose }
+}
+
+// ── subagents: helper agents the root can delegate to (each is itself a mini-profile) ──
+const subagentDefs: Record<string, { description: string; prompt: string; tools?: Record<string, boolean> }> = {
+  reviewer: { description: 'Reviews a proposed change for bugs before it is finalized.', prompt: 'You are a strict code reviewer. Find bugs, edge cases, and contract violations in the proposed change. Be concise.' },
+  tester: { description: 'Writes and runs a focused test for the change.', prompt: 'You write the minimal test that would catch a regression in this change, run it, and report pass/fail.' },
+}
+function subagentsCoordinate(): ProfileCoordinate {
+  return {
+    name: 'subagents',
+    candidates: () => Object.keys(subagentDefs),
+    compose: (base, selected) => {
+      if (!selected.length) return base
+      const missing = selected.find((n) => !subagentDefs[n]) // unknown names throw (like the hooks coordinate), never register `undefined`
+      if (missing !== undefined) throw new Error(`unknown subagent ${missing}`)
+      return { ...base, subagents: { ...(base.subagents ?? {}), ...Object.fromEntries(selected.map((n) => [n, subagentDefs[n]])) } as AgentProfile['subagents'] }
+    },
+  }
+}
+
+const REGISTRY: Record<string, () => ProfileCoordinate> = {
+  skills: () => skillsCoordinate(),
+  hooks: hooksCoordinate,
+  tools: toolsCoordinate,
+  prompt: promptCoordinate,
+  subagents: subagentsCoordinate,
+}
+
+export function getCoordinate(name: string): ProfileCoordinate {
+  const make = REGISTRY[name]
+  if (!make) throw new Error(`unknown coordinate ${name} (have: ${Object.keys(REGISTRY).join(', ')})`)
+  return make()
+}
+
+export const coordinateNames = (): string[] => Object.keys(REGISTRY)
diff --git a/bench/src/skill-sandbox-smoke.mts b/bench/src/skill-sandbox-smoke.mts
new file mode 100644
index 00000000..159ae5b9
--- /dev/null
+++ b/bench/src/skill-sandbox-smoke.mts
@@ -0,0 +1,80 @@
+/**
+ * Proof that a SKILL.md from an AgentProfile actually lands on disk inside the sandbox where
+ * the coding harness (opencode) loads it — the "are we on the right surface" check before any
+ * benchmark. Creates one opencode box with resources.skills=[reproduce-first], then prompts the
+ * in-box agent (one model call) to list the SKILL.md files on disk; skills materialize before boot.
+ *
+ * Run: dotenvx run -f ~/company/devops/secrets/.env.keys -- npx tsx src/skill-sandbox-smoke.mts
+ */
+import { Sandbox, defineInlineResource } from '@tangle-network/sandbox'
+
+const must = (k: string): string => {
+  const v = process.env[k]
+  if (!v) throw new Error(`env ${k} required`)
+  return v
+}
+
+const skillMd = [
+  '---',
+  'name: reproduce-first',
+  'description: Reproduce the failing test before changing any code.',
+  '---',
+  '',
+  'When fixing a bug: run the failing test FIRST to observe the real error, make the smallest',
+  'change that turns it green, then re-run to confirm.',
+].join('\n')
+
+/** Creates one skills-bearing box, has the in-box agent list its SKILL.md files, prints a verdict, and exits 0 (PASS) or 2 (FAIL). */
+async function main(): Promise<void> {
+  const client = new Sandbox({
+    baseUrl: process.env.SANDBOX_BASE_URL ?? 'https://sandbox.tangle.tools',
+    apiKey: must('TANGLE_API_KEY'),
+    timeoutMs: 600_000,
+  } as never)
+
+  console.error('[smoke] creating opencode box with resources.skills=[reproduce-first]…')
+  const box: Record<string, (...a: never[]) => unknown> & { id?: string } = (await client.create({
+    backend: {
+      type: 'opencode',
+      model: {
+        provider: process.env.WORKER_PROVIDER ?? 'openai',
+        model: process.env.WORKER_MODEL ?? 'gpt-4.1',
+        baseUrl: process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1',
+      },
+      profile: { name: 'skill-smoke', resources: { skills: [defineInlineResource('reproduce-first', skillMd)] } },
+    },
+  } as never)) as never
+  let out = ''
+  try {
+    console.error('[smoke] box id:', box.id, '— waiting for running…')
+    await box.waitFor('running' as never, { timeoutMs: 180_000 } as never)
+    console.error('[smoke] box running; asking the IN-BOX agent to report its skills (streamPrompt/SSE — the bench path)…')
+    const prompt =
+      'Run this exact shell command and paste its full output verbatim:\n' +
+      '`ls -la ~/.claude/skills ~/.config/opencode/skills 2>/dev/null; echo "--FIND--"; find / -name SKILL.md 2>/dev/null | head -20; echo DONE`\n' +
+      'Then tell me the names of the skills available to you.'
+    const ac = new AbortController()
+    const timer = setTimeout(() => ac.abort(), 220_000) // hard cap on how long the stream may run
+    try {
+      for await (const ev of box.streamPrompt(prompt as never, { signal: ac.signal } as never) as AsyncGenerator<unknown>) {
+        out += (typeof ev === 'string' ? ev : JSON.stringify(ev)) + '\n'
+      }
+    } finally {
+      clearTimeout(timer)
+    }
+  } finally {
+    // Always release the box, including when waitFor/streamPrompt throws or the abort timer fires.
+    try { await box.delete?.() } catch { /* best-effort cleanup */ }
+  }
+  console.error(`[smoke] stream chars: ${out.length}`)
+  console.error('[smoke] tail of stream:\n' + out.slice(-1500))
+
+  const landed = /reproduce-first/i.test(out) && /SKILL\.md/i.test(out)
+  console.error(`\n[smoke] VERDICT: ${landed ? 'PASS — skill materialized on disk in the box (right surface)' : 'FAIL — skill NOT found on disk; check the resources.skills path/shape'}`)
+  process.exit(landed ? 0 : 2)
+}
+
+main().catch((e) => {
+  console.error('[smoke] FAILED:', e instanceof Error ? (e.stack ?? e.message) : e)
+  process.exit(1)
+})
diff --git a/bench/src/skills-sandbox.mts b/bench/src/skills-sandbox.mts
new file mode 100644
index 00000000..585ec8d9
--- /dev/null
+++ b/bench/src/skills-sandbox.mts
@@ -0,0 +1,105 @@
+/**
+ * The skills coordinate on the RIGHT surface: a sandboxed coding harness worker (opencode/
+ * claude-code in a real box), with real SKILL.md skills materialized to disk via the
+ * AgentProfile (`resources.skills`), invoked by the agent the standard way. Measures the agent's
+ * task completion WITH the skills vs WITHOUT, paired by task — the honest skills-lever test that
+ * the router prompt-text experiment could not give.
+ *
+ * Skills are NOT pasted into a prompt; they are mounted as discoverable SKILL.md packages the
+ * harness loads itself (proven on disk by skill-sandbox-smoke.mts). Equal-k by construction:
+ * same backend, same model, same rounds — the only difference is whether the skills exist.
+ *
+ *   BENCH=commit0 COMMIT0_FIXTURES=1 N=8 WORKER_MODEL=gpt-4.1 \
+ *     dotenvx run -f …/.env.keys -- tsx src/skills-sandbox.mts
+ */
+import { readFileSync, readdirSync } from 'node:fs'
+import { dirname, join } from 'node:path'
+import { fileURLToPath } from 'node:url'
+import { type AgentProfileResourceRef, Sandbox, defineInlineResource } from '@tangle-network/sandbox'
+import { ADAPTERS } from './adapters'
+import { type Arm, randomArm, runExperiment, sandboxAgentRun } from './experiment'
+
+const must = (k: string): string => {
+  const v = process.env[k]
+  if (!v) throw new Error(`env ${k} is required`)
+  return v
+}
+
+/** Load the coding skills as SKILL.md resource refs — name = filename without `.md`, content = full file. */
+function loadSkillResources(dir: string): AgentProfileResourceRef[] {
+  const files = readdirSync(dir).filter((f) => f.endsWith('.md')).sort()
+  if (files.length === 0) throw new Error(`no skills in ${dir}`)
+  return files.map((f) => defineInlineResource(f.replace(/\.md$/, ''), readFileSync(join(dir, f), 'utf8')))
+}
+
+/** Runs the bench twice — first without skills, then with them — and prints both resolved counts and their delta. */
+async function main(): Promise<void> {
+  const make = ADAPTERS[process.env.BENCH ?? 'commit0']
+  if (!make) throw new Error(`unknown BENCH=${process.env.BENCH} (have: ${Object.keys(ADAPTERS).join(', ')})`)
+  const adapter = make()
+  const model = process.env.WORKER_MODEL ?? 'gpt-4.1'
+  const routerBaseUrl = process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1'
+  const routerKey = must('TANGLE_API_KEY')
+  const backendType = (process.env.BACKEND as 'opencode' | 'claude-code' | undefined) ?? 'opencode'
+  const rounds = Number(process.env.ROUNDS ?? 1)
+  const n = Number(process.env.N ?? 8)
+  const concurrency = Number(process.env.CONCURRENCY ?? 3)
+  const ids = process.env.IDS ? process.env.IDS.split(',') : undefined
+  // SKILLS_DIR (joined onto this file's directory) picks the skill set, as in profile-coordinates.ts; default: coding-skills.
+  const skillsDir = join(dirname(fileURLToPath(import.meta.url)), process.env.SKILLS_DIR ?? 'coding-skills')
+  const skills = loadSkillResources(skillsDir)
+  console.error(
+    `=== SKILLS-ON-SANDBOX · bench=${adapter.name} · backend=${backendType} · model=${model} · n=${n} · rounds=${rounds} ===\n` +
+      `  agent-under-test skills (materialized to disk in the box): ${skills.map((s) => (s.kind === 'inline' ? s.name : s.path)).join(', ')}\n`,
+  )
+
+  const client = new Sandbox({
+    baseUrl: process.env.SANDBOX_BASE_URL ?? 'https://sandbox.tangle.tools',
+    apiKey: routerKey,
+    timeoutMs: 1_200_000,
+  } as never)
+  const control: [Arm, ...Arm[]] = [randomArm('solve')]
+  const run = (withSkills: boolean) =>
+    runExperiment({
+      adapter,
+      sandboxClient: client,
+      // The ONE difference between the two arms: resources.skills present or absent.
+      agentRun: sandboxAgentRun({
+        model,
+        routerBaseUrl,
+        backendType,
+        ...(process.env.WORKER_PROVIDER ? { provider: process.env.WORKER_PROVIDER } : {}),
+        profile: withSkills ? { name: 'skills-worker', resources: { skills } } : { name: 'no-skills-worker' },
+      }),
+      arms: control,
+      model,
+      rounds,
+      n,
+      ...(ids ? { ids } : {}),
+      concurrency,
+      ...(adapter.output ? { output: adapter.output } : {}),
+      infraRetries: Number(process.env.INFRA_RETRIES ?? 2),
+    })
+
+  // Run WITHOUT first (baseline), then WITH, on the SAME task ids (paired).
+  console.error('[arm: NO skills] running…')
+  const without = await run(false)
+  console.error(`  no-skills resolved: ${without.arms[0]?.resolved ?? 0}/${without.n}\n`)
+
+  console.error('[arm: WITH skills] running…')
+  const withS = await run(true)
+  console.error(`  with-skills resolved: ${withS.arms[0]?.resolved ?? 0}/${withS.n}\n`)
+
+  const a = without.arms[0]?.resolved ?? 0
+  const b = withS.arms[0]?.resolved ?? 0
+  const pct = (x: number, nn: number) => (nn > 0 ? `${((x / nn) * 100).toFixed(1)}%` : 'n/a')
+  console.error(`${'='.repeat(72)}\nSKILLS LEVER (sandboxed ${backendType} worker, ${adapter.name}):`)
+  console.error(`  no-skills : ${a}/${without.n} (${pct(a, without.n)})`)
+  console.error(`  with-skills: ${b}/${withS.n} (${pct(b, withS.n)})`)
+  console.error(`  delta     : ${b - a > 0 ? '+' : ''}${b - a} instances (${pct(b, withS.n)} vs ${pct(a, without.n)})`)
+}
+
+main().catch((e) => {
+  console.error(`skills-sandbox: ${e instanceof Error ? (e.stack ?? e.message) : String(e)}`)
+  process.exit(1)
+})