tangle-network · drewstone · Jun 14, 2026 · Jun 14, 2026
diff --git a/bench/src/coding-skills/minimal-diff.md b/bench/src/coding-skills/minimal-diff.md
@@ -0,0 +1,9 @@
+---
+name: minimal-diff
+description: Make the smallest change that satisfies the task; do not touch unrelated code.
+---
+Keep the diff minimal:
+1. Change only what the task requires; leave unrelated code, formatting, and files alone.
+2. Do not refactor, rename, or "clean up" beyond the ask — each extra change is a chance to break a check.
+3. Prefer the local, surgical fix over a broad rewrite.
+The grader is watching the whole repo state; unrequested changes are pure downside.
diff --git a/bench/src/coding-skills/read-before-edit.md b/bench/src/coding-skills/read-before-edit.md
@@ -0,0 +1,9 @@
+---
+name: read-before-edit
+description: Read the target file and the code that calls it before editing, so a change doesn't break callers.
+---
+Before editing a function or module:
+1. Read the full file you're about to change, not just the lines near the edit.
+2. Find and read its callers (grep for the symbol) to learn the contract you must preserve.
+3. Match the surrounding style and signatures; keep the change consistent with how the code already works.
+Editing blind to callers is how a local fix becomes a regression elsewhere.
diff --git a/bench/src/coding-skills/reproduce-first.md b/bench/src/coding-skills/reproduce-first.md
@@ -0,0 +1,10 @@
+---
+name: reproduce-first
+description: Before changing any code to fix a bug or failing test, run the failing test/command first to observe the real error.
+---
+When the task is to fix a bug or make a failing test pass:
+1. Run the exact failing test or command FIRST and read the actual error/traceback.
+2. Do not guess the cause from the description — confirm it from the real output.
+3. Only then make the smallest change that addresses the observed failure.
+4. Re-run the same test to confirm it now passes before moving on.
+An assumed cause is the most common reason a fix doesn't work.
diff --git a/bench/src/coding-skills/run-tests-after-edit.md b/bench/src/coding-skills/run-tests-after-edit.md
@@ -0,0 +1,9 @@
+---
+name: run-tests-after-edit
+description: After each code change, run the relevant tests and read the result before declaring the step done.
+---
+After every change:
+1. Run the narrowest test that covers what you changed (then the broader suite if time allows).
+2. Read the output — only an observed passing run (exit code 0, with the tests you meant to run actually executed) is proof; your expectation that it works is not.
+3. If it fails, treat the new error as the next problem to reproduce and fix; don't pile on more edits blind.
+Never report a step finished on a change you have not actually run.
diff --git a/bench/src/coding-skills/trace-the-failure.md b/bench/src/coding-skills/trace-the-failure.md
@@ -0,0 +1,9 @@
+---
+name: trace-the-failure
+description: When a test or program fails, read the traceback from the top error to the deepest frame in your own code, and fix at the root.
+---
+On a failure with a stack trace:
+1. Read the actual exception type and message first.
+2. Walk the frames to the DEEPEST one inside the code under test — that's usually where the root cause is, not the outermost frame (the first one listed in a Python traceback).
+3. Inspect the values at that frame (add a print/log if needed) before editing.
+4. Fix the root cause, not the symptom that surfaced higher up.
diff --git a/bench/src/eops-skills/address-every-subtask.md b/bench/src/eops-skills/address-every-subtask.md
@@ -0,0 +1,5 @@
+---
+name: address-every-subtask
+description: Decompose the request into every distinct sub-task and plan a tool call for each — partial completion is the dominant failure.
+---
+Before planning calls, list every distinct change the request implies (each user, each ticket, each field). Plan tool calls that complete ALL of them. Many requests bundle several independent actions; finishing only the first is the most common way to fail the final-state check.
diff --git a/bench/src/eops-skills/exact-tools-and-args.md b/bench/src/eops-skills/exact-tools-and-args.md
@@ -0,0 +1,5 @@
+---
+name: exact-tools-and-args
+description: Use only the listed tools, with their exact names and argument shapes.
+---
+Call ONLY the tools listed as available, by their exact names. For each call, provide every required argument with the correct key names and value types the tool expects. A misspelled tool name or a missing/extra argument makes the call a no-op in the judge, so the final state will be wrong.
diff --git a/bench/src/eops-skills/full-sequence-to-goal.md b/bench/src/eops-skills/full-sequence-to-goal.md
@@ -0,0 +1,5 @@
+---
+name: full-sequence-to-goal
+description: Plan the COMPLETE ordered sequence that brings the database to the required final state.
+---
+Think in terms of the required FINAL state, then plan the full ordered sequence of calls that gets there from the seeded start — including any reads needed to ground values, and in an order that respects dependencies (create before reference, set status before close). Do not stop at the first action.
diff --git a/bench/src/eops-skills/ground-every-value.md b/bench/src/eops-skills/ground-every-value.md
@@ -0,0 +1,5 @@
+---
+name: ground-every-value
+description: Never invent a field value, id, or relationship — derive each from what the task gives you.
+---
+Do not guess ids, names, statuses, or relationships. Use the exact values the task supplies; when a value must be looked up, plan the read call first and use its result. A hallucinated field value is a leading cause of a silently-wrong final state.
diff --git a/bench/src/eops-skills/honor-the-policies.md b/bench/src/eops-skills/honor-the-policies.md
@@ -0,0 +1,5 @@
+---
+name: honor-the-policies
+description: Re-read the role's policies and constraints before planning, and make every call comply.
+---
+The role description contains policies (who may do what, required ordering, forbidden actions). Re-read them, then ensure every planned tool call complies — correct assignee, correct status transitions, no skipped approval step. A plan that reaches the data goal but violates a stated policy still fails.
diff --git a/bench/src/profile-coord-sandbox.mts b/bench/src/profile-coord-sandbox.mts
@@ -0,0 +1,111 @@
+/**
+ * Generalized AgentProfile-coordinate optimizer on the sandbox surface. ONE runner for EVERY
+ * registered coordinate of the genome — skills, hooks, tools, prompt, subagents (mcp is part of
+ * the design but has no entry in the coordinate registry yet) — over a real sandboxed harness
+ * agent. It measures the agent's task success WITH all of the coordinate's candidates injected
+ * vs WITHOUT (the frozen base profile), paired by task, on a deterministic-judge coding bench.
+ *
+ * Hold any coordinate fixed = don't select it (COORDINATE picks the one varied; everything else
+ * stays in the base profile). Combine = run several, or extend the base with a prior winner.
+ *
+ * Designed to apply to ANY agent in the supervisor flow, because a driver, a worker, and a
+ * subagent are all AgentProfiles. AGENT=worker injects the profile into the sandboxed worker
+ * (wired). AGENT=driver is meant to inject it into the driver/steer agent with the same compose,
+ * but is NOT wired yet: any AGENT value other than `worker` currently throws (seam marked below).
+ *
+ *   COORDINATE=skills BENCH=humaneval N=8 WORKER_MODEL=gpt-4.1 \
+ *     dotenvx run -f …/.env.keys -- tsx src/profile-coord-sandbox.mts
+ */
+import type { AgentProfile } from '@tangle-network/sandbox'
+import { Sandbox } from '@tangle-network/sandbox'
+import { ADAPTERS } from './adapters'
+import { type Arm, randomArm, runExperiment, sandboxAgentRun } from './experiment'
+import { getCoordinate } from './profile-coordinates'
+
+/**
+ * Reads a required environment variable.
+ * @param k Name of the environment variable.
+ * @returns The variable's value.
+ * @throws Error when the variable is unset or is the empty string.
+ */
+function must(k: string): string {
+  const value = process.env[k]
+  if (value) return value
+  throw new Error(`env ${k} is required`)
+}
+
+/**
+ * Reads an optional non-negative integer knob from the environment.
+ * An unset or blank variable yields `fallback`; any other value that is not a non-negative
+ * integer throws, so a typo cannot reach the experiment configuration as `NaN`.
+ * @param k Name of the environment variable.
+ * @param fallback Value used when the variable is unset or blank.
+ * @returns The parsed integer, or `fallback`.
+ * @throws Error when the variable is set but is not a non-negative integer.
+ */
+function envInt(k: string, fallback: number): number {
+  const raw = process.env[k]
+  if (raw === undefined || raw.trim() === '') return fallback
+  const parsed = Number(raw)
+  if (!Number.isInteger(parsed) || parsed < 0) {
+    throw new Error(`env ${k} must be a non-negative integer (got ${JSON.stringify(raw)})`)
+  }
+  return parsed
+}
+
+/**
+ * Reads the optional PROFILE_JSON override that extends the frozen base profile.
+ * @returns The parsed object, or `{}` when PROFILE_JSON is unset or empty.
+ * @throws Error when PROFILE_JSON is not valid JSON or is not a JSON object.
+ */
+function profileOverridesFromEnv(): Partial<AgentProfile> {
+  const raw = process.env.PROFILE_JSON
+  if (!raw) return {}
+  let parsed: unknown
+  try {
+    parsed = JSON.parse(raw)
+  } catch (e) {
+    throw new Error(`env PROFILE_JSON is not valid JSON: ${e instanceof Error ? e.message : String(e)}`)
+  }
+  // Spreading a string or an array would silently produce index keys in the profile.
+  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
+    throw new Error('env PROFILE_JSON must be a JSON object')
+  }
+  return parsed as Partial<AgentProfile>
+}
+
+/**
+ * Runs one coordinate experiment: the bench is executed once with the frozen base profile
+ * (WITHOUT arm) and once with every candidate of the selected coordinate composed into it
+ * (WITH arm), sequentially, and the resolved counts of both arms are printed to stderr.
+ *
+ * Environment knobs read here: COORDINATE, BENCH, WORKER_MODEL, ROUTER_BASE, TANGLE_API_KEY
+ * (required), BACKEND, AGENT, ROUNDS, N, CONCURRENCY, INFRA_RETRIES, IDS, PROFILE_JSON,
+ * SANDBOX_BASE_URL, WORKER_PROVIDER.
+ *
+ * @throws Error on an unknown coordinate or bench, a missing TANGLE_API_KEY, a malformed
+ *   numeric knob or PROFILE_JSON, or an AGENT value other than `worker`.
+ */
+async function main(): Promise<void> {
+  const coordinate = getCoordinate(process.env.COORDINATE ?? 'skills')
+  const benchName = process.env.BENCH ?? 'humaneval'
+  // Own-property lookup: an inherited key such as `constructor` must count as unknown.
+  const make = Object.prototype.hasOwnProperty.call(ADAPTERS, benchName) ? ADAPTERS[benchName] : undefined
+  if (!make) throw new Error(`unknown BENCH=${benchName} (have: ${Object.keys(ADAPTERS).join(', ')})`)
+  const adapter = make()
+  const model = process.env.WORKER_MODEL ?? 'gpt-4.1'
+  const routerBaseUrl = process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1'
+  const routerKey = must('TANGLE_API_KEY')
+  const backendType = (process.env.BACKEND as 'opencode' | 'claude-code' | undefined) ?? 'opencode'
+  const agentTarget = process.env.AGENT ?? 'worker' // worker (wired) | driver (seam below)
+  const rounds = envInt('ROUNDS', 1)
+  const n = envInt('N', 8)
+  const concurrency = envInt('CONCURRENCY', 3)
+  const infraRetries = envInt('INFRA_RETRIES', 2)
+  // Tolerate "a, b" and trailing commas; an IDS value with no usable entry counts as unset.
+  const idList = (process.env.IDS ?? '').split(',').map((s) => s.trim()).filter((s) => s.length > 0)
+  const ids = idList.length > 0 ? idList : undefined
+
+  // The frozen base genome. Everything NOT under optimization lives here, untouched. Extend it
+  // (PROFILE_JSON) to carry a prior winner from another coordinate — that is how coordinates
+  // combine: each run freezes the others by leaving them in the base.
+  // PROFILE_JSON is spread last, so a `name` inside it replaces the generated one.
+  const baseProfile: AgentProfile = {
+    name: `${coordinate.name}-base`,
+    ...profileOverridesFromEnv(),
+  }
+  const candidates = coordinate.candidates()
+  // The WITH arm carries ALL candidates of the coordinate at once.
+  const withProfile = coordinate.compose(baseProfile, candidates)
+
+  console.error(
+    `=== PROFILE-COORD · coordinate=${coordinate.name} · agent=${agentTarget} · bench=${adapter.name} · ` +
+      `backend=${backendType} · model=${model} · n=${n} ===\n` +
+      `  candidates injected (held against the frozen base): ${candidates.join(', ')}\n`,
+  )
+  if (agentTarget !== 'worker') {
+    throw new Error(`AGENT=${agentTarget} not yet wired — the compose is identical, but routing a profile into the driver/steer agent is the next seam (createExecutor backend for the driver). Run AGENT=worker.`)
+  }
+
+  // NOTE(review): `as never` silences a mismatch with the Sandbox constructor's option type —
+  // confirm which of these options the client really accepts.
+  const client = new Sandbox({
+    baseUrl: process.env.SANDBOX_BASE_URL ?? 'https://sandbox.tangle.tools',
+    apiKey: routerKey,
+    timeoutMs: 1_200_000,
+  } as never)
+
+  const control: [Arm, ...Arm[]] = [randomArm('solve')]
+  // Both arms share every setting except the profile handed to the sandboxed agent.
+  const run = (profile: AgentProfile) =>
+    runExperiment({
+      adapter,
+      sandboxClient: client,
+      agentRun: sandboxAgentRun({
+        model,
+        routerBaseUrl,
+        backendType,
+        ...(process.env.WORKER_PROVIDER ? { provider: process.env.WORKER_PROVIDER } : {}),
+        profile,
+      }),
+      arms: control,
+      model,
+      rounds,
+      n,
+      ...(ids ? { ids } : {}),
+      concurrency,
+      ...(adapter.output ? { output: adapter.output } : {}),
+      infraRetries,
+    })
+
+  console.error(`[arm: WITHOUT ${coordinate.name}] (frozen base) running…`)
+  const without = await run(baseProfile)
+  console.error(`  without resolved: ${without.arms[0]?.resolved ?? 0}/${without.n}\n`)
+
+  console.error(`[arm: WITH ${coordinate.name}] running…`)
+  const withC = await run(withProfile)
+  console.error(`  with resolved: ${withC.arms[0]?.resolved ?? 0}/${withC.n}\n`)
+
+  const a = without.arms[0]?.resolved ?? 0
+  const b = withC.arms[0]?.resolved ?? 0
+  // Guard the division so an empty run prints "n/a" instead of "NaN%".
+  const pct = (x: number, nn: number) => (nn > 0 ? `${((x / nn) * 100).toFixed(1)}%` : 'n/a')
+  console.error(`${'='.repeat(72)}\n${coordinate.name.toUpperCase()} COORDINATE (sandboxed ${backendType} ${agentTarget}, ${adapter.name}):`)
+  console.error(`  WITHOUT (base): ${a}/${without.n} (${pct(a, without.n)})`)
+  console.error(`  WITH         : ${b}/${withC.n} (${pct(b, withC.n)})`)
+  console.error(`  delta        : ${b - a > 0 ? '+' : ''}${b - a} instances`)
+}
+
+// Entry point: any failure is reported on stderr (stack when available) and exits with code 1.
+main().catch((err: unknown) => {
+  const detail = err instanceof Error ? (err.stack ?? err.message) : String(err)
+  console.error(`profile-coord-sandbox: ${detail}`)
+  process.exit(1)
+})
diff --git a/bench/src/profile-coordinates.ts b/bench/src/profile-coordinates.ts
@@ -0,0 +1,134 @@
+/**
+ * The AgentProfile genome as independently-optimizable coordinates. Every field the harness lets
+ * you define — prompt, skills, subagents, hooks, tools, mcp — is meant to be a coordinate: a way
+ * to inject a set of named candidates into a profile while holding every OTHER field fixed. (This
+ * module registers skills, hooks, tools, prompt, and subagents; there is no mcp coordinate yet.)
+ * This is the one abstraction behind "improve any part of the agent, freeze the rest, combine freely":
+ *
+ *   compose(baseProfile, selected) = { ...baseProfile, <thisField>: inject(selected) }
+ *
+ * Freeze a coordinate ⇒ never select it (it stays as-is in the base profile). Optimize one ⇒
+ * vary its selection. Combine ⇒ compose several coordinates' composers in sequence. Because a
+ * subagent is ITSELF an AgentProfile, the same coordinates apply recursively to any node in the
+ * supervisor flow (root driver, worker, sub-worker) — you point a coordinate at that node's
+ * base profile.
+ */
+import { readFileSync, readdirSync } from 'node:fs'
+import { join } from 'node:path'
+import type { AgentProfile } from '@tangle-network/sandbox'
+import { defineInlineResource } from '@tangle-network/sandbox'
+
+/**
+ * One independently-variable field of an AgentProfile: it names a set of candidates and knows
+ * how to inject a selection of them into a base profile.
+ */
+export interface ProfileCoordinate {
+  /** Coordinate id (the COORDINATE= knob value). */
+  readonly name: string
+  /** Candidate names the optimizer screens (one per independently-testable unit). */
+  candidates(): readonly string[]
+  /** Inject the selected candidates into THIS field of the profile, holding all others fixed.
+   *  Empty selection ⇒ the base profile unchanged (the frozen/baseline arm).
+   *  The implementations in this file return `base` itself for an empty selection and a
+   *  shallow copy with the one field replaced otherwise. */
+  compose(base: AgentProfile, selected: readonly string[]): AgentProfile
+}
+
+/** Joins `p` onto the directory that contains this module. */
+function here(p: string) {
+  return join(import.meta.dirname, p)
+}
+
+// ── skills: SKILL.md packages materialized to disk (resources.skills) ───────────────
+/**
+ * Builds the `skills` coordinate from the `.md` files found in `dir`.
+ * @param dir Directory holding one markdown file per skill; defaults to SKILLS_DIR (or
+ *   `coding-skills`) joined onto this module's directory.
+ * @returns A coordinate whose candidates are the file names without `.md`, and whose compose
+ *   appends one inline resource per selected skill to `resources.skills`.
+ */
+function skillsCoordinate(dir = here(process.env.SKILLS_DIR ?? 'coding-skills')): ProfileCoordinate {
+  // The directory is re-read on every call; ordering is by file name (extension included).
+  const skillFiles = (): string[] => {
+    const mdFiles: string[] = []
+    for (const entry of readdirSync(dir)) {
+      if (entry.endsWith('.md')) mdFiles.push(entry)
+    }
+    return mdFiles.sort()
+  }
+  // Reads `<dir>/<skill>.md` and wraps its text as an inline resource named after the skill.
+  const loadSkill = (skill: string) => defineInlineResource(skill, readFileSync(join(dir, `${skill}.md`), 'utf8'))
+  return {
+    name: 'skills',
+    candidates() {
+      return skillFiles().map((file) => file.replace(/\.md$/, ''))
+    },
+    compose(base, selected) {
+      if (selected.length === 0) return base
+      const added = selected.map((skill) => loadSkill(skill))
+      const existing = base.resources?.skills ?? []
+      return { ...base, resources: { ...base.resources, skills: [...existing, ...added] } }
+    },
+  }
+}
+
+// ── hooks: shell commands the harness fires on lifecycle events (enforced, not advisory) ──
+// Candidate name → lifecycle event, optional tool matcher, and the shell command to run.
+// Note: `no-print-debugging` currently runs `true`, i.e. its command does nothing.
+const hookDefs: Record<string, { event: string; command: string; matcher?: string }> = {
+  'lint-before-edit': { event: 'PreToolUse', matcher: 'Edit|Write', command: 'ruff check . 2>/dev/null || true' },
+  'tests-after-edit': { event: 'PostToolUse', matcher: 'Edit|Write', command: 'python -m pytest -q 2>/dev/null | tail -5 || true' },
+  'no-print-debugging': { event: 'PreToolUse', matcher: 'Edit|Write', command: 'true' },
+}
+/**
+ * Builds the `hooks` coordinate.
+ * @returns A coordinate whose candidates are the keys of `hookDefs`, and whose compose appends
+ *   each selected hook to the list of its lifecycle event without mutating `base`.
+ *   compose throws `unknown hook <name>` for a name that has no definition.
+ */
+function hooksCoordinate(): ProfileCoordinate {
+  const compose: ProfileCoordinate['compose'] = (base, selected) => {
+    if (selected.length === 0) return base
+    const merged: Record<string, { command: string; matcher?: string }[]> = { ...(base.hooks ?? {}) }
+    selected.forEach((hookName) => {
+      const def = hookDefs[hookName]
+      if (!def) throw new Error(`unknown hook ${hookName}`)
+      const entry: { command: string; matcher?: string } = { command: def.command }
+      if (def.matcher) entry.matcher = def.matcher
+      // concat builds a fresh array, so a list inherited from the base is never mutated.
+      merged[def.event] = (merged[def.event] ?? []).concat(entry)
+    })
+    return { ...base, hooks: merged as AgentProfile['hooks'] }
+  }
+  return { name: 'hooks', candidates: () => Object.keys(hookDefs), compose }
+}
+
+// ── tools: enable/disable named harness tools ───────────────────────────────────────
+const toolCandidates = ['webfetch', 'websearch', 'bash', 'edit', 'read', 'grep']
+/**
+ * Builds the `tools` coordinate.
+ * @returns A coordinate whose candidates are the shared `toolCandidates` array (not a copy),
+ *   and whose compose sets every selected tool name to `true` on a copy of `base.tools`.
+ *   Selected names are not checked against `toolCandidates`.
+ */
+function toolsCoordinate(): ProfileCoordinate {
+  return {
+    name: 'tools',
+    candidates: () => toolCandidates,
+    compose(base, selected) {
+      if (selected.length === 0) return base
+      const tools = selected.reduce<Record<string, boolean>>(
+        (acc, tool) => {
+          acc[tool] = true
+          return acc
+        },
+        { ...(base.tools ?? {}) },
+      )
+      return { ...base, tools }
+    },
+  }
+}
+
+// ── prompt: extra instruction lines appended to the active system prompt ─────────────
+// Candidate name → the instruction sentence injected when that candidate is selected.
+const instructionDefs: Record<string, string> = {
+  'be-surgical': 'Make the smallest change that satisfies the task; do not touch unrelated code.',
+  'check-examples': 'Before finalizing, re-read the examples in the docstring and confirm your output matches them exactly.',
+  'edge-cases': 'Enumerate boundary inputs (empty, zero, negative, max) and make sure your solution handles each.',
+}
+/**
+ * Builds the `prompt` coordinate.
+ * @returns A coordinate whose candidates are the keys of `instructionDefs`, and whose compose
+ *   appends the selected instruction texts after any instructions already in `base.prompt`.
+ *   compose returns `base` itself for an empty selection and throws `unknown instruction <name>`
+ *   for a name that has no definition.
+ */
+function promptCoordinate(): ProfileCoordinate {
+  return {
+    name: 'prompt',
+    candidates: () => Object.keys(instructionDefs),
+    compose: (base, selected) => {
+      if (!selected.length) return base
+      const added = selected.map((n) => {
+        const text = instructionDefs[n]
+        // Fail loudly, as the hooks coordinate does: a typo must not put `undefined` (or an
+        // inherited Object member such as `constructor`) into the instruction list.
+        if (typeof text !== 'string') throw new Error(`unknown instruction ${n}`)
+        return text
+      })
+      const instructions = [...(base.prompt?.instructions ?? []), ...added]
+      return { ...base, prompt: { ...base.prompt, instructions } }
+    },
+  }
+}
+
+// ── subagents: helper agents the root can delegate to (each is itself a mini-profile) ──
+// Candidate name → the subagent's description, prompt, and optional tool switches.
+const subagentDefs: Record<string, { description: string; prompt: string; tools?: Record<string, boolean> }> = {
+  reviewer: { description: 'Reviews a proposed change for bugs before it is finalized.', prompt: 'You are a strict code reviewer. Find bugs, edge cases, and contract violations in the proposed change. Be concise.' },
+  tester: { description: 'Writes and runs a focused test for the change.', prompt: 'You write the minimal test that would catch a regression in this change, run it, and report pass/fail.' },
+}
+/**
+ * Builds the `subagents` coordinate.
+ * @returns A coordinate whose candidates are the keys of `subagentDefs`, and whose compose sets
+ *   each selected subagent on a copy of `base.subagents` (replacing an entry of the same name).
+ *   compose returns `base` itself for an empty selection and throws `unknown subagent <name>`
+ *   for a name that has no definition.
+ */
+function subagentsCoordinate(): ProfileCoordinate {
+  return {
+    name: 'subagents',
+    candidates: () => Object.keys(subagentDefs),
+    compose: (base, selected) => {
+      if (!selected.length) return base
+      const subagents = { ...(base.subagents ?? {}) }
+      for (const n of selected) {
+        // Own-property lookup so neither a typo nor an inherited key such as `constructor`
+        // is written into the profile as a subagent.
+        const def = Object.prototype.hasOwnProperty.call(subagentDefs, n) ? subagentDefs[n] : undefined
+        if (!def) throw new Error(`unknown subagent ${n}`)
+        subagents[n] = def
+      }
+      return { ...base, subagents: subagents as AgentProfile['subagents'] }
+    },
+  }
+}
+
+// Coordinate name → factory. `skills` is wrapped so its optional `dir` parameter keeps its default.
+const REGISTRY: Record<string, () => ProfileCoordinate> = {
+  skills: () => skillsCoordinate(),
+  hooks: hooksCoordinate,
+  tools: toolsCoordinate,
+  prompt: promptCoordinate,
+  subagents: subagentsCoordinate,
+}
+
+/**
+ * Looks up a coordinate by name and builds a fresh instance of it.
+ * @param name Coordinate id, e.g. the value of the COORDINATE= knob.
+ * @returns A newly built coordinate.
+ * @throws Error listing the registered names when `name` is not registered.
+ */
+export function getCoordinate(name: string): ProfileCoordinate {
+  // Own-property lookup: `REGISTRY['constructor']` or `REGISTRY['toString']` would otherwise
+  // resolve to an inherited Object member and slip past the unknown-name check.
+  const make = Object.prototype.hasOwnProperty.call(REGISTRY, name) ? REGISTRY[name] : undefined
+  if (!make) throw new Error(`unknown coordinate ${name} (have: ${Object.keys(REGISTRY).join(', ')})`)
+  return make()
+}
+
+/** Lists the names of every registered coordinate, in registration order. */
+export function coordinateNames(): string[] {
+  return Object.keys(REGISTRY)
+}