diff --git a/package.json b/package.json index 5f2dba5..2e4094a 100644 --- a/package.json +++ b/package.json @@ -68,8 +68,8 @@ "format": "biome format --write src tests" }, "dependencies": { - "@tangle-network/agent-eval": ">=0.77.0 <0.80.0", - "@tangle-network/agent-runtime": "^0.44.0", + "@tangle-network/agent-eval": "^0.91.0", + "@tangle-network/agent-runtime": "^0.50.0", "zod": "^4.3.6" }, "devDependencies": { diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index fff26b1..a8f86cd 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -9,11 +9,11 @@ importers: .: dependencies: '@tangle-network/agent-eval': - specifier: '>=0.77.0 <0.80.0' - version: 0.79.0(@tangle-network/sandbox@0.4.3(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3) + specifier: ^0.91.0 + version: 0.91.0(@tangle-network/sandbox@0.4.3(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3) '@tangle-network/agent-runtime': - specifier: ^0.44.0 - version: 0.44.0(@tangle-network/agent-eval@0.79.0(@tangle-network/sandbox@0.4.3(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3))(@tangle-network/sandbox@0.4.3(viem@2.48.8(typescript@5.9.3)(zod@4.4.2))) + specifier: ^0.50.0 + version: 0.50.0(@tangle-network/agent-eval@0.91.0(@tangle-network/sandbox@0.4.3(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3))(@tangle-network/sandbox@0.4.3(viem@2.48.8(typescript@5.9.3)(zod@4.4.2))) zod: specifier: ^4.3.6 version: 4.4.2 @@ -458,8 +458,8 @@ packages: '@scure/bip39@2.2.0': resolution: {integrity: sha512-T/Bj/YvYMNkIPq6EENO6/rcs2e7qTNuyoUXf0KBFDmp0ZDu0H2X4Lq6yC3i0c8PcWkov5EbW+yQZZbdMmk154A==} - '@tangle-network/agent-eval@0.79.0': - resolution: {integrity: sha512-reN1SbKvTXFS27PQa4l5dnwf0y33j118FM0aPKAq8I0fvj3H6olF1wwQWHCSm7Sjats9rU4c8x2wlSNk0VeCBQ==} + '@tangle-network/agent-eval@0.91.0': + resolution: {integrity: sha512-YzFZIcH7E1HbqL7pBf0BURc/4h5PQIDuGOssL2ybU2RFmZ9SSGAnKtliX2Dtn3sOObfXbH3/zDN4ka2RdgRPtw==} engines: {node: '>=20'} hasBin: true peerDependencies: @@ -468,19 +468,22 @@ packages: '@tangle-network/sandbox': optional: true - '@tangle-network/agent-runtime@0.44.0': - resolution: {integrity: sha512-uMzWcziIV+SsgvdvvnnSobaFYZuYXQ3KRfvq9h9kHglVLtPoUGH78ypCnyn5QQIccTk4gjSenAHC5Iy076DkQg==} + '@tangle-network/agent-runtime@0.50.0': + resolution: {integrity: sha512-fNVcaG7sDOxu8ILt61N4+zBfA/lnY6P8YGAt4r5cI7ekfitfwJ3GZBk2YahxJHQ0XrtMQrF6kGe7dMuSxidxNg==} engines: {node: '>=20'} hasBin: true peerDependencies: - '@tangle-network/agent-eval': '>=0.61.0 <1.0.0' + '@tangle-network/agent-eval': '>=0.83.0 <1.0.0' '@tangle-network/agent-knowledge': '>=1.3.0 <2.0.0' - '@tangle-network/sandbox': '>=0.1.2 <0.5.0' + '@tangle-network/sandbox': '>=0.1.2 <0.7.0' + playwright: ^1.40.0 peerDependenciesMeta: '@tangle-network/agent-knowledge': optional: true '@tangle-network/sandbox': optional: true + playwright: + optional: true '@tangle-network/sandbox@0.1.2': resolution: {integrity: sha512-6TPH9QgCgou9Bhc1kzLNL4/PRiT1mjId6NONY5Le/KT2kh77cXH8KN3TTY/cU+/eW+WM5FYJOy32FWl2HShXbw==} @@ -1252,7 +1255,7 @@ snapshots: '@noble/hashes': 2.2.0 '@scure/base': 2.2.0 - '@tangle-network/agent-eval@0.79.0(@tangle-network/sandbox@0.4.3(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3)': + '@tangle-network/agent-eval@0.91.0(@tangle-network/sandbox@0.4.3(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3)': dependencies: '@asteasolutions/zod-to-openapi': 8.5.0(zod@4.4.2) '@ax-llm/ax': 19.0.45(zod@4.4.2) @@ -1267,9 +1270,9 @@ snapshots: - typescript - utf-8-validate - '@tangle-network/agent-runtime@0.44.0(@tangle-network/agent-eval@0.79.0(@tangle-network/sandbox@0.4.3(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3))(@tangle-network/sandbox@0.4.3(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))': + '@tangle-network/agent-runtime@0.50.0(@tangle-network/agent-eval@0.91.0(@tangle-network/sandbox@0.4.3(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3))(@tangle-network/sandbox@0.4.3(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))': dependencies: - '@tangle-network/agent-eval': 0.79.0(@tangle-network/sandbox@0.4.3(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3) + '@tangle-network/agent-eval': 0.91.0(@tangle-network/sandbox@0.4.3(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3) optionalDependencies: '@tangle-network/sandbox': 0.4.3(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)) diff --git a/src/profiles/researcher.ts b/src/profiles/researcher.ts index 045a82e..ba866bf 100644 --- a/src/profiles/researcher.ts +++ b/src/profiles/researcher.ts @@ -26,9 +26,10 @@ import { type AgentProfile, type AgentRunSpec, - createFanoutVoteDriver, + createDriver, type DefaultVerdict, type Driver, + type DriverDecision, type OutputAdapter, type SandboxEvent, type Validator, @@ -185,7 +186,7 @@ export function multiHarnessResearcherFanout(options: MultiHarnessResearcherFano agentRuns: AgentRunSpec[] output: OutputAdapter validator: Validator - driver: Driver + driver: Driver } { const harnesses = options.harnesses && options.harnesses.length > 0 @@ -200,7 +201,15 @@ export function multiHarnessResearcherFanout(options: MultiHarnessResearcherFano citationDensityMin: options.citationDensityMin, task: options.task, }) - const driver = createFanoutVoteDriver({ n: harnesses.length }) + // Single fanout round across the N harnesses, then stop: the kernel + // round-robins `agentRuns` over the N branches and selects the winner + // (best valid score) across all iterations via `defaultSelectWinner`. + const driver = createDriver({ + planner: ({ task, history }) => + history.length === 0 + ? { kind: 'fanout', tasks: Array.from({ length: harnesses.length }, () => task) } + : { kind: 'stop' }, + }) return { agentRuns, output, validator, driver } } diff --git a/tests/loops/researcher-integration.test.ts b/tests/loops/researcher-integration.test.ts index 9dbd8a4..1057e3a 100644 --- a/tests/loops/researcher-integration.test.ts +++ b/tests/loops/researcher-integration.test.ts @@ -108,7 +108,7 @@ describe('researcherProfile end-to-end through runLoop', () => { ctx: { sandboxClient: client }, }) - expect(result.decision).toBe('pick-winner') + expect(result.decision).toBe('done') expect(result.iterations).toHaveLength(3) expect(result.winner).toBeDefined() expect(result.winner?.output.items).toHaveLength(1) @@ -135,15 +135,15 @@ describe('researcherProfile end-to-end through runLoop', () => { ctx: { sandboxClient: client }, }) - expect(result.decision).toBe('fail') + expect(result.decision).toBe('done') expect(result.iterations).toHaveLength(2) for (const iter of result.iterations) { expect(iter.verdict?.valid).toBe(false) expect(iter.verdict?.notes).toMatch(/namespace violation/) } - // The kernel surfaces a structural top-of-attempts even on `fail`. - // The contract is `decision === 'fail'` + `winner.verdict.valid === false`; - // never a winner with `valid === true` when every output leaked. + // The kernel may surface a structural top-of-attempts even when nothing + // validates. The contract is: never a winner with `valid === true` when + // every output leaked across namespaces. if (result.winner) { expect(result.winner.verdict?.valid).toBe(false) } @@ -168,7 +168,7 @@ describe('researcherProfile end-to-end through runLoop', () => { ctx: { sandboxClient: client }, }) - expect(result.decision).toBe('pick-winner') + expect(result.decision).toBe('done') expect(result.winner?.iterationIndex).toBe(1) expect(result.winner?.agentRunName).toBe('researcher-high-quality') }) diff --git a/tests/profiles/researcher.test.ts b/tests/profiles/researcher.test.ts index cc5e4e2..e49076d 100644 --- a/tests/profiles/researcher.test.ts +++ b/tests/profiles/researcher.test.ts @@ -402,7 +402,7 @@ describe('loose-output passthrough', () => { }) describe('multiHarnessResearcherFanout', () => { - it('builds N AgentRunSpecs with a FanoutVote driver', () => { + it('builds N AgentRunSpecs with a single-fanout-then-stop driver', () => { const fan = multiHarnessResearcherFanout({ harnesses: ['claude-code', 'codex', 'opencode/zai-coding-plan/glm-5.1'], }) @@ -414,7 +414,7 @@ describe('multiHarnessResearcherFanout', () => { ]) expect(typeof fan.driver.plan).toBe('function') expect(typeof fan.driver.decide).toBe('function') - expect(fan.driver.name).toBe('fanout-vote') + expect(fan.driver.name).toBe('dynamic') }) it('falls back to three default harnesses when none supplied', () => {