Skip to content

Commit d613b12

Browse files
committed
update
1 parent 7507527 commit d613b12

File tree

9 files changed

+104
-57
lines changed

9 files changed

+104
-57
lines changed

agents/claude-code.ts

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -107,11 +107,12 @@ const claudeCodeAgent: AgentDefinition = {
107107
const cacheKey = sessionKey(cwd, model);
108108
const existingSessionID = sessionCache.get(cacheKey);
109109

110-
const actions: string[] = [];
111-
const usage = {
112-
input: 0,
113-
output: 0,
114-
};
110+
const actions: string[] = [];
111+
const usage = {
112+
input: 0,
113+
output: 0,
114+
cost: 0,
115+
};
115116

116117
try {
117118
const result = query({
@@ -134,6 +135,7 @@ const claudeCodeAgent: AgentDefinition = {
134135
if (message.type === "result" && "usage" in message) {
135136
usage.input += message.usage.input_tokens || 0;
136137
usage.output += message.usage.output_tokens || 0;
138+
usage.cost += message.total_cost_usd || 0;
137139
}
138140

139141
actions.push(JSON.stringify(message));

agents/codex.ts

Lines changed: 45 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,13 @@ const DEFAULT_SANDBOX: SandboxMode = "workspace-write";
2121
const codexClient = new Codex();
2222
const threadCache = new Map<string, Thread>();
2323

24-
export const models: string[] = [
24+
export const models = [
2525
"gpt-5-codex",
2626
"gpt-5.1-codex",
2727
// "gpt-5",
2828
// "o3",
2929
// "o4-mini"
30-
];
30+
] as const;
3131

3232
function sessionKey(cwd: string, model: string): string {
3333
return `${cwd}::${model}`;
@@ -106,9 +106,9 @@ function getOrCreateThread(model: string, cwd: string): Thread {
106106
return thread;
107107
}
108108

109-
const codexAgent: AgentDefinition = {
109+
const codexAgent: AgentDefinition<(typeof models)[number]> = {
110110
async run(
111-
model: string,
111+
model: (typeof models)[number],
112112
prompt: string,
113113
cwd: string,
114114
options?: AgentRunOptions,
@@ -130,10 +130,32 @@ const codexAgent: AgentDefinition = {
130130

131131
const actions: string[] = [];
132132
let usage: Usage;
133+
let cost = 0;
133134
try {
135+
const pricingKey = model;
136+
const pricing = openai.models[pricingKey]?.cost;
134137
const turn = await thread.run(prompt);
135138
assert(turn.usage, "The agent did not emit the usage information.");
136139
usage = turn.usage;
140+
if (!pricing) {
141+
if (!missingPricing.has(pricingKey)) {
142+
missingPricing.add(pricingKey);
143+
console.warn(
144+
`[codex] Pricing not found for ${pricingKey}; using $0 for cost calculation.`,
145+
);
146+
}
147+
} else {
148+
const billableInput =
149+
(usage.input_tokens ?? 0) - (usage.cached_input_tokens ?? 0);
150+
const cachedInput = usage.cached_input_tokens ?? 0;
151+
const output = usage.output_tokens ?? 0;
152+
cost =
153+
(billableInput * pricing.input +
154+
output * pricing.output +
155+
cachedInput * pricing.cache_read) /
156+
1_000_000;
157+
}
158+
137159
actions.push(...turn.items.map((item) => JSON.stringify(item)));
138160
logTurnItems(turn.items, options);
139161
} catch (error) {
@@ -147,9 +169,28 @@ const codexAgent: AgentDefinition = {
147169
usage: {
148170
input: usage.input_tokens,
149171
output: usage.output_tokens,
172+
cost,
150173
},
151174
};
152175
},
153176
};
154177

155178
export default codexAgent;
179+
180+
181+
const response = await fetch("https://models.dev/api.json");
182+
if (!response.ok) {
183+
throw new Error(`models.dev responded with ${response.status}`);
184+
}
185+
186+
const openai = (await response.json())["openai"] as {
187+
models: Record<string, {
188+
cost: {
189+
input: number,
190+
output: number,
191+
cache_read: number
192+
}
193+
}>
194+
}
195+
196+
const missingPricing = new Set<string>();

agents/index.ts

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,21 +2,21 @@ import { strict as assert } from "node:assert";
22

33
import type { AgentDefinition } from "~/lib/createAgent.js";
44

5-
export interface AgentRegistration {
5+
export interface AgentRegistration<TModel extends string = string> {
66
name: string;
7-
definition: AgentDefinition;
8-
models: string[];
7+
definition: AgentDefinition<TModel>;
8+
models: ReadonlyArray<TModel>;
99
}
1010

11-
interface AgentModuleShape {
12-
default?: AgentDefinition;
13-
models?: string[];
11+
interface AgentModuleShape<TModel extends string = string> {
12+
default?: AgentDefinition<TModel>;
13+
models?: ReadonlyArray<TModel>;
1414
}
1515

16-
function createAgentRegistration(
16+
function createAgentRegistration<TModel extends string>(
1717
name: string,
18-
module: AgentModuleShape,
19-
): AgentRegistration {
18+
module: AgentModuleShape<TModel>,
19+
): AgentRegistration<TModel> {
2020
const definition = module.default;
2121
const models = module.models;
2222

@@ -26,7 +26,7 @@ function createAgentRegistration(
2626
return { name, definition, models };
2727
}
2828

29-
const agents: Record<string, AgentRegistration> = {
29+
const agents: Record<string, AgentRegistration<any>> = {
3030
codex: createAgentRegistration("codex", await import("~/agents/codex.js")),
3131
opencode: createAgentRegistration(
3232
"opencode",

agents/opencode.ts

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -169,11 +169,12 @@ const opencodeAgent: AgentDefinition = {
169169
sessionCache.set(cacheKey, sessionID);
170170
}
171171

172-
const actions: string[] = [];
173-
const usage = {
174-
input: 0,
175-
output: 0,
176-
};
172+
const actions: string[] = [];
173+
const usage = {
174+
input: 0,
175+
output: 0,
176+
cost: 0,
177+
};
177178
try {
178179
const [providerID, modelID] = model.split("/");
179180

@@ -196,6 +197,7 @@ const opencodeAgent: AgentDefinition = {
196197
const info = data.info;
197198
usage.input = info?.tokens?.input ?? 0;
198199
usage.output = info?.tokens?.output ?? 0;
200+
usage.cost = info?.cost ?? 0;
199201

200202
if (info) {
201203
actions.push(JSON.stringify(info));

bun.lock

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
{
22
"lockfileVersion": 1,
3-
"configVersion": 0,
43
"workspaces": {
54
"": {
65
"name": "opencode-bench",

cli.ts

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ interface EpisodeResult {
3737
logs: string[];
3838
actions: string[];
3939
usage: Usage;
40-
durationMs: number;
40+
duration: number;
4141
}
4242

4343
const evalIds = dataset
@@ -172,15 +172,14 @@ cli.command(
172172
console.log(
173173
`${prefix} Starting episode (timeout: ${timeoutInMinutes} min)...`,
174174
);
175-
const startedAt = Date.now();
176175
const result = await runEpisode(
177176
evalDef,
178177
agent,
179178
model,
180179
tasks,
181180
prefix,
182181
);
183-
return { index, durationMs: Date.now() - startedAt, ...result };
182+
return { index, ...result };
184183
},
185184
{
186185
timeoutMs: timeoutInMinutes * 60 * 1000,
@@ -226,17 +225,15 @@ cli.command(
226225
(prev, { usage }) => ({
227226
input: prev.input + usage.input / episodeResults.length,
228227
output: prev.output + usage.output / episodeResults.length,
228+
cost: prev.cost + usage.cost / episodeResults.length,
229229
}),
230-
{ input: 0, output: 0 },
230+
{ input: 0, output: 0, cost: 0 },
231231
);
232232

233-
const totalDurationMs = episodeResults.reduce(
234-
(prev, { durationMs }) => prev + durationMs,
233+
const totalDuration = episodeResults.reduce(
234+
(prev, { duration }) => prev + duration,
235235
0,
236236
);
237-
const totalTokens = averageUsage.input + averageUsage.output;
238-
const tokensPerSecond =
239-
totalDurationMs > 0 ? totalTokens / (totalDurationMs / 1000) : 0;
240237

241238
for (const result of episodeResults) {
242239
mergeAggregationInputs(aggregatedInputs, result.aggregation);
@@ -281,8 +278,7 @@ cli.command(
281278
episodeExports,
282279
averageUsage,
283280
summary,
284-
totalDurationMs,
285-
tokensPerSecond,
281+
totalDuration,
286282
);
287283

288284
printEvalResult(evaluationResult);
@@ -316,6 +312,7 @@ async function runEpisode(
316312
) {
317313
const baselineCommit = evalDef.from;
318314
let cwd: string | undefined;
315+
let episodeDuration = 0;
319316

320317
try {
321318
console.log(`${prefix} Cloning repository...`);
@@ -345,15 +342,20 @@ async function runEpisode(
345342
}
346343

347344
let tasksExecuted = 0;
348-
let usage: Usage = { input: 0, output: 0 };
345+
let usage: Usage = { input: 0, output: 0, cost: 0 };
349346
const episodeActions: string[] = [];
347+
let episodeDuration = 0;
350348

351349
for (const task of tasks) {
352350
const logPrefix = `${prefix} ${task.commit}`;
353351

354352
try {
353+
let successfulRunDuration = 0;
354+
// TODO: retrying only the agent run here is incorrect: if a failed attempt left the work half-done, the retry picks up from those partial changes instead of starting fresh.
355+
// The agent should start again from a clean state and redo the work, so the whole loop should be restarted rather than only this run.
355356
const result = await withRetries(
356357
async () => {
358+
const startedAt = Date.now();
357359
const result = await agent.definition.run(
358360
model,
359361
task.prompt,
@@ -365,6 +367,7 @@ async function runEpisode(
365367
logPrefix,
366368
},
367369
);
370+
successfulRunDuration = Date.now() - startedAt;
368371
return result;
369372
},
370373
{
@@ -386,10 +389,12 @@ async function runEpisode(
386389
},
387390
},
388391
);
392+
episodeDuration += successfulRunDuration;
389393

390394
// Only accumulate usage from the successful result
391395
usage.input += result.usage.input;
392396
usage.output += result.usage.output;
397+
usage.cost += result.usage.cost;
393398

394399
// Collect actions from this task
395400
episodeActions.push(...result.actions);
@@ -447,6 +452,7 @@ async function runEpisode(
447452
logs: [],
448453
actions: episodeActions,
449454
usage,
455+
duration: episodeDuration,
450456
};
451457
} finally {
452458
if (cwd) {
@@ -633,8 +639,7 @@ function summarizeAggregation(
633639
episodes: Episode[],
634640
usage: Usage,
635641
summary: string,
636-
durationMs: number,
637-
tokensPerSecond: number,
642+
duration: number,
638643
): { lines: string[]; exportData: EvaluationRunExport } {
639644
const evalId = datasetEval.repo;
640645
const runContext = contextLabel ? `${evalId} [${contextLabel}]` : evalId;
@@ -697,8 +702,7 @@ function summarizeAggregation(
697702
episodes,
698703
usage,
699704
summary,
700-
durationMs,
701-
tokensPerSecond,
705+
duration,
702706
};
703707

704708
return { lines, exportData };

lib/createAgent.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@ export type AgentExecutor = (
1313
prompt: AgentPrompt,
1414
) => AgentCommandSpec | Promise<AgentCommandSpec>;
1515

16-
export interface AgentDefinition {
16+
export interface AgentDefinition<TModel extends string = string> {
1717
run: (
18-
model: string,
18+
model: TModel,
1919
prompt: AgentPrompt,
2020
cwd: string,
2121
options?: AgentRunOptions,
@@ -29,6 +29,7 @@ export interface AgentRunResult {
2929
usage: {
3030
input: number;
3131
output: number;
32+
cost: number;
3233
};
3334
}
3435

0 commit comments

Comments
 (0)