# Skip to content
#
# Run Benchmark
#
# Run Benchmark #13
#
# Workflow file for this run

name: Run Benchmark

# Started by hand (workflow_dispatch). All three inputs are required strings.
on:
  workflow_dispatch:
    inputs:
      agent:
        description: "Agent to use"
        required: true
        type: string
      model:
        description: "Model to use"
        required: true
        type: string
      # Comma-separated; split into one matrix entry per task by the
      # `prepare` job.
      tasks:
        description: "Comma-separated list of tasks"
        required: true
        type: string
# Pipeline: prepare -> benchmark (task x run matrix) -> summarize-runs
# (one per task) -> summarize-tasks (single roll-up).
jobs:
  # Converts the comma-separated `tasks` input into a JSON array that the
  # downstream jobs consume as a matrix dimension.
  prepare:
    runs-on: ubuntu-latest
    outputs:
      tasks: ${{ steps.split.outputs.tasks }}
    steps:
      - name: Split tasks into matrix
        id: split
        env:
          # The input reaches the script through the environment instead of
          # being interpolated into the script text, so its content is never
          # parsed as shell code.
          TASKS_INPUT: ${{ inputs.tasks }}
        run: |
          # Split on commas, trim surrounding whitespace from every entry,
          # drop empty entries, and emit a compact JSON array.
          TASKS_JSON=$(
            printf '%s\n' "$TASKS_INPUT" |
              tr ',' '\n' |
              sed 's/^[[:space:]]*//;s/[[:space:]]*$//' |
              jq -R -s -c 'split("\n") | map(select(length > 0))'
          )
          echo "tasks=$TASKS_JSON" >> "$GITHUB_OUTPUT"

  # Executes every task three times (matrix: task x run) and uploads one
  # result file per combination.
  benchmark:
    needs: prepare
    runs-on: ubuntu-latest
    strategy:
      matrix:
        task: ${{ fromJson(needs.prepare.outputs.tasks) }}
        # The `summarize-runs` job downloads run1..run3 by name; keep both
        # places in sync when changing the number of runs.
        run: [1, 2, 3]
    environment: production
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Setup Bun
        uses: oven-sh/setup-bun@v1
        with:
          bun-version: "1.2.21"
      - name: Install dependencies
        run: bun install
      - name: Install OpenCode CLI
        run: bun add -g opencode-ai
      - name: Print benchmark config
        env:
          MODEL: ${{ inputs.model }}
          TASK: ${{ matrix.task }}
          RUN: ${{ matrix.run }}
        run: |
          echo "Model: ${MODEL}"
          echo "Task: ${TASK}"
          echo "Run: ${RUN}"
      - name: Run benchmark
        # Configuration is handed to the script through environment
        # variables.
        env:
          OPENCODE_API_KEY: ${{ secrets.OPENCODE_API_KEY }}
          DEBUG: "true"
          TASK: ${{ matrix.task }}
          MODEL: ${{ inputs.model }}
          AGENT: ${{ inputs.agent }}
          # Must match the `path` of the upload step below; presumably
          # github/run.ts writes its result here - confirm in the script.
          RESULT_PATH: result-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}-run${{ matrix.run }}.json
        run: bun github/run.ts
      - name: Upload benchmark results
        uses: actions/upload-artifact@v4
        with:
          # NOTE(review): task/model/agent values are embedded verbatim in the
          # artifact name and file name; a value containing "/" (or another
          # character upload-artifact rejects) would break this step - confirm
          # which values are used in practice.
          name: result-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}-run${{ matrix.run }}
          path: result-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}-run${{ matrix.run }}.json

  # Combines the three run results of each task into one per-task summary.
  summarize-runs:
    # `prepare` has to be listed as a direct dependency: the matrix below
    # reads needs.prepare.outputs.tasks, and the `needs` context only exposes
    # jobs named here.
    needs:
      - prepare
      - benchmark
    runs-on: ubuntu-latest
    strategy:
      matrix:
        task: ${{ fromJson(needs.prepare.outputs.tasks) }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Setup Bun
        uses: oven-sh/setup-bun@v1
        with:
          bun-version: "1.2.21"
      - name: Install dependencies
        run: bun install
      - name: Download run 1 results
        uses: actions/download-artifact@v4
        with:
          name: result-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}-run1
          path: results
      - name: Download run 2 results
        uses: actions/download-artifact@v4
        with:
          name: result-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}-run2
          path: results
      - name: Download run 3 results
        uses: actions/download-artifact@v4
        with:
          name: result-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}-run3
          path: results
      - name: Summarize runs
        env:
          # Comma-separated list of the three files downloaded above.
          RESULT_PATHS: results/result-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}-run1.json,results/result-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}-run2.json,results/result-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}-run3.json
          # Must match the `path` of the upload step below.
          RUNS_SUMMARY_PATH: runs-summary-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}.json
        run: bun github/summarize-runs.ts
      - name: Upload runs summary
        uses: actions/upload-artifact@v4
        with:
          name: runs-summary-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}
          path: runs-summary-${{ matrix.task }}-${{ inputs.model }}-${{ inputs.agent }}.json

  # Rolls every per-task summary up into a single tasks-summary.json.
  summarize-tasks:
    needs: summarize-runs
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Setup Bun
        uses: oven-sh/setup-bun@v1
        with:
          bun-version: "1.2.21"
      - name: Install dependencies
        run: bun install
      - name: Download all runs summaries
        uses: actions/download-artifact@v4
        with:
          pattern: "runs-summary-*"
          path: runs-summaries
      - name: Summarize tasks
        env:
          # Must match the `path` of the upload step below.
          TASKS_SUMMARY_PATH: tasks-summary.json
        run: |
          # Collect every downloaded summary file, at any depth below
          # runs-summaries, into one comma-separated list without a
          # trailing comma.
          RUNS_SUMMARY_PATHS=$(find runs-summaries -name 'runs-summary-*.json' | tr '\n' ',' | sed 's/,$//')
          export RUNS_SUMMARY_PATHS
          bun github/summarize-tasks.ts
      - name: Upload tasks summary
        uses: actions/upload-artifact@v4
        with:
          name: tasks-summary
          path: tasks-summary.json