From 223a9feb56e668a9ab159f1849d13c68d8d405ae Mon Sep 17 00:00:00 2001
From: Mark Saroufim
Date: Tue, 13 Jan 2026 16:54:48 -0800
Subject: [PATCH 1/3] GPU concurrency

---
 .github/workflows/nvidia_workflow.yml | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml
index 59a5e4d4..7240393d 100644
--- a/.github/workflows/nvidia_workflow.yml
+++ b/.github/workflows/nvidia_workflow.yml
@@ -18,8 +18,23 @@ on:
 run-name: 'NVIDIA Job - ${{ github.event.inputs.run_id }}'
 
 jobs:
+  select-runner:
+    runs-on: ubuntu-latest
+    outputs:
+      runner: ${{ steps.pick.outputs.runner }}
+    steps:
+      - id: pick
+        run: |
+          runners=("b200-02-gpu0" "b200-02-gpu2" "b200-02-gpu4" "b200-02-gpu5" "b200-02-gpu7")
+          index=$(( ${{ github.run_number }} % 5 ))
+          echo "runner=${runners[$index]}" >> $GITHUB_OUTPUT
+
   run:
-    runs-on: [nvidia-docker-b200-8-x86-64]
+    needs: select-runner
+    runs-on: ${{ needs.select-runner.outputs.runner }}
+    concurrency:
+      group: ${{ needs.select-runner.outputs.runner }}
+      cancel-in-progress: false
     timeout-minutes: 10
     steps:
       - uses: actions/checkout@v3

From e5b72eabb9772fd76fe0e73ac566ff0d26459572 Mon Sep 17 00:00:00 2001
From: Mark Saroufim
Date: Tue, 13 Jan 2026 16:57:01 -0800
Subject: [PATCH 2/3] update

---
 .github/workflows/nvidia_workflow.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml
index 7240393d..a680e3e4 100644
--- a/.github/workflows/nvidia_workflow.yml
+++ b/.github/workflows/nvidia_workflow.yml
@@ -25,8 +25,8 @@ jobs:
     steps:
       - id: pick
         run: |
-          runners=("b200-02-gpu0" "b200-02-gpu2" "b200-02-gpu4" "b200-02-gpu5" "b200-02-gpu7")
-          index=$(( ${{ github.run_number }} % 5 ))
+          runners=("b200-02-gpu0" "b200-02-gpu1" "b200-02-gpu2" "b200-02-gpu3" "b200-02-gpu4" "b200-02-gpu5" "b200-02-gpu6" "b200-02-gpu7")
+          index=$(( ${{ github.run_number }} % 8 ))
           echo "runner=${runners[$index]}" >> $GITHUB_OUTPUT
 
   run:

From 89007f1bbb47cefac8c106a7d79f42c6bf3f61df Mon Sep 17 00:00:00 2001
From: Mark Saroufim
Date: Tue, 13 Jan 2026 17:02:08 -0800
Subject: [PATCH 3/3] Simplify NVIDIA workflow runner selection

Remove round-robin GPU selection and concurrency groups in favor of
letting GitHub's native self-hosted runner queuing handle distribution.
Self-hosted runners only run one job at a time by default.
---
 .github/workflows/nvidia_workflow.yml | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml
index a680e3e4..544c9de9 100644
--- a/.github/workflows/nvidia_workflow.yml
+++ b/.github/workflows/nvidia_workflow.yml
@@ -18,23 +18,8 @@ on:
 run-name: 'NVIDIA Job - ${{ github.event.inputs.run_id }}'
 
 jobs:
-  select-runner:
-    runs-on: ubuntu-latest
-    outputs:
-      runner: ${{ steps.pick.outputs.runner }}
-    steps:
-      - id: pick
-        run: |
-          runners=("b200-02-gpu0" "b200-02-gpu1" "b200-02-gpu2" "b200-02-gpu3" "b200-02-gpu4" "b200-02-gpu5" "b200-02-gpu6" "b200-02-gpu7")
-          index=$(( ${{ github.run_number }} % 8 ))
-          echo "runner=${runners[$index]}" >> $GITHUB_OUTPUT
-
   run:
-    needs: select-runner
-    runs-on: ${{ needs.select-runner.outputs.runner }}
-    concurrency:
-      group: ${{ needs.select-runner.outputs.runner }}
-      cancel-in-progress: false
+    runs-on: [self-hosted, nvidia-docker-b200-8-x86-64]
     timeout-minutes: 10
     steps:
       - uses: actions/checkout@v3