Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 16 additions & 4 deletions .github/workflows/collect-evals.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,24 @@ jobs:
path: eval_results/
pattern: ${{ inputs.result-prefix && format('eval_{0}_*', inputs.result-prefix) || 'eval_*' }}

- name: Set up Python
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: '3.12'
cache: 'pip'
cache-dependency-path: utils/requirements.txt

- name: Install dependencies
run: |
pip install -r utils/requirements.txt

- name: Summarize evals
run: |
pip install tabulate
echo "## Eval Summary" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
python3 utils/collect_eval_results.py eval_results/ ${{ inputs.result-prefix || 'all' }} >> $GITHUB_STEP_SUMMARY
{
echo "## Eval Summary"
echo ""
python3 utils/collect_eval_results.py eval_results/ "${{ inputs.result-prefix || 'all' }}"
} >> "$GITHUB_STEP_SUMMARY"

- name: Upload aggregated evals
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
Expand Down
19 changes: 13 additions & 6 deletions .github/workflows/collect-results.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,22 @@ jobs:
path: results/
pattern: ${{ inputs.result-prefix && format('{0}_*', inputs.result-prefix) || '*' }}

- name: Print summary
- name: Set up Python
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: '3.12'
cache: 'pip'
cache-dependency-path: utils/requirements.txt

- name: Install dependencies
run: |
pip install tabulate
python3 utils/summarize.py results/ >> $GITHUB_STEP_SUMMARY
pip install -r utils/requirements.txt

- name: Print summary
run: python3 utils/summarize.py results/ >> "$GITHUB_STEP_SUMMARY"

- name: Aggregate results
run: |
pip install tabulate
python3 utils/collect_results.py results/ ${{ inputs.result-prefix || 'all' }}
run: python3 utils/collect_results.py results/ ${{ inputs.result-prefix || 'all' }}

- name: Upload aggregated results
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
Expand Down
24 changes: 21 additions & 3 deletions .github/workflows/e2e-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,19 @@ jobs:
if: ${{ !inputs.ref || inputs.ref == '' }}
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

- name: Set up Python
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: '3.12'
cache: 'pip'
cache-dependency-path: utils/requirements.txt

- name: Install dependencies
run: |
pip install -r utils/requirements.txt

- id: get-jobs
run: |
pip install pydantic
CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py \
${{ inputs.generate-cli-command || github.event.inputs.generate-cli-command }})
SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and not x.get('eval-only', False)]))")
Expand Down Expand Up @@ -196,11 +206,19 @@ jobs:
path: ${{ env.RESULTS_DIR }}
pattern: results_*

- name: Set up Python
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: '3.12'
cache: 'pip'
cache-dependency-path: utils/requirements.txt

- name: Install python dependencies
run: pip install PyGithub
run: |
pip install -r utils/requirements.txt

- name: Calculate success rate
run: python3 utils/calc_success_rate.py $STATS_FILENAME
run: python3 utils/calc_success_rate.py "$STATS_FILENAME"

- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
Expand Down
12 changes: 11 additions & 1 deletion .github/workflows/profile.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,20 @@ jobs:
with:
ref: ${{ inputs.ref || github.ref }}

- name: Set up Python
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: '3.12'
cache: 'pip'
cache-dependency-path: utils/requirements.txt

- name: Install dependencies
run: |
pip install -r utils/requirements.txt

- id: gen
name: Generate matrix via script
run: |
pip install pydantic
CLI_ARGS="test-config --config-files ${{ inputs.config-file }} --config-keys ${{ inputs.config-key }} --conc ${{ inputs.conc }}"
CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py $CLI_ARGS)
echo "raw=$CONFIG_JSON" >> $GITHUB_OUTPUT
Expand Down
39 changes: 33 additions & 6 deletions .github/workflows/run-sweep.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,19 @@ jobs:
with:
fetch-depth: 0

- id: setup
- name: Set up Python
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: '3.12'
cache: 'pip'
cache-dependency-path: utils/requirements.txt

- name: Install dependencies
run: |
pip install pydantic
pip install -r utils/requirements.txt

- id: setup
run: |
if [ "${{ github.event_name }}" == "pull_request" ]; then
BASE_REF="origin/${{ github.base_ref }}"
HEAD_REF="${{ github.event.pull_request.head.sha }}"
Expand Down Expand Up @@ -251,11 +260,20 @@ jobs:
path: ${{ env.RESULTS_DIR }}
pattern: results_*

- name: Set up Python
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: '3.12'
cache: 'pip'
cache-dependency-path: utils/requirements.txt

- name: Install python dependencies
run: pip install PyGithub
run: |
python -m pip install --upgrade pip
pip install -r utils/requirements.txt

- name: Calculate success rate
run: python3 utils/calc_success_rate.py $STATS_FILENAME
run: python3 utils/calc_success_rate.py "$STATS_FILENAME"

- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
Expand Down Expand Up @@ -286,11 +304,20 @@ jobs:
path: results/
pattern: results_bmk

- name: Set up Python
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: '3.12'
cache: 'pip'
cache-dependency-path: utils/requirements.txt

- name: Install dependencies
run: pip install psycopg2-binary tabulate
run: |
python -m pip install --upgrade pip
pip install -r utils/requirements.txt

- name: Compare results against main
run: python3 utils/compare_results.py results/ >> $GITHUB_STEP_SUMMARY
run: python3 utils/compare_results.py results/ >> "$GITHUB_STEP_SUMMARY"

trigger-ingest:
needs:
Expand Down
5 changes: 4 additions & 1 deletion .github/workflows/test-matrix-logic.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ on:
pull_request:
paths:
- 'utils/matrix_logic/**'
- 'utils/requirements.txt'

permissions:
contents: read
Expand All @@ -24,11 +25,13 @@ jobs:
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: '3.12'
cache: 'pip'
cache-dependency-path: utils/requirements.txt

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pytest pydantic pyyaml
pip install -r utils/requirements.txt

- name: test_generate_sweep_configs tests
run: |
Expand Down
5 changes: 4 additions & 1 deletion .github/workflows/test-process-result.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ on:
paths:
- 'utils/process_result.py'
- 'utils/test_process_result.py'
- 'utils/requirements.txt'

permissions:
contents: read
Expand All @@ -24,11 +25,13 @@ jobs:
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: '3.12'
cache: 'pip'
cache-dependency-path: utils/requirements.txt

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pytest
pip install -r utils/requirements.txt

- name: Run pytest
run: |
Expand Down
8 changes: 8 additions & 0 deletions utils/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Direct dependencies for utils/ scripts and CI tests.
#
# The CI workflows install this file with `pip install -r utils/requirements.txt`
# and also pass it to actions/setup-python as `cache-dependency-path`.
#
# Every direct dependency is pinned to an exact version; CI installs them on
# Python 3.12.
# NOTE(review): only direct dependencies are listed. Transitive dependencies
# are resolved by pip at install time, so installs are reproducible only at
# this top level -- confirm whether a constraints/lock file is wanted.
pydantic==2.13.0
PyGithub==2.8.1
psycopg2-binary==2.9.11
pytest==9.0.3
PyYAML==6.0.3
tabulate==0.10.0