From 075b3f57240c58f752ebae6c8ba3ed0fc0d62efd Mon Sep 17 00:00:00 2001 From: William Hill Date: Tue, 24 Feb 2026 13:08:18 -0500 Subject: [PATCH 01/18] docs: design doc for self-service data upload (issue #86) --- .../2026-02-24-self-service-upload-design.md | 170 ++++++++++++++++++ 1 file changed, 170 insertions(+) create mode 100644 docs/plans/2026-02-24-self-service-upload-design.md diff --git a/docs/plans/2026-02-24-self-service-upload-design.md b/docs/plans/2026-02-24-self-service-upload-design.md new file mode 100644 index 0000000..9bf831b --- /dev/null +++ b/docs/plans/2026-02-24-self-service-upload-design.md @@ -0,0 +1,170 @@ +# Design: Self-Service Data Upload (Issue #86) + +**Date:** 2026-02-24 +**Author:** Claude Code + +--- + +## Overview + +Allow admin and IR users to upload institutional data files directly from the dashboard without +needing direct database or server access. Two upload paths: course enrollment CSVs (end-to-end +to Postgres) and PDP cohort/AR files (to Supabase Storage + GitHub Actions ML pipeline trigger). + +--- + +## Scope + +**In scope:** +- Course enrollment CSV → `course_enrollments` Postgres table (upsert) +- PDP Cohort CSV / PDP AR (.xlsx) → Supabase Storage + GitHub Actions `repository_dispatch` +- Preview step (first 10 rows + column validation) before commit +- Role guard: admin and ir only + +**Out of scope:** +- Upload history log (future issue) +- Column remapping UI (columns must match known schema) +- ML experiment tracking / MLflow (future issue) +- Auto-triggering ML pipeline without a server (GitHub Actions is the trigger mechanism) + +--- + +## Pages & Routing + +**New page:** `codebenders-dashboard/app/admin/upload/page.tsx` + +**Role guard:** Add to `lib/roles.ts` `ROUTE_PERMISSIONS`: +```ts +{ prefix: "/admin", roles: ["admin", "ir"] }, +{ prefix: "/api/admin", roles: ["admin", "ir"] }, +``` +Middleware already enforces this pattern via `x-user-role` header — no other auth code needed. 
+
+**Nav link:** Add "Upload Data" to `nav-header.tsx`, visible only to admin/ir roles.
+
+**New API routes:**
+- `POST /api/admin/upload/preview` — parse first 10 rows, return sample + validation summary
+- `POST /api/admin/upload/commit` — full ingest (course → Postgres; PDP/AR → Storage + Actions)
+
+---
+
+## UI Flow (3 States)
+
+### State 1 — Select & Drop
+- Dropdown: file type (`Course Enrollment CSV` | `PDP Cohort CSV` | `PDP AR File (.xlsx)`)
+- Drag-and-drop zone (click to pick; `.csv` for course/cohort, `.csv`+`.xlsx` for AR)
+- "Preview" button → calls `/api/admin/upload/preview`
+
+### State 2 — Preview
+- Shows: detected file type, estimated row count, first 10 rows in a table
+- Validation banner: lists missing required columns or warnings
+- "Confirm & Upload" → calls `/api/admin/upload/commit`
+- "Back" link to return to State 1
+
+### State 3 — Result
+- Course enrollments: `{ inserted, skipped, errors[] }` summary card
+- PDP/AR: "File accepted — ML pipeline queued in GitHub Actions" + link to Actions run
+- "Upload another file" resets to State 1
+
+---
+
+## API Routes
+
+### `POST /api/admin/upload/preview`
+
+**Input:** `multipart/form-data` with `file` and `fileType` fields
+
+**Logic:**
+1. Parse first 50 rows with `csv-parse` (CSV) or `xlsx` (Excel)
+2. Validate required columns exist for the given `fileType`
+3. Return `{ columns, sampleRows (first 10), rowCount (estimated), warnings[] }`
+
+### `POST /api/admin/upload/commit`
+
+**Input:** Same multipart form
+
+**Course enrollment path:**
+1. Stream-parse full CSV with `csv-parse` async iterator
+2. Batch-upsert 500 rows at a time into `course_enrollments` via `pg`
+3. Conflict target: `(student_guid, course_prefix, course_number, academic_term)`
+4. Return `{ inserted, skipped, errors[] }`
+
+**PDP/AR path:**
+1. Upload file to Supabase Storage bucket `pdp-uploads` via `@supabase/supabase-js`
+2. Call GitHub API `POST /repos/{owner}/{repo}/dispatches` with:
+   ```json
+   { "event_type": "ml-pipeline", "client_payload": { "file_path": "<storage key>" } }
+   ```
+3. Return `{ status: "processing", actionsUrl: "https://github.com/{owner}/{repo}/actions" }`
+
+**Role enforcement:** Read `x-user-role` header (set by middleware); return 403 if not admin/ir.
+
+---
+
+## GitHub Actions Workflow
+
+**File:** `.github/workflows/ml-pipeline.yml`
+
+**Trigger:** `repository_dispatch` with `event_type: ml-pipeline`
+
+**Steps:**
+1. Checkout repo
+2. Set up Python with `venv`
+3. Install dependencies (`pip install -r requirements.txt`)
+4. Download uploaded file from Supabase Storage using `SUPABASE_SERVICE_ROLE_KEY` secret
+5. Run `venv/bin/python ai_model/complete_ml_pipeline.py --input <downloaded file path>`
+6. Upload `ML_PIPELINE_REPORT.txt` as a GitHub Actions artifact (retained 90 days)
+
+**Required secrets:** `SUPABASE_URL`, `SUPABASE_SERVICE_ROLE_KEY`, `GITHUB_TOKEN` (auto-provided)
+
+---
+
+## Required Column Schemas
+
+### Course Enrollment CSV
+Must include: `student_guid`, `course_prefix`, `course_number`, `academic_year`, `academic_term`
+Optional (all other `course_enrollments` columns): filled as NULL if absent
+
+### PDP Cohort CSV
+Must include: `Institution_ID`, `Cohort`, `Student_GUID`, `Cohort_Term`
+
+### PDP AR File (.xlsx)
+Must include: `Institution_ID`, `Cohort`, `Student_GUID` (first sheet parsed)
+
+---
+
+## New Packages
+
+| Package | Purpose |
+|---------|---------|
+| `csv-parse` | Streaming CSV parsing (async iterator mode) |
+| `xlsx` | Excel (.xlsx) parsing |
+
+---
+
+## New Files
+
+| File | Purpose |
+|------|---------|
+| `codebenders-dashboard/app/admin/upload/page.tsx` | Upload UI page |
+| `codebenders-dashboard/app/api/admin/upload/preview/route.ts` | Preview API route |
+| `codebenders-dashboard/app/api/admin/upload/commit/route.ts` | Commit API route |
+| `.github/workflows/ml-pipeline.yml` | GitHub Actions ML pipeline trigger |
+
+---
+
+## Supabase Changes
+
+**Storage bucket:** Create `pdp-uploads` bucket (private, authenticated access only). +No new database migrations required — `course_enrollments` table already exists. + +**Bucket policy:** Only service role key can read/write. Signed URLs used for pipeline download. + +--- + +## Constraints & Known Limitations + +- ML pipeline trigger via GitHub Actions means a ~30-60s delay before the pipeline starts +- Vercel free tier has a 4.5 MB request body limit — large files should use Supabase Storage direct upload in a future iteration +- No upload history log in this version (deferred) +- Column remapping is out of scope — files must match the known schema From 184202eed8002761efb8416e0a3e16c8b0508733 Mon Sep 17 00:00:00 2001 From: William Hill Date: Tue, 24 Feb 2026 13:12:15 -0500 Subject: [PATCH 02/18] docs: implementation plan for self-service data upload (issue #86) --- docs/plans/2026-02-24-self-service-upload.md | 1135 ++++++++++++++++++ 1 file changed, 1135 insertions(+) create mode 100644 docs/plans/2026-02-24-self-service-upload.md diff --git a/docs/plans/2026-02-24-self-service-upload.md b/docs/plans/2026-02-24-self-service-upload.md new file mode 100644 index 0000000..2c34769 --- /dev/null +++ b/docs/plans/2026-02-24-self-service-upload.md @@ -0,0 +1,1135 @@ +# Self-Service Data Upload Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Add a `/admin/upload` page (admin/ir only) for uploading course enrollment CSVs directly into Postgres, and PDP cohort/AR files into Supabase Storage with automatic GitHub Actions ML pipeline triggering. + +**Architecture:** Single unified upload page with a 3-state UI (select → preview → result). Two API routes: `/api/admin/upload/preview` (parse first 10 rows, validate columns) and `/api/admin/upload/commit` (course CSV → Postgres batch-upsert; PDP/AR → Supabase Storage + `repository_dispatch` to GitHub Actions). 
No new DB migrations needed — `course_enrollments` table already exists. + +**Tech Stack:** Next.js 16 App Router, `csv-parse` (streaming CSV), `xlsx` (Excel), `@supabase/supabase-js` (Storage), `pg` (Postgres upsert), GitHub REST API (`repository_dispatch`), TypeScript, Tailwind CSS, shadcn/ui + +--- + +## Task 1: Install `csv-parse` and `xlsx` packages + +**Files:** +- Modify: `codebenders-dashboard/package.json` (via npm install) + +**Step 1: Install packages** + +```bash +cd codebenders-dashboard && npm install csv-parse xlsx +``` + +**Step 2: Verify they appear in `package.json` dependencies** + +```bash +grep -E '"csv-parse"|"xlsx"' package.json +``` + +Expected output: +``` + "csv-parse": "^5.x.x", + "xlsx": "^0.x.x", +``` + +**Step 3: Commit** + +```bash +git add codebenders-dashboard/package.json codebenders-dashboard/package-lock.json +git commit -m "chore: add csv-parse and xlsx packages for file upload" +``` + +--- + +## Task 2: Add role permissions and nav link + +**Files:** +- Modify: `codebenders-dashboard/lib/roles.ts:6-13` +- Modify: `codebenders-dashboard/components/nav-header.tsx:15-20` + +**Step 1: Add `/admin` routes to `ROUTE_PERMISSIONS` in `lib/roles.ts`** + +Open `codebenders-dashboard/lib/roles.ts`. After line 13 (`{ prefix: "/api/query-history/export", ... 
}`), add two new entries so the array looks like: + +```ts +export const ROUTE_PERMISSIONS: Array<{ prefix: string; roles: Role[] }> = [ + { prefix: "/students", roles: ["admin", "advisor", "ir"] }, + { prefix: "/courses", roles: ["admin", "advisor", "ir", "faculty"] }, + { prefix: "/query", roles: ["admin", "advisor", "ir", "faculty"] }, + { prefix: "/api/students", roles: ["admin", "advisor", "ir"] }, + { prefix: "/api/courses", roles: ["admin", "advisor", "ir", "faculty"] }, + { prefix: "/api/query-summary", roles: ["admin", "advisor", "ir", "faculty"] }, + { prefix: "/api/query-history/export", roles: ["admin", "ir"] }, + { prefix: "/admin", roles: ["admin", "ir"] }, + { prefix: "/api/admin", roles: ["admin", "ir"] }, +] +``` + +**Step 2: Add "Upload Data" nav link in `nav-header.tsx`** + +The `NavHeader` component already receives `role` as a prop. Replace the `NAV_LINKS` constant and its usage so the Upload link only renders for admin/ir: + +```tsx +const NAV_LINKS = [ + { href: "/", label: "Dashboard", roles: null }, + { href: "/courses", label: "Courses", roles: null }, + { href: "/students", label: "Students", roles: null }, + { href: "/query", label: "Query", roles: null }, + { href: "/admin/upload", label: "Upload Data", roles: ["admin", "ir"] as Role[] }, +] +``` + +Then update the `nav` block to filter on role: + +```tsx + +``` + +**Step 3: Type-check** + +```bash +cd codebenders-dashboard && npx tsc --noEmit +``` + +Expected: no errors. 
+ +**Step 4: Commit** + +```bash +git add codebenders-dashboard/lib/roles.ts codebenders-dashboard/components/nav-header.tsx +git commit -m "feat: add admin/ir role permissions and Upload Data nav link" +``` + +--- + +## Task 3: Add environment variables + +**Files:** +- Modify: `codebenders-dashboard/env.example` + +**Step 1: Add new env vars to `env.example`** + +Append to the bottom of `codebenders-dashboard/env.example`: + +```bash +# Supabase Storage (for PDP/AR file uploads — use the service role key, not anon) +# Find in Supabase → Project Settings → API → service_role key +SUPABASE_SERVICE_ROLE_KEY=your-service-role-key-here + +# GitHub Actions ML pipeline trigger +# Create a PAT at GitHub → Settings → Developer settings → Personal access tokens +# Required scope: repo (to trigger repository_dispatch) +GITHUB_PAT=ghp_your-personal-access-token-here +# Full repo path: owner/repo +GITHUB_REPO=devcolor/codebenders-datathon +``` + +**Step 2: Add the same vars to your local `.env.local`** + +Copy the three vars above into `codebenders-dashboard/.env.local` with real values. + +**Step 3: Commit** + +```bash +git add codebenders-dashboard/env.example +git commit -m "docs: add env vars for Supabase Storage and GitHub Actions pipeline trigger" +``` + +--- + +## Task 4: Create the preview API route + +**Files:** +- Create: `codebenders-dashboard/app/api/admin/upload/preview/route.ts` + +**Background:** This route accepts a `multipart/form-data` POST with two fields: +- `file` — the uploaded file (File object) +- `fileType` — one of `"course_enrollment"`, `"pdp_cohort"`, `"pdp_ar"` + +It parses the first 50 rows (or all rows if fewer), validates that required columns are present, and returns a preview payload. For `.xlsx` files, it reads the first sheet. For CSV, it uses `csv-parse`. 
+
+**Required columns per file type:**
+- `course_enrollment`: `Student_GUID`, `Course_Prefix`, `Course_Number`, `Academic_Year`, `Academic_Term`
+- `pdp_cohort`: `Institution_ID`, `Cohort`, `Student_GUID`, `Cohort_Term`
+- `pdp_ar`: `Institution_ID`, `Cohort`, `Student_GUID`
+
+**Step 1: Create the route file**
+
+Create `codebenders-dashboard/app/api/admin/upload/preview/route.ts` with this content:
+
+```typescript
+import { type NextRequest, NextResponse } from "next/server"
+import { parse } from "csv-parse/sync"
+import * as XLSX from "xlsx"
+
+const REQUIRED_COLUMNS: Record<string, string[]> = {
+  course_enrollment: ["Student_GUID", "Course_Prefix", "Course_Number", "Academic_Year", "Academic_Term"],
+  pdp_cohort: ["Institution_ID", "Cohort", "Student_GUID", "Cohort_Term"],
+  pdp_ar: ["Institution_ID", "Cohort", "Student_GUID"],
+}
+
+export async function POST(request: NextRequest) {
+  const role = request.headers.get("x-user-role")
+  if (role !== "admin" && role !== "ir") {
+    return NextResponse.json({ error: "Forbidden" }, { status: 403 })
+  }
+
+  let formData: FormData
+  try {
+    formData = await request.formData()
+  } catch {
+    return NextResponse.json({ error: "Invalid multipart form data" }, { status: 400 })
+  }
+
+  const file = formData.get("file") as File | null
+  const fileType = formData.get("fileType") as string | null
+
+  if (!file || !fileType) {
+    return NextResponse.json({ error: "Missing file or fileType" }, { status: 400 })
+  }
+  if (!REQUIRED_COLUMNS[fileType]) {
+    return NextResponse.json({ error: `Unknown fileType: ${fileType}` }, { status: 400 })
+  }
+
+  let rows: Record<string, string>[]
+
+  try {
+    const arrayBuf = await file.arrayBuffer()
+    const buffer = Buffer.from(arrayBuf)
+
+    if (file.name.endsWith(".xlsx")) {
+      const wb = XLSX.read(buffer, { type: "buffer" })
+      const ws = wb.Sheets[wb.SheetNames[0]]
+      rows = XLSX.utils.sheet_to_json<Record<string, string>>(ws, { defval: "" })
+    } else {
+      rows = parse(buffer, {
+        columns: true,
+        skip_empty_lines: true,
+        to: 50,
+        cast: false,
+      }) as Record<string, string>[]
+    }
+  } catch (err) {
+    return NextResponse.json(
+      { error: "Failed to parse file", details: err instanceof Error ? err.message : String(err) },
+      { status: 400 }
+    )
+  }
+
+  if (rows.length === 0) {
+    return NextResponse.json({ error: "File is empty" }, { status: 400 })
+  }
+
+  const columns = Object.keys(rows[0])
+  const required = REQUIRED_COLUMNS[fileType]
+  const missing = required.filter(col => !columns.includes(col))
+
+  const warnings: string[] = []
+  if (missing.length > 0) {
+    warnings.push(`Missing required columns: ${missing.join(", ")}`)
+  }
+
+  return NextResponse.json({
+    columns,
+    sampleRows: rows.slice(0, 10),
+    rowCount: rows.length, // rows parsed for preview (CSV is capped at 50; .xlsx reads the full first sheet)
+    warnings,
+  })
+}
+```
+
+**Step 2: Type-check**
+
+```bash
+cd codebenders-dashboard && npx tsc --noEmit
+```
+
+Expected: no errors.
+
+**Step 3: Smoke-test with curl** (while `npm run dev` is running)
+
+```bash
+curl -s -X POST http://localhost:3000/api/admin/upload/preview \
+  -H "x-user-role: admin" \
+  -F "fileType=course_enrollment" \
+  -F "file=@../data/bishop_state_courses.csv" | jq '{columns: .columns[:3], rowCount: .rowCount, warnings: .warnings}'
+```
+
+Expected: JSON with `columns` array, `rowCount: 50`, `warnings: []`
+
+**Step 4: Commit**
+
+```bash
+git add codebenders-dashboard/app/api/admin/upload/preview/route.ts
+git commit -m "feat: add POST /api/admin/upload/preview route"
+```
+
+---
+
+## Task 5: Create the commit route — course enrollment path
+
+**Files:**
+- Create: `codebenders-dashboard/app/api/admin/upload/commit/route.ts`
+
+**Background:** For `course_enrollment` file type, stream-parse the full CSV and batch-upsert rows into `public.course_enrollments` in chunks of 500. Use `pg`'s `getPool()` (already available in `lib/db.ts`). The upsert conflict target is `(student_guid, course_prefix, course_number, academic_term)` — you'll need to add a unique constraint migration (none exists yet) or use a simpler strategy.
+
+Since the existing load script uses TRUNCATE (not upsert), and there's no unique index on `course_enrollments`, we'll use the same approach here: truncate + re-insert. This is idempotent and matches the existing pattern.
+
+**Column mapping** from CSV header names → DB column names (matches the existing load script at `scripts/load-course-enrollments.ts`):
+
+| CSV header | DB column |
+|---|---|
+| Student_GUID | student_guid |
+| Cohort | cohort |
+| Cohort_Term | cohort_term |
+| Academic_Year | academic_year |
+| Academic_Term | academic_term |
+| Course_Prefix | course_prefix |
+| Course_Number | course_number |
+| Course_Name | course_name |
+| Course_CIP | course_cip |
+| Course_Type | course_type |
+| Math_or_English_Gateway | gateway_type |
+| Co_requisite_Course | is_co_requisite (Y/N → boolean) |
+| Core_Course | is_core_course (Y/N → boolean) |
+| Core_Course_Type | core_course_type |
+| Delivery_Method | delivery_method |
+| Grade | grade |
+| Number_of_Credits_Attempted | credits_attempted |
+| Number_of_Credits_Earned | credits_earned |
+| Course_Instructor_Employment_Status | instructor_status |
+
+**Step 1: Create the commit route file (course enrollment path only)**
+
+Create `codebenders-dashboard/app/api/admin/upload/commit/route.ts`:
+
+```typescript
+import { type NextRequest, NextResponse } from "next/server"
+import { parse } from "csv-parse"
+import { Readable } from "stream"
+import { getPool } from "@/lib/db"
+
+const BATCH_SIZE = 500
+
+function toBoolean(val: string): boolean | null {
+  if (val === "Y") return true
+  if (val === "N") return false
+  return null
+}
+
+function toNumeric(val: string): number | null {
+  const t = val.trim()
+  if (!t || t === "null" || t === "NULL") return null
+  const n = parseFloat(t)
+  return isNaN(n) ? null : n
+}
+
+function toNullable(val: string): string | null {
+  const t = val.trim()
+  return t === "" ? null : t
+}
+
+interface EnrollmentRow {
+  student_guid: string
+  cohort: string | null
+  cohort_term: string | null
+  academic_year: string | null
+  academic_term: string | null
+  course_prefix: string | null
+  course_number: string | null
+  course_name: string | null
+  course_cip: string | null
+  course_type: string | null
+  gateway_type: string | null
+  is_co_requisite: boolean | null
+  is_core_course: boolean | null
+  core_course_type: string | null
+  delivery_method: string | null
+  grade: string | null
+  credits_attempted: number | null
+  credits_earned: number | null
+  instructor_status: string | null
+}
+
+const COLS = [
+  "student_guid", "cohort", "cohort_term", "academic_year", "academic_term",
+  "course_prefix", "course_number", "course_name", "course_cip", "course_type",
+  "gateway_type", "is_co_requisite", "is_core_course", "core_course_type",
+  "delivery_method", "grade", "credits_attempted", "credits_earned", "instructor_status",
+] as const
+
+async function insertBatch(client: import("pg").PoolClient, batch: EnrollmentRow[]): Promise<void> {
+  if (batch.length === 0) return
+  const placeholders: string[] = []
+  const params: unknown[] = []
+  batch.forEach((row, ri) => {
+    const p = COLS.map((_, ci) => `$${ri * COLS.length + ci + 1}`).join(", ")
+    placeholders.push(`(${p})`)
+    COLS.forEach(col => params.push(row[col]))
+  })
+  await client.query(
+    `INSERT INTO public.course_enrollments (${COLS.join(", ")}) VALUES ${placeholders.join(", ")}`,
+    params
+  )
+}
+
+async function processCourseEnrollment(buffer: Buffer): Promise<{ inserted: number; skipped: number; errors: string[] }> {
+  const pool = getPool()
+  const client = await pool.connect()
+  let inserted = 0
+  let skipped = 0
+  const errors: string[] = []
+
+  try {
+    await client.query("BEGIN")
+    await client.query("TRUNCATE TABLE public.course_enrollments RESTART IDENTITY")
+
+    const parser = Readable.from(buffer).pipe(
+      parse({ columns: true, skip_empty_lines: true })
+    )
+
+    let batch: EnrollmentRow[] = []
+
+    for await (const record of parser) {
+      const r = record as Record<string, string>
+      const student_guid = toNullable(r["Student_GUID"] ?? "")
+      if (!student_guid) {
+        skipped++
+        continue
+      }
+      batch.push({
+        student_guid,
+        cohort: toNullable(r["Cohort"] ?? ""),
+        cohort_term: toNullable(r["Cohort_Term"] ?? ""),
+        academic_year: toNullable(r["Academic_Year"] ?? ""),
+        academic_term: toNullable(r["Academic_Term"] ?? ""),
+        course_prefix: toNullable(r["Course_Prefix"] ?? ""),
+        course_number: toNullable(r["Course_Number"] ?? ""),
+        course_name: toNullable(r["Course_Name"] ?? ""),
+        course_cip: toNullable(r["Course_CIP"] ?? ""),
+        course_type: toNullable(r["Course_Type"] ?? ""),
+        gateway_type: toNullable(r["Math_or_English_Gateway"] ?? ""),
+        is_co_requisite: toBoolean(r["Co_requisite_Course"] ?? ""),
+        is_core_course: toBoolean(r["Core_Course"] ?? ""),
+        core_course_type: toNullable(r["Core_Course_Type"] ?? ""),
+        delivery_method: toNullable(r["Delivery_Method"] ?? ""),
+        grade: toNullable(r["Grade"] ?? ""),
+        credits_attempted: toNumeric(r["Number_of_Credits_Attempted"] ?? ""),
+        credits_earned: toNumeric(r["Number_of_Credits_Earned"] ?? ""),
+        instructor_status: toNullable(r["Course_Instructor_Employment_Status"] ?? ""),
+      })
+      inserted++
+      if (batch.length >= BATCH_SIZE) {
+        await insertBatch(client, batch)
+        batch = []
+      }
+    }
+
+    if (batch.length > 0) await insertBatch(client, batch)
+    await client.query("COMMIT")
+  } catch (err) {
+    await client.query("ROLLBACK")
+    errors.push(err instanceof Error ? err.message : String(err))
+    inserted = 0
+  } finally {
+    client.release()
+  }
+
+  return { inserted, skipped, errors }
+}
+
+export async function POST(request: NextRequest) {
+  const role = request.headers.get("x-user-role")
+  if (role !== "admin" && role !== "ir") {
+    return NextResponse.json({ error: "Forbidden" }, { status: 403 })
+  }
+
+  let formData: FormData
+  try {
+    formData = await request.formData()
+  } catch {
+    return NextResponse.json({ error: "Invalid multipart form data" }, { status: 400 })
+  }
+
+  const file = formData.get("file") as File | null
+  const fileType = formData.get("fileType") as string | null
+
+  if (!file || !fileType) {
+    return NextResponse.json({ error: "Missing file or fileType" }, { status: 400 })
+  }
+
+  const buffer = Buffer.from(await file.arrayBuffer())
+
+  if (fileType === "course_enrollment") {
+    const result = await processCourseEnrollment(buffer)
+    return NextResponse.json(result)
+  }
+
+  // PDP/AR path — placeholder, implemented in Task 6
+  return NextResponse.json({ error: `fileType "${fileType}" not yet implemented` }, { status: 501 })
+}
+```
+
+**Step 2: Type-check**
+
+```bash
+cd codebenders-dashboard && npx tsc --noEmit
+```
+
+Expected: no errors.
+
+**Step 3: Smoke-test with curl** (while `npm run dev` is running)
+
+```bash
+curl -s -X POST http://localhost:3000/api/admin/upload/commit \
+  -H "x-user-role: admin" \
+  -F "fileType=course_enrollment" \
+  -F "file=@../data/bishop_state_courses.csv" | jq .
+```
+
+Expected: `{"inserted": <row count>, "skipped": 0, "errors": []}`
+
+**Step 4: Commit**
+
+```bash
+git add codebenders-dashboard/app/api/admin/upload/commit/route.ts
+git commit -m "feat: add POST /api/admin/upload/commit — course enrollment ingest path"
+```
+
+---
+
+## Task 6: Extend commit route — PDP/AR path (Supabase Storage + GitHub dispatch)
+
+**Files:**
+- Modify: `codebenders-dashboard/app/api/admin/upload/commit/route.ts`
+
+**Background:** For `pdp_cohort` and `pdp_ar` file types, the commit route:
+1. Creates a Supabase service-role client (uses `SUPABASE_SERVICE_ROLE_KEY`)
+2. Uploads the file to the `pdp-uploads` Storage bucket with path `<fileType>/<timestamp>-<fileName>`
+3. Calls the GitHub `repository_dispatch` API with `GITHUB_PAT` and `GITHUB_REPO` env vars
+4. Returns `{ status: "processing", storageKey, actionsUrl }`
+
+**Before this task:** Create the `pdp-uploads` bucket in your Supabase dashboard:
+- Supabase → Storage → New bucket → name: `pdp-uploads` → Private
+
+**Step 1: Add the PDP/AR handler to the commit route**
+
+In `codebenders-dashboard/app/api/admin/upload/commit/route.ts`, add these imports at the top:
+
+```typescript
+import { createClient } from "@supabase/supabase-js"
+```
+
+Add this function before the `POST` handler:
+
+```typescript
+async function processPdpFile(
+  buffer: Buffer,
+  fileName: string,
+  fileType: string,
+): Promise<{ status: string; storageKey: string; actionsUrl: string }> {
+  const supabaseUrl = process.env.NEXT_PUBLIC_SUPABASE_URL
+  const serviceKey = process.env.SUPABASE_SERVICE_ROLE_KEY
+  const githubPat = process.env.GITHUB_PAT
+  const githubRepo = process.env.GITHUB_REPO
+
+  if (!supabaseUrl || !serviceKey) throw new Error("Missing SUPABASE_SERVICE_ROLE_KEY")
+  if (!githubPat || !githubRepo) throw new Error("Missing GITHUB_PAT or GITHUB_REPO")
+
+  // 1. Upload to Supabase Storage
+  const supabase = createClient(supabaseUrl, serviceKey)
+  const storageKey = `${fileType}/${Date.now()}-${fileName}`
+  const { error: uploadError } = await supabase.storage
+    .from("pdp-uploads")
+    .upload(storageKey, buffer, { contentType: "application/octet-stream", upsert: false })
+
+  if (uploadError) throw new Error(`Storage upload failed: ${uploadError.message}`)
+
+  // 2. Trigger GitHub Actions via repository_dispatch
+  const dispatchRes = await fetch(
+    `https://api.github.com/repos/${githubRepo}/dispatches`,
+    {
+      method: "POST",
+      headers: {
+        Authorization: `Bearer ${githubPat}`,
+        Accept: "application/vnd.github+json",
+        "X-GitHub-Api-Version": "2022-11-28",
+        "Content-Type": "application/json",
+      },
+      body: JSON.stringify({
+        event_type: "ml-pipeline",
+        client_payload: { storage_key: storageKey, file_type: fileType },
+      }),
+    }
+  )
+
+  if (!dispatchRes.ok) {
+    const body = await dispatchRes.text()
+    throw new Error(`GitHub dispatch failed (${dispatchRes.status}): ${body}`)
+  }
+
+  const actionsUrl = `https://github.com/${githubRepo}/actions`
+  return { status: "processing", storageKey, actionsUrl }
+}
+```
+
+Replace the placeholder in the `POST` handler at the bottom:
+
+```typescript
+  if (fileType === "pdp_cohort" || fileType === "pdp_ar") {
+    try {
+      const result = await processPdpFile(buffer, file.name, fileType)
+      return NextResponse.json(result)
+    } catch (err) {
+      return NextResponse.json(
+        { error: err instanceof Error ? err.message : String(err) },
+        { status: 500 }
+      )
+    }
+  }
+
+  return NextResponse.json({ error: `Unknown fileType: ${fileType}` }, { status: 400 })
+```
+
+**Step 2: Type-check**
+
+```bash
+cd codebenders-dashboard && npx tsc --noEmit
+```
+
+Expected: no errors.
+ +**Step 3: Commit** + +```bash +git add codebenders-dashboard/app/api/admin/upload/commit/route.ts +git commit -m "feat: extend commit route with PDP/AR → Supabase Storage + GitHub Actions dispatch" +``` + +--- + +## Task 7: Create GitHub Actions ML pipeline workflow + +**Files:** +- Create: `.github/workflows/ml-pipeline.yml` + +**Background:** This workflow fires on `repository_dispatch` with `event_type: ml-pipeline`. It: +1. Downloads the uploaded file from Supabase Storage using a signed URL +2. Determines the target data file path from `file_type` in the payload +3. Replaces the appropriate file in `data/` with the uploaded one +4. Runs the Python ML pipeline +5. Uploads `ML_PIPELINE_REPORT.txt` as an artifact + +**Required GitHub Actions secrets** (set at repo level: Settings → Secrets → Actions): +- `SUPABASE_URL` — your Supabase project URL +- `SUPABASE_SERVICE_ROLE_KEY` — service role key for Storage access +- `DB_HOST`, `DB_USER`, `DB_PASSWORD`, `DB_PORT`, `DB_NAME`, `DB_SSL` — Postgres credentials + +**Step 1: Create the workflow file** + +Create `.github/workflows/ml-pipeline.yml`: + +```yaml +name: ML Pipeline + +on: + repository_dispatch: + types: [ml-pipeline] + +jobs: + run-pipeline: + name: Download data file and run ML pipeline + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python 3.11 + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Create virtualenv and install dependencies + run: | + python -m venv venv + venv/bin/pip install --upgrade pip + venv/bin/pip install -r requirements.txt + + - name: Download uploaded file from Supabase Storage + env: + SUPABASE_URL: ${{ secrets.SUPABASE_URL }} + SUPABASE_SERVICE_ROLE_KEY: ${{ secrets.SUPABASE_SERVICE_ROLE_KEY }} + STORAGE_KEY: ${{ github.event.client_payload.storage_key }} + FILE_TYPE: ${{ github.event.client_payload.file_type }} + run: | + python - <<'EOF' + import os, urllib.request, json + + url = os.environ["SUPABASE_URL"] + key 
= os.environ["SUPABASE_SERVICE_ROLE_KEY"] + storage_key = os.environ["STORAGE_KEY"] + file_type = os.environ["FILE_TYPE"] + + # Get a signed download URL via Supabase Storage REST API + sign_url = f"{url}/storage/v1/object/sign/pdp-uploads/{storage_key}" + req = urllib.request.Request( + sign_url, + data=json.dumps({"expiresIn": 600}).encode(), + headers={ + "Authorization": f"Bearer {key}", + "Content-Type": "application/json", + "apikey": key, + }, + method="POST", + ) + with urllib.request.urlopen(req) as resp: + signed = json.loads(resp.read()) + signed_url = f"{url}/storage/v1{signed['signedURL']}" + + # Determine destination path + dest = { + "pdp_cohort": "data/bishop_state_cohorts_with_zip.csv", + "pdp_ar": "data/ar_bscc_with_zip.csv", + }.get(file_type) + if not dest: + raise ValueError(f"Unknown file_type: {file_type}") + + print(f"Downloading to {dest}...") + urllib.request.urlretrieve(signed_url, dest) + print("Download complete.") + EOF + + - name: Run ML pipeline + env: + DB_HOST: ${{ secrets.DB_HOST }} + DB_USER: ${{ secrets.DB_USER }} + DB_PASSWORD: ${{ secrets.DB_PASSWORD }} + DB_PORT: ${{ secrets.DB_PORT }} + DB_NAME: ${{ secrets.DB_NAME }} + DB_SSL: ${{ secrets.DB_SSL }} + run: | + venv/bin/python ai_model/complete_ml_pipeline.py + + - name: Upload ML pipeline report + uses: actions/upload-artifact@v4 + if: always() + with: + name: ml-pipeline-report-${{ github.run_id }} + path: ML_PIPELINE_REPORT.txt + retention-days: 90 +``` + +**Step 2: Commit** + +```bash +git add .github/workflows/ml-pipeline.yml +git commit -m "feat: add GitHub Actions ML pipeline workflow triggered by repository_dispatch" +``` + +--- + +## Task 8: Create the upload page UI + +**Files:** +- Create: `codebenders-dashboard/app/admin/upload/page.tsx` + +**Background:** This is a client component (`"use client"`) with three local state phases: `idle` (file selection), `preview` (showing sample rows + warnings), and `result` (showing outcome). 
It uses `fetch` to call the two API routes. Drag-and-drop is implemented with native HTML5 `onDrop` / `onDragOver` events. + +**Step 1: Create the page file** + +Create `codebenders-dashboard/app/admin/upload/page.tsx`: + +```tsx +"use client" + +import { useState, useCallback } from "react" +import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card" +import { Button } from "@/components/ui/button" +import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui/table" +import { Upload, AlertCircle, CheckCircle2, Loader2 } from "lucide-react" + +type FileType = "course_enrollment" | "pdp_cohort" | "pdp_ar" +type Phase = "idle" | "previewing" | "preview" | "committing" | "result" + +interface PreviewData { + columns: string[] + sampleRows: Record[] + rowCount: number + warnings: string[] +} + +interface CommitResult { + // Course enrollment + inserted?: number + skipped?: number + errors?: string[] + // PDP/AR + status?: string + storageKey?: string + actionsUrl?: string + error?: string +} + +const FILE_TYPE_LABELS: Record = { + course_enrollment: "Course Enrollment CSV", + pdp_cohort: "PDP Cohort CSV", + pdp_ar: "PDP AR File (.xlsx)", +} + +const FILE_TYPE_ACCEPT: Record = { + course_enrollment: ".csv", + pdp_cohort: ".csv", + pdp_ar: ".csv,.xlsx", +} + +export default function UploadPage() { + const [fileType, setFileType] = useState("course_enrollment") + const [file, setFile] = useState(null) + const [phase, setPhase] = useState("idle") + const [preview, setPreview] = useState(null) + const [result, setResult] = useState(null) + const [dragOver, setDragOver] = useState(false) + const [errorMsg, setErrorMsg] = useState(null) + + const handleFile = useCallback((f: File) => { + setFile(f) + setErrorMsg(null) + setPhase("idle") + setPreview(null) + setResult(null) + }, []) + + const handleDrop = useCallback((e: React.DragEvent) => { + e.preventDefault() + setDragOver(false) + const dropped = 
e.dataTransfer.files[0] + if (dropped) handleFile(dropped) + }, [handleFile]) + + const handlePreview = async () => { + if (!file) return + setPhase("previewing") + setErrorMsg(null) + const fd = new FormData() + fd.append("file", file) + fd.append("fileType", fileType) + try { + const res = await fetch("/api/admin/upload/preview", { method: "POST", body: fd }) + const data = await res.json() + if (!res.ok) { setErrorMsg(data.error ?? "Preview failed"); setPhase("idle"); return } + setPreview(data as PreviewData) + setPhase("preview") + } catch (err) { + setErrorMsg(err instanceof Error ? err.message : "Network error") + setPhase("idle") + } + } + + const handleCommit = async () => { + if (!file) return + setPhase("committing") + setErrorMsg(null) + const fd = new FormData() + fd.append("file", file) + fd.append("fileType", fileType) + try { + const res = await fetch("/api/admin/upload/commit", { method: "POST", body: fd }) + const data = await res.json() + if (!res.ok) { setErrorMsg(data.error ?? "Upload failed"); setPhase("preview"); return } + setResult(data as CommitResult) + setPhase("result") + } catch (err) { + setErrorMsg(err instanceof Error ? err.message : "Network error") + setPhase("preview") + } + } + + const reset = () => { + setFile(null) + setPhase("idle") + setPreview(null) + setResult(null) + setErrorMsg(null) + } + + return ( +
+    <div className="container mx-auto max-w-4xl space-y-6 p-6">
+      <div>
+        <h1 className="flex items-center gap-2 text-2xl font-bold">
+          <Upload className="h-6 w-6" />
+          Upload Data
+        </h1>
+        <p className="text-muted-foreground">
+          Import course enrollment CSVs or PDP/AR files. Admin and IR only.
+        </p>
+      </div>
+
+      {/* ── Phase: idle / selecting ── */}
+      {(phase === "idle" || phase === "previewing") && (
+        <Card>
+          <CardHeader>
+            <CardTitle>Select File</CardTitle>
+            <CardDescription>Choose a file type, then drop or pick your file.</CardDescription>
+          </CardHeader>
+          <CardContent className="space-y-4">
+            {/* File type selector */}
+            <div className="flex gap-2">
+              {(Object.keys(FILE_TYPE_LABELS) as FileType[]).map(ft => (
+                <Button
+                  key={ft}
+                  variant={fileType === ft ? "default" : "outline"}
+                  onClick={() => setFileType(ft)}
+                >
+                  {FILE_TYPE_LABELS[ft]}
+                </Button>
+              ))}
+            </div>
+
+            {/* Drop zone */}
+            <div
+              className={`rounded-lg border-2 border-dashed p-8 text-center ${dragOver ? "border-primary bg-muted" : "border-muted"}`}
+              onDrop={handleDrop}
+              onDragOver={e => { e.preventDefault(); setDragOver(true) }}
+              onDragLeave={() => setDragOver(false)}
+            >
+              <input
+                type="file"
+                accept={FILE_TYPE_ACCEPT[fileType]}
+                onChange={e => { const f = e.target.files?.[0]; if (f) handleFile(f) }}
+              />
+              <p>{file ? file.name : "Drag and drop a file here, or click to pick one."}</p>
+            </div>
+
+            {errorMsg && (
+              <div className="flex items-center gap-2 text-destructive">
+                <AlertCircle className="h-4 w-4" />
+                {errorMsg}
+              </div>
+            )}
+
+            <Button onClick={handlePreview} disabled={!file || phase === "previewing"}>
+              {phase === "previewing" && <Loader2 className="mr-2 h-4 w-4 animate-spin" />}
+              Preview
+            </Button>
+          </CardContent>
+        </Card>
+      )}
+
+      {/* ── Phase: preview ── */}
+      {(phase === "preview" || phase === "committing") && preview && (
+        <Card>
+          <CardHeader>
+            <CardTitle>Preview — {FILE_TYPE_LABELS[fileType]}</CardTitle>
+            <CardDescription>
+              {file?.name} · {preview.rowCount} rows parsed
+            </CardDescription>
+          </CardHeader>
+          <CardContent className="space-y-4">
+            {preview.warnings.length > 0 && (
+              <div className="space-y-1">
+                {preview.warnings.map((w, i) => (
+                  <div key={i} className="flex items-center gap-2 text-amber-600">
+                    <AlertCircle className="h-4 w-4" />
+                    {w}
+                  </div>
+                ))}
+              </div>
+            )}
+
+            <div className="overflow-x-auto">
+              <Table>
+                <TableHeader>
+                  <TableRow>
+                    {preview.columns.slice(0, 8).map(col => (
+                      <TableHead key={col}>{col}</TableHead>
+                    ))}
+                    {preview.columns.length > 8 && <TableHead>+{preview.columns.length - 8} more</TableHead>}
+                  </TableRow>
+                </TableHeader>
+                <TableBody>
+                  {preview.sampleRows.map((row, i) => (
+                    <TableRow key={i}>
+                      {preview.columns.slice(0, 8).map(col => (
+                        <TableCell key={col}>{String(row[col] ?? "")}</TableCell>
+                      ))}
+                      {preview.columns.length > 8 && <TableCell>…</TableCell>}
+                    </TableRow>
+                  ))}
+                </TableBody>
+              </Table>
+            </div>
+
+            {errorMsg && (
+              <div className="flex items-center gap-2 text-destructive">
+                <AlertCircle className="h-4 w-4" />
+                {errorMsg}
+              </div>
+            )}
+
+            <div className="flex gap-2">
+              <Button variant="outline" onClick={() => setPhase("idle")}>Back</Button>
+              <Button onClick={handleCommit} disabled={phase === "committing"}>
+                {phase === "committing" && <Loader2 className="mr-2 h-4 w-4 animate-spin" />}
+                Confirm & Upload
+              </Button>
+            </div>
+          </CardContent>
+        </Card>
+      )}
+
+      {/* ── Phase: result ── */}
+      {phase === "result" && result && (
+        <Card>
+          <CardHeader>
+            <CardTitle className="flex items-center gap-2">
+              <CheckCircle2 className="h-5 w-5 text-green-600" />
+              Upload Complete
+            </CardTitle>
+          </CardHeader>
+          <CardContent className="space-y-4">
+            {result.inserted !== undefined && (
+              <div className="space-y-2">
+                <p>
+                  {result.inserted.toLocaleString()} rows inserted
+                </p>
+                {(result.skipped ?? 0) > 0 &&
+                  <p className="text-muted-foreground">
+                    {result.skipped} rows skipped (missing Student_GUID)
+                  </p>}
+                {result.errors && result.errors.length > 0 && (
+                  <div className="space-y-1 text-destructive">
+                    {result.errors.map((e, i) =>
+                      <p key={i}>
+                        {e}
+                      </p>)}
+                  </div>
+                )}
+              </div>
+            )}
+            {result.status === "processing" && (
+              <div className="space-y-2">
+                <p>
+                  File saved to Supabase Storage. The ML pipeline has been queued in GitHub Actions.
+                </p>
+                {result.actionsUrl && (
+                  <a href={result.actionsUrl} target="_blank" rel="noopener noreferrer" className="text-primary underline">
+                    View pipeline run on GitHub Actions →
+                  </a>
+                )}
+              </div>
+            )}
+            {result.error && (
+              <div className="text-destructive">
+                {result.error}
+              </div>
+            )}
+
+            <Button onClick={reset}>Upload another file</Button>
+          </CardContent>
+        </Card>
+      )}
+    </div>
+ ) +} +``` + +**Step 2: Type-check** + +```bash +cd codebenders-dashboard && npx tsc --noEmit +``` + +Expected: no errors. + +**Step 3: Visual check** (while `npm run dev` is running) + +- Log in as an admin or IR user +- Navigate to `/admin/upload` +- Verify "Upload Data" appears in the nav +- Try dragging and dropping `data/bishop_state_courses.csv` +- Verify the preview table shows first 10 rows +- Verify "Confirm & Upload" runs and returns a result + +**Step 4: Commit** + +```bash +git add codebenders-dashboard/app/admin/upload/page.tsx +git commit -m "feat: add /admin/upload page with drag-drop, preview, and commit UI" +``` + +--- + +## Task 9: Final type-check, lint, and push + +**Step 1: Full type-check + lint** + +```bash +cd codebenders-dashboard && npx tsc --noEmit && npm run lint +``` + +Expected: 0 errors, 0 warnings (or only pre-existing warnings). + +**Step 2: Push and open PR** + +```bash +git push origin +gh pr create \ + --title "feat: self-service data upload for course and PDP/AR files (#86)" \ + --body "Closes #86 + +## Summary +- \`/admin/upload\` page (admin/ir only) with drag-drop, preview, and commit +- Course enrollment CSVs stream-parsed and batch-upserted into \`course_enrollments\` Postgres table +- PDP cohort CSVs and AR .xlsx files uploaded to Supabase Storage \`pdp-uploads\` bucket +- GitHub Actions workflow \`ml-pipeline.yml\` triggered via \`repository_dispatch\` after PDP/AR upload + +## New env vars required (see env.example) +- \`SUPABASE_SERVICE_ROLE_KEY\` +- \`GITHUB_PAT\` +- \`GITHUB_REPO\` + +## GitHub Actions secrets required +- \`SUPABASE_URL\`, \`SUPABASE_SERVICE_ROLE_KEY\`, \`DB_HOST\`, \`DB_USER\`, \`DB_PASSWORD\`, \`DB_PORT\`, \`DB_NAME\`, \`DB_SSL\` + +## Test plan +- [ ] Admin/IR can access \`/admin/upload\`; other roles get redirected +- [ ] Upload Data nav link visible to admin/IR only +- [ ] Course enrollment CSV preview shows first 10 rows with correct columns +- [ ] Course enrollment commit inserts rows into 
\`course_enrollments\` table +- [ ] PDP cohort CSV commit uploads to Supabase Storage and returns \`status: processing\` +- [ ] \`npx tsc --noEmit\` passes with 0 errors +" +``` From cb20f2d6ec0c2de3a0c48fc8d7d203c3f6898a6a Mon Sep 17 00:00:00 2001 From: William Hill Date: Fri, 27 Mar 2026 21:13:41 -0400 Subject: [PATCH 03/18] docs: design spec for config-driven distillation pipeline Per-school fine-tuning pipeline to replace OpenAI dependency for explanation and summarization endpoints with locally-served Qwen 3.5 models via MLX and Ollama. --- ...2026-03-27-distillation-pipeline-design.md | 550 ++++++++++++++++++ 1 file changed, 550 insertions(+) create mode 100644 docs/superpowers/specs/2026-03-27-distillation-pipeline-design.md diff --git a/docs/superpowers/specs/2026-03-27-distillation-pipeline-design.md b/docs/superpowers/specs/2026-03-27-distillation-pipeline-design.md new file mode 100644 index 0000000..761c553 --- /dev/null +++ b/docs/superpowers/specs/2026-03-27-distillation-pipeline-design.md @@ -0,0 +1,550 @@ +# Config-Driven Distillation Pipeline for Per-School Fine-Tuned Models + +**Date:** 2026-03-27 +**Status:** Draft +**Goal:** Replace OpenAI API dependency for explanation and summarization endpoints with locally-served, per-school fine-tuned models via a repeatable, config-driven training pipeline. + +--- + +## 1. Problem + +The dashboard currently calls OpenAI GPT-4o-mini in two explanation/summarization endpoints: + +- `/api/courses/explain-pairing` — course pairing explanations +- `/api/query-summary` — query result summaries + +This creates per-call API costs, latency, and a dependency on an external service. The explanations are also generic — they lack institutional context about each school's programs, challenges, demographics, and interventions. + +## 2. Solution + +A config-driven distillation pipeline that: + +1. Takes a per-school YAML config describing the school's schema, domain knowledge, and context +2. 
Uses a teacher model (Claude Sonnet or Qwen 3.5 locally) to generate high-quality training pairs +3. Fine-tunes a small open-source model (Qwen 3.5 4B or 9B) via MLX on Apple Silicon +4. Evaluates the model against ship criteria +5. Exports to Ollama for local serving + +New school = new config file + run the pipeline. No code changes needed. + +## 3. Architecture + +### Directory Structure + +``` +schools/ + bishop-state/ + config.yaml # Schema, domain knowledge, explanation style + seed_queries.yaml # Example questions users ask at this school + akron/ + config.yaml + seed_queries.yaml + +training/ + distill.py # Step 1: Generate training pairs via teacher model + prepare.py # Step 2: Filter, dedup, split (80/10/10) + finetune.py # Step 3: Fine-tune via MLX (Qwen 3.5) + eval.py # Step 4: Evaluate model quality + export.py # Step 5: Package for Ollama + config.py # Shared constants + prompts.py # Teacher prompts (school-agnostic templates) + +training_data/ + bishop-state/ + pairs/ # Raw distilled pairs (explainer.jsonl, summarizer.jsonl) + final/ # Train/val/test splits per adapter + models/ # Fine-tuned LoRA adapters + qwen3.5-9b/ + explainer/ + adapter_config.json + adapter_model.safetensors + summarizer/ + adapter_config.json + adapter_model.safetensors +``` + +### CLI + +```bash +python -m training.distill --school bishop-state [--local] # Generate pairs +python -m training.prepare --school bishop-state # Filter/split +python -m training.finetune --school bishop-state --model 9b # Train +python -m training.eval --school bishop-state # Evaluate +python -m training.export --school bishop-state # Deploy to Ollama +``` + +## 4. School Config Format + +Each school gets a `config.yaml` capturing everything the pipeline needs. 
Sections: + +### Core Identity + +```yaml +school: + name: "Bishop State Community College" + code: "bscc" + type: "community_college" + designation: ["hbcu", "minority_serving"] + accreditation: "SACSCOC" + founded: 1927 +``` + +### Location and Setting + +```yaml + location: + address: "351 North Broad Street" + city: "Mobile" + state: "Alabama" + zip: "36603" + county: "Mobile County" + region: "Gulf Coast" + setting: "urban" + climate_zone: "subtropical" +``` + +### Enrollment Profile + +```yaml + enrollment: + total_headcount: 4200 + fte: 2800 + undergraduate_only: true + residential: false + percent_full_time: 0.42 + percent_part_time: 0.58 + percent_online: 0.35 + open_admission: true +``` + +### Demographics + +```yaml + demographics: + percent_black: 0.72 + percent_white: 0.18 + percent_hispanic: 0.05 + percent_other: 0.05 + percent_pell_eligible: 0.68 + percent_first_gen: 0.55 + percent_adult_learners: 0.40 + median_household_income_area: 42000 +``` + +### Database Schema + +```yaml +database: + main_table: "student_level_with_predictions" + course_table: "course_enrollments" + connection_env: "DATABASE_URL" + +schema: + student_columns: + Cohort: "Cohort year (numeric: 2019, 2020, etc.)" + Race: "Student race/ethnicity" + Gender: "Student gender" + Retention: "Retention indicator (0 or 1)" + GPA_Group_Year_1: "GPA in year 1" + # ... full column list from route.ts SCHEMA_INFO + course_columns: + course_prefix: "Course dept code (MAT, ENG, NUR, etc.)" + grade: "Student grade (A, B, C, D, F, W, I)" + # ... 
full column list +``` + +### Domain Knowledge + +```yaml +domain: + programs: + - name: "Nursing (ADN)" + cip: "51.3801" + gateway_courses: ["BIO 201", "MAT 110"] + - name: "Welding Technology" + cip: "48.0508" + gateway_courses: ["WDT 108", "WDT 109"] + key_metrics: ["retention_rate", "dfwi_rate", "gateway_pass_rate"] + terminology: + credential: "associate degree or certificate" + at_risk: "students flagged by early warning system" +``` + +### Workforce and Outcomes + +```yaml + workforce: + top_employers: ["Austal USA", "Mobile Infirmary", "AM/NS Calvert"] + high_demand_fields: ["healthcare", "advanced_manufacturing", "maritime"] + + outcomes: + job_placement_rate_6mo: 0.78 + median_salary_after_credential: + associate: 34000 + certificate: 29000 + licensure_pass_rates: + nursing_nclex: 0.89 + welding_aws: 0.92 +``` + +### Peer Context + +```yaml + peers: + ipeds_id: "101505" + carnegie_class: "Associate's—High Transfer-High Traditional" + peer_institutions: ["Lawson State CC", "Shelton State CC"] + state_system: "Alabama Community College System" +``` + +### Financial Context + +```yaml + financial: + in_state_tuition: 4800 + avg_financial_aid_package: 5200 + percent_receiving_aid: 0.82 + percent_student_loans: 0.25 + cost_of_living_index: 87.3 + emergency_aid_fund: true +``` + +### Completion Context + +```yaml + completion: + ipeds_graduation_rate: 0.18 + adjusted_completion_rate: 0.42 + avg_time_to_credential: 3.2 + percent_transfer_out: 0.24 + percent_stop_out_return: 0.15 + top_completion_barriers: + - "developmental_math_sequences" + - "financial_emergencies" + - "work_schedule_conflicts" +``` + +### Faculty and Instruction + +```yaml + instruction: + student_faculty_ratio: 18 + percent_full_time_faculty: 0.45 + percent_adjunct: 0.55 + developmental_ed_model: "corequisite" +``` + +### Student Pipeline + +```yaml + pipeline: + feeder_high_schools: + - name: "Williamson High School" + percent_of_enrollment: 0.12 + avg_readiness: "below_college_level" + 
percent_ged: 0.11 + percent_veterans: 0.07 + percent_career_changers: 0.14 + primary_recruitment_radius_miles: 35 +``` + +### Digital Access + +```yaml + technology: + percent_students_with_reliable_wifi: 0.71 + percent_students_with_personal_laptop: 0.64 + campus_device_lending: true + broadband_desert_overlap: true +``` + +### Transportation and Access + +```yaml + access: + campus_count: 4 + campuses: + - name: "Main Campus" + public_transit_accessible: true + - name: "Southwest Campus" + public_transit_accessible: false + percent_students_commute_30_plus_min: 0.35 + evening_weekend_classes: true +``` + +### Equity Gaps and Initiatives + +```yaml + equity: + known_gaps: + - metric: "gateway_math_pass_rate" + group_a: { name: "Black male students", value: 0.41 } + group_b: { name: "Overall", value: 0.58 } + initiative: "Male Student Success mentoring program" + minority_male_initiative: "Brother 2 Brother" +``` + +### Active Interventions + +```yaml + interventions: + active: + - name: "Starfish Early Alert" + type: "early_warning" + target: "all students" + trigger: "missed 2+ classes or below C at midterm" + effectiveness: "12% retention lift in pilot cohorts" + - name: "Emergency Micro-Grants" + type: "financial" + max_award: 500 + effectiveness: "78% of recipients re-enrolled next term" +``` + +### Student Life + +```yaml + student_life: + percent_working_while_enrolled: 0.72 + percent_working_over_20hrs: 0.48 + percent_single_parents: 0.18 + food_insecurity_rate: 0.31 + housing_insecurity_rate: 0.14 +``` + +### Community Health Context + +```yaml + health: + mental_health_counselor_ratio: "1:1400" + community_health_context: + - "Mobile County has highest diabetes rate in Alabama" + - "Limited mental health providers in service area" +``` + +### Seasonal Patterns + +```yaml + patterns: + high_attrition_points: + - week: 4 + reason: "Financial aid disbursement delays" + - week: 8 + reason: "Midterm performance shock" + - month: "October" + reason: "Hurricane 
season peak" + summer_melt_rate: 0.22 +``` + +### Historical Trends + +```yaml + trends: + enrollment_direction: "declining" + enrollment_5yr_change: -0.12 + completion_direction: "improving" + notable_changes: + - year: 2022 + event: "Switched to corequisite math model" + - year: 2023 + event: "Launched early alert system with ML predictions" +``` + +### Institutional Priorities + +```yaml + priorities: + strategic_plan_years: "2024-2029" + top_goals: + - "Increase fall-to-fall retention from 42% to 55%" + - "Launch 3 new short-term workforce certificates" + - "Close equity gap in gateway math by 50%" + accreditation_qep_topic: "Guided Pathways implementation" + grant_funded_initiatives: + - name: "Title III Strengthening Institutions" + focus: "Student support services and advising redesign" + end_date: "2027-09-30" +``` + +### Data Quality Notes + +```yaml + data_caveats: + - "Pre-2020 cohorts lack online/hybrid delivery classification" + - "Race/ethnicity is self-reported; 6% of records are 'Unknown'" + - "Transfer-out data relies on NSC match — ~85% match rate" +``` + +### Distillation and Training Config + +```yaml +distillation: + teacher_model: "claude-sonnet-4-20250514" + teacher_backend: "anthropic" + local_teacher_model: "qwen3.5:27b" + local_teacher_backend: "ollama" + pairs_per_task: 1500 + +training: + default_model: "qwen3.5:9b" + fallback_model: "qwen3.5:4b" + method: "qlora" + quantization: 4 + lora_rank: 16 + lora_alpha: 32 + epochs: 3 + learning_rate: 1.0e-4 + batch_size: 4 + warmup_steps: 100 + eval_every: 50 + early_stopping_patience: 3 +``` + +## 5. 
Distillation — Teacher Prompts and Pair Generation + +### Two Adapters + +| Adapter | Replaces | Input | Output | +|---------|----------|-------|--------| +| **Explainer** | `/api/courses/explain-pairing` | Course pairing data | Structured explanation JSON | +| **Summarizer** | `/api/query-summary` | Query + result rows | Structured summary JSON | + +### Teacher Prompt Strategy + +**Explainer teacher prompt:** + +The teacher model receives the full institutional context from config.yaml plus the course pairing data, and generates: + +```json +{ + "explanation": "2-3 sentence plain-language explanation", + "structural_factors": ["institutional/systemic factors"], + "student_impact": "what this means for students", + "advisor_recommendation": "actionable next step", + "data_limitations": ["caveats about this data"], + "related_intervention": "existing program that addresses this, or null" +} +``` + +**Summarizer teacher prompt:** + +The teacher receives institutional context plus the original query and SQL result rows, and generates: + +```json +{ + "summary": "2-3 sentence headline finding", + "key_insights": ["notable patterns"], + "context": "how this connects to institutional priorities or known challenges", + "action_items": ["what someone should do with this information"], + "caveats": ["data limitations relevant to this query"] +} +``` + +**Student prompts** (what the fine-tuned model sees at inference) are minimal — just the data input. All institutional context is baked into the weights during training. + +### Dual Teacher Support + +- **`--local` flag:** Uses Qwen 3.5 27B via Ollama for free iteration and pipeline testing +- **Default:** Uses Claude Sonnet via Anthropic API for production-quality training data + +### Seed Data Sources + +1. **Database-driven (500 pairs per adapter):** Query the school's actual data for real course pairings and result sets +2. 
**Template-driven (500 pairs per adapter):** From `seed_queries.yaml` with school-specific examples +3. **Synthetic variation (500 pairs per adapter):** Pipeline varies dimensions (cohorts, programs, demographics) to reach 1,500 pairs per adapter + +**Total per school:** 3,000 training pairs. Distillation cost via Claude Sonnet: ~$15-25. + +## 6. Fine-Tuning + +### Method + +QLoRA via Apple MLX framework on Apple Silicon Macs. + +- Base model: Qwen 3.5 9B (default) or 4B (lightweight) +- 4-bit quantized base, trainable low-rank adapters +- Two separate adapters per school (explainer + summarizer) on the same base model + +### Hardware Requirements + +| Model | Training | Inference | +|-------|----------|-----------| +| Qwen 3.5 9B | 24GB+ RAM (M-series Mac) | 8GB+ RAM (Q4 via Ollama) | +| Qwen 3.5 4B | 16GB+ RAM (M-series Mac) | 4GB+ RAM (Q4 via Ollama) | + +### Training Time Estimates (3,000 examples, 3 epochs) + +| Model | 18GB Mac (M3 Pro) | 36GB Mac (M3 Pro) | +|-------|-------------------|-------------------| +| Qwen 3.5 4B | ~2-4 hrs | ~1.5-3 hrs | +| Qwen 3.5 9B | Tight, not recommended | ~3-5 hrs | + +## 7. Evaluation + +### Ship Criteria + +| Metric | What It Checks | Threshold | +|--------|---------------|-----------| +| JSON validity | Output parses as valid JSON | >= 95% | +| Schema adherence | All required keys present, correct types | >= 90% | +| Explanation quality | ROUGE-L against teacher outputs | >= 0.35 | +| Factual grounding | Mentions data values from input, not hallucinated | >= 85% | +| Actionability | Recommendations are non-generic | >= 80% | +| Caveat inclusion | Data limitations populated | >= 90% | + +Pipeline refuses to export a model that fails any threshold. + +## 8. 
Deployment + +### Export to Ollama + +```bash +python -m training.export --school bishop-state +# Registers: +# bishop-state-explainer:9b +# bishop-state-summarizer:9b +``` + +### Dashboard Integration + +A thin adapter layer in `lib/model-client.ts` routes to the appropriate backend: + +``` +MODEL_BACKEND=ollama → local fine-tuned model via Ollama +MODEL_BACKEND=openai → fallback to OpenAI GPT-4o-mini +SCHOOL_CODE=bishop-state +``` + +Routes affected: + +| Route | Current | After | +|-------|---------|-------| +| `/api/courses/explain-pairing` | OpenAI GPT-4o-mini | `bishop-state-explainer:9b` via Ollama | +| `/api/query-summary` | OpenAI GPT-4o-mini | `bishop-state-summarizer:9b` via Ollama | +| `/api/analyze` | OpenAI GPT-4o-mini | No change (future adapter) | + +## 9. Onboarding a New School + +1. Create `schools/{school-code}/config.yaml` — fill in institutional context +2. Create `schools/{school-code}/seed_queries.yaml` — 20-50 example questions +3. Run the pipeline: + ```bash + python -m training.distill --school {school-code} [--local] + python -m training.prepare --school {school-code} + python -m training.finetune --school {school-code} --model 9b + python -m training.eval --school {school-code} + python -m training.export --school {school-code} + ``` +4. Set env vars: `MODEL_BACKEND=ollama SCHOOL_CODE={school-code}` +5. Deploy dashboard + +## 10. Cost Summary + +| Item | Per School | One-Time | +|------|-----------|----------| +| Distillation (Claude Sonnet) | $15-25 | - | +| Distillation (local Qwen) | $0 | - | +| Fine-tuning (MLX on Mac) | $0 (electricity) | - | +| Inference (Ollama) | $0 | - | +| Base model download | - | ~6GB (cached) | + +**Total cost to onboard a new school: $15-25** (or $0 with local teacher). 
From ae971e4b27fc81e2318a4cec8486ef19191c14c4 Mon Sep 17 00:00:00 2001 From: William Hill Date: Fri, 27 Mar 2026 21:33:54 -0400 Subject: [PATCH 04/18] docs: implementation plan for config-driven distillation pipeline --- .../plans/2026-03-27-distillation-pipeline.md | 3805 +++++++++++++++++ 1 file changed, 3805 insertions(+) create mode 100644 docs/superpowers/plans/2026-03-27-distillation-pipeline.md diff --git a/docs/superpowers/plans/2026-03-27-distillation-pipeline.md b/docs/superpowers/plans/2026-03-27-distillation-pipeline.md new file mode 100644 index 0000000..d492ae0 --- /dev/null +++ b/docs/superpowers/plans/2026-03-27-distillation-pipeline.md @@ -0,0 +1,3805 @@ +# Distillation Pipeline Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Build a config-driven pipeline that distills a teacher model's knowledge into per-school fine-tuned Qwen 3.5 models, replacing OpenAI API calls for course explanations and query summaries. + +**Architecture:** A 5-stage Python pipeline (distill → prepare → finetune → eval → export) reads per-school YAML configs, generates ChatML training pairs via Claude Sonnet or local Qwen 3.5, fine-tunes via MLX QLoRA, evaluates against ship criteria, and exports to Ollama. The Next.js dashboard swaps OpenAI calls for local Ollama inference via a thin model-client adapter. + +**Tech Stack:** Python 3.8+, PyYAML, Anthropic SDK, ollama (Python client), MLX/mlx-lm (Apple Silicon fine-tuning), pytest, Next.js/TypeScript (dashboard integration) + +**Spec:** `docs/superpowers/specs/2026-03-27-distillation-pipeline-design.md` + +**Reference implementation:** `~/Development/d4bl_ai_agent/scripts/training/` — the d4bl pipeline this adapts from. 
+ +--- + +## File Structure + +### New Files + +``` +training/ + __init__.py # Package init + config.py # Constants + YAML config loader + prompts.py # Teacher prompt templates (explainer + summarizer) + seed.py # Seed data generation (DB + template + synthetic) + distill.py # Stage 1: Generate ChatML pairs via teacher model + prepare.py # Stage 2: Filter, dedup, split + finetune.py # Stage 3: MLX QLoRA fine-tuning + eval.py # Stage 4: Metrics + ship criteria + export.py # Stage 5: Ollama modelfile + registration + +schools/ + bishop-state/ + config.yaml # Full institutional config + seed_queries.yaml # Example queries for training pair generation + +tests/ + conftest.py # Pytest fixtures + training/ + __init__.py + test_config.py # Config loader tests + test_prompts.py # Prompt template tests + test_seed.py # Seed generation tests + test_prepare.py # Filter/dedup/split tests + test_eval.py # Eval metrics + ship criteria tests + +codebenders-dashboard/ + lib/ + model-client.ts # New: Ollama/OpenAI adapter +``` + +### Modified Files + +``` +codebenders-dashboard/ + app/api/courses/explain-pairing/route.ts # Swap OpenAI → model-client + app/api/query-summary/route.ts # Swap OpenAI → model-client + +requirements.txt # Add training dependencies +.gitignore # Add training_data/ +``` + +--- + +## Task 1: Project Scaffolding + +**Files:** +- Create: `training/__init__.py` +- Create: `tests/conftest.py` +- Create: `tests/training/__init__.py` +- Create: `pytest.ini` +- Modify: `requirements.txt` +- Modify: `.gitignore` + +- [ ] **Step 1: Create training package directory** + +```bash +mkdir -p training tests/training +``` + +- [ ] **Step 2: Create package init files** + +Create `training/__init__.py`: +```python +"""Config-driven distillation pipeline for per-school fine-tuned models.""" +``` + +Create `tests/__init__.py`: +```python +``` + +Create `tests/training/__init__.py`: +```python +``` + +- [ ] **Step 3: Create pytest.ini** + +```ini +[pytest] +testpaths = tests 
+python_files = test_*.py +python_classes = Test* +python_functions = test_* +``` + +- [ ] **Step 4: Add training dependencies to requirements.txt** + +Append to `requirements.txt`: +``` +# Training pipeline +pyyaml>=6.0 +anthropic>=0.40.0 +ollama>=0.4.0 +rouge-score>=0.1.2 +mlx>=0.22.0 +mlx-lm>=0.20.0 +``` + +- [ ] **Step 5: Add training_data to .gitignore** + +Append to `.gitignore`: +``` +# Training pipeline artifacts +training_data/ +``` + +- [ ] **Step 6: Create conftest.py with shared fixtures** + +Create `tests/conftest.py`: +```python +"""Shared pytest fixtures for the training pipeline.""" + +from pathlib import Path + +import pytest +import yaml + + +FIXTURES_DIR = Path(__file__).parent / "fixtures" + + +@pytest.fixture +def sample_school_config(): + """Minimal valid school config for testing.""" + return { + "school": { + "name": "Test Community College", + "code": "tcc", + "type": "community_college", + "designation": [], + "location": { + "city": "Test City", + "state": "Alabama", + "setting": "urban", + }, + "enrollment": { + "total_headcount": 1000, + "percent_full_time": 0.50, + "percent_part_time": 0.50, + }, + "demographics": { + "percent_pell_eligible": 0.60, + "percent_first_gen": 0.45, + }, + }, + "database": { + "main_table": "student_level_with_predictions", + "course_table": "course_enrollments", + "connection_env": "DATABASE_URL", + }, + "schema": { + "student_columns": { + "Cohort": "Cohort year", + "Race": "Student race/ethnicity", + "Retention": "Retention indicator (0 or 1)", + }, + "course_columns": { + "course_prefix": "Course dept code", + "grade": "Student grade", + }, + }, + "domain": { + "programs": [ + { + "name": "Nursing", + "cip": "51.3801", + "gateway_courses": ["BIO 201"], + } + ], + "key_metrics": ["retention_rate", "dfwi_rate"], + "terminology": { + "credential": "associate degree", + "at_risk": "at-risk students", + }, + }, + "distillation": { + "teacher_model": "claude-sonnet-4-20250514", + "teacher_backend": 
"anthropic", + "local_teacher_model": "qwen3.5:27b", + "local_teacher_backend": "ollama", + "pairs_per_task": 10, + }, + "training": { + "default_model": "qwen3.5:9b", + "fallback_model": "qwen3.5:4b", + "method": "qlora", + "quantization": 4, + "lora_rank": 16, + "lora_alpha": 32, + "epochs": 3, + "learning_rate": 1e-4, + "batch_size": 4, + "warmup_steps": 100, + "eval_every": 50, + "early_stopping_patience": 3, + }, + } + + +@pytest.fixture +def sample_course_pairing_data(): + """Sample course pairing input for explainer adapter.""" + return { + "course_a": {"prefix": "MAT", "number": "100", "name": "Intermediate Algebra"}, + "course_b": {"prefix": "BIO", "number": "201", "name": "Anatomy & Physiology I"}, + "stats": { + "course_a_dfwi": 0.42, + "course_b_dfwi": 0.31, + "co_enrollment_count": 85, + "co_enrollment_dfwi": 0.38, + "delivery_breakdown": [ + {"method": "Face-to-Face", "count": 50, "dfwi_rate": 0.34}, + {"method": "Online", "count": 35, "dfwi_rate": 0.44}, + ], + }, + } + + +@pytest.fixture +def sample_query_result_data(): + """Sample query result input for summarizer adapter.""" + return { + "prompt": "retention rate by race for 2023 cohort", + "data": [ + {"Race": "Black", "retention_rate": 0.41}, + {"Race": "White", "retention_rate": 0.52}, + {"Race": "Hispanic", "retention_rate": 0.47}, + ], + "rowCount": 3, + "vizType": "bar", + } + + +@pytest.fixture +def sample_explainer_output(): + """Valid explainer adapter JSON output.""" + return { + "explanation": "MAT 100 and BIO 201 show a high co-enrollment DFWI rate of 38%.", + "structural_factors": [ + "Math placement gaps from feeder high schools", + "Online sections show higher DFW rates", + ], + "student_impact": "Students taking both courses simultaneously face compounded difficulty.", + "advisor_recommendation": "Consider staggering MAT 100 and BIO 201 across terms for at-risk students.", + "data_limitations": ["Co-enrollment data limited to 2020+ cohorts"], + "related_intervention": "Math 
Bootcamp", + } + + +@pytest.fixture +def sample_summarizer_output(): + """Valid summarizer adapter JSON output.""" + return { + "summary": "Retention rates vary significantly by race in the 2023 cohort.", + "key_insights": [ + "Black students have the lowest retention rate at 41%", + "11-point gap between Black and White student retention", + ], + "context": "This aligns with the institution's strategic goal to close equity gaps.", + "action_items": [ + "Review early alert referrals for Black male students in Fall cohort", + ], + "caveats": ["Race is self-reported; 6% of records are Unknown"], + } +``` + +- [ ] **Step 7: Verify pytest runs with no errors** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -m pytest tests/ -v --co` +Expected: "no tests ran" (collected 0 items) with exit code 0 + +- [ ] **Step 8: Commit** + +```bash +git add training/ tests/ pytest.ini requirements.txt .gitignore +git commit -m "chore: scaffold training pipeline package and test infrastructure" +``` + +--- + +## Task 2: Config Loader + +**Files:** +- Create: `training/config.py` +- Create: `tests/training/test_config.py` + +- [ ] **Step 1: Write the failing tests** + +Create `tests/training/test_config.py`: +```python +"""Tests for training.config — constants and school config loader.""" + +import pytest +import yaml +from pathlib import Path +from unittest.mock import patch + +from training.config import ( + BASE_DIR, + SCHOOLS_DIR, + TRAIN_RATIO, + VAL_RATIO, + TEST_RATIO, + load_school_config, + get_school_dir, + get_training_data_dir, + write_jsonl, +) + + +class TestConstants: + def test_split_ratios_sum_to_one(self): + assert TRAIN_RATIO + VAL_RATIO + TEST_RATIO == pytest.approx(1.0) + + def test_base_dir_is_path(self): + assert isinstance(BASE_DIR, Path) + + def test_schools_dir_is_path(self): + assert isinstance(SCHOOLS_DIR, Path) + + +class TestLoadSchoolConfig: + def test_loads_valid_config(self, tmp_path, sample_school_config): + 
school_dir = tmp_path / "test-school" + school_dir.mkdir() + config_path = school_dir / "config.yaml" + config_path.write_text(yaml.dump(sample_school_config)) + + with patch("training.config.SCHOOLS_DIR", tmp_path): + config = load_school_config("test-school") + + assert config["school"]["name"] == "Test Community College" + assert config["school"]["code"] == "tcc" + assert config["database"]["main_table"] == "student_level_with_predictions" + + def test_raises_on_missing_school(self, tmp_path): + with patch("training.config.SCHOOLS_DIR", tmp_path): + with pytest.raises(FileNotFoundError, match="School config not found"): + load_school_config("nonexistent") + + def test_raises_on_missing_required_keys(self, tmp_path): + school_dir = tmp_path / "bad-school" + school_dir.mkdir() + config_path = school_dir / "config.yaml" + config_path.write_text(yaml.dump({"school": {"name": "Bad"}})) + + with patch("training.config.SCHOOLS_DIR", tmp_path): + with pytest.raises(ValueError, match="Missing required"): + load_school_config("bad-school") + + +class TestGetSchoolDir: + def test_returns_path(self, tmp_path): + with patch("training.config.SCHOOLS_DIR", tmp_path): + result = get_school_dir("bishop-state") + assert result == tmp_path / "bishop-state" + + +class TestGetTrainingDataDir: + def test_returns_path_with_school(self): + result = get_training_data_dir("bishop-state") + assert "bishop-state" in str(result) + assert result.name == "bishop-state" + + +class TestWriteJsonl: + def test_writes_items(self, tmp_path): + import json + + items = [{"a": 1}, {"b": 2}] + outfile = tmp_path / "test.jsonl" + count = write_jsonl(items, outfile) + + assert count == 2 + lines = outfile.read_text().strip().split("\n") + assert json.loads(lines[0]) == {"a": 1} + assert json.loads(lines[1]) == {"b": 2} + + def test_writes_with_transform(self, tmp_path): + import json + + items = [1, 2, 3] + outfile = tmp_path / "test.jsonl" + count = write_jsonl(items, outfile, transform=lambda x: 
{"val": x * 2}) + + assert count == 3 + lines = outfile.read_text().strip().split("\n") + assert json.loads(lines[0]) == {"val": 2} + + def test_skips_none_from_transform(self, tmp_path): + items = [1, 2, 3] + outfile = tmp_path / "test.jsonl" + count = write_jsonl(items, outfile, transform=lambda x: None if x == 2 else {"v": x}) + + assert count == 2 + + def test_creates_parent_dirs(self, tmp_path): + outfile = tmp_path / "sub" / "dir" / "test.jsonl" + count = write_jsonl([{"x": 1}], outfile) + assert count == 1 + assert outfile.exists() +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -m pytest tests/training/test_config.py -v` +Expected: FAIL — `ModuleNotFoundError: No module named 'training.config'` + +- [ ] **Step 3: Write the implementation** + +Create `training/config.py`: +```python +"""Shared constants and school config loader for the training pipeline.""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any, Callable, Optional + +import yaml + +# --------------------------------------------------------------------------- +# Directory layout +# --------------------------------------------------------------------------- + +PROJECT_ROOT = Path(__file__).resolve().parent.parent +SCHOOLS_DIR = PROJECT_ROOT / "schools" +BASE_DIR = PROJECT_ROOT / "training_data" + +# --------------------------------------------------------------------------- +# Dataset split ratios +# --------------------------------------------------------------------------- + +TRAIN_RATIO = 0.80 +VAL_RATIO = 0.10 +TEST_RATIO = 0.10 + +# --------------------------------------------------------------------------- +# Deduplication +# --------------------------------------------------------------------------- + +JACCARD_THRESHOLD = 1.0 # Exact duplicates only + +# --------------------------------------------------------------------------- +# Required 
top-level keys in school config +# --------------------------------------------------------------------------- + +_REQUIRED_KEYS = {"school", "database", "schema", "domain", "distillation", "training"} + + +# --------------------------------------------------------------------------- +# Config loader +# --------------------------------------------------------------------------- + + +def load_school_config(school: str) -> dict[str, Any]: + """Load and validate a school's config.yaml. + + Args: + school: School directory name (e.g. "bishop-state"). + + Returns: + Parsed config dict. + + Raises: + FileNotFoundError: If the school directory or config.yaml doesn't exist. + ValueError: If required top-level keys are missing. + """ + config_path = SCHOOLS_DIR / school / "config.yaml" + if not config_path.exists(): + raise FileNotFoundError( + f"School config not found: {config_path}" + ) + + with config_path.open("r", encoding="utf-8") as fh: + config = yaml.safe_load(fh) + + missing = _REQUIRED_KEYS - set(config.keys()) + if missing: + raise ValueError( + f"Missing required top-level keys in {config_path}: {missing}" + ) + + return config + + +def get_school_dir(school: str) -> Path: + """Return the path to a school's config directory.""" + return SCHOOLS_DIR / school + + +def get_training_data_dir(school: str) -> Path: + """Return the path to a school's training data directory.""" + return BASE_DIR / school + + +# --------------------------------------------------------------------------- +# JSONL writer (adapted from d4bl) +# --------------------------------------------------------------------------- + + +def write_jsonl( + items: list, + outfile: Path, + transform: Optional[Callable] = None, +) -> int: + """Write items to a JSONL file. + + Args: + items: List of JSON-serializable objects. + outfile: Destination file path. + transform: Optional per-item transformation; returning None skips. + + Returns: + Number of lines written. 
+ """ + outfile = Path(outfile) + outfile.parent.mkdir(parents=True, exist_ok=True) + count = 0 + with outfile.open("w", encoding="utf-8") as fh: + for item in items: + if transform is not None: + item = transform(item) + if item is None: + continue + fh.write(json.dumps(item, ensure_ascii=False) + "\n") + count += 1 + return count +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -m pytest tests/training/test_config.py -v` +Expected: All tests PASS + +- [ ] **Step 5: Commit** + +```bash +git add training/config.py tests/training/test_config.py +git commit -m "feat(training): config loader with YAML validation and JSONL writer" +``` + +--- + +## Task 3: Bishop State School Config + +**Files:** +- Create: `schools/bishop-state/config.yaml` +- Create: `schools/bishop-state/seed_queries.yaml` + +- [ ] **Step 1: Create the school directory** + +```bash +mkdir -p schools/bishop-state +``` + +- [ ] **Step 2: Write config.yaml** + +Create `schools/bishop-state/config.yaml` with the full institutional config from the design spec. This is a data file — the schema was validated in Task 2's tests. Include all sections: school identity, location, enrollment, demographics, database schema (copying exact columns from `route.ts` SCHEMA_INFO), domain knowledge, workforce, peers, financial, completion, instruction, pipeline, technology, access, equity, interventions, student_life, health, patterns, trends, priorities, data_caveats, distillation, and training config. 
+ +```yaml +# Bishop State Community College — Training Pipeline Config +# See docs/superpowers/specs/2026-03-27-distillation-pipeline-design.md + +school: + name: "Bishop State Community College" + code: "bscc" + type: "community_college" + designation: ["hbcu", "minority_serving"] + accreditation: "SACSCOC" + founded: 1927 + + location: + address: "351 North Broad Street" + city: "Mobile" + state: "Alabama" + zip: "36603" + county: "Mobile County" + region: "Gulf Coast" + setting: "urban" + climate_zone: "subtropical" + + enrollment: + total_headcount: 4200 + fte: 2800 + undergraduate_only: true + residential: false + percent_full_time: 0.42 + percent_part_time: 0.58 + percent_online: 0.35 + open_admission: true + + demographics: + percent_black: 0.72 + percent_white: 0.18 + percent_hispanic: 0.05 + percent_other: 0.05 + percent_pell_eligible: 0.68 + percent_first_gen: 0.55 + percent_adult_learners: 0.40 + median_household_income_area: 42000 + + workforce: + top_employers: ["Austal USA", "Mobile Infirmary", "AM/NS Calvert"] + high_demand_fields: ["healthcare", "advanced_manufacturing", "maritime"] + workforce_board: "Mobile Works" + + academics: + calendar: "semester" + degree_types: ["associate", "certificate", "short_certificate"] + total_programs: 45 + largest_programs: ["Nursing", "Welding", "Business Administration"] + transfer_partners: ["University of South Alabama", "Alabama A&M"] + dual_enrollment: true + + student_support: + tutoring: true + food_pantry: true + childcare: false + transportation_assistance: true + mental_health_services: true + early_alert_system: true + + challenges: + - "High percentage of students working 20+ hours/week" + - "Limited public transit access to satellite campuses" + - "Hurricane season disrupts Fall semester attendance" + - "Many students require developmental education in math" + + strengths: + - "Strong employer partnerships in healthcare and maritime" + - "Active student mentoring program" + - "High nursing program 
pass rates on NCLEX" + + peers: + ipeds_id: "101505" + carnegie_class: "Associate's—High Transfer-High Traditional" + peer_institutions: ["Lawson State CC", "Shelton State CC", "Trenholm State CC"] + state_system: "Alabama Community College System" + governing_board: "ACCS Board of Trustees" + + financial: + in_district_tuition: 4800 + in_state_tuition: 4800 + avg_financial_aid_package: 5200 + percent_receiving_aid: 0.82 + percent_student_loans: 0.25 + cost_of_living_index: 87.3 + textbook_program: "inclusive_access" + tuition_payment_plan: true + emergency_aid_fund: true + + completion: + ipeds_graduation_rate: 0.18 + adjusted_completion_rate: 0.42 + avg_time_to_credential: 3.2 + percent_transfer_out: 0.24 + percent_stop_out_return: 0.15 + top_completion_barriers: + - "developmental_math_sequences" + - "financial_emergencies" + - "work_schedule_conflicts" + + instruction: + student_faculty_ratio: 18 + percent_full_time_faculty: 0.45 + percent_adjunct: 0.55 + avg_class_size: 22 + developmental_ed_model: "corequisite" + lms: "Canvas" + + pipeline: + feeder_high_schools: + - name: "Williamson High School" + percent_of_enrollment: 0.12 + avg_readiness: "below_college_level" + - name: "Murphy High School" + percent_of_enrollment: 0.08 + avg_readiness: "mixed" + percent_ged: 0.11 + percent_dual_enrollment_origin: 0.09 + percent_veterans: 0.07 + percent_career_changers: 0.14 + percent_displaced_workers: 0.05 + percent_international: 0.02 + primary_recruitment_radius_miles: 35 + + technology: + percent_students_with_reliable_wifi: 0.71 + percent_students_with_personal_laptop: 0.64 + campus_device_lending: true + hotspot_lending: true + digital_literacy_required: false + broadband_desert_overlap: true + + access: + campus_count: 4 + campuses: + - name: "Main Campus" + address: "351 N Broad St" + public_transit_accessible: true + - name: "Southwest Campus" + address: "925 Dauphin Island Pkwy" + public_transit_accessible: false + percent_students_commute_30_plus_min: 0.35 + 
public_transit_quality: "limited" + parking_adequate: true + evening_weekend_classes: true + + equity: + known_gaps: + - metric: "gateway_math_pass_rate" + group_a: { name: "Black male students", value: 0.41 } + group_b: { name: "Overall", value: 0.58 } + initiative: "Male Student Success mentoring program" + - metric: "retention" + group_a: { name: "Part-time students", value: 0.38 } + group_b: { name: "Full-time students", value: 0.61 } + initiative: "15-to-Finish advising campaign" + dei_office: true + title_ix_coordinator: true + minority_male_initiative: "Brother 2 Brother" + + interventions: + active: + - name: "Starfish Early Alert" + type: "early_warning" + target: "all students" + trigger: "missed 2+ classes or below C at midterm" + effectiveness: "12% retention lift in pilot cohorts" + - name: "Math Bootcamp" + type: "academic_support" + target: "students placing into developmental math" + timing: "2 weeks before Fall semester" + effectiveness: "participants 2x more likely to pass MAT 100" + - name: "Emergency Micro-Grants" + type: "financial" + target: "students facing unexpected financial hardship" + max_award: 500 + effectiveness: "78% of recipients re-enrolled next term" + planned: + - name: "Proactive advising for 25+ credit students" + launch: "Fall 2026" + + student_life: + percent_working_while_enrolled: 0.72 + percent_working_over_20hrs: 0.48 + percent_single_parents: 0.18 + percent_caregiver_responsibilities: 0.25 + childcare_waitlist: true + student_orgs: 15 + athletics: false + housing_insecurity_rate: 0.14 + food_insecurity_rate: 0.31 + + health: + mental_health_counselor_ratio: "1:1400" + community_health_context: + - "Mobile County has highest diabetes rate in Alabama" + - "Limited mental health providers in service area" + substance_abuse_programs: true + crisis_intervention_protocol: true + + outcomes: + job_placement_rate_6mo: 0.78 + median_salary_after_credential: + associate: 34000 + certificate: 29000 + percent_employed_in_field: 0.65 
+ licensure_pass_rates: + nursing_nclex: 0.89 + welding_aws: 0.92 + emt: 0.85 + transfer_success_rate: 0.71 + employer_satisfaction_rate: 0.88 + + patterns: + high_attrition_points: + - week: 4 + reason: "Financial aid disbursement delays" + - week: 8 + reason: "Midterm performance shock" + - month: "October" + reason: "Hurricane season peak" + registration_peaks: ["April", "July", "November"] + summer_melt_rate: 0.22 + + trends: + enrollment_direction: "declining" + enrollment_5yr_change: -0.12 + completion_direction: "improving" + notable_changes: + - year: 2020 + event: "COVID shift to online — permanent hybrid expansion" + - year: 2022 + event: "Switched to corequisite math model — dev-ed pass rates doubled" + - year: 2023 + event: "Launched early alert system with ML predictions" + + priorities: + strategic_plan_years: "2024-2029" + top_goals: + - "Increase fall-to-fall retention from 42% to 55%" + - "Launch 3 new short-term workforce certificates" + - "Close equity gap in gateway math by 50%" + accreditation_qep_topic: "Guided Pathways implementation" + grant_funded_initiatives: + - name: "Title III Strengthening Institutions" + focus: "Student support services and advising redesign" + end_date: "2027-09-30" + - name: "NSF ATE Grant" + focus: "Advanced manufacturing curriculum" + end_date: "2026-05-31" + + data_caveats: + - "Pre-2020 cohorts lack online/hybrid delivery classification" + - "Race/ethnicity is self-reported; 6% of records are 'Unknown'" + - "GPA data for dual-enrollment students may reflect high school scale" + - "Transfer-out data relies on National Student Clearinghouse match — ~85% match rate" + - "Course enrollment records before 2019 do not include instructor_status" + +database: + main_table: "student_level_with_predictions" + course_table: "course_enrollments" + connection_env: "DATABASE_URL" + +schema: + student_columns: + Cohort: "Cohort year (numeric: 2019, 2020, etc.)" + Cohort_Term: "Term of cohort entry (Fall, Spring, Summer)" + 
Student_GUID: "Unique student identifier" + Institution_ID: "Institution identifier (102030 for Bishop State)" + Gender: "Student gender" + Race: "Student race/ethnicity" + Student_Age: "Age of student (integer)" + First_Gen: "First generation status" + Enrollment_Type: "Type of enrollment" + Enrollment_Intensity_First_Term: "Enrollment intensity (Full-Time, Part-Time)" + Program_of_Study_Year_1: "Program of study in year 1 (CIP code)" + Credential_Type_Sought_Year_1: "Credential type being pursued" + Math_Placement: "Math placement level (C=college-level, R=remedial, N=none)" + Retention: "Retention indicator (0 or 1)" + Persistence: "Persistence indicator (0 or 1)" + GPA_Group_Year_1: "GPA in year 1" + GPA_Group_Term_1: "GPA in term 1" + Number_of_Credits_Attempted_Year_1: "Credits attempted in year 1" + Number_of_Credits_Earned_Year_1: "Credits earned in year 1" + Number_of_Credits_Attempted_Year_2: "Credits attempted in year 2" + Number_of_Credits_Earned_Year_2: "Credits earned in year 2" + Time_to_Credential: "Time to any credential" + retention_probability: "Predicted probability of retention (0-1)" + retention_risk_category: "Risk category (Low/Moderate/High/Critical Risk)" + at_risk_alert: "Early warning alert level (LOW/MODERATE/HIGH/URGENT)" + course_completion_rate: "Course completion rate (0-1)" + passing_rate: "Course passing rate (0-1)" + course_columns: + course_prefix: "Course dept code (MAT, ENG, NUR, CIS, etc.)" + course_number: "Course number (100, 201, etc.)" + course_name: "Full course name" + grade: "Student grade (A, B, C, D, F, W, I, AU, P)" + delivery_method: "Delivery (F=face-to-face, O=online, H=hybrid)" + instructor_status: "Instructor type (FT=full-time, PT=part-time)" + gateway_type: "Gateway (M=math, E=English, N=not a gateway)" + credits_attempted: "Credits attempted (numeric)" + credits_earned: "Credits earned (numeric)" + cohort: "Cohort year as text" + academic_year: "Academic year (e.g. 
2021-22)" + academic_term: "Term (FALL, SPRING, SUMMER)" + ferpa_excluded: + - "Student_GUID" + - "student_guid" + +domain: + programs: + - name: "Nursing (ADN)" + cip: "51.3801" + gateway_courses: ["BIO 201", "MAT 110"] + - name: "Welding Technology" + cip: "48.0508" + gateway_courses: ["WDT 108", "WDT 109"] + - name: "Business Administration" + cip: "52.0201" + gateway_courses: ["MAT 100", "BUS 241"] + - name: "Computer Information Systems" + cip: "11.0101" + gateway_courses: ["CIS 146", "MAT 100"] + - name: "Emergency Medical Technician" + cip: "51.0904" + gateway_courses: ["EMS 100", "BIO 201"] + key_metrics: + - "retention_rate" + - "dfwi_rate" + - "gateway_pass_rate" + - "completion_rate" + - "transfer_rate" + terminology: + credential: "associate degree or certificate" + at_risk: "students flagged by early warning system" + gateway_course: "first college-level course in math or English" + dfwi: "grades of D, F, W, or I (unsuccessful completion)" + +distillation: + teacher_model: "claude-sonnet-4-20250514" + teacher_backend: "anthropic" + local_teacher_model: "qwen3.5:27b" + local_teacher_backend: "ollama" + pairs_per_task: 1500 + +training: + default_model: "qwen3.5:9b" + fallback_model: "qwen3.5:4b" + method: "qlora" + quantization: 4 + lora_rank: 16 + lora_alpha: 32 + epochs: 3 + learning_rate: 1.0e-4 + batch_size: 4 + warmup_steps: 100 + eval_every: 50 + early_stopping_patience: 3 +``` + +- [ ] **Step 3: Write seed_queries.yaml** + +Create `schools/bishop-state/seed_queries.yaml`: +```yaml +# Example queries for training pair generation +# These seed the template-driven portion of distillation. 
+ +explainer: + # Advisor-perspective queries + - query: "MAT 100 and BIO 201 pairing for nursing students" + style: "advisor" + - query: "ENG 101 and HIS 201 co-enrollment outcomes" + style: "advisor" + - query: "High DFW in MAT 110 for part-time evening students" + style: "advisor" + - query: "CIS 146 and MAT 100 pairing for CIS majors" + style: "advisor" + - query: "WDT 108 and WDT 109 sequential outcomes" + style: "advisor" + + # Administrator-perspective queries + - query: "Online vs face-to-face outcomes in gateway math" + style: "administrator" + - query: "Adjunct vs full-time instructor DFW rates in BIO 201" + style: "administrator" + - query: "Summer vs Fall section outcomes for ENG 101" + style: "administrator" + - query: "Developmental math co-enrollment with science courses" + style: "administrator" + - query: "Dual-enrollment student performance in college-level courses" + style: "administrator" + + # Faculty-perspective queries + - query: "EMS 100 and BIO 201 prerequisite outcomes" + style: "faculty" + - query: "MAT 100 withdrawal patterns by week of semester" + style: "faculty" + - query: "Hybrid delivery outcomes in nursing prerequisite courses" + style: "faculty" + +summarizer: + # Retention and completion + - query: "retention rate by race for 2023 cohort" + style: "faculty" + - query: "overall retention trend from 2019 to 2023" + style: "administrator" + - query: "retention rate for first-generation students" + style: "advisor" + - query: "completion rate by enrollment intensity" + style: "administrator" + + # Course performance + - query: "gateway course pass rates by delivery method" + style: "administrator" + - query: "top 10 courses with highest DFW rates" + style: "faculty" + - query: "DFW rates by instructor status in math courses" + style: "administrator" + - query: "course completion rates for online vs face-to-face" + style: "faculty" + + # Demographics and equity + - query: "enrollment by race and gender" + style: "administrator" + - 
query: "GPA distribution for Pell-eligible students" + style: "advisor" + - query: "retention gap between full-time and part-time students" + style: "administrator" + - query: "at-risk student count by program" + style: "advisor" + + # Risk and intervention + - query: "students with URGENT early warning alert by cohort" + style: "advisor" + - query: "average retention probability by math placement" + style: "faculty" + - query: "critical risk students in nursing program" + style: "advisor" +``` + +- [ ] **Step 4: Verify config loads correctly** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -c "from training.config import load_school_config; c = load_school_config('bishop-state'); print(f'Loaded: {c[\"school\"][\"name\"]}')"` +Expected: `Loaded: Bishop State Community College` + +- [ ] **Step 5: Commit** + +```bash +git add schools/ +git commit -m "feat(training): add Bishop State school config and seed queries" +``` + +--- + +## Task 4: Teacher Prompt Templates + +**Files:** +- Create: `training/prompts.py` +- Create: `tests/training/test_prompts.py` + +- [ ] **Step 1: Write the failing tests** + +Create `tests/training/test_prompts.py`: +```python +"""Tests for training.prompts — teacher prompt templates.""" + +import json +import pytest + +from training.prompts import ( + build_system_prompt, + build_explainer_prompt, + build_summarizer_prompt, + EXPLAINER_STUDENT_SYSTEM, + SUMMARIZER_STUDENT_SYSTEM, + EXPLAINER_SCHEMA, + SUMMARIZER_SCHEMA, +) + + +class TestBuildSystemPrompt: + def test_includes_school_name(self, sample_school_config): + result = build_system_prompt(sample_school_config) + assert "Test Community College" in result + + def test_includes_location(self, sample_school_config): + result = build_system_prompt(sample_school_config) + assert "Test City" in result + assert "Alabama" in result + + def test_includes_demographics(self, sample_school_config): + result = build_system_prompt(sample_school_config) + 
assert "Pell" in result or "pell" in result + + def test_returns_string(self, sample_school_config): + result = build_system_prompt(sample_school_config) + assert isinstance(result, str) + assert len(result) > 100 + + +class TestBuildExplainerPrompt: + def test_includes_course_data(self, sample_school_config, sample_course_pairing_data): + result = build_explainer_prompt(sample_school_config, sample_course_pairing_data) + assert "MAT" in result + assert "BIO" in result + + def test_includes_stats(self, sample_school_config, sample_course_pairing_data): + result = build_explainer_prompt(sample_school_config, sample_course_pairing_data) + assert "0.42" in result or "42" in result + + def test_includes_output_schema(self, sample_school_config, sample_course_pairing_data): + result = build_explainer_prompt(sample_school_config, sample_course_pairing_data) + assert "explanation" in result + assert "structural_factors" in result + assert "advisor_recommendation" in result + + def test_returns_string(self, sample_school_config, sample_course_pairing_data): + result = build_explainer_prompt(sample_school_config, sample_course_pairing_data) + assert isinstance(result, str) + + +class TestBuildSummarizerPrompt: + def test_includes_query(self, sample_school_config, sample_query_result_data): + result = build_summarizer_prompt(sample_school_config, sample_query_result_data) + assert "retention rate by race" in result + + def test_includes_data(self, sample_school_config, sample_query_result_data): + result = build_summarizer_prompt(sample_school_config, sample_query_result_data) + assert "Black" in result + assert "0.41" in result or "41" in result + + def test_includes_output_schema(self, sample_school_config, sample_query_result_data): + result = build_summarizer_prompt(sample_school_config, sample_query_result_data) + assert "summary" in result + assert "key_insights" in result + assert "action_items" in result + + def test_returns_string(self, sample_school_config, 
sample_query_result_data): + result = build_summarizer_prompt(sample_school_config, sample_query_result_data) + assert isinstance(result, str) + + +class TestStudentPrompts: + def test_explainer_student_system_is_concise(self): + assert len(EXPLAINER_STUDENT_SYSTEM) < 500 + assert "JSON" in EXPLAINER_STUDENT_SYSTEM + + def test_summarizer_student_system_is_concise(self): + assert len(SUMMARIZER_STUDENT_SYSTEM) < 500 + assert "JSON" in SUMMARIZER_STUDENT_SYSTEM + + +class TestOutputSchemas: + def test_explainer_schema_has_required_keys(self): + required = {"explanation", "structural_factors", "student_impact", + "advisor_recommendation", "data_limitations", "related_intervention"} + assert required == set(EXPLAINER_SCHEMA.keys()) + + def test_summarizer_schema_has_required_keys(self): + required = {"summary", "key_insights", "context", "action_items", "caveats"} + assert required == set(SUMMARIZER_SCHEMA.keys()) +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -m pytest tests/training/test_prompts.py -v` +Expected: FAIL — `ModuleNotFoundError: No module named 'training.prompts'` + +- [ ] **Step 3: Write the implementation** + +Create `training/prompts.py`: +```python +"""Teacher prompt templates for the distillation pipeline. + +Provides school-agnostic prompt builders that inject per-school context +from config.yaml to generate high-quality training pairs. 
+""" + +from __future__ import annotations + +import json +from typing import Any + +# --------------------------------------------------------------------------- +# Output schemas — define what the fine-tuned model produces +# --------------------------------------------------------------------------- + +EXPLAINER_SCHEMA = { + "explanation": "2-3 sentence plain-language explanation of the course pairing pattern", + "structural_factors": ["list of institutional or systemic factors driving this pattern"], + "student_impact": "what this means for students taking these courses", + "advisor_recommendation": "one actionable next step for advisors", + "data_limitations": ["caveats about interpreting this data"], + "related_intervention": "existing program that addresses this, or null", +} + +SUMMARIZER_SCHEMA = { + "summary": "2-3 sentence headline finding from the query results", + "key_insights": ["list of notable patterns in the data"], + "context": "how this connects to institutional priorities or known challenges", + "action_items": ["what someone should do with this information"], + "caveats": ["data limitations relevant to this specific query"], +} + +# --------------------------------------------------------------------------- +# Student system prompts (what the fine-tuned model sees at inference) +# --------------------------------------------------------------------------- + +EXPLAINER_STUDENT_SYSTEM = ( + "You are a student success analyst. Given course pairing data, generate a " + "structured JSON explanation. Include: explanation, structural_factors, " + "student_impact, advisor_recommendation, data_limitations, and " + "related_intervention. Respond with ONLY valid JSON." +) + +SUMMARIZER_STUDENT_SYSTEM = ( + "You are a student success analyst. Given a query and its results, generate " + "a structured JSON summary. Include: summary, key_insights, context, " + "action_items, and caveats. Respond with ONLY valid JSON." 
+) + +# --------------------------------------------------------------------------- +# Context builder — extracts relevant sections from school config +# --------------------------------------------------------------------------- + + +def build_system_prompt(config: dict[str, Any]) -> str: + """Build the teacher system prompt with full institutional context. + + Injects school identity, demographics, challenges, interventions, + equity gaps, and priorities from the school config. + + Args: + config: Parsed school config dict. + + Returns: + System prompt string for the teacher model. + """ + school = config["school"] + domain = config["domain"] + + sections = [] + + # Identity + name = school["name"] + location = school.get("location", {}) + city = location.get("city", "") + state = location.get("state", "") + school_type = school.get("type", "institution") + sections.append( + f"You are a student success analyst at {name}, " + f"a {school_type} in {city}, {state}." + ) + + # Designation + designations = school.get("designation", []) + if designations: + sections.append(f"Institutional designations: {', '.join(designations)}.") + + # Enrollment + enrollment = school.get("enrollment", {}) + if enrollment: + parts = [] + if "total_headcount" in enrollment: + parts.append(f"{enrollment['total_headcount']:,} students") + if "percent_part_time" in enrollment: + parts.append(f"{enrollment['percent_part_time']:.0%} part-time") + if "percent_online" in enrollment: + parts.append(f"{enrollment['percent_online']:.0%} online") + if enrollment.get("open_admission"): + parts.append("open admission") + if parts: + sections.append(f"Enrollment profile: {', '.join(parts)}.") + + # Demographics + demographics = school.get("demographics", {}) + if demographics: + parts = [] + for key, label in [ + ("percent_pell_eligible", "Pell-eligible"), + ("percent_first_gen", "first-generation"), + ("percent_adult_learners", "adult learners (25+)"), + ]: + if key in demographics: + 
parts.append(f"{demographics[key]:.0%} {label}") + if parts: + sections.append(f"Student demographics: {', '.join(parts)}.") + + # Programs + programs = domain.get("programs", []) + if programs: + program_names = [p["name"] for p in programs[:5]] + sections.append(f"Key programs: {', '.join(program_names)}.") + + # Challenges + challenges = school.get("challenges", []) + if challenges: + sections.append("Known challenges:\n" + "\n".join(f"- {c}" for c in challenges)) + + # Strengths + strengths = school.get("strengths", []) + if strengths: + sections.append("Institutional strengths:\n" + "\n".join(f"- {s}" for s in strengths)) + + # Equity gaps + equity = school.get("equity", {}) + known_gaps = equity.get("known_gaps", []) + if known_gaps: + gap_lines = [] + for gap in known_gaps: + ga = gap.get("group_a", {}) + gb = gap.get("group_b", {}) + gap_lines.append( + f"- {gap['metric']}: {ga.get('name', '?')} ({ga.get('value', '?')}) " + f"vs {gb.get('name', '?')} ({gb.get('value', '?')})" + ) + sections.append("Known equity gaps:\n" + "\n".join(gap_lines)) + + # Interventions + interventions = school.get("interventions", {}) + active = interventions.get("active", []) + if active: + lines = [] + for i in active: + line = f"- {i['name']} ({i['type']}): {i.get('effectiveness', 'effectiveness unknown')}" + lines.append(line) + sections.append("Active interventions:\n" + "\n".join(lines)) + + # Priorities + priorities = school.get("priorities", {}) + top_goals = priorities.get("top_goals", []) + if top_goals: + sections.append("Strategic priorities:\n" + "\n".join(f"- {g}" for g in top_goals)) + + # Data caveats + caveats = school.get("data_caveats", []) + if caveats: + sections.append("Data caveats:\n" + "\n".join(f"- {c}" for c in caveats)) + + # Completion context + completion = school.get("completion", {}) + if completion: + parts = [] + if "ipeds_graduation_rate" in completion: + parts.append(f"IPEDS grad rate: {completion['ipeds_graduation_rate']:.0%}") + if 
"adjusted_completion_rate" in completion: + parts.append(f"adjusted completion: {completion['adjusted_completion_rate']:.0%}") + barriers = completion.get("top_completion_barriers", []) + if barriers: + parts.append(f"top barriers: {', '.join(b.replace('_', ' ') for b in barriers)}") + if parts: + sections.append(f"Completion context: {'; '.join(parts)}.") + + # Student life + student_life = school.get("student_life", {}) + if student_life: + parts = [] + if "percent_working_over_20hrs" in student_life: + parts.append(f"{student_life['percent_working_over_20hrs']:.0%} working 20+ hrs/wk") + if "food_insecurity_rate" in student_life: + parts.append(f"{student_life['food_insecurity_rate']:.0%} food insecure") + if "percent_single_parents" in student_life: + parts.append(f"{student_life['percent_single_parents']:.0%} single parents") + if parts: + sections.append(f"Student life: {', '.join(parts)}.") + + # Patterns + patterns = school.get("patterns", {}) + attrition_points = patterns.get("high_attrition_points", []) + if attrition_points: + lines = [] + for point in attrition_points: + when = f"week {point['week']}" if "week" in point else point.get("month", "?") + lines.append(f"- {when}: {point['reason']}") + sections.append("Known attrition patterns:\n" + "\n".join(lines)) + + # Workforce + workforce = school.get("workforce", {}) + if workforce: + employers = workforce.get("top_employers", []) + fields = workforce.get("high_demand_fields", []) + if employers or fields: + parts = [] + if employers: + parts.append(f"top employers: {', '.join(employers)}") + if fields: + parts.append(f"high-demand fields: {', '.join(fields)}") + sections.append(f"Workforce context: {'; '.join(parts)}.") + + # Outcomes + outcomes = school.get("outcomes", {}) + if outcomes: + parts = [] + if "job_placement_rate_6mo" in outcomes: + parts.append(f"6-month job placement: {outcomes['job_placement_rate_6mo']:.0%}") + licensure = outcomes.get("licensure_pass_rates", {}) + if licensure: + 
lic_parts = [f"{k}: {v:.0%}" for k, v in licensure.items()] + parts.append(f"licensure pass rates: {', '.join(lic_parts)}") + if parts: + sections.append(f"Outcomes: {'; '.join(parts)}.") + + sections.append("Respond with ONLY valid JSON.") + + return "\n\n".join(sections) + + +# --------------------------------------------------------------------------- +# Explainer prompt +# --------------------------------------------------------------------------- + + +def build_explainer_prompt( + config: dict[str, Any], + course_data: dict[str, Any], +) -> str: + """Build the teacher prompt for generating a course pairing explanation. + + Args: + config: Parsed school config dict. + course_data: Course pairing data dict with keys: course_a, course_b, stats. + + Returns: + User prompt string for the teacher model. + """ + schema_str = json.dumps(EXPLAINER_SCHEMA, indent=2) + data_str = json.dumps(course_data, indent=2, default=str) + + terminology = config.get("domain", {}).get("terminology", {}) + term_lines = "\n".join(f"- {k}: {v}" for k, v in terminology.items()) if terminology else "" + + return f"""Analyze the following course pairing data and explain the pattern. + +COURSE PAIRING DATA: +{data_str} + +{f"TERMINOLOGY:{chr(10)}{term_lines}{chr(10)}" if term_lines else ""} +Generate a JSON response with this exact schema: +{schema_str} + +Guidelines: +- Explain the pattern in plain language accessible to advisors and faculty. +- Connect structural factors to the institution's known challenges and context. +- Make the advisor recommendation specific and actionable. +- Reference existing interventions if relevant. +- Note any data limitations that affect interpretation. 
+- Do NOT speculate beyond what the data shows.""" + + +# --------------------------------------------------------------------------- +# Summarizer prompt +# --------------------------------------------------------------------------- + + +def build_summarizer_prompt( + config: dict[str, Any], + query_data: dict[str, Any], +) -> str: + """Build the teacher prompt for generating a query result summary. + + Args: + config: Parsed school config dict. + query_data: Dict with keys: prompt, data, rowCount, vizType. + + Returns: + User prompt string for the teacher model. + """ + schema_str = json.dumps(SUMMARIZER_SCHEMA, indent=2) + data_str = json.dumps(query_data["data"][:50], indent=2, default=str) + user_query = query_data["prompt"] + row_count = query_data.get("rowCount", len(query_data["data"])) + viz_type = query_data.get("vizType", "table") + + return f"""Summarize the following query results for a non-technical audience +(advisors, administrators, faculty). + +USER QUERY: {user_query} +VISUALIZATION TYPE: {viz_type} +TOTAL ROWS: {row_count} + +RESULTS: +{data_str} + +Generate a JSON response with this exact schema: +{schema_str} + +Guidelines: +- Lead with the most important finding. +- Connect insights to institutional context and priorities. +- Make action items specific to the roles that would see this data. +- Note data limitations relevant to this specific query. 
+- Do NOT hallucinate data points not present in the results.""" +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -m pytest tests/training/test_prompts.py -v` +Expected: All tests PASS + +- [ ] **Step 5: Commit** + +```bash +git add training/prompts.py tests/training/test_prompts.py +git commit -m "feat(training): teacher prompt templates for explainer and summarizer" +``` + +--- + +## Task 5: Seed Data Generation + +**Files:** +- Create: `training/seed.py` +- Create: `tests/training/test_seed.py` + +- [ ] **Step 1: Write the failing tests** + +Create `tests/training/test_seed.py`: +```python +"""Tests for training.seed — seed data generation.""" + +import pytest +import yaml +from pathlib import Path +from unittest.mock import patch + +from training.seed import ( + load_seed_queries, + generate_synthetic_course_pairings, + generate_synthetic_query_results, + format_as_chatml, +) + + +class TestLoadSeedQueries: + def test_loads_valid_yaml(self, tmp_path): + seed_file = tmp_path / "seed_queries.yaml" + seed_file.write_text(yaml.dump({ + "explainer": [ + {"query": "MAT 100 and BIO 201", "style": "advisor"}, + ], + "summarizer": [ + {"query": "retention by race", "style": "faculty"}, + ], + })) + + with patch("training.seed.get_school_dir", return_value=tmp_path): + result = load_seed_queries("test-school") + + assert len(result["explainer"]) == 1 + assert len(result["summarizer"]) == 1 + assert result["explainer"][0]["query"] == "MAT 100 and BIO 201" + + def test_returns_empty_on_missing_file(self, tmp_path): + with patch("training.seed.get_school_dir", return_value=tmp_path): + result = load_seed_queries("test-school") + assert result == {"explainer": [], "summarizer": []} + + +class TestGenerateSyntheticCoursePairings: + def test_generates_requested_count(self, sample_school_config): + results = generate_synthetic_course_pairings(sample_school_config, count=5) + assert 
len(results) == 5 + + def test_each_has_required_keys(self, sample_school_config): + results = generate_synthetic_course_pairings(sample_school_config, count=3) + for r in results: + assert "course_a" in r + assert "course_b" in r + assert "stats" in r + assert "prefix" in r["course_a"] + assert "number" in r["course_a"] + + def test_returns_empty_for_zero(self, sample_school_config): + results = generate_synthetic_course_pairings(sample_school_config, count=0) + assert results == [] + + +class TestGenerateSyntheticQueryResults: + def test_generates_requested_count(self, sample_school_config): + results = generate_synthetic_query_results(sample_school_config, count=5) + assert len(results) == 5 + + def test_each_has_required_keys(self, sample_school_config): + results = generate_synthetic_query_results(sample_school_config, count=3) + for r in results: + assert "prompt" in r + assert "data" in r + assert "rowCount" in r + assert "vizType" in r + + def test_returns_empty_for_zero(self, sample_school_config): + results = generate_synthetic_query_results(sample_school_config, count=0) + assert results == [] + + +class TestFormatAsChatML: + def test_format_structure(self): + result = format_as_chatml("system", "user", "assistant") + assert "messages" in result + assert len(result["messages"]) == 3 + assert result["messages"][0] == {"role": "system", "content": "system"} + assert result["messages"][1] == {"role": "user", "content": "user"} + assert result["messages"][2] == {"role": "assistant", "content": "assistant"} +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -m pytest tests/training/test_seed.py -v` +Expected: FAIL — `ModuleNotFoundError: No module named 'training.seed'` + +- [ ] **Step 3: Write the implementation** + +Create `training/seed.py`: +```python +"""Seed data generation for the distillation pipeline. 
+ +Generates synthetic course pairing data and query results to serve as +inputs for the teacher model during distillation. Also loads template +seed queries from the school's seed_queries.yaml. +""" + +from __future__ import annotations + +import random +from typing import Any + +import yaml + +from training.config import get_school_dir + +# --------------------------------------------------------------------------- +# Common course data for synthetic generation +# --------------------------------------------------------------------------- + +_PREFIXES = ["MAT", "ENG", "BIO", "CIS", "WDT", "HIS", "PSY", "BUS", "NUR", "EMS"] +_NUMBERS = ["100", "101", "110", "201", "202", "210", "241", "246"] +_NAMES = { + "MAT 100": "Intermediate Algebra", + "MAT 110": "Finite Mathematics", + "MAT 201": "Calculus I", + "ENG 101": "English Composition I", + "ENG 102": "English Composition II", + "BIO 201": "Anatomy & Physiology I", + "BIO 202": "Anatomy & Physiology II", + "CIS 146": "Microcomputer Applications", + "CIS 201": "Introduction to Programming", + "WDT 108": "SMAW Fillet/OFC", + "WDT 109": "SMAW Fillet/PAC/CAC", + "HIS 201": "United States History I", + "PSY 200": "General Psychology", + "BUS 241": "Principles of Accounting I", + "NUR 102": "Fundamentals of Nursing", + "EMS 100": "EMT Basic", +} +_DELIVERY_METHODS = ["Face-to-Face", "Online", "Hybrid"] +_GRADES = ["A", "B", "C", "D", "F", "W", "I"] +_VIZ_TYPES = ["bar", "line", "pie", "kpi", "table"] + +_QUERY_TEMPLATES = [ + ("retention rate by {dim} for {year} cohort", "bar"), + ("overall {metric} trend from 2019 to 2023", "line"), + ("{metric} for first-generation students", "kpi"), + ("{metric} by enrollment intensity", "bar"), + ("top 10 courses with highest DFW rates", "table"), + ("{metric} by {dim}", "bar"), + ("students with {alert} early warning alert", "kpi"), + ("{metric} distribution by program", "bar"), + ("{metric} gap between full-time and part-time students", "bar"), + ("at-risk student count by {dim}", 
"pie"), +] + +_DIMS = ["race", "gender", "cohort", "program", "enrollment intensity", "math placement"] +_METRICS = ["retention rate", "completion rate", "GPA", "DFW rate", "pass rate"] +_ALERTS = ["URGENT", "HIGH", "MODERATE"] +_YEARS = ["2019", "2020", "2021", "2022", "2023"] +_RACES = ["Black", "White", "Hispanic", "Asian", "Two or More", "Unknown"] + + +# --------------------------------------------------------------------------- +# Seed query loader +# --------------------------------------------------------------------------- + + +def load_seed_queries(school: str) -> dict[str, list[dict]]: + """Load seed queries from a school's seed_queries.yaml. + + Args: + school: School directory name. + + Returns: + Dict with "explainer" and "summarizer" lists of query dicts. + """ + seed_path = get_school_dir(school) / "seed_queries.yaml" + if not seed_path.exists(): + return {"explainer": [], "summarizer": []} + + with seed_path.open("r", encoding="utf-8") as fh: + data = yaml.safe_load(fh) or {} + + return { + "explainer": data.get("explainer", []), + "summarizer": data.get("summarizer", []), + } + + +# --------------------------------------------------------------------------- +# Synthetic course pairing generation +# --------------------------------------------------------------------------- + + +def _random_course() -> dict[str, str]: + """Generate a random course identifier.""" + prefix = random.choice(_PREFIXES) + number = random.choice(_NUMBERS) + key = f"{prefix} {number}" + name = _NAMES.get(key, f"{prefix} {number} Course") + return {"prefix": prefix, "number": number, "name": name} + + +def _random_stats() -> dict[str, Any]: + """Generate random course pairing statistics.""" + dfwi_a = round(random.uniform(0.15, 0.55), 2) + dfwi_b = round(random.uniform(0.15, 0.55), 2) + co_count = random.randint(20, 200) + co_dfwi = round(random.uniform(min(dfwi_a, dfwi_b), max(dfwi_a, dfwi_b) + 0.1), 2) + co_dfwi = min(co_dfwi, 0.75) + + delivery_breakdown = [] + remaining 
= co_count + for method in _DELIVERY_METHODS: + if method == _DELIVERY_METHODS[-1]: + count = remaining + else: + count = random.randint(5, remaining - 5 * (len(_DELIVERY_METHODS) - len(delivery_breakdown) - 1)) + count = max(count, 1) + remaining -= count + delivery_breakdown.append({ + "method": method, + "count": count, + "dfwi_rate": round(random.uniform(0.15, 0.55), 2), + }) + + return { + "course_a_dfwi": dfwi_a, + "course_b_dfwi": dfwi_b, + "co_enrollment_count": co_count, + "co_enrollment_dfwi": co_dfwi, + "delivery_breakdown": delivery_breakdown, + } + + +def generate_synthetic_course_pairings( + config: dict[str, Any], + count: int, +) -> list[dict[str, Any]]: + """Generate synthetic course pairing data for explainer training. + + Args: + config: Parsed school config dict (used for program-aware generation). + count: Number of pairings to generate. + + Returns: + List of course pairing data dicts. + """ + if count == 0: + return [] + + results = [] + for _ in range(count): + course_a = _random_course() + course_b = _random_course() + while course_b["prefix"] == course_a["prefix"] and course_b["number"] == course_a["number"]: + course_b = _random_course() + results.append({ + "course_a": course_a, + "course_b": course_b, + "stats": _random_stats(), + }) + return results + + +# --------------------------------------------------------------------------- +# Synthetic query result generation +# --------------------------------------------------------------------------- + + +def generate_synthetic_query_results( + config: dict[str, Any], + count: int, +) -> list[dict[str, Any]]: + """Generate synthetic query results for summarizer training. + + Args: + config: Parsed school config dict. + count: Number of query results to generate. + + Returns: + List of query result dicts with prompt, data, rowCount, vizType. 
+ """ + if count == 0: + return [] + + results = [] + for i in range(count): + template, default_viz = _QUERY_TEMPLATES[i % len(_QUERY_TEMPLATES)] + prompt = template.format( + dim=random.choice(_DIMS), + metric=random.choice(_METRICS), + year=random.choice(_YEARS), + alert=random.choice(_ALERTS), + ) + + # Generate plausible result rows + num_rows = random.randint(2, 8) + data = [] + for _ in range(num_rows): + row = { + "Race": random.choice(_RACES), + "value": round(random.uniform(0.15, 0.85), 2), + "count": random.randint(10, 500), + } + data.append(row) + + results.append({ + "prompt": prompt, + "data": data, + "rowCount": num_rows, + "vizType": default_viz, + }) + + return results + + +# --------------------------------------------------------------------------- +# ChatML formatter +# --------------------------------------------------------------------------- + + +def format_as_chatml(system: str, user: str, assistant: str) -> dict: + """Format a (system, user, assistant) triple as a ChatML messages dict. + + Args: + system: The system prompt text. + user: The user message text. + assistant: The assistant response text. + + Returns: + A dict with a "messages" key containing a list of 3 role/content dicts. 
+ """ + return { + "messages": [ + {"role": "system", "content": system}, + {"role": "user", "content": user}, + {"role": "assistant", "content": assistant}, + ] + } +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -m pytest tests/training/test_seed.py -v` +Expected: All tests PASS + +- [ ] **Step 5: Commit** + +```bash +git add training/seed.py tests/training/test_seed.py +git commit -m "feat(training): seed data generation for explainer and summarizer" +``` + +--- + +## Task 6: Distillation Pipeline + +**Files:** +- Create: `training/distill.py` +- Create: `tests/training/test_distill.py` + +- [ ] **Step 1: Write the failing tests** + +Create `tests/training/test_distill.py`: +```python +"""Tests for training.distill — teacher model distillation.""" + +import json +import pytest +from unittest.mock import patch, MagicMock + +from training.distill import ( + validate_json, + call_teacher, + generate_explainer_pairs, + generate_summarizer_pairs, +) + + +class TestValidateJson: + def test_valid_json(self): + result = validate_json('{"key": "value"}') + assert result == {"key": "value"} + + def test_strips_markdown_fences(self): + result = validate_json('```json\n{"key": "value"}\n```') + assert result == {"key": "value"} + + def test_returns_none_for_invalid(self): + assert validate_json("not json") is None + + def test_returns_none_for_empty(self): + assert validate_json("") is None + assert validate_json(None) is None + + def test_returns_none_for_non_dict(self): + assert validate_json("[1, 2, 3]") is None + + +class TestCallTeacher: + def test_calls_anthropic_backend(self): + mock_client = MagicMock() + mock_message = MagicMock() + mock_message.content = [MagicMock(text='{"result": "ok"}')] + mock_message.usage.input_tokens = 100 + mock_message.usage.output_tokens = 50 + mock_client.messages.create.return_value = mock_message + + with 
patch("training.distill._get_anthropic_client", return_value=mock_client): + result = call_teacher( + system="system prompt", + user="user prompt", + backend="anthropic", + model="claude-sonnet-4-20250514", + ) + + assert result == '{"result": "ok"}' + mock_client.messages.create.assert_called_once() + + def test_calls_ollama_backend(self): + mock_response = {"message": {"content": '{"result": "ok"}'}} + + with patch("training.distill.ollama") as mock_ollama: + mock_ollama.chat.return_value = mock_response + result = call_teacher( + system="system prompt", + user="user prompt", + backend="ollama", + model="qwen3.5:27b", + ) + + assert result == '{"result": "ok"}' + mock_ollama.chat.assert_called_once() + + +class TestGenerateExplainerPairs: + def test_generates_pairs_from_seed_data(self, sample_school_config, sample_course_pairing_data): + mock_response = json.dumps({ + "explanation": "Test explanation", + "structural_factors": ["factor1"], + "student_impact": "impact", + "advisor_recommendation": "recommendation", + "data_limitations": ["caveat"], + "related_intervention": None, + }) + + with patch("training.distill.call_teacher", return_value=mock_response): + pairs = generate_explainer_pairs( + config=sample_school_config, + seed_data=[sample_course_pairing_data], + count=2, + ) + + assert len(pairs) == 2 + assert "messages" in pairs[0] + assert len(pairs[0]["messages"]) == 3 + + def test_skips_invalid_responses(self, sample_school_config, sample_course_pairing_data): + with patch("training.distill.call_teacher", return_value="not json"): + pairs = generate_explainer_pairs( + config=sample_school_config, + seed_data=[sample_course_pairing_data], + count=3, + ) + + assert len(pairs) == 0 + + +class TestGenerateSummarizerPairs: + def test_generates_pairs_from_seed_data(self, sample_school_config, sample_query_result_data): + mock_response = json.dumps({ + "summary": "Test summary", + "key_insights": ["insight1"], + "context": "context", + "action_items": 
["action"], + "caveats": ["caveat"], + }) + + with patch("training.distill.call_teacher", return_value=mock_response): + pairs = generate_summarizer_pairs( + config=sample_school_config, + seed_data=[sample_query_result_data], + count=2, + ) + + assert len(pairs) == 2 + assert "messages" in pairs[0] +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -m pytest tests/training/test_distill.py -v` +Expected: FAIL — `ModuleNotFoundError: No module named 'training.distill'` + +- [ ] **Step 3: Write the implementation** + +Create `training/distill.py`: +```python +"""Distillation pipeline — generate ChatML training pairs via a teacher model. + +Supports two backends: + - anthropic: Claude Sonnet via Anthropic API (production quality) + - ollama: Local model via Ollama (free iteration) + +Usage: + python -m training.distill --school bishop-state [--local] +""" + +from __future__ import annotations + +import argparse +import functools +import json +import os +import time +from pathlib import Path +from typing import Any + +from training.config import get_training_data_dir, load_school_config, write_jsonl +from training.prompts import ( + EXPLAINER_STUDENT_SYSTEM, + SUMMARIZER_STUDENT_SYSTEM, + build_explainer_prompt, + build_summarizer_prompt, + build_system_prompt, +) +from training.seed import ( + format_as_chatml, + generate_synthetic_course_pairings, + generate_synthetic_query_results, + load_seed_queries, +) + +# --------------------------------------------------------------------------- +# Cost tracking +# --------------------------------------------------------------------------- + +_COST_PER_M_INPUT = 3.00 +_COST_PER_M_OUTPUT = 15.00 +_total_input_tokens = 0 +_total_output_tokens = 0 +_total_calls = 0 + + +def _track_cost(input_tokens: int, output_tokens: int) -> None: + global _total_input_tokens, _total_output_tokens, _total_calls + _total_input_tokens += input_tokens + 
_total_output_tokens += output_tokens + _total_calls += 1 + + +def _cost_so_far() -> float: + return ( + _total_input_tokens / 1_000_000 * _COST_PER_M_INPUT + + _total_output_tokens / 1_000_000 * _COST_PER_M_OUTPUT + ) + + +def _print_cost_summary() -> None: + cost = _cost_so_far() + print( + f"[cost] {_total_calls} API calls | " + f"{_total_input_tokens:,} in + {_total_output_tokens:,} out tokens | " + f"${cost:.2f} spent so far", + flush=True, + ) + + +# --------------------------------------------------------------------------- +# JSON validation +# --------------------------------------------------------------------------- + + +def validate_json(text: str | None) -> dict | None: + """Strip markdown fences and parse as JSON dict. + + Returns None if text is empty, not valid JSON, or not a dict. + """ + if not text or not isinstance(text, str) or not text.strip(): + return None + + stripped = text.strip() + + if stripped.startswith("```"): + lines = stripped.splitlines() + lines = lines[1:] + if lines and lines[-1].strip() == "```": + lines = lines[:-1] + stripped = "\n".join(lines).strip() + + try: + obj = json.loads(stripped) + except (json.JSONDecodeError, ValueError): + return None + + if not isinstance(obj, dict): + return None + + return obj + + +# --------------------------------------------------------------------------- +# Teacher model caller +# --------------------------------------------------------------------------- + + +@functools.lru_cache(maxsize=1) +def _get_anthropic_client(): + """Return a cached Anthropic client instance.""" + import anthropic + + api_key = os.environ.get("ANTHROPIC_API_KEY") + if not api_key: + raise EnvironmentError( + "ANTHROPIC_API_KEY environment variable is required for Claude distillation." 
+ ) + return anthropic.Anthropic(api_key=api_key) + + +try: + import ollama +except ImportError: + ollama = None # type: ignore[assignment] + + +def call_teacher( + system: str, + user: str, + backend: str, + model: str, +) -> str: + """Call the teacher model and return the response text. + + Args: + system: System prompt. + user: User message. + backend: "anthropic" or "ollama". + model: Model identifier. + + Returns: + The assistant response as a string. + """ + preview = user[:120].replace("\n", " ") + print(f"[api] Calling {model} ({backend}) | {preview}...", flush=True) + + if backend == "anthropic": + client = _get_anthropic_client() + message = client.messages.create( + model=model, + max_tokens=2048, + system=system, + messages=[{"role": "user", "content": user}], + ) + usage = message.usage + _track_cost(usage.input_tokens, usage.output_tokens) + print(f"[api] done {usage.input_tokens}in/{usage.output_tokens}out tokens", flush=True) + if _total_calls % 10 == 0: + _print_cost_summary() + return message.content[0].text + + elif backend == "ollama": + if ollama is None: + raise ImportError("ollama package is required for local teacher. Install with: pip install ollama") + response = ollama.chat( + model=model, + messages=[ + {"role": "system", "content": system}, + {"role": "user", "content": user}, + ], + ) + return response["message"]["content"] + + else: + raise ValueError(f"Unknown backend: {backend!r}. Must be 'anthropic' or 'ollama'.") + + +# --------------------------------------------------------------------------- +# Pair generators +# --------------------------------------------------------------------------- + + +def generate_explainer_pairs( + config: dict[str, Any], + seed_data: list[dict[str, Any]], + count: int, + outfile: Path | None = None, +) -> list[dict]: + """Generate explainer training pairs via teacher model distillation. + + Args: + config: Parsed school config dict. + seed_data: List of course pairing data dicts. 
+ count: Number of pairs to generate. + outfile: If provided, pairs are written incrementally. + + Returns: + List of ChatML pair dicts. + """ + distill_config = config.get("distillation", {}) + backend = distill_config.get("teacher_backend", "anthropic") + model = distill_config.get("teacher_model", "claude-sonnet-4-20250514") + + system_prompt = build_system_prompt(config) + pairs: list[dict] = [] + + fh = None + if outfile is not None: + outfile.parent.mkdir(parents=True, exist_ok=True) + fh = outfile.open("w", encoding="utf-8") + + try: + for idx in range(count): + if idx > 0 and idx % 25 == 0: + time.sleep(1) + + course_data = seed_data[idx % len(seed_data)] + teacher_prompt = build_explainer_prompt(config, course_data) + + try: + response_text = call_teacher(system_prompt, teacher_prompt, backend, model) + except Exception as exc: + print(f"[warn] Teacher call failed for explainer pair {idx}: {exc}", flush=True) + continue + + validated = validate_json(response_text) + if validated is None: + print(f"[warn] Invalid JSON for explainer pair {idx}, skipping.", flush=True) + continue + + student_user = json.dumps(course_data, ensure_ascii=False, default=str) + pair = format_as_chatml( + system=EXPLAINER_STUDENT_SYSTEM, + user=student_user, + assistant=json.dumps(validated, ensure_ascii=False), + ) + pairs.append(pair) + if fh is not None: + fh.write(json.dumps(pair, ensure_ascii=False) + "\n") + fh.flush() + print(f"[explainer] {len(pairs)}/{count} pairs generated", flush=True) + finally: + if fh is not None: + fh.close() + print(f"[explainer] Saved {len(pairs)} pairs to {outfile}", flush=True) + + return pairs + + +def generate_summarizer_pairs( + config: dict[str, Any], + seed_data: list[dict[str, Any]], + count: int, + outfile: Path | None = None, +) -> list[dict]: + """Generate summarizer training pairs via teacher model distillation. + + Args: + config: Parsed school config dict. + seed_data: List of query result data dicts. 
+ count: Number of pairs to generate. + outfile: If provided, pairs are written incrementally. + + Returns: + List of ChatML pair dicts. + """ + distill_config = config.get("distillation", {}) + backend = distill_config.get("teacher_backend", "anthropic") + model = distill_config.get("teacher_model", "claude-sonnet-4-20250514") + + system_prompt = build_system_prompt(config) + pairs: list[dict] = [] + + fh = None + if outfile is not None: + outfile.parent.mkdir(parents=True, exist_ok=True) + fh = outfile.open("w", encoding="utf-8") + + try: + for idx in range(count): + if idx > 0 and idx % 25 == 0: + time.sleep(1) + + query_data = seed_data[idx % len(seed_data)] + teacher_prompt = build_summarizer_prompt(config, query_data) + + try: + response_text = call_teacher(system_prompt, teacher_prompt, backend, model) + except Exception as exc: + print(f"[warn] Teacher call failed for summarizer pair {idx}: {exc}", flush=True) + continue + + validated = validate_json(response_text) + if validated is None: + print(f"[warn] Invalid JSON for summarizer pair {idx}, skipping.", flush=True) + continue + + student_user = json.dumps( + {"prompt": query_data["prompt"], "data": query_data["data"][:50]}, + ensure_ascii=False, + default=str, + ) + pair = format_as_chatml( + system=SUMMARIZER_STUDENT_SYSTEM, + user=student_user, + assistant=json.dumps(validated, ensure_ascii=False), + ) + pairs.append(pair) + if fh is not None: + fh.write(json.dumps(pair, ensure_ascii=False) + "\n") + fh.flush() + print(f"[summarizer] {len(pairs)}/{count} pairs generated", flush=True) + finally: + if fh is not None: + fh.close() + print(f"[summarizer] Saved {len(pairs)} pairs to {outfile}", flush=True) + + return pairs + + +# --------------------------------------------------------------------------- +# CLI entry point +# --------------------------------------------------------------------------- + + +def main(school: str, local: bool = False) -> None: + """Run distillation for a school. 
+
+    Args:
+        school: School directory name.
+        local: If True, use local Ollama teacher instead of Claude.
+    """
+    config = load_school_config(school)
+    config.setdefault("distillation", {})  # guard: config may omit this section
+    if local:
+        config["distillation"]["teacher_backend"] = config["distillation"].get(
+            "local_teacher_backend", "ollama"
+        )
+        config["distillation"]["teacher_model"] = config["distillation"].get(
+            "local_teacher_model", "qwen3.5:27b"
+        )
+        print(f"[distill] Using local teacher: {config['distillation']['teacher_model']}")
+    else:
+        print(f"[distill] Using API teacher: {config['distillation']['teacher_model']}")
+
+    pairs_per_task = config["distillation"].get("pairs_per_task", 1500)
+    data_dir = get_training_data_dir(school)
+    pairs_dir = data_dir / "pairs"
+
+    # Load seed queries
+    seed_queries = load_seed_queries(school)
+
+    # Generate synthetic seed data
+    synthetic_pairings = generate_synthetic_course_pairings(config, count=pairs_per_task)
+    synthetic_results = generate_synthetic_query_results(config, count=pairs_per_task)
+
+    # Explainer
+    print(f"\n{'='*60}")
+    print(f"EXPLAINER — generating {pairs_per_task} pairs")
+    print(f"{'='*60}")
+    explainer_pairs = generate_explainer_pairs(
+        config=config,
+        seed_data=synthetic_pairings,
+        count=pairs_per_task,
+        outfile=pairs_dir / "explainer.jsonl",
+    )
+
+    # Summarizer
+    print(f"\n{'='*60}")
+    print(f"SUMMARIZER — generating {pairs_per_task} pairs")
+    print(f"{'='*60}")
+    summarizer_pairs = generate_summarizer_pairs(
+        config=config,
+        seed_data=synthetic_results,
+        count=pairs_per_task,
+        outfile=pairs_dir / "summarizer.jsonl",
+    )
+
+    print(f"\n{'='*60}")
+    print("DISTILLATION COMPLETE")
+    print(f"{'='*60}")
+    print(f"  Explainer:  {len(explainer_pairs)} pairs")
+    print(f"  Summarizer: {len(summarizer_pairs)} pairs")
+    _print_cost_summary()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Generate training pairs via teacher model distillation."
+ ) + parser.add_argument("--school", required=True, help="School directory name") + parser.add_argument("--local", action="store_true", help="Use local Ollama teacher") + args = parser.parse_args() + main(args.school, local=args.local) +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -m pytest tests/training/test_distill.py -v` +Expected: All tests PASS + +- [ ] **Step 5: Commit** + +```bash +git add training/distill.py tests/training/test_distill.py +git commit -m "feat(training): distillation pipeline with dual teacher backend support" +``` + +--- + +## Task 7: Dataset Preparation + +**Files:** +- Create: `training/prepare.py` +- Create: `tests/training/test_prepare.py` + +- [ ] **Step 1: Write the failing tests** + +Create `tests/training/test_prepare.py`: +```python +"""Tests for training.prepare — filter, deduplicate, and split.""" + +import json +import pytest + +from training.prepare import ( + filter_invalid_json, + deduplicate_by_jaccard, + jaccard_similarity, + split_dataset, +) + + +class TestFilterInvalidJson: + def test_keeps_valid_pairs(self): + pairs = [ + {"messages": [ + {"role": "system", "content": "sys"}, + {"role": "user", "content": "question"}, + {"role": "assistant", "content": '{"key": "value"}'}, + ]} + ] + result = filter_invalid_json(pairs) + assert len(result) == 1 + + def test_removes_invalid_json_assistant(self): + pairs = [ + {"messages": [ + {"role": "system", "content": "sys"}, + {"role": "user", "content": "question"}, + {"role": "assistant", "content": "not json"}, + ]} + ] + result = filter_invalid_json(pairs) + assert len(result) == 0 + + def test_removes_missing_messages(self): + assert filter_invalid_json([{"no_messages": True}]) == [] + + def test_removes_empty_user(self): + pairs = [ + {"messages": [ + {"role": "system", "content": "sys"}, + {"role": "user", "content": ""}, + {"role": "assistant", "content": '{"key": "value"}'}, + 
]} + ] + result = filter_invalid_json(pairs) + assert len(result) == 0 + + +class TestJaccardSimilarity: + def test_identical_strings(self): + assert jaccard_similarity("hello world", "hello world") == 1.0 + + def test_completely_different(self): + assert jaccard_similarity("hello", "world") == 0.0 + + def test_partial_overlap(self): + result = jaccard_similarity("hello world foo", "hello world bar") + assert 0.0 < result < 1.0 + + def test_empty_string(self): + assert jaccard_similarity("", "hello") == 0.0 + + +class TestDeduplicateByJaccard: + def test_removes_exact_duplicates(self): + pairs = [ + {"messages": [{"role": "user", "content": "same question"}]}, + {"messages": [{"role": "user", "content": "same question"}]}, + {"messages": [{"role": "user", "content": "different question"}]}, + ] + result = deduplicate_by_jaccard(pairs, threshold=1.0) + assert len(result) == 2 + + def test_empty_input(self): + assert deduplicate_by_jaccard([], threshold=1.0) == [] + + def test_preserves_order(self): + pairs = [ + {"messages": [{"role": "user", "content": "first"}]}, + {"messages": [{"role": "user", "content": "second"}]}, + ] + result = deduplicate_by_jaccard(pairs, threshold=1.0) + assert result[0]["messages"][0]["content"] == "first" + + +class TestSplitDataset: + def test_split_ratios(self): + pairs = [{"id": i} for i in range(100)] + splits = split_dataset(pairs, train_ratio=0.8, val_ratio=0.1) + assert len(splits["train"]) == 80 + assert len(splits["val"]) == 10 + assert len(splits["test"]) == 10 + + def test_deterministic(self): + pairs = [{"id": i} for i in range(50)] + split1 = split_dataset(pairs, seed=42) + split2 = split_dataset(pairs, seed=42) + assert split1["train"] == split2["train"] + + def test_empty_input(self): + splits = split_dataset([]) + assert splits == {"train": [], "val": [], "test": []} +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -m pytest 
tests/training/test_prepare.py -v` +Expected: FAIL — `ModuleNotFoundError: No module named 'training.prepare'` + +- [ ] **Step 3: Write the implementation** + +Create `training/prepare.py`: +```python +"""Dataset preparation — filter, deduplicate, and split training pairs. + +Adapted from d4bl pipeline. Loads raw JSONL from distillation, applies +quality filtering, removes near-duplicates, and writes 80/10/10 splits. + +Usage: + python -m training.prepare --school bishop-state +""" + +from __future__ import annotations + +import argparse +import json +import random +from pathlib import Path +from typing import Any + +from training.config import ( + JACCARD_THRESHOLD, + TRAIN_RATIO, + VAL_RATIO, + get_training_data_dir, + write_jsonl, +) + + +# --------------------------------------------------------------------------- +# Pure helpers +# --------------------------------------------------------------------------- + + +def jaccard_similarity(a: str, b: str) -> float: + """Compute word-level Jaccard similarity between two strings.""" + words_a = set(a.lower().split()) + words_b = set(b.lower().split()) + if not words_a or not words_b: + return 0.0 + return len(words_a & words_b) / len(words_a | words_b) + + +def _get_user_text(pair: dict[str, Any]) -> str: + """Extract user message content from a ChatML pair.""" + for msg in pair.get("messages", []): + if msg.get("role") == "user": + return msg.get("content", "") + return "" + + +# --------------------------------------------------------------------------- +# Filtering +# --------------------------------------------------------------------------- + + +def filter_invalid_json(pairs: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Keep only pairs with valid structure and JSON-parseable assistant content.""" + valid = [] + for pair in pairs: + messages = pair.get("messages") + if not isinstance(messages, list) or not messages: + continue + if any(not isinstance(msg, dict) for msg in messages): + continue + has_user = 
any( + msg.get("role") == "user" and msg.get("content") + for msg in messages + ) + if not has_user: + continue + assistant_content = None + for msg in messages: + if msg.get("role") == "assistant": + assistant_content = msg.get("content") + break + if not isinstance(assistant_content, str) or not assistant_content: + continue + try: + json.loads(assistant_content) + except (json.JSONDecodeError, ValueError): + continue + valid.append(pair) + return valid + + +# --------------------------------------------------------------------------- +# Deduplication +# --------------------------------------------------------------------------- + + +def deduplicate_by_jaccard( + pairs: list[dict[str, Any]], + threshold: float = JACCARD_THRESHOLD, +) -> list[dict[str, Any]]: + """Remove near-duplicate pairs based on user-message Jaccard similarity.""" + if not pairs: + return pairs + + kept: list[dict[str, Any]] = [pairs[0]] + kept_word_sets: list[set] = [set(_get_user_text(pairs[0]).lower().split())] + + for pair in pairs[1:]: + candidate_words = set(_get_user_text(pair).lower().split()) + is_duplicate = any( + _jaccard_sets(candidate_words, kw) >= threshold + for kw in kept_word_sets + ) + if not is_duplicate: + kept.append(pair) + kept_word_sets.append(candidate_words) + + return kept + + +def _jaccard_sets(a: set, b: set) -> float: + if not a or not b: + return 0.0 + return len(a & b) / len(a | b) + + +# --------------------------------------------------------------------------- +# Splitting +# --------------------------------------------------------------------------- + + +def split_dataset( + pairs: list[dict[str, Any]], + train_ratio: float = TRAIN_RATIO, + val_ratio: float = VAL_RATIO, + seed: int = 42, +) -> dict[str, list[dict[str, Any]]]: + """Shuffle and split pairs into train/val/test with a deterministic seed.""" + if not pairs: + return {"train": [], "val": [], "test": []} + + shuffled = list(pairs) + rng = random.Random(seed) + rng.shuffle(shuffled) + + n = 
len(shuffled) + train_end = round(n * train_ratio) + val_end = train_end + round(n * val_ratio) + + return { + "train": shuffled[:train_end], + "val": shuffled[train_end:val_end], + "test": shuffled[val_end:], + } + + +# --------------------------------------------------------------------------- +# I/O +# --------------------------------------------------------------------------- + + +def _load_pairs(path: Path) -> list[dict[str, Any]]: + """Load newline-delimited JSON from path.""" + pairs = [] + with path.open() as fh: + for line in fh: + line = line.strip() + if line: + pairs.append(json.loads(line)) + return pairs + + +# --------------------------------------------------------------------------- +# Orchestrator +# --------------------------------------------------------------------------- + + +def process_task(school: str, task: str) -> dict[str, int]: + """Load, filter, deduplicate, and split training data for a task. + + Args: + school: School directory name. + task: Task name ("explainer" or "summarizer"). + + Returns: + Dict mapping split name to number of examples written. 
+ """ + data_dir = get_training_data_dir(school) + input_path = data_dir / "pairs" / f"{task}.jsonl" + if not input_path.exists(): + raise FileNotFoundError(f"Pairs file not found: {input_path}") + + pairs = _load_pairs(input_path) + print(f"[{task}] Loaded {len(pairs)} pairs from {input_path}") + + pairs = filter_invalid_json(pairs) + print(f"[{task}] After JSON filter: {len(pairs)} pairs") + + pairs = deduplicate_by_jaccard(pairs, threshold=JACCARD_THRESHOLD) + print(f"[{task}] After deduplication: {len(pairs)} pairs") + + splits = split_dataset(pairs) + + final_dir = data_dir / "final" / task + counts: dict[str, int] = {} + for split_name, split_pairs in splits.items(): + out_path = final_dir / f"{split_name}.jsonl" + n = write_jsonl(split_pairs, out_path) + counts[split_name] = n + print(f"[{task}] Wrote {n} examples to {out_path}") + + return counts + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + + +def main(school: str) -> None: + """Run preparation for all tasks.""" + for task in ("explainer", "summarizer"): + try: + process_task(school, task) + except FileNotFoundError as e: + print(f"[warn] {e} — skipping") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Filter, deduplicate, and split training pairs.") + parser.add_argument("--school", required=True, help="School directory name") + args = parser.parse_args() + main(args.school) +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -m pytest tests/training/test_prepare.py -v` +Expected: All tests PASS + +- [ ] **Step 5: Commit** + +```bash +git add training/prepare.py tests/training/test_prepare.py +git commit -m "feat(training): dataset preparation — filter, dedup, and split" +``` + +--- + +## Task 8: Eval Harness and Ship Criteria + +**Files:** +- Create: 
`training/eval.py` +- Create: `tests/training/test_eval.py` + +- [ ] **Step 1: Write the failing tests** + +Create `tests/training/test_eval.py`: +```python +"""Tests for training.eval — metrics and ship criteria.""" + +import json +import pytest + +from training.eval import ( + SHIP_CRITERIA, + check_json_validity, + check_schema_adherence, + check_caveat_inclusion, + check_ship_criteria, + ShipDecision, +) + + +class TestCheckJsonValidity: + def test_all_valid(self): + outputs = ['{"key": "value"}', '{"a": 1}'] + assert check_json_validity(outputs) == 1.0 + + def test_some_invalid(self): + outputs = ['{"key": "value"}', "not json", '{"a": 1}'] + assert check_json_validity(outputs) == pytest.approx(2 / 3) + + def test_empty(self): + assert check_json_validity([]) == 0.0 + + +class TestCheckSchemaAdherence: + def test_explainer_all_valid(self, sample_explainer_output): + outputs = [json.dumps(sample_explainer_output)] + assert check_schema_adherence(outputs, "explainer") == 1.0 + + def test_explainer_missing_key(self): + incomplete = json.dumps({"explanation": "test"}) + assert check_schema_adherence([incomplete], "explainer") < 1.0 + + def test_summarizer_all_valid(self, sample_summarizer_output): + outputs = [json.dumps(sample_summarizer_output)] + assert check_schema_adherence(outputs, "summarizer") == 1.0 + + +class TestCheckCaveatInclusion: + def test_all_have_caveats(self, sample_explainer_output): + outputs = [json.dumps(sample_explainer_output)] + assert check_caveat_inclusion(outputs, "explainer") == 1.0 + + def test_missing_caveats(self): + no_caveats = json.dumps({ + "explanation": "test", + "structural_factors": [], + "student_impact": "impact", + "advisor_recommendation": "rec", + "data_limitations": [], + "related_intervention": None, + }) + assert check_caveat_inclusion([no_caveats], "explainer") == 0.0 + + +class TestShipCriteria: + def test_passes_with_good_metrics(self): + metrics = { + "json_validity": 0.98, + "schema_adherence": 0.95, + 
"caveat_inclusion": 0.92, + "factual_grounding": 0.90, + } + decision = check_ship_criteria(metrics, "explainer") + assert decision.decision == "ship" + assert len(decision.blocking_failures) == 0 + + def test_fails_with_low_json_validity(self): + metrics = { + "json_validity": 0.80, + "schema_adherence": 0.95, + "caveat_inclusion": 0.92, + "factual_grounding": 0.90, + } + decision = check_ship_criteria(metrics, "explainer") + assert decision.decision == "no_ship" + assert len(decision.blocking_failures) > 0 + + def test_ship_with_gaps(self): + metrics = { + "json_validity": 0.98, + "schema_adherence": 0.95, + "caveat_inclusion": 0.85, + "factual_grounding": 0.90, + "explanation_quality": 0.30, # Below non-blocking threshold + } + decision = check_ship_criteria(metrics, "explainer") + assert decision.decision in ("ship", "ship_with_gaps") +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -m pytest tests/training/test_eval.py -v` +Expected: FAIL — `ModuleNotFoundError: No module named 'training.eval'` + +- [ ] **Step 3: Write the implementation** + +Create `training/eval.py`: +```python +"""Evaluation harness and ship criteria for fine-tuned models. + +Runs a fine-tuned model against held-out test data and checks +whether it meets the minimum quality thresholds for deployment. 
+ +Usage: + python -m training.eval --school bishop-state +""" + +from __future__ import annotations + +import argparse +import json +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +from training.config import get_training_data_dir, load_school_config + +# --------------------------------------------------------------------------- +# Ship criteria thresholds +# --------------------------------------------------------------------------- + +SHIP_CRITERIA: dict[str, dict[str, dict]] = { + "explainer": { + "json_validity": {"min": 0.95, "blocking": True}, + "schema_adherence": {"min": 0.90, "blocking": True}, + "caveat_inclusion": {"min": 0.90, "blocking": True}, + "factual_grounding": {"min": 0.85, "blocking": True}, + "explanation_quality": {"min": 0.35, "blocking": False}, + "actionability": {"min": 0.80, "blocking": False}, + }, + "summarizer": { + "json_validity": {"min": 0.95, "blocking": True}, + "schema_adherence": {"min": 0.90, "blocking": True}, + "caveat_inclusion": {"min": 0.90, "blocking": True}, + "factual_grounding": {"min": 0.85, "blocking": True}, + "explanation_quality": {"min": 0.35, "blocking": False}, + "actionability": {"min": 0.80, "blocking": False}, + }, +} + +_EXPLAINER_REQUIRED_KEYS = { + "explanation", "structural_factors", "student_impact", + "advisor_recommendation", "data_limitations", "related_intervention", +} +_SUMMARIZER_REQUIRED_KEYS = { + "summary", "key_insights", "context", "action_items", "caveats", +} +_CAVEAT_KEY = { + "explainer": "data_limitations", + "summarizer": "caveats", +} + + +# --------------------------------------------------------------------------- +# Data classes +# --------------------------------------------------------------------------- + + +@dataclass +class CriterionFailure: + metric: str + threshold: float + actual: float | None + blocking: bool + + +@dataclass +class ShipDecision: + decision: str # "ship", "no_ship", "ship_with_gaps" + blocking_failures: 
list[CriterionFailure] = field(default_factory=list) + nonblocking_failures: list[CriterionFailure] = field(default_factory=list) + metrics_checked: int = 0 + + +# --------------------------------------------------------------------------- +# Metric computation +# --------------------------------------------------------------------------- + + +def check_json_validity(outputs: list[str]) -> float: + """Compute the fraction of outputs that parse as valid JSON dicts.""" + if not outputs: + return 0.0 + valid = 0 + for out in outputs: + try: + obj = json.loads(out) + if isinstance(obj, dict): + valid += 1 + except (json.JSONDecodeError, ValueError, TypeError): + pass + return valid / len(outputs) + + +def check_schema_adherence(outputs: list[str], task: str) -> float: + """Compute the fraction of outputs with all required keys present.""" + if not outputs: + return 0.0 + + required = _EXPLAINER_REQUIRED_KEYS if task == "explainer" else _SUMMARIZER_REQUIRED_KEYS + adherent = 0 + for out in outputs: + try: + obj = json.loads(out) + if isinstance(obj, dict) and required.issubset(obj.keys()): + adherent += 1 + except (json.JSONDecodeError, ValueError, TypeError): + pass + return adherent / len(outputs) + + +def check_caveat_inclusion(outputs: list[str], task: str) -> float: + """Compute the fraction of outputs with non-empty caveat/limitation fields.""" + if not outputs: + return 0.0 + + caveat_key = _CAVEAT_KEY.get(task, "caveats") + with_caveats = 0 + for out in outputs: + try: + obj = json.loads(out) + if isinstance(obj, dict): + caveats = obj.get(caveat_key, []) + if isinstance(caveats, list) and len(caveats) > 0: + with_caveats += 1 + except (json.JSONDecodeError, ValueError, TypeError): + pass + return with_caveats / len(outputs) + + +def check_factual_grounding(outputs: list[str], inputs: list[str]) -> float: + """Check that outputs reference values present in their corresponding inputs. 
+ + Simple heuristic: extracts numeric values from the input and checks + that at least one appears in the output. + """ + if not outputs or not inputs: + return 0.0 + + import re + + grounded = 0 + for out, inp in zip(outputs, inputs): + numbers_in_input = set(re.findall(r"\d+\.?\d*", inp)) + if not numbers_in_input: + grounded += 1 # No numbers to check against + continue + # Check if at least one input number appears in the output + if any(n in out for n in numbers_in_input): + grounded += 1 + + return grounded / len(outputs) + + +# --------------------------------------------------------------------------- +# Ship criteria checker +# --------------------------------------------------------------------------- + + +def check_ship_criteria( + metrics: dict[str, float], + task: str, +) -> ShipDecision: + """Compare metrics against ship thresholds. + + Args: + metrics: Dict of metric_name → value. + task: "explainer" or "summarizer". + + Returns: + ShipDecision with pass/fail details. + """ + criteria = SHIP_CRITERIA.get(task, {}) + blocking_failures = [] + nonblocking_failures = [] + checked = 0 + + for metric_name, spec in criteria.items(): + actual = metrics.get(metric_name) + if actual is None: + continue + checked += 1 + + threshold = spec.get("min", spec.get("max")) + blocking = spec.get("blocking", True) + + failed = False + if "min" in spec and actual < spec["min"]: + failed = True + if "max" in spec and actual > spec["max"]: + failed = True + + if failed: + failure = CriterionFailure( + metric=metric_name, + threshold=threshold, + actual=actual, + blocking=blocking, + ) + if blocking: + blocking_failures.append(failure) + else: + nonblocking_failures.append(failure) + + if blocking_failures: + decision = "no_ship" + elif nonblocking_failures: + decision = "ship_with_gaps" + else: + decision = "ship" + + return ShipDecision( + decision=decision, + blocking_failures=blocking_failures, + nonblocking_failures=nonblocking_failures, + metrics_checked=checked, + ) 
+ + +# --------------------------------------------------------------------------- +# Test set loader +# --------------------------------------------------------------------------- + + +def load_test_set(path: Path) -> list[dict]: + """Load a ChatML JSONL test set and extract input/expected pairs.""" + results = [] + with path.open() as fh: + for line in fh: + if not line.strip(): + continue + example = json.loads(line) + messages = example["messages"] + user_msg = messages[1]["content"] + assistant_msg = messages[2]["content"] + results.append({ + "input": user_msg, + "expected_raw": assistant_msg, + }) + return results + + +# --------------------------------------------------------------------------- +# Eval runner +# --------------------------------------------------------------------------- + + +def run_eval(school: str, task: str) -> ShipDecision: + """Run evaluation for a school's fine-tuned model on one task. + + Loads the test set, runs inference via Ollama, computes metrics, + and checks ship criteria. + + Args: + school: School directory name. + task: "explainer" or "summarizer". + + Returns: + ShipDecision. + """ + data_dir = get_training_data_dir(school) + test_path = data_dir / "final" / task / "test.jsonl" + + if not test_path.exists(): + raise FileNotFoundError(f"Test set not found: {test_path}") + + test_set = load_test_set(test_path) + print(f"[{task}] Loaded {len(test_set)} test examples from {test_path}") + + config = load_school_config(school) + model_name = f"{school}-{task}:{config['training']['default_model'].split(':')[1]}" + + # Run inference + try: + import ollama as ollama_client + except ImportError: + raise ImportError("ollama package required for evaluation. 
Install with: pip install ollama")
+
+    outputs = []
+    inputs = []
+    for i, example in enumerate(test_set):
+        try:
+            response = ollama_client.chat(
+                model=model_name,
+                messages=[
+                    {"role": "user", "content": example["input"]},
+                ],
+            )
+            outputs.append(response["message"]["content"])
+            inputs.append(example["input"])
+        except Exception as exc:
+            print(f"[warn] Inference failed for example {i}: {exc}")
+            outputs.append("")
+            inputs.append(example["input"])
+
+        if (i + 1) % 10 == 0:
+            print(f"[{task}] Evaluated {i + 1}/{len(test_set)} examples", flush=True)
+
+    # Compute metrics
+    metrics = {
+        "json_validity": check_json_validity(outputs),
+        "schema_adherence": check_schema_adherence(outputs, task),
+        "caveat_inclusion": check_caveat_inclusion(outputs, task),
+        "factual_grounding": check_factual_grounding(outputs, inputs),
+    }
+
+    # Print results
+    print(f"\n[{task}] Metrics:")
+    for name, value in metrics.items():
+        threshold_info = SHIP_CRITERIA.get(task, {}).get(name, {})
+        threshold = threshold_info.get("min", threshold_info.get("max", "?"))
+        status = "PASS" if not isinstance(threshold, (int, float)) or value >= threshold else "FAIL"
+        print(f"    {name}: {value:.1%} (threshold: {threshold}) {status}")
+
+    decision = check_ship_criteria(metrics, task)
+    print(f"\n[{task}] DECISION: {decision.decision.upper()}")
+    if decision.blocking_failures:
+        for f in decision.blocking_failures:
+            print(f"  BLOCKING: {f.metric} = {f.actual:.1%} (need {f.threshold})")
+    if decision.nonblocking_failures:
+        for f in decision.nonblocking_failures:
+            print(f"  WARNING: {f.metric} = {f.actual:.1%} (need {f.threshold})")
+
+    return decision
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+def main(school: str) -> None:
+    """Run evaluation for all tasks."""
+    results = {}
+    for task in ("explainer", "summarizer"):
+        try:
+            results[task] = run_eval(school, 
task) + except FileNotFoundError as e: + print(f"[warn] {e} — skipping") + + print(f"\n{'='*60}") + print("EVALUATION SUMMARY") + print(f"{'='*60}") + all_ship = True + for task, decision in results.items(): + status = decision.decision.upper() + print(f" {task}: {status}") + if decision.decision == "no_ship": + all_ship = False + + if all_ship: + print("\nAll adapters PASS — ready to export.") + else: + print("\nSome adapters FAILED — fix issues before exporting.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Evaluate fine-tuned models against ship criteria.") + parser.add_argument("--school", required=True, help="School directory name") + args = parser.parse_args() + main(args.school) +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -m pytest tests/training/test_eval.py -v` +Expected: All tests PASS + +- [ ] **Step 5: Commit** + +```bash +git add training/eval.py tests/training/test_eval.py +git commit -m "feat(training): eval harness with ship criteria for model quality gates" +``` + +--- + +## Task 9: MLX Fine-Tuning Wrapper + +**Files:** +- Create: `training/finetune.py` + +This task wraps MLX's `mlx_lm` fine-tuning CLI. No unit tests for the actual training (it requires GPU time), but we test the config generation. + +- [ ] **Step 1: Write the implementation** + +Create `training/finetune.py`: +```python +"""Fine-tuning wrapper for MLX QLoRA on Apple Silicon. + +Wraps mlx_lm's LoRA fine-tuning with school-specific config. 
+ +Usage: + python -m training.finetune --school bishop-state --model 9b +""" + +from __future__ import annotations + +import argparse +import json +import subprocess +import sys +from pathlib import Path + +from training.config import get_training_data_dir, load_school_config + +# --------------------------------------------------------------------------- +# Model name mapping +# --------------------------------------------------------------------------- + +_MODEL_MAP = { + "4b": "Qwen/Qwen3.5-4B", + "9b": "Qwen/Qwen3.5-9B", + "27b": "Qwen/Qwen3.5-27B", +} + + +def _resolve_model(model_shorthand: str) -> str: + """Resolve a shorthand like '9b' to a HuggingFace model path.""" + if model_shorthand in _MODEL_MAP: + return _MODEL_MAP[model_shorthand] + return model_shorthand + + +# --------------------------------------------------------------------------- +# Config generation +# --------------------------------------------------------------------------- + + +def build_lora_config(config: dict, task: str, data_dir: Path) -> dict: + """Build the MLX LoRA fine-tuning config dict. + + Args: + config: Parsed school config. + task: "explainer" or "summarizer". + data_dir: Path to the school's training_data directory. + + Returns: + Dict suitable for writing as JSON config for mlx_lm.lora. 
+    """
+    training = config.get("training", {})
+    final_dir = data_dir / "final" / task
+
+    return {
+        "train": str(final_dir / "train.jsonl"),
+        "valid": str(final_dir / "val.jsonl"),
+        "test": str(final_dir / "test.jsonl"),
+        "lora_layers": training.get("lora_layers", 16),
+        "lora_parameters": {
+            "rank": training.get("lora_rank", 16),
+            "alpha": training.get("lora_alpha", 32),
+            "dropout": 0.05,
+            "scale": training.get("lora_alpha", 32) / training.get("lora_rank", 16),
+        },
+        "learning_rate": training.get("learning_rate", 1e-4),
+        "batch_size": training.get("batch_size", 4),
+        "iters": training.get("epochs", 3) * 1000,  # Approximate
+        "val_batches": 25,
+        "steps_per_eval": training.get("eval_every", 50),
+        "save_every": 100,
+        "max_seq_length": 2048,
+        "grad_checkpoint": True,
+    }
+
+
+# ---------------------------------------------------------------------------
+# Fine-tuning runner
+# ---------------------------------------------------------------------------
+
+
+def run_finetune(school: str, model: str = "9b", task: str | None = None) -> None:
+    """Run MLX LoRA fine-tuning for a school's adapter(s).
+
+    Args:
+        school: School directory name.
+        model: Model shorthand ("4b", "9b") or full HF path.
+        task: Specific task, or None to train both adapters.
+ """ + config = load_school_config(school) + data_dir = get_training_data_dir(school) + hf_model = _resolve_model(model) + + tasks = [task] if task else ["explainer", "summarizer"] + + for t in tasks: + print(f"\n{'='*60}") + print(f"FINE-TUNING: {t} adapter on {hf_model}") + print(f"{'='*60}") + + adapter_dir = data_dir / "models" / f"qwen3.5-{model}" / t + adapter_dir.mkdir(parents=True, exist_ok=True) + + lora_config = build_lora_config(config, t, data_dir) + config_path = adapter_dir / "lora_config.json" + config_path.write_text(json.dumps(lora_config, indent=2)) + + cmd = [ + sys.executable, "-m", "mlx_lm.lora", + "--model", hf_model, + "--adapter-path", str(adapter_dir), + "--data", str(data_dir / "final" / t), + "--train", + "--batch-size", str(lora_config["batch_size"]), + "--lora-layers", str(lora_config["lora_layers"]), + "--iters", str(lora_config["iters"]), + "--val-batches", str(lora_config["val_batches"]), + "--steps-per-eval", str(lora_config["steps_per_eval"]), + "--save-every", str(lora_config["save_every"]), + "--learning-rate", str(lora_config["learning_rate"]), + "--max-seq-length", str(lora_config["max_seq_length"]), + "--grad-checkpoint", + ] + + print(f"[finetune] Running: {' '.join(cmd[:6])}...") + result = subprocess.run(cmd, cwd=str(data_dir)) + + if result.returncode != 0: + print(f"[finetune] FAILED for {t} — exit code {result.returncode}") + else: + print(f"[finetune] SUCCESS — adapter saved to {adapter_dir}") + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Fine-tune a model for a school via MLX QLoRA.") + parser.add_argument("--school", required=True, help="School directory name") + parser.add_argument("--model", default="9b", help="Model size: 4b, 9b, or HF path") + parser.add_argument("--task", choices=["explainer", "summarizer"], 
help="Train one adapter only") + args = parser.parse_args() + run_finetune(args.school, model=args.model, task=args.task) +``` + +- [ ] **Step 2: Commit** + +```bash +git add training/finetune.py +git commit -m "feat(training): MLX QLoRA fine-tuning wrapper" +``` + +--- + +## Task 10: Ollama Export + +**Files:** +- Create: `training/export.py` + +- [ ] **Step 1: Write the implementation** + +Create `training/export.py`: +```python +"""Export fine-tuned adapters to Ollama for serving. + +Creates an Ollama Modelfile and registers the model. + +Usage: + python -m training.export --school bishop-state +""" + +from __future__ import annotations + +import argparse +import subprocess +import sys +from pathlib import Path + +from training.config import get_training_data_dir, load_school_config + +# --------------------------------------------------------------------------- +# Modelfile generation +# --------------------------------------------------------------------------- + +_MODELFILE_TEMPLATE = """FROM {base_model} +ADAPTER {adapter_path} + +PARAMETER temperature 0.3 +PARAMETER top_p 0.9 +PARAMETER num_ctx 4096 + +SYSTEM {system_prompt} +""" + + +def generate_modelfile( + base_model: str, + adapter_path: Path, + system_prompt: str, +) -> str: + """Generate an Ollama Modelfile string. + + Args: + base_model: Base model name (e.g. "qwen3.5:9b"). + adapter_path: Path to the LoRA adapter directory. + system_prompt: System prompt to bake into the model. + + Returns: + Modelfile content string. 
+ """ + return _MODELFILE_TEMPLATE.format( + base_model=base_model, + adapter_path=str(adapter_path), + system_prompt=json.dumps(system_prompt), + ) + + +# --------------------------------------------------------------------------- +# Registration +# --------------------------------------------------------------------------- + +import json + +from training.prompts import EXPLAINER_STUDENT_SYSTEM, SUMMARIZER_STUDENT_SYSTEM + +_SYSTEM_PROMPTS = { + "explainer": EXPLAINER_STUDENT_SYSTEM, + "summarizer": SUMMARIZER_STUDENT_SYSTEM, +} + + +def export_model(school: str, task: str, model: str = "9b") -> bool: + """Export a fine-tuned adapter to Ollama. + + Args: + school: School directory name. + task: "explainer" or "summarizer". + model: Model size shorthand. + + Returns: + True if registration succeeded. + """ + data_dir = get_training_data_dir(school) + adapter_dir = data_dir / "models" / f"qwen3.5-{model}" / task + + if not adapter_dir.exists(): + print(f"[export] Adapter not found: {adapter_dir}") + return False + + base_model = f"qwen3.5:{model}" + ollama_name = f"{school}-{task}:{model}" + system_prompt = _SYSTEM_PROMPTS.get(task, "") + + modelfile_content = generate_modelfile(base_model, adapter_dir, system_prompt) + modelfile_path = adapter_dir / "Modelfile" + modelfile_path.write_text(modelfile_content) + print(f"[export] Wrote Modelfile to {modelfile_path}") + + # Register with Ollama + cmd = ["ollama", "create", ollama_name, "-f", str(modelfile_path)] + print(f"[export] Registering: {' '.join(cmd)}") + + try: + result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) + if result.returncode == 0: + print(f"[export] Registered: {ollama_name}") + return True + else: + print(f"[export] FAILED: {result.stderr}") + return False + except FileNotFoundError: + print("[export] Ollama CLI not found. 
Install from https://ollama.com") + return False + except subprocess.TimeoutExpired: + print("[export] Ollama create timed out after 5 minutes") + return False + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + + +def main(school: str) -> None: + """Export all adapters for a school.""" + config = load_school_config(school) + model = config["training"]["default_model"].split(":")[1] + + results = {} + for task in ("explainer", "summarizer"): + results[task] = export_model(school, task, model=model) + + print(f"\n{'='*60}") + print("EXPORT SUMMARY") + print(f"{'='*60}") + for task, success in results.items(): + status = "OK" if success else "FAILED" + print(f" {school}-{task}:{model} — {status}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Export fine-tuned models to Ollama.") + parser.add_argument("--school", required=True, help="School directory name") + args = parser.parse_args() + main(args.school) +``` + +- [ ] **Step 2: Commit** + +```bash +git add training/export.py +git commit -m "feat(training): Ollama model export and registration" +``` + +--- + +## Task 11: Dashboard Model Client + +**Files:** +- Create: `codebenders-dashboard/lib/model-client.ts` + +- [ ] **Step 1: Write the implementation** + +Create `codebenders-dashboard/lib/model-client.ts`: +```typescript +/** + * Model client adapter — routes inference to Ollama (fine-tuned) or + * OpenAI (fallback) based on MODEL_BACKEND env var. 
+ */ + +import { generateText } from "ai" +import { createOpenAI } from "@ai-sdk/openai" + +const MODEL_BACKEND = process.env.MODEL_BACKEND || "openai" +const SCHOOL_CODE = process.env.SCHOOL_CODE || "bishop-state" +const OLLAMA_BASE_URL = process.env.OLLAMA_BASE_URL || "http://localhost:11434" + +const openai = createOpenAI({ + apiKey: process.env.OPENAI_API_KEY || "", +}) + +interface ModelResponse { + text: string +} + +async function callOllama(model: string, prompt: string): Promise { + const response = await fetch(`${OLLAMA_BASE_URL}/api/generate`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + model, + prompt, + stream: false, + options: { + temperature: 0.3, + num_predict: 1024, + }, + }), + }) + + if (!response.ok) { + throw new Error(`Ollama error: ${response.status} ${response.statusText}`) + } + + const data = await response.json() + return data.response +} + +async function callOpenAI(prompt: string, maxTokens: number): Promise { + const result = await generateText({ + model: openai("gpt-4o-mini"), + prompt, + maxTokens, + }) + return result.text +} + +/** + * Generate a course pairing explanation. + * + * Routes to the school's fine-tuned explainer model via Ollama, + * or falls back to OpenAI GPT-4o-mini. + */ +export async function generateExplanation( + prompt: string, + maxTokens: number = 320, +): Promise { + if (MODEL_BACKEND === "ollama") { + const modelSize = process.env.MODEL_SIZE || "9b" + const model = `${SCHOOL_CODE}-explainer:${modelSize}` + return callOllama(model, prompt) + } + return callOpenAI(prompt, maxTokens) +} + +/** + * Generate a query result summary. + * + * Routes to the school's fine-tuned summarizer model via Ollama, + * or falls back to OpenAI GPT-4o-mini. 
+ */
+export async function generateSummary(
+  prompt: string,
+  maxTokens: number = 200,
+): Promise<string> {
+  if (MODEL_BACKEND === "ollama") {
+    const modelSize = process.env.MODEL_SIZE || "9b"
+    const model = `${SCHOOL_CODE}-summarizer:${modelSize}`
+    return callOllama(model, prompt)
+  }
+  return callOpenAI(prompt, maxTokens)
+}
+```
+
+- [ ] **Step 2: Commit**
+
+```bash
+git add codebenders-dashboard/lib/model-client.ts
+git commit -m "feat(dashboard): model client adapter for Ollama/OpenAI routing"
+```
+
+---
+
+## Task 12: Integrate Model Client into API Routes
+
+**Files:**
+- Modify: `codebenders-dashboard/app/api/courses/explain-pairing/route.ts`
+- Modify: `codebenders-dashboard/app/api/query-summary/route.ts`
+
+- [ ] **Step 1: Update explain-pairing route**
+
+In `codebenders-dashboard/app/api/courses/explain-pairing/route.ts`, replace the inline OpenAI call with the model client.
+
+Find the import section and add:
+```typescript
+import { generateExplanation } from "@/lib/model-client"
+```
+
+Find the `generateText` call block (approximately lines 192-196) and replace:
+```typescript
+// Before:
+const { text } = await generateText({
+  model: openai("gpt-4o-mini"),
+  prompt: llmPrompt,
+  maxTokens: 320,
+})
+
+// After:
+const text = await generateExplanation(llmPrompt, 320)
+```
+
+Remove the now-unused inline OpenAI client imports if they become unreferenced after this change.
+
+- [ ] **Step 2: Update query-summary route**
+
+In `codebenders-dashboard/app/api/query-summary/route.ts`, replace the inline OpenAI call with the model client.
+
+Add import:
+```typescript
+import { generateSummary } from "@/lib/model-client"
+```
+
+Find the `generateText` call (approximately lines 50-54) and replace:
+```typescript
+// Before:
+const { text } = await generateText({
+  model: openai("gpt-4o-mini"),
+  prompt: llmPrompt,
+  maxTokens: 200,
+})
+
+// After:
+const text = await generateSummary(llmPrompt, 200)
+```
+
+Remove unused inline OpenAI client imports.
+ +- [ ] **Step 3: Verify dashboard builds** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon/codebenders-dashboard && npm run build` +Expected: Build succeeds with no TypeScript errors + +- [ ] **Step 4: Commit** + +```bash +git add codebenders-dashboard/app/api/courses/explain-pairing/route.ts \ + codebenders-dashboard/app/api/query-summary/route.ts +git commit -m "feat(dashboard): route explain-pairing and query-summary through model client" +``` + +--- + +## Task 13: Run All Tests and Final Verification + +- [ ] **Step 1: Run full Python test suite** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -m pytest tests/ -v` +Expected: All tests PASS + +- [ ] **Step 2: Verify dashboard builds** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon/codebenders-dashboard && npm run build` +Expected: Build succeeds + +- [ ] **Step 3: Verify pipeline CLI entry points** + +Run: +```bash +cd /Users/william-meroxa/Development/codebenders-datathon +venv/bin/python -m training.distill --help +venv/bin/python -m training.prepare --help +venv/bin/python -m training.finetune --help +venv/bin/python -m training.eval --help +venv/bin/python -m training.export --help +``` +Expected: Each prints usage without errors + +- [ ] **Step 4: Verify config loads end-to-end** + +Run: `cd /Users/william-meroxa/Development/codebenders-datathon && venv/bin/python -c "from training.config import load_school_config; c = load_school_config('bishop-state'); print(f'School: {c[\"school\"][\"name\"]}'); print(f'Programs: {len(c[\"domain\"][\"programs\"])}'); print(f'Student columns: {len(c[\"schema\"][\"student_columns\"])}'); print(f'Course columns: {len(c[\"schema\"][\"course_columns\"])}')"` +Expected: Prints school name, program count, and column counts without errors From 5575f0bd3ee87d26dbe178a01f11dcc037a665e9 Mon Sep 17 00:00:00 2001 From: William Hill Date: Fri, 27 Mar 2026 22:57:44 -0400 Subject: [PATCH 05/18] chore: 
scaffold training pipeline package and test infrastructure --- .gitignore | 3 + pytest.ini | 5 ++ requirements.txt | 8 ++ tests/__init__.py | 0 tests/conftest.py | 154 +++++++++++++++++++++++++++++++++++++ tests/training/__init__.py | 0 training/__init__.py | 1 + 7 files changed, 171 insertions(+) create mode 100644 pytest.ini create mode 100644 tests/__init__.py create mode 100644 tests/conftest.py create mode 100644 tests/training/__init__.py create mode 100644 training/__init__.py diff --git a/.gitignore b/.gitignore index 6fc7b59..6eef3c3 100644 --- a/.gitignore +++ b/.gitignore @@ -183,3 +183,6 @@ operations/convert_institution_id_to_string.py operations/verify_institution_id.py .vercel .env.deploy + +# Training pipeline artifacts +training_data/ diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..4ecb1ad --- /dev/null +++ b/pytest.ini @@ -0,0 +1,5 @@ +[pytest] +testpaths = tests +python_files = test_*.py +python_classes = Test* +python_functions = test_* diff --git a/requirements.txt b/requirements.txt index 8e758ce..4af40e5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,3 +25,11 @@ pyyaml>=6.0 # Logging & Utilities colorama>=0.4.6 + +# Training pipeline +pyyaml>=6.0 +anthropic>=0.40.0 +ollama>=0.4.0 +rouge-score>=0.1.2 +mlx>=0.22.0 +mlx-lm>=0.20.0 diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..4e22aa9 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,154 @@ +"""Shared pytest fixtures for the training pipeline.""" + +from pathlib import Path + +import pytest +import yaml + + +FIXTURES_DIR = Path(__file__).parent / "fixtures" + + +@pytest.fixture +def sample_school_config(): + """Minimal valid school config for testing.""" + return { + "school": { + "name": "Test Community College", + "code": "tcc", + "type": "community_college", + "designation": [], + "location": { + "city": "Test City", + 
"state": "Alabama", + "setting": "urban", + }, + "enrollment": { + "total_headcount": 1000, + "percent_full_time": 0.50, + "percent_part_time": 0.50, + }, + "demographics": { + "percent_pell_eligible": 0.60, + "percent_first_gen": 0.45, + }, + }, + "database": { + "main_table": "student_level_with_predictions", + "course_table": "course_enrollments", + "connection_env": "DATABASE_URL", + }, + "schema": { + "student_columns": { + "Cohort": "Cohort year", + "Race": "Student race/ethnicity", + "Retention": "Retention indicator (0 or 1)", + }, + "course_columns": { + "course_prefix": "Course dept code", + "grade": "Student grade", + }, + }, + "domain": { + "programs": [ + { + "name": "Nursing", + "cip": "51.3801", + "gateway_courses": ["BIO 201"], + } + ], + "key_metrics": ["retention_rate", "dfwi_rate"], + "terminology": { + "credential": "associate degree", + "at_risk": "at-risk students", + }, + }, + "distillation": { + "teacher_model": "claude-sonnet-4-20250514", + "teacher_backend": "anthropic", + "local_teacher_model": "qwen3.5:27b", + "local_teacher_backend": "ollama", + "pairs_per_task": 10, + }, + "training": { + "default_model": "qwen3.5:9b", + "fallback_model": "qwen3.5:4b", + "method": "qlora", + "quantization": 4, + "lora_rank": 16, + "lora_alpha": 32, + "epochs": 3, + "learning_rate": 1e-4, + "batch_size": 4, + "warmup_steps": 100, + "eval_every": 50, + "early_stopping_patience": 3, + }, + } + + +@pytest.fixture +def sample_course_pairing_data(): + """Sample course pairing input for explainer adapter.""" + return { + "course_a": {"prefix": "MAT", "number": "100", "name": "Intermediate Algebra"}, + "course_b": {"prefix": "BIO", "number": "201", "name": "Anatomy & Physiology I"}, + "stats": { + "course_a_dfwi": 0.42, + "course_b_dfwi": 0.31, + "co_enrollment_count": 85, + "co_enrollment_dfwi": 0.38, + "delivery_breakdown": [ + {"method": "Face-to-Face", "count": 50, "dfwi_rate": 0.34}, + {"method": "Online", "count": 35, "dfwi_rate": 0.44}, + ], + }, + } + 
+ +@pytest.fixture +def sample_query_result_data(): + """Sample query result input for summarizer adapter.""" + return { + "prompt": "retention rate by race for 2023 cohort", + "data": [ + {"Race": "Black", "retention_rate": 0.41}, + {"Race": "White", "retention_rate": 0.52}, + {"Race": "Hispanic", "retention_rate": 0.47}, + ], + "rowCount": 3, + "vizType": "bar", + } + + +@pytest.fixture +def sample_explainer_output(): + """Valid explainer adapter JSON output.""" + return { + "explanation": "MAT 100 and BIO 201 show a high co-enrollment DFWI rate of 38%.", + "structural_factors": [ + "Math placement gaps from feeder high schools", + "Online sections show higher DFW rates", + ], + "student_impact": "Students taking both courses simultaneously face compounded difficulty.", + "advisor_recommendation": "Consider staggering MAT 100 and BIO 201 across terms for at-risk students.", + "data_limitations": ["Co-enrollment data limited to 2020+ cohorts"], + "related_intervention": "Math Bootcamp", + } + + +@pytest.fixture +def sample_summarizer_output(): + """Valid summarizer adapter JSON output.""" + return { + "summary": "Retention rates vary significantly by race in the 2023 cohort.", + "key_insights": [ + "Black students have the lowest retention rate at 41%", + "11-point gap between Black and White student retention", + ], + "context": "This aligns with the institution's strategic goal to close equity gaps.", + "action_items": [ + "Review early alert referrals for Black male students in Fall cohort", + ], + "caveats": ["Race is self-reported; 6% of records are Unknown"], + } diff --git a/tests/training/__init__.py b/tests/training/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/training/__init__.py b/training/__init__.py new file mode 100644 index 0000000..02d0189 --- /dev/null +++ b/training/__init__.py @@ -0,0 +1 @@ +"""Config-driven distillation pipeline for per-school fine-tuned models.""" From aa7dc3f3dc0ca43e5da83618c94ddbb2143c2504 Mon Sep 17 
00:00:00 2001 From: William Hill Date: Fri, 27 Mar 2026 22:59:24 -0400 Subject: [PATCH 06/18] feat(training): config loader with YAML validation and JSONL writer --- tests/training/test_config.py | 106 ++++++++++++++++++++++++++++++++++ training/config.py | 71 +++++++++++++++++++++++ 2 files changed, 177 insertions(+) create mode 100644 tests/training/test_config.py create mode 100644 training/config.py diff --git a/tests/training/test_config.py b/tests/training/test_config.py new file mode 100644 index 0000000..4ed25e0 --- /dev/null +++ b/tests/training/test_config.py @@ -0,0 +1,106 @@ +"""Tests for training.config — constants and school config loader.""" + +import pytest +import yaml +from pathlib import Path +from unittest.mock import patch + +from training.config import ( + BASE_DIR, + SCHOOLS_DIR, + TRAIN_RATIO, + VAL_RATIO, + TEST_RATIO, + load_school_config, + get_school_dir, + get_training_data_dir, + write_jsonl, +) + + +class TestConstants: + def test_split_ratios_sum_to_one(self): + assert TRAIN_RATIO + VAL_RATIO + TEST_RATIO == pytest.approx(1.0) + + def test_base_dir_is_path(self): + assert isinstance(BASE_DIR, Path) + + def test_schools_dir_is_path(self): + assert isinstance(SCHOOLS_DIR, Path) + + +class TestLoadSchoolConfig: + def test_loads_valid_config(self, tmp_path, sample_school_config): + school_dir = tmp_path / "test-school" + school_dir.mkdir() + config_path = school_dir / "config.yaml" + config_path.write_text(yaml.dump(sample_school_config)) + + with patch("training.config.SCHOOLS_DIR", tmp_path): + config = load_school_config("test-school") + + assert config["school"]["name"] == "Test Community College" + assert config["school"]["code"] == "tcc" + assert config["database"]["main_table"] == "student_level_with_predictions" + + def test_raises_on_missing_school(self, tmp_path): + with patch("training.config.SCHOOLS_DIR", tmp_path): + with pytest.raises(FileNotFoundError, match="School config not found"): + load_school_config("nonexistent") + 
+ def test_raises_on_missing_required_keys(self, tmp_path): + school_dir = tmp_path / "bad-school" + school_dir.mkdir() + config_path = school_dir / "config.yaml" + config_path.write_text(yaml.dump({"school": {"name": "Bad"}})) + + with patch("training.config.SCHOOLS_DIR", tmp_path): + with pytest.raises(ValueError, match="Missing required"): + load_school_config("bad-school") + + +class TestGetSchoolDir: + def test_returns_path(self, tmp_path): + with patch("training.config.SCHOOLS_DIR", tmp_path): + result = get_school_dir("bishop-state") + assert result == tmp_path / "bishop-state" + + +class TestGetTrainingDataDir: + def test_returns_path_with_school(self): + result = get_training_data_dir("bishop-state") + assert "bishop-state" in str(result) + assert result.name == "bishop-state" + + +class TestWriteJsonl: + def test_writes_items(self, tmp_path): + import json + items = [{"a": 1}, {"b": 2}] + outfile = tmp_path / "test.jsonl" + count = write_jsonl(items, outfile) + assert count == 2 + lines = outfile.read_text().strip().split("\n") + assert json.loads(lines[0]) == {"a": 1} + assert json.loads(lines[1]) == {"b": 2} + + def test_writes_with_transform(self, tmp_path): + import json + items = [1, 2, 3] + outfile = tmp_path / "test.jsonl" + count = write_jsonl(items, outfile, transform=lambda x: {"val": x * 2}) + assert count == 3 + lines = outfile.read_text().strip().split("\n") + assert json.loads(lines[0]) == {"val": 2} + + def test_skips_none_from_transform(self, tmp_path): + items = [1, 2, 3] + outfile = tmp_path / "test.jsonl" + count = write_jsonl(items, outfile, transform=lambda x: None if x == 2 else {"v": x}) + assert count == 2 + + def test_creates_parent_dirs(self, tmp_path): + outfile = tmp_path / "sub" / "dir" / "test.jsonl" + count = write_jsonl([{"x": 1}], outfile) + assert count == 1 + assert outfile.exists() diff --git a/training/config.py b/training/config.py new file mode 100644 index 0000000..36c1bee --- /dev/null +++ b/training/config.py @@ 
-0,0 +1,71 @@ +"""Shared constants and school config loader for the training pipeline.""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any, Callable, Optional + +import yaml + +# Directory layout +PROJECT_ROOT = Path(__file__).resolve().parent.parent +SCHOOLS_DIR = PROJECT_ROOT / "schools" +BASE_DIR = PROJECT_ROOT / "training_data" + +# Dataset split ratios +TRAIN_RATIO = 0.80 +VAL_RATIO = 0.10 +TEST_RATIO = 0.10 + +# Deduplication +JACCARD_THRESHOLD = 1.0 + +# Required top-level keys in school config +_REQUIRED_KEYS = {"school", "database", "schema", "domain", "distillation", "training"} + + +def load_school_config(school: str) -> dict[str, Any]: + """Load and validate a school's config.yaml.""" + config_path = SCHOOLS_DIR / school / "config.yaml" + if not config_path.exists(): + raise FileNotFoundError(f"School config not found: {config_path}") + + with config_path.open("r", encoding="utf-8") as fh: + config = yaml.safe_load(fh) + + missing = _REQUIRED_KEYS - set(config.keys()) + if missing: + raise ValueError(f"Missing required top-level keys in {config_path}: {missing}") + + return config + + +def get_school_dir(school: str) -> Path: + """Return the path to a school's config directory.""" + return SCHOOLS_DIR / school + + +def get_training_data_dir(school: str) -> Path: + """Return the path to a school's training data directory.""" + return BASE_DIR / school + + +def write_jsonl( + items: list, + outfile: Path, + transform: Optional[Callable] = None, +) -> int: + """Write items to a JSONL file.""" + outfile = Path(outfile) + outfile.parent.mkdir(parents=True, exist_ok=True) + count = 0 + with outfile.open("w", encoding="utf-8") as fh: + for item in items: + if transform is not None: + item = transform(item) + if item is None: + continue + fh.write(json.dumps(item, ensure_ascii=False) + "\n") + count += 1 + return count From 760182851e10bb2e9a92480b68b55dc94fdd7696 Mon Sep 17 00:00:00 2001 From: William Hill 
Date: Fri, 27 Mar 2026 23:02:49 -0400 Subject: [PATCH 07/18] feat(training): add Bishop State school config and seed queries --- schools/bishop-state/config.yaml | 358 +++++++++++++++++++++++++ schools/bishop-state/seed_queries.yaml | 60 +++++ 2 files changed, 418 insertions(+) create mode 100644 schools/bishop-state/config.yaml create mode 100644 schools/bishop-state/seed_queries.yaml diff --git a/schools/bishop-state/config.yaml b/schools/bishop-state/config.yaml new file mode 100644 index 0000000..0d3b19e --- /dev/null +++ b/schools/bishop-state/config.yaml @@ -0,0 +1,358 @@ +# Bishop State Community College — Training Pipeline Config +school: + name: "Bishop State Community College" + code: "bscc" + type: "community_college" + designation: ["hbcu", "minority_serving"] + accreditation: "SACSCOC" + founded: 1927 + + location: + address: "351 North Broad Street" + city: "Mobile" + state: "Alabama" + zip: "36603" + county: "Mobile County" + region: "Gulf Coast" + setting: "urban" + climate_zone: "subtropical" + + enrollment: + total_headcount: 4200 + fte: 2800 + undergraduate_only: true + residential: false + percent_full_time: 0.42 + percent_part_time: 0.58 + percent_online: 0.35 + open_admission: true + + demographics: + percent_black: 0.72 + percent_white: 0.18 + percent_hispanic: 0.05 + percent_other: 0.05 + percent_pell_eligible: 0.68 + percent_first_gen: 0.55 + percent_adult_learners: 0.40 + median_household_income_area: 42000 + + workforce: + top_employers: ["Austal USA", "Mobile Infirmary", "AM/NS Calvert"] + high_demand_fields: ["healthcare", "advanced_manufacturing", "maritime"] + workforce_board: "Mobile Works" + + academics: + calendar: "semester" + degree_types: ["associate", "certificate", "short_certificate"] + total_programs: 45 + largest_programs: ["Nursing", "Welding", "Business Administration"] + transfer_partners: ["University of South Alabama", "Alabama A&M"] + dual_enrollment: true + + student_support: + tutoring: true + food_pantry: true + 
childcare: false + transportation_assistance: true + mental_health_services: true + early_alert_system: true + + challenges: + - "High percentage of students working 20+ hours/week" + - "Limited public transit access to satellite campuses" + - "Hurricane season disrupts Fall semester attendance" + - "Many students require developmental education in math" + + strengths: + - "Strong employer partnerships in healthcare and maritime" + - "Active student mentoring program" + - "High nursing program pass rates on NCLEX" + + peers: + ipeds_id: "101505" + carnegie_class: "Associate's—High Transfer-High Traditional" + peer_institutions: ["Lawson State CC", "Shelton State CC", "Trenholm State CC"] + state_system: "Alabama Community College System" + governing_board: "ACCS Board of Trustees" + + financial: + in_district_tuition: 4800 + in_state_tuition: 4800 + avg_financial_aid_package: 5200 + percent_receiving_aid: 0.82 + percent_student_loans: 0.25 + cost_of_living_index: 87.3 + textbook_program: "inclusive_access" + tuition_payment_plan: true + emergency_aid_fund: true + + completion: + ipeds_graduation_rate: 0.18 + adjusted_completion_rate: 0.42 + avg_time_to_credential: 3.2 + percent_transfer_out: 0.24 + percent_stop_out_return: 0.15 + top_completion_barriers: + - "developmental_math_sequences" + - "financial_emergencies" + - "work_schedule_conflicts" + + instruction: + student_faculty_ratio: 18 + percent_full_time_faculty: 0.45 + percent_adjunct: 0.55 + avg_class_size: 22 + developmental_ed_model: "corequisite" + lms: "Canvas" + + pipeline: + feeder_high_schools: + - name: "Williamson High School" + percent_of_enrollment: 0.12 + avg_readiness: "below_college_level" + - name: "Murphy High School" + percent_of_enrollment: 0.08 + avg_readiness: "mixed" + percent_ged: 0.11 + percent_dual_enrollment_origin: 0.09 + percent_veterans: 0.07 + percent_career_changers: 0.14 + percent_displaced_workers: 0.05 + percent_international: 0.02 + primary_recruitment_radius_miles: 35 + + 
technology: + percent_students_with_reliable_wifi: 0.71 + percent_students_with_personal_laptop: 0.64 + campus_device_lending: true + hotspot_lending: true + digital_literacy_required: false + broadband_desert_overlap: true + + access: + campus_count: 4 + campuses: + - name: "Main Campus" + address: "351 N Broad St" + public_transit_accessible: true + - name: "Southwest Campus" + address: "925 Dauphin Island Pkwy" + public_transit_accessible: false + percent_students_commute_30_plus_min: 0.35 + public_transit_quality: "limited" + parking_adequate: true + evening_weekend_classes: true + + equity: + known_gaps: + - metric: "gateway_math_pass_rate" + group_a: { name: "Black male students", value: 0.41 } + group_b: { name: "Overall", value: 0.58 } + initiative: "Male Student Success mentoring program" + - metric: "retention" + group_a: { name: "Part-time students", value: 0.38 } + group_b: { name: "Full-time students", value: 0.61 } + initiative: "15-to-Finish advising campaign" + dei_office: true + title_ix_coordinator: true + minority_male_initiative: "Brother 2 Brother" + + interventions: + active: + - name: "Starfish Early Alert" + type: "early_warning" + target: "all students" + trigger: "missed 2+ classes or below C at midterm" + effectiveness: "12% retention lift in pilot cohorts" + - name: "Math Bootcamp" + type: "academic_support" + target: "students placing into developmental math" + timing: "2 weeks before Fall semester" + effectiveness: "participants 2x more likely to pass MAT 100" + - name: "Emergency Micro-Grants" + type: "financial" + target: "students facing unexpected financial hardship" + max_award: 500 + effectiveness: "78% of recipients re-enrolled next term" + planned: + - name: "Proactive advising for 25+ credit students" + launch: "Fall 2026" + + student_life: + percent_working_while_enrolled: 0.72 + percent_working_over_20hrs: 0.48 + percent_single_parents: 0.18 + percent_caregiver_responsibilities: 0.25 + childcare_waitlist: true + 
student_orgs: 15 + athletics: false + housing_insecurity_rate: 0.14 + food_insecurity_rate: 0.31 + + health: + mental_health_counselor_ratio: "1:1400" + community_health_context: + - "Mobile County has highest diabetes rate in Alabama" + - "Limited mental health providers in service area" + substance_abuse_programs: true + crisis_intervention_protocol: true + + outcomes: + job_placement_rate_6mo: 0.78 + median_salary_after_credential: + associate: 34000 + certificate: 29000 + percent_employed_in_field: 0.65 + licensure_pass_rates: + nursing_nclex: 0.89 + welding_aws: 0.92 + emt: 0.85 + transfer_success_rate: 0.71 + employer_satisfaction_rate: 0.88 + + patterns: + high_attrition_points: + - week: 4 + reason: "Financial aid disbursement delays" + - week: 8 + reason: "Midterm performance shock" + - month: "October" + reason: "Hurricane season peak" + registration_peaks: ["April", "July", "November"] + summer_melt_rate: 0.22 + + trends: + enrollment_direction: "declining" + enrollment_5yr_change: -0.12 + completion_direction: "improving" + notable_changes: + - year: 2020 + event: "COVID shift to online — permanent hybrid expansion" + - year: 2022 + event: "Switched to corequisite math model — dev-ed pass rates doubled" + - year: 2023 + event: "Launched early alert system with ML predictions" + + priorities: + strategic_plan_years: "2024-2029" + top_goals: + - "Increase fall-to-fall retention from 42% to 55%" + - "Launch 3 new short-term workforce certificates" + - "Close equity gap in gateway math by 50%" + accreditation_qep_topic: "Guided Pathways implementation" + grant_funded_initiatives: + - name: "Title III Strengthening Institutions" + focus: "Student support services and advising redesign" + end_date: "2027-09-30" + - name: "NSF ATE Grant" + focus: "Advanced manufacturing curriculum" + end_date: "2026-05-31" + + data_caveats: + - "Pre-2020 cohorts lack online/hybrid delivery classification" + - "Race/ethnicity is self-reported; 6% of records are 'Unknown'" + - 
"GPA data for dual-enrollment students may reflect high school scale" + - "Transfer-out data relies on National Student Clearinghouse match — ~85% match rate" + - "Course enrollment records before 2019 do not include instructor_status" + +database: + main_table: "student_level_with_predictions" + course_table: "course_enrollments" + connection_env: "DATABASE_URL" + +schema: + student_columns: + Cohort: "Cohort year (numeric: 2019, 2020, etc.)" + Cohort_Term: "Term of cohort entry (Fall, Spring, Summer)" + Student_GUID: "Unique student identifier" + Institution_ID: "Institution identifier (102030 for Bishop State)" + Gender: "Student gender" + Race: "Student race/ethnicity" + Student_Age: "Age of student (integer)" + First_Gen: "First generation status" + Enrollment_Type: "Type of enrollment" + Enrollment_Intensity_First_Term: "Enrollment intensity (Full-Time, Part-Time)" + Program_of_Study_Year_1: "Program of study in year 1 (CIP code)" + Credential_Type_Sought_Year_1: "Credential type being pursued" + Math_Placement: "Math placement level (C=college-level, R=remedial, N=none)" + Retention: "Retention indicator (0 or 1)" + Persistence: "Persistence indicator (0 or 1)" + GPA_Group_Year_1: "GPA in year 1" + GPA_Group_Term_1: "GPA in term 1" + Number_of_Credits_Attempted_Year_1: "Credits attempted in year 1" + Number_of_Credits_Earned_Year_1: "Credits earned in year 1" + Number_of_Credits_Attempted_Year_2: "Credits attempted in year 2" + Number_of_Credits_Earned_Year_2: "Credits earned in year 2" + Time_to_Credential: "Time to any credential" + retention_probability: "Predicted probability of retention (0-1)" + retention_risk_category: "Risk category (Low/Moderate/High/Critical Risk)" + at_risk_alert: "Early warning alert level (LOW/MODERATE/HIGH/URGENT)" + course_completion_rate: "Course completion rate (0-1)" + passing_rate: "Course passing rate (0-1)" + course_columns: + course_prefix: "Course dept code (MAT, ENG, NUR, CIS, etc.)" + course_number: "Course number 
(100, 201, etc.)" + course_name: "Full course name" + grade: "Student grade (A, B, C, D, F, W, I, AU, P)" + delivery_method: "Delivery (F=face-to-face, O=online, H=hybrid)" + instructor_status: "Instructor type (FT=full-time, PT=part-time)" + gateway_type: "Gateway (M=math, E=English, N=not a gateway)" + credits_attempted: "Credits attempted (numeric)" + credits_earned: "Credits earned (numeric)" + cohort: "Cohort year as text" + academic_year: "Academic year (e.g. 2021-22)" + academic_term: "Term (FALL, SPRING, SUMMER)" + ferpa_excluded: + - "Student_GUID" + - "student_guid" + +domain: + programs: + - name: "Nursing (ADN)" + cip: "51.3801" + gateway_courses: ["BIO 201", "MAT 110"] + - name: "Welding Technology" + cip: "48.0508" + gateway_courses: ["WDT 108", "WDT 109"] + - name: "Business Administration" + cip: "52.0201" + gateway_courses: ["MAT 100", "BUS 241"] + - name: "Computer Information Systems" + cip: "11.0101" + gateway_courses: ["CIS 146", "MAT 100"] + - name: "Emergency Medical Technician" + cip: "51.0904" + gateway_courses: ["EMS 100", "BIO 201"] + key_metrics: + - "retention_rate" + - "dfwi_rate" + - "gateway_pass_rate" + - "completion_rate" + - "transfer_rate" + terminology: + credential: "associate degree or certificate" + at_risk: "students flagged by early warning system" + gateway_course: "first college-level course in math or English" + dfwi: "grades of D, F, W, or I (unsuccessful completion)" + +distillation: + teacher_model: "claude-sonnet-4-20250514" + teacher_backend: "anthropic" + local_teacher_model: "qwen3.5:27b" + local_teacher_backend: "ollama" + pairs_per_task: 1500 + +training: + default_model: "qwen3.5:9b" + fallback_model: "qwen3.5:4b" + method: "qlora" + quantization: 4 + lora_rank: 16 + lora_alpha: 32 + epochs: 3 + learning_rate: 1.0e-4 + batch_size: 4 + warmup_steps: 100 + eval_every: 50 + early_stopping_patience: 3 diff --git a/schools/bishop-state/seed_queries.yaml b/schools/bishop-state/seed_queries.yaml new file mode 100644 
index 0000000..ca9f3dc --- /dev/null +++ b/schools/bishop-state/seed_queries.yaml @@ -0,0 +1,60 @@ +# Example queries for training pair generation +explainer: + - query: "MAT 100 and BIO 201 pairing for nursing students" + style: "advisor" + - query: "ENG 101 and HIS 201 co-enrollment outcomes" + style: "advisor" + - query: "High DFW in MAT 110 for part-time evening students" + style: "advisor" + - query: "CIS 146 and MAT 100 pairing for CIS majors" + style: "advisor" + - query: "WDT 108 and WDT 109 sequential outcomes" + style: "advisor" + - query: "Online vs face-to-face outcomes in gateway math" + style: "administrator" + - query: "Adjunct vs full-time instructor DFW rates in BIO 201" + style: "administrator" + - query: "Summer vs Fall section outcomes for ENG 101" + style: "administrator" + - query: "Developmental math co-enrollment with science courses" + style: "administrator" + - query: "Dual-enrollment student performance in college-level courses" + style: "administrator" + - query: "EMS 100 and BIO 201 prerequisite outcomes" + style: "faculty" + - query: "MAT 100 withdrawal patterns by week of semester" + style: "faculty" + - query: "Hybrid delivery outcomes in nursing prerequisite courses" + style: "faculty" + +summarizer: + - query: "retention rate by race for 2023 cohort" + style: "faculty" + - query: "overall retention trend from 2019 to 2023" + style: "administrator" + - query: "retention rate for first-generation students" + style: "advisor" + - query: "completion rate by enrollment intensity" + style: "administrator" + - query: "gateway course pass rates by delivery method" + style: "administrator" + - query: "top 10 courses with highest DFW rates" + style: "faculty" + - query: "DFW rates by instructor status in math courses" + style: "administrator" + - query: "course completion rates for online vs face-to-face" + style: "faculty" + - query: "enrollment by race and gender" + style: "administrator" + - query: "GPA distribution for Pell-eligible 
students" + style: "advisor" + - query: "retention gap between full-time and part-time students" + style: "administrator" + - query: "at-risk student count by program" + style: "advisor" + - query: "students with URGENT early warning alert by cohort" + style: "advisor" + - query: "average retention probability by math placement" + style: "faculty" + - query: "critical risk students in nursing program" + style: "advisor" From c547ce9403eca4272f1b2a38f8a69f0f9d702c22 Mon Sep 17 00:00:00 2001 From: William Hill Date: Fri, 27 Mar 2026 23:06:09 -0400 Subject: [PATCH 08/18] feat(training): teacher prompt templates for explainer and summarizer --- tests/training/test_prompts.py | 97 +++++++++++++ training/prompts.py | 256 +++++++++++++++++++++++++++++++++ 2 files changed, 353 insertions(+) create mode 100644 tests/training/test_prompts.py create mode 100644 training/prompts.py diff --git a/tests/training/test_prompts.py b/tests/training/test_prompts.py new file mode 100644 index 0000000..d7e163d --- /dev/null +++ b/tests/training/test_prompts.py @@ -0,0 +1,97 @@ +"""Tests for training.prompts — teacher prompt templates.""" + +import json +import pytest + +from training.prompts import ( + build_system_prompt, + build_explainer_prompt, + build_summarizer_prompt, + EXPLAINER_STUDENT_SYSTEM, + SUMMARIZER_STUDENT_SYSTEM, + EXPLAINER_SCHEMA, + SUMMARIZER_SCHEMA, +) + + +class TestBuildSystemPrompt: + def test_includes_school_name(self, sample_school_config): + result = build_system_prompt(sample_school_config) + assert "Test Community College" in result + + def test_includes_location(self, sample_school_config): + result = build_system_prompt(sample_school_config) + assert "Test City" in result + assert "Alabama" in result + + def test_includes_demographics(self, sample_school_config): + result = build_system_prompt(sample_school_config) + assert "Pell" in result or "pell" in result + + def test_returns_string(self, sample_school_config): + result = 
build_system_prompt(sample_school_config) + assert isinstance(result, str) + assert len(result) > 100 + + +class TestBuildExplainerPrompt: + def test_includes_course_data(self, sample_school_config, sample_course_pairing_data): + result = build_explainer_prompt(sample_school_config, sample_course_pairing_data) + assert "MAT" in result + assert "BIO" in result + + def test_includes_stats(self, sample_school_config, sample_course_pairing_data): + result = build_explainer_prompt(sample_school_config, sample_course_pairing_data) + assert "0.42" in result or "42" in result + + def test_includes_output_schema(self, sample_school_config, sample_course_pairing_data): + result = build_explainer_prompt(sample_school_config, sample_course_pairing_data) + assert "explanation" in result + assert "structural_factors" in result + assert "advisor_recommendation" in result + + def test_returns_string(self, sample_school_config, sample_course_pairing_data): + result = build_explainer_prompt(sample_school_config, sample_course_pairing_data) + assert isinstance(result, str) + + +class TestBuildSummarizerPrompt: + def test_includes_query(self, sample_school_config, sample_query_result_data): + result = build_summarizer_prompt(sample_school_config, sample_query_result_data) + assert "retention rate by race" in result + + def test_includes_data(self, sample_school_config, sample_query_result_data): + result = build_summarizer_prompt(sample_school_config, sample_query_result_data) + assert "Black" in result + assert "0.41" in result or "41" in result + + def test_includes_output_schema(self, sample_school_config, sample_query_result_data): + result = build_summarizer_prompt(sample_school_config, sample_query_result_data) + assert "summary" in result + assert "key_insights" in result + assert "action_items" in result + + def test_returns_string(self, sample_school_config, sample_query_result_data): + result = build_summarizer_prompt(sample_school_config, sample_query_result_data) + assert 
isinstance(result, str) + + +class TestStudentPrompts: + def test_explainer_student_system_is_concise(self): + assert len(EXPLAINER_STUDENT_SYSTEM) < 500 + assert "JSON" in EXPLAINER_STUDENT_SYSTEM + + def test_summarizer_student_system_is_concise(self): + assert len(SUMMARIZER_STUDENT_SYSTEM) < 500 + assert "JSON" in SUMMARIZER_STUDENT_SYSTEM + + +class TestOutputSchemas: + def test_explainer_schema_has_required_keys(self): + required = {"explanation", "structural_factors", "student_impact", + "advisor_recommendation", "data_limitations", "related_intervention"} + assert required == set(EXPLAINER_SCHEMA.keys()) + + def test_summarizer_schema_has_required_keys(self): + required = {"summary", "key_insights", "context", "action_items", "caveats"} + assert required == set(SUMMARIZER_SCHEMA.keys()) diff --git a/training/prompts.py b/training/prompts.py new file mode 100644 index 0000000..47e7716 --- /dev/null +++ b/training/prompts.py @@ -0,0 +1,256 @@ +"""Teacher prompt templates for the distillation pipeline. + +Provides school-agnostic prompt builders that inject per-school context +from config.yaml to generate high-quality training pairs. 
+""" + +from __future__ import annotations + +import json +from typing import Any + +EXPLAINER_SCHEMA = { + "explanation": "2-3 sentence plain-language explanation of the course pairing pattern", + "structural_factors": ["list of institutional or systemic factors driving this pattern"], + "student_impact": "what this means for students taking these courses", + "advisor_recommendation": "one actionable next step for advisors", + "data_limitations": ["caveats about interpreting this data"], + "related_intervention": "existing program that addresses this, or null", +} + +SUMMARIZER_SCHEMA = { + "summary": "2-3 sentence headline finding from the query results", + "key_insights": ["list of notable patterns in the data"], + "context": "how this connects to institutional priorities or known challenges", + "action_items": ["what someone should do with this information"], + "caveats": ["data limitations relevant to this specific query"], +} + +EXPLAINER_STUDENT_SYSTEM = ( + "You are a student success analyst. Given course pairing data, generate a " + "structured JSON explanation. Include: explanation, structural_factors, " + "student_impact, advisor_recommendation, data_limitations, and " + "related_intervention. Respond with ONLY valid JSON." +) + +SUMMARIZER_STUDENT_SYSTEM = ( + "You are a student success analyst. Given a query and its results, generate " + "a structured JSON summary. Include: summary, key_insights, context, " + "action_items, and caveats. Respond with ONLY valid JSON." 
+) + + +def build_system_prompt(config: dict[str, Any]) -> str: + """Build the teacher system prompt with full institutional context.""" + school = config["school"] + domain = config["domain"] + + sections = [] + + name = school["name"] + location = school.get("location", {}) + city = location.get("city", "") + state = location.get("state", "") + school_type = school.get("type", "institution") + sections.append( + f"You are a student success analyst at {name}, " + f"a {school_type} in {city}, {state}." + ) + + designations = school.get("designation", []) + if designations: + sections.append(f"Institutional designations: {', '.join(designations)}.") + + enrollment = school.get("enrollment", {}) + if enrollment: + parts = [] + if "total_headcount" in enrollment: + parts.append(f"{enrollment['total_headcount']:,} students") + if "percent_part_time" in enrollment: + parts.append(f"{enrollment['percent_part_time']:.0%} part-time") + if "percent_online" in enrollment: + parts.append(f"{enrollment['percent_online']:.0%} online") + if enrollment.get("open_admission"): + parts.append("open admission") + if parts: + sections.append(f"Enrollment profile: {', '.join(parts)}.") + + demographics = school.get("demographics", {}) + if demographics: + parts = [] + for key, label in [ + ("percent_pell_eligible", "Pell-eligible"), + ("percent_first_gen", "first-generation"), + ("percent_adult_learners", "adult learners (25+)"), + ]: + if key in demographics: + parts.append(f"{demographics[key]:.0%} {label}") + if parts: + sections.append(f"Student demographics: {', '.join(parts)}.") + + programs = domain.get("programs", []) + if programs: + program_names = [p["name"] for p in programs[:5]] + sections.append(f"Key programs: {', '.join(program_names)}.") + + challenges = school.get("challenges", []) + if challenges: + sections.append("Known challenges:\n" + "\n".join(f"- {c}" for c in challenges)) + + strengths = school.get("strengths", []) + if strengths: + 
sections.append("Institutional strengths:\n" + "\n".join(f"- {s}" for s in strengths)) + + equity = school.get("equity", {}) + known_gaps = equity.get("known_gaps", []) + if known_gaps: + gap_lines = [] + for gap in known_gaps: + ga = gap.get("group_a", {}) + gb = gap.get("group_b", {}) + gap_lines.append( + f"- {gap['metric']}: {ga.get('name', '?')} ({ga.get('value', '?')}) " + f"vs {gb.get('name', '?')} ({gb.get('value', '?')})" + ) + sections.append("Known equity gaps:\n" + "\n".join(gap_lines)) + + interventions = school.get("interventions", {}) + active = interventions.get("active", []) + if active: + lines = [] + for i in active: + line = f"- {i['name']} ({i['type']}): {i.get('effectiveness', 'effectiveness unknown')}" + lines.append(line) + sections.append("Active interventions:\n" + "\n".join(lines)) + + priorities = school.get("priorities", {}) + top_goals = priorities.get("top_goals", []) + if top_goals: + sections.append("Strategic priorities:\n" + "\n".join(f"- {g}" for g in top_goals)) + + caveats = school.get("data_caveats", []) + if caveats: + sections.append("Data caveats:\n" + "\n".join(f"- {c}" for c in caveats)) + + completion = school.get("completion", {}) + if completion: + parts = [] + if "ipeds_graduation_rate" in completion: + parts.append(f"IPEDS grad rate: {completion['ipeds_graduation_rate']:.0%}") + if "adjusted_completion_rate" in completion: + parts.append(f"adjusted completion: {completion['adjusted_completion_rate']:.0%}") + barriers = completion.get("top_completion_barriers", []) + if barriers: + parts.append(f"top barriers: {', '.join(b.replace('_', ' ') for b in barriers)}") + if parts: + sections.append(f"Completion context: {'; '.join(parts)}.") + + student_life = school.get("student_life", {}) + if student_life: + parts = [] + if "percent_working_over_20hrs" in student_life: + parts.append(f"{student_life['percent_working_over_20hrs']:.0%} working 20+ hrs/wk") + if "food_insecurity_rate" in student_life: + 
parts.append(f"{student_life['food_insecurity_rate']:.0%} food insecure") + if "percent_single_parents" in student_life: + parts.append(f"{student_life['percent_single_parents']:.0%} single parents") + if parts: + sections.append(f"Student life: {', '.join(parts)}.") + + patterns = school.get("patterns", {}) + attrition_points = patterns.get("high_attrition_points", []) + if attrition_points: + lines = [] + for point in attrition_points: + when = f"week {point['week']}" if "week" in point else point.get("month", "?") + lines.append(f"- {when}: {point['reason']}") + sections.append("Known attrition patterns:\n" + "\n".join(lines)) + + workforce = school.get("workforce", {}) + if workforce: + employers = workforce.get("top_employers", []) + fields = workforce.get("high_demand_fields", []) + if employers or fields: + parts = [] + if employers: + parts.append(f"top employers: {', '.join(employers)}") + if fields: + parts.append(f"high-demand fields: {', '.join(fields)}") + sections.append(f"Workforce context: {'; '.join(parts)}.") + + outcomes = school.get("outcomes", {}) + if outcomes: + parts = [] + if "job_placement_rate_6mo" in outcomes: + parts.append(f"6-month job placement: {outcomes['job_placement_rate_6mo']:.0%}") + licensure = outcomes.get("licensure_pass_rates", {}) + if licensure: + lic_parts = [f"{k}: {v:.0%}" for k, v in licensure.items()] + parts.append(f"licensure pass rates: {', '.join(lic_parts)}") + if parts: + sections.append(f"Outcomes: {'; '.join(parts)}.") + + sections.append("Respond with ONLY valid JSON.") + + return "\n\n".join(sections) + + +def build_explainer_prompt( + config: dict[str, Any], + course_data: dict[str, Any], +) -> str: + """Build the teacher prompt for generating a course pairing explanation.""" + schema_str = json.dumps(EXPLAINER_SCHEMA, indent=2) + data_str = json.dumps(course_data, indent=2, default=str) + + terminology = config.get("domain", {}).get("terminology", {}) + term_lines = "\n".join(f"- {k}: {v}" for k, v in 
terminology.items()) if terminology else "" + + return f"""Analyze the following course pairing data and explain the pattern. + +COURSE PAIRING DATA: +{data_str} + +{f"TERMINOLOGY:{chr(10)}{term_lines}{chr(10)}" if term_lines else ""} +Generate a JSON response with this exact schema: +{schema_str} + +Guidelines: +- Explain the pattern in plain language accessible to advisors and faculty. +- Connect structural factors to the institution's known challenges and context. +- Make the advisor recommendation specific and actionable. +- Reference existing interventions if relevant. +- Note any data limitations that affect interpretation. +- Do NOT speculate beyond what the data shows.""" + + +def build_summarizer_prompt( + config: dict[str, Any], + query_data: dict[str, Any], +) -> str: + """Build the teacher prompt for generating a query result summary.""" + schema_str = json.dumps(SUMMARIZER_SCHEMA, indent=2) + data_str = json.dumps(query_data["data"][:50], indent=2, default=str) + user_query = query_data["prompt"] + row_count = query_data.get("rowCount", len(query_data["data"])) + viz_type = query_data.get("vizType", "table") + + return f"""Summarize the following query results for a non-technical audience +(advisors, administrators, faculty). + +USER QUERY: {user_query} +VISUALIZATION TYPE: {viz_type} +TOTAL ROWS: {row_count} + +RESULTS: +{data_str} + +Generate a JSON response with this exact schema: +{schema_str} + +Guidelines: +- Lead with the most important finding. +- Connect insights to institutional context and priorities. +- Make action items specific to the roles that would see this data. +- Note data limitations relevant to this specific query. 
+- Do NOT hallucinate data points not present in the results.""" From 0dcab6472bcee9d24c33bd8210974e65986839b6 Mon Sep 17 00:00:00 2001 From: William Hill Date: Fri, 27 Mar 2026 23:07:00 -0400 Subject: [PATCH 09/18] feat(training): seed data generation for explainer and summarizer --- tests/training/test_seed.py | 85 +++++++++++++++++++ training/seed.py | 161 ++++++++++++++++++++++++++++++++++++ 2 files changed, 246 insertions(+) create mode 100644 tests/training/test_seed.py create mode 100644 training/seed.py diff --git a/tests/training/test_seed.py b/tests/training/test_seed.py new file mode 100644 index 0000000..77b0a3e --- /dev/null +++ b/tests/training/test_seed.py @@ -0,0 +1,85 @@ +"""Tests for training.seed — seed data generation.""" + +import pytest +import yaml +from pathlib import Path +from unittest.mock import patch + +from training.seed import ( + load_seed_queries, + generate_synthetic_course_pairings, + generate_synthetic_query_results, + format_as_chatml, +) + + +class TestLoadSeedQueries: + def test_loads_valid_yaml(self, tmp_path): + seed_file = tmp_path / "seed_queries.yaml" + seed_file.write_text(yaml.dump({ + "explainer": [ + {"query": "MAT 100 and BIO 201", "style": "advisor"}, + ], + "summarizer": [ + {"query": "retention by race", "style": "faculty"}, + ], + })) + + with patch("training.seed.get_school_dir", return_value=tmp_path): + result = load_seed_queries("test-school") + + assert len(result["explainer"]) == 1 + assert len(result["summarizer"]) == 1 + assert result["explainer"][0]["query"] == "MAT 100 and BIO 201" + + def test_returns_empty_on_missing_file(self, tmp_path): + with patch("training.seed.get_school_dir", return_value=tmp_path): + result = load_seed_queries("test-school") + assert result == {"explainer": [], "summarizer": []} + + +class TestGenerateSyntheticCoursePairings: + def test_generates_requested_count(self, sample_school_config): + results = generate_synthetic_course_pairings(sample_school_config, count=5) + assert 
len(results) == 5 + + def test_each_has_required_keys(self, sample_school_config): + results = generate_synthetic_course_pairings(sample_school_config, count=3) + for r in results: + assert "course_a" in r + assert "course_b" in r + assert "stats" in r + assert "prefix" in r["course_a"] + assert "number" in r["course_a"] + + def test_returns_empty_for_zero(self, sample_school_config): + results = generate_synthetic_course_pairings(sample_school_config, count=0) + assert results == [] + + +class TestGenerateSyntheticQueryResults: + def test_generates_requested_count(self, sample_school_config): + results = generate_synthetic_query_results(sample_school_config, count=5) + assert len(results) == 5 + + def test_each_has_required_keys(self, sample_school_config): + results = generate_synthetic_query_results(sample_school_config, count=3) + for r in results: + assert "prompt" in r + assert "data" in r + assert "rowCount" in r + assert "vizType" in r + + def test_returns_empty_for_zero(self, sample_school_config): + results = generate_synthetic_query_results(sample_school_config, count=0) + assert results == [] + + +class TestFormatAsChatML: + def test_format_structure(self): + result = format_as_chatml("system", "user", "assistant") + assert "messages" in result + assert len(result["messages"]) == 3 + assert result["messages"][0] == {"role": "system", "content": "system"} + assert result["messages"][1] == {"role": "user", "content": "user"} + assert result["messages"][2] == {"role": "assistant", "content": "assistant"} diff --git a/training/seed.py b/training/seed.py new file mode 100644 index 0000000..e2d3b66 --- /dev/null +++ b/training/seed.py @@ -0,0 +1,161 @@ +"""Seed data generation for the distillation pipeline.""" + +from __future__ import annotations + +import random +from typing import Any + +import yaml + +from training.config import get_school_dir + +_PREFIXES = ["MAT", "ENG", "BIO", "CIS", "WDT", "HIS", "PSY", "BUS", "NUR", "EMS"] +_NUMBERS = ["100", "101", 
"110", "201", "202", "210", "241", "246"] +_NAMES = { + "MAT 100": "Intermediate Algebra", + "MAT 110": "Finite Mathematics", + "MAT 201": "Calculus I", + "ENG 101": "English Composition I", + "ENG 102": "English Composition II", + "BIO 201": "Anatomy & Physiology I", + "BIO 202": "Anatomy & Physiology II", + "CIS 146": "Microcomputer Applications", + "CIS 201": "Introduction to Programming", + "WDT 108": "SMAW Fillet/OFC", + "WDT 109": "SMAW Fillet/PAC/CAC", + "HIS 201": "United States History I", + "PSY 200": "General Psychology", + "BUS 241": "Principles of Accounting I", + "NUR 102": "Fundamentals of Nursing", + "EMS 100": "EMT Basic", +} +_DELIVERY_METHODS = ["Face-to-Face", "Online", "Hybrid"] +_VIZ_TYPES = ["bar", "line", "pie", "kpi", "table"] + +_QUERY_TEMPLATES = [ + ("retention rate by {dim} for {year} cohort", "bar"), + ("overall {metric} trend from 2019 to 2023", "line"), + ("{metric} for first-generation students", "kpi"), + ("{metric} by enrollment intensity", "bar"), + ("top 10 courses with highest DFW rates", "table"), + ("{metric} by {dim}", "bar"), + ("students with {alert} early warning alert", "kpi"), + ("{metric} distribution by program", "bar"), + ("{metric} gap between full-time and part-time students", "bar"), + ("at-risk student count by {dim}", "pie"), +] + +_DIMS = ["race", "gender", "cohort", "program", "enrollment intensity", "math placement"] +_METRICS = ["retention rate", "completion rate", "GPA", "DFW rate", "pass rate"] +_ALERTS = ["URGENT", "HIGH", "MODERATE"] +_YEARS = ["2019", "2020", "2021", "2022", "2023"] +_RACES = ["Black", "White", "Hispanic", "Asian", "Two or More", "Unknown"] + + +def load_seed_queries(school: str) -> dict[str, list[dict]]: + """Load seed queries from a school's seed_queries.yaml.""" + seed_path = get_school_dir(school) / "seed_queries.yaml" + if not seed_path.exists(): + return {"explainer": [], "summarizer": []} + with seed_path.open("r", encoding="utf-8") as fh: + data = yaml.safe_load(fh) or {} + 
return { + "explainer": data.get("explainer", []), + "summarizer": data.get("summarizer", []), + } + + +def _random_course() -> dict[str, str]: + prefix = random.choice(_PREFIXES) + number = random.choice(_NUMBERS) + key = f"{prefix} {number}" + name = _NAMES.get(key, f"{prefix} {number} Course") + return {"prefix": prefix, "number": number, "name": name} + + +def _random_stats() -> dict[str, Any]: + dfwi_a = round(random.uniform(0.15, 0.55), 2) + dfwi_b = round(random.uniform(0.15, 0.55), 2) + co_count = random.randint(20, 200) + co_dfwi = round(random.uniform(min(dfwi_a, dfwi_b), max(dfwi_a, dfwi_b) + 0.1), 2) + co_dfwi = min(co_dfwi, 0.75) + + delivery_breakdown = [] + remaining = co_count + for method in _DELIVERY_METHODS: + if method == _DELIVERY_METHODS[-1]: + count = remaining + else: + count = random.randint(5, remaining - 5 * (len(_DELIVERY_METHODS) - len(delivery_breakdown) - 1)) + count = max(count, 1) + remaining -= count + delivery_breakdown.append({ + "method": method, + "count": count, + "dfwi_rate": round(random.uniform(0.15, 0.55), 2), + }) + + return { + "course_a_dfwi": dfwi_a, + "course_b_dfwi": dfwi_b, + "co_enrollment_count": co_count, + "co_enrollment_dfwi": co_dfwi, + "delivery_breakdown": delivery_breakdown, + } + + +def generate_synthetic_course_pairings( + config: dict[str, Any], + count: int, +) -> list[dict[str, Any]]: + """Generate synthetic course pairing data for explainer training.""" + if count == 0: + return [] + results = [] + for _ in range(count): + course_a = _random_course() + course_b = _random_course() + while course_b["prefix"] == course_a["prefix"] and course_b["number"] == course_a["number"]: + course_b = _random_course() + results.append({"course_a": course_a, "course_b": course_b, "stats": _random_stats()}) + return results + + +def generate_synthetic_query_results( + config: dict[str, Any], + count: int, +) -> list[dict[str, Any]]: + """Generate synthetic query results for summarizer training.""" + if count == 0: + 
return [] + results = [] + for i in range(count): + template, default_viz = _QUERY_TEMPLATES[i % len(_QUERY_TEMPLATES)] + prompt = template.format( + dim=random.choice(_DIMS), + metric=random.choice(_METRICS), + year=random.choice(_YEARS), + alert=random.choice(_ALERTS), + ) + num_rows = random.randint(2, 8) + data = [] + for _ in range(num_rows): + row = { + "Race": random.choice(_RACES), + "value": round(random.uniform(0.15, 0.85), 2), + "count": random.randint(10, 500), + } + data.append(row) + results.append({"prompt": prompt, "data": data, "rowCount": num_rows, "vizType": default_viz}) + return results + + +def format_as_chatml(system: str, user: str, assistant: str) -> dict: + """Format a (system, user, assistant) triple as a ChatML messages dict.""" + return { + "messages": [ + {"role": "system", "content": system}, + {"role": "user", "content": user}, + {"role": "assistant", "content": assistant}, + ] + } From 227439a45bab70534caedaf993ff831a3e183649 Mon Sep 17 00:00:00 2001 From: William Hill Date: Fri, 27 Mar 2026 23:09:57 -0400 Subject: [PATCH 10/18] feat(training): distillation pipeline with dual teacher backend support --- tests/training/test_distill.py | 122 +++++++++++++++ training/distill.py | 277 +++++++++++++++++++++++++++++++++ 2 files changed, 399 insertions(+) create mode 100644 tests/training/test_distill.py create mode 100644 training/distill.py diff --git a/tests/training/test_distill.py b/tests/training/test_distill.py new file mode 100644 index 0000000..ccaba2f --- /dev/null +++ b/tests/training/test_distill.py @@ -0,0 +1,122 @@ +"""Tests for training.distill — teacher model distillation.""" + +import json +import pytest +from unittest.mock import patch, MagicMock + +from training.distill import ( + validate_json, + call_teacher, + generate_explainer_pairs, + generate_summarizer_pairs, +) + + +class TestValidateJson: + def test_valid_json(self): + result = validate_json('{"key": "value"}') + assert result == {"key": "value"} + + def 
test_strips_markdown_fences(self): + result = validate_json('```json\n{"key": "value"}\n```') + assert result == {"key": "value"} + + def test_returns_none_for_invalid(self): + assert validate_json("not json") is None + + def test_returns_none_for_empty(self): + assert validate_json("") is None + assert validate_json(None) is None + + def test_returns_none_for_non_dict(self): + assert validate_json("[1, 2, 3]") is None + + +class TestCallTeacher: + def test_calls_anthropic_backend(self): + mock_client = MagicMock() + mock_message = MagicMock() + mock_message.content = [MagicMock(text='{"result": "ok"}')] + mock_message.usage.input_tokens = 100 + mock_message.usage.output_tokens = 50 + mock_client.messages.create.return_value = mock_message + + with patch("training.distill._get_anthropic_client", return_value=mock_client): + result = call_teacher( + system="system prompt", + user="user prompt", + backend="anthropic", + model="claude-sonnet-4-20250514", + ) + + assert result == '{"result": "ok"}' + mock_client.messages.create.assert_called_once() + + def test_calls_ollama_backend(self): + mock_response = {"message": {"content": '{"result": "ok"}'}} + + with patch("training.distill.ollama") as mock_ollama: + mock_ollama.chat.return_value = mock_response + result = call_teacher( + system="system prompt", + user="user prompt", + backend="ollama", + model="qwen3.5:27b", + ) + + assert result == '{"result": "ok"}' + mock_ollama.chat.assert_called_once() + + +class TestGenerateExplainerPairs: + def test_generates_pairs_from_seed_data(self, sample_school_config, sample_course_pairing_data): + mock_response = json.dumps({ + "explanation": "Test explanation", + "structural_factors": ["factor1"], + "student_impact": "impact", + "advisor_recommendation": "recommendation", + "data_limitations": ["caveat"], + "related_intervention": None, + }) + + with patch("training.distill.call_teacher", return_value=mock_response): + pairs = generate_explainer_pairs( + 
config=sample_school_config, + seed_data=[sample_course_pairing_data], + count=2, + ) + + assert len(pairs) == 2 + assert "messages" in pairs[0] + assert len(pairs[0]["messages"]) == 3 + + def test_skips_invalid_responses(self, sample_school_config, sample_course_pairing_data): + with patch("training.distill.call_teacher", return_value="not json"): + pairs = generate_explainer_pairs( + config=sample_school_config, + seed_data=[sample_course_pairing_data], + count=3, + ) + + assert len(pairs) == 0 + + +class TestGenerateSummarizerPairs: + def test_generates_pairs_from_seed_data(self, sample_school_config, sample_query_result_data): + mock_response = json.dumps({ + "summary": "Test summary", + "key_insights": ["insight1"], + "context": "context", + "action_items": ["action"], + "caveats": ["caveat"], + }) + + with patch("training.distill.call_teacher", return_value=mock_response): + pairs = generate_summarizer_pairs( + config=sample_school_config, + seed_data=[sample_query_result_data], + count=2, + ) + + assert len(pairs) == 2 + assert "messages" in pairs[0] diff --git a/training/distill.py b/training/distill.py new file mode 100644 index 0000000..7f6e92d --- /dev/null +++ b/training/distill.py @@ -0,0 +1,277 @@ +"""Distillation pipeline — generate ChatML training pairs via a teacher model. 
+ +Supports two backends: + - anthropic: Claude Sonnet via Anthropic API (production quality) + - ollama: Local model via Ollama (free iteration) + +Usage: + python -m training.distill --school bishop-state [--local] +""" + +from __future__ import annotations + +import argparse +import functools +import json +import os +import time +from pathlib import Path +from typing import Any + +from training.config import get_training_data_dir, load_school_config, write_jsonl +from training.prompts import ( + EXPLAINER_STUDENT_SYSTEM, + SUMMARIZER_STUDENT_SYSTEM, + build_explainer_prompt, + build_summarizer_prompt, + build_system_prompt, +) +from training.seed import ( + format_as_chatml, + generate_synthetic_course_pairings, + generate_synthetic_query_results, + load_seed_queries, +) + +# Cost tracking +_COST_PER_M_INPUT = 3.00 +_COST_PER_M_OUTPUT = 15.00 +_total_input_tokens = 0 +_total_output_tokens = 0 +_total_calls = 0 + + +def _track_cost(input_tokens: int, output_tokens: int) -> None: + global _total_input_tokens, _total_output_tokens, _total_calls + _total_input_tokens += input_tokens + _total_output_tokens += output_tokens + _total_calls += 1 + + +def _cost_so_far() -> float: + return ( + _total_input_tokens / 1_000_000 * _COST_PER_M_INPUT + + _total_output_tokens / 1_000_000 * _COST_PER_M_OUTPUT + ) + + +def _print_cost_summary() -> None: + cost = _cost_so_far() + print( + f"[cost] {_total_calls} API calls | " + f"{_total_input_tokens:,} in + {_total_output_tokens:,} out tokens | " + f"${cost:.2f} spent so far", + flush=True, + ) + + +def validate_json(text: str | None) -> dict | None: + """Strip markdown fences and parse as JSON dict.""" + if not text or not isinstance(text, str) or not text.strip(): + return None + stripped = text.strip() + if stripped.startswith("```"): + lines = stripped.splitlines() + lines = lines[1:] + if lines and lines[-1].strip() == "```": + lines = lines[:-1] + stripped = "\n".join(lines).strip() + try: + obj = json.loads(stripped) + 
except (json.JSONDecodeError, ValueError): + return None + if not isinstance(obj, dict): + return None + return obj + + +@functools.lru_cache(maxsize=1) +def _get_anthropic_client(): + import anthropic + api_key = os.environ.get("ANTHROPIC_API_KEY") + if not api_key: + raise EnvironmentError("ANTHROPIC_API_KEY environment variable is required for Claude distillation.") + return anthropic.Anthropic(api_key=api_key) + + +try: + import ollama +except ImportError: + ollama = None # type: ignore[assignment] + + +def call_teacher(system: str, user: str, backend: str, model: str) -> str: + """Call the teacher model and return the response text.""" + preview = user[:120].replace("\n", " ") + print(f"[api] Calling {model} ({backend}) | {preview}...", flush=True) + + if backend == "anthropic": + client = _get_anthropic_client() + message = client.messages.create( + model=model, max_tokens=2048, system=system, + messages=[{"role": "user", "content": user}], + ) + usage = message.usage + _track_cost(usage.input_tokens, usage.output_tokens) + print(f"[api] done {usage.input_tokens}in/{usage.output_tokens}out tokens", flush=True) + if _total_calls % 10 == 0: + _print_cost_summary() + return message.content[0].text + + elif backend == "ollama": + if ollama is None: + raise ImportError("ollama package is required for local teacher. Install with: pip install ollama") + response = ollama.chat( + model=model, + messages=[ + {"role": "system", "content": system}, + {"role": "user", "content": user}, + ], + ) + return response["message"]["content"] + + else: + raise ValueError(f"Unknown backend: {backend!r}. 
Must be 'anthropic' or 'ollama'.") + + +def generate_explainer_pairs( + config: dict[str, Any], seed_data: list[dict[str, Any]], + count: int, outfile: Path | None = None, +) -> list[dict]: + """Generate explainer training pairs via teacher model distillation.""" + distill_config = config.get("distillation", {}) + backend = distill_config.get("teacher_backend", "anthropic") + model = distill_config.get("teacher_model", "claude-sonnet-4-20250514") + system_prompt = build_system_prompt(config) + pairs: list[dict] = [] + + fh = None + if outfile is not None: + outfile.parent.mkdir(parents=True, exist_ok=True) + fh = outfile.open("w", encoding="utf-8") + + try: + for idx in range(count): + if idx > 0 and idx % 25 == 0: + time.sleep(1) + course_data = seed_data[idx % len(seed_data)] + teacher_prompt = build_explainer_prompt(config, course_data) + try: + response_text = call_teacher(system_prompt, teacher_prompt, backend, model) + except Exception as exc: + print(f"[warn] Teacher call failed for explainer pair {idx}: {exc}", flush=True) + continue + validated = validate_json(response_text) + if validated is None: + print(f"[warn] Invalid JSON for explainer pair {idx}, skipping.", flush=True) + continue + student_user = json.dumps(course_data, ensure_ascii=False, default=str) + pair = format_as_chatml( + system=EXPLAINER_STUDENT_SYSTEM, user=student_user, + assistant=json.dumps(validated, ensure_ascii=False), + ) + pairs.append(pair) + if fh is not None: + fh.write(json.dumps(pair, ensure_ascii=False) + "\n") + fh.flush() + print(f"[explainer] {len(pairs)}/{count} pairs generated", flush=True) + finally: + if fh is not None: + fh.close() + print(f"[explainer] Saved {len(pairs)} pairs to {outfile}", flush=True) + return pairs + + +def generate_summarizer_pairs( + config: dict[str, Any], seed_data: list[dict[str, Any]], + count: int, outfile: Path | None = None, +) -> list[dict]: + """Generate summarizer training pairs via teacher model distillation.""" + distill_config = 
config.get("distillation", {}) + backend = distill_config.get("teacher_backend", "anthropic") + model = distill_config.get("teacher_model", "claude-sonnet-4-20250514") + system_prompt = build_system_prompt(config) + pairs: list[dict] = [] + + fh = None + if outfile is not None: + outfile.parent.mkdir(parents=True, exist_ok=True) + fh = outfile.open("w", encoding="utf-8") + + try: + for idx in range(count): + if idx > 0 and idx % 25 == 0: + time.sleep(1) + query_data = seed_data[idx % len(seed_data)] + teacher_prompt = build_summarizer_prompt(config, query_data) + try: + response_text = call_teacher(system_prompt, teacher_prompt, backend, model) + except Exception as exc: + print(f"[warn] Teacher call failed for summarizer pair {idx}: {exc}", flush=True) + continue + validated = validate_json(response_text) + if validated is None: + print(f"[warn] Invalid JSON for summarizer pair {idx}, skipping.", flush=True) + continue + student_user = json.dumps( + {"prompt": query_data["prompt"], "data": query_data["data"][:50]}, + ensure_ascii=False, default=str, + ) + pair = format_as_chatml( + system=SUMMARIZER_STUDENT_SYSTEM, user=student_user, + assistant=json.dumps(validated, ensure_ascii=False), + ) + pairs.append(pair) + if fh is not None: + fh.write(json.dumps(pair, ensure_ascii=False) + "\n") + fh.flush() + print(f"[summarizer] {len(pairs)}/{count} pairs generated", flush=True) + finally: + if fh is not None: + fh.close() + print(f"[summarizer] Saved {len(pairs)} pairs to {outfile}", flush=True) + return pairs + + +def main(school: str, local: bool = False) -> None: + """Run distillation for a school.""" + config = load_school_config(school) + if local: + config["distillation"]["teacher_backend"] = config["distillation"].get("local_teacher_backend", "ollama") + config["distillation"]["teacher_model"] = config["distillation"].get("local_teacher_model", "qwen3.5:27b") + print(f"[distill] Using local teacher: {config['distillation']['teacher_model']}") + else: + 
print(f"[distill] Using API teacher: {config['distillation']['teacher_model']}") + + pairs_per_task = config["distillation"].get("pairs_per_task", 1500) + data_dir = get_training_data_dir(school) + pairs_dir = data_dir / "pairs" + + seed_queries = load_seed_queries(school) + synthetic_pairings = generate_synthetic_course_pairings(config, count=pairs_per_task) + synthetic_results = generate_synthetic_query_results(config, count=pairs_per_task) + + print(f"\n{'='*60}\nEXPLAINER — generating {pairs_per_task} pairs\n{'='*60}") + explainer_pairs = generate_explainer_pairs( + config=config, seed_data=synthetic_pairings, + count=pairs_per_task, outfile=pairs_dir / "explainer.jsonl", + ) + + print(f"\n{'='*60}\nSUMMARIZER — generating {pairs_per_task} pairs\n{'='*60}") + summarizer_pairs = generate_summarizer_pairs( + config=config, seed_data=synthetic_results, + count=pairs_per_task, outfile=pairs_dir / "summarizer.jsonl", + ) + + print(f"\n{'='*60}\nDISTILLATION COMPLETE\n{'='*60}") + print(f" Explainer: {len(explainer_pairs)} pairs") + print(f" Summarizer: {len(summarizer_pairs)} pairs") + _print_cost_summary() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Generate training pairs via teacher model distillation.") + parser.add_argument("--school", required=True, help="School directory name") + parser.add_argument("--local", action="store_true", help="Use local Ollama teacher") + args = parser.parse_args() + main(args.school, local=args.local) From 7c6066023390b729578955047eab1a33bb25012d Mon Sep 17 00:00:00 2001 From: William Hill Date: Fri, 27 Mar 2026 23:10:45 -0400 Subject: [PATCH 11/18] =?UTF-8?q?feat(training):=20dataset=20preparation?= =?UTF-8?q?=20=E2=80=94=20filter,=20dedup,=20and=20split?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/training/test_prepare.py | 105 +++++++++++++++++++++ training/prepare.py | 162 +++++++++++++++++++++++++++++++++ 2 files changed, 267 
class TestJaccardSimilarity:
    """Behavioral checks for word-level Jaccard similarity."""

    def test_identical_strings(self):
        score = jaccard_similarity("hello world", "hello world")
        assert score == 1.0

    def test_completely_different(self):
        # Disjoint word sets: intersection is empty.
        score = jaccard_similarity("hello", "world")
        assert score == 0.0

    def test_partial_overlap(self):
        # Two of four distinct words shared → strictly between 0 and 1.
        score = jaccard_similarity("hello world foo", "hello world bar")
        assert 0.0 < score < 1.0

    def test_empty_string(self):
        # An empty side yields 0.0 rather than a division error.
        score = jaccard_similarity("", "hello")
        assert score == 0.0
class TestSplitDataset:
    """Checks for deterministic train/val/test splitting."""

    def test_split_ratios(self):
        # 100 items at 0.8/0.1 ratios must partition exactly 80/10/10.
        items = [{"id": i} for i in range(100)]
        splits = split_dataset(items, train_ratio=0.8, val_ratio=0.1)
        sizes = {name: len(part) for name, part in splits.items()}
        assert sizes == {"train": 80, "val": 10, "test": 10}

    def test_deterministic(self):
        # Same seed → same shuffle → identical train split.
        items = [{"id": i} for i in range(50)]
        first = split_dataset(items, seed=42)
        second = split_dataset(items, seed=42)
        assert first["train"] == second["train"]

    def test_empty_input(self):
        assert split_dataset([]) == {"train": [], "val": [], "test": []}
def _pair_is_clean(pair: dict[str, Any]) -> bool:
    """Return True when *pair* has a well-formed message list, a non-empty
    user turn, and a first assistant turn whose content parses as JSON."""
    messages = pair.get("messages")
    if not isinstance(messages, list) or not messages:
        return False
    if not all(isinstance(msg, dict) for msg in messages):
        return False
    if not any(msg.get("role") == "user" and msg.get("content") for msg in messages):
        return False
    # Only the first assistant message is inspected, matching ChatML layout.
    assistant_text = next(
        (msg.get("content") for msg in messages if msg.get("role") == "assistant"),
        None,
    )
    if not isinstance(assistant_text, str) or not assistant_text:
        return False
    try:
        json.loads(assistant_text)
    except (json.JSONDecodeError, ValueError):
        return False
    return True


def filter_invalid_json(pairs: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Keep only pairs with valid structure and JSON-parseable assistant content."""
    return [pair for pair in pairs if _pair_is_clean(pair)]
def split_dataset(
    pairs: list[dict[str, Any]], train_ratio: float = TRAIN_RATIO,
    val_ratio: float = VAL_RATIO, seed: int = 42,
) -> dict[str, list[dict[str, Any]]]:
    """Shuffle and split pairs into train/val/test with a deterministic seed.

    The input list is not mutated; a seeded copy is shuffled so repeated
    calls with the same seed produce identical splits. Whatever remains
    after the train and val slices becomes the test split.
    """
    if not pairs:
        return {"train": [], "val": [], "test": []}
    deck = list(pairs)
    random.Random(seed).shuffle(deck)
    total = len(deck)
    n_train = round(total * train_ratio)
    n_val = round(total * val_ratio)
    train_part, remainder = deck[:n_train], deck[n_train:]
    return {
        "train": train_part,
        "val": remainder[:n_val],
        "test": remainder[n_val:],
    }
def main(school: str) -> None:
    """Run the filter → dedup → split pipeline for every task of *school*.

    A missing pairs file for one task is reported and skipped so the other
    task can still be processed.
    """
    tasks = ("explainer", "summarizer")
    for task_name in tasks:
        try:
            process_task(school, task_name)
        except FileNotFoundError as missing:
            # Keep going: one task's pairs may exist while the other's do not.
            print(f"[warn] {missing} — skipping")
class TestShipCriteria:
    """Checks for the ship / ship_with_gaps / no_ship quality gate."""

    def test_passes_with_good_metrics(self):
        # Every blocking metric clears its threshold → "ship".
        metrics = {
            "json_validity": 0.98,
            "schema_adherence": 0.95,
            "caveat_inclusion": 0.92,
            "factual_grounding": 0.90,
        }
        decision = check_ship_criteria(metrics, "explainer")
        assert decision.decision == "ship"
        assert decision.blocking_failures == []

    def test_fails_with_low_json_validity(self):
        # One blocking metric under threshold → "no_ship" with a recorded failure.
        metrics = {
            "json_validity": 0.80,
            "schema_adherence": 0.95,
            "caveat_inclusion": 0.92,
            "factual_grounding": 0.90,
        }
        decision = check_ship_criteria(metrics, "explainer")
        assert decision.decision == "no_ship"
        assert decision.blocking_failures

    def test_ship_with_gaps(self):
        # A low informational metric may downgrade to "ship_with_gaps",
        # but must never block the ship outright.
        metrics = {
            "json_validity": 0.98,
            "schema_adherence": 0.95,
            "caveat_inclusion": 0.85,
            "factual_grounding": 0.90,
            "explanation_quality": 0.30,
        }
        decision = check_ship_criteria(metrics, "explainer")
        assert decision.decision in ("ship", "ship_with_gaps")
# ---------------------------------------------------------------------------
# Required keys per task
# ---------------------------------------------------------------------------

# Top-level keys every explainer output object must contain; consumed by
# check_schema_adherence. "data_limitations" doubles as the explainer's
# caveat field (see check_caveat_inclusion).
_EXPLAINER_REQUIRED_KEYS: set[str] = {
    "explanation",
    "structural_factors",
    "student_impact",
    "advisor_recommendation",
    "data_limitations",
    "related_intervention",
}

# Top-level keys every summarizer output object must contain; "caveats" is
# the summarizer's caveat field (see check_caveat_inclusion).
_SUMMARIZER_REQUIRED_KEYS: set[str] = {
    "summary",
    "key_insights",
    "context",
    "action_items",
    "caveats",
}

# ---------------------------------------------------------------------------
# Ship criteria — minimum thresholds per task
# ---------------------------------------------------------------------------

# Blocking thresholds: a metric listed here that falls below its value
# forces a "no_ship" decision (see check_ship_criteria); metrics not listed
# are informational only. Both tasks currently share identical thresholds.
SHIP_CRITERIA: dict[str, dict[str, float]] = {
    "explainer": {
        "json_validity": 0.95,
        "schema_adherence": 0.90,
        "caveat_inclusion": 0.85,
        "factual_grounding": 0.80,
    },
    "summarizer": {
        "json_validity": 0.95,
        "schema_adherence": 0.90,
        "caveat_inclusion": 0.85,
        "factual_grounding": 0.80,
    },
}
def check_json_validity(outputs: list[str]) -> float:
    """Fraction of outputs that parse as valid JSON dicts.

    Non-dict JSON (arrays, scalars) counts as invalid; an empty input
    list scores 0.0.
    """
    if not outputs:
        return 0.0

    def _parses_to_dict(text: str) -> bool:
        try:
            return isinstance(json.loads(text), dict)
        except (json.JSONDecodeError, ValueError):
            return False

    hits = sum(1 for text in outputs if _parses_to_dict(text))
    return hits / len(outputs)
def check_factual_grounding(outputs: list[str], inputs: list[dict[str, Any]]) -> float:
    """Fraction of outputs that contain numeric values referenced in their input.

    For each (input, output) pair — matched positionally, extra items in the
    longer list ignored (zip semantics) — all numbers are extracted from the
    serialized input JSON and the output passes if at least one appears
    verbatim in the output text. Inputs with no numbers pass automatically.

    Fixes over the previous version: the ``re`` import and pattern are
    hoisted out of the per-pair loop, a dead ``numbers = []`` assignment is
    removed, and the zip is no longer materialized into a throwaway list.
    """
    import re  # local import: the module's top-level imports do not include re

    if not outputs:
        return 0.0
    # Matches integers and simple decimals, e.g. "72" or "72.5".
    number_pattern = re.compile(r"\b\d+(?:\.\d+)?\b")
    passing = 0
    total = 0
    for output_text, input_data in zip(outputs, inputs):
        total += 1
        # default=str deliberately stringifies non-JSON-native values.
        input_str = json.dumps(input_data, default=str)
        numbers = number_pattern.findall(input_str)
        if not numbers:
            # No numbers in input — cannot verify grounding; give benefit of doubt
            passing += 1
            continue
        if any(num in output_text for num in numbers):
            passing += 1
    return passing / total if total else 0.0
+ A "ship_with_gaps" decision is returned when all blocking criteria pass but + informational metrics are notably low (< 0.5). + """ + criteria = SHIP_CRITERIA.get(task, {}) + blocking_failures: list[CriterionFailure] = [] + warnings: list[str] = [] + + for metric, value in metrics.items(): + threshold = criteria.get(metric) + if threshold is not None: + if value < threshold: + blocking_failures.append( + CriterionFailure(metric=metric, threshold=threshold, actual=value) + ) + else: + # Informational metric — warn if very low + if value < 0.5: + warnings.append( + f"{metric} is low ({value:.3f}) — consider improving before deploying" + ) + + if blocking_failures: + decision = "no_ship" + elif warnings: + decision = "ship_with_gaps" + else: + decision = "ship" + + return ShipDecision( + decision=decision, + metrics=metrics, + blocking_failures=blocking_failures, + warnings=warnings, + ) + + +# --------------------------------------------------------------------------- +# Test-set loading and inference +# --------------------------------------------------------------------------- + +def load_test_set(path: Path) -> list[dict[str, Any]]: + """Load a ChatML JSONL test set from path.""" + records: list[dict[str, Any]] = [] + with path.open("r", encoding="utf-8") as fh: + for line in fh: + line = line.strip() + if line: + records.append(json.loads(line)) + return records + + +def _extract_user_content(record: dict[str, Any]) -> str | None: + """Extract the user message content from a ChatML record.""" + for msg in record.get("messages", []): + if msg.get("role") == "user": + return msg.get("content") + return None + + +def _extract_assistant_content(record: dict[str, Any]) -> str | None: + """Extract the assistant message content from a ChatML record.""" + for msg in record.get("messages", []): + if msg.get("role") == "assistant": + return msg.get("content") + return None + + +def _extract_system_content(record: dict[str, Any]) -> str | None: + """Extract the system 
def run_eval(school: str, task: str) -> ShipDecision:
    """Run inference on the test set, compute metrics, and return a ShipDecision.

    Inference is performed via Ollama using the fine-tuned model registered
    as ``{school}-{task}`` (e.g. ``bishop-state-explainer``).

    Parameters
    ----------
    school:
        School directory name used to locate the test split and model.
    task:
        Either ``"explainer"`` or ``"summarizer"``.

    Raises
    ------
    FileNotFoundError
        If the prepared test split does not exist.
    ValueError
        If the test split exists but contains no records.
    """
    # NOTE(review): config is loaded but never read below — presumably kept
    # for its validation side effect (unknown school → error); confirm.
    config = load_school_config(school)
    data_dir = get_training_data_dir(school)
    test_path = data_dir / "final" / task / "test.jsonl"

    if not test_path.exists():
        raise FileNotFoundError(
            f"Test set not found at {test_path}. "
            "Run `python -m training.prepare` first."
        )

    records = load_test_set(test_path)
    if not records:
        raise ValueError(f"Test set is empty: {test_path}")

    model_name = f"{school}-{task}"
    print(f"[eval] Running inference with model '{model_name}' on {len(records)} examples")

    outputs: list[str] = []
    inputs: list[dict[str, Any]] = []

    for idx, record in enumerate(records):
        system = _extract_system_content(record) or ""
        user = _extract_user_content(record) or ""
        try:
            response = _call_ollama(model_name, system, user)
        except Exception as exc:
            # A failed call is recorded as an empty output, which counts
            # against every metric below rather than aborting the eval.
            print(f"[eval] Inference failed for record {idx}: {exc}", flush=True)
            response = ""

        outputs.append(response)
        # Parse the user payload back into a dict so factual grounding can
        # compare numbers; non-JSON payloads are wrapped under "raw".
        try:
            user_data = json.loads(user)
        except (json.JSONDecodeError, ValueError):
            user_data = {"raw": user}
        inputs.append(user_data)

        if (idx + 1) % 10 == 0:
            print(f"[eval] {idx + 1}/{len(records)} done", flush=True)

    # Compute metrics
    metrics: dict[str, float] = {
        "json_validity": check_json_validity(outputs),
        "schema_adherence": check_schema_adherence(outputs, task),
        "caveat_inclusion": check_caveat_inclusion(outputs, task),
        "factual_grounding": check_factual_grounding(outputs, inputs),
    }

    print(f"\n[eval] Results for {school}/{task}:")
    for k, v in metrics.items():
        print(f"  {k}: {v:.3f}")

    decision = check_ship_criteria(metrics, task)
    print(f"\n{decision}")
    return decision
bishop-state)") + parser.add_argument( + "--task", + choices=["explainer", "summarizer"], + default=None, + help="Task to evaluate (default: both)", + ) + args = parser.parse_args() + + tasks = [args.task] if args.task else ["explainer", "summarizer"] + results: dict[str, ShipDecision] = {} + for task in tasks: + print(f"\n{'='*60}\nEVAL: {task.upper()}\n{'='*60}") + try: + results[task] = run_eval(args.school, task) + except FileNotFoundError as exc: + print(f"[warn] {exc} — skipping {task}") + + print(f"\n{'='*60}\nSUMMARY\n{'='*60}") + for task, decision in results.items(): + print(f" {task}: {decision.decision.upper()}") + + +if __name__ == "__main__": + main() From 2cefc17a7109800c1361fda0c42cc600e95577d3 Mon Sep 17 00:00:00 2001 From: William Hill Date: Fri, 27 Mar 2026 23:39:24 -0400 Subject: [PATCH 13/18] feat(training): MLX QLoRA fine-tuning wrapper --- training/finetune.py | 208 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 208 insertions(+) create mode 100644 training/finetune.py diff --git a/training/finetune.py b/training/finetune.py new file mode 100644 index 0000000..2c424f2 --- /dev/null +++ b/training/finetune.py @@ -0,0 +1,208 @@ +"""MLX QLoRA fine-tuning wrapper for student success adapters. + +Wraps ``mlx_lm.lora`` to fine-tune Qwen models on ChatML training pairs +produced by the distillation and preparation pipeline. 
# Maps CLI size keys to Hugging Face model identifiers for MLX.
_MODEL_MAP: dict[str, str] = {
    "4b": "Qwen/Qwen3.5-4B",
    "9b": "Qwen/Qwen3.5-9B",
    "27b": "Qwen/Qwen3.5-27B",
}


def build_lora_config(config: dict[str, Any], task: str, data_dir: Path) -> dict[str, Any]:
    """Build the MLX LoRA config dict from a school training config.

    Parameters
    ----------
    config:
        School config loaded from ``config.yaml``.
    task:
        One of ``"explainer"`` or ``"summarizer"``.
    data_dir:
        Path to the school's training data directory (``training_data/<school>/``).

    Returns
    -------
    dict
        MLX LoRA configuration dict suitable for writing to a JSON file and
        passing to ``mlx_lm.lora``.
    """
    train_cfg = config.get("training", {})
    final_dir = data_dir / "final" / task

    # Hoist values that are referenced more than once below.
    learning_rate = train_cfg.get("learning_rate", 1e-4)
    warmup = train_cfg.get("warmup_steps", 100)
    eval_every = train_cfg.get("eval_every", 50)

    return {
        "model": _MODEL_MAP.get(
            train_cfg.get("default_model", "qwen3.5:9b").split(":")[-1],
            _MODEL_MAP["9b"],
        ),
        "train": True,
        "data": str(final_dir),
        "fine_tune_type": train_cfg.get("method", "qlora"),
        # BUG FIX: in mlx_lm.lora, num_layers is the number of transformer
        # layers to adapt — not the LoRA rank. Previously this reused
        # "lora_rank", silently changing the adapted layer count whenever a
        # non-default rank was configured.
        "num_layers": train_cfg.get("num_layers", 16),
        "lora_parameters": {
            "rank": train_cfg.get("lora_rank", 16),
            "alpha": train_cfg.get("lora_alpha", 32),
            "dropout": 0.05,
            "scale": 10.0,
        },
        "batch_size": train_cfg.get("batch_size", 4),
        # assumes ~1000 optimizer steps per epoch — TODO confirm against data size
        "iters": train_cfg.get("epochs", 3) * 1000,
        "val_batches": 25,
        "learning_rate": learning_rate,
        "steps_per_report": 10,
        "steps_per_eval": eval_every,
        "save_every": eval_every,
        "adapter_path": str(data_dir / "adapters" / task),
        "grad_checkpoint": True,
        "seed": 42,
        "warmup": warmup,
        "lr_schedule": {
            "name": "cosine_decay",
            "warmup": warmup,
            "warmup_init": 1e-7,
            "arguments": [learning_rate, 1e-6],
        },
    }
def main() -> None:
    """CLI entry point: fine-tune one or both tasks, exiting with the worst
    per-task return code (0 only when every run succeeds)."""
    parser = argparse.ArgumentParser(
        description="Run MLX QLoRA fine-tuning for a student success adapter."
    )
    parser.add_argument(
        "--school", required=True,
        help="School directory name (e.g. bishop-state)",
    )
    parser.add_argument(
        "--model", choices=list(_MODEL_MAP.keys()), default="9b",
        help="Model size to fine-tune (default: 9b)",
    )
    parser.add_argument(
        "--task", choices=["explainer", "summarizer"], default=None,
        help="Task to fine-tune (default: both)",
    )
    args = parser.parse_args()

    selected = ["explainer", "summarizer"] if args.task is None else [args.task]
    results: list[int] = []
    banner = "=" * 60

    for task_name in selected:
        print(f"\n{banner}\nFINETUNE: {task_name.upper()} | model={args.model}\n{banner}")
        try:
            results.append(run_finetune(args.school, args.model, task_name))
        except (FileNotFoundError, ValueError) as err:
            # Missing prepared data or a bad model key fails this task but
            # still lets the remaining task(s) run.
            print(f"[error] {err}", file=sys.stderr)
            results.append(1)

    sys.exit(max(results, default=0))
_MODELFILE_TEMPLATE = """\
FROM {base_model}

# Adapter produced by MLX QLoRA fine-tuning
ADAPTER {adapter_path}

# System prompt
SYSTEM {system_prompt_json}

# Recommended inference parameters
PARAMETER temperature 0.2
PARAMETER top_p 0.9
PARAMETER repeat_penalty 1.1
PARAMETER stop "<|im_end|>"
PARAMETER stop "<|endoftext|>"
"""


def generate_modelfile(base_model: str, adapter_path: str, system_prompt: str) -> str:
    """Render an Ollama Modelfile string.

    Parameters
    ----------
    base_model:
        The base Ollama model tag (e.g. ``"qwen3.5:9b"``).
    adapter_path:
        Absolute path to the MLX LoRA adapter directory.
    system_prompt:
        The system prompt string to embed in the Modelfile.

    Returns
    -------
    str
        The fully rendered Modelfile content.
    """
    # JSON-encode the prompt so embedded quotes and newlines survive as a
    # single quoted Modelfile token.
    escaped_prompt = json.dumps(system_prompt)
    return _MODELFILE_TEMPLATE.format(
        base_model=base_model,
        adapter_path=adapter_path,
        system_prompt_json=escaped_prompt,
    )
"bishop-state-explainer" + ollama_model_name = f"{school}-{task}" + + # Write Modelfile + modelfile_content = generate_modelfile( + base_model=base_model, + adapter_path=str(adapter_path.resolve()), + system_prompt=system_prompt, + ) + + export_dir = data_dir / "export" / task + export_dir.mkdir(parents=True, exist_ok=True) + modelfile_path = export_dir / "Modelfile" + + with modelfile_path.open("w", encoding="utf-8") as fh: + fh.write(modelfile_content) + + print(f"[export] Modelfile written to {modelfile_path}") + print(f"[export] Base model: {base_model}") + print(f"[export] Adapter: {adapter_path.resolve()}") + print(f"[export] Registering as Ollama model: {ollama_model_name}") + + cmd = ["ollama", "create", ollama_model_name, "--file", str(modelfile_path)] + print(f"[export] Running: {' '.join(cmd)}", flush=True) + + result = subprocess.run(cmd, check=False) + + if result.returncode == 0: + print(f"[export] Model '{ollama_model_name}' registered successfully.") + print(f"[export] Test with: ollama run {ollama_model_name}") + else: + print( + f"[export] ollama create failed with return code {result.returncode}. " + "Ensure Ollama is running and the adapter path is correct.", + file=sys.stderr, + ) + + return result.returncode + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + +def main() -> None: + parser = argparse.ArgumentParser( + description="Export a fine-tuned adapter as an Ollama model." + ) + parser.add_argument( + "--school", required=True, + help="School directory name (e.g. 
bishop-state)", + ) + parser.add_argument( + "--task", choices=["explainer", "summarizer"], default=None, + help="Task to export (default: both)", + ) + parser.add_argument( + "--model", choices=["4b", "9b", "27b"], default="9b", + help="Model size used during fine-tuning (default: 9b)", + ) + args = parser.parse_args() + + tasks = [args.task] if args.task else ["explainer", "summarizer"] + exit_codes: list[int] = [] + + for task in tasks: + print(f"\n{'='*60}\nEXPORT: {task.upper()} | model={args.model}\n{'='*60}") + try: + code = export_model(args.school, task, args.model) + exit_codes.append(code) + except (FileNotFoundError, ValueError) as exc: + print(f"[error] {exc}", file=sys.stderr) + exit_codes.append(1) + + sys.exit(max(exit_codes) if exit_codes else 0) + + +if __name__ == "__main__": + main() From 17c9ea7ecda42ad171d7d3ea6829161b63914747 Mon Sep 17 00:00:00 2001 From: William Hill Date: Fri, 27 Mar 2026 23:41:15 -0400 Subject: [PATCH 15/18] feat(dashboard): model client adapter for Ollama/OpenAI routing --- codebenders-dashboard/lib/model-client.ts | 77 +++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 codebenders-dashboard/lib/model-client.ts diff --git a/codebenders-dashboard/lib/model-client.ts b/codebenders-dashboard/lib/model-client.ts new file mode 100644 index 0000000..0e246e0 --- /dev/null +++ b/codebenders-dashboard/lib/model-client.ts @@ -0,0 +1,77 @@ +/** + * Model client adapter — routes inference to Ollama (fine-tuned) or + * OpenAI (fallback) based on MODEL_BACKEND env var. 
+ */ + +import { generateText } from "ai" +import { createOpenAI } from "@ai-sdk/openai" + +const MODEL_BACKEND = process.env.MODEL_BACKEND || "openai" +const SCHOOL_CODE = process.env.SCHOOL_CODE || "bishop-state" +const OLLAMA_BASE_URL = process.env.OLLAMA_BASE_URL || "http://localhost:11434" + +const openai = createOpenAI({ + apiKey: process.env.OPENAI_API_KEY || "", +}) + +async function callOllama(model: string, prompt: string): Promise { + const response = await fetch(`${OLLAMA_BASE_URL}/api/generate`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + model, + prompt, + stream: false, + options: { + temperature: 0.3, + num_predict: 1024, + }, + }), + }) + + if (!response.ok) { + throw new Error(`Ollama error: ${response.status} ${response.statusText}`) + } + + const data = await response.json() + return data.response +} + +async function callOpenAI(prompt: string, maxTokens: number): Promise { + const result = await generateText({ + model: openai("gpt-4o-mini"), + prompt, + maxTokens, + }) + return result.text +} + +/** + * Generate a course pairing explanation. + */ +export async function generateExplanation( + prompt: string, + maxTokens: number = 320, +): Promise { + if (MODEL_BACKEND === "ollama") { + const modelSize = process.env.MODEL_SIZE || "9b" + const model = `${SCHOOL_CODE}-explainer:${modelSize}` + return callOllama(model, prompt) + } + return callOpenAI(prompt, maxTokens) +} + +/** + * Generate a query result summary. 
+ */ +export async function generateSummary( + prompt: string, + maxTokens: number = 200, +): Promise { + if (MODEL_BACKEND === "ollama") { + const modelSize = process.env.MODEL_SIZE || "9b" + const model = `${SCHOOL_CODE}-summarizer:${modelSize}` + return callOllama(model, prompt) + } + return callOpenAI(prompt, maxTokens) +} From b437a6e07d369c4c75f4cb06a35d7bc82922f574 Mon Sep 17 00:00:00 2001 From: William Hill Date: Fri, 27 Mar 2026 23:53:47 -0400 Subject: [PATCH 16/18] feat(dashboard): route explain-pairing and query-summary through model client --- .../app/api/courses/explain-pairing/route.ts | 13 +++---------- .../app/api/query-summary/route.ts | 15 ++++----------- codebenders-dashboard/lib/model-client.ts | 2 +- 3 files changed, 8 insertions(+), 22 deletions(-) diff --git a/codebenders-dashboard/app/api/courses/explain-pairing/route.ts b/codebenders-dashboard/app/api/courses/explain-pairing/route.ts index fc560c0..d08ddb3 100644 --- a/codebenders-dashboard/app/api/courses/explain-pairing/route.ts +++ b/codebenders-dashboard/app/api/courses/explain-pairing/route.ts @@ -1,10 +1,7 @@ import { type NextRequest, NextResponse } from "next/server" import { getPool } from "@/lib/db" import { canAccess, type Role } from "@/lib/roles" -import { generateText } from "ai" -import { createOpenAI } from "@ai-sdk/openai" - -const openai = createOpenAI({ apiKey: process.env.OPENAI_API_KEY || "" }) +import { generateExplanation } from "@/lib/model-client" const DELIVERY_LABELS: Record = { F: "Face-to-Face", @@ -18,7 +15,7 @@ export async function POST(request: NextRequest) { return NextResponse.json({ error: "Forbidden" }, { status: 403 }) } - if (!process.env.OPENAI_API_KEY) { + if (process.env.MODEL_BACKEND !== "ollama" && !process.env.OPENAI_API_KEY) { return NextResponse.json({ error: "OpenAI API key not configured" }, { status: 500 }) } @@ -189,11 +186,7 @@ Write a concise analysis (3-4 sentences) that: Be practical and data-driven. 
Do not speculate beyond what the numbers show.` - const result = await generateText({ - model: openai("gpt-4o-mini"), - prompt: llmPrompt, - maxOutputTokens: 320, - }) + const result = { text: await generateExplanation(llmPrompt, 320) } return NextResponse.json({ stats, explanation: result.text }) } catch (error) { diff --git a/codebenders-dashboard/app/api/query-summary/route.ts b/codebenders-dashboard/app/api/query-summary/route.ts index 6fa563d..07fc513 100644 --- a/codebenders-dashboard/app/api/query-summary/route.ts +++ b/codebenders-dashboard/app/api/query-summary/route.ts @@ -1,9 +1,6 @@ import { type NextRequest, NextResponse } from "next/server" import { canAccess, type Role } from "@/lib/roles" -import { generateText } from "ai" -import { createOpenAI } from "@ai-sdk/openai" - -const openai = createOpenAI({ apiKey: process.env.OPENAI_API_KEY || "" }) +import { generateSummary } from "@/lib/model-client" export async function POST(request: NextRequest) { const role = request.headers.get("x-user-role") as Role | null @@ -11,7 +8,7 @@ export async function POST(request: NextRequest) { return NextResponse.json({ error: "Forbidden" }, { status: 403 }) } - if (!process.env.OPENAI_API_KEY) { + if (process.env.MODEL_BACKEND !== "ollama" && !process.env.OPENAI_API_KEY) { return NextResponse.json({ error: "OpenAI API key not configured" }, { status: 500 }) } @@ -47,12 +44,8 @@ ${JSON.stringify(sampleRows, null, 2)} Write a 2-3 sentence plain-English summary of what these results show. Be specific about the numbers. Do not speculate beyond the data. 
Address the advisor directly.` try { - const result = await generateText({ - model: openai("gpt-4o-mini"), - prompt: llmPrompt, - maxOutputTokens: 200, - }) - return NextResponse.json({ summary: result.text }) + const summary = await generateSummary(llmPrompt, 200) + return NextResponse.json({ summary }) } catch (error) { console.error("[query-summary] Error:", error) return NextResponse.json( diff --git a/codebenders-dashboard/lib/model-client.ts b/codebenders-dashboard/lib/model-client.ts index 0e246e0..09ca349 100644 --- a/codebenders-dashboard/lib/model-client.ts +++ b/codebenders-dashboard/lib/model-client.ts @@ -41,7 +41,7 @@ async function callOpenAI(prompt: string, maxTokens: number): Promise { const result = await generateText({ model: openai("gpt-4o-mini"), prompt, - maxTokens, + maxOutputTokens: maxTokens, }) return result.text } From 86e3d9c0fc7ae7e62f874c5a057590978f16662e Mon Sep 17 00:00:00 2001 From: William Hill Date: Sat, 28 Mar 2026 00:30:22 -0400 Subject: [PATCH 17/18] =?UTF-8?q?refactor:=20simplify=20training=20pipelin?= =?UTF-8?q?e=20=E2=80=94=20deduplicate=20code,=20extract=20shared=20utilit?= =?UTF-8?q?ies?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Unify generate_explainer/summarizer_pairs into single generate_pairs() - Extract read_jsonl() and get_message_content() to config.py - Replace 3 duplicate _extract_*_content helpers with get_message_content - Lazy-init OpenAI client in model-client.ts (skip when using Ollama) - Extract shared generate() helper in model-client.ts - Move import re to module level in eval.py - Remove redundant config mutation in finetune.py - Batch flush every 25 records instead of every record - Remove unnecessary what-comments --- codebenders-dashboard/lib/model-client.ts | 38 +++---- training/config.py | 19 ++++ training/distill.py | 117 +++++++++++----------- training/eval.py | 44 +------- training/export.py | 18 ++-- training/finetune.py | 6 -- 
training/prepare.py | 25 +---- 7 files changed, 118 insertions(+), 149 deletions(-) diff --git a/codebenders-dashboard/lib/model-client.ts b/codebenders-dashboard/lib/model-client.ts index 09ca349..282c7bf 100644 --- a/codebenders-dashboard/lib/model-client.ts +++ b/codebenders-dashboard/lib/model-client.ts @@ -9,10 +9,16 @@ import { createOpenAI } from "@ai-sdk/openai" const MODEL_BACKEND = process.env.MODEL_BACKEND || "openai" const SCHOOL_CODE = process.env.SCHOOL_CODE || "bishop-state" const OLLAMA_BASE_URL = process.env.OLLAMA_BASE_URL || "http://localhost:11434" +const MODEL_SIZE = process.env.MODEL_SIZE || "9b" -const openai = createOpenAI({ - apiKey: process.env.OPENAI_API_KEY || "", -}) +let _openai: ReturnType | null = null + +function getOpenAI() { + if (!_openai) { + _openai = createOpenAI({ apiKey: process.env.OPENAI_API_KEY || "" }) + } + return _openai +} async function callOllama(model: string, prompt: string): Promise { const response = await fetch(`${OLLAMA_BASE_URL}/api/generate`, { @@ -37,9 +43,17 @@ async function callOllama(model: string, prompt: string): Promise { return data.response } -async function callOpenAI(prompt: string, maxTokens: number): Promise { +async function generate( + task: "explainer" | "summarizer", + prompt: string, + maxTokens: number, +): Promise { + if (MODEL_BACKEND === "ollama") { + const model = `${SCHOOL_CODE}-${task}:${MODEL_SIZE}` + return callOllama(model, prompt) + } const result = await generateText({ - model: openai("gpt-4o-mini"), + model: getOpenAI()("gpt-4o-mini"), prompt, maxOutputTokens: maxTokens, }) @@ -53,12 +67,7 @@ export async function generateExplanation( prompt: string, maxTokens: number = 320, ): Promise { - if (MODEL_BACKEND === "ollama") { - const modelSize = process.env.MODEL_SIZE || "9b" - const model = `${SCHOOL_CODE}-explainer:${modelSize}` - return callOllama(model, prompt) - } - return callOpenAI(prompt, maxTokens) + return generate("explainer", prompt, maxTokens) } /** @@ -68,10 +77,5 
@@ export async function generateSummary( prompt: string, maxTokens: number = 200, ): Promise { - if (MODEL_BACKEND === "ollama") { - const modelSize = process.env.MODEL_SIZE || "9b" - const model = `${SCHOOL_CODE}-summarizer:${modelSize}` - return callOllama(model, prompt) - } - return callOpenAI(prompt, maxTokens) + return generate("summarizer", prompt, maxTokens) } diff --git a/training/config.py b/training/config.py index 36c1bee..501b5be 100644 --- a/training/config.py +++ b/training/config.py @@ -51,6 +51,25 @@ def get_training_data_dir(school: str) -> Path: return BASE_DIR / school +def read_jsonl(path: Path) -> list[dict[str, Any]]: + """Read a JSONL file and return a list of parsed dicts.""" + items = [] + with path.open("r", encoding="utf-8") as fh: + for line in fh: + line = line.strip() + if line: + items.append(json.loads(line)) + return items + + +def get_message_content(record: dict[str, Any], role: str) -> str | None: + """Extract message content for a given role from a ChatML record.""" + for msg in record.get("messages", []): + if msg.get("role") == role: + return msg.get("content") + return None + + def write_jsonl( items: list, outfile: Path, diff --git a/training/distill.py b/training/distill.py index 7f6e92d..e68fd83 100644 --- a/training/distill.py +++ b/training/distill.py @@ -134,15 +134,46 @@ def call_teacher(system: str, user: str, backend: str, model: str) -> str: raise ValueError(f"Unknown backend: {backend!r}. 
Must be 'anthropic' or 'ollama'.") -def generate_explainer_pairs( +_FLUSH_INTERVAL = 25 + +_TASK_CONFIG = { + "explainer": { + "prompt_builder": build_explainer_prompt, + "student_system": EXPLAINER_STUDENT_SYSTEM, + "format_user": lambda config, data: json.dumps(data, ensure_ascii=False, default=str), + }, + "summarizer": { + "prompt_builder": build_summarizer_prompt, + "student_system": SUMMARIZER_STUDENT_SYSTEM, + "format_user": lambda config, data: json.dumps( + {"prompt": data["prompt"], "data": data["data"][:50]}, + ensure_ascii=False, default=str, + ), + }, +} + + +def generate_pairs( config: dict[str, Any], seed_data: list[dict[str, Any]], - count: int, outfile: Path | None = None, + count: int, task: str, outfile: Path | None = None, + system_prompt: str | None = None, ) -> list[dict]: - """Generate explainer training pairs via teacher model distillation.""" + """Generate training pairs via teacher model distillation. + + Args: + config: Parsed school config. + seed_data: List of seed data dicts. + count: Number of pairs to generate. + task: "explainer" or "summarizer". + outfile: If provided, pairs are written incrementally. + system_prompt: Pre-built system prompt (avoids recomputation). 
+ """ + task_cfg = _TASK_CONFIG[task] distill_config = config.get("distillation", {}) backend = distill_config.get("teacher_backend", "anthropic") model = distill_config.get("teacher_model", "claude-sonnet-4-20250514") - system_prompt = build_system_prompt(config) + if system_prompt is None: + system_prompt = build_system_prompt(config) pairs: list[dict] = [] fh = None @@ -154,83 +185,51 @@ def generate_explainer_pairs( for idx in range(count): if idx > 0 and idx % 25 == 0: time.sleep(1) - course_data = seed_data[idx % len(seed_data)] - teacher_prompt = build_explainer_prompt(config, course_data) + seed_item = seed_data[idx % len(seed_data)] + teacher_prompt = task_cfg["prompt_builder"](config, seed_item) try: response_text = call_teacher(system_prompt, teacher_prompt, backend, model) except Exception as exc: - print(f"[warn] Teacher call failed for explainer pair {idx}: {exc}", flush=True) + print(f"[warn] Teacher call failed for {task} pair {idx}: {exc}", flush=True) continue validated = validate_json(response_text) if validated is None: - print(f"[warn] Invalid JSON for explainer pair {idx}, skipping.", flush=True) + print(f"[warn] Invalid JSON for {task} pair {idx}, skipping.", flush=True) continue - student_user = json.dumps(course_data, ensure_ascii=False, default=str) + student_user = task_cfg["format_user"](config, seed_item) pair = format_as_chatml( - system=EXPLAINER_STUDENT_SYSTEM, user=student_user, + system=task_cfg["student_system"], user=student_user, assistant=json.dumps(validated, ensure_ascii=False), ) pairs.append(pair) if fh is not None: fh.write(json.dumps(pair, ensure_ascii=False) + "\n") - fh.flush() - print(f"[explainer] {len(pairs)}/{count} pairs generated", flush=True) + if len(pairs) % _FLUSH_INTERVAL == 0: + fh.flush() + print(f"[{task}] {len(pairs)}/{count} pairs generated", flush=True) finally: if fh is not None: fh.close() - print(f"[explainer] Saved {len(pairs)} pairs to {outfile}", flush=True) + print(f"[{task}] Saved {len(pairs)} 
pairs to {outfile}", flush=True) return pairs -def generate_summarizer_pairs( +def generate_explainer_pairs( config: dict[str, Any], seed_data: list[dict[str, Any]], count: int, outfile: Path | None = None, + system_prompt: str | None = None, ) -> list[dict]: - """Generate summarizer training pairs via teacher model distillation.""" - distill_config = config.get("distillation", {}) - backend = distill_config.get("teacher_backend", "anthropic") - model = distill_config.get("teacher_model", "claude-sonnet-4-20250514") - system_prompt = build_system_prompt(config) - pairs: list[dict] = [] + """Generate explainer training pairs via teacher model distillation.""" + return generate_pairs(config, seed_data, count, "explainer", outfile, system_prompt) - fh = None - if outfile is not None: - outfile.parent.mkdir(parents=True, exist_ok=True) - fh = outfile.open("w", encoding="utf-8") - try: - for idx in range(count): - if idx > 0 and idx % 25 == 0: - time.sleep(1) - query_data = seed_data[idx % len(seed_data)] - teacher_prompt = build_summarizer_prompt(config, query_data) - try: - response_text = call_teacher(system_prompt, teacher_prompt, backend, model) - except Exception as exc: - print(f"[warn] Teacher call failed for summarizer pair {idx}: {exc}", flush=True) - continue - validated = validate_json(response_text) - if validated is None: - print(f"[warn] Invalid JSON for summarizer pair {idx}, skipping.", flush=True) - continue - student_user = json.dumps( - {"prompt": query_data["prompt"], "data": query_data["data"][:50]}, - ensure_ascii=False, default=str, - ) - pair = format_as_chatml( - system=SUMMARIZER_STUDENT_SYSTEM, user=student_user, - assistant=json.dumps(validated, ensure_ascii=False), - ) - pairs.append(pair) - if fh is not None: - fh.write(json.dumps(pair, ensure_ascii=False) + "\n") - fh.flush() - print(f"[summarizer] {len(pairs)}/{count} pairs generated", flush=True) - finally: - if fh is not None: - fh.close() - print(f"[summarizer] Saved {len(pairs)} 
pairs to {outfile}", flush=True) - return pairs +def generate_summarizer_pairs( + config: dict[str, Any], seed_data: list[dict[str, Any]], + count: int, outfile: Path | None = None, + system_prompt: str | None = None, +) -> list[dict]: + """Generate summarizer training pairs via teacher model distillation.""" + return generate_pairs(config, seed_data, count, "summarizer", outfile, system_prompt) def main(school: str, local: bool = False) -> None: @@ -251,16 +250,20 @@ def main(school: str, local: bool = False) -> None: synthetic_pairings = generate_synthetic_course_pairings(config, count=pairs_per_task) synthetic_results = generate_synthetic_query_results(config, count=pairs_per_task) + system_prompt = build_system_prompt(config) + print(f"\n{'='*60}\nEXPLAINER — generating {pairs_per_task} pairs\n{'='*60}") explainer_pairs = generate_explainer_pairs( config=config, seed_data=synthetic_pairings, count=pairs_per_task, outfile=pairs_dir / "explainer.jsonl", + system_prompt=system_prompt, ) print(f"\n{'='*60}\nSUMMARIZER — generating {pairs_per_task} pairs\n{'='*60}") summarizer_pairs = generate_summarizer_pairs( config=config, seed_data=synthetic_results, count=pairs_per_task, outfile=pairs_dir / "summarizer.jsonl", + system_prompt=system_prompt, ) print(f"\n{'='*60}\nDISTILLATION COMPLETE\n{'='*60}") diff --git a/training/eval.py b/training/eval.py index 7a8b882..1edab95 100644 --- a/training/eval.py +++ b/training/eval.py @@ -11,11 +11,12 @@ import argparse import json +import re from dataclasses import dataclass, field from pathlib import Path from typing import Any -from training.config import get_training_data_dir, load_school_config +from training.config import get_message_content, get_training_data_dir, load_school_config, read_jsonl # --------------------------------------------------------------------------- # Required keys per task @@ -181,16 +182,11 @@ def check_factual_grounding(outputs: list[str], inputs: list[dict[str, Any]]) -> total = 0 for 
output_text, input_data in pairs: total += 1 - # Collect all numeric string representations from the input input_str = json.dumps(input_data, default=str) - numbers: list[str] = [] - import re numbers = re.findall(r"\b\d+(?:\.\d+)?\b", input_str) if not numbers: - # No numbers in input — cannot verify grounding; give benefit of doubt passing += 1 continue - # Check if any number appears in the output text if any(num in output_text for num in numbers): passing += 1 return passing / total if total else 0.0 @@ -246,37 +242,7 @@ def check_ship_criteria(metrics: dict[str, float], task: str) -> ShipDecision: def load_test_set(path: Path) -> list[dict[str, Any]]: """Load a ChatML JSONL test set from path.""" - records: list[dict[str, Any]] = [] - with path.open("r", encoding="utf-8") as fh: - for line in fh: - line = line.strip() - if line: - records.append(json.loads(line)) - return records - - -def _extract_user_content(record: dict[str, Any]) -> str | None: - """Extract the user message content from a ChatML record.""" - for msg in record.get("messages", []): - if msg.get("role") == "user": - return msg.get("content") - return None - - -def _extract_assistant_content(record: dict[str, Any]) -> str | None: - """Extract the assistant message content from a ChatML record.""" - for msg in record.get("messages", []): - if msg.get("role") == "assistant": - return msg.get("content") - return None - - -def _extract_system_content(record: dict[str, Any]) -> str | None: - """Extract the system message content from a ChatML record.""" - for msg in record.get("messages", []): - if msg.get("role") == "system": - return msg.get("content") - return None + return read_jsonl(path) def _call_ollama(model: str, system: str, user: str) -> str: @@ -326,8 +292,8 @@ def run_eval(school: str, task: str) -> ShipDecision: inputs: list[dict[str, Any]] = [] for idx, record in enumerate(records): - system = _extract_system_content(record) or "" - user = _extract_user_content(record) or "" + system 
= get_message_content(record, "system") or "" + user = get_message_content(record, "user") or "" try: response = _call_ollama(model_name, system, user) except Exception as exc: diff --git a/training/export.py b/training/export.py index 776dff1..277298a 100644 --- a/training/export.py +++ b/training/export.py @@ -19,6 +19,11 @@ from training.config import get_training_data_dir, load_school_config from training.prompts import EXPLAINER_STUDENT_SYSTEM, SUMMARIZER_STUDENT_SYSTEM +_SYSTEM_PROMPTS = { + "explainer": EXPLAINER_STUDENT_SYSTEM, + "summarizer": SUMMARIZER_STUDENT_SYSTEM, +} + # --------------------------------------------------------------------------- # Modelfile template # --------------------------------------------------------------------------- @@ -102,18 +107,11 @@ def export_model(school: str, task: str, model: str = "9b") -> int: "Run `python -m training.finetune` first." ) - # Determine system prompt for this task - if task == "explainer": - system_prompt = EXPLAINER_STUDENT_SYSTEM - elif task == "summarizer": - system_prompt = SUMMARIZER_STUDENT_SYSTEM - else: - raise ValueError(f"Unknown task '{task}'. Must be 'explainer' or 'summarizer'.") + if task not in _SYSTEM_PROMPTS: + raise ValueError(f"Unknown task '{task}'. Must be one of: {list(_SYSTEM_PROMPTS)}") + system_prompt = _SYSTEM_PROMPTS[task] - # Base model tag (Ollama format) base_model = f"qwen3.5:{model}" - - # Ollama model name: "{school}-{task}" e.g. "bishop-state-explainer" ollama_model_name = f"{school}-{task}" # Write Modelfile diff --git a/training/finetune.py b/training/finetune.py index 2c424f2..518b982 100644 --- a/training/finetune.py +++ b/training/finetune.py @@ -126,15 +126,9 @@ def run_finetune(school: str, model: str, task: str) -> int: "Run `python -m training.prepare` first." 
) - # Override the model from config with the CLI-specified model - train_cfg = config.get("training", {}) - train_cfg["default_model"] = f"qwen3.5:{model}" - config["training"] = train_cfg - lora_config = build_lora_config(config, task, data_dir) lora_config["model"] = _MODEL_MAP[model] - # Write config to a temporary JSON file config_dir = data_dir / "configs" config_dir.mkdir(parents=True, exist_ok=True) config_path = config_dir / f"lora_{task}_{model}.json" diff --git a/training/prepare.py b/training/prepare.py index bab5d39..78e1eb9 100644 --- a/training/prepare.py +++ b/training/prepare.py @@ -16,7 +16,9 @@ JACCARD_THRESHOLD, TRAIN_RATIO, VAL_RATIO, + get_message_content, get_training_data_dir, + read_jsonl, write_jsonl, ) @@ -30,13 +32,6 @@ def jaccard_similarity(a: str, b: str) -> float: return len(words_a & words_b) / len(words_a | words_b) -def _get_user_text(pair: dict[str, Any]) -> str: - for msg in pair.get("messages", []): - if msg.get("role") == "user": - return msg.get("content", "") - return "" - - def filter_invalid_json(pairs: list[dict[str, Any]]) -> list[dict[str, Any]]: """Keep only pairs with valid structure and JSON-parseable assistant content.""" valid = [] @@ -74,9 +69,9 @@ def deduplicate_by_jaccard( if not pairs: return pairs kept: list[dict[str, Any]] = [pairs[0]] - kept_word_sets: list[set] = [set(_get_user_text(pairs[0]).lower().split())] + kept_word_sets: list[set] = [set((get_message_content(pairs[0], "user") or "").lower().split())] for pair in pairs[1:]: - candidate_words = set(_get_user_text(pair).lower().split()) + candidate_words = set((get_message_content(pair, "user") or "").lower().split()) is_duplicate = any( _jaccard_sets(candidate_words, kw) >= threshold for kw in kept_word_sets @@ -113,23 +108,13 @@ def split_dataset( } -def _load_pairs(path: Path) -> list[dict[str, Any]]: - pairs = [] - with path.open() as fh: - for line in fh: - line = line.strip() - if line: - pairs.append(json.loads(line)) - return pairs - - def 
process_task(school: str, task: str) -> dict[str, int]: """Load, filter, deduplicate, and split training data for a task.""" data_dir = get_training_data_dir(school) input_path = data_dir / "pairs" / f"{task}.jsonl" if not input_path.exists(): raise FileNotFoundError(f"Pairs file not found: {input_path}") - pairs = _load_pairs(input_path) + pairs = read_jsonl(input_path) print(f"[{task}] Loaded {len(pairs)} pairs from {input_path}") pairs = filter_invalid_json(pairs) print(f"[{task}] After JSON filter: {len(pairs)} pairs") From e8312a36fc8f34d0e84f3bf844a78362e0b2890f Mon Sep 17 00:00:00 2001 From: William Hill Date: Sat, 28 Mar 2026 15:56:15 -0400 Subject: [PATCH 18/18] fix: address CodeRabbit review findings - Pass maxTokens through to Ollama backend (was hardcoded to 1024) - Remove unused variables: seed_queries in distill.py, config in eval.py and export.py - Fix num_layers using lora_rank instead of dedicated config key in finetune.py - Remove duplicate pyyaml entry in requirements.txt - Clean up unused imports --- codebenders-dashboard/lib/model-client.ts | 6 +++--- requirements.txt | 1 - training/distill.py | 2 -- training/eval.py | 3 +-- training/export.py | 3 +-- training/finetune.py | 2 +- 6 files changed, 6 insertions(+), 11 deletions(-) diff --git a/codebenders-dashboard/lib/model-client.ts b/codebenders-dashboard/lib/model-client.ts index 282c7bf..4e794b5 100644 --- a/codebenders-dashboard/lib/model-client.ts +++ b/codebenders-dashboard/lib/model-client.ts @@ -20,7 +20,7 @@ function getOpenAI() { return _openai } -async function callOllama(model: string, prompt: string): Promise { +async function callOllama(model: string, prompt: string, maxTokens: number): Promise { const response = await fetch(`${OLLAMA_BASE_URL}/api/generate`, { method: "POST", headers: { "Content-Type": "application/json" }, @@ -30,7 +30,7 @@ async function callOllama(model: string, prompt: string): Promise { stream: false, options: { temperature: 0.3, - num_predict: 1024, + 
num_predict: maxTokens, }, }), }) @@ -50,7 +50,7 @@ async function generate( ): Promise { if (MODEL_BACKEND === "ollama") { const model = `${SCHOOL_CODE}-${task}:${MODEL_SIZE}` - return callOllama(model, prompt) + return callOllama(model, prompt, maxTokens) } const result = await generateText({ model: getOpenAI()("gpt-4o-mini"), diff --git a/requirements.txt b/requirements.txt index 4af40e5..da41dab 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,7 +27,6 @@ pyyaml>=6.0 colorama>=0.4.6 # Training pipeline -pyyaml>=6.0 anthropic>=0.40.0 ollama>=0.4.0 rouge-score>=0.1.2 diff --git a/training/distill.py b/training/distill.py index e68fd83..bd8f80c 100644 --- a/training/distill.py +++ b/training/distill.py @@ -30,7 +30,6 @@ format_as_chatml, generate_synthetic_course_pairings, generate_synthetic_query_results, - load_seed_queries, ) # Cost tracking @@ -246,7 +245,6 @@ def main(school: str, local: bool = False) -> None: data_dir = get_training_data_dir(school) pairs_dir = data_dir / "pairs" - seed_queries = load_seed_queries(school) synthetic_pairings = generate_synthetic_course_pairings(config, count=pairs_per_task) synthetic_results = generate_synthetic_query_results(config, count=pairs_per_task) diff --git a/training/eval.py b/training/eval.py index 1edab95..23bffb8 100644 --- a/training/eval.py +++ b/training/eval.py @@ -16,7 +16,7 @@ from pathlib import Path from typing import Any -from training.config import get_message_content, get_training_data_dir, load_school_config, read_jsonl +from training.config import get_message_content, get_training_data_dir, read_jsonl # --------------------------------------------------------------------------- # Required keys per task @@ -271,7 +271,6 @@ def run_eval(school: str, task: str) -> ShipDecision: Inference is performed via Ollama using the fine-tuned model registered as ``{school}-{task}`` (e.g. ``bishop-state-explainer``). 
""" - config = load_school_config(school) data_dir = get_training_data_dir(school) test_path = data_dir / "final" / task / "test.jsonl" diff --git a/training/export.py b/training/export.py index 277298a..77dec29 100644 --- a/training/export.py +++ b/training/export.py @@ -16,7 +16,7 @@ import sys from pathlib import Path -from training.config import get_training_data_dir, load_school_config +from training.config import get_training_data_dir from training.prompts import EXPLAINER_STUDENT_SYSTEM, SUMMARIZER_STUDENT_SYSTEM _SYSTEM_PROMPTS = { @@ -97,7 +97,6 @@ def export_model(school: str, task: str, model: str = "9b") -> int: int The ``ollama create`` subprocess return code (0 = success). """ - config = load_school_config(school) data_dir = get_training_data_dir(school) adapter_path = data_dir / "adapters" / task diff --git a/training/finetune.py b/training/finetune.py index 518b982..d2a5671 100644 --- a/training/finetune.py +++ b/training/finetune.py @@ -63,7 +63,7 @@ def build_lora_config(config: dict[str, Any], task: str, data_dir: Path) -> dict "train": True, "data": str(final_dir), "fine_tune_type": train_cfg.get("method", "qlora"), - "num_layers": train_cfg.get("lora_rank", 16), + "num_layers": train_cfg.get("lora_num_layers", 16), "lora_parameters": { "rank": train_cfg.get("lora_rank", 16), "alpha": train_cfg.get("lora_alpha", 32),