"""Pydantic models for the eval system.
Used both as the contract for what each expert LLM must return (JSON validated
against ExpertResult) and as the storage shape for evaluation rows.
"""
from __future__ import annotations

from datetime import datetime
from typing import Literal
from uuid import UUID

from pydantic import BaseModel, Field, field_validator, model_validator
# Names of the axes the judge scores. Storage does not depend on this order;
# it follows rubric_v1.yaml so that humans inspecting output see the axes in
# the same sequence as the rubric.
AXIS_NAMES: tuple[str, ...] = (
    # Axes that always carry a score.
    "task_complexity",
    "goal_completion",
    "tool_usage_quality",
    "efficiency",
    "communication",
    # Axes that may be null (see NULLABLE_AXES).
    "subagent_orchestration",
    "self_extension",
)
# Subset of the axes whose score is allowed to be null: a session that never
# used the relevant mechanic has nothing to grade on that axis.
NULLABLE_AXES: frozenset[str] = frozenset(
    ("subagent_orchestration", "self_extension")
)
class EvalScores(BaseModel):
    """Numeric score for each axis.

    The scale is open-ended: every score is an integer with a lower bound of 0
    and no upper bound. The last two axes are optional and default to None.
    """

    # Axes that must always be present.
    task_complexity: int = Field(ge=0)
    goal_completion: int = Field(ge=0)
    tool_usage_quality: int = Field(ge=0)
    efficiency: int = Field(ge=0)
    communication: int = Field(ge=0)

    # Optional axes: None unless a score is supplied; a supplied score must
    # still be >= 0.
    subagent_orchestration: int | None = Field(None, ge=0)
    self_extension: int | None = Field(None, ge=0)
class ExpertResult(BaseModel):
    """Verdict of a single expert for a single session.

    Mirrors the JSON object the LLM emits.
    """

    # Only these three expert identifiers are accepted.
    expert_id: Literal["strict_critic", "pragmatist", "tech_lead"]
    scores: EvalScores
    # Stored with surrounding whitespace removed; never empty.
    comment: str

    @field_validator("comment")
    @classmethod
    def _strip_comment(cls, v: str) -> str:
        """Return ``v`` without surrounding whitespace.

        Raises:
            ValueError: if nothing is left after trimming.
        """
        trimmed = v.strip() if v else ""
        if trimmed:
            return trimmed
        raise ValueError("comment must not be empty")
class EvalRunMetadata(BaseModel):
    """Identifies a single run and pins the judge / rubric versions it used."""

    # Which run, and when.
    eval_run_id: UUID
    eval_date: datetime

    # Version pins for the judge and the rubric.
    judge_model: str
    judge_version: str
    rubric_version: str
class StoredEvaluation(BaseModel):
    """Shape of one row returned by db.list_evaluations()."""

    # Row identity and the session it belongs to.
    id: UUID
    session_id: str

    # Run metadata; same field names and types as EvalRunMetadata.
    eval_run_id: UUID
    eval_date: datetime
    judge_model: str
    judge_version: str
    rubric_version: str

    # The expert's verdict. expert_id is a plain str here, unlike the Literal
    # used by ExpertResult, and comment has no validator on this model.
    expert_id: str
    scores: EvalScores
    comment: str
# ── REST response shapes ─────────────────────────────────────────────────
class SessionOverview(BaseModel):
    """One row of the GET /eval/sessions response."""

    # Session identity and activity.
    session_id: str
    profile_id: str
    name: str | None = None
    created_at: datetime
    last_active: datetime
    pinned: bool
    msg_count: int

    # Feedback counters.
    likes: int
    dislikes: int

    # Evaluation state. The latest_* fields are optional and default to None.
    eval_status: Literal["evaluated", "pending", "stale"]
    # NOTE(review): untyped dict; presumably per-axis averages -- confirm the
    # key/value schema against the producer.
    latest_avg: dict | None = None
    latest_eval_date: datetime | None = None
    latest_judge_version: str | None = None
    latest_rubric_version: str | None = None
class SessionDetail(BaseModel):
    """Response body of GET /eval/sessions/{id}."""

    # Session identity and activity.
    session_id: str
    profile_id: str
    name: str | None = None
    created_at: datetime
    last_active: datetime
    msg_count: int

    # Feedback entries, shaped like [{message_index, rating, ...}].
    feedback: list[dict]
    # Evaluations for the session, newest first.
    evaluations: list[StoredEvaluation]
class WeeklyAxisMeans(BaseModel):
    """Per-axis means for one (week, bucket) pair, with the sample count."""

    # NOTE(review): kept as str; the expected date format is not visible here.
    week_start: str
    bucket: str
    sample_count: int
    # NOTE(review): untyped dict; presumably axis name -> mean -- confirm.
    axis_means: dict
class StatsResponse(BaseModel):
    """Stats payload: a list of bucket names plus the weekly per-axis means."""

    buckets: list[str]
    # Each entry carries its own bucket name (WeeklyAxisMeans.bucket).
    weekly: list[WeeklyAxisMeans]
# ── Run trigger / status ─────────────────────────────────────────────────
class RunRequest(BaseModel):
    """POST /eval/run body.

    Validation beyond the field types:

    * ``limit``, when given, must be >= 0.
    * ``scope="session"`` requires a non-empty ``session_id``.

    A violation surfaces as a pydantic validation error when the model is
    constructed.
    """

    # Which sessions the run targets; defaults to the unevaluated ones.
    scope: Literal["unevaluated", "session", "all"] = "unevaluated"
    # Required when scope is "session" (enforced below).
    session_id: str | None = None
    since: datetime | None = None
    # None means no limit was requested; negative values are rejected.
    limit: int | None = Field(default=None, ge=0)
    # Defaults for the judge model and the backend that serves it.
    model: str = "gemma4:31b-cloud"
    backend: str = "ollama"

    @model_validator(mode="after")
    def _require_session_id(self) -> RunRequest:
        """Reject ``scope="session"`` when no session_id was supplied.

        Raises:
            ValueError: if scope is "session" and session_id is None or empty.
        """
        if self.scope == "session" and not self.session_id:
            raise ValueError("session_id is required when scope is 'session'")
        return self
class RunSessionStatus(BaseModel):
    """State of one session within a run."""

    session_id: str
    state: Literal["pending", "running", "ok", "failed"]
    # Both optional, defaulting to None.
    # NOTE(review): presumably avg is filled when state is "ok" and error when
    # state is "failed" -- confirm against the runner.
    avg: dict | None = None
    error: str | None = None
class RunStatus(BaseModel):
    """Response body of GET /eval/run/{run_id}."""

    # NOTE(review): a str here, while EvalRunMetadata.eval_run_id is a UUID --
    # confirm whether both refer to the same identifier.
    run_id: str
    state: Literal["running", "done", "failed"]

    # Timing; finished_at defaults to None.
    started_at: datetime
    finished_at: datetime | None = None

    # Version pins for the judge and the rubric.
    judge_model: str
    judge_version: str
    rubric_version: str

    # One status entry per session.
    sessions: list[RunSessionStatus]