"""Pydantic models for the eval system.
Used both as the contract for what each expert LLM must return (JSON validated
against ExpertResult) and as the storage shape for evaluation rows.
"""
from __future__ import annotations
from datetime import datetime
from typing import Literal
from uuid import UUID
from pydantic import BaseModel, Field, field_validator
# Every axis the judge has to score. Storage is indifferent to this ordering;
# it simply follows rubric_v1.yaml so that output is easier for humans to read.
AXIS_NAMES: tuple[str, ...] = (
    "task_complexity",
    "goal_completion",
    "tool_usage_quality",
    "efficiency",
    "communication",
    # The two axes below are the ones listed in NULLABLE_AXES.
    "subagent_orchestration",
    "self_extension",
)
# Axes whose score may be null because the session never used the relevant
# mechanic.
NULLABLE_AXES: frozenset[str] = frozenset(
    ("subagent_orchestration", "self_extension")
)
class EvalScores(BaseModel):
    """Numeric score for each rubric axis.

    The scale is open-ended: only a lower bound of 0 is enforced, so values
    above 100 validate successfully.
    """

    # Axes that must always be present in the payload.
    task_complexity: int = Field(..., ge=0)
    goal_completion: int = Field(..., ge=0)
    tool_usage_quality: int = Field(..., ge=0)
    efficiency: int = Field(..., ge=0)
    communication: int = Field(..., ge=0)

    # Axes that may be omitted or null; they default to None.
    subagent_orchestration: int | None = Field(None, ge=0)
    self_extension: int | None = Field(None, ge=0)
class ExpertResult(BaseModel):
    """Verdict of a single expert for a single session.

    The shape matches the JSON object the LLM emits.
    """

    # Exactly one of the three accepted expert identifiers.
    expert_id: Literal["strict_critic", "pragmatist", "tech_lead"]
    # Per-axis scores, validated by EvalScores.
    scores: EvalScores
    # Free-text comment; kept with surrounding whitespace removed and
    # rejected when nothing remains.
    comment: str

    @field_validator("comment")
    @classmethod
    def _strip_comment(cls, v: str) -> str:
        """Return the comment with leading/trailing whitespace removed.

        Raises:
            ValueError: if the comment is empty after stripping.
        """
        stripped = (v or "").strip()
        if stripped:
            return stripped
        raise ValueError("comment must not be empty")
class EvalRunMetadata(BaseModel):
    """Metadata that pins the rubric / judge version for a single run.

    All five fields are required; none declares a default.
    """

    # Identifier of the evaluation run.
    eval_run_id: UUID
    # NOTE(review): a plain ``datetime`` annotation accepts both naive and
    # timezone-aware values -- confirm which one callers are expected to pass.
    eval_date: datetime
    # Judge model and judge version are free-form strings; no format is
    # enforced by this model.
    judge_model: str
    judge_version: str
    # Free-form rubric version string; presumably corresponds to a rubric file
    # such as rubric_v1.yaml -- TODO confirm against the caller.
    rubric_version: str
class StoredEvaluation(BaseModel):
    """Row shape returned by db.list_evaluations().

    All fields are required. The run-level fields (eval_run_id through
    rubric_version) repeat the names and types declared on EvalRunMetadata;
    expert_id, scores and comment repeat the field names of ExpertResult.
    """

    # Identifier of this row.
    id: UUID
    # Session the evaluation refers to; stored as an opaque string.
    session_id: str
    # Same names and types as the fields of EvalRunMetadata.
    eval_run_id: UUID
    eval_date: datetime
    judge_model: str
    judge_version: str
    rubric_version: str
    # Plain ``str`` here, whereas ExpertResult.expert_id is a Literal of three
    # values, so any string validates on this model.
    # NOTE(review): presumably intentional so that rows written by other
    # expert ids still load -- confirm.
    expert_id: str
    # Per-axis scores, validated by EvalScores.
    scores: EvalScores
    # Unlike ExpertResult, no validator strips or rejects this comment here.
    comment: str