navi-1/debug/eval/schema.py at 864261a6cf895e6bbf76fe7c886036a55ae05282

Fork: 0

root / navi-1

Find file

Newer

Older

navi-1 / debug / eval / schema.py

Eugene Sukhodolskiy on 26 Apr 2 KB Add eval system Phase 2 — rubric, expert prompts, judge skeleton

Raw Blame History

"""Pydantic models for the eval system.

Used both as the contract for what each expert LLM must return (JSON validated
against ExpertResult) and as the storage shape for evaluation rows.
"""

from __future__ import annotations

from datetime import datetime
from typing import Literal
from uuid import UUID

from pydantic import BaseModel, Field, field_validator


# Axes the judge must score. Storage does not depend on this order; it simply
# mirrors rubric_v1.yaml so the output is easier for humans to read.
AXIS_NAMES: tuple[str, ...] = (
    # Axes declared as required scores in EvalScores.
    "task_complexity",
    "goal_completion",
    "tool_usage_quality",
    "efficiency",
    "communication",
    # Axes declared as optional scores (also listed in NULLABLE_AXES).
    "subagent_orchestration",
    "self_extension",
)

# Axes whose score may be null because the session never used the relevant
# mechanic.
NULLABLE_AXES: frozenset[str] = frozenset(
    ("subagent_orchestration", "self_extension")
)


class EvalScores(BaseModel):
    """Per-axis numeric scores. Open scale: values >100 are accepted."""

    # Mandatory axes: each must be supplied as a non-negative integer
    # (`ge=0`). No upper bound is declared, which keeps the scale open.
    task_complexity: int = Field(..., ge=0)
    goal_completion: int = Field(..., ge=0)
    tool_usage_quality: int = Field(..., ge=0)
    efficiency: int = Field(..., ge=0)
    communication: int = Field(..., ge=0)

    # Optional axes: default to None when omitted; a supplied value is held
    # to the same non-negative constraint.
    subagent_orchestration: int | None = Field(None, ge=0)
    self_extension: int | None = Field(None, ge=0)


class ExpertResult(BaseModel):
    """One expert's verdict for one session. Matches the JSON the LLM emits."""

    # Only these three expert identifiers pass validation.
    expert_id: Literal["strict_critic", "pragmatist", "tech_lead"]
    scores: EvalScores
    # Normalised by _strip_comment: kept trimmed and never blank.
    comment: str

    @field_validator("comment")
    @classmethod
    def _strip_comment(cls, v: str) -> str:
        """Trim surrounding whitespace and reject a comment left empty.

        Raises:
            ValueError: if nothing remains once whitespace is removed.
        """
        # A falsy input is treated the same as an empty string.
        trimmed = v.strip() if v else ""
        if trimmed:
            return trimmed
        raise ValueError("comment must not be empty")


class EvalRunMetadata(BaseModel):
    """Metadata that pins the rubric / judge version for a single run."""

    # All five fields are required: none of them declares a default.
    # StoredEvaluation declares fields with these same names and types.
    eval_run_id: UUID
    eval_date: datetime
    judge_model: str
    judge_version: str
    rubric_version: str


class StoredEvaluation(BaseModel):
    """Row shape returned by db.list_evaluations()."""

    # Every field on this model is required: none declares a default.
    id: UUID
    session_id: str
    # The next five fields repeat the names and types of EvalRunMetadata.
    eval_run_id: UUID
    eval_date: datetime
    judge_model: str
    judge_version: str
    rubric_version: str
    # The last three fields share their names with ExpertResult, with two
    # differences: expert_id is a plain str here (ExpertResult restricts it
    # to three literal values), and this class declares no validator on
    # comment.
    expert_id: str
    scores: EvalScores
    comment: str