navi-1/debug/eval/judge.py at e4771277847c3da2f64395883c6f65adfdf137fb

Fork: 0
root / navi-1
Find file
Newer
Older
navi-1 / debug / eval / judge.py
Eugene Sukhodolskiy on 26 Apr 6 KB Add eval system Phase 2 — rubric, expert prompts, judge skeleton
Raw Blame History
"""Judge orchestration: render a session, fan out across 3 experts, average.

Phase 2 ships the skeleton. Real LLM calls and transcript rendering land in
Phase 3. The shape here is intentionally final so cli.py and api.py can wire
against it without churn.
"""

from __future__ import annotations

import json
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterable
from uuid import UUID, uuid4

import yaml

from navi.core.session import Session

from .schema import AXIS_NAMES, EvalRunMetadata, EvalScores, ExpertResult


# Pinned versions for this rubric / judge generation. Bumping either forces
# re-evaluation of the whole archive (or a parallel run, depending on policy).
# RUBRIC_VERSION also selects the rubric file that load_rubric() reads
# (prompts/rubric_<RUBRIC_VERSION>.yaml); both values are stamped into every
# EvalRunMetadata produced by new_run_metadata().
JUDGE_VERSION: str = "v1"
RUBRIC_VERSION: str = "v1"

# Closed set of expert ids. load_expert_prompt() rejects any id not listed
# here and reads prompts/expert_<id>.txt for the ones that are.
EXPERT_IDS: tuple[str, ...] = ("strict_critic", "pragmatist", "tech_lead")

# "prompts" directory sitting next to this module; the rubric YAML and the
# expert prompt files are resolved relative to it.
_PROMPTS_DIR = Path(__file__).parent / "prompts"


@dataclass(frozen=True)
class RenderedSession:
    """Immutable pair of texts handed to the judge for one session.

    Instances are frozen: fields cannot be reassigned after construction.
    """

    # Metadata block placed ahead of the transcript.
    header: str
    # Body of the session as text.
    transcript: str

    def as_user_message(self) -> str:
        """Return header and transcript joined into a single message.

        The header comes first, followed by a blank line, the
        ``=== Session transcript ===`` marker line, and then the transcript.
        """
        layout = "{}\n\n=== Session transcript ===\n{}"
        return layout.format(self.header, self.transcript)


# ── Prompt + rubric loading ──────────────────────────────────────────────


def load_rubric() -> dict:
    """Load the rubric for the pinned ``RUBRIC_VERSION`` as a dict.

    The file read is ``prompts/rubric_<RUBRIC_VERSION>.yaml`` next to this
    module, decoded as UTF-8 and parsed with ``yaml.safe_load``.

    Returns:
        The parsed rubric mapping.

    Raises:
        OSError: the rubric file is missing or unreadable.
        yaml.YAMLError: the file is not valid YAML.
        ValueError: the file parses, but its top level is not a mapping
            (for example an empty file, which ``safe_load`` turns into
            ``None``, or a bare list/scalar).
    """
    path = _PROMPTS_DIR / f"rubric_{RUBRIC_VERSION}.yaml"
    rubric = yaml.safe_load(path.read_text(encoding="utf-8"))
    # safe_load happily returns None / list / scalar; fail here with a clear
    # message instead of letting callers trip over rubric["axes"] later.
    if not isinstance(rubric, dict):
        raise ValueError(
            f"rubric file {path} must contain a YAML mapping, "
            f"got {type(rubric).__name__}"
        )
    return rubric


def load_expert_prompt(expert_id: str) -> str:
    """Return the system prompt text stored for ``expert_id``.

    Only ids listed in ``EXPERT_IDS`` are accepted; the prompt is read as
    UTF-8 from ``prompts/expert_<expert_id>.txt``.

    Raises:
        ValueError: ``expert_id`` is not one of ``EXPERT_IDS``.
    """
    if expert_id in EXPERT_IDS:
        prompt_file = _PROMPTS_DIR / f"expert_{expert_id}.txt"
        return prompt_file.read_text(encoding="utf-8")
    raise ValueError(f"unknown expert_id: {expert_id}")


def render_rubric_for_prompt(rubric: dict) -> str:
    """Flatten the rubric dict into plain text for the user message.

    Output layout: a ``=== Rubric <version> ===`` banner, then for every axis
    a ``## <name>`` heading (preceded by a blank line), the stripped
    description, an optional note when the axis is flagged ``nullable``, and
    one indented line per anchor with the score right-aligned to width 3.

    Reads ``rubric["version"]`` and ``rubric["axes"]``; each axis must carry
    ``description`` and ``anchors`` (entries with ``score``, ``label``,
    ``what``). Missing keys raise ``KeyError``.
    """
    out = ["=== Rubric {} ===".format(rubric["version"])]
    for name, spec in rubric["axes"].items():
        out.append(f"\n## {name}")
        out.append(spec["description"].strip())
        if spec.get("nullable"):
            out.append("(nullable: score this null when the mechanic was not used)")
        out.extend(
            f"  {anchor['score']:>3} — {anchor['label']}: {anchor['what']}"
            for anchor in spec["anchors"]
        )
    return "\n".join(out)


# ── Session rendering (Phase 3) ──────────────────────────────────────────


def render_session(
    session: Session,
    feedback_by_index: dict[int, int] | None = None,
) -> RenderedSession:
    """Turn a session into the header + transcript text shown to the judge.

    Not implemented yet (Phase 2 placeholder); every call raises
    ``NotImplementedError``. The Phase 3 plan is:

      - Header: profile, model list, planning flags, timing, like/dislike
        counts.
      - Transcript: all messages in order, user reactions inlined beside
        each assistant block, sub-agent traces indented, planning phases
        kept as-is, and no compression-summary substitution.

    ``feedback_by_index`` is presumably the per-message reaction map used
    for the inlined reactions — confirm once the implementation lands.
    """
    reason = "render_session lands in Phase 3"
    raise NotImplementedError(reason)


# ── Expert call (Phase 3) ────────────────────────────────────────────────


async def run_expert(
    *,
    expert_id: str,
    rendered: RenderedSession,
    rubric_text: str,
    llm,  # navi.llm.base.LLMBackend — kept untyped to avoid circular import
    model: str | list[str],
) -> ExpertResult:
    """Score one rendered session with a single expert.

    Not implemented yet (Phase 2 placeholder); awaiting the coroutine raises
    ``NotImplementedError``. All arguments are keyword-only. Planned Phase 3
    flow:

      1. Assemble messages: system = expert prompt, user = ``rubric_text``
         followed by ``rendered.as_user_message()``.
      2. Call ``llm.complete(..., temperature=0.2, think=False)`` with no
         tools.
      3. Remove any leaked code fences, ``json.loads`` the reply, and run
         ``ExpertResult.model_validate`` on it.
      4. If parsing fails, retry once with a "your previous output was
         invalid JSON" nudge appended, then raise.
    """
    reason = "run_expert lands in Phase 3"
    raise NotImplementedError(reason)


# ── Run orchestration (Phase 3) ──────────────────────────────────────────


async def evaluate_session(
    *,
    session: Session,
    feedback_by_index: dict[int, int] | None,
    llm,
    model: str | list[str],
) -> tuple[EvalRunMetadata, list[ExpertResult]]:
    """Evaluate one session with all three experts.

    Intended to return ``(metadata, results)``. Not implemented yet (Phase 2
    placeholder); awaiting the coroutine raises ``NotImplementedError``.
    All arguments are keyword-only.
    """
    reason = "evaluate_session lands in Phase 3"
    raise NotImplementedError(reason)


def average_scores(results: Iterable[ExpertResult]) -> EvalScores:
    """Collapse per-expert scores into one ``EvalScores`` of per-axis means.

    For every name in ``AXIS_NAMES`` the non-null values found on
    ``result.scores`` are averaged and passed through ``round()``. An axis
    for which no expert supplied a value (including when ``results`` is
    empty) comes out as ``None``.

    Note: built-in ``round()`` rounds exact halves to the nearest even
    integer, so a mean of 2.5 becomes 2 while 3.5 becomes 4.
    """
    collected: dict[str, list[int]] = {axis: [] for axis in AXIS_NAMES}
    for result in results:
        for axis in AXIS_NAMES:
            value = getattr(result.scores, axis)
            if value is not None:
                collected[axis].append(value)

    means: dict[str, int | None] = {}
    for axis in AXIS_NAMES:
        values = collected[axis]
        if values:
            means[axis] = round(sum(values) / len(values))
        else:
            means[axis] = None
    return EvalScores(**means)


def new_run_metadata(judge_model: str) -> EvalRunMetadata:
    """Build the metadata record for a brand-new evaluation run.

    Generates a random ``eval_run_id`` (``uuid4``), timestamps the run with
    the current UTC time, records ``judge_model`` as given, and pins the
    module-level ``JUDGE_VERSION`` / ``RUBRIC_VERSION``.
    """
    run_id = uuid4()
    stamped_at = datetime.now(timezone.utc)
    fields = {
        "eval_run_id": run_id,
        "eval_date": stamped_at,
        "judge_model": judge_model,
        "judge_version": JUDGE_VERSION,
        "rubric_version": RUBRIC_VERSION,
    }
    return EvalRunMetadata(**fields)


# ── Convenience: parse JSON output that may have stray fences ────────────


def parse_expert_json(raw: str, expected_expert_id: str) -> ExpertResult:
    """Strip code fences if present, parse, validate against ExpertResult.

    Tolerant to ```json fences and surrounding whitespace; strict otherwise.

    Args:
        raw: The model's raw reply text.
        expected_expert_id: The expert the reply must claim to come from.

    Returns:
        The validated ``ExpertResult``.

    Raises:
        json.JSONDecodeError: the (de-fenced) text is not valid JSON. This
            is a ``ValueError`` subclass.
        ValueError: the JSON is valid but is not an object, or its
            ``expert_id`` differs from ``expected_expert_id``.
        Exception: whatever ``ExpertResult.model_validate`` raises on a
            schema violation (presumably a pydantic validation error —
            confirm against ``.schema``).
    """
    text = raw.strip()
    if text.startswith("```"):
        # Drop the opening fence (with optional language tag) and the closing fence.
        first_nl = text.find("\n")
        text = text[first_nl + 1:] if first_nl != -1 else text[3:]
        text = text.rstrip()
        if text.endswith("```"):
            text = text[:-3]
    data = json.loads(text)
    # A bare list/string/number is valid JSON but has no .get(); report it as
    # a parse-level ValueError rather than leaking an AttributeError.
    if not isinstance(data, dict):
        raise ValueError(
            f"expected a JSON object from expert {expected_expert_id!r}, "
            f"got {type(data).__name__}"
        )
    actual_expert_id = data.get("expert_id")
    if actual_expert_id != expected_expert_id:
        raise ValueError(
            f"expert_id mismatch: expected {expected_expert_id!r}, got {actual_expert_id!r}"
        )
    return ExpertResult.model_validate(data)