diff --git a/debug/eval/__main__.py b/debug/eval/__main__.py new file mode 100644 index 0000000..cd99c31 --- /dev/null +++ b/debug/eval/__main__.py @@ -0,0 +1,5 @@ +"""Module entry point so `python -m debug.eval` works from the project root.""" + +from .cli import main + +raise SystemExit(main()) diff --git a/debug/eval/cli.py b/debug/eval/cli.py new file mode 100644 index 0000000..e28329f --- /dev/null +++ b/debug/eval/cli.py @@ -0,0 +1,97 @@ +"""Standalone CLI for the eval system. + +Invoke from the project root: + + python -m debug.eval run [--since DATE] [--limit N] [--re-evaluate-all] + python -m debug.eval show + python -m debug.eval stats [--days 30] [--csv] + +Phase 2 lands the argparse skeleton with command stubs. The real work +(transcript rendering, judge calls, score persistence, stats aggregation) +lands in Phase 3 and Phase 4 — see docs/eval_system.md. +""" + +from __future__ import annotations + +import argparse +import asyncio +import sys +from typing import Sequence + + +def _build_parser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser( + prog="debug.eval", + description="LLM-as-judge evaluation runner for Navi sessions.", + ) + sub = p.add_subparsers(dest="cmd", required=True) + + p_run = sub.add_parser("run", help="Evaluate sessions against the pinned rubric.") + p_run.add_argument("--session", help="Single session id to evaluate.") + p_run.add_argument("--since", help="ISO date — only sessions started after this.") + p_run.add_argument("--limit", type=int, default=None, help="Max sessions to process.") + p_run.add_argument( + "--re-evaluate-all", + action="store_true", + help="Re-evaluate every session, even if a current-version row already exists.", + ) + p_run.add_argument( + "--dry-run", + action="store_true", + help="List what would be evaluated, do not call the judge.", + ) + + p_show = sub.add_parser("show", help="Print stored evaluations for a session.") + p_show.add_argument("session_id") + + p_stats = sub.add_parser("stats", 
help="Aggregate scores across the archive.") + p_stats.add_argument("--days", type=int, default=30, help="Window in days.") + p_stats.add_argument("--csv", action="store_true", help="Emit CSV to stdout.") + p_stats.add_argument( + "--by-complexity-bucket", + action="store_true", + help="Split by 0-25 / 26-50 / 51-75 / 76+ buckets.", + ) + + return p + + +# ── Command stubs ──────────────────────────────────────────────────────── + + +async def _cmd_run(args: argparse.Namespace) -> int: + print("[eval] run command — not implemented yet (lands in Phase 3).", file=sys.stderr) + print(f"[eval] would process: session={args.session} since={args.since} " + f"limit={args.limit} re_eval_all={args.re_evaluate_all} dry={args.dry_run}", + file=sys.stderr) + return 2 + + +async def _cmd_show(args: argparse.Namespace) -> int: + print(f"[eval] show command — not implemented yet (lands in Phase 3). session={args.session_id}", + file=sys.stderr) + return 2 + + +async def _cmd_stats(args: argparse.Namespace) -> int: + print(f"[eval] stats command — not implemented yet (lands in Phase 4). " + f"days={args.days} csv={args.csv} by_bucket={args.by_complexity_bucket}", + file=sys.stderr) + return 2 + + +# ── Entry point ────────────────────────────────────────────────────────── + + +def main(argv: Sequence[str] | None = None) -> int: + args = _build_parser().parse_args(argv) + coro = { + "run": _cmd_run, + "show": _cmd_show, + "stats": _cmd_stats, + }[args.cmd](args) + return asyncio.run(coro) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/debug/eval/judge.py b/debug/eval/judge.py new file mode 100644 index 0000000..779a694 --- /dev/null +++ b/debug/eval/judge.py @@ -0,0 +1,185 @@ +"""Judge orchestration: render a session, fan out across 3 experts, average. + +Phase 2 ships the skeleton. Real LLM calls and transcript rendering land in +Phase 3. The shape here is intentionally final so cli.py and api.py can wire +against it without churn. 
def render_rubric_for_prompt(rubric: dict) -> str:
    """Flatten the rubric dict into plain text for inlining in the user message.

    Output layout: a ``=== Rubric <version> ===`` title, then for every axis
    (in the mapping's own order) a ``## <axis>`` heading preceded by a blank
    line, the stripped description, an optional hint when the axis is flagged
    ``nullable``, and one line per anchor with the score right-aligned to
    three characters.

    Args:
        rubric: Mapping with a ``version`` key and an ``axes`` mapping; each
            axis provides ``description`` and ``anchors`` (items with
            ``score``, ``label``, ``what``) and may provide ``nullable``.

    Returns:
        The rendered rubric as a single newline-joined string.

    Raises:
        KeyError: if any of the required keys above is missing.
    """
    rendered: list[str] = [f"=== Rubric {rubric['version']} ==="]
    for name, spec in rubric["axes"].items():
        section = [f"\n## {name}", spec["description"].strip()]
        if spec.get("nullable"):
            section.append("(nullable: score this null when the mechanic was not used)")
        section.extend(
            f" {anchor['score']:>3} — {anchor['label']}: {anchor['what']}"
            for anchor in spec["anchors"]
        )
        rendered.extend(section)
    return "\n".join(rendered)
def parse_expert_json(raw: str, expected_expert_id: str) -> ExpertResult:
    """Parse one expert's raw LLM output into a validated ExpertResult.

    Tolerates surrounding whitespace and a Markdown code fence (with an
    optional language tag such as ```json); everything else is strict.

    Args:
        raw: Text returned by the model.
        expected_expert_id: The expert that was asked; the payload's
            ``expert_id`` must match it exactly.

    Returns:
        The result of ``ExpertResult.model_validate`` on the decoded object.

    Raises:
        ValueError: if the text is not valid JSON (``json.JSONDecodeError`` is
            a ``ValueError`` subclass), if the decoded value is not a JSON
            object, or if ``expert_id`` does not match. Schema violations
            surface as whatever ``ExpertResult.model_validate`` raises.
    """
    text = raw.strip()
    if text.startswith("```"):
        # Drop the opening fence line (fence plus optional language tag)...
        first_nl = text.find("\n")
        text = text[first_nl + 1:] if first_nl != -1 else text[3:]
        # ...and the closing fence, if the model emitted one.
        text = text.rstrip()
        if text.endswith("```"):
            text = text[:-3]
    data = json.loads(text)
    # A list / string / number / null is valid JSON but has no .get(); reject
    # it with ValueError so it fails the same way as the other parse errors
    # instead of leaking an AttributeError.
    if not isinstance(data, dict):
        raise ValueError(
            f"expert output must be a JSON object, got {type(data).__name__}"
        )
    actual_expert_id = data.get("expert_id")
    if actual_expert_id != expected_expert_id:
        raise ValueError(
            f"expert_id mismatch: expected {expected_expert_id!r}, got {actual_expert_id!r}"
        )
    return ExpertResult.model_validate(data)
Don't reward elegance, reward outcomes. +- `subagent_orchestration` is null if no sub-agents were spawned. + `self_extension` is null if no tool was written or modified. +- Heavy weight on user reaction signals: explicit 👎 or follow-up complaints + in the transcript should pull `goal_completion` and `communication` down. + +Do not output anything outside the JSON object. diff --git a/debug/eval/prompts/expert_strict_critic.txt b/debug/eval/prompts/expert_strict_critic.txt new file mode 100644 index 0000000..31f0d65 --- /dev/null +++ b/debug/eval/prompts/expert_strict_critic.txt @@ -0,0 +1,44 @@ +You are the **Strict Critic**, one of three independent expert evaluators +reviewing a session of an autonomous AI agent named Navi. Your job is to score +the agent's performance on this single session along seven axes, conservatively +and without sycophancy. Where two interpretations are possible, take the less +generous one. Penalise weakly any slip — bad tool choice, contradiction, +hallucination, unverified claim, missed validation step. + +You will receive: +1. A rubric with anchors at scores 10 / 30 / 50 / 75 / 100 for each axis. The + scale is open: if you see something clearly harder or better than the 100 + anchor, score above 100. Independence: each axis is scored on its own; do + not let one axis pull another up or down. +2. A "Session block" containing the full transcript (user, assistant text, + thinking, tool calls and tool results, sub-agent traces, planning phases), + per-message user reactions if any (👍 / 👎), aggregated like / dislike + counts, profile metadata, timing. 
+ +Your output MUST be a single JSON object with this exact shape — no markdown, +no prose outside JSON, no code fences: + +{ + "expert_id": "strict_critic", + "scores": { + "task_complexity": <int>, + "goal_completion": <int>, + "tool_usage_quality": <int>, + "efficiency": <int>, + "communication": <int>, + "subagent_orchestration": <int or null>, + "self_extension": <int or null> + }, + "comment": "<2–5 sentences naming the most concrete flaws you found>" +} + +Rules of scoring: +- `task_complexity` is judged from the user's request alone, *before* you + consider how Navi handled it. Do not adjust complexity based on outcome. +- `subagent_orchestration` is null if the session never spawned a sub-agent. + `self_extension` is null if Navi did not write or modify her own tools. + Do NOT invent zeros for absent mechanics. +- `comment` must point to specific moments — quote a tool name, a turn, a + hallucinated claim. Generic praise or generic complaint is not useful. + +Do not output anything outside the JSON object. diff --git a/debug/eval/prompts/expert_tech_lead.txt b/debug/eval/prompts/expert_tech_lead.txt new file mode 100644 index 0000000..d6692c5 --- /dev/null +++ b/debug/eval/prompts/expert_tech_lead.txt @@ -0,0 +1,40 @@ +You are the **Tech Lead**, one of three independent expert evaluators +reviewing a session of an autonomous AI agent named Navi. Your job is to score +this session through an engineering lens: tool selection, architecture of the +plan, how sub-tasks were decomposed, how errors were handled, whether the +agent worked from first principles or copy-pasted assumptions. + +You will receive: +1. A rubric with anchors at scores 10 / 30 / 50 / 75 / 100 for each axis. The + scale is open: score above 100 if warranted. Each axis is independent. +2. A "Session block": full transcript including planning phases, all tool + calls, sub-agent traces, profile metadata, timing.
+ +Your output MUST be a single JSON object with this exact shape — no markdown, +no prose outside JSON, no code fences: + +{ + "expert_id": "tech_lead", + "scores": { + "task_complexity": <int>, + "goal_completion": <int>, + "tool_usage_quality": <int>, + "efficiency": <int>, + "communication": <int>, + "subagent_orchestration": <int or null>, + "self_extension": <int or null> + }, + "comment": "<2–5 sentences naming the strongest and weakest engineering decisions in this session>" +} + +Rules of scoring: +- `task_complexity` from the user's request alone, before considering the + response. +- Heavy weight on `tool_usage_quality`, `efficiency`, `subagent_orchestration`, + and `self_extension` — these are your domain. +- `subagent_orchestration` is null if no sub-agents were spawned. + `self_extension` is null if no tool was written or modified. +- A messy result that revealed a real design issue rates higher than a clean + result that hid one. Reward work that surfaces problems honestly. + +Do not output anything outside the JSON object. diff --git a/debug/eval/prompts/rubric_v1.yaml b/debug/eval/prompts/rubric_v1.yaml new file mode 100644 index 0000000..d3138c3 --- /dev/null +++ b/debug/eval/prompts/rubric_v1.yaml @@ -0,0 +1,176 @@ +# Rubric v1 — frozen reference for the LLM-as-judge eval system. +# +# Axes are scored on an open 0..100+ scale. The 100 anchor describes "at the +# limit of what Navi can do today"; if the judge sees something clearly harder +# or better, it scores above 100 and that becomes a future anchor. +# +# Anchor `examples` are placeholders. After accumulating real sessions, fill +# them with actual session_ids and short notes — do not change the scale +# language without bumping the rubric_version. +# +# Conventions: +# - Each axis is independent. Don't let "complexity" pull "communication" +# down or up. +# - `task_complexity` is judged from the user's request alone, before the +# transcript of the response is considered. This ordering is a prompt-level +# convention only; schema.py does not (and cannot) enforce it.
+# - `subagent_orchestration` and `self_extension` may be null when the +# session never used those mechanics. Do not invent zeros. + +version: "v1" + +axes: + task_complexity: + description: > + Difficulty of what the user asked, judged from the user's request alone + (not from how Navi handled it). Independent of outcome. + anchors: + - score: 10 + label: trivial + what: "Single fact, single tool, no planning, instant answer." + examples: [] + - score: 30 + label: straightforward + what: "One tool, one or two steps, no real branching, expected answer is obvious." + examples: [] + - score: 50 + label: moderate + what: "2–4 steps, planning helps, mild ambiguity in the request." + examples: [] + - score: 75 + label: complex + what: "Multi-tool with planning, real ambiguity to resolve, several places where it could fail." + examples: [] + - score: 100 + label: at-the-limit + what: "Full project-shaped task — multiple sub-agents, self-extension via write_tool, long-horizon execution." + examples: [] + + goal_completion: + description: > + Did the user end up with what they wanted, regardless of the path taken. + Read the user's reactions and the final assistant response to decide. + anchors: + - score: 10 + label: missed + what: "User did not get what they asked for; gave up or had to redirect entirely." + - score: 30 + label: partial-but-wrong-direction + what: "Some output was produced but it doesn't really answer the request." + - score: 50 + label: partial + what: "Half the request was met; user had to fill in or correct the rest." + - score: 75 + label: solid + what: "Goal met with minor gaps a user can live with." + - score: 100 + label: clean-completion + what: "Goal fully met, including caveats the user didn't have to ask about." + + tool_usage_quality: + description: > + Were the right tools chosen? Was there thrashing, redundant calls, + reading the same file twice, calling search when a local source had it? 
+ anchors: + - score: 10 + label: chaotic + what: "Wrong tools picked, repeated identical calls, no recovery from errors." + - score: 30 + label: clumsy + what: "Right idea but with several detours, redundant lookups." + - score: 50 + label: workable + what: "Tools were appropriate; one or two avoidable calls." + - score: 75 + label: deliberate + what: "Tool choices match the task, errors handled cleanly, no obvious waste." + - score: 100 + label: surgical + what: "Minimal sufficient toolset, each call has a clear purpose, results reused well." + + efficiency: + description: > + Iterations vs result. Penalize loops, dead-ends, and re-doing work that + was already done. Reward straight lines that finish in fewer steps. + anchors: + - score: 10 + label: thrashing + what: "Loops, runs out of iteration budget, never converges." + - score: 30 + label: wandering + what: "Reaches the goal but with detours and several aborted attempts." + - score: 50 + label: acceptable + what: "Linear path with minor stalls." + - score: 75 + label: tight + what: "Few wasted moves; planning anticipated most of the work." + - score: 100 + label: ideal + what: "Shortest reasonable path to the result, no slack." + + communication: + description: > + Clarity and honesty of replies. Penalize hallucinations, false + confidence, and excessive verbosity. Reward direct answers, stated + uncertainties, and accurate self-reports. + anchors: + - score: 10 + label: misleading + what: "Hallucinations, claims work was done that wasn't, walls of filler." + - score: 30 + label: noisy + what: "Padded replies, occasional inaccuracies, reader has to dig." + - score: 50 + label: serviceable + what: "Conveys the answer; some unnecessary text, no major errors." + - score: 75 + label: clear + what: "Direct, accurate, appropriately brief; flags genuine uncertainties." + - score: 100 + label: exemplary + what: "Tight, honest, anticipates reader questions, no fluff." 
+ + subagent_orchestration: + description: > + Quality of sub-agent delegation when spawn_agent is used. Score this + null if no sub-agents were spawned — do not punish absence. + nullable: true + anchors: + - score: 10 + label: misuse + what: "Sub-agent given a vague prompt, returns junk, parent ignores or duplicates the work." + - score: 30 + label: rough + what: "Sub-agent did most of the work but parent had to re-do or heavily edit." + - score: 50 + label: workable + what: "Sub-agent helped; the delegation paid off but the prompt wasn't clean." + - score: 75 + label: well-scoped + what: "Clear sub-task, clear hand-off, parent uses the result without rework." + - score: 100 + label: textbook + what: "Sub-agent saved real work; output integrated cleanly; no overlap with the parent." + + self_extension: + description: > + Quality of write_tool / reload_tools / delete_tool usage. Score null if + Navi did not modify her own tooling in this session. + nullable: true + anchors: + - score: 10 + label: broken + what: "Tool written in wrong format, fails to load, no recovery." + - score: 30 + label: shaky + what: "Tool loads but is brittle or solves the wrong problem." + - score: 50 + label: functional + what: "Tool works for the immediate need but is narrow or quirky." + - score: 75 + label: solid + what: "Tool is well-formed, reusable, manual or doc updated." + - score: 100 + label: production-grade + what: "Tool is general, tested, integrates cleanly with other tools and matches project conventions." diff --git a/debug/eval/schema.py b/debug/eval/schema.py new file mode 100644 index 0000000..a7a56be --- /dev/null +++ b/debug/eval/schema.py @@ -0,0 +1,82 @@ +"""Pydantic models for the eval system. + +Used both as the contract for what each expert LLM must return (JSON validated +against ExpertResult) and as the storage shape for evaluation rows. 
class ExpertResult(BaseModel):
    """Verdict returned by a single expert for a single session.

    Field names mirror the JSON object the expert prompts ask the LLM to emit,
    so a decoded payload can be validated directly against this model.
    """

    # Restricted to the three known expert personas.
    expert_id: Literal["strict_critic", "pragmatist", "tech_lead"]
    # Per-axis scores, validated by EvalScores.
    scores: EvalScores
    # Free-text justification; stored stripped and never blank.
    comment: str

    @field_validator("comment")
    @classmethod
    def _strip_comment(cls, v: str) -> str:
        """Return *v* without surrounding whitespace; reject a blank comment."""
        cleaned = v.strip() if v else ""
        if cleaned:
            return cleaned
        raise ValueError("comment must not be empty")