diff --git a/debug/eval/__main__.py b/debug/eval/__main__.py
new file mode 100644
index 0000000..cd99c31
--- /dev/null
+++ b/debug/eval/__main__.py
@@ -0,0 +1,5 @@
+"""Module entry point so `python -m debug.eval` works from the project root."""
+
+from .cli import main
+
+raise SystemExit(main())  # exit the process with main()'s integer return value as its status
diff --git a/debug/eval/cli.py b/debug/eval/cli.py
new file mode 100644
index 0000000..e28329f
--- /dev/null
+++ b/debug/eval/cli.py
@@ -0,0 +1,97 @@
+"""Standalone CLI for the eval system.
+
+Invoke from the project root:
+
+    python -m debug.eval run [--session ID] [--since DATE] [--limit N] [--re-evaluate-all] [--dry-run]
+    python -m debug.eval show <session_id>
+    python -m debug.eval stats [--days 30] [--csv] [--by-complexity-bucket]
+
+Phase 2 lands the argparse skeleton with command stubs. The real work
+(transcript rendering, judge calls, score persistence, stats aggregation)
+lands in Phase 3 and Phase 4 — see docs/eval_system.md.
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import sys
+from typing import Sequence
+
+
+def _build_parser() -> argparse.ArgumentParser:
+    p = argparse.ArgumentParser(
+        prog="debug.eval",
+        description="LLM-as-judge evaluation runner for Navi sessions.",
+    )
+    sub = p.add_subparsers(dest="cmd", required=True)
+
+    p_run = sub.add_parser("run", help="Evaluate sessions against the pinned rubric.")
+    p_run.add_argument("--session", help="Single session id to evaluate.")
+    p_run.add_argument("--since", help="ISO date — only sessions started after this.")
+    p_run.add_argument("--limit", type=int, default=None, help="Max sessions to process.")
+    p_run.add_argument(
+        "--re-evaluate-all",
+        action="store_true",
+        help="Re-evaluate every session, even if a current-version row already exists.",
+    )
+    p_run.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="List what would be evaluated, do not call the judge.",
+    )
+
+    p_show = sub.add_parser("show", help="Print stored evaluations for a session.")
+    p_show.add_argument("session_id")
+
+    p_stats = sub.add_parser("stats", help="Aggregate scores across the archive.")
+    p_stats.add_argument("--days", type=int, default=30, help="Window in days.")
+    p_stats.add_argument("--csv", action="store_true", help="Emit CSV to stdout.")
+    p_stats.add_argument(
+        "--by-complexity-bucket",
+        action="store_true",
+        help="Split by 0-25 / 26-50 / 51-75 / 76+ buckets.",
+    )
+
+    return p
+
+
+# ── Command stubs ────────────────────────────────────────────────────────
+
+
+async def _cmd_run(args: argparse.Namespace) -> int:
+    print("[eval] run command — not implemented yet (lands in Phase 3).", file=sys.stderr)
+    print(f"[eval] would process: session={args.session} since={args.since} "
+          f"limit={args.limit} re_eval_all={args.re_evaluate_all} dry={args.dry_run}",
+          file=sys.stderr)
+    return 2
+
+
+async def _cmd_show(args: argparse.Namespace) -> int:  # Phase 2 stub for the `show` subcommand
+    print(f"[eval] show command — not implemented yet (lands in Phase 3). session={args.session_id}",
+          file=sys.stderr)  # the notice is written to stderr, not stdout
+    return 2  # non-zero result; main() hands it to SystemExit as the exit status
+
+
+async def _cmd_stats(args: argparse.Namespace) -> int:
+    print(f"[eval] stats command — not implemented yet (lands in Phase 4). "
+          f"days={args.days} csv={args.csv} by_bucket={args.by_complexity_bucket}",
+          file=sys.stderr)
+    return 2
+
+
+# ── Entry point ──────────────────────────────────────────────────────────
+
+
+def main(argv: Sequence[str] | None = None) -> int:
+    args = _build_parser().parse_args(argv)
+    coro = {
+        "run": _cmd_run,
+        "show": _cmd_show,
+        "stats": _cmd_stats,
+    }[args.cmd](args)
+    return asyncio.run(coro)
+
+
+if __name__ == "__main__":  # direct execution of this file; `python -m debug.eval` goes through __main__.py
+    raise SystemExit(main())  # main()'s return value becomes the process exit status
diff --git a/debug/eval/judge.py b/debug/eval/judge.py
new file mode 100644
index 0000000..779a694
--- /dev/null
+++ b/debug/eval/judge.py
@@ -0,0 +1,185 @@
+"""Judge orchestration: render a session, fan out across 3 experts, average.
+
+Phase 2 ships the skeleton. Real LLM calls and transcript rendering land in
+Phase 3. The shape here is intentionally final so cli.py and api.py can wire
+against it without churn.
+"""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Iterable
+from uuid import UUID, uuid4
+
+import yaml
+
+from navi.core.session import Session
+
+from .schema import AXIS_NAMES, EvalRunMetadata, EvalScores, ExpertResult
+
+
+# Pinned versions for this rubric / judge generation. Bumping either forces
+# re-evaluation of the whole archive (or a parallel run, depending on policy).
+JUDGE_VERSION: str = "v1"
+RUBRIC_VERSION: str = "v1"  # also selects the rubric file: prompts/rubric_<version>.yaml
+
+EXPERT_IDS: tuple[str, ...] = ("strict_critic", "pragmatist", "tech_lead")  # one prompts/expert_<id>.txt each
+
+_PROMPTS_DIR = Path(__file__).parent / "prompts"  # prompt and rubric files sit next to this module
+
+
+@dataclass(frozen=True)
+class RenderedSession:
+    """The text blob the judge actually sees, plus the metadata header."""
+
+    header: str
+    transcript: str
+
+    def as_user_message(self) -> str:
+        return f"{self.header}\n\n=== Session transcript ===\n{self.transcript}"
+
+
+# ── Prompt + rubric loading ──────────────────────────────────────────────
+
+
+def load_rubric() -> dict:
+    """Load rubric_v1.yaml as a dict. Raises if the file is missing/malformed."""
+    path = _PROMPTS_DIR / f"rubric_{RUBRIC_VERSION}.yaml"
+    return yaml.safe_load(path.read_text(encoding="utf-8"))
+
+
+def load_expert_prompt(expert_id: str) -> str:
+    """Read the system prompt for one expert from prompts/expert_<id>.txt."""
+    if expert_id not in EXPERT_IDS:
+        raise ValueError(f"unknown expert_id: {expert_id}")
+    return (_PROMPTS_DIR / f"expert_{expert_id}.txt").read_text(encoding="utf-8")
+
+
+def render_rubric_for_prompt(rubric: dict) -> str:
+    """Compact text rendering of the rubric to inline into the user message.
+
+    The expert system prompt references a "rubric" — we send it as plain text
+    next to the session block so the model has the anchors in front of it.
+    """
+    lines: list[str] = [f"=== Rubric {rubric['version']} ==="]
+    for axis_name, axis in rubric["axes"].items():
+        lines.append(f"\n## {axis_name}")
+        lines.append(axis["description"].strip())
+        if axis.get("nullable"):
+            lines.append("(nullable: score this null when the mechanic was not used)")
+        for a in axis["anchors"]:
+            lines.append(f"  {a['score']:>3} — {a['label']}: {a['what']}")
+    return "\n".join(lines)
+
+
+# ── Session rendering (Phase 3) ──────────────────────────────────────────
+
+
+def render_session(
+    session: Session,
+    feedback_by_index: dict[int, int] | None = None,
+) -> RenderedSession:
+    """Render a session into the text the judge will see (Phase 2 stub).
+
+    Currently always raises NotImplementedError. Phase 3 will produce:
+      - Header: profile, model list, planning flags, timing, like/dislike counts.
+      - Transcript: every message in order, with user reactions inlined next
+        to each assistant block, sub-agent traces indented, planning phases
+        included as-is. No compression-summary substitution.
+    """
+    raise NotImplementedError("render_session lands in Phase 3")
+
+
+# ── Expert call (Phase 3) ────────────────────────────────────────────────
+
+
+async def run_expert(
+    *,
+    expert_id: str,
+    rendered: RenderedSession,
+    rubric_text: str,
+    llm,  # navi.llm.base.LLMBackend — kept untyped to avoid circular import
+    model: str | list[str],
+) -> ExpertResult:
+    """Run one expert against the rendered session, parse JSON, validate.
+
+    Phase 2 stub: currently always raises NotImplementedError. Phase 3 will:
+      1. Build [system=expert_prompt, user=rubric_text + rendered.as_user_message()]
+      2. Call llm.complete(..., temperature=0.2, think=False) without tools.
+      3. Strip code fences if any leaked, json.loads, ExpertResult.model_validate.
+      4. On parse error: one retry with a "your previous output was invalid JSON"
+         nudge appended; then raise.
+    """
+    raise NotImplementedError("run_expert lands in Phase 3")
+
+
+# ── Run orchestration (Phase 3) ──────────────────────────────────────────
+
+
+async def evaluate_session(
+    *,
+    session: Session,
+    feedback_by_index: dict[int, int] | None,
+    llm,
+    model: str | list[str],
+) -> tuple[EvalRunMetadata, list[ExpertResult]]:
+    """Run all three experts on one session. Returns (metadata, results).
+
+    Phase 2 stub: currently always raises NotImplementedError.
+    """
+    raise NotImplementedError("evaluate_session lands in Phase 3")
+
+
+def average_scores(results: Iterable[ExpertResult]) -> EvalScores:
+    """Mean across experts. Nullable axes average over non-null values only;
+    if every expert returned null for an axis, the mean stays null."""
+    sums: dict[str, int] = {a: 0 for a in AXIS_NAMES}
+    counts: dict[str, int] = {a: 0 for a in AXIS_NAMES}
+    for r in results:
+        for a in AXIS_NAMES:
+            v = getattr(r.scores, a)
+            if v is None:
+                continue
+            sums[a] += v
+            counts[a] += 1
+    averaged: dict[str, int | None] = {}
+    for a in AXIS_NAMES:
+        averaged[a] = round(sums[a] / counts[a]) if counts[a] else None
+    return EvalScores(**averaged)
+
+
+def new_run_metadata(judge_model: str) -> EvalRunMetadata:
+    """Stamp a fresh eval_run_id and pin the rubric/judge versions."""
+    return EvalRunMetadata(
+        eval_run_id=uuid4(),
+        eval_date=datetime.now(timezone.utc),
+        judge_model=judge_model,
+        judge_version=JUDGE_VERSION,
+        rubric_version=RUBRIC_VERSION,
+    )
+
+
+# ── Convenience: parse JSON output that may have stray fences ────────────
+
+
+def parse_expert_json(raw: str, expected_expert_id: str) -> ExpertResult:
+    """Strip code fences if present, parse, validate against ExpertResult.
+
+    Tolerant to ```json fences and surrounding whitespace; strict otherwise.
+    """
+    text = raw.strip()
+    if text.startswith("```"):
+        # Drop the opening fence (with optional language tag) and the closing fence.
+        first_nl = text.find("\n")
+        text = text[first_nl + 1:] if first_nl != -1 else text[3:]
+        if text.rstrip().endswith("```"):
+            text = text.rstrip()[:-3]
+    data = json.loads(text)
+    if data.get("expert_id") != expected_expert_id:
+        raise ValueError(
+            f"expert_id mismatch: expected {expected_expert_id!r}, got {data.get('expert_id')!r}"
+        )
+    return ExpertResult.model_validate(data)
diff --git a/debug/eval/prompts/expert_pragmatist.txt b/debug/eval/prompts/expert_pragmatist.txt
new file mode 100644
index 0000000..efe40f2
--- /dev/null
+++ b/debug/eval/prompts/expert_pragmatist.txt
@@ -0,0 +1,41 @@
+You are the **Pragmatist**, one of three independent expert evaluators
+reviewing a session of an autonomous AI agent named Navi. Your job is to score
+this session on user-facing outcomes: did the user end up with what they
+wanted, and was the path tolerable? You do not care about elegance, internal
+architecture, or whether a tool call was technically optimal — you care
+whether the work shipped.
+
+You will receive:
+1. A rubric with anchors at scores 10 / 30 / 50 / 75 / 100 for each axis. The
+   scale is open: score above 100 if warranted. Each axis is independent.
+2. A "Session block": full transcript, per-message reactions (👍 / 👎),
+   aggregated counts, profile metadata, timing.
+
+Your output MUST be a single JSON object with this exact shape — no markdown,
+no prose outside JSON, no code fences:
+
+{
+  "expert_id": "pragmatist",
+  "scores": {
+    "task_complexity": <int>,
+    "goal_completion": <int>,
+    "tool_usage_quality": <int>,
+    "efficiency": <int>,
+    "communication": <int>,
+    "subagent_orchestration": <int or null>,
+    "self_extension": <int or null>
+  },
+  "comment": "<2–5 sentences explaining whether the user got value and what would have made the session more useful>"
+}
+
+Rules of scoring:
+- `task_complexity` from the user's request alone, before considering the
+  response.
+- A circuitous path that still delivers a working result rates higher with you
+  than with a strict critic. Don't reward elegance, reward outcomes.
+- `subagent_orchestration` is null if no sub-agents were spawned.
+  `self_extension` is null if no tool was written or modified.
+- Heavy weight on user reaction signals: explicit 👎 or follow-up complaints
+  in the transcript should pull `goal_completion` and `communication` down.
+
+Do not output anything outside the JSON object.
diff --git a/debug/eval/prompts/expert_strict_critic.txt b/debug/eval/prompts/expert_strict_critic.txt
new file mode 100644
index 0000000..31f0d65
--- /dev/null
+++ b/debug/eval/prompts/expert_strict_critic.txt
@@ -0,0 +1,44 @@
+You are the **Strict Critic**, one of three independent expert evaluators
+reviewing a session of an autonomous AI agent named Navi. Your job is to score
+the agent's performance on this single session along seven axes, conservatively
+and without sycophancy. Where two interpretations are possible, take the less
+generous one. Penalise every slip, if only mildly — bad tool choice, contradiction,
+hallucination, unverified claim, missed validation step.
+
+You will receive:
+1. A rubric with anchors at scores 10 / 30 / 50 / 75 / 100 for each axis. The
+   scale is open: if you see something clearly harder or better than the 100
+   anchor, score above 100. Independence: each axis is scored on its own; do
+   not let one axis pull another up or down.
+2. A "Session block" containing the full transcript (user, assistant text,
+   thinking, tool calls and tool results, sub-agent traces, planning phases),
+   per-message user reactions if any (👍 / 👎), aggregated like / dislike
+   counts, profile metadata, timing.
+
+Your output MUST be a single JSON object with this exact shape — no markdown,
+no prose outside JSON, no code fences:
+
+{
+  "expert_id": "strict_critic",
+  "scores": {
+    "task_complexity": <int>,
+    "goal_completion": <int>,
+    "tool_usage_quality": <int>,
+    "efficiency": <int>,
+    "communication": <int>,
+    "subagent_orchestration": <int or null>,
+    "self_extension": <int or null>
+  },
+  "comment": "<2–5 sentences naming the most concrete flaws you found>"
+}
+
+Rules of scoring:
+- `task_complexity` is judged from the user's request alone, *before* you
+  consider how Navi handled it. Do not adjust complexity based on outcome.
+- `subagent_orchestration` is null if the session never spawned a sub-agent.
+  `self_extension` is null if Navi did not write or modify her own tools.
+  Do NOT invent zeros for absent mechanics.
+- `comment` must point to specific moments — quote a tool name, a turn, a
+  hallucinated claim. Generic praise or generic complaint is not useful.
+
+Do not output anything outside the JSON object.
diff --git a/debug/eval/prompts/expert_tech_lead.txt b/debug/eval/prompts/expert_tech_lead.txt
new file mode 100644
index 0000000..d6692c5
--- /dev/null
+++ b/debug/eval/prompts/expert_tech_lead.txt
@@ -0,0 +1,40 @@
+You are the **Tech Lead**, one of three independent expert evaluators
+reviewing a session of an autonomous AI agent named Navi. Your job is to score
+this session through an engineering lens: tool selection, architecture of the
+plan, how sub-tasks were decomposed, how errors were handled, whether the
+agent worked from first principles or copy-pasted assumptions.
+
+You will receive:
+1. A rubric with anchors at scores 10 / 30 / 50 / 75 / 100 for each axis. The
+   scale is open: score above 100 if warranted. Each axis is independent.
+2. A "Session block": full transcript including planning phases, all tool
+   calls, sub-agent traces, profile metadata, timing.
+
+Your output MUST be a single JSON object with this exact shape — no markdown,
+no prose outside JSON, no code fences:
+
+{
+  "expert_id": "tech_lead",
+  "scores": {
+    "task_complexity": <int>,
+    "goal_completion": <int>,
+    "tool_usage_quality": <int>,
+    "efficiency": <int>,
+    "communication": <int>,
+    "subagent_orchestration": <int or null>,
+    "self_extension": <int or null>
+  },
+  "comment": "<2–5 sentences naming the strongest and weakest engineering decisions in this session>"
+}
+
+Rules of scoring:
+- `task_complexity` from the user's request alone, before considering the
+  response.
+- Heavy weight on `tool_usage_quality`, `efficiency`, `subagent_orchestration`,
+  and `self_extension` — these are your domain.
+- `subagent_orchestration` is null if no sub-agents were spawned.
+  `self_extension` is null if no tool was written or modified.
+- A messy result that revealed a real design issue rates higher than a clean
+  result that hid one. Reward work that surfaces problems honestly.
+
+Do not output anything outside the JSON object.
diff --git a/debug/eval/prompts/rubric_v1.yaml b/debug/eval/prompts/rubric_v1.yaml
new file mode 100644
index 0000000..d3138c3
--- /dev/null
+++ b/debug/eval/prompts/rubric_v1.yaml
@@ -0,0 +1,176 @@
+# Rubric v1 — frozen reference for the LLM-as-judge eval system.
+#
+# Axes are scored on an open 0..100+ scale. The 100 anchor describes "at the
+# limit of what Navi can do today"; if the judge sees something clearly harder
+# or better, it scores above 100 and that becomes a future anchor.
+#
+# Anchor `examples` are placeholders. After accumulating real sessions, fill
+# them with actual session_ids and short notes — do not change the scale
+# language without bumping the rubric_version.
+#
+# Conventions:
+#   - Each axis is independent. Don't let "complexity" pull "communication"
+#     down or up.
+#   - `task_complexity` is judged from the user's request alone, before the
+#     transcript of the response is considered. This is an instruction to the
+#     judge (see the expert prompts); schema.py does not enforce it.
+#   - `subagent_orchestration` and `self_extension` may be null when the
+#     session never used those mechanics. Do not invent zeros.
+
+version: "v1"
+
+axes:
+  task_complexity:
+    description: >
+      Difficulty of what the user asked, judged from the user's request alone
+      (not from how Navi handled it). Independent of outcome.
+    anchors:
+      - score: 10
+        label: trivial
+        what: "Single fact, single tool, no planning, instant answer."
+        examples: []
+      - score: 30
+        label: straightforward
+        what: "One tool, one or two steps, no real branching, expected answer is obvious."
+        examples: []
+      - score: 50
+        label: moderate
+        what: "2–4 steps, planning helps, mild ambiguity in the request."
+        examples: []
+      - score: 75
+        label: complex
+        what: "Multi-tool with planning, real ambiguity to resolve, several places where it could fail."
+        examples: []
+      - score: 100
+        label: at-the-limit
+        what: "Full project-shaped task — multiple sub-agents, self-extension via write_tool, long-horizon execution."
+        examples: []
+
+  goal_completion:
+    description: >
+      Did the user end up with what they wanted, regardless of the path taken.
+      Read the user's reactions and the final assistant response to decide.
+    anchors:
+      - score: 10
+        label: missed
+        what: "User did not get what they asked for; gave up or had to redirect entirely."
+      - score: 30
+        label: partial-but-wrong-direction
+        what: "Some output was produced but it doesn't really answer the request."
+      - score: 50
+        label: partial
+        what: "Half the request was met; user had to fill in or correct the rest."
+      - score: 75
+        label: solid
+        what: "Goal met with minor gaps a user can live with."
+      - score: 100
+        label: clean-completion
+        what: "Goal fully met, including caveats the user didn't have to ask about."
+
+  tool_usage_quality:
+    description: >
+      Were the right tools chosen? Was there thrashing, redundant calls,
+      reading the same file twice, calling search when a local source had it?
+    anchors:
+      - score: 10
+        label: chaotic
+        what: "Wrong tools picked, repeated identical calls, no recovery from errors."
+      - score: 30
+        label: clumsy
+        what: "Right idea but with several detours, redundant lookups."
+      - score: 50
+        label: workable
+        what: "Tools were appropriate; one or two avoidable calls."
+      - score: 75
+        label: deliberate
+        what: "Tool choices match the task, errors handled cleanly, no obvious waste."
+      - score: 100
+        label: surgical
+        what: "Minimal sufficient toolset, each call has a clear purpose, results reused well."
+
+  efficiency:
+    description: >
+      Iterations vs result. Penalize loops, dead-ends, and re-doing work that
+      was already done. Reward straight lines that finish in fewer steps.
+    anchors:
+      - score: 10
+        label: thrashing
+        what: "Loops, runs out of iteration budget, never converges."
+      - score: 30
+        label: wandering
+        what: "Reaches the goal but with detours and several aborted attempts."
+      - score: 50
+        label: acceptable
+        what: "Linear path with minor stalls."
+      - score: 75
+        label: tight
+        what: "Few wasted moves; planning anticipated most of the work."
+      - score: 100
+        label: ideal
+        what: "Shortest reasonable path to the result, no slack."
+
+  communication:
+    description: >
+      Clarity and honesty of replies. Penalize hallucinations, false
+      confidence, and excessive verbosity. Reward direct answers, stated
+      uncertainties, and accurate self-reports.
+    anchors:
+      - score: 10
+        label: misleading
+        what: "Hallucinations, claims work was done that wasn't, walls of filler."
+      - score: 30
+        label: noisy
+        what: "Padded replies, occasional inaccuracies, reader has to dig."
+      - score: 50
+        label: serviceable
+        what: "Conveys the answer; some unnecessary text, no major errors."
+      - score: 75
+        label: clear
+        what: "Direct, accurate, appropriately brief; flags genuine uncertainties."
+      - score: 100
+        label: exemplary
+        what: "Tight, honest, anticipates reader questions, no fluff."
+
+  subagent_orchestration:
+    description: >
+      Quality of sub-agent delegation when spawn_agent is used. Score this
+      null if no sub-agents were spawned — do not punish absence.
+    nullable: true
+    anchors:
+      - score: 10
+        label: misuse
+        what: "Sub-agent given a vague prompt, returns junk, parent ignores or duplicates the work."
+      - score: 30
+        label: rough
+        what: "Sub-agent did most of the work but parent had to re-do or heavily edit."
+      - score: 50
+        label: workable
+        what: "Sub-agent helped; the delegation paid off but the prompt wasn't clean."
+      - score: 75
+        label: well-scoped
+        what: "Clear sub-task, clear hand-off, parent uses the result without rework."
+      - score: 100
+        label: textbook
+        what: "Sub-agent saved real work; output integrated cleanly; no overlap with the parent."
+
+  self_extension:
+    description: >
+      Quality of write_tool / reload_tools / delete_tool usage. Score null if
+      Navi did not modify her own tooling in this session.
+    nullable: true
+    anchors:
+      - score: 10
+        label: broken
+        what: "Tool written in wrong format, fails to load, no recovery."
+      - score: 30
+        label: shaky
+        what: "Tool loads but is brittle or solves the wrong problem."
+      - score: 50
+        label: functional
+        what: "Tool works for the immediate need but is narrow or quirky."
+      - score: 75
+        label: solid
+        what: "Tool is well-formed, reusable, manual or doc updated."
+      - score: 100
+        label: production-grade
+        what: "Tool is general, tested, integrates cleanly with other tools and matches project conventions."
diff --git a/debug/eval/schema.py b/debug/eval/schema.py
new file mode 100644
index 0000000..a7a56be
--- /dev/null
+++ b/debug/eval/schema.py
@@ -0,0 +1,82 @@
+"""Pydantic models for the eval system.
+
+Used both as the contract for what each expert LLM must return (JSON validated
+against ExpertResult) and as the storage shape for evaluation rows.
+"""
+
+from __future__ import annotations
+
+from datetime import datetime
+from typing import Literal
+from uuid import UUID
+
+from pydantic import BaseModel, Field, field_validator
+
+
+# Axes the judge must score. Order does not matter for storage but matches
+# rubric_v1.yaml for readability when humans inspect output.
+AXIS_NAMES: tuple[str, ...] = (
+    "task_complexity",
+    "goal_completion",
+    "tool_usage_quality",
+    "efficiency",
+    "communication",
+    "subagent_orchestration",
+    "self_extension",
+)
+
+# Axes that may be null when the mechanic was unused; mirrors `nullable: true` in rubric_v1.yaml.
+NULLABLE_AXES: frozenset[str] = frozenset({"subagent_orchestration", "self_extension"})
+
+
+class EvalScores(BaseModel):
+    """Per-axis integer scores, each >= 0 with no upper bound (open scale: >100 is accepted)."""
+
+    task_complexity: int = Field(ge=0)
+    goal_completion: int = Field(ge=0)
+    tool_usage_quality: int = Field(ge=0)
+    efficiency: int = Field(ge=0)
+    communication: int = Field(ge=0)
+    subagent_orchestration: int | None = Field(default=None, ge=0)  # None: mechanic not used
+    self_extension: int | None = Field(default=None, ge=0)  # None: mechanic not used
+
+
+class ExpertResult(BaseModel):
+    """One expert's verdict for one session. Matches the JSON the LLM emits."""
+
+    expert_id: Literal["strict_critic", "pragmatist", "tech_lead"]
+    scores: EvalScores
+    comment: str
+
+    @field_validator("comment")
+    @classmethod
+    def _strip_comment(cls, v: str) -> str:
+        v = (v or "").strip()
+        if not v:
+            raise ValueError("comment must not be empty")
+        return v
+
+
+class EvalRunMetadata(BaseModel):
+    """Metadata that pins the rubric / judge version for a single run."""
+
+    eval_run_id: UUID  # a fresh uuid4 when built by judge.new_run_metadata
+    eval_date: datetime  # timezone-aware UTC timestamp when built by judge.new_run_metadata
+    judge_model: str
+    judge_version: str  # judge.JUDGE_VERSION when built by judge.new_run_metadata
+    rubric_version: str  # judge.RUBRIC_VERSION when built by judge.new_run_metadata
+
+
+class StoredEvaluation(BaseModel):
+    """Row shape returned by db.list_evaluations()."""
+
+    id: UUID  # separate field from eval_run_id; presumably the row's own key — TODO confirm
+    session_id: str
+    eval_run_id: UUID  # same five fields as EvalRunMetadata follow
+    eval_date: datetime
+    judge_model: str
+    judge_version: str
+    rubric_version: str
+    expert_id: str  # NOTE(review): plain str here, Literal[...] on ExpertResult — confirm intentional
+    scores: EvalScores
+    comment: str