"""Standalone CLI for the eval system.
Invoke from the project root:
    python -m debug.eval run [--session ID] [--since DATE] [--limit N] [--re-evaluate-all] [--dry-run]
    python -m debug.eval show <session_id>
    python -m debug.eval stats [--days 30] [--csv] [--by-complexity-bucket]
Phase 2 lands the argparse skeleton with command stubs. The real work
(transcript rendering, judge calls, score persistence, stats aggregation)
lands in Phase 3 and Phase 4 — see docs/eval_system.md.
"""
from __future__ import annotations
import argparse
import asyncio
import sys
from typing import Sequence
def _build_parser() -> argparse.ArgumentParser:
p = argparse.ArgumentParser(
prog="debug.eval",
description="LLM-as-judge evaluation runner for Navi sessions.",
)
sub = p.add_subparsers(dest="cmd", required=True)
p_run = sub.add_parser("run", help="Evaluate sessions against the pinned rubric.")
p_run.add_argument("--session", help="Single session id to evaluate.")
p_run.add_argument("--since", help="ISO date — only sessions started after this.")
p_run.add_argument("--limit", type=int, default=None, help="Max sessions to process.")
p_run.add_argument(
"--re-evaluate-all",
action="store_true",
help="Re-evaluate every session, even if a current-version row already exists.",
)
p_run.add_argument(
"--dry-run",
action="store_true",
help="List what would be evaluated, do not call the judge.",
)
p_show = sub.add_parser("show", help="Print stored evaluations for a session.")
p_show.add_argument("session_id")
p_stats = sub.add_parser("stats", help="Aggregate scores across the archive.")
p_stats.add_argument("--days", type=int, default=30, help="Window in days.")
p_stats.add_argument("--csv", action="store_true", help="Emit CSV to stdout.")
p_stats.add_argument(
"--by-complexity-bucket",
action="store_true",
help="Split by 0-25 / 26-50 / 51-75 / 76+ buckets.",
)
return p
# ── Command stubs ────────────────────────────────────────────────────────
async def _cmd_run(args: argparse.Namespace) -> int:
print("[eval] run command — not implemented yet (lands in Phase 3).", file=sys.stderr)
print(f"[eval] would process: session={args.session} since={args.since} "
f"limit={args.limit} re_eval_all={args.re_evaluate_all} dry={args.dry_run}",
file=sys.stderr)
return 2
async def _cmd_show(args: argparse.Namespace) -> int:
print(f"[eval] show command — not implemented yet (lands in Phase 3). session={args.session_id}",
file=sys.stderr)
return 2
async def _cmd_stats(args: argparse.Namespace) -> int:
print(f"[eval] stats command — not implemented yet (lands in Phase 4). "
f"days={args.days} csv={args.csv} by_bucket={args.by_complexity_bucket}",
file=sys.stderr)
return 2
# ── Entry point ──────────────────────────────────────────────────────────
def main(argv: Sequence[str] | None = None) -> int:
    """Parse the command line and run the selected subcommand.

    Args:
        argv: Argument list handed to ``parse_args``; ``None`` lets argparse
            fall back to ``sys.argv[1:]``.

    Returns:
        The integer returned by the selected ``_cmd_*`` coroutine.
    """
    namespace = _build_parser().parse_args(argv)
    handlers = {
        "run": _cmd_run,
        "show": _cmd_show,
        "stats": _cmd_stats,
    }
    handler = handlers[namespace.cmd]
    # Handlers are coroutine functions; drive the one we picked to completion.
    return asyncio.run(handler(namespace))
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())