navi-1/debug/eval/cli.py at e4771277847c3da2f64395883c6f65adfdf137fb

Fork: 0
root / navi-1
Find file
Newer
Older
navi-1 / debug / eval / cli.py
Eugene Sukhodolskiy on 26 Apr 3 KB Add eval system Phase 2 — rubric, expert prompts, judge skeleton
Raw Blame History
"""Standalone CLI for the eval system.

Invoke from the project root:

    python -m debug.eval run [--session ID] [--since DATE] [--limit N] [--re-evaluate-all] [--dry-run]
    python -m debug.eval show <session_id>
    python -m debug.eval stats [--days 30] [--csv] [--by-complexity-bucket]

Phase 2 lands the argparse skeleton with command stubs. The real work
(transcript rendering, judge calls, score persistence, stats aggregation)
lands in Phase 3 and Phase 4 — see docs/eval_system.md.
"""

from __future__ import annotations

import argparse
import asyncio
import sys
from typing import Sequence


def _build_parser() -> argparse.ArgumentParser:
    """Assemble the argument parser for the ``debug.eval`` command line.

    Three subcommands are registered -- ``run``, ``show`` and ``stats`` --
    and exactly one of them must be supplied; the chosen name is stored on
    the parsed namespace as ``cmd``.

    Returns:
        The fully configured top-level parser.
    """

    def _switch(target: argparse.ArgumentParser, name: str, text: str) -> None:
        # Every boolean option in this CLI is a plain store_true switch.
        target.add_argument(name, action="store_true", help=text)

    parser = argparse.ArgumentParser(
        description="LLM-as-judge evaluation runner for Navi sessions.",
        prog="debug.eval",
    )
    commands = parser.add_subparsers(required=True, dest="cmd")

    # `run`: session selection filters plus two behavior switches.
    run_cmd = commands.add_parser(
        "run",
        help="Evaluate sessions against the pinned rubric.",
    )
    run_cmd.add_argument("--session", help="Single session id to evaluate.")
    run_cmd.add_argument("--since", help="ISO date — only sessions started after this.")
    run_cmd.add_argument(
        "--limit",
        default=None,
        type=int,
        help="Max sessions to process.",
    )
    _switch(
        run_cmd,
        "--re-evaluate-all",
        "Re-evaluate every session, even if a current-version row already exists.",
    )
    _switch(
        run_cmd,
        "--dry-run",
        "List what would be evaluated, do not call the judge.",
    )

    # `show`: one positional session id.
    show_cmd = commands.add_parser(
        "show",
        help="Print stored evaluations for a session.",
    )
    show_cmd.add_argument("session_id")

    # `stats`: aggregation window and output-shape switches.
    stats_cmd = commands.add_parser(
        "stats",
        help="Aggregate scores across the archive.",
    )
    stats_cmd.add_argument("--days", default=30, type=int, help="Window in days.")
    _switch(stats_cmd, "--csv", "Emit CSV to stdout.")
    _switch(
        stats_cmd,
        "--by-complexity-bucket",
        "Split by 0-25 / 26-50 / 51-75 / 76+ buckets.",
    )

    return parser


# ── Command stubs ────────────────────────────────────────────────────────


async def _cmd_run(args: argparse.Namespace) -> int:
    """Placeholder handler for the ``run`` subcommand.

    Nothing is evaluated: a not-implemented notice and an echo of the
    parsed options are written to stderr.

    Args:
        args: Parsed namespace; ``session``, ``since``, ``limit``,
            ``re_evaluate_all`` and ``dry_run`` are read.

    Returns:
        Always ``2``.
    """
    err = sys.stderr
    print("[eval] run command — not implemented yet (lands in Phase 3).", file=err)
    # The option echo is built only after the notice has been emitted.
    echo = (
        f"[eval] would process: session={args.session} since={args.since} "
        f"limit={args.limit} re_eval_all={args.re_evaluate_all} dry={args.dry_run}"
    )
    print(echo, file=err)
    return 2


async def _cmd_show(args: argparse.Namespace) -> int:
    """Placeholder handler for the ``show`` subcommand.

    Writes a not-implemented notice, including the requested session id,
    to stderr.

    Args:
        args: Parsed namespace; only ``session_id`` is read.

    Returns:
        Always ``2``.
    """
    notice = (
        "[eval] show command — not implemented yet (lands in Phase 3). "
        f"session={args.session_id}"
    )
    print(notice, file=sys.stderr)
    return 2


async def _cmd_stats(args: argparse.Namespace) -> int:
    """Placeholder handler for the ``stats`` subcommand.

    Writes a not-implemented notice, echoing the parsed options, to stderr.

    Args:
        args: Parsed namespace; ``days``, ``csv`` and
            ``by_complexity_bucket`` are read.

    Returns:
        Always ``2``.
    """
    notice = (
        "[eval] stats command — not implemented yet (lands in Phase 4). "
        f"days={args.days} csv={args.csv} by_bucket={args.by_complexity_bucket}"
    )
    print(notice, file=sys.stderr)
    return 2


# ── Entry point ──────────────────────────────────────────────────────────


def main(argv: Sequence[str] | None = None) -> int:
    """Parse the command line and run the selected subcommand.

    Args:
        argv: Argument list handed to the parser. ``None`` is passed
            through unchanged, letting argparse fall back to the process
            arguments.

    Returns:
        The integer produced by the selected async command handler.
    """
    namespace = _build_parser().parse_args(argv)

    # Subcommand name -> async handler; the parser requires that one of
    # these names is present in ``cmd``.
    handlers = {"run": _cmd_run, "show": _cmd_show, "stats": _cmd_stats}
    handler = handlers[namespace.cmd]

    return asyncio.run(handler(namespace))


if __name__ == "__main__":
    # Run the CLI and surface its return code as the process exit status.
    sys.exit(main())