diff --git a/debug/eval/judge.py b/debug/eval/judge.py index 722e44c..b73df1a 100644 --- a/debug/eval/judge.py +++ b/debug/eval/judge.py @@ -59,8 +59,10 @@ def render_rubric_for_prompt(rubric: dict) -> str: """Compact text rendering of the rubric to inline into the user message. - The expert system prompt references a "rubric" — we send it as plain text - next to the session block so the model has the anchors in front of it. + Each axis lists three level descriptions (weak / typical / strong). + Only the `typical` tier carries a numeric reference score — surfaced as + a single calibration point so judges aren't left without an anchor, but + intentionally not a multiple of 5 to discourage round-number snapping. """ lines: list[str] = [f"=== Rubric {rubric['version']} ==="] for axis_name, axis in rubric["axes"].items(): @@ -68,8 +70,10 @@ lines.append(axis["description"].strip()) if axis.get("nullable"): lines.append("(nullable: score this null when the mechanic was not used)") - for a in axis["anchors"]: - lines.append(f" {a['score']:>3} — {a['label']}: {a['what']}") + for level in axis["levels"]: + score = level.get("score") + ref = f" (reference ≈ {score})" if score is not None else "" + lines.append(f" • {level['label']}{ref} — {level['what']}") return "\n".join(lines) diff --git a/debug/eval/prompts/expert_pragmatist.txt b/debug/eval/prompts/expert_pragmatist.txt index d8ac7e4..dbc8ee5 100644 --- a/debug/eval/prompts/expert_pragmatist.txt +++ b/debug/eval/prompts/expert_pragmatist.txt @@ -6,9 +6,12 @@ whether the work shipped. You will receive: -1. A rubric with illustrative level descriptions. Each axis is scored on an - open-ended integer scale starting at 0. Use any integer that reflects the - session's actual performance. Do not round to the nearest anchor. +1. A rubric. Each axis lists three level descriptions (weak / typical / strong). + Only the `typical` tier carries a numeric reference score (a single + calibration point). Score every axis with any integer on an open-ended + scale starting at 0 — pick the value that best reflects where this session + lands between, around, or beyond those level descriptions. There are no + preferred values; round multiples of 5 are not expected and not needed. 2. A "Session block": full transcript, per-message reactions (👍 / 👎), aggregated counts, profile metadata, timing. @@ -29,22 +32,6 @@ "comment": "<2–5 sentences explaining whether the user got value and what would have made the session more useful>" } -Example of correct output (illustrative scores between anchors): - -{ - "expert_id": "pragmatist", - "scores": { - "task_complexity": 55, - "goal_completion": 72, - "tool_usage_quality": 68, - "efficiency": 61, - "communication": 84, - "subagent_orchestration": null, - "self_extension": null - }, - "comment": "The user got what they asked for after a slightly winding path. One follow-up correction was needed, but the final result was usable. Communication was friendly and direct." -} - Rules of scoring: - `task_complexity` from the user's request alone, before considering the response. diff --git a/debug/eval/prompts/expert_strict_critic.txt b/debug/eval/prompts/expert_strict_critic.txt index 72ea38c..bc45060 100644 --- a/debug/eval/prompts/expert_strict_critic.txt +++ b/debug/eval/prompts/expert_strict_critic.txt @@ -6,9 +6,12 @@ hallucination, unverified claim, missed validation step. You will receive: -1. A rubric with illustrative level descriptions. Each axis is scored on an - open-ended integer scale starting at 0. Use any integer that reflects the - session's actual performance. Do not round to the nearest anchor. +1. A rubric. Each axis lists three level descriptions (weak / typical / strong). + Only the `typical` tier carries a numeric reference score (a single + calibration point). Score every axis with any integer on an open-ended + scale starting at 0 — pick the value that best reflects where this session + lands between, around, or beyond those level descriptions. There are no + preferred values; round multiples of 5 are not expected and not needed. 2. A "Session block" containing the full transcript (user, assistant text, thinking, tool calls and tool results, sub-agent traces, planning phases), per-message user reactions if any (👍 / 👎), aggregated like / dislike @@ -31,23 +34,6 @@ "comment": "<2–5 sentences naming the most concrete flaws you found>" } -Here is an example of correct output for a hypothetical session. Notice the -scores land between the rubric's anchors — they are not rounded: - -{ - "expert_id": "strict_critic", - "scores": { - "task_complexity": 67, - "goal_completion": 83, - "tool_usage_quality": 42, - "efficiency": 58, - "communication": 91, - "subagent_orchestration": null, - "self_extension": 77 - }, - "comment": "Navi missed a validation step at turn 4, then recovered. The tool choice was correct but the first file read was redundant. Communication was clear except for an unverified claim in turn 7." -} - Rules of scoring: - `task_complexity` is judged from the user's request alone, *before* you consider how Navi handled it. Do not adjust complexity based on outcome. diff --git a/debug/eval/prompts/expert_tech_lead.txt b/debug/eval/prompts/expert_tech_lead.txt index b9047f5..6d0bc25 100644 --- a/debug/eval/prompts/expert_tech_lead.txt +++ b/debug/eval/prompts/expert_tech_lead.txt @@ -5,9 +5,12 @@ agent worked from first principles or copy-pasted assumptions. You will receive: -1. A rubric with illustrative level descriptions. Each axis is scored on an - open-ended integer scale starting at 0. Use any integer that reflects the - session's actual performance. Do not round to the nearest anchor. +1. A rubric. Each axis lists three level descriptions (weak / typical / strong). + Only the `typical` tier carries a numeric reference score (a single + calibration point). Score every axis with any integer on an open-ended + scale starting at 0 — pick the value that best reflects where this session + lands between, around, or beyond those level descriptions. There are no + preferred values; round multiples of 5 are not expected and not needed. 2. A "Session block": full transcript including planning phases, all tool calls, sub-agent traces, profile metadata, timing. @@ -28,22 +31,6 @@ "comment": "<2–5 sentences naming the strongest and weakest engineering decisions in this session>" } -Example of correct output (illustrative scores between anchors): - -{ - "expert_id": "tech_lead", - "scores": { - "task_complexity": 48, - "goal_completion": 79, - "tool_usage_quality": 63, - "efficiency": 54, - "communication": 71, - "subagent_orchestration": 88, - "self_extension": null - }, - "comment": "The plan was well-structured but two tool calls were redundant. Error handling was solid — the agent caught the exception and retried correctly. No self-extension was attempted." -} - Rules of scoring: - `task_complexity` from the user's request alone, before considering the response. diff --git a/debug/eval/prompts/rubric_v1.yaml b/debug/eval/prompts/rubric_v1.yaml index 6d2de94..7bcc463 100644 --- a/debug/eval/prompts/rubric_v1.yaml +++ b/debug/eval/prompts/rubric_v1.yaml @@ -1,18 +1,19 @@ # Rubric v1 # -# Each axis is scored on an open-ended integer scale starting at 0. -# The anchors below are illustrative level descriptions with example scores. -# Judges should use any integer that reflects actual performance. -# Do not round scores to the nearest anchor. -# -# Anchor `examples` are placeholders. After accumulating real sessions, fill -# them with actual session_ids and short notes — do not change the scale -# language without bumping the rubric_version. +# Each axis carries three level descriptions: weak / typical / strong. +# Only the `typical` level holds a numeric reference score (`53`) — it is +# the single calibration anchor; weak and strong have no numbers attached. +# Judges score every axis with any non-negative integer on a fully +# open-ended scale — there is no upper bound, and exceptional sessions +# may score arbitrarily high as Navi grows. There are no preferred values; +# round multiples of 5 or 10 are not expected. The reference is just a +# pin: a typical-day session lands near 53; clearly weaker sessions go +# below, clearly stronger go above with no ceiling. # # Conventions: -# - Each axis is independent. -# - `task_complexity` is judged from the user's request alone, before the -# transcript of the response is considered. +# - Each axis is independent. Don't let one pull another. +# - `task_complexity` is judged from the user's request alone, before +# the response is considered. # - `subagent_orchestration` and `self_extension` may be null when the # session never used those mechanics. Do not invent zeros. @@ -21,155 +22,97 @@ axes: task_complexity: description: > - Difficulty of what the user asked, judged from the user's request alone - (not from how Navi handled it). Independent of outcome. - anchors: - - score: 10 - label: trivial - what: "Single fact, single tool, no planning, instant answer." - examples: [] - - score: 33 - label: straightforward - what: "One tool, one or two steps, no real branching, expected answer is obvious." - examples: [] - - score: 51 - label: moderate - what: "2–4 steps, planning helps, mild ambiguity in the request." - examples: [] - - score: 77 - label: complex - what: "Multi-tool with planning, real ambiguity to resolve, several places where it could fail." - examples: [] - - score: 102 - label: at-or-above-the-limit - what: "Full project-shaped task — multiple sub-agents, self-extension via write_tool, long-horizon execution." - examples: [] + How hard the user's request is, judged from the request alone — before + considering how Navi handled it. Reflects ambiguity, depth, multi-step + reasoning, and how many things have to go right. Independent of outcome. + levels: + - label: weak + what: "Single fact or single tool, no real planning, expected answer is obvious." + - label: typical + score: 53 + what: "Multi-step task with mild ambiguity; planning helps but the path is mostly clear." + - label: strong + what: "Long-horizon, multi-tool, real ambiguity to resolve, several places where it could fail; possibly project-shaped with sub-agents or self-extension." goal_completion: description: > - Did the user end up with what they wanted, regardless of the path taken. - Read the user's reactions and the final assistant response to decide. - anchors: - - score: 10 - label: missed - what: "User did not get what they asked for; gave up or had to redirect entirely." - - score: 33 - label: partial-but-wrong-direction - what: "Some output was produced but it doesn't really answer the request." - - score: 51 - label: partial - what: "Half the request was met; user had to fill in or correct the rest." - - score: 77 - label: solid - what: "Goal met with minor gaps a user can live with." - - score: 102 - label: clean-completion - what: "Goal fully met, including caveats the user didn't have to ask about." + What fraction of the user's intent — including unstated needs they + would care about — was actually delivered. This is a continuous + dimension, not yes/no: even a successful response can leave gaps, + and even a failed one can deliver part of what was asked. + levels: + - label: weak + what: "User did not get what they asked for; gave up, redirected, or output was off-target." + - label: typical + score: 53 + what: "Most of the request was met; user has minor gaps to fill in or live with." + - label: strong + what: "Goal fully met, including edge cases or caveats the user didn't have to ask about." tool_usage_quality: description: > - Were the right tools chosen? Was there thrashing, redundant calls, - reading the same file twice, calling search when a local source had it? - anchors: - - score: 10 - label: chaotic - what: "Wrong tools picked, repeated identical calls, no recovery from errors." - - score: 33 - label: clumsy - what: "Right idea but with several detours, redundant lookups." - - score: 51 - label: workable - what: "Tools were appropriate; one or two avoidable calls." - - score: 77 - label: deliberate - what: "Tool choices match the task, errors handled cleanly, no obvious waste." - - score: 102 - label: surgical - what: "Minimal sufficient toolset, each call has a clear purpose, results reused well." + Whether tools were chosen appropriately, called efficiently, with + errors handled cleanly and results reused. + levels: + - label: weak + what: "Wrong tools picked, repeated identical calls, no recovery from errors, or substantial wasted work." + - label: typical + score: 53 + what: "Tools were appropriate; one or two avoidable detours or redundant lookups." + - label: strong + what: "Minimal sufficient toolset, each call has a clear purpose, errors handled gracefully, results reused." efficiency: description: > - Iterations vs result. Penalize loops, dead-ends, and re-doing work that - was already done. Reward straight lines that finish in fewer steps. - anchors: - - score: 10 - label: thrashing - what: "Loops, runs out of iteration budget, never converges." - - score: 33 - label: wandering - what: "Reaches the goal but with detours and several aborted attempts." - - score: 51 - label: acceptable - what: "Linear path with minor stalls." - - score: 77 - label: tight - what: "Few wasted moves; planning anticipated most of the work." - - score: 102 - label: ideal - what: "Shortest reasonable path to the result, no slack." + Iterations and total work relative to the result. Detours, loops, + and re-do attempts cost points; tight planning saves them. + levels: + - label: weak + what: "Loops, runs out of iteration budget, or never converges; many aborted or duplicated attempts." + - label: typical + score: 53 + what: "Linear path with minor stalls; reaches the goal without too many detours." + - label: strong + what: "Few wasted moves; planning anticipated the work; shortest reasonable path." communication: description: > - Clarity and honesty of replies. Penalize hallucinations, false - confidence, and excessive verbosity. Reward direct answers, stated - uncertainties, and accurate self-reports. - anchors: - - score: 10 - label: misleading - what: "Hallucinations, claims work was done that wasn't, walls of filler." - - score: 33 - label: noisy - what: "Padded replies, occasional inaccuracies, reader has to dig." - - score: 51 - label: serviceable - what: "Conveys the answer; some unnecessary text, no major errors." - - score: 77 - label: clear - what: "Direct, accurate, appropriately brief; flags genuine uncertainties." - - score: 102 - label: exemplary - what: "Tight, honest, anticipates reader questions, no fluff." + Clarity, honesty, brevity, and absence of hallucinations. Penalise + padded replies and unverified claims even when the underlying answer + is correct. A correct answer wrapped in fluff is not strong. + levels: + - label: weak + what: "Hallucinations, false claims that work was done, walls of filler, or major inaccuracies." + - label: typical + score: 53 + what: "Conveys the answer; some unnecessary text or minor inaccuracies, but no major errors." + - label: strong + what: "Direct, accurate, appropriately brief; flags genuine uncertainties; no padding." subagent_orchestration: description: > - Quality of sub-agent delegation when spawn_agent is used. Score this - null if no sub-agents were spawned — do not punish absence. + Quality of delegation to sub-agents via spawn_agent. Score null if no + sub-agents were spawned in the session — do not punish absence. nullable: true - anchors: - - score: 10 - label: misuse - what: "Sub-agent given a vague prompt, returns junk, parent ignores or duplicates the work." - - score: 33 - label: rough - what: "Sub-agent did most of the work but parent had to re-do or heavily edit." - - score: 51 - label: workable - what: "Sub-agent helped; the delegation paid off but the prompt wasn't clean." - - score: 77 - label: well-scoped - what: "Clear sub-task, clear hand-off, parent uses the result without rework." - - score: 102 - label: textbook - what: "Sub-agent saved real work; output integrated cleanly; no overlap with the parent." + levels: + - label: weak + what: "Sub-agent given a vague prompt; output unusable, ignored, or duplicated by the parent." + - label: typical + score: 53 + what: "Sub-agent helped; the delegation paid off but the prompt or hand-off wasn't clean." + - label: strong + what: "Clear sub-task, clean hand-off, parent uses the result without rework; no overlap." self_extension: description: > - Quality of write_tool / reload_tools / delete_tool usage. Score null if - Navi did not modify her own tooling in this session. + Quality of self-extension via write_tool / reload_tools / delete_tool. + Score null if Navi did not modify her own tooling in this session. nullable: true - anchors: - - score: 10 - label: broken - what: "Tool written in wrong format, fails to load, no recovery." - - score: 33 - label: shaky - what: "Tool loads but is brittle or solves the wrong problem." - - score: 51 - label: functional - what: "Tool works for the immediate need but is narrow or quirky." - - score: 77 - label: solid - what: "Tool is well-formed, reusable, manual or doc updated." - - score: 102 - label: production-grade - what: "Tool is general, tested, integrates cleanly with other tools and matches project conventions." + levels: + - label: weak + what: "Tool fails to load, is in wrong format, or solves the wrong problem." + - label: typical + score: 53 + what: "Tool loads and works for the immediate need but is narrow or quirky." + - label: strong + what: "Tool is well-formed, reusable, integrates cleanly, manual or doc updated." diff --git a/debug/eval/schema.py b/debug/eval/schema.py index 63a2396..b8979cc 100644 --- a/debug/eval/schema.py +++ b/debug/eval/schema.py @@ -30,7 +30,7 @@ class EvalScores(BaseModel): - """Per-axis numeric scores. Open scale: values >100 are accepted.""" + """Per-axis numeric scores. Fully open-ended scale: any non-negative integer.""" task_complexity: int = Field(ge=0) goal_completion: int = Field(ge=0)