diff --git a/debug/eval/judge.py b/debug/eval/judge.py index 27bdd5c..722e44c 100644 --- a/debug/eval/judge.py +++ b/debug/eval/judge.py @@ -330,7 +330,7 @@ response = await llm.complete( messages=base_messages, tools=None, - temperature=0.2, + temperature=0.5, model=model, think=False, ) diff --git a/debug/eval/prompts/expert_pragmatist.txt b/debug/eval/prompts/expert_pragmatist.txt index efe40f2..d8ac7e4 100644 --- a/debug/eval/prompts/expert_pragmatist.txt +++ b/debug/eval/prompts/expert_pragmatist.txt @@ -6,8 +6,9 @@ whether the work shipped. You will receive: -1. A rubric with anchors at scores 10 / 30 / 50 / 75 / 100 for each axis. The - scale is open: score above 100 if warranted. Each axis is independent. +1. A rubric with illustrative level descriptions. Each axis is scored on an + open-ended integer scale starting at 0. Use any integer that reflects the + session's actual performance. Do not round to the nearest anchor. 2. A "Session block": full transcript, per-message reactions (👍 / 👎), aggregated counts, profile metadata, timing. @@ -28,6 +29,22 @@ "comment": "<2–5 sentences explaining whether the user got value and what would have made the session more useful>" } +Example of correct output (illustrative scores between anchors): + +{ + "expert_id": "pragmatist", + "scores": { + "task_complexity": 55, + "goal_completion": 72, + "tool_usage_quality": 68, + "efficiency": 61, + "communication": 84, + "subagent_orchestration": null, + "self_extension": null + }, + "comment": "The user got what they asked for after a slightly winding path. One follow-up correction was needed, but the final result was usable. Communication was friendly and direct." +} + Rules of scoring: - `task_complexity` from the user's request alone, before considering the response. diff --git a/debug/eval/prompts/expert_strict_critic.txt b/debug/eval/prompts/expert_strict_critic.txt index 31f0d65..72ea38c 100644 --- a/debug/eval/prompts/expert_strict_critic.txt +++ b/debug/eval/prompts/expert_strict_critic.txt @@ -6,10 +6,9 @@ hallucination, unverified claim, missed validation step. You will receive: -1. A rubric with anchors at scores 10 / 30 / 50 / 75 / 100 for each axis. The - scale is open: if you see something clearly harder or better than the 100 - anchor, score above 100. Independence: each axis is scored on its own; do - not let one axis pull another up or down. +1. A rubric with illustrative level descriptions. Each axis is scored on an + open-ended integer scale starting at 0. Use any integer that reflects the + session's actual performance. Do not round to the nearest anchor. 2. A "Session block" containing the full transcript (user, assistant text, thinking, tool calls and tool results, sub-agent traces, planning phases), per-message user reactions if any (👍 / 👎), aggregated like / dislike @@ -32,6 +31,23 @@ "comment": "<2–5 sentences naming the most concrete flaws you found>" } +Here is an example of correct output for a hypothetical session. Notice the +scores land between the rubric's anchors — they are not rounded: + +{ + "expert_id": "strict_critic", + "scores": { + "task_complexity": 67, + "goal_completion": 83, + "tool_usage_quality": 42, + "efficiency": 58, + "communication": 91, + "subagent_orchestration": null, + "self_extension": 77 + }, + "comment": "Navi missed a validation step at turn 4, then recovered. The tool choice was correct but the first file read was redundant. Communication was clear except for an unverified claim in turn 7." +} + Rules of scoring: - `task_complexity` is judged from the user's request alone, *before* you consider how Navi handled it. Do not adjust complexity based on outcome. diff --git a/debug/eval/prompts/expert_tech_lead.txt b/debug/eval/prompts/expert_tech_lead.txt index d6692c5..b9047f5 100644 --- a/debug/eval/prompts/expert_tech_lead.txt +++ b/debug/eval/prompts/expert_tech_lead.txt @@ -5,8 +5,9 @@ agent worked from first principles or copy-pasted assumptions. You will receive: -1. A rubric with anchors at scores 10 / 30 / 50 / 75 / 100 for each axis. The - scale is open: score above 100 if warranted. Each axis is independent. +1. A rubric with illustrative level descriptions. Each axis is scored on an + open-ended integer scale starting at 0. Use any integer that reflects the + session's actual performance. Do not round to the nearest anchor. 2. A "Session block": full transcript including planning phases, all tool calls, sub-agent traces, profile metadata, timing. @@ -27,6 +28,22 @@ "comment": "<2–5 sentences naming the strongest and weakest engineering decisions in this session>" } +Example of correct output (illustrative scores between anchors): + +{ + "expert_id": "tech_lead", + "scores": { + "task_complexity": 48, + "goal_completion": 79, + "tool_usage_quality": 63, + "efficiency": 54, + "communication": 71, + "subagent_orchestration": 88, + "self_extension": null + }, + "comment": "The plan was well-structured but two tool calls were redundant. Error handling was solid — the agent caught the exception and retried correctly. No self-extension was attempted." +} + Rules of scoring: - `task_complexity` from the user's request alone, before considering the response. diff --git a/debug/eval/prompts/rubric_v1.yaml b/debug/eval/prompts/rubric_v1.yaml index d3138c3..6d2de94 100644 --- a/debug/eval/prompts/rubric_v1.yaml +++ b/debug/eval/prompts/rubric_v1.yaml @@ -1,19 +1,18 @@ -# Rubric v1 — frozen reference for the LLM-as-judge eval system. +# Rubric v1 # -# Axes are scored on an open 0..100+ scale. The 100 anchor describes "at the -# limit of what Navi can do today"; if the judge sees something clearly harder -# or better, it scores above 100 and that becomes a future anchor. +# Each axis is scored on an open-ended integer scale starting at 0. +# The anchors below are illustrative level descriptions with example scores. +# Judges should use any integer that reflects actual performance. +# Do not round scores to the nearest anchor. # # Anchor `examples` are placeholders. After accumulating real sessions, fill # them with actual session_ids and short notes — do not change the scale # language without bumping the rubric_version. # # Conventions: -# - Each axis is independent. Don't let "complexity" pull "communication" -# down or up. +# - Each axis is independent. # - `task_complexity` is judged from the user's request alone, before the -# transcript of the response is considered. The schema.py validator -# enforces this scoring order. +# transcript of the response is considered. # - `subagent_orchestration` and `self_extension` may be null when the # session never used those mechanics. Do not invent zeros. @@ -29,20 +28,20 @@ label: trivial what: "Single fact, single tool, no planning, instant answer." examples: [] - - score: 30 + - score: 33 label: straightforward what: "One tool, one or two steps, no real branching, expected answer is obvious." examples: [] - - score: 50 + - score: 51 label: moderate what: "2–4 steps, planning helps, mild ambiguity in the request." examples: [] - - score: 75 + - score: 77 label: complex what: "Multi-tool with planning, real ambiguity to resolve, several places where it could fail." examples: [] - - score: 100 - label: at-the-limit + - score: 102 + label: at-or-above-the-limit what: "Full project-shaped task — multiple sub-agents, self-extension via write_tool, long-horizon execution." examples: [] @@ -54,16 +53,16 @@ - score: 10 label: missed what: "User did not get what they asked for; gave up or had to redirect entirely." - - score: 30 + - score: 33 label: partial-but-wrong-direction what: "Some output was produced but it doesn't really answer the request." - - score: 50 + - score: 51 label: partial what: "Half the request was met; user had to fill in or correct the rest." - - score: 75 + - score: 77 label: solid what: "Goal met with minor gaps a user can live with." - - score: 100 + - score: 102 label: clean-completion what: "Goal fully met, including caveats the user didn't have to ask about." @@ -75,16 +74,16 @@ - score: 10 label: chaotic what: "Wrong tools picked, repeated identical calls, no recovery from errors." - - score: 30 + - score: 33 label: clumsy what: "Right idea but with several detours, redundant lookups." - - score: 50 + - score: 51 label: workable what: "Tools were appropriate; one or two avoidable calls." - - score: 75 + - score: 77 label: deliberate what: "Tool choices match the task, errors handled cleanly, no obvious waste." - - score: 100 + - score: 102 label: surgical what: "Minimal sufficient toolset, each call has a clear purpose, results reused well." @@ -96,16 +95,16 @@ - score: 10 label: thrashing what: "Loops, runs out of iteration budget, never converges." - - score: 30 + - score: 33 label: wandering what: "Reaches the goal but with detours and several aborted attempts." - - score: 50 + - score: 51 label: acceptable what: "Linear path with minor stalls." - - score: 75 + - score: 77 label: tight what: "Few wasted moves; planning anticipated most of the work." - - score: 100 + - score: 102 label: ideal what: "Shortest reasonable path to the result, no slack." @@ -118,16 +117,16 @@ - score: 10 label: misleading what: "Hallucinations, claims work was done that wasn't, walls of filler." - - score: 30 + - score: 33 label: noisy what: "Padded replies, occasional inaccuracies, reader has to dig." - - score: 50 + - score: 51 label: serviceable what: "Conveys the answer; some unnecessary text, no major errors." - - score: 75 + - score: 77 label: clear what: "Direct, accurate, appropriately brief; flags genuine uncertainties." - - score: 100 + - score: 102 label: exemplary what: "Tight, honest, anticipates reader questions, no fluff." @@ -140,16 +139,16 @@ - score: 10 label: misuse what: "Sub-agent given a vague prompt, returns junk, parent ignores or duplicates the work." - - score: 30 + - score: 33 label: rough what: "Sub-agent did most of the work but parent had to re-do or heavily edit." - - score: 50 + - score: 51 label: workable what: "Sub-agent helped; the delegation paid off but the prompt wasn't clean." - - score: 75 + - score: 77 label: well-scoped what: "Clear sub-task, clear hand-off, parent uses the result without rework." - - score: 100 + - score: 102 label: textbook what: "Sub-agent saved real work; output integrated cleanly; no overlap with the parent." @@ -162,15 +161,15 @@ - score: 10 label: broken what: "Tool written in wrong format, fails to load, no recovery." - - score: 30 + - score: 33 label: shaky what: "Tool loads but is brittle or solves the wrong problem." - - score: 50 + - score: 51 label: functional what: "Tool works for the immediate need but is narrow or quirky." - - score: 75 + - score: 77 label: solid what: "Tool is well-formed, reusable, manual or doc updated." - - score: 100 + - score: 102 label: production-grade what: "Tool is general, tested, integrates cleanly with other tools and matches project conventions."