diff --git a/debug/eval/judge.py b/debug/eval/judge.py
index 27bdd5c..722e44c 100644
--- a/debug/eval/judge.py
+++ b/debug/eval/judge.py
@@ -330,7 +330,7 @@
     response = await llm.complete(
         messages=base_messages,
         tools=None,
-        temperature=0.2,
+        temperature=0.5,
         model=model,
         think=False,
     )
diff --git a/debug/eval/prompts/expert_pragmatist.txt b/debug/eval/prompts/expert_pragmatist.txt
index efe40f2..d8ac7e4 100644
--- a/debug/eval/prompts/expert_pragmatist.txt
+++ b/debug/eval/prompts/expert_pragmatist.txt
@@ -6,8 +6,9 @@
 whether the work shipped.
 
 You will receive:
-1. A rubric with anchors at scores 10 / 30 / 50 / 75 / 100 for each axis. The
-   scale is open: score above 100 if warranted. Each axis is independent.
+1. A rubric with illustrative anchor descriptions. Each axis is scored
+   independently on an open-ended integer scale starting at 0. Use any integer
+   that reflects the session's actual performance; do not round to an anchor.
 2. A "Session block": full transcript, per-message reactions (👍 / 👎),
    aggregated counts, profile metadata, timing.
 
@@ -28,6 +29,22 @@
   "comment": "<2–5 sentences explaining whether the user got value and what would have made the session more useful>"
 }
 
+Example of correct output (illustrative scores between anchors):
+
+{
+  "expert_id": "pragmatist",
+  "scores": {
+    "task_complexity": 55,
+    "goal_completion": 72,
+    "tool_usage_quality": 68,
+    "efficiency": 61,
+    "communication": 84,
+    "subagent_orchestration": null,
+    "self_extension": null
+  },
+  "comment": "The user got what they asked for after a slightly winding path. One follow-up correction was needed, but the final result was usable. Communication was friendly and direct."
+}
+
 Rules of scoring:
 - `task_complexity` from the user's request alone, before considering the
   response.
diff --git a/debug/eval/prompts/expert_strict_critic.txt b/debug/eval/prompts/expert_strict_critic.txt
index 31f0d65..72ea38c 100644
--- a/debug/eval/prompts/expert_strict_critic.txt
+++ b/debug/eval/prompts/expert_strict_critic.txt
@@ -6,10 +6,9 @@
 hallucination, unverified claim, missed validation step.
 
 You will receive:
-1. A rubric with anchors at scores 10 / 30 / 50 / 75 / 100 for each axis. The
-   scale is open: if you see something clearly harder or better than the 100
-   anchor, score above 100. Independence: each axis is scored on its own; do
-   not let one axis pull another up or down.
+1. A rubric with illustrative anchor descriptions. Each axis is scored
+   independently on an open-ended integer scale starting at 0. Use any integer
+   that reflects the session's actual performance; do not round to an anchor.
 2. A "Session block" containing the full transcript (user, assistant text,
    thinking, tool calls and tool results, sub-agent traces, planning phases),
    per-message user reactions if any (👍 / 👎), aggregated like / dislike
@@ -32,6 +31,23 @@
   "comment": "<2–5 sentences naming the most concrete flaws you found>"
 }
 
+Here is an example of correct output for a hypothetical session. Notice the
+scores land between the rubric's anchors — they are not rounded:
+
+{
+  "expert_id": "strict_critic",
+  "scores": {
+    "task_complexity": 67,
+    "goal_completion": 83,
+    "tool_usage_quality": 42,
+    "efficiency": 58,
+    "communication": 91,
+    "subagent_orchestration": null,
+    "self_extension": 74
+  },
+  "comment": "Navi missed a validation step at turn 4, then recovered. The tool choice was correct but the first file read was redundant. Communication was clear except for an unverified claim in turn 7."
+}
+
 Rules of scoring:
 - `task_complexity` is judged from the user's request alone, *before* you
   consider how Navi handled it. Do not adjust complexity based on outcome.
diff --git a/debug/eval/prompts/expert_tech_lead.txt b/debug/eval/prompts/expert_tech_lead.txt
index d6692c5..b9047f5 100644
--- a/debug/eval/prompts/expert_tech_lead.txt
+++ b/debug/eval/prompts/expert_tech_lead.txt
@@ -5,8 +5,9 @@
 agent worked from first principles or copy-pasted assumptions.
 
 You will receive:
-1. A rubric with anchors at scores 10 / 30 / 50 / 75 / 100 for each axis. The
-   scale is open: score above 100 if warranted. Each axis is independent.
+1. A rubric with illustrative anchor descriptions. Each axis is scored
+   independently on an open-ended integer scale starting at 0. Use any integer
+   that reflects the session's actual performance; do not round to an anchor.
 2. A "Session block": full transcript including planning phases, all tool
    calls, sub-agent traces, profile metadata, timing.
 
@@ -27,6 +28,22 @@
   "comment": "<2–5 sentences naming the strongest and weakest engineering decisions in this session>"
 }
 
+Example of correct output (illustrative scores between anchors):
+
+{
+  "expert_id": "tech_lead",
+  "scores": {
+    "task_complexity": 48,
+    "goal_completion": 79,
+    "tool_usage_quality": 63,
+    "efficiency": 54,
+    "communication": 71,
+    "subagent_orchestration": 88,
+    "self_extension": null
+  },
+  "comment": "The plan was well-structured but two tool calls were redundant. Error handling was solid — the agent caught the exception and retried correctly. No self-extension was attempted."
+}
+
 Rules of scoring:
 - `task_complexity` from the user's request alone, before considering the
   response.
diff --git a/debug/eval/prompts/rubric_v1.yaml b/debug/eval/prompts/rubric_v1.yaml
index d3138c3..6d2de94 100644
--- a/debug/eval/prompts/rubric_v1.yaml
+++ b/debug/eval/prompts/rubric_v1.yaml
@@ -1,19 +1,18 @@
-# Rubric v1 — frozen reference for the LLM-as-judge eval system.
+# Rubric v1 — reference rubric for the LLM-as-judge eval system.
 #
-# Axes are scored on an open 0..100+ scale. The 100 anchor describes "at the
-# limit of what Navi can do today"; if the judge sees something clearly harder
-# or better, it scores above 100 and that becomes a future anchor.
+# Each axis is scored on an open-ended integer scale starting at 0. The anchors
+# below are illustrative level descriptions with example scores. Judges should
+# use any integer that reflects actual performance; do not round to an anchor.
+# NOTE(review): scale wording and anchor scores changed — bump rubric_version.
 #
 # Anchor `examples` are placeholders. After accumulating real sessions, fill
 # them with actual session_ids and short notes — do not change the scale
 # language without bumping the rubric_version.
 #
 # Conventions:
-#   - Each axis is independent. Don't let "complexity" pull "communication"
-#     down or up.
+#   - Each axis is independent.
 #   - `task_complexity` is judged from the user's request alone, before the
-#     transcript of the response is considered. The schema.py validator
-#     enforces this scoring order.
+#     transcript of the response is considered.
 #   - `subagent_orchestration` and `self_extension` may be null when the
 #     session never used those mechanics. Do not invent zeros.
 
@@ -29,20 +28,20 @@
         label: trivial
         what: "Single fact, single tool, no planning, instant answer."
         examples: []
-      - score: 30
+      - score: 33
         label: straightforward
         what: "One tool, one or two steps, no real branching, expected answer is obvious."
         examples: []
-      - score: 50
+      - score: 51
         label: moderate
         what: "2–4 steps, planning helps, mild ambiguity in the request."
         examples: []
-      - score: 75
+      - score: 77
         label: complex
         what: "Multi-tool with planning, real ambiguity to resolve, several places where it could fail."
         examples: []
-      - score: 100
-        label: at-the-limit
+      - score: 102
+        label: at-or-above-the-limit
         what: "Full project-shaped task — multiple sub-agents, self-extension via write_tool, long-horizon execution."
         examples: []
 
@@ -54,16 +53,16 @@
       - score: 10
         label: missed
         what: "User did not get what they asked for; gave up or had to redirect entirely."
-      - score: 30
+      - score: 33
         label: partial-but-wrong-direction
         what: "Some output was produced but it doesn't really answer the request."
-      - score: 50
+      - score: 51
         label: partial
         what: "Half the request was met; user had to fill in or correct the rest."
-      - score: 75
+      - score: 77
         label: solid
         what: "Goal met with minor gaps a user can live with."
-      - score: 100
+      - score: 102
         label: clean-completion
         what: "Goal fully met, including caveats the user didn't have to ask about."
 
@@ -75,16 +74,16 @@
       - score: 10
         label: chaotic
         what: "Wrong tools picked, repeated identical calls, no recovery from errors."
-      - score: 30
+      - score: 33
         label: clumsy
         what: "Right idea but with several detours, redundant lookups."
-      - score: 50
+      - score: 51
         label: workable
         what: "Tools were appropriate; one or two avoidable calls."
-      - score: 75
+      - score: 77
         label: deliberate
         what: "Tool choices match the task, errors handled cleanly, no obvious waste."
-      - score: 100
+      - score: 102
         label: surgical
         what: "Minimal sufficient toolset, each call has a clear purpose, results reused well."
 
@@ -96,16 +95,16 @@
       - score: 10
         label: thrashing
         what: "Loops, runs out of iteration budget, never converges."
-      - score: 30
+      - score: 33
         label: wandering
         what: "Reaches the goal but with detours and several aborted attempts."
-      - score: 50
+      - score: 51
         label: acceptable
         what: "Linear path with minor stalls."
-      - score: 75
+      - score: 77
         label: tight
         what: "Few wasted moves; planning anticipated most of the work."
-      - score: 100
+      - score: 102
         label: ideal
         what: "Shortest reasonable path to the result, no slack."
 
@@ -118,16 +117,16 @@
       - score: 10
         label: misleading
         what: "Hallucinations, claims work was done that wasn't, walls of filler."
-      - score: 30
+      - score: 33
         label: noisy
         what: "Padded replies, occasional inaccuracies, reader has to dig."
-      - score: 50
+      - score: 51
         label: serviceable
         what: "Conveys the answer; some unnecessary text, no major errors."
-      - score: 75
+      - score: 77
         label: clear
         what: "Direct, accurate, appropriately brief; flags genuine uncertainties."
-      - score: 100
+      - score: 102
         label: exemplary
         what: "Tight, honest, anticipates reader questions, no fluff."
 
@@ -140,16 +139,16 @@
       - score: 10
         label: misuse
         what: "Sub-agent given a vague prompt, returns junk, parent ignores or duplicates the work."
-      - score: 30
+      - score: 33
         label: rough
         what: "Sub-agent did most of the work but parent had to re-do or heavily edit."
-      - score: 50
+      - score: 51
         label: workable
         what: "Sub-agent helped; the delegation paid off but the prompt wasn't clean."
-      - score: 75
+      - score: 77
         label: well-scoped
         what: "Clear sub-task, clear hand-off, parent uses the result without rework."
-      - score: 100
+      - score: 102
         label: textbook
         what: "Sub-agent saved real work; output integrated cleanly; no overlap with the parent."
 
@@ -162,15 +161,15 @@
       - score: 10
         label: broken
         what: "Tool written in wrong format, fails to load, no recovery."
-      - score: 30
+      - score: 33
         label: shaky
         what: "Tool loads but is brittle or solves the wrong problem."
-      - score: 50
+      - score: 51
         label: functional
         what: "Tool works for the immediate need but is narrow or quirky."
-      - score: 75
+      - score: 77
         label: solid
         what: "Tool is well-formed, reusable, manual or doc updated."
-      - score: 100
+      - score: 102
         label: production-grade
         what: "Tool is general, tested, integrates cleanly with other tools and matches project conventions."