diff --git a/debug/eval/judge.py b/debug/eval/judge.py
index 722e44c..b73df1a 100644
--- a/debug/eval/judge.py
+++ b/debug/eval/judge.py
@@ -59,8 +59,10 @@
 def render_rubric_for_prompt(rubric: dict) -> str:
     """Compact text rendering of the rubric to inline into the user message.
 
-    The expert system prompt references a "rubric" — we send it as plain text
-    next to the session block so the model has the anchors in front of it.
+    Each axis lists three level descriptions (weak / typical / strong).
+    Only the `typical` tier carries a numeric reference score — surfaced as
+    a single calibration point so judges aren't left without an anchor, but
+    intentionally not a multiple of 5 to discourage round-number snapping.
     """
     lines: list[str] = [f"=== Rubric {rubric['version']} ==="]
     for axis_name, axis in rubric["axes"].items():
@@ -68,8 +70,10 @@
         lines.append(axis["description"].strip())
         if axis.get("nullable"):
             lines.append("(nullable: score this null when the mechanic was not used)")
-        for a in axis["anchors"]:
-            lines.append(f"  {a['score']:>3} — {a['label']}: {a['what']}")
+        for level in axis["levels"]:
+            score = level.get("score")
+            ref = f" (reference ≈ {score})" if score is not None else ""
+            lines.append(f"  • {level['label']}{ref} — {level['what']}")
     return "\n".join(lines)
 
 
diff --git a/debug/eval/prompts/expert_pragmatist.txt b/debug/eval/prompts/expert_pragmatist.txt
index d8ac7e4..dbc8ee5 100644
--- a/debug/eval/prompts/expert_pragmatist.txt
+++ b/debug/eval/prompts/expert_pragmatist.txt
@@ -6,9 +6,12 @@
 whether the work shipped.
 
 You will receive:
-1. A rubric with illustrative level descriptions. Each axis is scored on an
-   open-ended integer scale starting at 0. Use any integer that reflects the
-   session's actual performance. Do not round to the nearest anchor.
+1. A rubric. Each axis lists three level descriptions (weak / typical / strong).
+   Only the `typical` tier carries a numeric reference score (a single
+   calibration point). Score every axis with any integer on an open-ended
+   scale starting at 0 — pick the value that best reflects where this session
+   lands between, around, or beyond those level descriptions. There are no
+   preferred values; round multiples of 5 are not expected and not needed.
 2. A "Session block": full transcript, per-message reactions (👍 / 👎),
    aggregated counts, profile metadata, timing.
 
@@ -29,22 +32,6 @@
   "comment": "<2–5 sentences explaining whether the user got value and what would have made the session more useful>"
 }
 
-Example of correct output (illustrative scores between anchors):
-
-{
-  "expert_id": "pragmatist",
-  "scores": {
-    "task_complexity": 55,
-    "goal_completion": 72,
-    "tool_usage_quality": 68,
-    "efficiency": 61,
-    "communication": 84,
-    "subagent_orchestration": null,
-    "self_extension": null
-  },
-  "comment": "The user got what they asked for after a slightly winding path. One follow-up correction was needed, but the final result was usable. Communication was friendly and direct."
-}
-
 Rules of scoring:
 - `task_complexity` from the user's request alone, before considering the
   response.
diff --git a/debug/eval/prompts/expert_strict_critic.txt b/debug/eval/prompts/expert_strict_critic.txt
index 72ea38c..bc45060 100644
--- a/debug/eval/prompts/expert_strict_critic.txt
+++ b/debug/eval/prompts/expert_strict_critic.txt
@@ -6,9 +6,12 @@
 hallucination, unverified claim, missed validation step.
 
 You will receive:
-1. A rubric with illustrative level descriptions. Each axis is scored on an
-   open-ended integer scale starting at 0. Use any integer that reflects the
-   session's actual performance. Do not round to the nearest anchor.
+1. A rubric. Each axis lists three level descriptions (weak / typical / strong).
+   Only the `typical` tier carries a numeric reference score (a single
+   calibration point). Score every axis with any integer on an open-ended
+   scale starting at 0 — pick the value that best reflects where this session
+   lands between, around, or beyond those level descriptions. There are no
+   preferred values; round multiples of 5 are not expected and not needed.
 2. A "Session block" containing the full transcript (user, assistant text,
    thinking, tool calls and tool results, sub-agent traces, planning phases),
    per-message user reactions if any (👍 / 👎), aggregated like / dislike
@@ -31,23 +34,6 @@
   "comment": "<2–5 sentences naming the most concrete flaws you found>"
 }
 
-Here is an example of correct output for a hypothetical session. Notice the
-scores land between the rubric's anchors — they are not rounded:
-
-{
-  "expert_id": "strict_critic",
-  "scores": {
-    "task_complexity": 67,
-    "goal_completion": 83,
-    "tool_usage_quality": 42,
-    "efficiency": 58,
-    "communication": 91,
-    "subagent_orchestration": null,
-    "self_extension": 77
-  },
-  "comment": "Navi missed a validation step at turn 4, then recovered. The tool choice was correct but the first file read was redundant. Communication was clear except for an unverified claim in turn 7."
-}
-
 Rules of scoring:
 - `task_complexity` is judged from the user's request alone, *before* you
   consider how Navi handled it. Do not adjust complexity based on outcome.
diff --git a/debug/eval/prompts/expert_tech_lead.txt b/debug/eval/prompts/expert_tech_lead.txt
index b9047f5..6d0bc25 100644
--- a/debug/eval/prompts/expert_tech_lead.txt
+++ b/debug/eval/prompts/expert_tech_lead.txt
@@ -5,9 +5,12 @@
 agent worked from first principles or copy-pasted assumptions.
 
 You will receive:
-1. A rubric with illustrative level descriptions. Each axis is scored on an
-   open-ended integer scale starting at 0. Use any integer that reflects the
-   session's actual performance. Do not round to the nearest anchor.
+1. A rubric. Each axis lists three level descriptions (weak / typical / strong).
+   Only the `typical` tier carries a numeric reference score (a single
+   calibration point). Score every axis with any integer on an open-ended
+   scale starting at 0 — pick the value that best reflects where this session
+   lands between, around, or beyond those level descriptions. There are no
+   preferred values; round multiples of 5 are not expected and not needed.
 2. A "Session block": full transcript including planning phases, all tool
    calls, sub-agent traces, profile metadata, timing.
 
@@ -28,22 +31,6 @@
   "comment": "<2–5 sentences naming the strongest and weakest engineering decisions in this session>"
 }
 
-Example of correct output (illustrative scores between anchors):
-
-{
-  "expert_id": "tech_lead",
-  "scores": {
-    "task_complexity": 48,
-    "goal_completion": 79,
-    "tool_usage_quality": 63,
-    "efficiency": 54,
-    "communication": 71,
-    "subagent_orchestration": 88,
-    "self_extension": null
-  },
-  "comment": "The plan was well-structured but two tool calls were redundant. Error handling was solid — the agent caught the exception and retried correctly. No self-extension was attempted."
-}
-
 Rules of scoring:
 - `task_complexity` from the user's request alone, before considering the
   response.
diff --git a/debug/eval/prompts/rubric_v1.yaml b/debug/eval/prompts/rubric_v1.yaml
index 6d2de94..7bcc463 100644
--- a/debug/eval/prompts/rubric_v1.yaml
+++ b/debug/eval/prompts/rubric_v1.yaml
@@ -1,18 +1,19 @@
 # Rubric v1
 #
-# Each axis is scored on an open-ended integer scale starting at 0.
-# The anchors below are illustrative level descriptions with example scores.
-# Judges should use any integer that reflects actual performance.
-# Do not round scores to the nearest anchor.
-#
-# Anchor `examples` are placeholders. After accumulating real sessions, fill
-# them with actual session_ids and short notes — do not change the scale
-# language without bumping the rubric_version.
+# Each axis carries three level descriptions: weak / typical / strong.
+# Only the `typical` level holds a numeric reference score (`53`) — it is
+# the single calibration anchor; weak and strong have no numbers attached.
+# Judges score every axis with any non-negative integer on a fully
+# open-ended scale — there is no upper bound, and exceptional sessions
+# may score arbitrarily high as Navi grows. There are no preferred values;
+# round multiples of 5 or 10 are not expected. The reference is just a
+# pin: a typical-day session lands near 53; clearly weaker sessions go
+# below, clearly stronger go above with no ceiling.
 #
 # Conventions:
-#   - Each axis is independent.
-#   - `task_complexity` is judged from the user's request alone, before the
-#     transcript of the response is considered.
+#   - Each axis is independent. Don't let one axis's score pull another's.
+#   - `task_complexity` is judged from the user's request alone, before
+#     the response is considered.
 #   - `subagent_orchestration` and `self_extension` may be null when the
 #     session never used those mechanics. Do not invent zeros.
 
@@ -21,155 +22,97 @@
 axes:
   task_complexity:
     description: >
-      Difficulty of what the user asked, judged from the user's request alone
-      (not from how Navi handled it). Independent of outcome.
-    anchors:
-      - score: 10
-        label: trivial
-        what: "Single fact, single tool, no planning, instant answer."
-        examples: []
-      - score: 33
-        label: straightforward
-        what: "One tool, one or two steps, no real branching, expected answer is obvious."
-        examples: []
-      - score: 51
-        label: moderate
-        what: "2–4 steps, planning helps, mild ambiguity in the request."
-        examples: []
-      - score: 77
-        label: complex
-        what: "Multi-tool with planning, real ambiguity to resolve, several places where it could fail."
-        examples: []
-      - score: 102
-        label: at-or-above-the-limit
-        what: "Full project-shaped task — multiple sub-agents, self-extension via write_tool, long-horizon execution."
-        examples: []
+      How hard the user's request is, judged from the request alone — before
+      considering how Navi handled it. Reflects ambiguity, depth, multi-step
+      reasoning, and how many things have to go right. Independent of outcome.
+    levels:
+      - label: weak
+        what: "Single fact or single tool, no real planning, expected answer is obvious."
+      - label: typical
+        score: 53
+        what: "Multi-step task with mild ambiguity; planning helps but the path is mostly clear."
+      - label: strong
+        what: "Long-horizon, multi-tool, real ambiguity to resolve, several places where it could fail; possibly project-shaped with sub-agents or self-extension."
 
   goal_completion:
     description: >
-      Did the user end up with what they wanted, regardless of the path taken.
-      Read the user's reactions and the final assistant response to decide.
-    anchors:
-      - score: 10
-        label: missed
-        what: "User did not get what they asked for; gave up or had to redirect entirely."
-      - score: 33
-        label: partial-but-wrong-direction
-        what: "Some output was produced but it doesn't really answer the request."
-      - score: 51
-        label: partial
-        what: "Half the request was met; user had to fill in or correct the rest."
-      - score: 77
-        label: solid
-        what: "Goal met with minor gaps a user can live with."
-      - score: 102
-        label: clean-completion
-        what: "Goal fully met, including caveats the user didn't have to ask about."
+      What fraction of the user's intent — including unstated needs they
+      would care about — was actually delivered. This is a continuous
+      dimension, not yes/no: even a successful response can leave gaps,
+      and even a failed one can deliver part of what was asked.
+    levels:
+      - label: weak
+        what: "User did not get what they asked for; gave up, redirected, or output was off-target."
+      - label: typical
+        score: 53
+        what: "Most of the request was met; user has minor gaps to fill in or live with."
+      - label: strong
+        what: "Goal fully met, including edge cases or caveats the user didn't have to ask about."
 
   tool_usage_quality:
     description: >
-      Were the right tools chosen? Was there thrashing, redundant calls,
-      reading the same file twice, calling search when a local source had it?
-    anchors:
-      - score: 10
-        label: chaotic
-        what: "Wrong tools picked, repeated identical calls, no recovery from errors."
-      - score: 33
-        label: clumsy
-        what: "Right idea but with several detours, redundant lookups."
-      - score: 51
-        label: workable
-        what: "Tools were appropriate; one or two avoidable calls."
-      - score: 77
-        label: deliberate
-        what: "Tool choices match the task, errors handled cleanly, no obvious waste."
-      - score: 102
-        label: surgical
-        what: "Minimal sufficient toolset, each call has a clear purpose, results reused well."
+      Whether tools were chosen appropriately, called efficiently, with
+      errors handled cleanly and results reused.
+    levels:
+      - label: weak
+        what: "Wrong tools picked, repeated identical calls, no recovery from errors, or substantial wasted work."
+      - label: typical
+        score: 53
+        what: "Tools were appropriate; one or two avoidable detours or redundant lookups."
+      - label: strong
+        what: "Minimal sufficient toolset, each call has a clear purpose, errors handled gracefully, results reused."
 
   efficiency:
     description: >
-      Iterations vs result. Penalize loops, dead-ends, and re-doing work that
-      was already done. Reward straight lines that finish in fewer steps.
-    anchors:
-      - score: 10
-        label: thrashing
-        what: "Loops, runs out of iteration budget, never converges."
-      - score: 33
-        label: wandering
-        what: "Reaches the goal but with detours and several aborted attempts."
-      - score: 51
-        label: acceptable
-        what: "Linear path with minor stalls."
-      - score: 77
-        label: tight
-        what: "Few wasted moves; planning anticipated most of the work."
-      - score: 102
-        label: ideal
-        what: "Shortest reasonable path to the result, no slack."
+      Iterations and total work relative to the result. Detours, loops,
+      and re-do attempts cost points; tight planning saves them.
+    levels:
+      - label: weak
+        what: "Loops, runs out of iteration budget, or never converges; many aborted or duplicated attempts."
+      - label: typical
+        score: 53
+        what: "Linear path with minor stalls; reaches the goal without too many detours."
+      - label: strong
+        what: "Few wasted moves; planning anticipated the work; shortest reasonable path."
 
   communication:
     description: >
-      Clarity and honesty of replies. Penalize hallucinations, false
-      confidence, and excessive verbosity. Reward direct answers, stated
-      uncertainties, and accurate self-reports.
-    anchors:
-      - score: 10
-        label: misleading
-        what: "Hallucinations, claims work was done that wasn't, walls of filler."
-      - score: 33
-        label: noisy
-        what: "Padded replies, occasional inaccuracies, reader has to dig."
-      - score: 51
-        label: serviceable
-        what: "Conveys the answer; some unnecessary text, no major errors."
-      - score: 77
-        label: clear
-        what: "Direct, accurate, appropriately brief; flags genuine uncertainties."
-      - score: 102
-        label: exemplary
-        what: "Tight, honest, anticipates reader questions, no fluff."
+      Clarity, honesty, brevity, and absence of hallucinations. Penalize
+      padded replies and unverified claims even when the underlying answer
+      is correct. A correct answer wrapped in fluff is not strong.
+    levels:
+      - label: weak
+        what: "Hallucinations, false claims that work was done, walls of filler, or major inaccuracies."
+      - label: typical
+        score: 53
+        what: "Conveys the answer; some unnecessary text or minor inaccuracies, but no major errors."
+      - label: strong
+        what: "Direct, accurate, appropriately brief; flags genuine uncertainties; no padding."
 
   subagent_orchestration:
     description: >
-      Quality of sub-agent delegation when spawn_agent is used. Score this
-      null if no sub-agents were spawned — do not punish absence.
+      Quality of delegation to sub-agents via spawn_agent. Score null if no
+      sub-agents were spawned in the session — do not punish absence.
     nullable: true
-    anchors:
-      - score: 10
-        label: misuse
-        what: "Sub-agent given a vague prompt, returns junk, parent ignores or duplicates the work."
-      - score: 33
-        label: rough
-        what: "Sub-agent did most of the work but parent had to re-do or heavily edit."
-      - score: 51
-        label: workable
-        what: "Sub-agent helped; the delegation paid off but the prompt wasn't clean."
-      - score: 77
-        label: well-scoped
-        what: "Clear sub-task, clear hand-off, parent uses the result without rework."
-      - score: 102
-        label: textbook
-        what: "Sub-agent saved real work; output integrated cleanly; no overlap with the parent."
+    levels:
+      - label: weak
+        what: "Sub-agent given a vague prompt; output unusable, ignored, or duplicated by the parent."
+      - label: typical
+        score: 53
+        what: "Sub-agent helped; the delegation paid off but the prompt or hand-off wasn't clean."
+      - label: strong
+        what: "Clear sub-task, clean hand-off, parent uses the result without rework; no overlap."
 
   self_extension:
     description: >
-      Quality of write_tool / reload_tools / delete_tool usage. Score null if
-      Navi did not modify her own tooling in this session.
+      Quality of self-extension via write_tool / reload_tools / delete_tool.
+      Score null if Navi did not modify her own tooling in this session.
     nullable: true
-    anchors:
-      - score: 10
-        label: broken
-        what: "Tool written in wrong format, fails to load, no recovery."
-      - score: 33
-        label: shaky
-        what: "Tool loads but is brittle or solves the wrong problem."
-      - score: 51
-        label: functional
-        what: "Tool works for the immediate need but is narrow or quirky."
-      - score: 77
-        label: solid
-        what: "Tool is well-formed, reusable, manual or doc updated."
-      - score: 102
-        label: production-grade
-        what: "Tool is general, tested, integrates cleanly with other tools and matches project conventions."
+    levels:
+      - label: weak
+        what: "Tool fails to load, is in the wrong format, or solves the wrong problem."
+      - label: typical
+        score: 53
+        what: "Tool loads and works for the immediate need but is narrow or quirky."
+      - label: strong
+        what: "Tool is well-formed, reusable, integrates cleanly, manual or doc updated."
diff --git a/debug/eval/schema.py b/debug/eval/schema.py
index 63a2396..b8979cc 100644
--- a/debug/eval/schema.py
+++ b/debug/eval/schema.py
@@ -30,7 +30,7 @@
 
 
 class EvalScores(BaseModel):
-    """Per-axis numeric scores. Open scale: values >100 are accepted."""
+    """Per-axis numeric scores. Fully open-ended scale: any non-negative integer."""
 
     task_complexity: int = Field(ge=0)
     goal_completion: int = Field(ge=0)