navi-1/debug/eval/prompts/rubric_v1.yaml at e4771277847c3da2f64395883c6f65adfdf137fb

Fork: 0
root / navi-1
Find file
Newer
Older
navi-1 / debug / eval / prompts / rubric_v1.yaml
Eugene Sukhodolskiy on 26 Apr 6 KB Add eval system Phase 2 — rubric, expert prompts, judge skeleton
Raw Blame History
# Rubric v1 — frozen reference for the LLM-as-judge eval system.
#
# Axes are scored on an open 0..100+ scale. The 100 anchor describes "at the
# limit of what Navi can do today"; if the judge sees something clearly harder
# or better, it scores above 100 and that becomes a future anchor.
#
# Anchor `examples` are placeholders. After accumulating real sessions, fill
# them with actual session_ids and short notes — do not change the scale
# language without bumping the rubric_version.
#
# Conventions:
#   - Each axis is independent. Don't let "complexity" pull "communication"
#     down or up.
#   - `task_complexity` is judged from the user's request alone, before the
#     transcript of the response is considered. The schema.py validator
#     enforces this scoring order.
#   - `subagent_orchestration` and `self_extension` may be null when the
#     session never used those mechanics. Do not invent zeros.

# Rubric version identifier. Quoted so it is always read as a string. The
# header note asks for a version bump whenever the scale language changes.
version: "v1"

# One entry per scoring axis. Every axis provides a `description` and a list
# of `anchors` (score / label / what); some axes also set `nullable: true`.
axes:
  # task_complexity — how hard the request itself is.
  # Scored from the user's request alone, independent of how Navi handled it
  # or how the session turned out.
  task_complexity:
    description: >
      Difficulty of what the user asked, judged from the user's request
      alone (not from how Navi handled it). Independent of outcome.
    # Reference points on the scale, listed from lowest to highest score.
    # Every `examples` list is an empty placeholder for now.
    anchors:
      # Lower half of the scale: little or no planning needed.
      - score: 10
        label: trivial
        what: "Single fact, single tool, no planning, instant answer."
        examples: []
      - score: 30
        label: straightforward
        what: "One tool, one or two steps, no real branching, expected answer is obvious."
        examples: []
      # Mid scale: planning starts to matter.
      - score: 50
        label: moderate
        what: "2–4 steps, planning helps, mild ambiguity in the request."
        examples: []
      - score: 75
        label: complex
        what: "Multi-tool with planning, real ambiguity to resolve, several places where it could fail."
        examples: []
      # Top anchor: project-shaped work.
      - score: 100
        label: at-the-limit
        what: "Full project-shaped task — multiple sub-agents, self-extension via write_tool, long-horizon execution."
        examples: []

  # goal_completion — outcome axis: did the user end up with what they
  # wanted. Decided from the user's reactions and the final assistant
  # response, not from the path taken.
  goal_completion:
    description: >
      Did the user end up with what they wanted, regardless of the path taken.
      Read the user's reactions and the final assistant response to decide.
    # Anchors run from lowest to highest score. Each one carries an empty
    # `examples` placeholder, the same shape `task_complexity` anchors use,
    # so real session references can be filled in later.
    anchors:
      - score: 10
        label: missed
        what: "User did not get what they asked for; gave up or had to redirect entirely."
        examples: []
      - score: 30
        label: partial-but-wrong-direction
        what: "Some output was produced but it doesn't really answer the request."
        examples: []
      - score: 50
        label: partial
        what: "Half the request was met; user had to fill in or correct the rest."
        examples: []
      - score: 75
        label: solid
        what: "Goal met with minor gaps a user can live with."
        examples: []
      - score: 100
        label: clean-completion
        what: "Goal fully met, including caveats the user didn't have to ask about."
        examples: []

  # tool_usage_quality — were the right tools chosen, and were calls wasted
  # (thrashing, redundant or repeated lookups).
  tool_usage_quality:
    description: >
      Were the right tools chosen? Was there thrashing, redundant calls,
      reading the same file twice, calling search when a local source had it?
    # Anchors run from lowest to highest score. Each one carries an empty
    # `examples` placeholder, the same shape `task_complexity` anchors use,
    # so real session references can be filled in later.
    anchors:
      - score: 10
        label: chaotic
        what: "Wrong tools picked, repeated identical calls, no recovery from errors."
        examples: []
      - score: 30
        label: clumsy
        what: "Right idea but with several detours, redundant lookups."
        examples: []
      - score: 50
        label: workable
        what: "Tools were appropriate; one or two avoidable calls."
        examples: []
      - score: 75
        label: deliberate
        what: "Tool choices match the task, errors handled cleanly, no obvious waste."
        examples: []
      - score: 100
        label: surgical
        what: "Minimal sufficient toolset, each call has a clear purpose, results reused well."
        examples: []

  # efficiency — iterations spent versus result obtained. Loops, dead-ends
  # and repeated work score low; short direct paths score high.
  efficiency:
    description: >
      Iterations vs result. Penalize loops, dead-ends, and re-doing work that
      was already done. Reward straight lines that finish in fewer steps.
    # Anchors run from lowest to highest score. Each one carries an empty
    # `examples` placeholder, the same shape `task_complexity` anchors use,
    # so real session references can be filled in later.
    anchors:
      - score: 10
        label: thrashing
        what: "Loops, runs out of iteration budget, never converges."
        examples: []
      - score: 30
        label: wandering
        what: "Reaches the goal but with detours and several aborted attempts."
        examples: []
      - score: 50
        label: acceptable
        what: "Linear path with minor stalls."
        examples: []
      - score: 75
        label: tight
        what: "Few wasted moves; planning anticipated most of the work."
        examples: []
      - score: 100
        label: ideal
        what: "Shortest reasonable path to the result, no slack."
        examples: []

  # communication — clarity and honesty of the replies. Hallucination, false
  # confidence and padding score low; direct, accurate answers with stated
  # uncertainty score high.
  communication:
    description: >
      Clarity and honesty of replies. Penalize hallucinations, false
      confidence, and excessive verbosity. Reward direct answers, stated
      uncertainties, and accurate self-reports.
    # Anchors run from lowest to highest score. Each one carries an empty
    # `examples` placeholder, the same shape `task_complexity` anchors use,
    # so real session references can be filled in later.
    anchors:
      - score: 10
        label: misleading
        what: "Hallucinations, claims work was done that wasn't, walls of filler."
        examples: []
      - score: 30
        label: noisy
        what: "Padded replies, occasional inaccuracies, reader has to dig."
        examples: []
      - score: 50
        label: serviceable
        what: "Conveys the answer; some unnecessary text, no major errors."
        examples: []
      - score: 75
        label: clear
        what: "Direct, accurate, appropriately brief; flags genuine uncertainties."
        examples: []
      - score: 100
        label: exemplary
        what: "Tight, honest, anticipates reader questions, no fluff."
        examples: []

  # subagent_orchestration — quality of delegation when spawn_agent is used.
  subagent_orchestration:
    description: >
      Quality of sub-agent delegation when spawn_agent is used. Score this
      null if no sub-agents were spawned — do not punish absence.
    # This axis may be scored null (no sub-agents spawned) instead of a number.
    nullable: true
    # Anchors run from lowest to highest score. Each one carries an empty
    # `examples` placeholder, the same shape `task_complexity` anchors use,
    # so real session references can be filled in later.
    anchors:
      - score: 10
        label: misuse
        what: "Sub-agent given a vague prompt, returns junk, parent ignores or duplicates the work."
        examples: []
      - score: 30
        label: rough
        what: "Sub-agent did most of the work but parent had to re-do or heavily edit."
        examples: []
      - score: 50
        label: workable
        what: "Sub-agent helped; the delegation paid off but the prompt wasn't clean."
        examples: []
      - score: 75
        label: well-scoped
        what: "Clear sub-task, clear hand-off, parent uses the result without rework."
        examples: []
      - score: 100
        label: textbook
        what: "Sub-agent saved real work; output integrated cleanly; no overlap with the parent."
        examples: []

  # self_extension — quality of write_tool / reload_tools / delete_tool usage.
  self_extension:
    description: >
      Quality of write_tool / reload_tools / delete_tool usage. Score null if
      Navi did not modify her own tooling in this session.
    # This axis may be scored null (no tooling changes) instead of a number.
    nullable: true
    # Anchors run from lowest to highest score. Each one carries an empty
    # `examples` placeholder, the same shape `task_complexity` anchors use,
    # so real session references can be filled in later.
    anchors:
      - score: 10
        label: broken
        what: "Tool written in wrong format, fails to load, no recovery."
        examples: []
      - score: 30
        label: shaky
        what: "Tool loads but is brittle or solves the wrong problem."
        examples: []
      - score: 50
        label: functional
        what: "Tool works for the immediate need but is narrow or quirky."
        examples: []
      - score: 75
        label: solid
        what: "Tool is well-formed, reusable, manual or doc updated."
        examples: []
      - score: 100
        label: production-grade
        what: "Tool is general, tested, integrates cleanly with other tools and matches project conventions."
        examples: []