diff --git a/docs/agent.md b/docs/agent.md index 2860730..5a44707 100644 --- a/docs/agent.md +++ b/docs/agent.md @@ -24,21 +24,28 @@ - `DIRECT` → skip planning entirely (simple request). - A structured analysis + `REFLECT: yes/no` → continue to Phase 2 or 3. -### Phase 2 — Multi-perspective review (conditional) -Runs only when `profile.planning_reflect_enabled = True` AND Phase 1 outputs `REFLECT: yes`. -Three advisors run in parallel (via `asyncio.gather`), each receiving the full chat context and the Phase 1 analysis: -- **Critic** — what could go wrong -- **Pragmatist** — simpler/more direct path -- **Detailer** — missing requirements +### Phase 2 — Structured review (conditional) +Runs only when `planning_phase2_enabled = True` AND Phase 1 outputs `REFLECT: yes`. +One LLM call reviews the Phase 1 analysis and returns four sections: +- **Critic** — wrong assumptions, risks, contradictions, facts to verify +- **Pragmatist** — simpler path, unnecessary steps, better executor choices +- **Detailer** — missing requirements, source files/docs/tools to inspect, validation gaps +- **Plan Adjustments** — concrete changes Phase 3 must apply -Advisor feedback is embedded into the Phase 3 prompt. +The review is embedded into the Phase 3 prompt. ### Phase 3 — Execution plan -LLM produces a numbered step list. Each step is assigned an executor: +LLM produces milestones plus a numbered step list. Each step is assigned an executor: - `TOOL: tool_name` — single tool call -- `AGENT: profile_id` — delegated to a subagent via `spawn_agent` +- `AGENT: profile_id` — bounded 3+ tool-call subtask delegated to a subagent via `spawn_agent` - `SELF` — handled inline (synthesis, context-dependent action) +Plan depth is adaptive: +- simple: 1-3 steps +- medium: 5-9 steps +- complex or autonomous: 8-15 steps +- hard maximum: 15 steps + **Comma test (enforced in prompt):** if a step description lists multiple things with "and" or commas, each item must be a separate step. 
The plan is injected into `session.context` as an assistant message and saved to `session.messages` with `is_plan=True` for UI rendering. The todo list is auto-populated from the plan steps. @@ -53,7 +60,7 @@ |---|---|---| | `think_enabled` | `true` | Passes `think=True` to LLM on every main-loop call (extended reasoning) | | `iteration_budget_enabled` | `true` | Injects remaining iteration count into context so model wraps up in time | -| `planning_reflect_enabled` | `false` | Enables Phase 2 advisor review (3 parallel LLM calls, adds latency) | +| `planning_phase2_enabled` | `false` | Enables Phase 2 structured review (one extra LLM call when Phase 1 outputs `REFLECT: yes`) | | `goal_anchoring_enabled` | `true` | Injects goal-reminder system message every N iterations | | `goal_anchoring_interval` | `5` | N for goal anchoring | | `anti_stall_enabled` | `true` | Detects looping without todo progress and injects a warning | diff --git a/docs/profiles.md b/docs/profiles.md index b31d62c..910901b 100644 --- a/docs/profiles.md +++ b/docs/profiles.md @@ -57,15 +57,15 @@ The planning pipeline runs before the main tool-calling loop and produces a structured execution plan. It has three phases: - **Phase 1 — Analysis**: reformulates the task, identifies subtasks and unknowns. Can output `DIRECT` to skip to execution immediately. -- **Phase 2 — Multi-advisor review**: three independent LLM advisors (Critic / Pragmatist / Detailer) critique the Phase 1 analysis. Adds ~3 LLM calls. -- **Phase 3 — Execution plan**: assigns each subtask to `TOOL / AGENT / SELF`. +- **Phase 2 — Structured review**: one LLM call critiques the Phase 1 analysis through Critic / Pragmatist / Detailer sections and emits Plan Adjustments. Runs only when Phase 1 signals `REFLECT: yes`. +- **Phase 3 — Execution plan**: assigns each subtask to `TOOL / AGENT / SELF` and uses adaptive plan depth. 
| Key | Type | Default | Description | |---|---|---|---| | `planning_enabled` | bool | `false` | Run the planning pipeline on every user message (not just the first). First-message planning always runs regardless of this flag. | | `planning_mandatory` | bool | `false` | `true` — the `DIRECT` shortcut is never offered to the model; all three phases always run. `false` — the model can output `DIRECT` in Phase 1 to skip straight to execution. First-message planning is always forced regardless of this flag. | | `planning_phase1_enabled` | bool | `true` | Enable Phase 1 (task analysis). When disabled, Phase 3 runs without analysis context. | -| `planning_phase2_enabled` | bool | `false` | Enable Phase 2 (multi-advisor review). Only runs when Phase 1 signals `REFLECT: yes`. | +| `planning_phase2_enabled` | bool | `false` | Enable Phase 2 structured review. Adds one LLM call, and only when Phase 1 signals `REFLECT: yes`; it never runs for sub-agents. | | `planning_phase3_enabled` | bool | `true` | Enable Phase 3 (structured execution plan). When disabled, only Phase 1 (analysis) runs. 
| ### Sub-agent planning diff --git a/navi/core/agent.py b/navi/core/agent.py index 05b1160..194b789 100644 --- a/navi/core/agent.py +++ b/navi/core/agent.py @@ -154,12 +154,17 @@ def _parse_plan_steps(plan_text: str) -> list[str]: """Extract numbered step lines from the **Steps:** section of a plan.""" import re - m = re.search(r'\*\*Steps:\*\*\s*\n(.*?)(?=\n\s*\*\*[A-Z]|\Z)', plan_text, re.DOTALL) + m = re.search(r'\*\*Steps:\*\*\s*\n(.*?)(?=\n\s*\*\*[^*\n]+:\*\*|\Z)', plan_text, re.DOTALL) if not m: return [] steps_block = m.group(1) - steps = re.findall(r'^\s*\d+[\.\)]\s*(.+)', steps_block, re.MULTILINE) - return [s.strip() for s in steps if s.strip()] + steps: list[str] = [] + for raw in re.findall(r'^\s*\d+[\.\)]\s*(.+)', steps_block, re.MULTILINE): + step = raw.strip() + if not step or step.startswith("["): + continue + steps.append(step) + return steps log = structlog.get_logger() @@ -932,10 +937,10 @@ identify subtasks and unknowns. Outputs DIRECT for simple requests. Outputs Reflect: yes/no to signal whether multi-perspective review is warranted. - Phase 2 — Multi-perspective review (conditional, think=False, parallel): - Three advisors (Critic / Pragmatist / Detailer) independently - critique the draft analysis. Runs only when - profile.planning_reflect_enabled=True AND phase 1 outputs Reflect: yes. + Phase 2 — Structured review (conditional, think=False): one critique pass + with Critic / Pragmatist / Detailer / Plan Adjustments sections. + Runs only when planning_phase2_enabled=True AND phase 1 outputs + Reflect: yes. Phase 3 — Execution plan (think=False): assigns each subtask to TOOL / AGENT / SELF. If phase 2 ran, advisor feedback is embedded in the prompt. 
@@ -999,6 +1004,7 @@ "RESOURCES:\n" "- [tool_name]: [what it does] — [limitation if any] — [alternative if limitation blocks the goal]\n" "- context sources: [which of memory / NAVI.md / web you will check and why]\n" + "COMPLEXITY: simple | medium | complex — choose based on ambiguity, number of files/systems, risk, and autonomy needed.\n" "SUBTASKS:\n" "1. [discrete unit of work]\n" "2. [discrete unit of work]\n" @@ -1009,7 +1015,9 @@ "research required, or high-stakes/irreversible actions); " "no — if it is straightforward and the path is clear.\n" "COMMITMENTS: [follow the plan step by step using the todo tool; gather any missing context independently without asking the user]\n\n" - "Rules: maximum 6 subtasks. Each must be concrete and actionable. " + "Rules: list enough subtasks to make execution unambiguous. " + "Simple tasks usually need 1-3 subtasks; medium tasks 5-9; complex or autonomous tasks 8-15. " + "Hard maximum: 15 subtasks. Each must be concrete and actionable. " "No execution yet — analysis only." ), ) @@ -1062,88 +1070,67 @@ else: log.debug("agent.planning_phase1_skipped") - # ── Phase 2: Multi-perspective review (conditional) ──────────────────── + # ── Phase 2: Structured review (conditional) ─────────────────────────── # Runs only when planning_phase2_enabled=True AND phase 1 signals - # that the task is complex enough to warrant independent critique. + # that the task is complex enough to warrant critique. advisor_feedback: str = "" needs_reflect = bool(_re.search(r"REFLECT\s*:\s*yes", analysis, _re.IGNORECASE)) if profile.planning_phase2_enabled and needs_reflect and not is_subagent: - yield PlanningStatus(phase=2, label="Consulting advisors...", is_subagent=is_subagent) + yield PlanningStatus(phase=2, label="Reviewing plan...", is_subagent=is_subagent) - _ADVISORS = [ - ( - "Critic", - "You are the Critic advisor. Your role: identify what could go wrong with this plan. 
" - "Look for untested assumptions, overlooked risks, missing error handling, and steps that " - "might fail silently. Be specific — name the exact step and the exact risk. " - "Do NOT suggest changing the user's goal. Critique the plan, not the objective.", + review_system = Message( + role="system", + content=( + _base_sys + + "\n\n---\n\n" + "[PLANNING - PHASE 2: STRUCTURED REVIEW]\n\n" + "Review the phase 1 task analysis before execution. " + "Do not change the user's goal. Do not invent facts. " + "Prefer resolving missing information through NAVI.md, docs, manuals, memory, files, " + "tool schemas, command output, or web research before asking the user.\n\n" + "Return exactly these sections:\n\n" + "## Critic\n" + "- 3-5 bullets: wrong or unverified assumptions, ignored risks, contradictions, " + "and facts that must be verified before acting.\n\n" + "## Pragmatist\n" + "- 3-5 bullets: simpler path, unnecessary steps, mergeable steps, better executor choices, " + "and cheaper ways to reach the user's actual goal.\n\n" + "## Detailer\n" + "- 3-5 bullets: missing requirements, missing docs/files/tools to inspect, edge cases, " + "and validation steps.\n\n" + "## Plan Adjustments\n" + "- Concrete changes Phase 3 must apply: add/remove/split/merge/reorder steps, " + "change TOOL/AGENT/SELF executor, verify specific facts, or defer user questions " + "until available sources are checked.\n\n" + "Keep output concise. No prose outside these sections.\n\n" + f"PHASE 1 ANALYSIS:\n{analysis}" ), - ( - "Pragmatist", - "You are the Pragmatist advisor. Your role: find a simpler, more direct path. " - "Identify steps that are unnecessary, can be merged, or parallelised. " - "Flag steps where the chosen executor (TOOL/AGENT/SELF) is suboptimal. " - "Do NOT suggest changing the user's goal. Improve efficiency, not direction.", - ), - ( - "Detailer", - "You are the Detailer advisor. Your role: find what is missing. 
" - "Identify prerequisites not listed, edge cases unaddressed, outputs not specified, " - "and validation steps absent. Be concrete — add what is needed, do not restate what is there. " - "Do NOT suggest changing the user's goal. Complete the plan, do not redirect it.", - ), - ] - - async def _run_advisor(name: str, role_prompt: str) -> tuple[str, str, int, int]: - adv_system = Message( - role="system", - content=( - _base_sys - + "\n\n---\n\n" - f"[PLANNING — PHASE 2: ADVISOR — {name.upper()}]\n\n" - + role_prompt - + "\n\n---\n\n" - "The task analysis below was produced in phase 1. " - "Review it and provide your concise critique (3–7 bullet points max). " - "Speak directly to the plan — no preamble, no conclusion.\n\n" - f"PHASE 1 ANALYSIS:\n{analysis}" - ), - ) - # Full chat context so advisors have complete picture of the conversation - adv_ctx: list[Message] = [adv_system] - if mem: - adv_ctx.append(mem) - adv_ctx.extend(m for m in context if m.role != "system") - try: - r = await asyncio.wait_for( - llm.complete(adv_ctx, tools=None, temperature=0.4, model=profile.model, think=False), - timeout=settings.llm_complete_timeout, - ) - return name, (r.content or "").strip(), r.prompt_tokens or 0, r.completion_tokens or 0 - except Exception: - log.warning("agent.planning_advisor_failed", advisor=name, exc_info=True) - return name, "", 0, 0 - + ) + review_ctx: list[Message] = [review_system] + if mem: + review_ctx.append(mem) + review_ctx.extend(m for m in context if m.role != "system") try: - advisor_results = await asyncio.gather(*[ - _run_advisor(name, prompt) for name, prompt in _ADVISORS - ]) + r_review = await asyncio.wait_for( + llm.complete(review_ctx, tools=None, temperature=0.35, model=profile.model, think=False), + timeout=settings.llm_complete_timeout, + ) + advisor_feedback = (r_review.content or "").strip() + if r_review.prompt_tokens or r_review.completion_tokens: + yield AIHelperTokensUsed( + prompt_tokens=r_review.prompt_tokens or 0, + 
completion_tokens=r_review.completion_tokens or 0, + ) + _dbg["phases"]["2"] = { + "output": advisor_feedback, + "prompt_tokens": r_review.prompt_tokens or 0, + "completion_tokens": r_review.completion_tokens or 0, + } + log.debug("agent.planning_review_done", has_output=bool(advisor_feedback)) except Exception: - log.warning("agent.planning_reflect_failed", exc_info=True) - advisor_results = [] - - _dbg["phases"]["2"] = {} - feedback_parts: list[str] = [] - for name, output, pt, ct in advisor_results: - if output: - feedback_parts.append(f"### {name}\n{output}") - yield AIHelperTokensUsed(prompt_tokens=pt, completion_tokens=ct) - _dbg["phases"]["2"][name] = {"output": output, "prompt_tokens": pt, "completion_tokens": ct} - - if feedback_parts: - advisor_feedback = "\n\n".join(feedback_parts) - log.debug("agent.planning_reflect_done", advisors=len(feedback_parts)) + log.warning("agent.planning_review_failed", exc_info=True) + _dbg["phases"]["2"] = {"output": "", "prompt_tokens": 0, "completion_tokens": 0} if _stop and _stop.is_set(): log.debug("agent.planning_stopped", phase=2) @@ -1160,7 +1147,7 @@ yield PlanningStatus(phase=3, label="Building execution plan...", is_subagent=is_subagent) advisor_block = ( - "Advisor feedback from multi-perspective review — address these points in your plan:\n\n" + "Structured review feedback — apply the Plan Adjustments in your plan:\n\n" + advisor_feedback + "\n\n---\n\n" ) if advisor_feedback else "" @@ -1178,8 +1165,16 @@ + available_tools_block + "Now write the execution plan. 
For each subtask assign a specific executor:\n" "- TOOL: — a single tool call is enough; use exact tool names from the list above\n" - "- AGENT: — needs 2+ tool calls; one subagent handles this ONE step\n" + "- AGENT: — a bounded subtask needing 3+ tool calls; one subagent handles this ONE step\n" "- SELF — final synthesis or a context-dependent single action only\n\n" + "Plan depth:\n" + "- simple: 1-3 steps\n" + "- medium: 5-9 steps\n" + "- complex or autonomous: 8-15 steps\n" + "- hard maximum: 15 steps\n" + "Use enough steps to make execution unambiguous. Do not compress unrelated actions into one step.\n\n" + "For every non-trivial task, include steps for information gathering from project notes/docs/files/tool schemas, " + "implementation or analysis, verification, final synthesis, and NAVI.md updates when stable reusable project facts are discovered.\n\n" "AGENT scoping rules (critical):\n" "- Each AGENT step is one focused, independently verifiable unit of work.\n" "- One AGENT step = one spawn_agent call later. Do NOT bundle multiple concerns.\n" @@ -1191,13 +1186,20 @@ "## Plan\n\n" "**Task:** [reformulated task]\n" "**Goal:** [success criterion]\n\n" + "**Milestones:**\n" + "A. [strategic phase]\n" + "B. [strategic phase]\n" + "C. [strategic phase]\n\n" "**Steps:**\n" "1. [description] → TOOL: tool_name\n" "2. [description] → AGENT: profile_id\n" "3. [description] → AGENT: profile_id\n" - "4. [description] → SELF\n\n" + "4. [description] → SELF\n" + "... continue to the needed depth, up to 15 steps\n\n" "**Parallel:** [step numbers that can run simultaneously, or NONE]\n" "**Risks:** [unknowns to watch for, or NONE]\n\n" + "Reject vague steps such as 'research and implement everything', 'fix all issues', " + "or 'analyze project and make changes'. Split them into concrete, verifiable units. " "Do not write prose. Do not start executing. Plan only." 
), ) diff --git a/navi/profiles/base.py b/navi/profiles/base.py index 581dfc5..8e1553d 100644 --- a/navi/profiles/base.py +++ b/navi/profiles/base.py @@ -57,8 +57,8 @@ # that don't need them. # Phase 1: task analysis (TASK/GOAL/UNKNOWNS). Entry point for the pipeline. planning_phase1_enabled: bool = True - # Phase 2: multi-perspective review (Critic/Pragmatist/Detailer advisors). - # Adds ~3 LLM calls; only useful for complex multi-step tasks. + # Phase 2: structured review (Critic/Pragmatist/Detailer + Plan Adjustments). + # Adds 1 LLM call only when Phase 1 outputs REFLECT: yes. planning_phase2_enabled: bool = False # Phase 3: structured execution plan (numbered steps with TOOL/AGENT/SELF). planning_phase3_enabled: bool = True diff --git a/navi/profiles/developer/config.json b/navi/profiles/developer/config.json index b7e9963..564c8b9 100644 --- a/navi/profiles/developer/config.json +++ b/navi/profiles/developer/config.json @@ -60,7 +60,7 @@ ], "planning_mandatory": false, "planning_phase1_enabled": true, - "planning_phase2_enabled": false, + "planning_phase2_enabled": true, "planning_phase3_enabled": true, "top_k": 40, "top_p": 0.88 diff --git a/navi/profiles/secretary/config.json b/navi/profiles/secretary/config.json index e6bf32e..06cfdc1 100644 --- a/navi/profiles/secretary/config.json +++ b/navi/profiles/secretary/config.json @@ -58,7 +58,7 @@ ], "planning_mandatory": false, "planning_phase1_enabled": true, - "planning_phase2_enabled": false, + "planning_phase2_enabled": true, "planning_phase3_enabled": true, "top_k": 50, "top_p": 0.90 diff --git a/navi/profiles/server_admin/config.json b/navi/profiles/server_admin/config.json index 863a422..11e319b 100644 --- a/navi/profiles/server_admin/config.json +++ b/navi/profiles/server_admin/config.json @@ -60,7 +60,7 @@ ], "planning_mandatory": false, "planning_phase1_enabled": true, - "planning_phase2_enabled": false, + "planning_phase2_enabled": true, "planning_phase3_enabled": true, "top_k": 30, "top_p": 0.80 
diff --git a/navi/profiles/tool_developer/config.json b/navi/profiles/tool_developer/config.json index 3cfa3ca..7a92305 100644 --- a/navi/profiles/tool_developer/config.json +++ b/navi/profiles/tool_developer/config.json @@ -69,7 +69,7 @@ ], "planning_mandatory": false, "planning_phase1_enabled": true, - "planning_phase2_enabled": false, + "planning_phase2_enabled": true, "planning_phase3_enabled": true, "top_k": 40, "top_p": 0.85