diff --git a/docs/config.md b/docs/config.md index 73ffb71..cca852a 100644 --- a/docs/config.md +++ b/docs/config.md @@ -133,7 +133,7 @@ | `CONTEXT_COMPRESSION_THRESHOLD` | float | `0.70` | Trigger compression at this fraction of `OLLAMA_NUM_CTX` | | `CONTEXT_KEEP_RECENT` | int | `8` | Number of recent conversation turns to keep verbatim | | `CONTEXT_SUMMARY_TEMPERATURE` | float | `0.3` | Temperature for the summarization LLM call | -| `CONTEXT_SUMMARY_MAX_TOKENS` | int | `3000` | Max output tokens for the summary LLM call | +| `CONTEXT_SUMMARY_MAX_TOKENS` | int | `4000` | Max output tokens for the summary LLM call | | `OUTPUT_RESERVE_TOKENS` | int | `2048` | Headroom reserved for model response in context size checks | ## Gmail diff --git a/navi/config.py b/navi/config.py index 53ee5f9..53c2411 100644 --- a/navi/config.py +++ b/navi/config.py @@ -122,7 +122,7 @@ context_compression_threshold: float = 0.70 # trigger at 70% of ollama_num_ctx context_keep_recent: int = 8 # conversational turns to keep verbatim context_summary_temperature: float = 0.3 - context_summary_max_tokens: int = 3000 # max output tokens for the summary LLM call + context_summary_max_tokens: int = 4000 # max output tokens for the summary LLM call output_reserve_tokens: int = 2048 # headroom reserved for model response in context checks # Global personality prompt prepended to every agent's system prompt. diff --git a/navi/core/agent.py b/navi/core/agent.py index 12090d8..978e67d 100644 --- a/navi/core/agent.py +++ b/navi/core/agent.py @@ -178,6 +178,11 @@ mcp_manager=mcp_manager, ) + def _set_active_profile(self, profile) -> None: + """Update cached profile and propagate to subsystems that depend on it.""" + self._compressor.set_profile(profile) + self._subagent.set_profile(profile) + # ------------------------------------------------------------------ # Public interface # ------------------------------------------------------------------ @@ -510,6 +515,7 @@ if fresh and fresh.profile_id != session.profile_id: session.profile_id = fresh.profile_id profile = self._profiles.get(session.profile_id) + self._set_active_profile(profile) tools = self._tool_list(profile.get_agent_tools()) tool_schemas = [t.schema() for t in tools] llm = self._get_backend(profile.llm_backend) diff --git a/navi/core/compressor.py b/navi/core/compressor.py index bcffeaa..03a9a12 100644 --- a/navi/core/compressor.py +++ b/navi/core/compressor.py @@ -3,63 +3,107 @@ Flow: 1. Partition session messages into "to_summarize" (old turns) and "to_keep" (recent turns). -2. Call the LLM to produce a concise bullet-point summary of the old turns. -3. Replace the old turns with a single summary message (role=user, is_summary=True). +2. Call the LLM to produce a structured summary of the old messages. +3. Replace the old turns with a structured summary message (role=user, is_summary=True). A "turn" is one user message plus all following assistant/tool messages up to the next user message. Tool call groups (assistant + tool results) are never split. Existing summary messages are always folded into the next compression pass. + +Compression is profile-aware: AgentProfile can provide compression_keep_recent, +compression_max_tokens, and a compression_prompt_file to specialize summaries. """ import json +import re from datetime import datetime, timezone +from pathlib import Path +from typing import TYPE_CHECKING from navi.llm.base import LLMBackend, Message from navi.config import settings from .events import ContextCompressed -_SUMMARIZE_SYSTEM = ( +if TYPE_CHECKING: + from navi.profiles.base import AgentProfile + + +_SUMMARY_SECTIONS = [ + ("Goal", "One clear sentence describing what the user is trying to accomplish in this session. Include deadlines or constraints if stated."), + ("Active Files", "Every file or directory the assistant touched, with absolute or project-relative path and status: created / modified / read / deleted. For modified files, note the purpose of the change."), + ("Decisions & User Preferences", "Explicit choices, architecture decisions, style preferences, or corrections stated by the user. Things the user said NOT to do."), + ("Completed Work", "Concrete finished steps — include file/function names and verification outcome if available."), + ("Pending Work / Todo", "Open tasks, in-progress items, or follow-ups that still need action."), + ("Errors & Blockers", "Failures, exceptions, or unresolved issues. Include exact error snippets when short and diagnostic."), + ("Key Values", "Exact constants the assistant should remember: ports, config keys, versions, dependency names, important paths, IDs."), +] + +_SUMMARY_TEMPLATE_INSTRUCTIONS = ( "You are summarizing a conversation history to free up context space. " "The assistant will continue working using ONLY this summary — it will have no access " "to the original messages. Be thorough and precise. Prefer specifics over generalities. " "This summary is historical context, not a new user request.\n\n" - "## Current goal\n" - "What the user is trying to accomplish in this session. Include any stated deadlines, " - "constraints, or acceptance criteria.\n\n" - "## Work state\n" - "What has been completed (be specific — name files, functions, endpoints, steps). " - "What is still in progress or pending. Any blockers or open questions.\n\n" - "## Key facts\n" - "Everything the user told the assistant: preferences, system details, environment info, " - "decisions made, constraints discovered, explicit instructions. Include exact values " - "(port numbers, file paths, config keys, IDs) — do not paraphrase if precision matters.\n\n" - "## Outputs\n" - "Every file created or modified (full paths). Every config value set. " - "Commands run and their outcome. Preserve exact command output only when it is short " - "and needed to prove a result or diagnose an error. " - "Do not preserve tool-call-like examples with parenthesized arguments; describe tool " - "usage in words or as key/value facts instead.\n\n" - "## Errors\n" - "Failures, exceptions, or unexpected results encountered. " - "How each was resolved — or that it remains unresolved.\n\n" - "## User preferences and feedback\n" - "Corrections the user made to the assistant's approach. Explicit style or behavior " - "preferences stated during this session. Things the user said not to do.\n\n" - "Do not include greetings, filler, transitions, or meta-commentary about the summary itself. " - "Do not use Markdown code fences or inline-code backticks unless preserving a user-authored " - "literal value is essential. " - "Write in tight prose or bullet points — whatever preserves more information per token." + "Use EXACTLY the Markdown structure below. Every section must be present. " + "If a section has no relevant information, write its header and the literal word NONE. " + "Keep bullet points tight and information-dense. " + "Do not include greetings, filler, transitions, or meta-commentary.\n\n" + + "\n\n".join(f"## {title}\n{desc}" for title, desc in _SUMMARY_SECTIONS) + + "\n\n" + "Output rules:\n" + "- Preserve exact file paths, function names, config keys, and short error snippets verbatim.\n" + "- Do not paraphrase values that must stay precise.\n" + "- Do not write implementation code, patches, or long command output.\n" + "- Use Markdown headers exactly as shown." ) +# Tools whose full output is often needed later and should not be aggressively truncated. +_CRITICAL_TOOL_NAMES = frozenset({ + "filesystem", + "code_exec", + "terminal", + "ssh_exec", +}) + + +# Content markers that make a turn worth preserving verbatim longer. +_CRITICAL_PATTERNS = [ + re.compile(r"\b(error|exception|traceback|failed|failure)\b", re.IGNORECASE), + re.compile(r"\b(user\s+said|no,\s+|don't\s+|do\s+not\s+|never\s+|instead\s+|wrong\s+|incorrect\s+|fix\s+|correct\s+)\b", re.IGNORECASE), + re.compile(r"\b(edited|modified|created|deleted|wrote|added)\s+(file|function|class|method)\b", re.IGNORECASE), +] + + def should_compress(context_tokens: int, max_context_tokens: int, threshold: float) -> bool: return context_tokens >= int(max_context_tokens * threshold) +def _turn_importance(turn: list[Message]) -> int: + """Score a turn for adaptive keep_recent. Higher = more important to keep.""" + score = 0 + text = "\n".join((m.content or "") for m in turn) + lowered = text.lower() + # Strong signals: user corrections and explicit negatives + if any(w in lowered for w in ("wrong", "incorrect", "fix", "don't use", "do not use", "instead use", "change to")): + score += 3 + for pattern in _CRITICAL_PATTERNS: + score += len(pattern.findall(text)) + for m in turn: + if getattr(m, "is_compression_critical", False): + score += 3 + if m.role == "tool" and m.name in _CRITICAL_TOOL_NAMES: + score += 1 + if m.role == "user" and len((m.content or "").strip()) <= 20: + # Very short user messages are usually social/filler; deprioritize + score -= 2 + return max(0, score) + + def partition_messages( messages: list[Message], keep_recent: int, keep_recent_messages: int | None = None, + adaptive: bool = True, ) -> tuple[list[Message], list[Message]]: """ Returns (to_summarize, to_keep). @@ -67,6 +111,8 @@ Keeps the system message and the last `keep_recent` conversational turns verbatim. Everything older goes into to_summarize. Tool call groups (assistant + tool results) always stay together. + When adaptive=True, important turns (user corrections, errors, critical tools) + are kept longer and social/filler turns are compressed sooner. """ non_system = [m for m in messages if m.role != "system"] @@ -89,8 +135,24 @@ return intra_turn return [], non_system # nothing old enough to compress - old_turns = turns[:-keep_recent] - recent_turns = turns[-keep_recent:] + # Adaptive: pull important older turns into the kept region and push + # unimportant recent/filler turns out for summarization. + base_keep = keep_recent + recent_turns = turns[-base_keep:] + old_turns = turns[:-base_keep] + if adaptive: + # Identify important old turns that should not be lost. + important_old = [t for t in old_turns if _turn_importance(t) > 0] + # Identify filler turns in the recent window that can be swapped out. + filler_recent = [t for t in recent_turns if _turn_importance(t) == 0] + swaps = min(len(important_old), len(filler_recent)) + for i in range(swaps): + # Replace the oldest filler in recent with the most important old turn. + recent_turns[recent_turns.index(filler_recent[i])] = important_old[-(i + 1)] + # Re-sort kept turns by original position so context order stays chronological. + kept_set = {id(t) for t in recent_turns} + recent_turns = [t for t in turns if id(t) in kept_set] + old_turns = [t for t in turns if id(t) not in kept_set] to_summarize = [m for turn in old_turns for m in turn] to_keep = [m for turn in recent_turns for m in turn] @@ -167,9 +229,19 @@ lines.append(f"[Tool call: {tc.name}; arguments preview: {args_preview}]") i += 1 while i < len(messages) and messages[i].role == "tool": - result = messages[i].content or "" - preview = result[:300] + ("…" if len(result) > 300 else "") - lines.append(f"[Tool result: {messages[i].name}; preview: {preview}]") + tool_msg = messages[i] + result = tool_msg.content or "" + # Critical tool results and explicit critical flag survive verbatim + # up to a larger budget, so exact errors/file contents are preserved. + critical = ( + getattr(tool_msg, "is_compression_critical", False) + or tool_msg.name in _CRITICAL_TOOL_NAMES + ) + if critical and len(result) <= 4000: + preview = result + else: + preview = result[:300] + ("…" if len(result) > 300 else "") + lines.append(f"[Tool result: {tool_msg.name}; preview: {preview}]") i += 1 elif m.role == "assistant" and m.content: @@ -229,6 +301,24 @@ ) +def _build_summary_system_prompt(profile: "AgentProfile | None") -> str: + """Build the system prompt used by the summarization LLM. + + Uses the profile-specific compression prompt file if configured. + """ + base = _SUMMARY_TEMPLATE_INSTRUCTIONS + extra = "" + if profile is not None and getattr(profile, "compression_prompt_file", None): + profile_dir = Path("navi/profiles") / profile.id + prompt_path = profile_dir / profile.compression_prompt_file + if prompt_path.exists(): + try: + extra = "\n\n---\n\n[Profile-specific compression instructions]\n\n" + prompt_path.read_text(encoding="utf-8").strip() + except Exception: + pass + return base + extra + + async def compress_context( context: list[Message], llm: LLMBackend, @@ -237,6 +327,7 @@ keep_recent: int, max_tokens: int | None = None, keep_recent_messages: int | None = None, + profile: "AgentProfile | None" = None, ) -> tuple[list[Message], str] | None: """ Summarize old messages in the LLM context and return a shorter context list. @@ -250,12 +341,20 @@ Uses the same model already loaded in memory (profile.model passed via WorkerContext) — no model swap, no extra loading overhead. + Profile settings override global defaults when provided: + - compression_keep_recent -> keep_recent + - compression_max_tokens -> max_tokens + - compression_prompt_file -> appended to summary system prompt + Exceptions propagate to the caller (CompressionWorker catches them). """ + effective_keep_recent = getattr(profile, "compression_keep_recent", None) or keep_recent + effective_max_tokens = getattr(profile, "compression_max_tokens", None) or max_tokens + system_msgs = [m for m in context if m.role == "system"] to_summarize, to_keep = partition_messages( context, - keep_recent, + effective_keep_recent, keep_recent_messages=keep_recent_messages, ) @@ -265,7 +364,7 @@ if len(to_summarize) < 2 and keep_recent_messages is not None and keep_recent_messages > 2: to_summarize, to_keep = partition_messages( context, - keep_recent, + effective_keep_recent, keep_recent_messages=2, ) @@ -294,14 +393,15 @@ if len(summary_text_input) > _MAX_SUMMARY_INPUT_CHARS: summary_text_input = summary_text_input[:_MAX_SUMMARY_INPUT_CHARS] + "\n…[truncated]" + system_prompt = _build_summary_system_prompt(profile) prompt = [ - Message(role="system", content=_SUMMARIZE_SYSTEM), + Message(role="system", content=system_prompt), Message(role="user", content=summary_text_input, images=images or None), ] # think=False: compression must be fast — extended reasoning wastes context and hangs response = await llm.complete( - prompt, tools=None, temperature=temperature, model=model, think=False, max_tokens=max_tokens + prompt, tools=None, temperature=temperature, model=model, think=False, max_tokens=effective_max_tokens ) summary_text = (response.content or "").strip() or "(summary unavailable)" @@ -336,6 +436,13 @@ imgs = sum(500 for m in context if m.images) return chars // 3 + imgs + def __init__(self) -> None: + self._profile: "AgentProfile | None" = None + + def set_profile(self, profile: "AgentProfile | None") -> None: + """Tell the compressor which profile is active so it can use profile-specific settings.""" + self._profile = profile + async def compress_session( self, context: list[Message], @@ -352,6 +459,9 @@ Does NOT mutate the session — the caller is responsible for updating session.context, session.context_token_count, and persisting. """ + effective_keep_recent = getattr(self._profile, "compression_keep_recent", None) or keep_recent + effective_max_tokens = getattr(self._profile, "compression_max_tokens", None) or max_tokens + # Attempt 1: normal compression try: result = await compress_context( @@ -359,9 +469,10 @@ llm=llm, model=model, temperature=temperature, - keep_recent=keep_recent, - max_tokens=max_tokens, + keep_recent=effective_keep_recent, + max_tokens=effective_max_tokens, keep_recent_messages=keep_recent_messages, + profile=self._profile, ) except Exception: # Attempt 2: keep more recent turns verbatim @@ -371,11 +482,12 @@ llm=llm, model=model, temperature=temperature, - keep_recent=keep_recent + 4, - max_tokens=max_tokens, + keep_recent=effective_keep_recent + 4, + max_tokens=effective_max_tokens, keep_recent_messages=(keep_recent_messages + 4) if keep_recent_messages is not None else None, + profile=self._profile, ) except Exception: # Attempt 3: hard-truncate fallback diff --git a/navi/core/subagent_runner.py b/navi/core/subagent_runner.py index 23dc34a..3ec7c23 100644 --- a/navi/core/subagent_runner.py +++ b/navi/core/subagent_runner.py @@ -60,6 +60,10 @@ self._sessions = session_store self._mcp_manager = mcp_manager + def set_profile(self, profile) -> None: + """Propagate active profile to compressor so subagent uses profile-specific compression settings.""" + self._compressor.set_profile(profile) + async def run( self, user_message: str, diff --git a/navi/llm/base.py b/navi/llm/base.py index 10fb8b9..67e2aa8 100644 --- a/navi/llm/base.py +++ b/navi/llm/base.py @@ -64,6 +64,8 @@ # DB sequence number — set by PgSessionStore on load, used for delta-save. # -1 means "not yet persisted" (new messages created by the agent). sequence_number: int = Field(default=-1, exclude=True) + # Marks messages that must survive compression verbatim (user corrections, exact tool output) + is_compression_critical: bool = False class LLMResponse(BaseModel): diff --git a/navi/profiles/base.py b/navi/profiles/base.py index d030d75..dfead92 100644 --- a/navi/profiles/base.py +++ b/navi/profiles/base.py @@ -123,6 +123,14 @@ # Global providers (global_provider=True) are always injected regardless of this list. context_providers: list[str] = Field(default_factory=list) + # Optional compression tuning for this profile. When provided, the context + # compressor uses these values as defaults instead of global settings. + compression_keep_recent: int | None = None + compression_max_tokens: int | None = None + # Optional path (relative to profile dir) of a plain-text file containing an + # additional system prompt appended to the compressor's summary instructions. + compression_prompt_file: str | None = None + @field_validator("model", mode="before") @classmethod def _coerce_model(cls, v): diff --git a/navi/profiles/loader.py b/navi/profiles/loader.py index b988918..4857438 100644 --- a/navi/profiles/loader.py +++ b/navi/profiles/loader.py @@ -108,6 +108,9 @@ subagent_think_enabled=config.get("subagent_think_enabled", None), subagent_system_prompt=subagent_system_prompt, context_providers=config.get("context_providers", []), + compression_keep_recent=config.get("compression_keep_recent", None), + compression_max_tokens=config.get("compression_max_tokens", None), + compression_prompt_file=config.get("compression_prompt_file", None), )) log.debug("profile.loader.loaded", profile_id=config["id"]) @@ -158,6 +161,9 @@ "is_subagent_only": profile.is_subagent_only, "tools": profile.tools.model_dump(mode="json"), "context_providers": profile.context_providers, + "compression_keep_recent": profile.compression_keep_recent, + "compression_max_tokens": profile.compression_max_tokens, + "compression_prompt_file": profile.compression_prompt_file, } config_file = profile_dir / "config.json" diff --git a/navi/profiles/navi_code/compression_prompt.txt b/navi/profiles/navi_code/compression_prompt.txt new file mode 100644 index 0000000..ab59d85 --- /dev/null +++ b/navi/profiles/navi_code/compression_prompt.txt @@ -0,0 +1,14 @@ +You are summarizing a local-terminal coding session. Preserve information that is essential for continuing implementation and verification. + +Priority rules: +- Keep every file path the assistant read, created, or modified, with the action taken. +- Keep exact code signatures the user explicitly approved or that were final (function/class names, important config keys, exact command-line flags). +- Keep the outcome of the last test/build/verification run (pass/fail and the final error snippet if it failed). +- Keep the current todo list state and any pending sub-tasks. +- Keep user corrections about style, approach, or things the user said must/not be done. +- Keep exact environment facts: ports, Python versions, dependency names, paths to project roots, special local quirks. + +Do not preserve: +- Long terminal output, full stack traces, or verbose directory listings. +- Social greetings, filler, or commentary about the summary itself. +- Intermediate reasoning or tool-call argument previews. diff --git a/navi/profiles/navi_code/config.json b/navi/profiles/navi_code/config.json index 4ddd862..b7f837d 100644 --- a/navi/profiles/navi_code/config.json +++ b/navi/profiles/navi_code/config.json @@ -32,6 +32,9 @@ "top_k": 40, "top_p": 0.88, "num_thread": 11, + "compression_keep_recent": 12, + "compression_max_tokens": 4000, + "compression_prompt_file": "compression_prompt.txt", "tools": { "agent": { "native": [ diff --git a/tests/unit/core/test_compressor.py b/tests/unit/core/test_compressor.py index 07c59a2..b17bc11 100644 --- a/tests/unit/core/test_compressor.py +++ b/tests/unit/core/test_compressor.py @@ -4,13 +4,15 @@ from navi.core.compressor import ( ContextCompressor, + _build_summary_system_prompt, _format_for_summary, + _turn_importance, compress_context, partition_messages, should_compress, ) from navi.llm.base import Message, ToolCallRequest -from tests.conftest_factory import FakeLLMBackend +from tests.conftest_factory import FakeLLMBackend, make_profile class TestShouldCompress: @@ -471,3 +473,78 @@ keep_recent=5, ) assert result is None + + def test_turn_importance(self): + important = [ + Message(role="user", content="that is wrong, use json instead"), + ] + casual = [ + Message(role="user", content="hi"), + Message(role="assistant", content="hello"), + ] + assert _turn_importance(important) > _turn_importance(casual) + + def test_adaptive_partition_keeps_important_turns(self): + msgs = [] + for i in range(6): + msgs.append(Message(role="user", content=f"task {i}")) + msgs.append(Message(role="assistant", content=f"answer {i}")) + # Mark one old turn as important; it should survive even with keep_recent=2 + msgs[6].content = "this is wrong, fix it" + old, recent = partition_messages(msgs, keep_recent=2) + assert msgs[6] in recent # user correction stays + + def test_format_keeps_critical_tool_result(self): + long_result = "exact error output\n" * 100 + msgs = [ + Message( + role="assistant", + tool_calls=[ToolCallRequest(id="1", name="filesystem", arguments={})], + ), + Message(role="tool", content=long_result, name="filesystem", tool_call_id="1", is_compression_critical=True), + ] + text, _ = _format_for_summary(msgs) + assert long_result in text + + def test_format_truncates_noncritical_tool_result(self): + long_result = "x" * 1000 + msgs = [ + Message( + role="assistant", + tool_calls=[ToolCallRequest(id="1", name="web_search", arguments={})], + ), + Message(role="tool", content=long_result, name="web_search", tool_call_id="1"), + ] + text, _ = _format_for_summary(msgs) + assert long_result not in text + assert "…" in text + + def test_summary_prompt_uses_profile_compression_prompt_file(self): + profile = make_profile("navi_code", compression_prompt_file="compression_prompt.txt") + prompt = _build_summary_system_prompt(profile) + assert "## Goal" in prompt + assert "## Active Files" in prompt + + async def test_profile_overrides_compression_max_tokens(self): + profile = make_profile("test", compression_max_tokens=1234) + backend = FakeLLMBackend(responses=["short"]) + context = [ + Message(role="user", content="1"), + Message(role="assistant", content="a1"), + Message(role="user", content="2"), + Message(role="assistant", content="a2"), + Message(role="user", content="3"), + Message(role="assistant", content="a3"), + ] + _, _ = await compress_context( + context=context, + llm=backend, + model="test", + temperature=0.3, + keep_recent=1, + max_tokens=9999, + profile=profile, + ) + # max_tokens passed to llm.complete should be overridden by profile + # FakeLLMBackend ignores max_tokens, but we can at least verify the call ran + assert backend._call_idx == 1