diff --git a/client/js/app.js b/client/js/app.js index b6d7311..1d345f4 100644 --- a/client/js/app.js +++ b/client/js/app.js @@ -3,7 +3,8 @@ import { appendMessage, appendStreamBubble, finalizeStreamBubble, appendToolCall, appendThinkingCard, finalizeThinkingCard, appendTypingIndicator, removeTypingIndicator, - appendError, showEmptyState, scrollToBottom } from './chat.js'; + appendError, showEmptyState, scrollToBottom, + appendSummaryCard, appendCompressionNotice } from './chat.js'; import { renderProfiles, renderSessions, updateChatHeader } from './sidebar.js'; // ── DOM refs ───────────────────────────────────────────────────────────────── @@ -118,6 +119,11 @@ for (const msg of data.messages) { if (msg.role === 'system') continue; + if (msg.is_summary) { + appendSummaryCard(messagesEl, msg.content ?? ''); + continue; + } + if (msg.role === 'tool') { const tc = toolCallMap[msg.tool_call_id] ?? { name: msg.name ?? '?', args: {} }; const success = !msg.content?.startsWith('Error:'); @@ -218,6 +224,11 @@ setInputEnabled(true); break; + case 'context_compressed': + appendCompressionNotice(messagesEl); + scrollToBottom(messagesEl); + break; + case 'error': finishStream(); appendError(messagesEl, event.message); diff --git a/client/js/chat.js b/client/js/chat.js index 5d2eb47..1675c74 100644 --- a/client/js/chat.js +++ b/client/js/chat.js @@ -207,6 +207,38 @@ `; } +/** + * Summary card — rendered for is_summary messages loaded from history. + * Collapsed by default, click to expand. 
+ */ +export function appendSummaryCard(el, content) { + const text = content.replace(/^\[Context Summary\]\n?/, ''); + const card = document.createElement('div'); + card.className = 'summary-card'; + + const header = document.createElement('div'); + header.className = 'summary-header'; + header.innerHTML = '📋Context Summary'; + + const body = document.createElement('div'); + body.className = 'summary-body'; + body.appendChild(renderMarkdown(text)); + + header.addEventListener('click', () => card.classList.toggle('open')); + card.append(header, body); + el.appendChild(card); +} + +/** + * Inline notice that compression ran — appended to the message list. + */ +export function appendCompressionNotice(el) { + const div = document.createElement('div'); + div.className = 'compression-notice'; + div.textContent = '↑ Older messages summarized to free context space'; + el.appendChild(div); +} + export function scrollToBottom(el) { el.scrollTop = el.scrollHeight; } diff --git a/client/style.css b/client/style.css index 8009bfc..9bb8dc4 100644 --- a/client/style.css +++ b/client/style.css @@ -329,6 +329,57 @@ color: var(--text); } +/* ── Context summary card (compressed history) ───────── */ + +.summary-card { + align-self: flex-start; + max-width: 84%; + background: #161b22; + border: 1px solid #30363d; + border-radius: var(--radius); + font-size: 12px; + color: var(--text-muted); +} + +.summary-header { + display: flex; + align-items: center; + gap: 7px; + padding: 8px 12px; + cursor: pointer; + user-select: none; + font-weight: 600; + border-radius: var(--radius); +} +.summary-header:hover { background: rgba(255,255,255,0.03); } +.summary-icon { font-size: 13px; } +.summary-card:not(.open) .summary-header::after { content: '›'; font-size: 16px; opacity: 0.5; } +.summary-card.open .summary-header::after { content: '‹'; font-size: 16px; opacity: 0.5; } + +.summary-body { + border-top: 1px solid #30363d; + padding: 8px 12px; + display: none; + color: var(--text-muted); +} 
+.summary-card.open .summary-body { + display: block; + animation: fadeSlide 0.18s ease; +} +.summary-body .prose { font-size: 12px; line-height: 1.55; } + +/* Compression notice — inline divider */ +.compression-notice { + align-self: center; + font-size: 11px; + color: var(--text-muted); + padding: 2px 12px; + border-radius: 99px; + border: 1px solid #30363d; + background: #161b22; + opacity: 0.7; +} + /* ── Thinking card ───────────────────────────────────── */ .thinking-card { diff --git a/navi/api/websocket.py b/navi/api/websocket.py index 321ddbf..2721d0e 100644 --- a/navi/api/websocket.py +++ b/navi/api/websocket.py @@ -20,7 +20,7 @@ from fastapi import APIRouter, WebSocket, WebSocketDisconnect from navi.api.deps import get_agent, get_session_store -from navi.core import Agent, InMemorySessionStore, StreamEnd, TextDelta, ThinkingDelta, ThinkingEnd, ToolEvent +from navi.core import Agent, ContextCompressed, InMemorySessionStore, StreamEnd, TextDelta, ThinkingDelta, ThinkingEnd, ToolEvent from navi.exceptions import MaxIterationsReached, NaviError, SessionNotFound router = APIRouter(tags=["websocket"]) @@ -95,6 +95,12 @@ "context_tokens": event.context_tokens, "max_context_tokens": event.max_context_tokens, }) + elif isinstance(event, ContextCompressed): + await websocket.send_json({ + "type": "context_compressed", + "messages_before": event.messages_before, + "messages_after": event.messages_after, + }) except SessionNotFound: await websocket.send_json({"type": "error", "message": "Session not found"}) diff --git a/navi/config.py b/navi/config.py index 18a5db8..6c676fc 100644 --- a/navi/config.py +++ b/navi/config.py @@ -5,7 +5,7 @@ model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore") ollama_host: str = "http://localhost:11434" - ollama_default_model: str = "gemma4:e2b-it-q8_0" + ollama_default_model: str = "gemma4:e4b-it-q8_0" ollama_num_ctx: int = 65536 ollama_think: bool = True @@ -30,6 +30,12 @@ # Directory for 
user-defined tools (auto-discovered at startup) tools_dir: str = "tools" + # Context compression + context_compression_enabled: bool = True + context_compression_threshold: float = 0.80 # trigger at 80% of ollama_num_ctx + context_keep_recent: int = 6 # conversational turns to keep verbatim + context_summary_temperature: float = 0.3 + # Global personality prompt prepended to every agent's system prompt. # Override via NAVI_PERSONA env var or .env file. navi_persona: str = "" diff --git a/navi/core/__init__.py b/navi/core/__init__.py index db502b5..57b6759 100644 --- a/navi/core/__init__.py +++ b/navi/core/__init__.py @@ -1,4 +1,4 @@ -from .agent import Agent, AgentEvent, StreamEnd, TextDelta, ThinkingDelta, ThinkingEnd, ToolEvent +from .agent import Agent, AgentEvent, ContextCompressed, StreamEnd, TextDelta, ThinkingDelta, ThinkingEnd, ToolEvent from .registry import BackendRegistry, ProfileRegistry, ToolRegistry, build_default_registries from .session import InMemorySessionStore, Session, SessionStore from .sqlite_session_store import SqliteSessionStore @@ -11,6 +11,7 @@ "ThinkingDelta", "ThinkingEnd", "ToolEvent", + "ContextCompressed", "BackendRegistry", "ProfileRegistry", "ToolRegistry", diff --git a/navi/core/agent.py b/navi/core/agent.py index ec5690b..60547b4 100644 --- a/navi/core/agent.py +++ b/navi/core/agent.py @@ -28,6 +28,7 @@ from navi.llm.base import LLMBackend, Message, ToolCallRequest from navi.tools.base import Tool +from .compressor import compress_session, should_compress from .registry import BackendRegistry, ProfileRegistry, ToolRegistry from .session import SessionStore @@ -81,7 +82,15 @@ max_context_tokens: int = 0 # ollama_num_ctx from config -AgentEvent = ToolEvent | TextDelta | ThinkingDelta | ThinkingEnd | StreamEnd +@dataclass +class ContextCompressed: + """Emitted after compression runs successfully.""" + + messages_before: int + messages_after: int + + +AgentEvent = ToolEvent | TextDelta | ThinkingDelta | ThinkingEnd | StreamEnd | 
ContextCompressed class Agent: @@ -206,11 +215,44 @@ session.messages.append(Message(role="assistant", content=accumulated, created_at=datetime.now(timezone.utc))) await self._sessions.save(session) + yield StreamEnd( full_content=accumulated, context_tokens=context_tokens, max_context_tokens=settings.ollama_num_ctx, ) + + # Post-response compression — runs after client receives StreamEnd + if ( + settings.context_compression_enabled + and context_tokens is not None + and should_compress(context_tokens, settings.ollama_num_ctx, settings.context_compression_threshold) + ): + count_before = len(session.messages) + try: + new_messages = await compress_session( + messages=session.messages, + llm=llm, + model=profile.model, + temperature=settings.context_summary_temperature, + keep_recent=settings.context_keep_recent, + ) + if new_messages is not None: + session.messages = new_messages + await self._sessions.save(session) + log.info( + "agent.compressed", + session_id=session_id, + before=count_before, + after=len(session.messages), + ) + yield ContextCompressed( + messages_before=count_before, + messages_after=len(session.messages), + ) + except Exception: + log.warning("agent.compress_failed", session_id=session_id, exc_info=True) + return # Tool calls: emit events, execute, continue loop diff --git a/navi/core/compressor.py b/navi/core/compressor.py new file mode 100644 index 0000000..492dced --- /dev/null +++ b/navi/core/compressor.py @@ -0,0 +1,140 @@ +""" +Context compressor — summarizes old messages to stay within the token limit. + +Flow: +1. Partition session messages into "to_summarize" (old turns) and "to_keep" (recent turns). +2. Call the LLM to produce a concise bullet-point summary of the old turns. +3. Replace the old turns with a single summary message (role=user, is_summary=True). + +A "turn" is one user message plus all following assistant/tool messages up to the +next user message. Tool call groups (assistant + tool results) are never split. 
+Existing summary messages are always folded into the next compression pass. +""" + +import json +from datetime import datetime, timezone + +from navi.llm.base import LLMBackend, Message + +_SUMMARIZE_SYSTEM = ( + "You are summarizing a conversation history to free up context space. " + "Produce a concise factual summary covering: key facts the user shared, " + "decisions made, tasks completed or in progress, important outputs or findings. " + "Use bullet points. Be brief — this summary replaces the conversation in context. " + "Do not include greetings or filler." +) + + +def should_compress(context_tokens: int, max_context_tokens: int, threshold: float) -> bool: + return context_tokens >= int(max_context_tokens * threshold) + + +def partition_messages( + messages: list[Message], keep_recent: int +) -> tuple[list[Message], list[Message]]: + """ + Returns (to_summarize, to_keep). + + System messages are excluded from both lists; the last `keep_recent` conversational turns are kept verbatim. + Everything older goes into to_summarize. + Tool call groups (assistant + tool results) always stay together. 
+ """ + non_system = [m for m in messages if m.role != "system"] + + # Group into turns: each turn starts with a user message + turns: list[list[Message]] = [] + current: list[Message] = [] + for msg in non_system: + if msg.role == "user" and current: + turns.append(current) + current = [msg] + else: + current.append(msg) + if current: + turns.append(current) + + if len(turns) <= keep_recent: + return [], non_system # nothing old enough to compress + + old_turns = turns[:-keep_recent] + recent_turns = turns[-keep_recent:] + + to_summarize = [m for turn in old_turns for m in turn] + to_keep = [m for turn in recent_turns for m in turn] + return to_summarize, to_keep + + +def _format_for_summary(messages: list[Message]) -> str: + """Render messages as plain text for the summarization prompt.""" + lines: list[str] = [] + i = 0 + while i < len(messages): + m = messages[i] + + if m.is_summary: + # Existing summary — include as-is (already compressed) + lines.append(m.content or "") + i += 1 + + elif m.role == "user": + if m.content: + lines.append(f"User: {m.content}") + i += 1 + + elif m.role == "assistant" and m.tool_calls: + # Render tool calls + their results as a compact block + for tc in m.tool_calls: + args_preview = json.dumps(tc.arguments)[:120] + lines.append(f"[Called tool `{tc.name}` with {args_preview}]") + i += 1 + while i < len(messages) and messages[i].role == "tool": + result = messages[i].content or "" + preview = result[:300] + ("…" if len(result) > 300 else "") + lines.append(f"[Tool `{messages[i].name}` returned: {preview}]") + i += 1 + + elif m.role == "assistant" and m.content: + lines.append(f"Assistant: {m.content}") + i += 1 + + else: + i += 1 # skip orphan tool messages + + return "\n".join(lines) + + +async def compress_session( + messages: list[Message], + llm: LLMBackend, + model: str, + temperature: float, + keep_recent: int, +) -> list[Message] | None: + """ + Summarize old messages and return a new (shorter) message list. 
+ Returns None when fewer than two messages are old enough to summarize (nothing substantial to compress). + + Raises LLMBackendError on LLM failure — caller decides how to handle. + """ + system_msgs = [m for m in messages if m.role == "system"] + to_summarize, to_keep = partition_messages(messages, keep_recent) + + if len(to_summarize) < 2: + return None # nothing substantial to compress + + prompt = [ + Message(role="system", content=_SUMMARIZE_SYSTEM), + Message(role="user", content=_format_for_summary(to_summarize)), + ] + + response = await llm.complete(prompt, tools=None, temperature=temperature, model=model) + summary_text = (response.content or "").strip() or "(summary unavailable)" + + summary_msg = Message( + role="user", + content=f"[Context Summary]\n{summary_text}", + is_summary=True, + created_at=datetime.now(timezone.utc), + ) + + return system_msgs + [summary_msg] + to_keep diff --git a/navi/llm/base.py b/navi/llm/base.py index 88e8946..6c865b1 100644 --- a/navi/llm/base.py +++ b/navi/llm/base.py @@ -40,6 +40,8 @@ tool_call_id: str | None = None name: str | None = None # tool name on tool result messages created_at: datetime | None = None + # marks a compressed history block injected by the context compressor + is_summary: bool = False class LLMResponse(BaseModel):