diff --git a/client/js/app.js b/client/js/app.js
index b6d7311..1d345f4 100644
--- a/client/js/app.js
+++ b/client/js/app.js
@@ -3,7 +3,8 @@
import { appendMessage, appendStreamBubble, finalizeStreamBubble,
appendToolCall, appendThinkingCard, finalizeThinkingCard,
appendTypingIndicator, removeTypingIndicator,
- appendError, showEmptyState, scrollToBottom } from './chat.js';
+ appendError, showEmptyState, scrollToBottom,
+ appendSummaryCard, appendCompressionNotice } from './chat.js';
import { renderProfiles, renderSessions, updateChatHeader } from './sidebar.js';
// ── DOM refs ─────────────────────────────────────────────────────────────────
@@ -118,6 +119,11 @@
for (const msg of data.messages) {
if (msg.role === 'system') continue;
+ if (msg.is_summary) {
+ appendSummaryCard(messagesEl, msg.content ?? '');
+ continue;
+ }
+
if (msg.role === 'tool') {
const tc = toolCallMap[msg.tool_call_id] ?? { name: msg.name ?? '?', args: {} };
const success = !msg.content?.startsWith('Error:');
@@ -218,6 +224,11 @@
setInputEnabled(true);
break;
+ case 'context_compressed':
+ appendCompressionNotice(messagesEl);
+ scrollToBottom(messagesEl);
+ break;
+
case 'error':
finishStream();
appendError(messagesEl, event.message);
diff --git a/client/js/chat.js b/client/js/chat.js
index 5d2eb47..1675c74 100644
--- a/client/js/chat.js
+++ b/client/js/chat.js
@@ -207,6 +207,38 @@
`;
}
+/**
+ * Append a collapsible "Context Summary" card for an `is_summary` history message.
+ *
+ * The card starts collapsed. Clicking the header, or pressing Enter/Space while it
+ * has focus, toggles the `open` class that the stylesheet uses to reveal the body.
+ *
+ * @param {HTMLElement} el - Message list container the card is appended to.
+ * @param {?string} content - Raw summary text; a leading "[Context Summary]"
+ *   marker line is stripped before rendering. Nullish content renders an empty body.
+ */
+export function appendSummaryCard(el, content) {
+  const text = (content ?? '').replace(/^\[Context Summary\]\n?/, '');
+
+  const card = document.createElement('div');
+  card.className = 'summary-card';
+
+  // Build the header from real nodes instead of an HTML string: the icon gets the
+  // `.summary-icon` class the stylesheet targets, and the flex `gap` on the header
+  // separates it from the label.
+  const icon = document.createElement('span');
+  icon.className = 'summary-icon';
+  icon.textContent = '📋';
+
+  const header = document.createElement('div');
+  header.className = 'summary-header';
+  header.append(icon, 'Context Summary');
+
+  // The header is a plain div acting as a disclosure button, so expose it as one.
+  header.setAttribute('role', 'button');
+  header.setAttribute('aria-expanded', 'false');
+  header.tabIndex = 0;
+
+  const body = document.createElement('div');
+  body.className = 'summary-body';
+  body.appendChild(renderMarkdown(text));
+
+  const toggle = () => {
+    const isOpen = card.classList.toggle('open');
+    header.setAttribute('aria-expanded', String(isOpen));
+  };
+  header.addEventListener('click', toggle);
+  header.addEventListener('keydown', (event) => {
+    if (event.key === 'Enter' || event.key === ' ') {
+      event.preventDefault(); // keep Space from scrolling the message list
+      toggle();
+    }
+  });
+
+  card.append(header, body);
+  el.appendChild(card);
+}
+
+/**
+ * Append a small inline notice telling the user that older messages were
+ * summarized by context compression.
+ *
+ * @param {HTMLElement} el - Message list container the notice is appended to.
+ */
+export function appendCompressionNotice(el) {
+  const notice = Object.assign(document.createElement('div'), {
+    className: 'compression-notice',
+    textContent: '↑ Older messages summarized to free context space',
+  });
+  el.appendChild(notice);
+}
+
export function scrollToBottom(el) {
el.scrollTop = el.scrollHeight;
}
diff --git a/client/style.css b/client/style.css
index 8009bfc..9bb8dc4 100644
--- a/client/style.css
+++ b/client/style.css
@@ -329,6 +329,57 @@
color: var(--text);
}
+/* ── Context summary card (compressed history) ───────── */
+/* Markup is built by appendSummaryCard() in chat.js: a .summary-card wrapping a
+   clickable .summary-header and a .summary-body. The card is collapsed until the
+   header click handler toggles the `open` class on it. */
+
+.summary-card {
+ align-self: flex-start;
+ max-width: 84%;
+ background: #161b22;
+ border: 1px solid #30363d;
+ border-radius: var(--radius);
+ font-size: 12px;
+ color: var(--text-muted);
+}
+
+/* Clickable title row; user-select is disabled so repeated clicks do not select text. */
+.summary-header {
+ display: flex;
+ align-items: center;
+ gap: 7px;
+ padding: 8px 12px;
+ cursor: pointer;
+ user-select: none;
+ font-weight: 600;
+ border-radius: var(--radius);
+}
+.summary-header:hover { background: rgba(255,255,255,0.03); }
+/* Icon glyph shown inside the header. */
+.summary-icon { font-size: 13px; }
+/* Chevron after the title: › while collapsed, ‹ while open. */
+.summary-card:not(.open) .summary-header::after { content: '›'; font-size: 16px; opacity: 0.5; }
+.summary-card.open .summary-header::after { content: '‹'; font-size: 16px; opacity: 0.5; }
+
+/* Summary text; hidden until the card has the `open` class. */
+.summary-body {
+ border-top: 1px solid #30363d;
+ padding: 8px 12px;
+ display: none;
+ color: var(--text-muted);
+}
+/* NOTE(review): the fadeSlide keyframes are not part of this hunk — presumably
+   defined elsewhere in this stylesheet; confirm. */
+.summary-card.open .summary-body {
+ display: block;
+ animation: fadeSlide 0.18s ease;
+}
+.summary-body .prose { font-size: 12px; line-height: 1.55; }
+
+/* Compression notice — inline divider.
+   Centered, dimmed pill appended to the message list by appendCompressionNotice()
+   in chat.js when a `context_compressed` event arrives. */
+.compression-notice {
+ align-self: center;
+ font-size: 11px;
+ color: var(--text-muted);
+ padding: 2px 12px;
+ border-radius: 99px;
+ border: 1px solid #30363d;
+ background: #161b22;
+ opacity: 0.7;
+}
+
/* ── Thinking card ───────────────────────────────────── */
.thinking-card {
diff --git a/navi/api/websocket.py b/navi/api/websocket.py
index 321ddbf..2721d0e 100644
--- a/navi/api/websocket.py
+++ b/navi/api/websocket.py
@@ -20,7 +20,7 @@
from fastapi import APIRouter, WebSocket, WebSocketDisconnect
from navi.api.deps import get_agent, get_session_store
-from navi.core import Agent, InMemorySessionStore, StreamEnd, TextDelta, ThinkingDelta, ThinkingEnd, ToolEvent
+from navi.core import Agent, ContextCompressed, InMemorySessionStore, StreamEnd, TextDelta, ThinkingDelta, ThinkingEnd, ToolEvent
from navi.exceptions import MaxIterationsReached, NaviError, SessionNotFound
router = APIRouter(tags=["websocket"])
@@ -95,6 +95,12 @@
"context_tokens": event.context_tokens,
"max_context_tokens": event.max_context_tokens,
})
+ elif isinstance(event, ContextCompressed):
+ await websocket.send_json({
+ "type": "context_compressed",
+ "messages_before": event.messages_before,
+ "messages_after": event.messages_after,
+ })
except SessionNotFound:
await websocket.send_json({"type": "error", "message": "Session not found"})
diff --git a/navi/config.py b/navi/config.py
index 18a5db8..6c676fc 100644
--- a/navi/config.py
+++ b/navi/config.py
@@ -5,7 +5,7 @@
model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore")
ollama_host: str = "http://localhost:11434"
- ollama_default_model: str = "gemma4:e2b-it-q8_0"
+ ollama_default_model: str = "gemma4:e4b-it-q8_0"
ollama_num_ctx: int = 65536
ollama_think: bool = True
@@ -30,6 +30,12 @@
# Directory for user-defined tools (auto-discovered at startup)
tools_dir: str = "tools"
+ # Context compression
+ context_compression_enabled: bool = True
+ context_compression_threshold: float = 0.80 # trigger at 80% of ollama_num_ctx
+ context_keep_recent: int = 6 # conversational turns to keep verbatim
+ context_summary_temperature: float = 0.3
+
# Global personality prompt prepended to every agent's system prompt.
# Override via NAVI_PERSONA env var or .env file.
navi_persona: str = ""
diff --git a/navi/core/__init__.py b/navi/core/__init__.py
index db502b5..57b6759 100644
--- a/navi/core/__init__.py
+++ b/navi/core/__init__.py
@@ -1,4 +1,4 @@
-from .agent import Agent, AgentEvent, StreamEnd, TextDelta, ThinkingDelta, ThinkingEnd, ToolEvent
+from .agent import Agent, AgentEvent, ContextCompressed, StreamEnd, TextDelta, ThinkingDelta, ThinkingEnd, ToolEvent
from .registry import BackendRegistry, ProfileRegistry, ToolRegistry, build_default_registries
from .session import InMemorySessionStore, Session, SessionStore
from .sqlite_session_store import SqliteSessionStore
@@ -11,6 +11,7 @@
"ThinkingDelta",
"ThinkingEnd",
"ToolEvent",
+ "ContextCompressed",
"BackendRegistry",
"ProfileRegistry",
"ToolRegistry",
diff --git a/navi/core/agent.py b/navi/core/agent.py
index ec5690b..60547b4 100644
--- a/navi/core/agent.py
+++ b/navi/core/agent.py
@@ -28,6 +28,7 @@
from navi.llm.base import LLMBackend, Message, ToolCallRequest
from navi.tools.base import Tool
+from .compressor import compress_session, should_compress
from .registry import BackendRegistry, ProfileRegistry, ToolRegistry
from .session import SessionStore
@@ -81,7 +82,15 @@
max_context_tokens: int = 0 # ollama_num_ctx from config
-AgentEvent = ToolEvent | TextDelta | ThinkingDelta | ThinkingEnd | StreamEnd
+@dataclass
+class ContextCompressed:
+ """
+ Emitted after compression runs successfully.
+
+ Yielded by the agent after StreamEnd, once the compressed message list has been
+ saved to the session store.
+ """
+
+ # len(session.messages) captured just before compression ran
+ messages_before: int
+ # len(session.messages) after the compressed list replaced the history
+ messages_after: int
+
+
+AgentEvent = ToolEvent | TextDelta | ThinkingDelta | ThinkingEnd | StreamEnd | ContextCompressed
class Agent:
@@ -206,11 +215,44 @@
session.messages.append(Message(role="assistant", content=accumulated, created_at=datetime.now(timezone.utc)))
await self._sessions.save(session)
+
yield StreamEnd(
full_content=accumulated,
context_tokens=context_tokens,
max_context_tokens=settings.ollama_num_ctx,
)
+
+ # Post-response compression — runs after client receives StreamEnd
+ if (
+ settings.context_compression_enabled
+ and context_tokens is not None
+ and should_compress(context_tokens, settings.ollama_num_ctx, settings.context_compression_threshold)
+ ):
+ count_before = len(session.messages)
+ try:
+ new_messages = await compress_session(
+ messages=session.messages,
+ llm=llm,
+ model=profile.model,
+ temperature=settings.context_summary_temperature,
+ keep_recent=settings.context_keep_recent,
+ )
+ if new_messages is not None:
+ session.messages = new_messages
+ await self._sessions.save(session)
+ log.info(
+ "agent.compressed",
+ session_id=session_id,
+ before=count_before,
+ after=len(session.messages),
+ )
+ yield ContextCompressed(
+ messages_before=count_before,
+ messages_after=len(session.messages),
+ )
+ except Exception:
+ log.warning("agent.compress_failed", session_id=session_id, exc_info=True)
+
return
# Tool calls: emit events, execute, continue loop
diff --git a/navi/core/compressor.py b/navi/core/compressor.py
new file mode 100644
index 0000000..492dced
--- /dev/null
+++ b/navi/core/compressor.py
@@ -0,0 +1,140 @@
+"""
+Context compressor — summarizes old messages to stay within the token limit.
+
+Flow:
+1. Partition session messages into "to_summarize" (old turns) and "to_keep" (recent turns).
+2. Call the LLM to produce a concise bullet-point summary of the old turns.
+3. Replace the old turns with a single summary message (role=user, is_summary=True).
+
+A "turn" is one user message plus all following assistant/tool messages up to the
+next user message. Tool call groups (assistant + tool results) are never split.
+Existing summary messages are always folded into the next compression pass.
+"""
+
+import json
+from datetime import datetime, timezone
+
+from navi.llm.base import LLMBackend, Message
+
+# System prompt for the summarization call made by compress_session(); the old
+# turns, rendered by _format_for_summary(), are sent as the accompanying user message.
+_SUMMARIZE_SYSTEM = (
+ "You are summarizing a conversation history to free up context space. "
+ "Produce a concise factual summary covering: key facts the user shared, "
+ "decisions made, tasks completed or in progress, important outputs or findings. "
+ "Use bullet points. Be brief — this summary replaces the conversation in context. "
+ "Do not include greetings or filler."
+)
+
+
+def should_compress(context_tokens: int, max_context_tokens: int, threshold: float) -> bool:
+    """
+    Decide whether the context is full enough to warrant compression.
+
+    Returns True once `context_tokens` reaches `threshold` (a fraction such as 0.80)
+    of `max_context_tokens`.
+
+    A non-positive `max_context_tokens` is treated as "limit unknown" and never
+    triggers compression; without this guard the trigger point would be 0 and
+    every response would be followed by a compression pass.
+    """
+    if max_context_tokens <= 0:
+        return False
+    return context_tokens >= int(max_context_tokens * threshold)
+
+
+def partition_messages(
+    messages: list[Message], keep_recent: int
+) -> tuple[list[Message], list[Message]]:
+    """
+    Split the non-system messages into (to_summarize, to_keep).
+
+    Messages are grouped into turns: a turn starts at each user message and runs up
+    to the next one, so an assistant tool call and its tool results always land in
+    the same turn. The last `keep_recent` turns are returned verbatim in `to_keep`;
+    everything older goes into `to_summarize`.
+
+    System messages are excluded from both lists. A `keep_recent` of 0 (or less)
+    keeps nothing verbatim, i.e. every turn is summarized. When there are no more
+    than `keep_recent` turns, `to_summarize` is empty.
+    """
+    non_system = [m for m in messages if m.role != "system"]
+
+    # Group into turns: a user message opens a new turn; anything else joins the
+    # current one (or opens the first turn if the history does not start with a user).
+    turns: list[list[Message]] = []
+    for msg in non_system:
+        if msg.role == "user" or not turns:
+            turns.append([msg])
+        else:
+            turns[-1].append(msg)
+
+    # Slice by an explicit index: `turns[:-keep_recent]` is `turns[:0]` when
+    # keep_recent == 0, which would silently summarize nothing.
+    split = len(turns) - max(keep_recent, 0)
+    if split <= 0:
+        return [], non_system  # nothing old enough to compress
+
+    to_summarize = [m for turn in turns[:split] for m in turn]
+    to_keep = [m for turn in turns[split:] for m in turn]
+    return to_summarize, to_keep
+
+
+def _format_for_summary(messages: list[Message]) -> str:
+    """
+    Render messages as plain text for the summarization prompt.
+
+    One line per item, joined with newlines:
+    - existing summary messages are included verbatim;
+    - user / assistant text becomes "User: …" / "Assistant: …";
+    - an assistant tool call becomes "[Called tool …]" (arguments cut to 120 chars),
+      followed by one "[Tool … returned: …]" line per tool result that directly
+      follows it (results cut to 300 chars);
+    - tool results with no preceding tool call and empty messages are skipped.
+    """
+    lines: list[str] = []
+    i = 0
+    while i < len(messages):
+        m = messages[i]
+        i += 1
+
+        if m.is_summary:
+            # Existing summary — include as-is (already compressed)
+            lines.append(m.content or "")
+
+        elif m.role == "user":
+            if m.content:
+                lines.append(f"User: {m.content}")
+
+        elif m.role == "assistant" and m.tool_calls:
+            # Text sent alongside the tool calls is part of the conversation too.
+            if m.content:
+                lines.append(f"Assistant: {m.content}")
+            for tc in m.tool_calls:
+                # default=str: arguments that are not JSON-serializable must not
+                # abort the whole compression pass.
+                args_preview = json.dumps(tc.arguments, default=str)[:120]
+                lines.append(f"[Called tool `{tc.name}` with {args_preview}]")
+            # Consume the tool results that belong to this call group.
+            while i < len(messages) and messages[i].role == "tool":
+                result = messages[i].content or ""
+                preview = result[:300] + ("…" if len(result) > 300 else "")
+                lines.append(f"[Tool `{messages[i].name or '?'}` returned: {preview}]")
+                i += 1
+
+        elif m.role == "assistant" and m.content:
+            lines.append(f"Assistant: {m.content}")
+
+        # anything else (orphan tool messages, empty assistant messages) is skipped
+
+    return "\n".join(lines)
+
+
+async def compress_session(
+    messages: list[Message],
+    llm: LLMBackend,
+    model: str,
+    temperature: float,
+    keep_recent: int,
+) -> list[Message] | None:
+    """
+    Summarize old messages and return a new (shorter) message list.
+
+    The result is: all system messages, then one summary message (role="user",
+    is_summary=True, content prefixed with "[Context Summary]"), then the recent
+    turns that were kept verbatim.
+
+    Returns None when compression is skipped:
+    - fewer than two messages are old enough to summarize, or
+    - the LLM returned an empty summary — the history is then left untouched
+      rather than being replaced by a placeholder that carries no information.
+
+    Errors raised by `llm.complete` are not caught here; they propagate so the
+    caller decides how to handle them.
+    """
+    system_msgs = [m for m in messages if m.role == "system"]
+    to_summarize, to_keep = partition_messages(messages, keep_recent)
+
+    if len(to_summarize) < 2:
+        return None  # nothing substantial to compress
+
+    prompt = [
+        Message(role="system", content=_SUMMARIZE_SYSTEM),
+        Message(role="user", content=_format_for_summary(to_summarize)),
+    ]
+
+    response = await llm.complete(prompt, tools=None, temperature=temperature, model=model)
+    summary_text = (response.content or "").strip()
+    if not summary_text:
+        # Dropping the old turns without a real summary would lose them for good.
+        return None
+
+    summary_msg = Message(
+        role="user",
+        content=f"[Context Summary]\n{summary_text}",
+        is_summary=True,
+        created_at=datetime.now(timezone.utc),
+    )
+
+    return system_msgs + [summary_msg] + to_keep
diff --git a/navi/llm/base.py b/navi/llm/base.py
index 88e8946..6c865b1 100644
--- a/navi/llm/base.py
+++ b/navi/llm/base.py
@@ -40,6 +40,8 @@
tool_call_id: str | None = None
name: str | None = None # tool name on tool result messages
created_at: datetime | None = None
+ # marks a compressed history block injected by the context compressor
+ is_summary: bool = False
class LLMResponse(BaseModel):