"""
Context compressor — summarizes old messages to stay within the token limit.
Flow:
1. Partition session messages into "to_summarize" (old turns) and "to_keep" (recent turns).
2. Call the LLM to produce a concise bullet-point summary of the old turns.
3. Replace the old turns with a single summary message (role=user, is_summary=True).
A "turn" is one user message plus all following assistant/tool messages up to the
next user message. Tool call groups (assistant + tool results) are never split.
Existing summary messages are always folded into the next compression pass.
"""
import json
from datetime import datetime, timezone
from navi.llm.base import LLMBackend, Message
# System prompt for the summarization request built in compress_context.
# It asks for a thorough, topic-grouped bullet summary (facts, decisions, task
# status, outputs, code/config snippets) without greetings or meta-commentary.
# The text is sent to the model verbatim, so any edit here changes behavior.
_SUMMARIZE_SYSTEM = (
    "You are summarizing a conversation history to free up context space. "
    "Produce a structured factual summary covering: key facts the user shared, "
    "decisions made, tasks completed or in progress, important outputs or findings, "
    "any code or config snippets that were produced. "
    "Use bullet points grouped by topic. Be thorough — capture enough detail that "
    "the assistant can continue the conversation without the original messages. "
    "Do not include greetings, filler, or meta-commentary about the summary itself."
)
def should_compress(context_tokens: int, max_context_tokens: int, threshold: float) -> bool:
    """
    Report whether the context has grown large enough to warrant compression.

    The trigger point is `max_context_tokens * threshold`, truncated to an int;
    compression is due once `context_tokens` reaches or exceeds it.
    """
    trigger_at = int(max_context_tokens * threshold)
    if context_tokens < trigger_at:
        return False
    return True
def partition_messages(
    messages: list[Message], keep_recent: int
) -> tuple[list[Message], list[Message]]:
    """
    Split the non-system messages into (to_summarize, to_keep).

    Messages are grouped into turns. A turn starts at a user message and runs up
    to (not including) the next user message, so an assistant tool call and the
    tool results that follow it always land in the same turn. Messages that
    precede the first user message form a leading turn of their own.

    The last `keep_recent` turns are returned verbatim in to_keep; every older
    turn is flattened, in order, into to_summarize. If there are no more than
    `keep_recent` turns, to_summarize is empty and to_keep holds every
    non-system message.

    System messages are excluded from both returned lists; the caller is
    responsible for re-attaching them.

    A `keep_recent` of zero (or a negative value, which is treated as zero)
    keeps nothing verbatim: every turn goes into to_summarize.
    """
    # Clamp so that 0 / negative values cannot flip the meaning of the slices
    # below (turns[:-0] is empty and turns[-0:] is the whole list).
    keep_recent = max(keep_recent, 0)

    non_system = [m for m in messages if m.role != "system"]

    # Group into turns: each turn starts with a user message.
    turns: list[list[Message]] = []
    current: list[Message] = []
    for msg in non_system:
        if msg.role == "user" and current:
            turns.append(current)
            current = [msg]
        else:
            current.append(msg)
    if current:
        turns.append(current)

    if len(turns) <= keep_recent:
        return [], non_system  # nothing old enough to compress

    # Explicit split index instead of negative slicing, which breaks at 0.
    split = len(turns) - keep_recent
    to_summarize = [m for turn in turns[:split] for m in turn]
    to_keep = [m for turn in turns[split:] for m in turn]
    return to_summarize, to_keep
def _format_for_summary(messages: list[Message]) -> tuple[str, list[str]]:
    """
    Render messages as plain text for the summarization prompt.

    Returns (text, images):
      * text   — one rendered line per message/tool event, joined with newlines.
      * images — a flat list of the image entries collected from non-summary
                 user messages, in message order.

    Rendering rules:
      * Summary messages (is_summary) are included as-is, without a role prefix.
      * User messages become "User: <content>", with an "[+ N image(s)]" note
        when images are attached; a user message with neither content nor
        images produces no line.
      * An assistant message with tool calls becomes an optional
        "Assistant: <content>" line (when it also carries text), then one
        "[Called tool ...]" line per call, then one "[Tool ... returned: ...]"
        line per immediately following tool message.
      * Plain assistant messages become "Assistant: <content>".
      * Anything else (e.g. a tool message not preceded by an assistant tool
        call, or an assistant message with no content) is skipped.

    Tool arguments are previewed to 120 characters and tool results to 300;
    both get a trailing "…" when they were cut.
    """
    max_args_chars = 120
    max_result_chars = 300

    lines: list[str] = []
    images: list[str] = []
    i = 0
    while i < len(messages):
        m = messages[i]
        i += 1  # every branch consumes the current message

        if m.is_summary:
            # Existing summary — include as-is (already compressed)
            lines.append(m.content or "")
        elif m.role == "user":
            if m.images:
                images.extend(m.images)
                img_note = f" [+ {len(m.images)} image(s)]"
            else:
                img_note = ""
            if m.content:
                lines.append(f"User: {m.content}{img_note}")
            elif img_note:
                lines.append(f"User:{img_note}")
        elif m.role == "assistant" and m.tool_calls:
            # Text that accompanies the tool calls is part of the conversation
            # too; keep it so the summary does not lose the assistant's reasoning.
            if m.content:
                lines.append(f"Assistant: {m.content}")
            # Render tool calls + their results as a compact block
            for tc in m.tool_calls:
                args = json.dumps(tc.arguments)
                args_preview = args[:max_args_chars] + (
                    "…" if len(args) > max_args_chars else ""
                )
                lines.append(f"[Called tool `{tc.name}` with {args_preview}]")
            # Consume the run of tool results that directly follows the call.
            while i < len(messages) and messages[i].role == "tool":
                result = messages[i].content or ""
                preview = result[:max_result_chars] + (
                    "…" if len(result) > max_result_chars else ""
                )
                lines.append(f"[Tool `{messages[i].name}` returned: {preview}]")
                i += 1
        elif m.role == "assistant" and m.content:
            lines.append(f"Assistant: {m.content}")
        # else: skip orphan tool messages and empty assistant messages

    return "\n".join(lines), images
# Safety limit: compress_context cuts the formatted transcript to this many
# characters (characters, not tokens) before sending it to the LLM, keeping the
# beginning of the text and appending a "[truncated]" marker.
# Prevents the summarizer from receiving near-context-sized input it can't fit alongside output.
_MAX_SUMMARY_INPUT_CHARS = 24_000
async def compress_context(
    context: list[Message],
    llm: LLMBackend,
    model: str,
    temperature: float,
    keep_recent: int,
    max_tokens: int | None = None,
) -> tuple[list[Message], str] | None:
    """
    Replace the older part of an LLM context with a single summary message.

    The context is split by partition_messages; the old portion is rendered to
    text (plus any images from old user messages), sent to `llm` with the
    summarization system prompt, and the reply becomes a role=user message
    flagged is_summary=True.

    Returns (new_context, summary_text), where new_context is
    system messages + summary message + the recent turns kept verbatim.
    Returns None when fewer than two messages would be summarized.

    `context` itself is not modified — a new list is built. `model`,
    `temperature` and `max_tokens` are forwarded to `llm.complete` unchanged.
    Nothing is caught here, so any exception raised by the LLM call propagates
    to the caller.
    """
    # Snapshot the system messages up front, before the await below.
    system_msgs = [msg for msg in context if msg.role == "system"]

    old_messages, recent_messages = partition_messages(context, keep_recent)
    if len(old_messages) < 2:
        # nothing substantial to compress
        return None

    transcript, attached_images = _format_for_summary(old_messages)

    # Cap oversized input so the summarizer has room left to generate output.
    if len(transcript) > _MAX_SUMMARY_INPUT_CHARS:
        transcript = transcript[:_MAX_SUMMARY_INPUT_CHARS] + "\n…[truncated]"

    instructions = Message(role="system", content=_SUMMARIZE_SYSTEM)
    request = Message(
        role="user",
        content=transcript,
        images=attached_images or None,
    )

    # think=False: compression must be fast — extended reasoning wastes context and hangs
    reply = await llm.complete(
        [instructions, request],
        tools=None,
        temperature=temperature,
        model=model,
        think=False,
        max_tokens=max_tokens,
    )

    summary_text = (reply.content or "").strip()
    if not summary_text:
        # Empty or whitespace-only reply: fall back to a fixed placeholder.
        summary_text = "(summary unavailable)"

    summary_msg = Message(
        role="user",
        content=f"[Context Summary]\n{summary_text}",
        is_summary=True,
        created_at=datetime.now(timezone.utc),
    )

    compressed = [*system_msgs, summary_msg, *recent_messages]
    return compressed, summary_text