"""
Context compressor — summarizes old messages to stay within the token limit.
Flow:
1. Partition session messages into "to_summarize" (old turns) and "to_keep" (recent turns).
2. Call the LLM to produce a concise bullet-point summary of the old turns.
3. Replace the old turns with a single summary message (role=user, is_summary=True).
A "turn" is one user message plus all following assistant/tool messages up to the
next user message. Tool call groups (assistant + tool results) are never split.
Existing summary messages are always folded into the next compression pass.
"""
import json
from datetime import datetime, timezone
from navi.llm.base import LLMBackend, Message
# System prompt for the summarization request: compress_context sends it as the
# content of the system message, followed by the rendered transcript as the user
# message. The "##" headings name the sections the summary is asked to cover
# (goal, work state, key facts, outputs, errors, user preferences).
# This text is runtime input to the model — editing it changes summarizer behavior.
_SUMMARIZE_SYSTEM = (
    "You are summarizing a conversation history to free up context space. "
    "The assistant will continue working using ONLY this summary — it will have no access "
    "to the original messages. Be thorough and precise. Prefer specifics over generalities.\n\n"
    "## Current goal\n"
    "What the user is trying to accomplish in this session. Include any stated deadlines, "
    "constraints, or acceptance criteria.\n\n"
    "## Work state\n"
    "What has been completed (be specific — name files, functions, endpoints, steps). "
    "What is still in progress or pending. Any blockers or open questions.\n\n"
    "## Key facts\n"
    "Everything the user told the assistant: preferences, system details, environment info, "
    "decisions made, constraints discovered, explicit instructions. Include exact values "
    "(port numbers, file paths, config keys, IDs) — do not paraphrase if precision matters.\n\n"
    "## Outputs\n"
    "Every file created or modified (full paths). Every config value set. "
    "Commands run and their output (include verbatim if relevant). "
    "Code snippets, SQL, configs, or other artifacts produced — include verbatim if short, "
    "summarize with key details if long.\n\n"
    "## Errors\n"
    "Failures, exceptions, or unexpected results encountered. "
    "How each was resolved — or that it remains unresolved.\n\n"
    "## User preferences and feedback\n"
    "Corrections the user made to the assistant's approach. Explicit style or behavior "
    "preferences stated during this session. Things the user said not to do.\n\n"
    "Do not include greetings, filler, transitions, or meta-commentary about the summary itself. "
    "Write in tight prose or bullet points — whatever preserves more information per token."
)
def should_compress(context_tokens: int, max_context_tokens: int, threshold: float) -> bool:
    """
    Report whether the context has grown large enough to warrant compression.

    The trigger point is ``max_context_tokens * threshold`` truncated to an
    integer; compression is due once ``context_tokens`` reaches or exceeds it.
    """
    trigger_at = int(max_context_tokens * threshold)
    if context_tokens < trigger_at:
        return False
    return True
def partition_messages(
    messages: list[Message], keep_recent: int
) -> tuple[list[Message], list[Message]]:
    """
    Split a conversation into (to_summarize, to_keep).

    System messages are excluded from BOTH halves; a caller that needs them
    must re-attach them itself. The remaining messages are grouped into turns:
    a turn starts at each user message and runs up to (not including) the next
    one, so an assistant message and the tool results that follow it are never
    separated. Any messages that precede the first user message form a turn of
    their own.

    The last `keep_recent` turns are returned in to_keep; all older turns are
    flattened into to_summarize. If there are no more than `keep_recent` turns,
    to_summarize is empty and to_keep holds every non-system message.

    `keep_recent` values of 0 or below keep nothing: every turn goes to
    to_summarize.
    """
    non_system = [m for m in messages if m.role != "system"]

    # Group into turns: a user message (or the very first message) opens a new turn.
    turns: list[list[Message]] = []
    for msg in non_system:
        if msg.role == "user" or not turns:
            turns.append([msg])
        else:
            turns[-1].append(msg)

    keep = max(keep_recent, 0)
    if len(turns) <= keep:
        return [], non_system  # nothing old enough to compress

    # Split on an explicit index rather than `turns[:-keep]` / `turns[-keep:]`:
    # with keep == 0 those slices evaluate to `[]` and "everything", which would
    # silently disable compression.
    split = len(turns) - keep
    to_summarize = [m for turn in turns[:split] for m in turn]
    to_keep = [m for turn in turns[split:] for m in turn]
    return to_summarize, to_keep
def _format_for_summary(messages: list[Message]) -> tuple[str, list[str]]:
    """
    Render messages as plain text for the summarization prompt.

    Returns (text, images): `text` is one rendered line per item joined with
    newlines, and `images` is a flat list of the image payloads collected from
    the non-summary user messages, in order.

    Rendering rules:
    - A message flagged `is_summary` is emitted verbatim (it is already compressed).
    - A user message becomes ``User: <content>`` with an ``[+ N image(s)]`` note
      when it carries images; a user message with neither content nor images
      produces no line.
    - An assistant message with tool calls emits its text (if any), then one
      ``[Called tool ...]`` line per call, then one ``[Tool ... returned: ...]``
      line for each tool message that immediately follows it.
    - A plain assistant message becomes ``Assistant: <content>``.
    - Anything else (tool messages not preceded by a tool-calling assistant
      message, assistant messages with no content and no tool calls, other roles)
      is skipped.

    Tool arguments are cut to 120 characters and tool results to 300; a
    trailing "…" marks any cut so the summarizer knows the value is incomplete.
    """
    max_args_chars = 120
    max_result_chars = 300

    lines: list[str] = []
    images: list[str] = []
    i = 0
    while i < len(messages):
        m = messages[i]
        if m.is_summary:
            # Existing summary — include as-is (already compressed)
            lines.append(m.content or "")
            i += 1
        elif m.role == "user":
            if m.images:
                images.extend(m.images)
                img_note = f" [+ {len(m.images)} image(s)]"
            else:
                img_note = ""
            if m.content:
                lines.append(f"User: {m.content}{img_note}")
            elif img_note:
                lines.append(f"User:{img_note}")
            i += 1
        elif m.role == "assistant" and m.tool_calls:
            # Text that accompanies the tool calls is part of the conversation too;
            # without this line it would be lost from the summary input.
            if m.content:
                lines.append(f"Assistant: {m.content}")
            # Render tool calls + their results as a compact block
            for tc in m.tool_calls:
                args_text = json.dumps(tc.arguments)
                args_preview = args_text[:max_args_chars] + (
                    "…" if len(args_text) > max_args_chars else ""
                )
                lines.append(f"[Called tool `{tc.name}` with {args_preview}]")
            i += 1
            # Consume the run of tool results that directly follows the call.
            while i < len(messages) and messages[i].role == "tool":
                result = messages[i].content or ""
                preview = result[:max_result_chars] + (
                    "…" if len(result) > max_result_chars else ""
                )
                lines.append(f"[Tool `{messages[i].name}` returned: {preview}]")
                i += 1
        elif m.role == "assistant" and m.content:
            lines.append(f"Assistant: {m.content}")
            i += 1
        else:
            i += 1  # skip orphan tool messages and empty assistant messages
    return "\n".join(lines), images
# Safety limit on the summarizer's input, measured in characters (not tokens).
# compress_context cuts the formatted transcript to this length and appends a
# "…[truncated]" marker, so the summarizer is not handed near-context-sized
# input that it can't fit alongside its own output.
_MAX_SUMMARY_INPUT_CHARS = 24_000
async def compress_context(
    context: list[Message],
    llm: LLMBackend,
    model: str,
    temperature: float,
    keep_recent: int,
    max_tokens: int | None = None,
) -> tuple[list[Message], str] | None:
    """
    Replace the older part of `context` with a single LLM-written summary message.

    The input list is not modified; a new list is built and returned.

    Steps:
    1. Split `context` with partition_messages(context, keep_recent).
    2. Render the old messages with _format_for_summary, cutting the text at
       _MAX_SUMMARY_INPUT_CHARS (a "…[truncated]" marker is appended when cut).
    3. Ask `llm` for a summary, sending _SUMMARIZE_SYSTEM as the system prompt
       and the rendered text (plus any collected images) as the user message.
    4. Wrap the reply in a user-role message flagged is_summary=True.

    Returns:
        ``(new_context, summary_text)`` where new_context is the system messages,
        then the summary message, then the kept recent messages — or ``None``
        when fewer than two messages are eligible for summarization.
        If the model returns empty or whitespace-only content, summary_text is
        "(summary unavailable)".

    No exception handling is done here; errors from `llm.complete` propagate
    to the caller.
    """
    old_messages, recent_messages = partition_messages(context, keep_recent)
    if len(old_messages) < 2:
        return None  # nothing substantial to compress

    transcript, old_images = _format_for_summary(old_messages)
    # Cap the transcript so the summarizer has room left to generate output.
    if len(transcript) > _MAX_SUMMARY_INPUT_CHARS:
        transcript = transcript[:_MAX_SUMMARY_INPUT_CHARS] + "\n…[truncated]"

    request = [
        Message(role="system", content=_SUMMARIZE_SYSTEM),
        Message(role="user", content=transcript, images=old_images or None),
    ]
    # think=False: compression must be fast — extended reasoning wastes context and hangs
    reply = await llm.complete(
        request,
        tools=None,
        temperature=temperature,
        model=model,
        think=False,
        max_tokens=max_tokens,
    )

    summary_text = (reply.content or "").strip()
    if not summary_text:
        summary_text = "(summary unavailable)"

    summary_msg = Message(
        role="user",
        content=f"[Context Summary]\n{summary_text}",
        is_summary=True,
        created_at=datetime.now(timezone.utc),
    )
    system_msgs = [m for m in context if m.role == "system"]
    return [*system_msgs, summary_msg, *recent_messages], summary_text