# Source: navi-1/navi/core/compressor.py at commit a03b29ac7c26ac92ac41d4099fde885620745a2a
# Last change: Eugene Sukhodolskiy, 15 Apr — "Add explicit output token budget for summarizer (context_summary_max_tokens)"
"""
Context compressor — summarizes old messages to stay within the token limit.

Flow:
1. Partition session messages into "to_summarize" (old turns) and "to_keep" (recent turns).
2. Call the LLM to produce a concise bullet-point summary of the old turns.
3. Replace the old turns with a single summary message (role=user, is_summary=True).

A "turn" is one user message plus all following assistant/tool messages up to the
next user message. Tool call groups (assistant + tool results) are never split.
Existing summary messages are always folded into the next compression pass.
"""

import json
from datetime import datetime, timezone

from navi.llm.base import LLMBackend, Message

# System prompt for the summarization request. compress_context sends it as the
# `system` message, followed by a single `user` message holding the rendered
# transcript of the turns being compressed.
_SUMMARIZE_SYSTEM = (
    "You are summarizing a conversation history to free up context space. "
    "Produce a structured factual summary covering: key facts the user shared, "
    "decisions made, tasks completed or in progress, important outputs or findings, "
    "any code or config snippets that were produced. "
    "Use bullet points grouped by topic. Be thorough — capture enough detail that "
    "the assistant can continue the conversation without the original messages. "
    "Do not include greetings, filler, or meta-commentary about the summary itself."
)


def should_compress(context_tokens: int, max_context_tokens: int, threshold: float) -> bool:
    """
    Tell whether the context has grown enough to warrant compression.

    The trigger point is `max_context_tokens * threshold`, truncated to an int;
    compression is due once `context_tokens` reaches or exceeds it.
    """
    trigger_at = int(max_context_tokens * threshold)
    return trigger_at <= context_tokens


def partition_messages(
    messages: list[Message], keep_recent: int
) -> tuple[list[Message], list[Message]]:
    """
    Split the conversation into old turns to summarize and recent turns to keep.

    Returns (to_summarize, to_keep).

    System messages are excluded from both lists; the caller is responsible for
    carrying them over. The remaining messages are grouped into turns: a turn
    starts at each user message and runs up to (not including) the next one, so
    an assistant message and the tool results that follow it always stay in the
    same turn. Messages that appear before the first user message form a turn of
    their own.

    The last `keep_recent` turns go into to_keep verbatim; everything older goes
    into to_summarize. If there are no more than `keep_recent` turns, nothing is
    old enough to compress and the result is ([], all non-system messages).

    `keep_recent` values of zero or below mean "keep no recent turns": every
    non-system message is returned in to_summarize.
    """
    non_system = [m for m in messages if m.role != "system"]

    # Group into turns: a user message opens a new turn, anything else joins
    # the turn currently being built.
    turns: list[list[Message]] = []
    for msg in non_system:
        if msg.role == "user" or not turns:
            turns.append([msg])
        else:
            turns[-1].append(msg)

    # Clamp so that 0 (or a negative value) cannot flip the split: with plain
    # negative slicing, turns[:-0] is empty and turns[-0:] is the whole list.
    keep_recent = max(keep_recent, 0)

    if len(turns) <= keep_recent:
        return [], non_system  # nothing old enough to compress

    split_at = len(turns) - keep_recent
    to_summarize = [m for turn in turns[:split_at] for m in turn]
    to_keep = [m for turn in turns[split_at:] for m in turn]
    return to_summarize, to_keep


def _format_for_summary(messages: list[Message]) -> tuple[str, list[str]]:
    """
    Render messages as plain text for the summarization prompt.

    Returns (text, images). `text` has one entry per rendered item, joined with
    newlines; `images` is a flat list of every entry found in the `images`
    field of the user messages, in order of appearance.

    Rendering rules:
    - A message flagged `is_summary` is included as-is (it is already compressed).
    - A user message becomes "User: <content>", with an image-count note when it
      carries images. A user message with neither content nor images is skipped.
    - An assistant message with tool calls becomes its text (if any) followed by
      one "[Called tool ...]" line per call, with the JSON arguments cut to 120
      characters. The tool messages directly following it are rendered as
      "[Tool ... returned: ...]" with the result cut to 300 characters.
    - An assistant message with content only becomes "Assistant: <content>".
    - Everything else (e.g. a tool message not preceded by a tool-call message)
      is skipped.
    """

    def clip(text: str, limit: int) -> str:
        # Cut to `limit` characters and mark the cut so the summarizer can tell
        # a preview from a complete value.
        return text[:limit] + ("…" if len(text) > limit else "")

    lines: list[str] = []
    images: list[str] = []
    # True while the messages being visited directly follow an assistant
    # tool-call message, i.e. tool results here belong to that call group.
    in_tool_group = False

    for m in messages:
        if in_tool_group and m.role == "tool":
            lines.append(f"[Tool `{m.name}` returned: {clip(m.content or '', 300)}]")
            continue
        in_tool_group = False

        if m.is_summary:
            # Existing summary — include as-is (already compressed)
            lines.append(m.content or "")

        elif m.role == "user":
            img_note = ""
            if m.images:
                images.extend(m.images)
                img_note = f" [+ {len(m.images)} image(s)]"
            if m.content:
                lines.append(f"User: {m.content}{img_note}")
            elif img_note:
                lines.append(f"User:{img_note}")

        elif m.role == "assistant" and m.tool_calls:
            # Text sent alongside the tool calls is part of the conversation
            # too; keep it so the summary does not lose it.
            if m.content:
                lines.append(f"Assistant: {m.content}")
            for tc in m.tool_calls:
                # ensure_ascii=False keeps non-ASCII arguments readable instead
                # of spending the short preview on \uXXXX escapes.
                args_preview = clip(json.dumps(tc.arguments, ensure_ascii=False), 120)
                lines.append(f"[Called tool `{tc.name}` with {args_preview}]")
            in_tool_group = True

        elif m.role == "assistant" and m.content:
            lines.append(f"Assistant: {m.content}")

        # Anything else is skipped: orphan tool messages, empty assistant messages.

    return "\n".join(lines), images


# Safety limit on the summarizer's input, measured in characters (not tokens).
# compress_context cuts the formatted transcript to this many characters and then
# appends a "…[truncated]" marker, so the final prompt text is slightly longer.
# Prevents the summarizer from receiving near-context-sized input it can't fit
# alongside its own output.
_MAX_SUMMARY_INPUT_CHARS = 24_000


async def compress_context(
    context: list[Message],
    llm: LLMBackend,
    model: str,
    temperature: float,
    keep_recent: int,
    max_tokens: int | None = None,
) -> tuple[list[Message], str] | None:
    """
    Replace the old turns of an LLM context with a single summary message.

    `context` is split by `partition_messages(context, keep_recent)`. The old
    part is rendered to text, cut to `_MAX_SUMMARY_INPUT_CHARS` if necessary,
    and sent to `llm.complete` together with the summarization system prompt
    and any images collected from the old user messages. `model`, `temperature`
    and `max_tokens` are forwarded to the LLM call unchanged.

    Returns `(new_context, summary_text)` where `new_context` is the system
    messages from `context`, then one user message flagged `is_summary=True`
    containing "[Context Summary]" plus the summary, then the recent messages.
    `summary_text` is the summary without that prefix; if the model returns
    nothing usable it is the placeholder "(summary unavailable)".

    Returns None when fewer than two messages are old enough to summarize.

    The input list is not modified; a new list is built. Nothing is caught
    here, so any exception raised by the LLM call propagates to the caller.
    """
    old_messages, recent_messages = partition_messages(context, keep_recent)
    if len(old_messages) < 2:
        # Too little material to be worth an LLM round-trip.
        return None

    preserved_system = [msg for msg in context if msg.role == "system"]

    transcript, attachments = _format_for_summary(old_messages)
    if len(transcript) > _MAX_SUMMARY_INPUT_CHARS:
        # Leave the summarizer room to generate its output.
        transcript = f"{transcript[:_MAX_SUMMARY_INPUT_CHARS]}\n…[truncated]"

    request = [
        Message(role="system", content=_SUMMARIZE_SYSTEM),
        Message(role="user", content=transcript, images=attachments or None),
    ]

    # think=False: summarization should be quick, so extended reasoning is off.
    reply = await llm.complete(
        request,
        tools=None,
        temperature=temperature,
        model=model,
        think=False,
        max_tokens=max_tokens,
    )

    summary_text = (reply.content or "").strip()
    if not summary_text:
        summary_text = "(summary unavailable)"

    summary_message = Message(
        role="user",
        content="[Context Summary]\n" + summary_text,
        is_summary=True,
        created_at=datetime.now(timezone.utc),
    )

    return [*preserved_system, summary_message, *recent_messages], summary_text