diff --git a/navi/config.py b/navi/config.py index 38f48ce..b4887eb 100644 --- a/navi/config.py +++ b/navi/config.py @@ -61,6 +61,7 @@ context_compression_threshold: float = 0.80 # trigger at 80% of ollama_num_ctx context_keep_recent: int = 8 # conversational turns to keep verbatim context_summary_temperature: float = 0.3 + context_summary_max_tokens: int = 1024 # max output tokens for the summary LLM call # Global personality prompt prepended to every agent's system prompt. # Multi-line values don't survive .env parsing reliably, so prefer diff --git a/navi/core/compressor.py b/navi/core/compressor.py index 383ed7d..c79420f 100644 --- a/navi/core/compressor.py +++ b/navi/core/compressor.py @@ -130,6 +130,7 @@ model: str, temperature: float, keep_recent: int, + max_tokens: int | None = None, ) -> tuple[list[Message], str] | None: """ Summarize old messages in the LLM context and return a shorter context list. @@ -163,7 +164,9 @@ ] # think=False: compression must be fast — extended reasoning wastes context and hangs - response = await llm.complete(prompt, tools=None, temperature=temperature, model=model, think=False) + response = await llm.complete( + prompt, tools=None, temperature=temperature, model=model, think=False, max_tokens=max_tokens + ) summary_text = (response.content or "").strip() or "(summary unavailable)" summary_msg = Message( diff --git a/navi/llm/base.py b/navi/llm/base.py index 5b688e2..8e13c03 100644 --- a/navi/llm/base.py +++ b/navi/llm/base.py @@ -77,6 +77,7 @@ temperature: float = 0.7, model: str | None = None, think: bool | None = None, + max_tokens: int | None = None, ) -> LLMResponse: """Single-shot completion. Used in the agent tool-calling loop.""" diff --git a/navi/llm/ollama.py b/navi/llm/ollama.py index 578f73c..5ca96d7 100644 --- a/navi/llm/ollama.py +++ b/navi/llm/ollama.py @@ -30,12 +30,18 @@ return [t.model_dump() for t in tools] -def _base_options(temperature: float, think: bool | None = None) -> dict: +def _base_options( + temperature: float, + think: bool | None = None, + max_tokens: int | None = None, +) -> dict: opts: dict = {"temperature": temperature, "num_ctx": settings.ollama_num_ctx} # think=None → use global setting; think=False → force off even if global is True effective_think = settings.ollama_think if think is None else think if effective_think: opts["think"] = True + if max_tokens is not None: + opts["num_predict"] = max_tokens return opts @@ -51,12 +57,13 @@ temperature: float = 0.7, model: str | None = None, think: bool | None = None, + max_tokens: int | None = None, ) -> LLMResponse: try: kwargs: dict = { "model": model or self.model, "messages": _to_ollama_messages(messages), - "options": _base_options(temperature, think=think), + "options": _base_options(temperature, think=think, max_tokens=max_tokens), "stream": False, } if tools: diff --git a/navi/workers/compressor.py b/navi/workers/compressor.py index 2eb2845..1d6c080 100644 --- a/navi/workers/compressor.py +++ b/navi/workers/compressor.py @@ -34,6 +34,7 @@ model=ctx.model, temperature=settings.context_summary_temperature, keep_recent=settings.context_keep_recent, + max_tokens=settings.context_summary_max_tokens, ) except Exception: log.warning("compression_worker.llm_failed", session_id=ctx.session_id, exc_info=True)