diff --git a/navi/config.py b/navi/config.py
index 38f48ce..b4887eb 100644
--- a/navi/config.py
+++ b/navi/config.py
@@ -61,6 +61,7 @@
     context_compression_threshold: float = 0.80   # trigger at 80% of ollama_num_ctx
     context_keep_recent: int = 8                   # conversational turns to keep verbatim
     context_summary_temperature: float = 0.3
+    context_summary_max_tokens: int = 1024         # max output tokens for the summary LLM call
 
     # Global personality prompt prepended to every agent's system prompt.
     # Multi-line values don't survive .env parsing reliably, so prefer
diff --git a/navi/core/compressor.py b/navi/core/compressor.py
index 383ed7d..c79420f 100644
--- a/navi/core/compressor.py
+++ b/navi/core/compressor.py
@@ -130,6 +130,7 @@
     model: str,
     temperature: float,
     keep_recent: int,
+    max_tokens: int | None = None,
 ) -> tuple[list[Message], str] | None:
     """
     Summarize old messages in the LLM context and return a shorter context list.
@@ -163,7 +164,9 @@
     ]
 
     # think=False: compression must be fast — extended reasoning wastes context and hangs
-    response = await llm.complete(prompt, tools=None, temperature=temperature, model=model, think=False)
+    response = await llm.complete(
+        prompt, tools=None, temperature=temperature, model=model, think=False, max_tokens=max_tokens
+    )
     summary_text = (response.content or "").strip() or "(summary unavailable)"
 
     summary_msg = Message(
diff --git a/navi/llm/base.py b/navi/llm/base.py
index 5b688e2..8e13c03 100644
--- a/navi/llm/base.py
+++ b/navi/llm/base.py
@@ -77,6 +77,7 @@
         temperature: float = 0.7,
         model: str | None = None,
         think: bool | None = None,
+        max_tokens: int | None = None,
     ) -> LLMResponse:
         """Single-shot completion. Used in the agent tool-calling loop."""
 
diff --git a/navi/llm/ollama.py b/navi/llm/ollama.py
index 578f73c..5ca96d7 100644
--- a/navi/llm/ollama.py
+++ b/navi/llm/ollama.py
@@ -30,12 +30,18 @@
     return [t.model_dump() for t in tools]
 
 
-def _base_options(temperature: float, think: bool | None = None) -> dict:
+def _base_options(
+    temperature: float,
+    think: bool | None = None,
+    max_tokens: int | None = None,  # optional output cap; None → "num_predict" is left unset
+) -> dict:
     opts: dict = {"temperature": temperature, "num_ctx": settings.ollama_num_ctx}
     # think=None → use global setting; think=False → force off even if global is True
     effective_think = settings.ollama_think if think is None else think
     if effective_think:
         opts["think"] = True
+    if max_tokens is not None:
+        opts["num_predict"] = max_tokens  # num_predict: Ollama's limit on generated tokens
     return opts
 
 
@@ -51,12 +57,13 @@
         temperature: float = 0.7,
         model: str | None = None,
         think: bool | None = None,
+        max_tokens: int | None = None,
     ) -> LLMResponse:
         try:
             kwargs: dict = {
                 "model": model or self.model,
                 "messages": _to_ollama_messages(messages),
-                "options": _base_options(temperature, think=think),
+                "options": _base_options(temperature, think=think, max_tokens=max_tokens),
                 "stream": False,
             }
             if tools:
diff --git a/navi/workers/compressor.py b/navi/workers/compressor.py
index 2eb2845..1d6c080 100644
--- a/navi/workers/compressor.py
+++ b/navi/workers/compressor.py
@@ -34,6 +34,7 @@
                 model=ctx.model,
                 temperature=settings.context_summary_temperature,
                 keep_recent=settings.context_keep_recent,
+                max_tokens=settings.context_summary_max_tokens,
             )
         except Exception:
             log.warning("compression_worker.llm_failed", session_id=ctx.session_id, exc_info=True)