diff --git a/navi/core/agent.py b/navi/core/agent.py index 4363c7e..af452a0 100644 --- a/navi/core/agent.py +++ b/navi/core/agent.py @@ -282,6 +282,7 @@ model=profile.model, top_k=profile.top_k, top_p=profile.top_p, + num_thread=profile.num_thread, ) if response.finish_reason == "stop" or not response.tool_calls: @@ -453,6 +454,7 @@ think=profile.think_enabled, top_k=profile.top_k, top_p=profile.top_p, + num_thread=profile.num_thread, ), stop_event=stop_event, first_chunk_timeout=settings.llm_stream_first_chunk_timeout, @@ -731,6 +733,7 @@ think=profile.think_enabled, top_k=profile.top_k, top_p=profile.top_p, + num_thread=profile.num_thread, ), stop_event=stop_event, first_chunk_timeout=settings.llm_stream_first_chunk_timeout, diff --git a/navi/llm/fallback.py b/navi/llm/fallback.py index 3c2862a..d1eaff7 100644 --- a/navi/llm/fallback.py +++ b/navi/llm/fallback.py @@ -76,6 +76,7 @@ max_tokens: int | None = None, top_k: int | None = None, top_p: float | None = None, + num_thread: int | None = None, ) -> LLMResponse: models = self._model_list(model) last_err: Exception = LLMBackendError("No backends configured") @@ -90,7 +91,7 @@ return await self._get_client(server).complete( messages, tools=tools, temperature=temperature, model=m, think=think, max_tokens=max_tokens, - top_k=top_k, top_p=top_p, + top_k=top_k, top_p=top_p, num_thread=num_thread, ) except LLMConnectionError as e: log.warning("fallback.server_dead", host=server.host, error=str(e)) @@ -112,6 +113,7 @@ model: "list[str] | str | None" = None, top_k: int | None = None, top_p: float | None = None, + num_thread: int | None = None, ) -> AsyncGenerator[LLMChunk, None]: models = self._model_list(model) last_err: Exception = LLMBackendError("No backends configured") @@ -124,7 +126,7 @@ continue try: gen = self._get_client(server).stream( - messages, temperature=temperature, model=m, top_k=top_k, top_p=top_p + messages, temperature=temperature, model=m, top_k=top_k, top_p=top_p, num_thread=num_thread, ) first = await gen.__anext__() except StopAsyncIteration: @@ -156,6 +158,7 @@ think: bool | None = None, top_k: int | None = None, top_p: float | None = None, + num_thread: int | None = None, ) -> AsyncGenerator[LLMChunk, None]: models = self._model_list(model) last_err: Exception = LLMBackendError("No backends configured") @@ -169,7 +172,7 @@ try: gen = self._get_client(server).stream_complete( messages, tools=tools, temperature=temperature, model=m, think=think, - top_k=top_k, top_p=top_p, + top_k=top_k, top_p=top_p, num_thread=num_thread, ) first = await gen.__anext__() except StopAsyncIteration: diff --git a/navi/llm/ollama.py b/navi/llm/ollama.py index c12d27c..e09fe71 100644 --- a/navi/llm/ollama.py +++ b/navi/llm/ollama.py @@ -36,6 +36,7 @@ max_tokens: int | None = None, top_k: int | None = None, top_p: float | None = None, + num_thread: int | None = None, ) -> dict: opts: dict = {"temperature": temperature, "num_ctx": settings.ollama_num_ctx} # think=None → use global setting; think=False → force off even if global is True @@ -48,6 +49,8 @@ opts["top_k"] = top_k if top_p is not None: opts["top_p"] = top_p + if num_thread is not None: + opts["num_thread"] = num_thread return opts @@ -92,13 +95,14 @@ max_tokens: int | None = None, top_k: int | None = None, top_p: float | None = None, + num_thread: int | None = None, ) -> LLMResponse: resolved = _resolve_model(model, self.model) try: kwargs: dict = { "model": resolved, "messages": _to_ollama_messages(messages), - "options": _base_options(temperature, think=think, max_tokens=max_tokens, top_k=top_k, top_p=top_p), + "options": _base_options(temperature, think=think, max_tokens=max_tokens, top_k=top_k, top_p=top_p, num_thread=num_thread), "stream": False, } if tools: @@ -139,13 +143,14 @@ model: "list[str] | str | None" = None, top_k: int | None = None, top_p: float | None = None, + num_thread: int | None = None, ) -> AsyncGenerator[LLMChunk, None]: resolved = _resolve_model(model, self.model) try: async for chunk in await self._client.chat( model=resolved, messages=_to_ollama_messages(messages), - options=_base_options(temperature, top_k=top_k, top_p=top_p), + options=_base_options(temperature, top_k=top_k, top_p=top_p, num_thread=num_thread), stream=True, ): thinking = getattr(chunk.message, "thinking", None) or None @@ -172,13 +177,14 @@ think: bool | None = None, top_k: int | None = None, top_p: float | None = None, + num_thread: int | None = None, ) -> AsyncGenerator[LLMChunk, None]: resolved = _resolve_model(model, self.model) try: kwargs: dict = { "model": resolved, "messages": _to_ollama_messages(messages), - "options": _base_options(temperature, think=think, top_k=top_k, top_p=top_p), + "options": _base_options(temperature, think=think, top_k=top_k, top_p=top_p, num_thread=num_thread), "stream": True, } if tools: diff --git a/navi/profiles/base.py b/navi/profiles/base.py index 8e1553d..ee96761 100644 --- a/navi/profiles/base.py +++ b/navi/profiles/base.py @@ -26,6 +26,9 @@ temperature: float = 0.7 top_k: int | None = None top_p: float | None = None + # Number of CPU threads for local inference. None = Ollama default (physical cores). + # Cloud models ignore this option. + num_thread: int | None = None planning_enabled: bool = False # if True, run a planning LLM call before the main loop # Profile discoverability — used for system prompt injection and list_profiles tool. diff --git a/navi/profiles/developer/config.json b/navi/profiles/developer/config.json index 89c528c..b0ca59f 100644 --- a/navi/profiles/developer/config.json +++ b/navi/profiles/developer/config.json @@ -10,8 +10,8 @@ }, "llm_backend": "ollama", "model": [ - "qwen3.6:35b", "gemma4:31b-cloud", + "qwen3.6:35b", "gemma4:26b-a4b-it-q4_K_M" ], "temperature": 0.45, @@ -64,5 +64,6 @@ "planning_phase2_enabled": true, "planning_phase3_enabled": true, "top_k": 40, - "top_p": 0.88 + "top_p": 0.88, + "num_thread": 11 } diff --git a/navi/profiles/discuss/config.json b/navi/profiles/discuss/config.json index 136e29c..6844811 100644 --- a/navi/profiles/discuss/config.json +++ b/navi/profiles/discuss/config.json @@ -4,8 +4,8 @@ "description": "Creative partner for Q&A, brainstorming, and idea exploration. High creativity, free-form thinking.", "short_description": "Creative Q&A and idea discussion — best for open questions, brainstorming, and exploring concepts.", "model": [ - "qwen3.6:35b", "gemma4:31b-cloud", + "qwen3.6:35b", "gemma4:26b-a4b-it-q4_K_M" ], "temperature": 0.85, @@ -39,5 +39,6 @@ "adaptive_replan_enabled": false, "subagent_planning_enabled": false, "top_k": 80, - "top_p": 0.95 + "top_p": 0.95, + "num_thread": 11 } diff --git a/navi/profiles/loader.py b/navi/profiles/loader.py index 456805a..76f2200 100644 --- a/navi/profiles/loader.py +++ b/navi/profiles/loader.py @@ -74,6 +74,7 @@ temperature=config.get("temperature", 0.7), top_k=config.get("top_k", None), top_p=config.get("top_p", None), + num_thread=config.get("num_thread", None), max_iterations=config.get("max_iterations", 20), planning_enabled=config.get("planning_enabled", False), planning_mandatory=config.get("planning_mandatory", False), diff --git a/navi/profiles/secretary/config.json b/navi/profiles/secretary/config.json index 8876f69..a9f0d3b 100644 --- a/navi/profiles/secretary/config.json +++ b/navi/profiles/secretary/config.json @@ -10,8 +10,8 @@ }, "llm_backend": "ollama", "model": [ - "qwen3.6:35b", "gemma4:31b-cloud", + "qwen3.6:35b", "gemma4:26b-a4b-it-q4_K_M" ], "temperature": 0.65, @@ -62,5 +62,6 @@ "planning_phase2_enabled": true, "planning_phase3_enabled": true, "top_k": 50, - "top_p": 0.90 + "top_p": 0.90, + "num_thread": 11 } diff --git a/navi/profiles/server_admin/config.json b/navi/profiles/server_admin/config.json index 551fa91..80d6c70 100644 --- a/navi/profiles/server_admin/config.json +++ b/navi/profiles/server_admin/config.json @@ -10,8 +10,8 @@ }, "llm_backend": "ollama", "model": [ - "qwen3.6:35b", "gemma4:31b-cloud", + "qwen3.6:35b", "gemma4:26b-a4b-it-q4_K_M" ], "temperature": 0.30, @@ -64,5 +64,6 @@ "planning_phase2_enabled": true, "planning_phase3_enabled": true, "top_k": 30, - "top_p": 0.80 + "top_p": 0.80, + "num_thread": 11 } diff --git a/navi/profiles/tool_developer/config.json b/navi/profiles/tool_developer/config.json index ac9f986..9765021 100644 --- a/navi/profiles/tool_developer/config.json +++ b/navi/profiles/tool_developer/config.json @@ -10,8 +10,8 @@ }, "llm_backend": "ollama", "model": [ - "qwen3.6:35b", "gemma4:31b-cloud", + "qwen3.6:35b", "gemma4:26b-a4b-it-q4_K_M" ], "temperature": 0.35, @@ -73,5 +73,6 @@ "planning_phase2_enabled": true, "planning_phase3_enabled": true, "top_k": 40, - "top_p": 0.85 + "top_p": 0.85, + "num_thread": 11 }