diff --git a/docs/tech_debt_review_2026-04-29.md b/docs/tech_debt_review_2026-04-29.md index aa3cabc..ec81c72 100644 --- a/docs/tech_debt_review_2026-04-29.md +++ b/docs/tech_debt_review_2026-04-29.md @@ -86,7 +86,7 @@ | 51 | **Frontend: imprecise relative-time updates** | `webclient/src/composables/useTime.js` | 36 | 30-second interval means "just now" can be stale for up to 30s before jumping. | | 52 | **Frontend: stream message ID collision risk** | `webclient/src/stores/chat.js` | 148 | `id: 'stream_${Date.now()}'` could collide within the same millisecond on rapid reconnect. | | 53 | **[FIXED] Frontend: flawed meta-row visibility logic** | `webclient/src/components/messages/AssistantMessage.vue` | 102-104 | `hasMeta` uses `|| props.msg.tool_call_count`, so `0` is falsy and may incorrectly hide the meta row. | -| 54 | **Dead `stream()` method on LLM backends** | `navi/llm/ollama.py:149`, `navi/llm/fallback.py:168` | `OllamaBackend.stream()` and `FallbackOllamaBackend.stream()` exist in code but are never called by `agent.py` or `messages.py`. Only `stream_complete()` is used. Historical purpose unknown — investigate before removing. May have been for a prior streaming architecture where thinking/tool deltas were separate from `stream_complete`. | +| 54 | **[FIXED] Dead `stream()` method on LLM backends** | `navi/llm/ollama.py`, `navi/llm/fallback.py` | 149, 168 | `OllamaBackend.stream()` and `FallbackOllamaBackend.stream()` existed in code but were never called by `agent.py` or `messages.py`. Only `stream_complete()` was used. Removed from all backends and the base `LLMBackend` interface. | --- diff --git a/navi/llm/base.py b/navi/llm/base.py index 642c895..2a747fe 100644 --- a/navi/llm/base.py +++ b/navi/llm/base.py @@ -96,15 +96,6 @@ """Single-shot completion. 
Used in the agent tool-calling loop.""" @abstractmethod - async def stream( - self, - messages: list[Message], - temperature: float = 0.7, - model: str | None = None, - ) -> AsyncGenerator[LLMChunk, None]: - """Streaming text completion (no tool calling). Used for final response streaming.""" - - @abstractmethod async def stream_complete( self, messages: list[Message], diff --git a/navi/llm/fallback.py b/navi/llm/fallback.py index e79c086..f54bf87 100644 --- a/navi/llm/fallback.py +++ b/navi/llm/fallback.py @@ -190,50 +190,6 @@ raise LLMBackendError(f"All backends exhausted: {last_err}") from last_err - async def stream( - self, - messages: list[Message], - temperature: float = 0.7, - model: "list[str] | str | None" = None, - top_k: int | None = None, - top_p: float | None = None, - num_thread: int | None = None, - ) -> AsyncGenerator[LLMChunk, None]: - models = self._model_list(model) - last_err: Exception = LLMBackendError("No backends configured") - - for server in self._servers: - if _is_dead_server(server.host): - continue - for m in models: - if _is_dead_model(server.host, m): - continue - try: - gen = self._get_client(server).stream( - messages, temperature=temperature, model=m, top_k=top_k, top_p=top_p, num_thread=num_thread, - ) - first = await gen.__anext__() - except StopAsyncIteration: - last_err = LLMModelNotFoundError("Empty stream from server") - continue - except LLMConnectionError as e: - log.warning("fallback.server_dead", host=server.host, error=str(e)) - _dead_servers[server.host] = time.monotonic() - last_err = e - break - except LLMModelNotFoundError as e: - log.warning("fallback.model_dead", host=server.host, model=m, error=str(e)) - _dead_models[(server.host, m)] = time.monotonic() - last_err = e - continue - else: - yield first - async for chunk in gen: - yield chunk - return - - raise LLMBackendError(f"All backends exhausted: {last_err}") from last_err - async def stream_complete( self, messages: list[Message], diff --git 
a/navi/llm/ollama.py b/navi/llm/ollama.py index 9268410..5f805df 100644 --- a/navi/llm/ollama.py +++ b/navi/llm/ollama.py @@ -146,38 +146,6 @@ except Exception as e: raise _classify_error(e) from e - async def stream( - self, - messages: list[Message], - temperature: float = 0.7, - model: "list[str] | str | None" = None, - top_k: int | None = None, - top_p: float | None = None, - num_thread: int | None = None, - ) -> AsyncGenerator[LLMChunk, None]: - resolved = _resolve_model(model, self.model) - try: - async for chunk in await self._client.chat( - model=resolved, - messages=_to_ollama_messages(messages), - options=_base_options(temperature, top_k=top_k, top_p=top_p, num_thread=num_thread), - stream=True, - ): - thinking = getattr(chunk.message, "thinking", None) or None - delta = chunk.message.content or None - finish_reason = "stop" if chunk.done else None - yield LLMChunk( - delta=delta, - thinking=thinking, - finish_reason=finish_reason, - prompt_tokens=chunk.prompt_eval_count if chunk.done else None, - completion_tokens=chunk.eval_count if chunk.done else None, - ) - except (LLMConnectionError, LLMModelNotFoundError, LLMBackendError): - raise - except Exception as e: - raise _classify_error(e) from e - async def stream_complete( self, messages: list[Message], diff --git a/navi/llm/openai_backend.py b/navi/llm/openai_backend.py index dfdd7af..e6ac77b 100644 --- a/navi/llm/openai_backend.py +++ b/navi/llm/openai_backend.py @@ -26,15 +26,6 @@ ) -> LLMResponse: raise NotImplementedError("OpenAI backend not yet implemented") - async def stream( - self, - messages: list[Message], - temperature: float = 0.7, - model: str | None = None, - ) -> AsyncGenerator[LLMChunk, None]: - raise NotImplementedError("OpenAI backend not yet implemented") - yield # makes this a generator - async def stream_complete( self, messages: list[Message],