diff --git a/navi/core/agent.py b/navi/core/agent.py index 5b40650..7592b36 100644 --- a/navi/core/agent.py +++ b/navi/core/agent.py @@ -225,7 +225,7 @@ if _is_first_message or profile.planning_enabled: async for _ev in self._planning.run(session.context, profile, llm, mem, tool_schemas, messages=session.messages, force_plan=_force_plan): if isinstance(_ev, AIHelperTokensUsed): - turn_ctx.subagent_tokens += _ev.total + turn_ctx.subagent_tokens += _ev.completion_tokens elif isinstance(_ev, PlanningDebugData): session.planning_logs.append(_ev.log) # Cap to prevent unbounded DB growth on long sessions @@ -562,10 +562,10 @@ if state.thinking_active: yield ThinkingEnd() break - if chunk.prompt_tokens is not None or chunk.completion_tokens is not None: - _iter_tokens = (chunk.prompt_tokens or 0) + (chunk.completion_tokens or 0) - turn_ctx.turn_tokens += _iter_tokens - state.context_tokens = _iter_tokens + if chunk.prompt_tokens is not None: + state.context_tokens = chunk.prompt_tokens + if chunk.completion_tokens is not None: + turn_ctx.turn_tokens += chunk.completion_tokens if chunk.thinking: state.accumulated_thinking += chunk.thinking if not state.thinking_active: diff --git a/navi/core/events.py b/navi/core/events.py index a5e579b..1a05d33 100644 --- a/navi/core/events.py +++ b/navi/core/events.py @@ -76,11 +76,11 @@ """Marks the end of the streaming response.""" full_content: str - context_tokens: int | None = None # total tokens used in this turn + context_tokens: int | None = None # prompt tokens for the last LLM call max_context_tokens: int = 0 # ollama_num_ctx from config elapsed_seconds: float | None = None tool_call_count: int = 0 - token_count: int | None = None # same as context_tokens; kept separate for clarity + token_count: int | None = None # completion tokens generated in this turn message_index: int | None = None # raw index of the first assistant msg in this turn group def to_wire(self) -> dict: diff --git a/navi/core/subagent_runner.py b/navi/core/subagent_runner.py index e37577b..6da19e2 100644 --- a/navi/core/subagent_runner.py +++ b/navi/core/subagent_runner.py @@ -223,7 +223,7 @@ is_subagent=True, ): if isinstance(_ev, AIHelperTokensUsed): - _turn_tokens += _ev.total + _turn_tokens += _ev.completion_tokens elif sink is not None: await sink.put(_ev) @@ -276,13 +276,8 @@ first_chunk_timeout=settings.llm_stream_first_chunk_timeout, chunk_timeout=settings.llm_stream_chunk_timeout, ): - if ( - chunk.prompt_tokens is not None - or chunk.completion_tokens is not None - ): - _turn_tokens += (chunk.prompt_tokens or 0) + ( - chunk.completion_tokens or 0 - ) + if chunk.completion_tokens is not None: + _turn_tokens += chunk.completion_tokens if chunk.thinking: if thinking_started_at is None: thinking_started_at = time.monotonic() diff --git a/tests/unit/core/test_agent.py b/tests/unit/core/test_agent.py index 56fee06..e48fa6c 100644 --- a/tests/unit/core/test_agent.py +++ b/tests/unit/core/test_agent.py @@ -95,7 +95,7 @@ @pytest.mark.asyncio async def test_run_token_accumulation(self, agent, session): - """_turn_tokens accumulates across tool-calling iterations.""" + """_turn_tokens accumulates completion tokens across tool-calling iterations.""" backend = FakeLLMBackend( responses=["", "done"], tool_calls=[ @@ -110,8 +110,8 @@ await agent.run(session.id, "do something") saved = await agent._sessions.get(session.id) final_msg = saved.messages[-1] - # Two iterations × (10 + 5) = 30 tokens - assert final_msg.token_count == 30 + # Two iterations × 5 completion tokens = 10 tokens + assert final_msg.token_count == 10 @pytest.mark.asyncio async def test_run_max_iterations(self, agent, session): @@ -209,9 +209,9 @@ if isinstance(ev, StreamEnd): events.append(ev) - assert events[0].token_count == 150 + assert events[0].token_count == 50 saved = await agent._sessions.get(session.id) - assert saved.messages[-1].token_count == 150 + assert saved.messages[-1].token_count == 50 # ─── run_ephemeral() tests ─────────────────────────────────────────────────── @@ -283,10 +283,10 @@ if isinstance(item, SubagentComplete): subagent_complete = item - # Planning tokens: (5+10) + (3+7) = 25 + # Planning completion tokens: 10 + 7 = 17 # Final LLM call: 0 (no tokens in FakeLLMBackend default) assert subagent_complete is not None - assert subagent_complete.token_count == 25 + assert subagent_complete.token_count == 17 finally: current_event_sink.reset(token) agent._planning.run = original_planning_run