diff --git a/client/index.html b/client/index.html index 3776b26..834febd 100644 --- a/client/index.html +++ b/client/index.html @@ -36,15 +36,18 @@
- - +
+
+ + + + +
diff --git a/client/js/app.js b/client/js/app.js index 33e0ec7..4aa27bb 100644 --- a/client/js/app.js +++ b/client/js/app.js @@ -7,23 +7,25 @@ // ── DOM refs ───────────────────────────────────────────────────────────────── -const profileSelect = document.getElementById('profile-select'); -const btnNew = document.getElementById('btn-new'); -const sessionListEl = document.getElementById('session-list'); -const chatHeaderEl = document.getElementById('chat-header'); -const messagesEl = document.getElementById('messages'); -const textarea = document.getElementById('input'); -const btnSend = document.getElementById('btn-send'); +const profileSelect = document.getElementById('profile-select'); +const btnNew = document.getElementById('btn-new'); +const sessionListEl = document.getElementById('session-list'); +const chatHeaderEl = document.getElementById('chat-header'); +const messagesEl = document.getElementById('messages'); +const textarea = document.getElementById('input'); +const btnSend = document.getElementById('btn-send'); +const btnAttach = document.getElementById('btn-attach'); +const fileInput = document.getElementById('file-input'); +const previewStrip = document.getElementById('image-preview-strip'); // ── State ───────────────────────────────────────────────────────────────────── -const STORAGE_KEY = 'navi_current_session'; - let profiles = []; let sessions = []; -let currentId = localStorage.getItem(STORAGE_KEY) ?? null; +let currentId = null; let streaming = false; let currentBubble = null; +let pendingImages = []; // array of full data URLs (data:image/...;base64,...) const ws = new WsClient(); @@ -32,8 +34,11 @@ async function init() { textarea.addEventListener('keydown', onKey); textarea.addEventListener('input', autoResize); + textarea.addEventListener('paste', onPaste); btnSend.addEventListener('click', sendMessage); btnNew.addEventListener('click', newChat); + btnAttach.addEventListener('click', () => fileInput.click()); + fileInput.addEventListener('change', onFileChange); [profiles, sessions] = await Promise.all([api.getProfiles(), api.getSessions()]); @@ -43,11 +48,15 @@ renderProfiles(profileSelect, profiles); rerenderSidebar(); - // Restore last active session - if (currentId && sessions.some(s => s.session_id === currentId)) { - await openSession(currentId, false); + // Open session from URL hash, or fall back to most recently active + const hashId = location.hash.slice(1); + const targetId = hashId && sessions.some(s => s.session_id === hashId) + ? hashId + : sessions[0]?.session_id ?? null; + + if (targetId) { + await openSession(targetId, false); } else { - currentId = null; showEmptyState(messagesEl); setInputEnabled(false); } @@ -72,7 +81,7 @@ async function openSession(sessionId, skipLoad = false) { ws.disconnect(); currentId = sessionId; - localStorage.setItem(STORAGE_KEY, sessionId); + history.replaceState(null, '', '#' + sessionId); rerenderSidebar(); const s = sessions.find(s => s.session_id === sessionId); @@ -92,12 +101,38 @@ messagesEl.innerHTML = ''; try { const data = await api.getSession(sessionId); + + // Build a lookup: tool_call_id β†’ {name, arguments} from assistant tool_calls + const toolCallMap = {}; for (const msg of data.messages) { - if (msg.role === 'system') continue; - if (msg.role === 'user' || (msg.role === 'assistant' && msg.content)) { - appendMessage(messagesEl, msg.role, msg.content); + if (msg.role === 'assistant' && msg.tool_calls) { + for (const tc of msg.tool_calls) { + toolCallMap[tc.id] = { name: tc.name, args: tc.arguments ?? {} }; + } } } + + for (const msg of data.messages) { + if (msg.role === 'system') continue; + + if (msg.role === 'tool') { + const tc = toolCallMap[msg.tool_call_id] ?? { name: msg.name ?? '?', args: {} }; + const success = !msg.content?.startsWith('Error:'); + appendToolCall(messagesEl, { + tool: tc.name, + args: tc.args, + result: msg.content ?? '', + success, + }); + continue; + } + + if (msg.role === 'user' || (msg.role === 'assistant' && msg.content)) { + const imgs = msg.images?.map(b => b.startsWith('data:') ? b : `data:image/jpeg;base64,${b}`) ?? null; + appendMessage(messagesEl, msg.role, msg.content, imgs, msg.created_at ?? null); + } + } + scrollToBottom(messagesEl); } catch (e) { console.error('loadHistory', e); @@ -110,7 +145,7 @@ if (currentId === sessionId) { ws.disconnect(); currentId = null; - localStorage.removeItem(STORAGE_KEY); + history.replaceState(null, '', location.pathname); showEmptyState(messagesEl); updateChatHeader(chatHeaderEl, null); setInputEnabled(false); @@ -140,15 +175,16 @@ switch (event.type) { case 'stream_start': streaming = true; - removeTypingIndicator(messagesEl); - currentBubble = appendStreamBubble(messagesEl); + currentBubble = null; // bubble created lazily on first delta, so tool cards appear first break; case 'stream_delta': - if (currentBubble) { - currentBubble.textContent += event.delta; - scrollToBottom(messagesEl); + if (!currentBubble) { + removeTypingIndicator(messagesEl); // remove only when text actually starts + currentBubble = appendStreamBubble(messagesEl); } + currentBubble.textContent += event.delta; + scrollToBottom(messagesEl); break; case 'tool_call': @@ -171,16 +207,17 @@ function finishStream(finalContent) { streaming = false; - removeTypingIndicator(messagesEl); - if (currentBubble) { - if (finalContent !== undefined) { - finalizeStreamBubble(currentBubble, finalContent); - updatePreview(currentId, finalContent); - } else { - currentBubble.classList.remove('cursor'); + removeTypingIndicator(messagesEl); // safe to call even if already removed + if (finalContent !== undefined) { + if (!currentBubble) { + currentBubble = appendStreamBubble(messagesEl); } - currentBubble = null; + finalizeStreamBubble(currentBubble, finalContent); + updatePreview(currentId, finalContent); + } else if (currentBubble) { + currentBubble.classList.remove('cursor'); } + currentBubble = null; scrollToBottom(messagesEl); } @@ -188,17 +225,22 @@ async function sendMessage() { const text = textarea.value.trim(); - if (!text || !ws.ready || streaming) return; + if ((!text && !pendingImages.length) || !ws.ready || streaming) return; + const imagesToSend = [...pendingImages]; // full data URLs + clearImages(); textarea.value = ''; autoResize(); setInputEnabled(false); - appendMessage(messagesEl, 'user', text); + // Display with full data URLs + appendMessage(messagesEl, 'user', text || null, imagesToSend.length ? imagesToSend : null); appendTypingIndicator(messagesEl); scrollToBottom(messagesEl); - ws.send(text); + // Strip data URI prefix before sending to server (server expects raw base64) + const b64List = imagesToSend.map(d => d.split(',', 2)[1]); + ws.send(text || ' ', b64List.length ? b64List : null); } function onKey(e) { @@ -230,8 +272,9 @@ } function setInputEnabled(on) { - textarea.disabled = !on; - btnSend.disabled = !on; + textarea.disabled = !on; + btnSend.disabled = !on; + btnAttach.disabled = !on; if (on) textarea.focus(); } @@ -240,6 +283,61 @@ textarea.style.height = Math.min(textarea.scrollHeight, 180) + 'px'; } +// ── Image handling ──────────────────────────────────────────────────────────── + +function addImageFile(file) { + if (!file.type.startsWith('image/')) return; + const reader = new FileReader(); + reader.onload = (e) => { + // Store the full data URL so we retain mime type for display + pendingImages.push(e.target.result); + renderPreviewStrip(); + }; + reader.readAsDataURL(file); +} + +function onFileChange(e) { + for (const file of e.target.files) addImageFile(file); + fileInput.value = ''; +} + +function onPaste(e) { + for (const item of e.clipboardData?.items ?? []) { + if (item.kind === 'file' && item.type.startsWith('image/')) { + e.preventDefault(); + addImageFile(item.getAsFile()); + } + } +} + +function clearImages() { + pendingImages = []; + previewStrip.innerHTML = ''; +} + +function renderPreviewStrip() { + previewStrip.innerHTML = ''; + pendingImages.forEach((dataUrl, i) => { + const wrap = document.createElement('div'); + wrap.className = 'img-thumb-wrap'; + + const img = document.createElement('img'); + img.src = dataUrl; // full data URL + img.className = 'img-thumb'; + + const btn = document.createElement('button'); + btn.className = 'img-thumb-remove'; + btn.textContent = 'Γ—'; + btn.addEventListener('click', () => { + pendingImages.splice(i, 1); + renderPreviewStrip(); + }); + + wrap.append(img, btn); + previewStrip.appendChild(wrap); + }); +} + // ── Start ───────────────────────────────────────────────────────────────────── init(); diff --git a/client/js/chat.js b/client/js/chat.js index 9005d7e..a410076 100644 --- a/client/js/chat.js +++ b/client/js/chat.js @@ -27,6 +27,7 @@ code_exec: 'βš™οΈ', terminal: 'πŸ’»', ssh_exec: 'πŸ–§', + image_view: 'πŸ–ΌοΈ', }; // ── Helpers ─────────────────────────────────────────────────────────────────── @@ -53,24 +54,40 @@ /** * Append a complete message bubble (used for history and user messages). * Assistant messages are rendered as markdown; user messages as plain text. + * Pass images (array of base64 strings) to render them in the bubble. * Returns the bubble element. */ -export function appendMessage(el, role, content) { +export function appendMessage(el, role, content, images = null, timestamp = null) { const wrap = document.createElement('div'); wrap.className = `msg ${role}`; const bubble = document.createElement('div'); bubble.className = 'bubble'; + if (images?.length) { + const imgStrip = document.createElement('div'); + imgStrip.className = 'bubble-images'; + for (const b64 of images) { + const img = document.createElement('img'); + img.src = b64.startsWith('data:') ? b64 : `data:image/jpeg;base64,${b64}`; + img.className = 'bubble-img'; + img.alt = 'attached image'; + imgStrip.appendChild(img); + } + bubble.appendChild(imgStrip); + } + if (role === 'assistant') { bubble.appendChild(renderMarkdown(content)); - } else { - bubble.textContent = content; + } else if (content) { + const text = document.createElement('span'); + text.textContent = content; + bubble.appendChild(text); } const time = document.createElement('div'); time.className = 'msg-time'; - time.textContent = timeLabel(new Date().toISOString()); + time.textContent = timeLabel(timestamp ?? new Date().toISOString()); wrap.append(bubble, time); el.appendChild(wrap); @@ -107,29 +124,42 @@ } /** - * Tool call card with accordion for arguments + result. + * Tool call card β€” collapsed by default, click header to toggle. */ export function appendToolCall(el, event) { const icon = TOOL_ICONS[event.tool] ?? 'πŸ”§'; const success = event.success; - // Format args as readable lines const argsLines = Object.entries(event.args ?? {}) .map(([k, v]) => `${esc(k)}${esc(JSON.stringify(v))}`) .join(''); - const card = document.createElement('details'); + const card = document.createElement('div'); card.className = `tool-card${success ? '' : ' error'}`; - card.innerHTML = ` - - ${icon} - ${esc(event.tool)} - ${success ? 'βœ“' : 'βœ—'} - -
- ${argsLines ? `
${argsLines}
` : ''} -
${esc(event.result)}
-
`; + + const header = document.createElement('div'); + header.className = 'tool-header'; + header.innerHTML = ` + ${icon} + ${esc(event.tool)} + ${success ? 'βœ“' : 'βœ—'}`; + + const body = document.createElement('div'); + body.className = 'tool-body'; + if (argsLines) { + const argsDiv = document.createElement('div'); + argsDiv.className = 'tool-args'; + argsDiv.innerHTML = argsLines; + body.appendChild(argsDiv); + } + const pre = document.createElement('pre'); + pre.className = 'tool-result-pre'; + pre.textContent = event.result; + body.appendChild(pre); + + header.addEventListener('click', () => card.classList.toggle('open')); + + card.append(header, body); el.appendChild(card); } diff --git a/client/js/ws.js b/client/js/ws.js index bac65c0..6761259 100644 --- a/client/js/ws.js +++ b/client/js/ws.js @@ -19,9 +19,11 @@ this.#ws.onmessage = (e) => handlers.onMessage?.(JSON.parse(e.data)); } - send(content) { + send(content, images = null) { if (this.#ws?.readyState === WebSocket.OPEN) { - this.#ws.send(JSON.stringify({ type: 'message', content })); + const payload = { type: 'message', content }; + if (images?.length) payload.images = images; + this.#ws.send(JSON.stringify(payload)); return true; } return false; diff --git a/client/style.css b/client/style.css index 4fd0434..257299d 100644 --- a/client/style.css +++ b/client/style.css @@ -204,6 +204,22 @@ .msg.user .bubble { background: var(--user-bubble); color: var(--user-text); border-bottom-right-radius: 3px; white-space: pre-wrap; } .msg.assistant .bubble { background: var(--bot-bubble); color: var(--bot-text); border-bottom-left-radius: 3px; } +/* Images inside chat bubbles */ +.bubble-images { + display: flex; + flex-wrap: wrap; + gap: 6px; + margin-bottom: 8px; +} +.bubble-img { + max-width: 320px; + max-height: 240px; + border-radius: 6px; + object-fit: contain; + cursor: pointer; +} +.bubble-img:only-child { max-width: 100%; } + .msg-time { font-size: 11px; color: var(--text-muted); margin-top: 4px; padding: 0 2px; } /* ── Markdown prose ───────────────────────────────── */ @@ -218,8 +234,8 @@ .prose code { font-family: "Fira Code", "Cascadia Code", ui-monospace, monospace; font-size: 0.85em; background: #2a2a2a; color: #e2b97e; padding: 1px 5px; border-radius: 4px; } .prose pre { margin: 0.6em 0; border-radius: 8px; overflow: hidden; } -.prose pre code { background: none; color: inherit; padding: 0; border-radius: 0; font-size: 0.9em; } -.prose pre .hljs { padding: 12px 16px; border-radius: 8px; font-size: 0.9em; } +.prose pre code { background: none; color: inherit; padding: 0; border-radius: 0; font-size: 1em; } +.prose pre .hljs { padding: 12px 16px; border-radius: 8px; font-size: 1em; } .prose blockquote { border-left: 3px solid #444; margin: 0.5em 0; padding: 0.2em 0 0.2em 0.8em; color: var(--text-muted); } .prose table { border-collapse: collapse; width: 100%; margin: 0.5em 0; font-size: 0.9em; } .prose th,.prose td { border: 1px solid #333; padding: 5px 10px; text-align: left; } @@ -240,13 +256,9 @@ border-radius: var(--radius); font-size: 12px; color: var(--tool-text); - overflow: hidden; } .tool-card.error { background: var(--error-bg); border-color: var(--error-border); color: var(--error-text); } -.tool-card summary { list-style: none; } -.tool-card summary::-webkit-details-marker { display: none; } - .tool-header { display: flex; align-items: center; @@ -255,20 +267,24 @@ cursor: pointer; user-select: none; font-weight: 600; + border-radius: var(--radius); } .tool-header:hover { background: rgba(255,255,255,0.04); } .tool-icon { font-size: 14px; } .tool-name { flex: 1; } .tool-status { font-size: 13px; opacity: 0.8; } -.tool-card:not([open]) .tool-header::after { content: 'β€Ί'; font-size: 16px; opacity: 0.5; } -.tool-card[open] .tool-header::after { content: 'β€Ή'; font-size: 16px; opacity: 0.5; } +.tool-card:not(.open) .tool-header::after { content: 'β€Ί'; font-size: 16px; opacity: 0.5; } +.tool-card.open .tool-header::after { content: 'β€Ή'; font-size: 16px; opacity: 0.5; } .tool-body { border-top: 1px solid var(--tool-border); padding: 8px 12px; - display: flex; + display: none; flex-direction: column; gap: 6px; +} +.tool-card.open .tool-body { + display: flex; animation: fadeSlide 0.18s ease; } @keyframes fadeSlide { from { opacity: 0; transform: translateY(-4px); } to { opacity: 1; transform: translateY(0); } } @@ -346,14 +362,78 @@ /* ── Input bar ───────────────────────────────────────── */ .input-bar { - padding: 16px 20px; + padding: 12px 20px 16px; background: var(--sidebar-bg); border-top: 1px solid var(--border); display: flex; + flex-direction: column; + gap: 8px; +} + +/* Image preview strip */ +.image-preview-strip { + display: flex; + flex-wrap: wrap; + gap: 8px; +} +.image-preview-strip:empty { display: none; } + +.img-thumb-wrap { + position: relative; + width: 72px; + height: 72px; +} +.img-thumb { + width: 72px; + height: 72px; + object-fit: cover; + border-radius: 6px; + border: 1px solid var(--border); +} +.img-thumb-remove { + position: absolute; + top: -6px; + right: -6px; + width: 18px; + height: 18px; + border-radius: 50%; + border: none; + background: var(--error-text); + color: #fff; + font-size: 12px; + line-height: 1; + cursor: pointer; + display: flex; + align-items: center; + justify-content: center; + padding: 0; +} + +/* Input row: attach + textarea + send */ +.input-row { + display: flex; gap: 10px; align-items: flex-end; } +.btn-attach { + width: 44px; + height: 44px; + flex-shrink: 0; + background: var(--input-bg); + color: var(--text-muted); + border: 1px solid var(--border); + border-radius: var(--radius); + font-size: 18px; + cursor: pointer; + transition: border-color 0.15s, color 0.15s; + display: flex; + align-items: center; + justify-content: center; +} +.btn-attach:hover { border-color: var(--accent); color: var(--accent); } +.btn-attach:disabled { opacity: 0.5; cursor: not-allowed; } + .input-bar textarea { flex: 1; padding: 10px 14px; diff --git a/navi/api/routes/sessions.py b/navi/api/routes/sessions.py index 473e1a6..bfd9b45 100644 --- a/navi/api/routes/sessions.py +++ b/navi/api/routes/sessions.py @@ -77,7 +77,7 @@ return { "session_id": session.id, "profile_id": session.profile_id, - "messages": [m.model_dump(exclude_none=True) for m in session.messages], + "messages": [m.model_dump(mode='json', exclude_none=True) for m in session.messages], "created_at": session.created_at.isoformat(), "last_active": session.last_active.isoformat(), } diff --git a/navi/api/websocket.py b/navi/api/websocket.py index 117ce58..fa7541b 100644 --- a/navi/api/websocket.py +++ b/navi/api/websocket.py @@ -57,10 +57,21 @@ continue user_content = data["content"] + # images: list of base64 strings (data URI prefix already stripped by client) + raw_images: list[str] | None = data.get("images") or None + if raw_images: + # Strip data URI prefix if client sent it with prefix + cleaned = [] + for img in raw_images: + if "," in img and img.startswith("data:"): + img = img.split(",", 1)[1] + cleaned.append(img) + raw_images = cleaned + await websocket.send_json({"type": "stream_start"}) try: - async for event in agent.run_stream(session_id, user_content): + async for event in agent.run_stream(session_id, user_content, images=raw_images): if isinstance(event, TextDelta): await websocket.send_json({"type": "stream_delta", "delta": event.delta}) elif isinstance(event, ToolEvent): diff --git a/navi/core/agent.py b/navi/core/agent.py index 6ba7582..336b793 100644 --- a/navi/core/agent.py +++ b/navi/core/agent.py @@ -17,6 +17,7 @@ import asyncio import json from dataclasses import dataclass +from datetime import datetime, timezone from typing import AsyncGenerator import structlog @@ -75,7 +76,7 @@ # Public interface # ------------------------------------------------------------------ - async def run(self, session_id: str, user_message: str) -> str: + async def run(self, session_id: str, user_message: str, images: list[str] | None = None) -> str: """Non-streaming: run the full tool-calling loop and return the final text.""" session = await self._sessions.get(session_id) if session is None: @@ -90,7 +91,7 @@ if not session.messages: session.messages.append(Message(role="system", content=profile.system_prompt)) - session.messages.append(Message(role="user", content=user_message)) + session.messages.append(Message(role="user", content=user_message, images=images or None, created_at=datetime.now(timezone.utc))) for iteration in range(profile.max_iterations): log.debug("agent.iteration", session_id=session_id, iteration=iteration) @@ -114,14 +115,15 @@ ) session.messages.append(assistant_msg) - tool_results = await self._execute_tool_calls(response.tool_calls, tools) + tool_results, image_injections = await self._execute_tool_calls(response.tool_calls, tools) session.messages.extend(tool_results) + session.messages.extend(image_injections) await self._sessions.save(session) raise MaxIterationsReached(profile.max_iterations) async def run_stream( - self, session_id: str, user_message: str + self, session_id: str, user_message: str, images: list[str] | None = None ) -> AsyncGenerator[AgentEvent, None]: """ Streaming variant. Yields AgentEvent objects: @@ -141,7 +143,7 @@ if not session.messages: session.messages.append(Message(role="system", content=profile.system_prompt)) - session.messages.append(Message(role="user", content=user_message)) + session.messages.append(Message(role="user", content=user_message, images=images or None, created_at=datetime.now(timezone.utc))) # Tool-calling loop (non-streaming) for iteration in range(profile.max_iterations): @@ -162,7 +164,7 @@ accumulated += chunk.delta yield TextDelta(delta=chunk.delta) - session.messages.append(Message(role="assistant", content=accumulated)) + session.messages.append(Message(role="assistant", content=accumulated, created_at=datetime.now(timezone.utc))) await self._sessions.save(session) yield StreamEnd(full_content=accumulated) return @@ -175,12 +177,13 @@ ) session.messages.append(assistant_msg) - tool_results_msgs = await self._execute_tool_calls_streaming( + tool_results_msgs, image_injections = await self._execute_tool_calls_streaming( response.tool_calls, tools ) for event, msg in tool_results_msgs: yield event session.messages.append(msg) + session.messages.extend(image_injections) await self._sessions.save(session) raise MaxIterationsReached(profile.max_iterations) @@ -197,33 +200,42 @@ async def _execute_tool_calls( self, tool_calls: list[ToolCallRequest], tools: list[Tool] - ) -> list[Message]: + ) -> tuple[list[Message], list[Message]]: tool_map = {t.name: t for t in tools} - async def _run_one(tc: ToolCallRequest) -> Message: + async def _run_one(tc: ToolCallRequest) -> tuple[Message, Message | None]: tool = tool_map.get(tc.name) + image_msg = None if tool is None: content = f"Error: tool '{tc.name}' not found." else: log.info("tool.execute", tool=tc.name, args=tc.arguments) result = await tool.execute(tc.arguments) content = result.to_message_content() - return Message( - role="tool", - content=content, - tool_call_id=tc.id, - name=tc.name, - ) + if result.success and result.metadata and result.metadata.get("is_image"): + b64 = result.metadata.get("base64") + if b64: + image_msg = Message( + role="user", + content=f"[Image loaded via {tc.name} β€” analyse it]", + images=[b64], + ) + tool_msg = Message(role="tool", content=content, tool_call_id=tc.id, name=tc.name) + return tool_msg, image_msg - return await asyncio.gather(*[_run_one(tc) for tc in tool_calls]) + pairs = await asyncio.gather(*[_run_one(tc) for tc in tool_calls]) + tool_msgs = [p[0] for p in pairs] + image_msgs = [p[1] for p in pairs if p[1] is not None] + return tool_msgs, image_msgs async def _execute_tool_calls_streaming( self, tool_calls: list[ToolCallRequest], tools: list[Tool] - ) -> list[tuple[ToolEvent, Message]]: + ) -> tuple[list[tuple[ToolEvent, Message]], list[Message]]: tool_map = {t.name: t for t in tools} - async def _run_one(tc: ToolCallRequest) -> tuple[ToolEvent, Message]: + async def _run_one(tc: ToolCallRequest) -> tuple[ToolEvent, Message, Message | None]: tool = tool_map.get(tc.name) + image_msg = None if tool is None: content = f"Error: tool '{tc.name}' not found." event = ToolEvent( @@ -239,7 +251,18 @@ result=content, success=result.success, ) + if result.success and result.metadata and result.metadata.get("is_image"): + b64 = result.metadata.get("base64") + if b64: + image_msg = Message( + role="user", + content=f"[Image loaded via {tc.name} β€” analyse it]", + images=[b64], + ) msg = Message(role="tool", content=content, tool_call_id=tc.id, name=tc.name) - return event, msg + return event, msg, image_msg - return await asyncio.gather(*[_run_one(tc) for tc in tool_calls]) + triples = await asyncio.gather(*[_run_one(tc) for tc in tool_calls]) + pairs = [(t[0], t[1]) for t in triples] + image_msgs = [t[2] for t in triples if t[2] is not None] + return pairs, image_msgs diff --git a/navi/core/registry.py b/navi/core/registry.py index 0765a98..6ecbb23 100644 --- a/navi/core/registry.py +++ b/navi/core/registry.py @@ -10,6 +10,7 @@ CodeExecTool, FilesystemTool, HttpRequestTool, + ImageViewTool, SshExecTool, TerminalTool, Tool, @@ -79,6 +80,7 @@ tools.register(CodeExecTool()) tools.register(TerminalTool()) tools.register(SshExecTool()) + tools.register(ImageViewTool()) profiles = ProfileRegistry() for p in ALL_PROFILES: diff --git a/navi/core/sqlite_session_store.py b/navi/core/sqlite_session_store.py index d68a2d8..736af6b 100644 --- a/navi/core/sqlite_session_store.py +++ b/navi/core/sqlite_session_store.py @@ -59,7 +59,7 @@ async def save(self, session: Session) -> None: session.last_active = datetime.utcnow() messages_json = json.dumps( - [m.model_dump(exclude_none=True) for m in session.messages], + [m.model_dump(mode='json', exclude_none=True) for m in session.messages], ensure_ascii=False, ) async with aiosqlite.connect(self._db_path) as db: diff --git a/navi/llm/base.py b/navi/llm/base.py index d9695f8..fea77d0 100644 --- a/navi/llm/base.py +++ b/navi/llm/base.py @@ -6,6 +6,7 @@ """ from abc import ABC, abstractmethod +from datetime import datetime, timezone from typing import AsyncGenerator, Literal from pydantic import BaseModel @@ -31,11 +32,14 @@ role: Literal["system", "user", "assistant", "tool"] content: str | None = None + # base64-encoded images (multimodal); user and assistant roles only + images: list[str] | None = None # set by assistant when requesting tool calls tool_calls: list[ToolCallRequest] | None = None # set on tool result messages tool_call_id: str | None = None name: str | None = None # tool name on tool result messages + created_at: datetime | None = None class LLMResponse(BaseModel): diff --git a/navi/llm/ollama.py b/navi/llm/ollama.py index 1aeef02..281ec87 100644 --- a/navi/llm/ollama.py +++ b/navi/llm/ollama.py @@ -14,14 +14,13 @@ result = [] for m in messages: msg: dict = {"role": m.role, "content": m.content or ""} + if m.images: + msg["images"] = m.images # list of base64 strings, Ollama format if m.tool_calls: msg["tool_calls"] = [ {"function": {"name": tc.name, "arguments": tc.arguments}} for tc in m.tool_calls ] - if m.tool_call_id: - # Ollama uses role="tool" with content - pass result.append(msg) return result diff --git a/navi/main.py b/navi/main.py index 789dde1..2749e46 100644 --- a/navi/main.py +++ b/navi/main.py @@ -1,8 +1,8 @@ """FastAPI application entry point.""" import structlog -from fastapi import FastAPI -from fastapi.responses import FileResponse +from fastapi import FastAPI, Request +from fastapi.responses import FileResponse, Response from fastapi.staticfiles import StaticFiles from navi.api.routes import agents, health, messages, sessions @@ -30,6 +30,14 @@ app.mount("/static", StaticFiles(directory="client"), name="static") +@app.middleware("http") +async def no_cache_static(request: Request, call_next) -> Response: + response = await call_next(request) + if request.url.path.startswith("/static/"): + response.headers["Cache-Control"] = "no-store" + return response + + @app.get("/", include_in_schema=False) async def index() -> FileResponse: - return FileResponse("client/index.html") + return FileResponse("client/index.html", headers={"Cache-Control": "no-store"}) diff --git a/navi/profiles/secretary.py b/navi/profiles/secretary.py index 9d8456a..ba47e1f 100644 --- a/navi/profiles/secretary.py +++ b/navi/profiles/secretary.py @@ -12,12 +12,13 @@ - HTTP requests to query external APIs or services - Filesystem to read and write documents, notes, and files - Code execution to perform calculations, data processing, or automate tasks +- image_view: load and analyse images from a local file path or URL Be concise and actionable in your responses. When asked to research a topic, provide a structured summary with sources. When writing documents, match the requested tone and format. """, - enabled_tools=["web_search", "http_request", "filesystem", "code_exec"], + enabled_tools=["web_search", "http_request", "filesystem", "code_exec", "terminal", "image_view"], model="gemma4:e2b-it-q4_K_M", temperature=0.7, ) diff --git a/navi/profiles/server_admin.py b/navi/profiles/server_admin.py index 9ba485f..ad79b44 100644 --- a/navi/profiles/server_admin.py +++ b/navi/profiles/server_admin.py @@ -16,6 +16,7 @@ - filesystem: read and write files on the local machine - http_request: call REST APIs, monitoring endpoints, or health checks - web_search: look up documentation, error messages, or solutions +- image_view: load and analyse images from a local file path or URL Guidelines: - When the user asks about a remote server, use ssh_exec immediately β€” do not say @@ -23,7 +24,7 @@ - Prefer non-destructive operations; ask for confirmation before anything irreversible. - When troubleshooting, gather information first (logs, status) before making changes. """, - enabled_tools=["terminal", "filesystem", "http_request", "web_search", "ssh_exec"], + enabled_tools=["terminal", "filesystem", "http_request", "web_search", "ssh_exec", "image_view"], model="gemma4:e2b-it-q4_K_M", temperature=0.2, ) diff --git a/navi/profiles/smart_home.py b/navi/profiles/smart_home.py index 1b4bd1b..a48a621 100644 --- a/navi/profiles/smart_home.py +++ b/navi/profiles/smart_home.py @@ -15,11 +15,12 @@ - ssh_exec: execute commands on remote hosts via SSH. Pass host, username, password (and optionally port, key_path) directly as tool parameters β€” no config file needed. ALWAYS use it for any task involving a remote host. +- image_view: load and analyse images from a local file path or URL Always confirm before making irreversible changes to device state or automation configuration. When writing automations, prefer clear, well-commented YAML. """, - enabled_tools=["http_request", "filesystem", "code_exec", "terminal", "ssh_exec"], + enabled_tools=["http_request", "filesystem", "code_exec", "terminal", "ssh_exec", "image_view"], model="gemma4:e2b-it-q4_K_M", temperature=0.3, ) diff --git a/navi/tools/__init__.py b/navi/tools/__init__.py index 5270483..eef15ec 100644 --- a/navi/tools/__init__.py +++ b/navi/tools/__init__.py @@ -2,6 +2,7 @@ from .code_exec import CodeExecTool from .filesystem import FilesystemTool from .http_request import HttpRequestTool +from .image_view import ImageViewTool from .ssh_exec import SshExecTool from .terminal import TerminalTool from .web_search import WebSearchTool @@ -15,4 +16,5 @@ "CodeExecTool", "TerminalTool", "SshExecTool", + "ImageViewTool", ] diff --git a/navi/tools/image_view.py b/navi/tools/image_view.py new file mode 100644 index 0000000..5f72352 --- /dev/null +++ b/navi/tools/image_view.py @@ -0,0 +1,69 @@ +"""Image view tool β€” load an image from a file path or URL for the LLM to analyse. + +The image is returned as base64 and injected into the conversation so the LLM +can actually see it (not just read a text description of it). +""" + +import base64 +import mimetypes +from pathlib import Path + +import httpx + +from .base import Tool, ToolResult + +_TIMEOUT = 30 +_SUPPORTED = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"} + + +class ImageViewTool(Tool): + name = "image_view" + description = ( + "Load an image from a local file path or a URL so you can see and analyse it. " + "Supports JPEG, PNG, GIF, WebP. After calling this tool the image will be " + "visible to you in the next turn." + ) + parameters = { + "type": "object", + "properties": { + "source": { + "type": "string", + "description": "Absolute file path (e.g. /home/user/photo.jpg) or HTTP/HTTPS URL", + }, + }, + "required": ["source"], + } + + async def execute(self, params: dict) -> ToolResult: + source = params["source"].strip() + try: + if source.startswith(("http://", "https://")): + raw, mime = await self._fetch_url(source) + else: + raw, mime = self._read_file(source) + + b64 = base64.b64encode(raw).decode() + size_kb = len(raw) // 1024 + return ToolResult( + success=True, + output=f"Image loaded ({size_kb} KB, {mime}). It will appear in the next turn.", + metadata={"base64": b64, "mime": mime, "is_image": True}, + ) + except Exception as e: + return ToolResult(success=False, output=f"Failed to load image: {e}", error=str(e)) + + async def _fetch_url(self, url: str) -> tuple[bytes, str]: + async with httpx.AsyncClient(timeout=_TIMEOUT, follow_redirects=True) as client: + r = await client.get(url) + r.raise_for_status() + mime = r.headers.get("content-type", "image/jpeg").split(";")[0].strip() + return r.content, mime + + def _read_file(self, path_str: str) -> tuple[bytes, str]: + path = Path(path_str).expanduser().resolve() + if not path.exists(): + raise FileNotFoundError(f"File not found: {path}") + if path.suffix.lower() not in _SUPPORTED: + raise ValueError(f"Unsupported image format: {path.suffix}") + mime = mimetypes.guess_type(str(path))[0] or "image/jpeg" + return path.read_bytes(), mime