# Patch commentary (leading text before the first `diff --git` header is not part of any hunk).
#
# navi/api/websocket.py
#   When `raw_images` is non-empty, append a bracketed note to `user_content` that
#   states how many images are attached inline (singular/plural aware), so the model
#   does not call the image_view tool to "load" images it can already see.
#
# navi/tools/image_view.py
#   Reword the `image_view` tool description: use the tool for images that are
#   referenced but not already visible (file path, URL, produced screenshot), and
#   state that images attached inline to a message do not need it.
#
# NOTE(review): this patch was received collapsed onto a single line. Line breaks were
# restored from the hunk headers' line counts; the leading indentation inside the hunks
# is reconstructed and its depth is a guess — confirm against the target files before
# applying (e.g. `git apply --check`).
diff --git a/navi/api/websocket.py b/navi/api/websocket.py
index e5f071f..b4706bd 100644
--- a/navi/api/websocket.py
+++ b/navi/api/websocket.py
@@ -298,6 +298,16 @@
                 cleaned.append(img)
             raw_images = cleaned
 
+            # Tell the model the inline images are already in its multimodal context,
+            # so it doesn't hallucinate a path/URL and call image_view to "load" them.
+            if raw_images:
+                n = len(raw_images)
+                noun = "image" if n == 1 else "images"
+                user_content = (
+                    user_content
+                    + f"\n\n[{n} {noun} attached inline — already in your context, no extra loading needed.]"
+                )
+
             # Append uploaded file paths to user content so Navi knows about them
             uploaded_files: list[dict] = data.get("files") or []
             if uploaded_files:
diff --git a/navi/tools/image_view.py b/navi/tools/image_view.py
index c50eba7..855b50b 100644
--- a/navi/tools/image_view.py
+++ b/navi/tools/image_view.py
@@ -21,8 +21,11 @@
     name = "image_view"
     description = (
         "Load an image from a local file path or HTTP/HTTPS URL so you can see and analyse it. "
-        "Call this whenever the user mentions an image file or URL, or when you need to "
-        "inspect visual content. The image becomes visible to you in the next message."
+        "Use this whenever the conversation references an image you cannot already see — "
+        "a file path, a URL, a screenshot you produced, or any visual you need to inspect. "
+        "Images the user attached directly to a message (visible inline in your context) "
+        "don't need this tool; just analyse them from what you see. "
+        "The loaded image becomes visible to you in the next message."
     )
     parameters = {
         "type": "object",