"""web_view — headless browser page extraction."""
from __future__ import annotations
import base64
import re
from typing import Any
from playwright.async_api import async_playwright
# Navigation timeout handed to ``page.goto``; Playwright timeouts are in milliseconds.
_TIMEOUT = 30_000
# Maximum number of characters of cleaned page text returned before truncation.
_MAX_TEXT = 20_000
def _clean(text: str) -> str:
lines = [line.rstrip() for line in text.splitlines()]
result: list[str] = []
blank_run = 0
for line in lines:
if line == "":
blank_run += 1
if blank_run <= 2:
result.append("")
else:
blank_run = 0
result.append(line)
return "\n".join(result).strip()
async def web_view(
    url: str,
    screenshot: bool = False,
    wait_until: str = "networkidle",
) -> dict[str, Any]:
    """Open a URL in headless Chromium and return its readable text.

    Args:
        url: Absolute ``http://`` or ``https://`` address to load.
        screenshot: When true, also capture a PNG of the viewport and
            return it base64-encoded in ``metadata``.
        wait_until: Playwright load state to wait for; one of ``"commit"``,
            ``"domcontentloaded"``, ``"load"`` or ``"networkidle"``.

    Returns:
        A dict with the keys ``success``, ``output``, ``error`` and
        ``metadata``. On success ``output`` holds the title, the final URL
        (only when it differs from ``url``) and the cleaned page text, and
        ``metadata`` holds the screenshot fields when one was requested.
        On failure ``success`` is false, ``error`` carries a short code or
        the exception text, and ``metadata`` is empty. Exceptions are
        reported through the dict rather than raised.
    """
    if not url.startswith(("http://", "https://")):
        return {
            "success": False,
            "output": "URL must start with http:// or https://",
            "error": "invalid_url",
            "metadata": {},
        }

    # Reject unknown load states up front: Playwright would only raise
    # inside ``goto``, where the best-effort handler below could mask it.
    # ``None`` is passed through so Playwright applies its own default.
    wait_states = ("commit", "domcontentloaded", "load", "networkidle")
    if wait_until is not None and wait_until not in wait_states:
        return {
            "success": False,
            "output": "wait_until must be one of: " + ", ".join(wait_states),
            "error": "invalid_wait_until",
            "metadata": {},
        }

    try:
        async with async_playwright() as pw:
            browser = await pw.chromium.launch(headless=True)
            try:
                context = await browser.new_context(
                    viewport={"width": 1280, "height": 800},
                    user_agent=(
                        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
                    ),
                )
                page = await context.new_page()

                nav_error: Exception | None = None
                try:
                    await page.goto(url, wait_until=wait_until, timeout=_TIMEOUT)
                except Exception as exc:
                    # Best effort: a page that never reaches the requested
                    # load state (e.g. endless polling) usually still has
                    # content worth extracting.
                    nav_error = exc
                if nav_error is not None and not page.url.startswith(
                    ("http://", "https://")
                ):
                    # Nothing was loaded at all (the page is still on its
                    # blank/error document), so report the failure instead
                    # of returning an empty "successful" result.
                    raise nav_error

                title = await page.title()
                final_url = page.url
                text = await page.evaluate("""() => {
const kill = ['script','style','noscript','iframe',
'nav','header','footer','aside',
'[role="navigation"]','[role="banner"]',
'[role="contentinfo"]'];
const clone = document.body.cloneNode(true);
kill.forEach(sel => {
clone.querySelectorAll(sel).forEach(el => el.remove());
});
return clone.innerText || clone.textContent || '';
}""")
                text = _clean(text)
                if len(text) > _MAX_TEXT:
                    text = text[:_MAX_TEXT] + f"\n\n[… truncated at {_MAX_TEXT} chars]"

                output_parts = []
                if title:
                    output_parts.append(f"Title: {title}")
                if final_url != url:
                    output_parts.append(f"Final URL: {final_url}")
                # Blank separator line between the header lines and the body.
                output_parts.append("")
                output_parts.append(text)
                output = "\n".join(output_parts)

                metadata: dict[str, Any] = {}
                if screenshot:
                    # Viewport-only capture (not the full scrollable page).
                    png = await page.screenshot(full_page=False)
                    metadata = {
                        "base64": base64.b64encode(png).decode(),
                        "mime": "image/png",
                        "is_image": True,
                    }

                # Close the context first so pages shut down gracefully.
                await context.close()
                return {
                    "success": True,
                    "output": output,
                    "error": None,
                    "metadata": metadata,
                }
            finally:
                # Always release the browser, including when extraction
                # or the screenshot raised part-way through.
                await browser.close()
    except Exception as e:
        return {
            "success": False,
            "output": f"Browser error: {e}",
            "error": str(e),
            "metadata": {},
        }