"""Extract `window.__INITIAL_STATE__` from DOM.RIA HTML responses."""
import json
from typing import Any, Dict, List, Optional
def extract_initial_state(html: str) -> Optional[Dict[str, Any]]:
    """Parse the embedded JSON from `window.__INITIAL_STATE__ = {...};`.

    Every occurrence of the marker is tried in order, so an earlier bare
    reference (e.g. ``if (!window.__INITIAL_STATE__)``) does not hide the
    real assignment.  The object literal is decoded in place with
    ``json.JSONDecoder.raw_decode``, which stops at the end of the first
    complete JSON value and ignores whatever follows it (``;</script>``),
    so neither a regex nor a hand-written brace counter is needed.

    Args:
        html: Raw HTML text of the page.

    Returns:
        The decoded state object, or ``None`` when no marker occurrence is
        followed by a valid JSON object.
    """
    marker = "window.__INITIAL_STATE__"
    decoder = json.JSONDecoder()
    length = len(html)

    idx = html.find(marker)
    while idx != -1:
        start = idx + len(marker)
        # Skip the '=' sign and any surrounding whitespace, newlines included.
        while start < length and (html[start] == "=" or html[start].isspace()):
            start += 1

        if start < length and html[start] == "{":
            try:
                # Decoding starts at '{', so a successful result is a dict.
                state, _ = decoder.raw_decode(html, start)
            except json.JSONDecodeError:
                # Not valid JSON at this occurrence; try the next one.
                state = None
            if state is not None:
                return state

        idx = html.find(marker, start)
    return None
def parse_catalog_page(html: str) -> Dict[str, Any]:
    """Return catalog data from a listing search page.

    Keys returned:
      - items: List[dict] — catalog.realtyForCatalog
      - total_count: int — catalog.realtyCountCatalog
      - page_404: bool — bus.page404 flag

    A page whose state cannot be extracted is reported as an empty result
    with ``page_404`` set to True.  Sections that are missing, ``null`` or
    not JSON objects are treated as empty instead of raising.
    """
    state = extract_initial_state(html)
    if state is None:
        return {"items": [], "total_count": 0, "page_404": True}

    # dict.get's default only applies to a missing key, not to a null
    # value, so the section types are checked explicitly.
    catalog = state.get("catalog")
    if not isinstance(catalog, dict):
        catalog = {}
    bus = state.get("bus")
    if not isinstance(bus, dict):
        bus = {}

    return {
        # A null list or count falls back to the documented empty values.
        "items": catalog.get("realtyForCatalog") or [],
        "total_count": catalog.get("realtyCountCatalog") or 0,
        "page_404": bool(bus.get("page404")),
    }
def parse_detail_page(html: str) -> Optional[Dict[str, Any]]:
    """Return the full realty object from a detail page.

    Data path inside __INITIAL_STATE__: listing.data.realty

    Returns ``None`` when the state cannot be extracted or when any step of
    the path is missing, ``null`` or not a JSON object.
    """
    node: Any = extract_initial_state(html)
    for key in ("listing", "data", "realty"):
        # A null or non-object intermediate value ends the walk; dict.get's
        # default would not cover an explicit null.
        if not isinstance(node, dict):
            return None
        node = node.get(key)
    return node