# vmk-360-domria_parser / src / parser.py
"""Extract `window.__INITIAL_STATE__` from DOM.RIA HTML responses."""
import json
from typing import Any, Dict, List, Optional


def extract_initial_state(html: str) -> Optional[Dict[str, Any]]:
    """Parse the embedded JSON from `window.__INITIAL_STATE__ = {...};`.

    Every occurrence of the marker is tried in document order, so a bare
    reference such as `if (window.__INITIAL_STATE__)` that precedes the real
    assignment does not hide it.

    The object is decoded with `json.JSONDecoder.raw_decode`, which parses
    one complete JSON value starting at a given index and ignores whatever
    follows it. That copes with nested objects and with braces inside
    string literals without a regex or hand-written bracket counting.

    Args:
        html: Raw HTML text of a page.

    Returns:
        The decoded state object, or None when no occurrence of the marker
        is followed by a valid JSON object.
    """
    marker = "window.__INITIAL_STATE__"
    decoder = json.JSONDecoder()
    size = len(html)

    pos = html.find(marker)
    while pos != -1:
        start = pos + len(marker)
        # Skip the assignment operator and any whitespace around it,
        # including newlines between `=` and the opening brace.
        while start < size and (html[start] == "=" or html[start].isspace()):
            start += 1

        if start < size and html[start] == "{":
            try:
                state, _ = decoder.raw_decode(html, start)
            except json.JSONDecodeError:
                # Not valid JSON at this occurrence; try the next one.
                pass
            else:
                return state

        # `start` is always past the current marker, so this makes progress.
        pos = html.find(marker, start)

    return None


def parse_catalog_page(html: str) -> Dict[str, Any]:
    """Return catalog data from a listing search page.

    A `catalog` or `bus` entry that is missing, JSON `null`, or otherwise
    not an object is treated as empty, so a partially populated state
    yields the defaults instead of raising AttributeError.

    Keys returned:
        - items: List[dict] — catalog.realtyForCatalog ([] when absent/null)
        - total_count: int   — catalog.realtyCountCatalog (0 when absent/null)
        - page_404: bool     — bus.page404 flag; True when the page carries
          no parseable state at all
    """
    state = extract_initial_state(html)
    if state is None:
        return {"items": [], "total_count": 0, "page_404": True}

    # `dict.get(key, {})` only covers a missing key; a JSON null decodes to
    # None and would break the chained lookups below.
    catalog = state.get("catalog")
    if not isinstance(catalog, dict):
        catalog = {}

    bus = state.get("bus")
    if not isinstance(bus, dict):
        bus = {}

    return {
        "items": catalog.get("realtyForCatalog") or [],
        "total_count": catalog.get("realtyCountCatalog") or 0,
        "page_404": bool(bus.get("page404")),
    }


def parse_detail_page(html: str) -> Optional[Dict[str, Any]]:
    """Return the full realty object from a detail page.

    Data path inside __INITIAL_STATE__: listing.data.realty

    Returns None when the page has no parseable state, or when any step of
    the path is missing, JSON `null`, or not an object.
    """
    state = extract_initial_state(html)
    if state is None:
        return None

    # Walk the path one key at a time so a null or non-object intermediate
    # value ends the lookup instead of raising AttributeError.
    node: Any = state
    for key in ("listing", "data", "realty"):
        if not isinstance(node, dict):
            return None
        node = node.get(key)
    return node