vmk-360-domria_parser/src/normalizer.py at 23367fc9b4ffbfc0fe55752d5442742063299d46

Fork: 0
root / vmk-360-domria_parser
Find file
Newer
Older
vmk-360-domria_parser / src / normalizer.py
Eugene Sukhodolskiy 1 day ago 6 KB Improve photo extraction robustness and add logging
Raw Blame History
"""Transform DOM.RIA raw data into a data_collector payload."""
import math
from typing import Any, Dict, List, Optional


def _first(val):
    """Helper: return first element if list, else the value itself."""
    if isinstance(val, list) and val:
        return val[0]
    return val


def _safe_int(val) -> Optional[int]:
    try:
        return int(float(str(val).replace(" ", "").replace(",", ".")))
    except (ValueError, TypeError):
        return None


def _safe_float(val) -> Optional[float]:
    try:
        return float(str(val).replace(" ", "").replace(",", "."))
    except (ValueError, TypeError):
        return None


def _extract_photos(raw_photos) -> List[Dict]:
    """Normalise DOM.RIA photo field to a list of dicts.

    Sometimes photos come as a list, sometimes as a dict keyed by ordering.
    """
    if isinstance(raw_photos, list):
        return raw_photos
    if isinstance(raw_photos, dict):
        return list(raw_photos.values())
    return []


def _build_photo_urls(photos: List[Dict]) -> List[str]:
    """Turn photo metadata into full HTTPS URLs.

    DOM.RIA photos are served from cdn.riastatic.com.
    We can inject 'xl' before the extension for a larger variant.
    """
    urls: List[str] = []
    for p in photos:
        if not isinstance(p, dict):
            continue
        base = (
            p.get("file")
            or p.get("beautifulUrl")
            or p.get("photo_base_url")
            or p.get("url")
            or p.get("src")
        )
        if not base:
            continue
        if base.startswith("//"):
            base = "https:" + base
        elif base.startswith("/"):
            base = "https://cdn.riastatic.com" + base
        elif not base.startswith("http"):
            base = "https://cdn.riastatic.com/" + base
        # xl variant if plain .jpg
        if base.endswith(".jpg") and "xl" not in base.split("/")[-1]:
            base = base.replace(".jpg", "xl.jpg")
        urls.append(base)
    return urls


def normalize_listing(
    catalog_item: Dict[str, Any],
    detail_realty: Optional[Dict[str, Any]] = None,
    city_name_meta: str = "",
) -> Dict[str, Any]:
    """Create the payload dict that will be sent to data_collector.

    Strategy:
        * Simple / stable fields → normalized scalar values.
        * Complex / nested / volatile fields → kept raw under `raw_domria`.

    Args:
        catalog_item: Listing entry as returned by the catalog.
        detail_realty: Optional detail object for the same listing; when
            absent, only catalog data is used.
        city_name_meta: City name used when neither source carries one.

    Returns:
        The payload dict. Keys whose value is None (e.g. an unparseable
        price, or `external_id` when the catalog item has no `realty_id`)
        are omitted.
    """
    c = catalog_item  # shorthand
    d = detail_realty or {}

    # --- price -------------------------------------------------------------------
    # Note: `or` means a parsed 0 is treated like "missing" and the next
    # source is consulted.
    price_usd = _safe_int(c.get("price_USD")) or _safe_int(c.get("price_usd"))

    # Try detail priceObj first (most explicit). The key may be present but
    # null / not a dict, so check the type instead of relying on a default.
    if price_usd is None:
        price_obj = d.get("priceObj")
        if isinstance(price_obj, dict):
            price_usd = _safe_int(price_obj.get("priceUSD"))

    # Try priceArr (dict with keys 1/2/3 or list of objects)
    if price_usd is None:
        pa = c.get("priceArr") or d.get("priceArr")
        if isinstance(pa, dict):
            # DOM.RIA sends {'1': '130 000', '2': '112 317', '3': '5 863 000'}
            # Key '1' is USD in all observed cases
            usd_raw = pa.get("1")
            if usd_raw is None:
                usd_raw = pa.get(1)  # same key delivered as an int
            price_usd = _safe_int(usd_raw)
        elif isinstance(pa, list):
            for entry in pa:
                if isinstance(entry, dict) and entry.get("currency") == "USD":
                    price_usd = _safe_int(entry.get("price"))
                    break

    # Fallback to plain price field (usually already in USD on sale pages)
    if price_usd is None:
        price_usd = _safe_int(c.get("price"))

    # --- photos ------------------------------------------------------------------
    raw_catalog_photos = _extract_photos(c.get("photos"))
    raw_detail_photos = _extract_photos(d.get("photos"))
    # Prefer detail photos (usually more / higher quality), fallback to catalog
    photo_source = raw_detail_photos if raw_detail_photos else raw_catalog_photos
    photo_urls = _build_photo_urls(photo_source)
    if not photo_urls:
        # Last resort: detail sometimes has a single `main_photo` dict
        main = d.get("main_photo")
        if isinstance(main, dict):
            photo_urls = _build_photo_urls([main])

    # --- identity ----------------------------------------------------------------
    # str(None) would yield the literal "None" and survive the None filter
    # below; keep a missing id as None so the key is dropped instead.
    realty_id = c.get("realty_id")
    external_id = str(realty_id) if realty_id is not None else None

    slug = c.get("beautiful_url")
    slug = slug.lstrip("/") if isinstance(slug, str) else ""

    # --- tags --------------------------------------------------------------------
    raw_tags = d.get("tags") or c.get("tags") or []
    if not isinstance(raw_tags, (list, tuple)):
        raw_tags = []
    tags = []
    for t in raw_tags:
        if not isinstance(t, dict):
            continue
        tag_name = t.get("name_uk") or t.get("name")
        if tag_name:  # skip tags that carry no name rather than emitting None
            tags.append(tag_name)

    normalized = {
        "external_id": external_id,
        "url": f"https://dom.ria.com/uk/{slug}",
        "title": _clean_text(d.get("title"), c.get("title")),
        "description": _clean_text(d.get("description"), c.get("description")),
        "price_usd": price_usd,
        "price_raw": c.get("priceArr") or c.get("price"),
        "area_total_m2": _safe_float(c.get("total_square_meters")),
        "area_living_m2": _safe_float(
            d.get("living_square_meters") or c.get("living_square_meters")
        ),
        "area_kitchen_m2": _safe_float(
            d.get("kitchen_square_meters") or c.get("kitchen_square_meters")
        ),
        "rooms_count": _safe_int(c.get("rooms_count")),
        "floor": _safe_int(c.get("floor")),
        "floors_total": _safe_int(
            d.get("floors_count") or c.get("floors_count")
        ),
        "city_name": _clean_text(
            d.get("city_name"), c.get("city_name"), city_name_meta
        ),
        "district_name": _clean_text(d.get("district_name"), c.get("district_name")),
        "street_name": _clean_text(d.get("street_name"), c.get("street_name")),
        "building_number": _clean_text(
            d.get("building_number"), c.get("building_number")
        ),
        "latitude": _safe_float(c.get("lat")),
        "longitude": _safe_float(c.get("lng")),
        "photos": photo_urls,
        "photos_count": len(photo_urls),
        "tags": tags,
        "contact_phones": _extract_phones(d),
        "raw_domria": {
            "catalog_item": c,
            "detail_realty": d,
        },
    }

    # Remove None values to keep payload compact
    return {k: v for k, v in normalized.items() if v is not None}


def _clean_text(*candidates: Any) -> str:
    """Return the first usable candidate as stripped text, or "" if none fits.

    Falsy values and booleans are skipped. The first remaining string is
    returned stripped; a plain number (e.g. a building number delivered as an
    int) is stringified. Values of any other type are skipped rather than
    raising on a missing ``.strip()``.
    """
    for candidate in candidates:
        if isinstance(candidate, bool) or not candidate:
            continue
        if isinstance(candidate, str):
            return candidate.strip()
        if isinstance(candidate, (int, float)):
            return str(candidate)
    return ""


def _extract_phones(detail: Dict[str, Any]) -> List[str]:
    """Pull phone numbers from the detail object if available."""
    phones: List[str] = []
    user = detail.get("user")
    user_contacts = user.get("contacts") if isinstance(user, dict) else None
    contacts = detail.get("contacts") or user_contacts
    if not isinstance(contacts, dict):
        return phones

    for key in ("phones", "phone", "mobile", "tel"):
        vals = contacts.get(key)
        if isinstance(vals, list):
            for v in vals:
                if isinstance(v, dict):
                    phones.append(v.get("phone") or v.get("number") or str(v))
                elif isinstance(v, str):
                    phones.append(v)
        elif isinstance(vals, str):
            phones.append(vals)

    # dedupe preserving order
    seen = set()
    uniq = []
    for p in phones:
        p = str(p).strip()
        if p and p not in seen:
            seen.add(p)
            uniq.append(p)
    return uniq