"""Transform DOM.RIA raw data into a data_collector payload."""
import math
from typing import Any, Dict, List, Optional
def _first(val):
"""Helper: return first element if list, else the value itself."""
if isinstance(val, list) and val:
return val[0]
return val
def _safe_int(val) -> Optional[int]:
try:
return int(float(str(val).replace(" ", "").replace(",", ".")))
except (ValueError, TypeError):
return None
def _safe_float(val) -> Optional[float]:
try:
return float(str(val).replace(" ", "").replace(",", "."))
except (ValueError, TypeError):
return None
def _extract_photos(raw_photos) -> List[Dict]:
"""Normalise DOM.RIA photo field to a list of dicts.
Sometimes photos come as a list, sometimes as a dict keyed by ordering.
"""
if isinstance(raw_photos, list):
return raw_photos
if isinstance(raw_photos, dict):
return list(raw_photos.values())
return []
def _build_photo_urls(photos: List[Dict]) -> List[str]:
"""Turn photo metadata into full HTTPS URLs.
DOM.RIA photos are served from cdn.riastatic.com.
We can inject 'xl' before the extension for a larger variant.
"""
urls: List[str] = []
for p in photos:
if not isinstance(p, dict):
continue
base = (
p.get("file")
or p.get("beautifulUrl")
or p.get("photo_base_url")
or p.get("url")
or p.get("src")
)
if not base:
continue
if base.startswith("//"):
base = "https:" + base
elif base.startswith("/"):
base = "https://cdn.riastatic.com" + base
elif not base.startswith("http"):
base = "https://cdn.riastatic.com/" + base
# xl variant if plain .jpg
if base.endswith(".jpg") and "xl" not in base.split("/")[-1]:
base = base.replace(".jpg", "xl.jpg")
urls.append(base)
return urls
def _coalesce_text(*candidates: Any) -> str:
    """Return the first usable candidate as text, or ``""`` if none is usable.

    A non-empty string is returned stripped; a non-zero int/float is
    stringified; every other value (``None``, bools, containers) is skipped.
    """
    for value in candidates:
        if isinstance(value, str):
            if value:
                return value.strip()
        elif isinstance(value, (int, float)) and not isinstance(value, bool) and value:
            return str(value)
    return ""


def normalize_listing(
    catalog_item: Dict[str, Any],
    detail_realty: Optional[Dict[str, Any]] = None,
    city_name_meta: str = "",
) -> Dict[str, Any]:
    """Create the payload dict that will be sent to data_collector.

    Strategy:
    * Simple / stable fields → normalized scalar values.
    * Complex / nested / volatile fields → kept raw under `raw_domria`.

    Args:
        catalog_item: Listing dict from the catalog response.
        detail_realty: Optional detail dict for the same listing; when given,
            its values take precedence for most text fields and for photos.
        city_name_meta: City name used when neither dict provides one.

    Returns:
        The normalized payload. Keys whose value is ``None`` are dropped, so
        e.g. ``price_usd`` or ``external_id`` are absent when unknown.
    """
    c = catalog_item  # shorthand
    d = detail_realty or {}

    # --- price -------------------------------------------------------------
    # A parsed price of 0 is falsy here and therefore treated as "missing".
    price_usd = _safe_int(c.get("price_USD")) or _safe_int(c.get("price_usd"))

    # Try detail priceObj first (most explicit). The key may be present with
    # a null value, so check the type instead of relying on a .get() default.
    if price_usd is None:
        price_obj = d.get("priceObj")
        if isinstance(price_obj, dict):
            price_usd = _safe_int(price_obj.get("priceUSD"))

    # Try catalog priceArr (dict with keys 1/2/3 or list of objects)
    if price_usd is None:
        pa = c.get("priceArr") or d.get("priceArr")
        if isinstance(pa, dict):
            # DOM.RIA sends {'1': '130 000', '2': '112 317', '3': '5 863 000'}
            # Key '1' is USD in all observed cases
            price_usd = _safe_int(pa.get("1"))
        elif isinstance(pa, list):
            for entry in pa:
                if isinstance(entry, dict) and entry.get("currency") == "USD":
                    price_usd = _safe_int(entry.get("price"))
                    break

    # Fallback to plain price field (usually already in USD on sale pages)
    if price_usd is None:
        price_usd = _safe_int(c.get("price"))

    # --- photos ------------------------------------------------------------
    raw_catalog_photos = _extract_photos(c.get("photos"))
    raw_detail_photos = _extract_photos(d.get("photos"))
    # Prefer detail photos (usually more / higher quality), fallback to catalog
    photo_source = raw_detail_photos if raw_detail_photos else raw_catalog_photos
    photo_urls = _build_photo_urls(photo_source)
    if not photo_urls:
        # Last resort: detail sometimes has a single `main_photo` dict
        main = d.get("main_photo")
        if isinstance(main, dict):
            photo_urls = _build_photo_urls([main])

    # --- identifiers and tags ----------------------------------------------
    # str(None) would produce the literal "None"; leave the id unset instead
    # so it is dropped with the other None values below.
    realty_id = c.get("realty_id")
    external_id = str(realty_id) if realty_id is not None else None

    raw_tags = d.get("tags") or c.get("tags") or []
    tags = []
    if isinstance(raw_tags, (list, tuple)):
        for tag in raw_tags:
            if not isinstance(tag, dict):
                continue
            label = tag.get("name_uk") or tag.get("name")
            if label:  # skip tags that carry no name at all
                tags.append(label)

    normalized = {
        "external_id": external_id,
        "url": f"https://dom.ria.com/uk/{(c.get('beautiful_url') or '').lstrip('/')}",
        "title": _coalesce_text(d.get("title"), c.get("title")),
        "description": _coalesce_text(d.get("description"), c.get("description")),
        "price_usd": price_usd,
        "price_raw": c.get("priceArr") or c.get("price"),
        "area_total_m2": _safe_float(c.get("total_square_meters")),
        "area_living_m2": _safe_float(
            d.get("living_square_meters") or c.get("living_square_meters")
        ),
        "area_kitchen_m2": _safe_float(
            d.get("kitchen_square_meters") or c.get("kitchen_square_meters")
        ),
        "rooms_count": _safe_int(c.get("rooms_count")),
        "floor": _safe_int(c.get("floor")),
        "floors_total": _safe_int(d.get("floors_count") or c.get("floors_count")),
        # Address parts go through _coalesce_text because values such as the
        # building number may arrive as numbers rather than strings.
        "city_name": _coalesce_text(
            d.get("city_name"), c.get("city_name"), city_name_meta
        ),
        "district_name": _coalesce_text(
            d.get("district_name"), c.get("district_name")
        ),
        "street_name": _coalesce_text(d.get("street_name"), c.get("street_name")),
        "building_number": _coalesce_text(
            d.get("building_number"), c.get("building_number")
        ),
        "latitude": _safe_float(c.get("lat")),
        "longitude": _safe_float(c.get("lng")),
        "photos": photo_urls,
        "photos_count": len(photo_urls),
        "tags": tags,
        "contact_phones": _extract_phones(d),
        "raw_domria": {
            "catalog_item": c,
            "detail_realty": d,
        },
    }

    # Remove None values to keep payload compact
    return {k: v for k, v in normalized.items() if v is not None}
def _extract_phones(detail: Dict[str, Any]) -> List[str]:
"""Pull phone numbers from the detail object if available."""
phones: List[str] = []
user = detail.get("user")
user_contacts = user.get("contacts") if isinstance(user, dict) else None
contacts = detail.get("contacts") or user_contacts
if not isinstance(contacts, dict):
return phones
for key in ("phones", "phone", "mobile", "tel"):
vals = contacts.get(key)
if isinstance(vals, list):
for v in vals:
if isinstance(v, dict):
phones.append(v.get("phone") or v.get("number") or str(v))
elif isinstance(v, str):
phones.append(v)
elif isinstance(vals, str):
phones.append(vals)
# dedupe preserving order
seen = set()
uniq = []
for p in phones:
p = str(p).strip()
if p and p not in seen:
seen.add(p)
uniq.append(p)
return uniq