diff --git a/src/crawler.py b/src/crawler.py index 3052b88..74d9bee 100644 --- a/src/crawler.py +++ b/src/crawler.py @@ -118,10 +118,10 @@ city_name_meta=target.city_name, ) - photo_count = payload.get("photos_count", 0) - if photo_count == 0: + image_count = len(payload.get("images", [])) + if image_count == 0: print( - f"[crawler] WARNING {realty_id}: zero photos in payload " + f"[crawler] WARNING {realty_id}: zero images in payload " f"(catalog={len(item.get('photos') or [])}, detail={len(detail.get('photos') or []) if detail else 'N/A'})" ) diff --git a/src/normalizer.py b/src/normalizer.py index a3a52ec..7f2d836 100644 --- a/src/normalizer.py +++ b/src/normalizer.py @@ -1,4 +1,17 @@ -"""Transform DOM.RIA raw data into a data_collector payload.""" +"""Transform DOM.RIA raw data into a data_collector payload. + +Aligns with data_collector PayloadSchema (see data_collector/src/vmk_data_collector/schemas/raw_data.py): + - images : list[str] (photo URLs) + - price : str|float|int (normalized price) + - contact_phone: str (first phone number) + - address : str (combined address line) + - area : str|float|int (total area) + - rooms : str|int (rooms count) + - floor : str|int (floor number) + - title, description, url (direct mapping) + +Extra fields not in the strict schema are kept because model_config = {"extra": "allow"}. +""" from typing import Any, Dict, List, Optional @@ -35,7 +48,7 @@ return [] -def _build_photo_urls(photos: List[Dict]) -> List[str]: +def _build_image_urls(photos: List[Dict]) -> List[str]: """Turn photo metadata into full HTTPS URLs. DOM.RIA photos are served from cdn.riastatic.com. @@ -67,96 +80,6 @@ return urls -def normalize_listing( - catalog_item: Dict[str, Any], - detail_realty: Optional[Dict[str, Any]] = None, - city_name_meta: str = "", -) -> Dict[str, Any]: - """Create the payload dict that will be sent to data_collector. - - Strategy: - * Simple / stable fields → normalized scalar values. 
- * Complex / nested / volatile fields → kept raw under `raw_domria`. - """ - c = catalog_item # shorthand - d = detail_realty or {} - - # --- normalized simple fields ------------------------------------------------ - price_usd = _safe_int(c.get("price_USD")) or _safe_int(c.get("price_usd")) - - # Try detail priceObj first (most explicit) - if price_usd is None: - price_usd = _safe_int(d.get("priceObj", {}).get("priceUSD")) - - # Try catalog priceArr (dict with keys 1/2/3 or list of objects) - if price_usd is None: - pa = c.get("priceArr") or d.get("priceArr") - if isinstance(pa, dict): - # DOM.RIA sends {'1': '130 000', '2': '112 317', '3': '5 863 000'} - # Key '1' is USD in all observed cases - price_usd = _safe_int(pa.get("1")) - elif isinstance(pa, list): - for entry in pa: - if isinstance(entry, dict) and entry.get("currency") == "USD": - price_usd = _safe_int(entry.get("price")) - break - - # Fallback to plain price field (usually already in USD on sale pages) - if price_usd is None: - price_usd = _safe_int(c.get("price")) - - # --- photos ------------------------------------------------------------------ - raw_catalog_photos = _extract_photos(c.get("photos")) - raw_detail_photos = _extract_photos(d.get("photos")) - # Prefer detail photos (usually more / higher quality), fallback to catalog - photo_source = raw_detail_photos if raw_detail_photos else raw_catalog_photos - photo_urls = _build_photo_urls(photo_source) - if not photo_urls: - # Last resort: detail sometimes has a single `main_photo` dict - main = d.get("main_photo") - if isinstance(main, dict): - photo_urls = _build_photo_urls([main]) - - normalized = { - "external_id": str(c.get("realty_id")), - "url": f"https://dom.ria.com/uk/{(c.get('beautiful_url') or '').lstrip('/')}", - "title": (d.get("title") or c.get("title") or "").strip(), - "description": (d.get("description") or c.get("description") or "").strip(), - "price_usd": price_usd, - "price_raw": c.get("priceArr") or c.get("price"), - 
def _build_address(city: str, district: str, street: str, building: str) -> str:
    """Combine address parts into a single human-readable string.

    Empty parts are skipped, so the result never contains dangling separators.
    """
    return ", ".join(part for part in (city, district, street, building) if part)


def _first_text(*candidates: Any) -> str:
    """Return the first truthy candidate as a stripped string, or "" if none.

    Values are passed through ``str()`` before stripping because the raw JSON
    may carry numbers (e.g. a numeric building number) where text is expected.
    """
    for value in candidates:
        if value:
            return str(value).strip()
    return ""


def _resolve_price_usd(c: Dict[str, Any], d: Dict[str, Any]) -> Optional[int]:
    """Find the USD price, trying the most explicit sources first.

    Order: catalog ``price_USD`` / ``price_usd``, detail ``priceObj.priceUSD``,
    ``priceArr`` (catalog, then detail), and finally the plain catalog ``price``.
    Returns whatever ``_safe_int`` yields for the first source that parses,
    or None when nothing does.
    """
    price = _safe_int(c.get("price_USD")) or _safe_int(c.get("price_usd"))

    if price is None:
        # `priceObj` may be present but null (or not a dict at all), so a
        # `.get("priceObj", {})` default is not enough to keep this safe.
        price_obj = d.get("priceObj")
        if isinstance(price_obj, dict):
            price = _safe_int(price_obj.get("priceUSD"))

    if price is None:
        price_arr = c.get("priceArr") or d.get("priceArr")
        if isinstance(price_arr, dict):
            # DOM.RIA sends {'1': '130 000', '2': '112 317', '3': '5 863 000'};
            # key '1' is USD in all observed cases.
            price = _safe_int(price_arr.get("1"))
        elif isinstance(price_arr, list):
            for entry in price_arr:
                if isinstance(entry, dict) and entry.get("currency") == "USD":
                    price = _safe_int(entry.get("price"))
                    break

    if price is None:
        # Last resort: the plain catalog price field.
        price = _safe_int(c.get("price"))
    return price


def normalize_listing(
    catalog_item: Dict[str, Any],
    detail_realty: Optional[Dict[str, Any]] = None,
    city_name_meta: str = "",
) -> Dict[str, Any]:
    """Create the payload dict that will be sent to data_collector.

    Maps DOM.RIA fields to the data_collector PayloadSchema keys (title,
    description, price, url, images, contact_phone, address, area, rooms,
    floor) where possible. Additional normalized fields are kept as extras,
    and the untouched source objects are stored under ``raw_domria``.

    Args:
        catalog_item: Listing object from the catalog (search) response.
        detail_realty: Optional detail object for the same listing; when
            given, its values are preferred for text fields and photos.
        city_name_meta: City name used when neither object carries one.

    Returns:
        The payload dict with every None-valued key removed. Empty strings
        and empty lists are kept.
    """
    c = catalog_item  # shorthand
    d = detail_realty or {}

    # --- price -------------------------------------------------------------------
    price = _resolve_price_usd(c, d)

    # --- images ------------------------------------------------------------------
    # Prefer detail photos, fall back to the catalog ones.
    photo_source = _extract_photos(d.get("photos")) or _extract_photos(c.get("photos"))
    image_urls = _build_image_urls(photo_source)
    if not image_urls:
        # Last resort: detail sometimes has a single `main_photo` dict.
        main_photo = d.get("main_photo")
        if isinstance(main_photo, dict):
            image_urls = _build_image_urls([main_photo])

    # --- phones ------------------------------------------------------------------
    phones = _extract_phones(d)
    contact_phone = phones[0] if phones else None

    # --- address -----------------------------------------------------------------
    city = _first_text(d.get("city_name"), c.get("city_name"), city_name_meta)
    district = _first_text(d.get("district_name"), c.get("district_name"))
    street = _first_text(d.get("street_name"), c.get("street_name"))
    building = _first_text(d.get("building_number"), c.get("building_number"))
    address = _build_address(city, district, street, building)

    # --- misc --------------------------------------------------------------------
    area_total = _safe_float(c.get("total_square_meters"))

    # Avoid emitting the literal string "None" as an id when realty_id is absent;
    # a None value is dropped by the final filter instead.
    realty_id = c.get("realty_id")
    external_id = str(realty_id) if realty_id is not None else None

    # Keep only tags that actually carry a name (no None entries in the list).
    tags: List[Any] = []
    for tag in d.get("tags") or c.get("tags") or []:
        if isinstance(tag, dict):
            tag_name = tag.get("name_uk") or tag.get("name")
            if tag_name:
                tags.append(tag_name)

    # --- schema-aligned payload --------------------------------------------------
    normalized: Dict[str, Any] = {
        # Strict schema fields (PayloadSchema)
        "title": _first_text(d.get("title"), c.get("title")),
        "description": _first_text(d.get("description"), c.get("description")),
        "price": price,
        "url": f"https://dom.ria.com/uk/{(c.get('beautiful_url') or '').lstrip('/')}",
        "images": image_urls,
        "contact_phone": contact_phone,
        "address": address,
        "area": area_total,
        "rooms": _safe_int(c.get("rooms_count")),
        "floor": _safe_int(c.get("floor")),

        # Extra fields (extra="allow" in PayloadSchema)
        "external_id": external_id,
        "price_usd": price,
        "price_raw": c.get("priceArr") or c.get("price"),
        "area_total_m2": area_total,
        "area_living_m2": _safe_float(
            d.get("living_square_meters") or c.get("living_square_meters")
        ),
        "area_kitchen_m2": _safe_float(
            d.get("kitchen_square_meters") or c.get("kitchen_square_meters")
        ),
        "floors_total": _safe_int(d.get("floors_count") or c.get("floors_count")),
        "city_name": city,
        "district_name": district,
        "street_name": street,
        "building_number": building,
        "latitude": _safe_float(c.get("lat")),
        "longitude": _safe_float(c.get("lng")),
        "contact_phones": phones,
        "tags": tags,
        "raw_domria": {
            "catalog_item": c,
            "detail_realty": d,
        },
    }

    # Remove None values to keep payload compact
    return {key: value for key, value in normalized.items() if value is not None}