diff --git a/src/crawler.py b/src/crawler.py index 0f7cd2d..3052b88 100644 --- a/src/crawler.py +++ b/src/crawler.py @@ -118,6 +118,13 @@ city_name_meta=target.city_name, ) + photo_count = payload.get("photos_count", 0) + if photo_count == 0: + print( + f"[crawler] WARNING {realty_id}: zero photos in payload " + f"(catalog={len(item.get('photos') or [])}, detail={len(detail.get('photos') or []) if detail else 'N/A'})" + ) + if self.collector is not None: result = self.collector.ingest(realty_id, payload) if result: diff --git a/src/normalizer.py b/src/normalizer.py index a82830f..a3a52ec 100644 --- a/src/normalizer.py +++ b/src/normalizer.py @@ -23,6 +23,18 @@ return None +def _extract_photos(raw_photos) -> List[Dict]: + """Normalise DOM.RIA photo field to a list of dicts. + + Sometimes photos come as a list, sometimes as a dict keyed by ordering. + """ + if isinstance(raw_photos, list): + return raw_photos + if isinstance(raw_photos, dict): + return list(raw_photos.values()) + return [] + + def _build_photo_urls(photos: List[Dict]) -> List[str]: """Turn photo metadata into full HTTPS URLs. @@ -31,7 +43,15 @@ """ urls: List[str] = [] for p in photos: - base = p.get("file") or p.get("beautifulUrl") or p.get("photo_base_url") or p.get("url") or p.get("src") + if not isinstance(p, dict): + continue + base = ( + p.get("file") + or p.get("beautifulUrl") + or p.get("photo_base_url") + or p.get("url") + or p.get("src") + ) if not base: continue if base.startswith("//"): @@ -85,6 +105,18 @@ if price_usd is None: price_usd = _safe_int(c.get("price")) + # --- photos ------------------------------------------------------------------ + raw_catalog_photos = _extract_photos(c.get("photos")) + raw_detail_photos = _extract_photos(d.get("photos")) + # Prefer detail photos (usually more / higher quality), fallback to catalog + photo_source = raw_detail_photos if raw_detail_photos else raw_catalog_photos + photo_urls = _build_photo_urls(photo_source) + if not photo_urls: + # Last resort: detail sometimes has a single `main_photo` dict + main = d.get("main_photo") + if isinstance(main, dict): + photo_urls = _build_photo_urls([main]) + normalized = { "external_id": str(c.get("realty_id")), "url": f"https://dom.ria.com/uk/{(c.get('beautiful_url') or '').lstrip('/')}", @@ -110,7 +142,8 @@ "building_number": (d.get("building_number") or c.get("building_number") or "").strip(), "latitude": _safe_float(c.get("lat")), "longitude": _safe_float(c.get("lng")), - "photos": _build_photo_urls(d.get("photos") or c.get("photos") or []), + "photos": photo_urls, + "photos_count": len(photo_urls), "tags": [t.get("name_uk") or t.get("name") for t in (d.get("tags") or c.get("tags") or []) if isinstance(t, dict)], "contact_phones": _extract_phones(d), "raw_domria": {