# Newer
# Older
# vmk-360-domria_parser / src / discovery.py
"""Dynamic city slug discovery from DOM.RIA sitemaps and homepage."""
import gzip
import re
from typing import List, Set

from curl_cffi import requests

from src.config import BASE_URL, IMPERSONATE


# Known sub-page indicators that are NOT cities.
# `_is_subpage` tests every entry with `indicator in slug`, so each marker
# matches anywhere inside the slug, not only at a fixed position.
_SUBPAGE_INDICATORS = [
    "-metro-",
    "-massyv-",
    "-uzvoz-",
    "-rayon-",
    "-zhk-",
    "-pereulok-",
    "-ul-",
    "-bulvar-",
    "-prospekt-",
    "-ploshchad-",
    "-vul-",
    "-proezd-",
    "-doroga-",
    "-shosse-",
    "-naberezhnaia-",
    "-km-",
    "-mkr-",
    "-kvartal-",
    "-poselok-",
    "-smt-",
    "-gorodok-",
    "-vezd-",
    "-tupyk-",
    "-mykroraion-",
    "-urochyshche-",
    "-plato-",
    "-alleia-",
    # The three entries below have no trailing hyphen. Because matching is by
    # substring, each one also covers its "-xxx-" counterpart listed above.
    "-zhk",        # e.g. foo-zhk, foo-zhk-123
    "-ul",         # e.g. foo-ul, foo-ul-bar; also any slug containing "-ul..."
    "-rayon",      # e.g. foo-rayon
    # NOTE: "-obl-" is NOT here — e.g. "brovary-obl-kievskaya" is a real city
]

# Numbered-street markers; these look like streets/buildings inside cities.
# Despite the name, `_is_subpage` tests them as substrings (`suffix in slug`),
# not as string suffixes.
# NOTE(review): every entry contains "-ul", which is already listed in
# _SUBPAGE_INDICATORS, so this list looks redundant — confirm before removing.
_SUBPAGE_SUFFIXES = [
    "-1-ia-ulytsa-",
    "-3-ia-ulytsa-",
    "-4-ia-ulytsa-",
    "-5-ia-ulytsa-",
    "-6-ia-ulytsa-",
    "-7-ia-ulytsa-",
    "-12-ia-ulytsa-",
]


def _is_subpage(slug: str) -> bool:
    """Tell whether ``slug`` is clearly a district/street/metro page, not a city.

    A slug is treated as a sub-page when it starts with ``obl-`` or when it
    contains, anywhere, one of the markers in ``_SUBPAGE_INDICATORS`` or
    ``_SUBPAGE_SUFFIXES``.

    Args:
        slug: Path segment to classify.

    Returns:
        True if any of the sub-page rules matches, otherwise False.
    """
    # Short-circuits in the same order as the rules are listed above.
    return (
        slug.startswith("obl-")
        or any(marker in slug for marker in _SUBPAGE_INDICATORS)
        or any(marker in slug for marker in _SUBPAGE_SUFFIXES)
    )


def _is_likely_city(slug: str, all_slugs: Set[str]) -> bool:
    """Heuristically decide whether ``slug`` names a city.

    A slug is rejected when ``_is_subpage`` flags it, or when some other slug
    in ``all_slugs`` is a hyphen-delimited prefix of it (e.g.
    ``boryspol-schastlyvoe`` is rejected when ``boryspol`` is present, because
    it is then most likely a ``city-district`` page).

    Args:
        slug: Candidate slug.
        all_slugs: Every slug discovered in the same source; used for the
            prefix check.

    Returns:
        True if the slug passes both checks, otherwise False.
    """
    if _is_subpage(slug):
        return False

    # "Some other slug `o` satisfies slug.startswith(o + '-')" is the same as
    # "the text before one of slug's hyphens is itself a known slug". Probing
    # those few prefixes directly avoids scanning all of `all_slugs` per call,
    # which made filtering a whole sitemap quadratic.
    cut = slug.find("-")
    while cut != -1:
        if slug[:cut] in all_slugs:
            return False
        cut = slug.find("-", cut + 1)

    return True


def fetch_city_slugs_from_sitemap(category_slug: str = "kvartir", operation_slug: str = "prodazha") -> List[str]:
    """Download the non-tag sitemap for a category and extract city slugs.

    The sitemap is fetched from
    ``{BASE_URL}/{operation_slug}-{category_slug}/non-tag-sitemap.xml.gz``,
    every ``<loc>`` URL is scanned for the path segment that follows
    ``/{operation_slug}-{category_slug}/``, and the resulting slugs are
    filtered with ``_is_likely_city``.

    Args:
        category_slug: e.g. 'kvartir', 'domov'
        operation_slug: e.g. 'prodazha', 'arenda'

    Returns:
        Sorted list of unique city slugs.

    Raises:
        Whatever ``resp.raise_for_status()`` raises for an HTTP error status,
        and ``UnicodeDecodeError`` if the payload is not valid UTF-8.
    """
    sitemap_url = f"{BASE_URL}/{operation_slug}-{category_slug}/non-tag-sitemap.xml.gz"
    print(f"[discovery] Fetching sitemap: {sitemap_url}")
    resp = requests.get(sitemap_url, impersonate=IMPERSONATE, timeout=60)
    resp.raise_for_status()

    payload = resp.content
    # Only gunzip when the body really is a gzip stream (magic bytes 1f 8b).
    # If the body arrives already decompressed (e.g. the transfer encoding was
    # undone before we see it), calling gzip.decompress() on plain XML would
    # raise instead of letting us parse it.
    if payload[:2] == b"\x1f\x8b":
        payload = gzip.decompress(payload)
    text = payload.decode("utf-8")

    # Extract all URLs
    urls = re.findall(r"<loc>([^<]+)</loc>", text)
    print(f"[discovery] Total URLs in sitemap: {len(urls)}")

    # Extract slugs (path segment after /{op}-{type}/). The arguments are
    # escaped so they are matched literally even if they contain regex
    # metacharacters.
    pattern = re.compile(
        rf"/{re.escape(operation_slug)}-{re.escape(category_slug)}/([^/]+)/"
    )
    slugs: Set[str] = {m.group(1) for m in map(pattern.search, urls) if m}

    print(f"[discovery] Unique slugs: {len(slugs)}")

    # Filter to likely cities
    cities = sorted(s for s in slugs if _is_likely_city(s, slugs))
    print(f"[discovery] Probable cities after filtering: {len(cities)}")
    return cities


def fetch_city_slugs_from_homepage() -> List[str]:
    """Collect city slugs from links found in the homepage HTML.

    The page at ``{BASE_URL}/`` is fetched and searched for ``href``
    attributes of the form ``/uk/<operation>-<category>/<slug>/`` for the
    operations ``prodazha``/``arenda`` and categories ``kvartir``/``domov``.
    Slugs starting with ``obl-`` (regions) are dropped.

    Returns:
        Sorted list of unique slugs.

    Raises:
        Whatever ``raise_for_status()`` raises for an HTTP error status.
    """
    print(f"[discovery] Fetching homepage for city links …")
    response = requests.get(f"{BASE_URL}/", impersonate=IMPERSONATE, timeout=30)
    response.raise_for_status()
    html = response.text

    # One pattern per category/operation pair, e.g. /uk/prodazha-kvartir/kiev/
    link_patterns = [
        rf'href="/uk/{op}-{cat}/([^/"]+)/"'
        for cat in ["kvartir", "domov"]
        for op in ["prodazha", "arenda"]
    ]

    # Gather every captured slug, skipping region pages on the way.
    slugs: Set[str] = {
        slug
        for link_pattern in link_patterns
        for slug in re.findall(link_pattern, html)
        if not slug.startswith("obl-")
    }

    result = sorted(slugs)
    print(f"[discovery] Cities from homepage: {len(result)}")
    return result


def discover_all_city_slugs() -> List[str]:
    """Return the sorted union of city slugs from every discovery source.

    Sources, queried in this order:
        1. Sitemap (default arguments: apartments sale) — broadest coverage
        2. Homepage navigation — validation / fallback

    Exceptions raised by either fetcher are not caught here and propagate
    to the caller.

    Returns:
        Sorted list of unique slugs found by at least one source.
    """
    from_sitemap = fetch_city_slugs_from_sitemap()
    from_homepage = fetch_city_slugs_from_homepage()

    # Deduplicate across both sources before sorting.
    combined: Set[str] = set(from_sitemap)
    combined.update(from_homepage)

    result = sorted(combined)
    print(f"[discovery] Total unique cities after merge: {len(result)}")
    return result