"""Dynamic city slug discovery from DOM.RIA sitemaps and homepage."""
import gzip
import re
from typing import List, Set
from curl_cffi import requests
from src.config import BASE_URL, IMPERSONATE
# Markers that identify a slug as a sub-page rather than a city.
# _is_subpage tests each entry with ``in``, so an entry matches anywhere
# inside the slug.
_SUBPAGE_INDICATORS = [
    # Hyphen-delimited words; each one expands to "-<word>-".
    *(
        f"-{word}-"
        for word in (
            "metro",
            "massyv",
            "uzvoz",
            "rayon",
            "zhk",
            "pereulok",
            "ul",
            "bulvar",
            "prospekt",
            "ploshchad",
            "vul",
            "proezd",
            "doroga",
            "shosse",
            "naberezhnaia",
            "km",
            "mkr",
            "kvartal",
            "poselok",
            "smt",
            "gorodok",
            "vezd",
            "tupyk",
            "mykroraion",
            "urochyshche",
            "plato",
            "alleia",
        )
    ),
    # Forms without a trailing hyphen. Because matching is by substring,
    # these also cover the "-zhk-", "-ul-" and "-rayon-" entries above and
    # any word that merely starts with "zhk", "ul" or "rayon".
    "-zhk",
    "-ul",
    "-rayon",
    "-obl-",  # "obl" (region) inside a slug; a leading "obl-" is checked separately
]
# Numbered-street fragments of the form "-<N>-ia-ulytsa-", i.e. streets or
# buildings inside a city. Despite the name, _is_subpage matches these as
# substrings, not as trailing suffixes.
_SUBPAGE_SUFFIXES = [
    f"-{number}-ia-ulytsa-" for number in (1, 3, 4, 5, 6, 7, 12)
]
def _is_subpage(slug: str) -> bool:
    """Tell whether ``slug`` is a region/district/street/metro page, not a city.

    A slug is treated as a sub-page when it starts with ``obl-`` or when it
    contains any entry of ``_SUBPAGE_INDICATORS`` or ``_SUBPAGE_SUFFIXES``
    as a substring.

    Args:
        slug: URL path segment to classify.

    Returns:
        True if any sub-page marker matches, False otherwise.
    """
    # Region pages carry the marker at the very start of the slug.
    if slug.startswith("obl-"):
        return True

    markers = (*_SUBPAGE_INDICATORS, *_SUBPAGE_SUFFIXES)
    return any(marker in slug for marker in markers)
def _is_likely_city(slug: str, all_slugs: Set[str]) -> bool:
    """Heuristically decide whether ``slug`` is a city rather than a sub-page.

    A slug is rejected when:

    * ``_is_subpage`` flags it, or
    * some other slug in ``all_slugs`` is a hyphen-terminated prefix of it,
      e.g. ``boryspol-schastlyvoe`` is rejected when ``boryspol`` is present,
      because it then looks like ``city-district``.

    Args:
        slug: Candidate slug to classify.
        all_slugs: Every slug seen in the same source; used for the
            parent-prefix check.

    Returns:
        True if the slug passes both checks, False otherwise.
    """
    if _is_subpage(slug):
        return False

    # Instead of scanning every slug in ``all_slugs`` (quadratic when this is
    # called once per slug), look up each prefix of ``slug`` that ends right
    # before a hyphen. Such a prefix is always shorter than ``slug`` itself,
    # so no explicit "other != slug" test is needed.
    has_parent = any(
        slug[:pos] in all_slugs
        for pos, char in enumerate(slug)
        if char == "-"
    )
    return not has_parent
def fetch_city_slugs_from_sitemap(category_slug: str = "kvartir", operation_slug: str = "prodazha") -> List[str]:
    """Download the non-tag sitemap for a category and extract city slugs.

    The gzipped sitemap at
    ``{BASE_URL}/{operation_slug}-{category_slug}/non-tag-sitemap.xml.gz`` is
    fetched, every ``<loc>`` URL is collected, and the path segment that
    follows ``/{operation_slug}-{category_slug}/`` is taken as a slug. Slugs
    are then filtered with ``_is_likely_city``.

    Args:
        category_slug: e.g. 'kvartir', 'domov'
        operation_slug: e.g. 'prodazha', 'arenda'

    Returns:
        Sorted list of unique city slugs.

    Raises:
        Nothing is caught here: errors from the HTTP request,
        ``raise_for_status``, gzip decompression or UTF-8 decoding propagate
        to the caller.
    """
    sitemap_url = f"{BASE_URL}/{operation_slug}-{category_slug}/non-tag-sitemap.xml.gz"
    print(f"[discovery] Fetching sitemap: {sitemap_url}")
    resp = requests.get(sitemap_url, impersonate=IMPERSONATE, timeout=60)
    resp.raise_for_status()
    text = gzip.decompress(resp.content).decode("utf-8")

    # Extract all URLs
    urls = re.findall(r"<loc>([^<]+)</loc>", text)
    print(f"[discovery] Total URLs in sitemap: {len(urls)}")

    # Extract slugs (path segment after /{op}-{type}/). The two parameters are
    # literal URL fragments, so escape them before embedding them in a regex.
    pattern = re.compile(
        rf"/{re.escape(operation_slug)}-{re.escape(category_slug)}/([^/]+)/"
    )
    slugs: Set[str] = {
        match.group(1) for match in map(pattern.search, urls) if match
    }
    print(f"[discovery] Unique slugs: {len(slugs)}")

    # Filter to likely cities
    cities = sorted(s for s in slugs if _is_likely_city(s, slugs))
    print(f"[discovery] Probable cities after filtering: {len(cities)}")
    return cities
def fetch_city_slugs_from_homepage() -> List[str]:
    """Collect city slugs from links found on the homepage.

    The page at ``{BASE_URL}/`` is fetched and scanned for hrefs of the form
    ``/uk/{operation}-{category}/{slug}/`` for every combination of the
    ``kvartir``/``domov`` categories and ``prodazha``/``arenda`` operations.
    Slugs starting with ``obl-`` (regions) are discarded.

    Returns:
        Sorted list of unique slugs.

    Raises:
        Errors from the HTTP request or ``raise_for_status`` propagate.
    """
    print("[discovery] Fetching homepage for city links …")
    resp = requests.get(f"{BASE_URL}/", impersonate=IMPERSONATE, timeout=30)
    resp.raise_for_status()
    html = resp.text

    # One pattern per category/operation pair, e.g. /uk/prodazha-kvartir/kiev/
    link_patterns = [
        rf'href="/uk/{op}-{cat}/([^/"]+)/"'
        for cat in ("kvartir", "domov")
        for op in ("prodazha", "arenda")
    ]
    linked: Set[str] = {
        slug for link_re in link_patterns for slug in re.findall(link_re, html)
    }

    # Drop region pages; keep everything else.
    result = sorted(slug for slug in linked if not slug.startswith("obl-"))
    print(f"[discovery] Cities from homepage: {len(result)}")
    return result
def discover_all_city_slugs() -> List[str]:
    """Return the sorted union of city slugs from every discovery source.

    Sources, queried in this order:
        1. ``fetch_city_slugs_from_sitemap()`` with its default arguments
           (the ``prodazha-kvartir`` sitemap).
        2. ``fetch_city_slugs_from_homepage()``.

    Both sources are always queried and their results are unioned; an
    exception raised by either one propagates to the caller.

    Returns:
        Sorted list of unique slugs found in at least one source.
    """
    # Unpacking evaluates left to right, so the sitemap is fetched first.
    merged = {
        *fetch_city_slugs_from_sitemap(),
        *fetch_city_slugs_from_homepage(),
    }
    result = list(merged)
    result.sort()
    print(f"[discovery] Total unique cities after merge: {len(result)}")
    return result