diff --git a/src/config.py b/src/config.py
index 4efb9b3..8ad12bf 100644
--- a/src/config.py
+++ b/src/config.py
@@ -50,9 +50,8 @@
     "rent_daily": "posutochnaya-arenda",
 }
 
-# --- Cities (verified or partially verified) ----------------------------------
-# slug: common name (for logging / payload metadata)
-CITY_SLUGS: Dict[str, str] = {
+# Fallback city names (enriched dynamically from sitemap + homepage)
+FALLBACK_CITY_NAMES: Dict[str, str] = {
     "kiev": "Київ",
     "lvov": "Львів",
     "odessa": "Одеса",
@@ -63,7 +62,6 @@
     "lutsk": "Луцьк",
     "nikolaev": "Миколаїв",
     "khmelnytskyi": "Хмельницький",
-    # TODO: extend full list
 }
 
 
@@ -92,7 +90,8 @@
     operations: List[str] = None,
 ) -> List[CrawlTarget]:
     """Generate all target combinations (Cartesian product)."""
-    city_slugs = city_slugs or list(CITY_SLUGS.keys())
+    # Keep a default so callers that omit city_slugs still get a usable list.
+    city_slugs = city_slugs or list(FALLBACK_CITY_NAMES.keys())
     category_ids = category_ids or list(CATEGORY_MAP.keys())
     operations = operations or list(OPERATION_SLUGS.keys())
 
@@ -103,7 +102,7 @@
                 targets.append(
                     CrawlTarget(
                         city_slug=city,
-                        city_name=CITY_SLUGS.get(city, city),
+                        city_name=FALLBACK_CITY_NAMES.get(city, city),
                         category_id=cat_id,
                         category_slug=CATEGORY_MAP[cat_id],
                         operation=op,
diff --git a/src/discovery.py b/src/discovery.py
new file mode 100644
index 0000000..ca3353d
--- /dev/null
+++ b/src/discovery.py
@@ -0,0 +1,181 @@
+"""Dynamic city slug discovery from DOM.RIA sitemaps and homepage."""
+import gzip
+import re
+from typing import List, Set
+
+from curl_cffi import requests
+
+from src.config import BASE_URL, IMPERSONATE
+
+
+# Known sub-page indicators that are NOT cities
+_SUBPAGE_INDICATORS = [
+    "-metro-",
+    "-massyv-",
+    "-uzvoz-",
+    "-rayon-",
+    "-zhk-",
+    "-pereulok-",
+    "-ul-",
+    "-bulvar-",
+    "-prospekt-",
+    "-ploshchad-",
+    "-vul-",
+    "-proezd-",
+    "-doroga-",
+    "-shosse-",
+    "-naberezhnaia-",
+    "-km-",
+    "-mkr-",
+    "-kvartal-",
+    "-poselok-",
+    "-smt-",
+    "-gorodok-",
+    "-vezd-",
+    "-tupyk-",
+    "-mykroraion-",
+    "-urochyshche-",
+    "-plato-",
+    "-alleia-",
+    "-zhk",  # suffix like foo-zhk-123
+    "-ul",  # suffix like foo-ul-bar
+    "-rayon",  # suffix like foo-rayon
+    "-obl-",  # region prefix (handled separately)
+]
+
+# These look like streets/buildings inside cities
+_SUBPAGE_SUFFIXES = [
+    "-1-ia-ulytsa-",
+    "-3-ia-ulytsa-",
+    "-4-ia-ulytsa-",
+    "-5-ia-ulytsa-",
+    "-6-ia-ulytsa-",
+    "-7-ia-ulytsa-",
+    "-12-ia-ulytsa-",
+]
+
+
+def _is_subpage(slug: str) -> bool:
+    """Return True if the slug is clearly a district/street/metro, not a city."""
+    if slug.startswith("obl-"):
+        return True
+    for indicator in _SUBPAGE_INDICATORS:
+        if indicator in slug:
+            return True
+    for suffix in _SUBPAGE_SUFFIXES:
+        if suffix in slug:
+            return True
+    return False
+
+
+def _is_likely_city(slug: str, all_slugs: Set[str]) -> bool:
+    """Heuristic: true cities are not sub-pages and do not extend another slug.
+
+    A slug such as ``boryspol-schastlyvoe`` is treated as a sub-page when the
+    shorter slug ``boryspol`` is also present in ``all_slugs``.
+
+    Args:
+        slug: Candidate slug to classify.
+        all_slugs: Every slug extracted from the same source.
+
+    Returns:
+        True when the slug passes both checks, False otherwise.
+    """
+    if _is_subpage(slug):
+        return False
+
+    # Test each hyphen-delimited prefix of the slug against the set instead of
+    # scanning every known slug for every candidate: the work per slug then
+    # depends on its length only, not on the size of the sitemap.
+    for pos, char in enumerate(slug):
+        if char == "-" and slug[:pos] in all_slugs:
+            return False
+
+    return True
+
+
+def fetch_city_slugs_from_sitemap(
+    category_slug: str = "kvartir",
+    operation_slug: str = "prodazha",
+) -> List[str]:
+    """Download the non-tag sitemap for a category and extract city slugs.
+
+    Args:
+        category_slug: e.g. 'kvartir', 'domov'
+        operation_slug: e.g. 'prodazha', 'arenda'
+
+    Returns:
+        Sorted list of unique city slugs.
+
+    Raises:
+        Propagates errors from the HTTP request (``raise_for_status``) and
+        from gzip decompression / UTF-8 decoding.
+    """
+    sitemap_url = f"{BASE_URL}/{operation_slug}-{category_slug}/non-tag-sitemap.xml.gz"
+    print(f"[discovery] Fetching sitemap: {sitemap_url}")
+    resp = requests.get(sitemap_url, impersonate=IMPERSONATE, timeout=60)
+    resp.raise_for_status()
+
+    text = gzip.decompress(resp.content).decode("utf-8")
+
+    # Extract all URLs: the sitemap protocol stores each one in a <loc> element
+    urls = re.findall(r"<loc>\s*([^<]+?)\s*</loc>", text)
+    print(f"[discovery] Total URLs in sitemap: {len(urls)}")
+
+    # Extract slugs (path segment after /{op}-{type}/); the arguments are
+    # escaped so they are always matched literally.
+    slugs: Set[str] = set()
+    section = re.escape(f"{operation_slug}-{category_slug}")
+    pattern = re.compile(rf"/{section}/([^/]+)/")
+    for u in urls:
+        m = pattern.search(u)
+        if m:
+            slugs.add(m.group(1))
+
+    print(f"[discovery] Unique slugs: {len(slugs)}")
+
+    # Filter to likely cities
+    cities = [s for s in slugs if _is_likely_city(s, slugs)]
+    cities.sort()
+    print(f"[discovery] Probable cities after filtering: {len(cities)}")
+    return cities
+
+
+def fetch_city_slugs_from_homepage() -> List[str]:
+    """Extract city slugs from the homepage navigation links.
+
+    This is a smaller but very reliable list (popular cities shown in UI).
+    """
+    print("[discovery] Fetching homepage for city links …")
+    resp = requests.get(f"{BASE_URL}/", impersonate=IMPERSONATE, timeout=30)
+    resp.raise_for_status()
+
+    # Look for links like /uk/prodazha-kvartir/kiev/
+    cities: Set[str] = set()
+    for cat in ["kvartir", "domov"]:
+        for op in ["prodazha", "arenda"]:
+            found = re.findall(rf'href="/uk/{op}-{cat}/([^/"]+)/"', resp.text)
+            cities.update(found)
+
+    # Filter out regions
+    cities = {c for c in cities if not c.startswith("obl-")}
+
+    result = sorted(cities)
+    print(f"[discovery] Cities from homepage: {len(result)}")
+    return result
+
+
+def discover_all_city_slugs() -> List[str]:
+    """Return a merged, sorted list of city slugs from all sources.
+
+    Priority:
+    1. Sitemap (apartments sale) — broadest coverage
+    2. Homepage navigation — validation / fallback
+    """
+    sitemap_cities = set(fetch_city_slugs_from_sitemap())
+    homepage_cities = set(fetch_city_slugs_from_homepage())
+
+    merged = sitemap_cities | homepage_cities
+    result = sorted(merged)
+    print(f"[discovery] Total unique cities after merge: {len(result)}")
+    return result
diff --git a/src/main.py b/src/main.py
index cb8a0e1..a012089 100644
--- a/src/main.py
+++ b/src/main.py
@@ -3,9 +3,10 @@
 import sys
 from typing import List, Optional
 
-from src.config import build_targets, CATEGORY_MAP, CITY_SLUGS, OPERATION_SLUGS
+from src.config import build_targets, CATEGORY_MAP, OPERATION_SLUGS
 from src.collector import DataCollectorClient
 from src.crawler import Crawler
+from src.discovery import discover_all_city_slugs
 from src.session import DomRiaSession
 from src.storage import ResumeStorage
 
@@ -15,7 +16,7 @@
     parser.add_argument(
         "--city",
         action="append",
-        help="City slug to scrape (can be given multiple times). Default: all.",
+        help="City slug to scrape (can be given multiple times). Default: auto-discover all from DOM.RIA.",
     )
     parser.add_argument(
         "--category",
@@ -51,14 +52,36 @@
         action="store_true",
         help="Skip ingestion to data_collector (parse only).",
     )
+    parser.add_argument(
+        "--discover-cities",
+        action="store_true",
+        help="Only discover and print city slugs from DOM.RIA, then exit.",
+    )
     return parser.parse_args()
 
 
 def main() -> int:
     args = parse_args()
 
-    # --- build targets --------------------------------------------------------
-    city_slugs: Optional[List[str]] = args.city
+    # --- discover cities only mode --------------------------------------------
+    if args.discover_cities:
+        cities = discover_all_city_slugs()
+        print(f"\nDiscovered {len(cities)} cities / settlements:")
+        for c in cities:
+            print(f" {c}")
+        return 0
+
+    # --- resolve city slugs -----------------------------------------------------
+    city_slugs: List[str]
+    if args.city:
+        city_slugs = args.city
+        print(f"[main] Using {len(city_slugs)} explicitly provided city(s)")
+    else:
+        print("[main] Discovering city list from DOM.RIA …")
+        city_slugs = discover_all_city_slugs()
+        print(f"[main] Discovered {len(city_slugs)} cities / settlements")
+
+    # --- build targets ----------------------------------------------------------
     category_ids: Optional[List[int]] = args.category
     operations: Optional[List[str]] = args.operation
 
@@ -72,7 +95,7 @@
         print("[main] Nothing to do.")
         return 0
 
-    # --- init components ------------------------------------------------------
+    # --- init components --------------------------------------------------------
     print("[main] Initializing session …")
     session = DomRiaSession()
     session.warmup()
@@ -91,7 +114,7 @@
 
     crawler = Crawler(session=session, collector=collector, storage=storage)
 
-    # --- run ------------------------------------------------------------------
+    # --- run --------------------------------------------------------------------
     for idx, target in enumerate(targets, 1):
         print(f"\n[main] Target {idx}/{len(targets)}")
         try: