"""Configuration for DOM.RIA parser."""
from dataclasses import dataclass
from typing import Dict, List, Optional
# --- data_collector -----------------------------------------------------------
# Location of the data_collector service and the API paths used on it.
DATA_COLLECTOR_BASE_URL = "http://localhost:8020"
DATA_COLLECTOR_INGEST_ENDPOINT = "/api/v1/ingest"
DATA_COLLECTOR_HEALTH_ENDPOINT = "/api/v1/health"
DATA_COLLECTOR_SOURCE_SLUG = "domria"

# --- DOM.RIA scraping ---------------------------------------------------------
BASE_URL = "https://dom.ria.com/uk"
HOMEPAGE_URL = BASE_URL + "/"

# Browser profile passed to curl_cffi as the impersonation target
# (noted by the original author as tested and working).
IMPERSONATE = "chrome124"

# Headers shared by every HTTP request.
DEFAULT_HEADERS: Dict[str, str] = {
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8"
    ),
    "Accept-Language": "uk-UA,uk;q=0.9,ru;q=0.8,en;q=0.7",
    "Accept-Encoding": "gzip, deflate, br",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
}

# --- Rate limits (seconds) ----------------------------------------------------
DELAY_BETWEEN_CATALOG_PAGES = 10.0
DELAY_BETWEEN_DETAIL_PAGES = 10.0
# Presumably a longer pause of PAUSE_DURATION_SECONDS is taken after every
# PAUSE_EVERY_N_CATALOG_PAGES catalog pages -- confirm against the crawler.
PAUSE_EVERY_N_CATALOG_PAGES = 50
PAUSE_DURATION_SECONDS = 120.0

# --- Categories ---------------------------------------------------------------
# Maps a category type id to the slug fragment used in the URL
# (e.g. "kvartir" for /prodazha-kvartir/).
CATEGORY_MAP: Dict[int, str] = {
    1: "kvartir",  # apartments
    2: "domov",  # houses
    3: "uchastkov",  # land
    4: "kommercheskih",  # commercial
    5: "garazhey",  # garages
}

# Maps an operation name to the prefix it contributes to the URL.
OPERATION_SLUGS: Dict[str, str] = {
    "sale": "prodazha",
    "rent": "arenda",
    "rent_daily": "posutochnaya-arenda",
}

# Fallback mapping of city slug to city name
# (enriched dynamically from sitemap + homepage).
FALLBACK_CITY_NAMES: Dict[str, str] = {
    "kiev": "Київ",
    "lvov": "Львів",
    "odessa": "Одеса",
    "kharkov": "Харків",
    "dnepr": "Дніпро",
    "cherkassy": "Черкаси",
    "ivano-frankovsk": "Івано-Франківськ",
    "lutsk": "Луцьк",
    "nikolaev": "Миколаїв",
    "khmelnytskyi": "Хмельницький",
}
@dataclass(frozen=True)
class CrawlTarget:
    """Immutable description of one crawl job: a city, a category and an operation.

    The ``*_slug`` fields hold the fragments that are substituted into the
    catalog URL by ``catalog_url_template``.
    """

    city_slug: str  # city fragment of the catalog URL path
    city_name: str
    category_id: int
    category_slug: str  # category fragment of the catalog URL path
    operation: str  # sale / rent / rent_daily
    operation_slug: str  # operation prefix of the catalog URL path

    @property
    def catalog_url_template(self) -> str:
        """Return the catalog URL with a literal ``{page}`` placeholder left in.

        Example: https://dom.ria.com/uk/prodazha-kvartir/kiev/?page={page}
        """
        # First path segment is "<operation>-<category>", e.g. "prodazha-kvartir".
        listing_section = f"{self.operation_slug}-{self.category_slug}"
        return f"{BASE_URL}/{listing_section}/{self.city_slug}/?page={{page}}"
def build_targets(
    city_slugs: Optional[List[str]] = None,
    category_ids: Optional[List[int]] = None,
    operations: Optional[List[str]] = None,
) -> List[CrawlTarget]:
    """Generate all target combinations (Cartesian product).

    Args:
        city_slugs: City slugs to crawl. ``None`` selects every slug in
            ``FALLBACK_CITY_NAMES``; an explicit empty list yields no targets.
        category_ids: Keys of ``CATEGORY_MAP``. ``None`` or an empty list
            selects every category.
        operations: Keys of ``OPERATION_SLUGS``. ``None`` or an empty list
            selects every operation.

    Returns:
        One ``CrawlTarget`` per (city, category, operation) combination,
        ordered by city, then category, then operation.

    Raises:
        KeyError: If a category id or operation is not a key of
            ``CATEGORY_MAP`` / ``OPERATION_SLUGS``.
    """
    # The default used to be iterated as-is, so calling build_targets() with
    # no cities raised TypeError. Only None falls back to the known cities;
    # an explicit empty list keeps returning no targets.
    if city_slugs is None:
        city_slugs = list(FALLBACK_CITY_NAMES)
    category_ids = category_ids or list(CATEGORY_MAP)
    operations = operations or list(OPERATION_SLUGS)

    return [
        CrawlTarget(
            city_slug=city,
            # Cities missing from the fallback table use their slug as the name.
            city_name=FALLBACK_CITY_NAMES.get(city, city),
            category_id=cat_id,
            category_slug=CATEGORY_MAP[cat_id],
            operation=op,
            operation_slug=OPERATION_SLUGS[op],
        )
        for city in city_slugs
        for cat_id in category_ids
        for op in operations
    ]