# vmk-360-domria_parser / src / config.py
"""Configuration for DOM.RIA parser."""
from dataclasses import dataclass
from itertools import product
from typing import Dict, List, Optional

# --- data_collector -----------------------------------------------------------
# Settings for talking to the data_collector backend.
# NOTE(review): the endpoint values are URL paths, presumably appended to
# DATA_COLLECTOR_BASE_URL by the client code — confirm against the caller.
DATA_COLLECTOR_BASE_URL = "http://localhost:8020"
DATA_COLLECTOR_INGEST_ENDPOINT = "/api/v1/ingest"
DATA_COLLECTOR_INGEST_WITH_IMAGES_ENDPOINT = "/api/v1/ingest/with-images"
DATA_COLLECTOR_HEALTH_ENDPOINT = "/api/v1/health"
# NOTE(review): presumably identifies this parser as the data source in
# ingest requests — confirm against the data_collector API.
DATA_COLLECTOR_SOURCE_SLUG = "domria"

# --- DOM.RIA scraping ---------------------------------------------------------
# Site root that every catalog URL is built on; the homepage is the same
# root with a trailing slash.
BASE_URL: str = "https://dom.ria.com/uk"
HOMEPAGE_URL: str = BASE_URL + "/"

# Impersonation target passed to curl_cffi (tested working).
IMPERSONATE: str = "chrome124"

# HTTP headers shared across requests.
# The Accept value is split across two literals purely for line length; the
# adjacent strings are concatenated into a single header value.
DEFAULT_HEADERS: Dict[str, str] = {
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8"
    ),
    "Accept-Language": "uk-UA,uk;q=0.9,ru;q=0.8,en;q=0.7",
    "Accept-Encoding": "gzip, deflate, br",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
}

# --- Rate limits (seconds) ----------------------------------------------------
# The DELAY_* and PAUSE_DURATION_SECONDS values are durations in seconds.
# NOTE(review): presumably slept between consecutive catalog / detail page
# requests — confirm against the crawler loop.
DELAY_BETWEEN_CATALOG_PAGES = 10.0
DELAY_BETWEEN_DETAIL_PAGES = 10.0
# A page count, not a duration.
# NOTE(review): presumably a pause of PAUSE_DURATION_SECONDS is taken after
# every N catalog pages — confirm against the crawler loop.
PAUSE_EVERY_N_CATALOG_PAGES = 50
PAUSE_DURATION_SECONDS = 120.0

# --- Categories ---------------------------------------------------------------
# Category id -> category slug. build_targets() looks the slug up by id and
# CrawlTarget.catalog_url_template places it after the operation slug.
# Discovered from DOM.RIA homepage navigation links.
CATEGORY_MAP: Dict[int, str] = {
    1: "kvartir",          # apartments
    2: "domov",            # houses
    3: "uchastkov",        # land
    4: "kom-nedvizhimosti",  # commercial (offices + premises)
    5: "garazhei",         # garages / parking
    6: "komnat",           # rooms
}

# operation slug → URL prefix
# Keys are the operation names accepted by build_targets(); values are the
# slug that starts the "<operation>-<category>" path segment of a catalog URL.
# Discovered from DOM.RIA homepage: posutochnaia (not posutochnaya)
OPERATION_SLUGS: Dict[str, str] = {
    "sale": "prodazha",
    "rent": "arenda",
    "rent_daily": "posutochnaia-arenda",
}

# Fallback city names (enriched dynamically from sitemap + homepage)
# City slug -> display name. build_targets() uses the slug itself as the
# display name when a city is missing from this table.
FALLBACK_CITY_NAMES: Dict[str, str] = {
    "kiev": "Київ",
    "lvov": "Львів",
    "odessa": "Одеса",
    "kharkov": "Харків",
    "dnepr": "Дніпро",
    "cherkassy": "Черкаси",
    "ivano-frankovsk": "Івано-Франківськ",
    "lutsk": "Луцьк",
    "nikolaev": "Миколаїв",
    "khmelnytskyi": "Хмельницький",
}


@dataclass(frozen=True)
class CrawlTarget:
    """Immutable record for a single city / category / operation crawl unit.

    Instances are frozen, so they compare by value and are hashable.
    """

    city_slug: str
    city_name: str
    category_id: int
    category_slug: str
    operation: str  # one of: sale / rent / rent_daily
    operation_slug: str

    @property
    def catalog_url_template(self) -> str:
        """Return the catalog URL with a literal ``{page}`` placeholder.

        Example: https://dom.ria.com/uk/prodazha-kvartir/kiev/?page={page}
        """
        # "<operation>-<category>" forms the first path segment after the root.
        listing_segment = f"{self.operation_slug}-{self.category_slug}"
        # Doubled braces keep "{page}" in the result for later substitution.
        return f"{BASE_URL}/{listing_segment}/{self.city_slug}/?page={{page}}"


def build_targets(
    city_slugs: Optional[List[str]] = None,
    category_ids: Optional[List[int]] = None,
    operations: Optional[List[str]] = None,
) -> List[CrawlTarget]:
    """Generate all target combinations (Cartesian product).

    Args:
        city_slugs: City slugs to crawl. ``None`` selects every city in
            ``FALLBACK_CITY_NAMES``; an explicit empty list yields no targets.
        category_ids: Keys of ``CATEGORY_MAP``. ``None`` or an empty list
            selects every category.
        operations: Keys of ``OPERATION_SLUGS``. ``None`` or an empty list
            selects every operation.

    Returns:
        One ``CrawlTarget`` per combination, ordered by city, then category,
        then operation.

    Raises:
        KeyError: If a category id or operation name is not a known key.
    """
    # None is the signature default, so it must not be iterated directly.
    # Only None triggers the fallback: an explicit [] still means "no cities".
    if city_slugs is None:
        city_slugs = list(FALLBACK_CITY_NAMES)
    category_ids = category_ids or list(CATEGORY_MAP)
    operations = operations or list(OPERATION_SLUGS)

    # product() varies the last iterable fastest, which matches a
    # city -> category -> operation nesting of loops.
    return [
        CrawlTarget(
            city_slug=city,
            # Cities outside the fallback table use their slug as the name.
            city_name=FALLBACK_CITY_NAMES.get(city, city),
            category_id=cat_id,
            category_slug=CATEGORY_MAP[cat_id],
            operation=op,
            operation_slug=OPERATION_SLUGS[op],
        )
        for city, cat_id, op in product(city_slugs, category_ids, operations)
    ]