vmk-360-domria_parser/src/main.py at a679a818b5b5af7febd750e966f0ae4e53f49635

Fork: 0
root / vmk-360-domria_parser
Find file
Newer
Older
vmk-360-domria_parser / src / main.py
Eugene Sukhodolskiy 1 day ago 3 KB Implement MVP DOM.RIA parser with curl_cffi scraping
Raw Blame History
"""CLI entry point for the DOM.RIA parser."""
import argparse
import sys
from typing import List, Optional

from src.config import build_targets, CATEGORY_MAP, CITY_SLUGS, OPERATION_SLUGS
from src.collector import DataCollectorClient
from src.crawler import Crawler
from src.session import DomRiaSession
from src.storage import ResumeStorage


def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse command-line options for the DOM.RIA scraper.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes
            argparse fall back to ``sys.argv[1:]``, which is what the CLI
            entry point relies on. Passing an explicit list allows the
            parser to be driven programmatically.

    Returns:
        Namespace with ``city``, ``category`` and ``operation`` (each a list,
        or ``None`` when the flag was never given), plus ``max_pages``,
        ``db``, ``collector_url`` and ``dry_run``.
    """
    parser = argparse.ArgumentParser(description="DOM.RIA → data_collector scraper")

    # --- target selection (repeatable flags; omitted flag -> None) ------------
    parser.add_argument(
        "--city",
        action="append",
        help="City slug to scrape (can be given multiple times). Default: all.",
    )
    parser.add_argument(
        "--category",
        type=int,
        action="append",
        choices=list(CATEGORY_MAP.keys()),
        help="Category ID (1=apartments, 2=houses, 3=land, 4=commercial, 5=garages). Default: all.",
    )
    parser.add_argument(
        "--operation",
        action="append",
        choices=list(OPERATION_SLUGS.keys()),
        help="Operation (sale, rent, rent_daily). Default: all.",
    )

    # --- run-time knobs -------------------------------------------------------
    parser.add_argument(
        "--max-pages",
        type=int,
        default=None,
        help="Limit catalog pages per target (for testing).",
    )
    parser.add_argument(
        "--db",
        default="domria_resume.db",
        help="Path to SQLite resume DB.",
    )
    parser.add_argument(
        "--collector-url",
        default="http://localhost:8020",
        help="data_collector base URL.",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Skip ingestion to data_collector (parse only).",
    )
    return parser.parse_args(argv)


def main() -> int:
    """Run the scraper CLI end to end and return a process exit code.

    Builds the crawl targets from the command-line filters, warms up the
    DOM.RIA session, optionally checks that data_collector is reachable,
    opens the resume DB and then crawls every target in order.

    Returns:
        ``0`` when there was nothing to do or every target finished without
        raising, ``1`` when data_collector is unreachable or at least one
        target raised, ``130`` when the user interrupted a crawl.
    """
    # Local import: only needed on the failure path of this entry point.
    import traceback

    args = parse_args()

    # --- build targets --------------------------------------------------------
    targets = build_targets(
        city_slugs=args.city,
        category_ids=args.category,
        operations=args.operation,
    )
    print(f"[main] {len(targets)} target(s) to process")
    if not targets:
        print("[main] Nothing to do.")
        return 0

    # --- init components ------------------------------------------------------
    print("[main] Initializing session …")
    session = DomRiaSession()
    session.warmup()

    # In dry-run mode the crawler receives ``collector=None``.
    collector: Optional[DataCollectorClient] = None
    if not args.dry_run:
        collector = DataCollectorClient(base_url=args.collector_url)
        if not collector.health_check():
            print("[main] data_collector is not reachable — aborting.")
            return 1
        print("[main] data_collector is healthy.")

    storage = ResumeStorage(db_path=args.db)
    stats = storage.stats()
    print(f"[main] Resume DB: {stats['total']} total processed, {stats['today']} today")

    crawler = Crawler(session=session, collector=collector, storage=storage)

    # --- run ------------------------------------------------------------------
    failed = 0
    for idx, target in enumerate(targets, 1):
        print(f"\n[main] Target {idx}/{len(targets)}")
        try:
            crawler.crawl_target(target, max_pages=args.max_pages)
        except KeyboardInterrupt:
            print("\n[main] Interrupted by user.")
            return 130
        except Exception as exc:
            # Continue with next target — we don't want one broken city to
            # kill a long-running multi-city job. The failure is still
            # counted and its traceback written to stderr so it is not lost.
            failed += 1
            print(f"[main] Unhandled exception on target {target}: {exc}")
            traceback.print_exc()

    print("\n[main] All targets finished.")
    if failed:
        # Non-zero exit lets schedulers/CI notice partially failed runs.
        print(f"[main] {failed} of {len(targets)} target(s) failed.")
        return 1
    return 0


if __name__ == "__main__":
    # Script entry: hand main()'s return value to the interpreter as the
    # process exit status.
    raise SystemExit(main())