"""CLI entry point for the DOM.RIA parser."""
import argparse
import sys
import traceback
from typing import List, Optional

from src.collector import DataCollectorClient
from src.config import build_targets, CATEGORY_MAP, CITY_SLUGS, OPERATION_SLUGS
from src.crawler import Crawler
from src.session import DomRiaSession
from src.storage import ResumeStorage
def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse the scraper's command-line options.

    Args:
        argv: Argument strings to parse. When ``None`` (the default) argparse
            falls back to ``sys.argv[1:]``, so existing zero-argument calls
            behave exactly as before. Passing an explicit list lets the
            parser be driven programmatically (e.g. from tests).

    Returns:
        Namespace with the attributes ``city``, ``category``, ``operation``
        (each a list, or ``None`` when the flag was never given),
        ``max_pages``, ``db``, ``collector_url`` and ``dry_run``.

    Raises:
        SystemExit: raised by argparse on ``--help`` or on invalid arguments.
    """
    parser = argparse.ArgumentParser(description="DOM.RIA → data_collector scraper")

    # The three filter flags are repeatable; leaving one out means "no filter".
    parser.add_argument(
        "--city",
        action="append",
        help="City slug to scrape (can be given multiple times). Default: all.",
    )
    parser.add_argument(
        "--category",
        type=int,
        action="append",
        choices=list(CATEGORY_MAP.keys()),
        help="Category ID (1=apartments, 2=houses, 3=land, 4=commercial, 5=garages). Default: all.",
    )
    parser.add_argument(
        "--operation",
        action="append",
        choices=list(OPERATION_SLUGS.keys()),
        help="Operation (sale, rent, rent_daily). Default: all.",
    )

    # Run-control options.
    parser.add_argument(
        "--max-pages",
        type=int,
        default=None,
        help="Limit catalog pages per target (for testing).",
    )
    parser.add_argument(
        "--db",
        default="domria_resume.db",
        help="Path to SQLite resume DB.",
    )
    parser.add_argument(
        "--collector-url",
        default="http://localhost:8020",
        help="data_collector base URL.",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Skip ingestion to data_collector (parse only).",
    )
    return parser.parse_args(argv)
def main() -> int:
    """Run the scraper over every selected target.

    Builds the target list from the CLI filters, warms up the site session,
    optionally verifies that data_collector is reachable, opens the resume
    storage and then crawls the targets one by one.

    Returns:
        Process exit status:
        ``0`` when there was nothing to do or every target completed,
        ``1`` when the data_collector health check failed or at least one
        target raised an exception,
        ``130`` when the user interrupted a crawl with Ctrl-C.
    """
    args = parse_args()

    # --- build targets --------------------------------------------------------
    city_slugs: Optional[List[str]] = args.city
    category_ids: Optional[List[int]] = args.category
    operations: Optional[List[str]] = args.operation
    targets = build_targets(
        city_slugs=city_slugs,
        category_ids=category_ids,
        operations=operations,
    )
    total = len(targets)
    print(f"[main] {total} target(s) to process")
    if not targets:
        print("[main] Nothing to do.")
        return 0

    # --- init components ------------------------------------------------------
    print("[main] Initializing session …")
    session = DomRiaSession()
    session.warmup()

    # In dry-run mode no collector client is created; the crawler receives None.
    collector: Optional[DataCollectorClient] = None
    if not args.dry_run:
        collector = DataCollectorClient(base_url=args.collector_url)
        if not collector.health_check():
            print("[main] data_collector is not reachable — aborting.")
            return 1
        print("[main] data_collector is healthy.")

    storage = ResumeStorage(db_path=args.db)
    stats = storage.stats()
    print(f"[main] Resume DB: {stats['total']} total processed, {stats['today']} today")
    crawler = Crawler(session=session, collector=collector, storage=storage)

    # --- run ------------------------------------------------------------------
    failed = 0
    for idx, target in enumerate(targets, 1):
        print(f"\n[main] Target {idx}/{total}")
        try:
            crawler.crawl_target(target, max_pages=args.max_pages)
        except KeyboardInterrupt:
            print("\n[main] Interrupted by user.")
            return 130
        except Exception as exc:
            # Continue with next target — we don't want one broken city to
            # kill a long-running multi-city job. The failure is still
            # recorded so it can be reflected in the exit status, and the
            # traceback goes to stderr because str(exc) alone often omits
            # the exception type and origin.
            failed += 1
            print(f"[main] Unhandled exception on target {target}: {exc}")
            traceback.print_exc()

    print("\n[main] All targets finished.")
    if failed:
        # Report partial failure so schedulers / CI do not see a clean run.
        print(f"[main] {failed} of {total} target(s) failed.")
        return 1
    return 0
if __name__ == "__main__":
    # Script entry: hand main()'s return value to the interpreter as the
    # process exit status.
    exit_status = main()
    sys.exit(exit_status)