"""CLI entry point for the DOM.RIA parser."""
import argparse
import sys
from typing import List, Optional
from src.config import build_targets, CATEGORY_MAP, OPERATION_SLUGS
from src.collector import DataCollectorClient
from src.crawler import Crawler
from src.discovery import discover_all_city_slugs
from src.session import DomRiaSession
from src.storage import ResumeStorage
def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse command-line options for the DOM.RIA scraper.

    Args:
        argv: Argument strings to parse instead of the process command line.
            Leave as ``None`` (the default) to let argparse read
            ``sys.argv[1:]``; pass an explicit list to drive the parser from
            tests or from other code.

    Returns:
        Namespace with the attributes ``city``, ``category`` and
        ``operation`` (each a list, or ``None`` when the flag was not given),
        ``max_pages``, ``db``, ``collector_url``, ``dry_run`` and
        ``discover_cities``.
    """
    parser = argparse.ArgumentParser(description="DOM.RIA → data_collector scraper")

    # Repeatable filters: each occurrence appends to a list; absent flags
    # stay None.
    parser.add_argument(
        "--city",
        action="append",
        help="City slug to scrape (can be given multiple times). Default: auto-discover all from DOM.RIA.",
    )
    parser.add_argument(
        "--category",
        type=int,
        action="append",
        choices=list(CATEGORY_MAP),
        help="Category ID (1=apartments, 2=houses, 3=land, 4=commercial, 5=garages). Default: all.",
    )
    parser.add_argument(
        "--operation",
        action="append",
        choices=list(OPERATION_SLUGS),
        help="Operation (sale, rent, rent_daily). Default: all.",
    )

    # Run configuration.
    parser.add_argument(
        "--max-pages",
        type=int,
        default=None,
        help="Limit catalog pages per target (for testing).",
    )
    parser.add_argument(
        "--db",
        default="domria_resume.db",
        help="Path to SQLite resume DB.",
    )
    parser.add_argument(
        "--collector-url",
        default="http://localhost:8020",
        help="data_collector base URL.",
    )

    # Mode switches.
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Skip ingestion to data_collector (parse only).",
    )
    parser.add_argument(
        "--discover-cities",
        action="store_true",
        help="Only discover and print city slugs from DOM.RIA, then exit.",
    )

    # argv=None makes argparse fall back to sys.argv[1:], exactly as before.
    return parser.parse_args(argv)
def main() -> int:
    """Run the scraper CLI and return a process exit status.

    Resolves the city list (explicit ``--city`` values or auto-discovery),
    builds the crawl targets, initializes the session, the optional
    data_collector client and the resume storage, then crawls every target
    in order.

    Returns:
        0 when only city discovery was requested, when there is nothing to
        do, or when every target was crawled without raising;
        1 when data_collector fails its health check or at least one target
        raised an exception;
        130 when the user interrupts the run during crawling.
    """
    # Local import: only needed on the failure path, and keeps this function
    # self-contained with respect to the module's import block.
    import traceback

    args = parse_args()

    # --- discover cities only mode --------------------------------------------
    if args.discover_cities:
        cities = discover_all_city_slugs()
        print(f"\nDiscovered {len(cities)} cities / settlements:")
        for c in cities:
            print(f" {c}")
        return 0

    # --- resolve city slugs -----------------------------------------------------
    city_slugs: List[str]
    if args.city:
        city_slugs = args.city
        print(f"[main] Using {len(city_slugs)} explicitly provided city(s)")
    else:
        print("[main] Discovering city list from DOM.RIA …")
        city_slugs = discover_all_city_slugs()
        print(f"[main] Discovered {len(city_slugs)} cities / settlements")

    # --- build targets ----------------------------------------------------------
    category_ids: Optional[List[int]] = args.category
    operations: Optional[List[str]] = args.operation
    targets = build_targets(
        city_slugs=city_slugs,
        category_ids=category_ids,
        operations=operations,
    )
    print(f"[main] {len(targets)} target(s) to process")
    if not targets:
        print("[main] Nothing to do.")
        return 0

    # --- init components --------------------------------------------------------
    print("[main] Initializing session …")
    session = DomRiaSession()
    session.warmup()

    # In dry-run mode no collector is created; the crawler receives None.
    collector: Optional[DataCollectorClient] = None
    if not args.dry_run:
        collector = DataCollectorClient(base_url=args.collector_url)
        if not collector.health_check():
            print("[main] data_collector is not reachable — aborting.")
            return 1
        print("[main] data_collector is healthy.")

    storage = ResumeStorage(db_path=args.db)
    stats = storage.stats()
    print(f"[main] Resume DB: {stats['total']} total processed, {stats['today']} today")

    crawler = Crawler(session=session, collector=collector, storage=storage)

    # --- run --------------------------------------------------------------------
    failed = 0
    for idx, target in enumerate(targets, 1):
        print(f"\n[main] Target {idx}/{len(targets)}")
        try:
            crawler.crawl_target(target, max_pages=args.max_pages)
        except KeyboardInterrupt:
            print("\n[main] Interrupted by user.")
            return 130
        except Exception as exc:
            # Continue with next target — we don't want one broken city to
            # kill a long-running multi-city job. The failure is still
            # counted so the exit status reflects it.
            failed += 1
            print(f"[main] Unhandled exception on target {target}: {exc}")
            # str(exc) alone drops the exception type and location; emit the
            # full traceback on stderr so the failure can be diagnosed.
            traceback.print_exc()

    print("\n[main] All targets finished.")
    if failed:
        # Report failure through the exit status so schedulers and shell
        # scripts can tell a partially failed run from a clean one.
        print(f"[main] {failed} of {len(targets)} target(s) failed.")
        return 1
    return 0
if __name__ == "__main__":
    # Script entry point: hand main()'s return value to the interpreter as
    # the process exit status.
    exit_status = main()
    sys.exit(exit_status)