# vmk-360-domria_parser / src / session.py
"""curl_cffi session manager with cookie warmup and TLS impersonation."""
import time
from typing import Optional

from curl_cffi import requests

from src.config import (
    BASE_URL,
    DEFAULT_HEADERS,
    HOMEPAGE_URL,
    IMPERSONATE,
)


class DomRiaSession:
    """Lightweight wrapper around curl_cffi.requests.Session.

    Establishes cookies by hitting the homepage first, then reuses the
    session (cookie jar + connection pool) for all subsequent requests.

    The wrapper can also be used as a context manager so that the
    underlying session is closed deterministically::

        with DomRiaSession() as dom:
            dom.warmup()
            resp = dom.get_catalog(url)
    """

    def __init__(self, extra_headers: Optional[dict] = None, timeout: int = 30):
        """Create the underlying session and build the base header set.

        Args:
            extra_headers: Optional headers layered on top of
                DEFAULT_HEADERS; on a key collision the extra value wins.
            timeout: Timeout forwarded to every request issued through
                this wrapper.
        """
        self.timeout = timeout
        self.session = requests.Session()
        self.headers = {**DEFAULT_HEADERS, **(extra_headers or {})}

    def __enter__(self) -> "DomRiaSession":
        """Return the wrapper itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Close the session on leaving the ``with`` block.

        Returns None, so an exception raised inside the block propagates.
        """
        self.close()

    def close(self) -> None:
        """Close the underlying curl_cffi session."""
        self.session.close()

    def _get(self, url: str, referer: str) -> requests.Response:
        """Issue a GET through the shared session with the given Referer.

        Single place where the base headers, TLS impersonation target and
        timeout are applied, so all public fetchers stay consistent.
        """
        headers = {**self.headers, "Referer": referer}
        return self.session.get(
            url,
            headers=headers,
            impersonate=IMPERSONATE,
            timeout=self.timeout,
        )

    def warmup(self, *, delay: float = 2.0) -> None:
        """Hit the homepage so DOM.RIA sets session cookies.

        Without this step catalog requests often return 404/403.

        Args:
            delay: Pause after the homepage request, passed to
                ``time.sleep``. Zero or a negative value skips the pause.
        """
        print("[session] Warming up cookies via homepage …")
        # Unlike the catalog/detail fetchers, no Referer is sent here.
        resp = self.session.get(
            HOMEPAGE_URL,
            headers=self.headers,
            impersonate=IMPERSONATE,
            timeout=self.timeout,
        )
        # We only care about side-effects (Set-Cookie), not the body
        print(f"[session] Homepage status={resp.status_code} cookies={len(self.session.cookies)} items")
        # time.sleep() raises ValueError on negative input, so guard it.
        if delay > 0:
            time.sleep(delay)

    def get_catalog(self, url: str) -> requests.Response:
        """GET a catalog listing page with correct Referer.

        Args:
            url: Absolute URL of the catalog page.

        Returns:
            The response object produced by the shared session.
        """
        return self._get(url, HOMEPAGE_URL)

    def get_detail(self, beautiful_url: str) -> requests.Response:
        """GET a detail page (beautiful_url is relative, e.g. 'realtor/…').

        Args:
            beautiful_url: Path relative to BASE_URL; leading slashes are
                stripped before joining.

        Returns:
            The response object produced by the shared session.
        """
        # Strip slashes on both sides of the join so a trailing slash on
        # BASE_URL cannot produce a double slash in the final URL.
        url = f"{BASE_URL.rstrip('/')}/{beautiful_url.lstrip('/')}"
        return self._get(url, HOMEPAGE_URL)