diff --git a/src/collector.py b/src/collector.py index 31645e0..96b6efc 100644 --- a/src/collector.py +++ b/src/collector.py @@ -107,7 +107,7 @@ url, data={"metadata": metadata}, multipart=mime, - timeout=120, + timeout=300, ) if resp.status_code == 202: data = resp.json() diff --git a/src/crawler.py b/src/crawler.py index fbc4cc4..302dde9 100644 --- a/src/crawler.py +++ b/src/crawler.py @@ -134,8 +134,14 @@ image_paths: List[str] = [] temp_dir = "" - if self.collector is not None and image_urls: - image_paths, temp_dir = self._download_images(image_urls) + # cap images to avoid inline pipeline timeouts (AI analysis per photo) + MAX_IMAGES_PER_LISTING = 15 + capped_urls = image_urls[:MAX_IMAGES_PER_LISTING] if len(image_urls) > MAX_IMAGES_PER_LISTING else image_urls + if len(image_urls) > MAX_IMAGES_PER_LISTING: + print(f"[crawler] {realty_id}: capping images {len(image_urls)} -> {MAX_IMAGES_PER_LISTING}") + + if self.collector is not None and capped_urls: + image_paths, temp_dir = self._download_images(capped_urls) if self.collector is not None: if image_paths: