diff --git a/src/ook/handlers/external/paths.py b/src/ook/handlers/external/paths.py
index 7d3cb84..9a23312 100644
--- a/src/ook/handlers/external/paths.py
+++ b/src/ook/handlers/external/paths.py
@@ -72,4 +72,11 @@ async def post_ingest_ltd(
                     edition_slug=ingest_request.edition_slug,
                 )
             )
+        if ingest_request.product_slug_pattern is not None:
+            task_group.create_task(
+                classifier.queue_ingest_for_ltd_product_slug_pattern(
+                    product_slug_pattern=ingest_request.product_slug_pattern,
+                    edition_slug=ingest_request.edition_slug,
+                )
+            )
     return Response(status_code=202)
diff --git a/src/ook/services/classification.py b/src/ook/services/classification.py
index 7dd4ad3..473d5b1 100644
--- a/src/ook/services/classification.py
+++ b/src/ook/services/classification.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import asyncio
 import re
 from datetime import UTC, datetime
 
@@ -103,6 +104,38 @@ async def queue_ingest_for_ltd_product_slug(
             value=kafka_value,
         )
 
+    async def queue_ingest_for_ltd_product_slug_pattern(
+        self, *, product_slug_pattern: str, edition_slug: str
+    ) -> None:
+        """Queue ingests for LSST the Docs projects matching a slug pattern.
+
+        Parameters
+        ----------
+        product_slug_pattern
+            Regular expression tested with ``re.Pattern.match``: it is
+            anchored at the start of the slug but not at the end.
+        edition_slug
+            Edition slug forwarded to each queued ingest.
+
+        Raises
+        ------
+        re.error
+            If ``product_slug_pattern`` is not a valid regex.
+        """
+        pattern = re.compile(product_slug_pattern)
+        project_urls = await self._ltd_service.get_project_urls()
+        async with asyncio.TaskGroup() as task_group:
+            for project_url in project_urls:
+                # The slug is the last component of the API URL
+                project_slug = project_url.split("/")[-1]
+                if pattern.match(project_slug) is not None:
+                    task_group.create_task(
+                        self.queue_ingest_for_ltd_product_slug(
+                            product_slug=project_slug,
+                            edition_slug=edition_slug,
+                        )
+                    )
+
     async def classify_ltd_site(
         self, *, product_slug: str, published_url: str
     ) -> DocumentSourceType:
diff --git a/src/ook/services/ltdmetadataservice.py b/src/ook/services/ltdmetadataservice.py
index bfd7253..54da7ca 100644
--- a/src/ook/services/ltdmetadataservice.py
+++ b/src/ook/services/ltdmetadataservice.py
@@ -14,12 +14,27 @@ class LtdMetadataService:
     def __init__(
         self, *, logger: BoundLogger, http_client: AsyncClient
     ) -> None:
+        self._base = "https://keeper.lsst.codes"
         self._logger = logger
         self._http_client = http_client
 
     def get_product_api_url(self, product_slug: str) -> str:
         """Get the LTD API URL for a given product slug."""
-        return f"https://keeper.lsst.codes/products/{product_slug}"
+        return f"{self._base}/products/{product_slug}"
+
+    async def get_project_urls(self) -> list[str]:
+        """Get all LTD Project URLs.
+
+        Returns
+        -------
+        list of str
+            Items of the ``products`` field in the ``/products/`` response.
+            Each is a project API URL whose last path component is the slug.
+        """
+        url = f"{self._base}/products/"
+        response = await self._http_client.get(url)
+        response.raise_for_status()
+        return response.json()["products"]
 
     async def get_project(self, product_slug: str) -> dict:
         """Get the LTD project metadata for a given product slug."""
@@ -32,9 +47,7 @@ async def get_edition(
         self, product_slug: str, edition_slug: str = "main"
     ) -> dict:
         """Get the LTD edition metadata for a given product and edition."""
-        editions_url = (
-            f"https://keeper.lsst.codes/products/{product_slug}/editions/"
-        )
+        editions_url = f"{self._base}/products/{product_slug}/editions/"
         response = await self._http_client.get(editions_url)
         response.raise_for_status()
         editions = response.json()