diff --git a/src/ook/cli.py b/src/ook/cli.py
index 7e3877e..14e46d4 100644
--- a/src/ook/cli.py
+++ b/src/ook/cli.py
@@ -2,8 +2,6 @@
 
 from __future__ import annotations
 
-__all__ = ["main", "help", "upload_doc_stub"]
-
 from pathlib import Path
 from typing import Any
 
@@ -12,10 +10,14 @@
 from algoliasearch.search_client import SearchClient
 from safir.asyncio import run_with_asyncio
 
+# from ook.factory import Factory  # noqa: ERA001
 from ook.config import Configuration
 from ook.domain.algoliarecord import MinimalDocumentModel
+from ook.factory import Factory
 from ook.services.algoliadocindex import AlgoliaDocIndexService
 
+__all__ = ["main", "help", "upload_doc_stub"]
+
 # Add -h as a help shortcut option
 CONTEXT_SETTINGS = {"help_option_names": ["-h", "--help"]}
 
@@ -52,7 +54,7 @@ def help(ctx: click.Context, topic: None | str, **kw: Any) -> None:
     "--dataset",
     required=True,
     type=click.Path(exists=True, path_type=Path),
-    description="Path to the JSON-formatted document stub dataset to upload.",
+    help="Path to the JSON-formatted document stub dataset to upload.",
 )
 @run_with_asyncio
 async def upload_doc_stub(dataset: Path) -> None:
@@ -86,3 +88,25 @@ async def upload_doc_stub(dataset: Path) -> None:
     index = client.init_index(config.algolia_document_index_name)
     algolia_doc_service = AlgoliaDocIndexService(index, logger)
     await algolia_doc_service.save_doc_stub(stub_record)
+
+
+@main.command()
+@run_with_asyncio
+async def audit() -> None:
+    """Audit the Algolia document index and check if any documents are missing
+    based on the listing of projects registered in the LTD Keeper service.
+    """
+    config = Configuration()
+    logger = structlog.get_logger("ook")
+    if any(
+        _ is None
+        for _ in (
+            config.algolia_document_index_name,
+            config.algolia_app_id,
+            config.algolia_api_key,
+        )
+    ):
+        raise click.UsageError("Algolia credentials not set in environment.")
+    async with Factory.create_standalone(logger=logger) as factory:
+        algolia_audit_service = factory.create_algolia_audit_service()
+        await algolia_audit_service.audit_missing_documents()
diff --git a/src/ook/factory.py b/src/ook/factory.py
index fe0fad0..eaa506e 100644
--- a/src/ook/factory.py
+++ b/src/ook/factory.py
@@ -24,6 +24,7 @@
 from .config import config
 from .dependencies.algoliasearch import algolia_client_dependency
 from .domain.kafka import LtdUrlIngestV1, UrlIngestKeyV1
+from .services.algoliaaudit import AlgoliaAuditService
 from .services.algoliadocindex import AlgoliaDocIndexService
 from .services.classification import ClassificationService
 from .services.githubmetadata import GitHubMetadataService
@@ -227,3 +228,11 @@ def create_sphinx_technote_ingest_service(
             github_service=self.create_github_metadata_service(),
             logger=self._logger,
         )
+
+    def create_algolia_audit_service(self) -> AlgoliaAuditService:
+        """Create an AlgoliaAuditService."""
+        return AlgoliaAuditService(
+            http_client=self.http_client,
+            algolia_search_client=self._process_context.algolia_client,
+            logger=self._logger,
+        )
diff --git a/src/ook/services/algoliaaudit.py b/src/ook/services/algoliaaudit.py
new file mode 100644
index 0000000..bbb927b
--- /dev/null
+++ b/src/ook/services/algoliaaudit.py
@@ -0,0 +1,141 @@
+"""Service for auditing the Algolia indices for completeness."""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+
+from algoliasearch.search_client import SearchClient
+from httpx import AsyncClient
+from structlog.stdlib import BoundLogger
+
+from ..config import config
+
+# Python regular expression pattern that matches an LTD document slug such as
+# "sqr-000".
+LTD_SLUG_PATTERN = re.compile(r"^[a-z]+-[0-9]+$")
+
+
+@dataclass
+class LtdDocument:
+    """Information about a document registered in LTD."""
+
+    published_url: str
+    """The base URL where the document is published."""
+
+    slug: str
+    """The LTD slug for the document."""
+
+    @property
+    def handle(self) -> str:
+        """The handle for the document in Algolia."""
+        return self.slug.upper()
+
+    def __lt__(self, other: LtdDocument) -> bool:
+        """Sort documents by their handle."""
+        return self.handle < other.handle
+
+    def __le__(self, other: LtdDocument) -> bool:
+        """Sort documents by their handle."""
+        return self.handle <= other.handle
+
+    def __gt__(self, other: LtdDocument) -> bool:
+        """Sort documents by their handle."""
+        return self.handle > other.handle
+
+    def __ge__(self, other: LtdDocument) -> bool:
+        """Sort documents by their handle."""
+        return self.handle >= other.handle
+
+
+class AlgoliaAuditService:
+    """A service for auditing the Algolia indices for completeness."""
+
+    def __init__(
+        self,
+        *,
+        http_client: AsyncClient,
+        logger: BoundLogger,
+        algolia_search_client: SearchClient,
+    ) -> None:
+        """Initialize the service."""
+        self._http_client = http_client
+        self._search_client = algolia_search_client
+        self._logger = logger
+
+    async def audit_missing_documents(self) -> list[LtdDocument]:
+        """Audit the Algolia indices and identify missing documents.
+
+        A document is considered "missing" if it is registered in the LTD API,
+        but its handle is not in the Algolia index.
+
+        This audit only tests documents, not documentation projects (user
+        guides).
+
+        Returns
+        -------
+        list
+            A list of missing documents.
+        """
+        expected_ltd_docs = await self._get_ltd_documents()
+        expected_ltd_docs.sort()
+
+        missing_docs: list[LtdDocument] = []
+
+        doc_index = self._search_client.init_index(
+            config.algolia_document_index_name
+        )
+        for expected_ltd_doc in expected_ltd_docs:
+            result = doc_index.search(
+                expected_ltd_doc.slug.upper(),
+                {"restrictSearchableAttributes": "handle"},
+            )
+            if result["nbHits"] == 0:
+                self._logger.warning(
+                    "Document not found in Algolia index",
+                    handle=expected_ltd_doc.slug.upper(),
+                    published_url=expected_ltd_doc.published_url,
+                )
+                missing_docs.append(expected_ltd_doc)
+
+        self._logger.info(
+            "Audit complete.",
+            found=len(expected_ltd_docs) - len(missing_docs),
+            missing=len(missing_docs),
+        )
+
+        missing_docs.sort()
+        return missing_docs
+
+    async def _get_ltd_documents(self) -> list[LtdDocument]:
+        """Get a list of documents registered in LTD."""
+        r = await self._http_client.get("https://keeper.lsst.codes/products/")
+        products = r.json()
+
+        documents: list[LtdDocument] = []
+
+        for product_api_url in products["products"]:
+            slug = product_api_url.split("/")[-1]
+            if LTD_SLUG_PATTERN.match(slug) is None:
+                continue
+            series = slug.upper().split("-")[0]
+            if series in ("TEST", "TESTN", "TESTDOC", "TESTR"):
+                # Skip known test document series before requesting their
+                # metadata.
+                continue
+
+            r = await self._http_client.get(product_api_url)
+            product = r.json()
+
+            if product["doc_repo"].startswith(
+                "https://github.com/lsst-sqre-testing/"
+            ):
+                # Skip known test documents.
+                continue
+
+            document = LtdDocument(
+                published_url=product["published_url"],
+                slug=product["slug"],
+            )
+            documents.append(document)
+        return documents
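
Illustrative usage sketch (not part of the diff): the snippet below wires up AlgoliaAuditService by hand, roughly mirroring what the new audit command does through Factory.create_standalone() and create_algolia_audit_service(). The Algolia app ID and API key values are placeholders, and the service still reads the document index name from ook's configuration, so the usual ook environment settings are assumed to be in place.

import asyncio

import structlog
from algoliasearch.search_client import SearchClient
from httpx import AsyncClient

from ook.services.algoliaaudit import AlgoliaAuditService


async def run_audit() -> None:
    """Run the audit outside the CLI (assumes ook configuration is set)."""
    logger = structlog.get_logger("ook")
    # Placeholder credentials; the CLI path reads these from Configuration.
    search_client = SearchClient.create("<algolia-app-id>", "<algolia-api-key>")
    async with AsyncClient() as http_client:
        service = AlgoliaAuditService(
            http_client=http_client,
            algolia_search_client=search_client,
            logger=logger,
        )
        missing = await service.audit_missing_documents()
        for doc in missing:
            print(f"Missing from index: {doc.handle} ({doc.published_url})")


if __name__ == "__main__":
    asyncio.run(run_audit())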