From c0b31c3afe1eb32fccc0dceae1417dc85baf68e0 Mon Sep 17 00:00:00 2001 From: Jonathan Sick Date: Fri, 25 Aug 2023 18:02:59 -0400 Subject: [PATCH] Add AlgoliaAuditService and ook audit command This service checks the Algolia indices to ensure that expected documents are present. We intend to run this service primarily through the CLI (ook audit) from a Kubernetes cron job. --- src/ook/cli.py | 30 ++++++- src/ook/factory.py | 9 ++ src/ook/services/algoliaaudit.py | 141 +++++++++++++++++++++++++++++++ 3 files changed, 177 insertions(+), 3 deletions(-) create mode 100644 src/ook/services/algoliaaudit.py diff --git a/src/ook/cli.py b/src/ook/cli.py index 7e3877e..14e46d4 100644 --- a/src/ook/cli.py +++ b/src/ook/cli.py @@ -2,8 +2,6 @@ from __future__ import annotations -__all__ = ["main", "help", "upload_doc_stub"] - from pathlib import Path from typing import Any @@ -12,10 +10,14 @@ from algoliasearch.search_client import SearchClient from safir.asyncio import run_with_asyncio +# from ook.factory import Factory # noqa: ERA001 from ook.config import Configuration from ook.domain.algoliarecord import MinimalDocumentModel +from ook.factory import Factory from ook.services.algoliadocindex import AlgoliaDocIndexService +__all__ = ["main", "help", "upload_doc_stub"] + # Add -h as a help shortcut option CONTEXT_SETTINGS = {"help_option_names": ["-h", "--help"]} @@ -52,7 +54,7 @@ def help(ctx: click.Context, topic: None | str, **kw: Any) -> None: "--dataset", required=True, type=click.Path(exists=True, path_type=Path), - description="Path to the JSON-formatted document stub dataset to upload.", + help="Path to the JSON-formatted document stub dataset to upload.", ) @run_with_asyncio async def upload_doc_stub(dataset: Path) -> None: @@ -86,3 +88,25 @@ async def upload_doc_stub(dataset: Path) -> None: index = client.init_index(config.algolia_document_index_name) algolia_doc_service = AlgoliaDocIndexService(index, logger) await algolia_doc_service.save_doc_stub(stub_record) + + 
+@main.command()
+@run_with_asyncio
+async def audit() -> None:
+    """Audit the Algolia document index and check if any documents are missing
+    based on the listing of projects registered in the LTD Keeper service.
+    """
+    config = Configuration()
+    logger = structlog.get_logger("ook")
+    if any(
+        setting is None
+        for setting in (
+            config.algolia_document_index_name,
+            config.algolia_app_id,
+            config.algolia_api_key,
+        )
+    ):
+        raise click.UsageError("Algolia credentials not set in environment.")
+    async with Factory.create_standalone(logger=logger) as factory:
+        algolia_audit_service = factory.create_algolia_audit_service()
+        await algolia_audit_service.audit_missing_documents()
diff --git a/src/ook/factory.py b/src/ook/factory.py
index fe0fad0..eaa506e 100644
--- a/src/ook/factory.py
+++ b/src/ook/factory.py
@@ -24,6 +24,7 @@
 from .config import config
 from .dependencies.algoliasearch import algolia_client_dependency
 from .domain.kafka import LtdUrlIngestV1, UrlIngestKeyV1
+from .services.algoliaaudit import AlgoliaAuditService
 from .services.algoliadocindex import AlgoliaDocIndexService
 from .services.classification import ClassificationService
 from .services.githubmetadata import GitHubMetadataService
@@ -227,3 +228,11 @@ def create_sphinx_technote_ingest_service(
             github_service=self.create_github_metadata_service(),
             logger=self._logger,
         )
+
+    def create_algolia_audit_service(self) -> AlgoliaAuditService:
+        """Create an AlgoliaAuditService."""
+        return AlgoliaAuditService(
+            http_client=self.http_client,
+            algolia_search_client=self._process_context.algolia_client,
+            logger=self._logger,
+        )
diff --git a/src/ook/services/algoliaaudit.py b/src/ook/services/algoliaaudit.py
new file mode 100644
index 0000000..bbb927b
--- /dev/null
+++ b/src/ook/services/algoliaaudit.py
@@ -0,0 +1,141 @@
+"""Service for auditing the Algolia indices for completeness."""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+
+from algoliasearch.search_client import SearchClient
+from httpx import AsyncClient
+from structlog.stdlib import BoundLogger
+
+from ..config import config
+
+# Regular expression that matches an LTD document slug such as "sqr-000".
+LTD_SLUG_PATTERN = re.compile(r"^[a-z]+-[0-9]+$")
+
+
+@dataclass
+class LtdDocument:
+    """Information about a document registered in LTD."""
+
+    published_url: str
+    """The base URL where the document is published."""
+
+    slug: str
+    """The LTD slug for the document."""
+
+    @property
+    def handle(self) -> str:
+        """The handle for the document in Algolia (the upper-cased slug)."""
+        return self.slug.upper()
+
+    def __lt__(self, other: LtdDocument) -> bool:
+        """Sort documents by their handle."""
+        return self.handle < other.handle
+
+    def __le__(self, other: LtdDocument) -> bool:
+        """Sort documents by their handle."""
+        return self.handle <= other.handle
+
+    def __gt__(self, other: LtdDocument) -> bool:
+        """Sort documents by their handle."""
+        return self.handle > other.handle
+
+    def __ge__(self, other: LtdDocument) -> bool:
+        """Sort documents by their handle."""
+        return self.handle >= other.handle
+
+
+class AlgoliaAuditService:
+    """A service for auditing the Algolia indices for completeness."""
+
+    def __init__(
+        self,
+        *,
+        http_client: AsyncClient,
+        logger: BoundLogger,
+        algolia_search_client: SearchClient,
+    ) -> None:
+        """Initialize the service."""
+        self._http_client = http_client
+        self._search_client = algolia_search_client
+        self._logger = logger
+
+    async def audit_missing_documents(self) -> list[LtdDocument]:
+        """Audit the Algolia document index for missing documents.
+
+        A document is considered "missing" if it is registered in the LTD API,
+        but its handle is not in the Algolia index.
+
+        Only documents are audited, not documentation projects (user guides).
+
+        Returns
+        -------
+        list
+            The missing documents, sorted by handle.
+
+        Raises
+        ------
+        httpx.HTTPStatusError
+            Raised if the LTD API responds with an error status.
+        """
+        expected_ltd_docs = sorted(await self._get_ltd_documents())
+        missing_docs: list[LtdDocument] = []
+        doc_index = self._search_client.init_index(
+            config.algolia_document_index_name
+        )
+        for expected_ltd_doc in expected_ltd_docs:
+            result = doc_index.search(
+                expected_ltd_doc.handle,
+                {"restrictSearchableAttributes": ["handle"]},
+            )
+            if result["nbHits"] == 0:
+                self._logger.warning(
+                    "Document not found in Algolia index",
+                    handle=expected_ltd_doc.handle,
+                    published_url=expected_ltd_doc.published_url,
+                )
+                missing_docs.append(expected_ltd_doc)
+
+        self._logger.info(
+            "Audit complete.",
+            found=len(expected_ltd_docs) - len(missing_docs),
+            missing=len(missing_docs),
+        )
+
+        missing_docs.sort()
+        return missing_docs
+
+    async def _get_ltd_documents(self) -> list[LtdDocument]:
+        """Get the documents registered in LTD, skipping test documents."""
+        r = await self._http_client.get("https://keeper.lsst.codes/products/")
+        r.raise_for_status()  # fail fast rather than parse an error body
+        products = r.json()
+
+        documents: list[LtdDocument] = []
+        for product_api_url in products["products"]:
+            slug = product_api_url.split("/")[-1]
+            if LTD_SLUG_PATTERN.match(slug) is None:
+                continue
+            series = slug.upper().split("-")[0]
+            if series in ("TEST", "TESTN", "TESTDOC", "TESTR"):
+                # Skip known test series before requesting their metadata.
+                continue
+
+            r = await self._http_client.get(product_api_url)
+            r.raise_for_status()
+            product = r.json()
+
+            if product["doc_repo"].startswith(
+                "https://github.com/lsst-sqre-testing/"
+            ):
+                # Skip known test documents.
+                continue
+
+            document = LtdDocument(
+                published_url=product["published_url"],
+                slug=product["slug"],
+            )
+            documents.append(document)
+        return documents