Skip to content

Commit

Permalink
Add AlgoliaAuditService and ook audit command
Browse files Browse the repository at this point in the history
This service checks the Algolia indices to ensure that expected
documents are present. We intend to run this service primarily through
the CLI (ook audit) from a Kubernetes cron job.
  • Loading branch information
jonathansick committed Aug 30, 2023
1 parent a500987 commit c0b31c3
Show file tree
Hide file tree
Showing 3 changed files with 177 additions and 3 deletions.
30 changes: 27 additions & 3 deletions src/ook/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

from __future__ import annotations

__all__ = ["main", "help", "upload_doc_stub"]

from pathlib import Path
from typing import Any

Expand All @@ -12,10 +10,14 @@
from algoliasearch.search_client import SearchClient
from safir.asyncio import run_with_asyncio

# from ook.factory import Factory # noqa: ERA001
from ook.config import Configuration
from ook.domain.algoliarecord import MinimalDocumentModel
from ook.factory import Factory
from ook.services.algoliadocindex import AlgoliaDocIndexService

__all__ = ["main", "help", "upload_doc_stub"]

# Add -h as a help shortcut option
CONTEXT_SETTINGS = {"help_option_names": ["-h", "--help"]}

Expand Down Expand Up @@ -52,7 +54,7 @@ def help(ctx: click.Context, topic: None | str, **kw: Any) -> None:
"--dataset",
required=True,
type=click.Path(exists=True, path_type=Path),
description="Path to the JSON-formatted document stub dataset to upload.",
help="Path to the JSON-formatted document stub dataset to upload.",
)
@run_with_asyncio
async def upload_doc_stub(dataset: Path) -> None:
Expand Down Expand Up @@ -86,3 +88,25 @@ async def upload_doc_stub(dataset: Path) -> None:
index = client.init_index(config.algolia_document_index_name)
algolia_doc_service = AlgoliaDocIndexService(index, logger)
await algolia_doc_service.save_doc_stub(stub_record)


@main.command()
@run_with_asyncio
async def audit() -> None:
    """Audit the Algolia document index and check if any documents are missing
    based on the listing of projects registered in the LTD Keeper service.
    """
    config = Configuration()
    logger = structlog.get_logger("ook")
    # Fail fast with a clear CLI error when any Algolia setting is unset,
    # rather than letting the Algolia client fail deep in the call stack.
    required_settings = (
        config.algolia_document_index_name,
        config.algolia_app_id,
        config.algolia_api_key,
    )
    if any(setting is None for setting in required_settings):
        raise click.UsageError("Algolia credentials not set in environment.")
    async with Factory.create_standalone(logger=logger) as factory:
        algolia_audit_service = factory.create_algolia_audit_service()
        await algolia_audit_service.audit_missing_documents()
9 changes: 9 additions & 0 deletions src/ook/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from .config import config
from .dependencies.algoliasearch import algolia_client_dependency
from .domain.kafka import LtdUrlIngestV1, UrlIngestKeyV1
from .services.algoliaaudit import AlgoliaAuditService
from .services.algoliadocindex import AlgoliaDocIndexService
from .services.classification import ClassificationService
from .services.githubmetadata import GitHubMetadataService
Expand Down Expand Up @@ -227,3 +228,11 @@ def create_sphinx_technote_ingest_service(
github_service=self.create_github_metadata_service(),
logger=self._logger,
)

def create_algolia_audit_service(self) -> AlgoliaAuditService:
"""Create an AlgoliaAuditService."""
return AlgoliaAuditService(
http_client=self.http_client,
algolia_search_client=self._process_context.algolia_client,
logger=self._logger,
)
141 changes: 141 additions & 0 deletions src/ook/services/algoliaaudit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
"""Service for auditing the Algolia indices for completeness."""

from __future__ import annotations

import re
from dataclasses import dataclass

from algoliasearch.search_client import SearchClient
from httpx import AsyncClient
from structlog.stdlib import BoundLogger

from ..config import config

# Matches an LTD document slug such as "sqr-000": a lowercase series
# prefix, a hyphen, and a serial number.
LTD_SLUG_PATTERN = re.compile(r"^[a-z]+-[0-9]+$")


@dataclass
class LtdDocument:
    """Information about a document registered in LTD."""

    published_url: str
    """The base URL where the document is published."""

    slug: str
    """The LTD slug for the document."""

    @property
    def handle(self) -> str:
        """The handle for the document in Algolia (the upper-cased slug)."""
        return self.slug.upper()

    def __lt__(self, other: LtdDocument) -> bool:
        """Whether this document's handle sorts strictly before *other*'s."""
        return self.slug.upper() < other.slug.upper()

    def __le__(self, other: LtdDocument) -> bool:
        """Whether this document's handle sorts at or before *other*'s."""
        return self.slug.upper() <= other.slug.upper()

    def __gt__(self, other: LtdDocument) -> bool:
        """Whether this document's handle sorts strictly after *other*'s."""
        return self.slug.upper() > other.slug.upper()

    def __ge__(self, other: LtdDocument) -> bool:
        """Whether this document's handle sorts at or after *other*'s."""
        return self.slug.upper() >= other.slug.upper()


class AlgoliaAuditService:
    """A service for auditing the Algolia indices for completeness."""

    def __init__(
        self,
        *,
        http_client: AsyncClient,
        logger: BoundLogger,
        algolia_search_client: SearchClient,
    ) -> None:
        """Initialize the service.

        Parameters
        ----------
        http_client
            Shared async HTTP client used to query the LTD Keeper API.
        logger
            Logger for audit progress and warnings.
        algolia_search_client
            Configured Algolia search client.
        """
        self._http_client = http_client
        self._search_client = algolia_search_client
        self._logger = logger

    async def audit_missing_documents(self) -> list[LtdDocument]:
        """Audit the Algolia indices for completeness of missing documents.

        A document is considered "missing" if it is registered in the LTD
        API, but its handle is not in the Algolia index.

        This audit only tests documents, not documentation projects (user
        guides).

        Returns
        -------
        list
            A list of missing documents, sorted by handle.
        """
        expected_ltd_docs = await self._get_ltd_documents()
        expected_ltd_docs.sort()

        missing_docs: list[LtdDocument] = []

        doc_index = self._search_client.init_index(
            config.algolia_document_index_name
        )
        for expected_ltd_doc in expected_ltd_docs:
            # Restrict the search to the handle attribute so an unrelated
            # full-text match can't mask a missing document. Use the
            # dataclass's ``handle`` property rather than re-deriving
            # ``slug.upper()`` here.
            result = doc_index.search(
                expected_ltd_doc.handle,
                {"restrictSearchableAttributes": "handle"},
            )
            if result["nbHits"] == 0:
                self._logger.warning(
                    "Document not found in Algolia index",
                    handle=expected_ltd_doc.handle,
                    published_url=expected_ltd_doc.published_url,
                )
                missing_docs.append(expected_ltd_doc)

        self._logger.info(
            "Audit complete.",
            found=len(expected_ltd_docs) - len(missing_docs),
            missing=len(missing_docs),
        )

        missing_docs.sort()
        return missing_docs

    async def _get_ltd_documents(self) -> list[LtdDocument]:
        """Get a list of documents registered in LTD.

        Returns
        -------
        list
            Documents whose slugs match ``LTD_SLUG_PATTERN``, excluding
            known test series and test repositories.

        Raises
        ------
        httpx.HTTPStatusError
            Raised if the LTD Keeper API returns an error response.
        """
        r = await self._http_client.get("https://keeper.lsst.codes/products/")
        # Fail loudly on an error response instead of surfacing an opaque
        # JSON decoding error later.
        r.raise_for_status()
        products = r.json()

        documents: list[LtdDocument] = []

        for product_api_url in products["products"]:
            slug = product_api_url.split("/")[-1]
            if LTD_SLUG_PATTERN.match(slug) is None:
                # Not a document slug (e.g. a user guide project); skip.
                continue
            series = slug.upper().split("-")[0]
            if series in ("TEST", "TESTN", "TESTDOC", "TESTR"):
                # Skip known test document series before requesting their
                # metadata.
                continue

            r = await self._http_client.get(product_api_url)
            r.raise_for_status()
            product = r.json()

            if product["doc_repo"].startswith(
                "https://github.com/lsst-sqre-testing/"
            ):
                # Skip known test documents.
                continue

            document = LtdDocument(
                published_url=product["published_url"],
                slug=product["slug"],
            )
            documents.append(document)
        return documents

0 comments on commit c0b31c3

Please sign in to comment.