From 614a83f41c72c9a5198c0b258bfbe57b34dced4f Mon Sep 17 00:00:00 2001 From: Russ Allbery Date: Tue, 28 Jun 2022 11:21:00 -0700 Subject: [PATCH 1/4] Use per-thread Google storage clients We're seeing problems where crawlspace locks up and stops answering any requests after a small amount of traffic. The backtrace is a timeout in the handler. This may be due to reusing Google Cloud Storage clients across threads, resulting in cross-thread confusion about responses to requests. Try using per-thread instances via context variables to see if that helps. --- src/crawlspace/dependencies/gcs.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/crawlspace/dependencies/gcs.py b/src/crawlspace/dependencies/gcs.py index 721c43c..b873ba4 100644 --- a/src/crawlspace/dependencies/gcs.py +++ b/src/crawlspace/dependencies/gcs.py @@ -2,12 +2,14 @@ from __future__ import annotations -from typing import Optional +from contextvars import ContextVar from google.cloud import storage from ..config import config +_GCS_CLIENT: ContextVar[storage.Client] = ContextVar("_GCS_CLIENT") + __all__ = [ "GCSClientDependency", "gcs_client_dependency", @@ -17,14 +19,13 @@ class GCSClientDependency: """Provides a `google.cloud.storage.Client` as a dependency.""" - def __init__(self) -> None: - self.gcs: Optional[storage.Client] = None - async def __call__(self) -> storage.client.Client: """Return the cached `google.cloud.storage.Client`.""" - if not self.gcs: - self.gcs = storage.Client(project=config.gcs_project) - return self.gcs + client = _GCS_CLIENT.get(None) + if not client: + client = storage.Client(project=config.gcs_project) + _GCS_CLIENT.set(client) + return client gcs_client_dependency = GCSClientDependency() From 5be198d99aa996257c07cc7f7d3bf31d085cb981 Mon Sep 17 00:00:00 2001 From: Russ Allbery Date: Tue, 28 Jun 2022 11:25:10 -0700 Subject: [PATCH 2/4] Add debug logging for requests Add debug logging to make it easier to debug problems in the future. --- src/crawlspace/handlers/external.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/crawlspace/handlers/external.py b/src/crawlspace/handlers/external.py index 6db9776..0785efb 100644 --- a/src/crawlspace/handlers/external.py +++ b/src/crawlspace/handlers/external.py @@ -40,6 +40,7 @@ def get_file( etags: List[str] = Depends(etag_validation_dependency), logger: BoundLogger = Depends(logger_dependency), ) -> Response: + logger.debug("File request", path=path) if path == "": path = "index.html" @@ -47,6 +48,7 @@ def get_file( try: crawlspace_file = file_service.get_file(path) except GCSFileNotFoundError: + logger.debug("File not found", path=path) raise HTTPException(status_code=404, detail="File not found") except Exception as e: logger.exception(f"Failed to retrieve {path}", error=str(e)) @@ -55,6 +57,7 @@ def get_file( ) if crawlspace_file.blob.etag in etags: + logger.debug("File unchanged", path=path) return Response( status_code=304, content="", @@ -62,6 +65,7 @@ def get_file( media_type=crawlspace_file.media_type, ) else: + logger.debug("Returning file", path=path) return StreamingResponse( crawlspace_file.stream(), media_type=crawlspace_file.media_type, @@ -82,6 +86,7 @@ def head_file( gcs: storage.Client = Depends(gcs_client_dependency), logger: BoundLogger = Depends(logger_dependency), ) -> Response: + logger.debug("Head request", path=path) if path == "": path = "index.html" @@ -89,6 +94,7 @@ def head_file( try: crawlspace_file = file_service.get_file(path) except GCSFileNotFoundError: + logger.debug("File not found for head request", path=path) raise HTTPException(status_code=404, detail="File not found") except Exception as e: logger.exception(f"Failed to retrieve {path}", error=str(e)) @@ -96,6 +102,7 @@ def head_file( status_code=500, detail="Failed to retrieve file from GCS" ) + logger.debug("Returning file metadata", path=path) return Response( status_code=200, content="", From 2ad80a5cecaba19b96bef0d6b2299ac102120282 Mon Sep 17 00:00:00 2001 From: Russ Allbery Date: Tue, 28 Jun 2022 11:36:13 -0700 Subject: [PATCH 3/4] Recreate GCS client on each request See if the problem is still some sort of contention between clients or problems with expiring credentials. --- src/crawlspace/dependencies/gcs.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/crawlspace/dependencies/gcs.py b/src/crawlspace/dependencies/gcs.py index b873ba4..8c18ae6 100644 --- a/src/crawlspace/dependencies/gcs.py +++ b/src/crawlspace/dependencies/gcs.py @@ -21,11 +21,7 @@ class GCSClientDependency: async def __call__(self) -> storage.client.Client: """Return the cached `google.cloud.storage.Client`.""" - client = _GCS_CLIENT.get(None) - if not client: - client = storage.Client(project=config.gcs_project) - _GCS_CLIENT.set(client) - return client + return storage.Client(project=config.gcs_project) gcs_client_dependency = GCSClientDependency() From 911817197e87de55fc461e3fac1200df6a40e6e4 Mon Sep 17 00:00:00 2001 From: Russ Allbery Date: Tue, 28 Jun 2022 11:48:19 -0700 Subject: [PATCH 4/4] Don't stream files from GCS Download the full file and return it directly, in case streaming is causing the deadlock issues. --- src/crawlspace/handlers/external.py | 8 +++++--- src/crawlspace/services/file.py | 4 ++++ tests/support/gcs.py | 3 +++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/crawlspace/handlers/external.py b/src/crawlspace/handlers/external.py index 0785efb..6222db7 100644 --- a/src/crawlspace/handlers/external.py +++ b/src/crawlspace/handlers/external.py @@ -3,7 +3,7 @@ from typing import List from fastapi import APIRouter, Depends, HTTPException, Path, Request, Response -from fastapi.responses import RedirectResponse, StreamingResponse +from fastapi.responses import RedirectResponse from google.cloud import storage from safir.dependencies.logger import logger_dependency from structlog.stdlib import BoundLogger @@ -66,8 +66,10 @@ def get_file( ) else: logger.debug("Returning file", path=path) - return StreamingResponse( - crawlspace_file.stream(), + data = crawlspace_file.download_as_bytes() + return Response( + status_code=200, + content=data, media_type=crawlspace_file.media_type, headers=crawlspace_file.headers, ) diff --git a/src/crawlspace/services/file.py b/src/crawlspace/services/file.py index a1edfc5..ba29a4b 100644 --- a/src/crawlspace/services/file.py +++ b/src/crawlspace/services/file.py @@ -52,6 +52,10 @@ def from_blob(cls, path: str, blob: storage.Blob) -> CrawlspaceFile: media_type = guessed_type if guessed_type else "text/plain" return cls(blob=blob, headers=headers, media_type=media_type) + def download_as_bytes(self) -> bytes: + """Download the content from GCS.""" + return self.blob.download_as_bytes() + def stream(self) -> Iterator[bytes]: """Stream the content from GCS.""" with self.blob.open("rb") as content: diff --git a/tests/support/gcs.py b/tests/support/gcs.py index b20533c..0d4a297 100644 --- a/tests/support/gcs.py +++ b/tests/support/gcs.py @@ -30,6 +30,9 @@ def __init__(self, name: str) -> None: self.updated = datetime.fromtimestamp(mtime, tz=timezone.utc) self.etag = str(self._path.stat().st_ino) + def download_as_bytes(self) -> bytes: + return self._path.read_bytes() + def exists(self) -> bool: return self._exists