From f5cf5251026f64800db270e1d233b9127e2ce3ff Mon Sep 17 00:00:00 2001
From: Fabian Meyer <3982806+meyfa@users.noreply.github.com>
Date: Sun, 14 Jul 2024 17:41:08 +0200
Subject: [PATCH] fix: Serialize access to pod logs

Pod logs are a major performance bottleneck, both due to enormous
data transfers from the Kubernetes API and due to formatting being
a expensive. Before this patch, if multiple clients requested the same
log simultaneously, these expensive operations were done multiple
times in parallel. With this patch, mutexes are added around log
retrieval and log formatting such that only one such operation can
execute at any time.

In the multi-client scenario, the client arriving first will lock the mutex
and retrieve the logs. When this client unlocks the mutex and another
client is allowed through, the logs will already be cached and the
request returns immediately.

Performance tests using the `ab` CLI tool confirm this: One of the
requests still takes the same time as before, but all other requests are
much faster.

Memory requirements are reduced from O(client count) to O(1).

In case multiple clients request different logs each, in the worst case,
the response time may be slightly worse, but probably not by much
on average:
Requesting logs in parallel from Kubernetes is likely not faster than
requesting in series due to storage or network bottlenecks, and since
Foreman is single-threaded, formatting in parallel is also not faster.
---
 backend/src/api/pod-logs.ts     | 23 ++++++++++++++------
 backend/src/controllers/logs.ts | 38 +++++++++++++++++++++------------
 2 files changed, 40 insertions(+), 21 deletions(-)

diff --git a/backend/src/api/pod-logs.ts b/backend/src/api/pod-logs.ts
index 852d05e..27f4ee4 100644
--- a/backend/src/api/pod-logs.ts
+++ b/backend/src/api/pod-logs.ts
@@ -5,6 +5,7 @@ import { authenticateSession } from '../auth/common.js'
 import { prettifyLogs } from '../renovate/prettify-logs.js'
 import { enums, optional, type } from 'superstruct'
 import { WeakCache } from '../util/cache.js'
+import { Mutex } from '../util/mutex.js'
 
 export interface PodLogsRoute {
   Reply: string
@@ -17,8 +18,10 @@ const logsQuerystringSchema = type({
 export const podLogsRoute = ({ logsController }: Controllers): FastifyPluginAsync => async (app) => {
   app.addHook('preValidation', authenticateSession())
 
-  // Prettifying is an expensive operation. Cache the results.
+  // Prettifying is an expensive operation. Cache the results, and only allow one prettification at a time.
+  // The latter helps if multiple clients request the same logs at the same time since no
   const prettyLogsCache = new WeakCache<{ data: string }>(5000)
+  const prettyLogsMutex = new Mutex()
 
   app.get<PodLogsRoute & {
     Params: {
@@ -38,12 +41,18 @@ export const podLogsRoute = ({ logsController }: Controllers): FastifyPluginAsyn
       return await notFound(reply)
     }
     if (query.pretty === 'true') {
-      const result = await prettyLogsCache.lazyCompute([request.params.namespace, request.params.name], async () => {
-        return {
-          data: await prettifyLogs(logs)
-        }
-      })
-      return result?.data
+      let unlock
+      try {
+        unlock = await prettyLogsMutex.lock()
+        const result = await prettyLogsCache.lazyCompute([request.params.namespace, request.params.name], async () => {
+          return {
+            data: await prettifyLogs(logs)
+          }
+        })
+        return result?.data
+      } finally {
+        unlock?.()
+      }
     }
     return logs
   })
diff --git a/backend/src/controllers/logs.ts b/backend/src/controllers/logs.ts
index 6662e3e..20cc922 100644
--- a/backend/src/controllers/logs.ts
+++ b/backend/src/controllers/logs.ts
@@ -1,12 +1,16 @@
 import { KubernetesApi } from '../kubernetes/api.js'
 import { PodController } from './pod.js'
 import { WeakCache } from '../util/cache.js'
+import { Mutex } from '../util/mutex.js'
 
 export class LogsController {
   // The log strings may be large, so we use a weak cache to allow them to be garbage collected.
   // However, it's impossible to create a WeakRef to a string, so we wrap the string in an object.
   private readonly cache = new WeakCache<{ data: string }>(5000)
 
+  // Only allow fetching one log at a time.
+  private readonly mutex = new Mutex()
+
   constructor (
     private readonly k8s: KubernetesApi,
     private readonly podController: PodController
@@ -17,20 +21,26 @@ export class LogsController {
     namespace: string
     name: string
   }): Promise<string | undefined> {
-    // Input must relate to an existing pod
-    const podItem = await this.podController.findPod(pod)
-    if (podItem == null) {
-      return undefined
-    }
-    // Requesting logs for a pod that is starting is invalid and will result in a 400 error.
-    if (podItem.status?.phase === 'Pending' || podItem.status?.phase === 'Unknown') {
-      return undefined
-    }
-    const result = await this.cache.lazyCompute([pod.namespace, pod.name], async () => {
-      return {
-        data: await this.k8s.getPodLogs(pod)
+    let unlock
+    try {
+      unlock = await this.mutex.lock()
+      // Input must relate to an existing pod
+      const podItem = await this.podController.findPod(pod)
+      if (podItem == null) {
+        return undefined
       }
-    })
-    return result?.data
+      // Requesting logs for a pod that is starting is invalid and will result in a 400 error.
+      if (podItem.status?.phase === 'Pending' || podItem.status?.phase === 'Unknown') {
+        return undefined
+      }
+      const result = await this.cache.lazyCompute([pod.namespace, pod.name], async () => {
+        return {
+          data: await this.k8s.getPodLogs(pod)
+        }
+      })
+      return result?.data
+    } finally {
+      unlock?.()
+    }
   }
 }