From e1c2d960d37de6a0fa8021408740b058b1ef9b69 Mon Sep 17 00:00:00 2001
From: Oliver Gugger
Date: Fri, 26 Jul 2024 13:37:29 +0200
Subject: [PATCH] collectors: don't shut down on GetInfo timeout

Since the lnd GetInfo call sometimes takes way longer than average, we
don't want to shut down on just a timeout.
---
 collectors/chain_collector.go    | 16 +++++++++++--
 collectors/channels_collector.go | 16 +++++++++++--
 collectors/errors.go             | 39 ++++++++++++++++++++++++++++++++
 3 files changed, 67 insertions(+), 4 deletions(-)
 create mode 100644 collectors/errors.go

diff --git a/collectors/chain_collector.go b/collectors/chain_collector.go
index 8093207..d853913 100644
--- a/collectors/chain_collector.go
+++ b/collectors/chain_collector.go
@@ -70,8 +70,20 @@ func (c *ChainCollector) Describe(ch chan<- *prometheus.Desc) {
 func (c *ChainCollector) Collect(ch chan<- prometheus.Metric) {
 	resp, err := c.lnd.GetInfo(context.Background())
 	if err != nil {
-		c.errChan <- fmt.Errorf("ChainCollector GetInfo failed with: "+
-			"%v", err)
+		errWithContext := fmt.Errorf("ChainCollector GetInfo "+
+			"failed with: %w", err)
+		Logger.Error(errWithContext)
+
+		// If this isn't just a timeout, we'll want to exit to give the
+		// runtime (Docker/k8s/systemd) a chance to restart us, in case
+		// something with the lnd connection and/or credentials has
+		// changed. We just do this check for the GetInfo call, since
+		// that's known to sometimes randomly take way longer than on
+		// average (database interactions?).
+		if !IsDeadlineExceeded(err) {
+			c.errChan <- errWithContext
+		}
+
 		return
 	}
 
diff --git a/collectors/channels_collector.go b/collectors/channels_collector.go
index 138374a..f2ad2ea 100644
--- a/collectors/channels_collector.go
+++ b/collectors/channels_collector.go
@@ -311,8 +311,20 @@ func (c *ChannelsCollector) Collect(ch chan<- prometheus.Metric) {
 	// have open.
 	getInfoResp, err := c.lnd.GetInfo(context.Background())
 	if err != nil {
-		c.errChan <- fmt.Errorf("ChannelsCollector GetInfo failed "+
-			"with: %v", err)
+		errWithContext := fmt.Errorf("ChannelsCollector GetInfo "+
+			"failed with: %w", err)
+		Logger.Error(errWithContext)
+
+		// If this isn't just a timeout, we'll want to exit to give the
+		// runtime (Docker/k8s/systemd) a chance to restart us, in case
+		// something with the lnd connection and/or credentials has
+		// changed. We just do this check for the GetInfo call, since
+		// that's known to sometimes randomly take way longer than on
+		// average (database interactions?).
+		if !IsDeadlineExceeded(err) {
+			c.errChan <- errWithContext
+		}
+
 		return
 	}
 
diff --git a/collectors/errors.go b/collectors/errors.go
new file mode 100644
index 0000000..55cde5b
--- /dev/null
+++ b/collectors/errors.go
@@ -0,0 +1,39 @@
+package collectors
+
+import (
+	"context"
+	"strings"
+
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
+)
+
+var (
+	// errRPCDeadlineExceeded is the error that is sent over the gRPC
+	// interface when it's coming from the server side. The
+	// status.FromContextError() function won't recognize it correctly
+	// since the error sent over the wire is a string and not a structured
+	// error anymore.
+	errRPCDeadlineExceeded = status.Error(
+		codes.DeadlineExceeded, context.DeadlineExceeded.Error(),
+	)
+)
+
+// IsDeadlineExceeded returns true if the passed error is a gRPC error with the
+// context.DeadlineExceeded error as the cause.
+func IsDeadlineExceeded(err error) bool {
+	if err == nil {
+		return false
+	}
+
+	st := status.FromContextError(err)
+	if st.Code() == codes.DeadlineExceeded {
+		return true
+	}
+
+	if strings.Contains(err.Error(), errRPCDeadlineExceeded.Error()) {
+		return true
+	}
+
+	return false
+}
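
Below is a minimal, hypothetical table-driven test sketch (assuming a file
such as collectors/errors_test.go, which is not part of this patch) that
illustrates the two detection paths the new helper covers: a client-side
context.DeadlineExceeded and a server-side gRPC status error that only
arrives as a string.

package collectors

import (
	"context"
	"errors"
	"testing"

	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

// TestIsDeadlineExceeded exercises both detection paths: a client-side
// context.DeadlineExceeded and a server-side status error that only carries
// the deadline-exceeded information as a string.
func TestIsDeadlineExceeded(t *testing.T) {
	testCases := []struct {
		name string
		err  error
		want bool
	}{{
		name: "nil error",
		err:  nil,
		want: false,
	}, {
		// status.FromContextError recognizes this directly.
		name: "client-side context deadline",
		err:  context.DeadlineExceeded,
		want: true,
	}, {
		// The server-side error arrives as a plain string, so only the
		// strings.Contains fallback can catch it.
		name: "server-side deadline as string",
		err: errors.New("rpc error: code = DeadlineExceeded " +
			"desc = context deadline exceeded"),
		want: true,
	}, {
		name: "unrelated gRPC error",
		err:  status.Error(codes.Unavailable, "connection refused"),
		want: false,
	}}

	for _, tc := range testCases {
		if got := IsDeadlineExceeded(tc.err); got != tc.want {
			t.Errorf("%s: got %v, want %v", tc.name, got, tc.want)
		}
	}
}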