From e1c2d960d37de6a0fa8021408740b058b1ef9b69 Mon Sep 17 00:00:00 2001
From: Oliver Gugger
Date: Fri, 26 Jul 2024 13:37:29 +0200
Subject: [PATCH] collectors: don't shut down on GetInfo timeout

Since the lnd GetInfo call sometimes takes way longer than average, we
don't want to shut down on just a timeout.
---
 collectors/chain_collector.go    | 16 +++++++++++--
 collectors/channels_collector.go | 16 +++++++++++--
 collectors/errors.go             | 39 ++++++++++++++++++++++++++++++++
 3 files changed, 67 insertions(+), 4 deletions(-)
 create mode 100644 collectors/errors.go

diff --git a/collectors/chain_collector.go b/collectors/chain_collector.go
index 8093207..d853913 100644
--- a/collectors/chain_collector.go
+++ b/collectors/chain_collector.go
@@ -70,8 +70,20 @@ func (c *ChainCollector) Describe(ch chan<- *prometheus.Desc) {
 func (c *ChainCollector) Collect(ch chan<- prometheus.Metric) {
 	resp, err := c.lnd.GetInfo(context.Background())
 	if err != nil {
-		c.errChan <- fmt.Errorf("ChainCollector GetInfo failed with: "+
-			"%v", err)
+		errWithContext := fmt.Errorf("ChainCollector GetInfo "+
+			"failed with: %w", err)
+		Logger.Error(errWithContext)
+
+		// If this isn't just a timeout, we'll want to exit to give the
+		// runtime (Docker/k8s/systemd) a chance to restart us, in case
+		// something with the lnd connection and/or credentials has
+		// changed. We just do this check for the GetInfo call, since
+		// that's known to sometimes randomly take way longer than on
+		// average (database interactions?).
+		if !IsDeadlineExceeded(err) {
+			c.errChan <- errWithContext
+		}
+
 		return
 	}
 
diff --git a/collectors/channels_collector.go b/collectors/channels_collector.go
index 138374a..f2ad2ea 100644
--- a/collectors/channels_collector.go
+++ b/collectors/channels_collector.go
@@ -311,8 +311,20 @@ func (c *ChannelsCollector) Collect(ch chan<- prometheus.Metric) {
 	// have open.
 	getInfoResp, err := c.lnd.GetInfo(context.Background())
 	if err != nil {
-		c.errChan <- fmt.Errorf("ChannelsCollector GetInfo failed "+
-			"with: %v", err)
+		errWithContext := fmt.Errorf("ChannelsCollector GetInfo "+
+			"failed with: %w", err)
+		Logger.Error(errWithContext)
+
+		// If this isn't just a timeout, we'll want to exit to give the
+		// runtime (Docker/k8s/systemd) a chance to restart us, in case
+		// something with the lnd connection and/or credentials has
+		// changed. We just do this check for the GetInfo call, since
+		// that's known to sometimes randomly take way longer than on
+		// average (database interactions?).
+		if !IsDeadlineExceeded(err) {
+			c.errChan <- errWithContext
+		}
+
 		return
 	}
 
diff --git a/collectors/errors.go b/collectors/errors.go
new file mode 100644
index 0000000..55cde5b
--- /dev/null
+++ b/collectors/errors.go
@@ -0,0 +1,39 @@
+package collectors
+
+import (
+	"context"
+	"strings"
+
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
+)
+
+var (
+	// errRPCDeadlineExceeded is the error that is sent over the gRPC
+	// interface when it's coming from the server side. The
+	// status.FromContextError() function won't recognize it correctly
+	// since the error sent over the wire is a string and not a structured
+	// error anymore.
+	errRPCDeadlineExceeded = status.Error(
+		codes.DeadlineExceeded, context.DeadlineExceeded.Error(),
+	)
+)
+
+// IsDeadlineExceeded returns true if the passed error is a gRPC error with the
+// context.DeadlineExceeded error as the cause.
+func IsDeadlineExceeded(err error) bool {
+	if err == nil {
+		return false
+	}
+
+	st := status.FromContextError(err)
+	if st.Code() == codes.DeadlineExceeded {
+		return true
+	}
+
+	if strings.Contains(err.Error(), errRPCDeadlineExceeded.Error()) {
+		return true
+	}
+
+	return false
+}
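
Below is a minimal, hypothetical table-driven test sketch (assuming a file
such as collectors/errors_test.go, which is not part of this patch) that
illustrates the two detection paths the new helper covers: a client-side
context.DeadlineExceeded and a server-side gRPC status error that only
arrives as a string.

package collectors

import (
	"context"
	"errors"
	"testing"

	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

// TestIsDeadlineExceeded exercises both detection paths: a client-side
// context.DeadlineExceeded and a server-side status error that only carries
// the deadline-exceeded information as a string.
func TestIsDeadlineExceeded(t *testing.T) {
	testCases := []struct {
		name string
		err  error
		want bool
	}{{
		name: "nil error",
		err:  nil,
		want: false,
	}, {
		// status.FromContextError recognizes this directly.
		name: "client-side context deadline",
		err:  context.DeadlineExceeded,
		want: true,
	}, {
		// The server-side error arrives as a plain string, so only the
		// strings.Contains fallback can catch it.
		name: "server-side deadline as string",
		err: errors.New("rpc error: code = DeadlineExceeded " +
			"desc = context deadline exceeded"),
		want: true,
	}, {
		name: "unrelated gRPC error",
		err:  status.Error(codes.Unavailable, "connection refused"),
		want: false,
	}}

	for _, tc := range testCases {
		if got := IsDeadlineExceeded(tc.err); got != tc.want {
			t.Errorf("%s: got %v, want %v", tc.name, got, tc.want)
		}
	}
}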