Skip to content

Commit

Permalink
collectors: don't shut down on GetInfo timeout
Browse files Browse the repository at this point in the history
Since the lnd GetInfo call sometimes takes way longer than average, we
don't want to shut down on just a timeout.
  • Loading branch information
guggero committed Jul 26, 2024
1 parent c5cb503 commit e1c2d96
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 4 deletions.
16 changes: 14 additions & 2 deletions collectors/chain_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,20 @@ func (c *ChainCollector) Describe(ch chan<- *prometheus.Desc) {
func (c *ChainCollector) Collect(ch chan<- prometheus.Metric) {
resp, err := c.lnd.GetInfo(context.Background())
if err != nil {
c.errChan <- fmt.Errorf("ChainCollector GetInfo failed with: "+
"%v", err)
errWithContext := fmt.Errorf("ChainCollector GetInfo "+
"failed with: %w", err)
Logger.Error(errWithContext)

// If this isn't just a timeout, we'll want to exit to give the
// runtime (Docker/k8s/systemd) a chance to restart us, in case
// something with the lnd connection and/or credentials has
// changed. We just do this check for the GetInfo call, since
// that's known to sometimes randomly take way longer than on
// average (database interactions?).
if !IsDeadlineExceeded(err) {
c.errChan <- errWithContext
}

return
}

Expand Down
16 changes: 14 additions & 2 deletions collectors/channels_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -311,8 +311,20 @@ func (c *ChannelsCollector) Collect(ch chan<- prometheus.Metric) {
// have open.
getInfoResp, err := c.lnd.GetInfo(context.Background())
if err != nil {
c.errChan <- fmt.Errorf("ChannelsCollector GetInfo failed "+
"with: %v", err)
errWithContext := fmt.Errorf("ChannelsCollector GetInfo "+
"failed with: %w", err)
Logger.Error(errWithContext)

// If this isn't just a timeout, we'll want to exit to give the
// runtime (Docker/k8s/systemd) a chance to restart us, in case
// something with the lnd connection and/or credentials has
// changed. We just do this check for the GetInfo call, since
// that's known to sometimes randomly take way longer than on
// average (database interactions?).
if !IsDeadlineExceeded(err) {
c.errChan <- errWithContext
}

return
}

Expand Down
39 changes: 39 additions & 0 deletions collectors/errors.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package collectors

import (
"context"
"strings"

"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
)

// errRPCDeadlineExceeded mirrors the deadline-exceeded error as it looks
// after travelling over the gRPC interface from the server side. By then
// the error has been flattened into a string rather than a structured
// value, so status.FromContextError() no longer identifies it; this
// sentinel provides the rendered text to compare against instead.
var errRPCDeadlineExceeded = status.Error(
	codes.DeadlineExceeded, context.DeadlineExceeded.Error(),
)

// IsDeadlineExceeded reports whether the passed error represents a deadline
// (timeout) condition. A nil error is never a timeout. Three checks are
// applied, from most to least structured:
//
//  1. the error is a context error that status.FromContextError maps to
//     codes.DeadlineExceeded,
//  2. the error is (or wraps, depending on the grpc version in use) a gRPC
//     status whose code is codes.DeadlineExceeded,
//  3. the error text contains the rendered form of errRPCDeadlineExceeded,
//     which covers errors that only survived as a string.
func IsDeadlineExceeded(err error) bool {
	if err == nil {
		return false
	}

	// Plain context errors that were never turned into a gRPC status.
	if status.FromContextError(err).Code() == codes.DeadlineExceeded {
		return true
	}

	// Structured gRPC status errors: compare the status code directly
	// instead of depending on the exact wording of the description.
	if status.Code(err) == codes.DeadlineExceeded {
		return true
	}

	// Last resort: the error was flattened into a string somewhere along
	// the way, so all that is left to match on is its text.
	return strings.Contains(err.Error(), errRPCDeadlineExceeded.Error())
}

0 comments on commit e1c2d96

Please sign in to comment.