From a3263521cb234d17e15ebb30bbf76edfc867a6a9 Mon Sep 17 00:00:00 2001 From: "d.anchikov" Date: Fri, 17 Jun 2022 16:54:25 +0400 Subject: [PATCH 1/3] feat: Added retry logic to remote api request --- api/handler/triggers.go | 2 +- clock/clock.go | 5 + cmd/checker/config.go | 8 +- cmd/config.go | 40 ++++- interfaces.go | 1 + local/checker.yml | 5 +- metric_source/remote/config.go | 17 +- metric_source/remote/remote.go | 37 +++-- metric_source/remote/remote_test.go | 185 +++++++++++++++++++--- metric_source/remote/request.go | 66 ++++++-- metric_source/remote/request_test.go | 222 ++++++++++++++++++++++++++- mock/clock/clock.go | 12 ++ 12 files changed, 537 insertions(+), 63 deletions(-) diff --git a/api/handler/triggers.go b/api/handler/triggers.go index f1a5987b6..017c62155 100644 --- a/api/handler/triggers.go +++ b/api/handler/triggers.go @@ -91,7 +91,7 @@ func getTriggerFromRequest(request *http.Request) (*dto.Trigger, *api.ErrorRespo return nil, api.ErrorInvalidRequest(err) case remote.ErrRemoteTriggerResponse: response := api.ErrorRemoteServerUnavailable(err) - middleware.GetLoggerEntry(request).Error("%s : %s : %s", response.StatusText, response.ErrorText, err) + middleware.GetLoggerEntry(request).Errorf("%s : %s : %s", response.StatusText, response.ErrorText, err) return nil, response default: return nil, api.ErrorInternalServer(err) diff --git a/clock/clock.go b/clock/clock.go index b5e253805..a2f80d37b 100644 --- a/clock/clock.go +++ b/clock/clock.go @@ -14,3 +14,8 @@ func NewSystemClock() *SystemClock { func (t *SystemClock) Now() time.Time { return time.Now().UTC() } + +// Sleep pauses the current goroutine for at least the passed duration +func (t *SystemClock) Sleep(duration time.Duration) { + time.Sleep(duration) +} diff --git a/cmd/checker/config.go b/cmd/checker/config.go index 6cac4b8a7..1e5d6b558 100644 --- a/cmd/checker/config.go +++ b/cmd/checker/config.go @@ -92,9 +92,11 @@ func getDefault() config { Pprof: cmd.ProfilerConfig{Enabled: false}, }, Remote: cmd.RemoteConfig{ - CheckInterval: "60s", - Timeout: "60s", - MetricsTTL: "7d", + CheckInterval: "60s", + Timeout: "60s", + MetricsTTL: "168h", + HealthCheckTimeout: "60s", + HealthCheckRetrySeconds: "1 1", }, } } diff --git a/cmd/config.go b/cmd/config.go index 02f7df4e2..84a303ecc 100644 --- a/cmd/config.go +++ b/cmd/config.go @@ -3,7 +3,9 @@ package cmd import ( "fmt" "io/ioutil" + "strconv" "strings" + "time" "github.com/moira-alert/moira/metrics" @@ -113,6 +115,12 @@ type RemoteConfig struct { MetricsTTL string `yaml:"metrics_ttl"` // Timeout for remote requests Timeout string `yaml:"timeout"` + // Retry seconds for remote requests divided by spaces + RetrySeconds string `yaml:"retry_seconds"` + // HealthCheckTimeout is timeout for remote api health check requests + HealthCheckTimeout string `yaml:"health_check_timeout"` + // Retry seconds for remote api health check requests divided by spaces + HealthCheckRetrySeconds string `yaml:"health_check_retry_seconds"` // Username for basic auth User string `yaml:"user"` // Password for basic auth @@ -129,16 +137,34 @@ type ImageStoreConfig struct { // GetRemoteSourceSettings returns remote config parsed from moira config files func (config *RemoteConfig) GetRemoteSourceSettings() *remoteSource.Config { return &remoteSource.Config{ - URL: config.URL, - CheckInterval: to.Duration(config.CheckInterval), - MetricsTTL: to.Duration(config.MetricsTTL), - Timeout: to.Duration(config.Timeout), - User: config.User, - Password: config.Password, - Enabled: config.Enabled, + URL: config.URL, + CheckInterval: to.Duration(config.CheckInterval), + MetricsTTL: to.Duration(config.MetricsTTL), + Timeout: to.Duration(config.Timeout), + RetrySeconds: ParseRetrySeconds(config.RetrySeconds), + HealthCheckTimeout: to.Duration(config.HealthCheckTimeout), + HealthCheckRetrySeconds: ParseRetrySeconds(config.HealthCheckRetrySeconds), + User: config.User, + Password: config.Password, + Enabled: config.Enabled, } } +// ParseRetrySeconds parses config value string into array of integers +func ParseRetrySeconds(retrySecondsString string) []time.Duration { + secondsStringList := strings.Fields(retrySecondsString) + retrySecondsIntList := make([]time.Duration, len(secondsStringList)) + + for index, secondsString := range secondsStringList { + secondsInt, err := strconv.Atoi(secondsString) + if err != nil { + panic(err) + } + retrySecondsIntList[index] = time.Second * time.Duration(secondsInt) + } + return retrySecondsIntList +} + // ReadConfig parses config file by the given path into Moira-used type func ReadConfig(configFileName string, config interface{}) error { configYaml, err := ioutil.ReadFile(configFileName) diff --git a/interfaces.go b/interfaces.go index 080986b2c..88cbf5dae 100644 --- a/interfaces.go +++ b/interfaces.go @@ -224,4 +224,5 @@ type PlotTheme interface { // Clock is an interface to work with Time. type Clock interface { Now() time.Time + Sleep(duration time.Duration) } diff --git a/local/checker.yml b/local/checker.yml index c704183ad..1a09c3562 100644 --- a/local/checker.yml +++ b/local/checker.yml @@ -16,8 +16,11 @@ remote: enabled: true url: "http://graphite:80/render" check_interval: 60s + metrics_ttl: 168h timeout: 60s - metrics_ttl: 7d + retry_seconds: 1 1 1 + health_check_timeout: 6s + health_check_retry_seconds: 1 1 1 checker: nodata_check_interval: 60s check_interval: 10s diff --git a/metric_source/remote/config.go b/metric_source/remote/config.go index 63278e13d..115ef2787 100644 --- a/metric_source/remote/config.go +++ b/metric_source/remote/config.go @@ -4,13 +4,16 @@ import "time" // Config represents config from remote storage type Config struct { - URL string - CheckInterval time.Duration - MetricsTTL time.Duration - Timeout time.Duration - User string - Password string - Enabled bool + URL string + CheckInterval time.Duration + MetricsTTL time.Duration + Timeout time.Duration + RetrySeconds []time.Duration + HealthCheckTimeout time.Duration + HealthCheckRetrySeconds []time.Duration + User string + Password string + Enabled bool } // isEnabled checks that remote config is enabled (url is defined and enabled flag is set) diff --git a/metric_source/remote/remote.go b/metric_source/remote/remote.go index 6be143ecf..ea93385a5 100644 --- a/metric_source/remote/remote.go +++ b/metric_source/remote/remote.go @@ -5,6 +5,8 @@ import ( "net/http" "time" + "github.com/moira-alert/moira/clock" + "github.com/moira-alert/moira" metricSource "github.com/moira-alert/moira/metric_source" ) @@ -23,10 +25,22 @@ func (err ErrRemoteTriggerResponse) Error() string { return err.InternalError.Error() } +// ErrRemoteUnavailable is a custom error when remote trigger check fails +type ErrRemoteUnavailable struct { + InternalError error + Target string +} + +// Error is a representation of Error interface method +func (err ErrRemoteUnavailable) Error() string { + return err.InternalError.Error() +} + // Remote is implementation of MetricSource interface, which implements fetch metrics method from remote graphite installation type Remote struct { config *Config client *http.Client + clock moira.Clock } // Create configures remote metric source @@ -34,6 +48,7 @@ func Create(config *Config) metricSource.MetricSource { return &Remote{ config: config, client: &http.Client{Timeout: config.Timeout}, + clock: clock.NewSystemClock(), } } @@ -50,9 +65,15 @@ func (remote *Remote) Fetch(target string, from, until int64, allowRealTimeAlert Target: target, } } - body, err := remote.makeRequest(req) + body, isRemoteAvailable, err := remote.makeRequestWithRetries(req, remote.config.Timeout, remote.config.RetrySeconds) if err != nil { - return nil, ErrRemoteTriggerResponse{ + if isRemoteAvailable { + return nil, ErrRemoteTriggerResponse{ + InternalError: err, + Target: target, + } + } + return nil, ErrRemoteUnavailable{ InternalError: err, Target: target, } @@ -83,18 +104,14 @@ func (remote *Remote) IsConfigured() (bool, error) { // IsRemoteAvailable checks if graphite API is available and returns 200 response func (remote *Remote) IsRemoteAvailable() (bool, error) { - maxRetries := 3 until := time.Now().Unix() from := until - 600 //nolint req, err := remote.prepareRequest(from, until, "NonExistingTarget") if err != nil { return false, err } - for attempt := 0; attempt < maxRetries; attempt++ { - _, err = remote.makeRequest(req) - if err == nil { - return true, nil - } - } - return false, err + _, isRemoteAvailable, err := remote.makeRequestWithRetries( + req, remote.config.HealthCheckTimeout, remote.config.HealthCheckRetrySeconds, + ) + return isRemoteAvailable, err } diff --git a/metric_source/remote/remote_test.go b/metric_source/remote/remote_test.go index 4828ef4ac..8a540bc6c 100644 --- a/metric_source/remote/remote_test.go +++ b/metric_source/remote/remote_test.go @@ -4,6 +4,11 @@ import ( "fmt" "net/http" "testing" + "time" + + "github.com/golang/mock/gomock" + + mock_clock "github.com/moira-alert/moira/mock/clock" metricSource "github.com/moira-alert/moira/metric_source" . "github.com/smartystreets/goconvey/convey" @@ -26,20 +31,79 @@ func TestIsConfigured(t *testing.T) { } func TestIsRemoteAvailable(t *testing.T) { - Convey("Is available", t, func() { - server := createServer([]byte("Some string"), http.StatusOK) - remote := Remote{client: server.Client(), config: &Config{URL: server.URL}} - isAvailable, err := remote.IsRemoteAvailable() - So(isAvailable, ShouldBeTrue) - So(err, ShouldBeEmpty) + mockCtrl := gomock.NewController(t) + systemClock := mock_clock.NewMockClock(mockCtrl) + systemClock.EXPECT().Sleep(time.Second).Times(0) + testConfigs := []Config{ + {}, + {HealthCheckRetrySeconds: []time.Duration{time.Second}}, + {HealthCheckRetrySeconds: []time.Duration{time.Second}}, + {HealthCheckRetrySeconds: []time.Duration{time.Second, time.Second}}, + {HealthCheckRetrySeconds: []time.Duration{time.Second, time.Second}}, + {HealthCheckRetrySeconds: []time.Duration{time.Second, time.Second, time.Second}}, + } + body := []byte("Some string") + + Convey("Given server returns OK response the remote is available", t, func() { + server := createServer(body, http.StatusOK) + for _, config := range testConfigs { + config.URL = server.URL + remote := Remote{client: server.Client(), config: &config} + isAvailable, err := remote.IsRemoteAvailable() + So(isAvailable, ShouldBeTrue) + So(err, ShouldBeEmpty) + } }) - Convey("Not available", t, func() { - server := createServer([]byte("Some string"), http.StatusInternalServerError) - remote := Remote{client: server.Client(), config: &Config{URL: server.URL}} - isAvailable, err := remote.IsRemoteAvailable() - So(isAvailable, ShouldBeFalse) - So(err, ShouldResemble, fmt.Errorf("bad response status %d: %s", http.StatusInternalServerError, "Some string")) + Convey("Given server returns Remote Unavailable responses permanently", t, func() { + for _, statusCode := range remoteUnavailableStatusCodes { + server := createTestServer(TestResponse{body, statusCode}) + + Convey(fmt.Sprintf( + "request failed with %d response status code and remote is unavailable", statusCode, + ), func() { + remote := Remote{client: server.Client()} + for _, config := range testConfigs { + config.URL = server.URL + remote.config = &config + systemClock := mock_clock.NewMockClock(mockCtrl) + systemClock.EXPECT().Sleep(time.Second).Times(len(config.HealthCheckRetrySeconds)) + remote.clock = systemClock + + isAvailable, err := remote.IsRemoteAvailable() + So(err, ShouldResemble, fmt.Errorf( + "the remote server is not available. Response status %d: %s", statusCode, string(body), + )) + So(isAvailable, ShouldBeFalse) + } + }) + } + }) + + Convey("Given server returns Remote Unavailable response temporary", t, func() { + for _, statusCode := range remoteUnavailableStatusCodes { + Convey(fmt.Sprintf( + "the remote is available with retry after %d response", statusCode, + ), func() { + for _, config := range testConfigs { + if len(config.HealthCheckRetrySeconds) == 0 { + continue + } + server := createTestServer( + TestResponse{body, statusCode}, + TestResponse{body, http.StatusOK}, + ) + config.URL = server.URL + systemClock := mock_clock.NewMockClock(mockCtrl) + systemClock.EXPECT().Sleep(time.Second).Times(1) + remote := Remote{client: server.Client(), config: &config, clock: systemClock} + + isAvailable, err := remote.IsRemoteAvailable() + So(err, ShouldBeNil) + So(isAvailable, ShouldBeTrue) + } + }) + } }) } @@ -47,6 +111,16 @@ func TestFetch(t *testing.T) { var from int64 = 300 var until int64 = 500 target := "foo.bar" //nolint + testConfigs := []Config{ + {}, + {RetrySeconds: []time.Duration{time.Second}}, + {RetrySeconds: []time.Duration{time.Second}}, + {RetrySeconds: []time.Duration{time.Second, time.Second}}, + {RetrySeconds: []time.Duration{time.Second, time.Second}}, + {RetrySeconds: []time.Duration{time.Second, time.Second, time.Second}}, + } + mockCtrl := gomock.NewController(t) + validBody := []byte("[{\"Target\": \"t1\",\"DataPoints\":[[1,2],[3,4]]}]") Convey("Request success but body is invalid", t, func() { server := createServer([]byte("[]"), http.StatusOK) @@ -62,21 +136,90 @@ func TestFetch(t *testing.T) { result, err := remote.Fetch(target, from, until, false) So(result, ShouldBeEmpty) So(err.Error(), ShouldResemble, "invalid character 'S' looking for beginning of value") + _, ok := err.(ErrRemoteTriggerResponse) + So(ok, ShouldBeTrue) }) Convey("Fail request with InternalServerError", t, func() { server := createServer([]byte("Some string"), http.StatusInternalServerError) - remote := Remote{client: server.Client(), config: &Config{URL: server.URL}} - result, err := remote.Fetch(target, from, until, false) - So(result, ShouldBeEmpty) - So(err.Error(), ShouldResemble, fmt.Sprintf("bad response status %d: %s", http.StatusInternalServerError, "Some string")) + remote := Remote{client: server.Client()} + for _, config := range testConfigs { + config.URL = server.URL + remote.config = &config + result, err := remote.Fetch(target, from, until, false) + So(result, ShouldBeEmpty) + So(err.Error(), ShouldResemble, fmt.Sprintf("bad response status %d: %s", http.StatusInternalServerError, "Some string")) + _, ok := err.(ErrRemoteTriggerResponse) + So(ok, ShouldBeTrue) + } }) - Convey("Fail make request", t, func() { + Convey("Client calls bad url", t, func() { url := "💩%$&TR" - remote := Remote{config: &Config{URL: url}} - result, err := remote.Fetch(target, from, until, false) - So(result, ShouldBeEmpty) - So(err.Error(), ShouldResemble, "parse \"💩%$&TR\": invalid URL escape \"%$&\"") + for _, config := range testConfigs { + config.URL = url + remote := Remote{config: &config} + result, err := remote.Fetch(target, from, until, false) + So(result, ShouldBeEmpty) + So(err.Error(), ShouldResemble, "parse \"💩%$&TR\": invalid URL escape \"%$&\"") + _, ok := err.(ErrRemoteTriggerResponse) + So(ok, ShouldBeTrue) + } + }) + + Convey("Given server returns Remote Unavailable responses permanently", t, func() { + for _, statusCode := range remoteUnavailableStatusCodes { + server := createTestServer(TestResponse{validBody, statusCode}) + + Convey(fmt.Sprintf( + "request failed with %d response status code and remote is unavailable", statusCode, + ), func() { + remote := Remote{client: server.Client()} + for _, config := range testConfigs { + config.URL = server.URL + remote.config = &config + systemClock := mock_clock.NewMockClock(mockCtrl) + systemClock.EXPECT().Sleep(time.Second).Times(len(config.RetrySeconds)) + remote.clock = systemClock + + result, err := remote.Fetch(target, from, until, false) + So(err, ShouldResemble, ErrRemoteUnavailable{ + InternalError: fmt.Errorf( + "the remote server is not available. Response status %d: %s", statusCode, string(validBody), + ), Target: target, + }) + So(result, ShouldBeNil) + } + }) + } + }) + + Convey("Given server returns Remote Unavailable response temporary", t, func() { + for _, statusCode := range remoteUnavailableStatusCodes { + Convey(fmt.Sprintf( + "the remote is available with retry after %d response", statusCode, + ), func() { + for _, config := range testConfigs { + if len(config.RetrySeconds) == 0 { + continue + } + server := createTestServer( + TestResponse{validBody, statusCode}, + TestResponse{validBody, http.StatusOK}, + ) + config.URL = server.URL + systemClock := mock_clock.NewMockClock(mockCtrl) + systemClock.EXPECT().Sleep(time.Second).Times(1) + remote := Remote{client: server.Client(), config: &config, clock: systemClock} + + result, err := remote.Fetch(target, from, until, false) + So(err, ShouldBeNil) + So(result, ShouldNotBeNil) + metricsData := result.GetMetricsData() + So(len(metricsData), ShouldEqual, 1) + So(metricsData[0].Name, ShouldEqual, "t1") + } + }) + } }) } diff --git a/metric_source/remote/request.go b/metric_source/remote/request.go index c6eb72e56..3c22cc000 100644 --- a/metric_source/remote/request.go +++ b/metric_source/remote/request.go @@ -1,10 +1,12 @@ package remote import ( + "context" "fmt" "io/ioutil" "net/http" "strconv" + "time" ) func (remote *Remote) prepareRequest(from, until int64, target string) (*http.Request, error) { @@ -24,27 +26,73 @@ func (remote *Remote) prepareRequest(from, until int64, target string) (*http.Re return req, nil } -func (remote *Remote) makeRequest(req *http.Request) ([]byte, error) { - var body []byte - +func (remote *Remote) makeRequest(req *http.Request) (body []byte, isRemoteAvailable bool, err error) { resp, err := remote.client.Do(req) if resp != nil { defer resp.Body.Close() } if err != nil { - return body, fmt.Errorf("The remote server is not available or the response was reset by timeout. " + //nolint - "TTL: %s, PATH: %s, ERROR: %v ", remote.client.Timeout.String(), req.URL.RawPath, err) + return body, false, fmt.Errorf( + "the remote server is not available or the response was reset by timeout. Url: %s, Error: %v ", + req.URL.String(), + err, + ) } body, err = ioutil.ReadAll(resp.Body) if err != nil { - return body, err + return body, false, err + } + + if isRemoteUnavailableStatusCode(resp.StatusCode) { + return body, false, fmt.Errorf( + "the remote server is not available. Response status %d: %s", resp.StatusCode, string(body), + ) + } else if resp.StatusCode != http.StatusOK { + return body, true, fmt.Errorf("bad response status %d: %s", resp.StatusCode, string(body)) + } + + return body, true, nil +} + +func isRemoteUnavailableStatusCode(statusCode int) bool { + switch statusCode { + case http.StatusUnauthorized, + http.StatusBadGateway, + http.StatusServiceUnavailable, + http.StatusGatewayTimeout: + return true + default: + return false } +} - if resp.StatusCode != 200 { //nolint - return body, fmt.Errorf("bad response status %d: %s", resp.StatusCode, string(body)) +func (remote *Remote) makeRequestWithRetries( + req *http.Request, + requestTimeout time.Duration, + retrySeconds []time.Duration, +) (body []byte, isRemoteAvailable bool, err error) { + for attemptIndex := 0; attemptIndex < len(retrySeconds)+1; attemptIndex++ { + body, isRemoteAvailable, err = remote.makeRequestWithTimeout(req, requestTimeout) + if err == nil || isRemoteAvailable { + return body, true, err + } + if attemptIndex < len(retrySeconds) { + remote.clock.Sleep(retrySeconds[attemptIndex]) + } } + return nil, false, err +} - return body, nil +func (remote *Remote) makeRequestWithTimeout( + req *http.Request, + requestTimeout time.Duration, +) (body []byte, isRemoteAvailable bool, err error) { + if requestTimeout > 0 { + ctx, cancel := context.WithTimeout(context.Background(), requestTimeout) + defer cancel() + req = req.WithContext(ctx) + } + return remote.makeRequest(req) } diff --git a/metric_source/remote/request_test.go b/metric_source/remote/request_test.go index 340a8cef9..ecd9e057f 100644 --- a/metric_source/remote/request_test.go +++ b/metric_source/remote/request_test.go @@ -5,6 +5,10 @@ import ( "net/http" "net/http/httptest" "testing" + "time" + + "github.com/golang/mock/gomock" + mock_clock "github.com/moira-alert/moira/mock/clock" . "github.com/smartystreets/goconvey/convey" ) @@ -54,8 +58,9 @@ func TestMakeRequest(t *testing.T) { server := createServer(body, http.StatusOK) remote := Remote{client: server.Client(), config: &Config{URL: server.URL}} request, _ := remote.prepareRequest(from, until, target) - actual, err := remote.makeRequest(request) + actual, isRemoteAvailable, err := remote.makeRequest(request) So(err, ShouldBeNil) + So(isRemoteAvailable, ShouldBeTrue) So(actual, ShouldResemble, body) }) @@ -63,19 +68,189 @@ func TestMakeRequest(t *testing.T) { server := createServer(body, http.StatusInternalServerError) remote := Remote{client: server.Client(), config: &Config{URL: server.URL}} request, _ := remote.prepareRequest(from, until, target) - actual, err := remote.makeRequest(request) + actual, isRemoteAvailable, err := remote.makeRequest(request) So(err, ShouldResemble, fmt.Errorf("bad response status %d: %s", http.StatusInternalServerError, string(body))) + So(isRemoteAvailable, ShouldBeTrue) So(actual, ShouldResemble, body) }) Convey("Client calls bad url", t, func() { server := createServer(body, http.StatusOK) - remote := Remote{client: server.Client(), config: &Config{URL: "http://bad/"}} + client := server.Client() + client.Timeout = time.Millisecond + remote := Remote{client: client, config: &Config{URL: "http://bad/"}} request, _ := remote.prepareRequest(from, until, target) - actual, err := remote.makeRequest(request) + actual, isRemoteAvailable, err := remote.makeRequest(request) So(err, ShouldNotBeEmpty) + So(isRemoteAvailable, ShouldBeFalse) So(actual, ShouldBeEmpty) }) + + Convey("Client returns status Remote Unavailable status codes", t, func() { + for _, statusCode := range remoteUnavailableStatusCodes { + server := createServer(body, statusCode) + remote := Remote{client: server.Client(), config: &Config{URL: server.URL}} + request, _ := remote.prepareRequest(from, until, target) + actual, isRemoteAvailable, err := remote.makeRequest(request) + So(err, ShouldResemble, fmt.Errorf( + "the remote server is not available. Response status %d: %s", statusCode, string(body), + )) + So(isRemoteAvailable, ShouldBeFalse) + So(actual, ShouldResemble, body) + } + }) +} + +func TestMakeRequestWithRetries(t *testing.T) { + var from int64 = 300 + var until int64 = 500 + target := "foo.bar" + body := []byte("Some string") + testConfigs := []Config{ + {}, + {RetrySeconds: []time.Duration{time.Second}}, + {RetrySeconds: []time.Duration{time.Second}}, + {RetrySeconds: []time.Duration{time.Second, time.Second}}, + {RetrySeconds: []time.Duration{time.Second, time.Second}}, + {RetrySeconds: []time.Duration{time.Second, time.Second, time.Second}}, + } + mockCtrl := gomock.NewController(t) + + Convey("Given server returns OK response", t, func() { + server := createTestServer(TestResponse{body, http.StatusOK}) + systemClock := mock_clock.NewMockClock(mockCtrl) + systemClock.EXPECT().Sleep(time.Second).Times(0) + + Convey("request is successful", func() { + remote := Remote{client: server.Client(), clock: systemClock} + + for _, config := range testConfigs { + config.URL = server.URL + remote.config = &config + request, _ := remote.prepareRequest(from, until, target) + actual, isRemoteAvailable, err := remote.makeRequestWithRetries( + request, + remote.config.Timeout, + remote.config.RetrySeconds, + ) + So(err, ShouldBeNil) + So(isRemoteAvailable, ShouldBeTrue) + So(actual, ShouldResemble, body) + } + }) + }) + + Convey("Given server returns 500 response", t, func() { + server := createTestServer(TestResponse{body, http.StatusInternalServerError}) + systemClock := mock_clock.NewMockClock(mockCtrl) + systemClock.EXPECT().Sleep(time.Second).Times(0) + + Convey("request failed with 500 response and remote is available", func() { + remote := Remote{client: server.Client(), clock: systemClock} + + for _, config := range testConfigs { + config.URL = server.URL + remote.config = &config + request, _ := remote.prepareRequest(from, until, target) + actual, isRemoteAvailable, err := remote.makeRequestWithRetries( + request, + remote.config.Timeout, + remote.config.RetrySeconds, + ) + So(err, ShouldResemble, fmt.Errorf("bad response status %d: %s", http.StatusInternalServerError, string(body))) + So(isRemoteAvailable, ShouldBeTrue) + So(actual, ShouldResemble, body) + } + }) + }) + + Convey("Given client calls bad url", t, func() { + server := createTestServer(TestResponse{body, http.StatusOK}) + + Convey("request failed and remote is unavailable", func() { + remote := Remote{client: server.Client()} + for _, config := range testConfigs { + config.URL = "http://bad/" + remote.config = &config + systemClock := mock_clock.NewMockClock(mockCtrl) + systemClock.EXPECT().Sleep(time.Second).Times(len(config.RetrySeconds)) + remote.clock = systemClock + + request, _ := remote.prepareRequest(from, until, target) + actual, isRemoteAvailable, err := remote.makeRequestWithRetries( + request, + time.Millisecond, + remote.config.RetrySeconds, + ) + So(err, ShouldNotBeEmpty) + So(isRemoteAvailable, ShouldBeFalse) + So(actual, ShouldBeEmpty) + } + }) + }) + + Convey("Given server returns Remote Unavailable responses permanently", t, func() { + for _, statusCode := range remoteUnavailableStatusCodes { + server := createTestServer(TestResponse{body, statusCode}) + + Convey(fmt.Sprintf( + "request failed with %d response status code and remote is unavailable", statusCode, + ), func() { + remote := Remote{client: server.Client()} + for _, config := range testConfigs { + config.URL = server.URL + remote.config = &config + systemClock := mock_clock.NewMockClock(mockCtrl) + systemClock.EXPECT().Sleep(time.Second).Times(len(config.RetrySeconds)) + remote.clock = systemClock + + request, _ := remote.prepareRequest(from, until, target) + actual, isRemoteAvailable, err := remote.makeRequestWithRetries( + request, + remote.config.Timeout, + remote.config.RetrySeconds, + ) + So(err, ShouldResemble, fmt.Errorf( + "the remote server is not available. Response status %d: %s", statusCode, string(body), + )) + So(isRemoteAvailable, ShouldBeFalse) + So(actual, ShouldBeNil) + } + }) + } + }) + + Convey("Given server returns Remote Unavailable response temporary", t, func() { + for _, statusCode := range remoteUnavailableStatusCodes { + Convey(fmt.Sprintf( + "request is successful with retry after %d response and remote is available", statusCode, + ), func() { + for _, config := range testConfigs { + if len(config.RetrySeconds) == 0 { + continue + } + server := createTestServer( + TestResponse{body, statusCode}, + TestResponse{body, http.StatusOK}, + ) + config.URL = server.URL + systemClock := mock_clock.NewMockClock(mockCtrl) + systemClock.EXPECT().Sleep(time.Second).Times(1) + remote := Remote{client: server.Client(), config: &config, clock: systemClock} + + request, _ := remote.prepareRequest(from, until, target) + actual, isRemoteAvailable, err := remote.makeRequestWithRetries( + request, + remote.config.Timeout, + remote.config.RetrySeconds, + ) + So(err, ShouldBeNil) + So(isRemoteAvailable, ShouldBeTrue) + So(actual, ShouldResemble, body) + } + }) + } + }) } func createServer(body []byte, statusCode int) *httptest.Server { @@ -84,3 +259,42 @@ func createServer(body []byte, statusCode int) *httptest.Server { rw.Write(body) //nolint })) } + +func createTestServer(testResponses ...TestResponse) *httptest.Server { + responseWriter := NewTestResponseWriter(testResponses) + return httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { + response := responseWriter.GetResponse() + rw.WriteHeader(response.statusCode) + rw.Write(response.body) //nolint + })) +} + +type TestResponse struct { + body []byte + statusCode int +} + +type TestResponseWriter struct { + responses []TestResponse + count int +} + +func NewTestResponseWriter(testResponses []TestResponse) *TestResponseWriter { + responseWriter := new(TestResponseWriter) + responseWriter.responses = testResponses + responseWriter.count = 0 + return responseWriter +} + +func (responseWriter *TestResponseWriter) GetResponse() TestResponse { + response := responseWriter.responses[responseWriter.count%len(responseWriter.responses)] + responseWriter.count++ + return response +} + +var remoteUnavailableStatusCodes = []int{ + http.StatusUnauthorized, + http.StatusBadGateway, + http.StatusServiceUnavailable, + http.StatusGatewayTimeout, +} diff --git a/mock/clock/clock.go b/mock/clock/clock.go index fbfb56afb..6b55a6f3a 100644 --- a/mock/clock/clock.go +++ b/mock/clock/clock.go @@ -47,3 +47,15 @@ func (mr *MockClockMockRecorder) Now() *gomock.Call { mr.mock.ctrl.T.Helper() return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Now", reflect.TypeOf((*MockClock)(nil).Now)) } + +// Sleep mocks base method. +func (m *MockClock) Sleep(arg0 time.Duration) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "Sleep", arg0) +} + +// Sleep indicates an expected call of Sleep. +func (mr *MockClockMockRecorder) Sleep(arg0 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Sleep", reflect.TypeOf((*MockClock)(nil).Sleep), arg0) +} From 88feba3e07c1ff68248d184dca845fca7ee5a5fc Mon Sep 17 00:00:00 2001 From: "d.anchikov" Date: Fri, 17 Jun 2022 16:54:25 +0400 Subject: [PATCH 2/3] feat: Added handling of Remote Unavailable error --- api/error_response.go | 2 +- api/handler/triggers.go | 4 + checker/check.go | 39 +++--- checker/check_test.go | 177 ++++++++++++++++++++++++--- metric_source/remote/remote_test.go | 2 +- metric_source/remote/request.go | 2 +- metric_source/remote/request_test.go | 4 +- 7 files changed, 190 insertions(+), 40 deletions(-) diff --git a/api/error_response.go b/api/error_response.go index 23e77ffe3..842d6fb39 100644 --- a/api/error_response.go +++ b/api/error_response.go @@ -75,7 +75,7 @@ func ErrorRemoteServerUnavailable(err error) *ErrorResponse { return &ErrorResponse{ Err: err, HTTPStatusCode: 503, //nolint - StatusText: "Remote server unavailable.", + StatusText: "Remote server unavailable", ErrorText: fmt.Sprintf("Remote server error, please contact administrator. Raw error: %s", err.Error()), } } diff --git a/api/handler/triggers.go b/api/handler/triggers.go index 017c62155..448d70b11 100644 --- a/api/handler/triggers.go +++ b/api/handler/triggers.go @@ -90,6 +90,10 @@ func getTriggerFromRequest(request *http.Request) (*dto.Trigger, *api.ErrorRespo case api.ErrInvalidRequestContent: return nil, api.ErrorInvalidRequest(err) case remote.ErrRemoteTriggerResponse: + response := api.ErrorInvalidRequest(err) + middleware.GetLoggerEntry(request).Warningf("%s : %s : %s", response.StatusText, response.ErrorText, err) + return nil, response + case remote.ErrRemoteUnavailable: response := api.ErrorRemoteServerUnavailable(err) middleware.GetLoggerEntry(request).Errorf("%s : %s : %s", response.StatusText, response.ErrorText, err) return nil, response diff --git a/checker/check.go b/checker/check.go index 3030d93b1..7ab6cf50e 100644 --- a/checker/check.go +++ b/checker/check.go @@ -97,18 +97,17 @@ func (triggerChecker *TriggerChecker) handleFetchError(checkData moira.CheckData checkData.UpdateScore() return triggerChecker.database.SetTriggerLastCheck(triggerChecker.triggerID, &checkData, triggerChecker.trigger.IsRemote) } - case remote.ErrRemoteTriggerResponse: - timeSinceLastSuccessfulCheck := checkData.Timestamp - checkData.LastSuccessfulCheckTimestamp - if timeSinceLastSuccessfulCheck >= triggerChecker.ttl { - checkData.State = moira.StateEXCEPTION - checkData.Message = fmt.Sprintf("Remote server unavailable. Trigger is not checked for %d seconds", timeSinceLastSuccessfulCheck) - checkData, err = triggerChecker.compareTriggerStates(checkData) - } + case local.ErrUnknownFunction, local.ErrEvalExpr, remote.ErrRemoteTriggerResponse, remote.ErrRemoteUnavailable: triggerChecker.logger.Warning(formatTriggerCheckException(triggerChecker.triggerID, err)) - case local.ErrUnknownFunction, local.ErrEvalExpr: checkData.State = moira.StateEXCEPTION checkData.Message = err.Error() - triggerChecker.logger.Warning(formatTriggerCheckException(triggerChecker.triggerID, err)) + if _, ok := err.(remote.ErrRemoteUnavailable); ok { + timeSinceLastSuccessfulCheck := checkData.Timestamp - checkData.LastSuccessfulCheckTimestamp + checkData.Message = fmt.Sprintf( + "Remote server unavailable. Trigger is not checked for %d seconds", + timeSinceLastSuccessfulCheck, + ) + } default: return triggerChecker.handleUndefinedError(checkData, err) } @@ -235,8 +234,12 @@ func (triggerChecker *TriggerChecker) preparePatternMetrics(fetchedMetrics conve } // check is the function that handles check on prepared metrics. -func (triggerChecker *TriggerChecker) check(metrics map[string]map[string]metricSource.MetricData, - aloneMetrics map[string]metricSource.MetricData, checkData moira.CheckData, logger moira.Logger) (moira.CheckData, error) { +func (triggerChecker *TriggerChecker) check( + metrics map[string]map[string]metricSource.MetricData, + aloneMetrics map[string]metricSource.MetricData, + checkData moira.CheckData, + logger moira.Logger, +) (moira.CheckData, error) { if len(metrics) == 0 { // Case when trigger have only alone metrics if metrics == nil { metrics = make(map[string]map[string]metricSource.MetricData, 1) @@ -264,8 +267,11 @@ func (triggerChecker *TriggerChecker) check(metrics map[string]map[string]metric } // checkTargets is a Function that takes a -func (triggerChecker *TriggerChecker) checkTargets(metricName string, metrics map[string]metricSource.MetricData, - logger moira.Logger) (lastState moira.MetricState, needToDeleteMetric bool, err error) { +func (triggerChecker *TriggerChecker) checkTargets( + metricName string, + metrics map[string]metricSource.MetricData, + logger moira.Logger, +) (lastState moira.MetricState, needToDeleteMetric bool, err error) { lastState, metricStates, err := triggerChecker.getMetricStepsStates(metricName, metrics, logger) if err != nil { return lastState, needToDeleteMetric, err @@ -308,8 +314,11 @@ func (triggerChecker *TriggerChecker) checkForNoData(metricLastState moira.Metri ) } -func (triggerChecker *TriggerChecker) getMetricStepsStates(metricName string, metrics map[string]metricSource.MetricData, - logger moira.Logger) (last moira.MetricState, current []moira.MetricState, err error) { +func (triggerChecker *TriggerChecker) getMetricStepsStates( + metricName string, + metrics map[string]metricSource.MetricData, + logger moira.Logger, +) (last moira.MetricState, current []moira.MetricState, err error) { var startTime int64 var stepTime int64 diff --git a/checker/check_test.go b/checker/check_test.go index b9385bd51..f7d80adc9 100644 --- a/checker/check_test.go +++ b/checker/check_test.go @@ -6,6 +6,8 @@ import ( "testing" "time" + "github.com/moira-alert/moira/metric_source/remote" + "github.com/golang/mock/gomock" "github.com/moira-alert/moira" "github.com/moira-alert/moira/checker/metrics/conversion" @@ -619,6 +621,7 @@ func TestCheck(t *testing.T) { source := mock_metric_source.NewMockMetricSource(mockCtrl) fetchResult := mock_metric_source.NewMockFetchResult(mockCtrl) logger, _ := logging.GetLogger("Test") + logger.Level("fatal") // nolint: errcheck defer mockCtrl.Finish() var retention int64 = 10 @@ -627,8 +630,6 @@ func TestCheck(t *testing.T) { var errValue float64 = 20 pattern := "super.puper.pattern" //nolint metric := "super.puper.metric" //nolint - message := "ooops, metric error" - metricErr := fmt.Errorf(message) messageException := `Unknown graphite function: "WrongFunction"` unknownFunctionExc := local.ErrorUnknownFunction(fmt.Errorf(messageException)) @@ -667,31 +668,56 @@ func TestCheck(t *testing.T) { }, } - Convey("Fetch error", func() { - lastCheck := moira.CheckData{ + Convey("trigger should switch to EXCEPTION", func() { + expectedLastCheck := moira.CheckData{ Metrics: triggerChecker.lastCheck.Metrics, State: moira.StateEXCEPTION, Timestamp: triggerChecker.until, EventTimestamp: triggerChecker.until, Score: int64(100000), - Message: metricErr.Error(), MetricsToTargetRelation: map[string]string{}, } - - gomock.InOrder( - source.EXPECT().Fetch(pattern, triggerChecker.from, triggerChecker.until, true).Return(nil, metricErr), - dataBase.EXPECT().PushNotificationEvent(&moira.NotificationEvent{ - IsTriggerEvent: true, - TriggerID: triggerChecker.triggerID, - State: moira.StateEXCEPTION, - OldState: moira.StateOK, - Timestamp: int64(67), - Metric: triggerChecker.trigger.Name, - }, true).Return(nil), - dataBase.EXPECT().SetTriggerLastCheck(triggerChecker.triggerID, &lastCheck, triggerChecker.trigger.IsRemote).Return(nil), - ) - err := triggerChecker.Check() - So(err, ShouldBeNil) + expectedNotification := &moira.NotificationEvent{ + IsTriggerEvent: true, + TriggerID: triggerChecker.triggerID, + State: moira.StateEXCEPTION, + OldState: moira.StateOK, + Timestamp: int64(67), + Metric: triggerChecker.trigger.Name, + } + Convey("on undefined fetching error", func() { + fetchErr := fmt.Errorf("some error") + expectedLastCheck.Message = fetchErr.Error() + gomock.InOrder( + source.EXPECT().Fetch(pattern, triggerChecker.from, triggerChecker.until, true).Return(nil, fetchErr), + dataBase.EXPECT().PushNotificationEvent(expectedNotification, true).Return(nil), + dataBase.EXPECT().SetTriggerLastCheck(triggerChecker.triggerID, &expectedLastCheck, triggerChecker.trigger.IsRemote).Return(nil), + ) + err := triggerChecker.Check() + So(err, ShouldBeNil) + }) + Convey("on ErrRemoteUnavailable fetching error", func() { + fetchErr := remote.ErrRemoteUnavailable{InternalError: fmt.Errorf("some error")} + expectedLastCheck.Message = "Remote server unavailable. Trigger is not checked for 67 seconds" + gomock.InOrder( + source.EXPECT().Fetch(pattern, triggerChecker.from, triggerChecker.until, true).Return(nil, fetchErr), + dataBase.EXPECT().PushNotificationEvent(expectedNotification, true).Return(nil), + dataBase.EXPECT().SetTriggerLastCheck(triggerChecker.triggerID, &expectedLastCheck, triggerChecker.trigger.IsRemote).Return(nil), + ) + err := triggerChecker.Check() + So(err, ShouldBeNil) + }) + Convey("on ErrRemoteTriggerResponse fetching error", func() { + fetchErr := remote.ErrRemoteTriggerResponse{InternalError: fmt.Errorf("some error")} + expectedLastCheck.Message = fetchErr.Error() + gomock.InOrder( + source.EXPECT().Fetch(pattern, triggerChecker.from, triggerChecker.until, true).Return(nil, fetchErr), + dataBase.EXPECT().PushNotificationEvent(expectedNotification, true).Return(nil), + dataBase.EXPECT().SetTriggerLastCheck(triggerChecker.triggerID, &expectedLastCheck, triggerChecker.trigger.IsRemote).Return(nil), + ) + err := triggerChecker.Check() + So(err, ShouldBeNil) + }) }) Convey("Switch trigger to EXCEPTION and back", func() { @@ -1497,3 +1523,114 @@ func TestTriggerChecker_handlePrepareError(t *testing.T) { }) }) } + +func TestTriggerChecker_handleFetchError(t *testing.T) { + Convey("Test handleFetchError", t, func() { + mockCtrl := gomock.NewController(t) + defer mockCtrl.Finish() + dataBase := mock_moira_alert.NewMockDatabase(mockCtrl) + logger, _ := logging.GetLogger("Test") + logger.Level("error") // nolint: errcheck + + trigger := &moira.Trigger{} + triggerChecker := TriggerChecker{ + triggerID: "test trigger", + trigger: trigger, + database: dataBase, + logger: logger, + ttlState: moira.TTLStateNODATA, + } + checkData := moira.CheckData{} + + Convey("with ErrTriggerHasEmptyTargets", func() { + err := ErrTriggerHasEmptyTargets{} + checkData.Timestamp = int64(15) + triggerChecker.lastCheck = &moira.CheckData{ + State: moira.StateOK, + EventTimestamp: 10, + } + expectedCheckData := moira.CheckData{ + Score: 1000, + State: moira.StateNODATA, + Message: err.Error(), + Timestamp: int64(15), + } + dataBase.EXPECT().SetTriggerLastCheck("test trigger", &expectedCheckData, false) + actualErr := triggerChecker.handleFetchError(checkData, err) + So(actualErr, ShouldBeNil) + }) + Convey("with ErrTriggerHasOnlyWildcards", func() { + err := ErrTriggerHasOnlyWildcards{} + checkData.Timestamp = int64(15) + triggerChecker.lastCheck = &moira.CheckData{ + State: moira.StateOK, + EventTimestamp: 10, + } + expectedCheckData := moira.CheckData{ + Score: 1000, + State: moira.StateNODATA, + Message: err.Error(), + Timestamp: int64(15), + } + dataBase.EXPECT().SetTriggerLastCheck("test trigger", &expectedCheckData, false) + actualErr := triggerChecker.handleFetchError(checkData, err) + So(actualErr, ShouldBeNil) + }) + Convey("with ErrRemoteUnavailable", func() { + err := remote.ErrRemoteUnavailable{InternalError: fmt.Errorf("some error")} + checkData.Timestamp = int64(25) + checkData.LastSuccessfulCheckTimestamp = int64(5) + triggerChecker.lastCheck = &moira.CheckData{ + State: moira.StateOK, + EventTimestamp: 10, + } + expectedCheckData := moira.CheckData{ + Score: 100000, + State: moira.StateEXCEPTION, + Message: "Remote server unavailable. Trigger is not checked for 20 seconds", + Timestamp: int64(25), + EventTimestamp: int64(25), + LastSuccessfulCheckTimestamp: int64(5), + } + dataBase.EXPECT().PushNotificationEvent(&moira.NotificationEvent{ + IsTriggerEvent: true, + TriggerID: triggerChecker.triggerID, + State: moira.StateEXCEPTION, + OldState: getEventOldState(moira.StateOK, "", false), + Timestamp: 25, + Metric: triggerChecker.trigger.Name, + MessageEventInfo: nil, + }, true) + dataBase.EXPECT().SetTriggerLastCheck("test trigger", &expectedCheckData, false) + actualErr := triggerChecker.handleFetchError(checkData, err) + So(actualErr, ShouldBeNil) + }) + Convey("with ErrRemoteTriggerResponse", func() { + err := remote.ErrRemoteTriggerResponse{InternalError: fmt.Errorf("some error")} + checkData.Timestamp = int64(10) + triggerChecker.lastCheck = &moira.CheckData{ + State: moira.StateOK, + EventTimestamp: 10, + } + expectedCheckData := moira.CheckData{ + Score: 100000, + State: moira.StateEXCEPTION, + Message: err.Error(), + Timestamp: 10, + EventTimestamp: 10, + } + dataBase.EXPECT().PushNotificationEvent(&moira.NotificationEvent{ + IsTriggerEvent: true, + TriggerID: triggerChecker.triggerID, + State: moira.StateEXCEPTION, + OldState: getEventOldState(moira.StateOK, "", false), + Timestamp: 10, + Metric: triggerChecker.trigger.Name, + MessageEventInfo: nil, + }, true) + dataBase.EXPECT().SetTriggerLastCheck("test trigger", &expectedCheckData, false) + actualErr := triggerChecker.handleFetchError(checkData, err) + So(actualErr, ShouldBeNil) + }) + }) +} diff --git a/metric_source/remote/remote_test.go b/metric_source/remote/remote_test.go index 8a540bc6c..d752c9d0c 100644 --- a/metric_source/remote/remote_test.go +++ b/metric_source/remote/remote_test.go @@ -148,7 +148,7 @@ func TestFetch(t *testing.T) { remote.config = &config result, err := remote.Fetch(target, from, until, false) So(result, ShouldBeEmpty) - So(err.Error(), ShouldResemble, fmt.Sprintf("bad response status %d: %s", http.StatusInternalServerError, "Some string")) + So(err.Error(), ShouldResemble, fmt.Sprintf("remote server response status %d: %s", http.StatusInternalServerError, "Some string")) _, ok := err.(ErrRemoteTriggerResponse) So(ok, ShouldBeTrue) } diff --git a/metric_source/remote/request.go b/metric_source/remote/request.go index 3c22cc000..d6999a25d 100644 --- a/metric_source/remote/request.go +++ b/metric_source/remote/request.go @@ -50,7 +50,7 @@ func (remote *Remote) makeRequest(req *http.Request) (body []byte, isRemoteAvail "the remote server is not available. Response status %d: %s", resp.StatusCode, string(body), ) } else if resp.StatusCode != http.StatusOK { - return body, true, fmt.Errorf("bad response status %d: %s", resp.StatusCode, string(body)) + return body, true, fmt.Errorf("remote server response status %d: %s", resp.StatusCode, string(body)) } return body, true, nil diff --git a/metric_source/remote/request_test.go b/metric_source/remote/request_test.go index ecd9e057f..6520b7245 100644 --- a/metric_source/remote/request_test.go +++ b/metric_source/remote/request_test.go @@ -69,7 +69,7 @@ func TestMakeRequest(t *testing.T) { remote := Remote{client: server.Client(), config: &Config{URL: server.URL}} request, _ := remote.prepareRequest(from, until, target) actual, isRemoteAvailable, err := remote.makeRequest(request) - So(err, ShouldResemble, fmt.Errorf("bad response status %d: %s", http.StatusInternalServerError, string(body))) + So(err, ShouldResemble, fmt.Errorf("remote server response status %d: %s", http.StatusInternalServerError, string(body))) So(isRemoteAvailable, ShouldBeTrue) So(actual, ShouldResemble, body) }) @@ -157,7 +157,7 @@ func TestMakeRequestWithRetries(t *testing.T) { remote.config.Timeout, remote.config.RetrySeconds, ) - So(err, ShouldResemble, fmt.Errorf("bad response status %d: %s", http.StatusInternalServerError, string(body))) + So(err, ShouldResemble, fmt.Errorf("remote server response status %d: %s", http.StatusInternalServerError, string(body))) So(isRemoteAvailable, ShouldBeTrue) So(actual, ShouldResemble, body) } From 8c0f2d3a07033d29b4da2c4b920e51063589dcae Mon Sep 17 00:00:00 2001 From: Dmitry Anchikov Date: Wed, 24 Aug 2022 12:05:22 +0400 Subject: [PATCH 3/3] feat: Added specific metric for graphite unavailability check result (#766) --- checker/worker/remote.go | 1 + metrics/checker.go | 12 +++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/checker/worker/remote.go b/checker/worker/remote.go index a424e4e9e..77f6f1ca1 100644 --- a/checker/worker/remote.go +++ b/checker/worker/remote.go @@ -48,6 +48,7 @@ func (worker *Checker) checkRemote() error { remoteAvailable, err := source.(*remote.Remote).IsRemoteAvailable() if !remoteAvailable { worker.Logger.Infof("Remote API is unavailable. Stop checking remote triggers. Error: %s", err.Error()) + worker.Metrics.RemoteAvailabilityCheckFailed.Mark(1) } else { worker.Logger.Debug("Checking remote triggers") triggerIds, err := worker.Database.GetRemoteTriggerIDs() diff --git a/metrics/checker.go b/metrics/checker.go index 0d92592c3..5fcdd449f 100644 --- a/metrics/checker.go +++ b/metrics/checker.go @@ -4,11 +4,12 @@ import "github.com/moira-alert/moira" // CheckerMetrics is a collection of metrics used in checker type CheckerMetrics struct { - LocalMetrics *CheckMetrics - RemoteMetrics *CheckMetrics - MetricEventsChannelLen Histogram - UnusedTriggersCount Histogram - MetricEventsHandleTime Timer + LocalMetrics *CheckMetrics + RemoteMetrics *CheckMetrics + MetricEventsChannelLen Histogram + UnusedTriggersCount Histogram + MetricEventsHandleTime Timer + RemoteAvailabilityCheckFailed Meter } // GetCheckMetrics return check metrics dependent on given trigger type @@ -37,6 +38,7 @@ func ConfigureCheckerMetrics(registry Registry, remoteEnabled bool) *CheckerMetr } if remoteEnabled { m.RemoteMetrics = configureCheckMetrics(registry, "remote") + m.RemoteAvailabilityCheckFailed = registry.NewMeter("remote", "unavailable") } return m }