Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Added handling of Remote Unavailable error #765

Open
wants to merge 3 commits into
base: feature/graphite-unavailability-retriable-calls
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion api/error_response.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ func ErrorRemoteServerUnavailable(err error) *ErrorResponse {
return &ErrorResponse{
Err: err,
HTTPStatusCode: 503, //nolint
StatusText: "Remote server unavailable.",
StatusText: "Remote server unavailable",
ErrorText: fmt.Sprintf("Remote server error, please contact administrator. Raw error: %s", err.Error()),
}
}
Expand Down
6 changes: 5 additions & 1 deletion api/handler/triggers.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,12 @@ func getTriggerFromRequest(request *http.Request) (*dto.Trigger, *api.ErrorRespo
case api.ErrInvalidRequestContent:
return nil, api.ErrorInvalidRequest(err)
case remote.ErrRemoteTriggerResponse:
response := api.ErrorInvalidRequest(err)
middleware.GetLoggerEntry(request).Warningf("%s : %s : %s", response.StatusText, response.ErrorText, err)
return nil, response
case remote.ErrRemoteUnavailable:
response := api.ErrorRemoteServerUnavailable(err)
middleware.GetLoggerEntry(request).Error("%s : %s : %s", response.StatusText, response.ErrorText, err)
middleware.GetLoggerEntry(request).Errorf("%s : %s : %s", response.StatusText, response.ErrorText, err)
return nil, response
default:
return nil, api.ErrorInternalServer(err)
Expand Down
39 changes: 24 additions & 15 deletions checker/check.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,18 +97,17 @@ func (triggerChecker *TriggerChecker) handleFetchError(checkData moira.CheckData
checkData.UpdateScore()
return triggerChecker.database.SetTriggerLastCheck(triggerChecker.triggerID, &checkData, triggerChecker.trigger.IsRemote)
}
case remote.ErrRemoteTriggerResponse:
timeSinceLastSuccessfulCheck := checkData.Timestamp - checkData.LastSuccessfulCheckTimestamp
if timeSinceLastSuccessfulCheck >= triggerChecker.ttl {
checkData.State = moira.StateEXCEPTION
checkData.Message = fmt.Sprintf("Remote server unavailable. Trigger is not checked for %d seconds", timeSinceLastSuccessfulCheck)
checkData, err = triggerChecker.compareTriggerStates(checkData)
}
case local.ErrUnknownFunction, local.ErrEvalExpr, remote.ErrRemoteTriggerResponse, remote.ErrRemoteUnavailable:
triggerChecker.logger.Warning(formatTriggerCheckException(triggerChecker.triggerID, err))
case local.ErrUnknownFunction, local.ErrEvalExpr:
checkData.State = moira.StateEXCEPTION
checkData.Message = err.Error()
triggerChecker.logger.Warning(formatTriggerCheckException(triggerChecker.triggerID, err))
if _, ok := err.(remote.ErrRemoteUnavailable); ok {
timeSinceLastSuccessfulCheck := checkData.Timestamp - checkData.LastSuccessfulCheckTimestamp
checkData.Message = fmt.Sprintf(
"Remote server unavailable. Trigger is not checked for %d seconds",
timeSinceLastSuccessfulCheck,
)
}
default:
return triggerChecker.handleUndefinedError(checkData, err)
}
Expand Down Expand Up @@ -235,8 +234,12 @@ func (triggerChecker *TriggerChecker) preparePatternMetrics(fetchedMetrics conve
}

// check is the function that handles check on prepared metrics.
func (triggerChecker *TriggerChecker) check(metrics map[string]map[string]metricSource.MetricData,
aloneMetrics map[string]metricSource.MetricData, checkData moira.CheckData, logger moira.Logger) (moira.CheckData, error) {
func (triggerChecker *TriggerChecker) check(
metrics map[string]map[string]metricSource.MetricData,
aloneMetrics map[string]metricSource.MetricData,
checkData moira.CheckData,
logger moira.Logger,
) (moira.CheckData, error) {
if len(metrics) == 0 { // Case when trigger have only alone metrics
if metrics == nil {
metrics = make(map[string]map[string]metricSource.MetricData, 1)
Expand Down Expand Up @@ -264,8 +267,11 @@ func (triggerChecker *TriggerChecker) check(metrics map[string]map[string]metric
}

// checkTargets is a Function that takes a
func (triggerChecker *TriggerChecker) checkTargets(metricName string, metrics map[string]metricSource.MetricData,
logger moira.Logger) (lastState moira.MetricState, needToDeleteMetric bool, err error) {
func (triggerChecker *TriggerChecker) checkTargets(
metricName string,
metrics map[string]metricSource.MetricData,
logger moira.Logger,
) (lastState moira.MetricState, needToDeleteMetric bool, err error) {
lastState, metricStates, err := triggerChecker.getMetricStepsStates(metricName, metrics, logger)
if err != nil {
return lastState, needToDeleteMetric, err
Expand Down Expand Up @@ -308,8 +314,11 @@ func (triggerChecker *TriggerChecker) checkForNoData(metricLastState moira.Metri
)
}

func (triggerChecker *TriggerChecker) getMetricStepsStates(metricName string, metrics map[string]metricSource.MetricData,
logger moira.Logger) (last moira.MetricState, current []moira.MetricState, err error) {
func (triggerChecker *TriggerChecker) getMetricStepsStates(
metricName string,
metrics map[string]metricSource.MetricData,
logger moira.Logger,
) (last moira.MetricState, current []moira.MetricState, err error) {
var startTime int64
var stepTime int64

Expand Down
177 changes: 157 additions & 20 deletions checker/check_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ import (
"testing"
"time"

"github.com/moira-alert/moira/metric_source/remote"

"github.com/golang/mock/gomock"
"github.com/moira-alert/moira"
"github.com/moira-alert/moira/checker/metrics/conversion"
Expand Down Expand Up @@ -619,6 +621,7 @@ func TestCheck(t *testing.T) {
source := mock_metric_source.NewMockMetricSource(mockCtrl)
fetchResult := mock_metric_source.NewMockFetchResult(mockCtrl)
logger, _ := logging.GetLogger("Test")
logger.Level("fatal") // nolint: errcheck
kissken marked this conversation as resolved.
Show resolved Hide resolved
defer mockCtrl.Finish()

var retention int64 = 10
Expand All @@ -627,8 +630,6 @@ func TestCheck(t *testing.T) {
var errValue float64 = 20
pattern := "super.puper.pattern" //nolint
metric := "super.puper.metric" //nolint
message := "ooops, metric error"
metricErr := fmt.Errorf(message)
messageException := `Unknown graphite function: "WrongFunction"`
unknownFunctionExc := local.ErrorUnknownFunction(fmt.Errorf(messageException))

Expand Down Expand Up @@ -667,31 +668,56 @@ func TestCheck(t *testing.T) {
},
}

Convey("Fetch error", func() {
lastCheck := moira.CheckData{
Convey("trigger should switch to EXCEPTION", func() {
expectedLastCheck := moira.CheckData{
Metrics: triggerChecker.lastCheck.Metrics,
State: moira.StateEXCEPTION,
Timestamp: triggerChecker.until,
EventTimestamp: triggerChecker.until,
Score: int64(100000),
Message: metricErr.Error(),
MetricsToTargetRelation: map[string]string{},
}

gomock.InOrder(
source.EXPECT().Fetch(pattern, triggerChecker.from, triggerChecker.until, true).Return(nil, metricErr),
dataBase.EXPECT().PushNotificationEvent(&moira.NotificationEvent{
IsTriggerEvent: true,
TriggerID: triggerChecker.triggerID,
State: moira.StateEXCEPTION,
OldState: moira.StateOK,
Timestamp: int64(67),
Metric: triggerChecker.trigger.Name,
}, true).Return(nil),
dataBase.EXPECT().SetTriggerLastCheck(triggerChecker.triggerID, &lastCheck, triggerChecker.trigger.IsRemote).Return(nil),
)
err := triggerChecker.Check()
So(err, ShouldBeNil)
expectedNotification := &moira.NotificationEvent{
IsTriggerEvent: true,
TriggerID: triggerChecker.triggerID,
State: moira.StateEXCEPTION,
OldState: moira.StateOK,
Timestamp: int64(67),
Metric: triggerChecker.trigger.Name,
}
Convey("on undefined fetching error", func() {
fetchErr := fmt.Errorf("some error")
expectedLastCheck.Message = fetchErr.Error()
gomock.InOrder(
source.EXPECT().Fetch(pattern, triggerChecker.from, triggerChecker.until, true).Return(nil, fetchErr),
dataBase.EXPECT().PushNotificationEvent(expectedNotification, true).Return(nil),
dataBase.EXPECT().SetTriggerLastCheck(triggerChecker.triggerID, &expectedLastCheck, triggerChecker.trigger.IsRemote).Return(nil),
)
err := triggerChecker.Check()
So(err, ShouldBeNil)
})
Convey("on ErrRemoteUnavailable fetching error", func() {
fetchErr := remote.ErrRemoteUnavailable{InternalError: fmt.Errorf("some error")}
expectedLastCheck.Message = "Remote server unavailable. Trigger is not checked for 67 seconds"
gomock.InOrder(
source.EXPECT().Fetch(pattern, triggerChecker.from, triggerChecker.until, true).Return(nil, fetchErr),
dataBase.EXPECT().PushNotificationEvent(expectedNotification, true).Return(nil),
dataBase.EXPECT().SetTriggerLastCheck(triggerChecker.triggerID, &expectedLastCheck, triggerChecker.trigger.IsRemote).Return(nil),
)
err := triggerChecker.Check()
So(err, ShouldBeNil)
})
Convey("on ErrRemoteTriggerResponse fetching error", func() {
fetchErr := remote.ErrRemoteTriggerResponse{InternalError: fmt.Errorf("some error")}
expectedLastCheck.Message = fetchErr.Error()
gomock.InOrder(
source.EXPECT().Fetch(pattern, triggerChecker.from, triggerChecker.until, true).Return(nil, fetchErr),
dataBase.EXPECT().PushNotificationEvent(expectedNotification, true).Return(nil),
dataBase.EXPECT().SetTriggerLastCheck(triggerChecker.triggerID, &expectedLastCheck, triggerChecker.trigger.IsRemote).Return(nil),
)
err := triggerChecker.Check()
So(err, ShouldBeNil)
})
})

Convey("Switch trigger to EXCEPTION and back", func() {
Expand Down Expand Up @@ -1497,3 +1523,114 @@ func TestTriggerChecker_handlePrepareError(t *testing.T) {
})
})
}

func TestTriggerChecker_handleFetchError(t *testing.T) {
Convey("Test handleFetchError", t, func() {
mockCtrl := gomock.NewController(t)
defer mockCtrl.Finish()
dataBase := mock_moira_alert.NewMockDatabase(mockCtrl)
logger, _ := logging.GetLogger("Test")
logger.Level("error") // nolint: errcheck
kissken marked this conversation as resolved.
Show resolved Hide resolved

trigger := &moira.Trigger{}
triggerChecker := TriggerChecker{
triggerID: "test trigger",
trigger: trigger,
database: dataBase,
logger: logger,
ttlState: moira.TTLStateNODATA,
}
checkData := moira.CheckData{}

Convey("with ErrTriggerHasEmptyTargets", func() {
err := ErrTriggerHasEmptyTargets{}
checkData.Timestamp = int64(15)
triggerChecker.lastCheck = &moira.CheckData{
State: moira.StateOK,
EventTimestamp: 10,
}
expectedCheckData := moira.CheckData{
Score: 1000,
State: moira.StateNODATA,
Message: err.Error(),
Timestamp: int64(15),
}
dataBase.EXPECT().SetTriggerLastCheck("test trigger", &expectedCheckData, false)
actualErr := triggerChecker.handleFetchError(checkData, err)
So(actualErr, ShouldBeNil)
})
Convey("with ErrTriggerHasOnlyWildcards", func() {
err := ErrTriggerHasOnlyWildcards{}
checkData.Timestamp = int64(15)
triggerChecker.lastCheck = &moira.CheckData{
State: moira.StateOK,
EventTimestamp: 10,
}
expectedCheckData := moira.CheckData{
Score: 1000,
State: moira.StateNODATA,
Message: err.Error(),
Timestamp: int64(15),
}
dataBase.EXPECT().SetTriggerLastCheck("test trigger", &expectedCheckData, false)
actualErr := triggerChecker.handleFetchError(checkData, err)
So(actualErr, ShouldBeNil)
})
Convey("with ErrRemoteUnavailable", func() {
err := remote.ErrRemoteUnavailable{InternalError: fmt.Errorf("some error")}
checkData.Timestamp = int64(25)
checkData.LastSuccessfulCheckTimestamp = int64(5)
triggerChecker.lastCheck = &moira.CheckData{
State: moira.StateOK,
EventTimestamp: 10,
}
expectedCheckData := moira.CheckData{
Score: 100000,
State: moira.StateEXCEPTION,
Message: "Remote server unavailable. Trigger is not checked for 20 seconds",
Timestamp: int64(25),
EventTimestamp: int64(25),
LastSuccessfulCheckTimestamp: int64(5),
}
dataBase.EXPECT().PushNotificationEvent(&moira.NotificationEvent{
IsTriggerEvent: true,
TriggerID: triggerChecker.triggerID,
State: moira.StateEXCEPTION,
OldState: getEventOldState(moira.StateOK, "", false),
Timestamp: 25,
Metric: triggerChecker.trigger.Name,
MessageEventInfo: nil,
}, true)
dataBase.EXPECT().SetTriggerLastCheck("test trigger", &expectedCheckData, false)
actualErr := triggerChecker.handleFetchError(checkData, err)
So(actualErr, ShouldBeNil)
})
Convey("with ErrRemoteTriggerResponse", func() {
err := remote.ErrRemoteTriggerResponse{InternalError: fmt.Errorf("some error")}
checkData.Timestamp = int64(10)
triggerChecker.lastCheck = &moira.CheckData{
State: moira.StateOK,
EventTimestamp: 10,
}
expectedCheckData := moira.CheckData{
Score: 100000,
State: moira.StateEXCEPTION,
Message: err.Error(),
Timestamp: 10,
EventTimestamp: 10,
}
dataBase.EXPECT().PushNotificationEvent(&moira.NotificationEvent{
IsTriggerEvent: true,
TriggerID: triggerChecker.triggerID,
State: moira.StateEXCEPTION,
OldState: getEventOldState(moira.StateOK, "", false),
Timestamp: 10,
Metric: triggerChecker.trigger.Name,
MessageEventInfo: nil,
}, true)
dataBase.EXPECT().SetTriggerLastCheck("test trigger", &expectedCheckData, false)
actualErr := triggerChecker.handleFetchError(checkData, err)
So(actualErr, ShouldBeNil)
})
})
}
1 change: 1 addition & 0 deletions checker/worker/remote.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ func (worker *Checker) checkRemote() error {
remoteAvailable, err := source.(*remote.Remote).IsRemoteAvailable()
if !remoteAvailable {
worker.Logger.Infof("Remote API is unavailable. Stop checking remote triggers. Error: %s", err.Error())
worker.Metrics.RemoteAvailabilityCheckFailed.Mark(1)
} else {
worker.Logger.Debug("Checking remote triggers")
triggerIds, err := worker.Database.GetRemoteTriggerIDs()
Expand Down
5 changes: 5 additions & 0 deletions clock/clock.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,8 @@ func NewSystemClock() *SystemClock {
func (t *SystemClock) Now() time.Time {
return time.Now().UTC()
}

// Sleep pauses the current goroutine for at least the passed duration
func (t *SystemClock) Sleep(duration time.Duration) {
time.Sleep(duration)
}
8 changes: 5 additions & 3 deletions cmd/checker/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,11 @@ func getDefault() config {
Pprof: cmd.ProfilerConfig{Enabled: false},
},
Remote: cmd.RemoteConfig{
CheckInterval: "60s",
Timeout: "60s",
MetricsTTL: "7d",
CheckInterval: "60s",
Timeout: "60s",
MetricsTTL: "168h",
HealthCheckTimeout: "60s",
HealthCheckRetrySeconds: "1 1",
},
}
}
Loading