Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Increase RDS connection pool max capacity in Prod #332

Merged
merged 1 commit into from
Jan 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions app/stacks/cumulus/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -161,9 +161,9 @@ resource "aws_cloudwatch_event_target" "background_job_queue_watcher" {
arn = module.cumulus.sqs2sfThrottle_lambda_function_arn
input = jsonencode(
{
messageLimit = 1000
messageLimit = 300
queueUrl = aws_sqs_queue.background_job_queue.id
timeLimit = 60
timeLimit = 30
}
)
}
Expand Down Expand Up @@ -576,7 +576,7 @@ module "cumulus" {
{
id = "backgroundJobQueue",
url = aws_sqs_queue.background_job_queue.id,
execution_limit = 300
execution_limit = 500
}
]
}
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "WV03_MSI_L1B",
"version": "1",
"duplicateHandling": "replace",
"duplicateHandling": "skip",
"granuleId": ".*",
"granuleIdExtraction": "^(WV03_.+-M1BS-.+_P\\d{3}).+(?<!rename)$",
"sampleFileName": "WV03_20140824082814_10400100012AD900_14AUG24082814-M1BS-504548417070_01_P001-BROWSE.jpg",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "WV03_Pan_L1B",
"version": "1",
"duplicateHandling": "replace",
"duplicateHandling": "skip",
"granuleId": ".*",
"granuleIdExtraction": "^(WV03_.+-P1BS-.+_P\\d{3}).+(?<!rename)$",
"sampleFileName": "WV03_20140824052550_104001000109D200_14AUG24052550-P1BS-506481065090_01_P001-BROWSE.jpg",
Expand Down
35 changes: 26 additions & 9 deletions app/stacks/cumulus/templates/discover-granules-workflow.asl.json
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@
"DiscoverGranulesMap": {
"Type": "Map",
"End": true,
"MaxConcurrency": 3,
"ToleratedFailurePercentage": 3,
"MaxConcurrency": 7,
"ToleratedFailurePercentage": 1,
"ItemReader": {
"Resource": "arn:aws:states:::s3:getObject",
"ReaderConfig": {
Expand Down Expand Up @@ -102,7 +102,8 @@
"Lambda.ClientExecutionTimeoutException",
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException"
"Lambda.SdkClientException",
"Lambda.TooManyRequestsException"
],
"IntervalSeconds": 2,
"MaxAttempts": 6,
Expand Down Expand Up @@ -144,7 +145,8 @@
"Lambda.ClientExecutionTimeoutException",
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException"
"Lambda.SdkClientException",
"Lambda.TooManyRequestsException"
],
"IntervalSeconds": 2,
"MaxAttempts": 6,
Expand All @@ -155,7 +157,7 @@
},
"QueueGranulesMap": {
"Type": "Map",
"MaxConcurrency": 1,
"MaxConcurrency": 2,
"ToleratedFailurePercentage": 0,
"ItemsPath": "$",
"ResultWriter": {
Expand Down Expand Up @@ -195,7 +197,8 @@
"Lambda.ClientExecutionTimeoutException",
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException"
"Lambda.SdkClientException",
"Lambda.TooManyRequestsException"
],
"IntervalSeconds": 2,
"MaxAttempts": 6,
Expand Down Expand Up @@ -245,7 +248,8 @@
"Lambda.ClientExecutionTimeoutException",
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException"
"Lambda.SdkClientException",
"Lambda.TooManyRequestsException"
],
"IntervalSeconds": 2,
"MaxAttempts": 6,
Expand Down Expand Up @@ -287,7 +291,7 @@
"TargetPath": "$.payload"
},
"task_config": {
"concurrency": 2,
"concurrency": 4,
"queueUrl": "${background_job_queue_url}",
"preferredQueueBatchSize": "{$.meta.collection.meta.preferredQueueBatchSize}",
"provider": "{$.meta.provider}",
Expand All @@ -303,15 +307,28 @@
"Resource": "${queue_granules_task_arn}",
"Retry": [
{
"Comment": "Include 'Error' because Cumulus fails to rethrow Knex errors as something more specific.",
"ErrorEquals": [
"Error",
"Lambda.ClientExecutionTimeoutException",
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException"
"Lambda.SdkClientException",
"Lambda.TooManyRequestsException"
],
"IntervalSeconds": 2,
"MaxAttempts": 6,
"BackoffRate": 2
},
{
"Comment": "When approaching a timeout, the CMA causes early termination, thus preventing an actual timeout, which produces 'Lambda.Unknown' errors instead. In these cases, we don't want as many retries, and we want to space them further apart.",
"ErrorEquals": [
"Lambda.Unknown",
"States.Timeout"
],
"IntervalSeconds": 120,
"MaxAttempts": 4,
"BackoffRate": 2
}
]
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,21 @@
}
}
},
"Next": "SyncGranule"
"Next": "SyncGranule",
"Retry": [
{
"ErrorEquals": [
"Lambda.ClientExecutionTimeoutException",
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException",
"Lambda.TooManyRequestsException"
],
"IntervalSeconds": 2,
"MaxAttempts": 6,
"BackoffRate": 2
}
]
},
"SyncGranule": {
"Parameters": {
Expand Down Expand Up @@ -79,6 +93,18 @@
"Resource": "${sync_granule_task_arn}",
"Next": "AddUmmgChecksums",
"Retry": [
{
"ErrorEquals": [
"Lambda.ClientExecutionTimeoutException",
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException",
"Lambda.TooManyRequestsException"
],
"IntervalSeconds": 2,
"MaxAttempts": 6,
"BackoffRate": 2
},
{
"ErrorEquals": [
"States.ALL"
Expand Down Expand Up @@ -117,7 +143,29 @@
}
}
},
"Next": "AddMissingFileChecksums"
"Next": "AddMissingFileChecksums",
"Retry": [
{
"ErrorEquals": [
"Lambda.ClientExecutionTimeoutException",
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException",
"Lambda.TooManyRequestsException"
],
"IntervalSeconds": 2,
"MaxAttempts": 6,
"BackoffRate": 2
},
{
"ErrorEquals": [
"States.ALL"
],
"IntervalSeconds": 2,
"BackoffRate": 2,
"MaxAttempts": 3
}
]
},
"AddMissingFileChecksums": {
"Type": "Task",
Expand Down Expand Up @@ -148,7 +196,29 @@
}
}
},
"Next": "MoveGranule"
"Next": "MoveGranule",
"Retry": [
{
"ErrorEquals": [
"Lambda.ClientExecutionTimeoutException",
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException",
"Lambda.TooManyRequestsException"
],
"IntervalSeconds": 2,
"MaxAttempts": 6,
"BackoffRate": 2
},
{
"ErrorEquals": [
"States.ALL"
],
"IntervalSeconds": 2,
"BackoffRate": 2,
"MaxAttempts": 3
}
]
},
"MoveGranule": {
"Parameters": {
Expand All @@ -174,14 +244,23 @@
"Retry": [
{
"ErrorEquals": [
"Lambda.ClientExecutionTimeoutException",
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException",
"States.TaskFailed"
"Lambda.TooManyRequestsException"
],
"IntervalSeconds": 2,
"MaxAttempts": 6,
"BackoffRate": 2
},
{
"ErrorEquals": [
"States.ALL"
],
"IntervalSeconds": 2,
"BackoffRate": 2,
"MaxAttempts": 6
"MaxAttempts": 3
}
]
},
Expand Down Expand Up @@ -218,13 +297,23 @@
"Retry": [
{
"ErrorEquals": [
"Lambda.ClientExecutionTimeoutException",
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException"
"Lambda.SdkClientException",
"Lambda.TooManyRequestsException"
],
"IntervalSeconds": 2,
"MaxAttempts": 6,
"BackoffRate": 2
},
{
"ErrorEquals": [
"States.ALL"
],
"IntervalSeconds": 2,
"BackoffRate": 2,
"MaxAttempts": 3
}
]
},
Expand All @@ -246,20 +335,29 @@
},
"Type": "Task",
"Resource": "${copy_to_archive_adapter_task_arn}",
"Next": "PostToCmr",
"Retry": [
{
"ErrorEquals": [
"Lambda.ClientExecutionTimeoutException",
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException",
"States.TaskFailed"
"Lambda.TooManyRequestsException"
],
"IntervalSeconds": 2,
"MaxAttempts": 6,
"BackoffRate": 2
},
{
"ErrorEquals": [
"States.ALL"
],
"IntervalSeconds": 2,
"BackoffRate": 2,
"MaxAttempts": 3
}
],
"Next": "PostToCmr"
]
},
"PostToCmr": {
"Parameters": {
Expand All @@ -284,14 +382,23 @@
"Retry": [
{
"ErrorEquals": [
"Lambda.ClientExecutionTimeoutException",
"Lambda.ServiceException",
"Lambda.AWSLambdaException",
"Lambda.SdkClientException",
"States.TaskFailed"
"Lambda.TooManyRequestsException"
],
"IntervalSeconds": 2,
"MaxAttempts": 6,
"BackoffRate": 2
},
{
"ErrorEquals": [
"States.ALL"
],
"IntervalSeconds": 2,
"BackoffRate": 2,
"MaxAttempts": 3
}
],
"End": true
Expand Down
2 changes: 2 additions & 0 deletions app/stacks/rds-cluster/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ module "rds_cluster" {
permissions_boundary_arn = local.permissions_boundary_arn
prefix = var.prefix
provision_user_database = true
min_capacity = var.min_capacity
max_capacity = var.max_capacity
# ORCA requires us to use a password that contains a special character, but there is
# some Cumulus constraint that allows only an underscore (in addition to alphanumeric
# characters), and no other special characters, so we must generate a password that
Expand Down
4 changes: 4 additions & 0 deletions app/stacks/rds-cluster/tfvars/prod.tfvars
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Production values for the capacity variables declared in this stack's variables.tf;
# main.tf forwards both to the `rds_cluster` module as `min_capacity` / `max_capacity`.
# See https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/rds_cluster

# Same as the variable's declared default (2); stated explicitly for prod.
min_capacity = 2
# Raised from the declared default of 4.
# NOTE(review): presumably measured in Aurora capacity units (ACUs), with 384 being the
# largest value the engine accepts — confirm against the module and the docs linked above.
max_capacity = 384
9 changes: 9 additions & 0 deletions app/stacks/rds-cluster/variables.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Upper capacity bound, forwarded by main.tf to the `rds_cluster` module's
# `max_capacity` input. Defaults to 4 when no tfvars file overrides it.
# NOTE(review): units are presumably Aurora capacity units (ACUs) — confirm against the module.
variable "max_capacity" {
type = number
default = 4
}

# Lower capacity bound, forwarded by main.tf to the `rds_cluster` module's
# `min_capacity` input. Defaults to 2 when no tfvars file overrides it.
# NOTE(review): units are presumably Aurora capacity units (ACUs) — confirm against the module.
variable "min_capacity" {
type = number
default = 2
}
4 changes: 2 additions & 2 deletions src/lib/discovery.ts
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,13 @@ type BucketKey = {
readonly key: string;
};

// Default maximum batch size for batching granules after discovery is 5000, but this
// Default maximum batch size for batching granules after discovery is 1000, but this
// can be set on a per rule basis by setting `meta.maxBatchSize` in a rule definition.
export const BatchGranulesInput = t.readonly(
t.type({
config: t.type({
providerPath: t.string,
maxBatchSize: tt.fromNullable(t.number, 5000),
maxBatchSize: tt.fromNullable(t.number, 1000),
}),
input: DiscoverGranulesOutput,
})
Expand Down
Loading