From db373b986508ebbccad50416a6f278911637ddc1 Mon Sep 17 00:00:00 2001 From: Chuck Daniels Date: Wed, 22 Nov 2023 15:36:43 -0500 Subject: [PATCH] Replace expired Launchpad certificate (#298) Fixes #297 --- .gitignore | 2 + Makefile | 17 +++--- app/stacks/cumulus/main.tf | 20 ++++++++ bin/copy-launchpad-pfx.sh | 43 ---------------- bin/transfer-launchpad-pfx-secret.sh | 39 -------------- bin/update-launchpad-pfx.sh | 68 ++++++++++++++++++++++++ docs/OPERATING.md | 77 ++++++++++++++++++++++++++++ docs/TROUBLESHOOTING.md | 14 +++++ 8 files changed, 189 insertions(+), 91 deletions(-) delete mode 100755 bin/copy-launchpad-pfx.sh delete mode 100755 bin/transfer-launchpad-pfx-secret.sh create mode 100755 bin/update-launchpad-pfx.sh diff --git a/.gitignore b/.gitignore index 8d0d658..f00a438 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +*.pfx +*.zip /log # Created by https://www.toptal.com/developers/gitignore/api/terraform diff --git a/Makefile b/Makefile index 141790f..46af27b 100644 --- a/Makefile +++ b/Makefile @@ -54,10 +54,12 @@ all-up: logs-init install ## all-up-yes: Deploys all modules (in dependency order) with automatic approval all-up-yes: logs-init install + $(eval DOCKER_RUN_OPTS := --interactive) tail -f log/up/*.log & $(TERRASPACE) all up --yes; kill $$! 
## all-SUBCOMMAND: Runs Terraspace SUBCOMMAND across all stacks (make all-help for list of SUBCOMMANDs) all-%: install + $(eval DOCKER_RUN_OPTS := --interactive) $(TERRASPACE) all $(patsubst all-%,%,$@) ## bash: Runs bash terminal in Docker container @@ -145,22 +147,15 @@ nuke: docker output-%: docker $(TERRASPACE) output $* -plan-cumulus: install - ## plan-STACK: Runs `terraform plan` for specified STACK plan-%: install $(eval DOCKER_RUN_OPTS := --interactive) $(TERRASPACE) plan $* ## pre-deploy-setup: Setup resources prior to initial deployment (idempotent) -pre-deploy-setup: logs-init - # Tail terraspace logs in background so we can see output from "all init" - # command to initialize all terraform modules. After initialization is - # complete, kill background process that's following the logs. - tail -f log/init/*.log & $(TERRASPACE) all init; kill $$! - +pre-deploy-setup: all-init # Ensure buckets exist, grab the name of the "internal" bucket, and copy launchpad.pfx there. - $(DOCKER_RUN) $(IMAGE) -ic "bin/ensure-buckets-exist.sh 2>/dev/null | grep internal | xargs bin/copy-launchpad-pfx.sh" + $(DOCKER_RUN) --interactive $(IMAGE) -ic "bin/ensure-buckets-exist.sh 2>/dev/null" ## terraform-doctor-STACK: Fixes "duplicate resource" errors for specified STACK terraform-doctor-%: docker @@ -185,6 +180,10 @@ up-%: logs-init install $(eval DOCKER_RUN_OPTS := --interactive) $(TERRASPACE) up $* | tee -a log/up/$*.log +## update-launchpad: Updates Launchpad certificate and passcode (expects LAUNCHPAD_PFX env var to be set to path to Launchpad certificate file [.pfx]) +update-launchpad: + $(DOCKER_RUN) --interactive $(IMAGE) -ic "bin/update-launchpad-pfx.sh ${LAUNCHPAD_PFX}" + ## validate-STACK: Runs `terraform validate` for specified STACK validate-%: docker $(TERRASPACE) validate $* diff --git a/app/stacks/cumulus/main.tf b/app/stacks/cumulus/main.tf index 8df04cb..bdb388a 100644 --- a/app/stacks/cumulus/main.tf +++ b/app/stacks/cumulus/main.tf @@ -69,10 +69,30 @@ data 
"archive_file" "lambda" { output_path = "${data.external.lambda_archive_exploded.result.dir}/../lambda.zip" } +data "aws_secretsmanager_secret" "launchpad_pfx" { + name = "cumulus-launchpad-pfx" +} + +# When the Launchpad certificate expires, and a new one is obtained (along with +# a new passphrase), we need to update this secret with the contents of the new +# certificate. See OPERATING.md for information on updating the Launchpad cert. +data "aws_secretsmanager_secret_version" "launchpad_pfx" { + secret_id = data.aws_secretsmanager_secret.launchpad_pfx.id +} + #------------------------------------------------------------------------------- # RESOURCES #------------------------------------------------------------------------------- +# When the Launchpad certificate secret is updated, we need to update the +# launchpad.pfx file in the system bucket that Cumulus uses to generate tokens +# to authorize publishing to the CMR (via the PostToCmr Lambda function). +resource "aws_s3_object" "launchpad_pfx" { + bucket = var.system_bucket + key = "${var.prefix}/crypto/launchpad.pfx" + content_base64 = data.aws_secretsmanager_secret_version.launchpad_pfx.secret_string +} + # <% if !in_sandbox? then %> resource "null_resource" "put_bucket_logging" { for_each = toset(concat(local.protected_bucket_names, local.public_bucket_names)) diff --git a/bin/copy-launchpad-pfx.sh b/bin/copy-launchpad-pfx.sh deleted file mode 100755 index f75df25..0000000 --- a/bin/copy-launchpad-pfx.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env bash - -# Make sure launchpad.pfx file is removed locally even if an error occurs. -trap 'rm -f "${_tmpfile}"' EXIT - -set -euo pipefail - -declare _tmpfile=/tmp/launchpad.pfx -declare _dest_key=${CUMULUS_PREFIX}/crypto/launchpad.pfx - -function usage() { - echo "Usage: ${0} BUCKET" - echo - echo "Writes the 'cumulus-launchpad-pfx' secret to the specified" - echo "S3 bucket, at the key ${_dest_key}." 
-} - -function die() { - echo "ERROR: ${1}" 2>&1 - echo - usage - exit 1 -} - -function main() { - [[ ${#} -eq 0 ]] && die "No S3 bucket specified" - [[ ${#} -gt 1 ]] && die "Too many arguments specified" - - # Decode Cumulus Launchpad PFX secret binary key from base64 encoding and - # write it to a temporary file, so we can upload it to S3. - - aws secretsmanager get-secret-value \ - --secret-id cumulus-launchpad-pfx \ - --output text \ - --query SecretBinary | - base64 -d >"${_tmpfile}" - - # Upload Cumulus Launchpad PFX file to S3 location expected by Cumulus. - - aws s3 cp "${_tmpfile}" "s3://${1}/${_dest_key}" -} - -main "$@" diff --git a/bin/transfer-launchpad-pfx-secret.sh b/bin/transfer-launchpad-pfx-secret.sh deleted file mode 100755 index db2aeed..0000000 --- a/bin/transfer-launchpad-pfx-secret.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env bash - -set -Eeuo pipefail - -declare secret_id="cumulus-launchpad-pfx" - -function usage() { - echo "Usage: ${0} SRC_AWS_PROFILE DST_AWS_PROFILE" - echo - echo "Transfers the AWS secret named '${secret_id}' from the source account" - echo "to the destination account, where the accounts are specified by their" - echo "AWS profile names, as defined on your system." 
-} - -function die() { - echo "ERROR: ${1}" 2>&1 - echo - usage - exit 1 -} - -function main() { - [[ ${#} -eq 0 ]] && die "No profiles specified" - [[ ${#} -eq 1 ]] && die "Destination profile not specified" - [[ ${#} -gt 2 ]] && die "Too many arguments specified" - - declare src_aws_profile=${1} - declare dst_aws_profile=${2} - - AWS_PROFILE="${src_aws_profile}" aws secretsmanager get-secret-value \ - --secret-id cumulus-launchpad-pfx \ - --output text \ - --query SecretBinary | - AWS_PROFILE="${dst_aws_profile}" xargs -L1 aws secretsmanager create-secret \ - --name cumulus-launchpad-pfx \ - --secret-binary -} - -main "${@}" diff --git a/bin/update-launchpad-pfx.sh b/bin/update-launchpad-pfx.sh new file mode 100755 index 0000000..0ad36fd --- /dev/null +++ b/bin/update-launchpad-pfx.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash + +# Make sure launchpad.pfx file is removed locally even if an error occurs. +trap 'rm -f "${_launchpad_pfx_base64}"' EXIT + +set -euo pipefail + +declare _launchpad_pfx_base64=/tmp/launchpad.pfx.txt + +function usage() { + echo "Usage: ${0} LAUNCHPAD_PFX" + echo + echo "Updates the Launchpad certificate and passphrase/code that Cumulus uses" + echo "for obtaining Launchpad tokens for auth to publish metadata to the CMR." + echo + echo "LAUNCHPAD_PFX is the path to the new launchpad.pfx certificate file to" + echo "be used, and you will be prompted for the associated passphrase/code." 
+} + +function die() { + echo "ERROR: ${1}" >&2 + echo + usage + exit 1 +} + +function main() { + [[ ${#} -eq 0 ]] && die "No Launchpad certificate file (.pfx) specified" + [[ ${#} -gt 1 ]] && die "Too many arguments specified" + + read -r -p "Enter the passphrase/code for the Launchpad certificate file: " + curl \ + --cert "${1}:${REPLY}" \ + --cert-type P12 https://api.launchpad.nasa.gov/icam/api/sm/v1/gettoken \ + --fail --no-progress-meter >/dev/null + + secret_id=cumulus-launchpad-pfx + + # Convert the Launchpad certificate file to a base64-encoded string written to + # a temporary file. This is necessary because Terraform does not support any + # means of converting a secret binary file to a base64-encoded string, which + # is required when specifying the contents of a binary S3 object. It's an odd + # gap in Terraform's functionality. + base64 -w 0 "${1}" >"${_launchpad_pfx_base64}" + + # Upsert the Launchpad certificate file as a base64-encoded secret string. + # This secret is NOT used by Cumulus itself. Rather, it is simply a convenient + # Cumulus stack-independent location to store the certificate so that it can + # be retrieved and written to a Cumulus stack-specific location during deploy. 
+ if aws secretsmanager describe-secret --secret-id "${secret_id}" >/dev/null 2>&1; then + aws secretsmanager update-secret \ + --secret-id "${secret_id}" \ + --secret-string "file://${_launchpad_pfx_base64}" + else + aws secretsmanager create-secret \ + --name "${secret_id}" \ + --description "Launchpad certificate (base64-encoded) for generating tokens to use for publishing metadata to the CMR" \ + --secret-string "file://${_launchpad_pfx_base64}" + fi + + aws ssm put-parameter \ + --name /shared/cumulus/launchpad-passphrase \ + --type SecureString \ + --overwrite \ + --value "${REPLY}" +} + +main "$@" diff --git a/docs/OPERATING.md b/docs/OPERATING.md index d6168a8..1ffc907 100644 --- a/docs/OPERATING.md +++ b/docs/OPERATING.md @@ -1,5 +1,6 @@ # Operating CSDAP Cumulus +- [Update Launchpad Certificate](#update-launchpad-certificate) - [The Cumulus CLI](#the-cumulus-cli) - [Running Commands](#running-commands) - [Running Against Non-Development Deployments](#running-against-non-development-deployments) @@ -15,6 +16,80 @@ - [Updating CMR Metadata (Self Discovery)](#updating-cmr-metadata-self-discovery) - [Destroying a Deployment](#destroying-a-deployment) +## Update Launchpad Certificate + +When the Launchpad certificate used for generating auth tokens for publishing +metadata to the CMR has expired, the `PostToCmr` Lambda function will always +fail with 401 (Unauthorized) errors. When this happens (ideally, _BEFORE_ it +expires, so `PostToCmr` does _not_ start throwing such errors), the Project +Owner must request a new certificate. Once the request is fulfilled, the +Project Owner should receive a new certificate file (`.pfx`) as well as a new +passphrase/password. 
+ +For every AWS account in which CSDA Cumulus is deployed (sandbox, UAT, and +Prod), the new Launchpad certificate file and its associated passcode must be +updated _once_ per AWS account (regardless of the number of deployments in an +account), using the following commands, where `<path/to/launchpad.pfx>` is the path to the new +Launchpad certificate file (`.pfx`) downloaded from the email resulting from the +completion of the renewal request, relative to your current directory (which +must be the root of the repo). + +You will be prompted for the passcode each time: + +```plain +export LAUNCHPAD_PFX=<path/to/launchpad.pfx> +AWS_PROFILE=csda-cumulus-sbx-7894 make update-launchpad +AWS_PROFILE=csda-cumulus-uat-1686 make update-launchpad +AWS_PROFILE=csda-cumulus-prod-5047 make update-launchpad +``` + +Once all of the commands above run successfully, be sure to delete your local +copy of the `.pfx` file, for security reasons. + +Each `make update-launchpad` command above does the following in the AWS account associated with the +specified AWS profile: + +1. Verifies that the specified certificate/passcode combination is valid by + using it in an attempt to generate a Launchpad token. If successful, the + command continues with the following steps. Otherwise, it fails with an + error message. Failure might typically be because you have entered the + passcode incorrectly, so upon failure, you should double-check the passcode. +1. Creates/updates an AWS Secrets Manager secret named `cumulus-launchpad-pfx` from the + base64-encoded contents of the specified `.pfx` file. +1. Creates/updates an AWS SSM `SecureString` parameter named + `/shared/cumulus/launchpad-passphrase` from the passcode entered at the prompt. + +Once the certificate and passcode are updated, each deployment must be +_redeployed_ in order to pick up the new certificate and passcode. 
During +redeployment, the new value of the `cumulus-launchpad-pfx` secret will be used +to create/replace the S3 object `<prefix>/crypto/launchpad.pfx` within the +deployment's "system" bucket (typically the "internal" bucket). This is where +Cumulus expects to find the Launchpad certificate. + +For sandbox deployments, each developer should redeploy their own deployment +by running `make up-cumulus-yes`. + +To redeploy UAT and Prod, do the following: + +1. Go to the list of [GitHub Actions Cumulus workflow runs]. +1. Find the most recent successful workflow run (ideally, this will be the first + one in the list) and click its title to view the details of the run, where + you should see that deployment to UAT and to Prod both ran successfully. +1. Towards the upper right of the page, click the **Re-run all jobs** button to + trigger deployment to UAT. +1. Once deployment to UAT succeeds, deployment to Prod will be pending manual + approval. At this point, run a smoke test in UAT to determine whether or not + the `PostToCmr` Lambda function succeeds. +1. Once the smoke test in UAT shows that `PostToCmr` succeeds, return to the + page where you previously clicked the **Re-run all jobs** button, where you + should now see a **Review deployments** button. +1. Click the **Review deployments** button to open the "Review pending + deployments" dialog box. +1. In the dialog box, check the box next to "prod", then click the **Approve and + deploy** button. +1. Once deployment to "prod" succeeds, run a smoke test to confirm successful + operation of the `PostToCmr` Lambda function. + ## The Cumulus CLI The [Cumulus CLI] is a command-line interface for performing various Cumulus @@ -728,6 +803,8 @@ you must manually finish any cleanup effort. 
https://nasa.github.io/cumulus/docs/operator-docs/provider [rule]: https://nasa.github.io/cumulus/docs/data-cookbooks/setup#rules +[GitHub Actions Cumulus workflow runs]: + https://github.com/NASA-IMPACT/csdap-cumulus/actions/workflows/main.yml [How to specify a file location in a bucket]: https://nasa.github.io/cumulus/docs/workflows/workflow-configuration-how-to#how-to-specify-a-file-location-in-a-bucket [ISO 8601 Combined date and time representation]: diff --git a/docs/TROUBLESHOOTING.md b/docs/TROUBLESHOOTING.md index c87e612..4a049a7 100644 --- a/docs/TROUBLESHOOTING.md +++ b/docs/TROUBLESHOOTING.md @@ -1,5 +1,7 @@ # Troubleshooting +- [Ingestion](#ingestion) + - [PostToCmr Always Fails with 401 (Unauthorized)](#posttocmr-always-fails-with-401-unauthorized) - [Deployment](#deployment) - [Error creating API Gateway Deployment: BadRequestException: Private REST API doesn't have a resource policy attached to it](#error-creating-api-gateway-deployment-badrequestexception-private-rest-api-doesnt-have-a-resource-policy-attached-to-it) - [Aws::STS::Errors::InvalidClientTokenId: The security token included in the request is invalid](#awsstserrorsinvalidclienttokenid-the-security-token-included-in-the-request-is-invalid) @@ -17,6 +19,18 @@ - [Error Deleting Security Group (DependencyViolation)](#error-deleting-security-group-dependencyviolation) - [Error Deleting RDS Cluster (Cannot delete protected Cluster)](#error-deleting-rds-cluster-cannot-delete-protected-cluster) +## Ingestion + +### PostToCmr Always Fails with 401 (Unauthorized) + +When every request to the CMR fails, it is very likely that the Launchpad +certificate has expired, so you should ask the Project Owner to check the +status. If the certificate has expired, the Project Owner must request a new +certificate. Once the request is fulfilled, the Project Owner should receive a +new certificate file (`.pfx`) as well as a new passphrase/password. 
For +instructions on what to do with the new file and passphrase, see +[OPERATING.md](./OPERATING.md). + ## Deployment ### Error creating API Gateway Deployment: BadRequestException: Private REST API doesn't have a resource policy attached to it