Skip to content

Commit

Permalink
Replace expired Launchpad certificate (#298)
Browse files Browse the repository at this point in the history
Fixes #297
  • Loading branch information
chuckwondo authored Nov 22, 2023
1 parent 47e90f0 commit db373b9
Show file tree
Hide file tree
Showing 8 changed files with 189 additions and 91 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
*.pfx
*.zip
/log

# Created by https://www.toptal.com/developers/gitignore/api/terraform
Expand Down
17 changes: 8 additions & 9 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,12 @@ all-up: logs-init install

## all-up-yes: Deploys all modules (in dependency order) with automatic approval
# Follows log/up/*.log in the background while the deployment runs, then stops
# the background `tail`.  The exit status of the deployment command is captured
# and re-raised; otherwise the recipe would report the status of `kill` and a
# failed deployment would look like a success to make (and to CI).
all-up-yes: logs-init install
	$(eval DOCKER_RUN_OPTS := --interactive)
	tail -f log/up/*.log & $(TERRASPACE) all up --yes; status=$$?; kill $$!; exit $$status

## all-SUBCOMMAND: Runs Terraspace SUBCOMMAND across all stacks (make all-help for list of SUBCOMMANDs)
# The pattern stem ($*) is the SUBCOMMAND, i.e. the target name minus "all-".
all-%: install
	$(eval DOCKER_RUN_OPTS := --interactive)
	$(TERRASPACE) all $*

## bash: Runs bash terminal in Docker container
Expand Down Expand Up @@ -145,22 +147,15 @@ nuke: docker
# Runs `$(TERRASPACE) output` for the stack named by the pattern stem ($*),
# i.e. the part of the target name after "output-".
output-%: docker
$(TERRASPACE) output $*

plan-cumulus: install

## plan-STACK: Runs `terraform plan` for specified STACK
# The $(eval ...) line expands to nothing; it only sets DOCKER_RUN_OPTS before
# the next recipe line is expanded (presumably consumed via $(TERRASPACE), which
# is defined outside this view -- confirm).  $* is the STACK stem.
plan-%: install
$(eval DOCKER_RUN_OPTS := --interactive)
$(TERRASPACE) plan $*

## pre-deploy-setup: Setup resources prior to initial deployment (idempotent)
pre-deploy-setup: logs-init
# Tail terraspace logs in background so we can see output from "all init"
# command to initialize all terraform modules. After initialization is
# complete, kill background process that's following the logs.
tail -f log/init/*.log & $(TERRASPACE) all init; kill $$!

pre-deploy-setup: all-init
# Ensure buckets exist, grab the name of the "internal" bucket, and copy launchpad.pfx there.
$(DOCKER_RUN) $(IMAGE) -ic "bin/ensure-buckets-exist.sh 2>/dev/null | grep internal | xargs bin/copy-launchpad-pfx.sh"
$(DOCKER_RUN) --interactive $(IMAGE) -ic "bin/ensure-buckets-exist.sh 2>/dev/null"

## terraform-doctor-STACK: Fixes "duplicate resource" errors for specified STACK
terraform-doctor-%: docker
Expand All @@ -185,6 +180,10 @@ up-%: logs-init install
$(eval DOCKER_RUN_OPTS := --interactive)
$(TERRASPACE) up $* | tee -a log/up/$*.log

## update-launchpad: Updates Launchpad certificate and passcode (expects LAUNCHPAD_PFX env var to be set to path to Launchpad certificate file [.pfx])
# Fails fast with a clear message when LAUNCHPAD_PFX is unset or blank, and
# single-quotes the path inside the container command so that a certificate
# path containing spaces reaches the script as a single argument.
update-launchpad:
	$(if $(strip $(LAUNCHPAD_PFX)),,$(error LAUNCHPAD_PFX must be set to the path of the Launchpad certificate file [.pfx]))
	$(DOCKER_RUN) --interactive $(IMAGE) -ic "bin/update-launchpad-pfx.sh '$(LAUNCHPAD_PFX)'"

## validate-STACK: Runs `terraform validate` for specified STACK
# $* is the stem matched by % in the target name (the STACK to validate).
validate-%: docker
$(TERRASPACE) validate $*
20 changes: 20 additions & 0 deletions app/stacks/cumulus/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,30 @@ data "archive_file" "lambda" {
output_path = "${data.external.lambda_archive_exploded.result.dir}/../lambda.zip"
}

# Looks up the existing Secrets Manager secret named "cumulus-launchpad-pfx".
# This is a data source (a read-only lookup), so the secret is not created
# here and must already exist in the account before this stack is planned.
data "aws_secretsmanager_secret" "launchpad_pfx" {
name = "cumulus-launchpad-pfx"
}

# When the Launchpad certificate expires, and a new one is obtained (along with
# a new passphrase), we need to update this secret with the contents of the new
# certificate. See OPERATING.md for information on updating the Launchpad cert.
#
# No version_id/version_stage is given, so the provider's default version of
# the secret is read (AWSCURRENT per the AWS provider documentation).
data "aws_secretsmanager_secret_version" "launchpad_pfx" {
secret_id = data.aws_secretsmanager_secret.launchpad_pfx.id
}

#-------------------------------------------------------------------------------
# RESOURCES
#-------------------------------------------------------------------------------

# When the Launchpad certificate secret is updated, we need to update the
# launchpad.pfx file in the system bucket that Cumulus uses to generate tokens
# to authorize publishing to the CMR (via the PostToCmr Lambda function).
resource "aws_s3_object" "launchpad_pfx" {
bucket = var.system_bucket
key = "${var.prefix}/crypto/launchpad.pfx"
# content_base64 (rather than content) is used because the secret string holds
# the base64-encoded certificate (bin/update-launchpad-pfx.sh stores it that
# way), so the S3 object receives the decoded binary .pfx contents.
content_base64 = data.aws_secretsmanager_secret_version.launchpad_pfx.secret_string
}

# <% if !in_sandbox? then %>
resource "null_resource" "put_bucket_logging" {
for_each = toset(concat(local.protected_bucket_names, local.public_bucket_names))
Expand Down
43 changes: 0 additions & 43 deletions bin/copy-launchpad-pfx.sh

This file was deleted.

39 changes: 0 additions & 39 deletions bin/transfer-launchpad-pfx-secret.sh

This file was deleted.

68 changes: 68 additions & 0 deletions bin/update-launchpad-pfx.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#!/usr/bin/env bash

# Make sure the temporary base64-encoded copy of the Launchpad certificate is
# removed locally even if an error occurs. The trap body is single-quoted, so
# the variable is expanded when the trap fires (at exit), not here; that is why
# the trap can be registered before the variable is declared below.
trap 'rm -f "${_launchpad_pfx_base64}"' EXIT

# Exit on any command failure, on use of an unset variable, and when any
# command in a pipeline fails.
set -euo pipefail

# Path of the temporary file that holds the base64-encoded certificate.
declare _launchpad_pfx_base64=/tmp/launchpad.pfx.txt

function usage() {
  # Prints the usage/help text for this script to stdout, one argument per
  # output line (an empty argument yields a blank line).
  printf '%s\n' \
    "Usage: ${0} LAUNCHPAD_PFX" \
    "" \
    "Updates the Launchpad certificate and passphrase/code that Cumulus uses" \
    "for obtaining Launchpad tokens for auth to publish metadata to the CMR." \
    "" \
    "LAUNCHPAD_PFX is the path to the new launchpad.pfx certificate file to" \
    "be used, and you will be prompted for the associated passphrase/code."
}

function die() {
  # Reports a fatal error and terminates the script with exit status 1.
  #
  #   ${1}: the error message to print after the "ERROR: " prefix
  #
  # The error message and the usage text are written to stderr (the original
  # `2>&1` redirected stderr to stdout, so the error went to stdout), keeping
  # stdout clean for callers that capture or pipe it.
  {
    echo "ERROR: ${1}"
    echo
    usage
  } >&2
  exit 1
}

function main() {
  # Validates a new Launchpad certificate/passphrase pair and, if valid, stores
  # both in AWS for the account selected by the current AWS credentials.
  #
  #   ${1}: path to the new Launchpad certificate file (.pfx)
  #
  # Exits non-zero (via die, or via `set -e` on any failing command) if the
  # arguments are wrong, the file is unreadable, the token request fails, or
  # any AWS call fails.
  [[ ${#} -eq 0 ]] && die "No Launchpad certificate file (.pfx) specified"
  [[ ${#} -gt 1 ]] && die "Too many arguments specified"

  local pfx_file="${1}"
  local secret_id=cumulus-launchpad-pfx
  local passphrase

  [[ -r "${pfx_file}" ]] || die "Launchpad certificate file is not readable: ${pfx_file}"

  # -s keeps the passphrase from being echoed to the terminal; IFS= preserves
  # any leading/trailing whitespace exactly as typed. Because -s also swallows
  # the user's newline, emit one afterwards (to stderr, like the prompt).
  IFS= read -r -s -p "Enter the passphrase/code for the Launchpad certificate file: " passphrase
  echo >&2

  # Verify the certificate/passphrase pair by requesting a Launchpad token.
  # --fail makes curl exit non-zero on an HTTP error response, so a rejected
  # certificate aborts the script (via set -e) before anything is stored.
  curl \
    --fail \
    --cert "${pfx_file}:${passphrase}" \
    --cert-type P12 https://api.launchpad.nasa.gov/icam/api/sm/v1/gettoken \
    --no-progress-meter >/dev/null

  # Convert the Launchpad certificate file to a base64-encoded string written to
  # a temporary file. This is necessary because Terraform does not support any
  # means of converting a secret binary file to a base64-encoded string, which
  # is required when specifying the contents of a binary S3 object. It's an odd
  # gap in Terraform's functionality.
  base64 -w 0 "${pfx_file}" >"${_launchpad_pfx_base64}"

  # Upsert the Launchpad certificate file as a base64-encoded secret string.
  # This secret is NOT used by Cumulus itself. Rather, it is simply a convenient
  # Cumulus stack-independent location to store the certificate so that it can
  # be retrieved and written to a Cumulus stack-specific location during deploy.
  if aws secretsmanager describe-secret --secret-id "${secret_id}" >/dev/null 2>&1; then
    aws secretsmanager update-secret \
      --secret-id "${secret_id}" \
      --secret-string "file://${_launchpad_pfx_base64}"
  else
    aws secretsmanager create-secret \
      --name "${secret_id}" \
      --description "Launchpad certificate (base64-encoded) for generating tokens to use for publishing metadata to the CMR" \
      --secret-string "file://${_launchpad_pfx_base64}"
  fi

  # Store the passphrase as an encrypted SSM parameter, replacing any existing
  # value.
  aws ssm put-parameter \
    --name /shared/cumulus/launchpad-passphrase \
    --type SecureString \
    --overwrite \
    --value "${passphrase}"
}

main "$@"
77 changes: 77 additions & 0 deletions docs/OPERATING.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Operating CSDAP Cumulus

- [Update Launchpad Certificate](#update-launchpad-certificate)
- [The Cumulus CLI](#the-cumulus-cli)
- [Running Commands](#running-commands)
- [Running Against Non-Development Deployments](#running-against-non-development-deployments)
Expand All @@ -15,6 +16,80 @@
- [Updating CMR Metadata (Self Discovery)](#updating-cmr-metadata-self-discovery)
- [Destroying a Deployment](#destroying-a-deployment)

## Update Launchpad Certificate

When the Launchpad certificate used for generating auth tokens for publishing
metadata to the CMR has expired, the `PostToCmr` Lambda function will always
fail with 401 (Unauthorized) errors. When this happens (ideally, _BEFORE_ it
expires, so `PostToCmr` does _not_ start throwing such errors), the Project
Owner must request a new certificate. Once the request is fulfilled, the
Project Owner should receive a new certificate file (`.pfx`) as well as a new
passphrase/password.

For every AWS account in which CSDA Cumulus is deployed (sandbox, UAT, and
Prod), the new Launchpad certificate file and its associated passcode must be
updated _once_ per AWS account (regardless of the number of deployments in an
account), using the following commands, where `<pfx>` is the path to the new
Launchpad certificate file (`.pfx`) downloaded from the email resulting from the
completion of the renewal request, relative to your current directory (which
must be the root of the repo).

You will be prompted for the passcode each time:

```plain
export LAUNCHPAD_PFX=<pfx>
AWS_PROFILE=csda-cumulus-sbx-7894 make update-launchpad
AWS_PROFILE=csda-cumulus-uat-1686 make update-launchpad
AWS_PROFILE=csda-cumulus-prod-5047 make update-launchpad
```

Once all of the commands above run successfully, be sure to delete your local
copy of the `.pfx` file, for security reasons.

Each of the `make update-launchpad` commands above does the following in the
AWS account associated with the specified AWS profile:

1. Verifies that the specified certificate/passcode combination is valid by
using it in an attempt to generate a Launchpad token. If successful, the
command continues with the following steps. Otherwise, it fails with an
error message. The most likely cause of failure is a mistyped passcode, so
upon failure, you should double-check the passcode.
1. Creates/updates an AWS Secrets Manager secret named `cumulus-launchpad-pfx`
containing the base64-encoded contents of the specified `.pfx` file.
1. Creates/updates an AWS SSM `SecureString` parameter named
`/shared/cumulus/launchpad-passphrase` from the passcode entered at the prompt.

Once the certificate and passcode are updated, each deployment must be
_redeployed_ in order to pick up the new certificate and passcode. During
redeployment, the new value of the `cumulus-launchpad-pfx` secret will be used
to create/replace the S3 object `<prefix>/crypto/launchpad.pfx` within the
deployment's "system" bucket (typically the "internal" bucket). This is where
Cumulus expects to find the Launchpad certificate.

For sandbox deployments, each developer should redeploy their own deployment
by running `make up-cumulus-yes`.

To redeploy UAT and Prod, do the following:

1. Go to the list of [GitHub Actions Cumulus workflow runs]
1. Find the most recent successful workflow run (ideally, this will be the first
one in the list) and click its title to view the details of the run, where
you should see that deployment to UAT and to Prod both ran successfully.
1. Towards the upper right of the page, click the **Re-run all jobs** button to
trigger deployment to UAT.
1. Once deployment to UAT succeeds, deployment to Prod will be pending manual
approval. At this point, run a smoke test in UAT to determine whether or not
the `PostToCmr` Lambda function succeeds.
1. Once the smoke test in UAT shows that `PostToCmr` succeeds, return to the
page where you previously clicked the **Re-run all jobs** button, where you
should now see a **Review deployments** button.
1. Click the **Review deployments** button to open the "Review pending
deployments" dialog box.
1. On the dialog box, check the box next to "prod", then click the **Approve and
deploy** button.
1. Once deployment to "prod" succeeds, run a smoke test to confirm successful
operation of the `PostToCmr` Lambda function.

## The Cumulus CLI

The [Cumulus CLI] is a command-line interface for performing various Cumulus
Expand Down Expand Up @@ -728,6 +803,8 @@ you must manually finish any cleanup effort.
https://nasa.github.io/cumulus/docs/operator-docs/provider
[rule]:
https://nasa.github.io/cumulus/docs/data-cookbooks/setup#rules
[GitHub Actions Cumulus workflow runs]:
https://github.com/NASA-IMPACT/csdap-cumulus/actions/workflows/main.yml
[How to specify a file location in a bucket]:
https://nasa.github.io/cumulus/docs/workflows/workflow-configuration-how-to#how-to-specify-a-file-location-in-a-bucket
[ISO 8601 Combined date and time representation]:
Expand Down
14 changes: 14 additions & 0 deletions docs/TROUBLESHOOTING.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Troubleshooting

- [Ingestion](#ingestion)
- [PostToCmr Always Fails with 401 (Unauthorized)](#posttocmr-always-fails-with-401-unauthorized)
- [Deployment](#deployment)
- [Error creating API Gateway Deployment: BadRequestException: Private REST API doesn't have a resource policy attached to it](#error-creating-api-gateway-deployment-badrequestexception-private-rest-api-doesnt-have-a-resource-policy-attached-to-it)
- [Aws::STS::Errors::InvalidClientTokenId: The security token included in the request is invalid](#awsstserrorsinvalidclienttokenid-the-security-token-included-in-the-request-is-invalid)
Expand All @@ -17,6 +19,18 @@
- [Error Deleting Security Group (DependencyViolation)](#error-deleting-security-group-dependencyviolation)
- [Error Deleting RDS Cluster (Cannot delete protected Cluster)](#error-deleting-rds-cluster-cannot-delete-protected-cluster)

## Ingestion

### PostToCmr Always Fails with 401 (Unauthorized)

When every request to the CMR fails, it is very likely that the Launchpad
certificate has expired, so you should ask the Project Owner to check the
status. If the certificate has expired, the Project Owner must request a new
certificate. Once the request is fulfilled, the Project Owner should receive a
new certificate file (`.pfx`) as well as a new passphrase/password. For
instructions on what to do with the new file and passphrase, see
[OPERATING.md](./OPERATING.md).

## Deployment

### Error creating API Gateway Deployment: BadRequestException: Private REST API doesn't have a resource policy attached to it
Expand Down

0 comments on commit db373b9

Please sign in to comment.