diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..c0ee48f --- /dev/null +++ b/.flake8 @@ -0,0 +1,3 @@ +[flake8] +max-line-length=80 +exclude=.git,__pycache__,.venv diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..91b747e --- /dev/null +++ b/.gitignore @@ -0,0 +1,146 @@ +target/ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# static files generated from Django application using `collectstatic` +media +static + +.idea/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 939e534..e69de29 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,28 +0,0 @@ -# How to Contribute - -We'd love to accept your patches and contributions to this project. There are -just a few small guidelines you need to follow. - -## Contributor License Agreement - -Contributions to this project must be accompanied by a Contributor License -Agreement. You (or your employer) retain the copyright to your contribution; -this simply gives us permission to use and redistribute your contributions as -part of the project. Head over to to see -your current agreements on file or to sign a new one. - -You generally only need to submit a CLA once, so if you've already submitted one -(even if it was for a different project), you probably don't need to do it -again. - -## Code reviews - -All submissions, including submissions by project members, require review. 
We -use GitHub pull requests for this purpose. Consult -[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more -information on using pull requests. - -## Community Guidelines - -This project follows [Google's Open Source Community -Guidelines](https://opensource.google.com/conduct/). diff --git a/LICENSE b/LICENSE index f433b1a..a5f721e 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,3 @@ - Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ @@ -175,3 +174,28 @@ of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2019 Google LLC + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..ee5c242 --- /dev/null +++ b/Makefile @@ -0,0 +1,40 @@ +# Copyright 2018 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Make will use bash instead of sh +SHELL := /usr/bin/env bash + +# The .PHONY directive tells make that this isn't a file target +.PHONY: fmt +fmt: ## Format files, including README + @$$SHELL ./helpers/format.sh + +help: ## Prints help for targets with comments + @grep -E '^[a-zA-Z._-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "make \033[36m%- 30s\033[0m %s\n", $$1, $$2}' + +.PHONY: test +test: ## Test if all files are properly formatted + @$$SHELL ./helpers/check_format.sh && python3 -m flake8 --max-line-length=100 && ./helpers/run_tests.sh + +.PHONY: precommit +precommit: ## Test if all files are properly formatted + @$$SHELL ./helpers/check_format.sh && python3 -m flake8 --max-line-length=100 && ./helpers/run_relevant_cloudbuilds.sh precommit_cloudbuild.yaml + +.PHONY: push_ci_image +push_ci_image: + @cd ci && gcloud builds submit --project=datapipelines-ci-282719 --tag gcr.io/datapipelines-ci-282719/make . 
+
+.PHONY: push_deploydags_image
+push_deploydags_image:
+	@cd composer/cloudbuild/go/dagsdeployer && gcloud builds submit --project=datapipelines-ci-282719 --tag gcr.io/datapipelines-ci-282719/deploydags .
+
diff --git a/README.md b/README.md
index ceaff0b..4257295 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,346 @@
-# CI/CD for data processing workflow
-This repository contains source code for the guide on how to use Cloud Build and Cloud Composer to create a CI/CD pipeline for building, deployment and testing of a data processing workflow.
+# Data Pipelines CI/CD Mono Repo
+This repo provides an example of using [Cloud Build](https://cloud.google.com/cloud-build/)
+to build and deploy artifacts for various GCP data and analytics (D&A) technologies.
+The repo includes a Terraform directory to spin up infrastructure as well as
+Cloud Build Triggers which automate the deployment of new commits to
+master.

-Please refer to the solution guide for the steps to run the code: [solution
-tutorial](https://cloud.google.com/solutions/cicd-pipeline-for-data-processing)
+## GCP Project Structure
+This example focuses on CI checks on PRs, artifact staging and production deployment.
+1. CI: Houses infrastructure similar to production to facilitate Continuous
+Integration tests on PRs.
+1. Artifacts: Houses built artifacts (such as images, executables, etc.) that
+passed all CI checks. Pushed from CI; pulled by Prod.
+1. Production: Where the workload runs that actually serves the business.
+
+The formal [similarity](https://en.wikipedia.org/wiki/Similarity_(geometry))
+between CI and production is enforced because both are provisioned with the
+same terraform, just with different inputs.
+This includes pointing to different projects / buckets. It might also include
+sizing differences in the Composer environment for a production-scale workload.
+
+This project uses [terragrunt](https://terragrunt.gruntwork.io/) to manage the
+ci, artifacts and production projects, keep terraform configs and backends DRY,
+and handle passing dependencies between the terraform states. This was chosen as
+an OSS alternative to Terraform Enterprise.
+
+CI/CD for IaC is a topic of its own and is only included here for
+reproducibility and example's sake.
+
+In many organizations, there is a concept of a "QA" or "Staging" project /
+environment where additional manual validation is done.
+The concepts in this repo can be extended to accommodate such a structure
+by adding a directory under terraform with a `terragrunt.hcl` file that
+handles inputs and dependencies.
+
+## Flow
+### Development Flow
+1. Prepare changes and run `make test` to run static / unit tests locally.
+1. Open a PR. Unit and style checks will run automatically.
+1. A maintainer's `/gcbrun` comment triggers the CI process (below) in the CI project.
+1. Fix anything that is causing the build to fail (this could include adding
+new build steps if necessary).
+1. A successful CI run pushes artifacts to the artifacts project.
+Images go to GCR, JARs go to GCS with a `SHORT_SHA` prefix.
+
+### Deployment Flow
+Run any necessary large-scale integration testing or manual confirmation of the
+CI environment. These tests do not fit comfortably in the Cloud Build 10 minute
+timeout and were out of scope for this example, but could also be automated in a
+more persistent CI framework like Spinnaker, Jenkins or GitLab.
+Run the root cloudbuild with the production substitution values.
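+
+As a rough sketch (assuming `cd/prod.yaml` is the production build config, and
+using made-up project, bucket and environment names), that invocation could
+look something like:
+```bash
+# Submit the production build from the repo root.
+# All values below are illustrative placeholders, not real resources.
+gcloud builds submit . \
+  --project=my-prod-project \
+  --config=cd/prod.yaml \
+  --substitutions=_COMPOSER_REGION=us-central1,_COMPOSER_ENV_NAME=prod-composer,_DATAFLOW_JAR_BUCKET=my-prod-dataflow-jars,_COMPOSER_DAG_BUCKET=gs://my-prod-composer-bucket/
+```
+`cd/prod.yaml` also expects the other substitutions defined in that file (the
+`_WORDCOUNT_*` and Dataflow/artifact bucket values); they are omitted here for brevity.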
+
+## Precommit and Postcommit "Discovery"
+Each directory in this repo containing code to be tested with a precommit and/or
+deployed with a postcommit can be picked up by the build discovery script
+defined in `./helpers/run_relevant_cloudbuilds.sh` by defining the following:
+1. a `precommit_cloudbuild.yaml`: defines unit tests and static analysis beyond
+what the repo enforces.
+1. a `cloudbuild.yaml`: runs integration tests, deploys artifacts and updates the
+references needed for system tests. For example, build a Dataflow jar and
+update the Airflow Variable in the Composer Environment that tells the DAG what jar
+to run.
+
+All nested cloudbuilds should assume they run from the root of the repo and set
+`dir` accordingly.
+
+### Precommit
+The precommit should run without substitutions.
+
+### Cloud Build
+The Cloud Build should accept the following substitutions:
+- `_COMPOSER_REGION`
+- `_COMPOSER_ENV_NAME`
+- `_DATAFLOW_JAR_BUCKET`
+
+The precommit will be run on every PR that includes changes under that file tree.
+The build will deploy to the CI environment on a "/gcbrun" comment.
+
+## The Cloud Build CI Process
+1. init-git-repo: initialize the git repository.
+1. merge-master: merge the master branch so we test post-merge code.
+1. run-builds: search for postcommit `cloudbuild.yaml`s to run using `helpers/run_relevant_cloudbuilds.sh`.
+1. deploy-sql-queries-for-composer: Copy the BigQuery SQL to the dags folder of the target Composer Environment.
+1. deploy-airflowignore: Copies an [`.airflowignore`](https://airflow.apache.org/docs/stable/concepts.html#airflowignore)
+to ignore non-dag definition files (like sql files) in the dag parser.
+1. deploy-test-input-file: Copies a file to GCS (just for the example purpose of this DAG).
+1. deploy-test-ref-file: Copies a file to GCS (just for the example purpose of this DAG).
+1. stage-airflow-variables: Copies the rendered AirflowVariables.json file to the Cloud Composer workers.
+1. import-airflow-variables: Imports the rendered AirflowVariables.json file to the Cloud Composer Environment.
+1. deploy-custom-plugins: Copy the source code for the Airflow plugins to the `plugins/` directory of
+the Composer Bucket.
+1. stage-for-integration-test: Copy the airflow dags to a `data/test/` directory in the Composer
+environment for the integration test.
+1. dag-parse-integration-test: Run `list_dags` on the `data/test/` directory in the Composer
+environment. This validates that dags don't refer to variables or connections that don't exist in the target environment.
+1. clean-up-data-dir-dags: Clean up the integration test artifacts.
+1. pull-deploydags: Pull the existing deploydags image (to facilitate caching if possible).
+1. build-deploydags: Build the golang `deploydags` application
+(documented in `composer/cloudbuild/README.md`).
+1. run-deploydags: Run the deploydags application.
+
+
+## Setup Cloud Shell Development Environment (for example's sake)
+Install terragrunt and ensure java 8.
+```bash +sudo ./helpers/init_cloudshell.sh +``` + + +You can confirm things look roughly like this: +``` +# Python for airflow / beam development +$ python3 --version +Python 3.7.3 + +# Java for beam development +$ mvn -version +mvn -version +Apache Maven 3.6.3 (cecedd343002696d0abb50b32b541b8a6ba2883f) +Maven home: /opt/maven +Java version: 1.8.0_232, vendor: Oracle Corporation, runtime: /usr/lib/jvm/java-8-openjdk-amd64/jre +Default locale: en_US, platform encoding: UTF-8 +OS name: "linux", version: "4.19.112+", arch: "amd64", family: "uni" + +# Golang for modifying deploydags app +$ go version +go version go1.14.4 linux/amd64 + +# Terragrunt / Terraform for IaC for the projects +$ terraform -version +Terraform v0.12.24 + +$ terragrunt -version +terragrunt version v0.23.24 +``` + +To setup python dependencies for running the tests: +```bash +python3 -m venv .venv +source .venv/bin/activate +pip install -r requirements-dev.txt +cd composer +python3 -m pytest +``` + +### Formatting Code Locally +Runs `go fmt`, `yapf`, `google-java-format`, `terraform fmt` on appropriate files. +```bash +make fmt +``` + +### Running Tests Locally +Runs linters, static code analysis and unit tests. +```bash +make test +``` + +### Pushing a new version of the deploydags golang application +Changes to the deploydags golang app can be pushed with +```bash +make push_deploydags_image +``` + +## Repo Structure +``` +. +├── bigquery +│   ├── precommit_cloudbuild.yaml +│   ├── README.md +│   ├── sql +│   │   └── shakespeare_top_25.sql +│   └── tests +│   └── test_sql.sh +├── cd +│   └── prod.yaml +├── ci +│   └── Dockerfile +├── cloudbuild.yaml +├── composer +│   ├── cloudbuild +│   │   ├── bin +│   │   │   └── run_tests.sh +│   │   ├── go +│   │   │   └── dagsdeployer +│   │   │   ├── cmd +│   │   │   │   └── deploydags +│   │   │   │   ├── deploydags +│   │   │   │   └── main.go +│   │   │   ├── Dockerfile +│   │   │   ├── go.mod +│   │   │   ├── go.sum +│   │   │   └── internal +│   │   │   ├── composerdeployer +│   │   │   │   ├── composer_ops.go +│   │   │   │   └── composer_ops_test.go +│   │   │   └── gcshasher +│   │   │   ├── gcs_hash.go +│   │   │   ├── gcs_hash_test.go +│   │   │   └── testdata +│   │   │   ├── test_diff.txt +│   │   │   └── test.txt +│   │   ├── Makefile +│   │   └── README.md +│   ├── config +│   │   ├── AirflowVariables.json +│   │   └── ci_dags.txt +│   ├── dags +│   │   ├── support-files +│   │   │   ├── input.txt +│   │   │   └── ref.txt +│   │   ├── tutorial.py +│   │   └── wordcount_dag.py +│   ├── deploydags +│   ├── __init__.py +│   ├── plugins +│   │   └── xcom_utils_plugin +│   │   ├── __init__.py +│   │   ├── operators +│   │   │   ├── compare_xcom_maps.py +│   │   │   ├── __init__.py +│   │   │   └── __pycache__ +│   │   │   ├── compare_xcom_maps.cpython-37.pyc +│   │   │   └── __init__.cpython-37.pyc +│   │   └── __pycache__ +│   │   └── __init__.cpython-37.pyc +│   ├── precommit_cloudbuild.yaml +│   ├── __pycache__ +│   │   └── __init__.cpython-37.pyc +│   ├── requirements-dev.txt +│   └── tests +│   ├── __init__.py +│   ├── __pycache__ +│   │   ├── __init__.cpython-37.pyc +│   │   ├── test_compare_xcom_maps.cpython-37.pyc +│   │   ├── test_compare_xcom_maps.cpython-37-pytest-5.4.3.pyc +│   │   ├── test_dag_validation.cpython-37.pyc +│   │   └── test_dag_validation.cpython-37-pytest-5.4.3.pyc +│   ├── test_compare_xcom_maps.py +│   └── test_dag_validation.py +├── CONTRIBUTING.md +├── dataflow +│   └── java +│   └── wordcount +│   ├── cloudbuild.yaml +│   ├── pom.xml +│   
├── precommit_cloudbuild.yaml +│   ├── src +│   │   ├── main +│   │   │   └── java +│   │   │   └── org +│   │   │   └── apache +│   │   │   └── beam +│   │   │   └── examples +│   │   │   └── WordCount.java +│   │   └── test +│   │   └── java +│   │   └── org +│   │   └── apache +│   │   └── beam +│   │   └── examples +│   │   └── WordCountTest.java +│   └── target +│   ├── classes +│   │   └── org +│   │   └── apache +│   │   └── beam +│   │   └── examples +│   │   ├── WordCount$CountWords.class +│   │   ├── WordCount$ExtractWordsFn.class +│   │   ├── WordCount$FormatAsTextFn.class +│   │   ├── WordCount$WordCountOptions.class +│   │   └── WordCount.class +│   ├── generated-sources +│   │   └── annotations +│   ├── generated-test-sources +│   │   └── test-annotations +│   ├── maven-archiver +│   │   └── pom.properties +│   ├── maven-status +│   │   └── maven-compiler-plugin +│   │   ├── compile +│   │   │   └── default-compile +│   │   │   ├── createdFiles.lst +│   │   │   └── inputFiles.lst +│   │   └── testCompile +│   │   └── default-testCompile +│   │   ├── createdFiles.lst +│   │   └── inputFiles.lst +│   ├── surefire-reports +│   │   ├── org.apache.beam.examples.WordCountTest-output.txt +│   │   ├── org.apache.beam.examples.WordCountTest.txt +│   │   └── TEST-org.apache.beam.examples.WordCountTest.xml +│   ├── test-classes +│   │   └── org +│   │   └── apache +│   │   └── beam +│   │   └── examples +│   │   └── WordCountTest.class +│   ├── word-count-beam-0.1.jar +│   └── word-count-beam-bundled-0.1.jar +├── helpers +│   ├── check_format.sh +│   ├── exclusion_list.txt +│   ├── format.sh +│   ├── init_cloudshell.sh +│   ├── init_git_repo.sh +│   ├── run_relevant_cloudbuilds.sh +│   └── run_tests.sh +├── LICENSE +├── license-templates +│   └── LICENSE.txt +├── Makefile +├── precommit_cloudbuild.yaml +├── README.md +├── scripts +│   ├── get_composer_properties.sh +│   └── set_env.sh +└── terraform + ├── artifacts + │   ├── backend.tf + │   ├── main.tf + │   ├── outputs.tf + │   ├── README.md + │   ├── terragrunt.hcl + │   └── variables.tf + ├── backend.tf + ├── ci + │   └── terragrunt.hcl + ├── datapipelines-infra + │   ├── backend.tf + │   ├── composer.tf + │   ├── gcs.tf + │   ├── network.tf + │   ├── outputs.tf + │   ├── prod.tfvars + │   ├── README.md + │   ├── services.tf + │   ├── terragrunt.hcl + │   ├── variables.tf + │   └── versions.tf + ├── prod + │   └── terragrunt.hcl + └── terragrunt.hcl + +74 directories, 103 files +``` diff --git a/bigquery/README.md b/bigquery/README.md new file mode 100644 index 0000000..6ef7022 --- /dev/null +++ b/bigquery/README.md @@ -0,0 +1,3 @@ +# BigQuery +Store sql files under teh `sql` directory. +If your sql contains jinja templates add a json file with substitution values that will make the query pass in the CI project. diff --git a/bigquery/precommit_cloudbuild.yaml b/bigquery/precommit_cloudbuild.yaml new file mode 100644 index 0000000..774b92a --- /dev/null +++ b/bigquery/precommit_cloudbuild.yaml @@ -0,0 +1,8 @@ +steps: +# Dry Run SQL. 
+- name: 'google/cloud-sdk' + waitFor: ['-'] + dir: 'bigquery' + entrypoint: 'bash' + args: ['tests/test_sql.sh'] + id: 'test-sql-queries' diff --git a/bigquery/sql/shakespeare_top_25.sql b/bigquery/sql/shakespeare_top_25.sql new file mode 100644 index 0000000..8aa1cc7 --- /dev/null +++ b/bigquery/sql/shakespeare_top_25.sql @@ -0,0 +1,11 @@ +#standardSQL +SELECT + word, + SUM(word_count) as wc +FROM + `bigquery-public-data.samples.shakespeare` +GROUP BY + word +ORDER BY + wc DESC +LIMIT 25 diff --git a/bigquery/tests/test_sql.sh b/bigquery/tests/test_sql.sh new file mode 100755 index 0000000..ad16d30 --- /dev/null +++ b/bigquery/tests/test_sql.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# Copyright 2019 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +# $1 is a query string to dry_run +function dry_run_query() { + bq query \ + --use_legacy_sql=false \ + --dry_run \ + "$1" +} + +while IFS= read -r query_file +do + echo "$query_file" + dry_run_query "$(cat "$query_file")" + result="$?" + if [ "$result" -ne 0 ]; then + echo "Failed to dry run $query_file" + exit "$result" + fi +done < <(find ./sql -path "*.sql") + diff --git a/cd/prod.yaml b/cd/prod.yaml new file mode 100644 index 0000000..aaa7a64 --- /dev/null +++ b/cd/prod.yaml @@ -0,0 +1,152 @@ +# Copyright 2019 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +steps: +# [Dataflow] +# Stage JARs on GCS. +- name: gcr.io/cloud-builders/gsutil + args: [ + 'cp', + 'gs://${_DATAFLOW_ARTIFACT_BUCKET}/${_CI_BUILD_ID}/wordcount.jar' + 'gs://${_DATAFLOW_JAR_BUCKET}/wordcount.jar'] + id: 'deploy-wordcount-jar' +# [BigQuery] +# Copy SQL to DAGs folder in prod. +- name: gcr.io/cloud-builders/gsutil + waitFor: ['test-sql-queries'] + args: [ + 'rsync','-r', '-d', + 'sql', '${_COMPOSER_DAG_BUCKET}dags/sql' + ] + dir: './bigquery/' + id: 'deploy-sql-queries-for-composer' +# [Composer] +# Render AirflowVariables.json with production values +- name: 'gcr.io/${PROJECT_ID}/envsubst' + waitFor: ['-'] + env: [ + "GCP_PROJECT_ID=${PROJECT_ID}", + "COMPOSER_REGION=${_COMPOSER_REGION}", + "DATAFLOW_JAR_BUCKET=${_DATAFLOW_JAR_BUCKET}", + "INPUT_BUCKET=${_WORDCOUNT_INPUT_BUCKET}", + "REF_BUCKET=${_WORDCOUNT_REF_BUCKET}", + "RESULT_BUCKET=${_WORDCOUNT_RESULT_BUCKET}", + "DATAFLOW_STAGING_BUCKET=${_DATAFLOW_STAGING_BUCKET}", + ] + args: ['AirflowVariables.json'] + dir: './composer/config' + id: 'render-airflow-variables' +# Add .airflowignore to GCS prod DAGs folder. 
+- name: gcr.io/cloud-builders/gcloud + waitFor: ['run-unit-tests'] + args: [ + 'composer', 'environments', 'storage', 'dags', 'import', + '--source','.airflowignore', + '--environment', '${_COMPOSER_ENV_NAME}', + '--location', '${_COMPOSER_REGION}' + ] + dir: './composer/dags/' + id: 'deploy-airflowignore' +# Stage AirflowVariables.json to data directory to be synced to workers. +- name: gcr.io/cloud-builders/gcloud + waitFor: ['render-airflow-variables'] + args: [ + 'composer', 'environments', 'storage', 'data', 'import', + '--location=${_COMPOSER_REGION}', + '--environment=${_COMPOSER_ENV_NAME}', + '--source','AirflowVariables.json', + '--destination', 'config' + ] + dir: './composer/config/' + id: 'stage-airflow-variables' +# Import AirflowVariables.json +- name: gcr.io/cloud-builders/gcloud + waitFor: ['stage-airflow-variables'] + args: [ + 'composer', 'environments', 'run', + '--location=${_COMPOSER_REGION}', + '${_COMPOSER_ENV_NAME}', + 'variables', '--', + '--import', '/home/airflow/gcs/data/config/AirflowVariables.json' + ] + id: 'import-airflow-variables' +# Override JAR reference variable to the artifact built in this build. +- name: gcr.io/cloud-builders/gcloud + args: [ + 'composer', 'environments', 'run', + '--location', '${_COMPOSER_REGION}', + '${_COMPOSER_ENV_NAME}', + 'variables', '--', + '--set', 'dataflow_jar_file_test', 'wordcount.jar' + ] + id: 'set-composer-test-jar-ref' +# Sync plugins to GCS plugins dir +- name: gcr.io/cloud-builders/gsutil + args: [ + 'rsync','-r', '-d', + 'plugins/', + '${_COMPOSER_DAG_BUCKET}plugins' + ] + dir: './composer/' + id: 'deploy-custom-plugins' +# Sync DAGs to data dir for integration test parsing in target Composer Environment. +- name: gcr.io/cloud-builders/gsutil + waitFor: ['deploy-custom-plugins'] + args: [ + 'rsync','-r', '-d', + 'dags/', + '${_COMPOSER_DAG_BUCKET}data/test-dags/$BUILD_ID' + ] + dir: './composer/' + id: 'stage-for-integration-test' +# Run integration tests parsing in target Composer Environment. +- name: gcr.io/cloud-builders/gcloud + waitFor: ['stage-for-integration-test'] + args: [ + 'composer', 'environments', 'run', + '--location', '${_COMPOSER_REGION}', + '${_COMPOSER_ENV_NAME}', + 'list_dags', '--', + '-sd', '/home/airflow/gcs/data/test-dags/$BUILD_ID' + ] + id: 'dag-parse-integration-test' +# Clean up. +- name: gcr.io/cloud-builders/gsutil + waitFor: ['dag-parse-integration-test'] + args: [ + '-m', 'rm','-r', + '${_COMPOSER_DAG_BUCKET}data/test-dags/$BUILD_ID' + ] + dir: './composer/' + id: 'clean-up-data-dir-dags' +# Run dags deployer golang app. +- name: gcr.io/${_CI_PROJECT_ID}/deploydags + dir: './composer' + waitFor: [ + 'run-style-and-unit-tests', + 'build-deploydags', + 'clean-up-data-dir-dags', + 'deploy-wordcount-jar' + ] + args: [ + '-dagList=./config/running_dags.txt', + '-dagsFolder=./dags', + '-project=${PROJECT_ID}', + '-region=${_COMPOSER_REGION}', + '-composerEnv=${_COMPOSER_ENV_NAME}', + '-dagBucketPrefix=${_COMPOSER_DAG_BUCKET}dags', + '-replace' + ] + id: 'run-deploydags' +options: + machineType: 'N1_HIGHCPU_8' diff --git a/ci/Dockerfile b/ci/Dockerfile new file mode 100644 index 0000000..830e485 --- /dev/null +++ b/ci/Dockerfile @@ -0,0 +1,29 @@ +# This Dockerfile builds the image used in Cloud Build CI to run 'make test'. 
+ +FROM python:buster + +# install core tools +RUN apt-get update && apt-get install -y build-essential + +RUN curl -sSL https://sdk.cloud.google.com | bash + +# install shellcheck +RUN apt-get install shellcheck + +# install yapf +RUN pip3 install yapf flake8 pytest apache-airflow[gcp]==1.10.6 + +# install golang (+gofmt) +RUN apt-get install -y golang + +# Install java + google-java-format jar +RUN apt-get install -y default-jdk maven +RUN wget https://github.com/google/google-java-format/releases/download/google-java-format-1.7/google-java-format-1.7-all-deps.jar --directory-prefix=/usr/share/java/ + +# install terraform (+fmt ) +RUN wget https://releases.hashicorp.com/terraform/0.12.24/terraform_0.12.24_linux_amd64.zip \ + && unzip terraform_0.12.24_linux_amd64.zip \ + && mv terraform /usr/bin \ + && rm terraform_0.12.24_linux_amd64.zip + +ENTRYPOINT ["make"] diff --git a/cloudbuild.yaml b/cloudbuild.yaml new file mode 100644 index 0000000..aca6271 --- /dev/null +++ b/cloudbuild.yaml @@ -0,0 +1,216 @@ +# Copyright 2019 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +steps: +- name: 'gcr.io/cloud-builders/git' + entrypoint: 'bash' + args: [ + './helpers/init_git_repo.sh' + ] + env: [ + 'COMMIT_SHA=${COMMIT_SHA}', + 'BASE_REPO_URL=https://github.com/jaketf/ci-cd-for-data-processing-workflow.git', + 'BASE_BRANCH=${_BASE_BRANCH}', + 'PR_NUMBER=${_PR_NUMBER}' + ] + id: 'init-git-repo' +# Merge Master because this is a post-commit +- name: 'gcr.io/cloud-builders/git' + args: ['merge', 'origin/master'] + waitFor: ['init-git-repo'] + id: 'merge-master' +# Run linters and relevant cloudbuilds (we have to do this again in case merge +# broke a unit test or static check) +# Individual cloudbuild.yaml's (e.g. for a dataflow pipeline) are responsible +# for pushing their own artifacts to the appropriate gcs location in the +# artifacts project and updating the necessary references in composer. +- name: 'google/cloud-sdk' + waitFor: ['merge-master'] + entrypoint: 'bash' + args: [ + './helpers/run_relevant_cloudbuilds.sh', + 'cloudbuild.yaml', + '--substitutions=_SHORT_SHA=${SHORT_SHA},_COMPOSER_REGION=${_COMPOSER_REGION},_COMPOSER_ENV_NAME=${_COMPOSER_ENV_NAME},_DATAFLOW_JAR_BUCKET=${_DATAFLOW_JAR_BUCKET}' + ] + id: 'run-builds' +# [BigQuery] +# Copy SQL to DAGs folder. 
+- name: 'google/cloud-sdk' + waitFor: ['run-builds'] + entrypoint: 'gsutil' + args: [ + 'rsync','-r', '-d', + 'sql', '${_COMPOSER_DAG_BUCKET}dags/sql' + ] + dir: './bigquery/' + id: 'deploy-sql-queries-for-composer' +# [Composer] +# Render AirflowVariables.json +- name: 'gcr.io/${PROJECT_ID}/envsubst' + waitFor: ['-'] + env: [ + "GCP_PROJECT_ID=${PROJECT_ID}", + "COMPOSER_REGION=${_COMPOSER_REGION}", + "DATAFLOW_JAR_BUCKET=${_DATAFLOW_ARTIFACTS_BUCKET}", + "INPUT_BUCKET=${_WORDCOUNT_INPUT_BUCKET}", + "REF_BUCKET=${_WORDCOUNT_REF_BUCKET}", + "RESULT_BUCKET=${_WORDCOUNT_RESULT_BUCKET}", + "DATAFLOW_STAGING_BUCKET=${_DATAFLOW_STAGING_BUCKET}", + ] + args: ['AirflowVariables.json'] + dir: './composer/config' + id: 'render-airflow-variables' +# Add .airflowignore to GCS DAGs folder. +- name: 'google/cloud-sdk' + waitFor: ['run-builds'] + entrypoint: 'gcloud' + args: [ + 'composer', 'environments', 'storage', 'dags', 'import', + '--source','.airflowignore', + '--environment', '${_COMPOSER_ENV_NAME}', + '--location', '${_COMPOSER_REGION}' + ] + dir: './composer/dags/' + id: 'deploy-airflowignore' +# Stage files for running the example. +- name: 'google/cloud-sdk' + waitFor: ['-'] + entrypoint: 'gsutil' + args: [ + 'cp', + 'support-files/input.txt', + 'gs://${_WORDCOUNT_INPUT_BUCKET}' + ] + dir: './composer/dags' + id: 'deploy-test-input-file' +- name: 'google/cloud-sdk' + waitFor: ['-'] + entrypoint: 'gsutil' + args: [ + 'cp', + 'support-files/ref.txt', + 'gs://${_WORDCOUNT_REF_BUCKET}' + ] + dir: './composer/dags' + id: 'deploy-test-ref-file' +# Stage AirflowVariables.json to data directory to be synced to workers. +- name: 'google/cloud-sdk' + waitFor: ['render-airflow-variables'] + entrypoint: 'gcloud' + args: [ + 'composer', 'environments', 'storage', 'data', 'import', + '--location=${_COMPOSER_REGION}', + '--environment=${_COMPOSER_ENV_NAME}', + '--source','AirflowVariables.json', + '--destination', 'config' + ] + dir: './composer/config/' + id: 'stage-airflow-variables' +# Import AirflowVariables.json +- name: 'google/cloud-sdk' + waitFor: ['stage-airflow-variables'] + entrypoint: 'gcloud' + args: [ + 'composer', 'environments', 'run', + '--location=${_COMPOSER_REGION}', + '${_COMPOSER_ENV_NAME}', + 'variables', '--', + '--import', '/home/airflow/gcs/data/config/AirflowVariables.json' + ] + id: 'import-airflow-variables' +# Sync plugins to GCS plugins dir +- name: 'google/cloud-sdk' + waitFor: ['run-builds'] + entrypoint: 'gsutil' + args: [ + 'rsync','-r', '-d', + 'plugins/', + '${_COMPOSER_DAG_BUCKET}plugins' + ] + dir: './composer/' + id: 'deploy-custom-plugins' +# Sync DAGs to data dir for integration test parsing in target Composer Environment. +- name: 'google/cloud-sdk' + waitFor: ['deploy-custom-plugins'] + entrypoint: 'gsutil' + args: [ + 'rsync','-r', '-d', + 'dags/', + '${_COMPOSER_DAG_BUCKET}data/test-dags/$BUILD_ID' + ] + dir: './composer/' + id: 'stage-for-integration-test' +# Run integration tests parsing in target Composer Environment. +- name: 'google/cloud-sdk' + waitFor: ['stage-for-integration-test'] + entrypoint: 'gcloud' + args: [ + 'composer', 'environments', 'run', + '--location', '${_COMPOSER_REGION}', + '${_COMPOSER_ENV_NAME}', + 'list_dags', '--', + '-sd', '/home/airflow/gcs/data/test-dags/$BUILD_ID' + ] + id: 'dag-parse-integration-test' +# Clean up. 
+- name: 'google/cloud-sdk' + waitFor: ['dag-parse-integration-test'] + entrypoint: 'gsutil' + args: [ + '-m', 'rm','-r', + '${_COMPOSER_DAG_BUCKET}data/test-dags/$BUILD_ID' + ] + dir: './composer/' + id: 'clean-up-data-dir-dags' +# pull dags deployer golang app. +- name: gcr.io/cloud-builders/docker + waitFor: ['-'] + entrypoint: 'bash' + args: [ + '-c', + 'docker pull gcr.io/${_ARTIFACTS_PROJECT_ID}/deploydags:latest || exit 0' + ] + id: 'pull-deploydags' +# build with cache +- name: gcr.io/cloud-builders/docker + waitFor: ['pull-deploydags'] + dir: './composer/cloudbuild/go/dagsdeployer' + args: [ + 'build', + '-t', 'gcr.io/${PROJECT_ID}/deploydags:latest', + '--cache-from', 'gcr.io/${_ARTIFACTS_PROJECT_ID}/deploydags:latest', + '.' + ] + id: 'build-deploydags' +# Run dags deployer golang app. +- name: gcr.io/${PROJECT_ID}/deploydags + dir: './composer' + waitFor: [ + 'run-builds', + 'build-deploydags', + 'clean-up-data-dir-dags', + ] + args: [ + '-dagList=./config/running_dags.txt', + '-dagsFolder=./dags', + '-project=${PROJECT_ID}', + '-region=${_COMPOSER_REGION}', + '-composerEnv=${_COMPOSER_ENV_NAME}', + '-dagBucketPrefix=${_COMPOSER_DAG_BUCKET}dags', + '-replace' + ] + id: 'run-deploydags' +artifacts: + images: ['gcr.io/${_ARTIFACTS_PROJECT_ID}/deploydags'] +options: + machineType: 'N1_HIGHCPU_32' # For test and deploy dags parallelization. diff --git a/composer/.gitignore b/composer/.gitignore new file mode 100644 index 0000000..71c88fa --- /dev/null +++ b/composer/.gitignore @@ -0,0 +1,144 @@ +deploydags +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# static files generated from Django application using `collectstatic` +media +static diff --git a/composer/__init__.py b/composer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/composer/cloudbuild/Makefile b/composer/cloudbuild/Makefile new file mode 100644 index 0000000..f2199a8 --- /dev/null +++ b/composer/cloudbuild/Makefile @@ -0,0 +1,26 @@ +# Copyright 2018 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Make will use bash instead of sh +SHELL := /usr/bin/env bash + +help: ## Prints help for targets with comments + @grep -E '^[a-zA-Z._-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "make \033[36m%- 30s\033[0m %s\n", $$1, $$2}' + +.PHONY: test +test: ## Test if all files are properly formatted + @(cd go/dagsdeployer/internal/ && go test ./... && go vet ./... ) + +.PHONY: push_deploydags_image +push_deploydags_image: + @cd go/dagsdeployer/ && gcloud builds submit --project=datapipelines-ci --tag gcr.io/datapipelines-ci/deploydags . diff --git a/composer/cloudbuild/README.md b/composer/cloudbuild/README.md new file mode 100644 index 0000000..2323f88 --- /dev/null +++ b/composer/cloudbuild/README.md @@ -0,0 +1,55 @@ +# Dags Deployer Application + +The Dags Deployer Application seeks to automate the following steps in the DAG deployment process: +1. Identify Dags to Start / Stop based on presence of the dag id in the `running_dags.txt` config file. +1. Check if a DAG needs to be redeployed be checking the filehash of the GCS object against that of the file in the repo. +1. Stop DAGs: 1) Pause the DAG 2) Delete the GCS source file for the DAG 3) Delete the metadata in the airflowdb for the DAG. +1. Start DAGs: 1) Copy the source file the GCS dags folder 2) Unpause the DAG. + +The process for [how Composer stores code in GCS](https://cloud.google.com/composer/docs/concepts/cloud-storage) +and syncs to the airflow workers / webserver is eventually consistent. Therefore this Dags Deployer Application +retries operations that we might expect to fail (e.g. unpausing a DAG immediately after copying it to GCS may occur +before the scheduler has parsed the DAG, registering it with the airflowdb). This retry process can take minutes so +golang was selected as the implementation language to leverage goroutines to concurrently perform the +DAG stop / DAG start processes to speed up deployments involving the starting / stopping of many DAGs. 
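+
+As an illustration of the hash check described above, the following shell sketch
+compares a local DAG file against the copy in GCS. This is only a hypothetical
+example of the idea, not the actual `gcshasher` Go implementation; the file and
+bucket names are made up:
+```bash
+# Compare the md5 of a local DAG with the md5 GCS reports for the object.
+local_dag="dags/wordcount_dag.py"                             # example local file
+gcs_dag="gs://example-composer-bucket/dags/wordcount_dag.py"  # example GCS object
+
+# base64-encoded md5 of the local file (matches the encoding GCS reports).
+local_md5="$(openssl dgst -md5 -binary "$local_dag" | base64)"
+
+# md5 reported by GCS; empty if the object does not exist yet.
+gcs_md5="$(gsutil stat "$gcs_dag" 2>/dev/null | grep 'Hash (md5)' | awk '{print $NF}')"
+
+if [ "$local_md5" != "$gcs_md5" ]; then
+  echo "DAG is missing or differs in GCS; it would need to be (re)deployed."
+fi
+```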
+ +Cloud Build will build golang application creating an executable with the parameters documented below. + +## Parameters +- `repoRoot`: path to the root of this repo +- `projectID`: GCP project ID +- `composerRegion`: GCP Region wher Composer Environment lives +- `composerEnvName`: Cloud Composer Environment name +- `dagBucketPrefix`: The GCS dags bucket prefix + +### Running the dags deployer tests +From this directory run +```bash +make test +``` + +### Deploying a new image +From this directory run +```bash +make push_deploydags_image +``` + +### run_tests.sh +In order for DAG validation to pass, all files(e.g. sql query files), variables and connections +must exist in the local airflow environment. +`run_tests.sh` is a script to set up a local airflow environment to run dag validation tests. +It takes three arguments: +1. Relative path to local BigQuery SQL. +1. Relative path to a local JSON files with AirflowVariables necessary for your tests. +1. Relative path to plugins directory + +Installing dependencies +```bash +python3 -m venv .venv && source .venv/bin/activate +pip3 install -r ../requirements-dev.txt +``` + +Running the dag validation tests +```bash +(cd .. && ./cloudbuild/bin/run_tests.sh ../bigquery/sql ./config/AirflowVariables.json ./plugins) +``` diff --git a/composer/cloudbuild/bin/run_tests.sh b/composer/cloudbuild/bin/run_tests.sh new file mode 100755 index 0000000..fe55346 --- /dev/null +++ b/composer/cloudbuild/bin/run_tests.sh @@ -0,0 +1,119 @@ +#!/bin/bash + +# Copyright 2019 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +PATH=$PATH:/usr/local/airflow/google-cloud-sdk/bin +export AIRFLOW_HOME=/tmp/airflow + +# $1 relative path to directory containing bigquery sql. +# $2 relative path to JSON file contianing Airflow Variables. +# $3 relative path to plugins directory. +function setup_local_airflow() { + LOCAL_SQL_DIR=$1 + LOCAL_VARIABLES_JSON=$2 + LOCAL_PLUGIN_DIR=$3 + mkdir -p $AIRFLOW_HOME + echo "setting up local aiflow" + airflow version + echo "initialize airflow database." + airflow initdb + if [ -z "$LOCAL_PLUGIN_DIR" ]; + then + echo "no plugins dir provided; skipping copy to plugins dir." + else + echo "copying ${LOCAL_PLUGIN_DIR} to ${AIRFLOW_HOME}." + cp -r "$LOCAL_PLUGIN_DIR" "$AIRFLOW_HOME/" + fi + + + if [ -z "$LOCAL_SQL_DIR" ]; + then + echo "no sql dir provided; skipping copy to dags dir." + else + echo "setting up sql." + SQL_PREFIX=$AIRFLOW_HOME/dags/sql + mkdir -p "$SQL_PREFIX" + rsync -r -d "$LOCAL_SQL_DIR" "$SQL_PREFIX" + fi + + echo "generating fernet key." + FERNET_KEY=$(python3 -c "from cryptography.fernet import Fernet; \ + print(Fernet.generate_key().decode('utf-8'))") + export FERNET_KEY + + echo "uploading connections." + for conn_id in $AIRFLOW_CONN_LIST; do + set_local_conn "$conn_id" + done + + # Import Airflow Variables to local Airflow. + if [ -z "$LOCAL_VARIABLES_JSON" ] + then + echo "not local variables json provided; skipping import." + else + echo "import airflow vaiables." 
+ airflow variables --import "$LOCAL_VARIABLES_JSON" + echo "imported airflow vaiables:" + airflow variables --export /tmp/AirflowVariables.json.exported + cat /tmp/AirflowVariables.json.exported + rm /tmp/AirflowVariables.json.exported + fi + + + echo "setting up DAGs." + rsync -r dags $AIRFLOW_HOME +} + +# Upload custom connetions to local Airflow. +# $1 conn_id +function set_local_conn() { + echo "uploading connection: $conn_id." + #TODO remove assumption that custom connections are http. + airflow connections --add --conn_id "$1" --conn_type http || \ + echo "Upload $1 to local Airflow failed" +} + +# Run DAG validation tests. +function run_tests() { + python3 -m unittest discover +} + +function clean_up() { + echo "cleaning up AIRFLOW_HOME" + rm -rf $AIRFLOW_HOME + unset AIRFLOW_HOME +} + +# Might be necessary if we chose another image. +function install_airflow() { + python3 -m venv airflow-env + # shellcheck disable=SC1091 + source airflow-env/bin/activate + pip3 install -r requirements-dev.txt +} + +# $1 relative path to directory containing bigquery sql. +# $2 relative path to JSON file contianing Airflow Variables. +main() { + setup_local_airflow "$1" "$2" "$3" + run_tests + TEST_STATUS=$? + clean_up + exit $TEST_STATUS +} + +main "$1" "$2" "$3" diff --git a/composer/cloudbuild/go/.gitignore b/composer/cloudbuild/go/.gitignore new file mode 100644 index 0000000..e541d15 --- /dev/null +++ b/composer/cloudbuild/go/.gitignore @@ -0,0 +1,3 @@ +pkg/ +bin/ +src/ diff --git a/composer/cloudbuild/go/dagsdeployer/Dockerfile b/composer/cloudbuild/go/dagsdeployer/Dockerfile new file mode 100644 index 0000000..4dbde08 --- /dev/null +++ b/composer/cloudbuild/go/dagsdeployer/Dockerfile @@ -0,0 +1,10 @@ +FROM golang:buster AS builder +COPY . /dagsdeployer +WORKDIR /dagsdeployer/cmd/deploydags +ENV GO111MODULE=on +RUN CGO_ENABLED=0 GOOS=linux go build -o /app/deploydags . + +FROM google/cloud-sdk:latest +COPY --from=builder /app/deploydags /app/deploydags +WORKDIR /app +ENTRYPOINT ["/app/deploydags"] diff --git a/composer/cloudbuild/go/dagsdeployer/cmd/deploydags/main.go b/composer/cloudbuild/go/dagsdeployer/cmd/deploydags/main.go new file mode 100644 index 0000000..1273aef --- /dev/null +++ b/composer/cloudbuild/go/dagsdeployer/cmd/deploydags/main.go @@ -0,0 +1,63 @@ +// Copyright 2019 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and + +package main + +import ( + "flag" + "log" + "source.cloud.google.com/datapipelines-ci/composer/cloudbuild/go/dagsdeployer/internal/composerdeployer" +) + +func main() { + + var dagsFolder, dagList, projectID, composerRegion, composerEnvName, dagBucketPrefix string + var replace bool + + flag.StringVar(&dagList, "dagList", "./config/running_dags.txt", "path to the list of dags that should be running after the deploy") + flag.StringVar(&dagsFolder, "dagsFolder", "./dags", "path to the dags folder in the repo.") + flag.StringVar(&projectID, "project", "", "gcp project id") + flag.StringVar(&composerRegion, "region", "", "project") + flag.StringVar(&composerEnvName, "composerEnv", "", "Composer environment name") + flag.StringVar(&dagBucketPrefix, "dagBucketPrefix", "", "Composer DAGs bucket prefix") + flag.BoolVar(&replace, "replace", false, "Boolean flag to indicatae if source dag mismatches the object of same name in GCS delte the old version and deploy over it") + + flag.Parse() + + flags := map[string]string{ + "dagsFolder": dagsFolder, + "dagList": dagList, + "projectID": projectID, + "composerRegion": composerRegion, + "composerEnvName": composerEnvName, + "dagBucketPrefix": dagBucketPrefix, + } + + // Check flags are not empty. + for k, v := range flags { + if v == "" { + log.Panicf("%v must not be empty.", k) + } + } + + c := composerdeployer.ComposerEnv{ + Name: composerEnvName, + Project: projectID, + Location: composerRegion, + DagBucketPrefix: dagBucketPrefix, + LocalDagsPrefix: dagsFolder} + + dagsToStop, dagsToStart := c.GetStopAndStartDags(dagList, replace) + c.StopDags(dagsToStop, !replace) + c.StartDags(dagsFolder, dagsToStart) +} diff --git a/composer/cloudbuild/go/dagsdeployer/go.mod b/composer/cloudbuild/go/dagsdeployer/go.mod new file mode 100644 index 0000000..c46a9f5 --- /dev/null +++ b/composer/cloudbuild/go/dagsdeployer/go.mod @@ -0,0 +1,8 @@ +module source.cloud.google.com/datapipelines-ci/composer/cloudbuild/go/dagsdeployer + +go 1.12 + +require ( + cloud.google.com/go/storage v1.1.2 + github.com/bmatcuk/doublestar v1.2.3 +) diff --git a/composer/cloudbuild/go/dagsdeployer/go.sum b/composer/cloudbuild/go/dagsdeployer/go.sum new file mode 100644 index 0000000..e0948df --- /dev/null +++ b/composer/cloudbuild/go/dagsdeployer/go.sum @@ -0,0 +1,168 @@ +cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= +cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= +cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU= +cloud.google.com/go v0.44.1/go.mod h1:iSa0KzasP4Uvy3f1mN/7PiObzGgflwredwwASm/v6AU= +cloud.google.com/go v0.44.2/go.mod h1:60680Gw3Yr4ikxnPRS/oxxkBccT6SA1yMk63TGekxKY= +cloud.google.com/go v0.45.1/go.mod h1:RpBamKRgapWJb87xiFSdk4g1CME7QZg3uwTez+TSTjc= +cloud.google.com/go v0.46.3 h1:AVXDdKsrtX33oR9fbCMu/+c1o8Ofjq6Ku/MInaLVg5Y= +cloud.google.com/go v0.46.3/go.mod h1:a6bKKbmY7er1mI7TEI4lsAkts/mkhTSZK8w33B4RAg0= +cloud.google.com/go/bigquery v1.0.1 h1:hL+ycaJpVE9M7nLoiXb/Pn10ENE2u+oddxbD8uu0ZVU= +cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o= +cloud.google.com/go/datastore v1.0.0 h1:Kt+gOPPp2LEPWp8CSfxhsM8ik9CcyE/gYu+0r+RnZvM= +cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE= +cloud.google.com/go/pubsub v1.0.1 h1:W9tAK3E57P75u0XLLR82LZyw8VpAnhmyTOxW9qzmyj8= +cloud.google.com/go/pubsub v1.0.1/go.mod 
h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2kNxGRt3I= +cloud.google.com/go/storage v1.1.2 h1:q7KNypEb3CARnitCAqY63g+dZp9HDEgv/c6IPlPLMJI= +cloud.google.com/go/storage v1.1.2/go.mod h1:/03MkR5FWjF0OpcKpdJ4RgWybEaYAr2boHXq5RDlxbw= +dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= +github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ= +github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= +github.com/bmatcuk/doublestar v1.2.3 h1:ChLVAfc51TZWXjnbB/3ZKMbk78j0vs0WhtgeDS+L/+I= +github.com/bmatcuk/doublestar v1.2.3/go.mod h1:wiQtGV+rzVYxB7WIlirSN++5HPtPlXEo9MEoZQC/PmE= +github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= +github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= +github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b h1:VKtxabqXZkF25pY9ekfRL6a582T4P37/31XEstQ5p58= +github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= +github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= +github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= +github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFUx0Y= +github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.2 h1:6nsPYzhq5kReh6QImI3k5qWzO4PEbvbIW2cwSfR/6xs= +github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= +github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= +github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= +github.com/google/go-cmp v0.3.0 h1:crn/baboCvb5fXaQ0IJ1SGTsTVrWpDsCWC8EGETZijY= +github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/martian v2.1.0+incompatible h1:/CP5g8u/VJHijgedC/Legn3BAbAaWPgecwXBIDzw5no= +github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs= +github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= +github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= +github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= +github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= +github.com/googleapis/gax-go/v2 v2.0.5 h1:sjZBwGj9Jlw33ImPtvFviGYvseOtDM7hkSKB7+Tv3SM= +github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= +github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= +github.com/hashicorp/golang-lru v0.5.1 h1:0hERBMJE1eitiLkihrMvRVBYAkpHzc/J3QdDN+dAcgU= +github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= +github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024 h1:rBMNdlhTLzJjJSDIjNEXX1Pz3Hmwmz91v+zycvx9PJc= +github.com/jstemmer/go-junit-report 
v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= +go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= +go.opencensus.io v0.22.0 h1:C9hSCOW830chIVkdja34wa6Ky+IzWllkUinR+BtRZd4= +go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= +golang.org/x/exp v0.0.0-20190829153037-c13cbed26979/go.mod h1:86+5VVa7VpoJ4kLfm080zCjGlMRFzhUhsZKEZO7MGek= +golang.org/x/exp v0.0.0-20191014171548-69215a2ee97e h1:ewBcnrlKhy0GKnQ31tXkOC/G7/jHC4ogar1TiIfANC4= +golang.org/x/exp v0.0.0-20191014171548-69215a2ee97e/go.mod h1:JXzH8nQsPlswgeRAPE3MuO9GYsAcnJvJ4vnMwN/5qkY= +golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= +golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= +golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= +golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= +golang.org/x/lint v0.0.0-20190301231843-5614ed5bae6f/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= +golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20190409202823-959b441ac422/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20190909230951-414d861bb4ac/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20190930215403-16217165b5de h1:5hukYrvBGR8/eNkX5mdUezrA6JiaEZDtJb9Ei+1LlBs= +golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/mobile v0.0.0-20190312151609-d3739f865fa6/go.mod h1:z+o9i4GpDbdi3rU15maQ/Ox0txvL9dWGYEHz965HBQE= +golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o= +golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc= +golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY= +golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 
+golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190501004415-9ce7a6920f09/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190503192946-f4e77d36d62c/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859 h1:R/3boaszxrf1GEUWTVDzSKVwLmSJpwZ1yqXm8j0v2QI= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= +golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45 h1:SVwTIAaPC2U/AvvLNZ2a7OVsmBpC8L5BlwK1whH3hm0= +golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190423024810-112230192c58 h1:8gQV6CLnAEikrhgkHFbMAEhagSSnXWGV915qUMm9mrU= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190502145724-3ef323f4f1fd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190507160741-ecd444e8653b/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190606165138-5da285871e9c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0 h1:HyfiK1WMnHj5FXFXatD+Qs1A/xC2Run6RzeW1SyHxpc= +golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod 
h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= +golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190312151545-0bb0c0a6e846/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190312170243-e65039ee4138/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190425150028-36563e24a262/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20190506145303-2d16b83fe98c/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190621195816-6e04913cbbac/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191022210528-83d82311fd1f h1:X4UYO3m0+b0v4ctMUiMVB/vdVP5v25QRYMtH88N+Ne8= +golang.org/x/tools v0.0.0-20191022210528-83d82311fd1f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE= +google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M= +google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= +google.golang.org/api v0.9.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= +google.golang.org/api v0.11.0 h1:n/qM3q0/rV2F0pox7o0CvNhlPvZAo7pLbef122cbLJ0= +google.golang.org/api v0.11.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= +google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= +google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= +google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= +google.golang.org/appengine v1.6.1 h1:QzqyMA1tlu6CgqCDUtU9V+ZKhLFT2dkJuANu5QaxI3I= +google.golang.org/appengine v1.6.1/go.mod h1:i06prIuMbXzDqacNJfV5OdTW448YApPu5ww/cMBSeb0= +google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= +google.golang.org/genproto v0.0.0-20190307195333-5fe7a883aa19/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= +google.golang.org/genproto v0.0.0-20190418145605-e7d98fc518a7/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= +google.golang.org/genproto v0.0.0-20190425155659-357c62f0e4bb/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= +google.golang.org/genproto v0.0.0-20190502173448-54afdca5d873/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= +google.golang.org/genproto v0.0.0-20190801165951-fa694d86fc64/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= +google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod 
h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= +google.golang.org/genproto v0.0.0-20190911173649-1774047e7e51/go.mod h1:IbNlFCBrqXvoKpeg0TB2l7cyZUmoaFKYIwrEpbDKLA8= +google.golang.org/genproto v0.0.0-20191009194640-548a555dbc03 h1:4HYDjxeNXAOTv3o1N2tjo8UUSlhQgAD52FVkwxnWgM8= +google.golang.org/genproto v0.0.0-20191009194640-548a555dbc03/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= +google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= +google.golang.org/grpc v1.21.1 h1:j6XxA85m/6txkUCHvzlV5f+HBNl/1r5cZ2A/3IEFOO8= +google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= +honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.1-2019.2.3 h1:3JgtbtFHMiCmsznwGVTUWbgGov+pVqnlf1dEJTNAXeM= +honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg= +rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= diff --git a/composer/cloudbuild/go/dagsdeployer/internal/composerdeployer/composer_ops.go b/composer/cloudbuild/go/dagsdeployer/internal/composerdeployer/composer_ops.go new file mode 100644 index 0000000..a2b817c --- /dev/null +++ b/composer/cloudbuild/go/dagsdeployer/internal/composerdeployer/composer_ops.go @@ -0,0 +1,566 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
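+// Package composerdeployer keeps the DAGs running in a Cloud Composer
+// environment in sync with a local checkout: it diffs the two sets of DAG
+// IDs, then pauses/deletes DAGs that were removed and copies/unpauses DAGs
+// that were added, shelling out to gcloud and gsutil.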
+ +package composerdeployer + +import ( + "bufio" + "fmt" + "github.com/bmatcuk/doublestar" + "io/ioutil" + "log" + "math/rand" + "net/url" + "os" + "os/exec" + "path" + "path/filepath" + "regexp" + "source.cloud.google.com/datapipelines-ci/composer/cloudbuild/go/dagsdeployer/internal/gcshasher" + "strings" + "sync" + "time" +) + +// ComposerEnv is a lightweight representaataion of Cloud Composer environment +type ComposerEnv struct { + Name string + Project string + Location string + DagBucketPrefix string + LocalDagsPrefix string +} + +func logDagList(a map[string]bool) { + for k := range a { + log.Printf("\t%s", k) + } + return +} + +// DagList is a set of dags (for quick membership check) +type DagList map[string]bool + +// ReadRunningDagsTxt reads a newline separated list of dags from a text file +func ReadRunningDagsTxt(filename string) (map[string]bool, error) { + file, err := os.Open(filename) + if err != nil { + return nil, err + } + defer file.Close() + + dagsToRun := make(map[string]bool) + sc := bufio.NewScanner(file) + + for sc.Scan() { + dagsToRun[sc.Text()] = true + } + log.Printf("Read dagsToRun from %s:", filename) + logDagList(dagsToRun) + return dagsToRun, err +} + +// DagListIntersect finds the common keys in two map[string]bool representing a +// list of airflow DAG IDs. +func DagListIntersect(a map[string]bool, b map[string]bool) map[string]bool { + short := make(map[string]bool) + long := make(map[string]bool) + in := make(map[string]bool) + + if len(a) < len(b) { + short, long = a, b + } else { + short, long = b, a + } + for k := range short { + if long[k] { + in[k] = true + } + } + return in +} + +// DagListDiff finds the keys in the first map[string]bool that do no appear in +// the second. +func DagListDiff(a map[string]bool, b map[string]bool) map[string]bool { + diff := make(map[string]bool) + for k := range a { + if !b[k] { + diff[k] = true + } + } + return diff +} + +// shell out to call gsutil +func gsutil(args ...string) ([]byte, error) { + c := exec.Command("gsutil", args...) + return c.CombinedOutput() +} + +func (c *ComposerEnv) assembleComposerRunCmd(subCmd string, args ...string) []string { + subCmdArgs := []string{ + "composer", "environments", "run", + c.Name, + fmt.Sprintf("--location=%s", c.Location), + subCmd} + + if len(args) > 0 { + subCmdArgs = append(subCmdArgs, "--") + subCmdArgs = append(subCmdArgs, args...) + } + return subCmdArgs +} + +// Run is used to run airflow cli commands +// it is a wrapper of gcloud composer environments run +func (c *ComposerEnv) Run(subCmd string, args ...string) ([]byte, error) { + subCmdArgs := c.assembleComposerRunCmd(subCmd, args...) + log.Printf("running gcloud with subCmd Args: %s", subCmdArgs) + cmd := exec.Command( + "gcloud", subCmdArgs...) + return cmd.CombinedOutput() +} + +func parseListDagsOuput(out []byte) map[string]bool { + runningDags := make(map[string]bool) + outArr := strings.Split(string(out[:]), "\n") + + // Find the DAGs in output + dagSep := "-------------------------------------------------------------------" + var dagsIdx, nSep int + + for nSep < 2 { + if outArr[dagsIdx] == dagSep { + nSep++ + } + dagsIdx++ + if dagsIdx >= len(outArr) { + log.Fatalf("list_dags output did not contain expected separators: %s", out) + } + } + + // Ignore empty newline and airflow_monitoring dag. 
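+	// At this point dagsIdx points just past the second dashed separator, so
+	// outArr[dagsIdx:] holds one DAG id per line. The expected layout
+	// (assumed from the Airflow 1.10 CLI output) is roughly:
+	//
+	//   -------------------------------------------------------------------
+	//   DAGS
+	//   -------------------------------------------------------------------
+	//   airflow_monitoring
+	//   wordcount_dag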
+ for _, dag := range outArr[dagsIdx:] { + if dag != "" && dag != "airflow_monitoring" { + runningDags[dag] = true + } + } + + return runningDags +} + +// GetRunningDags lists dags currently running in Composer Environment. +func (c *ComposerEnv) GetRunningDags() (map[string]bool, error) { + runningDags := make(map[string]bool) + out, err := c.Run("list_dags") + if err != nil { + log.Fatalf("list_dags failed: %s with %s", err, out) + } + + runningDags = parseListDagsOuput(out) + log.Printf("running DAGs:") + logDagList(runningDags) + return runningDags, err +} + +func readCommentScrubbedLines(path string) ([]string, error) { + log.Printf("scrubbing comments in %v", path) + commentPattern, err := regexp.Compile(`#.+`) + if err != nil { + return nil, fmt.Errorf("error compiling regex: %v", err) + } + file, err := os.Open(path) + if err != nil { + return nil, fmt.Errorf("couldn't open file %v: %v", path, err) + } + defer file.Close() + + lines := make([]string, 0, 1) + scanner := bufio.NewScanner(file) + for scanner.Scan() { + candidate := commentPattern.ReplaceAllString(scanner.Text(), "") + if len(candidate) > 0 { + lines = append(lines, candidate) + } + } + + return lines, scanner.Err() +} + +// FindDagFilesInLocalTree searches for Dag files in dagsRoot with names in dagNames respecting .airflowignores +func FindDagFilesInLocalTree(dagsRoot string, dagNames map[string]bool) (map[string][]string, error) { + + if len(dagNames) == 0 { + return make(map[string][]string), nil + } + log.Printf("searching for these DAGs in %v:", dagsRoot) + logDagList(dagNames) + matches := make(map[string][]string) + // This should map a dir to the ignore patterns in it's airflow ignore if relevant + // this allows us to easily identify the patterns relevant to this dir and it's parents, grandparents, etc. + airflowignoreTree := make(map[string][]string) + _, err := ioutil.ReadDir(dagsRoot) + if err != nil { + return matches, fmt.Errorf("error reading dagRoot: %v. %v", dagsRoot, err) + } + filepath.Walk(dagsRoot, func(path string, info os.FileInfo, err error) error { + dagID := strings.TrimSuffix(info.Name(), ".py") + relPath, err := filepath.Rel(dagsRoot, path) + + if info == nil { + dur, _ := time.ParseDuration("5s") + time.Sleep(dur) + } + // resepect .airflowignore + if info.Name() == ".airflowignore" { + log.Printf("found %v, adding to airflowignoreTree", path) + patterns, err := readCommentScrubbedLines(path) + if err != nil { + return err + } + dir, err := filepath.Rel(dagsRoot, filepath.Dir(path)) + if err != nil { + return fmt.Errorf("error making %v relative to dag root %v: %v", filepath.Dir(path), dagsRoot, err) + } + fullyQualifiedPatterns := make([]string, 0, len(patterns)) + for _, p := range patterns { + fullyQualifiedPatterns = append(fullyQualifiedPatterns, filepath.Join(dir, p)) + } + log.Printf("adding the following patterns to airflowignoreTree[%v]: %+v", dir, fullyQualifiedPatterns) + airflowignoreTree[filepath.Dir(path)] = fullyQualifiedPatterns + return nil + } + + if !info.IsDir() && !dagNames[dagID] { // skip to next file if this is not relevant to dagNames + return nil + } + + relevantIgnores := make([]string, 0) + p := path + + if ignores, ok := airflowignoreTree[p]; ok { + relevantIgnores = append(relevantIgnores, ignores...) + } + + // walk back to respect all parents' .airflowignore + for { + if p == filepath.Dir(dagsRoot) { + break + } + parent := filepath.Dir(p) + p = parent // for next iteration. 
+ if patterns, ok := airflowignoreTree[parent]; ok { // parent has .airflowignore + relevantIgnores = append(relevantIgnores, patterns...) + } + } + + thisMatch := make(map[string]bool) + if err != nil { + log.Printf("error making %v relative to %v, %v", path, dagsRoot, err) + return fmt.Errorf("error making %v relative to %v, %v", path, dagsRoot, err) + } + + for _, ignore := range relevantIgnores { + absIgnore, err := filepath.Abs(filepath.Join(".", ignore)) + if err != nil { + return err + } + absPath, err := filepath.Abs(filepath.Join(".", relPath)) + if err != nil { + return err + } + var match bool + if strings.Contains(absIgnore, "**") { + match, err = doublestar.PathMatch(absIgnore, absPath) + if err != nil { + return err + } + } + if !match && !strings.Contains(ignore, "**") { + match, err = regexp.MatchString(ignore, relPath) + if err != nil { + log.Printf("ERROR: comparing %v %v: %v", relPath, ignore, err) + return err + } + } + + // don't walk dirs we don't have to + if match && info.IsDir() { + log.Printf("ignoring dir: %v because matched %v", relPath, ignore) + return filepath.SkipDir + } + + // remove matches if previously added but now matches this ignore pattern + if match && !info.IsDir() && dagNames[dagID] { + log.Printf("ignoring path: %v because matched %v", relPath, ignore) + if _, ok := matches[dagID]; ok { + matches[dagID] = make([]string, 0) + break // no other ignore patterns relevant if we now know this file should be ignored + } + return nil + } + + // if we shouldn't ignore it and it is in dagNames then add it to matches if not already present + if !match && !info.IsDir() && dagNames[dagID] { + thisMatch[dagID] = true + } + } + + if thisMatch[dagID] { + alreadyMatched := false + for _, p := range matches[dagID] { + if relPath == p { + alreadyMatched = true + break + } + } + if !alreadyMatched { + matches[dagID] = append(matches[dagID], relPath) + } + } + + return nil + }) + + errs := make([]error, 0) + + // should match exactly one path in the tree. + for dag, matches := range matches { + if len(matches) == 0 { + errs = append(errs, fmt.Errorf("did not find match for %v", dag)) + } else if len(matches) > 1 { + errs = append(errs, fmt.Errorf("found multiple matches for %v: %v", dag, matches)) + } + } + + if len(errs) > 0 { + return matches, fmt.Errorf("Encountered errors matching files to dags: %+v", errs) + } + return matches, nil +} + +// FindDagFilesInGcsPrefix necessary find the file path of a dag that has been deleted from VCS +func FindDagFilesInGcsPrefix(prefix string, dagFileNames map[string]bool) (map[string][]string, error) { + dir, err := ioutil.TempDir("", "gcsDags_") + if err != nil { + return nil, fmt.Errorf("error creating temp dir to pull gcs dags: %v", err) + } + defer os.RemoveAll(dir) // clean up temp dir + + // copy gcs dags dir to local temp dir + log.Printf("pulling down %v", prefix) + _, err = gsutil("-m", "cp", "-r", prefix, dir) + if err != nil { + return nil, fmt.Errorf("error fetching dags dir from GCS: %v", err) + } + return FindDagFilesInLocalTree(filepath.Join(dir, "dags"), dagFileNames) +} + +func (c *ComposerEnv) getRestartDags(sameDags map[string]string) map[string]bool { + dagsToRestart := make(map[string]bool) + for dag, relPath := range sameDags { + // We know that the file name = dag id from the dag validation test asseting this. 
+ local := filepath.Join(c.LocalDagsPrefix, relPath) + gcs, err := url.Parse(c.DagBucketPrefix) + gcs.Path = path.Join(gcs.Path, relPath) + eq, err := gcshasher.LocalFileEqGCS(local, gcs.String()) + if err != nil { + log.Printf("error comparing file hashes %s, attempting to restart: %s", err, dag) + dagsToRestart[dag] = true + } else if !eq { + dagsToRestart[dag] = true + } + } + return dagsToRestart +} + +// Dag is a type for dag containing it's path +type Dag struct { + ID string + Path string +} + +// GetStopAndStartDags uses set differences between dags running in the Composer +// Environment and those in the running dags text config file. +func (c *ComposerEnv) GetStopAndStartDags(filename string, replace bool) (map[string]string, map[string]string) { + dagsToRun, err := ReadRunningDagsTxt(filename) + if err != nil { + log.Fatalf("couldn't read running_dags.txt: %v", filename) + } + runningDags, err := c.GetRunningDags() + if err != nil { + log.Fatalf("couldn't list dags in composer environment: %v", err) + } + dagsToStop := DagListDiff(runningDags, dagsToRun) + dagsToStart := DagListDiff(dagsToRun, runningDags) + dagsSame := DagListIntersect(runningDags, dagsToRun) + log.Printf("DAGs same:") + logDagList(dagsSame) + + dagPathListsSame, err := FindDagFilesInGcsPrefix(c.DagBucketPrefix, dagsToStop) + if err != nil { + log.Fatalf("error finding dags to stop: %v", err) + } + // unnest out of slice + dagPathsSame := make(map[string]string) + for k, v := range dagPathListsSame { + dagPathsSame[k] = v[0] + } + restartDags := c.getRestartDags(dagPathsSame) + + if replace { + for k, v := range restartDags { + dagsToStop[k], dagsToStart[k] = v, v + } + } else { + log.Fatalf("FAILED: tried to overwite DAGs in place put replace=false the following existing dags: %#v", restartDags) + } + + log.Printf("DAGs to Stop:") + logDagList(dagsToStop) + log.Printf("DAGs to Start:") + logDagList(dagsToStart) + + dagPathListsToStop, err := FindDagFilesInGcsPrefix(c.DagBucketPrefix, dagsToStop) + if err != nil { + log.Fatalf("error finding dags to stop: %v", err) + } + dagPathsToStop := make(map[string]string) + for k, v := range dagPathListsToStop { + dagPathsToStop[k] = v[0] + } + dagPathListsToStart, err := FindDagFilesInLocalTree(c.LocalDagsPrefix, dagsToStart) + if err != nil { + log.Fatalf("error finding dags to start: %v", err) + } + + dagPathsToStart := make(map[string]string) + for k, v := range dagPathListsToStart { + dagPathsToStart[k] = v[0] + } + return dagPathsToStop, dagPathsToStart +} + +// ComposerEnv.stopDag pauses the dag, removes the dag definition file from gcs +// and deletes the DAG from the airflow db. 
+func (c *ComposerEnv) stopDag(dag string, relPath string, pauseOnly bool, wg *sync.WaitGroup) (err error) { + defer wg.Done() + log.Printf("pausing dag: %v with relPath: %v", dag, relPath) + out, err := c.Run("pause", dag) + if err != nil { + return fmt.Errorf("error pausing dag %v: %v", dag, string(out)) + } + if !pauseOnly { + log.Printf("parsing gcs url %v", c.DagBucketPrefix) + gcs, err := url.Parse(c.DagBucketPrefix) + if err != nil { + panic("error parsing dag bucket prefix") + } + + gcs.Path = path.Join(gcs.Path, relPath) + log.Printf("deleting %v", gcs.String()) + out, err = gsutil("rm", gcs.String()) + if err != nil { + panic("error deleting from gcs") + } + + _, err = c.Run("delete_dag", dag) + if err != nil { + panic("error deleteing dag") + } + + for i := 0; i < 5; i++ { + if err == nil { + break + } + log.Printf("Waiting 5s to retry") + dur, _ := time.ParseDuration("5s") + time.Sleep(dur) + log.Printf("Retrying delete %s", dag) + _, err = c.Run("delete_dag", dag) + } + if err != nil { + return fmt.Errorf("Retried 5x, pause still failing with: %v", string(out)) + } + } + return err +} + +// StopDags deletes a list of dags in parallel go routines +func (c *ComposerEnv) StopDags(dagsToStop map[string]string, pauseOnly bool) error { + var stopWg sync.WaitGroup + for k, v := range dagsToStop { + stopWg.Add(1) + go c.stopDag(k, v, pauseOnly, &stopWg) + } + stopWg.Wait() + return nil +} + +func jitter(d time.Duration) time.Duration { + const pct = 0.10 //Jitter up to 10% of the supplied duration. + jit := 1 + pct*(rand.Float64()*2-1) + return time.Duration(jit * float64(d)) +} + +// ComposerEnv.waitForDeploy polls a Composer environment trying to unpause +// dags. This should be called after copying a dag file to gcs when +// dag_paused_on_creation=True. +func (c *ComposerEnv) waitForDeploy(dag string) error { + _, err := c.Run("unpause", dag) + for i := 0; i < 5; i++ { + if err == nil { + break + } + log.Printf("Waiting 60s to retry") + time.Sleep(jitter(time.Minute)) + log.Printf("Retrying unpause %s", dag) + _, err = c.Run("unpause", dag) + } + if err != nil { + err = fmt.Errorf("Retried 5x, unpause still failing with: %s", err) + } + return err +} + +// ComposerEnv.startDag copies a DAG definition file to GCS and waits until you can +// successfully unpause. 
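+// The unpause is retried with jitter (see waitForDeploy) to allow for the lag
+// between the file landing in the bucket and the scheduler parsing it.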
+func (c *ComposerEnv) startDag(dagsFolder string, dag string, relPath string, wg *sync.WaitGroup) error { + defer wg.Done() + loc := filepath.Join(dagsFolder, relPath) + gcs, err := url.Parse(c.DagBucketPrefix) + if err != nil { + return fmt.Errorf("error parsing dags prefix %v", err) + } + gcs.Path = path.Join(gcs.Path, relPath) + _, err = gsutil("cp", loc, gcs.String()) + if err != nil { + return fmt.Errorf("error copying file %v to gcs: %v", loc, err) + } + c.waitForDeploy(dag) + return err +} + +// StartDags deploys a list of dags in parallel go routines +func (c *ComposerEnv) StartDags(dagsFolder string, dagsToStart map[string]string) error { + c.Run("unpause", "airflow_monitoring") + var startWg sync.WaitGroup + for k, v := range dagsToStart { + startWg.Add(1) + go c.startDag(dagsFolder, k, v, &startWg) + } + startWg.Wait() + return nil +} diff --git a/composer/cloudbuild/go/dagsdeployer/internal/composerdeployer/composer_ops_test.go b/composer/cloudbuild/go/dagsdeployer/internal/composerdeployer/composer_ops_test.go new file mode 100644 index 0000000..78b90fe --- /dev/null +++ b/composer/cloudbuild/go/dagsdeployer/internal/composerdeployer/composer_ops_test.go @@ -0,0 +1,206 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package composerdeployer + +import ( + "fmt" + "io/ioutil" + "os" + "path/filepath" + "reflect" + "testing" +) + +// test dag lists +var ab = map[string]bool{"a": true, "b": true} + +var ac = map[string]bool{"a": true, "c": true} + +var a = map[string]bool{"a": true} + +var d = map[string]bool{"d": true} + +func TestDagListIntersect(t *testing.T) { + + testTable := []struct { + a map[string]bool + b map[string]bool + out map[string]bool + }{ + {ab, ac, map[string]bool{"a": true}}, + {ac, ab, map[string]bool{"a": true}}, // commutative + {ab, ab, ab}, // identity + {ab, a, a}, + {a, ab, a}, + {ab, d, make(map[string]bool)}} + + for _, tt := range testTable { + t.Run(fmt.Sprintf("%+v", tt), func(t *testing.T) { + if got := DagListIntersect(tt.a, tt.b); !reflect.DeepEqual(got, tt.out) { + t.Errorf("DagListIntersect(%+v, %+v) = %+v, want %+v", tt.a, tt.b, got, tt.out) + } + }) + } +} + +func TestDagListDiff(t *testing.T) { + + testTable := []struct { + a map[string]bool + b map[string]bool + out map[string]bool + }{ + {ab, ac, map[string]bool{"b": true}}, + {ac, ab, map[string]bool{"c": true}}, // commutative + {ab, ab, make(map[string]bool)}, + {ab, a, map[string]bool{"b": true}}, + {a, ab, make(map[string]bool)}, + {ab, d, ab}, + {d, ab, d}} + + for _, tt := range testTable { + t.Run(fmt.Sprintf("%+v, %+v", tt.a, tt.b), func(t *testing.T) { + if got := DagListDiff(tt.a, tt.b); !reflect.DeepEqual(got, tt.out) { + t.Errorf("DagListDiff(%+v, %+v) = %+v, want %+v", tt.a, tt.b, got, tt.out) + } + }) + } + +} + +func TestAssembleComposerRunCmd(t *testing.T) { + c := ComposerEnv{ + Name: "composerenv", + Location: "us-central1", + DagBucketPrefix: "gs://composerenv-bucket/dags/", + } + // Test single command. 
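+	// A bare sub-command should yield the gcloud argument list without a
+	// trailing "--" separator; only sub-commands with extra args get one.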
+ want := []string{ + "composer", "environments", "run", + "composerenv", + "--location=us-central1", + "list_dags"} + + got := c.assembleComposerRunCmd("list_dags") + if !reflect.DeepEqual(got, want) { + t.Errorf("ComposerEnv.assembleComposerrRunCmd(\"list_dags\") = %+v, want %+v", got, want) + } + + // Test command w/ arguments + want = []string{ + "composer", "environments", "run", + "composerenv", + "--location=us-central1", + "variables", "--", "import", "AirflowVariables.json"} + + got = c.assembleComposerRunCmd("variables", "import", "AirflowVariables.json") + if !reflect.DeepEqual(got, want) { + t.Errorf("ComposerEnv.assembleComposerrRunCmd(\"variables\", \"import\", \"AirflowVariables.json\") = %+v, want %+v", got, want) + } +} + +func populateAirflowIgnore(path string, ignores []string) error { + f, err := os.OpenFile(filepath.Join(path, ".airflowignore"), os.O_RDWR|os.O_CREATE, 0755) + if err != nil { + return err + } + defer f.Close() + for _, ignore := range ignores { + _, err := f.WriteString(ignore + "\n") + if err != nil { + panic(fmt.Sprintf("couldn't write %v to %v: %v", ignore, f, err)) + } + } + return nil +} + +func prepareTestTempDirTree() (string, error) { + tmpDir, err := ioutil.TempDir("", "") + if err != nil { + return "", fmt.Errorf("error creating temp dir: %v", err) + } + + // create dir tree + for _, p := range []string{ + "team1/usecase1/sql", + "team1/helpers/utils", + "team2/usecase1/", + "team2/usecase2/", + "team2/helpers/"} { + err = os.MkdirAll(filepath.Join(tmpDir, p), 0755) + if err != nil { + return tmpDir, err + } + } + + // add some files + for _, name := range []string{ + ".airflowignore", + "team1/.airflowignore", + "team1/usecase1/sql/foo.sql", + "team1/usecase1/sql/dag1.py", // sometimes people define sql in python files as constants (should be ignored in dag finding) + "team1/helpers/create_dag.py", // some dag generation helper utility (should be ignored in dag finding) + "team1/usecase1/dag1.py", + "team1/usecase1/dag2.py", + "team2/usecase1/create_dag.py", // conflicts with utility file in team1/helpers, but should be ok as that was ignored. + "team2/usecase2/dag2.py", // uh-oh a real dag name conflict! (we will that this fails in second test) + "team2/helpers/helper_dag.py", // this should not be ignored as team2 follows a different convention. + } { + f, err := os.Create(filepath.Join(tmpDir, name)) + if err != nil { + f.Close() + return tmpDir, err + } + } + + // add some contents to .airflowignore files + populateAirflowIgnore(tmpDir, []string{"./**/sql/"}) // ignore sql/ dirs throughout the tree + populateAirflowIgnore(filepath.Join(tmpDir, "team1"), []string{"helpers/"}) // ignore helpers/ dir under team1 + return tmpDir, nil +} + +func TestFindDagFilesInLocalTree(t *testing.T) { + tmpDir, err := prepareTestTempDirTree() + defer os.RemoveAll(tmpDir) + if err != nil { + t.Errorf("couldn't initialize test dir tree: %v", err) + } + + // look for the dags we know not to have name conflicts. 
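+	// Given the .airflowignore files written above, sql/ directories and
+	// team1/helpers/ are skipped, so create_dag should resolve only to
+	// team2/usecase1/create_dag.py and helper_dag to team2/helpers/.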
+ got, err := FindDagFilesInLocalTree(tmpDir, map[string]bool{ + "helper_dag": true, + "create_dag": true, + "dag1": true, + }) + + want := map[string][]string{ + "helper_dag": []string{"team2/helpers/helper_dag.py"}, + "create_dag": []string{"team2/usecase1/create_dag.py"}, + "dag1": []string{"team1/usecase1/dag1.py"}, + } + + if !reflect.DeepEqual(got, want) { + t.Errorf("got: %+v,\n want %+v", got, want) + } + + // test w/ name conflict + _, err = FindDagFilesInLocalTree(tmpDir, map[string]bool{ + "dag2": true, + }) + + if err == nil { + t.Errorf("should error on duplicate dag names but didn't") + } +} diff --git a/composer/cloudbuild/go/dagsdeployer/internal/gcshasher/gcs_hash.go b/composer/cloudbuild/go/dagsdeployer/internal/gcshasher/gcs_hash.go new file mode 100644 index 0000000..28d669b --- /dev/null +++ b/composer/cloudbuild/go/dagsdeployer/internal/gcshasher/gcs_hash.go @@ -0,0 +1,94 @@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gcshasher + +import ( + "bytes" + "cloud.google.com/go/storage" + "context" + "crypto/md5" + "fmt" + "io" + "log" + "net/url" + "os" +) + +func parseGcsPath(gcsPath string) (bucket string, path string, err error) { + uri, err := url.Parse(gcsPath) + bucket = "" + path = "" + if err != nil { + err = fmt.Errorf("couldn't parse GCS URI %+v", gcsPath) + return + } + if uri.Scheme != "gs" { + err = fmt.Errorf("couldn't parse GCS URI: %+v scheme should be 'gs'", gcsPath) + return + } + bucket = uri.Host + path = uri.Path[1:] + return +} +func gcsMD5(gcsPath string) ([]byte, error) { + bktName, path, err := parseGcsPath(gcsPath) + if err != nil { + log.Fatalf("%s", err) + } + + ctx := context.Background() + client, err := storage.NewClient(ctx) + if err != nil { + return nil, fmt.Errorf("Couldn't authenticate GCS client: %s", err) + } + + attrs, err := client.Bucket(bktName).Object(path).Attrs(ctx) + if err != nil { + return nil, fmt.Errorf("Couldn't read file hash for %s: %s", path, err) + } + + hash := attrs.MD5 + return hash, nil +} + +func localMD5(path string) ([]byte, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + + h := md5.New() + if _, err := io.Copy(h, f); err != nil { + return nil, err + } + return h.Sum(nil), nil +} + +// LocalFileEqGCS check equalit of local file and GCS object using md5 hash +func LocalFileEqGCS(localPath, gcsPath string) (bool, error) { + loc, err := localMD5(localPath) + if err != nil { + err = fmt.Errorf("Local file not found %s", err) + return false, err + } + gcs, err := gcsMD5(gcsPath) + if err != nil { + err = fmt.Errorf("GCS file not found %s", err) + return false, err + } + + return bytes.Compare(loc, gcs) == 0, nil +} diff --git a/composer/cloudbuild/go/dagsdeployer/internal/gcshasher/gcs_hash_test.go b/composer/cloudbuild/go/dagsdeployer/internal/gcshasher/gcs_hash_test.go new file mode 100644 index 0000000..6d90b83 --- /dev/null +++ b/composer/cloudbuild/go/dagsdeployer/internal/gcshasher/gcs_hash_test.go @@ -0,0 +1,74 
@@ +// Copyright 2019 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gcshasher + +import ( + "cloud.google.com/go/storage" + "context" + "flag" + "io" + "os" + "path/filepath" + "testing" +) + +var testBkt = flag.String("bkt", "", "The bucket to use for testing the hash comparison") + +func TestLocalMD5(t *testing.T) { + locPath := filepath.Join("testdata", "test.txt") + _, err := localMD5(locPath) + if err != nil { + t.Errorf("error hashing local file: %s", err) + } +} + +func TestLocalFileEqGCS(t *testing.T) { + if *testBkt == "" { + t.Skip("skipping hash comparison integration test because no test bucket passed") + } + + locPath := filepath.Join("testdata", "test.txt") + ctx := context.Background() + client, err := storage.NewClient(ctx) + if err != nil { + t.Errorf("Couldn't authenticate GCS client: %s", err) + } + + var r io.Reader + f, err := os.Open(locPath) + defer f.Close() + r = f + + obj := client.Bucket(*testBkt).Object("testdata/test.txt") + w := obj.NewWriter(ctx) + io.Copy(w, r) + if err := w.Close(); err != nil { + t.Errorf("couldn't write test object %s ", err) + } + + eq, err := LocalFileEqGCS(locPath, "gs://"+*testBkt+"/testdata/test.txt") + if !eq { + t.Errorf("hashes were not equal for local test.txt vs gcs test.txt") + } + + diffLocPath := filepath.Join("testdata", "test_diff.txt") + eq, err = LocalFileEqGCS(diffLocPath, "gs://"+*testBkt+"/testdata/test.txt") + if eq { + t.Errorf("hashes were equal for local test_diff.txt vs gcs test.txt") + } + if err := obj.Delete(ctx); err != nil { + t.Logf("couldn't clean up test object: %s", err) + } +} diff --git a/composer/cloudbuild/go/dagsdeployer/internal/gcshasher/testdata/test.txt b/composer/cloudbuild/go/dagsdeployer/internal/gcshasher/testdata/test.txt new file mode 100644 index 0000000..ce27bd9 --- /dev/null +++ b/composer/cloudbuild/go/dagsdeployer/internal/gcshasher/testdata/test.txt @@ -0,0 +1 @@ +This is a test file for hashing. diff --git a/composer/cloudbuild/go/dagsdeployer/internal/gcshasher/testdata/test_diff.txt b/composer/cloudbuild/go/dagsdeployer/internal/gcshasher/testdata/test_diff.txt new file mode 100644 index 0000000..17e67a8 --- /dev/null +++ b/composer/cloudbuild/go/dagsdeployer/internal/gcshasher/testdata/test_diff.txt @@ -0,0 +1 @@ +This is a different test file for hashing. 
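For orientation, the following is a minimal sketch (not part of the patch) of how the gcshasher helper above could be driven from a small command inside the dagsdeployer module; the bucket name and DAG path are invented for illustration, and only LocalFileEqGCS comes from the code added here:

package main

import (
	"fmt"
	"log"

	"source.cloud.google.com/datapipelines-ci/composer/cloudbuild/go/dagsdeployer/internal/gcshasher"
)

func main() {
	// Hypothetical paths, for illustration only.
	local := "composer/dags/wordcount_dag.py"
	remote := "gs://example-composer-bucket/dags/wordcount_dag.py"

	// LocalFileEqGCS hashes the local file with MD5 and compares it to the
	// MD5 recorded in the GCS object's metadata.
	eq, err := gcshasher.LocalFileEqGCS(local, remote)
	if err != nil {
		log.Fatalf("comparing %s with %s: %v", local, remote, err)
	}
	if eq {
		fmt.Println("unchanged: no redeploy needed")
	} else {
		fmt.Println("changed: stop and restart this DAG")
	}
}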
diff --git a/composer/config/AirflowVariables.json b/composer/config/AirflowVariables.json new file mode 100644 index 0000000..ae4985f --- /dev/null +++ b/composer/config/AirflowVariables.json @@ -0,0 +1,14 @@ +{ + "gcp_project": "${GCP_PROJECT_ID}", + "gcp_region": "${COMPOSER_REGION}", + "dataflow_jar_location": "${DATAFLOW_JAR_BUCKET}", + "dataflow_jar_file": "to_be_overriden", + "gcs_input_bucket": "${INPUT_BUCKET}", + "gcs_ref_bucket": "${REF_BUCKET}", + "gcs_output_bucket": "${RESULT_BUCKET}", + "dataflow_staging_bucket": "${DATAFLOW_STAGING_BUCKET}", + "dataproc_bucket": "${DATFLOW_STAGING_BUCKET}", + "gce_zone": "${COMPOSER_REGION}-a", + "gcs_bucket": "spark_bucket", + "bq_output_table": "${GCP_PROJECT_ID}.nyc_taxi.avg_speed" +} diff --git a/composer/config/running_dags.txt b/composer/config/running_dags.txt new file mode 100644 index 0000000..e69de29 diff --git a/composer/dags/.airflowignore b/composer/dags/.airflowignore new file mode 100644 index 0000000..46a265c --- /dev/null +++ b/composer/dags/.airflowignore @@ -0,0 +1,2 @@ +sql/ +support-files/ diff --git a/composer/dags/ephemeral_dataproc_spark_dag.py b/composer/dags/ephemeral_dataproc_spark_dag.py new file mode 100644 index 0000000..9f328bb --- /dev/null +++ b/composer/dags/ephemeral_dataproc_spark_dag.py @@ -0,0 +1,160 @@ +# Copyright 2018 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from datetime import datetime, timedelta + +from airflow import DAG +from airflow.contrib.operators.dataproc_operator import ( + DataprocClusterCreateOperator, + DataProcPySparkOperator, + DataprocClusterDeleteOperator) +from airflow.contrib.operators.gcs_to_bq import ( + GoogleCloudStorageToBigQueryOperator) +from airflow.operators import BashOperator +from airflow.models import Variable +from airflow.utils.trigger_rule import TriggerRule + +################################################################## +# This file defines the DAG for the logic pictured below. # +################################################################## +# # +# create_cluster # +# | # +# V # +# submit_pyspark....... # +# | . # +# / \ V # +# / \ move_failed_files # +# / \ ^ # +# | | . # +# V V . # +# delete_cluster bq_load..... # +# | # +# V # +# delete_transformed_files # +# # +# (Note: Dotted lines indicate conditional trigger rule on # +# failure of the up stream tasks. In this case the files in the # +# raw-{timestamp}/ GCS path will be moved to a failed-{timestamp}# +# path.) # +################################################################## + +# Airflow parameters, see https://airflow.incubator.apache.org/code.html +DEFAULT_DAG_ARGS = { + 'owner': 'jferriero@google.com', # The owner of the task. + # Task instance should not rely on the previous task's schedule to succeed. + 'depends_on_past': False, + # We use this in combination with schedule_interval=None to only trigger the + # DAG with a POST to the REST API. + # Alternatively, we could set this to yesterday and the dag will be + # triggered upon upload to the dag folder. 
+ 'start_date': datetime(2020, 1, 1), + 'email_on_failure': False, + 'email_on_retry': False, + 'retries': 1, # Retry once before failing the task. + 'retry_delay': timedelta(minutes=5), # Time between retries. + 'project_id': Variable.get('gcp_project'), # Cloud Composer project ID. + # We only want the DAG to run when we POST to the api. + # Alternatively, this could be set to '@daily' to run the job once a day. + # more options at https://airflow.apache.org/scheduler.html#dag-runs +} + +# Create Directed Acyclic Graph for Airflow +with DAG('ephemeral_dataproc_spark_dag', default_args=DEFAULT_DAG_ARGS, + schedule_interval=None) as dag: # Here we are using dag as context. + # Create the Cloud Dataproc cluster. + # Note: this operator will be flagged a success if the cluster by this name + # already exists. + create_cluster = DataprocClusterCreateOperator( + task_id='create_dataproc_cluster', + # ds_nodash is an airflow macro for "[Execution] Date string no dashes" + # in YYYYMMDD format. + # See docs https://airflow.apache.org/code.html?highlight=macros#macros + cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}', + image_version='1.5-debian10', + num_workers=2, + storage_bucket=Variable.get('dataproc_bucket'), + zone=Variable.get('gce_zone')) + + # Submit the PySpark job. + submit_pyspark = DataProcPySparkOperator( + task_id='run_dataproc_pyspark', + main='gs://' + Variable.get('gcs_bucket') + + '/spark-jobs/spark_avg_speed.py', + # Obviously needs to match the name of cluster created in the prior + # Operator. + cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}', + # Let's template our arguments for the pyspark job from the POST + # payload. + arguments=[ + "--gcs_path_raw={{ dag_run.conf['raw_path'] }}", + "--gcs_path_transformed=gs://{{ var.value.gcs_bucket}}" + + "/{{ dag_run.conf['transformed_path'] }}" + ]) + + # Load the transformed files to a BigQuery table. + bq_load = GoogleCloudStorageToBigQueryOperator( + task_id='GCS_to_BigQuery', + bucket='{{ var.value.gcs_bucket }}', + # Wildcard for objects created by spark job to be written to BigQuery + # Reads the relative path to the objects transformed by the spark job + # from the POST message. + source_objects=["{{ dag_run.conf['transformed_path'] }}/part-*"], + destination_project_dataset_table='{{ var.value.bq_output_table }}', + schema_fields=None, + # Relative gcs path to schema file. + schema_object='schemas/nyc-tlc-yellow.json', + # Note that our spark job does json -> csv conversion. + source_format='CSV', + create_disposition='CREATE_IF_NEEDED', + skip_leading_rows=0, + write_disposition='WRITE_TRUNCATE', # If the table exists, overwrite it + max_bad_records=0) + + # Delete the Cloud Dataproc cluster. + delete_cluster = DataprocClusterDeleteOperator( + task_id='delete_dataproc_cluster', + # Obviously needs to match the name of cluster created in the prior two + # Operators. + cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}', + # This will tear down the cluster even if there are failures in upstream + # tasks. + trigger_rule=TriggerRule.ALL_DONE) + + # Delete gcs files in the timestamped transformed folder. + delete_transformed_files = BashOperator( + task_id='delete_transformed_files', + bash_command="gsutil -m rm -r gs://{{ var.value.gcs_bucket }}" + + "/{{ dag_run.conf['transformed_path'] }}/") + + # If the spark job or BQ Load fails we rename the timestamped raw path to + # a timestamped failed path. 
+ move_failed_files = BashOperator( + task_id='move_failed_files', + bash_command="gsutil mv gs://{{ var.value.gcs_bucket }}" + + "/{{ dag_run.conf['raw_path'] }}/ " + "gs://{{ var.value.gcs_bucket}}" + + "/{{ dag_run.conf['failed_path'] }}/", + trigger_rule=TriggerRule.ONE_FAILED) + # Set the dag property of the first Operators, this will be inherited by + # downstream Operators. + + create_cluster.dag = dag + + create_cluster.set_downstream(submit_pyspark) + + submit_pyspark.set_downstream([delete_cluster, bq_load]) + + bq_load.set_downstream(delete_transformed_files) + + move_failed_files.set_upstream([bq_load, submit_pyspark]) diff --git a/source-code/workflow-dag/support-files/input.txt b/composer/dags/support-files/input.txt similarity index 100% rename from source-code/workflow-dag/support-files/input.txt rename to composer/dags/support-files/input.txt diff --git a/source-code/workflow-dag/support-files/ref.txt b/composer/dags/support-files/ref.txt similarity index 100% rename from source-code/workflow-dag/support-files/ref.txt rename to composer/dags/support-files/ref.txt diff --git a/composer/dags/tutorial.py b/composer/dags/tutorial.py new file mode 100644 index 0000000..fc6306d --- /dev/null +++ b/composer/dags/tutorial.py @@ -0,0 +1,121 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +### Tutorial Documentation +Documentation that goes along with the Airflow tutorial located +[here](https://airflow.apache.org/tutorial.html) +""" +# [START tutorial] +from datetime import timedelta + +# [START import_module] +# The DAG object; we'll need this to instantiate a DAG +from airflow import DAG +# Operators; we need this to operate! 
+from airflow.operators.bash_operator import BashOperator +from airflow.utils.dates import days_ago + +# [END import_module] + +# [START default_args] +# These args will get passed on to each operator +# You can override them on a per-task basis during operator initialization +default_args = { + 'owner': 'jferriero@google.com', + 'depends_on_past': False, + 'start_date': days_ago(2), + 'email': ['airflow@example.com'], + 'email_on_failure': False, + 'email_on_retry': False, + 'retries': 1, + 'retry_delay': timedelta(minutes=5), + # 'queue': 'bash_queue', + # 'pool': 'backfill', + # 'priority_weight': 10, + # 'end_date': datetime(2016, 1, 1), + # 'wait_for_downstream': False, + # 'dag': dag, + # 'sla': timedelta(hours=2), + # 'execution_timeout': timedelta(seconds=300), + # 'on_failure_callback': some_function, + # 'on_success_callback': some_other_function, + # 'on_retry_callback': another_function, + # 'sla_miss_callback': yet_another_function, + # 'trigger_rule': 'all_success' +} +# [END default_args] + +# [START instantiate_dag] +dag = DAG( + 'tutorial', + default_args=default_args, + description='A simple tutorial DAG', + schedule_interval=timedelta(days=1), +) +# [END instantiate_dag] + +# t1, t2 and t3 are examples of tasks created by instantiating operators +# [START basic_task] +t1 = BashOperator( + task_id='print_date', + bash_command='date', + dag=dag, +) + +t2 = BashOperator( + task_id='sleep', + depends_on_past=False, + bash_command='sleep 5', + retries=3, + dag=dag, +) +# [END basic_task] + +# [START documentation] +dag.doc_md = __doc__ + +t1.doc_md = """\ +#### Task Documentation +You can document your task using the attributes `doc_md` (markdown), +`doc` (plain text), `doc_rst`, `doc_json`, `doc_yaml` which gets +rendered in the UI's Task Instance Details page. +![img](http://montcs.bloomu.edu/~bobmon/Semesters/2012-01/491/import%20soul.png) +""" +# [END documentation] + +# [START jinja_template] +templated_command = """ +{% for i in range(5) %} + echo "{{ ds }}" + echo "{{ macros.ds_add(ds, 7)}}" + echo "{{ params.my_param }}" +{% endfor %} +""" + +t3 = BashOperator( + task_id='templated', + depends_on_past=False, + bash_command=templated_command, + params={'my_param': 'Parameter I passed in'}, + dag=dag, +) +# [END jinja_template] + +t1 >> [t2, t3] +# [END tutorial] diff --git a/composer/dags/wordcount_dag.py b/composer/dags/wordcount_dag.py new file mode 100644 index 0000000..9423570 --- /dev/null +++ b/composer/dags/wordcount_dag.py @@ -0,0 +1,121 @@ +# Copyright 2019 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Data processing test workflow definition. 
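+
+The DAG first runs a BigQuery smoke-test query, then launches the wordcount
+Dataflow jar, downloads the three output shards plus a reference file from
+GCS, and compares them with CompareXComMapsOperator.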
+""" + +import datetime +import os +from airflow import models +from airflow.contrib.operators.bigquery_operator import BigQueryOperator +from airflow.contrib.operators.dataflow_operator import DataFlowJavaOperator +from airflow.contrib.operators.gcs_download_operator import \ + GoogleCloudStorageDownloadOperator +# pylint: disable=import-error +from airflow.operators.xcom_utils_plugin import CompareXComMapsOperator + +DATAFLOW_STAGING_BUCKET = 'gs://{{ var.value.dataflow_staging_bucket }}/staging' + +DATAFLOW_JAR_LOCATION = ('gs://{{ var.value.dataflow_jar_location }}' + '/{{ var.dataflow_word_count_jar }}') + +PROJECT = models.Variable.get('gcp_project') +REGION = models.Variable.get('gcp_region') +INPUT_BUCKET = 'gs://' + models.Variable.get('gcs_input_bucket') +OUTPUT_BUCKET_NAME = models.Variable.get('gcs_output_bucket') +OUTPUT_BUCKET = 'gs://' + OUTPUT_BUCKET_NAME +REF_BUCKET = models.Variable.get('gcs_ref_bucket') +OUTPUT_PREFIX = 'output' +DOWNLOAD_TASK_PREFIX = 'download_result' + +# Dynamic prefix gives us flexibility for running airflow in a ci container or +# on composer. +SQL_PREFIX = os.path.join(os.environ.get('AIRFLOW_HOME', '/home/airflow'), + 'gcs', 'data', 'sql') + +SHAKESPEARE_SQL = os.path.join(SQL_PREFIX, 'shakespeare_top_25.sql') + +YESTERDAY = datetime.datetime.combine( + datetime.datetime.today() - datetime.timedelta(1), + datetime.datetime.min.time()) + +DEFAULT_ARGS = { + 'owner': 'jferriero@google.com', + 'dataflow_default_options': { + 'project': PROJECT, + 'region': REGION, + 'stagingLocation': DATAFLOW_STAGING_BUCKET + } +} + +with models.DAG('wordcount_dag', + start_date=YESTERDAY, + schedule_interval=None, + default_args=DEFAULT_ARGS) as dag: + + DATAFLOW_EXECUTION = DataFlowJavaOperator( + task_id='wordcount-run', + jar=DATAFLOW_JAR_LOCATION, + options={ + 'autoscalingAlgorithm': 'THROUGHPUT_BASED', + 'maxNumWorkers': '3', + 'inputFile': f'{INPUT_BUCKET}/input.txt', + 'output': f'{OUTPUT_BUCKET}/{OUTPUT_PREFIX}' + }) + + DOWNLOAD_EXPECTED = GoogleCloudStorageDownloadOperator( + task_id='download_ref_string', + bucket=REF_BUCKET, + object='ref.txt', + store_to_xcom_key='ref_str', + ) + + DOWNLOAD_RESULT_ONE = GoogleCloudStorageDownloadOperator( + task_id=DOWNLOAD_TASK_PREFIX + '_1', + bucket=OUTPUT_BUCKET_NAME, + object=OUTPUT_PREFIX + '-00000-of-00003', + store_to_xcom_key='res_str_1', + ) + + DOWNLOAD_RESULT_TWO = GoogleCloudStorageDownloadOperator( + task_id=DOWNLOAD_TASK_PREFIX + '_2', + bucket=OUTPUT_BUCKET_NAME, + object=OUTPUT_PREFIX + '-00001-of-00003', + store_to_xcom_key='res_str_2', + ) + + DOWNLOAD_RESULT_THREE = GoogleCloudStorageDownloadOperator( + task_id=DOWNLOAD_TASK_PREFIX + '_3', + bucket=OUTPUT_BUCKET_NAME, + object=OUTPUT_PREFIX + '-00002-of-00003', + store_to_xcom_key='res_str_3', + ) + + COMPARE_RESULT = CompareXComMapsOperator( + task_id='do_comparison', + ref_task_ids=['download_ref_string'], + res_task_ids=[ + DOWNLOAD_TASK_PREFIX + '_1', DOWNLOAD_TASK_PREFIX + '_2', + DOWNLOAD_TASK_PREFIX + '_3' + ], + ) + RUN_QUERY = BigQueryOperator(task_id='run_sql', sql=SHAKESPEARE_SQL) + + RUN_QUERY >> DATAFLOW_EXECUTION # pylint: disable=pointless-statement + DATAFLOW_EXECUTION.set_downstream( + [DOWNLOAD_RESULT_ONE, DOWNLOAD_RESULT_TWO, DOWNLOAD_RESULT_THREE]) + + COMPARE_RESULT.set_upstream([ + DOWNLOAD_EXPECTED, DOWNLOAD_RESULT_ONE, DOWNLOAD_RESULT_TWO, + DOWNLOAD_RESULT_THREE + ]) diff --git a/composer/plugins/xcom_utils_plugin/__init__.py b/composer/plugins/xcom_utils_plugin/__init__.py new file mode 100644 index 0000000..9aa5b70 --- 
/dev/null +++ b/composer/plugins/xcom_utils_plugin/__init__.py @@ -0,0 +1,41 @@ +# Copyright 2019 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Defines Plugin for XCom Operators.""" + +from typing import Any, List +from airflow.plugins_manager import AirflowPlugin + +# Allow unittests to easily import. +try: + from xcom_utils_plugin.operators.compare_xcom_maps import \ + CompareXComMapsOperator +except ModuleNotFoundError: + from plugins.xcom_utils_plugin.operators.compare_xcom_maps import \ + CompareXComMapsOperator + + +class XComUtilsPlugin(AirflowPlugin): + """Plugin to define operators perform common logic on XComs. + Operators: + CompareXComMapsOperator: An Operator that checks the equality + of XComs. + """ + name = "xcom_utils_plugin" + operators = [CompareXComMapsOperator] + hooks: List[Any] = [] + executors: List[Any] = [] + macros: List[Any] = [] + admin_views: List[Any] = [] + flask_blueprints: List[Any] = [] + menu_links: List[Any] = [] diff --git a/composer/plugins/xcom_utils_plugin/operators/__init__.py b/composer/plugins/xcom_utils_plugin/operators/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/composer/plugins/xcom_utils_plugin/operators/compare_xcom_maps.py b/composer/plugins/xcom_utils_plugin/operators/compare_xcom_maps.py new file mode 100644 index 0000000..512f55c --- /dev/null +++ b/composer/plugins/xcom_utils_plugin/operators/compare_xcom_maps.py @@ -0,0 +1,84 @@ +# Copyright 2019 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Custom operator that compares dictionaries in xcom. +""" + +from airflow.models import BaseOperator +from airflow.utils.decorators import apply_defaults + + +class CompareXComMapsOperator(BaseOperator): + """Compare dictionary stored in xcom. + + Args: + ref_task_ids: list of task ids from where the reference dictionary + is fetched + res_task_ids: list of task ids from where the comparing dictionary + is fetched + """ + + @apply_defaults + def __init__(self, ref_task_ids, res_task_ids, *args, **kwargs): + super(CompareXComMapsOperator, self).__init__(*args, **kwargs) + self.ref_task_ids = ref_task_ids + self.res_task_ids = res_task_ids + + def execute(self, context): + """Perform the XCom comparison based on the ref and res task_ids. 
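+
+        Raises ValueError with a per-key diff when the reference and result
+        maps differ; otherwise returns a short success message.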
+        """
+        ref_obj = self.read_value_as_obj(self.ref_task_ids, context)
+        res_obj = self.read_value_as_obj(self.res_task_ids, context)
+        self.compare_obj(ref_obj, res_obj)
+        return 'result contains the expected values'
+
+    def read_value_as_obj(self, task_ids, context):
+        """Reads XComs from task_ids as dict.
+        """
+        ret_obj = {}
+        for task_id in task_ids:
+            value_str = context['ti'].xcom_pull(key=None, task_ids=task_id)
+            self.parse_str_obj(value_str, ret_obj)
+        return ret_obj
+
+    @staticmethod
+    def parse_str_obj(str_rep, obj):
+        """Parses 'key: value' strings into the given dict.
+        """
+        entries = str_rep.split('\n')
+        for entry in entries:
+            if entry:
+                key, value = entry.split(': ')
+                obj[key] = value
+
+    def compare_obj(self, ref_obj, res_obj):
+        """Raise ValueError if objects are not equal"""
+        if ref_obj != res_obj:
+            raise ValueError(self.create_diff_str(ref_obj, res_obj))
+
+    @staticmethod
+    def create_diff_str(ref_obj, res_obj):
+        """Creates an informative error message detailing the differences
+        in the objects.
+        """
+        msg = 'The result differs from expected in the following ways:'
+        for k in ref_obj:
+            if k not in res_obj:
+                msg = msg + ('\nmissing key: %s in result' % k)
+            elif ref_obj[k] != res_obj[k]:
+                msg = msg + ('\nexpected %s: %s but got %s: %s' %
+                             (k, ref_obj[k], k, res_obj[k]))
+        for k in res_obj:
+            if k not in ref_obj:
+                msg = msg + ('\nunexpected key: %s in result' % k)
+        return msg
diff --git a/composer/precommit_cloudbuild.yaml b/composer/precommit_cloudbuild.yaml
new file mode 100644
index 0000000..e7388a4
--- /dev/null
+++ b/composer/precommit_cloudbuild.yaml
@@ -0,0 +1,31 @@
+# Copyright 2019 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Composer is an orchestrator and therefore to test appropriately, we need
+# to build everything that it will be orchestrating and stage it in a known,
+# readable place to facilitate the tests.
+steps:
+# Run unit tests in Airflow container (local to cloud build).
+- name: 'gcr.io/cloud-solutions-images/apache-airflow:1.10'
+  dir: 'composer'
+  entrypoint: 'bash'
+  args: [
+    'cloudbuild/bin/run_tests.sh',
+    '../bigquery/sql',
+    './config/AirflowVariables.json',
+    './plugins']
+  id: 'run-unit-tests'
+options:
+  machineType: 'N1_HIGHCPU_8' # For test and deploy dags parallelization.
diff --git a/composer/requirements-dev.txt b/composer/requirements-dev.txt
new file mode 100644
index 0000000..4a10aff
--- /dev/null
+++ b/composer/requirements-dev.txt
@@ -0,0 +1,13 @@
+apache-airflow[gcp_api]==1.10.6
+flake8>=3.6.0
+flake8-colors
+nose
+pytest
+parameterized
+pre-commit
+pylint~=2.3.1
+nose
+mock
+mypy
+tenacity==5.1.5
+Werkzeug==0.16.0
diff --git a/composer/tests/__init__.py b/composer/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/composer/tests/test_compare_xcom_maps.py b/composer/tests/test_compare_xcom_maps.py
new file mode 100644
index 0000000..2c1e05f
--- /dev/null
+++ b/composer/tests/test_compare_xcom_maps.py
@@ -0,0 +1,130 @@
+# Copyright 2019 Google Inc.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Unit test of the CompareXComMapsOperator. +""" +import unittest +import mock + +# pylint: disable=import-error +from plugins.xcom_utils_plugin.operators.compare_xcom_maps import \ + CompareXComMapsOperator + +TASK_ID = 'test_compare_task_id' +REF_TASK_ID = 'download_ref_string' +DOWNLOAD_TASK_PREFIX = 'download_result' +CONTEXT_CLASS_NAME = 'airflow.ti_deps.dep_context' +ERROR_LINE_ONE = 'The result differs from expected in the following ways:\n' + + +def generate_mock_function(first_value, second_value, third_value): + """Mock dictionary for XCom.""" + + def mock_function(**kwargs): + return { + REF_TASK_ID: 'a: 1\nb: 2\nc: 3', + DOWNLOAD_TASK_PREFIX + '_1': first_value, + DOWNLOAD_TASK_PREFIX + '_2': second_value, + DOWNLOAD_TASK_PREFIX + '_3': third_value + }[kwargs['task_ids']] + + return mock_function + + +def equal_mock(): + """Mocks no change.""" + return generate_mock_function('c: 3', 'b: 2', 'a: 1') + + +def missing_value_mock(): + """Mock missing key.""" + return generate_mock_function('b: 2', 'a: 1', 'b: 2') + + +def wrong_value_mock(): + """Mock wrong value.""" + return generate_mock_function('a: 1', 'b: 4', 'c: 3') + + +def unexpected_value_mock(): + """Mock wrong key.""" + return generate_mock_function('a: 1', 'c: 3\nd: 4', 'b: 2') + + +class CompareXComMapsOperatorTest(unittest.TestCase): + """Test class for XComMapsOperator for success case and various + error handling.""" + + def setUp(self): + """Set up test fixture.""" + super(CompareXComMapsOperatorTest, self).setUp() + self.xcom_compare = CompareXComMapsOperator( + task_id=TASK_ID, + ref_task_ids=[REF_TASK_ID], + res_task_ids=[ + DOWNLOAD_TASK_PREFIX + '_1', DOWNLOAD_TASK_PREFIX + '_2', + DOWNLOAD_TASK_PREFIX + '_3' + ]) + + def test_init(self): + """Test the Operator's constructor.""" + self.assertEqual(self.xcom_compare.task_id, TASK_ID) + self.assertListEqual(self.xcom_compare.ref_task_ids, [REF_TASK_ID]) + self.assertListEqual(self.xcom_compare.res_task_ids, [ + DOWNLOAD_TASK_PREFIX + '_1', DOWNLOAD_TASK_PREFIX + '_2', + DOWNLOAD_TASK_PREFIX + '_3' + ]) + + def assert_raises_with_message(self, error_type, msg, func, *args, + **kwargs): + """Utility method for asserting a message was produced.""" + with self.assertRaises(error_type) as context: + func(*args, **kwargs) + self.assertEqual(msg, str(context.exception)) + + def execute_value_error(self, mock_func, error_expect_tr): + """Utility for testing various ValueError paths.""" + with mock.patch(CONTEXT_CLASS_NAME) as context_mock: + context_mock['ti'].xcom_pull = mock_func + self.assert_raises_with_message(ValueError, error_expect_tr, + self.xcom_compare.execute, + context_mock) + + def test_equal(self): + """Test success case.""" + with mock.patch(CONTEXT_CLASS_NAME) as context_mock: + context_mock['ti'].xcom_pull = equal_mock() + self.xcom_compare.execute(context_mock) + + def test_missing_value(self): + """Test expected error message when missing key.""" + self.execute_value_error( + missing_value_mock(), 
'{}{}'.format(ERROR_LINE_ONE, + 'missing key: c in result')) + + def test_wrong_value(self): + """Test expected error message if xcom values don't match.""" + self.execute_value_error( + wrong_value_mock(), '{}{}'.format(ERROR_LINE_ONE, + 'expected b: 2 but got b: 4')) + + def test_unexpected_value(self): + """Test expected error message if xcom contains unexpected key.""" + self.execute_value_error( + unexpected_value_mock(), + '{}{}'.format(ERROR_LINE_ONE, 'unexpected key: d in result')) + + +SUITE = unittest.TestLoader().loadTestsFromTestCase(CompareXComMapsOperatorTest) + +unittest.TextTestRunner(verbosity=2).run(SUITE) diff --git a/composer/tests/test_dag_validation.py b/composer/tests/test_dag_validation.py new file mode 100644 index 0000000..b7d5776 --- /dev/null +++ b/composer/tests/test_dag_validation.py @@ -0,0 +1,89 @@ +# Copyright 2019 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""DAG Quality tests.""" + +import os +from pathlib import Path +import time +import unittest + +from airflow.models import DagBag + + +class TestDagIntegrity(unittest.TestCase): + """Tests DAG Syntax, compatibility with environment and load time.""" + LOAD_SECOND_THRESHOLD = 2 + + def setUp(self): + """Setup dagbag for each test.""" + self.dagbag = DagBag( + dag_folder=os.environ.get('AIRFLOW_HOME', "~/airflow/") + '/dags/', + include_examples=False) + with open('./config/running_dags.txt') as running_dags_txt: + self.running_dag_ids = running_dags_txt.read().splitlines() + + def test_no_ignore_running_dags(self): + """ + Tests that we don't have any dags in running_dags.txt that are + ignored by .airflowignore + """ + for dag_id in self.running_dag_ids: + try: + self.assertTrue(self.dagbag.get_dag(dag_id) is not None) + except AssertionError: + self.fail(f"{dag_id} is in running_dags.txt but not dagbag.") + + def test_import_dags(self): + """Tests there are no syntax issues or environment compaibility issues. + """ + self.assertFalse( + len(self.dagbag.import_errors), + 'DAG import failures. 
Errors: {}'.format(
+                self.dagbag.import_errors))
+
+    def test_non_airflow_owner(self):
+        """Tests that owners are set for all dags"""
+        for dag_id in self.dagbag.dag_ids:
+            if dag_id != 'airflow_monitoring':
+                dag = self.dagbag.get_dag(dag_id)
+                try:
+                    self.assertIsNotNone(dag.owner)
+                    self.assertNotEqual(dag.owner, 'airflow')
+                except AssertionError as err:
+                    self.fail(f"issue validating owner for DAG {dag_id}: {err}")
+
+    def test_same_file_and_dag_id_name(self):
+        """Tests that filename matches dag_id"""
+        for dag_id in self.dagbag.dag_ids:
+            dag = self.dagbag.get_dag(dag_id)
+            if not dag.is_subdag:
+                stripped_filename = os.path.splitext(
+                    Path(self.dagbag.get_dag(dag_id).filepath).name)[0]
+                self.assertEqual(dag_id, stripped_filename)
+
+    def test_import_time(self):
+        """Test that all DAGs can be parsed under the threshold time."""
+        for dag_id in self.dagbag.dag_ids:
+            start = time.time()
+
+            self.dagbag.process_file(self.dagbag.get_dag(dag_id).filepath)
+
+            end = time.time()
+            total = end - start
+
+            self.assertLessEqual(total, self.LOAD_SECOND_THRESHOLD)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/dataflow/java/wordcount/cloudbuild.yaml b/dataflow/java/wordcount/cloudbuild.yaml
new file mode 100644
index 0000000..b50cfde
--- /dev/null
+++ b/dataflow/java/wordcount/cloudbuild.yaml
@@ -0,0 +1,25 @@
+steps:
+# [Dataflow]
+# Maven package also runs the prior lifecycle phases (validate, compile, test)
+# https://maven.apache.org/guides/introduction/introduction-to-the-lifecycle.html#a-build-lifecycle-is-made-up-of-phases
+- name: maven:3.6.0-jdk-8-slim
+  waitFor: ['-']
+  dir: 'dataflow/java/wordcount'
+  entrypoint: 'mvn'
+  args: ['package', '-q']
+  id: 'build-wordcount-jar'
+# Override the JAR reference variable with the artifact built in this build so
+# that the Airflow DAG that orchestrates this job picks up this version of the JAR.
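+# For example, with _SHORT_SHA=abc1234 (a placeholder value) the Airflow variable
+# dataflow_word_count_jar would end up as something like
+# "wordcount/abc1234/word-count-beam-bundled-<version>.jar", i.e. the same path
+# the artifacts section below uploads the bundled JAR to.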
+- name: 'google/cloud-sdk' + waitFor: ['build-wordcount-jar'] + dir: 'dataflow/java/wordcount' + entrypoint: 'bash' + args: [ + '-c', + 'gcloud composer environments run --location ${_COMPOSER_REGION} ${_COMPOSER_ENV_NAME} variables -- --set dataflow_word_count_jar "wordcount/${_SHORT_SHA}/$(basename $(ls target/word-count-beam-bundled-*.jar))"' + ] + id: 'set-composer-jar-ref' +artifacts: + objects: + location: 'gs://${_DATAFLOW_JAR_BUCKET}/wordcount/${_SHORT_SHA}/' + paths: ['./dataflow/java/wordcount/target/word-count-beam-bundled-*.jar'] diff --git a/source-code/data-processing-code/pom.xml b/dataflow/java/wordcount/pom.xml similarity index 100% rename from source-code/data-processing-code/pom.xml rename to dataflow/java/wordcount/pom.xml diff --git a/dataflow/java/wordcount/precommit_cloudbuild.yaml b/dataflow/java/wordcount/precommit_cloudbuild.yaml new file mode 100644 index 0000000..5bc80b0 --- /dev/null +++ b/dataflow/java/wordcount/precommit_cloudbuild.yaml @@ -0,0 +1,10 @@ +steps: +# [Dataflow] +# Maven package will run compile run the prior phases (validate, compile, test) +# https://maven.apache.org/guides/introduction/introduction-to-the-lifecycle.html#a-build-lifecycle-is-made-up-of-phases +- name: maven:3.6.0-jdk-8-slim + waitFor: ['-'] + dir: 'dataflow/java/wordcount' + entrypoint: 'mvn' + args: ['package', '-q'] + id: 'build-wordcount-jar' \ No newline at end of file diff --git a/dataflow/java/wordcount/src/main/java/org/apache/beam/examples/WordCount.java b/dataflow/java/wordcount/src/main/java/org/apache/beam/examples/WordCount.java new file mode 100644 index 0000000..4b20d48 --- /dev/null +++ b/dataflow/java/wordcount/src/main/java/org/apache/beam/examples/WordCount.java @@ -0,0 +1,195 @@ +/* + * Copyright 2019 Google Inc. + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.beam.examples;
+
+import org.apache.beam.sdk.Pipeline;
+import org.apache.beam.sdk.io.TextIO;
+import org.apache.beam.sdk.metrics.Counter;
+import org.apache.beam.sdk.metrics.Distribution;
+import org.apache.beam.sdk.metrics.Metrics;
+import org.apache.beam.sdk.options.Default;
+import org.apache.beam.sdk.options.Description;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.options.PipelineOptionsFactory;
+import org.apache.beam.sdk.options.Validation.Required;
+import org.apache.beam.sdk.transforms.Count;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.MapElements;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.beam.sdk.transforms.SimpleFunction;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PCollection;
+
+/**
+ * An example that counts words in Shakespeare and includes Beam best practices.
+ *
+ * <p>This class, {@link WordCount}, is the second in a series of four successively more detailed
+ * 'word count' examples. You may first want to take a look at {@link MinimalWordCount}. After
+ * you've looked at this example, then see the {@link DebuggingWordCount} pipeline, for introduction
+ * of additional concepts.
+ *
+ * <p>For a detailed walkthrough of this example, see
+ * https://beam.apache.org/get-started/wordcount-example/
+ *
+ * <p>Basic concepts, also in the MinimalWordCount example: Reading text files; counting a
+ * PCollection; writing to text files
+ *
+ * <p>New Concepts:
+ *
+ * <pre>
+ *   1. Executing a Pipeline both locally and using the selected runner
+ *   2. Using ParDo with static DoFns defined out-of-line
+ *   3. Building a composite transform
+ *   4. Defining your own pipeline options
+ * </pre>
+ *
+ * <p>Concept #1: you can execute this pipeline either locally or by selecting another runner.
+ * These are now command-line options and not hard-coded as they were in the MinimalWordCount
+ * example.
+ *
+ * <p>To change the runner, specify:
+ *
+ * <pre>{@code
+ * --runner=YOUR_SELECTED_RUNNER
+ * }</pre>
+ *
+ * <p>To execute this pipeline, specify a local output file (if using the {@code DirectRunner}) or
+ * output prefix on a supported distributed file system.
+ *
+ * <pre>{@code
+ * --output=[YOUR_LOCAL_FILE | YOUR_OUTPUT_PREFIX]
+ * }</pre>
+ *
+ * <p>The input file defaults to a public data set containing the text of King Lear, by William
+ * Shakespeare. You can override it and choose your own input with {@code --inputFile}.
+ */
+public class WordCount {
+
+  /**
+   * Concept #2: You can make your pipeline assembly code less verbose by defining your DoFns
+   * statically out-of-line. This DoFn tokenizes lines of text into individual words; we pass it to
+   * a ParDo in the pipeline.
+   */
+  static class ExtractWordsFn extends DoFn<String, String> {
+    private final Counter emptyLines = Metrics.counter(ExtractWordsFn.class, "emptyLines");
+    private final Distribution lineLenDist =
+        Metrics.distribution(ExtractWordsFn.class, "lineLenDistro");
+    private static final String TOKENIZER_PATTERN = "[^\\p{L}]+";
+
+    @ProcessElement
+    public void processElement(@Element String element, OutputReceiver<String> receiver) {
+      lineLenDist.update(element.length());
+      if (element.trim().isEmpty()) {
+        emptyLines.inc();
+      }
+
+      // Split the line into words.
+      String[] words = element.split(TOKENIZER_PATTERN, -1);
+
+      // Output each word encountered into the output PCollection.
+      for (String word : words) {
+        if (!word.isEmpty()) {
+          receiver.output(word);
+        }
+      }
+    }
+  }
+
+  /** A SimpleFunction that converts a Word and Count into a printable string. */
+  public static class FormatAsTextFn extends SimpleFunction<KV<String, Long>, String> {
+    @Override
+    public String apply(KV<String, Long> input) {
+      return input.getKey() + ": " + input.getValue();
+    }
+  }
+
+  /**
+   * A PTransform that converts a PCollection containing lines of text into a PCollection of
+   * formatted word counts.
+   *
+   * <p>Concept #3: This is a custom composite transform that bundles two transforms (ParDo and
+   * Count) as a reusable PTransform subclass. Using composite transforms allows for easy reuse,
+   * modular testing, and an improved monitoring experience.
+   */
+  public static class CountWords
+      extends PTransform<PCollection<String>, PCollection<KV<String, Long>>> {
+    @Override
+    public PCollection<KV<String, Long>> expand(PCollection<String> lines) {
+
+      // Convert lines of text into individual words.
+      PCollection<String> words = lines.apply(ParDo.of(new ExtractWordsFn()));
+
+      // Count the number of times each word occurs.
+      PCollection<KV<String, Long>> wordCounts = words.apply(Count.perElement());
+
+      return wordCounts;
+    }
+  }
+
+  /**
+   * Options supported by {@link WordCount}.
+   *
+   * <p>Concept #4: Defining your own configuration options. Here, you can add your own arguments to
+   * be processed by the command-line parser, and specify default values for them. You can then
+   * access the options values in your pipeline code.
+   *
+   * <p>Inherits standard configuration options.
+   */
+  public interface WordCountOptions extends PipelineOptions {
+
+    /**
+     * By default, this example reads from a public dataset containing the text of King Lear. Set
+     * this option to choose a different input file or glob.
+     */
+    @Description("Path of the file to read from")
+    @Default.String("gs://apache-beam-samples/shakespeare/kinglear.txt")
+    String getInputFile();
+
+    void setInputFile(String value);
+
+    /** Set this required option to specify where to write the output. */
+    @Description("Path of the file to write to")
+    @Required
+    String getOutput();
+
+    void setOutput(String value);
+  }
+
+  static void runWordCount(WordCountOptions options) {
+    Pipeline p = Pipeline.create(options);
+
+    // Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the
+    // static FormatAsTextFn() to the ParDo transform.
+    p.apply("ReadLines", TextIO.read().from(options.getInputFile()))
+        .apply(new CountWords())
+        .apply(MapElements.via(new FormatAsTextFn()))
+        .apply("WriteCounts", TextIO.write().to(options.getOutput()));
+
+    p.run().waitUntilFinish();
+  }
+
+  public static void main(String[] args) {
+    WordCountOptions options =
+        PipelineOptionsFactory.fromArgs(args).withValidation().as(WordCountOptions.class);
+    runWordCount(options);
+  }
+}
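Note: the WordCount pipeline added above can be smoke-tested locally with the DirectRunner before it is orchestrated by Composer and Dataflow. A minimal sketch, assuming the vendored pom.xml keeps the upstream Beam exec-maven-plugin wiring and a direct-runner profile (adjust the profile and flags to whatever the pom actually defines):

    cd dataflow/java/wordcount
    mvn compile exec:java \
      -Dexec.mainClass=org.apache.beam.examples.WordCount \
      -Dexec.args="--inputFile=pom.xml --output=counts" \
      -Pdirect-runner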
diff --git a/source-code/data-processing-code/src/test/java/org/apache/beam/examples/WordCountTest.java b/dataflow/java/wordcount/src/test/java/org/apache/beam/examples/WordCountTest.java
similarity index 55%
rename from source-code/data-processing-code/src/test/java/org/apache/beam/examples/WordCountTest.java
rename to dataflow/java/wordcount/src/test/java/org/apache/beam/examples/WordCountTest.java
index df8f3a3..7c76573 100644
--- a/source-code/data-processing-code/src/test/java/org/apache/beam/examples/WordCountTest.java
+++ b/dataflow/java/wordcount/src/test/java/org/apache/beam/examples/WordCountTest.java
@@ -45,52 +45,42 @@
 @RunWith(JUnit4.class)
 public class WordCountTest {
-  /** Example test that tests a specific {@link DoFn}. */
-  @Test
-  public void testExtractWordsFn() throws Exception {
-    DoFnTester < String, String > extractWordsFn = DoFnTester.of(new ExtractWordsFn());
+  /** Example test that tests a specific {@link DoFn}. */
+  @Test
+  public void testExtractWordsFn() throws Exception {
+    DoFnTester<String, String> extractWordsFn = DoFnTester.of(new ExtractWordsFn());

-    Assert.assertThat(
-        extractWordsFn.processBundle(" some input words "),
-        CoreMatchers.hasItems("some", "input", "words"));
-    Assert.assertThat(extractWordsFn.processBundle(" "), CoreMatchers.hasItems());
-    Assert.assertThat(
-        extractWordsFn.processBundle(" some ", " input", " words"),
-        CoreMatchers.hasItems("some", "input", "words"));
-  }
+    Assert.assertThat(
+        extractWordsFn.processBundle(" some input words "),
+        CoreMatchers.hasItems("some", "input", "words"));
+    Assert.assertThat(extractWordsFn.processBundle(" "), CoreMatchers.hasItems());
+    Assert.assertThat(
+        extractWordsFn.processBundle(" some ", " input", " words"),
+        CoreMatchers.hasItems("some", "input", "words"));
+  }

-  static final String[] WORDS_ARRAY =
-      new String[] {
-        "five",
-        "five four",
-        "five four three",
-        "five four three two",
-        "",
-        "five four three two one"
-      };
+  static final String[] WORDS_ARRAY =
+      new String[] {
+        "five", "five four", "five four three", "five four three two", "", "five four three two one"
+      };

-  static final List < String > WORDS = Arrays.asList(WORDS_ARRAY);
+  static final List<String> WORDS = Arrays.asList(WORDS_ARRAY);

-  static final String[] COUNTS_ARRAY = new String[] {
-    "five: 5",
-    "four: 4",
-    "three: 3",
-    "two: 2",
-    "one: 1"
-  };
+  static final String[] COUNTS_ARRAY =
+      new String[] {"five: 5", "four: 4", "three: 3", "two: 2", "one: 1"};

-  @Rule public TestPipeline p = TestPipeline.create();
+  @Rule public TestPipeline p = TestPipeline.create();

-  /** Example test that tests a PTransform by using an in-memory input and inspecting the output. */
-  @Test
-  @Category(ValidatesRunner.class)
-  public void testCountWords() throws Exception {
-    PCollection < String > input = p.apply(Create.of(WORDS).withCoder(StringUtf8Coder.of()));
+  /** Example test that tests a PTransform by using an in-memory input and inspecting the output. */
+  @Test
+  @Category(ValidatesRunner.class)
+  public void testCountWords() throws Exception {
+    PCollection<String> input = p.apply(Create.of(WORDS).withCoder(StringUtf8Coder.of()));

-    PCollection < String > output =
-        input.apply(new CountWords()).apply(MapElements.via(new FormatAsTextFn()));
+    PCollection<String> output =
+        input.apply(new CountWords()).apply(MapElements.via(new FormatAsTextFn()));

-    PAssert.that(output).containsInAnyOrder(COUNTS_ARRAY);
-    p.run().waitUntilFinish();
-  }
+    PAssert.that(output).containsInAnyOrder(COUNTS_ARRAY);
+    p.run().waitUntilFinish();
+  }
 }
diff --git a/env-setup/create_buckets.sh b/env-setup/create_buckets.sh
deleted file mode 100644
index 88ea048..0000000
--- a/env-setup/create_buckets.sh
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/bin/bash
-#
-# This script creates the buckets used by the build pipelines and the data
-# processing workflow. It also gives the Cloud Composer service account the
-# access level it need to execute the data processing workflow
-#
-# Copyright 2019 Google Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. - -gsutil ls -L "gs://${DATAFLOW_JAR_BUCKET_TEST}" 2>/dev/null \ -|| gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${DATAFLOW_JAR_BUCKET_TEST}" -gsutil ls -L "gs://${INPUT_BUCKET_TEST}" 2>/dev/null \ -|| gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${INPUT_BUCKET_TEST}" -gsutil ls -L "gs://${REF_BUCKET_TEST}" 2>/dev/null \ -|| gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${REF_BUCKET_TEST}" -gsutil ls -L "gs://${RESULT_BUCKET_TEST}" 2>/dev/null \ -|| gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${RESULT_BUCKET_TEST}" -gsutil ls -L "gs://${DATAFLOW_STAGING_BUCKET_TEST}" 2>/dev/null \ -|| gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${DATAFLOW_STAGING_BUCKET_TEST}" -gsutil ls -L "gs://${DATAFLOW_JAR_BUCKET_PROD}" 2>/dev/null \ -|| gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${DATAFLOW_JAR_BUCKET_PROD}" -gsutil ls -L "gs://${INPUT_BUCKET_PROD}" 2>/dev/null \ -|| gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${INPUT_BUCKET_PROD}" -gsutil ls -L "gs://${RESULT_BUCKET_PROD}" 2>/dev/null \ -|| gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${RESULT_BUCKET_PROD}" -gsutil ls -L "gs://${DATAFLOW_STAGING_BUCKET_PROD}" 2>/dev/null \ -|| gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${DATAFLOW_STAGING_BUCKET_PROD}" - -gsutil acl ch -u "${COMPOSER_SERVICE_ACCOUNT}:R" \ - "gs://${DATAFLOW_JAR_BUCKET_TEST}" \ - "gs://${INPUT_BUCKET_TEST}" \ - "gs://${REF_BUCKET_TEST}" \ - "gs://${DATAFLOW_JAR_BUCKET_PROD}" "gs://${INPUT_BUCKET_PROD}" -gsutil acl ch -u "${COMPOSER_SERVICE_ACCOUNT}:W" \ - "gs://${RESULT_BUCKET_TEST}" \ - "gs://${DATAFLOW_STAGING_BUCKET_TEST}" \ - "gs://${RESULT_BUCKET_PROD}" "gs://${DATAFLOW_STAGING_BUCKET_PROD}" diff --git a/env-setup/set_composer_variables.sh b/env-setup/set_composer_variables.sh deleted file mode 100644 index 67fbffa..0000000 --- a/env-setup/set_composer_variables.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash -# -# This script sets the variables in Composer. The variables are needed for the -# data processing DAGs to properly execute, such as project-id, GCP region and -#zone. It also sets Cloud Storage buckets where test files are stored. -# -# Copyright 2019 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -declare -A variables -variables["gcp_project"]="${GCP_PROJECT_ID}" -variables["gcp_region"]="${COMPOSER_REGION}" -variables["gcp_zone"]="${COMPOSER_ZONE_ID}" -variables["dataflow_jar_location_test"]="${DATAFLOW_JAR_BUCKET_TEST}" -variables["dataflow_jar_file_test"]="to_be_overriden" -variables["gcs_input_bucket_test"]="${INPUT_BUCKET_TEST}" -variables["gcs_ref_bucket_test"]="${REF_BUCKET_TEST}" -variables["gcs_output_bucket_test"]="${RESULT_BUCKET_TEST}" -variables["dataflow_staging_bucket_test"]="${DATAFLOW_STAGING_BUCKET_TEST}" -variables["dataflow_jar_location_prod"]="${DATAFLOW_JAR_BUCKET_PROD}" -variables["dataflow_jar_file_prod"]="to_be_overriden" -variables["gcs_input_bucket_prod"]="${INPUT_BUCKET_PROD}" -variables["gcs_output_bucket_prod"]="${RESULT_BUCKET_PROD}" -variables["dataflow_staging_bucket_prod"]="${DATAFLOW_STAGING_BUCKET_PROD}" - -for i in "${!variables[@]}"; do - gcloud composer environments run "${COMPOSER_ENV_NAME}" \ - --location "${COMPOSER_REGION}" variables -- --set "${i}" "${variables[$i]}" -done diff --git a/env-setup/set_env.sh b/env-setup/set_env.sh deleted file mode 100644 index 458a28b..0000000 --- a/env-setup/set_env.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash -# -# This script sets the environment variables for project environment specific -# information such as project_id, region and zone choice. And also name of -# buckets that are used by the build pipeline and the data processing workflow. -# -# Copyright 2019 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-export TEST='test' -export GCP_PROJECT_ID=$(gcloud config list --format 'value(core.project)') -export PROJECT_NUMBER=$(gcloud projects describe "${GCP_PROJECT_ID}" --format='get(projectNumber)') -export DATAFLOW_JAR_BUCKET_TEST="${GCP_PROJECT_ID}-composer-dataflow-source-${TEST}" -export INPUT_BUCKET_TEST="${GCP_PROJECT_ID}-composer-input-${TEST}" -export RESULT_BUCKET_TEST="${GCP_PROJECT_ID}-composer-result-${TEST}" -export REF_BUCKET_TEST="${GCP_PROJECT_ID}-composer-ref-${TEST}" -export DATAFLOW_STAGING_BUCKET_TEST="${GCP_PROJECT_ID}-dataflow-staging-${TEST}" -export PROD='prod' -export DATAFLOW_JAR_BUCKET_PROD="${GCP_PROJECT_ID}-composer-dataflow-source-${PROD}" -export INPUT_BUCKET_PROD="${GCP_PROJECT_ID}-composer-input-${PROD}" -export RESULT_BUCKET_PROD="${GCP_PROJECT_ID}-composer-result-${PROD}" -export DATAFLOW_STAGING_BUCKET_PROD="${GCP_PROJECT_ID}-dataflow-staging-${PROD}" -export COMPOSER_REGION='us-central1' -export RESULT_BUCKET_REGION="${COMPOSER_REGION}" -export COMPOSER_ZONE_ID='us-central1-a' - -export COMPOSER_ENV_NAME='data-pipeline-composer' -export SOURCE_CODE_REPO='data-pipeline-source' -export COMPOSER_DAG_NAME_TEST='test_word_count' -export COMPOSER_DAG_NAME_PROD='prod_word_count' diff --git a/helpers/check_format.sh b/helpers/check_format.sh new file mode 100755 index 0000000..2f60566 --- /dev/null +++ b/helpers/check_format.sh @@ -0,0 +1,206 @@ +#!/bin/bash + +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script checks the format of various files in the tools/ subfolders +# based on Google open source style guidelines. +# +# The following languages are currently supported: +# - python (using yapf) + +set -e + +# need_formatting - helper function to error out when +# a folder contains files that need formatting +# @args $1 - Folder local path +# @args $2 - List of files in that folder that need formatting +# Exit with error code 1 - always +need_formatting() { + FOLDER=$1 + FILES_TO_LINT=${*:2} + echo "Some files need to be formatted in $FOLDER - FAIL" + echo "$FILES_TO_LINT" + exit 1 +} + +# validate_bash - takes a folder path as input and shell checks files +validate_bash() { + FOLDER=$1 + echo "Validating $FOLDER - Checking bash files" + + FILES_TO_CHECK=$(find "$FOLDER" -type f -name "*.sh") + + # Initialize FILES_TO_LINT to empty string + FILES_TO_LINT="" + + if [[ -n "$FILES_TO_CHECK" ]] + then + for FILE_TO_CHECK in $FILES_TO_CHECK + do + if ! shellcheck "$FILE_TO_CHECK"; + then + FILES_TO_LINT+="$FILE_TO_CHECK " + fi + done + + if [[ -n "$FILES_TO_LINT" ]] + then + need_formatting "$FOLDER" "$FILES_TO_LINT" + fi + else + echo "No bash files found for $FOLDER - SKIP" + fi +} + +# validate_terraform - checks terraform files in terraform/ +validate_terraform() { + FOLDER=$1 + echo "Checking terraform fmt in $FOLDER" + FILES_TO_CHECK=$(find "$FOLDER" -type f -name "*.tf") + + # Initialize FILES_TO_LINT to empty string + FILES_TO_LINT="" + + if [[ -n "$FILES_TO_CHECK" ]] + then + FILES_TO_LINT="" + if ! 
terraform fmt -check "$FOLDER"; + then + FILES_TO_LINT+="$TF_TO_LINT " + need_formatting "$FOLDER" "$FILES_TO_LINT" + fi + else + echo "No terraform files found for $FOLDER - SKIP" + fi +} + +# validate_python - takes a folder path as input and validate python files +# using yapf (supports both python2 and python3) +# errors out if yapf --diff -r --style google returns a non-0 status +validate_python() { + FOLDER=$1 + echo "Validating $FOLDER - Checking python files" + + FILES_TO_CHECK=$(find "$FOLDER" -type f -name "*.py") + + # Initialize FILES_TO_LINT to empty string + FILES_TO_LINT="" + + if [[ -n "$FILES_TO_CHECK" ]] + then + # Checking python files + # python 2 yapf + echo "Testing formatting for python2 files in $FOLDER" + + # Getting the list of files to lint + YAPF_PYTHON2_OUTPUT=$(python2 -m yapf --diff -r --style google "$FILES_TO_CHECK" 2>&1) + YAPF_PYTHON2_STATUS=$? + FILES_TO_LINT+=$( echo "$YAPF_PYTHON2_OUTPUT" | grep -E '^---.*\(original\)$' | awk '{print $2}') + + if [[ -n "$FILES_TO_LINT" ]] + then + # Error out with details + need_formatting "$FOLDER" "$FILES_TO_LINT" + fi + + # Checking python files if python2 failed (i.e not python2 compatible code) + if [[ "$YAPF_PYTHON2_STATUS" -ne 0 ]] + then + # python 3 yapf + echo "Testing formatting for python3 files in $FOLDER" + FILES_TO_LINT+=$(python3 -m yapf --diff -r --style google "$FILES_TO_CHECK" | grep -E '^---.*\(original\)$' | awk '{print $2}') + + if [[ -n "$FILES_TO_LINT" ]] + then + # Error out with details + need_formatting "$FOLDER" "$FILES_TO_LINT" + fi + + if [[ -z "$FILES_TO_LINT" ]] + then + echo "No files need to be formatted in $FOLDER - PASS" + fi + fi + else + echo "No python files found for $FOLDER - SKIP" + fi +} + +# validate_go - takes a folder path as input and validate go files +# using gofmt +# errors out if gofmt returns a non-0 status +validate_go() { + FOLDER=$1 + echo "Validating $FOLDER - Checking GO files" + + FILES_TO_LINT=$(gofmt -l "$FOLDER") + + if [[ -n "$FILES_TO_LINT" ]] + then + # Error out with details + need_formatting "$FOLDER" "$FILES_TO_LINT" + else + echo "No go files need formatting for $FOLDER - SKIP" + fi +} + +# validate_java - takes a folder path as input and validate folder +# using gts +# errors out if gts init or npm audit returns a non-0 status +validate_java(){ + FOLDER=$1 + echo "Validating $FOLDER - Checking java files" + + FILES_TO_CHECK=$(find "$FOLDER" -type f -name "*.java") + + # Initialize FILES_TO_LINT to empty string + FILES_TO_LINT="" + + if [[ -n "$FILES_TO_CHECK" ]] + then + echo "Testing formatting for java files in $FOLDER" + # shellcheck disable=SC2086 + FILES_TO_LINT=$(java -jar "/usr/share/java/google-java-format-1.7-all-deps.jar" --set-exit-if-changed -n $FILES_TO_CHECK) + + if [[ -n "$FILES_TO_LINT" ]] + then + need_formatting "$FOLDER" "$FILES_TO_LINT" + fi + + if [[ -z "$FILES_TO_LINT" ]] + then + echo "No files need to be formatted in $FOLDER - PASS" + fi + else + echo "No java files found for $FOLDER - SKIP" + fi +} + +# temporary list of folders to exclude +EXCLUDE_FOLDERS=$(cat helpers/exclusion_list.txt) +while IFS= read -r -d '' FOLDER +do + if [[ ! ${EXCLUDE_FOLDERS[*]} =~ $FOLDER ]] + then + validate_java "$FOLDER" + validate_python "$FOLDER" + validate_go "$FOLDER" + validate_bash "$FOLDER" + validate_terraform "$FOLDER" + else + echo "$FOLDER in exclusion list - SKIP " + fi +done < <(find . 
-maxdepth 1 -mindepth 1 -type d -print0) +echo "finished checking format" diff --git a/helpers/exclusion_list.txt b/helpers/exclusion_list.txt new file mode 100644 index 0000000..d2de6ef --- /dev/null +++ b/helpers/exclusion_list.txt @@ -0,0 +1,3 @@ +./.git/ +./terraform/.terraform/ +./composer/.venv/ diff --git a/helpers/format.sh b/helpers/format.sh new file mode 100755 index 0000000..06096f2 --- /dev/null +++ b/helpers/format.sh @@ -0,0 +1,70 @@ +#!/bin/bash + +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script formats the various files in this repository based on Google open source +# style guidelines. This script is automatically called when running +# "make fmt" at the root of the repository. +# +# NOTE: The files will be formatted in place. +# +# The following languages are currently supported: +# - python (using yapf) + +# temporary list of folders to exclude +EXCLUDE_FOLDERS=$(cat helpers/exclusion_list.txt) + +while IFS= read -r -d '' FOLDER +do + if [[ ! ${EXCLUDE_FOLDERS[*]} =~ $FOLDER ]] + then + echo "Formatting $FOLDER" + + echo "Formatting python files (if any)" + + FILES_TO_FORMAT=$(find "$FOLDER" -type f -name "*.py") + if [[ -n "$FILES_TO_FORMAT" ]] + then + # format all python files in place for python2 + + + # If python2 failed, try to format using python3 instead + if ! python2 -m yapf -i -r --style google "$FILES_TO_FORMAT" > /dev/null 2>&1 + then + # format all python files in place for python2 + python3 -m yapf -i -r --style google "$FILES_TO_FORMAT" > /dev/null + fi + else + echo "No python files found for $FOLDER - SKIP" + fi + + echo "Formatting go files (if any)" + gofmt -w "$FOLDER" + + echo "Formatting terraform files (if any)" + terraform fmt -r "$FOLDER" + + echo "Formatting java files (if any)" + + FILES_TO_FORMAT=$(find "$FOLDER" -type f -name "*.java") + if [[ -n "$FILES_TO_FORMAT" ]] + then + # format all java files in place + java -jar /usr/share/java/google-java-format-1.7-all-deps.jar -i "$FILES_TO_FORMAT" + else + echo "No java files found for $FOLDER - SKIP" + fi + fi +done < <(find . -maxdepth 1 -mindepth 1 -type d) diff --git a/helpers/init_cloudshell.sh b/helpers/init_cloudshell.sh new file mode 100755 index 0000000..56fdc63 --- /dev/null +++ b/helpers/init_cloudshell.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
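+#
+# This script prepares a Cloud Shell session for this repository: it installs
+# terragrunt v0.23.25 alongside the existing terraform binary and switches the
+# default JVM to Java 8, matching the JDK 8 images used by the Maven builds here.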
+ +set -e + +echo "downloading terragrunt" +INSTALL_DIR=$(command -v terraform | sed s/terraform/terragrunt/g) +wget https://github.com/gruntwork-io/terragrunt/releases/download/v0.23.25/terragrunt_linux_amd64 +mv terragrunt_linux_amd64 "$INSTALL_DIR" +chmod +x "$INSTALL_DIR" +echo "terragrunt install successful!" +terragrunt -version + +echo "resetting to java 8" +update-java-alternatives -s java-1.8.0-openjdk-amd64 && export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre +java -version diff --git a/helpers/init_git_repo.sh b/helpers/init_git_repo.sh new file mode 100755 index 0000000..6555151 --- /dev/null +++ b/helpers/init_git_repo.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +################################################################################ +# Checkout the repo for a PR and add the remote of the target branch # +################################################################################ + +# Fetches master branch from GitHub and "resets" local changes to be relative to it, +# so we can diff what changed relatively to master branch. +git init +git config user.email "ia-tests@presubmit.example.com" +git config user.name "ia-tests" + +git commit -m "empty commit" +git remote add origin "${BASE_REPO_URL}" +git fetch origin master + +# Fetch all PRs to get history for PRs created from forked repos +git fetch origin +refs/pull/*/merge:refs/remotes/origin/pr/* + +git reset --hard "origin/pr/${PR_NUMBER}" + +if ! git rebase "origin/${BASE_BRANCH}" +then + exit 1 +fi + +echo "successfully rebased PR #${PR_NUMBER} on master" diff --git a/helpers/run_relevant_cloudbuilds.sh b/helpers/run_relevant_cloudbuilds.sh new file mode 100755 index 0000000..6072582 --- /dev/null +++ b/helpers/run_relevant_cloudbuilds.sh @@ -0,0 +1,106 @@ +#!/bin/bash + +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +################################################################################ +# Construct and submit a dynamic cloudbuild yaml file to run the nested # +# cloud builds for directories containing changes according to git diff master.# +# # +# Arguments: # +# $1 - file to search for (e.g. 
cloudbuild.yaml or precommit_cloudbuild.yaml) # +# all subsequent args will be passed to gcloud builds submit commands # +# https://cloud.google.com/sdk/gcloud/reference/builds/submit # +# # +# Example usage: # +# ./run_relevant_cloudbuilds.sh precommit_cloudbuild.yaml # +################################################################################ + +set -e + +COMMIT_SHA=$(git rev-parse HEAD) + +# list of changed files. +DIFF=$(git diff --name-only origin/master) + +# Temporary file to define a dynamic cloud build based on changed files. +PRE_COMMIT_BUILD=relevant-cloudbuilds-for-${COMMIT_SHA}.yaml + +# get a list of dirs containin cloud builds and the list of passed files. +# $1 - cloudbuild filename to search for +# $2 - list of files containing changes +function find_relevant_cloud_build_dirs(){ + DIRS_WITH_COULD_BUILD_PATTERN="(^$(find . -type f -path "./*/$1" -printf '%h|' | sed s#\\./##g | sed s/\|$//g))" + echo "$2" | grep -oP "$DIRS_WITH_COULD_BUILD_PATTERN" | sort | uniq +} +# utility for adding a line to the working build file +function append_to_build(){ + echo "$1" >> "$PRE_COMMIT_BUILD" +} +# initializes a cloud build file +function init_build() { + touch "$PRE_COMMIT_BUILD" + append_to_build "steps:" +} +# loop through the diff and add a step to run each relevant nested cloud build. +# $1 - cloudbuild file to look for +# $2 - additional arguments for gcloud builds submit +function construct_build(){ + for DIR in $DIRS_WITH_DIFF_AND_BUILD + do + append_to_build '- name: google/cloud-sdk' + append_to_build " entrypoint: 'gcloud'" + if [ -z "$2" ] + then + append_to_build " args: ['builds', 'submit', '.' , '--config=$DIR/$1']" + else + append_to_build " args: ['builds', 'submit', '.' , '--config=$DIR/$1', '$2']" + fi + append_to_build " waitFor: ['-']" # run nested builds in parallel + append_to_build " id: '$DIR'" + done + # beef up resources for parallelizaiton + append_to_build "options:" + append_to_build " machineType: 'N1_HIGHCPU_8'" +} +# run the cloud build created in this script +function run() { + echo "running relevant pre-commits for $COMMIT_SHA" + cat "$PRE_COMMIT_BUILD" + gcloud builds submit . --config="$PRE_COMMIT_BUILD" + BUILD_STATUS=$? + # clean up + rm "$PRE_COMMIT_BUILD" + exit $BUILD_STATUS +} + +function main(){ + FILENAME="$1" + CLOUD_BUILD_EXTRA_ARGS="${*:2}" + echo "${CLOUD_BUILD_EXTRA_ARGS}" + DIRS_WITH_DIFF_AND_BUILD=$(find_relevant_cloud_build_dirs "$FILENAME" "$DIFF") + # If there are no cloudbuilds in dirs with diff we should not fail. + if [ -z "$DIRS_WITH_DIFF_AND_BUILD" ] + then + echo "no cloudbuilds to run." + exit 0 + else + init_build + construct_build "$FILENAME" "${CLOUD_BUILD_EXTRA_ARGS[*]}" + run + fi + echo "all relevant cloudbuilds succeeded!" +} + +main "$@" \ No newline at end of file diff --git a/source-code/build-pipeline/wait_for_dag_deployed.sh b/helpers/run_tests.sh old mode 100644 new mode 100755 similarity index 57% rename from source-code/build-pipeline/wait_for_dag_deployed.sh rename to helpers/run_tests.sh index 93c7184..b2f0798 --- a/source-code/build-pipeline/wait_for_dag_deployed.sh +++ b/helpers/run_tests.sh @@ -1,14 +1,12 @@ #!/bin/bash -# -# Script that waits for the specified Cloud Composer DAG to deploy. -# -# Copyright 2019 Google Inc. + +# Copyright 2019 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # -# https://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -16,14 +14,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -n=0 -until [[ $n -ge $4 ]] -do - status=0 - gcloud composer environments run "${1}" --location "${2}" list_dags \ - 2>&1 >/dev/null | grep "${3}" && break - status=$? - n=$(($n+1)) - sleep "${5}" -done -exit $status +set -e + +echo "running deploydags go tests..." +if ! (cd ./composer/cloudbuild/go/dagsdeployer/internal/ && go vet ./... && go test ./...); +then + echo "go tests for dags deployer failed" + exit 1 +fi + +echo "running dataflow java tests..." +find ./dataflow/java/ -name pom.xml -execdir mvn test \; diff --git a/license-templates/LICENSE.txt b/license-templates/LICENSE.txt new file mode 100644 index 0000000..7748fc1 --- /dev/null +++ b/license-templates/LICENSE.txt @@ -0,0 +1,13 @@ +Copyright 2019 Google Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/precommit_cloudbuild.yaml b/precommit_cloudbuild.yaml new file mode 100644 index 0000000..81d921d --- /dev/null +++ b/precommit_cloudbuild.yaml @@ -0,0 +1,36 @@ +# Copyright 2019 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +steps: + - name: 'gcr.io/cloud-builders/git' + entrypoint: 'bash' + args: [ + './helpers/init_git_repo.sh', + ] + env: [ + 'COMMIT_SHA=${COMMIT_SHA}', + 'BASE_REPO_URL=https://github.com/jaketf/ci-cd-for-data-processing-workflow.git', + 'BASE_BRANCH=${_BASE_BRANCH}', + 'PR_NUMBER=${_PR_NUMBER}', + ] + id: 'checkout-pr-branch' + # Run linters + - name: 'gcr.io/${PROJECT_ID}/make' + args: ['test'] + waitFor: ['checkout-pr-branch'] + id: 'run-style-and-unit-tests' + - name: 'google/cloud-sdk' + waitFor: ['checkout-pr-branch'] + entrypoint: 'bash' + args: ['./helpers/run_relevant_cloudbuilds.sh', 'precommit_cloudbuild.yaml'] + id: 'run-relevant-cloud-builds' diff --git a/source-code/build-pipeline/build_deploy_test.yaml b/source-code/build-pipeline/build_deploy_test.yaml deleted file mode 100644 index e28e6bb..0000000 --- a/source-code/build-pipeline/build_deploy_test.yaml +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright 2019 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -steps: -- name: gcr.io/cloud-builders/git - args: ['clone', 'https://source.developers.google.com/p/$PROJECT_ID/r/$REPO_NAME'] - id: 'check-out-source-code' -- name: gcr.io/cloud-builders/mvn - args: ['package', '-q'] - dir: '$REPO_NAME/data-processing-code' - id: 'build-jar' -- name: gcr.io/cloud-builders/gsutil - args: ['cp', '*bundled*.jar', 'gs://${_DATAFLOW_JAR_BUCKET}/dataflow_deployment_$BUILD_ID.jar'] - dir: '$REPO_NAME/data-processing-code/target' - id: 'deploy-jar' -- name: apache/airflow:master - entrypoint: 'python' - args: ['test_compare_xcom_maps.py'] - dir: '$REPO_NAME/workflow-dag' - id: 'unit-test-on-operator-code' -- name: gcr.io/cloud-builders/gsutil - args: ['cp', 'support-files/input.txt', 'gs://${_COMPOSER_INPUT_BUCKET}'] - dir: '$REPO_NAME/workflow-dag' - id: 'deploy-test-input-file' -- name: gcr.io/cloud-builders/gsutil - args: ['cp', 'support-files/ref.txt', 'gs://${_COMPOSER_REF_BUCKET}'] - dir: '$REPO_NAME/workflow-dag' - id: 'deploy-test-ref-file' -- name: gcr.io/cloud-builders/gcloud - args: ['composer', 'environments', 'run', '${_COMPOSER_ENV_NAME}', '--location', '${_COMPOSER_REGION}','variables', '--', '--set', 'dataflow_jar_file_test', 'dataflow_deployment_$BUILD_ID.jar'] - id: 'set-composer-jar-ref' -- name: gcr.io/cloud-builders/gsutil - args: ['cp', 'compare_xcom_maps.py', '${_COMPOSER_DAG_BUCKET}'] - dir: '$REPO_NAME/workflow-dag' - id: 'deploy-custom-operator' -- name: gcr.io/cloud-builders/gsutil - args: ['cp', 'data-pipeline-test.py', '${_COMPOSER_DAG_BUCKET}'] - dir: '$REPO_NAME/workflow-dag' - id: 'deploy-processing-pipeline' -- name: gcr.io/cloud-builders/gcloud - entrypoint: 'bash' - args: ['wait_for_dag_deployed.sh', '${_COMPOSER_ENV_NAME}', '${_COMPOSER_REGION}', '${_COMPOSER_DAG_NAME_TEST}', '6', '20'] - dir: '$REPO_NAME/build-pipeline' - id: 'wait-for-dag-deployed-on-composer' -- name: gcr.io/cloud-builders/gcloud - args: ['composer', 'environments', 'run', '${_COMPOSER_ENV_NAME}', '--location', '${_COMPOSER_REGION}', 'trigger_dag', '--', '${_COMPOSER_DAG_NAME_TEST}', '--run_id=$BUILD_ID'] - id: 'trigger-pipeline-execution' diff --git a/source-code/build-pipeline/deploy_prod.yaml b/source-code/build-pipeline/deploy_prod.yaml deleted file mode 100644 index fc43e4f..0000000 --- a/source-code/build-pipeline/deploy_prod.yaml +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright 2019 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-steps: -- name: gcr.io/cloud-builders/gsutil - args: ['cp', 'gs://${_DATAFLOW_JAR_BUCKET_TEST}/${_DATAFLOW_JAR_FILE_LATEST}', 'gs://${_DATAFLOW_JAR_BUCKET_PROD}/dataflow_deployment_$BUILD_ID.jar'] - id: 'deploy-jar-to-prod' -- name: gcr.io/cloud-builders/git - args: ['clone', 'https://source.developers.google.com/p/$PROJECT_ID/r/$REPO_NAME'] - id: 'check-out-source-code' -- name: gcr.io/cloud-builders/gsutil - args: ['cp', 'support-files/input.txt', 'gs://${_COMPOSER_INPUT_BUCKET}'] - dir: '$REPO_NAME/workflow-dag' - id: 'deploy-input-file' -- name: gcr.io/cloud-builders/gcloud - args: ['composer', 'environments', 'run', '${_COMPOSER_ENV_NAME}', '--location', '${_COMPOSER_REGION}','variables', '--', '--set', 'dataflow_jar_file_prod', 'dataflow_deployment_$BUILD_ID.jar'] - id: 'set-composer-jar-ref' -- name: gcr.io/cloud-builders/gsutil - args: ['cp', 'data-pipeline-prod.py', '${_COMPOSER_DAG_BUCKET}'] - dir: '$REPO_NAME/workflow-dag' - id: 'deploy-processing-pipeline' -- name: gcr.io/cloud-builders/gcloud - entrypoint: 'bash' - args: ['wait_for_dag_deployed.sh', '${_COMPOSER_ENV_NAME}', '${_COMPOSER_REGION}', '${_COMPOSER_DAG_NAME_PROD}', '6', '20'] - dir: '$REPO_NAME/build-pipeline' - id: 'wait-for-dag-deployed-on-composer' diff --git a/source-code/data-processing-code/src/main/java/org/apache/beam/examples/WordCount.java b/source-code/data-processing-code/src/main/java/org/apache/beam/examples/WordCount.java deleted file mode 100644 index afcd838..0000000 --- a/source-code/data-processing-code/src/main/java/org/apache/beam/examples/WordCount.java +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright 2019 Google Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.examples; - -import org.apache.beam.sdk.Pipeline; -import org.apache.beam.sdk.io.TextIO; -import org.apache.beam.sdk.metrics.Counter; -import org.apache.beam.sdk.metrics.Distribution; -import org.apache.beam.sdk.metrics.Metrics; -import org.apache.beam.sdk.options.Default; -import org.apache.beam.sdk.options.Description; -import org.apache.beam.sdk.options.PipelineOptions; -import org.apache.beam.sdk.options.PipelineOptionsFactory; -import org.apache.beam.sdk.options.Validation.Required; -import org.apache.beam.sdk.transforms.Count; -import org.apache.beam.sdk.transforms.DoFn; -import org.apache.beam.sdk.transforms.MapElements; -import org.apache.beam.sdk.transforms.PTransform; -import org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.transforms.SimpleFunction; -import org.apache.beam.sdk.values.KV; -import org.apache.beam.sdk.values.PCollection; - -/** - * An example that counts words in Shakespeare and includes Beam best practices. - * - *

This class, {@link WordCount}, is the second in a series of four successively more detailed - * 'word count' examples. You may first want to take a look at {@link MinimalWordCount}. After - * you've looked at this example, then see the {@link DebuggingWordCount} pipeline, for introduction - * of additional concepts. - * - *

For a detailed walkthrough of this example, see - * https://beam.apache.org/get-started/wordcount-example/ - * - *

Basic concepts, also in the MinimalWordCount example: Reading text files; counting a - * PCollection; writing to text files - * - *

New Concepts: - * - *

- *   1. Executing a Pipeline both locally and using the selected runner
- *   2. Using ParDo with static DoFns defined out-of-line
- *   3. Building a composite transform
- *   4. Defining your own pipeline options
- * 
- * - *

Concept #1: you can execute this pipeline either locally or using by selecting another runner. - * These are now command-line options and not hard-coded as they were in the MinimalWordCount - * example. - * - *

To change the runner, specify: - * - *

{@code
- * --runner=YOUR_SELECTED_RUNNER
- * }
- * - *

To execute this pipeline, specify a local output file (if using the {@code DirectRunner}) or - * output prefix on a supported distributed file system. - * - *

{@code
- * --output=[YOUR_LOCAL_FILE | YOUR_OUTPUT_PREFIX]
- * }
- * - *

The input file defaults to a public data set containing the text of of King Lear, by William - * Shakespeare. You can override it and choose your own input with {@code --inputFile}. - */ -public class WordCount { - - /** - * Concept #2: You can make your pipeline assembly code less verbose by defining your DoFns - * statically out-of-line. This DoFn tokenizes lines of text into individual words; we pass it to - * a ParDo in the pipeline. - */ - static class ExtractWordsFn extends DoFn { - private final Counter emptyLines = Metrics.counter(ExtractWordsFn.class, "emptyLines"); - private final Distribution lineLenDist = - Metrics.distribution(ExtractWordsFn.class, "lineLenDistro"); - private static final String TOKENIZER_PATTERN = "[^\\p{L}]+"; - - @ProcessElement - public void processElement(@Element String element, OutputReceiver receiver) { - lineLenDist.update(element.length()); - if (element.trim().isEmpty()) { - emptyLines.inc(); - } - - // Split the line into words. - String[] words = element.split(TOKENIZER_PATTERN, -1); - - // Output each word encountered into the output PCollection. - for (String word: words) { - if (!word.isEmpty()) { - receiver.output(word); - } - } - } - } - - /** A SimpleFunction that converts a Word and Count into a printable string. */ - public static class FormatAsTextFn extends SimpleFunction, String> { - @Override - public String apply(KV input) { - return input.getKey() + ": " + input.getValue(); - } - } - - /** - * A PTransform that converts a PCollection containing lines of text into a PCollection of - * formatted word counts. - * - *

Concept #3: This is a custom composite transform that bundles two transforms (ParDo and - * Count) as a reusable PTransform subclass. Using composite transforms allows for easy reuse, - * modular testing, and an improved monitoring experience. - */ - public static class CountWords - extends PTransform, PCollection>> { - @Override - public PCollection>expand(PCollection lines) { - - // Convert lines of text into individual words. - PCollection words = lines.apply(ParDo.of(new ExtractWordsFn())); - - // Count the number of times each word occurs. - PCollection> wordCounts = words.apply(Count.perElement()); - - return wordCounts; - } - } - - /** - * Options supported by {@link WordCount}. - * - *

Concept #4: Defining your own configuration options. Here, you can add your own arguments to - * be processed by the command-line parser, and specify default values for them. You can then - * access the options values in your pipeline code. - * - *

Inherits standard configuration options. - */ - public interface WordCountOptions extends PipelineOptions { - - /** - * By default, this example reads from a public dataset containing the text of King Lear. Set - * this option to choose a different input file or glob. - */ - @Description("Path of the file to read from") - @Default.String("gs://apache-beam-samples/shakespeare/kinglear.txt") - String getInputFile(); - - void setInputFile(String value); - - /** Set this required option to specify where to write the output. */ - @Description("Path of the file to write to") - @Required - String getOutput(); - - void setOutput(String value); - } - - static void runWordCount(WordCountOptions options) { - Pipeline p = Pipeline.create(options); - - // Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the - // static FormatAsTextFn() to the ParDo transform. - p.apply("ReadLines", TextIO.read().from(options.getInputFile())) - .apply(new CountWords()) - .apply(MapElements.via(new FormatAsTextFn())) - .apply("WriteCounts", TextIO.write().to(options.getOutput())); - - p.run().waitUntilFinish(); - } - - public static void main(String[] args) { - WordCountOptions options = - PipelineOptionsFactory.fromArgs(args).withValidation().as(WordCountOptions.class); - runWordCount(options); - } -} diff --git a/source-code/workflow-dag/compare_xcom_maps.py b/source-code/workflow-dag/compare_xcom_maps.py deleted file mode 100644 index 9ad28e8..0000000 --- a/source-code/workflow-dag/compare_xcom_maps.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright 2019 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Custom operator that compares dictionaries in xcom. -""" - -from airflow.models import BaseOperator -from airflow.utils.decorators import apply_defaults - - -class CompareXComMapsOperator(BaseOperator): - """Compare dictionary stored in xcom. 
- - Args: - ref_task_ids: list of task ids from where the reference dictionary - is fetched - res_task_ids: list of task ids from where the comparing dictionary - is fetched - """ - - @apply_defaults - def __init__( - self, - ref_task_ids, - res_task_ids, - *args, **kwargs): - super(CompareXComMapsOperator, self).__init__(*args, **kwargs) - self.ref_task_ids = ref_task_ids - self.res_task_ids = res_task_ids - - def execute(self, context): - ref_obj = self.read_value_as_obj(self.ref_task_ids, context) - res_obj = self.read_value_as_obj(self.res_task_ids, context) - self.compare_obj(ref_obj, res_obj) - return 'result contains the expected values' - - def read_value_as_obj(self, task_ids, context): - ret_obj = {} - for task_id in task_ids: - value_str = context['ti'].xcom_pull( - key=None, - task_ids=task_id) - self.parse_str_obj(value_str, ret_obj) - return ret_obj - - def parse_str_obj(self, str_rep, obj): - entries = str_rep.split('\n') - for entry in entries: - if entry: - key, value = entry.split(': ') - obj[key] = value - - def compare_obj(self, ref_obj, res_obj): - if ref_obj != res_obj: - raise ValueError(self.create_diff_str(ref_obj, res_obj)) - - def create_diff_str(self, ref_obj, res_obj): - msg = 'The result differs from the expected in the following ways:' - for k in ref_obj: - if k not in res_obj: - msg = msg + ('\nmissing key: %s in result' % k) - elif ref_obj[k] != res_obj[k]: - msg = msg + ('\nexpected %s: %s but got %s: %s' % ( - k, ref_obj[k], k, res_obj[k])) - for k in res_obj: - if k not in ref_obj: - msg = msg + ('\nunexpected key: %s in result' % k) - return msg diff --git a/source-code/workflow-dag/data-pipeline-prod.py b/source-code/workflow-dag/data-pipeline-prod.py deleted file mode 100644 index 1f0f993..0000000 --- a/source-code/workflow-dag/data-pipeline-prod.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright 2019 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Data processing production workflow definition. 
-""" -import datetime -from airflow import models -from airflow.contrib.operators.dataflow_operator import DataFlowJavaOperator - -dataflow_staging_bucket = 'gs://%s/staging' % ( - models.Variable.get('dataflow_staging_bucket_prod')) - -dataflow_jar_location = 'gs://%s/%s' % ( - models.Variable.get('dataflow_jar_location_prod'), - models.Variable.get('dataflow_jar_file_prod')) - -project = models.Variable.get('gcp_project') -region = models.Variable.get('gcp_region') -zone = models.Variable.get('gcp_zone') -input_bucket = 'gs://' + models.Variable.get('gcs_input_bucket_prod') -output_bucket_name = models.Variable.get('gcs_output_bucket_prod') -output_bucket = 'gs://' + output_bucket_name -output_prefix = 'output' -download_task_prefix = 'download_result' - -yesterday = datetime.datetime.combine( - datetime.datetime.today() - datetime.timedelta(1), - datetime.datetime.min.time()) - -default_args = { - 'dataflow_default_options': { - 'project': project, - 'zone': zone, - 'region': region, - 'stagingLocation': dataflow_staging_bucket - } -} - -with models.DAG( - 'prod_word_count', - schedule_interval=None, - default_args=default_args) as dag: - dataflow_execution = DataFlowJavaOperator( - task_id='wordcount-run', - jar=dataflow_jar_location, - start_date=yesterday, - options={ - 'autoscalingAlgorithm': 'THROUGHPUT_BASED', - 'maxNumWorkers': '3', - 'inputFile': input_bucket+'/input.txt', - 'output': output_bucket+'/'+output_prefix - } - ) diff --git a/source-code/workflow-dag/data-pipeline-test.py b/source-code/workflow-dag/data-pipeline-test.py deleted file mode 100644 index 17da7d2..0000000 --- a/source-code/workflow-dag/data-pipeline-test.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright 2019 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Data processing test workflow definition. 
-""" -import datetime -from airflow import models -from airflow.contrib.operators.dataflow_operator import DataFlowJavaOperator -from airflow.contrib.operators.gcs_download_operator import GoogleCloudStorageDownloadOperator -from compare_xcom_maps import CompareXComMapsOperator - -dataflow_staging_bucket = 'gs://%s/staging' % ( - models.Variable.get('dataflow_staging_bucket_test')) - -dataflow_jar_location = 'gs://%s/%s' % ( - models.Variable.get('dataflow_jar_location_test'), - models.Variable.get('dataflow_jar_file_test')) - -project = models.Variable.get('gcp_project') -region = models.Variable.get('gcp_region') -zone = models.Variable.get('gcp_zone') -input_bucket = 'gs://' + models.Variable.get('gcs_input_bucket_test') -output_bucket_name = models.Variable.get('gcs_output_bucket_test') -output_bucket = 'gs://' + output_bucket_name -ref_bucket = models.Variable.get('gcs_ref_bucket_test') -output_prefix = 'output' -download_task_prefix = 'download_result' - -yesterday = datetime.datetime.combine( - datetime.datetime.today() - datetime.timedelta(1), - datetime.datetime.min.time()) - -default_args = { - 'dataflow_default_options': { - 'project': project, - 'zone': zone, - 'region': region, - 'stagingLocation': dataflow_staging_bucket - } -} - -with models.DAG( - 'test_word_count', - schedule_interval=None, - default_args=default_args) as dag: - dataflow_execution = DataFlowJavaOperator( - task_id='wordcount-run', - jar=dataflow_jar_location, - start_date=yesterday, - options={ - 'autoscalingAlgorithm': 'THROUGHPUT_BASED', - 'maxNumWorkers': '3', - 'inputFile': input_bucket+'/input.txt', - 'output': output_bucket+'/'+output_prefix - } - ) - download_expected = GoogleCloudStorageDownloadOperator( - task_id='download_ref_string', - bucket=ref_bucket, - object='ref.txt', - store_to_xcom_key='ref_str', - start_date=yesterday - ) - download_result_one = GoogleCloudStorageDownloadOperator( - task_id=download_task_prefix+'_1', - bucket=output_bucket_name, - object=output_prefix+'-00000-of-00003', - store_to_xcom_key='res_str_1', - start_date=yesterday - ) - download_result_two = GoogleCloudStorageDownloadOperator( - task_id=download_task_prefix+'_2', - bucket=output_bucket_name, - object=output_prefix+'-00001-of-00003', - store_to_xcom_key='res_str_2', - start_date=yesterday - ) - download_result_three = GoogleCloudStorageDownloadOperator( - task_id=download_task_prefix+'_3', - bucket=output_bucket_name, - object=output_prefix+'-00002-of-00003', - store_to_xcom_key='res_str_3', - start_date=yesterday - ) - compare_result = CompareXComMapsOperator( - task_id='do_comparison', - ref_task_ids=['download_ref_string'], - res_task_ids=[download_task_prefix+'_1', - download_task_prefix+'_2', - download_task_prefix+'_3'], - start_date=yesterday - ) - - dataflow_execution >> download_result_one - dataflow_execution >> download_result_two - dataflow_execution >> download_result_three - - download_expected >> compare_result - download_result_one >> compare_result - download_result_two >> compare_result - download_result_three >> compare_result diff --git a/source-code/workflow-dag/test_compare_xcom_maps.py b/source-code/workflow-dag/test_compare_xcom_maps.py deleted file mode 100644 index 9da06c2..0000000 --- a/source-code/workflow-dag/test_compare_xcom_maps.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright 2019 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Unit test of the CompareXComMapsOperator. -""" -import unittest -from compare_xcom_maps import CompareXComMapsOperator -import mock - -TASK_ID = 'test_compare_task_id' -REF_TASK_ID = 'download_ref_string' -DOWNLOAD_TASK_PREFIX = 'download_result' -CONTEXT_CLASS_NAME = 'airflow.ti_deps.dep_context' -ERROR_LINE_ONE = 'The result differs from the expected in the following ways:\n' - - -def generate_mock_function(first_value, second_value, third_value): - def mock_function(**kwargs): - return { - REF_TASK_ID: 'a: 1\nb: 2\nc: 3', - DOWNLOAD_TASK_PREFIX+'_1': first_value, - DOWNLOAD_TASK_PREFIX+'_2': second_value, - DOWNLOAD_TASK_PREFIX+'_3': third_value - }[kwargs['task_ids']] - return mock_function - - -def equal_mock(): - return generate_mock_function('c: 3', 'b: 2', 'a: 1') - - -def missing_value_mock(): - return generate_mock_function('b: 2', 'a: 1', 'b: 2') - - -def wrong_value_mock(): - return generate_mock_function('a: 1', 'b: 4', 'c: 3') - - -def unexpected_value_mock(): - return generate_mock_function('a: 1', 'c: 3\nd: 4', 'b: 2') - - -class CompareXComMapsOperatorTest(unittest.TestCase): - - def setUp(self): - super(CompareXComMapsOperatorTest, self).setUp() - self.xcom_compare = CompareXComMapsOperator( - task_id=TASK_ID, - ref_task_ids=[REF_TASK_ID], - res_task_ids=[DOWNLOAD_TASK_PREFIX+'_1', - DOWNLOAD_TASK_PREFIX+'_2', - DOWNLOAD_TASK_PREFIX+'_3']) - - def test_init(self): - self.assertEqual(self.xcom_compare.task_id, TASK_ID) - self.assertListEqual(self.xcom_compare.ref_task_ids, [REF_TASK_ID]) - self.assertListEqual(self.xcom_compare.res_task_ids, - [DOWNLOAD_TASK_PREFIX+'_1', - DOWNLOAD_TASK_PREFIX+'_2', - DOWNLOAD_TASK_PREFIX+'_3']) - - def assertRaisesWithMessage(self, error_type, msg, func, *args, **kwargs): - with self.assertRaises(error_type) as context: - func(*args, **kwargs) - self.assertEqual(msg, str(context.exception)) - - def execute_value_error(self, mock_func, error_expect_tr): - with mock.patch(CONTEXT_CLASS_NAME) as context_mock: - context_mock['ti'].xcom_pull = mock_func - self.assertRaisesWithMessage( - ValueError, - error_expect_tr, - self.xcom_compare.execute, context_mock) - - def test_equal(self): - with mock.patch(CONTEXT_CLASS_NAME) as context_mock: - context_mock['ti'].xcom_pull = equal_mock() - self.xcom_compare.execute(context_mock) - - def test_missing_value(self): - self.execute_value_error( - missing_value_mock(), - '{}{}'.format(ERROR_LINE_ONE, 'missing key: c in result')) - - def test_wrong_value(self): - self.execute_value_error( - wrong_value_mock(), - '{}{}'.format(ERROR_LINE_ONE, 'expected b: 2 but got b: 4')) - - def test_unexpected_value(self): - self.execute_value_error( - unexpected_value_mock(), - '{}{}'.format(ERROR_LINE_ONE, 'unexpected key: d in result')) - -suite = unittest.TestLoader().loadTestsFromTestCase(CompareXComMapsOperatorTest) -unittest.TextTestRunner(verbosity=2).run(suite) diff --git a/terraform/.gitignore b/terraform/.gitignore new file mode 100644 index 0000000..9eb783f --- /dev/null +++ b/terraform/.gitignore @@ -0,0 +1,44 @@ +# Terragrunt +.terragrunt-cache/ + +# OSX leaves these everywhere on SMB shares 
+._* + +# OSX trash +.DS_Store + +# Python +*.pyc + +# Emacs save files +*~ +\#*\# +.\#* + +# Vim-related files +[._]*.s[a-w][a-z] +[._]s[a-w][a-z] +*.un~ +Session.vim +.netrwhist + +### https://raw.github.com/github/gitignore/90f149de451a5433aebd94d02d11b0e28843a1af/Terraform.gitignore + +# Local .terraform directories +**/.terraform/* + +# .tfstate files +*.tfstate +*.tfstate.* + +# Crash log files +crash.log + +# Kitchen files +**/inspec.lock +**/.kitchen +**/kitchen.local.yml +**/Gemfile.lock + + +credentials.json diff --git a/terraform/README.md b/terraform/README.md new file mode 100644 index 0000000..3251d08 --- /dev/null +++ b/terraform/README.md @@ -0,0 +1,9 @@ +# Datapipelines CICD terraform IaC +This defines terraform for the ci, artifacts and prod projects. +For more details on the inputs and outputs look at the READMEs in +the artifacts and datapipelines-infra modules. + +## Running +``` +terragrunt apply-all +``` diff --git a/terraform/artifacts/README.md b/terraform/artifacts/README.md new file mode 100644 index 0000000..5f9ccb7 --- /dev/null +++ b/terraform/artifacts/README.md @@ -0,0 +1,29 @@ +# Artifacts and Cloud Build +The terraform in this dir handles the infrastructure for building and storing +artifacts that are built in the CI environment and used in the production environment. +## Requirements + +No requirements. + +## Providers + +| Name | Version | +|------|---------| +| google | n/a | +| google-beta | n/a | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| ci\_composer\_env | n/a | `string` | `""` | no | +| ci\_project | Continuous Integration Project which pushes artifacts | `any` | n/a | yes | +| prod\_project | Production project which pulls artifacts | `any` | n/a | yes | +| project\_id | Project ID for your GCP project to store artifacts | `any` | n/a | yes | + +## Outputs + +| Name | Description | +|------|-------------| +| dataflow\_artifacts\_bucket | n/a | + diff --git a/terraform/artifacts/backend.tf b/terraform/artifacts/backend.tf new file mode 100644 index 0000000..3af586a --- /dev/null +++ b/terraform/artifacts/backend.tf @@ -0,0 +1,7 @@ +# Generated by Terragrunt. Sig: nIlQXj57tbuaRZEa +terraform { + backend "gcs" { + bucket = "datapipelines-terraform-state" + prefix = "terraform_state/artifacts/terraform.tfstate" + } +} diff --git a/terraform/artifacts/main.tf b/terraform/artifacts/main.tf new file mode 100644 index 0000000..4a89de2 --- /dev/null +++ b/terraform/artifacts/main.tf @@ -0,0 +1,130 @@ +# Copyright 2019 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +module "project-services" { + source = "terraform-google-modules/project-factory/google//modules/project_services" + version = "7.1.0" + + project_id = var.project_id + + activate_apis = [ + "compute.googleapis.com", + "cloudbuild.googleapis.com", + "sourcerepo.googleapis.com", + "artifactregistry.googleapis.com", + "containerregistry.googleapis.com", + "containerscanning.googleapis.com", + "storage-component.googleapis.com", + "storage-api.googleapis.com", + "pubsub.googleapis.com", + "stackdriver.googleapis.com", + ] +} + +module "artifacts-buckets" { + source = "terraform-google-modules/cloud-storage/google" + version = "~> 1.6" + project_id = var.project_id + location = "US" + names = ["dataflow"] + prefix = var.project_id + set_admin_roles = true + admins = [""] + versioning = { + first = true + } +} + +resource "google_cloudbuild_trigger" "ci-pre-commit-trigger" { + provider = google-beta + description = "Datapipelines Pre Commit" + project = var.ci_project + + github { + owner = "jaketf" + name = "ci-cd-for-data-processing-workflow" + pull_request { + branch = ".*" + } + } + + filename = "precommit_cloudbuild.yaml" +} + +resource "google_cloudbuild_trigger" "ci-post-commit-trigger" { + provider = google-beta + description = "Data Pipelines Post Commit" + project = var.ci_project + + github { + owner = "jaketf" + name = "ci-cd-for-data-processing-workflow" + pull_request { + branch = ".*" + comment_control = "COMMENTS_ENABLED" + } + } + + substitutions = { + _COMPOSER_ENV_NAME = var.ci_composer_env + _COMPOSER_REGION = var.ci_composer_region + _DATAFLOW_JAR_BUCKET = var.dataflow_jars_bucket + _DATAFLOW_STAGING_BUCKET = "${var.ci_project}-us-dataflow_staging" + _COMPOSER_DAG_BUCKET = var.ci_composer_dags_bucket + _WORDCOUNT_INPUT_BUCKET = "${var.ci_project}-us-wordcount_input" + _WORDCOUNT_RESULT_BUCKET = "${var.ci_project}-us-wordcount_result" + _WORDCOUNT_REF_BUCKET = "${var.ci_project}-us-wordcount_ref" + _ARTIFACTS_PROJECT_ID = var.project_id + _DATAFLOW_ARTIFACTS_BUCKET = module.artifacts-buckets.names_list[0] + } + + filename = "cloudbuild.yaml" +} + +resource "google_project_iam_member" "ci-cloudbuild-composer-user" { + project = var.ci_project + role = "roles/composer.user" + member = "serviceAccount:${data.google_project.ci.number}@cloudbuild.gserviceaccount.com" +} + +resource "google_project_iam_member" "ci-cloudbuild-containers-developer" { + project = var.ci_project + role = "roles/container.admin" + member = "serviceAccount:${data.google_project.ci.number}@cloudbuild.gserviceaccount.com" +} + +resource "google_project_iam_member" "ci-cloudbuild-artifact-admin" { + project = var.project_id + role = "roles/storage.admin" + member = "serviceAccount:${var.push_sa}" +} + +resource "google_project_iam_member" "cloudbuild-artifact-reader" { + project = var.project_id + role = "roles/storage.objectViewer" + member = "serviceAccount:${data.google_project.ci.number}@cloudbuild.gserviceaccount.com" +} + +data google_project "ci" { + project_id = var.ci_project +} + +data google_project "artifacts" { + project_id = var.project_id +} + +data google_project "prod" { + project_id = var.project_id +} + diff --git a/terraform/artifacts/outputs.tf b/terraform/artifacts/outputs.tf new file mode 100644 index 0000000..198f2f6 --- /dev/null +++ b/terraform/artifacts/outputs.tf @@ -0,0 +1,4 @@ +output "dataflow_artifacts_bucket" { + value = module.artifacts-buckets.buckets[0] +} + diff --git a/terraform/artifacts/terragrunt.hcl b/terraform/artifacts/terragrunt.hcl new file mode 100644 index 
0000000..0a32d86 --- /dev/null +++ b/terraform/artifacts/terragrunt.hcl @@ -0,0 +1,34 @@ +/** + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +include { + path = find_in_parent_folders() +} + +dependency "ci" { + config_path = "../ci" +} + +inputs = { + project_id = "datapipelines-artifacts" + ci_project = trimprefix(dependency.ci.outputs.project.id,"projects/") + ci_composer_env = dependency.ci.outputs.composer-env-name + ci_composer_region = dependency.ci.outputs.composer-region + ci_composer_dags_bucket = dependency.ci.outputs.composer-dags-bucket + dataflow_jars_bucket = dependency.ci.outputs.dataflow-jars-bucket + dataflow_staging_bucket = dependency.ci.outputs.dataflow-staging-bucket + push_sa = dependency.ci.outputs.cloudbuild-sa +} diff --git a/terraform/artifacts/variables.tf b/terraform/artifacts/variables.tf new file mode 100644 index 0000000..d485fce --- /dev/null +++ b/terraform/artifacts/variables.tf @@ -0,0 +1,31 @@ +variable "project_id" { + description = "Project ID for your GCP project to store artifacts" +} + +variable "ci_project" { + description = "Continuous Integration Project which pushes artifacts" +} + +variable "ci_composer_env" { + description = "CI Cloud Composer environment" +} + +variable "ci_composer_region" { + description = "CI compute region for Cloud Composer" +} + +variable "ci_composer_dags_bucket" { + description = "GSC location for Dags for CI Cloud Composer environment" +} + +variable "dataflow_jars_bucket" { + description = "CI tests will pick up Dataflow JARs from here" +} + +variable "dataflow_staging_bucket" { + description = "CI tests will run Dataflow jobs with this staging bucket" +} + +variable "push_sa" { + description = "service account responsible for pushing artifacts. this is typically the cloud build SA in the CI project." +} diff --git a/terraform/backend.tf b/terraform/backend.tf new file mode 100644 index 0000000..07d0536 --- /dev/null +++ b/terraform/backend.tf @@ -0,0 +1,7 @@ +# Generated by Terragrunt. Sig: nIlQXj57tbuaRZEa +terraform { + backend "gcs" { + bucket = "datapipelines-ci-tfstate" + prefix = "./terraform.tfstate" + } +} diff --git a/terraform/ci/terragrunt.hcl b/terraform/ci/terragrunt.hcl new file mode 100644 index 0000000..31d9b0f --- /dev/null +++ b/terraform/ci/terragrunt.hcl @@ -0,0 +1,37 @@ +/** + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +include { + path = find_in_parent_folders() +} + +locals { + env = "ci" +} + +terraform { + source = "${get_terragrunt_dir()}/../datapipelines-infra" +} + +inputs = { + project_id = "datapipelines-ci-282719" + network_name = "datapipelines-net" + composer_region = "us-central1" + composer_subnet = "composer-subnet" + composer_env_name = "datapipelines-orchestration" + env = local.env + artifacts_project = "datapipelines-artifacts" +} diff --git a/terraform/datapipelines-infra/README.md b/terraform/datapipelines-infra/README.md new file mode 100644 index 0000000..7d4d92d --- /dev/null +++ b/terraform/datapipelines-infra/README.md @@ -0,0 +1,36 @@ +# Datapipelines Infrastructure Module +Module to DRY up infrastructure for CI and prod datapipelines environments. + +## Requirements + +| Name | Version | +|------|---------| +| terraform | >= 0.12 | + +## Providers + +| Name | Version | +|------|---------| +| google | n/a | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| artifacts\_project | project to push artifacts for successful post commits | `any` | n/a | yes | +| composer\_env\_name | Composer Environment name | `string` | `"datapipelines-orchestration"` | no | +| composer\_region | Region for your composer environment | `string` | `"us-central1"` | no | +| composer\_subnet | Name for composer subnetwork to create | `string` | `"composer-subnet"` | no | +| env | Environment name ie. dev, test, prod | `string` | `""` | no | +| network\_name | The network your data pipelines should use | `string` | `"datapipelines-net"` | no | +| project\_id | Project ID for your GCP project to run CI tests | `any` | n/a | yes | + +## Outputs + +| Name | Description | +|------|-------------| +| cloudbuild-sa | The Cloud build SA for the project created by this module | +| composer-env | The Cloud Composer Environment created by this module | +| project | The project created by this module | +| vpc | The VPC network created by this module | + diff --git a/terraform/datapipelines-infra/composer.tf b/terraform/datapipelines-infra/composer.tf new file mode 100644 index 0000000..7b8f13b --- /dev/null +++ b/terraform/datapipelines-infra/composer.tf @@ -0,0 +1,81 @@ +# Copyright 2019 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +locals { + max_threads = 2 * var.composer_num_cpus + worker_concurrency = 6 * var.composer_num_cpus + parallelism = (6 * var.composer_num_cpus) * var.composer_node_count +} + +resource "google_composer_environment" "orchestration" { + project = var.project_id + name = var.composer_env_name + region = var.composer_region + + config { + node_count = var.composer_node_count + + software_config { + image_version = "composer-1.10.6-airflow-1.10.6" + python_version = "3" + + airflow_config_overrides = { + # Improves stability when Deleteing DAGs. 
+ core-dags_are_paused_at_creation = "True" + # Number of processes to process DAG files + # estimate = 2*num_cpu_per_node + scheduler-max_threads = tostring(local.max_threads) + # Number of celery processes per Airflow worker + # estimate = num_dags * num_tasks_per_dag * execution_duration_per_task / dag_ scheduling_period / num_airflow_workers + # |----------------------------------- total time needed ------------------------------------| + # or estimate = num_cpu_per_node * 6 + # use lesser of the two estimates + celery-worker_concurrency = tostring(local.worker_concurrency) + # The amount of parallelism as a setting to the executor. This defines the max number of task instances that should run simultaneously + # estimate = worker_concurrency * num_airflow_workers + core-parallelism = tostring(local.parallelism) + # The number of task instances allowed to run concurrently by the scheduler + # estimate = parallelism + core-dag_concurrency = tostring(local.parallelism) + # When not using pools, tasks are run in the "default pool", whose size is guided by this config element + # estimate = parallelism + core-non_pooled_task_slot_count = tostring(local.parallelism) + core-store_serialized_dags = "True" + } + } + + node_config { + zone = "us-central1-f" + machine_type = "n1-highmem-${var.composer_num_cpus}" + disk_size_gb = "30" + network = module.vpc.network_self_link + subnetwork = module.vpc.subnets["${var.composer_region}/${var.composer_subnet}"].self_link + } + } + + depends_on = [google_project_iam_member.composer-worker] +} + +resource "google_service_account" "composer_sa" { + project = var.project_id + account_id = "composer-env-account" + display_name = "Service Account for Composer Environment" +} + +resource "google_project_iam_member" "composer-worker" { + project = var.project_id + role = "roles/composer.worker" + member = "serviceAccount:${google_service_account.composer_sa.email}" +} + diff --git a/terraform/datapipelines-infra/gcs.tf b/terraform/datapipelines-infra/gcs.tf new file mode 100644 index 0000000..a2b6cfc --- /dev/null +++ b/terraform/datapipelines-infra/gcs.tf @@ -0,0 +1,53 @@ +module "data_buckets" { + source = "terraform-google-modules/cloud-storage/google" + version = "~> 1.6" + project_id = var.project_id + location = "US" + + prefix = var.project_id + + names = [ + "wordcount_input", + "wordcount_result", + "wordcount_ref", + ] + + versioning = { + first = true + } + + creators = [ + "serviceAccount:${google_service_account.composer_sa.email}", + ] + + viewers = [ + "serviceAccount:${google_service_account.composer_sa.email}", + ] +} + +module "dataflow_buckets" { + source = "terraform-google-modules/cloud-storage/google" + version = "~> 1.6" + project_id = var.project_id + location = "US" + + prefix = var.project_id + + names = [ + "dataflow_jars", + "dataflow_staging", + ] + + versioning = { + first = true + } + + creators = [ + "serviceAccount:${google_service_account.composer_sa.email}", + ] + + viewers = [ + "serviceAccount:${google_service_account.composer_sa.email}", + ] +} + diff --git a/terraform/datapipelines-infra/network.tf b/terraform/datapipelines-infra/network.tf new file mode 100644 index 0000000..cc8b05b --- /dev/null +++ b/terraform/datapipelines-infra/network.tf @@ -0,0 +1,50 @@ +# Copyright 2019 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +module "vpc" { + source = "terraform-google-modules/network/google" + version = "~> 2.3" + + project_id = var.project_id + network_name = var.network_name + routing_mode = "GLOBAL" + + subnets = [ + { + subnet_name = var.composer_subnet + subnet_ip = "10.2.0.0/16" + subnet_region = "us-central1" + subnet_private_access = "true" + description = "Subnet to house Cloud Composer Environment" + }, + { + subnet_name = "dataflow-subnet" + subnet_ip = "10.3.0.0/16" + subnet_region = "us-central1" + subnet_private_access = "true" + subnet_flow_logs = "true" + description = "Subnet for Cloud Dataflow workers" + }, + ] + + routes = [ + { + name = "egress-internet" + description = "route through IGW to access internet" + destination_range = "0.0.0.0/0" + tags = "egress-inet" + next_hop_internet = "true" + } + ] +} diff --git a/terraform/datapipelines-infra/outputs.tf b/terraform/datapipelines-infra/outputs.tf new file mode 100644 index 0000000..ad47afe --- /dev/null +++ b/terraform/datapipelines-infra/outputs.tf @@ -0,0 +1,41 @@ +data google_project "project" { + project_id = var.project_id +} +output "vpc" { + value = module.vpc + description = "The VPC network created by this module" +} + +output "composer-region" { + value = var.composer_region +} + +output "composer-env-name" { + value = google_composer_environment.orchestration.name + description = "The Cloud Composer Environment created by this module" +} + +output "composer-dags-bucket" { + value = trimsuffix(google_composer_environment.orchestration.config[0].dag_gcs_prefix, "dags") + description = "The Cloud Composer Environment created by this module" +} + +output "cloudbuild-sa" { + value = "${data.google_project.project.number}@cloudbuild.gserviceaccount.com" + description = "The Cloud build SA for the project created by this module" +} + +output "dataflow-jars-bucket" { + value = module.dataflow_buckets.names_list[0] + description = "Bucket where composer pulls Dataflow JARs from" +} + +output "dataflow-staging-bucket" { + value = module.dataflow_buckets.names_list[1] + description = "Staging bucket where for dataflow jobs" +} + +output "project" { + value = data.google_project.project + description = "The project created by this module" +} diff --git a/terraform/datapipelines-infra/prod.tfvars b/terraform/datapipelines-infra/prod.tfvars new file mode 100644 index 0000000..d13f112 --- /dev/null +++ b/terraform/datapipelines-infra/prod.tfvars @@ -0,0 +1,7 @@ +project_id = "datapipelines-prod" +project_num = "715427528296" +network_name = "datapipelines-net" +composer_region = "us-central1" +composer_subnet = "composer-subnet" +composer_env_name = "datapipelines-orchestration" +env = "prod" diff --git a/terraform/datapipelines-infra/services.tf b/terraform/datapipelines-infra/services.tf new file mode 100644 index 0000000..34915fd --- /dev/null +++ b/terraform/datapipelines-infra/services.tf @@ -0,0 +1,32 @@ +# Copyright 2019 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +module "project-services" { + source = "terraform-google-modules/project-factory/google//modules/project_services" + version = "7.1.0" + + project_id = var.project_id + + activate_apis = [ + "compute.googleapis.com", + "iam.googleapis.com", + "cloudbuild.googleapis.com", + "sourcerepo.googleapis.com", + "composer.googleapis.com", + "dataflow.googleapis.com", + "pubsub.googleapis.com", + "stackdriver.googleapis.com", + ] +} + diff --git a/terraform/datapipelines-infra/terragrunt.hcl b/terraform/datapipelines-infra/terragrunt.hcl new file mode 100644 index 0000000..f598fa1 --- /dev/null +++ b/terraform/datapipelines-infra/terragrunt.hcl @@ -0,0 +1,19 @@ +/** + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +include { + path = find_in_parent_folders() +} diff --git a/terraform/datapipelines-infra/variables.tf b/terraform/datapipelines-infra/variables.tf new file mode 100644 index 0000000..b79478f --- /dev/null +++ b/terraform/datapipelines-infra/variables.tf @@ -0,0 +1,58 @@ +# Copyright 2019 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +variable "project_id" { + description = "Project ID for your GCP project to run CI tests" +} + +variable "artifacts_project" { + description = "project to push artifacts for successful post commits" +} + +variable "network_name" { + description = "The network your data pipelines should use" + default = "datapipelines-net" +} + +variable "composer_region" { + description = "Region for your composer environment" + default = "us-central1" +} + +variable "composer_subnet" { + description = "Name for composer subnetwork to create" + default = "composer-subnet" +} + +variable "composer_env_name" { + description = "Composer Environment name" + default = "datapipelines-orchestration" +} + +variable "composer_node_count" { + description = "Composer Environment node count" + type = number + default = 3 +} + +variable "composer_num_cpus" { + description = "Composer Environment node count. This should be 2,4,6,16,32,64." 
+ type = number + default = 8 +} + +variable "env" { + description = "Environment name ie. dev, test, prod" + default = "" +} diff --git a/terraform/datapipelines-infra/versions.tf b/terraform/datapipelines-infra/versions.tf new file mode 100644 index 0000000..ac97c6a --- /dev/null +++ b/terraform/datapipelines-infra/versions.tf @@ -0,0 +1,4 @@ + +terraform { + required_version = ">= 0.12" +} diff --git a/terraform/prod/terragrunt.hcl b/terraform/prod/terragrunt.hcl new file mode 100644 index 0000000..ef21471 --- /dev/null +++ b/terraform/prod/terragrunt.hcl @@ -0,0 +1,37 @@ +/** + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +include { + path = find_in_parent_folders() +} + +locals { + env = "prod" +} + +terraform { + source = "${get_terragrunt_dir()}/../datapipelines-infra" +} + +inputs = { + project_id = "datapipelines-${local.env}" + network_name = "datapipelines-net" + composer_region = "us-central1" + composer_subnet = "composer-subnet" + composer_env_name = "datapipelines-orchestration" + env = local.env + artifacts_project = "datapipelines-artifacts" +} diff --git a/terraform/terragrunt.hcl b/terraform/terragrunt.hcl new file mode 100644 index 0000000..e319261 --- /dev/null +++ b/terraform/terragrunt.hcl @@ -0,0 +1,31 @@ +/** + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +remote_state { + backend = "gcs" + generate = { + path = "backend.tf" + if_exists = "overwrite" + } + config = { + bucket = "datapipelines-terraform-state" + project = "datapipelines-prod" + location = "us" + prefix = "terraform_state/${path_relative_to_include()}/terraform.tfstate" + } +} + +
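A note on the `remote_state` block above: Terragrunt generates a `backend.tf` in each module directory, substituting `path_relative_to_include()` with that module's path relative to this root config. As a sketch (assuming the prod module lives at `terraform/prod/`, per the terragrunt.hcl files in this change), the generated file would look like:

```hcl
# Hypothetical backend.tf generated by Terragrunt for terraform/prod/
# (path_relative_to_include() resolves to "prod")
terraform {
  backend "gcs" {
    bucket = "datapipelines-terraform-state"
    prefix = "terraform_state/prod/terraform.tfstate"
  }
}
```

This mirrors the checked-in `terraform/artifacts/backend.tf` earlier in this change, which was generated the same way.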