diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000..c0ee48f
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,3 @@
+[flake8]
+max-line-length=80
+exclude=.git,__pycache__,.venv
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..91b747e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,146 @@
+target/
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# static files generated from Django application using `collectstatic`
+media
+static
+
+.idea/
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 939e534..e69de29 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,28 +0,0 @@
-# How to Contribute
-
-We'd love to accept your patches and contributions to this project. There are
-just a few small guidelines you need to follow.
-
-## Contributor License Agreement
-
-Contributions to this project must be accompanied by a Contributor License
-Agreement. You (or your employer) retain the copyright to your contribution;
-this simply gives us permission to use and redistribute your contributions as
-part of the project. Head over to to see
-your current agreements on file or to sign a new one.
-
-You generally only need to submit a CLA once, so if you've already submitted one
-(even if it was for a different project), you probably don't need to do it
-again.
-
-## Code reviews
-
-All submissions, including submissions by project members, require review. We
-use GitHub pull requests for this purpose. Consult
-[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
-information on using pull requests.
-
-## Community Guidelines
-
-This project follows [Google's Open Source Community
-Guidelines](https://opensource.google.com/conduct/).
diff --git a/LICENSE b/LICENSE
index f433b1a..a5f721e 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,4 +1,3 @@
-
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
@@ -175,3 +174,28 @@
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright 2019 Google LLC
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..ee5c242
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,40 @@
+# Copyright 2018 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Make will use bash instead of sh
+SHELL := /usr/bin/env bash
+
+# The .PHONY directive tells make that this isn't a file target
+.PHONY: fmt
+fmt: ## Format files, including README
+ @$$SHELL ./helpers/format.sh
+
+help: ## Prints help for targets with comments
+ @grep -E '^[a-zA-Z._-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "make \033[36m%- 30s\033[0m %s\n", $$1, $$2}'
+
+.PHONY: test
+test: ## Run format checks, linters and unit tests
+ @$$SHELL ./helpers/check_format.sh && python3 -m flake8 --max-line-length=100 && ./helpers/run_tests.sh
+
+.PHONY: precommit
+precommit: ## Run format checks, linters and relevant precommit cloudbuilds
+ @$$SHELL ./helpers/check_format.sh && python3 -m flake8 --max-line-length=100 && ./helpers/run_relevant_cloudbuilds.sh precommit_cloudbuild.yaml
+
+.PHONY: push_ci_image
+push_ci_image:
+ @cd ci && gcloud builds submit --project=datapipelines-ci-282719 --tag gcr.io/datapipelines-ci-282719/make .
+
+.PHONY: push_deploydags_image
+push_deploydags_image:
+ @cd composer/cloudbuild/go/dagsdeployer && gcloud builds submit --project=datapipelines-ci-282719 --tag gcr.io/datapipelines-ci-282719/deploydags .
+
diff --git a/README.md b/README.md
index ceaff0b..4257295 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,346 @@
-# CI/CD for data processing workflow
-This repository contains source code for the guide on how to use Cloud Build and Cloud Composer to create a CI/CD pipeline for building, deployment and testing of a data processing workflow.
+# Data Pipelines CI/CD Mono Repo
+This repo provides an example of using [Cloud Build](https://cloud.google.com/cloud-build/)
+to deploy various artifacts for GCP data and analytics technologies.
+The repo includes a Terraform directory to spin up infrastructure as well as
+Cloud Build Triggers which will automate the deployments of new commits to
+master.
-Please refer to the solution guide for the steps to run the code: [solution
-tutorial](https://cloud.google.com/solutions/cicd-pipeline-for-data-processing)
+## GCP Project Structure
+This example focuses on CI checks on PRs, Artifact staging and production deployment.
+1. CI: Houses infrastructure similar to production to facilitate Continuous
+Integration tests on PRs.
+1. Artifacts: Houses built artifacts (such as images, executables, etc.) that
+passed all CI checks. Pushed from CI; pulled from Prod.
+1. Production: Where the workload runs that actually serves the business.
+
+The formal [similarity](https://en.wikipedia.org/wiki/Similarity_(geometry))
+between CI and production is enforced because both are provisioned with the
+same terraform, just with different inputs.
+This includes pointing to different projects / buckets, and might include
+sizing differences in the Composer environment for production-scale workloads.
+
+This project uses [terragrunt](https://terragrunt.gruntwork.io/) to manage the
+ci, artifacts and production projects, keep terraform configs and backends DRY,
+and handle passing dependencies between the terraform states. It was chosen as
+an OSS alternative to Terraform Enterprise.
+
+CI/CD for IaC is a topic of its own and is only included here for
+reproducibility and example's sake.
+
+In many organizations, there is a concept of "QA" or "Staging" project /
+environment where additional manual validation is done.
+The concepts in this repo can be extended to accommodate such a structure
+by adding a directory under terraform with a `terragrunt.hcl` file that
+handles inputs and dependencies.
+
+## Flow
+### Development Flow
+1. Prepare changes and run `make test` to run static / unit tests locally.
+1. Open PR. Unit and style checks will run automatically.
+1. Maintainer's `/gcbrun` comment triggers CI process (below) in CI project.
+1. Fix anything that is causing the build to fail (this could include adding
+new build steps if necessary).
+1. A successful CI run pushes artifacts to the artifacts project.
+Images go to GCR, JARs go to GCS with a `SHORT_SHA` prefix.
+
+### Deployment Flow
+Run any necessary large-scale integration testing or manual confirmation of the
+CI environment. These tests do not fit comfortably in the Cloud Build 10 minute
+timeout and were out of scope for this example, but could also be automated in a
+more persistent CI framework like Spinnaker, Jenkins or GitLab.
+Then run the root cloudbuild with the production substitution values, as shown
+below.
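+
+For example (project, environment and bucket names below are placeholders, and
+only a subset of the substitutions `cd/prod.yaml` expects is shown):
+```bash
+# Sketch: submit the production deploy from the repo root.
+gcloud builds submit --config=cd/prod.yaml \
+  --substitutions=_COMPOSER_REGION=us-central1,_COMPOSER_ENV_NAME=prod-composer,_DATAFLOW_JAR_BUCKET=prod-dataflow-jars
+```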
+
+## Precommit and Postcommit "Discovery"
+Each directory in this repo containing code to be tested with a precommit and/or
+deployed with a postcommit can be picked up by the build discovery script
+defined in `./helpers/run_relevant_cloudbuilds.sh` by defining the following:
+1. a `precommit_cloudbuild.yaml`: defines unit tests and static analysis beyond
+what the repo enforces.
+1. a `cloudbuild.yaml`: runs integration tests, deploys artifacts and updates
+necessary references for system tests. For example, build a Dataflow JAR and
+update the Airflow Variable in the Composer Environment that tells the DAG which
+JAR to run.
+
+All nested cloudbuilds should assume they run from the root of the repo and set
+`dir` accordingly.
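+
+The core of that discovery can be sketched as the following loop (illustrative
+only; the real logic lives in `helpers/run_relevant_cloudbuilds.sh` and may
+differ):
+```bash
+#!/bin/bash
+# Sketch: submit every nested cloudbuild whose directory was touched.
+BUILD_FILE="$1"; shift  # e.g. precommit_cloudbuild.yaml or cloudbuild.yaml
+find . -mindepth 2 -name "$BUILD_FILE" | while IFS= read -r config; do
+  dir="$(dirname "$config")"
+  # Only submit builds for directories changed relative to master.
+  if ! git diff --quiet origin/master... -- "$dir"; then
+    echo "Running $config"
+    gcloud builds submit --config="$config" "$@" .
+  fi
+done
+```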
+
+### Precommit
+The precommit should run without substitutions.
+
+### Cloud Build
+The `cloudbuild.yaml` should accept the following substitutions:
+- `_COMPOSER_REGION`
+- `_COMPOSER_ENV_NAME`
+- `_DATAFLOW_JAR_BUCKET`
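+
+For example, a nested build could be submitted manually like this (region,
+environment and bucket values are placeholders):
+```bash
+gcloud builds submit --config=dataflow/java/wordcount/cloudbuild.yaml \
+  --substitutions=_COMPOSER_REGION=us-central1,_COMPOSER_ENV_NAME=ci-composer,_DATAFLOW_JAR_BUCKET=ci-dataflow-jars
+```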
+
+The precommit will be run on every PR that includes changes under that file tree.
+The build will deploy to the CI environment on a "/gcbrun" comment.
+
+## The Cloud Build CI Process
+1. init-git-repo: initialize the git repository.
+1. merge-master: merge the master branch so we test post-merge code.
+1. run-builds: search for post commit `cloudbuild.yaml`s to run using `helpers/run_relevant_cloudbuilds.sh`
+1. deploy-sql-queries-for-composer: Copy the BigQuery SQL to the dags folder of the target Composer Environment.
+1. deploy-airflowignore: Copies an [`.airflowignore`](https://airflow.apache.org/docs/stable/concepts.html#airflowignore)
+to ignore non-dag definition files (like sql files) in the dag parser.
+1. deploy-test-input-file: Copies a file to GCS (just for the example purposes of this DAG)
+1. deploy-test-ref-file: Copies a file to GCS (just for the example purposes of this DAG)
+1. stage-airflow-variables: Copies the rendered AirflowVariables.json file to the Cloud Composer workers.
+1. import-airflow-variables: Imports the rendered AirflowVariables.json file to the Cloud Composer Environment.
+1. deploy-custom-plugins: Copy the source code for the Airflow plugins to the `plugins/` directory of
+the Composer Bucket.
+1. stage-for-integration-test: Copy the airflow dags to a `data/test/` directory in the Composer
+environment for integration test.
+1. dag-parse-integration-test: Run `list_dags` on the `data/test/` directory in the Composer
+environment. This validates that DAGs don't refer to variables or connections that don't exist in the target environment.
+1. clean-up-data-dir-dags: Clean up the integration test artifacts.
+1. pull-deploydags: pull the existing deploydags image (to facilitate caching if possible).
+1. build-deploydags: Build the golang `deploydags` application
+(documented in `composer/cloudbuild/README.md`)
+1. run-deploydags: Run the deploy dags application.
+
+
+## Setup Cloud Shell Development Environment (for example's sake)
+Install terragrunt and ensure Java 8.
+```bash
+sudo ./helpers/init_cloudshell.sh
+```
+
+
+You can confirm things look roughly like this:
+```
+# Python for airflow / beam development
+$ python3 --version
+Python 3.7.3
+
+# Java for beam development
+$ mvn -version
+Apache Maven 3.6.3 (cecedd343002696d0abb50b32b541b8a6ba2883f)
+Maven home: /opt/maven
+Java version: 1.8.0_232, vendor: Oracle Corporation, runtime: /usr/lib/jvm/java-8-openjdk-amd64/jre
+Default locale: en_US, platform encoding: UTF-8
+OS name: "linux", version: "4.19.112+", arch: "amd64", family: "unix"
+
+# Golang for modifying deploydags app
+$ go version
+go version go1.14.4 linux/amd64
+
+# Terragrunt / Terraform for IaC for the projects
+$ terraform -version
+Terraform v0.12.24
+
+$ terragrunt -version
+terragrunt version v0.23.24
+```
+
+To set up Python dependencies for running the tests:
+```bash
+python3 -m venv .venv
+source .venv/bin/activate
+pip install -r requirements-dev.txt
+cd composer
+python3 -m pytest
+```
+
+### Formatting Code Locally
+Runs `go fmt`, `yapf`, `google-java-format`, `terraform fmt` on appropriate files.
+```bash
+make fmt
+```
+
+### Running Tests Locally
+Runs linters, static code analysis and unit tests.
+```bash
+make test
+```
+
+### Pushing a new version of the deploydags golang application
+Changes to the deploydags golang app can be pushed with
+```bash
+make push_deploydags_image
+```
+
+## Repo Structure
+```
+.
+├── bigquery
+│ ├── precommit_cloudbuild.yaml
+│ ├── README.md
+│ ├── sql
+│ │ └── shakespeare_top_25.sql
+│ └── tests
+│ └── test_sql.sh
+├── cd
+│ └── prod.yaml
+├── ci
+│ └── Dockerfile
+├── cloudbuild.yaml
+├── composer
+│ ├── cloudbuild
+│ │ ├── bin
+│ │ │ └── run_tests.sh
+│ │ ├── go
+│ │ │ └── dagsdeployer
+│ │ │ ├── cmd
+│ │ │ │ └── deploydags
+│ │ │ │ ├── deploydags
+│ │ │ │ └── main.go
+│ │ │ ├── Dockerfile
+│ │ │ ├── go.mod
+│ │ │ ├── go.sum
+│ │ │ └── internal
+│ │ │ ├── composerdeployer
+│ │ │ │ ├── composer_ops.go
+│ │ │ │ └── composer_ops_test.go
+│ │ │ └── gcshasher
+│ │ │ ├── gcs_hash.go
+│ │ │ ├── gcs_hash_test.go
+│ │ │ └── testdata
+│ │ │ ├── test_diff.txt
+│ │ │ └── test.txt
+│ │ ├── Makefile
+│ │ └── README.md
+│ ├── config
+│ │ ├── AirflowVariables.json
+│ │ └── ci_dags.txt
+│ ├── dags
+│ │ ├── support-files
+│ │ │ ├── input.txt
+│ │ │ └── ref.txt
+│ │ ├── tutorial.py
+│ │ └── wordcount_dag.py
+│ ├── deploydags
+│ ├── __init__.py
+│ ├── plugins
+│ │ └── xcom_utils_plugin
+│ │ ├── __init__.py
+│ │ ├── operators
+│ │ │ ├── compare_xcom_maps.py
+│ │ │ ├── __init__.py
+│ │ │ └── __pycache__
+│ │ │ ├── compare_xcom_maps.cpython-37.pyc
+│ │ │ └── __init__.cpython-37.pyc
+│ │ └── __pycache__
+│ │ └── __init__.cpython-37.pyc
+│ ├── precommit_cloudbuild.yaml
+│ ├── __pycache__
+│ │ └── __init__.cpython-37.pyc
+│ ├── requirements-dev.txt
+│ └── tests
+│ ├── __init__.py
+│ ├── __pycache__
+│ │ ├── __init__.cpython-37.pyc
+│ │ ├── test_compare_xcom_maps.cpython-37.pyc
+│ │ ├── test_compare_xcom_maps.cpython-37-pytest-5.4.3.pyc
+│ │ ├── test_dag_validation.cpython-37.pyc
+│ │ └── test_dag_validation.cpython-37-pytest-5.4.3.pyc
+│ ├── test_compare_xcom_maps.py
+│ └── test_dag_validation.py
+├── CONTRIBUTING.md
+├── dataflow
+│ └── java
+│ └── wordcount
+│ ├── cloudbuild.yaml
+│ ├── pom.xml
+│ ├── precommit_cloudbuild.yaml
+│ ├── src
+│ │ ├── main
+│ │ │ └── java
+│ │ │ └── org
+│ │ │ └── apache
+│ │ │ └── beam
+│ │ │ └── examples
+│ │ │ └── WordCount.java
+│ │ └── test
+│ │ └── java
+│ │ └── org
+│ │ └── apache
+│ │ └── beam
+│ │ └── examples
+│ │ └── WordCountTest.java
+│ └── target
+│ ├── classes
+│ │ └── org
+│ │ └── apache
+│ │ └── beam
+│ │ └── examples
+│ │ ├── WordCount$CountWords.class
+│ │ ├── WordCount$ExtractWordsFn.class
+│ │ ├── WordCount$FormatAsTextFn.class
+│ │ ├── WordCount$WordCountOptions.class
+│ │ └── WordCount.class
+│ ├── generated-sources
+│ │ └── annotations
+│ ├── generated-test-sources
+│ │ └── test-annotations
+│ ├── maven-archiver
+│ │ └── pom.properties
+│ ├── maven-status
+│ │ └── maven-compiler-plugin
+│ │ ├── compile
+│ │ │ └── default-compile
+│ │ │ ├── createdFiles.lst
+│ │ │ └── inputFiles.lst
+│ │ └── testCompile
+│ │ └── default-testCompile
+│ │ ├── createdFiles.lst
+│ │ └── inputFiles.lst
+│ ├── surefire-reports
+│ │ ├── org.apache.beam.examples.WordCountTest-output.txt
+│ │ ├── org.apache.beam.examples.WordCountTest.txt
+│ │ └── TEST-org.apache.beam.examples.WordCountTest.xml
+│ ├── test-classes
+│ │ └── org
+│ │ └── apache
+│ │ └── beam
+│ │ └── examples
+│ │ └── WordCountTest.class
+│ ├── word-count-beam-0.1.jar
+│ └── word-count-beam-bundled-0.1.jar
+├── helpers
+│ ├── check_format.sh
+│ ├── exclusion_list.txt
+│ ├── format.sh
+│ ├── init_cloudshell.sh
+│ ├── init_git_repo.sh
+│ ├── run_relevant_cloudbuilds.sh
+│ └── run_tests.sh
+├── LICENSE
+├── license-templates
+│ └── LICENSE.txt
+├── Makefile
+├── precommit_cloudbuild.yaml
+├── README.md
+├── scripts
+│ ├── get_composer_properties.sh
+│ └── set_env.sh
+└── terraform
+ ├── artifacts
+ │ ├── backend.tf
+ │ ├── main.tf
+ │ ├── outputs.tf
+ │ ├── README.md
+ │ ├── terragrunt.hcl
+ │ └── variables.tf
+ ├── backend.tf
+ ├── ci
+ │ └── terragrunt.hcl
+ ├── datapipelines-infra
+ │ ├── backend.tf
+ │ ├── composer.tf
+ │ ├── gcs.tf
+ │ ├── network.tf
+ │ ├── outputs.tf
+ │ ├── prod.tfvars
+ │ ├── README.md
+ │ ├── services.tf
+ │ ├── terragrunt.hcl
+ │ ├── variables.tf
+ │ └── versions.tf
+ ├── prod
+ │ └── terragrunt.hcl
+ └── terragrunt.hcl
+
+74 directories, 103 files
+```
diff --git a/bigquery/README.md b/bigquery/README.md
new file mode 100644
index 0000000..6ef7022
--- /dev/null
+++ b/bigquery/README.md
@@ -0,0 +1,3 @@
+# BigQuery
+Store SQL files under the `sql` directory.
+If your SQL contains Jinja templates, add a JSON file with substitution values that will make the query pass in the CI project.
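+
+To check a query locally the same way CI does, dry-run it with the `bq` CLI
+(this mirrors what `tests/test_sql.sh` runs):
+```bash
+# Validate syntax and table references without executing the query.
+bq query --use_legacy_sql=false --dry_run "$(cat sql/shakespeare_top_25.sql)"
+```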
diff --git a/bigquery/precommit_cloudbuild.yaml b/bigquery/precommit_cloudbuild.yaml
new file mode 100644
index 0000000..774b92a
--- /dev/null
+++ b/bigquery/precommit_cloudbuild.yaml
@@ -0,0 +1,8 @@
+steps:
+# Dry Run SQL.
+- name: 'google/cloud-sdk'
+ waitFor: ['-']
+ dir: 'bigquery'
+ entrypoint: 'bash'
+ args: ['tests/test_sql.sh']
+ id: 'test-sql-queries'
diff --git a/bigquery/sql/shakespeare_top_25.sql b/bigquery/sql/shakespeare_top_25.sql
new file mode 100644
index 0000000..8aa1cc7
--- /dev/null
+++ b/bigquery/sql/shakespeare_top_25.sql
@@ -0,0 +1,11 @@
+#standardSQL
+SELECT
+ word,
+ SUM(word_count) as wc
+FROM
+ `bigquery-public-data.samples.shakespeare`
+GROUP BY
+ word
+ORDER BY
+ wc DESC
+LIMIT 25
diff --git a/bigquery/tests/test_sql.sh b/bigquery/tests/test_sql.sh
new file mode 100755
index 0000000..ad16d30
--- /dev/null
+++ b/bigquery/tests/test_sql.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+# Copyright 2019 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+# $1 is a query string to dry_run
+function dry_run_query() {
+ bq query \
+ --use_legacy_sql=false \
+ --dry_run \
+ "$1"
+}
+
+while IFS= read -r query_file
+do
+ echo "$query_file"
+  # set -e would exit before a separate status check, so guard the call.
+  if ! dry_run_query "$(cat "$query_file")"; then
+    echo "Failed to dry run $query_file"
+    exit 1
+  fi
+done < <(find ./sql -path "*.sql")
+
diff --git a/cd/prod.yaml b/cd/prod.yaml
new file mode 100644
index 0000000..aaa7a64
--- /dev/null
+++ b/cd/prod.yaml
@@ -0,0 +1,152 @@
+# Copyright 2019 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+steps:
+# [Dataflow]
+# Stage JARs on GCS.
+- name: gcr.io/cloud-builders/gsutil
+ args: [
+ 'cp',
+    'gs://${_DATAFLOW_ARTIFACT_BUCKET}/${_CI_BUILD_ID}/wordcount.jar',
+ 'gs://${_DATAFLOW_JAR_BUCKET}/wordcount.jar']
+ id: 'deploy-wordcount-jar'
+# [BigQuery]
+# Copy SQL to DAGs folder in prod.
+- name: gcr.io/cloud-builders/gsutil
+  waitFor: ['-']  # no test-sql-queries step exists in this prod build
+ args: [
+ 'rsync','-r', '-d',
+ 'sql', '${_COMPOSER_DAG_BUCKET}dags/sql'
+ ]
+ dir: './bigquery/'
+ id: 'deploy-sql-queries-for-composer'
+# [Composer]
+# Render AirflowVariables.json with production values
+- name: 'gcr.io/${PROJECT_ID}/envsubst'
+ waitFor: ['-']
+ env: [
+ "GCP_PROJECT_ID=${PROJECT_ID}",
+ "COMPOSER_REGION=${_COMPOSER_REGION}",
+ "DATAFLOW_JAR_BUCKET=${_DATAFLOW_JAR_BUCKET}",
+ "INPUT_BUCKET=${_WORDCOUNT_INPUT_BUCKET}",
+ "REF_BUCKET=${_WORDCOUNT_REF_BUCKET}",
+ "RESULT_BUCKET=${_WORDCOUNT_RESULT_BUCKET}",
+ "DATAFLOW_STAGING_BUCKET=${_DATAFLOW_STAGING_BUCKET}",
+ ]
+ args: ['AirflowVariables.json']
+ dir: './composer/config'
+ id: 'render-airflow-variables'
+# Add .airflowignore to GCS prod DAGs folder.
+- name: gcr.io/cloud-builders/gcloud
+  waitFor: ['-']  # no run-unit-tests step exists in this prod build
+ args: [
+ 'composer', 'environments', 'storage', 'dags', 'import',
+ '--source','.airflowignore',
+ '--environment', '${_COMPOSER_ENV_NAME}',
+ '--location', '${_COMPOSER_REGION}'
+ ]
+ dir: './composer/dags/'
+ id: 'deploy-airflowignore'
+# Stage AirflowVariables.json to data directory to be synced to workers.
+- name: gcr.io/cloud-builders/gcloud
+ waitFor: ['render-airflow-variables']
+ args: [
+ 'composer', 'environments', 'storage', 'data', 'import',
+ '--location=${_COMPOSER_REGION}',
+ '--environment=${_COMPOSER_ENV_NAME}',
+ '--source','AirflowVariables.json',
+ '--destination', 'config'
+ ]
+ dir: './composer/config/'
+ id: 'stage-airflow-variables'
+# Import AirflowVariables.json
+- name: gcr.io/cloud-builders/gcloud
+ waitFor: ['stage-airflow-variables']
+ args: [
+ 'composer', 'environments', 'run',
+ '--location=${_COMPOSER_REGION}',
+ '${_COMPOSER_ENV_NAME}',
+ 'variables', '--',
+ '--import', '/home/airflow/gcs/data/config/AirflowVariables.json'
+ ]
+ id: 'import-airflow-variables'
+# Override JAR reference variable to the artifact built in this build.
+- name: gcr.io/cloud-builders/gcloud
+ args: [
+ 'composer', 'environments', 'run',
+ '--location', '${_COMPOSER_REGION}',
+ '${_COMPOSER_ENV_NAME}',
+ 'variables', '--',
+ '--set', 'dataflow_jar_file_test', 'wordcount.jar'
+ ]
+ id: 'set-composer-test-jar-ref'
+# Sync plugins to GCS plugins dir
+- name: gcr.io/cloud-builders/gsutil
+ args: [
+ 'rsync','-r', '-d',
+ 'plugins/',
+ '${_COMPOSER_DAG_BUCKET}plugins'
+ ]
+ dir: './composer/'
+ id: 'deploy-custom-plugins'
+# Sync DAGs to data dir for integration test parsing in target Composer Environment.
+- name: gcr.io/cloud-builders/gsutil
+ waitFor: ['deploy-custom-plugins']
+ args: [
+ 'rsync','-r', '-d',
+ 'dags/',
+ '${_COMPOSER_DAG_BUCKET}data/test-dags/$BUILD_ID'
+ ]
+ dir: './composer/'
+ id: 'stage-for-integration-test'
+# Run integration tests parsing in target Composer Environment.
+- name: gcr.io/cloud-builders/gcloud
+ waitFor: ['stage-for-integration-test']
+ args: [
+ 'composer', 'environments', 'run',
+ '--location', '${_COMPOSER_REGION}',
+ '${_COMPOSER_ENV_NAME}',
+ 'list_dags', '--',
+ '-sd', '/home/airflow/gcs/data/test-dags/$BUILD_ID'
+ ]
+ id: 'dag-parse-integration-test'
+# Clean up.
+- name: gcr.io/cloud-builders/gsutil
+ waitFor: ['dag-parse-integration-test']
+ args: [
+ '-m', 'rm','-r',
+ '${_COMPOSER_DAG_BUCKET}data/test-dags/$BUILD_ID'
+ ]
+ dir: './composer/'
+ id: 'clean-up-data-dir-dags'
+# Run dags deployer golang app.
+- name: gcr.io/${_CI_PROJECT_ID}/deploydags
+ dir: './composer'
+  waitFor: [
+    'clean-up-data-dir-dags',
+    'deploy-wordcount-jar'
+  ]
+ args: [
+ '-dagList=./config/running_dags.txt',
+ '-dagsFolder=./dags',
+ '-project=${PROJECT_ID}',
+ '-region=${_COMPOSER_REGION}',
+ '-composerEnv=${_COMPOSER_ENV_NAME}',
+ '-dagBucketPrefix=${_COMPOSER_DAG_BUCKET}dags',
+ '-replace'
+ ]
+ id: 'run-deploydags'
+options:
+ machineType: 'N1_HIGHCPU_8'
diff --git a/ci/Dockerfile b/ci/Dockerfile
new file mode 100644
index 0000000..830e485
--- /dev/null
+++ b/ci/Dockerfile
@@ -0,0 +1,29 @@
+# This Dockerfile builds the image used in Cloud Build CI to run 'make test'.
+
+FROM python:buster
+
+# install core tools
+RUN apt-get update && apt-get install -y build-essential
+
+RUN curl -sSL https://sdk.cloud.google.com | bash
+
+# install shellcheck
+RUN apt-get install -y shellcheck
+
+# install yapf
+RUN pip3 install yapf flake8 pytest apache-airflow[gcp]==1.10.6
+
+# install golang (+gofmt)
+RUN apt-get install -y golang
+
+# Install java + google-java-format jar
+RUN apt-get install -y default-jdk maven
+RUN wget https://github.com/google/google-java-format/releases/download/google-java-format-1.7/google-java-format-1.7-all-deps.jar --directory-prefix=/usr/share/java/
+
+# install terraform (+fmt )
+RUN wget https://releases.hashicorp.com/terraform/0.12.24/terraform_0.12.24_linux_amd64.zip \
+ && unzip terraform_0.12.24_linux_amd64.zip \
+ && mv terraform /usr/bin \
+ && rm terraform_0.12.24_linux_amd64.zip
+
+ENTRYPOINT ["make"]
diff --git a/cloudbuild.yaml b/cloudbuild.yaml
new file mode 100644
index 0000000..aca6271
--- /dev/null
+++ b/cloudbuild.yaml
@@ -0,0 +1,216 @@
+# Copyright 2019 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+steps:
+- name: 'gcr.io/cloud-builders/git'
+ entrypoint: 'bash'
+ args: [
+ './helpers/init_git_repo.sh'
+ ]
+ env: [
+ 'COMMIT_SHA=${COMMIT_SHA}',
+ 'BASE_REPO_URL=https://github.com/jaketf/ci-cd-for-data-processing-workflow.git',
+ 'BASE_BRANCH=${_BASE_BRANCH}',
+ 'PR_NUMBER=${_PR_NUMBER}'
+ ]
+ id: 'init-git-repo'
+# Merge Master because this is a post-commit
+- name: 'gcr.io/cloud-builders/git'
+ args: ['merge', 'origin/master']
+ waitFor: ['init-git-repo']
+ id: 'merge-master'
+# Run linters and relevant cloudbuilds (we have to do this again in case merge
+# broke a unit test or static check)
+# Individual cloudbuild.yaml's (e.g. for a dataflow pipeline) are responsible
+# for pushing their own artifacts to the appropriate gcs location in the
+# artifacts project and updating the necessary references in composer.
+- name: 'google/cloud-sdk'
+ waitFor: ['merge-master']
+ entrypoint: 'bash'
+ args: [
+ './helpers/run_relevant_cloudbuilds.sh',
+ 'cloudbuild.yaml',
+ '--substitutions=_SHORT_SHA=${SHORT_SHA},_COMPOSER_REGION=${_COMPOSER_REGION},_COMPOSER_ENV_NAME=${_COMPOSER_ENV_NAME},_DATAFLOW_JAR_BUCKET=${_DATAFLOW_JAR_BUCKET}'
+ ]
+ id: 'run-builds'
+# [BigQuery]
+# Copy SQL to DAGs folder.
+- name: 'google/cloud-sdk'
+ waitFor: ['run-builds']
+ entrypoint: 'gsutil'
+ args: [
+ 'rsync','-r', '-d',
+ 'sql', '${_COMPOSER_DAG_BUCKET}dags/sql'
+ ]
+ dir: './bigquery/'
+ id: 'deploy-sql-queries-for-composer'
+# [Composer]
+# Render AirflowVariables.json
+- name: 'gcr.io/${PROJECT_ID}/envsubst'
+ waitFor: ['-']
+ env: [
+ "GCP_PROJECT_ID=${PROJECT_ID}",
+ "COMPOSER_REGION=${_COMPOSER_REGION}",
+ "DATAFLOW_JAR_BUCKET=${_DATAFLOW_ARTIFACTS_BUCKET}",
+ "INPUT_BUCKET=${_WORDCOUNT_INPUT_BUCKET}",
+ "REF_BUCKET=${_WORDCOUNT_REF_BUCKET}",
+ "RESULT_BUCKET=${_WORDCOUNT_RESULT_BUCKET}",
+ "DATAFLOW_STAGING_BUCKET=${_DATAFLOW_STAGING_BUCKET}",
+ ]
+ args: ['AirflowVariables.json']
+ dir: './composer/config'
+ id: 'render-airflow-variables'
+# Add .airflowignore to GCS DAGs folder.
+- name: 'google/cloud-sdk'
+ waitFor: ['run-builds']
+ entrypoint: 'gcloud'
+ args: [
+ 'composer', 'environments', 'storage', 'dags', 'import',
+ '--source','.airflowignore',
+ '--environment', '${_COMPOSER_ENV_NAME}',
+ '--location', '${_COMPOSER_REGION}'
+ ]
+ dir: './composer/dags/'
+ id: 'deploy-airflowignore'
+# Stage files for running the example.
+- name: 'google/cloud-sdk'
+ waitFor: ['-']
+ entrypoint: 'gsutil'
+ args: [
+ 'cp',
+ 'support-files/input.txt',
+ 'gs://${_WORDCOUNT_INPUT_BUCKET}'
+ ]
+ dir: './composer/dags'
+ id: 'deploy-test-input-file'
+- name: 'google/cloud-sdk'
+ waitFor: ['-']
+ entrypoint: 'gsutil'
+ args: [
+ 'cp',
+ 'support-files/ref.txt',
+ 'gs://${_WORDCOUNT_REF_BUCKET}'
+ ]
+ dir: './composer/dags'
+ id: 'deploy-test-ref-file'
+# Stage AirflowVariables.json to data directory to be synced to workers.
+- name: 'google/cloud-sdk'
+ waitFor: ['render-airflow-variables']
+ entrypoint: 'gcloud'
+ args: [
+ 'composer', 'environments', 'storage', 'data', 'import',
+ '--location=${_COMPOSER_REGION}',
+ '--environment=${_COMPOSER_ENV_NAME}',
+ '--source','AirflowVariables.json',
+ '--destination', 'config'
+ ]
+ dir: './composer/config/'
+ id: 'stage-airflow-variables'
+# Import AirflowVariables.json
+- name: 'google/cloud-sdk'
+ waitFor: ['stage-airflow-variables']
+ entrypoint: 'gcloud'
+ args: [
+ 'composer', 'environments', 'run',
+ '--location=${_COMPOSER_REGION}',
+ '${_COMPOSER_ENV_NAME}',
+ 'variables', '--',
+ '--import', '/home/airflow/gcs/data/config/AirflowVariables.json'
+ ]
+ id: 'import-airflow-variables'
+# Sync plugins to GCS plugins dir
+- name: 'google/cloud-sdk'
+ waitFor: ['run-builds']
+ entrypoint: 'gsutil'
+ args: [
+ 'rsync','-r', '-d',
+ 'plugins/',
+ '${_COMPOSER_DAG_BUCKET}plugins'
+ ]
+ dir: './composer/'
+ id: 'deploy-custom-plugins'
+# Sync DAGs to data dir for integration test parsing in target Composer Environment.
+- name: 'google/cloud-sdk'
+ waitFor: ['deploy-custom-plugins']
+ entrypoint: 'gsutil'
+ args: [
+ 'rsync','-r', '-d',
+ 'dags/',
+ '${_COMPOSER_DAG_BUCKET}data/test-dags/$BUILD_ID'
+ ]
+ dir: './composer/'
+ id: 'stage-for-integration-test'
+# Run integration tests parsing in target Composer Environment.
+- name: 'google/cloud-sdk'
+ waitFor: ['stage-for-integration-test']
+ entrypoint: 'gcloud'
+ args: [
+ 'composer', 'environments', 'run',
+ '--location', '${_COMPOSER_REGION}',
+ '${_COMPOSER_ENV_NAME}',
+ 'list_dags', '--',
+ '-sd', '/home/airflow/gcs/data/test-dags/$BUILD_ID'
+ ]
+ id: 'dag-parse-integration-test'
+# Clean up.
+- name: 'google/cloud-sdk'
+ waitFor: ['dag-parse-integration-test']
+ entrypoint: 'gsutil'
+ args: [
+ '-m', 'rm','-r',
+ '${_COMPOSER_DAG_BUCKET}data/test-dags/$BUILD_ID'
+ ]
+ dir: './composer/'
+ id: 'clean-up-data-dir-dags'
+# pull dags deployer golang app.
+- name: gcr.io/cloud-builders/docker
+ waitFor: ['-']
+ entrypoint: 'bash'
+ args: [
+ '-c',
+ 'docker pull gcr.io/${_ARTIFACTS_PROJECT_ID}/deploydags:latest || exit 0'
+ ]
+ id: 'pull-deploydags'
+# build with cache
+- name: gcr.io/cloud-builders/docker
+ waitFor: ['pull-deploydags']
+ dir: './composer/cloudbuild/go/dagsdeployer'
+ args: [
+ 'build',
+ '-t', 'gcr.io/${PROJECT_ID}/deploydags:latest',
+ '--cache-from', 'gcr.io/${_ARTIFACTS_PROJECT_ID}/deploydags:latest',
+ '.'
+ ]
+ id: 'build-deploydags'
+# Run dags deployer golang app.
+- name: gcr.io/${PROJECT_ID}/deploydags
+ dir: './composer'
+ waitFor: [
+ 'run-builds',
+ 'build-deploydags',
+ 'clean-up-data-dir-dags',
+ ]
+ args: [
+ '-dagList=./config/running_dags.txt',
+ '-dagsFolder=./dags',
+ '-project=${PROJECT_ID}',
+ '-region=${_COMPOSER_REGION}',
+ '-composerEnv=${_COMPOSER_ENV_NAME}',
+ '-dagBucketPrefix=${_COMPOSER_DAG_BUCKET}dags',
+ '-replace'
+ ]
+ id: 'run-deploydags'
+artifacts:
+ images: ['gcr.io/${_ARTIFACTS_PROJECT_ID}/deploydags']
+options:
+ machineType: 'N1_HIGHCPU_32' # For test and deploy dags parallelization.
diff --git a/composer/.gitignore b/composer/.gitignore
new file mode 100644
index 0000000..71c88fa
--- /dev/null
+++ b/composer/.gitignore
@@ -0,0 +1,144 @@
+deploydags
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# static files generated from Django application using `collectstatic`
+media
+static
diff --git a/composer/__init__.py b/composer/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/composer/cloudbuild/Makefile b/composer/cloudbuild/Makefile
new file mode 100644
index 0000000..f2199a8
--- /dev/null
+++ b/composer/cloudbuild/Makefile
@@ -0,0 +1,26 @@
+# Copyright 2018 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Make will use bash instead of sh
+SHELL := /usr/bin/env bash
+
+help: ## Prints help for targets with comments
+ @grep -E '^[a-zA-Z._-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "make \033[36m%- 30s\033[0m %s\n", $$1, $$2}'
+
+.PHONY: test
+test: ## Run go tests and go vet
+ @(cd go/dagsdeployer/internal/ && go test ./... && go vet ./... )
+
+.PHONY: push_deploydags_image
+push_deploydags_image:
+ @cd go/dagsdeployer/ && gcloud builds submit --project=datapipelines-ci --tag gcr.io/datapipelines-ci/deploydags .
diff --git a/composer/cloudbuild/README.md b/composer/cloudbuild/README.md
new file mode 100644
index 0000000..2323f88
--- /dev/null
+++ b/composer/cloudbuild/README.md
@@ -0,0 +1,55 @@
+# Dags Deployer Application
+
+The Dags Deployer Application seeks to automate the following steps in the DAG deployment process:
+1. Identify DAGs to start / stop based on the presence of the dag id in the `running_dags.txt` config file.
+1. Check if a DAG needs to be redeployed by checking the file hash of the GCS object against that of the file in the repo.
+1. Stop DAGs: 1) Pause the DAG 2) Delete the GCS source file for the DAG 3) Delete the metadata in the airflowdb for the DAG.
+1. Start DAGs: 1) Copy the source file to the GCS dags folder 2) Unpause the DAG.
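+
+Roughly, stopping a single DAG corresponds to the following manual commands
+(a sketch with placeholder names; the app automates this with retries and
+concurrency):
+```bash
+ENV=my-composer-env REGION=us-central1 DAG_ID=wordcount_dag  # placeholders
+# 1) Pause the DAG.
+gcloud composer environments run "$ENV" --location "$REGION" pause -- "$DAG_ID"
+# 2) Delete the GCS source file for the DAG.
+gsutil rm "gs://my-composer-dag-bucket/dags/${DAG_ID}.py"
+# 3) Delete the DAG metadata from the airflowdb.
+gcloud composer environments run "$ENV" --location "$REGION" delete_dag -- "$DAG_ID"
+```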
+
+The process for [how Composer stores code in GCS](https://cloud.google.com/composer/docs/concepts/cloud-storage)
+and syncs to the airflow workers / webserver is eventually consistent. Therefore this Dags Deployer Application
+retries operations that we might expect to fail (e.g. unpausing a DAG immediately after copying it to GCS may occur
+before the scheduler has parsed the DAG, registering it with the airflowdb). This retry process can take minutes so
+golang was selected as the implementation language to leverage goroutines to concurrently perform the
+DAG stop / DAG start processes to speed up deployments involving the starting / stopping of many DAGs.
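+
+For example, unpausing a freshly copied DAG may need to be retried until the
+scheduler has parsed it; a bash equivalent of that retry (reusing the
+placeholders above) would be:
+```bash
+# Sketch: retry unpause until the scheduler registers the new DAG file.
+for attempt in {1..10}; do
+  gcloud composer environments run "$ENV" --location "$REGION" \
+    unpause -- "$DAG_ID" && break
+  echo "DAG not registered yet (attempt $attempt); retrying in 30s"
+  sleep 30
+done
+```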
+
+Cloud Build will build the golang application, creating an executable with the parameters documented below.
+
+## Parameters
+- `dagList`: path to the list of DAGs that should be running after the deploy
+- `dagsFolder`: path to the dags folder in the repo
+- `project`: GCP project ID
+- `region`: GCP region where the Composer Environment lives
+- `composerEnv`: Cloud Composer Environment name
+- `dagBucketPrefix`: the GCS dags bucket prefix
+- `replace`: if a source DAG mismatches the object of the same name in GCS, delete it and deploy over it
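+
+An example invocation (project, region, environment and bucket values are
+placeholders):
+```bash
+./deploydags \
+  -dagList=./config/running_dags.txt \
+  -dagsFolder=./dags \
+  -project=my-gcp-project \
+  -region=us-central1 \
+  -composerEnv=my-composer-env \
+  -dagBucketPrefix=gs://my-composer-dag-bucket/dags \
+  -replace
+```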
+
+### Running the dags deployer tests
+From this directory run
+```bash
+make test
+```
+
+### Deploying a new image
+From this directory run
+```bash
+make push_deploydags_image
+```
+
+### run_tests.sh
+In order for DAG validation to pass, all files (e.g. SQL query files), variables and connections
+must exist in the local airflow environment.
+`run_tests.sh` is a script to set up a local airflow environment to run dag validation tests.
+It takes three arguments:
+1. Relative path to local BigQuery SQL.
+1. Relative path to a local JSON file with the Airflow Variables necessary for your tests.
+1. Relative path to plugins directory
+
+Installing dependencies
+```bash
+python3 -m venv .venv && source .venv/bin/activate
+pip3 install -r ../requirements-dev.txt
+```
+
+Running the dag validation tests
+```bash
+(cd .. && ./cloudbuild/bin/run_tests.sh ../bigquery/sql ./config/AirflowVariables.json ./plugins)
+```
diff --git a/composer/cloudbuild/bin/run_tests.sh b/composer/cloudbuild/bin/run_tests.sh
new file mode 100755
index 0000000..fe55346
--- /dev/null
+++ b/composer/cloudbuild/bin/run_tests.sh
@@ -0,0 +1,119 @@
+#!/bin/bash
+
+# Copyright 2019 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+PATH=$PATH:/usr/local/airflow/google-cloud-sdk/bin
+export AIRFLOW_HOME=/tmp/airflow
+
+# $1 relative path to directory containing bigquery sql.
+# $2 relative path to JSON file containing Airflow Variables.
+# $3 relative path to plugins directory.
+function setup_local_airflow() {
+ LOCAL_SQL_DIR=$1
+ LOCAL_VARIABLES_JSON=$2
+ LOCAL_PLUGIN_DIR=$3
+ mkdir -p $AIRFLOW_HOME
+  echo "setting up local airflow."
+ airflow version
+ echo "initialize airflow database."
+ airflow initdb
+ if [ -z "$LOCAL_PLUGIN_DIR" ];
+ then
+ echo "no plugins dir provided; skipping copy to plugins dir."
+ else
+ echo "copying ${LOCAL_PLUGIN_DIR} to ${AIRFLOW_HOME}."
+ cp -r "$LOCAL_PLUGIN_DIR" "$AIRFLOW_HOME/"
+ fi
+
+
+ if [ -z "$LOCAL_SQL_DIR" ];
+ then
+ echo "no sql dir provided; skipping copy to dags dir."
+ else
+ echo "setting up sql."
+ SQL_PREFIX=$AIRFLOW_HOME/dags/sql
+ mkdir -p "$SQL_PREFIX"
+ rsync -r -d "$LOCAL_SQL_DIR" "$SQL_PREFIX"
+ fi
+
+ echo "generating fernet key."
+ FERNET_KEY=$(python3 -c "from cryptography.fernet import Fernet; \
+ print(Fernet.generate_key().decode('utf-8'))")
+ export FERNET_KEY
+
+ echo "uploading connections."
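+  # AIRFLOW_CONN_LIST is expected to be provided by the caller's environment
+  # (a space-separated list of conn ids); empty means no custom connections.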
+ for conn_id in $AIRFLOW_CONN_LIST; do
+ set_local_conn "$conn_id"
+ done
+
+ # Import Airflow Variables to local Airflow.
+ if [ -z "$LOCAL_VARIABLES_JSON" ]
+ then
+    echo "no local variables JSON provided; skipping import."
+ else
+    echo "importing airflow variables."
+    airflow variables --import "$LOCAL_VARIABLES_JSON"
+    echo "imported airflow variables:"
+ airflow variables --export /tmp/AirflowVariables.json.exported
+ cat /tmp/AirflowVariables.json.exported
+ rm /tmp/AirflowVariables.json.exported
+ fi
+
+
+ echo "setting up DAGs."
+ rsync -r dags $AIRFLOW_HOME
+}
+
+# Upload custom connections to local Airflow.
+# $1 conn_id
+function set_local_conn() {
+  echo "uploading connection: $1."
+ #TODO remove assumption that custom connections are http.
+ airflow connections --add --conn_id "$1" --conn_type http || \
+ echo "Upload $1 to local Airflow failed"
+}
+
+# Run DAG validation tests.
+function run_tests() {
+ python3 -m unittest discover
+}
+
+function clean_up() {
+ echo "cleaning up AIRFLOW_HOME"
+ rm -rf $AIRFLOW_HOME
+ unset AIRFLOW_HOME
+}
+
+# Might be necessary if we choose another image.
+function install_airflow() {
+ python3 -m venv airflow-env
+ # shellcheck disable=SC1091
+ source airflow-env/bin/activate
+ pip3 install -r requirements-dev.txt
+}
+
+# $1 relative path to directory containing bigquery sql.
+# $2 relative path to JSON file containing Airflow Variables.
+# $3 relative path to plugins directory.
+main() {
+ setup_local_airflow "$1" "$2" "$3"
+  # set -e is active; guard the test run so a failure still reaches clean_up.
+  run_tests && TEST_STATUS=0 || TEST_STATUS=$?
+ clean_up
+ exit $TEST_STATUS
+}
+
+main "$1" "$2" "$3"
diff --git a/composer/cloudbuild/go/.gitignore b/composer/cloudbuild/go/.gitignore
new file mode 100644
index 0000000..e541d15
--- /dev/null
+++ b/composer/cloudbuild/go/.gitignore
@@ -0,0 +1,3 @@
+pkg/
+bin/
+src/
diff --git a/composer/cloudbuild/go/dagsdeployer/Dockerfile b/composer/cloudbuild/go/dagsdeployer/Dockerfile
new file mode 100644
index 0000000..4dbde08
--- /dev/null
+++ b/composer/cloudbuild/go/dagsdeployer/Dockerfile
@@ -0,0 +1,10 @@
+FROM golang:buster AS builder
+COPY . /dagsdeployer
+WORKDIR /dagsdeployer/cmd/deploydags
+ENV GO111MODULE=on
+RUN CGO_ENABLED=0 GOOS=linux go build -o /app/deploydags .
+
+FROM google/cloud-sdk:latest
+COPY --from=builder /app/deploydags /app/deploydags
+WORKDIR /app
+ENTRYPOINT ["/app/deploydags"]
diff --git a/composer/cloudbuild/go/dagsdeployer/cmd/deploydags/main.go b/composer/cloudbuild/go/dagsdeployer/cmd/deploydags/main.go
new file mode 100644
index 0000000..1273aef
--- /dev/null
+++ b/composer/cloudbuild/go/dagsdeployer/cmd/deploydags/main.go
@@ -0,0 +1,63 @@
+// Copyright 2019 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package main
+
+import (
+ "flag"
+ "log"
+ "source.cloud.google.com/datapipelines-ci/composer/cloudbuild/go/dagsdeployer/internal/composerdeployer"
+)
+
+func main() {
+
+ var dagsFolder, dagList, projectID, composerRegion, composerEnvName, dagBucketPrefix string
+ var replace bool
+
+ flag.StringVar(&dagList, "dagList", "./config/running_dags.txt", "path to the list of dags that should be running after the deploy")
+ flag.StringVar(&dagsFolder, "dagsFolder", "./dags", "path to the dags folder in the repo.")
+ flag.StringVar(&projectID, "project", "", "gcp project id")
+	flag.StringVar(&composerRegion, "region", "", "GCP region where the Composer environment lives")
+ flag.StringVar(&composerEnvName, "composerEnv", "", "Composer environment name")
+ flag.StringVar(&dagBucketPrefix, "dagBucketPrefix", "", "Composer DAGs bucket prefix")
+	flag.BoolVar(&replace, "replace", false, "if a source dag mismatches the object of the same name in GCS, delete the old version and deploy over it")
+
+ flag.Parse()
+
+ flags := map[string]string{
+ "dagsFolder": dagsFolder,
+ "dagList": dagList,
+ "projectID": projectID,
+ "composerRegion": composerRegion,
+ "composerEnvName": composerEnvName,
+ "dagBucketPrefix": dagBucketPrefix,
+ }
+
+ // Check flags are not empty.
+ for k, v := range flags {
+ if v == "" {
+ log.Panicf("%v must not be empty.", k)
+ }
+ }
+
+ c := composerdeployer.ComposerEnv{
+ Name: composerEnvName,
+ Project: projectID,
+ Location: composerRegion,
+ DagBucketPrefix: dagBucketPrefix,
+ LocalDagsPrefix: dagsFolder}
+
+ dagsToStop, dagsToStart := c.GetStopAndStartDags(dagList, replace)
+ c.StopDags(dagsToStop, !replace)
+ c.StartDags(dagsFolder, dagsToStart)
+}
diff --git a/composer/cloudbuild/go/dagsdeployer/go.mod b/composer/cloudbuild/go/dagsdeployer/go.mod
new file mode 100644
index 0000000..c46a9f5
--- /dev/null
+++ b/composer/cloudbuild/go/dagsdeployer/go.mod
@@ -0,0 +1,8 @@
+module source.cloud.google.com/datapipelines-ci/composer/cloudbuild/go/dagsdeployer
+
+go 1.12
+
+require (
+ cloud.google.com/go/storage v1.1.2
+ github.com/bmatcuk/doublestar v1.2.3
+)
diff --git a/composer/cloudbuild/go/dagsdeployer/go.sum b/composer/cloudbuild/go/dagsdeployer/go.sum
new file mode 100644
index 0000000..e0948df
--- /dev/null
+++ b/composer/cloudbuild/go/dagsdeployer/go.sum
@@ -0,0 +1,168 @@
+cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
+cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
+cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU=
+cloud.google.com/go v0.44.1/go.mod h1:iSa0KzasP4Uvy3f1mN/7PiObzGgflwredwwASm/v6AU=
+cloud.google.com/go v0.44.2/go.mod h1:60680Gw3Yr4ikxnPRS/oxxkBccT6SA1yMk63TGekxKY=
+cloud.google.com/go v0.45.1/go.mod h1:RpBamKRgapWJb87xiFSdk4g1CME7QZg3uwTez+TSTjc=
+cloud.google.com/go v0.46.3 h1:AVXDdKsrtX33oR9fbCMu/+c1o8Ofjq6Ku/MInaLVg5Y=
+cloud.google.com/go v0.46.3/go.mod h1:a6bKKbmY7er1mI7TEI4lsAkts/mkhTSZK8w33B4RAg0=
+cloud.google.com/go/bigquery v1.0.1 h1:hL+ycaJpVE9M7nLoiXb/Pn10ENE2u+oddxbD8uu0ZVU=
+cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o=
+cloud.google.com/go/datastore v1.0.0 h1:Kt+gOPPp2LEPWp8CSfxhsM8ik9CcyE/gYu+0r+RnZvM=
+cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE=
+cloud.google.com/go/pubsub v1.0.1 h1:W9tAK3E57P75u0XLLR82LZyw8VpAnhmyTOxW9qzmyj8=
+cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2kNxGRt3I=
+cloud.google.com/go/storage v1.1.2 h1:q7KNypEb3CARnitCAqY63g+dZp9HDEgv/c6IPlPLMJI=
+cloud.google.com/go/storage v1.1.2/go.mod h1:/03MkR5FWjF0OpcKpdJ4RgWybEaYAr2boHXq5RDlxbw=
+dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU=
+github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ=
+github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
+github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
+github.com/bmatcuk/doublestar v1.2.3 h1:ChLVAfc51TZWXjnbB/3ZKMbk78j0vs0WhtgeDS+L/+I=
+github.com/bmatcuk/doublestar v1.2.3/go.mod h1:wiQtGV+rzVYxB7WIlirSN++5HPtPlXEo9MEoZQC/PmE=
+github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
+github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU=
+github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b h1:VKtxabqXZkF25pY9ekfRL6a582T4P37/31XEstQ5p58=
+github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
+github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
+github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
+github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFUx0Y=
+github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
+github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
+github.com/golang/protobuf v1.3.2 h1:6nsPYzhq5kReh6QImI3k5qWzO4PEbvbIW2cwSfR/6xs=
+github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
+github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
+github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
+github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
+github.com/google/go-cmp v0.3.0 h1:crn/baboCvb5fXaQ0IJ1SGTsTVrWpDsCWC8EGETZijY=
+github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
+github.com/google/martian v2.1.0+incompatible h1:/CP5g8u/VJHijgedC/Legn3BAbAaWPgecwXBIDzw5no=
+github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs=
+github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc=
+github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc=
+github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI=
+github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg=
+github.com/googleapis/gax-go/v2 v2.0.5 h1:sjZBwGj9Jlw33ImPtvFviGYvseOtDM7hkSKB7+Tv3SM=
+github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk=
+github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
+github.com/hashicorp/golang-lru v0.5.1 h1:0hERBMJE1eitiLkihrMvRVBYAkpHzc/J3QdDN+dAcgU=
+github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
+github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024 h1:rBMNdlhTLzJjJSDIjNEXX1Pz3Hmwmz91v+zycvx9PJc=
+github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU=
+github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
+github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
+github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
+github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
+github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
+go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU=
+go.opencensus.io v0.22.0 h1:C9hSCOW830chIVkdja34wa6Ky+IzWllkUinR+BtRZd4=
+go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
+golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
+golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
+golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
+golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8=
+golang.org/x/exp v0.0.0-20190829153037-c13cbed26979/go.mod h1:86+5VVa7VpoJ4kLfm080zCjGlMRFzhUhsZKEZO7MGek=
+golang.org/x/exp v0.0.0-20191014171548-69215a2ee97e h1:ewBcnrlKhy0GKnQ31tXkOC/G7/jHC4ogar1TiIfANC4=
+golang.org/x/exp v0.0.0-20191014171548-69215a2ee97e/go.mod h1:JXzH8nQsPlswgeRAPE3MuO9GYsAcnJvJ4vnMwN/5qkY=
+golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js=
+golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
+golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
+golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
+golang.org/x/lint v0.0.0-20190301231843-5614ed5bae6f/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
+golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
+golang.org/x/lint v0.0.0-20190409202823-959b441ac422/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
+golang.org/x/lint v0.0.0-20190909230951-414d861bb4ac/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
+golang.org/x/lint v0.0.0-20190930215403-16217165b5de h1:5hukYrvBGR8/eNkX5mdUezrA6JiaEZDtJb9Ei+1LlBs=
+golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
+golang.org/x/mobile v0.0.0-20190312151609-d3739f865fa6/go.mod h1:z+o9i4GpDbdi3rU15maQ/Ox0txvL9dWGYEHz965HBQE=
+golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o=
+golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc=
+golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY=
+golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
+golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
+golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
+golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
+golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
+golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
+golang.org/x/net v0.0.0-20190501004415-9ce7a6920f09/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
+golang.org/x/net v0.0.0-20190503192946-f4e77d36d62c/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
+golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks=
+golang.org/x/net v0.0.0-20190620200207-3b0461eec859 h1:R/3boaszxrf1GEUWTVDzSKVwLmSJpwZ1yqXm8j0v2QI=
+golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
+golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
+golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45 h1:SVwTIAaPC2U/AvvLNZ2a7OVsmBpC8L5BlwK1whH3hm0=
+golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
+golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20190423024810-112230192c58 h1:8gQV6CLnAEikrhgkHFbMAEhagSSnXWGV915qUMm9mrU=
+golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20190502145724-3ef323f4f1fd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20190507160741-ecd444e8653b/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20190606165138-5da285871e9c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0 h1:HyfiK1WMnHj5FXFXatD+Qs1A/xC2Run6RzeW1SyHxpc=
+golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
+golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
+golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
+golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY=
+golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
+golang.org/x/tools v0.0.0-20190312151545-0bb0c0a6e846/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
+golang.org/x/tools v0.0.0-20190312170243-e65039ee4138/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
+golang.org/x/tools v0.0.0-20190425150028-36563e24a262/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
+golang.org/x/tools v0.0.0-20190506145303-2d16b83fe98c/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
+golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc=
+golang.org/x/tools v0.0.0-20190621195816-6e04913cbbac/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc=
+golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc=
+golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
+golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
+golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
+golang.org/x/tools v0.0.0-20191022210528-83d82311fd1f h1:X4UYO3m0+b0v4ctMUiMVB/vdVP5v25QRYMtH88N+Ne8=
+golang.org/x/tools v0.0.0-20191022210528-83d82311fd1f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
+golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE=
+google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M=
+google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg=
+google.golang.org/api v0.9.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg=
+google.golang.org/api v0.11.0 h1:n/qM3q0/rV2F0pox7o0CvNhlPvZAo7pLbef122cbLJ0=
+google.golang.org/api v0.11.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI=
+google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
+google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
+google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
+google.golang.org/appengine v1.6.1 h1:QzqyMA1tlu6CgqCDUtU9V+ZKhLFT2dkJuANu5QaxI3I=
+google.golang.org/appengine v1.6.1/go.mod h1:i06prIuMbXzDqacNJfV5OdTW448YApPu5ww/cMBSeb0=
+google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc=
+google.golang.org/genproto v0.0.0-20190307195333-5fe7a883aa19/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE=
+google.golang.org/genproto v0.0.0-20190418145605-e7d98fc518a7/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE=
+google.golang.org/genproto v0.0.0-20190425155659-357c62f0e4bb/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE=
+google.golang.org/genproto v0.0.0-20190502173448-54afdca5d873/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE=
+google.golang.org/genproto v0.0.0-20190801165951-fa694d86fc64/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc=
+google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc=
+google.golang.org/genproto v0.0.0-20190911173649-1774047e7e51/go.mod h1:IbNlFCBrqXvoKpeg0TB2l7cyZUmoaFKYIwrEpbDKLA8=
+google.golang.org/genproto v0.0.0-20191009194640-548a555dbc03 h1:4HYDjxeNXAOTv3o1N2tjo8UUSlhQgAD52FVkwxnWgM8=
+google.golang.org/genproto v0.0.0-20191009194640-548a555dbc03/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc=
+google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=
+google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38=
+google.golang.org/grpc v1.21.1 h1:j6XxA85m/6txkUCHvzlV5f+HBNl/1r5cZ2A/3IEFOO8=
+google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM=
+gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI=
+honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
+honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
+honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
+honnef.co/go/tools v0.0.1-2019.2.3 h1:3JgtbtFHMiCmsznwGVTUWbgGov+pVqnlf1dEJTNAXeM=
+honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg=
+rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8=
diff --git a/composer/cloudbuild/go/dagsdeployer/internal/composerdeployer/composer_ops.go b/composer/cloudbuild/go/dagsdeployer/internal/composerdeployer/composer_ops.go
new file mode 100644
index 0000000..a2b817c
--- /dev/null
+++ b/composer/cloudbuild/go/dagsdeployer/internal/composerdeployer/composer_ops.go
@@ -0,0 +1,566 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package composerdeployer
+
+import (
+ "bufio"
+ "fmt"
+ "github.com/bmatcuk/doublestar"
+ "io/ioutil"
+ "log"
+ "math/rand"
+ "net/url"
+ "os"
+ "os/exec"
+ "path"
+ "path/filepath"
+ "regexp"
+ "source.cloud.google.com/datapipelines-ci/composer/cloudbuild/go/dagsdeployer/internal/gcshasher"
+ "strings"
+ "sync"
+ "time"
+)
+
+// ComposerEnv is a lightweight representation of a Cloud Composer environment.
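+// For example (values illustrative), a deployer might be configured as:
+//
+//	env := ComposerEnv{
+//		Name:            "my-env",
+//		Project:         "my-project",
+//		Location:        "us-central1",
+//		DagBucketPrefix: "gs://my-env-bucket/dags/",
+//		LocalDagsPrefix: "composer/dags",
+//	}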
+type ComposerEnv struct {
+ Name string
+ Project string
+ Location string
+ DagBucketPrefix string
+ LocalDagsPrefix string
+}
+
+func logDagList(a map[string]bool) {
+ for k := range a {
+ log.Printf("\t%s", k)
+ }
+}
+
+// DagList is a set of dags (for quick membership check)
+type DagList map[string]bool
+
+// ReadRunningDagsTxt reads a newline separated list of dags from a text file
+func ReadRunningDagsTxt(filename string) (map[string]bool, error) {
+ file, err := os.Open(filename)
+ if err != nil {
+ return nil, err
+ }
+ defer file.Close()
+
+ dagsToRun := make(map[string]bool)
+ sc := bufio.NewScanner(file)
+
+ for sc.Scan() {
+ dagsToRun[sc.Text()] = true
+ }
+ log.Printf("Read dagsToRun from %s:", filename)
+ logDagList(dagsToRun)
+	return dagsToRun, sc.Err()
+}
+
+// DagListIntersect finds the common keys in two map[string]bool representing a
+// list of airflow DAG IDs.
+func DagListIntersect(a map[string]bool, b map[string]bool) map[string]bool {
+	var short, long map[string]bool
+	in := make(map[string]bool)
+
+ if len(a) < len(b) {
+ short, long = a, b
+ } else {
+ short, long = b, a
+ }
+ for k := range short {
+ if long[k] {
+ in[k] = true
+ }
+ }
+ return in
+}
+
+// DagListDiff finds the keys in the first map[string]bool that do not appear in
+// the second.
+func DagListDiff(a map[string]bool, b map[string]bool) map[string]bool {
+ diff := make(map[string]bool)
+ for k := range a {
+ if !b[k] {
+ diff[k] = true
+ }
+ }
+ return diff
+}
+
+// shell out to call gsutil
+func gsutil(args ...string) ([]byte, error) {
+ c := exec.Command("gsutil", args...)
+ return c.CombinedOutput()
+}
+
+func (c *ComposerEnv) assembleComposerRunCmd(subCmd string, args ...string) []string {
+ subCmdArgs := []string{
+ "composer", "environments", "run",
+ c.Name,
+ fmt.Sprintf("--location=%s", c.Location),
+ subCmd}
+
+ if len(args) > 0 {
+ subCmdArgs = append(subCmdArgs, "--")
+ subCmdArgs = append(subCmdArgs, args...)
+ }
+ return subCmdArgs
+}
+
+// Run runs an airflow CLI subcommand in the environment; it is a wrapper
+// around `gcloud composer environments run`.
+func (c *ComposerEnv) Run(subCmd string, args ...string) ([]byte, error) {
+ subCmdArgs := c.assembleComposerRunCmd(subCmd, args...)
+ log.Printf("running gcloud with subCmd Args: %s", subCmdArgs)
+ cmd := exec.Command(
+ "gcloud", subCmdArgs...)
+ return cmd.CombinedOutput()
+}
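+
+// For example, c.Run("pause", "my_dag") (dag id illustrative) shells out to:
+//
+//	gcloud composer environments run <name> --location=<location> pause -- my_dag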
+
+func parseListDagsOutput(out []byte) map[string]bool {
+ runningDags := make(map[string]bool)
+ outArr := strings.Split(string(out[:]), "\n")
+
+ // Find the DAGs in output
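+	// `list_dags` output looks roughly like this (illustrative):
+	//
+	//	<gcloud / airflow log noise>
+	//	-------------------------------------------------------------------
+	//	DAGS
+	//	-------------------------------------------------------------------
+	//	airflow_monitoring
+	//	my_dag
+	//
+	// so we scan past the second separator line before collecting DAG ids.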
+ dagSep := "-------------------------------------------------------------------"
+ var dagsIdx, nSep int
+
+ for nSep < 2 {
+ if outArr[dagsIdx] == dagSep {
+ nSep++
+ }
+ dagsIdx++
+		if nSep < 2 && dagsIdx >= len(outArr) {
+ log.Fatalf("list_dags output did not contain expected separators: %s", out)
+ }
+ }
+
+	// Ignore empty lines and the airflow_monitoring dag.
+ for _, dag := range outArr[dagsIdx:] {
+ if dag != "" && dag != "airflow_monitoring" {
+ runningDags[dag] = true
+ }
+ }
+
+ return runningDags
+}
+
+// GetRunningDags lists dags currently running in Composer Environment.
+func (c *ComposerEnv) GetRunningDags() (map[string]bool, error) {
+	out, err := c.Run("list_dags")
+	if err != nil {
+		log.Fatalf("list_dags failed: %s with %s", err, out)
+	}
+
+	runningDags := parseListDagsOutput(out)
+ log.Printf("running DAGs:")
+ logDagList(runningDags)
+ return runningDags, err
+}
+
+func readCommentScrubbedLines(path string) ([]string, error) {
+ log.Printf("scrubbing comments in %v", path)
+ commentPattern, err := regexp.Compile(`#.+`)
+ if err != nil {
+ return nil, fmt.Errorf("error compiling regex: %v", err)
+ }
+ file, err := os.Open(path)
+ if err != nil {
+ return nil, fmt.Errorf("couldn't open file %v: %v", path, err)
+ }
+ defer file.Close()
+
+ lines := make([]string, 0, 1)
+ scanner := bufio.NewScanner(file)
+ for scanner.Scan() {
+ candidate := commentPattern.ReplaceAllString(scanner.Text(), "")
+ if len(candidate) > 0 {
+ lines = append(lines, candidate)
+ }
+ }
+
+ return lines, scanner.Err()
+}
+
+// FindDagFilesInLocalTree searches dagsRoot for DAG definition files whose names appear in dagNames, respecting .airflowignore files.
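+// It returns a map from DAG id to the matching relative paths, e.g.
+// {"dag1": ["team1/usecase1/dag1.py"]} (path illustrative).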
+func FindDagFilesInLocalTree(dagsRoot string, dagNames map[string]bool) (map[string][]string, error) {
+
+ if len(dagNames) == 0 {
+ return make(map[string][]string), nil
+ }
+ log.Printf("searching for these DAGs in %v:", dagsRoot)
+ logDagList(dagNames)
+ matches := make(map[string][]string)
+	// Maps a directory to the ignore patterns in its .airflowignore (if any).
+	// This lets us identify the patterns relevant to a dir and its parents, grandparents, etc.
+ airflowignoreTree := make(map[string][]string)
+ _, err := ioutil.ReadDir(dagsRoot)
+ if err != nil {
+ return matches, fmt.Errorf("error reading dagRoot: %v. %v", dagsRoot, err)
+ }
+	err = filepath.Walk(dagsRoot, func(path string, info os.FileInfo, err error) error {
+		// Surface walk errors instead of dereferencing a possibly-nil FileInfo.
+		if err != nil || info == nil {
+			return err
+		}
+		dagID := strings.TrimSuffix(info.Name(), ".py")
+		relPath, err := filepath.Rel(dagsRoot, path)
+
+		// respect .airflowignore
+ if info.Name() == ".airflowignore" {
+ log.Printf("found %v, adding to airflowignoreTree", path)
+ patterns, err := readCommentScrubbedLines(path)
+ if err != nil {
+ return err
+ }
+ dir, err := filepath.Rel(dagsRoot, filepath.Dir(path))
+ if err != nil {
+ return fmt.Errorf("error making %v relative to dag root %v: %v", filepath.Dir(path), dagsRoot, err)
+ }
+ fullyQualifiedPatterns := make([]string, 0, len(patterns))
+ for _, p := range patterns {
+ fullyQualifiedPatterns = append(fullyQualifiedPatterns, filepath.Join(dir, p))
+ }
+ log.Printf("adding the following patterns to airflowignoreTree[%v]: %+v", dir, fullyQualifiedPatterns)
+ airflowignoreTree[filepath.Dir(path)] = fullyQualifiedPatterns
+ return nil
+ }
+
+ if !info.IsDir() && !dagNames[dagID] { // skip to next file if this is not relevant to dagNames
+ return nil
+ }
+
+ relevantIgnores := make([]string, 0)
+ p := path
+
+ if ignores, ok := airflowignoreTree[p]; ok {
+ relevantIgnores = append(relevantIgnores, ignores...)
+ }
+
+ // walk back to respect all parents' .airflowignore
+ for {
+ if p == filepath.Dir(dagsRoot) {
+ break
+ }
+ parent := filepath.Dir(p)
+ p = parent // for next iteration.
+ if patterns, ok := airflowignoreTree[parent]; ok { // parent has .airflowignore
+ relevantIgnores = append(relevantIgnores, patterns...)
+ }
+ }
+
+ thisMatch := make(map[string]bool)
+ if err != nil {
+ log.Printf("error making %v relative to %v, %v", path, dagsRoot, err)
+ return fmt.Errorf("error making %v relative to %v, %v", path, dagsRoot, err)
+ }
+
+ for _, ignore := range relevantIgnores {
+ absIgnore, err := filepath.Abs(filepath.Join(".", ignore))
+ if err != nil {
+ return err
+ }
+ absPath, err := filepath.Abs(filepath.Join(".", relPath))
+ if err != nil {
+ return err
+ }
+ var match bool
+ if strings.Contains(absIgnore, "**") {
+ match, err = doublestar.PathMatch(absIgnore, absPath)
+ if err != nil {
+ return err
+ }
+ }
+ if !match && !strings.Contains(ignore, "**") {
+ match, err = regexp.MatchString(ignore, relPath)
+ if err != nil {
+ log.Printf("ERROR: comparing %v %v: %v", relPath, ignore, err)
+ return err
+ }
+ }
+
+ // don't walk dirs we don't have to
+ if match && info.IsDir() {
+ log.Printf("ignoring dir: %v because matched %v", relPath, ignore)
+ return filepath.SkipDir
+ }
+
+ // remove matches if previously added but now matches this ignore pattern
+ if match && !info.IsDir() && dagNames[dagID] {
+ log.Printf("ignoring path: %v because matched %v", relPath, ignore)
+ if _, ok := matches[dagID]; ok {
+ matches[dagID] = make([]string, 0)
+ break // no other ignore patterns relevant if we now know this file should be ignored
+ }
+ return nil
+ }
+
+ // if we shouldn't ignore it and it is in dagNames then add it to matches if not already present
+ if !match && !info.IsDir() && dagNames[dagID] {
+ thisMatch[dagID] = true
+ }
+ }
+
+ if thisMatch[dagID] {
+ alreadyMatched := false
+ for _, p := range matches[dagID] {
+ if relPath == p {
+ alreadyMatched = true
+ break
+ }
+ }
+ if !alreadyMatched {
+ matches[dagID] = append(matches[dagID], relPath)
+ }
+ }
+
+ return nil
+	})
+	if err != nil {
+		return matches, err
+	}
+
+ errs := make([]error, 0)
+
+	// Each dag should match exactly one path in the tree; iterate over
+	// dagNames so dags that never matched at all are also reported.
+	for dag := range dagNames {
+		if len(matches[dag]) == 0 {
+			errs = append(errs, fmt.Errorf("did not find match for %v", dag))
+		} else if len(matches[dag]) > 1 {
+			errs = append(errs, fmt.Errorf("found multiple matches for %v: %v", dag, matches[dag]))
+		}
+	}
+
+ if len(errs) > 0 {
+ return matches, fmt.Errorf("Encountered errors matching files to dags: %+v", errs)
+ }
+ return matches, nil
+}
+
+// FindDagFilesInGcsPrefix finds the file paths of DAGs (e.g. ones already deleted from version control) by searching the GCS dags prefix.
+func FindDagFilesInGcsPrefix(prefix string, dagFileNames map[string]bool) (map[string][]string, error) {
+ dir, err := ioutil.TempDir("", "gcsDags_")
+ if err != nil {
+ return nil, fmt.Errorf("error creating temp dir to pull gcs dags: %v", err)
+ }
+ defer os.RemoveAll(dir) // clean up temp dir
+
+ // copy gcs dags dir to local temp dir
+ log.Printf("pulling down %v", prefix)
+ _, err = gsutil("-m", "cp", "-r", prefix, dir)
+ if err != nil {
+ return nil, fmt.Errorf("error fetching dags dir from GCS: %v", err)
+ }
+ return FindDagFilesInLocalTree(filepath.Join(dir, "dags"), dagFileNames)
+}
+
+func (c *ComposerEnv) getRestartDags(sameDags map[string]string) map[string]bool {
+ dagsToRestart := make(map[string]bool)
+ for dag, relPath := range sameDags {
+		// We know that the file name = dag id because the dag validation test asserts this.
+		local := filepath.Join(c.LocalDagsPrefix, relPath)
+		gcs, err := url.Parse(c.DagBucketPrefix)
+		if err != nil {
+			log.Printf("error parsing %s: %s, attempting to restart: %s", c.DagBucketPrefix, err, dag)
+			dagsToRestart[dag] = true
+			continue
+		}
+		gcs.Path = path.Join(gcs.Path, relPath)
+		eq, err := gcshasher.LocalFileEqGCS(local, gcs.String())
+ if err != nil {
+ log.Printf("error comparing file hashes %s, attempting to restart: %s", err, dag)
+ dagsToRestart[dag] = true
+ } else if !eq {
+ dagsToRestart[dag] = true
+ }
+ }
+ return dagsToRestart
+}
+
+// Dag pairs a DAG id with the path to its definition file.
+type Dag struct {
+ ID string
+ Path string
+}
+
+// GetStopAndStartDags uses set differences between dags running in the Composer
+// Environment and those in the running dags text config file.
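+// For example, if the environment runs {a, b} and the file lists {b, c},
+// then a is stopped, c is started, and b is restarted only if its local
+// definition's hash differs from the copy in GCS.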
+func (c *ComposerEnv) GetStopAndStartDags(filename string, replace bool) (map[string]string, map[string]string) {
+ dagsToRun, err := ReadRunningDagsTxt(filename)
+ if err != nil {
+ log.Fatalf("couldn't read running_dags.txt: %v", filename)
+ }
+ runningDags, err := c.GetRunningDags()
+ if err != nil {
+ log.Fatalf("couldn't list dags in composer environment: %v", err)
+ }
+ dagsToStop := DagListDiff(runningDags, dagsToRun)
+ dagsToStart := DagListDiff(dagsToRun, runningDags)
+ dagsSame := DagListIntersect(runningDags, dagsToRun)
+ log.Printf("DAGs same:")
+ logDagList(dagsSame)
+
+	dagPathListsSame, err := FindDagFilesInGcsPrefix(c.DagBucketPrefix, dagsSame)
+	if err != nil {
+		log.Fatalf("error finding unchanged dags in GCS: %v", err)
+ }
+ // unnest out of slice
+ dagPathsSame := make(map[string]string)
+ for k, v := range dagPathListsSame {
+ dagPathsSame[k] = v[0]
+ }
+ restartDags := c.getRestartDags(dagPathsSame)
+
+ if replace {
+ for k, v := range restartDags {
+ dagsToStop[k], dagsToStart[k] = v, v
+ }
+	} else if len(restartDags) > 0 {
+		log.Fatalf("FAILED: tried to overwrite DAGs in place but replace=false for the following existing dags: %#v", restartDags)
+ }
+
+ log.Printf("DAGs to Stop:")
+ logDagList(dagsToStop)
+ log.Printf("DAGs to Start:")
+ logDagList(dagsToStart)
+
+ dagPathListsToStop, err := FindDagFilesInGcsPrefix(c.DagBucketPrefix, dagsToStop)
+ if err != nil {
+ log.Fatalf("error finding dags to stop: %v", err)
+ }
+ dagPathsToStop := make(map[string]string)
+ for k, v := range dagPathListsToStop {
+ dagPathsToStop[k] = v[0]
+ }
+ dagPathListsToStart, err := FindDagFilesInLocalTree(c.LocalDagsPrefix, dagsToStart)
+ if err != nil {
+ log.Fatalf("error finding dags to start: %v", err)
+ }
+
+ dagPathsToStart := make(map[string]string)
+ for k, v := range dagPathListsToStart {
+ dagPathsToStart[k] = v[0]
+ }
+ return dagPathsToStop, dagPathsToStart
+}
+
+// ComposerEnv.stopDag pauses the dag, removes the dag definition file from gcs
+// and deletes the DAG from the airflow db.
+func (c *ComposerEnv) stopDag(dag string, relPath string, pauseOnly bool, wg *sync.WaitGroup) (err error) {
+ defer wg.Done()
+ log.Printf("pausing dag: %v with relPath: %v", dag, relPath)
+ out, err := c.Run("pause", dag)
+ if err != nil {
+ return fmt.Errorf("error pausing dag %v: %v", dag, string(out))
+ }
+ if !pauseOnly {
+ log.Printf("parsing gcs url %v", c.DagBucketPrefix)
+		gcs, err := url.Parse(c.DagBucketPrefix)
+		if err != nil {
+			return fmt.Errorf("error parsing dag bucket prefix %v: %v", c.DagBucketPrefix, err)
+		}
+
+		gcs.Path = path.Join(gcs.Path, relPath)
+		log.Printf("deleting %v", gcs.String())
+		out, err = gsutil("rm", gcs.String())
+		if err != nil {
+			return fmt.Errorf("error deleting %v from gcs: %v", gcs.String(), string(out))
+		}
+
+		// Retry delete_dag a few times: the scheduler may still be reparsing
+		// the file we just deleted from GCS.
+		out, err = c.Run("delete_dag", dag)
+		for i := 0; i < 5; i++ {
+			if err == nil {
+				break
+			}
+			log.Printf("Waiting 5s to retry")
+			dur, _ := time.ParseDuration("5s")
+			time.Sleep(dur)
+			log.Printf("Retrying delete %s", dag)
+			out, err = c.Run("delete_dag", dag)
+		}
+		if err != nil {
+			return fmt.Errorf("Retried 5x, delete_dag still failing with: %v", string(out))
+		}
+ }
+ return err
+}
+
+// StopDags deletes a list of dags in parallel go routines
+func (c *ComposerEnv) StopDags(dagsToStop map[string]string, pauseOnly bool) error {
+ var stopWg sync.WaitGroup
+ for k, v := range dagsToStop {
+ stopWg.Add(1)
+ go c.stopDag(k, v, pauseOnly, &stopWg)
+ }
+ stopWg.Wait()
+ return nil
+}
+
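+// jitter randomizes d by up to ±10% so parallel retries don't synchronize,
+// e.g. jitter(time.Minute) yields a duration in [54s, 66s].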
+func jitter(d time.Duration) time.Duration {
+ const pct = 0.10 //Jitter up to 10% of the supplied duration.
+ jit := 1 + pct*(rand.Float64()*2-1)
+ return time.Duration(jit * float64(d))
+}
+
+// ComposerEnv.waitForDeploy polls a Composer environment trying to unpause
+// dags. This should be called after copying a dag file to gcs when
+// dag_paused_on_creation=True.
+func (c *ComposerEnv) waitForDeploy(dag string) error {
+ _, err := c.Run("unpause", dag)
+ for i := 0; i < 5; i++ {
+ if err == nil {
+ break
+ }
+ log.Printf("Waiting 60s to retry")
+ time.Sleep(jitter(time.Minute))
+ log.Printf("Retrying unpause %s", dag)
+ _, err = c.Run("unpause", dag)
+ }
+ if err != nil {
+ err = fmt.Errorf("Retried 5x, unpause still failing with: %s", err)
+ }
+ return err
+}
+
+// ComposerEnv.startDag copies a DAG definition file to GCS and waits until you can
+// successfully unpause.
+func (c *ComposerEnv) startDag(dagsFolder string, dag string, relPath string, wg *sync.WaitGroup) error {
+ defer wg.Done()
+ loc := filepath.Join(dagsFolder, relPath)
+ gcs, err := url.Parse(c.DagBucketPrefix)
+ if err != nil {
+ return fmt.Errorf("error parsing dags prefix %v", err)
+ }
+ gcs.Path = path.Join(gcs.Path, relPath)
+ _, err = gsutil("cp", loc, gcs.String())
+ if err != nil {
+ return fmt.Errorf("error copying file %v to gcs: %v", loc, err)
+ }
+	return c.waitForDeploy(dag)
+}
+
+// StartDags deploys a list of dags in parallel go routines
+func (c *ComposerEnv) StartDags(dagsFolder string, dagsToStart map[string]string) error {
+ c.Run("unpause", "airflow_monitoring")
+ var startWg sync.WaitGroup
+ for k, v := range dagsToStart {
+ startWg.Add(1)
+ go c.startDag(dagsFolder, k, v, &startWg)
+ }
+ startWg.Wait()
+ return nil
+}
diff --git a/composer/cloudbuild/go/dagsdeployer/internal/composerdeployer/composer_ops_test.go b/composer/cloudbuild/go/dagsdeployer/internal/composerdeployer/composer_ops_test.go
new file mode 100644
index 0000000..78b90fe
--- /dev/null
+++ b/composer/cloudbuild/go/dagsdeployer/internal/composerdeployer/composer_ops_test.go
@@ -0,0 +1,206 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package composerdeployer
+
+import (
+ "fmt"
+ "io/ioutil"
+ "os"
+ "path/filepath"
+ "reflect"
+ "testing"
+)
+
+// test dag lists
+var ab = map[string]bool{"a": true, "b": true}
+
+var ac = map[string]bool{"a": true, "c": true}
+
+var a = map[string]bool{"a": true}
+
+var d = map[string]bool{"d": true}
+
+func TestDagListIntersect(t *testing.T) {
+
+ testTable := []struct {
+ a map[string]bool
+ b map[string]bool
+ out map[string]bool
+ }{
+ {ab, ac, map[string]bool{"a": true}},
+ {ac, ab, map[string]bool{"a": true}}, // commutative
+ {ab, ab, ab}, // identity
+ {ab, a, a},
+ {a, ab, a},
+ {ab, d, make(map[string]bool)}}
+
+ for _, tt := range testTable {
+ t.Run(fmt.Sprintf("%+v", tt), func(t *testing.T) {
+ if got := DagListIntersect(tt.a, tt.b); !reflect.DeepEqual(got, tt.out) {
+ t.Errorf("DagListIntersect(%+v, %+v) = %+v, want %+v", tt.a, tt.b, got, tt.out)
+ }
+ })
+ }
+}
+
+func TestDagListDiff(t *testing.T) {
+
+ testTable := []struct {
+ a map[string]bool
+ b map[string]bool
+ out map[string]bool
+ }{
+ {ab, ac, map[string]bool{"b": true}},
+		{ac, ab, map[string]bool{"c": true}}, // not commutative: order matters
+ {ab, ab, make(map[string]bool)},
+ {ab, a, map[string]bool{"b": true}},
+ {a, ab, make(map[string]bool)},
+ {ab, d, ab},
+ {d, ab, d}}
+
+ for _, tt := range testTable {
+ t.Run(fmt.Sprintf("%+v, %+v", tt.a, tt.b), func(t *testing.T) {
+ if got := DagListDiff(tt.a, tt.b); !reflect.DeepEqual(got, tt.out) {
+ t.Errorf("DagListDiff(%+v, %+v) = %+v, want %+v", tt.a, tt.b, got, tt.out)
+ }
+ })
+ }
+
+}
+
+func TestAssembleComposerRunCmd(t *testing.T) {
+ c := ComposerEnv{
+ Name: "composerenv",
+ Location: "us-central1",
+ DagBucketPrefix: "gs://composerenv-bucket/dags/",
+ }
+ // Test single command.
+ want := []string{
+ "composer", "environments", "run",
+ "composerenv",
+ "--location=us-central1",
+ "list_dags"}
+
+ got := c.assembleComposerRunCmd("list_dags")
+ if !reflect.DeepEqual(got, want) {
+		t.Errorf("ComposerEnv.assembleComposerRunCmd(\"list_dags\") = %+v, want %+v", got, want)
+ }
+
+ // Test command w/ arguments
+ want = []string{
+ "composer", "environments", "run",
+ "composerenv",
+ "--location=us-central1",
+ "variables", "--", "import", "AirflowVariables.json"}
+
+ got = c.assembleComposerRunCmd("variables", "import", "AirflowVariables.json")
+ if !reflect.DeepEqual(got, want) {
+		t.Errorf("ComposerEnv.assembleComposerRunCmd(\"variables\", \"import\", \"AirflowVariables.json\") = %+v, want %+v", got, want)
+ }
+}
+
+func populateAirflowIgnore(path string, ignores []string) error {
+ f, err := os.OpenFile(filepath.Join(path, ".airflowignore"), os.O_RDWR|os.O_CREATE, 0755)
+ if err != nil {
+ return err
+ }
+ defer f.Close()
+	for _, ignore := range ignores {
+		if _, err := f.WriteString(ignore + "\n"); err != nil {
+			return fmt.Errorf("couldn't write %v to %v: %v", ignore, f.Name(), err)
+		}
+	}
+ return nil
+}
+
+func prepareTestTempDirTree() (string, error) {
+ tmpDir, err := ioutil.TempDir("", "")
+ if err != nil {
+ return "", fmt.Errorf("error creating temp dir: %v", err)
+ }
+
+ // create dir tree
+ for _, p := range []string{
+ "team1/usecase1/sql",
+ "team1/helpers/utils",
+ "team2/usecase1/",
+ "team2/usecase2/",
+ "team2/helpers/"} {
+ err = os.MkdirAll(filepath.Join(tmpDir, p), 0755)
+ if err != nil {
+ return tmpDir, err
+ }
+ }
+
+ // add some files
+ for _, name := range []string{
+ ".airflowignore",
+ "team1/.airflowignore",
+ "team1/usecase1/sql/foo.sql",
+ "team1/usecase1/sql/dag1.py", // sometimes people define sql in python files as constants (should be ignored in dag finding)
+ "team1/helpers/create_dag.py", // some dag generation helper utility (should be ignored in dag finding)
+ "team1/usecase1/dag1.py",
+ "team1/usecase1/dag2.py",
+ "team2/usecase1/create_dag.py", // conflicts with utility file in team1/helpers, but should be ok as that was ignored.
+		"team2/usecase2/dag2.py", // uh-oh a real dag name conflict! (we verify that this fails in the second test)
+ "team2/helpers/helper_dag.py", // this should not be ignored as team2 follows a different convention.
+ } {
+		f, err := os.Create(filepath.Join(tmpDir, name))
+		if err != nil {
+			return tmpDir, err
+		}
+		f.Close()
+ }
+
+ // add some contents to .airflowignore files
+	// Ignore sql/ dirs throughout the tree and the helpers/ dir under team1.
+	if err := populateAirflowIgnore(tmpDir, []string{"./**/sql/"}); err != nil {
+		return tmpDir, err
+	}
+	if err := populateAirflowIgnore(filepath.Join(tmpDir, "team1"), []string{"helpers/"}); err != nil {
+		return tmpDir, err
+	}
+ return tmpDir, nil
+}
+
+func TestFindDagFilesInLocalTree(t *testing.T) {
+ tmpDir, err := prepareTestTempDirTree()
+ defer os.RemoveAll(tmpDir)
+ if err != nil {
+		t.Fatalf("couldn't initialize test dir tree: %v", err)
+ }
+
+ // look for the dags we know not to have name conflicts.
+ got, err := FindDagFilesInLocalTree(tmpDir, map[string]bool{
+ "helper_dag": true,
+ "create_dag": true,
+ "dag1": true,
+	})
+	if err != nil {
+		t.Fatalf("FindDagFilesInLocalTree: %v", err)
+	}
+
+ want := map[string][]string{
+ "helper_dag": []string{"team2/helpers/helper_dag.py"},
+ "create_dag": []string{"team2/usecase1/create_dag.py"},
+ "dag1": []string{"team1/usecase1/dag1.py"},
+ }
+
+ if !reflect.DeepEqual(got, want) {
+ t.Errorf("got: %+v,\n want %+v", got, want)
+ }
+
+ // test w/ name conflict
+ _, err = FindDagFilesInLocalTree(tmpDir, map[string]bool{
+ "dag2": true,
+ })
+
+ if err == nil {
+ t.Errorf("should error on duplicate dag names but didn't")
+ }
+}
diff --git a/composer/cloudbuild/go/dagsdeployer/internal/gcshasher/gcs_hash.go b/composer/cloudbuild/go/dagsdeployer/internal/gcshasher/gcs_hash.go
new file mode 100644
index 0000000..28d669b
--- /dev/null
+++ b/composer/cloudbuild/go/dagsdeployer/internal/gcshasher/gcs_hash.go
@@ -0,0 +1,94 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gcshasher
+
+import (
+ "bytes"
+ "cloud.google.com/go/storage"
+ "context"
+ "crypto/md5"
+ "fmt"
+ "io"
+	"net/url"
+	"os"
+	"strings"
+)
+
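+// parseGcsPath splits a GCS URI into bucket and object path, e.g.
+// "gs://my-bucket/dags/dag1.py" -> ("my-bucket", "dags/dag1.py").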
+func parseGcsPath(gcsPath string) (bucket string, path string, err error) {
+ uri, err := url.Parse(gcsPath)
+ bucket = ""
+ path = ""
+ if err != nil {
+ err = fmt.Errorf("couldn't parse GCS URI %+v", gcsPath)
+ return
+ }
+ if uri.Scheme != "gs" {
+ err = fmt.Errorf("couldn't parse GCS URI: %+v scheme should be 'gs'", gcsPath)
+ return
+ }
+ bucket = uri.Host
+	path = strings.TrimPrefix(uri.Path, "/") // avoid panicking on URIs with no object path
+ return
+}
+
+func gcsMD5(gcsPath string) ([]byte, error) {
+ bktName, path, err := parseGcsPath(gcsPath)
+ if err != nil {
+		return nil, err
+ }
+
+ ctx := context.Background()
+ client, err := storage.NewClient(ctx)
+ if err != nil {
+ return nil, fmt.Errorf("Couldn't authenticate GCS client: %s", err)
+ }
+
+ attrs, err := client.Bucket(bktName).Object(path).Attrs(ctx)
+ if err != nil {
+ return nil, fmt.Errorf("Couldn't read file hash for %s: %s", path, err)
+ }
+
+ hash := attrs.MD5
+ return hash, nil
+}
+
+func localMD5(path string) ([]byte, error) {
+ f, err := os.Open(path)
+ if err != nil {
+ return nil, err
+ }
+ defer f.Close()
+
+ h := md5.New()
+ if _, err := io.Copy(h, f); err != nil {
+ return nil, err
+ }
+ return h.Sum(nil), nil
+}
+
+// LocalFileEqGCS checks equality of a local file and a GCS object using their MD5 hashes.
+func LocalFileEqGCS(localPath, gcsPath string) (bool, error) {
+ loc, err := localMD5(localPath)
+ if err != nil {
+		err = fmt.Errorf("error hashing local file: %s", err)
+ return false, err
+ }
+ gcs, err := gcsMD5(gcsPath)
+ if err != nil {
+		err = fmt.Errorf("error hashing GCS file: %s", err)
+ return false, err
+ }
+
+	return bytes.Equal(loc, gcs), nil
+}
diff --git a/composer/cloudbuild/go/dagsdeployer/internal/gcshasher/gcs_hash_test.go b/composer/cloudbuild/go/dagsdeployer/internal/gcshasher/gcs_hash_test.go
new file mode 100644
index 0000000..6d90b83
--- /dev/null
+++ b/composer/cloudbuild/go/dagsdeployer/internal/gcshasher/gcs_hash_test.go
@@ -0,0 +1,74 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gcshasher
+
+import (
+ "cloud.google.com/go/storage"
+ "context"
+ "flag"
+ "io"
+ "os"
+ "path/filepath"
+ "testing"
+)
+
+var testBkt = flag.String("bkt", "", "The bucket to use for testing the hash comparison")
+
+func TestLocalMD5(t *testing.T) {
+ locPath := filepath.Join("testdata", "test.txt")
+ _, err := localMD5(locPath)
+ if err != nil {
+ t.Errorf("error hashing local file: %s", err)
+ }
+}
+
+func TestLocalFileEqGCS(t *testing.T) {
+ if *testBkt == "" {
+ t.Skip("skipping hash comparison integration test because no test bucket passed")
+ }
+
+ locPath := filepath.Join("testdata", "test.txt")
+ ctx := context.Background()
+ client, err := storage.NewClient(ctx)
+ if err != nil {
+		t.Fatalf("Couldn't authenticate GCS client: %s", err)
+ }
+
+	f, err := os.Open(locPath)
+	if err != nil {
+		t.Fatalf("couldn't open %s: %s", locPath, err)
+	}
+	defer f.Close()
+
+	obj := client.Bucket(*testBkt).Object("testdata/test.txt")
+	w := obj.NewWriter(ctx)
+	if _, err := io.Copy(w, f); err != nil {
+		t.Fatalf("couldn't copy %s to GCS: %s", locPath, err)
+	}
+ if err := w.Close(); err != nil {
+ t.Errorf("couldn't write test object %s ", err)
+ }
+
+	eq, err := LocalFileEqGCS(locPath, "gs://"+*testBkt+"/testdata/test.txt")
+	if err != nil {
+		t.Errorf("error comparing local test.txt to gcs: %s", err)
+	}
+	if !eq {
+		t.Errorf("hashes were not equal for local test.txt vs gcs test.txt")
+	}
+
+	diffLocPath := filepath.Join("testdata", "test_diff.txt")
+	eq, err = LocalFileEqGCS(diffLocPath, "gs://"+*testBkt+"/testdata/test.txt")
+	if err != nil {
+		t.Errorf("error comparing local test_diff.txt to gcs: %s", err)
+	}
+	if eq {
+		t.Errorf("hashes were equal for local test_diff.txt vs gcs test.txt")
+	}
+ if err := obj.Delete(ctx); err != nil {
+ t.Logf("couldn't clean up test object: %s", err)
+ }
+}
diff --git a/composer/cloudbuild/go/dagsdeployer/internal/gcshasher/testdata/test.txt b/composer/cloudbuild/go/dagsdeployer/internal/gcshasher/testdata/test.txt
new file mode 100644
index 0000000..ce27bd9
--- /dev/null
+++ b/composer/cloudbuild/go/dagsdeployer/internal/gcshasher/testdata/test.txt
@@ -0,0 +1 @@
+This is a test file for hashing.
diff --git a/composer/cloudbuild/go/dagsdeployer/internal/gcshasher/testdata/test_diff.txt b/composer/cloudbuild/go/dagsdeployer/internal/gcshasher/testdata/test_diff.txt
new file mode 100644
index 0000000..17e67a8
--- /dev/null
+++ b/composer/cloudbuild/go/dagsdeployer/internal/gcshasher/testdata/test_diff.txt
@@ -0,0 +1 @@
+This is a different test file for hashing.
diff --git a/composer/config/AirflowVariables.json b/composer/config/AirflowVariables.json
new file mode 100644
index 0000000..ae4985f
--- /dev/null
+++ b/composer/config/AirflowVariables.json
@@ -0,0 +1,14 @@
+{
+ "gcp_project": "${GCP_PROJECT_ID}",
+ "gcp_region": "${COMPOSER_REGION}",
+ "dataflow_jar_location": "${DATAFLOW_JAR_BUCKET}",
+ "dataflow_jar_file": "to_be_overriden",
+ "gcs_input_bucket": "${INPUT_BUCKET}",
+ "gcs_ref_bucket": "${REF_BUCKET}",
+ "gcs_output_bucket": "${RESULT_BUCKET}",
+ "dataflow_staging_bucket": "${DATAFLOW_STAGING_BUCKET}",
+  "dataproc_bucket": "${DATAFLOW_STAGING_BUCKET}",
+ "gce_zone": "${COMPOSER_REGION}-a",
+ "gcs_bucket": "spark_bucket",
+ "bq_output_table": "${GCP_PROJECT_ID}.nyc_taxi.avg_speed"
+}
diff --git a/composer/config/running_dags.txt b/composer/config/running_dags.txt
new file mode 100644
index 0000000..e69de29
diff --git a/composer/dags/.airflowignore b/composer/dags/.airflowignore
new file mode 100644
index 0000000..46a265c
--- /dev/null
+++ b/composer/dags/.airflowignore
@@ -0,0 +1,2 @@
+sql/
+support-files/
diff --git a/composer/dags/ephemeral_dataproc_spark_dag.py b/composer/dags/ephemeral_dataproc_spark_dag.py
new file mode 100644
index 0000000..9f328bb
--- /dev/null
+++ b/composer/dags/ephemeral_dataproc_spark_dag.py
@@ -0,0 +1,160 @@
+# Copyright 2018 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from datetime import datetime, timedelta
+
+from airflow import DAG
+from airflow.contrib.operators.dataproc_operator import (
+ DataprocClusterCreateOperator,
+ DataProcPySparkOperator,
+ DataprocClusterDeleteOperator)
+from airflow.contrib.operators.gcs_to_bq import (
+ GoogleCloudStorageToBigQueryOperator)
+from airflow.operators.bash_operator import BashOperator
+from airflow.models import Variable
+from airflow.utils.trigger_rule import TriggerRule
+
+##################################################################
+# This file defines the DAG for the logic pictured below. #
+##################################################################
+# #
+# create_cluster #
+# | #
+# V #
+# submit_pyspark....... #
+# | . #
+# / \ V #
+# / \ move_failed_files #
+# / \ ^ #
+# | | . #
+# V V . #
+# delete_cluster bq_load..... #
+# | #
+# V #
+# delete_transformed_files #
+# #
+# (Note: Dotted lines indicate conditional trigger rule on #
+# failure of the up stream tasks. In this case the files in the #
+# raw-{timestamp}/ GCS path will be moved to a failed-{timestamp}#
+# path.) #
+##################################################################
+
+# Airflow parameters, see https://airflow.incubator.apache.org/code.html
+DEFAULT_DAG_ARGS = {
+ 'owner': 'jferriero@google.com', # The owner of the task.
+ # Task instance should not rely on the previous task's schedule to succeed.
+ 'depends_on_past': False,
+ # We use this in combination with schedule_interval=None to only trigger the
+ # DAG with a POST to the REST API.
+ # Alternatively, we could set this to yesterday and the dag will be
+ # triggered upon upload to the dag folder.
+ 'start_date': datetime(2020, 1, 1),
+ 'email_on_failure': False,
+ 'email_on_retry': False,
+ 'retries': 1, # Retry once before failing the task.
+ 'retry_delay': timedelta(minutes=5), # Time between retries.
+ 'project_id': Variable.get('gcp_project'), # Cloud Composer project ID.
+ # We only want the DAG to run when we POST to the api.
+ # Alternatively, this could be set to '@daily' to run the job once a day.
+ # more options at https://airflow.apache.org/scheduler.html#dag-runs
+}
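+
+# This DAG is triggered via a POST to the REST API whose conf payload names
+# the GCS paths to process; the keys below come from the templated fields in
+# this file, the values are illustrative:
+#   {"raw_path": "raw-20200101", "transformed_path": "transformed-20200101",
+#    "failed_path": "failed-20200101"}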
+
+# Create Directed Acyclic Graph for Airflow
+with DAG('ephemeral_dataproc_spark_dag', default_args=DEFAULT_DAG_ARGS,
+ schedule_interval=None) as dag: # Here we are using dag as context.
+ # Create the Cloud Dataproc cluster.
+ # Note: this operator will be flagged a success if the cluster by this name
+ # already exists.
+ create_cluster = DataprocClusterCreateOperator(
+ task_id='create_dataproc_cluster',
+ # ds_nodash is an airflow macro for "[Execution] Date string no dashes"
+ # in YYYYMMDD format.
+ # See docs https://airflow.apache.org/code.html?highlight=macros#macros
+ cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
+ image_version='1.5-debian10',
+ num_workers=2,
+ storage_bucket=Variable.get('dataproc_bucket'),
+ zone=Variable.get('gce_zone'))
+
+ # Submit the PySpark job.
+ submit_pyspark = DataProcPySparkOperator(
+ task_id='run_dataproc_pyspark',
+ main='gs://' + Variable.get('gcs_bucket') +
+ '/spark-jobs/spark_avg_speed.py',
+ # Obviously needs to match the name of cluster created in the prior
+ # Operator.
+ cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
+ # Let's template our arguments for the pyspark job from the POST
+ # payload.
+ arguments=[
+ "--gcs_path_raw={{ dag_run.conf['raw_path'] }}",
+ "--gcs_path_transformed=gs://{{ var.value.gcs_bucket}}" +
+ "/{{ dag_run.conf['transformed_path'] }}"
+ ])
+
+ # Load the transformed files to a BigQuery table.
+ bq_load = GoogleCloudStorageToBigQueryOperator(
+ task_id='GCS_to_BigQuery',
+ bucket='{{ var.value.gcs_bucket }}',
+ # Wildcard for objects created by spark job to be written to BigQuery
+ # Reads the relative path to the objects transformed by the spark job
+ # from the POST message.
+ source_objects=["{{ dag_run.conf['transformed_path'] }}/part-*"],
+ destination_project_dataset_table='{{ var.value.bq_output_table }}',
+ schema_fields=None,
+ # Relative gcs path to schema file.
+ schema_object='schemas/nyc-tlc-yellow.json',
+ # Note that our spark job does json -> csv conversion.
+ source_format='CSV',
+ create_disposition='CREATE_IF_NEEDED',
+ skip_leading_rows=0,
+ write_disposition='WRITE_TRUNCATE', # If the table exists, overwrite it
+ max_bad_records=0)
+
+ # Delete the Cloud Dataproc cluster.
+ delete_cluster = DataprocClusterDeleteOperator(
+ task_id='delete_dataproc_cluster',
+ # Obviously needs to match the name of cluster created in the prior two
+ # Operators.
+ cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
+ # This will tear down the cluster even if there are failures in upstream
+ # tasks.
+ trigger_rule=TriggerRule.ALL_DONE)
+
+ # Delete gcs files in the timestamped transformed folder.
+ delete_transformed_files = BashOperator(
+ task_id='delete_transformed_files',
+ bash_command="gsutil -m rm -r gs://{{ var.value.gcs_bucket }}" +
+ "/{{ dag_run.conf['transformed_path'] }}/")
+
+ # If the spark job or BQ Load fails we rename the timestamped raw path to
+ # a timestamped failed path.
+ move_failed_files = BashOperator(
+ task_id='move_failed_files',
+ bash_command="gsutil mv gs://{{ var.value.gcs_bucket }}" +
+ "/{{ dag_run.conf['raw_path'] }}/ " + "gs://{{ var.value.gcs_bucket}}" +
+ "/{{ dag_run.conf['failed_path'] }}/",
+ trigger_rule=TriggerRule.ONE_FAILED)
+    # Operators created inside the `with DAG(...)` block already belong to the
+    # DAG, so we only need to wire up the dependencies between tasks.
+
+ create_cluster.set_downstream(submit_pyspark)
+
+ submit_pyspark.set_downstream([delete_cluster, bq_load])
+
+ bq_load.set_downstream(delete_transformed_files)
+
+ move_failed_files.set_upstream([bq_load, submit_pyspark])
diff --git a/source-code/workflow-dag/support-files/input.txt b/composer/dags/support-files/input.txt
similarity index 100%
rename from source-code/workflow-dag/support-files/input.txt
rename to composer/dags/support-files/input.txt
diff --git a/source-code/workflow-dag/support-files/ref.txt b/composer/dags/support-files/ref.txt
similarity index 100%
rename from source-code/workflow-dag/support-files/ref.txt
rename to composer/dags/support-files/ref.txt
diff --git a/composer/dags/tutorial.py b/composer/dags/tutorial.py
new file mode 100644
index 0000000..fc6306d
--- /dev/null
+++ b/composer/dags/tutorial.py
@@ -0,0 +1,121 @@
+# -*- coding: utf-8 -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+### Tutorial Documentation
+Documentation that goes along with the Airflow tutorial located
+[here](https://airflow.apache.org/tutorial.html)
+"""
+# [START tutorial]
+from datetime import timedelta
+
+# [START import_module]
+# The DAG object; we'll need this to instantiate a DAG
+from airflow import DAG
+# Operators; we need this to operate!
+from airflow.operators.bash_operator import BashOperator
+from airflow.utils.dates import days_ago
+
+# [END import_module]
+
+# [START default_args]
+# These args will get passed on to each operator
+# You can override them on a per-task basis during operator initialization
+default_args = {
+ 'owner': 'jferriero@google.com',
+ 'depends_on_past': False,
+ 'start_date': days_ago(2),
+ 'email': ['airflow@example.com'],
+ 'email_on_failure': False,
+ 'email_on_retry': False,
+ 'retries': 1,
+ 'retry_delay': timedelta(minutes=5),
+ # 'queue': 'bash_queue',
+ # 'pool': 'backfill',
+ # 'priority_weight': 10,
+ # 'end_date': datetime(2016, 1, 1),
+ # 'wait_for_downstream': False,
+ # 'dag': dag,
+ # 'sla': timedelta(hours=2),
+ # 'execution_timeout': timedelta(seconds=300),
+ # 'on_failure_callback': some_function,
+ # 'on_success_callback': some_other_function,
+ # 'on_retry_callback': another_function,
+ # 'sla_miss_callback': yet_another_function,
+ # 'trigger_rule': 'all_success'
+}
+# [END default_args]
+
+# [START instantiate_dag]
+dag = DAG(
+ 'tutorial',
+ default_args=default_args,
+ description='A simple tutorial DAG',
+ schedule_interval=timedelta(days=1),
+)
+# [END instantiate_dag]
+
+# t1, t2 and t3 are examples of tasks created by instantiating operators
+# [START basic_task]
+t1 = BashOperator(
+ task_id='print_date',
+ bash_command='date',
+ dag=dag,
+)
+
+t2 = BashOperator(
+ task_id='sleep',
+ depends_on_past=False,
+ bash_command='sleep 5',
+ retries=3,
+ dag=dag,
+)
+# [END basic_task]
+
+# [START documentation]
+dag.doc_md = __doc__
+
+t1.doc_md = """\
+#### Task Documentation
+You can document your task using the attributes `doc_md` (markdown),
+`doc` (plain text), `doc_rst`, `doc_json`, `doc_yaml` which gets
+rendered in the UI's Task Instance Details page.
+![img](http://montcs.bloomu.edu/~bobmon/Semesters/2012-01/491/import%20soul.png)
+"""
+# [END documentation]
+
+# [START jinja_template]
+templated_command = """
+{% for i in range(5) %}
+ echo "{{ ds }}"
+ echo "{{ macros.ds_add(ds, 7)}}"
+ echo "{{ params.my_param }}"
+{% endfor %}
+"""
+
+t3 = BashOperator(
+ task_id='templated',
+ depends_on_past=False,
+ bash_command=templated_command,
+ params={'my_param': 'Parameter I passed in'},
+ dag=dag,
+)
+# [END jinja_template]
+
+t1 >> [t2, t3]
+# [END tutorial]
diff --git a/composer/dags/wordcount_dag.py b/composer/dags/wordcount_dag.py
new file mode 100644
index 0000000..9423570
--- /dev/null
+++ b/composer/dags/wordcount_dag.py
@@ -0,0 +1,121 @@
+# Copyright 2019 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Data processing test workflow definition.
+"""
+
+import datetime
+import os
+from airflow import models
+from airflow.contrib.operators.bigquery_operator import BigQueryOperator
+from airflow.contrib.operators.dataflow_operator import DataFlowJavaOperator
+from airflow.contrib.operators.gcs_download_operator import \
+ GoogleCloudStorageDownloadOperator
+# pylint: disable=import-error
+from airflow.operators.xcom_utils_plugin import CompareXComMapsOperator
+
+DATAFLOW_STAGING_BUCKET = 'gs://{{ var.value.dataflow_staging_bucket }}/staging'
+
+DATAFLOW_JAR_LOCATION = ('gs://{{ var.value.dataflow_jar_location }}'
+                         '/{{ var.value.dataflow_word_count_jar }}')
+
+PROJECT = models.Variable.get('gcp_project')
+REGION = models.Variable.get('gcp_region')
+INPUT_BUCKET = 'gs://' + models.Variable.get('gcs_input_bucket')
+OUTPUT_BUCKET_NAME = models.Variable.get('gcs_output_bucket')
+OUTPUT_BUCKET = 'gs://' + OUTPUT_BUCKET_NAME
+REF_BUCKET = models.Variable.get('gcs_ref_bucket')
+OUTPUT_PREFIX = 'output'
+DOWNLOAD_TASK_PREFIX = 'download_result'
+
+# Dynamic prefix gives us flexibility for running airflow in a ci container or
+# on composer.
+SQL_PREFIX = os.path.join(os.environ.get('AIRFLOW_HOME', '/home/airflow'),
+ 'gcs', 'data', 'sql')
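+# e.g. on Composer this resolves to /home/airflow/gcs/data/sql.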
+
+SHAKESPEARE_SQL = os.path.join(SQL_PREFIX, 'shakespeare_top_25.sql')
+
+YESTERDAY = datetime.datetime.combine(
+ datetime.datetime.today() - datetime.timedelta(1),
+ datetime.datetime.min.time())
+
+DEFAULT_ARGS = {
+ 'owner': 'jferriero@google.com',
+ 'dataflow_default_options': {
+ 'project': PROJECT,
+ 'region': REGION,
+ 'stagingLocation': DATAFLOW_STAGING_BUCKET
+ }
+}
+
+with models.DAG('wordcount_dag',
+ start_date=YESTERDAY,
+ schedule_interval=None,
+ default_args=DEFAULT_ARGS) as dag:
+
+ DATAFLOW_EXECUTION = DataFlowJavaOperator(
+ task_id='wordcount-run',
+ jar=DATAFLOW_JAR_LOCATION,
+ options={
+ 'autoscalingAlgorithm': 'THROUGHPUT_BASED',
+ 'maxNumWorkers': '3',
+ 'inputFile': f'{INPUT_BUCKET}/input.txt',
+ 'output': f'{OUTPUT_BUCKET}/{OUTPUT_PREFIX}'
+ })
+
+ DOWNLOAD_EXPECTED = GoogleCloudStorageDownloadOperator(
+ task_id='download_ref_string',
+ bucket=REF_BUCKET,
+ object='ref.txt',
+ store_to_xcom_key='ref_str',
+ )
+
+ DOWNLOAD_RESULT_ONE = GoogleCloudStorageDownloadOperator(
+ task_id=DOWNLOAD_TASK_PREFIX + '_1',
+ bucket=OUTPUT_BUCKET_NAME,
+ object=OUTPUT_PREFIX + '-00000-of-00003',
+ store_to_xcom_key='res_str_1',
+ )
+
+ DOWNLOAD_RESULT_TWO = GoogleCloudStorageDownloadOperator(
+ task_id=DOWNLOAD_TASK_PREFIX + '_2',
+ bucket=OUTPUT_BUCKET_NAME,
+ object=OUTPUT_PREFIX + '-00001-of-00003',
+ store_to_xcom_key='res_str_2',
+ )
+
+ DOWNLOAD_RESULT_THREE = GoogleCloudStorageDownloadOperator(
+ task_id=DOWNLOAD_TASK_PREFIX + '_3',
+ bucket=OUTPUT_BUCKET_NAME,
+ object=OUTPUT_PREFIX + '-00002-of-00003',
+ store_to_xcom_key='res_str_3',
+ )
+
+ COMPARE_RESULT = CompareXComMapsOperator(
+ task_id='do_comparison',
+ ref_task_ids=['download_ref_string'],
+ res_task_ids=[
+ DOWNLOAD_TASK_PREFIX + '_1', DOWNLOAD_TASK_PREFIX + '_2',
+ DOWNLOAD_TASK_PREFIX + '_3'
+ ],
+ )
+ RUN_QUERY = BigQueryOperator(task_id='run_sql', sql=SHAKESPEARE_SQL)
+
+ RUN_QUERY >> DATAFLOW_EXECUTION # pylint: disable=pointless-statement
+ DATAFLOW_EXECUTION.set_downstream(
+ [DOWNLOAD_RESULT_ONE, DOWNLOAD_RESULT_TWO, DOWNLOAD_RESULT_THREE])
+
+ COMPARE_RESULT.set_upstream([
+ DOWNLOAD_EXPECTED, DOWNLOAD_RESULT_ONE, DOWNLOAD_RESULT_TWO,
+ DOWNLOAD_RESULT_THREE
+ ])
diff --git a/composer/plugins/xcom_utils_plugin/__init__.py b/composer/plugins/xcom_utils_plugin/__init__.py
new file mode 100644
index 0000000..9aa5b70
--- /dev/null
+++ b/composer/plugins/xcom_utils_plugin/__init__.py
@@ -0,0 +1,41 @@
+# Copyright 2019 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Defines Plugin for XCom Operators."""
+
+from typing import Any, List
+from airflow.plugins_manager import AirflowPlugin
+
+# Allow unittests to easily import.
+try:
+ from xcom_utils_plugin.operators.compare_xcom_maps import \
+ CompareXComMapsOperator
+except ModuleNotFoundError:
+ from plugins.xcom_utils_plugin.operators.compare_xcom_maps import \
+ CompareXComMapsOperator
+
+
+class XComUtilsPlugin(AirflowPlugin):
+    """Plugin defining operators that perform common logic on XComs.
+ Operators:
+ CompareXComMapsOperator: An Operator that checks the equality
+ of XComs.
+ """
+ name = "xcom_utils_plugin"
+ operators = [CompareXComMapsOperator]
+ hooks: List[Any] = []
+ executors: List[Any] = []
+ macros: List[Any] = []
+ admin_views: List[Any] = []
+ flask_blueprints: List[Any] = []
+ menu_links: List[Any] = []
diff --git a/composer/plugins/xcom_utils_plugin/operators/__init__.py b/composer/plugins/xcom_utils_plugin/operators/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/composer/plugins/xcom_utils_plugin/operators/compare_xcom_maps.py b/composer/plugins/xcom_utils_plugin/operators/compare_xcom_maps.py
new file mode 100644
index 0000000..512f55c
--- /dev/null
+++ b/composer/plugins/xcom_utils_plugin/operators/compare_xcom_maps.py
@@ -0,0 +1,84 @@
+# Copyright 2019 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Custom operator that compares dictionaries in xcom.
+"""
+
+from airflow.models import BaseOperator
+from airflow.utils.decorators import apply_defaults
+
+
+class CompareXComMapsOperator(BaseOperator):
+ """Compare dictionary stored in xcom.
+
+ Args:
+ ref_task_ids: list of task ids from where the reference dictionary
+ is fetched
+ res_task_ids: list of task ids from where the comparing dictionary
+ is fetched
+ """
+
+ @apply_defaults
+ def __init__(self, ref_task_ids, res_task_ids, *args, **kwargs):
+ super(CompareXComMapsOperator, self).__init__(*args, **kwargs)
+ self.ref_task_ids = ref_task_ids
+ self.res_task_ids = res_task_ids
+
+ def execute(self, context):
+ """Perform the XCom comparison based on the ref and res task_ids.
+ """
+ ref_obj = self.read_value_as_obj(self.ref_task_ids, context)
+ res_obj = self.read_value_as_obj(self.res_task_ids, context)
+ self.compare_obj(ref_obj, res_obj)
+ return 'result contains the expected values'
+
+ def read_value_as_obj(self, task_ids, context):
+ """Reads XComs from task_ids as dict.
+ """
+ ret_obj = {}
+ for task_id in task_ids:
+ value_str = context['ti'].xcom_pull(key=None, task_ids=task_id)
+ self.parse_str_obj(value_str, ret_obj)
+ return ret_obj
+
+ @staticmethod
+ def parse_str_obj(str_rep, obj):
+        """Parse a newline-separated 'key: value' string into obj.
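+
+        For example, 'a: 1\nb: 2' yields {'a': '1', 'b': '2'}.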
+ """
+ entries = str_rep.split('\n')
+ for entry in entries:
+ if entry:
+ key, value = entry.split(': ')
+ obj[key] = value
+
+ def compare_obj(self, ref_obj, res_obj):
+ """Raise ValueError if objects are not equal"""
+ if ref_obj != res_obj:
+ raise ValueError(self.create_diff_str(ref_obj, res_obj))
+
+ @staticmethod
+ def create_diff_str(ref_obj, res_obj):
+ """Creates an informative error message detailing the differences
+ in the objects.
+ """
+ msg = 'The result differs from expected in the following ways:'
+ for k in ref_obj:
+ if k not in res_obj:
+ msg = msg + ('\nmissing key: %s in result' % k)
+ elif ref_obj[k] != res_obj[k]:
+ msg = msg + ('\nexpected %s: %s but got %s: %s' %
+ (k, ref_obj[k], k, res_obj[k]))
+ for k in res_obj:
+ if k not in ref_obj:
+ msg = msg + ('\nunexpected key: %s in result' % k)
+ return msg
diff --git a/composer/precommit_cloudbuild.yaml b/composer/precommit_cloudbuild.yaml
new file mode 100644
index 0000000..e7388a4
--- /dev/null
+++ b/composer/precommit_cloudbuild.yaml
@@ -0,0 +1,31 @@
+# Copyright 2019 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Composer is an orchestrator, so to test it appropriately we need to build
+# everything that it will be orchestrating and stage it in a known, readable
+# place to facilitate the tests.
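+#
+# This build can also be submitted manually from the repository root
+# (a usage sketch):
+#   gcloud builds submit . --config=composer/precommit_cloudbuild.yaml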
+steps:
+# Run unit tests in Airflow container (local to cloud build).
+- name: 'gcr.io/cloud-solutions-images/apache-airflow:1.10'
+ dir: 'composer'
+ entrypoint: 'bash'
+ args: [
+ 'cloudbuild/bin/run_tests.sh',
+ '../bigquery/sql',
+ './config/AirflowVariables.json',
+ './plugins']
+ id: 'run-unit-tests'
+options:
+ machineType: 'N1_HIGHCPU_8' # For test and deploy dags parallelization.
diff --git a/composer/requirements-dev.txt b/composer/requirements-dev.txt
new file mode 100644
index 0000000..4a10aff
--- /dev/null
+++ b/composer/requirements-dev.txt
@@ -0,0 +1,13 @@
+apache-airflow[gcp_api]==1.10.6
+flake8>=3.6.0
+flake8-colors
+nose
+pytest
+parameterized
+pre-commit
+pylint~=2.3.1
+mock
+mypy
+tenacity==5.1.5
+Werkzeug==0.16.0
diff --git a/composer/tests/__init__.py b/composer/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/composer/tests/test_compare_xcom_maps.py b/composer/tests/test_compare_xcom_maps.py
new file mode 100644
index 0000000..2c1e05f
--- /dev/null
+++ b/composer/tests/test_compare_xcom_maps.py
@@ -0,0 +1,130 @@
+# Copyright 2019 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Unit test of the CompareXComMapsOperator.
+"""
+import unittest
+import mock
+
+# pylint: disable=import-error
+from plugins.xcom_utils_plugin.operators.compare_xcom_maps import \
+ CompareXComMapsOperator
+
+TASK_ID = 'test_compare_task_id'
+REF_TASK_ID = 'download_ref_string'
+DOWNLOAD_TASK_PREFIX = 'download_result'
+CONTEXT_CLASS_NAME = 'airflow.ti_deps.dep_context'
+ERROR_LINE_ONE = 'The result differs from expected in the following ways:\n'
+
+
+def generate_mock_function(first_value, second_value, third_value):
+ """Mock dictionary for XCom."""
+
+ def mock_function(**kwargs):
+ return {
+ REF_TASK_ID: 'a: 1\nb: 2\nc: 3',
+ DOWNLOAD_TASK_PREFIX + '_1': first_value,
+ DOWNLOAD_TASK_PREFIX + '_2': second_value,
+ DOWNLOAD_TASK_PREFIX + '_3': third_value
+ }[kwargs['task_ids']]
+
+ return mock_function
+
+
+def equal_mock():
+ """Mocks no change."""
+ return generate_mock_function('c: 3', 'b: 2', 'a: 1')
+
+
+def missing_value_mock():
+ """Mock missing key."""
+ return generate_mock_function('b: 2', 'a: 1', 'b: 2')
+
+
+def wrong_value_mock():
+ """Mock wrong value."""
+ return generate_mock_function('a: 1', 'b: 4', 'c: 3')
+
+
+def unexpected_value_mock():
+ """Mock wrong key."""
+ return generate_mock_function('a: 1', 'c: 3\nd: 4', 'b: 2')
+
+
+class CompareXComMapsOperatorTest(unittest.TestCase):
+ """Test class for XComMapsOperator for success case and various
+ error handling."""
+
+ def setUp(self):
+ """Set up test fixture."""
+ super(CompareXComMapsOperatorTest, self).setUp()
+ self.xcom_compare = CompareXComMapsOperator(
+ task_id=TASK_ID,
+ ref_task_ids=[REF_TASK_ID],
+ res_task_ids=[
+ DOWNLOAD_TASK_PREFIX + '_1', DOWNLOAD_TASK_PREFIX + '_2',
+ DOWNLOAD_TASK_PREFIX + '_3'
+ ])
+
+ def test_init(self):
+ """Test the Operator's constructor."""
+ self.assertEqual(self.xcom_compare.task_id, TASK_ID)
+ self.assertListEqual(self.xcom_compare.ref_task_ids, [REF_TASK_ID])
+ self.assertListEqual(self.xcom_compare.res_task_ids, [
+ DOWNLOAD_TASK_PREFIX + '_1', DOWNLOAD_TASK_PREFIX + '_2',
+ DOWNLOAD_TASK_PREFIX + '_3'
+ ])
+
+ def assert_raises_with_message(self, error_type, msg, func, *args,
+ **kwargs):
+ """Utility method for asserting a message was produced."""
+ with self.assertRaises(error_type) as context:
+ func(*args, **kwargs)
+ self.assertEqual(msg, str(context.exception))
+
+ def execute_value_error(self, mock_func, error_expect_tr):
+ """Utility for testing various ValueError paths."""
+ with mock.patch(CONTEXT_CLASS_NAME) as context_mock:
+ context_mock['ti'].xcom_pull = mock_func
+ self.assert_raises_with_message(ValueError, error_expect_tr,
+ self.xcom_compare.execute,
+ context_mock)
+
+ def test_equal(self):
+ """Test success case."""
+ with mock.patch(CONTEXT_CLASS_NAME) as context_mock:
+ context_mock['ti'].xcom_pull = equal_mock()
+ self.xcom_compare.execute(context_mock)
+
+ def test_missing_value(self):
+ """Test expected error message when missing key."""
+ self.execute_value_error(
+ missing_value_mock(), '{}{}'.format(ERROR_LINE_ONE,
+ 'missing key: c in result'))
+
+ def test_wrong_value(self):
+ """Test expected error message if xcom values don't match."""
+ self.execute_value_error(
+ wrong_value_mock(), '{}{}'.format(ERROR_LINE_ONE,
+ 'expected b: 2 but got b: 4'))
+
+ def test_unexpected_value(self):
+ """Test expected error message if xcom contains unexpected key."""
+ self.execute_value_error(
+ unexpected_value_mock(),
+ '{}{}'.format(ERROR_LINE_ONE, 'unexpected key: d in result'))
+
+
+# Guard the runner so the suite isn't executed twice when a test collector
+# imports this module.
+if __name__ == '__main__':
+    SUITE = unittest.TestLoader().loadTestsFromTestCase(
+        CompareXComMapsOperatorTest)
+    unittest.TextTestRunner(verbosity=2).run(SUITE)
diff --git a/composer/tests/test_dag_validation.py b/composer/tests/test_dag_validation.py
new file mode 100644
index 0000000..b7d5776
--- /dev/null
+++ b/composer/tests/test_dag_validation.py
@@ -0,0 +1,89 @@
+# Copyright 2019 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""DAG Quality tests."""
+
+import os
+from pathlib import Path
+import time
+import unittest
+
+from airflow.models import DagBag
+
+
+class TestDagIntegrity(unittest.TestCase):
+ """Tests DAG Syntax, compatibility with environment and load time."""
+ LOAD_SECOND_THRESHOLD = 2
+
+ def setUp(self):
+ """Setup dagbag for each test."""
+ self.dagbag = DagBag(
+ dag_folder=os.environ.get('AIRFLOW_HOME', "~/airflow/") + '/dags/',
+ include_examples=False)
+ with open('./config/running_dags.txt') as running_dags_txt:
+ self.running_dag_ids = running_dags_txt.read().splitlines()
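+        # running_dags.txt is expected to hold one dag_id per line, e.g.:
+        #   test_word_count
+        #   prod_word_count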
+
+ def test_no_ignore_running_dags(self):
+ """
+ Tests that we don't have any dags in running_dags.txt that are
+ ignored by .airflowignore
+ """
+        for dag_id in self.running_dag_ids:
+            self.assertIsNotNone(
+                self.dagbag.get_dag(dag_id),
+                f"{dag_id} is in running_dags.txt but not in the dagbag.")
+
+ def test_import_dags(self):
+ """Tests there are no syntax issues or environment compaibility issues.
+ """
+ self.assertFalse(
+ len(self.dagbag.import_errors),
+ 'DAG import failures. Errors: {}'.format(
+ self.dagbag.import_errors))
+
+ def test_non_airflow_owner(self):
+ """Tests that owners are set for all dags"""
+ for dag_id in self.dagbag.dag_ids:
+ if dag_id != 'airflow_monitoring':
+ dag = self.dagbag.get_dag(dag_id)
+ try:
+ self.assertIsNotNone(dag.owner)
+ self.assertNotEqual(dag.owner, 'airflow')
+ except AssertionError as err:
+ self.fail(f"issue validating owner for DAG {dag_id}: {err}")
+
+ def test_same_file_and_dag_id_name(self):
+ """Tests that filename matches dag_id"""
+ for dag_id in self.dagbag.dag_ids:
+ dag = self.dagbag.get_dag(dag_id)
+ if not dag.is_subdag:
+                stripped_filename = os.path.splitext(
+                    Path(dag.filepath).name)[0]
+ self.assertEqual(dag_id, stripped_filename)
+
+ def test_import_time(self):
+ """Test that all DAGs can be parsed under the threshold time."""
+ for dag_id in self.dagbag.dag_ids:
+ start = time.time()
+
+ self.dagbag.process_file(self.dagbag.get_dag(dag_id).filepath)
+
+ end = time.time()
+ total = end - start
+
+ self.assertLessEqual(total, self.LOAD_SECOND_THRESHOLD)
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/dataflow/java/wordcount/cloudbuild.yaml b/dataflow/java/wordcount/cloudbuild.yaml
new file mode 100644
index 0000000..b50cfde
--- /dev/null
+++ b/dataflow/java/wordcount/cloudbuild.yaml
@@ -0,0 +1,25 @@
+steps:
+# [Dataflow]
+# "mvn package" also runs the prior lifecycle phases (validate, compile, test).
+# https://maven.apache.org/guides/introduction/introduction-to-the-lifecycle.html#a-build-lifecycle-is-made-up-of-phases
+- name: maven:3.6.0-jdk-8-slim
+ waitFor: ['-']
+ dir: 'dataflow/java/wordcount'
+ entrypoint: 'mvn'
+ args: ['package', '-q']
+ id: 'build-wordcount-jar'
+# Override the JAR reference variable to the artifact built in this build so
+# the Airflow DAG that orchestrates this job picks up this version of the JAR.
+- name: 'google/cloud-sdk'
+ waitFor: ['build-wordcount-jar']
+ dir: 'dataflow/java/wordcount'
+ entrypoint: 'bash'
+ args: [
+ '-c',
+ 'gcloud composer environments run --location ${_COMPOSER_REGION} ${_COMPOSER_ENV_NAME} variables -- --set dataflow_word_count_jar "wordcount/${_SHORT_SHA}/$(basename $(ls target/word-count-beam-bundled-*.jar))"'
+ ]
+ id: 'set-composer-jar-ref'
+artifacts:
+ objects:
+ location: 'gs://${_DATAFLOW_JAR_BUCKET}/wordcount/${_SHORT_SHA}/'
+ paths: ['./dataflow/java/wordcount/target/word-count-beam-bundled-*.jar']
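+# This build can be submitted manually from the repository root; the
+# substitution values below are illustrative placeholders:
+#   gcloud builds submit . --config=dataflow/java/wordcount/cloudbuild.yaml \
+#     --substitutions=_COMPOSER_REGION=us-central1,_COMPOSER_ENV_NAME=my-env,_SHORT_SHA=abc1234,_DATAFLOW_JAR_BUCKET=my-jar-bucket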
diff --git a/source-code/data-processing-code/pom.xml b/dataflow/java/wordcount/pom.xml
similarity index 100%
rename from source-code/data-processing-code/pom.xml
rename to dataflow/java/wordcount/pom.xml
diff --git a/dataflow/java/wordcount/precommit_cloudbuild.yaml b/dataflow/java/wordcount/precommit_cloudbuild.yaml
new file mode 100644
index 0000000..5bc80b0
--- /dev/null
+++ b/dataflow/java/wordcount/precommit_cloudbuild.yaml
@@ -0,0 +1,10 @@
+steps:
+# [Dataflow]
+# "mvn package" also runs the prior lifecycle phases (validate, compile, test).
+# https://maven.apache.org/guides/introduction/introduction-to-the-lifecycle.html#a-build-lifecycle-is-made-up-of-phases
+- name: maven:3.6.0-jdk-8-slim
+ waitFor: ['-']
+ dir: 'dataflow/java/wordcount'
+ entrypoint: 'mvn'
+ args: ['package', '-q']
+ id: 'build-wordcount-jar'
\ No newline at end of file
diff --git a/dataflow/java/wordcount/src/main/java/org/apache/beam/examples/WordCount.java b/dataflow/java/wordcount/src/main/java/org/apache/beam/examples/WordCount.java
new file mode 100644
index 0000000..4b20d48
--- /dev/null
+++ b/dataflow/java/wordcount/src/main/java/org/apache/beam/examples/WordCount.java
@@ -0,0 +1,195 @@
+/*
+ * Copyright 2019 Google Inc.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.examples;
+
+import org.apache.beam.sdk.Pipeline;
+import org.apache.beam.sdk.io.TextIO;
+import org.apache.beam.sdk.metrics.Counter;
+import org.apache.beam.sdk.metrics.Distribution;
+import org.apache.beam.sdk.metrics.Metrics;
+import org.apache.beam.sdk.options.Default;
+import org.apache.beam.sdk.options.Description;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.options.PipelineOptionsFactory;
+import org.apache.beam.sdk.options.Validation.Required;
+import org.apache.beam.sdk.transforms.Count;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.MapElements;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.beam.sdk.transforms.SimpleFunction;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PCollection;
+
+/**
+ * An example that counts words in Shakespeare and includes Beam best practices.
+ *
+ * <p>This class, {@link WordCount}, is the second in a series of four successively more detailed
+ * 'word count' examples. You may first want to take a look at {@link MinimalWordCount}. After
+ * you've looked at this example, then see the {@link DebuggingWordCount} pipeline, for introduction
+ * of additional concepts.
+ *
+ * <p>For a detailed walkthrough of this example, see
+ * https://beam.apache.org/get-started/wordcount-example/
+ *
+ * <p>Basic concepts, also in the MinimalWordCount example: Reading text files; counting a
+ * PCollection; writing to text files
+ *
+ * <p>New Concepts:
+ *
+ * <pre>
+ *   1. Executing a Pipeline both locally and using the selected runner
+ *   2. Using ParDo with static DoFns defined out-of-line
+ *   3. Building a composite transform
+ *   4. Defining your own pipeline options
+ * </pre>
+ *
+ * <p>Concept #1: you can execute this pipeline either locally or by selecting another runner.
+ * These are now command-line options and not hard-coded as they were in the MinimalWordCount
+ * example.
+ *
+ * <p>To change the runner, specify:
+ *
+ * <pre>{@code
+ * --runner=YOUR_SELECTED_RUNNER
+ * }</pre>
+ *
+ * <p>To execute this pipeline, specify a local output file (if using the {@code DirectRunner}) or
+ * output prefix on a supported distributed file system.
+ *
+ * <pre>{@code
+ * --output=[YOUR_LOCAL_FILE | YOUR_OUTPUT_PREFIX]
+ * }</pre>
+ *
+ * <p>The input file defaults to a public data set containing the text of King Lear, by William
+ * Shakespeare. You can override it and choose your own input with {@code --inputFile}.
+ */
+public class WordCount {
+
+ /**
+ * Concept #2: You can make your pipeline assembly code less verbose by defining your DoFns
+ * statically out-of-line. This DoFn tokenizes lines of text into individual words; we pass it to
+ * a ParDo in the pipeline.
+ */
+  static class ExtractWordsFn extends DoFn<String, String> {
+ private final Counter emptyLines = Metrics.counter(ExtractWordsFn.class, "emptyLines");
+ private final Distribution lineLenDist =
+ Metrics.distribution(ExtractWordsFn.class, "lineLenDistro");
+ private static final String TOKENIZER_PATTERN = "[^\\p{L}]+";
+
+ @ProcessElement
+    public void processElement(@Element String element, OutputReceiver<String> receiver) {
+ lineLenDist.update(element.length());
+ if (element.trim().isEmpty()) {
+ emptyLines.inc();
+ }
+
+ // Split the line into words.
+ String[] words = element.split(TOKENIZER_PATTERN, -1);
+
+ // Output each word encountered into the output PCollection.
+ for (String word : words) {
+ if (!word.isEmpty()) {
+ receiver.output(word);
+ }
+ }
+ }
+ }
+
+ /** A SimpleFunction that converts a Word and Count into a printable string. */
+  public static class FormatAsTextFn extends SimpleFunction<KV<String, Long>, String> {
+ @Override
+    public String apply(KV<String, Long> input) {
+ return input.getKey() + ": " + input.getValue();
+ }
+ }
+
+ /**
+ * A PTransform that converts a PCollection containing lines of text into a PCollection of
+ * formatted word counts.
+ *
+   * <p>Concept #3: This is a custom composite transform that bundles two transforms (ParDo and
+ * Count) as a reusable PTransform subclass. Using composite transforms allows for easy reuse,
+ * modular testing, and an improved monitoring experience.
+ */
+  public static class CountWords
+      extends PTransform<PCollection<String>, PCollection<KV<String, Long>>> {
+    @Override
+    public PCollection<KV<String, Long>> expand(PCollection<String> lines) {
+
+ // Convert lines of text into individual words.
+      PCollection<String> words = lines.apply(ParDo.of(new ExtractWordsFn()));
+
+ // Count the number of times each word occurs.
+      PCollection<KV<String, Long>> wordCounts = words.apply(Count.perElement());
+
+ return wordCounts;
+ }
+ }
+
+ /**
+ * Options supported by {@link WordCount}.
+ *
+   * <p>Concept #4: Defining your own configuration options. Here, you can add your own arguments to
+ * be processed by the command-line parser, and specify default values for them. You can then
+ * access the options values in your pipeline code.
+ *
+   * <p>Inherits standard configuration options.
+ */
+ public interface WordCountOptions extends PipelineOptions {
+
+ /**
+ * By default, this example reads from a public dataset containing the text of King Lear. Set
+ * this option to choose a different input file or glob.
+ */
+ @Description("Path of the file to read from")
+ @Default.String("gs://apache-beam-samples/shakespeare/kinglear.txt")
+ String getInputFile();
+
+ void setInputFile(String value);
+
+ /** Set this required option to specify where to write the output. */
+ @Description("Path of the file to write to")
+ @Required
+ String getOutput();
+
+ void setOutput(String value);
+ }
+
+ static void runWordCount(WordCountOptions options) {
+ Pipeline p = Pipeline.create(options);
+
+ // Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the
+ // static FormatAsTextFn() to the ParDo transform.
+ p.apply("ReadLines", TextIO.read().from(options.getInputFile()))
+ .apply(new CountWords())
+ .apply(MapElements.via(new FormatAsTextFn()))
+ .apply("WriteCounts", TextIO.write().to(options.getOutput()));
+
+ p.run().waitUntilFinish();
+ }
+
+ public static void main(String[] args) {
+ WordCountOptions options =
+ PipelineOptionsFactory.fromArgs(args).withValidation().as(WordCountOptions.class);
+ runWordCount(options);
+ }
+}
diff --git a/source-code/data-processing-code/src/test/java/org/apache/beam/examples/WordCountTest.java b/dataflow/java/wordcount/src/test/java/org/apache/beam/examples/WordCountTest.java
similarity index 55%
rename from source-code/data-processing-code/src/test/java/org/apache/beam/examples/WordCountTest.java
rename to dataflow/java/wordcount/src/test/java/org/apache/beam/examples/WordCountTest.java
index df8f3a3..7c76573 100644
--- a/source-code/data-processing-code/src/test/java/org/apache/beam/examples/WordCountTest.java
+++ b/dataflow/java/wordcount/src/test/java/org/apache/beam/examples/WordCountTest.java
@@ -45,52 +45,42 @@
@RunWith(JUnit4.class)
public class WordCountTest {
- /** Example test that tests a specific {@link DoFn}. */
- @Test
- public void testExtractWordsFn() throws Exception {
- DoFnTester < String, String > extractWordsFn = DoFnTester.of(new ExtractWordsFn());
+ /** Example test that tests a specific {@link DoFn}. */
+ @Test
+ public void testExtractWordsFn() throws Exception {
+    DoFnTester<String, String> extractWordsFn = DoFnTester.of(new ExtractWordsFn());
- Assert.assertThat(
- extractWordsFn.processBundle(" some input words "),
- CoreMatchers.hasItems("some", "input", "words"));
- Assert.assertThat(extractWordsFn.processBundle(" "), CoreMatchers.hasItems());
- Assert.assertThat(
- extractWordsFn.processBundle(" some ", " input", " words"),
- CoreMatchers.hasItems("some", "input", "words"));
- }
+ Assert.assertThat(
+ extractWordsFn.processBundle(" some input words "),
+ CoreMatchers.hasItems("some", "input", "words"));
+ Assert.assertThat(extractWordsFn.processBundle(" "), CoreMatchers.hasItems());
+ Assert.assertThat(
+ extractWordsFn.processBundle(" some ", " input", " words"),
+ CoreMatchers.hasItems("some", "input", "words"));
+ }
- static final String[] WORDS_ARRAY =
- new String[] {
- "five",
- "five four",
- "five four three",
- "five four three two",
- "",
- "five four three two one"
- };
+ static final String[] WORDS_ARRAY =
+ new String[] {
+ "five", "five four", "five four three", "five four three two", "", "five four three two one"
+ };
- static final List < String > WORDS = Arrays.asList(WORDS_ARRAY);
+  static final List<String> WORDS = Arrays.asList(WORDS_ARRAY);
- static final String[] COUNTS_ARRAY = new String[] {
- "five: 5",
- "four: 4",
- "three: 3",
- "two: 2",
- "one: 1"
- };
+ static final String[] COUNTS_ARRAY =
+ new String[] {"five: 5", "four: 4", "three: 3", "two: 2", "one: 1"};
- @Rule public TestPipeline p = TestPipeline.create();
+ @Rule public TestPipeline p = TestPipeline.create();
- /** Example test that tests a PTransform by using an in-memory input and inspecting the output. */
- @Test
- @Category(ValidatesRunner.class)
- public void testCountWords() throws Exception {
- PCollection < String > input = p.apply(Create.of(WORDS).withCoder(StringUtf8Coder.of()));
+ /** Example test that tests a PTransform by using an in-memory input and inspecting the output. */
+ @Test
+ @Category(ValidatesRunner.class)
+ public void testCountWords() throws Exception {
+    PCollection<String> input = p.apply(Create.of(WORDS).withCoder(StringUtf8Coder.of()));
- PCollection < String > output =
- input.apply(new CountWords()).apply(MapElements.via(new FormatAsTextFn()));
+    PCollection<String> output =
+ input.apply(new CountWords()).apply(MapElements.via(new FormatAsTextFn()));
- PAssert.that(output).containsInAnyOrder(COUNTS_ARRAY);
- p.run().waitUntilFinish();
- }
+ PAssert.that(output).containsInAnyOrder(COUNTS_ARRAY);
+ p.run().waitUntilFinish();
+ }
}
diff --git a/env-setup/create_buckets.sh b/env-setup/create_buckets.sh
deleted file mode 100644
index 88ea048..0000000
--- a/env-setup/create_buckets.sh
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/bin/bash
-#
-# This script creates the buckets used by the build pipelines and the data
-# processing workflow. It also gives the Cloud Composer service account the
-# access level it need to execute the data processing workflow
-#
-# Copyright 2019 Google Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-gsutil ls -L "gs://${DATAFLOW_JAR_BUCKET_TEST}" 2>/dev/null \
-|| gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${DATAFLOW_JAR_BUCKET_TEST}"
-gsutil ls -L "gs://${INPUT_BUCKET_TEST}" 2>/dev/null \
-|| gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${INPUT_BUCKET_TEST}"
-gsutil ls -L "gs://${REF_BUCKET_TEST}" 2>/dev/null \
-|| gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${REF_BUCKET_TEST}"
-gsutil ls -L "gs://${RESULT_BUCKET_TEST}" 2>/dev/null \
-|| gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${RESULT_BUCKET_TEST}"
-gsutil ls -L "gs://${DATAFLOW_STAGING_BUCKET_TEST}" 2>/dev/null \
-|| gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${DATAFLOW_STAGING_BUCKET_TEST}"
-gsutil ls -L "gs://${DATAFLOW_JAR_BUCKET_PROD}" 2>/dev/null \
-|| gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${DATAFLOW_JAR_BUCKET_PROD}"
-gsutil ls -L "gs://${INPUT_BUCKET_PROD}" 2>/dev/null \
-|| gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${INPUT_BUCKET_PROD}"
-gsutil ls -L "gs://${RESULT_BUCKET_PROD}" 2>/dev/null \
-|| gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${RESULT_BUCKET_PROD}"
-gsutil ls -L "gs://${DATAFLOW_STAGING_BUCKET_PROD}" 2>/dev/null \
-|| gsutil mb -c regional -l "${COMPOSER_REGION}" "gs://${DATAFLOW_STAGING_BUCKET_PROD}"
-
-gsutil acl ch -u "${COMPOSER_SERVICE_ACCOUNT}:R" \
- "gs://${DATAFLOW_JAR_BUCKET_TEST}" \
- "gs://${INPUT_BUCKET_TEST}" \
- "gs://${REF_BUCKET_TEST}" \
- "gs://${DATAFLOW_JAR_BUCKET_PROD}" "gs://${INPUT_BUCKET_PROD}"
-gsutil acl ch -u "${COMPOSER_SERVICE_ACCOUNT}:W" \
- "gs://${RESULT_BUCKET_TEST}" \
- "gs://${DATAFLOW_STAGING_BUCKET_TEST}" \
- "gs://${RESULT_BUCKET_PROD}" "gs://${DATAFLOW_STAGING_BUCKET_PROD}"
diff --git a/env-setup/set_composer_variables.sh b/env-setup/set_composer_variables.sh
deleted file mode 100644
index 67fbffa..0000000
--- a/env-setup/set_composer_variables.sh
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/bin/bash
-#
-# This script sets the variables in Composer. The variables are needed for the
-# data processing DAGs to properly execute, such as project-id, GCP region and
-#zone. It also sets Cloud Storage buckets where test files are stored.
-#
-# Copyright 2019 Google Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-declare -A variables
-variables["gcp_project"]="${GCP_PROJECT_ID}"
-variables["gcp_region"]="${COMPOSER_REGION}"
-variables["gcp_zone"]="${COMPOSER_ZONE_ID}"
-variables["dataflow_jar_location_test"]="${DATAFLOW_JAR_BUCKET_TEST}"
-variables["dataflow_jar_file_test"]="to_be_overriden"
-variables["gcs_input_bucket_test"]="${INPUT_BUCKET_TEST}"
-variables["gcs_ref_bucket_test"]="${REF_BUCKET_TEST}"
-variables["gcs_output_bucket_test"]="${RESULT_BUCKET_TEST}"
-variables["dataflow_staging_bucket_test"]="${DATAFLOW_STAGING_BUCKET_TEST}"
-variables["dataflow_jar_location_prod"]="${DATAFLOW_JAR_BUCKET_PROD}"
-variables["dataflow_jar_file_prod"]="to_be_overriden"
-variables["gcs_input_bucket_prod"]="${INPUT_BUCKET_PROD}"
-variables["gcs_output_bucket_prod"]="${RESULT_BUCKET_PROD}"
-variables["dataflow_staging_bucket_prod"]="${DATAFLOW_STAGING_BUCKET_PROD}"
-
-for i in "${!variables[@]}"; do
- gcloud composer environments run "${COMPOSER_ENV_NAME}" \
- --location "${COMPOSER_REGION}" variables -- --set "${i}" "${variables[$i]}"
-done
diff --git a/env-setup/set_env.sh b/env-setup/set_env.sh
deleted file mode 100644
index 458a28b..0000000
--- a/env-setup/set_env.sh
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/bin/bash
-#
-# This script sets the environment variables for project environment specific
-# information such as project_id, region and zone choice. And also name of
-# buckets that are used by the build pipeline and the data processing workflow.
-#
-# Copyright 2019 Google Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-export TEST='test'
-export GCP_PROJECT_ID=$(gcloud config list --format 'value(core.project)')
-export PROJECT_NUMBER=$(gcloud projects describe "${GCP_PROJECT_ID}" --format='get(projectNumber)')
-export DATAFLOW_JAR_BUCKET_TEST="${GCP_PROJECT_ID}-composer-dataflow-source-${TEST}"
-export INPUT_BUCKET_TEST="${GCP_PROJECT_ID}-composer-input-${TEST}"
-export RESULT_BUCKET_TEST="${GCP_PROJECT_ID}-composer-result-${TEST}"
-export REF_BUCKET_TEST="${GCP_PROJECT_ID}-composer-ref-${TEST}"
-export DATAFLOW_STAGING_BUCKET_TEST="${GCP_PROJECT_ID}-dataflow-staging-${TEST}"
-export PROD='prod'
-export DATAFLOW_JAR_BUCKET_PROD="${GCP_PROJECT_ID}-composer-dataflow-source-${PROD}"
-export INPUT_BUCKET_PROD="${GCP_PROJECT_ID}-composer-input-${PROD}"
-export RESULT_BUCKET_PROD="${GCP_PROJECT_ID}-composer-result-${PROD}"
-export DATAFLOW_STAGING_BUCKET_PROD="${GCP_PROJECT_ID}-dataflow-staging-${PROD}"
-export COMPOSER_REGION='us-central1'
-export RESULT_BUCKET_REGION="${COMPOSER_REGION}"
-export COMPOSER_ZONE_ID='us-central1-a'
-
-export COMPOSER_ENV_NAME='data-pipeline-composer'
-export SOURCE_CODE_REPO='data-pipeline-source'
-export COMPOSER_DAG_NAME_TEST='test_word_count'
-export COMPOSER_DAG_NAME_PROD='prod_word_count'
diff --git a/helpers/check_format.sh b/helpers/check_format.sh
new file mode 100755
index 0000000..2f60566
--- /dev/null
+++ b/helpers/check_format.sh
@@ -0,0 +1,206 @@
+#!/bin/bash
+
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script checks the format of various files in this repository's
+# subfolders based on Google open source style guidelines.
+#
+# The following languages are currently supported:
+# - python (using yapf)
+# - bash (using shellcheck)
+# - terraform (using terraform fmt)
+# - go (using gofmt)
+# - java (using google-java-format)
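+#
+# Run it from the repository root so helpers/exclusion_list.txt resolves
+# (a usage sketch):
+#   ./helpers/check_format.sh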
+
+set -e
+
+# need_formatting - helper function to error out when
+# a folder contains files that need formatting
+# @args $1 - Folder local path
+# @args $2 - List of files in that folder that need formatting
+# Exit with error code 1 - always
+need_formatting() {
+ FOLDER=$1
+ FILES_TO_LINT=${*:2}
+ echo "Some files need to be formatted in $FOLDER - FAIL"
+ echo "$FILES_TO_LINT"
+ exit 1
+}
+
+# validate_bash - takes a folder path as input and shell checks files
+validate_bash() {
+ FOLDER=$1
+ echo "Validating $FOLDER - Checking bash files"
+
+ FILES_TO_CHECK=$(find "$FOLDER" -type f -name "*.sh")
+
+ # Initialize FILES_TO_LINT to empty string
+ FILES_TO_LINT=""
+
+ if [[ -n "$FILES_TO_CHECK" ]]
+ then
+ for FILE_TO_CHECK in $FILES_TO_CHECK
+ do
+ if ! shellcheck "$FILE_TO_CHECK";
+ then
+ FILES_TO_LINT+="$FILE_TO_CHECK "
+ fi
+ done
+
+ if [[ -n "$FILES_TO_LINT" ]]
+ then
+ need_formatting "$FOLDER" "$FILES_TO_LINT"
+ fi
+ else
+ echo "No bash files found for $FOLDER - SKIP"
+ fi
+}
+
+# validate_terraform - checks terraform files in terraform/
+validate_terraform() {
+ FOLDER=$1
+ echo "Checking terraform fmt in $FOLDER"
+ FILES_TO_CHECK=$(find "$FOLDER" -type f -name "*.tf")
+
+ # Initialize FILES_TO_LINT to empty string
+ FILES_TO_LINT=""
+
+ if [[ -n "$FILES_TO_CHECK" ]]
+ then
+ FILES_TO_LINT=""
+ if ! terraform fmt -check "$FOLDER";
+ then
+ FILES_TO_LINT+="$TF_TO_LINT "
+ need_formatting "$FOLDER" "$FILES_TO_LINT"
+ fi
+ else
+ echo "No terraform files found for $FOLDER - SKIP"
+ fi
+}
+
+# validate_python - takes a folder path as input and validate python files
+# using yapf (supports both python2 and python3)
+# errors out if yapf --diff -r --style google returns a non-0 status
+validate_python() {
+ FOLDER=$1
+ echo "Validating $FOLDER - Checking python files"
+
+ FILES_TO_CHECK=$(find "$FOLDER" -type f -name "*.py")
+
+ # Initialize FILES_TO_LINT to empty string
+ FILES_TO_LINT=""
+
+ if [[ -n "$FILES_TO_CHECK" ]]
+ then
+ # Checking python files
+ # python 2 yapf
+ echo "Testing formatting for python2 files in $FOLDER"
+
+        # Get the list of files to lint, capturing yapf's exit status
+        # explicitly so "set -e" doesn't abort the script on a non-zero diff.
+        YAPF_PYTHON2_STATUS=0
+        # shellcheck disable=SC2086
+        YAPF_PYTHON2_OUTPUT=$(python2 -m yapf --diff -r --style google $FILES_TO_CHECK 2>&1) || YAPF_PYTHON2_STATUS=$?
+        FILES_TO_LINT+=$(echo "$YAPF_PYTHON2_OUTPUT" | grep -E '^---.*\(original\)$' | awk '{print $2}')
+
+ if [[ -n "$FILES_TO_LINT" ]]
+ then
+ # Error out with details
+ need_formatting "$FOLDER" "$FILES_TO_LINT"
+ fi
+
+        # Check python files with python3 if python2 failed (i.e. the code is
+        # not python2-compatible)
+ if [[ "$YAPF_PYTHON2_STATUS" -ne 0 ]]
+ then
+ # python 3 yapf
+ echo "Testing formatting for python3 files in $FOLDER"
+            # shellcheck disable=SC2086
+            FILES_TO_LINT+=$(python3 -m yapf --diff -r --style google $FILES_TO_CHECK | grep -E '^---.*\(original\)$' | awk '{print $2}')
+
+ if [[ -n "$FILES_TO_LINT" ]]
+ then
+ # Error out with details
+ need_formatting "$FOLDER" "$FILES_TO_LINT"
+ fi
+
+ if [[ -z "$FILES_TO_LINT" ]]
+ then
+ echo "No files need to be formatted in $FOLDER - PASS"
+ fi
+ fi
+ else
+ echo "No python files found for $FOLDER - SKIP"
+ fi
+}
+
+# validate_go - takes a folder path as input and validate go files
+# using gofmt
+# errors out if gofmt returns a non-0 status
+validate_go() {
+ FOLDER=$1
+ echo "Validating $FOLDER - Checking GO files"
+
+ FILES_TO_LINT=$(gofmt -l "$FOLDER")
+
+ if [[ -n "$FILES_TO_LINT" ]]
+ then
+ # Error out with details
+ need_formatting "$FOLDER" "$FILES_TO_LINT"
+ else
+ echo "No go files need formatting for $FOLDER - SKIP"
+ fi
+}
+
+# validate_java - takes a folder path as input and validates java files
+# using google-java-format
+# errors out if any file needs formatting
+validate_java(){
+ FOLDER=$1
+ echo "Validating $FOLDER - Checking java files"
+
+ FILES_TO_CHECK=$(find "$FOLDER" -type f -name "*.java")
+
+ # Initialize FILES_TO_LINT to empty string
+ FILES_TO_LINT=""
+
+ if [[ -n "$FILES_TO_CHECK" ]]
+ then
+ echo "Testing formatting for java files in $FOLDER"
+ # shellcheck disable=SC2086
+ FILES_TO_LINT=$(java -jar "/usr/share/java/google-java-format-1.7-all-deps.jar" --set-exit-if-changed -n $FILES_TO_CHECK)
+
+ if [[ -n "$FILES_TO_LINT" ]]
+ then
+ need_formatting "$FOLDER" "$FILES_TO_LINT"
+ fi
+
+ if [[ -z "$FILES_TO_LINT" ]]
+ then
+ echo "No files need to be formatted in $FOLDER - PASS"
+ fi
+ else
+ echo "No java files found for $FOLDER - SKIP"
+ fi
+}
+
+# temporary list of folders to exclude
+EXCLUDE_FOLDERS=$(cat helpers/exclusion_list.txt)
+while IFS= read -r -d '' FOLDER
+do
+ if [[ ! ${EXCLUDE_FOLDERS[*]} =~ $FOLDER ]]
+ then
+ validate_java "$FOLDER"
+ validate_python "$FOLDER"
+ validate_go "$FOLDER"
+ validate_bash "$FOLDER"
+ validate_terraform "$FOLDER"
+ else
+ echo "$FOLDER in exclusion list - SKIP "
+ fi
+done < <(find . -maxdepth 1 -mindepth 1 -type d -print0)
+echo "finished checking format"
diff --git a/helpers/exclusion_list.txt b/helpers/exclusion_list.txt
new file mode 100644
index 0000000..d2de6ef
--- /dev/null
+++ b/helpers/exclusion_list.txt
@@ -0,0 +1,3 @@
+./.git/
+./terraform/.terraform/
+./composer/.venv/
diff --git a/helpers/format.sh b/helpers/format.sh
new file mode 100755
index 0000000..06096f2
--- /dev/null
+++ b/helpers/format.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script formats the various files in this repository based on Google open source
+# style guidelines. This script is automatically called when running
+# "make fmt" at the root of the repository.
+#
+# NOTE: The files will be formatted in place.
+#
+# The following languages are currently supported:
+# - python (using yapf)
+# - go (using gofmt)
+# - terraform (using terraform fmt)
+# - java (using google-java-format)
+
+# temporary list of folders to exclude
+EXCLUDE_FOLDERS=$(cat helpers/exclusion_list.txt)
+
+while IFS= read -r -d '' FOLDER
+do
+ if [[ ! ${EXCLUDE_FOLDERS[*]} =~ $FOLDER ]]
+ then
+ echo "Formatting $FOLDER"
+
+ echo "Formatting python files (if any)"
+
+ FILES_TO_FORMAT=$(find "$FOLDER" -type f -name "*.py")
+ if [[ -n "$FILES_TO_FORMAT" ]]
+ then
+            # Format all python files in place, trying python2 yapf first and
+            # falling back to python3 yapf when the code isn't
+            # python2-compatible.
+            # shellcheck disable=SC2086
+            if ! python2 -m yapf -i -r --style google $FILES_TO_FORMAT > /dev/null 2>&1
+            then
+                # shellcheck disable=SC2086
+                python3 -m yapf -i -r --style google $FILES_TO_FORMAT > /dev/null
+            fi
+ else
+ echo "No python files found for $FOLDER - SKIP"
+ fi
+
+ echo "Formatting go files (if any)"
+ gofmt -w "$FOLDER"
+
+ echo "Formatting terraform files (if any)"
+    terraform fmt -recursive "$FOLDER"
+
+ echo "Formatting java files (if any)"
+
+ FILES_TO_FORMAT=$(find "$FOLDER" -type f -name "*.java")
+ if [[ -n "$FILES_TO_FORMAT" ]]
+ then
+ # format all java files in place
+            # shellcheck disable=SC2086
+            java -jar /usr/share/java/google-java-format-1.7-all-deps.jar -i $FILES_TO_FORMAT
+ else
+ echo "No java files found for $FOLDER - SKIP"
+ fi
+ fi
+done < <(find . -maxdepth 1 -mindepth 1 -type d -print0)
diff --git a/helpers/init_cloudshell.sh b/helpers/init_cloudshell.sh
new file mode 100755
index 0000000..56fdc63
--- /dev/null
+++ b/helpers/init_cloudshell.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+echo "downloading terragrunt"
+INSTALL_DIR=$(command -v terraform | sed s/terraform/terragrunt/g)
+wget https://github.com/gruntwork-io/terragrunt/releases/download/v0.23.25/terragrunt_linux_amd64
+mv terragrunt_linux_amd64 "$INSTALL_DIR"
+chmod +x "$INSTALL_DIR"
+echo "terragrunt install successful!"
+terragrunt -version
+
+echo "resetting to java 8"
+update-java-alternatives -s java-1.8.0-openjdk-amd64 && export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre
+java -version
diff --git a/helpers/init_git_repo.sh b/helpers/init_git_repo.sh
new file mode 100755
index 0000000..6555151
--- /dev/null
+++ b/helpers/init_git_repo.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+################################################################################
+# Checkout the repo for a PR and add the remote of the target branch #
+################################################################################
+
+# Fetches master branch from GitHub and "resets" local changes to be relative to it,
+# so we can diff what changed relatively to master branch.
+git init
+git config user.email "ia-tests@presubmit.example.com"
+git config user.name "ia-tests"
+
+git commit --allow-empty -m "empty commit"
+git remote add origin "${BASE_REPO_URL}"
+git fetch origin master
+
+# Fetch all PRs to get history for PRs created from forked repos
+git fetch origin +refs/pull/*/merge:refs/remotes/origin/pr/*
+
+git reset --hard "origin/pr/${PR_NUMBER}"
+
+if ! git rebase "origin/${BASE_BRANCH}"
+then
+ exit 1
+fi
+
+echo "successfully rebased PR #${PR_NUMBER} on master"
diff --git a/helpers/run_relevant_cloudbuilds.sh b/helpers/run_relevant_cloudbuilds.sh
new file mode 100755
index 0000000..6072582
--- /dev/null
+++ b/helpers/run_relevant_cloudbuilds.sh
@@ -0,0 +1,106 @@
+#!/bin/bash
+
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+################################################################################
+# Construct and submit a dynamic cloudbuild yaml file to run the nested #
+# cloud builds for directories containing changes according to git diff master.#
+# #
+# Arguments: #
+# $1 - file to search for (e.g. cloudbuild.yaml or precommit_cloudbuild.yaml) #
+# all subsequent args will be passed to gcloud builds submit commands #
+# https://cloud.google.com/sdk/gcloud/reference/builds/submit #
+# #
+# Example usage: #
+# ./run_relevant_cloudbuilds.sh precommit_cloudbuild.yaml #
+################################################################################
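+#
+# The generated build file looks roughly like this sketch (one step per
+# changed directory that contains the named cloudbuild file):
+#
+#   steps:
+#   - name: google/cloud-sdk
+#     entrypoint: 'gcloud'
+#     args: ['builds', 'submit', '.' , '--config=composer/precommit_cloudbuild.yaml']
+#     waitFor: ['-']
+#     id: 'composer'
+#   options:
+#     machineType: 'N1_HIGHCPU_8'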
+
+set -e
+
+COMMIT_SHA=$(git rev-parse HEAD)
+
+# list of changed files.
+DIFF=$(git diff --name-only origin/master)
+
+# Temporary file to define a dynamic cloud build based on changed files.
+PRE_COMMIT_BUILD=relevant-cloudbuilds-for-${COMMIT_SHA}.yaml
+
+# get a list of dirs containing cloud builds and the list of passed files.
+# $1 - cloudbuild filename to search for
+# $2 - list of files containing changes
+function find_relevant_cloud_build_dirs(){
+    DIRS_WITH_CLOUD_BUILD_PATTERN="(^$(find . -type f -path "./*/$1" -printf '%h|' | sed s#\\./##g | sed s/\|$//g))"
+    echo "$2" | grep -oP "$DIRS_WITH_CLOUD_BUILD_PATTERN" | sort | uniq
+}
+# utility for adding a line to the working build file
+function append_to_build(){
+ echo "$1" >> "$PRE_COMMIT_BUILD"
+}
+# initializes a cloud build file
+function init_build() {
+ touch "$PRE_COMMIT_BUILD"
+ append_to_build "steps:"
+}
+# loop through the diff and add a step to run each relevant nested cloud build.
+# $1 - cloudbuild file to look for
+# $2 - additional arguments for gcloud builds submit
+function construct_build(){
+ for DIR in $DIRS_WITH_DIFF_AND_BUILD
+ do
+ append_to_build '- name: google/cloud-sdk'
+ append_to_build " entrypoint: 'gcloud'"
+ if [ -z "$2" ]
+ then
+ append_to_build " args: ['builds', 'submit', '.' , '--config=$DIR/$1']"
+ else
+ append_to_build " args: ['builds', 'submit', '.' , '--config=$DIR/$1', '$2']"
+ fi
+ append_to_build " waitFor: ['-']" # run nested builds in parallel
+ append_to_build " id: '$DIR'"
+ done
+    # beef up resources for parallelization
+ append_to_build "options:"
+ append_to_build " machineType: 'N1_HIGHCPU_8'"
+}
+# run the cloud build created in this script
+function run() {
+ echo "running relevant pre-commits for $COMMIT_SHA"
+ cat "$PRE_COMMIT_BUILD"
+    # capture the status explicitly so "set -e" doesn't skip the cleanup.
+    BUILD_STATUS=0
+    gcloud builds submit . --config="$PRE_COMMIT_BUILD" || BUILD_STATUS=$?
+    # clean up
+    rm "$PRE_COMMIT_BUILD"
+    exit $BUILD_STATUS
+}
+
+function main(){
+ FILENAME="$1"
+ CLOUD_BUILD_EXTRA_ARGS="${*:2}"
+ echo "${CLOUD_BUILD_EXTRA_ARGS}"
+ DIRS_WITH_DIFF_AND_BUILD=$(find_relevant_cloud_build_dirs "$FILENAME" "$DIFF")
+ # If there are no cloudbuilds in dirs with diff we should not fail.
+ if [ -z "$DIRS_WITH_DIFF_AND_BUILD" ]
+ then
+ echo "no cloudbuilds to run."
+ exit 0
+ else
+ init_build
+ construct_build "$FILENAME" "${CLOUD_BUILD_EXTRA_ARGS[*]}"
+ run
+ fi
+ echo "all relevant cloudbuilds succeeded!"
+}
+
+main "$@"
\ No newline at end of file
diff --git a/source-code/build-pipeline/wait_for_dag_deployed.sh b/helpers/run_tests.sh
old mode 100644
new mode 100755
similarity index 57%
rename from source-code/build-pipeline/wait_for_dag_deployed.sh
rename to helpers/run_tests.sh
index 93c7184..b2f0798
--- a/source-code/build-pipeline/wait_for_dag_deployed.sh
+++ b/helpers/run_tests.sh
@@ -1,14 +1,12 @@
#!/bin/bash
-#
-# Script that waits for the specified Cloud Composer DAG to deploy.
-#
-# Copyright 2019 Google Inc.
+
+# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
-# https://www.apache.org/licenses/LICENSE-2.0
+# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
@@ -16,14 +14,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-n=0
-until [[ $n -ge $4 ]]
-do
- status=0
- gcloud composer environments run "${1}" --location "${2}" list_dags \
- 2>&1 >/dev/null | grep "${3}" && break
- status=$?
- n=$(($n+1))
- sleep "${5}"
-done
-exit $status
+set -e
+
+echo "running deploydags go tests..."
+if ! (cd ./composer/cloudbuild/go/dagsdeployer/internal/ && go vet ./... && go test ./...);
+then
+ echo "go tests for dags deployer failed"
+ exit 1
+fi
+
+echo "running dataflow java tests..."
+find ./dataflow/java/ -name pom.xml -execdir mvn test \;
diff --git a/license-templates/LICENSE.txt b/license-templates/LICENSE.txt
new file mode 100644
index 0000000..7748fc1
--- /dev/null
+++ b/license-templates/LICENSE.txt
@@ -0,0 +1,13 @@
+Copyright 2019 Google Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/precommit_cloudbuild.yaml b/precommit_cloudbuild.yaml
new file mode 100644
index 0000000..81d921d
--- /dev/null
+++ b/precommit_cloudbuild.yaml
@@ -0,0 +1,36 @@
+# Copyright 2019 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+steps:
+ - name: 'gcr.io/cloud-builders/git'
+ entrypoint: 'bash'
+ args: [
+ './helpers/init_git_repo.sh',
+ ]
+ env: [
+ 'COMMIT_SHA=${COMMIT_SHA}',
+ 'BASE_REPO_URL=https://github.com/jaketf/ci-cd-for-data-processing-workflow.git',
+ 'BASE_BRANCH=${_BASE_BRANCH}',
+ 'PR_NUMBER=${_PR_NUMBER}',
+ ]
+ id: 'checkout-pr-branch'
+ # Run linters
+ - name: 'gcr.io/${PROJECT_ID}/make'
+ args: ['test']
+ waitFor: ['checkout-pr-branch']
+ id: 'run-style-and-unit-tests'
+ - name: 'google/cloud-sdk'
+ waitFor: ['checkout-pr-branch']
+ entrypoint: 'bash'
+ args: ['./helpers/run_relevant_cloudbuilds.sh', 'precommit_cloudbuild.yaml']
+ id: 'run-relevant-cloud-builds'
diff --git a/source-code/build-pipeline/build_deploy_test.yaml b/source-code/build-pipeline/build_deploy_test.yaml
deleted file mode 100644
index e28e6bb..0000000
--- a/source-code/build-pipeline/build_deploy_test.yaml
+++ /dev/null
@@ -1,57 +0,0 @@
-# Copyright 2019 Google Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-steps:
-- name: gcr.io/cloud-builders/git
- args: ['clone', 'https://source.developers.google.com/p/$PROJECT_ID/r/$REPO_NAME']
- id: 'check-out-source-code'
-- name: gcr.io/cloud-builders/mvn
- args: ['package', '-q']
- dir: '$REPO_NAME/data-processing-code'
- id: 'build-jar'
-- name: gcr.io/cloud-builders/gsutil
- args: ['cp', '*bundled*.jar', 'gs://${_DATAFLOW_JAR_BUCKET}/dataflow_deployment_$BUILD_ID.jar']
- dir: '$REPO_NAME/data-processing-code/target'
- id: 'deploy-jar'
-- name: apache/airflow:master
- entrypoint: 'python'
- args: ['test_compare_xcom_maps.py']
- dir: '$REPO_NAME/workflow-dag'
- id: 'unit-test-on-operator-code'
-- name: gcr.io/cloud-builders/gsutil
- args: ['cp', 'support-files/input.txt', 'gs://${_COMPOSER_INPUT_BUCKET}']
- dir: '$REPO_NAME/workflow-dag'
- id: 'deploy-test-input-file'
-- name: gcr.io/cloud-builders/gsutil
- args: ['cp', 'support-files/ref.txt', 'gs://${_COMPOSER_REF_BUCKET}']
- dir: '$REPO_NAME/workflow-dag'
- id: 'deploy-test-ref-file'
-- name: gcr.io/cloud-builders/gcloud
- args: ['composer', 'environments', 'run', '${_COMPOSER_ENV_NAME}', '--location', '${_COMPOSER_REGION}','variables', '--', '--set', 'dataflow_jar_file_test', 'dataflow_deployment_$BUILD_ID.jar']
- id: 'set-composer-jar-ref'
-- name: gcr.io/cloud-builders/gsutil
- args: ['cp', 'compare_xcom_maps.py', '${_COMPOSER_DAG_BUCKET}']
- dir: '$REPO_NAME/workflow-dag'
- id: 'deploy-custom-operator'
-- name: gcr.io/cloud-builders/gsutil
- args: ['cp', 'data-pipeline-test.py', '${_COMPOSER_DAG_BUCKET}']
- dir: '$REPO_NAME/workflow-dag'
- id: 'deploy-processing-pipeline'
-- name: gcr.io/cloud-builders/gcloud
- entrypoint: 'bash'
- args: ['wait_for_dag_deployed.sh', '${_COMPOSER_ENV_NAME}', '${_COMPOSER_REGION}', '${_COMPOSER_DAG_NAME_TEST}', '6', '20']
- dir: '$REPO_NAME/build-pipeline'
- id: 'wait-for-dag-deployed-on-composer'
-- name: gcr.io/cloud-builders/gcloud
- args: ['composer', 'environments', 'run', '${_COMPOSER_ENV_NAME}', '--location', '${_COMPOSER_REGION}', 'trigger_dag', '--', '${_COMPOSER_DAG_NAME_TEST}', '--run_id=$BUILD_ID']
- id: 'trigger-pipeline-execution'
diff --git a/source-code/build-pipeline/deploy_prod.yaml b/source-code/build-pipeline/deploy_prod.yaml
deleted file mode 100644
index fc43e4f..0000000
--- a/source-code/build-pipeline/deploy_prod.yaml
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright 2019 Google Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-steps:
-- name: gcr.io/cloud-builders/gsutil
- args: ['cp', 'gs://${_DATAFLOW_JAR_BUCKET_TEST}/${_DATAFLOW_JAR_FILE_LATEST}', 'gs://${_DATAFLOW_JAR_BUCKET_PROD}/dataflow_deployment_$BUILD_ID.jar']
- id: 'deploy-jar-to-prod'
-- name: gcr.io/cloud-builders/git
- args: ['clone', 'https://source.developers.google.com/p/$PROJECT_ID/r/$REPO_NAME']
- id: 'check-out-source-code'
-- name: gcr.io/cloud-builders/gsutil
- args: ['cp', 'support-files/input.txt', 'gs://${_COMPOSER_INPUT_BUCKET}']
- dir: '$REPO_NAME/workflow-dag'
- id: 'deploy-input-file'
-- name: gcr.io/cloud-builders/gcloud
- args: ['composer', 'environments', 'run', '${_COMPOSER_ENV_NAME}', '--location', '${_COMPOSER_REGION}','variables', '--', '--set', 'dataflow_jar_file_prod', 'dataflow_deployment_$BUILD_ID.jar']
- id: 'set-composer-jar-ref'
-- name: gcr.io/cloud-builders/gsutil
- args: ['cp', 'data-pipeline-prod.py', '${_COMPOSER_DAG_BUCKET}']
- dir: '$REPO_NAME/workflow-dag'
- id: 'deploy-processing-pipeline'
-- name: gcr.io/cloud-builders/gcloud
- entrypoint: 'bash'
- args: ['wait_for_dag_deployed.sh', '${_COMPOSER_ENV_NAME}', '${_COMPOSER_REGION}', '${_COMPOSER_DAG_NAME_PROD}', '6', '20']
- dir: '$REPO_NAME/build-pipeline'
- id: 'wait-for-dag-deployed-on-composer'
diff --git a/source-code/data-processing-code/src/main/java/org/apache/beam/examples/WordCount.java b/source-code/data-processing-code/src/main/java/org/apache/beam/examples/WordCount.java
deleted file mode 100644
index afcd838..0000000
--- a/source-code/data-processing-code/src/main/java/org/apache/beam/examples/WordCount.java
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- * Copyright 2019 Google Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.examples;
-
-import org.apache.beam.sdk.Pipeline;
-import org.apache.beam.sdk.io.TextIO;
-import org.apache.beam.sdk.metrics.Counter;
-import org.apache.beam.sdk.metrics.Distribution;
-import org.apache.beam.sdk.metrics.Metrics;
-import org.apache.beam.sdk.options.Default;
-import org.apache.beam.sdk.options.Description;
-import org.apache.beam.sdk.options.PipelineOptions;
-import org.apache.beam.sdk.options.PipelineOptionsFactory;
-import org.apache.beam.sdk.options.Validation.Required;
-import org.apache.beam.sdk.transforms.Count;
-import org.apache.beam.sdk.transforms.DoFn;
-import org.apache.beam.sdk.transforms.MapElements;
-import org.apache.beam.sdk.transforms.PTransform;
-import org.apache.beam.sdk.transforms.ParDo;
-import org.apache.beam.sdk.transforms.SimpleFunction;
-import org.apache.beam.sdk.values.KV;
-import org.apache.beam.sdk.values.PCollection;
-
-/**
- * An example that counts words in Shakespeare and includes Beam best practices.
- *
- * This class, {@link WordCount}, is the second in a series of four successively more detailed
- * 'word count' examples. You may first want to take a look at {@link MinimalWordCount}. After
- * you've looked at this example, then see the {@link DebuggingWordCount} pipeline, for introduction
- * of additional concepts.
- *
- *
For a detailed walkthrough of this example, see
- * https://beam.apache.org/get-started/wordcount-example/
- *
- *
Basic concepts, also in the MinimalWordCount example: Reading text files; counting a
- * PCollection; writing to text files
- *
- *
New Concepts:
- *
- *
- * 1. Executing a Pipeline both locally and using the selected runner
- * 2. Using ParDo with static DoFns defined out-of-line
- * 3. Building a composite transform
- * 4. Defining your own pipeline options
- *
- *
- * <p>Concept #1: you can execute this pipeline either locally or by selecting another runner.
- * These are now command-line options and not hard-coded as they were in the MinimalWordCount
- * example.
- *
- * <p>To change the runner, specify:
- *
- * <pre>{@code
- * --runner=YOUR_SELECTED_RUNNER
- * }</pre>
- *
- * <p>To execute this pipeline, specify a local output file (if using the {@code DirectRunner}) or
- * output prefix on a supported distributed file system.
- *
- * <pre>{@code
- * --output=[YOUR_LOCAL_FILE | YOUR_OUTPUT_PREFIX]
- * }</pre>
- *
- * <p>The input file defaults to a public data set containing the text of King Lear, by William
- * Shakespeare. You can override it and choose your own input with {@code --inputFile}.
- */
-public class WordCount {
-
- /**
- * Concept #2: You can make your pipeline assembly code less verbose by defining your DoFns
- * statically out-of-line. This DoFn tokenizes lines of text into individual words; we pass it to
- * a ParDo in the pipeline.
- */
- static class ExtractWordsFn extends DoFn<String, String> {
- private final Counter emptyLines = Metrics.counter(ExtractWordsFn.class, "emptyLines");
- private final Distribution lineLenDist =
- Metrics.distribution(ExtractWordsFn.class, "lineLenDistro");
- private static final String TOKENIZER_PATTERN = "[^\\p{L}]+";
-
- @ProcessElement
- public void processElement(@Element String element, OutputReceiver<String> receiver) {
- lineLenDist.update(element.length());
- if (element.trim().isEmpty()) {
- emptyLines.inc();
- }
-
- // Split the line into words.
- String[] words = element.split(TOKENIZER_PATTERN, -1);
-
- // Output each word encountered into the output PCollection.
- for (String word: words) {
- if (!word.isEmpty()) {
- receiver.output(word);
- }
- }
- }
- }
-
- /** A SimpleFunction that converts a Word and Count into a printable string. */
- public static class FormatAsTextFn extends SimpleFunction<KV<String, Long>, String> {
- @Override
- public String apply(KV<String, Long> input) {
- return input.getKey() + ": " + input.getValue();
- }
- }
-
- /**
- * A PTransform that converts a PCollection containing lines of text into a PCollection of
- * formatted word counts.
- *
- * <p>Concept #3: This is a custom composite transform that bundles two transforms (ParDo and
- * Count) as a reusable PTransform subclass. Using composite transforms allows for easy reuse,
- * modular testing, and an improved monitoring experience.
- */
- public static class CountWords
- extends PTransform<PCollection<String>, PCollection<KV<String, Long>>> {
- @Override
- public PCollection<KV<String, Long>> expand(PCollection<String> lines) {
-
- // Convert lines of text into individual words.
- PCollection<String> words = lines.apply(ParDo.of(new ExtractWordsFn()));
-
- // Count the number of times each word occurs.
- PCollection<KV<String, Long>> wordCounts = words.apply(Count.perElement());
-
- return wordCounts;
- }
- }
-
- /**
- * Options supported by {@link WordCount}.
- *
- * <p>Concept #4: Defining your own configuration options. Here, you can add your own arguments to
- * be processed by the command-line parser, and specify default values for them. You can then
- * access the options values in your pipeline code.
- *
- * <p>Inherits standard configuration options.
- */
- public interface WordCountOptions extends PipelineOptions {
-
- /**
- * By default, this example reads from a public dataset containing the text of King Lear. Set
- * this option to choose a different input file or glob.
- */
- @Description("Path of the file to read from")
- @Default.String("gs://apache-beam-samples/shakespeare/kinglear.txt")
- String getInputFile();
-
- void setInputFile(String value);
-
- /** Set this required option to specify where to write the output. */
- @Description("Path of the file to write to")
- @Required
- String getOutput();
-
- void setOutput(String value);
- }
-
- static void runWordCount(WordCountOptions options) {
- Pipeline p = Pipeline.create(options);
-
- // Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the
- // static FormatAsTextFn() to the ParDo transform.
- p.apply("ReadLines", TextIO.read().from(options.getInputFile()))
- .apply(new CountWords())
- .apply(MapElements.via(new FormatAsTextFn()))
- .apply("WriteCounts", TextIO.write().to(options.getOutput()));
-
- p.run().waitUntilFinish();
- }
-
- public static void main(String[] args) {
- WordCountOptions options =
- PipelineOptionsFactory.fromArgs(args).withValidation().as(WordCountOptions.class);
- runWordCount(options);
- }
-}
diff --git a/source-code/workflow-dag/compare_xcom_maps.py b/source-code/workflow-dag/compare_xcom_maps.py
deleted file mode 100644
index 9ad28e8..0000000
--- a/source-code/workflow-dag/compare_xcom_maps.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# Copyright 2019 Google Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Custom operator that compares dictionaries in xcom.
-"""
-
-from airflow.models import BaseOperator
-from airflow.utils.decorators import apply_defaults
-
-
-class CompareXComMapsOperator(BaseOperator):
- """Compare dictionary stored in xcom.
-
- Args:
- ref_task_ids: list of task ids from where the reference dictionary
- is fetched
- res_task_ids: list of task ids from where the comparing dictionary
- is fetched
- """
-
- @apply_defaults
- def __init__(
- self,
- ref_task_ids,
- res_task_ids,
- *args, **kwargs):
- super(CompareXComMapsOperator, self).__init__(*args, **kwargs)
- self.ref_task_ids = ref_task_ids
- self.res_task_ids = res_task_ids
-
- def execute(self, context):
- ref_obj = self.read_value_as_obj(self.ref_task_ids, context)
- res_obj = self.read_value_as_obj(self.res_task_ids, context)
- self.compare_obj(ref_obj, res_obj)
- return 'result contains the expected values'
-
- def read_value_as_obj(self, task_ids, context):
- ret_obj = {}
- for task_id in task_ids:
- value_str = context['ti'].xcom_pull(
- key=None,
- task_ids=task_id)
- self.parse_str_obj(value_str, ret_obj)
- return ret_obj
-
- def parse_str_obj(self, str_rep, obj):
- entries = str_rep.split('\n')
- for entry in entries:
- if entry:
- key, value = entry.split(': ')
- obj[key] = value
-
- def compare_obj(self, ref_obj, res_obj):
- if ref_obj != res_obj:
- raise ValueError(self.create_diff_str(ref_obj, res_obj))
-
- def create_diff_str(self, ref_obj, res_obj):
- msg = 'The result differs from the expected in the following ways:'
- for k in ref_obj:
- if k not in res_obj:
- msg = msg + ('\nmissing key: %s in result' % k)
- elif ref_obj[k] != res_obj[k]:
- msg = msg + ('\nexpected %s: %s but got %s: %s' % (
- k, ref_obj[k], k, res_obj[k]))
- for k in res_obj:
- if k not in ref_obj:
- msg = msg + ('\nunexpected key: %s in result' % k)
- return msg
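(Since this diff deletes the operator, here is a minimal sketch of its string contract for reviewers; the task ids and values are illustrative assumptions, and the expected message mirrors the unit tests deleted below.)

```python
# Minimal sketch of the deleted operator's parsing and diff-message contract.
# Task ids and values here are hypothetical.
from compare_xcom_maps import CompareXComMapsOperator

op = CompareXComMapsOperator(
    task_id='demo_comparison',
    ref_task_ids=['download_ref_string'],
    res_task_ids=['download_result_1'])

ref, res = {}, {}
# Each XCom value is expected to be newline-separated 'key: value' lines.
op.parse_str_obj('a: 1\nb: 2', ref)
op.parse_str_obj('a: 1\nb: 4', res)
print(op.create_diff_str(ref, res))
# The result differs from the expected in the following ways:
# expected b: 2 but got b: 4
```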
diff --git a/source-code/workflow-dag/data-pipeline-prod.py b/source-code/workflow-dag/data-pipeline-prod.py
deleted file mode 100644
index 1f0f993..0000000
--- a/source-code/workflow-dag/data-pipeline-prod.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# Copyright 2019 Google Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Data processing production workflow definition.
-"""
-import datetime
-from airflow import models
-from airflow.contrib.operators.dataflow_operator import DataFlowJavaOperator
-
-dataflow_staging_bucket = 'gs://%s/staging' % (
- models.Variable.get('dataflow_staging_bucket_prod'))
-
-dataflow_jar_location = 'gs://%s/%s' % (
- models.Variable.get('dataflow_jar_location_prod'),
- models.Variable.get('dataflow_jar_file_prod'))
-
-project = models.Variable.get('gcp_project')
-region = models.Variable.get('gcp_region')
-zone = models.Variable.get('gcp_zone')
-input_bucket = 'gs://' + models.Variable.get('gcs_input_bucket_prod')
-output_bucket_name = models.Variable.get('gcs_output_bucket_prod')
-output_bucket = 'gs://' + output_bucket_name
-output_prefix = 'output'
-download_task_prefix = 'download_result'
-
-yesterday = datetime.datetime.combine(
- datetime.datetime.today() - datetime.timedelta(1),
- datetime.datetime.min.time())
-
-default_args = {
- 'dataflow_default_options': {
- 'project': project,
- 'zone': zone,
- 'region': region,
- 'stagingLocation': dataflow_staging_bucket
- }
-}
-
-with models.DAG(
- 'prod_word_count',
- schedule_interval=None,
- default_args=default_args) as dag:
- dataflow_execution = DataFlowJavaOperator(
- task_id='wordcount-run',
- jar=dataflow_jar_location,
- start_date=yesterday,
- options={
- 'autoscalingAlgorithm': 'THROUGHPUT_BASED',
- 'maxNumWorkers': '3',
- 'inputFile': input_bucket+'/input.txt',
- 'output': output_bucket+'/'+output_prefix
- }
- )
diff --git a/source-code/workflow-dag/data-pipeline-test.py b/source-code/workflow-dag/data-pipeline-test.py
deleted file mode 100644
index 17da7d2..0000000
--- a/source-code/workflow-dag/data-pipeline-test.py
+++ /dev/null
@@ -1,111 +0,0 @@
-# Copyright 2019 Google Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Data processing test workflow definition.
-"""
-import datetime
-from airflow import models
-from airflow.contrib.operators.dataflow_operator import DataFlowJavaOperator
-from airflow.contrib.operators.gcs_download_operator import GoogleCloudStorageDownloadOperator
-from compare_xcom_maps import CompareXComMapsOperator
-
-dataflow_staging_bucket = 'gs://%s/staging' % (
- models.Variable.get('dataflow_staging_bucket_test'))
-
-dataflow_jar_location = 'gs://%s/%s' % (
- models.Variable.get('dataflow_jar_location_test'),
- models.Variable.get('dataflow_jar_file_test'))
-
-project = models.Variable.get('gcp_project')
-region = models.Variable.get('gcp_region')
-zone = models.Variable.get('gcp_zone')
-input_bucket = 'gs://' + models.Variable.get('gcs_input_bucket_test')
-output_bucket_name = models.Variable.get('gcs_output_bucket_test')
-output_bucket = 'gs://' + output_bucket_name
-ref_bucket = models.Variable.get('gcs_ref_bucket_test')
-output_prefix = 'output'
-download_task_prefix = 'download_result'
-
-yesterday = datetime.datetime.combine(
- datetime.datetime.today() - datetime.timedelta(1),
- datetime.datetime.min.time())
-
-default_args = {
- 'dataflow_default_options': {
- 'project': project,
- 'zone': zone,
- 'region': region,
- 'stagingLocation': dataflow_staging_bucket
- }
-}
-
-with models.DAG(
- 'test_word_count',
- schedule_interval=None,
- default_args=default_args) as dag:
- dataflow_execution = DataFlowJavaOperator(
- task_id='wordcount-run',
- jar=dataflow_jar_location,
- start_date=yesterday,
- options={
- 'autoscalingAlgorithm': 'THROUGHPUT_BASED',
- 'maxNumWorkers': '3',
- 'inputFile': input_bucket+'/input.txt',
- 'output': output_bucket+'/'+output_prefix
- }
- )
- download_expected = GoogleCloudStorageDownloadOperator(
- task_id='download_ref_string',
- bucket=ref_bucket,
- object='ref.txt',
- store_to_xcom_key='ref_str',
- start_date=yesterday
- )
- download_result_one = GoogleCloudStorageDownloadOperator(
- task_id=download_task_prefix+'_1',
- bucket=output_bucket_name,
- object=output_prefix+'-00000-of-00003',
- store_to_xcom_key='res_str_1',
- start_date=yesterday
- )
- download_result_two = GoogleCloudStorageDownloadOperator(
- task_id=download_task_prefix+'_2',
- bucket=output_bucket_name,
- object=output_prefix+'-00001-of-00003',
- store_to_xcom_key='res_str_2',
- start_date=yesterday
- )
- download_result_three = GoogleCloudStorageDownloadOperator(
- task_id=download_task_prefix+'_3',
- bucket=output_bucket_name,
- object=output_prefix+'-00002-of-00003',
- store_to_xcom_key='res_str_3',
- start_date=yesterday
- )
- compare_result = CompareXComMapsOperator(
- task_id='do_comparison',
- ref_task_ids=['download_ref_string'],
- res_task_ids=[download_task_prefix+'_1',
- download_task_prefix+'_2',
- download_task_prefix+'_3'],
- start_date=yesterday
- )
-
- dataflow_execution >> download_result_one
- dataflow_execution >> download_result_two
- dataflow_execution >> download_result_three
-
- download_expected >> compare_result
- download_result_one >> compare_result
- download_result_two >> compare_result
- download_result_three >> compare_result
diff --git a/source-code/workflow-dag/test_compare_xcom_maps.py b/source-code/workflow-dag/test_compare_xcom_maps.py
deleted file mode 100644
index 9da06c2..0000000
--- a/source-code/workflow-dag/test_compare_xcom_maps.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# Copyright 2019 Google Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Unit test of the CompareXComMapsOperator.
-"""
-import unittest
-from compare_xcom_maps import CompareXComMapsOperator
-import mock
-
-TASK_ID = 'test_compare_task_id'
-REF_TASK_ID = 'download_ref_string'
-DOWNLOAD_TASK_PREFIX = 'download_result'
-CONTEXT_CLASS_NAME = 'airflow.ti_deps.dep_context'
-ERROR_LINE_ONE = 'The result differs from the expected in the following ways:\n'
-
-
-def generate_mock_function(first_value, second_value, third_value):
- def mock_function(**kwargs):
- return {
- REF_TASK_ID: 'a: 1\nb: 2\nc: 3',
- DOWNLOAD_TASK_PREFIX+'_1': first_value,
- DOWNLOAD_TASK_PREFIX+'_2': second_value,
- DOWNLOAD_TASK_PREFIX+'_3': third_value
- }[kwargs['task_ids']]
- return mock_function
-
-
-def equal_mock():
- return generate_mock_function('c: 3', 'b: 2', 'a: 1')
-
-
-def missing_value_mock():
- return generate_mock_function('b: 2', 'a: 1', 'b: 2')
-
-
-def wrong_value_mock():
- return generate_mock_function('a: 1', 'b: 4', 'c: 3')
-
-
-def unexpected_value_mock():
- return generate_mock_function('a: 1', 'c: 3\nd: 4', 'b: 2')
-
-
-class CompareXComMapsOperatorTest(unittest.TestCase):
-
- def setUp(self):
- super(CompareXComMapsOperatorTest, self).setUp()
- self.xcom_compare = CompareXComMapsOperator(
- task_id=TASK_ID,
- ref_task_ids=[REF_TASK_ID],
- res_task_ids=[DOWNLOAD_TASK_PREFIX+'_1',
- DOWNLOAD_TASK_PREFIX+'_2',
- DOWNLOAD_TASK_PREFIX+'_3'])
-
- def test_init(self):
- self.assertEqual(self.xcom_compare.task_id, TASK_ID)
- self.assertListEqual(self.xcom_compare.ref_task_ids, [REF_TASK_ID])
- self.assertListEqual(self.xcom_compare.res_task_ids,
- [DOWNLOAD_TASK_PREFIX+'_1',
- DOWNLOAD_TASK_PREFIX+'_2',
- DOWNLOAD_TASK_PREFIX+'_3'])
-
- def assertRaisesWithMessage(self, error_type, msg, func, *args, **kwargs):
- with self.assertRaises(error_type) as context:
- func(*args, **kwargs)
- self.assertEqual(msg, str(context.exception))
-
- def execute_value_error(self, mock_func, error_expect_tr):
- with mock.patch(CONTEXT_CLASS_NAME) as context_mock:
- context_mock['ti'].xcom_pull = mock_func
- self.assertRaisesWithMessage(
- ValueError,
- error_expect_tr,
- self.xcom_compare.execute, context_mock)
-
- def test_equal(self):
- with mock.patch(CONTEXT_CLASS_NAME) as context_mock:
- context_mock['ti'].xcom_pull = equal_mock()
- self.xcom_compare.execute(context_mock)
-
- def test_missing_value(self):
- self.execute_value_error(
- missing_value_mock(),
- '{}{}'.format(ERROR_LINE_ONE, 'missing key: c in result'))
-
- def test_wrong_value(self):
- self.execute_value_error(
- wrong_value_mock(),
- '{}{}'.format(ERROR_LINE_ONE, 'expected b: 2 but got b: 4'))
-
- def test_unexpected_value(self):
- self.execute_value_error(
- unexpected_value_mock(),
- '{}{}'.format(ERROR_LINE_ONE, 'unexpected key: d in result'))
-
-suite = unittest.TestLoader().loadTestsFromTestCase(CompareXComMapsOperatorTest)
-unittest.TextTestRunner(verbosity=2).run(suite)
diff --git a/terraform/.gitignore b/terraform/.gitignore
new file mode 100644
index 0000000..9eb783f
--- /dev/null
+++ b/terraform/.gitignore
@@ -0,0 +1,44 @@
+# Terragrunt
+.terragrunt-cache/
+
+# OSX leaves these everywhere on SMB shares
+._*
+
+# OSX trash
+.DS_Store
+
+# Python
+*.pyc
+
+# Emacs save files
+*~
+\#*\#
+.\#*
+
+# Vim-related files
+[._]*.s[a-w][a-z]
+[._]s[a-w][a-z]
+*.un~
+Session.vim
+.netrwhist
+
+### https://raw.github.com/github/gitignore/90f149de451a5433aebd94d02d11b0e28843a1af/Terraform.gitignore
+
+# Local .terraform directories
+**/.terraform/*
+
+# .tfstate files
+*.tfstate
+*.tfstate.*
+
+# Crash log files
+crash.log
+
+# Kitchen files
+**/inspec.lock
+**/.kitchen
+**/kitchen.local.yml
+**/Gemfile.lock
+
+
+credentials.json
diff --git a/terraform/README.md b/terraform/README.md
new file mode 100644
index 0000000..3251d08
--- /dev/null
+++ b/terraform/README.md
@@ -0,0 +1,9 @@
+# Data Pipelines CI/CD Terraform IaC
+This defines the Terraform for the CI, artifacts, and prod projects.
+For more details on the inputs and outputs, see the READMEs in
+the artifacts and datapipelines-infra modules.
+
+## Running
+```
+terragrunt apply-all
+```
diff --git a/terraform/artifacts/README.md b/terraform/artifacts/README.md
new file mode 100644
index 0000000..5f9ccb7
--- /dev/null
+++ b/terraform/artifacts/README.md
@@ -0,0 +1,29 @@
+# Artifacts and Cloud Build
+The Terraform in this directory manages the infrastructure for building and storing
+artifacts that are built in the CI environment and used in the production environment.
+## Requirements
+
+No requirements.
+
+## Providers
+
+| Name | Version |
+|------|---------|
+| google | n/a |
+| google-beta | n/a |
+
+## Inputs
+
+| Name | Description | Type | Default | Required |
+|------|-------------|------|---------|:--------:|
+| ci\_composer\_env | n/a | `string` | `""` | no |
+| ci\_project | Continuous Integration Project which pushes artifacts | `any` | n/a | yes |
+| prod\_project | Production project which pulls artifacts | `any` | n/a | yes |
+| project\_id | Project ID for your GCP project to store artifacts | `any` | n/a | yes |
+
+## Outputs
+
+| Name | Description |
+|------|-------------|
+| dataflow\_artifacts\_bucket | n/a |
+
diff --git a/terraform/artifacts/backend.tf b/terraform/artifacts/backend.tf
new file mode 100644
index 0000000..3af586a
--- /dev/null
+++ b/terraform/artifacts/backend.tf
@@ -0,0 +1,7 @@
+# Generated by Terragrunt. Sig: nIlQXj57tbuaRZEa
+terraform {
+ backend "gcs" {
+ bucket = "datapipelines-terraform-state"
+ prefix = "terraform_state/artifacts/terraform.tfstate"
+ }
+}
diff --git a/terraform/artifacts/main.tf b/terraform/artifacts/main.tf
new file mode 100644
index 0000000..4a89de2
--- /dev/null
+++ b/terraform/artifacts/main.tf
@@ -0,0 +1,130 @@
+# Copyright 2019 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+module "project-services" {
+ source = "terraform-google-modules/project-factory/google//modules/project_services"
+ version = "7.1.0"
+
+ project_id = var.project_id
+
+ activate_apis = [
+ "compute.googleapis.com",
+ "cloudbuild.googleapis.com",
+ "sourcerepo.googleapis.com",
+ "artifactregistry.googleapis.com",
+ "containerregistry.googleapis.com",
+ "containerscanning.googleapis.com",
+ "storage-component.googleapis.com",
+ "storage-api.googleapis.com",
+ "pubsub.googleapis.com",
+ "stackdriver.googleapis.com",
+ ]
+}
+
+module "artifacts-buckets" {
+ source = "terraform-google-modules/cloud-storage/google"
+ version = "~> 1.6"
+ project_id = var.project_id
+ location = "US"
+ names = ["dataflow"]
+ prefix = var.project_id
+ set_admin_roles = true
+ admins = [""]
+ versioning = {
+ first = true
+ }
+}
+
+resource "google_cloudbuild_trigger" "ci-pre-commit-trigger" {
+ provider = google-beta
+ description = "Datapipelines Pre Commit"
+ project = var.ci_project
+
+ github {
+ owner = "jaketf"
+ name = "ci-cd-for-data-processing-workflow"
+ pull_request {
+ branch = ".*"
+ }
+ }
+
+ filename = "precommit_cloudbuild.yaml"
+}
+
+resource "google_cloudbuild_trigger" "ci-post-commit-trigger" {
+ provider = google-beta
+ description = "Data Pipelines Post Commit"
+ project = var.ci_project
+
+ github {
+ owner = "jaketf"
+ name = "ci-cd-for-data-processing-workflow"
+ pull_request {
+ branch = ".*"
+ comment_control = "COMMENTS_ENABLED"
+ }
+ }
+
+ substitutions = {
+ _COMPOSER_ENV_NAME = var.ci_composer_env
+ _COMPOSER_REGION = var.ci_composer_region
+ _DATAFLOW_JAR_BUCKET = var.dataflow_jars_bucket
+ _DATAFLOW_STAGING_BUCKET = "${var.ci_project}-us-dataflow_staging"
+ _COMPOSER_DAG_BUCKET = var.ci_composer_dags_bucket
+ _WORDCOUNT_INPUT_BUCKET = "${var.ci_project}-us-wordcount_input"
+ _WORDCOUNT_RESULT_BUCKET = "${var.ci_project}-us-wordcount_result"
+ _WORDCOUNT_REF_BUCKET = "${var.ci_project}-us-wordcount_ref"
+ _ARTIFACTS_PROJECT_ID = var.project_id
+ _DATAFLOW_ARTIFACTS_BUCKET = module.artifacts-buckets.names_list[0]
+ }
+
+ filename = "cloudbuild.yaml"
+}
+
+resource "google_project_iam_member" "ci-cloudbuild-composer-user" {
+ project = var.ci_project
+ role = "roles/composer.user"
+ member = "serviceAccount:${data.google_project.ci.number}@cloudbuild.gserviceaccount.com"
+}
+
+resource "google_project_iam_member" "ci-cloudbuild-containers-developer" {
+ project = var.ci_project
+ role = "roles/container.admin"
+ member = "serviceAccount:${data.google_project.ci.number}@cloudbuild.gserviceaccount.com"
+}
+
+resource "google_project_iam_member" "ci-cloudbuild-artifact-admin" {
+ project = var.project_id
+ role = "roles/storage.admin"
+ member = "serviceAccount:${var.push_sa}"
+}
+
+resource "google_project_iam_member" "cloudbuild-artifact-reader" {
+ project = var.project_id
+ role = "roles/storage.objectViewer"
+ member = "serviceAccount:${data.google_project.ci.number}@cloudbuild.gserviceaccount.com"
+}
+
+data google_project "ci" {
+ project_id = var.ci_project
+}
+
+data google_project "artifacts" {
+ project_id = var.project_id
+}
+
+data google_project "prod" {
+ project_id = var.project_id
+}
+
diff --git a/terraform/artifacts/outputs.tf b/terraform/artifacts/outputs.tf
new file mode 100644
index 0000000..198f2f6
--- /dev/null
+++ b/terraform/artifacts/outputs.tf
@@ -0,0 +1,4 @@
+output "dataflow_artifacts_bucket" {
+ value = module.artifacts-buckets.buckets[0]
+}
+
diff --git a/terraform/artifacts/terragrunt.hcl b/terraform/artifacts/terragrunt.hcl
new file mode 100644
index 0000000..0a32d86
--- /dev/null
+++ b/terraform/artifacts/terragrunt.hcl
@@ -0,0 +1,34 @@
+/**
+ * Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+include {
+ path = find_in_parent_folders()
+}
+
+dependency "ci" {
+ config_path = "../ci"
+}
+
+inputs = {
+ project_id = "datapipelines-artifacts"
+ ci_project = trimprefix(dependency.ci.outputs.project.id,"projects/")
+ ci_composer_env = dependency.ci.outputs.composer-env-name
+ ci_composer_region = dependency.ci.outputs.composer-region
+ ci_composer_dags_bucket = dependency.ci.outputs.composer-dags-bucket
+ dataflow_jars_bucket = dependency.ci.outputs.dataflow-jars-bucket
+ dataflow_staging_bucket = dependency.ci.outputs.dataflow-staging-bucket
+ push_sa = dependency.ci.outputs.cloudbuild-sa
+}
diff --git a/terraform/artifacts/variables.tf b/terraform/artifacts/variables.tf
new file mode 100644
index 0000000..d485fce
--- /dev/null
+++ b/terraform/artifacts/variables.tf
@@ -0,0 +1,31 @@
+variable "project_id" {
+ description = "Project ID for your GCP project to store artifacts"
+}
+
+variable "ci_project" {
+ description = "Continuous Integration Project which pushes artifacts"
+}
+
+variable "ci_composer_env" {
+ description = "CI Cloud Composer environment"
+}
+
+variable "ci_composer_region" {
+ description = "CI compute region for Cloud Composer"
+}
+
+variable "ci_composer_dags_bucket" {
+ description = "GSC location for Dags for CI Cloud Composer environment"
+}
+
+variable "dataflow_jars_bucket" {
+ description = "CI tests will pick up Dataflow JARs from here"
+}
+
+variable "dataflow_staging_bucket" {
+ description = "CI tests will run Dataflow jobs with this staging bucket"
+}
+
+variable "push_sa" {
+ description = "service account responsible for pushing artifacts. this is typically the cloud build SA in the CI project."
+}
diff --git a/terraform/backend.tf b/terraform/backend.tf
new file mode 100644
index 0000000..07d0536
--- /dev/null
+++ b/terraform/backend.tf
@@ -0,0 +1,7 @@
+# Generated by Terragrunt. Sig: nIlQXj57tbuaRZEa
+terraform {
+ backend "gcs" {
+ bucket = "datapipelines-ci-tfstate"
+ prefix = "./terraform.tfstate"
+ }
+}
diff --git a/terraform/ci/terragrunt.hcl b/terraform/ci/terragrunt.hcl
new file mode 100644
index 0000000..31d9b0f
--- /dev/null
+++ b/terraform/ci/terragrunt.hcl
@@ -0,0 +1,37 @@
+/**
+ * Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+include {
+ path = find_in_parent_folders()
+}
+
+locals {
+ env = "ci"
+}
+
+terraform {
+ source = "${get_terragrunt_dir()}/../datapipelines-infra"
+}
+
+inputs = {
+ project_id = "datapipelines-ci-282719"
+ network_name = "datapipelines-net"
+ composer_region = "us-central1"
+ composer_subnet = "composer-subnet"
+ composer_env_name = "datapipelines-orchestration"
+ env = local.env
+ artifacts_project = "datapipelines-artifacts"
+}
diff --git a/terraform/datapipelines-infra/README.md b/terraform/datapipelines-infra/README.md
new file mode 100644
index 0000000..7d4d92d
--- /dev/null
+++ b/terraform/datapipelines-infra/README.md
@@ -0,0 +1,36 @@
+# Datapipelines Infrastructure Module
+Module to DRY up infrastructure for CI and prod datapipelines environments.
+
+## Requirements
+
+| Name | Version |
+|------|---------|
+| terraform | >= 0.12 |
+
+## Providers
+
+| Name | Version |
+|------|---------|
+| google | n/a |
+
+## Inputs
+
+| Name | Description | Type | Default | Required |
+|------|-------------|------|---------|:--------:|
+| artifacts\_project | project to push artifacts for successful post commits | `any` | n/a | yes |
+| composer\_env\_name | Composer Environment name | `string` | `"datapipelines-orchestration"` | no |
+| composer\_region | Region for your composer environment | `string` | `"us-central1"` | no |
+| composer\_subnet | Name for composer subnetwork to create | `string` | `"composer-subnet"` | no |
+| env | Environment name, e.g. dev, test, prod | `string` | `""` | no |
+| network\_name | The network your data pipelines should use | `string` | `"datapipelines-net"` | no |
+| project\_id | Project ID for your GCP project to run CI tests | `any` | n/a | yes |
+
+## Outputs
+
+| Name | Description |
+|------|-------------|
+| cloudbuild-sa | The Cloud Build SA for the project created by this module |
+| composer-env | The Cloud Composer Environment created by this module |
+| project | The project created by this module |
+| vpc | The VPC network created by this module |
+
diff --git a/terraform/datapipelines-infra/composer.tf b/terraform/datapipelines-infra/composer.tf
new file mode 100644
index 0000000..7b8f13b
--- /dev/null
+++ b/terraform/datapipelines-infra/composer.tf
@@ -0,0 +1,81 @@
+# Copyright 2019 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+locals {
+ max_threads = 2 * var.composer_num_cpus
+ worker_concurrency = 6 * var.composer_num_cpus
+ parallelism = (6 * var.composer_num_cpus) * var.composer_node_count
+}
+
+resource "google_composer_environment" "orchestration" {
+ project = var.project_id
+ name = var.composer_env_name
+ region = var.composer_region
+
+ config {
+ node_count = var.composer_node_count
+
+ software_config {
+ image_version = "composer-1.10.6-airflow-1.10.6"
+ python_version = "3"
+
+ airflow_config_overrides = {
+      # Improves stability when deleting DAGs.
+ core-dags_are_paused_at_creation = "True"
+ # Number of processes to process DAG files
+ # estimate = 2*num_cpu_per_node
+ scheduler-max_threads = tostring(local.max_threads)
+ # Number of celery processes per Airflow worker
+      # estimate = num_dags * num_tasks_per_dag * execution_duration_per_task / dag_scheduling_period / num_airflow_workers
+ # |----------------------------------- total time needed ------------------------------------|
+ # or estimate = num_cpu_per_node * 6
+ # use lesser of the two estimates
+ celery-worker_concurrency = tostring(local.worker_concurrency)
+ # The amount of parallelism as a setting to the executor. This defines the max number of task instances that should run simultaneously
+ # estimate = worker_concurrency * num_airflow_workers
+ core-parallelism = tostring(local.parallelism)
+ # The number of task instances allowed to run concurrently by the scheduler
+ # estimate = parallelism
+ core-dag_concurrency = tostring(local.parallelism)
+ # When not using pools, tasks are run in the "default pool", whose size is guided by this config element
+ # estimate = parallelism
+ core-non_pooled_task_slot_count = tostring(local.parallelism)
+ core-store_serialized_dags = "True"
+ }
+ }
+
+ node_config {
+ zone = "us-central1-f"
+ machine_type = "n1-highmem-${var.composer_num_cpus}"
+ disk_size_gb = "30"
+ network = module.vpc.network_self_link
+ subnetwork = module.vpc.subnets["${var.composer_region}/${var.composer_subnet}"].self_link
+ }
+ }
+
+ depends_on = [google_project_iam_member.composer-worker]
+}
+
+resource "google_service_account" "composer_sa" {
+ project = var.project_id
+ account_id = "composer-env-account"
+ display_name = "Service Account for Composer Environment"
+}
+
+resource "google_project_iam_member" "composer-worker" {
+ project = var.project_id
+ role = "roles/composer.worker"
+ member = "serviceAccount:${google_service_account.composer_sa.email}"
+}
+
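(The sizing comments above encode rules of thumb; below is a small sketch of the same arithmetic, assuming the defaults from variables.tf in this module: composer_num_cpus = 8, composer_node_count = 3.)

```python
# Sketch of the Airflow sizing heuristics in composer.tf; the inputs
# (8 CPUs per node, 3 nodes) are the defaults in variables.tf.
def composer_sizing(num_cpus, node_count):
    worker_concurrency = 6 * num_cpus  # celery processes per Airflow worker
    return {
        'scheduler-max_threads': 2 * num_cpus,  # DAG-file processing threads
        'celery-worker_concurrency': worker_concurrency,
        # max task instances allowed to run simultaneously, cluster-wide
        'core-parallelism': worker_concurrency * node_count,
    }

print(composer_sizing(8, 3))
# {'scheduler-max_threads': 16, 'celery-worker_concurrency': 48,
#  'core-parallelism': 144}
```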
diff --git a/terraform/datapipelines-infra/gcs.tf b/terraform/datapipelines-infra/gcs.tf
new file mode 100644
index 0000000..a2b6cfc
--- /dev/null
+++ b/terraform/datapipelines-infra/gcs.tf
@@ -0,0 +1,53 @@
+module "data_buckets" {
+ source = "terraform-google-modules/cloud-storage/google"
+ version = "~> 1.6"
+ project_id = var.project_id
+ location = "US"
+
+ prefix = var.project_id
+
+ names = [
+ "wordcount_input",
+ "wordcount_result",
+ "wordcount_ref",
+ ]
+
+ versioning = {
+ first = true
+ }
+
+ creators = [
+ "serviceAccount:${google_service_account.composer_sa.email}",
+ ]
+
+ viewers = [
+ "serviceAccount:${google_service_account.composer_sa.email}",
+ ]
+}
+
+module "dataflow_buckets" {
+ source = "terraform-google-modules/cloud-storage/google"
+ version = "~> 1.6"
+ project_id = var.project_id
+ location = "US"
+
+ prefix = var.project_id
+
+ names = [
+ "dataflow_jars",
+ "dataflow_staging",
+ ]
+
+ versioning = {
+ first = true
+ }
+
+ creators = [
+ "serviceAccount:${google_service_account.composer_sa.email}",
+ ]
+
+ viewers = [
+ "serviceAccount:${google_service_account.composer_sa.email}",
+ ]
+}
+
diff --git a/terraform/datapipelines-infra/network.tf b/terraform/datapipelines-infra/network.tf
new file mode 100644
index 0000000..cc8b05b
--- /dev/null
+++ b/terraform/datapipelines-infra/network.tf
@@ -0,0 +1,50 @@
+# Copyright 2019 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+module "vpc" {
+ source = "terraform-google-modules/network/google"
+ version = "~> 2.3"
+
+ project_id = var.project_id
+ network_name = var.network_name
+ routing_mode = "GLOBAL"
+
+ subnets = [
+ {
+ subnet_name = var.composer_subnet
+ subnet_ip = "10.2.0.0/16"
+ subnet_region = "us-central1"
+ subnet_private_access = "true"
+ description = "Subnet to house Cloud Composer Environment"
+ },
+ {
+ subnet_name = "dataflow-subnet"
+ subnet_ip = "10.3.0.0/16"
+ subnet_region = "us-central1"
+ subnet_private_access = "true"
+ subnet_flow_logs = "true"
+ description = "Subnet for Cloud Dataflow workers"
+ },
+ ]
+
+ routes = [
+ {
+ name = "egress-internet"
+ description = "route through IGW to access internet"
+ destination_range = "0.0.0.0/0"
+ tags = "egress-inet"
+ next_hop_internet = "true"
+ }
+ ]
+}
diff --git a/terraform/datapipelines-infra/outputs.tf b/terraform/datapipelines-infra/outputs.tf
new file mode 100644
index 0000000..ad47afe
--- /dev/null
+++ b/terraform/datapipelines-infra/outputs.tf
@@ -0,0 +1,41 @@
+data google_project "project" {
+ project_id = var.project_id
+}
+output "vpc" {
+ value = module.vpc
+ description = "The VPC network created by this module"
+}
+
+output "composer-region" {
+ value = var.composer_region
+}
+
+output "composer-env-name" {
+ value = google_composer_environment.orchestration.name
+ description = "The Cloud Composer Environment created by this module"
+}
+
+output "composer-dags-bucket" {
+ value = trimsuffix(google_composer_environment.orchestration.config[0].dag_gcs_prefix, "dags")
+ description = "The Cloud Composer Environment created by this module"
+}
+
+output "cloudbuild-sa" {
+ value = "${data.google_project.project.number}@cloudbuild.gserviceaccount.com"
+ description = "The Cloud build SA for the project created by this module"
+}
+
+output "dataflow-jars-bucket" {
+ value = module.dataflow_buckets.names_list[0]
+ description = "Bucket where composer pulls Dataflow JARs from"
+}
+
+output "dataflow-staging-bucket" {
+ value = module.dataflow_buckets.names_list[1]
+ description = "Staging bucket where for dataflow jobs"
+}
+
+output "project" {
+ value = data.google_project.project
+ description = "The project created by this module"
+}
diff --git a/terraform/datapipelines-infra/prod.tfvars b/terraform/datapipelines-infra/prod.tfvars
new file mode 100644
index 0000000..d13f112
--- /dev/null
+++ b/terraform/datapipelines-infra/prod.tfvars
@@ -0,0 +1,7 @@
+project_id = "datapipelines-prod"
+project_num = "715427528296"
+network_name = "datapipelines-net"
+composer_region = "us-central1"
+composer_subnet = "composer-subnet"
+composer_env_name = "datapipelines-orchestration"
+env = "prod"
diff --git a/terraform/datapipelines-infra/services.tf b/terraform/datapipelines-infra/services.tf
new file mode 100644
index 0000000..34915fd
--- /dev/null
+++ b/terraform/datapipelines-infra/services.tf
@@ -0,0 +1,32 @@
+# Copyright 2019 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+module "project-services" {
+ source = "terraform-google-modules/project-factory/google//modules/project_services"
+ version = "7.1.0"
+
+ project_id = var.project_id
+
+ activate_apis = [
+ "compute.googleapis.com",
+ "iam.googleapis.com",
+ "cloudbuild.googleapis.com",
+ "sourcerepo.googleapis.com",
+ "composer.googleapis.com",
+ "dataflow.googleapis.com",
+ "pubsub.googleapis.com",
+ "stackdriver.googleapis.com",
+ ]
+}
+
diff --git a/terraform/datapipelines-infra/terragrunt.hcl b/terraform/datapipelines-infra/terragrunt.hcl
new file mode 100644
index 0000000..f598fa1
--- /dev/null
+++ b/terraform/datapipelines-infra/terragrunt.hcl
@@ -0,0 +1,19 @@
+/**
+ * Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+include {
+ path = find_in_parent_folders()
+}
diff --git a/terraform/datapipelines-infra/variables.tf b/terraform/datapipelines-infra/variables.tf
new file mode 100644
index 0000000..b79478f
--- /dev/null
+++ b/terraform/datapipelines-infra/variables.tf
@@ -0,0 +1,58 @@
+# Copyright 2019 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+variable "project_id" {
+ description = "Project ID for your GCP project to run CI tests"
+}
+
+variable "artifacts_project" {
+ description = "project to push artifacts for successful post commits"
+}
+
+variable "network_name" {
+ description = "The network your data pipelines should use"
+ default = "datapipelines-net"
+}
+
+variable "composer_region" {
+ description = "Region for your composer environment"
+ default = "us-central1"
+}
+
+variable "composer_subnet" {
+ description = "Name for composer subnetwork to create"
+ default = "composer-subnet"
+}
+
+variable "composer_env_name" {
+ description = "Composer Environment name"
+ default = "datapipelines-orchestration"
+}
+
+variable "composer_node_count" {
+ description = "Composer Environment node count"
+ type = number
+ default = 3
+}
+
+variable "composer_num_cpus" {
+ description = "Composer Environment node count. This should be 2,4,6,16,32,64."
+ type = number
+ default = 8
+}
+
+variable "env" {
+ description = "Environment name ie. dev, test, prod"
+ default = ""
+}
diff --git a/terraform/datapipelines-infra/versions.tf b/terraform/datapipelines-infra/versions.tf
new file mode 100644
index 0000000..ac97c6a
--- /dev/null
+++ b/terraform/datapipelines-infra/versions.tf
@@ -0,0 +1,4 @@
+
+terraform {
+ required_version = ">= 0.12"
+}
diff --git a/terraform/prod/terragrunt.hcl b/terraform/prod/terragrunt.hcl
new file mode 100644
index 0000000..ef21471
--- /dev/null
+++ b/terraform/prod/terragrunt.hcl
@@ -0,0 +1,37 @@
+/**
+ * Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+include {
+ path = find_in_parent_folders()
+}
+
+locals {
+ env = "prod"
+}
+
+terraform {
+ source = "${get_terragrunt_dir()}/../datapipelines-infra"
+}
+
+inputs = {
+ project_id = "datapipelines-${local.env}"
+ network_name = "datapipelines-net"
+ composer_region = "us-central1"
+ composer_subnet = "composer-subnet"
+ composer_env_name = "datapipelines-orchestration"
+ env = local.env
+ artifacts_project = "datapipelines-artifacts"
+}
diff --git a/terraform/terragrunt.hcl b/terraform/terragrunt.hcl
new file mode 100644
index 0000000..e319261
--- /dev/null
+++ b/terraform/terragrunt.hcl
@@ -0,0 +1,31 @@
+/**
+ * Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+remote_state {
+ backend = "gcs"
+ generate = {
+ path = "backend.tf"
+ if_exists = "overwrite"
+ }
+ config = {
+ bucket = "datapipelines-terraform-state"
+ project = "datapipelines-prod"
+ location = "us"
+ prefix = "terraform_state/${path_relative_to_include()}/terraform.tfstate"
+ }
+}
+
+