From 44f0d49a8208ef696b2d3daf8c1207673391ce80 Mon Sep 17 00:00:00 2001
From: Alena
Date: Wed, 5 Jun 2024 14:58:50 +0200
Subject: [PATCH 1/2] google secrets demo

---
 secrets-providers-demo/.dlt/config.toml       |  6 ++
 .../.dlt/example.secrets.toml                 |  4 +
 secrets-providers-demo/dlt_pipeline.py        | 83 +++++++++++++++++++
 3 files changed, 93 insertions(+)
 create mode 100644 secrets-providers-demo/.dlt/config.toml
 create mode 100644 secrets-providers-demo/.dlt/example.secrets.toml
 create mode 100644 secrets-providers-demo/dlt_pipeline.py

diff --git a/secrets-providers-demo/.dlt/config.toml b/secrets-providers-demo/.dlt/config.toml
new file mode 100644
index 0000000..79d1ef5
--- /dev/null
+++ b/secrets-providers-demo/.dlt/config.toml
@@ -0,0 +1,6 @@
+# put your configuration values here
+
+[runtime]
+log_level="WARNING" # the system log level of dlt
+# use the dlthub_telemetry setting to enable/disable anonymous usage data reporting, see https://dlthub.com/docs/telemetry
+dlthub_telemetry = true
diff --git a/secrets-providers-demo/.dlt/example.secrets.toml b/secrets-providers-demo/.dlt/example.secrets.toml
new file mode 100644
index 0000000..801962e
--- /dev/null
+++ b/secrets-providers-demo/.dlt/example.secrets.toml
@@ -0,0 +1,4 @@
+[google_secrets.credentials]
+"project_id" = ""
+"private_key" = "-----BEGIN PRIVATE KEY-----\n....\n-----END PRIVATE KEY-----\n"
+"client_email" = "....gserviceaccount.com"
diff --git a/secrets-providers-demo/dlt_pipeline.py b/secrets-providers-demo/dlt_pipeline.py
new file mode 100644
index 0000000..96aa109
--- /dev/null
+++ b/secrets-providers-demo/dlt_pipeline.py
@@ -0,0 +1,83 @@
+import json
+
+import dlt
+import requests
+from dlt.common.configuration.inject import with_config
+from dlt.common.configuration.specs import GcpServiceAccountCredentials
+from google.cloud import secretmanager
+
+
+@with_config(sections=("google_secrets",))
+def get_secret_dict(
+    secret_id: str, credentials: GcpServiceAccountCredentials = dlt.secrets.value
+) -> dict:
+    """
+    Retrieve a secret from Google Cloud Secret Manager and convert to a dictionary.
+
+    Args:
+        secret_id (str): ID of the secret to retrieve.
+        credentials (GcpServiceAccountCredentials): Credentials for accessing the secret manager.
+
+    Returns:
+        dict: The secret data as a dictionary.
+    """
+    # Create the Secret Manager client with provided credentials
+    client = secretmanager.SecretManagerServiceClient(
+        credentials=credentials.to_native_credentials()
+    )
+    # Build the resource name of the secret version
+    name = f"projects/{credentials.project_id}/secrets/{secret_id}/versions/latest"
+
+    # Access the secret version
+    response = client.access_secret_version(request={"name": name})
+    # Decode the payload to a string and convert it to a dictionary
+    secret_string = response.payload.data.decode("UTF-8")
+    secret_dict = json.loads(secret_string)
+
+    return secret_dict
+
+
+@dlt.resource()
+def get_repositories(
+    api_token: str = dlt.secrets.value, organization: str = dlt.secrets.value
+):
+    """
+    Retrieve repositories of a specified organization from GitHub.
+
+    Follows the pagination links returned by the API, so every repository of
+    the organization is retrieved rather than only the first page.
+
+    Args:
+        api_token (str): GitHub API token for authentication.
+        organization (str): The GitHub organization from which to retrieve repositories.
+
+    Yields:
+        list: One page of repositories for the specified organization.
+    """
+    BASE_URL = "https://api.github.com"
+    # Ask for the largest page size GitHub allows to keep the number of requests low
+    url = f"{BASE_URL}/orgs/{organization}/repos?per_page=100"
+    headers = {
+        "Authorization": f"token {api_token}",
+        "Accept": "application/vnd.github+json",
+    }
+
+    while url:
+        # A timeout keeps the pipeline from hanging forever on a stalled connection
+        response = requests.get(url, headers=headers, timeout=30)
+        response.raise_for_status()  # Ensure that a HTTP error is raised for bad responses
+        yield response.json()
+        # GitHub announces the next page in the Link header; it is absent on the last page
+        url = response.links.get("next", {}).get("url")
+
+
+if __name__ == "__main__":
+    secret_data = get_secret_dict("temp-secret")
+    data = get_repositories(api_token=secret_data["api_token"], organization="dlt-hub")
+
+    pipeline = dlt.pipeline(
+        pipeline_name="quick_start", destination="duckdb", dataset_name="mydata"
+    )
+    load_info = pipeline.run(data, table_name="repos")
+
+    print(load_info)

From 58e7468d4405e7d987f9589f29494b3e6b475dc1 Mon Sep 17 00:00:00 2001
From: Alena
Date: Wed, 5 Jun 2024 15:10:50 +0200
Subject: [PATCH 2/2] add readme

---
 secrets-providers-demo/README.md              | 30 +++++++++++++++++++
 ...py => dlt_with_google_secrets_pipeline.py} |  0
 2 files changed, 30 insertions(+)
 create mode 100644 secrets-providers-demo/README.md
 rename secrets-providers-demo/{dlt_pipeline.py => dlt_with_google_secrets_pipeline.py} (100%)

diff --git a/secrets-providers-demo/README.md b/secrets-providers-demo/README.md
new file mode 100644
index 0000000..e800b45
--- /dev/null
+++ b/secrets-providers-demo/README.md
@@ -0,0 +1,30 @@
+# Use `dlt` with Cloud Secrets Vaults
+
+## Google Cloud Secret Manager
+To retrieve secrets from Google Cloud Secret Manager using Python, and convert them into a dictionary format, you'll need to follow these steps. First, ensure that you have the necessary permissions to access the secrets on Google Cloud, and have the `google-cloud-secret-manager` library installed. If not, you can install it using pip:
+
+```bash
+pip install google-cloud-secret-manager
+```
+[Google Docs](https://cloud.google.com/secret-manager/docs/reference/libraries)
+
+Here's how you can retrieve secrets and convert them into a dictionary:
+
+1. **Set up the Secret Manager client**: Create a client that will interact with the Secret Manager API.
+2. **Access the secret**: Use the client to access the secret's latest version.
+3. **Convert to a dictionary**: If the secret is stored in a structured format (like JSON), parse it into a Python dictionary.
+
+Assume we store secrets in JSON format:
+```json
+{"api_token": "ghp_Kskdgf98dugjf98ghd...."}
+```
+
+In the script `dlt_with_google_secrets_pipeline.py` you can find an example of how to use Google Secrets in `dlt` pipelines.
+
+### Points to Note:
+
+- **Permissions**: Ensure the service account or user credentials you are using have the necessary permissions to access the Secret Manager and the specific secrets.
+- **Secret Format**: This example assumes that the secret is stored in a JSON string format. If your secret is in a different format, you will need to adjust the parsing method accordingly.
+- **Google Cloud Authentication**: Make sure your environment is authenticated with Google Cloud. This can typically be done by setting credentials in `.dlt/secrets.toml` or setting the `GOOGLE_SECRETS__CREDENTIALS` environment variable to the path of your service account key file or the dict of credentials as a string.
+
+With this setup, you can effectively retrieve secrets stored in Google Cloud Secret Manager and use them in your `dlt` pipelines as dictionaries.
\ No newline at end of file
diff --git a/secrets-providers-demo/dlt_pipeline.py b/secrets-providers-demo/dlt_with_google_secrets_pipeline.py
similarity index 100%
rename from secrets-providers-demo/dlt_pipeline.py
rename to secrets-providers-demo/dlt_with_google_secrets_pipeline.py