Skip to content

Commit

Permalink
move tileset commands to pipelines
Browse files Browse the repository at this point in the history
add merge command
  • Loading branch information
raphaellaude committed Sep 21, 2024
1 parent 1c7e129 commit 6ce61f7
Show file tree
Hide file tree
Showing 5 changed files with 245 additions and 97 deletions.
92 changes: 0 additions & 92 deletions backend/cli.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import os
from typing import Iterable
import click
import logging

Expand Down Expand Up @@ -173,97 +172,6 @@ def delete_parent_child_edges(districtr_map: str):
session.close()


@cli.command("create-gerrydb-tileset")
@click.option(
    "--layer", "-n", help="Name of the layer in the gerrydb view to load", required=True
)
@click.option(
    "--gpkg",
    "-g",
    help="Path or URL to GeoPackage file. If URL, must be s3 URI",
    required=True,
)
@click.option("--replace", "-f", help="Replace files they exist", is_flag=True)
@click.option(
    "--column",
    "-c",
    help="Column to include in tileset",
    multiple=True,
    default=[
        "path",
        "geography",
        "total_pop",
    ],
)
def create_gerrydb_tileset(
    layer: str, gpkg: str, replace: bool, column: Iterable[str]
) -> None:
    """
    Create a tileset from a GeoPackage file. Does not upload the tileset to S3. Use the s3 cli for that.

    Note: this command is intended to be run locally. The server doesn't have the tippecanoe dependency. That's
    intentional for now as we don't want to burden the server with memory intensive tasks.
    \f
    Everything after the form feed above is hidden from `--help` output.

    Steps:
        1. If `gpkg` is an `s3://` URI, download it via `download_file_from_s3`.
        2. Convert `layer` to a FlatGeobuf at `{settings.VOLUME_PATH}/{layer}.fgb`
           with `ogr2ogr` (skipped when the file exists and `replace` is false).
        3. Build `{settings.VOLUME_PATH}/{layer}.pmtiles` with `tippecanoe`.

    Args:
        layer: Layer name inside the GeoPackage; also used to name the outputs.
        gpkg: Local path or `s3://` URI of the GeoPackage.
        replace: Overwrite existing intermediate and output files.
        column: Columns passed to `ogr2ogr -select`.

    Raises:
        ValueError: If an S3 URI is given but no S3 client is available, or if
            `ogr2ogr` or `tippecanoe` exits with a non-zero return code.
    """
    logger.info("Creating GerryDB tileset...")
    s3 = settings.get_s3_client()

    url = urlparse(gpkg)
    logger.info("URL: %s", url)

    # Local paths are used as-is; only s3:// URIs are downloaded first.
    path = gpkg

    if url.scheme == "s3":
        # Explicit raise rather than `assert`: assertions are stripped when
        # Python runs with -O, which would hide a missing client.
        if not s3:
            raise ValueError("S3 client is not available")
        path = download_file_from_s3(s3, url, replace)

    fbg_path = f"{settings.VOLUME_PATH}/{layer}.fgb"
    logger.info("Creating flatgeobuf...")
    if os.path.exists(fbg_path) and not replace:
        logger.info("File already exists. Skipping creation.")
    else:
        result = subprocess.run(
            args=[
                "ogr2ogr",
                "-f",
                "FlatGeobuf",
                "-select",
                ",".join(column),
                "-t_srs",
                "EPSG:4326",
                fbg_path,
                path,
                layer,
            ]
        )

        if result.returncode != 0:
            logger.error("ogr2ogr failed. Got %s", result)
            raise ValueError(f"ogr2ogr failed with return code {result.returncode}")

    logger.info("Creating tileset...")
    tileset_path = f"{settings.VOLUME_PATH}/{layer}.pmtiles"

    args = [
        "tippecanoe",
        "-zg",
        "--coalesce-smallest-as-needed",
        "--extend-zooms-if-still-dropping",
        "-o",
        tileset_path,
        "-l",
        layer,
        fbg_path,
    ]
    # Only pass --force (overwrite the output tileset) when the caller asked
    # for replacement.
    if replace:
        args.append("--force")

    result = subprocess.run(args=args)

    if result.returncode != 0:
        logger.error("tippecanoe failed. Got %s", result)
        raise ValueError(f"tippecanoe failed with return code {result.returncode}")


@cli.command("create-districtr-map")
@click.option("--name", help="Name of the districtr map", required=True)
@click.option("--parent-layer-name", help="Parent gerrydb layer name", required=True)
Expand Down
6 changes: 4 additions & 2 deletions pipelines/simple_elt/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,16 @@ Dependencies are managed with uv as noted in the root README. Follow set-up inst

### Geospatial libraries

#### GDAL

Follow the [installation instructions](https://docs.djangoproject.com/en/5.0/ref/contrib/gis/install/geolibs/) for GeoDjango. Although we are not using Django, those instructions are very useful and kept up to date.

You'll need `ogr2ogr`, which is part of GDAL. You can verify that it is installed properly with `which ogr2ogr` or `ogr2ogr --version`.

### DuckDB
#### DuckDB

Follow [DuckDB installation instructions](https://duckdb.org/docs/installation/)

### Tippecanoe
#### Tippecanoe

Follow [Tippecanoe installation instructions](https://github.com/felt/tippecanoe?tab=readme-ov-file#installation).
45 changes: 43 additions & 2 deletions pipelines/simple_elt/files.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,48 @@
import os
import zipfile
from urllib.request import urlretrieve
from urllib.parse import urlparse
from urllib.parse import urlparse, ParseResult
from pathlib import Path
import zipfile
from settings import settings

import logging

# Module-level logger for the file helpers. Its level is set to DEBUG so the
# debug messages emitted by the helpers below are not filtered by this logger.
LOGGER = logging.getLogger(__name__)
LOGGER.setLevel(logging.DEBUG)


def download_file_from_s3(s3, url: ParseResult, replace=False) -> str:
    """
    Download a file from S3 into the local scratch directory.

    The object key (the URL path without its leading slash) is reused as the
    relative path under `settings.OUT_SCRATCH`, so a key such as
    `layers/blocks.gpkg` ends up in a `layers/` sub-directory.

    Args:
        s3: S3 client
        url (ParseResult): Parsed `s3://bucket/key` URL of the file to download
        replace (bool): If True, download again even if the local file exists

    Returns:
        The local path of the downloaded (or already present) file.

    Raises:
        ValueError: If no S3 client is given, or if the HEAD request for the
            object does not report HTTP status 200.
    """
    if not s3:
        raise ValueError("S3 client is not available")

    file_name = url.path.lstrip("/")
    LOGGER.debug("File name: %s", file_name)
    object_information = s3.head_object(Bucket=url.netloc, Key=file_name)
    LOGGER.debug("Object information: %s", object_information)

    # NOTE(review): the client may itself raise for a missing key before this
    # check is reached -- confirm against the client in use.
    if object_information["ResponseMetadata"]["HTTPStatusCode"] != 200:
        raise ValueError(
            f"GeoPackage file {file_name} not found in S3 bucket {url.netloc}"
        )

    path = os.path.join(settings.OUT_SCRATCH, file_name)
    LOGGER.debug("Path: %s", path)

    if not os.path.exists(path) or replace:
        # Keys with prefixes map to nested local paths; make sure the parent
        # directory exists so the download does not fail on a fresh scratch dir.
        parent_dir = os.path.dirname(path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        LOGGER.debug("Downloading file...")
        s3.download_file(url.netloc, file_name, path)

    return path


def download_and_unzip_zipfile(zip_file_url: str, out_dir: Path | str) -> Path:
Expand Down
145 changes: 144 additions & 1 deletion pipelines/simple_elt/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@
import logging
from urllib.parse import urlparse
from subprocess import run
from typing import Iterable

from files import download_and_unzip_zipfile, exists_in_s3
from files import download_and_unzip_zipfile, exists_in_s3, download_file_from_s3
from settings import settings

# NOTE(review): presumably the Census TIGER/Line vintage used by the download
# commands in this module -- its usage is outside this view; confirm.
TIGER_YEAR = 2023
Expand Down Expand Up @@ -168,5 +169,147 @@ def wi_blocks():
)


@cli.command("create-gerrydb-tileset")
@click.option(
    "--layer", "-n", help="Name of the layer in the gerrydb view to load", required=True
)
@click.option(
    "--gpkg",
    "-g",
    help="Path or URL to GeoPackage file. If URL, must be s3 URI",
    required=True,
)
@click.option("--replace", "-f", help="Replace files they exist", is_flag=True)
@click.option(
    "--column",
    "-c",
    help="Column to include in tileset",
    multiple=True,
    default=[
        "path",
        "geography",
        "total_pop",
    ],
)
def create_gerrydb_tileset(
    layer: str, gpkg: str, replace: bool, column: Iterable[str]
) -> None:
    """
    Create a tileset from a GeoPackage file. Does not upload the tileset to S3. Use the s3 cli for that.
    \f
    Everything after the form feed above is hidden from `--help` output.

    Steps:
        1. If `gpkg` is an `s3://` URI, download it via `download_file_from_s3`.
        2. Convert `layer` to a FlatGeobuf at `{settings.OUT_SCRATCH}/{layer}.fgb`
           with `ogr2ogr` (skipped when the file exists and `replace` is false).
        3. Build `{settings.OUT_SCRATCH}/{layer}.pmtiles` with `tippecanoe`.

    Args:
        layer: Layer name inside the GeoPackage; also used to name the outputs.
        gpkg: Local path or `s3://` URI of the GeoPackage.
        replace: Overwrite existing intermediate and output files.
        column: Columns passed to `ogr2ogr -select`.

    Raises:
        ValueError: If an S3 URI is given but no S3 client is available, or if
            `ogr2ogr` or `tippecanoe` exits with a non-zero return code.
    """
    LOGGER.info("Creating GerryDB tileset...")
    s3 = settings.get_s3_client()

    url = urlparse(gpkg)
    LOGGER.info("URL: %s", url)

    # Local paths are used as-is; only s3:// URIs are downloaded first.
    path = gpkg

    if url.scheme == "s3":
        # Explicit raise rather than `assert`: assertions are stripped when
        # Python runs with -O, which would hide a missing client.
        if not s3:
            raise ValueError("S3 client is not available")
        path = download_file_from_s3(s3, url, replace)

    fbg_path = f"{settings.OUT_SCRATCH}/{layer}.fgb"
    LOGGER.info("Creating flatgeobuf...")
    if os.path.exists(fbg_path) and not replace:
        LOGGER.info("File already exists. Skipping creation.")
    else:
        result = run(
            args=[
                "ogr2ogr",
                "-f",
                "FlatGeobuf",
                "-select",
                ",".join(column),
                "-t_srs",
                "EPSG:4326",
                fbg_path,
                path,
                layer,
            ]
        )

        if result.returncode != 0:
            LOGGER.error("ogr2ogr failed. Got %s", result)
            raise ValueError(f"ogr2ogr failed with return code {result.returncode}")

    LOGGER.info("Creating tileset...")
    tileset_path = f"{settings.OUT_SCRATCH}/{layer}.pmtiles"

    args = [
        "tippecanoe",
        "-zg",
        "--coalesce-smallest-as-needed",
        "--extend-zooms-if-still-dropping",
        "-o",
        tileset_path,
        "-l",
        layer,
        fbg_path,
    ]
    # Only pass --force (overwrite the output tileset) when the caller asked
    # for replacement.
    if replace:
        args.append("--force")

    result = run(args=args)

    if result.returncode != 0:
        LOGGER.error("tippecanoe failed. Got %s", result)
        raise ValueError(f"tippecanoe failed with return code {result.returncode}")


def _resolve_layer_path(s3, layer: str, label: str, replace: bool) -> str:
    """Return a local path for `layer`, downloading it first if it is an s3:// URI."""
    url = urlparse(layer)
    LOGGER.info("%s URL: %s", label, url)

    if url.scheme != "s3":
        return layer

    # Explicit raise rather than `assert`: assertions are stripped when Python
    # runs with -O, which would hide a missing client.
    if not s3:
        raise ValueError("S3 client is not available")
    return download_file_from_s3(s3, url, replace)


@cli.command("merge-gerrydb-tilesets")
@click.option("--out-name", "-o", help="Name of the output tileset", required=True)
@click.option(
    "--parent-layer",
    help="Path to the parent layer to load. Can be an S3 URI",
    required=True,
)
@click.option(
    "--child-layer",
    help="Path to the child layer to load. Can be an S3 URI",
    required=True,
)
@click.option("--replace", "-f", help="Replace files they exist", is_flag=True)
def merge_gerrydb_tilesets(
    out_name: str, parent_layer: str, child_layer: str, replace: bool
) -> None:
    """
    Merge two tilesets. Does not upload the tileset to S3. Use the s3 cli for that.
    \f
    Everything after the form feed above is hidden from `--help` output.

    Both layers are resolved to local paths (s3:// URIs are downloaded via
    `download_file_from_s3`) and then joined with `tile-join` into
    `{settings.OUT_SCRATCH}/{out_name}.pmtiles`.

    Args:
        out_name: Base name (without extension) of the merged tileset.
        parent_layer: Local path or `s3://` URI of the parent tileset.
        child_layer: Local path or `s3://` URI of the child tileset.
        replace: Passed to the S3 download helper to re-download existing
            files. The merged output is always overwritten, because
            `tile-join` is invoked with `--force`.

    Raises:
        ValueError: If an S3 URI is given but no S3 client is available, or if
            `tile-join` exits with a non-zero return code.
    """
    LOGGER.info("Merging GerryDB tilesets...")

    s3 = settings.get_s3_client()

    parent_path = _resolve_layer_path(s3, parent_layer, "Parent", replace)
    child_path = _resolve_layer_path(s3, child_layer, "Child", replace)

    result = run(
        [
            "tile-join",
            "-o",
            f"{settings.OUT_SCRATCH}/{out_name}.pmtiles",
            parent_path,
            child_path,
            "--overzoom",
            "--force",
        ]
    )

    # Surface tile-join failures the same way the tileset creation command
    # surfaces ogr2ogr/tippecanoe failures, instead of exiting silently.
    if result.returncode != 0:
        LOGGER.error("tile-join failed. Got %s", result)
        raise ValueError(f"tile-join failed with return code {result.returncode}")


# Script entry point: run the `cli` command group when this module is executed
# directly rather than imported.
if __name__ == "__main__":
    cli()
Loading

0 comments on commit 6ce61f7

Please sign in to comment.