From 87cdbc79d695b1652b6ff7668368a903dabb3e3f Mon Sep 17 00:00:00 2001
From: Tristan Damron
Date: Tue, 13 Aug 2024 17:00:32 -0700
Subject: [PATCH] Use Tabor for DBBackups

---
 .gitmodules                                   |  3 -
 Dockerfiles/Dockerfile.geocml-desktop         |  3 +
 Dockerfiles/Dockerfile.geocml-task-scheduler  | 32 +++++--
 .../geocml-task-scheduler-playbook.yaml       | 34 +++++++
 .../geocml-task-scheduler-requirements.yaml   | 12 +++
 .../geocml-task-scheduler/backup_geocml_db.py | 95 ++++++++-----------
 .../restore_geocml_db_from_backups.py         | 69 +++++++++-----
 .../geocml-task-scheduler/schedule.py         |  2 +-
 docker-compose.yml                            |  3 +-
 9 files changed, 156 insertions(+), 97 deletions(-)
 delete mode 100644 .gitmodules
 create mode 100644 ansible-playbooks/geocml-task-scheduler-playbook.yaml
 create mode 100644 ansible-playbooks/geocml-task-scheduler-requirements.yaml

diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index e459b34..0000000
--- a/.gitmodules
+++ /dev/null
@@ -1,3 +0,0 @@
-[submodule "ubuntu-vnc-xfce-g3"]
-	path = ubuntu-vnc-xfce-g3
-	url = git@github.com:accetto/ubuntu-vnc-xfce-g3.git
diff --git a/Dockerfiles/Dockerfile.geocml-desktop b/Dockerfiles/Dockerfile.geocml-desktop
index 28554c0..962ab28 100644
--- a/Dockerfiles/Dockerfile.geocml-desktop
+++ b/Dockerfiles/Dockerfile.geocml-desktop
@@ -26,6 +26,9 @@ RUN ansible-galaxy collection install ansible.posix && ansible-galaxy install -r
 ######### Customize Container Here ###########
 ######### End Customizations ###########
 
+# Uninstall Ansible stuff
+RUN rm -rf $HOME/.ansible && apt purge -y ansible*
+
 # Remove install cache
 RUN apt clean autoclean && apt autoremove -y && rm -rf /var/lib/{apt,dpkg,cache,log}/
 
diff --git a/Dockerfiles/Dockerfile.geocml-task-scheduler b/Dockerfiles/Dockerfile.geocml-task-scheduler
index 729bc71..1fe46b3 100644
--- a/Dockerfiles/Dockerfile.geocml-task-scheduler
+++ b/Dockerfiles/Dockerfile.geocml-task-scheduler
@@ -1,21 +1,37 @@
 FROM ubuntu:22.04
 USER root
+ARG DEBIAN_FRONTEND=noninteractive
 
 RUN apt update -y
+RUN apt install -y software-properties-common
+RUN add-apt-repository ppa:deadsnakes/ppa && apt update -y
 
-# Install Python and Pip
-RUN apt install -y python3-pip && pip install psycopg2-binary
+# Install Python3.12 and Pip
+RUN apt install -y python3.12 python3-pip
+
+# Install Ansible dependencies
+RUN apt install -y git python3.12-venv python3.12-dev
 
 # Install psycopg2-binary
-RUN pip install psycopg2-binary
+RUN pip3 install psycopg2-binary
+
+# Install Ansible
+RUN apt install -y ansible
 
-# Create task_log file
-RUN touch /task_log
+# Copy gTS build resources to the container
+COPY ./build-resources/geocml-task-scheduler/ /geocml-task-scheduler
 
-# Copy gTS to the container
-COPY ./build-resources/geocml-task-scheduler/geocml-task-scheduler/ /geocml-task-scheduler
+# Install Ansible dependencies and run through playbook
+COPY ./ansible-playbooks/geocml-task-scheduler-requirements.yaml ./ansible-playbooks/geocml-task-scheduler-playbook.yaml ./
+RUN ansible-galaxy collection install ansible.posix && ansible-galaxy install -r geocml-task-scheduler-requirements.yaml && ansible-playbook -i,localhost geocml-task-scheduler-playbook.yaml --tags "all" && rm -f ./*.yaml
 
 ######### Customize Container Here ###########
 ######### End Customizations ###########
 
-CMD python3 /geocml-task-scheduler/schedule.py
\ No newline at end of file
+# Uninstall Ansible stuff
+RUN rm -rf $HOME/.ansible && apt purge -y ansible* && apt purge -y git* && apt purge -y virtualenv*
+
+# Remove install cache
+RUN apt clean autoclean && apt autoremove -y && rm -rf /var/lib/{apt,dpkg,cache,log}/
+
+CMD python3 /geocml-task-scheduler/geocml-task-scheduler/schedule.py
diff --git a/ansible-playbooks/geocml-task-scheduler-playbook.yaml b/ansible-playbooks/geocml-task-scheduler-playbook.yaml
new file mode 100644
index 0000000..cccb20f
--- /dev/null
+++ b/ansible-playbooks/geocml-task-scheduler-playbook.yaml
@@ -0,0 +1,34 @@
+---
+- # Configure geocml-task-scheduler
+  hosts: localhost
+  connection: local
+  gather_facts: yes
+  become: yes
+  tasks:
+    - name: Create task_log file
+      ansible.builtin.command: touch /task_log
+    - name: Clone Tabor source
+      ansible.builtin.command:
+        chdir: /geocml-task-scheduler
+        cmd: git clone https://github.com/geoCML/tabor.git
+    - name: Update Tabor submodule to latest release tag
+      shell: |
+        cd /geocml-task-scheduler/tabor
+        git fetch --tags
+        TAG=$(git tag | tail -1)
+        #git checkout $TAG
+    - name: Create python venv for Tabor build
+      ansible.builtin.command:
+        chdir: /geocml-task-scheduler/tabor
+        cmd: python3.12 -m venv ./venv
+    - name: Install Tabor requirements
+      ansible.builtin.pip:
+        chdir: /geocml-task-scheduler/tabor
+        virtualenv: ./venv
+        requirements: ./requirements.txt
+    - name: Build Tabor
+      shell: |
+        cd /geocml-task-scheduler/tabor
+        source ./venv/bin/activate && python3.12 -m pip install -U pyinstaller==6.9.0 && pyinstaller --paths=./src -y ./src/tabor.py
+      args:
+        executable: /bin/bash
diff --git a/ansible-playbooks/geocml-task-scheduler-requirements.yaml b/ansible-playbooks/geocml-task-scheduler-requirements.yaml
new file mode 100644
index 0000000..b0d9299
--- /dev/null
+++ b/ansible-playbooks/geocml-task-scheduler-requirements.yaml
@@ -0,0 +1,12 @@
+---
+# Add Ansible Galaxy Packages here, role_example_hello included to prevent errors in template testing
+
+# NOTE: Uncomment the two lines below if you want to test your Ansible installation
+# before proceeding with the rest of the playbook.
+#
+# This is typically only necessary if you are changing the default Ansible installation
+# that comes with the geoCML base image.
+#roles:
+#- irixjp.role_example_hello # https://galaxy.ansible.com/irixjp/role_example_hello
+collections:
+  - community.general
diff --git a/build-resources/geocml-task-scheduler/geocml-task-scheduler/backup_geocml_db.py b/build-resources/geocml-task-scheduler/geocml-task-scheduler/backup_geocml_db.py
index 766954e..df80830 100644
--- a/build-resources/geocml-task-scheduler/geocml-task-scheduler/backup_geocml_db.py
+++ b/build-resources/geocml-task-scheduler/geocml-task-scheduler/backup_geocml_db.py
@@ -1,37 +1,52 @@
 import psycopg2
 import os
+import subprocess
 
 from time import time
 from task_logger import log
 
-ignore_tables = ('spatial_ref_sys', 'geometry_columns', 'geography_columns')
-ignore_schemas = ('pg_catalog', 'information_schema')
+ignore_tables = ("spatial_ref_sys", "geometry_columns", "geography_columns")
+ignore_schemas = ("pg_catalog", "information_schema")
 
 
 def backup_geocml_db():
     try:
-        conn = psycopg2.connect(dbname='geocml_db',
-                                user='postgres',
-                                password='admin',
-                                host='geocml-postgres',
+        conn = psycopg2.connect(dbname="geocml_db",
+                                user="geocml",
+                                password="geocml",
+                                host="geocml-postgres",
                                 port=5432)
     except psycopg2.OperationalError:
-        log('Couldn\'t connect to geocml_db; is the postgresql service started?')
+        log("Couldn\'t connect to geocml_db; is the postgresql service started?")
         return
 
-    cursor = conn.cursor()
-    cursor.execute('SELECT DISTINCT table_schema FROM information_schema.tables;')
-    schemas = cursor.fetchall()
-
     back_up_timestamp = time()
-    path_to_backup_dir = os.path.join(os.sep, 'DBBackups', str(back_up_timestamp))
+    path_to_backup_dir = os.path.join(os.sep, "DBBackups", str(back_up_timestamp))
     os.mkdir(path_to_backup_dir)
 
     delete_backup_dir = True
-
+
+    # Write table schemata to .tabor file
+    out = subprocess.run(["tabor", "write", "--db", "geocml_db",
+                          "--username", "postgres", "--password", "admin",
+                          "--host", "geocml-postgres",
+                          "--file", os.path.join(path_to_backup_dir, "geocml_db.tabor")],
+                         capture_output=True)
+
+    if out.stderr:
+        log("Failed to generate .tabor file {}".format(out.stderr))
+        os.rmdir(path_to_backup_dir)
+        return
+
+    cursor = conn.cursor()
+    cursor.execute("""SELECT DISTINCT table_schema FROM information_schema.tables;""")
+    schemas = cursor.fetchall()
+
+    # Write table data to CSV file
     for schema in schemas:
         if schema[0] in ignore_schemas:
            continue
-
-        cursor.execute('SELECT * FROM information_schema.tables WHERE table_schema = \'{}\';'
-                       .format(schema[0]))
-        tables = cursor.fetchall()
+
+        cursor.execute(f"""SELECT * FROM information_schema.tables WHERE table_schema = '{schema[0]}';""")
+
+        tables = cursor.fetchall()
 
         for table in tables:
             if table[2] in ignore_tables:
@@ -39,50 +54,14 @@ def backup_geocml_db():
 
             delete_backup_dir = False
 
-            # Write to schema file
-            schema_file_path = os.path.join(path_to_backup_dir, 'schema:{}.{}.sql'.format(schema[0], table[2]))
-            schema_file = open(schema_file_path, 'w')
-
-            if not schema[0] == 'public':
-                cursor.execute('SELECT DISTINCT grantee FROM information_schema.role_table_grants WHERE table_schema = \'{}\';'
-                               .format(schema[0]))
-                schema_owner = cursor.fetchall()
-                schema_file.write('CREATE SCHEMA IF NOT EXISTS {} AUTHORIZATION {};\n'
-                                  .format(schema[0], schema_owner[0][0]))
-
-            cursor.execute('SELECT pg_get_constraintdef(oid) FROM pg_constraint WHERE contype = \'p\' AND conrelid::regclass::text LIKE \'%{}%\';'.format(table[2]))
-
-            pk = cursor.fetchall()
-
-            cursor.execute('SELECT column_name, udt_name FROM information_schema.columns WHERE table_name = \'{}\';'
-                           .format(table[2]))
-
-            columns_and_datatypes = []
-            for row in cursor:
-                if len(row) == 3: # column has a constraint
-                    columns_and_datatypes.append('{} {} {}'.format(row[0], row[1], row[2]))
-                else:
-                    columns_and_datatypes.append('{} {}'.format(row[0], row[1]))
-            columns_and_datatypes = ', '.join(columns_and_datatypes)
-
-            if len(pk) > 0: # table has primary key (expected)
-                schema_file.write('CREATE TABLE IF NOT EXISTS {}."{}" ({}, {});\n'.format(schema[0], table[2], columns_and_datatypes, pk[0][0]))
-            else:
-                schema_file.write('CREATE TABLE IF NOT EXISTS {}."{}" ({});\n'.format(schema[0], table[2], columns_and_datatypes))
-
-            cursor.execute('SELECT tableowner FROM pg_tables WHERE tablename = \'{}\';'.format(table[2]))
-            table_owner = cursor.fetchall()
-
-            schema_file.write('ALTER TABLE {}."{}" OWNER TO {};'.format(schema[0], table[2], table_owner[0][0]))
-            schema_file.close()
-
-            # Write to data file
-            data_file_path = os.path.join(path_to_backup_dir, 'data:{}.{}.csv'.format(schema[0], table[2]))
-            data_file = open(data_file_path, 'w')
-            cursor.copy_expert('COPY {}."{}" TO STDOUT WITH (FORMAT csv, DELIMITER \',\', HEADER FALSE);'.format(schema[0], table[2]), data_file)
+            data_file_path = os.path.join(path_to_backup_dir, "data:{}.{}.csv".format(schema[0], table[2]))
+            data_file = open(data_file_path, "w")
+            cursor.copy_expert(f"""COPY {schema[0]}."{table[2]}" TO STDOUT WITH (FORMAT csv, DELIMITER ',', HEADER);""", data_file)
             data_file.close()
 
     if delete_backup_dir: # nothing to back up
-        path_to_backup_dir.rmdir()
+        log("Nothing to backup")
+        os.rmdir(path_to_backup_dir)
+
     cursor.close()
     conn.close()
diff --git a/build-resources/geocml-task-scheduler/geocml-task-scheduler/restore_geocml_db_from_backups.py b/build-resources/geocml-task-scheduler/geocml-task-scheduler/restore_geocml_db_from_backups.py
index ede235c..2058553 100644
--- a/build-resources/geocml-task-scheduler/geocml-task-scheduler/restore_geocml_db_from_backups.py
+++ b/build-resources/geocml-task-scheduler/geocml-task-scheduler/restore_geocml_db_from_backups.py
@@ -1,55 +1,72 @@
+import ast
+from io import StringIO
 import psycopg2
 import os
+import subprocess
 
 from time import time
 from task_logger import log
 
 
 def restore_geocml_db_from_backups():
     try:
-        conn = psycopg2.connect(dbname='geocml_db',
-                                user='postgres',
-                                password='admin',
-                                host='geocml-postgres',
+        conn = psycopg2.connect(dbname="geocml_db",
+                                user="postgres",
+                                password="admin",
+                                host="geocml-postgres",
                                 port=5432)
     except psycopg2.OperationalError:
-        log('Couldn\'t connect to geocml_db; is the postgresql service started?')
+        log("Couldn\'t connect to geocml_db; is the postgresql service started?")
         return
 
-    db_backups_dir = os.path.join(os.sep, 'DBBackups')
+    db_backups_dir = os.path.join(os.sep, "DBBackups")
 
     now = time()
-    delta = float('inf')
-    most_recent_backup = ''
+    delta = float("inf")
+    most_recent_backup = ""
 
     for subdir in os.walk(db_backups_dir):
         try:
-            subdir_timestamp = float(subdir[0].split('/')[-1])
+            subdir_timestamp = float(subdir[0].split("/")[-1])
             if now - subdir_timestamp < delta:
                 delta = now - subdir_timestamp
                 most_recent_backup = subdir[0]
         except ValueError:
             if not subdir[0] == db_backups_dir:
-                log('Found something unexpected in backup directory, skipping over: {}'.format(subdir[0]))
+                log("Found something unexpected in backup directory, skipping over: {}".format(subdir[0]))
 
-    if most_recent_backup == '':
-        log('No recent backups found. Aborting restoration process.')
+    if most_recent_backup == "":
+        log("No recent backups found. Aborting restoration process.")
         return 0
 
-    log('Restoring geocml_db from {}'.format(most_recent_backup))
-    cursor = conn.cursor()
-    for sql_schema_file in os.listdir(most_recent_backup): # rebuild table schema
-        if sql_schema_file.split(':')[0] == 'schema':
-            log('Found SQL schema file {}'.format(sql_schema_file))
-            cursor.execute(open('{}/{}'.format(most_recent_backup, sql_schema_file), 'r').read())
+    log("Restoring geocml_db from {}".format(most_recent_backup))
+
+    cursor = conn.cursor()
+
+    # Rebuild tables from .tabor file
+
+    out = subprocess.run(["tabor", "read", "--file", os.path.join(most_recent_backup, "geocml_db.tabor")],
+                         capture_output=True)
+
+    if out.stderr:
+        log("Failed to read .tabor file {}".format(out.stderr))
+
+    psql_data = ast.literal_eval(out.stdout.decode())
+
+    for table, psql_queries in psql_data.items():
+        log("Restoring table: {}".format(table))
+        for _, value in psql_queries.items():
+            cursor.execute(value)
 
     conn.commit() # commit schema changes to the database before loading data from the CSV
+    log("Tables restored!")
 
     for csv_data_file in os.listdir(most_recent_backup): # load data from CSV backups
-        file_name_split = csv_data_file.split(':')
-
-        if file_name_split[0] == 'data':
-            log('Found CSV data file {}'.format(csv_data_file))
-            file_name_split = file_name_split[1].split('.')
-            data_file = open(os.path.join(db_backups_dir, most_recent_backup, csv_data_file), 'r')
-            cursor.copy_expert('COPY {}."{}" FROM STDIN DELIMITER \',\' CSV HEADER;'
-                               .format(file_name_split[0], file_name_split[1]), data_file)
+        file_name_split = csv_data_file.split(":")
+
+        if file_name_split[0] == "data":
+            log("Found CSV data file {}".format(csv_data_file))
+            file_name_split = file_name_split[1].split(".")
+            data_file = open(os.path.join(db_backups_dir, most_recent_backup, csv_data_file), "r").readlines()
+            cursor.copy_from(StringIO("".join(data_file[1::])), f"{file_name_split[1]}", sep=",",
+                             columns=tuple(data_file[0].replace("\n", "").split(",")))
+    log("Finished loading data!")
 
     conn.commit()
     cursor.close()
diff --git a/build-resources/geocml-task-scheduler/geocml-task-scheduler/schedule.py b/build-resources/geocml-task-scheduler/geocml-task-scheduler/schedule.py
index 435e6e2..511a52e 100644
--- a/build-resources/geocml-task-scheduler/geocml-task-scheduler/schedule.py
+++ b/build-resources/geocml-task-scheduler/geocml-task-scheduler/schedule.py
@@ -9,4 +9,4 @@
 restore_geocml_db_task.start()
 
 while True:
-    pass # keep schedule.py process running in container
\ No newline at end of file
+    pass # keep schedule.py process running in container
diff --git a/docker-compose.yml b/docker-compose.yml
index 8104eb8..c8e5a96 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -7,6 +7,8 @@ services:
     dockerfile: ./Dockerfiles/Dockerfile.geocml-task-scheduler
     image: ghcr.io/geocml/geocml-base-deployment:task-scheduler
     hostname: geocml-task-scheduler
+    environment:
+      - PATH=/geocml-task-scheduler/tabor/dist/tabor:$PATH
     networks:
       - geocml-network
     volumes:
@@ -55,4 +57,3 @@ networks:
   geocml-network:
     external: true
     driver: bridge
-...