Commit

Use Tabor for DBBackups
TristanDamron committed Aug 14, 2024
1 parent 848a5ad commit 87cdbc7
Showing 9 changed files with 156 additions and 97 deletions.
3 changes: 0 additions & 3 deletions .gitmodules

This file was deleted.

3 changes: 3 additions & 0 deletions Dockerfiles/Dockerfile.geocml-desktop
@@ -26,6 +26,9 @@ RUN ansible-galaxy collection install ansible.posix && ansible-galaxy install -r
 ######### Customize Container Here ###########
 ######### End Customizations ###########
 
+# Uninstall Ansible stuff
+RUN rm -rf $HOME/.ansible && apt purge -y ansible*
+
 # Remove install cache
 RUN apt clean autoclean && apt autoremove -y && rm -rf /var/lib/{apt,dpkg,cache,log}/
32 changes: 24 additions & 8 deletions Dockerfiles/Dockerfile.geocml-task-scheduler
@@ -1,21 +1,37 @@
 FROM ubuntu:22.04
 
 USER root
+ARG DEBIAN_FRONTEND=noninteractive
 RUN apt update -y
+RUN apt install -y software-properties-common
+RUN add-apt-repository ppa:deadsnakes/ppa && apt update -y
 
-# Install Python and Pip
-RUN apt install -y python3-pip && pip install psycopg2-binary
+# Install Python3.12 and Pip
+RUN apt install -y python3.12 python3-pip
+
+# Install Ansible dependencies
+RUN apt install -y git python3.12-venv python3.12-dev
 
 # Install psycopg2-binary
-RUN pip install psycopg2-binary
+RUN pip3 install psycopg2-binary
 
 # Install Ansible
 RUN apt install -y ansible
 
-# Create task_log file
-RUN touch /task_log
+# Copy gTS build resources to the container
+COPY ./build-resources/geocml-task-scheduler/ /geocml-task-scheduler
 
-# Copy gTS to the container
-COPY ./build-resources/geocml-task-scheduler/geocml-task-scheduler/ /geocml-task-scheduler
+# Install Ansible dependencies and run through playbook
+COPY ./ansible-playbooks/geocml-task-scheduler-requirements.yaml ./ansible-playbooks/geocml-task-scheduler-playbook.yaml ./
+RUN ansible-galaxy collection install ansible.posix && ansible-galaxy install -r geocml-task-scheduler-requirements.yaml && ansible-playbook -i,localhost geocml-task-scheduler-playbook.yaml --tags "all" && rm -f ./*.yaml
 
 ######### Customize Container Here ###########
 ######### End Customizations ###########
 
-CMD python3 /geocml-task-scheduler/schedule.py
+# Uninstall Ansible stuff
+RUN rm -rf $HOME/.ansible && apt purge -y ansible* && apt purge -y git* && apt purge -y virtualenv*
+
+# Remove install cache
+RUN apt clean autoclean && apt autoremove -y && rm -rf /var/lib/{apt,dpkg,cache,log}/
+
+CMD python3 /geocml-task-scheduler/geocml-task-scheduler/schedule.py
34 changes: 34 additions & 0 deletions ansible-playbooks/geocml-task-scheduler-playbook.yaml
@@ -0,0 +1,34 @@
+---
+- # Configure geocml-task-scheduler
+  hosts: localhost
+  connection: local
+  gather_facts: yes
+  become: yes
+  tasks:
+    - name: Create task_log file
+      ansible.builtin.command: touch /task_log
+    - name: Clone Tabor source
+      ansible.builtin.command:
+        chdir: /geocml-task-scheduler
+        cmd: git clone https://github.com/geoCML/tabor.git
+    - name: Update Tabor submodule to latest release tag
+      shell: |
+        cd /geocml-task-scheduler/tabor
+        git fetch --tags
+        TAG=$(git tag | tail -1)
+        #git checkout $TAG
+    - name: Create python venv for Tabor build
+      ansible.builtin.command:
+        chdir: /geocml-task-scheduler/tabor
+        cmd: python3.12 -m venv ./venv
+    - name: Install Tabor requirements
+      ansible.builtin.pip:
+        chdir: /geocml-task-scheduler/tabor
+        virtualenv: ./venv
+        requirements: ./requirements.txt
+    - name: Build Tabor
+      shell: |
+        cd /geocml-task-scheduler/tabor
+        source ./venv/bin/activate && python3.12 -m pip install -U pyinstaller==6.9.0 && pyinstaller --paths=./src -y ./src/tabor.py
+      args:
+        executable: /bin/bash
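For context (not part of the diff): PyInstaller's default one-folder build writes the executable to dist/tabor/tabor, which is what the PATH entry added to docker-compose.yml below points at. A minimal post-build sanity check, assuming the playbook above has run and that the tabor CLI accepts --help:

import subprocess

# Hypothetical check; the path assumes PyInstaller's default one-folder
# output (dist/<name>/<name>) produced by the "Build Tabor" task above.
result = subprocess.run(
    ["/geocml-task-scheduler/tabor/dist/tabor/tabor", "--help"],
    capture_output=True, text=True,
)
print(result.stdout or result.stderr)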
12 changes: 12 additions & 0 deletions ansible-playbooks/geocml-task-scheduler-requirements.yaml
@@ -0,0 +1,12 @@
+---
+# Add Ansible Galaxy Packages here, role_example_hello included to prevent errors in template testing
+
+# NOTE: Uncomment the two lines below if you want to test your Ansible installation
+# before proceeding with the rest of the playbook.
+#
+# This is typically only necessary if you are changing the default Ansible installation
+# that comes with the geoCML base image.
+#roles:
+#- irixjp.role_example_hello # https://galaxy.ansible.com/irixjp/role_example_hello
+collections:
+  - community.general
@@ -1,88 +1,67 @@
 import psycopg2
 import os
+import subprocess
 from time import time
 from task_logger import log
 
-ignore_tables = ('spatial_ref_sys', 'geometry_columns', 'geography_columns')
-ignore_schemas = ('pg_catalog', 'information_schema')
+ignore_tables = ("spatial_ref_sys", "geometry_columns", "geography_columns")
+ignore_schemas = ("pg_catalog", "information_schema")
 
 def backup_geocml_db():
     try:
-        conn = psycopg2.connect(dbname='geocml_db',
-                                user='postgres',
-                                password='admin',
-                                host='geocml-postgres',
+        conn = psycopg2.connect(dbname="geocml_db",
+                                user="geocml",
+                                password="geocml",
+                                host="geocml-postgres",
                                 port=5432)
     except psycopg2.OperationalError:
-        log('Couldn\'t connect to geocml_db; is the postgresql service started?')
+        log("Couldn\'t connect to geocml_db; is the postgresql service started?")
         return
 
-    cursor = conn.cursor()
-    cursor.execute('SELECT DISTINCT table_schema FROM information_schema.tables;')
-    schemas = cursor.fetchall()
     back_up_timestamp = time()
-    path_to_backup_dir = os.path.join(os.sep, 'DBBackups', str(back_up_timestamp))
+    path_to_backup_dir = os.path.join(os.sep, "DBBackups", str(back_up_timestamp))
     os.mkdir(path_to_backup_dir)
     delete_backup_dir = True
 
+    # Write table schemata to .tabor file
+    out = subprocess.run(["tabor", "write", "--db", "geocml_db",
+                          "--username", "postgres", "--password", "admin",
+                          "--host", "geocml-postgres",
+                          "--file", os.path.join(path_to_backup_dir, "geocml_db.tabor")],
+                         capture_output=True)
+
+    if out.stderr:
+        log("Failed to generate .tabor file {}".format(out.stderr))
+        os.rmdir(path_to_backup_dir)
+        return
+
+    cursor = conn.cursor()
+    cursor.execute("""SELECT DISTINCT table_schema FROM information_schema.tables;""")
+    schemas = cursor.fetchall()
+
     # Write table data to CSV file
     for schema in schemas:
         if schema[0] in ignore_schemas:
             continue
-        cursor.execute('SELECT * FROM information_schema.tables WHERE table_schema = \'{}\';'
-                       .format(schema[0]))
-        tables = cursor.fetchall()
+
+        cursor.execute(f"""SELECT * FROM information_schema.tables WHERE table_schema = '{schema[0]}';""")
+
+        tables = cursor.fetchall()
 
         for table in tables:
             if table[2] in ignore_tables:
                 continue
 
             delete_backup_dir = False
 
-            # Write to schema file
-            schema_file_path = os.path.join(path_to_backup_dir, 'schema:{}.{}.sql'.format(schema[0], table[2]))
-            schema_file = open(schema_file_path, 'w')
-
-            if not schema[0] == 'public':
-                cursor.execute('SELECT DISTINCT grantee FROM information_schema.role_table_grants WHERE table_schema = \'{}\';'
-                               .format(schema[0]))
-                schema_owner = cursor.fetchall()
-                schema_file.write('CREATE SCHEMA IF NOT EXISTS {} AUTHORIZATION {};\n'
-                                  .format(schema[0], schema_owner[0][0]))
-
-            cursor.execute('SELECT pg_get_constraintdef(oid) FROM pg_constraint WHERE contype = \'p\' AND conrelid::regclass::text LIKE \'%{}%\';'.format(table[2]))
-
-            pk = cursor.fetchall()
-
-            cursor.execute('SELECT column_name, udt_name FROM information_schema.columns WHERE table_name = \'{}\';'
-                           .format(table[2]))
-
-            columns_and_datatypes = []
-            for row in cursor:
-                if len(row) == 3: # column has a constraint
-                    columns_and_datatypes.append('{} {} {}'.format(row[0], row[1], row[2]))
-                else:
-                    columns_and_datatypes.append('{} {}'.format(row[0], row[1]))
-            columns_and_datatypes = ', '.join(columns_and_datatypes)
-
-            if len(pk) > 0: # table has primary key (expected)
-                schema_file.write('CREATE TABLE IF NOT EXISTS {}."{}" ({}, {});\n'.format(schema[0], table[2], columns_and_datatypes, pk[0][0]))
-            else:
-                schema_file.write('CREATE TABLE IF NOT EXISTS {}."{}" ({});\n'.format(schema[0], table[2], columns_and_datatypes))
-
-            cursor.execute('SELECT tableowner FROM pg_tables WHERE tablename = \'{}\';'.format(table[2]))
-            table_owner = cursor.fetchall()
-
-            schema_file.write('ALTER TABLE {}."{}" OWNER TO {};'.format(schema[0], table[2], table_owner[0][0]))
-            schema_file.close()
-
             # Write to data file
-            data_file_path = os.path.join(path_to_backup_dir, 'data:{}.{}.csv'.format(schema[0], table[2]))
-            data_file = open(data_file_path, 'w')
-            cursor.copy_expert('COPY {}."{}" TO STDOUT WITH (FORMAT csv, DELIMITER \',\', HEADER FALSE);'.format(schema[0], table[2]), data_file)
+            data_file_path = os.path.join(path_to_backup_dir, "data:{}.{}.csv".format(schema[0], table[2]))
+            data_file = open(data_file_path, "w")
+            cursor.copy_expert(f"""COPY {schema[0]}."{table[2]}" TO STDOUT WITH (FORMAT csv, DELIMITER ',', HEADER);""", data_file)
             data_file.close()
 
     if delete_backup_dir: # nothing to back up
-        path_to_backup_dir.rmdir()
+        log("Nothing to backup")
+        os.rmdir(path_to_backup_dir)
 
     cursor.close()
     conn.close()
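Worth noting (editor's sketch, not in the commit): the COPY above now emits a header row (the old code used HEADER FALSE), and the restore path below relies on that row for column names. Under that assumption, the round trip looks like this; the table and column names are illustrative only:

from io import StringIO

# Illustrative data; only the header-handling pattern mirrors the
# backup/restore code in this commit.
csv_lines = ["id,name\n", "1,depot\n", "2,yard\n"]  # as COPY ... HEADER writes it
columns = tuple(csv_lines[0].strip().split(","))    # ("id", "name")
body = StringIO("".join(csv_lines[1:]))             # data rows only, header skipped
# cursor.copy_from(body, "some_table", sep=",", columns=columns)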
@@ -1,55 +1,72 @@
+import ast
+from io import StringIO
 import psycopg2
 import os
+import subprocess
 from time import time
 from task_logger import log
 
 def restore_geocml_db_from_backups():
     try:
-        conn = psycopg2.connect(dbname='geocml_db',
-                                user='postgres',
-                                password='admin',
-                                host='geocml-postgres',
+        conn = psycopg2.connect(dbname="geocml_db",
+                                user="postgres",
+                                password="admin",
+                                host="geocml-postgres",
                                 port=5432)
     except psycopg2.OperationalError:
-        log('Couldn\'t connect to geocml_db; is the postgresql service started?')
+        log("Couldn\'t connect to geocml_db; is the postgresql service started?")
         return
 
-    db_backups_dir = os.path.join(os.sep, 'DBBackups')
+    db_backups_dir = os.path.join(os.sep, "DBBackups")
     now = time()
-    delta = float('inf')
-    most_recent_backup = ''
+    delta = float("inf")
+    most_recent_backup = ""
     for subdir in os.walk(db_backups_dir):
         try:
-            subdir_timestamp = float(subdir[0].split('/')[-1])
+            subdir_timestamp = float(subdir[0].split("/")[-1])
             if now - subdir_timestamp < delta:
                 delta = now - subdir_timestamp
                 most_recent_backup = subdir[0]
         except ValueError:
             if not subdir[0] == db_backups_dir:
-                log('Found something unexpected in backup directory, skipping over: {}'.format(subdir[0]))
+                log("Found something unexpected in backup directory, skipping over: {}".format(subdir[0]))
 
-    if most_recent_backup == '':
-        log('No recent backups found. Aborting restoration process.')
+    if most_recent_backup == "":
+        log("No recent backups found. Aborting restoration process.")
         return 0
 
-    log('Restoring geocml_db from {}'.format(most_recent_backup))
-    cursor = conn.cursor()
-    for sql_schema_file in os.listdir(most_recent_backup): # rebuild table schema
-        if sql_schema_file.split(':')[0] == 'schema':
-            log('Found SQL schema file {}'.format(sql_schema_file))
-            cursor.execute(open('{}/{}'.format(most_recent_backup, sql_schema_file), 'r').read())
+    log("Restoring geocml_db from {}".format(most_recent_backup))
+
+    cursor = conn.cursor()
+
+    # Rebuild tables from .tabor file
+    out = subprocess.run(["tabor", "read", "--file", os.path.join(most_recent_backup, "geocml_db.tabor")],
+                         capture_output=True)
+
+    if out.stderr:
+        log("Failed to read .tabor file {}".format(out.stderr))
+
+    psql_data = ast.literal_eval(out.stdout.decode())
+
+    for table, psql_queries in psql_data.items():
+        log("Restoring table: {}".format(table))
+        for _, value in psql_queries.items():
+            cursor.execute(value)
 
     conn.commit() # commit schema changes to the database before loading data from the CSV
+    log("Tables restored!")
 
     for csv_data_file in os.listdir(most_recent_backup): # load data from CSV backups
-        file_name_split = csv_data_file.split(':')
+        file_name_split = csv_data_file.split(":")
 
-        if file_name_split[0] == 'data':
-            log('Found CSV data file {}'.format(csv_data_file))
-            file_name_split = file_name_split[1].split('.')
-            data_file = open(os.path.join(db_backups_dir, most_recent_backup, csv_data_file), 'r')
-            cursor.copy_expert('COPY {}."{}" FROM STDIN DELIMITER \',\' CSV HEADER;'
-                               .format(file_name_split[0], file_name_split[1]), data_file)
+        if file_name_split[0] == "data":
+            log("Found CSV data file {}".format(csv_data_file))
+            file_name_split = file_name_split[1].split(".")
+            data_file = open(os.path.join(db_backups_dir, most_recent_backup, csv_data_file), "r").readlines()
+            cursor.copy_from(StringIO("".join(data_file[1::])), f"{file_name_split[1]}", sep=",",
+                             columns=tuple(data_file[0].replace("\n", "").split(",")))
+    log("Finished loading data!")
 
     conn.commit()
     cursor.close()
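The ast.literal_eval call implies that tabor read prints a Python dict literal mapping each table to its named SQL statements. A minimal sketch of that parsing step; the exact output shape is an assumption inferred from the loop above, not documented in this commit:

import ast

# Assumed shape of `tabor read` stdout:
# {table_name: {statement_name: sql, ...}, ...}
stdout = '{"public.roads": {"create": "CREATE TABLE public.roads (id int);"}}'
for table, queries in ast.literal_eval(stdout).items():
    for _, statement in queries.items():
        print(table, "->", statement)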
@@ -9,4 +9,4 @@
 restore_geocml_db_task.start()
 
 while True:
-    pass # keep schedule.py process running in container
+    pass  # keep schedule.py process running in container
3 changes: 2 additions & 1 deletion docker-compose.yml
@@ -7,6 +7,8 @@ services:
     dockerfile: ./Dockerfiles/Dockerfile.geocml-task-scheduler
     image: ghcr.io/geocml/geocml-base-deployment:task-scheduler
     hostname: geocml-task-scheduler
+    environment:
+      - PATH=/geocml-task-scheduler/tabor/dist/tabor:$PATH
     networks:
      - geocml-network
     volumes:
@@ -55,4 +57,3 @@ networks:
   geocml-network:
     external: true
     driver: bridge
-...
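With the PyInstaller output directory prepended to PATH, the bare "tabor" invocations in the backup and restore tasks resolve without an absolute path. A quick check from inside the running container, assuming the image built as described above:

import shutil

# Expected result: /geocml-task-scheduler/tabor/dist/tabor/tabor
print(shutil.which("tabor"))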
