From 83703a7cfefcb9f13a7bf4893daf24fd4c3fca46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Francisco=20Calvo?= Date: Tue, 9 Jul 2024 16:24:55 +0200 Subject: [PATCH 01/28] =?UTF-8?q?Revert=20"Revert=20"improvement:=20add=20?= =?UTF-8?q?some=20SQLite=20pragma=20statement=20settings=20to=20imp?= =?UTF-8?q?=E2=80=A6"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 3512cbdb434400370b8c49517905661761d583bf. --- argilla-server/CHANGELOG.md | 5 ++++ argilla-server/src/argilla_server/database.py | 27 +++++++++++++++++++ argilla-server/tests/unit/test_database.py | 5 ++++ 3 files changed, 37 insertions(+) diff --git a/argilla-server/CHANGELOG.md b/argilla-server/CHANGELOG.md index 7406fb08c8..ccd5659bc6 100644 --- a/argilla-server/CHANGELOG.md +++ b/argilla-server/CHANGELOG.md @@ -82,6 +82,10 @@ These are the section headers that we use: - Added `REINDEX_DATASETS` environment variable to Argilla server Docker image. ([#5268](https://github.com/argilla-io/argilla/pull/5268)) - Added `argilla-hf-spaces` docker image for running Argilla server in HF spaces. ([#5307](https://github.com/argilla-io/argilla/pull/5307)) +### Added + +- Added some new performance tuning settings for SQLite database. ([#5150](https://github.com/argilla-io/argilla/pull/5150)) + ### Changed - Change `responses` table to delete rows on cascade when a user is deleted. ([#5126](https://github.com/argilla-io/argilla/pull/5126)) @@ -115,6 +119,7 @@ These are the section headers that we use: - Fixed error when updating records in bulk with wrong `external_id` but correct record `id`. ([#5014](https://github.com/argilla-io/argilla/pull/5014)) - Fixed error when searching all record response values. ([#5003](https://github.com/argilla-io/argilla/pull/5003)) +- Fixed SQLite connection settings not working correctly due to a outdated conditional. 
([#5149](https://github.com/argilla-io/argilla/pull/5149)) ## [1.29.0](https://github.com/argilla-io/argilla/compare/v1.28.0...v1.29.0) diff --git a/argilla-server/src/argilla_server/database.py b/argilla-server/src/argilla_server/database.py index 9d8eb7ae50..cdc012f659 100644 --- a/argilla-server/src/argilla_server/database.py +++ b/argilla-server/src/argilla_server/database.py @@ -45,7 +45,34 @@ def set_sqlite_pragma(dbapi_connection, connection_record): if isinstance(dbapi_connection, AsyncAdapt_aiosqlite_connection): cursor = dbapi_connection.cursor() + + # Enforce foreign key constraints + # https://www.sqlite.org/pragma.html#pragma_foreign_keys + # https://www.sqlite.org/foreignkeys.html cursor.execute("PRAGMA foreign_keys = ON") + + # Journal mode WAL allows for greater concurrency (many readers + one writer) + # https://www.sqlite.org/pragma.html#pragma_journal_mode + cursor.execute("PRAGMA journal_mode = WAL") + + # Set more relaxed level of database durability + # 2 = "FULL" (sync on every write), 1 = "NORMAL" (sync every 1000 written pages) and 0 = "NONE" + # https://www.sqlite.org/pragma.html#pragma_synchronous + cursor.execute("PRAGMA synchronous = NORMAL") + + # Set the global memory map so all processes can share some data + # https://www.sqlite.org/pragma.html#pragma_mmap_size + # https://www.sqlite.org/mmap.html + cursor.execute("PRAGMA mmap_size = 134217728") # 128 megabytes + + # Impose a limit on the WAL file to prevent unlimited growth + # https://www.sqlite.org/pragma.html#pragma_journal_size_limit + cursor.execute("PRAGMA journal_size_limit = 67108864") # 64 megabytes + + # Set the local connection cache to 2000 pages + # https://www.sqlite.org/pragma.html#pragma_cache_size + cursor.execute("PRAGMA cache_size = 2000") + cursor.close() diff --git a/argilla-server/tests/unit/test_database.py b/argilla-server/tests/unit/test_database.py index ccf879758c..0269219be2 100644 --- a/argilla-server/tests/unit/test_database.py +++ 
b/argilla-server/tests/unit/test_database.py @@ -26,3 +26,8 @@ async def test_sqlite_pragma_settings(self, db: AsyncSession): return assert (await db.execute(text("PRAGMA foreign_keys"))).scalar() == 1 + assert (await db.execute(text("PRAGMA journal_mode"))).scalar() == "wal" + assert (await db.execute(text("PRAGMA synchronous"))).scalar() == 1 + assert (await db.execute(text("PRAGMA journal_size_limit"))).scalar() == 67108864 + assert (await db.execute(text("PRAGMA mmap_size"))).scalar() == 134217728 + assert (await db.execute(text("PRAGMA cache_size"))).scalar() == 2000 From 4b3fa3219acd720721f3d3dfbfc128e80c29c8df Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Tue, 8 Oct 2024 12:32:32 +0200 Subject: [PATCH 02/28] chore: Add script to backup the sqlite db --- .../scripts/db_backup_cron.py | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 argilla-server/docker/argilla-hf-spaces/scripts/db_backup_cron.py diff --git a/argilla-server/docker/argilla-hf-spaces/scripts/db_backup_cron.py b/argilla-server/docker/argilla-hf-spaces/scripts/db_backup_cron.py new file mode 100644 index 0000000000..73ee39448c --- /dev/null +++ b/argilla-server/docker/argilla-hf-spaces/scripts/db_backup_cron.py @@ -0,0 +1,38 @@ +# Copyright 2021-present, the Recognai S.L. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import sqlite3 +import time +from urllib.parse import urlparse +from pathlib import Path + +from argilla_server.database import database_url_sync + +url_db = database_url_sync() +db_path = Path(urlparse(url_db).path) + +backup_path = Path(db_path.parent / "backup") + +if not backup_path.exists(): + backup_path.mkdir() + +while True: + with sqlite3.connect(db_path, isolation_level="DEFERRED") as conn: + backup_file = os.path.join(backup_path.absolute(), db_path.name) + + os.system(f"cp {db_path.absolute()} {backup_file}") + os.system(f"cp {db_path.absolute()}-wal {backup_file}-wal") + + time.sleep(15) From e73708b8b03cad737d330904b95e4651e7db59a9 Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Tue, 8 Oct 2024 12:33:42 +0200 Subject: [PATCH 03/28] refactor: Add logic to backup and restore db on server restart --- argilla-server/docker/argilla-hf-spaces/Dockerfile | 3 +-- argilla-server/docker/argilla-hf-spaces/Procfile | 1 + argilla-server/docker/argilla-hf-spaces/scripts/start.sh | 6 ++++++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/argilla-server/docker/argilla-hf-spaces/Dockerfile b/argilla-server/docker/argilla-hf-spaces/Dockerfile index 131b0a9e6b..08b7cdb432 100644 --- a/argilla-server/docker/argilla-hf-spaces/Dockerfile +++ b/argilla-server/docker/argilla-hf-spaces/Dockerfile @@ -6,7 +6,7 @@ FROM ${ARGILLA_SERVER_IMAGE}:${ARGILLA_VERSION} USER root # Copy Argilla distribution files -COPY scripts/start.sh /home/argilla +COPY scripts/* /home/argilla COPY Procfile /home/argilla COPY requirements.txt /packages/requirements.txt @@ -59,7 +59,6 @@ USER argilla ENV ELASTIC_CONTAINER=true ENV ES_JAVA_OPTS="-Xms1g -Xmx1g" -ENV ARGILLA_HOME_PATH=/data/argilla ENV BACKGROUND_NUM_WORKERS=2 ENV REINDEX_DATASETS=1 diff --git a/argilla-server/docker/argilla-hf-spaces/Procfile b/argilla-server/docker/argilla-hf-spaces/Procfile index 751d36e4b4..2cb8e12ed1 100644 --- a/argilla-server/docker/argilla-hf-spaces/Procfile +++ 
b/argilla-server/docker/argilla-hf-spaces/Procfile @@ -2,3 +2,4 @@ elastic: /usr/share/elasticsearch/bin/elasticsearch redis: /usr/bin/redis-server worker: sleep 30; rq worker-pool --num-workers ${BACKGROUND_NUM_WORKERS} argilla: sleep 30; /bin/bash start_argilla_server.sh +db-backup: sleep 30; python db_backup_cron.py \ No newline at end of file diff --git a/argilla-server/docker/argilla-hf-spaces/scripts/start.sh b/argilla-server/docker/argilla-hf-spaces/scripts/start.sh index cc29f104d3..49a9e7e840 100644 --- a/argilla-server/docker/argilla-hf-spaces/scripts/start.sh +++ b/argilla-server/docker/argilla-hf-spaces/scripts/start.sh @@ -15,4 +15,10 @@ export USERNAME="${USERNAME:-$DEFAULT_USERNAME}" DEFAULT_PASSWORD=$(pwgen -s 16 1) export PASSWORD="${PASSWORD:-$DEFAULT_PASSWORD}" +# Copy the backup file to the correct location +cp -r /data/argilla/backup/* /data/* || true + +# Copy all the persistent storage files to the correct location +cp -r /data/argilla/* /home/argilla/ + honcho start From 0562c73ccc65816b7322b3ce7bdfa9f8a49d37e1 Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Tue, 8 Oct 2024 12:38:26 +0200 Subject: [PATCH 04/28] change backup folder --- .../docker/argilla-hf-spaces/scripts/db_backup_cron.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/argilla-server/docker/argilla-hf-spaces/scripts/db_backup_cron.py b/argilla-server/docker/argilla-hf-spaces/scripts/db_backup_cron.py index 73ee39448c..0dd6726e75 100644 --- a/argilla-server/docker/argilla-hf-spaces/scripts/db_backup_cron.py +++ b/argilla-server/docker/argilla-hf-spaces/scripts/db_backup_cron.py @@ -23,7 +23,7 @@ url_db = database_url_sync() db_path = Path(urlparse(url_db).path) -backup_path = Path(db_path.parent / "backup") +backup_path = Path("/data/argilla/backup") if not backup_path.exists(): backup_path.mkdir() From cec7a73db20f16bf966f53e7fb88f0d64758e011 Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Tue, 8 Oct 2024 13:02:29 +0200 Subject: [PATCH 05/28] 
fix: Using argilla home to restore backup files --- argilla-server/docker/argilla-hf-spaces/scripts/start.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/argilla-server/docker/argilla-hf-spaces/scripts/start.sh b/argilla-server/docker/argilla-hf-spaces/scripts/start.sh index 49a9e7e840..5bf91f4e43 100644 --- a/argilla-server/docker/argilla-hf-spaces/scripts/start.sh +++ b/argilla-server/docker/argilla-hf-spaces/scripts/start.sh @@ -19,6 +19,6 @@ export PASSWORD="${PASSWORD:-$DEFAULT_PASSWORD}" cp -r /data/argilla/backup/* /data/* || true # Copy all the persistent storage files to the correct location -cp -r /data/argilla/* /home/argilla/ +cp -r /data/argilla/* $ARGILLA_HOME_PATH honcho start From e25409383befbf86d14fcf7706a95f2f8dcf760c Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Tue, 8 Oct 2024 14:20:19 +0200 Subject: [PATCH 06/28] create a zero-backup of the existing files --- .../docker/argilla-hf-spaces/scripts/start.sh | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/argilla-server/docker/argilla-hf-spaces/scripts/start.sh b/argilla-server/docker/argilla-hf-spaces/scripts/start.sh index 5bf91f4e43..3cc87048e4 100644 --- a/argilla-server/docker/argilla-hf-spaces/scripts/start.sh +++ b/argilla-server/docker/argilla-hf-spaces/scripts/start.sh @@ -15,10 +15,18 @@ export USERNAME="${USERNAME:-$DEFAULT_USERNAME}" DEFAULT_PASSWORD=$(pwgen -s 16 1) export PASSWORD="${PASSWORD:-$DEFAULT_PASSWORD}" -# Copy the backup file to the correct location -cp -r /data/argilla/backup/* /data/* || true +# These lines only make sense if the container is running in a Hugging Face Spaces environment with persistent storage + +# 1. Create a backup of the existing persistent storage files once. If something goes wrong, we can restore the files +# from the zero-backup directory +if [ ! 
-d /data/argilla/backup.0 ]; then + mkdir -p /data/argilla/backup.0 + cp -r /data/argilla/* /data/argilla/backup.0 +fi -# Copy all the persistent storage files to the correct location +# 2. Copy the backup file to the correct location +cp -r /data/argilla/backup/* /data/* || true +# 3. Copy all the persistent storage files to the correct location cp -r /data/argilla/* $ARGILLA_HOME_PATH honcho start From 3b7ac8f3d79c4a1baa136642ef1b906d4bc0e9a3 Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Tue, 8 Oct 2024 14:25:50 +0200 Subject: [PATCH 07/28] chore: Remove extra line in CHANGELOG --- argilla-server/CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/argilla-server/CHANGELOG.md b/argilla-server/CHANGELOG.md index ccd5659bc6..90b66112a8 100644 --- a/argilla-server/CHANGELOG.md +++ b/argilla-server/CHANGELOG.md @@ -119,7 +119,6 @@ These are the section headers that we use: - Fixed error when updating records in bulk with wrong `external_id` but correct record `id`. ([#5014](https://github.com/argilla-io/argilla/pull/5014)) - Fixed error when searching all record response values. ([#5003](https://github.com/argilla-io/argilla/pull/5003)) -- Fixed SQLite connection settings not working correctly due to a outdated conditional. 
([#5149](https://github.com/argilla-io/argilla/pull/5149)) ## [1.29.0](https://github.com/argilla-io/argilla/compare/v1.28.0...v1.29.0) From 853628cb9068f9f87715611ab3190823f01f45ca Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Tue, 8 Oct 2024 14:26:01 +0200 Subject: [PATCH 08/28] format --- argilla-server/docker/argilla-hf-spaces/Procfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/argilla-server/docker/argilla-hf-spaces/Procfile b/argilla-server/docker/argilla-hf-spaces/Procfile index 2cb8e12ed1..40d3b11f07 100644 --- a/argilla-server/docker/argilla-hf-spaces/Procfile +++ b/argilla-server/docker/argilla-hf-spaces/Procfile @@ -2,4 +2,4 @@ elastic: /usr/share/elasticsearch/bin/elasticsearch redis: /usr/bin/redis-server worker: sleep 30; rq worker-pool --num-workers ${BACKGROUND_NUM_WORKERS} argilla: sleep 30; /bin/bash start_argilla_server.sh -db-backup: sleep 30; python db_backup_cron.py \ No newline at end of file +db-backup: sleep 30; python db_backup_cron.py From 4f0e3312e29ef0f88ce6948f7641aaa27b2151db Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Tue, 8 Oct 2024 14:35:20 +0200 Subject: [PATCH 09/28] fix recursive backup copy --- argilla-server/docker/argilla-hf-spaces/scripts/start.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/argilla-server/docker/argilla-hf-spaces/scripts/start.sh b/argilla-server/docker/argilla-hf-spaces/scripts/start.sh index 3cc87048e4..d3e0cd6d65 100644 --- a/argilla-server/docker/argilla-hf-spaces/scripts/start.sh +++ b/argilla-server/docker/argilla-hf-spaces/scripts/start.sh @@ -19,9 +19,9 @@ export PASSWORD="${PASSWORD:-$DEFAULT_PASSWORD}" # 1. Create a backup of the existing persistent storage files once. If something goes wrong, we can restore the files # from the zero-backup directory -if [ ! -d /data/argilla/backup.0 ]; then - mkdir -p /data/argilla/backup.0 - cp -r /data/argilla/* /data/argilla/backup.0 +if [ ! 
-d /data/backups/argilla ]; then + mkdir -p /data/backups/argilla + cp -r /data/argilla/* /data/backups/argilla fi # 2. Copy the backup file to the correct location From 57c635f2f17877a5c6e86cb690606c6f62feb580 Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Tue, 8 Oct 2024 17:21:51 +0200 Subject: [PATCH 10/28] backup also the server id file --- .../docker/argilla-hf-spaces/Procfile | 2 +- .../scripts/argilla_home_backup_cron.py | 77 +++++++++++++++++++ .../scripts/db_backup_cron.py | 38 --------- .../src/argilla_server/telemetry/_helpers.py | 4 +- 4 files changed, 80 insertions(+), 41 deletions(-) create mode 100644 argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py delete mode 100644 argilla-server/docker/argilla-hf-spaces/scripts/db_backup_cron.py diff --git a/argilla-server/docker/argilla-hf-spaces/Procfile b/argilla-server/docker/argilla-hf-spaces/Procfile index 40d3b11f07..6dd2ff0a65 100644 --- a/argilla-server/docker/argilla-hf-spaces/Procfile +++ b/argilla-server/docker/argilla-hf-spaces/Procfile @@ -2,4 +2,4 @@ elastic: /usr/share/elasticsearch/bin/elasticsearch redis: /usr/bin/redis-server worker: sleep 30; rq worker-pool --num-workers ${BACKGROUND_NUM_WORKERS} argilla: sleep 30; /bin/bash start_argilla_server.sh -db-backup: sleep 30; python db_backup_cron.py +home-backup: sleep 15; python argilla_home_backup_cron.py diff --git a/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py b/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py new file mode 100644 index 0000000000..fd257dc815 --- /dev/null +++ b/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py @@ -0,0 +1,77 @@ +# Copyright 2021-present, the Recognai S.L. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sqlite3 +import time +from pathlib import Path +from urllib.parse import urlparse + +from argilla_server.database import database_url_sync +from argilla_server.settings import settings +from argilla_server.telemetry import get_server_id, SERVER_ID_DAT_FILE + + +def backup(src, dst): + src_conn = sqlite3.connect(src, isolation_level="DEFERRED") + dst_conn = sqlite3.connect(dst, isolation_level="DEFERRED") + + try: + with src_conn, dst_conn: + print("Creating a db backup") + src_conn.backup(dst_conn) + print("Backup created") + finally: + src_conn.close() + dst_conn.close() + + +def db_backup(backup_folder: str, interval: int = 15): + url_db = database_url_sync() + db_path = Path(urlparse(url_db).path) + + backup_path = Path(backup_folder).absolute() + + if not backup_path.exists(): + backup_path.mkdir() + + backup_file = os.path.join(backup_path, db_path.name) + + while True: + try: + backup(src=db_path, dst=backup_file) + except Exception as e: + print(f"Error backing", e) + + time.sleep(interval) + + +def server_id_backup(backup_folder: str): + backup_path = Path(backup_folder).absolute() + if not backup_path.exists(): + backup_path.mkdir() + + # Force to create the server id file + get_server_id() + + server_id_file = os.path.join(settings.home_path, SERVER_ID_DAT_FILE) + + os.system(f"cp {server_id_file} {backup_folder}") + + +if __name__ == "__main__": + backup_folder: str = "/data/argilla/backup" + + server_id_backup(backup_folder) + db_backup(backup_folder) diff --git 
a/argilla-server/docker/argilla-hf-spaces/scripts/db_backup_cron.py b/argilla-server/docker/argilla-hf-spaces/scripts/db_backup_cron.py deleted file mode 100644 index 0dd6726e75..0000000000 --- a/argilla-server/docker/argilla-hf-spaces/scripts/db_backup_cron.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright 2021-present, the Recognai S.L. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sqlite3 -import time -from urllib.parse import urlparse -from pathlib import Path - -from argilla_server.database import database_url_sync - -url_db = database_url_sync() -db_path = Path(urlparse(url_db).path) - -backup_path = Path("/data/argilla/backup") - -if not backup_path.exists(): - backup_path.mkdir() - -while True: - with sqlite3.connect(db_path, isolation_level="DEFERRED") as conn: - backup_file = os.path.join(backup_path.absolute(), db_path.name) - - os.system(f"cp {db_path.absolute()} {backup_file}") - os.system(f"cp {db_path.absolute()}-wal {backup_file}-wal") - - time.sleep(15) diff --git a/argilla-server/src/argilla_server/telemetry/_helpers.py b/argilla-server/src/argilla_server/telemetry/_helpers.py index 95a4986bf5..09a7e0ead7 100644 --- a/argilla-server/src/argilla_server/telemetry/_helpers.py +++ b/argilla-server/src/argilla_server/telemetry/_helpers.py @@ -21,7 +21,7 @@ _LOGGER = logging.getLogger(__name__) -_SERVER_ID_DAT_FILE = "server_id.dat" +SERVER_ID_DAT_FILE = "server_id.dat" def get_server_id() -> UUID: @@ -34,7 +34,7 @@ def 
get_server_id() -> UUID: """ - server_id_file = os.path.join(settings.home_path, _SERVER_ID_DAT_FILE) + server_id_file = os.path.join(settings.home_path, SERVER_ID_DAT_FILE) if os.path.exists(server_id_file): with open(server_id_file, "r") as f: From 139e227c9b4e13783152f206e16cb74946d4252d Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Tue, 8 Oct 2024 17:22:33 +0200 Subject: [PATCH 11/28] chore: Change statement order --- argilla-server/docker/argilla-hf-spaces/Dockerfile | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/argilla-server/docker/argilla-hf-spaces/Dockerfile b/argilla-server/docker/argilla-hf-spaces/Dockerfile index 08b7cdb432..507759ef38 100644 --- a/argilla-server/docker/argilla-hf-spaces/Dockerfile +++ b/argilla-server/docker/argilla-hf-spaces/Dockerfile @@ -5,11 +5,6 @@ FROM ${ARGILLA_SERVER_IMAGE}:${ARGILLA_VERSION} USER root -# Copy Argilla distribution files -COPY scripts/* /home/argilla -COPY Procfile /home/argilla -COPY requirements.txt /packages/requirements.txt - RUN apt-get update && \ apt-get install -y apt-transport-https gnupg wget @@ -24,6 +19,13 @@ RUN wget -qO - https://packages.redis.io/gpg | gpg --dearmor -o /usr/share/keyri RUN apt-get install -y lsb-release RUN echo "deb [signed-by=/usr/share/keyrings/redis-archive-keyring.gpg] https://packages.redis.io/deb $(lsb_release -cs) main" | tee /etc/apt/sources.list.d/redis.list + +# Copy Argilla distribution files +COPY scripts/* /home/argilla +COPY Procfile /home/argilla +COPY requirements.txt /packages/requirements.txt + +# Install dependencies RUN \ # Create a directory where Argilla will store the data mkdir /data && \ From 3555f1527941d91325a51564867f7d2910489722 Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Tue, 8 Oct 2024 17:23:08 +0200 Subject: [PATCH 12/28] chore: Change init logic to copy from existing argilla files to backup folder --- .../docker/argilla-hf-spaces/scripts/start.sh | 19 +++++++++---------- 1 file changed, 9 
insertions(+), 10 deletions(-) diff --git a/argilla-server/docker/argilla-hf-spaces/scripts/start.sh b/argilla-server/docker/argilla-hf-spaces/scripts/start.sh index d3e0cd6d65..1d35857206 100644 --- a/argilla-server/docker/argilla-hf-spaces/scripts/start.sh +++ b/argilla-server/docker/argilla-hf-spaces/scripts/start.sh @@ -12,21 +12,20 @@ export OAUTH2_HUGGINGFACE_SCOPE=$OAUTH_SCOPES # See https://huggingface.co/docs/hub/en/spaces-overview#helper-environment-variables for more details DEFAULT_USERNAME=$(curl -L -s https://huggingface.co/api/users/${SPACES_CREATOR_USER_ID}/overview | jq -r '.user' || echo "${SPACE_AUTHOR_NAME}") export USERNAME="${USERNAME:-$DEFAULT_USERNAME}" + DEFAULT_PASSWORD=$(pwgen -s 16 1) export PASSWORD="${PASSWORD:-$DEFAULT_PASSWORD}" -# These lines only make sense if the container is running in a Hugging Face Spaces environment with persistent storage +if [ ! -d /data/argilla/backup ]; then + mkdir -p /data/argilla/backup +fi -# 1. Create a backup of the existing persistent storage files once. If something goes wrong, we can restore the files -# from the zero-backup directory -if [ ! -d /data/backups/argilla ]; then - mkdir -p /data/backups/argilla - cp -r /data/argilla/* /data/backups/argilla +if [ -d /data/argilla ]; then + # Initialize the backup folder with the existing argilla files + cp -r /data/argilla/* /data/argilla/backup || true fi -# 2. Copy the backup file to the correct location -cp -r /data/argilla/backup/* /data/* || true -# 3. 
Copy all the persistent storage files to the correct location -cp -r /data/argilla/* $ARGILLA_HOME_PATH +# Copy the backup files to the argilla folder +cp -r /data/argilla/backup/* $ARGILLA_HOME_PATH || true honcho start From 12dca42c010d8d93cbb3c83033d7215e9f50ea67 Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Tue, 8 Oct 2024 17:24:14 +0200 Subject: [PATCH 13/28] =?UTF-8?q?Revert=20"Revert=20"Revert=20"improvement?= =?UTF-8?q?:=20add=20some=20SQLite=20pragma=20statement=20settings=20to=20?= =?UTF-8?q?imp=E2=80=A6""?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 83703a7cfefcb9f13a7bf4893daf24fd4c3fca46. --- argilla-server/CHANGELOG.md | 4 --- argilla-server/src/argilla_server/database.py | 27 ------------------- argilla-server/tests/unit/test_database.py | 5 ---- 3 files changed, 36 deletions(-) diff --git a/argilla-server/CHANGELOG.md b/argilla-server/CHANGELOG.md index 90b66112a8..7406fb08c8 100644 --- a/argilla-server/CHANGELOG.md +++ b/argilla-server/CHANGELOG.md @@ -82,10 +82,6 @@ These are the section headers that we use: - Added `REINDEX_DATASETS` environment variable to Argilla server Docker image. ([#5268](https://github.com/argilla-io/argilla/pull/5268)) - Added `argilla-hf-spaces` docker image for running Argilla server in HF spaces. ([#5307](https://github.com/argilla-io/argilla/pull/5307)) -### Added - -- Added some new performance tuning settings for SQLite database. ([#5150](https://github.com/argilla-io/argilla/pull/5150)) - ### Changed - Change `responses` table to delete rows on cascade when a user is deleted. 
([#5126](https://github.com/argilla-io/argilla/pull/5126)) diff --git a/argilla-server/src/argilla_server/database.py b/argilla-server/src/argilla_server/database.py index cdc012f659..9d8eb7ae50 100644 --- a/argilla-server/src/argilla_server/database.py +++ b/argilla-server/src/argilla_server/database.py @@ -45,34 +45,7 @@ def set_sqlite_pragma(dbapi_connection, connection_record): if isinstance(dbapi_connection, AsyncAdapt_aiosqlite_connection): cursor = dbapi_connection.cursor() - - # Enforce foreign key constraints - # https://www.sqlite.org/pragma.html#pragma_foreign_keys - # https://www.sqlite.org/foreignkeys.html cursor.execute("PRAGMA foreign_keys = ON") - - # Journal mode WAL allows for greater concurrency (many readers + one writer) - # https://www.sqlite.org/pragma.html#pragma_journal_mode - cursor.execute("PRAGMA journal_mode = WAL") - - # Set more relaxed level of database durability - # 2 = "FULL" (sync on every write), 1 = "NORMAL" (sync every 1000 written pages) and 0 = "NONE" - # https://www.sqlite.org/pragma.html#pragma_synchronous - cursor.execute("PRAGMA synchronous = NORMAL") - - # Set the global memory map so all processes can share some data - # https://www.sqlite.org/pragma.html#pragma_mmap_size - # https://www.sqlite.org/mmap.html - cursor.execute("PRAGMA mmap_size = 134217728") # 128 megabytes - - # Impose a limit on the WAL file to prevent unlimited growth - # https://www.sqlite.org/pragma.html#pragma_journal_size_limit - cursor.execute("PRAGMA journal_size_limit = 67108864") # 64 megabytes - - # Set the local connection cache to 2000 pages - # https://www.sqlite.org/pragma.html#pragma_cache_size - cursor.execute("PRAGMA cache_size = 2000") - cursor.close() diff --git a/argilla-server/tests/unit/test_database.py b/argilla-server/tests/unit/test_database.py index 0269219be2..ccf879758c 100644 --- a/argilla-server/tests/unit/test_database.py +++ b/argilla-server/tests/unit/test_database.py @@ -26,8 +26,3 @@ async def 
test_sqlite_pragma_settings(self, db: AsyncSession): return assert (await db.execute(text("PRAGMA foreign_keys"))).scalar() == 1 - assert (await db.execute(text("PRAGMA journal_mode"))).scalar() == "wal" - assert (await db.execute(text("PRAGMA synchronous"))).scalar() == 1 - assert (await db.execute(text("PRAGMA journal_size_limit"))).scalar() == 67108864 - assert (await db.execute(text("PRAGMA mmap_size"))).scalar() == 134217728 - assert (await db.execute(text("PRAGMA cache_size"))).scalar() == 2000 From 352e93acfbc710f932de6649f4a22ae6dfa905b1 Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Tue, 8 Oct 2024 18:18:21 +0200 Subject: [PATCH 14/28] chore: Review start.sh --- .../docker/argilla-hf-spaces/scripts/start.sh | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/argilla-server/docker/argilla-hf-spaces/scripts/start.sh b/argilla-server/docker/argilla-hf-spaces/scripts/start.sh index 1d35857206..59d95b4a9d 100644 --- a/argilla-server/docker/argilla-hf-spaces/scripts/start.sh +++ b/argilla-server/docker/argilla-hf-spaces/scripts/start.sh @@ -17,15 +17,26 @@ DEFAULT_PASSWORD=$(pwgen -s 16 1) export PASSWORD="${PASSWORD:-$DEFAULT_PASSWORD}" if [ ! -d /data/argilla/backup ]; then + echo "Creating backup folder..." mkdir -p /data/argilla/backup fi -if [ -d /data/argilla ]; then - # Initialize the backup folder with the existing argilla files - cp -r /data/argilla/* /data/argilla/backup || true +# if exists the db file, copy it to the backup folder and rename it +if [ -f /data/argilla/argilla.db ]; then + echo "Found argilla.db file, moving it to the backup folder..." + cp /data/argilla/argilla.db /data/argilla/backup || true + mv /data/argilla/argilla.db /data/argilla/argilla.db.bak || true +fi + +# if exists the server id file, copy it to the argilla folder +if [ -f /data/argilla/server_id.dat ]; then + echo "Found server_id.dat file, moving it to the backup folder..." 
+ cp /data/argilla/server_id.dat /data/argilla/backup || true fi # Copy the backup files to the argilla folder +echo "Restoring files from backup folder..." cp -r /data/argilla/backup/* $ARGILLA_HOME_PATH || true +echo "Starting processes..." honcho start From 898afafb984704d456d738a3f8cc8a4505b0fae7 Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Tue, 8 Oct 2024 22:29:59 +0200 Subject: [PATCH 15/28] chore: configure lOG --- .../scripts/argilla_home_backup_cron.py | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py b/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py index fd257dc815..a0c7ac99d9 100644 --- a/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py +++ b/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import logging import os import sqlite3 import time @@ -22,6 +22,15 @@ from argilla_server.settings import settings from argilla_server.telemetry import get_server_id, SERVER_ID_DAT_FILE +logging.basicConfig( + handlers=[logging.StreamHandler()], + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + level=logging.INFO, + force=True, +) + +_LOGGER = logging.getLogger("argilla.backup") + def backup(src, dst): src_conn = sqlite3.connect(src, isolation_level="DEFERRED") @@ -29,9 +38,9 @@ def backup(src, dst): try: with src_conn, dst_conn: - print("Creating a db backup") + _LOGGER.info("Creating a db backup...") src_conn.backup(dst_conn) - print("Backup created") + _LOGGER.info("DB backup created!") finally: src_conn.close() dst_conn.close() @@ -52,7 +61,7 @@ def db_backup(backup_folder: str, interval: int = 15): try: backup(src=db_path, dst=backup_file) except Exception as e: - print(f"Error backing", e) + _LOGGER.exception(f"Error creating backup: {e}") time.sleep(interval) @@ -67,11 +76,12 @@ def server_id_backup(backup_folder: str): server_id_file = os.path.join(settings.home_path, SERVER_ID_DAT_FILE) + _LOGGER.info(f"Copying server id file to {backup_folder}") os.system(f"cp {server_id_file} {backup_folder}") + _LOGGER.info("Server id file copied!") if __name__ == "__main__": backup_folder: str = "/data/argilla/backup" - server_id_backup(backup_folder) db_backup(backup_folder) From 80a7d4edd6d990e2863a2f28fcd5cc83d65328f3 Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Tue, 8 Oct 2024 22:35:54 +0200 Subject: [PATCH 16/28] define backup interval env var --- .../scripts/argilla_home_backup_cron.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py b/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py index a0c7ac99d9..ee13650d25 100644 --- a/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py +++ 
b/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py @@ -32,7 +32,7 @@ _LOGGER = logging.getLogger("argilla.backup") -def backup(src, dst): +def _run_backup(src, dst): src_conn = sqlite3.connect(src, isolation_level="DEFERRED") dst_conn = sqlite3.connect(dst, isolation_level="DEFERRED") @@ -59,7 +59,7 @@ def db_backup(backup_folder: str, interval: int = 15): while True: try: - backup(src=db_path, dst=backup_file) + _run_backup(src=db_path, dst=backup_file) except Exception as e: _LOGGER.exception(f"Error creating backup: {e}") @@ -83,5 +83,8 @@ def server_id_backup(backup_folder: str): if __name__ == "__main__": backup_folder: str = "/data/argilla/backup" + + backup_interval = int(os.getenv("ARGILLA_BACKUP_INTERVAL") or "15") + server_id_backup(backup_folder) - db_backup(backup_folder) + db_backup(backup_folder, interval=backup_interval) From efbd7219677ed524cfdfc5b6ad274ce7a471a337 Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Wed, 9 Oct 2024 10:43:45 +0200 Subject: [PATCH 17/28] chore: Skip init if backup folder exists --- .../docker/argilla-hf-spaces/scripts/start.sh | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/argilla-server/docker/argilla-hf-spaces/scripts/start.sh b/argilla-server/docker/argilla-hf-spaces/scripts/start.sh index 59d95b4a9d..77f55235ec 100644 --- a/argilla-server/docker/argilla-hf-spaces/scripts/start.sh +++ b/argilla-server/docker/argilla-hf-spaces/scripts/start.sh @@ -17,21 +17,23 @@ DEFAULT_PASSWORD=$(pwgen -s 16 1) export PASSWORD="${PASSWORD:-$DEFAULT_PASSWORD}" if [ ! -d /data/argilla/backup ]; then - echo "Creating backup folder..." + echo "Initializing backup folder..." mkdir -p /data/argilla/backup -fi -# if exists the db file, copy it to the backup folder and rename it -if [ -f /data/argilla/argilla.db ]; then - echo "Found argilla.db file, moving it to the backup folder..." 
- cp /data/argilla/argilla.db /data/argilla/backup || true - mv /data/argilla/argilla.db /data/argilla/argilla.db.bak || true -fi + # if exists the db file, copy it to the backup folder and rename it + if [ -f /data/argilla/argilla.db ]; then + echo "Found argilla.db file, moving it to the backup folder..." + cp /data/argilla/argilla.db /data/argilla/backup || true + fi + + # if exists the server id file, copy it to the argilla folder + if [ -f /data/argilla/server_id.dat ]; then + echo "Found server_id.dat file, moving it to the backup folder..." + cp /data/argilla/server_id.dat /data/argilla/backup || true + fi -# if exists the server id file, copy it to the argilla folder -if [ -f /data/argilla/server_id.dat ]; then - echo "Found server_id.dat file, moving it to the backup folder..." - cp /data/argilla/server_id.dat /data/argilla/backup || true +else + echo "Backup folder already exists..." fi # Copy the backup files to the argilla folder From 88178892b0be24b5c90bad1331161573c792b548 Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Wed, 9 Oct 2024 10:44:03 +0200 Subject: [PATCH 18/28] chore: Rename process --- argilla-server/docker/argilla-hf-spaces/Procfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/argilla-server/docker/argilla-hf-spaces/Procfile b/argilla-server/docker/argilla-hf-spaces/Procfile index 6dd2ff0a65..c53224e140 100644 --- a/argilla-server/docker/argilla-hf-spaces/Procfile +++ b/argilla-server/docker/argilla-hf-spaces/Procfile @@ -2,4 +2,4 @@ elastic: /usr/share/elasticsearch/bin/elasticsearch redis: /usr/bin/redis-server worker: sleep 30; rq worker-pool --num-workers ${BACKGROUND_NUM_WORKERS} argilla: sleep 30; /bin/bash start_argilla_server.sh -home-backup: sleep 15; python argilla_home_backup_cron.py +argilla-backup: sleep 15; python argilla_home_backup_cron.py From 777b939073cfb5a433b4d462fa4969d35679a9b6 Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Wed, 9 Oct 2024 16:44:00 +0200 Subject: [PATCH 19/28] 
apply some changes --- .../scripts/argilla_home_backup_cron.py | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py b/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py index ee13650d25..193cfb8a42 100644 --- a/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py +++ b/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py @@ -32,15 +32,24 @@ _LOGGER = logging.getLogger("argilla.backup") -def _run_backup(src, dst): +def _run_backup(src: Path, dst_folder: str): + bak_folder = Path(dst_folder) / "bak" + + # Creating a copy of existing backup + os.system(f"rm -rf {bak_folder}/") + bak_folder.mkdir(exist_ok=True) + os.system(f"mv {os.path.join(dst_folder, src.name)}* {bak_folder}/") + + backup_file = os.path.join(dst_folder, src.name) + src_conn = sqlite3.connect(src, isolation_level="DEFERRED") - dst_conn = sqlite3.connect(dst, isolation_level="DEFERRED") + dst_conn = sqlite3.connect(backup_file, isolation_level="DEFERRED") try: + _LOGGER.info("Creating a db backup...") with src_conn, dst_conn: - _LOGGER.info("Creating a db backup...") src_conn.backup(dst_conn) - _LOGGER.info("DB backup created!") + _LOGGER.info("DB backup created!") finally: src_conn.close() dst_conn.close() @@ -55,11 +64,9 @@ def db_backup(backup_folder: str, interval: int = 15): if not backup_path.exists(): backup_path.mkdir() - backup_file = os.path.join(backup_path, db_path.name) - while True: try: - _run_backup(src=db_path, dst=backup_file) + _run_backup(src=db_path, dst_folder=backup_path) except Exception as e: _LOGGER.exception(f"Error creating backup: {e}") @@ -82,7 +89,7 @@ def server_id_backup(backup_folder: str): if __name__ == "__main__": - backup_folder: str = "/data/argilla/backup" + backup_folder: str = "./data/argilla/backup" backup_interval = int(os.getenv("ARGILLA_BACKUP_INTERVAL") or "15") From 
1da94cc8c73d59de8914a1ad50f8ba7f1ff73869 Mon Sep 17 00:00:00 2001 From: Paco Aranda Date: Wed, 9 Oct 2024 23:55:54 +0200 Subject: [PATCH 20/28] Update argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py --- .../argilla-hf-spaces/scripts/argilla_home_backup_cron.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py b/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py index 193cfb8a42..71c0b34202 100644 --- a/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py +++ b/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py @@ -89,7 +89,7 @@ def server_id_backup(backup_folder: str): if __name__ == "__main__": - backup_folder: str = "./data/argilla/backup" + backup_folder: str = "/data/argilla/backup" backup_interval = int(os.getenv("ARGILLA_BACKUP_INTERVAL") or "15") From 183711e6434f76cd1aef52b75750e413fdc84fe5 Mon Sep 17 00:00:00 2001 From: Paco Aranda Date: Thu, 10 Oct 2024 08:41:06 +0200 Subject: [PATCH 21/28] Update argilla-server/docker/argilla-hf-spaces/Procfile --- argilla-server/docker/argilla-hf-spaces/Procfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/argilla-server/docker/argilla-hf-spaces/Procfile b/argilla-server/docker/argilla-hf-spaces/Procfile index c53224e140..85b3f3facb 100644 --- a/argilla-server/docker/argilla-hf-spaces/Procfile +++ b/argilla-server/docker/argilla-hf-spaces/Procfile @@ -2,4 +2,4 @@ elastic: /usr/share/elasticsearch/bin/elasticsearch redis: /usr/bin/redis-server worker: sleep 30; rq worker-pool --num-workers ${BACKGROUND_NUM_WORKERS} argilla: sleep 30; /bin/bash start_argilla_server.sh -argilla-backup: sleep 15; python argilla_home_backup_cron.py +argilla-backup: sleep 30; python argilla_home_backup_cron.py From 73f907634f8302cc6e1143c348e9550fa17e23fb Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Thu, 10 Oct 2024 
16:19:55 +0200 Subject: [PATCH 22/28] add restore python script --- .../scripts/restore_argilla_backup.py | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 argilla-server/docker/argilla-hf-spaces/scripts/restore_argilla_backup.py diff --git a/argilla-server/docker/argilla-hf-spaces/scripts/restore_argilla_backup.py b/argilla-server/docker/argilla-hf-spaces/scripts/restore_argilla_backup.py new file mode 100644 index 0000000000..93f9a733f3 --- /dev/null +++ b/argilla-server/docker/argilla-hf-spaces/scripts/restore_argilla_backup.py @@ -0,0 +1,41 @@ +# Copyright 2021-present, the Recognai S.L. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import glob +import logging +import os + +logging.basicConfig( + handlers=[logging.StreamHandler()], + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + level=logging.INFO, + force=True, +) + +_LOGGER = logging.getLogger("argilla.backup") + +if __name__ == "__main__": + backups_path = os.environ["ARGILLA_BACKUPS_PATH"] + + folders = glob.glob(f"{backups_path}/*") + folders.sort(key=os.path.getmtime, reverse=True) + + if len(folders) > 1: + safe_backup = folders[1] + argilla_home = os.getenv("ARGILLA_HOME_PATH") + + _LOGGER.info(f"Copying {safe_backup} backup to the argilla home folder at {argilla_home}") + os.system(f"cp -r {safe_backup}/* $ARGILLA_HOME_PATH") + _LOGGER.info("Backup restored!") + else: + _LOGGER.info("No safe backup found to restore. 
Exiting...") From 4e4ec989b0ddcc52e0cba3c91a18cb3fb1c0cfff Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Thu, 10 Oct 2024 16:20:17 +0200 Subject: [PATCH 23/28] refactor: Improve backup process --- .../scripts/argilla_home_backup_cron.py | 46 +++++++++++++------ .../docker/argilla-hf-spaces/scripts/start.sh | 18 ++++---- 2 files changed, 42 insertions(+), 22 deletions(-) diff --git a/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py b/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py index 71c0b34202..ebe8b802df 100644 --- a/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py +++ b/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py @@ -18,6 +18,8 @@ from pathlib import Path from urllib.parse import urlparse +import httpx + from argilla_server.database import database_url_sync from argilla_server.settings import settings from argilla_server.telemetry import get_server_id, SERVER_ID_DAT_FILE @@ -32,30 +34,28 @@ _LOGGER = logging.getLogger("argilla.backup") -def _run_backup(src: Path, dst_folder: str): - bak_folder = Path(dst_folder) / "bak" +def _run_backup(src: Path, dst_folder: Path, backup_id: int): + backup_folder = Path(dst_folder) / str(backup_id) # Creating a copy of existing backup - os.system(f"rm -rf {bak_folder}/") - bak_folder.mkdir(exist_ok=True) - os.system(f"mv {os.path.join(dst_folder, src.name)}* {bak_folder}/") + backup_folder.mkdir(exist_ok=True) - backup_file = os.path.join(dst_folder, src.name) + backup_file = os.path.join(backup_folder, src.name) src_conn = sqlite3.connect(src, isolation_level="DEFERRED") dst_conn = sqlite3.connect(backup_file, isolation_level="DEFERRED") try: - _LOGGER.info("Creating a db backup...") + _LOGGER.info("Creating a db backup in %s", backup_file) with src_conn, dst_conn: src_conn.backup(dst_conn) - _LOGGER.info("DB backup created!") + _LOGGER.info("DB backup created at %s", backup_file) finally: 
src_conn.close() dst_conn.close() -def db_backup(backup_folder: str, interval: int = 15): +def db_backup(backup_folder: str, interval: int = 15, num_of_backups: int = 20): url_db = database_url_sync() db_path = Path(urlparse(url_db).path) @@ -64,9 +64,11 @@ def db_backup(backup_folder: str, interval: int = 15): if not backup_path.exists(): backup_path.mkdir() + backup_id = 0 while True: try: - _run_backup(src=db_path, dst_folder=backup_path) + _run_backup(src=db_path, dst_folder=backup_path, backup_id=backup_id) + backup_id = (backup_id + 1) % num_of_backups except Exception as e: _LOGGER.exception(f"Error creating backup: {e}") @@ -88,10 +90,26 @@ def server_id_backup(backup_folder: str): _LOGGER.info("Server id file copied!") -if __name__ == "__main__": - backup_folder: str = "/data/argilla/backup" +def is_argilla_alive(): + try: + with httpx.Client() as client: + response = client.get("http://localhost:6900/api/v1/status") + response.raise_for_status() + return True + except Exception as e: + _LOGGER.exception(f"Error checking if argilla is alive: {e}") + return False + +if __name__ == "__main__": + argilla_data: str = "/data/argilla" + backup_path = os.environ["ARGILLA_BACKUP_PATH"] backup_interval = int(os.getenv("ARGILLA_BACKUP_INTERVAL") or "15") + num_of_backups = int(os.getenv("ARGILLA_NUM_OF_BACKUPS") or "20") + + while not is_argilla_alive(): + _LOGGER.info("Waiting for the server to be ready...") + time.sleep(5) - server_id_backup(backup_folder) - db_backup(backup_folder, interval=backup_interval) + server_id_backup(argilla_data) + db_backup(backup_path, interval=backup_interval, num_of_backups=num_of_backups) diff --git a/argilla-server/docker/argilla-hf-spaces/scripts/start.sh b/argilla-server/docker/argilla-hf-spaces/scripts/start.sh index 77f55235ec..2dcca12c95 100644 --- a/argilla-server/docker/argilla-hf-spaces/scripts/start.sh +++ b/argilla-server/docker/argilla-hf-spaces/scripts/start.sh @@ -16,20 +16,22 @@ export 
USERNAME="${USERNAME:-$DEFAULT_USERNAME}" DEFAULT_PASSWORD=$(pwgen -s 16 1) export PASSWORD="${PASSWORD:-$DEFAULT_PASSWORD}" -if [ ! -d /data/argilla/backup ]; then - echo "Initializing backup folder..." - mkdir -p /data/argilla/backup +export ARGILLA_BACKUPS_PATH=/data/argilla/backups + +if [ ! -d "$ARGILLA_BACKUPS_PATH" ]; then + echo "Initializing backups folder..." + mkdir -p "$ARGILLA_BACKUPS_PATH" # if exists the db file, copy it to the backup folder and rename it if [ -f /data/argilla/argilla.db ]; then - echo "Found argilla.db file, moving it to the backup folder..." - cp /data/argilla/argilla.db /data/argilla/backup || true + echo "Found argilla.db file, moving it to the argilla home path..." + cp /data/argilla/argilla.db $ARGILLA_HOME_PATH || true fi # if exists the server id file, copy it to the argilla folder if [ -f /data/argilla/server_id.dat ]; then - echo "Found server_id.dat file, moving it to the backup folder..." - cp /data/argilla/server_id.dat /data/argilla/backup || true + echo "Found server_id.dat file, moving it to argilla home path..." + cp /data/argilla/server_id.dat $ARGILLA_HOME_PATH || true fi else @@ -38,7 +40,7 @@ fi # Copy the backup files to the argilla folder echo "Restoring files from backup folder..." -cp -r /data/argilla/backup/* $ARGILLA_HOME_PATH || true +python restore_argilla_backup.py echo "Starting processes..." 
honcho start From 5b35c9be614e28694ca0ebfe1cd03f439d08386a Mon Sep 17 00:00:00 2001 From: Paco Aranda Date: Thu, 10 Oct 2024 16:49:50 +0200 Subject: [PATCH 24/28] Update argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py --- .../argilla-hf-spaces/scripts/argilla_home_backup_cron.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py b/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py index ebe8b802df..cf0ca4a347 100644 --- a/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py +++ b/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py @@ -103,7 +103,7 @@ def is_argilla_alive(): if __name__ == "__main__": argilla_data: str = "/data/argilla" - backup_path = os.environ["ARGILLA_BACKUP_PATH"] + backup_path = os.environ["ARGILLA_BACKUPS_PATH"] backup_interval = int(os.getenv("ARGILLA_BACKUP_INTERVAL") or "15") num_of_backups = int(os.getenv("ARGILLA_NUM_OF_BACKUPS") or "20") From 0324f51536efe32e0c61dbba5253bb76c1303c99 Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Thu, 10 Oct 2024 17:52:14 +0200 Subject: [PATCH 25/28] ci: Update step --- .github/workflows/argilla-server.build-docker-images.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/argilla-server.build-docker-images.yml b/.github/workflows/argilla-server.build-docker-images.yml index 7e2604f1f8..f7c26b6897 100644 --- a/.github/workflows/argilla-server.build-docker-images.yml +++ b/.github/workflows/argilla-server.build-docker-images.yml @@ -92,7 +92,7 @@ jobs: path: argilla-server/docker/server/dist - name: Build and push `argilla-server` image - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: context: argilla-server/docker/server platforms: ${{ env.PLATFORMS }} @@ -102,7 +102,7 @@ jobs: - name: Push latest `argilla-server` image if: ${{ 
env.PUBLISH_LATEST == 'true' }} - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: context: argilla-server/docker/server platforms: ${{ env.PLATFORMS }} @@ -111,7 +111,7 @@ jobs: push: true - name: Build and push `argilla-hf-spaces` image - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: context: argilla-server/docker/argilla-hf-spaces platforms: ${{ env.PLATFORMS }} @@ -124,7 +124,7 @@ jobs: - name: Push latest `argilla-hf-spaces` image if: ${{ env.PUBLISH_LATEST == 'true' }} - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: context: argilla-server/docker/argilla-hf-spaces platforms: ${{ env.PLATFORMS }} From cf2fc60f0932276d4d8a0aa59f7f2f102aa9448a Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Fri, 11 Oct 2024 09:36:35 +0200 Subject: [PATCH 26/28] chore: Change logging message --- .../argilla-hf-spaces/scripts/argilla_home_backup_cron.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py b/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py index cf0ca4a347..50624a1517 100644 --- a/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py +++ b/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py @@ -97,7 +97,7 @@ def is_argilla_alive(): response.raise_for_status() return True except Exception as e: - _LOGGER.exception(f"Error checking if argilla is alive: {e}") + _LOGGER.warning(f"Argilla server is not running: {e}") return False From 9cf5978b4712df08baba8a7d735bf063d576a9d9 Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Fri, 11 Oct 2024 10:16:40 +0200 Subject: [PATCH 27/28] Clean backup folder before --- .../argilla-hf-spaces/scripts/argilla_home_backup_cron.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py 
b/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py index 50624a1517..71b96dcf58 100644 --- a/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py +++ b/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py @@ -38,6 +38,9 @@ def _run_backup(src: Path, dst_folder: Path, backup_id: int): backup_folder = Path(dst_folder) / str(backup_id) # Creating a copy of existing backup + if backup_folder.exists(): + _LOGGER.info("Removing existing backup folder %s", backup_folder) + os.system(f"rm -rf {backup_folder}") backup_folder.mkdir(exist_ok=True) backup_file = os.path.join(backup_folder, src.name) From b4a23f96e6fa1da85928bd75b9ac40e80be6cfff Mon Sep 17 00:00:00 2001 From: Francisco Aranda Date: Fri, 11 Oct 2024 10:38:07 +0200 Subject: [PATCH 28/28] increase backup id after error --- .../argilla-hf-spaces/scripts/argilla_home_backup_cron.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py b/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py index 71b96dcf58..95e81d897b 100644 --- a/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py +++ b/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py @@ -71,10 +71,10 @@ def db_backup(backup_folder: str, interval: int = 15, num_of_backups: int = 20): while True: try: _run_backup(src=db_path, dst_folder=backup_path, backup_id=backup_id) - backup_id = (backup_id + 1) % num_of_backups except Exception as e: _LOGGER.exception(f"Error creating backup: {e}") - + finally: + backup_id = (backup_id + 1) % num_of_backups time.sleep(interval)