diff --git a/.github/workflows/argilla-server.build-docker-images.yml b/.github/workflows/argilla-server.build-docker-images.yml index 7e2604f1f8..f7c26b6897 100644 --- a/.github/workflows/argilla-server.build-docker-images.yml +++ b/.github/workflows/argilla-server.build-docker-images.yml @@ -92,7 +92,7 @@ jobs: path: argilla-server/docker/server/dist - name: Build and push `argilla-server` image - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: context: argilla-server/docker/server platforms: ${{ env.PLATFORMS }} @@ -102,7 +102,7 @@ jobs: - name: Push latest `argilla-server` image if: ${{ env.PUBLISH_LATEST == 'true' }} - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: context: argilla-server/docker/server platforms: ${{ env.PLATFORMS }} @@ -111,7 +111,7 @@ jobs: push: true - name: Build and push `argilla-hf-spaces` image - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: context: argilla-server/docker/argilla-hf-spaces platforms: ${{ env.PLATFORMS }} @@ -124,7 +124,7 @@ jobs: - name: Push latest `argilla-hf-spaces` image if: ${{ env.PUBLISH_LATEST == 'true' }} - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: context: argilla-server/docker/argilla-hf-spaces platforms: ${{ env.PLATFORMS }} diff --git a/argilla-server/docker/argilla-hf-spaces/Dockerfile b/argilla-server/docker/argilla-hf-spaces/Dockerfile index 131b0a9e6b..507759ef38 100644 --- a/argilla-server/docker/argilla-hf-spaces/Dockerfile +++ b/argilla-server/docker/argilla-hf-spaces/Dockerfile @@ -5,11 +5,6 @@ FROM ${ARGILLA_SERVER_IMAGE}:${ARGILLA_VERSION} USER root -# Copy Argilla distribution files -COPY scripts/start.sh /home/argilla -COPY Procfile /home/argilla -COPY requirements.txt /packages/requirements.txt - RUN apt-get update && \ apt-get install -y apt-transport-https gnupg wget @@ -24,6 +19,13 @@ RUN wget -qO - https://packages.redis.io/gpg | gpg --dearmor -o /usr/share/keyri RUN 
apt-get install -y lsb-release RUN echo "deb [signed-by=/usr/share/keyrings/redis-archive-keyring.gpg] https://packages.redis.io/deb $(lsb_release -cs) main" | tee /etc/apt/sources.list.d/redis.list + +# Copy Argilla distribution files +COPY scripts/* /home/argilla +COPY Procfile /home/argilla +COPY requirements.txt /packages/requirements.txt + +# Install dependencies RUN \ # Create a directory where Argilla will store the data mkdir /data && \ @@ -59,7 +61,6 @@ USER argilla ENV ELASTIC_CONTAINER=true ENV ES_JAVA_OPTS="-Xms1g -Xmx1g" -ENV ARGILLA_HOME_PATH=/data/argilla ENV BACKGROUND_NUM_WORKERS=2 ENV REINDEX_DATASETS=1 diff --git a/argilla-server/docker/argilla-hf-spaces/Procfile b/argilla-server/docker/argilla-hf-spaces/Procfile index 751d36e4b4..85b3f3facb 100644 --- a/argilla-server/docker/argilla-hf-spaces/Procfile +++ b/argilla-server/docker/argilla-hf-spaces/Procfile @@ -2,3 +2,4 @@ elastic: /usr/share/elasticsearch/bin/elasticsearch redis: /usr/bin/redis-server worker: sleep 30; rq worker-pool --num-workers ${BACKGROUND_NUM_WORKERS} argilla: sleep 30; /bin/bash start_argilla_server.sh +argilla-backup: sleep 30; python argilla_home_backup_cron.py diff --git a/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py b/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py new file mode 100644 index 0000000000..95e81d897b --- /dev/null +++ b/argilla-server/docker/argilla-hf-spaces/scripts/argilla_home_backup_cron.py @@ -0,0 +1,118 @@ +# Copyright 2021-present, the Recognai S.L. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Background job that keeps rolling backups of the Argilla SQLite database.

Once the local Argilla server answers on its status endpoint, the server id
file is copied once and the database is then snapshotted forever, cycling
through a fixed number of numbered backup folders.
"""
import logging
import os
import shutil
import sqlite3
import time
from contextlib import closing
from pathlib import Path
from urllib.parse import urlparse

import httpx

from argilla_server.database import database_url_sync
from argilla_server.settings import settings
from argilla_server.telemetry import get_server_id, SERVER_ID_DAT_FILE

logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    force=True,
)

_LOGGER = logging.getLogger("argilla.backup")


def _run_backup(src: Path, dst_folder: Path, backup_id: int) -> None:
    """Snapshot the SQLite database ``src`` into ``dst_folder/<backup_id>/``.

    Whatever an earlier cycle left in that slot is removed first; the database
    is then copied with ``sqlite3.Connection.backup``.

    Args:
        src: Path of the SQLite database file to back up.
        dst_folder: Folder holding the numbered backup slots.
        backup_id: Number of the slot (sub-folder name) to write.

    Raises:
        OSError: If the slot folder cannot be cleaned up or created.
        sqlite3.Error: If either database cannot be opened or copied.
    """
    backup_folder = Path(dst_folder) / str(backup_id)

    # Slots are reused cyclically, so drop the previous content of this one.
    if backup_folder.exists():
        _LOGGER.info("Removing existing backup folder %s", backup_folder)
        if backup_folder.is_dir() and not backup_folder.is_symlink():
            shutil.rmtree(backup_folder)
        else:
            # A stray file or symlink with the slot name: `rmtree` would refuse it.
            backup_folder.unlink()
    backup_folder.mkdir(parents=True, exist_ok=True)

    backup_file = backup_folder / src.name

    # `closing` guarantees the source connection is released even when opening
    # the destination fails; the inner `with` commits or rolls back both.
    with closing(sqlite3.connect(src, isolation_level="DEFERRED")) as src_conn:
        with closing(sqlite3.connect(backup_file, isolation_level="DEFERRED")) as dst_conn:
            _LOGGER.info("Creating a db backup in %s", backup_file)
            with src_conn, dst_conn:
                src_conn.backup(dst_conn)
            _LOGGER.info("DB backup created at %s", backup_file)


def db_backup(backup_folder: str, interval: int = 15, num_of_backups: int = 20) -> None:
    """Back up the configured database forever, one slot per iteration.

    The database file is taken from the path component of
    ``database_url_sync()``. This function never returns normally.

    Args:
        backup_folder: Folder where the numbered backup slots are written.
        interval: Seconds to sleep between two backups.
        num_of_backups: Number of slots to cycle through. Values below 1 are
            replaced by 1 instead of failing on the modulo below.
    """
    url_db = database_url_sync()
    db_path = Path(urlparse(url_db).path)

    backup_path = Path(backup_folder).absolute()
    backup_path.mkdir(parents=True, exist_ok=True)

    if num_of_backups < 1:
        _LOGGER.warning("num_of_backups=%s is not valid, using a single backup slot", num_of_backups)
        num_of_backups = 1

    backup_id = 0
    while True:
        try:
            _run_backup(src=db_path, dst_folder=backup_path, backup_id=backup_id)
        except Exception as e:
            # Best effort: a failed snapshot must not stop the backup loop.
            _LOGGER.exception("Error creating backup: %s", e)

        # Outside of a `finally` so that an interrupt stops the loop right away
        # instead of sleeping one more interval first.
        backup_id = (backup_id + 1) % num_of_backups
        time.sleep(interval)


def server_id_backup(backup_folder: str) -> None:
    """Copy the server id file from the Argilla home path into ``backup_folder``.

    Copy failures are logged and swallowed so the caller can carry on.

    Args:
        backup_folder: Destination folder; created when missing.
    """
    backup_path = Path(backup_folder).absolute()
    backup_path.mkdir(parents=True, exist_ok=True)

    # Force to create the server id file
    get_server_id()

    server_id_file = os.path.join(settings.home_path, SERVER_ID_DAT_FILE)

    _LOGGER.info(f"Copying server id file to {backup_folder}")
    try:
        shutil.copy(server_id_file, backup_path)
    except shutil.SameFileError:
        # The home path and the destination are the same folder: nothing to do.
        _LOGGER.info("Server id file is already located at %s", backup_path)
    except OSError as e:
        _LOGGER.warning("Could not copy server id file %s: %s", server_id_file, e)
    else:
        _LOGGER.info("Server id file copied!")


def is_argilla_alive() -> bool:
    """Return True when the local status endpoint answers without an HTTP error."""
    try:
        with httpx.Client() as client:
            response = client.get("http://localhost:6900/api/v1/status")
            response.raise_for_status()
    except Exception as e:
        # Any failure (connection refused, timeout, error status) means "not ready yet".
        _LOGGER.warning("Argilla server is not running: %s", e)
        return False
    return True


if __name__ == "__main__":
    argilla_data: str = "/data/argilla"
    backup_path = os.environ["ARGILLA_BACKUPS_PATH"]
    backup_interval = int(os.getenv("ARGILLA_BACKUP_INTERVAL") or "15")
    num_of_backups = int(os.getenv("ARGILLA_NUM_OF_BACKUPS") or "20")

    while not is_argilla_alive():
        _LOGGER.info("Waiting for the server to be ready...")
        time.sleep(5)

    server_id_backup(argilla_data)
    db_backup(backup_path, interval=backup_interval, num_of_backups=num_of_backups)
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Restore the Argilla home folder from the backups folder at start-up."""
import glob
import logging
import os
import shutil
from typing import Optional

logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    force=True,
)

_LOGGER = logging.getLogger("argilla.backup")


def _find_safe_backup(backups_path: str) -> Optional[str]:
    """Return the second most recently modified backup folder, or None.

    Only directories are considered. The newest folder is skipped on purpose.
    NOTE(review): presumably because the newest one may have been interrupted
    while it was being written — confirm.
    """
    folders = [entry for entry in glob.glob(os.path.join(backups_path, "*")) if os.path.isdir(entry)]
    folders.sort(key=os.path.getmtime, reverse=True)
    return folders[1] if len(folders) > 1 else None


def restore_safe_backup(backups_path: str, argilla_home: Optional[str]) -> bool:
    """Copy the content of the safe backup folder into ``argilla_home``.

    Args:
        backups_path: Folder that contains one sub-folder per backup.
        argilla_home: Destination folder; created when missing.

    Returns:
        True when a backup was copied, False when there was nothing safe to
        restore, no destination was given, or the copy failed. Failures are
        logged and never raised, so start-up can continue.
    """
    safe_backup = _find_safe_backup(backups_path)
    if safe_backup is None:
        _LOGGER.info("No safe backup found to restore. Exiting...")
        return False

    if not argilla_home:
        _LOGGER.error("ARGILLA_HOME_PATH is not set, cannot restore backup %s", safe_backup)
        return False

    _LOGGER.info(f"Copying {safe_backup} backup to the argilla home folder at {argilla_home}")
    try:
        # Merge into the existing home folder instead of shelling out to `cp -r`.
        shutil.copytree(safe_backup, argilla_home, dirs_exist_ok=True)
    except OSError as e:
        # `shutil.Error` is an `OSError` subclass, so partial-copy errors land here too.
        _LOGGER.error("Could not restore backup %s: %s", safe_backup, e)
        return False

    _LOGGER.info("Backup restored!")
    return True


if __name__ == "__main__":
    restore_safe_backup(os.environ["ARGILLA_BACKUPS_PATH"], os.getenv("ARGILLA_HOME_PATH"))
# See https://huggingface.co/docs/hub/en/spaces-overview#helper-environment-variables for more details
DEFAULT_USERNAME=$(curl -L -s https://huggingface.co/api/users/${SPACES_CREATOR_USER_ID}/overview | jq -r '.user' || echo "${SPACE_AUTHOR_NAME}")
export USERNAME="${USERNAME:-$DEFAULT_USERNAME}"

DEFAULT_PASSWORD=$(pwgen -s 16 1)
export PASSWORD="${PASSWORD:-$DEFAULT_PASSWORD}"

export ARGILLA_BACKUPS_PATH=/data/argilla/backups

# The variable must be expanded here: without the `$` the test and the mkdir
# act on a literal "ARGILLA_BACKUPS_PATH" entry in the working directory.
if [ ! -d "$ARGILLA_BACKUPS_PATH" ]; then
  echo "Initializing backups folder..."
  mkdir -p "$ARGILLA_BACKUPS_PATH"

  # If a db file exists under /data/argilla, copy it to the argilla home path.
  # Best effort: a failed copy must not abort the start-up.
  if [ -f /data/argilla/argilla.db ]; then
    echo "Found argilla.db file, moving it to the argilla home path..."
    cp /data/argilla/argilla.db "$ARGILLA_HOME_PATH" || true
  fi

  # If a server id file exists under /data/argilla, copy it to the argilla home path.
  if [ -f /data/argilla/server_id.dat ]; then
    echo "Found server_id.dat file, moving it to argilla home path..."
    cp /data/argilla/server_id.dat "$ARGILLA_HOME_PATH" || true
  fi

else
  echo "Backup folder already exists..."
fi

# Copy the backup files to the argilla folder
echo "Restoring files from backup folder..."
python restore_argilla_backup.py

echo "Starting processes..."
honcho start diff --git a/argilla-server/src/argilla_server/telemetry/_helpers.py b/argilla-server/src/argilla_server/telemetry/_helpers.py index 95a4986bf5..09a7e0ead7 100644 --- a/argilla-server/src/argilla_server/telemetry/_helpers.py +++ b/argilla-server/src/argilla_server/telemetry/_helpers.py @@ -21,7 +21,7 @@ _LOGGER = logging.getLogger(__name__) -_SERVER_ID_DAT_FILE = "server_id.dat" +SERVER_ID_DAT_FILE = "server_id.dat" def get_server_id() -> UUID: @@ -34,7 +34,7 @@ def get_server_id() -> UUID: """ - server_id_file = os.path.join(settings.home_path, _SERVER_ID_DAT_FILE) + server_id_file = os.path.join(settings.home_path, SERVER_ID_DAT_FILE) if os.path.exists(server_id_file): with open(server_id_file, "r") as f: