diff --git a/README.md b/README.md index b236f804..f33f4842 100644 --- a/README.md +++ b/README.md @@ -101,6 +101,18 @@ We'll start by setting up our cluster with the environment and compute configura +### Credentials +```bash +touch .env +``` +```bash +# Inside .env +GITHUB_USERNAME="CHANGE_THIS_TO_YOUR_USERNAME" # ← CHANGE THIS +``` +```bash +source .env +``` + ### Git setup Create a repository by following these instructions: [Create a new repository](https://github.com/new) → name it `Made-With-ML` → Toggle `Add a README file` (**very important** as this creates a `main` branch) → Click `Create repository` (scroll down) @@ -109,7 +120,7 @@ Now we're ready to clone the repository that has all of our code: ```bash git clone https://github.com/GokuMohandas/Made-With-ML.git . -git remote set-url origin https://github.com/GITHUB_USERNAME/Made-With-ML.git # <-- CHANGE THIS to your username +git remote set-url origin https://github.com/$GITHUB_USERNAME/Made-With-ML.git # <-- CHANGE THIS to your username git checkout -b dev ``` @@ -317,15 +328,7 @@ python madewithml/predict.py predict \ python madewithml/serve.py --run_id $RUN_ID ``` - While the application is running, we can use it via cURL, Python, etc.: - - ```bash - # via cURL - curl -X POST -H "Content-Type: application/json" -d '{ - "title": "Transfer learning with transformers", - "description": "Using transformers for transfer learning on text classification tasks." - }' http://127.0.0.1:8000/predict - ``` + Once the application is running, we can use it via cURL, Python, etc.: ```python # via Python @@ -341,13 +344,6 @@ python madewithml/predict.py predict \ ray stop # shutdown ``` -```bash -export HOLDOUT_LOC="https://raw.githubusercontent.com/GokuMohandas/Made-With-ML/main/datasets/holdout.csv" -curl -X POST -H "Content-Type: application/json" -d '{ - "dataset_loc": "https://raw.githubusercontent.com/GokuMohandas/Made-With-ML/main/datasets/holdout.csv" - }' http://127.0.0.1:8000/evaluate -``` -
@@ -362,15 +358,7 @@ curl -X POST -H "Content-Type: application/json" -d '{ python madewithml/serve.py --run_id $RUN_ID ``` - While the application is running, we can use it via cURL, Python, etc.: - - ```bash - # via cURL - curl -X POST -H "Content-Type: application/json" -d '{ - "title": "Transfer learning with transformers", - "description": "Using transformers for transfer learning on text classification tasks." - }' http://127.0.0.1:8000/predict - ``` + Once the application is running, we can use it via cURL, Python, etc.: ```python # via Python @@ -399,7 +387,7 @@ export RUN_ID=$(python madewithml/predict.py get-best-run-id --experiment-name $ pytest --run-id=$RUN_ID tests/model --verbose --disable-warnings # Coverage -python3 -m pytest --cov madewithml --cov-report html +python3 -m pytest tests/code --cov madewithml --cov-report html --disable-warnings ``` ## Production diff --git a/madewithml/__init__.py b/madewithml/__init__.py new file mode 100644 index 00000000..bf6bd6c5 --- /dev/null +++ b/madewithml/__init__.py @@ -0,0 +1,3 @@ +from dotenv import load_dotenv + +load_dotenv() diff --git a/madewithml/config.py b/madewithml/config.py index 2319419f..93b1de9d 100644 --- a/madewithml/config.py +++ b/madewithml/config.py @@ -1,5 +1,6 @@ # config.py import logging +import os import sys from pathlib import Path @@ -10,9 +11,10 @@ ROOT_DIR = Path(__file__).parent.parent.absolute() LOGS_DIR = Path(ROOT_DIR, "logs") LOGS_DIR.mkdir(parents=True, exist_ok=True) +EFS_DIR = Path(f"/efs/shared_storage/madewithml/{os.environ.get('GITHUB_USERNAME', '')}") # Config MLflow -MODEL_REGISTRY = Path("/tmp/mlflow") +MODEL_REGISTRY = Path(f"{EFS_DIR}/mlflow") Path(MODEL_REGISTRY).mkdir(parents=True, exist_ok=True) MLFLOW_TRACKING_URI = "file://" + str(MODEL_REGISTRY.absolute()) mlflow.set_tracking_uri(MLFLOW_TRACKING_URI) diff --git a/madewithml/predict.py b/madewithml/predict.py index b97e9f89..45f92a2b 100644 --- a/madewithml/predict.py +++ b/madewithml/predict.py @@ -125,7 
+125,7 @@ def predict( # Load components best_checkpoint = get_best_checkpoint(run_id=run_id) predictor = TorchPredictor.from_checkpoint(best_checkpoint) - preprocessor = predictor.get_preprocessor() + # preprocessor = predictor.get_preprocessor() # Predict sample_df = pd.DataFrame([{"title": title, "description": description, "tag": "other"}]) diff --git a/madewithml/serve.py b/madewithml/serve.py index 2e30ab28..331a74b6 100644 --- a/madewithml/serve.py +++ b/madewithml/serve.py @@ -1,4 +1,5 @@ import argparse +import os from http import HTTPStatus from typing import Dict @@ -75,5 +76,5 @@ async def _predict(self, request: Request) -> Dict: parser.add_argument("--run_id", help="run ID to use for serving.") parser.add_argument("--threshold", type=float, default=0.9, help="threshold for `other` class.") args = parser.parse_args() - ray.init() + ray.init(runtime_env={"env_vars": {"GITHUB_USERNAME": os.environ["GITHUB_USERNAME"]}}) serve.run(ModelDeployment.bind(run_id=args.run_id, threshold=args.threshold)) diff --git a/madewithml/train.py b/madewithml/train.py index 00e1a44f..75a63f7c 100644 --- a/madewithml/train.py +++ b/madewithml/train.py @@ -1,5 +1,6 @@ import datetime import json +import os from typing import Tuple import numpy as np @@ -23,7 +24,7 @@ from typing_extensions import Annotated from madewithml import data, models, utils -from madewithml.config import MLFLOW_TRACKING_URI, logger +from madewithml.config import EFS_DIR, MLFLOW_TRACKING_URI, logger # Initialize Typer CLI app app = typer.Typer() @@ -200,10 +201,7 @@ def train_model( ) # Run config - run_config = RunConfig( - callbacks=[mlflow_callback], - checkpoint_config=checkpoint_config, - ) + run_config = RunConfig(callbacks=[mlflow_callback], checkpoint_config=checkpoint_config, storage_path=EFS_DIR) # Dataset ds = data.load_data(dataset_loc=dataset_loc, num_samples=train_loop_config["num_samples"]) @@ -252,5 +250,5 @@ def train_model( if __name__ == "__main__": # pragma: no cover, application 
if ray.is_initialized(): ray.shutdown() - ray.init() + ray.init(runtime_env={"env_vars": {"GITHUB_USERNAME": os.environ["GITHUB_USERNAME"]}}) app() diff --git a/madewithml/tune.py b/madewithml/tune.py index 77e21f4f..13d0d437 100644 --- a/madewithml/tune.py +++ b/madewithml/tune.py @@ -1,5 +1,6 @@ import datetime import json +import os import ray import typer @@ -19,7 +20,7 @@ from typing_extensions import Annotated from madewithml import data, train, utils -from madewithml.config import MLFLOW_TRACKING_URI, logger +from madewithml.config import EFS_DIR, MLFLOW_TRACKING_URI, logger # Initialize Typer CLI app app = typer.Typer() @@ -117,10 +118,7 @@ def tune_models( experiment_name=experiment_name, save_artifact=True, ) - run_config = RunConfig( - callbacks=[mlflow_callback], - checkpoint_config=checkpoint_config, - ) + run_config = RunConfig(callbacks=[mlflow_callback], checkpoint_config=checkpoint_config, storage_path=EFS_DIR) # Hyperparameters to start with initial_params = json.loads(initial_params) @@ -178,5 +176,5 @@ def tune_models( if __name__ == "__main__": # pragma: no cover, application if ray.is_initialized(): ray.shutdown() - ray.init() + ray.init(runtime_env={"env_vars": {"GITHUB_USERNAME": os.environ["GITHUB_USERNAME"]}}) app() diff --git a/notebooks/madewithml.ipynb b/notebooks/madewithml.ipynb index b0f40895..04ab696d 100644 --- a/notebooks/madewithml.ipynb +++ b/notebooks/madewithml.ipynb @@ -1,7 +1,6 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "acbetMKBt825" @@ -29,7 +28,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "oh-HuNfDrPg0" @@ -39,7 +37,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "XTNsIiUrqoJW" @@ -53,7 +50,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "tags": [] @@ -63,7 +59,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -72,13 +67,15 @@ }, { "cell_type": "code", - 
"execution_count": 1, + "execution_count": 163, "metadata": { "tags": [] }, "outputs": [], "source": [ - "import ray" + "import ray\n", + "from dotenv import load_dotenv; load_dotenv()\n", + "import warnings; warnings.filterwarnings(\"ignore\")" ] }, { @@ -92,17 +89,16 @@ "name": "stderr", "output_type": "stream", "text": [ - "Snapshotting files: 100%|██████████| 4/4 [00:00<00:00, 578.90file/s]\n", - "2023-07-24 18:09:33,959\tINFO worker.py:1431 -- Connecting to existing Ray cluster at address: 10.0.56.150:6379...\n", - "2023-07-24 18:09:34,005\tINFO worker.py:1612 -- Connected to Ray cluster. View the dashboard at \u001b[1m\u001b[32mhttps://session-cqhhlg21by7asj3ryvctbha9ek.i.anyscaleuserdata.com \u001b[39m\u001b[22m\n", - "2023-07-24 18:09:34,010\tINFO packaging.py:346 -- Pushing file package 'gcs://_ray_pkg_6914fbe58df99e42f77368975fb6b629.zip' (1.24MiB) to Ray cluster...\n", - "2023-07-24 18:09:34,014\tINFO packaging.py:359 -- Successfully pushed file package 'gcs://_ray_pkg_6914fbe58df99e42f77368975fb6b629.zip'.\n" + "2023-09-14 15:00:03,106\tINFO worker.py:1431 -- Connecting to existing Ray cluster at address: 10.0.41.50:6379...\n", + "2023-09-14 15:00:03,149\tINFO worker.py:1612 -- Connected to Ray cluster. 
View the dashboard at \u001b[1m\u001b[32mhttps://session-59p8a57mrjelmhz1ec7m8c7yke.i.anyscaleuserdata.com \u001b[39m\u001b[22m\n", + "2023-09-14 15:00:03,155\tINFO packaging.py:346 -- Pushing file package 'gcs://_ray_pkg_b89128278b4b23728836e7ef293f2994.zip' (1.46MiB) to Ray cluster...\n", + "2023-09-14 15:00:03,159\tINFO packaging.py:359 -- Successfully pushed file package 'gcs://_ray_pkg_b89128278b4b23728836e7ef293f2994.zip'.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "58d1ddc62cd74bf19166797b77a5da12", + "model_id": "0600f97bbc1944ac9a46fc1edce91293", "version_major": 2, "version_minor": 0 }, @@ -134,7 +130,7 @@ " \n", " \n", " Dashboard:\n", - " http://session-cqhhlg21by7asj3ryvctbha9ek.i.anyscaleuserdata.com\n", + " http://session-59p8a57mrjelmhz1ec7m8c7yke.i.anyscaleuserdata.com\n", "\n", "\n", "\n", @@ -143,7 +139,7 @@ "\n" ], "text/plain": [ - "RayContext(dashboard_url='session-cqhhlg21by7asj3ryvctbha9ek.i.anyscaleuserdata.com', python_version='3.10.8', ray_version='2.6.0', ray_commit='0db82e31e249eac614f7c8e7da1c4f8f05c9064a', protocol_version=None)" + "RayContext(dashboard_url='session-59p8a57mrjelmhz1ec7m8c7yke.i.anyscaleuserdata.com', python_version='3.10.8', ray_version='2.6.0', ray_commit='0db82e31e249eac614f7c8e7da1c4f8f05c9064a', protocol_version=None)" ] }, "execution_count": 2, @@ -168,11 +164,13 @@ { "data": { "text/plain": [ - "{'CPU': 8.0,\n", - " 'object_store_memory': 9492578304.0,\n", - " 'memory': 34359738368.0,\n", + "{'node:10.0.41.50': 1.0,\n", + " 'CPU': 24.0,\n", " 'node:__internal_head__': 1.0,\n", - " 'node:10.0.56.150': 1.0}" + " 'object_store_memory': 28764595813.0,\n", + " 'memory': 103079215104.0,\n", + " 'GPU': 1.0,\n", + " 'node:10.0.50.151': 1.0}" ] }, "execution_count": 3, @@ -185,7 +183,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -206,7 +203,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -219,7 +215,27 
@@ ] }, { - "attachments": {}, + "cell_type": "code", + "execution_count": 165, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/efs/shared_storage/madewithml/GokuMohandas\n" + ] + } + ], + "source": [ + "# Storage\n", + "EFS_DIR = f\"/efs/shared_storage/madewithml/{os.environ['GITHUB_USERNAME']}\"\n", + "print (EFS_DIR)" + ] + }, + { "cell_type": "markdown", "metadata": {}, "source": [ @@ -227,7 +243,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "tags": [] @@ -238,7 +253,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": { "tags": [] }, @@ -249,7 +264,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": { "tags": [] }, @@ -343,7 +358,7 @@ "4 A PyTorch Implementation of \"Watch Your Step: ... other " ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -356,7 +371,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "tags": [] @@ -367,7 +381,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": { "tags": [] }, @@ -378,7 +392,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": { "tags": [] }, @@ -394,7 +408,7 @@ "Name: count, dtype: int64" ] }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -406,7 +420,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": { "tags": [] }, @@ -419,7 +433,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": { "tags": [] }, @@ -435,7 +449,7 @@ "Name: count, dtype: int64" ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -447,7 +461,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": { "tags": [] 
}, @@ -463,7 +477,7 @@ "Name: count, dtype: int64" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -474,7 +488,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "WuCrsbxbNkSV" @@ -484,7 +497,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "eOJ3nlEgnSTJ" @@ -495,7 +507,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": { "id": "tHdQmqTBNkSV", "tags": [] @@ -511,7 +523,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": { "tags": [] }, @@ -525,7 +537,7 @@ " ('mlops', 63)]" ] }, - "execution_count": 13, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -538,7 +550,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -571,7 +583,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "pfjVstecaFC5" @@ -582,7 +593,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -605,16 +616,16 @@ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 15, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -637,7 +648,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "b8ua3MFhrOaX" @@ -647,7 +657,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "HFifXKl_eKsN" @@ -657,7 +666,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "RxAZ1AmteRaD" @@ -668,7 +676,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": { "tags": [] }, @@ -682,7 +690,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "6VgTwEQboTGc" @@ -692,7 +699,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "U_001GPyMZsC" @@ -703,7 +709,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": { "id": "3x1ldAFQNkSU", "tags": [] @@ -715,7 +721,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -724,7 +729,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -750,7 +755,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": { "id": "VfdWkkV8LlNR", "tags": [] @@ -778,7 +783,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -805,7 +810,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -814,7 +818,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "metadata": { "tags": [] }, @@ -883,7 +887,7 @@ "4 attentionwalk pytorch implementation watch ste... 
other" ] }, - "execution_count": 21, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -897,7 +901,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -905,7 +908,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -914,7 +916,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "metadata": { "tags": [] }, @@ -928,7 +930,7 @@ " 'other': 3}" ] }, - "execution_count": 22, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -943,7 +945,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "metadata": { "tags": [] }, @@ -1012,7 +1014,7 @@ "4 attentionwalk pytorch implementation watch ste... 3" ] }, - "execution_count": 23, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -1025,7 +1027,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "metadata": { "tags": [] }, @@ -1037,7 +1039,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "metadata": { "tags": [] }, @@ -1048,7 +1050,7 @@ "['computer-vision', 'computer-vision', 'other', 'other', 'other']" ] }, - "execution_count": 25, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -1059,7 +1061,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1067,7 +1068,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1076,7 +1076,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "metadata": { "tags": [] }, @@ -1088,7 +1088,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "metadata": { "tags": [] }, @@ -1115,7 +1115,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 29, "metadata": { "tags": [] }, @@ -1129,7 +1129,7 @@ }, { "cell_type": "code", - "execution_count": 29, + 
"execution_count": 30, "metadata": { "tags": [] }, @@ -1144,7 +1144,7 @@ " 'targets': array([2])}" ] }, - "execution_count": 29, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -1155,7 +1155,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1163,7 +1162,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1172,7 +1170,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 31, "metadata": { "tags": [] }, @@ -1191,7 +1189,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 32, "metadata": { "tags": [] }, @@ -1243,7 +1241,7 @@ " 2, 3, 2, 2, 0, 1, 2, 2, 2, 0, 1, 2, 1, 3, 0, 2, 3])}" ] }, - "execution_count": 31, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -1254,7 +1252,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1262,7 +1259,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -1271,7 +1267,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 33, "metadata": { "tags": [] }, @@ -1285,7 +1281,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 34, "metadata": { "tags": [] }, @@ -1294,11 +1290,11 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-07-24 18:09:40,715\tINFO read_api.py:374 -- To satisfy the requested parallelism of 16, each read task output will be split into 16 smaller blocks.\n", - "2023-07-24 18:09:40,719\tINFO dataset.py:2180 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.\n", - "2023-07-24 18:09:40,720\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(16)] -> AllToAllOperator[RandomShuffle]\n", - "2023-07-24 18:09:40,721\tINFO streaming_executor.py:93 -- Execution config: 
ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-07-24 18:09:40,722\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2023-09-14 15:00:09,809\tINFO read_api.py:374 -- To satisfy the requested parallelism of 48, each read task output will be split into 48 smaller blocks.\n", + "2023-09-14 15:00:09,814\tINFO dataset.py:2180 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.\n", + "2023-09-14 15:00:09,816\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(48)] -> AllToAllOperator[RandomShuffle]\n", + "2023-09-14 15:00:09,817\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-09-14 15:00:09,817\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, { @@ -1309,7 +1305,7 @@ "version_minor": 0 }, "text/plain": [ - "- RandomShuffle 1: 0%| | 0/16 [00:00 TaskPoolMapOperator[ReadCSV->SplitBlocks(16)] -> AllToAllOperator[RandomShuffle] -> LimitOperator[limit=1]\n", - "2023-07-24 18:09:41,438\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-07-24 18:09:41,439\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run 
`ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2023-09-14 15:00:13,415\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(48)] -> AllToAllOperator[RandomShuffle] -> LimitOperator[limit=1]\n", + "2023-09-14 15:00:13,416\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-09-14 15:00:13,418\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, { @@ -1403,7 +1399,7 @@ "version_minor": 0 }, "text/plain": [ - "- RandomShuffle 1: 0%| | 0/16 [00:00 TaskPoolMapOperator[ReadCSV->SplitBlocks(16)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> LimitOperator[limit=1]\n", - "2023-07-24 18:09:45,960\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-07-24 18:09:45,961\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2023-09-14 15:00:15,649\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(48)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> LimitOperator[limit=1]\n", + "2023-09-14 15:00:15,650\tINFO streaming_executor.py:93 -- Execution config: 
ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-09-14 15:00:15,650\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, { @@ -1482,7 +1478,7 @@ "version_minor": 0 }, "text/plain": [ - "- RandomShuffle 1: 0%| | 0/16 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/16 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/48 [00:00 TaskPoolMapOperator[ReadCSV->SplitBlocks(16)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> AllToAllOperator[Aggregate] -> TaskPoolMapOperator[MapBatches()]\n", - "2023-07-24 18:09:47,955\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-07-24 18:09:47,957\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2023-09-14 15:00:17,755\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(48)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> AllToAllOperator[Aggregate] -> TaskPoolMapOperator[MapBatches()]\n", + "2023-09-14 15:00:17,756\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", 
+ "2023-09-14 15:00:17,757\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, { @@ -1659,7 +1655,7 @@ "version_minor": 0 }, "text/plain": [ - "- RandomShuffle 1: 0%| | 0/16 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/16 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/48 [00:00 TaskPoolMapOperator[ReadCSV->SplitBlocks(16)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> TaskPoolMapOperator[MapBatches(preprocess)]\n", - "2023-07-24 18:09:48,802\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-07-24 18:09:48,802\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2023-09-14 15:00:19,882\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(48)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> TaskPoolMapOperator[MapBatches(preprocess)]\n", + "2023-09-14 15:00:19,882\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-09-14 15:00:19,883\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, { @@ -1904,7 +1902,7 @@ "version_minor": 0 }, 
"text/plain": [ - "- RandomShuffle 1: 0%| | 0/16 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/16 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/48 [00:00 TaskPoolMapOperator[ReadCSV->SplitBlocks(16)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> TaskPoolMapOperator[MapBatches(preprocess)]\n", - "2023-07-24 18:09:52,026\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-07-24 18:09:52,026\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2023-09-14 15:00:23,986\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(48)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> TaskPoolMapOperator[MapBatches(preprocess)]\n", + "2023-09-14 15:00:23,987\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-09-14 15:00:23,988\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, { @@ -2460,7 +2451,7 @@ "version_minor": 0 }, "text/plain": [ - "- RandomShuffle 1: 0%| | 0/16 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/16 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/48 [00:00 TaskPoolMapOperator[ReadCSV->SplitBlocks(16)] -> 
AllToAllOperator[RandomShuffle] -> LimitOperator[limit=1]\n", - "2023-07-24 18:09:53,142\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-07-24 18:09:53,143\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2023-09-14 15:00:26,135\tINFO read_api.py:374 -- To satisfy the requested parallelism of 48, each read task output will be split into 48 smaller blocks.\n", + "2023-09-14 15:00:26,138\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(48)] -> AllToAllOperator[RandomShuffle] -> LimitOperator[limit=1]\n", + "2023-09-14 15:00:26,139\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-09-14 15:00:26,139\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, { @@ -2889,7 +2876,7 @@ "version_minor": 0 }, "text/plain": [ - "- RandomShuffle 1: 0%| | 0/16 [00:00 TaskPoolMapOperator[ReadCSV->SplitBlocks(16)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> LimitOperator[limit=1]\n", - "2023-07-24 18:09:53,361\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-07-24 
18:09:53,362\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2023-09-14 15:00:26,512\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(48)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> LimitOperator[limit=1]\n", + "2023-09-14 15:00:26,513\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-09-14 15:00:26,513\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, { @@ -2969,7 +2955,7 @@ "version_minor": 0 }, "text/plain": [ - "- RandomShuffle 1: 0%| | 0/16 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/16 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/48 [00:00 TaskPoolMapOperator[ReadCSV->SplitBlocks(16)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> AllToAllOperator[Aggregate] -> TaskPoolMapOperator[MapBatches()]\n", - "2023-07-24 18:09:53,953\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-07-24 18:09:53,953\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2023-09-14 15:00:27,771\tINFO streaming_executor.py:92 -- Executing DAG 
InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(48)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> AllToAllOperator[Aggregate] -> TaskPoolMapOperator[MapBatches()]\n", + "2023-09-14 15:00:27,772\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-09-14 15:00:27,773\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, { @@ -3146,7 +3132,7 @@ "version_minor": 0 }, "text/plain": [ - "- RandomShuffle 1: 0%| | 0/16 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/16 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/48 [00:00 TaskPoolMapOperator[ReadCSV->SplitBlocks(16)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> TaskPoolMapOperator[MapBatches(CustomPreprocessor._transform_pandas)]\n", - "2023-07-24 18:09:54,693\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-07-24 18:09:54,693\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2023-09-14 15:00:29,414\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(48)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> 
AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> TaskPoolMapOperator[MapBatches(CustomPreprocessor._transform_pandas)]\n", + "2023-09-14 15:00:29,415\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-09-14 15:00:29,416\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, { @@ -3379,7 +3365,7 @@ "version_minor": 0 }, "text/plain": [ - "- RandomShuffle 1: 0%| | 0/16 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/16 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/48 [00:00 TaskPoolMapOperator[ReadCSV->SplitBlocks(16)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> TaskPoolMapOperator[MapBatches(CustomPreprocessor._transform_pandas)]\n", - "2023-07-24 18:09:55,680\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-07-24 18:09:55,681\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2023-09-14 15:00:31,130\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(48)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> TaskPoolMapOperator[MapBatches(CustomPreprocessor._transform_pandas)]\n", + "2023-09-14 15:00:31,130\tINFO 
streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-09-14 15:00:31,131\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, { @@ -3556,7 +3542,7 @@ "version_minor": 0 }, "text/plain": [ - "- RandomShuffle 1: 0%| | 0/16 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/16 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/48 [00:00Tune Status\n", " \n", "\n", - "\n", - "\n", - "\n", + "\n", + "\n", + "\n", "\n", "
Current time:2023-07-24 18:11:25
Running for: 00:01:29.04
Memory: 7.6/30.9 GiB
Current time:2023-09-14 15:02:25
Running for: 00:01:52.74
Memory: 6.4/30.9 GiB
\n", " \n", "
\n", "
\n", "

System Info

\n", - " Using FIFO scheduling algorithm.
Logical resource usage: 4.0/12 CPUs, 1.0/1 GPUs\n", + " Using FIFO scheduling algorithm.
Logical resource usage: 4.0/24 CPUs, 1.0/1 GPUs\n", "
\n", " \n", " \n", @@ -3804,10 +3790,10 @@ "

Trial Status

\n", " \n", "\n", - "\n", + "\n", "\n", "\n", - "\n", + "\n", "\n", "
Trial name status loc iter total time (s) epoch lr train_loss
Trial name status loc iter total time (s) epoch lr train_loss
TorchTrainer_f80f6_00000TERMINATED10.0.56.150:281250 10 77.6459 90.0001 0.0364764
TorchTrainer_2044b_00000TERMINATED10.0.50.151:49611 10 106.8 98e-05 0.0400874
\n", " \n", @@ -3850,48 +3836,40 @@ "metadata": {}, "output_type": "display_data" }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(autoscaler +30s) Tip: use `ray status` to view detailed cluster status. To disable these messages, set RAY_SCHEDULER_EVENTS=0.\n", - "(autoscaler +30s) Resized to 12 CPUs, 1 GPUs.\n" - ] - }, { "name": "stderr", "output_type": "stream", "text": [ - "(TorchTrainer pid=281250) The dict form of `dataset_config` is deprecated. Use the DataConfig class instead. Support for this will be dropped in a future release.\n", - "(TorchTrainer pid=281250) The `preprocessor` arg to Trainer is deprecated. Apply preprocessor transformations ahead of time by calling `preprocessor.transform(ds)`. Support for the preprocessor arg will be dropped in a future release.\n", - "(TorchTrainer pid=281250) Starting distributed worker processes: ['2349 (10.0.6.57)']\n", - "(RayTrainWorker pid=2349, ip=10.0.6.57) Setting up process group for: env:// [rank=0, world_size=1]\n", - "Downloading (…)lve/main/config.json: 100%|██████████| 385/385 [00:00<00:00, 2.83MB/s]\n", - "Downloading pytorch_model.bin: 0%| | 0.00/442M [00:00\n", " 0\n", " 0\n", - " 0.0001\n", - " 0.578165\n", - " 0.492538\n", - " 1690247421\n", - " 19.374968\n", + " 0.00010\n", + " 0.579703\n", + " 0.498519\n", + " 1694728846\n", + " 16.904209\n", " True\n", " False\n", " 1\n", - " f80f6_00000\n", - " 2023-07-24_18-10-25\n", - " 19.374968\n", - " 281250\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 19.374968\n", + " 2044b_00000\n", + " 2023-09-14_15-00-53\n", + " 16.904209\n", + " 49611\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 16.904209\n", " 1\n", " \n", " \n", " 1\n", " 1\n", - " 0.0001\n", - " 0.486276\n", - " 0.419530\n", - " 1690247428\n", - " 6.751568\n", + " 0.00010\n", + " 0.480723\n", + " 0.407913\n", + " 1694728856\n", + " 10.286396\n", " True\n", " False\n", " 2\n", - " f80f6_00000\n", - " 2023-07-24_18-10-31\n", - " 26.126536\n", - " 281250\n", - " 
ip-10-0-56-150\n", - " 10.0.56.150\n", - " 26.126536\n", + " 2044b_00000\n", + " 2023-09-14_15-01-03\n", + " 27.190605\n", + " 49611\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 27.190605\n", " 2\n", " \n", " \n", " 2\n", " 2\n", - " 0.0001\n", - " 0.398447\n", - " 0.317161\n", - " 1690247435\n", - " 6.416867\n", + " 0.00010\n", + " 0.402693\n", + " 0.324105\n", + " 1694728867\n", + " 9.887647\n", " True\n", " False\n", " 3\n", - " f80f6_00000\n", - " 2023-07-24_18-10-38\n", - " 32.543403\n", - " 281250\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 32.543403\n", + " 2044b_00000\n", + " 2023-09-14_15-01-13\n", + " 37.078252\n", + " 49611\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 37.078252\n", " 3\n", " \n", " \n", " 3\n", " 3\n", - " 0.0001\n", - " 0.286960\n", - " 0.234889\n", - " 1690247441\n", - " 6.434473\n", + " 0.00010\n", + " 0.278217\n", + " 0.256301\n", + " 1694728877\n", + " 9.893445\n", " True\n", " False\n", " 4\n", - " f80f6_00000\n", - " 2023-07-24_18-10-44\n", - " 38.977876\n", - " 281250\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 38.977876\n", + " 2044b_00000\n", + " 2023-09-14_15-01-23\n", + " 46.971696\n", + " 49611\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 46.971696\n", " 4\n", " \n", " \n", " 4\n", " 4\n", - " 0.0001\n", - " 0.208955\n", - " 0.199119\n", - " 1690247448\n", - " 6.407677\n", + " 0.00010\n", + " 0.203579\n", + " 0.249053\n", + " 1694728887\n", + " 9.843567\n", " True\n", " False\n", " 5\n", - " f80f6_00000\n", - " 2023-07-24_18-10-51\n", - " 45.385553\n", - " 281250\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 45.385553\n", + " 2044b_00000\n", + " 2023-09-14_15-01-33\n", + " 56.815263\n", + " 49611\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 56.815263\n", " 5\n", " \n", " \n", " 5\n", " 5\n", - " 0.0001\n", - " 0.141784\n", - " 0.161738\n", - " 1690247454\n", - " 6.420556\n", + " 0.00010\n", + " 0.152601\n", + " 0.180287\n", + " 1694728896\n", + " 10.026406\n", " True\n", " False\n", 
" 6\n", - " f80f6_00000\n", - " 2023-07-24_18-10-57\n", - " 51.806109\n", - " 281250\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 51.806109\n", + " 2044b_00000\n", + " 2023-09-14_15-01-43\n", + " 66.841670\n", + " 49611\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 66.841670\n", " 6\n", " \n", " \n", " 6\n", " 6\n", - " 0.0001\n", - " 0.098122\n", - " 0.152620\n", - " 1690247460\n", - " 6.416981\n", + " 0.00010\n", + " 0.105872\n", + " 0.184052\n", + " 1694728906\n", + " 9.971822\n", " True\n", " False\n", " 7\n", - " f80f6_00000\n", - " 2023-07-24_18-11-03\n", - " 58.223091\n", - " 281250\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 58.223091\n", + " 2044b_00000\n", + " 2023-09-14_15-01-53\n", + " 76.813491\n", + " 49611\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 76.813491\n", " 7\n", " \n", " \n", " 7\n", " 7\n", - " 0.0001\n", - " 0.069849\n", - " 0.133828\n", - " 1690247467\n", - " 6.472243\n", + " 0.00010\n", + " 0.074756\n", + " 0.184786\n", + " 1694728916\n", + " 9.988855\n", " True\n", " False\n", " 8\n", - " f80f6_00000\n", - " 2023-07-24_18-11-10\n", - " 64.695333\n", - " 281250\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 64.695333\n", + " 2044b_00000\n", + " 2023-09-14_15-02-03\n", + " 86.802346\n", + " 49611\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 86.802346\n", " 8\n", " \n", " \n", " 8\n", " 8\n", - " 0.0001\n", - " 0.046368\n", - " 0.135197\n", - " 1690247473\n", - " 6.461530\n", + " 0.00010\n", + " 0.055098\n", + " 0.201344\n", + " 1694728926\n", + " 9.961005\n", " True\n", " False\n", " 9\n", - " f80f6_00000\n", - " 2023-07-24_18-11-16\n", - " 71.156864\n", - " 281250\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 71.156864\n", + " 2044b_00000\n", + " 2023-09-14_15-02-13\n", + " 96.763352\n", + " 49611\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 96.763352\n", " 9\n", " \n", " \n", " 9\n", " 9\n", - " 0.0001\n", - " 0.036476\n", - " 0.123047\n", - " 1690247480\n", - " 6.489086\n", + " 0.00008\n", 
+ " 0.040087\n", + " 0.210514\n", + " 1694728936\n", + " 10.036337\n", " True\n", " False\n", " 10\n", - " f80f6_00000\n", - " 2023-07-24_18-11-23\n", - " 77.645949\n", - " 281250\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 77.645949\n", + " 2044b_00000\n", + " 2023-09-14_15-02-23\n", + " 106.799689\n", + " 49611\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 106.799689\n", " 10\n", " \n", " \n", @@ -4154,56 +4132,56 @@ "" ], "text/plain": [ - " epoch lr train_loss val_loss timestamp time_this_iter_s \n", - "0 0 0.0001 0.578165 0.492538 1690247421 19.374968 \\\n", - "1 1 0.0001 0.486276 0.419530 1690247428 6.751568 \n", - "2 2 0.0001 0.398447 0.317161 1690247435 6.416867 \n", - "3 3 0.0001 0.286960 0.234889 1690247441 6.434473 \n", - "4 4 0.0001 0.208955 0.199119 1690247448 6.407677 \n", - "5 5 0.0001 0.141784 0.161738 1690247454 6.420556 \n", - "6 6 0.0001 0.098122 0.152620 1690247460 6.416981 \n", - "7 7 0.0001 0.069849 0.133828 1690247467 6.472243 \n", - "8 8 0.0001 0.046368 0.135197 1690247473 6.461530 \n", - "9 9 0.0001 0.036476 0.123047 1690247480 6.489086 \n", + " epoch lr train_loss val_loss timestamp time_this_iter_s \n", + "0 0 0.00010 0.579703 0.498519 1694728846 16.904209 \\\n", + "1 1 0.00010 0.480723 0.407913 1694728856 10.286396 \n", + "2 2 0.00010 0.402693 0.324105 1694728867 9.887647 \n", + "3 3 0.00010 0.278217 0.256301 1694728877 9.893445 \n", + "4 4 0.00010 0.203579 0.249053 1694728887 9.843567 \n", + "5 5 0.00010 0.152601 0.180287 1694728896 10.026406 \n", + "6 6 0.00010 0.105872 0.184052 1694728906 9.971822 \n", + "7 7 0.00010 0.074756 0.184786 1694728916 9.988855 \n", + "8 8 0.00010 0.055098 0.201344 1694728926 9.961005 \n", + "9 9 0.00008 0.040087 0.210514 1694728936 10.036337 \n", "\n", " should_checkpoint done training_iteration trial_id \n", - "0 True False 1 f80f6_00000 \\\n", - "1 True False 2 f80f6_00000 \n", - "2 True False 3 f80f6_00000 \n", - "3 True False 4 f80f6_00000 \n", - "4 True False 5 f80f6_00000 \n", - "5 True 
False 6 f80f6_00000 \n", - "6 True False 7 f80f6_00000 \n", - "7 True False 8 f80f6_00000 \n", - "8 True False 9 f80f6_00000 \n", - "9 True False 10 f80f6_00000 \n", - "\n", - " date time_total_s pid hostname node_ip \n", - "0 2023-07-24_18-10-25 19.374968 281250 ip-10-0-56-150 10.0.56.150 \\\n", - "1 2023-07-24_18-10-31 26.126536 281250 ip-10-0-56-150 10.0.56.150 \n", - "2 2023-07-24_18-10-38 32.543403 281250 ip-10-0-56-150 10.0.56.150 \n", - "3 2023-07-24_18-10-44 38.977876 281250 ip-10-0-56-150 10.0.56.150 \n", - "4 2023-07-24_18-10-51 45.385553 281250 ip-10-0-56-150 10.0.56.150 \n", - "5 2023-07-24_18-10-57 51.806109 281250 ip-10-0-56-150 10.0.56.150 \n", - "6 2023-07-24_18-11-03 58.223091 281250 ip-10-0-56-150 10.0.56.150 \n", - "7 2023-07-24_18-11-10 64.695333 281250 ip-10-0-56-150 10.0.56.150 \n", - "8 2023-07-24_18-11-16 71.156864 281250 ip-10-0-56-150 10.0.56.150 \n", - "9 2023-07-24_18-11-23 77.645949 281250 ip-10-0-56-150 10.0.56.150 \n", + "0 True False 1 2044b_00000 \\\n", + "1 True False 2 2044b_00000 \n", + "2 True False 3 2044b_00000 \n", + "3 True False 4 2044b_00000 \n", + "4 True False 5 2044b_00000 \n", + "5 True False 6 2044b_00000 \n", + "6 True False 7 2044b_00000 \n", + "7 True False 8 2044b_00000 \n", + "8 True False 9 2044b_00000 \n", + "9 True False 10 2044b_00000 \n", + "\n", + " date time_total_s pid hostname node_ip \n", + "0 2023-09-14_15-00-53 16.904209 49611 ip-10-0-50-151 10.0.50.151 \\\n", + "1 2023-09-14_15-01-03 27.190605 49611 ip-10-0-50-151 10.0.50.151 \n", + "2 2023-09-14_15-01-13 37.078252 49611 ip-10-0-50-151 10.0.50.151 \n", + "3 2023-09-14_15-01-23 46.971696 49611 ip-10-0-50-151 10.0.50.151 \n", + "4 2023-09-14_15-01-33 56.815263 49611 ip-10-0-50-151 10.0.50.151 \n", + "5 2023-09-14_15-01-43 66.841670 49611 ip-10-0-50-151 10.0.50.151 \n", + "6 2023-09-14_15-01-53 76.813491 49611 ip-10-0-50-151 10.0.50.151 \n", + "7 2023-09-14_15-02-03 86.802346 49611 ip-10-0-50-151 10.0.50.151 \n", + "8 2023-09-14_15-02-13 96.763352 49611 
ip-10-0-50-151 10.0.50.151 \n", + "9 2023-09-14_15-02-23 106.799689 49611 ip-10-0-50-151 10.0.50.151 \n", "\n", " time_since_restore iterations_since_restore \n", - "0 19.374968 1 \n", - "1 26.126536 2 \n", - "2 32.543403 3 \n", - "3 38.977876 4 \n", - "4 45.385553 5 \n", - "5 51.806109 6 \n", - "6 58.223091 7 \n", - "7 64.695333 8 \n", - "8 71.156864 9 \n", - "9 77.645949 10 " + "0 16.904209 1 \n", + "1 27.190605 2 \n", + "2 37.078252 3 \n", + "3 46.971696 4 \n", + "4 56.815263 5 \n", + "5 66.841670 6 \n", + "6 76.813491 7 \n", + "7 86.802346 8 \n", + "8 96.763352 9 \n", + "9 106.799689 10 " ] }, - "execution_count": 62, + "execution_count": 63, "metadata": {}, "output_type": "execute_result" } @@ -4215,7 +4193,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 64, "metadata": { "tags": [] }, @@ -4223,22 +4201,22 @@ { "data": { "text/plain": [ - "[(TorchCheckpoint(local_path=/home/ray/ray_results/llm/TorchTrainer_f80f6_00000_0_2023-07-24_18-09-56/checkpoint_000009),\n", - " {'epoch': 9,\n", + "[(TorchCheckpoint(local_path=/efs/shared_storage/madewithml/GokuMohandas/llm/TorchTrainer_2044b_00000_0_2023-09-14_15-00-33/checkpoint_000005),\n", + " {'epoch': 5,\n", " 'lr': 0.0001,\n", - " 'train_loss': 0.03647640720009804,\n", - " 'val_loss': 0.12304694950580597,\n", - " 'timestamp': 1690247480,\n", - " 'time_this_iter_s': 6.489085674285889,\n", + " 'train_loss': 0.152601088086764,\n", + " 'val_loss': 0.1802874505519867,\n", + " 'timestamp': 1694728896,\n", + " 'time_this_iter_s': 10.026406288146973,\n", " 'should_checkpoint': True,\n", - " 'done': True,\n", - " 'training_iteration': 10,\n", - " 'trial_id': 'f80f6_00000',\n", - " 'date': '2023-07-24_18-11-23',\n", - " 'time_total_s': 77.6459493637085,\n", - " 'pid': 281250,\n", - " 'hostname': 'ip-10-0-56-150',\n", - " 'node_ip': '10.0.56.150',\n", + " 'done': False,\n", + " 'training_iteration': 6,\n", + " 'trial_id': '2044b_00000',\n", + " 'date': '2023-09-14_15-01-43',\n", + " 
'time_total_s': 66.84166955947876,\n", + " 'pid': 49611,\n", + " 'hostname': 'ip-10-0-50-151',\n", + " 'node_ip': '10.0.50.151',\n", " 'config': {'train_loop_config': {'dropout_p': 0.5,\n", " 'lr': 0.0001,\n", " 'lr_factor': 0.8,\n", @@ -4246,12 +4224,12 @@ " 'num_epochs': 10,\n", " 'batch_size': 256,\n", " 'num_classes': 4}},\n", - " 'time_since_restore': 77.6459493637085,\n", - " 'iterations_since_restore': 10,\n", + " 'time_since_restore': 66.84166955947876,\n", + " 'iterations_since_restore': 6,\n", " 'experiment_tag': '0'})]" ] }, - "execution_count": 63, + "execution_count": 64, "metadata": {}, "output_type": "execute_result" } @@ -4262,7 +4240,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -4271,7 +4248,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 65, "metadata": { "tags": [] }, @@ -4283,7 +4260,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 66, "metadata": { "tags": [] }, @@ -4297,7 +4274,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 67, "metadata": { "tags": [] }, @@ -4306,10 +4283,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-07-24 18:11:26,923\tINFO read_api.py:374 -- To satisfy the requested parallelism of 24, each read task output will be split into 24 smaller blocks.\n", - "2023-07-24 18:11:26,927\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(24)] -> TaskPoolMapOperator[MapBatches(CustomPreprocessor._transform_pandas)]\n", - "2023-07-24 18:11:26,927\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-07-24 18:11:26,928\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run 
`ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2023-09-14 15:02:31,960\tINFO read_api.py:374 -- To satisfy the requested parallelism of 48, each read task output will be split into 48 smaller blocks.\n", + "2023-09-14 15:02:31,963\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(48)] -> TaskPoolMapOperator[MapBatches(CustomPreprocessor._transform_pandas)]\n", + "2023-09-14 15:02:31,964\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-09-14 15:02:31,965\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, { @@ -4320,7 +4297,7 @@ "version_minor": 0 }, "text/plain": [ - "Running 0: 0%| | 0/24 [00:00 TaskPoolMapOperator[ReadCSV->SplitBlocks(24)] -> TaskPoolMapOperator[MapBatches(CustomPreprocessor._transform_pandas)->MapBatches()]\n", - "2023-07-24 18:11:27,397\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-07-24 18:11:27,397\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2023-09-14 15:02:32,386\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(48)] -> TaskPoolMapOperator[MapBatches(CustomPreprocessor._transform_pandas)->MapBatches()]\n", + "2023-09-14 15:02:32,387\tINFO streaming_executor.py:93 -- Execution config: 
ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-09-14 15:02:32,388\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, { @@ -4373,19 +4349,12 @@ "version_minor": 0 }, "text/plain": [ - "Running 0: 0%| | 0/24 [00:00 TaskPoolMapOperator[ReadCSV->SplitBlocks(24)] -> TaskPoolMapOperator[MapBatches(CustomPreprocessor._transform_pandas)->MapBatches()]\n", - "2023-07-24 18:11:36,377\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-07-24 18:11:36,379\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2023-09-14 15:02:36,046\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(48)] -> TaskPoolMapOperator[MapBatches(CustomPreprocessor._transform_pandas)->MapBatches()]\n", + "2023-09-14 15:02:36,047\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-09-14 15:02:36,047\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, { @@ -4536,7 +4505,7 @@ "version_minor": 0 }, "text/plain": [ - "Running 0: 0%| | 0/24 [00:00 TaskPoolMapOperator[ReadCSV->SplitBlocks(24)] -> AllToAllOperator[RandomShuffle] -> 
LimitOperator[limit=1]\n", - "2023-07-24 18:11:40,817\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-07-24 18:11:40,818\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2023-09-14 15:02:39,948\tINFO read_api.py:374 -- To satisfy the requested parallelism of 48, each read task output will be split into 48 smaller blocks.\n", + "2023-09-14 15:02:39,951\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(48)] -> AllToAllOperator[RandomShuffle] -> LimitOperator[limit=1]\n", + "2023-09-14 15:02:39,952\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-09-14 15:02:39,952\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, { @@ -4773,7 +4740,7 @@ "version_minor": 0 }, "text/plain": [ - "- RandomShuffle 1: 0%| | 0/24 [00:00 TaskPoolMapOperator[ReadCSV->SplitBlocks(24)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> LimitOperator[limit=1]\n", - "2023-07-24 18:11:41,067\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-07-24 18:11:41,068\tINFO 
streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2023-09-14 15:02:40,402\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(48)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> LimitOperator[limit=1]\n", + "2023-09-14 15:02:40,403\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-09-14 15:02:40,405\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, { @@ -4852,7 +4819,7 @@ "version_minor": 0 }, "text/plain": [ - "- RandomShuffle 1: 0%| | 0/24 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/24 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/48 [00:00 TaskPoolMapOperator[ReadCSV->SplitBlocks(24)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> AllToAllOperator[Aggregate] -> TaskPoolMapOperator[MapBatches()]\n", - "2023-07-24 18:11:41,891\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-07-24 18:11:41,892\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2023-09-14 15:02:41,749\tINFO streaming_executor.py:92 -- Executing DAG 
InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(48)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> AllToAllOperator[Aggregate] -> TaskPoolMapOperator[MapBatches()]\n", + "2023-09-14 15:02:41,750\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-09-14 15:02:41,751\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, { @@ -5029,7 +4996,7 @@ "version_minor": 0 }, "text/plain": [ - "- RandomShuffle 1: 0%| | 0/24 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/24 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/48 [00:00 TaskPoolMapOperator[ReadCSV->SplitBlocks(24)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> TaskPoolMapOperator[MapBatches(CustomPreprocessor._transform_pandas)]\n", - "2023-07-24 18:11:42,979\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-07-24 18:11:42,982\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2023-09-14 15:02:43,296\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(48)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> 
AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> TaskPoolMapOperator[MapBatches(CustomPreprocessor._transform_pandas)]\n", + "2023-09-14 15:02:43,297\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-09-14 15:02:43,297\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, { @@ -5262,7 +5229,7 @@ "version_minor": 0 }, "text/plain": [ - "- RandomShuffle 1: 0%| | 0/24 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/24 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/48 [00:00 TaskPoolMapOperator[ReadCSV->SplitBlocks(24)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> TaskPoolMapOperator[MapBatches(CustomPreprocessor._transform_pandas)]\n", - "2023-07-24 18:11:44,083\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-07-24 18:11:44,083\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2023-09-14 15:02:45,162\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(48)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> TaskPoolMapOperator[MapBatches(CustomPreprocessor._transform_pandas)]\n", + "2023-09-14 15:02:45,163\tINFO 
streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-09-14 15:02:45,164\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, { @@ -5439,7 +5406,7 @@ "version_minor": 0 }, "text/plain": [ - "- RandomShuffle 1: 0%| | 0/24 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/24 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/48 [00:00Tune Status\n", " \n", "\n", - "\n", - "\n", - "\n", + "\n", + "\n", + "\n", "\n", "
Current time:2023-07-24 18:13:07
Running for: 00:01:22.27
Memory: 8.3/30.9 GiB
Current time:2023-09-14 15:04:48
Running for: 00:02:01.46
Memory: 7.4/30.9 GiB
\n", " \n", "
\n", "
\n", "

System Info

\n", - " Using FIFO scheduling algorithm.
Logical resource usage: 4.0/12 CPUs, 1.0/1 GPUs\n", + " Using FIFO scheduling algorithm.
Logical resource usage: 4.0/24 CPUs, 1.0/1 GPUs\n", "
\n", " \n", " \n", @@ -5672,10 +5639,10 @@ "

Trial Status

\n", " \n", "\n", - "\n", + "\n", "\n", "\n", - "\n", + "\n", "\n", "
Trial name status loc iter total time (s) epoch lr train_loss
Trial name status loc iter total time (s) epoch lr train_loss
TorchTrainer_38d2b_00000TERMINATED10.0.56.150:282335 10 75.3763 90.0001 0.0344348
TorchTrainer_701e4_00000TERMINATED10.0.50.151:50696 10 106.195 90.0001 0.0361183
\n", " \n", @@ -5722,51 +5689,47 @@ "name": "stderr", "output_type": "stream", "text": [ - "(TorchTrainer pid=282335) The dict form of `dataset_config` is deprecated. Use the DataConfig class instead. Support for this will be dropped in a future release.\n", - "(TorchTrainer pid=282335) The `preprocessor` arg to Trainer is deprecated. Apply preprocessor transformations ahead of time by calling `preprocessor.transform(ds)`. Support for the preprocessor arg will be dropped in a future release.\n", - "(TorchTrainer pid=282335) Starting distributed worker processes: ['2994 (10.0.6.57)']\n", - "(RayTrainWorker pid=2994, ip=10.0.6.57) Setting up process group for: env:// [rank=0, world_size=1]\n", - "(RayTrainWorker pid=2994, ip=10.0.6.57) Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']\n", - "(RayTrainWorker pid=2994, ip=10.0.6.57) - This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", - "(RayTrainWorker pid=2994, ip=10.0.6.57) - This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", - "(RayTrainWorker pid=2994, ip=10.0.6.57) Moving model to device: cuda:0\n", - "(RayTrainWorker pid=2994, ip=10.0.6.57) /tmp/ipykernel_280376/1209796013.py:7: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. 
This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:206.)\n" + "(TorchTrainer pid=50696, ip=10.0.50.151) The dict form of `dataset_config` is deprecated. Use the DataConfig class instead. Support for this will be dropped in a future release.\n", + "(TorchTrainer pid=50696, ip=10.0.50.151) The `preprocessor` arg to Trainer is deprecated. Apply preprocessor transformations ahead of time by calling `preprocessor.transform(ds)`. Support for the preprocessor arg will be dropped in a future release.\n", + "(TorchTrainer pid=50696, ip=10.0.50.151) Starting distributed worker processes: ['50741 (10.0.50.151)']\n", + "(RayTrainWorker pid=50741, ip=10.0.50.151) Setting up process group for: env:// [rank=0, world_size=1]\n", + "(RayTrainWorker pid=50741, ip=10.0.50.151) Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']\n", + "(RayTrainWorker pid=50741, ip=10.0.50.151) - This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "(RayTrainWorker pid=50741, ip=10.0.50.151) - This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "(RayTrainWorker pid=50741, ip=10.0.50.151) Moving model to device: cuda:0\n", + "(RayTrainWorker pid=50741, ip=10.0.50.151) /tmp/ipykernel_117917/1209796013.py:7: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:206.)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "copying /home/ray/ray_results/TorchTrainer_2023-07-24_18-11-45/TorchTrainer_38d2b_00000_0_2023-07-24_18-11-45/progress.csv -> /tmp/mlflow/598544898920467811/8991516da6674b5395affb9fb6217964/artifacts\n", - "copying /home/ray/ray_results/TorchTrainer_2023-07-24_18-11-45/TorchTrainer_38d2b_00000_0_2023-07-24_18-11-45/events.out.tfevents.1690247508.ip-10-0-56-150 -> /tmp/mlflow/598544898920467811/8991516da6674b5395affb9fb6217964/artifacts\n", - "copying /home/ray/ray_results/TorchTrainer_2023-07-24_18-11-45/TorchTrainer_38d2b_00000_0_2023-07-24_18-11-45/result.json -> /tmp/mlflow/598544898920467811/8991516da6674b5395affb9fb6217964/artifacts\n", - "creating /tmp/mlflow/598544898920467811/8991516da6674b5395affb9fb6217964/artifacts/checkpoint_000008\n", - "copying /home/ray/ray_results/TorchTrainer_2023-07-24_18-11-45/TorchTrainer_38d2b_00000_0_2023-07-24_18-11-45/checkpoint_000008/.is_checkpoint -> 
/tmp/mlflow/598544898920467811/8991516da6674b5395affb9fb6217964/artifacts/checkpoint_000008\n", - "copying /home/ray/ray_results/TorchTrainer_2023-07-24_18-11-45/TorchTrainer_38d2b_00000_0_2023-07-24_18-11-45/checkpoint_000008/.tune_metadata -> /tmp/mlflow/598544898920467811/8991516da6674b5395affb9fb6217964/artifacts/checkpoint_000008\n", - "copying /home/ray/ray_results/TorchTrainer_2023-07-24_18-11-45/TorchTrainer_38d2b_00000_0_2023-07-24_18-11-45/checkpoint_000008/dict_checkpoint.pkl -> /tmp/mlflow/598544898920467811/8991516da6674b5395affb9fb6217964/artifacts/checkpoint_000008\n", - "copying /home/ray/ray_results/TorchTrainer_2023-07-24_18-11-45/TorchTrainer_38d2b_00000_0_2023-07-24_18-11-45/checkpoint_000008/.metadata.pkl -> /tmp/mlflow/598544898920467811/8991516da6674b5395affb9fb6217964/artifacts/checkpoint_000008\n", - "copying /home/ray/ray_results/TorchTrainer_2023-07-24_18-11-45/TorchTrainer_38d2b_00000_0_2023-07-24_18-11-45/params.json -> /tmp/mlflow/598544898920467811/8991516da6674b5395affb9fb6217964/artifacts\n", - "copying /home/ray/ray_results/TorchTrainer_2023-07-24_18-11-45/TorchTrainer_38d2b_00000_0_2023-07-24_18-11-45/params.pkl -> /tmp/mlflow/598544898920467811/8991516da6674b5395affb9fb6217964/artifacts\n", - "creating /tmp/mlflow/598544898920467811/8991516da6674b5395affb9fb6217964/artifacts/checkpoint_000009\n", - "copying /home/ray/ray_results/TorchTrainer_2023-07-24_18-11-45/TorchTrainer_38d2b_00000_0_2023-07-24_18-11-45/checkpoint_000009/.is_checkpoint -> /tmp/mlflow/598544898920467811/8991516da6674b5395affb9fb6217964/artifacts/checkpoint_000009\n", - "copying /home/ray/ray_results/TorchTrainer_2023-07-24_18-11-45/TorchTrainer_38d2b_00000_0_2023-07-24_18-11-45/checkpoint_000009/.tune_metadata -> /tmp/mlflow/598544898920467811/8991516da6674b5395affb9fb6217964/artifacts/checkpoint_000009\n", - "copying /home/ray/ray_results/TorchTrainer_2023-07-24_18-11-45/TorchTrainer_38d2b_00000_0_2023-07-24_18-11-45/checkpoint_000009/dict_checkpoint.pkl -> 
/tmp/mlflow/598544898920467811/8991516da6674b5395affb9fb6217964/artifacts/checkpoint_000009\n", - "copying /home/ray/ray_results/TorchTrainer_2023-07-24_18-11-45/TorchTrainer_38d2b_00000_0_2023-07-24_18-11-45/checkpoint_000009/.metadata.pkl -> /tmp/mlflow/598544898920467811/8991516da6674b5395affb9fb6217964/artifacts/checkpoint_000009\n" + "creating /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/cb6a74df7c4e43988598be5aee45e660/artifacts/checkpoint_000009\n", + "copying /efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-02-46/TorchTrainer_701e4_00000_0_2023-09-14_15-02-46/checkpoint_000009/dict_checkpoint.pkl -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/cb6a74df7c4e43988598be5aee45e660/artifacts/checkpoint_000009\n", + "copying /efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-02-46/TorchTrainer_701e4_00000_0_2023-09-14_15-02-46/checkpoint_000009/.metadata.pkl -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/cb6a74df7c4e43988598be5aee45e660/artifacts/checkpoint_000009\n", + "copying /efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-02-46/TorchTrainer_701e4_00000_0_2023-09-14_15-02-46/checkpoint_000009/.tune_metadata -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/cb6a74df7c4e43988598be5aee45e660/artifacts/checkpoint_000009\n", + "copying /efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-02-46/TorchTrainer_701e4_00000_0_2023-09-14_15-02-46/checkpoint_000009/.is_checkpoint -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/cb6a74df7c4e43988598be5aee45e660/artifacts/checkpoint_000009\n", + "copying /efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-02-46/TorchTrainer_701e4_00000_0_2023-09-14_15-02-46/params.json -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/cb6a74df7c4e43988598be5aee45e660/artifacts\n", + "copying 
/efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-02-46/TorchTrainer_701e4_00000_0_2023-09-14_15-02-46/events.out.tfevents.1694728971.ip-10-0-41-50 -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/cb6a74df7c4e43988598be5aee45e660/artifacts\n", + "copying /efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-02-46/TorchTrainer_701e4_00000_0_2023-09-14_15-02-46/params.pkl -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/cb6a74df7c4e43988598be5aee45e660/artifacts\n", + "creating /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/cb6a74df7c4e43988598be5aee45e660/artifacts/rank_0\n", + "copying /efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-02-46/TorchTrainer_701e4_00000_0_2023-09-14_15-02-46/result.json -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/cb6a74df7c4e43988598be5aee45e660/artifacts\n", + "copying /efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-02-46/TorchTrainer_701e4_00000_0_2023-09-14_15-02-46/progress.csv -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/cb6a74df7c4e43988598be5aee45e660/artifacts\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2023-07-24 18:13:07,524\tINFO tune.py:1148 -- Total run time: 82.33 seconds (82.27 seconds for the tuning loop).\n" + "2023-09-14 15:04:48,385\tINFO tune.py:1148 -- Total run time: 121.54 seconds (121.43 seconds for the tuning loop).\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 1.82 s, sys: 981 ms, total: 2.8 s\n", - "Wall time: 1min 22s\n" + "CPU times: user 1.72 s, sys: 758 ms, total: 2.47 s\n", + "Wall time: 2min 1s\n" ] } ], @@ -5778,7 +5741,7 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 85, "metadata": { "tags": [] }, @@ -5828,200 +5791,200 @@ " 0\n", " 0\n", " 0.0001\n", - " 0.581092\n", - " 0.493169\n", - " 1690247520\n", - " 
14.871574\n", + " 0.575292\n", + " 0.488448\n", + " 1694728980\n", + " 16.911473\n", " True\n", " False\n", " 1\n", - " 38d2b_00000\n", - " 2023-07-24_18-12-03\n", - " 14.871574\n", - " 282335\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 14.871574\n", + " 701e4_00000\n", + " 2023-09-14_15-03-07\n", + " 16.911473\n", + " 50696\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 16.911473\n", " 1\n", " \n", " \n", " 1\n", " 1\n", " 0.0001\n", - " 0.478663\n", - " 0.423611\n", - " 1690247527\n", - " 6.952936\n", + " 0.477536\n", + " 0.394798\n", + " 1694728991\n", + " 10.108304\n", " True\n", " False\n", " 2\n", - " 38d2b_00000\n", - " 2023-07-24_18-12-10\n", - " 21.824510\n", - " 282335\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 21.824510\n", + " 701e4_00000\n", + " 2023-09-14_15-03-18\n", + " 27.019777\n", + " 50696\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 27.019777\n", " 2\n", " \n", " \n", " 2\n", " 2\n", " 0.0001\n", - " 0.386111\n", - " 0.367975\n", - " 1690247534\n", - " 6.707607\n", + " 0.352722\n", + " 0.354552\n", + " 1694729001\n", + " 9.832275\n", " True\n", " False\n", " 3\n", - " 38d2b_00000\n", - " 2023-07-24_18-12-17\n", - " 28.532117\n", - " 282335\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 28.532117\n", + " 701e4_00000\n", + " 2023-09-14_15-03-27\n", + " 36.852052\n", + " 50696\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 36.852052\n", " 3\n", " \n", " \n", " 3\n", " 3\n", " 0.0001\n", - " 0.287210\n", - " 0.304643\n", - " 1690247541\n", - " 6.656057\n", + " 0.270199\n", + " 0.237122\n", + " 1694729011\n", + " 9.836403\n", " True\n", " False\n", " 4\n", - " 38d2b_00000\n", - " 2023-07-24_18-12-24\n", - " 35.188173\n", - " 282335\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 35.188173\n", + " 701e4_00000\n", + " 2023-09-14_15-03-37\n", + " 46.688455\n", + " 50696\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 46.688455\n", " 4\n", " \n", " \n", " 4\n", " 4\n", " 0.0001\n", - " 0.209951\n", - " 0.278133\n", 
- " 1690247547\n", - " 6.671985\n", + " 0.192776\n", + " 0.225016\n", + " 1694729021\n", + " 9.903057\n", " True\n", " False\n", " 5\n", - " 38d2b_00000\n", - " 2023-07-24_18-12-30\n", - " 41.860158\n", - " 282335\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 41.860158\n", + " 701e4_00000\n", + " 2023-09-14_15-03-47\n", + " 56.591512\n", + " 50696\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 56.591512\n", " 5\n", " \n", " \n", " 5\n", " 5\n", " 0.0001\n", - " 0.152847\n", - " 0.258787\n", - " 1690247554\n", - " 6.679954\n", + " 0.141014\n", + " 0.196045\n", + " 1694729031\n", + " 9.963775\n", " True\n", " False\n", " 6\n", - " 38d2b_00000\n", - " 2023-07-24_18-12-37\n", - " 48.540112\n", - " 282335\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 48.540112\n", + " 701e4_00000\n", + " 2023-09-14_15-03-57\n", + " 66.555288\n", + " 50696\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 66.555288\n", " 6\n", " \n", " \n", " 6\n", " 6\n", " 0.0001\n", - " 0.102787\n", - " 0.248396\n", - " 1690247561\n", - " 6.690619\n", + " 0.093777\n", + " 0.201736\n", + " 1694729041\n", + " 9.954871\n", " True\n", " False\n", " 7\n", - " 38d2b_00000\n", - " 2023-07-24_18-12-44\n", - " 55.230731\n", - " 282335\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 55.230731\n", + " 701e4_00000\n", + " 2023-09-14_15-04-07\n", + " 76.510159\n", + " 50696\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 76.510159\n", " 7\n", " \n", " \n", " 7\n", " 7\n", " 0.0001\n", - " 0.066364\n", - " 0.232449\n", - " 1690247567\n", - " 6.750745\n", + " 0.070405\n", + " 0.174259\n", + " 1694729051\n", + " 9.840459\n", " True\n", " False\n", " 8\n", - " 38d2b_00000\n", - " 2023-07-24_18-12-50\n", - " 61.981476\n", - " 282335\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 61.981476\n", + " 701e4_00000\n", + " 2023-09-14_15-04-17\n", + " 86.350618\n", + " 50696\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 86.350618\n", " 8\n", " \n", " \n", " 8\n", " 8\n", " 0.0001\n", - " 
0.049473\n", - " 0.219698\n", - " 1690247574\n", - " 6.681975\n", + " 0.048866\n", + " 0.159393\n", + " 1694729060\n", + " 9.945646\n", " True\n", " False\n", " 9\n", - " 38d2b_00000\n", - " 2023-07-24_18-12-57\n", - " 68.663451\n", - " 282335\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 68.663451\n", + " 701e4_00000\n", + " 2023-09-14_15-04-27\n", + " 96.296264\n", + " 50696\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 96.296264\n", " 9\n", " \n", " \n", " 9\n", " 9\n", " 0.0001\n", - " 0.034435\n", - " 0.221598\n", - " 1690247581\n", - " 6.712862\n", + " 0.036118\n", + " 0.137778\n", + " 1694729070\n", + " 9.898459\n", " True\n", " False\n", " 10\n", - " 38d2b_00000\n", - " 2023-07-24_18-13-04\n", - " 75.376313\n", - " 282335\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 75.376313\n", + " 701e4_00000\n", + " 2023-09-14_15-04-37\n", + " 106.194723\n", + " 50696\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 106.194723\n", " 10\n", " \n", " \n", @@ -6030,55 +5993,55 @@ ], "text/plain": [ " epoch lr train_loss val_loss timestamp time_this_iter_s \n", - "0 0 0.0001 0.581092 0.493169 1690247520 14.871574 \\\n", - "1 1 0.0001 0.478663 0.423611 1690247527 6.952936 \n", - "2 2 0.0001 0.386111 0.367975 1690247534 6.707607 \n", - "3 3 0.0001 0.287210 0.304643 1690247541 6.656057 \n", - "4 4 0.0001 0.209951 0.278133 1690247547 6.671985 \n", - "5 5 0.0001 0.152847 0.258787 1690247554 6.679954 \n", - "6 6 0.0001 0.102787 0.248396 1690247561 6.690619 \n", - "7 7 0.0001 0.066364 0.232449 1690247567 6.750745 \n", - "8 8 0.0001 0.049473 0.219698 1690247574 6.681975 \n", - "9 9 0.0001 0.034435 0.221598 1690247581 6.712862 \n", + "0 0 0.0001 0.575292 0.488448 1694728980 16.911473 \\\n", + "1 1 0.0001 0.477536 0.394798 1694728991 10.108304 \n", + "2 2 0.0001 0.352722 0.354552 1694729001 9.832275 \n", + "3 3 0.0001 0.270199 0.237122 1694729011 9.836403 \n", + "4 4 0.0001 0.192776 0.225016 1694729021 9.903057 \n", + "5 5 0.0001 0.141014 0.196045 1694729031 
9.963775 \n", + "6 6 0.0001 0.093777 0.201736 1694729041 9.954871 \n", + "7 7 0.0001 0.070405 0.174259 1694729051 9.840459 \n", + "8 8 0.0001 0.048866 0.159393 1694729060 9.945646 \n", + "9 9 0.0001 0.036118 0.137778 1694729070 9.898459 \n", "\n", " should_checkpoint done training_iteration trial_id \n", - "0 True False 1 38d2b_00000 \\\n", - "1 True False 2 38d2b_00000 \n", - "2 True False 3 38d2b_00000 \n", - "3 True False 4 38d2b_00000 \n", - "4 True False 5 38d2b_00000 \n", - "5 True False 6 38d2b_00000 \n", - "6 True False 7 38d2b_00000 \n", - "7 True False 8 38d2b_00000 \n", - "8 True False 9 38d2b_00000 \n", - "9 True False 10 38d2b_00000 \n", - "\n", - " date time_total_s pid hostname node_ip \n", - "0 2023-07-24_18-12-03 14.871574 282335 ip-10-0-56-150 10.0.56.150 \\\n", - "1 2023-07-24_18-12-10 21.824510 282335 ip-10-0-56-150 10.0.56.150 \n", - "2 2023-07-24_18-12-17 28.532117 282335 ip-10-0-56-150 10.0.56.150 \n", - "3 2023-07-24_18-12-24 35.188173 282335 ip-10-0-56-150 10.0.56.150 \n", - "4 2023-07-24_18-12-30 41.860158 282335 ip-10-0-56-150 10.0.56.150 \n", - "5 2023-07-24_18-12-37 48.540112 282335 ip-10-0-56-150 10.0.56.150 \n", - "6 2023-07-24_18-12-44 55.230731 282335 ip-10-0-56-150 10.0.56.150 \n", - "7 2023-07-24_18-12-50 61.981476 282335 ip-10-0-56-150 10.0.56.150 \n", - "8 2023-07-24_18-12-57 68.663451 282335 ip-10-0-56-150 10.0.56.150 \n", - "9 2023-07-24_18-13-04 75.376313 282335 ip-10-0-56-150 10.0.56.150 \n", + "0 True False 1 701e4_00000 \\\n", + "1 True False 2 701e4_00000 \n", + "2 True False 3 701e4_00000 \n", + "3 True False 4 701e4_00000 \n", + "4 True False 5 701e4_00000 \n", + "5 True False 6 701e4_00000 \n", + "6 True False 7 701e4_00000 \n", + "7 True False 8 701e4_00000 \n", + "8 True False 9 701e4_00000 \n", + "9 True False 10 701e4_00000 \n", + "\n", + " date time_total_s pid hostname node_ip \n", + "0 2023-09-14_15-03-07 16.911473 50696 ip-10-0-50-151 10.0.50.151 \\\n", + "1 2023-09-14_15-03-18 27.019777 50696 ip-10-0-50-151 
10.0.50.151 \n", + "2 2023-09-14_15-03-27 36.852052 50696 ip-10-0-50-151 10.0.50.151 \n", + "3 2023-09-14_15-03-37 46.688455 50696 ip-10-0-50-151 10.0.50.151 \n", + "4 2023-09-14_15-03-47 56.591512 50696 ip-10-0-50-151 10.0.50.151 \n", + "5 2023-09-14_15-03-57 66.555288 50696 ip-10-0-50-151 10.0.50.151 \n", + "6 2023-09-14_15-04-07 76.510159 50696 ip-10-0-50-151 10.0.50.151 \n", + "7 2023-09-14_15-04-17 86.350618 50696 ip-10-0-50-151 10.0.50.151 \n", + "8 2023-09-14_15-04-27 96.296264 50696 ip-10-0-50-151 10.0.50.151 \n", + "9 2023-09-14_15-04-37 106.194723 50696 ip-10-0-50-151 10.0.50.151 \n", "\n", " time_since_restore iterations_since_restore \n", - "0 14.871574 1 \n", - "1 21.824510 2 \n", - "2 28.532117 3 \n", - "3 35.188173 4 \n", - "4 41.860158 5 \n", - "5 48.540112 6 \n", - "6 55.230731 7 \n", - "7 61.981476 8 \n", - "8 68.663451 9 \n", - "9 75.376313 10 " + "0 16.911473 1 \n", + "1 27.019777 2 \n", + "2 36.852052 3 \n", + "3 46.688455 4 \n", + "4 56.591512 5 \n", + "5 66.555288 6 \n", + "6 76.510159 7 \n", + "7 86.350618 8 \n", + "8 96.296264 9 \n", + "9 106.194723 10 " ] }, - "execution_count": 84, + "execution_count": 85, "metadata": {}, "output_type": "execute_result" } @@ -6089,7 +6052,7 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 86, "metadata": { "tags": [] }, @@ -6121,19 +6084,19 @@ " artifact_uri\n", " start_time\n", " end_time\n", - " metrics.time_since_restore\n", - " metrics.time_this_iter_s\n", - " metrics.config/train_loop_config/lr\n", - " metrics.done\n", + " metrics.pid\n", + " metrics.epoch\n", + " metrics.config/train_loop_config/batch_size\n", + " metrics.iterations_since_restore\n", " ...\n", - " metrics.val_loss\n", - " params.train_loop_config/num_classes\n", + " metrics.lr\n", " params.train_loop_config/dropout_p\n", - " params.train_loop_config/num_epochs\n", + " params.train_loop_config/batch_size\n", + " params.train_loop_config/lr_factor\n", + " params.train_loop_config/num_classes\n", " 
params.train_loop_config/lr_patience\n", + " params.train_loop_config/num_epochs\n", " params.train_loop_config/lr\n", - " params.train_loop_config/lr_factor\n", - " params.train_loop_config/batch_size\n", " tags.trial_name\n", " tags.mlflow.runName\n", " \n", @@ -6141,68 +6104,68 @@ " \n", " \n", " 0\n", - " 8991516da6674b5395affb9fb6217964\n", - " 598544898920467811\n", + " cb6a74df7c4e43988598be5aee45e660\n", + " 917462352875586010\n", " FINISHED\n", - " file:///tmp/mlflow/598544898920467811/8991516d...\n", - " 2023-07-25 01:11:48.948000+00:00\n", - " 2023-07-25 01:13:07.513000+00:00\n", - " 75.376313\n", - " 6.712862\n", - " 0.0001\n", - " 0.0\n", + " file:///efs/shared_storage/madewithml/GokuMoha...\n", + " 2023-09-14 22:02:50.670000+00:00\n", + " 2023-09-14 22:04:48.292000+00:00\n", + " 50696.0\n", + " 9.0\n", + " 256.0\n", + " 10.0\n", " ...\n", - " 0.221598\n", - " 4\n", + " 0.0001\n", " 0.5\n", - " 10\n", + " 256\n", + " 0.8\n", + " 4\n", " 3\n", + " 10\n", " 0.0001\n", - " 0.8\n", - " 256\n", - " TorchTrainer_38d2b_00000\n", - " TorchTrainer_38d2b_00000\n", + " TorchTrainer_701e4_00000\n", + " TorchTrainer_701e4_00000\n", " \n", " \n", "\n", - "

1 rows × 35 columns

\n", + "

1 rows × 36 columns

\n", "" ], "text/plain": [ " run_id experiment_id status \n", - "0 8991516da6674b5395affb9fb6217964 598544898920467811 FINISHED \\\n", + "0 cb6a74df7c4e43988598be5aee45e660 917462352875586010 FINISHED \\\n", "\n", " artifact_uri \n", - "0 file:///tmp/mlflow/598544898920467811/8991516d... \\\n", + "0 file:///efs/shared_storage/madewithml/GokuMoha... \\\n", "\n", " start_time end_time \n", - "0 2023-07-25 01:11:48.948000+00:00 2023-07-25 01:13:07.513000+00:00 \\\n", + "0 2023-09-14 22:02:50.670000+00:00 2023-09-14 22:04:48.292000+00:00 \\\n", "\n", - " metrics.time_since_restore metrics.time_this_iter_s \n", - "0 75.376313 6.712862 \\\n", + " metrics.pid metrics.epoch metrics.config/train_loop_config/batch_size \n", + "0 50696.0 9.0 256.0 \\\n", "\n", - " metrics.config/train_loop_config/lr metrics.done ... metrics.val_loss \n", - "0 0.0001 0.0 ... 0.221598 \\\n", + " metrics.iterations_since_restore ... metrics.lr \n", + "0 10.0 ... 0.0001 \\\n", "\n", - " params.train_loop_config/num_classes params.train_loop_config/dropout_p \n", - "0 4 0.5 \\\n", + " params.train_loop_config/dropout_p params.train_loop_config/batch_size \n", + "0 0.5 256 \\\n", "\n", - " params.train_loop_config/num_epochs params.train_loop_config/lr_patience \n", - "0 10 3 \\\n", + " params.train_loop_config/lr_factor params.train_loop_config/num_classes \n", + "0 0.8 4 \\\n", "\n", - " params.train_loop_config/lr params.train_loop_config/lr_factor \n", - "0 0.0001 0.8 \\\n", + " params.train_loop_config/lr_patience params.train_loop_config/num_epochs \n", + "0 3 10 \\\n", "\n", - " params.train_loop_config/batch_size tags.trial_name \n", - "0 256 TorchTrainer_38d2b_00000 \\\n", + " params.train_loop_config/lr tags.trial_name \n", + "0 0.0001 TorchTrainer_701e4_00000 \\\n", "\n", " tags.mlflow.runName \n", - "0 TorchTrainer_38d2b_00000 \n", + "0 TorchTrainer_701e4_00000 \n", "\n", - "[1 rows x 35 columns]" + "[1 rows x 36 columns]" ] }, - "execution_count": 85, + "execution_count": 86, 
"metadata": {}, "output_type": "execute_result" } @@ -6215,7 +6178,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 87, "metadata": { "tags": [] }, @@ -6223,45 +6186,46 @@ { "data": { "text/plain": [ - "run_id 8991516da6674b5395affb9fb6217964\n", - "experiment_id 598544898920467811\n", + "run_id cb6a74df7c4e43988598be5aee45e660\n", + "experiment_id 917462352875586010\n", "status FINISHED\n", - "artifact_uri file:///tmp/mlflow/598544898920467811/8991516d...\n", - "start_time 2023-07-25 01:11:48.948000+00:00\n", - "end_time 2023-07-25 01:13:07.513000+00:00\n", - "metrics.time_since_restore 75.376313\n", - "metrics.time_this_iter_s 6.712862\n", - "metrics.config/train_loop_config/lr 0.0001\n", - "metrics.done 0.0\n", - "metrics.pid 282335.0\n", + "artifact_uri file:///efs/shared_storage/madewithml/GokuMoha...\n", + "start_time 2023-09-14 22:02:50.670000+00:00\n", + "end_time 2023-09-14 22:04:48.292000+00:00\n", + "metrics.pid 50696.0\n", + "metrics.epoch 9.0\n", + "metrics.config/train_loop_config/batch_size 256.0\n", "metrics.iterations_since_restore 10.0\n", - "metrics.training_iteration 10.0\n", - "metrics.config/train_loop_config/num_classes 4.0\n", - "metrics.lr 0.0001\n", - "metrics.config/train_loop_config/num_epochs 10.0\n", - "metrics.timestamp 1690247581.0\n", - "metrics.time_total_s 75.376313\n", "metrics.config/train_loop_config/lr_patience 3.0\n", - "metrics.config/train_loop_config/batch_size 256.0\n", - "metrics.epoch 9.0\n", - "metrics.should_checkpoint 1.0\n", - "metrics.train_loss 0.034435\n", + "metrics.done 0.0\n", "metrics.config/train_loop_config/dropout_p 0.5\n", + "metrics.config/train_loop_config/num_classes 4.0\n", + "metrics.time_total_s 106.194723\n", + "metrics.config/train_loop_config/lr 0.0001\n", "metrics.config/train_loop_config/lr_factor 0.8\n", - "metrics.val_loss 0.221598\n", - "params.train_loop_config/num_classes 4\n", + "metrics.should_checkpoint 1.0\n", + "metrics.val_loss 0.137778\n", + 
"metrics.train_loss 0.036118\n", + "metrics.timestamp 1694729070.0\n", + "metrics.time_since_restore 106.194723\n", + "metrics.time_this_iter_s 9.898459\n", + "metrics.config/train_loop_config/num_epochs 10.0\n", + "metrics.training_iteration 10.0\n", + "metrics.trial_id inf\n", + "metrics.lr 0.0001\n", "params.train_loop_config/dropout_p 0.5\n", - "params.train_loop_config/num_epochs 10\n", + "params.train_loop_config/batch_size 256\n", + "params.train_loop_config/lr_factor 0.8\n", + "params.train_loop_config/num_classes 4\n", "params.train_loop_config/lr_patience 3\n", + "params.train_loop_config/num_epochs 10\n", "params.train_loop_config/lr 0.0001\n", - "params.train_loop_config/lr_factor 0.8\n", - "params.train_loop_config/batch_size 256\n", - "tags.trial_name TorchTrainer_38d2b_00000\n", - "tags.mlflow.runName TorchTrainer_38d2b_00000\n", + "tags.trial_name TorchTrainer_701e4_00000\n", + "tags.mlflow.runName TorchTrainer_701e4_00000\n", "Name: 0, dtype: object" ] }, - "execution_count": 86, + "execution_count": 87, "metadata": {}, "output_type": "execute_result" } @@ -6273,7 +6237,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -6281,19 +6244,17 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Let's view what we've tracked from our experiment. 
MLFlow serves a dashboard for us to view and explore our experiments on a localhost port:\n", "\n", "```bash\n", - "mlflow server -h 0.0.0.0 -p 8080 --backend-store-uri /tmp/mlflow/\n", + "mlflow server -h 0.0.0.0 -p 8080 --backend-store-uri $EFS_DIR/mlflow\n", "```" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -6311,7 +6272,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -6320,7 +6280,7 @@ }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 88, "metadata": { "tags": [] }, @@ -6332,7 +6292,7 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 89, "metadata": { "tags": [] }, @@ -6346,7 +6306,7 @@ }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 90, "metadata": { "tags": [] }, @@ -6355,9 +6315,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-07-24 18:13:08,541\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(24)] -> TaskPoolMapOperator[MapBatches(CustomPreprocessor._transform_pandas)->MapBatches()]\n", - "2023-07-24 18:13:08,542\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-07-24 18:13:08,543\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2023-09-14 15:04:49,356\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(48)] -> TaskPoolMapOperator[MapBatches(CustomPreprocessor._transform_pandas)->MapBatches()]\n", + "2023-09-14 15:04:49,357\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, 
gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-09-14 15:04:49,357\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, { @@ -6368,7 +6328,7 @@ "version_minor": 0 }, "text/plain": [ - "Running 0: 0%| | 0/24 [00:00 TaskPoolMapOperator[ReadCSV->SplitBlocks(24)] -> AllToAllOperator[RandomShuffle] -> LimitOperator[limit=1]\n", - "2023-07-24 18:13:12,895\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-07-24 18:13:12,895\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2023-09-14 15:04:52,757\tINFO read_api.py:374 -- To satisfy the requested parallelism of 48, each read task output will be split into 48 smaller blocks.\n", + "2023-09-14 15:04:52,760\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(48)] -> AllToAllOperator[RandomShuffle] -> LimitOperator[limit=1]\n", + "2023-09-14 15:04:52,761\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-09-14 15:04:52,761\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, { @@ -6520,7 +6481,7 @@ "version_minor": 0 }, "text/plain": [ - "- RandomShuffle 1: 0%| | 0/24 [00:00 
TaskPoolMapOperator[ReadCSV->SplitBlocks(24)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> LimitOperator[limit=1]\n", - "2023-07-24 18:13:14,122\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-07-24 18:13:14,123\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2023-09-14 15:04:53,537\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(48)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> LimitOperator[limit=1]\n", + "2023-09-14 15:04:53,538\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-09-14 15:04:53,539\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, { @@ -6599,7 +6560,7 @@ "version_minor": 0 }, "text/plain": [ - "- RandomShuffle 1: 0%| | 0/24 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/24 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/48 [00:00 TaskPoolMapOperator[ReadCSV->SplitBlocks(24)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> AllToAllOperator[Aggregate] -> TaskPoolMapOperator[MapBatches()]\n", - "2023-07-24 
18:13:15,060\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-07-24 18:13:15,061\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2023-09-14 15:04:54,839\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(48)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> AllToAllOperator[Aggregate] -> TaskPoolMapOperator[MapBatches()]\n", + "2023-09-14 15:04:54,841\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-09-14 15:04:54,842\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, { @@ -6776,7 +6737,7 @@ "version_minor": 0 }, "text/plain": [ - "- RandomShuffle 1: 0%| | 0/24 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/24 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/48 [00:00 TaskPoolMapOperator[ReadCSV->SplitBlocks(24)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> TaskPoolMapOperator[MapBatches(CustomPreprocessor._transform_pandas)]\n", - "2023-07-24 18:13:16,029\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, 
preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-07-24 18:13:16,032\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2023-09-14 15:04:56,618\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(48)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> TaskPoolMapOperator[MapBatches(CustomPreprocessor._transform_pandas)]\n", + "2023-09-14 15:04:56,619\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-09-14 15:04:56,620\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, { @@ -7009,7 +6970,7 @@ "version_minor": 0 }, "text/plain": [ - "- RandomShuffle 1: 0%| | 0/24 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/24 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/48 [00:00 TaskPoolMapOperator[ReadCSV->SplitBlocks(24)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> TaskPoolMapOperator[MapBatches(CustomPreprocessor._transform_pandas)]\n", - "2023-07-24 18:13:17,237\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-07-24 18:13:17,237\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run 
`ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2023-09-14 15:04:58,287\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(48)] -> AllToAllOperator[RandomShuffle] -> AllToAllOperator[Sort] -> AllToAllOperator[MapBatches(group_fn)->MapBatches(_filter_split)->RandomShuffle] -> TaskPoolMapOperator[MapBatches(CustomPreprocessor._transform_pandas)]\n", + "2023-09-14 15:04:58,288\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-09-14 15:04:58,289\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, { @@ -7186,7 +7147,7 @@ "version_minor": 0 }, "text/plain": [ - "- RandomShuffle 1: 0%| | 0/24 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/24 [00:00MapBatches(_filter_split)->RandomShuffle 8: 0%| | 0/48 [00:00Tune Status\n", " \n", "\n", - "\n", - "\n", - "\n", + "\n", + "\n", + "\n", "\n", "
Current time:2023-07-24 18:15:25
Running for: 00:02:06.87
Memory: 9.0/30.9 GiB
Current time:2023-09-14 15:08:20
Running for: 00:03:20.50
Memory: 7.7/30.9 GiB
\n", " \n", "
\n", "
\n", "

System Info

\n", - " Using AsyncHyperBand: num_stopped=2
Bracket: Iter 5.000: -0.3085140064358711
Logical resource usage: 4.0/12 CPUs, 1.0/1 GPUs\n", + " Using AsyncHyperBand: num_stopped=2
Bracket: Iter 5.000: -0.2885470949113369
Logical resource usage: 4.0/24 CPUs, 1.0/1 GPUs\n", "
\n", " \n", " \n", @@ -7532,14 +7493,14 @@ "

Trial Status

\n", " \n", "\n", - "\n", + "_patience\n", "\n", "\n", - "\n", - "\n", + "\n", + "\n", "\n", "
Trial name status loc train_loop_config/dr\n", + "
Trial name status loc train_loop_config/dr\n", "opout_p train_loop_config/lr train_loop_config/lr\n", "_factor train_loop_config/lr\n", - "_patience iter total time (s) epoch lr train_loss
iter total time (s) epoch lr train_loss
TorchTrainer_578373f0TERMINATED10.0.56.150:2833230.5 0.0001 0.8 3 10 76.0183 90.0001 0.0386915
TorchTrainer_50d8c90fTERMINATED10.0.56.150:2841490.356927 2.63429e-050.1487252.50789 5 42.2436 42.63429e-05 0.407936
TorchTrainer_55490282TERMINATED10.0.50.151:514600.5 0.0001 0.8 3 10 107.565 90.0001 0.0393358
TorchTrainer_45b199dfTERMINATED10.0.50.151:520630.797623 2.6943e-050.2671435.21456 5 57.5858 42.6943e-05 0.482763
\n", " \n", @@ -7586,79 +7547,86 @@ "name": "stderr", "output_type": "stream", "text": [ - "(TorchTrainer pid=283323) The dict form of `dataset_config` is deprecated. Use the DataConfig class instead. Support for this will be dropped in a future release.\n", - "(TorchTrainer pid=283323) The `preprocessor` arg to Trainer is deprecated. Apply preprocessor transformations ahead of time by calling `preprocessor.transform(ds)`. Support for the preprocessor arg will be dropped in a future release.\n", - "(TorchTrainer pid=283323) Starting distributed worker processes: ['3541 (10.0.6.57)']\n", - "(RayTrainWorker pid=3541, ip=10.0.6.57) Setting up process group for: env:// [rank=0, world_size=1]\n", - "(RayTrainWorker pid=3541, ip=10.0.6.57) Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias']\n", - "(RayTrainWorker pid=3541, ip=10.0.6.57) - This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", - "(RayTrainWorker pid=3541, ip=10.0.6.57) - This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", - "(RayTrainWorker pid=3541, ip=10.0.6.57) Moving model to device: cuda:0\n", - "(RayTrainWorker pid=3541, ip=10.0.6.57) /tmp/ipykernel_280376/1209796013.py:7: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. 
This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:206.)\n" + "(TorchTrainer pid=51460, ip=10.0.50.151) The dict form of `dataset_config` is deprecated. Use the DataConfig class instead. Support for this will be dropped in a future release.\n", + "(TorchTrainer pid=51460, ip=10.0.50.151) The `preprocessor` arg to Trainer is deprecated. Apply preprocessor transformations ahead of time by calling `preprocessor.transform(ds)`. Support for the preprocessor arg will be dropped in a future release.\n", + "(TorchTrainer pid=51460, ip=10.0.50.151) Starting distributed worker processes: ['51502 (10.0.50.151)']\n", + "(RayTrainWorker pid=51502, ip=10.0.50.151) Setting up process group for: env:// [rank=0, world_size=1]\n", + "(RayTrainWorker pid=51502, ip=10.0.50.151) Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']\n", + "(RayTrainWorker pid=51502, ip=10.0.50.151) - This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "(RayTrainWorker pid=51502, ip=10.0.50.151) - This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "(RayTrainWorker pid=51502, ip=10.0.50.151) Moving model to device: cuda:0\n", + "(RayTrainWorker pid=51502, ip=10.0.50.151) /tmp/ipykernel_117917/1209796013.py:7: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:206.)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "copying /home/ray/ray_results/TorchTrainer_2023-07-24_18-13-18/TorchTrainer_578373f0_1_dropout_p=0.5000,lr=0.0001,lr_factor=0.8000,lr_patience=3.0000_2023-07-24_18-13-18/events.out.tfevents.1690247602.ip-10-0-56-150 -> /tmp/mlflow/598544898920467811/6b854ffda94844e28013fc6deb157af4/artifacts\n", - "copying /home/ray/ray_results/TorchTrainer_2023-07-24_18-13-18/TorchTrainer_578373f0_1_dropout_p=0.5000,lr=0.0001,lr_factor=0.8000,lr_patience=3.0000_2023-07-24_18-13-18/progress.csv -> /tmp/mlflow/598544898920467811/6b854ffda94844e28013fc6deb157af4/artifacts\n", - "copying /home/ray/ray_results/TorchTrainer_2023-07-24_18-13-18/TorchTrainer_578373f0_1_dropout_p=0.5000,lr=0.0001,lr_factor=0.8000,lr_patience=3.0000_2023-07-24_18-13-18/result.json -> /tmp/mlflow/598544898920467811/6b854ffda94844e28013fc6deb157af4/artifacts\n", - "copying 
/home/ray/ray_results/TorchTrainer_2023-07-24_18-13-18/TorchTrainer_578373f0_1_dropout_p=0.5000,lr=0.0001,lr_factor=0.8000,lr_patience=3.0000_2023-07-24_18-13-18/params.json -> /tmp/mlflow/598544898920467811/6b854ffda94844e28013fc6deb157af4/artifacts\n", - "copying /home/ray/ray_results/TorchTrainer_2023-07-24_18-13-18/TorchTrainer_578373f0_1_dropout_p=0.5000,lr=0.0001,lr_factor=0.8000,lr_patience=3.0000_2023-07-24_18-13-18/.lazy_checkpoint_marker -> /tmp/mlflow/598544898920467811/6b854ffda94844e28013fc6deb157af4/artifacts\n", - "copying /home/ray/ray_results/TorchTrainer_2023-07-24_18-13-18/TorchTrainer_578373f0_1_dropout_p=0.5000,lr=0.0001,lr_factor=0.8000,lr_patience=3.0000_2023-07-24_18-13-18/params.pkl -> /tmp/mlflow/598544898920467811/6b854ffda94844e28013fc6deb157af4/artifacts\n", - "creating /tmp/mlflow/598544898920467811/6b854ffda94844e28013fc6deb157af4/artifacts/checkpoint_000009\n", - "copying /home/ray/ray_results/TorchTrainer_2023-07-24_18-13-18/TorchTrainer_578373f0_1_dropout_p=0.5000,lr=0.0001,lr_factor=0.8000,lr_patience=3.0000_2023-07-24_18-13-18/checkpoint_000009/.is_checkpoint -> /tmp/mlflow/598544898920467811/6b854ffda94844e28013fc6deb157af4/artifacts/checkpoint_000009\n", - "copying /home/ray/ray_results/TorchTrainer_2023-07-24_18-13-18/TorchTrainer_578373f0_1_dropout_p=0.5000,lr=0.0001,lr_factor=0.8000,lr_patience=3.0000_2023-07-24_18-13-18/checkpoint_000009/.tune_metadata -> /tmp/mlflow/598544898920467811/6b854ffda94844e28013fc6deb157af4/artifacts/checkpoint_000009\n", - "copying /home/ray/ray_results/TorchTrainer_2023-07-24_18-13-18/TorchTrainer_578373f0_1_dropout_p=0.5000,lr=0.0001,lr_factor=0.8000,lr_patience=3.0000_2023-07-24_18-13-18/checkpoint_000009/dict_checkpoint.pkl -> /tmp/mlflow/598544898920467811/6b854ffda94844e28013fc6deb157af4/artifacts/checkpoint_000009\n", - "copying 
/home/ray/ray_results/TorchTrainer_2023-07-24_18-13-18/TorchTrainer_578373f0_1_dropout_p=0.5000,lr=0.0001,lr_factor=0.8000,lr_patience=3.0000_2023-07-24_18-13-18/checkpoint_000009/.metadata.pkl -> /tmp/mlflow/598544898920467811/6b854ffda94844e28013fc6deb157af4/artifacts/checkpoint_000009\n" + "creating /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/97a3692584eb4a51999f7e97176e3297/artifacts/checkpoint_000009\n", + "copying /efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-05-00/TorchTrainer_55490282_1_dropout_p=0.5000,lr=0.0001,lr_factor=0.8000,lr_patience=3.0000_2023-09-14_15-05-00/checkpoint_000009/dict_checkpoint.pkl -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/97a3692584eb4a51999f7e97176e3297/artifacts/checkpoint_000009\n", + "copying /efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-05-00/TorchTrainer_55490282_1_dropout_p=0.5000,lr=0.0001,lr_factor=0.8000,lr_patience=3.0000_2023-09-14_15-05-00/checkpoint_000009/.metadata.pkl -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/97a3692584eb4a51999f7e97176e3297/artifacts/checkpoint_000009\n", + "copying /efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-05-00/TorchTrainer_55490282_1_dropout_p=0.5000,lr=0.0001,lr_factor=0.8000,lr_patience=3.0000_2023-09-14_15-05-00/checkpoint_000009/.tune_metadata -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/97a3692584eb4a51999f7e97176e3297/artifacts/checkpoint_000009\n", + "copying /efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-05-00/TorchTrainer_55490282_1_dropout_p=0.5000,lr=0.0001,lr_factor=0.8000,lr_patience=3.0000_2023-09-14_15-05-00/checkpoint_000009/.is_checkpoint -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/97a3692584eb4a51999f7e97176e3297/artifacts/checkpoint_000009\n", + "copying 
/efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-05-00/TorchTrainer_55490282_1_dropout_p=0.5000,lr=0.0001,lr_factor=0.8000,lr_patience=3.0000_2023-09-14_15-05-00/params.json -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/97a3692584eb4a51999f7e97176e3297/artifacts\n", + "copying /efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-05-00/TorchTrainer_55490282_1_dropout_p=0.5000,lr=0.0001,lr_factor=0.8000,lr_patience=3.0000_2023-09-14_15-05-00/.lazy_checkpoint_marker -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/97a3692584eb4a51999f7e97176e3297/artifacts\n", + "copying /efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-05-00/TorchTrainer_55490282_1_dropout_p=0.5000,lr=0.0001,lr_factor=0.8000,lr_patience=3.0000_2023-09-14_15-05-00/params.pkl -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/97a3692584eb4a51999f7e97176e3297/artifacts\n", + "creating /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/97a3692584eb4a51999f7e97176e3297/artifacts/rank_0\n", + "copying /efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-05-00/TorchTrainer_55490282_1_dropout_p=0.5000,lr=0.0001,lr_factor=0.8000,lr_patience=3.0000_2023-09-14_15-05-00/events.out.tfevents.1694729104.ip-10-0-41-50 -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/97a3692584eb4a51999f7e97176e3297/artifacts\n", + "copying /efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-05-00/TorchTrainer_55490282_1_dropout_p=0.5000,lr=0.0001,lr_factor=0.8000,lr_patience=3.0000_2023-09-14_15-05-00/result.json -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/97a3692584eb4a51999f7e97176e3297/artifacts\n", + "creating /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/97a3692584eb4a51999f7e97176e3297/artifacts/checkpoint_000008\n", + "copying 
/efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-05-00/TorchTrainer_55490282_1_dropout_p=0.5000,lr=0.0001,lr_factor=0.8000,lr_patience=3.0000_2023-09-14_15-05-00/checkpoint_000008/dict_checkpoint.pkl -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/97a3692584eb4a51999f7e97176e3297/artifacts/checkpoint_000008\n", + "copying /efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-05-00/TorchTrainer_55490282_1_dropout_p=0.5000,lr=0.0001,lr_factor=0.8000,lr_patience=3.0000_2023-09-14_15-05-00/checkpoint_000008/.metadata.pkl -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/97a3692584eb4a51999f7e97176e3297/artifacts/checkpoint_000008\n", + "copying /efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-05-00/TorchTrainer_55490282_1_dropout_p=0.5000,lr=0.0001,lr_factor=0.8000,lr_patience=3.0000_2023-09-14_15-05-00/checkpoint_000008/.tune_metadata -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/97a3692584eb4a51999f7e97176e3297/artifacts/checkpoint_000008\n", + "copying /efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-05-00/TorchTrainer_55490282_1_dropout_p=0.5000,lr=0.0001,lr_factor=0.8000,lr_patience=3.0000_2023-09-14_15-05-00/checkpoint_000008/.is_checkpoint -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/97a3692584eb4a51999f7e97176e3297/artifacts/checkpoint_000008\n", + "copying /efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-05-00/TorchTrainer_55490282_1_dropout_p=0.5000,lr=0.0001,lr_factor=0.8000,lr_patience=3.0000_2023-09-14_15-05-00/progress.csv -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/97a3692584eb4a51999f7e97176e3297/artifacts\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "(TorchTrainer pid=284149) The dict form of `dataset_config` is deprecated. Use the DataConfig class instead. 
Support for this will be dropped in a future release.\n", - "(TorchTrainer pid=284149) The `preprocessor` arg to Trainer is deprecated. Apply preprocessor transformations ahead of time by calling `preprocessor.transform(ds)`. Support for the preprocessor arg will be dropped in a future release.\n", - "(TorchTrainer pid=284149) Starting distributed worker processes: ['3934 (10.0.6.57)']\n", - "(RayTrainWorker pid=3934, ip=10.0.6.57) Setting up process group for: env:// [rank=0, world_size=1]\n", - "(RayTrainWorker pid=3934, ip=10.0.6.57) Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']\n", - "(RayTrainWorker pid=3934, ip=10.0.6.57) - This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", - "(RayTrainWorker pid=3934, ip=10.0.6.57) - This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", - "(RayTrainWorker pid=3934, ip=10.0.6.57) Moving model to device: cuda:0\n", - "(RayTrainWorker pid=3934, ip=10.0.6.57) /tmp/ipykernel_280376/1209796013.py:7: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. 
This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:206.)\n" + "(TorchTrainer pid=52063, ip=10.0.50.151) The dict form of `dataset_config` is deprecated. Use the DataConfig class instead. Support for this will be dropped in a future release.\n", + "(TorchTrainer pid=52063, ip=10.0.50.151) The `preprocessor` arg to Trainer is deprecated. Apply preprocessor transformations ahead of time by calling `preprocessor.transform(ds)`. Support for the preprocessor arg will be dropped in a future release.\n", + "(TorchTrainer pid=52063, ip=10.0.50.151) Starting distributed worker processes: ['52105 (10.0.50.151)']\n", + "(RayTrainWorker pid=52105, ip=10.0.50.151) Setting up process group for: env:// [rank=0, world_size=1]\n", + "(RayTrainWorker pid=52105, ip=10.0.50.151) Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']\n", + "(RayTrainWorker pid=52105, ip=10.0.50.151) - This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "(RayTrainWorker pid=52105, ip=10.0.50.151) - This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "(RayTrainWorker pid=52105, ip=10.0.50.151) Moving model to device: cuda:0\n", + "(RayTrainWorker pid=52105, ip=10.0.50.151) /tmp/ipykernel_117917/1209796013.py:7: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:206.)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "copying /home/ray/ray_results/TorchTrainer_2023-07-24_18-13-18/TorchTrainer_50d8c90f_2_dropout_p=0.3569,lr=0.0000,lr_factor=0.1487,lr_patience=2.5079_2023-07-24_18-13-22/progress.csv -> /tmp/mlflow/598544898920467811/85444db0a2794c43a14accae3054d0e2/artifacts\n", - "copying /home/ray/ray_results/TorchTrainer_2023-07-24_18-13-18/TorchTrainer_50d8c90f_2_dropout_p=0.3569,lr=0.0000,lr_factor=0.1487,lr_patience=2.5079_2023-07-24_18-13-22/events.out.tfevents.1690247682.ip-10-0-56-150 -> /tmp/mlflow/598544898920467811/85444db0a2794c43a14accae3054d0e2/artifacts\n", - "creating /tmp/mlflow/598544898920467811/85444db0a2794c43a14accae3054d0e2/artifacts/checkpoint_000004\n", - "copying /home/ray/ray_results/TorchTrainer_2023-07-24_18-13-18/TorchTrainer_50d8c90f_2_dropout_p=0.3569,lr=0.0000,lr_factor=0.1487,lr_patience=2.5079_2023-07-24_18-13-22/checkpoint_000004/.is_checkpoint -> /tmp/mlflow/598544898920467811/85444db0a2794c43a14accae3054d0e2/artifacts/checkpoint_000004\n", - "copying 
/home/ray/ray_results/TorchTrainer_2023-07-24_18-13-18/TorchTrainer_50d8c90f_2_dropout_p=0.3569,lr=0.0000,lr_factor=0.1487,lr_patience=2.5079_2023-07-24_18-13-22/checkpoint_000004/.tune_metadata -> /tmp/mlflow/598544898920467811/85444db0a2794c43a14accae3054d0e2/artifacts/checkpoint_000004\n", - "copying /home/ray/ray_results/TorchTrainer_2023-07-24_18-13-18/TorchTrainer_50d8c90f_2_dropout_p=0.3569,lr=0.0000,lr_factor=0.1487,lr_patience=2.5079_2023-07-24_18-13-22/checkpoint_000004/dict_checkpoint.pkl -> /tmp/mlflow/598544898920467811/85444db0a2794c43a14accae3054d0e2/artifacts/checkpoint_000004\n", - "copying /home/ray/ray_results/TorchTrainer_2023-07-24_18-13-18/TorchTrainer_50d8c90f_2_dropout_p=0.3569,lr=0.0000,lr_factor=0.1487,lr_patience=2.5079_2023-07-24_18-13-22/checkpoint_000004/.metadata.pkl -> /tmp/mlflow/598544898920467811/85444db0a2794c43a14accae3054d0e2/artifacts/checkpoint_000004\n", - "copying /home/ray/ray_results/TorchTrainer_2023-07-24_18-13-18/TorchTrainer_50d8c90f_2_dropout_p=0.3569,lr=0.0000,lr_factor=0.1487,lr_patience=2.5079_2023-07-24_18-13-22/result.json -> /tmp/mlflow/598544898920467811/85444db0a2794c43a14accae3054d0e2/artifacts\n", - "copying /home/ray/ray_results/TorchTrainer_2023-07-24_18-13-18/TorchTrainer_50d8c90f_2_dropout_p=0.3569,lr=0.0000,lr_factor=0.1487,lr_patience=2.5079_2023-07-24_18-13-22/params.json -> /tmp/mlflow/598544898920467811/85444db0a2794c43a14accae3054d0e2/artifacts\n", - "copying /home/ray/ray_results/TorchTrainer_2023-07-24_18-13-18/TorchTrainer_50d8c90f_2_dropout_p=0.3569,lr=0.0000,lr_factor=0.1487,lr_patience=2.5079_2023-07-24_18-13-22/.lazy_checkpoint_marker -> /tmp/mlflow/598544898920467811/85444db0a2794c43a14accae3054d0e2/artifacts\n", - "copying /home/ray/ray_results/TorchTrainer_2023-07-24_18-13-18/TorchTrainer_50d8c90f_2_dropout_p=0.3569,lr=0.0000,lr_factor=0.1487,lr_patience=2.5079_2023-07-24_18-13-22/params.pkl -> /tmp/mlflow/598544898920467811/85444db0a2794c43a14accae3054d0e2/artifacts\n" + "copying 
/efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-05-00/TorchTrainer_45b199df_2_dropout_p=0.7976,lr=0.0000,lr_factor=0.2671,lr_patience=5.2146_2023-09-14_15-05-04/params.json -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/57c609dd76c9477896863f04f2a46bb5/artifacts\n", + "copying /efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-05-00/TorchTrainer_45b199df_2_dropout_p=0.7976,lr=0.0000,lr_factor=0.2671,lr_patience=5.2146_2023-09-14_15-05-04/events.out.tfevents.1694729233.ip-10-0-41-50 -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/57c609dd76c9477896863f04f2a46bb5/artifacts\n", + "copying /efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-05-00/TorchTrainer_45b199df_2_dropout_p=0.7976,lr=0.0000,lr_factor=0.2671,lr_patience=5.2146_2023-09-14_15-05-04/.lazy_checkpoint_marker -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/57c609dd76c9477896863f04f2a46bb5/artifacts\n", + "copying /efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-05-00/TorchTrainer_45b199df_2_dropout_p=0.7976,lr=0.0000,lr_factor=0.2671,lr_patience=5.2146_2023-09-14_15-05-04/params.pkl -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/57c609dd76c9477896863f04f2a46bb5/artifacts\n", + "creating /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/57c609dd76c9477896863f04f2a46bb5/artifacts/checkpoint_000004\n", + "copying /efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-05-00/TorchTrainer_45b199df_2_dropout_p=0.7976,lr=0.0000,lr_factor=0.2671,lr_patience=5.2146_2023-09-14_15-05-04/checkpoint_000004/dict_checkpoint.pkl -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/57c609dd76c9477896863f04f2a46bb5/artifacts/checkpoint_000004\n", + "copying 
/efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-05-00/TorchTrainer_45b199df_2_dropout_p=0.7976,lr=0.0000,lr_factor=0.2671,lr_patience=5.2146_2023-09-14_15-05-04/checkpoint_000004/.metadata.pkl -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/57c609dd76c9477896863f04f2a46bb5/artifacts/checkpoint_000004\n", + "copying /efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-05-00/TorchTrainer_45b199df_2_dropout_p=0.7976,lr=0.0000,lr_factor=0.2671,lr_patience=5.2146_2023-09-14_15-05-04/checkpoint_000004/.tune_metadata -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/57c609dd76c9477896863f04f2a46bb5/artifacts/checkpoint_000004\n", + "copying /efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-05-00/TorchTrainer_45b199df_2_dropout_p=0.7976,lr=0.0000,lr_factor=0.2671,lr_patience=5.2146_2023-09-14_15-05-04/checkpoint_000004/.is_checkpoint -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/57c609dd76c9477896863f04f2a46bb5/artifacts/checkpoint_000004\n", + "creating /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/57c609dd76c9477896863f04f2a46bb5/artifacts/rank_0\n", + "copying /efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-05-00/TorchTrainer_45b199df_2_dropout_p=0.7976,lr=0.0000,lr_factor=0.2671,lr_patience=5.2146_2023-09-14_15-05-04/result.json -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/57c609dd76c9477896863f04f2a46bb5/artifacts\n", + "copying /efs/shared_storage/madewithml/GokuMohandas/TorchTrainer_2023-09-14_15-05-00/TorchTrainer_45b199df_2_dropout_p=0.7976,lr=0.0000,lr_factor=0.2671,lr_patience=5.2146_2023-09-14_15-05-04/progress.csv -> /efs/shared_storage/madewithml/GokuMohandas/mlflow/917462352875586010/57c609dd76c9477896863f04f2a46bb5/artifacts\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2023-07-24 18:15:25,696\tINFO tune.py:1148 -- Total run 
time: 126.91 seconds (126.86 seconds for the tuning loop).\n" + "2023-09-14 15:08:20,659\tINFO tune.py:1148 -- Total run time: 200.56 seconds (200.45 seconds for the tuning loop).\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 3.49 s, sys: 1.41 s, total: 4.9 s\n", - "Wall time: 2min 6s\n" + "CPU times: user 3.63 s, sys: 2.11 s, total: 5.73 s\n", + "Wall time: 3min 20s\n" ] } ], @@ -7670,7 +7638,7 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 107, "metadata": { "tags": [] }, @@ -7724,49 +7692,49 @@ " 0\n", " 9\n", " 0.000100\n", - " 0.038692\n", - " 0.226971\n", - " 1690247675\n", - " 6.857913\n", + " 0.039336\n", + " 0.160408\n", + " 1694729205\n", + " 10.023823\n", " True\n", " True\n", " 10\n", - " 578373f0\n", + " 55490282\n", " ...\n", - " 283323\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 76.018291\n", + " 51460\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 107.564980\n", " 10\n", " 0.500000\n", " 0.000100\n", " 0.800000\n", " 3.000000\n", - " /home/ray/ray_results/TorchTrainer_2023-07-24_...\n", + " /efs/shared_storage/madewithml/GokuMohandas/To...\n", " \n", " \n", " 1\n", " 4\n", - " 0.000026\n", - " 0.407936\n", - " 0.389877\n", - " 1690247722\n", - " 6.744698\n", + " 0.000027\n", + " 0.482763\n", + " 0.436918\n", + " 1694729285\n", + " 10.070868\n", " True\n", " True\n", " 5\n", - " 50d8c90f\n", + " 45b199df\n", " ...\n", - " 284149\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 42.243630\n", + " 52063\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 57.585811\n", " 5\n", - " 0.356927\n", - " 0.000026\n", - " 0.148725\n", - " 2.507894\n", - " /home/ray/ray_results/TorchTrainer_2023-07-24_...\n", + " 0.797623\n", + " 0.000027\n", + " 0.267143\n", + " 5.214562\n", + " /efs/shared_storage/madewithml/GokuMohandas/To...\n", " \n", " \n", "\n", @@ -7775,33 +7743,33 @@ ], "text/plain": [ " epoch lr train_loss val_loss timestamp time_this_iter_s \n", - "0 9 0.000100 0.038692 
0.226971 1690247675 6.857913 \\\n", - "1 4 0.000026 0.407936 0.389877 1690247722 6.744698 \n", + "0 9 0.000100 0.039336 0.160408 1694729205 10.023823 \\\n", + "1 4 0.000027 0.482763 0.436918 1694729285 10.070868 \n", "\n", - " should_checkpoint done training_iteration trial_id ... pid \n", - "0 True True 10 578373f0 ... 283323 \\\n", - "1 True True 5 50d8c90f ... 284149 \n", + " should_checkpoint done training_iteration trial_id ... pid \n", + "0 True True 10 55490282 ... 51460 \\\n", + "1 True True 5 45b199df ... 52063 \n", "\n", " hostname node_ip time_since_restore iterations_since_restore \n", - "0 ip-10-0-56-150 10.0.56.150 76.018291 10 \\\n", - "1 ip-10-0-56-150 10.0.56.150 42.243630 5 \n", + "0 ip-10-0-50-151 10.0.50.151 107.564980 10 \\\n", + "1 ip-10-0-50-151 10.0.50.151 57.585811 5 \n", "\n", " config/train_loop_config/dropout_p config/train_loop_config/lr \n", "0 0.500000 0.000100 \\\n", - "1 0.356927 0.000026 \n", + "1 0.797623 0.000027 \n", "\n", " config/train_loop_config/lr_factor config/train_loop_config/lr_patience \n", "0 0.800000 3.000000 \\\n", - "1 0.148725 2.507894 \n", + "1 0.267143 5.214562 \n", "\n", " logdir \n", - "0 /home/ray/ray_results/TorchTrainer_2023-07-24_... \n", - "1 /home/ray/ray_results/TorchTrainer_2023-07-24_... \n", + "0 /efs/shared_storage/madewithml/GokuMohandas/To... \n", + "1 /efs/shared_storage/madewithml/GokuMohandas/To... 
\n", "\n", "[2 rows x 22 columns]" ] }, - "execution_count": 106, + "execution_count": 107, "metadata": {}, "output_type": "execute_result" } @@ -7813,7 +7781,7 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 108, "metadata": { "tags": [] }, @@ -7863,200 +7831,200 @@ " 0\n", " 0\n", " 0.0001\n", - " 0.581037\n", - " 0.501312\n", - " 1690247613\n", - " 14.747473\n", + " 0.577241\n", + " 0.493672\n", + " 1694729113\n", + " 16.759313\n", " True\n", " False\n", " 1\n", - " 578373f0\n", - " 2023-07-24_18-13-36\n", - " 14.747473\n", - " 283323\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 14.747473\n", + " 55490282\n", + " 2023-09-14_15-05-20\n", + " 16.759313\n", + " 51460\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 16.759313\n", " 1\n", " \n", " \n", " 1\n", " 1\n", " 0.0001\n", - " 0.486921\n", - " 0.432752\n", - " 1690247620\n", - " 7.031596\n", + " 0.476221\n", + " 0.417544\n", + " 1694729124\n", + " 10.431773\n", " True\n", " False\n", " 2\n", - " 578373f0\n", - " 2023-07-24_18-13-44\n", - " 21.779069\n", - " 283323\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 21.779069\n", + " 55490282\n", + " 2023-09-14_15-05-31\n", + " 27.191086\n", + " 51460\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 27.191086\n", " 2\n", " \n", " \n", " 2\n", " 2\n", " 0.0001\n", - " 0.383275\n", - " 0.357377\n", - " 1690247627\n", - " 6.705714\n", + " 0.368710\n", + " 0.373515\n", + " 1694729135\n", + " 10.071463\n", " True\n", " False\n", " 3\n", - " 578373f0\n", - " 2023-07-24_18-13-50\n", - " 28.484783\n", - " 283323\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 28.484783\n", + " 55490282\n", + " 2023-09-14_15-05-41\n", + " 37.262549\n", + " 51460\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 37.262549\n", " 3\n", " \n", " \n", " 3\n", " 3\n", " 0.0001\n", - " 0.276686\n", - " 0.300713\n", - " 1690247634\n", - " 6.760320\n", + " 0.292983\n", + " 0.275497\n", + " 1694729145\n", + " 10.122023\n", " True\n", " False\n", " 4\n", - 
" 578373f0\n", - " 2023-07-24_18-13-57\n", - " 35.245103\n", - " 283323\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 35.245103\n", + " 55490282\n", + " 2023-09-14_15-05-51\n", + " 47.384572\n", + " 51460\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 47.384572\n", " 4\n", " \n", " \n", " 4\n", " 4\n", " 0.0001\n", - " 0.208469\n", - " 0.281393\n", - " 1690247641\n", - " 6.757150\n", + " 0.210210\n", + " 0.239090\n", + " 1694729155\n", + " 9.991925\n", " True\n", " False\n", " 5\n", - " 578373f0\n", - " 2023-07-24_18-14-04\n", - " 42.002253\n", - " 283323\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 42.002253\n", + " 55490282\n", + " 2023-09-14_15-06-01\n", + " 57.376497\n", + " 51460\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 57.376497\n", " 5\n", " \n", " \n", " 5\n", " 5\n", " 0.0001\n", - " 0.146135\n", - " 0.260962\n", - " 1690247647\n", - " 6.796252\n", + " 0.161824\n", + " 0.197741\n", + " 1694729165\n", + " 10.084616\n", " True\n", " False\n", " 6\n", - " 578373f0\n", - " 2023-07-24_18-14-11\n", - " 48.798505\n", - " 283323\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 48.798505\n", + " 55490282\n", + " 2023-09-14_15-06-11\n", + " 67.461113\n", + " 51460\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 67.461113\n", " 6\n", " \n", " \n", " 6\n", " 6\n", " 0.0001\n", - " 0.099608\n", - " 0.260589\n", - " 1690247654\n", - " 6.804206\n", + " 0.113558\n", + " 0.159887\n", + " 1694729175\n", + " 9.953290\n", " True\n", " False\n", " 7\n", - " 578373f0\n", - " 2023-07-24_18-14-17\n", - " 55.602711\n", - " 283323\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 55.602711\n", + " 55490282\n", + " 2023-09-14_15-06-21\n", + " 77.414402\n", + " 51460\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 77.414402\n", " 7\n", " \n", " \n", " 7\n", " 7\n", " 0.0001\n", - " 0.072397\n", - " 0.253754\n", - " 1690247661\n", - " 6.785470\n", + " 0.076707\n", + " 0.174182\n", + " 1694729185\n", + " 10.031824\n", " True\n", " False\n", " 8\n", - " 
578373f0\n", - " 2023-07-24_18-14-24\n", - " 62.388181\n", - " 283323\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 62.388181\n", + " 55490282\n", + " 2023-09-14_15-06-31\n", + " 87.446226\n", + " 51460\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 87.446226\n", " 8\n", " \n", " \n", " 8\n", " 8\n", " 0.0001\n", - " 0.049509\n", - " 0.235428\n", - " 1690247668\n", - " 6.772196\n", + " 0.056161\n", + " 0.152457\n", + " 1694729195\n", + " 10.094931\n", " True\n", " False\n", " 9\n", - " 578373f0\n", - " 2023-07-24_18-14-31\n", - " 69.160377\n", - " 283323\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 69.160377\n", + " 55490282\n", + " 2023-09-14_15-06-41\n", + " 97.541157\n", + " 51460\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 97.541157\n", " 9\n", " \n", " \n", " 9\n", " 9\n", " 0.0001\n", - " 0.038692\n", - " 0.226971\n", - " 1690247675\n", - " 6.857913\n", + " 0.039336\n", + " 0.160408\n", + " 1694729205\n", + " 10.023823\n", " True\n", " True\n", " 10\n", - " 578373f0\n", - " 2023-07-24_18-14-38\n", - " 76.018291\n", - " 283323\n", - " ip-10-0-56-150\n", - " 10.0.56.150\n", - " 76.018291\n", + " 55490282\n", + " 2023-09-14_15-06-51\n", + " 107.564980\n", + " 51460\n", + " ip-10-0-50-151\n", + " 10.0.50.151\n", + " 107.564980\n", " 10\n", " \n", " \n", @@ -8065,55 +8033,55 @@ ], "text/plain": [ " epoch lr train_loss val_loss timestamp time_this_iter_s \n", - "0 0 0.0001 0.581037 0.501312 1690247613 14.747473 \\\n", - "1 1 0.0001 0.486921 0.432752 1690247620 7.031596 \n", - "2 2 0.0001 0.383275 0.357377 1690247627 6.705714 \n", - "3 3 0.0001 0.276686 0.300713 1690247634 6.760320 \n", - "4 4 0.0001 0.208469 0.281393 1690247641 6.757150 \n", - "5 5 0.0001 0.146135 0.260962 1690247647 6.796252 \n", - "6 6 0.0001 0.099608 0.260589 1690247654 6.804206 \n", - "7 7 0.0001 0.072397 0.253754 1690247661 6.785470 \n", - "8 8 0.0001 0.049509 0.235428 1690247668 6.772196 \n", - "9 9 0.0001 0.038692 0.226971 1690247675 6.857913 \n", + "0 0 0.0001 0.577241 
0.493672 1694729113 16.759313 \\\n", + "1 1 0.0001 0.476221 0.417544 1694729124 10.431773 \n", + "2 2 0.0001 0.368710 0.373515 1694729135 10.071463 \n", + "3 3 0.0001 0.292983 0.275497 1694729145 10.122023 \n", + "4 4 0.0001 0.210210 0.239090 1694729155 9.991925 \n", + "5 5 0.0001 0.161824 0.197741 1694729165 10.084616 \n", + "6 6 0.0001 0.113558 0.159887 1694729175 9.953290 \n", + "7 7 0.0001 0.076707 0.174182 1694729185 10.031824 \n", + "8 8 0.0001 0.056161 0.152457 1694729195 10.094931 \n", + "9 9 0.0001 0.039336 0.160408 1694729205 10.023823 \n", "\n", " should_checkpoint done training_iteration trial_id \n", - "0 True False 1 578373f0 \\\n", - "1 True False 2 578373f0 \n", - "2 True False 3 578373f0 \n", - "3 True False 4 578373f0 \n", - "4 True False 5 578373f0 \n", - "5 True False 6 578373f0 \n", - "6 True False 7 578373f0 \n", - "7 True False 8 578373f0 \n", - "8 True False 9 578373f0 \n", - "9 True True 10 578373f0 \n", - "\n", - " date time_total_s pid hostname node_ip \n", - "0 2023-07-24_18-13-36 14.747473 283323 ip-10-0-56-150 10.0.56.150 \\\n", - "1 2023-07-24_18-13-44 21.779069 283323 ip-10-0-56-150 10.0.56.150 \n", - "2 2023-07-24_18-13-50 28.484783 283323 ip-10-0-56-150 10.0.56.150 \n", - "3 2023-07-24_18-13-57 35.245103 283323 ip-10-0-56-150 10.0.56.150 \n", - "4 2023-07-24_18-14-04 42.002253 283323 ip-10-0-56-150 10.0.56.150 \n", - "5 2023-07-24_18-14-11 48.798505 283323 ip-10-0-56-150 10.0.56.150 \n", - "6 2023-07-24_18-14-17 55.602711 283323 ip-10-0-56-150 10.0.56.150 \n", - "7 2023-07-24_18-14-24 62.388181 283323 ip-10-0-56-150 10.0.56.150 \n", - "8 2023-07-24_18-14-31 69.160377 283323 ip-10-0-56-150 10.0.56.150 \n", - "9 2023-07-24_18-14-38 76.018291 283323 ip-10-0-56-150 10.0.56.150 \n", + "0 True False 1 55490282 \\\n", + "1 True False 2 55490282 \n", + "2 True False 3 55490282 \n", + "3 True False 4 55490282 \n", + "4 True False 5 55490282 \n", + "5 True False 6 55490282 \n", + "6 True False 7 55490282 \n", + "7 True False 8 55490282 \n", 
+ "8 True False 9 55490282 \n", + "9 True True 10 55490282 \n", + "\n", + " date time_total_s pid hostname node_ip \n", + "0 2023-09-14_15-05-20 16.759313 51460 ip-10-0-50-151 10.0.50.151 \\\n", + "1 2023-09-14_15-05-31 27.191086 51460 ip-10-0-50-151 10.0.50.151 \n", + "2 2023-09-14_15-05-41 37.262549 51460 ip-10-0-50-151 10.0.50.151 \n", + "3 2023-09-14_15-05-51 47.384572 51460 ip-10-0-50-151 10.0.50.151 \n", + "4 2023-09-14_15-06-01 57.376497 51460 ip-10-0-50-151 10.0.50.151 \n", + "5 2023-09-14_15-06-11 67.461113 51460 ip-10-0-50-151 10.0.50.151 \n", + "6 2023-09-14_15-06-21 77.414402 51460 ip-10-0-50-151 10.0.50.151 \n", + "7 2023-09-14_15-06-31 87.446226 51460 ip-10-0-50-151 10.0.50.151 \n", + "8 2023-09-14_15-06-41 97.541157 51460 ip-10-0-50-151 10.0.50.151 \n", + "9 2023-09-14_15-06-51 107.564980 51460 ip-10-0-50-151 10.0.50.151 \n", "\n", " time_since_restore iterations_since_restore \n", - "0 14.747473 1 \n", - "1 21.779069 2 \n", - "2 28.484783 3 \n", - "3 35.245103 4 \n", - "4 42.002253 5 \n", - "5 48.798505 6 \n", - "6 55.602711 7 \n", - "7 62.388181 8 \n", - "8 69.160377 9 \n", - "9 76.018291 10 " + "0 16.759313 1 \n", + "1 27.191086 2 \n", + "2 37.262549 3 \n", + "3 47.384572 4 \n", + "4 57.376497 5 \n", + "5 67.461113 6 \n", + "6 77.414402 7 \n", + "7 87.446226 8 \n", + "8 97.541157 9 \n", + "9 107.564980 10 " ] }, - "execution_count": 107, + "execution_count": 108, "metadata": {}, "output_type": "execute_result" } @@ -8126,7 +8094,7 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 109, "metadata": { "tags": [] }, @@ -8137,7 +8105,7 @@ "{'dropout_p': 0.5, 'lr': 0.0001, 'lr_factor': 0.8, 'lr_patience': 3.0}" ] }, - "execution_count": 108, + "execution_count": 109, "metadata": {}, "output_type": "execute_result" } @@ -8149,7 +8117,7 @@ }, { "cell_type": "code", - "execution_count": 109, + "execution_count": 110, "metadata": { "tags": [] }, @@ -8181,19 +8149,19 @@ " artifact_uri\n", " start_time\n", " end_time\n", - " 
metrics.time_since_restore\n", - " metrics.time_this_iter_s\n", - " metrics.config/train_loop_config/lr\n", - " metrics.done\n", + " metrics.pid\n", + " metrics.epoch\n", + " metrics.config/train_loop_config/batch_size\n", + " metrics.iterations_since_restore\n", " ...\n", - " metrics.val_loss\n", - " params.train_loop_config/num_classes\n", + " metrics.lr\n", " params.train_loop_config/dropout_p\n", - " params.train_loop_config/num_epochs\n", + " params.train_loop_config/batch_size\n", + " params.train_loop_config/lr_factor\n", + " params.train_loop_config/num_classes\n", " params.train_loop_config/lr_patience\n", + " params.train_loop_config/num_epochs\n", " params.train_loop_config/lr\n", - " params.train_loop_config/lr_factor\n", - " params.train_loop_config/batch_size\n", " tags.trial_name\n", " tags.mlflow.runName\n", " \n", @@ -8201,136 +8169,136 @@ " \n", " \n", " 0\n", - " 8991516da6674b5395affb9fb6217964\n", - " 598544898920467811\n", + " cb6a74df7c4e43988598be5aee45e660\n", + " 917462352875586010\n", " FINISHED\n", - " file:///tmp/mlflow/598544898920467811/8991516d...\n", - " 2023-07-25 01:11:48.948000+00:00\n", - " 2023-07-25 01:13:07.513000+00:00\n", - " 75.376313\n", - " 6.712862\n", - " 0.000100\n", - " 0.0\n", + " file:///efs/shared_storage/madewithml/GokuMoha...\n", + " 2023-09-14 22:02:50.670000+00:00\n", + " 2023-09-14 22:04:48.292000+00:00\n", + " 50696.0\n", + " 9.0\n", + " 256.0\n", + " 10.0\n", " ...\n", - " 0.221598\n", - " 4\n", + " 0.000100\n", " 0.5\n", - " 10\n", + " 256\n", + " 0.8\n", + " 4\n", " 3\n", + " 10\n", " 0.0001\n", - " 0.8\n", - " 256\n", - " TorchTrainer_38d2b_00000\n", - " TorchTrainer_38d2b_00000\n", + " TorchTrainer_701e4_00000\n", + " TorchTrainer_701e4_00000\n", " \n", " \n", " 1\n", - " 6b854ffda94844e28013fc6deb157af4\n", - " 598544898920467811\n", + " 97a3692584eb4a51999f7e97176e3297\n", + " 917462352875586010\n", " FINISHED\n", - " file:///tmp/mlflow/598544898920467811/6b854ffd...\n", - " 2023-07-25 
01:13:22.223000+00:00\n", - " 2023-07-25 01:14:38.804000+00:00\n", - " 76.018291\n", - " 6.857913\n", - " 0.000100\n", - " 1.0\n", + " file:///efs/shared_storage/madewithml/GokuMoha...\n", + " 2023-09-14 22:05:03.880000+00:00\n", + " 2023-09-14 22:07:09.776000+00:00\n", + " 51460.0\n", + " 9.0\n", + " NaN\n", + " 10.0\n", " ...\n", - " 0.226971\n", - " None\n", + " 0.000100\n", " 0.5\n", " None\n", - " 3.0\n", - " 0.0001\n", " 0.8\n", " None\n", - " TorchTrainer_578373f0\n", - " TorchTrainer_578373f0\n", + " 3.0\n", + " None\n", + " 0.0001\n", + " TorchTrainer_55490282\n", + " TorchTrainer_55490282\n", " \n", " \n", " 2\n", - " 85444db0a2794c43a14accae3054d0e2\n", - " 598544898920467811\n", + " 57c609dd76c9477896863f04f2a46bb5\n", + " 917462352875586010\n", " FINISHED\n", - " file:///tmp/mlflow/598544898920467811/85444db0...\n", - " 2023-07-25 01:14:42.883000+00:00\n", - " 2023-07-25 01:15:25.678000+00:00\n", - " 42.243630\n", - " 6.744698\n", - " 0.000026\n", - " 1.0\n", + " file:///efs/shared_storage/madewithml/GokuMoha...\n", + " 2023-09-14 22:07:13.514000+00:00\n", + " 2023-09-14 22:08:20.525000+00:00\n", + " 52063.0\n", + " 4.0\n", + " NaN\n", + " 5.0\n", " ...\n", - " 0.389877\n", + " 0.000027\n", + " 0.7976231355896662\n", " None\n", - " 0.3569270357483055\n", + " 0.267142900040756\n", " None\n", - " 2.507894040733857\n", - " 2.6342924831000413e-05\n", - " 0.14872459915992878\n", + " 5.214561723373508\n", " None\n", - " TorchTrainer_50d8c90f\n", - " TorchTrainer_50d8c90f\n", + " 2.6942959077294993e-05\n", + " TorchTrainer_45b199df\n", + " TorchTrainer_45b199df\n", " \n", " \n", "\n", - "

3 rows × 35 columns

\n", + "

3 rows × 36 columns

\n", "" ], "text/plain": [ " run_id experiment_id status \n", - "0 8991516da6674b5395affb9fb6217964 598544898920467811 FINISHED \\\n", - "1 6b854ffda94844e28013fc6deb157af4 598544898920467811 FINISHED \n", - "2 85444db0a2794c43a14accae3054d0e2 598544898920467811 FINISHED \n", + "0 cb6a74df7c4e43988598be5aee45e660 917462352875586010 FINISHED \\\n", + "1 97a3692584eb4a51999f7e97176e3297 917462352875586010 FINISHED \n", + "2 57c609dd76c9477896863f04f2a46bb5 917462352875586010 FINISHED \n", "\n", " artifact_uri \n", - "0 file:///tmp/mlflow/598544898920467811/8991516d... \\\n", - "1 file:///tmp/mlflow/598544898920467811/6b854ffd... \n", - "2 file:///tmp/mlflow/598544898920467811/85444db0... \n", + "0 file:///efs/shared_storage/madewithml/GokuMoha... \\\n", + "1 file:///efs/shared_storage/madewithml/GokuMoha... \n", + "2 file:///efs/shared_storage/madewithml/GokuMoha... \n", "\n", " start_time end_time \n", - "0 2023-07-25 01:11:48.948000+00:00 2023-07-25 01:13:07.513000+00:00 \\\n", - "1 2023-07-25 01:13:22.223000+00:00 2023-07-25 01:14:38.804000+00:00 \n", - "2 2023-07-25 01:14:42.883000+00:00 2023-07-25 01:15:25.678000+00:00 \n", - "\n", - " metrics.time_since_restore metrics.time_this_iter_s \n", - "0 75.376313 6.712862 \\\n", - "1 76.018291 6.857913 \n", - "2 42.243630 6.744698 \n", - "\n", - " metrics.config/train_loop_config/lr metrics.done ... metrics.val_loss \n", - "0 0.000100 0.0 ... 0.221598 \\\n", - "1 0.000100 1.0 ... 0.226971 \n", - "2 0.000026 1.0 ... 
0.389877 \n", - "\n", - " params.train_loop_config/num_classes params.train_loop_config/dropout_p \n", - "0 4 0.5 \\\n", - "1 None 0.5 \n", - "2 None 0.3569270357483055 \n", - "\n", - " params.train_loop_config/num_epochs params.train_loop_config/lr_patience \n", - "0 10 3 \\\n", - "1 None 3.0 \n", - "2 None 2.507894040733857 \n", - "\n", - " params.train_loop_config/lr params.train_loop_config/lr_factor \n", - "0 0.0001 0.8 \\\n", - "1 0.0001 0.8 \n", - "2 2.6342924831000413e-05 0.14872459915992878 \n", - "\n", - " params.train_loop_config/batch_size tags.trial_name \n", - "0 256 TorchTrainer_38d2b_00000 \\\n", - "1 None TorchTrainer_578373f0 \n", - "2 None TorchTrainer_50d8c90f \n", + "0 2023-09-14 22:02:50.670000+00:00 2023-09-14 22:04:48.292000+00:00 \\\n", + "1 2023-09-14 22:05:03.880000+00:00 2023-09-14 22:07:09.776000+00:00 \n", + "2 2023-09-14 22:07:13.514000+00:00 2023-09-14 22:08:20.525000+00:00 \n", + "\n", + " metrics.pid metrics.epoch metrics.config/train_loop_config/batch_size \n", + "0 50696.0 9.0 256.0 \\\n", + "1 51460.0 9.0 NaN \n", + "2 52063.0 4.0 NaN \n", + "\n", + " metrics.iterations_since_restore ... metrics.lr \n", + "0 10.0 ... 0.000100 \\\n", + "1 10.0 ... 0.000100 \n", + "2 5.0 ... 
0.000027 \n", + "\n", + " params.train_loop_config/dropout_p params.train_loop_config/batch_size \n", + "0 0.5 256 \\\n", + "1 0.5 None \n", + "2 0.7976231355896662 None \n", + "\n", + " params.train_loop_config/lr_factor params.train_loop_config/num_classes \n", + "0 0.8 4 \\\n", + "1 0.8 None \n", + "2 0.267142900040756 None \n", + "\n", + " params.train_loop_config/lr_patience params.train_loop_config/num_epochs \n", + "0 3 10 \\\n", + "1 3.0 None \n", + "2 5.214561723373508 None \n", + "\n", + " params.train_loop_config/lr tags.trial_name \n", + "0 0.0001 TorchTrainer_701e4_00000 \\\n", + "1 0.0001 TorchTrainer_55490282 \n", + "2 2.6942959077294993e-05 TorchTrainer_45b199df \n", "\n", " tags.mlflow.runName \n", - "0 TorchTrainer_38d2b_00000 \n", - "1 TorchTrainer_578373f0 \n", - "2 TorchTrainer_50d8c90f \n", + "0 TorchTrainer_701e4_00000 \n", + "1 TorchTrainer_55490282 \n", + "2 TorchTrainer_45b199df \n", "\n", - "[3 rows x 35 columns]" + "[3 rows x 36 columns]" ] }, - "execution_count": 109, + "execution_count": 110, "metadata": {}, "output_type": "execute_result" } @@ -8343,7 +8311,7 @@ }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 111, "metadata": { "tags": [] }, @@ -8352,9 +8320,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-07-24 18:15:26,376\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(24)] -> TaskPoolMapOperator[MapBatches(CustomPreprocessor._transform_pandas)->MapBatches()]\n", - "2023-07-24 18:15:26,377\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-07-24 18:15:26,377\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + 
"2023-09-14 15:08:21,709\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(48)] -> TaskPoolMapOperator[MapBatches(CustomPreprocessor._transform_pandas)->MapBatches()]\n", + "2023-09-14 15:08:21,710\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-09-14 15:08:21,711\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, { @@ -8365,7 +8333,7 @@ "version_minor": 0 }, "text/plain": [ - "Running 0: 0%| | 0/24 [00:00 TaskPoolMapOperator[ReadCSV->SplitBlocks(24)] -> TaskPoolMapOperator[MapBatches(CustomPreprocessor._transform_pandas)->MapBatches()]\n", - "2023-07-24 18:15:30,005\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-07-24 18:15:30,006\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + "2023-09-14 15:08:25,603\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(48)] -> TaskPoolMapOperator[MapBatches(CustomPreprocessor._transform_pandas)->MapBatches()]\n", + "2023-09-14 15:08:25,603\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-09-14 15:08:25,604\tINFO streaming_executor.py:95 -- Tip: 
For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" ] }, { @@ -8512,7 +8478,7 @@ "version_minor": 0 }, "text/plain": [ - "Running 0: 0%| | 0/24 [00:0084\n", " 2020-03-20 15:18:43\n", " Mention Classifier\n", - " Category prediction model\\r\\nThis repo contain...\n", + " Category prediction model\\nThis repo contains ...\n", " natural-language-processing\n", - " Mention Classifier Category prediction model\\r...\n", + " Mention Classifier Category prediction model\\n...\n", " natural-language-processing\n", " \n", " \n", @@ -8698,7 +8664,7 @@ "1 A PyTorch implementation of \"Graph Wavelet Neu... \n", "2 A PyTorch implementation of \"Capsule Graph Neu... \n", "3 Representing scenes as neural radiance fields ... \n", - "4 Category prediction model\\r\\nThis repo contain... \n", + "4 Category prediction model\\nThis repo contains ... \n", "\n", " tag \n", "0 other \\\n", @@ -8712,7 +8678,7 @@ "1 Graph Wavelet Neural Network A PyTorch impleme... \n", "2 Capsule Graph Neural Network A PyTorch impleme... \n", "3 NeRF: Neural Radiance Fields Representing scen... \n", - "4 Mention Classifier Category prediction model\\r... \n", + "4 Mention Classifier Category prediction model\\n... 
\n", "\n", " prediction \n", "0 computer-vision \n", @@ -8722,7 +8688,7 @@ "4 natural-language-processing " ] }, - "execution_count": 117, + "execution_count": 118, "metadata": {}, "output_type": "execute_result" } @@ -8736,7 +8702,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "TiXcls5JoNA8" @@ -8747,7 +8712,7 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 119, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -8762,9 +8727,9 @@ "output_type": "stream", "text": [ "{\n", - " \"precision\": 0.9281010510531216,\n", - " \"recall\": 0.9267015706806283,\n", - " \"f1\": 0.9269438615952555,\n", + " \"precision\": 0.9206853613661888,\n", + " \"recall\": 0.9162303664921466,\n", + " \"f1\": 0.9167279878033617,\n", " \"num_samples\": 191.0\n", "}\n" ] @@ -8781,7 +8746,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "zl3xSuXRutKG" @@ -8792,7 +8756,7 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 120, "metadata": { "id": "jqetm3ybN9C1", "tags": [] @@ -8804,7 +8768,7 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 121, "metadata": { "id": "1zIAI4mwusoX", "tags": [] @@ -8824,7 +8788,7 @@ }, { "cell_type": "code", - "execution_count": 121, + "execution_count": 122, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -8839,9 +8803,9 @@ "output_type": "stream", "text": [ "{\n", - " \"precision\": 0.95,\n", + " \"precision\": 0.9156626506024096,\n", " \"recall\": 0.9743589743589743,\n", - " \"f1\": 0.9620253164556962,\n", + " \"f1\": 0.9440993788819876,\n", " \"num_samples\": 78.0\n", "}\n" ] @@ -8855,7 +8819,7 @@ }, { "cell_type": "code", - "execution_count": 122, + "execution_count": 123, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -8872,36 +8836,36 @@ "[\n", " \"natural-language-processing\",\n", " {\n", - " \"precision\": 0.95,\n", + " \"precision\": 0.9156626506024096,\n", " 
\"recall\": 0.9743589743589743,\n", - " \"f1\": 0.9620253164556962,\n", + " \"f1\": 0.9440993788819876,\n", " \"num_samples\": 78.0\n", " }\n", "]\n", "[\n", - " \"computer-vision\",\n", + " \"other\",\n", " {\n", - " \"precision\": 0.9552238805970149,\n", - " \"recall\": 0.9014084507042254,\n", - " \"f1\": 0.927536231884058,\n", - " \"num_samples\": 71.0\n", + " \"precision\": 0.9583333333333334,\n", + " \"recall\": 0.8846153846153846,\n", + " \"f1\": 0.9199999999999999,\n", + " \"num_samples\": 26.0\n", " }\n", "]\n", "[\n", - " \"other\",\n", + " \"computer-vision\",\n", " {\n", - " \"precision\": 0.8888888888888888,\n", - " \"recall\": 0.9230769230769231,\n", - " \"f1\": 0.9056603773584906,\n", - " \"num_samples\": 26.0\n", + " \"precision\": 0.9538461538461539,\n", + " \"recall\": 0.8732394366197183,\n", + " \"f1\": 0.9117647058823529,\n", + " \"num_samples\": 71.0\n", " }\n", "]\n", "[\n", " \"mlops\",\n", " {\n", - " \"precision\": 0.7647058823529411,\n", - " \"recall\": 0.8125,\n", - " \"f1\": 0.787878787878788,\n", + " \"precision\": 0.7368421052631579,\n", + " \"recall\": 0.875,\n", + " \"f1\": 0.7999999999999999,\n", " \"num_samples\": 16.0\n", " }\n", "]\n" @@ -8917,7 +8881,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "f-juex26zvBF" @@ -8927,7 +8890,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "xPUao0S4k99c" @@ -8942,7 +8904,7 @@ }, { "cell_type": "code", - "execution_count": 123, + "execution_count": 124, "metadata": { "id": "ZG2SgsPAzukL", "tags": [] @@ -8965,7 +8927,7 @@ }, { "cell_type": "code", - "execution_count": 124, + "execution_count": 125, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -8979,9 +8941,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "[4, 9, 12, 16, 17, 19, 23, 26, 29, 30, 31, 32, 33, 34, 42, 47, 49, 50, 54, 56, 65, 66, 68, 71, 75, 76, 77, 78, 79, 82, 92, 94, 95, 97, 99, 101, 109, 113, 114, 115, 118, 120, 122, 126, 128, 129, 130, 131, 
133, 134, 135, 138, 139, 140, 141, 142, 144, 148, 149, 152, 159, 160, 161, 163, 166, 170, 172, 173, 174, 177, 179, 183, 184, 187, 189, 190]\n", - "[24, 104, 165, 188]\n", - "[25, 112]\n" + "[4, 9, 12, 16, 17, 19, 23, 25, 26, 29, 30, 31, 32, 33, 34, 42, 47, 49, 50, 54, 56, 65, 66, 68, 71, 75, 77, 78, 79, 82, 92, 94, 95, 97, 99, 101, 109, 113, 114, 115, 118, 120, 122, 126, 128, 129, 130, 131, 133, 134, 135, 138, 139, 140, 141, 142, 144, 148, 149, 152, 159, 160, 161, 163, 166, 170, 172, 173, 174, 177, 179, 183, 184, 187, 189, 190]\n", + "[41, 55, 61, 104, 107, 154, 165]\n", + "[76, 112]\n" ] } ], @@ -8993,7 +8955,7 @@ }, { "cell_type": "code", - "execution_count": 125, + "execution_count": 126, "metadata": {}, "outputs": [ { @@ -9017,27 +8979,27 @@ "\n", "\n", "=== False positives ===\n", - "Keras OCR A packaged and flexible version of the CRAFT text detector and Keras CRNN recognition model. \n", - " true: computer-vision\n", + "How Docker Can Help You Become A More Effective Data Scientist A look at Docker from the perspective of a data scientist.\n", + " true: mlops\n", " pred: natural-language-processing\n", "\n", - "Open Compound Domain Adaptation Pytorch implementation for \"Open Compound Domain Adaptation\"\n", - " true: computer-vision\n", + "The AI Economist Improving Equality and Productivity with AI-Driven Tax Policies\n", + " true: other\n", " pred: natural-language-processing\n", "\n", - "Unpopular Opinion - Data Scientists Should Be More End-to-End I believe data scientists can be more effective by being end-to-end.\n", - " true: mlops\n", + "Differential Subspace Search in High-Dimensional Latent Space Differential subspace search to allow efficient iterative user exploration in such a space, without relying on domain- or data-specific assumptions.\n", + " true: computer-vision\n", " pred: natural-language-processing\n", "\n", "\n", "=== False negatives ===\n", - "Visualizing Memorization in RNNs Inspecting gradient magnitudes in context can be a 
powerful tool to see when recurrent units use short-term or long-term contextual understanding.\n", + "Get Subreddit Suggestions for a Post Trained on 4M Reddit posts from 4k Subreddits. End-to-end ML pipeline built with fasttext and FastAPI, deployed to Valohai.\n", " true: natural-language-processing\n", " pred: computer-vision\n", "\n", "Machine Learning Projects This Repo contains projects done by me while learning the basics. All the familiar types of regression, classification, and clustering methods have been used.\n", " true: natural-language-processing\n", - " pred: other\n", + " pred: mlops\n", "\n" ] } @@ -9056,7 +9018,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "6S5LZdP2Myjh" @@ -9066,7 +9027,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "ZW5nY_h-M08p" @@ -9089,7 +9049,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "OuN8xKFZlo2t" @@ -9100,7 +9059,7 @@ }, { "cell_type": "code", - "execution_count": 126, + "execution_count": 127, "metadata": { "id": "3FCrRUb2GANr", "tags": [] @@ -9115,7 +9074,7 @@ }, { "cell_type": "code", - "execution_count": 127, + "execution_count": 128, "metadata": { "id": "sKQxFU0iU-w-", "tags": [] @@ -9137,7 +9096,7 @@ }, { "cell_type": "code", - "execution_count": 128, + "execution_count": 129, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -9150,17 +9109,17 @@ { "data": { "text/plain": [ - "[{'text': 'Visualizing Memorization in RNNs Inspecting gradient magnitudes in context can be a powerful tool to see when recurrent units use short-term or long-term contextual understanding.',\n", + "[{'text': 'Get Subreddit Suggestions for a Post Trained on 4M Reddit posts from 4k Subreddits. 
End-to-end ML pipeline built with fasttext and FastAPI, deployed to Valohai.',\n", " 'true': 'natural-language-processing',\n", " 'pred': 'computer-vision',\n", - " 'prob': 0.0070185387},\n", + " 'prob': 0.00060260075},\n", " {'text': 'Machine Learning Projects This Repo contains projects done by me while learning the basics. All the familiar types of regression, classification, and clustering methods have been used.',\n", " 'true': 'natural-language-processing',\n", - " 'pred': 'other',\n", - " 'prob': 0.006060462}]" + " 'pred': 'mlops',\n", + " 'prob': 0.0076674907}]" ] }, - "execution_count": 128, + "execution_count": 129, "metadata": {}, "output_type": "execute_result" } @@ -9170,7 +9129,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "JwL1ltdiUjH2" @@ -9194,7 +9152,7 @@ }, { "cell_type": "code", - "execution_count": 129, + "execution_count": 130, "metadata": { "id": "XX3cORGPPXXM", "tags": [] @@ -9207,7 +9165,7 @@ }, { "cell_type": "code", - "execution_count": 130, + "execution_count": 131, "metadata": { "tags": [] }, @@ -9216,38 +9174,38 @@ "name": "stderr", "output_type": "stream", "text": [ - "*** SIGTERM received at time=1690247734 on cpu 6 ***\n", - "*** SIGTERM received at time=1690247734 on cpu 1 ***\n", - "*** SIGTERM received at time=1690247734 on cpu 0 ***\n", - "*** SIGTERM received at time=1690247734 on cpu 4 ***\n", - "PC: @ 0x7fb69c9c011b (unknown) _Exit\n", - " @ 0x7fb69cc3d420 28547728 (unknown)\n", - "PC: @ 0x4edd8a (unknown) _PyEval_MakeFrameVector\n", - " @ 0x7fb69cc3d420 (unknown) (unknown)\n", - "PC: @ 0x4ef4bc (unknown) _PyEval_EvalFrameDefault\n", - "PC: @ 0x4eddcd (unknown) _PyEval_MakeFrameVector\n", - " @ 0x7260e0 (unknown) (unknown)\n", - " @ 0x7fb69cc3d420 (unknown) (unknown)\n", - " @ 0x7fb69cc3d420 (unknown) (unknown)\n", + "*** SIGTERM received at time=1694729309 on cpu 5 ***\n", + "*** SIGTERM received at time=1694729309 on cpu 4 ***\n", + "*** SIGTERM received at time=1694729309 on cpu 0 ***\n", + 
"*** SIGTERM received at time=1694729309 on cpu 2 ***\n", + "PC: @ 0x4fefa3 (unknown) _PyObject_GetMethod\n", + "PC: @ 0x4fce88 (unknown) _PyObject_GenericGetAttrWithDict\n", + " @ 0x7fe11d01f420 27754944 (unknown)\n", + " @ 0x7fe11d01f420 26229312 (unknown)\n", + "PC: @ 0x4f369b (unknown) _PyEval_EvalFrameDefault\n", + "PC: @ 0x4ff156 (unknown) _PyObject_GetMethod\n", + " @ 0x7fe11d01f420 (unknown) (unknown)\n", + " @ 0x7fe11d01f420 27580544 (unknown)\n", " @ ... and at least 1 more frames\n", - "[2023-07-24 18:15:34,914 E 284736 280376] logging.cc:361: *** SIGTERM received at time=1690247734 on cpu 6 ***\n", - "[2023-07-24 18:15:34,915 E 284736 280376] logging.cc:361: PC: @ 0x7fb69c9c011b (unknown) _Exit\n", - "[2023-07-24 18:15:34,915 E 284738 280376] logging.cc:361: *** SIGTERM received at time=1690247734 on cpu 4 ***\n", - "[2023-07-24 18:15:34,915 E 284738 280376] logging.cc:361: PC: @ 0x4ef4bc (unknown) _PyEval_EvalFrameDefault\n", - "[2023-07-24 18:15:34,915 E 284738 280376] logging.cc:361: @ 0x7fb69cc3d420 (unknown) (unknown)\n", - "[2023-07-24 18:15:34,915 E 284738 280376] logging.cc:361: @ ... 
and at least 1 more frames\n", - " @ 0x72c720 (unknown) (unknown)\n", - "[2023-07-24 18:15:34,917 E 284737 280376] logging.cc:361: *** SIGTERM received at time=1690247734 on cpu 1 ***\n", - "[2023-07-24 18:15:34,917 E 284737 280376] logging.cc:361: PC: @ 0x4edd8a (unknown) _PyEval_MakeFrameVector\n", - "[2023-07-24 18:15:34,918 E 284736 280376] logging.cc:361: @ 0x7fb69cc3d420 28547728 (unknown)\n", - " @ 0x72c720 (unknown) (unknown)\n", - "[2023-07-24 18:15:34,920 E 284737 280376] logging.cc:361: @ 0x7fb69cc3d420 (unknown) (unknown)\n", - "[2023-07-24 18:15:34,920 E 284735 280376] logging.cc:361: *** SIGTERM received at time=1690247734 on cpu 0 ***\n", - "[2023-07-24 18:15:34,921 E 284735 280376] logging.cc:361: PC: @ 0x4eddcd (unknown) _PyEval_MakeFrameVector\n", - "[2023-07-24 18:15:34,922 E 284736 280376] logging.cc:361: @ 0x7260e0 (unknown) (unknown)\n", - "[2023-07-24 18:15:34,924 E 284737 280376] logging.cc:361: @ 0x72c720 (unknown) (unknown)\n", - "[2023-07-24 18:15:34,926 E 284735 280376] logging.cc:361: @ 0x7fb69cc3d420 (unknown) (unknown)\n", - "[2023-07-24 18:15:34,932 E 284735 280376] logging.cc:361: @ 0x72c720 (unknown) (unknown)\n" + "[2023-09-14 15:08:29,998 E 123487 117917] logging.cc:361: *** SIGTERM received at time=1694729309 on cpu 4 ***\n", + "[2023-09-14 15:08:29,998 E 123487 117917] logging.cc:361: PC: @ 0x4f369b (unknown) _PyEval_EvalFrameDefault\n", + "[2023-09-14 15:08:29,998 E 123487 117917] logging.cc:361: @ 0x7fe11d01f420 (unknown) (unknown)\n", + "[2023-09-14 15:08:29,998 E 123487 117917] logging.cc:361: @ ... 
and at least 1 more frames\n", + " @ 0x742da0 (unknown) (unknown)\n", + " @ 0x742da0 (unknown) (unknown)\n", + "[2023-09-14 15:08:30,000 E 123488 117917] logging.cc:361: *** SIGTERM received at time=1694729309 on cpu 5 ***\n", + "[2023-09-14 15:08:30,000 E 123488 117917] logging.cc:361: PC: @ 0x4fefa3 (unknown) _PyObject_GetMethod\n", + "[2023-09-14 15:08:30,000 E 123489 117917] logging.cc:361: *** SIGTERM received at time=1694729309 on cpu 2 ***\n", + "[2023-09-14 15:08:30,000 E 123489 117917] logging.cc:361: PC: @ 0x4fce88 (unknown) _PyObject_GenericGetAttrWithDict\n", + "[2023-09-14 15:08:30,003 E 123488 117917] logging.cc:361: @ 0x7fe11d01f420 27754944 (unknown)\n", + "[2023-09-14 15:08:30,003 E 123489 117917] logging.cc:361: @ 0x7fe11d01f420 26229312 (unknown)\n", + " @ 0x742da0 (unknown) (unknown)\n", + "[2023-09-14 15:08:30,003 E 123486 117917] logging.cc:361: *** SIGTERM received at time=1694729309 on cpu 0 ***\n", + "[2023-09-14 15:08:30,003 E 123486 117917] logging.cc:361: PC: @ 0x4ff156 (unknown) _PyObject_GetMethod\n", + "[2023-09-14 15:08:30,007 E 123488 117917] logging.cc:361: @ 0x742da0 (unknown) (unknown)\n", + "[2023-09-14 15:08:30,007 E 123489 117917] logging.cc:361: @ 0x742da0 (unknown) (unknown)\n", + "[2023-09-14 15:08:30,009 E 123486 117917] logging.cc:361: @ 0x7fe11d01f420 27580544 (unknown)\n", + "[2023-09-14 15:08:30,015 E 123486 117917] logging.cc:361: @ 0x742da0 (unknown) (unknown)\n" ] }, { @@ -9281,15 +9239,6 @@ " \n", " \n", " \n", - " 165\n", - " 2137\n", - " 2020-08-13 02:10:03\n", - " Unpopular Opinion - Data Scientists Should Be ...\n", - " I believe data scientists can be more effectiv...\n", - " mlops\n", - " natural-language-processing\n", - " \n", - " \n", " 103\n", " 1459\n", " 2020-06-16 03:06:10\n", @@ -9299,31 +9248,40 @@ " computer-vision\n", " \n", " \n", - " 188\n", - " 2413\n", - " 2020-10-01 23:50:04\n", - " Keeping Data Pipelines healthy w/ Great Expect...\n", - " We show you how you can use GitHub Actions tog...\n", 
+ " 165\n", + " 2137\n", + " 2020-08-13 02:10:03\n", + " Unpopular Opinion - Data Scientists Should Be ...\n", + " I believe data scientists can be more effectiv...\n", " mlops\n", " natural-language-processing\n", " \n", " \n", - " 112\n", - " 1524\n", - " 2020-06-20 10:42:25\n", - " Machine Learning Projects\n", - " This Repo contains projects done by me while l...\n", + " 76\n", + " 916\n", + " 2020-05-19 08:11:05\n", + " Get Subreddit Suggestions for a Post\n", + " Trained on 4M Reddit posts from 4k Subreddits....\n", " natural-language-processing\n", - " other\n", + " computer-vision\n", " \n", " \n", - " 25\n", - " 384\n", - " 2020-04-08 21:22:25\n", - " Visualizing Memorization in RNNs\n", - " Inspecting gradient magnitudes in context can ...\n", + " 41\n", + " 561\n", + " 2020-04-16 16:27:31\n", + " How Docker Can Help You Become A More Effectiv...\n", + " A look at Docker from the perspective of a dat...\n", + " mlops\n", " natural-language-processing\n", + " \n", + " \n", + " 104\n", + " 1462\n", + " 2020-06-16 03:28:40\n", + " Open Compound Domain Adaptation\n", + " Pytorch implementation for \"Open Compound Doma...\n", " computer-vision\n", + " natural-language-processing\n", " \n", " \n", "\n", @@ -9331,35 +9289,35 @@ ], "text/plain": [ " id created_on \n", - "165 2137 2020-08-13 02:10:03 \\\n", - "103 1459 2020-06-16 03:06:10 \n", - "188 2413 2020-10-01 23:50:04 \n", - "112 1524 2020-06-20 10:42:25 \n", - "25 384 2020-04-08 21:22:25 \n", + "103 1459 2020-06-16 03:06:10 \\\n", + "165 2137 2020-08-13 02:10:03 \n", + "76 916 2020-05-19 08:11:05 \n", + "41 561 2020-04-16 16:27:31 \n", + "104 1462 2020-06-16 03:28:40 \n", "\n", " title \n", - "165 Unpopular Opinion - Data Scientists Should Be ... \\\n", - "103 SuperGlue: Learning Feature Matching with Grap... \n", - "188 Keeping Data Pipelines healthy w/ Great Expect... 
\n", - "112 Machine Learning Projects \n", - "25 Visualizing Memorization in RNNs \n", + "103 SuperGlue: Learning Feature Matching with Grap... \\\n", + "165 Unpopular Opinion - Data Scientists Should Be ... \n", + "76 Get Subreddit Suggestions for a Post \n", + "41 How Docker Can Help You Become A More Effectiv... \n", + "104 Open Compound Domain Adaptation \n", "\n", " description \n", - "165 I believe data scientists can be more effectiv... \\\n", - "103 SuperGlue, a neural network that matches two s... \n", - "188 We show you how you can use GitHub Actions tog... \n", - "112 This Repo contains projects done by me while l... \n", - "25 Inspecting gradient magnitudes in context can ... \n", + "103 SuperGlue, a neural network that matches two s... \\\n", + "165 I believe data scientists can be more effectiv... \n", + "76 Trained on 4M Reddit posts from 4k Subreddits.... \n", + "41 A look at Docker from the perspective of a dat... \n", + "104 Pytorch implementation for \"Open Compound Doma... 
\n", "\n", " tag prediction \n", - "165 mlops natural-language-processing \n", "103 other computer-vision \n", - "188 mlops natural-language-processing \n", - "112 natural-language-processing other \n", - "25 natural-language-processing computer-vision " + "165 mlops natural-language-processing \n", + "76 natural-language-processing computer-vision \n", + "41 mlops natural-language-processing \n", + "104 computer-vision natural-language-processing " ] }, - "execution_count": 130, + "execution_count": 131, "metadata": {}, "output_type": "execute_result" } @@ -9371,7 +9329,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "UtXjpKf9FU4C" @@ -9381,7 +9338,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "dvS3UpusXP_R" @@ -9391,7 +9347,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "eeWWMG38Ny4U" @@ -9409,7 +9364,7 @@ }, { "cell_type": "code", - "execution_count": 131, + "execution_count": 132, "metadata": { "id": "ZyueOtQsXdGm", "tags": [] @@ -9423,7 +9378,7 @@ }, { "cell_type": "code", - "execution_count": 132, + "execution_count": 133, "metadata": { "id": "coutP2KtXdLG", "tags": [] @@ -9441,7 +9396,7 @@ }, { "cell_type": "code", - "execution_count": 133, + "execution_count": 134, "metadata": { "id": "PbxmLvi-D7lq", "tags": [] @@ -9455,7 +9410,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "2Vxg5X9OD-Ax" @@ -9466,7 +9420,7 @@ }, { "cell_type": "code", - "execution_count": 134, + "execution_count": 135, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -9481,7 +9435,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 191/191 [00:00<00:00, 28411.25it/s]\n" + "100%|██████████| 191/191 [00:00<00:00, 27560.88it/s]\n" ] }, { @@ -9555,7 +9509,7 @@ "31 natural-language-processing " ] }, - "execution_count": 134, + "execution_count": 135, "metadata": {}, "output_type": "execute_result" } @@ -9567,7 +9521,7 @@ }, { 
"cell_type": "code", - "execution_count": 135, + "execution_count": 136, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -9582,7 +9536,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 191/191 [00:00<00:00, 52118.41it/s]\n" + "100%|██████████| 191/191 [00:00<00:00, 62729.00it/s]\n" ] }, { @@ -9656,7 +9610,7 @@ "140 natural-language-processing " ] }, - "execution_count": 135, + "execution_count": 136, "metadata": {}, "output_type": "execute_result" } @@ -9667,7 +9621,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "kZuDZwTNO93Q" @@ -9678,7 +9631,7 @@ }, { "cell_type": "code", - "execution_count": 136, + "execution_count": 137, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -9692,7 +9645,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 191/191 [00:00<00:00, 25025.37it/s]\n" + "100%|██████████| 191/191 [00:00<00:00, 26776.93it/s]\n" ] }, { @@ -9725,7 +9678,7 @@ " dtype=[('nlp_llm', '\n", " " @@ -47027,7 +46975,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "5Pkm_pH847x1" @@ -47038,7 +46985,7 @@ }, { "cell_type": "code", - "execution_count": 142, + "execution_count": 143, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -47054,7 +47001,7 @@ "['natural-language-processing', 'natural-language-processing']" ] }, - "execution_count": 142, + "execution_count": 143, "metadata": {}, "output_type": "execute_result" } @@ -47068,7 +47015,7 @@ }, { "cell_type": "code", - "execution_count": 143, + "execution_count": 144, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -47084,7 +47031,7 @@ "['natural-language-processing', 'computer-vision']" ] }, - "execution_count": 143, + "execution_count": 144, "metadata": {}, "output_type": "execute_result" } @@ -47098,7 +47045,7 @@ }, { "cell_type": "code", - "execution_count": 144, + "execution_count": 145, "metadata": { "colab": { "base_uri": 
"https://localhost:8080/" @@ -47114,7 +47061,7 @@ "['natural-language-processing', 'mlops']" ] }, - "execution_count": 144, + "execution_count": 145, "metadata": {}, "output_type": "execute_result" } @@ -47127,7 +47074,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "OkBxFVAA47x2" @@ -47137,7 +47083,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -47145,7 +47090,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "tags": [] @@ -47156,7 +47100,7 @@ }, { "cell_type": "code", - "execution_count": 145, + "execution_count": 146, "metadata": { "tags": [] }, @@ -47169,7 +47113,7 @@ }, { "cell_type": "code", - "execution_count": 146, + "execution_count": 147, "metadata": { "tags": [] }, @@ -47182,7 +47126,7 @@ }, { "cell_type": "code", - "execution_count": 147, + "execution_count": 148, "metadata": { "tags": [] }, @@ -47200,7 +47144,7 @@ }, { "cell_type": "code", - "execution_count": 148, + "execution_count": 149, "metadata": { "tags": [] }, @@ -47217,17 +47161,17 @@ }, { "cell_type": "code", - "execution_count": 149, + "execution_count": 150, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2023-07-24 18:16:03,510\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(24)] -> ActorPoolMapOperator[MapBatches(Predictor)]\n", - "2023-07-24 18:16:03,511\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", - "2023-07-24 18:16:03,512\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", - "2023-07-24 18:16:05,779\tINFO actor_pool_map_operator.py:117 -- MapBatches(Predictor): Waiting for 1 
pool actors to start...\n" + "2023-09-14 15:08:58,658\tINFO streaming_executor.py:92 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadCSV->SplitBlocks(48)] -> ActorPoolMapOperator[MapBatches(Predictor)]\n", + "2023-09-14 15:08:58,659\tINFO streaming_executor.py:93 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=True, actor_locality_enabled=True, verbose_progress=False)\n", + "2023-09-14 15:08:58,660\tINFO streaming_executor.py:95 -- Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n", + "2023-09-14 15:09:01,060\tINFO actor_pool_map_operator.py:117 -- MapBatches(Predictor): Waiting for 1 pool actors to start...\n" ] }, { @@ -47238,7 +47182,7 @@ "version_minor": 0 }, "text/plain": [ - "Running 0: 0%| | 0/24 [00:00