Skip to content

Commit

Permalink
Expand PersHelper to support disk/memory approaches to communicate persistence, add unit tests.
Browse files Browse the repository at this point in the history
  • Loading branch information
user committed Apr 28, 2024
1 parent 1a1c406 commit ad7c34f
Show file tree
Hide file tree
Showing 14 changed files with 1,323 additions and 562 deletions.
136 changes: 0 additions & 136 deletions examples/fairytales_demo.txt

This file was deleted.

362 changes: 183 additions & 179 deletions examples/gpt-demo/01-GPT-Training.ipynb
Original file line number Diff line number Diff line change
@@ -1,180 +1,184 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h1>Character Level GPT on Text Data</h1>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Edit the filename below to train the GPT model on the corpus. Select \"Run\" -> \"Run All\"."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%scorep_env\n",
"SCOREP_ENABLE_TRACING=1\n",
"SCOREP_ENABLE_PROFILING=0\n",
"SCOREP_TOTAL_MEMORY=3g"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#%env SCOREP_KERNEL_PERSISTENCE_MODE MEMORY"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%env"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%switch_serializer\n",
"cloudpickle"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%scorep_python_binding_arguments\n",
"--noinstrumenter"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"filename = \"fairy_test.txt\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%execute_with_scorep\n",
"import scorep\n",
"import logging\n",
"\n",
"logging.basicConfig(\n",
" format=\"%(asctime)s - %(levelname)s - %(name)s - %(message)s\",\n",
" datefmt=\"%d/%m/%Y %H:%M:%S\",\n",
" level=logging.INFO)\n",
"\n",
"from utils import set_seed\n",
"set_seed(42)\n",
"\n",
"import numpy as numpy\n",
"import torch\n",
"import torch.nn as nn\n",
"from torch.nn import functional as F\n",
"\n",
"import math\n",
"from torch.utils.data import Dataset\n",
"\n",
"class CharDataset(Dataset):\n",
" def __init__(self, data, block_size):\n",
" chars = sorted(list(set(data)))\n",
" data_size, vocab_size = len(data), len(chars)\n",
" print(\"data has %d characters, %d unique.\" % (data_size, vocab_size))\n",
"\n",
" self.stoi = {ch:i for i, ch in enumerate(chars)}\n",
" self.itos = {i:ch for i, ch in enumerate(chars)}\n",
" self.block_size = block_size\n",
" self.vocab_size = vocab_size\n",
" self.data = data\n",
"\n",
" def __len__(self):\n",
" return len(self.data) - self.block_size\n",
"\n",
" def __getitem__(self, idx):\n",
" chunk = self.data[idx : idx+self.block_size+1]\n",
" dix = [self.stoi[s] for s in chunk]\n",
"\n",
" x = torch.tensor(dix[:-1], dtype = torch.long)\n",
" y = torch.tensor(dix[1:], dtype = torch.long)\n",
" return x, y\n",
"\n",
"with scorep.instrumenter.enable():\n",
" block_size = 32\n",
"\n",
" text = open(\"./{}\".format(filename), \"r\").read()\n",
" train_dataset = CharDataset(text, block_size)\n",
"\n",
" from model import GPT, GPTconfig\n",
" mconf = GPTconfig(train_dataset.vocab_size, train_dataset.block_size,\n",
" n_layer=1, n_head=8, n_embd=512)\n",
" model = GPT(mconf)\n",
"\n",
" from trainer import Trainer, TrainerConfig\n",
"\n",
" tconf = TrainerConfig(max_epochs=1, batch_size=512, learning_rate=6e-4,\n",
" lr_decay=True, warmup_tokens=512*20, final_tokens=2*len(train_dataset)*block_size,\n",
" num_workers=4)\n",
" trainer = Trainer(model, train_dataset, None, tconf)\n",
" trainer.train()\n",
"\n",
" torch.save(model.state_dict(), \"./saved_models/trained_gpt_model\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"@webio": {
"lastCommId": null,
"lastKernelId": null
},
"kernelspec": {
"display_name": "scorep-python",
"language": "python",
"name": "scorep-python"
},
"language_info": {
"file_extension": ".py",
"mimetype": "text/plain",
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h1>Character Level GPT on Text Data</h1>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Edit the filename below to train the GPT model on the corpus. Select \"Run\" -> \"Run All\"."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
 "set user environment successfully: {'SCOREP_ENABLE_TRACING': '1', 'SCOREP_ENABLE_PROFILING': '0', 'SCOREP_TOTAL_MEMORY': '3g'}"
]
}
],
"source": [
"%%scorep_env\n",
"SCOREP_ENABLE_TRACING=1\n",
"SCOREP_ENABLE_PROFILING=0\n",
"SCOREP_TOTAL_MEMORY=3g"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"use the following scorep python binding arguments: --noinstrumenter"
]
}
],
"source": [
"%%scorep_python_binding_arguments\n",
"--noinstrumenter"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"filename = \"fairytales.txt\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"data has 49496 characters, 79 unique.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"15/09/2021 15:35:08 - INFO - model - Number of parameters : 2.531738e+07\n",
"/home/h9/s4122485/virtualenv_jupyterkernel_scorep_python/lib/python3.8/site-packages/torch/utils/data/dataloader.py:478: UserWarning: This DataLoader will create 4 worker processes in total. Our suggested max number of worker in current system is 1, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.\n",
" warnings.warn(_create_warning_msg(\n",
"epoch 1 iter 96: train loss 2.31473. lr 0.00030152924503397155: 100%|██████████| 97/97 [02:27<00:00, 1.52s/it]\n",
"[W pthreadpool-cpp.cc:90] Warning: Leaking Caffe2 thread-pool after fork. (function pthreadpool)\n",
"[W pthreadpool-cpp.cc:90] Warning: Leaking Caffe2 thread-pool after fork. (function pthreadpool)\n",
"[W pthreadpool-cpp.cc:90] Warning: Leaking Caffe2 thread-pool after fork. (function pthreadpool)\n",
"[W pthreadpool-cpp.cc:90] Warning: Leaking Caffe2 thread-pool after fork. (function pthreadpool)\n",
"epoch 2 iter 96: train loss 2.05380. lr 5.9999999999999995e-05: 100%|██████████| 97/97 [02:27<00:00, 1.52s/it]\n",
"[W pthreadpool-cpp.cc:90] Warning: Leaking Caffe2 thread-pool after fork. (function pthreadpool)\n",
"[W pthreadpool-cpp.cc:90] Warning: Leaking Caffe2 thread-pool after fork. (function pthreadpool)\n",
"[W pthreadpool-cpp.cc:90] Warning: Leaking Caffe2 thread-pool after fork. (function pthreadpool)\n",
"[W pthreadpool-cpp.cc:90] Warning: Leaking Caffe2 thread-pool after fork. (function pthreadpool)\n",
"epoch 3 iter 96: train loss 1.88871. lr 0.0003015292450339715: 100%|██████████| 97/97 [02:27<00:00, 1.52s/it] \n"
]
}
],
"source": [
"%%execute_with_scorep\n",
"import scorep\n",
"import logging\n",
"\n",
"logging.basicConfig(\n",
" format=\"%(asctime)s - %(levelname)s - %(name)s - %(message)s\",\n",
" datefmt=\"%d/%m/%Y %H:%M:%S\",\n",
" level=logging.INFO)\n",
"\n",
"from utils import set_seed\n",
"set_seed(42)\n",
"\n",
"import numpy as numpy\n",
"import torch\n",
"import torch.nn as nn\n",
"from torch.nn import functional as F\n",
"\n",
"import math\n",
"from torch.utils.data import Dataset\n",
"\n",
"class CharDataset(Dataset):\n",
" def __init__(self, data, block_size):\n",
" chars = sorted(list(set(data)))\n",
" data_size, vocab_size = len(data), len(chars)\n",
" print(\"data has %d characters, %d unique.\" % (data_size, vocab_size))\n",
"\n",
" self.stoi = {ch:i for i, ch in enumerate(chars)}\n",
" self.itos = {i:ch for i, ch in enumerate(chars)}\n",
" self.block_size = block_size\n",
" self.vocab_size = vocab_size\n",
" self.data = data\n",
"\n",
" def __len__(self):\n",
" return len(self.data) - self.block_size\n",
"\n",
" def __getitem__(self, idx):\n",
" chunk = self.data[idx : idx+self.block_size+1]\n",
" dix = [self.stoi[s] for s in chunk]\n",
"\n",
" x = torch.tensor(dix[:-1], dtype = torch.long)\n",
" y = torch.tensor(dix[1:], dtype = torch.long)\n",
" return x, y\n",
"\n",
"with scorep.instrumenter.enable():\n",
" block_size = 32\n",
"\n",
" text = open(\"./{}\".format(filename), \"r\").read()\n",
" train_dataset = CharDataset(text, block_size)\n",
"\n",
" from model import GPT, GPTconfig\n",
" mconf = GPTconfig(train_dataset.vocab_size, train_dataset.block_size,\n",
" n_layer=8, n_head=8, n_embd=512)\n",
" model = GPT(mconf)\n",
"\n",
" from trainer import Trainer, TrainerConfig\n",
"\n",
" tconf = TrainerConfig(max_epochs=3, batch_size=512, learning_rate=6e-4,\n",
" lr_decay=True, warmup_tokens=512*20, final_tokens=2*len(train_dataset)*block_size,\n",
" num_workers=4)\n",
" trainer = Trainer(model, train_dataset, None, tconf)\n",
"\n",
" torch.cuda.empty_cache()\n",
" trainer.train()\n",
"\n",
" torch.save(model.state_dict(), \"./saved_models/trained_gpt_model\")"
]
}
],
"metadata": {
"@webio": {
"lastCommId": null,
"lastKernelId": null
},
"kernelspec": {
"display_name": "scorep-python3",
"language": "python3",
"name": "scorep-python3"
},
"language_info": {
"file_extension": ".py",
"mimetype": "text/plain",
"name": "Any text"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
6 changes: 2 additions & 4 deletions examples/gpt-demo/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def __init__(self, model, train_dataset, test_dataset, config):
self.test_dataset = test_dataset
self.config = config

self.device = "cpu"
self.device = "gpu"
if torch.cuda.is_available():
self.device = torch.cuda.current_device()
self.model = torch.nn.DataParallel(self.model).to(self.device)
Expand Down Expand Up @@ -112,6 +112,4 @@ def run_epoch(split):
good_model = self.test_dataset is None or test_loss < best_loss
if self.config.ckpt_path is not None and good_model:
best_loss = test_loss
self.save_checkpoint()


self.save_checkpoint()
Loading

0 comments on commit ad7c34f

Please sign in to comment.