lora.yaml
name: mistral lora hard
debug: false
environment:
  environment_variables:
    - NCCL_DEBUG=INFO
    - NCCL_SOCKET_IFNAME=ens,eth,ib
  image:
    gpu: determinedai/environments:cuda-11.8-pytorch-2.0-gpu-95c7a14
    cpu: determinedai/environments:py-3.10-pytorch-2.0-cpu-03ae7d7
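  # NCCL_DEBUG=INFO enables NCCL's informational logging, useful for
  # debugging the multi-GPU setup; NCCL_SOCKET_IFNAME restricts NCCL
  # traffic to network interfaces whose names start with ens, eth, or ib.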
resources:
  slots_per_trial: 2
  resource_pool: <RESOURCE_POOL> # We used A100 40GB GPUs
workspace: <WORKSPACE_NAME>
project: <PROJECT>
searcher:
  name: grid
  max_length:
    batches: 3000
  metric: eval_accuracy
  smaller_is_better: false
hyperparameters:
  model: "mistralai/Mistral-7B-Instruct-v0.2"
  model_commit_hash: "99259002b41e116d28ccb2d04a9fbe22baed0c7f"
  dataset_subset: "hard"
  lora: true
  r:
    type: categorical
    vals: [2, 8, 32, 128]
  lora_alpha:
    type: categorical
    vals: [0.5, 1, 2, 8, 32, 128, 256, 512]
  lora_dropout:
    type: categorical
    vals: [0.1]
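  # The grid searcher trains every combination of the values above:
  # 4 (r) x 8 (lora_alpha) x 1 (lora_dropout) = 32 trials, each run for
  # the searcher's max_length of 3000 batches (matching
  # training_args.max_steps below).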
  hf_token: <HF_TOKEN>
  training_args:
    output_dir: "/tmp/llm_finetuning"
    max_steps: 3000
    per_device_train_batch_size: 4
    per_device_eval_batch_size: 4
    bf16: true
    evaluation_strategy: "steps"
    eval_steps: 500
    logging_strategy: "steps"
    logging_steps: 100
    save_strategy: "steps"
    save_steps: 1000
    learning_rate: 1e-5
    deepspeed: true
    gradient_checkpointing: true
  use_rslora: false
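  # With use_rslora false, the adapter update keeps the standard LoRA
  # scaling of lora_alpha / r; setting it true would switch to
  # rank-stabilized scaling, lora_alpha / sqrt(r). (This assumes
  # finetune.py forwards the flag to peft.LoraConfig, which is where
  # PEFT defines use_rslora.)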
entrypoint: >-
  python -m determined.launch.torch_distributed
  python finetune.py
max_restarts: 0
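# To launch this experiment (a sketch, assuming the Determined CLI is
# installed and the <...> placeholders above are filled in), submit it
# from the directory containing finetune.py:
#
#   det experiment create lora.yaml .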