lora.yaml
name: mistral lora hard
debug: false
environment:
  environment_variables:
    - NCCL_DEBUG=INFO
    - NCCL_SOCKET_IFNAME=ens,eth,ib
  image:
    gpu: determinedai/environments:cuda-11.8-pytorch-2.0-gpu-95c7a14
    cpu: determinedai/environments:py-3.10-pytorch-2.0-cpu-03ae7d7
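  # NCCL_DEBUG=INFO enables NCCL's informational logging, useful for
  # debugging the multi-GPU setup; NCCL_SOCKET_IFNAME restricts NCCL
  # traffic to network interfaces whose names start with ens, eth, or ib.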
resources:
  slots_per_trial: 2
  resource_pool: <RESOURCE_POOL> # We used A100 40GB GPUs
workspace: <WORKSPACE_NAME>
project: <PROJECT>
searcher:
  name: grid
  max_length:
    batches: 3000
  metric: eval_accuracy
  smaller_is_better: false
hyperparameters:
  model: "mistralai/Mistral-7B-Instruct-v0.2"
  model_commit_hash: "99259002b41e116d28ccb2d04a9fbe22baed0c7f"
  dataset_subset: "hard"
  lora: true
  r:
    type: categorical
    vals: [2, 8, 32, 128]
  lora_alpha:
    type: categorical
    vals: [0.5, 1, 2, 8, 32, 128, 256, 512]
  lora_dropout:
    type: categorical
    vals: [0.1]
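  # The grid searcher trains every combination of the values above:
  # 4 (r) x 8 (lora_alpha) x 1 (lora_dropout) = 32 trials, each run for
  # the searcher's max_length of 3000 batches (matching
  # training_args.max_steps below).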
  hf_token: <HF_TOKEN>
  training_args:
    output_dir: "/tmp/llm_finetuning"
    max_steps: 3000
    per_device_train_batch_size: 4
    per_device_eval_batch_size: 4
    bf16: true
    evaluation_strategy: "steps"
    eval_steps: 500
    logging_strategy: "steps"
    logging_steps: 100
    save_strategy: "steps"
    save_steps: 1000
    learning_rate: 1e-5
    deepspeed: true
    gradient_checkpointing: true
  use_rslora: false
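  # With use_rslora false, the adapter update keeps the standard LoRA
  # scaling of lora_alpha / r; setting it true would switch to
  # rank-stabilized scaling, lora_alpha / sqrt(r). (This assumes
  # finetune.py forwards the flag to peft.LoraConfig, which is where
  # PEFT defines use_rslora.)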
entrypoint: >-
  python -m determined.launch.torch_distributed
  python finetune.py
max_restarts: 0
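# To launch this experiment (a sketch, assuming the Determined CLI is
# installed and the <...> placeholders above are filled in), submit it
# from the directory containing finetune.py:
#
#   det experiment create lora.yaml .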