From 9bf0baa50d2a481bd3342886284af1396f299dca Mon Sep 17 00:00:00 2001 From: Casper Date: Mon, 1 Jul 2024 12:28:50 +0200 Subject: [PATCH 1/2] AzureML doc on multi-node training --- docs/source/en/basics/launch_colossalai.md | 37 ++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/docs/source/en/basics/launch_colossalai.md b/docs/source/en/basics/launch_colossalai.md index 8a6028d6c49a..ced650697d8c 100644 --- a/docs/source/en/basics/launch_colossalai.md +++ b/docs/source/en/basics/launch_colossalai.md @@ -228,3 +228,40 @@ mpirun --hostfile -np python train.py --host + +### Launch on AzureML Compute Cluster + +AzureML launches the distributed PyTorch processes on every node for you, so you do not need to use `colossalai run` or `torchrun`. Instead, you only need to launch your training script with plain `python`. The following script submits a training job to a compute cluster with 2 nodes of 8 GPUs each (`instance_count=2`, `process_count_per_instance=8`). + +Notes: +- For multi-node distributed training, AzureML has built-in functionality for multi-node communication which means you do not need SSH access between nodes. +- You will need to build a Docker image for ColossalAI and push it to an Azure Container Registry and create an AzureML environment before you can launch a job. 
+ +``` +import os +from azure.ai.ml import MLClient, command +from azure.identity import DefaultAzureCredential + +# client +ml_client = MLClient.from_config(credential=DefaultAzureCredential()) + +# Define the job configuration +job = command( + code="./", + command="python train.py --arg1 value1 --arg2 value2", + environment="YOUR_AZUREML_ENVIRONMENT", + compute="YOUR_CLUSTER_NAME", + instance_count=2, + distribution={ + "type": "PyTorch", + "process_count_per_instance": 8, + }, + display_name="Training Run Multi Node", + experiment_name="COLOSSAL_TRAINING" +) + +# Submit the job +returned_job = ml_client.jobs.create_or_update(job) +print(f"Job {returned_job.name} submitted.") +print(f"Monitor your job at: {returned_job.studio_url}") +``` From 16d7dcd36dcf9948212d94d950c33b323ead96c9 Mon Sep 17 00:00:00 2001 From: Casper Date: Mon, 1 Jul 2024 12:29:13 +0200 Subject: [PATCH 2/2] Fix markdown linting of Python --- docs/source/en/basics/launch_colossalai.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/basics/launch_colossalai.md b/docs/source/en/basics/launch_colossalai.md index ced650697d8c..59c3a73d6588 100644 --- a/docs/source/en/basics/launch_colossalai.md +++ b/docs/source/en/basics/launch_colossalai.md @@ -237,7 +237,7 @@ Notes: - For multi-node distributed training, AzureML has built-in functionality for multi-node communication which means you do not need SSH access between nodes. - You will need to build a Docker image for ColossalAI and push it to an Azure Container Registry and create an AzureML environment before you can launch a job. -``` +```python import os from azure.ai.ml import MLClient, command from azure.identity import DefaultAzureCredential