From 5ab9121d6cef5501d3e0fd77f004dc0dbfff253d Mon Sep 17 00:00:00 2001
From: luowyang
Date: Tue, 26 Sep 2023 16:09:58 +0800
Subject: [PATCH] Fix issue 1721 by always initializing process group.

---
 yolox/core/launch.py | 44 +++++++++++++++++++++++++++-----------------
 1 file changed, 27 insertions(+), 17 deletions(-)

diff --git a/yolox/core/launch.py b/yolox/core/launch.py
index 9f8eec61e..05dff1772 100644
--- a/yolox/core/launch.py
+++ b/yolox/core/launch.py
@@ -57,17 +57,34 @@ def launch(
         args (tuple): arguments passed to main_func
     """
     world_size = num_machines * num_gpus_per_machine
+    if world_size <= 0:
+        raise ValueError('`world_size` should be positive, currently {}'.format(world_size))
+
+    # Even if `world_size == 1`, we have to initialize the process group,
+    # so the user code can use all the `torch.distributed` facilities. This
+    # makes the code uniform whether there is one process or many.
+
+    if dist_url == "auto":
+        assert (
+            num_machines == 1
+        ), "`dist_url=auto` cannot work with distributed training."
+        port = _find_free_port()
+        dist_url = f"tcp://127.0.0.1:{port}"
+
+    worker_args = (
+        main_func,
+        world_size,
+        num_gpus_per_machine,
+        machine_rank,
+        backend,
+        dist_url,
+        args,
+    )
+
     if world_size > 1:
         # https://github.com/pytorch/pytorch/pull/14391
         # TODO prctl in spawned processes
 
-        if dist_url == "auto":
-            assert (
-                num_machines == 1
-            ), "dist_url=auto cannot work with distributed training."
-            port = _find_free_port()
-            dist_url = f"tcp://127.0.0.1:{port}"
-
         start_method = "spawn"
         cache = vars(args[1]).get("cache", False)
 
@@ -82,20 +99,13 @@ def launch(
         mp.start_processes(
             _distributed_worker,
             nprocs=num_gpus_per_machine,
-            args=(
-                main_func,
-                world_size,
-                num_gpus_per_machine,
-                machine_rank,
-                backend,
-                dist_url,
-                args,
-            ),
+            args=worker_args,
             daemon=False,
             start_method=start_method,
         )
+
     else:
-        main_func(*args)
+        _distributed_worker(0, *worker_args)
 
 
 def _distributed_worker(
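
Note: the comment added above is the crux of the change. Because launch() now routes the
single-process case through _distributed_worker(0, *worker_args) as well, the process group
is initialized for any world size, and user code no longer needs a single-GPU special case.
A minimal sketch of what this enables follows; the main_func body, the dummy tensor, and the
(None, None) argument tuple are hypothetical illustrations, not part of YOLOX, and the sketch
assumes a CUDA machine because _distributed_worker still asserts CUDA availability.

    import torch
    import torch.distributed as dist

    from yolox.core import launch


    def main_func(exp, args):
        # With the patched launch(), the process group is initialized even when
        # only one process runs, so collectives can be called unconditionally
        # for 1 or N GPUs.
        assert dist.is_initialized()
        loss = torch.tensor([1.0], device="cuda")
        dist.all_reduce(loss, op=dist.ReduceOp.SUM)
        loss /= dist.get_world_size()
        print(f"rank {dist.get_rank()}: mean loss {loss.item()}")


    if __name__ == "__main__":
        # Hypothetical single-GPU invocation; YOLOX normally passes (exp, args) here.
        launch(main_func, num_gpus_per_machine=1, dist_url="auto", args=(None, None))

The same uniformity applies to rank-dependent logic such as "save checkpoints only on rank 0":
dist.get_rank() is valid in both the single-GPU and multi-GPU paths, so no branching on
world_size is needed in trainer code.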