Commit f4f338a

Change the implementation for avoiding a race condition during unzip of the TRT-LLM wheel to use a lock file
1 parent 38224c5 · commit f4f338a
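
For orientation before reading the diff: a minimal, self-contained sketch of the lock-file handshake described in the commit message, assuming an OpenMPI launch (the function name extract_once and all paths are illustrative; the actual change is to extract_wheel_file in py/torch_tensorrt/_utils.py further down).

import os
import time
import zipfile
from pathlib import Path


def extract_once(wheel_path: Path, extract_dir: Path) -> None:
    """Rank 0 unzips the wheel; other ranks poll a lock file until it is gone."""
    rank = int(os.environ.get("OMPI_COMM_WORLD_RANK", 0))  # set by mpirun/OpenMPI
    lock_file = extract_dir / ".extracting"

    if rank == 0:
        # Create the target directory and the lock file before extracting,
        # so waiting ranks can see that extraction is in progress.
        extract_dir.mkdir(parents=True, exist_ok=False)
        lock_file.touch(exist_ok=False)
        try:
            with zipfile.ZipFile(wheel_path) as zf:
                zf.extractall(extract_dir)
        finally:
            # Deleting the lock file signals completion to the other ranks.
            lock_file.unlink(missing_ok=True)
    else:
        # Non-zero ranks spin until rank 0 removes the lock file.
        while lock_file.exists():
            time.sleep(0.5)

The waiting ranks only poll for the lock file's removal, so the pattern relies on rank 0 creating the directory and lock before the other ranks first check; a rank that finds no lock file falls through immediately.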

File tree

7 files changed: +76 additions, -72 deletions


examples/distributed_inference/tensor_parallel_initialize_dist.py

Lines changed: 37 additions & 1 deletion
@@ -14,7 +14,9 @@
 import tensorrt as trt
 import torch
 import torch.distributed as dist
-from torch.distributed._tensor.device_mesh import init_device_mesh
+from torch.distributed._tensor.device_mesh import DeviceMesh, init_device_mesh
+
+logger = logging.getLogger(__name__)


 # this is kept at the application level, when mpirun is used to run the application
@@ -54,3 +56,37 @@ def cleanup_distributed_env():
     """Clean up distributed process group to prevent resource leaks."""
     if dist.is_initialized():
         dist.destroy_process_group()
+
+
+def check_tensor_parallel_device_number(world_size: int) -> None:
+    if world_size % 2 != 0:
+        raise ValueError(
+            f"TP examples require even number of GPUs, but got {world_size} gpus"
+        )
+
+
+def get_tensor_parallel_device_mesh(
+    rank: int = 0, world_size: int = 1
+) -> tuple[DeviceMesh, int, int]:
+    local_rank = int(
+        os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK", rank % torch.cuda.device_count())
+    )
+    world_size = int(os.environ.get("OMPI_COMM_WORLD_SIZE", world_size))
+    device_mesh = init_device_mesh(device_type="cuda", mesh_shape=(world_size,))
+    rank = device_mesh.get_rank()
+    assert rank == local_rank
+    device_id = (
+        rank % torch.cuda.device_count()
+    )  # Ensure each rank gets a unique device
+    torch.cuda.set_device(device_id)
+
+    return device_mesh, world_size, rank
+
+
+def initialize_distributed_logger(rank: int, logger_file_name: str) -> logging.Logger:
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+    fh = logging.FileHandler(logger_file_name + f"_{rank}.log", mode="w")
+    fh.setLevel(logging.INFO)
+    logger.addHandler(fh)
+    return logger
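
For context, here is a condensed sketch of how the example scripts further down consume these new helpers (the log-file prefix "my_tp_example" and the logger.info call are illustrative; the real call sites appear in the rotary-embedding and simple-example diffs below).

import torch.distributed as dist
from tensor_parallel_initialize_dist import (
    cleanup_distributed_env,
    get_tensor_parallel_device_mesh,
    initialize_distributed_env,
    initialize_distributed_logger,
)

# Set up the process group once per process, then build the device mesh
# and a per-rank log file using the helpers added above.
if not dist.is_initialized():
    initialize_distributed_env()

device_mesh, _world_size, _rank = get_tensor_parallel_device_mesh()
logger = initialize_distributed_logger(_rank, "my_tp_example")  # hypothetical log prefix
logger.info(f"rank {_rank} of {_world_size} ready on mesh {device_mesh}")

# ... build, shard, and run the tensor-parallel model here ...

cleanup_distributed_env()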

examples/distributed_inference/tensor_parallel_rotary_embedding.py

Lines changed: 4 additions & 8 deletions
@@ -9,25 +9,21 @@

 """

-import logging
-import os
 import time

 import torch
 import torch.distributed as dist
 from tensor_parallel_initialize_dist import (
     cleanup_distributed_env,
+    get_tensor_parallel_device_mesh,
     initialize_distributed_env,
+    initialize_distributed_logger,
 )

 if not dist.is_initialized():
     initialize_distributed_env()

 import torch_tensorrt
-from torch_tensorrt.dynamo.distributed.utils import (
-    get_tensor_parallel_device_mesh,
-    initialize_distributed_logger,
-)

 device_mesh, _world_size, _rank = get_tensor_parallel_device_mesh()
 logger = initialize_distributed_logger(_rank, "tensor_parallel_rotary_embedding")
@@ -36,8 +32,8 @@

 """
 This example covers the rotary embedding in Llama3 model and is derived from https://lightning.ai/lightning-ai/studios/tensor-parallelism-supercharging-large-model-training-with-pytorch-lightning
-Command to run with single GPU: mpirun -n 1 --allow-run-as-root python tensor_parallel_rotary_embedding.py
-Command to run with 2 GPUs: mpirun -n 2 --allow-run-as-root python tensor_parallel_rotary_embedding.py
+Command to run with single GPU: USE_TRTLLM_PLUGINS=1 mpirun -n 1 --allow-run-as-root python tensor_parallel_rotary_embedding.py
+Command to run with 2 GPUs: USE_TRTLLM_PLUGINS=1 mpirun -n 2 --allow-run-as-root python tensor_parallel_rotary_embedding.py
 """

 BATCH = 2

examples/distributed_inference/tensor_parallel_simple_example.py

Lines changed: 3 additions & 1 deletion
@@ -16,7 +16,7 @@
 -----
 .. code-block:: bash

-    mpirun -n 2 --allow-run-as-root python tensor_parallel_simple_example.py
+    USE_TRTLLM_PLUGINS=1 mpirun -n 2 --allow-run-as-root python tensor_parallel_simple_example.py
 """

 import time
@@ -27,7 +27,9 @@
 import torch.nn as nn
 from tensor_parallel_initialize_dist import (
     cleanup_distributed_env,
+    get_tensor_parallel_device_mesh,
     initialize_distributed_env,
+    initialize_distributed_logger,
 )

 if not dist.is_initialized():

py/torch_tensorrt/_utils.py

Lines changed: 32 additions & 19 deletions
@@ -5,6 +5,7 @@
 import platform
 import sys
 import tempfile
+import time
 import urllib.request
 from pathlib import Path
 from typing import Any, Optional
@@ -144,47 +145,59 @@ def _extracted_dir_trtllm(platform_system: str, platform_machine: str) -> Path:


 def extract_wheel_file(wheel_path: Path, extract_dir: Path) -> None:
-    # this will not be encountered in case of platforms not supporting torch distributed/nccl/TRT-LLM
-    from torch.distributed import barrier, get_rank, is_initialized
-
-    if not is_initialized():
-        # Single process case, just unzip
-        is_master = True
-    else:
-        is_master = get_rank() == 0  # only rank 0 does the unzip
-
-    if is_master:
+    """
+    Safely extract a wheel file to a directory with a lock to prevent concurrent extraction.
+    """
+    rank = int(os.environ.get("OMPI_COMM_WORLD_RANK", 0))  # MPI rank from OpenMPI
+    torch.cuda.set_device(rank)
+    lock_file = extract_dir / ".extracting"
+
+    # Rank 0 performs extraction
+    if rank == 0:
+        logger.debug(
+            f"[Rank {rank}] Starting extraction of {wheel_path} to {extract_dir}"
+        )
         try:
             import zipfile
         except ImportError as e:
             raise ImportError(
                 "zipfile module is required but not found. Please install zipfile"
             )
+        # Create lock file to signal extraction in progress
+        extract_dir.mkdir(parents=True, exist_ok=False)
+        lock_file.touch(exist_ok=False)
         try:
             with zipfile.ZipFile(wheel_path) as zip_ref:
                 zip_ref.extractall(extract_dir)
-            logger.debug(f"Extracted wheel to {extract_dir}")
-
+            logger.debug(f"[Rank {rank}] Extraction complete: {extract_dir}")
+            print(f"[Rank {rank}] Extraction complete: {extract_dir}")
         except FileNotFoundError as e:
-            # This should capture the errors in the download failure above
-            logger.error(f"Wheel file not found at {wheel_path}: {e}")
+            logger.error(f"[Rank {rank}] Wheel file not found at {wheel_path}: {e}")
             raise RuntimeError(
                 f"Failed to find downloaded wheel file at {wheel_path}"
             ) from e
         except zipfile.BadZipFile as e:
-            logger.error(f"Invalid or corrupted wheel file: {e}")
+            logger.error(f"[Rank {rank}] Invalid or corrupted wheel file: {e}")
            raise RuntimeError(
                 "Downloaded wheel file is corrupted or not a valid zip archive"
             ) from e
         except Exception as e:
-            logger.error(f"Unexpected error while extracting wheel: {e}")
+            logger.error(f"[Rank {rank}] Unexpected error while extracting wheel: {e}")
             raise RuntimeError(
                 "Unexpected error during extraction of TensorRT-LLM wheel"
             ) from e
+        finally:
+            # Remove lock file to signal completion
+            lock_file.unlink(missing_ok=True)

-    # Make sure others wait until unzip is done
-    if is_initialized():
-        barrier()
+    else:
+        # Other ranks wait for extraction to complete
+        while lock_file.exists():
+            logger.debug(
+                f"[Rank {rank}] Waiting for extraction to finish at {extract_dir}..."
+            )
+            print(f"[Rank {rank}] Waiting... device:", torch.cuda.current_device())
+            time.sleep(0.5)


 def download_and_get_plugin_lib_path() -> Optional[str]:

py/torch_tensorrt/dynamo/distributed/__init__.py

Lines changed: 0 additions & 1 deletion
This file was deleted.

py/torch_tensorrt/dynamo/distributed/utils.py

Lines changed: 0 additions & 41 deletions
This file was deleted.

setup.py

Lines changed: 0 additions & 1 deletion
@@ -450,7 +450,6 @@ def run(self):
         "torch_tensorrt.dynamo.conversion.impl.unary",
         "torch_tensorrt.dynamo.conversion.plugins",
         "torch_tensorrt.dynamo.debug",
-        "torch_tensorrt.dynamo.distributed",
         "torch_tensorrt.dynamo.lowering",
         "torch_tensorrt.dynamo.lowering.passes",
         "torch_tensorrt.dynamo.partitioning",
