
Commit 8ba5cc9

fix unit tests
Signed-off-by: Chenyaaang <chenyangli@google.com>
1 parent f7f2b52 commit 8ba5cc9

File tree

3 files changed (+33, -14 lines)


tests/worker/tpu_worker_test.py

Lines changed: 22 additions & 6 deletions
@@ -25,6 +25,7 @@ def mock_vllm_config():
     mock_parallel_conf = MagicMock()
     mock_parallel_conf.tensor_parallel_size = 2
     mock_parallel_conf.data_parallel_size = 1
+    mock_parallel_conf.pipeline_parallel_size = 1
     mock_parallel_conf.nnodes = 1
     mock_parallel_conf.nnodes_within_dp = 1

@@ -118,8 +119,14 @@ def test_init_device_with_provided_devices(

         worker.init_device()

-        mock_jax.devices.assert_not_called()
-        mock_runner_cls.assert_called_once_with(mock_vllm_config, mock_devices)
+        mock_jax.local_devices.assert_not_called()
+        expected_rank = 0
+        expected_is_first_rank = True
+        expected_is_last_rank = True
+        mock_runner_cls.assert_called_once_with(mock_vllm_config, mock_devices,
+                                                expected_rank,
+                                                expected_is_first_rank,
+                                                expected_is_last_rank)
         assert isinstance(worker.model_runner, MagicMock)

     @patch('tpu_inference.worker.tpu_worker.TPUModelRunner')
@@ -137,15 +144,24 @@ def test_init_device_autodetects_devices(
             distributed_init_method="test_method",
             devices=[]  # No devices provided, should trigger auto-detection
         )
-        mock_jax.devices.return_value = ['tpu:0', 'tpu:1', 'tpu:2', 'tpu:3']
+        mock_jax.local_device_count.return_value = 4
+        mock_jax.local_devices.return_value = [
+            'tpu:0', 'tpu:1', 'tpu:2', 'tpu:3'
+        ]

         worker.init_device()

-        mock_jax.devices.assert_called_once()
+        mock_jax.local_devices.assert_called_once()
         expected_devices = ['tpu:0', 'tpu:1']  # Sliced by tensor_parallel_size
         assert worker.devices == expected_devices
+        expected_rank = 0
+        expected_is_first_rank = True
+        expected_is_last_rank = True
         mock_runner_cls.assert_called_once_with(mock_vllm_config,
-                                                expected_devices)
+                                                expected_devices,
+                                                expected_rank,
+                                                expected_is_first_rank,
+                                                expected_is_last_rank)

     @patch('tpu_inference.worker.tpu_worker.utils')
     def test_determine_available_memory(self, mock_utils, mock_vllm_config):
@@ -194,7 +210,7 @@ def test_execute_model(self, mock_runner_cls, mock_vllm_config):

         # Assert the runner was called with the scheduler output directly
         worker.model_runner.execute_model.assert_called_once_with(
-            mock_scheduler_input)
+            mock_scheduler_input, None)
         # Assert the final result is the concrete model output
         assert result == mock_model_output

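For reference, here is a minimal standalone sketch of the device-slicing expectation the auto-detection test encodes. The slice itself is an assumption inferred from the test's own comment ("Sliced by tensor_parallel_size"), not code copied from the worker:

    # With tensor_parallel_size = 2, the test expects the worker to keep the
    # first two entries returned by the mocked jax.local_devices().
    local_devices = ['tpu:0', 'tpu:1', 'tpu:2', 'tpu:3']   # mocked return value
    tensor_parallel_size = 2                                # from mock_parallel_conf

    expected_devices = local_devices[:tensor_parallel_size]
    assert expected_devices == ['tpu:0', 'tpu:1']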
tpu_inference/runner/tpu_runner.py

Lines changed: 3 additions & 0 deletions
@@ -223,6 +223,9 @@ def __init__(
         self,
         vllm_config: VllmConfig,
         devices: List[Any],
+        rank: int = 0,
+        is_first_rank: bool = True,
+        is_last_rank: bool = True,
     ):
         self.vllm_config = vllm_config
         self.model_config = vllm_config.model_config

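The hunk above only widens the constructor, so existing single-rank call sites keep working through the new defaults. A self-contained sketch of that behaviour, using a stand-in class rather than the real TPUModelRunner:

    from typing import Any, List

    class RunnerSketch:
        """Stand-in mirroring only the new TPUModelRunner.__init__ parameters."""
        def __init__(self, vllm_config: Any, devices: List[Any],
                     rank: int = 0, is_first_rank: bool = True,
                     is_last_rank: bool = True):
            self.rank = rank
            self.is_first_rank = is_first_rank
            self.is_last_rank = is_last_rank

    # Single-rank callers (the pre-change style) fall back to the defaults.
    single = RunnerSketch(vllm_config=None, devices=['tpu:0', 'tpu:1'])
    assert (single.rank, single.is_first_rank, single.is_last_rank) == (0, True, True)

    # Pipeline-parallel callers pass the values explicitly, as tpu_worker.py does below.
    rank, pp_world_size = 1, 2
    pp = RunnerSketch(None, ['tpu:0'], rank, rank == 0, rank == pp_world_size - 1)
    assert (pp.is_first_rank, pp.is_last_rank) == (False, True)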
tpu_inference/worker/tpu_worker.py

Lines changed: 8 additions & 8 deletions
@@ -117,7 +117,7 @@ def __init__(
         # TPU Worker is initialized. The profiler server needs to start after
         # MP runtime is initialized.
         self.profile_dir = None
-        if envs.VLLM_TORCH_PROFILER_DIR and self.rank < 1 and self.pp_world_size == 1:
+        if vllm_envs.VLLM_TORCH_PROFILER_DIR and self.rank < 1 and self.pp_config.pp_world_size == 1:
             if not self.devices or 0 in self.device_ranks:
                 # For TPU, we can only have 1 active profiler session for 1 profiler
                 # server. So we only profile on rank0.
@@ -126,9 +126,9 @@ def __init__(
                             self.profile_dir)

         # For PP, we use MPMD so we want to profile every worker.
-        if self.pp_world_size > 1 and envs.VLLM_TORCH_PROFILER_DIR:
+        if self.pp_config.pp_world_size > 1 and vllm_envs.VLLM_TORCH_PROFILER_DIR:
             self.profile_dir = os.path.join(
-                envs.VLLM_TORCH_PROFILER_DIR,
+                vllm_envs.VLLM_TORCH_PROFILER_DIR,
                 f"pprank_{self.rank}_ppworldsize_{self.pp_config.pp_world_size}"
             )
             os.makedirs(self.profile_dir, exist_ok=True)
@@ -161,7 +161,7 @@ def init_device(self,
         if multihost_backend != "ray" and self.parallel_config.pipeline_parallel_size > 1:
             tpu_ports = [
                 jax_parallel_state.BASE_JAX_PORT + i
-                for i in range(self.pp_world_size)
+                for i in range(self.pp_config.pp_world_size)
             ]
             os.environ["TPU_PROCESS_ADDRESSES"] = ",".join(
                 [f"localhost:{port}" for port in tpu_ports])
@@ -206,7 +206,7 @@ def init_device(self,
                 if device is None:
                     raise KeyError(
                         f"Device index {device_index} not found in "
-                        f"jax.devices() with IDs {list(device_dict.keys())}!"
+                        f"jax.local_devices() with IDs {list(device_dict.keys())}!"
                     )
                 self.devices.append(device)
             assert len(self.devices) >= sharding_config.total_devices
@@ -240,9 +240,9 @@ def init_device(self,
             need_pp=self.parallel_config.pipeline_parallel_size > 1)

         ensure_kv_transfer_initialized(self.vllm_config)
-        self.model_runner = TPUModelRunner(self.vllm_config, self.devices,
-                                           self.rank, self.rank == 0,
-                                           self.rank == self.pp_world_size - 1)
+        self.model_runner = TPUModelRunner(
+            self.vllm_config, self.devices, self.rank, self.rank == 0,
+            self.rank == self.pp_config.pp_world_size - 1)
         logger.info(f"Init worker | "
                     f"rank={self.rank} | "
                     f"node_id={get_node_id()} | "

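A short sketch of how the PP-aware values used in this file fit together. The expressions mirror the diff above; the base directory string is a made-up example standing in for vllm_envs.VLLM_TORCH_PROFILER_DIR:

    import os

    rank, pp_world_size = 1, 2
    is_first_rank = rank == 0                   # False for rank 1
    is_last_rank = rank == pp_world_size - 1    # True for the last PP rank

    base_dir = "/tmp/vllm-profile"              # stand-in for VLLM_TORCH_PROFILER_DIR
    profile_dir = os.path.join(base_dir, f"pprank_{rank}_ppworldsize_{pp_world_size}")
    assert profile_dir.endswith("pprank_1_ppworldsize_2")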