feat: drafted pp e2e test for fwd/bwd pass

le1nux · le1nux · commit 83c87b9d6d6f · 2025-08-19T14:39:58.000+02:00
diff --git a/tests/fsdp2_parallelization/pipeline_parallelism/__init__.py b/tests/fsdp2_parallelization/pipeline_parallelism/__init__.py
diff --git a/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml b/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml
@@ -0,0 +1,171 @@
+settings:
+  experiment_id: ${modalities_env:experiment_id}
+  config_file_path: ${modalities_env:config_file_path}
+  referencing_keys:
+    sample_key: input_ids
+    target_key: target_ids
+    prediction_key: logits
+  cuda_env:
+    local_rank: ${cuda_env:LOCAL_RANK}
+    global_rank: ${cuda_env:RANK}
+    world_size: ${cuda_env:WORLD_SIZE}
+  step_profile:
+    gradient_accumulation_steps: 1
+    local_train_micro_batch_size: 2
+    sequence_length: 256
+
+loss_fn:
+  component_key: loss
+  variant_key: clm_cross_entropy_loss
+  config:
+    target_key: ${settings.referencing_keys.target_key}
+    prediction_key: ${settings.referencing_keys.prediction_key}
+
+device_mesh:
+  component_key: device_mesh
+  variant_key: default
+  config: # test_pp_fwd_bwd_pass.py additionally injects tensor_parallel_degree into this section
+    device_type: cuda
+    data_parallel_replicate_degree: 1
+    pipeline_parallel_degree: 2 # placeholder, overwritten per test case by test_pp_fwd_bwd_pass.py
+    data_parallel_shard_degree: -1 # placeholder, overwritten per test case by test_pp_fwd_bwd_pass.py
+    world_size: ${settings.cuda_env.world_size}
+
+initialized_model:
+  component_key: model
+  variant_key: model_initialized
+  config:
+    model:
+      component_key: pipeline
+      variant_key: selector
+      config:
+        pipeline:
+          instance_key: scheduled_pipeline
+          pass_type: BY_REFERENCE
+        selection_type: MODEL
+    model_initializer:
+      component_key: model_initialization
+      variant_key: composed
+      config:
+        model_type: gpt2
+        weight_init_type: scaled
+        mean: 0.0
+        std: 0.02
+        num_layers: ${model_raw.config.n_layer}
+
+scheduled_pipeline:
+  component_key: pipeline
+  variant_key: scheduled
+  config:
+    loss_fn:
+      instance_key: loss_fn
+      pass_type: BY_REFERENCE
+    pp_schedule_name: gpipe
+    batch_size: ${settings.step_profile.local_train_micro_batch_size}
+    microbatch_size: 1
+    pp_degree: ${device_mesh.config.pipeline_parallel_degree}
+    pipeline:
+      component_key: pipeline
+      variant_key: builder
+      config:
+        stage:
+          component_key: pipeline
+          variant_key: selector
+          config:
+            pipeline:
+              instance_key: staged_pipeline
+              pass_type: BY_REFERENCE
+            selection_type: STAGE
+        model:
+          instance_key: fsdp_model
+          pass_type: BY_REFERENCE
+        
+fsdp_model:
+  component_key: model
+  variant_key: fsdp2_wrapped
+  config:
+    model:
+      instance_key: model_part
+      pass_type: BY_REFERENCE
+    device_mesh:
+      instance_key: device_mesh
+      pass_type: BY_REFERENCE
+    mixed_precision_settings:
+      param_dtype: BF_16
+      reduce_dtype: BF_16
+    block_names: [GPT2Block]
+
+model_part:
+  component_key: pipeline
+  variant_key: selector
+  config:
+    pipeline:
+      instance_key: staged_pipeline
+      pass_type: BY_REFERENCE
+    selection_type: MODEL
+
+staged_pipeline:
+  component_key: pipeline
+  variant_key: staged
+  config:
+    whole_model:
+      instance_key: model_raw
+      pass_type: BY_REFERENCE
+    stages_generator:
+      component_key: stages_generator
+      variant_key: gpt2_stages_generator
+      config:
+        num_model_layers: ${model_raw.config.n_layer}
+        input_layer_equivalence: 1
+        output_layer_equivalence: 1
+    device_mesh:
+      instance_key: device_mesh
+      pass_type: BY_REFERENCE
+    local_rank: ${settings.cuda_env.local_rank}
+    pp_schedule_name: gpipe
+    num_layers_per_stage: 2
+
+model_raw:
+  component_key: model
+  variant_key: gpt2
+  config:
+    use_meta_device: true
+    use_weight_tying: false
+    sample_key: ${settings.referencing_keys.sample_key}
+    poe_type: NOPE
+    sequence_length: ${settings.step_profile.sequence_length}
+    prediction_key: ${loss_fn.config.prediction_key}
+    vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
+    n_layer: 2
+    n_head_q: 8
+    n_head_kv: 4
+    ffn_hidden: 128
+    n_embd: 128
+    dropout: 0.0
+    bias: true # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
+    attention_config:
+      qkv_transforms:
+        - type_hint: RotaryTransform
+          config:
+            n_embd: ${model_raw.config.n_embd}
+            n_head: ${model_raw.config.n_head_q} # must reference n_head_q here, not n_head_kv
+            seq_length_dim: -2
+            base_freq: 10000
+    attention_implementation: manual
+    activation_type: swiglu
+    attention_norm_config:
+      norm_type: layer_norm
+      config:
+        normalized_shape: ${model_raw.config.n_embd}
+        eps: 1e-5
+    ffn_norm_config:
+      norm_type: layer_norm
+      config:
+        normalized_shape: ${model_raw.config.n_embd}
+        eps: 1e-5
+    lm_head_norm_config:
+      norm_type: layer_norm
+      config:
+        normalized_shape: ${model_raw.config.n_embd}
+        eps: 1e-5
+
diff --git a/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py b/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py
@@ -0,0 +1,104 @@
+import os
+import tempfile
+from pathlib import Path
+
+import pytest
+import torch
+import torch.multiprocessing as mp
+import yaml
+from pydantic import BaseModel
+
+from modalities.__main__ import Main
+from modalities.config.config import ProcessGroupBackendType
+from modalities.config.pydantic_if_types import PydanticFSDP2ModuleType, PydanticPipelineType
+from tests.end2end_tests.custom_components import MultiProcessingCudaEnv
+
+
+@pytest.fixture
+def temp_file_path() -> Path:
+    # Create a NamedTemporaryFile that persists after closing (delete=False)
+    with tempfile.NamedTemporaryFile(delete=False) as tf:
+        file_path = tf.name
+    try:
+        yield Path(file_path)
+    finally:
+        # Clean up the file after the test
+        if os.path.exists(file_path):
+            os.remove(file_path)
+
+
+class ComponentsInstantiationModel(BaseModel):  # schema handed to Main.build_components by this test
+    initialized_model: PydanticFSDP2ModuleType  # same name as the top-level `initialized_model` config key
+    scheduled_pipeline: PydanticPipelineType  # same name as the top-level `scheduled_pipeline` config key
+
+
+@pytest.mark.skipif(
+    torch.cuda.device_count() < 8,
+    reason="This test requires 8 GPUs",
+)
+class TestPipelineParallelism:
+    def _get_tmp_sharding_config_path(
+        self, sharding_degree: int, tp_degree: int, pp_degree: int, temp_file_path: Path
+    ) -> Path:
+        working_dir = Path(os.path.dirname(__file__))
+        config_file_path = working_dir / "configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml"
+
+        with open(config_file_path, "r") as file:
+            config_string = file.read()
+            config_dict = yaml.safe_load(config_string)
+            config_dict["device_mesh"]["config"]["data_parallel_shard_degree"] = sharding_degree
+            config_dict["device_mesh"]["config"]["tensor_parallel_degree"] = tp_degree
+            config_dict["device_mesh"]["config"]["pipeline_parallel_degree"] = pp_degree
+
+        # save to temporary file
+        with open(temp_file_path, "w") as file:
+            yaml.dump(config_dict, file)
+
+        return temp_file_path
+
+    def _get_components(self, config_file_path: Path) -> ComponentsInstantiationModel:
+        main_obj = Main(config_file_path)
+        components: ComponentsInstantiationModel = main_obj.build_components(
+            components_model_type=ComponentsInstantiationModel
+        )
+        return components
+
+    @pytest.mark.parametrize(
+        "sharding_degree, tp_degree, pp_degree, world_size",
+        [
+            (2, 1, 2, 4),
+            # (2, 1, 4, 8),
+            # (2, 2, 2, 8), # TODO need to support this case
+        ],
+    )
+    def test_pp(self, sharding_degree: int, tp_degree: int, pp_degree: int, world_size: int, temp_file_path: Path):
+        tmp_sharding_config_path = self._get_tmp_sharding_config_path(
+            sharding_degree=sharding_degree,
+            tp_degree=tp_degree,
+            pp_degree=pp_degree,
+            temp_file_path=temp_file_path,
+        )
+        mp.spawn(
+            self._test_pp_impl,
+            args=(world_size, sharding_degree, tmp_sharding_config_path),
+            nprocs=world_size,
+            join=True,
+        )
+
+    def _test_pp_impl(
+        self,
+        process_id: int,
+        world_size: int,
+        sharding_degree: int,
+        gpt2_model_config_path: Path,
+    ):
+        # wraps the actual test function to be able to run it in a distributed  multiprocessing setup
+        with MultiProcessingCudaEnv(
+            process_group_backend=ProcessGroupBackendType.nccl,
+            global_rank=process_id,
+            local_rank=process_id,
+            world_size=world_size,
+            rdvz_port=22356,
+        ):
+            self._get_components(gpt2_model_config_path)
+            pass