From f8174af3a3580301b09f878fa0b83cdb54293c31 Mon Sep 17 00:00:00 2001 From: Harshang Akabari Date: Tue, 17 Feb 2026 23:35:07 +0530 Subject: [PATCH] Fix zero and non-finite guard handling in key math paths. Add explicit validation for divisor inputs in groups and inference utilities, enforce valid throughput report intervals, and reject invalid HPU dequantization scales to avoid ZeroDivisionError and silent inf/nan propagation. --- deepspeed/inference/v2/inference_utils.py | 2 ++ deepspeed/utils/groups.py | 2 ++ deepspeed/utils/timer.py | 12 +++++++- deepspeed/utils/validation.py | 14 +++++++++ op_builder/hpu/fp_quantizer.py | 5 ++++ tests/unit/inference/test_inference_utils.py | 18 ++++++++++++ .../test_fp_quantizer_scale_validation.py | 27 +++++++++++++++++ tests/unit/utils/test_groups.py | 8 ++++- tests/unit/utils/test_timer.py | 29 +++++++++++++++++++ 9 files changed, 115 insertions(+), 2 deletions(-) create mode 100644 deepspeed/utils/validation.py create mode 100644 tests/unit/inference/test_inference_utils.py create mode 100644 tests/unit/ops/fp_quantizer/test_fp_quantizer_scale_validation.py create mode 100644 tests/unit/utils/test_timer.py diff --git a/deepspeed/inference/v2/inference_utils.py b/deepspeed/inference/v2/inference_utils.py index 7b2dd4237353..774f74d06ee4 100644 --- a/deepspeed/inference/v2/inference_utils.py +++ b/deepspeed/inference/v2/inference_utils.py @@ -8,6 +8,7 @@ import torch from enum import Enum, IntEnum +from deepspeed.utils.validation import ensure_nonzero_divisor class NormTypeEnum(Enum): @@ -102,4 +103,5 @@ def ceil_div(a: int, b: int) -> int: """ Return ceil(a / b). """ + ensure_nonzero_divisor(b, name="b") return -(-a // b) diff --git a/deepspeed/utils/groups.py b/deepspeed/utils/groups.py index a6f0a7228977..55f98d7faed0 100644 --- a/deepspeed/utils/groups.py +++ b/deepspeed/utils/groups.py @@ -29,6 +29,7 @@ from deepspeed.utils import log_dist from deepspeed.utils.bwc import bwc_tensor_model_parallel_world_size, bwc_pipeline_parallel_world_size from deepspeed.utils.exceptions import DeprecatedException +from deepspeed.utils.validation import ensure_nonzero_divisor from deepspeed.accelerator import get_accelerator # Expert parallel group that the current rank belongs to. @@ -63,6 +64,7 @@ def initialize(ep_size=1, mpu=None): def _ensure_divisibility(numerator, denominator): """Ensure that numerator is divisible by the denominator.""" + ensure_nonzero_divisor(denominator, name="denominator") assert numerator % denominator == 0, '{} is not divisible by {}'.format(numerator, denominator) diff --git a/deepspeed/utils/timer.py b/deepspeed/utils/timer.py index 0aa7be55d829..532edee7742e 100755 --- a/deepspeed/utils/timer.py +++ b/deepspeed/utils/timer.py @@ -4,6 +4,7 @@ # DeepSpeed Team import time +import numbers from numpy import mean from deepspeed.utils.logging import print_dist from deepspeed.accelerator import get_accelerator @@ -211,7 +212,16 @@ def __init__(self, config, batch_size, start_step=2, steps_per_output=None, moni self.global_step_count = 0 self.total_elapsed_time = 0 self.step_elapsed_time = 0 - self.steps_per_output = steps_per_output + if steps_per_output is not None: + if not isinstance(steps_per_output, numbers.Integral): + raise ValueError( + f"steps_per_output must be a positive integer or None, got {type(steps_per_output).__name__}" + ) + if steps_per_output <= 0: + raise ValueError(f"steps_per_output must be greater than 0, got {steps_per_output}") + self.steps_per_output = int(steps_per_output) + else: + self.steps_per_output = None self.monitor_memory = monitor_memory self.logging = logging_fn if self.logging is None: diff --git a/deepspeed/utils/validation.py b/deepspeed/utils/validation.py new file mode 100644 index 000000000000..28510169b5ff --- /dev/null +++ b/deepspeed/utils/validation.py @@ -0,0 +1,14 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from typing import Any + + +def ensure_nonzero_divisor(divisor: Any, *, name: str = "divisor") -> None: + """ + Validate that a divisor is non-zero before modulo/division math. + """ + if divisor == 0: + raise ValueError(f"{name} must be non-zero") diff --git a/op_builder/hpu/fp_quantizer.py b/op_builder/hpu/fp_quantizer.py index c74affb55045..855e589bcdd1 100644 --- a/op_builder/hpu/fp_quantizer.py +++ b/op_builder/hpu/fp_quantizer.py @@ -56,6 +56,11 @@ def selective_dequantize(cls, val_q, scales, indexes, group_size, q_mantisa_bits def dequantize(cls, fp_out, input_q, scale, group_size, q_mantisa_bits, q_exponent_bits): orig_shape = fp_out.shape orig_dtype = fp_out.dtype + scale_tensor = scale if torch.is_tensor(scale) else torch.as_tensor(scale) + if not torch.all(torch.isfinite(scale_tensor)): + raise ValueError("dequantize scale must contain finite values") + if torch.any(scale_tensor == 0): + raise ValueError("dequantize scale must be non-zero") dequant_out = torch.ops.hpu.cast_from_fp8(input_q, (1.0 / scale), orig_dtype).view(orig_shape) fp_out.copy_(dequant_out) return fp_out diff --git a/tests/unit/inference/test_inference_utils.py b/tests/unit/inference/test_inference_utils.py new file mode 100644 index 000000000000..f985afc56f52 --- /dev/null +++ b/tests/unit/inference/test_inference_utils.py @@ -0,0 +1,18 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest + +from deepspeed.inference.v2.inference_utils import ceil_div + + +def test_ceil_div_basic_behavior(): + assert ceil_div(10, 4) == 3 + assert ceil_div(12, 4) == 3 + + +def test_ceil_div_rejects_zero_divisor(): + with pytest.raises(ValueError, match="b must be non-zero"): + ceil_div(10, 0) diff --git a/tests/unit/ops/fp_quantizer/test_fp_quantizer_scale_validation.py b/tests/unit/ops/fp_quantizer/test_fp_quantizer_scale_validation.py new file mode 100644 index 000000000000..f835269c2574 --- /dev/null +++ b/tests/unit/ops/fp_quantizer/test_fp_quantizer_scale_validation.py @@ -0,0 +1,27 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest +import torch + +from op_builder.hpu.fp_quantizer import FPQuantizer + + +def test_dequantize_rejects_non_finite_scale(): + fp_out = torch.zeros(4, dtype=torch.float16) + input_q = torch.zeros(4, dtype=torch.uint8) + scale = torch.tensor([float("inf")], dtype=torch.float32) + + with pytest.raises(ValueError, match="dequantize scale must contain finite values"): + FPQuantizer.dequantize(fp_out, input_q, scale, group_size=1, q_mantisa_bits=3, q_exponent_bits=4) + + +def test_dequantize_rejects_zero_scale(): + fp_out = torch.zeros(4, dtype=torch.float16) + input_q = torch.zeros(4, dtype=torch.uint8) + scale = torch.tensor([0.0], dtype=torch.float32) + + with pytest.raises(ValueError, match="dequantize scale must be non-zero"): + FPQuantizer.dequantize(fp_out, input_q, scale, group_size=1, q_mantisa_bits=3, q_exponent_bits=4) diff --git a/tests/unit/utils/test_groups.py b/tests/unit/utils/test_groups.py index 5cd35baf3510..5450b88a02da 100644 --- a/tests/unit/utils/test_groups.py +++ b/tests/unit/utils/test_groups.py @@ -3,7 +3,8 @@ # DeepSpeed Team -from deepspeed.utils.groups import _get_expert_parallel_ranks +import pytest +from deepspeed.utils.groups import _ensure_divisibility, _get_expert_parallel_ranks def test_get_expert_parallel_ranks(): @@ -36,3 +37,8 @@ def test_get_expert_parallel_ranks(): [5, 13], [7, 15], ] + + +def test_ensure_divisibility_rejects_zero_denominator(): + with pytest.raises(ValueError, match="denominator must be non-zero"): + _ensure_divisibility(8, 0) diff --git a/tests/unit/utils/test_timer.py b/tests/unit/utils/test_timer.py new file mode 100644 index 000000000000..9563d856d81d --- /dev/null +++ b/tests/unit/utils/test_timer.py @@ -0,0 +1,29 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from types import SimpleNamespace +import pytest + +from deepspeed.utils.timer import ThroughputTimer + + +def _timer_config(): + return SimpleNamespace(enabled=True, synchronized=False) + + +def test_steps_per_output_rejects_zero(): + with pytest.raises(ValueError, match="steps_per_output must be greater than 0"): + ThroughputTimer(config=_timer_config(), batch_size=1, steps_per_output=0) + + +def test_steps_per_output_rejects_non_integral(): + with pytest.raises(ValueError, match="steps_per_output must be a positive integer or None"): + ThroughputTimer(config=_timer_config(), batch_size=1, steps_per_output=1.5) + + +def test_report_boundary_for_valid_steps_per_output(): + timer = ThroughputTimer(config=_timer_config(), batch_size=1, steps_per_output=3) + timer.global_step_count = 6 + assert timer._is_report_boundary()