Merged
74 changes: 36 additions & 38 deletions dali/python/nvidia/dali/experimental/dynamic/_batch.py
@@ -164,6 +164,42 @@ class Batch:

In case of lazy evaluation, the operations are executed only after an attempt is made to access
the tensor data or properties which cannot be obtained without running the underlying operation.

.. warning::
:class:`Batch` objects should not be constructed directly, use :func:`batch` or
:func:`as_batch` instead.

The batch object can be created either from an existing object, passed as `tensors`, or
from an invocation result.
Unless a copy is explicitly requested with the `copy` parameter, this constructor makes a
best effort to avoid copying.

Parameters
----------
tensors : TensorLike, default: None
The data to construct the batch from. It can be a list of tensors, a TensorList,
or other supported types. If None, the batch is constructed from an invocation result.
Supported types are:

- a list of tensor-like objects; the objects need to have matching number of dimensions,
data types and layouts,
- a tensor-like object; the outermost dimension is interpreted as the batch dimension
- a dali.backend.TensorListCPU or dali.backend.TensorListGPU
dtype : DType, default: None
The desired data type of the batch. If not specified, the data type is inferred
from the input tensors. If specified, the input tensors are cast to the desired
data type. The `dtype` is required if `tensors` is an empty list.
device : Device or str, optional, default: None
The device on which the batch should reside (e.g., "cpu" or "gpu").
If not specified, the device is inferred from the input tensors.
layout : str, optional, default: None
The layout string describing the dimensions of the batch (e.g., "HWC").
If not specified, the layout is inferred from the input tensors.
invocation_result : _invocation.InvocationResult, default: None
The result of a DALI operator invocation, used for lazy evaluation.
copy : bool, optional, default: False
If True, the input tensors are copied. If False, the constructor will avoid
copying data when possible.
"""

def __init__(
@@ -175,44 +211,6 @@ def __init__(
invocation_result: _invocation.InvocationResult | None = None,
copy: bool = False,
):
"""Constructs a :class:`Batch` object.

.. warning::
:class:`Batch` objects should not be constructed directly, use :meth:`batch` or
:meth:`as_batch` instead.

The batch object can be created either from an existing object, passed as `tensors` or
from an invocation result.
Unless explicitly requested with the `copy` parameter, this constructor will make best
effort to avoid the copy.

Parameters
----------
tensors : TensorLike, default: None
The data to construct the batch from. It can be a list of tensors, a TensorList,
or other supported types. If None, the batch is constructed from an invocation result.
Supported types are:

- a list of tensor-like objects; the objects need to have matching number of dimensions,
data types and layouts,
- a tensor-like object; the outermost dimension is interpreted as the batch dimension
- a dali.backend.TensorListCPU or dali.backend.TensorListGPU
dtype : DType, default: None
The desired data type of the batch. If not specified, the data type is inferred
from the input tensors. If specified, the input tensors are cast to the desired
data type. The `dtype` is required if `tensors` are an empty list.
device : Device or str, optional, default: None
The device on which the batch should reside (e.g., "cpu" or "gpu").
If not specified, the device is inferred from the input tensors.
layout : str, optional, default: None
The layout string describing the dimensions of the batch (e.g., "HWC").
If not specified, the layout is inferred from the input tensors.
invocation_result : _invocation.InvocationResult, default: None
The result of a DALI operator invocation, used for lazy evaluation
copy : bool, optional, default: False
If True, the input tensors are copied. If False, the constructor will avoid
copying data when possible.
"""
assert isinstance(layout, str) or layout is None
if device is not None and not isinstance(device, Device):
device = _device(device)
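The class-level docstring added above states that a single tensor-like input has its outermost dimension interpreted as the batch dimension, and that `copy=False` avoids copies where possible. A minimal NumPy sketch of that interpretation (illustrative only; `split_into_samples` is a hypothetical helper, not DALI code):

```python
import numpy as np

def split_into_samples(array):
    """Interpret the outermost dimension as the batch dimension,
    yielding one view per sample (no copies, mirroring copy=False)."""
    return [array[i] for i in range(array.shape[0])]

data = np.zeros((4, 32, 32, 3), dtype=np.uint8)  # 4 HWC samples
samples = split_into_samples(data)
assert len(samples) == 4
assert samples[0].shape == (32, 32, 3)
assert samples[0].base is data  # views into the original buffer, not copies
```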
34 changes: 16 additions & 18 deletions dali/python/nvidia/dali/experimental/dynamic/_device.py
@@ -33,6 +33,22 @@ class Device:
Device on which data is stored and operators are executed.

The device can be either CPU or (specific) GPU.

.. warning::
It's recommended to use the :func:`device` function rather than to construct
:class:`Device` directly.

Parameters
----------
name : str
The name of the device. It can be either ``"cpu"``, ``"gpu"`` or ``"gpu:<id>"`` where
``<id>`` is a CUDA device ordinal, as used by CUDA runtime API (not the absolute index
used by NVML).
device_id : int, optional
The optional device ordinal, as used by CUDA runtime API. If not specified and the name
is ``"gpu"``, the current CUDA device will be used.
If `name` is ``"cpu"``, `device_id` must be ``None``.
This parameter must not be used if `name` already contains the id.
"""

_thread_local = local()
@@ -46,24 +62,6 @@ def _default_device_type():
return Device._default_device_type()

def __init__(self, name: str, device_id: int | None = None):
"""
Initializes the device object with a name and, optionally, device id.

.. warning::
It's recommended to use :meth:`device` function rather than to construct :class:`Device`
directly.

Args
----
name : str
The name of the device. It can be either "cpu", "gpu" or "gpu:<id>" where <id> is a
CUDA device ordinal, as used by CUDA runtime API (not the absolute index used by NVML).
device_id : int, optional
The optional device ordinal, as used by CUDA runtime API. If not specified and the name
is "gpu", then current CUDA device will be used.
If `name` is "cpu", `device_id` must be `None`.
This parameter must not be used if the `name` already contains the id.
"""
device_type, name_device_id = Device._split_device_type_and_id(name)
if name_device_id is not None and device_id is not None:
raise ValueError(
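The `Device` docstring above spells out three rules: names are ``"cpu"``, ``"gpu"``, or ``"gpu:<id>"``; the id must not be given both in the name and as `device_id`; and a CPU device takes no id. A self-contained sketch of those documented rules (`parse_device` is a hypothetical helper, not DALI's implementation):

```python
def parse_device(name, device_id=None):
    """Validate a device spec per the documented Device(name, device_id) rules."""
    if ":" in name:
        device_type, _, id_str = name.partition(":")
        name_id = int(id_str)  # e.g. "gpu:1" -> ("gpu", 1)
    else:
        device_type, name_id = name, None
    if device_type not in ("cpu", "gpu"):
        raise ValueError(f"Unknown device type: {device_type!r}")
    if name_id is not None and device_id is not None:
        raise ValueError("device_id specified twice: in the name and as an argument")
    if device_type == "cpu" and (name_id is not None or device_id is not None):
        raise ValueError("CPU devices do not take a device_id")
    return device_type, name_id if name_id is not None else device_id

assert parse_device("gpu:1") == ("gpu", 1)
assert parse_device("gpu", 0) == ("gpu", 0)
assert parse_device("cpu") == ("cpu", None)
```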
60 changes: 28 additions & 32 deletions dali/python/nvidia/dali/experimental/dynamic/_eval_context.py
@@ -41,43 +41,39 @@ class EvalContext:

- CUDA device
- thread pool
- cuda stream.
- CUDA stream.

``EvalContext`` is a context manager.

Parameters
----------
thread_pool : ThreadPool, optional
The thread pool which will be used by multi-threaded operators.
It must be associated with the same `device_id` as the one passed to this constructor.
This parameter is mutually exclusive with `num_threads`.
num_threads : int, optional
If specified, a new thread pool with this number of threads is created and associated
with the context. Note that creating a thread pool constitutes considerable overhead.
This argument is mutually exclusive with `thread_pool`.
device_id : int, optional
The ordinal of the GPU associated with the context. If not specified, the current CUDA
device will be used.
cuda_stream : stream object, optional
The CUDA stream on which GPU operators will be executed. If not provided, the value is
assigned by trying several options, in this order:

1. the thread's default stream, set by calling :func:`set_current_stream`
2. the default stream set by calling :func:`set_default_stream`
3. a new stream, if neither of the above was set.

Compatible streams include DALI :class:`Stream`, any object exposing
``__cuda_stream__`` interface, raw CUDA stream handles, and PyTorch streams.
See :class:`Stream` for details.
"""

_default_context_stream_sentinel = object()

def __init__(self, *, num_threads=None, device_id=None, cuda_stream=None, thread_pool=None):
"""
Constructs an ``EvalContext`` object.

Keyword Args
------------
thread_pool : ThreadPool, optional
The thread pool which will be used by multi-threaded operators.
It must be associated with the same `device_id` as the one passed to this function
This parameter is mutually exclusive with `num_threads`.
num_threads : int, optional
If specified, a new thread pool with this number of threads is created and associated
with the context. Note that creating a thread pool constitutes considerable overhead.
This argument is mutually exclusive with `thread_pool`.
device_id : int, optional
The ordinal of the GPU associated with the context. If not specified, the current CUDA
device will be used.
cuda_stream : stream object, optional
The cuda_stream on which GPU operators will be executed. If not provided, the value is
assigned by trying several options, in this order:
- the thread's default stream, set by calling :meth:`set_current_stream`
- the default stream set by calling :meth:`set_default_stream`
- a new stream, if neither of the above was set.
Compatible streams include:
- DALI :class:`Stream` class
- any object exposing ``__cuda_stream__`` interface
- raw CUDA stream handle
- PyTorch stream
see :class:`Stream` for details.
"""
self._invocations = []
self._default_stream = None

@@ -213,8 +209,8 @@ def cuda_stream(self):
CUDA stream for this ``EvalContext``

.. note::
In case of the thread's default context, this value is affected by calls to methods
:meth:`set_default_stream` and :meth:`set_current_stream`.
In case of the thread's default context, this value is affected by calls to
:func:`set_default_stream` and :func:`set_current_stream`.
"""
if self._cuda_stream is None:
s = _stream.get_default_stream(self.device_id)
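The `cuda_stream` parameter documentation above defines a fallback chain: an explicit stream wins, then the thread's current stream, then the process-wide default, and finally a newly created stream. That resolution order can be sketched in isolation (a schematic only; the argument names and `make_new` factory are hypothetical):

```python
def resolve_cuda_stream(explicit=None, current=None, default=None,
                        make_new=lambda: "new-stream"):
    """Pick a stream by the documented priority:
    explicit > set_current_stream > set_default_stream > a fresh stream."""
    if explicit is not None:
        return explicit
    if current is not None:
        return current
    if default is not None:
        return default
    return make_new()

assert resolve_cuda_stream(explicit="e", current="c", default="d") == "e"
assert resolve_cuda_stream(current="c", default="d") == "c"
assert resolve_cuda_stream(default="d") == "d"
assert resolve_cuda_stream() == "new-stream"
```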
6 changes: 3 additions & 3 deletions dali/python/nvidia/dali/experimental/dynamic/_imread.py
@@ -50,8 +50,8 @@ def _imread_impl(filepaths: Union[str, List[str], Tensor, Batch], device: str =

Note
----
This function is currently implemented by combining :meth:`io.file.read` and
:meth:`decoders.image`, providing a simple interface for loading images
This function is currently implemented by combining :func:`io.file.read` and
:func:`decoders.image`, providing a simple interface for loading images
from disk. This may change in the future to provide a more efficient implementation.

Examples
@@ -83,7 +83,7 @@ def _imread_impl(filepaths: Union[str, List[str], Tensor, Batch], device: str =
Note
----
The filepath encoding is handled automatically when passing strings.
If you already have encoded filepaths as Tensors (from :meth:`io.file.read`
If you already have encoded filepaths as Tensors (from :func:`io.file.read`
documentation format), you can pass them directly.
"""
from . import io, decoders
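The note in this file describes `imread` as a composition of `io.file.read` and `decoders.image`. The shape of that composition can be shown with stand-in functions (the real DALI operators are not reproduced here; `read_file` and `decode_image` are placeholders):

```python
def read_file(path):
    # Stand-in for io.file.read: load the encoded bytes from disk.
    with open(path, "rb") as f:
        return f.read()

def decode_image(encoded):
    # Stand-in for decoders.image: here we only report the payload size.
    return {"decoded_bytes": len(encoded)}

def imread(path):
    # imread is currently "read, then decode", as the note states.
    return decode_image(read_file(path))
```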
26 changes: 19 additions & 7 deletions dali/python/nvidia/dali/experimental/dynamic/_stream.py
@@ -44,14 +44,26 @@ class Stream:

This class wraps a CUDA stream object. It can be either a stream created by DALI or a
compatible object created by a third-party library.

.. warning::
Do not construct this class directly. Use :func:`stream` instead.

Parameters
----------
stream : stream object or Stream.create_new sentinel, optional
A compatible stream object to wrap, or ``Stream.create_new`` (default) to create a new
stream. Compatible objects include objects exposing ``__cuda_stream__`` interface,
PyTorch streams, and raw CUDA stream handles.
device_id : int, optional
The GPU device ordinal to associate with the stream. If ``stream`` is ``Stream.create_new``,
a new stream is created on this device; if not specified, the current CUDA device is used.
If ``stream`` is an existing stream object, the device id is inferred from it and this
parameter is used only for validation.
"""

create_new = object()

def __init__(self, *, stream=create_new, device_id=None):
"""
Do not construct this class directly. Use :meth:`stream` instead.
"""
if stream is None:
raise ValueError(
"The stream must not be None. To create a new stream, omit the stream parameter."
@@ -188,7 +200,7 @@ def set_default_stream(cuda_stream, /, device_id=None):
.. warning::
This function is intended to be used once, at the beginning of the program, to set the
default stream for DALI operations. Calling it affects all default contexts in all threads
that haven't set their current streams with a call to :meth:`set_current_stream`.
that haven't set their current streams with a call to :func:`set_current_stream`.
"""
global _global_streams
if not _global_streams:
@@ -208,7 +220,7 @@ def get_default_stream(device_id=None):
"""Gets the default stream

This stream is used when not overridden by thread's current stream (see
:meth:`set_current_stream`).
:func:`set_current_stream`).
"""
if _global_streams is None:
return None
@@ -226,8 +238,8 @@ def set_current_stream(cuda_stream, /):
newly created :class:`EvalContext` objects with the current device to use this stream.

Passing ``None`` resets the current thread's default context stream. After that, the value
returned by :meth:`get_current_stream` will either point to the value returned by
:meth:`get_default_stream` or a new stream.
returned by :func:`get_current_stream` will either point to the value returned by
:func:`get_default_stream` or a new stream.

.. warning::
Setting the current stream doesn't establish any synchronization between the work
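The `Stream` docstring above lists "objects exposing the ``__cuda_stream__`` interface" among the compatible inputs. A toy illustration of a third-party object providing that protocol, assuming the `(version, handle)` pair convention used by `cuda.core` (DALI's exact expectations are not reproduced here, and `stream_handle` is a hypothetical extractor):

```python
class ForeignStream:
    """A third-party stream-like object exposing __cuda_stream__."""

    def __init__(self, handle):
        self._handle = handle

    def __cuda_stream__(self):
        # Assumed convention: (protocol version, raw CUDA stream handle).
        return (0, self._handle)

def stream_handle(obj):
    # Sketch of how a wrapper might extract the raw handle for interop.
    version, handle = obj.__cuda_stream__()
    return handle

assert stream_handle(ForeignStream(0xDEADBEEF)) == 0xDEADBEEF
```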
68 changes: 33 additions & 35 deletions dali/python/nvidia/dali/experimental/dynamic/_tensor.py
@@ -89,6 +89,39 @@ class Tensor:

In case of lazy evaluation, the operations are executed only after an attempt is made to access
the tensor data or properties which cannot be obtained without running the underlying operation.

.. warning::
:class:`Tensor` objects should not be constructed directly, use :func:`tensor` or
:func:`as_tensor` instead.

The :class:`Tensor` object can be created either from an existing object, passed as `data`,
or from an invocation result.
Unless a copy is explicitly requested with the `copy` parameter, this constructor makes a
best effort to avoid copying.

Parameters
----------
data : TensorLike, default: None
The data to construct the tensor from. It can be a tensor-like object, a (nested) list,
TensorCPU/TensorGPU or other supported type.
dtype : DType, default: None
The desired data type of the tensor. If not specified, the data type is inferred
from the input data. If specified, the input data is cast to the desired data type.
device : Device or str, optional, default: None
The device on which the tensor should reside (e.g., "cpu" or "gpu").
If not specified, the device is inferred from the input data.
layout : str, optional, default: None
The layout string describing the dimensions of the tensor (e.g., "HWC").
If not specified, the layout is inferred from the input data, if possible.
batch : Batch, optional, default: None
Use if the tensor is a view of a sample in a batch. Used together with `index_in_batch`.
index_in_batch : int, optional, default: None
The index of the tensor in the batch. Used together with `batch`.
invocation_result : _invocation.InvocationResult, default: None
The result of a DALI operator invocation, used for lazy evaluation.
copy : bool, optional, default: False
If True, the input data is copied. If False, the constructor will avoid
copying data when possible.
"""

def __init__(
Expand All @@ -102,41 +135,6 @@ def __init__(
invocation_result: _invocation.InvocationResult | None = None,
copy: bool = False,
):
"""Constructs a :class:`Tensor` object.

.. warning::
:class:`Tensor` objects should not be constructed directly, use :meth:`tensor` or
:meth:`as_tensor` instead.

The :class:`Tensor` object can be created either from an existing object, passed as `data`
or from an invocation result.
Unless explicitly requested with the `copy` parameter, this constructor will make best
effort to avoid the copy.

Parameters
----------
data : TensorLike, default: None
The data to construct the tensor from. It can be a tensor-like object, a (nested) list,
TensorCPU/TensorGPU or other supported type.
dtype : DType, default: None
The desired data type of the tensor. If not specified, the data type is inferred
from the input data. If specified, the input data is cast to the desired data type.
device : Device or str, optional, default: None
The device on which the tensor should reside (e.g., "cpu" or "gpu").
If not specified, the device is inferred from the input data.
layout : str, optional, default: None
The layout string describing the dimensions of the tensor (e.g., "HWC").
If not specified, the layout is inferred from the input data, if possible.
batch : Batch, optional, default: None
Use if the tensor is a view of a sample in a batch. Used together with `index_in_batch`.
index_in_batch : int, optional, default: None
The index of the tensor in the batch. Used together with `batch`.
invocation_result : _invocation.InvocationResult, default: None
The result of a DALI operator invocation, used for lazy evaluation
copy : bool, optional, default: False
If True, the input data is copied. If False, the constructor will avoid
copying data when possible.
"""
if layout is None:
layout = ""
elif not isinstance(layout, str):
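The `copy` parameter documented above promises copy avoidance when `copy=False`. NumPy's `asarray`/`array` pair follows the same contract, which makes for a compact analogy (this illustrates the semantics only, not DALI's implementation; `as_tensor_like` is a hypothetical name):

```python
import numpy as np

def as_tensor_like(data, copy=False):
    # copy=False: share memory when the input is already a suitable array;
    # copy=True: always materialize an independent buffer.
    return np.array(data) if copy else np.asarray(data)

src = np.arange(6, dtype=np.float32)
view = as_tensor_like(src, copy=False)
owned = as_tensor_like(src, copy=True)
assert np.shares_memory(src, view)       # zero-copy path
assert not np.shares_memory(src, owned)  # explicit copy
```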