3 changes: 3 additions & 0 deletions .gitignore
@@ -2,6 +2,9 @@
examples/outputs/
test_output/

# Test files generated during testing
test_file.json*

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
1 change: 1 addition & 0 deletions llmeter/callbacks/cost/results.py
@@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0
# Python Built-Ins:
from __future__ import annotations

from numbers import Number
from typing import Literal, Sequence

11 changes: 6 additions & 5 deletions llmeter/callbacks/mlflow.py
@@ -15,28 +15,29 @@

class MlflowCallback(Callback):
"""LLMeter callback to log test run parameters and metrics to an MLFlow tracking server.

This callback integrates with MLflow to track and log parameters and metrics from LLMeter test runs.
It can operate either in the current MLflow run context or create nested runs for each test.

Example:
```python
import mlflow
from llmeter.callbacks import MlflowCallback

with mlflow.start_run():
runner = Runner(
endpoint=endpoint,
callbacks=[MlflowCallback()]
)
results = await runner.run()
```

Attributes:
step (int | None): Step number for MLflow metrics logging
nested (bool): Whether to create nested runs for each test
parameters_names (list): List of parameter names to log to MLflow
"""

parameters_names = [
"total_requests",
"clients",
@@ -59,7 +60,7 @@ def __init__(self, step: int | None = None, nested: bool = False) -> None:
`Result.run_name`.

Raises:
ImportError: If MLflow is not installed
ImportError: If MLflow is not installed
"""
super().__init__()
self.step = step
6 changes: 5 additions & 1 deletion llmeter/endpoints/__init__.py
@@ -6,7 +6,11 @@

spec = importlib.util.find_spec("openai")
if spec:
from .openai import OpenAIEndpoint, OpenAICompletionEndpoint , OpenAICompletionStreamEndpoint # noqa: F401
from .openai import (
OpenAIEndpoint as OpenAIEndpoint,
OpenAICompletionEndpoint as OpenAICompletionEndpoint,
OpenAICompletionStreamEndpoint as OpenAICompletionStreamEndpoint,
)

spec = importlib.util.find_spec("litellm")
if spec:
65 changes: 50 additions & 15 deletions llmeter/endpoints/litellm.py
@@ -79,16 +79,19 @@ class LiteLLM(LiteLLMBase):
def invoke(self, payload, **kwargs):
try:
response = completion(model=self.litellm_model, **payload, **kwargs)
assert isinstance(response, ModelResponse)
if not isinstance(response, ModelResponse):
raise ValueError(f"Expected ModelResponse, got {type(response)}")
response = self._parse_converse_response(response)
response.input_prompt = self._parse_payload(payload)
return response

except Exception as e:
logger.exception(e)
return InvocationResponse.error_output(
id=uuid4().hex, error=str(e), input_prompt=self._parse_payload(payload)
response = InvocationResponse.error_output(
input_payload=payload, error=e, id=uuid4().hex
Reviewer comment (Contributor): Looks like runner.py, endpoints/bedrock.py, endpoints/openai.py, and endpoints/sagemaker.py all still have some cases using error=str(e) - do we care enough to fix that consistently and add an Exception | str | None type annotation to InvocationResponse.error_output()'s definition?
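(For illustration only — a minimal sketch of what that annotation could look like. The field names below are guesses based on how error_output() is called in this diff, not the actual InvocationResponse definition.)

```python
# Hypothetical sketch: error_output() accepting Exception | str | None and
# normalizing to a string internally, so call sites can pass either form.
from dataclasses import dataclass
from typing import Any


@dataclass
class InvocationResponse:  # assumed, simplified fields
    id: str
    error: str | None = None
    input_payload: dict[str, Any] | None = None
    input_prompt: str | None = None

    @classmethod
    def error_output(
        cls,
        id: str,
        error: Exception | str | None = None,
        input_payload: dict[str, Any] | None = None,
    ) -> "InvocationResponse":
        # Normalize exceptions to their string form once, in a single place.
        return cls(
            id=id,
            error=None if error is None else str(error),
            input_payload=input_payload,
        )
```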

)
response.input_prompt = self._parse_payload(payload)
return response

def _parse_converse_response(
self, client_response: ModelResponse
@@ -102,29 +105,52 @@ def _parse_converse_response(
response.num_tokens_input = usage.prompt_tokens
response.num_tokens_output = usage.completion_tokens
except AttributeError:
pass
response.num_tokens_input = None
response.num_tokens_output = None
Reviewer comment (Contributor) on lines -105 to +109: Looking at the InvocationResponse definition, these fields should default to None anyway. Why do we need to set them here?
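(The point presumably rests on defaults along these lines — an assumed, simplified shape of the relevant fields, not the real definition — in which case the except branch could simply pass.)

```python
# Assumed shape only: if the token counts already default to None, re-assigning
# them in the AttributeError branch is a no-op.
from dataclasses import dataclass


@dataclass
class InvocationResponse:
    num_tokens_input: int | None = None
    num_tokens_output: int | None = None
```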


return response


class LiteLLMStreaming(LiteLLMBase):
def invoke(self, payload, **kwargs):
if ("stream" not in kwargs) or ("stream" not in payload):
kwargs["stream"] = True

if ("stream_options" not in kwargs) or ("stream_options" not in payload):
kwargs["stream_options"] = {"include_usage": True}
# Make a copy of payload to avoid modifying the original
payload_copy = payload.copy()

# Create a clean kwargs dict without conflicting parameters
clean_kwargs = {}
for key, value in kwargs.items():
if key not in ["stream", "stream_options"]:
clean_kwargs[key] = value

# Ensure streaming is enabled
payload_copy["stream"] = True

# Handle stream_options - merge if exists in kwargs, otherwise set default
if "stream_options" in kwargs:
existing_options = kwargs.get("stream_options", {})
payload_copy["stream_options"] = {**existing_options, "include_usage": True}
Reviewer comment (Contributor) on lines +130 to +131: Should we merge in case there are some stream_options defined in payload as well as the kwargs? Currently we're supporting passing in either way, but not mixing.

Could be e.g.

existing_kwargs_options = kwargs["stream_options"]
existing_payload_options = payload_copy.get("stream_options", {})
payload_copy["stream_options"] = {
    **existing_payload_options,
    **existing_kwargs_options,
    "include_usage": True,
}

elif "stream_options" not in payload_copy:
payload_copy["stream_options"] = {"include_usage": True}
else:
# Merge with existing stream_options in payload if present
existing_options = payload_copy.get("stream_options", {})
payload_copy["stream_options"] = {**existing_options, "include_usage": True}

try:
start_t = time.perf_counter()
response = completion(model=self.litellm_model, **payload, **kwargs)
response = completion(
model=self.litellm_model, **payload_copy, **clean_kwargs
)
except Exception as e:
logger.exception(e)
return InvocationResponse.error_output(
id=uuid4().hex, error=str(e), input_prompt=self._parse_payload(payload)
response = InvocationResponse.error_output(
input_payload=payload, error=e, id=uuid4().hex
)
response.input_prompt = self._parse_payload(payload)
return response

assert isinstance(response, CustomStreamWrapper)
if not isinstance(response, CustomStreamWrapper):
raise ValueError(f"Expected CustomStreamWrapper, got {type(response)}")
response = self._parse_stream(response, start_t)
response.input_prompt = self._parse_payload(payload)
return response
@@ -136,12 +162,21 @@ def _parse_stream(
time_flag = True
time_to_first_token = None
output_text = ""
id = None

for chunk in client_response:
output_text += chunk.choices[0].delta.content or "" # type: ignore
if time_flag:
content = chunk.choices[0].delta.content or "" # type: ignore
output_text += content

# Record time to first token only when we get actual content
if time_flag and content:
time_to_first_token = time.perf_counter() - start_t
time_flag = False

# Always capture the ID from the first chunk
if id is None:
id = chunk.id

try:
usage = chunk.usage # type: ignore
except AttributeError:
20 changes: 15 additions & 5 deletions llmeter/experiments.py
@@ -51,8 +51,8 @@ def plot_results(self, show: bool = True, format: Literal["html", "png"] = "html
for i, (_, f) in enumerate(figs.items()):
f.update_layout(colorway=c_seqs[i % len(c_seqs)])

output_path = Path(self.output_path)
if output_path:
if self.output_path is not None:
output_path = Path(self.output_path)
# save figure to the output path
output_path.parent.mkdir(parents=True, exist_ok=True)
for k, f in figs.items():
@@ -66,7 +66,9 @@ def plot_results(self, show: bool = True, format: Literal["html", "png"] = "html
return figs

@classmethod
def load(cls, load_path: Path | str | None, test_name: str | None = None) -> "LoadTestResult":
def load(
cls, load_path: Path | str | None, test_name: str | None = None
) -> "LoadTestResult":
"""Load test results from a directory.

Args:
@@ -137,6 +139,7 @@ async def run(self, output_path: os.PathLike | None = None):
n_requests=self._get_n_requests(c),
run_name=f"{c:05.0f}-clients",
callbacks=self.callbacks,
output_path=output_path,
)
for c in tqdm(
self.sequence_of_clients, desc="Configurations", disable=_disable_tqdm
@@ -216,19 +219,26 @@ def __post_init__(self) -> None:

self._runner = Runner(
endpoint=self.endpoint,
output_path=Path(self.output_path),
output_path=Path(self.output_path)
if self.output_path is not None
else None,
tokenizer=self.tokenizer,
)

async def run(self, output_path=None):
# Handle None output_path properly
final_output_path = output_path or self.output_path
if final_output_path is not None:
final_output_path = Path(final_output_path)

heatmap_results = await self._runner.run(
payload=self.payload,
clients=self.clients,
n_requests=len(self.input_lengths)
* len(self.output_lengths)
* self.requests_per_combination
// self.clients,
output_path=Path(output_path) or self.output_path,
output_path=final_output_path,
)
self._results = heatmap_results
return heatmap_results
28 changes: 25 additions & 3 deletions llmeter/plotting/heatmap.py
@@ -131,10 +131,32 @@ def _cut(arr, bins):
[Interval(1,2.5), Interval(2.5,4)])
"""
t_max, t_min = max(arr), min(arr)
bin_width = ceil((t_max - t_min) / bins)

# Handle edge case where all values are the same
if t_max == t_min:
# Create a single bin that contains all values
bin_boundaries = [t_min, t_max + 1] # Add small offset to create valid interval
bin_indexes = [0] * len(arr) # All values go to first bin
return [Interval(bin_boundaries[0], bin_boundaries[1]) for _ in arr], [
Interval(bin_boundaries[0], bin_boundaries[1])
]

# Calculate bin width, ensuring we don't get zero width
bin_width = max(1, ceil((t_max - t_min) / bins))
bin_boundaries = [i * bin_width + t_min for i in range(bins + 1)]
bin_indexes = [floor((k - t_min) / bin_width) for k in arr]
# binned = [Interval(bin_boundaries[k], bin_boundaries[k + 1]) for k in bin_indexes]

# Ensure the last boundary covers the maximum value
if bin_boundaries[-1] <= t_max:
bin_boundaries[-1] = t_max + 1

# Calculate bin indexes, ensuring they're within valid range
bin_indexes = []
for k in arr:
idx = min(floor((k - t_min) / bin_width), bins - 1)
# Ensure index is valid
idx = max(0, min(idx, len(bin_boundaries) - 2))
bin_indexes.append(idx)

return [Interval(bin_boundaries[k], bin_boundaries[k + 1]) for k in bin_indexes], [
Interval(left, right)
for left, right in zip(bin_boundaries[:-1], bin_boundaries[1:])
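For context on the _cut changes above: with constant input the old code computes bin_width = 0 and then divides by zero when assigning bin indexes. Below is a minimal standalone sketch of the same binning idea (plain tuples instead of llmeter's Interval class, so it runs on its own); it is an illustration of the fix, not the project's implementation.

```python
from math import ceil, floor


def cut(values, bins):
    """Bin values into `bins` integer-width intervals, tolerating constant input."""
    t_max, t_min = max(values), min(values)
    if t_max == t_min:
        # Degenerate case: one bin covering everything. Without this guard,
        # bin_width would be 0 and the index computation divides by zero.
        edges = (t_min, t_max + 1)
        return [edges] * len(values), [edges]
    bin_width = max(1, ceil((t_max - t_min) / bins))
    edges = [i * bin_width + t_min for i in range(bins + 1)]
    if edges[-1] <= t_max:
        edges[-1] = t_max + 1  # make sure the top bin includes the maximum
    indexes = [
        max(0, min(floor((v - t_min) / bin_width), len(edges) - 2)) for v in values
    ]
    return [(edges[i], edges[i + 1]) for i in indexes], list(zip(edges[:-1], edges[1:]))


print(cut([5, 5, 5], 3))     # constant input no longer raises ZeroDivisionError
print(cut([1, 2, 3, 4], 2))  # ([(1, 3), (1, 3), (3, 5), (3, 5)], [(1, 3), (3, 5)])
```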
34 changes: 25 additions & 9 deletions llmeter/plotting/plotting.py
@@ -234,7 +234,9 @@ def stat_clients(load_test_result: LoadTestResult, stat: str, **scatter_kwargs):
return go.Scatter(**default_kwargs)


def error_clients_fig(load_test_result: LoadTestResult, log_scale=False, **scatter_kwargs):
def error_clients_fig(
load_test_result: LoadTestResult, log_scale=False, **scatter_kwargs
):
"""
Create a figure showing error rate vs number of clients.

@@ -247,7 +249,9 @@ def error_clients_fig(load_test_result: LoadTestResult, log_scale=False, **scatt
plotly.graph_objects.Figure: Figure showing error rate vs number of clients
"""
fig = go.Figure()
fig.add_trace(stat_clients(load_test_result, "failed_requests_rate", **scatter_kwargs))
fig.add_trace(
stat_clients(load_test_result, "failed_requests_rate", **scatter_kwargs)
)
fig.update_layout(
title="Error rate vs number of clients",
xaxis_title="Number of clients",
Expand All @@ -261,7 +265,9 @@ def error_clients_fig(load_test_result: LoadTestResult, log_scale=False, **scatt
return fig


def rpm_clients_fig(load_test_result: LoadTestResult, log_scale=False, **scatter_kwargs):
def rpm_clients_fig(
load_test_result: LoadTestResult, log_scale=False, **scatter_kwargs
):
"""
Create a figure showing requests per minute vs number of clients.

@@ -274,7 +280,9 @@ def rpm_clients_fig(load_test_result: LoadTestResult, log_scale=False, **scatter
plotly.graph_objects.Figure: Figure showing requests per minute vs number of clients
"""
fig = go.Figure()
fig.add_trace(stat_clients(load_test_result, "requests_per_minute", **scatter_kwargs))
fig.add_trace(
stat_clients(load_test_result, "requests_per_minute", **scatter_kwargs)
)
fig.update_layout(
title="Requests per minute vs number of clients",
xaxis_title="Number of clients",
Expand All @@ -295,7 +303,9 @@ def average_input_tokens_clients_fig(
):
fig = go.Figure()
fig.add_trace(
stat_clients(load_test_result, "average_input_tokens_per_minute", **scatter_kwargs)
stat_clients(
load_test_result, "average_input_tokens_per_minute", **scatter_kwargs
)
)
fig.update_layout(
title="Average input tokens per minute vs number of clients",
@@ -317,7 +327,9 @@ def average_output_tokens_clients_fig(
):
fig = go.Figure()
fig.add_trace(
stat_clients(load_test_result, "average_output_tokens_per_minute", **scatter_kwargs)
stat_clients(
load_test_result, "average_output_tokens_per_minute", **scatter_kwargs
)
)
fig.update_layout(
title="Average output tokens per minute vs number of clients",
@@ -439,14 +451,18 @@ def plot_load_test_results(
Returns:
dict: Dictionary containing the following plots:
- time_to_first_token: Figure showing time to first token vs number of clients
- time_to_last_token: Figure showing time to last token vs number of clients
- time_to_last_token: Figure showing time to last token vs number of clients
- requests_per_minute: Figure showing requests per minute vs number of clients
- error_rate: Figure showing error rate vs number of clients
- average_input_tokens_clients: Figure showing average input tokens per minute vs clients
- average_output_tokens_clients: Figure showing average output tokens per minute vs clients
"""
f1 = latency_clients_fig(load_test_result, "time_to_first_token", log_scale=log_scale)
f2 = latency_clients_fig(load_test_result, "time_to_last_token", log_scale=log_scale)
f1 = latency_clients_fig(
load_test_result, "time_to_first_token", log_scale=log_scale
)
f2 = latency_clients_fig(
load_test_result, "time_to_last_token", log_scale=log_scale
)
f3 = rpm_clients_fig(load_test_result, log_scale=log_scale)
f4 = error_clients_fig(load_test_result, log_scale=log_scale)
f5 = average_input_tokens_clients_fig(load_test_result, log_scale=log_scale)
4 changes: 1 addition & 3 deletions llmeter/results.py
@@ -176,9 +176,7 @@ def load(cls, result_path: os.PathLike | str):
for key in ["start_time", "end_time"]:
if key in summary and summary[key] and isinstance(summary[key], str):
try:
summary[key] = datetime.fromisoformat(
summary[key]
)
summary[key] = datetime.fromisoformat(summary[key])
except ValueError:
pass
return cls(responses=responses, **summary)
7 changes: 6 additions & 1 deletion llmeter/utils.py
@@ -24,7 +24,12 @@ class DeferredError:
"""

def __init__(self, exception):
self.exc = exception
# Ensure the exception is a BaseException instance
if isinstance(exception, BaseException):
self.exc = exception
else:
# If it's not a BaseException, wrap it in an ImportError
self.exc = ImportError(str(exception))
Reviewer comment (Contributor) on lines +30 to +32: Although we use it only for ImportErrors so far, this class was originally written to be generic. The need for this if/else is arising from inconsistent usage across LLMeter:

  • In some cases e.g. plotly in plotting.py, we use it as originally documented - to store a ModuleNotFoundError/ImportError.
  • In others e.g. kaleido in plotting.py, we pass in a message string instead.

My asks would probably be to:

  1. Standardize our usage one way or the other, unless we have good reasons not to, and
  2. If we still want to support both string and error in here, then either
    • Create a more generic base class like Exception in the else clause, OR
    • Change the name and docstring of this DeferredError (e.g. DeferredImportError?) to indicate that it's only intended for import errors.
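(A sketch of the second option — keeping the class generic and wrapping bare strings in a plain Exception — offered only as an illustration of the reviewer's suggestion, not the project's decided direction.)

```python
class DeferredError:
    """Store an exception (or a message) now; raise it only when the object is used.

    Generic variant: a bare string is wrapped in a plain Exception rather than
    assuming every deferred failure is an import problem.
    """

    def __init__(self, exception: BaseException | str):
        if isinstance(exception, BaseException):
            self.exc = exception
        else:
            self.exc = Exception(str(exception))

    def __getattr__(self, name):
        # Any attribute access surfaces the stored error.
        raise self.exc


# e.g. callers could then do either of:
# plotly = DeferredError(ModuleNotFoundError("plotly is required for plotting"))
# kaleido = DeferredError("kaleido is required to export static images")
```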


def __getattr__(self, name):
"""Called by Python interpreter before using any method or property on the object.
2 changes: 2 additions & 0 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
Reviewer comment (Contributor): For me, ruff format picks up & adds the missing trailing newline in this and tests/endpoint/__init__.py... Does it not for you?
