3 changes: 3 additions & 0 deletions .gitignore
@@ -2,6 +2,9 @@
examples/outputs/
test_output/

# Test files generated during testing
test_file.json*

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
1 change: 1 addition & 0 deletions llmeter/callbacks/cost/results.py
@@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0
# Python Built-Ins:
from __future__ import annotations

from numbers import Number
from typing import Literal, Sequence

11 changes: 6 additions & 5 deletions llmeter/callbacks/mlflow.py
@@ -15,28 +15,29 @@

class MlflowCallback(Callback):
"""LLMeter callback to log test run parameters and metrics to an MLFlow tracking server.

This callback integrates with MLflow to track and log parameters and metrics from LLMeter test runs.
It can operate either in the current MLflow run context or create nested runs for each test.

Example:
```python
import mlflow
from llmeter.callbacks import MlflowCallback

with mlflow.start_run():
runner = Runner(
endpoint=endpoint,
callbacks=[MlflowCallback()]
)
results = await runner.run()
```

Attributes:
step (int | None): Step number for MLflow metrics logging
nested (bool): Whether to create nested runs for each test
parameters_names (list): List of parameter names to log to MLflow
"""

parameters_names = [
"total_requests",
"clients",
@@ -59,7 +60,7 @@ def __init__(self, step: int | None = None, nested: bool = False) -> None:
`Result.run_name`.

Raises:
ImportError: If MLflow is not installed
ImportError: If MLflow is not installed
"""
super().__init__()
self.step = step
6 changes: 5 additions & 1 deletion llmeter/endpoints/__init__.py
@@ -6,7 +6,11 @@

spec = importlib.util.find_spec("openai")
if spec:
from .openai import OpenAIEndpoint, OpenAICompletionEndpoint , OpenAICompletionStreamEndpoint # noqa: F401
from .openai import (
OpenAIEndpoint as OpenAIEndpoint,
OpenAICompletionEndpoint as OpenAICompletionEndpoint,
OpenAICompletionStreamEndpoint as OpenAICompletionStreamEndpoint,
)

spec = importlib.util.find_spec("litellm")
if spec:
65 changes: 50 additions & 15 deletions llmeter/endpoints/litellm.py
@@ -79,16 +79,19 @@ class LiteLLM(LiteLLMBase):
def invoke(self, payload, **kwargs):
try:
response = completion(model=self.litellm_model, **payload, **kwargs)
assert isinstance(response, ModelResponse)
if not isinstance(response, ModelResponse):
raise ValueError(f"Expected ModelResponse, got {type(response)}")
response = self._parse_converse_response(response)
response.input_prompt = self._parse_payload(payload)
return response

except Exception as e:
logger.exception(e)
return InvocationResponse.error_output(
id=uuid4().hex, error=str(e), input_prompt=self._parse_payload(payload)
response = InvocationResponse.error_output(
input_payload=payload, error=e, id=uuid4().hex
Reviewer comment (Contributor): Looks like runner.py, endpoints/bedrock.py, endpoints/openai.py, and endpoints/sagemaker.py all still have some cases using error=str(e) - do we care enough to fix that consistently and add an Exception | str | None type annotation to InvocationResponse.error_output()'s definition?
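(For illustration only — a minimal sketch of what that annotation could look like. The field names below are guesses based on how error_output() is called in this diff, not the actual InvocationResponse definition.)

```python
# Hypothetical sketch: error_output() accepting Exception | str | None and
# normalizing to a string internally, so call sites can pass either form.
from dataclasses import dataclass
from typing import Any


@dataclass
class InvocationResponse:  # assumed, simplified fields
    id: str
    error: str | None = None
    input_payload: dict[str, Any] | None = None
    input_prompt: str | None = None

    @classmethod
    def error_output(
        cls,
        id: str,
        error: Exception | str | None = None,
        input_payload: dict[str, Any] | None = None,
    ) -> "InvocationResponse":
        # Normalize exceptions to their string form once, in a single place.
        return cls(
            id=id,
            error=None if error is None else str(error),
            input_payload=input_payload,
        )
```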

)
response.input_prompt = self._parse_payload(payload)
return response

def _parse_converse_response(
self, client_response: ModelResponse
@@ -102,29 +105,52 @@ def _parse_converse_response(
response.num_tokens_input = usage.prompt_tokens
response.num_tokens_output = usage.completion_tokens
except AttributeError:
pass
response.num_tokens_input = None
response.num_tokens_output = None
Reviewer comment (Contributor) on lines -105 to +109: Looking at the InvocationResponse definition, these fields should default to None anyway. Why do we need to set them here?
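(The point presumably rests on defaults along these lines — an assumed, simplified shape of the relevant fields, not the real definition — in which case the except branch could simply pass.)

```python
# Assumed shape only: if the token counts already default to None, re-assigning
# them in the AttributeError branch is a no-op.
from dataclasses import dataclass


@dataclass
class InvocationResponse:
    num_tokens_input: int | None = None
    num_tokens_output: int | None = None
```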


return response


class LiteLLMStreaming(LiteLLMBase):
def invoke(self, payload, **kwargs):
if ("stream" not in kwargs) or ("stream" not in payload):
kwargs["stream"] = True

if ("stream_options" not in kwargs) or ("stream_options" not in payload):
kwargs["stream_options"] = {"include_usage": True}
# Make a copy of payload to avoid modifying the original
payload_copy = payload.copy()

# Create a clean kwargs dict without conflicting parameters
clean_kwargs = {}
for key, value in kwargs.items():
if key not in ["stream", "stream_options"]:
clean_kwargs[key] = value

# Ensure streaming is enabled
payload_copy["stream"] = True

# Handle stream_options - merge if exists in kwargs, otherwise set default
if "stream_options" in kwargs:
existing_options = kwargs.get("stream_options", {})
payload_copy["stream_options"] = {**existing_options, "include_usage": True}
Reviewer comment (Contributor) on lines +130 to +131: Should we merge in case there are some stream_options defined in payload as well as the kwargs? Currently we're supporting passing in either way, but not mixing.

Could be e.g.

existing_kwargs_options = kwargs["stream_options"]
existing_payload_options = payload_copy.get("stream_options", {})
payload_copy["stream_options"] = {
    **existing_payload_options,
    **existing_kwargs_options,
    "include_usage": True,
}

elif "stream_options" not in payload_copy:
payload_copy["stream_options"] = {"include_usage": True}
else:
# Merge with existing stream_options in payload if present
existing_options = payload_copy.get("stream_options", {})
payload_copy["stream_options"] = {**existing_options, "include_usage": True}

try:
start_t = time.perf_counter()
response = completion(model=self.litellm_model, **payload, **kwargs)
response = completion(
model=self.litellm_model, **payload_copy, **clean_kwargs
)
except Exception as e:
logger.exception(e)
return InvocationResponse.error_output(
id=uuid4().hex, error=str(e), input_prompt=self._parse_payload(payload)
response = InvocationResponse.error_output(
input_payload=payload, error=e, id=uuid4().hex
)
response.input_prompt = self._parse_payload(payload)
return response

assert isinstance(response, CustomStreamWrapper)
if not isinstance(response, CustomStreamWrapper):
raise ValueError(f"Expected CustomStreamWrapper, got {type(response)}")
response = self._parse_stream(response, start_t)
response.input_prompt = self._parse_payload(payload)
return response
@@ -136,12 +162,21 @@ def _parse_stream(
time_flag = True
time_to_first_token = None
output_text = ""
id = None

for chunk in client_response:
output_text += chunk.choices[0].delta.content or "" # type: ignore
if time_flag:
content = chunk.choices[0].delta.content or "" # type: ignore
output_text += content

# Record time to first token only when we get actual content
if time_flag and content:
time_to_first_token = time.perf_counter() - start_t
time_flag = False

# Always capture the ID from the first chunk
if id is None:
id = chunk.id

try:
usage = chunk.usage # type: ignore
except AttributeError:
20 changes: 15 additions & 5 deletions llmeter/experiments.py
@@ -51,8 +51,8 @@ def plot_results(self, show: bool = True, format: Literal["html", "png"] = "html
for i, (_, f) in enumerate(figs.items()):
f.update_layout(colorway=c_seqs[i % len(c_seqs)])

output_path = Path(self.output_path)
if output_path:
if self.output_path is not None:
output_path = Path(self.output_path)
# save figure to the output path
output_path.parent.mkdir(parents=True, exist_ok=True)
for k, f in figs.items():
@@ -66,7 +66,9 @@ def plot_results(self, show: bool = True, format: Literal["html", "png"] = "html
return figs

@classmethod
def load(cls, load_path: Path | str | None, test_name: str | None = None) -> "LoadTestResult":
def load(
cls, load_path: Path | str | None, test_name: str | None = None
) -> "LoadTestResult":
"""Load test results from a directory.

Args:
@@ -137,6 +139,7 @@ async def run(self, output_path: os.PathLike | None = None):
n_requests=self._get_n_requests(c),
run_name=f"{c:05.0f}-clients",
callbacks=self.callbacks,
output_path=output_path,
)
for c in tqdm(
self.sequence_of_clients, desc="Configurations", disable=_disable_tqdm
@@ -216,19 +219,26 @@ def __post_init__(self) -> None:

self._runner = Runner(
endpoint=self.endpoint,
output_path=Path(self.output_path),
output_path=Path(self.output_path)
if self.output_path is not None
else None,
tokenizer=self.tokenizer,
)

async def run(self, output_path=None):
# Handle None output_path properly
final_output_path = output_path or self.output_path
if final_output_path is not None:
final_output_path = Path(final_output_path)

heatmap_results = await self._runner.run(
payload=self.payload,
clients=self.clients,
n_requests=len(self.input_lengths)
* len(self.output_lengths)
* self.requests_per_combination
// self.clients,
output_path=Path(output_path) or self.output_path,
output_path=final_output_path,
)
self._results = heatmap_results
return heatmap_results
28 changes: 25 additions & 3 deletions llmeter/plotting/heatmap.py
@@ -131,10 +131,32 @@ def _cut(arr, bins):
[Interval(1,2.5), Interval(2.5,4)])
"""
t_max, t_min = max(arr), min(arr)
bin_width = ceil((t_max - t_min) / bins)

# Handle edge case where all values are the same
if t_max == t_min:
# Create a single bin that contains all values
bin_boundaries = [t_min, t_max + 1] # Add small offset to create valid interval
bin_indexes = [0] * len(arr) # All values go to first bin
return [Interval(bin_boundaries[0], bin_boundaries[1]) for _ in arr], [
Interval(bin_boundaries[0], bin_boundaries[1])
]

# Calculate bin width, ensuring we don't get zero width
bin_width = max(1, ceil((t_max - t_min) / bins))
bin_boundaries = [i * bin_width + t_min for i in range(bins + 1)]
bin_indexes = [floor((k - t_min) / bin_width) for k in arr]
# binned = [Interval(bin_boundaries[k], bin_boundaries[k + 1]) for k in bin_indexes]

# Ensure the last boundary covers the maximum value
if bin_boundaries[-1] <= t_max:
bin_boundaries[-1] = t_max + 1

# Calculate bin indexes, ensuring they're within valid range
bin_indexes = []
for k in arr:
idx = min(floor((k - t_min) / bin_width), bins - 1)
# Ensure index is valid
idx = max(0, min(idx, len(bin_boundaries) - 2))
bin_indexes.append(idx)

return [Interval(bin_boundaries[k], bin_boundaries[k + 1]) for k in bin_indexes], [
Interval(left, right)
for left, right in zip(bin_boundaries[:-1], bin_boundaries[1:])
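For context on the _cut changes above: with constant input the old code computes bin_width = 0 and then divides by zero when assigning bin indexes. Below is a minimal standalone sketch of the same binning idea (plain tuples instead of llmeter's Interval class, so it runs on its own); it is an illustration of the fix, not the project's implementation.

```python
from math import ceil, floor


def cut(values, bins):
    """Bin values into `bins` integer-width intervals, tolerating constant input."""
    t_max, t_min = max(values), min(values)
    if t_max == t_min:
        # Degenerate case: one bin covering everything. Without this guard,
        # bin_width would be 0 and the index computation divides by zero.
        edges = (t_min, t_max + 1)
        return [edges] * len(values), [edges]
    bin_width = max(1, ceil((t_max - t_min) / bins))
    edges = [i * bin_width + t_min for i in range(bins + 1)]
    if edges[-1] <= t_max:
        edges[-1] = t_max + 1  # make sure the top bin includes the maximum
    indexes = [
        max(0, min(floor((v - t_min) / bin_width), len(edges) - 2)) for v in values
    ]
    return [(edges[i], edges[i + 1]) for i in indexes], list(zip(edges[:-1], edges[1:]))


print(cut([5, 5, 5], 3))     # constant input no longer raises ZeroDivisionError
print(cut([1, 2, 3, 4], 2))  # ([(1, 3), (1, 3), (3, 5), (3, 5)], [(1, 3), (3, 5)])
```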
34 changes: 25 additions & 9 deletions llmeter/plotting/plotting.py
@@ -234,7 +234,9 @@ def stat_clients(load_test_result: LoadTestResult, stat: str, **scatter_kwargs):
return go.Scatter(**default_kwargs)


def error_clients_fig(load_test_result: LoadTestResult, log_scale=False, **scatter_kwargs):
def error_clients_fig(
load_test_result: LoadTestResult, log_scale=False, **scatter_kwargs
):
"""
Create a figure showing error rate vs number of clients.

@@ -247,7 +249,9 @@ def error_clients_fig(load_test_result: LoadTestResult, log_scale=False, **scatt
plotly.graph_objects.Figure: Figure showing error rate vs number of clients
"""
fig = go.Figure()
fig.add_trace(stat_clients(load_test_result, "failed_requests_rate", **scatter_kwargs))
fig.add_trace(
stat_clients(load_test_result, "failed_requests_rate", **scatter_kwargs)
)
fig.update_layout(
title="Error rate vs number of clients",
xaxis_title="Number of clients",
Expand All @@ -261,7 +265,9 @@ def error_clients_fig(load_test_result: LoadTestResult, log_scale=False, **scatt
return fig


def rpm_clients_fig(load_test_result: LoadTestResult, log_scale=False, **scatter_kwargs):
def rpm_clients_fig(
load_test_result: LoadTestResult, log_scale=False, **scatter_kwargs
):
"""
Create a figure showing requests per minute vs number of clients.

@@ -274,7 +280,9 @@ def rpm_clients_fig(load_test_result: LoadTestResult, log_scale=False, **scatter
plotly.graph_objects.Figure: Figure showing requests per minute vs number of clients
"""
fig = go.Figure()
fig.add_trace(stat_clients(load_test_result, "requests_per_minute", **scatter_kwargs))
fig.add_trace(
stat_clients(load_test_result, "requests_per_minute", **scatter_kwargs)
)
fig.update_layout(
title="Requests per minute vs number of clients",
xaxis_title="Number of clients",
Expand All @@ -295,7 +303,9 @@ def average_input_tokens_clients_fig(
):
fig = go.Figure()
fig.add_trace(
stat_clients(load_test_result, "average_input_tokens_per_minute", **scatter_kwargs)
stat_clients(
load_test_result, "average_input_tokens_per_minute", **scatter_kwargs
)
)
fig.update_layout(
title="Average input tokens per minute vs number of clients",
@@ -317,7 +327,9 @@ def average_output_tokens_clients_fig(
):
fig = go.Figure()
fig.add_trace(
stat_clients(load_test_result, "average_output_tokens_per_minute", **scatter_kwargs)
stat_clients(
load_test_result, "average_output_tokens_per_minute", **scatter_kwargs
)
)
fig.update_layout(
title="Average output tokens per minute vs number of clients",
@@ -439,14 +451,18 @@ def plot_load_test_results(
Returns:
dict: Dictionary containing the following plots:
- time_to_first_token: Figure showing time to first token vs number of clients
- time_to_last_token: Figure showing time to last token vs number of clients
- time_to_last_token: Figure showing time to last token vs number of clients
- requests_per_minute: Figure showing requests per minute vs number of clients
- error_rate: Figure showing error rate vs number of clients
- average_input_tokens_clients: Figure showing average input tokens per minute vs clients
- average_output_tokens_clients: Figure showing average output tokens per minute vs clients
"""
f1 = latency_clients_fig(load_test_result, "time_to_first_token", log_scale=log_scale)
f2 = latency_clients_fig(load_test_result, "time_to_last_token", log_scale=log_scale)
f1 = latency_clients_fig(
load_test_result, "time_to_first_token", log_scale=log_scale
)
f2 = latency_clients_fig(
load_test_result, "time_to_last_token", log_scale=log_scale
)
f3 = rpm_clients_fig(load_test_result, log_scale=log_scale)
f4 = error_clients_fig(load_test_result, log_scale=log_scale)
f5 = average_input_tokens_clients_fig(load_test_result, log_scale=log_scale)
4 changes: 1 addition & 3 deletions llmeter/results.py
@@ -176,9 +176,7 @@ def load(cls, result_path: os.PathLike | str):
for key in ["start_time", "end_time"]:
if key in summary and summary[key] and isinstance(summary[key], str):
try:
summary[key] = datetime.fromisoformat(
summary[key]
)
summary[key] = datetime.fromisoformat(summary[key])
except ValueError:
pass
return cls(responses=responses, **summary)
7 changes: 6 additions & 1 deletion llmeter/utils.py
@@ -24,7 +24,12 @@ class DeferredError:
"""

def __init__(self, exception):
self.exc = exception
# Ensure the exception is a BaseException instance
if isinstance(exception, BaseException):
self.exc = exception
else:
# If it's not a BaseException, wrap it in an ImportError
self.exc = ImportError(str(exception))
Reviewer comment (Contributor) on lines +30 to +32: Although we use it only for ImportErrors so far, this class was originally written to be generic. The need for this if/else is arising from inconsistent usage across LLMeter:

  • In some cases e.g. plotly in plotting.py, we use it as originally documented - to store a ModuleNotFoundError/ImportError.
  • In others e.g. kaleido in plotting.py, we pass in a message string instead.

My asks would probably be to:

  1. Standardize our usage one way or the other, unless we have good reasons not to, and
  2. If we still want to support both string and error in here, then either
    • Create a more generic base class like Exception in the else clause, OR
    • Change the name and docstring of this DeferredError (e.g. DeferredImportError?) to indicate that it's only intended for import errors.
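(A sketch of the second option — keeping the class generic and wrapping bare strings in a plain Exception — offered only as an illustration of the reviewer's suggestion, not the project's decided direction.)

```python
class DeferredError:
    """Store an exception (or a message) now; raise it only when the object is used.

    Generic variant: a bare string is wrapped in a plain Exception rather than
    assuming every deferred failure is an import problem.
    """

    def __init__(self, exception: BaseException | str):
        if isinstance(exception, BaseException):
            self.exc = exception
        else:
            self.exc = Exception(str(exception))

    def __getattr__(self, name):
        # Any attribute access surfaces the stored error.
        raise self.exc


# e.g. callers could then do either of:
# plotly = DeferredError(ModuleNotFoundError("plotly is required for plotting"))
# kaleido = DeferredError("kaleido is required to export static images")
```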


def __getattr__(self, name):
"""Called by Python interpreter before using any method or property on the object.
2 changes: 2 additions & 0 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
Reviewer comment (Contributor): For me, ruff format picks up & adds the missing trailing newline in this and tests/endpoint/__init__.py... Does it not for you?
