2,856 changes: 371 additions & 2,485 deletions pylock.toml

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions pyproject.toml
@@ -69,6 +69,7 @@ dependencies = [
"pyyaml>=6.0.0",
"rich",
"sanic",
"tabulate",
"transformers",
"uvloop>=0.18",
"torch",
@@ -129,6 +130,7 @@ dev = [
"mdformat-gfm~=0.3.6",

# type-checking
"pandas-stubs",
"types-PyYAML~=6.0.1",
"types-requests~=2.32.0",
"types-toml",
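Note: the new tabulate runtime dependency points at tabular console output. A minimal sketch of the library's core call, purely illustrative and not taken from this PR (the row data and headers are invented):

from tabulate import tabulate

# Illustrative only: render benchmark-style rows as a plain-text table.
rows = [
    ["synchronous", 12.4, 0.81],
    ["throughput", 148.9, 0.07],
]
print(tabulate(rows, headers=["strategy", "req/s", "mean latency (s)"], tablefmt="github"))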
173 changes: 30 additions & 143 deletions src/guidellm/__main__.py
@@ -45,26 +45,12 @@
reimport_benchmarks_report,
)
from guidellm.mock_server import MockServer, MockServerConfig
from guidellm.preprocess.dataset import ShortPromptStrategy, process_dataset
from guidellm.scheduler import StrategyType
from guidellm.schemas import GenerativeRequestType
from guidellm.settings import print_config
from guidellm.utils import Console, DefaultGroupHandler, get_literal_vals
from guidellm.utils import cli as cli_tools

__all__ = [
"STRATEGY_PROFILE_CHOICES",
"benchmark",
"cli",
"config",
"dataset",
"decode_escaped_str",
"from_file",
"mock_server",
"preprocess",
"run",
]

STRATEGY_PROFILE_CHOICES: list[str] = list(get_literal_vals(ProfileType | StrategyType))
"""Available strategy and profile type choices for benchmark execution."""

@@ -256,7 +242,7 @@ def benchmark():
help="Number of worker processes for data loading.",
)
@click.option(
"--dataloader_kwargs",
"--dataloader-kwargs",
default=BenchmarkGenerativeTextArgs.get_default("dataloader_kwargs"),
callback=cli_tools.parse_json,
help="JSON string of arguments to pass to the dataloader constructor.",
@@ -305,22 +291,45 @@ def benchmark():
"--warmup",
"--warmup-percent", # legacy alias
"warmup",
type=float,
default=BenchmarkGenerativeTextArgs.get_default("warmup"),
callback=cli_tools.parse_json,
help=(
"Warmup specification: if in (0,1) = percent, if >=1 = number of "
"requests/seconds (depends on active constraint)."
"Warmup specification: int, float, or dict as string "
"(json or key=value). "
"Controls time or requests before measurement starts. "
"Numeric in (0, 1): percent of duration or request count. "
"Numeric >=1: duration in seconds or request count. "
"Advanced config: see TransientPhaseConfig schema."
),
)
@click.option(
"--cooldown",
"--cooldown-percent", # legacy alias
"cooldown",
type=float,
default=BenchmarkGenerativeTextArgs.get_default("cooldown"),
callback=cli_tools.parse_json,
help=(
"Cooldown specification: int, float, or dict as string "
"(json or key=value). "
"Controls time or requests after measurement ends. "
"Numeric in (0, 1): percent of duration or request count. "
"Numeric >=1: duration in seconds or request count. "
"Advanced config: see TransientPhaseConfig schema."
),
)
@click.option(
"--rampup",
default=BenchmarkGenerativeTextArgs.get_default("rampup"),
callback=cli_tools.parse_json,
help=(
"Cooldown specification: if in (0,1) = percent, if >=1 = number of "
"requests/seconds (depends on active constraint)."
"Rampup specification: int, float, or dict as string "
"(json or key=value). "
"Controls time to linearly ramp up requests. "
"Only for Throughput/Concurrent strategies, "
"not Synchronous/Rate-based. "
"Numeric in (0, 1): percent of duration. "
"Numeric >=1: duration in seconds. "
"Advanced config: see TransientPhaseConfig schema."
),
)
@click.option(
@@ -469,128 +478,6 @@ def preprocess():
"""Dataset preprocessing utilities."""


@preprocess.command(
"dataset",
help=(
"Process a dataset to have specific prompt and output token sizes. "
"Supports multiple strategies for handling prompts and optional "
"Hugging Face Hub upload.\n\n"
"DATA: Path to the input dataset or dataset ID.\n\n"
"OUTPUT_PATH: Path to save the processed dataset, including file suffix."
),
context_settings={"auto_envvar_prefix": "GUIDELLM"},
)
@click.argument(
"data",
type=str,
required=True,
)
@click.argument(
"output_path",
type=click.Path(file_okay=True, dir_okay=False, writable=True, resolve_path=True),
required=True,
)
@click.option(
"--processor",
type=str,
required=True,
help="Processor or tokenizer name for calculating token counts.",
)
@click.option(
"--processor-args",
default=None,
callback=cli_tools.parse_json,
help="JSON string of arguments to pass to the processor constructor.",
)
@click.option(
"--data-args",
callback=cli_tools.parse_json,
help="JSON string of arguments to pass to dataset creation.",
)
@click.option(
"--short-prompt-strategy",
type=click.Choice([s.value for s in ShortPromptStrategy]),
default=ShortPromptStrategy.IGNORE.value,
show_default=True,
help="Strategy for handling prompts shorter than target length.",
)
@click.option(
"--pad-char",
type=str,
default="",
callback=decode_escaped_str,
help="Character to pad short prompts with when using 'pad' strategy.",
)
@click.option(
"--concat-delimiter",
type=str,
default="",
help=(
"Delimiter for concatenating short prompts (used with 'concatenate' strategy)."
),
)
@click.option(
"--prompt-tokens",
type=str,
default=None,
help="Prompt tokens configuration (JSON, YAML file, or key=value string).",
)
@click.option(
"--output-tokens",
type=str,
default=None,
help="Output tokens configuration (JSON, YAML file, or key=value string).",
)
@click.option(
"--push-to-hub",
is_flag=True,
help="Push the processed dataset to Hugging Face Hub.",
)
@click.option(
"--hub-dataset-id",
type=str,
default=None,
help=("Hugging Face Hub dataset ID for upload (required if --push-to-hub is set)."),
)
@click.option(
"--random-seed",
type=int,
default=42,
show_default=True,
help="Random seed for reproducible token sampling.",
)
def dataset(
data,
output_path,
processor,
processor_args,
data_args,
short_prompt_strategy,
pad_char,
concat_delimiter,
prompt_tokens,
output_tokens,
push_to_hub,
hub_dataset_id,
random_seed,
):
process_dataset(
data=data,
output_path=output_path,
processor=processor,
prompt_tokens=prompt_tokens,
output_tokens=output_tokens,
processor_args=processor_args,
data_args=data_args,
short_prompt_strategy=short_prompt_strategy,
pad_char=pad_char,
concat_delimiter=concat_delimiter,
push_to_hub=push_to_hub,
hub_dataset_id=hub_dataset_id,
random_seed=random_seed,
)


@cli.command(
"mock-server",
help=(
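Note: the reworked --warmup, --cooldown, and --rampup options above replace the old float-only percent values with int, float, or dict inputs parsed by cli_tools.parse_json. A minimal sketch of how the numeric case described in the new help text behaves; the helper name is hypothetical, and dict inputs are governed by the TransientPhaseConfig schema, which is not shown in this diff:

def interpret_phase(value: float, total: float) -> float:
    """Illustrative helper (not guidellm code).

    Values in (0, 1) are read as a fraction of the active constraint
    (duration or request count); values >= 1 are read as an absolute
    number of seconds or requests.
    """
    if 0 < value < 1:
        return value * total
    return value

# With a 600-second benchmark: interpret_phase(0.1, 600) -> 60.0 seconds,
# while interpret_phase(30, 600) -> 30.0 seconds.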
3 changes: 1 addition & 2 deletions src/guidellm/backends/backend.py
@@ -102,9 +102,8 @@ def requests_limit(self) -> int | None:
return None

@abstractmethod
async def default_model(self) -> str | None:
async def default_model(self) -> str:
"""
:return: The default model name or identifier for generation requests,
None if no default model is available
"""
...
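Note: with default_model now declared to return a plain str, implementations are expected to signal "no model" with an empty string rather than None, as the OpenAI backend below does. A minimal runnable sketch of the tightened contract, using a made-up stand-in class rather than a real Backend subclass:

import asyncio

class FakeBackend:
    """Hypothetical stand-in, only to illustrate the new return type."""

    async def available_models(self) -> list[str]:
        return []  # pretend the server advertises no models

    async def default_model(self) -> str:
        models = await self.available_models()
        # New contract: return an empty string, never None, when nothing is known.
        return models[0] if models else ""

print(repr(asyncio.run(FakeBackend().default_model())))  # -> ''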
63 changes: 21 additions & 42 deletions src/guidellm/backends/openai.py
@@ -18,10 +18,7 @@
import httpx

from guidellm.backends.backend import Backend
from guidellm.backends.response_handlers import (
GenerationResponseHandler,
GenerationResponseHandlerFactory,
)
from guidellm.backends.response_handlers import GenerationResponseHandlerFactory
from guidellm.schemas import GenerationRequest, GenerationResponse, RequestInfo

__all__ = ["OpenAIHTTPBackend"]
@@ -54,7 +51,7 @@ class OpenAIHTTPBackend(Backend):
def __init__(
self,
target: str,
model: str | None = None,
model: str = "",
api_routes: dict[str, str] | None = None,
response_handlers: dict[str, Any] | None = None,
timeout: float = 60.0,
@@ -192,7 +189,7 @@ async def available_models(self) -> list[str]:

return [item["id"] for item in response.json()["data"]]

async def default_model(self) -> str | None:
async def default_model(self) -> str:
"""
Get the default model for this backend.

@@ -202,9 +199,9 @@ async def default_model(self) -> str | None:
return self.model

models = await self.available_models()
return models[0] if models else None
return models[0] if models else ""

async def resolve(
async def resolve( # type: ignore[override]
self,
request: GenerationRequest,
request_info: RequestInfo,
@@ -230,11 +227,9 @@ async def resolve(
if history is not None:
raise NotImplementedError("Multi-turn requests not yet supported")

response_handler = self._resolve_response_handler(
request_type=request.request_type
)
if (request_path := self.api_routes.get(request.request_type)) is None:
raise ValueError(f"Unsupported request type '{request.request_type}'")

request_url = f"{self.target}/{request_path}"
request_files = (
{
@@ -246,6 +241,9 @@ async def resolve(
)
request_json = request.arguments.body if not request_files else None
request_data = request.arguments.body if request_files else None
response_handler = GenerationResponseHandlerFactory.create(
request.request_type, handler_overrides=self.response_handlers
)

if not request.arguments.stream:
request_info.timings.request_start = time.time()
@@ -282,24 +280,22 @@ async def resolve(
async for chunk in stream.aiter_lines():
iter_time = time.time()

if (
(iterations := response_handler.add_streaming_line(chunk))
is None
or iterations < 0
or end_reached
):
if request_info.timings.first_request_iteration is None:
request_info.timings.first_request_iteration = iter_time
request_info.timings.last_request_iteration = iter_time
request_info.timings.request_iterations += 1

iterations = response_handler.add_streaming_line(chunk)
if iterations is None or iterations <= 0 or end_reached:
end_reached = end_reached or iterations is None
continue

if (
request_info.timings.first_iteration is None
or request_info.timings.iterations is None
):
request_info.timings.first_iteration = iter_time
request_info.timings.iterations = 0
if request_info.timings.first_token_iteration is None:
request_info.timings.first_token_iteration = iter_time
request_info.timings.token_iterations = 0

request_info.timings.last_iteration = iter_time
request_info.timings.iterations += iterations
request_info.timings.last_token_iteration = iter_time
request_info.timings.token_iterations += iterations

request_info.timings.request_end = time.time()
yield response_handler.compile_streaming(request), request_info
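Note: the streaming loop above renames the per-token timing fields (first_iteration -> first_token_iteration, iterations -> token_iterations, last_iteration -> last_token_iteration) and adds per-line request-iteration counters. A sketch of one way such fields could be consumed downstream; the field names come from this diff, but the metric formulas are assumptions, not guidellm's actual aggregation code:

def summarize(timings) -> dict:
    """Illustrative only: derive latency figures from the renamed fields."""
    ttft = (
        timings.first_token_iteration - timings.request_start
        if timings.first_token_iteration is not None
        else None
    )
    itl = (
        (timings.last_token_iteration - timings.first_token_iteration)
        / max(timings.token_iterations - 1, 1)
        if timings.token_iterations
        else None
    )
    return {"time_to_first_token": ttft, "inter_token_latency": itl}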
@@ -336,20 +332,3 @@ def _resolve_validate_kwargs(
validate_kwargs["method"] = "GET"

return validate_kwargs

def _resolve_response_handler(self, request_type: str) -> GenerationResponseHandler:
if (
self.response_handlers is not None
and (handler := self.response_handlers.get(request_type)) is not None
):
return handler

handler_class = GenerationResponseHandlerFactory.get_registered_object(
request_type
)
if handler_class is None:
raise ValueError(
f"No response handler registered for request type '{request_type}'"
)

return handler_class()
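Note: the removed _resolve_response_handler helper above is replaced by a direct factory call, GenerationResponseHandlerFactory.create(request_type, handler_overrides=self.response_handlers), so per-request-type overrides flow straight from the constructor. A minimal sketch of constructing the backend under the new defaults; the target URL is a placeholder and is not taken from this PR:

from guidellm.backends.openai import OpenAIHTTPBackend

# Minimal sketch, assuming a local OpenAI-compatible server. With this change,
# `model` defaults to "" rather than None; an empty model means default_model()
# presumably falls back to the first model the server advertises.
backend = OpenAIHTTPBackend(
    target="http://localhost:8000/v1",
    model="",
    response_handlers=None,  # or a dict mapping request types to handler instances
)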