From db1f850d605bfb435e2d03d220e6f8ddc2992c1d Mon Sep 17 00:00:00 2001 From: Siddhartha Jana Date: Mon, 9 Mar 2026 20:47:39 -0700 Subject: [PATCH] Initial dtoopt commit - basic CLI and Core --- .gitignore | 5 + Makefile | 4 + README.md | 102 ++++++++++++++++- dtoopt/README.md | 35 ++++++ dtoopt/__init__.py | 1 + dtoopt/__main__.py | 11 ++ dtoopt/cli.py | 94 ++++++++++++++++ dtoopt/core.py | 236 +++++++++++++++++++++++++++++++++++++++ dtoopt/pyproject.toml | 3 + dtoopt/setup.py | 20 ++++ dtoopt/workload_probe.sh | 4 + 11 files changed, 509 insertions(+), 6 deletions(-) create mode 100644 dtoopt/README.md create mode 100644 dtoopt/__init__.py create mode 100644 dtoopt/__main__.py create mode 100644 dtoopt/cli.py create mode 100644 dtoopt/core.py create mode 100644 dtoopt/pyproject.toml create mode 100644 dtoopt/setup.py create mode 100644 dtoopt/workload_probe.sh diff --git a/.gitignore b/.gitignore index 221b15a..7856182 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,7 @@ dto-test* libdto.so* +*.egg-info/ +__pycache__/ +build/ +dist/ +.pytest_cache/ diff --git a/Makefile b/Makefile index 95af413..2603ec5 100644 --- a/Makefile +++ b/Makefile @@ -27,5 +27,9 @@ dto-test: dto-test.c dto-test-wodto: dto-test.c gcc -g dto-test.c $(DML_LIB_CXX) -o dto-test-wodto -lpthread +dtoopt-install: + python3 -m pip install --user ./dtoopt + @echo "dtoopt installed. If needed, add ~/.local/bin to PATH" + clean: rm -rf *.o *.so dto-test diff --git a/README.md b/README.md index f0324f4..0ff40cf 100644 --- a/README.md +++ b/README.md @@ -137,7 +137,7 @@ to the DSA device. 3. Using with other applications (two ways to use it) 3a. Using "-ldto" linker option (requires recompiling the application) i. Recompile the application with "-ldto" linker options - ii. Setup DTO environment variables (examples below) + ii. Setup DTO environment variables (examples below) export DTO_USESTDC_CALLS=0 export DTO_COLLECT_STATS=1 export DTO_WAIT_METHOD=busypoll 
export DTO_AUTO_ADJUST_KNOBS=1 export DTO_WQ_LIST="wq0.0;wq2.0;wq4.0;wq6.0" export DTO_IS_NUMA_AWARE=1 - iii. Run the application - (CacheBench example below) - 3b. Using LD_PRELOAD method (doesn not require recompiling the application) - i. setup LD_PRELOAD environment variable to point to DTO library + iii. Run the application - (CacheBench example below) + 3b. Using LD_PRELOAD method (does not require recompiling the application) + i. setup LD_PRELOAD environment variable to point to DTO library export LD_PRELOAD=<path-to-libdto.so>:$LD_PRELOAD - ii. Export all environment variables (similar to ii. in option 3a. above) - iii. Run the application - (CacheBench example below) + ii. Export all environment variables (similar to ii. in option 3a. above) + iii. Run the application - (CacheBench example below) <path-to-cachebench>/cachebench -json_test_config <test-config.json> --progress_stats_file=dto.log --report_api_latency @@ -225,3 +225,93 @@ When linking DTO using LD_PRELOAD environment variable special care is required - When the application is started by a script with #! which invokes another script with #!, for unknown reasons DTO causes a segmentation fault during a memset operation on an 8K sized buffer. This can be avoided by setting the minimum DTO size above 8K, or by avoiding this invocation sequence. + +## DTO Parameter Optimization (dtoopt) + +DTO includes a standalone Bayesian optimization tool under `dtoopt/` that tunes the following three DTO environment variables: + +- `DTO_MIN_BYTES` +- `DTO_CPU_SIZE_FRACTION` +- `DTO_AUTO_ADJUST_KNOBS` + +The tool runs a user-provided workload command multiple times, sets trial DTO values as environment variables for each run, extracts a metric from workload output, and returns the best parameter combination. + +### Prerequisites + +- Python 3 +- `scikit-optimize` +- `pip` + +```bash +python3 -m pip install --user ./dtoopt +``` + +This installs a standalone `dtoopt` command from this repository. +If your shell cannot find it, add `~/.local/bin` to your `PATH`. 
+ +### Run the optimizer + +You can run `dtoopt` from any directory, as long as the `dtoopt` executable is available in your `PATH`. +The repository root is only required when you rely on repository-relative paths (for example, `python3 -m pip install --user ./dtoopt` or `--command "./dto-test-wodto"`). + +```bash +dtoopt --command "<workload-command>" --trials 30 --results-dir "optimizer_results" +``` + +Alternative (without installing the command): + +```bash +python3 -m dtoopt --command "<workload-command>" --trials 30 --results-dir "optimizer_results" +``` + +By default, `--metric-regex` captures DTO runtime from lines like: + +```text +DTO Run Time: 22826 ms +``` + +using this default regex: + +```text +DTO\s+Run\s+Time:\s*([0-9]+(?:\.[0-9]+)?)\s*ms +``` + +If your workload prints a different metric, override `--metric-regex`. + +### Representative validation with dto-test + +Build test app without DTO link (for `LD_PRELOAD` style experiments): + +```bash +make dto-test-wodto +``` + +Example: optimize against the progress metric printed by `dto-test`: + +```bash +dtoopt \ --command "./dto-test-wodto" \ --metric-regex "completed\\s+([0-9]+)\\s+ops" \ --mode maximize \ --trials 20 \ --results-dir "optimizer_results" +``` + +### Useful knobs + +- `--min-bytes-low`: Lower bound (inclusive) of the `DTO_MIN_BYTES` search range. +- `--min-bytes-high`: Upper bound (inclusive) of the `DTO_MIN_BYTES` search range. +- `--min-bytes-step`: Step size used to snap candidate `DTO_MIN_BYTES` values. +- `--cpu-fraction-low`: Lower bound (inclusive) of the `DTO_CPU_SIZE_FRACTION` search range. +- `--cpu-fraction-high`: Upper bound (inclusive) of the `DTO_CPU_SIZE_FRACTION` search range. +- `--cpu-fraction-step`: Step size used to snap candidate `DTO_CPU_SIZE_FRACTION` values. +- `--initial-points`: Number of initial random trials before Bayesian-guided trials begin. +- `--seed`: Random seed for reproducible trial sequences. +- `--timeout-sec`: Per-trial command timeout in seconds. 
+ +### Output artifacts + +The optimizer writes artifacts under the directory provided by `--results-dir`: + +- `<results-dir>/trials.jsonl`: one record per trial +- `<results-dir>/best.json`: best parameters and objective summary diff --git a/dtoopt/README.md b/dtoopt/README.md new file mode 100644 index 0000000..91af4b3 --- /dev/null +++ b/dtoopt/README.md @@ -0,0 +1,35 @@ +# dtoopt Directory Guide + +This folder contains the standalone DTO parameter optimization tool. + +## Source files + +- `__init__.py` + Marks `dtoopt` as a Python package. + +- `__main__.py` + Entrypoint for module execution (`python -m dtoopt`) and console script handoff. + +- `cli.py` + Defines and parses command-line arguments (search ranges, trials, mode, regex, output paths, etc.). + +- `core.py` + Implements optimization logic: search space setup, trial execution, metric extraction, failure penalty handling, and output artifact writing. + +- `pyproject.toml` and `setup.py` + Packaging/build metadata for installing `dtoopt` via `pip install ./dtoopt`. + +- `workload_probe.sh` + Helper wrapper that runs `dto-test-wodto` and prints a parseable metric line (`completed <N> ops`) for optimizer runs. + +## Generated / runtime artifacts + +- `__pycache__/` + Python bytecode cache generated automatically at runtime. 
+ +## Typical outputs (for normal usage) + +During regular runs, the optimizer writes results to the directory you pass via `--results-dir`, with: + +- `trials.jsonl`: one JSON record per trial +- `best.json`: best parameter set and objective summary diff --git a/dtoopt/__init__.py b/dtoopt/__init__.py new file mode 100644 index 0000000..fe98b73 --- /dev/null +++ b/dtoopt/__init__.py @@ -0,0 +1 @@ +"""DTO Bayesian optimization package.""" diff --git a/dtoopt/__main__.py b/dtoopt/__main__.py new file mode 100644 index 0000000..586bbd7 --- /dev/null +++ b/dtoopt/__main__.py @@ -0,0 +1,11 @@ +from dtoopt.cli import parse_args +from dtoopt.core import run + + +def main() -> int: + args = parse_args() + return run(args) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/dtoopt/cli.py b/dtoopt/cli.py new file mode 100644 index 0000000..64d14c4 --- /dev/null +++ b/dtoopt/cli.py @@ -0,0 +1,94 @@ +import argparse + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Bayesian optimization tool for DTO parameters using skopt." + ) + parser.add_argument( + "--command", + required=True, + help="Command to execute for each trial (quote as a single string).", + ) + parser.add_argument( + "--metric-regex", + default=r"DTO\s+Run\s+Time:\s*([0-9]+(?:\.[0-9]+)?)\s*ms", + help=( + "Regex with one capturing group to extract numeric metric from command output. " + "Default captures DTO runtime in milliseconds from lines like: 'DTO Run Time: 22826 ms'." 
+ ), + ) + parser.add_argument( + "--mode", + choices=["maximize", "minimize"], + default="minimize", + help="Whether the extracted metric should be maximized or minimized.", + ) + parser.add_argument( + "--trials", + type=int, + default=30, + help="Total optimization trials.", + ) + parser.add_argument( + "--initial-points", + type=int, + default=10, + help="Number of initial random evaluations.", + ) + parser.add_argument( + "--seed", + type=int, + default=42, + help="Random seed for reproducibility.", + ) + parser.add_argument( + "--timeout-sec", + type=int, + default=600, + help="Per-trial command timeout in seconds.", + ) + parser.add_argument( + "--results-dir", + required=True, + help="Directory for artifacts: trial log and best config.", + ) + + parser.add_argument( + "--min-bytes-low", + type=int, + default=4096, + help="Lower bound for DTO_MIN_BYTES search range (inclusive).", + ) + parser.add_argument( + "--min-bytes-high", + type=int, + default=262144, + help="Upper bound for DTO_MIN_BYTES search range (inclusive).", + ) + parser.add_argument( + "--min-bytes-step", + type=int, + default=1024, + help="Step size used to snap DTO_MIN_BYTES trial values.", + ) + parser.add_argument( + "--cpu-fraction-low", + type=float, + default=0.0, + help="Lower bound for DTO_CPU_SIZE_FRACTION search range (inclusive).", + ) + parser.add_argument( + "--cpu-fraction-high", + type=float, + default=1.0, + help="Upper bound for DTO_CPU_SIZE_FRACTION search range (inclusive).", + ) + parser.add_argument( + "--cpu-fraction-step", + type=float, + default=0.05, + help="Step size used to snap DTO_CPU_SIZE_FRACTION trial values.", + ) + + return parser.parse_args() diff --git a/dtoopt/core.py b/dtoopt/core.py new file mode 100644 index 0000000..994ffb1 --- /dev/null +++ b/dtoopt/core.py @@ -0,0 +1,236 @@ +import json +import math +import os +import re +import shlex +import subprocess +import sys +import time +from dataclasses import asdict, dataclass +from pathlib import Path 
+from typing import Dict, List, Tuple + +try: + from skopt import gp_minimize + from skopt.space import Categorical, Integer, Real +except ImportError: + print( + "Missing dependency: skopt. Install with: pip install scikit-optimize", + file=sys.stderr, + ) + raise + + +# Large objective assigned to failed/invalid trials so the optimizer avoids them. +FAILURE_PENALTY = 1e12 + + +@dataclass +class TrialResult: + trial_index: int + params: Dict[str, object] + score_raw: float + objective: float + mode: str + return_code: int + duration_sec: float + timed_out: bool + metric_source: str + + +def build_search_space(args): + if args.min_bytes_low <= 0 or args.min_bytes_low >= args.min_bytes_high: + raise ValueError("Invalid bounds for DTO_MIN_BYTES") + if args.min_bytes_step <= 0: + raise ValueError("DTO_MIN_BYTES step must be > 0") + if not (0.0 <= args.cpu_fraction_low < args.cpu_fraction_high <= 1.0): + raise ValueError("DTO_CPU_SIZE_FRACTION bounds must satisfy 0 <= low < high <= 1") + if args.cpu_fraction_step <= 0: + raise ValueError("DTO_CPU_SIZE_FRACTION step must be > 0") + + return [ + Integer( + low=args.min_bytes_low, + high=args.min_bytes_high, + prior="log-uniform", + name="DTO_MIN_BYTES", + ), + Real( + low=args.cpu_fraction_low, + high=args.cpu_fraction_high, + prior="uniform", + name="DTO_CPU_SIZE_FRACTION", + ), + Categorical(categories=[0, 1], name="DTO_AUTO_ADJUST_KNOBS"), + ] + + +def snap_int(value: int, low: int, high: int, step: int) -> int: + snapped = low + round((value - low) / step) * step + return max(low, min(high, snapped)) + + +def snap_float(value: float, low: float, high: float, step: float) -> float: + snapped = low + round((value - low) / step) * step + snapped = max(low, min(high, snapped)) + return round(snapped, 6) + + +def sanitize_params(point: List[object], args) -> Dict[str, str]: + min_bytes = snap_int(int(point[0]), args.min_bytes_low, args.min_bytes_high, args.min_bytes_step) + cpu_fraction = snap_float( + float(point[1]), + 
args.cpu_fraction_low, + args.cpu_fraction_high, + args.cpu_fraction_step, + ) + auto_adjust = int(point[2]) + + return { + "DTO_MIN_BYTES": str(min_bytes), + "DTO_CPU_SIZE_FRACTION": f"{cpu_fraction:.4f}", + "DTO_AUTO_ADJUST_KNOBS": str(auto_adjust), + } + + +def extract_metric(output_text: str, metric_pattern: re.Pattern) -> float: + matches = metric_pattern.findall(output_text) + if not matches: + raise ValueError("Metric regex did not match command output") + + last_match = matches[-1] + if isinstance(last_match, tuple): + value_str = last_match[0] + else: + value_str = last_match + + return float(value_str) + + +def evaluate_trial( + command: str, + metric_pattern: re.Pattern, + mode: str, + timeout_sec: int, + env_patch: Dict[str, str], +) -> Tuple[float, float, int, float, bool, str]: + env = os.environ.copy() + env.update(env_patch) + + start = time.perf_counter() + timed_out = False + metric_source = "stdout/stderr" + try: + proc = subprocess.run( + shlex.split(command), + env=env, + capture_output=True, + text=True, + timeout=timeout_sec, + check=False, + ) + output_text = f"{proc.stdout}\n{proc.stderr}" + raw_metric = extract_metric(output_text, metric_pattern) + return_code = proc.returncode + except subprocess.TimeoutExpired as exc: + timed_out = True + raw_metric = math.nan + return_code = 124 + output_text = f"{exc.stdout or ''}\n{exc.stderr or ''}" + except Exception: + raw_metric = math.nan + return_code = 1 + output_text = "" + + duration_sec = time.perf_counter() - start + + if not math.isfinite(raw_metric): + objective = math.inf + else: + objective = -raw_metric if mode == "maximize" else raw_metric + + if return_code != 0: + metric_source = "failed-command" + + return raw_metric, objective, return_code, duration_sec, timed_out, metric_source + + +def write_jsonl(path: Path, payload: Dict[str, object]) -> None: + with path.open("a", encoding="utf-8") as handle: + handle.write(json.dumps(payload) + "\n") + + +def run(args) -> int: + results_dir 
= Path(args.results_dir) + results_dir.mkdir(parents=True, exist_ok=True) + trial_log = results_dir / "trials.jsonl" + best_path = results_dir / "best.json" + + metric_pattern = re.compile(args.metric_regex) + search_space = build_search_space(args) + + trial_counter = {"index": 0} + + def objective(point: List[object]) -> float: + env_patch = sanitize_params(point, args) + trial_index = trial_counter["index"] + trial_counter["index"] += 1 + + raw_metric, objective_value, return_code, duration_sec, timed_out, metric_source = evaluate_trial( + command=args.command, + metric_pattern=metric_pattern, + mode=args.mode, + timeout_sec=args.timeout_sec, + env_patch=env_patch, + ) + + if return_code != 0 or not math.isfinite(objective_value): + objective_value = FAILURE_PENALTY + + result = TrialResult( + trial_index=trial_index, + params=env_patch, + score_raw=raw_metric if math.isfinite(raw_metric) else float("nan"), + objective=objective_value, + mode=args.mode, + return_code=return_code, + duration_sec=duration_sec, + timed_out=timed_out, + metric_source=metric_source, + ) + + write_jsonl(trial_log, asdict(result)) + print( + f"trial={trial_index} objective={objective_value:.6g} raw={raw_metric:.6g} rc={return_code} env={env_patch}", + flush=True, + ) + return objective_value + + result = gp_minimize( + func=objective, + dimensions=search_space, + n_calls=args.trials, + n_initial_points=min(args.initial_points, args.trials), + random_state=args.seed, + acq_func="EI", + ) + + best_env = sanitize_params(result.x, args) + best_objective = float(result.fun) + best_raw_metric = -best_objective if args.mode == "maximize" else best_objective + + summary = { + "best_params": best_env, + "best_objective": best_objective, + "best_metric_estimate": best_raw_metric, + "mode": args.mode, + "n_calls": len(result.func_vals), + "results_dir": str(results_dir), + } + + with best_path.open("w", encoding="utf-8") as handle: + json.dump(summary, handle, indent=2) + + print("Optimization 
complete") + print(json.dumps(summary, indent=2)) + return 0 diff --git a/dtoopt/pyproject.toml b/dtoopt/pyproject.toml new file mode 100644 index 0000000..642ab3c --- /dev/null +++ b/dtoopt/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["setuptools>=68", "wheel"] +build-backend = "setuptools.build_meta" diff --git a/dtoopt/setup.py b/dtoopt/setup.py new file mode 100644 index 0000000..cac1268 --- /dev/null +++ b/dtoopt/setup.py @@ -0,0 +1,20 @@ +from setuptools import setup +from pathlib import Path + + +setup( + name="dtoopt", + version="0.1.0", + description="Bayesian optimization tool for DTO tuning", + long_description=Path("README.md").read_text(encoding="utf-8"), + long_description_content_type="text/markdown", + python_requires=">=3.8", + install_requires=["scikit-optimize"], + package_dir={"dtoopt": "."}, + packages=["dtoopt"], + entry_points={ + "console_scripts": [ + "dtoopt=dtoopt.__main__:main", + ] + }, +) diff --git a/dtoopt/workload_probe.sh b/dtoopt/workload_probe.sh new file mode 100644 index 0000000..8d7e9d2 --- /dev/null +++ b/dtoopt/workload_probe.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash +set -euo pipefail + +./dto-test-wodto | awk '/completed [0-9]+ ops/ { print; exit }'