From db1f850d605bfb435e2d03d220e6f8ddc2992c1d Mon Sep 17 00:00:00 2001 From: Siddhartha Jana Date: Mon, 9 Mar 2026 20:47:39 -0700 Subject: [PATCH] Initial dtoopt commit - basic CLI and Core --- .gitignore | 5 + Makefile | 4 + README.md | 102 ++++++++++++++++- dtoopt/README.md | 35 ++++++ dtoopt/__init__.py | 1 + dtoopt/__main__.py | 11 ++ dtoopt/cli.py | 94 ++++++++++++++++ dtoopt/core.py | 236 +++++++++++++++++++++++++++++++++++++++ dtoopt/pyproject.toml | 3 + dtoopt/setup.py | 20 ++++ dtoopt/workload_probe.sh | 4 + 11 files changed, 509 insertions(+), 6 deletions(-) create mode 100644 dtoopt/README.md create mode 100644 dtoopt/__init__.py create mode 100644 dtoopt/__main__.py create mode 100644 dtoopt/cli.py create mode 100644 dtoopt/core.py create mode 100644 dtoopt/pyproject.toml create mode 100644 dtoopt/setup.py create mode 100644 dtoopt/workload_probe.sh diff --git a/.gitignore b/.gitignore index 221b15a..7856182 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,7 @@ dto-test* libdto.so* +*.egg-info/ +__pycache__/ +build/ +dist/ +.pytest_cache/ diff --git a/Makefile b/Makefile index 95af413..2603ec5 100644 --- a/Makefile +++ b/Makefile @@ -27,5 +27,9 @@ dto-test: dto-test.c dto-test-wodto: dto-test.c gcc -g dto-test.c $(DML_LIB_CXX) -o dto-test-wodto -lpthread +dtoopt-install: + python3 -m pip install --user ./dtoopt + @echo "dtoopt installed. If needed, add ~/.local/bin to PATH" + clean: rm -rf *.o *.so dto-test diff --git a/README.md b/README.md index f0324f4..0ff40cf 100644 --- a/README.md +++ b/README.md @@ -137,7 +137,7 @@ to the DSA device. 3. Using with other applications (two ways to use it) 3a. Using "-ldto" linker option (requires recompiling the application) i. Recompile the application with "-ldto" linker options - ii. Setup DTO environment variables (examples below) + ii. Setup DTO environment variables (examples below) export DTO_USESTDC_CALLS=0 export DTO_COLLECT_STATS=1 export DTO_WAIT_METHOD=busypoll 
export DTO_AUTO_ADJUST_KNOBS=1 export DTO_WQ_LIST="wq0.0;wq2.0;wq4.0;wq6.0" export DTO_IS_NUMA_AWARE=1 - iii. Run the application - (CacheBench example below) - 3b. Using LD_PRELOAD method (doesn not require recompiling the application) - i. setup LD_PRELOAD environment variable to point to DTO library + iii. Run the application - (CacheBench example below) + 3b. Using LD_PRELOAD method (does not require recompiling the application) + i. setup LD_PRELOAD environment variable to point to DTO library export LD_PRELOAD=<path-to-libdto.so>:$LD_PRELOAD - ii. Export all environment variables (similar to ii. in option 3a. above) - iii. Run the application - (CacheBench example below) + ii. Export all environment variables (similar to ii. in option 3a. above) + iii. Run the application - (CacheBench example below) <path-to-cachebench>/cachebench -json_test_config <test-config.json> --progress_stats_file=dto.log --report_api_latency @@ -225,3 +225,93 @@ When linking DTO using LD_PRELOAD environment variable special care is required - When the application is started by a script with #! which invokes another script with #!, for unknown reasons DTO causes a segmentation fault during a memset operation on an 8K sized buffer. This can be avoided by setting the minimum DTO size above 8K, or by avoiding this invocation sequence. + +## DTO Parameter Optimization (dtoopt) + +DTO includes a standalone Bayesian optimization tool under `dtoopt/` that tunes the following three DTO environment variables: + +- `DTO_MIN_BYTES` +- `DTO_CPU_SIZE_FRACTION` +- `DTO_AUTO_ADJUST_KNOBS` + +The tool runs a user-provided workload command multiple times, sets trial DTO values as environment variables for each run, extracts a metric from workload output, and returns the best parameter combination. + +### Prerequisites + +- Python 3 +- `scikit-optimize` +- `pip` + +```bash +python3 -m pip install --user ./dtoopt +``` + +This installs a standalone `dtoopt` command from this repository. +If your shell cannot find it, add `~/.local/bin` to your `PATH`. 
+ +### Run the optimizer + +You can run `dtoopt` from any directory, as long as the `dtoopt` executable is available in your `PATH`. +The repository root is only required when you rely on repository-relative paths (for example, `python3 -m pip install --user ./dtoopt` or `--command "./dto-test-wodto"`). + +```bash +dtoopt --command "<workload-command>" --trials 30 --results-dir "optimizer_results" +``` + +Alternative (without installing the command): + +```bash +python3 -m dtoopt --command "<workload-command>" --trials 30 --results-dir "optimizer_results" +``` + +By default, `--metric-regex` captures DTO runtime from lines like: + +```text +DTO Run Time: 22826 ms +``` + +using this default regex: + +```text +DTO\s+Run\s+Time:\s*([0-9]+(?:\.[0-9]+)?)\s*ms +``` + +If your workload prints a different metric, override `--metric-regex`. + +### Representative validation with dto-test + +Build test app without DTO link (for `LD_PRELOAD` style experiments): + +```bash +make dto-test-wodto +``` + +Example: optimize against the progress metric printed by `dto-test`: + +```bash +dtoopt \ --command "./dto-test-wodto" \ --metric-regex "completed\\s+([0-9]+)\\s+ops" \ --mode maximize \ --trials 20 \ --results-dir "optimizer_results" +``` + +### Useful knobs + +- `--min-bytes-low`: Lower bound (inclusive) of the `DTO_MIN_BYTES` search range. +- `--min-bytes-high`: Upper bound (inclusive) of the `DTO_MIN_BYTES` search range. +- `--min-bytes-step`: Step size used to snap candidate `DTO_MIN_BYTES` values. +- `--cpu-fraction-low`: Lower bound (inclusive) of the `DTO_CPU_SIZE_FRACTION` search range. +- `--cpu-fraction-high`: Upper bound (inclusive) of the `DTO_CPU_SIZE_FRACTION` search range. +- `--cpu-fraction-step`: Step size used to snap candidate `DTO_CPU_SIZE_FRACTION` values. +- `--initial-points`: Number of initial random trials before Bayesian-guided trials begin. +- `--seed`: Random seed for reproducible trial sequences. +- `--timeout-sec`: Per-trial command timeout in seconds. 
+ +### Output artifacts + +The optimizer writes artifacts under the directory provided by `--results-dir`: + +- `<results-dir>/trials.jsonl`: one record per trial +- `<results-dir>/best.json`: best parameters and objective summary diff --git a/dtoopt/README.md b/dtoopt/README.md new file mode 100644 index 0000000..91af4b3 --- /dev/null +++ b/dtoopt/README.md @@ -0,0 +1,35 @@ +# dtoopt Directory Guide + +This folder contains the standalone DTO parameter optimization tool. + +## Source files + +- `__init__.py` + Marks `dtoopt` as a Python package. + +- `__main__.py` + Entrypoint for module execution (`python -m dtoopt`) and console script handoff. + +- `cli.py` + Defines and parses command-line arguments (search ranges, trials, mode, regex, output paths, etc.). + +- `core.py` + Implements optimization logic: search space setup, trial execution, metric extraction, failure penalty handling, and output artifact writing. + +- `pyproject.toml` and `setup.py` + Packaging/build metadata for installing `dtoopt` via `pip install ./dtoopt`. + +- `workload_probe.sh` + Helper wrapper that runs `dto-test-wodto` and prints a parseable metric line (`completed <N> ops`) for optimizer runs. + +## Generated / runtime artifacts + +- `__pycache__/` + Python bytecode cache generated automatically at runtime. 
+ +## Typical outputs (for normal usage) + +During regular runs, the optimizer writes results to the directory you pass via `--results-dir`, with: + +- `trials.jsonl`: one JSON record per trial +- `best.json`: best parameter set and objective summary diff --git a/dtoopt/__init__.py b/dtoopt/__init__.py new file mode 100644 index 0000000..fe98b73 --- /dev/null +++ b/dtoopt/__init__.py @@ -0,0 +1 @@ +"""DTO Bayesian optimization package.""" diff --git a/dtoopt/__main__.py b/dtoopt/__main__.py new file mode 100644 index 0000000..586bbd7 --- /dev/null +++ b/dtoopt/__main__.py @@ -0,0 +1,11 @@ +from dtoopt.cli import parse_args +from dtoopt.core import run + + +def main() -> int: + args = parse_args() + return run(args) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/dtoopt/cli.py b/dtoopt/cli.py new file mode 100644 index 0000000..64d14c4 --- /dev/null +++ b/dtoopt/cli.py @@ -0,0 +1,94 @@ +import argparse + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Bayesian optimization tool for DTO parameters using skopt." + ) + parser.add_argument( + "--command", + required=True, + help="Command to execute for each trial (quote as a single string).", + ) + parser.add_argument( + "--metric-regex", + default=r"DTO\s+Run\s+Time:\s*([0-9]+(?:\.[0-9]+)?)\s*ms", + help=( + "Regex with one capturing group to extract numeric metric from command output. " + "Default captures DTO runtime in milliseconds from lines like: 'DTO Run Time: 22826 ms'." 
+ ), + ) + parser.add_argument( + "--mode", + choices=["maximize", "minimize"], + default="minimize", + help="Whether the extracted metric should be maximized or minimized.", + ) + parser.add_argument( + "--trials", + type=int, + default=30, + help="Total optimization trials.", + ) + parser.add_argument( + "--initial-points", + type=int, + default=10, + help="Number of initial random evaluations.", + ) + parser.add_argument( + "--seed", + type=int, + default=42, + help="Random seed for reproducibility.", + ) + parser.add_argument( + "--timeout-sec", + type=int, + default=600, + help="Per-trial command timeout in seconds.", + ) + parser.add_argument( + "--results-dir", + required=True, + help="Directory for artifacts: trial log and best config.", + ) + + parser.add_argument( + "--min-bytes-low", + type=int, + default=4096, + help="Lower bound for DTO_MIN_BYTES search range (inclusive).", + ) + parser.add_argument( + "--min-bytes-high", + type=int, + default=262144, + help="Upper bound for DTO_MIN_BYTES search range (inclusive).", + ) + parser.add_argument( + "--min-bytes-step", + type=int, + default=1024, + help="Step size used to snap DTO_MIN_BYTES trial values.", + ) + parser.add_argument( + "--cpu-fraction-low", + type=float, + default=0.0, + help="Lower bound for DTO_CPU_SIZE_FRACTION search range (inclusive).", + ) + parser.add_argument( + "--cpu-fraction-high", + type=float, + default=1.0, + help="Upper bound for DTO_CPU_SIZE_FRACTION search range (inclusive).", + ) + parser.add_argument( + "--cpu-fraction-step", + type=float, + default=0.05, + help="Step size used to snap DTO_CPU_SIZE_FRACTION trial values.", + ) + + return parser.parse_args() diff --git a/dtoopt/core.py b/dtoopt/core.py new file mode 100644 index 0000000..994ffb1 --- /dev/null +++ b/dtoopt/core.py @@ -0,0 +1,236 @@ +import json +import math +import os +import re +import shlex +import subprocess +import sys +import time +from dataclasses import asdict, dataclass +from pathlib import Path 
+from typing import Dict, List, Tuple + +try: + from skopt import gp_minimize + from skopt.space import Categorical, Integer, Real +except ImportError: + print( + "Missing dependency: skopt. Install with: pip install scikit-optimize", + file=sys.stderr, + ) + raise + + +# Large objective assigned to failed/invalid trials so the optimizer avoids them. +FAILURE_PENALTY = 1e12 + + +@dataclass +class TrialResult: + trial_index: int + params: Dict[str, object] + score_raw: float + objective: float + mode: str + return_code: int + duration_sec: float + timed_out: bool + metric_source: str + + +def build_search_space(args): + if args.min_bytes_low <= 0 or args.min_bytes_low >= args.min_bytes_high: + raise ValueError("Invalid bounds for DTO_MIN_BYTES") + if args.min_bytes_step <= 0: + raise ValueError("DTO_MIN_BYTES step must be > 0") + if not (0.0 <= args.cpu_fraction_low < args.cpu_fraction_high <= 1.0): + raise ValueError("DTO_CPU_SIZE_FRACTION bounds must satisfy 0 <= low < high <= 1") + if args.cpu_fraction_step <= 0: + raise ValueError("DTO_CPU_SIZE_FRACTION step must be > 0") + + return [ + Integer( + low=args.min_bytes_low, + high=args.min_bytes_high, + prior="log-uniform", + name="DTO_MIN_BYTES", + ), + Real( + low=args.cpu_fraction_low, + high=args.cpu_fraction_high, + prior="uniform", + name="DTO_CPU_SIZE_FRACTION", + ), + Categorical(categories=[0, 1], name="DTO_AUTO_ADJUST_KNOBS"), + ] + + +def snap_int(value: int, low: int, high: int, step: int) -> int: + snapped = low + round((value - low) / step) * step + return max(low, min(high, snapped)) + + +def snap_float(value: float, low: float, high: float, step: float) -> float: + snapped = low + round((value - low) / step) * step + snapped = max(low, min(high, snapped)) + return round(snapped, 6) + + +def sanitize_params(point: List[object], args) -> Dict[str, str]: + min_bytes = snap_int(int(point[0]), args.min_bytes_low, args.min_bytes_high, args.min_bytes_step) + cpu_fraction = snap_float( + float(point[1]), + 
args.cpu_fraction_low, + args.cpu_fraction_high, + args.cpu_fraction_step, + ) + auto_adjust = int(point[2]) + + return { + "DTO_MIN_BYTES": str(min_bytes), + "DTO_CPU_SIZE_FRACTION": f"{cpu_fraction:.4f}", + "DTO_AUTO_ADJUST_KNOBS": str(auto_adjust), + } + + +def extract_metric(output_text: str, metric_pattern: re.Pattern) -> float: + matches = metric_pattern.findall(output_text) + if not matches: + raise ValueError("Metric regex did not match command output") + + last_match = matches[-1] + if isinstance(last_match, tuple): + value_str = last_match[0] + else: + value_str = last_match + + return float(value_str) + + +def evaluate_trial( + command: str, + metric_pattern: re.Pattern, + mode: str, + timeout_sec: int, + env_patch: Dict[str, str], +) -> Tuple[float, float, int, float, bool, str]: + env = os.environ.copy() + env.update(env_patch) + + start = time.perf_counter() + timed_out = False + metric_source = "stdout/stderr" + try: + proc = subprocess.run( + shlex.split(command), + env=env, + capture_output=True, + text=True, + timeout=timeout_sec, + check=False, + ) + output_text = f"{proc.stdout}\n{proc.stderr}" + raw_metric = extract_metric(output_text, metric_pattern) + return_code = proc.returncode + except subprocess.TimeoutExpired as exc: + timed_out = True + raw_metric = math.nan + return_code = 124 + output_text = f"{exc.stdout or ''}\n{exc.stderr or ''}" + except Exception: + raw_metric = math.nan + return_code = 1 + output_text = "" + + duration_sec = time.perf_counter() - start + + if not math.isfinite(raw_metric): + objective = math.inf + else: + objective = -raw_metric if mode == "maximize" else raw_metric + + if return_code != 0: + metric_source = "failed-command" + + return raw_metric, objective, return_code, duration_sec, timed_out, metric_source + + +def write_jsonl(path: Path, payload: Dict[str, object]) -> None: + with path.open("a", encoding="utf-8") as handle: + handle.write(json.dumps(payload) + "\n") + + +def run(args) -> int: + results_dir 
= Path(args.results_dir) + results_dir.mkdir(parents=True, exist_ok=True) + trial_log = results_dir / "trials.jsonl" + best_path = results_dir / "best.json" + + metric_pattern = re.compile(args.metric_regex) + search_space = build_search_space(args) + + trial_counter = {"index": 0} + + def objective(point: List[object]) -> float: + env_patch = sanitize_params(point, args) + trial_index = trial_counter["index"] + trial_counter["index"] += 1 + + raw_metric, objective_value, return_code, duration_sec, timed_out, metric_source = evaluate_trial( + command=args.command, + metric_pattern=metric_pattern, + mode=args.mode, + timeout_sec=args.timeout_sec, + env_patch=env_patch, + ) + + if return_code != 0 or not math.isfinite(objective_value): + objective_value = FAILURE_PENALTY + + result = TrialResult( + trial_index=trial_index, + params=env_patch, + score_raw=raw_metric if math.isfinite(raw_metric) else float("nan"), + objective=objective_value, + mode=args.mode, + return_code=return_code, + duration_sec=duration_sec, + timed_out=timed_out, + metric_source=metric_source, + ) + + write_jsonl(trial_log, asdict(result)) + print( + f"trial={trial_index} objective={objective_value:.6g} raw={raw_metric:.6g} rc={return_code} env={env_patch}", + flush=True, + ) + return objective_value + + result = gp_minimize( + func=objective, + dimensions=search_space, + n_calls=args.trials, + n_initial_points=min(args.initial_points, args.trials), + random_state=args.seed, + acq_func="EI", + ) + + best_env = sanitize_params(result.x, args) + best_objective = float(result.fun) + best_raw_metric = -best_objective if args.mode == "maximize" else best_objective + + summary = { + "best_params": best_env, + "best_objective": best_objective, + "best_metric_estimate": best_raw_metric, + "mode": args.mode, + "n_calls": len(result.func_vals), + "results_dir": str(results_dir), + } + + with best_path.open("w", encoding="utf-8") as handle: + json.dump(summary, handle, indent=2) + + print("Optimization 
complete") + print(json.dumps(summary, indent=2)) + return 0 diff --git a/dtoopt/pyproject.toml b/dtoopt/pyproject.toml new file mode 100644 index 0000000..642ab3c --- /dev/null +++ b/dtoopt/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["setuptools>=68", "wheel"] +build-backend = "setuptools.build_meta" diff --git a/dtoopt/setup.py b/dtoopt/setup.py new file mode 100644 index 0000000..cac1268 --- /dev/null +++ b/dtoopt/setup.py @@ -0,0 +1,20 @@ +from setuptools import setup +from pathlib import Path + + +setup( + name="dtoopt", + version="0.1.0", + description="Bayesian optimization tool for DTO tuning", + long_description=Path("README.md").read_text(encoding="utf-8"), + long_description_content_type="text/markdown", + python_requires=">=3.8", + install_requires=["scikit-optimize"], + package_dir={"dtoopt": "."}, + packages=["dtoopt"], + entry_points={ + "console_scripts": [ + "dtoopt=dtoopt.__main__:main", + ] + }, +) diff --git a/dtoopt/workload_probe.sh b/dtoopt/workload_probe.sh new file mode 100644 index 0000000..8d7e9d2 --- /dev/null +++ b/dtoopt/workload_probe.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash +set -euo pipefail + +./dto-test-wodto | awk '/completed [0-9]+ ops/ { print; exit }'