intel · sidjana · Mar 10, 2026
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,7 @@
 dto-test*
 libdto.so*
+*.egg-info/
+__pycache__/
+build/
+dist/
+.pytest_cache/
diff --git a/Makefile b/Makefile
@@ -27,5 +27,9 @@ dto-test: dto-test.c
 dto-test-wodto: dto-test.c
 	gcc -g dto-test.c $(DML_LIB_CXX) -o dto-test-wodto -lpthread
 
+dtoopt-install:  ## Install the dtoopt CLI for the current user from the ./dtoopt package directory
+	python3 -m pip install --user ./dtoopt
+	@echo "dtoopt installed. If needed, add ~/.local/bin to PATH"
+
 clean:
 	rm -rf *.o *.so dto-test
diff --git a/README.md b/README.md
@@ -137,7 +137,7 @@ to the DSA device.
 3. Using with other applications (two ways to use it)
     3a. Using "-ldto" linker option (requires recompiling the application)
         i. Recompile the application with "-ldto" linker options
-	ii. Setup DTO environment variables (examples below)
+        ii. Setup DTO environment variables (examples below)
             export DTO_USESTDC_CALLS=0
             export DTO_COLLECT_STATS=1
             export DTO_WAIT_METHOD=busypoll
@@ -146,12 +146,12 @@ to the DSA device.
             export DTO_AUTO_ADJUST_KNOBS=1
             export DTO_WQ_LIST="wq0.0;wq2.0;wq4.0;wq6.0"
             export DTO_IS_NUMA_AWARE=1
-	iii. Run the application - (CacheBench example below)
-    3b. Using LD_PRELOAD method (doesn not require recompiling the application)
-	i. setup LD_PRELOAD environment variable to point to DTO library
+        iii. Run the application - (CacheBench example below)
+    3b. Using LD_PRELOAD method (does not require recompiling the application)
+        i. setup LD_PRELOAD environment variable to point to DTO library
             export  LD_PRELOAD=<libdto file path>:$LD_PRELOAD
-	ii. Export all environment variables (similar to ii. in option 3a. above)
-	iii. Run the application - (CacheBench example below)
+        ii. Export all environment variables (similar to ii. in option 3a. above)
+        iii. Run the application - (CacheBench example below)
 
 	<CBENCH_DIR>/cachebench -json_test_config <json file> --progress_stats_file=dto.log --report_api_latency
 
@@ -225,3 +225,93 @@ When linking DTO using LD_PRELOAD environment variable special care is required
       - When the application is started by a script with #!<location of shell> which invokes another script with #!<location of shell>, for 
         unknown reasons DTO causes a segmentation fault during a memset operation on an 8K sized buffer. This can be avoided by setting the minimum 
         DTO size above 8K, or by avoiding this invocation sequence.
+
+## DTO Parameter Optimization (dtoopt)
+
+DTO includes a standalone Bayesian optimization tool under `dtoopt/` that tunes the following three DTO environment variables:
+
+- `DTO_MIN_BYTES`
+- `DTO_CPU_SIZE_FRACTION`
+- `DTO_AUTO_ADJUST_KNOBS`
+
+The tool runs a user-provided workload command multiple times, sets trial DTO values as environment variables for each run, extracts a metric from workload output, and returns the best parameter combination.
+
+### Prerequisites
+
+- Python 3
+- `scikit-optimize`
+- `pip`
+
+```bash
+python3 -m pip install --user ./dtoopt
+```
+
+Run this from the repository root; it installs a standalone `dtoopt` command from this repository.
+If your shell cannot find it, add `~/.local/bin` to your `PATH`.
+
+### Run the optimizer
+
+You can run `dtoopt` from any directory, as long as the `dtoopt` executable is available in your `PATH`.
+The repository root is only required when you rely on repository-relative paths (for example, `python3 -m pip install --user ./dtoopt` or `--command "./dto-test-wodto"`).
+
+```bash
+dtoopt --command "<your workload command>" --trials 30 --results-dir "optimizer_results"
+```
+
+Alternative (without installing the command; run from the repository root so Python can find the `dtoopt` package):
+
+```bash
+python3 -m dtoopt --command "<your workload command>" --trials 30 --results-dir "optimizer_results"
+```
+
+By default, `--metric-regex` captures DTO runtime from lines like:
+
+```text
+DTO Run Time: 22826 ms
+```
+
+using this default regex:
+
+```text
+DTO\s+Run\s+Time:\s*([0-9]+(?:\.[0-9]+)?)\s*ms
+```
+
+If your workload prints a different metric, override `--metric-regex`.
+
+### Representative validation with dto-test
+
+Build the test app without linking DTO (for `LD_PRELOAD`-style experiments):
+
+```bash
+make dto-test-wodto
+```
+
+Example: optimize against the progress metric printed by `dto-test`:
+
+```bash
+dtoopt \
+   --command "./dto-test-wodto" \
+   --metric-regex "completed\\s+([0-9]+)\\s+ops" \
+   --mode maximize \
+   --trials 20 \
+   --results-dir "optimizer_results"
+```
+
+### Useful knobs
+
+- `--min-bytes-low`: Lower bound (inclusive) of the `DTO_MIN_BYTES` search range.
+- `--min-bytes-high`: Upper bound (inclusive) of the `DTO_MIN_BYTES` search range.
+- `--min-bytes-step`: Step size used to snap candidate `DTO_MIN_BYTES` values.
+- `--cpu-fraction-low`: Lower bound (inclusive) of the `DTO_CPU_SIZE_FRACTION` search range.
+- `--cpu-fraction-high`: Upper bound (inclusive) of the `DTO_CPU_SIZE_FRACTION` search range.
+- `--cpu-fraction-step`: Step size used to snap candidate `DTO_CPU_SIZE_FRACTION` values.
+- `--initial-points`: Number of initial random trials before Bayesian-guided trials begin.
+- `--seed`: Random seed for reproducible trial sequences.
+- `--timeout-sec`: Per-trial command timeout in seconds.
+
+### Output artifacts
+
+The optimizer writes artifacts under the directory provided by `--results-dir`:
+
+- `<results-dir>/trials.jsonl`: one record per trial
+- `<results-dir>/best.json`: best parameters and objective summary
diff --git a/dtoopt/README.md b/dtoopt/README.md
@@ -0,0 +1,35 @@
+# dtoopt Directory Guide
+
+This folder contains the standalone DTO parameter optimization tool.
+
+## Source files
+
+- `__init__.py`  
+  Marks `dtoopt` as a Python package.
+
+- `__main__.py`  
+  Entrypoint for module execution (`python3 -m dtoopt`) and console script handoff.
+
+- `cli.py`  
+  Defines and parses command-line arguments (search ranges, trials, mode, regex, output paths, etc.).
+
+- `core.py`  
+  Implements optimization logic: search space setup, trial execution, metric extraction, failure penalty handling, and output artifact writing.
+
+- `pyproject.toml` and `setup.py`  
+  Packaging/build metadata for installing `dtoopt` via `pip install ./dtoopt`.
+
+- `workload_probe.sh`  
+  Helper wrapper that runs `dto-test-wodto` and prints a parseable metric line (`completed <N> ops`) for optimizer runs.
+
+## Generated / runtime artifacts
+
+- `__pycache__/`  
+  Python bytecode cache generated automatically at runtime.
+
+## Typical outputs (for normal usage)
+
+During regular runs, the optimizer writes results to the directory you pass via `--results-dir`, with:
+
+- `trials.jsonl`: one JSON record per trial
+- `best.json`: best parameter set and objective summary
diff --git a/dtoopt/__init__.py b/dtoopt/__init__.py
@@ -0,0 +1 @@
+"""DTO Bayesian optimization package."""
diff --git a/dtoopt/__main__.py b/dtoopt/__main__.py
@@ -0,0 +1,11 @@
+from dtoopt.cli import parse_args
+from dtoopt.core import run
+
+
+def main() -> int:
+    """Parse the command line, hand the options to run(), and return its result."""
+    return run(parse_args())
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/dtoopt/cli.py b/dtoopt/cli.py
@@ -0,0 +1,94 @@
+import argparse
+
+
+def parse_args() -> argparse.Namespace:
+    """Build the dtoopt command-line parser and parse the process arguments.
+
+    ``--command`` and ``--results-dir`` are required; every other option
+    falls back to the default listed in the table below.
+
+    Returns:
+        argparse.Namespace: parsed values, one attribute per option.
+    """
+    # (flag, add_argument keyword arguments), kept in --help display order.
+    option_table = [
+        ("--command", dict(
+            required=True,
+            help="Command to execute for each trial (quote as a single string).",
+        )),
+        ("--metric-regex", dict(
+            default=r"DTO\s+Run\s+Time:\s*([0-9]+(?:\.[0-9]+)?)\s*ms",
+            help=(
+                "Regex with one capturing group to extract numeric metric from command output. "
+                "Default captures DTO runtime in milliseconds from lines like: 'DTO Run Time: 22826 ms'."
+            ),
+        )),
+        ("--mode", dict(
+            choices=["maximize", "minimize"],
+            default="minimize",
+            help="Whether the extracted metric should be maximized or minimized.",
+        )),
+        ("--trials", dict(
+            type=int,
+            default=30,
+            help="Total optimization trials.",
+        )),
+        ("--initial-points", dict(
+            type=int,
+            default=10,
+            help="Number of initial random evaluations.",
+        )),
+        ("--seed", dict(
+            type=int,
+            default=42,
+            help="Random seed for reproducibility.",
+        )),
+        ("--timeout-sec", dict(
+            type=int,
+            default=600,
+            help="Per-trial command timeout in seconds.",
+        )),
+        ("--results-dir", dict(
+            required=True,
+            help="Directory for artifacts: trial log and best config.",
+        )),
+        ("--min-bytes-low", dict(
+            type=int,
+            default=4096,
+            help="Lower bound for DTO_MIN_BYTES search range (inclusive).",
+        )),
+        ("--min-bytes-high", dict(
+            type=int,
+            default=262144,
+            help="Upper bound for DTO_MIN_BYTES search range (inclusive).",
+        )),
+        ("--min-bytes-step", dict(
+            type=int,
+            default=1024,
+            help="Step size used to snap DTO_MIN_BYTES trial values.",
+        )),
+        ("--cpu-fraction-low", dict(
+            type=float,
+            default=0.0,
+            help="Lower bound for DTO_CPU_SIZE_FRACTION search range (inclusive).",
+        )),
+        ("--cpu-fraction-high", dict(
+            type=float,
+            default=1.0,
+            help="Upper bound for DTO_CPU_SIZE_FRACTION search range (inclusive).",
+        )),
+        ("--cpu-fraction-step", dict(
+            type=float,
+            default=0.05,
+            help="Step size used to snap DTO_CPU_SIZE_FRACTION trial values.",
+        )),
+    ]
+
+    cli = argparse.ArgumentParser(
+        description="Bayesian optimization tool for DTO parameters using skopt."
+    )
+    # Register in table order; argparse lists options in the order they are added.
+    for flag, settings in option_table:
+        cli.add_argument(flag, **settings)
+
+    return cli.parse_args()