Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
c6af872
diffuse: inline structural convolution matrices for 45.631 CPU execut…
kofa73 Mar 7, 2026
4739dc2
diffuse: scalar gradient fetching optimization for 40.958s execution sum
kofa73 Mar 7, 2026
488bd4c
diffuse: spatial lookup loop unroll for 40.024s execution sum
kofa73 Mar 7, 2026
0264f6f
housekeeping files
kofa73 Mar 7, 2026
a2931bf
diffuse: eliminate intermediate neighbour pixel arrays for 39.215s ex…
kofa73 Mar 7, 2026
5caec26
diffuse: squared-magnitude angle computation for 38.847s execution sum
kofa73 Mar 7, 2026
d2823b7
diffuse: paired convolution loops for shared angle data for 36.877s e…
kofa73 Mar 7, 2026
40744c8
Add diffuse integration tests and anisotropy code path documentation
kofa73 Mar 7, 2026
712a1e1
diffuse: unroll derivative accumulation loop for 36.231s execution sum
kofa73 Mar 7, 2026
1b85251
diffuse: merge variance regularization into output loop for 35.972s e…
kofa73 Mar 7, 2026
5f1be81
diffuse: remove 0.5f scaling from gradient/laplacian, absorb into hal…
kofa73 Mar 7, 2026
d2c2838
diffuse: record failed shared diag intermediates experiment
kofa73 Mar 7, 2026
b569669
diffuse: record failed pre-halve cs experiment and sync tracking files
kofa73 Mar 7, 2026
bca42b9
diffuse: record failed a11+a22 identity, neg_magnitude, center reuse,…
kofa73 Mar 7, 2026
3cc8707
diffuse: reorder sums loop before angle loop for L1 cache locality, y…
kofa73 Mar 8, 2026
740a0ef
diffuse: record failed fast-math pragma and fused dt_vector_exp exper…
kofa73 Mar 8, 2026
de4a40f
perf: Reduced baseline from 35.224s to 33.355s (~5.3% drop). Fused an…
kofa73 Mar 9, 2026
0860c8f
diffuse: removed unrelated files
kofa73 Mar 11, 2026
d54c23c
diffuse: updated progress
kofa73 Mar 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 121 additions & 0 deletions benchmark_diffuse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
#!/usr/bin/env python3
import subprocess
import re
import sys
import argparse
import glob
import os

def run_test(mode="cpu"):
    """Run one darktable-cli export and collect the diffuse-module timings.

    mode: "cpu" adds --disable-opencl and matches "on CPU" perf lines;
    any other value leaves OpenCL enabled and matches "on GPU" lines.
    Returns a list of floats (seconds), one per matching perf log line.
    """
    head = [
        os.path.expanduser("~/darktable-build/bin/darktable-cli"),
        "/workspace/darktable/diffuse-perf-test-files/DSC_9034.NEF",
        "/workspace/darktable/diffuse-perf-test-files/DSC_9034.NEF.xmp",
        "/workspace/darktable/diffuse-perf-test-files",
        "--core",
    ]
    tail = ["-d", "perf", "-d", "opencl", "--configdir", "/tmp/darktable-perftest/"]
    on_cpu = mode == "cpu"
    cmd = head + (["--disable-opencl"] if on_cpu else []) + tail

    result = subprocess.run(cmd, capture_output=True, text=True)

    marker = "on CPU" if on_cpu else "on GPU"
    timings = []
    for out_line in result.stdout.splitlines():
        if "processed `diffuse" not in out_line or marker not in out_line:
            continue
        match = re.search(r"took (\d+\.\d+) secs", out_line)
        if match:
            timings.append(float(match.group(1)))

    # Delete exported JPEGs so the next run starts from a clean directory.
    for jpg in glob.glob("/workspace/darktable/diffuse-perf-test-files/*.jpg"):
        try:
            os.remove(jpg)
        except OSError:
            pass

    return timings

def main():
    """Repeatedly benchmark the diffuse module, tracking the best per-pass times.

    Runs darktable-cli in a loop and keeps the element-wise minimum of the
    three diffuse timings observed so far.  Stops once 5 consecutive runs
    pass without a >= 0.1% improvement in the best sum.  Fails fast on the
    first run if its sum is >10% worse than the baseline stored as the last
    line of diffuse-performance.log.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=['cpu', 'gpu'], default='cpu')
    args = parser.parse_args()

    global_mode = args.mode

    # Baseline = last line of the performance log; used only for the
    # first-iteration fail-fast check below.  Any read/parse error just
    # disables the check with a warning.
    baseline = None
    try:
        with open("/workspace/darktable/diffuse-performance.log", "r") as f:
            lines = f.read().strip().split('\n')
            if lines:
                baseline = float(lines[-1].strip())
    except Exception as e:
        print(f"Warning: Could not read baseline from diffuse-performance.log: {e}")

    # best_times holds the element-wise minimum over all runs so far.
    best_times = None
    best_sum_at_last_improvement = None

    run_idx = 1
    last_improved = 0

    print(f"Starting new performance measurement for {global_mode.upper()}...")
    while True:
        times = run_test(global_mode)

        # Each run is expected to yield exactly 3 diffuse timings; pad short
        # results with +inf so the missing entries can never count as a best.
        if len(times) != 3:
            print(f"Warning: Expected 3 diffuse times, got {len(times)}: {times}")
            if len(times) < 3:
                times.extend([float('inf')] * (3 - len(times)))

        if best_times is None:
            # First run seeds the best-times vector.
            best_times = times[:3]
            best_sum_at_last_improvement = sum(best_times)
            last_improved = run_idx

            times_str = ", ".join(f"{t:g}" for t in times[:3])
            best_str = ", ".join(f"{b:g}" for b in best_times)
            print(f"run {run_idx}: {times_str} -> best so far: {best_str}; last improved in run {last_improved}")

            # Fail fast: abort immediately if the very first measurement is
            # more than 10% slower than the recorded baseline.
            if baseline is not None:
                current_sum = sum(best_times)
                if current_sum > baseline * 1.10:
                    print(f"FAILED FAST: First iteration sum ({current_sum:.3f}s) is >10% worse than baseline ({baseline:.3f}s). Aborting.")
                    sys.exit(1)

            print("(we cannot stop, there was an improvement in one of the last 5 steps)")
        else:
            # Keep the per-element minimum of the three timings.
            improved_elements = False
            for i in range(3):
                if times[i] < best_times[i]:
                    best_times[i] = times[i]
                    improved_elements = True

            current_best_sum = sum(best_times)

            # Improvement is measured against the best sum at the last
            # *accepted* improvement, not against the previous run.
            percent_improvement = ((best_sum_at_last_improvement - current_best_sum) / best_sum_at_last_improvement) * 100.0

            # Only improvements of at least 0.1% reset the stop counter.
            if improved_elements and percent_improvement >= 0.1:
                last_improved = run_idx
                best_sum_at_last_improvement = current_best_sum

            times_str = ", ".join(f"{t:g}" for t in times[:3])
            best_str = ", ".join(f"{b:g}" for b in best_times)

            print(f"run {run_idx}: {times_str} -> best so far: {best_str}; last improved in run {last_improved}")

            # Stop once 5 consecutive runs have passed without a meaningful
            # (>= 0.1%) improvement.
            if run_idx - last_improved >= 5:
                print(f"(we can stop, there was an improvement of {percent_improvement:.3f}% in the last 5 steps, which is less than 0.1%)")
                break
            else:
                print(f"(we cannot stop, there was an improvement of {percent_improvement:.3f}% in the last 5 steps)")

        run_idx += 1

    best_sum = sum(best_times)
    best_str = ", ".join(f"{b:g}" for b in best_times)
    print(f"\nBenchmark result: sum({best_str}) = {best_sum:.3f}")

if __name__ == "__main__":
    main()
92 changes: 92 additions & 0 deletions benchmark_general_case.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#!/usr/bin/env python3
import subprocess
import re
import sys
import os
import glob

def run_test(xmp_name):
    """Export the test NEF with the general_<xmp_name> sidecar on the CPU.

    Returns the diffuse-module timings (floats, seconds) parsed from the
    perf log output of darktable-cli.
    """
    cmd = [
        os.path.expanduser("~/darktable-build/bin/darktable-cli"),
        "/workspace/darktable/diffuse-perf-test-files/DSC_9034.NEF",
        f"/workspace/darktable/diffuse-perf-test-files/general_{xmp_name}.xmp",
        "/workspace/darktable/diffuse-perf-test-files",
        "--core",
        "--disable-opencl",
        "-d", "perf",
        "--configdir", "/tmp/darktable-perftest/",
    ]

    proc = subprocess.run(cmd, capture_output=True, text=True)

    timings = []
    for out_line in proc.stdout.splitlines():
        if "processed `diffuse" not in out_line or "on CPU" not in out_line:
            continue
        match = re.search(r"took (\d+\.\d+) secs", out_line)
        if match:
            timings.append(float(match.group(1)))

    # Delete exported JPEGs so subsequent runs start clean.
    for jpg in glob.glob("/workspace/darktable/diffuse-perf-test-files/*.jpg"):
        try:
            os.remove(jpg)
        except OSError:
            pass

    return timings

def benchmark_case(case_name):
    """Benchmark one general test case, repeating until timings plateau.

    Re-runs the export until the best observed time has not improved by
    >= 0.1% for 5 consecutive runs, then returns the best time in seconds.

    Raises:
        RuntimeError: if the export fails 5 times in a row.  (The previous
        version retried forever, hanging the benchmark on a broken build.)
    """
    MAX_CONSECUTIVE_FAILURES = 5

    print(f"Benchmarking Test Case {case_name}...")
    best_time = None
    best_time_at_last_improvement = None
    run_idx = 1
    last_improved = 0
    consecutive_failures = 0

    while True:
        times = run_test(case_name)
        if not times:
            print(f"  run {run_idx}: FAILED")
            consecutive_failures += 1
            # Bail out instead of looping forever on a persistently
            # failing export.
            if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
                raise RuntimeError(
                    f"Test case {case_name} failed {consecutive_failures} runs in a row; aborting"
                )
            run_idx += 1
            continue
        consecutive_failures = 0

        t = times[0]
        if best_time is None:
            # First successful run seeds the baseline.
            best_time = t
            best_time_at_last_improvement = t
            last_improved = run_idx
            print(f"  run {run_idx}: {t:.3f} -> best so far: {best_time:.3f}; last improved in run {last_improved}")
        else:
            improved = False
            if t < best_time:
                best_time = t
                improved = True

            # Improvement relative to the best recorded at the last
            # *accepted* improvement, not the previous run.
            percent_improvement = ((best_time_at_last_improvement - best_time) / best_time_at_last_improvement) * 100.0

            # Only improvements of at least 0.1% reset the stop counter.
            if improved and percent_improvement >= 0.1:
                last_improved = run_idx
                best_time_at_last_improvement = best_time

            print(f"  run {run_idx}: {t:.3f} -> best so far: {best_time:.3f}; last improved in run {last_improved}")

            # Stop after 5 runs without a meaningful improvement.
            if run_idx - last_improved >= 5:
                print(f"  (stopping for {case_name}, improvement {percent_improvement:.3f}% < 0.1% in last 5 runs)")
                break

        run_idx += 1
    return best_time

def main():
    """Run the three general-case benchmarks and print a summary table."""
    cases = ["G1", "G2", "G3"]
    results = {name: benchmark_case(name) for name in cases}

    total = sum(results.values())
    print("\n--- General Case Benchmark Results ---")
    for name in cases:
        print(f"Test Case {name}: {results[name]:.3f}s")
    print(f"TOTAL SUM: {total:.3f}s")

if __name__ == "__main__":
    main()
92 changes: 92 additions & 0 deletions benchmark_presets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#!/usr/bin/env python3
import subprocess
import re
import sys
import os
import glob

def run_test(xmp_name):
    """Export the test NEF with the preset_<xmp_name> sidecar on the CPU.

    Returns the diffuse-module timings (floats, seconds) parsed from the
    perf log output of darktable-cli.
    """
    cmd = [
        os.path.expanduser("~/darktable-build/bin/darktable-cli"),
        "/workspace/darktable/diffuse-perf-test-files/DSC_9034.NEF",
        f"/workspace/darktable/diffuse-perf-test-files/preset_{xmp_name}.xmp",
        "/workspace/darktable/diffuse-perf-test-files",
        "--core",
        "--disable-opencl",
        "-d", "perf",
        "--configdir", "/tmp/darktable-perftest/",
    ]

    proc = subprocess.run(cmd, capture_output=True, text=True)

    timings = []
    for out_line in proc.stdout.splitlines():
        if "processed `diffuse" not in out_line or "on CPU" not in out_line:
            continue
        match = re.search(r"took (\d+\.\d+) secs", out_line)
        if match:
            timings.append(float(match.group(1)))

    # Delete exported JPEGs so subsequent runs start clean.
    for jpg in glob.glob("/workspace/darktable/diffuse-perf-test-files/*.jpg"):
        try:
            os.remove(jpg)
        except OSError:
            pass

    return timings

def benchmark_case(case_name):
    """Benchmark one preset, repeating until timings plateau.

    Re-runs the export until the best observed time has not improved by
    >= 0.1% for 5 consecutive runs, then returns the best time in seconds.

    Raises:
        RuntimeError: if the export fails 5 times in a row.  (The previous
        version retried forever, hanging the benchmark on a broken build.)
    """
    MAX_CONSECUTIVE_FAILURES = 5

    print(f"Benchmarking Preset {case_name}...")
    best_time = None
    best_time_at_last_improvement = None
    run_idx = 1
    last_improved = 0
    consecutive_failures = 0

    while True:
        times = run_test(case_name)
        if not times:
            print(f"  run {run_idx}: FAILED")
            consecutive_failures += 1
            # Bail out instead of looping forever on a persistently
            # failing export.
            if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
                raise RuntimeError(
                    f"Preset {case_name} failed {consecutive_failures} runs in a row; aborting"
                )
            run_idx += 1
            continue
        consecutive_failures = 0

        t = times[0]
        if best_time is None:
            # First successful run seeds the baseline.
            best_time = t
            best_time_at_last_improvement = t
            last_improved = run_idx
            print(f"  run {run_idx}: {t:.3f} -> best so far: {best_time:.3f}; last improved in run {last_improved}")
        else:
            improved = False
            if t < best_time:
                best_time = t
                improved = True

            # Improvement relative to the best recorded at the last
            # *accepted* improvement, not the previous run.
            percent_improvement = ((best_time_at_last_improvement - best_time) / best_time_at_last_improvement) * 100.0

            # Only improvements of at least 0.1% reset the stop counter.
            if improved and percent_improvement >= 0.1:
                last_improved = run_idx
                best_time_at_last_improvement = best_time

            print(f"  run {run_idx}: {t:.3f} -> best so far: {best_time:.3f}; last improved in run {last_improved}")

            # Stop after 5 runs without a meaningful improvement.
            if run_idx - last_improved >= 5:
                print(f"  (stopping for {case_name}, improvement {percent_improvement:.3f}% < 0.1% in last 5 runs)")
                break

        run_idx += 1
    return best_time

def main():
    """Benchmark every preset XMP and print a summary table."""
    presets = ["deblur_medium", "denoise_medium", "local_contrast_normal", "sharpness_fast", "bloom"]
    results = {name: benchmark_case(name) for name in presets}

    total = sum(results.values())
    print("\n--- Preset Benchmark Results ---")
    for name in presets:
        print(f"Preset {name}: {results[name]:.3f}s")
    print(f"TOTAL SUM: {total:.3f}s")

if __name__ == "__main__":
    main()
Loading
Loading