diff --git a/.gitignore b/.gitignore
index 718829be..b64dee42 100644
--- a/.gitignore
+++ b/.gitignore
@@ -56,3 +56,6 @@ imgui.ini
*.npz
/cmake-build-*
*.pyc
+/.cmake
+/CMakeFiles
+/.debug
diff --git a/.gitmodules b/.gitmodules
index be28e636..fbd707c9 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,9 +4,9 @@
[submodule "external/glfw"]
path = external/glfw
url = https://github.com/glfw/glfw.git
-[submodule "external/glad"]
- path = external/glad
- url = https://github.com/Dav1dde/glad.git
[submodule "external/imgui"]
path = external/imgui
url = https://github.com/ocornut/imgui.git
+[submodule "Python/external/shaderc"]
+ path = Python/external/shaderc
+ url = https://github.com/google/shaderc
diff --git a/.idea/TensorFrost.iml b/.idea/TensorFrost.iml
index 1837251a..83f477b5 100644
--- a/.idea/TensorFrost.iml
+++ b/.idea/TensorFrost.iml
@@ -2,7 +2,7 @@
-
+
\ No newline at end of file
diff --git a/.idea/copilot.data.migration.agent.xml b/.idea/copilot.data.migration.agent.xml
new file mode 100644
index 00000000..4ea72a91
--- /dev/null
+++ b/.idea/copilot.data.migration.agent.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/editor.xml b/.idea/editor.xml
index 6692539a..4d514864 100644
--- a/.idea/editor.xml
+++ b/.idea/editor.xml
@@ -1,302 +1,60 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
-
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
+
+
-
-
+
@@ -304,6 +62,11 @@
+
+
+
+
+
@@ -315,28 +78,29 @@
+
+
+
+
+
+
+
-
-
-
+
+
+
+
+
-
+
-
-
+
+
+
-
-
-
-
-
-
-
-
-
-
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
index a8974b61..a48f14df 100644
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@@ -2,7 +2,6 @@
-
diff --git a/.run/TensorFrost.run.xml b/.run/TensorFrost.run.xml
index 97e80504..61d7af64 100644
--- a/.run/TensorFrost.run.xml
+++ b/.run/TensorFrost.run.xml
@@ -1,5 +1,5 @@
-
+
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 00000000..f15737e5
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,10 @@
+# Agent Guide
+
+Follow these expectations whenever you work in this repository:
+
+1. **Full rebuild & virtual environment** — Run `setup_python_env.cmd` from the repo root. It configures the Python virtual environment and performs a clean rebuild so you start from a consistent state.
+2. **Partial rebuilds** — Use CMake for incremental builds. Invoke the appropriate CMake build command (for example, `cmake --build <build-dir> --target <target>`) to rebuild only what you need.
+3. **C++ changes** — Any edits under `TensorFrost/` or other C++ sources require a rebuild before the changes take effect.
+4. **Python script changes** — After editing Python scripts, run them to confirm they work correctly. No recompilation is needed.
+5. **API validation** — After modifying functionality, run the relevant tests in the `tests/` folder to confirm the Python API still behaves as expected.
+6. **Scenario validation** — Run the sample programs in the `examples/` folder to make sure the updated stack handles more complex end-to-end flows.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3c927cf8..09010907 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -34,10 +34,11 @@ set(GLFW_BUILD_TESTS OFF CACHE BOOL "" FORCE)
set(GLFW_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)
set(PYBIND11_FINDPYTHON ON)
+find_package(Vulkan REQUIRED)
+
add_subdirectory(external/pybind11)
add_subdirectory(external/glfw)
-add_subdirectory(external/glad/cmake)
add_subdirectory(TensorFrost)
add_subdirectory(examples)
-set_property(DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT TensorFrost)
\ No newline at end of file
+set_property(DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT TensorFrost)
diff --git a/ProtoIR.txt b/ProtoIR.txt
new file mode 100644
index 00000000..ac6519dc
--- /dev/null
+++ b/ProtoIR.txt
@@ -0,0 +1,16 @@
+Node = [name, arguments, attributes]
+
+
+n = input_dim(attributes{type=int32, input_index=0, dim_index=0})
+a = input(args{shape=[n]}, attributes{type=float32, input_index=0})
+b = sin(args{input=[a], shape=[n]}, attributes{type=float32})
+c = load(args{input=[b], indices=[0]}, attributes{type=float32})
+d = average(args{input=[b]}, attributes{type=float32})
+res = div(args{input=[d,c]}, attributes{type=float32})
+ids = parallel(args{shape=[n, n]}, attributes{type=tuple}) {
+ i = unpack_tuple(args{input=[ids,0]}, attributes{type=int32})
+ j = unpack_tuple(args{input=[ids,1]}, attributes{type=int32})
+ a0 = load(args{input=[b], indices=[i]}, attributes{type=float32})
+ a1 = load(args{input=[b], indices=[j]}, attributes{type=float32})
+ outer = mul(args{input=[a0, a1]}, attributes{type=float32, output_index=0})
+}
\ No newline at end of file
diff --git a/Python/TensorFrost/__init__.py b/Python/TensorFrost/__init__.py
index 3fddb9ae..ce11ec6e 100644
--- a/Python/TensorFrost/__init__.py
+++ b/Python/TensorFrost/__init__.py
@@ -4,7 +4,6 @@
from . import regularizers
from . import clipping
from . import random
-from . import sort
from .default import *
# def compile(func):
diff --git a/Python/TensorFrost/clipping.py b/Python/TensorFrost/clipping.py
index b343917e..7c79851d 100644
--- a/Python/TensorFrost/clipping.py
+++ b/Python/TensorFrost/clipping.py
@@ -1,5 +1,5 @@
-from .optimizers import *
-
-clamp = ModuleOptimizer.ClippingType.Clamp
-norm = ModuleOptimizer.ClippingType.Norm
-none = ModuleOptimizer.ClippingType.None_
\ No newline at end of file
+# from .optimizers import *
+#
+# clamp = ModuleOptimizer.ClippingType.Clamp
+# norm = ModuleOptimizer.ClippingType.Norm
+# none = ModuleOptimizer.ClippingType.None_
\ No newline at end of file
diff --git a/Python/TensorFrost/default.py b/Python/TensorFrost/default.py
index 41853504..a015108c 100644
--- a/Python/TensorFrost/default.py
+++ b/Python/TensorFrost/default.py
@@ -1,14 +1,14 @@
-from . import TensorFrost as tf
-
-def zeros_like(tensor):
- return tf.zeros(tensor.shape, tensor.type)
-
-def eye(n):
- i, j = tf.indices([n, n])
- return tf.select(i == j, 1.0, 0.0)
-
-def eye_like(tensor):
- return eye(tensor.shape[0])
-
-def ones_like(tensor):
- return tf.ones(tensor.shape, tensor.type)
\ No newline at end of file
+# from . import TensorFrost as tf
+#
+# def zeros_like(tensor):
+# return tf.zeros(tensor.shape, tensor.type)
+#
+# def eye(n):
+# i, j = tf.indices([n, n])
+# return tf.select(i == j, 1.0, 0.0)
+#
+# def eye_like(tensor):
+# return eye(tensor.shape[0])
+#
+# def ones_like(tensor):
+# return tf.ones(tensor.shape, tensor.type)
\ No newline at end of file
diff --git a/Python/TensorFrost/optimizers.py b/Python/TensorFrost/optimizers.py
index 731ae81b..21b07ce4 100644
--- a/Python/TensorFrost/optimizers.py
+++ b/Python/TensorFrost/optimizers.py
@@ -1,219 +1,219 @@
-from . import TensorFrost as tf
-
-class ModuleOptimizer(tf.Module):
- class OptimizerType:
- ADAM = 0
- SGD = 1
- RMSProp = 2
-
- class RegularizerType:
- None_ = 0
- L1 = 1
- L2 = 2
-
- class ClippingType:
- Clamp = 0
- Norm = 1
- None_ = 2
-
- def __init__(self, optimizer_type, regularizer_type, net, params):
- super().__init__()
- self.optimizer_type = optimizer_type
- self.regularizer_type = regularizer_type
- self.clipping_type = self.ClippingType.Clamp
- self.epsilon = 1e-8
-
- # Set passed parameters as attributes
- self.net = net
- for k, v in params.items():
- setattr(self, k, v)
-
- # Initialize t
- t = tf.Parameter([1], tf.float32, False) # mimic Parameter({1}, TFType::Float, false)
- self.t = t
-
- self.initializeOptimizer(net)
-
- def set_clipping_type(self, ctype):
- self.clipping_type = ctype
-
- def initializeOptimizer(self, net):
- net_params = net.parameters()
- requires_grads = net.requires_grads_list()
-
- if self.optimizer_type == self.OptimizerType.ADAM:
- self.initializeParameterArray("m", net_params, requires_grads)
- self.initializeParameterArray("v", net_params, requires_grads)
- elif self.optimizer_type == self.OptimizerType.SGD:
- # No additional parameters needed
- pass
- elif self.optimizer_type == self.OptimizerType.RMSProp:
- self.initializeParameterArray("v", net_params, requires_grads)
-
- def initializeParameterArray(self, name, net_params, requires_grads):
- arr = tf.ParameterArray()
-
- for i, param in enumerate(net_params):
- if not requires_grads[i]:
- continue
-
- new_param = tf.Parameter(param.shape, tf.float32, False)
- arr[i] = new_param
-
- setattr(self, name, arr)
-
- def assert_parameters(self):
- net_params = self.net.parameters()
- requires_grads = self.net.requires_grads_list()
- self.assertParameterArray("m", net_params, requires_grads)
- self.assertParameterArray("v", net_params, requires_grads)
-
- def gradient_norm(self, grad):
- # sum of squares
- g = grad * grad
- shape = grad.shape
- num_dims = len(shape)
- for i in range(num_dims):
- g = tf.sum(g)
- return tf.sqrt(g)
-
- def assertParameterArray(self, name, net_params, requires_grads):
- if hasattr(self, name):
- arr = getattr(self, name)
- for i, param in enumerate(net_params):
- if not requires_grads[i]:
- continue
- arr_item = arr[i]
- arr_item = tf.assert_tensor(arr_item, param.shape, param.type)
- arr[i] = arr_item
-
- def step(self, *args):
- # Overloaded step:
- # step(X, Y) or step(loss)
- if len(args) == 2:
- X, Y = args
- loss = self.net.loss(X, Y)
- self._step(loss)
- return loss
- elif len(args) == 1:
- (loss,) = args
- self._step(loss)
- else:
- raise ValueError("Invalid arguments to step")
-
- def _step(self, loss):
- # Increment t by 1
- self.t = self.t + 1.0
-
- net = self.net
- net_params = net.parameters()
- requires_grads = net.requires_grads_list()
-
- learning_rate = self.learning_rate
- grad_clip = self.grad_clip
- has_clip = isinstance(grad_clip, float) and grad_clip > 0.0
-
- for i, param in enumerate(net_params):
- if not requires_grads[i]:
- continue
-
- grad = tf.grad(loss, param)
- if has_clip:
- if self.clipping_type == self.ClippingType.Clamp:
- grad = tf.clamp(grad, -grad_clip, grad_clip)
- elif self.clipping_type == self.ClippingType.Norm:
- grad_norm = tf.max(1e-6, self.gradient_norm(grad))
- grad = grad * tf.min(1.0, grad_clip / grad_norm)
-
- if self.optimizer_type == self.OptimizerType.ADAM:
- update = self.adam_update(i, param, grad, self.t, learning_rate)
- elif self.optimizer_type == self.OptimizerType.SGD:
- update = self.sgd_update(param, grad, learning_rate)
- elif self.optimizer_type == self.OptimizerType.RMSProp:
- update = self.rmsprop_update(i, param, grad, learning_rate)
- else:
- raise RuntimeError("Unknown optimizer type")
-
- # Apply regularization if needed
- if self.regularizer_type == self.RegularizerType.L1:
- param = param - learning_rate * self.reg * tf.sign(param)
- elif self.regularizer_type == self.RegularizerType.L2:
- param = param - learning_rate * self.reg * param
-
- # Update parameter with computed update
- param = param - update
- net_params[i] = param
-
- net.update_parameters(net_params)
-
- def adam_update(self, i, param, grad, t, learning_rate):
- beta1 = tf.float(self.beta1)
- beta2 = tf.float(self.beta2)
-
- m = self.m[i]
- v = self.v[i]
-
- m = tf.lerp(grad, m, beta1)
- v = tf.lerp(grad * grad, v, beta2)
-
- # t is a Parameter with shape [1]; get the scalar
- t_val = self.t[0]
- mhat = m / (1.0 - tf.pow(beta1, t_val))
- vhat = v / (1.0 - tf.pow(beta2, t_val))
-
- self.m[i] = m
- self.v[i] = v
-
- return learning_rate * mhat / (tf.sqrt(vhat) + self.epsilon)
-
- def sgd_update(self, param, grad, learning_rate):
- return learning_rate * grad
-
- def rmsprop_update(self, i, param, grad, learning_rate):
- decay = tf.float(self.decay)
-
- v = self.v[i]
- v = tf.lerp(grad * grad, v, decay)
- self.v[i] = v
-
- return (grad * learning_rate) / (tf.sqrt(v) + self.epsilon)
-
-
-def adam(net, reg_type=ModuleOptimizer.RegularizerType.None_, learning_rate=0.001, beta1=0.9, beta2=0.999, clip=0.0, reg=0.0):
- return ModuleOptimizer(
- ModuleOptimizer.OptimizerType.ADAM,
- reg_type,
- net,
- {
- "learning_rate": learning_rate,
- "beta1": beta1,
- "beta2": beta2,
- "grad_clip": clip,
- "reg": reg,
- }
- )
-
-def sgd(net, reg_type=ModuleOptimizer.RegularizerType.None_, learning_rate=0.001, clip=0.0, reg=0.0):
- return ModuleOptimizer(
- ModuleOptimizer.OptimizerType.SGD,
- reg_type,
- net,
- {
- "learning_rate": learning_rate,
- "grad_clip": clip,
- "reg": reg,
- }
- )
-
-def rmsprop(net, reg_type=ModuleOptimizer.RegularizerType.None_, learning_rate=0.001, decay=0.9, clip=0.0, reg=0.0):
- return ModuleOptimizer(
- ModuleOptimizer.OptimizerType.RMSProp,
- reg_type,
- net,
- {
- "learning_rate": learning_rate,
- "decay": decay,
- "grad_clip": clip,
- "reg": reg,
- }
- )
\ No newline at end of file
+# from . import TensorFrost as tf
+#
+# class ModuleOptimizer(tf.Module):
+# class OptimizerType:
+# ADAM = 0
+# SGD = 1
+# RMSProp = 2
+#
+# class RegularizerType:
+# None_ = 0
+# L1 = 1
+# L2 = 2
+#
+# class ClippingType:
+# Clamp = 0
+# Norm = 1
+# None_ = 2
+#
+# def __init__(self, optimizer_type, regularizer_type, net, params):
+# super().__init__()
+# self.optimizer_type = optimizer_type
+# self.regularizer_type = regularizer_type
+# self.clipping_type = self.ClippingType.Clamp
+# self.epsilon = 1e-8
+#
+# # Set passed parameters as attributes
+# self.net = net
+# for k, v in params.items():
+# setattr(self, k, v)
+#
+# # Initialize t
+# t = tf.Parameter([1], tf.float32, False) # mimic Parameter({1}, TFType::Float, false)
+# self.t = t
+#
+# self.initializeOptimizer(net)
+#
+# def set_clipping_type(self, ctype):
+# self.clipping_type = ctype
+#
+# def initializeOptimizer(self, net):
+# net_params = net.parameters()
+# requires_grads = net.requires_grads_list()
+#
+# if self.optimizer_type == self.OptimizerType.ADAM:
+# self.initializeParameterArray("m", net_params, requires_grads)
+# self.initializeParameterArray("v", net_params, requires_grads)
+# elif self.optimizer_type == self.OptimizerType.SGD:
+# # No additional parameters needed
+# pass
+# elif self.optimizer_type == self.OptimizerType.RMSProp:
+# self.initializeParameterArray("v", net_params, requires_grads)
+#
+# def initializeParameterArray(self, name, net_params, requires_grads):
+# arr = tf.ParameterArray()
+#
+# for i, param in enumerate(net_params):
+# if not requires_grads[i]:
+# continue
+#
+# new_param = tf.Parameter(param.shape, tf.float32, False)
+# arr[i] = new_param
+#
+# setattr(self, name, arr)
+#
+# def assert_parameters(self):
+# net_params = self.net.parameters()
+# requires_grads = self.net.requires_grads_list()
+# self.assertParameterArray("m", net_params, requires_grads)
+# self.assertParameterArray("v", net_params, requires_grads)
+#
+# def gradient_norm(self, grad):
+# # sum of squares
+# g = grad * grad
+# shape = grad.shape
+# num_dims = len(shape)
+# for i in range(num_dims):
+# g = tf.sum(g)
+# return tf.sqrt(g)
+#
+# def assertParameterArray(self, name, net_params, requires_grads):
+# if hasattr(self, name):
+# arr = getattr(self, name)
+# for i, param in enumerate(net_params):
+# if not requires_grads[i]:
+# continue
+# arr_item = arr[i]
+# arr_item = tf.assert_tensor(arr_item, param.shape, param.type)
+# arr[i] = arr_item
+#
+# def step(self, *args):
+# # Overloaded step:
+# # step(X, Y) or step(loss)
+# if len(args) == 2:
+# X, Y = args
+# loss = self.net.loss(X, Y)
+# self._step(loss)
+# return loss
+# elif len(args) == 1:
+# (loss,) = args
+# self._step(loss)
+# else:
+# raise ValueError("Invalid arguments to step")
+#
+# def _step(self, loss):
+# # Increment t by 1
+# self.t = self.t + 1.0
+#
+# net = self.net
+# net_params = net.parameters()
+# requires_grads = net.requires_grads_list()
+#
+# learning_rate = self.learning_rate
+# grad_clip = self.grad_clip
+# has_clip = isinstance(grad_clip, float) and grad_clip > 0.0
+#
+# for i, param in enumerate(net_params):
+# if not requires_grads[i]:
+# continue
+#
+# grad = tf.grad(loss, param)
+# if has_clip:
+# if self.clipping_type == self.ClippingType.Clamp:
+# grad = tf.clamp(grad, -grad_clip, grad_clip)
+# elif self.clipping_type == self.ClippingType.Norm:
+# grad_norm = tf.max(1e-6, self.gradient_norm(grad))
+# grad = grad * tf.min(1.0, grad_clip / grad_norm)
+#
+# if self.optimizer_type == self.OptimizerType.ADAM:
+# update = self.adam_update(i, param, grad, self.t, learning_rate)
+# elif self.optimizer_type == self.OptimizerType.SGD:
+# update = self.sgd_update(param, grad, learning_rate)
+# elif self.optimizer_type == self.OptimizerType.RMSProp:
+# update = self.rmsprop_update(i, param, grad, learning_rate)
+# else:
+# raise RuntimeError("Unknown optimizer type")
+#
+# # Apply regularization if needed
+# if self.regularizer_type == self.RegularizerType.L1:
+# param = param - learning_rate * self.reg * tf.sign(param)
+# elif self.regularizer_type == self.RegularizerType.L2:
+# param = param - learning_rate * self.reg * param
+#
+# # Update parameter with computed update
+# param = param - update
+# net_params[i] = param
+#
+# net.update_parameters(net_params)
+#
+# def adam_update(self, i, param, grad, t, learning_rate):
+# beta1 = tf.float(self.beta1)
+# beta2 = tf.float(self.beta2)
+#
+# m = self.m[i]
+# v = self.v[i]
+#
+# m = tf.lerp(grad, m, beta1)
+# v = tf.lerp(grad * grad, v, beta2)
+#
+# # t is a Parameter with shape [1]; get the scalar
+# t_val = self.t[0]
+# mhat = m / (1.0 - tf.pow(beta1, t_val))
+# vhat = v / (1.0 - tf.pow(beta2, t_val))
+#
+# self.m[i] = m
+# self.v[i] = v
+#
+# return learning_rate * mhat / (tf.sqrt(vhat) + self.epsilon)
+#
+# def sgd_update(self, param, grad, learning_rate):
+# return learning_rate * grad
+#
+# def rmsprop_update(self, i, param, grad, learning_rate):
+# decay = tf.float(self.decay)
+#
+# v = self.v[i]
+# v = tf.lerp(grad * grad, v, decay)
+# self.v[i] = v
+#
+# return (grad * learning_rate) / (tf.sqrt(v) + self.epsilon)
+#
+#
+# def adam(net, reg_type=ModuleOptimizer.RegularizerType.None_, learning_rate=0.001, beta1=0.9, beta2=0.999, clip=0.0, reg=0.0):
+# return ModuleOptimizer(
+# ModuleOptimizer.OptimizerType.ADAM,
+# reg_type,
+# net,
+# {
+# "learning_rate": learning_rate,
+# "beta1": beta1,
+# "beta2": beta2,
+# "grad_clip": clip,
+# "reg": reg,
+# }
+# )
+#
+# def sgd(net, reg_type=ModuleOptimizer.RegularizerType.None_, learning_rate=0.001, clip=0.0, reg=0.0):
+# return ModuleOptimizer(
+# ModuleOptimizer.OptimizerType.SGD,
+# reg_type,
+# net,
+# {
+# "learning_rate": learning_rate,
+# "grad_clip": clip,
+# "reg": reg,
+# }
+# )
+#
+# def rmsprop(net, reg_type=ModuleOptimizer.RegularizerType.None_, learning_rate=0.001, decay=0.9, clip=0.0, reg=0.0):
+# return ModuleOptimizer(
+# ModuleOptimizer.OptimizerType.RMSProp,
+# reg_type,
+# net,
+# {
+# "learning_rate": learning_rate,
+# "decay": decay,
+# "grad_clip": clip,
+# "reg": reg,
+# }
+# )
\ No newline at end of file
diff --git a/Python/TensorFrost/random.py b/Python/TensorFrost/random.py
index de3d0ad0..ab91f028 100644
--- a/Python/TensorFrost/random.py
+++ b/Python/TensorFrost/random.py
@@ -1,45 +1,45 @@
-from . import TensorFrost as tf
-
-def randn2(shape, seed=0):
- #Box-Muller transform
- r1 = tf.random_value(shape, seed=seed)
- r2 = tf.random_value(shape, seed=tf.hash(seed))
- rho = tf.sqrt(-2.0*tf.log(tf.max(1e-6, r1)))
- theta = 2.0*tf.pi*r2
- return rho*tf.cos(theta), rho*tf.sin(theta)
-
-def randn(shape, seed=0):
- return randn2(shape, seed=seed)[0]
-
-def rand(shape, seed=0):
- return tf.random_value(shape, seed=seed)
-
-def randn_like(tensor, seed=0):
- return randn(tensor.shape, seed=seed)
-
-def rand_like(tensor, seed=0):
- return rand(tensor.shape, seed=seed)
-
-def rand_int(seed, max_value):
- return tf.int(tf.pcg(tf.uint(seed)) % tf.uint(max_value))
-
-def xor_swap(idx, n, seed):
- xor_seed = rand_int(seed, n)
- xor_idx = (idx ^ xor_seed)
- max_idx = tf.max(idx, xor_idx)
- min_idx = tf.min(idx, xor_idx)
- swap = rand_int(min_idx * 451 + seed, 2) == 0
- return tf.select(swap & (max_idx < n), xor_idx, idx)
-
-def reverse(idx, n):
- return n - 1 - idx
-
-def shuffle(idx, n, seed = 0, iters = 16):
- for i in range(iters):
- idx = xor_swap(idx, n, seed + i)
- idx = reverse(idx, n)
- return idx
-
-def permutation(n, seed = 0):
- idx = tf.indices([n])[0]
- return shuffle(idx, n, seed)
\ No newline at end of file
+# from . import TensorFrost as tf
+#
+# def randn2(shape, seed=0):
+# #Box-Muller transform
+# r1 = tf.random_value(shape, seed=seed)
+# r2 = tf.random_value(shape, seed=tf.hash(seed))
+# rho = tf.sqrt(-2.0*tf.log(tf.max(1e-6, r1)))
+# theta = 2.0*tf.pi*r2
+# return rho*tf.cos(theta), rho*tf.sin(theta)
+#
+# def randn(shape, seed=0):
+# return randn2(shape, seed=seed)[0]
+#
+# def rand(shape, seed=0):
+# return tf.random_value(shape, seed=seed)
+#
+# def randn_like(tensor, seed=0):
+# return randn(tensor.shape, seed=seed)
+#
+# def rand_like(tensor, seed=0):
+# return rand(tensor.shape, seed=seed)
+#
+# def rand_int(seed, max_value):
+# return tf.int(tf.pcg(tf.uint(seed)) % tf.uint(max_value))
+#
+# def xor_swap(idx, n, seed):
+# xor_seed = rand_int(seed, n)
+# xor_idx = (idx ^ xor_seed)
+# max_idx = tf.max(idx, xor_idx)
+# min_idx = tf.min(idx, xor_idx)
+# swap = rand_int(min_idx * 451 + seed, 2) == 0
+# return tf.select(swap & (max_idx < n), xor_idx, idx)
+#
+# def reverse(idx, n):
+# return n - 1 - idx
+#
+# def shuffle(idx, n, seed = 0, iters = 16):
+# for i in range(iters):
+# idx = xor_swap(idx, n, seed + i)
+# idx = reverse(idx, n)
+# return idx
+#
+# def permutation(n, seed = 0):
+# idx = tf.indices([n])[0]
+# return shuffle(idx, n, seed)
\ No newline at end of file
diff --git a/Python/TensorFrost/regularizers.py b/Python/TensorFrost/regularizers.py
index b384bcef..989767cc 100644
--- a/Python/TensorFrost/regularizers.py
+++ b/Python/TensorFrost/regularizers.py
@@ -1,5 +1,5 @@
-from .optimizers import *
-
-l1 = ModuleOptimizer.RegularizerType.L1
-l2 = ModuleOptimizer.RegularizerType.L2
-none = ModuleOptimizer.RegularizerType.None_
\ No newline at end of file
+# from .optimizers import *
+#
+# l1 = ModuleOptimizer.RegularizerType.L1
+# l2 = ModuleOptimizer.RegularizerType.L2
+# none = ModuleOptimizer.RegularizerType.None_
\ No newline at end of file
diff --git a/Python/TensorFrost/sort.py b/Python/TensorFrost/sort.py
index 4d585c64..4a236bb9 100644
--- a/Python/TensorFrost/sort.py
+++ b/Python/TensorFrost/sort.py
@@ -1,187 +1,516 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from importlib import resources
+from typing import Dict, Optional, Tuple
+
+import numpy as np
+
from . import TensorFrost as tf
-#in-place bitonic sort
-def bitonic(keys, values = None):
- tf.region_begin('Bitonic sort')
- keys = tf.copy(keys)
- if values is not None:
- values = tf.copy(values)
- element_count = keys.shape[0]
- log2_count = tf.int(tf.ceil(tf.log2(tf.float(element_count))))
- count_round = 1 << log2_count
- idx = tf.indices([count_round / 2])[0]
- with tf.loop(log2_count) as k:
- with tf.loop(k+1) as j:
- s = 1 << (k-j)
- m_inner = s - 1
- m_outer = ~m_inner
- m_xor = s + tf.select(j == 0, m_inner, 0)
-
- id1 = (2 * (idx & m_outer) + (idx & m_inner))
- id2 = id1 ^ m_xor
- key1, key2 = keys[id1], keys[id2]
- with tf.if_cond((key1 >= key2) & (id1 < element_count) & (id2 < element_count)):
- if values is not None:
- val1, val2 = values[id1], values[id2]
- values[id1] = val2
- values[id2] = val1
- keys[id1] = key2
- keys[id2] = key1
-
- tf.region_end('Bitonic sort')
- if values is not None:
- return keys, values
- else:
- return keys
-
-#histogram radix sort
-def radix(keys, values = None, bits_per_pass = 6, max_bits = 32):
- def prefix_sum_grouped(A, axis = -1):
- axis = len(A.shape) + axis if axis < 0 else axis
- group_size = 64
- grouped = tf.split_dim(A, group_size, axis)
- group_scan = tf.prefix_sum(tf.sum(grouped, axis = axis + 1), axis = axis)
- ids = grouped.indices
- gid, eid = ids[axis], ids[axis + 1]
- ids = [ids[i] for i in range(len(ids)) if i != axis + 1]
- ids[axis] = gid - 1
- group_scan = tf.prefix_sum(grouped + tf.select((gid == 0) | (eid != 0), tf.uint(0), group_scan[tuple(ids)]), axis = axis + 1)
- full_scan = tf.merge_dim(group_scan, target_size = A.shape[axis], axis = axis + 1)
- return full_scan
-
- sign_bit = ~tf.uint(0x7FFFFFFF)
-
- def map_float_to_uint(x):
- # Convert float to uint representation
- ux = tf.asuint(x)
- # Compute mask
- mask = tf.select((ux >> 31) == 1, ~tf.uint(0), sign_bit)
- # Apply XOR
- return ux ^ mask
-
- def map_uint_to_float(x):
- # Compute mask
- mask = tf.select((x >> 31) == 0, ~tf.uint(0), sign_bit)
- # Apply XOR and convert back to float
- return tf.asfloat(x ^ mask)
-
- def map_int_to_uint(x):
- return tf.asuint(x) ^ sign_bit
-
- def map_uint_to_int(x):
- return tf.asint(x ^ sign_bit)
-
- tf.region_begin('Radix sort')
-
- has_values = values is not None
-
- keys = tf.copy(keys)
- if has_values:
- values = tf.copy(values)
-
- original_type = keys.type
- if(original_type == tf.float32):
- keys = map_float_to_uint(keys)
-
- if(original_type == tf.int32):
- keys = map_int_to_uint(keys)
-
- iters = (max_bits + bits_per_pass - 1) // bits_per_pass
- group_size = 128
- histogram_size = 2 ** bits_per_pass
-
- def GetBits(A, i):
- return (A >> (i * bits_per_pass)) & tf.uint(histogram_size - 1)
-
- keys1 = tf.buffer(keys.shape, keys.type)
- values1 = None
-
- if has_values:
- values1 = tf.buffer(values.shape, values.type)
-
- with tf.loop(iters // 2) as iter:
- def SortIteration(keys_in, keys_out, values_in, values_out, iter):
- tf.region_begin('Radix sort iteration')
- grouped = tf.split_dim(GetBits(keys_in, iter), group_size)
-
- # Do a packed histogram, since we sum 128 elements at a time, we can pack 4 values into a single uint32
- g, e, i = tf.indices([grouped.shape[0], grouped.shape[1], tf.int(histogram_size/4)])
- this_key = grouped[g, e]
- packed_is_bit = (tf.uint(this_key == tf.uint(4*i))) + (tf.uint(this_key == tf.uint(4*i+1)) << 8) + (tf.uint(this_key == tf.uint(4*i+2)) << 16) + (tf.uint(this_key == tf.uint(4*i+3)) << 24)
- packed_is_bit = tf.select((g*group_size + e) < keys_in.shape[0], packed_is_bit, tf.uint(0))
- group_histogram_packed = tf.sum(packed_is_bit, axis = 1)
-
- g, i = tf.indices([grouped.shape[0], histogram_size])
- group_histogram = tf.uint((group_histogram_packed[g, i / 4] >> (8*(i % 4))) & tf.uint(0xFF))
-
- group_histogram_scan = prefix_sum_grouped(group_histogram, axis = 0)
- i, = tf.indices([histogram_size])
- total_bit_histogram = tf.prefix_sum(group_histogram_scan[group_histogram_scan.shape[0] - 1, i])
-
- with tf.kernel(grouped.shape, group_size=[group_size]) as (g, e):
- if(tf.current_backend() == tf.cpu): #dont use group barriers on CPU - doesn't work
- element = g * group_size + e
- with tf.if_cond(element < keys_in.shape[0]):
- old_key = keys_in[element]
- old_val = values_in[element]
- bit = GetBits(old_key, iter)
- total_offset = tf.select(g == 0, tf.uint(0), group_histogram_scan[g - 1, bit]) + tf.select(bit == tf.uint(0), tf.uint(0), total_bit_histogram[bit - tf.uint(1)])
- with tf.loop(e) as j:
- total_offset.val += tf.uint(grouped[g, j] == bit)
- keys_out[total_offset] = old_key
- values_out[total_offset] = old_val
- else:
- temp = tf.group_buffer(group_size, tf.uint32)
- half_count = tf.group_buffer(histogram_size, tf.uint32)
- gtid = g.block_thread_index(0)
-
- #initialize counters
- for i in range((histogram_size + group_size - 1) // group_size):
- index = gtid + i * group_size
- with tf.if_cond(index < histogram_size):
- half_count[index] = 0
- tf.group_barrier()
-
- element = g * group_size + e
- with tf.if_cond(element < keys_in.shape[0]):
- old_key = keys_in[element]
- bit = GetBits(old_key, iter)
- temp[gtid] = bit
-
- #count number of bits set in previous sub groups
- quarter_index = e / (group_size // 4)
- with tf.if_cond(quarter_index < 3):
- tf.scatterAdd(half_count[bit], tf.uint(quarter_index < 1) | (tf.uint(quarter_index < 2) << 8) | (tf.uint(quarter_index < 3) << 16))
-
- tf.group_barrier()
-
- if has_values:
- old_val = values_in[element]
-
- total_offset = tf.select(g == 0, tf.uint(0), group_histogram_scan[g - 1, tf.int(bit)]) + tf.select(tf.int(bit) == 0, tf.uint(0), total_bit_histogram[tf.int(bit) - 1])
- total_offset += tf.select(quarter_index > 0, (half_count[bit] >> (8*(quarter_index-1))) & tf.uint(0xFF), tf.uint(0))
- begin_index = quarter_index * (group_size // 4)
- with tf.loop(begin_index, e) as j:
- total_offset.val += tf.uint(temp[j] == bit)
- keys_out[total_offset] = old_key
-
- if has_values:
- values_out[total_offset] = old_val
-
- tf.region_end('Radix sort iteration')
-
- SortIteration(keys, keys1, values, values1, 2 * iter)
- SortIteration(keys1, keys, values1, values, 2 * iter + 1)
-
- tf.region_end('Radix sort')
-
- if(original_type == tf.float32):
- keys = map_uint_to_float(keys)
-
- if(original_type == tf.int32):
- keys = map_uint_to_int(keys)
-
- if has_values:
- return keys, values
- else:
- return keys
+__all__ = ["HistogramRadixSort", "radix_sort"]
+
+_TYPE_CODES: Dict[str, np.uint32] = {
+ "uint": np.uint32(0),
+ "int": np.uint32(1),
+ "float": np.uint32(2),
+}
+
+
+def _dispatch_groups(work_items: int, threads_per_group: int) -> int:
+ if work_items <= 0:
+ return 0
+ return (work_items + threads_per_group - 1) // threads_per_group
+
+
+def _prepare_keys(keys: np.ndarray) -> Tuple[np.ndarray, np.dtype, str]:
+ array = np.asarray(keys)
+ if array.ndim != 1:
+ raise ValueError("radix_sort expects a 1D array of keys")
+
+ dtype = array.dtype
+ if dtype == np.uint32:
+ return array, dtype, "uint"
+
+ if dtype == np.int32:
+ return array, dtype, "int"
+
+ if dtype == np.float32:
+ return array, dtype, "float"
+
+ raise TypeError(f"Unsupported key dtype {dtype}; expected uint32, int32, or float32")
+
+
+def _prepare_values(values: np.ndarray) -> Tuple[np.ndarray, np.dtype]:
+ array = np.asarray(values)
+ if array.ndim != 1:
+ raise ValueError("radix_sort expects a 1D array of values when provided")
+
+ dtype = array.dtype
+ if dtype not in (np.uint32, np.int32, np.float32):
+ raise TypeError(f"Unsupported value dtype {dtype}; expected uint32, int32, or float32")
+
+ return array, dtype
+
+
+def _load_shader_source(filename: str) -> str:
+ package = f"{__package__}.shaders.radix"
+ try:
+ return resources.files(package).joinpath(filename).read_text(encoding="utf-8") # type: ignore[attr-defined]
+ except AttributeError:
+ return resources.read_text(package, filename)
+
+
+@dataclass(frozen=True)
+class _SorterKey:
+ bits_per_pass: int
+ block_size: int
+ group_size: int
+
+
class HistogramRadixSort:
    """GPU histogram radix sort implemented with Slang + Vulkan.

    Keys (uint32/int32/float32) are mapped to an order-preserving uint32
    encoding, sorted least-significant-digit-first over several passes, then
    mapped back.  An optional value array is permuted alongside the keys.
    """

    def __init__(self, *, bits_per_pass: int = 6, block_size: int = 64, group_size: int = 128) -> None:
        """Validate the configuration and compile all GPU compute programs.

        Args:
            bits_per_pass: Number of key bits consumed per pass (1..8).
            block_size: Number of groups reduced together in the block-level scan.
            group_size: Threads per GPU work group; the shaders require 128.

        Raises:
            ValueError: if any configuration parameter is out of range.
        """
        if bits_per_pass <= 0:
            raise ValueError("bits_per_pass must be positive")
        if bits_per_pass > 8:
            raise ValueError("bits_per_pass must be <= 8 to fit within MAX_HIST_SIZE")
        if group_size != 128:
            raise ValueError("This implementation currently requires group_size == 128")
        if block_size <= 0 or block_size > 1024:
            raise ValueError("block_size must be within (0, 1024]")

        self.bits_per_pass = bits_per_pass
        self.block_size = block_size
        self.group_size = group_size
        # Number of radix buckets examined per pass.
        self.histogram_size = 1 << bits_per_pass

        # Key conversion passes use an 8-byte push constant (element_count, type_code).
        self._map_to_uint_program = tf.createComputeProgramFromSlang(
            "radix_map_to_uint",
            _load_shader_source("map_to_uint.slang"),
            "csMapToUint",
            ro_count=1,
            rw_count=1,
            push_constant_size=8,
        )
        self._map_from_uint_program = tf.createComputeProgramFromSlang(
            "radix_map_from_uint",
            _load_shader_source("map_from_uint.slang"),
            "csMapFromUint",
            ro_count=1,
            rw_count=1,
            push_constant_size=8,
        )

        # All per-pass programs receive the shared 8-uint32 params array (32 bytes).
        self._histogram_program = tf.createComputeProgramFromSlang(
            "radix_histogram",
            _load_shader_source("histogram.slang"),
            "csHistogram",
            ro_count=1,
            rw_count=1,
            push_constant_size=32,
        )
        self._unpack_program = tf.createComputeProgramFromSlang(
            "radix_unpack",
            _load_shader_source("unpack.slang"),
            "csUnpack",
            ro_count=1,
            rw_count=1,
            push_constant_size=32,
        )
        self._prefix_local_program = tf.createComputeProgramFromSlang(
            "radix_prefix_local",
            _load_shader_source("prefix_local.slang"),
            "csPrefixLocal",
            ro_count=1,
            rw_count=2,
            push_constant_size=32,
        )
        self._prefix_blocks_program = tf.createComputeProgramFromSlang(
            "radix_prefix_blocks",
            _load_shader_source("prefix_block.slang"),
            "csPrefixBlocks",
            ro_count=1,
            rw_count=1,
            push_constant_size=32,
        )
        self._prefix_accum_program = tf.createComputeProgramFromSlang(
            "radix_prefix_accum",
            _load_shader_source("prefix_accum.slang"),
            "csPrefixAccumulate",
            ro_count=1,
            rw_count=1,
            push_constant_size=32,
        )
        self._bucket_scan_program = tf.createComputeProgramFromSlang(
            "radix_bucket_scan",
            _load_shader_source("bucket_scan.slang"),
            "csBucketScan",
            ro_count=1,
            rw_count=1,
            push_constant_size=32,
        )
        # The scatter shader needs the bucket count baked in at compile time.
        scatter_source = f"#define TF_HISTOGRAM_SIZE {self.histogram_size}u\n" + _load_shader_source("scatter.slang")
        self._scatter_program = tf.createComputeProgramFromSlang(
            "radix_scatter",
            scatter_source,
            "csScatter",
            ro_count=4,
            rw_count=2,
            # Fix: scatter is dispatched with the same 32-byte params array as the
            # other per-pass programs, so it must declare the push-constant size too
            # (it was previously omitted here, inconsistent with every sibling program).
            push_constant_size=32,
        )

        # 4-byte placeholder bound as the value buffer for key-only sorts.
        self._dummy_values_buffer = tf.createBuffer(1, 4, False)

    def close(self) -> None:
        """Drop references to all GPU programs and the placeholder buffer.

        NOTE(review): a closed instance may still be reachable through the
        module-level sorter cache; calling ``sort`` after ``close`` will fail.
        """
        for attr in (
            "_map_to_uint_program",
            "_map_from_uint_program",
            "_histogram_program",
            "_unpack_program",
            "_prefix_local_program",
            "_prefix_blocks_program",
            "_prefix_accum_program",
            "_bucket_scan_program",
            "_scatter_program",
        ):
            setattr(self, attr, None)
        self._dummy_values_buffer = None

    def sort(
        self,
        keys: np.ndarray,
        values: Optional[np.ndarray] = None,
        *,
        max_bits: int = 32,
    ) -> Tuple[np.ndarray, Optional[np.ndarray]]:
        """Sort ``keys`` (and optionally permute ``values``) on the GPU.

        Args:
            keys: 1D array of uint32, int32, or float32 keys.
            values: Optional 1D uint32/int32/float32 array of the same length.
            max_bits: Upper bound on significant key bits (clamped to 32);
                fewer bits means fewer sorting passes.

        Returns:
            Tuple of (sorted keys, permuted values or ``None``).

        Raises:
            ValueError: if ``values`` does not match ``keys`` in length.
        """
        keys_array, key_dtype, key_kind = _prepare_keys(keys)
        element_count = int(keys_array.shape[0])

        if values is not None:
            values_array, values_dtype = _prepare_values(values)
            if values_array.shape[0] != element_count:
                raise ValueError("values must have the same length as keys")
        else:
            values_array = None
            values_dtype = None

        # Nothing to dispatch for empty input; return copies for consistent ownership.
        if element_count == 0:
            empty_keys = keys_array.copy()
            if values_array is None:
                return empty_keys, None
            return empty_keys, values_array.copy()

        max_bits = int(min(max_bits, 32))
        histogram_size = self.histogram_size
        mask = np.uint32(histogram_size - 1)

        # Dispatch geometry: groups of keys, and blocks of groups for the scan.
        num_groups = max((element_count + self.group_size - 1) // self.group_size, 1)
        block_count = max((num_groups + self.block_size - 1) // self.block_size, 1)
        # The histogram kernel packs four 8-bit counters into each uint32.
        packed_count = (histogram_size + 3) // 4
        passes = max((max_bits + self.bits_per_pass - 1) // self.bits_per_pass, 1)

        # Shared push-constant block; slot 2 (the bit shift) is filled per pass.
        params_array = np.zeros(8, dtype=np.uint32)
        params_array[0] = np.uint32(element_count)
        params_array[1] = np.uint32(histogram_size)
        params_array[3] = mask
        params_array[4] = np.uint32(num_groups)
        params_array[5] = np.uint32(self.block_size)
        params_array[6] = np.uint32(block_count)
        params_array[7] = np.uint32(1 if values_array is not None else 0)

        map_params = np.zeros(2, dtype=np.uint32)
        map_params[0] = np.uint32(element_count)
        map_params[1] = _TYPE_CODES[key_kind]

        # Ping-pong key buffers; buffer 0 starts with the raw keys.
        key_buffers = [tf.createBuffer(max(element_count, 1), 4, False) for _ in range(2)]
        key_buffers[0].setData(keys_array)

        if values_array is not None:
            value_buffers = [tf.createBuffer(max(element_count, 1), 4, False) for _ in range(2)]
            value_buffers[0].setData(values_array)
        else:
            # Key-only sort: bind the shared placeholder so scatter always has
            # a valid value binding (it skips value writes via params slot 7).
            dummy = self._dummy_values_buffer
            value_buffers = [dummy, dummy]

        packed_hist_buffer = tf.createBuffer(max(packed_count * num_groups, 1), 4, False)
        group_hist_buffer = tf.createBuffer(max(histogram_size * num_groups, 1), 4, False)
        prefix_buffer = tf.createBuffer(max(histogram_size * num_groups, 1), 4, False)
        block_totals_buffer = tf.createBuffer(max(histogram_size * block_count, 1), 4, False)
        block_prefix_buffer = tf.createBuffer(max(histogram_size * block_count, 1), 4, False)
        bucket_scan_buffer = tf.createBuffer(max(histogram_size, 1), 4, False)

        map_groups = _dispatch_groups(element_count, self.group_size)
        reduction_group_size = 64
        unpack_groups = _dispatch_groups(histogram_size * num_groups, reduction_group_size)
        prefix_local_groups = _dispatch_groups(histogram_size * block_count, reduction_group_size)
        prefix_block_groups = _dispatch_groups(histogram_size, reduction_group_size)
        prefix_accum_groups = _dispatch_groups(histogram_size * block_count, reduction_group_size)
        bucket_scan_groups = _dispatch_groups(histogram_size, reduction_group_size)
        scatter_groups = num_groups
        histogram_groups = num_groups

        # Convert keys to the order-preserving uint32 encoding (buffer 0 -> 1).
        self._map_to_uint_program.run(
            [key_buffers[0]],
            [key_buffers[1]],
            map_groups,
            map_params,
        )

        key_in = key_buffers[1]
        key_out = key_buffers[0]
        val_in, val_out = value_buffers

        for pass_index in range(passes):
            # Select which digit (bit range) this pass sorts on.
            params_array[2] = np.uint32(pass_index * self.bits_per_pass)

            # 1) Per-group packed digit histograms.
            self._histogram_program.run(
                [key_in],
                [packed_hist_buffer],
                histogram_groups,
                params_array,
            )

            # 2) Expand the packed 8-bit counters into full uint32 counts.
            self._unpack_program.run(
                [packed_hist_buffer],
                [group_hist_buffer],
                unpack_groups,
                params_array,
            )

            # 3-5) Hierarchical prefix scan over the per-group histograms.
            self._prefix_local_program.run(
                [group_hist_buffer],
                [prefix_buffer, block_totals_buffer],
                prefix_local_groups,
                params_array,
            )

            self._prefix_blocks_program.run(
                [block_totals_buffer],
                [block_prefix_buffer],
                prefix_block_groups,
                params_array,
            )

            self._prefix_accum_program.run(
                [block_prefix_buffer],
                [prefix_buffer],
                prefix_accum_groups,
                params_array,
            )

            # 6) Scan bucket totals into each bucket's global base offset.
            self._bucket_scan_program.run(
                [prefix_buffer],
                [bucket_scan_buffer],
                bucket_scan_groups,
                params_array,
            )

            # 7) Scatter keys (and values) to their sorted positions.
            self._scatter_program.run(
                [key_in, val_in, prefix_buffer, bucket_scan_buffer],
                [key_out, val_out],
                scatter_groups,
                params_array,
            )

            # Ping-pong: this pass's output is the next pass's input.
            key_in, key_out = key_out, key_in
            if values_array is not None:
                val_in, val_out = val_out, val_in

        # Convert the sorted uint32 keys back to their original encoding.
        self._map_from_uint_program.run(
            [key_in],
            [key_out],
            map_groups,
            map_params,
        )

        sorted_keys = key_out.getData(key_dtype, element_count)
        if values_array is not None and values_dtype is not None:
            # After the final swap the sorted values live in val_in.
            sorted_values = val_in.getData(values_dtype, element_count)
        else:
            sorted_values = None

        return sorted_keys, sorted_values
+
+
# Module-level cache of sorter instances keyed by configuration, so repeated
# radix_sort calls with the same parameters reuse the compiled GPU programs.
_SORTER_CACHE: Dict[_SorterKey, HistogramRadixSort] = {}
+
+
def _get_sorter(bits_per_pass: int, block_size: int, group_size: int) -> HistogramRadixSort:
    """Return the cached sorter for this configuration, creating it on first use."""
    cache_key = _SorterKey(bits_per_pass, block_size, group_size)
    if cache_key not in _SORTER_CACHE:
        _SORTER_CACHE[cache_key] = HistogramRadixSort(
            bits_per_pass=bits_per_pass,
            block_size=block_size,
            group_size=group_size,
        )
    return _SORTER_CACHE[cache_key]
+
+
def radix_sort(
    keys: np.ndarray,
    values: Optional[np.ndarray] = None,
    *,
    bits_per_pass: int = 6,
    max_bits: int = 32,
    block_size: int = 64,
    group_size: int = 128,
):
    """Run the GPU histogram radix sort on the provided keys (and optional values).

    Args:
        keys: 1D array of uint32, int32, or float32 keys to sort.
        values: Optional 1D array of the same length, permuted with the keys.
        bits_per_pass: Number of key bits consumed per sorting pass (1..8).
        max_bits: Upper bound on significant key bits; fewer bits means fewer passes.
        block_size: Number of groups reduced per block in the prefix scan.
        group_size: GPU work-group size (must currently be 128).

    Returns the sorted keys, and when ``values`` is provided also returns the permuted values.
    """

    sorter = _get_sorter(bits_per_pass, block_size, group_size)
    sorted_keys, sorted_values = sorter.sort(keys, values, max_bits=max_bits)
    if values is None:
        return sorted_keys
    return sorted_keys, sorted_values
+# def radix(keys, values = None, bits_per_pass = 6, max_bits = 32):
+# def prefix_sum_grouped(A, axis = -1):
+# axis = len(A.shape) + axis if axis < 0 else axis
+# group_size = 64
+# grouped = tf.split_dim(A, group_size, axis)
+# group_scan = tf.prefix_sum(tf.sum(grouped, axis = axis + 1), axis = axis)
+# ids = grouped.indices
+# gid, eid = ids[axis], ids[axis + 1]
+# ids = [ids[i] for i in range(len(ids)) if i != axis + 1]
+# ids[axis] = gid - 1
+# group_scan = tf.prefix_sum(grouped + tf.select((gid == 0) | (eid != 0), tf.uint(0), group_scan[tuple(ids)]), axis = axis + 1)
+# full_scan = tf.merge_dim(group_scan, target_size = A.shape[axis], axis = axis + 1)
+# return full_scan
+#
+# sign_bit = ~tf.uint(0x7FFFFFFF)
+#
+# def map_float_to_uint(x):
+# # Convert float to uint representation
+# ux = tf.asuint(x)
+# # Compute mask
+# mask = tf.select((ux >> 31) == 1, ~tf.uint(0), sign_bit)
+# # Apply XOR
+# return ux ^ mask
+#
+# def map_uint_to_float(x):
+# # Compute mask
+# mask = tf.select((x >> 31) == 0, ~tf.uint(0), sign_bit)
+# # Apply XOR and convert back to float
+# return tf.asfloat(x ^ mask)
+#
+# def map_int_to_uint(x):
+# return tf.asuint(x) ^ sign_bit
+#
+# def map_uint_to_int(x):
+# return tf.asint(x ^ sign_bit)
+#
+# tf.region_begin('Radix sort')
+#
+# has_values = values is not None
+#
+# keys = tf.copy(keys)
+# if has_values:
+# values = tf.copy(values)
+#
+# original_type = keys.type
+# if(original_type == tf.float32):
+# keys = map_float_to_uint(keys)
+#
+# if(original_type == tf.int32):
+# keys = map_int_to_uint(keys)
+#
+# iters = (max_bits + bits_per_pass - 1) // bits_per_pass
+# group_size = 128
+# histogram_size = 2 ** bits_per_pass
+#
+# def GetBits(A, i):
+# return (A >> (i * bits_per_pass)) & tf.uint(histogram_size - 1)
+#
+# keys1 = tf.buffer(keys.shape, keys.type)
+# values1 = None
+#
+# if has_values:
+# values1 = tf.buffer(values.shape, values.type)
+#
+# with tf.loop(iters // 2) as iter:
+# def SortIteration(keys_in, keys_out, values_in, values_out, iter):
+# tf.region_begin('Radix sort iteration')
+# grouped = tf.split_dim(GetBits(keys_in, iter), group_size)
+#
+# # Do a packed histogram, since we sum 128 elements at a time, we can pack 4 values into a single uint32
+# g, e, i = tf.indices([grouped.shape[0], grouped.shape[1], tf.int(histogram_size/4)])
+# this_key = grouped[g, e]
+# packed_is_bit = (tf.uint(this_key == tf.uint(4*i))) + (tf.uint(this_key == tf.uint(4*i+1)) << 8) + (tf.uint(this_key == tf.uint(4*i+2)) << 16) + (tf.uint(this_key == tf.uint(4*i+3)) << 24)
+# packed_is_bit = tf.select((g*group_size + e) < keys_in.shape[0], packed_is_bit, tf.uint(0))
+# group_histogram_packed = tf.sum(packed_is_bit, axis = 1)
+#
+# g, i = tf.indices([grouped.shape[0], histogram_size])
+# group_histogram = tf.uint((group_histogram_packed[g, i / 4] >> (8*(i % 4))) & tf.uint(0xFF))
+#
+# group_histogram_scan = prefix_sum_grouped(group_histogram, axis = 0)
+# i, = tf.indices([histogram_size])
+# total_bit_histogram = tf.prefix_sum(group_histogram_scan[group_histogram_scan.shape[0] - 1, i])
+#
+# with tf.kernel(grouped.shape, group_size=[group_size]) as (g, e):
+# if(tf.current_backend() == tf.cpu): #dont use group barriers on CPU - doesn't work
+# element = g * group_size + e
+# with tf.if_cond(element < keys_in.shape[0]):
+# old_key = keys_in[element]
+# old_val = values_in[element]
+# bit = GetBits(old_key, iter)
+# total_offset = tf.select(g == 0, tf.uint(0), group_histogram_scan[g - 1, bit]) + tf.select(bit == tf.uint(0), tf.uint(0), total_bit_histogram[bit - tf.uint(1)])
+# with tf.loop(e) as j:
+# total_offset.val += tf.uint(grouped[g, j] == bit)
+# keys_out[total_offset] = old_key
+# values_out[total_offset] = old_val
+# else:
+# temp = tf.group_buffer(group_size, tf.uint32)
+# half_count = tf.group_buffer(histogram_size, tf.uint32)
+# gtid = g.block_thread_index(0)
+#
+# #initialize counters
+# for i in range((histogram_size + group_size - 1) // group_size):
+# index = gtid + i * group_size
+# with tf.if_cond(index < histogram_size):
+# half_count[index] = 0
+# tf.group_barrier()
+#
+# element = g * group_size + e
+# with tf.if_cond(element < keys_in.shape[0]):
+# old_key = keys_in[element]
+# bit = GetBits(old_key, iter)
+# temp[gtid] = bit
+#
+# #count number of bits set in previous sub groups
+# quarter_index = e / (group_size // 4)
+# with tf.if_cond(quarter_index < 3):
+# tf.scatterAdd(half_count[bit], tf.uint(quarter_index < 1) | (tf.uint(quarter_index < 2) << 8) | (tf.uint(quarter_index < 3) << 16))
+#
+# tf.group_barrier()
+#
+# if has_values:
+# old_val = values_in[element]
+#
+# total_offset = tf.select(g == 0, tf.uint(0), group_histogram_scan[g - 1, tf.int(bit)]) + tf.select(tf.int(bit) == 0, tf.uint(0), total_bit_histogram[tf.int(bit) - 1])
+# total_offset += tf.select(quarter_index > 0, (half_count[bit] >> (8*(quarter_index-1))) & tf.uint(0xFF), tf.uint(0))
+# begin_index = quarter_index * (group_size // 4)
+# with tf.loop(begin_index, e) as j:
+# total_offset.val += tf.uint(temp[j] == bit)
+# keys_out[total_offset] = old_key
+#
+# if has_values:
+# values_out[total_offset] = old_val
+#
+# tf.region_end('Radix sort iteration')
+#
+# SortIteration(keys, keys1, values, values1, 2 * iter)
+# SortIteration(keys1, keys, values1, values, 2 * iter + 1)
+#
+# tf.region_end('Radix sort')
+#
+# if(original_type == tf.float32):
+# keys = map_uint_to_float(keys)
+#
+# if(original_type == tf.int32):
+# keys = map_uint_to_int(keys)
+#
+# if has_values:
+# return keys, values
+# else:
+# return keys
diff --git a/Python/pyproject.toml b/Python/pyproject.toml
index 39eb00ff..55af72a3 100644
--- a/Python/pyproject.toml
+++ b/Python/pyproject.toml
@@ -8,7 +8,7 @@ build-backend = "scikit_build_core.build"
[project]
name = "TensorFrost"
-version = "0.7.4"
+version = "2.0.0.dev0"
description = "A static optimizing tensor compiler with a Python frontend"
authors = [{name = "Mykhailo Moroz", email = "michael08840884@gmail.com"}]
requires-python = ">=3.7"
diff --git a/README.md b/README.md
index dd9a14a3..e7b6787e 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@ pip install tensorfrost
## From source
-You need to have CMake installed to build the library.
+You need to have CMake and Vulkan SDK installed to build the library.
First clone the repository:
```bash
@@ -689,11 +689,12 @@ You can also specify the clipping type for the gradients, by default the value o
For debugging convenience there are 2 function types that you can call inside a tensor program:
```python
+tf.renderdoc_is_available()
tf.renderdoc_start_capture()
tf.renderdoc_end_capture()
```
-These functions will start and end a RenderDoc capture, only if python is started from the RenderDoc GUI. This is useful for debugging the OpenGL backend, as it allows you to inspect compiled kernel execution, its code and buffers.
+These functions will start and end a RenderDoc capture, only if python is started from the RenderDoc GUI. Call `tf.renderdoc_is_available()` first to check whether RenderDoc is attached so you can skip capture logic when it isn't. This is useful for debugging the OpenGL backend, as it allows you to inspect compiled kernel execution, its code and buffers.
```python
tf.region_begin('Region name')
diff --git a/TensorFrost/Backend/Backend.cpp b/TensorFrost/Backend/Backend.cpp
deleted file mode 100644
index 8163a39e..00000000
--- a/TensorFrost/Backend/Backend.cpp
+++ /dev/null
@@ -1,187 +0,0 @@
-#include "Backend.h"
-
-namespace TensorFrost {
-
-BackendType current_backend = BackendType::NotInitialized;
-CodeGenLang current_kernel_lang = CodeGenLang::CPP;
-CodeGenLang current_main_lang = CodeGenLang::CPP;
-bool strip_debug_names = false;
-
-void InitializeBackend(BackendType backendType, const string& compilerOptions, CodeGenLang kernelType) {
- if (current_backend != BackendType::NotInitialized) {
- cout << "Warning: Backend already initialized, stopping current backend\n" << endl;
-
- switch (current_backend) {
- case BackendType::CPU:
- break;
- case BackendType::Vulkan:
- break;
- case BackendType::OpenGL:
- StopOpenGL();
- break;
- default:
- throw std::runtime_error("Backend not implemented");
- }
- }
-
- if (!compilerOptions.empty()) {
- kernelCompileOptions = compilerOptions;
- } else if(backendType != BackendType::CPU) {
- kernelCompileOptions = ""; //no need for cpu optimizations on other backends
- } else {
-#ifdef _WIN32
- kernelCompileOptions = "/O2 /fp:fast /openmp";
-#else
- kernelCompileOptions = "-O3 -ffast-math -fopenmp";
-#endif
- }
-
-#ifdef _DEBUG
-#ifdef _WIN32
- kernelCompileOptions = "/Zi";
-#else
- kernelCompileOptions = "-g";
-#endif
-#endif
-
- current_backend = backendType;
-
- switch (backendType) {
- case BackendType::CPU:
- case BackendType::CodeGen:
- current_kernel_lang = CodeGenLang::CPP;
- global_memory_manager = new CpuMemoryManager();
- global_kernel_manager = new CpuKernelManager();
- break;
- case BackendType::Vulkan:
- throw std::runtime_error("Vulkan backend not implemented yet");
- current_kernel_lang = CodeGenLang::GLSL;
- break;
- case BackendType::OpenGL:
- StartOpenGL();
- current_kernel_lang = CodeGenLang::GLSL;
- global_memory_manager = new OpenGLMemoryManager();
- global_kernel_manager = new OpenGLKernelManager();
- break;
- default:
- throw std::runtime_error("Backend not implemented");
- }
-
- if (kernelType != CodeGenLang::None) {
- current_kernel_lang = kernelType;
- }
-}
-
-void CompileKernels(Program* program) {
- auto start_time = chrono::high_resolution_clock::now();
- for(auto& kernel : program->kernels_) {
- switch (current_backend) {
- case BackendType::CPU:
- //already in the host program
- break;
- case BackendType::Vulkan:
- throw std::runtime_error("Vulkan backend not implemented yet");
- case BackendType::OpenGL:
- ((OpenGLKernelManager*)global_kernel_manager)->CompileKernel(&kernel);
- break;
- default:
- throw std::runtime_error("Backend not implemented");
- }
- }
- auto end_time = chrono::high_resolution_clock::now();
- float milliseconds = chrono::duration(end_time - start_time).count();
- program->shader_compile_time = milliseconds;
-}
-
-TFTensor Allocator(const char* name, const size_t* a, size_t dim, TFDataFormat format, void* data) {
- try {
- vector shape(a, a + dim);
- return *global_memory_manager->AllocateTensor(shape, format, name);
- } catch (const std::exception& e) {
- size_t size = 1;
- for (size_t i = 0; i < dim; i++) {
- size *= a[i];
- }
- throw std::runtime_error("Error allocating tensor " + string(name) + ": " + e.what() + ", requested size: " + to_string(size));
- }
-}
-
-void Deallocator(TFTensor a, void* data) {
- global_memory_manager->DeallocateTensor(a);
-}
-
-uint Readback(TFTensor a, size_t index, void* data) {
- return global_memory_manager->ReadbackValue(&a, index);
-}
-
-void Writeback(TFTensor a, size_t index, uint32_t value, void* data) {
- global_memory_manager->WritebackValue(&a, index, value);
-}
-
-void Dispatch(TFDispatchInfo info, void* data) {
- global_kernel_manager->DispatchKernel(info);
-}
-
-void Region(const char* name, bool begin, void* data) {
- if (current_backend == BackendType::OpenGL) {
- if (begin) {
- StartDebugRegion(name);
- } else {
- EndDebugRegion();
- }
- }
-}
-
-//#define PROFILE_EXECUTION
-
-vector ExecuteProgram(
- Program* program, vector inputs) {
-
- if (current_backend == BackendType::CodeGen) {
- throw std::runtime_error("Cannot execute program with code generation backend");
- }
-
- int memory_input_count = (int)program->ir_->input_memory_map.size();
-
- if (memory_input_count != inputs.size()) {
- throw std::runtime_error(
- "Invalid number of inputs for TensorProgram. Expected " +
- to_string(memory_input_count) + ", got " + to_string(inputs.size()));
- }
-
- vector input_tensors;
- for (int i = 0; i < memory_input_count; i++) {
- input_tensors.push_back(*inputs[i]);
- }
-
- unordered_map output_memory_map = program->ir_->output_memory_map;
- int output_count = (int)output_memory_map.size();
-
- TFTensor* in = input_tensors.data();
- TFTensor* out = new TFTensor[output_count];
-
-#ifdef PROFILE_EXECUTION
- auto start = chrono::high_resolution_clock::now();
-#endif
- try {
- program->execute_callback(in, out, {Allocator, Deallocator, Readback, Writeback, Dispatch, Region, nullptr});
- } catch (const std::exception& e) {
- throw std::runtime_error("Error executing program " + program->program_name + ": " + e.what());
- }
-
-#ifdef PROFILE_EXECUTION
- Finish();
- auto end = chrono::high_resolution_clock::now();
- float milliseconds = chrono::duration(end - start).count();
- program->last_execution_time = milliseconds;
-#endif
-
- vector outputs = vector(output_count);
- for (int i = 0; i < output_count; i++) {
- outputs[i] = &out[i];
- }
-
- return outputs;
-}
-
-} // namespace TensorFrost
\ No newline at end of file
diff --git a/TensorFrost/Backend/Backend.h b/TensorFrost/Backend/Backend.h
deleted file mode 100644
index dd4151f9..00000000
--- a/TensorFrost/Backend/Backend.h
+++ /dev/null
@@ -1,49 +0,0 @@
-#pragma once
-
-#include
-#include
-#include
-#include