diff --git a/.gitignore b/.gitignore
index 81fa90ff..ddb1ba11 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,7 @@
 # Installation downloads
 install
-
+.vscode
 # T2S generated
 tmp
 *.out
 tmp.txt
@@ -14,7 +14,7 @@ profile.mon
 *-interface.*
 *_genx.cpp
 temp*
-*.png
+.png
 *.o
 *.isa
 signed*
diff --git a/Halide/Makefile b/Halide/Makefile
index fb7a5618..1f66efac 100644
--- a/Halide/Makefile
+++ b/Halide/Makefile
@@ -129,6 +129,7 @@ WITH_PTX ?= $(findstring nvptx, $(LLVM_COMPONENTS))
 WITH_AMDGPU ?= $(findstring amdgpu, $(LLVM_COMPONENTS))
 WITH_WEBASSEMBLY ?= $(findstring webassembly, $(LLVM_COMPONENTS))
 WITH_OPENCL ?= not-empty
+WITH_ONEAPI ?= not-empty
 WITH_METAL ?= not-empty
 WITH_OPENGL ?= not-empty
 WITH_D3D12 ?= not-empty
@@ -193,6 +194,9 @@ AMDGPU_LLVM_CONFIG_LIB=$(if $(WITH_AMDGPU), amdgpu, )
 OPENCL_CXX_FLAGS=$(if $(WITH_OPENCL), -DWITH_OPENCL, )
 OPENCL_LLVM_CONFIG_LIB=$(if $(WITH_OPENCL), , )
 
+ONEAPI_CXX_FLAGS=$(if $(WITH_ONEAPI), -DWITH_ONEAPI, )
+ONEAPI_LLVM_CONFIG_LIB=$(if $(WITH_ONEAPI), , )
+
 METAL_CXX_FLAGS=$(if $(WITH_METAL), -DWITH_METAL, )
 METAL_LLVM_CONFIG_LIB=$(if $(WITH_METAL), , )
 
@@ -249,6 +253,7 @@ CXX_FLAGS += $(HEXAGON_CXX_FLAGS)
 CXX_FLAGS += $(AARCH64_CXX_FLAGS)
 CXX_FLAGS += $(X86_CXX_FLAGS)
 CXX_FLAGS += $(OPENCL_CXX_FLAGS)
+CXX_FLAGS += $(ONEAPI_CXX_FLAGS)
 CXX_FLAGS += $(METAL_CXX_FLAGS)
 CXX_FLAGS += $(OPENGL_CXX_FLAGS)
 CXX_FLAGS += $(D3D12_CXX_FLAGS)
@@ -281,6 +286,7 @@ LLVM_STATIC_LIBFILES = \
 	$(X86_LLVM_CONFIG_LIB) \
 	$(ARM_LLVM_CONFIG_LIB) \
 	$(OPENCL_LLVM_CONFIG_LIB) \
+	$(ONEAPI_LLVM_CONFIG_LIB) \
 	$(METAL_LLVM_CONFIG_LIB) \
 	$(PTX_LLVM_CONFIG_LIB) \
 	$(AARCH64_LLVM_CONFIG_LIB) \
@@ -474,6 +480,7 @@ SOURCE_FILES = \
 	CanonicalizeGPUVars.cpp \
 	Closure.cpp \
 	CodeGen_ARM.cpp \
+	CodeGen_DPC_Dev.cpp \
 	CodeGen_C.cpp \
 	CodeGen_D3D12Compute_Dev.cpp \
 	CodeGen_GPU_Dev.cpp \
@@ -647,6 +654,7 @@ HEADER_FILES = \
 	CanonicalizeGPUVars.h \
 	Closure.h \
 	CodeGen_ARM.h \
+	CodeGen_DPC_Dev.h \
 	CodeGen_C.h \
 	CodeGen_D3D12Compute_Dev.h \
 	CodeGen_GPU_Dev.h \
diff --git a/Halide/src/CodeGen_CM_Dev.cpp b/Halide/src/CodeGen_CM_Dev.cpp
index e3adb2e3..bc727baf 100644
--- a/Halide/src/CodeGen_CM_Dev.cpp
+++ b/Halide/src/CodeGen_CM_Dev.cpp
@@ -350,8 +350,10 @@ void CodeGen_CM_Dev::CodeGen_CM_C::print_media_block_rw(Type t, vector<Expr> args, bool is_write) {
     for (int j = 0; j < cols; j += max_cols_at_once) {
         int cols_at_once = j + max_cols_at_once <= cols ? max_cols_at_once : cols - j;
 
+        // Replace the buffer name with the one specified in stensors
+        string name = is_write ? args[7].as<StringImm>()->value : print_expr(args[0]);
         stream << get_indent() << (is_write ? "write(" : "read(");
"write(" : "read("); - stream << print_name(print_expr(args[0])) << ", "; + stream << print_name(name) << ", "; stream << print_expr(args[1] * bytes) << ", "; stream << print_expr(args[2] + i) << ", "; auto ramp = args[4].as(); @@ -683,6 +685,28 @@ void CodeGen_CM_Dev::add_kernel(Stmt s, src_stream.str(str); } +class FindRefName : public IRVisitor +{ + const string &buf_name; +public: + using IRVisitor::visit; + string ref_name; + + void visit(const Call *op) override { + if (op->is_intrinsic(Call::cm_store_2d)) { + internal_assert(op->args[0].as()); + auto &name = op->args[0].as()->name; + if (name == buf_name && op->args.size() == 8) { + internal_assert(op->args[7].as()); + ref_name = op->args[7].as()->value; + } + } + } + + FindRefName(const string &_b) + : buf_name(_b) {} +}; + void CodeGen_CM_Dev::CodeGen_CM_C::add_kernel(Stmt s, const string &name, const vector &args) { @@ -692,7 +716,14 @@ void CodeGen_CM_Dev::CodeGen_CM_C::add_kernel(Stmt s, stream << "extern \"C\" _GENX_MAIN_ void " << name << "(\n"; for (size_t i = 0; i < args.size(); i++) { if (args[i].is_buffer) { - stream << "SurfaceIndex " << print_name(args[i].name) + string name = args[i].name; + // Trick: replace the buffer name with the one specified in stensor + FindRefName frn(name); + s.accept(&frn); + if (!frn.ref_name.empty()) { + name = frn.ref_name; + } + stream << "SurfaceIndex " << print_name(name) << " [[type(\"image2d_t " << print_type(args[i].type) << "\")]]"; Allocation alloc; alloc.type = args[i].type; @@ -760,4 +791,4 @@ vector CodeGen_CM_Dev::compile_to_src() { } } // namespace Internal -} // namespace Halide +} // namespace Halide \ No newline at end of file diff --git a/Halide/src/CodeGen_DPC_Dev.cpp b/Halide/src/CodeGen_DPC_Dev.cpp new file mode 100644 index 00000000..3d855820 --- /dev/null +++ b/Halide/src/CodeGen_DPC_Dev.cpp @@ -0,0 +1,890 @@ +#include "CodeGen_DPC_Dev.h" +#include "IROperator.h" +#include "Simplify.h" +#include "Substitute.h" +#include "fstream" +#include +#include +#include +#include +#include +namespace Halide { +namespace Internal { + +using std::ostringstream; +using std::stack; +using std::string; +using std::to_string; +using std::vector; +static bool is_broadcast; +stack range_info_Block; +stack range_info_thread; +vector function_var; +string global_declaration; +CodeGen_DPC_Dev::CodeGen_DPC_Dev(Target t) + : clc(src_stream, t) { +} + +string CodeGen_DPC_Dev::CodeGen_DPC_C::print_type(Type type, AppendSpaceIfNeeded space) { + ostringstream oss; + if (type.is_float()) { + switch (type.bits()) { + case 16: + oss << "half"; + break; + case 32: + oss << "float"; + break; + case 64: + oss << "double"; + break; + default: + user_error << "Can't represent a float with this many bits in DPC C: " << type << "\n"; + } + } else { + if (type.is_uint()) { + oss << "unsigned "; + } + switch (type.bits()) { + case 1: + oss << "char"; + break; + case 8: + oss << "char"; + break; + case 16: + oss << "short"; + break; + case 32: + oss << "int"; + break; + case 64: + oss << "long long"; + break; + break; + default: + user_error << "Can't represent an integer with this many bits in DPC C: " << type << "\n"; + } + } + if (space == AppendSpace) { + oss << " "; + } + return oss.str(); +} + +namespace { +string simt_intrinsic(const string &name) { + if (ends_with(name, ".__thread_id_x")) { + return "ndi.get_local_id(0)"; + } else if (ends_with(name, ".__thread_id_y")) { + return "ndi.get_local_id(1)"; + } else if (ends_with(name, ".__thread_id_z")) { + return "ndi.get_local_id(2)"; + } else if 
(ends_with(name, ".__block_id_x")) { + return "ndi.get_group(0)"; + } else if (ends_with(name, ".__block_id_y")) { + return "ndi.get_group(1)"; + } else if (ends_with(name, ".__block_id_z")) { + return "ndi.get_group(2)"; + } + internal_error << "simt_intrinsic called on bad variable name: " << name << "\n"; + return ""; +} +} // namespace + +string CodeGen_DPC_Dev::CodeGen_DPC_C::print_vector_op(const string &tmpl, char prefix) { + auto cached = cache.find(tmpl); + if (cached == cache.end()) { + string gen; + id = unique_name(prefix); + gen = replace_all(tmpl, "$ID$", id); + stream << gen; + cache[tmpl] = id; + } else { + id = cached->second; + } + + return id; +} + +string CodeGen_DPC_Dev::CodeGen_DPC_C::get_vector_declaration(Type t, + const string &id) { + ostringstream os_tmpl; + os_tmpl << get_indent() << "simd<" + << print_type(t) << ", " + << t.lanes() << "> " + << id << ";\n"; + return os_tmpl.str(); +} + +string CodeGen_DPC_Dev::CodeGen_DPC_C::get_vector_declaration(Type t, + const string &id, + const string &init) { + ostringstream os_tmpl; + os_tmpl << get_indent() << "simd<" + << print_type(t) << ", " + << t.lanes() << "> " + << id << "(" + << init << ");\n"; + return os_tmpl.str(); +} + +string CodeGen_DPC_Dev::CodeGen_DPC_C::get_vector_init_tmpl(Type t, + const string &base, + const string &stride) { + ostringstream os_tmpl; + os_tmpl << get_vector_declaration(t, "$ID"); + for (int i = 0; i < t.lanes(); i++) { + os_tmpl << get_indent() << "$ID$[" << to_string(i) << "]=(" << base << "+"; + while (i) { + os_tmpl << stride << "+"; + i--; + } + os_tmpl << "0);\n"; + } + return os_tmpl.str(); +} +string CodeGen_DPC_Dev::CodeGen_DPC_C::get_vector_read_tmpl(Type t, + const string &name, + const string &global_offset, + const string &element_offset, + bool with_decl = false) { + ostringstream os_tmpl; + if (with_decl) + os_tmpl << get_vector_declaration(t, "$ID$"); + os_tmpl << get_indent() << "$ID$ = gather<" << print_type(t) << "," << t.lanes() << ">(" << print_name(name) << "," << element_offset << "," << global_offset << ")"; + return os_tmpl.str(); +} +string CodeGen_DPC_Dev::CodeGen_DPC_C::get_vector_read_tmpl(Type t, + const string &name, + const string &offset, + bool dword_aligned = false) { + ostringstream os_tmpl; + os_tmpl << get_vector_declaration(t, "$ID$"); + os_tmpl << get_indent() << "$ID$=gather<" << print_type(t) << ", " << t.lanes() << ">(" << print_name(name) << "," << offset << ");"; + return os_tmpl.str(); +} + +string CodeGen_DPC_Dev::CodeGen_DPC_C::get_slm_read_tmpl(Type t, + const string &name, + const string &global_offset, + const string &element_offset) { + ostringstream os_tmpl; + os_tmpl << get_vector_declaration(t, "$ID$"); + os_tmpl << "$ID$ = " << "slm_gather<" << print_type(t) << ", " << t.lanes() << ">(" << element_offset << ");\n"; + return os_tmpl.str(); +} + +string CodeGen_DPC_Dev::CodeGen_DPC_C::get_vector_write_tmpl(Type t, + const string &name, + const string &global_offset, + const string &element_offset) { + ostringstream os_tmpl; + os_tmpl << get_indent() << "scatter<" <(" + << print_name(name) << ", " + << element_offset << ", " + << "$ID$" << "," + << global_offset << ");\n"; + return os_tmpl.str(); +} + +string CodeGen_DPC_Dev::CodeGen_DPC_C::get_vector_write_tmpl(Type t, + const string &name, + const string &offset) { + ostringstream os_tmpl; + os_tmpl << get_indent() << "scatter<" <(" + << print_name(name) << ", " + << offset << ", " + << "$ID$);\n"; + return os_tmpl.str(); +} + +string 
+
+string CodeGen_DPC_Dev::CodeGen_DPC_C::get_slm_write_tmpl(Type t,
+                                                          const string &name,
+                                                          const string &global_offset,
+                                                          const string &element_offset) {
+    ostringstream os_tmpl;
+    os_tmpl << get_indent() << "slm_scatter<" << print_type(t) << ", " << t.lanes() << ">("
+            << element_offset << ", "
+            << "$ID$);\n";
+    return os_tmpl.str();
+}
+
+string CodeGen_DPC_Dev::CodeGen_DPC_C::get_vector_select(const string &name,
+                                                         const string &base,
+                                                         int size,
+                                                         int stride) {
+    ostringstream os_tmpl;
+    os_tmpl << print_name(name)
+            << ".select<" << size << ", " << stride << ">"
+            << "(" << base << ")";
+    return os_tmpl.str();
+}
+
+string CodeGen_DPC_Dev::CodeGen_DPC_C::get_vector_iselect(const string &name,
+                                                          const string &idx) {
+    ostringstream os_tmpl;
+    os_tmpl << print_name(name)
+            << ".iselect(" << idx << ")";
+    return os_tmpl.str();
+}
+
+string CodeGen_DPC_Dev::CodeGen_DPC_C::print_assignment(Type t, const std::string &rhs) {
+    auto cached = cache.find(rhs);
+    if (cached != cache.end())
+        return cached->second;
+
+    id = unique_name('_');
+    if (t.lanes() > 1) {
+        stream << get_vector_declaration(t, id);
+        stream << get_indent() << id << " = " << rhs << ";\n";
+    } else {
+        stream << get_indent() << print_type(t, AppendSpace)
+               << id << " = " << rhs << ";\n";
+    }
+    cache[rhs] = id;
+    return id;
+}
+
+string CodeGen_DPC_Dev::CodeGen_DPC_C::get_vector_element(const string &name,
+                                                          const string &index) {
+    ostringstream rhs;
+    rhs << print_name(name);
+    rhs << "[" << index << "]";
+    return rhs.str();
+}
+
+void CodeGen_DPC_Dev::CodeGen_DPC_C::visit(const Min *op) {
+    internal_assert(op->a.type() == op->b.type());
+    id = "min<" + print_type(op->a.type()) + ">(" + print_expr(op->a) + ", " + print_expr(op->b) + ")";
+}
+
+void CodeGen_DPC_Dev::CodeGen_DPC_C::visit(const Max *op) {
+    internal_assert(op->a.type() == op->b.type());
+    id = "max<" + print_type(op->a.type()) + ">(" + print_expr(op->a) + ", " + print_expr(op->b) + ")";
+}
+
+void CodeGen_DPC_Dev::CodeGen_DPC_C::visit(const Mod *op) {
+    visit_binop(op->type, op->a, op->b, "%");
+}
+
+void CodeGen_DPC_Dev::CodeGen_DPC_C::visit(const Div *op) {
+    visit_binop(op->type, op->a, op->b, "/");
+}
+
+void CodeGen_DPC_Dev::CodeGen_DPC_C::visit(const FloatImm *op) {
+    id = to_string(op->value);
+}
+
+void CodeGen_DPC_Dev::CodeGen_DPC_C::visit_binop(Type t, Expr a, Expr b, const char *op) {
+    string sa = print_expr(a);
+    string sb = print_expr(b);
+    id = "(" + sa + op + sb + ")";
+}
+
+void CodeGen_DPC_Dev::CodeGen_DPC_C::visit(const For *loop) {
+    user_assert(loop->for_type != ForType::GPULane)
+        << "The CM backend does not support the gpu_lanes() scheduling directive.";
+    user_assert(loop->for_type != ForType::Parallel)
+        << "Cannot use parallel loops inside CM kernel\n";
+
+    if (is_gpu_var(loop->name)) {
+        internal_assert((loop->for_type == ForType::GPUBlock) ||
+                        (loop->for_type == ForType::GPUThread))
+            << "kernel loop must be either gpu block or gpu thread\n";
+        internal_assert(is_zero(loop->min));
+        // Record the loop extent so compile_to_src can build the nd_range.
+        stream << get_indent() << print_type(Int(32)) << " "
+               << print_name(loop->name) << " = "
+               << simt_intrinsic(loop->name) << ";\n";
+        if (loop->for_type == ForType::GPUBlock) {
+            range_info_Block.push(print_expr(loop->extent));
+        } else if (loop->for_type == ForType::GPUThread) {
+            range_info_thread.push(print_expr(loop->extent));
+        }
+        loop->body.accept(this);
+        return;
+    }
+    if (loop->for_type == ForType::PragmaUnrolled)
+        stream << get_indent() << "#pragma unroll\n";
+    CodeGen_C::visit(loop);
+}
+
+void CodeGen_DPC_Dev::CodeGen_DPC_C::visit(const Ramp *op) {
+    string base = print_expr(op->base);
+    string stride = print_expr(op->stride);
+
+    Type t = op->type.with_lanes(op->lanes);
+    string tmpl = get_vector_init_tmpl(t, base, stride);
+    print_vector_op(tmpl);
+}
+
+void CodeGen_DPC_Dev::CodeGen_DPC_C::visit(const Broadcast *op) {
+    // Since CM supports operations on scalars and vectors, just treat a Broadcast node as a scalar
+    is_broadcast = true;
+    op->value.accept(this);
+}
+
+void CodeGen_DPC_Dev::CodeGen_DPC_C::visit(const Variable *op) {
+    if (op->type.is_handle()) {
+        id = "_" + op->name;
+        return;
+    }
+    CodeGen_C::visit(op);
+}
+
+void CodeGen_DPC_Dev::CodeGen_DPC_C::visit(const Evaluate *op) {
+    print_expr(op->value);
+}
+
+void CodeGen_DPC_Dev::CodeGen_DPC_C::print_media_block_rw(Type t, vector<Expr> args, bool is_write) {
+    int cols = args[5].as<IntImm>()->value;
+    int rows = args[6].as<IntImm>()->value;
+    int bytes = t.bits() / 8;
+    int max_cols_at_once = (cols < 8 ? cols : 8);
+    int max_rows_at_once = 256 / (max_cols_at_once * bytes);
+
+    for (int i = 0; i < rows; i += max_rows_at_once) {
+        int rows_at_once = i + max_rows_at_once <= rows ? max_rows_at_once : rows - i;
+        for (int j = 0; j < cols; j += max_cols_at_once) {
+            int cols_at_once = j + max_cols_at_once <= cols ? max_cols_at_once : cols - j;
+            auto ramp = args[4].as<Ramp>();
+            // Replace the buffer name with the one specified in stensors
+            string name = is_write ? args[7].as<StringImm>()->value : print_expr(args[0]);
+            if (!is_write) {
+                stream << get_indent() << print_expr(args[3]) << ".select<"
+                       << ramp->lanes << ", " << ramp->stride << ">(" << ramp->base << ")"
+                       << ".bit_cast_view<" << print_type(t) << ", " << rows << ", " << cols << ">()"
+                       << ".select<" << rows_at_once << ", 1, " << cols_at_once << ", 1>("
+                       << i << ", " << j << ")=";
+                stream << "media_block_load<" << print_type(t) << ", " << rows_at_once << ", " << cols_at_once << ">(";
+                stream << print_name(name) << ", ";
+                stream << print_expr(args[1] * bytes) << ", ";
+                stream << print_expr(args[2] + i) << ");\n";
+            } else {
+                stream << get_indent() << "media_block_store<" << print_type(t) << ", " << rows_at_once << ", " << cols_at_once << ">(";
+                stream << print_name(name) << ", ";
+                stream << print_expr(args[1] * bytes) << ", ";
+                stream << print_expr(args[2] + i) << ", ";
+                stream << print_expr(args[3]) << ".select<"
+                       << ramp->lanes << ", " << ramp->stride << ">(" << ramp->base << ")"
+                       << ".bit_cast_view<" << print_type(t) << ", " << rows << ", " << cols << ">()"
+                       << ".select<" << rows_at_once << ", 1, " << cols_at_once << ", 1>("
+                       << i << ", " << j << "));\n";
+            }
+        }
+    }
+}
+string print_corr_buf_idx_DPC(vector<Expr> args, string id) {
+    ostringstream oss;
+    oss << "inline int cm_corr_buf_idx_" << id << "(int i) {\n";
+    auto &cond = args[1].as<Shuffle>()->vectors;
+    auto &acc = args[2].as<Shuffle>()->vectors;
+    for (size_t i = 0; i < cond.size(); i++) {
+        oss << "    if (i < " << cond[i] << ") return (i - " << acc[i] << ");\n";
+    }
+    oss << "    return i;\n}\n";
+    return oss.str();
+}
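+
+// For illustration, with conditions {8} and accumulators {4} the helper
+// emitted by print_corr_buf_idx_DPC above looks like (hypothetical values):
+//     inline int cm_corr_buf_idx_f0(int i) {
+//         if (i < 8) return (i - 4);
+//         return i;
+//     }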
(op->is_intrinsic(Call::cm_corr_buf_idx)) { + string id = unique_name('f'); + string rhs = "cm_corr_buf_idx_" + id + "(" + print_expr(op->args[0]) + ")"; + init_funcs += print_corr_buf_idx_DPC(op->args, id); + print_assignment(op->type, rhs); + return; + } + CodeGen_C::visit(op); +} + +void CodeGen_DPC_Dev::CodeGen_DPC_C::visit(const Load *op) { + user_assert(is_one(op->predicate)) + << "Predicated load is not supported inside CM kernel.\n"; + + internal_assert(allocations.contains(op->name)) + << op->name << "is not allocated\n"; + const auto &alloc = allocations.get(op->name); + is_broadcast = false; + if (alloc.memory_type == MemoryType::Register) { + if (op->index.type().is_vector()) { + auto ramp = op->index.as(); + auto stride = ramp->stride.as()->value; + auto num_base = ramp->base.as(); + auto ramp_base = print_expr(ramp->base); + + if (stride < 0) { + Type t = UInt(16).with_lanes(ramp->lanes); + string tmpl = get_vector_init_tmpl(t, ramp_base, to_string(stride)); + print_vector_op(tmpl); + id = get_vector_iselect(op->name, id); + return; + } + + if (num_base && num_base->value < 0) { + size_t lanes = ramp->lanes + num_base->value; + id = unique_name('_'); + stream << get_vector_declaration(op->type, id); + string rhs = get_vector_select(op->name, "0", lanes, stride); + string lhs = get_vector_select(id, to_string(0 - num_base->value), lanes, stride); + stream << get_indent() << lhs << " = " << rhs << ";\n"; + return; + } + + id = get_vector_select(op->name, ramp_base, ramp->lanes, stride); + } else { + string index = print_expr(op->index); + id = get_vector_element(op->name, index); + } + return; + } + // If we're loading a contiguous ramp into a vector, use block read instead. + bool is_continuous = strided_ramp_base(op->index).defined(); + if (is_continuous) { + auto ramp = op->index.as(); + string ramp_base = print_expr(simplify(ramp->base * op->type.bits() / 8)); + if (ramp->stride.as()->value == -1) + ramp_base += "-" + to_string(ramp->lanes * op->type.bits() / 8); + + if (alloc.memory_type == MemoryType::Heap) { + string tmpl = get_vector_read_tmpl(op->type, op->name, ramp_base); + print_vector_op(tmpl); + } + } else if (op->index.type().is_vector()) { + // If index is a vector, gather vector elements. 
+ internal_assert(op->type.is_vector()); + Type idx_t = op->index.type().with_code(halide_type_uint); + op->index.set_type(idx_t); + string index = print_expr(op->index); + + if (alloc.memory_type == MemoryType::Heap) { + if (in_buffer) { + id = get_vector_read_tmpl(op->type, op->name, "0", index); + } else { + string tmpl = get_vector_read_tmpl(op->type, op->name, "0", index, true); + print_vector_op(tmpl); + } + } + } else { + if (alloc.memory_type == MemoryType::Heap) { + size_t bytes = op->type.bits() / 8; + internal_assert(bytes == 4); + Type t = op->type.with_lanes(16 / bytes); + string index = print_expr(simplify(op->index * 4)); + string tmpl = get_vector_read_tmpl(t, op->name, index, true); + print_vector_op(tmpl); + id = get_vector_element(id, "0"); + } + } +} + +void CodeGen_DPC_Dev::CodeGen_DPC_C::visit(const Store *op) { + user_assert(is_one(op->predicate)) + << "Predicated store is not supported inside CM kernel.\n"; + + internal_assert(allocations.contains(op->name)) + << op->name << "is not allocated\n"; + const auto &alloc = allocations.get(op->name); + if (ends_with(op->name, "_buf")) + in_buffer = true; + string value = print_expr(op->value); + Type t = op->value.type(); + in_buffer = false; + if (alloc.memory_type == MemoryType::Register) { + if (op->index.type().is_vector()) { + auto ramp = op->index.as(); + string ramp_base = print_expr(ramp->base); + size_t lanes = ramp->lanes; + size_t stride = ramp->stride.as()->value; + string dest = get_vector_select(op->name, ramp_base, lanes, stride); + if (value.find("$ID$") != value.npos) + stream << replace_all(value, "$ID$", dest); + else { + if (is_broadcast == true && (value.find("_buf") != value.npos)) { + // stream << "here is my test,value:=" << value << '\n'; + is_broadcast = false; + int buf_pos = value.find("_buf"); + buf_pos += 4; + string select_stmt = ".select<" + to_string(lanes) + ",0>"; + value.insert(buf_pos, select_stmt); + stream << get_indent() << dest << " = " << value << ";\n"; + } else + stream << get_indent() << dest << " = " << value << ";\n"; + } + } else { + string index = print_expr(op->index); + stream << get_indent() << get_vector_element(op->name, index) + << " = " << value << ";\n"; + } + return; + } + bool is_continuous = strided_ramp_base(op->index).defined() || strided_ramp_base(op->index, -1).defined(); + if (is_continuous) { + auto ramp = op->index.as(); + string ramp_base = print_expr(simplify(ramp->base * t.bits() / 8)); + if (ramp->stride.as()->value == -1) + ramp_base += "-" + to_string(ramp->lanes * t.bits() / 8); + + if (alloc.memory_type == MemoryType::Heap) { + string tmpl = get_vector_write_tmpl(t, op->name, ramp_base); + stream << replace_all(tmpl, "$ID$", value); + } + } else { + Type idx_t = op->index.type().with_code(halide_type_uint); + op->index.set_type(idx_t); + string index = print_expr(op->index); + + if (alloc.memory_type == MemoryType::Heap) { + string tmpl = get_vector_write_tmpl(t, op->name, "0", index); + stream << replace_all(tmpl, "$ID$", value); + } + } +} + +void CodeGen_DPC_Dev::CodeGen_DPC_C::visit(const Select *op) { + ostringstream rhs; + string cond = print_expr(op->condition); + string true_val = print_expr(op->true_value); + string false_val = print_expr(op->false_value); + // trick: replace logical operation with bitwise operation + cond = replace_all(cond, "&&", "&"); + cond = replace_all(cond, "||", "|"); + + if (!op->condition.type().is_scalar()) { + id = unique_name('_'); + stream << get_vector_declaration(op->type, id); + + stream << get_indent() << 
"SIMD_IF_BEGIN(" << cond << ") {\n"; + indent += 2; + stream << get_indent() << id << " = " << op->true_value << ";\n"; + indent -= 2; + stream << get_indent() << "} SIMD_ELSE {\n"; + indent += 2; + stream << get_indent() << id << " = " << false_val << ";\n"; + indent -= 2; + stream << get_indent() << "} SIMD_IF_END;\n"; + return; + } + + rhs << "(" << cond + << " ? " << true_val + << " : " << false_val + << ")"; + id = rhs.str(); +} + +void CodeGen_DPC_Dev::CodeGen_DPC_C::visit(const Allocate *op) { + user_assert(!op->new_expr.defined()) << "Allocate node inside CM kernel has custom new expression.\n" + << "(Memoization is not supported inside GPU kernels at present.)\n"; + + if (op->memory_type == MemoryType::GPUShared) { + Allocation alloc; + alloc.type = op->type; + alloc.memory_type = op->memory_type; + allocations.push(op->name, alloc); + open_scope(); + + int32_t size = op->constant_allocation_size(); + internal_assert(size % 4 == 0); + stream << get_indent() << "cm_slm_init(" << size << ");\n"; + stream << get_indent() << "unsigned int " << print_name(op->name) + << " = cm_slm_alloc(" << size << ");\n"; + op->body.accept(this); + + close_scope("alloc " + print_name(op->name)); + // Should have been freed internally + internal_assert(!allocations.contains(op->name)); + } else { + int32_t size = op->constant_allocation_size(); + user_assert(size > 0) + << "Allocation " << op->name << " has a dynamic size. " + << "Only fixed-size allocations are supported on the gpu. " + << "Try storing into shared memory instead."; + internal_assert(size % 4 == 0); + + Allocation alloc; + alloc.type = op->type; + alloc.memory_type = MemoryType::Register; + allocations.push(op->name, alloc); + + stream << get_vector_declaration(op->type.with_lanes(size), print_name(op->name)); + op->body.accept(this); + + internal_assert(!allocations.contains(op->name)); + } +} + +void CodeGen_DPC_Dev::CodeGen_DPC_C::visit(const Free *op) { + // Should have been freed internally + internal_assert(allocations.contains(op->name)); + allocations.pop(op->name); +} + +void CodeGen_DPC_Dev::CodeGen_DPC_C::visit(const AssertStmt *op) { + user_warning << "Ignoring assertion inside CM kernel: " << op->condition << "\n"; +} + +void CodeGen_DPC_Dev::CodeGen_DPC_C::visit(const LetStmt *op) { + Stmt body = op->body; + Expr exp_val = op->value; + + if (starts_with(op->name, "val.")) { + string value = print_expr(exp_val); + Expr new_var = Variable::make(exp_val.type(), value); + body = substitute(op->name, new_var, body); + } + if (starts_with(op->name, "var.") || starts_with(op->name, "ref.")) { + string value = print_expr(exp_val); + string id = unique_name('_'); + stream << get_indent() << print_type(exp_val.type(), AppendSpace) + << id << " = " << value << ";\n"; + Expr new_var = Variable::make(op->value.type(), id); + body = substitute(op->name, new_var, body); + } + body.accept(this); +} + +void CodeGen_DPC_Dev::CodeGen_DPC_C::visit(const Shuffle *op) { + + if (op->is_concat()) { + size_t op_lanes = op->type.lanes(); + std::stringstream vec; + string vec_id = unique_name("init_v"); + vec << "const int " << vec_id << "[" << op_lanes << "] = {"; + for (size_t i = 0; i < op_lanes; i++) { + auto item = op->vectors[i].as(); + user_assert(item) + << "We only support const data item\n"; + vec << item->value << ", "; + } + vec << "};\n"; + init_vecs += vec.str(); + + id = unique_name('_'); + Type t = op->type.with_code(halide_type_uint); + // vectorA(init_v) index + // init_v = {6,5,4,3,2,1} + // simdinit_v + // simdA{init_v} + 
stream << get_vector_declaration(t, id, vec_id); + return; + } + if (op->is_slice()) { + string vec_id = print_expr(op->vectors[0]); + string base = to_string(op->slice_begin()); + id = get_vector_select(vec_id, base, op->indices.size(), op->slice_stride()); + return; + } + CodeGen_C::visit(op); +} + +class FindRefName : public IRVisitor { + const string &buf_name; + +public: + using IRVisitor::visit; + string ref_name; + + void visit(const Call *op) override { + if (op->is_intrinsic(Call::cm_store_2d)) { + internal_assert(op->args[0].as()); + auto &name = op->args[0].as()->name; + if (name == buf_name && op->args.size() == 8) { + internal_assert(op->args[7].as()); + ref_name = op->args[7].as()->value; + } + } + } + + FindRefName(const string &_b) + : buf_name(_b) { + } +}; + +void CodeGen_DPC_Dev::add_kernel(Stmt s, + const string &name, + const vector &args) { + debug(2) << "CodeGen_DPC_Dev::compile " << name << "\n"; + + // TODO: do we have to uniquify these names, or can we trust that they are safe? + cur_kernel_name = name; + clc.add_kernel(s, name, args); + global_declaration = clc.init_vecs + clc.init_funcs; +} + +void CodeGen_DPC_Dev::CodeGen_DPC_C::add_kernel(Stmt s, + const string &name, + const vector &args) { + debug(2) << "Adding oneapi_gpu kernel " << name << "\n"; + stream << get_indent() << "auto e = q.submit([&](handler &cgh) {\n"; + indent += 2; + for (size_t i = 0; i < args.size(); i++) { + if (args[i].is_buffer) { + string name = args[i].name; + // Trick: replace the buffer name with the one specified in stensor + FindRefName frn(name); + s.accept(&frn); + if (!frn.ref_name.empty()) { + name = frn.ref_name; + } + stream << get_indent() << "auto " << print_name(name) << "=img" << print_name(name) << ".get_access(cgh)"; + string fn_img_var = "sycl::image<2>& img" + print_name(name); + function_var.push_back(fn_img_var); + string fn_ext_var = "int " + print_name(name) + "_extent_0"; + function_var.push_back(fn_ext_var); + fn_ext_var = "int " + print_name(name) + "_extent_1"; + function_var.push_back(fn_ext_var); + Allocation alloc; + alloc.type = args[i].type; + alloc.memory_type = MemoryType::Heap; + allocations.push(args[i].name, alloc); + if (i < args.size()) { + stream << ";\n"; + } + } + } + stream << get_indent() << "cgh.parallel_for(\n"; + indent += 2; + stream << get_indent() << "nd_range{GlobalRange, LocalRange},\n"; + stream << get_indent() << "[=](nd_item ndi) SYCL_ESIMD_KERNEL {\n"; + indent += 2; + stream << get_indent() << "using namespace sycl::ext::intel::esimd;\n"; + // stream << ")\n"; + + // open_scope(); + print(s); + close_scope("kernel " + name); + indent -= 2; + // assert(indent != 0); + stream << get_indent() << ");\n"; + indent -= 2; + // assert(indent != 0); + stream << get_indent() << "});\n"; + // indent -= 2; + // stream << get_indent() << "}\n"; + for (size_t i = 0; i < args.size(); i++) { + // Remove buffer arguments from allocation scope + if (args[i].is_buffer) { + allocations.pop(args[i].name); + } + } + // auto end_pos = stream.tellp(); +} + +void CodeGen_DPC_Dev::CodeGen_DPC_C::open_scope() { + stream << get_indent(); + indent += 2; + stream << "{\n"; +} + +void CodeGen_DPC_Dev::CodeGen_DPC_C::close_scope(const std::string &comment) { + indent -= 2; + stream << get_indent(); + if (!comment.empty()) { + stream << "} // " << comment << "\n"; + } else { + stream << "}\n"; + } +} + +void CodeGen_DPC_Dev::init_module() { + debug(2) << "oneapi_gpu device codegen init_module\n"; + + // wipe the internal kernel source + src_stream.str(""); + 
+
+void CodeGen_DPC_Dev::init_module() {
+    debug(2) << "oneapi_gpu device codegen init_module\n";
+
+    // wipe the internal kernel source
+    src_stream.str("");
+    src_stream.clear();
+    cur_kernel_name = "";
+}
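+
+// compile_to_src below wraps the accumulated kernel source in a host-side
+// function of roughly this shape (illustrative; the ranges come from the
+// GPUBlock/GPUThread extents recorded in visit(const For *)):
+//     void execution(sycl::image<2> &img_A, int _A_extent_0, int _A_extent_1) {
+//         sycl::queue q(...);
+//         range<2> GlobalRange(...);
+//         range<2> LocalRange(...);
+//         ... // the submission emitted by add_kernel
+//     }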
+
+vector<char> CodeGen_DPC_Dev::compile_to_src() {
+    string str = src_stream.str();
+    debug(1) << "DPC kernel:\n"
+             << str << "\n";
+    int block_range_dim = range_info_Block.size();
+    int thread_range_dim = range_info_thread.size();
+    int nd_item_dim = thread_range_dim > block_range_dim ? thread_range_dim : block_range_dim;
+    int range_delta = block_range_dim - thread_range_dim;
+    string glb_range = "range<" + to_string(nd_item_dim) + "> GlobalRange(";
+    string lcl_range = "range<" + to_string(nd_item_dim) + "> LocalRange(";
+    // The thread range has more dimensions than the block range.
+    if (range_delta < 0) {
+        while (!range_info_Block.empty()) {
+            glb_range += range_info_Block.top() + "*" + range_info_thread.top() + ",";
+            lcl_range += range_info_thread.top() + ",";
+            range_info_Block.pop();
+            range_info_thread.pop();
+        }
+        while (!range_info_thread.empty()) {
+            glb_range += to_string(1) + "*" + range_info_thread.top() + ",";
+            lcl_range += range_info_thread.top() + ",";
+            range_info_thread.pop();
+        }
+    } else if (range_delta == 0) {
+        while (!range_info_Block.empty()) {
+            glb_range += range_info_Block.top() + "*" + range_info_thread.top() + ",";
+            lcl_range += range_info_thread.top() + ",";
+            range_info_Block.pop();
+            range_info_thread.pop();
+        }
+    } else {
+        while (!range_info_thread.empty()) {
+            glb_range += range_info_Block.top() + "*" + range_info_thread.top() + ",";
+            lcl_range += range_info_thread.top() + ",";
+            range_info_Block.pop();
+            range_info_thread.pop();
+        }
+        while (!range_info_Block.empty()) {
+            glb_range += range_info_Block.top() + "*" + to_string(1) + ",";
+            lcl_range += to_string(1) + ",";
+            range_info_Block.pop();
+        }
+    }
+    glb_range.pop_back();
+    glb_range += ");\n";
+    lcl_range.pop_back();
+    lcl_range += ");\n";
+    string range_str = "#include \"esimd_test_utils.hpp\"\n";
+    range_str += "#include <CL/sycl.hpp>\n";
+    range_str += "#include <sycl/ext/intel/esimd.hpp>\n";
+    range_str += "#include <iostream>\n";
+    range_str += global_declaration + "\n";
+    range_str += "void execution(";
+    for (unsigned i = 0; i < function_var.size() - 1; i++) {
+        range_str += function_var[i];
+        range_str += ",";
+    }
+    range_str += function_var[function_var.size() - 1];
+    range_str += "){\n";
+    range_str += "sycl::queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler(), property::queue::enable_profiling{});\n" + glb_range + lcl_range;
+    range_str += "const int nd_item_dimension = " + to_string(nd_item_dim) + ";\n";
+    range_str += str;
+    range_str += "}";
+
+    vector<char> buffer(range_str.begin(), range_str.end());
+    return buffer;
+}
+
+}  // namespace Internal
+}  // namespace Halide
diff --git a/Halide/src/CodeGen_DPC_Dev.h b/Halide/src/CodeGen_DPC_Dev.h
new file mode 100644
index 00000000..e931d1ae
--- /dev/null
+++ b/Halide/src/CodeGen_DPC_Dev.h
@@ -0,0 +1,149 @@
+#ifndef HALIDE_CODEGEN_DPC_DEV_H
+#define HALIDE_CODEGEN_DPC_DEV_H
+
+/** \file
+ * Defines the code-generator for producing DPC++ (ESIMD) kernel code
+ */
+
+#include <sstream>
+
+#include "CodeGen_C.h"
+#include "CodeGen_GPU_Dev.h"
+#include "Target.h"
+using std::string;
+
+namespace Halide {
+namespace Internal {
+
+class CodeGen_DPC_Dev : public CodeGen_GPU_Dev {
+public:
+    CodeGen_DPC_Dev(Target target);
+
+    /** Compile a GPU kernel into the module. This may be called many times
+     * with different kernels, which will all be accumulated into a single
+     * source module shared by a given Halide pipeline. */
+    void add_kernel(Stmt stmt,
+                    const string &name,
+                    const std::vector<DeviceArgument> &args) override;
+
+    /** (Re)initialize the GPU kernel module. This is separate from compile,
+     * since a GPU device module will often have many kernels compiled into it
+     * for a single pipeline. */
+    void init_module() override;
+
+    std::vector<char> compile_to_src() override;
+
+    string get_current_kernel_name() override {
+        return cur_kernel_name;
+    }
+
+    void dump() override {
+        std::cerr << src_stream.str() << "\n";
+    }
+
+    string print_gpu_name(const string &name) override {
+        return name;
+    }
+
+    string api_unique_name() override {
+        return "dpc";
+    }
+
+protected:
+    class CodeGen_DPC_C : public CodeGen_C {
+    public:
+        string init_vecs;
+        string init_funcs;
+
+        CodeGen_DPC_C(std::ostream &s, Target t)
+            : CodeGen_C(s, t) {
+        }
+        void add_kernel(Stmt stmt,
+                        const string &name,
+                        const std::vector<DeviceArgument> &args);
+
+    private:
+        size_t trick_size = 0;
+        bool in_buffer = false;
+        string get_vector_element(const string &name,
+                                  const string &id_index);
+        string get_vector_init_tmpl(Type t,
+                                    const string &base,
+                                    const string &stride);
+        string get_vector_declaration(Type t,
+                                      const string &id);
+        string get_vector_declaration(Type t,
+                                      const string &id,
+                                      const string &init);
+        string get_vector_read_tmpl(Type t,
+                                    const string &name,
+                                    const string &global_offset,
+                                    const string &element_offset,
+                                    bool with_decl);
+        string get_vector_read_tmpl(Type t,
+                                    const string &name,
+                                    const string &offset,
+                                    bool dword_aligned);
+        string get_vector_write_tmpl(Type t,
+                                     const string &name,
+                                     const string &global_offset,
+                                     const string &element_offset);
+        string get_vector_write_tmpl(Type t,
+                                     const string &name,
+                                     const string &offset);
+        string get_slm_read_tmpl(Type t,
+                                 const string &name,
+                                 const string &global_offset,
+                                 const string &element_offset);
+        string get_slm_write_tmpl(Type t,
+                                  const string &name,
+                                  const string &global_offset,
+                                  const string &element_offset);
+        string get_vector_select(const string &name,
+                                 const string &base,
+                                 int size,
+                                 int stride);
+        string get_vector_iselect(const string &name,
+                                  const string &idx);
+        string print_vector_op(const string &tmpl,
+                               char prefix = '_');
+        void print_media_block_rw(Type t, std::vector<Expr> args, bool is_write);
+
+    protected:
+        using CodeGen_C::visit;
+        string print_assignment(Type t, const std::string &rhs) override;
+        string print_type(Type type, AppendSpaceIfNeeded append_space = DoNotAppendSpace) override;
+
+        void visit(const For *) override;
+        void visit(const Ramp *op) override;
+        void visit(const Broadcast *op) override;
+        void visit(const Min *op) override;
+        void visit(const Max *op) override;
+        void visit(const Mod *op) override;
+        void visit(const Div *op) override;
+        void visit(const Call *op) override;
+        void visit(const Load *op) override;
+        void visit(const Store *op) override;
+        void visit(const Select *op) override;
+        void visit(const LetStmt *op) override;
+        void visit(const FloatImm *op) override;
+        void visit(const Allocate *op) override;
+        void visit(const Free *op) override;
+        void visit(const AssertStmt *op) override;
+        void visit(const Shuffle *op) override;
+        void visit(const Variable *op) override;
+        void visit(const Evaluate *op) override;
+        void visit_binop(Type t, Expr a, Expr b, const char *op) override;
+
+        void open_scope() override;
+        void close_scope(const std::string &comment) override;
+    };
+
+    std::ostringstream src_stream;
+    string cur_kernel_name;
+    CodeGen_DPC_C clc;
+};
+
+}  // namespace Internal
+}  // namespace Halide
+
+#endif
diff --git a/Halide/src/CodeGen_GPU_Host.cpp b/Halide/src/CodeGen_GPU_Host.cpp
index 66b997f9..d471b61c 100644
--- a/Halide/src/CodeGen_GPU_Host.cpp
+++ b/Halide/src/CodeGen_GPU_Host.cpp
@@ -1,13 +1,15 @@
-#include 
 #include 
+#include 
+
+#include "../../t2s/src/CodeGen_OneAPI_Dev.h"
+#include "../../t2s/src/Utilities.h"
+#include "CodeGen_CM_Dev.h"
 #include "CodeGen_D3D12Compute_Dev.h"
+#include "CodeGen_DPC_Dev.h"
 #include "CodeGen_GPU_Host.h"
 #include "CodeGen_Internal.h"
 #include "CodeGen_Metal_Dev.h"
 #include "CodeGen_OpenCL_Dev.h"
-#include "../../t2s/src/CodeGen_OneAPI_Dev.h"
-#include "CodeGen_CM_Dev.h"
 #include "CodeGen_OpenGLCompute_Dev.h"
 #include "CodeGen_OpenGL_Dev.h"
 #include "CodeGen_PTX_Dev.h"
@@ -19,8 +21,6 @@
 #include "Simplify.h"
 #include "Util.h"
 #include "VaryingAttributes.h"
-#include "../../t2s/src/Utilities.h"
-
 namespace Halide {
 namespace Internal {
@@ -117,7 +117,7 @@ CodeGen_GPU_Host<CodeGen_CPU>::CodeGen_GPU_Host(Target target)
         debug(1) << "Constructing CUDA device codegen\n";
         cgdev[DeviceAPI::CUDA] = new CodeGen_PTX_Dev(target);
     }
-    if (target.has_feature(Target::IntelGPU)) {
+    if (target.has_feature(Target::CM) && target.has_feature(Target::IntelGPU)) {
        debug(1) << "Constructing CM device codegen\n";
        cgdev[DeviceAPI::CM] = new CodeGen_CM_Dev(target);
     }
@@ -126,8 +126,13 @@ CodeGen_GPU_Host<CodeGen_CPU>::CodeGen_GPU_Host(Target target)
         cgdev[DeviceAPI::OpenCL] = new CodeGen_OpenCL_Dev(target);
     }
     if (target.has_feature(Target::OneAPI)) {
-        debug(1) << "Constructing OneAPI device codegen\n";
-        cgdev[DeviceAPI::OneAPI] = new CodeGen_OneAPI_Dev(target);
+        if (target.has_feature(Target::IntelFPGA)) {
+            debug(1) << "Constructing OneAPI device codegen for FPGA\n";
+            cgdev[DeviceAPI::OneAPI] = new CodeGen_OneAPI_Dev(target);
+        } else if (target.has_feature(Target::IntelGPU)) {
+            debug(1) << "Constructing OneAPI device codegen for GPU\n";
+            cgdev[DeviceAPI::OneAPI] = new CodeGen_DPC_Dev(target);
+        }
     }
     if (target.has_feature(Target::Metal)) {
         debug(1) << "Constructing Metal device codegen\n";
@@ -166,19 +171,18 @@ void CodeGen_GPU_Host<CodeGen_CPU>::compile_func(const LoweredFunc &f,
     if (target.has_feature(Target::IntelFPGA) && !target.has_feature(Target::OneAPI)) {
         internal_assert(cgdev.find(DeviceAPI::OpenCL) != cgdev.end());
         // Defines Pipes/Channels and their data types
-        ((CodeGen_OpenCL_Dev*)cgdev[DeviceAPI::OpenCL])->print_global_data_structures_before_kernel(&f.body);
+        ((CodeGen_OpenCL_Dev *)cgdev[DeviceAPI::OpenCL])->print_global_data_structures_before_kernel(&f.body);
         // Gather shift registers' allocations.
-        ((CodeGen_OpenCL_Dev*)cgdev[DeviceAPI::OpenCL])->gather_shift_regs_allocates(&f.body);
+        ((CodeGen_OpenCL_Dev *)cgdev[DeviceAPI::OpenCL])->gather_shift_regs_allocates(&f.body);
     }
-    if(target.has_feature(Target::IntelFPGA) && target.has_feature(Target::OneAPI)){
+    if (target.has_feature(Target::IntelFPGA) && target.has_feature(Target::OneAPI)) {
         internal_assert(cgdev.find(DeviceAPI::OneAPI) != cgdev.end());
-        ((CodeGen_OneAPI_Dev*)cgdev[DeviceAPI::OneAPI])->print_global_data_structures_before_kernel(&f.body);
+        ((CodeGen_OneAPI_Dev *)cgdev[DeviceAPI::OneAPI])->print_global_data_structures_before_kernel(&f.body);
         // Gather shift registers' allocations.
-        ((CodeGen_OneAPI_Dev*)cgdev[DeviceAPI::OneAPI])->gather_shift_regs_allocates(&f.body);
+        ((CodeGen_OneAPI_Dev *)cgdev[DeviceAPI::OneAPI])->gather_shift_regs_allocates(&f.body);
     }
-
 
     // Call the base implementation to create the function.
@@ -231,8 +235,17 @@ void CodeGen_GPU_Host<CodeGen_CPU>::compile_func(const LoweredFunc &f,
             file.close();
             return;
         }
+        if (api_unique_name == "dpc") {
+            debug(1) << "Currently, we do not implement a DPC++ runtime, so we just emit source code.\n";
+            std::ofstream file(simple_name + ".sycl.h", std::fstream::out);
+            std::string src(kernel_src.cbegin(), kernel_src.cend());
+            if (file.is_open())
+                file << src;
+            file.close();
+            return;
+        }
 
-        if(api_unique_name == "oneapi"){
+        if (api_unique_name == "oneapi") {
             debug(1) << "Currently, we do not implement OneAPI runtime, so we just emit source code.\n";
             debug(1) << "Emitting " << api_unique_name << " code\n";
             std::ofstream file(simple_name + ".generated_oneapi_header.h", std::fstream::out);
@@ -259,8 +272,8 @@ void CodeGen_GPU_Host<CodeGen_CPU>::compile_func(const LoweredFunc &f,
         Value *user_context = get_user_context();
         Value *kernel_size = ConstantInt::get(i32_t, kernel_src.size());
         std::string init_kernels_name;
-        if( api_unique_name == "oneapi" ){
-                init_kernels_name = "halide_opencl_initialize_kernels";
+        if (api_unique_name == "oneapi") {
+            init_kernels_name = "halide_opencl_initialize_kernels";
         } else {
             init_kernels_name = "halide_" + api_unique_name + "_initialize_kernels";
         }
@@ -286,10 +299,13 @@ void CodeGen_GPU_Host<CodeGen_CPU>::visit(const Realize *op) {
 
 class KernelStoresToMemory : public IRVisitor {
     using IRVisitor::visit;
+
 public:
     bool stores_to_memory;
 
-    KernelStoresToMemory() : stores_to_memory(false) {}
+    KernelStoresToMemory()
+        : stores_to_memory(false) {
+    }
 
     void visit(const Store *op) override {
         stores_to_memory = true;
@@ -322,7 +338,7 @@ string create_kernel_name(const For *op) {
 
 template<typename CodeGen_CPU>
 void CodeGen_GPU_Host<CodeGen_CPU>::visit(const For *loop) {
-    if (CodeGen_GPU_Dev::is_gpu_var(loop->name) || ends_with(loop->name, ".run_on_device") || target.has_feature(Target::OneAPI)) {
+    if (CodeGen_GPU_Dev::is_gpu_var(loop->name) || ends_with(loop->name, ".run_on_device") || (target.has_feature(Target::OneAPI) && !target.has_feature(Target::IntelGPU))) {
         // We're in the loop over outermost block dimension
         debug(2) << "Kernel launch: " << loop->name << "\n";
@@ -425,23 +441,20 @@ void CodeGen_GPU_Host<CodeGen_CPU>::visit(const For *loop) {
         }
     }
 
-
     CodeGen_GPU_Dev *gpu_codegen;
-
-    if( target.has_feature(Target::OneAPI) ){
-        // OneAPI combines host and device code into the same file, so use that code generator
+    if (target.has_feature(Target::OneAPI) && !target.has_feature(Target::IntelGPU)) {
+        // OneAPI combines host and device code into the same file, so use that code generator
         // since there is no device API, force into OneAPI CodeGen
         debug(2) << "Using for : " << kernel_name << "\n";
-        gpu_codegen = ((CodeGen_OneAPI_Dev*)cgdev[DeviceAPI::OneAPI]);
+        gpu_codegen = ((CodeGen_OneAPI_Dev *)cgdev[DeviceAPI::OneAPI]);
     } else {
        gpu_codegen = cgdev[loop->device_api];
     }
     user_assert(gpu_codegen != nullptr)
         << "Loop is scheduled on device " << loop->device_api
        << " which does not appear in target " << target.to_string() << "\n";
-    if( target.has_feature(Target::OneAPI) ){
-
+    if (target.has_feature(Target::OneAPI) && !target.has_feature(Target::IntelGPU)) {
     }
     gpu_codegen->add_kernel(loop, kernel_name, closure_args);
@@ -611,7 +624,7 @@ void CodeGen_GPU_Host<CodeGen_CPU>::visit(const For *loop) {
 
         // Order-of-evaluation is guaranteed to be in order in brace-init-lists,
         // so the multiple calls to codegen here are fine
-        if (loop->device_api == DeviceAPI::CM) {
+        if (loop->device_api == DeviceAPI::CM || (loop->device_api == DeviceAPI::OneAPI && loop->for_type == Halide::Internal::ForType::GPUBlock)) {
            debug(1) << "Currently, we do not implement CM runtime.\n";
"Currently, we do not implement CM runtime.\n"; get_module_state(api_unique_name); return; @@ -646,7 +659,7 @@ void CodeGen_GPU_Host::visit(const For *loop) { gpu_num_coords_dim1, }; std::string run_fn_name; - if( api_unique_name == "oneapi"){ + if (api_unique_name == "oneapi") { run_fn_name = "halide_opencl_run"; } else { run_fn_name = "halide_" + api_unique_name + "_run"; diff --git a/Halide/src/CodeGen_LLVM.cpp b/Halide/src/CodeGen_LLVM.cpp index 94fb3660..df65b583 100644 --- a/Halide/src/CodeGen_LLVM.cpp +++ b/Halide/src/CodeGen_LLVM.cpp @@ -576,8 +576,6 @@ void CodeGen_LLVM::init_codegen(const std::string &name, bool any_strict_float) internal_assert(module && context); - debug(1) << "Target triple of initial module: " << module->getTargetTriple() << "\n"; - module->setModuleIdentifier(name); // Add some target specific info to the module as metadata. @@ -635,8 +633,6 @@ void CodeGen_LLVM::compile_to_devsrc(const Module &input) { const auto names = get_mangled_names(f, get_target()); compile_func(f, names.simple_name, names.extern_name); } - - debug(2) << module.get() << "\n";; } std::unique_ptr CodeGen_LLVM::compile(const Module &input) { @@ -648,7 +644,6 @@ std::unique_ptr CodeGen_LLVM::compile(const Module &input) { add_external_code(input); // Generate the code for this module. - debug(1) << "Generating llvm bitcode...\n"; for (const auto &b : input.buffers()) { compile_buffer(b); } @@ -670,8 +665,6 @@ std::unique_ptr CodeGen_LLVM::compile(const Module &input) { } } - debug(2) << module.get() << "\n"; - return finish_codegen(); } @@ -732,7 +725,6 @@ void CodeGen_LLVM::begin_func(LinkageType linkage, const std::string &name, } } - debug(1) << "Generating llvm bitcode prolog for function " << name << "...\n"; // Null out the destructor block. 
     destructor_block = nullptr;
@@ -1084,7 +1076,6 @@ llvm::Function *CodeGen_LLVM::add_argv_wrapper(llvm::Function *fn,
             wrapper_args.push_back(builder->CreateLoad(ptr));
         }
     }
-    debug(4) << "Creating call from wrapper to actual function\n";
     llvm::CallInst *result = builder->CreateCall(fn, wrapper_args);
     // This call should never inline
     result->setIsNoInline();
@@ -1219,11 +1210,7 @@ llvm::Type *CodeGen_LLVM::llvm_type_of(const Type &t) const {
 }
 
 void CodeGen_LLVM::optimize_module() {
-    debug(3) << "Optimizing module\n";
-
-    if (debug::debug_level() >= 3) {
-        module->print(dbgs(), nullptr, false, true);
-    }
 
     std::unique_ptr<llvm::TargetMachine> tm = make_target_machine(*module);
@@ -1341,7 +1328,6 @@ void CodeGen_LLVM::optimize_module() {
             : legacy::FunctionPassManager(m) {
         }
         void add(Pass *p) override {
-            debug(2) << "Adding function pass: " << p->getPassName().str() << "\n";
             legacy::FunctionPassManager::add(p);
         }
     };
@@ -1349,7 +1335,6 @@ void CodeGen_LLVM::optimize_module() {
     class MyModulePassManager : public legacy::PassManager {
     public:
         void add(Pass *p) override {
-            debug(2) << "Adding module pass: " << p->getPassName().str() << "\n";
             legacy::PassManager::add(p);
         }
     };
@@ -1434,10 +1419,7 @@ void CodeGen_LLVM::optimize_module() {
     module_pass_manager.run(*module);
 #endif
 
-    debug(3) << "After LLVM optimizations:\n";
-    if (debug::debug_level() >= 2) {
-        module->print(dbgs(), nullptr, false, true);
-    }
+
 }
 
 void CodeGen_LLVM::sym_push(const string &name, llvm::Value *value) {
@@ -1477,7 +1459,6 @@ bool CodeGen_LLVM::sym_exists(const string &name) const {
 
 Value *CodeGen_LLVM::codegen(Expr e) {
     internal_assert(e.defined());
-    debug(4) << "Codegen: " << e.type() << ", " << e << "\n";
     value = nullptr;
     e.accept(this);
     internal_assert(value) << "Codegen of an expr did not produce an llvm value\n";
@@ -1503,7 +1484,6 @@ Value *CodeGen_LLVM::codegen(Expr e) {
 
 void CodeGen_LLVM::codegen(Stmt s) {
     internal_assert(s.defined());
-    debug(3) << "Codegen: " << s << "\n";
     value = nullptr;
     s.accept(this);
 }
@@ -1565,7 +1545,6 @@ void CodeGen_LLVM::visit(const Cast *op) {
     if (upgrade_type_for_arithmetic(src) != src ||
         upgrade_type_for_arithmetic(dst) != dst) {
         // Handle casts to and from types for which we don't have native support.
- debug(4) << "Emulating cast from " << src << " to " << dst << "\n"; if ((src.is_float() && src.bits() < 32) || (dst.is_float() && dst.bits() < 32)) { Expr equiv = lower_float16_cast(op); @@ -2297,7 +2276,6 @@ void CodeGen_LLVM::scalarize(Expr e) { void CodeGen_LLVM::codegen_predicated_vector_store(const Store *op) { const Ramp *ramp = op->index.as(); if (ramp && is_one(ramp->stride)) { // Dense vector store - debug(4) << "Predicated dense vector store\n\t" << Stmt(op) << "\n"; Value *vpred = codegen(op->predicate); Halide::Type value_type = op->value.type(); Value *val = codegen(op->value); @@ -2343,7 +2321,6 @@ void CodeGen_LLVM::codegen_predicated_vector_store(const Store *op) { add_tbaa_metadata(store_inst, op->name, slice_index); } } else { // It's not dense vector store, we need to scalarize it - debug(4) << "Scalarize predicated vector store\n"; Type value_type = op->value.type().element_of(); Value *vpred = codegen(op->predicate); Value *vval = codegen(op->value); @@ -2376,7 +2353,6 @@ void CodeGen_LLVM::codegen_predicated_vector_store(const Store *op) { } Value *CodeGen_LLVM::codegen_dense_vector_load(const Load *load, Value *vpred) { - debug(4) << "Vectorize predicated dense vector load:\n\t" << Expr(load) << "\n"; const Ramp *ramp = load->index.as(); internal_assert(ramp && is_one(ramp->stride)) << "Should be dense vector load\n"; @@ -2450,7 +2426,6 @@ void CodeGen_LLVM::codegen_predicated_vector_load(const Load *op) { Value *vpred = codegen(op->predicate); value = codegen_dense_vector_load(op, vpred); } else if (ramp && stride && stride->value == -1) { - debug(4) << "Predicated dense vector load with stride -1\n\t" << Expr(op) << "\n"; vector indices(ramp->lanes); for (int i = 0; i < ramp->lanes; i++) { indices[i] = ramp->lanes - 1 - i; @@ -2475,7 +2450,6 @@ void CodeGen_LLVM::codegen_predicated_vector_load(const Load *op) { } else { // It's not dense vector load, we need to scalarize it Expr load_expr = Load::make(op->type, op->name, op->index, op->image, op->param, const_true(op->type.lanes()), op->alignment); - debug(4) << "Scalarize predicated vector load\n\t" << load_expr << "\n"; Expr pred_load = Call::make(load_expr.type(), Call::if_then_else, {op->predicate, load_expr, make_zero(load_expr.type())}, @@ -3138,8 +3112,6 @@ void CodeGen_LLVM::visit(const Call *op) { current_function_args, get_target()) .extern_name; - debug(1) << "Did not find function " << sub_fn_name - << ", assuming extern \"C\" " << extern_sub_fn_name << "\n"; vector arg_types; for (const auto &arg : function->args()) { arg_types.push_back(arg.getType()); @@ -3153,7 +3125,6 @@ void CodeGen_LLVM::visit(const Call *op) { llvm::GlobalValue *sub_fn_ptr = module->getNamedValue(extern_sub_fn_name); if (!sub_fn_ptr) { - debug(1) << "Did not find function ptr " << extern_sub_fn_name << ", assuming extern \"C\".\n"; sub_fn_ptr = new GlobalVariable(*module, sub_fn->getType(), /*isConstant*/ true, GlobalValue::ExternalLinkage, /*initializer*/ nullptr, extern_sub_fn_name); @@ -3388,7 +3359,6 @@ void CodeGen_LLVM::visit(const Call *op) { bool takes_user_context = function_takes_user_context(op->name); if (takes_user_context) { internal_assert(fn) << "External function " << op->name << " is marked as taking user_context, but is not in the runtime module. 
Check if runtime_api.cpp needs to be rebuilt.\n"; - debug(4) << "Adding user_context to " << op->name << " args\n"; args.insert(args.begin(), get_user_context()); } @@ -3413,9 +3383,7 @@ void CodeGen_LLVM::visit(const Call *op) { fn = llvm::Function::Create(func_t, llvm::Function::ExternalLinkage, name, module.get()); fn->setCallingConv(CallingConv::C); - debug(4) << "Did not find " << op->name << ". Declared it extern \"C\".\n"; } else { - debug(4) << "Found " << op->name << "\n"; // TODO: Say something more accurate here as there is now // partial information in the handle_type field, but it is @@ -3442,8 +3410,6 @@ void CodeGen_LLVM::visit(const Call *op) { } if (t != args[i]->getType()) { - debug(4) << "Pointer casting argument to extern call: " - << halide_arg << "\n"; args[i] = builder->CreatePointerCast(args[i], t); } } @@ -3954,7 +3920,6 @@ void CodeGen_LLVM::do_parallel_tasks(const vector &tasks) { internal_assert(do_par_for) << "Could not find halide_do_par_for in initial module\n"; do_par_for->addParamAttr(4, Attribute::NoAlias); Value *args[] = {get_user_context(), task_ptr, min, extent, closure_ptr}; - debug(4) << "Creating call to do_par_for\n"; result = builder->CreateCall(do_par_for, args); } else { // Populate the task struct diff --git a/Halide/src/Func.cpp b/Halide/src/Func.cpp index 0a616a86..65460a1b 100644 --- a/Halide/src/Func.cpp +++ b/Halide/src/Func.cpp @@ -2663,11 +2663,12 @@ Func &Func::gpu_fetch(Var loop_level, MemoryType mem_type, vector outs, vec return *this; } -Func &Func::gpu_store(const vector &args, size_t sz) { +Func &Func::gpu_store(const vector &args, const string &name, size_t sz) { invalidate_cache(); StoreParams &rp = func.definition().schedule().store_params(); rp.shape_args = args; + rp.name = name; rp.rw_len = sz; return *this; diff --git a/Halide/src/Func.h b/Halide/src/Func.h index 7160f580..2ba2c1f3 100644 --- a/Halide/src/Func.h +++ b/Halide/src/Func.h @@ -2232,7 +2232,7 @@ class Func { Func &compute_at(LoopLevel loop_level); Func &gpu_fetch(Var loop_level, MemoryType mem_type, vector outs, vector reuse_args); - Func &gpu_store(const vector &args, size_t sz = 16); + Func &gpu_store(const vector &args, const string &name, size_t sz = 16); /** Schedule the iteration over the initial definition of this function * to be fused with another stage 's' from outermost loop to a diff --git a/Halide/src/Module.cpp b/Halide/src/Module.cpp index 019e23f3..d625cfc6 100644 --- a/Halide/src/Module.cpp +++ b/Halide/src/Module.cpp @@ -52,7 +52,7 @@ std::map get_output_info(const Target &target) { {Output::featurization, {"featurization", ".featurization"}}, {Output::llvm_assembly, {"llvm_assembly", ".ll"}}, {Output::object, {"object", is_windows_coff ? 
".obj" : ".o"}}, - {Output::oneapi, {"c_header", ".generated_oneapi_header.h"}}, + {Output::oneapi_fpga, {"c_header", ".sycl.h"}}, {Output::python_extension, {"python_extension", ".py.cpp"}}, {Output::pytorch_wrapper, {"pytorch_wrapper", ".pytorch.h"}}, {Output::registration, {"registration", ".registration.cpp"}}, @@ -62,7 +62,8 @@ std::map get_output_info(const Target &target) { {Output::stmt_html, {"stmt_html", ".stmt.html"}}, {Output::dev_src, {"dev_src", "_genx.cpp"}}, {Output::host_header, {"host_header", ".h"}}, - {Output::host_src, {"host_src", ".cpp"}} + {Output::host_src, {"host_src", ".cpp"}}, + {Output::oneapi_gpu, {"oneapi_gpu", "_genx.cpp"}}, }; return ext; } @@ -592,7 +593,33 @@ void Module::compile(const std::map &output_files) const { ret->compile_to_devsrc(*this); delete ret; } + if (contains(output_files, Output::oneapi_gpu)) { + debug(1) << "Module.compile(): oneapi_src " << output_files.at(Output::oneapi_gpu) << "\n"; + //std::ofstream file(output_files.at(Output::oneapi_gpu)); + llvm::LLVMContext context; + CodeGen_LLVM *ret = new CodeGen_GPU_Host(this->target()); + ret->set_context(context); + ret->compile_to_devsrc(*this); + delete ret; + } + if (contains(output_files, Output::oneapi_fpga)) { + debug(1) << "Module.compile(): oneapi_dev " << output_files.at(Output::oneapi_fpga) << "\n"; + auto t = target(); + t.set_feature(Target::OpenCL, false); + + // CodeGen_OneAPI expects to be compiled with DPC++ i.e. C++17 + // So we hard set the featrues here + t.set_feature(Target::CPlusPlusMangling, true); + // We invoke compile() like method using the OneAPI CodeGenerator much like CodeGen_C + // Unlike outputing the devsrc only as done in Output::cm_devsrc with CodeGen_GPU_Host(t) + // Since the CodeGen_OneAPI_C is protected class of CodeGen_OneAPI_Dev, + // we use a public CodeGen_OneAPI_C method to do this + std::ofstream file(output_files.at(Output::oneapi_fpga)); + Internal::CodeGen_OneAPI_Dev cg(t); + std::string out_str = cg.compile_oneapi(*this); + file << out_str; + } if (contains(output_files, Output::object) || contains(output_files, Output::assembly) || contains(output_files, Output::bitcode) || contains(output_files, Output::llvm_assembly) || contains(output_files, Output::static_library)) { @@ -655,24 +682,6 @@ void Module::compile(const std::map &output_files) const { target().has_feature(Target::CPlusPlusMangling) ? Internal::CodeGen_C::CPlusPlusImplementation : Internal::CodeGen_C::CImplementation); cg.compile(*this); } - if (contains(output_files, Output::oneapi)) { - debug(1) << "Module.compile(): oneapi_dev " << output_files.at(Output::oneapi) << "\n"; - auto t = target(); - t.set_feature(Target::OpenCL, false); - - // CodeGen_OneAPI expects to be compiled with DPC++ i.e. 
-        // So we hard set the featrues here
-        t.set_feature(Target::CPlusPlusMangling, true);
-
-        // We invoke compile() like method using the OneAPI CodeGenerator much like CodeGen_C
-        // Unlike outputing the devsrc only as done in Output::cm_devsrc with CodeGen_GPU_Host(t)
-        // Since the CodeGen_OneAPI_C is protected class of CodeGen_OneAPI_Dev,
-        // we use a public CodeGen_OneAPI_C method to do this
-        std::ofstream file(output_files.at(Output::oneapi));
-        Internal::CodeGen_OneAPI_Dev cg(t);
-        std::string out_str = cg.compile_oneapi(*this);
-        file << out_str;
-    }
     if (contains(output_files, Output::host_header)) {
         debug(1) << "Module.compile(): host_header " << output_files.at(Output::host_header) << "\n";
         std::ofstream file(output_files.at(Output::host_header));
diff --git a/Halide/src/Module.h b/Halide/src/Module.h
index 2b745b61..9168d19a 100644
--- a/Halide/src/Module.h
+++ b/Halide/src/Module.h
@@ -29,7 +29,7 @@ enum class Output {
     featurization,
     llvm_assembly,
     object,
-    oneapi,
+    oneapi_fpga,
     python_extension,
     pytorch_wrapper,
     registration,
@@ -39,7 +39,8 @@ enum class Output {
     stmt_html,
     dev_src,
     host_header,
-    host_src
+    host_src,
+    oneapi_gpu
 };
 
 /** Type of linkage a function in a lowered Halide module can have. */
diff --git a/Halide/src/Pipeline.cpp b/Halide/src/Pipeline.cpp
index f708a9b2..6c350222 100644
--- a/Halide/src/Pipeline.cpp
+++ b/Halide/src/Pipeline.cpp
@@ -33,9 +33,10 @@ std::string output_name(const string &filename, const string &fn_name, const string &ext) {
 std::string output_name(const string &filename, const Module &m, const string &ext) {
     return output_name(filename, m.name(), ext);
 }
-
+// The function name serves as the filename here.
 std::map<Output, std::string> single_output(const string &filename, const Module &m, Output output_type) {
     auto ext = get_output_info(m.target());
+    // output_name returns fn_name here
     std::map<Output, std::string> outputs = {
         {output_type, output_name(filename, m, ext.at(output_type).extension)}};
     return outputs;
@@ -350,8 +351,14 @@ void Pipeline::compile_to_oneapi(const vector<Argument> &args,
     debug(2) << "OneAPI-compiling for: " << target << "\n";
 
     Module m = compile_to_module(args, fn_name, target);
-    auto ext = get_output_info(target);
-    m.compile(single_output( fn_name + ext.at(Output::oneapi).extension, m, Output::oneapi));
+    if (target.has_feature(Target::IntelGPU)) {
+        // Output::oneapi_gpu -> fn_name
+        m.compile(single_output(fn_name, m, Output::oneapi_gpu));
+    } else if (target.has_feature(Target::IntelFPGA)) {
+        auto ext = get_output_info(target);
+        m.compile(single_output(fn_name + ext.at(Output::oneapi_fpga).extension, m, Output::oneapi_fpga));
+    }
 }
 
 void Pipeline::print_loop_nest() {
@@ -410,6 +417,7 @@ void Pipeline::compile_to_host(const string &filename_prefix,
     auto ext = get_output_info(target);
     std::map<Output, std::string> outputs = {
         {Output::dev_src, fn_name},
+        {Output::oneapi_gpu, fn_name},
         {Output::host_header, filename_prefix + ext.at(Output::host_header).extension},
         {Output::host_src, filename_prefix + ext.at(Output::host_src).extension},
     };
diff --git a/Halide/src/Schedule.h b/Halide/src/Schedule.h
index b3d4df22..b263e204 100644
--- a/Halide/src/Schedule.h
+++ b/Halide/src/Schedule.h
@@ -474,6 +474,7 @@ struct FetchParams {
 
 struct StoreParams {
     std::vector<Expr> shape_args;
+    std::string name;
     size_t rw_len;
 };
 
diff --git a/Halide/src/Target.cpp b/Halide/src/Target.cpp
index e483081e..ed8372c7 100644
--- a/Halide/src/Target.cpp
+++ b/Halide/src/Target.cpp
@@ -373,6 +373,7 @@ const std::map<std::string, Target::Feature> feature_name_map = {
     {"sve2", Target::SVE2},
     {"intel_fpga", Target::IntelFPGA},
     {"intel_gpu", Target::IntelGPU},
{"cm", Target::CM}, {"enable_synthesis", Target::EnableSynthesis} // NOTE: When adding features to this map, be sure to update // PyEnums.cpp and halide.cmake as well. @@ -663,7 +664,10 @@ bool Target::supported() const { bad |= has_feature(Target::CUDA); #endif #if !defined(WITH_OPENCL) - bad |= has_feature(Target::OpenCL) || has_feature(Target::OneAPI); // (TODO) Be able to seperate OneAPI from OpenCL + bad |= has_feature(Target::OpenCL); +#endif +#if !defined(WITH_ONEAPI) + bad |= has_feature(Target::OneAPI); #endif #if !defined(WITH_CM) bad |= has_feature(Target::IntelGPU); @@ -687,10 +691,6 @@ void Target::set_feature(Feature f, bool value) { if (f == Target::IntelFPGA && value) { // Enabling generating OpenCL code for Intel FPGAs features.set(Target::OpenCL, true); - } else if (f == Target::OneAPI && value) { - // Enabling generating OpenCL OneAPI code for IntelFPGAs w/ CodeGen_OneAPI_Dev.h/.cpp - // NOTE, the IntelFPGA must be set before the OneAPI is set - features.set(Target::OpenCL, false); } } diff --git a/Halide/src/Target.h b/Halide/src/Target.h index 48fb6579..01cb622d 100644 --- a/Halide/src/Target.h +++ b/Halide/src/Target.h @@ -128,6 +128,7 @@ struct Target { OneAPI = halide_target_feature_one_api, IntelGPU = halide_target_feature_intel_gpu, EnableSynthesis = halide_target_feature_enable_synthesis, + CM = halide_target_feature_cm, FeatureEnd = halide_target_feature_end }; Target() diff --git a/Halide/src/runtime/HalideRuntime.h b/Halide/src/runtime/HalideRuntime.h index 2aefb623..bb94e86b 100644 --- a/Halide/src/runtime/HalideRuntime.h +++ b/Halide/src/runtime/HalideRuntime.h @@ -1310,6 +1310,7 @@ typedef enum halide_target_feature_t { halide_target_feature_one_api, ///< Enable Intel OneAPI dpcpp program generation halide_target_feature_intel_gpu, ///< Enable Intel Graphics halide_target_feature_enable_synthesis, ///< Enable synthesizing binaries. Currently used only for Intel FPGAs. + halide_target_feature_cm, ///< Enable C for metal halide_target_feature_end ///< A sentinel. Every target is considered to have this feature, and setting this feature does nothing. } halide_target_feature_t; diff --git a/README.md b/README.md index 32551870..6a2727f6 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,6 @@ Currently, we support only Intel FPGAs and GPUs. We assume your device is local ``` # Install tools (once) - + [DevCloud] From the **head node**, submit a job with one of the following commands, based on the type of device you will use: ``` @@ -48,8 +47,7 @@ Currently, we support only Intel FPGAs and GPUs. We assume your device is local qsub -l nodes=1:iris_xe_max:ppn=2 -d $HOME/t2sp $HOME/t2sp/install-tools.sh ```` This may take 1-5 hours on DevCloud, depending on the specific machine allocated for the job. - - A known issue: on a GEN 9.5 GPU machine, it is possible to see some errors during installing `m4`, but it turns out that package is not necessary for that machine, and we can ignore the error. + + [Local machine with an FPGA or a GPU] @@ -64,9 +62,12 @@ Currently, we support only Intel FPGAs and GPUs. We assume your device is local tar -xvf AOCL-pro-*-linux.tar ./setup_pro.sh ``` ++ known issues: + - on a GEN 9.5 GPU machine, it is possible to see some errors during installing `m4`, but it turns out that package is not necessary for that machine, and we can ignore the error. + - Python 2.x is required for ninja. Make sure you already have python 2.x. `Install-tools.sh ` will not help you download it. 
+ Note:
-  + We assume your system has python >= 2.7 already installed.
   + The above `install-tools.sh` command installs llvm-clang >= 9.0, gcc >= 7.5.0, and python's numpy and matplotlib package. The command installs all of them and their dependencies we know to make the system self-contained. If your system has some of the tools already installed, you could edit `install-tools.sh` to disable the installations of these tools, then modify the environment setting as shown below.
diff --git a/install-tool.sh b/install-tool.sh
index c0e88a4a..4cedf96c 100755
--- a/install-tool.sh
+++ b/install-tool.sh
@@ -2,13 +2,13 @@

 function show_usage {
     echo "Usage:"
-    echo "  ./install-tool.sh m4|gmp|mpfr|mpc|cmake|gcc|llvm-clang|python-packages|cm|git-lfs"
+    echo "  ./install-tool.sh m4|gmp|mpfr|mpc|cmake|gcc|llvm-clang|python-packages|cm|git-lfs|ninja|re2c|oneapi-esimd|oneapi-support"
 }

 # No matter the script is sourced or directly run, BASH_SOURCE is always this script, and $1 is the
 # argument to the script
 T2S_PATH="$( cd "$(dirname "$BASH_SOURCE" )" >/dev/null 2>&1 ; pwd -P )" # The path to this script
-if [ "$1" != "m4" -a "$1" != "gmp" -a "$1" != "mpfr" -a "$1" != "mpc" -a "$1" != "cmake" -a "$1" != "gcc" -a "$1" != "llvm-clang" -a "$1" != "python-packages" -a "$1" != "cm" -a "$1" != "git-lfs" ]; then
+if [ "$1" != "m4" -a "$1" != "gmp" -a "$1" != "mpfr" -a "$1" != "mpc" -a "$1" != "cmake" -a "$1" != "gcc" -a "$1" != "llvm-clang" -a "$1" != "python-packages" -a "$1" != "cm" -a "$1" != "git-lfs" -a "$1" != "ninja" -a "$1" != "re2c" -a "$1" != "oneapi-esimd" -a "$1" != "oneapi-support" ]; then
     show_usage
     if [ $0 == $BASH_SOURCE ]; then
         # The script is directly run
@@ -159,6 +159,53 @@ function install_git_lfs {
     cd ..
 }

+function install_ninja {
+    git clone https://github.com/ninja-build/ninja.git
+    cd ninja
+    echo "If you have problems running configure.py, try replacing its first line (#!/usr/bin/env python) with #!/usr/bin/env python3"
+    ./configure.py --bootstrap
+    cd ..
+    cp -rf ninja $T2S_PATH/install
+}
+
+function install_re2c {
+    wget https://github.com/skvadrik/re2c/releases/download/3.0/re2c-3.0.tar.xz
+    tar -xvf re2c-3.0.tar.xz
+    rm re2c-3.0.tar.xz
+    cd re2c-3.0
+    autoreconf -i -W all
+    ./configure
+    make
+    make install
+    cd ..
+    cp -rf re2c-3.0 $T2S_PATH/install
+}
+
+# Build the DPC++ compiler with the ESIMD extension from the sycl branch of intel/llvm
+function install_oneapi_esimd_extension {
+    export DPCPP_HOME=$T2S_PATH/downloads/sycl_workspace
+    mkdir $DPCPP_HOME
+    cd $DPCPP_HOME
+    git clone https://github.com/intel/llvm -b sycl
+    python $DPCPP_HOME/llvm/buildbot/configure.py
+    cd ..
+    cp -rf sycl_workspace/ $T2S_PATH/install/
+    export DPCPP_HOME=$T2S_PATH/install/sycl_workspace
+    python $DPCPP_HOME/llvm/buildbot/configure.py
+    python $DPCPP_HOME/llvm/buildbot/compile.py
+    git clone https://github.com/intel/llvm-test-suite.git
+    cp -rf llvm-test-suite/ $T2S_PATH/install/
+
+}
+
+# Fetch the pipe_array helper headers used by the generated OneAPI FPGA code
+function install_oneapi_support {
+    wget https://oneapi.team/tattle/oneAPI-samples/-/raw/9d8b94a38f2a98042cf933adfb91ec1da3d5ad51/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/src/pipe_array.hpp?inline=false
+    wget https://oneapi.team/tattle/oneAPI-samples/-/raw/9d8b94a38f2a98042cf933adfb91ec1da3d5ad51/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/src/pipe_array_internal.hpp?inline=false
+    mv pipe_array.hpp?inline=false pipe_array.hpp
+    mv pipe_array_internal.hpp?inline=false pipe_array_internal.hpp
+    mv pipe_array.hpp $T2S_PATH/t2s/src/oneapi-src
+    mv pipe_array_internal.hpp $T2S_PATH/t2s/src/oneapi-src
+}
+
 # Below we install newer version of gcc and llvm-clang and their dependencies
 mkdir -p $T2S_PATH/install $T2S_PATH/install/bin
 export PATH=$T2S_PATH/install/bin:$PATH
@@ -166,7 +213,6 @@ export PATH=$T2S_PATH/install/bin:$PATH
 cd $T2S_PATH
 mkdir -p downloads
 cd downloads
-
 if [ "$component" == "m4" ]; then
     install_m4 "1.4.18"
 fi
@@ -180,10 +226,10 @@ if [ "$component" == "mpc" ]; then
     install_mpc "1.2.1"
 fi
 if [ "$component" == "cmake" ]; then
-    install_cmake "3.11" "3.11.1"
+    install_cmake "3.15" "3.15.7"
 fi
 if [ "$component" == "gcc" ]; then
-    install_gcc "7.5.0"
+    install_gcc "8.4.0"
 fi
 if [ "$component" == "llvm-clang" ]; then
     install_llvm_clang "90" "9.0" "7.5.0"
@@ -195,8 +241,20 @@ if [ "$component" == "cm" ]; then
     # install_cm_20211028
     install_cm_20200119
 fi
+if [ "$component" == "ninja" ]; then
+    install_ninja
+fi
+if [ "$component" == "re2c" ]; then
+    install_re2c
+fi
 if [ "$component" == "git-lfs" ]; then
     install_git_lfs 3.1.4
 fi
+if [ "$component" == "oneapi-esimd" ]; then
+    install_oneapi_esimd_extension
+fi
+if [ "$component" == "oneapi-support" ]; then
+    install_oneapi_support
+fi
 cd ..
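Stepping back from the install scripts: the Pipeline.cpp and Module.cpp changes earlier in this patch make `compile_to_oneapi()` dispatch on the target. Below is a minimal sketch of how that dispatch is expected to be driven from user code. The `Func`/`Var` setup is illustrative only, and a `Func::compile_to_oneapi()` wrapper that forwards to `Pipeline::compile_to_oneapi()` is assumed (only the `Pipeline` method appears in this patch); the dispatch itself, `Target::IntelGPU`/`Target::IntelFPGA`, and the output extensions come from the diff.

```
// Sketch only, under the assumptions stated above.
#include "Halide.h"
using namespace Halide;

int main() {
    Var x("x"), y("y");
    Func f("f");
    f(x, y) = x + y;

    // Intel GPU: Output::oneapi_gpu. The bare function name is passed to
    // single_output(), and the device source (the "_genx.cpp" output
    // registered in get_output_info()) is produced through the
    // CodeGen_GPU_Host / compile_to_devsrc machinery.
    Target gpu = get_host_target().with_feature(Target::IntelGPU);
    f.compile_to_oneapi({}, "f", gpu);

    // Intel FPGA: Output::oneapi_fpga. The ".sycl.h" extension is appended
    // up front, and the DPC++ source produced by
    // CodeGen_OneAPI_Dev::compile_oneapi() is written to f.sycl.h.
    Target fpga = get_host_target().with_feature(Target::IntelFPGA);
    f.compile_to_oneapi({}, "f", fpga);
    return 0;
}
```

Note the asymmetry: the GPU branch leaves extension handling to the code generator downstream, while the FPGA branch builds the full file name before calling `Module::compile()`.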
diff --git a/install-tools.sh b/install-tools.sh index 59b82883..3e797345 100755 --- a/install-tools.sh +++ b/install-tools.sh @@ -8,3 +8,8 @@ ./install-tool.sh python-packages ./install-tool.sh cm ./install-tool.sh git-lfs +./install-tool.sh re2c +./install-tool.sh ninja +./install-tool.sh oneapi-esimd +./install-tool.sh oneapi-support + diff --git a/setenv.sh b/setenv.sh index b8fc25f2..e195a257 100755 --- a/setenv.sh +++ b/setenv.sh @@ -168,6 +168,9 @@ fi export PATH=$T2S_PATH/Halide/bin:$PATH export LD_LIBRARY_PATH=$T2S_PATH/Halide/bin:$LD_LIBRARY_PATH +# Add ESIMD extension clang++ +export DPCPP_HOME=$T2S_PATH/install/sycl_workspace + # Common options for compiling a specification export COMMON_OPTIONS_COMPILING_SPEC="-I $T2S_PATH/Halide/include -L $T2S_PATH/Halide/bin -lz -lpthread -ldl -std=c++11" diff --git a/t2s/preprocessor/sample/build_01.sh b/t2s/preprocessor/sample/build_01.sh deleted file mode 100755 index b45ba4f6..00000000 --- a/t2s/preprocessor/sample/build_01.sh +++ /dev/null @@ -1,34 +0,0 @@ - -# Change to clang include directory -CWD="$PWD" -cd ${PWD}/sample_01/ -# echo "CWD: $PWD" - -echo "Building T2S..."; -echo "CMD: g++ post.t2s.test.t2s.cpp -I ${T2S_PATH}/t2s/src/ -I ${T2S_PATH}/t2s/tests/correctness/util -I ${T2S_PATH}/Halide/include -L ${T2S_PATH}/Halide/bin -lz -lpthread -ldl -std=c++11 -lHalide -DTINY -DFPGA_EMULATOR -DFPGA;"; -g++ post.t2s.test.t2s.cpp \ - -I ${T2S_PATH}/t2s/src/ \ - -I ${T2S_PATH}/t2s/tests/correctness/util \ - -I ${T2S_PATH}/Halide/include \ - -L ${T2S_PATH}/Halide/bin \ - -lz -lpthread -ldl -std=c++11 -lHalide -DTINY -DFPGA_EMULATOR -DFPGA; - -echo "Executing T2S..."; -./a.out; - -echo "Building OneAPI..."; -echo "CMD: dpcpp post.run.test.t2s.cpp -I ${T2S_PATH}/Halide/include -L ${T2S_PATH}/Halide/bin -lHalide -lz -lpthread -ldl -fintelfpga -fsycl -fsycl-device-code-split=off -DTINY -DFPGA_EMULATOR -DFPGA -o ./test.fpga_emu;"; -dpcpp post.run.test.t2s.cpp \ - -I ${T2S_PATH}/Halide/include \ - -L ${T2S_PATH}/Halide/bin \ - -lHalide -lz -lpthread -ldl \ - -fintelfpga -fsycl -fsycl-device-code-split=off \ - -DTINY -DFPGA_EMULATOR -DFPGA \ - -o ./test.fpga_emu; - -echo "Executing final binary..."; -./test.fpga_emu; - -# Return back to the directory -# cd ${CWD} -echo "CWD: $PWD" diff --git a/t2s/preprocessor/sample/sample_01/HalideBuffer.h b/t2s/preprocessor/sample/sample_01/HalideBuffer.h deleted file mode 100644 index dfc64e10..00000000 --- a/t2s/preprocessor/sample/sample_01/HalideBuffer.h +++ /dev/null @@ -1,2397 +0,0 @@ -/** \file - * Defines a Buffer type that wraps from buffer_t and adds - * functionality, and methods for more conveniently iterating over the - * samples in a buffer_t outside of Halide code. 
*/ - -#ifndef HALIDE_RUNTIME_BUFFER_H -#define HALIDE_RUNTIME_BUFFER_H - -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) -#include -#endif -#endif - -#include "HalideRuntime.h" - -#ifdef _MSC_VER -#define HALIDE_ALLOCA _alloca -#else -#define HALIDE_ALLOCA __builtin_alloca -#endif - -// gcc 5.1 has a false positive warning on this code -#if __GNUC__ == 5 && __GNUC_MINOR__ == 1 -#pragma GCC diagnostic ignored "-Warray-bounds" -#endif - -namespace Halide { -namespace Runtime { - -// Forward-declare our Buffer class -template class Buffer; - -// A helper to check if a parameter pack is entirely implicitly -// int-convertible to use with std::enable_if -template -struct AllInts : std::false_type {}; - -template<> -struct AllInts<> : std::true_type {}; - -template -struct AllInts { - static const bool value = std::is_convertible::value && AllInts::value; -}; - -// Floats and doubles are technically implicitly int-convertible, but -// doing so produces a warning we treat as an error, so just disallow -// it here. -template -struct AllInts : std::false_type {}; - -template -struct AllInts : std::false_type {}; - -// A helper to detect if there are any zeros in a container -namespace Internal { -template -bool any_zero(const Container &c) { - for (int i : c) { - if (i == 0) return true; - } - return false; -} -} - -/** A struct acting as a header for allocations owned by the Buffer - * class itself. */ -struct AllocationHeader { - void (*deallocate_fn)(void *); - std::atomic ref_count; - - // Note that ref_count always starts at 1 - AllocationHeader(void (*deallocate_fn)(void *)) : deallocate_fn(deallocate_fn), ref_count(1) {} -}; - -/** This indicates how to deallocate the device for a Halide::Runtime::Buffer. */ -enum struct BufferDeviceOwnership : int { - Allocated, ///> halide_device_free will be called when device ref count goes to zero - WrappedNative, ///> halide_device_detach_native will be called when device ref count goes to zero - Unmanaged, ///> No free routine will be called when device ref count goes to zero - AllocatedDeviceAndHost, ///> Call device_and_host_free when DevRefCount goes to zero. - Cropped, ///> Call halide_device_release_crop when DevRefCount goes to zero. -}; - -/** A similar struct for managing device allocations. */ -struct DeviceRefCount { - // This is only ever constructed when there's something to manage, - // so start at one. - std::atomic count {1}; - BufferDeviceOwnership ownership{BufferDeviceOwnership::Allocated}; -}; - -/** A templated Buffer class that wraps halide_buffer_t and adds - * functionality. When using Halide from C++, this is the preferred - * way to create input and output buffers. The overhead of using this - * class relative to a naked halide_buffer_t is minimal - it uses another - * ~16 bytes on the stack, and does no dynamic allocations when using - * it to represent existing memory of a known maximum dimensionality. - * - * The template parameter T is the element type. For buffers where the - * element type is unknown, or may vary, use void or const void. - * - * D is the maximum number of dimensions that can be represented using - * space inside the class itself. Set it to the maximum dimensionality - * you expect this buffer to be. If the actual dimensionality exceeds - * this, heap storage is allocated to track the shape of the buffer. D - * defaults to 4, which should cover nearly all usage. 
- * - * The class optionally allocates and owns memory for the image using - * a shared pointer allocated with the provided allocator. If they are - * null, malloc and free are used. Any device-side allocation is - * considered as owned if and only if the host-side allocation is - * owned. */ -template -class Buffer { - /** The underlying buffer_t */ - halide_buffer_t buf = {0}; - - /** Some in-class storage for shape of the dimensions. */ - halide_dimension_t shape[D]; - - /** The allocation owned by this Buffer. NULL if the Buffer does not - * own the memory. */ - AllocationHeader *alloc = nullptr; - - /** A reference count for the device allocation owned by this - * buffer. */ - mutable DeviceRefCount *dev_ref_count = nullptr; - - /** True if T is of type void or const void */ - static const bool T_is_void = std::is_same::type, void>::value; - - /** A type function that adds a const qualifier if T is a const type. */ - template - using add_const_if_T_is_const = typename std::conditional::value, const T2, T2>::type; - - /** T unless T is (const) void, in which case (const) - * uint8_t. Useful for providing return types for operator() */ - using not_void_T = typename std::conditional, - T>::type; - - /** T with constness removed. Useful for return type of copy(). */ - using not_const_T = typename std::remove_const::type; - - - /** The type the elements are stored as. Equal to not_void_T - * unless T is a pointer, in which case uint64_t. Halide stores - * all pointer types as uint64s internally, even on 32-bit - * systems. */ - using storage_T = typename std::conditional::value, uint64_t, not_void_T>::type; - -public: - /** True if the Halide type is not void (or const void). */ - static constexpr bool has_static_halide_type = !T_is_void; - - /** Get the Halide type of T. Callers should not use the result if - * has_static_halide_type is false. */ - static halide_type_t static_halide_type() { - return halide_type_of::type>(); - } - - /** Does this Buffer own the host memory it refers to? */ - bool owns_host_memory() const { - return alloc != nullptr; - } - -private: - /** Increment the reference count of any owned allocation */ - void incref() const { - if (owns_host_memory()) { - alloc->ref_count++; - } - if (buf.device) { - if (!dev_ref_count) { - // I seem to have a non-zero dev field but no - // reference count for it. I must have been given a - // device allocation by a Halide pipeline, and have - // never been copied from since. Take sole ownership - // of it. - dev_ref_count = new DeviceRefCount; - } - dev_ref_count->count++; - } - } - - // Note that this is called "cropped" but can also encompass a slice/embed - // operation as well. - struct DevRefCountCropped : DeviceRefCount { - Buffer cropped_from; - DevRefCountCropped(const Buffer &cropped_from) : cropped_from(cropped_from) { - ownership = BufferDeviceOwnership::Cropped; - } - }; - - /** Setup the device ref count for a buffer to indicate it is a crop (or slice, embed, etc) of cropped_from */ - void crop_from(const Buffer &cropped_from) { - assert(dev_ref_count == nullptr); - dev_ref_count = new DevRefCountCropped(cropped_from); - } - - /** Decrement the reference count of any owned allocation and free host - * and device memory if it hits zero. Sets alloc to nullptr. 
*/ - void decref() { - if (owns_host_memory()) { - int new_count = --(alloc->ref_count); - if (new_count == 0) { - void (*fn)(void *) = alloc->deallocate_fn; - alloc->~AllocationHeader(); - fn(alloc); - } - buf.host = nullptr; - alloc = nullptr; - set_host_dirty(false); - } - decref_dev(); - } - - void decref_dev() { - int new_count = 0; - if (dev_ref_count) { - new_count = --(dev_ref_count->count); - } - if (new_count == 0) { - if (buf.device) { - assert(!(alloc && device_dirty()) && - "Implicitly freeing a dirty device allocation while a host allocation still lives. " - "Call device_free explicitly if you want to drop dirty device-side data. " - "Call copy_to_host explicitly if you want the data copied to the host allocation " - "before the device allocation is freed."); - if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::WrappedNative) { - buf.device_interface->detach_native(nullptr, &buf); - } else if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::AllocatedDeviceAndHost) { - buf.device_interface->device_and_host_free(nullptr, &buf); - } else if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) { - buf.device_interface->device_release_crop(nullptr, &buf); - } else if (dev_ref_count == nullptr || dev_ref_count->ownership == BufferDeviceOwnership::Allocated) { - buf.device_interface->device_free(nullptr, &buf); - } - } - if (dev_ref_count) { - if (dev_ref_count->ownership == BufferDeviceOwnership::Cropped) { - delete (DevRefCountCropped *)dev_ref_count; - } else { - delete dev_ref_count; - } - } - } - buf.device = 0; - buf.device_interface = nullptr; - dev_ref_count = nullptr; - } - - void free_shape_storage() { - if (buf.dim != shape) { - delete[] buf.dim; - buf.dim = nullptr; - } - } - - void make_shape_storage(const int dimensions) { - // This should usually be inlined, so if dimensions is statically known, - // we can skip the call to new - buf.dimensions = dimensions; - buf.dim = (dimensions <= D) ? shape : new halide_dimension_t[dimensions]; - } - - void copy_shape_from(const halide_buffer_t &other) { - // All callers of this ensure that buf.dimensions == other.dimensions. - make_shape_storage(other.dimensions); - std::copy(other.dim, other.dim + other.dimensions, buf.dim); - } - - template - void move_shape_from(Buffer &&other) { - if (other.shape == other.buf.dim) { - copy_shape_from(other.buf); - } else { - buf.dim = other.buf.dim; - other.buf.dim = nullptr; - } - } - - /** Initialize the shape from a halide_buffer_t. 
*/ - void initialize_from_buffer(const halide_buffer_t &b, - BufferDeviceOwnership ownership) { - memcpy(&buf, &b, sizeof(halide_buffer_t)); - copy_shape_from(b); - if (b.device) { - dev_ref_count = new DeviceRefCount; - dev_ref_count->ownership = ownership; - } - } - - /** Initialize the shape from an array of ints */ - void initialize_shape(const int *sizes) { - for (int i = 0; i < buf.dimensions; i++) { - buf.dim[i].min = 0; - buf.dim[i].extent = sizes[i]; - if (i == 0) { - buf.dim[i].stride = 1; - } else { - buf.dim[i].stride = buf.dim[i-1].stride * buf.dim[i-1].extent; - } - } - } - - /** Initialize the shape from a vector of extents */ - void initialize_shape(const std::vector &sizes) { - assert(buf.dimensions == (int)sizes.size()); - initialize_shape(sizes.data()); - } - - /** Initialize the shape from the static shape of an array */ - template - void initialize_shape_from_array_shape(int next, Array (&vals)[N]) { - buf.dim[next].min = 0; - buf.dim[next].extent = (int)N; - if (next == 0) { - buf.dim[next].stride = 1; - } else { - initialize_shape_from_array_shape(next - 1, vals[0]); - buf.dim[next].stride = buf.dim[next - 1].stride * buf.dim[next - 1].extent; - } - } - - /** Base case for the template recursion above. */ - template - void initialize_shape_from_array_shape(int, const T2 &) { - } - - /** Get the dimensionality of a multi-dimensional C array */ - template - static int dimensionality_of_array(Array (&vals)[N]) { - return dimensionality_of_array(vals[0]) + 1; - } - - template - static int dimensionality_of_array(const T2 &) { - return 0; - } - - /** Get the underlying halide_type_t of an array's element type. */ - template - static halide_type_t scalar_type_of_array(Array (&vals)[N]) { - return scalar_type_of_array(vals[0]); - } - - template - static halide_type_t scalar_type_of_array(const T2 &) { - return halide_type_of::type>(); - } - - /** Crop a single dimension without handling device allocation. */ - void crop_host(int d, int min, int extent) { - assert(dim(d).min() <= min); - assert(dim(d).max() >= min + extent - 1); - int shift = min - dim(d).min(); - if (buf.host != nullptr) { - buf.host += shift * dim(d).stride() * type().bytes(); - } - buf.dim[d].min = min; - buf.dim[d].extent = extent; - } - - /** Crop as many dimensions as are in rect, without handling device allocation. */ - void crop_host(const std::vector> &rect) { - assert(rect.size() <= static_cast(std::numeric_limits::max())); - int limit = (int)rect.size(); - assert(limit <= dimensions()); - for (int i = 0; i < limit; i++) { - crop_host(i, rect[i].first, rect[i].second); - } - } - - void complete_device_crop(Buffer &result_host_cropped) const { - assert(buf.device_interface != nullptr); - if (buf.device_interface->device_crop(nullptr, &this->buf, &result_host_cropped.buf) == 0) { - const Buffer *cropped_from = this; - // TODO: Figure out what to do if dev_ref_count is nullptr. Should incref logic run here? - // is it possible to get to this point without incref having run at least once since - // the device field was set? (I.e. in the internal logic of crop. incref might have been - // called.) - if (dev_ref_count != nullptr && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) { - cropped_from = &((DevRefCountCropped *)dev_ref_count)->cropped_from; - } - result_host_cropped.crop_from(*cropped_from); - } - } - - /** slice a single dimension without handling device allocation. 
*/ - void slice_host(int d, int pos) { - assert(d >= 0 && d < dimensions()); - assert(pos >= dim(d).min() && pos <= dim(d).max()); - buf.dimensions--; - int shift = pos - buf.dim[d].min; - if (buf.host != nullptr) { - buf.host += shift * buf.dim[d].stride * type().bytes(); - } - for (int i = d; i < buf.dimensions; i++) { - buf.dim[i] = buf.dim[i+1]; - } - buf.dim[buf.dimensions] = {0, 0, 0}; - } - - void complete_device_slice(Buffer &result_host_sliced, int d, int pos) const { - assert(buf.device_interface != nullptr); - if (buf.device_interface->device_slice(nullptr, &this->buf, d, pos, &result_host_sliced.buf) == 0) { - const Buffer *sliced_from = this; - // TODO: Figure out what to do if dev_ref_count is nullptr. Should incref logic run here? - // is it possible to get to this point without incref having run at least once since - // the device field was set? (I.e. in the internal logic of slice. incref might have been - // called.) - if (dev_ref_count != nullptr && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) { - sliced_from = &((DevRefCountCropped *)dev_ref_count)->cropped_from; - } - // crop_from() is correct here, despite the fact that we are slicing. - result_host_sliced.crop_from(*sliced_from); - } - } - - void init_from_legacy_buffer_t(const buffer_t &old_buf, halide_type_t t) { - if (!T_is_void) { - assert(static_halide_type() == t); - } - assert(old_buf.elem_size == t.bytes()); - buf.host = old_buf.host; - buf.type = t; - int d; - for (d = 0; d < 4 && old_buf.extent[d]; d++); - make_shape_storage(d); - for (int i = 0; i < d; i++) { - buf.dim[i].min = old_buf.min[i]; - buf.dim[i].extent = old_buf.extent[i]; - buf.dim[i].stride = old_buf.stride[i]; - } - buf.set_host_dirty(old_buf.host_dirty); - assert(old_buf.dev == 0 && "Cannot construct a Halide::Runtime::Buffer from a legacy buffer_t with a device allocation. Use halide_upgrade_buffer_t to upgrade it to a halide_buffer_t first."); - } - -public: - - typedef T ElemType; - - /** Read-only access to the shape */ - class Dimension { - const halide_dimension_t &d; - public: - /** The lowest coordinate in this dimension */ - HALIDE_ALWAYS_INLINE int min() const { - return d.min; - } - - /** The number of elements in memory you have to step over to - * increment this coordinate by one. */ - HALIDE_ALWAYS_INLINE int stride() const { - return d.stride; - } - - /** The extent of the image along this dimension */ - HALIDE_ALWAYS_INLINE int extent() const { - return d.extent; - } - - /** The highest coordinate in this dimension */ - HALIDE_ALWAYS_INLINE int max() const { - return min() + extent() - 1; - } - - /** An iterator class, so that you can iterate over - * coordinates in a dimensions using a range-based for loop. */ - struct iterator { - int val; - int operator*() const {return val;} - bool operator!=(const iterator &other) const {return val != other.val;} - iterator &operator++() {val++; return *this;} - }; - - /** An iterator that points to the min coordinate */ - HALIDE_ALWAYS_INLINE iterator begin() const { - return {min()}; - } - - /** An iterator that points to one past the max coordinate */ - HALIDE_ALWAYS_INLINE iterator end() const { - return {min() + extent()}; - } - - Dimension(const halide_dimension_t &dim) : d(dim) {}; - }; - - /** Access the shape of the buffer */ - HALIDE_ALWAYS_INLINE Dimension dim(int i) const { - assert(i >= 0 && i < this->dimensions()); - return Dimension(buf.dim[i]); - } - - /** Access to the mins, strides, extents. Will be deprecated. Do not use. 
*/ - // @{ - int min(int i) const { return dim(i).min(); } - int extent(int i) const { return dim(i).extent(); } - int stride(int i) const { return dim(i).stride(); } - // @} - - /** The total number of elements this buffer represents. Equal to - * the product of the extents */ - size_t number_of_elements() const { - size_t s = 1; - for (int i = 0; i < dimensions(); i++) { - s *= dim(i).extent(); - } - return s; - } - - /** Get the dimensionality of the buffer. */ - int dimensions() const { - return buf.dimensions; - } - - /** Get the type of the elements. */ - halide_type_t type() const { - return buf.type; - } - -private: - /** Offset to the element with the lowest address. If all - * strides are positive, equal to zero. Offset is in elements, not bytes. */ - ptrdiff_t begin_offset() const { - ptrdiff_t index = 0; - for (int i = 0; i < dimensions(); i++) { - if (dim(i).stride() < 0) { - index += dim(i).stride() * (dim(i).extent() - 1); - } - } - return index; - } - - /** An offset to one beyond the element with the highest address. - * Offset is in elements, not bytes. */ - ptrdiff_t end_offset() const { - ptrdiff_t index = 0; - for (int i = 0; i < dimensions(); i++) { - if (dim(i).stride() > 0) { - index += dim(i).stride() * (dim(i).extent() - 1); - } - } - index += 1; - return index; - } - -public: - /** A pointer to the element with the lowest address. If all - * strides are positive, equal to the host pointer. */ - T *begin() const { - assert(buf.host != nullptr); // Cannot call begin() on an unallocated Buffer. - return (T *)(buf.host + begin_offset() * type().bytes()); - } - - /** A pointer to one beyond the element with the highest address. */ - T *end() const { - assert(buf.host != nullptr); // Cannot call end() on an unallocated Buffer. - return (T *)(buf.host + end_offset() * type().bytes()); - } - - /** The total number of bytes spanned by the data in memory. */ - size_t size_in_bytes() const { - return (size_t)(end_offset() - begin_offset()) * type().bytes(); - } - - /** Reset the Buffer to be equivalent to a default-constructed Buffer - * of the same static type (if any); Buffer will have its runtime - * type reset to uint8. */ - void reset() { - *this = Buffer(); - } - - Buffer() : shape() { - buf.type = static_halide_type(); - make_shape_storage(0); - } - - /** Make a Buffer from a halide_buffer_t */ - explicit Buffer(const halide_buffer_t &buf, - BufferDeviceOwnership ownership = BufferDeviceOwnership::Unmanaged) { - assert(T_is_void || buf.type == static_halide_type()); - initialize_from_buffer(buf, ownership); - } - - /** Make a Buffer from a legacy buffer_t, with an explicit halide_type. */ - explicit Buffer(const buffer_t &old_buf, halide_type_t t) { - init_from_legacy_buffer_t(old_buf, t); - } - - /** Make a Buffer from a legacy buffer_t, which is assumed to match our static - * type. (Cannot use with Buffer.) */ - explicit Buffer(const buffer_t &old_buf) { - static_assert(!T_is_void, "Cannot construct a Buffer from a buffer_t without an explicit type."); - init_from_legacy_buffer_t(old_buf, static_halide_type()); - } - - /** Populate the fields of a legacy buffer_t using this - * Buffer. Does not copy device metadata. */ - buffer_t make_legacy_buffer_t() const { - buffer_t old_buf = {0}; - assert(!has_device_allocation() && "Cannot construct a legacy buffer_t from a Halide::Runtime::Buffer with a device allocation. 
Use halide_downgrade_buffer_t instead."); - old_buf.host = buf.host; - old_buf.elem_size = buf.type.bytes(); - assert(dimensions() <= 4 && "Cannot construct a legacy buffer_t from a Halide::Runtime::Buffer with more than four dimensions."); - for (int i = 0; i < dimensions(); i++) { - old_buf.min[i] = dim(i).min(); - old_buf.extent[i] = dim(i).extent(); - old_buf.stride[i] = dim(i).stride(); - } - return old_buf; - } - - /** Give Buffers access to the members of Buffers of different dimensionalities and types. */ - template friend class Buffer; - -private: - template - static void static_assert_can_convert_from() { - static_assert((!std::is_const::value || std::is_const::value), - "Can't convert from a Buffer to a Buffer"); - static_assert(std::is_same::type, - typename std::remove_const::type>::value || - T_is_void || Buffer::T_is_void, - "type mismatch constructing Buffer"); - } - -public: - /** Determine if if an Buffer can be constructed from some other Buffer type. - * If this can be determined at compile time, fail with a static assert; otherwise - * return a boolean based on runtime typing. */ - template - static bool can_convert_from(const Buffer &other) { - static_assert_can_convert_from(); - if (Buffer::T_is_void && !T_is_void) { - return other.type() == static_halide_type(); - } - return true; - } - - /** Fail an assertion at runtime or compile-time if an Buffer - * cannot be constructed from some other Buffer type. */ - template - static void assert_can_convert_from(const Buffer &other) { - // Explicitly call static_assert_can_convert_from() here so - // that we always get compile-time checking, even if compiling with - // assertions disabled. - static_assert_can_convert_from(); - assert(can_convert_from(other)); - } - - /** Copy constructor. Does not copy underlying data. */ - Buffer(const Buffer &other) : buf(other.buf), - alloc(other.alloc) { - other.incref(); - dev_ref_count = other.dev_ref_count; - copy_shape_from(other.buf); - } - - /** Construct a Buffer from a Buffer of different dimensionality - * and type. Asserts that the type matches (at runtime, if one of - * the types is void). Note that this constructor is - * implicit. This, for example, lets you pass things like - * Buffer or Buffer to functions expected - * Buffer. */ - template - Buffer(const Buffer &other) : buf(other.buf), - alloc(other.alloc) { - assert_can_convert_from(other); - other.incref(); - dev_ref_count = other.dev_ref_count; - copy_shape_from(other.buf); - } - - /** Move constructor */ - Buffer(Buffer &&other) : buf(other.buf), - alloc(other.alloc), - dev_ref_count(other.dev_ref_count) { - other.dev_ref_count = nullptr; - other.alloc = nullptr; - move_shape_from(std::forward>(other)); - other.buf = halide_buffer_t(); - } - - /** Move-construct a Buffer from a Buffer of different - * dimensionality and type. Asserts that the types match (at - * runtime if one of the types is void). */ - template - Buffer(Buffer &&other) : buf(other.buf), - alloc(other.alloc), - dev_ref_count(other.dev_ref_count) { - assert_can_convert_from(other); - other.dev_ref_count = nullptr; - other.alloc = nullptr; - move_shape_from(std::forward>(other)); - other.buf = halide_buffer_t(); - } - - /** Assign from another Buffer of possibly-different - * dimensionality and type. Asserts that the types match (at - * runtime if one of the types is void). 
*/ - template - Buffer &operator=(const Buffer &other) { - if ((const void *)this == (const void *)&other) { - return *this; - } - assert_can_convert_from(other); - other.incref(); - decref(); - dev_ref_count = other.dev_ref_count; - alloc = other.alloc; - free_shape_storage(); - buf = other.buf; - copy_shape_from(other.buf); - return *this; - } - - /** Standard assignment operator */ - Buffer &operator=(const Buffer &other) { - if (this == &other) { - return *this; - } - other.incref(); - decref(); - dev_ref_count = other.dev_ref_count; - alloc = other.alloc; - free_shape_storage(); - buf = other.buf; - copy_shape_from(other.buf); - return *this; - } - - /** Move from another Buffer of possibly-different - * dimensionality and type. Asserts that the types match (at - * runtime if one of the types is void). */ - template - Buffer &operator=(Buffer &&other) { - assert_can_convert_from(other); - decref(); - alloc = other.alloc; - other.alloc = nullptr; - dev_ref_count = other.dev_ref_count; - other.dev_ref_count = nullptr; - free_shape_storage(); - buf = other.buf; - move_shape_from(std::forward>(other)); - other.buf = halide_buffer_t(); - return *this; - } - - /** Standard move-assignment operator */ - Buffer &operator=(Buffer &&other) { - decref(); - alloc = other.alloc; - other.alloc = nullptr; - dev_ref_count = other.dev_ref_count; - other.dev_ref_count = nullptr; - free_shape_storage(); - buf = other.buf; - move_shape_from(std::forward>(other)); - other.buf = halide_buffer_t(); - return *this; - } - - /** Check the product of the extents fits in memory. */ - void check_overflow() { - size_t size = type().bytes(); - for (int i = 0; i < dimensions(); i++) { - size *= dim(i).extent(); - } - // We allow 2^31 or 2^63 bytes, so drop the top bit. - size = (size << 1) >> 1; - for (int i = 0; i < dimensions(); i++) { - size /= dim(i).extent(); - } - assert(size == (size_t)type().bytes() && "Error: Overflow computing total size of buffer."); - } - - /** Allocate memory for this Buffer. Drops the reference to any - * owned memory. */ - void allocate(void *(*allocate_fn)(size_t) = nullptr, - void (*deallocate_fn)(void *) = nullptr) { - if (!allocate_fn) { - allocate_fn = malloc; - } - if (!deallocate_fn) { - deallocate_fn = free; - } - - // Drop any existing allocation - deallocate(); - - // Conservatively align images to 128 bytes. This is enough - // alignment for all the platforms we might use. - size_t size = size_in_bytes(); - const size_t alignment = 128; - size = (size + alignment - 1) & ~(alignment - 1); - void *alloc_storage = allocate_fn(size + sizeof(AllocationHeader) + alignment - 1); - alloc = new (alloc_storage) AllocationHeader(deallocate_fn); - uint8_t *unaligned_ptr = ((uint8_t *)alloc) + sizeof(AllocationHeader); - buf.host = (uint8_t *)((uintptr_t)(unaligned_ptr + alignment - 1) & ~(alignment - 1)); - } - - /** Drop reference to any owned host or device memory, possibly - * freeing it, if this buffer held the last reference to - * it. Retains the shape of the buffer. Does nothing if this - * buffer did not allocate its own memory. */ - void deallocate() { - decref(); - } - - /** Drop reference to any owned device memory, possibly freeing it - * if this buffer held the last reference to it. Asserts that - * device_dirty is false. */ - void device_deallocate() { - decref_dev(); - } - - /** Allocate a new image of the given size with a runtime - * type. Only used when you do know what size you want but you - * don't know statically what type the elements are. 
Pass zeroes - * to make a buffer suitable for bounds query calls. */ - template::value>::type> - Buffer(halide_type_t t, int first, Args... rest) { - if (!T_is_void) { - assert(static_halide_type() == t); - } - int extents[] = {first, (int)rest...}; - buf.type = t; - constexpr int buf_dimensions = 1 + (int)(sizeof...(rest)); - make_shape_storage(buf_dimensions); - initialize_shape(extents); - if (!Internal::any_zero(extents)) { - check_overflow(); - allocate(); - } - } - - - /** Allocate a new image of the given size. Pass zeroes to make a - * buffer suitable for bounds query calls. */ - // @{ - - // The overload with one argument is 'explicit', so that - // (say) int is not implicitly convertable to Buffer - explicit Buffer(int first) { - static_assert(!T_is_void, - "To construct an Buffer, pass a halide_type_t as the first argument to the constructor"); - int extents[] = {first}; - buf.type = static_halide_type(); - constexpr int buf_dimensions = 1; - make_shape_storage(buf_dimensions); - initialize_shape(extents); - if (first != 0) { - check_overflow(); - allocate(); - } - } - - template::value>::type> - Buffer(int first, int second, Args... rest) { - static_assert(!T_is_void, - "To construct an Buffer, pass a halide_type_t as the first argument to the constructor"); - int extents[] = {first, second, (int)rest...}; - buf.type = static_halide_type(); - constexpr int buf_dimensions = 2 + (int)(sizeof...(rest)); - make_shape_storage(buf_dimensions); - initialize_shape(extents); - if (!Internal::any_zero(extents)) { - check_overflow(); - allocate(); - } - } - // @} - - /** Allocate a new image of unknown type using a vector of ints as the size. */ - Buffer(halide_type_t t, const std::vector &sizes) { - if (!T_is_void) { - assert(static_halide_type() == t); - } - buf.type = t; - make_shape_storage((int)sizes.size()); - initialize_shape(sizes); - if (!Internal::any_zero(sizes)) { - check_overflow(); - allocate(); - } - } - - /** Allocate a new image of known type using a vector of ints as the size. */ - explicit Buffer(const std::vector &sizes) : Buffer(static_halide_type(), sizes) {} - -private: - // Create a copy of the sizes vector, ordered as specified by order. - static std::vector make_ordered_sizes(const std::vector &sizes, const std::vector &order) { - assert(order.size() == sizes.size()); - std::vector ordered_sizes(sizes.size()); - for (size_t i = 0; i < sizes.size(); ++i) { - ordered_sizes[i] = sizes.at(order[i]); - } - return ordered_sizes; - } - -public: - /** Allocate a new image of unknown type using a vector of ints as the size and - * a vector of indices indicating the storage order for each dimension. The - * length of the sizes vector and the storage-order vector must match. For instance, - * to allocate an interleaved RGB buffer, you would pass {2, 0, 1} for storage_order. */ - Buffer(halide_type_t t, const std::vector &sizes, const std::vector &storage_order) - : Buffer(t, make_ordered_sizes(sizes, storage_order)) { - transpose(storage_order); - } - - Buffer(const std::vector &sizes, const std::vector &storage_order) - : Buffer(static_halide_type(), sizes, storage_order) {} - - /** Make an Buffer that refers to a statically sized array. Does not - * take ownership of the data, and does not set the host_dirty flag. 
*/ - template - explicit Buffer(Array (&vals)[N]) { - const int buf_dimensions = dimensionality_of_array(vals); - buf.type = scalar_type_of_array(vals); - buf.host = (uint8_t *)vals; - make_shape_storage(buf_dimensions); - initialize_shape_from_array_shape(buf.dimensions - 1, vals); - } - - /** Initialize an Buffer of runtime type from a pointer and some - * sizes. Assumes dense row-major packing and a min coordinate of - * zero. Does not take ownership of the data and does not set the - * host_dirty flag. */ - template::value>::type> - explicit Buffer(halide_type_t t, add_const_if_T_is_const *data, int first, Args&&... rest) { - if (!T_is_void) { - assert(static_halide_type() == t); - } - int extents[] = {first, (int)rest...}; - buf.type = t; - constexpr int buf_dimensions = 1 + (int)(sizeof...(rest)); - buf.host = (uint8_t *) const_cast(data); - make_shape_storage(buf_dimensions); - initialize_shape(extents); - } - - /** Initialize an Buffer from a pointer and some sizes. Assumes - * dense row-major packing and a min coordinate of zero. Does not - * take ownership of the data and does not set the host_dirty flag. */ - template::value>::type> - explicit Buffer(T *data, int first, Args&&... rest) { - int extents[] = {first, (int)rest...}; - buf.type = static_halide_type(); - constexpr int buf_dimensions = 1 + (int)(sizeof...(rest)); - buf.host = (uint8_t *) const_cast::type *>(data); - make_shape_storage(buf_dimensions); - initialize_shape(extents); - } - - /** Initialize an Buffer from a pointer and a vector of - * sizes. Assumes dense row-major packing and a min coordinate of - * zero. Does not take ownership of the data and does not set the - * host_dirty flag. */ - explicit Buffer(T *data, const std::vector &sizes) { - buf.type = static_halide_type(); - buf.host = (uint8_t *) const_cast::type *>(data); - make_shape_storage((int)sizes.size()); - initialize_shape(sizes); - } - - /** Initialize an Buffer of runtime type from a pointer and a - * vector of sizes. Assumes dense row-major packing and a min - * coordinate of zero. Does not take ownership of the data and - * does not set the host_dirty flag. */ - explicit Buffer(halide_type_t t, add_const_if_T_is_const *data, const std::vector &sizes) { - if (!T_is_void) { - assert(static_halide_type() == t); - } - buf.type = t; - buf.host = (uint8_t *) const_cast(data); - make_shape_storage((int)sizes.size()); - initialize_shape(sizes); - } - - /** Initialize an Buffer from a pointer to the min coordinate and - * an array describing the shape. Does not take ownership of the - * data, and does not set the host_dirty flag. */ - explicit Buffer(halide_type_t t, add_const_if_T_is_const *data, int d, const halide_dimension_t *shape) { - if (!T_is_void) { - assert(static_halide_type() == t); - } - buf.type = t; - buf.host = (uint8_t *) const_cast(data); - make_shape_storage(d); - for (int i = 0; i < d; i++) { - buf.dim[i] = shape[i]; - } - } - - /** Initialize a Buffer from a pointer to the min coordinate and - * a vector describing the shape. Does not take ownership of the - * data, and does not set the host_dirty flag. */ - explicit inline Buffer(halide_type_t t, add_const_if_T_is_const *data, - const std::vector &shape) - : Buffer(t, data, (int) shape.size(), shape.data()) {} - - /** Initialize an Buffer from a pointer to the min coordinate and - * an array describing the shape. Does not take ownership of the - * data and does not set the host_dirty flag. 
*/ - explicit Buffer(T *data, int d, const halide_dimension_t *shape) { - buf.type = static_halide_type(); - buf.host = (uint8_t *) const_cast::type *>(data); - make_shape_storage(d); - for (int i = 0; i < d; i++) { - buf.dim[i] = shape[i]; - } - } - - /** Initialize a Buffer from a pointer to the min coordinate and - * a vector describing the shape. Does not take ownership of the - * data, and does not set the host_dirty flag. */ - explicit inline Buffer(T *data, const std::vector &shape) - : Buffer(data, (int) shape.size(), shape.data()) {} - - /** Destructor. Will release any underlying owned allocation if - * this is the last reference to it. Will assert fail if there are - * weak references to this Buffer outstanding. */ - ~Buffer() { - free_shape_storage(); - decref(); - } - - /** Get a pointer to the raw halide_buffer_t this wraps. */ - // @{ - halide_buffer_t *raw_buffer() { - return &buf; - } - - const halide_buffer_t *raw_buffer() const { - return &buf; - } - // @} - - /** Provide a cast operator to halide_buffer_t *, so that - * instances can be passed directly to Halide filters. */ - operator halide_buffer_t *() { - return &buf; - } - - /** Return a typed reference to this Buffer. Useful for converting - * a reference to a Buffer to a reference to, for example, a - * Buffer, or converting a Buffer& to Buffer&. - * Does a runtime assert if the source buffer type is void. */ - template::type> - HALIDE_ALWAYS_INLINE - Buffer &as() & { - Buffer::assert_can_convert_from(*this); - return *((Buffer *)this); - } - - /** Return a const typed reference to this Buffer. Useful for - * converting a conference reference to one Buffer type to a const - * reference to another Buffer type. Does a runtime assert if the - * source buffer type is void. */ - template::type> - HALIDE_ALWAYS_INLINE - const Buffer &as() const & { - Buffer::assert_can_convert_from(*this); - return *((const Buffer *)this); - } - - /** Returns this rval Buffer with a different type attached. Does - * a dynamic type check if the source type is void. */ - template - HALIDE_ALWAYS_INLINE - Buffer as() && { - Buffer::assert_can_convert_from(*this); - return *((Buffer *)this); - } - - /** as_const() is syntactic sugar for .as(), to avoid the need - * to recapitulate the type argument. */ - // @{ - HALIDE_ALWAYS_INLINE - Buffer::type, D> &as_const() & { - // Note that we can skip the assert_can_convert_from(), since T -> const T - // conversion is always legal. - return *((Buffer::type> *)this); - } - - HALIDE_ALWAYS_INLINE - const Buffer::type, D> &as_const() const & { - return *((const Buffer::type> *)this); - } - - HALIDE_ALWAYS_INLINE - Buffer::type, D> as_const() && { - return *((Buffer::type> *)this); - } - // @} - - /** Conventional names for the first three dimensions. */ - // @{ - int width() const { - return (dimensions() > 0) ? dim(0).extent() : 1; - } - int height() const { - return (dimensions() > 1) ? dim(1).extent() : 1; - } - int channels() const { - return (dimensions() > 2) ? dim(2).extent() : 1; - } - // @} - - /** Conventional names for the min and max value of each dimension */ - // @{ - int left() const { - return dim(0).min(); - } - - int right() const { - return dim(0).max(); - } - - int top() const { - return dim(1).min(); - } - - int bottom() const { - return dim(1).max(); - } - // @} - - /** Make a new image which is a deep copy of this image. Use crop - * or slice followed by copy to make a copy of only a portion of - * the image. 
The new image uses the same memory layout as the - * original, with holes compacted away. Note that the returned - * Buffer is always of a non-const type T (ie: - * - * Buffer.copy() -> Buffer rather than Buffer - * - * which is always safe, since we are making a deep copy. (The caller - * can easily cast it back to Buffer if desired, which is - * always safe and free.) - */ - Buffer copy(void *(*allocate_fn)(size_t) = nullptr, - void (*deallocate_fn)(void *) = nullptr) const { - Buffer dst = Buffer::make_with_shape_of(*this, allocate_fn, deallocate_fn); - dst.copy_from(*this); - return dst; - } - - /** Like copy(), but the copy is created in interleaved memory layout - * (vs. keeping the same memory layout as the original). Requires that 'this' - * has exactly 3 dimensions. - */ - Buffer copy_to_interleaved(void *(*allocate_fn)(size_t) = nullptr, - void (*deallocate_fn)(void *) = nullptr) const { - assert(dimensions() == 3); - Buffer dst = Buffer::make_interleaved(nullptr, width(), height(), channels()); - dst.set_min(min(0), min(1), min(2)); - dst.allocate(allocate_fn, deallocate_fn); - dst.copy_from(*this); - return dst; - } - - /** Like copy(), but the copy is created in planar memory layout - * (vs. keeping the same memory layout as the original). - */ - Buffer copy_to_planar(void *(*allocate_fn)(size_t) = nullptr, - void (*deallocate_fn)(void *) = nullptr) const { - std::vector mins, extents; - const int dims = dimensions(); - mins.reserve(dims); - extents.reserve(dims); - for (int d = 0; d < dims; ++d) { - mins.push_back(dim(d).min()); - extents.push_back(dim(d).extent()); - } - Buffer dst = Buffer(nullptr, extents); - dst.set_min(mins); - dst.allocate(allocate_fn, deallocate_fn); - dst.copy_from(*this); - return dst; - } - - /** Make a copy of the Buffer which shares the underlying host and/or device - * allocations as the existing Buffer. This is purely syntactic sugar for - * cases where you have a const reference to a Buffer but need a temporary - * non-const copy (e.g. to make a call into AOT-generated Halide code), and want a terse - * inline way to create a temporary. \code - * void call_my_func(const Buffer& input) { - * my_func(input.alias(), output); - * }\endcode - */ - inline Buffer alias() const { - return *this; - } - - /** Fill a Buffer with the values at the same coordinates in - * another Buffer. Restricts itself to coordinates contained - * within the intersection of the two buffers. If the two Buffers - * are not in the same coordinate system, you will need to - * translate the argument Buffer first. E.g. if you're blitting a - * sprite onto a framebuffer, you'll want to translate the sprite - * to the correct location first like so: \code - * framebuffer.copy_from(sprite.translated({x, y})); \endcode - */ - template - void copy_from(const Buffer &other) { - static_assert(!std::is_const::value, "Cannot call copy_from() on a Buffer"); - assert(!device_dirty() && "Cannot call Halide::Runtime::Buffer::copy_from on a device dirty destination."); - assert(!other.device_dirty() && "Cannot call Halide::Runtime::Buffer::copy_from on a device dirty source."); - - Buffer src(other); - Buffer dst(*this); - - assert(src.dimensions() == dst.dimensions()); - - // Trim the copy to the region in common - for (int i = 0; i < dimensions(); i++) { - int min_coord = std::max(dst.dim(i).min(), src.dim(i).min()); - int max_coord = std::min(dst.dim(i).max(), src.dim(i).max()); - if (max_coord < min_coord) { - // The buffers do not overlap. 
- return; - } - dst.crop(i, min_coord, max_coord - min_coord + 1); - src.crop(i, min_coord, max_coord - min_coord + 1); - } - - // If T is void, we need to do runtime dispatch to an - // appropriately-typed lambda. We're copying, so we only care - // about the element size. (If not, this should optimize away - // into a static dispatch to the right-sized copy.) - if (T_is_void ? (type().bytes() == 1) : (sizeof(not_void_T) == 1)) { - using MemType = uint8_t; - auto &typed_dst = (Buffer &)dst; - auto &typed_src = (Buffer &)src; - typed_dst.for_each_value([&](MemType &dst, MemType src) {dst = src;}, typed_src); - } else if (T_is_void ? (type().bytes() == 2) : (sizeof(not_void_T) == 2)) { - using MemType = uint16_t; - auto &typed_dst = (Buffer &)dst; - auto &typed_src = (Buffer &)src; - typed_dst.for_each_value([&](MemType &dst, MemType src) {dst = src;}, typed_src); - } else if (T_is_void ? (type().bytes() == 4) : (sizeof(not_void_T) == 4)) { - using MemType = uint32_t; - auto &typed_dst = (Buffer &)dst; - auto &typed_src = (Buffer &)src; - typed_dst.for_each_value([&](MemType &dst, MemType src) {dst = src;}, typed_src); - } else if (T_is_void ? (type().bytes() == 8) : (sizeof(not_void_T) == 8)) { - using MemType = uint64_t; - auto &typed_dst = (Buffer &)dst; - auto &typed_src = (Buffer &)src; - typed_dst.for_each_value([&](MemType &dst, MemType src) {dst = src;}, typed_src); - } else { - assert(false && "type().bytes() must be 1, 2, 4, or 8"); - } - set_host_dirty(); - } - - /** Make an image that refers to a sub-range of this image along - * the given dimension. Asserts that the crop region is within - * the existing bounds: you cannot "crop outwards", even if you know there - * is valid Buffer storage (e.g. because you already cropped inwards). */ - Buffer cropped(int d, int min, int extent) const { - // Make a fresh copy of the underlying buffer (but not a fresh - // copy of the allocation, if there is one). - Buffer im = *this; - - // This guarantees the prexisting device ref is dropped if the - // device_crop call fails and maintains the buffer in a consistent - // state. - im.device_deallocate(); - - im.crop_host(d, min, extent); - if (buf.device_interface != nullptr) { - complete_device_crop(im); - } - return im; - } - - /** Crop an image in-place along the given dimension. This does - * not move any data around in memory - it just changes the min - * and extent of the given dimension. */ - void crop(int d, int min, int extent) { - // An optimization for non-device buffers. For the device case, - // a temp buffer is required, so reuse the not-in-place version. - // TODO(zalman|abadams): Are nop crops common enough to special - // case the device part of the if to do nothing? - if (buf.device_interface != nullptr) { - *this = cropped(d, min, extent); - } else { - crop_host(d, min, extent); - } - } - - /** Make an image that refers to a sub-rectangle of this image along - * the first N dimensions. Asserts that the crop region is within - * the existing bounds. The cropped image may drop any device handle - * if the device_interface cannot accomplish the crop in-place. */ - Buffer cropped(const std::vector> &rect) const { - // Make a fresh copy of the underlying buffer (but not a fresh - // copy of the allocation, if there is one). - Buffer im = *this; - - // This guarantees the prexisting device ref is dropped if the - // device_crop call fails and maintains the buffer in a consistent - // state. 
- im.device_deallocate(); - - im.crop_host(rect); - if (buf.device_interface != nullptr) { - complete_device_crop(im); - } - return im; - } - - /** Crop an image in-place along the first N dimensions. This does - * not move any data around in memory, nor does it free memory. It - * just rewrites the min/extent of each dimension to refer to a - * subregion of the same allocation. */ - void crop(const std::vector> &rect) { - // An optimization for non-device buffers. For the device case, - // a temp buffer is required, so reuse the not-in-place version. - // TODO(zalman|abadams): Are nop crops common enough to special - // case the device part of the if to do nothing? - if (buf.device_interface != nullptr) { - *this = cropped(rect); - } else { - crop_host(rect); - } - } - - /** Make an image which refers to the same data with using - * translated coordinates in the given dimension. Positive values - * move the image data to the right or down relative to the - * coordinate system. Drops any device handle. */ - Buffer translated(int d, int dx) const { - Buffer im = *this; - im.translate(d, dx); - return im; - } - - /** Translate an image in-place along one dimension by changing - * how it is indexed. Does not move any data around in memory. */ - void translate(int d, int delta) { - assert(d >= 0 && d < this->dimensions()); - device_deallocate(); - buf.dim[d].min += delta; - } - - /** Make an image which refers to the same data translated along - * the first N dimensions. */ - Buffer translated(const std::vector &delta) const { - Buffer im = *this; - im.translate(delta); - return im; - } - - /** Translate an image along the first N dimensions by changing - * how it is indexed. Does not move any data around in memory. */ - void translate(const std::vector &delta) { - device_deallocate(); - assert(delta.size() <= static_cast(std::numeric_limits::max())); - int limit = (int)delta.size(); - assert(limit <= dimensions()); - for (int i = 0; i < limit; i++) { - translate(i, delta[i]); - } - } - - /** Set the min coordinate of an image in the first N dimensions. */ - // @{ - void set_min(const std::vector &mins) { - assert(mins.size() <= static_cast(dimensions())); - device_deallocate(); - for (size_t i = 0; i < mins.size(); i++) { - buf.dim[i].min = mins[i]; - } - } - - template - void set_min(Args... args) { - set_min(std::vector{args...}); - } - // @} - - /** Test if a given coordinate is within the bounds of an image. */ - // @{ - bool contains(const std::vector &coords) const { - assert(coords.size() <= static_cast(dimensions())); - for (size_t i = 0; i < coords.size(); i++) { - if (coords[i] < dim((int) i).min() || coords[i] > dim((int) i).max()) { - return false; - } - } - return true; - } - - template - bool contains(Args... args) const { - return contains(std::vector{args...}); - } - // @} - - /** Make a buffer which refers to the same data in the same layout - * using a swapped indexing order for the dimensions given. So - * A = B.transposed(0, 1) means that A(i, j) == B(j, i), and more - * strongly that A.address_of(i, j) == B.address_of(j, i). */ - Buffer transposed(int d1, int d2) const { - Buffer im = *this; - im.transpose(d1, d2); - return im; - } - - /** Transpose a buffer in-place by changing how it is indexed. For - * example, transpose(0, 1) on a two-dimensional buffer means that - * the value referred to by coordinates (i, j) is now reached at - * the coordinates (j, i), and vice versa. 
This is done by - * reordering the per-dimension metadata rather than by moving - * data around in memory, so other views of the same memory will - * not see the data as having been transposed. */ - void transpose(int d1, int d2) { - assert(d1 >= 0 && d1 < this->dimensions()); - assert(d2 >= 0 && d2 < this->dimensions()); - std::swap(buf.dim[d1], buf.dim[d2]); - } - - /** A generalized transpose: instead of swapping two dimensions, - * pass a vector that lists each dimension index exactly once, in - * the desired order. This does not move any data around in memory - * - it just permutes how it is indexed. */ - void transpose(const std::vector<int> &order) { - assert((int) order.size() == dimensions()); - if (dimensions() < 2) { - // My, that was easy - return; - } - - std::vector<int> order_sorted = order; - for (size_t i = 1; i < order_sorted.size(); i++) { - for (size_t j = i; j > 0 && order_sorted[j-1] > order_sorted[j]; j--) { - std::swap(order_sorted[j], order_sorted[j-1]); - transpose(j, j-1); - } - } - } - - /** Make a buffer which refers to the same data in the same - * layout using a different ordering of the dimensions. */ - Buffer<T, D> transposed(const std::vector<int> &order) const { - Buffer<T, D> im = *this; - im.transpose(order); - return im; - } - - /** Make a lower-dimensional buffer that refers to one slice of - * this buffer. */ - Buffer<T, D> sliced(int d, int pos) const { - Buffer<T, D> im = *this; - - // This guarantees the preexisting device ref is dropped if the - // device_slice call fails and maintains the buffer in a consistent - // state. - im.device_deallocate(); - - im.slice_host(d, pos); - if (buf.device_interface != nullptr) { - complete_device_slice(im, d, pos); - } - return im; - } - - /** Make a lower-dimensional buffer that refers to one slice of this - * buffer at the dimension's minimum. */ - inline Buffer<T, D> sliced(int d) const { - return sliced(d, dim(d).min()); - } - - /** Rewrite the buffer to refer to a single lower-dimensional - * slice of itself along the given dimension at the given - * coordinate. Does not move any data around or free the original - * memory, so other views of the same data are unaffected. */ - void slice(int d, int pos) { - // An optimization for non-device buffers. For the device case, - // a temp buffer is required, so reuse the not-in-place version. - // TODO(zalman|abadams): Are nop slices common enough to special - // case the device part of the if to do nothing? - if (buf.device_interface != nullptr) { - *this = sliced(d, pos); - } else { - slice_host(d, pos); - } - } - - /** Slice a buffer in-place at the dimension's minimum. */ - inline void slice(int d) { - slice(d, dim(d).min()); - } - - /** Make a new buffer that views this buffer as a single slice in a - * higher-dimensional space. The new dimension has extent one and - * the given min. This operation is the opposite of slice. As an - * example, the following condition is true: - * - \code - im2 = im.embedded(1, 17); - &im(x, y, c) == &im2(x, 17, y, c); - \endcode - */ - Buffer<T, D> embedded(int d, int pos = 0) const { - Buffer<T, D> im(*this); - im.embed(d, pos); - return im; - } - - /** Embed a buffer in-place, increasing the - * dimensionality. */ - void embed(int d, int pos = 0) { - assert(d >= 0 && d <= dimensions()); - add_dimension(); - translate(dimensions() - 1, pos); - for (int i = dimensions() - 1; i > d; i--) { - transpose(i, i-1); - } - } - - /** Add a new dimension with a min of zero and an extent of - * one. The stride is the extent of the outermost dimension times - * its stride.
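 * For example (a worked sketch): a 10x20 buffer with dense strides {1, 10}
 * becomes, after add_dimension(), a buffer with extents {10, 20, 1} and
 * strides {1, 10, 200}, since the new stride is 20 * 10 == 200.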
The new dimension is the last dimension. This is a - * special case of embed. */ - void add_dimension() { - const int dims = buf.dimensions; - buf.dimensions++; - if (buf.dim != shape) { - // We're already on the heap. Reallocate. - halide_dimension_t *new_shape = new halide_dimension_t[buf.dimensions]; - for (int i = 0; i < dims; i++) { - new_shape[i] = buf.dim[i]; - } - delete[] buf.dim; - buf.dim = new_shape; - } else if (dims == D) { - // Transition from the in-class storage to the heap - make_shape_storage(buf.dimensions); - for (int i = 0; i < dims; i++) { - buf.dim[i] = shape[i]; - } - } else { - // We still fit in the class - } - buf.dim[dims] = {0, 1, 0}; - if (dims == 0) { - buf.dim[dims].stride = 1; - } else { - buf.dim[dims].stride = buf.dim[dims-1].extent * buf.dim[dims-1].stride; - } - } - - /** Add a new dimension with a min of zero, an extent of one, and - * the specified stride. The new dimension is the last - * dimension. This is a special case of embed. */ - void add_dimension_with_stride(int s) { - add_dimension(); - buf.dim[buf.dimensions-1].stride = s; - } - - /** Methods for managing any GPU allocation. */ - // @{ - // Set the host dirty flag. Called by every operator() - // access. Must be inlined so it can be hoisted out of loops. - HALIDE_ALWAYS_INLINE - void set_host_dirty(bool v = true) { - assert((!v || !device_dirty()) && "Cannot set host dirty when device is already dirty."); - buf.set_host_dirty(v); - } - - // Check if the device allocation is dirty. Called by - // set_host_dirty, which is called by every accessor. Must be - // inlined so it can be hoisted out of loops. - HALIDE_ALWAYS_INLINE - bool device_dirty() const { - return buf.device_dirty(); - } - - bool host_dirty() const { - return buf.host_dirty(); - } - - void set_device_dirty(bool v = true) { - assert((!v || !host_dirty()) && "Cannot set device dirty when host is already dirty."); - buf.set_device_dirty(v); - } - - int copy_to_host(void *ctx = nullptr) { - if (device_dirty()) { - return buf.device_interface->copy_to_host(ctx, &buf); - } - return 0; - } - - int copy_to_device(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) { - if (host_dirty()) { - return device_interface->copy_to_device(ctx, &buf, device_interface); - } - return 0; - } - - int device_malloc(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) { - return device_interface->device_malloc(ctx, &buf, device_interface); - } - - int device_free(void *ctx = nullptr) { - if (dev_ref_count) { - assert(dev_ref_count->ownership == BufferDeviceOwnership::Allocated && - "Can't call device_free on an unmanaged or wrapped native device handle. " - "Free the source allocation or call device_detach_native instead."); - // Multiple people may be holding onto this dev field - assert(dev_ref_count->count == 1 && - "Multiple Halide::Runtime::Buffer objects share this device " - "allocation. Freeing it would create dangling references. 
" - "Don't call device_free on Halide buffers that you have copied or " - "passed by value."); - } - int ret = 0; - if (buf.device_interface) { - ret = buf.device_interface->device_free(ctx, &buf); - } - if (dev_ref_count) { - delete dev_ref_count; - dev_ref_count = nullptr; - } - return ret; - } - - int device_wrap_native(const struct halide_device_interface_t *device_interface, - uint64_t handle, void *ctx = nullptr) { - assert(device_interface); - dev_ref_count = new DeviceRefCount; - dev_ref_count->ownership = BufferDeviceOwnership::WrappedNative; - return device_interface->wrap_native(ctx, &buf, handle, device_interface); - } - - int device_detach_native(void *ctx = nullptr) { - assert(dev_ref_count && - dev_ref_count->ownership == BufferDeviceOwnership::WrappedNative && - "Only call device_detach_native on buffers wrapping a native " - "device handle via device_wrap_native. This buffer was allocated " - "using device_malloc, or is unmanaged. " - "Call device_free or free the original allocation instead."); - // Multiple people may be holding onto this dev field - assert(dev_ref_count->count == 1 && - "Multiple Halide::Runtime::Buffer objects share this device " - "allocation. Freeing it could create dangling references. " - "Don't call device_detach_native on Halide buffers that you " - "have copied or passed by value."); - int ret = 0; - if (buf.device_interface) { - ret = buf.device_interface->detach_native(ctx, &buf); - } - delete dev_ref_count; - dev_ref_count = nullptr; - return ret; - } - - int device_and_host_malloc(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) { - return device_interface->device_and_host_malloc(ctx, &buf, device_interface); - } - - int device_and_host_free(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) { - if (dev_ref_count) { - assert(dev_ref_count->ownership == BufferDeviceOwnership::AllocatedDeviceAndHost && - "Can't call device_and_host_free on a device handle not allocated with device_and_host_malloc. " - "Free the source allocation or call device_detach_native instead."); - // Multiple people may be holding onto this dev field - assert(dev_ref_count->count == 1 && - "Multiple Halide::Runtime::Buffer objects share this device " - "allocation. Freeing it would create dangling references. " - "Don't call device_and_host_free on Halide buffers that you have copied or " - "passed by value."); - } - int ret = 0; - if (buf.device_interface) { - ret = buf.device_interface->device_and_host_free(ctx, &buf); - } - if (dev_ref_count) { - delete dev_ref_count; - dev_ref_count = nullptr; - } - return ret; - } - - int device_sync(void *ctx = nullptr) { - if (buf.device_interface) { - return buf.device_interface->device_sync(ctx, &buf); - } else { - return 0; - } - } - - bool has_device_allocation() const { - return buf.device != 0; - } - - /** Return the method by which the device field is managed. */ - BufferDeviceOwnership device_ownership() const { - if (dev_ref_count == nullptr) { - return BufferDeviceOwnership::Allocated; - } - return dev_ref_count->ownership; - } - // @} - - /** If you use the (x, y, c) indexing convention, then Halide - * Buffers are stored planar by default. This function constructs - * an interleaved RGB or RGBA image that can still be indexed - * using (x, y, c). Passing it to a generator requires that the - * generator has been compiled with support for interleaved (also - * known as packed or chunky) memory layouts. 
*/ - static Buffer make_interleaved(halide_type_t t, int width, int height, int channels) { - Buffer im(t, channels, width, height); - // Note that this is equivalent to calling transpose({2, 0, 1}), - // but slightly more efficient. - im.transpose(0, 1); - im.transpose(1, 2); - return im; - } - - /** If you use the (x, y, c) indexing convention, then Halide - * Buffers are stored planar by default. This function constructs - * an interleaved RGB or RGBA image that can still be indexed - * using (x, y, c). Passing it to a generator requires that the - * generator has been compiled with support for interleaved (also - * known as packed or chunky) memory layouts. */ - static Buffer make_interleaved(int width, int height, int channels) { - return make_interleaved(static_halide_type(), width, height, channels); - } - - /** Wrap an existing interleaved image. */ - static Buffer, D> - make_interleaved(halide_type_t t, T *data, int width, int height, int channels) { - Buffer, D> im(t, data, channels, width, height); - im.transpose(0, 1); - im.transpose(1, 2); - return im; - } - - /** Wrap an existing interleaved image. */ - static Buffer make_interleaved(T *data, int width, int height, int channels) { - return make_interleaved(static_halide_type(), data, width, height, channels); - } - - /** Make a zero-dimensional Buffer */ - static Buffer, D> make_scalar(halide_type_t t) { - Buffer, 1> buf(t, 1); - buf.slice(0, 0); - return buf; - } - - /** Make a zero-dimensional Buffer */ - static Buffer make_scalar() { - Buffer buf(1); - buf.slice(0, 0); - return buf; - } - - /** Make a zero-dimensional Buffer that points to non-owned, existing data */ - static Buffer make_scalar(T* data) { - Buffer buf(data, 1); - buf.slice(0, 0); - return buf; - } - - /** Make a buffer with the same shape and memory nesting order as - * another buffer. It may have a different type. */ - template - static Buffer make_with_shape_of(Buffer src, - void *(*allocate_fn)(size_t) = nullptr, - void (*deallocate_fn)(void *) = nullptr) { - - const halide_type_t dst_type = T_is_void - ? src.type() - : halide_type_of::type>(); - return Buffer<>::make_with_shape_of_helper(dst_type, src. - dimensions(), src.buf.dim, - allocate_fn, deallocate_fn); - } - -private: - - static Buffer<> make_with_shape_of_helper(halide_type_t dst_type, - int dimensions, - halide_dimension_t *shape, - void *(*allocate_fn)(size_t), - void (*deallocate_fn)(void *)) { - // Reorder the dimensions of src to have strides in increasing order - std::vector swaps; - for (int i = dimensions - 1; i > 0; i--) { - for (int j = i; j > 0; j--) { - if (shape[j-1].stride > shape[j].stride) { - std::swap(shape[j-1], shape[j]); - swaps.push_back(j); - } - } - } - - // Rewrite the strides to be dense (this messes up src, which - // is why we took it by value). - for (int i = 0; i < dimensions; i++) { - if (i == 0) { - shape[i].stride = 1; - } else { - shape[i].stride = shape[i-1].extent * shape[i-1].stride; - } - } - - // Undo the dimension reordering - while (!swaps.empty()) { - int j = swaps.back(); - std::swap(shape[j-1], shape[j]); - swaps.pop_back(); - } - - // Use an explicit runtime type, and make dst a Buffer, to allow - // using this method with Buffer for either src or dst. - Buffer<> dst(dst_type, nullptr, dimensions, shape); - dst.allocate(allocate_fn, deallocate_fn); - - return dst; - } - - template - HALIDE_ALWAYS_INLINE - ptrdiff_t offset_of(int d, int first, Args... rest) const { - return offset_of(d+1, rest...) 
+ this->buf.dim[d].stride * (first - this->buf.dim[d].min); - } - - HALIDE_ALWAYS_INLINE - ptrdiff_t offset_of(int d) const { - return 0; - } - - template - HALIDE_ALWAYS_INLINE - storage_T *address_of(Args... args) const { - if (T_is_void) { - return (storage_T *)(this->buf.host) + offset_of(0, args...) * type().bytes(); - } else { - return (storage_T *)(this->buf.host) + offset_of(0, args...); - } - } - - HALIDE_ALWAYS_INLINE - ptrdiff_t offset_of(const int *pos) const { - ptrdiff_t offset = 0; - for (int i = this->dimensions() - 1; i >= 0; i--) { - offset += this->buf.dim[i].stride * (pos[i] - this->buf.dim[i].min); - } - return offset; - } - - HALIDE_ALWAYS_INLINE - storage_T *address_of(const int *pos) const { - if (T_is_void) { - return (storage_T *)this->buf.host + offset_of(pos) * type().bytes(); - } else { - return (storage_T *)this->buf.host + offset_of(pos); - } - } - -public: - - /** Get a pointer to the address of the min coordinate. */ - T *data() const { - return (T *)(this->buf.host); - } - - /** Access elements. Use im(...) to get a reference to an element, - * and use &im(...) to get the address of an element. If you pass - * fewer arguments than the buffer has dimensions, the rest are - * treated as their min coordinate. The non-const versions set the - * host_dirty flag to true. - */ - //@{ - template::value>::type> - HALIDE_ALWAYS_INLINE - const not_void_T &operator()(int first, Args... rest) const { - static_assert(!T_is_void, - "Cannot use operator() on Buffer types"); - assert(!device_dirty()); - return *((const not_void_T *)(address_of(first, rest...))); - } - - HALIDE_ALWAYS_INLINE - const not_void_T & - operator()() const { - static_assert(!T_is_void, - "Cannot use operator() on Buffer types"); - assert(!device_dirty()); - return *((const not_void_T *)(data())); - } - - HALIDE_ALWAYS_INLINE - const not_void_T & - operator()(const int *pos) const { - static_assert(!T_is_void, - "Cannot use operator() on Buffer types"); - assert(!device_dirty()); - return *((const not_void_T *)(address_of(pos))); - } - - template::value>::type> - HALIDE_ALWAYS_INLINE - not_void_T &operator()(int first, Args... rest) { - static_assert(!T_is_void, - "Cannot use operator() on Buffer types"); - set_host_dirty(); - return *((not_void_T *)(address_of(first, rest...))); - } - - HALIDE_ALWAYS_INLINE - not_void_T & - operator()() { - static_assert(!T_is_void, - "Cannot use operator() on Buffer types"); - set_host_dirty(); - return *((not_void_T *)(data())); - } - - HALIDE_ALWAYS_INLINE - not_void_T & - operator()(const int *pos) { - static_assert(!T_is_void, - "Cannot use operator() on Buffer types"); - set_host_dirty(); - return *((not_void_T *)(address_of(pos))); - } - // @} - - /** Tests that all values in this buffer are equal to val. */ - bool all_equal(not_void_T val) const{ - bool all_equal = true; - for_each_element([&](const int *pos) {all_equal &= (*this)(pos) == val;}); - return all_equal; - } - - Buffer &fill(not_void_T val) { - set_host_dirty(); - for_each_value([=](T &v) {v = val;}); - return *this; - } - -private: - /** Helper functions for for_each_value. */ - // @{ - template - struct for_each_value_task_dim { - int extent; - int stride[N]; - }; - - // Given an array of strides, and a bunch of pointers to pointers - // (all of different types), advance the pointers using the - // strides. - template - HALIDE_ALWAYS_INLINE - static void advance_ptrs(const int *stride, Ptr *ptr, Ptrs... 
ptrs) { - (*ptr) += *stride; - advance_ptrs(stride + 1, ptrs...); - } - - HALIDE_ALWAYS_INLINE - static void advance_ptrs(const int *) {} - - // Same as the above, but just increments the pointers. - template - HALIDE_ALWAYS_INLINE - static void increment_ptrs(Ptr *ptr, Ptrs... ptrs) { - (*ptr)++; - increment_ptrs(ptrs...); - } - - HALIDE_ALWAYS_INLINE - static void increment_ptrs() {} - - template - HALIDE_NEVER_INLINE - static void for_each_value_helper(Fn &&f, int d, bool innermost_strides_are_one, - const for_each_value_task_dim *t, Ptrs... ptrs) { - if (d == -1) { - f((*ptrs)...); - } else if (d == 0) { - if (innermost_strides_are_one) { - for (int i = t[0].extent; i != 0; i--) { - f((*ptrs)...); - increment_ptrs((&ptrs)...); - } - } else { - for (int i = t[0].extent; i != 0; i--) { - f((*ptrs)...); - advance_ptrs(t[0].stride, (&ptrs)...); - } - } - } else { - for (int i = t[d].extent; i != 0; i--) { - for_each_value_helper(f, d-1, innermost_strides_are_one, t, ptrs...); - advance_ptrs(t[d].stride, (&ptrs)...); - } - } - } - - template - HALIDE_NEVER_INLINE - static bool for_each_value_prep(for_each_value_task_dim *t, - const halide_buffer_t **buffers) { - const int dimensions = buffers[0]->dimensions; - - // Extract the strides in all the dimensions - for (int i = 0; i < dimensions; i++) { - for (int j = 0; j < N; j++) { - assert(buffers[j]->dimensions == dimensions); - assert(buffers[j]->dim[i].extent == buffers[0]->dim[i].extent && - buffers[j]->dim[i].min == buffers[0]->dim[i].min); - const int s = buffers[j]->dim[i].stride; - t[i].stride[j] = s; - } - t[i].extent = buffers[0]->dim[i].extent; - - // Order the dimensions by stride, so that the traversal is cache-coherent. - for (int j = i; j > 0 && t[j].stride[0] < t[j-1].stride[0]; j--) { - std::swap(t[j], t[j-1]); - } - } - - // flatten dimensions where possible to make a larger inner - // loop for autovectorization. - int d = dimensions; - for (int i = 1; i < d; i++) { - bool flat = true; - for (int j = 0; j < N; j++) { - flat = flat && t[i-1].stride[j] * t[i-1].extent == t[i].stride[j]; - } - if (flat) { - t[i-1].extent *= t[i].extent; - for (int j = i; j < d; j++) { - t[j] = t[j+1]; - } - i--; - d--; - t[d].extent = 1; - } - } - - bool innermost_strides_are_one = true; - if (dimensions > 0) { - for (int i = 0; i < N; i++) { - innermost_strides_are_one &= (t[0].stride[i] == 1); - } - } - - return innermost_strides_are_one; - } - - template - void for_each_value_impl(Fn &&f, Args&&... other_buffers) const { - Buffer<>::for_each_value_task_dim *t = - (Buffer<>::for_each_value_task_dim *)HALIDE_ALLOCA((dimensions()+1) * sizeof(for_each_value_task_dim)); - // Move the preparatory code into a non-templated helper to - // save code size. - const halide_buffer_t *buffers[] = {&buf, (&other_buffers.buf)...}; - bool innermost_strides_are_one = Buffer<>::for_each_value_prep(t, buffers); - - Buffer<>::for_each_value_helper(f, dimensions() - 1, - innermost_strides_are_one, - t, - data(), (other_buffers.data())...); - } - // @} - -public: - /** Call a function on every value in the buffer, and the - * corresponding values in some number of other buffers of the - * same size. The function should take a reference, const - * reference, or value of the correct type for each buffer. This - * effectively lifts a function of scalars to an element-wise - * function of buffers. This produces code that the compiler can - * autovectorize. This is slightly cheaper than for_each_element, - * because it does not need to track the coordinates. 
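 *
 * For example (a minimal sketch), a saturating add of src into dst:
 *
 \code
 Buffer<uint8_t> dst(128, 128), src(128, 128);
 dst.for_each_value([](uint8_t &d, uint8_t s) {
     int sum = d + s;
     d = (uint8_t)(sum > 255 ? 255 : sum);
 }, src);
 \endcode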
- * - * Note that constness of Buffers is preserved: a const Buffer<T> (for either - * 'this' or the other-buffers arguments) will allow mutation of the - * buffer contents, while a Buffer<const T> will not. Attempting to specify - * a mutable reference for the lambda argument of a Buffer<const T> - * will result in a compilation error. */ - // @{ - template<typename Fn, typename... Args> - HALIDE_ALWAYS_INLINE - const Buffer<T, D> &for_each_value(Fn &&f, Args&&... other_buffers) const { - for_each_value_impl(f, std::forward<Args>(other_buffers)...); - return *this; - } - - template<typename Fn, typename... Args> - HALIDE_ALWAYS_INLINE - Buffer<T, D> &for_each_value(Fn &&f, Args&&... other_buffers) { - for_each_value_impl(f, std::forward<Args>(other_buffers)...); - return *this; - } - // @} - -private: - - // Helper functions for for_each_element - struct for_each_element_task_dim { - int min, max; - }; - - /** If f is callable with this many args, call it. The first - * argument is just to make the overloads distinct. Actual - * overload selection is done using the enable_if. */ - template<typename Fn, typename... Args, typename = decltype(std::declval<Fn>()(std::declval<Args>()...))> - HALIDE_ALWAYS_INLINE - static void for_each_element_variadic(int, int, const for_each_element_task_dim *, Fn &&f, Args... args) { - f(args...); - } - - /** If the above overload is impossible, we add an outer loop over - * an additional argument and try again. */ - template<typename Fn, typename... Args> - HALIDE_ALWAYS_INLINE - static void for_each_element_variadic(double, int d, const for_each_element_task_dim *t, Fn &&f, Args... args) { - for (int i = t[d].min; i <= t[d].max; i++) { - for_each_element_variadic(0, d - 1, t, std::forward<Fn>(f), i, args...); - } - } - - /** Determine the minimum number of arguments a callable can take - * using the same trick. */ - template<typename Fn, typename... Args, typename = decltype(std::declval<Fn>()(std::declval<Args>()...))> - HALIDE_ALWAYS_INLINE - static int num_args(int, Fn &&, Args...) { - return (int)(sizeof...(Args)); - } - - /** The recursive version is only enabled up to a recursion limit - * of 256. This catches callables that aren't callable with any - * number of ints. */ - template<typename Fn, typename... Args> - HALIDE_ALWAYS_INLINE - static int num_args(double, Fn &&f, Args... args) { - static_assert(sizeof...(args) <= 256, - "Callable passed to for_each_element must accept either a const int *," - " or up to 256 ints. No such operator found. Expect infinite template recursion."); - return num_args(0, std::forward<Fn>(f), 0, args...); - } - - /** A version where the callable takes a position array instead, - * with compile-time recursion on the dimensionality. This - * overload is preferred to the one below using the same int vs - * double trick as above, but is impossible once d hits -1 using - * std::enable_if. */ - template<int d, typename Fn, typename = typename std::enable_if<(d >= 0)>::type> - HALIDE_ALWAYS_INLINE - static void for_each_element_array_helper(int, const for_each_element_task_dim *t, Fn &&f, int *pos) { - for (pos[d] = t[d].min; pos[d] <= t[d].max; pos[d]++) { - for_each_element_array_helper<d - 1>(0, t, std::forward<Fn>(f), pos); - } - } - - /** Base case for recursion above. */ - template<int d, typename Fn, typename = typename std::enable_if<(d < 0)>::type> - HALIDE_ALWAYS_INLINE - static void for_each_element_array_helper(double, const for_each_element_task_dim *t, Fn &&f, int *pos) { - f(pos); - } - - /** A run-time-recursive version (instead of - * compile-time-recursive) that requires the callable to take a - * pointer to a position array instead. Dispatches to the - * compile-time-recursive version once the dimensionality gets - * small.
*/ - template<typename Fn> - static void for_each_element_array(int d, const for_each_element_task_dim *t, Fn &&f, int *pos) { - if (d == -1) { - f(pos); - } else if (d == 0) { - // Once the dimensionality gets small enough, dispatch to - // a compile-time-recursive version for better codegen of - // the inner loops. - for_each_element_array_helper<0, Fn>(0, t, std::forward<Fn>(f), pos); - } else if (d == 1) { - for_each_element_array_helper<1, Fn>(0, t, std::forward<Fn>(f), pos); - } else if (d == 2) { - for_each_element_array_helper<2, Fn>(0, t, std::forward<Fn>(f), pos); - } else if (d == 3) { - for_each_element_array_helper<3, Fn>(0, t, std::forward<Fn>(f), pos); - } else { - for (pos[d] = t[d].min; pos[d] <= t[d].max; pos[d]++) { - for_each_element_array(d - 1, t, std::forward<Fn>(f), pos); - } - } - } - - /** We now have two overloads for for_each_element. This one - * triggers if the callable takes a const int *. - */ - template<typename Fn, typename = decltype(std::declval<Fn>()((const int *)nullptr))> - static void for_each_element(int, int dims, const for_each_element_task_dim *t, Fn &&f, int check = 0) { - int *pos = (int *)HALIDE_ALLOCA(dims * sizeof(int)); - for_each_element_array(dims - 1, t, std::forward<Fn>(f), pos); - } - - /** This one triggers otherwise. It treats the callable as - * something that takes some number of ints. */ - template<typename Fn> - HALIDE_ALWAYS_INLINE - static void for_each_element(double, int dims, const for_each_element_task_dim *t, Fn &&f) { - int args = num_args(0, std::forward<Fn>(f)); - assert(dims >= args); - for_each_element_variadic(0, args - 1, t, std::forward<Fn>(f)); - } - - template<typename Fn> - void for_each_element_impl(Fn &&f) const { - for_each_element_task_dim *t = - (for_each_element_task_dim *)HALIDE_ALLOCA(dimensions() * sizeof(for_each_element_task_dim)); - for (int i = 0; i < dimensions(); i++) { - t[i].min = dim(i).min(); - t[i].max = dim(i).max(); - } - for_each_element(0, dimensions(), t, std::forward<Fn>(f)); - } - -public: - /** Call a function at each site in a buffer. This is likely to be - * much slower than using Halide code to populate a buffer, but is - * convenient for tests. If the function has more arguments than the - * buffer has dimensions, the remaining arguments will be zero. If it - * has fewer arguments than the buffer has dimensions then the last - * few dimensions of the buffer are not iterated over. For example, - * the following code exploits this to set a floating point RGB image - * to red: - - \code - Buffer<float> im(100, 100, 3); - im.for_each_element([&](int x, int y) { - im(x, y, 0) = 1.0f; - im(x, y, 1) = 0.0f; - im(x, y, 2) = 0.0f; - }); - \endcode - - * The compiled code is equivalent to writing a nested for loop, - * and compilers are capable of optimizing it in the same way. - * - * If the callable can be called with an int * as the sole argument, - * that version is called instead. Each location in the buffer is - * passed to it in a coordinate array. This version is higher-overhead - * than the variadic version, but is useful for writing generic code - * that accepts buffers of arbitrary dimensionality. For example, the - * following sets the value at all sites in an arbitrary-dimensional - * buffer to their first coordinate: - - \code - im.for_each_element([&](const int *pos) {im(pos) = pos[0];}); - \endcode - - * It is also possible to use for_each_element to iterate over entire - * rows or columns by cropping the buffer to a single column or row - * respectively and iterating over elements of the result.
For example, - * to set the diagonal of the image to 1 by iterating over the columns: - - \code - Buffer im(100, 100, 3); - im.sliced(1, 0).for_each_element([&](int x, int c) { - im(x, x, c) = 1.0f; - }); - \endcode - - * Or, assuming the memory layout is known to be dense per row, one can - * memset each row of an image like so: - - \code - Buffer im(100, 100, 3); - im.sliced(0, 0).for_each_element([&](int y, int c) { - memset(&im(0, y, c), 0, sizeof(float) * im.width()); - }); - \endcode - - */ - // @{ - template - HALIDE_ALWAYS_INLINE - const Buffer &for_each_element(Fn &&f) const { - for_each_element_impl(f); - return *this; - } - - template - HALIDE_ALWAYS_INLINE - Buffer &for_each_element(Fn &&f) { - for_each_element_impl(f); - return *this; - } - // @} - -private: - template - struct FillHelper { - Fn f; - Buffer *buf; - - template()(std::declval()...))> - void operator()(Args... args) { - (*buf)(args...) = f(args...); - } - - FillHelper(Fn &&f, Buffer *buf) : f(std::forward(f)), buf(buf) {} - }; - -public: - /** Fill a buffer by evaluating a callable at every site. The - * callable should look much like a callable passed to - * for_each_element, but it should return the value that should be - * stored to the coordinate corresponding to the arguments. */ - template::type>::value>::type> - Buffer &fill(Fn &&f) { - // We'll go via for_each_element. We need a variadic wrapper lambda. - FillHelper wrapper(std::forward(f), this); - return for_each_element(wrapper); - } - - /** Check if an input buffer passed extern stage is a querying - * bounds. Compared to doing the host pointer check directly, - * this both adds clarity to code and will facilitate moving to - * another representation for bounds query arguments. */ - bool is_bounds_query() const { - return buf.is_bounds_query(); - } - - /** Convenient check to verify that all of the interesting bytes in the Buffer - * are initialized under MSAN. Note that by default, we use for_each_value() here so that - * we skip any unused padding that isn't part of the Buffer; this isn't efficient, - * but in MSAN mode, it doesn't matter. (Pass true for the flag to force check - * the entire Buffer storage.) */ - void msan_check_mem_is_initialized(bool entire = false) const { -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) - if (entire) { - __msan_check_mem_is_initialized(data(), size_in_bytes()); - } else { - for_each_value([](T &v) { __msan_check_mem_is_initialized(&v, sizeof(T)); ;}); - } -#endif -#endif - } -}; - -} // namespace Runtime -} // namespace Halide - -#undef HALIDE_ALLOCA - -#endif // HALIDE_RUNTIME_IMAGE_H diff --git a/t2s/preprocessor/sample/sample_01/HalideRuntime.h b/t2s/preprocessor/sample/sample_01/HalideRuntime.h deleted file mode 100644 index 2aefb623..00000000 --- a/t2s/preprocessor/sample/sample_01/HalideRuntime.h +++ /dev/null @@ -1,1982 +0,0 @@ -#ifndef HALIDE_HALIDERUNTIME_H -#define HALIDE_HALIDERUNTIME_H - -#ifndef COMPILING_HALIDE_RUNTIME -#include -#include -#include -#include -#else -#include "runtime_internal.h" -#endif - -#ifdef __cplusplus -// Forward declare type to allow naming typed handles. -// See Type.h for documentation. -template struct halide_handle_traits; -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -// Note that you should not use "inline" along with HALIDE_ALWAYS_INLINE; -// it is not necessary, and may produce warnings for some build configurations. 
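// For example (a sketch):
//     HALIDE_ALWAYS_INLINE int twice(int x) { return x * 2; }           // correct
//     inline HALIDE_ALWAYS_INLINE int thrice(int x) { return x * 3; }   // redundant "inline"; may warn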
-#ifdef _MSC_VER -#define HALIDE_ALWAYS_INLINE __forceinline -#define HALIDE_NEVER_INLINE __declspec(noinline) -#else -#define HALIDE_ALWAYS_INLINE __attribute__((always_inline)) inline -#define HALIDE_NEVER_INLINE __attribute__((noinline)) -#endif - -#ifndef HALIDE_MUST_USE_RESULT -#ifdef __has_attribute -#if __has_attribute(nodiscard) -// C++17 or later -#define HALIDE_MUST_USE_RESULT [[nodiscard]] -#elif __has_attribute(warn_unused_result) -// Clang/GCC -#define HALIDE_MUST_USE_RESULT __attribute__((warn_unused_result)) -#else -#define HALIDE_MUST_USE_RESULT -#endif -#else -#define HALIDE_MUST_USE_RESULT -#endif -#endif - -/** \file - * - * This file declares the routines used by Halide internally in its - * runtime. On platforms that support weak linking, these can be - * replaced with user-defined versions by defining an extern "C" - * function with the same name and signature. - * - * When doing Just In Time (JIT) compilation methods on the Func being - * compiled must be called instead. The corresponding methods are - * documented below. - * - * All of these functions take a "void *user_context" parameter as their - * first argument; if the Halide kernel that calls back to any of these - * functions has been compiled with the UserContext feature set on its Target, - * then the value of that pointer passed from the code that calls the - * Halide kernel is piped through to the function. - * - * Some of these are also useful to call when using the default - * implementation. E.g. halide_shutdown_thread_pool. - * - * Note that even on platforms with weak linking, some linker setups - * may not respect the override you provide. E.g. if the override is - * in a shared library and the halide object files are linked directly - * into the output, the builtin versions of the runtime functions will - * be called. See your linker documentation for more details. On - * Linux, LD_DYNAMIC_WEAK=1 may help. - * - */ - -// Forward-declare to suppress warnings if compiling as C. -struct halide_buffer_t; -struct buffer_t; - -/** Print a message to stderr. Main use is to support tracing - * functionality, print, and print_when calls. Also called by the default - * halide_error. This function can be replaced in JITed code by using - * halide_custom_print and providing an implementation of halide_print - * in AOT code. See Func::set_custom_print. - */ -// @{ -extern void halide_print(void *user_context, const char *); -extern void halide_default_print(void *user_context, const char *); -typedef void (*halide_print_t)(void *, const char *); -extern halide_print_t halide_set_custom_print(halide_print_t print); -// @} - -/** Halide calls this function on runtime errors (for example bounds - * checking failures). This function can be replaced in JITed code by - * using Func::set_error_handler, or in AOT code by calling - * halide_set_error_handler. In AOT code on platforms that support - * weak linking (i.e. not Windows), you can also override it by simply - * defining your own halide_error. - */ -// @{ -extern void halide_error(void *user_context, const char *); -extern void halide_default_error(void *user_context, const char *); -typedef void (*halide_error_handler_t)(void *, const char *); -extern halide_error_handler_t halide_set_error_handler(halide_error_handler_t handler); -// @} - -/** Cross-platform mutex. Must be initialized with zero and implementation - * must treat zero as an unlocked mutex with no waiters, etc. 
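 *
 * For example (a minimal sketch), zero-initialize and guard a critical section:
 *
 \code
 struct halide_mutex m = {{0}};
 halide_mutex_lock(&m);
 // ... critical section ...
 halide_mutex_unlock(&m);
 \endcode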
- */ -struct halide_mutex { - uintptr_t _private[1]; -}; - -/** Cross-platform condition variable. Must be initialized to 0. */ -struct halide_cond { - uintptr_t _private[1]; -}; - -/** A basic set of mutex and condition variable functions, which call - * platform-specific code for mutual exclusion. Equivalent to posix - * calls. */ -//@{ -extern void halide_mutex_lock(struct halide_mutex *mutex); -extern void halide_mutex_unlock(struct halide_mutex *mutex); -extern void halide_cond_signal(struct halide_cond *cond); -extern void halide_cond_broadcast(struct halide_cond *cond); -extern void halide_cond_wait(struct halide_cond *cond, struct halide_mutex *mutex); -//@} - -/** Functions for constructing/destroying/locking/unlocking arrays of mutexes. */ -struct halide_mutex_array; -//@{ -extern struct halide_mutex_array* halide_mutex_array_create(int sz); -extern void halide_mutex_array_destroy(void *user_context, void *array); -extern int halide_mutex_array_lock(struct halide_mutex_array *array, int entry); -extern int halide_mutex_array_unlock(struct halide_mutex_array *array, int entry); -//@} - -/** Define halide_do_par_for to replace the default thread pool - * implementation. halide_shutdown_thread_pool can also be called to - * release resources used by the default thread pool on platforms - * where it makes sense. (E.g. on Mac OS, Grand Central Dispatch is - * used so %Halide does not own the threads backing the pool and they - * cannot be released.) See Func::set_custom_do_task and - * Func::set_custom_do_par_for. Should return zero if all the jobs - * return zero, or an arbitrarily chosen return value from one of the - * jobs otherwise. - */ -//@{ -typedef int (*halide_task_t)(void *user_context, int task_number, uint8_t *closure); -extern int halide_do_par_for(void *user_context, - halide_task_t task, - int min, int size, uint8_t *closure); -extern void halide_shutdown_thread_pool(); -//@} - -/** Set a custom method for performing a parallel for loop. Returns - * the old do_par_for handler. */ -typedef int (*halide_do_par_for_t)(void *, halide_task_t, int, int, uint8_t*); -extern halide_do_par_for_t halide_set_custom_do_par_for(halide_do_par_for_t do_par_for); - -/** An opaque struct representing a semaphore. Used by the task system for async tasks. */ -struct halide_semaphore_t { - uint64_t _private[2]; -}; - -/** A struct representing a semaphore and a number of items that must - * be acquired from it. Used in halide_parallel_task_t below. */ -struct halide_semaphore_acquire_t { - struct halide_semaphore_t *semaphore; - int count; -}; -extern int halide_semaphore_init(struct halide_semaphore_t *, int n); -extern int halide_semaphore_release(struct halide_semaphore_t *, int n); -extern bool halide_semaphore_try_acquire(struct halide_semaphore_t *, int n); -typedef int (*halide_semaphore_init_t)(struct halide_semaphore_t *, int); -typedef int (*halide_semaphore_release_t)(struct halide_semaphore_t *, int); -typedef bool (*halide_semaphore_try_acquire_t)(struct halide_semaphore_t *, int); - - -/** A task representing a serial for loop evaluated over some range. - * Note that task_parent is a pass through argument that should be - * passed to any dependent tasks that are invoked using halide_do_parallel_tasks - * underneath this call. */ -typedef int (*halide_loop_task_t)(void *user_context, int min, int extent, - uint8_t *closure, void *task_parent); - -/** A parallel task to be passed to halide_do_parallel_tasks.
This - * task may recursively call halide_do_parallel_tasks, and there may - * be complex dependencies between seemingly unrelated tasks expressed - * using semaphores. If you are using a custom task system, care must - * be taken to avoid potential deadlock. This can be done by carefully - * respecting the static metadata at the end of the task struct.*/ -struct halide_parallel_task_t { - // The function to call. It takes a user context, a min and - // extent, a closure, and a task system pass through argument. - halide_loop_task_t fn; - - // The closure to pass it - uint8_t *closure; - - // The name of the function to be called. For debugging purposes only. - const char *name; - - // An array of semaphores that must be acquired before the - // function is called. Must be reacquired for every call made. - struct halide_semaphore_acquire_t *semaphores; - int num_semaphores; - - // The entire range the function should be called over. This range - // may be sliced up and the function called multiple times. - int min, extent; - - // A parallel task provides several pieces of metadata to prevent - // unbounded resource usage or deadlock. - - // The first is the minimum number of execution contexts (call - // stacks or threads) necessary for the function to run to - // completion. This may be greater than one when there is nested - // parallelism with internal producer-consumer relationships - // (calling the function recursively spawns and blocks on parallel - // sub-tasks that communicate with each other via semaphores). If - // a parallel runtime calls the function when fewer than this many - // threads are idle, it may need to create more threads to - // complete the task, or else risk deadlock due to committing all - // threads to tasks that cannot complete without more. - // - // FIXME: Note that extern stages are assumed to only require a - // single thread to complete. If the extern stage is itself a - // Halide pipeline, this may be an underestimate. - int min_threads; - - // The calls to the function should be in serial order from min to min+extent-1, with only - // one executing at a time. If false, any order is fine, and - // concurrency is fine. - bool serial; -}; - -/** Enqueue some number of the tasks described above and wait for them - * to complete. While waiting, the calling threads assists with either - * the tasks enqueued, or other non-blocking tasks in the task - * system. Note that task_parent should be NULL for top-level calls - * and the pass through argument if this call is being made from - * another task. */ -extern int halide_do_parallel_tasks(void *user_context, int num_tasks, - struct halide_parallel_task_t *tasks, - void *task_parent); - -/** If you use the default do_par_for, you can still set a custom - * handler to perform each individual task. Returns the old handler. */ -//@{ -typedef int (*halide_do_task_t)(void *, halide_task_t, int, uint8_t *); -extern halide_do_task_t halide_set_custom_do_task(halide_do_task_t do_task); -extern int halide_do_task(void *user_context, halide_task_t f, int idx, - uint8_t *closure); -//@} - -/** The version of do_task called for loop tasks. By default calls the - * loop task with the same arguments. 
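 *
 * A default-equivalent override would look like this (a minimal sketch):
 *
 \code
 int my_do_loop_task(void *ctx, halide_loop_task_t f, int min, int extent,
                     uint8_t *closure, void *task_parent) {
     // Forward to the loop task unchanged, as the default does.
     return f(ctx, min, extent, closure, task_parent);
 }
 // Installed via: halide_set_custom_do_loop_task(my_do_loop_task);
 \endcode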
*/ -// @{ - typedef int (*halide_do_loop_task_t)(void *, halide_loop_task_t, int, int, uint8_t *, void *); -extern halide_do_loop_task_t halide_set_custom_do_loop_task(halide_do_loop_task_t do_task); -extern int halide_do_loop_task(void *user_context, halide_loop_task_t f, int min, int extent, - uint8_t *closure, void *task_parent); -//@} - -/** Provide an entire custom tasking runtime via function - * pointers. Note that do_task and semaphore_try_acquire are only ever - * called by halide_default_do_par_for and - * halide_default_do_parallel_tasks, so it's only necessary to provide - * those if you are mixing in the default implementations of - * do_par_for and do_parallel_tasks. */ -// @{ -typedef int (*halide_do_parallel_tasks_t)(void *, int, struct halide_parallel_task_t *, - void *task_parent); -extern void halide_set_custom_parallel_runtime( - halide_do_par_for_t, - halide_do_task_t, - halide_do_loop_task_t, - halide_do_parallel_tasks_t, - halide_semaphore_init_t, - halide_semaphore_try_acquire_t, - halide_semaphore_release_t - ); -// @} - -/** The default versions of the parallel runtime functions. */ -// @{ -extern int halide_default_do_par_for(void *user_context, - halide_task_t task, - int min, int size, uint8_t *closure); -extern int halide_default_do_parallel_tasks(void *user_context, - int num_tasks, - struct halide_parallel_task_t *tasks, - void *task_parent); -extern int halide_default_do_task(void *user_context, halide_task_t f, int idx, - uint8_t *closure); -extern int halide_default_do_loop_task(void *user_context, halide_loop_task_t f, - int min, int extent, - uint8_t *closure, void *task_parent); -extern int halide_default_semaphore_init(struct halide_semaphore_t *, int n); -extern int halide_default_semaphore_release(struct halide_semaphore_t *, int n); -extern bool halide_default_semaphore_try_acquire(struct halide_semaphore_t *, int n); -// @} - -struct halide_thread; - -/** Spawn a thread. Returns a handle to the thread for the purposes of - * joining it. The thread must be joined in order to clean up any - * resources associated with it. */ -extern struct halide_thread *halide_spawn_thread(void (*f)(void *), void *closure); - -/** Join a thread. */ -extern void halide_join_thread(struct halide_thread *); - -/** Set the number of threads used by Halide's thread pool. Returns - * the old number. - * - * n < 0 : error condition - * n == 0 : use a reasonable system default (typically, number of cpus online). - * n == 1 : use exactly one thread; this will always enforce serial execution - * n > 1 : use a pool of exactly n threads. - * - * (Note that this is only guaranteed when using the default implementations - * of halide_do_par_for(); custom implementations may completely ignore values - * passed to halide_set_num_threads().) - */ -extern int halide_set_num_threads(int n); - -/** Halide calls these functions to allocate and free memory. To - * replace in AOT code, use the halide_set_custom_malloc and - * halide_set_custom_free, or (on platforms that support weak - * linking), simply define these functions yourself. In JIT-compiled - * code use Func::set_custom_allocator. - * - * If you override them, and find yourself wanting to call the default - * implementation from within your override, use - * halide_default_malloc/free. - * - * Note that halide_malloc must return a pointer aligned to the - * maximum meaningful alignment for the platform for the purpose of - * vector loads and stores. 
The default implementation uses 32-byte - * alignment, which is safe for arm and x86. Additionally, it must be - * safe to read at least 8 bytes before the start and beyond the - * end. - */ -//@{ -extern void *halide_malloc(void *user_context, size_t x); -extern void halide_free(void *user_context, void *ptr); -extern void *halide_default_malloc(void *user_context, size_t x); -extern void halide_default_free(void *user_context, void *ptr); -typedef void *(*halide_malloc_t)(void *, size_t); -typedef void (*halide_free_t)(void *, void *); -extern halide_malloc_t halide_set_custom_malloc(halide_malloc_t user_malloc); -extern halide_free_t halide_set_custom_free(halide_free_t user_free); -//@} - -/** Halide calls these functions to interact with the underlying - * system runtime functions. To replace in AOT code on platforms that - * support weak linking, define these functions yourself, or use - * the halide_set_custom_load_library() and halide_set_custom_get_library_symbol() - * functions. In JIT-compiled code, use JITSharedRuntime::set_default_handlers(). - * - * halide_load_library and halide_get_library_symbol are equivalent to - * dlopen and dlsym. halide_get_symbol(sym) is equivalent to - * dlsym(RTLD_DEFAULT, sym). - */ -//@{ -extern void *halide_get_symbol(const char *name); -extern void *halide_load_library(const char *name); -extern void *halide_get_library_symbol(void *lib, const char *name); -extern void *halide_default_get_symbol(const char *name); -extern void *halide_default_load_library(const char *name); -extern void *halide_default_get_library_symbol(void *lib, const char *name); -typedef void *(*halide_get_symbol_t)(const char *name); -typedef void *(*halide_load_library_t)(const char *name); -typedef void *(*halide_get_library_symbol_t)(void *lib, const char *name); -extern halide_get_symbol_t halide_set_custom_get_symbol(halide_get_symbol_t user_get_symbol); -extern halide_load_library_t halide_set_custom_load_library(halide_load_library_t user_load_library); -extern halide_get_library_symbol_t halide_set_custom_get_library_symbol(halide_get_library_symbol_t user_get_library_symbol); -//@} - -/** Called when debug_to_file is used inside %Halide code. See - * Func::debug_to_file for how this is called - * - * Cannot be replaced in JITted code at present. - */ -extern int32_t halide_debug_to_file(void *user_context, const char *filename, - int32_t type_code, - struct halide_buffer_t *buf); - -/** Types in the halide type system. They can be ints, unsigned ints, - * or floats (of various bit-widths), or a handle (which is always 64-bits). - * Note that the int/uint/float values do not imply a specific bit width - * (the bit width is expected to be encoded in a separate value). - */ -typedef enum halide_type_code_t -#if __cplusplus >= 201103L -: uint8_t -#endif -{ - halide_type_int = 0, //!< signed integers - halide_type_uint = 1, //!< unsigned integers - halide_type_float = 2, //!< IEEE floating point numbers - halide_type_handle = 3, //!< opaque pointer type (void *) - halide_type_bfloat = 4, //!< floating point numbers in the bfloat format -} halide_type_code_t; - -// Note that while __attribute__ can go before or after the declaration, -// __declspec apparently is only allowed before. -#ifndef HALIDE_ATTRIBUTE_ALIGN - #ifdef _MSC_VER - #define HALIDE_ATTRIBUTE_ALIGN(x) __declspec(align(x)) - #else - #define HALIDE_ATTRIBUTE_ALIGN(x) __attribute__((aligned(x))) - #endif -#endif - -/** A runtime tag for a type in the halide type system. 
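 * (For example, halide_type_t(halide_type_float, 32) describes a scalar
 * float, and halide_type_t(halide_type_uint, 8, 16) a 16-lane vector of
 * 8-bit unsigned integers.)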
Can be ints, - * unsigned ints, or floats of various bit-widths (the 'bits' - * field). Can also be vectors of the same (by setting the 'lanes' - * field to something larger than one). This struct should be - * exactly 32-bits in size. */ -struct halide_type_t { - /** The basic type code: signed integer, unsigned integer, or floating point. */ -#if __cplusplus >= 201103L - HALIDE_ATTRIBUTE_ALIGN(1) halide_type_code_t code; // halide_type_code_t -#else - HALIDE_ATTRIBUTE_ALIGN(1) uint8_t code; // halide_type_code_t -#endif - - /** The number of bits of precision of a single scalar value of this type. */ - HALIDE_ATTRIBUTE_ALIGN(1) uint8_t bits; - - /** How many elements in a vector. This is 1 for scalar types. */ - HALIDE_ATTRIBUTE_ALIGN(2) uint16_t lanes; - -#ifdef __cplusplus - /** Construct a runtime representation of a Halide type from: - * code: The fundamental type from an enum. - * bits: The bit size of one element. - * lanes: The number of vector elements in the type. */ - HALIDE_ALWAYS_INLINE halide_type_t(halide_type_code_t code, uint8_t bits, uint16_t lanes = 1) - : code(code), bits(bits), lanes(lanes) { - } - - /** Default constructor is required e.g. to declare halide_trace_event - * instances. */ - HALIDE_ALWAYS_INLINE halide_type_t() : code((halide_type_code_t)0), bits(0), lanes(0) {} - - HALIDE_ALWAYS_INLINE halide_type_t with_lanes(uint16_t new_lanes) const { - return halide_type_t((halide_type_code_t) code, bits, new_lanes); - } - - /** Compare two types for equality. */ - HALIDE_ALWAYS_INLINE bool operator==(const halide_type_t &other) const { - return as_u32() == other.as_u32(); - } - - HALIDE_ALWAYS_INLINE bool operator!=(const halide_type_t &other) const { - return !(*this == other); - } - - HALIDE_ALWAYS_INLINE bool operator<(const halide_type_t &other) const { - return as_u32() < other.as_u32(); - } - - /** Size in bytes for a single element, even if width is not 1, of this type. */ - HALIDE_ALWAYS_INLINE int bytes() const { return (bits + 7) / 8; } - - HALIDE_ALWAYS_INLINE uint32_t as_u32() const { - uint32_t u; - memcpy(&u, this, sizeof(u)); - return u; - } -#endif -}; - -enum halide_trace_event_code_t {halide_trace_load = 0, - halide_trace_store = 1, - halide_trace_begin_realization = 2, - halide_trace_end_realization = 3, - halide_trace_produce = 4, - halide_trace_end_produce = 5, - halide_trace_consume = 6, - halide_trace_end_consume = 7, - halide_trace_begin_pipeline = 8, - halide_trace_end_pipeline = 9, - halide_trace_tag = 10 }; - -struct halide_trace_event_t { - /** The name of the Func or Pipeline that this event refers to */ - const char *func; - - /** If the event type is a load or a store, this points to the - * value being loaded or stored. Use the type field to safely cast - * this to a concrete pointer type and retrieve it. For other - * events this is null. */ - void *value; - - /** For loads and stores, an array which contains the location - * being accessed. For vector loads or stores it is an array of - * vectors of coordinates (the vector dimension is innermost). - * - * For realization or production-related events, this will contain - * the mins and extents of the region being accessed, in the order - * min0, extent0, min1, extent1, ... - * - * For pipeline-related events, this will be null. - */ - int32_t *coordinates; - - /** For halide_trace_tag, this points to a read-only null-terminated string - * of arbitrary text. For all other events, this will be null. 
- */ - const char *trace_tag; - - /** If the event type is a load or a store, this is the type of - * the data. Otherwise, the value is meaningless. */ - struct halide_type_t type; - - /** The type of event */ - enum halide_trace_event_code_t event; - - /* The ID of the parent event (see below for an explanation of - * event ancestry). */ - int32_t parent_id; - - /** If this was a load or store of a Tuple-valued Func, this is - * which tuple element was accessed. */ - int32_t value_index; - - /** The length of the coordinates array */ - int32_t dimensions; - -#ifdef __cplusplus - // If we don't explicitly mark the default ctor as inline, - // certain build configurations can fail (notably iOS) - HALIDE_ALWAYS_INLINE halide_trace_event_t() {} -#endif -}; - -/** Called when Funcs are marked as trace_load, trace_store, or - * trace_realization. See Func::set_custom_trace. The default - * implementation either prints events via halide_print, or if - * HL_TRACE_FILE is defined, dumps the trace to that file in a - * sequence of trace packets. The header for a trace packet is defined - * below. If the trace is going to be large, you may want to make the - * file a named pipe, and then read from that pipe into gzip. - * - * halide_trace returns a unique ID which will be passed to future - * events that "belong" to the earlier event as the parent id. The - * ownership hierarchy looks like: - * - * begin_pipeline - * +--trace_tag (if any) - * +--trace_tag (if any) - * ... - * +--begin_realization - * | +--produce - * | | +--load/store - * | | +--end_produce - * | +--consume - * | | +--load - * | | +--end_consume - * | +--end_realization - * +--end_pipeline - * - * Threading means that ownership cannot be inferred from the ordering - * of events. There can be many active realizations of a given - * function, or many active productions for a single - * realization. Within a single production, the ordering of events is - * meaningful. - * - * Note that all trace_tag events (if any) will occur just after the begin_pipeline - * event, but before any begin_realization events. All trace_tags for a given Func - * will be emitted in the order added. - */ -// @} -extern int32_t halide_trace(void *user_context, const struct halide_trace_event_t *event); -extern int32_t halide_default_trace(void *user_context, const struct halide_trace_event_t *event); -typedef int32_t (*halide_trace_t)(void *user_context, const struct halide_trace_event_t *); -extern halide_trace_t halide_set_custom_trace(halide_trace_t trace); -// @} - -/** The header of a packet in a binary trace. All fields are 32-bit. */ -struct halide_trace_packet_t { - /** The total size of this packet in bytes. Always a multiple of - * four. Equivalently, the number of bytes until the next - * packet. */ - uint32_t size; - - /** The id of this packet (for the purpose of parent_id). */ - int32_t id; - - /** The remaining fields are equivalent to those in halide_trace_event_t */ - // @{ - struct halide_type_t type; - enum halide_trace_event_code_t event; - int32_t parent_id; - int32_t value_index; - int32_t dimensions; - // @} - - #ifdef __cplusplus - // If we don't explicitly mark the default ctor as inline, - // certain build configurations can fail (notably iOS) - HALIDE_ALWAYS_INLINE halide_trace_packet_t() {} - - /** Get the coordinates array, assuming this packet is laid out in - * memory as it was written. The coordinates array comes - * immediately after the packet header. 
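 * The full packet layout in memory is therefore (a sketch):
 *
 \code
 | packet header | coordinates[dimensions] | value | func name '\0' | trace_tag '\0' | pad to 4 bytes |
 \endcode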
*/ - HALIDE_ALWAYS_INLINE const int *coordinates() const { - return (const int *)(this + 1); - } - - HALIDE_ALWAYS_INLINE int *coordinates() { - return (int *)(this + 1); - } - - /** Get the value, assuming this packet is laid out in memory as - * it was written. The packet comes immediately after the coordinates - * array. */ - HALIDE_ALWAYS_INLINE const void *value() const { - return (const void *)(coordinates() + dimensions); - } - - HALIDE_ALWAYS_INLINE void *value() { - return (void *)(coordinates() + dimensions); - } - - /** Get the func name, assuming this packet is laid out in memory - * as it was written. It comes after the value. */ - HALIDE_ALWAYS_INLINE const char *func() const { - return (const char *)value() + type.lanes * type.bytes(); - } - - HALIDE_ALWAYS_INLINE char *func() { - return (char *)value() + type.lanes * type.bytes(); - } - - /** Get the trace_tag (if any), assuming this packet is laid out in memory - * as it was written. It comes after the func name. If there is no trace_tag, - * this will return a pointer to an empty string. */ - HALIDE_ALWAYS_INLINE const char *trace_tag() const { - const char *f = func(); - // strlen may not be available here - while (*f++) { - // nothing - } - return f; - } - - HALIDE_ALWAYS_INLINE char *trace_tag() { - char *f = func(); - // strlen may not be available here - while (*f++) { - // nothing - } - return f; - } - #endif -}; - - - -/** Set the file descriptor that Halide should write binary trace - * events to. If called with 0 as the argument, Halide outputs trace - * information to stdout in a human-readable format. If never called, - * Halide checks for the existence of an environment variable called - * HL_TRACE_FILE and opens that file. If HL_TRACE_FILE is not defined, - * it outputs trace information to stdout in a human-readable - * format. */ -extern void halide_set_trace_file(int fd); - -/** Halide calls this to retrieve the file descriptor to write binary - * trace events to. The default implementation returns the value set - * by halide_set_trace_file. Implement it yourself if you wish to use - * a custom file descriptor per user_context. Return zero from your - * implementation to tell Halide to print human-readable trace - * information to stdout. */ -extern int halide_get_trace_file(void *user_context); - -/** If tracing is writing to a file, this call closes that file - * (flushing the trace). Returns zero on success. */ -extern int halide_shutdown_trace(); - -/** All Halide GPU or device backend implementations provide an - * interface to be used with halide_device_malloc, etc. This is - * accessed via the functions below. - */ - -/** An opaque struct containing per-GPU API implementations of the - * device functions. */ -struct halide_device_interface_impl_t; - -/** Each GPU API provides a halide_device_interface_t struct pointing - * to the code that manages device allocations. You can access these - * functions directly from the struct member function pointers, or by - * calling the functions declared below. Note that the global - * functions are not available when using Halide as a JIT compiler. - * If you are using raw halide_buffer_t in that context you must use - * the function pointers in the device_interface struct. - * - * The function pointers below are currently the same for every GPU - * API; only the impl field varies.
- * These top-level functions do the bookkeeping that is common across
- * all GPU APIs, and then dispatch to more API-specific functions via
- * another set of function pointers hidden inside the impl field.
- */
-struct halide_device_interface_t {
-    int (*device_malloc)(void *user_context, struct halide_buffer_t *buf,
-                         const struct halide_device_interface_t *device_interface);
-    int (*device_free)(void *user_context, struct halide_buffer_t *buf);
-    int (*device_sync)(void *user_context, struct halide_buffer_t *buf);
-    void (*device_release)(void *user_context,
-                           const struct halide_device_interface_t *device_interface);
-    int (*copy_to_host)(void *user_context, struct halide_buffer_t *buf);
-    int (*copy_to_device)(void *user_context, struct halide_buffer_t *buf,
-                          const struct halide_device_interface_t *device_interface);
-    int (*device_and_host_malloc)(void *user_context, struct halide_buffer_t *buf,
-                                  const struct halide_device_interface_t *device_interface);
-    int (*device_and_host_free)(void *user_context, struct halide_buffer_t *buf);
-    int (*buffer_copy)(void *user_context, struct halide_buffer_t *src,
-                       const struct halide_device_interface_t *dst_device_interface, struct halide_buffer_t *dst);
-    int (*device_crop)(void *user_context, const struct halide_buffer_t *src,
-                       struct halide_buffer_t *dst);
-    int (*device_slice)(void *user_context, const struct halide_buffer_t *src,
-                        int slice_dim, int slice_pos, struct halide_buffer_t *dst);
-    int (*device_release_crop)(void *user_context, struct halide_buffer_t *buf);
-    int (*wrap_native)(void *user_context, struct halide_buffer_t *buf, uint64_t handle,
-                       const struct halide_device_interface_t *device_interface);
-    int (*detach_native)(void *user_context, struct halide_buffer_t *buf);
-    int (*compute_capability)(void *user_context, int *major, int *minor);
-    const struct halide_device_interface_impl_t *impl;
-};
-
-/** Release all data associated with the given device interface, in
- * particular all resources (memory, texture, context handles)
- * allocated by Halide. Must be called explicitly when using AOT
- * compilation. This is *not* thread-safe with respect to actively
- * running Halide code. Ensure all pipelines are finished before
- * calling this. */
-extern void halide_device_release(void *user_context,
-                                  const struct halide_device_interface_t *device_interface);
-
-/** Copy image data from device memory to host memory. This must be called
- * explicitly to copy back the results of a GPU-based filter. */
-extern int halide_copy_to_host(void *user_context, struct halide_buffer_t *buf);
-
-/** Copy image data from host memory to device memory. This should not
- * be called directly; Halide handles copying to the device
- * automatically. If interface is NULL and the buf has a non-zero dev
- * field, the device associated with the dev handle will be
- * used. Otherwise if the dev field is 0 and interface is NULL, an
- * error is returned. */
-extern int halide_copy_to_device(void *user_context, struct halide_buffer_t *buf,
-                                 const struct halide_device_interface_t *device_interface);
-
-/** Copy data from one buffer to another. The buffers may have
- * different shapes and sizes, but the destination buffer's shape must
- * be contained within the source buffer's shape. That is, for each
- * dimension, the min on the destination buffer must be greater than
- * or equal to the min on the source buffer, and min+extent on the
- * destination buffer must be less than or equal to min+extent on the
- * source buffer.
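- *
- * In symbols (a restatement of the rule above, not part of the original
- * comment), for each dimension i:
- *
- *     dst->dim[i].min                      >= src->dim[i].min
- *     dst->dim[i].min + dst->dim[i].extent <= src->dim[i].min + src->dim[i].extent
- *
- * e.g. a destination dimension covering [4, 12) fits inside a source
- * dimension covering [0, 16).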
- * The source data is pulled from either device or host memory on the
- * source, depending on the dirty flags. host is preferred if both are
- * valid. The dst_device_interface parameter controls the destination
- * memory space. NULL means host memory. */
-extern int halide_buffer_copy(void *user_context, struct halide_buffer_t *src,
-                              const struct halide_device_interface_t *dst_device_interface,
-                              struct halide_buffer_t *dst);
-
-/** Give the destination buffer a device allocation which is an alias
- * for the same coordinate range in the source buffer. Modifies the
- * device, device_interface, and the device_dirty flag only. Only
- * supported by some device APIs (others will return
- * halide_error_code_device_crop_unsupported). Call
- * halide_device_release_crop instead of halide_device_free to clean
- * up resources associated with the cropped view. Do not free the
- * device allocation on the source buffer while the destination buffer
- * still lives. Note that the two buffers do not share dirty flags, so
- * care must be taken to update them together as needed. Note that src
- * and dst are required to have the same number of dimensions.
- *
- * Note also that (in theory) device interfaces which support cropping may
- * still not support cropping a crop (instead, create a new crop of the parent
- * buffer); in practice, no known implementation has this limitation, although
- * it is possible that some future implementations may require it. */
-extern int halide_device_crop(void *user_context,
-                              const struct halide_buffer_t *src,
-                              struct halide_buffer_t *dst);
-
-/** Give the destination buffer a device allocation which is an alias
- * for a similar coordinate range in the source buffer, but with one dimension
- * sliced away in the dst. Modifies the device, device_interface, and the
- * device_dirty flag only. Only supported by some device APIs (others will return
- * halide_error_code_device_crop_unsupported). Call
- * halide_device_release_crop instead of halide_device_free to clean
- * up resources associated with the sliced view. Do not free the
- * device allocation on the source buffer while the destination buffer
- * still lives. Note that the two buffers do not share dirty flags, so
- * care must be taken to update them together as needed. Note that the dst buffer
- * must have exactly one fewer dimension than the src buffer, and that slice_dim
- * and slice_pos must be valid within src. */
-extern int halide_device_slice(void *user_context,
-                               const struct halide_buffer_t *src,
-                               int slice_dim, int slice_pos,
-                               struct halide_buffer_t *dst);
-
-/** Release any resources associated with a cropped/sliced view of another
- * buffer. */
-extern int halide_device_release_crop(void *user_context,
-                                      struct halide_buffer_t *buf);
-
-/** Wait for current GPU operations to complete. Calling this explicitly
- * should rarely be necessary, except maybe for profiling. */
-extern int halide_device_sync(void *user_context, struct halide_buffer_t *buf);
-
-/** Allocate device memory to back a halide_buffer_t. */
-extern int halide_device_malloc(void *user_context, struct halide_buffer_t *buf,
-                                const struct halide_device_interface_t *device_interface);
-
-/** Free device memory. */
-extern int halide_device_free(void *user_context, struct halide_buffer_t *buf);
-
-/** Wrap or detach a native device handle, setting the device field
- * and device_interface field as appropriate for the given GPU
- * API.
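- *
- * For example (an illustrative sketch, assuming the CUDA runtime is in
- * use; halide_cuda_device_interface() comes from HalideRuntimeCuda.h,
- * not from this header), an existing CUdeviceptr can be wrapped via the
- * generic entry point:
- *
- *     halide_device_wrap_native(nullptr, &buf, (uint64_t)cuda_ptr,
- *                               halide_cuda_device_interface());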
- * The meaning of the opaque handle is specific to the device
- * interface, so if you know the device interface in use, call the
- * more specific functions in the runtime headers for your specific
- * device API instead (e.g. HalideRuntimeCuda.h). */
-// @{
-extern int halide_device_wrap_native(void *user_context,
-                                     struct halide_buffer_t *buf,
-                                     uint64_t handle,
-                                     const struct halide_device_interface_t *device_interface);
-extern int halide_device_detach_native(void *user_context, struct halide_buffer_t *buf);
-// @}
-
-/** Versions of the above functions that accept legacy buffer_t structs. */
-// @{
-extern int halide_copy_to_host_legacy(void *user_context, struct buffer_t *buf);
-extern int halide_copy_to_device_legacy(void *user_context, struct buffer_t *buf,
-                                        const struct halide_device_interface_t *device_interface);
-extern int halide_device_sync_legacy(void *user_context, struct buffer_t *buf);
-extern int halide_device_malloc_legacy(void *user_context, struct buffer_t *buf,
-                                       const struct halide_device_interface_t *device_interface);
-extern int halide_device_free_legacy(void *user_context, struct buffer_t *buf);
-// @}
-
-/** Selects which gpu device to use. 0 is usually the display
- * device. If never called, Halide uses the environment variable
- * HL_GPU_DEVICE. If that variable is unset, Halide uses the last
- * device. Set this to -1 to use the last device. */
-extern void halide_set_gpu_device(int n);
-
-/** Halide calls this to get the desired halide gpu device
- * setting. Implement this yourself to use a different gpu device per
- * user_context. The default implementation returns the value set by
- * halide_set_gpu_device, or the environment variable
- * HL_GPU_DEVICE. */
-extern int halide_get_gpu_device(void *user_context);
-
-/** Set the soft maximum amount of memory, in bytes, that the LRU
- * cache will use to memoize Func results. This is not a strict
- * maximum in that concurrency and simultaneous use of memoized
- * results larger than the cache size can both cause it to
- * temporarily be larger than the size specified here.
- */
-extern void halide_memoization_cache_set_size(int64_t size);
-
-/** Given a cache key for a memoized result, currently constructed
- * from the Func name and top-level Func name plus the arguments of
- * the computation, determine if the result is in the cache and
- * return it if so. (The internals of the cache key should be
- * considered opaque by this function.) If this routine returns true,
- * it is a cache miss. Otherwise, it will return false and the
- * buffers passed in will be filled, via copying, with memoized
- * data. The last argument is a list of halide_buffer_t pointers which
- * represents the outputs of the memoized Func. If the Func does not
- * return a Tuple, there will only be one halide_buffer_t in the list. The
- * tuple_count parameter determines the length of the list.
- *
- * The return values are:
- * -1: Signals an error.
- *  0: Success and cache hit.
- *  1: Success and cache miss.
- */
-extern int halide_memoization_cache_lookup(void *user_context, const uint8_t *cache_key, int32_t size,
-                                           struct halide_buffer_t *realized_bounds,
-                                           int32_t tuple_count, struct halide_buffer_t **tuple_buffers);
-
-/** Given a cache key for a memoized result, currently constructed
- * from the Func name and top-level Func name plus the arguments of
- * the computation, store the result in the cache for future access by
- * halide_memoization_cache_lookup.
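- *
- * (An illustrative calling sequence, not part of the original header;
- * out_buf, key, key_size, bounds, and compute() are placeholders:
- *
- *     struct halide_buffer_t *bufs[1] = {&out_buf};
- *     int r = halide_memoization_cache_lookup(ctx, key, key_size, bounds, 1, bufs);
- *     if (r == 1) {  // cache miss: compute the result, then cache it
- *         compute(bufs[0]);
- *         halide_memoization_cache_store(ctx, key, key_size, bounds, 1, bufs);
- *     }
- *     // ... use bufs[0], then release the cache's reference:
- *     halide_memoization_cache_release(ctx, bufs[0]->host);
- *
- * where compute() stands in for the memoized Func's body.)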
- * (The internals of the cache key should be considered opaque by this
- * function.) Data is copied out from the inputs and inputs are
- * unmodified. The last argument is a list of halide_buffer_t pointers
- * which represents the outputs of the memoized Func. If the Func does
- * not return a Tuple, there will only be one halide_buffer_t in the
- * list. The tuple_count parameter determines the length of the list.
- *
- * If there is a memory allocation failure, the store does not store
- * the data into the cache.
- */
-extern int halide_memoization_cache_store(void *user_context, const uint8_t *cache_key, int32_t size,
-                                          struct halide_buffer_t *realized_bounds,
-                                          int32_t tuple_count,
-                                          struct halide_buffer_t **tuple_buffers);
-
-/** If halide_memoization_cache_lookup succeeds,
- * halide_memoization_cache_release must be called to signal the
- * storage is no longer being used by the caller. It will be passed
- * the host pointer of one of the buffers returned by
- * halide_memoization_cache_lookup. That is,
- * halide_memoization_cache_release will be called multiple times for
- * the case where halide_memoization_cache_lookup is handling multiple
- * buffers. (This corresponds to memoizing a Tuple in Halide.) Note
- * that the host pointer must be sufficient to get to all information
- * the release operation needs. The default Halide cache implementation
- * accomplishes this by storing extra data before the start of the user
- * modifiable host storage.
- *
- * This call is like free and does not have a failure return.
- */
-extern void halide_memoization_cache_release(void *user_context, void *host);
-
-/** Free all memory and resources associated with the memoization cache.
- * Must be called at a time when no other threads are accessing the cache.
- */
-extern void halide_memoization_cache_cleanup();
-
-/** Annotate that a given range of memory has been initialized;
- * only used when Target::MSAN is enabled.
- *
- * The default implementation uses the LLVM-provided AnnotateMemoryIsInitialized() function.
- */
-extern int halide_msan_annotate_memory_is_initialized(void *user_context, const void *ptr, uint64_t len);
-
-/** Mark the data pointed to by the buffer_t as initialized (but *not* the buffer_t itself),
- * using halide_msan_annotate_memory_is_initialized() for marking.
- *
- * The default implementation takes pains to only mark the active memory ranges
- * (skipping padding), and sorting into ranges to always mark the smallest number of
- * ranges, in monotonically increasing memory order.
- *
- * Most client code should never need to replace the default implementation.
- */
-extern int halide_msan_annotate_buffer_is_initialized(void *user_context, struct halide_buffer_t *buffer);
-extern void halide_msan_annotate_buffer_is_initialized_as_destructor(void *user_context, void *buffer);
-
-/** The error codes that may be returned by a Halide pipeline. */
-enum halide_error_code_t {
-    /** There was no error. This is the value returned by Halide on success. */
-    halide_error_code_success = 0,
-
-    /** An uncategorized error occurred. Refer to the string passed to halide_error. */
-    halide_error_code_generic_error = -1,
-
-    /** A Func was given an explicit bound via Func::bound, but this
-     * was not large enough to encompass the region that is used of
-     * the Func by the rest of the pipeline. */
-    halide_error_code_explicit_bounds_too_small = -2,
-
-    /** The elem_size field of a halide_buffer_t does not match the size in
-     * bytes of the type of that ImageParam.
-     * Probable type mismatch. */
-    halide_error_code_bad_type = -3,
-
-    /** A pipeline would access memory outside of the halide_buffer_t passed
-     * in. */
-    halide_error_code_access_out_of_bounds = -4,
-
-    /** A halide_buffer_t was given that spans more than 2GB of memory. */
-    halide_error_code_buffer_allocation_too_large = -5,
-
-    /** A halide_buffer_t was given with extents that multiply to a number
-     * greater than 2^31-1 */
-    halide_error_code_buffer_extents_too_large = -6,
-
-    /** Applying explicit constraints on the size of an input or
-     * output buffer shrank the size of that buffer below what will be
-     * accessed by the pipeline. */
-    halide_error_code_constraints_make_required_region_smaller = -7,
-
-    /** A constraint on a size or stride of an input or output buffer
-     * was not met by the halide_buffer_t passed in. */
-    halide_error_code_constraint_violated = -8,
-
-    /** A scalar parameter passed in was smaller than its minimum
-     * declared value. */
-    halide_error_code_param_too_small = -9,
-
-    /** A scalar parameter passed in was greater than its maximum
-     * declared value. */
-    halide_error_code_param_too_large = -10,
-
-    /** A call to halide_malloc returned NULL. */
-    halide_error_code_out_of_memory = -11,
-
-    /** A halide_buffer_t pointer passed in was NULL. */
-    halide_error_code_buffer_argument_is_null = -12,
-
-    /** debug_to_file failed to open or write to the specified
-     * file. */
-    halide_error_code_debug_to_file_failed = -13,
-
-    /** The Halide runtime encountered an error while trying to copy
-     * from device to host. Turn on -debug in your target string to
-     * see more details. */
-    halide_error_code_copy_to_host_failed = -14,
-
-    /** The Halide runtime encountered an error while trying to copy
-     * from host to device. Turn on -debug in your target string to
-     * see more details. */
-    halide_error_code_copy_to_device_failed = -15,
-
-    /** The Halide runtime encountered an error while trying to
-     * allocate memory on device. Turn on -debug in your target string
-     * to see more details. */
-    halide_error_code_device_malloc_failed = -16,
-
-    /** The Halide runtime encountered an error while trying to
-     * synchronize with a device. Turn on -debug in your target string
-     * to see more details. */
-    halide_error_code_device_sync_failed = -17,
-
-    /** The Halide runtime encountered an error while trying to free a
-     * device allocation. Turn on -debug in your target string to see
-     * more details. */
-    halide_error_code_device_free_failed = -18,
-
-    /** Buffer has a non-zero device but no device interface, which
-     * violates a Halide invariant. */
-    halide_error_code_no_device_interface = -19,
-
-    /** An error occurred when attempting to initialize the Matlab
-     * runtime. */
-    halide_error_code_matlab_init_failed = -20,
-
-    /** The type of an mxArray did not match the expected type. */
-    halide_error_code_matlab_bad_param_type = -21,
-
-    /** There is a bug in the Halide compiler. */
-    halide_error_code_internal_error = -22,
-
-    /** The Halide runtime encountered an error while trying to launch
-     * a GPU kernel. Turn on -debug in your target string to see more
-     * details. */
-    halide_error_code_device_run_failed = -23,
-
-    /** The Halide runtime encountered a host pointer that violated
-     * the alignment set for it by way of a call to
-     * set_host_alignment */
-    halide_error_code_unaligned_host_ptr = -24,
-
-    /** A fold_storage directive was used on a dimension that is not
-     * accessed in a monotonically increasing or decreasing fashion. */
-    halide_error_code_bad_fold = -25,
-
-    /** A fold_storage directive was used with a fold factor that was
-     * too small to store all the values of a producer needed by the
-     * consumer. */
-    halide_error_code_fold_factor_too_small = -26,
-
-    /** User-specified require() expression was not satisfied. */
-    halide_error_code_requirement_failed = -27,
-
-    /** At least one of the buffer's extents is negative. */
-    halide_error_code_buffer_extents_negative = -28,
-
-    /** A compiled pipeline was passed the old deprecated buffer_t
-     * struct, and it could not be upgraded to a halide_buffer_t. */
-    halide_error_code_failed_to_upgrade_buffer_t = -29,
-
-    /** A compiled pipeline was passed the old deprecated buffer_t
-     * struct in bounds inference mode, but the returned information
-     * can't be expressed in the old buffer_t. */
-    halide_error_code_failed_to_downgrade_buffer_t = -30,
-
-    /** A specialize_fail() schedule branch was selected at runtime. */
-    halide_error_code_specialize_fail = -31,
-
-    /** The Halide runtime encountered an error while trying to wrap a
-     * native device handle. Turn on -debug in your target string to
-     * see more details. */
-    halide_error_code_device_wrap_native_failed = -32,
-
-    /** The Halide runtime encountered an error while trying to detach
-     * a native device handle. Turn on -debug in your target string
-     * to see more details. */
-    halide_error_code_device_detach_native_failed = -33,
-
-    /** The host field on an input or output was null, the device
-     * field was not zero, and the pipeline tries to use the buffer on
-     * the host. You may be passing a GPU-only buffer to a pipeline
-     * which is scheduled to use it on the CPU. */
-    halide_error_code_host_is_null = -34,
-
-    /** A folded buffer was passed to an extern stage, but the region
-     * touched wraps around the fold boundary. */
-    halide_error_code_bad_extern_fold = -35,
-
-    /** Buffer has a non-null device_interface but device is 0, which
-     * violates a Halide invariant. */
-    halide_error_code_device_interface_no_device = -36,
-
-    /** Buffer has both host and device dirty bits set, which violates
-     * a Halide invariant. */
-    halide_error_code_host_and_device_dirty = -37,
-
-    /** The halide_buffer_t * passed to a halide runtime routine is
-     * nullptr and this is not allowed. */
-    halide_error_code_buffer_is_null = -38,
-
-    /** The Halide runtime encountered an error while trying to copy
-     * from one buffer to another. Turn on -debug in your target
-     * string to see more details. */
-    halide_error_code_device_buffer_copy_failed = -39,
-
-    /** Attempted to make cropped/sliced alias of a buffer with a device
-     * field, but the device_interface does not support cropping. */
-    halide_error_code_device_crop_unsupported = -40,
-
-    /** Cropping/slicing a buffer failed for some other reason. Turn on -debug
-     * in your target string. */
-    halide_error_code_device_crop_failed = -41,
-
-    /** An operation on a buffer required an allocation on a
-     * particular device interface, but a device allocation already
-     * existed on a different device interface. Free the old one
-     * first. */
-    halide_error_code_incompatible_device_interface = -42,
-
-    /** The dimensions field of a halide_buffer_t does not match the dimensions of that ImageParam. */
-    halide_error_code_bad_dimensions = -43,
-
-    /** An expression that would perform an integer division or modulo
-     * by zero was evaluated. */
-    halide_error_code_integer_division_by_zero = -44,
-
-};
-
-/** Halide calls the functions below on various error conditions.
- * The default implementations construct an error message, call
- * halide_error, then return the matching error code above. On
- * platforms that support weak linking, you can override these to
- * catch the errors individually. */
-
-/** A call into an extern stage for the purposes of bounds inference
- * failed. Returns the error code given by the extern stage. */
-extern int halide_error_bounds_inference_call_failed(void *user_context, const char *extern_stage_name, int result);
-
-/** A call to an extern stage failed. Returns the error code given by
- * the extern stage. */
-extern int halide_error_extern_stage_failed(void *user_context, const char *extern_stage_name, int result);
-
-/** Various other error conditions. See the enum above for a
- * description of each. */
-// @{
-extern int halide_error_explicit_bounds_too_small(void *user_context, const char *func_name, const char *var_name,
-                                                  int min_bound, int max_bound, int min_required, int max_required);
-extern int halide_error_bad_type(void *user_context, const char *func_name,
-                                 uint32_t type_given, uint32_t correct_type);  // N.B. The last two args are the bit representation of a halide_type_t
-extern int halide_error_bad_dimensions(void *user_context, const char *func_name,
-                                       int32_t dimensions_given, int32_t correct_dimensions);
-extern int halide_error_access_out_of_bounds(void *user_context, const char *func_name,
-                                             int dimension, int min_touched, int max_touched,
-                                             int min_valid, int max_valid);
-extern int halide_error_buffer_allocation_too_large(void *user_context, const char *buffer_name,
-                                                    uint64_t allocation_size, uint64_t max_size);
-extern int halide_error_buffer_extents_negative(void *user_context, const char *buffer_name, int dimension, int extent);
-extern int halide_error_buffer_extents_too_large(void *user_context, const char *buffer_name,
-                                                 int64_t actual_size, int64_t max_size);
-extern int halide_error_constraints_make_required_region_smaller(void *user_context, const char *buffer_name,
-                                                                 int dimension,
-                                                                 int constrained_min, int constrained_extent,
-                                                                 int required_min, int required_extent);
-extern int halide_error_constraint_violated(void *user_context, const char *var, int val,
-                                            const char *constrained_var, int constrained_val);
-extern int halide_error_param_too_small_i64(void *user_context, const char *param_name,
-                                            int64_t val, int64_t min_val);
-extern int halide_error_param_too_small_u64(void *user_context, const char *param_name,
-                                            uint64_t val, uint64_t min_val);
-extern int halide_error_param_too_small_f64(void *user_context, const char *param_name,
-                                            double val, double min_val);
-extern int halide_error_param_too_large_i64(void *user_context, const char *param_name,
-                                            int64_t val, int64_t max_val);
-extern int halide_error_param_too_large_u64(void *user_context, const char *param_name,
-                                            uint64_t val, uint64_t max_val);
-extern int halide_error_param_too_large_f64(void *user_context, const char *param_name,
-                                            double val, double max_val);
-extern int halide_error_out_of_memory(void *user_context);
-extern int halide_error_buffer_argument_is_null(void *user_context, const char *buffer_name);
-extern int halide_error_debug_to_file_failed(void *user_context, const char *func,
-                                             const char *filename, int error_code);
-extern int halide_error_unaligned_host_ptr(void *user_context, const char *func_name, int alignment);
-extern int halide_error_host_is_null(void *user_context, const char *func_name);
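-/* For example (an illustrative sketch, not part of the original header):
- * on platforms with weak linking, an application can intercept a single
- * error condition by defining the matching symbol itself, e.g.
- *
- *     extern "C" int halide_error_out_of_memory(void *user_context) {
- *         // my_log is a hypothetical application logger.
- *         my_log(user_context, "Halide pipeline ran out of memory");
- *         return halide_error_code_out_of_memory;
- *     }
- */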
-extern int halide_error_failed_to_upgrade_buffer_t(void *user_context,
-                                                   const char *input_name,
-                                                   const char *reason);
-extern int halide_error_failed_to_downgrade_buffer_t(void *user_context,
-                                                     const char *input_name,
-                                                     const char *reason);
-extern int halide_error_bad_fold(void *user_context, const char *func_name, const char *var_name,
-                                 const char *loop_name);
-extern int halide_error_bad_extern_fold(void *user_context, const char *func_name,
-                                        int dim, int min, int extent, int valid_min, int fold_factor);
-
-extern int halide_error_fold_factor_too_small(void *user_context, const char *func_name, const char *var_name,
-                                              int fold_factor, const char *loop_name, int required_extent);
-extern int halide_error_requirement_failed(void *user_context, const char *condition, const char *message);
-extern int halide_error_specialize_fail(void *user_context, const char *message);
-extern int halide_error_no_device_interface(void *user_context);
-extern int halide_error_device_interface_no_device(void *user_context);
-extern int halide_error_host_and_device_dirty(void *user_context);
-extern int halide_error_buffer_is_null(void *user_context, const char *routine);
-extern int halide_error_integer_division_by_zero(void *user_context);
-// @}
-
-/** Optional features a compilation Target can have.
- * Be sure to keep this in sync with the Feature enum in Target.h and the implementation of
- * get_runtime_compatible_target in Target.cpp if you add a new feature.
- */
-typedef enum halide_target_feature_t {
-    halide_target_feature_jit = 0,  ///< Generate code that will run immediately inside the calling process.
-    halide_target_feature_debug,  ///< Turn on debug info and output for runtime code.
-    halide_target_feature_no_asserts,  ///< Disable all runtime checks, for slightly tighter code.
-    halide_target_feature_no_bounds_query,  ///< Disable the bounds querying functionality.
-
-    halide_target_feature_sse41,  ///< Use SSE 4.1 and earlier instructions. Only relevant on x86.
-    halide_target_feature_avx,  ///< Use AVX 1 instructions. Only relevant on x86.
-    halide_target_feature_avx2,  ///< Use AVX 2 instructions. Only relevant on x86.
-    halide_target_feature_fma,  ///< Enable x86 FMA instruction
-    halide_target_feature_fma4,  ///< Enable x86 (AMD) FMA4 instruction set
-    halide_target_feature_f16c,  ///< Enable x86 16-bit float support
-
-    halide_target_feature_armv7s,  ///< Generate code for ARMv7s. Only relevant for 32-bit ARM.
-    halide_target_feature_no_neon,  ///< Avoid using NEON instructions. Only relevant for 32-bit ARM.
-
-    halide_target_feature_vsx,  ///< Use VSX instructions. Only relevant on POWERPC.
-    halide_target_feature_power_arch_2_07,  ///< Use POWER ISA 2.07 new instructions. Only relevant on POWERPC.
-
-    halide_target_feature_cuda,  ///< Enable the CUDA runtime. Defaults to compute capability 2.0 (Fermi)
-    halide_target_feature_cuda_capability30,  ///< Enable CUDA compute capability 3.0 (Kepler)
-    halide_target_feature_cuda_capability32,  ///< Enable CUDA compute capability 3.2 (Tegra K1)
-    halide_target_feature_cuda_capability35,  ///< Enable CUDA compute capability 3.5 (Kepler)
-    halide_target_feature_cuda_capability50,  ///< Enable CUDA compute capability 5.0 (Maxwell)
-
-    halide_target_feature_opencl,  ///< Enable the OpenCL runtime.
-    halide_target_size_opencl,  ///<
-    halide_target_feature_cl_doubles,  ///< Enable double support on OpenCL targets
-    halide_target_feature_cl_atomic64,  ///< Enable 64-bit atomics operations on OpenCL targets
-
-    halide_target_feature_opengl,  ///< Enable the OpenGL runtime.
-    halide_target_feature_openglcompute,  ///< Enable OpenGL Compute runtime.
-
-    halide_target_feature_user_context,  ///< Generated code takes a user_context pointer as first argument
-
-    halide_target_feature_matlab,  ///< Generate a mexFunction compatible with Matlab mex libraries. See tools/mex_halide.m.
-
-    halide_target_feature_profile,  ///< Launch a sampling profiler alongside the Halide pipeline that monitors and reports the runtime used by each Func
-    halide_target_feature_no_runtime,  ///< Do not include a copy of the Halide runtime in any generated object file or assembly
-
-    halide_target_feature_metal,  ///< Enable the (Apple) Metal runtime.
-    halide_target_feature_mingw,  ///< For Windows compile to MinGW toolset rather than Visual Studio
-
-    halide_target_feature_c_plus_plus_mangling,  ///< Generate C++ mangled names for result function, et al
-
-    halide_target_feature_large_buffers,  ///< Enable 64-bit buffer indexing to support buffers > 2GB. Ignored if bits != 64.
-
-    halide_target_feature_hvx_64,  ///< Enable HVX 64 byte mode.
-    halide_target_feature_hvx_128,  ///< Enable HVX 128 byte mode.
-    halide_target_feature_hvx_v62,  ///< Enable Hexagon v62 architecture.
-    halide_target_feature_fuzz_float_stores,  ///< On every floating point store, set the last bit of the mantissa to zero. Pipelines for which the output is very different with this feature enabled may also produce very different output on different processors.
-    halide_target_feature_soft_float_abi,  ///< Enable soft float ABI. This only enables the soft float ABI calling convention, which does not necessarily use soft floats.
-    halide_target_feature_msan,  ///< Enable hooks for MSAN support.
-    halide_target_feature_avx512,  ///< Enable the base AVX512 subset supported by all AVX512 architectures. The specific feature sets are AVX-512F and AVX512-CD. See https://en.wikipedia.org/wiki/AVX-512 for a description of each AVX subset.
-    halide_target_feature_avx512_knl,  ///< Enable the AVX512 features supported by Knight's Landing chips, such as the Xeon Phi x200. This includes the base AVX512 set, and also AVX512-CD and AVX512-ER.
-    halide_target_feature_avx512_skylake,  ///< Enable the AVX512 features supported by Skylake Xeon server processors. This adds AVX512-VL, AVX512-BW, and AVX512-DQ to the base set. The main difference from the base AVX512 set is better support for small integer ops. Note that this does not include the Knight's Landing features. Note also that these features are not available on Skylake desktop and mobile processors.
-    halide_target_feature_avx512_cannonlake,  ///< Enable the AVX512 features expected to be supported by future Cannonlake processors. This includes all of the Skylake features, plus AVX512-IFMA and AVX512-VBMI.
-    halide_target_feature_hvx_use_shared_object,  ///< Deprecated
-    halide_target_feature_trace_loads,  ///< Trace all loads done by the pipeline. Equivalent to calling Func::trace_loads on every non-inlined Func.
-    halide_target_feature_trace_stores,  ///< Trace all stores done by the pipeline. Equivalent to calling Func::trace_stores on every non-inlined Func.
-    halide_target_feature_trace_realizations,  ///< Trace all realizations done by the pipeline. Equivalent to calling Func::trace_realizations on every non-inlined Func.
-    halide_target_feature_trace_pipeline,  ///< Trace the pipeline.
-    halide_target_feature_cuda_capability61,  ///< Enable CUDA compute capability 6.1 (Pascal)
-    halide_target_feature_hvx_v65,  ///< Enable Hexagon v65 architecture.
-    halide_target_feature_hvx_v66,  ///< Enable Hexagon v66 architecture.
- halide_target_feature_cl_half, ///< Enable half support on OpenCL targets - halide_target_feature_strict_float, ///< Turn off all non-IEEE floating-point optimization. Currently applies only to LLVM targets. - halide_target_feature_legacy_buffer_wrappers, ///< Emit legacy wrapper code for buffer_t (vs halide_buffer_t) when AOT-compiled. - halide_target_feature_tsan, ///< Enable hooks for TSAN support. - halide_target_feature_asan, ///< Enable hooks for ASAN support. - halide_target_feature_d3d12compute, ///< Enable Direct3D 12 Compute runtime. - halide_target_feature_check_unsafe_promises, ///< Insert assertions for promises. - halide_target_feature_hexagon_dma, ///< Enable Hexagon DMA buffers. - halide_target_feature_embed_bitcode, ///< Emulate clang -fembed-bitcode flag. - halide_target_feature_enable_llvm_loop_opt, ///< Enable loop vectorization + unrolling in LLVM. Overrides halide_target_feature_disable_llvm_loop_opt. (Ignored for non-LLVM targets.) - halide_target_feature_disable_llvm_loop_opt, ///< Disable loop vectorization + unrolling in LLVM. (Ignored for non-LLVM targets.) - halide_target_feature_wasm_simd128, ///< Enable +simd128 instructions for WebAssembly codegen. - halide_target_feature_wasm_signext, ///< Enable +sign-ext instructions for WebAssembly codegen. - halide_target_feature_sve, ///< Enable ARM Scalable Vector Extensions - halide_target_feature_sve2, ///< Enable ARM Scalable Vector Extensions v2 - halide_target_feature_egl, ///< Force use of EGL support. - halide_target_feature_intel_fpga, ///< Enable Intel FPGAs - halide_target_feature_one_api, ///< Enable Intel OneAPI dpcpp program generation - halide_target_feature_intel_gpu, ///< Enable Intel Graphics - halide_target_feature_enable_synthesis, ///< Enable synthesizing binaries. Currently used only for Intel FPGAs. - halide_target_feature_end ///< A sentinel. Every target is considered to have this feature, and setting this feature does nothing. -} halide_target_feature_t; - -/** This function is called internally by Halide in some situations to determine - * if the current execution environment can support the given set of - * halide_target_feature_t flags. The implementation must do the following: - * - * -- If there are flags set in features that the function knows *cannot* be supported, return 0. - * -- Otherwise, return 1. - * -- Note that any flags set in features that the function doesn't know how to test should be ignored; - * this implies that a return value of 1 means "not known to be bad" rather than "known to be good". - * - * In other words: a return value of 0 means "It is not safe to use code compiled with these features", - * while a return value of 1 means "It is not obviously unsafe to use code compiled with these features". - * - * The default implementation simply calls halide_default_can_use_target_features. - * - * Note that `features` points to an array of `count` uint64_t; this array must contain enough - * bits to represent all the currently known features. Any excess bits must be set to zero. 
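- *
- * (An illustrative note, not part of the original comment: the number of
- * words needed to hold all currently known features is
- *
- *     int count = (halide_target_feature_end + 63) / 64;
- *
- * and a feature f occupies bit (f & 63) of word (f >> 6), as the example
- * under halide_default_can_use_target_features below shows.)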
- */ -// @{ -extern int halide_can_use_target_features(int count, const uint64_t *features); -typedef int (*halide_can_use_target_features_t)(int count, const uint64_t *features); -extern halide_can_use_target_features_t halide_set_custom_can_use_target_features(halide_can_use_target_features_t); -// @} - -/** - * This is the default implementation of halide_can_use_target_features; it is provided - * for convenience of user code that may wish to extend halide_can_use_target_features - * but continue providing existing support, e.g. - * - * int halide_can_use_target_features(int count, const uint64_t *features) { - * if (features[halide_target_somefeature >> 6] & (1LL << (halide_target_somefeature & 63))) { - * if (!can_use_somefeature()) { - * return 0; - * } - * } - * return halide_default_can_use_target_features(count, features); - * } - */ -extern int halide_default_can_use_target_features(int count, const uint64_t *features); - - -typedef struct halide_dimension_t { - int32_t min, extent, stride; - - // Per-dimension flags. None are defined yet (This is reserved for future use). - uint32_t flags; - -#ifdef __cplusplus - HALIDE_ALWAYS_INLINE halide_dimension_t() : min(0), extent(0), stride(0), flags(0) {} - HALIDE_ALWAYS_INLINE halide_dimension_t(int32_t m, int32_t e, int32_t s, uint32_t f = 0) : - min(m), extent(e), stride(s), flags(f) {} - - HALIDE_ALWAYS_INLINE bool operator==(const halide_dimension_t &other) const { - return (min == other.min) && - (extent == other.extent) && - (stride == other.stride) && - (flags == other.flags); - } - - HALIDE_ALWAYS_INLINE bool operator!=(const halide_dimension_t &other) const { - return !(*this == other); - } -#endif -} halide_dimension_t; - -#ifdef __cplusplus -} // extern "C" -#endif - -typedef enum {halide_buffer_flag_host_dirty = 1, - halide_buffer_flag_device_dirty = 2} halide_buffer_flags; - -/** - * The raw representation of an image passed around by generated - * Halide code. It includes some stuff to track whether the image is - * not actually in main memory, but instead on a device (like a - * GPU). For a more convenient C++ wrapper, use Halide::Buffer. */ -typedef struct halide_buffer_t { - /** A device-handle for e.g. GPU memory used to back this buffer. */ - uint64_t device; - - /** The interface used to interpret the above handle. */ - const struct halide_device_interface_t *device_interface; - - /** A pointer to the start of the data in main memory. In terms of - * the Halide coordinate system, this is the address of the min - * coordinates (defined below). */ - uint8_t* host; - - /** flags with various meanings. */ - uint64_t flags; - - /** The type of each buffer element. */ - struct halide_type_t type; - - /** The dimensionality of the buffer. */ - int32_t dimensions; - - /** The shape of the buffer. Halide does not own this array - you - * must manage the memory for it yourself. 
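- *
- * For example (an illustrative sketch, not part of the original comment),
- * wrapping an existing dense W x H array of floats:
- *
- *     halide_dimension_t shape[2] = {{0, W, 1}, {0, H, W}};
- *     halide_buffer_t buf = {0};
- *     buf.host = (uint8_t *)data;                       // data, W, H are yours
- *     buf.type = halide_type_t(halide_type_float, 32);  // 32-bit float, 1 lane
- *     buf.dimensions = 2;
- *     buf.dim = shape;                                  // shape must outlive buf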
- */
-    halide_dimension_t *dim;
-
-    /** Pads the buffer up to a multiple of 8 bytes */
-    void *padding;
-
-#ifdef __cplusplus
-    /** Convenience methods for accessing the flags */
-    // @{
-    HALIDE_ALWAYS_INLINE bool get_flag(halide_buffer_flags flag) const {
-        return (flags & flag) != 0;
-    }
-
-    HALIDE_ALWAYS_INLINE void set_flag(halide_buffer_flags flag, bool value) {
-        if (value) {
-            flags |= flag;
-        } else {
-            flags &= ~flag;
-        }
-    }
-
-    HALIDE_ALWAYS_INLINE bool host_dirty() const {
-        return get_flag(halide_buffer_flag_host_dirty);
-    }
-
-    HALIDE_ALWAYS_INLINE bool device_dirty() const {
-        return get_flag(halide_buffer_flag_device_dirty);
-    }
-
-    HALIDE_ALWAYS_INLINE void set_host_dirty(bool v = true) {
-        set_flag(halide_buffer_flag_host_dirty, v);
-    }
-
-    HALIDE_ALWAYS_INLINE void set_device_dirty(bool v = true) {
-        set_flag(halide_buffer_flag_device_dirty, v);
-    }
-    // @}
-
-    /** The total number of elements this buffer represents. Equal to
-     * the product of the extents */
-    HALIDE_ALWAYS_INLINE size_t number_of_elements() const {
-        size_t s = 1;
-        for (int i = 0; i < dimensions; i++) {
-            s *= dim[i].extent;
-        }
-        return s;
-    }
-
-    /** A pointer to the element with the lowest address. If all
-     * strides are positive, equal to the host pointer. */
-    HALIDE_ALWAYS_INLINE uint8_t *begin() const {
-        ptrdiff_t index = 0;
-        for (int i = 0; i < dimensions; i++) {
-            if (dim[i].stride < 0) {
-                index += dim[i].stride * (dim[i].extent - 1);
-            }
-        }
-        return host + index * type.bytes();
-    }
-
-    /** A pointer to one beyond the element with the highest address. */
-    HALIDE_ALWAYS_INLINE uint8_t *end() const {
-        ptrdiff_t index = 0;
-        for (int i = 0; i < dimensions; i++) {
-            if (dim[i].stride > 0) {
-                index += dim[i].stride * (dim[i].extent - 1);
-            }
-        }
-        index += 1;
-        return host + index * type.bytes();
-    }
-
-    /** The total number of bytes spanned by the data in memory. */
-    HALIDE_ALWAYS_INLINE size_t size_in_bytes() const {
-        return (size_t)(end() - begin());
-    }
-
-    /** A pointer to the element at the given location. */
-    HALIDE_ALWAYS_INLINE uint8_t *address_of(const int *pos) const {
-        ptrdiff_t index = 0;
-        for (int i = 0; i < dimensions; i++) {
-            index += dim[i].stride * (pos[i] - dim[i].min);
-        }
-        return host + index * type.bytes();
-    }
-
-    /** Attempt to call device_sync for the buffer. If the buffer
-     * has no device_interface (or no device_sync), this is a quiet no-op.
-     * Calling this explicitly should rarely be necessary, except for profiling. */
-    HALIDE_ALWAYS_INLINE int device_sync(void *ctx = NULL) {
-        if (device_interface && device_interface->device_sync) {
-            return device_interface->device_sync(ctx, this);
-        }
-        return 0;
-    }
-
-    /** Check if an input buffer passed to an extern stage is querying
-     * bounds. Compared to doing the host pointer check directly,
-     * this both adds clarity to code and will facilitate moving to
-     * another representation for bounds query arguments. */
-    HALIDE_ALWAYS_INLINE bool is_bounds_query() const {
-        return host == NULL && device == 0;
-    }
-
-#endif
-} halide_buffer_t;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef HALIDE_ATTRIBUTE_DEPRECATED
-#ifdef HALIDE_ALLOW_DEPRECATED
-#define HALIDE_ATTRIBUTE_DEPRECATED(x)
-#else
-#ifdef _MSC_VER
-#define HALIDE_ATTRIBUTE_DEPRECATED(x) __declspec(deprecated(x))
-#else
-#define HALIDE_ATTRIBUTE_DEPRECATED(x) __attribute__((deprecated(x)))
-#endif
-#endif
-#endif
-
-/** The old buffer_t, included for compatibility with old code. Don't
- * use it.
*/ -#ifndef BUFFER_T_DEFINED -#define BUFFER_T_DEFINED -typedef struct buffer_t { - uint64_t dev; - uint8_t* host; - int32_t extent[4]; - int32_t stride[4]; - int32_t min[4]; - int32_t elem_size; - HALIDE_ATTRIBUTE_ALIGN(1) bool host_dirty; - HALIDE_ATTRIBUTE_ALIGN(1) bool dev_dirty; - HALIDE_ATTRIBUTE_ALIGN(1) uint8_t _padding[10 - sizeof(void *)]; -} buffer_t; -#endif // BUFFER_T_DEFINED - -/** Copies host pointer, mins, extents, strides, and device state from - * an old-style buffer_t into a new-style halide_buffer_t. If bounds_query_only is nonzero, - * the copy is only done if the old_buf has null host and dev (ie, a bounds query is being - * performed); otherwise new_buf is left untouched. (This is used for input buffers to avoid - * benign data races.) The dimensions and type fields of the new buffer_t should already be - * set. Returns an error code if the upgrade could not be performed. */ -extern int halide_upgrade_buffer_t(void *user_context, const char *name, - const buffer_t *old_buf, halide_buffer_t *new_buf, - int bounds_query_only); - -/** Copies the host pointer, mins, extents, strides, and device state - * from a halide_buffer_t to a buffer_t. Also sets elem_size. Useful - * for backporting the results of bounds inference. */ -extern int halide_downgrade_buffer_t(void *user_context, const char *name, - const halide_buffer_t *new_buf, buffer_t *old_buf); - -/** Copies the dirty flags and device allocation state from a new - * buffer_t back to a legacy buffer_t. */ -extern int halide_downgrade_buffer_t_device_fields(void *user_context, const char *name, - const halide_buffer_t *new_buf, buffer_t *old_buf); - -/** halide_scalar_value_t is a simple union able to represent all the well-known - * scalar values in a filter argument. Note that it isn't tagged with a type; - * you must ensure you know the proper type before accessing. Most user - * code will never need to create instances of this struct; its primary use - * is to hold def/min/max values in a halide_filter_argument_t. (Note that - * this is conceptually just a union; it's wrapped in a struct to ensure - * that it doesn't get anonymized by LLVM.) - */ -struct halide_scalar_value_t { - union { - bool b; - int8_t i8; - int16_t i16; - int32_t i32; - int64_t i64; - uint8_t u8; - uint16_t u16; - uint32_t u32; - uint64_t u64; - float f32; - double f64; - void *handle; - } u; - #ifdef __cplusplus - HALIDE_ALWAYS_INLINE halide_scalar_value_t() {u.u64 = 0;} - #endif -}; - -enum halide_argument_kind_t { - halide_argument_kind_input_scalar = 0, - halide_argument_kind_input_buffer = 1, - halide_argument_kind_output_buffer = 2 -}; - -/* - These structs must be robust across different compilers and settings; when - modifying them, strive for the following rules: - - 1) All fields are explicitly sized. I.e. must use int32_t and not "int" - 2) All fields must land on an alignment boundary that is the same as their size - 3) Explicit padding is added to make that so - 4) The sizeof the struct is padded out to a multiple of the largest natural size thing in the struct - 5) don't forget that 32 and 64 bit pointers are different sizes -*/ - -/** - * Obsolete version of halide_filter_argument_t; only present in - * code that wrote halide_filter_metadata_t version 0. 
- */
-struct halide_filter_argument_t_v0 {
-    const char *name;
-    int32_t kind;
-    int32_t dimensions;
-    struct halide_type_t type;
-    const struct halide_scalar_value_t *def, *min, *max;
-};
-
-/**
- * halide_filter_argument_t is essentially a plain-C-struct equivalent to
- * Halide::Argument; most user code will never need to create one.
- */
-struct halide_filter_argument_t {
-    const char *name;    // name of the argument; will never be null or empty.
-    int32_t kind;        // actually halide_argument_kind_t
-    int32_t dimensions;  // always zero for scalar arguments
-    struct halide_type_t type;
-    // These pointers should always be null for buffer arguments,
-    // and *may* be null for scalar arguments. (A null value means
-    // there is no def/min/max/estimate specified for this argument.)
-    const struct halide_scalar_value_t *scalar_def, *scalar_min, *scalar_max, *scalar_estimate;
-    // This pointer should always be null for scalar arguments,
-    // and *may* be null for buffer arguments. If not null, it should always
-    // point to an array of dimensions*2 pointers, which will be the (min, extent)
-    // estimates for each dimension of the buffer. (Note that any of the pointers
-    // may be null as well.)
-    int64_t const* const* buffer_estimates;
-};
-
-struct halide_filter_metadata_t {
-#ifdef __cplusplus
-    static const int32_t VERSION = 1;
-#endif
-
-    /** version of this metadata; currently always 1. */
-    int32_t version;
-
-    /** The number of entries in the arguments field. This is always >= 1. */
-    int32_t num_arguments;
-
-    /** An array of the filter's input and output arguments; this will never be
-     * null. The order of arguments is not guaranteed (input and output arguments
-     * may come in any order); however, it is guaranteed that all arguments
-     * will have a unique name within a given filter. */
-    const struct halide_filter_argument_t* arguments;
-
-    /** The Target for which the filter was compiled. This is always
-     * a canonical Target string (ie a product of Target::to_string). */
-    const char* target;
-
-    /** The function name of the filter. */
-    const char* name;
-};
-
-/** halide_register_argv_and_metadata() is a **user-defined** function that
- * must be provided in order to use the registration.cc files produced
- * by Generators when the 'registration' output is requested. Each registration.cc
- * file provides a static initializer that calls this function with the given
- * filter's argv-call variant, its metadata, and (optionally) additional
- * textual data that the build system chooses to tack on for its own purposes.
- * Note that this will be called at static-initializer time (i.e., before
- * main() is called), and in an unpredictable order. Note that extra_key_value_pairs
- * may be nullptr; if it's not null, it's expected to be a null-terminated list
- * of strings, with an even number of entries. */
-void halide_register_argv_and_metadata(
-    int (*filter_argv_call)(void **),
-    const struct halide_filter_metadata_t *filter_metadata,
-    const char * const *extra_key_value_pairs
-);
-
-/** The functions below here are relevant for pipelines compiled with
- * the -profile target flag, which runs a sampling profiler thread
- * alongside the pipeline. */
-
-/** Per-Func state tracked by the sampling profiler. */
-struct halide_profiler_func_stats {
-    /** Total time taken evaluating this Func (in nanoseconds). */
-    uint64_t time;
-
-    /** The current memory allocation of this Func. */
-    uint64_t memory_current;
-
-    /** The peak memory allocation of this Func. */
-    uint64_t memory_peak;
-
-    /** The total memory allocation of this Func. */
-    uint64_t memory_total;
-
-    /** The peak stack allocation of this Func's threads. */
-    uint64_t stack_peak;
-
-    /** The average number of thread pool worker threads active while computing this Func. */
-    uint64_t active_threads_numerator, active_threads_denominator;
-
-    /** The name of this Func. A global constant string. */
-    const char *name;
-
-    /** The total number of memory allocations of this Func. */
-    int num_allocs;
-};
-
-/** Per-pipeline state tracked by the sampling profiler. These exist
- * in a linked list. */
-struct halide_profiler_pipeline_stats {
-    /** Total time spent inside this pipeline (in nanoseconds) */
-    uint64_t time;
-
-    /** The current memory allocation of funcs in this pipeline. */
-    uint64_t memory_current;
-
-    /** The peak memory allocation of funcs in this pipeline. */
-    uint64_t memory_peak;
-
-    /** The total memory allocation of funcs in this pipeline. */
-    uint64_t memory_total;
-
-    /** The average number of thread pool worker threads doing useful
-     * work while computing this pipeline. */
-    uint64_t active_threads_numerator, active_threads_denominator;
-
-    /** The name of this pipeline. A global constant string. */
-    const char *name;
-
-    /** An array containing states for each Func in this pipeline. */
-    struct halide_profiler_func_stats *funcs;
-
-    /** The next pipeline_stats pointer. It's a void * because types
-     * in the Halide runtime may not currently be recursive. */
-    void *next;
-
-    /** The number of funcs in this pipeline. */
-    int num_funcs;
-
-    /** An internal base id used to identify the funcs in this pipeline. */
-    int first_func_id;
-
-    /** The number of times this pipeline has been run. */
-    int runs;
-
-    /** The total number of samples taken inside of this pipeline. */
-    int samples;
-
-    /** The total number of memory allocations of funcs in this pipeline. */
-    int num_allocs;
-};
-
-/** The global state of the profiler. */
-struct halide_profiler_state {
-    /** Guards access to the fields below. If not locked, the sampling
-     * profiler thread is free to modify things below (including
-     * reordering the linked list of pipeline stats). */
-    struct halide_mutex lock;
-
-    /** The amount of time the profiler thread sleeps between samples
-     * in milliseconds. Defaults to 1. */
-    int sleep_time;
-
-    /** An internal id used for bookkeeping. */
-    int first_free_id;
-
-    /** The id of the current running Func. Set by the pipeline, read
-     * periodically by the profiler thread. */
-    int current_func;
-
-    /** The number of threads currently doing work. */
-    int active_threads;
-
-    /** A linked list of stats gathered for each pipeline. */
-    struct halide_profiler_pipeline_stats *pipelines;
-
-    /** Retrieve remote profiler state. Used so that the sampling
-     * profiler can follow along with execution that occurs elsewhere,
-     * e.g. on a DSP. If null, it reads from the int above instead. */
-    void (*get_remote_profiler_state)(int *func, int *active_workers);
-
-    /** Sampling thread reference to be joined at shutdown. */
-    struct halide_thread *sampling_thread;
-};
-
-/** Profiler func ids with special meanings. */
-enum {
-    /// current_func takes on this value when not inside Halide code
-    halide_profiler_outside_of_halide = -1,
-    /// Set current_func to this value to tell the profiling thread to
-    /// halt. It will start up again next time you run a pipeline with
-    /// profiling enabled.
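-    /// (For illustration only, not part of the original comment: a host
-    /// application could request this, once no pipeline is running, with
-    ///
-    ///     halide_profiler_get_state()->current_func = halide_profiler_please_stop;
-    ///
-    /// using halide_profiler_get_state() declared below.)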
-    halide_profiler_please_stop = -2
-};
-
-/** Get a pointer to the global profiler state for programmatic
- * inspection. Lock it before using to pause the profiler. */
-extern struct halide_profiler_state *halide_profiler_get_state();
-
-/** Get a pointer to the pipeline state associated with pipeline_name.
- * This function grabs the global profiler state's lock on entry. */
-extern struct halide_profiler_pipeline_stats *halide_profiler_get_pipeline_state(const char *pipeline_name);
-
-/** Reset profiler state cheaply. May leave threads running or some
- * memory allocated but all accumulated statistics are reset.
- * WARNING: Do NOT call this method while any halide pipeline is
- * running; halide_profiler_memory_allocate/free and
- * halide_profiler_stack_peak_update update the profiler pipeline's
- * state without grabbing the global profiler state's lock. */
-extern void halide_profiler_reset();
-
-/** Reset all profiler state.
- * WARNING: Do NOT call this method while any halide pipeline is
- * running; halide_profiler_memory_allocate/free and
- * halide_profiler_stack_peak_update update the profiler pipeline's
- * state without grabbing the global profiler state's lock. */
-void halide_profiler_shutdown();
-
-/** Print out timing statistics for everything run since the last
- * reset. Also happens at process exit. */
-extern void halide_profiler_report(void *user_context);
-
-/// \name "Float16" functions
-/// These functions operate on bits (``uint16_t``) representing a half
-/// precision floating point number (IEEE-754 2008 binary16).
-//@{
-
-/** Read bits representing a half precision floating point number and return
- * the float that represents the same value */
-extern float halide_float16_bits_to_float(uint16_t);
-
-/** Read bits representing a half precision floating point number and return
- * the double that represents the same value */
-extern double halide_float16_bits_to_double(uint16_t);
-
-// TODO: Conversion functions to half
-
-//@}
-
-// Allocating and freeing device memory is often very slow. The
-// methods below give Halide's runtime permission to hold onto device
-// memory to service future requests instead of returning it to the
-// underlying device API. The API does not manage an allocation pool,
-// all it does is provide access to a shared counter that acts as a
-// limit on the unused memory not yet returned to the underlying
-// device API. It makes callbacks to participants when memory needs to
-// be released because the limit is about to be exceeded (either
-// because the limit has been reduced, or because the memory owned by
-// some participant becomes unused).
-
-/** Tell Halide whether or not it is permitted to hold onto device
- * allocations to service future requests instead of returning them
- * eagerly to the underlying device API. Many device allocators are
- * quite slow, so it can be beneficial to set this to true. The
- * default value for now is false.
- *
- * Note that if enabled, the eviction policy is very simplistic. The
- * 32 most-recently used allocations are preserved, regardless of
- * their size. Additionally, if a call to cuMalloc results in an
- * out-of-memory error, the entire cache is flushed and the allocation
- * is retried. See https://github.com/halide/Halide/issues/4093
- *
- * If set to false, releases all unused device allocations back to the
- * underlying device APIs. For finer-grained control, see specific
- * methods in each device api runtime.
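- *
- * An illustrative usage pattern (not part of the original header):
- *
- *     halide_reuse_device_allocations(nullptr, true);   // keep freed device memory around
- *     // ... run GPU pipelines repeatedly ...
- *     halide_reuse_device_allocations(nullptr, false);  // release all cached allocations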
- */
-extern int halide_reuse_device_allocations(void *user_context, bool);
-
-/** Determines whether on device_free the memory is returned
- * immediately to the device API, or placed on a free list for future
- * use. Override and switch based on the user_context for
- * finer-grained control. By default just returns the value most
- * recently set by the method above. */
-extern bool halide_can_reuse_device_allocations(void *user_context);
-
-struct halide_device_allocation_pool {
-    int (*release_unused)(void *user_context);
-    struct halide_device_allocation_pool *next;
-};
-
-/** Register a callback to be informed when
- * halide_reuse_device_allocations(false) is called, and all unused
- * device allocations must be released. The object passed should have
- * global lifetime, and its next field will be clobbered. */
-extern void halide_register_device_allocation_pool(struct halide_device_allocation_pool *);
-
-#ifdef __cplusplus
-}  // End extern "C"
-#endif
-
-#ifdef __cplusplus
-
-namespace {
-template<typename T>
-struct check_is_pointer;
-template<typename T>
-struct check_is_pointer<T *> {};
-}  // namespace
-
-/** Construct the halide equivalent of a C type */
-template<typename T>
-HALIDE_ALWAYS_INLINE halide_type_t halide_type_of() {
-    // Create a compile-time error if T is not a pointer (without
-    // using any includes - this code goes into the runtime).
-    check_is_pointer<T> check;
-    (void)check;
-    return halide_type_t(halide_type_handle, 64);
-}
-
-template<>
-HALIDE_ALWAYS_INLINE halide_type_t halide_type_of<float>() {
-    return halide_type_t(halide_type_float, 32);
-}
-
-template<>
-HALIDE_ALWAYS_INLINE halide_type_t halide_type_of<double>() {
-    return halide_type_t(halide_type_float, 64);
-}
-
-template<>
-HALIDE_ALWAYS_INLINE halide_type_t halide_type_of<bool>() {
-    return halide_type_t(halide_type_uint, 1);
-}
-
-template<>
-HALIDE_ALWAYS_INLINE halide_type_t halide_type_of<uint8_t>() {
-    return halide_type_t(halide_type_uint, 8);
-}
-
-template<>
-HALIDE_ALWAYS_INLINE halide_type_t halide_type_of<uint16_t>() {
-    return halide_type_t(halide_type_uint, 16);
-}
-
-template<>
-HALIDE_ALWAYS_INLINE halide_type_t halide_type_of<uint32_t>() {
-    return halide_type_t(halide_type_uint, 32);
-}
-
-template<>
-HALIDE_ALWAYS_INLINE halide_type_t halide_type_of<uint64_t>() {
-    return halide_type_t(halide_type_uint, 64);
-}
-
-template<>
-HALIDE_ALWAYS_INLINE halide_type_t halide_type_of<int8_t>() {
-    return halide_type_t(halide_type_int, 8);
-}
-
-template<>
-HALIDE_ALWAYS_INLINE halide_type_t halide_type_of<int16_t>() {
-    return halide_type_t(halide_type_int, 16);
-}
-
-template<>
-HALIDE_ALWAYS_INLINE halide_type_t halide_type_of<int32_t>() {
-    return halide_type_t(halide_type_int, 32);
-}
-
-template<>
-HALIDE_ALWAYS_INLINE halide_type_t halide_type_of<int64_t>() {
-    return halide_type_t(halide_type_int, 64);
-}
-
-#endif
-
-#endif  // HALIDE_HALIDERUNTIME_H
diff --git a/t2s/preprocessor/sample/sample_01/buffer_t.cpp b/t2s/preprocessor/sample/sample_01/buffer_t.cpp
deleted file mode 100644
index 4720708b..00000000
--- a/t2s/preprocessor/sample/sample_01/buffer_t.cpp
+++ /dev/null
@@ -1,220 +0,0 @@
-#ifdef COMPILING_HALIDE_RUNTIME
-#include "HalideRuntime.h"
-#define HALIDE_BUFFER_HELPER_ATTRS __attribute__((always_inline, weak))
-#else
-#define HALIDE_BUFFER_HELPER_ATTRS inline
-#endif
-
-// Structs are annoying to deal with from within Halide Stmts. These
-// utility functions are for dealing with buffer_t in that
-// context. They are not intended for use outside of Halide code, and
-// not exposed in HalideRuntime.h. The symbols are private to the
-// module and should be inlined and then stripped.
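-// (Illustrative note, not part of the original comment: generated C output
-// calls these helpers for field access, e.g.
-//
-//     int w = _halide_buffer_get_extent(input_buf, 0);
-//     uint8_t *base = _halide_buffer_get_host(input_buf);
-//
-// where input_buf stands for a halide_buffer_t * pipeline argument.)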
diff --git a/t2s/preprocessor/sample/sample_01/buffer_t.cpp b/t2s/preprocessor/sample/sample_01/buffer_t.cpp
deleted file mode 100644
index 4720708b..00000000
--- a/t2s/preprocessor/sample/sample_01/buffer_t.cpp
+++ /dev/null
@@ -1,220 +0,0 @@
-#ifdef COMPILING_HALIDE_RUNTIME
-#include "HalideRuntime.h"
-#define HALIDE_BUFFER_HELPER_ATTRS __attribute__((always_inline, weak))
-#else
-#define HALIDE_BUFFER_HELPER_ATTRS inline
-#endif
-
-// Structs are annoying to deal with from within Halide Stmts. These
-// utility functions are for dealing with buffer_t in that
-// context. They are not intended for use outside of Halide code, and
-// not exposed in HalideRuntime.h. The symbols are private to the
-// module and should be inlined and then stripped. This blob of code
-// also gets copy-pasted into C outputs.
-
-extern "C" {
-
-HALIDE_BUFFER_HELPER_ATTRS
-int _halide_buffer_get_dimensions(const halide_buffer_t *buf) {
-    return buf->dimensions;
-}
-
-HALIDE_BUFFER_HELPER_ATTRS
-uint8_t *_halide_buffer_get_host(const halide_buffer_t *buf) {
-    return buf->host;
-}
-
-HALIDE_BUFFER_HELPER_ATTRS
-uint64_t _halide_buffer_get_device(const halide_buffer_t *buf) {
-    return buf->device;
-}
-
-HALIDE_BUFFER_HELPER_ATTRS
-const struct halide_device_interface_t *_halide_buffer_get_device_interface(const halide_buffer_t *buf) {
-    return buf->device_interface;
-}
-
-HALIDE_BUFFER_HELPER_ATTRS
-int _halide_buffer_get_min(const halide_buffer_t *buf, int d) {
-    return buf->dim[d].min;
-}
-
-HALIDE_BUFFER_HELPER_ATTRS
-int _halide_buffer_get_max(const halide_buffer_t *buf, int d) {
-    return buf->dim[d].min + buf->dim[d].extent - 1;
-}
-
-HALIDE_BUFFER_HELPER_ATTRS
-int _halide_buffer_get_extent(const halide_buffer_t *buf, int d) {
-    return buf->dim[d].extent;
-}
-
-HALIDE_BUFFER_HELPER_ATTRS
-int _halide_buffer_get_stride(const halide_buffer_t *buf, int d) {
-    return buf->dim[d].stride;
-}
-
-HALIDE_BUFFER_HELPER_ATTRS
-int _halide_buffer_set_host_dirty(halide_buffer_t *buf, bool val) {
-    buf->set_host_dirty(val);
-    return 0;
-}
-
-HALIDE_BUFFER_HELPER_ATTRS
-int _halide_buffer_set_device_dirty(halide_buffer_t *buf, bool val) {
-    buf->set_device_dirty(val);
-    return 0;
-}
-
-HALIDE_BUFFER_HELPER_ATTRS
-bool _halide_buffer_get_host_dirty(const halide_buffer_t *buf) {
-    return buf->host_dirty();
-}
-
-HALIDE_BUFFER_HELPER_ATTRS
-bool _halide_buffer_get_device_dirty(const halide_buffer_t *buf) {
-    return buf->device_dirty();
-}
-
-HALIDE_BUFFER_HELPER_ATTRS
-halide_dimension_t *_halide_buffer_get_shape(halide_buffer_t *buf) {
-    return buf->dim;
-}
-
-HALIDE_BUFFER_HELPER_ATTRS
-bool _halide_buffer_is_bounds_query(const halide_buffer_t *buf) {
-    return buf->host == NULL && buf->device == 0;
-}
-
-HALIDE_BUFFER_HELPER_ATTRS
-uint32_t _halide_buffer_get_type(const halide_buffer_t *buf) {
-    return buf->type.as_u32();
-}
-
-HALIDE_BUFFER_HELPER_ATTRS
-halide_buffer_t *_halide_buffer_init(halide_buffer_t *dst,
-                                     halide_dimension_t *dst_shape,
-                                     void *host,
-                                     uint64_t device,
-                                     const halide_device_interface_t *device_interface,
-                                     int type_code, int type_bits,
-                                     int dimensions,
-                                     halide_dimension_t *shape,
-                                     uint64_t flags) {
-    dst->host = (uint8_t *)host;
-    dst->device = device;
-    dst->device_interface = device_interface;
-    dst->type.code = (halide_type_code_t)type_code;
-    dst->type.bits = (uint8_t)type_bits;
-    dst->type.lanes = 1;
-    dst->dimensions = dimensions;
-    dst->dim = dst_shape;
-    if (shape != dst->dim) {
-        for (int i = 0; i < dimensions; i++) {
-            dst->dim[i] = shape[i];
-        }
-    }
-    dst->flags = flags;
-    return dst;
-}
-
-HALIDE_BUFFER_HELPER_ATTRS
-halide_buffer_t *_halide_buffer_init_from_buffer(halide_buffer_t *dst,
-                                                 halide_dimension_t *dst_shape,
-                                                 const halide_buffer_t *src) {
-    dst->host = src->host;
-    dst->device = src->device;
-    dst->device_interface = src->device_interface;
-    dst->type = src->type;
-    dst->dimensions = src->dimensions;
-    dst->dim = dst_shape;
-    dst->flags = src->flags;
-    for (int i = 0; i < dst->dimensions; i++) {
-        dst->dim[i] = src->dim[i];
-    }
-    return dst;
-}
-
-HALIDE_BUFFER_HELPER_ATTRS
-halide_buffer_t *_halide_buffer_crop(void *user_context,
-                                     halide_buffer_t *dst,
-                                     halide_dimension_t *dst_shape,
-                                     const halide_buffer_t *src,
-                                     const int *min, const int *extent) {
-    *dst = *src;
-    dst->dim = dst_shape;
-    int64_t offset = 0;
-    for (int i = 0; i < dst->dimensions; i++) {
-        dst->dim[i] = src->dim[i];
-        dst->dim[i].min = min[i];
-        dst->dim[i].extent = extent[i];
-        offset += (min[i] - src->dim[i].min) * src->dim[i].stride;
-    }
-    if (dst->host) {
-        dst->host += offset * src->type.bytes();
-    }
-    dst->device_interface = 0;
-    dst->device = 0;
-    if (src->device_interface) {
-        src->device_interface->device_crop(user_context, src, dst);
-    }
-    return dst;
-}
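The pointer adjustment in _halide_buffer_crop is the only non-obvious arithmetic here; a worked instance with hypothetical values:

// Hypothetical: crop a 10x10 int32 buffer at min = {2, 3}, extent = {4, 4}.
// With src strides {1, 10} and src mins {0, 0}:
//   offset = (2 - 0) * 1 + (3 - 0) * 10 = 32 elements
// so dst->host = src->host + 32 * src->type.bytes() = src->host + 128 bytes.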
-
-// Called on return from an extern stage where the output buffer was a
-// crop of some other larger buffer. This happens for extern stages
-// with distinct store_at/compute_at levels. Each call to the stage
-// only fills in part of the buffer.
-HALIDE_BUFFER_HELPER_ATTRS
-int _halide_buffer_retire_crop_after_extern_stage(void *user_context,
-                                                  void *obj) {
-    halide_buffer_t **buffers = (halide_buffer_t **)obj;
-    halide_buffer_t *crop = buffers[0];
-    halide_buffer_t *parent = buffers[1];
-
-    if (crop->device) {
-        if (!parent->device) {
-            // We have been given a device allocation by the extern
-            // stage. It only represents the cropped region, so we
-            // can't just give it to the parent.
-            if (crop->device_dirty()) {
-                crop->device_interface->copy_to_host(user_context, crop);
-            }
-            crop->device_interface->device_free(user_context, crop);
-        } else {
-            // We are a crop of an existing device allocation.
-            if (crop->device_dirty()) {
-                parent->set_device_dirty();
-            }
-            crop->device_interface->device_release_crop(user_context, crop);
-        }
-    }
-    if (crop->host_dirty()) {
-        parent->set_host_dirty();
-    }
-    return 0;
-}
-
-HALIDE_BUFFER_HELPER_ATTRS
-int _halide_buffer_retire_crops_after_extern_stage(void *user_context,
-                                                   void *obj) {
-    halide_buffer_t **buffers = (halide_buffer_t **)obj;
-    while (*buffers) {
-        _halide_buffer_retire_crop_after_extern_stage(user_context, buffers);
-        buffers += 2;
-    }
-    return 0;
-}
-
-HALIDE_BUFFER_HELPER_ATTRS
-halide_buffer_t *_halide_buffer_set_bounds(halide_buffer_t *buf,
-                                           int dim, int min, int extent) {
-    buf->dim[dim].min = min;
-    buf->dim[dim].extent = extent;
-    return buf;
-}
-
-}
-
-#undef HALIDE_BUFFER_HELPER_ATTRS
diff --git a/t2s/preprocessor/sample/sample_01/pipe_array.hpp b/t2s/preprocessor/sample/sample_01/pipe_array.hpp
deleted file mode 100644
index 32edc725..00000000
--- a/t2s/preprocessor/sample/sample_01/pipe_array.hpp
+++ /dev/null
@@ -1,38 +0,0 @@
-//==============================================================
-// Copyright Intel Corporation
-//
-// SPDX-License-Identifier: MIT
-// =============================================================
-#ifndef __PIPE_ARRAY_HPP__
-#define __PIPE_ARRAY_HPP__
-
-#include <CL/sycl.hpp>
-#include <sycl/ext/intel/fpga_extensions.hpp>
-#include <utility>
-
-#include "pipe_array_internal.hpp"
-
-template <class Id, typename BaseTy, size_t depth, size_t... dims>
-struct PipeArray {
-  PipeArray() = delete;
-
-  template <size_t... idxs>
-  struct StructId;
-
-  template <size_t... idxs>
-  struct VerifyIndices {
-    static_assert(sizeof...(idxs) == sizeof...(dims),
-                  "Indexing into a PipeArray requires as many indices as "
-                  "dimensions of the PipeArray.");
-    static_assert(VerifierDimLayer<dims...>::template VerifierIdxLayer<
-                      idxs...>::IsValid(),
-                  "Index out of bounds");
-    using VerifiedPipe =
-        cl::sycl::ext::intel::pipe<StructId<idxs...>, BaseTy, depth>;
-  };
-
-  template <size_t... idxs>
-  using PipeAt = typename VerifyIndices<idxs...>::VerifiedPipe;
-};
-
-#endif /* __PIPE_ARRAY_HPP__ */
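A usage sketch of the deleted PipeArray wrapper (the names and sizes are illustrative):

// A 2x2 array of int pipes, each with capacity 4. Every PipeAt<i, j>
// is a distinct SYCL pipe type, selected and bounds-checked at compile time.
using MyPipes = PipeArray<class MyPipeId, int, 4, 2, 2>;
MyPipes::PipeAt<0, 1>::write(42);
int v = MyPipes::PipeAt<0, 1>::read();
// MyPipes::PipeAt<0, 2> would fail to compile with "Index out of bounds".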
diff --git a/t2s/preprocessor/sample/sample_01/pipe_array_internal.hpp b/t2s/preprocessor/sample/sample_01/pipe_array_internal.hpp
deleted file mode 100644
index a4262d4c..00000000
--- a/t2s/preprocessor/sample/sample_01/pipe_array_internal.hpp
+++ /dev/null
@@ -1,31 +0,0 @@
-//==============================================================
-// Copyright Intel Corporation
-//
-// SPDX-License-Identifier: MIT
-// =============================================================
-
-#ifndef __PIPE_ARRAY_INTERNAL_HPP__
-#define __PIPE_ARRAY_INTERNAL_HPP__
-
-namespace {
-template <size_t dim1, size_t... dims>
-struct VerifierDimLayer {
-  template <size_t idx1, size_t... idxs>
-  struct VerifierIdxLayer {
-    static constexpr bool IsValid() {
-      return idx1 < dim1 &&
-             (VerifierDimLayer<dims...>::template VerifierIdxLayer<
-                 idxs...>::IsValid());
-    }
-  };
-};
-template <size_t dim>
-struct VerifierDimLayer<dim> {
-  template <size_t idx>
-  struct VerifierIdxLayer {
-    static constexpr bool IsValid() { return idx < dim; }
-  };
-};
-}  // namespace
-
-#endif /* __PIPE_ARRAY_INTERNAL_HPP__ */
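The verifier peels one dimension per recursion level; expanded by hand for a 2x3 pipe array (values illustrative):

// dims = {2, 3}, idxs = {1, 2}:  1 < 2 && (2 < 3)  -> true, index accepted.
// dims = {2, 3}, idxs = {1, 3}:  1 < 2 && (3 < 3)  -> false, so the
// "Index out of bounds" static_assert in PipeArray::VerifyIndices fires.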
diff --git a/t2s/preprocessor/sample/sample_01/runtime_internal.h b/t2s/preprocessor/sample/sample_01/runtime_internal.h
deleted file mode 100644
index 6dea68ca..00000000
--- a/t2s/preprocessor/sample/sample_01/runtime_internal.h
+++ /dev/null
@@ -1,216 +0,0 @@
-#ifndef HALIDE_RUNTIME_INTERNAL_H
-#define HALIDE_RUNTIME_INTERNAL_H
-
-#if __STDC_HOSTED__
-#error "Halide runtime files must be compiled with clang in freestanding mode."
-#endif
-
-#ifdef __UINT8_TYPE__
-typedef __INT64_TYPE__ int64_t;
-typedef __UINT64_TYPE__ uint64_t;
-typedef __INT32_TYPE__ int32_t;
-typedef __UINT32_TYPE__ uint32_t;
-typedef __INT16_TYPE__ int16_t;
-typedef __UINT16_TYPE__ uint16_t;
-typedef __INT8_TYPE__ int8_t;
-typedef __UINT8_TYPE__ uint8_t;
-#else
-typedef signed __INT64_TYPE__ int64_t;
-typedef unsigned __INT64_TYPE__ uint64_t;
-typedef signed __INT32_TYPE__ int32_t;
-typedef unsigned __INT32_TYPE__ uint32_t;
-typedef signed __INT16_TYPE__ int16_t;
-typedef unsigned __INT16_TYPE__ uint16_t;
-typedef signed __INT8_TYPE__ int8_t;
-typedef unsigned __INT8_TYPE__ uint8_t;
-#endif
-typedef __SIZE_TYPE__ size_t;
-typedef __PTRDIFF_TYPE__ ptrdiff_t;
-
-typedef ptrdiff_t ssize_t;
-
-#define NULL 0
-#define WEAK __attribute__((weak))
-
-#ifdef BITS_64
-#define INT64_C(c) c ## L
-#define UINT64_C(c) c ## UL
-typedef uint64_t uintptr_t;
-typedef int64_t intptr_t;
-#endif
-
-#ifdef BITS_32
-#define INT64_C(c) c ## LL
-#define UINT64_C(c) c ## ULL
-typedef uint32_t uintptr_t;
-typedef int32_t intptr_t;
-#endif
-
-#define STDOUT_FILENO 1
-#define STDERR_FILENO 2
-
-// Commonly-used extern functions
-extern "C" {
-void *halide_malloc(void *user_context, size_t x);
-void halide_free(void *user_context, void *ptr);
-WEAK int64_t halide_current_time_ns(void *user_context);
-WEAK void halide_print(void *user_context, const char *msg);
-WEAK void halide_error(void *user_context, const char *msg);
-WEAK void (*halide_set_custom_print(void (*print)(void *, const char *)))(void *, const char *);
-WEAK void (*halide_set_error_handler(void (*handler)(void *, const char *)))(void *, const char *);
-
-char *getenv(const char *);
-void free(void *);
-void *malloc(size_t);
-const char *strstr(const char *, const char *);
-int atoi(const char *);
-int strcmp(const char *s, const char *t);
-int strncmp(const char *s, const char *t, size_t n);
-size_t strlen(const char *s);
-const char *strchr(const char *s, int c);
-void *memcpy(void *s1, const void *s2, size_t n);
-int memcmp(const void *s1, const void *s2, size_t n);
-void *memset(void *s, int val, size_t n);
-// Use fopen+fileno+fclose instead of open+close - the value of the
-// flags passed to open are different on every platform
-void *fopen(const char *, const char *);
-int fprintf(void *stream, const char *format, ...);
-int fileno(void *);
-int fclose(void *);
-int close(int);
-size_t fwrite(const void *, size_t, size_t, void *);
-ssize_t write(int fd, const void *buf, size_t bytes);
-int remove(const char *pathname);
-int ioctl(int fd, unsigned long request, ...);
-char *strncpy(char *dst, const char *src, size_t n);
-
-// Below are prototypes for various functions called by generated code
-// and parts of the runtime but not exposed to users:
-
-// Similar to strncpy, but with various non-string arguments. Writes
-// arg to dst. Does not write to pointer end or beyond. Returns
-// pointer to one beyond the last character written so that calls can
-// be chained.
-
-struct halide_buffer_t;
-struct halide_type_t;
-WEAK char *halide_string_to_string(char *dst, char *end, const char *arg);
-WEAK char *halide_double_to_string(char *dst, char *end, double arg, int scientific);
-WEAK char *halide_int64_to_string(char *dst, char *end, int64_t arg, int digits);
-WEAK char *halide_uint64_to_string(char *dst, char *end, uint64_t arg, int digits);
-WEAK char *halide_pointer_to_string(char *dst, char *end, const void *arg);
-WEAK char *halide_buffer_to_string(char *dst, char *end, const halide_buffer_t *arg);
-WEAK char *halide_type_to_string(char *dst, char *end, const halide_type_t *arg);
-
-// Search the current process for a symbol with the given name.
-WEAK void *halide_get_symbol(const char *name);
-// Platform specific implementations of dlopen/dlsym.
-WEAK void *halide_load_library(const char *name);
-// If lib is NULL, this call should be equivalent to halide_get_symbol(name).
-WEAK void *halide_get_library_symbol(void *lib, const char *name);
-
-WEAK int halide_start_clock(void *user_context);
-WEAK int64_t halide_current_time_ns(void *user_context);
-WEAK void halide_sleep_ms(void *user_context, int ms);
-WEAK void halide_device_free_as_destructor(void *user_context, void *obj);
-WEAK void halide_device_and_host_free_as_destructor(void *user_context, void *obj);
-WEAK void halide_device_host_nop_free(void *user_context, void *obj);
-
-// The pipeline_state is declared as void* type since halide_profiler_pipeline_stats
-// is defined inside HalideRuntime.h which includes this header file.
-WEAK void halide_profiler_stack_peak_update(void *user_context,
-                                            void *pipeline_state,
-                                            uint64_t *f_values);
-WEAK void halide_profiler_memory_allocate(void *user_context,
-                                          void *pipeline_state,
-                                          int func_id,
-                                          uint64_t incr);
-WEAK void halide_profiler_memory_free(void *user_context,
-                                      void *pipeline_state,
-                                      int func_id,
-                                      uint64_t decr);
-WEAK int halide_profiler_pipeline_start(void *user_context,
-                                        const char *pipeline_name,
-                                        int num_funcs,
-                                        const uint64_t *func_names);
-WEAK int halide_host_cpu_count();
-
-WEAK int halide_device_and_host_malloc(void *user_context, struct halide_buffer_t *buf,
-                                       const struct halide_device_interface_t *device_interface);
-WEAK int halide_device_and_host_free(void *user_context, struct halide_buffer_t *buf);
-
-struct halide_filter_metadata_t;
-
-struct mxArray;
-WEAK int halide_matlab_call_pipeline(void *user_context,
-                                     int (*pipeline)(void **args), const halide_filter_metadata_t *metadata,
-                                     int nlhs, mxArray **plhs, int nrhs, const mxArray **prhs);
-
-WEAK int halide_trace_helper(void *user_context,
-                             const char *func,
-                             void *value, int *coords,
-                             int type_code, int type_bits, int type_lanes,
-                             int code,
-                             int parent_id, int value_index, int dimensions,
-                             const char *trace_tag);
-
-struct halide_pseudostack_slot_t {
-    void *ptr;
-    size_t size;
-};
-
-}  // extern "C"
-
-// A convenient namespace for weak functions that are internal to the
-// halide runtime.
-namespace Halide { namespace Runtime { namespace Internal {
-
-extern WEAK void halide_use_jit_module();
-extern WEAK void halide_release_jit_module();
-
-template<typename T>
-__attribute__((always_inline)) void swap(T &a, T &b) {
-    T t = a;
-    a = b;
-    b = t;
-}
-
-template<typename T>
-__attribute__((always_inline)) T max(const T &a, const T &b) {
-    return a > b ? a : b;
-}
-
-template<typename T>
-__attribute__((always_inline)) T min(const T &a, const T &b) {
-    return a < b ? a : b;
-}
-
-template<typename T, typename U>
-__attribute__((always_inline)) T reinterpret(const U &x) {
-    T ret;
-    memcpy(&ret, &x, min(sizeof(T), sizeof(U)));
-    return ret;
-}
-
-extern WEAK __attribute__((always_inline)) int halide_malloc_alignment();
-
-extern WEAK __attribute__((always_inline)) void halide_abort();
-
-void halide_thread_yield();
-
-}}}
-
-/** A macro that calls halide_print if the supplied condition is
- * false, then aborts. Used for unrecoverable errors, or
- * should-never-happen errors. */
-#define _halide_stringify(x) #x
-#define _halide_expand_and_stringify(x) _halide_stringify(x)
-#define halide_assert(user_context, cond)                                                                               \
-    if (!(cond)) {                                                                                                      \
-        halide_print(user_context, __FILE__ ":" _halide_expand_and_stringify(__LINE__) " Assert failed: " #cond "\n"); \
-        halide_abort();                                                                                                 \
-    }
-
-using namespace Halide::Runtime::Internal;
-
-#endif
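A short sketch of the reinterpret helper above, which bit-casts through memcpy to avoid undefined behavior:

// Assuming only the template above: view a float's IEEE-754 bit pattern.
float f = 1.0f;
uint32_t bits = reinterpret<uint32_t>(f);  // bits == 0x3F800000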
diff --git a/t2s/preprocessor/sample/sample_01/util.h b/t2s/preprocessor/sample/sample_01/util.h
deleted file mode 100644
index 60b680d4..00000000
--- a/t2s/preprocessor/sample/sample_01/util.h
+++ /dev/null
@@ -1,534 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the BSD-2-Clause Plus Patent License (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* https://opensource.org/licenses/BSDplusPatent
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: BSD-2-Clause-Patent
-*******************************************************************************/
-#ifndef UTIL_H
-#define UTIL_H
-
-#include "Halide.h"
-#include <assert.h>
-#include <iostream>
-#include <math.h>
-#include <stdlib.h>
-
-using namespace Halide;
-using namespace Halide::Internal;
-using namespace std;
-
-enum VALUES {
-    RANDOM,
-    SEQUENTIAL,
-    CONSTANT
-};
-
-template<typename T, size_t N>
-Buffer<T> new_data(VALUES v) {
-    Buffer<T> b(N);
-    for (size_t i = 0; i < N; i++) {
-        if (v == VALUES::RANDOM) {
-            b(i) = (T)rand();
-        } else {
-            b(i) = i;
-        }
-    }
-    return b;
-}
-
-template<typename T, int N1, int N2>
-Buffer<T> new_data_2d(VALUES v) {
-    Buffer<T> b(N1, N2);
-    for (int i = 0; i < N1; i++) {
-        for (int j = 0; j < N2; j++) {
-            if (v == VALUES::RANDOM) {
-                b(i, j) = (T)rand() + 1;
-            } else {
-                // b(i, j) = (i + 1)*(j + 1) + log(i * j + 1);
-                b(i, j) = i + j + 1;
-            }
-        }
-    }
-    return b;
-}
-
-template<typename T, int N1, int N2>
-Buffer<T> new_data_2d_lu(VALUES v) {
-    Buffer<T> b(N1, N2);
-    for (int i = 0; i < N1; i++) {
-        for (int j = 0; j < N2; j++) {
-            if (v == VALUES::RANDOM) {
-                b(i, j) = (T)rand() + 1;
-            } else {
-                b(i, j) = (i + 1)*(j + 1) + log(i * j + 1);
-            }
-        }
-    }
-    return b;
-}
-
-template<typename T, int N1, int N2, int N3>
-Buffer<T> new_data_3d_lu(VALUES v) {
-    Buffer<T> b(N1, N2, N3);
-    for (int i = 0; i < N1; i++) {
-        for (int j = 0; j < N2; j++) {
-            for (int k = 0; k < N3; k++) {
-                if (v == VALUES::RANDOM) {
-                    b(i, j, k) = (T)rand();
-                } else {
-                    b(i, j, k) = (i + 1)*(j + 1) + log(i * j + 1);
-                }
-            }
-        }
-    }
-    return b;
-}
-
-template<typename T, int N1, int N2, int N3>
-Buffer<T> new_data_3d(VALUES v) {
-    Buffer<T> b(N1, N2, N3);
-    for (int i = 0; i < N1; i++) {
-        for (int j = 0; j < N2; j++) {
-            for (int k = 0; k < N3; k++) {
-                if (v == VALUES::RANDOM) {
-                    b(i, j, k) = (T)rand();
-                } else {
-                    b(i, j, k) = i + j + k;
-                }
-            }
-        }
-    }
-    return b;
-}
-
-template<typename T, size_t M, size_t N>
-Buffer<T> new_matrix(VALUES v) {
-    Buffer<T> b(M, N);
-    for (size_t i = 0; i < M; i++) {
-        for (size_t j = 0; j < N; j++) {
-            if (v == VALUES::RANDOM) {
-                b(i, j) = (T)rand();
-            } else {
-                b(i, j) = i * N + j;
-            }
-        }
-    }
-    return b;
-}
-
-template<typename T, size_t M, size_t N, size_t P>
-Buffer<T> new_data_3D(VALUES v) {
-    Buffer<T> b(M, N, P);
-    for (size_t i = 0; i < M; i++) {
-        for (size_t j = 0; j < N; j++) {
-            for (size_t p = 0; p < P; p++) {
-                if (v == VALUES::RANDOM) {
-                    b(i, j, p) = (T)rand();
-                } else {
-                    b(i, j, p) = i * N * P + j * P + p;
-                }
-            }
-        }
-    }
-    return b;
-}
-
-template<typename T, size_t M, size_t N, size_t P, size_t Q>
-Buffer<T> new_data_4D(VALUES v) {
-    Buffer<T> b(M, N, P, Q);
-    for (size_t i = 0; i < M; i++) {
-        for (size_t j = 0; j < N; j++) {
-            for (size_t p = 0; p < P; p++) {
-                for (size_t q = 0; q < Q; q++) {
-                    if (v == VALUES::RANDOM) {
-                        b(i, j, p, q) = (T)rand();
-                    } else {
-                        b(i, j, p, q) = i * N * P * Q + j * P * Q + p * Q + q;
-                    }
-                }
-            }
-        }
-    }
-    return b;
-}
-
-template<typename T, size_t M, size_t N, size_t P, size_t Q, size_t X>
-Buffer<T> new_data_5D(VALUES v) {
-    Buffer<T> b(M, N, P, Q, X);
-    for (size_t i = 0; i < M; i++) {
-        for (size_t j = 0; j < N; j++) {
-            for (size_t p = 0; p < P; p++) {
-                for (size_t q = 0; q < Q; q++) {
-                    for (size_t x = 0; x < X; x++) {
-                        if (v == VALUES::RANDOM) {
-                            b(i, j, p, q, x) = (T)rand();
-                        } else {
-                            b(i, j, p, q, x) = i * N * P * Q * X +
-                                               j * P * Q * X +
-                                               p * Q * X +
-                                               q * X +
-                                               x;
-                        }
-                    }
-                }
-            }
-        }
-    }
-    return b;
-}
-
-template<typename T, size_t M, size_t N, size_t P, size_t Q, size_t X, size_t Y>
-Buffer<T> new_data_6D(VALUES v) {
-    Buffer<T> b(M, N, P, Q, X, Y);
-    for (size_t i = 0; i < M; i++) {
-        for (size_t j = 0; j < N; j++) {
-            for (size_t p = 0; p < P; p++) {
-                for (size_t q = 0; q < Q; q++) {
-                    for (size_t x = 0; x < X; x++) {
-                        for (size_t y = 0; y < Y; y++) {
-                            if (v == VALUES::RANDOM) {
-                                b(i, j, p, q, x, y) = (T)rand();
-                            } else if (v == VALUES::CONSTANT) {
-                                b(i, j, p, q, x, y) = (T)1;
-                            } else {
-                                b(i, j, p, q, x, y) = i * N * P * Q * X * Y +
-                                                      j * P * Q * X * Y +
-                                                      p * Q * X * Y +
-                                                      q * X * Y +
-                                                      x * Y +
-                                                      y;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-    return b;
-}
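A sketch of how these generators are typically invoked (sizes illustrative):

// SEQUENTIAL fills b(i, j) = i + j + 1 per new_data_2d above; RANDOM uses rand() + 1.
Buffer<float> a = new_data_2d<float, 4, 8>(VALUES::SEQUENTIAL);
Buffer<float> w = new_data_4D<float, 2, 3, 3, 4>(VALUES::RANDOM);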
-
-template<typename T>
-void check_equal(const Buffer<T> &a, const Buffer<T> &b) {
-    assert(a.number_of_elements() == b.number_of_elements());
-    a.for_each_element([&](int x) {
-#ifdef VERBOSE_DEBUG
-        cout << a(x) << ", " << b(x) << "\n";
-#endif
-        assert(a(x) == b(x));
-    });
-}
-
-template<typename T>
-void check_equal_2D(const Buffer<T> &a, const Buffer<T> &b) {
-    assert(a.number_of_elements() == b.number_of_elements());
-    int line_number = a.width();
-    a.for_each_element([&](int x, int y) {
-#ifdef VERBOSE_DEBUG
-        cout << a(x, y) << ", " << b(x, y) << " ";
-        if (x == line_number - 1)
-            cout << std::endl;
-#endif
-        assert(a(x, y) == b(x, y));
-    });
-}
-
-template<typename T>
-void check_equal_3D(const Buffer<T> &a, const Buffer<T> &b) {
-    assert(a.number_of_elements() == b.number_of_elements());
-    int width = a.width(), height = a.height();
-    a.for_each_element([&](int x, int y, int z) {
-#ifdef VERBOSE_DEBUG
-        cout << a(x, y, z) << ", " << b(x, y, z) << " ";
-        if (x == width - 1) {
-            cout << std::endl;
-            if (y == height - 1)
-                cout << std::endl;
-        }
-#endif
-        assert(a(x, y, z) == b(x, y, z));
-    });
-}
-
-template<typename T>
-void check_equal_4D(const Buffer<T> &a, const Buffer<T> &b) {
-    assert(a.number_of_elements() == b.number_of_elements());
-    int width = a.width(), height = a.height(), channel = a.channels();
-    a.for_each_element([&](int x, int y, int z, int w) {
-#ifdef VERBOSE_DEBUG
-        cout << a(x, y, z, w) << ", " << b(x, y, z, w) << " ";
-        if (x == width - 1) {
-            cout << std::endl;
-            if (y == height - 1) {
-                cout << std::endl;
-                if (z == channel - 1)
-                    cout << std::endl;
-            }
-        }
-#endif
-        assert(a(x, y, z, w) == b(x, y, z, w));
-    });
-}
-
-template<typename T>
-void check_equal_5D(const Buffer<T> &a, const Buffer<T> &b) {
-    assert(a.number_of_elements() == b.number_of_elements());
-    int width = a.width(), height = a.height(), channel = a.channels();
-    a.for_each_element([&](int x, int y, int z, int w, int u) {
-#ifdef VERBOSE_DEBUG
-        cout << "(" << x << ", " << y << ", " << z << ", " << w << ", " << u << ") = " << a(x, y, z, w, u) << ", " << b(x, y, z, w, u) << "\n";
-        if (x == width - 1) {
-            cout << std::endl;
-            if (y == height - 1) {
-                cout << std::endl;
-                if (z == channel - 1)
-                    cout << std::endl;
-            }
-        }
-#endif
-        assert(a(x, y, z, w, u) == b(x, y, z, w, u));
-    });
-}
-
-template<typename T>
-void check_equal_6D(const Buffer<T> &a, const Buffer<T> &b) {
-    assert(a.number_of_elements() == b.number_of_elements());
-    int width = a.width(), height = a.height(), channel = a.channels();
-    a.for_each_element([&](int x, int y, int z, int w, int u, int v) {
-#ifdef VERBOSE_DEBUG
-        cout << "(" << x << ", " << y << ", " << z << ", " << w << ", " << u << ", " << v << ") = " << a(x, y, z, w, u, v) << ", " << b(x, y, z, w, u, v) << "\n";
-        if (x == width - 1) {
-            cout << std::endl;
-            if (y == height - 1) {
-                cout << std::endl;
-                if (z == channel - 1)
-                    cout << std::endl;
-            }
-        }
-#endif
-        assert(a(x, y, z, w, u, v) == b(x, y, z, w, u, v));
-    });
-}
-
-template<typename T>
-class CheckHelper {
-  public:
-    const Buffer<T>& a;
-    const Buffer<T>& b;
-    CheckHelper(const Buffer<T>& a, const Buffer<T> &b) : a(a), b(b) {}
-    template<typename... Args>
-    void operator() (Args... args) {
-#ifdef VERBOSE_DEBUG
-        cout << a(args...) << ", " << b(args...) << "\n";
-#endif
-        assert(a(args...) == b(args...));
-    }
-};
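Usage sketch: the fixed-rank checkers assert element-wise equality and, with VERBOSE_DEBUG defined, print each pair:

Buffer<int> x = new_data_2d<int, 4, 4>(VALUES::SEQUENTIAL);
Buffer<int> y = new_data_2d<int, 4, 4>(VALUES::SEQUENTIAL);
check_equal_2D<int>(x, y);  // passes; aborts on the first mismatch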
-
-template<typename T>
-class EqualHelper {
-  public:
-    bool equal;
-    const Buffer<T>& a;
-    const Buffer<T>& b;
-    EqualHelper(const Buffer<T>& a, const Buffer<T>& b) : a(a), b(b) {
-        // Start from true: each compared element can only clear the flag.
-        equal = true;
-    }
-    // template<typename... Args>
-    // void operator() (Args... args) {
-    //     equal &= (a(args...) == b(args...));
-    // }
-    void operator() (int x) {
-        equal &= (a(x) == b(x));
-    }
-};
-
-template<typename T>
-void check_equal_ND(const Buffer<T> &a, const Buffer<T> &b) {
-    CheckHelper<T> helper = CheckHelper<T>(a, b);
-    assert(a.number_of_elements() == b.number_of_elements());
-    a.for_each_element(helper);
-}
-
-template<typename T>
-bool buffer_equal(const Buffer<T> &a, const Buffer<T> &b) {
-    if (!(a.number_of_elements() == b.number_of_elements())) {
-        return false;
-    }
-    EqualHelper<T> helper = EqualHelper<T>(a, b);
-    a.for_each_element(helper);
-    return helper.equal;
-}
-
-template<typename T, int N1, int N2>
-Buffer<T> get_result_of_simple_case1() {
-    Buffer<T> b(N1, N2);
-    for (int i = 0; i < N1; i++) {
-        for (int j = 0; j < N2; j++) {
-            b(i, j) = (T)(i + j - 1);  // (j > 0 ? i + j - 1 : i + j);
-        }
-    }
-    return b;
-}
-
-template<typename T, int N1, int N2>
-Buffer<T> get_result_of_simple_case2() {
-    Buffer<T> b(N1, N2);
-    for (int i = 0; i < N1; i++) {
-        for (int j = 0; j < N2; j++) {
-            b(i, j) = (T)(i + j);  // (j > 0 ? i + j - 1 : i + j);
-        }
-    }
-    return b;
-}
-
-template<typename T, int N1, int N2, int N3>
-Buffer<T> get_result_of_mm(const Buffer<T> &a, const Buffer<T> &b) {
-    Buffer<T> c(N1, N2);
-    for (int i = 0; i < N1; i++) {
-        for (int j = 0; j < N2; j++) {
-            c(i, j) = 0;
-            for (int k = 0; k < N3; k++) {
-                c(i, j) += (T) a(i, k) * b(k, j);
-            }
-        }
-    }
-    return c;
-}
-
-template<typename T, int N1, int N2, int N3>
-Buffer<T> get_result_of_mm2(const Buffer<T> &a, const Buffer<T> &b) {
-    Buffer<T> c(N1, N2);
-    for (int i = 0; i < N1; i++) {
-        for (int j = 0; j < N2; j++) {
-            c(i, j) = 0;
-            for (int k = 0; k < N3; k++) {
-                c(i, j) += (T) a(k, i) * b(k, j);
-            }
-        }
-    }
-    return c;
-}
-
-template<typename T, int I, int J, int K, int L>
-Buffer<T> get_result_of_tmm(const Buffer<T> &a, const Buffer<T> &b) {
-    Buffer<T> c(I, J, K);
-    for (int i = 0; i < I; i++) {
-        for (int j = 0; j < J; j++) {
-            for (int k = 0; k < K; k++) {
-                c(i, j, k) = 0;
-                for (int l = 0; l < L; l++) {
-                    c(i, j, k) += (T) a(l, i, j) * b(l, k);
-                }
-            }
-        }
-    }
-    return c;
-}
-
-template<typename T, int O, int R, int C, int I, int P, int Q>
-Buffer<T> get_result_of_conv(const Buffer<T> &a, const Buffer<T> &b) {
-    Buffer<T> d(O, R, C);
-    for (int o = 0; o < O; o++) {
-        for (int r = 0; r < R; r++) {
-            for (int c = 0; c < C; c++) {
-                d(o, r, c) = 0;
-                for (int i = 0; i < I; i++) {
-                    for (int p = 0; p < P; p++) {
-                        for (int q = 0; q < Q; q++) {
-                            d(o, r, c) += (T) a(i, p+r, c+q) * b(i, o, p, q);
-                        }
-                    }
-                }
-            }
-        }
-    }
-    return d;
-}
-
-template<typename T, int I, int J, int K, int L>
-Buffer<T> get_result_of_mttkrp(const Buffer<T> &a, const Buffer<T> &b, const Buffer<T> &c) {
-    Buffer<T> d(I, J);
-    Buffer<T> e(I, J, K);
-    for (int i = 0; i < I; i++) {
-        for (int j = 0; j < J; j++) {
-            d(i, j) = 0;
-            for (int k = 0; k < K; k++) {
-                e(i, j, k) = 0;
-                for (int l = 0; l < L; l++) {
-                    e(i, j, k) += (T) a(l, i, k) * c(l, j);
-                }
-                d(i, j) += (T) e(i, j, k) * b(k, j);
-            }
-        }
-    }
-    return d;
-}
-
-template<typename T, int N1, int N2, int N3>
-Buffer<T> extract_result_of_mm(const Buffer<T> &a) {
-    Buffer<T> b(N1, N2);
-    for (int i = 0; i < N1; i++) {
-        for (int j = 0; j < N2; j++) {
-            b(i, j) = a(i, j, N3 - 1);
-        }
-    }
-    return b;
-}
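These golden-model helpers pair with the generators above for end-to-end verification; a sketch of the typical flow (sizes illustrative):

auto a = new_data_2d<float, 8, 16>(VALUES::RANDOM);      // 8x16 input
auto b = new_data_2d<float, 16, 4>(VALUES::RANDOM);      // 16x4 input
auto golden = get_result_of_mm<float, 8, 4, 16>(a, b);   // c(i,j) = sum_k a(i,k)*b(k,j)
// ... run the T2S pipeline into `result`, then:
// check_equal_2D<float>(golden, result);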
-
-void print_type(const Expr *op) {
-    if (op->as<StringImm>()) {
-        printf("is string\n");
-    } else if (op->as<IntImm>()) {
-        printf("is int\n");
-    } else if (op->as<FloatImm>()) {
-        printf("is float\n");
-    } else if (op->as<Cast>()) {
-        printf("is cast\n");
-    } else if (op->as<Add>()) {
-        printf("is add\n");
-    } else if (op->as<Sub>()) {
-        printf("is sub\n");
-    } else if (op->as<Mul>()) {
-        printf("is mul\n");
-    } else if (op->as<Div>()) {
-        printf("is div\n");
-    } else if (op->as<Call>()) {
-        printf("is call\n");
-    } else if (op->as