Skip to content

Commit 60b922c

Browse files
committed
[FIX][ARM] Revert v3_2 acl matmul changes
The v3_2 ACL matmul requires the 'any' weights format in order to use the has_opt_impl() feature. An adaptation in the CPU plugin would be required, and performance is not guaranteed.
1 parent 3efb012 commit 60b922c

File tree

4 files changed

+64
-67
lines changed

4 files changed

+64
-67
lines changed

src/cpu/acl/matmul/acl_matmul.cpp

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*******************************************************************************
2-
* Copyright 2021-2023 Arm Ltd. and affiliates
2+
* Copyright 2021-2022 Arm Ltd. and affiliates
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -31,6 +31,7 @@ status_t acl_matmul_t::execute_forward(const exec_ctx_t &ctx) const {
3131
auto wei_base = CTX_IN_MEM(const data_t *, DNNL_ARG_WEIGHTS);
3232

3333
bool is_transA = pd()->amp_.is_transA;
34+
bool is_transB = pd()->amp_.is_transB;
3435
bool use_dst_acc = pd()->amp_.use_dst_acc;
3536

3637
std::lock_guard<std::mutex> _lock {this->mtx};
@@ -42,13 +43,29 @@ status_t acl_matmul_t::execute_forward(const exec_ctx_t &ctx) const {
4243
nullptr, &acl_obj.dst_tensor, pd()->amp_.alpha, 0.0f, pd()->amp_.gemm_info);
4344

4445
// Run transpose kernel
45-
if (is_transA) {
46+
if (is_transA && !is_transB) {
4647
acl_obj.src_tensor.allocator()->allocate();
4748
acl_obj.src_acc_tensor.allocator()->import_memory(
4849
const_cast<data_t *>(src_base));
4950
acl_obj.transA.run();
5051
acl_obj.wei_tensor.allocator()->import_memory(
5152
const_cast<data_t *>(wei_base));
53+
} else if (is_transB && !is_transA) {
54+
acl_obj.wei_tensor.allocator()->allocate();
55+
acl_obj.wei_acc_tensor.allocator()->import_memory(
56+
const_cast<data_t *>(wei_base));
57+
acl_obj.transB.run();
58+
acl_obj.src_tensor.allocator()->import_memory(
59+
const_cast<data_t *>(src_base));
60+
} else if (is_transA && is_transB) {
61+
acl_obj.src_tensor.allocator()->allocate();
62+
acl_obj.src_acc_tensor.allocator()->import_memory(
63+
const_cast<data_t *>(src_base));
64+
acl_obj.wei_tensor.allocator()->allocate();
65+
acl_obj.wei_acc_tensor.allocator()->import_memory(
66+
const_cast<data_t *>(wei_base));
67+
acl_obj.transA.run();
68+
acl_obj.transB.run();
5269
} else {
5370
acl_obj.src_tensor.allocator()->import_memory(
5471
const_cast<data_t *>(src_base));
@@ -57,7 +74,7 @@ status_t acl_matmul_t::execute_forward(const exec_ctx_t &ctx) const {
5774
}
5875

5976
if (use_dst_acc) {
60-
// Put the result in a new tensor, it will be accumulated to the dst
77+
// Put the result in a new tensor, it will be accumalated to the dst
6178
// during the post ops
6279
acl_obj.dst_tensor.allocator()->allocate();
6380
} else {
@@ -70,6 +87,7 @@ status_t acl_matmul_t::execute_forward(const exec_ctx_t &ctx) const {
7087
acl_obj.src_tensor.allocator()->free();
7188
acl_obj.wei_tensor.allocator()->free();
7289
if (is_transA) acl_obj.src_acc_tensor.allocator()->free();
90+
if (is_transB) acl_obj.wei_acc_tensor.allocator()->free();
7391

7492
void *dst = acl_obj.dst_tensor.buffer();
7593
pd()->post_ops.execute(ctx, dst);

src/cpu/acl/matmul/acl_matmul.hpp

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -32,15 +32,20 @@ struct acl_resource_t : public resource_t {
3232

3333
status_t configure(const acl_matmul_conf_t &amp) {
3434
if (!acl_obj_) return status::out_of_memory;
35-
acl_obj_->src_tensor.allocator()->init(amp.src_tensor_info);
36-
acl_obj_->wei_tensor.allocator()->init(amp.wei_tensor_info);
37-
acl_obj_->dst_tensor.allocator()->init(amp.dst_tensor_info);
35+
acl_obj_->src_tensor.allocator()->init(amp.src_info);
36+
acl_obj_->wei_tensor.allocator()->init(amp.wei_info);
37+
acl_obj_->dst_tensor.allocator()->init(amp.dst_info);
3838
// Configure transpose kernel for src, wei or both
3939
if (amp.is_transA) {
4040
acl_obj_->src_acc_tensor.allocator()->init(amp.src_acc_info);
4141
acl_obj_->transA.configure(
4242
&acl_obj_->src_acc_tensor, &acl_obj_->src_tensor);
4343
}
44+
if (amp.is_transB) {
45+
acl_obj_->wei_acc_tensor.allocator()->init(amp.wei_acc_info);
46+
acl_obj_->transB.configure(
47+
&acl_obj_->wei_acc_tensor, &acl_obj_->wei_tensor);
48+
}
4449
// Configure GEMM
4550
acl_obj_->gemm.configure(&acl_obj_->src_tensor, &acl_obj_->wei_tensor,
4651
nullptr, &acl_obj_->dst_tensor, amp.alpha, 0.0f, amp.gemm_info);
@@ -78,9 +83,7 @@ struct acl_matmul_t : public primitive_t {
7883
&& platform::has_data_type_support(data_type::f16);
7984
bool ok = is_dense_data()
8085
&& utils::one_of(true, is_fp32_ok, is_fp16_ok)
81-
&& !has_zero_dim_memory()
82-
&& weights_md_.format_kind == format_kind::any
83-
&& set_default_formats()
86+
&& !has_zero_dim_memory() && set_default_formats()
8487
&& attr()->has_default_values(
8588
smask_t::oscale | smask_t::post_ops)
8689
&& attr_oscale_ok() && !has_runtime_dims_or_strides();
@@ -95,9 +98,9 @@ struct acl_matmul_t : public primitive_t {
9598
amp_.use_dst_acc = post_ops.has_sum();
9699

97100
// Validate ACL GEMM
98-
ACL_CHECK_VALID(arm_compute::NEGEMM::validate(&amp_.src_tensor_info,
99-
&amp_.wei_tensor_info, nullptr, &amp_.dst_tensor_info,
100-
amp_.alpha, 0.0f, amp_.gemm_info));
101+
ACL_CHECK_VALID(arm_compute::NEGEMM::validate(&amp_.src_info,
102+
&amp_.wei_info, nullptr, &amp_.dst_info, amp_.alpha, 0.0f,
103+
amp_.gemm_info));
101104

102105
return status::success;
103106
}

src/cpu/acl/matmul/acl_matmul_utils.cpp

Lines changed: 20 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@ status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
4141
const dim_t src_batch = helper.src_batch();
4242
const dim_t wei_batch = helper.wei_batch();
4343

44-
// We can only broadcast on one of src or wei at once
4544
// ACL supports broadcast for 3D shapes, and 4D shapes
4645
// for e.g when ab in abcd is 1x1
4746
bool batch_ok = IMPLICATION(src_batch > 1, wei_batch == 1)
@@ -54,18 +53,19 @@ status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
5453
bool with_bias = md.bias_desc.format_kind != format_kind::undef;
5554
ACL_CHECK_SUPPORT(with_bias, "ACL does not support bias for matmul");
5655

57-
// The two innermost dimensions can be transposed, but the batch dimensions
58-
// must be the outermost
5956
using namespace format_tag;
6057
auto src_tag = memory_desc_matches_one_of_tag(
6158
src_md, abcd, abdc, abc, acb, ab, ba);
59+
auto wei_tag = memory_desc_matches_one_of_tag(
60+
wei_md, abcd, abdc, abc, acb, ab, ba);
6261
auto dst_tag = memory_desc_matches_one_of_tag(dst_md, abcd, abc, ab, ba);
63-
ACL_CHECK_SUPPORT(utils::one_of(format_tag::undef, src_tag, dst_tag),
62+
ACL_CHECK_SUPPORT(
63+
utils::one_of(format_tag::undef, src_tag, wei_tag, dst_tag),
6464
"Format tag is undefined");
6565

66-
// Transpose A (src)
66+
// Transpose A (src) or B (wei)
6767
amp.is_transA = helper.transA() == 'T';
68-
68+
amp.is_transB = helper.transB() == 'T';
6969
auto acl_src_data_t = acl_utils::get_acl_data_t(src_md.data_type);
7070
auto acl_wei_data_t = acl_utils::get_acl_data_t(wei_md.data_type);
7171
auto acl_dst_data_t = acl_utils::get_acl_data_t(dst_md.data_type);
@@ -74,14 +74,21 @@ status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
7474
amp.src_acc_info = arm_compute::TensorInfo(
7575
arm_compute::TensorShape(M, K, 1, src_batch), 1,
7676
acl_src_data_t);
77+
if (amp.is_transB)
78+
amp.wei_acc_info = arm_compute::TensorInfo(
79+
arm_compute::TensorShape(K, N, wei_batch), 1, acl_wei_data_t);
7780

78-
amp.src_tensor_info = arm_compute::TensorInfo(
81+
amp.src_info = arm_compute::TensorInfo(
7982
arm_compute::TensorShape(K, M, 1, src_batch), 1, acl_src_data_t);
80-
amp.wei_tensor_info = arm_compute::TensorInfo(
83+
amp.wei_info = arm_compute::TensorInfo(
8184
arm_compute::TensorShape(N, K, wei_batch), 1, acl_wei_data_t);
82-
amp.dst_tensor_info = arm_compute::TensorInfo(
85+
amp.dst_info = arm_compute::TensorInfo(
8386
arm_compute::TensorShape(N, M, 1, dst_batch), 1, acl_dst_data_t);
8487

88+
bool is_fastmath_enabled = utils::one_of(
89+
attr.fpmath_mode_, fpmath_mode::bf16, fpmath_mode::any);
90+
amp.gemm_info.set_fast_math(is_fastmath_enabled);
91+
8592
// Set alpha (output scaling)
8693
// TODO: Add runtime scales support. Creation time scales will be remove
8794
// in 3.0.
@@ -91,45 +98,10 @@ status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
9198
// Validate ACL transpose
9299
if (amp.is_transA)
93100
ACL_CHECK_VALID(arm_compute::NETranspose::validate(
94-
&amp.src_acc_info, &amp.src_tensor_info));
95-
96-
bool is_fastmath_enabled = utils::one_of(
97-
attr.fpmath_mode_, fpmath_mode::bf16, fpmath_mode::any);
98-
amp.gemm_info.set_fast_math(is_fastmath_enabled);
99-
100-
amp.gemm_info.set_fixed_format(true);
101-
102-
// WeightFormat::ANY tells ACL we can handle any format
103-
amp.gemm_info.set_weight_format(arm_compute::WeightFormat::ANY);
104-
105-
// Get the format that the ACL kernel will expect the weights to be
106-
// in (if a kernel exists). Note that these are referred to as fixed format
107-
// kernels, because they require one specific weights format
108-
arm_compute::WeightFormat expected_weight_format;
109-
ACL_CHECK_VALID(arm_compute::NEGEMM::has_opt_impl(expected_weight_format,
110-
&amp.src_tensor_info, &amp.wei_tensor_info, nullptr,
111-
&amp.dst_tensor_info, amp.alpha, 0.0f, amp.gemm_info));
112-
113-
// Set gemm weights info to the one returned by has_opt_impl
114-
amp.gemm_info.set_weight_format(expected_weight_format);
115-
116-
// has_opt_impl may return a non fast math kernel, even if we requested one
117-
amp.gemm_info.set_fast_math(
118-
arm_compute::is_fixed_format_fast_math(expected_weight_format));
119-
120-
// Logical dimension indices
121-
dim_t innermost_dim = wei_md.ndims - 1;
122-
dim_t N_dim = innermost_dim;
123-
dim_t K_dim = innermost_dim - 1;
124-
125-
// The logical indices of dimensions related to the batch, ordered from
126-
// innermost to outermost
127-
std::vector<dim_t> batch_dims = {};
128-
for (dim_t i = K_dim - 1; i >= 0; --i)
129-
batch_dims.push_back(i);
130-
131-
acl_utils::reorder_to_weight_format(amp.wei_tensor_info, wei_md,
132-
expected_weight_format, K_dim, N_dim, {}, batch_dims);
101+
&amp.src_acc_info, &amp.src_info));
102+
if (amp.is_transB)
103+
ACL_CHECK_VALID(arm_compute::NETranspose::validate(
104+
&amp.wei_acc_info, &amp.wei_info));
133105

134106
return status::success;
135107
}

src/cpu/acl/matmul/acl_matmul_utils.hpp

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*******************************************************************************
2-
* Copyright 2021-2023 Arm Ltd. and affiliates
2+
* Copyright 2021-2022 Arm Ltd. and affiliates
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -14,8 +14,8 @@
1414
* limitations under the License.
1515
*******************************************************************************/
1616

17-
#ifndef CPU_ACL_MATMUL_UTILS_HPP
18-
#define CPU_ACL_MATMUL_UTILS_HPP
17+
#ifndef CPU_AARCH64_ACL_MATMUL_UTILS_HPP
18+
#define CPU_AARCH64_ACL_MATMUL_UTILS_HPP
1919

2020
#include "cpu/matmul/cpu_matmul_pd.hpp"
2121

@@ -29,21 +29,25 @@ namespace acl {
2929
struct acl_matmul_obj_t {
3030
arm_compute::NEGEMM gemm;
3131
arm_compute::NETranspose transA;
32+
arm_compute::NETranspose transB;
3233
arm_compute::Tensor src_tensor;
3334
arm_compute::Tensor src_acc_tensor;
3435
arm_compute::Tensor wei_tensor;
36+
arm_compute::Tensor wei_acc_tensor;
3537
arm_compute::Tensor dst_tensor;
3638
};
3739

3840
struct acl_matmul_conf_t {
3941
bool is_transA;
42+
bool is_transB;
4043
// If this is true, the result of the matmul goes into a temporarily
4144
// allocated ACL tensor to be accumulated into the oneDNN dst during postops
4245
bool use_dst_acc;
43-
arm_compute::TensorInfo src_tensor_info;
46+
arm_compute::TensorInfo src_info;
4447
arm_compute::TensorInfo src_acc_info;
45-
arm_compute::TensorInfo wei_tensor_info;
46-
arm_compute::TensorInfo dst_tensor_info;
48+
arm_compute::TensorInfo wei_info;
49+
arm_compute::TensorInfo wei_acc_info;
50+
arm_compute::TensorInfo dst_info;
4751
arm_compute::GEMMInfo gemm_info;
4852
float alpha;
4953
};
@@ -61,4 +65,4 @@ status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
6165
} // namespace impl
6266
} // namespace dnnl
6367

64-
#endif // CPU_ACL_MATMUL_UTILS_HPP
68+
#endif // CPU_AARCH64_ACL_MATMUL_UTILS_HPP

0 commit comments

Comments (0)