From 8f92bfc94cd72fe0374ab3be197d7364342bd912 Mon Sep 17 00:00:00 2001
From: zhangqiu <zhangqiu1994@outlook.com>
Date: Thu, 8 Aug 2024 11:13:36 +0800
Subject: [PATCH 1/5] fix index_put

---
 impl/ascend/aclnn/adaptor.hpp       |  12 +-
 impl/ascend/ascend_tensor.cpp       | 104 ++++++++-
 impl/ascend/ascend_tensor.hpp       |   6 +-
 impl/ascend/device_configs.py       |   5 +
 impl/ascend/functions/index.cpp     | 320 ++++++++++++++++++++++++++++
 impl/ascend/functions/index_put.cpp | 291 ++++++++++++++++++++++++-
 impl/ascend_npu/CMakeLists.txt      |   1 +
 impl/ascend_npu/ascend_config.yaml  |   4 +-
 8 files changed, 733 insertions(+), 10 deletions(-)
 create mode 100644 impl/ascend/functions/index.cpp
diff --git a/impl/ascend/aclnn/adaptor.hpp b/impl/ascend/aclnn/adaptor.hpp
index 117423c78..f4881be30 100644
--- a/impl/ascend/aclnn/adaptor.hpp
+++ b/impl/ascend/aclnn/adaptor.hpp
@@ -149,6 +149,10 @@ struct IsBoolStdArray<std::array<bool, N>> : std::true_type {};
 
 inline aclIntArray* createAclIntArrayFromIntVector(const std::vector<int64_t>& vec) { return ::aclCreateIntArray(vec.data(), vec.size()); }
 
+inline aclTensorList* createAclTensorListFromAclTensorVector(const std::vector<aclTensor*>& tensorsVec) {
+    return ::aclCreateTensorList(tensorsVec.data(), tensorsVec.size());  // builds the list from already-created aclTensor handles; no conversion here
+}
+
 inline aclTensorList* createAclTensorListFromAscendTensorVector(const std::vector<AscendTensor>& tensorsVec) {
     std::vector<const aclTensor*> tList(tensorsVec.size());
     for (size_t i = 0; i < tensorsVec.size(); i++) {
@@ -175,7 +179,11 @@ inline aclTensorList* createAclTensorListFromConstDiopiTensorVector(const std::v
 
 template <class T, class U = std::remove_cv_t<std::remove_reference_t<T>>>
 decltype(auto) convertType(T&& param) {
-    if constexpr (std::is_same_v<U, AscendTensor>) {
+    if constexpr (std::is_same_v<U, aclTensor*>) {
+        return std::forward<T>(param);
+    } else if constexpr (std::is_same_v<U, std::vector<aclTensor*>>) {
+        return createAclTensorListFromAclTensorVector(std::forward<T>(param));
+    } else if constexpr (std::is_same_v<U, AscendTensor>) {
         return createAclTensorFromAscendTensor(std::forward<T>(param));
     } else if constexpr (std::is_same_v<U, diopiTensorHandle_t> || std::is_same_v<U, diopiConstTensorHandle_t>) {
         return createAclTensorFromDiopiTensor(std::forward<T>(param));
@@ -385,4 +393,4 @@ void callAclnnImpl(diopiContextHandle_t ctx, const std::tuple<Args...>& tuple) {
         DIOPI_ASECND_CALL_ACLNN_TYPE_SYNC(api, ctx, convertedParams.params())             \
     } while (false);
 
-#endif  // IMPL_ASCEND_ACLNN_ADAPTOR_HPP_
+#endif  // IMPL_ASCEND_ACLNN_ADAPTOR_HPP_
\ No newline at end of file
diff --git a/impl/ascend/ascend_tensor.cpp b/impl/ascend/ascend_tensor.cpp
index e966bc5f4..d71d5b941 100644
--- a/impl/ascend/ascend_tensor.cpp
+++ b/impl/ascend/ascend_tensor.cpp
@@ -6,9 +6,11 @@
 
 #include "ascend_tensor.hpp"
 
+// #include <algorithm>
 #include <array>
 #include <cstdint>
 #include <mutex>
+#include <numeric>
 #include <utility>
 
 #include "common/debug.hpp"
@@ -82,6 +84,106 @@ AscendTensor& AscendTensor::asStrided(const std::vector<int64_t>& shape, const s
     return *this;
 }
 
+AscendTensor& AscendTensor::permute(std::vector<int64_t> dims) {
+    ASCEND_CHECK_ABORT(this->dim() == dims.size(), "permute dims does not match the tensor dims.");
+    // Reorders the dimensions in place: new dimension i takes the size and stride of old dimension dims[i].
+    std::vector<int64_t> newShape(dims.size(), 0);
+    std::vector<int64_t> newStride(dims.size(), 0);
+
+    for (size_t i = 0; i < dims.size(); i++) {
+        newShape[i] = this->shape(dims[i]);
+        newStride[i] = this->stride(dims[i]);
+    }
+    // Only shape_ and stride_ are rewritten here.
+    this->shape_ = newShape;
+    this->stride_ = newStride;
+
+    return *this;
+}
+
+AscendTensor& AscendTensor::expand(std::vector<int64_t> shape) {
+    ASCEND_CHECK_ABORT(shape.size() >= this->dim(),
+                       "the number of sizes provided[% ld] must be greater or eaqual to the number of dimensions of the tensor[% ld].",
+                       shape.size(),
+                       this->dim());
+    // Broadcasts this tensor in place to `shape`: -1 keeps an existing size; new leading and stretched dimensions get stride 0.
+    // todo: dim() == 0
+    int64_t expandDims = shape.size() - this->shape().size();
+    std::vector<int64_t> tShapeExp(expandDims, 0);
+    auto tShape = this->shape();
+    tShapeExp.insert(tShapeExp.end(), tShape.begin(), tShape.end());
+    std::vector<int64_t> newShape = shape;
+
+    for (int64_t i = 0; i < newShape.size(); i++) {
+        if (newShape[i] < 0 && i < expandDims) {
+            ASCEND_CHECK_ABORT(false, "The expanded size of the tensor (%ld) isn't allowed in a leading, non-existing dimension %ld", newShape[i], i);
+        }
+
+        if (i >= expandDims) {
+            if (newShape[i] == -1) {
+                newShape[i] = tShapeExp[i];
+            } else {
+                ASCEND_CHECK_ABORT(tShapeExp[i] == 1 || newShape[i] == tShapeExp[i],
+                                   "The expanded size of the tensor (%ld) must match the existing size (%ld) at non-singleton dimension %ld.",
+                                   newShape[i],
+                                   tShapeExp[i],
+                                   i);
+            }
+        }
+    }
+    // Seed with int64_t{1}: a plain `1` makes std::accumulate multiply in int, which overflows for large tensors.
+    int64_t numElem = std::accumulate(newShape.begin(), newShape.end(), int64_t{1}, std::multiplies<>());
+    std::vector<int64_t> newStride(expandDims, 0);
+    auto tStride = this->stride();
+    newStride.insert(newStride.end(), tStride.begin(), tStride.end());
+    for (int64_t i = expandDims; i < shape.size(); i++) {
+        if (shape[i] == -1 || shape[i] == tShapeExp[i]) {
+            continue;
+        } else {
+            newStride[i] = 0;
+        }
+    }
+
+    this->numel_ = numElem;
+    this->shape_ = newShape;
+    this->stride_ = newStride;
+
+    return *this;
+}
+
+AscendTensor& AscendTensor::resize(const std::vector<int64_t>& shape) {
+    int64_t numElem = std::accumulate(shape.begin(), shape.end(), int64_t{1}, std::multiplies<>());
+    std::vector<int64_t> stride(shape.size(), 1);
+    for (int64_t j = shape.size() - 2; j >= 0; j--) {
+        stride[j] = stride[j + 1] * shape[j + 1];
+    }
+    // numElem is seeded with int64_t{1}; a plain `1` would make std::accumulate multiply in int and overflow.
+    this->numel_ = numElem;
+    this->shape_ = shape;
+    this->stride_ = stride;
+    // Only the metadata changed: `shape` is now described with contiguous row-major strides.
+    return *this;
+}
+AscendTensor& AscendTensor::select(int64_t dim, int64_t index) {
+    auto shape = this->shape();
+    auto stride = this->stride();
+
+    ASCEND_CHECK_ABORT(dim >= 0 && dim < shape.size(), "selected dim [%ld] execeed the tensor dims [%ld].", dim, shape.size());
+    ASCEND_CHECK_ABORT(index >= 0 && index < shape[dim], "selected index [%ld] execeed the dim size [%ld].", index, shape[dim]);
+
+    // In-place view of slice `index` along `dim`: advance the storage offset by index * stride[dim]
+    // (for every dim, the last one included, and on top of any existing offset), then drop the dimension.
+    this->storageOffset_ += index * stride[dim];
+    this->numel_ /= shape[dim];
+
+    shape.erase(shape.begin() + dim);
+    stride.erase(stride.begin() + dim);
+    this->shape_ = shape;
+    this->stride_ = stride;
+
+    return *this;
+}
+
 AscendTensor& AscendTensor::unsqueeze(int dim) {
     // Note: `channels_last` tensor uses this will become uncontiguous
     // which is same with pytorch
@@ -240,4 +342,4 @@ aclFormat inferAclDataFormat(int64_t dim, const int64_t* shape, const int64_t* s
     return ACL_FORMAT_ND;
 }
 }  // namespace ascend
-}  // namespace impl
+}  // namespace impl
\ No newline at end of file
diff --git a/impl/ascend/ascend_tensor.hpp b/impl/ascend/ascend_tensor.hpp
index 5c20faab4..20b29b6f5 100644
--- a/impl/ascend/ascend_tensor.hpp
+++ b/impl/ascend/ascend_tensor.hpp
@@ -245,6 +245,10 @@ class AscendTensor final {
     AscendTensor& asStrided(const std::vector<int64_t>& shape, const std::vector<int64_t>& stride);
     AscendTensor& unsqueeze(int dim);
     AscendTensor& view(const std::vector<int64_t>& shape);
+    AscendTensor& resize(const std::vector<int64_t>& shape);
+    AscendTensor& select(int64_t dim, int64_t index);
+    AscendTensor& permute(std::vector<int64_t> dims);
+    AscendTensor& expand(std::vector<int64_t> shape);
 
 private:
     // diopi origin tensor
@@ -262,4 +266,4 @@ class AscendTensor final {
 }  // namespace ascend
 }  // namespace impl
 
-#endif  // IMPL_ASCEND_ASCEND_TENSOR_HPP_
+#endif  // IMPL_ASCEND_ASCEND_TENSOR_HPP_
\ No newline at end of file
diff --git a/impl/ascend/device_configs.py b/impl/ascend/device_configs.py
index 1377c420e..9df743ff6 100755
--- a/impl/ascend/device_configs.py
+++ b/impl/ascend/device_configs.py
@@ -1177,6 +1177,11 @@
         skip_all=True
     ),
     
+    'index_put_acc_bool_indices_zeros': dict(
+        name=['index_put'],
+        skip_all=True
+    ),
+    
     # TODO(zhangqiu) Due to a bug in the software stack, this test will be skipped for now.
     'embedding': dict(
         name=['embedding'],
diff --git a/impl/ascend/functions/index.cpp b/impl/ascend/functions/index.cpp
new file mode 100644
index 000000000..b9cf3c81b
--- /dev/null
+++ b/impl/ascend/functions/index.cpp
@@ -0,0 +1,320 @@
+/**
+ * @file
+ * @author DeepLink
+ * @copyright  (c) 2024, DeepLink.
+ */
+
+#include <ostream>
+
+#include "../aclnn/acl_scalar.hpp"
+#include "../aclnn/adaptor.hpp"
+
+namespace impl {
+namespace ascend {
+
+static std::vector<AscendTensor> castIntIndicesToLongIndices(diopiContextHandle_t ctx, std::vector<AscendTensor>& indices) {
+    std::vector<AscendTensor> result;  // one entry per input index, in the same order
+    for (auto& t : indices) {
+        if (!t.defined()) {
+            result.emplace_back(nullptr);  // undefined indices keep their slot
+            continue;
+        }
+        if (t.dtype() == diopi_dtype_int32) {  // int32 indices are cast into a newly required int64 tensor
+            diopiTensorHandle_t indexHandle = nullptr;
+            auto shape = t.shape();
+            diopiSize_t size = vectorToDiopiSize(shape);
+            diopiRequireTensor(ctx, &indexHandle, &size, nullptr, diopi_dtype_int64, diopi_device);
+            DIOPI_ASCEND_CALL_ACLNN(aclnnCast, ctx, t, diopi_dtype_int64, indexHandle);
+            result.emplace_back(indexHandle);
+        } else {
+            if (t.device() == diopi_host) {  // other dtypes pass through; host tensors go through hostToDevice first
+                result.emplace_back(hostToDevice(ctx, t.tensorHandle()));
+            } else {
+                result.emplace_back(t);
+            }
+        }
+    }
+    return result;
+}
+
+static void checkIndexTensorTypes(const std::vector<AscendTensor>& indices) {
+    for (const auto& t : indices) {  // abort on the first defined index whose dtype is not int64, bool or uint8
+        if (t.defined()) {           // undefined entries are not checked
+            diopiDtype_t type = t.dtype();
+            ASCEND_CHECK_ABORT(type == diopi_dtype_int64 || type == diopi_dtype_bool || type == diopi_dtype_uint8,
+                               "tensors used as indices must be long, byte or bool tensors");
+        }
+    }
+}
+
+static AscendTensor nonZeroTensor(diopiContextHandle_t ctx, const AscendTensor& self) {
+    int64_t numELem = self.numel() * self.dim();
+    std::vector<int64_t> nShape{self.numel(), self.dim()};
+    std::vector<int64_t> nStride(nShape.size(), 1);
+    for (int64_t i = nShape.size() - 2; i >= 0; i--) {
+        nStride[i] = nStride[i + 1] * nShape[i + 1];
+    }
+    // Worst case every element is non-zero, so require a [numel, dim] int64 buffer for the coordinates.
+    diopiTensorHandle_t nzBuff = nullptr;
+    diopiSize_t nzBuffSize = vectorToDiopiSize(nShape);
+    diopiRequireTensor(ctx, &nzBuff, &nzBuffSize, nullptr, diopi_dtype_int64, diopi_device);
+    AscendTensor nzTensor(nzBuff);
+    // A hand-built aclTensor over that buffer is passed to aclnnNonzero so its view shape can be read back afterwards.
+    auto aclNZTensor = ::aclCreateTensor(
+        nShape.data(), nShape.size(), aclDataType::ACL_INT64, nStride.data(), 0, aclFormat::ACL_FORMAT_ND, &numELem, 1, const_cast<void*>(nzTensor.data()));
+    DIOPI_ASCEND_CALL_ACLNN(aclnnNonzero, ctx, self, aclNZTensor);
+
+    int64_t* vDims = nullptr;
+    uint64_t vDimsNum = 0;
+    auto ret = aclGetViewShape(aclNZTensor, &vDims, &vDimsNum);
+    ASCEND_CHECK_ABORT(ret == 0, "NonZero aclGetViewShape failed.");
+
+    std::vector<int64_t> nzShape(vDims, vDims + vDimsNum);
+    nzTensor = nzTensor.resize(nzShape);
+    // aclGetViewShape hands back an array that the caller frees, so it must be released with delete[], not delete.
+    delete[] vDims;
+    vDims = nullptr;
+    // Permute the [nzShape[0], nzShape[1]] result with dims {1, 0} into a new tensor, which is what gets returned.
+    diopiTensorHandle_t nzTrans = nullptr;
+    std::vector<int64_t> nzTransShape{nzShape[1], nzShape[0]};
+    diopiSize_t nzTransSize = vectorToDiopiSize(nzTransShape);
+    diopiRequireTensor(ctx, &nzTrans, &nzTransSize, nullptr, diopi_dtype_int64, diopi_device);
+    std::vector<int64_t> transDims{1, 0};
+    diopiSize_t permuteDims = vectorToDiopiSize(transDims);
+    DIOPI_ASCEND_CALL_ACLNN(aclnnPermute, ctx, nzTensor, permuteDims, nzTrans);
+
+    return AscendTensor(nzTrans);
+}
+
+static std::vector<AscendTensor> expandIndicesTensors(diopiContextHandle_t ctx, const AscendTensor& self, const std::vector<AscendTensor>& indices) {
+    std::vector<AscendTensor> result;  // masks are replaced by one coordinate index per mask dimension
+    for (auto& t : indices) {
+        if (!t.defined()) {
+            result.push_back(t);
+        } else {
+            if (t.dtype() == diopi_dtype_uint8 || t.dtype() == diopi_dtype_bool) {  // only the uint8 form should be reported as deprecated
+                ASCEND_CHECK(t.dtype() != diopi_dtype_uint8,
+                             "indexing with dtype torch.uint8 is now deprecated,"
+                             " please use a dtype torch.bool instead.");
+                for (uint64_t j = 0; j < static_cast<uint64_t>(t.dim()); j++) {
+                    uint64_t srcIdx = result.size() + j;  // dimension of self that mask dimension j lines up with
+                    ASCEND_CHECK_ABORT(t.shape(j) == self.shape(srcIdx),
+                                       "The shape of the mask  %ld at index  %ld does not match the shape of the indexed tensor %ld at index %ld",
+                                       t.dim(),
+                                       j,
+                                       self.dim(),
+                                       srcIdx);
+                }
+                AscendTensor non = nonZeroTensor(ctx, t);  // one row of coordinates per mask dimension (see nonZeroTensor)
+                for (int64_t j = 0; j < t.dim(); j++) {
+                    result.push_back(AscendTensor(non).select(0, j));  // select() edits its object in place, so slice a fresh copy per row
+                }
+            } else {
+                result.push_back(t);
+            }
+        }
+    }
+    return result;
+}
+
+static aclTensor* createEmptyAclTensor() {
+    std::vector<int64_t> nShape{0};
+    std::vector<int64_t> nStride{1};
+    int64_t storageSize = 0;
+    void* storage = nullptr;
+    // Zero-element 1-D FLOAT16 tensor with no storage; callers in this file pass it where an index is undefined.
+    return ::aclCreateTensor(nShape.data(), nShape.size(), aclDataType::ACL_FLOAT16, nStride.data(), 0, aclFormat::ACL_FORMAT_ND, &storageSize, 0, storage);
+}
+
+static std::vector<AscendTensor> indicesExpandedOutplace(std::vector<AscendTensor> indices) {
+    bool first = true;
+    std::vector<int64_t> sizes;
+    // First pass: fold the shapes of all defined indices through inferSize to obtain one common shape.
+    for (auto& idx : indices) {
+        if (!idx.defined()) {
+            continue;
+        } else if (first) {
+            sizes = idx.shape();
+            first = false;
+        } else {
+            sizes = inferSize(sizes, idx.shape());
+        }
+    }
+    // Second pass: expand every defined index that does not already have that shape; the rest are kept as is.
+    std::vector<AscendTensor> result;
+    for (auto& idx : indices) {
+        if (!idx.defined() || (idx.shape() == sizes)) {
+            result.push_back(idx);
+        } else {
+            result.push_back(idx.expand(sizes));  // expand() edits idx, an element of the by-value `indices` copy
+        }
+    }
+    return result;
+}
+
+static bool hasContiguousSubspace(std::vector<AscendTensor> indices) {  // true if all the non-null tensors are adjacent
+    auto isDefined = [](const AscendTensor& tensor) { return tensor.defined(); };
+    auto isNull = [](const AscendTensor& tensor) { return !tensor.defined(); };
+    auto start = std::find_if(indices.begin(), indices.end(), isDefined);   // first defined index
+    auto stop = std::find_if(indices.rbegin(), indices.rend(), isDefined);  // last defined index, found from the back
+    auto it = std::find_if(start, stop.base(), isNull);                     // any undefined entry between the two is a gap
+    return it == stop.base();
+}
+
+static std::tuple<AscendTensor, std::vector<AscendTensor>> transposeToFront(AscendTensor self, std::vector<AscendTensor> indices) {
+    std::vector<int64_t> dims;
+    std::vector<AscendTensor> transposedIndices;
+    // Defined indices (and their dimension numbers) go first, undefined ones after, each group in original order.
+    dims.reserve(self.dim());
+    for (int64_t i = 0; i < self.dim(); i++) {  // reads indices[i] for every i < self.dim(), so `indices` must be that long
+        if (indices[i].defined()) {
+            dims.push_back(i);
+            transposedIndices.push_back(indices[i]);
+        }
+    }
+
+    for (int64_t i = 0; i < self.dim(); i++) {
+        if (!indices[i].defined()) {
+            dims.push_back(i);
+            transposedIndices.push_back(indices[i]);
+        }
+    }
+    // `self` is a by-value parameter, so permute() rewrites this local copy's shape/stride.
+    return std::make_tuple(self.permute(dims), transposedIndices);
+}
+
+static std::vector<int64_t> indexReshape(std::vector<AscendTensor> endIndices, int64_t dimsBefore, int64_t dimsAfter) {
+    std::vector<int64_t> indexShape;  // stays empty when no index is defined
+    for (auto& idx : endIndices) {
+        if (idx.defined()) {
+            std::vector<int64_t> shape;  // idx's shape padded with dimsBefore leading and dimsAfter trailing size-1 dims
+            shape.insert(shape.end(), dimsBefore, 1);
+            shape.insert(shape.end(), idx.shape().begin(), idx.shape().end());
+            shape.insert(shape.end(), dimsAfter, 1);
+            if (indexShape.empty()) {
+                indexShape = shape;
+            } else {
+                indexShape = inferSize(indexShape, shape);  // combine with the shapes seen so far
+            }
+        }
+    }
+    return indexShape;
+}
+
+static std::vector<int64_t> indexOutputSize(const AscendTensor& self, std::vector<AscendTensor>& indices) {
+    std::vector<AscendTensor> midIndices = indicesExpandedOutplace(indices);
+    while (midIndices.size() < (size_t)self.dim()) {  // pad with undefined indices up to self.dim() entries
+        midIndices.emplace_back(nullptr);
+    }
+    // When the defined indices are not adjacent, move them (and the matching dims of a copy of self) to the front.
+    AscendTensor src = self;
+    std::vector<AscendTensor> endIndices = midIndices;
+    if (!hasContiguousSubspace(midIndices)) {
+        endIndices.clear();
+        std::tie(src, endIndices) = transposeToFront(self, midIndices);
+    }
+    // Count undefined dims before/after the indexed run; replaceShape ends up as the shape of the last defined index.
+    int64_t dimsBefore = 0;
+    int64_t dimsAfter = 0;
+    int64_t dimsIndexed = 0;
+
+    std::vector<int64_t> replaceShape;
+    std::vector<int64_t> indexedSizes;
+
+    for (size_t dim = 0; dim < endIndices.size(); dim++) {
+        if (!endIndices[dim].defined()) {
+            if (dimsIndexed == 0) {
+                dimsBefore++;
+            } else {
+                dimsAfter++;
+            }
+        } else {
+            dimsIndexed++;
+            replaceShape = endIndices[dim].shape();
+            indexedSizes.push_back(src.shape(dim));
+        }
+    }
+    // Abort when an indexed dim has size 0 while the index shape itself contains no 0.
+    if (std::find(indexedSizes.begin(), indexedSizes.end(), 0) != indexedSizes.end() &&
+        std::find(replaceShape.begin(), replaceShape.end(), 0) == replaceShape.end()) {
+        ASCEND_CHECK_ABORT(false, "index is out of bounds for dimension with size 0");
+    }
+    // Swap the indexed dims of the source shape for replaceShape, then combine with the padded index shape.
+    auto selfShape = src.shape();
+    int64_t end = dimsBefore + dimsIndexed;
+    selfShape.erase(selfShape.begin() + dimsBefore, selfShape.begin() + end);
+    selfShape.insert(selfShape.begin() + dimsBefore, replaceShape.begin(), replaceShape.end());
+
+    std::vector<int64_t> indexShape = indexReshape(endIndices, dimsBefore, dimsAfter);
+    std::vector<int64_t> outputSize = indexShape;
+    if (indexShape != selfShape) {
+        outputSize = inferSize(indexShape, selfShape);
+    }
+
+    return outputSize;
+}
+
+diopiError_t diopiIndex(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t* indices, int64_t nums) {
+    AscendTensor inputAt(input);
+    std::vector<AscendTensor> indicesOrigin(nums);  // null handles stay default-constructed entries
+    for (int64_t i = 0; i < nums; i++) {
+        if (indices[i] != nullptr) {
+            indicesOrigin[i] = AscendTensor(indices[i]);
+        }
+    }
+    // Normalise the indices (int32 -> int64, host -> device), check their dtypes, then turn masks into coordinate indices.
+    std::vector<AscendTensor> indicesList = castIntIndicesToLongIndices(ctx, indicesOrigin);
+    checkIndexTensorTypes(indicesList);
+
+    auto indicesExpanded = expandIndicesTensors(ctx, inputAt, indicesList);
+    // Undefined indices are all represented by the one shared empty aclTensor created below.
+    std::vector<aclTensor*> allDefinedIndices;
+    auto emptyTensor = createEmptyAclTensor();
+    for (const auto& idx : indicesExpanded) {
+        if (idx.defined()) {
+            allDefinedIndices.push_back(aclnn_adaptor::createAclTensorFromAscendTensor(idx));
+        } else {
+            allDefinedIndices.push_back(emptyTensor);
+        }
+    }
+    // *out is required here, sized by indexOutputSize and typed like the input.
+    std::vector<int64_t> outShape = indexOutputSize(inputAt, indicesExpanded);
+
+    diopiSize_t outSize = vectorToDiopiSize(outShape);
+    diopiRequireTensor(ctx, out, &outSize, nullptr, inputAt.dtype(), diopi_device);
+
+    DIOPI_ASCEND_CALL_ACLNN(aclnnIndex, ctx, inputAt, allDefinedIndices, *out);
+    return diopiSuccess;
+}
+
+diopiError_t diopiIndexBackward(diopiContextHandle_t ctx, diopiTensorHandle_t gradInput, diopiTensorHandle_t zerosLikeInput, diopiConstTensorHandle_t* indices,
+                                int64_t nums, diopiConstTensorHandle_t gradOutput) {
+    AscendTensor gradInputTensor(gradInput);
+    AscendTensor gradOutputTensor(gradOutput);
+    if (gradInputTensor.numel() == 0 || gradOutputTensor.numel() == 0) {  // nothing to scatter
+        return diopiSuccess;
+    }
+    // Replace each null index with a zero-element tensor (gradOutput's dtype) so the list has `nums` entries.
+    std::vector<diopiConstTensorHandle_t> indicesVec;
+    indicesVec.reserve(nums);
+
+    for (int i = 0; i < nums; i++) {
+        if (indices[i] != nullptr) {
+            indicesVec.emplace_back(indices[i]);
+        } else {
+            int64_t array[1] = {0};
+            diopiSize_t size = {array, 1};
+            diopiTensorHandle_t emptyTensor = nullptr;
+            diopiRequireTensor(ctx, &emptyTensor, &size, nullptr, gradOutputTensor.dtype(), diopi_device);
+            indicesVec.emplace_back(emptyTensor);
+        }
+    }
+    // Seed gradInput from zerosLikeInput, then index-put gradOutput into it with accumulate = true.
+    DIOPI_ASCEND_CALL_ACLNN(aclnnInplaceCopy, ctx, gradInput, zerosLikeInput);
+    DIOPI_ASCEND_CALL_ACLNN(aclnnIndexPutImpl, ctx, gradInput, indicesVec, gradOutput, true, false);
+
+    return diopiSuccess;
+}
+
+}  // namespace ascend
+}  // namespace impl
\ No newline at end of file
diff --git a/impl/ascend/functions/index_put.cpp b/impl/ascend/functions/index_put.cpp
index 3b01d6cfd..d3d0565c9 100755
--- a/impl/ascend/functions/index_put.cpp
+++ b/impl/ascend/functions/index_put.cpp
@@ -9,18 +9,301 @@
 
 namespace impl {
 namespace ascend {
+
+static std::vector<AscendTensor> castIntIndicesToLongIndices(diopiContextHandle_t ctx, std::vector<AscendTensor>& indices) {
+    std::vector<AscendTensor> result;  // one entry per input index, in the same order
+    for (auto& t : indices) {
+        if (!t.defined()) {
+            result.emplace_back(nullptr);  // undefined indices keep their slot
+            continue;
+        }
+        if (t.dtype() == diopi_dtype_int32) {  // int32 indices are cast into a newly required int64 tensor
+            diopiTensorHandle_t indexHandle = nullptr;
+            auto shape = t.shape();
+            diopiSize_t size = vectorToDiopiSize(shape);
+            diopiRequireTensor(ctx, &indexHandle, &size, nullptr, diopi_dtype_int64, diopi_device);
+            DIOPI_ASCEND_CALL_ACLNN(aclnnCast, ctx, t, diopi_dtype_int64, indexHandle);
+            result.emplace_back(indexHandle);
+        } else {
+            if (t.device() == diopi_host) {  // other dtypes pass through; host tensors go through hostToDevice first
+                result.emplace_back(hostToDevice(ctx, t.tensorHandle()));
+            } else {
+                result.emplace_back(t);
+            }
+        }
+    }
+    return result;
+}
+
+static void checkIndexTensorTypes(const std::vector<AscendTensor>& indices) {
+    for (const auto& t : indices) {  // abort on the first defined index whose dtype is not int64, bool or uint8
+        if (t.defined()) {           // undefined entries are not checked
+            diopiDtype_t type = t.dtype();
+            ASCEND_CHECK_ABORT(type == diopi_dtype_int64 || type == diopi_dtype_bool || type == diopi_dtype_uint8,
+                               "tensors used as indices must be long, byte or bool tensors");
+        }
+    }
+}
+
+static AscendTensor nonZeroTensor(diopiContextHandle_t ctx, const AscendTensor& self) {
+    int64_t numELem = self.numel() * self.dim();
+    std::vector<int64_t> nShape{self.numel(), self.dim()};
+    std::vector<int64_t> nStride(nShape.size(), 1);
+    for (int64_t i = nShape.size() - 2; i >= 0; i--) {
+        nStride[i] = nStride[i + 1] * nShape[i + 1];
+    }
+    // Worst case every element is non-zero, so require a [numel, dim] int64 buffer for the coordinates.
+    diopiTensorHandle_t nzBuff = nullptr;
+    diopiSize_t nzBuffSize = vectorToDiopiSize(nShape);
+    diopiRequireTensor(ctx, &nzBuff, &nzBuffSize, nullptr, diopi_dtype_int64, diopi_device);
+    AscendTensor nzTensor(nzBuff);
+    // A hand-built aclTensor over that buffer is passed to aclnnNonzero so its view shape can be read back afterwards.
+    auto aclNZTensor = ::aclCreateTensor(
+        nShape.data(), nShape.size(), aclDataType::ACL_INT64, nStride.data(), 0, aclFormat::ACL_FORMAT_ND, &numELem, 1, const_cast<void*>(nzTensor.data()));
+    DIOPI_ASCEND_CALL_ACLNN(aclnnNonzero, ctx, self, aclNZTensor);
+
+    int64_t* vDims = nullptr;
+    uint64_t vDimsNum = 0;
+    auto ret = aclGetViewShape(aclNZTensor, &vDims, &vDimsNum);
+    ASCEND_CHECK_ABORT(ret == 0, "NonZero aclGetViewShape failed.");
+
+    std::vector<int64_t> nzShape(vDims, vDims + vDimsNum);
+    nzTensor = nzTensor.resize(nzShape);
+    // aclGetViewShape hands back an array that the caller frees, so it must be released with delete[], not delete.
+    delete[] vDims;
+    vDims = nullptr;
+    // Permute the [nzShape[0], nzShape[1]] result with dims {1, 0} into a new tensor, which is what gets returned.
+    diopiTensorHandle_t nzTrans = nullptr;
+    std::vector<int64_t> nzTransShape{nzShape[1], nzShape[0]};
+    diopiSize_t nzTransSize = vectorToDiopiSize(nzTransShape);
+    diopiRequireTensor(ctx, &nzTrans, &nzTransSize, nullptr, diopi_dtype_int64, diopi_device);
+    std::vector<int64_t> transDims{1, 0};
+    diopiSize_t permuteDims = vectorToDiopiSize(transDims);
+    DIOPI_ASCEND_CALL_ACLNN(aclnnPermute, ctx, nzTensor, permuteDims, nzTrans);
+
+    return AscendTensor(nzTrans);
+}
+
+static std::vector<AscendTensor> expandIndicesTensors(diopiContextHandle_t ctx, const AscendTensor& self, const std::vector<AscendTensor>& indices) {
+    std::vector<AscendTensor> result;  // masks are replaced by one coordinate index per mask dimension
+    for (auto& t : indices) {
+        if (!t.defined()) {
+            result.push_back(t);
+        } else {
+            if (t.dtype() == diopi_dtype_uint8 || t.dtype() == diopi_dtype_bool) {  // only the uint8 form should be reported as deprecated
+                ASCEND_CHECK(t.dtype() != diopi_dtype_uint8,
+                             "indexing with dtype torch.uint8 is now deprecated,"
+                             " please use a dtype torch.bool instead.");
+                for (uint64_t j = 0; j < static_cast<uint64_t>(t.dim()); j++) {
+                    uint64_t srcIdx = result.size() + j;  // dimension of self that mask dimension j lines up with
+                    ASCEND_CHECK_ABORT(t.shape(j) == self.shape(srcIdx),
+                                       "The shape of the mask  %ld at index  %ld does not match the shape of the indexed tensor %ld at index %ld",
+                                       t.dim(),
+                                       j,
+                                       self.dim(),
+                                       srcIdx);
+                }
+                AscendTensor non = nonZeroTensor(ctx, t);  // one row of coordinates per mask dimension (see nonZeroTensor)
+                for (int64_t j = 0; j < t.dim(); j++) {
+                    result.push_back(AscendTensor(non).select(0, j));  // select() edits its object in place, so slice a fresh copy per row
+                }
+            } else {
+                result.push_back(t);
+            }
+        }
+    }
+    return result;
+}
+
+
+static aclTensor* createEmptyAclTensor() {
+    std::vector<int64_t> nShape{0};
+    std::vector<int64_t> nStride{1};
+    int64_t storageSize = 0;
+    void* storage = nullptr;
+    // Zero-element 1-D FLOAT16 tensor with no storage; callers in this file pass it where an index is undefined.
+    return ::aclCreateTensor(nShape.data(), nShape.size(), aclDataType::ACL_FLOAT16, nStride.data(), 0, aclFormat::ACL_FORMAT_ND, &storageSize, 0, storage);
+}
+
+static std::vector<AscendTensor> indicesExpandedOutplace(std::vector<AscendTensor> indices) {
+    bool first = true;
+    std::vector<int64_t> sizes;
+    // First pass: fold the shapes of all defined indices through inferSize to obtain one common shape.
+    for (auto& idx : indices) {
+        if (!idx.defined()) {
+            continue;
+        } else if (first) {
+            sizes = idx.shape();
+            first = false;
+        } else {
+            sizes = inferSize(sizes, idx.shape());
+        }
+    }
+    // Second pass: expand every defined index that does not already have that shape; the rest are kept as is.
+    std::vector<AscendTensor> result;
+    for (auto& idx : indices) {
+        if (!idx.defined() || (idx.shape() == sizes)) {
+            result.push_back(idx);
+        } else {
+            result.push_back(idx.expand(sizes));  // expand() edits idx, an element of the by-value `indices` copy
+        }
+    }
+    return result;
+}
+
+static bool hasContiguousSubspace(std::vector<AscendTensor> indices) {  // true if all the non-null tensors are adjacent
+    auto isDefined = [](const AscendTensor& tensor) { return tensor.defined(); };
+    auto isNull = [](const AscendTensor& tensor) { return !tensor.defined(); };
+    auto start = std::find_if(indices.begin(), indices.end(), isDefined);   // first defined index
+    auto stop = std::find_if(indices.rbegin(), indices.rend(), isDefined);  // last defined index, found from the back
+    auto it = std::find_if(start, stop.base(), isNull);                     // any undefined entry between the two is a gap
+    return it == stop.base();
+}
+
+static std::tuple<AscendTensor, std::vector<AscendTensor>> transposeToFront(AscendTensor self, std::vector<AscendTensor> indices) {
+    std::vector<int64_t> dims;
+    std::vector<AscendTensor> transposedIndices;
+    // Defined indices (and their dimension numbers) go first, undefined ones after, each group in original order.
+    dims.reserve(self.dim());
+    for (int64_t i = 0; i < self.dim(); i++) {  // reads indices[i] for every i < self.dim(), so `indices` must be that long
+        if (indices[i].defined()) {
+            dims.push_back(i);
+            transposedIndices.push_back(indices[i]);
+        }
+    }
+
+    for (int64_t i = 0; i < self.dim(); i++) {
+        if (!indices[i].defined()) {
+            dims.push_back(i);
+            transposedIndices.push_back(indices[i]);
+        }
+    }
+    // `self` is a by-value parameter, so permute() rewrites this local copy's shape/stride.
+    return std::make_tuple(self.permute(dims), transposedIndices);
+}
+
+static std::vector<int64_t> indexReshape(std::vector<AscendTensor> endIndices, int64_t dimsBefore, int64_t dimsAfter) {
+    std::vector<int64_t> indexShape;  // stays empty when no index is defined
+    for (auto& idx : endIndices) {
+        if (idx.defined()) {
+            std::vector<int64_t> shape;  // idx's shape padded with dimsBefore leading and dimsAfter trailing size-1 dims
+            shape.insert(shape.end(), dimsBefore, 1);
+            shape.insert(shape.end(), idx.shape().begin(), idx.shape().end());
+            shape.insert(shape.end(), dimsAfter, 1);
+            if (indexShape.empty()) {
+                indexShape = shape;
+            } else {
+                indexShape = inferSize(indexShape, shape);  // combine with the shapes seen so far
+            }
+        }
+    }
+    return indexShape;
+}
+
+static std::vector<int64_t> indexOutputSize(const AscendTensor& self, std::vector<AscendTensor>& indices) {
+    std::vector<AscendTensor> midIndices = indicesExpandedOutplace(indices);
+    while (midIndices.size() < (size_t)self.dim()) {  // pad with undefined indices up to self.dim() entries
+        midIndices.emplace_back(nullptr);
+    }
+    // When the defined indices are not adjacent, move them (and the matching dims of a copy of self) to the front.
+    AscendTensor src = self;
+    std::vector<AscendTensor> endIndices = midIndices;
+    if (!hasContiguousSubspace(midIndices)) {
+        endIndices.clear();
+        std::tie(src, endIndices) = transposeToFront(self, midIndices);
+    }
+    // Count undefined dims before/after the indexed run; replaceShape ends up as the shape of the last defined index.
+    int64_t dimsBefore = 0;
+    int64_t dimsAfter = 0;
+    int64_t dimsIndexed = 0;
+
+    std::vector<int64_t> replaceShape;
+    std::vector<int64_t> indexedSizes;
+
+    for (size_t dim = 0; dim < endIndices.size(); dim++) {
+        if (!endIndices[dim].defined()) {
+            if (dimsIndexed == 0) {
+                dimsBefore++;
+            } else {
+                dimsAfter++;
+            }
+        } else {
+            dimsIndexed++;
+            replaceShape = endIndices[dim].shape();
+            indexedSizes.push_back(src.shape(dim));
+        }
+    }
+    // Abort when an indexed dim has size 0 while the index shape itself contains no 0.
+    if (std::find(indexedSizes.begin(), indexedSizes.end(), 0) != indexedSizes.end() &&
+        std::find(replaceShape.begin(), replaceShape.end(), 0) == replaceShape.end()) {
+        ASCEND_CHECK_ABORT(false, "index is out of bounds for dimension with size 0");
+    }
+    // Swap the indexed dims of the source shape for replaceShape, then combine with the padded index shape.
+    auto selfShape = src.shape();
+    int64_t end = dimsBefore + dimsIndexed;
+    selfShape.erase(selfShape.begin() + dimsBefore, selfShape.begin() + end);
+    selfShape.insert(selfShape.begin() + dimsBefore, replaceShape.begin(), replaceShape.end());
+
+    std::vector<int64_t> indexShape = indexReshape(endIndices, dimsBefore, dimsAfter);
+    std::vector<int64_t> outputSize = indexShape;
+    if (indexShape != selfShape) {
+        outputSize = inferSize(indexShape, selfShape);
+    }
+
+    return outputSize;
+}
+
 diopiError_t diopiIndexPut(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t values,
                            diopiConstTensorHandle_t* indices, int64_t indicesCounts, bool accumulate) {
     diopiCopyInp(ctx, input, out);
-    std::vector<diopiConstTensorHandle_t> indicesVec(indices, indices + indicesCounts);
-    DIOPI_ASCEND_CALL_ACLNN(aclnnIndexPutImpl, ctx, out, indicesVec, values, accumulate, false);
+    AscendTensor inputAt(input);
+    std::vector<AscendTensor> indicesOrigin(indicesCounts);  // null handles stay default-constructed entries
+    for (int64_t i = 0; i < indicesCounts; i++) {
+        if (indices[i] != nullptr) {
+            indicesOrigin[i] = AscendTensor(indices[i]);
+        }
+    }
+    std::vector<AscendTensor> indicesList = castIntIndicesToLongIndices(ctx, indicesOrigin);  // int32 -> int64, host -> device
+    checkIndexTensorTypes(indicesList);
+    auto indicesExpanded = expandIndicesTensors(ctx, inputAt, indicesList);  // masks become coordinate indices
+    std::vector<aclTensor*> allDefinedIndices;
+    auto emptyTensor = createEmptyAclTensor();  // shared stand-in for every undefined index
+    for (const auto& idx : indicesExpanded) {
+        if (idx.defined()) {
+            allDefinedIndices.push_back(aclnn_adaptor::createAclTensorFromAscendTensor(idx));
+        } else {
+            allDefinedIndices.push_back(emptyTensor);
+        }
+    }
+    // `out` already holds a copy of `input` (diopiCopyInp above); the index-put is applied to `out`.
+    DIOPI_ASCEND_CALL_ACLNN(aclnnIndexPutImpl, ctx, out, allDefinedIndices, values, accumulate, false);
     return diopiSuccess;
+
 }
 
 diopiError_t diopiIndexPutInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t values, diopiConstTensorHandle_t* indices,
                               int64_t indicesCounts, bool accumulate) {
-    std::vector<diopiConstTensorHandle_t> indicesVec(indices, indices + indicesCounts);
-    DIOPI_ASCEND_CALL_ACLNN(aclnnIndexPutImpl, ctx, input, indicesVec, values, accumulate, false);
+    AscendTensor inputAt(input);
+    std::vector<AscendTensor> indicesOrigin(indicesCounts);  // null handles stay default-constructed entries
+    for (int64_t i = 0; i < indicesCounts; i++) {
+        if (indices[i] != nullptr) {
+            indicesOrigin[i] = AscendTensor(indices[i]);
+        }
+    }
+    std::vector<AscendTensor> indicesList = castIntIndicesToLongIndices(ctx, indicesOrigin);  // int32 -> int64, host -> device
+    checkIndexTensorTypes(indicesList);
+    auto indicesExpanded = expandIndicesTensors(ctx, inputAt, indicesList);  // masks become coordinate indices
+    std::vector<aclTensor*> allDefinedIndices;
+    auto emptyTensor = createEmptyAclTensor();  // shared stand-in for every undefined index
+    for (const auto& idx : indicesExpanded) {
+        if (idx.defined()) {
+            allDefinedIndices.push_back(aclnn_adaptor::createAclTensorFromAscendTensor(idx));
+        } else {
+            allDefinedIndices.push_back(emptyTensor);
+        }
+    }
+    // In-place variant: the index-put is applied directly to `input`.
+    DIOPI_ASCEND_CALL_ACLNN(aclnnIndexPutImpl, ctx, input, allDefinedIndices, values, accumulate, false);
     return diopiSuccess;
 }
 
diff --git a/impl/ascend_npu/CMakeLists.txt b/impl/ascend_npu/CMakeLists.txt
index ba7701105..84f285f0b 100755
--- a/impl/ascend_npu/CMakeLists.txt
+++ b/impl/ascend_npu/CMakeLists.txt
@@ -167,6 +167,7 @@ set(OLD_IMPL_SRC
     ${OLD_IMPL_DIR}/functions/arange.cpp
     ${OLD_IMPL_DIR}/functions/gather.cpp
     ${OLD_IMPL_DIR}/functions/layer_norm.cpp
+    ${OLD_IMPL_DIR}/functions/index.cpp
     ${OLD_IMPL_DIR}/functions/index_put.cpp
     ${OLD_IMPL_DIR}/functions/index_select.cpp
     ${OLD_IMPL_DIR}/functions/repeat.cpp
diff --git a/impl/ascend_npu/ascend_config.yaml b/impl/ascend_npu/ascend_config.yaml
index 9dbdec336..7def339c0 100755
--- a/impl/ascend_npu/ascend_config.yaml
+++ b/impl/ascend_npu/ascend_config.yaml
@@ -112,6 +112,8 @@ ascend:
 - diopiHardtanh
 - diopiHardtanhBackward
 - diopiHardtanhInp
+- diopiIndex
+- diopiIndexBackward
 - diopiIndexPut
 - diopiIndexPutInp
 - diopiIndexSelect
@@ -265,8 +267,6 @@ ascend_npu:
 - diopiApplyPenalty
 - diopiContextAttentionInference
 - diopiGetNativeMemoryFormat
-- diopiIndex
-- diopiIndexBackward
 - diopiNLLLoss
 - diopiNLLLossBackward
 - diopiNLLLossV2

From f713c69ce7fdbd61874f358e8578a95cda36c9ca Mon Sep 17 00:00:00 2001
From: zhangqiu <zhangqiu1994@outlook.com>
Date: Thu, 8 Aug 2024 17:05:38 +0800
Subject: [PATCH 2/5] dedupe index helpers via indexProcess; fix gen.py assert

---
 adaptor/codegen/gen.py              |   2 +-
 impl/ascend/aclnn/adaptor.hpp       |   2 +-
 impl/ascend/ascend_tensor.cpp       |   2 +-
 impl/ascend/ascend_tensor.hpp       |   2 +-
 impl/ascend/functions/index.cpp     |  32 ++--
 impl/ascend/functions/index_put.cpp | 265 ++--------------------------
 6 files changed, 35 insertions(+), 270 deletions(-)

diff --git a/adaptor/codegen/gen.py b/adaptor/codegen/gen.py
index af8727d8b..1f9319b12 100644
--- a/adaptor/codegen/gen.py
+++ b/adaptor/codegen/gen.py
@@ -196,7 +196,7 @@ def prepare() -> Tuple[dict, str]:
 
     impl_plugin = options.impl_plugin
     base_device = options.base_device
-    assert(base_device is None or base_device == "" or base_device == "torch", f"invalid base_device:{base_device}")
+    assert base_device is None or base_device == "" or base_device == "torch", f"invalid base_device:{base_device}"
     if base_device == "":
         base_device = None
     def create_if_not_exist(name):
diff --git a/impl/ascend/aclnn/adaptor.hpp b/impl/ascend/aclnn/adaptor.hpp
index f4881be30..f0c4ff953 100644
--- a/impl/ascend/aclnn/adaptor.hpp
+++ b/impl/ascend/aclnn/adaptor.hpp
@@ -393,4 +393,4 @@ void callAclnnImpl(diopiContextHandle_t ctx, const std::tuple<Args...>& tuple) {
         DIOPI_ASECND_CALL_ACLNN_TYPE_SYNC(api, ctx, convertedParams.params())             \
     } while (false);
 
-#endif  // IMPL_ASCEND_ACLNN_ADAPTOR_HPP_
\ No newline at end of file
+#endif  // IMPL_ASCEND_ACLNN_ADAPTOR_HPP_
diff --git a/impl/ascend/ascend_tensor.cpp b/impl/ascend/ascend_tensor.cpp
index d71d5b941..f39f87902 100644
--- a/impl/ascend/ascend_tensor.cpp
+++ b/impl/ascend/ascend_tensor.cpp
@@ -342,4 +342,4 @@ aclFormat inferAclDataFormat(int64_t dim, const int64_t* shape, const int64_t* s
     return ACL_FORMAT_ND;
 }
 }  // namespace ascend
-}  // namespace impl
\ No newline at end of file
+}  // namespace impl
diff --git a/impl/ascend/ascend_tensor.hpp b/impl/ascend/ascend_tensor.hpp
index 20b29b6f5..cf295e87b 100644
--- a/impl/ascend/ascend_tensor.hpp
+++ b/impl/ascend/ascend_tensor.hpp
@@ -266,4 +266,4 @@ class AscendTensor final {
 }  // namespace ascend
 }  // namespace impl
 
-#endif  // IMPL_ASCEND_ASCEND_TENSOR_HPP_
\ No newline at end of file
+#endif  // IMPL_ASCEND_ASCEND_TENSOR_HPP_
diff --git a/impl/ascend/functions/index.cpp b/impl/ascend/functions/index.cpp
index b9cf3c81b..ac2cd78c3 100644
--- a/impl/ascend/functions/index.cpp
+++ b/impl/ascend/functions/index.cpp
@@ -12,7 +12,8 @@
 namespace impl {
 namespace ascend {
 
-static std::vector<AscendTensor> castIntIndicesToLongIndices(diopiContextHandle_t ctx, std::vector<AscendTensor>& indices) {
+namespace indexProcess {
+std::vector<AscendTensor> castIntIndicesToLongIndices(diopiContextHandle_t ctx, std::vector<AscendTensor>& indices) {
     std::vector<AscendTensor> result;
     for (auto& t : indices) {
         if (!t.defined()) {
@@ -37,7 +38,7 @@ static std::vector<AscendTensor> castIntIndicesToLongIndices(diopiContextHandle_
     return result;
 }
 
-static void checkIndexTensorTypes(const std::vector<AscendTensor>& indices) {
+void checkIndexTensorTypes(const std::vector<AscendTensor>& indices) {
     for (const auto& t : indices) {
         if (t.defined()) {
             diopiDtype_t type = t.dtype();
@@ -47,7 +48,7 @@ static void checkIndexTensorTypes(const std::vector<AscendTensor>& indices) {
     }
 }
 
-static AscendTensor nonZeroTensor(diopiContextHandle_t ctx, const AscendTensor& self) {
+AscendTensor nonZeroTensor(diopiContextHandle_t ctx, const AscendTensor& self) {
     int64_t numELem = self.numel() * self.dim();
     std::vector<int64_t> nShape{self.numel(), self.dim()};
     std::vector<int64_t> nStride(nShape.size(), 1);
@@ -86,7 +87,7 @@ static AscendTensor nonZeroTensor(diopiContextHandle_t ctx, const AscendTensor&
     return AscendTensor(nzTrans);
 }
 
-static std::vector<AscendTensor> expandIndicesTensors(diopiContextHandle_t ctx, const AscendTensor& self, const std::vector<AscendTensor>& indices) {
+std::vector<AscendTensor> expandIndicesTensors(diopiContextHandle_t ctx, const AscendTensor& self, const std::vector<AscendTensor>& indices) {
     std::vector<AscendTensor> result;
     for (auto& t : indices) {
         if (!t.defined()) {
@@ -117,7 +118,7 @@ static std::vector<AscendTensor> expandIndicesTensors(diopiContextHandle_t ctx,
     return result;
 }
 
-static aclTensor* createEmptyAclTensor() {
+aclTensor* createEmptyAclTensor() {
     std::vector<int64_t> nShape{0};
     std::vector<int64_t> nStride{1};
     int64_t storageSize = 0;
@@ -152,7 +153,7 @@ static std::vector<AscendTensor> indicesExpandedOutplace(std::vector<AscendTenso
     return result;
 }
 
-static bool hasContiguousSubspace(std::vector<AscendTensor> indices) {  // true if all the non-null tensors are adjacent
+bool hasContiguousSubspace(std::vector<AscendTensor> indices) {  // true if all the non-null tensors are adjacent
     auto isDefined = [](const AscendTensor& tensor) { return tensor.defined(); };
     auto isNull = [](const AscendTensor& tensor) { return !tensor.defined(); };
     auto start = std::find_if(indices.begin(), indices.end(), isDefined);
@@ -161,7 +162,7 @@ static bool hasContiguousSubspace(std::vector<AscendTensor> indices) {  // true
     return it == stop.base();
 }
 
-static std::tuple<AscendTensor, std::vector<AscendTensor>> transposeToFront(AscendTensor self, std::vector<AscendTensor> indices) {
+std::tuple<AscendTensor, std::vector<AscendTensor>> transposeToFront(AscendTensor self, std::vector<AscendTensor> indices) {
     std::vector<int64_t> dims;
     std::vector<AscendTensor> transposedIndices;
 
@@ -183,7 +184,7 @@ static std::tuple<AscendTensor, std::vector<AscendTensor>> transposeToFront(Asce
     return std::make_tuple(self.permute(dims), transposedIndices);
 }
 
-static std::vector<int64_t> indexReshape(std::vector<AscendTensor> endIndices, int64_t dimsBefore, int64_t dimsAfter) {
+std::vector<int64_t> indexReshape(std::vector<AscendTensor> endIndices, int64_t dimsBefore, int64_t dimsAfter) {
     std::vector<int64_t> indexShape;
     for (auto& idx : endIndices) {
         if (idx.defined()) {
@@ -201,7 +202,7 @@ static std::vector<int64_t> indexReshape(std::vector<AscendTensor> endIndices, i
     return indexShape;
 }
 
-static std::vector<int64_t> indexOutputSize(const AscendTensor& self, std::vector<AscendTensor>& indices) {
+std::vector<int64_t> indexOutputSize(const AscendTensor& self, std::vector<AscendTensor>& indices) {
     std::vector<AscendTensor> midIndices = indicesExpandedOutplace(indices);
     while (midIndices.size() < (size_t)self.dim()) {
         midIndices.emplace_back(nullptr);
@@ -253,6 +254,7 @@ static std::vector<int64_t> indexOutputSize(const AscendTensor& self, std::vecto
 
     return outputSize;
 }
+}  // namespace indexProcess
 
 diopiError_t diopiIndex(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t* indices, int64_t nums) {
     AscendTensor inputAt(input);
@@ -263,13 +265,13 @@ diopiError_t diopiIndex(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diop
         }
     }
 
-    std::vector<AscendTensor> indicesList = castIntIndicesToLongIndices(ctx, indicesOrigin);
-    checkIndexTensorTypes(indicesList);
+    std::vector<AscendTensor> indicesList = indexProcess::castIntIndicesToLongIndices(ctx, indicesOrigin);
+    indexProcess::checkIndexTensorTypes(indicesList);
 
-    auto indicesExpanded = expandIndicesTensors(ctx, inputAt, indicesList);
+    auto indicesExpanded = indexProcess::expandIndicesTensors(ctx, inputAt, indicesList);
 
     std::vector<aclTensor*> allDefinedIndices;
-    auto emptyTensor = createEmptyAclTensor();
+    auto emptyTensor = indexProcess::createEmptyAclTensor();
     for (const auto& idx : indicesExpanded) {
         if (idx.defined()) {
             allDefinedIndices.push_back(aclnn_adaptor::createAclTensorFromAscendTensor(idx));
@@ -278,7 +280,7 @@ diopiError_t diopiIndex(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diop
         }
     }
 
-    std::vector<int64_t> outShape = indexOutputSize(inputAt, indicesExpanded);
+    std::vector<int64_t> outShape = indexProcess::indexOutputSize(inputAt, indicesExpanded);
 
     diopiSize_t outSize = vectorToDiopiSize(outShape);
     diopiRequireTensor(ctx, out, &outSize, nullptr, inputAt.dtype(), diopi_device);
@@ -317,4 +319,4 @@ diopiError_t diopiIndexBackward(diopiContextHandle_t ctx, diopiTensorHandle_t gr
 }
 
 }  // namespace ascend
-}  // namespace impl
\ No newline at end of file
+}  // namespace impl
diff --git a/impl/ascend/functions/index_put.cpp b/impl/ascend/functions/index_put.cpp
index d3d0565c9..383e133b2 100755
--- a/impl/ascend/functions/index_put.cpp
+++ b/impl/ascend/functions/index_put.cpp
@@ -10,248 +10,12 @@
 namespace impl {
 namespace ascend {
 
-static std::vector<AscendTensor> castIntIndicesToLongIndices(diopiContextHandle_t ctx, std::vector<AscendTensor>& indices) {
-    std::vector<AscendTensor> result;
-    for (auto& t : indices) {
-        if (!t.defined()) {
-            result.emplace_back(nullptr);
-            continue;
-        }
-        if (t.dtype() == diopi_dtype_int32) {
-            diopiTensorHandle_t indexHandle = nullptr;
-            auto shape = t.shape();
-            diopiSize_t size = vectorToDiopiSize(shape);
-            diopiRequireTensor(ctx, &indexHandle, &size, nullptr, diopi_dtype_int64, diopi_device);
-            DIOPI_ASCEND_CALL_ACLNN(aclnnCast, ctx, t, diopi_dtype_int64, indexHandle);
-            result.emplace_back(indexHandle);
-        } else {
-            if (t.device() == diopi_host) {
-                result.emplace_back(hostToDevice(ctx, t.tensorHandle()));
-            } else {
-                result.emplace_back(t);
-            }
-        }
-    }
-    return result;
-}
-
-static void checkIndexTensorTypes(const std::vector<AscendTensor>& indices) {
-    for (const auto& t : indices) {
-        if (t.defined()) {
-            diopiDtype_t type = t.dtype();
-            ASCEND_CHECK_ABORT(type == diopi_dtype_int64 || type == diopi_dtype_bool || type == diopi_dtype_uint8,
-                               "tensors used as indices must be long, byte or bool tensors");
-        }
-    }
-}
-
-static AscendTensor nonZeroTensor(diopiContextHandle_t ctx, const AscendTensor& self) {
-    int64_t numELem = self.numel() * self.dim();
-    std::vector<int64_t> nShape{self.numel(), self.dim()};
-    std::vector<int64_t> nStride(nShape.size(), 1);
-    for (int64_t i = nShape.size() - 2; i >= 0; i--) {
-        nStride[i] = nStride[i + 1] * nShape[i + 1];
-    }
-
-    diopiTensorHandle_t nzBuff = nullptr;
-    diopiSize_t nzBuffSize = vectorToDiopiSize(nShape);
-    diopiRequireTensor(ctx, &nzBuff, &nzBuffSize, nullptr, diopi_dtype_int64, diopi_device);
-    AscendTensor nzTensor(nzBuff);
-
-    auto aclNZTensor = ::aclCreateTensor(
-        nShape.data(), nShape.size(), aclDataType::ACL_INT64, nStride.data(), 0, aclFormat::ACL_FORMAT_ND, &numELem, 1, const_cast<void*>(nzTensor.data()));
-    DIOPI_ASCEND_CALL_ACLNN(aclnnNonzero, ctx, self, aclNZTensor);
-
-    int64_t* vDims = nullptr;
-    uint64_t vDimsNum = 0;
-    auto ret = aclGetViewShape(aclNZTensor, &vDims, &vDimsNum);
-    ASCEND_CHECK_ABORT(ret == 0, "NonZero aclGetViewShape failed.");
-
-    std::vector<int64_t> nzShape(vDims, vDims + vDimsNum);
-    nzTensor = nzTensor.resize(nzShape);
-
-    delete vDims;
-    vDims = nullptr;
-
-    diopiTensorHandle_t nzTrans = nullptr;
-    std::vector<int64_t> nzTransShape{nzShape[1], nzShape[0]};
-    diopiSize_t nzTransSize = vectorToDiopiSize(nzTransShape);
-    diopiRequireTensor(ctx, &nzTrans, &nzTransSize, nullptr, diopi_dtype_int64, diopi_device);
-    std::vector<int64_t> transDims{1, 0};
-    diopiSize_t permuteDims = vectorToDiopiSize(transDims);
-    DIOPI_ASCEND_CALL_ACLNN(aclnnPermute, ctx, nzTensor, permuteDims, nzTrans);
-
-    return AscendTensor(nzTrans);
-}
-
-static std::vector<AscendTensor> expandIndicesTensors(diopiContextHandle_t ctx, const AscendTensor& self, const std::vector<AscendTensor>& indices) {
-    std::vector<AscendTensor> result;
-    for (auto& t : indices) {
-        if (!t.defined()) {
-            result.push_back(t);
-        } else {
-            if (t.dtype() == diopi_dtype_uint8 || t.dtype() == diopi_dtype_bool) {
-                ASCEND_CHECK(t.dtype() == diopi_dtype_uint8,
-                             "indexing with dtype torch.uint8 is now deprecated,"
-                             " please use a dtype torch.bool instead.");
-                for (uint64_t j = 0; j < static_cast<uint64_t>(t.dim()); j++) {
-                    uint64_t srcIdx = result.size() + j;
-                    ASCEND_CHECK_ABORT(t.shape(j) == self.shape(srcIdx),
-                                       "The shape of the mask  %ld at index  %ld does not match the shape of the indexed tensor %ld at index %ld",
-                                       t.dim(),
-                                       j,
-                                       self.dim(),
-                                       srcIdx);
-                }
-                AscendTensor non = nonZeroTensor(ctx, t);
-                for (int64_t j = 0; j < t.dim(); j++) {
-                    result.push_back(non.select(0, j));
-                }
-            } else {
-                result.push_back(t);
-            }
-        }
-    }
-    return result;
-}
-
-
-static aclTensor* createEmptyAclTensor() {
-    std::vector<int64_t> nShape{0};
-    std::vector<int64_t> nStride{1};
-    int64_t storageSize = 0;
-    void* storage = nullptr;
-
-    return ::aclCreateTensor(nShape.data(), nShape.size(), aclDataType::ACL_FLOAT16, nStride.data(), 0, aclFormat::ACL_FORMAT_ND, &storageSize, 0, storage);
-}
-
-static std::vector<AscendTensor> indicesExpandedOutplace(std::vector<AscendTensor> indices) {
-    bool first = true;
-    std::vector<int64_t> sizes;
-
-    for (auto& idx : indices) {
-        if (!idx.defined()) {
-            continue;
-        } else if (first) {
-            sizes = idx.shape();
-            first = false;
-        } else {
-            sizes = inferSize(sizes, idx.shape());
-        }
-    }
-
-    std::vector<AscendTensor> result;
-    for (auto& idx : indices) {
-        if (!idx.defined() || (idx.shape() == sizes)) {
-            result.push_back(idx);
-        } else {
-            result.push_back(idx.expand(sizes));
-        }
-    }
-    return result;
-}
-
-static bool hasContiguousSubspace(std::vector<AscendTensor> indices) {  // true if all the non-null tensors are adjacent
-    auto isDefined = [](const AscendTensor& tensor) { return tensor.defined(); };
-    auto isNull = [](const AscendTensor& tensor) { return !tensor.defined(); };
-    auto start = std::find_if(indices.begin(), indices.end(), isDefined);
-    auto stop = std::find_if(indices.rbegin(), indices.rend(), isDefined);
-    auto it = std::find_if(start, stop.base(), isNull);
-    return it == stop.base();
-}
-
-static std::tuple<AscendTensor, std::vector<AscendTensor>> transposeToFront(AscendTensor self, std::vector<AscendTensor> indices) {
-    std::vector<int64_t> dims;
-    std::vector<AscendTensor> transposedIndices;
-
-    dims.reserve(self.dim());
-    for (int64_t i = 0; i < self.dim(); i++) {
-        if (indices[i].defined()) {
-            dims.push_back(i);
-            transposedIndices.push_back(indices[i]);
-        }
-    }
-
-    for (int64_t i = 0; i < self.dim(); i++) {
-        if (!indices[i].defined()) {
-            dims.push_back(i);
-            transposedIndices.push_back(indices[i]);
-        }
-    }
-
-    return std::make_tuple(self.permute(dims), transposedIndices);
-}
-
-static std::vector<int64_t> indexReshape(std::vector<AscendTensor> endIndices, int64_t dimsBefore, int64_t dimsAfter) {
-    std::vector<int64_t> indexShape;
-    for (auto& idx : endIndices) {
-        if (idx.defined()) {
-            std::vector<int64_t> shape;
-            shape.insert(shape.end(), dimsBefore, 1);
-            shape.insert(shape.end(), idx.shape().begin(), idx.shape().end());
-            shape.insert(shape.end(), dimsAfter, 1);
-            if (indexShape.empty()) {
-                indexShape = shape;
-            } else {
-                indexShape = inferSize(indexShape, shape);
-            }
-        }
-    }
-    return indexShape;
-}
-
-static std::vector<int64_t> indexOutputSize(const AscendTensor& self, std::vector<AscendTensor>& indices) {
-    std::vector<AscendTensor> midIndices = indicesExpandedOutplace(indices);
-    while (midIndices.size() < (size_t)self.dim()) {
-        midIndices.emplace_back(nullptr);
-    }
-
-    AscendTensor src = self;
-    std::vector<AscendTensor> endIndices = midIndices;
-    if (!hasContiguousSubspace(midIndices)) {
-        endIndices.clear();
-        std::tie(src, endIndices) = transposeToFront(self, midIndices);
-    }
-
-    int64_t dimsBefore = 0;
-    int64_t dimsAfter = 0;
-    int64_t dimsIndexed = 0;
-
-    std::vector<int64_t> replaceShape;
-    std::vector<int64_t> indexedSizes;
-
-    for (size_t dim = 0; dim < endIndices.size(); dim++) {
-        if (!endIndices[dim].defined()) {
-            if (dimsIndexed == 0) {
-                dimsBefore++;
-            } else {
-                dimsAfter++;
-            }
-        } else {
-            dimsIndexed++;
-            replaceShape = endIndices[dim].shape();
-            indexedSizes.push_back(src.shape(dim));
-        }
-    }
-
-    if (std::find(indexedSizes.begin(), indexedSizes.end(), 0) != indexedSizes.end() &&
-        std::find(replaceShape.begin(), replaceShape.end(), 0) == replaceShape.end()) {
-        ASCEND_CHECK_ABORT(false, "index is out of bounds for dimension with size 0");
-    }
-
-    auto selfShape = src.shape();
-    int64_t end = dimsBefore + dimsIndexed;
-    selfShape.erase(selfShape.begin() + dimsBefore, selfShape.begin() + end);
-    selfShape.insert(selfShape.begin() + dimsBefore, replaceShape.begin(), replaceShape.end());
-
-    std::vector<int64_t> indexShape = indexReshape(endIndices, dimsBefore, dimsAfter);
-    std::vector<int64_t> outputSize = indexShape;
-    if (indexShape != selfShape) {
-        outputSize = inferSize(indexShape, selfShape);
-    }
-
-    return outputSize;
-}
+namespace indexProcess {  // shared index helpers; definitions live in functions/index.cpp
+extern std::vector<AscendTensor> castIntIndicesToLongIndices(diopiContextHandle_t ctx, std::vector<AscendTensor>& indices);
+extern void checkIndexTensorTypes(const std::vector<AscendTensor>& indices);
+extern std::vector<AscendTensor> expandIndicesTensors(diopiContextHandle_t ctx, const AscendTensor& self, const std::vector<AscendTensor>& indices);
+extern aclTensor* createEmptyAclTensor();
+}  // namespace indexProcess
 
 diopiError_t diopiIndexPut(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiConstTensorHandle_t values,
                            diopiConstTensorHandle_t* indices, int64_t indicesCounts, bool accumulate) {
@@ -263,11 +27,11 @@ diopiError_t diopiIndexPut(diopiContextHandle_t ctx, diopiTensorHandle_t out, di
             indicesOrigin[i] = AscendTensor(indices[i]);
         }
     }
-    std::vector<AscendTensor> indicesList = castIntIndicesToLongIndices(ctx, indicesOrigin);
-    checkIndexTensorTypes(indicesList);
-    auto indicesExpanded = expandIndicesTensors(ctx, inputAt, indicesList);
+    std::vector<AscendTensor> indicesList = indexProcess::castIntIndicesToLongIndices(ctx, indicesOrigin);
+    indexProcess::checkIndexTensorTypes(indicesList);
+    auto indicesExpanded = indexProcess::expandIndicesTensors(ctx, inputAt, indicesList);
     std::vector<aclTensor*> allDefinedIndices;
-    auto emptyTensor = createEmptyAclTensor();
+    auto emptyTensor = indexProcess::createEmptyAclTensor();
     for (const auto& idx : indicesExpanded) {
         if (idx.defined()) {
             allDefinedIndices.push_back(aclnn_adaptor::createAclTensorFromAscendTensor(idx));
@@ -278,7 +42,6 @@ diopiError_t diopiIndexPut(diopiContextHandle_t ctx, diopiTensorHandle_t out, di
 
     DIOPI_ASCEND_CALL_ACLNN(aclnnIndexPutImpl, ctx, out, allDefinedIndices, values, accumulate, false);
     return diopiSuccess;
-
 }
 
 diopiError_t diopiIndexPutInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t values, diopiConstTensorHandle_t* indices,
@@ -290,11 +53,11 @@ diopiError_t diopiIndexPutInp(diopiContextHandle_t ctx, diopiTensorHandle_t inpu
             indicesOrigin[i] = AscendTensor(indices[i]);
         }
     }
-    std::vector<AscendTensor> indicesList = castIntIndicesToLongIndices(ctx, indicesOrigin);
-    checkIndexTensorTypes(indicesList);
-    auto indicesExpanded = expandIndicesTensors(ctx, inputAt, indicesList);
+    std::vector<AscendTensor> indicesList = indexProcess::castIntIndicesToLongIndices(ctx, indicesOrigin);
+    indexProcess::checkIndexTensorTypes(indicesList);
+    auto indicesExpanded = indexProcess::expandIndicesTensors(ctx, inputAt, indicesList);
     std::vector<aclTensor*> allDefinedIndices;
-    auto emptyTensor = createEmptyAclTensor();
+    auto emptyTensor = indexProcess::createEmptyAclTensor();
     for (const auto& idx : indicesExpanded) {
         if (idx.defined()) {
             allDefinedIndices.push_back(aclnn_adaptor::createAclTensorFromAscendTensor(idx));

From 903378a696b30fc975b5e065fcea0d94e42bd53d Mon Sep 17 00:00:00 2001
From: zhangqiu <zhangqiu1994@outlook.com>
Date: Thu, 8 Aug 2024 20:48:58 +0800
Subject: [PATCH 3/5] fix device_config

---
 impl/ascend/device_configs.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/impl/ascend/device_configs.py b/impl/ascend/device_configs.py
index 9df743ff6..7858623c5 100755
--- a/impl/ascend/device_configs.py
+++ b/impl/ascend/device_configs.py
@@ -901,6 +901,7 @@
 
     'index_put_acc_bool_indices_zeros': dict( # llm used
         name=['index_put'],
+        skip_all=True,
         para=dict(
             accumulate=[Skip(False),],
         ),
@@ -911,15 +912,26 @@
         para=dict(
             accumulate=[Skip(False),],
         ),
+        tensor_para=dict(
+            args=[
+                {
+                    "ins": ['input'],
+                    "shape": [Skip((16, 4, 4)),],
+                },
+            ]
+        ),
     ),
 
     'index_put_bool_indices_value': dict( # llm used
         name=['index_put'],
+        para=dict(
+            accumulate=[Skip(False),],
+        ),
         tensor_para=dict(
             args=[
                 {
                     "ins": ['input'],
-                    "shape": [Skip((3, 2, 2, 20)),],
+                    "shape": [Skip((3, 2, 2, 20)), Skip((4, 2, 2, 6, 2))],
                 },
             ]
         ),
@@ -1177,11 +1189,6 @@
         skip_all=True
     ),
     
-    'index_put_acc_bool_indices_zeros': dict(
-        name=['index_put'],
-        skip_all=True
-    ),
-    
     # TODO(zhangqiu) Due to a bug in the software stack, this test will be skipped for now.
     'embedding': dict(
         name=['embedding'],

From d1ea96b6bf5953989a5935c0f305e43d35231bd6 Mon Sep 17 00:00:00 2001
From: zhangqiu <zhangqiu1994@outlook.com>
Date: Fri, 9 Aug 2024 15:04:35 +0800
Subject: [PATCH 4/5] fix check for uint8

---
 impl/ascend/functions/index.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/impl/ascend/functions/index.cpp b/impl/ascend/functions/index.cpp
index ac2cd78c3..708ae4a0d 100644
--- a/impl/ascend/functions/index.cpp
+++ b/impl/ascend/functions/index.cpp
@@ -94,7 +94,7 @@ std::vector<AscendTensor> expandIndicesTensors(diopiContextHandle_t ctx, const A
             result.push_back(t);
         } else {
             if (t.dtype() == diopi_dtype_uint8 || t.dtype() == diopi_dtype_bool) {
-                ASCEND_CHECK(t.dtype() == diopi_dtype_uint8,
+                ASCEND_CHECK(t.dtype() == diopi_dtype_bool,
                              "indexing with dtype torch.uint8 is now deprecated,"
                              " please use a dtype torch.bool instead.");
                 for (uint64_t j = 0; j < static_cast<uint64_t>(t.dim()); j++) {
@@ -281,7 +281,6 @@ diopiError_t diopiIndex(diopiContextHandle_t ctx, diopiTensorHandle_t* out, diop
     }
 
     std::vector<int64_t> outShape = indexProcess::indexOutputSize(inputAt, indicesExpanded);
-
     diopiSize_t outSize = vectorToDiopiSize(outShape);
     diopiRequireTensor(ctx, out, &outSize, nullptr, inputAt.dtype(), diopi_device);
 

From 288dd2d80c363dd7ad698c97a3e54afbee58d9dd Mon Sep 17 00:00:00 2001
From: zhangqiu <zhangqiu1994@outlook.com>
Date: Fri, 9 Aug 2024 16:49:36 +0800
Subject: [PATCH 5/5] index_put: return early on empty input or values

---
 impl/ascend/device_configs.py       | 1 -
 impl/ascend/functions/index_put.cpp | 8 ++++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/impl/ascend/device_configs.py b/impl/ascend/device_configs.py
index 7858623c5..c4cc9b31f 100755
--- a/impl/ascend/device_configs.py
+++ b/impl/ascend/device_configs.py
@@ -901,7 +901,6 @@
 
     'index_put_acc_bool_indices_zeros': dict( # llm used
         name=['index_put'],
-        skip_all=True,
         para=dict(
             accumulate=[Skip(False),],
         ),
diff --git a/impl/ascend/functions/index_put.cpp b/impl/ascend/functions/index_put.cpp
index 383e133b2..1354d8d0e 100755
--- a/impl/ascend/functions/index_put.cpp
+++ b/impl/ascend/functions/index_put.cpp
@@ -21,6 +21,10 @@ diopiError_t diopiIndexPut(diopiContextHandle_t ctx, diopiTensorHandle_t out, di
                            diopiConstTensorHandle_t* indices, int64_t indicesCounts, bool accumulate) {
     diopiCopyInp(ctx, input, out);
     AscendTensor inputAt(input);
+    AscendTensor valuesAt(values);
+    if (inputAt.numel() == 0 || valuesAt.numel() == 0) {
+        return diopiSuccess;
+    }
     std::vector<AscendTensor> indicesOrigin(indicesCounts);
     for (int64_t i = 0; i < indicesCounts; i++) {
         if (indices[i] != nullptr) {
@@ -47,6 +51,10 @@ diopiError_t diopiIndexPut(diopiContextHandle_t ctx, diopiTensorHandle_t out, di
 diopiError_t diopiIndexPutInp(diopiContextHandle_t ctx, diopiTensorHandle_t input, diopiConstTensorHandle_t values, diopiConstTensorHandle_t* indices,
                               int64_t indicesCounts, bool accumulate) {
     AscendTensor inputAt(input);
+    AscendTensor valuesAt(values);
+    if (inputAt.numel() == 0 || valuesAt.numel() == 0) {
+        return diopiSuccess;
+    }
     std::vector<AscendTensor> indicesOrigin(indicesCounts);
     for (int64_t i = 0; i < indicesCounts; i++) {
         if (indices[i] != nullptr) {