Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions lib/kernels/include/kernels/accessor.h
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,12 @@ std::vector<real_type_t<DT> const *>
return out;
}

bool accessor_data_is_equal(GenericTensorAccessorR const &accessor_a,
GenericTensorAccessorR const &accessor_b);

bool accessors_are_equal(GenericTensorAccessorR const &accessor_a,
GenericTensorAccessorR const &accessor_b);

GenericTensorAccessorR read_only_accessor_from_write_accessor(
GenericTensorAccessorW const &write_accessor);

Expand All @@ -280,9 +286,6 @@ bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor,
std::pair<ArrayShape, DataType>
get_shape_and_datatype(GenericTensorAccessorR const &accessor);

void copy_accessor_data_to_l_from_r(GenericTensorAccessorW &dst_accessor,
GenericTensorAccessorR const &src_accessor);

} // namespace FlexFlow

namespace FlexFlow {
Expand Down
11 changes: 11 additions & 0 deletions lib/kernels/include/kernels/copy_tensor_accessor.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@

namespace FlexFlow {

void copy_accessor_data_to_l_from_r(GenericTensorAccessorW &dst_accessor,
GenericTensorAccessorR const &src_accessor);

GenericTensorAccessorR
copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor,
Allocator &allocator);
Expand All @@ -14,6 +17,14 @@ GenericTensorAccessorW
copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor,
Allocator &allocator);

GenericTensorAccessorW
copy_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &accessor,
Allocator &allocator);

GenericTensorAccessorR
copy_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &accessor,
Allocator &allocator);

} // namespace FlexFlow

#endif
73 changes: 40 additions & 33 deletions lib/kernels/src/accessor.cc
Original file line number Diff line number Diff line change
@@ -1,38 +1,45 @@
#include "kernels/accessor.h"
#include "kernels/allocation.h"
#include "kernels/copy_tensor_accessor.h"
#include "kernels/datatype_dispatch.h"
#include "kernels/local_cpu_allocator.h"
#include <cstring>
#include <iostream>

namespace FlexFlow {

void copy_accessor_data_to_l_from_r(
GenericTensorAccessorW &dst_accessor,
GenericTensorAccessorR const &src_accessor) {
size_t num_bytes =
dst_accessor.shape.get_volume().unwrap_nonnegative() *
size_of_datatype(dst_accessor.data_type).unwrap_nonnegative();

DeviceType dst_device_type = dst_accessor.device_type;
DeviceType src_device_type = src_accessor.device_type;

if (src_device_type == DeviceType::CPU &&
dst_device_type == DeviceType::CPU) {
memcpy(dst_accessor.ptr, src_accessor.ptr, num_bytes);
} else if (src_device_type == DeviceType::CPU &&
dst_device_type == DeviceType::GPU) {
checkCUDA(cudaMemcpy(
dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyHostToDevice));
} else if (src_device_type == DeviceType::GPU &&
dst_device_type == DeviceType::CPU) {
checkCUDA(cudaMemcpy(
dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyDeviceToHost));
} else {
assert(src_device_type == DeviceType::GPU);
assert(dst_device_type == DeviceType::GPU);
checkCUDA(cudaMemcpy(dst_accessor.ptr,
src_accessor.ptr,
num_bytes,
cudaMemcpyDeviceToDevice));
template <DataType DT>
struct AccessorDataIsEqual {
  // Element-wise comparison of two read-only accessors, dispatched on DT.
  // Both tensors are staged to host memory first, so GPU-resident data can
  // be compared too. NOTE(review): elements are compared with operator==,
  // so NaN values never compare equal — confirm this is intended for
  // floating-point tensors.
  bool operator()(GenericTensorAccessorR const &a,
                  GenericTensorAccessorR const &b) {
    int const element_count = a.shape.num_elements().unwrap_nonnegative();
    if (b.shape.num_elements().unwrap_nonnegative() != element_count) {
      return false;
    }

    // Stage both tensors on the CPU (no-op when already host-resident).
    Allocator host_allocator = create_local_cpu_memory_allocator();
    GenericTensorAccessorR host_a =
        copy_accessor_r_to_cpu_if_necessary(a, host_allocator);
    GenericTensorAccessorR host_b =
        copy_accessor_r_to_cpu_if_necessary(b, host_allocator);

    using T = real_type_t<DT>;
    T const *lhs = host_a.get<DT>();
    T const *rhs = host_b.get<DT>();
    for (int i = 0; i < element_count; i++) {
      if (!(lhs[i] == rhs[i])) {
        return false;
      }
    }
    return true;
  }
};

/// Returns true iff the two accessors hold element-wise identical data.
///
/// Guards against dispatching on accessor_a's dtype while reinterpreting
/// accessor_b's buffer as a different element type: accessors with
/// mismatched dtypes are never considered data-equal.
bool accessor_data_is_equal(GenericTensorAccessorR const &accessor_a,
                            GenericTensorAccessorR const &accessor_b) {
  if (accessor_a.data_type != accessor_b.data_type) {
    return false;
  }
  return DataTypeDispatch1<AccessorDataIsEqual>{}(
      accessor_a.data_type, accessor_a, accessor_b);
}

/// Full equality for accessors: dtype, device, and shape must match, and
/// the element data must compare equal. The data comparison is the most
/// expensive step (it may copy device tensors to host), so it runs last.
bool accessors_are_equal(GenericTensorAccessorR const &accessor_a,
                         GenericTensorAccessorR const &accessor_b) {
  if (accessor_a.data_type != accessor_b.data_type) {
    return false;
  }
  if (accessor_a.device_type != accessor_b.device_type) {
    return false;
  }
  if (!(accessor_a.shape == accessor_b.shape)) {
    return false;
  }
  return accessor_data_is_equal(accessor_a, accessor_b);
}

GenericTensorAccessorW::operator GenericTensorAccessorR() const {
Expand All @@ -56,12 +63,12 @@ std::tuple<DataType const &,

// Deep equality: compares metadata and element data via accessors_are_equal
// (uses the implicit conversion to GenericTensorAccessorR; may copy
// device-resident tensors to host).
bool GenericTensorAccessorW::operator==(
    GenericTensorAccessorW const &other) const {
  return accessors_are_equal(*this, other);
}

// Logical negation of deep equality (see operator==).
bool GenericTensorAccessorW::operator!=(
    GenericTensorAccessorW const &other) const {
  return !accessors_are_equal(*this, other);
}

int32_t *GenericTensorAccessorW::get_int32_ptr() const {
Expand Down Expand Up @@ -112,12 +119,12 @@ std::tuple<DataType const &,

// Deep equality: compares metadata and element data via accessors_are_equal
// (may copy device-resident tensors to host).
bool GenericTensorAccessorR::operator==(
    GenericTensorAccessorR const &other) const {
  return accessors_are_equal(*this, other);
}

// Logical negation of deep equality (see operator==).
bool GenericTensorAccessorR::operator!=(
    GenericTensorAccessorR const &other) const {
  return !accessors_are_equal(*this, other);
}

int32_t const *GenericTensorAccessorR::get_int32_ptr() const {
Expand Down
38 changes: 33 additions & 5 deletions lib/kernels/src/array_shape.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,18 +51,40 @@ nonnegative_int ArrayShape::at(ff_dim_t idx) const {
return dims.at(legion_dim_from_ff_dim(idx, this->num_dims()));
}

// Returns the legion-ordered index of the final dimension.
// Throws when the shape has no dimensions at all.
legion_dim_t ArrayShape::last_idx() const {
  auto const num_dims = this->dims.size();
  if (num_dims == 0) {
    throw mk_runtime_error("Cannot get last index of an empty shape");
  }
  return legion_dim_t(nonnegative_int{num_dims - 1});
}

// Converts a negative (python-style) index into a legion_dim_t,
// e.g. idx == -1 refers to the last dimension.
// Throws for non-negative idx and for magnitudes exceeding the rank.
legion_dim_t ArrayShape::neg_idx(int idx) const {
  // Check the sign first so a non-negative idx gets the correct diagnostic;
  // previously an out-of-range positive idx was misreported as an
  // "Invalid negative index".
  if (idx >= 0) {
    throw mk_runtime_error(fmt::format(
        "Idx should be negative for negative indexing, got {}", idx));
  }

  if (std::abs(idx) > this->dims.size()) {
    throw mk_runtime_error(
        fmt::format("Invalid negative index: {} (shape has {} dimensions)",
                    idx,
                    this->dims.size()));
  }

  return legion_dim_t(nonnegative_int{this->dims.size() + idx});
}

// Two shapes are equal iff their dimension lists are identical.
bool ArrayShape::operator==(ArrayShape const &other) const {
  return this->dims == other.dims;
}

// Logical negation of operator== (dimension-list equality).
bool ArrayShape::operator!=(ArrayShape const &other) const {
  return !(this->dims == other.dims);
}

ArrayShape ArrayShape::sub_shape(
std::optional<std::variant<ff_dim_t, legion_dim_t>> start,
std::optional<std::variant<ff_dim_t, legion_dim_t>> end) const {

nonnegative_int num_dims = this->num_dims();

auto to_legion_index = [num_dims](auto arg) -> nonnegative_int {
Expand All @@ -85,7 +107,9 @@ ArrayShape ArrayShape::sub_shape(
"Invalid sub_shape range: start={}, end={}", start_idx, end_idx));
}

return ArrayShape(&this->dims[legion_dim_t{start_idx}], end_idx - start_idx);
return ArrayShape(std::vector<nonnegative_int>(
this->dims.begin() + start_idx.unwrap_nonnegative(),
this->dims.begin() + end_idx.unwrap_nonnegative()));
}

std::optional<nonnegative_int> ArrayShape::at_maybe(legion_dim_t index) const {
Expand All @@ -97,7 +121,11 @@ std::optional<nonnegative_int> ArrayShape::at_maybe(legion_dim_t index) const {
}

// Bounds-checked lookup by ff-ordered dimension index: returns
// std::nullopt when `index` is out of range instead of forwarding an
// invalid index to legion_dim_from_ff_dim.
std::optional<nonnegative_int> ArrayShape::at_maybe(ff_dim_t index) const {
  if (index.value < this->num_dims()) {
    return this->at_maybe(legion_dim_from_ff_dim(index, this->num_dims()));
  } else {
    return std::nullopt;
  }
}

std::tuple<LegionOrdered<nonnegative_int> const &> ArrayShape::tie() const {
Expand Down
59 changes: 59 additions & 0 deletions lib/kernels/src/copy_tensor_accessor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,37 @@

namespace FlexFlow {

// Copies the raw element data of src_accessor into dst_accessor, picking
// the memcpy / cudaMemcpy direction from the two accessors' device types.
// Throws when the two buffers' total byte sizes disagree, which would
// otherwise cause an out-of-bounds read or write.
void copy_accessor_data_to_l_from_r(
    GenericTensorAccessorW &dst_accessor,
    GenericTensorAccessorR const &src_accessor) {
  size_t dst_bytes =
      dst_accessor.shape.get_volume().unwrap_nonnegative() *
      size_of_datatype(dst_accessor.data_type).unwrap_nonnegative();
  size_t src_bytes =
      src_accessor.shape.get_volume().unwrap_nonnegative() *
      size_of_datatype(src_accessor.data_type).unwrap_nonnegative();

  // Previously only the destination size was used, so a smaller source
  // buffer would have been read past its end.
  if (dst_bytes != src_bytes) {
    throw mk_runtime_error(
        "copy_accessor_data_to_l_from_r: src and dst byte sizes differ");
  }
  size_t num_bytes = dst_bytes;

  DeviceType dst_device_type = dst_accessor.device_type;
  DeviceType src_device_type = src_accessor.device_type;

  if (src_device_type == DeviceType::CPU &&
      dst_device_type == DeviceType::CPU) {
    memcpy(dst_accessor.ptr, src_accessor.ptr, num_bytes);
  } else if (src_device_type == DeviceType::CPU &&
             dst_device_type == DeviceType::GPU) {
    checkCUDA(cudaMemcpy(
        dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyHostToDevice));
  } else if (src_device_type == DeviceType::GPU &&
             dst_device_type == DeviceType::CPU) {
    checkCUDA(cudaMemcpy(
        dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyDeviceToHost));
  } else {
    assert(src_device_type == DeviceType::GPU);
    assert(dst_device_type == DeviceType::GPU);
    checkCUDA(cudaMemcpy(dst_accessor.ptr,
                         src_accessor.ptr,
                         num_bytes,
                         cudaMemcpyDeviceToDevice));
  }
}

template <DataType DT>
struct CopyTensorAccessorW {
GenericTensorAccessorW operator()(GenericTensorAccessorW const &src_accessor,
Expand Down Expand Up @@ -45,4 +76,32 @@ GenericTensorAccessorR
src_accessor.data_type, src_accessor, allocator);
}

// Returns `accessor` unchanged when it already lives on the CPU; otherwise
// returns a CPU copy allocated from `cpu_allocator`.
// NOTE(review): a returned copy presumably stays valid only while
// `cpu_allocator` lives — confirm Allocator ownership semantics.
GenericTensorAccessorR
    copy_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &accessor,
                                        Allocator &cpu_allocator) {
  if (cpu_allocator.get_allocation_device_type() == DeviceType::GPU) {
    throw mk_runtime_error("Allocator must be a CPU allocator");
  }

  if (accessor.device_type != DeviceType::GPU) {
    return accessor;
  }
  return copy_tensor_accessor_r(accessor, cpu_allocator);
}

// Returns `accessor` unchanged when it already lives on the CPU; otherwise
// returns a CPU copy allocated from `cpu_allocator`.
// NOTE(review): a returned copy presumably stays valid only while
// `cpu_allocator` lives — confirm Allocator ownership semantics.
GenericTensorAccessorW
    copy_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &accessor,
                                        Allocator &cpu_allocator) {
  if (cpu_allocator.get_allocation_device_type() == DeviceType::GPU) {
    throw mk_runtime_error("Allocator must be a CPU allocator");
  }

  if (accessor.device_type != DeviceType::GPU) {
    return accessor;
  }
  return copy_tensor_accessor_w(accessor, cpu_allocator);
}

} // namespace FlexFlow
1 change: 0 additions & 1 deletion lib/kernels/src/legion_dim.cc
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value) {
// Maps an ff-ordered dimension index to its legion-ordered counterpart:
// legion dims index the same dimension list from the opposite end
// (legion = num_dimensions - ff - 1). Removed a stray empty statement.
legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim,
                                    nonnegative_int num_dimensions) {
  return legion_dim_t{num_dimensions - ff_dim.value - 1_n};
}

} // namespace FlexFlow
5 changes: 3 additions & 2 deletions lib/kernels/src/perf_metrics.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@ PerfMetrics::PerfMetrics(int _train_all,
double _start_time_micro,
double _current_time_micro)
: train_all(_train_all), train_correct(_train_correct), cce_loss(_cce_loss),
mse_loss(_mse_loss), rmse_loss(_rmse_loss), mae_loss(_mae_loss),
start_time(_start_time_micro), current_time(_current_time_micro) {}
sparse_cce_loss(_sparse_cce_loss), mse_loss(_mse_loss),
rmse_loss(_rmse_loss), mae_loss(_mae_loss), start_time(_start_time_micro),
current_time(_current_time_micro) {}

float get_throughput(PerfMetrics const &m) {
return m.train_all / (m.current_time - m.start_time);
Expand Down
Loading
Loading