Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ void single_task(queue Q, const KernelType &KernelObj,
!(detail::KernelLambdaHasKernelHandlerArgT<KernelType,
void>::value)) {
detail::submit_kernel_direct_single_task<KernelName>(
std::move(Q), empty_properties_t{}, KernelObj, CodeLoc);
std::move(Q), empty_properties_t{}, KernelObj, {}, CodeLoc);
} else {
submit(
std::move(Q),
Expand Down Expand Up @@ -281,7 +281,7 @@ void nd_launch(queue Q, nd_range<Dimensions> Range, const KernelType &KernelObj,
!(detail::KernelLambdaHasKernelHandlerArgT<
KernelType, sycl::nd_item<Dimensions>>::value)) {
detail::submit_kernel_direct_parallel_for<KernelName>(
std::move(Q), empty_properties_t{}, Range, KernelObj);
std::move(Q), empty_properties_t{}, Range, KernelObj, {});
} else {
submit(std::move(Q), [&](handler &CGH) {
nd_launch<KernelName>(CGH, Range, KernelObj,
Expand Down
8 changes: 4 additions & 4 deletions sycl/include/sycl/khr/free_function_commands.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ void launch_grouped(const queue &q, range<1> r, range<1> size, KernelType &&k,
KernelType, sycl::nd_item<1>>::value)) {
detail::submit_kernel_direct_parallel_for(
q, ext::oneapi::experimental::empty_properties_t{},
nd_range<1>(r, size), std::forward<KernelType>(k));
nd_range<1>(r, size), std::forward<KernelType>(k), {});
} else {
submit(
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
Expand All @@ -187,7 +187,7 @@ void launch_grouped(const queue &q, range<2> r, range<2> size, KernelType &&k,
KernelType, sycl::nd_item<2>>::value)) {
detail::submit_kernel_direct_parallel_for(
q, ext::oneapi::experimental::empty_properties_t{},
nd_range<2>(r, size), std::forward<KernelType>(k));
nd_range<2>(r, size), std::forward<KernelType>(k), {});
} else {
submit(
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
Expand All @@ -208,7 +208,7 @@ void launch_grouped(const queue &q, range<3> r, range<3> size, KernelType &&k,
KernelType, sycl::nd_item<3>>::value)) {
detail::submit_kernel_direct_parallel_for(
q, ext::oneapi::experimental::empty_properties_t{},
nd_range<3>(r, size), std::forward<KernelType>(k));
nd_range<3>(r, size), std::forward<KernelType>(k), {});
} else {
submit(
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
Expand Down Expand Up @@ -333,7 +333,7 @@ void launch_task(const sycl::queue &q, KernelType &&k,
void>::value)) {
detail::submit_kernel_direct_single_task(
q, ext::oneapi::experimental::empty_properties_t{},
std::forward<KernelType>(k), codeLoc);
std::forward<KernelType>(k), {}, codeLoc);
} else {
submit(q, [&](handler &h) { launch_task<KernelType>(h, k); }, codeLoc);
}
Expand Down
134 changes: 99 additions & 35 deletions sycl/include/sycl/queue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#include <sycl/nd_range.hpp> // for nd_range
#include <sycl/property_list.hpp> // for property_list
#include <sycl/range.hpp> // for range
#include <sycl/sycl_span.hpp> // for sycl::span

#include <cstddef> // for size_t
#include <functional> // for function
Expand Down Expand Up @@ -68,14 +69,16 @@ event __SYCL_EXPORT submit_kernel_direct_with_event_impl(
const queue &Queue, const nd_range<Dims> &Range,
detail::HostKernelRefBase &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo,
const detail::code_location &CodeLoc, bool IsTopCodeLoc);
sycl::span<const event> DepEvents, const detail::code_location &CodeLoc,
bool IsTopCodeLoc);

template <int Dims>
void __SYCL_EXPORT submit_kernel_direct_without_event_impl(
const queue &Queue, const nd_range<Dims> &Range,
detail::HostKernelRefBase &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo,
const detail::code_location &CodeLoc, bool IsTopCodeLoc);
sycl::span<const event> DepEvents, const detail::code_location &CodeLoc,
bool IsTopCodeLoc);

namespace detail {
class queue_impl;
Expand Down Expand Up @@ -163,6 +166,7 @@ template <detail::WrapAs WrapAs, typename LambdaArgType,
auto submit_kernel_direct(
const queue &Queue, [[maybe_unused]] PropertiesT Props,
const nd_range<Dims> &Range, KernelTypeUniversalRef &&KernelFunc,
sycl::span<const event> DepEvents,
const detail::code_location &CodeLoc = detail::code_location::current()) {
// TODO Properties not supported yet
static_assert(
Expand Down Expand Up @@ -212,11 +216,11 @@ auto submit_kernel_direct(

if constexpr (EventNeeded) {
return submit_kernel_direct_with_event_impl(
Queue, Range, HostKernel, DeviceKernelInfoPtr,
Queue, Range, HostKernel, DeviceKernelInfoPtr, DepEvents,
TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel());
} else {
submit_kernel_direct_without_event_impl(
Queue, Range, HostKernel, DeviceKernelInfoPtr,
Queue, Range, HostKernel, DeviceKernelInfoPtr, DepEvents,
TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel());
}
}
Expand All @@ -225,7 +229,7 @@ template <typename KernelName = detail::auto_name, bool EventNeeded = false,
typename PropertiesT, typename KernelTypeUniversalRef, int Dims>
auto submit_kernel_direct_parallel_for(
const queue &Queue, PropertiesT Props, const nd_range<Dims> &Range,
KernelTypeUniversalRef &&KernelFunc,
KernelTypeUniversalRef &&KernelFunc, sycl::span<const event> DepEvents,
const detail::code_location &CodeLoc = detail::code_location::current()) {

using KernelType =
Expand All @@ -247,20 +251,21 @@ auto submit_kernel_direct_parallel_for(
KernelName, EventNeeded, PropertiesT,
KernelTypeUniversalRef, Dims>(
Queue, Props, Range, std::forward<KernelTypeUniversalRef>(KernelFunc),
CodeLoc);
DepEvents, CodeLoc);
}

template <typename KernelName = detail::auto_name, bool EventNeeded = false,
typename PropertiesT, typename KernelTypeUniversalRef>
auto submit_kernel_direct_single_task(
const queue &Queue, PropertiesT Props, KernelTypeUniversalRef &&KernelFunc,
sycl::span<const event> DepEvents,
const detail::code_location &CodeLoc = detail::code_location::current()) {

return submit_kernel_direct<detail::WrapAs::single_task, void, KernelName,
EventNeeded, PropertiesT, KernelTypeUniversalRef,
1>(
Queue, Props, nd_range<1>{1, 1},
std::forward<KernelTypeUniversalRef>(KernelFunc), CodeLoc);
std::forward<KernelTypeUniversalRef>(KernelFunc), DepEvents, CodeLoc);
}

} // namespace detail
Expand Down Expand Up @@ -2786,7 +2791,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
!(detail::KernelLambdaHasKernelHandlerArgT<KernelType, void>::value)) {
return detail::submit_kernel_direct_single_task<KernelName, true>(
*this, ext::oneapi::experimental::empty_properties_t{}, KernelFunc,
TlsCodeLocCapture.query());
{}, TlsCodeLocCapture.query());
} else {
return submit(
[&](handler &CGH) {
Expand Down Expand Up @@ -2836,13 +2841,28 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
"Use queue.submit() instead");

detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
return submit(
[&](handler &CGH) {
CGH.depends_on(DepEvent);
CGH.template single_task<KernelName, KernelType, PropertiesT>(
Properties, KernelFunc);
},
TlsCodeLocCapture.query());

// TODO The handler-less path does not support kernel
// function properties and kernel functions with the kernel_handler
// type argument yet.
if constexpr (
std::is_same_v<PropertiesT,
ext::oneapi::experimental::empty_properties_t> &&
!(ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod<
const KernelType &>::value) &&
!(detail::KernelLambdaHasKernelHandlerArgT<KernelType, void>::value)) {
return detail::submit_kernel_direct_single_task<KernelName, true>(
*this, ext::oneapi::experimental::empty_properties_t{}, KernelFunc,
sycl::span<const event>(&DepEvent, 1), TlsCodeLocCapture.query());
} else {
return submit(
[&](handler &CGH) {
CGH.depends_on(DepEvent);
CGH.template single_task<KernelName, KernelType, PropertiesT>(
Properties, KernelFunc);
},
TlsCodeLocCapture.query());
}
}

/// single_task version with a kernel represented as a lambda.
Expand Down Expand Up @@ -2887,13 +2907,25 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
"Use queue.submit() instead");

detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
return submit(
[&](handler &CGH) {
CGH.depends_on(DepEvents);
CGH.template single_task<KernelName, KernelType, PropertiesT>(
Properties, KernelFunc);
},
TlsCodeLocCapture.query());

if constexpr (
std::is_same_v<PropertiesT,
ext::oneapi::experimental::empty_properties_t> &&
!(ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod<
const KernelType &>::value) &&
!(detail::KernelLambdaHasKernelHandlerArgT<KernelType, void>::value)) {
return detail::submit_kernel_direct_single_task<KernelName, true>(
*this, ext::oneapi::experimental::empty_properties_t{}, KernelFunc,
DepEvents, TlsCodeLocCapture.query());
} else {
return submit(
[&](handler &CGH) {
CGH.depends_on(DepEvents);
CGH.template single_task<KernelName, KernelType, PropertiesT>(
Properties, KernelFunc);
},
TlsCodeLocCapture.query());
}
}

/// single_task version with a kernel represented as a lambda.
Expand Down Expand Up @@ -3355,7 +3387,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
KernelType, sycl::nd_item<Dims>>::value)) {
return detail::submit_kernel_direct_parallel_for<KernelName, true>(
*this, ext::oneapi::experimental::empty_properties_t{}, Range,
Rest..., TlsCodeLocCapture.query());
Rest..., {}, TlsCodeLocCapture.query());
} else {
return submit(
[&](handler &CGH) {
Expand Down Expand Up @@ -3407,12 +3439,28 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
parallel_for(nd_range<Dims> Range, event DepEvent, RestT &&...Rest) {
constexpr detail::code_location CodeLoc = getCodeLocation<KernelName>();
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
return submit(
[&](handler &CGH) {
CGH.depends_on(DepEvent);
CGH.template parallel_for<KernelName>(Range, Rest...);
},
TlsCodeLocCapture.query());
using KernelType = std::tuple_element_t<0, std::tuple<RestT...>>;
// TODO The handler-less path does not support reductions, kernel
// function properties and kernel functions with the kernel_handler
// type argument yet.
if constexpr (sizeof...(RestT) == 1 &&
!(ext::oneapi::experimental::detail::
HasKernelPropertiesGetMethod<
const KernelType &>::value) &&
!(detail::KernelLambdaHasKernelHandlerArgT<
KernelType, sycl::nd_item<Dims>>::value)) {
return detail::submit_kernel_direct_parallel_for<KernelName, true>(
*this, ext::oneapi::experimental::empty_properties_t{}, Range,
Rest..., sycl::span<const event>(&DepEvent, 1),
TlsCodeLocCapture.query());
} else {
return submit(
[&](handler &CGH) {
CGH.depends_on(DepEvent);
CGH.template parallel_for<KernelName>(Range, Rest...);
},
TlsCodeLocCapture.query());
}
}

/// parallel_for version with a kernel represented as a lambda + nd_range that
Expand Down Expand Up @@ -3461,12 +3509,28 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
RestT &&...Rest) {
constexpr detail::code_location CodeLoc = getCodeLocation<KernelName>();
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
return submit(
[&](handler &CGH) {
CGH.depends_on(DepEvents);
CGH.template parallel_for<KernelName>(Range, Rest...);
},
TlsCodeLocCapture.query());
using KernelType = std::tuple_element_t<0, std::tuple<RestT...>>;

// TODO The handler-less path does not support reductions, kernel
// function properties and kernel functions with the kernel_handler
// type argument yet.
if constexpr (sizeof...(RestT) == 1 &&
!(ext::oneapi::experimental::detail::
HasKernelPropertiesGetMethod<
const KernelType &>::value) &&
!(detail::KernelLambdaHasKernelHandlerArgT<
KernelType, sycl::nd_item<Dims>>::value)) {
return detail::submit_kernel_direct_parallel_for<KernelName, true>(
*this, ext::oneapi::experimental::empty_properties_t{}, Range,
Rest..., DepEvents, TlsCodeLocCapture.query());
} else {
return submit(
[&](handler &CGH) {
CGH.depends_on(DepEvents);
CGH.template parallel_for<KernelName>(Range, Rest...);
},
TlsCodeLocCapture.query());
}
}

/// Copies data from a memory region pointed to by a placeholder accessor to
Expand Down
93 changes: 93 additions & 0 deletions sycl/source/detail/event_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -611,6 +611,99 @@ bool event_impl::isCompleted() {

void event_impl::setCommand(Command *Cmd) { MCommand = Cmd; }

template <bool LockQueue>
void registerEventDependency(
const detail::EventImplPtr &EventImpl,
std::vector<detail::EventImplPtr> &EventsRegistered,
detail::queue_impl *QueueImpl, const detail::context_impl &ContextImpl,
const detail::device_impl &DeviceImpl,
const std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> &Graph,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do not need shared_ptr here. We can just pass a raw pointer to the graph_impl, right?

sycl::detail::CGType CommandGroupType) {

if (!EventImpl)
return;
if (EventImpl->isDiscarded()) {
throw sycl::exception(make_error_code(errc::invalid),
"Queue operation cannot depend on discarded event.");
}

// Async alloc calls adapter immediately. Any explicit/implicit dependencies
// are handled at that point, including in order queue deps. Further calls to
// depends_on after an async alloc are explicitly disallowed.
if (CommandGroupType == CGType::AsyncAlloc) {
throw sycl::exception(make_error_code(errc::invalid),
"Cannot submit a dependency after an asynchronous "
"allocation has already been executed!");
}

auto EventGraph = EventImpl->getCommandGraph();
if (QueueImpl && EventGraph) {
auto QueueGraph = QueueImpl->getCommandGraph();

if (&EventGraph->getContextImpl() != &ContextImpl) {
throw sycl::exception(
make_error_code(errc::invalid),
"Cannot submit to a queue with a dependency from a graph that is "
"associated with a different context.");
}

if (&EventGraph->getDeviceImpl() != &DeviceImpl) {
throw sycl::exception(
make_error_code(errc::invalid),
"Cannot submit to a queue with a dependency from a graph that is "
"associated with a different device.");
}

if (QueueGraph && QueueGraph != EventGraph) {
throw sycl::exception(sycl::make_error_code(errc::invalid),
"Cannot submit to a recording queue with a "
"dependency from a different graph.");
}

// If the event dependency has a graph, that means that the queue that
// created it was in recording mode. If the current queue is not recording,
// we need to set it to recording (implements the transitive queue recording
// feature).
if (!QueueGraph) {
if constexpr (LockQueue) {
EventGraph->beginRecording(*QueueImpl);
} else {
EventGraph->beginRecordingUnlockedQueue(*QueueImpl);
}
}
}

if (Graph) {
if (EventGraph == nullptr) {
throw sycl::exception(
make_error_code(errc::invalid),
"Graph nodes cannot depend on events from outside the graph.");
}
if (EventGraph != Graph) {
throw sycl::exception(
make_error_code(errc::invalid),
"Graph nodes cannot depend on events from another graph.");
}
}
EventsRegistered.push_back(EventImpl);
}

template void registerEventDependency<true>(
const detail::EventImplPtr &EventImpl,
std::vector<detail::EventImplPtr> &EventsRegistered,
detail::queue_impl *QueueImpl, const detail::context_impl &ContextImpl,
const detail::device_impl &DeviceImpl,
const std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> &Graph,
sycl::detail::CGType CommandGroupType);

template void registerEventDependency<false>(
const detail::EventImplPtr &EventImpl,
std::vector<detail::EventImplPtr> &EventsRegistered,
detail::queue_impl *QueueImpl, const detail::context_impl &ContextImpl,
const detail::device_impl &DeviceImpl,
const std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> &Graph,
sycl::detail::CGType CommandGroupType);
Comment on lines +691 to +705
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need it?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I meant why we cannot declare the template function in the header file?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, it is a circular dependency between the headers.


} // namespace detail
} // namespace _V1
} // namespace sycl
Loading
Loading