From 2f1181544fe549cc976e4965bd6396f837408806 Mon Sep 17 00:00:00 2001 From: "Agarwal, Udit" Date: Mon, 13 Oct 2025 23:27:55 +0200 Subject: [PATCH 01/18] Add support for kernel launch properties in no-handler path --- .../sycl/detail/kernel_launch_helper.hpp | 215 +++++++++++++++++- .../oneapi/experimental/enqueue_functions.hpp | 37 +-- sycl/include/sycl/handler.hpp | 151 +++++------- .../sycl/khr/free_function_commands.hpp | 37 ++- sycl/include/sycl/queue.hpp | 80 +++++-- sycl/source/detail/kernel_data.hpp | 66 ++++++ sycl/source/detail/queue_impl.cpp | 7 + sycl/source/detail/queue_impl.hpp | 7 +- sycl/source/handler.cpp | 57 ++--- sycl/source/queue.cpp | 12 +- .../non_esimd_kernel_fp_control.cpp | 4 +- .../include_deps/sycl_detail_core.hpp.cpp | 4 +- .../virtual-functions/properties-negative.cpp | 6 +- 13 files changed, 470 insertions(+), 213 deletions(-) diff --git a/sycl/include/sycl/detail/kernel_launch_helper.hpp b/sycl/include/sycl/detail/kernel_launch_helper.hpp index a80ddc9feb83f..781a992cef03f 100644 --- a/sycl/include/sycl/detail/kernel_launch_helper.hpp +++ b/sycl/include/sycl/detail/kernel_launch_helper.hpp @@ -14,6 +14,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -253,21 +256,219 @@ struct KernelWrapper< } }; // KernelWrapper struct -struct KernelLaunchPropertyWrapper { - template - static void parseProperties([[maybe_unused]] PropertyProcessor h, - [[maybe_unused]] const KernelType &KernelFunc) { +// This struct is inherited by sycl::handler. +class KernelLaunchPropertyWrapper { +public: + // This struct is used to store kernel launch properties. + // std::optional is used to indicate that the property is not set. + // In some code paths, kernel launch properties are set multiple times + // for the same kernel, that is why using std::optional to avoid overriding + // previously set properties. + struct KernelLaunchPropertiesT { + + struct ScopeForwardProgressProperty { + std::optional + Guarantee; + std::optional ExecScope; + std::optional + CoordinationScope; + }; + + std::optional MCacheConfig = std::nullopt; + std::optional MIsCooperative = std::nullopt; + std::optional MWorkGroupMemorySize = std::nullopt; + std::optional MUsesClusterLaunch = std::nullopt; + size_t MClusterDims = 0; + std::array MClusterSize = {0, 0, 0}; + + // Forward progress guarantee properties for work_item, sub_group and + // work_group scopes. + // Indexed by ExecutionScope enum. + std::array MForwardProgressProperties; + + KernelLaunchPropertiesT() = default; + + // TODO: Do you even need this? + KernelLaunchPropertiesT( + ur_kernel_cache_config_t _CacheConfig, bool _IsCooperative, + uint32_t _WorkGroupMemorySize, bool _UsesClusterLaunch, + size_t _ClusterDims, std::array _ClusterSize, + std::array _ForwardProgressProperties) + : MCacheConfig(_CacheConfig), MIsCooperative(_IsCooperative), + MWorkGroupMemorySize(_WorkGroupMemorySize), + MUsesClusterLaunch(_UsesClusterLaunch), MClusterDims(_ClusterDims), + MClusterSize(_ClusterSize), + MForwardProgressProperties(_ForwardProgressProperties) {} + }; // struct KernelLaunchPropertiesT + + /// Process runtime kernel properties. + /// + /// Stores information about kernel properties into the handler. + template + static KernelLaunchPropertiesT + processKernelLaunchProperties(PropertiesT Props) { + using namespace sycl::ext::oneapi::experimental; + using namespace sycl::ext::oneapi::experimental::detail; + KernelLaunchPropertiesT retval; + + // Process Kernel cache configuration property. + { + if constexpr (PropertiesT::template has_property< + sycl::ext::intel::experimental::cache_config_key>()) { + auto Config = Props.template get_property< + sycl::ext::intel::experimental::cache_config_key>(); + if (Config == sycl::ext::intel::experimental::large_slm) { + retval.MCacheConfig = UR_KERNEL_CACHE_CONFIG_LARGE_SLM; + } else if (Config == sycl::ext::intel::experimental::large_data) { + retval.MCacheConfig = UR_KERNEL_CACHE_CONFIG_LARGE_DATA; + } + } else { + std::ignore = Props; + } + } + + // Process Kernel cooperative property. + { + if constexpr (PropertiesT::template has_property()) + retval.MIsCooperative = true; + } + + // Process device progress properties. + { + using forward_progress = + sycl::ext::oneapi::experimental::forward_progress_guarantee; + if constexpr (PropertiesT::template has_property< + work_group_progress_key>()) { + auto prop = Props.template get_property(); + retval.MForwardProgressProperties[0].Guarantee = prop.guarantee; + retval.MForwardProgressProperties[0].ExecScope = + execution_scope::work_group; + retval.MForwardProgressProperties[0].CoordinationScope = + prop.coordinationScope; + + // If we are here, the device supports the guarantee required but there + // is a caveat in that if the guarantee required is a concurrent + // guarantee, then we most likely also need to enable cooperative launch + // of the kernel. That is, although the device supports the required + // guarantee, some setup work is needed to truly make the device provide + // that guarantee at runtime. Otherwise, we will get the default + // guarantee which is weaker than concurrent. Same reasoning applies for + // sub_group but not for work_item. + // TODO: Further design work is probably needed to reflect this behavior + // in Unified Runtime. + if constexpr (prop.guarantee == forward_progress::concurrent) + retval.MIsCooperative = true; + } + if constexpr (PropertiesT::template has_property< + sub_group_progress_key>()) { + auto prop = Props.template get_property(); + retval.MForwardProgressProperties[1].Guarantee = prop.guarantee; + retval.MForwardProgressProperties[1].ExecScope = + execution_scope::sub_group; + retval.MForwardProgressProperties[1].CoordinationScope = + prop.coordinationScope; + + // Same reasoning as above for work_group applies here. + if constexpr (prop.guarantee == forward_progress::concurrent) + retval.MIsCooperative = true; + } + if constexpr (PropertiesT::template has_property< + work_item_progress_key>()) { + auto prop = Props.template get_property(); + retval.MForwardProgressProperties[2].Guarantee = prop.guarantee; + retval.MForwardProgressProperties[2].ExecScope = + execution_scope::work_item; + retval.MForwardProgressProperties[2].CoordinationScope = + prop.coordinationScope; + } + } + + // Process work group scratch memory property. + { + if constexpr (PropertiesT::template has_property< + work_group_scratch_size>()) { + auto WorkGroupMemSize = + Props.template get_property(); + retval.MWorkGroupMemorySize = WorkGroupMemSize.size; + } + } + + // Parse cluster properties. + { + constexpr std::size_t ClusterDim = getClusterDim(); + if constexpr (ClusterDim > 0) { + + auto ClusterSize = + Props.template get_property>() + .get_cluster_size(); + retval.MUsesClusterLaunch = true; + retval.MClusterDims = ClusterDim; + if (ClusterDim == 1) { + retval.MClusterSize[0] = ClusterSize[0]; + } else if (ClusterDim == 2) { + retval.MClusterSize[0] = ClusterSize[0]; + retval.MClusterSize[1] = ClusterSize[1]; + } else if (ClusterDim == 3) { + retval.MClusterSize[0] = ClusterSize[0]; + retval.MClusterSize[1] = ClusterSize[1]; + retval.MClusterSize[2] = ClusterSize[2]; + } else { + assert(ClusterDim <= 3 && + "Only 1D, 2D, and 3D cluster launch is supported."); + } + } + } + + return retval; + } + + /// Process kernel properties. + /// + /// Stores information about kernel properties into the handler. + /// + /// Note: it is important that this function *does not* depend on kernel + /// name or kernel type, because then it will be instantiated for every + /// kernel, even though body of those instantiated functions could be almost + /// the same, thus unnecessary increasing compilation time. + template < + bool IsESIMDKernel, + typename PropertiesT = ext::oneapi::experimental::empty_properties_t> + static KernelLaunchPropertiesT processKernelProperties(PropertiesT Props) { + static_assert( + ext::oneapi::experimental::is_property_list::value, + "Template type is not a property list."); + static_assert( + !PropertiesT::template has_property< + sycl::ext::intel::experimental::fp_control_key>() || + (PropertiesT::template has_property< + sycl::ext::intel::experimental::fp_control_key>() && + IsESIMDKernel), + "Floating point control property is supported for ESIMD kernels only."); + static_assert( + !PropertiesT::template has_property< + sycl::ext::oneapi::experimental::indirectly_callable_key>(), + "indirectly_callable property cannot be applied to SYCL kernels"); + + return processKernelLaunchProperties(Props); + } + + // Returns KernelLaunchPropertiesT or std::nullopt based on whether the + // kernel functor has a get method that returns properties. + template + static std::optional + parseProperties([[maybe_unused]] const KernelType &KernelFunc) { #ifndef __SYCL_DEVICE_ONLY__ // If there are properties provided by get method then process them. if constexpr (ext::oneapi::experimental::detail:: HasKernelPropertiesGetMethod::value) { - h->template processProperties< - detail::CompileTimeKernelInfo.IsESIMD>( + return processKernelProperties( KernelFunc.get(ext::oneapi::experimental::properties_tag{})); } #endif + // If there are no properties provided by get method then return empty + // optional. + return std::nullopt; } }; // KernelLaunchPropertyWrapper struct diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index e393bd626d4d6..49aa3a252682e 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -259,17 +259,12 @@ template void nd_launch(queue Q, nd_range Range, const KernelType &KernelObj, ReductionsT &&...Reductions) { - // TODO The handler-less path does not support reductions, kernel - // function properties and kernel functions with the kernel_handler - // type argument yet. + // TODO The handler-less path does not support reductions, and + // kernel functions with the kernel_handler type argument yet. if constexpr (sizeof...(ReductionsT) == 0 && - !(ext::oneapi::experimental::detail:: - HasKernelPropertiesGetMethod< - const KernelType &>::value) && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { - detail::submit_kernel_direct(std::move(Q), empty_properties_t{}, - Range, KernelObj); + detail::submit_kernel_direct(std::move(Q), Range, KernelObj); } else { submit(std::move(Q), [&](handler &CGH) { nd_launch(CGH, Range, KernelObj, @@ -296,13 +291,25 @@ template void nd_launch(queue Q, launch_config, Properties> Config, const KernelType &KernelObj, ReductionsT &&...Reductions) { - // TODO This overload of the nd_launch function takes the kernel function - // properties, which are not yet supported for the handler-less path, - // so it only supports handler based submission for now - submit(std::move(Q), [&](handler &CGH) { - nd_launch(CGH, Config, KernelObj, - std::forward(Reductions)...); - }); + // TODO The handler-less path does not support reductions, and + // kernel functions with the kernel_handler type argument yet. + if constexpr (sizeof...(ReductionsT) == 0 && + !(detail::KernelLambdaHasKernelHandlerArgT< + KernelType, sycl::nd_item>::value)) { + + ext::oneapi::experimental::detail::LaunchConfigAccess, + Properties> + ConfigAccess(Config); + + detail::submit_kernel_direct(std::move(Q), + ConfigAccess.getRange(), KernelObj, + Config.getProperties()); + } else { + submit(std::move(Q), [&](handler &CGH) { + nd_launch(CGH, Config, KernelObj, + std::forward(Reductions)...); + }); + } } template diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 67f21bc05857f..78d58c80d6b4f 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -877,6 +877,7 @@ class __SYCL_EXPORT handler { } } +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES void verifyDeviceHasProgressGuarantee( sycl::ext::oneapi::experimental::forward_progress_guarantee guarantee, sycl::ext::oneapi::experimental::execution_scope threadScope, @@ -901,64 +902,10 @@ class __SYCL_EXPORT handler { /// Stores information about kernel properties into the handler. template void processLaunchProperties(PropertiesT Props) { - if constexpr (PropertiesT::template has_property< - sycl::ext::intel::experimental::cache_config_key>()) { - auto Config = Props.template get_property< - sycl::ext::intel::experimental::cache_config_key>(); - if (Config == sycl::ext::intel::experimental::large_slm) { - setKernelCacheConfig(StableKernelCacheConfig::LargeSLM); - } else if (Config == sycl::ext::intel::experimental::large_data) { - setKernelCacheConfig(StableKernelCacheConfig::LargeData); - } - } else { - (void)Props; - } - - constexpr bool UsesRootSync = PropertiesT::template has_property< - sycl::ext::oneapi::experimental::use_root_sync_key>(); - if (UsesRootSync) { - setKernelIsCooperative(UsesRootSync); - } - if constexpr (PropertiesT::template has_property< - sycl::ext::oneapi::experimental:: - work_group_progress_key>()) { - auto prop = Props.template get_property< - sycl::ext::oneapi::experimental::work_group_progress_key>(); - verifyDeviceHasProgressGuarantee( - prop.guarantee, - sycl::ext::oneapi::experimental::execution_scope::work_group, - prop.coordinationScope); - } - if constexpr (PropertiesT::template has_property< - sycl::ext::oneapi::experimental:: - sub_group_progress_key>()) { - auto prop = Props.template get_property< - sycl::ext::oneapi::experimental::sub_group_progress_key>(); - verifyDeviceHasProgressGuarantee( - prop.guarantee, - sycl::ext::oneapi::experimental::execution_scope::sub_group, - prop.coordinationScope); - } - if constexpr (PropertiesT::template has_property< - sycl::ext::oneapi::experimental:: - work_item_progress_key>()) { - auto prop = Props.template get_property< - sycl::ext::oneapi::experimental::work_item_progress_key>(); - verifyDeviceHasProgressGuarantee( - prop.guarantee, - sycl::ext::oneapi::experimental::execution_scope::work_item, - prop.coordinationScope); - } - - if constexpr (PropertiesT::template has_property< - sycl::ext::oneapi::experimental:: - work_group_scratch_size>()) { - auto WorkGroupMemSize = Props.template get_property< - sycl::ext::oneapi::experimental::work_group_scratch_size>(); - setKernelWorkGroupMem(WorkGroupMemSize.size); - } - - checkAndSetClusterRange(Props); + detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT ParsedProp = + detail::KernelLaunchPropertyWrapper::processKernelLaunchProperties< + PropertiesT>(Props); + setKernelLaunchProperties(ParsedProp); } /// Process kernel properties. @@ -973,23 +920,12 @@ class __SYCL_EXPORT handler { bool IsESIMDKernel, typename PropertiesT = ext::oneapi::experimental::empty_properties_t> void processProperties(PropertiesT Props) { - static_assert( - ext::oneapi::experimental::is_property_list::value, - "Template type is not a property list."); - static_assert( - !PropertiesT::template has_property< - sycl::ext::intel::experimental::fp_control_key>() || - (PropertiesT::template has_property< - sycl::ext::intel::experimental::fp_control_key>() && - IsESIMDKernel), - "Floating point control property is supported for ESIMD kernels only."); - static_assert( - !PropertiesT::template has_property< - sycl::ext::oneapi::experimental::indirectly_callable_key>(), - "indirectly_callable property cannot be applied to SYCL kernels"); - - processLaunchProperties(Props); + detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT ParsedProp = + detail::KernelLaunchPropertyWrapper::processKernelProperties< + IsESIMDKernel>(Props); + setKernelLaunchProperties(ParsedProp); } +#endif // INTEL_PREVIEW_BREAKING_CHANGES /// Checks whether it is possible to copy the source shape to the destination /// shape(the shapes are described by the accessor ranges) by using @@ -1297,8 +1233,11 @@ class __SYCL_EXPORT handler { decltype(Wrapper), TransformedArgType, PropertiesT>::wrap(Wrapper); - detail::KernelLaunchPropertyWrapper::parseProperties(this, - Wrapper); + if (auto prop = detail::KernelLaunchPropertyWrapper::parseProperties< + KName, Info.IsESIMD>(Wrapper)) { + setKernelLaunchProperties(*prop); + } + #ifndef __SYCL_DEVICE_ONLY__ verifyUsedKernelBundleInternal(Info.Name); // We are executing over the rounded range, but there are still @@ -1322,11 +1261,17 @@ class __SYCL_EXPORT handler { // kernel is generated detail::KernelWrapper::wrap(KernelFunc); - detail::KernelLaunchPropertyWrapper::parseProperties(this, - KernelFunc); + if (auto prop = detail::KernelLaunchPropertyWrapper::parseProperties< + NameT, Info.IsESIMD>(KernelFunc)) { + setKernelLaunchProperties(*prop); + } #ifndef __SYCL_DEVICE_ONLY__ verifyUsedKernelBundleInternal(Info.Name); - processProperties(Props); + detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT + ProcessedProps = + detail::KernelLaunchPropertyWrapper::processKernelProperties< + Info.IsESIMD, PropertiesT>(Props); + setKernelLaunchProperties(ProcessedProps); detail::checkValueRange(UserRange); setNDRangeDescriptor(std::move(UserRange)); StoreLambda( @@ -1355,7 +1300,10 @@ class __SYCL_EXPORT handler { setDeviceKernelInfo(std::move(Kernel)); detail::checkValueRange(NumWorkItems); setNDRangeDescriptor(std::move(NumWorkItems)); - processLaunchProperties(Props); + detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT ParsedProp = + detail::KernelLaunchPropertyWrapper::processKernelLaunchProperties< + PropertiesT>(Props); + setKernelLaunchProperties(ParsedProp); extractArgsAndReqs(); #endif } @@ -1378,7 +1326,10 @@ class __SYCL_EXPORT handler { setDeviceKernelInfo(std::move(Kernel)); detail::checkValueRange(NDRange); setNDRangeDescriptor(std::move(NDRange)); - processLaunchProperties(Props); + detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT ParsedProp = + detail::KernelLaunchPropertyWrapper::processKernelLaunchProperties< + PropertiesT>(Props); + setKernelLaunchProperties(ParsedProp); extractArgsAndReqs(); #endif } @@ -1395,12 +1346,14 @@ class __SYCL_EXPORT handler { using NameT = typename detail::get_kernel_name_t::name; (void)Props; + constexpr auto Info = detail::CompileTimeKernelInfo; detail::KernelWrapper::wrap(KernelFunc); - detail::KernelLaunchPropertyWrapper::parseProperties(this, - KernelFunc); + if (auto prop = detail::KernelLaunchPropertyWrapper::parseProperties< + NameT, Info.IsESIMD>(KernelFunc)) { + setKernelLaunchProperties(*prop); + } #ifndef __SYCL_DEVICE_ONLY__ - constexpr auto Info = detail::CompileTimeKernelInfo; if constexpr (WrapAsVal == detail::WrapAs::single_task) { throwOnKernelParameterMisuse(Info); } @@ -1416,7 +1369,11 @@ class __SYCL_EXPORT handler { } StoreLambda(std::move(KernelFunc)); - processProperties(Props); + detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT + ProcessedProps = + detail::KernelLaunchPropertyWrapper::processKernelProperties< + Info.IsESIMD, PropertiesT>(Props); + setKernelLaunchProperties(ProcessedProps); #endif } @@ -1439,8 +1396,10 @@ class __SYCL_EXPORT handler { (void)Kernel; detail::KernelWrapper::wrap(KernelFunc); - detail::KernelLaunchPropertyWrapper::parseProperties(this, - KernelFunc); + if (auto prop = detail::KernelLaunchPropertyWrapper::parseProperties( + KernelFunc)) { + setKernelLaunchProperties(*prop); + } #ifndef __SYCL_DEVICE_ONLY__ constexpr auto Info = detail::CompileTimeKernelInfo; if constexpr (WrapAsVal == detail::WrapAs::single_task) { @@ -1467,7 +1426,11 @@ class __SYCL_EXPORT handler { "the kernel name must match the name of the lambda"); } StoreLambda(std::move(KernelFunc)); - processProperties(Props); + detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT + ProcessedProps = + detail::KernelLaunchPropertyWrapper::processKernelProperties< + Info.IsESIMD, PropertiesT>(Props); + setKernelLaunchProperties(ProcessedProps); #endif } #endif // __INTEL_PREVIEW_BREAKING_CHANGES @@ -3490,7 +3453,9 @@ class __SYCL_EXPORT handler { bool IsDeviceImageScoped, size_t NumBytes, size_t Offset); - // Changing values in this will break ABI/API. +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES + // Modeled after ur_kernel_cache_config_t + // Used as an argument to setKernelCacheConfig that's part of the ABI. enum class StableKernelCacheConfig : int32_t { Default = 0, LargeSLM = 1, @@ -3503,15 +3468,18 @@ class __SYCL_EXPORT handler { void setKernelIsCooperative(bool); // Set using cuda thread block cluster launch flag and set the launch bounds. -#ifndef __INTEL_PREVIEW_BREAKING_CHANGES void setKernelClusterLaunch(sycl::range<3> ClusterSize, int Dims); -#endif void setKernelClusterLaunch(sycl::range<3> ClusterSize); void setKernelClusterLaunch(sycl::range<2> ClusterSize); void setKernelClusterLaunch(sycl::range<1> ClusterSize); // Set the request work group memory size (work_group_static ext). void setKernelWorkGroupMem(size_t Size); +#endif + + void setKernelLaunchProperties( + detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT + &KernelLaunchProperties); // Various checks that are only meaningful for host compilation, because they // result in runtime errors (i.e. exceptions being thrown). To save time @@ -3668,7 +3636,6 @@ class __SYCL_EXPORT handler { void instantiateKernelOnHost(void *InstantiateKernelOnHostPtr); friend class detail::HandlerAccess; - friend struct detail::KernelLaunchPropertyWrapper; #ifdef __INTEL_PREVIEW_BREAKING_CHANGES __SYCL_DLL_LOCAL detail::handler_impl *get_impl() { return impl; } diff --git a/sycl/include/sycl/khr/free_function_commands.hpp b/sycl/include/sycl/khr/free_function_commands.hpp index b04fac17a6f9c..b422964f3a3a1 100644 --- a/sycl/include/sycl/khr/free_function_commands.hpp +++ b/sycl/include/sycl/khr/free_function_commands.hpp @@ -157,16 +157,15 @@ template r, range<1> size, KernelType &&k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { - // TODO The handler-less path does not support kernel function properties - // and kernel functions with the kernel_handler type argument yet. + // TODO The handler-less path does not support kernel functions with the + // kernel_handler type argument yet. if constexpr (!(ext::oneapi::experimental::detail:: HasKernelPropertiesGetMethod< const KernelType &>::value) && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item<1>>::value)) { - detail::submit_kernel_direct( - q, ext::oneapi::experimental::empty_properties_t{}, - nd_range<1>(r, size), std::forward(k)); + detail::submit_kernel_direct(q, nd_range<1>(r, size), + std::forward(k)); } else { submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, @@ -178,16 +177,12 @@ template r, range<2> size, KernelType &&k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { - // TODO The handler-less path does not support kernel function properties - // and kernel functions with the kernel_handler type argument yet. - if constexpr (!(ext::oneapi::experimental::detail:: - HasKernelPropertiesGetMethod< - const KernelType &>::value) && - !(detail::KernelLambdaHasKernelHandlerArgT< + // TODO The handler-less path does not support kernel functions with the + // kernel_handler type argument yet. + if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item<2>>::value)) { - detail::submit_kernel_direct( - q, ext::oneapi::experimental::empty_properties_t{}, - nd_range<2>(r, size), std::forward(k)); + detail::submit_kernel_direct(q, nd_range<2>(r, size), + std::forward(k)); } else { submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, @@ -199,16 +194,12 @@ template r, range<3> size, KernelType &&k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { - // TODO The handler-less path does not support kernel function properties - // and kernel functions with the kernel_handler type argument yet. - if constexpr (!(ext::oneapi::experimental::detail:: - HasKernelPropertiesGetMethod< - const KernelType &>::value) && - !(detail::KernelLambdaHasKernelHandlerArgT< + // TODO The handler-less path does not support kernel functions with the + // kernel_handler type argument yet. + if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item<3>>::value)) { - detail::submit_kernel_direct( - q, ext::oneapi::experimental::empty_properties_t{}, - nd_range<3>(r, size), std::forward(k)); + detail::submit_kernel_direct(q, nd_range<3>(r, size), + std::forward(k)); } else { submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 755b07f39bbf3..8cdf35f50a93e 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -68,6 +68,7 @@ event __SYCL_EXPORT submit_kernel_direct_with_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template @@ -75,6 +76,7 @@ void __SYCL_EXPORT submit_kernel_direct_without_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); namespace detail { @@ -158,17 +160,13 @@ class __SYCL_EXPORT SubmissionInfo { } // namespace v1 template + typename PropertiesT = ext::oneapi::experimental::empty_properties_t, + typename KernelTypeUniversalRef, int Dims> auto submit_kernel_direct( - const queue &Queue, PropertiesT Props, const nd_range &Range, + const queue &Queue, const nd_range &Range, KernelTypeUniversalRef &&KernelFunc, + PropertiesT ExtraProps = ext::oneapi::experimental::empty_properties_t{}, const detail::code_location &CodeLoc = detail::code_location::current()) { - // TODO Properties not supported yet - (void)Props; - static_assert( - std::is_same_v, - "Setting properties not supported yet for no-CGH kernel submit."); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); using KernelType = @@ -194,6 +192,32 @@ auto submit_kernel_direct( HostKernelRef HostKernel(std::forward(KernelFunc)); + // Get Kernel Launch properties. User can specify properties either + // via specifying get(property_tag{}) method in kernel type or by using + // launch_config API or by explicitly passing them in call to + // parallel_for (deprecated API). + // ExtraProps are properties passed explicitly or via launch_config. + + // Asumption: If user specify properties via launch_config or explicitly + // then we don't check for properties specified via get() method. + detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT parsedProps; + if constexpr (std::is_same_v) { + // Use properties passed via. get() method. + if constexpr (ext::oneapi::experimental::detail:: + HasKernelPropertiesGetMethod::value) { + auto prop = KernelFunc.get(ext::oneapi::experimental::properties_tag{}); + parsedProps = + detail::KernelLaunchPropertyWrapper::processKernelProperties( + prop); + } + } else { + // Use ExtraProps + parsedProps = + detail::KernelLaunchPropertyWrapper::processKernelProperties( + ExtraProps); + } + // Instantiating the kernel on the host improves debugging. // Passing this pointer to another translation unit prevents optimization. #ifndef NDEBUG @@ -207,11 +231,11 @@ auto submit_kernel_direct( if constexpr (EventNeeded) { return submit_kernel_direct_with_event_impl( - Queue, Range, HostKernel, DeviceKernelInfoPtr, + Queue, Range, HostKernel, DeviceKernelInfoPtr, parsedProps, TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); } else { submit_kernel_direct_without_event_impl( - Queue, Range, HostKernel, DeviceKernelInfoPtr, + Queue, Range, HostKernel, DeviceKernelInfoPtr, parsedProps, TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); } } @@ -3261,11 +3285,24 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { RestT &&...Rest) { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return submit( - [&](handler &CGH) { - CGH.template parallel_for(Range, Properties, Rest...); - }, - TlsCodeLocCapture.query()); + using KernelType = std::tuple_element_t<0, std::tuple>; + + // TODO The handler-less path does not support reductions, and + // kernel functions with the kernel_handler type argument yet. + if constexpr (sizeof...(RestT) == 1 && + !(detail::KernelLambdaHasKernelHandlerArgT< + KernelType, sycl::nd_item>::value)) { + + // FIXME: Can it happen that user defined both get() and properties? + // If so, we should use MergedProperties instead of Properties here. + return detail::submit_kernel_direct( + *this, Range, Rest..., Properties, TlsCodeLocCapture.query()); + } else + return submit( + [&](handler &CGH) { + CGH.template parallel_for(Range, Properties, Rest...); + }, + TlsCodeLocCapture.query()); } /// parallel_for version with a kernel represented as a lambda + nd_range that @@ -3282,18 +3319,15 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); using KernelType = std::tuple_element_t<0, std::tuple>; - // TODO The handler-less path does not support reductions, kernel - // function properties and kernel functions with the kernel_handler - // type argument yet. + // TODO The handler-less path does not support reductions, and + // kernel functions with the kernel_handler type argument yet. if constexpr (sizeof...(RestT) == 1 && - !(ext::oneapi::experimental::detail:: - HasKernelPropertiesGetMethod< - const KernelType &>::value) && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { return detail::submit_kernel_direct( - *this, ext::oneapi::experimental::empty_properties_t{}, Range, - Rest..., TlsCodeLocCapture.query()); + *this, Range, Rest..., + ext::oneapi::experimental::empty_properties_t{}, + TlsCodeLocCapture.query()); } else { return submit( [&](handler &CGH) { diff --git a/sycl/source/detail/kernel_data.hpp b/sycl/source/detail/kernel_data.hpp index 7ba849dc33f1f..037341b9c0691 100644 --- a/sycl/source/detail/kernel_data.hpp +++ b/sycl/source/detail/kernel_data.hpp @@ -13,7 +13,10 @@ #include #include +#include + #include +#include #include @@ -107,6 +110,7 @@ class KernelData { void setDeviceKernelInfoPtr(DeviceKernelInfo *Ptr) { MDeviceKernelInfoPtr = Ptr; } + #ifndef __INTEL_PREVIEW_BREAKING_CHANGES void setKernelInfo(void *KernelFuncPtr, int KernelNumArgs, KernelParamDescGetterT KernelParamDescGetter, @@ -134,6 +138,7 @@ class KernelData { return MDeviceKernelInfoPtr->usesAssert(); } + // Kernel launch properties getter and setters. ur_kernel_cache_config_t getKernelCacheConfig() const { return MKernelCacheConfig; } @@ -163,6 +168,67 @@ class KernelData { MKernelWorkGroupMemorySize = Size; } + void validateAndSetKernelLaunchProperties( + const KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Kprop, + bool HasGraph, const device_impl &dev) { + + // Validate properties before setting. + { + if (HasGraph) { + if (Kprop.MWorkGroupMemorySize) { + throw sycl::exception( + sycl::make_error_code(errc::invalid), + "Setting work group scratch memory size is not yet supported " + "for use with the SYCL Graph extension."); + } + + if (Kprop.MUsesClusterLaunch && *Kprop.MUsesClusterLaunch) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "Cluster launch is not yet supported " + "for use with the SYCL Graph extension."); + } + } + + for (int i = 0; i < 3; i++) { + if (Kprop.MForwardProgressProperties[i].Guarantee.has_value()) { + + if (!dev.supportsForwardProgress( + *Kprop.MForwardProgressProperties[i].Guarantee, + *Kprop.MForwardProgressProperties[i].ExecScope, + *Kprop.MForwardProgressProperties[i].CoordinationScope)) { + // TODO: Make the error message more descriptive. + throw sycl::exception( + sycl::make_error_code(errc::feature_not_supported), + "The device associated with the queue does not support the " + "requested forward progress guarantee."); + } + } + } + } + + // Set properties. + if (Kprop.MIsCooperative) + setCooperative(*Kprop.MIsCooperative); + + if (Kprop.MCacheConfig) + setKernelCacheConfig(*Kprop.MCacheConfig); + + if (Kprop.MWorkGroupMemorySize) + setKernelWorkGroupMemorySize(*Kprop.MWorkGroupMemorySize); + + if (Kprop.MUsesClusterLaunch && *Kprop.MUsesClusterLaunch) { + if (Kprop.MClusterDims == 1) + setClusterDimensions(sycl::range<1>{Kprop.MClusterSize[0]}); + else if (Kprop.MClusterDims == 2) + setClusterDimensions( + sycl::range<2>{Kprop.MClusterSize[0], Kprop.MClusterSize[1]}); + else if (Kprop.MClusterDims == 3) + setClusterDimensions(sycl::range<3>{Kprop.MClusterSize[0], + Kprop.MClusterSize[1], + Kprop.MClusterSize[2]}); + } + } + KernelNameStrRefT getKernelName() const { assert(MDeviceKernelInfoPtr); return static_cast(MDeviceKernelInfoPtr->Name); diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 4ed73e700d8ce..c1e0452796fb3 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -567,6 +567,7 @@ EventImplPtr queue_impl::submit_command_to_graph( EventImplPtr queue_impl::submit_kernel_direct_impl( const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent, + const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { KernelData KData; @@ -574,6 +575,12 @@ EventImplPtr queue_impl::submit_kernel_direct_impl( KData.setDeviceKernelInfoPtr(DeviceKernelInfo); KData.setNDRDesc(NDRDesc); + // Validate and set kernel launch properties. + KData.validateAndSetKernelLaunchProperties( + Props, getCommandGraph() != nullptr, /*HasGraph?*/ + getDeviceImpl() /*device_impl*/ + ); + auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData, bool SchedulerBypass) -> EventImplPtr { if (SchedulerBypass) { diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 49da7aee8c448..4528752ba95f4 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -363,10 +363,11 @@ class queue_impl : public std::enable_shared_from_this { event submit_kernel_direct_with_event( const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { detail::EventImplPtr EventImpl = submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, - true, CodeLoc, IsTopCodeLoc); + true, Props, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } @@ -374,9 +375,10 @@ class queue_impl : public std::enable_shared_from_this { void submit_kernel_direct_without_event( const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, - false, CodeLoc, IsTopCodeLoc); + false, Props, CodeLoc, IsTopCodeLoc); } void submit_without_event(const detail::type_erased_cgfo_ty &CGF, @@ -929,6 +931,7 @@ class queue_impl : public std::enable_shared_from_this { EventImplPtr submit_kernel_direct_impl( const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent, + const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 26477c99be62c..2aacd56fda15d 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -1752,51 +1752,17 @@ static bool checkContextSupports(detail::context_impl &ContextImpl, return SupportsOp; } +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES void handler::verifyDeviceHasProgressGuarantee( sycl::ext::oneapi::experimental::forward_progress_guarantee guarantee, sycl::ext::oneapi::experimental::execution_scope threadScope, sycl::ext::oneapi::experimental::execution_scope coordinationScope) { - using execution_scope = sycl::ext::oneapi::experimental::execution_scope; - using forward_progress = - sycl::ext::oneapi::experimental::forward_progress_guarantee; - const bool supported = impl->get_device().supportsForwardProgress( - guarantee, threadScope, coordinationScope); - if (threadScope == execution_scope::work_group) { - if (!supported) { - throw sycl::exception( - sycl::errc::feature_not_supported, - "Required progress guarantee for work groups is not " - "supported by this device."); - } - // If we are here, the device supports the guarantee required but there is a - // caveat in that if the guarantee required is a concurrent guarantee, then - // we most likely also need to enable cooperative launch of the kernel. That - // is, although the device supports the required guarantee, some setup work - // is needed to truly make the device provide that guarantee at runtime. - // Otherwise, we will get the default guarantee which is weaker than - // concurrent. Same reasoning applies for sub_group but not for work_item. - // TODO: Further design work is probably needed to reflect this behavior in - // Unified Runtime. - if (guarantee == forward_progress::concurrent) - setKernelIsCooperative(true); - } else if (threadScope == execution_scope::sub_group) { - if (!supported) { - throw sycl::exception(sycl::errc::feature_not_supported, - "Required progress guarantee for sub groups is not " - "supported by this device."); - } - // Same reasoning as above. - if (guarantee == forward_progress::concurrent) - setKernelIsCooperative(true); - } else { // threadScope is execution_scope::work_item otherwise undefined - // behavior - if (!supported) { - throw sycl::exception(sycl::errc::feature_not_supported, - "Required progress guarantee for work items is not " - "supported by this device."); - } - } + + // FIXME! + detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT Kprop; + setKernelLaunchProperties(Kprop); } +#endif bool handler::supportsUSMMemcpy2D() { if (impl->get_graph_or_null()) @@ -1910,6 +1876,13 @@ void handler::memcpyFromHostOnlyDeviceGlobal(void *Dest, }); } +void handler::setKernelLaunchProperties( + detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Kprop) { + impl->MKernelData.validateAndSetKernelLaunchProperties( + Kprop, getCommandGraph() != nullptr /*hasGraph?*/, + impl->get_device() /*device_impl*/); +} + #ifndef __INTEL_PREVIEW_BREAKING_CHANGES const std::shared_ptr & handler::getContextImplPtr() const { @@ -1927,6 +1900,7 @@ detail::context_impl &handler::getContextImpl() const { return impl->get_queue().getContextImpl(); } +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES void handler::setKernelCacheConfig(handler::StableKernelCacheConfig Config) { switch (Config) { case handler::StableKernelCacheConfig::Default: @@ -1945,7 +1919,6 @@ void handler::setKernelIsCooperative(bool KernelIsCooperative) { impl->MKernelData.setCooperative(KernelIsCooperative); } -#ifndef __INTEL_PREVIEW_BREAKING_CHANGES void handler::setKernelClusterLaunch(sycl::range<3> ClusterSize, int Dims) { throwIfGraphAssociated< syclex::detail::UnsupportedGraphFeatures:: @@ -1961,7 +1934,6 @@ void handler::setKernelClusterLaunch(sycl::range<3> ClusterSize, int Dims) { impl->MKernelData.setClusterDimensions(ClusterSize); } } -#endif void handler::setKernelClusterLaunch(sycl::range<3> ClusterSize) { throwIfGraphAssociated< @@ -1989,6 +1961,7 @@ void handler::setKernelWorkGroupMem(size_t Size) { sycl_ext_oneapi_work_group_scratch_memory>(); impl->MKernelData.setKernelWorkGroupMemorySize(Size); } +#endif // __INTEL_PREVIEW_BREAKING_CHANGES void handler::ext_oneapi_graph( ext::oneapi::experimental::command_graph< diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index f34da47852266..73a53a01abda2 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -476,27 +476,31 @@ event submit_kernel_direct_with_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { return getSyclObjImpl(Queue)->submit_kernel_direct_with_event( - Range, HostKernel, DeviceKernelInfo, CodeLoc, IsTopCodeLoc); + Range, HostKernel, DeviceKernelInfo, Props, CodeLoc, IsTopCodeLoc); } template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<1>( const queue &Queue, const nd_range<1> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<2>( const queue &Queue, const nd_range<2> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<3>( const queue &Queue, const nd_range<3> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template @@ -504,27 +508,31 @@ void submit_kernel_direct_without_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { getSyclObjImpl(Queue)->submit_kernel_direct_without_event( - Range, HostKernel, DeviceKernelInfo, CodeLoc, IsTopCodeLoc); + Range, HostKernel, DeviceKernelInfo, Props, CodeLoc, IsTopCodeLoc); } template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<1>( const queue &Queue, const nd_range<1> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<2>( const queue &Queue, const nd_range<2> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<3>( const queue &Queue, const nd_range<3> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, + const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); } // namespace _V1 diff --git a/sycl/test/extensions/properties/non_esimd_kernel_fp_control.cpp b/sycl/test/extensions/properties/non_esimd_kernel_fp_control.cpp index 46d11eccdfe54..e6910484bf52f 100644 --- a/sycl/test/extensions/properties/non_esimd_kernel_fp_control.cpp +++ b/sycl/test/extensions/properties/non_esimd_kernel_fp_control.cpp @@ -20,7 +20,7 @@ struct ESIMDKernel { int main(void) { queue q; - // expected-error-re@sycl/handler.hpp:* {{static assertion failed due to requirement {{.+}}: Floating point control property is supported for ESIMD kernels only.}} + // expected-error-re@sycl/detail/kernel_launch_helper.hpp:* {{static assertion failed due to requirement {{.+}}: Floating point control property is supported for ESIMD kernels only.}} syclex::properties properties7{ intelex::fp_control}; @@ -28,7 +28,7 @@ int main(void) { cgh.single_task(properties7, [=]() {}); }); - // expected-error-re@sycl/handler.hpp:* {{static assertion failed due to requirement {{.+}}: Floating point control property is supported for ESIMD kernels only.}} + // expected-error-re@sycl/detail/kernel_launch_helper.hpp:* {{static assertion failed due to requirement {{.+}}: Floating point control property is supported for ESIMD kernels only.}} ESIMDKernel Kern; q.submit([&](handler &cgh) { cgh.parallel_for(range<1>(1), Kern); }); diff --git a/sycl/test/include_deps/sycl_detail_core.hpp.cpp b/sycl/test/include_deps/sycl_detail_core.hpp.cpp index cf98e8708254a..9f68a2bbe4d9d 100644 --- a/sycl/test/include_deps/sycl_detail_core.hpp.cpp +++ b/sycl/test/include_deps/sycl_detail_core.hpp.cpp @@ -138,6 +138,8 @@ // CHECK-NEXT: detail/kernel_launch_helper.hpp // CHECK-NEXT: ext/intel/experimental/fp_control_kernel_properties.hpp // CHECK-NEXT: ext/intel/experimental/kernel_execution_properties.hpp +// CHECK-NEXT: ext/oneapi/experimental/cluster_group_prop.hpp +// CHECK-NEXT: ext/oneapi/experimental/use_root_sync_prop.hpp // CHECK-NEXT: ext/oneapi/experimental/virtual_functions.hpp // CHECK-NEXT: ext/oneapi/kernel_properties/properties.hpp // CHECK-NEXT: ext/oneapi/work_group_scratch_memory.hpp @@ -149,9 +151,7 @@ // CHECK-NEXT: ext/oneapi/bindless_images_interop.hpp // CHECK-NEXT: ext/oneapi/interop_common.hpp // CHECK-NEXT: ext/oneapi/bindless_images_mem_handle.hpp -// CHECK-NEXT: ext/oneapi/experimental/cluster_group_prop.hpp // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp -// CHECK-NEXT: ext/oneapi/experimental/use_root_sync_prop.hpp // CHECK-NEXT: kernel.hpp // CHECK-NEXT: sampler.hpp // CHECK-NEXT: feature_test.hpp diff --git a/sycl/test/virtual-functions/properties-negative.cpp b/sycl/test/virtual-functions/properties-negative.cpp index b8e1b75f1d9a9..0ef06b3652ad1 100644 --- a/sycl/test/virtual-functions/properties-negative.cpp +++ b/sycl/test/virtual-functions/properties-negative.cpp @@ -17,15 +17,15 @@ int main() { oneapi::properties props_int{oneapi::indirectly_callable_in}; oneapi::properties props_user{oneapi::indirectly_callable_in}; - // expected-error-re@sycl/handler.hpp:* {{static assertion failed due to requirement {{.*}} indirectly_callable property cannot be applied to SYCL kernels}} + // expected-error-re@sycl/detail/kernel_launch_helper.hpp:* {{static assertion failed due to requirement {{.*}} indirectly_callable property cannot be applied to SYCL kernels}} q.single_task(props_empty, [=]() {}); // When both "props_empty" and "props_void" are in use, we won't see the // static assert firing for the second one, because there will be only one // instantiation of handler::processProperties. q.single_task(props_void, [=]() {}); - // expected-error-re@sycl/handler.hpp:* {{static assertion failed due to requirement {{.*}} indirectly_callable property cannot be applied to SYCL kernels}} + // expected-error-re@sycl/detail/kernel_launch_helper.hpp:* {{static assertion failed due to requirement {{.*}} indirectly_callable property cannot be applied to SYCL kernels}} q.single_task(props_int, [=]() {}); - // expected-error-re@sycl/handler.hpp:* {{static assertion failed due to requirement {{.*}} indirectly_callable property cannot be applied to SYCL kernels}} + // expected-error-re@sycl/detail/kernel_launch_helper.hpp:* {{static assertion failed due to requirement {{.*}} indirectly_callable property cannot be applied to SYCL kernels}} q.single_task(props_user, [=]() {}); return 0; From 012c7931acf3fee4409bf86e01fe9b2a9c05be60 Mon Sep 17 00:00:00 2001 From: "Agarwal, Udit" Date: Tue, 14 Oct 2025 00:16:14 +0200 Subject: [PATCH 02/18] Minor fixes --- .../sycl/detail/kernel_launch_helper.hpp | 30 +++---------- sycl/source/detail/kernel_data.hpp | 3 +- sycl/source/handler.cpp | 43 +++++++++++++++++-- 3 files changed, 47 insertions(+), 29 deletions(-) diff --git a/sycl/include/sycl/detail/kernel_launch_helper.hpp b/sycl/include/sycl/detail/kernel_launch_helper.hpp index 781a992cef03f..bd203d58850cb 100644 --- a/sycl/include/sycl/detail/kernel_launch_helper.hpp +++ b/sycl/include/sycl/detail/kernel_launch_helper.hpp @@ -256,7 +256,8 @@ struct KernelWrapper< } }; // KernelWrapper struct -// This struct is inherited by sycl::handler. +// This class encapsulates everything related to parsing kernel launch +// properties. class KernelLaunchPropertyWrapper { public: // This struct is used to store kernel launch properties. @@ -264,8 +265,9 @@ class KernelLaunchPropertyWrapper { // In some code paths, kernel launch properties are set multiple times // for the same kernel, that is why using std::optional to avoid overriding // previously set properties. + // This struct is used to pass kernel launch properties across the ABI + // boundary. struct KernelLaunchPropertiesT { - struct ScopeForwardProgressProperty { std::optional Guarantee; @@ -282,28 +284,11 @@ class KernelLaunchPropertyWrapper { std::array MClusterSize = {0, 0, 0}; // Forward progress guarantee properties for work_item, sub_group and - // work_group scopes. - // Indexed by ExecutionScope enum. + // work_group scopes. We need to store them for validation later. std::array MForwardProgressProperties; - - KernelLaunchPropertiesT() = default; - - // TODO: Do you even need this? - KernelLaunchPropertiesT( - ur_kernel_cache_config_t _CacheConfig, bool _IsCooperative, - uint32_t _WorkGroupMemorySize, bool _UsesClusterLaunch, - size_t _ClusterDims, std::array _ClusterSize, - std::array _ForwardProgressProperties) - : MCacheConfig(_CacheConfig), MIsCooperative(_IsCooperative), - MWorkGroupMemorySize(_WorkGroupMemorySize), - MUsesClusterLaunch(_UsesClusterLaunch), MClusterDims(_ClusterDims), - MClusterSize(_ClusterSize), - MForwardProgressProperties(_ForwardProgressProperties) {} - }; // struct KernelLaunchPropertiesT + }; /// Process runtime kernel properties. - /// - /// Stores information about kernel properties into the handler. template static KernelLaunchPropertiesT processKernelLaunchProperties(PropertiesT Props) { @@ -423,9 +408,6 @@ class KernelLaunchPropertyWrapper { } /// Process kernel properties. - /// - /// Stores information about kernel properties into the handler. - /// /// Note: it is important that this function *does not* depend on kernel /// name or kernel type, because then it will be instantiated for every /// kernel, even though body of those instantiated functions could be almost diff --git a/sycl/source/detail/kernel_data.hpp b/sycl/source/detail/kernel_data.hpp index 037341b9c0691..d73229a909fa9 100644 --- a/sycl/source/detail/kernel_data.hpp +++ b/sycl/source/detail/kernel_data.hpp @@ -8,13 +8,12 @@ #pragma once +#include #include #include #include #include -#include - #include #include diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 2aacd56fda15d..619f8b121aa6d 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -1758,9 +1758,46 @@ void handler::verifyDeviceHasProgressGuarantee( sycl::ext::oneapi::experimental::execution_scope threadScope, sycl::ext::oneapi::experimental::execution_scope coordinationScope) { - // FIXME! - detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT Kprop; - setKernelLaunchProperties(Kprop); + using execution_scope = sycl::ext::oneapi::experimental::execution_scope; + using forward_progress = + sycl::ext::oneapi::experimental::forward_progress_guarantee; + const bool supported = impl->get_device().supportsForwardProgress( + guarantee, threadScope, coordinationScope); + if (threadScope == execution_scope::work_group) { + if (!supported) { + throw sycl::exception( + sycl::errc::feature_not_supported, + "Required progress guarantee for work groups is not " + "supported by this device."); + } + // If we are here, the device supports the guarantee required but there is a + // caveat in that if the guarantee required is a concurrent guarantee, then + // we most likely also need to enable cooperative launch of the kernel. That + // is, although the device supports the required guarantee, some setup work + // is needed to truly make the device provide that guarantee at runtime. + // Otherwise, we will get the default guarantee which is weaker than + // concurrent. Same reasoning applies for sub_group but not for work_item. + // TODO: Further design work is probably needed to reflect this behavior in + // Unified Runtime. + if (guarantee == forward_progress::concurrent) + setKernelIsCooperative(true); + } else if (threadScope == execution_scope::sub_group) { + if (!supported) { + throw sycl::exception(sycl::errc::feature_not_supported, + "Required progress guarantee for sub groups is not " + "supported by this device."); + } + // Same reasoning as above. + if (guarantee == forward_progress::concurrent) + setKernelIsCooperative(true); + } else { // threadScope is execution_scope::work_item otherwise undefined + // behavior + if (!supported) { + throw sycl::exception(sycl::errc::feature_not_supported, + "Required progress guarantee for work items is not " + "supported by this device."); + } + } } #endif From 6f12dff1235c756d4116358d4aea61628b91af0b Mon Sep 17 00:00:00 2001 From: Udit Kumar Agarwal Date: Mon, 13 Oct 2025 15:17:44 -0700 Subject: [PATCH 03/18] Update sycl/include/sycl/queue.hpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- sycl/include/sycl/queue.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 8cdf35f50a93e..24edef878a3b1 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -198,7 +198,7 @@ auto submit_kernel_direct( // parallel_for (deprecated API). // ExtraProps are properties passed explicitly or via launch_config. - // Asumption: If user specify properties via launch_config or explicitly + // Assumption: If user specify properties via launch_config or explicitly // then we don't check for properties specified via get() method. detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT parsedProps; if constexpr (std::is_same_v Date: Wed, 15 Oct 2025 01:11:32 +0200 Subject: [PATCH 04/18] Address feedback --- .../sycl/detail/kernel_launch_helper.hpp | 330 ++++++++---------- .../oneapi/experimental/enqueue_functions.hpp | 8 +- sycl/include/sycl/handler.hpp | 54 ++- sycl/include/sycl/queue.hpp | 14 +- sycl/source/detail/kernel_data.hpp | 83 +++-- sycl/source/detail/queue_impl.cpp | 2 +- sycl/source/detail/queue_impl.hpp | 6 +- sycl/source/handler.cpp | 2 +- sycl/source/queue.cpp | 16 +- 9 files changed, 247 insertions(+), 268 deletions(-) diff --git a/sycl/include/sycl/detail/kernel_launch_helper.hpp b/sycl/include/sycl/detail/kernel_launch_helper.hpp index bd203d58850cb..05b76bd0e1b5f 100644 --- a/sycl/include/sycl/detail/kernel_launch_helper.hpp +++ b/sycl/include/sycl/detail/kernel_launch_helper.hpp @@ -256,203 +256,179 @@ struct KernelWrapper< } }; // KernelWrapper struct -// This class encapsulates everything related to parsing kernel launch +// This namespace encapsulates everything related to parsing kernel launch // properties. -class KernelLaunchPropertyWrapper { -public: - // This struct is used to store kernel launch properties. - // std::optional is used to indicate that the property is not set. - // In some code paths, kernel launch properties are set multiple times - // for the same kernel, that is why using std::optional to avoid overriding - // previously set properties. - // This struct is used to pass kernel launch properties across the ABI - // boundary. - struct KernelLaunchPropertiesT { - struct ScopeForwardProgressProperty { - std::optional - Guarantee; - std::optional ExecScope; - std::optional - CoordinationScope; - }; - - std::optional MCacheConfig = std::nullopt; - std::optional MIsCooperative = std::nullopt; - std::optional MWorkGroupMemorySize = std::nullopt; - std::optional MUsesClusterLaunch = std::nullopt; - size_t MClusterDims = 0; - std::array MClusterSize = {0, 0, 0}; +inline namespace kernel_launch_properties_v1 { + +// This struct is used to store kernel launch properties. +// std::optional is used to indicate that the property is not set. +// This struct is used to pass kernel launch properties across the ABI +// boundary. +struct KernelLaunchPropertiesTy { + // Modeled after ur_kernel_cache_config_t + enum class StableKernelCacheConfig : int32_t { + Default = 0, + LargeSLM = 1, + LargeData = 2 + }; - // Forward progress guarantee properties for work_item, sub_group and - // work_group scopes. We need to store them for validation later. - std::array MForwardProgressProperties; + struct ScopeForwardProgressProperty { + std::optional + Guarantee; + std::optional ExecScope; + std::optional + CoordinationScope; }; - /// Process runtime kernel properties. - template - static KernelLaunchPropertiesT - processKernelLaunchProperties(PropertiesT Props) { - using namespace sycl::ext::oneapi::experimental; - using namespace sycl::ext::oneapi::experimental::detail; - KernelLaunchPropertiesT retval; - - // Process Kernel cache configuration property. - { - if constexpr (PropertiesT::template has_property< - sycl::ext::intel::experimental::cache_config_key>()) { - auto Config = Props.template get_property< - sycl::ext::intel::experimental::cache_config_key>(); - if (Config == sycl::ext::intel::experimental::large_slm) { - retval.MCacheConfig = UR_KERNEL_CACHE_CONFIG_LARGE_SLM; - } else if (Config == sycl::ext::intel::experimental::large_data) { - retval.MCacheConfig = UR_KERNEL_CACHE_CONFIG_LARGE_DATA; - } - } else { - std::ignore = Props; - } - } + std::optional MCacheConfig = std::nullopt; + std::optional MIsCooperative = std::nullopt; + std::optional MWorkGroupMemorySize = std::nullopt; + std::optional MUsesClusterLaunch = std::nullopt; + size_t MClusterDims = 0; + std::array MClusterSize = {0, 0, 0}; - // Process Kernel cooperative property. - { - if constexpr (PropertiesT::template has_property()) - retval.MIsCooperative = true; - } + // Forward progress guarantee properties for work_item, sub_group and + // work_group scopes. We need to store them for validation later. + std::array MForwardProgressProperties; +}; - // Process device progress properties. - { - using forward_progress = - sycl::ext::oneapi::experimental::forward_progress_guarantee; - if constexpr (PropertiesT::template has_property< - work_group_progress_key>()) { - auto prop = Props.template get_property(); - retval.MForwardProgressProperties[0].Guarantee = prop.guarantee; - retval.MForwardProgressProperties[0].ExecScope = - execution_scope::work_group; - retval.MForwardProgressProperties[0].CoordinationScope = - prop.coordinationScope; - - // If we are here, the device supports the guarantee required but there - // is a caveat in that if the guarantee required is a concurrent - // guarantee, then we most likely also need to enable cooperative launch - // of the kernel. That is, although the device supports the required - // guarantee, some setup work is needed to truly make the device provide - // that guarantee at runtime. Otherwise, we will get the default - // guarantee which is weaker than concurrent. Same reasoning applies for - // sub_group but not for work_item. - // TODO: Further design work is probably needed to reflect this behavior - // in Unified Runtime. - if constexpr (prop.guarantee == forward_progress::concurrent) - retval.MIsCooperative = true; - } - if constexpr (PropertiesT::template has_property< - sub_group_progress_key>()) { - auto prop = Props.template get_property(); - retval.MForwardProgressProperties[1].Guarantee = prop.guarantee; - retval.MForwardProgressProperties[1].ExecScope = - execution_scope::sub_group; - retval.MForwardProgressProperties[1].CoordinationScope = - prop.coordinationScope; - - // Same reasoning as above for work_group applies here. - if constexpr (prop.guarantee == forward_progress::concurrent) - retval.MIsCooperative = true; - } - if constexpr (PropertiesT::template has_property< - work_item_progress_key>()) { - auto prop = Props.template get_property(); - retval.MForwardProgressProperties[2].Guarantee = prop.guarantee; - retval.MForwardProgressProperties[2].ExecScope = - execution_scope::work_item; - retval.MForwardProgressProperties[2].CoordinationScope = - prop.coordinationScope; +template +constexpr KernelLaunchPropertiesTy +processKernelLaunchProperties(PropertiesT Props) { + using namespace sycl::ext::oneapi::experimental; + using namespace sycl::ext::oneapi::experimental::detail; + KernelLaunchPropertiesTy retval; + + // Process Kernel cache configuration property. + { + if constexpr (PropertiesT::template has_property< + sycl::ext::intel::experimental::cache_config_key>()) { + auto Config = Props.template get_property< + sycl::ext::intel::experimental::cache_config_key>(); + if (Config == sycl::ext::intel::experimental::large_slm) { + retval.MCacheConfig = + KernelLaunchPropertiesTy::StableKernelCacheConfig::LargeSLM; + } else if (Config == sycl::ext::intel::experimental::large_data) { + retval.MCacheConfig = + KernelLaunchPropertiesTy::StableKernelCacheConfig::LargeData; } + } else { + std::ignore = Props; } + } - // Process work group scratch memory property. - { - if constexpr (PropertiesT::template has_property< - work_group_scratch_size>()) { - auto WorkGroupMemSize = - Props.template get_property(); - retval.MWorkGroupMemorySize = WorkGroupMemSize.size; - } - } + // Process Kernel cooperative property. + { + if constexpr (PropertiesT::template has_property()) + retval.MIsCooperative = true; + } - // Parse cluster properties. - { - constexpr std::size_t ClusterDim = getClusterDim(); - if constexpr (ClusterDim > 0) { - - auto ClusterSize = - Props.template get_property>() - .get_cluster_size(); - retval.MUsesClusterLaunch = true; - retval.MClusterDims = ClusterDim; - if (ClusterDim == 1) { - retval.MClusterSize[0] = ClusterSize[0]; - } else if (ClusterDim == 2) { - retval.MClusterSize[0] = ClusterSize[0]; - retval.MClusterSize[1] = ClusterSize[1]; - } else if (ClusterDim == 3) { - retval.MClusterSize[0] = ClusterSize[0]; - retval.MClusterSize[1] = ClusterSize[1]; - retval.MClusterSize[2] = ClusterSize[2]; - } else { - assert(ClusterDim <= 3 && - "Only 1D, 2D, and 3D cluster launch is supported."); - } - } + // Process device progress properties. + { + if constexpr (PropertiesT::template has_property< + work_group_progress_key>()) { + auto prop = Props.template get_property(); + retval.MForwardProgressProperties[0].Guarantee = prop.guarantee; + retval.MForwardProgressProperties[0].ExecScope = + execution_scope::work_group; + retval.MForwardProgressProperties[0].CoordinationScope = + prop.coordinationScope; } + if constexpr (PropertiesT::template has_property< + sub_group_progress_key>()) { + auto prop = Props.template get_property(); + retval.MForwardProgressProperties[1].Guarantee = prop.guarantee; + retval.MForwardProgressProperties[1].ExecScope = + execution_scope::sub_group; + retval.MForwardProgressProperties[1].CoordinationScope = + prop.coordinationScope; + } + if constexpr (PropertiesT::template has_property< + work_item_progress_key>()) { + auto prop = Props.template get_property(); + retval.MForwardProgressProperties[2].Guarantee = prop.guarantee; + retval.MForwardProgressProperties[2].ExecScope = + execution_scope::work_item; + retval.MForwardProgressProperties[2].CoordinationScope = + prop.coordinationScope; + } + } - return retval; + // Process work group scratch memory property. + { + if constexpr (PropertiesT::template has_property< + work_group_scratch_size>()) { + auto WorkGroupMemSize = + Props.template get_property(); + retval.MWorkGroupMemorySize = WorkGroupMemSize.size; + } } - /// Process kernel properties. - /// Note: it is important that this function *does not* depend on kernel - /// name or kernel type, because then it will be instantiated for every - /// kernel, even though body of those instantiated functions could be almost - /// the same, thus unnecessary increasing compilation time. - template < - bool IsESIMDKernel, - typename PropertiesT = ext::oneapi::experimental::empty_properties_t> - static KernelLaunchPropertiesT processKernelProperties(PropertiesT Props) { - static_assert( - ext::oneapi::experimental::is_property_list::value, - "Template type is not a property list."); - static_assert( - !PropertiesT::template has_property< - sycl::ext::intel::experimental::fp_control_key>() || - (PropertiesT::template has_property< - sycl::ext::intel::experimental::fp_control_key>() && - IsESIMDKernel), - "Floating point control property is supported for ESIMD kernels only."); - static_assert( - !PropertiesT::template has_property< - sycl::ext::oneapi::experimental::indirectly_callable_key>(), - "indirectly_callable property cannot be applied to SYCL kernels"); + // Parse cluster properties. + { + constexpr std::size_t ClusterDim = getClusterDim(); + if constexpr (ClusterDim > 0) { + static_assert(ClusterDim <= 3, + "Only 1D, 2D, and 3D cluster launch is supported."); + + auto ClusterSize = + Props.template get_property>() + .get_cluster_size(); - return processKernelLaunchProperties(Props); + retval.MUsesClusterLaunch = true; + retval.MClusterDims = ClusterDim; + + for (size_t dim = 0; dim < ClusterDim; dim++) + retval.MClusterSize[dim] = ClusterSize[dim]; + } } - // Returns KernelLaunchPropertiesT or std::nullopt based on whether the - // kernel functor has a get method that returns properties. - template - static std::optional - parseProperties([[maybe_unused]] const KernelType &KernelFunc) { + return retval; +} + +/// Note: it is important that this function *does not* depend on kernel +/// name or kernel type, because then it will be instantiated for every +/// kernel, even though body of those instantiated functions could be almost +/// the same, thus unnecessary increasing compilation time. +template +constexpr KernelLaunchPropertiesTy processKernelProperties(PropertiesT Props) { + static_assert(ext::oneapi::experimental::is_property_list::value, + "Template type is not a property list."); + static_assert( + !PropertiesT::template has_property< + sycl::ext::intel::experimental::fp_control_key>() || + (PropertiesT::template has_property< + sycl::ext::intel::experimental::fp_control_key>() && + IsESIMDKernel), + "Floating point control property is supported for ESIMD kernels only."); + static_assert( + !PropertiesT::template has_property< + sycl::ext::oneapi::experimental::indirectly_callable_key>(), + "indirectly_callable property cannot be applied to SYCL kernels"); + + return processKernelLaunchProperties(Props); +} + +// Returns KernelLaunchPropertiesTy or std::nullopt based on whether the +// kernel functor has a get method that returns properties. +template +constexpr std::optional +parseProperties([[maybe_unused]] const KernelType &KernelFunc) { #ifndef __SYCL_DEVICE_ONLY__ - // If there are properties provided by get method then process them. - if constexpr (ext::oneapi::experimental::detail:: - HasKernelPropertiesGetMethod::value) { + // If there are properties provided by get method then process them. + if constexpr (ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod< + const KernelType &>::value) { - return processKernelProperties( - KernelFunc.get(ext::oneapi::experimental::properties_tag{})); - } -#endif - // If there are no properties provided by get method then return empty - // optional. - return std::nullopt; + return processKernelProperties( + KernelFunc.get(ext::oneapi::experimental::properties_tag{})); } -}; // KernelLaunchPropertyWrapper struct +#endif + // If there are no properties provided by get method then return empty + // optional. + return std::nullopt; +} +} // namespace kernel_launch_properties_v1 } // namespace detail } // namespace _V1 diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 49aa3a252682e..9aaf817d99973 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -299,11 +299,11 @@ void nd_launch(queue Q, launch_config, Properties> Config, ext::oneapi::experimental::detail::LaunchConfigAccess, Properties> - ConfigAccess(Config); + LaunchConfigAccess(Config); - detail::submit_kernel_direct(std::move(Q), - ConfigAccess.getRange(), KernelObj, - Config.getProperties()); + detail::submit_kernel_direct( + std::move(Q), LaunchConfigAccess.getRange(), KernelObj, + LaunchConfigAccess.getProperties()); } else { submit(std::move(Q), [&](handler &CGH) { nd_launch(CGH, Config, KernelObj, diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 78d58c80d6b4f..30629f0c5589e 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -902,9 +902,8 @@ class __SYCL_EXPORT handler { /// Stores information about kernel properties into the handler. template void processLaunchProperties(PropertiesT Props) { - detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT ParsedProp = - detail::KernelLaunchPropertyWrapper::processKernelLaunchProperties< - PropertiesT>(Props); + detail::KernelLaunchPropertiesTy ParsedProp = + detail::processKernelLaunchProperties(Props); setKernelLaunchProperties(ParsedProp); } @@ -920,9 +919,8 @@ class __SYCL_EXPORT handler { bool IsESIMDKernel, typename PropertiesT = ext::oneapi::experimental::empty_properties_t> void processProperties(PropertiesT Props) { - detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT ParsedProp = - detail::KernelLaunchPropertyWrapper::processKernelProperties< - IsESIMDKernel>(Props); + detail::KernelLaunchPropertiesTy ParsedProp = + detail::processKernelProperties(Props); setKernelLaunchProperties(ParsedProp); } #endif // INTEL_PREVIEW_BREAKING_CHANGES @@ -1233,8 +1231,7 @@ class __SYCL_EXPORT handler { decltype(Wrapper), TransformedArgType, PropertiesT>::wrap(Wrapper); - if (auto prop = detail::KernelLaunchPropertyWrapper::parseProperties< - KName, Info.IsESIMD>(Wrapper)) { + if (auto prop = detail::parseProperties(Wrapper)) { setKernelLaunchProperties(*prop); } @@ -1261,16 +1258,14 @@ class __SYCL_EXPORT handler { // kernel is generated detail::KernelWrapper::wrap(KernelFunc); - if (auto prop = detail::KernelLaunchPropertyWrapper::parseProperties< - NameT, Info.IsESIMD>(KernelFunc)) { + if (auto prop = + detail::parseProperties(KernelFunc)) { setKernelLaunchProperties(*prop); } #ifndef __SYCL_DEVICE_ONLY__ verifyUsedKernelBundleInternal(Info.Name); - detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT - ProcessedProps = - detail::KernelLaunchPropertyWrapper::processKernelProperties< - Info.IsESIMD, PropertiesT>(Props); + detail::KernelLaunchPropertiesTy ProcessedProps = + detail::processKernelProperties(Props); setKernelLaunchProperties(ProcessedProps); detail::checkValueRange(UserRange); setNDRangeDescriptor(std::move(UserRange)); @@ -1300,9 +1295,8 @@ class __SYCL_EXPORT handler { setDeviceKernelInfo(std::move(Kernel)); detail::checkValueRange(NumWorkItems); setNDRangeDescriptor(std::move(NumWorkItems)); - detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT ParsedProp = - detail::KernelLaunchPropertyWrapper::processKernelLaunchProperties< - PropertiesT>(Props); + detail::KernelLaunchPropertiesTy ParsedProp = + detail::processKernelLaunchProperties(Props); setKernelLaunchProperties(ParsedProp); extractArgsAndReqs(); #endif @@ -1326,9 +1320,8 @@ class __SYCL_EXPORT handler { setDeviceKernelInfo(std::move(Kernel)); detail::checkValueRange(NDRange); setNDRangeDescriptor(std::move(NDRange)); - detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT ParsedProp = - detail::KernelLaunchPropertyWrapper::processKernelLaunchProperties< - PropertiesT>(Props); + detail::KernelLaunchPropertiesTy ParsedProp = + detail::processKernelLaunchProperties(Props); setKernelLaunchProperties(ParsedProp); extractArgsAndReqs(); #endif @@ -1349,8 +1342,7 @@ class __SYCL_EXPORT handler { constexpr auto Info = detail::CompileTimeKernelInfo; detail::KernelWrapper::wrap(KernelFunc); - if (auto prop = detail::KernelLaunchPropertyWrapper::parseProperties< - NameT, Info.IsESIMD>(KernelFunc)) { + if (auto prop = detail::parseProperties(KernelFunc)) { setKernelLaunchProperties(*prop); } #ifndef __SYCL_DEVICE_ONLY__ @@ -1369,10 +1361,8 @@ class __SYCL_EXPORT handler { } StoreLambda(std::move(KernelFunc)); - detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT - ProcessedProps = - detail::KernelLaunchPropertyWrapper::processKernelProperties< - Info.IsESIMD, PropertiesT>(Props); + detail::KernelLaunchPropertiesTy ProcessedProps = + detail::processKernelProperties(Props); setKernelLaunchProperties(ProcessedProps); #endif } @@ -1396,8 +1386,7 @@ class __SYCL_EXPORT handler { (void)Kernel; detail::KernelWrapper::wrap(KernelFunc); - if (auto prop = detail::KernelLaunchPropertyWrapper::parseProperties( - KernelFunc)) { + if (auto prop = detail::parseProperties(KernelFunc)) { setKernelLaunchProperties(*prop); } #ifndef __SYCL_DEVICE_ONLY__ @@ -1426,10 +1415,8 @@ class __SYCL_EXPORT handler { "the kernel name must match the name of the lambda"); } StoreLambda(std::move(KernelFunc)); - detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT - ProcessedProps = - detail::KernelLaunchPropertyWrapper::processKernelProperties< - Info.IsESIMD, PropertiesT>(Props); + detail::KernelLaunchPropertiesTy ProcessedProps = + detail::processKernelProperties(Props); setKernelLaunchProperties(ProcessedProps); #endif } @@ -3478,8 +3465,7 @@ class __SYCL_EXPORT handler { #endif void setKernelLaunchProperties( - detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT - &KernelLaunchProperties); + const detail::KernelLaunchPropertiesTy &KernelLaunchProperties); // Various checks that are only meaningful for host compilation, because they // result in runtime errors (i.e. exceptions being thrown). To save time diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 24edef878a3b1..ded855a70416f 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -68,7 +68,7 @@ event __SYCL_EXPORT submit_kernel_direct_with_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template @@ -76,7 +76,7 @@ void __SYCL_EXPORT submit_kernel_direct_without_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); namespace detail { @@ -200,22 +200,18 @@ auto submit_kernel_direct( // Assumption: If user specify properties via launch_config or explicitly // then we don't check for properties specified via get() method. - detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT parsedProps; + KernelLaunchPropertiesTy parsedProps; if constexpr (std::is_same_v) { // Use properties passed via. get() method. if constexpr (ext::oneapi::experimental::detail:: HasKernelPropertiesGetMethod::value) { auto prop = KernelFunc.get(ext::oneapi::experimental::properties_tag{}); - parsedProps = - detail::KernelLaunchPropertyWrapper::processKernelProperties( - prop); + parsedProps = detail::processKernelProperties(prop); } } else { // Use ExtraProps - parsedProps = - detail::KernelLaunchPropertyWrapper::processKernelProperties( - ExtraProps); + parsedProps = detail::processKernelProperties(ExtraProps); } // Instantiating the kernel on the host improves debugging. diff --git a/sycl/source/detail/kernel_data.hpp b/sycl/source/detail/kernel_data.hpp index d73229a909fa9..f78845e223057 100644 --- a/sycl/source/detail/kernel_data.hpp +++ b/sycl/source/detail/kernel_data.hpp @@ -168,49 +168,70 @@ class KernelData { } void validateAndSetKernelLaunchProperties( - const KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Kprop, - bool HasGraph, const device_impl &dev) { + const detail::KernelLaunchPropertiesTy &Kprop, bool HasGraph, + const device_impl &dev) { + using execScope = ext::oneapi::experimental::execution_scope; // Validate properties before setting. - { - if (HasGraph) { - if (Kprop.MWorkGroupMemorySize) { - throw sycl::exception( - sycl::make_error_code(errc::invalid), - "Setting work group scratch memory size is not yet supported " - "for use with the SYCL Graph extension."); - } + if (HasGraph) { + if (Kprop.MWorkGroupMemorySize) { + throw sycl::exception( + sycl::make_error_code(errc::invalid), + "Setting work group scratch memory size is not yet supported " + "for use with the SYCL Graph extension."); + } - if (Kprop.MUsesClusterLaunch && *Kprop.MUsesClusterLaunch) { - throw sycl::exception(sycl::make_error_code(errc::invalid), - "Cluster launch is not yet supported " - "for use with the SYCL Graph extension."); - } + if (Kprop.MUsesClusterLaunch && *Kprop.MUsesClusterLaunch) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "Cluster launch is not yet supported " + "for use with the SYCL Graph extension."); } + } - for (int i = 0; i < 3; i++) { - if (Kprop.MForwardProgressProperties[i].Guarantee.has_value()) { - - if (!dev.supportsForwardProgress( - *Kprop.MForwardProgressProperties[i].Guarantee, - *Kprop.MForwardProgressProperties[i].ExecScope, - *Kprop.MForwardProgressProperties[i].CoordinationScope)) { - // TODO: Make the error message more descriptive. - throw sycl::exception( - sycl::make_error_code(errc::feature_not_supported), - "The device associated with the queue does not support the " - "requested forward progress guarantee."); - } + // Validate and set forward progress guarantees. + for (int i = 0; i < 3; i++) { + if (Kprop.MForwardProgressProperties[i].Guarantee.has_value()) { + + if (!dev.supportsForwardProgress( + *Kprop.MForwardProgressProperties[i].Guarantee, + *Kprop.MForwardProgressProperties[i].ExecScope, + *Kprop.MForwardProgressProperties[i].CoordinationScope)) { + throw sycl::exception( + sycl::make_error_code(errc::feature_not_supported), + "The device associated with the queue does not support the " + "requested forward progress guarantee."); + } + + auto execScope = *Kprop.MForwardProgressProperties[i].ExecScope; + // If we are here, the device supports the guarantee required but there + // is a caveat in that if the guarantee required is a concurrent + // guarantee, then we most likely also need to enable cooperative launch + // of the kernel. That is, although the device supports the required + // guarantee, some setup work is needed to truly make the device provide + // that guarantee at runtime. Otherwise, we will get the default + // guarantee which is weaker than concurrent. Same reasoning applies for + // sub_group but not for work_item. + // TODO: Further design work is probably needed to reflect this behavior + // in Unified Runtime. + if ((execScope == execScope::work_group || + execScope == execScope::sub_group) && + (*Kprop.MForwardProgressProperties[i].Guarantee == + ext::oneapi::experimental::forward_progress_guarantee:: + concurrent)) { + setCooperative(true); } } } - // Set properties. if (Kprop.MIsCooperative) setCooperative(*Kprop.MIsCooperative); - if (Kprop.MCacheConfig) - setKernelCacheConfig(*Kprop.MCacheConfig); + if (Kprop.MCacheConfig) { + // KernelLaunchPropertiesTy::StableKernelCacheConfig is modeled after + // ur_kernel_cache_config_t, so this cast is safe. + setKernelCacheConfig( + static_cast(*Kprop.MCacheConfig)); + } if (Kprop.MWorkGroupMemorySize) setKernelWorkGroupMemorySize(*Kprop.MWorkGroupMemorySize); diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index c1e0452796fb3..3c970e2d57c53 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -567,7 +567,7 @@ EventImplPtr queue_impl::submit_command_to_graph( EventImplPtr queue_impl::submit_kernel_direct_impl( const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent, - const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { KernelData KData; diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 4528752ba95f4..f57297d9f3b35 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -363,7 +363,7 @@ class queue_impl : public std::enable_shared_from_this { event submit_kernel_direct_with_event( const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { detail::EventImplPtr EventImpl = submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, @@ -375,7 +375,7 @@ class queue_impl : public std::enable_shared_from_this { void submit_kernel_direct_without_event( const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, false, Props, CodeLoc, IsTopCodeLoc); @@ -931,7 +931,7 @@ class queue_impl : public std::enable_shared_from_this { EventImplPtr submit_kernel_direct_impl( const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent, - const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 619f8b121aa6d..07ec95f8ed39b 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -1914,7 +1914,7 @@ void handler::memcpyFromHostOnlyDeviceGlobal(void *Dest, } void handler::setKernelLaunchProperties( - detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Kprop) { + const detail::KernelLaunchPropertiesTy &Kprop) { impl->MKernelData.validateAndSetKernelLaunchProperties( Kprop, getCommandGraph() != nullptr /*hasGraph?*/, impl->get_device() /*device_impl*/); diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 73a53a01abda2..c97798b80bdba 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -476,7 +476,7 @@ event submit_kernel_direct_with_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { return getSyclObjImpl(Queue)->submit_kernel_direct_with_event( Range, HostKernel, DeviceKernelInfo, Props, CodeLoc, IsTopCodeLoc); @@ -486,21 +486,21 @@ template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<1>( const queue &Queue, const nd_range<1> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<2>( const queue &Queue, const nd_range<2> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<3>( const queue &Queue, const nd_range<3> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template @@ -508,7 +508,7 @@ void submit_kernel_direct_without_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { getSyclObjImpl(Queue)->submit_kernel_direct_without_event( Range, HostKernel, DeviceKernelInfo, Props, CodeLoc, IsTopCodeLoc); @@ -518,21 +518,21 @@ template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<1>( const queue &Queue, const nd_range<1> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<2>( const queue &Queue, const nd_range<2> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<3>( const queue &Queue, const nd_range<3> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertyWrapper::KernelLaunchPropertiesT &Props, + const detail::KernelLaunchPropertiesTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); } // namespace _V1 From 19d81cac3d557db2901b62d0ec96ec652f6eedb1 Mon Sep 17 00:00:00 2001 From: "Agarwal, Udit" Date: Mon, 27 Oct 2025 19:13:55 +0100 Subject: [PATCH 05/18] Refactor property parsing via template --- .../sycl/detail/kernel_launch_helper.hpp | 199 +++++++++--------- .../experimental/cluster_group_prop.hpp | 2 +- sycl/include/sycl/handler.hpp | 16 +- sycl/include/sycl/queue.hpp | 6 +- sycl/source/detail/kernel_data.hpp | 148 +++++++++---- sycl/source/detail/queue_impl.cpp | 2 +- sycl/source/detail/queue_impl.hpp | 6 +- sycl/source/handler.cpp | 2 +- sycl/source/queue.cpp | 16 +- 9 files changed, 224 insertions(+), 173 deletions(-) diff --git a/sycl/include/sycl/detail/kernel_launch_helper.hpp b/sycl/include/sycl/detail/kernel_launch_helper.hpp index 05b76bd0e1b5f..cb8aef61b4cc7 100644 --- a/sycl/include/sycl/detail/kernel_launch_helper.hpp +++ b/sycl/include/sycl/detail/kernel_launch_helper.hpp @@ -260,17 +260,49 @@ struct KernelWrapper< // properties. inline namespace kernel_launch_properties_v1 { -// This struct is used to store kernel launch properties. -// std::optional is used to indicate that the property is not set. -// This struct is used to pass kernel launch properties across the ABI -// boundary. -struct KernelLaunchPropertiesTy { - // Modeled after ur_kernel_cache_config_t - enum class StableKernelCacheConfig : int32_t { - Default = 0, - LargeSLM = 1, - LargeData = 2 - }; +template struct MarshalledProperty; + +// Generic implementation for runtime properties. +template +struct MarshalledProperty>> { + std::optional property; + + template + MarshalledProperty([[maybe_unused]] const InputPropertyTy &InputProperties) { + if constexpr (ext::oneapi::experimental::is_property_list_v< + InputPropertyTy>) + if constexpr (InputPropertyTy::template has_property()) { + std::cout << "Got property: " << typeid(PropertyTy).name() << "\n"; + property = InputProperties.template get_property(); + } + } + + MarshalledProperty() = default; +}; + +// Specialization for use_root_sync_key property. +template <> +struct MarshalledProperty { + + bool isRootSyncPropPresent = false; + + template + MarshalledProperty([[maybe_unused]] const InputPropertyTy &Props) { + using namespace sycl::ext::oneapi::experimental; + if constexpr (ext::oneapi::experimental::is_property_list_v< + InputPropertyTy>) + if constexpr (InputPropertyTy::template has_property()) + isRootSyncPropPresent = true; + } + + MarshalledProperty() = default; +}; + +// Specialization for work group progress property. +template <> +struct MarshalledProperty< + sycl::ext::oneapi::experimental::work_group_progress_key> { struct ScopeForwardProgressProperty { std::optional @@ -280,110 +312,69 @@ struct KernelLaunchPropertiesTy { CoordinationScope; }; - std::optional MCacheConfig = std::nullopt; - std::optional MIsCooperative = std::nullopt; - std::optional MWorkGroupMemorySize = std::nullopt; - std::optional MUsesClusterLaunch = std::nullopt; - size_t MClusterDims = 0; - std::array MClusterSize = {0, 0, 0}; - // Forward progress guarantee properties for work_item, sub_group and // work_group scopes. We need to store them for validation later. std::array MForwardProgressProperties; -}; -template -constexpr KernelLaunchPropertiesTy -processKernelLaunchProperties(PropertiesT Props) { - using namespace sycl::ext::oneapi::experimental; - using namespace sycl::ext::oneapi::experimental::detail; - KernelLaunchPropertiesTy retval; - - // Process Kernel cache configuration property. - { - if constexpr (PropertiesT::template has_property< - sycl::ext::intel::experimental::cache_config_key>()) { - auto Config = Props.template get_property< - sycl::ext::intel::experimental::cache_config_key>(); - if (Config == sycl::ext::intel::experimental::large_slm) { - retval.MCacheConfig = - KernelLaunchPropertiesTy::StableKernelCacheConfig::LargeSLM; - } else if (Config == sycl::ext::intel::experimental::large_data) { - retval.MCacheConfig = - KernelLaunchPropertiesTy::StableKernelCacheConfig::LargeData; + template >> + MarshalledProperty([[maybe_unused]] const InputPropertyTy &Props) { + using namespace sycl::ext::oneapi::experimental; + + if constexpr (ext::oneapi::experimental::is_property_list_v< + InputPropertyTy>) { + if constexpr (InputPropertyTy::template has_property< + work_group_progress_key>()) { + auto prop = Props.template get_property(); + MForwardProgressProperties[0].Guarantee = prop.guarantee; + MForwardProgressProperties[0].ExecScope = execution_scope::work_group; + MForwardProgressProperties[0].CoordinationScope = + prop.coordinationScope; + } + if constexpr (InputPropertyTy::template has_property< + sub_group_progress_key>()) { + auto prop = Props.template get_property(); + MForwardProgressProperties[1].Guarantee = prop.guarantee; + MForwardProgressProperties[1].ExecScope = execution_scope::sub_group; + MForwardProgressProperties[1].CoordinationScope = + prop.coordinationScope; + } + if constexpr (InputPropertyTy::template has_property< + work_item_progress_key>()) { + auto prop = Props.template get_property(); + MForwardProgressProperties[2].Guarantee = prop.guarantee; + MForwardProgressProperties[2].ExecScope = execution_scope::work_item; + MForwardProgressProperties[2].CoordinationScope = + prop.coordinationScope; } - } else { - std::ignore = Props; - } - } - - // Process Kernel cooperative property. - { - if constexpr (PropertiesT::template has_property()) - retval.MIsCooperative = true; - } - - // Process device progress properties. - { - if constexpr (PropertiesT::template has_property< - work_group_progress_key>()) { - auto prop = Props.template get_property(); - retval.MForwardProgressProperties[0].Guarantee = prop.guarantee; - retval.MForwardProgressProperties[0].ExecScope = - execution_scope::work_group; - retval.MForwardProgressProperties[0].CoordinationScope = - prop.coordinationScope; - } - if constexpr (PropertiesT::template has_property< - sub_group_progress_key>()) { - auto prop = Props.template get_property(); - retval.MForwardProgressProperties[1].Guarantee = prop.guarantee; - retval.MForwardProgressProperties[1].ExecScope = - execution_scope::sub_group; - retval.MForwardProgressProperties[1].CoordinationScope = - prop.coordinationScope; - } - if constexpr (PropertiesT::template has_property< - work_item_progress_key>()) { - auto prop = Props.template get_property(); - retval.MForwardProgressProperties[2].Guarantee = prop.guarantee; - retval.MForwardProgressProperties[2].ExecScope = - execution_scope::work_item; - retval.MForwardProgressProperties[2].CoordinationScope = - prop.coordinationScope; } } - // Process work group scratch memory property. - { - if constexpr (PropertiesT::template has_property< - work_group_scratch_size>()) { - auto WorkGroupMemSize = - Props.template get_property(); - retval.MWorkGroupMemorySize = WorkGroupMemSize.size; - } - } + MarshalledProperty() = default; +}; - // Parse cluster properties. - { - constexpr std::size_t ClusterDim = getClusterDim(); - if constexpr (ClusterDim > 0) { - static_assert(ClusterDim <= 3, - "Only 1D, 2D, and 3D cluster launch is supported."); +template struct PropsHolder : MarshalledProperty... { - auto ClusterSize = - Props.template get_property>() - .get_cluster_size(); + template + PropsHolder(PropertiesT Props) : MarshalledProperty(Props)... {} - retval.MUsesClusterLaunch = true; - retval.MClusterDims = ClusterDim; + PropsHolder() = default; +}; - for (size_t dim = 0; dim < ClusterDim; dim++) - retval.MClusterSize[dim] = ClusterSize[dim]; - } - } +using KernelPropertyHolderStructTy = + PropsHolder, + sycl::ext::oneapi::experimental::cuda::cluster_size_key<2>, + sycl::ext::oneapi::experimental::cuda::cluster_size_key<3>>; - return retval; +template +constexpr auto processKernelLaunchProperties(PropertiesT Props) { + KernelPropertyHolderStructTy prop(Props); + return prop; } /// Note: it is important that this function *does not* depend on kernel @@ -392,7 +383,7 @@ processKernelLaunchProperties(PropertiesT Props) { /// the same, thus unnecessary increasing compilation time. template -constexpr KernelLaunchPropertiesTy processKernelProperties(PropertiesT Props) { +constexpr auto processKernelProperties(PropertiesT Props) { static_assert(ext::oneapi::experimental::is_property_list::value, "Template type is not a property list."); static_assert( @@ -413,7 +404,7 @@ constexpr KernelLaunchPropertiesTy processKernelProperties(PropertiesT Props) { // Returns KernelLaunchPropertiesTy or std::nullopt based on whether the // kernel functor has a get method that returns properties. template -constexpr std::optional +constexpr std::optional parseProperties([[maybe_unused]] const KernelType &KernelFunc) { #ifndef __SYCL_DEVICE_ONLY__ // If there are properties provided by get method then process them. diff --git a/sycl/include/sycl/ext/oneapi/experimental/cluster_group_prop.hpp b/sycl/include/sycl/ext/oneapi/experimental/cluster_group_prop.hpp index e7eae55636622..9e0d84afb660f 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/cluster_group_prop.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/cluster_group_prop.hpp @@ -22,7 +22,7 @@ struct cluster_size cluster_size, ::sycl::ext::oneapi::experimental::detail::ClusterLaunch> { cluster_size(const range &size) : size(size) {} - sycl::range get_cluster_size() { return size; } + sycl::range get_cluster_size() const { return size; } private: range size; diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 30629f0c5589e..ca9a9a8dc301b 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -902,7 +902,7 @@ class __SYCL_EXPORT handler { /// Stores information about kernel properties into the handler. template void processLaunchProperties(PropertiesT Props) { - detail::KernelLaunchPropertiesTy ParsedProp = + detail::KernelPropertyHolderStructTy ParsedProp = detail::processKernelLaunchProperties(Props); setKernelLaunchProperties(ParsedProp); } @@ -919,7 +919,7 @@ class __SYCL_EXPORT handler { bool IsESIMDKernel, typename PropertiesT = ext::oneapi::experimental::empty_properties_t> void processProperties(PropertiesT Props) { - detail::KernelLaunchPropertiesTy ParsedProp = + detail::KernelPropertyHolderStructTy ParsedProp = detail::processKernelProperties(Props); setKernelLaunchProperties(ParsedProp); } @@ -1264,7 +1264,7 @@ class __SYCL_EXPORT handler { } #ifndef __SYCL_DEVICE_ONLY__ verifyUsedKernelBundleInternal(Info.Name); - detail::KernelLaunchPropertiesTy ProcessedProps = + detail::KernelPropertyHolderStructTy ProcessedProps = detail::processKernelProperties(Props); setKernelLaunchProperties(ProcessedProps); detail::checkValueRange(UserRange); @@ -1295,7 +1295,7 @@ class __SYCL_EXPORT handler { setDeviceKernelInfo(std::move(Kernel)); detail::checkValueRange(NumWorkItems); setNDRangeDescriptor(std::move(NumWorkItems)); - detail::KernelLaunchPropertiesTy ParsedProp = + detail::KernelPropertyHolderStructTy ParsedProp = detail::processKernelLaunchProperties(Props); setKernelLaunchProperties(ParsedProp); extractArgsAndReqs(); @@ -1320,7 +1320,7 @@ class __SYCL_EXPORT handler { setDeviceKernelInfo(std::move(Kernel)); detail::checkValueRange(NDRange); setNDRangeDescriptor(std::move(NDRange)); - detail::KernelLaunchPropertiesTy ParsedProp = + detail::KernelPropertyHolderStructTy ParsedProp = detail::processKernelLaunchProperties(Props); setKernelLaunchProperties(ParsedProp); extractArgsAndReqs(); @@ -1361,7 +1361,7 @@ class __SYCL_EXPORT handler { } StoreLambda(std::move(KernelFunc)); - detail::KernelLaunchPropertiesTy ProcessedProps = + detail::KernelPropertyHolderStructTy ProcessedProps = detail::processKernelProperties(Props); setKernelLaunchProperties(ProcessedProps); #endif @@ -1415,7 +1415,7 @@ class __SYCL_EXPORT handler { "the kernel name must match the name of the lambda"); } StoreLambda(std::move(KernelFunc)); - detail::KernelLaunchPropertiesTy ProcessedProps = + detail::KernelPropertyHolderStructTy ProcessedProps = detail::processKernelProperties(Props); setKernelLaunchProperties(ProcessedProps); #endif @@ -3465,7 +3465,7 @@ class __SYCL_EXPORT handler { #endif void setKernelLaunchProperties( - const detail::KernelLaunchPropertiesTy &KernelLaunchProperties); + const detail::KernelPropertyHolderStructTy &KernelLaunchProperties); // Various checks that are only meaningful for host compilation, because they // result in runtime errors (i.e. exceptions being thrown). To save time diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index ded855a70416f..36d504d42f4d3 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -68,7 +68,7 @@ event __SYCL_EXPORT submit_kernel_direct_with_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertiesTy &Props, + const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template @@ -76,7 +76,7 @@ void __SYCL_EXPORT submit_kernel_direct_without_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertiesTy &Props, + const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); namespace detail { @@ -200,7 +200,7 @@ auto submit_kernel_direct( // Assumption: If user specify properties via launch_config or explicitly // then we don't check for properties specified via get() method. - KernelLaunchPropertiesTy parsedProps; + KernelPropertyHolderStructTy parsedProps; if constexpr (std::is_same_v) { // Use properties passed via. get() method. diff --git a/sycl/source/detail/kernel_data.hpp b/sycl/source/detail/kernel_data.hpp index f78845e223057..1866b53c94f86 100644 --- a/sycl/source/detail/kernel_data.hpp +++ b/sycl/source/detail/kernel_data.hpp @@ -29,6 +29,11 @@ class KernelData { using DynamicParametersVecT = std::vector>; using ArgsVecT = std::vector; + template + using MarshalledProperty = detail::MarshalledProperty; + template + using cluster_size_key = + sycl::ext::oneapi::experimental::cuda::cluster_size_key; KernelData() = default; ~KernelData() = default; @@ -167,21 +172,84 @@ class KernelData { MKernelWorkGroupMemorySize = Size; } + void parseAndSetCacheConfigProperty( + const sycl::ext::intel::experimental::cache_config_key &prop) { + using namespace sycl::ext::intel::experimental; + + ur_kernel_cache_config_t CacheConfig = + ur_kernel_cache_config_t::UR_KERNEL_CACHE_CONFIG_DEFAULT; + if (prop == large_slm) { + CacheConfig = ur_kernel_cache_config_t::UR_KERNEL_CACHE_CONFIG_LARGE_SLM; + } else if (prop == large_data) { + CacheConfig = ur_kernel_cache_config_t::UR_KERNEL_CACHE_CONFIG_LARGE_DATA; + } else + assert(false && "unknown cache property type"); + + MKernelCacheConfig = CacheConfig; + } + + template + void parseAndSetClusterDimProperty( + const std::optional> &prop) { + if (prop) { + static_assert(ClusterDims < 4 && ClusterDims > 0, + "Invalid cluster dimensions"); + + auto ClusterSize = prop->get_cluster_size(); + MKernelUsesClusterLaunch = true; + + if constexpr (ClusterDims == 1) + MNDRDesc.setClusterDimensions(sycl::range<1>{ClusterSize[0]}); + else if constexpr (ClusterDims == 2) + MNDRDesc.setClusterDimensions( + sycl::range<2>{ClusterSize[0], ClusterSize[1]}); + else if constexpr (ClusterDims == 3) + MNDRDesc.setClusterDimensions( + sycl::range<3>{ClusterSize[0], ClusterSize[1], ClusterSize[2]}); + } + } + void validateAndSetKernelLaunchProperties( - const detail::KernelLaunchPropertiesTy &Kprop, bool HasGraph, + const detail::KernelPropertyHolderStructTy Kprop, bool HasGraph, const device_impl &dev) { using execScope = ext::oneapi::experimental::execution_scope; - - // Validate properties before setting. + using namespace sycl::ext::oneapi::experimental; + using namespace sycl::ext::oneapi::experimental::detail; + + const auto *WorkGroupMemSizeProp = + static_cast *>( + &Kprop); + const auto *CacheConfigProp = static_cast *>(&Kprop); + const auto *UseRootSyncProp = + static_cast *>(&Kprop); + const auto *ForwardProgressProp = + static_cast *>( + &Kprop); + const auto *ClusterLaunchPropDim1 = + static_cast> *>( + &Kprop); + const auto *ClusterLaunchPropDim2 = + static_cast> *>( + &Kprop); + const auto *ClusterLaunchPropDim3 = + static_cast> *>( + &Kprop); + + const bool isClusterDimPropPresent = ClusterLaunchPropDim1->property || + ClusterLaunchPropDim2->property || + ClusterLaunchPropDim3->property; + + // Early validation for graph-incompatible properties if (HasGraph) { - if (Kprop.MWorkGroupMemorySize) { + if (WorkGroupMemSizeProp->property) { throw sycl::exception( sycl::make_error_code(errc::invalid), "Setting work group scratch memory size is not yet supported " "for use with the SYCL Graph extension."); } - if (Kprop.MUsesClusterLaunch && *Kprop.MUsesClusterLaunch) { + if (isClusterDimPropPresent) { throw sycl::exception(sycl::make_error_code(errc::invalid), "Cluster launch is not yet supported " "for use with the SYCL Graph extension."); @@ -190,62 +258,54 @@ class KernelData { // Validate and set forward progress guarantees. for (int i = 0; i < 3; i++) { - if (Kprop.MForwardProgressProperties[i].Guarantee.has_value()) { + if (ForwardProgressProp->MForwardProgressProperties[i] + .Guarantee.has_value()) { if (!dev.supportsForwardProgress( - *Kprop.MForwardProgressProperties[i].Guarantee, - *Kprop.MForwardProgressProperties[i].ExecScope, - *Kprop.MForwardProgressProperties[i].CoordinationScope)) { + *ForwardProgressProp->MForwardProgressProperties[i].Guarantee, + *ForwardProgressProp->MForwardProgressProperties[i].ExecScope, + *ForwardProgressProp->MForwardProgressProperties[i] + .CoordinationScope)) { throw sycl::exception( sycl::make_error_code(errc::feature_not_supported), "The device associated with the queue does not support the " "requested forward progress guarantee."); } - auto execScope = *Kprop.MForwardProgressProperties[i].ExecScope; - // If we are here, the device supports the guarantee required but there - // is a caveat in that if the guarantee required is a concurrent - // guarantee, then we most likely also need to enable cooperative launch - // of the kernel. That is, although the device supports the required - // guarantee, some setup work is needed to truly make the device provide - // that guarantee at runtime. Otherwise, we will get the default - // guarantee which is weaker than concurrent. Same reasoning applies for - // sub_group but not for work_item. - // TODO: Further design work is probably needed to reflect this behavior - // in Unified Runtime. + auto execScope = + *ForwardProgressProp->MForwardProgressProperties[i].ExecScope; + // If we are here, the device supports the guarantee required but + // there is a caveat in that if the guarantee required is a concurrent + // guarantee, then we most likely also need to enable cooperative + // launch of the kernel. That is, although the device supports the + // required guarantee, some setup work is needed to truly make the + // device provide that guarantee at runtime. Otherwise, we will get + // the default guarantee which is weaker than concurrent. Same + // reasoning applies for sub_group but not for work_item. + // TODO: Further design work is probably needed to reflect this + // behavior in Unified Runtime. if ((execScope == execScope::work_group || execScope == execScope::sub_group) && - (*Kprop.MForwardProgressProperties[i].Guarantee == - ext::oneapi::experimental::forward_progress_guarantee:: - concurrent)) { + (*ForwardProgressProp->MForwardProgressProperties[i].Guarantee == + forward_progress_guarantee::concurrent)) { setCooperative(true); } } } - if (Kprop.MIsCooperative) - setCooperative(*Kprop.MIsCooperative); + if (UseRootSyncProp->isRootSyncPropPresent) + setCooperative(true); - if (Kprop.MCacheConfig) { - // KernelLaunchPropertiesTy::StableKernelCacheConfig is modeled after - // ur_kernel_cache_config_t, so this cast is safe. - setKernelCacheConfig( - static_cast(*Kprop.MCacheConfig)); - } + if (CacheConfigProp->property) + parseAndSetCacheConfigProperty(*(CacheConfigProp->property)); + + if (WorkGroupMemSizeProp->property) + setKernelWorkGroupMemorySize((*WorkGroupMemSizeProp->property).size); - if (Kprop.MWorkGroupMemorySize) - setKernelWorkGroupMemorySize(*Kprop.MWorkGroupMemorySize); - - if (Kprop.MUsesClusterLaunch && *Kprop.MUsesClusterLaunch) { - if (Kprop.MClusterDims == 1) - setClusterDimensions(sycl::range<1>{Kprop.MClusterSize[0]}); - else if (Kprop.MClusterDims == 2) - setClusterDimensions( - sycl::range<2>{Kprop.MClusterSize[0], Kprop.MClusterSize[1]}); - else if (Kprop.MClusterDims == 3) - setClusterDimensions(sycl::range<3>{Kprop.MClusterSize[0], - Kprop.MClusterSize[1], - Kprop.MClusterSize[2]}); + if (isClusterDimPropPresent) { + parseAndSetClusterDimProperty(ClusterLaunchPropDim1->property); + parseAndSetClusterDimProperty(ClusterLaunchPropDim2->property); + parseAndSetClusterDimProperty(ClusterLaunchPropDim3->property); } } diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 3c970e2d57c53..d62366e140837 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -567,7 +567,7 @@ EventImplPtr queue_impl::submit_command_to_graph( EventImplPtr queue_impl::submit_kernel_direct_impl( const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent, - const detail::KernelLaunchPropertiesTy &Props, + const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { KernelData KData; diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index f57297d9f3b35..031b0a01f56bc 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -363,7 +363,7 @@ class queue_impl : public std::enable_shared_from_this { event submit_kernel_direct_with_event( const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertiesTy &Props, + const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { detail::EventImplPtr EventImpl = submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, @@ -375,7 +375,7 @@ class queue_impl : public std::enable_shared_from_this { void submit_kernel_direct_without_event( const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertiesTy &Props, + const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, false, Props, CodeLoc, IsTopCodeLoc); @@ -931,7 +931,7 @@ class queue_impl : public std::enable_shared_from_this { EventImplPtr submit_kernel_direct_impl( const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent, - const detail::KernelLaunchPropertiesTy &Props, + const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 07ec95f8ed39b..e9f139126c4c6 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -1914,7 +1914,7 @@ void handler::memcpyFromHostOnlyDeviceGlobal(void *Dest, } void handler::setKernelLaunchProperties( - const detail::KernelLaunchPropertiesTy &Kprop) { + const detail::KernelPropertyHolderStructTy &Kprop) { impl->MKernelData.validateAndSetKernelLaunchProperties( Kprop, getCommandGraph() != nullptr /*hasGraph?*/, impl->get_device() /*device_impl*/); diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index c97798b80bdba..7fe5649aecc2a 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -476,7 +476,7 @@ event submit_kernel_direct_with_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertiesTy &Props, + const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { return getSyclObjImpl(Queue)->submit_kernel_direct_with_event( Range, HostKernel, DeviceKernelInfo, Props, CodeLoc, IsTopCodeLoc); @@ -486,21 +486,21 @@ template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<1>( const queue &Queue, const nd_range<1> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertiesTy &Props, + const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<2>( const queue &Queue, const nd_range<2> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertiesTy &Props, + const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<3>( const queue &Queue, const nd_range<3> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertiesTy &Props, + const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template @@ -508,7 +508,7 @@ void submit_kernel_direct_without_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertiesTy &Props, + const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { getSyclObjImpl(Queue)->submit_kernel_direct_without_event( Range, HostKernel, DeviceKernelInfo, Props, CodeLoc, IsTopCodeLoc); @@ -518,21 +518,21 @@ template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<1>( const queue &Queue, const nd_range<1> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertiesTy &Props, + const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<2>( const queue &Queue, const nd_range<2> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertiesTy &Props, + const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<3>( const queue &Queue, const nd_range<3> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelLaunchPropertiesTy &Props, + const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); } // namespace _V1 From c29b6c7fd6d1c08feeacc586f86224e8f9f7012b Mon Sep 17 00:00:00 2001 From: "Agarwal, Udit" Date: Mon, 27 Oct 2025 19:31:58 +0100 Subject: [PATCH 06/18] Update ABI and include_deps test --- sycl/test/abi/sycl_symbols_linux.dump | 13 ++++---- .../sycl_khr_includes_context.hpp.cpp | 2 +- .../sycl_khr_includes_functional.hpp.cpp | 1 - .../sycl_khr_includes_handler.hpp.cpp | 30 +++++++++---------- .../sycl_khr_includes_kernel_bundle.hpp.cpp | 30 +++++++++---------- .../sycl_khr_includes_math.hpp.cpp | 2 +- .../sycl_khr_includes_queue.hpp.cpp | 4 +-- .../sycl_khr_includes_reduction.hpp.cpp | 30 +++++++++---------- .../sycl_khr_includes_stream.hpp.cpp | 30 +++++++++---------- .../sycl_khr_includes_usm.hpp.cpp | 6 ++-- .../sycl_khr_includes_vec.hpp.cpp | 2 +- 11 files changed, 75 insertions(+), 75 deletions(-) diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 032b82ae74293..83292f20db599 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -2985,12 +2985,12 @@ _ZN4sycl3_V121__isgreaterequal_implEdd _ZN4sycl3_V121__isgreaterequal_implEff _ZN4sycl3_V122accelerator_selector_vERKNS0_6deviceE _ZN4sycl3_V128verifyUSMAllocatorPropertiesERKNS0_13property_listE -_ZN4sycl3_V136submit_kernel_direct_with_event_implILi1EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoERKNSA_13code_locationEb -_ZN4sycl3_V136submit_kernel_direct_with_event_implILi2EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoERKNSA_13code_locationEb -_ZN4sycl3_V136submit_kernel_direct_with_event_implILi3EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoERKNSA_13code_locationEb -_ZN4sycl3_V139submit_kernel_direct_without_event_implILi1EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoERKNS9_13code_locationEb -_ZN4sycl3_V139submit_kernel_direct_without_event_implILi2EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoERKNS9_13code_locationEb -_ZN4sycl3_V139submit_kernel_direct_without_event_implILi3EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoERKNS9_13code_locationEb +_ZN4sycl3_V136submit_kernel_direct_with_event_implILi1EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoERKNSA_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSH_5intel12experimental12cache_configENSJ_17use_root_sync_keyENSJ_23work_group_progress_keyENSJ_4cuda12cluster_sizeILi1EEENSR_ILi2EEENSR_ILi3EEEEEERKNSA_13code_locationEb +_ZN4sycl3_V136submit_kernel_direct_with_event_implILi2EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoERKNSA_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSH_5intel12experimental12cache_configENSJ_17use_root_sync_keyENSJ_23work_group_progress_keyENSJ_4cuda12cluster_sizeILi1EEENSR_ILi2EEENSR_ILi3EEEEEERKNSA_13code_locationEb +_ZN4sycl3_V136submit_kernel_direct_with_event_implILi3EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoERKNSA_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSH_5intel12experimental12cache_configENSJ_17use_root_sync_keyENSJ_23work_group_progress_keyENSJ_4cuda12cluster_sizeILi1EEENSR_ILi2EEENSR_ILi3EEEEEERKNSA_13code_locationEb +_ZN4sycl3_V139submit_kernel_direct_without_event_implILi1EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoERKNS9_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSG_5intel12experimental12cache_configENSI_17use_root_sync_keyENSI_23work_group_progress_keyENSI_4cuda12cluster_sizeILi1EEENSQ_ILi2EEENSQ_ILi3EEEEEERKNS9_13code_locationEb +_ZN4sycl3_V139submit_kernel_direct_without_event_implILi2EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoERKNS9_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSG_5intel12experimental12cache_configENSI_17use_root_sync_keyENSI_23work_group_progress_keyENSI_4cuda12cluster_sizeILi1EEENSQ_ILi2EEENSQ_ILi3EEEEEERKNS9_13code_locationEb +_ZN4sycl3_V139submit_kernel_direct_without_event_implILi3EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoERKNS9_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSG_5intel12experimental12cache_configENSI_17use_root_sync_keyENSI_23work_group_progress_keyENSI_4cuda12cluster_sizeILi1EEENSQ_ILi2EEENSQ_ILi3EEEEEERKNS9_13code_locationEb _ZN4sycl3_V13ext5intel12experimental9pipe_base13get_pipe_nameB5cxx11EPKv _ZN4sycl3_V13ext5intel12experimental9pipe_base17wait_non_blockingERKNS0_5eventE _ZN4sycl3_V13ext5intel12experimental9pipe_base18get_pipe_name_implEPKv @@ -3618,6 +3618,7 @@ _ZN4sycl3_V17handler24ext_oneapi_memset2d_implEPvmimm _ZN4sycl3_V17handler24registerDynamicParameterEPNS0_3ext6oneapi12experimental6detail22dynamic_parameter_implEi _ZN4sycl3_V17handler24registerDynamicParameterERNS0_3ext6oneapi12experimental6detail22dynamic_parameter_baseEi _ZN4sycl3_V17handler25ext_intel_write_host_pipeENS0_6detail11string_viewEPvmb +_ZN4sycl3_V17handler25setKernelLaunchPropertiesERKNS0_6detail27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENS5_5intel12experimental12cache_configENS7_17use_root_sync_keyENS7_23work_group_progress_keyENS7_4cuda12cluster_sizeILi1EEENSF_ILi2EEENSF_ILi3EEEEEE _ZN4sycl3_V17handler26associateWithHandlerCommonESt10shared_ptrINS0_6detail16AccessorImplHostEEi _ZN4sycl3_V17handler26setKernelNameBasedCachePtrEPNS0_6detail21KernelNameBasedCacheTE _ZN4sycl3_V17handler26setNDRangeDescriptorPaddedENS0_5rangeILi3EEENS0_2idILi3EEEi diff --git a/sycl/test/include_deps/sycl_khr_includes_context.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_context.hpp.cpp index ddaf62e33ae82..3608d2902d692 100644 --- a/sycl/test/include_deps/sycl_khr_includes_context.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_context.hpp.cpp @@ -10,8 +10,8 @@ // CHECK-NEXT: feature_test.hpp // CHECK-NEXT: context.hpp // CHECK-NEXT: async_handler.hpp -// CHECK-NEXT: stl_wrappers/cstdlib // CHECK-NEXT: backend_types.hpp +// CHECK-NEXT: stl_wrappers/cstdlib // CHECK-NEXT: detail/export.hpp // CHECK-NEXT: detail/info_desc_helpers.hpp // CHECK-NEXT: aspects.hpp diff --git a/sycl/test/include_deps/sycl_khr_includes_functional.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_functional.hpp.cpp index 692e49c927e57..8b19443974f66 100644 --- a/sycl/test/include_deps/sycl_khr_includes_functional.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_functional.hpp.cpp @@ -9,5 +9,4 @@ // CHECK-NEXT: detail/defines_elementary.hpp // CHECK-NEXT: feature_test.hpp // CHECK-NEXT: functional.hpp -// CHECK-NEXT: stl_wrappers/cstdlib // CHECK-EMPTY: diff --git a/sycl/test/include_deps/sycl_khr_includes_handler.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_handler.hpp.cpp index 9d5f05f24d95d..cb34a26c6ef34 100644 --- a/sycl/test/include_deps/sycl_khr_includes_handler.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_handler.hpp.cpp @@ -116,10 +116,23 @@ // CHECK-NEXT: ext/oneapi/properties/property.hpp // CHECK-NEXT: ext/oneapi/properties/property_value.hpp // CHECK-NEXT: ext/intel/experimental/kernel_execution_properties.hpp -// CHECK-NEXT: ext/oneapi/experimental/virtual_functions.hpp +// CHECK-NEXT: ext/oneapi/experimental/cluster_group_prop.hpp +// CHECK-NEXT: ext/oneapi/properties/properties.hpp // CHECK-NEXT: ext/oneapi/properties/property_utils.hpp +// CHECK-NEXT: ext/oneapi/experimental/graph.hpp +// CHECK-NEXT: ext/oneapi/experimental/graph/command_graph.hpp +// CHECK-NEXT: ext/oneapi/experimental/graph/common.hpp +// CHECK-NEXT: ext/oneapi/experimental/graph/executable_graph.hpp +// CHECK-NEXT: ext/oneapi/experimental/graph/node.hpp +// CHECK-NEXT: ext/oneapi/experimental/detail/properties/graph_properties.hpp +// CHECK-NEXT: ext/oneapi/experimental/detail/properties/graph_properties.def +// CHECK-NEXT: ext/oneapi/experimental/detail/properties/node_properties.def +// CHECK-NEXT: ext/oneapi/experimental/graph/modifiable_graph.hpp +// CHECK-NEXT: ext/oneapi/experimental/graph/dynamic.hpp +// CHECK-NEXT: ext/oneapi/experimental/work_group_memory.hpp +// CHECK-NEXT: ext/oneapi/experimental/use_root_sync_prop.hpp +// CHECK-NEXT: ext/oneapi/experimental/virtual_functions.hpp // CHECK-NEXT: ext/oneapi/kernel_properties/properties.hpp -// CHECK-NEXT: ext/oneapi/properties/properties.hpp // CHECK-NEXT: ext/oneapi/work_group_scratch_memory.hpp // CHECK-NEXT: detail/sycl_local_mem_builtins.hpp // CHECK-NEXT: detail/kernel_name_str_t.hpp @@ -136,20 +149,7 @@ // CHECK-NEXT: ext/oneapi/bindless_images_mem_handle.hpp // CHECK-NEXT: ext/oneapi/device_global/device_global.hpp // CHECK-NEXT: ext/oneapi/device_global/properties.hpp -// CHECK-NEXT: ext/oneapi/experimental/cluster_group_prop.hpp -// CHECK-NEXT: ext/oneapi/experimental/graph.hpp -// CHECK-NEXT: ext/oneapi/experimental/graph/command_graph.hpp -// CHECK-NEXT: ext/oneapi/experimental/graph/common.hpp -// CHECK-NEXT: ext/oneapi/experimental/graph/executable_graph.hpp -// CHECK-NEXT: ext/oneapi/experimental/graph/node.hpp -// CHECK-NEXT: ext/oneapi/experimental/detail/properties/graph_properties.hpp -// CHECK-NEXT: ext/oneapi/experimental/detail/properties/graph_properties.def -// CHECK-NEXT: ext/oneapi/experimental/detail/properties/node_properties.def -// CHECK-NEXT: ext/oneapi/experimental/graph/modifiable_graph.hpp -// CHECK-NEXT: ext/oneapi/experimental/graph/dynamic.hpp -// CHECK-NEXT: ext/oneapi/experimental/work_group_memory.hpp // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp -// CHECK-NEXT: ext/oneapi/experimental/use_root_sync_prop.hpp // CHECK-NEXT: kernel.hpp // CHECK-NEXT: sampler.hpp // CHECK-EMPTY: diff --git a/sycl/test/include_deps/sycl_khr_includes_kernel_bundle.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_kernel_bundle.hpp.cpp index 3c05eb6715259..23e72bf910ad4 100644 --- a/sycl/test/include_deps/sycl_khr_includes_kernel_bundle.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_kernel_bundle.hpp.cpp @@ -124,21 +124,9 @@ // CHECK-NEXT: ext/oneapi/properties/property.hpp // CHECK-NEXT: ext/oneapi/properties/property_value.hpp // CHECK-NEXT: ext/intel/experimental/kernel_execution_properties.hpp -// CHECK-NEXT: ext/oneapi/experimental/virtual_functions.hpp -// CHECK-NEXT: ext/oneapi/properties/property_utils.hpp -// CHECK-NEXT: ext/oneapi/kernel_properties/properties.hpp -// CHECK-NEXT: ext/oneapi/properties/properties.hpp -// CHECK-NEXT: ext/oneapi/work_group_scratch_memory.hpp -// CHECK-NEXT: detail/sycl_local_mem_builtins.hpp -// CHECK-NEXT: detail/kernel_name_str_t.hpp -// CHECK-NEXT: detail/reduction_forward.hpp -// CHECK-NEXT: event.hpp -// CHECK-NEXT: ext/oneapi/bindless_images_interop.hpp -// CHECK-NEXT: ext/oneapi/interop_common.hpp -// CHECK-NEXT: ext/oneapi/bindless_images_mem_handle.hpp -// CHECK-NEXT: ext/oneapi/device_global/device_global.hpp -// CHECK-NEXT: ext/oneapi/device_global/properties.hpp // CHECK-NEXT: ext/oneapi/experimental/cluster_group_prop.hpp +// CHECK-NEXT: ext/oneapi/properties/properties.hpp +// CHECK-NEXT: ext/oneapi/properties/property_utils.hpp // CHECK-NEXT: ext/oneapi/experimental/graph.hpp // CHECK-NEXT: ext/oneapi/experimental/graph/command_graph.hpp // CHECK-NEXT: ext/oneapi/experimental/graph/common.hpp @@ -150,8 +138,20 @@ // CHECK-NEXT: ext/oneapi/experimental/graph/modifiable_graph.hpp // CHECK-NEXT: ext/oneapi/experimental/graph/dynamic.hpp // CHECK-NEXT: ext/oneapi/experimental/work_group_memory.hpp -// CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp // CHECK-NEXT: ext/oneapi/experimental/use_root_sync_prop.hpp +// CHECK-NEXT: ext/oneapi/experimental/virtual_functions.hpp +// CHECK-NEXT: ext/oneapi/kernel_properties/properties.hpp +// CHECK-NEXT: ext/oneapi/work_group_scratch_memory.hpp +// CHECK-NEXT: detail/sycl_local_mem_builtins.hpp +// CHECK-NEXT: detail/kernel_name_str_t.hpp +// CHECK-NEXT: detail/reduction_forward.hpp +// CHECK-NEXT: event.hpp +// CHECK-NEXT: ext/oneapi/bindless_images_interop.hpp +// CHECK-NEXT: ext/oneapi/interop_common.hpp +// CHECK-NEXT: ext/oneapi/bindless_images_mem_handle.hpp +// CHECK-NEXT: ext/oneapi/device_global/device_global.hpp +// CHECK-NEXT: ext/oneapi/device_global/properties.hpp +// CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp // CHECK-NEXT: sampler.hpp // CHECK-NEXT: sycl_span.hpp // CHECK-NEXT: ext/oneapi/experimental/free_function_traits.hpp diff --git a/sycl/test/include_deps/sycl_khr_includes_math.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_math.hpp.cpp index ea5a9c8b7f049..8dcf562a657e2 100644 --- a/sycl/test/include_deps/sycl_khr_includes_math.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_math.hpp.cpp @@ -34,11 +34,11 @@ // CHECK-NEXT: vector.hpp // CHECK-NEXT: detail/named_swizzles_mixin.hpp // CHECK-NEXT: detail/vector_arith.hpp -// CHECK-NEXT: stl_wrappers/cstdlib // CHECK-NEXT: detail/common.hpp // CHECK-NEXT: stl_wrappers/cassert // CHECK-NEXT: stl_wrappers/assert.h // CHECK-NEXT: detail/fwd/accessor.hpp +// CHECK-NEXT: stl_wrappers/cstdlib // CHECK-NEXT: marray.hpp // CHECK-NEXT: multi_ptr.hpp // CHECK-NEXT: detail/address_space_cast.hpp diff --git a/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp index a37e63c0cc7de..7a73642eda287 100644 --- a/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_queue.hpp.cpp @@ -142,6 +142,8 @@ // CHECK-NEXT: detail/kernel_launch_helper.hpp // CHECK-NEXT: ext/intel/experimental/fp_control_kernel_properties.hpp // CHECK-NEXT: ext/intel/experimental/kernel_execution_properties.hpp +// CHECK-NEXT: ext/oneapi/experimental/cluster_group_prop.hpp +// CHECK-NEXT: ext/oneapi/experimental/use_root_sync_prop.hpp // CHECK-NEXT: ext/oneapi/experimental/virtual_functions.hpp // CHECK-NEXT: ext/oneapi/kernel_properties/properties.hpp // CHECK-NEXT: ext/oneapi/work_group_scratch_memory.hpp @@ -153,9 +155,7 @@ // CHECK-NEXT: ext/oneapi/bindless_images_interop.hpp // CHECK-NEXT: ext/oneapi/interop_common.hpp // CHECK-NEXT: ext/oneapi/bindless_images_mem_handle.hpp -// CHECK-NEXT: ext/oneapi/experimental/cluster_group_prop.hpp // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp -// CHECK-NEXT: ext/oneapi/experimental/use_root_sync_prop.hpp // CHECK-NEXT: kernel.hpp // CHECK-NEXT: sampler.hpp // CHECK-EMPTY: diff --git a/sycl/test/include_deps/sycl_khr_includes_reduction.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_reduction.hpp.cpp index d233d264267ac..defd42a0bfd30 100644 --- a/sycl/test/include_deps/sycl_khr_includes_reduction.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_reduction.hpp.cpp @@ -152,21 +152,9 @@ // CHECK-NEXT: ext/oneapi/properties/property.hpp // CHECK-NEXT: ext/oneapi/properties/property_value.hpp // CHECK-NEXT: ext/intel/experimental/kernel_execution_properties.hpp -// CHECK-NEXT: ext/oneapi/experimental/virtual_functions.hpp -// CHECK-NEXT: ext/oneapi/properties/property_utils.hpp -// CHECK-NEXT: ext/oneapi/kernel_properties/properties.hpp -// CHECK-NEXT: ext/oneapi/properties/properties.hpp -// CHECK-NEXT: ext/oneapi/work_group_scratch_memory.hpp -// CHECK-NEXT: detail/sycl_local_mem_builtins.hpp -// CHECK-NEXT: detail/kernel_name_str_t.hpp -// CHECK-NEXT: detail/ur.hpp -// CHECK-NEXT: ur_api_funcs.def -// CHECK-NEXT: ext/oneapi/bindless_images_interop.hpp -// CHECK-NEXT: ext/oneapi/interop_common.hpp -// CHECK-NEXT: ext/oneapi/bindless_images_mem_handle.hpp -// CHECK-NEXT: ext/oneapi/device_global/device_global.hpp -// CHECK-NEXT: ext/oneapi/device_global/properties.hpp // CHECK-NEXT: ext/oneapi/experimental/cluster_group_prop.hpp +// CHECK-NEXT: ext/oneapi/properties/properties.hpp +// CHECK-NEXT: ext/oneapi/properties/property_utils.hpp // CHECK-NEXT: ext/oneapi/experimental/graph.hpp // CHECK-NEXT: ext/oneapi/experimental/graph/command_graph.hpp // CHECK-NEXT: ext/oneapi/experimental/graph/common.hpp @@ -178,8 +166,20 @@ // CHECK-NEXT: ext/oneapi/experimental/graph/modifiable_graph.hpp // CHECK-NEXT: ext/oneapi/experimental/graph/dynamic.hpp // CHECK-NEXT: ext/oneapi/experimental/work_group_memory.hpp -// CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp // CHECK-NEXT: ext/oneapi/experimental/use_root_sync_prop.hpp +// CHECK-NEXT: ext/oneapi/experimental/virtual_functions.hpp +// CHECK-NEXT: ext/oneapi/kernel_properties/properties.hpp +// CHECK-NEXT: ext/oneapi/work_group_scratch_memory.hpp +// CHECK-NEXT: detail/sycl_local_mem_builtins.hpp +// CHECK-NEXT: detail/kernel_name_str_t.hpp +// CHECK-NEXT: detail/ur.hpp +// CHECK-NEXT: ur_api_funcs.def +// CHECK-NEXT: ext/oneapi/bindless_images_interop.hpp +// CHECK-NEXT: ext/oneapi/interop_common.hpp +// CHECK-NEXT: ext/oneapi/bindless_images_mem_handle.hpp +// CHECK-NEXT: ext/oneapi/device_global/device_global.hpp +// CHECK-NEXT: ext/oneapi/device_global/properties.hpp +// CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp // CHECK-NEXT: kernel.hpp // CHECK-NEXT: sampler.hpp // CHECK-NEXT: queue.hpp diff --git a/sycl/test/include_deps/sycl_khr_includes_stream.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_stream.hpp.cpp index 0be5245e52157..ef7d9acea668b 100644 --- a/sycl/test/include_deps/sycl_khr_includes_stream.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_stream.hpp.cpp @@ -135,10 +135,23 @@ // CHECK-NEXT: ext/oneapi/properties/property.hpp // CHECK-NEXT: ext/oneapi/properties/property_value.hpp // CHECK-NEXT: ext/intel/experimental/kernel_execution_properties.hpp -// CHECK-NEXT: ext/oneapi/experimental/virtual_functions.hpp +// CHECK-NEXT: ext/oneapi/experimental/cluster_group_prop.hpp +// CHECK-NEXT: ext/oneapi/properties/properties.hpp // CHECK-NEXT: ext/oneapi/properties/property_utils.hpp +// CHECK-NEXT: ext/oneapi/experimental/graph.hpp +// CHECK-NEXT: ext/oneapi/experimental/graph/command_graph.hpp +// CHECK-NEXT: ext/oneapi/experimental/graph/common.hpp +// CHECK-NEXT: ext/oneapi/experimental/graph/executable_graph.hpp +// CHECK-NEXT: ext/oneapi/experimental/graph/node.hpp +// CHECK-NEXT: ext/oneapi/experimental/detail/properties/graph_properties.hpp +// CHECK-NEXT: ext/oneapi/experimental/detail/properties/graph_properties.def +// CHECK-NEXT: ext/oneapi/experimental/detail/properties/node_properties.def +// CHECK-NEXT: ext/oneapi/experimental/graph/modifiable_graph.hpp +// CHECK-NEXT: ext/oneapi/experimental/graph/dynamic.hpp +// CHECK-NEXT: ext/oneapi/experimental/work_group_memory.hpp +// CHECK-NEXT: ext/oneapi/experimental/use_root_sync_prop.hpp +// CHECK-NEXT: ext/oneapi/experimental/virtual_functions.hpp // CHECK-NEXT: ext/oneapi/kernel_properties/properties.hpp -// CHECK-NEXT: ext/oneapi/properties/properties.hpp // CHECK-NEXT: ext/oneapi/work_group_scratch_memory.hpp // CHECK-NEXT: detail/sycl_local_mem_builtins.hpp // CHECK-NEXT: detail/kernel_name_str_t.hpp @@ -155,20 +168,7 @@ // CHECK-NEXT: ext/oneapi/bindless_images_mem_handle.hpp // CHECK-NEXT: ext/oneapi/device_global/device_global.hpp // CHECK-NEXT: ext/oneapi/device_global/properties.hpp -// CHECK-NEXT: ext/oneapi/experimental/cluster_group_prop.hpp -// CHECK-NEXT: ext/oneapi/experimental/graph.hpp -// CHECK-NEXT: ext/oneapi/experimental/graph/command_graph.hpp -// CHECK-NEXT: ext/oneapi/experimental/graph/common.hpp -// CHECK-NEXT: ext/oneapi/experimental/graph/executable_graph.hpp -// CHECK-NEXT: ext/oneapi/experimental/graph/node.hpp -// CHECK-NEXT: ext/oneapi/experimental/detail/properties/graph_properties.hpp -// CHECK-NEXT: ext/oneapi/experimental/detail/properties/graph_properties.def -// CHECK-NEXT: ext/oneapi/experimental/detail/properties/node_properties.def -// CHECK-NEXT: ext/oneapi/experimental/graph/modifiable_graph.hpp -// CHECK-NEXT: ext/oneapi/experimental/graph/dynamic.hpp -// CHECK-NEXT: ext/oneapi/experimental/work_group_memory.hpp // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp -// CHECK-NEXT: ext/oneapi/experimental/use_root_sync_prop.hpp // CHECK-NEXT: kernel.hpp // CHECK-NEXT: sampler.hpp // CHECK-EMPTY: diff --git a/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp index 87de587378f40..c1d74a687701c 100644 --- a/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp @@ -35,11 +35,11 @@ // CHECK-NEXT: vector.hpp // CHECK-NEXT: detail/named_swizzles_mixin.hpp // CHECK-NEXT: detail/vector_arith.hpp -// CHECK-NEXT: stl_wrappers/cstdlib // CHECK-NEXT: detail/common.hpp // CHECK-NEXT: stl_wrappers/cassert // CHECK-NEXT: stl_wrappers/assert.h // CHECK-NEXT: detail/fwd/accessor.hpp +// CHECK-NEXT: stl_wrappers/cstdlib // CHECK-NEXT: marray.hpp // CHECK-NEXT: multi_ptr.hpp // CHECK-NEXT: detail/address_space_cast.hpp @@ -157,6 +157,8 @@ // CHECK-NEXT: detail/kernel_launch_helper.hpp // CHECK-NEXT: ext/intel/experimental/fp_control_kernel_properties.hpp // CHECK-NEXT: ext/intel/experimental/kernel_execution_properties.hpp +// CHECK-NEXT: ext/oneapi/experimental/cluster_group_prop.hpp +// CHECK-NEXT: ext/oneapi/experimental/use_root_sync_prop.hpp // CHECK-NEXT: ext/oneapi/experimental/virtual_functions.hpp // CHECK-NEXT: ext/oneapi/kernel_properties/properties.hpp // CHECK-NEXT: ext/oneapi/work_group_scratch_memory.hpp @@ -168,9 +170,7 @@ // CHECK-NEXT: ext/oneapi/bindless_images_interop.hpp // CHECK-NEXT: ext/oneapi/interop_common.hpp // CHECK-NEXT: ext/oneapi/bindless_images_mem_handle.hpp -// CHECK-NEXT: ext/oneapi/experimental/cluster_group_prop.hpp // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp -// CHECK-NEXT: ext/oneapi/experimental/use_root_sync_prop.hpp // CHECK-NEXT: kernel.hpp // CHECK-NEXT: sampler.hpp // CHECK-NEXT: usm/usm_pointer_info.hpp diff --git a/sycl/test/include_deps/sycl_khr_includes_vec.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_vec.hpp.cpp index 6de63c741a4c3..5baff0436851f 100644 --- a/sycl/test/include_deps/sycl_khr_includes_vec.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_vec.hpp.cpp @@ -19,7 +19,6 @@ // CHECK-NEXT: detail/type_traits.hpp // CHECK-NEXT: detail/type_traits/vec_marray_traits.hpp // CHECK-NEXT: detail/fwd/multi_ptr.hpp -// CHECK-NEXT: stl_wrappers/cstdlib // CHECK-NEXT: detail/common.hpp // CHECK-NEXT: detail/export.hpp // CHECK-NEXT: stl_wrappers/cassert @@ -28,4 +27,5 @@ // CHECK-NEXT: detail/fwd/accessor.hpp // CHECK-NEXT: detail/defines.hpp // CHECK-NEXT: detail/memcpy.hpp +// CHECK-NEXT: stl_wrappers/cstdlib // CHECK-EMPTY: From bf9401d0fdd59a6b6fee53f345d2c69787e22702 Mon Sep 17 00:00:00 2001 From: "Agarwal, Udit" Date: Mon, 27 Oct 2025 21:40:31 +0100 Subject: [PATCH 07/18] clang format --- .../ext/oneapi/experimental/enqueue_functions.hpp | 4 ++-- sycl/include/sycl/khr/free_function_commands.hpp | 15 ++++++--------- sycl/include/sycl/queue.hpp | 7 +++---- 3 files changed, 11 insertions(+), 15 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index eae2dcbce91b3..673d4c703fe52 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -276,8 +276,8 @@ void nd_launch(queue Q, nd_range Range, const KernelType &KernelObj, if constexpr (sizeof...(ReductionsT) == 0 && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { - detail::submit_kernel_direct_parallel_for( - std::move(Q), Range, KernelObj); + detail::submit_kernel_direct_parallel_for(std::move(Q), Range, + KernelObj); } else { submit(std::move(Q), [&](handler &CGH) { nd_launch(CGH, Range, KernelObj, diff --git a/sycl/include/sycl/khr/free_function_commands.hpp b/sycl/include/sycl/khr/free_function_commands.hpp index 3c46e4509bc9b..04f73dac91ae0 100644 --- a/sycl/include/sycl/khr/free_function_commands.hpp +++ b/sycl/include/sycl/khr/free_function_commands.hpp @@ -164,9 +164,8 @@ void launch_grouped(const queue &q, range<1> r, range<1> size, KernelType &&k, const KernelType &>::value) && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item<1>>::value)) { - detail::submit_kernel_direct_parallel_for( - q, - nd_range<1>(r, size), std::forward(k)); + detail::submit_kernel_direct_parallel_for(q, nd_range<1>(r, size), + std::forward(k)); } else { submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, @@ -182,9 +181,8 @@ void launch_grouped(const queue &q, range<2> r, range<2> size, KernelType &&k, // kernel_handler type argument yet. if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item<2>>::value)) { - detail::submit_kernel_direct_parallel_for( - q, - nd_range<2>(r, size), std::forward(k)); + detail::submit_kernel_direct_parallel_for(q, nd_range<2>(r, size), + std::forward(k)); } else { submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, @@ -200,9 +198,8 @@ void launch_grouped(const queue &q, range<3> r, range<3> size, KernelType &&k, // kernel_handler type argument yet. if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item<3>>::value)) { - detail::submit_kernel_direct_parallel_for( - q, - nd_range<3>(r, size), std::forward(k)); + detail::submit_kernel_direct_parallel_for(q, nd_range<3>(r, size), + std::forward(k)); } else { submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 7b0f974041a47..e0c4de7635d68 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -164,8 +164,8 @@ template auto submit_kernel_direct( - const queue &Queue, - const nd_range &Range, KernelTypeUniversalRef &&KernelFunc, + const queue &Queue, const nd_range &Range, + KernelTypeUniversalRef &&KernelFunc, PropertiesT ExtraProps = ext::oneapi::experimental::empty_properties_t{}, const detail::code_location &CodeLoc = detail::code_location::current()) { @@ -3385,8 +3385,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { return detail::submit_kernel_direct_parallel_for( - *this, Range, - Rest..., + *this, Range, Rest..., ext::oneapi::experimental::empty_properties_t{}, TlsCodeLocCapture.query()); } else { From 98b640059998eaaaff053fe5f7d314da418987af Mon Sep 17 00:00:00 2001 From: "Agarwal, Udit" Date: Tue, 28 Oct 2025 03:38:50 +0100 Subject: [PATCH 08/18] Address feedback --- .../sycl/detail/kernel_launch_helper.hpp | 90 ++++++++----------- sycl/include/sycl/handler.hpp | 6 +- sycl/source/detail/kernel_data.hpp | 15 ++-- sycl/test/abi/sycl_symbols_windows.dump | 13 +-- 4 files changed, 52 insertions(+), 72 deletions(-) diff --git a/sycl/include/sycl/detail/kernel_launch_helper.hpp b/sycl/include/sycl/detail/kernel_launch_helper.hpp index cb8aef61b4cc7..98ff4e544a1f8 100644 --- a/sycl/include/sycl/detail/kernel_launch_helper.hpp +++ b/sycl/include/sycl/detail/kernel_launch_helper.hpp @@ -264,18 +264,16 @@ template struct MarshalledProperty; // Generic implementation for runtime properties. template -struct MarshalledProperty>> { +struct MarshalledProperty< + PropertyTy, + std::enable_if_t && + std::is_same_v>> { std::optional property; template MarshalledProperty([[maybe_unused]] const InputPropertyTy &InputProperties) { - if constexpr (ext::oneapi::experimental::is_property_list_v< - InputPropertyTy>) - if constexpr (InputPropertyTy::template has_property()) { - std::cout << "Got property: " << typeid(PropertyTy).name() << "\n"; - property = InputProperties.template get_property(); - } + if constexpr (InputPropertyTy::template has_property()) + property = InputProperties.template get_property(); } MarshalledProperty() = default; @@ -290,10 +288,8 @@ struct MarshalledProperty { template MarshalledProperty([[maybe_unused]] const InputPropertyTy &Props) { using namespace sycl::ext::oneapi::experimental; - if constexpr (ext::oneapi::experimental::is_property_list_v< - InputPropertyTy>) - if constexpr (InputPropertyTy::template has_property()) - isRootSyncPropPresent = true; + isRootSyncPropPresent = + InputPropertyTy::template has_property(); } MarshalledProperty() = default; @@ -305,16 +301,15 @@ struct MarshalledProperty< sycl::ext::oneapi::experimental::work_group_progress_key> { struct ScopeForwardProgressProperty { - std::optional - Guarantee; - std::optional ExecScope; - std::optional - CoordinationScope; + sycl::ext::oneapi::experimental::forward_progress_guarantee Guarantee; + sycl::ext::oneapi::experimental::execution_scope ExecScope; + sycl::ext::oneapi::experimental::execution_scope CoordinationScope; }; // Forward progress guarantee properties for work_item, sub_group and // work_group scopes. We need to store them for validation later. - std::array MForwardProgressProperties; + std::array, 3> + MForwardProgressProperties; template ) { - if constexpr (InputPropertyTy::template has_property< - work_group_progress_key>()) { - auto prop = Props.template get_property(); - MForwardProgressProperties[0].Guarantee = prop.guarantee; - MForwardProgressProperties[0].ExecScope = execution_scope::work_group; - MForwardProgressProperties[0].CoordinationScope = - prop.coordinationScope; - } - if constexpr (InputPropertyTy::template has_property< - sub_group_progress_key>()) { - auto prop = Props.template get_property(); - MForwardProgressProperties[1].Guarantee = prop.guarantee; - MForwardProgressProperties[1].ExecScope = execution_scope::sub_group; - MForwardProgressProperties[1].CoordinationScope = - prop.coordinationScope; - } - if constexpr (InputPropertyTy::template has_property< - work_item_progress_key>()) { - auto prop = Props.template get_property(); - MForwardProgressProperties[2].Guarantee = prop.guarantee; - MForwardProgressProperties[2].ExecScope = execution_scope::work_item; - MForwardProgressProperties[2].CoordinationScope = - prop.coordinationScope; - } + if constexpr (InputPropertyTy::template has_property< + work_group_progress_key>()) { + auto prop = Props.template get_property(); + MForwardProgressProperties[0] = { + prop.guarantee, execution_scope::work_group, prop.coordinationScope}; + } + if constexpr (InputPropertyTy::template has_property< + sub_group_progress_key>()) { + auto prop = Props.template get_property(); + MForwardProgressProperties[1] = { + prop.guarantee, execution_scope::sub_group, prop.coordinationScope}; + } + if constexpr (InputPropertyTy::template has_property< + work_item_progress_key>()) { + auto prop = Props.template get_property(); + MForwardProgressProperties[2] = { + prop.guarantee, execution_scope::work_item, prop.coordinationScope}; } } @@ -371,21 +357,14 @@ using KernelPropertyHolderStructTy = sycl::ext::oneapi::experimental::cuda::cluster_size_key<2>, sycl::ext::oneapi::experimental::cuda::cluster_size_key<3>>; -template -constexpr auto processKernelLaunchProperties(PropertiesT Props) { - KernelPropertyHolderStructTy prop(Props); - return prop; -} - /// Note: it is important that this function *does not* depend on kernel /// name or kernel type, because then it will be instantiated for every /// kernel, even though body of those instantiated functions could be almost /// the same, thus unnecessary increasing compilation time. -template +template >> constexpr auto processKernelProperties(PropertiesT Props) { - static_assert(ext::oneapi::experimental::is_property_list::value, - "Template type is not a property list."); static_assert( !PropertiesT::template has_property< sycl::ext::intel::experimental::fp_control_key>() || @@ -398,7 +377,8 @@ constexpr auto processKernelProperties(PropertiesT Props) { sycl::ext::oneapi::experimental::indirectly_callable_key>(), "indirectly_callable property cannot be applied to SYCL kernels"); - return processKernelLaunchProperties(Props); + KernelPropertyHolderStructTy prop(Props); + return prop; } // Returns KernelLaunchPropertiesTy or std::nullopt based on whether the diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index ca9a9a8dc301b..bf003be86e80a 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -903,7 +903,7 @@ class __SYCL_EXPORT handler { template void processLaunchProperties(PropertiesT Props) { detail::KernelPropertyHolderStructTy ParsedProp = - detail::processKernelLaunchProperties(Props); + detail::processKernelProperties(Props); setKernelLaunchProperties(ParsedProp); } @@ -1296,7 +1296,7 @@ class __SYCL_EXPORT handler { detail::checkValueRange(NumWorkItems); setNDRangeDescriptor(std::move(NumWorkItems)); detail::KernelPropertyHolderStructTy ParsedProp = - detail::processKernelLaunchProperties(Props); + detail::processKernelProperties(Props); setKernelLaunchProperties(ParsedProp); extractArgsAndReqs(); #endif @@ -1321,7 +1321,7 @@ class __SYCL_EXPORT handler { detail::checkValueRange(NDRange); setNDRangeDescriptor(std::move(NDRange)); detail::KernelPropertyHolderStructTy ParsedProp = - detail::processKernelLaunchProperties(Props); + detail::processKernelProperties(Props); setKernelLaunchProperties(ParsedProp); extractArgsAndReqs(); #endif diff --git a/sycl/source/detail/kernel_data.hpp b/sycl/source/detail/kernel_data.hpp index 1866b53c94f86..9e838aaf10154 100644 --- a/sycl/source/detail/kernel_data.hpp +++ b/sycl/source/detail/kernel_data.hpp @@ -258,14 +258,13 @@ class KernelData { // Validate and set forward progress guarantees. for (int i = 0; i < 3; i++) { - if (ForwardProgressProp->MForwardProgressProperties[i] - .Guarantee.has_value()) { + if (ForwardProgressProp->MForwardProgressProperties[i].has_value()) { if (!dev.supportsForwardProgress( - *ForwardProgressProp->MForwardProgressProperties[i].Guarantee, - *ForwardProgressProp->MForwardProgressProperties[i].ExecScope, - *ForwardProgressProp->MForwardProgressProperties[i] - .CoordinationScope)) { + ForwardProgressProp->MForwardProgressProperties[i]->Guarantee, + ForwardProgressProp->MForwardProgressProperties[i]->ExecScope, + ForwardProgressProp->MForwardProgressProperties[i] + ->CoordinationScope)) { throw sycl::exception( sycl::make_error_code(errc::feature_not_supported), "The device associated with the queue does not support the " @@ -273,7 +272,7 @@ class KernelData { } auto execScope = - *ForwardProgressProp->MForwardProgressProperties[i].ExecScope; + ForwardProgressProp->MForwardProgressProperties[i]->ExecScope; // If we are here, the device supports the guarantee required but // there is a caveat in that if the guarantee required is a concurrent // guarantee, then we most likely also need to enable cooperative @@ -286,7 +285,7 @@ class KernelData { // behavior in Unified Runtime. if ((execScope == execScope::work_group || execScope == execScope::sub_group) && - (*ForwardProgressProp->MForwardProgressProperties[i].Guarantee == + (ForwardProgressProp->MForwardProgressProperties[i]->Guarantee == forward_progress_guarantee::concurrent)) { setCooperative(true); } diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index d5f53a5bbb505..e11cca8be25cb 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -286,12 +286,12 @@ ??$is_image_handle_supported@Usampled_image_handle@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA_NAEBUimage_descriptor@01234@W4image_memory_handle_type@01234@AEBVqueue@34@@Z ??$is_image_handle_supported@Uunsampled_image_handle@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA_NAEBUimage_descriptor@01234@W4image_memory_handle_type@01234@AEBVdevice@34@AEBVcontext@34@@Z ??$is_image_handle_supported@Uunsampled_image_handle@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA_NAEBUimage_descriptor@01234@W4image_memory_handle_type@01234@AEBVqueue@34@@Z -??$submit_kernel_direct_with_event_impl@$00@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$00@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@601@AEBUcode_location@601@_N@Z -??$submit_kernel_direct_with_event_impl@$01@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$01@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@601@AEBUcode_location@601@_N@Z -??$submit_kernel_direct_with_event_impl@$02@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$02@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@601@AEBUcode_location@601@_N@Z -??$submit_kernel_direct_without_event_impl@$00@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$00@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@501@AEBUcode_location@501@_N@Z -??$submit_kernel_direct_without_event_impl@$01@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$01@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@501@AEBUcode_location@501@_N@Z -??$submit_kernel_direct_without_event_impl@$02@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$02@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@501@AEBUcode_location@501@_N@Z +??$submit_kernel_direct_with_event_impl@$00@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$00@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@601@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@601@AEBUcode_location@601@_N@Z +??$submit_kernel_direct_with_event_impl@$01@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$01@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@601@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@601@AEBUcode_location@601@_N@Z +??$submit_kernel_direct_with_event_impl@$02@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$02@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@601@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@601@AEBUcode_location@601@_N@Z +??$submit_kernel_direct_without_event_impl@$00@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$00@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@501@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@501@AEBUcode_location@501@_N@Z +??$submit_kernel_direct_without_event_impl@$01@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$01@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@501@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@501@AEBUcode_location@501@_N@Z +??$submit_kernel_direct_without_event_impl@$02@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$02@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@501@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@501@AEBUcode_location@501@_N@Z ??$update_nd_range@$00@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$00@45@@Z ??$update_nd_range@$01@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$01@45@@Z ??$update_nd_range@$02@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$02@45@@Z @@ -4428,6 +4428,7 @@ ?setKernelFunc@handler@_V1@sycl@@AEAAXPEAX@Z ?setKernelInfo@handler@_V1@sycl@@AEAAXPEAXHP6A?AUkernel_param_desc_t@detail@23@H@Z_N2@Z ?setKernelIsCooperative@handler@_V1@sycl@@AEAAX_N@Z +?setKernelLaunchProperties@handler@_V1@sycl@@AEAAXAEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@detail@23@@Z ?setKernelNameBasedCachePtr@handler@_V1@sycl@@AEAAXPEAUKernelNameBasedCacheT@detail@23@@Z ?setKernelWorkGroupMem@handler@_V1@sycl@@AEAAX_K@Z ?setLocalAccessorArgHelper@handler@_V1@sycl@@AEAAXHAEAVLocalAccessorBaseHost@detail@23@@Z From b043a2d61a1a11fcb0314a91bf29583c1271919d Mon Sep 17 00:00:00 2001 From: "Agarwal, Udit" Date: Tue, 28 Oct 2025 03:44:49 +0100 Subject: [PATCH 09/18] Fix test failures --- sycl/test/include_deps/sycl_khr_includes_context.hpp.cpp | 2 +- sycl/test/include_deps/sycl_khr_includes_functional.hpp.cpp | 1 + sycl/test/include_deps/sycl_khr_includes_math.hpp.cpp | 2 +- sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp | 2 +- sycl/test/include_deps/sycl_khr_includes_vec.hpp.cpp | 2 +- 5 files changed, 5 insertions(+), 4 deletions(-) diff --git a/sycl/test/include_deps/sycl_khr_includes_context.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_context.hpp.cpp index 3608d2902d692..ddaf62e33ae82 100644 --- a/sycl/test/include_deps/sycl_khr_includes_context.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_context.hpp.cpp @@ -10,8 +10,8 @@ // CHECK-NEXT: feature_test.hpp // CHECK-NEXT: context.hpp // CHECK-NEXT: async_handler.hpp -// CHECK-NEXT: backend_types.hpp // CHECK-NEXT: stl_wrappers/cstdlib +// CHECK-NEXT: backend_types.hpp // CHECK-NEXT: detail/export.hpp // CHECK-NEXT: detail/info_desc_helpers.hpp // CHECK-NEXT: aspects.hpp diff --git a/sycl/test/include_deps/sycl_khr_includes_functional.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_functional.hpp.cpp index 8b19443974f66..692e49c927e57 100644 --- a/sycl/test/include_deps/sycl_khr_includes_functional.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_functional.hpp.cpp @@ -9,4 +9,5 @@ // CHECK-NEXT: detail/defines_elementary.hpp // CHECK-NEXT: feature_test.hpp // CHECK-NEXT: functional.hpp +// CHECK-NEXT: stl_wrappers/cstdlib // CHECK-EMPTY: diff --git a/sycl/test/include_deps/sycl_khr_includes_math.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_math.hpp.cpp index 8dcf562a657e2..ea5a9c8b7f049 100644 --- a/sycl/test/include_deps/sycl_khr_includes_math.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_math.hpp.cpp @@ -34,11 +34,11 @@ // CHECK-NEXT: vector.hpp // CHECK-NEXT: detail/named_swizzles_mixin.hpp // CHECK-NEXT: detail/vector_arith.hpp +// CHECK-NEXT: stl_wrappers/cstdlib // CHECK-NEXT: detail/common.hpp // CHECK-NEXT: stl_wrappers/cassert // CHECK-NEXT: stl_wrappers/assert.h // CHECK-NEXT: detail/fwd/accessor.hpp -// CHECK-NEXT: stl_wrappers/cstdlib // CHECK-NEXT: marray.hpp // CHECK-NEXT: multi_ptr.hpp // CHECK-NEXT: detail/address_space_cast.hpp diff --git a/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp index c1d74a687701c..99f87aee43e62 100644 --- a/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_usm.hpp.cpp @@ -35,11 +35,11 @@ // CHECK-NEXT: vector.hpp // CHECK-NEXT: detail/named_swizzles_mixin.hpp // CHECK-NEXT: detail/vector_arith.hpp +// CHECK-NEXT: stl_wrappers/cstdlib // CHECK-NEXT: detail/common.hpp // CHECK-NEXT: stl_wrappers/cassert // CHECK-NEXT: stl_wrappers/assert.h // CHECK-NEXT: detail/fwd/accessor.hpp -// CHECK-NEXT: stl_wrappers/cstdlib // CHECK-NEXT: marray.hpp // CHECK-NEXT: multi_ptr.hpp // CHECK-NEXT: detail/address_space_cast.hpp diff --git a/sycl/test/include_deps/sycl_khr_includes_vec.hpp.cpp b/sycl/test/include_deps/sycl_khr_includes_vec.hpp.cpp index 5baff0436851f..6de63c741a4c3 100644 --- a/sycl/test/include_deps/sycl_khr_includes_vec.hpp.cpp +++ b/sycl/test/include_deps/sycl_khr_includes_vec.hpp.cpp @@ -19,6 +19,7 @@ // CHECK-NEXT: detail/type_traits.hpp // CHECK-NEXT: detail/type_traits/vec_marray_traits.hpp // CHECK-NEXT: detail/fwd/multi_ptr.hpp +// CHECK-NEXT: stl_wrappers/cstdlib // CHECK-NEXT: detail/common.hpp // CHECK-NEXT: detail/export.hpp // CHECK-NEXT: stl_wrappers/cassert @@ -27,5 +28,4 @@ // CHECK-NEXT: detail/fwd/accessor.hpp // CHECK-NEXT: detail/defines.hpp // CHECK-NEXT: detail/memcpy.hpp -// CHECK-NEXT: stl_wrappers/cstdlib // CHECK-EMPTY: From 0a3b4e6557c4d7fc395e861f01a7235f66ed22cd Mon Sep 17 00:00:00 2001 From: "Agarwal, Udit" Date: Tue, 28 Oct 2025 05:52:59 +0100 Subject: [PATCH 10/18] Fix RHEL build --- sycl/include/sycl/detail/kernel_launch_helper.hpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/sycl/include/sycl/detail/kernel_launch_helper.hpp b/sycl/include/sycl/detail/kernel_launch_helper.hpp index 98ff4e544a1f8..af26b611c3b6f 100644 --- a/sycl/include/sycl/detail/kernel_launch_helper.hpp +++ b/sycl/include/sycl/detail/kernel_launch_helper.hpp @@ -271,9 +271,10 @@ struct MarshalledProperty< std::optional property; template - MarshalledProperty([[maybe_unused]] const InputPropertyTy &InputProperties) { + MarshalledProperty(const InputPropertyTy &Props) { + (void)Props; if constexpr (InputPropertyTy::template has_property()) - property = InputProperties.template get_property(); + property = Props.template get_property(); } MarshalledProperty() = default; @@ -286,8 +287,10 @@ struct MarshalledProperty { bool isRootSyncPropPresent = false; template - MarshalledProperty([[maybe_unused]] const InputPropertyTy &Props) { + MarshalledProperty(const InputPropertyTy &Props) { using namespace sycl::ext::oneapi::experimental; + + (void)Props; isRootSyncPropPresent = InputPropertyTy::template has_property(); } @@ -314,8 +317,9 @@ struct MarshalledProperty< template >> - MarshalledProperty([[maybe_unused]] const InputPropertyTy &Props) { + MarshalledProperty(const InputPropertyTy &Props) { using namespace sycl::ext::oneapi::experimental; + (void)Props; if constexpr (InputPropertyTy::template has_property< work_group_progress_key>()) { From 1bf1a1646a3e56f0b54eb3736a9170a92449126f Mon Sep 17 00:00:00 2001 From: "Agarwal, Udit" Date: Tue, 28 Oct 2025 06:20:00 +0100 Subject: [PATCH 11/18] [NFC][SYCL] Refactor kernel property parsing --- .../oneapi/experimental/enqueue_functions.hpp | 38 ++++---- .../sycl/khr/free_function_commands.hpp | 37 +++++--- sycl/include/sycl/queue.hpp | 88 ++++++------------- sycl/source/detail/queue_impl.cpp | 7 -- sycl/source/detail/queue_impl.hpp | 7 +- sycl/source/queue.cpp | 12 +-- sycl/test/abi/sycl_symbols_linux.dump | 13 ++- sycl/test/abi/sycl_symbols_windows.dump | 13 ++- 8 files changed, 82 insertions(+), 133 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 673d4c703fe52..8c8488a99e354 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -271,13 +271,17 @@ template void nd_launch(queue Q, nd_range Range, const KernelType &KernelObj, ReductionsT &&...Reductions) { - // TODO The handler-less path does not support reductions, and - // kernel functions with the kernel_handler type argument yet. + // TODO The handler-less path does not support reductions, kernel + // function properties and kernel functions with the kernel_handler + // type argument yet. if constexpr (sizeof...(ReductionsT) == 0 && + !(ext::oneapi::experimental::detail:: + HasKernelPropertiesGetMethod< + const KernelType &>::value) && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { - detail::submit_kernel_direct_parallel_for(std::move(Q), Range, - KernelObj); + detail::submit_kernel_direct_parallel_for( + std::move(Q), empty_properties_t{}, Range, KernelObj); } else { submit(std::move(Q), [&](handler &CGH) { nd_launch(CGH, Range, KernelObj, @@ -304,25 +308,13 @@ template void nd_launch(queue Q, launch_config, Properties> Config, const KernelType &KernelObj, ReductionsT &&...Reductions) { - // TODO The handler-less path does not support reductions, and - // kernel functions with the kernel_handler type argument yet. - if constexpr (sizeof...(ReductionsT) == 0 && - !(detail::KernelLambdaHasKernelHandlerArgT< - KernelType, sycl::nd_item>::value)) { - - ext::oneapi::experimental::detail::LaunchConfigAccess, - Properties> - LaunchConfigAccess(Config); - - detail::submit_kernel_direct_parallel_for( - std::move(Q), LaunchConfigAccess.getRange(), KernelObj, - LaunchConfigAccess.getProperties()); - } else { - submit(std::move(Q), [&](handler &CGH) { - nd_launch(CGH, Config, KernelObj, - std::forward(Reductions)...); - }); - } + // TODO This overload of the nd_launch function takes the kernel function + // properties, which are not yet supported for the handler-less path, + // so it only supports handler based submission for now + submit(std::move(Q), [&](handler &CGH) { + nd_launch(CGH, Config, KernelObj, + std::forward(Reductions)...); + }); } template diff --git a/sycl/include/sycl/khr/free_function_commands.hpp b/sycl/include/sycl/khr/free_function_commands.hpp index 04f73dac91ae0..68dd159bf8211 100644 --- a/sycl/include/sycl/khr/free_function_commands.hpp +++ b/sycl/include/sycl/khr/free_function_commands.hpp @@ -157,15 +157,16 @@ template r, range<1> size, KernelType &&k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { - // TODO The handler-less path does not support kernel functions with the - // kernel_handler type argument yet. + // TODO The handler-less path does not support kernel function properties + // and kernel functions with the kernel_handler type argument yet. if constexpr (!(ext::oneapi::experimental::detail:: HasKernelPropertiesGetMethod< const KernelType &>::value) && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item<1>>::value)) { - detail::submit_kernel_direct_parallel_for(q, nd_range<1>(r, size), - std::forward(k)); + detail::submit_kernel_direct_parallel_for( + q, ext::oneapi::experimental::empty_properties_t{}, + nd_range<1>(r, size), std::forward(k)); } else { submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, @@ -177,12 +178,16 @@ template r, range<2> size, KernelType &&k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { - // TODO The handler-less path does not support kernel functions with the - // kernel_handler type argument yet. - if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT< + // TODO The handler-less path does not support kernel function properties + // and kernel functions with the kernel_handler type argument yet. + if constexpr (!(ext::oneapi::experimental::detail:: + HasKernelPropertiesGetMethod< + const KernelType &>::value) && + !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item<2>>::value)) { - detail::submit_kernel_direct_parallel_for(q, nd_range<2>(r, size), - std::forward(k)); + detail::submit_kernel_direct_parallel_for( + q, ext::oneapi::experimental::empty_properties_t{}, + nd_range<2>(r, size), std::forward(k)); } else { submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, @@ -194,12 +199,16 @@ template r, range<3> size, KernelType &&k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { - // TODO The handler-less path does not support kernel functions with the - // kernel_handler type argument yet. - if constexpr (!(detail::KernelLambdaHasKernelHandlerArgT< + // TODO The handler-less path does not support kernel function properties + // and kernel functions with the kernel_handler type argument yet. + if constexpr (!(ext::oneapi::experimental::detail:: + HasKernelPropertiesGetMethod< + const KernelType &>::value) && + !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item<3>>::value)) { - detail::submit_kernel_direct_parallel_for(q, nd_range<3>(r, size), - std::forward(k)); + detail::submit_kernel_direct_parallel_for( + q, ext::oneapi::experimental::empty_properties_t{}, + nd_range<3>(r, size), std::forward(k)); } else { submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index e0c4de7635d68..4a7f1fac789a3 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -68,7 +68,6 @@ event __SYCL_EXPORT submit_kernel_direct_with_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template @@ -76,7 +75,6 @@ void __SYCL_EXPORT submit_kernel_direct_without_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); namespace detail { @@ -161,14 +159,16 @@ class __SYCL_EXPORT SubmissionInfo { template + typename PropertiesT, typename KernelTypeUniversalRef, int Dims> auto submit_kernel_direct( - const queue &Queue, const nd_range &Range, - KernelTypeUniversalRef &&KernelFunc, - PropertiesT ExtraProps = ext::oneapi::experimental::empty_properties_t{}, + const queue &Queue, [[maybe_unused]] PropertiesT Props, + const nd_range &Range, KernelTypeUniversalRef &&KernelFunc, const detail::code_location &CodeLoc = detail::code_location::current()) { - + // TODO Properties not supported yet + static_assert( + std::is_same_v, + "Setting properties not supported yet for no-CGH kernel submit."); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); using KernelType = @@ -210,46 +210,22 @@ auto submit_kernel_direct( "-fsycl-host-compiler-options='/std:c++latest' " "might also help."); - // Get Kernel Launch properties. User can specify properties either - // via specifying get(property_tag{}) method in kernel type or by using - // launch_config API or by explicitly passing them in call to - // parallel_for (deprecated API). - // ExtraProps are properties passed explicitly or via launch_config. - - // Assumption: If user specify properties via launch_config or explicitly - // then we don't check for properties specified via get() method. - KernelPropertyHolderStructTy parsedProps; - if constexpr (std::is_same_v) { - // Use properties passed via. get() method. - if constexpr (ext::oneapi::experimental::detail:: - HasKernelPropertiesGetMethod::value) { - auto prop = KernelFunc.get(ext::oneapi::experimental::properties_tag{}); - parsedProps = detail::processKernelProperties(prop); - } - } else { - // Use ExtraProps - parsedProps = detail::processKernelProperties(ExtraProps); - } - if constexpr (EventNeeded) { return submit_kernel_direct_with_event_impl( - Queue, Range, HostKernel, DeviceKernelInfoPtr, parsedProps, + Queue, Range, HostKernel, DeviceKernelInfoPtr, TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); } else { submit_kernel_direct_without_event_impl( - Queue, Range, HostKernel, DeviceKernelInfoPtr, parsedProps, + Queue, Range, HostKernel, DeviceKernelInfoPtr, TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); } } template + typename PropertiesT, typename KernelTypeUniversalRef, int Dims> auto submit_kernel_direct_parallel_for( - const queue &Queue, const nd_range &Range, + const queue &Queue, PropertiesT Props, const nd_range &Range, KernelTypeUniversalRef &&KernelFunc, - PropertiesT Props = ext::oneapi::experimental::empty_properties_t{}, const detail::code_location &CodeLoc = detail::code_location::current()) { using KernelType = @@ -270,7 +246,7 @@ auto submit_kernel_direct_parallel_for( return submit_kernel_direct( - Queue, Range, std::forward(KernelFunc), Props, + Queue, Props, Range, std::forward(KernelFunc), CodeLoc); } @@ -283,8 +259,8 @@ auto submit_kernel_direct_single_task( return submit_kernel_direct( - Queue, nd_range<1>{1, 1}, - std::forward(KernelFunc), Props, CodeLoc); + Queue, Props, nd_range<1>{1, 1}, + std::forward(KernelFunc), CodeLoc); } } // namespace detail @@ -3347,22 +3323,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { RestT &&...Rest) { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - using KernelType = std::tuple_element_t<0, std::tuple>; - - // TODO The handler-less path does not support reductions, and - // kernel functions with the kernel_handler type argument yet. - if constexpr (sizeof...(RestT) == 1 && - !(detail::KernelLambdaHasKernelHandlerArgT< - KernelType, sycl::nd_item>::value)) { - - return detail::submit_kernel_direct_parallel_for( - *this, Range, Rest..., Properties, TlsCodeLocCapture.query()); - } else - return submit( - [&](handler &CGH) { - CGH.template parallel_for(Range, Properties, Rest...); - }, - TlsCodeLocCapture.query()); + return submit( + [&](handler &CGH) { + CGH.template parallel_for(Range, Properties, Rest...); + }, + TlsCodeLocCapture.query()); } /// parallel_for version with a kernel represented as a lambda + nd_range that @@ -3379,15 +3344,18 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); using KernelType = std::tuple_element_t<0, std::tuple>; - // TODO The handler-less path does not support reductions, and - // kernel functions with the kernel_handler type argument yet. + // TODO The handler-less path does not support reductions, kernel + // function properties and kernel functions with the kernel_handler + // type argument yet. if constexpr (sizeof...(RestT) == 1 && + !(ext::oneapi::experimental::detail:: + HasKernelPropertiesGetMethod< + const KernelType &>::value) && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { return detail::submit_kernel_direct_parallel_for( - *this, Range, Rest..., - ext::oneapi::experimental::empty_properties_t{}, - TlsCodeLocCapture.query()); + *this, ext::oneapi::experimental::empty_properties_t{}, Range, + Rest..., TlsCodeLocCapture.query()); } else { return submit( [&](handler &CGH) { diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 3ec6cc2ec2fce..d86f6a5c6aac2 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -567,7 +567,6 @@ EventImplPtr queue_impl::submit_command_to_graph( EventImplPtr queue_impl::submit_kernel_direct_impl( const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent, - const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { KernelData KData; @@ -575,12 +574,6 @@ EventImplPtr queue_impl::submit_kernel_direct_impl( KData.setDeviceKernelInfoPtr(DeviceKernelInfo); KData.setNDRDesc(NDRDesc); - // Validate and set kernel launch properties. - KData.validateAndSetKernelLaunchProperties( - Props, getCommandGraph() != nullptr, /*HasGraph?*/ - getDeviceImpl() /*device_impl*/ - ); - auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData, bool SchedulerBypass) -> EventImplPtr { if (SchedulerBypass) { diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 031b0a01f56bc..49da7aee8c448 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -363,11 +363,10 @@ class queue_impl : public std::enable_shared_from_this { event submit_kernel_direct_with_event( const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { detail::EventImplPtr EventImpl = submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, - true, Props, CodeLoc, IsTopCodeLoc); + true, CodeLoc, IsTopCodeLoc); return createSyclObjFromImpl(EventImpl); } @@ -375,10 +374,9 @@ class queue_impl : public std::enable_shared_from_this { void submit_kernel_direct_without_event( const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { submit_kernel_direct_impl(NDRDescT{Range}, HostKernel, DeviceKernelInfo, - false, Props, CodeLoc, IsTopCodeLoc); + false, CodeLoc, IsTopCodeLoc); } void submit_without_event(const detail::type_erased_cgfo_ty &CGF, @@ -931,7 +929,6 @@ class queue_impl : public std::enable_shared_from_this { EventImplPtr submit_kernel_direct_impl( const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent, - const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 7fe5649aecc2a..f34da47852266 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -476,31 +476,27 @@ event submit_kernel_direct_with_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { return getSyclObjImpl(Queue)->submit_kernel_direct_with_event( - Range, HostKernel, DeviceKernelInfo, Props, CodeLoc, IsTopCodeLoc); + Range, HostKernel, DeviceKernelInfo, CodeLoc, IsTopCodeLoc); } template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<1>( const queue &Queue, const nd_range<1> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<2>( const queue &Queue, const nd_range<2> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template event __SYCL_EXPORT submit_kernel_direct_with_event_impl<3>( const queue &Queue, const nd_range<3> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template @@ -508,31 +504,27 @@ void submit_kernel_direct_without_event_impl( const queue &Queue, const nd_range &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { getSyclObjImpl(Queue)->submit_kernel_direct_without_event( - Range, HostKernel, DeviceKernelInfo, Props, CodeLoc, IsTopCodeLoc); + Range, HostKernel, DeviceKernelInfo, CodeLoc, IsTopCodeLoc); } template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<1>( const queue &Queue, const nd_range<1> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<2>( const queue &Queue, const nd_range<2> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template void __SYCL_EXPORT submit_kernel_direct_without_event_impl<3>( const queue &Queue, const nd_range<3> &Range, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, - const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); } // namespace _V1 diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 83292f20db599..032b82ae74293 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -2985,12 +2985,12 @@ _ZN4sycl3_V121__isgreaterequal_implEdd _ZN4sycl3_V121__isgreaterequal_implEff _ZN4sycl3_V122accelerator_selector_vERKNS0_6deviceE _ZN4sycl3_V128verifyUSMAllocatorPropertiesERKNS0_13property_listE -_ZN4sycl3_V136submit_kernel_direct_with_event_implILi1EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoERKNSA_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSH_5intel12experimental12cache_configENSJ_17use_root_sync_keyENSJ_23work_group_progress_keyENSJ_4cuda12cluster_sizeILi1EEENSR_ILi2EEENSR_ILi3EEEEEERKNSA_13code_locationEb -_ZN4sycl3_V136submit_kernel_direct_with_event_implILi2EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoERKNSA_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSH_5intel12experimental12cache_configENSJ_17use_root_sync_keyENSJ_23work_group_progress_keyENSJ_4cuda12cluster_sizeILi1EEENSR_ILi2EEENSR_ILi3EEEEEERKNSA_13code_locationEb -_ZN4sycl3_V136submit_kernel_direct_with_event_implILi3EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoERKNSA_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSH_5intel12experimental12cache_configENSJ_17use_root_sync_keyENSJ_23work_group_progress_keyENSJ_4cuda12cluster_sizeILi1EEENSR_ILi2EEENSR_ILi3EEEEEERKNSA_13code_locationEb -_ZN4sycl3_V139submit_kernel_direct_without_event_implILi1EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoERKNS9_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSG_5intel12experimental12cache_configENSI_17use_root_sync_keyENSI_23work_group_progress_keyENSI_4cuda12cluster_sizeILi1EEENSQ_ILi2EEENSQ_ILi3EEEEEERKNS9_13code_locationEb -_ZN4sycl3_V139submit_kernel_direct_without_event_implILi2EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoERKNS9_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSG_5intel12experimental12cache_configENSI_17use_root_sync_keyENSI_23work_group_progress_keyENSI_4cuda12cluster_sizeILi1EEENSQ_ILi2EEENSQ_ILi3EEEEEERKNS9_13code_locationEb -_ZN4sycl3_V139submit_kernel_direct_without_event_implILi3EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoERKNS9_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSG_5intel12experimental12cache_configENSI_17use_root_sync_keyENSI_23work_group_progress_keyENSI_4cuda12cluster_sizeILi1EEENSQ_ILi2EEENSQ_ILi3EEEEEERKNS9_13code_locationEb +_ZN4sycl3_V136submit_kernel_direct_with_event_implILi1EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoERKNSA_13code_locationEb +_ZN4sycl3_V136submit_kernel_direct_with_event_implILi2EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoERKNSA_13code_locationEb +_ZN4sycl3_V136submit_kernel_direct_with_event_implILi3EEENS0_5eventERKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNSA_16DeviceKernelInfoERKNSA_13code_locationEb +_ZN4sycl3_V139submit_kernel_direct_without_event_implILi1EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoERKNS9_13code_locationEb +_ZN4sycl3_V139submit_kernel_direct_without_event_implILi2EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoERKNS9_13code_locationEb +_ZN4sycl3_V139submit_kernel_direct_without_event_implILi3EEEvRKNS0_5queueERKNS0_8nd_rangeIXT_EEERNS0_6detail17HostKernelRefBaseEPNS9_16DeviceKernelInfoERKNS9_13code_locationEb _ZN4sycl3_V13ext5intel12experimental9pipe_base13get_pipe_nameB5cxx11EPKv _ZN4sycl3_V13ext5intel12experimental9pipe_base17wait_non_blockingERKNS0_5eventE _ZN4sycl3_V13ext5intel12experimental9pipe_base18get_pipe_name_implEPKv @@ -3618,7 +3618,6 @@ _ZN4sycl3_V17handler24ext_oneapi_memset2d_implEPvmimm _ZN4sycl3_V17handler24registerDynamicParameterEPNS0_3ext6oneapi12experimental6detail22dynamic_parameter_implEi _ZN4sycl3_V17handler24registerDynamicParameterERNS0_3ext6oneapi12experimental6detail22dynamic_parameter_baseEi _ZN4sycl3_V17handler25ext_intel_write_host_pipeENS0_6detail11string_viewEPvmb -_ZN4sycl3_V17handler25setKernelLaunchPropertiesERKNS0_6detail27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENS5_5intel12experimental12cache_configENS7_17use_root_sync_keyENS7_23work_group_progress_keyENS7_4cuda12cluster_sizeILi1EEENSF_ILi2EEENSF_ILi3EEEEEE _ZN4sycl3_V17handler26associateWithHandlerCommonESt10shared_ptrINS0_6detail16AccessorImplHostEEi _ZN4sycl3_V17handler26setKernelNameBasedCachePtrEPNS0_6detail21KernelNameBasedCacheTE _ZN4sycl3_V17handler26setNDRangeDescriptorPaddedENS0_5rangeILi3EEENS0_2idILi3EEEi diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index e11cca8be25cb..d5f53a5bbb505 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -286,12 +286,12 @@ ??$is_image_handle_supported@Usampled_image_handle@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA_NAEBUimage_descriptor@01234@W4image_memory_handle_type@01234@AEBVqueue@34@@Z ??$is_image_handle_supported@Uunsampled_image_handle@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA_NAEBUimage_descriptor@01234@W4image_memory_handle_type@01234@AEBVdevice@34@AEBVcontext@34@@Z ??$is_image_handle_supported@Uunsampled_image_handle@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA_NAEBUimage_descriptor@01234@W4image_memory_handle_type@01234@AEBVqueue@34@@Z -??$submit_kernel_direct_with_event_impl@$00@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$00@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@601@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@601@AEBUcode_location@601@_N@Z -??$submit_kernel_direct_with_event_impl@$01@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$01@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@601@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@601@AEBUcode_location@601@_N@Z -??$submit_kernel_direct_with_event_impl@$02@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$02@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@601@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@601@AEBUcode_location@601@_N@Z -??$submit_kernel_direct_without_event_impl@$00@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$00@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@501@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@501@AEBUcode_location@501@_N@Z -??$submit_kernel_direct_without_event_impl@$01@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$01@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@501@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@501@AEBUcode_location@501@_N@Z -??$submit_kernel_direct_without_event_impl@$02@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$02@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@501@AEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@501@AEBUcode_location@501@_N@Z +??$submit_kernel_direct_with_event_impl@$00@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$00@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@601@AEBUcode_location@601@_N@Z +??$submit_kernel_direct_with_event_impl@$01@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$01@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@601@AEBUcode_location@601@_N@Z +??$submit_kernel_direct_with_event_impl@$02@_V1@sycl@@YA?AVevent@01@AEBVqueue@01@AEBV?$nd_range@$02@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@601@AEBUcode_location@601@_N@Z +??$submit_kernel_direct_without_event_impl@$00@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$00@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@501@AEBUcode_location@501@_N@Z +??$submit_kernel_direct_without_event_impl@$01@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$01@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@501@AEBUcode_location@501@_N@Z +??$submit_kernel_direct_without_event_impl@$02@_V1@sycl@@YAXAEBVqueue@01@AEBV?$nd_range@$02@01@AEAVHostKernelRefBase@detail@01@PEAVDeviceKernelInfo@501@AEBUcode_location@501@_N@Z ??$update_nd_range@$00@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$00@45@@Z ??$update_nd_range@$01@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$01@45@@Z ??$update_nd_range@$02@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$02@45@@Z @@ -4428,7 +4428,6 @@ ?setKernelFunc@handler@_V1@sycl@@AEAAXPEAX@Z ?setKernelInfo@handler@_V1@sycl@@AEAAXPEAXHP6A?AUkernel_param_desc_t@detail@23@H@Z_N2@Z ?setKernelIsCooperative@handler@_V1@sycl@@AEAAX_N@Z -?setKernelLaunchProperties@handler@_V1@sycl@@AEAAXAEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@detail@23@@Z ?setKernelNameBasedCachePtr@handler@_V1@sycl@@AEAAXPEAUKernelNameBasedCacheT@detail@23@@Z ?setKernelWorkGroupMem@handler@_V1@sycl@@AEAAX_K@Z ?setLocalAccessorArgHelper@handler@_V1@sycl@@AEAAXHAEAVLocalAccessorBaseHost@detail@23@@Z From 9dac6d0f5f160e0cd537497f8c0b8a55d2f41be5 Mon Sep 17 00:00:00 2001 From: "Agarwal, Udit" Date: Tue, 28 Oct 2025 18:19:33 +0100 Subject: [PATCH 12/18] Address feedback --- .../sycl/detail/kernel_launch_helper.hpp | 46 +++++++++------ sycl/include/sycl/handler.hpp | 58 ++++++++----------- sycl/source/detail/kernel_data.hpp | 31 ++++------ sycl/test/abi/sycl_symbols_linux.dump | 1 + sycl/test/abi/sycl_symbols_windows.dump | 1 + 5 files changed, 65 insertions(+), 72 deletions(-) diff --git a/sycl/include/sycl/detail/kernel_launch_helper.hpp b/sycl/include/sycl/detail/kernel_launch_helper.hpp index af26b611c3b6f..8d1ab555df2b0 100644 --- a/sycl/include/sycl/detail/kernel_launch_helper.hpp +++ b/sycl/include/sycl/detail/kernel_launch_helper.hpp @@ -280,19 +280,19 @@ struct MarshalledProperty< MarshalledProperty() = default; }; -// Specialization for use_root_sync_key property. -template <> -struct MarshalledProperty { +// Generic implementation for properties with non-template value_t. +template +struct MarshalledProperty> { - bool isRootSyncPropPresent = false; + bool present = false; template MarshalledProperty(const InputPropertyTy &Props) { using namespace sycl::ext::oneapi::experimental; - (void)Props; - isRootSyncPropPresent = - InputPropertyTy::template has_property(); + + present = InputPropertyTy::template has_property(); } MarshalledProperty() = default; @@ -314,9 +314,7 @@ struct MarshalledProperty< std::array, 3> MForwardProgressProperties; - template >> + template MarshalledProperty(const InputPropertyTy &Props) { using namespace sycl::ext::oneapi::experimental; (void)Props; @@ -345,11 +343,22 @@ struct MarshalledProperty< }; template struct PropsHolder : MarshalledProperty... { + bool Empty = true; - template - PropsHolder(PropertiesT Props) : MarshalledProperty(Props)... {} + template >> + PropsHolder(PropertiesT Props) + : MarshalledProperty(Props)..., + Empty(((!PropertiesT::template has_property() && ...))) {} PropsHolder() = default; + + operator bool() const { return !Empty; } + + template constexpr auto get() const { + return static_cast *>(this); + } }; using KernelPropertyHolderStructTy = @@ -368,7 +377,8 @@ using KernelPropertyHolderStructTy = template >> -constexpr auto processKernelProperties(PropertiesT Props) { +constexpr KernelPropertyHolderStructTy +processKernelProperties(PropertiesT Props) { static_assert( !PropertiesT::template has_property< sycl::ext::intel::experimental::fp_control_key>() || @@ -388,20 +398,20 @@ constexpr auto processKernelProperties(PropertiesT Props) { // Returns KernelLaunchPropertiesTy or std::nullopt based on whether the // kernel functor has a get method that returns properties. template -constexpr std::optional +constexpr KernelPropertyHolderStructTy parseProperties([[maybe_unused]] const KernelType &KernelFunc) { + + KernelPropertyHolderStructTy props; #ifndef __SYCL_DEVICE_ONLY__ // If there are properties provided by get method then process them. if constexpr (ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod< const KernelType &>::value) { - return processKernelProperties( + props = processKernelProperties( KernelFunc.get(ext::oneapi::experimental::properties_tag{})); } #endif - // If there are no properties provided by get method then return empty - // optional. - return std::nullopt; + return props; } } // namespace kernel_launch_properties_v1 diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index bf003be86e80a..db2ac207482a6 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -902,9 +902,7 @@ class __SYCL_EXPORT handler { /// Stores information about kernel properties into the handler. template void processLaunchProperties(PropertiesT Props) { - detail::KernelPropertyHolderStructTy ParsedProp = - detail::processKernelProperties(Props); - setKernelLaunchProperties(ParsedProp); + CheckAndSetKernelLaunchProperties(detail::processKernelProperties(Props)); } /// Process kernel properties. @@ -921,7 +919,7 @@ class __SYCL_EXPORT handler { void processProperties(PropertiesT Props) { detail::KernelPropertyHolderStructTy ParsedProp = detail::processKernelProperties(Props); - setKernelLaunchProperties(ParsedProp); + CheckAndSetKernelLaunchProperties(ParsedProp); } #endif // INTEL_PREVIEW_BREAKING_CHANGES @@ -1231,10 +1229,8 @@ class __SYCL_EXPORT handler { decltype(Wrapper), TransformedArgType, PropertiesT>::wrap(Wrapper); - if (auto prop = detail::parseProperties(Wrapper)) { - setKernelLaunchProperties(*prop); - } - + auto prop = detail::parseProperties(Wrapper); + CheckAndSetKernelLaunchProperties(prop); #ifndef __SYCL_DEVICE_ONLY__ verifyUsedKernelBundleInternal(Info.Name); // We are executing over the rounded range, but there are still @@ -1258,15 +1254,12 @@ class __SYCL_EXPORT handler { // kernel is generated detail::KernelWrapper::wrap(KernelFunc); - if (auto prop = - detail::parseProperties(KernelFunc)) { - setKernelLaunchProperties(*prop); - } + auto prop = detail::parseProperties(KernelFunc); + CheckAndSetKernelLaunchProperties(prop); #ifndef __SYCL_DEVICE_ONLY__ verifyUsedKernelBundleInternal(Info.Name); - detail::KernelPropertyHolderStructTy ProcessedProps = - detail::processKernelProperties(Props); - setKernelLaunchProperties(ProcessedProps); + CheckAndSetKernelLaunchProperties( + detail::processKernelProperties(Props)); detail::checkValueRange(UserRange); setNDRangeDescriptor(std::move(UserRange)); StoreLambda( @@ -1295,9 +1288,7 @@ class __SYCL_EXPORT handler { setDeviceKernelInfo(std::move(Kernel)); detail::checkValueRange(NumWorkItems); setNDRangeDescriptor(std::move(NumWorkItems)); - detail::KernelPropertyHolderStructTy ParsedProp = - detail::processKernelProperties(Props); - setKernelLaunchProperties(ParsedProp); + CheckAndSetKernelLaunchProperties(detail::processKernelProperties(Props)); extractArgsAndReqs(); #endif } @@ -1320,9 +1311,7 @@ class __SYCL_EXPORT handler { setDeviceKernelInfo(std::move(Kernel)); detail::checkValueRange(NDRange); setNDRangeDescriptor(std::move(NDRange)); - detail::KernelPropertyHolderStructTy ParsedProp = - detail::processKernelProperties(Props); - setKernelLaunchProperties(ParsedProp); + CheckAndSetKernelLaunchProperties(detail::processKernelProperties(Props)); extractArgsAndReqs(); #endif } @@ -1342,9 +1331,8 @@ class __SYCL_EXPORT handler { constexpr auto Info = detail::CompileTimeKernelInfo; detail::KernelWrapper::wrap(KernelFunc); - if (auto prop = detail::parseProperties(KernelFunc)) { - setKernelLaunchProperties(*prop); - } + auto prop = detail::parseProperties(KernelFunc); + CheckAndSetKernelLaunchProperties(prop); #ifndef __SYCL_DEVICE_ONLY__ if constexpr (WrapAsVal == detail::WrapAs::single_task) { throwOnKernelParameterMisuse(Info); @@ -1361,9 +1349,8 @@ class __SYCL_EXPORT handler { } StoreLambda(std::move(KernelFunc)); - detail::KernelPropertyHolderStructTy ProcessedProps = - detail::processKernelProperties(Props); - setKernelLaunchProperties(ProcessedProps); + CheckAndSetKernelLaunchProperties( + detail::processKernelProperties(Props)); #endif } @@ -1386,9 +1373,8 @@ class __SYCL_EXPORT handler { (void)Kernel; detail::KernelWrapper::wrap(KernelFunc); - if (auto prop = detail::parseProperties(KernelFunc)) { - setKernelLaunchProperties(*prop); - } + CheckAndSetKernelLaunchProperties( + detail::parseProperties(KernelFunc)); #ifndef __SYCL_DEVICE_ONLY__ constexpr auto Info = detail::CompileTimeKernelInfo; if constexpr (WrapAsVal == detail::WrapAs::single_task) { @@ -1415,9 +1401,8 @@ class __SYCL_EXPORT handler { "the kernel name must match the name of the lambda"); } StoreLambda(std::move(KernelFunc)); - detail::KernelPropertyHolderStructTy ProcessedProps = - detail::processKernelProperties(Props); - setKernelLaunchProperties(ProcessedProps); + CheckAndSetKernelLaunchProperties( + detail::processKernelProperties(Props)); #endif } #endif // __INTEL_PREVIEW_BREAKING_CHANGES @@ -3467,6 +3452,13 @@ class __SYCL_EXPORT handler { void setKernelLaunchProperties( const detail::KernelPropertyHolderStructTy &KernelLaunchProperties); + inline void CheckAndSetKernelLaunchProperties( + const detail::KernelPropertyHolderStructTy &KernelLaunchProperties) { + + if (KernelLaunchProperties) + setKernelLaunchProperties(KernelLaunchProperties); + } + // Various checks that are only meaningful for host compilation, because they // result in runtime errors (i.e. exceptions being thrown). To save time // during device compilations (by reducing amount of templates we have to diff --git a/sycl/source/detail/kernel_data.hpp b/sycl/source/detail/kernel_data.hpp index 9e838aaf10154..c1289dc98ebf4 100644 --- a/sycl/source/detail/kernel_data.hpp +++ b/sycl/source/detail/kernel_data.hpp @@ -210,31 +210,20 @@ class KernelData { } void validateAndSetKernelLaunchProperties( - const detail::KernelPropertyHolderStructTy Kprop, bool HasGraph, + const detail::KernelPropertyHolderStructTy &Kprop, bool HasGraph, const device_impl &dev) { using execScope = ext::oneapi::experimental::execution_scope; using namespace sycl::ext::oneapi::experimental; using namespace sycl::ext::oneapi::experimental::detail; + using namespace sycl::ext::intel::experimental; - const auto *WorkGroupMemSizeProp = - static_cast *>( - &Kprop); - const auto *CacheConfigProp = static_cast *>(&Kprop); - const auto *UseRootSyncProp = - static_cast *>(&Kprop); - const auto *ForwardProgressProp = - static_cast *>( - &Kprop); - const auto *ClusterLaunchPropDim1 = - static_cast> *>( - &Kprop); - const auto *ClusterLaunchPropDim2 = - static_cast> *>( - &Kprop); - const auto *ClusterLaunchPropDim3 = - static_cast> *>( - &Kprop); + const auto *WorkGroupMemSizeProp = Kprop.get(); + const auto *CacheConfigProp = Kprop.get(); + const auto *UseRootSyncProp = Kprop.get(); + const auto *ForwardProgressProp = Kprop.get(); + const auto *ClusterLaunchPropDim1 = Kprop.get>(); + const auto *ClusterLaunchPropDim2 = Kprop.get>(); + const auto *ClusterLaunchPropDim3 = Kprop.get>(); const bool isClusterDimPropPresent = ClusterLaunchPropDim1->property || ClusterLaunchPropDim2->property || @@ -292,7 +281,7 @@ class KernelData { } } - if (UseRootSyncProp->isRootSyncPropPresent) + if (UseRootSyncProp->present) setCooperative(true); if (CacheConfigProp->property) diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 032b82ae74293..4e326191c67a7 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3618,6 +3618,7 @@ _ZN4sycl3_V17handler24ext_oneapi_memset2d_implEPvmimm _ZN4sycl3_V17handler24registerDynamicParameterEPNS0_3ext6oneapi12experimental6detail22dynamic_parameter_implEi _ZN4sycl3_V17handler24registerDynamicParameterERNS0_3ext6oneapi12experimental6detail22dynamic_parameter_baseEi _ZN4sycl3_V17handler25ext_intel_write_host_pipeENS0_6detail11string_viewEPvmb +_ZN4sycl3_V17handler25setKernelLaunchPropertiesERKNS0_6detail27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENS5_5intel12experimental12cache_configENS7_17use_root_sync_keyENS7_23work_group_progress_keyENS7_4cuda12cluster_sizeILi1EEENSF_ILi2EEENSF_ILi3EEEEEE _ZN4sycl3_V17handler26associateWithHandlerCommonESt10shared_ptrINS0_6detail16AccessorImplHostEEi _ZN4sycl3_V17handler26setKernelNameBasedCachePtrEPNS0_6detail21KernelNameBasedCacheTE _ZN4sycl3_V17handler26setNDRangeDescriptorPaddedENS0_5rangeILi3EEENS0_2idILi3EEEi diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index d5f53a5bbb505..0ed50b4d4e2e8 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -4428,6 +4428,7 @@ ?setKernelFunc@handler@_V1@sycl@@AEAAXPEAX@Z ?setKernelInfo@handler@_V1@sycl@@AEAAXPEAXHP6A?AUkernel_param_desc_t@detail@23@H@Z_N2@Z ?setKernelIsCooperative@handler@_V1@sycl@@AEAAX_N@Z +?setKernelLaunchProperties@handler@_V1@sycl@@AEAAXAEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@detail@23@@Z ?setKernelNameBasedCachePtr@handler@_V1@sycl@@AEAAXPEAUKernelNameBasedCacheT@detail@23@@Z ?setKernelWorkGroupMem@handler@_V1@sycl@@AEAAX_K@Z ?setLocalAccessorArgHelper@handler@_V1@sycl@@AEAAXHAEAVLocalAccessorBaseHost@detail@23@@Z From 8afc2e9c42682d4e0d321458e72aa4632e184b23 Mon Sep 17 00:00:00 2001 From: "Agarwal, Udit" Date: Tue, 28 Oct 2025 22:42:29 +0100 Subject: [PATCH 13/18] Fix bug with forward progress property parsing. Address feedback --- .../sycl/detail/kernel_launch_helper.hpp | 75 +++----- .../oneapi/kernel_properties/properties.hpp | 11 ++ sycl/include/sycl/handler.hpp | 19 +- sycl/source/detail/kernel_data.hpp | 167 ++++++++++-------- 4 files changed, 138 insertions(+), 134 deletions(-) diff --git a/sycl/include/sycl/detail/kernel_launch_helper.hpp b/sycl/include/sycl/detail/kernel_launch_helper.hpp index 8d1ab555df2b0..d9a8d0312b30e 100644 --- a/sycl/include/sycl/detail/kernel_launch_helper.hpp +++ b/sycl/include/sycl/detail/kernel_launch_helper.hpp @@ -268,13 +268,13 @@ struct MarshalledProperty< PropertyTy, std::enable_if_t && std::is_same_v>> { - std::optional property; + std::optional MProperty; template MarshalledProperty(const InputPropertyTy &Props) { (void)Props; if constexpr (InputPropertyTy::template has_property()) - property = Props.template get_property(); + MProperty = Props.template get_property(); } MarshalledProperty() = default; @@ -284,58 +284,40 @@ struct MarshalledProperty< template struct MarshalledProperty> { - - bool present = false; + bool MPresent = false; template - MarshalledProperty(const InputPropertyTy &Props) { + MarshalledProperty(const InputPropertyTy &) { using namespace sycl::ext::oneapi::experimental; - (void)Props; - - present = InputPropertyTy::template has_property(); + MPresent = InputPropertyTy::template has_property< + sycl::ext::oneapi::experimental::use_root_sync_key>(); } MarshalledProperty() = default; }; // Specialization for work group progress property. -template <> +template struct MarshalledProperty< - sycl::ext::oneapi::experimental::work_group_progress_key> { + PropertyTy, + std::enable_if_t::value>> { - struct ScopeForwardProgressProperty { - sycl::ext::oneapi::experimental::forward_progress_guarantee Guarantee; - sycl::ext::oneapi::experimental::execution_scope ExecScope; - sycl::ext::oneapi::experimental::execution_scope CoordinationScope; - }; + using forward_progress_guarantee = + sycl::ext::oneapi::experimental::forward_progress_guarantee; + using execution_scope = sycl::ext::oneapi::experimental::execution_scope; - // Forward progress guarantee properties for work_item, sub_group and - // work_group scopes. We need to store them for validation later. - std::array, 3> - MForwardProgressProperties; + std::optional MFPGuarantee; + std::optional MFPCoordinationScope; template MarshalledProperty(const InputPropertyTy &Props) { - using namespace sycl::ext::oneapi::experimental; (void)Props; - if constexpr (InputPropertyTy::template has_property< - work_group_progress_key>()) { - auto prop = Props.template get_property(); - MForwardProgressProperties[0] = { - prop.guarantee, execution_scope::work_group, prop.coordinationScope}; - } - if constexpr (InputPropertyTy::template has_property< - sub_group_progress_key>()) { - auto prop = Props.template get_property(); - MForwardProgressProperties[1] = { - prop.guarantee, execution_scope::sub_group, prop.coordinationScope}; - } - if constexpr (InputPropertyTy::template has_property< - work_item_progress_key>()) { - auto prop = Props.template get_property(); - MForwardProgressProperties[2] = { - prop.guarantee, execution_scope::work_item, prop.coordinationScope}; + if constexpr (InputPropertyTy::template has_property()) { + MFPGuarantee = Props.template get_property().guarantee; + MFPCoordinationScope = + Props.template get_property().coordinationScope; } } @@ -343,18 +325,18 @@ struct MarshalledProperty< }; template struct PropsHolder : MarshalledProperty... { - bool Empty = true; + bool MEmpty = true; template >> PropsHolder(PropertiesT Props) : MarshalledProperty(Props)..., - Empty(((!PropertiesT::template has_property() && ...))) {} + MEmpty(((!PropertiesT::template has_property() && ...))) {} PropsHolder() = default; - operator bool() const { return !Empty; } + constexpr bool isEmpty() const { return MEmpty; } template constexpr auto get() const { return static_cast *>(this); @@ -366,6 +348,8 @@ using KernelPropertyHolderStructTy = sycl::ext::intel::experimental::cache_config_key, sycl::ext::oneapi::experimental::use_root_sync_key, sycl::ext::oneapi::experimental::work_group_progress_key, + sycl::ext::oneapi::experimental::sub_group_progress_key, + sycl::ext::oneapi::experimental::work_item_progress_key, sycl::ext::oneapi::experimental::cuda::cluster_size_key<1>, sycl::ext::oneapi::experimental::cuda::cluster_size_key<2>, sycl::ext::oneapi::experimental::cuda::cluster_size_key<3>>; @@ -378,7 +362,7 @@ template >> constexpr KernelPropertyHolderStructTy -processKernelProperties(PropertiesT Props) { +extractKernelProperties(PropertiesT Props) { static_assert( !PropertiesT::template has_property< sycl::ext::intel::experimental::fp_control_key>() || @@ -391,26 +375,21 @@ processKernelProperties(PropertiesT Props) { sycl::ext::oneapi::experimental::indirectly_callable_key>(), "indirectly_callable property cannot be applied to SYCL kernels"); - KernelPropertyHolderStructTy prop(Props); - return prop; + return KernelPropertyHolderStructTy(Props); } -// Returns KernelLaunchPropertiesTy or std::nullopt based on whether the -// kernel functor has a get method that returns properties. template constexpr KernelPropertyHolderStructTy parseProperties([[maybe_unused]] const KernelType &KernelFunc) { KernelPropertyHolderStructTy props; -#ifndef __SYCL_DEVICE_ONLY__ // If there are properties provided by get method then process them. if constexpr (ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod< const KernelType &>::value) { - props = processKernelProperties( + props = extractKernelProperties( KernelFunc.get(ext::oneapi::experimental::properties_tag{})); } -#endif return props; } } // namespace kernel_launch_properties_v1 diff --git a/sycl/include/sycl/ext/oneapi/kernel_properties/properties.hpp b/sycl/include/sycl/ext/oneapi/kernel_properties/properties.hpp index 0b50f72d7019e..e61ab3f0e58fa 100644 --- a/sycl/include/sycl/ext/oneapi/kernel_properties/properties.hpp +++ b/sycl/include/sycl/ext/oneapi/kernel_properties/properties.hpp @@ -329,6 +329,17 @@ inline constexpr work_item_progress_key::value_t namespace detail { +// Helper to identify if a property is a forward progress property. +template +struct is_forward_progress_property : std::false_type {}; +template <> +struct is_forward_progress_property : std::true_type { +}; +template <> +struct is_forward_progress_property : std::true_type {}; +template <> +struct is_forward_progress_property : std::true_type {}; + template struct HasCompileTimeEffect> : std::true_type {}; diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index db2ac207482a6..90c34db935fa1 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -902,7 +902,7 @@ class __SYCL_EXPORT handler { /// Stores information about kernel properties into the handler. template void processLaunchProperties(PropertiesT Props) { - CheckAndSetKernelLaunchProperties(detail::processKernelProperties(Props)); + CheckAndSetKernelLaunchProperties(detail::extractKernelProperties(Props)); } /// Process kernel properties. @@ -918,7 +918,7 @@ class __SYCL_EXPORT handler { typename PropertiesT = ext::oneapi::experimental::empty_properties_t> void processProperties(PropertiesT Props) { detail::KernelPropertyHolderStructTy ParsedProp = - detail::processKernelProperties(Props); + detail::extractKernelProperties(Props); CheckAndSetKernelLaunchProperties(ParsedProp); } #endif // INTEL_PREVIEW_BREAKING_CHANGES @@ -1259,7 +1259,7 @@ class __SYCL_EXPORT handler { #ifndef __SYCL_DEVICE_ONLY__ verifyUsedKernelBundleInternal(Info.Name); CheckAndSetKernelLaunchProperties( - detail::processKernelProperties(Props)); + detail::extractKernelProperties(Props)); detail::checkValueRange(UserRange); setNDRangeDescriptor(std::move(UserRange)); StoreLambda( @@ -1288,7 +1288,7 @@ class __SYCL_EXPORT handler { setDeviceKernelInfo(std::move(Kernel)); detail::checkValueRange(NumWorkItems); setNDRangeDescriptor(std::move(NumWorkItems)); - CheckAndSetKernelLaunchProperties(detail::processKernelProperties(Props)); + CheckAndSetKernelLaunchProperties(detail::extractKernelProperties(Props)); extractArgsAndReqs(); #endif } @@ -1311,7 +1311,7 @@ class __SYCL_EXPORT handler { setDeviceKernelInfo(std::move(Kernel)); detail::checkValueRange(NDRange); setNDRangeDescriptor(std::move(NDRange)); - CheckAndSetKernelLaunchProperties(detail::processKernelProperties(Props)); + CheckAndSetKernelLaunchProperties(detail::extractKernelProperties(Props)); extractArgsAndReqs(); #endif } @@ -1350,7 +1350,7 @@ class __SYCL_EXPORT handler { StoreLambda(std::move(KernelFunc)); CheckAndSetKernelLaunchProperties( - detail::processKernelProperties(Props)); + detail::extractKernelProperties(Props)); #endif } @@ -1402,7 +1402,7 @@ class __SYCL_EXPORT handler { } StoreLambda(std::move(KernelFunc)); CheckAndSetKernelLaunchProperties( - detail::processKernelProperties(Props)); + detail::extractKernelProperties(Props)); #endif } #endif // __INTEL_PREVIEW_BREAKING_CHANGES @@ -3454,9 +3454,12 @@ class __SYCL_EXPORT handler { inline void CheckAndSetKernelLaunchProperties( const detail::KernelPropertyHolderStructTy &KernelLaunchProperties) { + (void)KernelLaunchProperties; - if (KernelLaunchProperties) +#ifndef __SYCL_DEVICE_ONLY__ + if (!KernelLaunchProperties.isEmpty()) setKernelLaunchProperties(KernelLaunchProperties); +#endif } // Various checks that are only meaningful for host compilation, because they diff --git a/sycl/source/detail/kernel_data.hpp b/sycl/source/detail/kernel_data.hpp index c1289dc98ebf4..676c1dfd4a1a6 100644 --- a/sycl/source/detail/kernel_data.hpp +++ b/sycl/source/detail/kernel_data.hpp @@ -183,7 +183,9 @@ class KernelData { } else if (prop == large_data) { CacheConfig = ur_kernel_cache_config_t::UR_KERNEL_CACHE_CONFIG_LARGE_DATA; } else - assert(false && "unknown cache property type"); + throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), + "Unknown cache property type encountered in " + "parseAndSetCacheConfigProperty."); MKernelCacheConfig = CacheConfig; } @@ -191,110 +193,119 @@ class KernelData { template void parseAndSetClusterDimProperty( const std::optional> &prop) { - if (prop) { - static_assert(ClusterDims < 4 && ClusterDims > 0, - "Invalid cluster dimensions"); - - auto ClusterSize = prop->get_cluster_size(); - MKernelUsesClusterLaunch = true; - - if constexpr (ClusterDims == 1) - MNDRDesc.setClusterDimensions(sycl::range<1>{ClusterSize[0]}); - else if constexpr (ClusterDims == 2) - MNDRDesc.setClusterDimensions( - sycl::range<2>{ClusterSize[0], ClusterSize[1]}); - else if constexpr (ClusterDims == 3) - MNDRDesc.setClusterDimensions( - sycl::range<3>{ClusterSize[0], ClusterSize[1], ClusterSize[2]}); - } + static_assert(ClusterDims < 4 && ClusterDims > 0, + "ClusterDims must be 1, 2, or 3"); + + auto ClusterSize = prop->get_cluster_size(); + MKernelUsesClusterLaunch = true; + + if constexpr (ClusterDims == 1) + MNDRDesc.setClusterDimensions(sycl::range<1>{ClusterSize[0]}); + else if constexpr (ClusterDims == 2) + MNDRDesc.setClusterDimensions( + sycl::range<2>{ClusterSize[0], ClusterSize[1]}); + else if constexpr (ClusterDims == 3) + MNDRDesc.setClusterDimensions( + sycl::range<3>{ClusterSize[0], ClusterSize[1], ClusterSize[2]}); } - void validateAndSetKernelLaunchProperties( - const detail::KernelPropertyHolderStructTy &Kprop, bool HasGraph, - const device_impl &dev) { - using execScope = ext::oneapi::experimental::execution_scope; + void validateProperties(const detail::KernelPropertyHolderStructTy &Kprop, + bool HasGraph, const device_impl &dev) const { + using ExecScopeTy = ext::oneapi::experimental::execution_scope; using namespace sycl::ext::oneapi::experimental; using namespace sycl::ext::oneapi::experimental::detail; using namespace sycl::ext::intel::experimental; - const auto *WorkGroupMemSizeProp = Kprop.get(); - const auto *CacheConfigProp = Kprop.get(); - const auto *UseRootSyncProp = Kprop.get(); - const auto *ForwardProgressProp = Kprop.get(); - const auto *ClusterLaunchPropDim1 = Kprop.get>(); - const auto *ClusterLaunchPropDim2 = Kprop.get>(); - const auto *ClusterLaunchPropDim3 = Kprop.get>(); - - const bool isClusterDimPropPresent = ClusterLaunchPropDim1->property || - ClusterLaunchPropDim2->property || - ClusterLaunchPropDim3->property; - // Early validation for graph-incompatible properties if (HasGraph) { - if (WorkGroupMemSizeProp->property) { + if (Kprop.get()->MProperty) { throw sycl::exception( sycl::make_error_code(errc::invalid), "Setting work group scratch memory size is not yet supported " "for use with the SYCL Graph extension."); } - if (isClusterDimPropPresent) { + if (Kprop.get>()->MProperty || + Kprop.get>()->MProperty || + Kprop.get>()->MProperty) { throw sycl::exception(sycl::make_error_code(errc::invalid), "Cluster launch is not yet supported " "for use with the SYCL Graph extension."); } } - // Validate and set forward progress guarantees. - for (int i = 0; i < 3; i++) { - if (ForwardProgressProp->MForwardProgressProperties[i].has_value()) { - - if (!dev.supportsForwardProgress( - ForwardProgressProp->MForwardProgressProperties[i]->Guarantee, - ForwardProgressProp->MForwardProgressProperties[i]->ExecScope, - ForwardProgressProp->MForwardProgressProperties[i] - ->CoordinationScope)) { - throw sycl::exception( - sycl::make_error_code(errc::feature_not_supported), - "The device associated with the queue does not support the " - "requested forward progress guarantee."); - } - - auto execScope = - ForwardProgressProp->MForwardProgressProperties[i]->ExecScope; - // If we are here, the device supports the guarantee required but - // there is a caveat in that if the guarantee required is a concurrent - // guarantee, then we most likely also need to enable cooperative - // launch of the kernel. That is, although the device supports the - // required guarantee, some setup work is needed to truly make the - // device provide that guarantee at runtime. Otherwise, we will get - // the default guarantee which is weaker than concurrent. Same - // reasoning applies for sub_group but not for work_item. - // TODO: Further design work is probably needed to reflect this - // behavior in Unified Runtime. - if ((execScope == execScope::work_group || - execScope == execScope::sub_group) && - (ForwardProgressProp->MForwardProgressProperties[i]->Guarantee == - forward_progress_guarantee::concurrent)) { - setCooperative(true); - } + // Validate forward progress guarantees. + auto ForwardProgressPropValidator = [&](auto Guarantee, auto ExecScope, + auto CoordScope) { + if (Guarantee && + !dev.supportsForwardProgress(*Guarantee, ExecScope, *CoordScope)) { + throw sycl::exception( + sycl::make_error_code(errc::feature_not_supported), + "The device associated with the queue does not support the " + "requested forward progress guarantee."); } + }; + + const auto *FPWorkGroupProp = Kprop.get(); + const auto *FPSubGroupProp = Kprop.get(); + const auto *FPWorkItemProp = Kprop.get(); + + ForwardProgressPropValidator(FPWorkGroupProp->MFPGuarantee, + ExecScopeTy::work_group, + FPWorkGroupProp->MFPCoordinationScope); + ForwardProgressPropValidator(FPSubGroupProp->MFPGuarantee, + ExecScopeTy::sub_group, + FPSubGroupProp->MFPCoordinationScope); + ForwardProgressPropValidator(FPWorkItemProp->MFPGuarantee, + ExecScopeTy::work_item, + FPWorkItemProp->MFPCoordinationScope); + } + + void validateAndSetKernelLaunchProperties( + const detail::KernelPropertyHolderStructTy &Kprop, bool HasGraph, + const device_impl &dev) { + using namespace sycl::ext::oneapi::experimental; + using namespace sycl::ext::oneapi::experimental::detail; + using namespace sycl::ext::intel::experimental; + + validateProperties(Kprop, HasGraph, dev); + + // If we are here, the device supports the guarantee required but + // there is a caveat in that if the guarantee required is a concurrent + // guarantee, then we most likely also need to enable cooperative + // launch of the kernel. That is, although the device supports the + // required guarantee, some setup work is needed to truly make the + // device provide that guarantee at runtime. Otherwise, we will get + // the default guarantee which is weaker than concurrent. Same + // reasoning applies for sub_group but not for work_item. + // TODO: Further design work is probably needed to reflect this + // behavior in Unified Runtime. + const auto *FPWorkGroupProp = Kprop.get(); + const auto *FPSubGroupProp = Kprop.get(); + if ((Kprop.get()->MFPGuarantee && + *(FPWorkGroupProp->MFPGuarantee) == + forward_progress_guarantee::concurrent) || + (FPSubGroupProp->MFPGuarantee && + *FPSubGroupProp->MFPGuarantee == + forward_progress_guarantee::concurrent)) { + setCooperative(true); } - if (UseRootSyncProp->present) + if (Kprop.get()->MPresent) setCooperative(true); - if (CacheConfigProp->property) - parseAndSetCacheConfigProperty(*(CacheConfigProp->property)); + if (auto prop = Kprop.get()->MProperty) + parseAndSetCacheConfigProperty(*prop); - if (WorkGroupMemSizeProp->property) - setKernelWorkGroupMemorySize((*WorkGroupMemSizeProp->property).size); + if (auto prop = Kprop.get()->MProperty) + setKernelWorkGroupMemorySize(prop->size); - if (isClusterDimPropPresent) { - parseAndSetClusterDimProperty(ClusterLaunchPropDim1->property); - parseAndSetClusterDimProperty(ClusterLaunchPropDim2->property); - parseAndSetClusterDimProperty(ClusterLaunchPropDim3->property); - } + parseAndSetClusterDimProperty( + Kprop.get>()->MProperty); + parseAndSetClusterDimProperty( + Kprop.get>()->MProperty); + parseAndSetClusterDimProperty( + Kprop.get>()->MProperty); } KernelNameStrRefT getKernelName() const { From 1a0c5d0c65e5d3e5d43af2c71cac74590375a647 Mon Sep 17 00:00:00 2001 From: "Agarwal, Udit" Date: Wed, 29 Oct 2025 16:32:25 +0100 Subject: [PATCH 14/18] Add ABI layout LIT test for property holder struct --- sycl/test/abi/layout_property_holder.cpp | 173 +++++++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 sycl/test/abi/layout_property_holder.cpp diff --git a/sycl/test/abi/layout_property_holder.cpp b/sycl/test/abi/layout_property_holder.cpp new file mode 100644 index 0000000000000..8d29654650d72 --- /dev/null +++ b/sycl/test/abi/layout_property_holder.cpp @@ -0,0 +1,173 @@ +// RUN: %clangxx -fsycl -c -fno-color-diagnostics -Xclang -fdump-record-layouts %s -o %t.out | FileCheck %s +// RUN: %clangxx -fsycl -fsycl-device-only -c -fno-color-diagnostics -Xclang -fdump-record-layouts %s -o %t.out | FileCheck %s + +// REQUIRES: linux +// UNSUPPORTED: libcxx + +// clang-format off + +#include + +// CHECK: *** Dumping AST Record Layout +// CHECK: 0 | struct sycl::detail::PropsHolder, struct sycl::ext::oneapi::experimental::cuda::cluster_size<2>, struct sycl::ext::oneapi::experimental::cuda::cluster_size<3> > +// CHECK: 0 | struct sycl::detail::MarshalledProperty (base) +// CHECK: 0 | class std::optional MProperty +// CHECK: 0 | struct std::_Optional_base (base) +// CHECK: 0 | class std::_Optional_base_impl > (base) (empty) +// CHECK: 0 | struct std::_Optional_payload _M_payload +// CHECK: 0 | struct std::_Optional_payload_base (base) +// CHECK: 0 | union std::_Optional_payload_base::_Storage _M_payload +// CHECK: 0 | struct std::_Optional_payload_base::_Empty_byte _M_empty (empty) +// CHECK: 0 | struct sycl::ext::oneapi::experimental::work_group_scratch_size _M_value +// CHECK: 0 | struct sycl::ext::oneapi::experimental::detail::run_time_property_key (base) (empty) +// CHECK: 0 | struct sycl::ext::oneapi::experimental::detail::property_key_base_tag (base) (empty) +// CHECK: 0 | struct sycl::ext::oneapi::experimental::detail::property_base (base) (empty) +// CHECK: 0 | struct sycl::ext::oneapi::experimental::detail::property_key_tag (base) (empty) +// CHECK: 0 | struct sycl::ext::oneapi::experimental::detail::property_tag (base) (empty) +// CHECK: 0 | size_t size +// CHECK: 8 | _Bool _M_engaged +// CHECK: 0 | struct std::_Enable_copy_move > (base) (empty) +// CHECK: 16 | struct sycl::detail::MarshalledProperty (base) +// CHECK: 16 | class std::optional MProperty +// CHECK: 16 | struct std::_Optional_base (base) +// CHECK: 16 | class std::_Optional_base_impl > (base) (empty) +// CHECK: 16 | struct std::_Optional_payload _M_payload +// CHECK: 16 | struct std::_Optional_payload_base (base) +// CHECK: 16 | union std::_Optional_payload_base::_Storage _M_payload +// CHECK: 16 | struct std::_Optional_payload_base::_Empty_byte _M_empty (empty) +// CHECK: 16 | struct sycl::ext::intel::experimental::cache_config _M_value +// CHECK: 16 | struct sycl::ext::oneapi::experimental::detail::run_time_property_key (base) (empty) +// CHECK: 16 | struct sycl::ext::oneapi::experimental::detail::property_key_base_tag (base) (empty) +// CHECK: 16 | struct sycl::ext::oneapi::experimental::detail::property_base (base) (empty) +// CHECK: 16 | struct sycl::ext::oneapi::experimental::detail::property_key_tag (base) (empty) +// CHECK: 16 | struct sycl::ext::oneapi::experimental::detail::property_tag (base) (empty) +// CHECK: 16 | cache_config_enum value +// CHECK: 18 | _Bool _M_engaged +// CHECK: 16 | struct std::_Enable_copy_move > (base) (empty) +// CHECK: 20 | struct sycl::detail::MarshalledProperty (base) +// CHECK: 20 | _Bool MPresent +// CHECK: 24 | struct sycl::detail::MarshalledProperty (base) +// CHECK: 24 | class std::optional MFPGuarantee +// CHECK: 24 | struct std::_Optional_base (base) +// CHECK: 24 | class std::_Optional_base_impl > (base) (empty) +// CHECK: 24 | struct std::_Optional_payload _M_payload +// CHECK: 24 | struct std::_Optional_payload_base (base) +// CHECK: 24 | union std::_Optional_payload_base::_Storage _M_payload +// CHECK: 24 | struct std::_Optional_payload_base::_Empty_byte _M_empty (empty) +// CHECK: 24 | enum sycl::ext::oneapi::experimental::forward_progress_guarantee _M_value +// CHECK: 28 | _Bool _M_engaged +// CHECK: 24 | struct std::_Enable_copy_move > (base) (empty) +// CHECK: 32 | class std::optional MFPCoordinationScope +// CHECK: 32 | struct std::_Optional_base (base) +// CHECK: 32 | class std::_Optional_base_impl > (base) (empty) +// CHECK: 32 | struct std::_Optional_payload _M_payload +// CHECK: 32 | struct std::_Optional_payload_base (base) +// CHECK: 32 | union std::_Optional_payload_base::_Storage _M_payload +// CHECK: 32 | struct std::_Optional_payload_base::_Empty_byte _M_empty (empty) +// CHECK: 32 | enum sycl::ext::oneapi::experimental::execution_scope _M_value +// CHECK: 36 | _Bool _M_engaged +// CHECK: 32 | struct std::_Enable_copy_move > (base) (empty) +// CHECK: 40 | struct sycl::detail::MarshalledProperty (base) +// CHECK: 40 | class std::optional MFPGuarantee +// CHECK: 40 | struct std::_Optional_base (base) +// CHECK: 40 | class std::_Optional_base_impl > (base) (empty) +// CHECK: 40 | struct std::_Optional_payload _M_payload +// CHECK: 40 | struct std::_Optional_payload_base (base) +// CHECK: 40 | union std::_Optional_payload_base::_Storage _M_payload +// CHECK: 40 | struct std::_Optional_payload_base::_Empty_byte _M_empty (empty) +// CHECK: 40 | enum sycl::ext::oneapi::experimental::forward_progress_guarantee _M_value +// CHECK: 44 | _Bool _M_engaged +// CHECK: 40 | struct std::_Enable_copy_move > (base) (empty) +// CHECK: 48 | class std::optional MFPCoordinationScope +// CHECK: 48 | struct std::_Optional_base (base) +// CHECK: 48 | class std::_Optional_base_impl > (base) (empty) +// CHECK: 48 | struct std::_Optional_payload _M_payload +// CHECK: 48 | struct std::_Optional_payload_base (base) +// CHECK: 48 | union std::_Optional_payload_base::_Storage _M_payload +// CHECK: 48 | struct std::_Optional_payload_base::_Empty_byte _M_empty (empty) +// CHECK: 48 | enum sycl::ext::oneapi::experimental::execution_scope _M_value +// CHECK: 52 | _Bool _M_engaged +// CHECK: 48 | struct std::_Enable_copy_move > (base) (empty) +// CHECK: 56 | struct sycl::detail::MarshalledProperty (base) +// CHECK: 56 | class std::optional MFPGuarantee +// CHECK: 56 | struct std::_Optional_base (base) +// CHECK: 56 | class std::_Optional_base_impl > (base) (empty) +// CHECK: 56 | struct std::_Optional_payload _M_payload +// CHECK: 56 | struct std::_Optional_payload_base (base) +// CHECK: 56 | union std::_Optional_payload_base::_Storage _M_payload +// CHECK: 56 | struct std::_Optional_payload_base::_Empty_byte _M_empty (empty) +// CHECK: 56 | enum sycl::ext::oneapi::experimental::forward_progress_guarantee _M_value +// CHECK: 60 | _Bool _M_engaged +// CHECK: 56 | struct std::_Enable_copy_move > (base) (empty) +// CHECK: 64 | class std::optional MFPCoordinationScope +// CHECK: 64 | struct std::_Optional_base (base) +// CHECK: 64 | class std::_Optional_base_impl > (base) (empty) +// CHECK: 64 | struct std::_Optional_payload _M_payload +// CHECK: 64 | struct std::_Optional_payload_base (base) +// CHECK: 64 | union std::_Optional_payload_base::_Storage _M_payload +// CHECK: 64 | struct std::_Optional_payload_base::_Empty_byte _M_empty (empty) +// CHECK: 64 | enum sycl::ext::oneapi::experimental::execution_scope _M_value +// CHECK: 68 | _Bool _M_engaged +// CHECK: 64 | struct std::_Enable_copy_move > (base) (empty) +// CHECK: 72 | struct sycl::detail::MarshalledProperty > (base) +// CHECK: 72 | class std::optional > MProperty +// CHECK: 72 | struct std::_Optional_base > (base) +// CHECK: 72 | class std::_Optional_base_impl, struct std::_Optional_base > > (base) (empty) +// CHECK: 72 | struct std::_Optional_payload > _M_payload +// CHECK: 72 | struct std::_Optional_payload_base > (base) +// CHECK: 72 | union std::_Optional_payload_base >::_Storage > _M_payload +// CHECK: 72 | struct std::_Optional_payload_base >::_Empty_byte _M_empty (empty) +// CHECK: 72 | struct sycl::ext::oneapi::experimental::cuda::cluster_size<1> _M_value +// CHECK: 72 | struct sycl::ext::oneapi::experimental::detail::run_time_property_key, sycl::ext::oneapi::experimental::detail::ClusterLaunch> (base) (empty) +// CHECK: 72 | struct sycl::ext::oneapi::experimental::detail::property_key_base_tag (base) (empty) +// CHECK: 72 | struct sycl::ext::oneapi::experimental::detail::property_base, sycl::ext::oneapi::experimental::detail::ClusterLaunch> (base) (empty) +// CHECK: 72 | struct sycl::ext::oneapi::experimental::detail::property_key_tag > (base) (empty) +// CHECK: 72 | struct sycl::ext::oneapi::experimental::detail::property_tag (base) (empty) +// CHECK: 72 | class sycl::range<> size +// CHECK: 72 | class sycl::detail::array<> (base) +// CHECK: 72 | size_t[1] common_array +// CHECK: 80 | _Bool _M_engaged +// CHECK: 72 | struct std::_Enable_copy_move > > (base) (empty) +// CHECK: 88 | struct sycl::detail::MarshalledProperty > (base) +// CHECK: 88 | class std::optional > MProperty +// CHECK: 88 | struct std::_Optional_base > (base) +// CHECK: 88 | class std::_Optional_base_impl, struct std::_Optional_base > > (base) (empty) +// CHECK: 88 | struct std::_Optional_payload > _M_payload +// CHECK: 88 | struct std::_Optional_payload_base > (base) +// CHECK: 88 | union std::_Optional_payload_base >::_Storage > _M_payload +// CHECK: 88 | struct std::_Optional_payload_base >::_Empty_byte _M_empty (empty) +// CHECK: 88 | struct sycl::ext::oneapi::experimental::cuda::cluster_size<2> _M_value +// CHECK: 88 | struct sycl::ext::oneapi::experimental::detail::run_time_property_key, sycl::ext::oneapi::experimental::detail::ClusterLaunch> (base) (empty) +// CHECK: 88 | struct sycl::ext::oneapi::experimental::detail::property_key_base_tag (base) (empty) +// CHECK: 88 | struct sycl::ext::oneapi::experimental::detail::property_base, sycl::ext::oneapi::experimental::detail::ClusterLaunch> (base) (empty) +// CHECK: 88 | struct sycl::ext::oneapi::experimental::detail::property_key_tag > (base) (empty) +// CHECK: 88 | struct sycl::ext::oneapi::experimental::detail::property_tag (base) (empty) +// CHECK: 88 | class sycl::range<2> size +// CHECK: 88 | class sycl::detail::array<2> (base) +// CHECK: 88 | size_t[2] common_array +// CHECK: 104 | _Bool _M_engaged +// CHECK: 88 | struct std::_Enable_copy_move > > (base) (empty) +// CHECK: 112 | struct sycl::detail::MarshalledProperty > (base) +// CHECK: 112 | class std::optional > MProperty +// CHECK: 112 | struct std::_Optional_base > (base) +// CHECK: 112 | class std::_Optional_base_impl, struct std::_Optional_base > > (base) (empty) +// CHECK: 112 | struct std::_Optional_payload > _M_payload +// CHECK: 112 | struct std::_Optional_payload_base > (base) +// CHECK: 112 | union std::_Optional_payload_base >::_Storage > _M_payload +// CHECK: 112 | struct std::_Optional_payload_base >::_Empty_byte _M_empty (empty) +// CHECK: 112 | struct sycl::ext::oneapi::experimental::cuda::cluster_size<3> _M_value +// CHECK: 112 | struct sycl::ext::oneapi::experimental::detail::run_time_property_key, sycl::ext::oneapi::experimental::detail::ClusterLaunch> (base) (empty) +// CHECK: 112 | struct sycl::ext::oneapi::experimental::detail::property_key_base_tag (base) (empty) +// CHECK: 112 | struct sycl::ext::oneapi::experimental::detail::property_base, sycl::ext::oneapi::experimental::detail::ClusterLaunch> (base) (empty) +// CHECK: 112 | struct sycl::ext::oneapi::experimental::detail::property_key_tag > (base) (empty) +// CHECK: 112 | struct sycl::ext::oneapi::experimental::detail::property_tag (base) (empty) +// CHECK: 112 | class sycl::range<3> size +// CHECK: 112 | class sycl::detail::array<3> (base) +// CHECK: 112 | size_t[3] common_array +// CHECK: 136 | _Bool _M_engaged +// CHECK: 112 | struct std::_Enable_copy_move > > (base) (empty) +// CHECK: 144 | _Bool MEmpty +// CHECK: | [sizeof=152, dsize=145, align=8, +// CHECK: | nvsize=145, nvalign=8] + +SYCL_EXTERNAL void foo(sycl::detail::KernelPropertyHolderStructTy prop) {} From 725e85509370acbc27208c4d0480478d3b4c9ae3 Mon Sep 17 00:00:00 2001 From: "Agarwal, Udit" Date: Wed, 29 Oct 2025 17:13:56 +0100 Subject: [PATCH 15/18] Address feedback --- .../sycl/detail/kernel_launch_helper.hpp | 22 ++----- .../oneapi/kernel_properties/properties.hpp | 11 ---- sycl/include/sycl/handler.hpp | 58 +++++++++++++------ sycl/source/detail/kernel_data.hpp | 3 + sycl/test/abi/sycl_symbols_linux.dump | 2 +- sycl/test/abi/sycl_symbols_windows.dump | 3 +- 6 files changed, 51 insertions(+), 48 deletions(-) diff --git a/sycl/include/sycl/detail/kernel_launch_helper.hpp b/sycl/include/sycl/detail/kernel_launch_helper.hpp index d9a8d0312b30e..187f3f014c0ed 100644 --- a/sycl/include/sycl/detail/kernel_launch_helper.hpp +++ b/sycl/include/sycl/detail/kernel_launch_helper.hpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -300,8 +301,10 @@ struct MarshalledProperty struct MarshalledProperty< PropertyTy, - std::enable_if_t::value>> { + std::enable_if_t>> { using forward_progress_guarantee = sycl::ext::oneapi::experimental::forward_progress_guarantee; @@ -377,21 +380,6 @@ extractKernelProperties(PropertiesT Props) { return KernelPropertyHolderStructTy(Props); } - -template -constexpr KernelPropertyHolderStructTy -parseProperties([[maybe_unused]] const KernelType &KernelFunc) { - - KernelPropertyHolderStructTy props; - // If there are properties provided by get method then process them. - if constexpr (ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod< - const KernelType &>::value) { - - props = extractKernelProperties( - KernelFunc.get(ext::oneapi::experimental::properties_tag{})); - } - return props; -} } // namespace kernel_launch_properties_v1 } // namespace detail diff --git a/sycl/include/sycl/ext/oneapi/kernel_properties/properties.hpp b/sycl/include/sycl/ext/oneapi/kernel_properties/properties.hpp index e61ab3f0e58fa..0b50f72d7019e 100644 --- a/sycl/include/sycl/ext/oneapi/kernel_properties/properties.hpp +++ b/sycl/include/sycl/ext/oneapi/kernel_properties/properties.hpp @@ -329,17 +329,6 @@ inline constexpr work_item_progress_key::value_t namespace detail { -// Helper to identify if a property is a forward progress property. -template -struct is_forward_progress_property : std::false_type {}; -template <> -struct is_forward_progress_property : std::true_type { -}; -template <> -struct is_forward_progress_property : std::true_type {}; -template <> -struct is_forward_progress_property : std::true_type {}; - template struct HasCompileTimeEffect> : std::true_type {}; diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 90c34db935fa1..d8d46d2a27814 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -902,7 +902,7 @@ class __SYCL_EXPORT handler { /// Stores information about kernel properties into the handler. template void processLaunchProperties(PropertiesT Props) { - CheckAndSetKernelLaunchProperties(detail::extractKernelProperties(Props)); + SetKernelLaunchpropertiesIfNotEmpty(detail::extractKernelProperties(Props)); } /// Process kernel properties. @@ -917,9 +917,8 @@ class __SYCL_EXPORT handler { bool IsESIMDKernel, typename PropertiesT = ext::oneapi::experimental::empty_properties_t> void processProperties(PropertiesT Props) { - detail::KernelPropertyHolderStructTy ParsedProp = - detail::extractKernelProperties(Props); - CheckAndSetKernelLaunchProperties(ParsedProp); + SetKernelLaunchpropertiesIfNotEmpty( + detail::extractKernelProperties(Props)); } #endif // INTEL_PREVIEW_BREAKING_CHANGES @@ -1229,8 +1228,13 @@ class __SYCL_EXPORT handler { decltype(Wrapper), TransformedArgType, PropertiesT>::wrap(Wrapper); - auto prop = detail::parseProperties(Wrapper); - CheckAndSetKernelLaunchProperties(prop); + if constexpr (ext::oneapi::experimental::detail:: + HasKernelPropertiesGetMethod< + decltype(Wrapper)>::value) { + SetKernelLaunchpropertiesIfNotEmpty(detail::extractKernelProperties( + Wrapper.get(ext::oneapi::experimental::properties_tag{}))); + } + #ifndef __SYCL_DEVICE_ONLY__ verifyUsedKernelBundleInternal(Info.Name); // We are executing over the rounded range, but there are still @@ -1254,11 +1258,18 @@ class __SYCL_EXPORT handler { // kernel is generated detail::KernelWrapper::wrap(KernelFunc); - auto prop = detail::parseProperties(KernelFunc); - CheckAndSetKernelLaunchProperties(prop); + + if constexpr (ext::oneapi::experimental::detail:: + HasKernelPropertiesGetMethod< + const KernelType &>::value) { + SetKernelLaunchpropertiesIfNotEmpty( + detail::extractKernelProperties( + KernelFunc.get(ext::oneapi::experimental::properties_tag{}))); + } + #ifndef __SYCL_DEVICE_ONLY__ verifyUsedKernelBundleInternal(Info.Name); - CheckAndSetKernelLaunchProperties( + SetKernelLaunchpropertiesIfNotEmpty( detail::extractKernelProperties(Props)); detail::checkValueRange(UserRange); setNDRangeDescriptor(std::move(UserRange)); @@ -1288,7 +1299,7 @@ class __SYCL_EXPORT handler { setDeviceKernelInfo(std::move(Kernel)); detail::checkValueRange(NumWorkItems); setNDRangeDescriptor(std::move(NumWorkItems)); - CheckAndSetKernelLaunchProperties(detail::extractKernelProperties(Props)); + SetKernelLaunchpropertiesIfNotEmpty(detail::extractKernelProperties(Props)); extractArgsAndReqs(); #endif } @@ -1311,7 +1322,7 @@ class __SYCL_EXPORT handler { setDeviceKernelInfo(std::move(Kernel)); detail::checkValueRange(NDRange); setNDRangeDescriptor(std::move(NDRange)); - CheckAndSetKernelLaunchProperties(detail::extractKernelProperties(Props)); + SetKernelLaunchpropertiesIfNotEmpty(detail::extractKernelProperties(Props)); extractArgsAndReqs(); #endif } @@ -1331,8 +1342,14 @@ class __SYCL_EXPORT handler { constexpr auto Info = detail::CompileTimeKernelInfo; detail::KernelWrapper::wrap(KernelFunc); - auto prop = detail::parseProperties(KernelFunc); - CheckAndSetKernelLaunchProperties(prop); + + if constexpr (ext::oneapi::experimental::detail:: + HasKernelPropertiesGetMethod::value) { + SetKernelLaunchpropertiesIfNotEmpty( + detail::extractKernelProperties( + KernelFunc.get(ext::oneapi::experimental::properties_tag{}))); + } + #ifndef __SYCL_DEVICE_ONLY__ if constexpr (WrapAsVal == detail::WrapAs::single_task) { throwOnKernelParameterMisuse(Info); @@ -1349,7 +1366,7 @@ class __SYCL_EXPORT handler { } StoreLambda(std::move(KernelFunc)); - CheckAndSetKernelLaunchProperties( + SetKernelLaunchpropertiesIfNotEmpty( detail::extractKernelProperties(Props)); #endif } @@ -1373,8 +1390,13 @@ class __SYCL_EXPORT handler { (void)Kernel; detail::KernelWrapper::wrap(KernelFunc); - CheckAndSetKernelLaunchProperties( - detail::parseProperties(KernelFunc)); + + if constexpr (ext::oneapi::experimental::detail:: + HasKernelPropertiesGetMethod::value) { + SetKernelLaunchpropertiesIfNotEmpty(detail::extractKernelProperties( + KernelFunc.get(ext::oneapi::experimental::properties_tag{}))); + } + #ifndef __SYCL_DEVICE_ONLY__ constexpr auto Info = detail::CompileTimeKernelInfo; if constexpr (WrapAsVal == detail::WrapAs::single_task) { @@ -1401,7 +1423,7 @@ class __SYCL_EXPORT handler { "the kernel name must match the name of the lambda"); } StoreLambda(std::move(KernelFunc)); - CheckAndSetKernelLaunchProperties( + SetKernelLaunchpropertiesIfNotEmpty( detail::extractKernelProperties(Props)); #endif } @@ -3452,7 +3474,7 @@ class __SYCL_EXPORT handler { void setKernelLaunchProperties( const detail::KernelPropertyHolderStructTy &KernelLaunchProperties); - inline void CheckAndSetKernelLaunchProperties( + inline constexpr void SetKernelLaunchpropertiesIfNotEmpty( const detail::KernelPropertyHolderStructTy &KernelLaunchProperties) { (void)KernelLaunchProperties; diff --git a/sycl/source/detail/kernel_data.hpp b/sycl/source/detail/kernel_data.hpp index 676c1dfd4a1a6..9f22fafa85aa4 100644 --- a/sycl/source/detail/kernel_data.hpp +++ b/sycl/source/detail/kernel_data.hpp @@ -196,6 +196,9 @@ class KernelData { static_assert(ClusterDims < 4 && ClusterDims > 0, "ClusterDims must be 1, 2, or 3"); + if (!prop.has_value()) + return; + auto ClusterSize = prop->get_cluster_size(); MKernelUsesClusterLaunch = true; diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 4e326191c67a7..bfb11f8c79fe8 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3618,7 +3618,7 @@ _ZN4sycl3_V17handler24ext_oneapi_memset2d_implEPvmimm _ZN4sycl3_V17handler24registerDynamicParameterEPNS0_3ext6oneapi12experimental6detail22dynamic_parameter_implEi _ZN4sycl3_V17handler24registerDynamicParameterERNS0_3ext6oneapi12experimental6detail22dynamic_parameter_baseEi _ZN4sycl3_V17handler25ext_intel_write_host_pipeENS0_6detail11string_viewEPvmb -_ZN4sycl3_V17handler25setKernelLaunchPropertiesERKNS0_6detail27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENS5_5intel12experimental12cache_configENS7_17use_root_sync_keyENS7_23work_group_progress_keyENS7_4cuda12cluster_sizeILi1EEENSF_ILi2EEENSF_ILi3EEEEEE +_ZN4sycl3_V17handler25setKernelLaunchPropertiesERKNS0_6detail27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENS5_5intel12experimental12cache_configENS7_17use_root_sync_keyENS7_23work_group_progress_keyENS7_22sub_group_progress_keyENS7_22work_item_progress_keyENS7_4cuda12cluster_sizeILi1EEENSH_ILi2EEENSH_ILi3EEEEEE _ZN4sycl3_V17handler26associateWithHandlerCommonESt10shared_ptrINS0_6detail16AccessorImplHostEEi _ZN4sycl3_V17handler26setKernelNameBasedCachePtrEPNS0_6detail21KernelNameBasedCacheTE _ZN4sycl3_V17handler26setNDRangeDescriptorPaddedENS0_5rangeILi3EEENS0_2idILi3EEEi diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 0ed50b4d4e2e8..1043200c0bbda 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -716,6 +716,7 @@ ??_Fcontext@_V1@sycl@@QEAAXXZ ??_Fqueue@_V1@sycl@@QEAAXXZ ?AccessTargetMask@handler@_V1@sycl@@0HB +?CheckAndSetKernelLaunchProperties@handler@_V1@sycl@@AEAAXAEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@detail@23@@Z ?Clear@exception_list@_V1@sycl@@AEAAXXZ ?DirSep@OSUtil@detail@_V1@sycl@@2QEBDEB ?DisableRangeRounding@handler@_V1@sycl@@AEAA_NXZ @@ -4428,7 +4429,7 @@ ?setKernelFunc@handler@_V1@sycl@@AEAAXPEAX@Z ?setKernelInfo@handler@_V1@sycl@@AEAAXPEAXHP6A?AUkernel_param_desc_t@detail@23@H@Z_N2@Z ?setKernelIsCooperative@handler@_V1@sycl@@AEAAX_N@Z -?setKernelLaunchProperties@handler@_V1@sycl@@AEAAXAEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@detail@23@@Z +?setKernelLaunchProperties@handler@_V1@sycl@@AEAAXAEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@detail@23@@Z ?setKernelNameBasedCachePtr@handler@_V1@sycl@@AEAAXPEAUKernelNameBasedCacheT@detail@23@@Z ?setKernelWorkGroupMem@handler@_V1@sycl@@AEAAX_K@Z ?setLocalAccessorArgHelper@handler@_V1@sycl@@AEAAXHAEAVLocalAccessorBaseHost@detail@23@@Z From fc81db167af90ea8bb1643ca69283b733e306b09 Mon Sep 17 00:00:00 2001 From: "Agarwal, Udit" Date: Wed, 29 Oct 2025 18:42:43 +0100 Subject: [PATCH 16/18] Fix WIndows ABI --- sycl/test/abi/sycl_symbols_windows.dump | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 1043200c0bbda..b2c805ac1b9d0 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -716,7 +716,6 @@ ??_Fcontext@_V1@sycl@@QEAAXXZ ??_Fqueue@_V1@sycl@@QEAAXXZ ?AccessTargetMask@handler@_V1@sycl@@0HB -?CheckAndSetKernelLaunchProperties@handler@_V1@sycl@@AEAAXAEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@detail@23@@Z ?Clear@exception_list@_V1@sycl@@AEAAXXZ ?DirSep@OSUtil@detail@_V1@sycl@@2QEBDEB ?DisableRangeRounding@handler@_V1@sycl@@AEAA_NXZ @@ -741,6 +740,7 @@ ?SecondaryQueue@SubmissionInfo@v1@detail@_V1@sycl@@QEBAAEBV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@XZ ?SetHostTask@handler@_V1@sycl@@AEAAX$$QEAV?$function@$$A6AXVinterop_handle@_V1@sycl@@@Z@std@@@Z ?SetHostTask@handler@_V1@sycl@@AEAAX$$QEAV?$function@$$A6AXXZ@std@@@Z +?SetKernelLaunchpropertiesIfNotEmpty@handler@_V1@sycl@@AEAAXAEBU?$PropsHolder@Uwork_group_scratch_size@experimental@oneapi@ext@_V1@sycl@@Ucache_config@2intel@456@Uuse_root_sync_key@23456@Uwork_group_progress_key@23456@Usub_group_progress_key@23456@Uwork_item_progress_key@23456@U?$cluster_size@$00@cuda@23456@U?$cluster_size@$01@cuda@23456@U?$cluster_size@$02@cuda@23456@@kernel_launch_properties_v1@detail@23@@Z ?__abs_diff_impl@_V1@sycl@@YA?AV?$vec@C$00@12@V312@0@Z ?__abs_diff_impl@_V1@sycl@@YA?AV?$vec@C$01@12@V312@0@Z ?__abs_diff_impl@_V1@sycl@@YA?AV?$vec@C$02@12@V312@0@Z From b924f4c459394cd2b855f6f60164278f21456502 Mon Sep 17 00:00:00 2001 From: "Agarwal, Udit" Date: Wed, 29 Oct 2025 19:15:39 +0100 Subject: [PATCH 17/18] Store KernelPropertyHolderStructTy in KenrelData --- sycl/source/detail/kernel_data.hpp | 125 +++++++++++++++-------------- 1 file changed, 64 insertions(+), 61 deletions(-) diff --git a/sycl/source/detail/kernel_data.hpp b/sycl/source/detail/kernel_data.hpp index 9f22fafa85aa4..d0a6e54caffa9 100644 --- a/sycl/source/detail/kernel_data.hpp +++ b/sycl/source/detail/kernel_data.hpp @@ -144,50 +144,86 @@ class KernelData { // Kernel launch properties getter and setters. ur_kernel_cache_config_t getKernelCacheConfig() const { - return MKernelCacheConfig; + using namespace sycl::ext::oneapi::experimental; + using namespace sycl::ext::oneapi::experimental::detail; + using namespace sycl::ext::intel::experimental; + + if (auto prop = MKernelProps.get()->MProperty) { + if (*prop == large_slm) { + return ur_kernel_cache_config_t::UR_KERNEL_CACHE_CONFIG_LARGE_SLM; + } else if (*prop == large_data) { + return ur_kernel_cache_config_t::UR_KERNEL_CACHE_CONFIG_LARGE_DATA; + } else + throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), + "Unknown cache property type encountered in " + "parseAndSetCacheConfigProperty."); + } + + return ur_kernel_cache_config_t::UR_KERNEL_CACHE_CONFIG_DEFAULT; } void setKernelCacheConfig(ur_kernel_cache_config_t Config) { - MKernelCacheConfig = Config; + // FIXME: Why we need this? } - bool isCooperative() const { return MKernelIsCooperative; } + bool isCooperative() const { + using namespace sycl::ext::oneapi::experimental; + using namespace sycl::ext::oneapi::experimental::detail; + using namespace sycl::ext::intel::experimental; + + // If we are here, the device supports the guarantee required but + // there is a caveat in that if the guarantee required is a concurrent + // guarantee, then we most likely also need to enable cooperative + // launch of the kernel. That is, although the device supports the + // required guarantee, some setup work is needed to truly make the + // device provide that guarantee at runtime. Otherwise, we will get + // the default guarantee which is weaker than concurrent. Same + // reasoning applies for sub_group but not for work_item. + // TODO: Further design work is probably needed to reflect this + // behavior in Unified Runtime. + const auto *FPWorkGroupProp = MKernelProps.get(); + const auto *FPSubGroupProp = MKernelProps.get(); + if ((MKernelProps.get()->MFPGuarantee && + *(FPWorkGroupProp->MFPGuarantee) == + forward_progress_guarantee::concurrent) || + (FPSubGroupProp->MFPGuarantee && + *FPSubGroupProp->MFPGuarantee == + forward_progress_guarantee::concurrent) || + (MKernelProps.get()->MPresent)) { + return true; + } + + return false; + } void setCooperative(bool IsCooperative) { - MKernelIsCooperative = IsCooperative; + // FIXME: Why we need this? } - bool usesClusterLaunch() const { return MKernelUsesClusterLaunch; } + bool usesClusterLaunch() const { + using namespace sycl::ext::oneapi::experimental; + using namespace sycl::ext::oneapi::experimental::detail; + using namespace sycl::ext::intel::experimental; + return MKernelProps.get>()->MProperty || + MKernelProps.get>()->MProperty || + MKernelProps.get>()->MProperty; + } template void setClusterDimensions(sycl::range N) { - MKernelUsesClusterLaunch = true; MNDRDesc.setClusterDimensions(N); } uint32_t getKernelWorkGroupMemorySize() const { - return MKernelWorkGroupMemorySize; - } + using namespace sycl::ext::oneapi::experimental; - void setKernelWorkGroupMemorySize(uint32_t Size) { - MKernelWorkGroupMemorySize = Size; - } + if (auto prop = MKernelProps.get()->MProperty) + return prop->size; - void parseAndSetCacheConfigProperty( - const sycl::ext::intel::experimental::cache_config_key &prop) { - using namespace sycl::ext::intel::experimental; + return 0; + } - ur_kernel_cache_config_t CacheConfig = - ur_kernel_cache_config_t::UR_KERNEL_CACHE_CONFIG_DEFAULT; - if (prop == large_slm) { - CacheConfig = ur_kernel_cache_config_t::UR_KERNEL_CACHE_CONFIG_LARGE_SLM; - } else if (prop == large_data) { - CacheConfig = ur_kernel_cache_config_t::UR_KERNEL_CACHE_CONFIG_LARGE_DATA; - } else - throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), - "Unknown cache property type encountered in " - "parseAndSetCacheConfigProperty."); - - MKernelCacheConfig = CacheConfig; + void setKernelWorkGroupMemorySize(uint32_t Size) { + // FIXME: Why we need this? } template @@ -200,7 +236,6 @@ class KernelData { return; auto ClusterSize = prop->get_cluster_size(); - MKernelUsesClusterLaunch = true; if constexpr (ClusterDims == 1) MNDRDesc.setClusterDimensions(sycl::range<1>{ClusterSize[0]}); @@ -273,35 +308,7 @@ class KernelData { validateProperties(Kprop, HasGraph, dev); - // If we are here, the device supports the guarantee required but - // there is a caveat in that if the guarantee required is a concurrent - // guarantee, then we most likely also need to enable cooperative - // launch of the kernel. That is, although the device supports the - // required guarantee, some setup work is needed to truly make the - // device provide that guarantee at runtime. Otherwise, we will get - // the default guarantee which is weaker than concurrent. Same - // reasoning applies for sub_group but not for work_item. - // TODO: Further design work is probably needed to reflect this - // behavior in Unified Runtime. - const auto *FPWorkGroupProp = Kprop.get(); - const auto *FPSubGroupProp = Kprop.get(); - if ((Kprop.get()->MFPGuarantee && - *(FPWorkGroupProp->MFPGuarantee) == - forward_progress_guarantee::concurrent) || - (FPSubGroupProp->MFPGuarantee && - *FPSubGroupProp->MFPGuarantee == - forward_progress_guarantee::concurrent)) { - setCooperative(true); - } - - if (Kprop.get()->MPresent) - setCooperative(true); - - if (auto prop = Kprop.get()->MProperty) - parseAndSetCacheConfigProperty(*prop); - - if (auto prop = Kprop.get()->MProperty) - setKernelWorkGroupMemorySize(prop->size); + MKernelProps = Kprop; parseAndSetClusterDimProperty( Kprop.get>()->MProperty); @@ -337,11 +344,7 @@ class KernelData { /// The list of arguments for the kernel. std::vector MArgs; - ur_kernel_cache_config_t MKernelCacheConfig = UR_KERNEL_CACHE_CONFIG_DEFAULT; - - bool MKernelIsCooperative = false; - bool MKernelUsesClusterLaunch = false; - uint32_t MKernelWorkGroupMemorySize = 0; + detail::KernelPropertyHolderStructTy MKernelProps; /// Struct that encodes global size, local size, ... detail::NDRDescT MNDRDesc; From bf897b8b729d01fe980846e85cc4f2d5a7c59e9e Mon Sep 17 00:00:00 2001 From: "Agarwal, Udit" Date: Wed, 29 Oct 2025 20:01:34 +0100 Subject: [PATCH 18/18] Revert "Store KernelPropertyHolderStructTy in KenrelData" This reverts commit b924f4c459394cd2b855f6f60164278f21456502. --- sycl/source/detail/kernel_data.hpp | 125 ++++++++++++++--------------- 1 file changed, 61 insertions(+), 64 deletions(-) diff --git a/sycl/source/detail/kernel_data.hpp b/sycl/source/detail/kernel_data.hpp index d0a6e54caffa9..9f22fafa85aa4 100644 --- a/sycl/source/detail/kernel_data.hpp +++ b/sycl/source/detail/kernel_data.hpp @@ -144,86 +144,50 @@ class KernelData { // Kernel launch properties getter and setters. ur_kernel_cache_config_t getKernelCacheConfig() const { - using namespace sycl::ext::oneapi::experimental; - using namespace sycl::ext::oneapi::experimental::detail; - using namespace sycl::ext::intel::experimental; - - if (auto prop = MKernelProps.get()->MProperty) { - if (*prop == large_slm) { - return ur_kernel_cache_config_t::UR_KERNEL_CACHE_CONFIG_LARGE_SLM; - } else if (*prop == large_data) { - return ur_kernel_cache_config_t::UR_KERNEL_CACHE_CONFIG_LARGE_DATA; - } else - throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), - "Unknown cache property type encountered in " - "parseAndSetCacheConfigProperty."); - } - - return ur_kernel_cache_config_t::UR_KERNEL_CACHE_CONFIG_DEFAULT; + return MKernelCacheConfig; } void setKernelCacheConfig(ur_kernel_cache_config_t Config) { - // FIXME: Why we need this? + MKernelCacheConfig = Config; } - bool isCooperative() const { - using namespace sycl::ext::oneapi::experimental; - using namespace sycl::ext::oneapi::experimental::detail; - using namespace sycl::ext::intel::experimental; - - // If we are here, the device supports the guarantee required but - // there is a caveat in that if the guarantee required is a concurrent - // guarantee, then we most likely also need to enable cooperative - // launch of the kernel. That is, although the device supports the - // required guarantee, some setup work is needed to truly make the - // device provide that guarantee at runtime. Otherwise, we will get - // the default guarantee which is weaker than concurrent. Same - // reasoning applies for sub_group but not for work_item. - // TODO: Further design work is probably needed to reflect this - // behavior in Unified Runtime. - const auto *FPWorkGroupProp = MKernelProps.get(); - const auto *FPSubGroupProp = MKernelProps.get(); - if ((MKernelProps.get()->MFPGuarantee && - *(FPWorkGroupProp->MFPGuarantee) == - forward_progress_guarantee::concurrent) || - (FPSubGroupProp->MFPGuarantee && - *FPSubGroupProp->MFPGuarantee == - forward_progress_guarantee::concurrent) || - (MKernelProps.get()->MPresent)) { - return true; - } - - return false; - } + bool isCooperative() const { return MKernelIsCooperative; } void setCooperative(bool IsCooperative) { - // FIXME: Why we need this? + MKernelIsCooperative = IsCooperative; } - bool usesClusterLaunch() const { - using namespace sycl::ext::oneapi::experimental; - using namespace sycl::ext::oneapi::experimental::detail; - using namespace sycl::ext::intel::experimental; - return MKernelProps.get>()->MProperty || - MKernelProps.get>()->MProperty || - MKernelProps.get>()->MProperty; - } + bool usesClusterLaunch() const { return MKernelUsesClusterLaunch; } template void setClusterDimensions(sycl::range N) { + MKernelUsesClusterLaunch = true; MNDRDesc.setClusterDimensions(N); } uint32_t getKernelWorkGroupMemorySize() const { - using namespace sycl::ext::oneapi::experimental; - - if (auto prop = MKernelProps.get()->MProperty) - return prop->size; - - return 0; + return MKernelWorkGroupMemorySize; } void setKernelWorkGroupMemorySize(uint32_t Size) { - // FIXME: Why we need this? + MKernelWorkGroupMemorySize = Size; + } + + void parseAndSetCacheConfigProperty( + const sycl::ext::intel::experimental::cache_config_key &prop) { + using namespace sycl::ext::intel::experimental; + + ur_kernel_cache_config_t CacheConfig = + ur_kernel_cache_config_t::UR_KERNEL_CACHE_CONFIG_DEFAULT; + if (prop == large_slm) { + CacheConfig = ur_kernel_cache_config_t::UR_KERNEL_CACHE_CONFIG_LARGE_SLM; + } else if (prop == large_data) { + CacheConfig = ur_kernel_cache_config_t::UR_KERNEL_CACHE_CONFIG_LARGE_DATA; + } else + throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), + "Unknown cache property type encountered in " + "parseAndSetCacheConfigProperty."); + + MKernelCacheConfig = CacheConfig; } template @@ -236,6 +200,7 @@ class KernelData { return; auto ClusterSize = prop->get_cluster_size(); + MKernelUsesClusterLaunch = true; if constexpr (ClusterDims == 1) MNDRDesc.setClusterDimensions(sycl::range<1>{ClusterSize[0]}); @@ -308,7 +273,35 @@ class KernelData { validateProperties(Kprop, HasGraph, dev); - MKernelProps = Kprop; + // If we are here, the device supports the guarantee required but + // there is a caveat in that if the guarantee required is a concurrent + // guarantee, then we most likely also need to enable cooperative + // launch of the kernel. That is, although the device supports the + // required guarantee, some setup work is needed to truly make the + // device provide that guarantee at runtime. Otherwise, we will get + // the default guarantee which is weaker than concurrent. Same + // reasoning applies for sub_group but not for work_item. + // TODO: Further design work is probably needed to reflect this + // behavior in Unified Runtime. + const auto *FPWorkGroupProp = Kprop.get(); + const auto *FPSubGroupProp = Kprop.get(); + if ((Kprop.get()->MFPGuarantee && + *(FPWorkGroupProp->MFPGuarantee) == + forward_progress_guarantee::concurrent) || + (FPSubGroupProp->MFPGuarantee && + *FPSubGroupProp->MFPGuarantee == + forward_progress_guarantee::concurrent)) { + setCooperative(true); + } + + if (Kprop.get()->MPresent) + setCooperative(true); + + if (auto prop = Kprop.get()->MProperty) + parseAndSetCacheConfigProperty(*prop); + + if (auto prop = Kprop.get()->MProperty) + setKernelWorkGroupMemorySize(prop->size); parseAndSetClusterDimProperty( Kprop.get>()->MProperty); @@ -344,7 +337,11 @@ class KernelData { /// The list of arguments for the kernel. std::vector MArgs; - detail::KernelPropertyHolderStructTy MKernelProps; + ur_kernel_cache_config_t MKernelCacheConfig = UR_KERNEL_CACHE_CONFIG_DEFAULT; + + bool MKernelIsCooperative = false; + bool MKernelUsesClusterLaunch = false; + uint32_t MKernelWorkGroupMemorySize = 0; /// Struct that encodes global size, local size, ... detail::NDRDescT MNDRDesc;