8000 [NFCI][SYCL] Refactor reduction-handler interactions by aelovikov-intel · Pull Request #18794 · intel/llvm · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

[NFCI][SYCL] Refactor reduction-handler interactions #18794

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 92 additions & 2 deletions sycl/include/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3426,8 +3426,6 @@ class __SYCL_EXPORT handler {
friend class detail::reduction_impl_algo;

friend inline void detail::reduction::finalizeHandler(handler &CGH);
template <class FunctorTy>
friend void detail::reduction::withAuxHandler(handler &CGH, FunctorTy Func);

template <typename KernelName, detail::reduction::strategy Strategy, int Dims,
typename PropertiesT, typename... RestT>
Expand Down Expand Up @@ -3870,6 +3868,8 @@ class __SYCL_EXPORT handler {
void setKernelNameBasedCachePtr(
detail::KernelNameBasedCacheT *KernelNameBasedCachePtr);

queue getQueue();

protected:
/// Registers event dependencies in this command group.
void depends_on(const detail::EventImplPtr &Event);
Expand All @@ -3889,6 +3889,96 @@ class HandlerAccess {
kernel Kernel) {
Handler.parallel_for_impl(Range, Props, Kernel);
}

template <typename T, typename> struct dependent {
using type = T;
};
template <typename T>
using dependent_queue_t = typename dependent<queue, T>::type;
template <typename T>
using dependent_handler_t = typename dependent<handler, T>::type;

// pre/postProcess are used only for reductions right now, but the
// abstractions they provide aren't reduction-specific. The main problem they
// solve is
//
// # User code
// q.submit([&](handler &cgh) {
// set_dependencies(cgh);
// enqueue_whatever(cgh);
// }); // single submission
//
// that needs to be implemented as multiple enqueues involving
// pre-/post-processing internally. SYCL prohibits recursive submits from
// inside control group function object (lambda above) so we resort to a
// somewhat hacky way of creating multiple `handler`s and manual finalization
// of them (instead of the one in `queue::submit`).
< 8000 /td> //
// Overloads with `queue &q` are provided in case the caller has it created
// already to avoid unnecessary reference count increments associated with
// `handler::getQueue()`.
template <class FunctorTy>
static void preProcess(handler &CGH, dependent_queue_t<FunctorTy> &q,
FunctorTy Func) {
bool EventNeeded = !q.is_in_order();
handler AuxHandler(getSyclObjImpl(q), EventNeeded);
AuxHandler.copyCodeLoc(CGH);
std::forward<FunctorTy>(Func)(AuxHandler);
auto E = AuxHandler.finalize();
assert(!CGH.MIsFinalized &&
"Can't do pre-processing if the command has been enqueued already!");
if (EventNeeded)
CGH.depends_on(E);
}
template <class FunctorTy>
static void preProcess(dependent_handler_t<FunctorTy> &CGH,
FunctorTy &&Func) {
preProcess(CGH, CGH.getQueue(), std::forward<FunctorTy>(Func));
}
template <class FunctorTy>
static void postProcess(dependent_handler_t<FunctorTy> &CGH,
FunctorTy &&Func) {
// The "hacky" `handler`s manipulation mentioned above and implemented here
// is far from perfect. A better approach would be
//
// bool OrigNeedsEvent = CGH.needsEvent()
// assert(CGH.not_finalized/enqueued());
// if (!InOrderQueue)
// CGH.setNeedsEvent()
//
// handler PostProcessHandler(Queue, OrigNeedsEvent)
// auto E = CGH.finalize(); // enqueue original or current last
// // post-process
// if (!InOrder)
// PostProcessHandler.depends_on(E)
//
// swap_impls(CGH, PostProcessHandler)
// return; // queue::submit finalizes PostProcessHandler and returns its
// // event if necessary.
//
// Still hackier than "real" `queue::submit` but at least somewhat sane.
// That, however hasn't been tried yet and we have an even hackier approach
// copied from what's been done in an old reductions implementation before
// eventless submission work has started. Not sure how feasible the approach
// above is at this moment.

// This `finalize` is wrong (at least logically) if
// `assert(!CGH.eventNeeded())`
auto E = CGH.finalize();
dependent_queue_t<FunctorTy> Queue = CGH.getQueue();
bool InOrder = Queue.is_in_order();
// Cannot use `CGH.eventNeeded()` alone as there might be subsequent
// `postProcess` calls and we cannot address them properly similarly to the
// `finalize` issue described above. `swap_impls` suggested above might be
// able to handle this scenario naturally.
handler AuxHandler(getSyclObjImpl(Queue), CGH.eventNeeded() || !InOrder);
if (!InOrder)
AuxHandler.depends_on(E);
AuxHandler.copyCodeLoc(CGH);
std::forward<FunctorTy>(Func)(AuxHandler);
CGH.MLastEvent = AuxHandler.finalize();
return;
}
};
} // namespace detail

Expand Down
41 changes: 10 additions & 31 deletions sycl/include/sycl/reduction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -834,10 +834,6 @@ using __sycl_init_mem_for =
std::conditional_t<std::is_same_v<KernelName, auto_name>, auto_name,
reduction::InitMemKrn<KernelName>>;

__SYCL_EXPORT void
addCounterInit(handler &CGH, std::shared_ptr<sycl::detail::queue_impl> &Queue,
std::shared_ptr<int> &Counter);

template <typename T, class BinaryOperation, int Dims, size_t Extent,
bool ExplicitIdentity, typename RedOutVar>
class reduction_impl_algo {
Expand Down Expand Up @@ -995,7 +991,7 @@ class reduction_impl_algo {
accessor Mem{*Buf, CGH};
Func(Mem);

reduction::withAuxHandler(CGH, [&](handler &CopyHandler) {
HandlerAccess::postProcess(CGH, [&](handler &CopyHandler) {
// MSVC (19.32.31329) has problems compiling the line below when used
// as a host compiler in c++17 mode (but not in c++latest)
// accessor Mem{*Buf, CopyHandler};
Expand Down Expand Up @@ -1071,19 +1067,16 @@ class reduction_impl_algo {
// On discrete (vs. integrated) GPUs it's faster to initialize memory with an
// extra kernel than copy it from the host.
auto getGroupsCounterAccDiscrete(handler &CGH) {
queue q = createSyclObjFromImpl<queue>(CGH.MQueue);
device Dev = q.get_device();
queue q = CGH.getQueue();
auto Deleter = [=](auto *Ptr) { free(Ptr, q); };

std::shared_ptr<int> Counter(malloc_device<int>(1, q), Deleter);
CGH.addReduction(Counter);

#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
std::shared_ptr<detail::queue_impl> Queue(CGH.MQueue);
#else
std::shared_ptr<detail::queue_impl> &Queue = CGH.MQueue;
#endif
addCounterInit(CGH, Queue, Counter);
HandlerAccess::preProcess(CGH, q,
[Counter = Counter.get()](handler &AuxHandler) {
AuxHandler.memset(Counter, 0, sizeof(int));
});

return Counter.get();
}
Expand Down Expand Up @@ -1178,20 +1171,6 @@ auto make_reduction(RedOutVar RedVar, RestTy &&...Rest) {

namespace reduction {
inline void finalizeHandler(handler &CGH) { CGH.finalize(); }
template <class FunctorTy> void withAuxHandler(handler &CGH, FunctorTy Func) {
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
detail::EventImplPtr E = CGH.finalize();
#else
event E = CGH.finalize();
#endif
handler AuxHandler(CGH.MQueue, CGH.eventNeeded());
if (!createSyclObjFromImpl<queue>(CGH.MQueue).is_in_order())
AuxHandler.depends_on(E);
AuxHandler.copyCodeLoc(CGH);
Func(AuxHandler);
CGH.MLastEvent = AuxHandler.finalize();
return;
}
} // namespace reduction

// This method is used for implementation of parallel_for accepting 1 reduction.
Expand Down Expand Up @@ -1785,7 +1764,7 @@ struct NDRangeReduction<
"the reduction.");
size_t NWorkItems = NDRange.get_group_range().size();
while (NWorkItems > 1) {
reduction::withAuxHandler(CGH, [&](handler &AuxHandler) {
HandlerAccess::postProcess(CGH, [&](handler &AuxHandler) {
size_t NElements = Reduction::num_elements;
size_t NWorkGroups;
size_t WGSize = reduComputeWGSize(NWorkItems, MaxWGSize, NWorkGroups);
Expand Down Expand Up @@ -1837,7 +1816,7 @@ struct NDRangeReduction<
} // end while (NWorkItems > 1)

if constexpr (Reduction::is_usm) {
reduction::withAuxHandler(CGH, [&](handler &CopyHandler) {
HandlerAccess::postProcess(CGH, [&](handler &CopyHandler) {
reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
});
}
Expand Down Expand Up @@ -1969,7 +1948,7 @@ template <> struct NDRangeReduction<reduction::strategy::basic> {
size_t WGSize = reduComputeWGSize(NWorkItems, MaxWGSize, NWorkGroups);

auto Rest = [&](auto KernelTag) {
reduction::withAuxHandler(CGH, [&](handler &AuxHandler) {
HandlerAccess::postProcess(CGH, [&](handler &AuxHandler) {
// We can deduce IsOneWG from the tag type.
constexpr bool IsOneWG =
std::is_same_v<std::remove_reference_t<decltype(KernelTag)>,
Expand Down Expand Up @@ -2650,7 +2629,7 @@ template <> struct NDRangeReduction<reduction::strategy::multi> {

size_t NWorkItems = NDRange.get_group_range().size();
while (NWorkItems > 1) {
reduction::withAuxHandler(CGH, [&](handler &AuxHandler) {
HandlerAccess::postProcess(CGH, [&](handler &AuxHandler) {
NWorkItems = reduAuxCGFunc<KernelName, decltype(KernelFunc)>(
AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
});
Expand Down
2 changes: 2 additions & 0 deletions sycl/source/detail/reduction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ __SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
return reduGetMaxWGSize(Queue, LocalMemBytesPerWorkItem);
}

#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
__SYCL_EXPORT void
addCounterInit(handler &CGH, std::shared_ptr<sycl::detail::queue_impl> &Queue,
std::shared_ptr<int> &Counter) {
Expand All @@ -189,6 +190,7 @@ addCounterInit(handler &CGH, std::shared_ptr<sycl::detail::queue_impl> &Queue,
EventImpl->setHandle(UREvent);
CGH.depends_on(createSyclObjFromImpl<event>(EventImpl));
}
#endif

__SYCL_EXPORT void verifyReductionProps(const property_list &Props) {
auto CheckDataLessProperties = [](int PropertyKind) {
Expand Down
1 change: 1 addition & 0 deletions sycl/source/handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2408,5 +2408,6 @@ void handler::copyCodeLoc(const handler &other) {
impl->MIsTopCodeLoc = other.impl->MIsTopCodeLoc;
}

queue handler::getQueue() { return createSyclObjFromImpl<queue>(MQueue); }
} // namespace _V1
} // namespace sycl
1 change: 1 addition & 0 deletions sycl/test/abi/sycl_symbols_linux.dump
Original file line number Diff line number Diff line change
Expand Up @@ -3608,6 +3608,7 @@ _ZN4sycl3_V17handler6memcpyEPvPKvm
_ZN4sycl3_V17handler6memsetEPvim
_ZN4sycl3_V17handler7setTypeENS0_6detail6CGTypeE
_ZN4sycl3_V17handler8finalizeEv
_ZN4sycl3_V17handler8getQueueEv
_ZN4sycl3_V17handler8prefetchEPKvm
_ZN4sycl3_V17handler9clearArgsEv
_ZN4sycl3_V17handler9fill_implEPvPKvmm
Expand Down
1 change: 1 addition & 0 deletions sycl/test/abi/sycl_symbols_windows.dump
Original file line number Diff line number Diff line change
Expand Up @@ -4103,6 +4103,7 @@
?getPtr@SampledImageAccessorBaseHost@detail@_V1@sycl@@QEBAPEAXXZ
?getPtr@UnsampledImageAccessorBaseHost@detail@_V1@sycl@@QEAAPEAXXZ
?getPtr@UnsampledImageAccessorBaseHost@detail@_V1@sycl@@QEBAPEAXXZ
?getQueue@handler@_V1@sycl@@AEAA?AVqueue@23@XZ
?getRowPitch@image_plain@detail@_V1@sycl@@IEBA_KXZ
?getSampler@SampledImageAccessorBaseHost@detail@_V1@sycl@@QEBA?AUimage_sampler@34@XZ
?getSampler@image_plain@detail@_V1@sycl@@IEBA?AUimage_sampler@34@XZ
Expand Down
0