intel · aelovikov-intel · Jun 6, 2025 · Jun 5, 2025 · Jun 6, 2025
@@ -3426,8 +3426,6 @@ class __SYCL_EXPORT handler {
   friend class detail::reduction_impl_algo;
 
   friend inline void detail::reduction::finalizeHandler(handler &CGH);
-  template <class FunctorTy>
-  friend void detail::reduction::withAuxHandler(handler &CGH, FunctorTy Func);
 
   template <typename KernelName, detail::reduction::strategy Strategy, int Dims,
             typename PropertiesT, typename... RestT>
@@ -3870,6 +3868,8 @@ class __SYCL_EXPORT handler {
   void setKernelNameBasedCachePtr(
       detail::KernelNameBasedCacheT *KernelNameBasedCachePtr);
 
+  queue getQueue();
+
 protected:
   /// Registers event dependencies in this command group.
   void depends_on(const detail::EventImplPtr &Event);
@@ -3889,6 +3889,96 @@ class HandlerAccess {
                               kernel Kernel) {
     Handler.parallel_for_impl(Range, Props, Kernel);
   }
+
+  template <typename T, typename> struct dependent {
+    using type = T;
+  };
+  template <typename T>
+  using dependent_queue_t = typename dependent<queue, T>::type;
+  template <typename T>
+  using dependent_handler_t = typename dependent<handler, T>::type;
+
+  // pre/postProcess are used only for reductions right now, but the
+  // abstractions they provide aren't reduction-specific. The main problem they
+  // solve is
+  //
+  //   # User code
+  //   q.submit([&](handler &cgh) {
+  //     set_dependencies(cgh);
+  //     enqueue_whatever(cgh);
+  //   });  // single submission
+  //
+  // that needs to be implemented as multiple enqueues involving
+  // pre-/post-processing internally. SYCL prohibits recursive submits from
+  // inside control group function object (lambda above) so we resort to a
+  // somewhat hacky way of creating multiple `handler`s and manual finalization
+  // of them (instead of the one in `queue::submit`).
+  //
+  // Overloads with `queue &q` are provided in case the caller has it created
+  // already to avoid unnecessary reference count increments associated with
+  // `handler::getQueue()`.
+  template <class FunctorTy>
+  static void preProcess(handler &CGH, dependent_queue_t<FunctorTy> &q,
+                         FunctorTy Func) {
+    bool EventNeeded = !q.is_in_order();
+    handler AuxHandler(getSyclObjImpl(q), EventNeeded);
+    AuxHandler.copyCodeLoc(CGH);
+    std::forward<FunctorTy>(Func)(AuxHandler);
+    auto E = AuxHandler.finalize();
+    assert(!CGH.MIsFinalized &&
+           "Can't do pre-processing if the command has been enqueued already!");
+    if (EventNeeded)
+      CGH.depends_on(E);
+  }
+  template <class FunctorTy>
+  static void preProcess(dependent_handler_t<FunctorTy> &CGH,
+                         FunctorTy &&Func) {
+    preProcess(CGH, CGH.getQueue(), std::forward<FunctorTy>(Func));
+  }
+  template <class FunctorTy>
+  static void postProcess(dependent_handler_t<FunctorTy> &CGH,
+                          FunctorTy &&Func) {
+    // The "hacky" `handler`s manipulation mentioned above and implemented here
+    // is far from perfect. A better approach would be
+    //
+    //    bool OrigNeedsEvent = CGH.needsEvent()
+    //    assert(CGH.not_finalized/enqueued());
+    //    if (!InOrderQueue)
+    //      CGH.setNeedsEvent()
+    //
+    //    handler PostProcessHandler(Queue, OrigNeedsEvent)
+    //    auto E = CGH.finalize(); // enqueue original or current last
+    //                             // post-process
+    //    if (!InOrder)
+    //      PostProcessHandler.depends_on(E)
+    //
+    //    swap_impls(CGH, PostProcessHandler)
+    //    return; // queue::submit finalizes PostProcessHandler and returns its
+    //            // event if necessary.
+    //
+    // Still hackier than "real" `queue::submit` but at least somewhat sane.
+    // That, however hasn't been tried yet and we have an even hackier approach
+    // copied from what's been done in an old reductions implementation before
+    // eventless submission work has started. Not sure how feasible the approach
+    // above is at this moment.
+
+    // This `finalize` is wrong (at least logically) if
+    //   `assert(!CGH.eventNeeded())`
+    auto E = CGH.finalize();
+    dependent_queue_t<FunctorTy> Queue = CGH.getQueue();
+    bool InOrder = Queue.is_in_order();
+    // Cannot use `CGH.eventNeeded()` alone as there might be subsequent
+    // `postProcess` calls and we cannot address them properly similarly to the
+    // `finalize` issue described above. `swap_impls` suggested above might be
+    // able to handle this scenario naturally.
+    handler AuxHandler(getSyclObjImpl(Queue), CGH.eventNeeded() || !InOrder);
+    if (!InOrder)
+      AuxHandler.depends_on(E);
+    AuxHandler.copyCodeLoc(CGH);
+    std::forward<FunctorTy>(Func)(AuxHandler);
+    CGH.MLastEvent = AuxHandler.finalize();
+    return;
+  }
 };
 } // namespace detail
 

@@ -834,10 +834,6 @@ using __sycl_init_mem_for =
     std::conditional_t<std::is_same_v<KernelName, auto_name>, auto_name,
                        reduction::InitMemKrn<KernelName>>;
 
-__SYCL_EXPORT void
-addCounterInit(handler &CGH, std::shared_ptr<sycl::detail::queue_impl> &Queue,
-               std::shared_ptr<int> &Counter);
-
 template <typename T, class BinaryOperation, int Dims, size_t Extent,
           bool ExplicitIdentity, typename RedOutVar>
 class reduction_impl_algo {
@@ -995,7 +991,7 @@ class reduction_impl_algo {
       accessor Mem{*Buf, CGH};
       Func(Mem);
 
-      reduction::withAuxHandler(CGH, [&](handler &CopyHandler) {
+      HandlerAccess::postProcess(CGH, [&](handler &CopyHandler) {
         // MSVC (19.32.31329) has problems compiling the line below when used
         // as a host compiler in c++17 mode (but not in c++latest)
         //   accessor Mem{*Buf, CopyHandler};
@@ -1071,19 +1067,16 @@ class reduction_impl_algo {
   // On discrete (vs. integrated) GPUs it's faster to initialize memory with an
   // extra kernel than copy it from the host.
   auto getGroupsCounterAccDiscrete(handler &CGH) {
-    queue q = createSyclObjFromImpl<queue>(CGH.MQueue);
-    device Dev = q.get_device();
+    queue q = CGH.getQueue();
     auto Deleter = [=](auto *Ptr) { free(Ptr, q); };
 
     std::shared_ptr<int> Counter(malloc_device<int>(1, q), Deleter);
     CGH.addReduction(Counter);
 
-#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
-    std::shared_ptr<detail::queue_impl> Queue(CGH.MQueue);
-#else
-    std::shared_ptr<detail::queue_impl> &Queue = CGH.MQueue;
-#endif
-    addCounterInit(CGH, Queue, Counter);
+    HandlerAccess::preProcess(CGH, q,
+                              [Counter = Counter.get()](handler &AuxHandler) {
+                                AuxHandler.memset(Counter, 0, sizeof(int));
+                              });
 
     return Counter.get();
   }
@@ -1178,20 +1171,6 @@ auto make_reduction(RedOutVar RedVar, RestTy &&...Rest) {
 
 namespace reduction {
 inline void finalizeHandler(handler &CGH) { CGH.finalize(); }
-template <class FunctorTy> void withAuxHandler(handler &CGH, FunctorTy Func) {
-#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
-  detail::EventImplPtr E = CGH.finalize();
-#else
-  event E = CGH.finalize();
-#endif
-  handler AuxHandler(CGH.MQueue, CGH.eventNeeded());
-  if (!createSyclObjFromImpl<queue>(CGH.MQueue).is_in_order())
-    AuxHandler.depends_on(E);
-  AuxHandler.copyCodeLoc(CGH);
-  Func(AuxHandler);
-  CGH.MLastEvent = AuxHandler.finalize();
-  return;
-}
 } // namespace reduction
 
 // This method is used for implementation of parallel_for accepting 1 reduction.
@@ -1785,7 +1764,7 @@ struct NDRangeReduction<
                             "the reduction.");
     size_t NWorkItems = NDRange.get_group_range().size();
     while (NWorkItems > 1) {
-      reduction::withAuxHandler(CGH, [&](handler &AuxHandler) {
+      HandlerAccess::postProcess(CGH, [&](handler &AuxHandler) {
         size_t NElements = Reduction::num_elements;
         size_t NWorkGroups;
         size_t WGSize = reduComputeWGSize(NWorkItems, MaxWGSize, NWorkGroups);
@@ -1837,7 +1816,7 @@ struct NDRangeReduction<
     } // end while (NWorkItems > 1)
 
     if constexpr (Reduction::is_usm) {
-      reduction::withAuxHandler(CGH, [&](handler &CopyHandler) {
+      HandlerAccess::postProcess(CGH, [&](handler &CopyHandler) {
         reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
       });
     }
@@ -1969,7 +1948,7 @@ template <> struct NDRangeReduction<reduction::strategy::basic> {
       size_t WGSize = reduComputeWGSize(NWorkItems, MaxWGSize, NWorkGroups);
 
       auto Rest = [&](auto KernelTag) {
-        reduction::withAuxHandler(CGH, [&](handler &AuxHandler) {
+        HandlerAccess::postProcess(CGH, [&](handler &AuxHandler) {
           // We can deduce IsOneWG from the tag type.
           constexpr bool IsOneWG =
               std::is_same_v<std::remove_reference_t<decltype(KernelTag)>,
@@ -2650,7 +2629,7 @@ template <> struct NDRangeReduction<reduction::strategy::multi> {
 
     size_t NWorkItems = NDRange.get_group_range().size();
     while (NWorkItems > 1) {
-      reduction::withAuxHandler(CGH, [&](handler &AuxHandler) {
+      HandlerAccess::postProcess(CGH, [&](handler &AuxHandler) {
         NWorkItems = reduAuxCGFunc<KernelName, decltype(KernelFunc)>(
             AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
       });

@@ -177,6 +177,7 @@ __SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
   return reduGetMaxWGSize(Queue, LocalMemBytesPerWorkItem);
 }
 
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
 __SYCL_EXPORT void
 addCounterInit(handler &CGH, std::shared_ptr<sycl::detail::queue_impl> &Queue,
                std::shared_ptr<int> &Counter) {
@@ -189,6 +190,7 @@ addCounterInit(handler &CGH, std::shared_ptr<sycl::detail::queue_impl> &Queue,
   EventImpl->setHandle(UREvent);
   CGH.depends_on(createSyclObjFromImpl<event>(EventImpl));
 }
+#endif
 
 __SYCL_EXPORT void verifyReductionProps(const property_list &Props) {
   auto CheckDataLessProperties = [](int PropertyKind) {

@@ -2408,5 +2408,6 @@ void handler::copyCodeLoc(const handler &other) {
   impl->MIsTopCodeLoc = other.impl->MIsTopCodeLoc;
 }
 
+queue handler::getQueue() { return createSyclObjFromImpl<queue>(MQueue); }
 } // namespace _V1
 } // namespace sycl
@@ -3608,6 +3608,7 @@ _ZN4sycl3_V17handler6memcpyEPvPKvm
 _ZN4sycl3_V17handler6memsetEPvim
 _ZN4sycl3_V17handler7setTypeENS0_6detail6CGTypeE
 _ZN4sycl3_V17handler8finalizeEv
+_ZN4sycl3_V17handler8getQueueEv
 _ZN4sycl3_V17handler8prefetchEPKvm
 _ZN4sycl3_V17handler9clearArgsEv
 _ZN4sycl3_V17handler9fill_implEPvPKvmm

@@ -4103,6 +4103,7 @@
 ?getPtr@SampledImageAccessorBaseHost@detail@_V1@sycl@@QEBAPEAXXZ
 ?getPtr@UnsampledImageAccessorBaseHost@detail@_V1@sycl@@QEAAPEAXXZ
 ?getPtr@UnsampledImageAccessorBaseHost@detail@_V1@sycl@@QEBAPEAXXZ
+?getQueue@handler@_V1@sycl@@AEAA?AVqueue@23@XZ
 ?getRowPitch@image_plain@detail@_V1@sycl@@IEBA_KXZ
 ?getSampler@SampledImageAccessorBaseHost@detail@_V1@sycl@@QEBA?AUimage_sampler@34@XZ
 ?getSampler@image_plain@detail@_V1@sycl@@IEBA?AUimage_sampler@34@XZ