Description
Describe the bug
The program failed with the following output:
terminate called after throwing an instance of 'sycl::_V1::runtime_error'
what(): Native API failed. Native API returns: -999 (Unknown PI error) -999 (Unknown PI error)
I have an Intel Core i7-1255U and no dedicated graphics card; the bug happens when running on the Intel integrated GPU.
To reproduce
- #include <sycl/detail/helpers.hpp>
#include <sycl/sycl.hpp>
#include <math.h>
#include <cstddef>
#include <ctime>
#include <iostream>
int main()
{
float m1[4][4] = {{2, 2, 9, 1}, {8,3,2,4}, {2,5,6,7}, {3,5,6,4}};
float m2[4][4] = {{2, 2, 91, 1}, {8,3,21,4}, {2,3,6,0}, {31,5,6,4}};
sycl::queue q{sycl::gpu_selector_v};
float **device_arr = (float **)malloc(sizeof(float *) * 4);
float **device_m1 = (float **)malloc(sizeof(float *) * 4);
float **device_m2 = (float **)malloc(sizeof(float *) * 4);
for(int i = 0; i < 4; i++)
{
device_arr[i] = sycl::malloc_device<float>(4, q);
device_m1[i] = sycl::malloc_device<float>(4, q);
q.memcpy(device_m1[i], m1[i], sizeof(float) * 4);
device_m2[i] = sycl::malloc_device<float>(4, q);
q.memcpy(device_m2[i], m2[i], sizeof(float) * 4);
}
q.wait();
const long long work_size_x = 4;
const long long work_size_y = 4;
const unsigned long work_size = std::pow(2, 32);
const unsigned long workers = work_size/4096;
for(unsigned long a = 0; a < work_size; a+=workers)
{
if((a+workers) > work_size)
{
q.parallel_for(sycl::range{work_size - a}, [=](sycl::item<1> it){
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
device_arr[i][j] = 0;
for (int k = 0; k < 4; k++) {
device_arr[i][j] += device_m1[i][k] * device_m2[k][j];
}
}
}
});
}else
{
q.parallel_for(sycl::range{workers}, [=](sycl::item<1> it){
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
device_arr[i][j] = 0;
for (int k = 0; k < 4; k++) {
device_arr[i][j] += device_m1[i][k] * device_m2[k][j];
}
}
}
});
}
}
clock_t begin_sycl = clock();
q.wait();
clock_t end_sycl = clock();
double time_spent_sycl = (double)(end_sycl - begin_sycl) / CLOCKS_PER_SEC / 12;
std::cout << "Time spent on SYCL: " << time_spent_sycl << std::endl;
for(int i = 0; i < 4; i++)
{
sycl::free(device_arr[i], q);
sycl::free(device_m1[i], q);
sycl::free(device_m2[i], q);
}
}
dpcpp -xhost c++\ vs\ sycl/sycl.cpp -O3
- ./a.out
- Expected: the program computes the matrix multiplication. Actual: it fails with the error shown above.
Environment
- OS: arch linux
- target: intel iris xe 96EU gpu
- DPC++ version: Intel(R) oneAPI DPC++/C++ Compiler 2024.1.0 (2024.1.0.20240308)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /opt/intel/oneapi/compiler/2024.1/bin/compiler
Configuration file: /opt/intel/oneapi/compiler/2024.1/bin/compiler/../icpx.cfg - Dependencies version: ```[abdelilah@archlinux learn SYCL]$ sycl-ls --verbose
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2024.17.3.0.08_160000]
[opencl:cpu:1] Intel(R) OpenCL, 12th Gen Intel(R) Core(TM) i7-1255U OpenCL 3.0 (Build 0) [2024.17.3.0.08_160000]
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Iris(R) Xe Graphics OpenCL 3.0 NEO [24.26.30049]
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.30049]
Platforms: 4
Platform [#1]:
Version : OpenCL 1.2 Intel(R) FPGA SDK for OpenCL(TM), Version 20.3
Name : Intel(R) FPGA Emulation Platform for OpenCL(TM)
Vendor : Intel(R) Corporation
Devices : 1
Device [#0]:
Type : acc
Version : OpenCL 1.2
Name : Intel(R) FPGA Emulation Device
Vendor : Intel(R) Corporation
Driver : 2024.17.3.0.08_160000
Aspects : accelerator fp64 online_compiler online_linker queue_profiling usm_device_allocations usm_host_allocations usm_shared_allocations usm_atomic_host_allocations usm_atomic_shared_allocations ext_oneapi_srgb ext_oneapi_non_uniform_groups
info::device::sub_group_sizes: 4 8 16 32 64
Platform [#2]:
Version : OpenCL 3.0 LINUX
Name : Intel(R) OpenCL
Vendor : Intel(R) Corporation
Devices : 1
Device [#1]:
Type : cpu
Version : OpenCL 3.0 (Build 0)
Name : 12th Gen Intel(R) Core(TM) i7-1255U
Vendor : Intel(R) Corporation
Driver : 2024.17.3.0.08_160000
Aspects : cpu fp16 fp64 online_compiler online_linker queue_profiling usm_device_allocations usm_host_allocations usm_shared_allocations usm_system_allocations usm_atomic_host_allocations usm_atomic_shared_allocations atomic64 ext_oneapi_srgb ext_oneapi_native_assert ext_intel_legacy_image ext_oneapi_non_uniform_groups
info::device::sub_group_sizes: 4 8 16 32 64
Platform [#3]:
Version : OpenCL 3.0
Name : Intel(R) OpenCL Graphics
Vendor : Intel(R) Corporation
Devices : 1
Device [#2]:
Type : gpu
Version : OpenCL 3.0 NEO
Name : Intel(R) Iris(R) Xe Graphics
Vendor : Intel(R) Corporation
Driver : 24.26.30049
Aspects : gpu fp16 online_compiler online_linker queue_profiling usm_device_allocations usm_host_allocations usm_shared_allocations atomic64 ext_oneapi_srgb ext_intel_device_id ext_intel_legacy_image ext_intel_esimd ext_oneapi_non_uniform_groups
info::device::sub_group_sizes: 8 16 32
Platform [#4]:
Version : 1.3
Name : Intel(R) Level-Zero
Vendor : Intel(R) Corporation
Devices : 1
Device [#0]:
Type : gpu
Version : 1.3
Name : Intel(R) Iris(R) Xe Graphics
Vendor : Intel(R) Corporation
Driver : 1.3.30049
Aspects : gpu fp16 online_compiler online_linker queue_profiling usm_device_allocations usm_host_allocations usm_shared_allocations ext_intel_pci_address ext_intel_gpu_eu_count ext_intel_gpu_eu_simd_width ext_intel_gpu_slices ext_intel_gpu_subslices_per_slice ext_intel_gpu_eu_count_per_subslice atomic64 ext_intel_device_info_uuid ext_intel_gpu_hw_threads_per_eu ext_intel_device_id ext_intel_memory_clock_rate ext_intel_memory_bus_width ext_intel_legacy_image ext_intel_esimd ext_oneapi_non_uniform_groups
info::device::sub_group_sizes: 8 16 32
default_selector() : gpu, Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.30049]
accelerator_selector() : acc, Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2024.17.3.0.08_160000]
cpu_selector() : cpu, Intel(R) OpenCL, 12th Gen Intel(R) Core(TM) i7-1255U OpenCL 3.0 (Build 0) [2024.17.3.0.08_160000]
gpu_selector() : gpu, Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.30049]
custom_selector(gpu) : gpu, Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.30049]
custom_selector(cpu) : cpu, Intel(R) OpenCL, 12th Gen Intel(R) Core(TM) i7-1255U OpenCL 3.0 (Build 0) [2024.17.3.0.08_160000]
custom_selector(acc) : acc, Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2024.17.3.0.08_160000]
[abdelilah@archlinux learn SYCL]$```
Additional context
No response