From 0c33c74e5d211e92664fe41f8b270970bf89449f Mon Sep 17 00:00:00 2001 From: ShaniBabayoff Date: Thu, 2 Jan 2025 09:23:44 +0200 Subject: [PATCH 01/12] Update docusaurus.config.ts (#720) change from V3.2 TO V3.3 --- docs/docusaurus.config.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docusaurus.config.ts b/docs/docusaurus.config.ts index 8b2078716a..b4bb5ce484 100644 --- a/docs/docusaurus.config.ts +++ b/docs/docusaurus.config.ts @@ -175,7 +175,7 @@ const config: Config = { announcementBar: { id: 'announcement', // Any value that will identify this message. content: - 'โ„๏ธ๐ŸŽ‰ New Release! ICICLE v3.2! ๐ŸŽ‰โ„๏ธ', + 'โ„๏ธ๐ŸŽ‰ New Release! ICICLE v3.3! ๐ŸŽ‰โ„๏ธ', backgroundColor: '#64f5ef', // Light blue background color. textColor: '#000000', // Black text color. isCloseable: true, // Defaults to `true`. From 29fe5de5a93e009e0b61fdac0ae0721057ffa495 Mon Sep 17 00:00:00 2001 From: Jeremy Felder Date: Sun, 5 Jan 2025 09:04:12 +0200 Subject: [PATCH 02/12] Add capability to set a default device for all threads (#699) --- docs/docs/icicle/programmers_guide/cpp.md | 15 +++ docs/docs/icicle/programmers_guide/general.md | 1 + docs/docs/icicle/programmers_guide/go.md | 17 ++- docs/docs/icicle/programmers_guide/rust.md | 15 +++ icicle/include/icicle/device_api.h | 1 + icicle/include/icicle/runtime.h | 8 ++ icicle/src/device_api.cpp | 16 +++ icicle/src/runtime.cpp | 5 + icicle/tests/test_device_api.cpp | 31 +++++ wrappers/golang/runtime/device.go | 6 + wrappers/golang/runtime/include/runtime.h | 1 + wrappers/golang/runtime/tests/device_test.go | 118 +++++++++++++----- wrappers/golang/runtime/tests/main_test.go | 14 +-- wrappers/golang/runtime/tests/stream_test.go | 8 +- wrappers/rust/icicle-runtime/src/device.rs | 2 +- wrappers/rust/icicle-runtime/src/runtime.rs | 10 ++ wrappers/rust/icicle-runtime/src/tests.rs | 32 +++++ 17 files changed, 245 insertions(+), 55 deletions(-) diff --git a/docs/docs/icicle/programmers_guide/cpp.md 
b/docs/docs/icicle/programmers_guide/cpp.md index b8f1f285f7..cff576571b 100644 --- a/docs/docs/icicle/programmers_guide/cpp.md +++ b/docs/docs/icicle/programmers_guide/cpp.md @@ -32,6 +32,21 @@ eIcicleError result = icicle_set_device(device); eIcicleError result = icicle_get_active_device(device); ``` +### Setting and Getting the Default Device + +You can set the default device for all threads: + +```cpp +icicle::Device device = {"CUDA", 0}; // or other +eIcicleError result = icicle_set_default_device(device); +``` + +:::caution + +Setting a default device should be done **once** from the main thread of the application. If another device or backend is needed for a specific thread [icicle_set_device](#setting-and-getting-active-device) should be used instead. + +::: + ### Querying Device Information Retrieve the number of available devices and check if a pointer is allocated on the host or on the active device: diff --git a/docs/docs/icicle/programmers_guide/general.md b/docs/docs/icicle/programmers_guide/general.md index 0bef2b8500..0405cf35cf 100644 --- a/docs/docs/icicle/programmers_guide/general.md +++ b/docs/docs/icicle/programmers_guide/general.md @@ -85,6 +85,7 @@ ICICLE provides a device abstraction layer that allows you to interact with diff - **Loading Backends**: Backends are loaded dynamically based on the environment configuration or a specified path. - **Setting Active Device**: The active device for a thread can be set, allowing for targeted computation on a specific device. +- **Setting Default Device**: The default device for any thread without an active device can be set, removing the need to specify an alternative device on each thread. This is especially useful when running on a backend that is not the built-in CPU backend which is the default device to start. 
## Streams diff --git a/docs/docs/icicle/programmers_guide/go.md b/docs/docs/icicle/programmers_guide/go.md index 92df226f97..03f7932c1b 100644 --- a/docs/docs/icicle/programmers_guide/go.md +++ b/docs/docs/icicle/programmers_guide/go.md @@ -27,12 +27,27 @@ result := runtime.LoadBackend("/path/to/backend/installdir", true) You can set the active device for the current thread and retrieve it when needed: ```go -device = runtime.CreateDevice("CUDA", 0) // or other +device := runtime.CreateDevice("CUDA", 0) // or other result := runtime.SetDevice(device) // or query current (thread) device activeDevice := runtime.GetActiveDevice() ``` +### Setting and Getting the Default Device + +You can set the default device for all threads: + +```go +device := runtime.CreateDevice("CUDA", 0) // or other +defaultDevice := runtime.SetDefaultDevice(device); +``` + +:::caution + +Setting a default device should be done **once** from the main thread of the application. If another device or backend is needed for a specific thread [runtime.SetDevice](#setting-and-getting-active-device) should be used instead. + +::: + ### Querying Device Information Retrieve the number of available devices and check if a pointer is allocated on the host or on the active device: diff --git a/docs/docs/icicle/programmers_guide/rust.md b/docs/docs/icicle/programmers_guide/rust.md index af5caa5a82..55188cfe2b 100644 --- a/docs/docs/icicle/programmers_guide/rust.md +++ b/docs/docs/icicle/programmers_guide/rust.md @@ -55,6 +55,21 @@ icicle_runtime::set_device(&device).unwrap(); let active_device = icicle_runtime::get_active_device().unwrap(); ``` +### Setting and Getting the Default Device + +You can set the default device for all threads: + +```caution +let device = Device::new("CUDA", 0); // or other +let default_device = icicle_runtime::set_default_device(device); +``` + +:::note + +Setting a default device should be done **once** from the main thread of the application. 
If another device or backend is needed for a specific thread [icicle_runtime::set_device](#setting-and-getting-active-device) should be used instead. + +::: + ### Querying Device Information Retrieve the number of available devices and check if a pointer is allocated on the host or on the active device: diff --git a/icicle/include/icicle/device_api.h b/icicle/include/icicle/device_api.h index 483b5f96bc..0768f135c0 100644 --- a/icicle/include/icicle/device_api.h +++ b/icicle/include/icicle/device_api.h @@ -188,6 +188,7 @@ namespace icicle { public: static eIcicleError set_thread_local_device(const Device& device); + static eIcicleError set_default_device(const Device& device); static const Device& get_thread_local_device(); static const DeviceAPI* get_thread_local_deviceAPI(); static DeviceTracker& get_global_memory_tracker() { return sMemTracker; } diff --git a/icicle/include/icicle/runtime.h b/icicle/include/icicle/runtime.h index a14d80bd84..3dad72986e 100644 --- a/icicle/include/icicle/runtime.h +++ b/icicle/include/icicle/runtime.h @@ -36,6 +36,14 @@ extern "C" eIcicleError icicle_load_backend_from_env_or_default(); */ extern "C" eIcicleError icicle_set_device(const icicle::Device& device); +/** + * @brief Set default device for all threads + * + + * @return eIcicleError::SUCCESS if successful, otherwise throws INVALID_DEVICE + */ +extern "C" eIcicleError icicle_set_default_device(const icicle::Device& device); + /** * @brief Get active device for thread * diff --git a/icicle/src/device_api.cpp b/icicle/src/device_api.cpp index 179b8a3cb4..25ecc9c966 100644 --- a/icicle/src/device_api.cpp +++ b/icicle/src/device_api.cpp @@ -58,6 +58,17 @@ namespace icicle { const Device& get_default_device() { return m_default_device; } + eIcicleError set_default_device(const Device& dev) + { + if (!is_device_registered(dev.type)) { + ICICLE_LOG_ERROR << "Device type " + std::string(dev.type) + " is not valid as it has not been registered"; + return 
eIcicleError::INVALID_DEVICE; + } + + m_default_device = dev; + return eIcicleError::SUCCESS; + } + std::vector get_registered_devices_list() { std::vector registered_devices; @@ -116,6 +127,11 @@ namespace icicle { return default_deviceAPI.get(); } + eIcicleError DeviceAPI::set_default_device(const Device& dev) + { + return DeviceAPIRegistry::Global().set_default_device(dev); + } + /********************************************************************************** */ DeviceAPI* get_deviceAPI(const Device& device) { return DeviceAPIRegistry::Global().get_deviceAPI(device).get(); } diff --git a/icicle/src/runtime.cpp b/icicle/src/runtime.cpp index 8e9028cfcc..eefed8ad2b 100644 --- a/icicle/src/runtime.cpp +++ b/icicle/src/runtime.cpp @@ -14,6 +14,11 @@ using namespace icicle; extern "C" eIcicleError icicle_set_device(const Device& device) { return DeviceAPI::set_thread_local_device(device); } +extern "C" eIcicleError icicle_set_default_device(const Device& device) +{ + return DeviceAPI::set_default_device(device); +} + extern "C" eIcicleError icicle_get_active_device(icicle::Device& device) { const Device& active_device = DeviceAPI::get_thread_local_device(); diff --git a/icicle/tests/test_device_api.cpp b/icicle/tests/test_device_api.cpp index f63d3c9878..8a5ea68874 100644 --- a/icicle/tests/test_device_api.cpp +++ b/icicle/tests/test_device_api.cpp @@ -1,5 +1,6 @@ #include +#include #include #include "icicle/runtime.h" @@ -19,6 +20,36 @@ TEST_F(DeviceApiTest, UnregisteredDeviceError) EXPECT_ANY_THROW(get_deviceAPI(dev)); } +TEST_F(DeviceApiTest, SetDefaultDevice) +{ + icicle::Device active_dev = {UNKOWN_DEVICE, -1}; + + icicle::Device cpu_dev = {s_ref_device, 0}; + EXPECT_NO_THROW(icicle_set_device(cpu_dev)); + EXPECT_NO_THROW(icicle_get_active_device(active_dev)); + + ASSERT_EQ(cpu_dev, active_dev); + + active_dev = {UNKOWN_DEVICE, -1}; + + icicle::Device gpu_dev = {s_main_device, 0}; + EXPECT_NO_THROW(icicle_set_default_device(gpu_dev)); + + // setting a new 
default device doesn't override already set local thread devices + EXPECT_NO_THROW(icicle_get_active_device(active_dev)); + ASSERT_EQ(cpu_dev, active_dev); + + active_dev = {UNKOWN_DEVICE, -1}; + auto thread_func = [&active_dev, &gpu_dev]() { + EXPECT_NO_THROW(icicle_get_active_device(active_dev)); + ASSERT_EQ(gpu_dev, active_dev); + }; + + std::thread worker_thread(thread_func); + + worker_thread.join(); +} + TEST_F(DeviceApiTest, MemoryCopySync) { int input[2] = {1, 2}; diff --git a/wrappers/golang/runtime/device.go b/wrappers/golang/runtime/device.go index fac5b0f09e..7f0965bab3 100644 --- a/wrappers/golang/runtime/device.go +++ b/wrappers/golang/runtime/device.go @@ -50,6 +50,12 @@ func SetDevice(device *Device) EIcicleError { return EIcicleError(cErr) } +func SetDefaultDevice(device *Device) EIcicleError { + cDevice := (*C.Device)(unsafe.Pointer(device)) + cErr := C.icicle_set_default_device(cDevice) + return EIcicleError(cErr) +} + func GetActiveDevice() (*Device, EIcicleError) { device := CreateDevice("invalid", -1) cDevice := (*C.Device)(unsafe.Pointer(&device)) diff --git a/wrappers/golang/runtime/include/runtime.h b/wrappers/golang/runtime/include/runtime.h index 003bb08947..16d4d348ae 100644 --- a/wrappers/golang/runtime/include/runtime.h +++ b/wrappers/golang/runtime/include/runtime.h @@ -13,6 +13,7 @@ typedef struct DeviceProperties DeviceProperties; int icicle_load_backend(const char* path, bool is_recursive); int icicle_load_backend_from_env_or_default(); int icicle_set_device(const Device* device); +int icicle_set_default_device(const Device* device); int icicle_get_active_device(Device* device); int icicle_is_host_memory(const void* ptr); int icicle_is_active_device_memory(const void* ptr); diff --git a/wrappers/golang/runtime/tests/device_test.go b/wrappers/golang/runtime/tests/device_test.go index a4c114389f..3219604c29 100644 --- a/wrappers/golang/runtime/tests/device_test.go +++ b/wrappers/golang/runtime/tests/device_test.go @@ -1,70 +1,122 @@ 
package tests import ( + "fmt" "os/exec" + "runtime" + "strconv" + "strings" + "syscall" "testing" - "github.com/ingonyama-zk/icicle/v3/wrappers/golang/runtime" + icicle_runtime "github.com/ingonyama-zk/icicle/v3/wrappers/golang/runtime" "github.com/stretchr/testify/assert" ) func TestGetDeviceType(t *testing.T) { expectedDeviceName := "test" - config := runtime.CreateDevice(expectedDeviceName, 0) + config := icicle_runtime.CreateDevice(expectedDeviceName, 0) assert.Equal(t, expectedDeviceName, config.GetDeviceType()) expectedDeviceNameLong := "testtesttesttesttesttesttesttesttesttesttesttesttesttesttesttest" - configLargeName := runtime.CreateDevice(expectedDeviceNameLong, 1) + configLargeName := icicle_runtime.CreateDevice(expectedDeviceNameLong, 1) assert.NotEqual(t, expectedDeviceNameLong, configLargeName.GetDeviceType()) } func TestIsDeviceAvailable(t *testing.T) { - runtime.LoadBackendFromEnvOrDefault() - dev := runtime.CreateDevice("CUDA", 0) - _ = runtime.SetDevice(&dev) - res, err := runtime.GetDeviceCount() - - expectedNumDevices, error := exec.Command("nvidia-smi", "-L", "|", "wc", "-l").Output() - if error != nil { - t.Skip("Failed to get number of devices") + dev := icicle_runtime.CreateDevice("CUDA", 0) + _ = icicle_runtime.SetDevice(&dev) + res, err := icicle_runtime.GetDeviceCount() + + smiCommand := exec.Command("nvidia-smi", "-L") + smiCommandStdout, _ := smiCommand.StdoutPipe() + wcCommand := exec.Command("wc", "-l") + wcCommand.Stdin = smiCommandStdout + + smiCommand.Start() + + expectedNumDevicesRaw, wcErr := wcCommand.Output() + smiCommand.Wait() + + expectedNumDevicesAsString := strings.TrimRight(string(expectedNumDevicesRaw), " \n\r\t") + expectedNumDevices, _ := strconv.Atoi(expectedNumDevicesAsString) + if wcErr != nil { + t.Skip("Failed to get number of devices:", wcErr) } - assert.Equal(t, runtime.Success, err) + assert.Equal(t, icicle_runtime.Success, err) assert.Equal(t, expectedNumDevices, res) - err = 
runtime.LoadBackendFromEnvOrDefault() - assert.Equal(t, runtime.Success, err) - devCuda := runtime.CreateDevice("CUDA", 0) - assert.True(t, runtime.IsDeviceAvailable(&devCuda)) - devCpu := runtime.CreateDevice("CPU", 0) - assert.True(t, runtime.IsDeviceAvailable(&devCpu)) - devInvalid := runtime.CreateDevice("invalid", 0) - assert.False(t, runtime.IsDeviceAvailable(&devInvalid)) + assert.Equal(t, icicle_runtime.Success, err) + devCuda := icicle_runtime.CreateDevice("CUDA", 0) + assert.True(t, icicle_runtime.IsDeviceAvailable(&devCuda)) + devCpu := icicle_runtime.CreateDevice("CPU", 0) + assert.True(t, icicle_runtime.IsDeviceAvailable(&devCpu)) + devInvalid := icicle_runtime.CreateDevice("invalid", 0) + assert.False(t, icicle_runtime.IsDeviceAvailable(&devInvalid)) +} + +func TestSetDefaultDevice(t *testing.T) { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + tidOuter := syscall.Gettid() + + gpuDevice := icicle_runtime.CreateDevice("CUDA", 0) + icicle_runtime.SetDefaultDevice(&gpuDevice) + + activeDevice, err := icicle_runtime.GetActiveDevice() + assert.Equal(t, icicle_runtime.Success, err) + assert.Equal(t, gpuDevice, *activeDevice) + + done := make(chan struct{}, 1) + go func() { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + // Ensure we are operating on an OS thread other than the original one + tidInner := syscall.Gettid() + for tidInner == tidOuter { + fmt.Println("Locked thread is the same as original, getting new locked thread") + runtime.UnlockOSThread() + runtime.LockOSThread() + tidInner = syscall.Gettid() + } + + activeDevice, err := icicle_runtime.GetActiveDevice() + assert.Equal(t, icicle_runtime.Success, err) + assert.Equal(t, gpuDevice, *activeDevice) + + close(done) + }() + + <-done + + cpuDevice := icicle_runtime.CreateDevice("CPU", 0) + icicle_runtime.SetDefaultDevice(&cpuDevice) } func TestRegisteredDevices(t *testing.T) { - err := runtime.LoadBackendFromEnvOrDefault() - assert.Equal(t, runtime.Success, err) - devices, 
_ := runtime.GetRegisteredDevices() + devices, _ := icicle_runtime.GetRegisteredDevices() assert.Equal(t, []string{"CUDA", "CPU"}, devices) } func TestDeviceProperties(t *testing.T) { - _, err := runtime.GetDeviceProperties() - assert.Equal(t, runtime.Success, err) + _, err := icicle_runtime.GetDeviceProperties() + assert.Equal(t, icicle_runtime.Success, err) } func TestActiveDevice(t *testing.T) { - runtime.SetDevice(&DEVICE) - activeDevice, err := runtime.GetActiveDevice() - assert.Equal(t, runtime.Success, err) - assert.Equal(t, DEVICE, *activeDevice) - memory1, err := runtime.GetAvailableMemory() - if err == runtime.ApiNotImplemented { - t.Skipf("GetAvailableMemory() function is not implemented on %s device", DEVICE.GetDeviceType()) + devCpu := icicle_runtime.CreateDevice("CUDA", 0) + icicle_runtime.SetDevice(&devCpu) + activeDevice, err := icicle_runtime.GetActiveDevice() + assert.Equal(t, icicle_runtime.Success, err) + assert.Equal(t, devCpu, *activeDevice) + memory1, err := icicle_runtime.GetAvailableMemory() + if err == icicle_runtime.ApiNotImplemented { + t.Skipf("GetAvailableMemory() function is not implemented on %s device", devCpu.GetDeviceType()) } - assert.Equal(t, runtime.Success, err) + assert.Equal(t, icicle_runtime.Success, err) assert.Greater(t, memory1.Total, uint(0)) assert.Greater(t, memory1.Free, uint(0)) } diff --git a/wrappers/golang/runtime/tests/main_test.go b/wrappers/golang/runtime/tests/main_test.go index 226fee7836..800c34816e 100644 --- a/wrappers/golang/runtime/tests/main_test.go +++ b/wrappers/golang/runtime/tests/main_test.go @@ -6,19 +6,7 @@ import ( "github.com/ingonyama-zk/icicle/v3/wrappers/golang/runtime" ) -var DEVICE runtime.Device - func TestMain(m *testing.M) { runtime.LoadBackendFromEnvOrDefault() - devices, e := runtime.GetRegisteredDevices() - if e != runtime.Success { - panic("Failed to load registered devices") - } - for _, deviceType := range devices { - DEVICE = runtime.CreateDevice(deviceType, 0) - 
runtime.SetDevice(&DEVICE) - - // execute tests - m.Run() - } + m.Run() } diff --git a/wrappers/golang/runtime/tests/stream_test.go b/wrappers/golang/runtime/tests/stream_test.go index c32cae8960..8acd7089fa 100644 --- a/wrappers/golang/runtime/tests/stream_test.go +++ b/wrappers/golang/runtime/tests/stream_test.go @@ -8,19 +8,15 @@ import ( ) func TestCreateStream(t *testing.T) { - err := runtime.LoadBackendFromEnvOrDefault() - assert.Equal(t, runtime.Success, err) dev := runtime.CreateDevice("CUDA", 0) assert.True(t, runtime.IsDeviceAvailable(&dev)) - err = runtime.SetDevice(&dev) + err := runtime.SetDevice(&dev) assert.Equal(t, runtime.Success, err) _, err = runtime.CreateStream() assert.Equal(t, runtime.Success, err, "Unable to create stream due to %d", err) } func TestDestroyStream(t *testing.T) { - err := runtime.LoadBackendFromEnvOrDefault() - assert.Equal(t, runtime.Success, err) dev := runtime.CreateDevice("CUDA", 0) assert.True(t, runtime.IsDeviceAvailable(&dev)) stream, err := runtime.CreateStream() @@ -31,8 +27,6 @@ func TestDestroyStream(t *testing.T) { } func TestSyncStream(t *testing.T) { - err := runtime.LoadBackendFromEnvOrDefault() - assert.Equal(t, runtime.Success, err) dev := runtime.CreateDevice("CUDA", 0) assert.True(t, runtime.IsDeviceAvailable(&dev)) runtime.SetDevice(&dev) diff --git a/wrappers/rust/icicle-runtime/src/device.rs b/wrappers/rust/icicle-runtime/src/device.rs index 9248a805bd..ee8f2de677 100644 --- a/wrappers/rust/icicle-runtime/src/device.rs +++ b/wrappers/rust/icicle-runtime/src/device.rs @@ -4,7 +4,7 @@ use std::os::raw::c_char; const MAX_TYPE_SIZE: usize = 64; -#[derive(Clone)] +#[derive(Clone, PartialEq)] #[repr(C)] pub struct Device { device_type: [c_char; MAX_TYPE_SIZE], diff --git a/wrappers/rust/icicle-runtime/src/runtime.rs b/wrappers/rust/icicle-runtime/src/runtime.rs index c1c88d162c..fce1733a5d 100644 --- a/wrappers/rust/icicle-runtime/src/runtime.rs +++ b/wrappers/rust/icicle-runtime/src/runtime.rs @@ -11,6 +11,7 
@@ extern "C" { fn icicle_load_backend(path: *const c_char, is_recursive: bool) -> eIcicleError; fn icicle_load_backend_from_env_or_default() -> eIcicleError; fn icicle_set_device(device: &Device) -> eIcicleError; + fn icicle_set_default_device(device: &Device) -> eIcicleError; fn icicle_get_active_device(device: &mut Device) -> eIcicleError; fn icicle_is_host_memory(ptr: *const c_void) -> eIcicleError; fn icicle_is_active_device_memory(ptr: *const c_void) -> eIcicleError; @@ -66,6 +67,15 @@ pub fn set_device(device: &Device) -> Result<(), eIcicleError> { } } +pub fn set_default_device(device: &Device) -> Result<(), eIcicleError> { + let result = unsafe { icicle_set_default_device(device) }; + if result == eIcicleError::Success { + Ok(()) + } else { + Err(result) + } +} + pub fn get_active_device() -> Result { let mut device: Device = Device::new("invalid", -1); unsafe { icicle_get_active_device(&mut device).wrap_value::(device) } diff --git a/wrappers/rust/icicle-runtime/src/tests.rs b/wrappers/rust/icicle-runtime/src/tests.rs index e2a22b3c44..5555dd6d3f 100644 --- a/wrappers/rust/icicle-runtime/src/tests.rs +++ b/wrappers/rust/icicle-runtime/src/tests.rs @@ -6,6 +6,7 @@ mod tests { use crate::test_utilities; use crate::*; use std::sync::Once; + use std::thread; static INIT: Once = Once::new(); @@ -28,6 +29,37 @@ mod tests { test_utilities::test_set_ref_device(); } + #[test] + fn test_set_default_device() { + initialize(); + + // block scope is necessary in order to free the mutex lock + // to be used by the spawned thread + let outer_thread_id = thread::current().id(); + { + let main_device = test_utilities::TEST_MAIN_DEVICE + .lock() + .unwrap(); + set_default_device(&main_device).unwrap(); + + let active_device = get_active_device().unwrap(); + assert_eq!(*main_device, active_device); + } + + let handle = thread::spawn(move || { + let inner_thread_id = thread::current().id(); + assert_ne!(outer_thread_id, inner_thread_id); + + let active_device = 
get_active_device().unwrap(); + let main_device = test_utilities::TEST_MAIN_DEVICE + .lock() + .unwrap(); + assert_eq!(*main_device, active_device); + }); + + let _ = handle.join(); + } + #[test] fn test_sync_memory_copy() { initialize(); From 33c708fd2361404586e854eb01f861b376f9f45c Mon Sep 17 00:00:00 2001 From: HadarIngonyama <102164010+HadarIngonyama@users.noreply.github.com> Date: Mon, 6 Jan 2025 18:10:21 +0200 Subject: [PATCH 03/12] Hadar/b const mult (#723) --- .../include/icicle/curves/params/bls12_377.h | 8 + .../include/icicle/curves/params/bls12_381.h | 8 + icicle/include/icicle/curves/params/bn254.h | 8 + icicle/include/icicle/curves/params/bw6_761.h | 17 +- .../include/icicle/curves/params/grumpkin.h | 10 +- icicle/include/icicle/curves/projective.h | 196 +++++++++--------- .../include/icicle/fields/complex_extension.h | 84 ++++++++ icicle/include/icicle/fields/field.h | 36 ++++ 8 files changed, 262 insertions(+), 105 deletions(-) diff --git a/icicle/include/icicle/curves/params/bls12_377.h b/icicle/include/icicle/curves/params/bls12_377.h index da19ab2bda..6ad767e5f3 100644 --- a/icicle/include/icicle/curves/params/bls12_377.h +++ b/icicle/include/icicle/curves/params/bls12_377.h @@ -25,6 +25,9 @@ namespace bls12_377 { static constexpr point_field_t weierstrass_b = {0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + + static constexpr bool is_b_u32 = true; + static constexpr bool is_b_neg = false; }; // G1 struct G2 { @@ -47,6 +50,11 @@ namespace bls12_377 { 0x3c6bf800, 0x129207b6, 0xcd5fd889, 0xdc7b4f91, 0x7460c589, 0x43bd0373, 0xdb0fd6f3, 0x010222f6}; + static constexpr bool is_b_u32_g2_re = true; + static constexpr bool is_b_neg_g2_re = false; + static constexpr bool is_b_u32_g2_im = false; + static constexpr bool is_b_neg_g2_im = false; + static constexpr g2_point_field_t gen_x = {g2_gen_x_re, g2_gen_x_im}; static constexpr g2_point_field_t gen_y = 
{g2_gen_y_re, g2_gen_y_im}; static constexpr g2_point_field_t weierstrass_b = {weierstrass_b_g2_re, weierstrass_b_g2_im}; diff --git a/icicle/include/icicle/curves/params/bls12_381.h b/icicle/include/icicle/curves/params/bls12_381.h index 7457bd49c5..18fbf97496 100644 --- a/icicle/include/icicle/curves/params/bls12_381.h +++ b/icicle/include/icicle/curves/params/bls12_381.h @@ -25,6 +25,9 @@ namespace bls12_381 { static constexpr point_field_t weierstrass_b = {0x00000004, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + + static constexpr bool is_b_u32 = true; + static constexpr bool is_b_neg = false; }; struct G2 { @@ -48,6 +51,11 @@ namespace bls12_381 { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + static constexpr bool is_b_u32_g2_re = true; + static constexpr bool is_b_neg_g2_re = false; + static constexpr bool is_b_u32_g2_im = true; + static constexpr bool is_b_neg_g2_im = false; + static constexpr g2_point_field_t gen_x = {g2_gen_x_re, g2_gen_x_im}; static constexpr g2_point_field_t gen_y = {g2_gen_y_re, g2_gen_y_im}; static constexpr g2_point_field_t weierstrass_b = {weierstrass_b_g2_re, weierstrass_b_g2_im}; diff --git a/icicle/include/icicle/curves/params/bn254.h b/icicle/include/icicle/curves/params/bn254.h index b8095b53fe..0a628afcb9 100644 --- a/icicle/include/icicle/curves/params/bn254.h +++ b/icicle/include/icicle/curves/params/bn254.h @@ -24,6 +24,9 @@ namespace bn254 { 0x00000000, 0x00000000, 0x00000000, 0x00000000}; static constexpr point_field_t weierstrass_b = {0x00000003, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + + static constexpr bool is_b_u32 = true; + static constexpr bool is_b_neg = false; }; // G1 struct G2 { @@ -40,6 +43,11 @@ namespace bn254 { static constexpr point_field_t weierstrass_b_g2_im = {0x85c315d2, 0xe4a2bd06, 0xe52d1852, 0xa74fa084, 0xeed8fdf4, 
0xcd2cafad, 0x3af0fed4, 0x009713b0}; + static constexpr bool is_b_u32_g2_re = false; + static constexpr bool is_b_neg_g2_re = false; + static constexpr bool is_b_u32_g2_im = false; + static constexpr bool is_b_neg_g2_im = false; + static constexpr g2_point_field_t gen_x = {g2_gen_x_re, g2_gen_x_im}; static constexpr g2_point_field_t gen_y = {g2_gen_y_re, g2_gen_y_im}; static constexpr g2_point_field_t weierstrass_b = {weierstrass_b_g2_re, weierstrass_b_g2_im}; diff --git a/icicle/include/icicle/curves/params/bw6_761.h b/icicle/include/icicle/curves/params/bw6_761.h index c0a1fcf5ed..7760c2023b 100644 --- a/icicle/include/icicle/curves/params/bw6_761.h +++ b/icicle/include/icicle/curves/params/bw6_761.h @@ -25,10 +25,18 @@ namespace bw6_761 { 0xb3053253, 0x9f9df141, 0x6fc2cdd4, 0xbe3fb90b, 0x717a4c55, 0xcc685d31, 0x71b5b806, 0xc5b8fa17, 0xaf7e0dba, 0x265909f1, 0xa2e573a3, 0x1a7348d2, 0x884c9ec6, 0x0f952589, 0x45cc2a42, 0xe6fd637b, 0x0a6fc574, 0x0058b84e}; + // actual value: + // static constexpr point_field_t weierstrass_b = { + // 0x0000008a, 0xf49d0000, 0x70000082, 0xe6913e68, 0xeaf0a437, 0x160cf8ae, 0x5667a8f8, 0x98a116c2, + // 0x73ebff2e, 0x71dcd3dc, 0x12f9fd90, 0x8689c8ed, 0x25b42304, 0x03cebaff, 0xe584e919, 0x707ba638, + // 0x8087be41, 0x528275ef, 0x81d14688, 0xb926186a, 0x04faff3e, 0xd187c940, 0xfb83ce0a, 0x0122e824}; static constexpr point_field_t weierstrass_b = { - 0x0000008a, 0xf49d0000, 0x70000082, 0xe6913e68, 0xeaf0a437, 0x160cf8ae, 0x5667a8f8, 0x98a116c2, - 0x73ebff2e, 0x71dcd3dc, 0x12f9fd90, 0x8689c8ed, 0x25b42304, 0x03cebaff, 0xe584e919, 0x707ba638, - 0x8087be41, 0x528275ef, 0x81d14688, 0xb926186a, 0x04faff3e, 0xd187c940, 0xfb83ce0a, 0x0122e824}; + 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + + static 
constexpr bool is_b_u32 = true; + static constexpr bool is_b_neg = true; }; struct G2 { @@ -44,5 +52,8 @@ namespace bw6_761 { 0x00000004, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + + static constexpr bool is_b_u32 = true; + static constexpr bool is_b_neg = false; }; } // namespace bw6_761 diff --git a/icicle/include/icicle/curves/params/grumpkin.h b/icicle/include/icicle/curves/params/grumpkin.h index a468fd911d..d09390bccc 100644 --- a/icicle/include/icicle/curves/params/grumpkin.h +++ b/icicle/include/icicle/curves/params/grumpkin.h @@ -16,7 +16,13 @@ namespace grumpkin { static constexpr point_field_t gen_y = {0x823f272c, 0x833fc48d, 0xf1181294, 0x2d270d45, 0x6a45d63, 0xcf135e75, 0x00000002, 0x00000000}; - static constexpr point_field_t weierstrass_b = {0xeffffff0, 0x43e1f593, 0x79b97091, 0x2833e848, - 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72}; + // actual value: + // static constexpr point_field_t weierstrass_b = {0xeffffff0, 0x43e1f593, 0x79b97091, 0x2833e848, + // 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72}; + static constexpr point_field_t weierstrass_b = {0x00000011, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + + static constexpr bool is_b_u32 = true; + static constexpr bool is_b_neg = true; }; // G1 } // namespace grumpkin diff --git a/icicle/include/icicle/curves/projective.h b/icicle/include/icicle/curves/projective.h index 87fe0ab238..17cf9329c8 100644 --- a/icicle/include/icicle/curves/projective.h +++ b/icicle/include/icicle/curves/projective.h @@ -58,70 +58,68 @@ class Projective const FF Z = point.z; // TODO: Change to efficient dbl once implemented for field.cuh - FF t0 = FF::sqr(Y); // 1. t0 โ† Y ยท Y - FF Z3 = t0 + t0; // 2. Z3 โ† t0 + t0 - Z3 = Z3 + Z3; // 3. 
Z3 โ† Z3 + Z3 - Z3 = Z3 + Z3; // 4. Z3 โ† Z3 + Z3 - FF t1 = Y * Z; // 5. t1 โ† Y ยท Z - FF t2 = FF::sqr(Z); // 6. t2 โ† Z ยท Z - t2 = FF::template mul_unsigned<3>(FF::template mul_const(t2)); // 7. t2 โ† b3 ยท t2 - FF X3 = t2 * Z3; // 8. X3 โ† t2 ยท Z3 - FF Y3 = t0 + t2; // 9. Y3 โ† t0 + t2 - Z3 = t1 * Z3; // 10. Z3 โ† t1 ยท Z3 - t1 = t2 + t2; // 11. t1 โ† t2 + t2 - t2 = t1 + t2; // 12. t2 โ† t1 + t2 - t0 = t0 - t2; // 13. t0 โ† t0 โˆ’ t2 - Y3 = t0 * Y3; // 14. Y3 โ† t0 ยท Y3 - Y3 = X3 + Y3; // 15. Y3 โ† X3 + Y3 - t1 = X * Y; // 16. t1 โ† X ยท Y - X3 = t0 * t1; // 17. X3 โ† t0 ยท t1 - X3 = X3 + X3; // 18. X3 โ† X3 + X3 + FF t0 = FF::sqr(Y); // 1. t0 โ† Y ยท Y + FF Z3 = t0 + t0; // 2. Z3 โ† t0 + t0 + Z3 = Z3 + Z3; // 3. Z3 โ† Z3 + Z3 + Z3 = Z3 + Z3; // 4. Z3 โ† Z3 + Z3 + FF t1 = Y * Z; // 5. t1 โ† Y ยท Z + FF t2 = FF::sqr(Z); // 6. t2 โ† Z ยท Z + t2 = FF::template mul_weierstrass_b(t2); // 7. t2 โ† b3 ยท t2 + FF X3 = t2 * Z3; // 8. X3 โ† t2 ยท Z3 + FF Y3 = t0 + t2; // 9. Y3 โ† t0 + t2 + Z3 = t1 * Z3; // 10. Z3 โ† t1 ยท Z3 + t1 = t2 + t2; // 11. t1 โ† t2 + t2 + t2 = t1 + t2; // 12. t2 โ† t1 + t2 + t0 = t0 - t2; // 13. t0 โ† t0 โˆ’ t2 + Y3 = t0 * Y3; // 14. Y3 โ† t0 ยท Y3 + Y3 = X3 + Y3; // 15. Y3 โ† X3 + Y3 + t1 = X * Y; // 16. t1 โ† X ยท Y + X3 = t0 * t1; // 17. X3 โ† t0 ยท t1 + X3 = X3 + X3; // 18. 
X3 โ† X3 + X3 return {X3, Y3, Z3}; } friend HOST_DEVICE Projective operator+(Projective p1, const Projective& p2) { - const FF X1 = p1.x; // < 2 - const FF Y1 = p1.y; // < 2 - const FF Z1 = p1.z; // < 2 - const FF X2 = p2.x; // < 2 - const FF Y2 = p2.y; // < 2 - const FF Z2 = p2.z; // < 2 - const FF t00 = X1 * X2; // t00 โ† X1 ยท X2 < 2 - const FF t01 = Y1 * Y2; // t01 โ† Y1 ยท Y2 < 2 - const FF t02 = Z1 * Z2; // t02 โ† Z1 ยท Z2 < 2 - const FF t03 = X1 + Y1; // t03 โ† X1 + Y1 < 4 - const FF t04 = X2 + Y2; // t04 โ† X2 + Y2 < 4 - const FF t05 = t03 * t04; // t03 โ† t03 ยท t04 < 3 - const FF t06 = t00 + t01; // t06 โ† t00 + t01 < 4 - const FF t07 = t05 - t06; // t05 โ† t05 โˆ’ t06 < 2 - const FF t08 = Y1 + Z1; // t08 โ† Y1 + Z1 < 4 - const FF t09 = Y2 + Z2; // t09 โ† Y2 + Z2 < 4 - const FF t10 = t08 * t09; // t10 โ† t08 ยท t09 < 3 - const FF t11 = t01 + t02; // t11 โ† t01 + t02 < 4 - const FF t12 = t10 - t11; // t12 โ† t10 โˆ’ t11 < 2 - const FF t13 = X1 + Z1; // t13 โ† X1 + Z1 < 4 - const FF t14 = X2 + Z2; // t14 โ† X2 + Z2 < 4 - const FF t15 = t13 * t14; // t15 โ† t13 ยท t14 < 3 - const FF t16 = t00 + t02; // t16 โ† t00 + t02 < 4 - const FF t17 = t15 - t16; // t17 โ† t15 โˆ’ t16 < 2 - const FF t18 = t00 + t00; // t18 โ† t00 + t00 < 2 - const FF t19 = t18 + t00; // t19 โ† t18 + t00 < 2 - const FF t20 = - FF::template mul_unsigned<3>(FF::template mul_const(t02)); // t20 โ† b3 ยท t02 < 2 - const FF t21 = t01 + t20; // t21 โ† t01 + t20 < 2 - const FF t22 = t01 - t20; // t22 โ† t01 โˆ’ t20 < 2 - const FF t23 = - FF::template mul_unsigned<3>(FF::template mul_const(t17)); // t23 โ† b3 ยท t17 < 2 - const auto t24 = FF::mul_wide(t12, t23); // t24 โ† t12 ยท t23 < 2 - const auto t25 = FF::mul_wide(t07, t22); // t25 โ† t07 ยท t22 < 2 - const FF X3 = FF::reduce(t25 - t24); // X3 โ† t25 โˆ’ t24 < 2 - const auto t27 = FF::mul_wide(t23, t19); // t27 โ† t23 ยท t19 < 2 - const auto t28 = FF::mul_wide(t22, t21); // t28 โ† t22 ยท t21 < 2 - const FF Y3 = 
FF::reduce(t28 + t27); // Y3 โ† t28 + t27 < 2 - const auto t30 = FF::mul_wide(t19, t07); // t30 โ† t19 ยท t07 < 2 - const auto t31 = FF::mul_wide(t21, t12); // t31 โ† t21 ยท t12 < 2 - const FF Z3 = FF::reduce(t31 + t30); // Z3 โ† t31 + t30 < 2 + const FF X1 = p1.x; // < 2 + const FF Y1 = p1.y; // < 2 + const FF Z1 = p1.z; // < 2 + const FF X2 = p2.x; // < 2 + const FF Y2 = p2.y; // < 2 + const FF Z2 = p2.z; // < 2 + const FF t00 = X1 * X2; // t00 โ† X1 ยท X2 < 2 + const FF t01 = Y1 * Y2; // t01 โ† Y1 ยท Y2 < 2 + const FF t02 = Z1 * Z2; // t02 โ† Z1 ยท Z2 < 2 + const FF t03 = X1 + Y1; // t03 โ† X1 + Y1 < 4 + const FF t04 = X2 + Y2; // t04 โ† X2 + Y2 < 4 + const FF t05 = t03 * t04; // t03 โ† t03 ยท t04 < 3 + const FF t06 = t00 + t01; // t06 โ† t00 + t01 < 4 + const FF t07 = t05 - t06; // t05 โ† t05 โˆ’ t06 < 2 + const FF t08 = Y1 + Z1; // t08 โ† Y1 + Z1 < 4 + const FF t09 = Y2 + Z2; // t09 โ† Y2 + Z2 < 4 + const FF t10 = t08 * t09; // t10 โ† t08 ยท t09 < 3 + const FF t11 = t01 + t02; // t11 โ† t01 + t02 < 4 + const FF t12 = t10 - t11; // t12 โ† t10 โˆ’ t11 < 2 + const FF t13 = X1 + Z1; // t13 โ† X1 + Z1 < 4 + const FF t14 = X2 + Z2; // t14 โ† X2 + Z2 < 4 + const FF t15 = t13 * t14; // t15 โ† t13 ยท t14 < 3 + const FF t16 = t00 + t02; // t16 โ† t00 + t02 < 4 + const FF t17 = t15 - t16; // t17 โ† t15 โˆ’ t16 < 2 + const FF t18 = t00 + t00; // t18 โ† t00 + t00 < 2 + const FF t19 = t18 + t00; // t19 โ† t18 + t00 < 2 + const FF t20 = FF::template mul_weierstrass_b(t02); // t20 โ† b3 ยท t02 < 2 + const FF t21 = t01 + t20; // t21 โ† t01 + t20 < 2 + const FF t22 = t01 - t20; // t22 โ† t01 โˆ’ t20 < 2 + const FF t23 = FF::template mul_weierstrass_b(t17); // t23 โ† b3 ยท t17 < 2 + const auto t24 = FF::mul_wide(t12, t23); // t24 โ† t12 ยท t23 < 2 + const auto t25 = FF::mul_wide(t07, t22); // t25 โ† t07 ยท t22 < 2 + const FF X3 = FF::reduce(t25 - t24); // X3 โ† t25 โˆ’ t24 < 2 + const auto t27 = FF::mul_wide(t23, t19); // t27 โ† t23 ยท t19 < 2 + 
const auto t28 = FF::mul_wide(t22, t21); // t28 โ† t22 ยท t21 < 2 + const FF Y3 = FF::reduce(t28 + t27); // Y3 โ† t28 + t27 < 2 + const auto t30 = FF::mul_wide(t19, t07); // t30 โ† t19 ยท t07 < 2 + const auto t31 = FF::mul_wide(t21, t12); // t31 โ† t21 ยท t12 < 2 + const FF Z3 = FF::reduce(t31 + t30); // Z3 โ† t31 + t30 < 2 return {X3, Y3, Z3}; } @@ -129,46 +127,44 @@ class Projective friend HOST_DEVICE Projective operator+(Projective p1, const Affine& p2) { - const FF X1 = p1.x; // < 2 - const FF Y1 = p1.y; // < 2 - const FF Z1 = p1.z; // < 2 - const FF X2 = p2.x; // < 2 - const FF Y2 = p2.y; // < 2 - const FF t00 = X1 * X2; // t00 โ† X1 ยท X2 < 2 - const FF t01 = Y1 * Y2; // t01 โ† Y1 ยท Y2 < 2 - const FF t02 = Z1; // t02 โ† Z1 < 2 - const FF t03 = X1 + Y1; // t03 โ† X1 + Y1 < 4 - const FF t04 = X2 + Y2; // t04 โ† X2 + Y2 < 4 - const FF t05 = t03 * t04; // t03 โ† t03 ยท t04 < 3 - const FF t06 = t00 + t01; // t06 โ† t00 + t01 < 4 - const FF t07 = t05 - t06; // t05 โ† t05 โˆ’ t06 < 2 - const FF t08 = Y1 + Z1; // t08 โ† Y1 + Z1 < 4 - const FF t09 = Y2 + FF::one(); // t09 โ† Y2 + 1 < 4 - const FF t10 = t08 * t09; // t10 โ† t08 ยท t09 < 3 - const FF t11 = t01 + t02; // t11 โ† t01 + t02 < 4 - const FF t12 = t10 - t11; // t12 โ† t10 โˆ’ t11 < 2 - const FF t13 = X1 + Z1; // t13 โ† X1 + Z1 < 4 - const FF t14 = X2 + FF::one(); // t14 โ† X2 + 1 < 4 - const FF t15 = t13 * t14; // t15 โ† t13 ยท t14 < 3 - const FF t16 = t00 + t02; // t16 โ† t00 + t02 < 4 - const FF t17 = t15 - t16; // t17 โ† t15 โˆ’ t16 < 2 - const FF t18 = t00 + t00; // t18 โ† t00 + t00 < 2 - const FF t19 = t18 + t00; // t19 โ† t18 + t00 < 2 - const FF t20 = - FF::template mul_unsigned<3>(FF::template mul_const(t02)); // t20 โ† b3 ยท t02 < 2 - const FF t21 = t01 + t20; // t21 โ† t01 + t20 < 2 - const FF t22 = t01 - t20; // t22 โ† t01 โˆ’ t20 < 2 - const FF t23 = - FF::template mul_unsigned<3>(FF::template mul_const(t17)); // t23 โ† b3 ยท t17 < 2 - const auto t24 = 
FF::mul_wide(t12, t23); // t24 โ† t12 ยท t23 < 2 - const auto t25 = FF::mul_wide(t07, t22); // t25 โ† t07 ยท t22 < 2 - const FF X3 = FF::reduce(t25 - t24); // X3 โ† t25 โˆ’ t24 < 2 - const auto t27 = FF::mul_wide(t23, t19); // t27 โ† t23 ยท t19 < 2 - const auto t28 = FF::mul_wide(t22, t21); // t28 โ† t22 ยท t21 < 2 - const FF Y3 = FF::reduce(t28 + t27); // Y3 โ† t28 + t27 < 2 - const auto t30 = FF::mul_wide(t19, t07); // t30 โ† t19 ยท t07 < 2 - const auto t31 = FF::mul_wide(t21, t12); // t31 โ† t21 ยท t12 < 2 - const FF Z3 = FF::reduce(t31 + t30); // Z3 โ† t31 + t30 < 2 + const FF X1 = p1.x; // < 2 + const FF Y1 = p1.y; // < 2 + const FF Z1 = p1.z; // < 2 + const FF X2 = p2.x; // < 2 + const FF Y2 = p2.y; // < 2 + const FF t00 = X1 * X2; // t00 โ† X1 ยท X2 < 2 + const FF t01 = Y1 * Y2; // t01 โ† Y1 ยท Y2 < 2 + const FF t02 = Z1; // t02 โ† Z1 < 2 + const FF t03 = X1 + Y1; // t03 โ† X1 + Y1 < 4 + const FF t04 = X2 + Y2; // t04 โ† X2 + Y2 < 4 + const FF t05 = t03 * t04; // t03 โ† t03 ยท t04 < 3 + const FF t06 = t00 + t01; // t06 โ† t00 + t01 < 4 + const FF t07 = t05 - t06; // t05 โ† t05 โˆ’ t06 < 2 + const FF t08 = Y1 + Z1; // t08 โ† Y1 + Z1 < 4 + const FF t09 = Y2 + FF::one(); // t09 โ† Y2 + 1 < 4 + const FF t10 = t08 * t09; // t10 โ† t08 ยท t09 < 3 + const FF t11 = t01 + t02; // t11 โ† t01 + t02 < 4 + const FF t12 = t10 - t11; // t12 โ† t10 โˆ’ t11 < 2 + const FF t13 = X1 + Z1; // t13 โ† X1 + Z1 < 4 + const FF t14 = X2 + FF::one(); // t14 โ† X2 + 1 < 4 + const FF t15 = t13 * t14; // t15 โ† t13 ยท t14 < 3 + const FF t16 = t00 + t02; // t16 โ† t00 + t02 < 4 + const FF t17 = t15 - t16; // t17 โ† t15 โˆ’ t16 < 2 + const FF t18 = t00 + t00; // t18 โ† t00 + t00 < 2 + const FF t19 = t18 + t00; // t19 โ† t18 + t00 < 2 + const FF t20 = FF::template mul_weierstrass_b(t02); // t20 โ† b3 ยท t02 < 2 + const FF t21 = t01 + t20; // t21 โ† t01 + t20 < 2 + const FF t22 = t01 - t20; // t22 โ† t01 โˆ’ t20 < 2 + const FF t23 = FF::template 
mul_weierstrass_b(t17); // t23 โ† b3 ยท t17 < 2 + const auto t24 = FF::mul_wide(t12, t23); // t24 โ† t12 ยท t23 < 2 + const auto t25 = FF::mul_wide(t07, t22); // t25 โ† t07 ยท t22 < 2 + const FF X3 = FF::reduce(t25 - t24); // X3 โ† t25 โˆ’ t24 < 2 + const auto t27 = FF::mul_wide(t23, t19); // t27 โ† t23 ยท t19 < 2 + const auto t28 = FF::mul_wide(t22, t21); // t28 โ† t22 ยท t21 < 2 + const FF Y3 = FF::reduce(t28 + t27); // Y3 โ† t28 + t27 < 2 + const auto t30 = FF::mul_wide(t19, t07); // t30 โ† t19 ยท t07 < 2 + const auto t31 = FF::mul_wide(t21, t12); // t31 โ† t21 ยท t12 < 2 + const FF Z3 = FF::reduce(t31 + t30); // Z3 โ† t31 + t30 < 2 return {X3, Y3, Z3}; } @@ -235,7 +231,7 @@ class Projective { if (is_zero(point)) return true; bool eq_holds = - (FF::template mul_const(FF::sqr(point.z) * point.z) + FF::sqr(point.x) * point.x == + (FF::template mul_weierstrass_b(FF::sqr(point.z) * point.z) + FF::sqr(point.x) * point.x == point.z * FF::sqr(point.y)); return point.z != FF::zero() && eq_holds; } diff --git a/icicle/include/icicle/fields/complex_extension.h b/icicle/include/icicle/fields/complex_extension.h index 42740859fb..c65ad37ec4 100644 --- a/icicle/include/icicle/fields/complex_extension.h +++ b/icicle/include/icicle/fields/complex_extension.h @@ -158,6 +158,90 @@ class ComplexExtensionField return !(xs == ys); } + template + static HOST_DEVICE_INLINE FF mul_weierstrass_b_real(const FF& xs) + { + FF r = {}; + constexpr FF b_mult = []() { + FF b_mult = FF{Gen::weierstrass_b_g2_re}; + if constexpr (!IS_3B) return b_mult; + typename FF::ff_storage temp = {}; + typename FF::ff_storage modulus = FF::get_modulus(); + host_math::template add_sub_limbs( + b_mult.limbs_storage, b_mult.limbs_storage, b_mult.limbs_storage); + b_mult.limbs_storage = + host_math::template add_sub_limbs(b_mult.limbs_storage, modulus, temp) + ? 
b_mult.limbs_storage + : temp; + host_math::template add_sub_limbs( + b_mult.limbs_storage, FF{Gen::weierstrass_b_g2_re}.limbs_storage, b_mult.limbs_storage); + b_mult.limbs_storage = + host_math::template add_sub_limbs(b_mult.limbs_storage, modulus, temp) + ? b_mult.limbs_storage + : temp; + return b_mult; + }(); + if constexpr (Gen::is_b_u32_g2_re) { + r = FF::template mul_unsigned(xs); + if constexpr (Gen::is_b_neg_g2_re) + return FF::neg(r); + else { + return r; + } + } else { + return b_mult * xs; + } + } + + template + static HOST_DEVICE_INLINE FF mul_weierstrass_b_imag(const FF& xs) + { + FF r = {}; + constexpr FF b_mult = []() { + FF b_mult = FF{Gen::weierstrass_b_g2_im}; + if constexpr (!IS_3B) return b_mult; + typename FF::ff_storage temp = {}; + typename FF::ff_storage modulus = FF::get_modulus(); + host_math::template add_sub_limbs( + b_mult.limbs_storage, b_mult.limbs_storage, b_mult.limbs_storage); + b_mult.limbs_storage = + host_math::template add_sub_limbs(b_mult.limbs_storage, modulus, temp) + ? b_mult.limbs_storage + : temp; + host_math::template add_sub_limbs( + b_mult.limbs_storage, FF{Gen::weierstrass_b_g2_im}.limbs_storage, b_mult.limbs_storage); + b_mult.limbs_storage = + host_math::template add_sub_limbs(b_mult.limbs_storage, modulus, temp) + ? 
b_mult.limbs_storage + : temp; + return b_mult; + }(); + if constexpr (Gen::is_b_u32_g2_im) { + r = FF::template mul_unsigned(xs); + if constexpr (Gen::is_b_neg_g2_im) + return FF::neg(r); + else { + return r; + } + } else { + return b_mult * xs; + } + } + + template + static HOST_DEVICE_INLINE ComplexExtensionField mul_weierstrass_b(const ComplexExtensionField& xs) + { + const FF xs_real = xs.real; + const FF xs_imaginary = xs.imaginary; + FF real_prod = mul_weierstrass_b_real(xs_real); + FF imaginary_prod = mul_weierstrass_b_imag(xs_imaginary); + FF re_im = mul_weierstrass_b_real(xs_imaginary); + FF im_re = mul_weierstrass_b_imag(xs_real); + FF nonresidue_times_im = FF::template mul_unsigned(imaginary_prod); + nonresidue_times_im = CONFIG::nonresidue_is_negative ? FF::neg(nonresidue_times_im) : nonresidue_times_im; + return ComplexExtensionField{real_prod + nonresidue_times_im, re_im + im_re}; + } + template static HOST_DEVICE_INLINE ComplexExtensionField mul_const(const ComplexExtensionField& xs) { diff --git a/icicle/include/icicle/fields/field.h b/icicle/include/icicle/fields/field.h index 80c4b4178c..3563424c00 100644 --- a/icicle/include/icicle/fields/field.h +++ b/icicle/include/icicle/fields/field.h @@ -875,6 +875,42 @@ class Field friend HOST_DEVICE bool operator!=(const Field& xs, const Field& ys) { return !(xs == ys); } + template + static HOST_DEVICE_INLINE Field mul_weierstrass_b(const Field& xs) + { + Field r = {}; + constexpr Field b_mult = []() { + Field b_mult = Field{Gen::weierstrass_b}; + if constexpr (!IS_3B) return b_mult; + ff_storage temp = {}; + ff_storage modulus = get_modulus<>(); + host_math::template add_sub_limbs( + b_mult.limbs_storage, b_mult.limbs_storage, b_mult.limbs_storage); + b_mult.limbs_storage = + host_math::template add_sub_limbs(b_mult.limbs_storage, modulus, temp) + ? 
b_mult.limbs_storage + : temp; + host_math::template add_sub_limbs( + b_mult.limbs_storage, Field{Gen::weierstrass_b}.limbs_storage, b_mult.limbs_storage); + b_mult.limbs_storage = + host_math::template add_sub_limbs(b_mult.limbs_storage, modulus, temp) + ? b_mult.limbs_storage + : temp; + return b_mult; + }(); + + if constexpr (Gen::is_b_u32) { // assumes that 3b is also u32 + r = mul_unsigned(xs); + if constexpr (Gen::is_b_neg) + return neg(r); + else { + return r; + } + } else { + return b_mult * xs; + } + } + template static HOST_DEVICE_INLINE Field mul_const(const Field& xs) { From 56f406e4d26870999e2317b282086a1a87939307 Mon Sep 17 00:00:00 2001 From: yshekel Date: Tue, 7 Jan 2025 11:23:27 +0200 Subject: [PATCH 04/12] Support vulkan backend in build system (#703) - Added a script to pull Vulkan private backend sources. - Updated CMake configuration to integrate Vulkan support in the build process. - Introduced a Rust feature flag for building the Vulkan backend (requires access to private sources). - Refined the test device API for improved compatibility and functionality. 
--- examples/c++/examples_utils.h | 2 +- .../arkworks-icicle-conversions/Cargo.toml | 23 ++++-- examples/rust/hash-and-merkle/Cargo.toml | 28 ++++--- examples/rust/msm/Cargo.toml | 18 ++++- examples/rust/ntt/Cargo.toml | 16 +++- examples/rust/polynomials/Cargo.toml | 22 ++++-- examples/rust/poseidon2/Cargo.toml | 26 +++++-- icicle/CMakeLists.txt | 57 ++------------ icicle/cmake/backend_include.cmake | 74 +++++++++++++++++++ icicle/tests/test_device_api.cpp | 73 +++++++++++------- scripts/pull_metal_backend.sh | 2 - scripts/pull_vulkan_backend.sh | 42 +++++++++++ .../icicle-curves/icicle-bls12-377/Cargo.toml | 3 +- .../icicle-curves/icicle-bls12-377/build.rs | 5 ++ .../icicle-curves/icicle-bls12-381/Cargo.toml | 2 + .../icicle-curves/icicle-bls12-381/build.rs | 5 ++ .../icicle-curves/icicle-bn254/Cargo.toml | 2 + .../rust/icicle-curves/icicle-bn254/build.rs | 5 ++ .../icicle-curves/icicle-bw6-761/build.rs | 5 ++ .../icicle-curves/icicle-grumpkin/Cargo.toml | 2 + .../icicle-curves/icicle-grumpkin/build.rs | 5 ++ .../icicle-fields/icicle-babybear/Cargo.toml | 4 +- .../icicle-fields/icicle-babybear/build.rs | 5 ++ .../icicle-fields/icicle-koalabear/Cargo.toml | 4 +- .../icicle-fields/icicle-koalabear/build.rs | 5 ++ .../rust/icicle-fields/icicle-m31/Cargo.toml | 2 + .../rust/icicle-fields/icicle-m31/build.rs | 5 ++ .../icicle-fields/icicle-stark252/Cargo.toml | 4 +- wrappers/rust/icicle-hash/Cargo.toml | 4 +- wrappers/rust/icicle-hash/build.rs | 5 ++ wrappers/rust/icicle-runtime/Cargo.toml | 2 + wrappers/rust/icicle-runtime/build.rs | 5 ++ 32 files changed, 341 insertions(+), 121 deletions(-) create mode 100644 icicle/cmake/backend_include.cmake create mode 100755 scripts/pull_vulkan_backend.sh diff --git a/examples/c++/examples_utils.h b/examples/c++/examples_utils.h index 65abf3bebb..59142ddf17 100644 --- a/examples/c++/examples_utils.h +++ b/examples/c++/examples_utils.h @@ -29,5 +29,5 @@ void try_load_and_set_backend_device(int argc = 0, char** argv = nullptr) 
return; } - ICICLE_LOG_INFO << "Device " << selected_device << " is not available, falling back to CPU"; + ICICLE_LOG_ERROR << "Device " << selected_device << " is not available, falling back to CPU"; } \ No newline at end of file diff --git a/examples/rust/arkworks-icicle-conversions/Cargo.toml b/examples/rust/arkworks-icicle-conversions/Cargo.toml index f054855c44..84fa7ffdc7 100644 --- a/examples/rust/arkworks-icicle-conversions/Cargo.toml +++ b/examples/rust/arkworks-icicle-conversions/Cargo.toml @@ -5,24 +5,33 @@ edition = "2018" [dependencies] icicle-runtime = { path = "../../../wrappers/rust/icicle-runtime" } -icicle-core = { path = "../../../wrappers/rust/icicle-core"} +icicle-core = { path = "../../../wrappers/rust/icicle-core" } icicle-bn254 = { path = "../../../wrappers/rust/icicle-curves/icicle-bn254" } -ark-bn254 = { version = "0.4.0"} -ark-ff = { version = "0.4.0" , features=["parallel"]} -ark-ec = { version = "0.4.0" , features=["parallel"]} +ark-bn254 = { version = "0.4.0" } +ark-ff = { version = "0.4.0", features = ["parallel"] } +ark-ec = { version = "0.4.0", features = ["parallel"] } rand = "0.8" rayon = "1.5" clap = { version = "<=4.4.12", features = ["derive"] } [features] -cuda = ["icicle-runtime/cuda_backend", +cuda = [ + "icicle-runtime/cuda_backend", "icicle-bn254/cuda_backend", "icicle-bn254/no_ecntt", "icicle-bn254/no_g2", ] -metal = ["icicle-runtime/metal_backend", +metal = [ + "icicle-runtime/metal_backend", "icicle-bn254/metal_backend", "icicle-bn254/no_ecntt", "icicle-bn254/no_g2", -] \ No newline at end of file +] + +vulkan = [ + "icicle-runtime/vulkan_backend", + "icicle-bn254/vulkan_backend", + "icicle-bn254/no_ecntt", + "icicle-bn254/no_g2", +] diff --git a/examples/rust/hash-and-merkle/Cargo.toml b/examples/rust/hash-and-merkle/Cargo.toml index 95045b11de..3e0e9f0dde 100644 --- a/examples/rust/hash-and-merkle/Cargo.toml +++ b/examples/rust/hash-and-merkle/Cargo.toml @@ -4,20 +4,28 @@ version = "0.1.0" edition = "2018" 
[dependencies] -hex = "0.4" +hex = "0.4" clap = { version = "<=4.4.12", features = ["derive"] } -icicle-core = {path = "../../../wrappers/rust/icicle-core" } -icicle-runtime = {path = "../../../wrappers/rust/icicle-runtime" } -icicle-hash = {path = "../../../wrappers/rust/icicle-hash" } -icicle-babybear = {path = "../../../wrappers/rust/icicle-fields/icicle-babybear" } +icicle-core = { path = "../../../wrappers/rust/icicle-core" } +icicle-runtime = { path = "../../../wrappers/rust/icicle-runtime" } +icicle-hash = { path = "../../../wrappers/rust/icicle-hash" } +icicle-babybear = { path = "../../../wrappers/rust/icicle-fields/icicle-babybear" } [features] -cuda = ["icicle-runtime/cuda_backend", +cuda = [ + "icicle-runtime/cuda_backend", "icicle-babybear/cuda_backend", - "icicle-hash/cuda_backend", + "icicle-hash/cuda_backend", ] -metal = ["icicle-runtime/metal_backend", +metal = [ + "icicle-runtime/metal_backend", "icicle-babybear/metal_backend", - "icicle-hash/metal_backend", -] \ No newline at end of file + "icicle-hash/metal_backend", +] + +vulkan = [ + "icicle-runtime/vulkan_backend", + "icicle-babybear/vulkan_backend", + "icicle-hash/vulkan_backend", +] diff --git a/examples/rust/msm/Cargo.toml b/examples/rust/msm/Cargo.toml index 29e1cf1add..22a6be4ab4 100644 --- a/examples/rust/msm/Cargo.toml +++ b/examples/rust/msm/Cargo.toml @@ -11,16 +11,26 @@ icicle-bls12-377 = { path = "../../../wrappers/rust/icicle-curves/icicle-bls12-3 clap = { version = "<=4.4.12", features = ["derive"] } [features] -cuda = ["icicle-runtime/cuda_backend", +cuda = [ + "icicle-runtime/cuda_backend", "icicle-bn254/cuda_backend", "icicle-bn254/no_ecntt", "icicle-bls12-377/cuda_backend", - "icicle-bls12-377/no_ecntt" + "icicle-bls12-377/no_ecntt", ] -metal = ["icicle-runtime/metal_backend", +metal = [ + "icicle-runtime/metal_backend", "icicle-bn254/metal_backend", "icicle-bn254/no_ecntt", "icicle-bls12-377/metal_backend", - "icicle-bls12-377/no_ecntt" + "icicle-bls12-377/no_ecntt", +] + 
+vulkan = [ + "icicle-runtime/vulkan_backend", + "icicle-bn254/vulkan_backend", + "icicle-bn254/no_ecntt", + "icicle-bls12-377/vulkan_backend", + "icicle-bls12-377/no_ecntt", ] diff --git a/examples/rust/ntt/Cargo.toml b/examples/rust/ntt/Cargo.toml index 616610f706..97cf3818de 100644 --- a/examples/rust/ntt/Cargo.toml +++ b/examples/rust/ntt/Cargo.toml @@ -12,7 +12,8 @@ icicle-bls12-377 = { path = "../../../wrappers/rust/icicle-curves/icicle-bls12-3 clap = { version = "<=4.4.12", features = ["derive"] } [features] -cuda = ["icicle-runtime/cuda_backend", +cuda = [ + "icicle-runtime/cuda_backend", "icicle-bn254/cuda_backend", "icicle-bn254/no_ecntt", "icicle-bn254/no_g2", @@ -21,7 +22,8 @@ cuda = ["icicle-runtime/cuda_backend", "icicle-bls12-377/no_g2", ] -metal = ["icicle-runtime/metal_backend", +metal = [ + "icicle-runtime/metal_backend", "icicle-bn254/metal_backend", "icicle-bn254/no_ecntt", "icicle-bn254/no_g2", @@ -29,3 +31,13 @@ metal = ["icicle-runtime/metal_backend", "icicle-bls12-377/no_ecntt", "icicle-bls12-377/no_g2", ] + +vulkan = [ + "icicle-runtime/vulkan_backend", + "icicle-bn254/vulkan_backend", + "icicle-bn254/no_ecntt", + "icicle-bn254/no_g2", + "icicle-bls12-377/vulkan_backend", + "icicle-bls12-377/no_ecntt", + "icicle-bls12-377/no_g2", +] diff --git a/examples/rust/polynomials/Cargo.toml b/examples/rust/polynomials/Cargo.toml index 527b0bd267..742dbffac2 100644 --- a/examples/rust/polynomials/Cargo.toml +++ b/examples/rust/polynomials/Cargo.toml @@ -12,16 +12,26 @@ icicle-babybear = { path = "../../../wrappers/rust/icicle-fields/icicle-babybear clap = { version = "<=4.4.12", features = ["derive"] } [features] -cuda = ["icicle-runtime/cuda_backend", +cuda = [ + "icicle-runtime/cuda_backend", "icicle-bn254/cuda_backend", - "icicle-bn254/no_ecntt", - "icicle-bn254/no_g2", + "icicle-bn254/no_ecntt", + "icicle-bn254/no_g2", "icicle-babybear/cuda_backend", ] -metal = ["icicle-runtime/metal_backend", + +metal = [ + "icicle-runtime/metal_backend", 
"icicle-bn254/metal_backend", - "icicle-bn254/no_ecntt", - "icicle-bn254/no_g2", + "icicle-bn254/no_ecntt", + "icicle-bn254/no_g2", "icicle-babybear/metal_backend", ] +vulkan = [ + "icicle-runtime/vulkan_backend", + "icicle-bn254/vulkan_backend", + "icicle-bn254/no_ecntt", + "icicle-bn254/no_g2", + "icicle-babybear/vulkan_backend", +] diff --git a/examples/rust/poseidon2/Cargo.toml b/examples/rust/poseidon2/Cargo.toml index 9416d34f88..5f07f8df07 100644 --- a/examples/rust/poseidon2/Cargo.toml +++ b/examples/rust/poseidon2/Cargo.toml @@ -4,12 +4,12 @@ version = "0.1.0" edition = "2018" [dependencies] -icicle-core = {path = "../../../wrappers/rust/icicle-core" } -icicle-runtime = {path = "../../../wrappers/rust/icicle-runtime" } -icicle-hash = {path = "../../../wrappers/rust/icicle-hash" } -icicle-babybear = {path = "../../../wrappers/rust/icicle-fields/icicle-babybear" } -icicle-m31 = {path = "../../../wrappers/rust/icicle-fields/icicle-m31" } -hex = "0.4" +icicle-core = { path = "../../../wrappers/rust/icicle-core" } +icicle-runtime = { path = "../../../wrappers/rust/icicle-runtime" } +icicle-hash = { path = "../../../wrappers/rust/icicle-hash" } +icicle-babybear = { path = "../../../wrappers/rust/icicle-fields/icicle-babybear" } +icicle-m31 = { path = "../../../wrappers/rust/icicle-fields/icicle-m31" } +hex = "0.4" rand = "0.8" clap = { version = "<=4.4.12", features = ["derive"] } @@ -20,3 +20,17 @@ cuda = [ "icicle-babybear/cuda_backend", "icicle-m31/cuda_backend", ] + +metal = [ + "icicle-runtime/metal_backend", + "icicle-hash/metal_backend", + "icicle-babybear/metal_backend", + "icicle-m31/metal_backend", +] + +vulkan = [ + "icicle-runtime/vulkan_backend", + "icicle-hash/vulkan_backend", + "icicle-babybear/vulkan_backend", + "icicle-m31/vulkan_backend", +] diff --git a/icicle/CMakeLists.txt b/icicle/CMakeLists.txt index 2a32d70d3e..246215676f 100644 --- a/icicle/CMakeLists.txt +++ b/icicle/CMakeLists.txt @@ -55,11 +55,13 @@ endif() 
set(CMAKE_POSITION_INDEPENDENT_CODE ON) # Build options -option(BUILD_TESTS "Build unit tests. Default=OFF" OFF) +option(BUILD_TESTS "Build unit tests. Default=OFF" OFF) +# Backends: typically CPU is built into the frontend, the rest are DSOs loaded at runtime from installation option(CPU_BACKEND "Build CPU backend. Default=ON" ON) -# TODO Yuval: consider decoupling backends from frontend build. Is that worth the effort? +# To enable building backends, use the following options: (note they are in private repos) option(CUDA_BACKEND "Branch/commit to pull for CUDA backend or `local` if under icicle/backend/cuda. Default=OFF" OFF) option(METAL_BACKEND "Branch/commit to pull for METAL backend or `local` if under icicle/backend/metal. Default=OFF" OFF) +option(VULKAN_BACKEND "Branch/commit to pull for VULKAN backend or `local` if under icicle/backend/vulkan. Default=OFF" OFF) # features that some fields/curves have and some don't. option(NTT "Build NTT" ON) @@ -130,55 +132,8 @@ if (CPU_BACKEND) add_subdirectory(backend/cpu) endif() -if (CUDA_BACKEND) - string(TOLOWER "${CUDA_BACKEND}" CUDA_BACKEND_LOWER) - if (CUDA_BACKEND_LOWER STREQUAL "local") - # CUDA backend is local, no need to pull - message(STATUS "Adding CUDA backend from local path: icicle/backend/cuda") - add_subdirectory(backend/cuda) - - # Set the compile definition for the backend build directory - add_compile_definitions(BACKEND_BUILD_DIR="${CMAKE_BINARY_DIR}/backend") - else() - set(CUDA_BACKEND_URL "git@github.com:ingonyama-zk/icicle-cuda-backend.git") - - include(FetchContent) - message(STATUS "Fetching cuda backend from ${CUDA_BACKEND_URL}:${CUDA_BACKEND}") - FetchContent_Declare( - cuda_backend - GIT_REPOSITORY ${CUDA_BACKEND_URL} - GIT_TAG ${CUDA_BACKEND} - ) - FetchContent_MakeAvailable(cuda_backend) - # Set the compile definition for the backend build directory - add_compile_definitions(BACKEND_BUILD_DIR="${CMAKE_BINARY_DIR}/_deps/cuda_backend-build") - endif() -endif() - -if (METAL_BACKEND) - 
string(TOLOWER "${METAL_BACKEND}" METAL_BACKEND_LOWER) - if (METAL_BACKEND_LOWER STREQUAL "local") - # METAL backend is local, no need to pull - message(STATUS "Adding Metal backend from local path: icicle/backend/metal") - add_subdirectory(backend/metal) - - # Set the compile definition for the backend build directory - add_compile_definitions(BACKEND_BUILD_DIR="${CMAKE_BINARY_DIR}/backend") - else() - set(METAL_BACKEND_URL "git@github.com:ingonyama-zk/icicle-metal-backend.git") - - include(FetchContent) - message(STATUS "Fetching cuda backend from ${METAL_BACKEND_URL}:${METAL_BACKEND}") - FetchContent_Declare( - metal_backend - GIT_REPOSITORY ${METAL_BACKEND_URL} - GIT_TAG ${METAL_BACKEND} - ) - FetchContent_MakeAvailable(metal_backend) - # Set the compile definition for the backend build directory - add_compile_definitions(BACKEND_BUILD_DIR="${CMAKE_BINARY_DIR}/_deps/metal_backend-build") - endif() -endif() +# Include and configure (for build) backends based on the backend options +include(cmake/backend_include.cmake) if (BUILD_TESTS) add_subdirectory(tests) diff --git a/icicle/cmake/backend_include.cmake b/icicle/cmake/backend_include.cmake new file mode 100644 index 0000000000..565c07282e --- /dev/null +++ b/icicle/cmake/backend_include.cmake @@ -0,0 +1,74 @@ +if (CUDA_BACKEND) + string(TOLOWER "${CUDA_BACKEND}" CUDA_BACKEND_LOWER) + if (CUDA_BACKEND_LOWER STREQUAL "local") + # CUDA backend is local, no need to pull + message(STATUS "Adding CUDA backend from local path: icicle/backend/cuda") + add_subdirectory(backend/cuda) + + # Set the compile definition for the backend build directory + add_compile_definitions(BACKEND_BUILD_DIR="${CMAKE_BINARY_DIR}/backend") + else() + set(CUDA_BACKEND_URL "git@github.com:ingonyama-zk/icicle-cuda-backend.git") + + include(FetchContent) + message(STATUS "Fetching cuda backend from ${CUDA_BACKEND_URL}:${CUDA_BACKEND}") + FetchContent_Declare( + cuda_backend + GIT_REPOSITORY ${CUDA_BACKEND_URL} + GIT_TAG ${CUDA_BACKEND} + ) + 
FetchContent_MakeAvailable(cuda_backend) + # Set the compile definition for the backend build directory + add_compile_definitions(BACKEND_BUILD_DIR="${CMAKE_BINARY_DIR}/_deps/cuda_backend-build") + endif() +endif() + +if (METAL_BACKEND) + string(TOLOWER "${METAL_BACKEND}" METAL_BACKEND_LOWER) + if (METAL_BACKEND_LOWER STREQUAL "local") + # METAL backend is local, no need to pull + message(STATUS "Adding Metal backend from local path: icicle/backend/metal") + add_subdirectory(backend/metal) + + # Set the compile definition for the backend build directory + add_compile_definitions(BACKEND_BUILD_DIR="${CMAKE_BINARY_DIR}/backend") + else() + set(METAL_BACKEND_URL "git@github.com:ingonyama-zk/icicle-metal-backend.git") + + include(FetchContent) + message(STATUS "Fetching cuda backend from ${METAL_BACKEND_URL}:${METAL_BACKEND}") + FetchContent_Declare( + metal_backend + GIT_REPOSITORY ${METAL_BACKEND_URL} + GIT_TAG ${METAL_BACKEND} + ) + FetchContent_MakeAvailable(metal_backend) + # Set the compile definition for the backend build directory + add_compile_definitions(BACKEND_BUILD_DIR="${CMAKE_BINARY_DIR}/_deps/metal_backend-build") + endif() +endif() + +if (VULKAN_BACKEND) + string(TOLOWER "${VULKAN_BACKEND}" VULKAN_BACKEND_LOWER) + if (VULKAN_BACKEND_LOWER STREQUAL "local") + # VULKAN backend is local, no need to pull + message(STATUS "Adding Vulkan backend from local path: icicle/backend/vulkan") + add_subdirectory(backend/vulkan) + + # Set the compile definition for the backend build directory + add_compile_definitions(BACKEND_BUILD_DIR="${CMAKE_BINARY_DIR}/backend") + else() + set(VULKAN_BACKEND_URL "git@github.com:ingonyama-zk/icicle-vulkan-backend.git") + + include(FetchContent) + message(STATUS "Fetching cuda backend from ${VULKAN_BACKEND_URL}:${VULKAN_BACKEND}") + FetchContent_Declare( + vulkan_backend + GIT_REPOSITORY ${VULKAN_BACKEND_URL} + GIT_TAG ${VULKAN_BACKEND} + ) + FetchContent_MakeAvailable(vulkan_backend) + # Set the compile definition for the backend 
build directory + add_compile_definitions(BACKEND_BUILD_DIR="${CMAKE_BINARY_DIR}/_deps/vulkan_backend-build") + endif() +endif() \ No newline at end of file diff --git a/icicle/tests/test_device_api.cpp b/icicle/tests/test_device_api.cpp index 8a5ea68874..f6ac8880e6 100644 --- a/icicle/tests/test_device_api.cpp +++ b/icicle/tests/test_device_api.cpp @@ -58,14 +58,17 @@ TEST_F(DeviceApiTest, MemoryCopySync) int output[2] = {0, 0}; icicle::Device dev = {device_type, 0}; - icicle_set_device(dev); - - void* dev_mem = nullptr; - ICICLE_CHECK(icicle_malloc(&dev_mem, sizeof(input))); - ICICLE_CHECK(icicle_copy_to_device(dev_mem, input, sizeof(input))); - ICICLE_CHECK(icicle_copy_to_host(output, dev_mem, sizeof(input))); - ICICLE_CHECK(icicle_free(dev_mem)); - + ICICLE_CHECK(icicle_set_device(dev)); + + void* dev_memA = nullptr; + void* dev_memB = nullptr; + // test copy host->device->device->host + ICICLE_CHECK(icicle_malloc(&dev_memA, sizeof(input))); + ICICLE_CHECK(icicle_malloc(&dev_memB, sizeof(input))); + ICICLE_CHECK(icicle_copy_to_device(dev_memA, input, sizeof(input))); + ICICLE_CHECK(icicle_copy(dev_memB, dev_memA, sizeof(input))); + ICICLE_CHECK(icicle_copy_to_host(output, dev_memB, sizeof(input))); + ICICLE_CHECK(icicle_free(dev_memB)); ASSERT_EQ(0, memcmp(input, output, sizeof(input))); } } @@ -73,22 +76,22 @@ TEST_F(DeviceApiTest, MemoryCopySyncWithOffset) { int input[4] = {1, 2, 3, 4}; + int expected[4] = {3, 4, 1, 2}; for (const auto& device_type : s_registered_devices) { - int output[2] = {0, 0}; + int output[4] = {0, 0, 0, 0}; icicle::Device dev = {device_type, 0}; - icicle_set_device(dev); + ICICLE_CHECK(icicle_set_device(dev)); int* dev_mem = nullptr; - ICICLE_CHECK( - icicle_malloc((void**)&dev_mem, sizeof(input))); // allocating larger memory to have offset on this buffer to copy - // 2 values from offset (that is copy {2,3} only) - ICICLE_CHECK(icicle_copy_to_device(dev_mem + 1, input + 1, sizeof(output))); - 
ICICLE_CHECK(icicle_copy_to_host(output, dev_mem + 1, sizeof(output))); + ICICLE_CHECK(icicle_malloc((void**)&dev_mem, 4 * sizeof(int))); + ICICLE_CHECK(icicle_copy_to_device(dev_mem, input + 2, 2 * sizeof(int))); + ICICLE_CHECK(icicle_copy_to_device(dev_mem + 2, input, 2 * sizeof(int))); + ICICLE_CHECK(icicle_copy_to_host(output, dev_mem, 4 * sizeof(int))); ICICLE_CHECK(icicle_free(dev_mem)); - ASSERT_EQ(0, memcmp(input + 1, output, sizeof(output))); + ASSERT_EQ(0, memcmp(expected, output, 4 * sizeof(int))); } } @@ -99,7 +102,7 @@ TEST_F(DeviceApiTest, MemoryCopyAsync) int output[2] = {0, 0}; icicle::Device dev = {device_type, 0}; - icicle_set_device(dev); + ICICLE_CHECK(icicle_set_device(dev)); void* dev_mem = nullptr; icicleStreamHandle stream; @@ -121,7 +124,7 @@ TEST_F(DeviceApiTest, CopyDeviceInference) int output[2] = {0, 0}; icicle::Device dev = {device_type, 0}; - icicle_set_device(dev); + ICICLE_CHECK(icicle_set_device(dev)); void* dev_mem = nullptr; ICICLE_CHECK(icicle_malloc(&dev_mem, sizeof(input))); @@ -132,15 +135,14 @@ TEST_F(DeviceApiTest, CopyDeviceInference) } } -TEST_F(DeviceApiTest, Memest) +TEST_F(DeviceApiTest, Memset) { char expected[2] = {1, 2}; for (const auto& device_type : s_registered_devices) { char host_mem[2] = {0, 0}; - // icicle::Device dev = {device_type, 0}; - icicle::Device dev = {"CPU", 0}; - icicle_set_device(dev); + icicle::Device dev = {device_type, 0}; + ICICLE_CHECK(icicle_set_device(dev)); char* dev_mem = nullptr; ICICLE_CHECK(icicle_malloc((void**)&dev_mem, sizeof(host_mem))); @@ -156,7 +158,7 @@ TEST_F(DeviceApiTest, ApiError) { for (const auto& device_type : s_registered_devices) { icicle::Device dev = {device_type, 0}; - icicle_set_device(dev); + ICICLE_CHECK(icicle_set_device(dev)); void* dev_mem = nullptr; EXPECT_ANY_THROW(ICICLE_CHECK(icicle_malloc(&dev_mem, -1))); } @@ -168,7 +170,7 @@ TEST_F(DeviceApiTest, AvailableMemory) const bool is_cuda_registered = eIcicleError::SUCCESS == icicle_is_device_available(dev); 
if (!is_cuda_registered) { GTEST_SKIP(); } // most devices do not support this - icicle_set_device(dev); + ICICLE_CHECK(icicle_set_device(dev)); size_t total, free; ASSERT_EQ(eIcicleError::SUCCESS, icicle_get_available_memory(total, free)); @@ -190,13 +192,13 @@ TEST_F(DeviceApiTest, memoryTracker) { // need two devices for this test if (s_registered_devices.size() == 1) { return; } - const int NOF_ALLOCS = 1000; + const int NOF_ALLOCS = 200; // Note that some backends have a bound (typically 256) on allocations const int ALLOC_SIZE = 1 << 20; MemoryTracker tracker{}; ICICLE_ASSERT(s_main_device != UNKOWN_DEVICE) << "memoryTracker test assumes more than one device"; Device main_device = {s_main_device, 0}; - icicle_set_device(main_device); + ICICLE_CHECK(icicle_set_device(main_device)); std::vector allocated_addresses(NOF_ALLOCS, nullptr); @@ -226,12 +228,29 @@ TEST_F(DeviceApiTest, memoryTracker) ASSERT_EQ(eIcicleError::INVALID_POINTER, icicle_is_active_device_memory(host_mem.get())); // test that we still identify correctly after switching device - icicle_set_device({"CPU", 0}); + ICICLE_CHECK(icicle_set_device({"CPU", 0})); const void* addr = (void*)((size_t)*allocated_addresses.begin() + rand_uint_32b(0, RAND_MAX) % ALLOC_SIZE); ASSERT_EQ(eIcicleError::INVALID_POINTER, icicle_is_active_device_memory(addr)); ASSERT_EQ(eIcicleError::INVALID_POINTER, icicle_is_active_device_memory(host_mem.get())); auto it = tracker.identify(addr); ASSERT_EQ(*it->first, main_device); + + ICICLE_CHECK(icicle_set_device(main_device)); + START_TIMER(remove); + for (auto& it : allocated_addresses) { + tracker.remove_allocation(it); + } + END_TIMER_AVERAGE(remove, "memory-tracker: remove average", true, NOF_ALLOCS); + + START_TIMER(free); + for (auto& it : allocated_addresses) { + icicle_free(it); + } + END_TIMER_AVERAGE(free, "memory-tracker: free average", true, NOF_ALLOCS); + + void* mem; + ICICLE_CHECK(icicle_malloc(&mem, ALLOC_SIZE)); + ICICLE_CHECK(icicle_free(mem)); } int 
main(int argc, char** argv) diff --git a/scripts/pull_metal_backend.sh b/scripts/pull_metal_backend.sh index 7c5bb69f92..7e6c26aa0f 100755 --- a/scripts/pull_metal_backend.sh +++ b/scripts/pull_metal_backend.sh @@ -14,8 +14,6 @@ ABS_METAL_DIR=$(realpath ${BACKEND_DIR})/metal echo "Trying to pull Metal backend commit '${METAL_BACKEND}' to '${ABS_METAL_DIR}'" -exit 1 - if [ -d "${ABS_METAL_DIR}" ] && [ "$(ls -A ${ABS_METAL_DIR})" ]; then echo "Directory ${ABS_METAL_DIR} is not empty." read -p "Do you want to proceed with fetching and resetting? (y/n): " response diff --git a/scripts/pull_vulkan_backend.sh b/scripts/pull_vulkan_backend.sh new file mode 100755 index 0000000000..45ba5efc6f --- /dev/null +++ b/scripts/pull_vulkan_backend.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +VULKAN_BACKEND=${1:-main} +BACKEND_DIR=${2:-./icicle/backend} + +# Check if BACKEND_DIR exists +if [ ! -d "${BACKEND_DIR}" ]; then + echo "Error: Directory '${BACKEND_DIR}' does not exist." + exit 1 +fi + +# Get the absolute path of the backend directory +ABS_VULKAN_DIR=$(realpath ${BACKEND_DIR})/vulkan + +echo "Trying to pull vulkan backend commit '${VULKAN_BACKEND}' to '${ABS_VULKAN_DIR}'" + +if [ -d "${ABS_VULKAN_DIR}" ] && [ "$(ls -A ${ABS_VULKAN_DIR})" ]; then + echo "Directory ${ABS_VULKAN_DIR} is not empty." + read -p "Do you want to proceed with fetching and resetting? (y/n): " response + case "$response" in + [Yy]* ) + echo "Proceeding with fetch and reset..." + cd ${ABS_VULKAN_DIR} + git fetch origin + git reset --hard origin/${VULKAN_BACKEND} + ;; + [Nn]* ) + echo "Aborting." + exit 1 + ;; + * ) + echo "Invalid input. Aborting." + exit 1 + ;; + esac +else + echo "Directory ${ABS_VULKAN_DIR} is empty or does not exist. Cloning..." 
+ mkdir -p ${ABS_VULKAN_DIR} + cd ${ABS_VULKAN_DIR} + git clone git@github.com:ingonyama-zk/icicle-vulkan-backend.git ${ABS_VULKAN_DIR} + git checkout ${VULKAN_BACKEND} +fi \ No newline at end of file diff --git a/wrappers/rust/icicle-curves/icicle-bls12-377/Cargo.toml b/wrappers/rust/icicle-curves/icicle-bls12-377/Cargo.toml index bf9c07bcb3..544d2bf0b3 100644 --- a/wrappers/rust/icicle-curves/icicle-bls12-377/Cargo.toml +++ b/wrappers/rust/icicle-curves/icicle-bls12-377/Cargo.toml @@ -29,6 +29,8 @@ cuda_backend = ["icicle-runtime/cuda_backend"] pull_cuda_backend = ["icicle-runtime/pull_cuda_backend"] metal_backend = ["icicle-runtime/metal_backend"] pull_metal_backend = ["icicle-runtime/pull_metal_backend"] +vulkan_backend = ["icicle-runtime/vulkan_backend"] +pull_vulkan_backend = ["icicle-runtime/pull_vulkan_backend"] [[bench]] @@ -42,4 +44,3 @@ harness = false [[bench]] name = "ecntt" harness = false - diff --git a/wrappers/rust/icicle-curves/icicle-bls12-377/build.rs b/wrappers/rust/icicle-curves/icicle-bls12-377/build.rs index d881fa7ca4..19f8257fe5 100644 --- a/wrappers/rust/icicle-curves/icicle-bls12-377/build.rs +++ b/wrappers/rust/icicle-curves/icicle-bls12-377/build.rs @@ -41,6 +41,11 @@ fn main() { } else if cfg!(feature = "pull_metal_backend") { config.define("METAL_BACKEND", "main"); } + if cfg!(feature = "vulkan_backend") { + config.define("VULKAN_BACKEND", "local"); + } else if cfg!(feature = "pull_vulkan_backend") { + config.define("VULKAN_BACKEND", "main"); + } // Optional Features that are default ON (so that default matches any backend) if cfg!(feature = "no_g2") { config.define("G2", "OFF"); diff --git a/wrappers/rust/icicle-curves/icicle-bls12-381/Cargo.toml b/wrappers/rust/icicle-curves/icicle-bls12-381/Cargo.toml index 30d9089a3f..c539209ca7 100644 --- a/wrappers/rust/icicle-curves/icicle-bls12-381/Cargo.toml +++ b/wrappers/rust/icicle-curves/icicle-bls12-381/Cargo.toml @@ -27,6 +27,8 @@ cuda_backend = ["icicle-runtime/cuda_backend"] 
pull_cuda_backend = ["icicle-runtime/pull_cuda_backend"] metal_backend = ["icicle-runtime/metal_backend"] pull_metal_backend = ["icicle-runtime/pull_metal_backend"] +vulkan_backend = ["icicle-runtime/vulkan_backend"] +pull_vulkan_backend = ["icicle-runtime/pull_vulkan_backend"] [[bench]] name = "ntt" diff --git a/wrappers/rust/icicle-curves/icicle-bls12-381/build.rs b/wrappers/rust/icicle-curves/icicle-bls12-381/build.rs index 87ace38e0a..ddec7268c8 100644 --- a/wrappers/rust/icicle-curves/icicle-bls12-381/build.rs +++ b/wrappers/rust/icicle-curves/icicle-bls12-381/build.rs @@ -41,6 +41,11 @@ fn main() { } else if cfg!(feature = "pull_metal_backend") { config.define("METAL_BACKEND", "main"); } + if cfg!(feature = "vulkan_backend") { + config.define("VULKAN_BACKEND", "local"); + } else if cfg!(feature = "pull_vulkan_backend") { + config.define("VULKAN_BACKEND", "main"); + } // Optional Features that are default ON (so that default matches any backend) if cfg!(feature = "no_g2") { config.define("G2", "OFF"); diff --git a/wrappers/rust/icicle-curves/icicle-bn254/Cargo.toml b/wrappers/rust/icicle-curves/icicle-bn254/Cargo.toml index 74b6d9843b..257570af18 100644 --- a/wrappers/rust/icicle-curves/icicle-bn254/Cargo.toml +++ b/wrappers/rust/icicle-curves/icicle-bn254/Cargo.toml @@ -27,6 +27,8 @@ cuda_backend = ["icicle-runtime/cuda_backend"] pull_cuda_backend = ["icicle-runtime/pull_cuda_backend"] metal_backend = ["icicle-runtime/metal_backend"] pull_metal_backend = ["icicle-runtime/pull_metal_backend"] +vulkan_backend = ["icicle-runtime/vulkan_backend"] +pull_vulkan_backend = ["icicle-runtime/pull_vulkan_backend"] [[bench]] name = "ntt" diff --git a/wrappers/rust/icicle-curves/icicle-bn254/build.rs b/wrappers/rust/icicle-curves/icicle-bn254/build.rs index 1d1d98b2ef..00bf3aafb7 100644 --- a/wrappers/rust/icicle-curves/icicle-bn254/build.rs +++ b/wrappers/rust/icicle-curves/icicle-bn254/build.rs @@ -40,6 +40,11 @@ fn main() { } else if cfg!(feature = 
"pull_metal_backend") { config.define("METAL_BACKEND", "main"); } + if cfg!(feature = "vulkan_backend") { + config.define("VULKAN_BACKEND", "local"); + } else if cfg!(feature = "pull_vulkan_backend") { + config.define("VULKAN_BACKEND", "main"); + } // Optional Features that are default ON (so that default matches any backend) if cfg!(feature = "no_g2") { config.define("G2", "OFF"); diff --git a/wrappers/rust/icicle-curves/icicle-bw6-761/build.rs b/wrappers/rust/icicle-curves/icicle-bw6-761/build.rs index 319ea57e3d..fdebc5cc6a 100644 --- a/wrappers/rust/icicle-curves/icicle-bw6-761/build.rs +++ b/wrappers/rust/icicle-curves/icicle-bw6-761/build.rs @@ -40,6 +40,11 @@ fn main() { } else if cfg!(feature = "pull_metal_backend") { config.define("METAL_BACKEND", "main"); } + if cfg!(feature = "vulkan_backend") { + config.define("VULKAN_BACKEND", "local"); + } else if cfg!(feature = "pull_vulkan_backend") { + config.define("VULKAN_BACKEND", "main"); + } // Build let _ = config diff --git a/wrappers/rust/icicle-curves/icicle-grumpkin/Cargo.toml b/wrappers/rust/icicle-curves/icicle-grumpkin/Cargo.toml index 590e1ac750..f2ee067da6 100644 --- a/wrappers/rust/icicle-curves/icicle-grumpkin/Cargo.toml +++ b/wrappers/rust/icicle-curves/icicle-grumpkin/Cargo.toml @@ -25,6 +25,8 @@ cuda_backend = ["icicle-runtime/cuda_backend"] pull_cuda_backend = ["icicle-runtime/pull_cuda_backend"] metal_backend = ["icicle-runtime/metal_backend"] pull_metal_backend = ["icicle-runtime/pull_metal_backend"] +vulkan_backend = ["icicle-runtime/vulkan_backend"] +pull_vulkan_backend = ["icicle-runtime/pull_vulkan_backend"] [[bench]] diff --git a/wrappers/rust/icicle-curves/icicle-grumpkin/build.rs b/wrappers/rust/icicle-curves/icicle-grumpkin/build.rs index d52afa259b..9ec47ecc6b 100644 --- a/wrappers/rust/icicle-curves/icicle-grumpkin/build.rs +++ b/wrappers/rust/icicle-curves/icicle-grumpkin/build.rs @@ -40,6 +40,11 @@ fn main() { } else if cfg!(feature = "pull_metal_backend") { 
config.define("METAL_BACKEND", "main"); } + if cfg!(feature = "vulkan_backend") { + config.define("VULKAN_BACKEND", "local"); + } else if cfg!(feature = "pull_vulkan_backend") { + config.define("VULKAN_BACKEND", "main"); + } // Build let _ = config diff --git a/wrappers/rust/icicle-fields/icicle-babybear/Cargo.toml b/wrappers/rust/icicle-fields/icicle-babybear/Cargo.toml index e9d222324e..02a6689550 100644 --- a/wrappers/rust/icicle-fields/icicle-babybear/Cargo.toml +++ b/wrappers/rust/icicle-fields/icicle-babybear/Cargo.toml @@ -9,7 +9,7 @@ repository.workspace = true # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -icicle-core = {workspace = true } +icicle-core = { workspace = true } icicle-runtime = { workspace = true } icicle-hash = { workspace = true } @@ -27,6 +27,8 @@ cuda_backend = ["icicle-runtime/cuda_backend"] pull_cuda_backend = ["icicle-runtime/pull_cuda_backend"] metal_backend = ["icicle-runtime/metal_backend"] pull_metal_backend = ["icicle-runtime/pull_metal_backend"] +vulkan_backend = ["icicle-runtime/vulkan_backend"] +pull_vulkan_backend = ["icicle-runtime/pull_vulkan_backend"] [[bench]] name = "ntt" diff --git a/wrappers/rust/icicle-fields/icicle-babybear/build.rs b/wrappers/rust/icicle-fields/icicle-babybear/build.rs index 763ad02698..f1412b4d3f 100644 --- a/wrappers/rust/icicle-fields/icicle-babybear/build.rs +++ b/wrappers/rust/icicle-fields/icicle-babybear/build.rs @@ -40,6 +40,11 @@ fn main() { } else if cfg!(feature = "pull_metal_backend") { config.define("METAL_BACKEND", "main"); } + if cfg!(feature = "vulkan_backend") { + config.define("VULKAN_BACKEND", "local"); + } else if cfg!(feature = "pull_vulkan_backend") { + config.define("VULKAN_BACKEND", "main"); + } // Build let _ = config diff --git a/wrappers/rust/icicle-fields/icicle-koalabear/Cargo.toml b/wrappers/rust/icicle-fields/icicle-koalabear/Cargo.toml index 585031b61f..bff263dedd 100644 --- 
a/wrappers/rust/icicle-fields/icicle-koalabear/Cargo.toml +++ b/wrappers/rust/icicle-fields/icicle-koalabear/Cargo.toml @@ -9,7 +9,7 @@ repository.workspace = true # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -icicle-core = {workspace = true } +icicle-core = { workspace = true } icicle-runtime = { workspace = true } icicle-hash = { workspace = true } @@ -27,6 +27,8 @@ cuda_backend = ["icicle-runtime/cuda_backend"] pull_cuda_backend = ["icicle-runtime/pull_cuda_backend"] metal_backend = ["icicle-runtime/metal_backend"] pull_metal_backend = ["icicle-runtime/pull_metal_backend"] +vulkan_backend = ["icicle-runtime/vulkan_backend"] +pull_vulkan_backend = ["icicle-runtime/pull_vulkan_backend"] [[bench]] name = "ntt" diff --git a/wrappers/rust/icicle-fields/icicle-koalabear/build.rs b/wrappers/rust/icicle-fields/icicle-koalabear/build.rs index 59d0e9982f..2a0611ade5 100644 --- a/wrappers/rust/icicle-fields/icicle-koalabear/build.rs +++ b/wrappers/rust/icicle-fields/icicle-koalabear/build.rs @@ -40,6 +40,11 @@ fn main() { } else if cfg!(feature = "pull_metal_backend") { config.define("METAL_BACKEND", "main"); } + if cfg!(feature = "vulkan_backend") { + config.define("VULKAN_BACKEND", "local"); + } else if cfg!(feature = "pull_vulkan_backend") { + config.define("VULKAN_BACKEND", "main"); + } // Build let _ = config diff --git a/wrappers/rust/icicle-fields/icicle-m31/Cargo.toml b/wrappers/rust/icicle-fields/icicle-m31/Cargo.toml index 35011a4984..2c3f804166 100644 --- a/wrappers/rust/icicle-fields/icicle-m31/Cargo.toml +++ b/wrappers/rust/icicle-fields/icicle-m31/Cargo.toml @@ -20,3 +20,5 @@ cuda_backend = ["icicle-runtime/cuda_backend"] pull_cuda_backend = ["icicle-runtime/pull_cuda_backend"] metal_backend = ["icicle-runtime/metal_backend"] pull_metal_backend = ["icicle-runtime/pull_metal_backend"] +vulkan_backend = ["icicle-runtime/vulkan_backend"] +pull_vulkan_backend = 
["icicle-runtime/pull_vulkan_backend"] diff --git a/wrappers/rust/icicle-fields/icicle-m31/build.rs b/wrappers/rust/icicle-fields/icicle-m31/build.rs index c25209b290..dfd0f58e07 100644 --- a/wrappers/rust/icicle-fields/icicle-m31/build.rs +++ b/wrappers/rust/icicle-fields/icicle-m31/build.rs @@ -40,6 +40,11 @@ fn main() { } else if cfg!(feature = "pull_metal_backend") { config.define("METAL_BACKEND", "main"); } + if cfg!(feature = "vulkan_backend") { + config.define("VULKAN_BACKEND", "local"); + } else if cfg!(feature = "pull_vulkan_backend") { + config.define("VULKAN_BACKEND", "main"); + } // Build let _ = config diff --git a/wrappers/rust/icicle-fields/icicle-stark252/Cargo.toml b/wrappers/rust/icicle-fields/icicle-stark252/Cargo.toml index 8346d7e939..f57c8037ef 100644 --- a/wrappers/rust/icicle-fields/icicle-stark252/Cargo.toml +++ b/wrappers/rust/icicle-fields/icicle-stark252/Cargo.toml @@ -9,7 +9,7 @@ repository.workspace = true # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -icicle-core = {workspace = true } +icicle-core = { workspace = true } icicle-runtime = { workspace = true } icicle-hash = { workspace = true } @@ -26,6 +26,8 @@ cuda_backend = ["icicle-runtime/cuda_backend"] pull_cuda_backend = ["icicle-runtime/pull_cuda_backend"] metal_backend = ["icicle-runtime/metal_backend"] pull_metal_backend = ["icicle-runtime/pull_metal_backend"] +vulkan_backend = ["icicle-runtime/vulkan_backend"] +pull_vulkan_backend = ["icicle-runtime/pull_vulkan_backend"] [[bench]] name = "ntt" diff --git a/wrappers/rust/icicle-hash/Cargo.toml b/wrappers/rust/icicle-hash/Cargo.toml index 7cc89ba801..fad2cd3aaf 100644 --- a/wrappers/rust/icicle-hash/Cargo.toml +++ b/wrappers/rust/icicle-hash/Cargo.toml @@ -7,7 +7,7 @@ homepage.workspace = true repository.workspace = true [dependencies] -icicle-core = {workspace = true } +icicle-core = { workspace = true } icicle-runtime = { workspace = true } rand = "0.8" @@ -19,3 
+19,5 @@ cuda_backend = ["icicle-runtime/cuda_backend"] pull_cuda_backend = ["icicle-runtime/pull_cuda_backend"] metal_backend = ["icicle-runtime/metal_backend"] pull_metal_backend = ["icicle-runtime/pull_metal_backend"] +vulkan_backend = ["icicle-runtime/vulkan_backend"] +pull_vulkan_backend = ["icicle-runtime/pull_vulkan_backend"] diff --git a/wrappers/rust/icicle-hash/build.rs b/wrappers/rust/icicle-hash/build.rs index 5658fe3c1c..2c997e76b6 100644 --- a/wrappers/rust/icicle-hash/build.rs +++ b/wrappers/rust/icicle-hash/build.rs @@ -38,6 +38,11 @@ fn main() { } else if cfg!(feature = "pull_metal_backend") { config.define("METAL_BACKEND", "main"); } + if cfg!(feature = "vulkan_backend") { + config.define("VULKAN_BACKEND", "local"); + } else if cfg!(feature = "pull_vulkan_backend") { + config.define("VULKAN_BACKEND", "main"); + } // Build let _ = config diff --git a/wrappers/rust/icicle-runtime/Cargo.toml b/wrappers/rust/icicle-runtime/Cargo.toml index 1ae0f72c78..e5c9b568cc 100644 --- a/wrappers/rust/icicle-runtime/Cargo.toml +++ b/wrappers/rust/icicle-runtime/Cargo.toml @@ -22,3 +22,5 @@ cuda_backend = [] pull_cuda_backend = [] metal_backend = [] pull_metal_backend = [] +vulkan_backend = [] +pull_vulkan_backend = [] diff --git a/wrappers/rust/icicle-runtime/build.rs b/wrappers/rust/icicle-runtime/build.rs index c93c9ed946..6c7613001f 100644 --- a/wrappers/rust/icicle-runtime/build.rs +++ b/wrappers/rust/icicle-runtime/build.rs @@ -38,6 +38,11 @@ fn main() { } else if cfg!(feature = "pull_metal_backend") { config.define("METAL_BACKEND", "main"); } + if cfg!(feature = "vulkan_backend") { + config.define("VULKAN_BACKEND", "local"); + } else if cfg!(feature = "pull_vulkan_backend") { + config.define("VULKAN_BACKEND", "main"); + } // Build let _ = config From 95c33e4d8d351d63bc88a7e0379bd30e7c127b37 Mon Sep 17 00:00:00 2001 From: Miki <100796045+mickeyasa@users.noreply.github.com> Date: Wed, 8 Jan 2025 12:59:20 +0200 Subject: [PATCH 05/12] Redesign CPU MSM with 
taskflow (#718) Accelerating MSM for CPU using taskflow library to distribute threads. --- icicle/backend/cpu/CMakeLists.txt | 9 +- icicle/backend/cpu/src/curve/cpu_msm.hpp | 945 +++++++---------------- 2 files changed, 296 insertions(+), 658 deletions(-) diff --git a/icicle/backend/cpu/CMakeLists.txt b/icicle/backend/cpu/CMakeLists.txt index 6aab973296..75a35f1e84 100644 --- a/icicle/backend/cpu/CMakeLists.txt +++ b/icicle/backend/cpu/CMakeLists.txt @@ -1,7 +1,12 @@ cmake_minimum_required(VERSION 3.18) message(STATUS "Fetching Taskflow v3.8.0 (CPU backend)") + + include(FetchContent) +# Temporarily redefine message() to suppress output +macro(message) +endmacro() FetchContent_Declare( Taskflow GIT_REPOSITORY https://github.com/taskflow/taskflow.git @@ -20,10 +25,12 @@ FetchContent_MakeAvailable(Taskflow) # Use icicle_device as interface for TaskFlow headers target_include_directories(icicle_device INTERFACE ${Taskflow_SOURCE_DIR}) +# Restore the original message behavior +unset(message) + # CPU backend is built directly into icicle library target_sources(icicle_device PRIVATE src/cpu_device_api.cpp) - # field API library if (FIELD) target_sources(icicle_field PRIVATE diff --git a/icicle/backend/cpu/src/curve/cpu_msm.hpp b/icicle/backend/cpu/src/curve/cpu_msm.hpp index 7cfd1032d2..2f129cd699 100644 --- a/icicle/backend/cpu/src/curve/cpu_msm.hpp +++ b/icicle/backend/cpu/src/curve/cpu_msm.hpp @@ -13,7 +13,7 @@ #include "icicle/curves/projective.h" #include "icicle/curves/curve_config.h" #include "icicle/msm.h" -#include "tasks_manager.h" +#include "taskflow/taskflow.hpp" #include "icicle/backend/msm_config.h" #ifdef MEASURE_MSM_TIMES #include "icicle/utils/timer.hpp" @@ -22,701 +22,344 @@ using namespace icicle; using namespace curve_config; -#define LOG_EC_ADDITIONS_IN_BATCH 2 -#define NOF_EC_ADDITIONS_IN_BATCH (1 << LOG_EC_ADDITIONS_IN_BATCH) - -/** - * @class EcAddTask - * @brief class deriving `TaskBase` to allow use of tasks_manager.h for MSM - * (The task to be 
executed is an elliptic curve addition). - */ template -class EcAddTask : public TaskBase +class Msm { public: - /** - * @brief constructor for the task ensuring points are zeros and other fields are init to invalid values (to avoid - * falsely handling a result at the start). The execution adds points m_point1,m_point2 and stores the result in - * m_point1. - */ - EcAddTask() - : TaskBase(), m_a_points(NOF_EC_ADDITIONS_IN_BATCH, P::zero()), m_b_points(NOF_EC_ADDITIONS_IN_BATCH, P::zero()), - m_b_point_ptrs(NOF_EC_ADDITIONS_IN_BATCH, nullptr), m_b_affine_points(NOF_EC_ADDITIONS_IN_BATCH, A::zero()), - m_return_idx(NOF_EC_ADDITIONS_IN_BATCH, -1), m_opcodes(NOF_EC_ADDITIONS_IN_BATCH, ADD_P1_P2_BY_VALUE), - m_is_line(NOF_EC_ADDITIONS_IN_BATCH, true), m_nof_valid_points(0) + // Constructor + Msm(const int msm_size, const MSMConfig& config) : m_msm_size(msm_size), m_config(config) { - } - - /** - * @brief Function to be executed by the tasks manager. It can be configured according to the value odf - */ - void execute() - { - static int counter = 0; - for (int i = 0; i < m_nof_valid_points; i++) { - switch (m_opcodes[i]) { - case ADD_P1_P2_BY_VALUE: - m_a_points[i] = m_a_points[i] + m_b_points[i]; - continue; - case ADD_P1_AND_P2_POINTER: - m_a_points[i] = m_a_points[i] + *(m_b_point_ptrs[i]); - continue; - case ADD_P1_AND_P2_AFFINE: - m_a_points[i] = m_a_points[i] + m_b_affine_points[i]; - // continue; - } + // TBD: for sall size MSM - prefer double and add + calc_optimal_parameters(); + + // Resize the thread buckets according to optimal parameters + m_workers_buckets.resize(m_nof_workers); + m_workers_buckets_busy.resize(m_nof_workers); + for (int worker_i = 0; worker_i < m_nof_workers; worker_i++) { + m_workers_buckets[worker_i].resize(m_nof_total_buckets); + m_workers_buckets_busy[worker_i].resize(m_nof_total_buckets); } } - /** - * @brief Dispatch task even if batch is not full. 
This function is mostly used in ends of phases where the left - * additions are not enough to fill a batch. - */ - void dispatch_if_not_empty() + // run MSM based on pippenger algorithm + void run_msm(const scalar_t* scalars, const A* bases, P* results) { - if (is_idle() && m_nof_valid_points > 0) { dispatch(); } - } + // TBD: move to threads + init_workers_buckets_busy(); - /** - * @brief Set up phase 1 addition between a new base and the target bucket. - * @param bucket - bucket to be added. It isn't passed as Per as the bucket's value can be changed during - * addition execution. - * @param base - affine base to be added. Passed as a Per as the input is constant and it saves copying. - * @param negate_affine - flag to indicate that the base needs to be subbed instead of added. - * @param is_montgomery - flag to indicate that the base is in Montgomery form and first needs to be converted. - */ - void set_phase1_addition_with_affine(const P& bucket, const A& base, int bucket_idx) - { - m_a_points[m_nof_valid_points] = bucket; - m_b_affine_points[m_nof_valid_points] = base; - m_return_idx[m_nof_valid_points] = bucket_idx; - m_opcodes[m_nof_valid_points] = ADD_P1_AND_P2_AFFINE; + phase1_populate_buckets(scalars, bases); - m_nof_valid_points++; - if (m_nof_valid_points == NOF_EC_ADDITIONS_IN_BATCH) { dispatch(); } - } + phase2_collapse_segments(); - /** - * @brief Set up phase 2 addition by value. It is required for the first set of additions - the first line addition - * of each segment. - * @param line_sum - line sum to be copied because it is also given to triangle sum. - * @param bucket - bucket to be added to the line. - * @param segment_idx - relevant segment to identify the returning result. 
- */ - void set_phase2_addition_by_value(const P& line_sum, const P& bucket, int segment_idx) - { - m_a_points[m_nof_valid_points] = line_sum; - m_b_points[m_nof_valid_points] = bucket; - m_return_idx[m_nof_valid_points] = segment_idx; - m_is_line[m_nof_valid_points] = true; - m_opcodes[m_nof_valid_points] = ADD_P1_P2_BY_VALUE; - - m_nof_valid_points++; - if (m_nof_valid_points == NOF_EC_ADDITIONS_IN_BATCH) { dispatch(); } + // TBD: start the next task in batch in parallel to phase 3 + phase3_final_accumulator(results); } - /** - * @brief Set up line addition for phase 2. - * @param line_sum - line sum to be copied because it is also given to triangle sum. - * @param bucket_ptr - Per to the bucket to be added to the line, passed as a Per to avoid copying. - * @param segment_idx - relevant segment to identify the returning result. - */ - void set_phase2_line_addition(const P& line_sum, P* bucket_ptr, const int segment_idx) + // Calculate the optimal C based on the problem size, config and machine parameters. + static unsigned get_optimal_c(unsigned msm_size, const MSMConfig& config) { - m_a_points[m_nof_valid_points] = line_sum; - m_b_point_ptrs[m_nof_valid_points] = bucket_ptr; - m_return_idx[m_nof_valid_points] = segment_idx; - m_is_line[m_nof_valid_points] = true; - m_opcodes[m_nof_valid_points] = ADD_P1_AND_P2_POINTER; - - m_nof_valid_points++; - if (m_nof_valid_points == NOF_EC_ADDITIONS_IN_BATCH) { dispatch(); } + // TBD: optimize - condsider nof workers, do experiments + int optimal_c = config.c > 0 ? config.c : // c given by config. + std::max((int)(0.7 * std::log2(msm_size * config.precompute_factor)), 8); // Empirical formula + return optimal_c; } - /** - * @brief Set up triangle addition for phase 2. - * No need to specify segment idx as this is only assigned to a task already handling the triangle's segment idx - * (A previous line or triangle sum of this segment). 
- * @param line_sum - line sum to be copied because its value can be updated during the triangle sum execution. - * @param triangle_sum_ptr - Per to the triangle sum, passed as a Per to avoid unnecessary copying. - */ - void set_phase2_triangle_addition(P& line_sum, P* triangle_sum_ptr) - { - m_a_points[m_nof_valid_points] = line_sum; - m_b_point_ptrs[m_nof_valid_points] = triangle_sum_ptr; - m_is_line[m_nof_valid_points] = false; - m_opcodes[m_nof_valid_points] = ADD_P1_AND_P2_POINTER; +private: + // A single bucket data base + // TBD: check adding a pointer to affine to reduce copies + struct Bucket { + P point; + }; - m_nof_valid_points++; - if (m_nof_valid_points == NOF_EC_ADDITIONS_IN_BATCH) { dispatch(); } - } + // A single segment data base + struct Segment { + P line_sum; + P triangle_sum; + }; - /** - * @brief Chain addition used in phase 1 when a collision between a new result and an occupied bucket. - * @param result - the previous EC addition result. - * @param bucket - the bucket value to be added to the existing result in p1. - * @param segment_idx - index of the bucket to return the result to. - */ - void set_phase1_collision_task(const P& result, const P& bucket, const int segment_idx) + // members + tf::Taskflow m_taskflow; // Accumulate tasks + tf::Executor m_executor; // execute all tasks accumulated on multiple threads + const int m_msm_size; // number of scalars in the problem + const MSMConfig& m_config; // extra parameters for the problem + uint32_t m_scalar_size; // the number of bits at the scalar + uint32_t m_c; // the number of bits each bucket module is responsible for + uint32_t m_bm_size; // number of buckets in a single bucket module. + uint32_t m_nof_buckets_module; // number of bucket modules. 
Each BM contains m_bm_size buckets except for the last one + uint64_t m_nof_total_buckets; // total number of buckets across all bucket modules + uint32_t m_precompute_factor; // the number of bases precomputed for each scalar + uint32_t m_segment_size; // segments size for phase 2. + uint32_t m_nof_workers; // number of threads in current machine + + // per worker: + std::vector> m_workers_buckets; // all buckets used by the worker + std::vector> m_workers_buckets_busy; // for each bucket, an indication if it is busy + + std::vector m_segments; // A vector of all segments for phase 2 + + // set the parameters based on the problem size and the machine properties + void calc_optimal_parameters() { - m_a_points[m_nof_valid_points] = result; - m_b_points[m_nof_valid_points] = bucket; - m_return_idx[m_nof_valid_points] = segment_idx; - m_opcodes[m_nof_valid_points] = ADD_P1_P2_BY_VALUE; + // set nof workers + m_nof_workers = std::thread::hardware_concurrency(); + if (m_config.ext && m_config.ext->has(CpuBackendConfig::CPU_NOF_THREADS)) { + m_nof_workers = m_config.ext && m_config.ext->has(CpuBackendConfig::CPU_NOF_THREADS) + ? m_config.ext->get(CpuBackendConfig::CPU_NOF_THREADS) + : // number of threads provided by config + std::thread::hardware_concurrency(); // check machine properties + } + if (m_nof_workers <= 0) { + ICICLE_LOG_WARNING << "Unable to detect number of hardware supported threads - fixing it to 1\n"; + m_nof_workers = 1; + } - m_nof_valid_points++; - if (m_nof_valid_points == NOF_EC_ADDITIONS_IN_BATCH) { dispatch(); } + // phase 1 properties + m_scalar_size = scalar_t::NBITS; // TBD handle this config.bitsize != 0 ? config.bitsize : scalar_t::NBITS; + m_c = get_optimal_c(m_msm_size, m_config); + m_precompute_factor = m_config.precompute_factor; + m_nof_buckets_module = ((m_scalar_size - 1) / (m_config.precompute_factor * m_c)) + 1; + m_bm_size = 1 << (m_c - 1); + const uint64_t last_bm_size = + m_precompute_factor > 1 ? 
m_bm_size : 1 << (m_scalar_size - ((m_nof_buckets_module - 1) * m_c)); + m_nof_total_buckets = (m_nof_buckets_module - 1) * m_bm_size + last_bm_size; + + // phase 2 properties + m_segment_size = + std::min(m_bm_size, (uint32_t)(1 << (uint32_t)(std::log2(m_nof_total_buckets / m_nof_workers) - 4))); + const int nof_segments = (m_nof_total_buckets + m_segment_size - 1) / m_segment_size; + m_segments.resize(nof_segments); } - /** - * @brief Resets task to idle, resetting the valid points counter to 0. - */ - void reset_idle() + // init all the workers buckets to empty + void init_workers_buckets_busy() // TBD move to threads - note that not al threads are working { - m_nof_valid_points = 0; - set_idle(); - } - - int m_nof_valid_points; - - std::vector

m_a_points; // One of the addends that also stores addition results. - - std::vector m_return_idx; // Idx allowing manager to figure out where the result belong to. - std::vector m_is_line; // Indicator for phase 2 sums between line sum and triangle sum. - -private: - // Variations of the second addend which will be used depending on the opcode below - std::vector

m_b_points; - std::vector m_b_affine_points; - std::vector m_b_point_ptrs; - - enum eAddType { ADD_P1_P2_BY_VALUE, ADD_P1_AND_P2_POINTER, ADD_P1_AND_P2_AFFINE }; - std::vector m_opcodes; -}; - -/** - * @class MSM - * @brief class for solving multi-scalar-multiplication on the cpu. - * The class is a template depending on the element relevant to MSM (for instance EC P). - * NOTE The msm runs only if nof_threads * 4 > nof_bms. The user can guarantee the condition by assigning a proper - * precompute factor that will decrease the number of BMs required to calculate the MSM. - */ -template -class Msm -{ -public: - /** - * @brief Constructor for Msm class. - * @param config - msm config. important parameters that are part of the config extension are: . NOTE: ensure c - * doesn't divide the scalar width without a remainder. - * @param c - c separately after cpu_msm handled problematic c values (less than 1 or dividing scalar_t::NBITS - * without remainder) - * @param nof_threads - number of worker threads for EC additions. - */ - Msm(const MSMConfig& config, const int c, const int nof_threads); - - /** - * @brief Destructor for Msm class. - * Ensures phase 3 threads have finished and joins them (other deletion are implemented implicitly). - */ - ~Msm() - { - for (std::thread& phase3_thread : m_phase3_threads) { - phase3_thread.join(); + for (auto& buckets_busy : m_workers_buckets_busy) { + fill(buckets_busy.begin(), buckets_busy.end(), false); } } - /** - * @brief Main function to execute MSM computation. Launches the 3 phases implemented in the functions below - * (accumulation, bm sums, final accumulator). - * @param scalars - Input scalars for MSM. - * @param bases - EC P input, affine representation. - * @param msm_size - Size of the above arrays, as they are given as Pers. - * @param batch_idx - number of current MSM in the batch. - * @param results - Per to P array in which to store the results. NOTE: the user is expected to preallocate - * the results array. 
- */ - void run_msm( - const scalar_t* scalars, const A* bases, const unsigned int msm_size, const unsigned int batch_idx, P* results); - - /** - * @brief Calculate approximate value of c to minimize number of EC additions in the MSM calculation. Having said - * that, the value of c might be suboptimal in the case of physical memory limitations (Due to required memory size - * for the BMs, determined by c). - * @param msm_size - Number of inputs in the MSM calculation. - * @param precompute_factor - Precompute factor determined in the config. - * @return Value of c to minimize EC additions while not filling up the physical memory. - */ - static unsigned get_optimal_c(unsigned msm_size, int precompute_factor) + // execute all tasks in taskfkow, wait for them to complete and clear taskflow. + void run_workers_and_wait() { - // Approximation for optimal c size while ignoring memory limitation. - int optimal_c = precompute_factor > 1 - ? std::max((int)std::log2(msm_size) + (int)std::log2(precompute_factor) - 5, 8) - : std::max((int)std::log2(msm_size) - 5, 8); - - // Get physical memory limitation (by some factor < 1 - chosen to be 3/4) - uint64_t point_size = 3 * scalar_t::NBITS; // NOTE this is valid under the assumption of projective points in BMs - uint64_t _0_75_of_mem_size = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGE_SIZE) * 3 / 4; - uint64_t max_nof_points_in_mem = _0_75_of_mem_size / point_size; - - // Reduce c until it doesn't exceed the memory limitation - int c = optimal_c + 1; - uint64_t total_num_of_points; - do { - c--; - uint64_t num_bms = ((scalar_t::NBITS - 1) / (precompute_factor * c)) + 1; - total_num_of_points = num_bms << (c - 1); - } while (total_num_of_points > max_nof_points_in_mem); - ICICLE_LOG_DEBUG << "Chosen c:\t" << c - << (c < optimal_c ? 
"\t(Not optimal, BMs are memory bound by " - : "\t(Optimal, BMs are smaller than limit of ") - << (_0_75_of_mem_size >> 20) << "MB)"; - - return c; + m_executor.run(m_taskflow).wait(); + m_taskflow.clear(); } -private: - TasksManager> manager; // Tasks manager for multithreading - - const unsigned int m_c; // Pipenger constant - const unsigned int m_num_bkts; // Number of buckets in each bucket module - const unsigned int m_precompute_factor; // multiplication of bases already calculated trading memory for performance - const unsigned int m_num_bms; // Number of bucket modules (windows in Pipenger's algorithm) - const bool m_are_scalars_mont; // Are the input scalars in Montgomery representation - const bool m_are_points_mont; // Are the input points in Montgomery representation - const int m_batch_size; - - EcAddTask* m_curr_task; - - // Phase 1 members - std::vector

m_buckets; // Vector of all buckets required for phase 1 (All bms in order) - std::vector m_bkts_occupancy; // Boolean vector indicating if the corresponding bucket is occupied - - // Phase 2 members - const int m_log_num_segments; - const int m_num_bm_segments; // Number of segments in a BM - to maximize parallel operation from the serial bm sums. - const int m_segment_size; - - /** - * @struct BmSumSegment - * @brief Struct bundling the required data members for each BM segment in phase 2. - */ - struct BmSumSegment { - // Imagining the required BM sum as a right angle triangle - Nth element is a N high column of the element - - // A method to summing the BM serially is starting from the top with a triangle sum up to the current line and a - // line sum which is calculated for each line then added to the triangle. - // Therefore both sums are need as a part of this struct. - P triangle_sum; - P line_sum; - - int m_nof_received_sums = 1; // Counter for numbers of sum received in this row idx - used to determine when a new - // row idx can be dispatched (Due to it depending on the two sums from the prev idx). - int m_idx_in_segment; // Counter counting down how far the current sum is through the segment. - int m_segment_mem_start; // Offset of segment start in memory / vector. - }; - - // Phase 3 members - std::vector m_phase3_threads; - - /** - * @brief Phase 1 (accumulation) of MSM - sorting input points to buckets according to corresponding input scalars. - * @param scalars - scalar input. - * @param bases - EC P input, affine representation (Regardless of the defined P type of the class). - * @param msm_size - Size of the above arrays, as they are given as Pers. - */ - void phase1_bucket_accumulator(const scalar_t* scalars, const A* bases, const unsigned int msm_size); - - /** - * @brief Push addition task during phase 1. 
- * Th function also handles completed addition results while attempting to insert the new addition task (including - * potential collision that requires more urgent ec-addition of the result and the current stored value). - * @param task_bkt_idx - address in m_buckets in which to store the result in the future. - * @param bkt - the P from m_buckets. - * @param base - the P from the input bases - * @param negate_base - flag to signal the task to subtract base instead of adding it. - */ - void phase1_push_addition(const unsigned int task_bkt_idx, const P bkt, const A base); - /** - * @brief Handles the final results of phase 1 (after no other planned additions are required). - * The function also handles the potential collision similarly to push_addition above. - */ - void phase1_wait_for_completion(); - - /** - * @brief Phase 2 of MSM. Function handling the initial parallel part of summing the bucket modules. It splits the - * BMs into segments, summing each separately and passing the segments sum to be handled by the final accumulator. - * @param segments vector to contain segments line and triangle sums for the final accumulator (output of function). - */ - void phase2_bm_sum(std::vector& segments); - - /** - * @brief Setting up phase 2 class members according to phase 1 results. - * NOTE: This setup function also potentially blocks until the previous MSM's phase 3 finishes (Given that in batched - * MSM, phase 3 is initiated on a separate thread while the next MSM begins phase 1). - */ - void phase2_setup(std::vector& segments); - - /** - * @brief Final accumulation required for MSM calculation. the function will either launch a thread to perform the - * calculation (`phase3_tread` function) or by the main thread, depending on the position in the batch (Last msm or - * not). - * @param segments_ptr - pointer to vector containing the segment calculations of phase 2 - * @param idx_in_batch - idx of the current MSM in the batch. 
- * @param result - output, Per to write the MSM result to. Memory for the Per has already been allocated by - * the user. - */ - void phase3_final_accumulator(std::vector& segments, int idx_in_batch, P* result); - - /** - * @brief Phase 3 function to be ran by a separate thread (or main in the end of the run) - i.e. without using the - * tasks manager. It performs the remaining serial addition in each BM and sums them to one final MSM result. - * @param result - Per to write the MSM result to. Memory for the Per has already been allocated by the user. - */ - void phase3_thread(std::vector segments, P* result); - - /** - * @brief Function for resetting class members between batch runs. - */ - void batch_run_reset() { std::fill(m_bkts_occupancy.begin(), m_bkts_occupancy.end(), false); } -}; - -template -Msm::Msm(const MSMConfig& config, const int c, const int nof_threads) - : manager( - nof_threads, - // Minimal number of tasks required in phase 2 - 2 Tasks for each BM - (((scalar_t::NBITS - 1) / (config.precompute_factor * c)) + 1) * 2), - - m_curr_task(nullptr), - - m_c(c), m_num_bkts(1 << (m_c - 1)), m_precompute_factor(config.precompute_factor), - m_num_bms(((scalar_t::NBITS - 1) / (config.precompute_factor * c)) + 1), - m_are_scalars_mont(config.are_scalars_montgomery_form), m_are_points_mont(config.are_points_montgomery_form), - m_batch_size(config.batch_size), - - m_buckets(m_num_bms * m_num_bkts), m_bkts_occupancy(m_num_bms * m_num_bkts, false), + // phase 1: Each worker process a portion of the inputs and populate its buckets + void phase1_populate_buckets(const scalar_t* scalars, const A* bases) + { + // Divide the msm problem to workers + const int worker_msm_size = (m_msm_size + m_nof_workers - 1) / m_nof_workers; // round up + + // Run workers to build their buckets on a subset of the scalars and bases + for (int worker_i = 0; worker_i < m_nof_workers; worker_i++) { + m_taskflow.emplace([=]() { + const int scalars_start_idx = worker_msm_size * worker_i; 
+ const int bases_start_idx = scalars_start_idx * m_precompute_factor; + const int cur_worker_msm_size = std::min(worker_msm_size, m_msm_size - worker_i * worker_msm_size); + worker_run_phase1(worker_i, scalars + scalars_start_idx, bases + bases_start_idx, cur_worker_msm_size); + }); + } - m_log_num_segments(std::max( - (int)std::floor( - std::log2((double)(nof_threads * TASKS_PER_THREAD * NOF_EC_ADDITIONS_IN_BATCH - 1) / (2 * m_num_bms))), - 0)), - m_num_bm_segments(std::min((int)(1 << m_log_num_segments), (int)(m_num_bkts))), - m_segment_size(std::max((int)(m_num_bkts >> m_log_num_segments), 1)), + // TBD build a graph of dependencies for task flow to execute phase 2 + run_workers_and_wait(); - m_phase3_threads(m_batch_size - 1) -{ -} - -template -void Msm::run_msm( - const scalar_t* scalars, const A* bases, const unsigned int msm_size, const unsigned int batch_idx, P* results) -{ -#ifdef MEASURE_MSM_TIMES - Timer tmsm("Total msm time"); - { - Timer tp1("Phase 1"); -#else - { -#endif - phase1_bucket_accumulator(scalars, bases, msm_size); - } - auto segments = std::vector(m_num_bms * m_num_bm_segments); - { -#ifdef MEASURE_MSM_TIMES - Timer tp1("Phase 2"); -#endif - phase2_bm_sum(segments); - } - { -#ifdef MEASURE_MSM_TIMES - Timer tp1("Phase 3"); -#endif - phase3_final_accumulator(segments, batch_idx, results); - if (batch_idx < m_batch_size - 1) { batch_run_reset(); } + // Collapse all the workers buckets into one + collapse_all_workers_buckets(); } -} -template -void Msm::phase1_bucket_accumulator(const scalar_t* scalars, const A* bases, const unsigned int msm_size) -{ - const int coeff_bit_mask_no_sign_bit = m_num_bkts - 1; - const int coeff_bit_mask_with_sign_bit = (1 << m_c) - 1; - // NUmber of windows / additions per scalar in case num_bms * precompute_factor exceed scalar width - const int num_bms_before_precompute = ((scalar_t::NBITS - 1) / m_c) + 1; // +1 for ceiling - int carry = 0; - for (int i = 0; i < msm_size; i++) { - carry = 0; - // Handle 
required preprocess of scalar - scalar_t scalar = m_are_scalars_mont ? scalar_t::from_montgomery(scalars[i]) : scalars[i]; - bool negate_p_and_s = scalar.get_scalar_digit(scalar_t::NBITS - 1, 1) > 0; - if (negate_p_and_s) { scalar = scalar_t::neg(scalar); } - for (int j = 0; j < m_precompute_factor; j++) { - // Handle required preprocess of base P - A base = - m_are_points_mont ? A::from_montgomery(bases[m_precompute_factor * i + j]) : bases[m_precompute_factor * i + j]; - if (base == A::zero()) { continue; } - if (negate_p_and_s) { base = A::neg(base); } - - for (int k = 0; k < m_num_bms; k++) { - // Avoid seg fault in case precompute_factor*c exceeds the scalar width by comparing index with num additions - if (m_num_bms * j + k >= num_bms_before_precompute) { break; } - - uint32_t curr_coeff = scalar.get_scalar_digit(m_num_bms * j + k, m_c) + carry; - int bkt_idx = 0; - // For the edge case of curr_coeff = c (limb=c-1, carry=1) use the sign bit mask - if ((curr_coeff & coeff_bit_mask_with_sign_bit) != 0) { - // Remove sign to infer the bkt idx. 
- carry = curr_coeff > m_num_bkts; - if (!carry) { - bkt_idx = m_num_bkts * k + (curr_coeff & coeff_bit_mask_no_sign_bit); - } else { - bkt_idx = m_num_bkts * k + ((-curr_coeff) & coeff_bit_mask_no_sign_bit); - } + // Each worker run this function and update its buckets - this function needs re writing + void worker_run_phase1(const int worker_idx, const scalar_t* scalars, const A* bases, const int msm_size) + { + // My buckets: + std::vector& buckets = m_workers_buckets[worker_idx]; + std::vector& buckets_busy = m_workers_buckets_busy[worker_idx]; + + const int coeff_bit_mask_no_sign_bit = m_bm_size - 1; + const int coeff_bit_mask_with_sign_bit = (1 << m_c) - 1; + // NUmber of windows / additions per scalar in case num_bms * precompute_factor exceed scalar width + const int num_bms_before_precompute = ((m_scalar_size - 1) / m_c) + 1; // +1 for ceiling + int carry = 0; + for (int i = 0; i < msm_size; i++) { + carry = 0; + // Handle required preprocess of scalar + scalar_t scalar = + m_config.are_scalars_montgomery_form ? scalar_t::from_montgomery(scalars[i]) : scalars[i]; // TBD: avoid copy + bool negate_p_and_s = scalar.get_scalar_digit(m_scalar_size - 1, 1) > 0; + if (negate_p_and_s) { scalar = scalar_t::neg(scalar); } // TBD: inplace + + for (int j = 0; j < m_precompute_factor; j++) { + // Handle required preprocess of base P + A base = m_config.are_points_montgomery_form ? A::from_montgomery(bases[m_precompute_factor * i + j]) + : bases[m_precompute_factor * i + j]; // TDB: avoid copy + if (base == A::zero()) { continue; } // TBD: why is that? can be done more efficiently? 
+ A base_neg = A::neg(base); + + for (int bm_i = 0; bm_i < m_nof_buckets_module; bm_i++) { + // Avoid seg fault in case precompute_factor*c exceeds the scalar width by comparing index with num additions + if (m_nof_buckets_module * j + bm_i >= num_bms_before_precompute) { break; } + + uint32_t curr_coeff = scalar.get_scalar_digit(m_nof_buckets_module * j + bm_i, m_c) + carry; + int bkt_idx = 0; // TBD: move inside if and change to ( = ? : ) + // For the edge case of curr_coeff = c (limb=c-1, carry=1) use the sign bit mask + if ((curr_coeff & coeff_bit_mask_with_sign_bit) != 0) { + // Remove sign to infer the bkt idx. + carry = curr_coeff > m_bm_size; + if (!carry) { + bkt_idx = m_bm_size * bm_i + (curr_coeff & coeff_bit_mask_no_sign_bit); + } else { + bkt_idx = m_bm_size * bm_i + ((-curr_coeff) & coeff_bit_mask_no_sign_bit); + } - // Check for collision in that bucket and either dispatch an addition or store the P accordingly. - if (m_bkts_occupancy[bkt_idx]) { - m_bkts_occupancy[bkt_idx] = false; - phase1_push_addition(bkt_idx, m_buckets[bkt_idx], carry > 0 ? A::neg(base) : base); + // Check for collision in that bucket and either dispatch an addition or store the P accordingly. + if (buckets_busy[bkt_idx]) { + buckets[bkt_idx].point = + buckets[bkt_idx].point + ((negate_p_and_s ^ (carry > 0)) ? base_neg : base); // TBD: inplace + } else { + buckets_busy[bkt_idx] = true; + buckets[bkt_idx].point = + P::from_affine(((negate_p_and_s ^ (carry > 0)) ? base_neg : base)); // TBD: inplace + } } else { - m_bkts_occupancy[bkt_idx] = true; - m_buckets[bkt_idx] = carry > 0 ? 
P::neg(P::from_affine(base)) : P::from_affine(base); + // Handle edge case where coeff = 1 << c due to carry overflow which means: + // coeff & coeff_mask == 0 but there is a carry to propagate to the next segment + carry = curr_coeff >> m_c; } - } else { - // Handle edge case where coeff = 1 << c due to carry overflow which means: - // coeff & coeff_mask == 0 but there is a carry to propagate to the next segment - carry = curr_coeff >> m_c; } } } } - phase1_wait_for_completion(); -} -template -void Msm::phase1_push_addition(const unsigned int task_bkt_idx, const P bkt, const A base) -{ - while (m_curr_task == nullptr) { - // Use the search for an available (idle or completed) task as an opportunity to handle the existing results. - m_curr_task = manager.get_idle_or_completed_task(); - if (m_curr_task->is_completed()) { - const int nof_results = m_curr_task->m_nof_valid_points; - m_curr_task->reset_idle(); - for (int i = 0; i < nof_results; i++) { - // Check for collision in the destination bucket, and chain and addition / store result accordingly. - if (m_bkts_occupancy[m_curr_task->m_return_idx[i]]) { - m_bkts_occupancy[m_curr_task->m_return_idx[i]] = false; - m_curr_task->set_phase1_collision_task( - m_curr_task->m_a_points[i], m_buckets[m_curr_task->m_return_idx[i]], m_curr_task->m_return_idx[i]); - } else { - m_buckets[m_curr_task->m_return_idx[i]] = m_curr_task->m_a_points[i]; - m_bkts_occupancy[m_curr_task->m_return_idx[i]] = true; - } - } - } - // If the collision tasks cause the task to be dispatched again this task can't be assigned more additions - - // repeat the loop with a new task. - if (!m_curr_task->is_idle()) { m_curr_task = nullptr; } - } - // After handling the result a new one can be set. 
- m_curr_task->set_phase1_addition_with_affine(bkt, base, task_bkt_idx); - if (!m_curr_task->is_idle()) { m_curr_task = nullptr; } -} - -template -void Msm::phase1_wait_for_completion() -{ - // In case remaining additions are smaller than a batch size - dispatch current task - if (m_curr_task && m_curr_task->is_idle()) { m_curr_task->dispatch(); } - - EcAddTask* task = manager.get_completed_task(); - while (task != nullptr) { - const int nof_results = task->m_nof_valid_points; - task->reset_idle(); - bool had_collision = false; - for (int i = 0; i < nof_results; i++) { - // Check for collision in the destination bucket, and chain and addition / store result accordingly. - if (m_bkts_occupancy[task->m_return_idx[i]]) { - m_bkts_occupancy[task->m_return_idx[i]] = false; - task->set_phase1_collision_task(task->m_a_points[i], m_buckets[task->m_return_idx[i]], task->m_return_idx[i]); - had_collision = true; - } else { - m_buckets[task->m_return_idx[i]] = task->m_a_points[i]; - m_bkts_occupancy[task->m_return_idx[i]] = true; + // Collapse all workers buckets into worker 0 buckets? + void collapse_all_workers_buckets() + { + int bucket_start = 0; + const int nof_buckets_per_thread = (m_nof_total_buckets + m_nof_workers - 1) / m_nof_workers; + for (int worker_i = 0; worker_i < m_nof_workers; worker_i++) { + const int nof_buckets = + std::min(nof_buckets_per_thread, (int)m_nof_total_buckets - worker_i * nof_buckets_per_thread); + if (nof_buckets > 0) { + m_taskflow.emplace( + [this, bucket_start, nof_buckets]() { worker_collapse_all_workers_result(bucket_start, nof_buckets); }); + bucket_start += nof_buckets_per_thread; } } - task->dispatch_if_not_empty(); - - task = manager.get_completed_task(); + run_workers_and_wait(); } -} - -template -void Msm::phase2_bm_sum(std::vector& segments) -{ - phase2_setup(segments); - if (m_segment_size > 1) { - // Send first additions - line additions. 
- for (int i = 0; i < m_num_bms * m_num_bm_segments; i++) { - if (i % NOF_EC_ADDITIONS_IN_BATCH == 0) { m_curr_task = manager.get_idle_task(); } - BmSumSegment& curr_segment = segments[i]; // For readability - - int bkt_idx = curr_segment.m_segment_mem_start + curr_segment.m_idx_in_segment; - P bucket = m_bkts_occupancy[bkt_idx] ? m_buckets[bkt_idx] : P::zero(); - m_curr_task->set_phase2_addition_by_value(curr_segment.line_sum, bucket, i); - } - // Dispatch last task if it is not enough to fill a batch and dispatch automatically. - if (m_curr_task->is_idle()) { m_curr_task->dispatch(); } - // Loop until all line/tri sums are done. - int done_segments = 0; - while (done_segments < m_num_bms * m_num_bm_segments) { - m_curr_task = manager.get_completed_task(); - - // Check if there is a need for a line task according to the received sums counter of one of the received segments - EcAddTask* line_task = nullptr; - if (segments[m_curr_task->m_return_idx[0]].m_nof_received_sums == 1) { line_task = manager.get_idle_task(); } - - const int nof_results = m_curr_task->m_nof_valid_points; - m_curr_task->reset_idle(); - - for (int i = 0; i < nof_results; i++) { - BmSumSegment& curr_segment = segments[m_curr_task->m_return_idx[i]]; // For readability - - if (m_curr_task->m_is_line[i]) { - curr_segment.line_sum = m_curr_task->m_a_points[i]; - } else { - curr_segment.triangle_sum = m_curr_task->m_a_points[i]; - } - curr_segment.m_nof_received_sums++; - - // Check if this was the last addition in the segment - if (curr_segment.m_idx_in_segment < 0) { - done_segments++; - continue; - } - // Otherwise check if it is possible to assign new additions: - // Triangle sum is dependent on the 2 previous sums (line and triangle) - so check if 2 sums were received. - // Line sum (if not the last one in the segment). - - // Due to the choice of num segments being less than half of total tasks there ought to be an idle task - // for the line sum. 
- if (curr_segment.m_nof_received_sums == 2) { - curr_segment.m_nof_received_sums = 0; - m_curr_task->set_phase2_triangle_addition(curr_segment.line_sum, &(curr_segment.triangle_sum)); - curr_segment.m_idx_in_segment--; - - if (curr_segment.m_idx_in_segment >= 0) { - int bkt_idx = curr_segment.m_segment_mem_start + curr_segment.m_idx_in_segment; - if (m_bkts_occupancy[bkt_idx]) { - int return_idx = m_curr_task->m_return_idx[i]; - line_task->set_phase2_line_addition(curr_segment.line_sum, &m_buckets[bkt_idx], return_idx); - } else { - // curr_segment.m_nof_received_sums++; - int return_idx = m_curr_task->m_return_idx[i]; - line_task->set_phase2_addition_by_value(curr_segment.line_sum, P::zero(), return_idx); - } // No need to add a zero - just increase nof_received_sums. - } + // A single thread functionality + void worker_collapse_all_workers_result(const int bucket_start, const int nof_buckets) + { + // run over all buckets under my responsibility. + for (int bucket_i = bucket_start; bucket_i < bucket_start + nof_buckets; bucket_i++) { + bool worker_0_bucket_busy = m_workers_buckets_busy[0][bucket_i]; + + // run over all workers + for (int worker_i = 1; worker_i < m_nof_workers; worker_i++) { + if (m_workers_buckets_busy[worker_i][bucket_i]) { + // add busy buckets to worker_0's bucket + m_workers_buckets[0][bucket_i].point = + worker_0_bucket_busy ? m_workers_buckets[0][bucket_i].point + m_workers_buckets[worker_i][bucket_i].point + : // TBD add inplace + m_workers_buckets[worker_i][bucket_i].point; + worker_0_bucket_busy = true; } } - // Check if tri and line task haven't been dispatched due to not enough inputs - dispatch them - m_curr_task->dispatch_if_not_empty(); - if (line_task) { line_task->dispatch_if_not_empty(); } + // if the bucket is empty for all workers, reset it. 
+ if (!worker_0_bucket_busy) { m_workers_buckets[0][bucket_i].point = P::zero(); } } } -} -template -void Msm::phase2_setup(std::vector& segments) -{ - // Init values of partial (line) and total (triangle) sums - for (int i = 0; i < m_num_bms; i++) { - for (int j = 0; j < m_num_bm_segments - 1; j++) { - BmSumSegment& segment = segments[m_num_bm_segments * i + j]; - int bkt_idx = m_num_bkts * i + m_segment_size * (j + 1); - if (m_bkts_occupancy[bkt_idx]) { - segment.triangle_sum = m_buckets[bkt_idx]; - segment.line_sum = m_buckets[bkt_idx]; - } else { - segment.triangle_sum = P::zero(); - segment.line_sum = P::zero(); - } - segment.m_idx_in_segment = m_segment_size - 2; - segment.m_segment_mem_start = m_num_bkts * i + m_segment_size * j + 1; - } - - // The most significant bucket of every bm is stored in address 0 - - // so the last tri/line sums will be initialized to bucket[0] - BmSumSegment& segment = segments[m_num_bm_segments * (i + 1) - 1]; - int bkt_idx = m_num_bkts * i; - if (m_bkts_occupancy[bkt_idx]) { - segment.triangle_sum = m_buckets[bkt_idx]; - segment.line_sum = m_buckets[bkt_idx]; - } else { - segment.triangle_sum = P::zero(); - segment.line_sum = P::zero(); + // phase 2: accumulate m_segment_size buckets into a line_sum and triangle_sum + void phase2_collapse_segments() + { + uint64_t bucket_start = 0; + for (int segment_idx = 0; segment_idx < m_segments.size(); + segment_idx++) { // TBD: divide the work among m_nof_workers only. 
+ // Each thread is responsible for a sinืขle thread + m_taskflow.emplace([=]() { + const uint32_t segment_size = std::min(m_nof_total_buckets - bucket_start, (uint64_t)m_segment_size); + worker_collapse_segment(m_segments[segment_idx], bucket_start, segment_size); + }); + bucket_start += m_segment_size; } - segment.m_idx_in_segment = m_segment_size - 2; - segment.m_segment_mem_start = m_num_bkts * (i + 1) - m_segment_size + 1; + run_workers_and_wait(); } -} -template -void Msm::phase3_final_accumulator(std::vector& segments, int idx_in_batch, P* result) -{ - // If it isn't the last MSM in the batch - run phase 3 on a separate thread to start utilizing the tasks manager on - // the next phase 1. - if (idx_in_batch == m_batch_size - 1) { - phase3_thread(segments, result); - } else { - m_phase3_threads[idx_in_batch] = std::thread(&Msm::phase3_thread, this, std::move(segments), result); + // single worker task - accumulate a single segment + void worker_collapse_segment(Segment& segment, const int64_t bucket_start, const uint32_t segment_size) + { + // Assumption all buyckets are busy - no need to look at the busy flag + std::vector& buckets = m_workers_buckets[0]; + const int64_t last_bucket_i = bucket_start + segment_size; + const int64_t init_bucket_i = (last_bucket_i % m_bm_size) ? last_bucket_i + : // bucket 0 at the BM contains the last element + last_bucket_i - m_bm_size; + segment.triangle_sum = init_bucket_i < buckets.size() ? 
buckets[init_bucket_i].point : P::zero(); + segment.line_sum = segment.triangle_sum; + for (int64_t bucket_i = last_bucket_i - 1; bucket_i > bucket_start; bucket_i--) { + // Add the bucket to line sum + segment.line_sum = segment.line_sum + buckets[bucket_i].point; // TBD: inplace + // Add line_sum to triangle_sum + segment.triangle_sum = segment.triangle_sum + segment.line_sum; // TBD: inplace + } } -} -template -void Msm::phase3_thread(std::vector segments, P* result) -{ - for (int i = 0; i < m_num_bms; i++) { - // Weighted sum of all the lines for each bm - summed in a similar fashion of the triangle sum of phase 2 - P partial_sum = segments[m_num_bm_segments * (i + 1) - 1].line_sum; - P total_sum = segments[m_num_bm_segments * (i + 1) - 1].line_sum; - for (int j = m_num_bm_segments - 2; j > 0; j--) { - partial_sum = partial_sum + segments[m_num_bm_segments * i + j].line_sum; - total_sum = total_sum + partial_sum; - } - segments[m_num_bm_segments * i].line_sum = total_sum; + // Serial phase - accumulate all segments to final result + void phase3_final_accumulator(P* result) + { + const int nof_segments_per_bm = m_bm_size / m_segment_size; + int nof_segments_left = m_segments.size(); + for (int i = 0; i < m_nof_buckets_module; i++) { + const int cur_bm_nof_segments = std::min(nof_segments_per_bm, nof_segments_left); + const int log_nof_segments_per_bm = + std::log2(nof_segments_per_bm); // TBD: avoid logn. can be calculated differently. + // Weighted sum of all the lines for each bm - summed in a similar fashion of the triangle sum of phase 2 + P partial_sum = nof_segments_per_bm * (i + 1) - 1 < m_segments.size() + ? 
m_segments[nof_segments_per_bm * (i + 1) - 1].line_sum + : P::zero(); + P total_sum = partial_sum; + + // run over all segments in the BM + for (int j = cur_bm_nof_segments - 2; j > 0; j--) { + // accumulate the partial sum + partial_sum = partial_sum + m_segments[nof_segments_per_bm * i + j].line_sum; + // add the partial sum to the total sum + total_sum = total_sum + partial_sum; + } + m_segments[nof_segments_per_bm * i].line_sum = total_sum; - // Convert weighted lines sum to rectangles sum by doubling - int num_doubles = m_c - 1 - m_log_num_segments; - for (int k = 0; k < num_doubles; k++) { - segments[m_num_bm_segments * i].line_sum = P::dbl(segments[m_num_bm_segments * i].line_sum); - } + // Convert weighted lines sum to rectangles sum by doubling + int num_doubles = m_c - 1 - log_nof_segments_per_bm; + for (int k = 0; k < num_doubles; k++) { + m_segments[nof_segments_per_bm * i].line_sum = P::dbl(m_segments[nof_segments_per_bm * i].line_sum); + } - // Sum triangles within bm linearly - for (int j = 1; j < m_num_bm_segments; j++) { - segments[m_num_bm_segments * i].triangle_sum = - segments[m_num_bm_segments * i].triangle_sum + segments[m_num_bm_segments * i + j].triangle_sum; - } + // Sum triangles within bm linearly + for (int j = 1; j < cur_bm_nof_segments && nof_segments_per_bm * i + j < m_segments.size(); j++) { + m_segments[nof_segments_per_bm * i].triangle_sum = + m_segments[nof_segments_per_bm * i].triangle_sum + m_segments[nof_segments_per_bm * i + j].triangle_sum; + } - // After which add the lines and triangle sums to one sum of the entire BM - if (m_num_bm_segments > 1) { - segments[m_num_bm_segments * i].triangle_sum = - segments[m_num_bm_segments * i].triangle_sum + segments[m_num_bm_segments * i].line_sum; + // After which add the lines and triangle sums to one sum of the entire BM + if (cur_bm_nof_segments > 1) { + m_segments[nof_segments_per_bm * i].triangle_sum = + m_segments[nof_segments_per_bm * i].triangle_sum + 
m_segments[nof_segments_per_bm * i].line_sum; + } + nof_segments_left -= nof_segments_per_bm; } - } - // Sum BM sums together - P final_sum = segments[(m_num_bms - 1) * m_num_bm_segments].triangle_sum; - for (int i = m_num_bms - 2; i >= 0; i--) { - // Multiply by the BM digit factor 2^c - i.e. c doublings - for (int j = 0; j < m_c; j++) { - final_sum = P::dbl(final_sum); + // Sum BM sums together + *result = m_segments[(m_nof_buckets_module - 1) * nof_segments_per_bm].triangle_sum; + for (int i = m_nof_buckets_module - 2; i >= 0; i--) { + // Multiply by the BM digit factor 2^c - i.e. c doublings + for (int j = 0; j < m_c; j++) { + *result = P::dbl(*result); + } + *result = *result + m_segments[nof_segments_per_bm * i].triangle_sum; } - final_sum = final_sum + segments[m_num_bm_segments * i].triangle_sum; } - *result = final_sum; -} - -// None class functions below: +}; // end of class Msm /** * @brief Super function that handles the Msm class to calculate a MSM. @@ -732,23 +375,11 @@ template eIcicleError cpu_msm( const Device& device, const scalar_t* scalars, const A* bases, int msm_size, const MSMConfig& config, P* results) { - int c = config.c; - if (c < 1) { c = Msm::get_optimal_c(msm_size, config.precompute_factor); } - - int nof_threads = std::thread::hardware_concurrency() - 1; - if (config.ext && config.ext->has(CpuBackendConfig::CPU_NOF_THREADS)) { - nof_threads = config.ext->get(CpuBackendConfig::CPU_NOF_THREADS); - } - if (nof_threads <= 0) { - ICICLE_LOG_WARNING << "Unable to detect number of hardware supported threads - fixing it to 1\n"; - nof_threads = 1; - } - auto msm = Msm{config, c, nof_threads}; - - for (int i = 0; i < config.batch_size; i++) { - int batch_start_idx = msm_size * i; - int bases_start_idx = config.are_points_shared_in_batch ? 
0 : batch_start_idx; - msm.run_msm(&scalars[batch_start_idx], &bases[bases_start_idx], msm_size, i, &results[i]); + Msm msm(msm_size, config); + for (int batch_i = 0; batch_i < config.batch_size; batch_i++) { + const int batch_start_idx = msm_size * batch_i; + const int bases_start_idx = config.are_points_shared_in_batch ? 0 : batch_start_idx; + msm.run_msm(&scalars[batch_start_idx], &bases[bases_start_idx], &results[batch_i]); } return eIcicleError::SUCCESS; } @@ -771,17 +402,17 @@ eIcicleError cpu_msm_precompute_bases( const MSMConfig& config, A* output_bases) // Pre assigned? { - int c = config.c; - if (c < 1) { c = Msm::get_optimal_c(nof_bases, config.precompute_factor); } + int c = Msm::get_optimal_c(nof_bases, config); - int precompute_factor = config.precompute_factor; - bool is_mont = config.are_points_montgomery_form; - const unsigned int num_bms_no_precomp = (scalar_t::NBITS - 1) / c + 1; + const int precompute_factor = config.precompute_factor; + const bool is_mont = config.are_points_montgomery_form; + const uint scalar_size = scalar_t::NBITS; // TBD handle this config.bitsize != 0 ? config.bitsize : scalar_t::NBITS; + const unsigned int num_bms_no_precomp = (scalar_size - 1) / c + 1; const unsigned int shift = c * ((num_bms_no_precomp - 1) / precompute_factor + 1); for (int i = 0; i < nof_bases; i++) { output_bases[precompute_factor * i] = input_bases[i]; P point = P::from_affine(is_mont ? 
A::from_montgomery(input_bases[i]) : input_bases[i]); - for (int j = 1; j < precompute_factor; j++) { + for (int j = 1; j < precompute_factor; j++) { // TBD parallelize this for (int k = 0; k < shift; k++) { point = P::dbl(point); } From d706320c9d74a24a732cb08efb1cc79227fd7fb1 Mon Sep 17 00:00:00 2001 From: Leon Hibnik <107353745+LeonHibnik@users.noreply.github.com> Date: Mon, 13 Jan 2025 09:56:16 +0200 Subject: [PATCH 06/12] Fix/release script (#721) --- .github/workflows/release.yml | 4 +++- scripts/release/Dockerfile.ubi8 | 1 + scripts/release/Dockerfile.ubi9 | 1 + scripts/release/Dockerfile.ubuntu20 | 1 + scripts/release/Dockerfile.ubuntu22 | 1 + scripts/release/build_release_and_tar.sh | 2 +- 6 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index f9f58318f3..9886b5635f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -64,7 +64,9 @@ jobs: npm run docusaurus docs:version $LATEST_VERSION git add --all git commit -m "Bump docs version" - git push + - name: Push to github branch main + run: | + git push origin main --tags - name: Create draft release env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/scripts/release/Dockerfile.ubi8 b/scripts/release/Dockerfile.ubi8 index b12519fa57..3026099939 100644 --- a/scripts/release/Dockerfile.ubi8 +++ b/scripts/release/Dockerfile.ubi8 @@ -10,6 +10,7 @@ RUN dnf update -y && dnf install -y \ ninja-build \ wget \ gnupg2 \ + git \ && dnf clean all # Add the RPM-based LLVM repository for Clang diff --git a/scripts/release/Dockerfile.ubi9 b/scripts/release/Dockerfile.ubi9 index ad8528b9b4..605df5d1b7 100644 --- a/scripts/release/Dockerfile.ubi9 +++ b/scripts/release/Dockerfile.ubi9 @@ -14,6 +14,7 @@ RUN dnf update -y && dnf install -y \ gcc-c++ \ make \ gnupg \ + git \ && dnf clean all # Add LLVM repository for Clang installation diff --git a/scripts/release/Dockerfile.ubuntu20 b/scripts/release/Dockerfile.ubuntu20 
index 20d8024536..2f307275ca 100644 --- a/scripts/release/Dockerfile.ubuntu20 +++ b/scripts/release/Dockerfile.ubuntu20 @@ -22,6 +22,7 @@ RUN apt-get update && apt-get install -y \ clang \ lldb \ lld \ + git \ && apt-get clean && rm -rf /var/lib/apt/lists/* # Verify installations diff --git a/scripts/release/Dockerfile.ubuntu22 b/scripts/release/Dockerfile.ubuntu22 index 0613edc0ad..5bdc103808 100644 --- a/scripts/release/Dockerfile.ubuntu22 +++ b/scripts/release/Dockerfile.ubuntu22 @@ -13,6 +13,7 @@ RUN apt-get update && apt-get install -y \ software-properties-common \ wget \ gnupg \ + git \ && wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - \ && add-apt-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy main" diff --git a/scripts/release/build_release_and_tar.sh b/scripts/release/build_release_and_tar.sh index 2ea4648744..ca058b2a9c 100755 --- a/scripts/release/build_release_and_tar.sh +++ b/scripts/release/build_release_and_tar.sh @@ -8,7 +8,7 @@ ICICLE_OS=${2:-unknown_os} # Default to "unknown_os" if not set ICICLE_CUDA_VERSION=${3:-cuda_unknown} # Default to "cuda_unknown" if not set # List of fields and curves -fields=("babybear" "stark252" "m31") +fields=("babybear" "stark252" "m31" "koalabear") curves=("bn254" "bls12_381" "bls12_377" "bw6_761" "grumpkin") cd / From b27da85bdfa398f27340d44c807e4bb9d54f09d4 Mon Sep 17 00:00:00 2001 From: yshekel Date: Mon, 13 Jan 2025 09:59:52 +0200 Subject: [PATCH 07/12] Fix bug in CPU vec ops regarding nof workers (#731) --- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 25 ++++++++++---------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index f279c7b034..496621c312 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -371,7 +371,7 @@ class VectorOpTask : public TaskBase public: T m_intermidiate_res; // pointer to the 
output. Can be a vector or scalar pointer uint64_t m_idx_in_batch; // index in the batch. Used in intermediate res tasks -}; // class VectorOpTask +}; #define NOF_OPERATIONS_PER_TASK 512 #define CONFIG_NOF_THREADS_KEY "n_threads" @@ -381,8 +381,9 @@ int get_nof_workers(const VecOpsConfig& config) { if (config.ext && config.ext->has(CONFIG_NOF_THREADS_KEY)) { return config.ext->get(CONFIG_NOF_THREADS_KEY); } - int hw_threads = std::thread::hardware_concurrency(); - return ((hw_threads > 1) ? hw_threads - 1 : 1); // reduce 1 for the main + const int hw_threads = std::thread::hardware_concurrency(); + // Note: no need to account for the main thread in vec-ops since it's doing little work + return std::max(1, hw_threads); } // Execute a full task from the type vector = vector (op) vector @@ -390,7 +391,7 @@ template eIcicleError cpu_2vectors_op(VecOperation op, const T* vec_a, const U* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config) - 1); + TasksManager> task_manager(get_nof_workers(config)); const uint64_t total_nof_operations = size * config.batch_size; for (uint64_t i = 0; i < total_nof_operations; i += NOF_OPERATIONS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); @@ -406,7 +407,7 @@ template eIcicleError cpu_scalar_vector_op( VecOperation op, const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config) - 1); + TasksManager> task_manager(get_nof_workers(config)); const uint64_t total_nof_operations = size; const uint32_t stride = config.columns_batch ? 
config.batch_size : 1; for (uint32_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { @@ -479,7 +480,7 @@ template eIcicleError cpu_convert_montgomery( const Device& device, const T* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config) - 1); + TasksManager> task_manager(get_nof_workers(config)); const uint64_t total_nof_operations = size * config.batch_size; for (uint64_t i = 0; i < total_nof_operations; i += NOF_OPERATIONS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); @@ -499,7 +500,7 @@ REGISTER_CONVERT_MONTGOMERY_BACKEND("CPU", cpu_convert_montgomery); template eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config) - 1); + TasksManager> task_manager(get_nof_workers(config)); std::vector output_initialized = std::vector(config.batch_size, false); uint64_t vec_a_offset = 0; uint64_t idx_in_batch = 0; @@ -539,7 +540,7 @@ template eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config) - 1); + TasksManager> task_manager(get_nof_workers(config)); std::vector output_initialized = std::vector(config.batch_size, false); uint64_t vec_a_offset = 0; uint64_t idx_in_batch = 0; @@ -610,7 +611,7 @@ template eIcicleError out_of_place_matrix_transpose( const Device& device, const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out) { - TasksManager> task_manager(get_nof_workers(config) - 1); + TasksManager> task_manager(get_nof_workers(config)); uint32_t stride = config.columns_batch ? 
config.batch_size : 1; const uint64_t total_elements_one_mat = static_cast(nof_rows) * nof_cols; const uint32_t NOF_ROWS_PER_TASK = @@ -695,7 +696,7 @@ eIcicleError matrix_transpose_necklaces( std::vector start_indices_in_mat; // Collect start indices gen_necklace(1, 1, k, length, necklace, start_indices_in_mat); - TasksManager> task_manager(get_nof_workers(config) - 1); + TasksManager> task_manager(get_nof_workers(config)); for (uint64_t i = 0; i < start_indices_in_mat.size(); i += max_nof_operations) { uint64_t nof_operations = std::min((uint64_t)max_nof_operations, start_indices_in_mat.size() - i); for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { @@ -746,7 +747,7 @@ cpu_bit_reverse(const Device& device, const T* vec_in, uint64_t size, const VecO ICICLE_ASSERT((1ULL << logn) == size) << "Invalid argument - size is not a power of 2"; // Perform the bit reverse - TasksManager> task_manager(get_nof_workers(config) - 1); + TasksManager> task_manager(get_nof_workers(config)); for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { for (uint64_t i = 0; i < size; i += NOF_OPERATIONS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); @@ -783,7 +784,7 @@ eIcicleError cpu_slice( ICICLE_ASSERT(vec_in != nullptr && vec_out != nullptr) << "Error: Invalid argument - input or output vector is null"; ICICLE_ASSERT(offset + (size_out - 1) * stride < size_in) << "Error: Invalid argument - slice out of bound"; - TasksManager> task_manager(get_nof_workers(config) - 1); + TasksManager> task_manager(get_nof_workers(config)); for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { for (uint64_t i = 0; i < size_out; i += NOF_OPERATIONS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); From a751555f0cfb9f68b78c1fe8357f882072ecf62f Mon Sep 17 00:00:00 2001 From: yshekel Date: Tue, 14 Jan 2025 13:10:44 +0200 Subject: [PATCH 08/12] Support android 
and vulkan (#735) - Update Cmake for android cross compilation - Update log.h for android logcat --- icicle/CMakeLists.txt | 42 +--------------- icicle/cmake/curve.cmake | 2 +- icicle/cmake/field.cmake | 2 +- icicle/cmake/setup.cmake | 82 +++++++++++++++++++++++++++++++ icicle/include/icicle/utils/log.h | 44 +++++++++++++++-- 5 files changed, 125 insertions(+), 47 deletions(-) create mode 100644 icicle/cmake/setup.cmake diff --git a/icicle/CMakeLists.txt b/icicle/CMakeLists.txt index 246215676f..45d0e3db71 100644 --- a/icicle/CMakeLists.txt +++ b/icicle/CMakeLists.txt @@ -1,59 +1,23 @@ cmake_minimum_required(VERSION 3.18) +include(cmake/setup.cmake) + project(icicle) # Specify the C++ standard set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED True) -# Select the C++ compiler -find_program(CLANG_COMPILER clang++) -find_program(CLANG_C_COMPILER clang) - -if(CLANG_COMPILER AND CLANG_C_COMPILER) - set(CMAKE_CXX_COMPILER ${CLANG_COMPILER} CACHE STRING "Clang++ compiler" FORCE) - set(CMAKE_C_COMPILER ${CLANG_C_COMPILER} CACHE STRING "Clang compiler" FORCE) - message(STATUS "Using Clang++ as the C++ compiler: ${CLANG_COMPILER}") - message(STATUS "Using Clang as the C compiler: ${CLANG_C_COMPILER}") -else() - message(WARNING "ICICLE CPU works best with clang++ and clang. Defaulting to ${CLANG_COMPILER}") -endif() - - include(cmake/field.cmake) include(cmake/curve.cmake) include(cmake/hash.cmake) -# Set the default build type to Release if not specified -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build: Debug, Release, RelWithDebInfo, MinSizeRel." 
FORCE) -endif() - -# Print the selected build type -message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") - # Prevent build if both SANITIZE and CUDA_BACKEND are enabled if(SANITIZE AND CUDA_BACKEND) message(FATAL_ERROR "Address sanitizer and Cuda cannot be enabled at the same time.") endif() -# Find the ccache program -find_program(CCACHE_PROGRAM ccache) -# If ccache is found, use it as the compiler launcher -if(CCACHE_PROGRAM) - message(STATUS "ccache found: ${CCACHE_PROGRAM}") - - # Use ccache for C and C++ compilers - set(CMAKE_C_COMPILER_LAUNCHER ${CCACHE_PROGRAM}) - set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_PROGRAM}) - set(CMAKE_CUDA_COMPILER_LAUNCHER ${CCACHE_PROGRAM}) -else() - message(STATUS "ccache not found") -endif() - -set(CMAKE_POSITION_INDEPENDENT_CODE ON) - # Build options option(BUILD_TESTS "Build unit test2s. Default=OFF" OFF) # Backends: typically CPU is built into the frontend, the rest are DSOs loaded at runtime from installation @@ -87,8 +51,6 @@ add_library(icicle_device SHARED src/runtime.cpp src/config_extension.cpp ) -target_link_libraries(icicle_device PUBLIC dl) - include_directories(include) # Define the install directory (default is /usr/local) diff --git a/icicle/cmake/curve.cmake b/icicle/cmake/curve.cmake index c82d1b90b0..53148314ce 100644 --- a/icicle/cmake/curve.cmake +++ b/icicle/cmake/curve.cmake @@ -58,7 +58,7 @@ function(setup_curve_target CURVE CURVE_INDEX FEATURES_STRING) # Add additional feature handling calls here set_target_properties(icicle_curve PROPERTIES OUTPUT_NAME "icicle_curve_${CURVE}") - target_link_libraries(icicle_curve PUBLIC icicle_device icicle_field pthread) + target_link_libraries(icicle_curve PUBLIC icicle_device icicle_field) # Ensure CURVE is defined in the cache for backends to see set(CURVE "${CURVE}" CACHE STRING "") diff --git a/icicle/cmake/field.cmake b/icicle/cmake/field.cmake index 953c0d6fca..b7864c869a 100644 --- a/icicle/cmake/field.cmake +++ b/icicle/cmake/field.cmake @@ -56,7 +56,7 @@ 
function(setup_field_target FIELD FIELD_INDEX FEATURES_STRING) # Add additional feature handling calls here set_target_properties(icicle_field PROPERTIES OUTPUT_NAME "icicle_field_${FIELD}") - target_link_libraries(icicle_field PUBLIC icicle_device pthread) + target_link_libraries(icicle_field PUBLIC icicle_device) # Ensure FIELD is defined in the cache for backends to see set(FIELD "${FIELD}" CACHE STRING "") diff --git a/icicle/cmake/setup.cmake b/icicle/cmake/setup.cmake new file mode 100644 index 0000000000..c60d5330e6 --- /dev/null +++ b/icicle/cmake/setup.cmake @@ -0,0 +1,82 @@ + +# Option to cross-compile for Android +option(BUILD_FOR_ANDROID "Cross-compile for Android" OFF) + +if (BUILD_FOR_ANDROID) + message(STATUS "Configuring for Android...") + + # Check for NDK in the environment variable + if (NOT DEFINED ENV{ANDROID_NDK} AND NOT DEFINED ANDROID_NDK) + message(FATAL_ERROR "ANDROID_NDK is not defined. Please set the environment variable or pass -DANDROID_NDK=") + endif() + + # Use the CMake option if specified; otherwise, use the environment variable + if (DEFINED ANDROID_NDK) + set(CMAKE_ANDROID_NDK ${ANDROID_NDK}) + elseif (DEFINED ENV{ANDROID_NDK}) + set(CMAKE_ANDROID_NDK $ENV{ANDROID_NDK}) + endif() + + # Debugging message for NDK path + message(STATUS "Using Android NDK: ${CMAKE_ANDROID_NDK}") + + # Set toolchain and other options + set(ANDROID_MIN_API 24) # Minimum API (24 is for android 7.0 and later) + set(CMAKE_SYSTEM_NAME Android CACHE STRING "Target system name for cross-compilation") + set(ANDROID_ABI arm64-v8a CACHE STRING "Default Android ABI") + set(ANDROID_PLATFORM "android-${ANDROID_MIN_API}" CACHE STRING "Android API level") + set(CMAKE_ANDROID_ARCH_ABI "${ANDROID_ABI}" CACHE STRING "Target ABI for Android") + set(CMAKE_ANDROID_STL_TYPE c++_shared CACHE STRING "Android STL type") + set(CMAKE_TOOLCHAIN_FILE "${CMAKE_ANDROID_NDK}/build/cmake/android.toolchain.cmake" CACHE FILEPATH "Path to the Android toolchain file") + list(APPEND 
CMAKE_SYSTEM_LIBRARY_PATH "${CMAKE_ANDROID_NDK}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android/${ANDROID_MIN_API}") + + message(STATUS "Using ANDROID_MIN_API: ${ANDROID_MIN_API}") + message(STATUS "Using ANDROID_ABI: ${ANDROID_ABI}") + +endif() + +# Platform specific libraries and compiler +if (CMAKE_SYSTEM_NAME STREQUAL "Android") + find_library(LOG_LIB log REQUIRED) # Android log library + set(PLATFORM_LIBS ${LOG_LIB}) +else() + message(STATUS "Configuring for native platform...") + # Select the C++ compiler + find_program(CLANG_COMPILER clang++) + find_program(CLANG_C_COMPILER clang) + + if(CLANG_COMPILER AND CLANG_C_COMPILER) + set(CMAKE_CXX_COMPILER ${CLANG_COMPILER} CACHE STRING "Clang++ compiler" FORCE) + set(CMAKE_C_COMPILER ${CLANG_C_COMPILER} CACHE STRING "Clang compiler" FORCE) + else() + message(WARNING "ICICLE CPU works best with clang++ and clang. Defaulting to ${CLANG_COMPILER}") + endif() + + set(PLATFORM_LIBS pthread dl) +endif() + +link_libraries(${PLATFORM_LIBS}) + +# Find the ccache program +find_program(CCACHE_PROGRAM ccache) +# If ccache is found, use it as the compiler launcher +if(CCACHE_PROGRAM) + message(STATUS "ccache found: ${CCACHE_PROGRAM}") + + # Use ccache for C and C++ compilers + set(CMAKE_C_COMPILER_LAUNCHER ${CCACHE_PROGRAM}) + set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_PROGRAM}) + set(CMAKE_CUDA_COMPILER_LAUNCHER ${CCACHE_PROGRAM}) +else() + message(STATUS "ccache not found") +endif() + +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +# Set the default build type to Release if not specified +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build: Debug, Release, RelWithDebInfo, MinSizeRel." 
FORCE) +endif() + +# Print the selected build type +message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") \ No newline at end of file diff --git a/icicle/include/icicle/utils/log.h b/icicle/include/icicle/utils/log.h index f8e0af490b..62ae869f66 100644 --- a/icicle/include/icicle/utils/log.h +++ b/icicle/include/icicle/utils/log.h @@ -3,6 +3,10 @@ #include #include +#ifdef __ANDROID__ + #include +#endif + #define ICICLE_LOG_VERBOSE Log(Log::Verbose) #define ICICLE_LOG_DEBUG Log(Log::Debug) #define ICICLE_LOG_INFO Log(Log::Info) @@ -21,7 +25,16 @@ class Log ~Log() { - if (level >= s_min_log_level) { std::cerr << oss.str() << std::endl; } + if (level >= s_min_log_level) { +#ifdef __ANDROID__ + // Use Android logcat + android_LogPriority androidPriority = logLevelToAndroidPriority(level); + __android_log_print(androidPriority, "ICICLE", "%s", oss.str().c_str()); +#else + // Use standard error stream for other platforms + std::cerr << oss.str() << std::endl; +#endif + } } template @@ -31,7 +44,7 @@ class Log return *this; } - // Static method to set the log level + // Static method to set the minimum log level static void set_min_log_level(eLogLevel level) { s_min_log_level = level; } private: @@ -43,7 +56,7 @@ class Log { switch (level) { case Verbose: - return "DEBUG"; + return "VERBOSE"; case Debug: return "DEBUG"; case Info: @@ -57,7 +70,28 @@ class Log } } - // logging message with level>=s_min_log_level +#ifdef __ANDROID__ + // Map custom log level to Android log priority + android_LogPriority logLevelToAndroidPriority(eLogLevel level) const + { + switch (level) { + case Verbose: + return ANDROID_LOG_VERBOSE; + case Debug: + return ANDROID_LOG_DEBUG; + case Info: + return ANDROID_LOG_INFO; + case Warning: + return ANDROID_LOG_WARN; + case Error: + return ANDROID_LOG_ERROR; + default: + return ANDROID_LOG_UNKNOWN; + } + } +#endif + + // Static member to hold the minimum log level #if defined(NDEBUG) static inline eLogLevel s_min_log_level = eLogLevel::Info; #else @@ 
-65,4 +99,4 @@ class Log #endif // Note: for verbose, need to explicitly call `set_min_log_level(eLogLevel::Verbose)` -}; +}; \ No newline at end of file From c10aa35f3cbf5685029e7c01bbe2c7b1141204dd Mon Sep 17 00:00:00 2001 From: Miki <100796045+mickeyasa@users.noreply.github.com> Date: Tue, 14 Jan 2025 13:19:50 +0200 Subject: [PATCH 09/12] Parallelize-vecop-program-execution (#736) --- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 39 ++++++++++++++------ 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index 496621c312..38d859e022 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -10,6 +10,7 @@ #include #include +#include "taskflow/taskflow.hpp" #include "icicle/program/program.h" #include "cpu_program_executor.h" @@ -852,20 +853,36 @@ eIcicleError cpu_execute_program( << " parameters"; return eIcicleError::INVALID_ARGUMENT; } + tf::Taskflow taskflow; // Accumulate tasks + tf::Executor executor; // execute all tasks accumulated on multiple threads const uint64_t total_nof_operations = size * config.batch_size; - CpuProgramExecutor prog_executor(program); - // init prog_executor to point to data vectors - for (int param_idx = 0; param_idx < program.m_nof_parameters; ++param_idx) { - prog_executor.m_variable_ptrs[param_idx] = data[param_idx]; - } - // run over all elements in the arrays and execute the program - for (uint64_t i = 0; i < total_nof_operations; i++) { - prog_executor.execute(); - for (int param_idx = 0; param_idx < program.m_nof_parameters; ++param_idx) { - (prog_executor.m_variable_ptrs[param_idx])++; - } + // Divide the problem to workers + const int nof_workers = get_nof_workers(config); + const uint64_t worker_task_size = (total_nof_operations + nof_workers - 1) / nof_workers; // round up + + for (uint64_t start_idx = 0; start_idx < total_nof_operations; start_idx += 
worker_task_size) { + taskflow.emplace([=]() { + CpuProgramExecutor prog_executor(program); + // init prog_executor to point to data vectors + for (int param_idx = 0; param_idx < program.m_nof_parameters; ++param_idx) { + prog_executor.m_variable_ptrs[param_idx] = &(data[param_idx][start_idx]); + } + + const uint64_t task_size = std::min(worker_task_size, total_nof_operations - start_idx); + // run over all task elements in the arrays and execute the program + for (uint64_t i = 0; i < task_size; i++) { + prog_executor.execute(); + // update the program pointers + for (int param_idx = 0; param_idx < program.m_nof_parameters; ++param_idx) { + (prog_executor.m_variable_ptrs[param_idx])++; + } + } + }); } + + executor.run(taskflow).wait(); + taskflow.clear(); return eIcicleError::SUCCESS; } From eab9c508a240e348bcf0b1f030c40dea4aeb2529 Mon Sep 17 00:00:00 2001 From: idanfr-ingo Date: Tue, 14 Jan 2025 13:55:38 +0200 Subject: [PATCH 10/12] Create docs for program & program execution (#722) --- docs/docs/icicle/primitives/program.md | 88 ++++++++++++++++++++++++++ docs/docs/icicle/primitives/vec_ops.md | 15 +++++ docs/sidebars.ts | 5 ++ 3 files changed, 108 insertions(+) create mode 100644 docs/docs/icicle/primitives/program.md diff --git a/docs/docs/icicle/primitives/program.md b/docs/docs/icicle/primitives/program.md new file mode 100644 index 0000000000..85e5441883 --- /dev/null +++ b/docs/docs/icicle/primitives/program.md @@ -0,0 +1,88 @@ +# Programs + +## Overview + +Program is a class that let users define expressions on vector elements, and have ICICLE compile it for the backends for a fused implementation. This solves memory bottlenecks and also let users customize algorithms such as sumcheck. Program can create only element-wise lambda functions. + + +## C++ API + +### Symbol + +Symbol is the basic (template) class that allow users to define their own program. The lambda function the user define will operate on symbols. 
+ +### Defining lambda function + +To define a custom lambda function the user will use Symbol: +```cpp +void lambda_multi_result(std::vector>& vars) +{ + const Symbol& A = vars[0]; + const Symbol& B = vars[1]; + const Symbol& C = vars[2]; + const Symbol& EQ = vars[3]; + vars[4] = EQ * (A * B - C) + scalar_t::from(9); + vars[5] = A * B - C.inverse(); + vars[6] = vars[5]; + vars[3] = 2 * (var[0] + var[1]) // all variables can be both inputs and outputs +} +``` + +Each symbol element at the vector argument `var` represent an input or an output. The type of the symbol (`scalar_t` in this example) will be the type of the inputs and outputs. In this example we created a lambda function with four inputs and three outputs. + +In this example there are four input variables and three three outputs. Inside the function the user can define custom expressions on them. + +Program support few pre-defined programs. The user can use those pre-defined programs without creating a lambda function, as will be explained in the next section. + +### Creating program + +To execute the lambda function we just created we need to create a program from it. +To create program from lambda function we can use the following constructor: + +```cpp +Program(std::function>&)> program_func, const int nof_parameters) +``` + +`program_func` is the lambda function (in the example above `lambda_multi_result`) and `nof_parameters` is the total number of parameter (inputs + outputs) for the lambda (seven in the above example). + +#### Pre-defined programs + +As mentioned before, there are few pre-defined programs the user can use without the need to create a lambda function first. The enum `PreDefinedPrograms` contains the pre-defined program. Using pre-defined function will lead to better performance compared to creating the equivalent lambda function. 
+To create a pre-defined program a different constructor is bing used: + +```cpp +Program(PreDefinedPrograms pre_def) +``` + +`pre_def` is the pre-defined program (from `PreDefinedPrograms`). + +##### PreDefinedPrograms + +```cpp +enum PreDefinedPrograms { + AB_MINUS_C = 0, + EQ_X_AB_MINUS_C +}; +``` + +`AB_MINUS_C` - the pre-defined program `AB - C` for the input vectors `A`, `B` and `C` + +`EQ_X_AB_MINUS_C` - the pre-defined program `EQ(AB - C)` for the input vectors `A`, `B`, `C` and `EQ` + + +### Executing program + +To execute the program the `execute_program` function from the vector operation API should be used. This operation is supported by the CPU and CUDA backends. + + +```cpp +template +eIcicleError +execute_program(std::vector& data, const Program& program, uint64_t size, const VecOpsConfig& config); +``` + +The `data` vector is a vector of pointers to the inputs and output vectors, `program` is the program to execute, `size` is the length of the vectors and `config` is the configuration of the operation. + +For the configuration the field `is_a_on_device` determined whethere the data (*inputs and outputs*) is on device or not. After the execution `data` will reside in the same place as it did before (i.e. the field `is_result_on_device` is irrelevant.) + +> **_NOTE:_** Using program for executing lambdas is recommended only while using the CUDA backend. Program's primary use is to let users to customize algorithms (such as sumcheck). diff --git a/docs/docs/icicle/primitives/vec_ops.md b/docs/docs/icicle/primitives/vec_ops.md index 7f546dc16c..4725f7a490 100644 --- a/docs/docs/icicle/primitives/vec_ops.md +++ b/docs/docs/icicle/primitives/vec_ops.md @@ -78,6 +78,20 @@ template eIcicleError vector_div(const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output); ``` +#### `execute_program` + +Execute a user-defined lambda function with arbitrary number of input and output variables. 
+ +```cpp +template +eIcicleError +execute_program(std::vector& data, const Program& program, uint64_t size, const VecOpsConfig& config); +``` + +`is_result_on_device` of VecOpsConfig is not used here. + +For more details see [program](./program.md). + #### `vector_accumulate` Adds vector b to a, inplace. @@ -208,6 +222,7 @@ template eIcicleError polynomial_division(const T* numerator, int64_t numerator_deg, const T* denumerator, int64_t denumerator_deg, const VecOpsConfig& config, T* q_out /*OUT*/, uint64_t q_size, T* r_out /*OUT*/, uint64_t r_size); ``` + ### Rust and Go bindings - [Golang](../golang-bindings/vec-ops.md) diff --git a/docs/sidebars.ts b/docs/sidebars.ts index 32f264d709..dcc9b77c0b 100644 --- a/docs/sidebars.ts +++ b/docs/sidebars.ts @@ -66,6 +66,11 @@ const cppApi = [ label: "Vector operations", id: "icicle/primitives/vec_ops", }, + { + type: "doc", + label: "Program", + id: "icicle/primitives/program", + }, { type: "doc", label: "Polynomials", From 488421bcb43704ec3e9228ad65ec2c9d71dfb964 Mon Sep 17 00:00:00 2001 From: Aviad Dahan <116077675+aviadingo@users.noreply.github.com> Date: Tue, 14 Jan 2025 14:59:51 +0200 Subject: [PATCH 11/12] Feat: Blake3 (#733) --- .github/workflows/cpp-golang-rust.yml | 111 ++-- docs/docs/icicle/libraries.md | 1 + docs/docs/icicle/primitives/hash.md | 9 +- examples/c++/best-practice-ntt/example.cpp | 4 +- icicle/backend/cpu/CMakeLists.txt | 4 + icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 2 +- icicle/backend/cpu/src/hash/blake3.c | 591 ++++++++++++++++++ icicle/backend/cpu/src/hash/blake3.h | 57 ++ icicle/backend/cpu/src/hash/blake3_dispatch.c | 62 ++ icicle/backend/cpu/src/hash/blake3_impl.h | 205 ++++++ icicle/backend/cpu/src/hash/blake3_portable.c | 176 ++++++ icicle/backend/cpu/src/hash/cpu_blake3.cpp | 53 ++ icicle/backend/cpu/src/hash/cpu_poseidon2.cpp | 2 +- icicle/cmake/hash.cmake | 1 + .../icicle/backend/hash/blake3_backend.h | 25 + .../include/icicle/fields/quartic_extension.h | 2 +- 
icicle/include/icicle/fields/storage.h | 12 +- icicle/include/icicle/hash/blake3.h | 20 + icicle/src/hash/blake3.cpp | 17 + icicle/src/hash/hash_c_api.cpp | 14 + icicle/tests/test_hash_api.cpp | 73 +++ wrappers/golang/hash/blake3.go | 19 + wrappers/golang/hash/include/blake3.h | 17 + wrappers/golang/hash/tests/hash_test.go | 73 +++ wrappers/rust/icicle-hash/src/blake3.rs | 18 + wrappers/rust/icicle-hash/src/lib.rs | 1 + wrappers/rust/icicle-hash/src/tests.rs | 67 ++ 27 files changed, 1569 insertions(+), 67 deletions(-) create mode 100644 icicle/backend/cpu/src/hash/blake3.c create mode 100644 icicle/backend/cpu/src/hash/blake3.h create mode 100644 icicle/backend/cpu/src/hash/blake3_dispatch.c create mode 100644 icicle/backend/cpu/src/hash/blake3_impl.h create mode 100644 icicle/backend/cpu/src/hash/blake3_portable.c create mode 100644 icicle/backend/cpu/src/hash/cpu_blake3.cpp create mode 100644 icicle/include/icicle/backend/hash/blake3_backend.h create mode 100644 icicle/include/icicle/hash/blake3.h create mode 100644 icicle/src/hash/blake3.cpp create mode 100644 wrappers/golang/hash/blake3.go create mode 100644 wrappers/golang/hash/include/blake3.h create mode 100644 wrappers/rust/icicle-hash/src/blake3.rs diff --git a/.github/workflows/cpp-golang-rust.yml b/.github/workflows/cpp-golang-rust.yml index 11c9242c7a..09eda977d8 100644 --- a/.github/workflows/cpp-golang-rust.yml +++ b/.github/workflows/cpp-golang-rust.yml @@ -1,4 +1,4 @@ -name: Spell Check, C++/CUDA/Go/RUST & Examples +name: C++/CUDA/Go/RUST on: pull_request: @@ -31,6 +31,8 @@ jobs: name: Check Code Format runs-on: ubuntu-22.04 needs: check-changed-files + container: + image: silkeh/clang:19-bookworm # Replace with a container having the desired clang-format version steps: - name: Checkout uses: actions/checkout@v4 @@ -41,7 +43,9 @@ jobs: go-version: '1.22.0' - name: Check clang-format if: needs.check-changed-files.outputs.cpp == 'true' - run: ./scripts/format_all.sh . 
--check --exclude "build|wrappers" + run: | + clang-format --version + ./scripts/format_all.sh . --check --exclude "build|wrappers" - name: Check gofmt if: needs.check-changed-files.outputs.golang == 'true' run: if [[ $(go list ./... | xargs go fmt) ]]; then echo "Please run go fmt"; exit 1; fi @@ -82,7 +86,7 @@ jobs: - name: Checkout Repo uses: actions/checkout@v4 - name: Checkout CUDA Backend - if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' + if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' || needs.check-changed-files.outputs.rust == 'true' uses: actions/checkout@v4 with: repository: ingonyama-zk/icicle-cuda-backend @@ -90,7 +94,7 @@ jobs: ssh-key: ${{ secrets.CUDA_PULL_KEY }} ref: ${{ needs.extract-cuda-backend-branch.outputs.cuda-backend-branch }} - name: Get CUDA Backend Commit SHA - if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' + if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' || needs.check-changed-files.outputs.rust == 'true' working-directory: ./icicle/backend/cuda id: extract-cuda-sha run: | @@ -98,7 +102,7 @@ jobs: echo "CUDA Backend Commit SHA: $CUDA_BACKEND_SHA" echo "cuda-backend-sha=$CUDA_BACKEND_SHA" >> $GITHUB_OUTPUT - name: Set CUDA backend flag - if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' + if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' || needs.check-changed-files.outputs.rust == 'true' id: cuda-flag run: | CUDA_BACKEND_SHA=${{ steps.extract-cuda-sha.outputs.cuda-backend-sha }} @@ -136,33 +140,17 @@ jobs: echo "CMAKE_INSTALL_PREFIX=-DCMAKE_INSTALL_PREFIX=${INSTALL_PATH}" >> $GITHUB_OUTPUT echo "ICICLE_BACKEND_INSTALL_DIR=${INSTALL_PATH}/lib" >> $GITHUB_OUTPUT fi - - name: Setup go - if: 
needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' - timeout-minutes: 15 - uses: actions/setup-go@v5 - with: - go-version: '1.22.0' - name: Build curve working-directory: ./icicle - if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' + if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' || needs.check-changed-files.outputs.rust == 'true' run: | mkdir -p build && rm -rf build/* cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DCURVE=${{ matrix.curve.name }} ${{ matrix.curve.build_args }} ${{ steps.cuda-flag.outputs.CUDA_FLAG }} ${{ steps.cuda-flag.outputs.CMAKE_INSTALL_PREFIX }} -S . -B build cmake --build build --target install -j rm -rf ${{ steps.cuda-flag.outputs.INSTALL_PATH }}/lib/gh_commit_sha_${{ matrix.curve.name }}* touch ${{ steps.cuda-flag.outputs.COMMIT_FILE_PATH }} - - name: Run RUST Curve Tests - working-directory: ./wrappers/rust/icicle-curves - if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp == 'true' - run: | - CURVE=${{ matrix.curve.name }} - CURVE_DIR=icicle-${CURVE//_/-} - export ICICLE_BACKEND_INSTALL_DIR=${{ steps.cuda-flag.outputs.INSTALL_PATH }} - cd ./$CURVE_DIR - cargo test --release --verbose -- --skip phase - cargo test phase2 --release - cargo test phase3 --release + - name: Run C++ Curve Tests working-directory: ./icicle/build/tests if: needs.check-changed-files.outputs.cpp == 'true' @@ -183,6 +171,23 @@ jobs: cd - fi done + - name: Run RUST Curve Tests + working-directory: ./wrappers/rust/icicle-curves + if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp == 'true' + run: | + CURVE=${{ matrix.curve.name }} + CURVE_DIR=icicle-${CURVE//_/-} + export ICICLE_BACKEND_INSTALL_DIR=${{ steps.cuda-flag.outputs.INSTALL_PATH }} + cd ./$CURVE_DIR + cargo test --release --verbose -- --skip phase + cargo 
test phase2 --release + cargo test phase3 --release + - name: Setup go + if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' + timeout-minutes: 15 + uses: actions/setup-go@v5 + with: + go-version: '1.22.0' - name: Run Golang curve Tests working-directory: ./wrappers/golang/curves if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' @@ -214,7 +219,7 @@ jobs: - name: Checkout Repo uses: actions/checkout@v4 - name: Checkout CUDA Backend - if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' + if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' || needs.check-changed-files.outputs.rust == 'true' uses: actions/checkout@v4 with: repository: ingonyama-zk/icicle-cuda-backend @@ -222,7 +227,7 @@ jobs: ssh-key: ${{ secrets.CUDA_PULL_KEY }} ref: ${{ needs.extract-cuda-backend-branch.outputs.cuda-backend-branch }} - name: Get CUDA Backend Commit SHA - if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' + if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' || needs.check-changed-files.outputs.rust == 'true' working-directory: ./icicle/backend/cuda id: extract-cuda-sha run: | @@ -230,7 +235,7 @@ jobs: echo "CUDA Backend Commit SHA: $CUDA_BACKEND_SHA" echo "cuda-backend-sha=$CUDA_BACKEND_SHA" >> $GITHUB_OUTPUT - name: Set CUDA backend flag - if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' + if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' || needs.check-changed-files.outputs.rust == 'true' id: cuda-flag run: | CUDA_BACKEND_SHA=${{ steps.extract-cuda-sha.outputs.cuda-backend-sha }} @@ -268,12 +273,6 @@ jobs: echo 
"CMAKE_INSTALL_PREFIX=-DCMAKE_INSTALL_PREFIX=${INSTALL_PATH}" >> $GITHUB_OUTPUT echo "ICICLE_BACKEND_INSTALL_DIR=${INSTALL_PATH}/lib" >> $GITHUB_OUTPUT fi - - name: Setup go - if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' - timeout-minutes: 15 - uses: actions/setup-go@v5 - with: - go-version: '1.22.0' - name: Build field working-directory: ./icicle @@ -284,17 +283,7 @@ jobs: cmake --build build --target install -j rm -rf ${{ steps.cuda-flag.outputs.INSTALL_PATH }}/lib/gh_commit_sha_${{ matrix.field.name }}* touch ${{ steps.cuda-flag.outputs.COMMIT_FILE_PATH }} - - name: Run RUST Field Tests - working-directory: ./wrappers/rust/icicle-fields - if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp == 'true' - run: | - FIELD=${{ matrix.field.name }} - FIELD_DIR=icicle-${FIELD//_/-} - export ICICLE_BACKEND_INSTALL_DIR=${{ steps.cuda-flag.outputs.INSTALL_PATH }} - cd ./$FIELD_DIR - cargo test --release --verbose -- --skip phase - cargo test phase2 --release - cargo test phase3 --release + - name: Run C++ field Tests working-directory: ./icicle/build/tests if: needs.check-changed-files.outputs.cpp == 'true' @@ -317,6 +306,24 @@ jobs: fi done + - name: Run RUST Field Tests + working-directory: ./wrappers/rust/icicle-fields + if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp == 'true' + run: | + FIELD=${{ matrix.field.name }} + FIELD_DIR=icicle-${FIELD//_/-} + export ICICLE_BACKEND_INSTALL_DIR=${{ steps.cuda-flag.outputs.INSTALL_PATH }} + cd ./$FIELD_DIR + cargo test --release --verbose -- --skip phase + cargo test phase2 --release + cargo test phase3 --release + + - name: Setup go + if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' + timeout-minutes: 15 + uses: actions/setup-go@v5 + with: + go-version: '1.22.0' - name: Run Golang field Tests working-directory: 
./wrappers/golang/fields if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' @@ -343,7 +350,7 @@ jobs: - name: Checkout Repo uses: actions/checkout@v4 - name: Checkout CUDA Backend - if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' + if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' || needs.check-changed-files.outputs.rust == 'true' uses: actions/checkout@v4 with: repository: ingonyama-zk/icicle-cuda-backend @@ -351,7 +358,7 @@ jobs: ssh-key: ${{ secrets.CUDA_PULL_KEY }} ref: ${{ needs.extract-cuda-backend-branch.outputs.cuda-backend-branch }} - name: Get CUDA Backend Commit SHA - if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' + if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' || needs.check-changed-files.outputs.rust == 'true' working-directory: ./icicle/backend/cuda id: extract-cuda-sha run: | @@ -359,7 +366,7 @@ jobs: echo "CUDA Backend Commit SHA: $CUDA_BACKEND_SHA" echo "cuda-backend-sha=$CUDA_BACKEND_SHA" >> $GITHUB_OUTPUT - name: Set CUDA backend flag - if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' + if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' || needs.check-changed-files.outputs.rust == 'true' id: cuda-flag run: | CUDA_BACKEND_SHA=${{ steps.extract-cuda-sha.outputs.cuda-backend-sha }} @@ -396,12 +403,6 @@ jobs: echo "CMAKE_INSTALL_PREFIX=-DCMAKE_INSTALL_PREFIX=${INSTALL_PATH}" >> $GITHUB_OUTPUT echo "ICICLE_BACKEND_INSTALL_DIR=${INSTALL_PATH}/lib" >> $GITHUB_OUTPUT fi - - name: Setup go - if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' - timeout-minutes: 15 - uses: actions/setup-go@v5 - with: - 
go-version: '1.22.0' - name: Build working-directory: ./icicle if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' @@ -420,6 +421,12 @@ jobs: cargo test --release --verbose -- --skip phase cargo test phase2 --release cargo test phase3 --release + - name: Setup go + if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' + timeout-minutes: 15 + uses: actions/setup-go@v5 + with: + go-version: '1.22.0' - name: Test GoLang Hashes working-directory: ./wrappers/golang/hash if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' diff --git a/docs/docs/icicle/libraries.md b/docs/docs/icicle/libraries.md index 684a6b6177..6a71e4cb1e 100644 --- a/docs/docs/icicle/libraries.md +++ b/docs/docs/icicle/libraries.md @@ -53,6 +53,7 @@ Each library has a corresponding crate. See [programmers guide](./programmers_gu | [Keccak](primitives/hash#keccak-and-sha3) | supporting 256b and 512b digest | | [SHA3](primitives/hash#keccak-and-sha3) | supporting 256b and 512b digest | | [Blake2s](primitives/hash#blake2s) | digest is 256b | +| [Blake3](primitives/hash#blake3) | digest is 256b | | [Merkle-Tree](primitives/merkle) | works with any combination of hash functions | diff --git a/docs/docs/icicle/primitives/hash.md b/docs/docs/icicle/primitives/hash.md index 702487a5a0..a88bc996fe 100644 --- a/docs/docs/icicle/primitives/hash.md +++ b/docs/docs/icicle/primitives/hash.md @@ -23,8 +23,9 @@ ICICLE supports the following hash functions: 3. **SHA3-256** 4. **SHA3-512** 5. **Blake2s** -6. **Poseidon** -7. **Poseidon2** +6. **Blake3** +7. **Poseidon** +8. **Poseidon2** :::info Additional hash functions might be added in the future. Stay tuned! @@ -40,6 +41,10 @@ Keccak can take input messages of any length and produce a fixed-size hash. 
It u [Blake2s](https://www.rfc-editor.org/rfc/rfc7693.txt) is an optimized cryptographic hash function that provides high performance while ensuring strong security. Blake2s is ideal for hashing small data (such as field elements), especially when speed is crucial. It produces a 256-bit (32-byte) output and is often used in cryptographic protocols. +### Blake3 + +[Blake3](https://www.ietf.org/archive/id/draft-aumasson-blake3-00.html) is a high-performance cryptographic hash function designed for both small and large data. With a tree-based design for efficient parallelism, it offers strong security, speed, and scalability for modern cryptographic applications. + +### Poseidon + +[Poseidon](https://eprint.iacr.org/2019/458) is a cryptographic hash function designed specifically for field elements. It is highly optimized for zero-knowledge proofs (ZKPs) and is commonly used in ZK-SNARK systems. Poseidon's main strength lies in its arithmetization-friendly design, meaning it can be efficiently expressed as arithmetic constraints within a ZK-SNARK circuit. 
diff --git a/examples/c++/best-practice-ntt/example.cpp b/examples/c++/best-practice-ntt/example.cpp index 7db88aa357..7f4bc974d1 100644 --- a/examples/c++/best-practice-ntt/example.cpp +++ b/examples/c++/best-practice-ntt/example.cpp @@ -116,8 +116,8 @@ int main(int argc, char* argv[]) // Clean-up for (int i = 0; i < 2; i++) { ICICLE_CHECK(icicle_free(d_vec[i])); - delete[](h_inp[i]); - delete[](h_out[i]); + delete[] (h_inp[i]); + delete[] (h_out[i]); } ICICLE_CHECK(icicle_destroy_stream(stream_compute)); ICICLE_CHECK(icicle_destroy_stream(stream_d2h)); diff --git a/icicle/backend/cpu/CMakeLists.txt b/icicle/backend/cpu/CMakeLists.txt index 75a35f1e84..20eae67f01 100644 --- a/icicle/backend/cpu/CMakeLists.txt +++ b/icicle/backend/cpu/CMakeLists.txt @@ -67,6 +67,10 @@ if (HASH) target_sources(icicle_hash PRIVATE src/hash/cpu_keccak.cpp src/hash/cpu_blake2s.cpp + src/hash/cpu_blake3.cpp + src/hash/blake3.c + src/hash/blake3_dispatch.c + src/hash/blake3_portable.c src/hash/cpu_merkle_tree.cpp ) target_include_directories(icicle_hash PUBLIC include) diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index 38d859e022..18fb5dd032 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -372,7 +372,7 @@ class VectorOpTask : public TaskBase public: T m_intermidiate_res; // pointer to the output. Can be a vector or scalar pointer uint64_t m_idx_in_batch; // index in the batch. 
Used in intermediate res tasks -}; +}; // class VectorOpTask #define NOF_OPERATIONS_PER_TASK 512 #define CONFIG_NOF_THREADS_KEY "n_threads" diff --git a/icicle/backend/cpu/src/hash/blake3.c b/icicle/backend/cpu/src/hash/blake3.c new file mode 100644 index 0000000000..617949f109 --- /dev/null +++ b/icicle/backend/cpu/src/hash/blake3.c @@ -0,0 +1,591 @@ +/*BLAKE3 Hash function based on the original design by the BLAKE3 team https://github.com/BLAKE3-team/BLAKE3 */ + +#include +#include "blake3.h" +#include "blake3_impl.h" + +const char* blake3_version(void) { return BLAKE3_VERSION_STRING; } + +INLINE void chunk_state_init(blake3_chunk_state* self, const uint32_t key[8], uint8_t flags) +{ + memcpy(self->cv, key, BLAKE3_KEY_LEN); + self->chunk_counter = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + self->buf_len = 0; + self->blocks_compressed = 0; + self->flags = flags; +} + +INLINE void chunk_state_reset(blake3_chunk_state* self, const uint32_t key[8], uint64_t chunk_counter) +{ + memcpy(self->cv, key, BLAKE3_KEY_LEN); + self->chunk_counter = chunk_counter; + self->blocks_compressed = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + self->buf_len = 0; +} + +INLINE size_t chunk_state_len(const blake3_chunk_state* self) +{ + return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) + ((size_t)self->buf_len); +} + +INLINE size_t chunk_state_fill_buf(blake3_chunk_state* self, const uint8_t* input, size_t input_len) +{ + size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len); + if (take > input_len) { take = input_len; } + uint8_t* dest = self->buf + ((size_t)self->buf_len); + memcpy(dest, input, take); + self->buf_len += (uint8_t)take; + return take; +} + +INLINE uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state* self) +{ + if (self->blocks_compressed == 0) { + return CHUNK_START; + } else { + return 0; + } +} + +typedef struct { + uint32_t input_cv[8]; + uint64_t counter; + uint8_t block[BLAKE3_BLOCK_LEN]; + uint8_t block_len; + uint8_t flags; +} output_t; 
+ +INLINE output_t make_output( + const uint32_t input_cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) +{ + output_t ret; + memcpy(ret.input_cv, input_cv, 32); + memcpy(ret.block, block, BLAKE3_BLOCK_LEN); + ret.block_len = block_len; + ret.counter = counter; + ret.flags = flags; + return ret; +} + +// Chaining values within a given chunk (specifically the compress_in_place +// interface) are represented as words. This avoids unnecessary bytes<->words +// conversion overhead in the portable implementation. However, the hash_many +// interface handles both user input and parent node blocks, so it accepts +// bytes. For that reason, chaining values in the CV stack are represented as +// bytes. +INLINE void output_chaining_value(const output_t* self, uint8_t cv[32]) +{ + uint32_t cv_words[8]; + memcpy(cv_words, self->input_cv, 32); + blake3_compress_in_place(cv_words, self->block, self->block_len, self->counter, self->flags); + store_cv_words(cv, cv_words); +} + +INLINE void output_root_bytes(const output_t* self, uint64_t seek, uint8_t* out, size_t out_len) +{ + if (out_len == 0) { return; } + uint64_t output_block_counter = seek / 64; + size_t offset_within_block = seek % 64; + uint8_t wide_buf[64]; + if (offset_within_block) { + blake3_compress_xof( + self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf); + const size_t available_bytes = 64 - offset_within_block; + const size_t bytes = out_len > available_bytes ? 
available_bytes : out_len; + memcpy(out, wide_buf + offset_within_block, bytes); + out += bytes; + out_len -= bytes; + output_block_counter += 1; + } + if (out_len / 64) { + blake3_xof_many( + self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, out, out_len / 64); + } + output_block_counter += out_len / 64; + out += out_len & -64; + out_len -= out_len & -64; + if (out_len) { + blake3_compress_xof( + self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf); + memcpy(out, wide_buf, out_len); + } +} + +INLINE void chunk_state_update(blake3_chunk_state* self, const uint8_t* input, size_t input_len) +{ + if (self->buf_len > 0) { + size_t take = chunk_state_fill_buf(self, input, input_len); + input += take; + input_len -= take; + if (input_len > 0) { + blake3_compress_in_place( + self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter, self->flags | chunk_state_maybe_start_flag(self)); + self->blocks_compressed += 1; + self->buf_len = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + } + } + + while (input_len > BLAKE3_BLOCK_LEN) { + blake3_compress_in_place( + self->cv, input, BLAKE3_BLOCK_LEN, self->chunk_counter, self->flags | chunk_state_maybe_start_flag(self)); + self->blocks_compressed += 1; + input += BLAKE3_BLOCK_LEN; + input_len -= BLAKE3_BLOCK_LEN; + } + + chunk_state_fill_buf(self, input, input_len); +} + +INLINE output_t chunk_state_output(const blake3_chunk_state* self) +{ + uint8_t block_flags = self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END; + return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter, block_flags); +} + +INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN], const uint32_t key[8], uint8_t flags) +{ + return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT); +} + +// Given some input larger than one chunk, return the number of bytes that +// should go in the left subtree. 
This is the largest power-of-2 number of +// chunks that leaves at least 1 byte for the right subtree. +INLINE size_t left_len(size_t content_len) +{ + // Subtract 1 to reserve at least one byte for the right side. content_len + // should always be greater than BLAKE3_CHUNK_LEN. + size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN; + return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN; +} + +// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time +// on a single thread. Write out the chunk chaining values and return the +// number of chunks hashed. These chunks are never the root and never empty; +// those cases use a different codepath. +INLINE size_t compress_chunks_parallel( + const uint8_t* input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t* out) +{ + const uint8_t* chunks_array[MAX_SIMD_DEGREE]; + size_t input_position = 0; + size_t chunks_array_len = 0; + while (input_len - input_position >= BLAKE3_CHUNK_LEN) { + chunks_array[chunks_array_len] = &input[input_position]; + input_position += BLAKE3_CHUNK_LEN; + chunks_array_len += 1; + } + + blake3_hash_many( + chunks_array, chunks_array_len, BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter, true, flags, CHUNK_START, + CHUNK_END, out); + + // Hash the remaining partial chunk, if there is one. Note that the empty + // chunk (meaning the empty message) is a different codepath. 
+ if (input_len > input_position) { + uint64_t counter = chunk_counter + (uint64_t)chunks_array_len; + blake3_chunk_state chunk_state; + chunk_state_init(&chunk_state, key, flags); + chunk_state.chunk_counter = counter; + chunk_state_update(&chunk_state, &input[input_position], input_len - input_position); + output_t output = chunk_state_output(&chunk_state); + output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]); + return chunks_array_len + 1; + } else { + return chunks_array_len; + } +} + +// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time +// on a single thread. Write out the parent chaining values and return the +// number of parents hashed. (If there's an odd input chaining value left over, +// return it as an additional output.) These parents are never the root and +// never empty; those cases use a different codepath. +INLINE size_t compress_parents_parallel( + const uint8_t* child_chaining_values, size_t num_chaining_values, const uint32_t key[8], uint8_t flags, uint8_t* out) +{ + const uint8_t* parents_array[MAX_SIMD_DEGREE_OR_2]; + size_t parents_array_len = 0; + while (num_chaining_values - (2 * parents_array_len) >= 2) { + parents_array[parents_array_len] = &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN]; + parents_array_len += 1; + } + + blake3_hash_many( + parents_array, parents_array_len, 1, key, + 0, // Parents always use counter 0. + false, flags | PARENT, + 0, // Parents have no start flags. + 0, // Parents have no end flags. + out); + + // If there's an odd child left over, it becomes an output. + if (num_chaining_values > 2 * parents_array_len) { + memcpy( + &out[parents_array_len * BLAKE3_OUT_LEN], &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN], + BLAKE3_OUT_LEN); + return parents_array_len + 1; + } else { + return parents_array_len; + } +} + +// The wide helper function returns (writes out) an array of chaining values +// and returns the length of that array. 
The number of chaining values returned +// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, +// if the input is shorter than that many chunks. The reason for maintaining a +// wide array of chaining values going back up the tree, is to allow the +// implementation to hash as many parents in parallel as possible. +// +// As a special case when the SIMD degree is 1, this function will still return +// at least 2 outputs. This guarantees that this function doesn't perform the +// root compression. (If it did, it would use the wrong flags, and also we +// wouldn't be able to implement extendable output.) Note that this function is +// not used when the whole input is only 1 chunk long; that's a different +// codepath. +// +// Why not just have the caller split the input on the first update(), instead +// of implementing this special rule? Because we don't want to limit SIMD or +// multi-threading parallelism for that update(). +static size_t blake3_compress_subtree_wide( + const uint8_t* input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t* out) +{ + // Note that the single chunk case does *not* bump the SIMD degree up to 2 + // when it is 1. If this implementation adds multi-threading in the future, + // this gives us the option of multi-threading even the 2-chunk case, which + // can help performance on smaller platforms. + if (input_len <= blake3_simd_degree() * BLAKE3_CHUNK_LEN) { + return compress_chunks_parallel(input, input_len, key, chunk_counter, flags, out); + } + + // With more than simd_degree chunks, we need to recurse. Start by dividing + // the input into left and right subtrees. (Note that this is only optimal + // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree + // of 3 or something, we'll need a more complicated strategy.) 
+ size_t left_input_len = left_len(input_len); + size_t right_input_len = input_len - left_input_len; + const uint8_t* right_input = &input[left_input_len]; + uint64_t right_chunk_counter = chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN); + + // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to + // account for the special case of returning 2 outputs when the SIMD degree + // is 1. + uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t degree = blake3_simd_degree(); + if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) { + // The special case: We always use a degree of at least two, to make + // sure there are two outputs. Except, as noted above, at the chunk + // level, where we allow degree=1. (Note that the 1-chunk-input case is + // a different codepath.) + degree = 2; + } + uint8_t* right_cvs = &cv_array[degree * BLAKE3_OUT_LEN]; + + // Recurse! If this implementation adds multi-threading support in the + // future, this is where it will go. + size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key, chunk_counter, flags, cv_array); + size_t right_n = + blake3_compress_subtree_wide(right_input, right_input_len, key, right_chunk_counter, flags, right_cvs); + + // The special case again. If simd_degree=1, then we'll have left_n=1 and + // right_n=1. Rather than compressing them into a single output, return + // them directly, to make sure we always have at least two outputs. + if (left_n == 1) { + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); + return 2; + } + + // Otherwise, do one layer of parent node compression. + size_t num_chaining_values = left_n + right_n; + return compress_parents_parallel(cv_array, num_chaining_values, key, flags, out); +} + +// Hash a subtree with compress_subtree_wide(), and then condense the resulting +// list of chaining values down to a single parent node. Don't compress that +// last parent node, however. 
Instead, return its message bytes (the +// concatenated chaining values of its children). This is necessary when the +// first call to update() supplies a complete subtree, because the topmost +// parent node of that subtree could end up being the root. It's also necessary +// for extended output in the general case. +// +// As with compress_subtree_wide(), this function is not used on inputs of 1 +// chunk or less. That's a different codepath. +INLINE void compress_subtree_to_parent_node( + const uint8_t* input, + size_t input_len, + const uint32_t key[8], + uint64_t chunk_counter, + uint8_t flags, + uint8_t out[2 * BLAKE3_OUT_LEN]) +{ + uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, chunk_counter, flags, cv_array); + assert(num_cvs <= MAX_SIMD_DEGREE_OR_2); + // The following loop never executes when MAX_SIMD_DEGREE_OR_2 is 2, because + // as we just asserted, num_cvs will always be <=2 in that case. But GCC + // (particularly GCC 8.5) can't tell that it never executes, and if NDEBUG is + // set then it emits incorrect warnings here. We tried a few different + // hacks to silence these, but in the end our hacks just produced different + // warnings (see https://github.com/BLAKE3-team/BLAKE3/pull/380). Out of + // desperation, we ifdef out this entire loop when we know it's not needed. +#if MAX_SIMD_DEGREE_OR_2 > 2 + // If MAX_SIMD_DEGREE_OR_2 is greater than 2 and there's enough input, + // compress_subtree_wide() returns more than 2 chaining values. Condense + // them into 2 by forming parent nodes repeatedly. 
+ uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; + while (num_cvs > 2) { + num_cvs = compress_parents_parallel(cv_array, num_cvs, key, flags, out_array); + memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); + } +#endif + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); +} + +INLINE void hasher_init_base(blake3_hasher* self, const uint32_t key[8], uint8_t flags) +{ + memcpy(self->key, key, BLAKE3_KEY_LEN); + chunk_state_init(&self->chunk, key, flags); + self->cv_stack_len = 0; +} + +void blake3_hasher_init(blake3_hasher* self) { hasher_init_base(self, IV, 0); } + +void blake3_hasher_init_keyed(blake3_hasher* self, const uint8_t key[BLAKE3_KEY_LEN]) +{ + uint32_t key_words[8]; + load_key_words(key, key_words); + hasher_init_base(self, key_words, KEYED_HASH); +} + +void blake3_hasher_init_derive_key_raw(blake3_hasher* self, const void* context, size_t context_len) +{ + blake3_hasher context_hasher; + hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT); + blake3_hasher_update(&context_hasher, context, context_len); + uint8_t context_key[BLAKE3_KEY_LEN]; + blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN); + uint32_t context_key_words[8]; + load_key_words(context_key, context_key_words); + hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL); +} + +void blake3_hasher_init_derive_key(blake3_hasher* self, const char* context) +{ + blake3_hasher_init_derive_key_raw(self, context, strlen(context)); +} + +// As described in hasher_push_cv() below, we do "lazy merging", delaying +// merges until right before the next CV is about to be added. This is +// different from the reference implementation. Another difference is that we +// aren't always merging 1 chunk at a time. Instead, each CV might represent +// any power-of-two number of chunks, as long as the smaller-above-larger stack +// order is maintained. 
Instead of the "count the trailing 0-bits" algorithm +// described in the spec, we use a "count the total number of 1-bits" variant +// that doesn't require us to retain the subtree size of the CV on top of the +// stack. The principle is the same: each CV that should remain in the stack is +// represented by a 1-bit in the total number of chunks (or bytes) so far. +INLINE void hasher_merge_cv_stack(blake3_hasher* self, uint64_t total_len) +{ + size_t post_merge_stack_len = (size_t)popcnt(total_len); + while (self->cv_stack_len > post_merge_stack_len) { + uint8_t* parent_node = &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN]; + output_t output = parent_output(parent_node, self->key, self->chunk.flags); + output_chaining_value(&output, parent_node); + self->cv_stack_len -= 1; + } +} + +// In reference_impl.rs, we merge the new CV with existing CVs from the stack +// before pushing it. We can do that because we know more input is coming, so +// we know none of the merges are root. +// +// This setting is different. We want to feed as much input as possible to +// compress_subtree_wide(), without setting aside anything for the chunk_state. +// If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once +// as a single subtree, if at all possible. +// +// This leads to two problems: +// 1) This 64 KiB input might be the only call that ever gets made to update. +// In this case, the root node of the 64 KiB subtree would be the root node +// of the whole tree, and it would need to be ROOT finalized. We can't +// compress it until we know. +// 2) This 64 KiB input might complete a larger tree, whose root node is +// similarly going to be the root of the whole tree. For example, maybe +// we have 196 KiB (that is, 128 + 64) hashed so far. We can't compress the +// node at the root of the 256 KiB subtree until we know how to finalize it. +// +// The second problem is solved with "lazy merging". 
That is, when we're about +// to add a CV to the stack, we don't merge it with anything first, as the +// reference impl does. Instead we do merges using the *previous* CV that was +// added, which is sitting on top of the stack, and we put the new CV +// (unmerged) on top of the stack afterwards. This guarantees that we never +// merge the root node until finalize(). +// +// Solving the first problem requires an additional tool, +// compress_subtree_to_parent_node(). That function always returns the top +// *two* chaining values of the subtree it's compressing. We then do lazy +// merging with each of them separately, so that the second CV will always +// remain unmerged. (That also helps us support extendable output when we're +// hashing an input all-at-once.) +INLINE void hasher_push_cv(blake3_hasher* self, uint8_t new_cv[BLAKE3_OUT_LEN], uint64_t chunk_counter) +{ + hasher_merge_cv_stack(self, chunk_counter); + memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv, BLAKE3_OUT_LEN); + self->cv_stack_len += 1; +} + +void blake3_hasher_update(blake3_hasher* self, const void* input, size_t input_len) +{ + // Explicitly checking for zero avoids causing UB by passing a null pointer + // to memcpy. This comes up in practice with things like: + // std::vector v; + // blake3_hasher_update(&hasher, v.data(), v.size()); + if (input_len == 0) { return; } + + const uint8_t* input_bytes = (const uint8_t*)input; + + // If we have some partial chunk bytes in the internal chunk_state, we need + // to finish that chunk first. + if (chunk_state_len(&self->chunk) > 0) { + size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk); + if (take > input_len) { take = input_len; } + chunk_state_update(&self->chunk, input_bytes, take); + input_bytes += take; + input_len -= take; + // If we've filled the current chunk and there's more coming, finalize this + // chunk and proceed. In this case we know it's not the root. 
+ if (input_len > 0) { + output_t output = chunk_state_output(&self->chunk); + uint8_t chunk_cv[32]; + output_chaining_value(&output, chunk_cv); + hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter); + chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1); + } else { + return; + } + } + + // Now the chunk_state is clear, and we have more input. If there's more than + // a single chunk (so, definitely not the root chunk), hash the largest whole + // subtree we can, with the full benefits of SIMD (and maybe in the future, + // multi-threading) parallelism. Two restrictions: + // - The subtree has to be a power-of-2 number of chunks. Only subtrees along + // the right edge can be incomplete, and we don't know where the right edge + // is going to be until we get to finalize(). + // - The subtree must evenly divide the total number of chunks up until this + // point (if total is not 0). If the current incomplete subtree is only + // waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have + // to complete the current subtree first. + // Because we might need to break up the input to form powers of 2, or to + // evenly divide what we already have, this part runs in a loop. + while (input_len > BLAKE3_CHUNK_LEN) { + size_t subtree_len = round_down_to_power_of_2(input_len); + uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN; + // Shrink the subtree_len until it evenly divides the count so far. We know + // that subtree_len itself is a power of 2, so we can use a bitmasking + // trick instead of an actual remainder operation. (Note that if the caller + // consistently passes power-of-2 inputs of the same size, as is hopefully + // typical, this loop condition will always fail, and subtree_len will + // always be the full length of the input.) + // + // An aside: We don't have to shrink subtree_len quite this much. 
For + // example, if count_so_far is 1, we could pass 2 chunks to + // compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still + // get the right answer in the end, and we might get to use 2-way SIMD + // parallelism. The problem with this optimization, is that it gets us + // stuck always hashing 2 chunks. The total number of chunks will remain + // odd, and we'll never graduate to higher degrees of parallelism. See + // https://github.com/BLAKE3-team/BLAKE3/issues/69. + while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) { + subtree_len /= 2; + } + // The shrunken subtree_len might now be 1 chunk long. If so, hash that one + // chunk by itself. Otherwise, compress the subtree into a pair of CVs. + uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN; + if (subtree_len <= BLAKE3_CHUNK_LEN) { + blake3_chunk_state chunk_state; + chunk_state_init(&chunk_state, self->key, self->chunk.flags); + chunk_state.chunk_counter = self->chunk.chunk_counter; + chunk_state_update(&chunk_state, input_bytes, subtree_len); + output_t output = chunk_state_output(&chunk_state); + uint8_t cv[BLAKE3_OUT_LEN]; + output_chaining_value(&output, cv); + hasher_push_cv(self, cv, chunk_state.chunk_counter); + } else { + // This is the high-performance happy path, though getting here depends + // on the caller giving us a long enough input. + uint8_t cv_pair[2 * BLAKE3_OUT_LEN]; + compress_subtree_to_parent_node( + input_bytes, subtree_len, self->key, self->chunk.chunk_counter, self->chunk.flags, cv_pair); + hasher_push_cv(self, cv_pair, self->chunk.chunk_counter); + hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN], self->chunk.chunk_counter + (subtree_chunks / 2)); + } + self->chunk.chunk_counter += subtree_chunks; + input_bytes += subtree_len; + input_len -= subtree_len; + } + + // If there's any remaining input less than a full chunk, add it to the chunk + // state. 
In that case, also do a final merge loop to make sure the subtree + // stack doesn't contain any unmerged pairs. The remaining input means we + // know these merges are non-root. This merge loop isn't strictly necessary + // here, because hasher_push_chunk_cv already does its own merge loop, but it + // simplifies blake3_hasher_finalize below. + if (input_len > 0) { + chunk_state_update(&self->chunk, input_bytes, input_len); + hasher_merge_cv_stack(self, self->chunk.chunk_counter); + } +} + +void blake3_hasher_finalize(const blake3_hasher* self, uint8_t* out, size_t out_len) +{ + blake3_hasher_finalize_seek(self, 0, out, out_len); +} + +void blake3_hasher_finalize_seek(const blake3_hasher* self, uint64_t seek, uint8_t* out, size_t out_len) +{ + // Explicitly checking for zero avoids causing UB by passing a null pointer + // to memcpy. This comes up in practice with things like: + // std::vector v; + // blake3_hasher_finalize(&hasher, v.data(), v.size()); + if (out_len == 0) { return; } + + // If the subtree stack is empty, then the current chunk is the root. + if (self->cv_stack_len == 0) { + output_t output = chunk_state_output(&self->chunk); + output_root_bytes(&output, seek, out, out_len); + return; + } + // If there are any bytes in the chunk state, finalize that chunk and do a + // roll-up merge between that chunk hash and every subtree in the stack. In + // this case, the extra merge loop at the end of blake3_hasher_update + // guarantees that none of the subtrees in the stack need to be merged with + // each other first. Otherwise, if there are no bytes in the chunk state, + // then the top of the stack is a chunk hash, and we start the merge from + // that. + output_t output; + size_t cvs_remaining; + if (chunk_state_len(&self->chunk) > 0) { + cvs_remaining = self->cv_stack_len; + output = chunk_state_output(&self->chunk); + } else { + // There are always at least 2 CVs in the stack in this case. 
+ cvs_remaining = self->cv_stack_len - 2; + output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key, self->chunk.flags); + } + while (cvs_remaining > 0) { + cvs_remaining -= 1; + uint8_t parent_block[BLAKE3_BLOCK_LEN]; + memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32); + output_chaining_value(&output, &parent_block[32]); + output = parent_output(parent_block, self->key, self->chunk.flags); + } + output_root_bytes(&output, seek, out, out_len); +} + +void blake3_hasher_reset(blake3_hasher* self) +{ + chunk_state_reset(&self->chunk, self->key, 0); + self->cv_stack_len = 0; +} diff --git a/icicle/backend/cpu/src/hash/blake3.h b/icicle/backend/cpu/src/hash/blake3.h new file mode 100644 index 0000000000..55b238f1ac --- /dev/null +++ b/icicle/backend/cpu/src/hash/blake3.h @@ -0,0 +1,57 @@ +/*BLAKE3 Hash function based on the original design by the BLAKE3 team https://github.com/BLAKE3-team/BLAKE3 */ + +#ifndef BLAKE3_H +#define BLAKE3_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define BLAKE3_VERSION_STRING "1.5.5" +#define BLAKE3_KEY_LEN 32 +#define BLAKE3_OUT_LEN 32 +#define BLAKE3_BLOCK_LEN 64 +#define BLAKE3_CHUNK_LEN 1024 +#define BLAKE3_MAX_DEPTH 54 + +// This struct is a private implementation detail. It has to be here because +// it's part of blake3_hasher below. +typedef struct { + uint32_t cv[8]; + uint64_t chunk_counter; + uint8_t buf[BLAKE3_BLOCK_LEN]; + uint8_t buf_len; + uint8_t blocks_compressed; + uint8_t flags; +} blake3_chunk_state; + +typedef struct { + uint32_t key[8]; + blake3_chunk_state chunk; + uint8_t cv_stack_len; + // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, + // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk + // requires a 4th entry, rather than merging everything down to 1, because we + // don't know whether more input is coming. This is different from how the + // reference implementation does things. 
+ uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN]; +} blake3_hasher; + +const char* blake3_version(void); +void blake3_hasher_init(blake3_hasher* self); +void blake3_hasher_init_keyed(blake3_hasher* self, const uint8_t key[BLAKE3_KEY_LEN]); +void blake3_hasher_init_derive_key(blake3_hasher* self, const char* context); +void blake3_hasher_init_derive_key_raw(blake3_hasher* self, const void* context, size_t context_len); +void blake3_hasher_update(blake3_hasher* self, const void* input, size_t input_len); +void blake3_hasher_finalize(const blake3_hasher* self, uint8_t* out, size_t out_len); +void blake3_hasher_finalize_seek(const blake3_hasher* self, uint64_t seek, uint8_t* out, size_t out_len); +void blake3_hasher_reset(blake3_hasher* self); + +#ifdef __cplusplus +} +#endif + +#endif /* BLAKE3_H */ diff --git a/icicle/backend/cpu/src/hash/blake3_dispatch.c b/icicle/backend/cpu/src/hash/blake3_dispatch.c new file mode 100644 index 0000000000..296e769e31 --- /dev/null +++ b/icicle/backend/cpu/src/hash/blake3_dispatch.c @@ -0,0 +1,62 @@ +/*BLAKE3 Hash function based on the original design by the BLAKE3 team https://github.com/BLAKE3-team/BLAKE3 */ + +#include +#include +#include + +#include "blake3_impl.h" + +void blake3_compress_in_place( + uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) +{ + blake3_compress_in_place_portable(cv, block, block_len, counter, flags); +} + +void blake3_compress_xof( + const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, + uint64_t counter, + uint8_t flags, + uint8_t out[64]) +{ + blake3_compress_xof_portable(cv, block, block_len, counter, flags, out); +} + +void blake3_xof_many( + const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, + uint64_t counter, + uint8_t flags, + uint8_t out[64], + size_t outblocks) +{ + if (outblocks == 0) { + // The current assembly implementation always outputs at least 1 block. 
+ return; + } + + for (size_t i = 0; i < outblocks; ++i) { + blake3_compress_xof(cv, block, block_len, counter + i, flags, out + 64 * i); + } +} + +void blake3_hash_many( + const uint8_t* const* inputs, + size_t num_inputs, + size_t blocks, + const uint32_t key[8], + uint64_t counter, + bool increment_counter, + uint8_t flags, + uint8_t flags_start, + uint8_t flags_end, + uint8_t* out) +{ + blake3_hash_many_portable( + inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); +} + +// The dynamically detected SIMD degree of the current platform. +size_t blake3_simd_degree(void) { return 1; } diff --git a/icicle/backend/cpu/src/hash/blake3_impl.h b/icicle/backend/cpu/src/hash/blake3_impl.h new file mode 100644 index 0000000000..1bd2bc0046 --- /dev/null +++ b/icicle/backend/cpu/src/hash/blake3_impl.h @@ -0,0 +1,205 @@ +/*BLAKE3 Hash function based on the original design by the BLAKE3 team https://github.com/BLAKE3-team/BLAKE3 */ + +#ifndef BLAKE3_IMPL_H +#define BLAKE3_IMPL_H + +#include +#include +#include +#include +#include + +#include "blake3.h" + +// internal flags +enum blake3_flags { + CHUNK_START = 1 << 0, + CHUNK_END = 1 << 1, + PARENT = 1 << 2, + ROOT = 1 << 3, + KEYED_HASH = 1 << 4, + DERIVE_KEY_CONTEXT = 1 << 5, + DERIVE_KEY_MATERIAL = 1 << 6, +}; + +// This C implementation tries to support recent versions of GCC, Clang, and +// MSVC. + +#define INLINE static inline __attribute__((always_inline)) +#define MAX_SIMD_DEGREE 1 + +// There are some places where we want a static size that's equal to the +// MAX_SIMD_DEGREE, but also at least 2. +#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? 
MAX_SIMD_DEGREE : 2) + +static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, + 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL}; + +static const uint8_t MSG_SCHEDULE[7][16] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8}, + {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1}, {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6}, + {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4}, {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7}, + {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}, +}; + +/* Find index of the highest set bit */ +/* x is assumed to be nonzero. */ +static unsigned int highest_one(uint64_t x) +{ +#if defined(__GNUC__) || defined(__clang__) + return 63 ^ (unsigned int)__builtin_clzll(x); +#else + unsigned int c = 0; + if (x & 0xffffffff00000000ULL) { + x >>= 32; + c += 32; + } + if (x & 0x00000000ffff0000ULL) { + x >>= 16; + c += 16; + } + if (x & 0x000000000000ff00ULL) { + x >>= 8; + c += 8; + } + if (x & 0x00000000000000f0ULL) { + x >>= 4; + c += 4; + } + if (x & 0x000000000000000cULL) { + x >>= 2; + c += 2; + } + if (x & 0x0000000000000002ULL) { c += 1; } + return c; +#endif +} + +// Count the number of 1 bits. +INLINE unsigned int popcnt(uint64_t x) +{ +#if defined(__GNUC__) || defined(__clang__) + return (unsigned int)__builtin_popcountll(x); +#else + unsigned int count = 0; + while (x != 0) { + count += 1; + x &= x - 1; + } + return count; +#endif +} + +// Largest power of two less than or equal to x. As a special case, returns 1 +// when x is 0. 
+INLINE uint64_t round_down_to_power_of_2(uint64_t x) { return 1ULL << highest_one(x | 1); } + +INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; } + +INLINE uint32_t counter_high(uint64_t counter) { return (uint32_t)(counter >> 32); } + +INLINE uint32_t load32(const void* src) +{ + const uint8_t* p = (const uint8_t*)src; + return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); +} + +INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], uint32_t key_words[8]) +{ + key_words[0] = load32(&key[0 * 4]); + key_words[1] = load32(&key[1 * 4]); + key_words[2] = load32(&key[2 * 4]); + key_words[3] = load32(&key[3 * 4]); + key_words[4] = load32(&key[4 * 4]); + key_words[5] = load32(&key[5 * 4]); + key_words[6] = load32(&key[6 * 4]); + key_words[7] = load32(&key[7 * 4]); +} + +INLINE void load_block_words(const uint8_t block[BLAKE3_BLOCK_LEN], uint32_t block_words[16]) +{ + for (size_t i = 0; i < 16; i++) { + block_words[i] = load32(&block[i * 4]); + } +} + +INLINE void store32(void* dst, uint32_t w) +{ + uint8_t* p = (uint8_t*)dst; + p[0] = (uint8_t)(w >> 0); + p[1] = (uint8_t)(w >> 8); + p[2] = (uint8_t)(w >> 16); + p[3] = (uint8_t)(w >> 24); +} + +INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) +{ + store32(&bytes_out[0 * 4], cv_words[0]); + store32(&bytes_out[1 * 4], cv_words[1]); + store32(&bytes_out[2 * 4], cv_words[2]); + store32(&bytes_out[3 * 4], cv_words[3]); + store32(&bytes_out[4 * 4], cv_words[4]); + store32(&bytes_out[5 * 4], cv_words[5]); + store32(&bytes_out[6 * 4], cv_words[6]); + store32(&bytes_out[7 * 4], cv_words[7]); +} + +void blake3_compress_in_place( + uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags); + +void blake3_compress_xof( + const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, + uint64_t counter, + uint8_t flags, + uint8_t out[64]); + +void 
blake3_xof_many( + const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, + uint64_t counter, + uint8_t flags, + uint8_t out[64], + size_t outblocks); + +void blake3_hash_many( + const uint8_t* const* inputs, + size_t num_inputs, + size_t blocks, + const uint32_t key[8], + uint64_t counter, + bool increment_counter, + uint8_t flags, + uint8_t flags_start, + uint8_t flags_end, + uint8_t* out); + +size_t blake3_simd_degree(void); + +// Declarations for implementation-specific functions. +void blake3_compress_in_place_portable( + uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags); + +void blake3_compress_xof_portable( + const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, + uint64_t counter, + uint8_t flags, + uint8_t out[64]); + +void blake3_hash_many_portable( + const uint8_t* const* inputs, + size_t num_inputs, + size_t blocks, + const uint32_t key[8], + uint64_t counter, + bool increment_counter, + uint8_t flags, + uint8_t flags_start, + uint8_t flags_end, + uint8_t* out); + +#endif /* BLAKE3_IMPL_H */ diff --git a/icicle/backend/cpu/src/hash/blake3_portable.c b/icicle/backend/cpu/src/hash/blake3_portable.c new file mode 100644 index 0000000000..9ef1293418 --- /dev/null +++ b/icicle/backend/cpu/src/hash/blake3_portable.c @@ -0,0 +1,176 @@ +/*BLAKE3 Hash function based on the original design by the BLAKE3 team https://github.com/BLAKE3-team/BLAKE3 */ + +#include "blake3_impl.h" +#include + +INLINE uint32_t rotr32(uint32_t w, uint32_t c) { return (w >> c) | (w << (32 - c)); } + +INLINE void g(uint32_t* state, size_t a, size_t b, size_t c, size_t d, uint32_t x, uint32_t y) +{ + state[a] = state[a] + state[b] + x; + state[d] = rotr32(state[d] ^ state[a], 16); + state[c] = state[c] + state[d]; + state[b] = rotr32(state[b] ^ state[c], 12); + state[a] = state[a] + state[b] + y; + state[d] = rotr32(state[d] ^ state[a], 8); + state[c] = state[c] + state[d]; + 
state[b] = rotr32(state[b] ^ state[c], 7); +} + +INLINE void round_fn(uint32_t state[16], const uint32_t* msg, size_t round) +{ + // Select the message schedule based on the round. + const uint8_t* schedule = MSG_SCHEDULE[round]; + + // Mix the columns. + g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); + g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); + g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); + g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); + + // Mix the rows. + g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); + g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); + g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); + g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); +} + +INLINE void compress_pre( + uint32_t state[16], + const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, + uint64_t counter, + uint8_t flags) +{ + uint32_t block_words[16]; + block_words[0] = load32(block + 4 * 0); + block_words[1] = load32(block + 4 * 1); + block_words[2] = load32(block + 4 * 2); + block_words[3] = load32(block + 4 * 3); + block_words[4] = load32(block + 4 * 4); + block_words[5] = load32(block + 4 * 5); + block_words[6] = load32(block + 4 * 6); + block_words[7] = load32(block + 4 * 7); + block_words[8] = load32(block + 4 * 8); + block_words[9] = load32(block + 4 * 9); + block_words[10] = load32(block + 4 * 10); + block_words[11] = load32(block + 4 * 11); + block_words[12] = load32(block + 4 * 12); + block_words[13] = load32(block + 4 * 13); + block_words[14] = load32(block + 4 * 14); + block_words[15] = load32(block + 4 * 15); + + state[0] = cv[0]; + state[1] = cv[1]; + state[2] = cv[2]; + state[3] = cv[3]; + state[4] = cv[4]; + state[5] = cv[5]; + state[6] = cv[6]; + state[7] = cv[7]; + state[8] = IV[0]; + state[9] = IV[1]; + state[10] = IV[2]; + state[11] = IV[3]; + state[12] = counter_low(counter); + state[13] = counter_high(counter); + state[14] = 
(uint32_t)block_len; + state[15] = (uint32_t)flags; + + round_fn(state, &block_words[0], 0); + round_fn(state, &block_words[0], 1); + round_fn(state, &block_words[0], 2); + round_fn(state, &block_words[0], 3); + round_fn(state, &block_words[0], 4); + round_fn(state, &block_words[0], 5); + round_fn(state, &block_words[0], 6); +} + +void blake3_compress_in_place_portable( + uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) +{ + uint32_t state[16]; + compress_pre(state, cv, block, block_len, counter, flags); + cv[0] = state[0] ^ state[8]; + cv[1] = state[1] ^ state[9]; + cv[2] = state[2] ^ state[10]; + cv[3] = state[3] ^ state[11]; + cv[4] = state[4] ^ state[12]; + cv[5] = state[5] ^ state[13]; + cv[6] = state[6] ^ state[14]; + cv[7] = state[7] ^ state[15]; +} + +void blake3_compress_xof_portable( + const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, + uint64_t counter, + uint8_t flags, + uint8_t out[64]) +{ + uint32_t state[16]; + compress_pre(state, cv, block, block_len, counter, flags); + + store32(&out[0 * 4], state[0] ^ state[8]); + store32(&out[1 * 4], state[1] ^ state[9]); + store32(&out[2 * 4], state[2] ^ state[10]); + store32(&out[3 * 4], state[3] ^ state[11]); + store32(&out[4 * 4], state[4] ^ state[12]); + store32(&out[5 * 4], state[5] ^ state[13]); + store32(&out[6 * 4], state[6] ^ state[14]); + store32(&out[7 * 4], state[7] ^ state[15]); + store32(&out[8 * 4], state[8] ^ cv[0]); + store32(&out[9 * 4], state[9] ^ cv[1]); + store32(&out[10 * 4], state[10] ^ cv[2]); + store32(&out[11 * 4], state[11] ^ cv[3]); + store32(&out[12 * 4], state[12] ^ cv[4]); + store32(&out[13 * 4], state[13] ^ cv[5]); + store32(&out[14 * 4], state[14] ^ cv[6]); + store32(&out[15 * 4], state[15] ^ cv[7]); +} + +INLINE void hash_one_portable( + const uint8_t* input, + size_t blocks, + const uint32_t key[8], + uint64_t counter, + uint8_t flags, + uint8_t flags_start, + uint8_t flags_end, + 
uint8_t out[BLAKE3_OUT_LEN]) +{ + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { block_flags |= flags_end; } + blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + store_cv_words(out, cv); +} + +void blake3_hash_many_portable( + const uint8_t* const* inputs, + size_t num_inputs, + size_t blocks, + const uint32_t key[8], + uint64_t counter, + bool increment_counter, + uint8_t flags, + uint8_t flags_start, + uint8_t flags_end, + uint8_t* out) +{ + while (num_inputs > 0) { + hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out); + if (increment_counter) { counter += 1; } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} diff --git a/icicle/backend/cpu/src/hash/cpu_blake3.cpp b/icicle/backend/cpu/src/hash/cpu_blake3.cpp new file mode 100644 index 0000000000..9afe6674b1 --- /dev/null +++ b/icicle/backend/cpu/src/hash/cpu_blake3.cpp @@ -0,0 +1,53 @@ +/*BLAKE3 Hash function based on the original design by the BLAKE3 team https://github.com/BLAKE3-team/BLAKE3 */ + +#include "blake3.h" +#include "icicle/backend/hash/blake3_backend.h" +#include "icicle/utils/modifiers.h" +#include +#include +#include + +namespace icicle { + + class Blake3BackendCPU : public HashBackend + { + public: + Blake3BackendCPU(uint64_t input_chunk_size) : HashBackend("Blake3-CPU", BLAKE3_OUTBYTES, input_chunk_size) {} + + eIcicleError hash(const std::byte* input, uint64_t size, const HashConfig& config, std::byte* output) const override + { + const auto digest_size_in_bytes = output_size(); + const auto single_input_size = get_single_chunk_size(size); + + // Initialize the hasher + blake3_hasher hasher; + + for (unsigned batch_idx = 0; batch_idx < config.batch; ++batch_idx) { + const std::byte* batch_input = input + batch_idx * 
single_input_size; + uint64_t batch_size = single_input_size; + + blake3_hasher_init(&hasher); + blake3_hasher_update(&hasher, reinterpret_cast(batch_input), batch_size); + + uint8_t* batch_output = reinterpret_cast(output + batch_idx * digest_size_in_bytes); + blake3_hasher_finalize(&hasher, batch_output, digest_size_in_bytes); + } + + return eIcicleError::SUCCESS; + } + + private: + static constexpr unsigned int BLAKE3_OUTBYTES = 32; // BLAKE3 default output size in bytes + }; + + /************************ Blake3 registration ************************/ + eIcicleError + create_blake3_hash_backend(const Device& device, uint64_t input_chunk_size, std::shared_ptr& backend) + { + backend = std::make_shared(input_chunk_size); + return eIcicleError::SUCCESS; + } + + REGISTER_BLAKE3_FACTORY_BACKEND("CPU", create_blake3_hash_backend); + +} // namespace icicle \ No newline at end of file diff --git a/icicle/backend/cpu/src/hash/cpu_poseidon2.cpp b/icicle/backend/cpu/src/hash/cpu_poseidon2.cpp index 5f664860e7..df1c519cd6 100644 --- a/icicle/backend/cpu/src/hash/cpu_poseidon2.cpp +++ b/icicle/backend/cpu/src/hash/cpu_poseidon2.cpp @@ -147,7 +147,7 @@ namespace icicle { ICICLE_LOG_ERROR << "cpu_poseidon2_init_default_constants: T (width) must be one of [2, 3, 4, 8, 12, 16, 20, 24]\n"; return eIcicleError::INVALID_ARGUMENT; - } // switch (T) { + } // switch (T) { if (full_rounds == 0 && partial_rounds == 0) { // All arrays are empty in this case. 
continue; } diff --git a/icicle/cmake/hash.cmake b/icicle/cmake/hash.cmake index 6d43c3e034..e8fc0d97c0 100644 --- a/icicle/cmake/hash.cmake +++ b/icicle/cmake/hash.cmake @@ -5,6 +5,7 @@ function(setup_hash_target) target_sources(icicle_hash PRIVATE src/hash/keccak.cpp src/hash/blake2s.cpp + src/hash/blake3.cpp src/hash/merkle_tree.cpp src/hash/hash_c_api.cpp src/hash/merkle_c_api.cpp diff --git a/icicle/include/icicle/backend/hash/blake3_backend.h b/icicle/include/icicle/backend/hash/blake3_backend.h new file mode 100644 index 0000000000..f9b5825946 --- /dev/null +++ b/icicle/include/icicle/backend/hash/blake3_backend.h @@ -0,0 +1,25 @@ +#pragma once + +#include +#include "icicle/utils/utils.h" +#include "icicle/device.h" +#include "icicle/hash/blake3.h" + +namespace icicle { + + /*************************** Backend registration ***************************/ + using Blake3FactoryImpl = std::function& backend /*OUT*/)>; + + // Blake3 256 + void register_blake3_factory(const std::string& deviceType, Blake3FactoryImpl impl); + +#define REGISTER_BLAKE3_FACTORY_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_blake3) = []() -> bool { \ + register_blake3_factory(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + +} // namespace icicle \ No newline at end of file diff --git a/icicle/include/icicle/fields/quartic_extension.h b/icicle/include/icicle/fields/quartic_extension.h index 784298cac9..6478e915c6 100644 --- a/icicle/include/icicle/fields/quartic_extension.h +++ b/icicle/include/icicle/fields/quartic_extension.h @@ -241,7 +241,7 @@ class QuarticExtensionField FF::reduce( (CONFIG::nonresidue_is_negative ? (FF::mul_wide(xs.real, x0) + FF::template mul_unsigned(FF::mul_wide(xs.im2, x2))) - : (FF::mul_wide(xs.real, x0)) - FF::template mul_unsigned(FF::mul_wide(xs.im2, x2)))), + : (FF::mul_wide(xs.real, x0))-FF::template mul_unsigned(FF::mul_wide(xs.im2, x2)))), FF::reduce( (CONFIG::nonresidue_is_negative ? 
FWide::neg(FF::template mul_unsigned(FF::mul_wide(xs.im3, x2))) diff --git a/icicle/include/icicle/fields/storage.h b/icicle/include/icicle/fields/storage.h index 76245db166..097db881fd 100644 --- a/icicle/include/icicle/fields/storage.h +++ b/icicle/include/icicle/fields/storage.h @@ -16,8 +16,7 @@ struct #ifdef __CUDA_ARCH__ __align__(LIMBS_ALIGNMENT(1)) #endif - storage<1> -{ + storage<1> { static constexpr unsigned LC = 1; uint32_t limbs[1]; }; @@ -28,8 +27,7 @@ struct #ifdef __CUDA_ARCH__ __align__(LIMBS_ALIGNMENT(1)) #endif - storage<3> -{ + storage<3> { static constexpr unsigned LC = 3; uint32_t limbs[3]; }; @@ -40,8 +38,7 @@ struct #ifdef __CUDA_ARCH__ __align__(LIMBS_ALIGNMENT(LIMBS_COUNT)) #endif - storage -{ + storage { static_assert(LIMBS_COUNT % 2 == 0, "odd number of limbs is not supported\n"); static constexpr unsigned LC = LIMBS_COUNT; union { // works only with even LIMBS_COUNT @@ -55,7 +52,6 @@ struct #ifdef __CUDA_ARCH__ __align__(LIMBS_ALIGNMENT(LIMBS_COUNT)) #endif - storage_array -{ + storage_array { storage storages[OMEGAS_COUNT]; }; \ No newline at end of file diff --git a/icicle/include/icicle/hash/blake3.h b/icicle/include/icicle/hash/blake3.h new file mode 100644 index 0000000000..089122572d --- /dev/null +++ b/icicle/include/icicle/hash/blake3.h @@ -0,0 +1,20 @@ +#pragma once + +#include "icicle/hash/hash.h" + +namespace icicle { + + /** + * @brief Creates a Blake3 hash object. + * + * This function constructs a Hash object configured for Blake3, with the + * appropriate backend selected based on the current device. + * + * @param input_chunk_size size of input in bytes for the Blake3 hash. + * @return Hash object encapsulating the Blake3 backend. 
+ */ + Hash create_blake3_hash(uint64_t input_chunk_size = 0); + struct Blake3 { + inline static Hash create(uint64_t input_chunk_size = 0) { return create_blake3_hash(input_chunk_size); } + }; +} // namespace icicle \ No newline at end of file diff --git a/icicle/src/hash/blake3.cpp b/icicle/src/hash/blake3.cpp new file mode 100644 index 0000000000..3279fb0155 --- /dev/null +++ b/icicle/src/hash/blake3.cpp @@ -0,0 +1,17 @@ +#include "icicle/errors.h" +#include "icicle/backend/hash/blake3_backend.h" +#include "icicle/dispatcher.h" + +namespace icicle { + + // Blake3 + ICICLE_DISPATCHER_INST(Blake3Dispatcher, blake3_factory, Blake3FactoryImpl); + + Hash create_blake3_hash(uint64_t input_chunk_size) + { + std::shared_ptr backend; + ICICLE_CHECK(Blake3Dispatcher::execute(input_chunk_size, backend)); + Hash blake3{backend}; + return blake3; + } +} // namespace icicle \ No newline at end of file diff --git a/icicle/src/hash/hash_c_api.cpp b/icicle/src/hash/hash_c_api.cpp index 460b83d17c..820af1545f 100644 --- a/icicle/src/hash/hash_c_api.cpp +++ b/icicle/src/hash/hash_c_api.cpp @@ -3,6 +3,7 @@ #include "icicle/errors.h" #include "icicle/hash/keccak.h" #include "icicle/hash/blake2s.h" +#include "icicle/hash/blake3.h" extern "C" { // Define a type for the HasherHandle (which is a pointer to Hash) @@ -123,4 +124,17 @@ HasherHandle icicle_create_blake2s(uint64_t input_chunk_size) { return new icicle::Hash(icicle::create_blake2s_hash(input_chunk_size)); } + +/** + * @brief Creates a Blake3 hash object. + * + * This function constructs a Hash object configured for Blake3. + * + * @param input_chunk_size Size of the input in bytes for the Blake3 hash. + * @return HasherHandle A handle to the created Blake3 Hash object. 
+ */ +HasherHandle icicle_create_blake3(uint64_t input_chunk_size) +{ + return new icicle::Hash(icicle::create_blake3_hash(input_chunk_size)); +} } \ No newline at end of file diff --git a/icicle/tests/test_hash_api.cpp b/icicle/tests/test_hash_api.cpp index c04aaecebf..860e42e358 100644 --- a/icicle/tests/test_hash_api.cpp +++ b/icicle/tests/test_hash_api.cpp @@ -7,6 +7,7 @@ #include "icicle/hash/hash.h" #include "icicle/hash/keccak.h" #include "icicle/hash/blake2s.h" +#include "icicle/hash/blake3.h" #include "icicle/merkle/merkle_tree.h" #include "icicle/fields/field.h" @@ -93,6 +94,30 @@ TEST_F(HashApiTest, Blake2s) } } +TEST_F(HashApiTest, Blake3) +{ + // TODO: Add CUDA test, same as blake2s + auto config = default_hash_config(); + + const std::string input = + "Hello world I am blake3. This is a semi-long C++ test with a lot of characters. " + "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"; + const std::string expected_output = "4b71f2c5cb7c26da2ba67cc742228e55b66c8b64b2b250e7ccce6f7f6d17c9ae"; + + const uint64_t output_size = 32; + auto output = std::make_unique(output_size); + for (const auto& device : s_registered_devices) { + ICICLE_LOG_DEBUG << "Blake2s test on device=" << device; + ICICLE_CHECK(icicle_set_device("CPU")); + + auto blake3 = Blake3::create(); + ICICLE_CHECK(blake3.hash(input.data(), input.size() / config.batch, config, output.get())); + // Convert the output do a hex string and compare to expected output string + std::string output_as_str = voidPtrToHexString(output.get(), output_size); + ASSERT_EQ(output_as_str, expected_output); + } +} + TEST_F(HashApiTest, Keccak256Batch) { auto config = default_hash_config(); @@ -214,6 +239,54 @@ TEST_F(HashApiTest, Blake2sLarge) ICICLE_CHECK(icicle_free(d_output)); } +TEST_F(HashApiTest, Blake3Large) +{ + auto config = default_hash_config(); + config.batch = 1 << 8; + const unsigned chunk_size = 1 << 11; // 2KB chunks + const unsigned total_size = 
chunk_size * config.batch; + auto input = std::make_unique(total_size); + randomize((uint64_t*)input.get(), total_size / sizeof(uint64_t)); + + const uint64_t output_size = 32; + auto output_main = std::make_unique(output_size * config.batch); + auto output_main_case_2 = std::make_unique(output_size * config.batch); + auto output_ref = std::make_unique(output_size * config.batch); + + ICICLE_CHECK(icicle_set_device(IcicleTestBase::reference_device())); + auto blake3CPU = Blake3::create(); + START_TIMER(cpu_timer); + ICICLE_CHECK(blake3CPU.hash(input.get(), chunk_size, config, output_ref.get())); + END_TIMER(cpu_timer, "CPU blake3 large time", true); + + ICICLE_CHECK(icicle_set_device(IcicleTestBase::main_device())); + auto blake3MainDev = Blake3::create(); + + // test with host memory + START_TIMER(mainDev_timer); + config.are_inputs_on_device = false; + config.are_outputs_on_device = false; + ICICLE_CHECK(blake3MainDev.hash(input.get(), chunk_size, config, output_main.get())); + END_TIMER(mainDev_timer, "MainDev blake3 large time (on host memory)", true); + ASSERT_EQ(0, memcmp(output_main.get(), output_ref.get(), output_size * config.batch)); + + // test with device memory + std::byte *d_input = nullptr, *d_output = nullptr; + ICICLE_CHECK(icicle_malloc((void**)&d_input, total_size)); + ICICLE_CHECK(icicle_malloc((void**)&d_output, output_size * config.batch)); + ICICLE_CHECK(icicle_copy(d_input, input.get(), total_size)); + config.are_inputs_on_device = true; + config.are_outputs_on_device = true; + START_TIMER(mainDev_timer_device_mem); + ICICLE_CHECK(blake3MainDev.hash(d_input, chunk_size, config, d_output)); + END_TIMER(mainDev_timer_device_mem, "MainDev blake3 large time (on device memory)", true); + ICICLE_CHECK(icicle_copy(output_main_case_2.get(), d_output, output_size * config.batch)); + ASSERT_EQ(0, memcmp(output_main_case_2.get(), output_ref.get(), output_size * config.batch)); + + ICICLE_CHECK(icicle_free(d_input)); + 
ICICLE_CHECK(icicle_free(d_output)); +} + TEST_F(HashApiTest, sha3) { auto config = default_hash_config(); diff --git a/wrappers/golang/hash/blake3.go b/wrappers/golang/hash/blake3.go new file mode 100644 index 0000000000..bd985c2f3b --- /dev/null +++ b/wrappers/golang/hash/blake3.go @@ -0,0 +1,19 @@ +package hash + +// #cgo CFLAGS: -I./include/ +// #include "blake3.h" +import "C" +import ( + "github.com/ingonyama-zk/icicle/v3/wrappers/golang/runtime" +) + +func NewBlake3Hasher(inputChunkSize uint64) (Hasher, runtime.EIcicleError) { + h := C.icicle_create_blake3((C.ulong)(inputChunkSize)) + if h == nil { + return Hasher{handle: nil}, runtime.UnknownError + } + + return Hasher{ + handle: h, + }, runtime.Success +} diff --git a/wrappers/golang/hash/include/blake3.h b/wrappers/golang/hash/include/blake3.h new file mode 100644 index 0000000000..f855f15b3c --- /dev/null +++ b/wrappers/golang/hash/include/blake3.h @@ -0,0 +1,17 @@ +#include +#include "hash.h" + +#ifndef _BLAKE3_HASH + #define _BLAKE3_HASH + + #ifdef __cplusplus +extern "C" { + #endif + +Hash* icicle_create_blake3(uint64_t default_input_chunk_size); + + #ifdef __cplusplus +} + #endif + +#endif diff --git a/wrappers/golang/hash/tests/hash_test.go b/wrappers/golang/hash/tests/hash_test.go index dbd32786b7..f424d7394d 100644 --- a/wrappers/golang/hash/tests/hash_test.go +++ b/wrappers/golang/hash/tests/hash_test.go @@ -99,6 +99,77 @@ func testBlake2s(s *suite.Suite) { s.NotEqual(outputEmpty, outputMain) } +func testBlake3_cpu_gpu(s *suite.Suite) { + singleHashInputSize := 567 + batch := 11 + const outputBytes = 32 // 32 bytes is output size of Blake3 + + input := make([]byte, singleHashInputSize*batch) + _, err := rand.Read(input) + if err != nil { + fmt.Println("error:", err) + return + } + + Blake3Hasher, error := hash.NewBlake3Hasher(0 /*default chunk size*/) + if error != runtime.Success { + fmt.Println("error:", error) + return + } + + outputRef := make([]byte, outputBytes*batch) + Blake3Hasher.Hash( + 
core.HostSliceFromElements(input), + core.HostSliceFromElements(outputRef), + core.GetDefaultHashConfig(), + ) + + runtime.SetDevice(&devices[1]) + Blake3Hasher, error = hash.NewBlake3Hasher(0 /*default chunk size*/) + if error != runtime.Success { + fmt.Println("error:", error) + return + } + + outputMain := make([]byte, outputBytes*batch) + Blake3Hasher.Hash( + core.HostSliceFromElements(input), + core.HostSliceFromElements(outputMain), + core.GetDefaultHashConfig(), + ) + + outputEmpty := make([]byte, outputBytes*batch) + s.Equal(outputRef, outputMain) + s.NotEqual(outputEmpty, outputMain) +} + +func testBlake3(s *suite.Suite) { + const outputBytes = 32 // 32 bytes is output size of Blake3 + + // Known input string and expected hash + inputString := "Hello world I am blake3. This is a semi-long Go test with a lot of characters. 0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + expectedHash := "a2b794acb5a604bbd2c4c0380e935697e0b934ea6f194b9f5246fbb212ebe549" + + input := []byte(inputString) + + Blake3Hasher, error := hash.NewBlake3Hasher(0 /*default chunk size*/) + if error != runtime.Success { + fmt.Println("error:", error) + return + } + + outputRef := make([]byte, outputBytes) + Blake3Hasher.Hash( + core.HostSliceFromElements(input), + core.HostSliceFromElements(outputRef), + core.GetDefaultHashConfig(), + ) + + outputRefHex := fmt.Sprintf("%x", outputRef) + + s.Equal(expectedHash, outputRefHex, "Hash mismatch: got %s, expected %s", outputRefHex, expectedHash) +} + func testSha3(s *suite.Suite) { singleHashInputSize := 1153 batch := 1 @@ -150,6 +221,8 @@ type HashTestSuite struct { func (s *HashTestSuite) TestHash() { s.Run("TestKeccakBatch", testWrapper(&s.Suite, testKeccakBatch)) s.Run("TestBlake2s", testWrapper(&s.Suite, testBlake2s)) + s.Run("TestBlake3_CPU_GPU", testWrapper(&s.Suite, testBlake3_cpu_gpu)) + s.Run("TestBlake3", testWrapper(&s.Suite, testBlake3)) s.Run("TestSha3", testWrapper(&s.Suite, 
testSha3)) } diff --git a/wrappers/rust/icicle-hash/src/blake3.rs b/wrappers/rust/icicle-hash/src/blake3.rs new file mode 100644 index 0000000000..923be6de22 --- /dev/null +++ b/wrappers/rust/icicle-hash/src/blake3.rs @@ -0,0 +1,18 @@ +use icicle_core::hash::{Hasher, HasherHandle}; +use icicle_runtime::errors::eIcicleError; + +extern "C" { + fn icicle_create_blake3(default_input_chunk_size: u64) -> HasherHandle; +} + +pub struct Blake3; + +impl Blake3 { + pub fn new(default_input_chunk_size: u64) -> Result<Hasher, eIcicleError> { + let handle: HasherHandle = unsafe { icicle_create_blake3(default_input_chunk_size) }; + if handle.is_null() { + return Err(eIcicleError::UnknownError); + } + Ok(Hasher::from_handle(handle)) + } +} diff --git a/wrappers/rust/icicle-hash/src/lib.rs b/wrappers/rust/icicle-hash/src/lib.rs index 67bf4dd183..df84faa31c 100644 --- a/wrappers/rust/icicle-hash/src/lib.rs +++ b/wrappers/rust/icicle-hash/src/lib.rs @@ -1,4 +1,5 @@ pub mod blake2s; +pub mod blake3; pub mod keccak; pub mod sha3; diff --git a/wrappers/rust/icicle-hash/src/tests.rs b/wrappers/rust/icicle-hash/src/tests.rs index 3618185b08..78f223d81b 100644 --- a/wrappers/rust/icicle-hash/src/tests.rs +++ b/wrappers/rust/icicle-hash/src/tests.rs @@ -3,6 +3,7 @@ mod tests { use crate::{ blake2s::Blake2s, + blake3::Blake3, keccak::{Keccak256, Keccak512}, sha3::Sha3_256, }; @@ -88,6 +89,72 @@ mod tests { assert_eq!(output_ref, output_main); } + #[test] + fn blake3_hashing_cpu_gpu() { + initialize(); + let single_hash_input_size = 567; + let batch = 11; + + let mut input = vec![0 as u8; single_hash_input_size * batch]; + rand::thread_rng().fill(&mut input[..]); + let mut output_ref = vec![0 as u8; 32 * batch]; // 32B (=256b) is the output size of blake3 + let mut output_main = vec![0 as u8; 32 * batch]; + + test_utilities::test_set_ref_device(); + let blake3_hasher = Blake3::new(0 /*default chunk size */).unwrap(); + blake3_hasher + .hash( + HostSlice::from_slice(&input), + &HashConfig::default(), + 
HostSlice::from_mut_slice(&mut output_ref), + ) + .unwrap(); + + test_utilities::test_set_main_device(); + let blake3_hasher = Blake3::new(0 /*default chunk size */).unwrap(); + blake3_hasher + .hash( + HostSlice::from_slice(&input), + &HashConfig::default(), + HostSlice::from_mut_slice(&mut output_main), + ) + .unwrap(); + assert_eq!(output_ref, output_main); + } + + #[test] + fn blake3_hashing() { + // Known input string and expected hash + let input_string = "Hello world I am blake3. This is a semi-long Rust test with a lot of characters. 0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"; + let expected_hash = "ee4941ff90437a4fd7489ffa6d559e644a68b2547e95a690949b902da128b273"; + + let input = input_string.as_bytes(); + let mut output_ref = vec![0u8; 32]; // 32B (=256b) is the output size of blake3 + + test_utilities::test_set_ref_device(); + let blake3_hasher = Blake3::new(0 /*default chunk size */).unwrap(); + blake3_hasher + .hash( + HostSlice::from_slice(&input), + &HashConfig::default(), + HostSlice::from_mut_slice(&mut output_ref), + ) + .unwrap(); + + // Convert output_ref to hex for comparison + let output_ref_hex: String = output_ref + .iter() + .map(|b| format!("{:02x}", b)) + .collect(); + assert_eq!( + output_ref_hex, expected_hash, + "Hash mismatch: got {}, expected {}", + output_ref_hex, expected_hash + ); + + println!("Test passed: Computed hash matches expected hash."); + } + #[test] fn sha3_hashing() { initialize(); From cb1cdcb37c7a10f1b19e29d28be4f6d39ac68f06 Mon Sep 17 00:00:00 2001 From: release-bot Date: Tue, 14 Jan 2025 15:00:58 +0000 Subject: [PATCH 12/12] Bump rust crates' version icicle-babybear@3.4.0 icicle-bls12-377@3.4.0 icicle-bls12-381@3.4.0 icicle-bn254@3.4.0 icicle-bw6-761@3.4.0 icicle-core@3.4.0 icicle-grumpkin@3.4.0 icicle-hash@3.4.0 icicle-koalabear@3.4.0 icicle-m31@3.4.0 icicle-runtime@3.4.0 icicle-stark252@3.4.0 Generated by cargo-workspaces --- wrappers/rust/Cargo.toml | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/wrappers/rust/Cargo.toml b/wrappers/rust/Cargo.toml index a1c705e56a..9495b8ca73 100644 --- a/wrappers/rust/Cargo.toml +++ b/wrappers/rust/Cargo.toml @@ -17,7 +17,7 @@ members = [ exclude = [] [workspace.package] -version = "3.3.0" +version = "3.4.0" edition = "2021" authors = [ "Ingonyama" ] homepage = "https://www.ingonyama.com"