From 0c33c74e5d211e92664fe41f8b270970bf89449f Mon Sep 17 00:00:00 2001 From: ShaniBabayoff Date: Thu, 2 Jan 2025 09:23:44 +0200 Subject: [PATCH 01/12] Update docusaurus.config.ts (#720) change from V3.2 TO V3.3 --- docs/docusaurus.config.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docusaurus.config.ts b/docs/docusaurus.config.ts index 8b2078716a..b4bb5ce484 100644 --- a/docs/docusaurus.config.ts +++ b/docs/docusaurus.config.ts @@ -175,7 +175,7 @@ const config: Config = { announcementBar: { id: 'announcement', // Any value that will identify this message. content: - 'โ„๏ธ๐ŸŽ‰ New Release! ICICLE v3.2! ๐ŸŽ‰โ„๏ธ', + 'โ„๏ธ๐ŸŽ‰ New Release! ICICLE v3.3! ๐ŸŽ‰โ„๏ธ', backgroundColor: '#64f5ef', // Light blue background color. textColor: '#000000', // Black text color. isCloseable: true, // Defaults to `true`. From 29fe5de5a93e009e0b61fdac0ae0721057ffa495 Mon Sep 17 00:00:00 2001 From: Jeremy Felder Date: Sun, 5 Jan 2025 09:04:12 +0200 Subject: [PATCH 02/12] Add capability to set a default device for all threads (#699) --- docs/docs/icicle/programmers_guide/cpp.md | 15 +++ docs/docs/icicle/programmers_guide/general.md | 1 + docs/docs/icicle/programmers_guide/go.md | 17 ++- docs/docs/icicle/programmers_guide/rust.md | 15 +++ icicle/include/icicle/device_api.h | 1 + icicle/include/icicle/runtime.h | 8 ++ icicle/src/device_api.cpp | 16 +++ icicle/src/runtime.cpp | 5 + icicle/tests/test_device_api.cpp | 31 +++++ wrappers/golang/runtime/device.go | 6 + wrappers/golang/runtime/include/runtime.h | 1 + wrappers/golang/runtime/tests/device_test.go | 118 +++++++++++++----- wrappers/golang/runtime/tests/main_test.go | 14 +-- wrappers/golang/runtime/tests/stream_test.go | 8 +- wrappers/rust/icicle-runtime/src/device.rs | 2 +- wrappers/rust/icicle-runtime/src/runtime.rs | 10 ++ wrappers/rust/icicle-runtime/src/tests.rs | 32 +++++ 17 files changed, 245 insertions(+), 55 deletions(-) diff --git a/docs/docs/icicle/programmers_guide/cpp.md 
b/docs/docs/icicle/programmers_guide/cpp.md index b8f1f285f7..cff576571b 100644 --- a/docs/docs/icicle/programmers_guide/cpp.md +++ b/docs/docs/icicle/programmers_guide/cpp.md @@ -32,6 +32,21 @@ eIcicleError result = icicle_set_device(device); eIcicleError result = icicle_get_active_device(device); ``` +### Setting and Getting the Default Device + +You can set the default device for all threads: + +```cpp +icicle::Device device = {"CUDA", 0}; // or other +eIcicleError result = icicle_set_default_device(device); +``` + +:::caution + +Setting a default device should be done **once** from the main thread of the application. If another device or backend is needed for a specific thread [icicle_set_device](#setting-and-getting-active-device) should be used instead. + +::: + ### Querying Device Information Retrieve the number of available devices and check if a pointer is allocated on the host or on the active device: diff --git a/docs/docs/icicle/programmers_guide/general.md b/docs/docs/icicle/programmers_guide/general.md index 0bef2b8500..0405cf35cf 100644 --- a/docs/docs/icicle/programmers_guide/general.md +++ b/docs/docs/icicle/programmers_guide/general.md @@ -85,6 +85,7 @@ ICICLE provides a device abstraction layer that allows you to interact with diff - **Loading Backends**: Backends are loaded dynamically based on the environment configuration or a specified path. - **Setting Active Device**: The active device for a thread can be set, allowing for targeted computation on a specific device. +- **Setting Default Device**: The default device for any thread without an active device can be set, removing the need to specify an alternative device on each thread. This is especially useful when running on a backend that is not the built-in CPU backend which is the default device to start. 
## Streams diff --git a/docs/docs/icicle/programmers_guide/go.md b/docs/docs/icicle/programmers_guide/go.md index 92df226f97..03f7932c1b 100644 --- a/docs/docs/icicle/programmers_guide/go.md +++ b/docs/docs/icicle/programmers_guide/go.md @@ -27,12 +27,27 @@ result := runtime.LoadBackend("/path/to/backend/installdir", true) You can set the active device for the current thread and retrieve it when needed: ```go -device = runtime.CreateDevice("CUDA", 0) // or other +device := runtime.CreateDevice("CUDA", 0) // or other result := runtime.SetDevice(device) // or query current (thread) device activeDevice := runtime.GetActiveDevice() ``` +### Setting and Getting the Default Device + +You can set the default device for all threads: + +```go +device := runtime.CreateDevice("CUDA", 0) // or other +defaultDevice := runtime.SetDefaultDevice(device); +``` + +:::caution + +Setting a default device should be done **once** from the main thread of the application. If another device or backend is needed for a specific thread [runtime.SetDevice](#setting-and-getting-active-device) should be used instead. + +::: + ### Querying Device Information Retrieve the number of available devices and check if a pointer is allocated on the host or on the active device: diff --git a/docs/docs/icicle/programmers_guide/rust.md b/docs/docs/icicle/programmers_guide/rust.md index af5caa5a82..55188cfe2b 100644 --- a/docs/docs/icicle/programmers_guide/rust.md +++ b/docs/docs/icicle/programmers_guide/rust.md @@ -55,6 +55,21 @@ icicle_runtime::set_device(&device).unwrap(); let active_device = icicle_runtime::get_active_device().unwrap(); ``` +### Setting and Getting the Default Device + +You can set the default device for all threads: + +```caution +let device = Device::new("CUDA", 0); // or other +let default_device = icicle_runtime::set_default_device(device); +``` + +:::note + +Setting a default device should be done **once** from the main thread of the application. 
If another device or backend is needed for a specific thread [icicle_runtime::set_device](#setting-and-getting-active-device) should be used instead. + +::: + ### Querying Device Information Retrieve the number of available devices and check if a pointer is allocated on the host or on the active device: diff --git a/icicle/include/icicle/device_api.h b/icicle/include/icicle/device_api.h index 483b5f96bc..0768f135c0 100644 --- a/icicle/include/icicle/device_api.h +++ b/icicle/include/icicle/device_api.h @@ -188,6 +188,7 @@ namespace icicle { public: static eIcicleError set_thread_local_device(const Device& device); + static eIcicleError set_default_device(const Device& device); static const Device& get_thread_local_device(); static const DeviceAPI* get_thread_local_deviceAPI(); static DeviceTracker& get_global_memory_tracker() { return sMemTracker; } diff --git a/icicle/include/icicle/runtime.h b/icicle/include/icicle/runtime.h index a14d80bd84..3dad72986e 100644 --- a/icicle/include/icicle/runtime.h +++ b/icicle/include/icicle/runtime.h @@ -36,6 +36,14 @@ extern "C" eIcicleError icicle_load_backend_from_env_or_default(); */ extern "C" eIcicleError icicle_set_device(const icicle::Device& device); +/** + * @brief Set default device for all threads + * + + * @return eIcicleError::SUCCESS if successful, otherwise throws INVALID_DEVICE + */ +extern "C" eIcicleError icicle_set_default_device(const icicle::Device& device); + /** * @brief Get active device for thread * diff --git a/icicle/src/device_api.cpp b/icicle/src/device_api.cpp index 179b8a3cb4..25ecc9c966 100644 --- a/icicle/src/device_api.cpp +++ b/icicle/src/device_api.cpp @@ -58,6 +58,17 @@ namespace icicle { const Device& get_default_device() { return m_default_device; } + eIcicleError set_default_device(const Device& dev) + { + if (!is_device_registered(dev.type)) { + ICICLE_LOG_ERROR << "Device type " + std::string(dev.type) + " is not valid as it has not been registered"; + return 
eIcicleError::INVALID_DEVICE; + } + + m_default_device = dev; + return eIcicleError::SUCCESS; + } + std::vector get_registered_devices_list() { std::vector registered_devices; @@ -116,6 +127,11 @@ namespace icicle { return default_deviceAPI.get(); } + eIcicleError DeviceAPI::set_default_device(const Device& dev) + { + return DeviceAPIRegistry::Global().set_default_device(dev); + } + /********************************************************************************** */ DeviceAPI* get_deviceAPI(const Device& device) { return DeviceAPIRegistry::Global().get_deviceAPI(device).get(); } diff --git a/icicle/src/runtime.cpp b/icicle/src/runtime.cpp index 8e9028cfcc..eefed8ad2b 100644 --- a/icicle/src/runtime.cpp +++ b/icicle/src/runtime.cpp @@ -14,6 +14,11 @@ using namespace icicle; extern "C" eIcicleError icicle_set_device(const Device& device) { return DeviceAPI::set_thread_local_device(device); } +extern "C" eIcicleError icicle_set_default_device(const Device& device) +{ + return DeviceAPI::set_default_device(device); +} + extern "C" eIcicleError icicle_get_active_device(icicle::Device& device) { const Device& active_device = DeviceAPI::get_thread_local_device(); diff --git a/icicle/tests/test_device_api.cpp b/icicle/tests/test_device_api.cpp index f63d3c9878..8a5ea68874 100644 --- a/icicle/tests/test_device_api.cpp +++ b/icicle/tests/test_device_api.cpp @@ -1,5 +1,6 @@ #include +#include #include #include "icicle/runtime.h" @@ -19,6 +20,36 @@ TEST_F(DeviceApiTest, UnregisteredDeviceError) EXPECT_ANY_THROW(get_deviceAPI(dev)); } +TEST_F(DeviceApiTest, SetDefaultDevice) +{ + icicle::Device active_dev = {UNKOWN_DEVICE, -1}; + + icicle::Device cpu_dev = {s_ref_device, 0}; + EXPECT_NO_THROW(icicle_set_device(cpu_dev)); + EXPECT_NO_THROW(icicle_get_active_device(active_dev)); + + ASSERT_EQ(cpu_dev, active_dev); + + active_dev = {UNKOWN_DEVICE, -1}; + + icicle::Device gpu_dev = {s_main_device, 0}; + EXPECT_NO_THROW(icicle_set_default_device(gpu_dev)); + + // setting a new 
default device doesn't override already set local thread devices + EXPECT_NO_THROW(icicle_get_active_device(active_dev)); + ASSERT_EQ(cpu_dev, active_dev); + + active_dev = {UNKOWN_DEVICE, -1}; + auto thread_func = [&active_dev, &gpu_dev]() { + EXPECT_NO_THROW(icicle_get_active_device(active_dev)); + ASSERT_EQ(gpu_dev, active_dev); + }; + + std::thread worker_thread(thread_func); + + worker_thread.join(); +} + TEST_F(DeviceApiTest, MemoryCopySync) { int input[2] = {1, 2}; diff --git a/wrappers/golang/runtime/device.go b/wrappers/golang/runtime/device.go index fac5b0f09e..7f0965bab3 100644 --- a/wrappers/golang/runtime/device.go +++ b/wrappers/golang/runtime/device.go @@ -50,6 +50,12 @@ func SetDevice(device *Device) EIcicleError { return EIcicleError(cErr) } +func SetDefaultDevice(device *Device) EIcicleError { + cDevice := (*C.Device)(unsafe.Pointer(device)) + cErr := C.icicle_set_default_device(cDevice) + return EIcicleError(cErr) +} + func GetActiveDevice() (*Device, EIcicleError) { device := CreateDevice("invalid", -1) cDevice := (*C.Device)(unsafe.Pointer(&device)) diff --git a/wrappers/golang/runtime/include/runtime.h b/wrappers/golang/runtime/include/runtime.h index 003bb08947..16d4d348ae 100644 --- a/wrappers/golang/runtime/include/runtime.h +++ b/wrappers/golang/runtime/include/runtime.h @@ -13,6 +13,7 @@ typedef struct DeviceProperties DeviceProperties; int icicle_load_backend(const char* path, bool is_recursive); int icicle_load_backend_from_env_or_default(); int icicle_set_device(const Device* device); +int icicle_set_default_device(const Device* device); int icicle_get_active_device(Device* device); int icicle_is_host_memory(const void* ptr); int icicle_is_active_device_memory(const void* ptr); diff --git a/wrappers/golang/runtime/tests/device_test.go b/wrappers/golang/runtime/tests/device_test.go index a4c114389f..3219604c29 100644 --- a/wrappers/golang/runtime/tests/device_test.go +++ b/wrappers/golang/runtime/tests/device_test.go @@ -1,70 +1,122 @@ 
package tests import ( + "fmt" "os/exec" + "runtime" + "strconv" + "strings" + "syscall" "testing" - "github.com/ingonyama-zk/icicle/v3/wrappers/golang/runtime" + icicle_runtime "github.com/ingonyama-zk/icicle/v3/wrappers/golang/runtime" "github.com/stretchr/testify/assert" ) func TestGetDeviceType(t *testing.T) { expectedDeviceName := "test" - config := runtime.CreateDevice(expectedDeviceName, 0) + config := icicle_runtime.CreateDevice(expectedDeviceName, 0) assert.Equal(t, expectedDeviceName, config.GetDeviceType()) expectedDeviceNameLong := "testtesttesttesttesttesttesttesttesttesttesttesttesttesttesttest" - configLargeName := runtime.CreateDevice(expectedDeviceNameLong, 1) + configLargeName := icicle_runtime.CreateDevice(expectedDeviceNameLong, 1) assert.NotEqual(t, expectedDeviceNameLong, configLargeName.GetDeviceType()) } func TestIsDeviceAvailable(t *testing.T) { - runtime.LoadBackendFromEnvOrDefault() - dev := runtime.CreateDevice("CUDA", 0) - _ = runtime.SetDevice(&dev) - res, err := runtime.GetDeviceCount() - - expectedNumDevices, error := exec.Command("nvidia-smi", "-L", "|", "wc", "-l").Output() - if error != nil { - t.Skip("Failed to get number of devices") + dev := icicle_runtime.CreateDevice("CUDA", 0) + _ = icicle_runtime.SetDevice(&dev) + res, err := icicle_runtime.GetDeviceCount() + + smiCommand := exec.Command("nvidia-smi", "-L") + smiCommandStdout, _ := smiCommand.StdoutPipe() + wcCommand := exec.Command("wc", "-l") + wcCommand.Stdin = smiCommandStdout + + smiCommand.Start() + + expectedNumDevicesRaw, wcErr := wcCommand.Output() + smiCommand.Wait() + + expectedNumDevicesAsString := strings.TrimRight(string(expectedNumDevicesRaw), " \n\r\t") + expectedNumDevices, _ := strconv.Atoi(expectedNumDevicesAsString) + if wcErr != nil { + t.Skip("Failed to get number of devices:", wcErr) } - assert.Equal(t, runtime.Success, err) + assert.Equal(t, icicle_runtime.Success, err) assert.Equal(t, expectedNumDevices, res) - err = 
runtime.LoadBackendFromEnvOrDefault() - assert.Equal(t, runtime.Success, err) - devCuda := runtime.CreateDevice("CUDA", 0) - assert.True(t, runtime.IsDeviceAvailable(&devCuda)) - devCpu := runtime.CreateDevice("CPU", 0) - assert.True(t, runtime.IsDeviceAvailable(&devCpu)) - devInvalid := runtime.CreateDevice("invalid", 0) - assert.False(t, runtime.IsDeviceAvailable(&devInvalid)) + assert.Equal(t, icicle_runtime.Success, err) + devCuda := icicle_runtime.CreateDevice("CUDA", 0) + assert.True(t, icicle_runtime.IsDeviceAvailable(&devCuda)) + devCpu := icicle_runtime.CreateDevice("CPU", 0) + assert.True(t, icicle_runtime.IsDeviceAvailable(&devCpu)) + devInvalid := icicle_runtime.CreateDevice("invalid", 0) + assert.False(t, icicle_runtime.IsDeviceAvailable(&devInvalid)) +} + +func TestSetDefaultDevice(t *testing.T) { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + tidOuter := syscall.Gettid() + + gpuDevice := icicle_runtime.CreateDevice("CUDA", 0) + icicle_runtime.SetDefaultDevice(&gpuDevice) + + activeDevice, err := icicle_runtime.GetActiveDevice() + assert.Equal(t, icicle_runtime.Success, err) + assert.Equal(t, gpuDevice, *activeDevice) + + done := make(chan struct{}, 1) + go func() { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + // Ensure we are operating on an OS thread other than the original one + tidInner := syscall.Gettid() + for tidInner == tidOuter { + fmt.Println("Locked thread is the same as original, getting new locked thread") + runtime.UnlockOSThread() + runtime.LockOSThread() + tidInner = syscall.Gettid() + } + + activeDevice, err := icicle_runtime.GetActiveDevice() + assert.Equal(t, icicle_runtime.Success, err) + assert.Equal(t, gpuDevice, *activeDevice) + + close(done) + }() + + <-done + + cpuDevice := icicle_runtime.CreateDevice("CPU", 0) + icicle_runtime.SetDefaultDevice(&cpuDevice) } func TestRegisteredDevices(t *testing.T) { - err := runtime.LoadBackendFromEnvOrDefault() - assert.Equal(t, runtime.Success, err) - devices, 
_ := runtime.GetRegisteredDevices() + devices, _ := icicle_runtime.GetRegisteredDevices() assert.Equal(t, []string{"CUDA", "CPU"}, devices) } func TestDeviceProperties(t *testing.T) { - _, err := runtime.GetDeviceProperties() - assert.Equal(t, runtime.Success, err) + _, err := icicle_runtime.GetDeviceProperties() + assert.Equal(t, icicle_runtime.Success, err) } func TestActiveDevice(t *testing.T) { - runtime.SetDevice(&DEVICE) - activeDevice, err := runtime.GetActiveDevice() - assert.Equal(t, runtime.Success, err) - assert.Equal(t, DEVICE, *activeDevice) - memory1, err := runtime.GetAvailableMemory() - if err == runtime.ApiNotImplemented { - t.Skipf("GetAvailableMemory() function is not implemented on %s device", DEVICE.GetDeviceType()) + devCpu := icicle_runtime.CreateDevice("CUDA", 0) + icicle_runtime.SetDevice(&devCpu) + activeDevice, err := icicle_runtime.GetActiveDevice() + assert.Equal(t, icicle_runtime.Success, err) + assert.Equal(t, devCpu, *activeDevice) + memory1, err := icicle_runtime.GetAvailableMemory() + if err == icicle_runtime.ApiNotImplemented { + t.Skipf("GetAvailableMemory() function is not implemented on %s device", devCpu.GetDeviceType()) } - assert.Equal(t, runtime.Success, err) + assert.Equal(t, icicle_runtime.Success, err) assert.Greater(t, memory1.Total, uint(0)) assert.Greater(t, memory1.Free, uint(0)) } diff --git a/wrappers/golang/runtime/tests/main_test.go b/wrappers/golang/runtime/tests/main_test.go index 226fee7836..800c34816e 100644 --- a/wrappers/golang/runtime/tests/main_test.go +++ b/wrappers/golang/runtime/tests/main_test.go @@ -6,19 +6,7 @@ import ( "github.com/ingonyama-zk/icicle/v3/wrappers/golang/runtime" ) -var DEVICE runtime.Device - func TestMain(m *testing.M) { runtime.LoadBackendFromEnvOrDefault() - devices, e := runtime.GetRegisteredDevices() - if e != runtime.Success { - panic("Failed to load registered devices") - } - for _, deviceType := range devices { - DEVICE = runtime.CreateDevice(deviceType, 0) - 
runtime.SetDevice(&DEVICE) - - // execute tests - m.Run() - } + m.Run() } diff --git a/wrappers/golang/runtime/tests/stream_test.go b/wrappers/golang/runtime/tests/stream_test.go index c32cae8960..8acd7089fa 100644 --- a/wrappers/golang/runtime/tests/stream_test.go +++ b/wrappers/golang/runtime/tests/stream_test.go @@ -8,19 +8,15 @@ import ( ) func TestCreateStream(t *testing.T) { - err := runtime.LoadBackendFromEnvOrDefault() - assert.Equal(t, runtime.Success, err) dev := runtime.CreateDevice("CUDA", 0) assert.True(t, runtime.IsDeviceAvailable(&dev)) - err = runtime.SetDevice(&dev) + err := runtime.SetDevice(&dev) assert.Equal(t, runtime.Success, err) _, err = runtime.CreateStream() assert.Equal(t, runtime.Success, err, "Unable to create stream due to %d", err) } func TestDestroyStream(t *testing.T) { - err := runtime.LoadBackendFromEnvOrDefault() - assert.Equal(t, runtime.Success, err) dev := runtime.CreateDevice("CUDA", 0) assert.True(t, runtime.IsDeviceAvailable(&dev)) stream, err := runtime.CreateStream() @@ -31,8 +27,6 @@ func TestDestroyStream(t *testing.T) { } func TestSyncStream(t *testing.T) { - err := runtime.LoadBackendFromEnvOrDefault() - assert.Equal(t, runtime.Success, err) dev := runtime.CreateDevice("CUDA", 0) assert.True(t, runtime.IsDeviceAvailable(&dev)) runtime.SetDevice(&dev) diff --git a/wrappers/rust/icicle-runtime/src/device.rs b/wrappers/rust/icicle-runtime/src/device.rs index 9248a805bd..ee8f2de677 100644 --- a/wrappers/rust/icicle-runtime/src/device.rs +++ b/wrappers/rust/icicle-runtime/src/device.rs @@ -4,7 +4,7 @@ use std::os::raw::c_char; const MAX_TYPE_SIZE: usize = 64; -#[derive(Clone)] +#[derive(Clone, PartialEq)] #[repr(C)] pub struct Device { device_type: [c_char; MAX_TYPE_SIZE], diff --git a/wrappers/rust/icicle-runtime/src/runtime.rs b/wrappers/rust/icicle-runtime/src/runtime.rs index c1c88d162c..fce1733a5d 100644 --- a/wrappers/rust/icicle-runtime/src/runtime.rs +++ b/wrappers/rust/icicle-runtime/src/runtime.rs @@ -11,6 +11,7 
@@ extern "C" { fn icicle_load_backend(path: *const c_char, is_recursive: bool) -> eIcicleError; fn icicle_load_backend_from_env_or_default() -> eIcicleError; fn icicle_set_device(device: &Device) -> eIcicleError; + fn icicle_set_default_device(device: &Device) -> eIcicleError; fn icicle_get_active_device(device: &mut Device) -> eIcicleError; fn icicle_is_host_memory(ptr: *const c_void) -> eIcicleError; fn icicle_is_active_device_memory(ptr: *const c_void) -> eIcicleError; @@ -66,6 +67,15 @@ pub fn set_device(device: &Device) -> Result<(), eIcicleError> { } } +pub fn set_default_device(device: &Device) -> Result<(), eIcicleError> { + let result = unsafe { icicle_set_default_device(device) }; + if result == eIcicleError::Success { + Ok(()) + } else { + Err(result) + } +} + pub fn get_active_device() -> Result { let mut device: Device = Device::new("invalid", -1); unsafe { icicle_get_active_device(&mut device).wrap_value::(device) } diff --git a/wrappers/rust/icicle-runtime/src/tests.rs b/wrappers/rust/icicle-runtime/src/tests.rs index e2a22b3c44..5555dd6d3f 100644 --- a/wrappers/rust/icicle-runtime/src/tests.rs +++ b/wrappers/rust/icicle-runtime/src/tests.rs @@ -6,6 +6,7 @@ mod tests { use crate::test_utilities; use crate::*; use std::sync::Once; + use std::thread; static INIT: Once = Once::new(); @@ -28,6 +29,37 @@ mod tests { test_utilities::test_set_ref_device(); } + #[test] + fn test_set_default_device() { + initialize(); + + // block scope is necessary in order to free the mutex lock + // to be used by the spawned thread + let outer_thread_id = thread::current().id(); + { + let main_device = test_utilities::TEST_MAIN_DEVICE + .lock() + .unwrap(); + set_default_device(&main_device).unwrap(); + + let active_device = get_active_device().unwrap(); + assert_eq!(*main_device, active_device); + } + + let handle = thread::spawn(move || { + let inner_thread_id = thread::current().id(); + assert_ne!(outer_thread_id, inner_thread_id); + + let active_device = 
get_active_device().unwrap(); + let main_device = test_utilities::TEST_MAIN_DEVICE + .lock() + .unwrap(); + assert_eq!(*main_device, active_device); + }); + + let _ = handle.join(); + } + #[test] fn test_sync_memory_copy() { initialize(); From 33c708fd2361404586e854eb01f861b376f9f45c Mon Sep 17 00:00:00 2001 From: HadarIngonyama <102164010+HadarIngonyama@users.noreply.github.com> Date: Mon, 6 Jan 2025 18:10:21 +0200 Subject: [PATCH 03/12] Hadar/b const mult (#723) --- .../include/icicle/curves/params/bls12_377.h | 8 + .../include/icicle/curves/params/bls12_381.h | 8 + icicle/include/icicle/curves/params/bn254.h | 8 + icicle/include/icicle/curves/params/bw6_761.h | 17 +- .../include/icicle/curves/params/grumpkin.h | 10 +- icicle/include/icicle/curves/projective.h | 196 +++++++++--------- .../include/icicle/fields/complex_extension.h | 84 ++++++++ icicle/include/icicle/fields/field.h | 36 ++++ 8 files changed, 262 insertions(+), 105 deletions(-) diff --git a/icicle/include/icicle/curves/params/bls12_377.h b/icicle/include/icicle/curves/params/bls12_377.h index da19ab2bda..6ad767e5f3 100644 --- a/icicle/include/icicle/curves/params/bls12_377.h +++ b/icicle/include/icicle/curves/params/bls12_377.h @@ -25,6 +25,9 @@ namespace bls12_377 { static constexpr point_field_t weierstrass_b = {0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + + static constexpr bool is_b_u32 = true; + static constexpr bool is_b_neg = false; }; // G1 struct G2 { @@ -47,6 +50,11 @@ namespace bls12_377 { 0x3c6bf800, 0x129207b6, 0xcd5fd889, 0xdc7b4f91, 0x7460c589, 0x43bd0373, 0xdb0fd6f3, 0x010222f6}; + static constexpr bool is_b_u32_g2_re = true; + static constexpr bool is_b_neg_g2_re = false; + static constexpr bool is_b_u32_g2_im = false; + static constexpr bool is_b_neg_g2_im = false; + static constexpr g2_point_field_t gen_x = {g2_gen_x_re, g2_gen_x_im}; static constexpr g2_point_field_t gen_y = 
{g2_gen_y_re, g2_gen_y_im}; static constexpr g2_point_field_t weierstrass_b = {weierstrass_b_g2_re, weierstrass_b_g2_im}; diff --git a/icicle/include/icicle/curves/params/bls12_381.h b/icicle/include/icicle/curves/params/bls12_381.h index 7457bd49c5..18fbf97496 100644 --- a/icicle/include/icicle/curves/params/bls12_381.h +++ b/icicle/include/icicle/curves/params/bls12_381.h @@ -25,6 +25,9 @@ namespace bls12_381 { static constexpr point_field_t weierstrass_b = {0x00000004, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + + static constexpr bool is_b_u32 = true; + static constexpr bool is_b_neg = false; }; struct G2 { @@ -48,6 +51,11 @@ namespace bls12_381 { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + static constexpr bool is_b_u32_g2_re = true; + static constexpr bool is_b_neg_g2_re = false; + static constexpr bool is_b_u32_g2_im = true; + static constexpr bool is_b_neg_g2_im = false; + static constexpr g2_point_field_t gen_x = {g2_gen_x_re, g2_gen_x_im}; static constexpr g2_point_field_t gen_y = {g2_gen_y_re, g2_gen_y_im}; static constexpr g2_point_field_t weierstrass_b = {weierstrass_b_g2_re, weierstrass_b_g2_im}; diff --git a/icicle/include/icicle/curves/params/bn254.h b/icicle/include/icicle/curves/params/bn254.h index b8095b53fe..0a628afcb9 100644 --- a/icicle/include/icicle/curves/params/bn254.h +++ b/icicle/include/icicle/curves/params/bn254.h @@ -24,6 +24,9 @@ namespace bn254 { 0x00000000, 0x00000000, 0x00000000, 0x00000000}; static constexpr point_field_t weierstrass_b = {0x00000003, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + + static constexpr bool is_b_u32 = true; + static constexpr bool is_b_neg = false; }; // G1 struct G2 { @@ -40,6 +43,11 @@ namespace bn254 { static constexpr point_field_t weierstrass_b_g2_im = {0x85c315d2, 0xe4a2bd06, 0xe52d1852, 0xa74fa084, 0xeed8fdf4, 
0xcd2cafad, 0x3af0fed4, 0x009713b0}; + static constexpr bool is_b_u32_g2_re = false; + static constexpr bool is_b_neg_g2_re = false; + static constexpr bool is_b_u32_g2_im = false; + static constexpr bool is_b_neg_g2_im = false; + static constexpr g2_point_field_t gen_x = {g2_gen_x_re, g2_gen_x_im}; static constexpr g2_point_field_t gen_y = {g2_gen_y_re, g2_gen_y_im}; static constexpr g2_point_field_t weierstrass_b = {weierstrass_b_g2_re, weierstrass_b_g2_im}; diff --git a/icicle/include/icicle/curves/params/bw6_761.h b/icicle/include/icicle/curves/params/bw6_761.h index c0a1fcf5ed..7760c2023b 100644 --- a/icicle/include/icicle/curves/params/bw6_761.h +++ b/icicle/include/icicle/curves/params/bw6_761.h @@ -25,10 +25,18 @@ namespace bw6_761 { 0xb3053253, 0x9f9df141, 0x6fc2cdd4, 0xbe3fb90b, 0x717a4c55, 0xcc685d31, 0x71b5b806, 0xc5b8fa17, 0xaf7e0dba, 0x265909f1, 0xa2e573a3, 0x1a7348d2, 0x884c9ec6, 0x0f952589, 0x45cc2a42, 0xe6fd637b, 0x0a6fc574, 0x0058b84e}; + // actual value: + // static constexpr point_field_t weierstrass_b = { + // 0x0000008a, 0xf49d0000, 0x70000082, 0xe6913e68, 0xeaf0a437, 0x160cf8ae, 0x5667a8f8, 0x98a116c2, + // 0x73ebff2e, 0x71dcd3dc, 0x12f9fd90, 0x8689c8ed, 0x25b42304, 0x03cebaff, 0xe584e919, 0x707ba638, + // 0x8087be41, 0x528275ef, 0x81d14688, 0xb926186a, 0x04faff3e, 0xd187c940, 0xfb83ce0a, 0x0122e824}; static constexpr point_field_t weierstrass_b = { - 0x0000008a, 0xf49d0000, 0x70000082, 0xe6913e68, 0xeaf0a437, 0x160cf8ae, 0x5667a8f8, 0x98a116c2, - 0x73ebff2e, 0x71dcd3dc, 0x12f9fd90, 0x8689c8ed, 0x25b42304, 0x03cebaff, 0xe584e919, 0x707ba638, - 0x8087be41, 0x528275ef, 0x81d14688, 0xb926186a, 0x04faff3e, 0xd187c940, 0xfb83ce0a, 0x0122e824}; + 0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + + static 
constexpr bool is_b_u32 = true; + static constexpr bool is_b_neg = true; }; struct G2 { @@ -44,5 +52,8 @@ namespace bw6_761 { 0x00000004, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + + static constexpr bool is_b_u32 = true; + static constexpr bool is_b_neg = false; }; } // namespace bw6_761 diff --git a/icicle/include/icicle/curves/params/grumpkin.h b/icicle/include/icicle/curves/params/grumpkin.h index a468fd911d..d09390bccc 100644 --- a/icicle/include/icicle/curves/params/grumpkin.h +++ b/icicle/include/icicle/curves/params/grumpkin.h @@ -16,7 +16,13 @@ namespace grumpkin { static constexpr point_field_t gen_y = {0x823f272c, 0x833fc48d, 0xf1181294, 0x2d270d45, 0x6a45d63, 0xcf135e75, 0x00000002, 0x00000000}; - static constexpr point_field_t weierstrass_b = {0xeffffff0, 0x43e1f593, 0x79b97091, 0x2833e848, - 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72}; + // actual value: + // static constexpr point_field_t weierstrass_b = {0xeffffff0, 0x43e1f593, 0x79b97091, 0x2833e848, + // 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72}; + static constexpr point_field_t weierstrass_b = {0x00000011, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + + static constexpr bool is_b_u32 = true; + static constexpr bool is_b_neg = true; }; // G1 } // namespace grumpkin diff --git a/icicle/include/icicle/curves/projective.h b/icicle/include/icicle/curves/projective.h index 87fe0ab238..17cf9329c8 100644 --- a/icicle/include/icicle/curves/projective.h +++ b/icicle/include/icicle/curves/projective.h @@ -58,70 +58,68 @@ class Projective const FF Z = point.z; // TODO: Change to efficient dbl once implemented for field.cuh - FF t0 = FF::sqr(Y); // 1. t0 โ† Y ยท Y - FF Z3 = t0 + t0; // 2. Z3 โ† t0 + t0 - Z3 = Z3 + Z3; // 3. 
Z3 โ† Z3 + Z3 - Z3 = Z3 + Z3; // 4. Z3 โ† Z3 + Z3 - FF t1 = Y * Z; // 5. t1 โ† Y ยท Z - FF t2 = FF::sqr(Z); // 6. t2 โ† Z ยท Z - t2 = FF::template mul_unsigned<3>(FF::template mul_const(t2)); // 7. t2 โ† b3 ยท t2 - FF X3 = t2 * Z3; // 8. X3 โ† t2 ยท Z3 - FF Y3 = t0 + t2; // 9. Y3 โ† t0 + t2 - Z3 = t1 * Z3; // 10. Z3 โ† t1 ยท Z3 - t1 = t2 + t2; // 11. t1 โ† t2 + t2 - t2 = t1 + t2; // 12. t2 โ† t1 + t2 - t0 = t0 - t2; // 13. t0 โ† t0 โˆ’ t2 - Y3 = t0 * Y3; // 14. Y3 โ† t0 ยท Y3 - Y3 = X3 + Y3; // 15. Y3 โ† X3 + Y3 - t1 = X * Y; // 16. t1 โ† X ยท Y - X3 = t0 * t1; // 17. X3 โ† t0 ยท t1 - X3 = X3 + X3; // 18. X3 โ† X3 + X3 + FF t0 = FF::sqr(Y); // 1. t0 โ† Y ยท Y + FF Z3 = t0 + t0; // 2. Z3 โ† t0 + t0 + Z3 = Z3 + Z3; // 3. Z3 โ† Z3 + Z3 + Z3 = Z3 + Z3; // 4. Z3 โ† Z3 + Z3 + FF t1 = Y * Z; // 5. t1 โ† Y ยท Z + FF t2 = FF::sqr(Z); // 6. t2 โ† Z ยท Z + t2 = FF::template mul_weierstrass_b(t2); // 7. t2 โ† b3 ยท t2 + FF X3 = t2 * Z3; // 8. X3 โ† t2 ยท Z3 + FF Y3 = t0 + t2; // 9. Y3 โ† t0 + t2 + Z3 = t1 * Z3; // 10. Z3 โ† t1 ยท Z3 + t1 = t2 + t2; // 11. t1 โ† t2 + t2 + t2 = t1 + t2; // 12. t2 โ† t1 + t2 + t0 = t0 - t2; // 13. t0 โ† t0 โˆ’ t2 + Y3 = t0 * Y3; // 14. Y3 โ† t0 ยท Y3 + Y3 = X3 + Y3; // 15. Y3 โ† X3 + Y3 + t1 = X * Y; // 16. t1 โ† X ยท Y + X3 = t0 * t1; // 17. X3 โ† t0 ยท t1 + X3 = X3 + X3; // 18. 
X3 โ† X3 + X3 return {X3, Y3, Z3}; } friend HOST_DEVICE Projective operator+(Projective p1, const Projective& p2) { - const FF X1 = p1.x; // < 2 - const FF Y1 = p1.y; // < 2 - const FF Z1 = p1.z; // < 2 - const FF X2 = p2.x; // < 2 - const FF Y2 = p2.y; // < 2 - const FF Z2 = p2.z; // < 2 - const FF t00 = X1 * X2; // t00 โ† X1 ยท X2 < 2 - const FF t01 = Y1 * Y2; // t01 โ† Y1 ยท Y2 < 2 - const FF t02 = Z1 * Z2; // t02 โ† Z1 ยท Z2 < 2 - const FF t03 = X1 + Y1; // t03 โ† X1 + Y1 < 4 - const FF t04 = X2 + Y2; // t04 โ† X2 + Y2 < 4 - const FF t05 = t03 * t04; // t03 โ† t03 ยท t04 < 3 - const FF t06 = t00 + t01; // t06 โ† t00 + t01 < 4 - const FF t07 = t05 - t06; // t05 โ† t05 โˆ’ t06 < 2 - const FF t08 = Y1 + Z1; // t08 โ† Y1 + Z1 < 4 - const FF t09 = Y2 + Z2; // t09 โ† Y2 + Z2 < 4 - const FF t10 = t08 * t09; // t10 โ† t08 ยท t09 < 3 - const FF t11 = t01 + t02; // t11 โ† t01 + t02 < 4 - const FF t12 = t10 - t11; // t12 โ† t10 โˆ’ t11 < 2 - const FF t13 = X1 + Z1; // t13 โ† X1 + Z1 < 4 - const FF t14 = X2 + Z2; // t14 โ† X2 + Z2 < 4 - const FF t15 = t13 * t14; // t15 โ† t13 ยท t14 < 3 - const FF t16 = t00 + t02; // t16 โ† t00 + t02 < 4 - const FF t17 = t15 - t16; // t17 โ† t15 โˆ’ t16 < 2 - const FF t18 = t00 + t00; // t18 โ† t00 + t00 < 2 - const FF t19 = t18 + t00; // t19 โ† t18 + t00 < 2 - const FF t20 = - FF::template mul_unsigned<3>(FF::template mul_const(t02)); // t20 โ† b3 ยท t02 < 2 - const FF t21 = t01 + t20; // t21 โ† t01 + t20 < 2 - const FF t22 = t01 - t20; // t22 โ† t01 โˆ’ t20 < 2 - const FF t23 = - FF::template mul_unsigned<3>(FF::template mul_const(t17)); // t23 โ† b3 ยท t17 < 2 - const auto t24 = FF::mul_wide(t12, t23); // t24 โ† t12 ยท t23 < 2 - const auto t25 = FF::mul_wide(t07, t22); // t25 โ† t07 ยท t22 < 2 - const FF X3 = FF::reduce(t25 - t24); // X3 โ† t25 โˆ’ t24 < 2 - const auto t27 = FF::mul_wide(t23, t19); // t27 โ† t23 ยท t19 < 2 - const auto t28 = FF::mul_wide(t22, t21); // t28 โ† t22 ยท t21 < 2 - const FF Y3 = 
FF::reduce(t28 + t27); // Y3 โ† t28 + t27 < 2 - const auto t30 = FF::mul_wide(t19, t07); // t30 โ† t19 ยท t07 < 2 - const auto t31 = FF::mul_wide(t21, t12); // t31 โ† t21 ยท t12 < 2 - const FF Z3 = FF::reduce(t31 + t30); // Z3 โ† t31 + t30 < 2 + const FF X1 = p1.x; // < 2 + const FF Y1 = p1.y; // < 2 + const FF Z1 = p1.z; // < 2 + const FF X2 = p2.x; // < 2 + const FF Y2 = p2.y; // < 2 + const FF Z2 = p2.z; // < 2 + const FF t00 = X1 * X2; // t00 โ† X1 ยท X2 < 2 + const FF t01 = Y1 * Y2; // t01 โ† Y1 ยท Y2 < 2 + const FF t02 = Z1 * Z2; // t02 โ† Z1 ยท Z2 < 2 + const FF t03 = X1 + Y1; // t03 โ† X1 + Y1 < 4 + const FF t04 = X2 + Y2; // t04 โ† X2 + Y2 < 4 + const FF t05 = t03 * t04; // t03 โ† t03 ยท t04 < 3 + const FF t06 = t00 + t01; // t06 โ† t00 + t01 < 4 + const FF t07 = t05 - t06; // t05 โ† t05 โˆ’ t06 < 2 + const FF t08 = Y1 + Z1; // t08 โ† Y1 + Z1 < 4 + const FF t09 = Y2 + Z2; // t09 โ† Y2 + Z2 < 4 + const FF t10 = t08 * t09; // t10 โ† t08 ยท t09 < 3 + const FF t11 = t01 + t02; // t11 โ† t01 + t02 < 4 + const FF t12 = t10 - t11; // t12 โ† t10 โˆ’ t11 < 2 + const FF t13 = X1 + Z1; // t13 โ† X1 + Z1 < 4 + const FF t14 = X2 + Z2; // t14 โ† X2 + Z2 < 4 + const FF t15 = t13 * t14; // t15 โ† t13 ยท t14 < 3 + const FF t16 = t00 + t02; // t16 โ† t00 + t02 < 4 + const FF t17 = t15 - t16; // t17 โ† t15 โˆ’ t16 < 2 + const FF t18 = t00 + t00; // t18 โ† t00 + t00 < 2 + const FF t19 = t18 + t00; // t19 โ† t18 + t00 < 2 + const FF t20 = FF::template mul_weierstrass_b(t02); // t20 โ† b3 ยท t02 < 2 + const FF t21 = t01 + t20; // t21 โ† t01 + t20 < 2 + const FF t22 = t01 - t20; // t22 โ† t01 โˆ’ t20 < 2 + const FF t23 = FF::template mul_weierstrass_b(t17); // t23 โ† b3 ยท t17 < 2 + const auto t24 = FF::mul_wide(t12, t23); // t24 โ† t12 ยท t23 < 2 + const auto t25 = FF::mul_wide(t07, t22); // t25 โ† t07 ยท t22 < 2 + const FF X3 = FF::reduce(t25 - t24); // X3 โ† t25 โˆ’ t24 < 2 + const auto t27 = FF::mul_wide(t23, t19); // t27 โ† t23 ยท t19 < 2 + 
const auto t28 = FF::mul_wide(t22, t21); // t28 โ† t22 ยท t21 < 2 + const FF Y3 = FF::reduce(t28 + t27); // Y3 โ† t28 + t27 < 2 + const auto t30 = FF::mul_wide(t19, t07); // t30 โ† t19 ยท t07 < 2 + const auto t31 = FF::mul_wide(t21, t12); // t31 โ† t21 ยท t12 < 2 + const FF Z3 = FF::reduce(t31 + t30); // Z3 โ† t31 + t30 < 2 return {X3, Y3, Z3}; } @@ -129,46 +127,44 @@ class Projective friend HOST_DEVICE Projective operator+(Projective p1, const Affine& p2) { - const FF X1 = p1.x; // < 2 - const FF Y1 = p1.y; // < 2 - const FF Z1 = p1.z; // < 2 - const FF X2 = p2.x; // < 2 - const FF Y2 = p2.y; // < 2 - const FF t00 = X1 * X2; // t00 โ† X1 ยท X2 < 2 - const FF t01 = Y1 * Y2; // t01 โ† Y1 ยท Y2 < 2 - const FF t02 = Z1; // t02 โ† Z1 < 2 - const FF t03 = X1 + Y1; // t03 โ† X1 + Y1 < 4 - const FF t04 = X2 + Y2; // t04 โ† X2 + Y2 < 4 - const FF t05 = t03 * t04; // t03 โ† t03 ยท t04 < 3 - const FF t06 = t00 + t01; // t06 โ† t00 + t01 < 4 - const FF t07 = t05 - t06; // t05 โ† t05 โˆ’ t06 < 2 - const FF t08 = Y1 + Z1; // t08 โ† Y1 + Z1 < 4 - const FF t09 = Y2 + FF::one(); // t09 โ† Y2 + 1 < 4 - const FF t10 = t08 * t09; // t10 โ† t08 ยท t09 < 3 - const FF t11 = t01 + t02; // t11 โ† t01 + t02 < 4 - const FF t12 = t10 - t11; // t12 โ† t10 โˆ’ t11 < 2 - const FF t13 = X1 + Z1; // t13 โ† X1 + Z1 < 4 - const FF t14 = X2 + FF::one(); // t14 โ† X2 + 1 < 4 - const FF t15 = t13 * t14; // t15 โ† t13 ยท t14 < 3 - const FF t16 = t00 + t02; // t16 โ† t00 + t02 < 4 - const FF t17 = t15 - t16; // t17 โ† t15 โˆ’ t16 < 2 - const FF t18 = t00 + t00; // t18 โ† t00 + t00 < 2 - const FF t19 = t18 + t00; // t19 โ† t18 + t00 < 2 - const FF t20 = - FF::template mul_unsigned<3>(FF::template mul_const(t02)); // t20 โ† b3 ยท t02 < 2 - const FF t21 = t01 + t20; // t21 โ† t01 + t20 < 2 - const FF t22 = t01 - t20; // t22 โ† t01 โˆ’ t20 < 2 - const FF t23 = - FF::template mul_unsigned<3>(FF::template mul_const(t17)); // t23 โ† b3 ยท t17 < 2 - const auto t24 = 
FF::mul_wide(t12, t23); // t24 โ† t12 ยท t23 < 2 - const auto t25 = FF::mul_wide(t07, t22); // t25 โ† t07 ยท t22 < 2 - const FF X3 = FF::reduce(t25 - t24); // X3 โ† t25 โˆ’ t24 < 2 - const auto t27 = FF::mul_wide(t23, t19); // t27 โ† t23 ยท t19 < 2 - const auto t28 = FF::mul_wide(t22, t21); // t28 โ† t22 ยท t21 < 2 - const FF Y3 = FF::reduce(t28 + t27); // Y3 โ† t28 + t27 < 2 - const auto t30 = FF::mul_wide(t19, t07); // t30 โ† t19 ยท t07 < 2 - const auto t31 = FF::mul_wide(t21, t12); // t31 โ† t21 ยท t12 < 2 - const FF Z3 = FF::reduce(t31 + t30); // Z3 โ† t31 + t30 < 2 + const FF X1 = p1.x; // < 2 + const FF Y1 = p1.y; // < 2 + const FF Z1 = p1.z; // < 2 + const FF X2 = p2.x; // < 2 + const FF Y2 = p2.y; // < 2 + const FF t00 = X1 * X2; // t00 โ† X1 ยท X2 < 2 + const FF t01 = Y1 * Y2; // t01 โ† Y1 ยท Y2 < 2 + const FF t02 = Z1; // t02 โ† Z1 < 2 + const FF t03 = X1 + Y1; // t03 โ† X1 + Y1 < 4 + const FF t04 = X2 + Y2; // t04 โ† X2 + Y2 < 4 + const FF t05 = t03 * t04; // t03 โ† t03 ยท t04 < 3 + const FF t06 = t00 + t01; // t06 โ† t00 + t01 < 4 + const FF t07 = t05 - t06; // t05 โ† t05 โˆ’ t06 < 2 + const FF t08 = Y1 + Z1; // t08 โ† Y1 + Z1 < 4 + const FF t09 = Y2 + FF::one(); // t09 โ† Y2 + 1 < 4 + const FF t10 = t08 * t09; // t10 โ† t08 ยท t09 < 3 + const FF t11 = t01 + t02; // t11 โ† t01 + t02 < 4 + const FF t12 = t10 - t11; // t12 โ† t10 โˆ’ t11 < 2 + const FF t13 = X1 + Z1; // t13 โ† X1 + Z1 < 4 + const FF t14 = X2 + FF::one(); // t14 โ† X2 + 1 < 4 + const FF t15 = t13 * t14; // t15 โ† t13 ยท t14 < 3 + const FF t16 = t00 + t02; // t16 โ† t00 + t02 < 4 + const FF t17 = t15 - t16; // t17 โ† t15 โˆ’ t16 < 2 + const FF t18 = t00 + t00; // t18 โ† t00 + t00 < 2 + const FF t19 = t18 + t00; // t19 โ† t18 + t00 < 2 + const FF t20 = FF::template mul_weierstrass_b(t02); // t20 โ† b3 ยท t02 < 2 + const FF t21 = t01 + t20; // t21 โ† t01 + t20 < 2 + const FF t22 = t01 - t20; // t22 โ† t01 โˆ’ t20 < 2 + const FF t23 = FF::template 
mul_weierstrass_b(t17); // t23 โ† b3 ยท t17 < 2 + const auto t24 = FF::mul_wide(t12, t23); // t24 โ† t12 ยท t23 < 2 + const auto t25 = FF::mul_wide(t07, t22); // t25 โ† t07 ยท t22 < 2 + const FF X3 = FF::reduce(t25 - t24); // X3 โ† t25 โˆ’ t24 < 2 + const auto t27 = FF::mul_wide(t23, t19); // t27 โ† t23 ยท t19 < 2 + const auto t28 = FF::mul_wide(t22, t21); // t28 โ† t22 ยท t21 < 2 + const FF Y3 = FF::reduce(t28 + t27); // Y3 โ† t28 + t27 < 2 + const auto t30 = FF::mul_wide(t19, t07); // t30 โ† t19 ยท t07 < 2 + const auto t31 = FF::mul_wide(t21, t12); // t31 โ† t21 ยท t12 < 2 + const FF Z3 = FF::reduce(t31 + t30); // Z3 โ† t31 + t30 < 2 return {X3, Y3, Z3}; } @@ -235,7 +231,7 @@ class Projective { if (is_zero(point)) return true; bool eq_holds = - (FF::template mul_const(FF::sqr(point.z) * point.z) + FF::sqr(point.x) * point.x == + (FF::template mul_weierstrass_b(FF::sqr(point.z) * point.z) + FF::sqr(point.x) * point.x == point.z * FF::sqr(point.y)); return point.z != FF::zero() && eq_holds; } diff --git a/icicle/include/icicle/fields/complex_extension.h b/icicle/include/icicle/fields/complex_extension.h index 42740859fb..c65ad37ec4 100644 --- a/icicle/include/icicle/fields/complex_extension.h +++ b/icicle/include/icicle/fields/complex_extension.h @@ -158,6 +158,90 @@ class ComplexExtensionField return !(xs == ys); } + template + static HOST_DEVICE_INLINE FF mul_weierstrass_b_real(const FF& xs) + { + FF r = {}; + constexpr FF b_mult = []() { + FF b_mult = FF{Gen::weierstrass_b_g2_re}; + if constexpr (!IS_3B) return b_mult; + typename FF::ff_storage temp = {}; + typename FF::ff_storage modulus = FF::get_modulus(); + host_math::template add_sub_limbs( + b_mult.limbs_storage, b_mult.limbs_storage, b_mult.limbs_storage); + b_mult.limbs_storage = + host_math::template add_sub_limbs(b_mult.limbs_storage, modulus, temp) + ? 
b_mult.limbs_storage + : temp; + host_math::template add_sub_limbs( + b_mult.limbs_storage, FF{Gen::weierstrass_b_g2_re}.limbs_storage, b_mult.limbs_storage); + b_mult.limbs_storage = + host_math::template add_sub_limbs(b_mult.limbs_storage, modulus, temp) + ? b_mult.limbs_storage + : temp; + return b_mult; + }(); + if constexpr (Gen::is_b_u32_g2_re) { + r = FF::template mul_unsigned(xs); + if constexpr (Gen::is_b_neg_g2_re) + return FF::neg(r); + else { + return r; + } + } else { + return b_mult * xs; + } + } + + template + static HOST_DEVICE_INLINE FF mul_weierstrass_b_imag(const FF& xs) + { + FF r = {}; + constexpr FF b_mult = []() { + FF b_mult = FF{Gen::weierstrass_b_g2_im}; + if constexpr (!IS_3B) return b_mult; + typename FF::ff_storage temp = {}; + typename FF::ff_storage modulus = FF::get_modulus(); + host_math::template add_sub_limbs( + b_mult.limbs_storage, b_mult.limbs_storage, b_mult.limbs_storage); + b_mult.limbs_storage = + host_math::template add_sub_limbs(b_mult.limbs_storage, modulus, temp) + ? b_mult.limbs_storage + : temp; + host_math::template add_sub_limbs( + b_mult.limbs_storage, FF{Gen::weierstrass_b_g2_im}.limbs_storage, b_mult.limbs_storage); + b_mult.limbs_storage = + host_math::template add_sub_limbs(b_mult.limbs_storage, modulus, temp) + ? 
b_mult.limbs_storage + : temp; + return b_mult; + }(); + if constexpr (Gen::is_b_u32_g2_im) { + r = FF::template mul_unsigned(xs); + if constexpr (Gen::is_b_neg_g2_im) + return FF::neg(r); + else { + return r; + } + } else { + return b_mult * xs; + } + } + + template + static HOST_DEVICE_INLINE ComplexExtensionField mul_weierstrass_b(const ComplexExtensionField& xs) + { + const FF xs_real = xs.real; + const FF xs_imaginary = xs.imaginary; + FF real_prod = mul_weierstrass_b_real(xs_real); + FF imaginary_prod = mul_weierstrass_b_imag(xs_imaginary); + FF re_im = mul_weierstrass_b_real(xs_imaginary); + FF im_re = mul_weierstrass_b_imag(xs_real); + FF nonresidue_times_im = FF::template mul_unsigned(imaginary_prod); + nonresidue_times_im = CONFIG::nonresidue_is_negative ? FF::neg(nonresidue_times_im) : nonresidue_times_im; + return ComplexExtensionField{real_prod + nonresidue_times_im, re_im + im_re}; + } + template static HOST_DEVICE_INLINE ComplexExtensionField mul_const(const ComplexExtensionField& xs) { diff --git a/icicle/include/icicle/fields/field.h b/icicle/include/icicle/fields/field.h index 80c4b4178c..3563424c00 100644 --- a/icicle/include/icicle/fields/field.h +++ b/icicle/include/icicle/fields/field.h @@ -875,6 +875,42 @@ class Field friend HOST_DEVICE bool operator!=(const Field& xs, const Field& ys) { return !(xs == ys); } + template + static HOST_DEVICE_INLINE Field mul_weierstrass_b(const Field& xs) + { + Field r = {}; + constexpr Field b_mult = []() { + Field b_mult = Field{Gen::weierstrass_b}; + if constexpr (!IS_3B) return b_mult; + ff_storage temp = {}; + ff_storage modulus = get_modulus<>(); + host_math::template add_sub_limbs( + b_mult.limbs_storage, b_mult.limbs_storage, b_mult.limbs_storage); + b_mult.limbs_storage = + host_math::template add_sub_limbs(b_mult.limbs_storage, modulus, temp) + ? 
b_mult.limbs_storage + : temp; + host_math::template add_sub_limbs( + b_mult.limbs_storage, Field{Gen::weierstrass_b}.limbs_storage, b_mult.limbs_storage); + b_mult.limbs_storage = + host_math::template add_sub_limbs(b_mult.limbs_storage, modulus, temp) + ? b_mult.limbs_storage + : temp; + return b_mult; + }(); + + if constexpr (Gen::is_b_u32) { // assumes that 3b is also u32 + r = mul_unsigned(xs); + if constexpr (Gen::is_b_neg) + return neg(r); + else { + return r; + } + } else { + return b_mult * xs; + } + } + template static HOST_DEVICE_INLINE Field mul_const(const Field& xs) { From 56f406e4d26870999e2317b282086a1a87939307 Mon Sep 17 00:00:00 2001 From: yshekel Date: Tue, 7 Jan 2025 11:23:27 +0200 Subject: [PATCH 04/12] Support vulkan backend in build system (#703) - Added a script to pull Vulkan private backend sources. - Updated CMake configuration to integrate Vulkan support in the build process. - Introduced a Rust feature flag for building the Vulkan backend (requires access to private sources). - Refined the test device API for improved compatibility and functionality. 
--- examples/c++/examples_utils.h | 2 +- .../arkworks-icicle-conversions/Cargo.toml | 23 ++++-- examples/rust/hash-and-merkle/Cargo.toml | 28 ++++--- examples/rust/msm/Cargo.toml | 18 ++++- examples/rust/ntt/Cargo.toml | 16 +++- examples/rust/polynomials/Cargo.toml | 22 ++++-- examples/rust/poseidon2/Cargo.toml | 26 +++++-- icicle/CMakeLists.txt | 57 ++------------ icicle/cmake/backend_include.cmake | 74 +++++++++++++++++++ icicle/tests/test_device_api.cpp | 73 +++++++++++------- scripts/pull_metal_backend.sh | 2 - scripts/pull_vulkan_backend.sh | 42 +++++++++++ .../icicle-curves/icicle-bls12-377/Cargo.toml | 3 +- .../icicle-curves/icicle-bls12-377/build.rs | 5 ++ .../icicle-curves/icicle-bls12-381/Cargo.toml | 2 + .../icicle-curves/icicle-bls12-381/build.rs | 5 ++ .../icicle-curves/icicle-bn254/Cargo.toml | 2 + .../rust/icicle-curves/icicle-bn254/build.rs | 5 ++ .../icicle-curves/icicle-bw6-761/build.rs | 5 ++ .../icicle-curves/icicle-grumpkin/Cargo.toml | 2 + .../icicle-curves/icicle-grumpkin/build.rs | 5 ++ .../icicle-fields/icicle-babybear/Cargo.toml | 4 +- .../icicle-fields/icicle-babybear/build.rs | 5 ++ .../icicle-fields/icicle-koalabear/Cargo.toml | 4 +- .../icicle-fields/icicle-koalabear/build.rs | 5 ++ .../rust/icicle-fields/icicle-m31/Cargo.toml | 2 + .../rust/icicle-fields/icicle-m31/build.rs | 5 ++ .../icicle-fields/icicle-stark252/Cargo.toml | 4 +- wrappers/rust/icicle-hash/Cargo.toml | 4 +- wrappers/rust/icicle-hash/build.rs | 5 ++ wrappers/rust/icicle-runtime/Cargo.toml | 2 + wrappers/rust/icicle-runtime/build.rs | 5 ++ 32 files changed, 341 insertions(+), 121 deletions(-) create mode 100644 icicle/cmake/backend_include.cmake create mode 100755 scripts/pull_vulkan_backend.sh diff --git a/examples/c++/examples_utils.h b/examples/c++/examples_utils.h index 65abf3bebb..59142ddf17 100644 --- a/examples/c++/examples_utils.h +++ b/examples/c++/examples_utils.h @@ -29,5 +29,5 @@ void try_load_and_set_backend_device(int argc = 0, char** argv = nullptr) 
return; } - ICICLE_LOG_INFO << "Device " << selected_device << " is not available, falling back to CPU"; + ICICLE_LOG_ERROR << "Device " << selected_device << " is not available, falling back to CPU"; } \ No newline at end of file diff --git a/examples/rust/arkworks-icicle-conversions/Cargo.toml b/examples/rust/arkworks-icicle-conversions/Cargo.toml index f054855c44..84fa7ffdc7 100644 --- a/examples/rust/arkworks-icicle-conversions/Cargo.toml +++ b/examples/rust/arkworks-icicle-conversions/Cargo.toml @@ -5,24 +5,33 @@ edition = "2018" [dependencies] icicle-runtime = { path = "../../../wrappers/rust/icicle-runtime" } -icicle-core = { path = "../../../wrappers/rust/icicle-core"} +icicle-core = { path = "../../../wrappers/rust/icicle-core" } icicle-bn254 = { path = "../../../wrappers/rust/icicle-curves/icicle-bn254" } -ark-bn254 = { version = "0.4.0"} -ark-ff = { version = "0.4.0" , features=["parallel"]} -ark-ec = { version = "0.4.0" , features=["parallel"]} +ark-bn254 = { version = "0.4.0" } +ark-ff = { version = "0.4.0", features = ["parallel"] } +ark-ec = { version = "0.4.0", features = ["parallel"] } rand = "0.8" rayon = "1.5" clap = { version = "<=4.4.12", features = ["derive"] } [features] -cuda = ["icicle-runtime/cuda_backend", +cuda = [ + "icicle-runtime/cuda_backend", "icicle-bn254/cuda_backend", "icicle-bn254/no_ecntt", "icicle-bn254/no_g2", ] -metal = ["icicle-runtime/metal_backend", +metal = [ + "icicle-runtime/metal_backend", "icicle-bn254/metal_backend", "icicle-bn254/no_ecntt", "icicle-bn254/no_g2", -] \ No newline at end of file +] + +vulkan = [ + "icicle-runtime/vulkan_backend", + "icicle-bn254/vulkan_backend", + "icicle-bn254/no_ecntt", + "icicle-bn254/no_g2", +] diff --git a/examples/rust/hash-and-merkle/Cargo.toml b/examples/rust/hash-and-merkle/Cargo.toml index 95045b11de..3e0e9f0dde 100644 --- a/examples/rust/hash-and-merkle/Cargo.toml +++ b/examples/rust/hash-and-merkle/Cargo.toml @@ -4,20 +4,28 @@ version = "0.1.0" edition = "2018" 
[dependencies] -hex = "0.4" +hex = "0.4" clap = { version = "<=4.4.12", features = ["derive"] } -icicle-core = {path = "../../../wrappers/rust/icicle-core" } -icicle-runtime = {path = "../../../wrappers/rust/icicle-runtime" } -icicle-hash = {path = "../../../wrappers/rust/icicle-hash" } -icicle-babybear = {path = "../../../wrappers/rust/icicle-fields/icicle-babybear" } +icicle-core = { path = "../../../wrappers/rust/icicle-core" } +icicle-runtime = { path = "../../../wrappers/rust/icicle-runtime" } +icicle-hash = { path = "../../../wrappers/rust/icicle-hash" } +icicle-babybear = { path = "../../../wrappers/rust/icicle-fields/icicle-babybear" } [features] -cuda = ["icicle-runtime/cuda_backend", +cuda = [ + "icicle-runtime/cuda_backend", "icicle-babybear/cuda_backend", - "icicle-hash/cuda_backend", + "icicle-hash/cuda_backend", ] -metal = ["icicle-runtime/metal_backend", +metal = [ + "icicle-runtime/metal_backend", "icicle-babybear/metal_backend", - "icicle-hash/metal_backend", -] \ No newline at end of file + "icicle-hash/metal_backend", +] + +vulkan = [ + "icicle-runtime/vulkan_backend", + "icicle-babybear/vulkan_backend", + "icicle-hash/vulkan_backend", +] diff --git a/examples/rust/msm/Cargo.toml b/examples/rust/msm/Cargo.toml index 29e1cf1add..22a6be4ab4 100644 --- a/examples/rust/msm/Cargo.toml +++ b/examples/rust/msm/Cargo.toml @@ -11,16 +11,26 @@ icicle-bls12-377 = { path = "../../../wrappers/rust/icicle-curves/icicle-bls12-3 clap = { version = "<=4.4.12", features = ["derive"] } [features] -cuda = ["icicle-runtime/cuda_backend", +cuda = [ + "icicle-runtime/cuda_backend", "icicle-bn254/cuda_backend", "icicle-bn254/no_ecntt", "icicle-bls12-377/cuda_backend", - "icicle-bls12-377/no_ecntt" + "icicle-bls12-377/no_ecntt", ] -metal = ["icicle-runtime/metal_backend", +metal = [ + "icicle-runtime/metal_backend", "icicle-bn254/metal_backend", "icicle-bn254/no_ecntt", "icicle-bls12-377/metal_backend", - "icicle-bls12-377/no_ecntt" + "icicle-bls12-377/no_ecntt", +] + 
+vulkan = [ + "icicle-runtime/vulkan_backend", + "icicle-bn254/vulkan_backend", + "icicle-bn254/no_ecntt", + "icicle-bls12-377/vulkan_backend", + "icicle-bls12-377/no_ecntt", ] diff --git a/examples/rust/ntt/Cargo.toml b/examples/rust/ntt/Cargo.toml index 616610f706..97cf3818de 100644 --- a/examples/rust/ntt/Cargo.toml +++ b/examples/rust/ntt/Cargo.toml @@ -12,7 +12,8 @@ icicle-bls12-377 = { path = "../../../wrappers/rust/icicle-curves/icicle-bls12-3 clap = { version = "<=4.4.12", features = ["derive"] } [features] -cuda = ["icicle-runtime/cuda_backend", +cuda = [ + "icicle-runtime/cuda_backend", "icicle-bn254/cuda_backend", "icicle-bn254/no_ecntt", "icicle-bn254/no_g2", @@ -21,7 +22,8 @@ cuda = ["icicle-runtime/cuda_backend", "icicle-bls12-377/no_g2", ] -metal = ["icicle-runtime/metal_backend", +metal = [ + "icicle-runtime/metal_backend", "icicle-bn254/metal_backend", "icicle-bn254/no_ecntt", "icicle-bn254/no_g2", @@ -29,3 +31,13 @@ metal = ["icicle-runtime/metal_backend", "icicle-bls12-377/no_ecntt", "icicle-bls12-377/no_g2", ] + +vulkan = [ + "icicle-runtime/vulkan_backend", + "icicle-bn254/vulkan_backend", + "icicle-bn254/no_ecntt", + "icicle-bn254/no_g2", + "icicle-bls12-377/vulkan_backend", + "icicle-bls12-377/no_ecntt", + "icicle-bls12-377/no_g2", +] diff --git a/examples/rust/polynomials/Cargo.toml b/examples/rust/polynomials/Cargo.toml index 527b0bd267..742dbffac2 100644 --- a/examples/rust/polynomials/Cargo.toml +++ b/examples/rust/polynomials/Cargo.toml @@ -12,16 +12,26 @@ icicle-babybear = { path = "../../../wrappers/rust/icicle-fields/icicle-babybear clap = { version = "<=4.4.12", features = ["derive"] } [features] -cuda = ["icicle-runtime/cuda_backend", +cuda = [ + "icicle-runtime/cuda_backend", "icicle-bn254/cuda_backend", - "icicle-bn254/no_ecntt", - "icicle-bn254/no_g2", + "icicle-bn254/no_ecntt", + "icicle-bn254/no_g2", "icicle-babybear/cuda_backend", ] -metal = ["icicle-runtime/metal_backend", + +metal = [ + "icicle-runtime/metal_backend", 
"icicle-bn254/metal_backend", - "icicle-bn254/no_ecntt", - "icicle-bn254/no_g2", + "icicle-bn254/no_ecntt", + "icicle-bn254/no_g2", "icicle-babybear/metal_backend", ] +vulkan = [ + "icicle-runtime/vulkan_backend", + "icicle-bn254/vulkan_backend", + "icicle-bn254/no_ecntt", + "icicle-bn254/no_g2", + "icicle-babybear/vulkan_backend", +] diff --git a/examples/rust/poseidon2/Cargo.toml b/examples/rust/poseidon2/Cargo.toml index 9416d34f88..5f07f8df07 100644 --- a/examples/rust/poseidon2/Cargo.toml +++ b/examples/rust/poseidon2/Cargo.toml @@ -4,12 +4,12 @@ version = "0.1.0" edition = "2018" [dependencies] -icicle-core = {path = "../../../wrappers/rust/icicle-core" } -icicle-runtime = {path = "../../../wrappers/rust/icicle-runtime" } -icicle-hash = {path = "../../../wrappers/rust/icicle-hash" } -icicle-babybear = {path = "../../../wrappers/rust/icicle-fields/icicle-babybear" } -icicle-m31 = {path = "../../../wrappers/rust/icicle-fields/icicle-m31" } -hex = "0.4" +icicle-core = { path = "../../../wrappers/rust/icicle-core" } +icicle-runtime = { path = "../../../wrappers/rust/icicle-runtime" } +icicle-hash = { path = "../../../wrappers/rust/icicle-hash" } +icicle-babybear = { path = "../../../wrappers/rust/icicle-fields/icicle-babybear" } +icicle-m31 = { path = "../../../wrappers/rust/icicle-fields/icicle-m31" } +hex = "0.4" rand = "0.8" clap = { version = "<=4.4.12", features = ["derive"] } @@ -20,3 +20,17 @@ cuda = [ "icicle-babybear/cuda_backend", "icicle-m31/cuda_backend", ] + +metal = [ + "icicle-runtime/metal_backend", + "icicle-hash/metal_backend", + "icicle-babybear/metal_backend", + "icicle-m31/metal_backend", +] + +vulkan = [ + "icicle-runtime/vulkan_backend", + "icicle-hash/vulkan_backend", + "icicle-babybear/vulkan_backend", + "icicle-m31/vulkan_backend", +] diff --git a/icicle/CMakeLists.txt b/icicle/CMakeLists.txt index 2a32d70d3e..246215676f 100644 --- a/icicle/CMakeLists.txt +++ b/icicle/CMakeLists.txt @@ -55,11 +55,13 @@ endif() 
set(CMAKE_POSITION_INDEPENDENT_CODE ON) # Build options -option(BUILD_TESTS "Build unit tests. Default=OFF" OFF) +option(BUILD_TESTS "Build unit tests. Default=OFF" OFF) +# Backends: typically CPU is built into the frontend, the rest are DSOs loaded at runtime from installation option(CPU_BACKEND "Build CPU backend. Default=ON" ON) -# TODO Yuval: consider decoupling backends from frontend build. Is that worth the effort? +# To enable building backends, use the following options: (note they are in private repos) option(CUDA_BACKEND "Branch/commit to pull for CUDA backend or `local` if under icicle/backend/cuda. Default=OFF" OFF) option(METAL_BACKEND "Branch/commit to pull for METAL backend or `local` if under icicle/backend/metal. Default=OFF" OFF) +option(VULKAN_BACKEND "Branch/commit to pull for VULKAN backend or `local` if under icicle/backend/vulkan. Default=OFF" OFF) # features that some fields/curves have and some don't. option(NTT "Build NTT" ON) @@ -130,55 +132,8 @@ if (CPU_BACKEND) add_subdirectory(backend/cpu) endif() -if (CUDA_BACKEND) - string(TOLOWER "${CUDA_BACKEND}" CUDA_BACKEND_LOWER) - if (CUDA_BACKEND_LOWER STREQUAL "local") - # CUDA backend is local, no need to pull - message(STATUS "Adding CUDA backend from local path: icicle/backend/cuda") - add_subdirectory(backend/cuda) - - # Set the compile definition for the backend build directory - add_compile_definitions(BACKEND_BUILD_DIR="${CMAKE_BINARY_DIR}/backend") - else() - set(CUDA_BACKEND_URL "git@github.com:ingonyama-zk/icicle-cuda-backend.git") - - include(FetchContent) - message(STATUS "Fetching cuda backend from ${CUDA_BACKEND_URL}:${CUDA_BACKEND}") - FetchContent_Declare( - cuda_backend - GIT_REPOSITORY ${CUDA_BACKEND_URL} - GIT_TAG ${CUDA_BACKEND} - ) - FetchContent_MakeAvailable(cuda_backend) - # Set the compile definition for the backend build directory - add_compile_definitions(BACKEND_BUILD_DIR="${CMAKE_BINARY_DIR}/_deps/cuda_backend-build") - endif() -endif() - -if (METAL_BACKEND) - 
string(TOLOWER "${METAL_BACKEND}" METAL_BACKEND_LOWER) - if (METAL_BACKEND_LOWER STREQUAL "local") - # METAL backend is local, no need to pull - message(STATUS "Adding Metal backend from local path: icicle/backend/metal") - add_subdirectory(backend/metal) - - # Set the compile definition for the backend build directory - add_compile_definitions(BACKEND_BUILD_DIR="${CMAKE_BINARY_DIR}/backend") - else() - set(METAL_BACKEND_URL "git@github.com:ingonyama-zk/icicle-metal-backend.git") - - include(FetchContent) - message(STATUS "Fetching cuda backend from ${METAL_BACKEND_URL}:${METAL_BACKEND}") - FetchContent_Declare( - metal_backend - GIT_REPOSITORY ${METAL_BACKEND_URL} - GIT_TAG ${METAL_BACKEND} - ) - FetchContent_MakeAvailable(metal_backend) - # Set the compile definition for the backend build directory - add_compile_definitions(BACKEND_BUILD_DIR="${CMAKE_BINARY_DIR}/_deps/metal_backend-build") - endif() -endif() +# Include and configure (for build) backends based on the backend options +include(cmake/backend_include.cmake) if (BUILD_TESTS) add_subdirectory(tests) diff --git a/icicle/cmake/backend_include.cmake b/icicle/cmake/backend_include.cmake new file mode 100644 index 0000000000..565c07282e --- /dev/null +++ b/icicle/cmake/backend_include.cmake @@ -0,0 +1,74 @@ +if (CUDA_BACKEND) + string(TOLOWER "${CUDA_BACKEND}" CUDA_BACKEND_LOWER) + if (CUDA_BACKEND_LOWER STREQUAL "local") + # CUDA backend is local, no need to pull + message(STATUS "Adding CUDA backend from local path: icicle/backend/cuda") + add_subdirectory(backend/cuda) + + # Set the compile definition for the backend build directory + add_compile_definitions(BACKEND_BUILD_DIR="${CMAKE_BINARY_DIR}/backend") + else() + set(CUDA_BACKEND_URL "git@github.com:ingonyama-zk/icicle-cuda-backend.git") + + include(FetchContent) + message(STATUS "Fetching cuda backend from ${CUDA_BACKEND_URL}:${CUDA_BACKEND}") + FetchContent_Declare( + cuda_backend + GIT_REPOSITORY ${CUDA_BACKEND_URL} + GIT_TAG ${CUDA_BACKEND} + ) + 
FetchContent_MakeAvailable(cuda_backend) + # Set the compile definition for the backend build directory + add_compile_definitions(BACKEND_BUILD_DIR="${CMAKE_BINARY_DIR}/_deps/cuda_backend-build") + endif() +endif() + +if (METAL_BACKEND) + string(TOLOWER "${METAL_BACKEND}" METAL_BACKEND_LOWER) + if (METAL_BACKEND_LOWER STREQUAL "local") + # METAL backend is local, no need to pull + message(STATUS "Adding Metal backend from local path: icicle/backend/metal") + add_subdirectory(backend/metal) + + # Set the compile definition for the backend build directory + add_compile_definitions(BACKEND_BUILD_DIR="${CMAKE_BINARY_DIR}/backend") + else() + set(METAL_BACKEND_URL "git@github.com:ingonyama-zk/icicle-metal-backend.git") + + include(FetchContent) + message(STATUS "Fetching cuda backend from ${METAL_BACKEND_URL}:${METAL_BACKEND}") + FetchContent_Declare( + metal_backend + GIT_REPOSITORY ${METAL_BACKEND_URL} + GIT_TAG ${METAL_BACKEND} + ) + FetchContent_MakeAvailable(metal_backend) + # Set the compile definition for the backend build directory + add_compile_definitions(BACKEND_BUILD_DIR="${CMAKE_BINARY_DIR}/_deps/metal_backend-build") + endif() +endif() + +if (VULKAN_BACKEND) + string(TOLOWER "${VULKAN_BACKEND}" VULKAN_BACKEND_LOWER) + if (VULKAN_BACKEND_LOWER STREQUAL "local") + # VULKAN backend is local, no need to pull + message(STATUS "Adding Vulkan backend from local path: icicle/backend/vulkan") + add_subdirectory(backend/vulkan) + + # Set the compile definition for the backend build directory + add_compile_definitions(BACKEND_BUILD_DIR="${CMAKE_BINARY_DIR}/backend") + else() + set(VULKAN_BACKEND_URL "git@github.com:ingonyama-zk/icicle-vulkan-backend.git") + + include(FetchContent) + message(STATUS "Fetching cuda backend from ${VULKAN_BACKEND_URL}:${VULKAN_BACKEND}") + FetchContent_Declare( + vulkan_backend + GIT_REPOSITORY ${VULKAN_BACKEND_URL} + GIT_TAG ${VULKAN_BACKEND} + ) + FetchContent_MakeAvailable(vulkan_backend) + # Set the compile definition for the backend 
build directory + add_compile_definitions(BACKEND_BUILD_DIR="${CMAKE_BINARY_DIR}/_deps/vulkan_backend-build") + endif() +endif() \ No newline at end of file diff --git a/icicle/tests/test_device_api.cpp b/icicle/tests/test_device_api.cpp index 8a5ea68874..f6ac8880e6 100644 --- a/icicle/tests/test_device_api.cpp +++ b/icicle/tests/test_device_api.cpp @@ -58,14 +58,17 @@ TEST_F(DeviceApiTest, MemoryCopySync) int output[2] = {0, 0}; icicle::Device dev = {device_type, 0}; - icicle_set_device(dev); - - void* dev_mem = nullptr; - ICICLE_CHECK(icicle_malloc(&dev_mem, sizeof(input))); - ICICLE_CHECK(icicle_copy_to_device(dev_mem, input, sizeof(input))); - ICICLE_CHECK(icicle_copy_to_host(output, dev_mem, sizeof(input))); - ICICLE_CHECK(icicle_free(dev_mem)); - + ICICLE_CHECK(icicle_set_device(dev)); + + void* dev_memA = nullptr; + void* dev_memB = nullptr; + // test copy host->device->device->host + ICICLE_CHECK(icicle_malloc(&dev_memA, sizeof(input))); + ICICLE_CHECK(icicle_malloc(&dev_memB, sizeof(input))); + ICICLE_CHECK(icicle_copy_to_device(dev_memA, input, sizeof(input))); + ICICLE_CHECK(icicle_copy(dev_memB, dev_memA, sizeof(input))); + ICICLE_CHECK(icicle_copy_to_host(output, dev_memB, sizeof(input))); + ICICLE_CHECK(icicle_free(dev_memB)); ASSERT_EQ(0, memcmp(input, output, sizeof(input))); } } @@ -73,22 +76,22 @@ TEST_F(DeviceApiTest, MemoryCopySyncWithOffset) { int input[4] = {1, 2, 3, 4}; + int expected[4] = {3, 4, 1, 2}; for (const auto& device_type : s_registered_devices) { - int output[2] = {0, 0}; + int output[4] = {0, 0, 0, 0}; icicle::Device dev = {device_type, 0}; - icicle_set_device(dev); + ICICLE_CHECK(icicle_set_device(dev)); int* dev_mem = nullptr; - ICICLE_CHECK( - icicle_malloc((void**)&dev_mem, sizeof(input))); // allocating larger memory to have offset on this buffer to copy - // 2 values from offset (that is copy {2,3} only) - ICICLE_CHECK(icicle_copy_to_device(dev_mem + 1, input + 1, sizeof(output))); - 
ICICLE_CHECK(icicle_copy_to_host(output, dev_mem + 1, sizeof(output))); + ICICLE_CHECK(icicle_malloc((void**)&dev_mem, 4 * sizeof(int))); + ICICLE_CHECK(icicle_copy_to_device(dev_mem, input + 2, 2 * sizeof(int))); + ICICLE_CHECK(icicle_copy_to_device(dev_mem + 2, input, 2 * sizeof(int))); + ICICLE_CHECK(icicle_copy_to_host(output, dev_mem, 4 * sizeof(int))); ICICLE_CHECK(icicle_free(dev_mem)); - ASSERT_EQ(0, memcmp(input + 1, output, sizeof(output))); + ASSERT_EQ(0, memcmp(expected, output, 4 * sizeof(int))); } } @@ -99,7 +102,7 @@ TEST_F(DeviceApiTest, MemoryCopyAsync) int output[2] = {0, 0}; icicle::Device dev = {device_type, 0}; - icicle_set_device(dev); + ICICLE_CHECK(icicle_set_device(dev)); void* dev_mem = nullptr; icicleStreamHandle stream; @@ -121,7 +124,7 @@ TEST_F(DeviceApiTest, CopyDeviceInference) int output[2] = {0, 0}; icicle::Device dev = {device_type, 0}; - icicle_set_device(dev); + ICICLE_CHECK(icicle_set_device(dev)); void* dev_mem = nullptr; ICICLE_CHECK(icicle_malloc(&dev_mem, sizeof(input))); @@ -132,15 +135,14 @@ TEST_F(DeviceApiTest, CopyDeviceInference) } } -TEST_F(DeviceApiTest, Memest) +TEST_F(DeviceApiTest, Memset) { char expected[2] = {1, 2}; for (const auto& device_type : s_registered_devices) { char host_mem[2] = {0, 0}; - // icicle::Device dev = {device_type, 0}; - icicle::Device dev = {"CPU", 0}; - icicle_set_device(dev); + icicle::Device dev = {device_type, 0}; + ICICLE_CHECK(icicle_set_device(dev)); char* dev_mem = nullptr; ICICLE_CHECK(icicle_malloc((void**)&dev_mem, sizeof(host_mem))); @@ -156,7 +158,7 @@ TEST_F(DeviceApiTest, ApiError) { for (const auto& device_type : s_registered_devices) { icicle::Device dev = {device_type, 0}; - icicle_set_device(dev); + ICICLE_CHECK(icicle_set_device(dev)); void* dev_mem = nullptr; EXPECT_ANY_THROW(ICICLE_CHECK(icicle_malloc(&dev_mem, -1))); } @@ -168,7 +170,7 @@ TEST_F(DeviceApiTest, AvailableMemory) const bool is_cuda_registered = eIcicleError::SUCCESS == icicle_is_device_available(dev); 
if (!is_cuda_registered) { GTEST_SKIP(); } // most devices do not support this - icicle_set_device(dev); + ICICLE_CHECK(icicle_set_device(dev)); size_t total, free; ASSERT_EQ(eIcicleError::SUCCESS, icicle_get_available_memory(total, free)); @@ -190,13 +192,13 @@ TEST_F(DeviceApiTest, memoryTracker) { // need two devices for this test if (s_registered_devices.size() == 1) { return; } - const int NOF_ALLOCS = 1000; + const int NOF_ALLOCS = 200; // Note that some backends have a bound (typically 256) on allocations const int ALLOC_SIZE = 1 << 20; MemoryTracker tracker{}; ICICLE_ASSERT(s_main_device != UNKOWN_DEVICE) << "memoryTracker test assumes more than one device"; Device main_device = {s_main_device, 0}; - icicle_set_device(main_device); + ICICLE_CHECK(icicle_set_device(main_device)); std::vector allocated_addresses(NOF_ALLOCS, nullptr); @@ -226,12 +228,29 @@ TEST_F(DeviceApiTest, memoryTracker) ASSERT_EQ(eIcicleError::INVALID_POINTER, icicle_is_active_device_memory(host_mem.get())); // test that we still identify correctly after switching device - icicle_set_device({"CPU", 0}); + ICICLE_CHECK(icicle_set_device({"CPU", 0})); const void* addr = (void*)((size_t)*allocated_addresses.begin() + rand_uint_32b(0, RAND_MAX) % ALLOC_SIZE); ASSERT_EQ(eIcicleError::INVALID_POINTER, icicle_is_active_device_memory(addr)); ASSERT_EQ(eIcicleError::INVALID_POINTER, icicle_is_active_device_memory(host_mem.get())); auto it = tracker.identify(addr); ASSERT_EQ(*it->first, main_device); + + ICICLE_CHECK(icicle_set_device(main_device)); + START_TIMER(remove); + for (auto& it : allocated_addresses) { + tracker.remove_allocation(it); + } + END_TIMER_AVERAGE(remove, "memory-tracker: remove average", true, NOF_ALLOCS); + + START_TIMER(free); + for (auto& it : allocated_addresses) { + icicle_free(it); + } + END_TIMER_AVERAGE(free, "memory-tracker: free average", true, NOF_ALLOCS); + + void* mem; + ICICLE_CHECK(icicle_malloc(&mem, ALLOC_SIZE)); + ICICLE_CHECK(icicle_free(mem)); } int 
main(int argc, char** argv) diff --git a/scripts/pull_metal_backend.sh b/scripts/pull_metal_backend.sh index 7c5bb69f92..7e6c26aa0f 100755 --- a/scripts/pull_metal_backend.sh +++ b/scripts/pull_metal_backend.sh @@ -14,8 +14,6 @@ ABS_METAL_DIR=$(realpath ${BACKEND_DIR})/metal echo "Trying to pull Metal backend commit '${METAL_BACKEND}' to '${ABS_METAL_DIR}'" -exit 1 - if [ -d "${ABS_METAL_DIR}" ] && [ "$(ls -A ${ABS_METAL_DIR})" ]; then echo "Directory ${ABS_METAL_DIR} is not empty." read -p "Do you want to proceed with fetching and resetting? (y/n): " response diff --git a/scripts/pull_vulkan_backend.sh b/scripts/pull_vulkan_backend.sh new file mode 100755 index 0000000000..45ba5efc6f --- /dev/null +++ b/scripts/pull_vulkan_backend.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +VULKAN_BACKEND=${1:-main} +BACKEND_DIR=${2:-./icicle/backend} + +# Check if BACKEND_DIR exists +if [ ! -d "${BACKEND_DIR}" ]; then + echo "Error: Directory '${BACKEND_DIR}' does not exist." + exit 1 +fi + +# Get the absolute path of the backend directory +ABS_VULKAN_DIR=$(realpath ${BACKEND_DIR})/vulkan + +echo "Trying to pull vulkan backend commit '${VULKAN_BACKEND}' to '${ABS_VULKAN_DIR}'" + +if [ -d "${ABS_VULKAN_DIR}" ] && [ "$(ls -A ${ABS_VULKAN_DIR})" ]; then + echo "Directory ${ABS_VULKAN_DIR} is not empty." + read -p "Do you want to proceed with fetching and resetting? (y/n): " response + case "$response" in + [Yy]* ) + echo "Proceeding with fetch and reset..." + cd ${ABS_VULKAN_DIR} + git fetch origin + git reset --hard origin/${VULKAN_BACKEND} + ;; + [Nn]* ) + echo "Aborting." + exit 1 + ;; + * ) + echo "Invalid input. Aborting." + exit 1 + ;; + esac +else + echo "Directory ${ABS_VULKAN_DIR} is empty or does not exist. Cloning..." 
+ mkdir -p ${ABS_VULKAN_DIR} + cd ${ABS_VULKAN_DIR} + git clone git@github.com:ingonyama-zk/icicle-vulkan-backend.git ${ABS_VULKAN_DIR} + git checkout ${VULKAN_BACKEND} +fi \ No newline at end of file diff --git a/wrappers/rust/icicle-curves/icicle-bls12-377/Cargo.toml b/wrappers/rust/icicle-curves/icicle-bls12-377/Cargo.toml index bf9c07bcb3..544d2bf0b3 100644 --- a/wrappers/rust/icicle-curves/icicle-bls12-377/Cargo.toml +++ b/wrappers/rust/icicle-curves/icicle-bls12-377/Cargo.toml @@ -29,6 +29,8 @@ cuda_backend = ["icicle-runtime/cuda_backend"] pull_cuda_backend = ["icicle-runtime/pull_cuda_backend"] metal_backend = ["icicle-runtime/metal_backend"] pull_metal_backend = ["icicle-runtime/pull_metal_backend"] +vulkan_backend = ["icicle-runtime/vulkan_backend"] +pull_vulkan_backend = ["icicle-runtime/pull_vulkan_backend"] [[bench]] @@ -42,4 +44,3 @@ harness = false [[bench]] name = "ecntt" harness = false - diff --git a/wrappers/rust/icicle-curves/icicle-bls12-377/build.rs b/wrappers/rust/icicle-curves/icicle-bls12-377/build.rs index d881fa7ca4..19f8257fe5 100644 --- a/wrappers/rust/icicle-curves/icicle-bls12-377/build.rs +++ b/wrappers/rust/icicle-curves/icicle-bls12-377/build.rs @@ -41,6 +41,11 @@ fn main() { } else if cfg!(feature = "pull_metal_backend") { config.define("METAL_BACKEND", "main"); } + if cfg!(feature = "vulkan_backend") { + config.define("VULKAN_BACKEND", "local"); + } else if cfg!(feature = "pull_vulkan_backend") { + config.define("VULKAN_BACKEND", "main"); + } // Optional Features that are default ON (so that default matches any backend) if cfg!(feature = "no_g2") { config.define("G2", "OFF"); diff --git a/wrappers/rust/icicle-curves/icicle-bls12-381/Cargo.toml b/wrappers/rust/icicle-curves/icicle-bls12-381/Cargo.toml index 30d9089a3f..c539209ca7 100644 --- a/wrappers/rust/icicle-curves/icicle-bls12-381/Cargo.toml +++ b/wrappers/rust/icicle-curves/icicle-bls12-381/Cargo.toml @@ -27,6 +27,8 @@ cuda_backend = ["icicle-runtime/cuda_backend"] 
pull_cuda_backend = ["icicle-runtime/pull_cuda_backend"] metal_backend = ["icicle-runtime/metal_backend"] pull_metal_backend = ["icicle-runtime/pull_metal_backend"] +vulkan_backend = ["icicle-runtime/vulkan_backend"] +pull_vulkan_backend = ["icicle-runtime/pull_vulkan_backend"] [[bench]] name = "ntt" diff --git a/wrappers/rust/icicle-curves/icicle-bls12-381/build.rs b/wrappers/rust/icicle-curves/icicle-bls12-381/build.rs index 87ace38e0a..ddec7268c8 100644 --- a/wrappers/rust/icicle-curves/icicle-bls12-381/build.rs +++ b/wrappers/rust/icicle-curves/icicle-bls12-381/build.rs @@ -41,6 +41,11 @@ fn main() { } else if cfg!(feature = "pull_metal_backend") { config.define("METAL_BACKEND", "main"); } + if cfg!(feature = "vulkan_backend") { + config.define("VULKAN_BACKEND", "local"); + } else if cfg!(feature = "pull_vulkan_backend") { + config.define("VULKAN_BACKEND", "main"); + } // Optional Features that are default ON (so that default matches any backend) if cfg!(feature = "no_g2") { config.define("G2", "OFF"); diff --git a/wrappers/rust/icicle-curves/icicle-bn254/Cargo.toml b/wrappers/rust/icicle-curves/icicle-bn254/Cargo.toml index 74b6d9843b..257570af18 100644 --- a/wrappers/rust/icicle-curves/icicle-bn254/Cargo.toml +++ b/wrappers/rust/icicle-curves/icicle-bn254/Cargo.toml @@ -27,6 +27,8 @@ cuda_backend = ["icicle-runtime/cuda_backend"] pull_cuda_backend = ["icicle-runtime/pull_cuda_backend"] metal_backend = ["icicle-runtime/metal_backend"] pull_metal_backend = ["icicle-runtime/pull_metal_backend"] +vulkan_backend = ["icicle-runtime/vulkan_backend"] +pull_vulkan_backend = ["icicle-runtime/pull_vulkan_backend"] [[bench]] name = "ntt" diff --git a/wrappers/rust/icicle-curves/icicle-bn254/build.rs b/wrappers/rust/icicle-curves/icicle-bn254/build.rs index 1d1d98b2ef..00bf3aafb7 100644 --- a/wrappers/rust/icicle-curves/icicle-bn254/build.rs +++ b/wrappers/rust/icicle-curves/icicle-bn254/build.rs @@ -40,6 +40,11 @@ fn main() { } else if cfg!(feature = 
"pull_metal_backend") { config.define("METAL_BACKEND", "main"); } + if cfg!(feature = "vulkan_backend") { + config.define("VULKAN_BACKEND", "local"); + } else if cfg!(feature = "pull_vulkan_backend") { + config.define("VULKAN_BACKEND", "main"); + } // Optional Features that are default ON (so that default matches any backend) if cfg!(feature = "no_g2") { config.define("G2", "OFF"); diff --git a/wrappers/rust/icicle-curves/icicle-bw6-761/build.rs b/wrappers/rust/icicle-curves/icicle-bw6-761/build.rs index 319ea57e3d..fdebc5cc6a 100644 --- a/wrappers/rust/icicle-curves/icicle-bw6-761/build.rs +++ b/wrappers/rust/icicle-curves/icicle-bw6-761/build.rs @@ -40,6 +40,11 @@ fn main() { } else if cfg!(feature = "pull_metal_backend") { config.define("METAL_BACKEND", "main"); } + if cfg!(feature = "vulkan_backend") { + config.define("VULKAN_BACKEND", "local"); + } else if cfg!(feature = "pull_vulkan_backend") { + config.define("VULKAN_BACKEND", "main"); + } // Build let _ = config diff --git a/wrappers/rust/icicle-curves/icicle-grumpkin/Cargo.toml b/wrappers/rust/icicle-curves/icicle-grumpkin/Cargo.toml index 590e1ac750..f2ee067da6 100644 --- a/wrappers/rust/icicle-curves/icicle-grumpkin/Cargo.toml +++ b/wrappers/rust/icicle-curves/icicle-grumpkin/Cargo.toml @@ -25,6 +25,8 @@ cuda_backend = ["icicle-runtime/cuda_backend"] pull_cuda_backend = ["icicle-runtime/pull_cuda_backend"] metal_backend = ["icicle-runtime/metal_backend"] pull_metal_backend = ["icicle-runtime/pull_metal_backend"] +vulkan_backend = ["icicle-runtime/vulkan_backend"] +pull_vulkan_backend = ["icicle-runtime/pull_vulkan_backend"] [[bench]] diff --git a/wrappers/rust/icicle-curves/icicle-grumpkin/build.rs b/wrappers/rust/icicle-curves/icicle-grumpkin/build.rs index d52afa259b..9ec47ecc6b 100644 --- a/wrappers/rust/icicle-curves/icicle-grumpkin/build.rs +++ b/wrappers/rust/icicle-curves/icicle-grumpkin/build.rs @@ -40,6 +40,11 @@ fn main() { } else if cfg!(feature = "pull_metal_backend") { 
config.define("METAL_BACKEND", "main"); } + if cfg!(feature = "vulkan_backend") { + config.define("VULKAN_BACKEND", "local"); + } else if cfg!(feature = "pull_vulkan_backend") { + config.define("VULKAN_BACKEND", "main"); + } // Build let _ = config diff --git a/wrappers/rust/icicle-fields/icicle-babybear/Cargo.toml b/wrappers/rust/icicle-fields/icicle-babybear/Cargo.toml index e9d222324e..02a6689550 100644 --- a/wrappers/rust/icicle-fields/icicle-babybear/Cargo.toml +++ b/wrappers/rust/icicle-fields/icicle-babybear/Cargo.toml @@ -9,7 +9,7 @@ repository.workspace = true # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -icicle-core = {workspace = true } +icicle-core = { workspace = true } icicle-runtime = { workspace = true } icicle-hash = { workspace = true } @@ -27,6 +27,8 @@ cuda_backend = ["icicle-runtime/cuda_backend"] pull_cuda_backend = ["icicle-runtime/pull_cuda_backend"] metal_backend = ["icicle-runtime/metal_backend"] pull_metal_backend = ["icicle-runtime/pull_metal_backend"] +vulkan_backend = ["icicle-runtime/vulkan_backend"] +pull_vulkan_backend = ["icicle-runtime/pull_vulkan_backend"] [[bench]] name = "ntt" diff --git a/wrappers/rust/icicle-fields/icicle-babybear/build.rs b/wrappers/rust/icicle-fields/icicle-babybear/build.rs index 763ad02698..f1412b4d3f 100644 --- a/wrappers/rust/icicle-fields/icicle-babybear/build.rs +++ b/wrappers/rust/icicle-fields/icicle-babybear/build.rs @@ -40,6 +40,11 @@ fn main() { } else if cfg!(feature = "pull_metal_backend") { config.define("METAL_BACKEND", "main"); } + if cfg!(feature = "vulkan_backend") { + config.define("VULKAN_BACKEND", "local"); + } else if cfg!(feature = "pull_vulkan_backend") { + config.define("VULKAN_BACKEND", "main"); + } // Build let _ = config diff --git a/wrappers/rust/icicle-fields/icicle-koalabear/Cargo.toml b/wrappers/rust/icicle-fields/icicle-koalabear/Cargo.toml index 585031b61f..bff263dedd 100644 --- 
a/wrappers/rust/icicle-fields/icicle-koalabear/Cargo.toml +++ b/wrappers/rust/icicle-fields/icicle-koalabear/Cargo.toml @@ -9,7 +9,7 @@ repository.workspace = true # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -icicle-core = {workspace = true } +icicle-core = { workspace = true } icicle-runtime = { workspace = true } icicle-hash = { workspace = true } @@ -27,6 +27,8 @@ cuda_backend = ["icicle-runtime/cuda_backend"] pull_cuda_backend = ["icicle-runtime/pull_cuda_backend"] metal_backend = ["icicle-runtime/metal_backend"] pull_metal_backend = ["icicle-runtime/pull_metal_backend"] +vulkan_backend = ["icicle-runtime/vulkan_backend"] +pull_vulkan_backend = ["icicle-runtime/pull_vulkan_backend"] [[bench]] name = "ntt" diff --git a/wrappers/rust/icicle-fields/icicle-koalabear/build.rs b/wrappers/rust/icicle-fields/icicle-koalabear/build.rs index 59d0e9982f..2a0611ade5 100644 --- a/wrappers/rust/icicle-fields/icicle-koalabear/build.rs +++ b/wrappers/rust/icicle-fields/icicle-koalabear/build.rs @@ -40,6 +40,11 @@ fn main() { } else if cfg!(feature = "pull_metal_backend") { config.define("METAL_BACKEND", "main"); } + if cfg!(feature = "vulkan_backend") { + config.define("VULKAN_BACKEND", "local"); + } else if cfg!(feature = "pull_vulkan_backend") { + config.define("VULKAN_BACKEND", "main"); + } // Build let _ = config diff --git a/wrappers/rust/icicle-fields/icicle-m31/Cargo.toml b/wrappers/rust/icicle-fields/icicle-m31/Cargo.toml index 35011a4984..2c3f804166 100644 --- a/wrappers/rust/icicle-fields/icicle-m31/Cargo.toml +++ b/wrappers/rust/icicle-fields/icicle-m31/Cargo.toml @@ -20,3 +20,5 @@ cuda_backend = ["icicle-runtime/cuda_backend"] pull_cuda_backend = ["icicle-runtime/pull_cuda_backend"] metal_backend = ["icicle-runtime/metal_backend"] pull_metal_backend = ["icicle-runtime/pull_metal_backend"] +vulkan_backend = ["icicle-runtime/vulkan_backend"] +pull_vulkan_backend = 
["icicle-runtime/pull_vulkan_backend"] diff --git a/wrappers/rust/icicle-fields/icicle-m31/build.rs b/wrappers/rust/icicle-fields/icicle-m31/build.rs index c25209b290..dfd0f58e07 100644 --- a/wrappers/rust/icicle-fields/icicle-m31/build.rs +++ b/wrappers/rust/icicle-fields/icicle-m31/build.rs @@ -40,6 +40,11 @@ fn main() { } else if cfg!(feature = "pull_metal_backend") { config.define("METAL_BACKEND", "main"); } + if cfg!(feature = "vulkan_backend") { + config.define("VULKAN_BACKEND", "local"); + } else if cfg!(feature = "pull_vulkan_backend") { + config.define("VULKAN_BACKEND", "main"); + } // Build let _ = config diff --git a/wrappers/rust/icicle-fields/icicle-stark252/Cargo.toml b/wrappers/rust/icicle-fields/icicle-stark252/Cargo.toml index 8346d7e939..f57c8037ef 100644 --- a/wrappers/rust/icicle-fields/icicle-stark252/Cargo.toml +++ b/wrappers/rust/icicle-fields/icicle-stark252/Cargo.toml @@ -9,7 +9,7 @@ repository.workspace = true # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -icicle-core = {workspace = true } +icicle-core = { workspace = true } icicle-runtime = { workspace = true } icicle-hash = { workspace = true } @@ -26,6 +26,8 @@ cuda_backend = ["icicle-runtime/cuda_backend"] pull_cuda_backend = ["icicle-runtime/pull_cuda_backend"] metal_backend = ["icicle-runtime/metal_backend"] pull_metal_backend = ["icicle-runtime/pull_metal_backend"] +vulkan_backend = ["icicle-runtime/vulkan_backend"] +pull_vulkan_backend = ["icicle-runtime/pull_vulkan_backend"] [[bench]] name = "ntt" diff --git a/wrappers/rust/icicle-hash/Cargo.toml b/wrappers/rust/icicle-hash/Cargo.toml index 7cc89ba801..fad2cd3aaf 100644 --- a/wrappers/rust/icicle-hash/Cargo.toml +++ b/wrappers/rust/icicle-hash/Cargo.toml @@ -7,7 +7,7 @@ homepage.workspace = true repository.workspace = true [dependencies] -icicle-core = {workspace = true } +icicle-core = { workspace = true } icicle-runtime = { workspace = true } rand = "0.8" @@ -19,3 
+19,5 @@ cuda_backend = ["icicle-runtime/cuda_backend"] pull_cuda_backend = ["icicle-runtime/pull_cuda_backend"] metal_backend = ["icicle-runtime/metal_backend"] pull_metal_backend = ["icicle-runtime/pull_metal_backend"] +vulkan_backend = ["icicle-runtime/vulkan_backend"] +pull_vulkan_backend = ["icicle-runtime/pull_vulkan_backend"] diff --git a/wrappers/rust/icicle-hash/build.rs b/wrappers/rust/icicle-hash/build.rs index 5658fe3c1c..2c997e76b6 100644 --- a/wrappers/rust/icicle-hash/build.rs +++ b/wrappers/rust/icicle-hash/build.rs @@ -38,6 +38,11 @@ fn main() { } else if cfg!(feature = "pull_metal_backend") { config.define("METAL_BACKEND", "main"); } + if cfg!(feature = "vulkan_backend") { + config.define("VULKAN_BACKEND", "local"); + } else if cfg!(feature = "pull_vulkan_backend") { + config.define("VULKAN_BACKEND", "main"); + } // Build let _ = config diff --git a/wrappers/rust/icicle-runtime/Cargo.toml b/wrappers/rust/icicle-runtime/Cargo.toml index 1ae0f72c78..e5c9b568cc 100644 --- a/wrappers/rust/icicle-runtime/Cargo.toml +++ b/wrappers/rust/icicle-runtime/Cargo.toml @@ -22,3 +22,5 @@ cuda_backend = [] pull_cuda_backend = [] metal_backend = [] pull_metal_backend = [] +vulkan_backend = [] +pull_vulkan_backend = [] diff --git a/wrappers/rust/icicle-runtime/build.rs b/wrappers/rust/icicle-runtime/build.rs index c93c9ed946..6c7613001f 100644 --- a/wrappers/rust/icicle-runtime/build.rs +++ b/wrappers/rust/icicle-runtime/build.rs @@ -38,6 +38,11 @@ fn main() { } else if cfg!(feature = "pull_metal_backend") { config.define("METAL_BACKEND", "main"); } + if cfg!(feature = "vulkan_backend") { + config.define("VULKAN_BACKEND", "local"); + } else if cfg!(feature = "pull_vulkan_backend") { + config.define("VULKAN_BACKEND", "main"); + } // Build let _ = config From 95c33e4d8d351d63bc88a7e0379bd30e7c127b37 Mon Sep 17 00:00:00 2001 From: Miki <100796045+mickeyasa@users.noreply.github.com> Date: Wed, 8 Jan 2025 12:59:20 +0200 Subject: [PATCH 05/12] Redesign CPU MSM with 
taskflow (#718) Accelerating MSM for CPU using taskflow library to distribute threads. --- icicle/backend/cpu/CMakeLists.txt | 9 +- icicle/backend/cpu/src/curve/cpu_msm.hpp | 945 +++++++---------------- 2 files changed, 296 insertions(+), 658 deletions(-) diff --git a/icicle/backend/cpu/CMakeLists.txt b/icicle/backend/cpu/CMakeLists.txt index 6aab973296..75a35f1e84 100644 --- a/icicle/backend/cpu/CMakeLists.txt +++ b/icicle/backend/cpu/CMakeLists.txt @@ -1,7 +1,12 @@ cmake_minimum_required(VERSION 3.18) message(STATUS "Fetching Taskflow v3.8.0 (CPU backend)") + + include(FetchContent) +# Temporarily redefine message() to suppress output +macro(message) +endmacro() FetchContent_Declare( Taskflow GIT_REPOSITORY https://github.com/taskflow/taskflow.git @@ -20,10 +25,12 @@ FetchContent_MakeAvailable(Taskflow) # Use icicle_device as interface for TaskFlow headers target_include_directories(icicle_device INTERFACE ${Taskflow_SOURCE_DIR}) +# Restore the original message behavior +unset(message) + # CPU backend is built directly into icicle library target_sources(icicle_device PRIVATE src/cpu_device_api.cpp) - # field API library if (FIELD) target_sources(icicle_field PRIVATE diff --git a/icicle/backend/cpu/src/curve/cpu_msm.hpp b/icicle/backend/cpu/src/curve/cpu_msm.hpp index 7cfd1032d2..2f129cd699 100644 --- a/icicle/backend/cpu/src/curve/cpu_msm.hpp +++ b/icicle/backend/cpu/src/curve/cpu_msm.hpp @@ -13,7 +13,7 @@ #include "icicle/curves/projective.h" #include "icicle/curves/curve_config.h" #include "icicle/msm.h" -#include "tasks_manager.h" +#include "taskflow/taskflow.hpp" #include "icicle/backend/msm_config.h" #ifdef MEASURE_MSM_TIMES #include "icicle/utils/timer.hpp" @@ -22,701 +22,344 @@ using namespace icicle; using namespace curve_config; -#define LOG_EC_ADDITIONS_IN_BATCH 2 -#define NOF_EC_ADDITIONS_IN_BATCH (1 << LOG_EC_ADDITIONS_IN_BATCH) - -/** - * @class EcAddTask - * @brief class deriving `TaskBase` to allow use of tasks_manager.h for MSM - * (The task to be 
executed is an elliptic curve addition). - */ template -class EcAddTask : public TaskBase +class Msm { public: - /** - * @brief constructor for the task ensuring points are zeros and other fields are init to invalid values (to avoid - * falsely handling a result at the start). The execution adds points m_point1,m_point2 and stores the result in - * m_point1. - */ - EcAddTask() - : TaskBase(), m_a_points(NOF_EC_ADDITIONS_IN_BATCH, P::zero()), m_b_points(NOF_EC_ADDITIONS_IN_BATCH, P::zero()), - m_b_point_ptrs(NOF_EC_ADDITIONS_IN_BATCH, nullptr), m_b_affine_points(NOF_EC_ADDITIONS_IN_BATCH, A::zero()), - m_return_idx(NOF_EC_ADDITIONS_IN_BATCH, -1), m_opcodes(NOF_EC_ADDITIONS_IN_BATCH, ADD_P1_P2_BY_VALUE), - m_is_line(NOF_EC_ADDITIONS_IN_BATCH, true), m_nof_valid_points(0) + // Constructor + Msm(const int msm_size, const MSMConfig& config) : m_msm_size(msm_size), m_config(config) { - } - - /** - * @brief Function to be executed by the tasks manager. It can be configured according to the value odf - */ - void execute() - { - static int counter = 0; - for (int i = 0; i < m_nof_valid_points; i++) { - switch (m_opcodes[i]) { - case ADD_P1_P2_BY_VALUE: - m_a_points[i] = m_a_points[i] + m_b_points[i]; - continue; - case ADD_P1_AND_P2_POINTER: - m_a_points[i] = m_a_points[i] + *(m_b_point_ptrs[i]); - continue; - case ADD_P1_AND_P2_AFFINE: - m_a_points[i] = m_a_points[i] + m_b_affine_points[i]; - // continue; - } + // TBD: for sall size MSM - prefer double and add + calc_optimal_parameters(); + + // Resize the thread buckets according to optimal parameters + m_workers_buckets.resize(m_nof_workers); + m_workers_buckets_busy.resize(m_nof_workers); + for (int worker_i = 0; worker_i < m_nof_workers; worker_i++) { + m_workers_buckets[worker_i].resize(m_nof_total_buckets); + m_workers_buckets_busy[worker_i].resize(m_nof_total_buckets); } } - /** - * @brief Dispatch task even if batch is not full. 
This function is mostly used in ends of phases where the left - * additions are not enough to fill a batch. - */ - void dispatch_if_not_empty() + // run MSM based on pippenger algorithm + void run_msm(const scalar_t* scalars, const A* bases, P* results) { - if (is_idle() && m_nof_valid_points > 0) { dispatch(); } - } + // TBD: move to threads + init_workers_buckets_busy(); - /** - * @brief Set up phase 1 addition between a new base and the target bucket. - * @param bucket - bucket to be added. It isn't passed as Per as the bucket's value can be changed during - * addition execution. - * @param base - affine base to be added. Passed as a Per as the input is constant and it saves copying. - * @param negate_affine - flag to indicate that the base needs to be subbed instead of added. - * @param is_montgomery - flag to indicate that the base is in Montgomery form and first needs to be converted. - */ - void set_phase1_addition_with_affine(const P& bucket, const A& base, int bucket_idx) - { - m_a_points[m_nof_valid_points] = bucket; - m_b_affine_points[m_nof_valid_points] = base; - m_return_idx[m_nof_valid_points] = bucket_idx; - m_opcodes[m_nof_valid_points] = ADD_P1_AND_P2_AFFINE; + phase1_populate_buckets(scalars, bases); - m_nof_valid_points++; - if (m_nof_valid_points == NOF_EC_ADDITIONS_IN_BATCH) { dispatch(); } - } + phase2_collapse_segments(); - /** - * @brief Set up phase 2 addition by value. It is required for the first set of additions - the first line addition - * of each segment. - * @param line_sum - line sum to be copied because it is also given to triangle sum. - * @param bucket - bucket to be added to the line. - * @param segment_idx - relevant segment to identify the returning result. 
- */ - void set_phase2_addition_by_value(const P& line_sum, const P& bucket, int segment_idx) - { - m_a_points[m_nof_valid_points] = line_sum; - m_b_points[m_nof_valid_points] = bucket; - m_return_idx[m_nof_valid_points] = segment_idx; - m_is_line[m_nof_valid_points] = true; - m_opcodes[m_nof_valid_points] = ADD_P1_P2_BY_VALUE; - - m_nof_valid_points++; - if (m_nof_valid_points == NOF_EC_ADDITIONS_IN_BATCH) { dispatch(); } + // TBD: start the next task in batch in parallel to phase 3 + phase3_final_accumulator(results); } - /** - * @brief Set up line addition for phase 2. - * @param line_sum - line sum to be copied because it is also given to triangle sum. - * @param bucket_ptr - Per to the bucket to be added to the line, passed as a Per to avoid copying. - * @param segment_idx - relevant segment to identify the returning result. - */ - void set_phase2_line_addition(const P& line_sum, P* bucket_ptr, const int segment_idx) + // Calculate the optimal C based on the problem size, config and machine parameters. + static unsigned get_optimal_c(unsigned msm_size, const MSMConfig& config) { - m_a_points[m_nof_valid_points] = line_sum; - m_b_point_ptrs[m_nof_valid_points] = bucket_ptr; - m_return_idx[m_nof_valid_points] = segment_idx; - m_is_line[m_nof_valid_points] = true; - m_opcodes[m_nof_valid_points] = ADD_P1_AND_P2_POINTER; - - m_nof_valid_points++; - if (m_nof_valid_points == NOF_EC_ADDITIONS_IN_BATCH) { dispatch(); } + // TBD: optimize - condsider nof workers, do experiments + int optimal_c = config.c > 0 ? config.c : // c given by config. + std::max((int)(0.7 * std::log2(msm_size * config.precompute_factor)), 8); // Empirical formula + return optimal_c; } - /** - * @brief Set up triangle addition for phase 2. - * No need to specify segment idx as this is only assigned to a task already handling the triangle's segment idx - * (A previous line or triangle sum of this segment). 
- * @param line_sum - line sum to be copied because its value can be updated during the triangle sum execution. - * @param triangle_sum_ptr - Per to the triangle sum, passed as a Per to avoid unnecessary copying. - */ - void set_phase2_triangle_addition(P& line_sum, P* triangle_sum_ptr) - { - m_a_points[m_nof_valid_points] = line_sum; - m_b_point_ptrs[m_nof_valid_points] = triangle_sum_ptr; - m_is_line[m_nof_valid_points] = false; - m_opcodes[m_nof_valid_points] = ADD_P1_AND_P2_POINTER; +private: + // A single bucket data base + // TBD: check adding a pointer to affine to reduce copies + struct Bucket { + P point; + }; - m_nof_valid_points++; - if (m_nof_valid_points == NOF_EC_ADDITIONS_IN_BATCH) { dispatch(); } - } + // A single segment data base + struct Segment { + P line_sum; + P triangle_sum; + }; - /** - * @brief Chain addition used in phase 1 when a collision between a new result and an occupied bucket. - * @param result - the previous EC addition result. - * @param bucket - the bucket value to be added to the existing result in p1. - * @param segment_idx - index of the bucket to return the result to. - */ - void set_phase1_collision_task(const P& result, const P& bucket, const int segment_idx) + // members + tf::Taskflow m_taskflow; // Accumulate tasks + tf::Executor m_executor; // execute all tasks accumulated on multiple threads + const int m_msm_size; // number of scalars in the problem + const MSMConfig& m_config; // extra parameters for the problem + uint32_t m_scalar_size; // the number of bits at the scalar + uint32_t m_c; // the number of bits each bucket module is responsible for + uint32_t m_bm_size; // number of buckets in a single bucket module. + uint32_t m_nof_buckets_module; // number of bucket modules. 
Each BM contains m_bm_size buckets except for the last one + uint64_t m_nof_total_buckets; // total number of buckets across all bucket modules + uint32_t m_precompute_factor; // the number of bases precomputed for each scalar + uint32_t m_segment_size; // segments size for phase 2. + uint32_t m_nof_workers; // number of threads in current machine + + // per worker: + std::vector> m_workers_buckets; // all buckets used by the worker + std::vector> m_workers_buckets_busy; // for each bucket, an indication if it is busy + + std::vector m_segments; // A vector of all segments for phase 2 + + // set the parameters based on the problem size and the machine properties + void calc_optimal_parameters() { - m_a_points[m_nof_valid_points] = result; - m_b_points[m_nof_valid_points] = bucket; - m_return_idx[m_nof_valid_points] = segment_idx; - m_opcodes[m_nof_valid_points] = ADD_P1_P2_BY_VALUE; + // set nof workers + m_nof_workers = std::thread::hardware_concurrency(); + if (m_config.ext && m_config.ext->has(CpuBackendConfig::CPU_NOF_THREADS)) { + m_nof_workers = m_config.ext && m_config.ext->has(CpuBackendConfig::CPU_NOF_THREADS) + ? m_config.ext->get(CpuBackendConfig::CPU_NOF_THREADS) + : // number of threads provided by config + std::thread::hardware_concurrency(); // check machine properties + } + if (m_nof_workers <= 0) { + ICICLE_LOG_WARNING << "Unable to detect number of hardware supported threads - fixing it to 1\n"; + m_nof_workers = 1; + } - m_nof_valid_points++; - if (m_nof_valid_points == NOF_EC_ADDITIONS_IN_BATCH) { dispatch(); } + // phase 1 properties + m_scalar_size = scalar_t::NBITS; // TBD handle this config.bitsize != 0 ? config.bitsize : scalar_t::NBITS; + m_c = get_optimal_c(m_msm_size, m_config); + m_precompute_factor = m_config.precompute_factor; + m_nof_buckets_module = ((m_scalar_size - 1) / (m_config.precompute_factor * m_c)) + 1; + m_bm_size = 1 << (m_c - 1); + const uint64_t last_bm_size = + m_precompute_factor > 1 ? 
m_bm_size : 1 << (m_scalar_size - ((m_nof_buckets_module - 1) * m_c)); + m_nof_total_buckets = (m_nof_buckets_module - 1) * m_bm_size + last_bm_size; + + // phase 2 properties + m_segment_size = + std::min(m_bm_size, (uint32_t)(1 << (uint32_t)(std::log2(m_nof_total_buckets / m_nof_workers) - 4))); + const int nof_segments = (m_nof_total_buckets + m_segment_size - 1) / m_segment_size; + m_segments.resize(nof_segments); } - /** - * @brief Resets task to idle, resetting the valid points counter to 0. - */ - void reset_idle() + // init all the workers buckets to empty + void init_workers_buckets_busy() // TBD move to threads - note that not al threads are working { - m_nof_valid_points = 0; - set_idle(); - } - - int m_nof_valid_points; - - std::vector

m_a_points; // One of the addends that also stores addition results. - - std::vector m_return_idx; // Idx allowing manager to figure out where the result belong to. - std::vector m_is_line; // Indicator for phase 2 sums between line sum and triangle sum. - -private: - // Variations of the second addend which will be used depending on the opcode below - std::vector

m_b_points; - std::vector m_b_affine_points; - std::vector m_b_point_ptrs; - - enum eAddType { ADD_P1_P2_BY_VALUE, ADD_P1_AND_P2_POINTER, ADD_P1_AND_P2_AFFINE }; - std::vector m_opcodes; -}; - -/** - * @class MSM - * @brief class for solving multi-scalar-multiplication on the cpu. - * The class is a template depending on the element relevant to MSM (for instance EC P). - * NOTE The msm runs only if nof_threads * 4 > nof_bms. The user can guarantee the condition by assigning a proper - * precompute factor that will decrease the number of BMs required to calculate the MSM. - */ -template -class Msm -{ -public: - /** - * @brief Constructor for Msm class. - * @param config - msm config. important parameters that are part of the config extension are: . NOTE: ensure c - * doesn't divide the scalar width without a remainder. - * @param c - c separately after cpu_msm handled problematic c values (less than 1 or dividing scalar_t::NBITS - * without remainder) - * @param nof_threads - number of worker threads for EC additions. - */ - Msm(const MSMConfig& config, const int c, const int nof_threads); - - /** - * @brief Destructor for Msm class. - * Ensures phase 3 threads have finished and joins them (other deletion are implemented implicitly). - */ - ~Msm() - { - for (std::thread& phase3_thread : m_phase3_threads) { - phase3_thread.join(); + for (auto& buckets_busy : m_workers_buckets_busy) { + fill(buckets_busy.begin(), buckets_busy.end(), false); } } - /** - * @brief Main function to execute MSM computation. Launches the 3 phases implemented in the functions below - * (accumulation, bm sums, final accumulator). - * @param scalars - Input scalars for MSM. - * @param bases - EC P input, affine representation. - * @param msm_size - Size of the above arrays, as they are given as Pers. - * @param batch_idx - number of current MSM in the batch. - * @param results - Per to P array in which to store the results. NOTE: the user is expected to preallocate - * the results array. 
- */ - void run_msm( - const scalar_t* scalars, const A* bases, const unsigned int msm_size, const unsigned int batch_idx, P* results); - - /** - * @brief Calculate approximate value of c to minimize number of EC additions in the MSM calculation. Having said - * that, the value of c might be suboptimal in the case of physical memory limitations (Due to required memory size - * for the BMs, determined by c). - * @param msm_size - Number of inputs in the MSM calculation. - * @param precompute_factor - Precompute factor determined in the config. - * @return Value of c to minimize EC additions while not filling up the physical memory. - */ - static unsigned get_optimal_c(unsigned msm_size, int precompute_factor) + // execute all tasks in taskfkow, wait for them to complete and clear taskflow. + void run_workers_and_wait() { - // Approximation for optimal c size while ignoring memory limitation. - int optimal_c = precompute_factor > 1 - ? std::max((int)std::log2(msm_size) + (int)std::log2(precompute_factor) - 5, 8) - : std::max((int)std::log2(msm_size) - 5, 8); - - // Get physical memory limitation (by some factor < 1 - chosen to be 3/4) - uint64_t point_size = 3 * scalar_t::NBITS; // NOTE this is valid under the assumption of projective points in BMs - uint64_t _0_75_of_mem_size = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGE_SIZE) * 3 / 4; - uint64_t max_nof_points_in_mem = _0_75_of_mem_size / point_size; - - // Reduce c until it doesn't exceed the memory limitation - int c = optimal_c + 1; - uint64_t total_num_of_points; - do { - c--; - uint64_t num_bms = ((scalar_t::NBITS - 1) / (precompute_factor * c)) + 1; - total_num_of_points = num_bms << (c - 1); - } while (total_num_of_points > max_nof_points_in_mem); - ICICLE_LOG_DEBUG << "Chosen c:\t" << c - << (c < optimal_c ? 
"\t(Not optimal, BMs are memory bound by " - : "\t(Optimal, BMs are smaller than limit of ") - << (_0_75_of_mem_size >> 20) << "MB)"; - - return c; + m_executor.run(m_taskflow).wait(); + m_taskflow.clear(); } -private: - TasksManager> manager; // Tasks manager for multithreading - - const unsigned int m_c; // Pipenger constant - const unsigned int m_num_bkts; // Number of buckets in each bucket module - const unsigned int m_precompute_factor; // multiplication of bases already calculated trading memory for performance - const unsigned int m_num_bms; // Number of bucket modules (windows in Pipenger's algorithm) - const bool m_are_scalars_mont; // Are the input scalars in Montgomery representation - const bool m_are_points_mont; // Are the input points in Montgomery representation - const int m_batch_size; - - EcAddTask* m_curr_task; - - // Phase 1 members - std::vector

m_buckets; // Vector of all buckets required for phase 1 (All bms in order) - std::vector m_bkts_occupancy; // Boolean vector indicating if the corresponding bucket is occupied - - // Phase 2 members - const int m_log_num_segments; - const int m_num_bm_segments; // Number of segments in a BM - to maximize parallel operation from the serial bm sums. - const int m_segment_size; - - /** - * @struct BmSumSegment - * @brief Struct bundling the required data members for each BM segment in phase 2. - */ - struct BmSumSegment { - // Imagining the required BM sum as a right angle triangle - Nth element is a N high column of the element - - // A method to summing the BM serially is starting from the top with a triangle sum up to the current line and a - // line sum which is calculated for each line then added to the triangle. - // Therefore both sums are need as a part of this struct. - P triangle_sum; - P line_sum; - - int m_nof_received_sums = 1; // Counter for numbers of sum received in this row idx - used to determine when a new - // row idx can be dispatched (Due to it depending on the two sums from the prev idx). - int m_idx_in_segment; // Counter counting down how far the current sum is through the segment. - int m_segment_mem_start; // Offset of segment start in memory / vector. - }; - - // Phase 3 members - std::vector m_phase3_threads; - - /** - * @brief Phase 1 (accumulation) of MSM - sorting input points to buckets according to corresponding input scalars. - * @param scalars - scalar input. - * @param bases - EC P input, affine representation (Regardless of the defined P type of the class). - * @param msm_size - Size of the above arrays, as they are given as Pers. - */ - void phase1_bucket_accumulator(const scalar_t* scalars, const A* bases, const unsigned int msm_size); - - /** - * @brief Push addition task during phase 1. 
- * Th function also handles completed addition results while attempting to insert the new addition task (including - * potential collision that requires more urgent ec-addition of the result and the current stored value). - * @param task_bkt_idx - address in m_buckets in which to store the result in the future. - * @param bkt - the P from m_buckets. - * @param base - the P from the input bases - * @param negate_base - flag to signal the task to subtract base instead of adding it. - */ - void phase1_push_addition(const unsigned int task_bkt_idx, const P bkt, const A base); - /** - * @brief Handles the final results of phase 1 (after no other planned additions are required). - * The function also handles the potential collision similarly to push_addition above. - */ - void phase1_wait_for_completion(); - - /** - * @brief Phase 2 of MSM. Function handling the initial parallel part of summing the bucket modules. It splits the - * BMs into segments, summing each separately and passing the segments sum to be handled by the final accumulator. - * @param segments vector to contain segments line and triangle sums for the final accumulator (output of function). - */ - void phase2_bm_sum(std::vector& segments); - - /** - * @brief Setting up phase 2 class members according to phase 1 results. - * NOTE: This setup function also potentially blocks until the previous MSM's phase 3 finishes (Given that in batched - * MSM, phase 3 is initiated on a separate thread while the next MSM begins phase 1). - */ - void phase2_setup(std::vector& segments); - - /** - * @brief Final accumulation required for MSM calculation. the function will either launch a thread to perform the - * calculation (`phase3_tread` function) or by the main thread, depending on the position in the batch (Last msm or - * not). - * @param segments_ptr - pointer to vector containing the segment calculations of phase 2 - * @param idx_in_batch - idx of the current MSM in the batch. 
- * @param result - output, Per to write the MSM result to. Memory for the Per has already been allocated by - * the user. - */ - void phase3_final_accumulator(std::vector& segments, int idx_in_batch, P* result); - - /** - * @brief Phase 3 function to be ran by a separate thread (or main in the end of the run) - i.e. without using the - * tasks manager. It performs the remaining serial addition in each BM and sums them to one final MSM result. - * @param result - Per to write the MSM result to. Memory for the Per has already been allocated by the user. - */ - void phase3_thread(std::vector segments, P* result); - - /** - * @brief Function for resetting class members between batch runs. - */ - void batch_run_reset() { std::fill(m_bkts_occupancy.begin(), m_bkts_occupancy.end(), false); } -}; - -template -Msm::Msm(const MSMConfig& config, const int c, const int nof_threads) - : manager( - nof_threads, - // Minimal number of tasks required in phase 2 - 2 Tasks for each BM - (((scalar_t::NBITS - 1) / (config.precompute_factor * c)) + 1) * 2), - - m_curr_task(nullptr), - - m_c(c), m_num_bkts(1 << (m_c - 1)), m_precompute_factor(config.precompute_factor), - m_num_bms(((scalar_t::NBITS - 1) / (config.precompute_factor * c)) + 1), - m_are_scalars_mont(config.are_scalars_montgomery_form), m_are_points_mont(config.are_points_montgomery_form), - m_batch_size(config.batch_size), - - m_buckets(m_num_bms * m_num_bkts), m_bkts_occupancy(m_num_bms * m_num_bkts, false), + // phase 1: Each worker process a portion of the inputs and populate its buckets + void phase1_populate_buckets(const scalar_t* scalars, const A* bases) + { + // Divide the msm problem to workers + const int worker_msm_size = (m_msm_size + m_nof_workers - 1) / m_nof_workers; // round up + + // Run workers to build their buckets on a subset of the scalars and bases + for (int worker_i = 0; worker_i < m_nof_workers; worker_i++) { + m_taskflow.emplace([=]() { + const int scalars_start_idx = worker_msm_size * worker_i; 
+ const int bases_start_idx = scalars_start_idx * m_precompute_factor; + const int cur_worker_msm_size = std::min(worker_msm_size, m_msm_size - worker_i * worker_msm_size); + worker_run_phase1(worker_i, scalars + scalars_start_idx, bases + bases_start_idx, cur_worker_msm_size); + }); + } - m_log_num_segments(std::max( - (int)std::floor( - std::log2((double)(nof_threads * TASKS_PER_THREAD * NOF_EC_ADDITIONS_IN_BATCH - 1) / (2 * m_num_bms))), - 0)), - m_num_bm_segments(std::min((int)(1 << m_log_num_segments), (int)(m_num_bkts))), - m_segment_size(std::max((int)(m_num_bkts >> m_log_num_segments), 1)), + // TBD build a graph of dependencies for task flow to execute phase 2 + run_workers_and_wait(); - m_phase3_threads(m_batch_size - 1) -{ -} - -template -void Msm::run_msm( - const scalar_t* scalars, const A* bases, const unsigned int msm_size, const unsigned int batch_idx, P* results) -{ -#ifdef MEASURE_MSM_TIMES - Timer tmsm("Total msm time"); - { - Timer tp1("Phase 1"); -#else - { -#endif - phase1_bucket_accumulator(scalars, bases, msm_size); - } - auto segments = std::vector(m_num_bms * m_num_bm_segments); - { -#ifdef MEASURE_MSM_TIMES - Timer tp1("Phase 2"); -#endif - phase2_bm_sum(segments); - } - { -#ifdef MEASURE_MSM_TIMES - Timer tp1("Phase 3"); -#endif - phase3_final_accumulator(segments, batch_idx, results); - if (batch_idx < m_batch_size - 1) { batch_run_reset(); } + // Collapse all the workers buckets into one + collapse_all_workers_buckets(); } -} -template -void Msm::phase1_bucket_accumulator(const scalar_t* scalars, const A* bases, const unsigned int msm_size) -{ - const int coeff_bit_mask_no_sign_bit = m_num_bkts - 1; - const int coeff_bit_mask_with_sign_bit = (1 << m_c) - 1; - // NUmber of windows / additions per scalar in case num_bms * precompute_factor exceed scalar width - const int num_bms_before_precompute = ((scalar_t::NBITS - 1) / m_c) + 1; // +1 for ceiling - int carry = 0; - for (int i = 0; i < msm_size; i++) { - carry = 0; - // Handle 
required preprocess of scalar - scalar_t scalar = m_are_scalars_mont ? scalar_t::from_montgomery(scalars[i]) : scalars[i]; - bool negate_p_and_s = scalar.get_scalar_digit(scalar_t::NBITS - 1, 1) > 0; - if (negate_p_and_s) { scalar = scalar_t::neg(scalar); } - for (int j = 0; j < m_precompute_factor; j++) { - // Handle required preprocess of base P - A base = - m_are_points_mont ? A::from_montgomery(bases[m_precompute_factor * i + j]) : bases[m_precompute_factor * i + j]; - if (base == A::zero()) { continue; } - if (negate_p_and_s) { base = A::neg(base); } - - for (int k = 0; k < m_num_bms; k++) { - // Avoid seg fault in case precompute_factor*c exceeds the scalar width by comparing index with num additions - if (m_num_bms * j + k >= num_bms_before_precompute) { break; } - - uint32_t curr_coeff = scalar.get_scalar_digit(m_num_bms * j + k, m_c) + carry; - int bkt_idx = 0; - // For the edge case of curr_coeff = c (limb=c-1, carry=1) use the sign bit mask - if ((curr_coeff & coeff_bit_mask_with_sign_bit) != 0) { - // Remove sign to infer the bkt idx. 
- carry = curr_coeff > m_num_bkts; - if (!carry) { - bkt_idx = m_num_bkts * k + (curr_coeff & coeff_bit_mask_no_sign_bit); - } else { - bkt_idx = m_num_bkts * k + ((-curr_coeff) & coeff_bit_mask_no_sign_bit); - } + // Each worker run this function and update its buckets - this function needs re writing + void worker_run_phase1(const int worker_idx, const scalar_t* scalars, const A* bases, const int msm_size) + { + // My buckets: + std::vector& buckets = m_workers_buckets[worker_idx]; + std::vector& buckets_busy = m_workers_buckets_busy[worker_idx]; + + const int coeff_bit_mask_no_sign_bit = m_bm_size - 1; + const int coeff_bit_mask_with_sign_bit = (1 << m_c) - 1; + // NUmber of windows / additions per scalar in case num_bms * precompute_factor exceed scalar width + const int num_bms_before_precompute = ((m_scalar_size - 1) / m_c) + 1; // +1 for ceiling + int carry = 0; + for (int i = 0; i < msm_size; i++) { + carry = 0; + // Handle required preprocess of scalar + scalar_t scalar = + m_config.are_scalars_montgomery_form ? scalar_t::from_montgomery(scalars[i]) : scalars[i]; // TBD: avoid copy + bool negate_p_and_s = scalar.get_scalar_digit(m_scalar_size - 1, 1) > 0; + if (negate_p_and_s) { scalar = scalar_t::neg(scalar); } // TBD: inplace + + for (int j = 0; j < m_precompute_factor; j++) { + // Handle required preprocess of base P + A base = m_config.are_points_montgomery_form ? A::from_montgomery(bases[m_precompute_factor * i + j]) + : bases[m_precompute_factor * i + j]; // TDB: avoid copy + if (base == A::zero()) { continue; } // TBD: why is that? can be done more efficiently? 
+ A base_neg = A::neg(base); + + for (int bm_i = 0; bm_i < m_nof_buckets_module; bm_i++) { + // Avoid seg fault in case precompute_factor*c exceeds the scalar width by comparing index with num additions + if (m_nof_buckets_module * j + bm_i >= num_bms_before_precompute) { break; } + + uint32_t curr_coeff = scalar.get_scalar_digit(m_nof_buckets_module * j + bm_i, m_c) + carry; + int bkt_idx = 0; // TBD: move inside if and change to ( = ? : ) + // For the edge case of curr_coeff = c (limb=c-1, carry=1) use the sign bit mask + if ((curr_coeff & coeff_bit_mask_with_sign_bit) != 0) { + // Remove sign to infer the bkt idx. + carry = curr_coeff > m_bm_size; + if (!carry) { + bkt_idx = m_bm_size * bm_i + (curr_coeff & coeff_bit_mask_no_sign_bit); + } else { + bkt_idx = m_bm_size * bm_i + ((-curr_coeff) & coeff_bit_mask_no_sign_bit); + } - // Check for collision in that bucket and either dispatch an addition or store the P accordingly. - if (m_bkts_occupancy[bkt_idx]) { - m_bkts_occupancy[bkt_idx] = false; - phase1_push_addition(bkt_idx, m_buckets[bkt_idx], carry > 0 ? A::neg(base) : base); + // Check for collision in that bucket and either dispatch an addition or store the P accordingly. + if (buckets_busy[bkt_idx]) { + buckets[bkt_idx].point = + buckets[bkt_idx].point + ((negate_p_and_s ^ (carry > 0)) ? base_neg : base); // TBD: inplace + } else { + buckets_busy[bkt_idx] = true; + buckets[bkt_idx].point = + P::from_affine(((negate_p_and_s ^ (carry > 0)) ? base_neg : base)); // TBD: inplace + } } else { - m_bkts_occupancy[bkt_idx] = true; - m_buckets[bkt_idx] = carry > 0 ? 
P::neg(P::from_affine(base)) : P::from_affine(base); + // Handle edge case where coeff = 1 << c due to carry overflow which means: + // coeff & coeff_mask == 0 but there is a carry to propagate to the next segment + carry = curr_coeff >> m_c; } - } else { - // Handle edge case where coeff = 1 << c due to carry overflow which means: - // coeff & coeff_mask == 0 but there is a carry to propagate to the next segment - carry = curr_coeff >> m_c; } } } } - phase1_wait_for_completion(); -} -template -void Msm::phase1_push_addition(const unsigned int task_bkt_idx, const P bkt, const A base) -{ - while (m_curr_task == nullptr) { - // Use the search for an available (idle or completed) task as an opportunity to handle the existing results. - m_curr_task = manager.get_idle_or_completed_task(); - if (m_curr_task->is_completed()) { - const int nof_results = m_curr_task->m_nof_valid_points; - m_curr_task->reset_idle(); - for (int i = 0; i < nof_results; i++) { - // Check for collision in the destination bucket, and chain and addition / store result accordingly. - if (m_bkts_occupancy[m_curr_task->m_return_idx[i]]) { - m_bkts_occupancy[m_curr_task->m_return_idx[i]] = false; - m_curr_task->set_phase1_collision_task( - m_curr_task->m_a_points[i], m_buckets[m_curr_task->m_return_idx[i]], m_curr_task->m_return_idx[i]); - } else { - m_buckets[m_curr_task->m_return_idx[i]] = m_curr_task->m_a_points[i]; - m_bkts_occupancy[m_curr_task->m_return_idx[i]] = true; - } - } - } - // If the collision tasks cause the task to be dispatched again this task can't be assigned more additions - - // repeat the loop with a new task. - if (!m_curr_task->is_idle()) { m_curr_task = nullptr; } - } - // After handling the result a new one can be set. 
- m_curr_task->set_phase1_addition_with_affine(bkt, base, task_bkt_idx); - if (!m_curr_task->is_idle()) { m_curr_task = nullptr; } -} - -template -void Msm::phase1_wait_for_completion() -{ - // In case remaining additions are smaller than a batch size - dispatch current task - if (m_curr_task && m_curr_task->is_idle()) { m_curr_task->dispatch(); } - - EcAddTask* task = manager.get_completed_task(); - while (task != nullptr) { - const int nof_results = task->m_nof_valid_points; - task->reset_idle(); - bool had_collision = false; - for (int i = 0; i < nof_results; i++) { - // Check for collision in the destination bucket, and chain and addition / store result accordingly. - if (m_bkts_occupancy[task->m_return_idx[i]]) { - m_bkts_occupancy[task->m_return_idx[i]] = false; - task->set_phase1_collision_task(task->m_a_points[i], m_buckets[task->m_return_idx[i]], task->m_return_idx[i]); - had_collision = true; - } else { - m_buckets[task->m_return_idx[i]] = task->m_a_points[i]; - m_bkts_occupancy[task->m_return_idx[i]] = true; + // Collapse all workers buckets into worker 0 buckets? + void collapse_all_workers_buckets() + { + int bucket_start = 0; + const int nof_buckets_per_thread = (m_nof_total_buckets + m_nof_workers - 1) / m_nof_workers; + for (int worker_i = 0; worker_i < m_nof_workers; worker_i++) { + const int nof_buckets = + std::min(nof_buckets_per_thread, (int)m_nof_total_buckets - worker_i * nof_buckets_per_thread); + if (nof_buckets > 0) { + m_taskflow.emplace( + [this, bucket_start, nof_buckets]() { worker_collapse_all_workers_result(bucket_start, nof_buckets); }); + bucket_start += nof_buckets_per_thread; } } - task->dispatch_if_not_empty(); - - task = manager.get_completed_task(); + run_workers_and_wait(); } -} - -template -void Msm::phase2_bm_sum(std::vector& segments) -{ - phase2_setup(segments); - if (m_segment_size > 1) { - // Send first additions - line additions. 
- for (int i = 0; i < m_num_bms * m_num_bm_segments; i++) { - if (i % NOF_EC_ADDITIONS_IN_BATCH == 0) { m_curr_task = manager.get_idle_task(); } - BmSumSegment& curr_segment = segments[i]; // For readability - - int bkt_idx = curr_segment.m_segment_mem_start + curr_segment.m_idx_in_segment; - P bucket = m_bkts_occupancy[bkt_idx] ? m_buckets[bkt_idx] : P::zero(); - m_curr_task->set_phase2_addition_by_value(curr_segment.line_sum, bucket, i); - } - // Dispatch last task if it is not enough to fill a batch and dispatch automatically. - if (m_curr_task->is_idle()) { m_curr_task->dispatch(); } - // Loop until all line/tri sums are done. - int done_segments = 0; - while (done_segments < m_num_bms * m_num_bm_segments) { - m_curr_task = manager.get_completed_task(); - - // Check if there is a need for a line task according to the received sums counter of one of the received segments - EcAddTask* line_task = nullptr; - if (segments[m_curr_task->m_return_idx[0]].m_nof_received_sums == 1) { line_task = manager.get_idle_task(); } - - const int nof_results = m_curr_task->m_nof_valid_points; - m_curr_task->reset_idle(); - - for (int i = 0; i < nof_results; i++) { - BmSumSegment& curr_segment = segments[m_curr_task->m_return_idx[i]]; // For readability - - if (m_curr_task->m_is_line[i]) { - curr_segment.line_sum = m_curr_task->m_a_points[i]; - } else { - curr_segment.triangle_sum = m_curr_task->m_a_points[i]; - } - curr_segment.m_nof_received_sums++; - - // Check if this was the last addition in the segment - if (curr_segment.m_idx_in_segment < 0) { - done_segments++; - continue; - } - // Otherwise check if it is possible to assign new additions: - // Triangle sum is dependent on the 2 previous sums (line and triangle) - so check if 2 sums were received. - // Line sum (if not the last one in the segment). - - // Due to the choice of num segments being less than half of total tasks there ought to be an idle task - // for the line sum. 
- if (curr_segment.m_nof_received_sums == 2) { - curr_segment.m_nof_received_sums = 0; - m_curr_task->set_phase2_triangle_addition(curr_segment.line_sum, &(curr_segment.triangle_sum)); - curr_segment.m_idx_in_segment--; - - if (curr_segment.m_idx_in_segment >= 0) { - int bkt_idx = curr_segment.m_segment_mem_start + curr_segment.m_idx_in_segment; - if (m_bkts_occupancy[bkt_idx]) { - int return_idx = m_curr_task->m_return_idx[i]; - line_task->set_phase2_line_addition(curr_segment.line_sum, &m_buckets[bkt_idx], return_idx); - } else { - // curr_segment.m_nof_received_sums++; - int return_idx = m_curr_task->m_return_idx[i]; - line_task->set_phase2_addition_by_value(curr_segment.line_sum, P::zero(), return_idx); - } // No need to add a zero - just increase nof_received_sums. - } + // A single thread functionality + void worker_collapse_all_workers_result(const int bucket_start, const int nof_buckets) + { + // run over all buckets under my responsibility. + for (int bucket_i = bucket_start; bucket_i < bucket_start + nof_buckets; bucket_i++) { + bool worker_0_bucket_busy = m_workers_buckets_busy[0][bucket_i]; + + // run over all workers + for (int worker_i = 1; worker_i < m_nof_workers; worker_i++) { + if (m_workers_buckets_busy[worker_i][bucket_i]) { + // add busy buckets to worker_0's bucket + m_workers_buckets[0][bucket_i].point = + worker_0_bucket_busy ? m_workers_buckets[0][bucket_i].point + m_workers_buckets[worker_i][bucket_i].point + : // TBD add inplace + m_workers_buckets[worker_i][bucket_i].point; + worker_0_bucket_busy = true; } } - // Check if tri and line task haven't been dispatched due to not enough inputs - dispatch them - m_curr_task->dispatch_if_not_empty(); - if (line_task) { line_task->dispatch_if_not_empty(); } + // if the bucket is empty for all workers, reset it. 
+ if (!worker_0_bucket_busy) { m_workers_buckets[0][bucket_i].point = P::zero(); } } } -} -template -void Msm::phase2_setup(std::vector& segments) -{ - // Init values of partial (line) and total (triangle) sums - for (int i = 0; i < m_num_bms; i++) { - for (int j = 0; j < m_num_bm_segments - 1; j++) { - BmSumSegment& segment = segments[m_num_bm_segments * i + j]; - int bkt_idx = m_num_bkts * i + m_segment_size * (j + 1); - if (m_bkts_occupancy[bkt_idx]) { - segment.triangle_sum = m_buckets[bkt_idx]; - segment.line_sum = m_buckets[bkt_idx]; - } else { - segment.triangle_sum = P::zero(); - segment.line_sum = P::zero(); - } - segment.m_idx_in_segment = m_segment_size - 2; - segment.m_segment_mem_start = m_num_bkts * i + m_segment_size * j + 1; - } - - // The most significant bucket of every bm is stored in address 0 - - // so the last tri/line sums will be initialized to bucket[0] - BmSumSegment& segment = segments[m_num_bm_segments * (i + 1) - 1]; - int bkt_idx = m_num_bkts * i; - if (m_bkts_occupancy[bkt_idx]) { - segment.triangle_sum = m_buckets[bkt_idx]; - segment.line_sum = m_buckets[bkt_idx]; - } else { - segment.triangle_sum = P::zero(); - segment.line_sum = P::zero(); + // phase 2: accumulate m_segment_size buckets into a line_sum and triangle_sum + void phase2_collapse_segments() + { + uint64_t bucket_start = 0; + for (int segment_idx = 0; segment_idx < m_segments.size(); + segment_idx++) { // TBD: divide the work among m_nof_workers only. 
+ // Each thread is responsible for a sinืขle thread + m_taskflow.emplace([=]() { + const uint32_t segment_size = std::min(m_nof_total_buckets - bucket_start, (uint64_t)m_segment_size); + worker_collapse_segment(m_segments[segment_idx], bucket_start, segment_size); + }); + bucket_start += m_segment_size; } - segment.m_idx_in_segment = m_segment_size - 2; - segment.m_segment_mem_start = m_num_bkts * (i + 1) - m_segment_size + 1; + run_workers_and_wait(); } -} -template -void Msm::phase3_final_accumulator(std::vector& segments, int idx_in_batch, P* result) -{ - // If it isn't the last MSM in the batch - run phase 3 on a separate thread to start utilizing the tasks manager on - // the next phase 1. - if (idx_in_batch == m_batch_size - 1) { - phase3_thread(segments, result); - } else { - m_phase3_threads[idx_in_batch] = std::thread(&Msm::phase3_thread, this, std::move(segments), result); + // single worker task - accumulate a single segment + void worker_collapse_segment(Segment& segment, const int64_t bucket_start, const uint32_t segment_size) + { + // Assumption all buyckets are busy - no need to look at the busy flag + std::vector& buckets = m_workers_buckets[0]; + const int64_t last_bucket_i = bucket_start + segment_size; + const int64_t init_bucket_i = (last_bucket_i % m_bm_size) ? last_bucket_i + : // bucket 0 at the BM contains the last element + last_bucket_i - m_bm_size; + segment.triangle_sum = init_bucket_i < buckets.size() ? 
buckets[init_bucket_i].point : P::zero(); + segment.line_sum = segment.triangle_sum; + for (int64_t bucket_i = last_bucket_i - 1; bucket_i > bucket_start; bucket_i--) { + // Add the bucket to line sum + segment.line_sum = segment.line_sum + buckets[bucket_i].point; // TBD: inplace + // Add line_sum to triangle_sum + segment.triangle_sum = segment.triangle_sum + segment.line_sum; // TBD: inplace + } } -} -template -void Msm::phase3_thread(std::vector segments, P* result) -{ - for (int i = 0; i < m_num_bms; i++) { - // Weighted sum of all the lines for each bm - summed in a similar fashion of the triangle sum of phase 2 - P partial_sum = segments[m_num_bm_segments * (i + 1) - 1].line_sum; - P total_sum = segments[m_num_bm_segments * (i + 1) - 1].line_sum; - for (int j = m_num_bm_segments - 2; j > 0; j--) { - partial_sum = partial_sum + segments[m_num_bm_segments * i + j].line_sum; - total_sum = total_sum + partial_sum; - } - segments[m_num_bm_segments * i].line_sum = total_sum; + // Serial phase - accumulate all segments to final result + void phase3_final_accumulator(P* result) + { + const int nof_segments_per_bm = m_bm_size / m_segment_size; + int nof_segments_left = m_segments.size(); + for (int i = 0; i < m_nof_buckets_module; i++) { + const int cur_bm_nof_segments = std::min(nof_segments_per_bm, nof_segments_left); + const int log_nof_segments_per_bm = + std::log2(nof_segments_per_bm); // TBD: avoid logn. can be calculated differently. + // Weighted sum of all the lines for each bm - summed in a similar fashion of the triangle sum of phase 2 + P partial_sum = nof_segments_per_bm * (i + 1) - 1 < m_segments.size() + ? 
m_segments[nof_segments_per_bm * (i + 1) - 1].line_sum + : P::zero(); + P total_sum = partial_sum; + + // run over all segments in the BM + for (int j = cur_bm_nof_segments - 2; j > 0; j--) { + // accumulate the partial sum + partial_sum = partial_sum + m_segments[nof_segments_per_bm * i + j].line_sum; + // add the partial sum to the total sum + total_sum = total_sum + partial_sum; + } + m_segments[nof_segments_per_bm * i].line_sum = total_sum; - // Convert weighted lines sum to rectangles sum by doubling - int num_doubles = m_c - 1 - m_log_num_segments; - for (int k = 0; k < num_doubles; k++) { - segments[m_num_bm_segments * i].line_sum = P::dbl(segments[m_num_bm_segments * i].line_sum); - } + // Convert weighted lines sum to rectangles sum by doubling + int num_doubles = m_c - 1 - log_nof_segments_per_bm; + for (int k = 0; k < num_doubles; k++) { + m_segments[nof_segments_per_bm * i].line_sum = P::dbl(m_segments[nof_segments_per_bm * i].line_sum); + } - // Sum triangles within bm linearly - for (int j = 1; j < m_num_bm_segments; j++) { - segments[m_num_bm_segments * i].triangle_sum = - segments[m_num_bm_segments * i].triangle_sum + segments[m_num_bm_segments * i + j].triangle_sum; - } + // Sum triangles within bm linearly + for (int j = 1; j < cur_bm_nof_segments && nof_segments_per_bm * i + j < m_segments.size(); j++) { + m_segments[nof_segments_per_bm * i].triangle_sum = + m_segments[nof_segments_per_bm * i].triangle_sum + m_segments[nof_segments_per_bm * i + j].triangle_sum; + } - // After which add the lines and triangle sums to one sum of the entire BM - if (m_num_bm_segments > 1) { - segments[m_num_bm_segments * i].triangle_sum = - segments[m_num_bm_segments * i].triangle_sum + segments[m_num_bm_segments * i].line_sum; + // After which add the lines and triangle sums to one sum of the entire BM + if (cur_bm_nof_segments > 1) { + m_segments[nof_segments_per_bm * i].triangle_sum = + m_segments[nof_segments_per_bm * i].triangle_sum + 
m_segments[nof_segments_per_bm * i].line_sum; + } + nof_segments_left -= nof_segments_per_bm; } - } - // Sum BM sums together - P final_sum = segments[(m_num_bms - 1) * m_num_bm_segments].triangle_sum; - for (int i = m_num_bms - 2; i >= 0; i--) { - // Multiply by the BM digit factor 2^c - i.e. c doublings - for (int j = 0; j < m_c; j++) { - final_sum = P::dbl(final_sum); + // Sum BM sums together + *result = m_segments[(m_nof_buckets_module - 1) * nof_segments_per_bm].triangle_sum; + for (int i = m_nof_buckets_module - 2; i >= 0; i--) { + // Multiply by the BM digit factor 2^c - i.e. c doublings + for (int j = 0; j < m_c; j++) { + *result = P::dbl(*result); + } + *result = *result + m_segments[nof_segments_per_bm * i].triangle_sum; } - final_sum = final_sum + segments[m_num_bm_segments * i].triangle_sum; } - *result = final_sum; -} - -// None class functions below: +}; // end of class Msm /** * @brief Super function that handles the Msm class to calculate a MSM. @@ -732,23 +375,11 @@ template eIcicleError cpu_msm( const Device& device, const scalar_t* scalars, const A* bases, int msm_size, const MSMConfig& config, P* results) { - int c = config.c; - if (c < 1) { c = Msm::get_optimal_c(msm_size, config.precompute_factor); } - - int nof_threads = std::thread::hardware_concurrency() - 1; - if (config.ext && config.ext->has(CpuBackendConfig::CPU_NOF_THREADS)) { - nof_threads = config.ext->get(CpuBackendConfig::CPU_NOF_THREADS); - } - if (nof_threads <= 0) { - ICICLE_LOG_WARNING << "Unable to detect number of hardware supported threads - fixing it to 1\n"; - nof_threads = 1; - } - auto msm = Msm{config, c, nof_threads}; - - for (int i = 0; i < config.batch_size; i++) { - int batch_start_idx = msm_size * i; - int bases_start_idx = config.are_points_shared_in_batch ? 
0 : batch_start_idx; - msm.run_msm(&scalars[batch_start_idx], &bases[bases_start_idx], msm_size, i, &results[i]); + Msm msm(msm_size, config); + for (int batch_i = 0; batch_i < config.batch_size; batch_i++) { + const int batch_start_idx = msm_size * batch_i; + const int bases_start_idx = config.are_points_shared_in_batch ? 0 : batch_start_idx; + msm.run_msm(&scalars[batch_start_idx], &bases[bases_start_idx], &results[batch_i]); } return eIcicleError::SUCCESS; } @@ -771,17 +402,17 @@ eIcicleError cpu_msm_precompute_bases( const MSMConfig& config, A* output_bases) // Pre assigned? { - int c = config.c; - if (c < 1) { c = Msm::get_optimal_c(nof_bases, config.precompute_factor); } + int c = Msm::get_optimal_c(nof_bases, config); - int precompute_factor = config.precompute_factor; - bool is_mont = config.are_points_montgomery_form; - const unsigned int num_bms_no_precomp = (scalar_t::NBITS - 1) / c + 1; + const int precompute_factor = config.precompute_factor; + const bool is_mont = config.are_points_montgomery_form; + const uint scalar_size = scalar_t::NBITS; // TBD handle this config.bitsize != 0 ? config.bitsize : scalar_t::NBITS; + const unsigned int num_bms_no_precomp = (scalar_size - 1) / c + 1; const unsigned int shift = c * ((num_bms_no_precomp - 1) / precompute_factor + 1); for (int i = 0; i < nof_bases; i++) { output_bases[precompute_factor * i] = input_bases[i]; P point = P::from_affine(is_mont ? 
A::from_montgomery(input_bases[i]) : input_bases[i]); - for (int j = 1; j < precompute_factor; j++) { + for (int j = 1; j < precompute_factor; j++) { // TBD parallelize this for (int k = 0; k < shift; k++) { point = P::dbl(point); } From d706320c9d74a24a732cb08efb1cc79227fd7fb1 Mon Sep 17 00:00:00 2001 From: Leon Hibnik <107353745+LeonHibnik@users.noreply.github.com> Date: Mon, 13 Jan 2025 09:56:16 +0200 Subject: [PATCH 06/12] Fix/release script (#721) --- .github/workflows/release.yml | 4 +++- scripts/release/Dockerfile.ubi8 | 1 + scripts/release/Dockerfile.ubi9 | 1 + scripts/release/Dockerfile.ubuntu20 | 1 + scripts/release/Dockerfile.ubuntu22 | 1 + scripts/release/build_release_and_tar.sh | 2 +- 6 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index f9f58318f3..9886b5635f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -64,7 +64,9 @@ jobs: npm run docusaurus docs:version $LATEST_VERSION git add --all git commit -m "Bump docs version" - git push + - name: Push to github branch main + run: | + git push origin main --tags - name: Create draft release env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/scripts/release/Dockerfile.ubi8 b/scripts/release/Dockerfile.ubi8 index b12519fa57..3026099939 100644 --- a/scripts/release/Dockerfile.ubi8 +++ b/scripts/release/Dockerfile.ubi8 @@ -10,6 +10,7 @@ RUN dnf update -y && dnf install -y \ ninja-build \ wget \ gnupg2 \ + git \ && dnf clean all # Add the RPM-based LLVM repository for Clang diff --git a/scripts/release/Dockerfile.ubi9 b/scripts/release/Dockerfile.ubi9 index ad8528b9b4..605df5d1b7 100644 --- a/scripts/release/Dockerfile.ubi9 +++ b/scripts/release/Dockerfile.ubi9 @@ -14,6 +14,7 @@ RUN dnf update -y && dnf install -y \ gcc-c++ \ make \ gnupg \ + git \ && dnf clean all # Add LLVM repository for Clang installation diff --git a/scripts/release/Dockerfile.ubuntu20 b/scripts/release/Dockerfile.ubuntu20 
index 20d8024536..2f307275ca 100644 --- a/scripts/release/Dockerfile.ubuntu20 +++ b/scripts/release/Dockerfile.ubuntu20 @@ -22,6 +22,7 @@ RUN apt-get update && apt-get install -y \ clang \ lldb \ lld \ + git \ && apt-get clean && rm -rf /var/lib/apt/lists/* # Verify installations diff --git a/scripts/release/Dockerfile.ubuntu22 b/scripts/release/Dockerfile.ubuntu22 index 0613edc0ad..5bdc103808 100644 --- a/scripts/release/Dockerfile.ubuntu22 +++ b/scripts/release/Dockerfile.ubuntu22 @@ -13,6 +13,7 @@ RUN apt-get update && apt-get install -y \ software-properties-common \ wget \ gnupg \ + git \ && wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - \ && add-apt-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy main" diff --git a/scripts/release/build_release_and_tar.sh b/scripts/release/build_release_and_tar.sh index 2ea4648744..ca058b2a9c 100755 --- a/scripts/release/build_release_and_tar.sh +++ b/scripts/release/build_release_and_tar.sh @@ -8,7 +8,7 @@ ICICLE_OS=${2:-unknown_os} # Default to "unknown_os" if not set ICICLE_CUDA_VERSION=${3:-cuda_unknown} # Default to "cuda_unknown" if not set # List of fields and curves -fields=("babybear" "stark252" "m31") +fields=("babybear" "stark252" "m31" "koalabear") curves=("bn254" "bls12_381" "bls12_377" "bw6_761" "grumpkin") cd / From b27da85bdfa398f27340d44c807e4bb9d54f09d4 Mon Sep 17 00:00:00 2001 From: yshekel Date: Mon, 13 Jan 2025 09:59:52 +0200 Subject: [PATCH 07/12] Fix bug in CPU vec ops regarding nof workers (#731) --- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 25 ++++++++++---------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index f279c7b034..496621c312 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -371,7 +371,7 @@ class VectorOpTask : public TaskBase public: T m_intermidiate_res; // pointer to the 
output. Can be a vector or scalar pointer uint64_t m_idx_in_batch; // index in the batch. Used in intermediate res tasks -}; // class VectorOpTask +}; #define NOF_OPERATIONS_PER_TASK 512 #define CONFIG_NOF_THREADS_KEY "n_threads" @@ -381,8 +381,9 @@ int get_nof_workers(const VecOpsConfig& config) { if (config.ext && config.ext->has(CONFIG_NOF_THREADS_KEY)) { return config.ext->get(CONFIG_NOF_THREADS_KEY); } - int hw_threads = std::thread::hardware_concurrency(); - return ((hw_threads > 1) ? hw_threads - 1 : 1); // reduce 1 for the main + const int hw_threads = std::thread::hardware_concurrency(); + // Note: no need to account for the main thread in vec-ops since it's doing little work + return std::max(1, hw_threads); } // Execute a full task from the type vector = vector (op) vector @@ -390,7 +391,7 @@ template eIcicleError cpu_2vectors_op(VecOperation op, const T* vec_a, const U* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config) - 1); + TasksManager> task_manager(get_nof_workers(config)); const uint64_t total_nof_operations = size * config.batch_size; for (uint64_t i = 0; i < total_nof_operations; i += NOF_OPERATIONS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); @@ -406,7 +407,7 @@ template eIcicleError cpu_scalar_vector_op( VecOperation op, const T* scalar_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config) - 1); + TasksManager> task_manager(get_nof_workers(config)); const uint64_t total_nof_operations = size; const uint32_t stride = config.columns_batch ? 
config.batch_size : 1; for (uint32_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { @@ -479,7 +480,7 @@ template eIcicleError cpu_convert_montgomery( const Device& device, const T* input, uint64_t size, bool is_to_montgomery, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config) - 1); + TasksManager> task_manager(get_nof_workers(config)); const uint64_t total_nof_operations = size * config.batch_size; for (uint64_t i = 0; i < total_nof_operations; i += NOF_OPERATIONS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); @@ -499,7 +500,7 @@ REGISTER_CONVERT_MONTGOMERY_BACKEND("CPU", cpu_convert_montgomery); template eIcicleError cpu_vector_sum(const Device& device, const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config) - 1); + TasksManager> task_manager(get_nof_workers(config)); std::vector output_initialized = std::vector(config.batch_size, false); uint64_t vec_a_offset = 0; uint64_t idx_in_batch = 0; @@ -539,7 +540,7 @@ template eIcicleError cpu_vector_product(const Device& device, const T* vec_a, uint64_t size, const VecOpsConfig& config, T* output) { - TasksManager> task_manager(get_nof_workers(config) - 1); + TasksManager> task_manager(get_nof_workers(config)); std::vector output_initialized = std::vector(config.batch_size, false); uint64_t vec_a_offset = 0; uint64_t idx_in_batch = 0; @@ -610,7 +611,7 @@ template eIcicleError out_of_place_matrix_transpose( const Device& device, const T* mat_in, uint32_t nof_rows, uint32_t nof_cols, const VecOpsConfig& config, T* mat_out) { - TasksManager> task_manager(get_nof_workers(config) - 1); + TasksManager> task_manager(get_nof_workers(config)); uint32_t stride = config.columns_batch ? 
config.batch_size : 1; const uint64_t total_elements_one_mat = static_cast(nof_rows) * nof_cols; const uint32_t NOF_ROWS_PER_TASK = @@ -695,7 +696,7 @@ eIcicleError matrix_transpose_necklaces( std::vector start_indices_in_mat; // Collect start indices gen_necklace(1, 1, k, length, necklace, start_indices_in_mat); - TasksManager> task_manager(get_nof_workers(config) - 1); + TasksManager> task_manager(get_nof_workers(config)); for (uint64_t i = 0; i < start_indices_in_mat.size(); i += max_nof_operations) { uint64_t nof_operations = std::min((uint64_t)max_nof_operations, start_indices_in_mat.size() - i); for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { @@ -746,7 +747,7 @@ cpu_bit_reverse(const Device& device, const T* vec_in, uint64_t size, const VecO ICICLE_ASSERT((1ULL << logn) == size) << "Invalid argument - size is not a power of 2"; // Perform the bit reverse - TasksManager> task_manager(get_nof_workers(config) - 1); + TasksManager> task_manager(get_nof_workers(config)); for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { for (uint64_t i = 0; i < size; i += NOF_OPERATIONS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); @@ -783,7 +784,7 @@ eIcicleError cpu_slice( ICICLE_ASSERT(vec_in != nullptr && vec_out != nullptr) << "Error: Invalid argument - input or output vector is null"; ICICLE_ASSERT(offset + (size_out - 1) * stride < size_in) << "Error: Invalid argument - slice out of bound"; - TasksManager> task_manager(get_nof_workers(config) - 1); + TasksManager> task_manager(get_nof_workers(config)); for (uint64_t idx_in_batch = 0; idx_in_batch < config.batch_size; idx_in_batch++) { for (uint64_t i = 0; i < size_out; i += NOF_OPERATIONS_PER_TASK) { VectorOpTask* task_p = task_manager.get_idle_or_completed_task(); From a751555f0cfb9f68b78c1fe8357f882072ecf62f Mon Sep 17 00:00:00 2001 From: yshekel Date: Tue, 14 Jan 2025 13:10:44 +0200 Subject: [PATCH 08/12] Support android 
and vulkan (#735) - Update Cmake for android cross compilation - Update log.h for android logcat --- icicle/CMakeLists.txt | 42 +--------------- icicle/cmake/curve.cmake | 2 +- icicle/cmake/field.cmake | 2 +- icicle/cmake/setup.cmake | 82 +++++++++++++++++++++++++++++++ icicle/include/icicle/utils/log.h | 44 +++++++++++++++-- 5 files changed, 125 insertions(+), 47 deletions(-) create mode 100644 icicle/cmake/setup.cmake diff --git a/icicle/CMakeLists.txt b/icicle/CMakeLists.txt index 246215676f..45d0e3db71 100644 --- a/icicle/CMakeLists.txt +++ b/icicle/CMakeLists.txt @@ -1,59 +1,23 @@ cmake_minimum_required(VERSION 3.18) +include(cmake/setup.cmake) + project(icicle) # Specify the C++ standard set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED True) -# Select the C++ compiler -find_program(CLANG_COMPILER clang++) -find_program(CLANG_C_COMPILER clang) - -if(CLANG_COMPILER AND CLANG_C_COMPILER) - set(CMAKE_CXX_COMPILER ${CLANG_COMPILER} CACHE STRING "Clang++ compiler" FORCE) - set(CMAKE_C_COMPILER ${CLANG_C_COMPILER} CACHE STRING "Clang compiler" FORCE) - message(STATUS "Using Clang++ as the C++ compiler: ${CLANG_COMPILER}") - message(STATUS "Using Clang as the C compiler: ${CLANG_C_COMPILER}") -else() - message(WARNING "ICICLE CPU works best with clang++ and clang. Defaulting to ${CLANG_COMPILER}") -endif() - - include(cmake/field.cmake) include(cmake/curve.cmake) include(cmake/hash.cmake) -# Set the default build type to Release if not specified -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build: Debug, Release, RelWithDebInfo, MinSizeRel." 
FORCE) -endif() - -# Print the selected build type -message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") - # Prevent build if both SANITIZE and CUDA_BACKEND are enabled if(SANITIZE AND CUDA_BACKEND) message(FATAL_ERROR "Address sanitizer and Cuda cannot be enabled at the same time.") endif() -# Find the ccache program -find_program(CCACHE_PROGRAM ccache) -# If ccache is found, use it as the compiler launcher -if(CCACHE_PROGRAM) - message(STATUS "ccache found: ${CCACHE_PROGRAM}") - - # Use ccache for C and C++ compilers - set(CMAKE_C_COMPILER_LAUNCHER ${CCACHE_PROGRAM}) - set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_PROGRAM}) - set(CMAKE_CUDA_COMPILER_LAUNCHER ${CCACHE_PROGRAM}) -else() - message(STATUS "ccache not found") -endif() - -set(CMAKE_POSITION_INDEPENDENT_CODE ON) - # Build options option(BUILD_TESTS "Build unit test2s. Default=OFF" OFF) # Backends: typically CPU is built into the frontend, the rest are DSOs loaded at runtime from installation @@ -87,8 +51,6 @@ add_library(icicle_device SHARED src/runtime.cpp src/config_extension.cpp ) -target_link_libraries(icicle_device PUBLIC dl) - include_directories(include) # Define the install directory (default is /usr/local) diff --git a/icicle/cmake/curve.cmake b/icicle/cmake/curve.cmake index c82d1b90b0..53148314ce 100644 --- a/icicle/cmake/curve.cmake +++ b/icicle/cmake/curve.cmake @@ -58,7 +58,7 @@ function(setup_curve_target CURVE CURVE_INDEX FEATURES_STRING) # Add additional feature handling calls here set_target_properties(icicle_curve PROPERTIES OUTPUT_NAME "icicle_curve_${CURVE}") - target_link_libraries(icicle_curve PUBLIC icicle_device icicle_field pthread) + target_link_libraries(icicle_curve PUBLIC icicle_device icicle_field) # Ensure CURVE is defined in the cache for backends to see set(CURVE "${CURVE}" CACHE STRING "") diff --git a/icicle/cmake/field.cmake b/icicle/cmake/field.cmake index 953c0d6fca..b7864c869a 100644 --- a/icicle/cmake/field.cmake +++ b/icicle/cmake/field.cmake @@ -56,7 +56,7 @@ 
function(setup_field_target FIELD FIELD_INDEX FEATURES_STRING) # Add additional feature handling calls here set_target_properties(icicle_field PROPERTIES OUTPUT_NAME "icicle_field_${FIELD}") - target_link_libraries(icicle_field PUBLIC icicle_device pthread) + target_link_libraries(icicle_field PUBLIC icicle_device) # Ensure FIELD is defined in the cache for backends to see set(FIELD "${FIELD}" CACHE STRING "") diff --git a/icicle/cmake/setup.cmake b/icicle/cmake/setup.cmake new file mode 100644 index 0000000000..c60d5330e6 --- /dev/null +++ b/icicle/cmake/setup.cmake @@ -0,0 +1,82 @@ + +# Option to cross-compile for Android +option(BUILD_FOR_ANDROID "Cross-compile for Android" OFF) + +if (BUILD_FOR_ANDROID) + message(STATUS "Configuring for Android...") + + # Check for NDK in the environment variable + if (NOT DEFINED ENV{ANDROID_NDK} AND NOT DEFINED ANDROID_NDK) + message(FATAL_ERROR "ANDROID_NDK is not defined. Please set the environment variable or pass -DANDROID_NDK=") + endif() + + # Use the CMake option if specified; otherwise, use the environment variable + if (DEFINED ANDROID_NDK) + set(CMAKE_ANDROID_NDK ${ANDROID_NDK}) + elseif (DEFINED ENV{ANDROID_NDK}) + set(CMAKE_ANDROID_NDK $ENV{ANDROID_NDK}) + endif() + + # Debugging message for NDK path + message(STATUS "Using Android NDK: ${CMAKE_ANDROID_NDK}") + + # Set toolchain and other options + set(ANDROID_MIN_API 24) # Minimum API (24 is for android 7.0 and later) + set(CMAKE_SYSTEM_NAME Android CACHE STRING "Target system name for cross-compilation") + set(ANDROID_ABI arm64-v8a CACHE STRING "Default Android ABI") + set(ANDROID_PLATFORM "android-${ANDROID_MIN_API}" CACHE STRING "Android API level") + set(CMAKE_ANDROID_ARCH_ABI "${ANDROID_ABI}" CACHE STRING "Target ABI for Android") + set(CMAKE_ANDROID_STL_TYPE c++_shared CACHE STRING "Android STL type") + set(CMAKE_TOOLCHAIN_FILE "${CMAKE_ANDROID_NDK}/build/cmake/android.toolchain.cmake" CACHE FILEPATH "Path to the Android toolchain file") + list(APPEND 
CMAKE_SYSTEM_LIBRARY_PATH "${CMAKE_ANDROID_NDK}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android/${ANDROID_MIN_API}") + + message(STATUS "Using ANDROID_MIN_API: ${ANDROID_MIN_API}") + message(STATUS "Using ANDROID_ABI: ${ANDROID_ABI}") + +endif() + +# Platform specific libraries and compiler +if (CMAKE_SYSTEM_NAME STREQUAL "Android") + find_library(LOG_LIB log REQUIRED) # Android log library + set(PLATFORM_LIBS ${LOG_LIB}) +else() + message(STATUS "Configuring for native platform...") + # Select the C++ compiler + find_program(CLANG_COMPILER clang++) + find_program(CLANG_C_COMPILER clang) + + if(CLANG_COMPILER AND CLANG_C_COMPILER) + set(CMAKE_CXX_COMPILER ${CLANG_COMPILER} CACHE STRING "Clang++ compiler" FORCE) + set(CMAKE_C_COMPILER ${CLANG_C_COMPILER} CACHE STRING "Clang compiler" FORCE) + else() + message(WARNING "ICICLE CPU works best with clang++ and clang. Defaulting to ${CLANG_COMPILER}") + endif() + + set(PLATFORM_LIBS pthread dl) +endif() + +link_libraries(${PLATFORM_LIBS}) + +# Find the ccache program +find_program(CCACHE_PROGRAM ccache) +# If ccache is found, use it as the compiler launcher +if(CCACHE_PROGRAM) + message(STATUS "ccache found: ${CCACHE_PROGRAM}") + + # Use ccache for C and C++ compilers + set(CMAKE_C_COMPILER_LAUNCHER ${CCACHE_PROGRAM}) + set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_PROGRAM}) + set(CMAKE_CUDA_COMPILER_LAUNCHER ${CCACHE_PROGRAM}) +else() + message(STATUS "ccache not found") +endif() + +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +# Set the default build type to Release if not specified +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build: Debug, Release, RelWithDebInfo, MinSizeRel." 
FORCE) +endif() + +# Print the selected build type +message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") \ No newline at end of file diff --git a/icicle/include/icicle/utils/log.h b/icicle/include/icicle/utils/log.h index f8e0af490b..62ae869f66 100644 --- a/icicle/include/icicle/utils/log.h +++ b/icicle/include/icicle/utils/log.h @@ -3,6 +3,10 @@ #include #include +#ifdef __ANDROID__ + #include +#endif + #define ICICLE_LOG_VERBOSE Log(Log::Verbose) #define ICICLE_LOG_DEBUG Log(Log::Debug) #define ICICLE_LOG_INFO Log(Log::Info) @@ -21,7 +25,16 @@ class Log ~Log() { - if (level >= s_min_log_level) { std::cerr << oss.str() << std::endl; } + if (level >= s_min_log_level) { +#ifdef __ANDROID__ + // Use Android logcat + android_LogPriority androidPriority = logLevelToAndroidPriority(level); + __android_log_print(androidPriority, "ICICLE", "%s", oss.str().c_str()); +#else + // Use standard error stream for other platforms + std::cerr << oss.str() << std::endl; +#endif + } } template @@ -31,7 +44,7 @@ class Log return *this; } - // Static method to set the log level + // Static method to set the minimum log level static void set_min_log_level(eLogLevel level) { s_min_log_level = level; } private: @@ -43,7 +56,7 @@ class Log { switch (level) { case Verbose: - return "DEBUG"; + return "VERBOSE"; case Debug: return "DEBUG"; case Info: @@ -57,7 +70,28 @@ class Log } } - // logging message with level>=s_min_log_level +#ifdef __ANDROID__ + // Map custom log level to Android log priority + android_LogPriority logLevelToAndroidPriority(eLogLevel level) const + { + switch (level) { + case Verbose: + return ANDROID_LOG_VERBOSE; + case Debug: + return ANDROID_LOG_DEBUG; + case Info: + return ANDROID_LOG_INFO; + case Warning: + return ANDROID_LOG_WARN; + case Error: + return ANDROID_LOG_ERROR; + default: + return ANDROID_LOG_UNKNOWN; + } + } +#endif + + // Static member to hold the minimum log level #if defined(NDEBUG) static inline eLogLevel s_min_log_level = eLogLevel::Info; #else @@ 
-65,4 +99,4 @@ class Log #endif // Note: for verbose, need to explicitly call `set_min_log_level(eLogLevel::Verbose)` -}; +}; \ No newline at end of file From c10aa35f3cbf5685029e7c01bbe2c7b1141204dd Mon Sep 17 00:00:00 2001 From: Miki <100796045+mickeyasa@users.noreply.github.com> Date: Tue, 14 Jan 2025 13:19:50 +0200 Subject: [PATCH 09/12] Parallelize-vecop-program-execution (#736) --- icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 39 ++++++++++++++------ 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index 496621c312..38d859e022 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -10,6 +10,7 @@ #include #include +#include "taskflow/taskflow.hpp" #include "icicle/program/program.h" #include "cpu_program_executor.h" @@ -852,20 +853,36 @@ eIcicleError cpu_execute_program( << " parameters"; return eIcicleError::INVALID_ARGUMENT; } + tf::Taskflow taskflow; // Accumulate tasks + tf::Executor executor; // execute all tasks accumulated on multiple threads const uint64_t total_nof_operations = size * config.batch_size; - CpuProgramExecutor prog_executor(program); - // init prog_executor to point to data vectors - for (int param_idx = 0; param_idx < program.m_nof_parameters; ++param_idx) { - prog_executor.m_variable_ptrs[param_idx] = data[param_idx]; - } - // run over all elements in the arrays and execute the program - for (uint64_t i = 0; i < total_nof_operations; i++) { - prog_executor.execute(); - for (int param_idx = 0; param_idx < program.m_nof_parameters; ++param_idx) { - (prog_executor.m_variable_ptrs[param_idx])++; - } + // Divide the problem to workers + const int nof_workers = get_nof_workers(config); + const uint64_t worker_task_size = (total_nof_operations + nof_workers - 1) / nof_workers; // round up + + for (uint64_t start_idx = 0; start_idx < total_nof_operations; start_idx += 
worker_task_size) { + taskflow.emplace([=]() { + CpuProgramExecutor prog_executor(program); + // init prog_executor to point to data vectors + for (int param_idx = 0; param_idx < program.m_nof_parameters; ++param_idx) { + prog_executor.m_variable_ptrs[param_idx] = &(data[param_idx][start_idx]); + } + + const uint64_t task_size = std::min(worker_task_size, total_nof_operations - start_idx); + // run over all task elements in the arrays and execute the program + for (uint64_t i = 0; i < task_size; i++) { + prog_executor.execute(); + // update the program pointers + for (int param_idx = 0; param_idx < program.m_nof_parameters; ++param_idx) { + (prog_executor.m_variable_ptrs[param_idx])++; + } + } + }); } + + executor.run(taskflow).wait(); + taskflow.clear(); return eIcicleError::SUCCESS; } From eab9c508a240e348bcf0b1f030c40dea4aeb2529 Mon Sep 17 00:00:00 2001 From: idanfr-ingo Date: Tue, 14 Jan 2025 13:55:38 +0200 Subject: [PATCH 10/12] Create docs for program & program execution (#722) --- docs/docs/icicle/primitives/program.md | 88 ++++++++++++++++++++++++++ docs/docs/icicle/primitives/vec_ops.md | 15 +++++ docs/sidebars.ts | 5 ++ 3 files changed, 108 insertions(+) create mode 100644 docs/docs/icicle/primitives/program.md diff --git a/docs/docs/icicle/primitives/program.md b/docs/docs/icicle/primitives/program.md new file mode 100644 index 0000000000..85e5441883 --- /dev/null +++ b/docs/docs/icicle/primitives/program.md @@ -0,0 +1,88 @@ +# Programs + +## Overview + +Program is a class that let users define expressions on vector elements, and have ICICLE compile it for the backends for a fused implementation. This solves memory bottlenecks and also let users customize algorithms such as sumcheck. Program can create only element-wise lambda functions. + + +## C++ API + +### Symbol + +Symbol is the basic (template) class that allow users to define their own program. The lambda function the user define will operate on symbols. 
+ +### Defining lambda function + +To define a custom lambda function the user will use Symbol: +```cpp +void lambda_multi_result(std::vector>& vars) +{ + const Symbol& A = vars[0]; + const Symbol& B = vars[1]; + const Symbol& C = vars[2]; + const Symbol& EQ = vars[3]; + vars[4] = EQ * (A * B - C) + scalar_t::from(9); + vars[5] = A * B - C.inverse(); + vars[6] = vars[5]; + vars[3] = 2 * (var[0] + var[1]) // all variables can be both inputs and outputs +} +``` + +Each symbol element at the vector argument `var` represent an input or an output. The type of the symbol (`scalar_t` in this example) will be the type of the inputs and outputs. In this example we created a lambda function with four inputs and three outputs. + +In this example there are four input variables and three three outputs. Inside the function the user can define custom expressions on them. + +Program support few pre-defined programs. The user can use those pre-defined programs without creating a lambda function, as will be explained in the next section. + +### Creating program + +To execute the lambda function we just created we need to create a program from it. +To create program from lambda function we can use the following constructor: + +```cpp +Program(std::function>&)> program_func, const int nof_parameters) +``` + +`program_func` is the lambda function (in the example above `lambda_multi_result`) and `nof_parameters` is the total number of parameter (inputs + outputs) for the lambda (seven in the above example). + +#### Pre-defined programs + +As mentioned before, there are few pre-defined programs the user can use without the need to create a lambda function first. The enum `PreDefinedPrograms` contains the pre-defined program. Using pre-defined function will lead to better performance compared to creating the equivalent lambda function. 
+To create a pre-defined program a different constructor is bing used: + +```cpp +Program(PreDefinedPrograms pre_def) +``` + +`pre_def` is the pre-defined program (from `PreDefinedPrograms`). + +##### PreDefinedPrograms + +```cpp +enum PreDefinedPrograms { + AB_MINUS_C = 0, + EQ_X_AB_MINUS_C +}; +``` + +`AB_MINUS_C` - the pre-defined program `AB - C` for the input vectors `A`, `B` and `C` + +`EQ_X_AB_MINUS_C` - the pre-defined program `EQ(AB - C)` for the input vectors `A`, `B`, `C` and `EQ` + + +### Executing program + +To execute the program the `execute_program` function from the vector operation API should be used. This operation is supported by the CPU and CUDA backends. + + +```cpp +template +eIcicleError +execute_program(std::vector& data, const Program& program, uint64_t size, const VecOpsConfig& config); +``` + +The `data` vector is a vector of pointers to the inputs and output vectors, `program` is the program to execute, `size` is the length of the vectors and `config` is the configuration of the operation. + +For the configuration the field `is_a_on_device` determined whethere the data (*inputs and outputs*) is on device or not. After the execution `data` will reside in the same place as it did before (i.e. the field `is_result_on_device` is irrelevant.) + +> **_NOTE:_** Using program for executing lambdas is recommended only while using the CUDA backend. Program's primary use is to let users to customize algorithms (such as sumcheck). diff --git a/docs/docs/icicle/primitives/vec_ops.md b/docs/docs/icicle/primitives/vec_ops.md index 7f546dc16c..4725f7a490 100644 --- a/docs/docs/icicle/primitives/vec_ops.md +++ b/docs/docs/icicle/primitives/vec_ops.md @@ -78,6 +78,20 @@ template eIcicleError vector_div(const T* vec_a, const T* vec_b, uint64_t size, const VecOpsConfig& config, T* output); ``` +#### `execute_program` + +Execute a user-defined lambda function with arbitrary number of input and output variables. 
+ +```cpp +template +eIcicleError +execute_program(std::vector& data, const Program& program, uint64_t size, const VecOpsConfig& config); +``` + +`is_result_on_device` of VecOpsConfig is not used here. + +For more details see [program](./program.md). + #### `vector_accumulate` Adds vector b to a, inplace. @@ -208,6 +222,7 @@ template eIcicleError polynomial_division(const T* numerator, int64_t numerator_deg, const T* denumerator, int64_t denumerator_deg, const VecOpsConfig& config, T* q_out /*OUT*/, uint64_t q_size, T* r_out /*OUT*/, uint64_t r_size); ``` + ### Rust and Go bindings - [Golang](../golang-bindings/vec-ops.md) diff --git a/docs/sidebars.ts b/docs/sidebars.ts index 32f264d709..dcc9b77c0b 100644 --- a/docs/sidebars.ts +++ b/docs/sidebars.ts @@ -66,6 +66,11 @@ const cppApi = [ label: "Vector operations", id: "icicle/primitives/vec_ops", }, + { + type: "doc", + label: "Program", + id: "icicle/primitives/program", + }, { type: "doc", label: "Polynomials", From 488421bcb43704ec3e9228ad65ec2c9d71dfb964 Mon Sep 17 00:00:00 2001 From: Aviad Dahan <116077675+aviadingo@users.noreply.github.com> Date: Tue, 14 Jan 2025 14:59:51 +0200 Subject: [PATCH 11/12] Feat: Blake3 (#733) --- .github/workflows/cpp-golang-rust.yml | 111 ++-- docs/docs/icicle/libraries.md | 1 + docs/docs/icicle/primitives/hash.md | 9 +- examples/c++/best-practice-ntt/example.cpp | 4 +- icicle/backend/cpu/CMakeLists.txt | 4 + icicle/backend/cpu/src/field/cpu_vec_ops.cpp | 2 +- icicle/backend/cpu/src/hash/blake3.c | 591 ++++++++++++++++++ icicle/backend/cpu/src/hash/blake3.h | 57 ++ icicle/backend/cpu/src/hash/blake3_dispatch.c | 62 ++ icicle/backend/cpu/src/hash/blake3_impl.h | 205 ++++++ icicle/backend/cpu/src/hash/blake3_portable.c | 176 ++++++ icicle/backend/cpu/src/hash/cpu_blake3.cpp | 53 ++ icicle/backend/cpu/src/hash/cpu_poseidon2.cpp | 2 +- icicle/cmake/hash.cmake | 1 + .../icicle/backend/hash/blake3_backend.h | 25 + .../include/icicle/fields/quartic_extension.h | 2 +- 
icicle/include/icicle/fields/storage.h | 12 +- icicle/include/icicle/hash/blake3.h | 20 + icicle/src/hash/blake3.cpp | 17 + icicle/src/hash/hash_c_api.cpp | 14 + icicle/tests/test_hash_api.cpp | 73 +++ wrappers/golang/hash/blake3.go | 19 + wrappers/golang/hash/include/blake3.h | 17 + wrappers/golang/hash/tests/hash_test.go | 73 +++ wrappers/rust/icicle-hash/src/blake3.rs | 18 + wrappers/rust/icicle-hash/src/lib.rs | 1 + wrappers/rust/icicle-hash/src/tests.rs | 67 ++ 27 files changed, 1569 insertions(+), 67 deletions(-) create mode 100644 icicle/backend/cpu/src/hash/blake3.c create mode 100644 icicle/backend/cpu/src/hash/blake3.h create mode 100644 icicle/backend/cpu/src/hash/blake3_dispatch.c create mode 100644 icicle/backend/cpu/src/hash/blake3_impl.h create mode 100644 icicle/backend/cpu/src/hash/blake3_portable.c create mode 100644 icicle/backend/cpu/src/hash/cpu_blake3.cpp create mode 100644 icicle/include/icicle/backend/hash/blake3_backend.h create mode 100644 icicle/include/icicle/hash/blake3.h create mode 100644 icicle/src/hash/blake3.cpp create mode 100644 wrappers/golang/hash/blake3.go create mode 100644 wrappers/golang/hash/include/blake3.h create mode 100644 wrappers/rust/icicle-hash/src/blake3.rs diff --git a/.github/workflows/cpp-golang-rust.yml b/.github/workflows/cpp-golang-rust.yml index 11c9242c7a..09eda977d8 100644 --- a/.github/workflows/cpp-golang-rust.yml +++ b/.github/workflows/cpp-golang-rust.yml @@ -1,4 +1,4 @@ -name: Spell Check, C++/CUDA/Go/RUST & Examples +name: C++/CUDA/Go/RUST on: pull_request: @@ -31,6 +31,8 @@ jobs: name: Check Code Format runs-on: ubuntu-22.04 needs: check-changed-files + container: + image: silkeh/clang:19-bookworm # Replace with a container having the desired clang-format version steps: - name: Checkout uses: actions/checkout@v4 @@ -41,7 +43,9 @@ jobs: go-version: '1.22.0' - name: Check clang-format if: needs.check-changed-files.outputs.cpp == 'true' - run: ./scripts/format_all.sh . 
--check --exclude "build|wrappers" + run: | + clang-format --version + ./scripts/format_all.sh . --check --exclude "build|wrappers" - name: Check gofmt if: needs.check-changed-files.outputs.golang == 'true' run: if [[ $(go list ./... | xargs go fmt) ]]; then echo "Please run go fmt"; exit 1; fi @@ -82,7 +86,7 @@ jobs: - name: Checkout Repo uses: actions/checkout@v4 - name: Checkout CUDA Backend - if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' + if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' || needs.check-changed-files.outputs.rust == 'true' uses: actions/checkout@v4 with: repository: ingonyama-zk/icicle-cuda-backend @@ -90,7 +94,7 @@ jobs: ssh-key: ${{ secrets.CUDA_PULL_KEY }} ref: ${{ needs.extract-cuda-backend-branch.outputs.cuda-backend-branch }} - name: Get CUDA Backend Commit SHA - if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' + if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' || needs.check-changed-files.outputs.rust == 'true' working-directory: ./icicle/backend/cuda id: extract-cuda-sha run: | @@ -98,7 +102,7 @@ jobs: echo "CUDA Backend Commit SHA: $CUDA_BACKEND_SHA" echo "cuda-backend-sha=$CUDA_BACKEND_SHA" >> $GITHUB_OUTPUT - name: Set CUDA backend flag - if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' + if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' || needs.check-changed-files.outputs.rust == 'true' id: cuda-flag run: | CUDA_BACKEND_SHA=${{ steps.extract-cuda-sha.outputs.cuda-backend-sha }} @@ -136,33 +140,17 @@ jobs: echo "CMAKE_INSTALL_PREFIX=-DCMAKE_INSTALL_PREFIX=${INSTALL_PATH}" >> $GITHUB_OUTPUT echo "ICICLE_BACKEND_INSTALL_DIR=${INSTALL_PATH}/lib" >> $GITHUB_OUTPUT fi - - name: Setup go - if: 
needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' - timeout-minutes: 15 - uses: actions/setup-go@v5 - with: - go-version: '1.22.0' - name: Build curve working-directory: ./icicle - if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' + if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' || needs.check-changed-files.outputs.rust == 'true' run: | mkdir -p build && rm -rf build/* cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON -DCURVE=${{ matrix.curve.name }} ${{ matrix.curve.build_args }} ${{ steps.cuda-flag.outputs.CUDA_FLAG }} ${{ steps.cuda-flag.outputs.CMAKE_INSTALL_PREFIX }} -S . -B build cmake --build build --target install -j rm -rf ${{ steps.cuda-flag.outputs.INSTALL_PATH }}/lib/gh_commit_sha_${{ matrix.curve.name }}* touch ${{ steps.cuda-flag.outputs.COMMIT_FILE_PATH }} - - name: Run RUST Curve Tests - working-directory: ./wrappers/rust/icicle-curves - if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp == 'true' - run: | - CURVE=${{ matrix.curve.name }} - CURVE_DIR=icicle-${CURVE//_/-} - export ICICLE_BACKEND_INSTALL_DIR=${{ steps.cuda-flag.outputs.INSTALL_PATH }} - cd ./$CURVE_DIR - cargo test --release --verbose -- --skip phase - cargo test phase2 --release - cargo test phase3 --release + - name: Run C++ Curve Tests working-directory: ./icicle/build/tests if: needs.check-changed-files.outputs.cpp == 'true' @@ -183,6 +171,23 @@ jobs: cd - fi done + - name: Run RUST Curve Tests + working-directory: ./wrappers/rust/icicle-curves + if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp == 'true' + run: | + CURVE=${{ matrix.curve.name }} + CURVE_DIR=icicle-${CURVE//_/-} + export ICICLE_BACKEND_INSTALL_DIR=${{ steps.cuda-flag.outputs.INSTALL_PATH }} + cd ./$CURVE_DIR + cargo test --release --verbose -- --skip phase + cargo 
test phase2 --release + cargo test phase3 --release + - name: Setup go + if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' + timeout-minutes: 15 + uses: actions/setup-go@v5 + with: + go-version: '1.22.0' - name: Run Golang curve Tests working-directory: ./wrappers/golang/curves if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' @@ -214,7 +219,7 @@ jobs: - name: Checkout Repo uses: actions/checkout@v4 - name: Checkout CUDA Backend - if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' + if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' || needs.check-changed-files.outputs.rust == 'true' uses: actions/checkout@v4 with: repository: ingonyama-zk/icicle-cuda-backend @@ -222,7 +227,7 @@ jobs: ssh-key: ${{ secrets.CUDA_PULL_KEY }} ref: ${{ needs.extract-cuda-backend-branch.outputs.cuda-backend-branch }} - name: Get CUDA Backend Commit SHA - if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' + if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' || needs.check-changed-files.outputs.rust == 'true' working-directory: ./icicle/backend/cuda id: extract-cuda-sha run: | @@ -230,7 +235,7 @@ jobs: echo "CUDA Backend Commit SHA: $CUDA_BACKEND_SHA" echo "cuda-backend-sha=$CUDA_BACKEND_SHA" >> $GITHUB_OUTPUT - name: Set CUDA backend flag - if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' + if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' || needs.check-changed-files.outputs.rust == 'true' id: cuda-flag run: | CUDA_BACKEND_SHA=${{ steps.extract-cuda-sha.outputs.cuda-backend-sha }} @@ -268,12 +273,6 @@ jobs: echo 
"CMAKE_INSTALL_PREFIX=-DCMAKE_INSTALL_PREFIX=${INSTALL_PATH}" >> $GITHUB_OUTPUT echo "ICICLE_BACKEND_INSTALL_DIR=${INSTALL_PATH}/lib" >> $GITHUB_OUTPUT fi - - name: Setup go - if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' - timeout-minutes: 15 - uses: actions/setup-go@v5 - with: - go-version: '1.22.0' - name: Build field working-directory: ./icicle @@ -284,17 +283,7 @@ jobs: cmake --build build --target install -j rm -rf ${{ steps.cuda-flag.outputs.INSTALL_PATH }}/lib/gh_commit_sha_${{ matrix.field.name }}* touch ${{ steps.cuda-flag.outputs.COMMIT_FILE_PATH }} - - name: Run RUST Field Tests - working-directory: ./wrappers/rust/icicle-fields - if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp == 'true' - run: | - FIELD=${{ matrix.field.name }} - FIELD_DIR=icicle-${FIELD//_/-} - export ICICLE_BACKEND_INSTALL_DIR=${{ steps.cuda-flag.outputs.INSTALL_PATH }} - cd ./$FIELD_DIR - cargo test --release --verbose -- --skip phase - cargo test phase2 --release - cargo test phase3 --release + - name: Run C++ field Tests working-directory: ./icicle/build/tests if: needs.check-changed-files.outputs.cpp == 'true' @@ -317,6 +306,24 @@ jobs: fi done + - name: Run RUST Field Tests + working-directory: ./wrappers/rust/icicle-fields + if: needs.check-changed-files.outputs.rust == 'true' || needs.check-changed-files.outputs.cpp == 'true' + run: | + FIELD=${{ matrix.field.name }} + FIELD_DIR=icicle-${FIELD//_/-} + export ICICLE_BACKEND_INSTALL_DIR=${{ steps.cuda-flag.outputs.INSTALL_PATH }} + cd ./$FIELD_DIR + cargo test --release --verbose -- --skip phase + cargo test phase2 --release + cargo test phase3 --release + + - name: Setup go + if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' + timeout-minutes: 15 + uses: actions/setup-go@v5 + with: + go-version: '1.22.0' - name: Run Golang field Tests working-directory: 
./wrappers/golang/fields if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' @@ -343,7 +350,7 @@ jobs: - name: Checkout Repo uses: actions/checkout@v4 - name: Checkout CUDA Backend - if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' + if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' || needs.check-changed-files.outputs.rust == 'true' uses: actions/checkout@v4 with: repository: ingonyama-zk/icicle-cuda-backend @@ -351,7 +358,7 @@ jobs: ssh-key: ${{ secrets.CUDA_PULL_KEY }} ref: ${{ needs.extract-cuda-backend-branch.outputs.cuda-backend-branch }} - name: Get CUDA Backend Commit SHA - if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' + if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' || needs.check-changed-files.outputs.rust == 'true' working-directory: ./icicle/backend/cuda id: extract-cuda-sha run: | @@ -359,7 +366,7 @@ jobs: echo "CUDA Backend Commit SHA: $CUDA_BACKEND_SHA" echo "cuda-backend-sha=$CUDA_BACKEND_SHA" >> $GITHUB_OUTPUT - name: Set CUDA backend flag - if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' + if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' || needs.check-changed-files.outputs.rust == 'true' id: cuda-flag run: | CUDA_BACKEND_SHA=${{ steps.extract-cuda-sha.outputs.cuda-backend-sha }} @@ -396,12 +403,6 @@ jobs: echo "CMAKE_INSTALL_PREFIX=-DCMAKE_INSTALL_PREFIX=${INSTALL_PATH}" >> $GITHUB_OUTPUT echo "ICICLE_BACKEND_INSTALL_DIR=${INSTALL_PATH}/lib" >> $GITHUB_OUTPUT fi - - name: Setup go - if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' - timeout-minutes: 15 - uses: actions/setup-go@v5 - with: - 
go-version: '1.22.0' - name: Build working-directory: ./icicle if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' @@ -420,6 +421,12 @@ jobs: cargo test --release --verbose -- --skip phase cargo test phase2 --release cargo test phase3 --release + - name: Setup go + if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' + timeout-minutes: 15 + uses: actions/setup-go@v5 + with: + go-version: '1.22.0' - name: Test GoLang Hashes working-directory: ./wrappers/golang/hash if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp == 'true' diff --git a/docs/docs/icicle/libraries.md b/docs/docs/icicle/libraries.md index 684a6b6177..6a71e4cb1e 100644 --- a/docs/docs/icicle/libraries.md +++ b/docs/docs/icicle/libraries.md @@ -53,6 +53,7 @@ Each library has a corresponding crate. See [programmers guide](./programmers_gu | [Keccak](primitives/hash#keccak-and-sha3) | supporting 256b and 512b digest | | [SHA3](primitives/hash#keccak-and-sha3) | supporting 256b and 512b digest | | [Blake2s](primitives/hash#blake2s) | digest is 256b | +| [Blake3](primitives/hash#blake3) | digest is 256b | | [Merkle-Tree](primitives/merkle) | works with any combination of hash functions | diff --git a/docs/docs/icicle/primitives/hash.md b/docs/docs/icicle/primitives/hash.md index 702487a5a0..a88bc996fe 100644 --- a/docs/docs/icicle/primitives/hash.md +++ b/docs/docs/icicle/primitives/hash.md @@ -23,8 +23,9 @@ ICICLE supports the following hash functions: 3. **SHA3-256** 4. **SHA3-512** 5. **Blake2s** -6. **Poseidon** -7. **Poseidon2** +6. **Blake3** +7. **Poseidon** +8. **Poseidon2** :::info Additional hash functions might be added in the future. Stay tuned! @@ -40,6 +41,10 @@ Keccak can take input messages of any length and produce a fixed-size hash. 
It u [Blake2s](https://www.rfc-editor.org/rfc/rfc7693.txt) is an optimized cryptographic hash function that provides high performance while ensuring strong security. Blake2s is ideal for hashing small data (such as field elements), especially when speed is crucial. It produces a 256-bit (32-byte) output and is often used in cryptographic protocols. +### Blake3 + +[Blake3](https://www.ietf.org/archive/id/draft-aumasson-blake3-00.html) is a high-performance cryptographic hash function designed for both small and large data. With a tree-based design for efficient parallelism, it offers strong security, speed, and scalability for modern cryptographic applications. + +### Poseidon + +[Poseidon](https://eprint.iacr.org/2019/458) is a cryptographic hash function designed specifically for field elements. It is highly optimized for zero-knowledge proofs (ZKPs) and is commonly used in ZK-SNARK systems. Poseidon's main strength lies in its arithmetization-friendly design, meaning it can be efficiently expressed as arithmetic constraints within a ZK-SNARK circuit. 
diff --git a/examples/c++/best-practice-ntt/example.cpp b/examples/c++/best-practice-ntt/example.cpp index 7db88aa357..7f4bc974d1 100644 --- a/examples/c++/best-practice-ntt/example.cpp +++ b/examples/c++/best-practice-ntt/example.cpp @@ -116,8 +116,8 @@ int main(int argc, char* argv[]) // Clean-up for (int i = 0; i < 2; i++) { ICICLE_CHECK(icicle_free(d_vec[i])); - delete[](h_inp[i]); - delete[](h_out[i]); + delete[] (h_inp[i]); + delete[] (h_out[i]); } ICICLE_CHECK(icicle_destroy_stream(stream_compute)); ICICLE_CHECK(icicle_destroy_stream(stream_d2h)); diff --git a/icicle/backend/cpu/CMakeLists.txt b/icicle/backend/cpu/CMakeLists.txt index 75a35f1e84..20eae67f01 100644 --- a/icicle/backend/cpu/CMakeLists.txt +++ b/icicle/backend/cpu/CMakeLists.txt @@ -67,6 +67,10 @@ if (HASH) target_sources(icicle_hash PRIVATE src/hash/cpu_keccak.cpp src/hash/cpu_blake2s.cpp + src/hash/cpu_blake3.cpp + src/hash/blake3.c + src/hash/blake3_dispatch.c + src/hash/blake3_portable.c src/hash/cpu_merkle_tree.cpp ) target_include_directories(icicle_hash PUBLIC include) diff --git a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp index 38d859e022..18fb5dd032 100644 --- a/icicle/backend/cpu/src/field/cpu_vec_ops.cpp +++ b/icicle/backend/cpu/src/field/cpu_vec_ops.cpp @@ -372,7 +372,7 @@ class VectorOpTask : public TaskBase public: T m_intermidiate_res; // pointer to the output. Can be a vector or scalar pointer uint64_t m_idx_in_batch; // index in the batch. 
Used in intermediate res tasks -}; +}; // class VectorOpTask #define NOF_OPERATIONS_PER_TASK 512 #define CONFIG_NOF_THREADS_KEY "n_threads" diff --git a/icicle/backend/cpu/src/hash/blake3.c b/icicle/backend/cpu/src/hash/blake3.c new file mode 100644 index 0000000000..617949f109 --- /dev/null +++ b/icicle/backend/cpu/src/hash/blake3.c @@ -0,0 +1,591 @@ +/*BLAKE3 Hash function based on the original design by the BLAKE3 team https://github.com/BLAKE3-team/BLAKE3 */ + +#include +#include "blake3.h" +#include "blake3_impl.h" + +const char* blake3_version(void) { return BLAKE3_VERSION_STRING; } + +INLINE void chunk_state_init(blake3_chunk_state* self, const uint32_t key[8], uint8_t flags) +{ + memcpy(self->cv, key, BLAKE3_KEY_LEN); + self->chunk_counter = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + self->buf_len = 0; + self->blocks_compressed = 0; + self->flags = flags; +} + +INLINE void chunk_state_reset(blake3_chunk_state* self, const uint32_t key[8], uint64_t chunk_counter) +{ + memcpy(self->cv, key, BLAKE3_KEY_LEN); + self->chunk_counter = chunk_counter; + self->blocks_compressed = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + self->buf_len = 0; +} + +INLINE size_t chunk_state_len(const blake3_chunk_state* self) +{ + return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) + ((size_t)self->buf_len); +} + +INLINE size_t chunk_state_fill_buf(blake3_chunk_state* self, const uint8_t* input, size_t input_len) +{ + size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len); + if (take > input_len) { take = input_len; } + uint8_t* dest = self->buf + ((size_t)self->buf_len); + memcpy(dest, input, take); + self->buf_len += (uint8_t)take; + return take; +} + +INLINE uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state* self) +{ + if (self->blocks_compressed == 0) { + return CHUNK_START; + } else { + return 0; + } +} + +typedef struct { + uint32_t input_cv[8]; + uint64_t counter; + uint8_t block[BLAKE3_BLOCK_LEN]; + uint8_t block_len; + uint8_t flags; +} output_t; 
+ +INLINE output_t make_output( + const uint32_t input_cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) +{ + output_t ret; + memcpy(ret.input_cv, input_cv, 32); + memcpy(ret.block, block, BLAKE3_BLOCK_LEN); + ret.block_len = block_len; + ret.counter = counter; + ret.flags = flags; + return ret; +} + +// Chaining values within a given chunk (specifically the compress_in_place +// interface) are represented as words. This avoids unnecessary bytes<->words +// conversion overhead in the portable implementation. However, the hash_many +// interface handles both user input and parent node blocks, so it accepts +// bytes. For that reason, chaining values in the CV stack are represented as +// bytes. +INLINE void output_chaining_value(const output_t* self, uint8_t cv[32]) +{ + uint32_t cv_words[8]; + memcpy(cv_words, self->input_cv, 32); + blake3_compress_in_place(cv_words, self->block, self->block_len, self->counter, self->flags); + store_cv_words(cv, cv_words); +} + +INLINE void output_root_bytes(const output_t* self, uint64_t seek, uint8_t* out, size_t out_len) +{ + if (out_len == 0) { return; } + uint64_t output_block_counter = seek / 64; + size_t offset_within_block = seek % 64; + uint8_t wide_buf[64]; + if (offset_within_block) { + blake3_compress_xof( + self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf); + const size_t available_bytes = 64 - offset_within_block; + const size_t bytes = out_len > available_bytes ? 
available_bytes : out_len; + memcpy(out, wide_buf + offset_within_block, bytes); + out += bytes; + out_len -= bytes; + output_block_counter += 1; + } + if (out_len / 64) { + blake3_xof_many( + self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, out, out_len / 64); + } + output_block_counter += out_len / 64; + out += out_len & -64; + out_len -= out_len & -64; + if (out_len) { + blake3_compress_xof( + self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf); + memcpy(out, wide_buf, out_len); + } +} + +INLINE void chunk_state_update(blake3_chunk_state* self, const uint8_t* input, size_t input_len) +{ + if (self->buf_len > 0) { + size_t take = chunk_state_fill_buf(self, input, input_len); + input += take; + input_len -= take; + if (input_len > 0) { + blake3_compress_in_place( + self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter, self->flags | chunk_state_maybe_start_flag(self)); + self->blocks_compressed += 1; + self->buf_len = 0; + memset(self->buf, 0, BLAKE3_BLOCK_LEN); + } + } + + while (input_len > BLAKE3_BLOCK_LEN) { + blake3_compress_in_place( + self->cv, input, BLAKE3_BLOCK_LEN, self->chunk_counter, self->flags | chunk_state_maybe_start_flag(self)); + self->blocks_compressed += 1; + input += BLAKE3_BLOCK_LEN; + input_len -= BLAKE3_BLOCK_LEN; + } + + chunk_state_fill_buf(self, input, input_len); +} + +INLINE output_t chunk_state_output(const blake3_chunk_state* self) +{ + uint8_t block_flags = self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END; + return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter, block_flags); +} + +INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN], const uint32_t key[8], uint8_t flags) +{ + return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT); +} + +// Given some input larger than one chunk, return the number of bytes that +// should go in the left subtree. 
This is the largest power-of-2 number of +// chunks that leaves at least 1 byte for the right subtree. +INLINE size_t left_len(size_t content_len) +{ + // Subtract 1 to reserve at least one byte for the right side. content_len + // should always be greater than BLAKE3_CHUNK_LEN. + size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN; + return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN; +} + +// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time +// on a single thread. Write out the chunk chaining values and return the +// number of chunks hashed. These chunks are never the root and never empty; +// those cases use a different codepath. +INLINE size_t compress_chunks_parallel( + const uint8_t* input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t* out) +{ + const uint8_t* chunks_array[MAX_SIMD_DEGREE]; + size_t input_position = 0; + size_t chunks_array_len = 0; + while (input_len - input_position >= BLAKE3_CHUNK_LEN) { + chunks_array[chunks_array_len] = &input[input_position]; + input_position += BLAKE3_CHUNK_LEN; + chunks_array_len += 1; + } + + blake3_hash_many( + chunks_array, chunks_array_len, BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter, true, flags, CHUNK_START, + CHUNK_END, out); + + // Hash the remaining partial chunk, if there is one. Note that the empty + // chunk (meaning the empty message) is a different codepath. 
+ if (input_len > input_position) { + uint64_t counter = chunk_counter + (uint64_t)chunks_array_len; + blake3_chunk_state chunk_state; + chunk_state_init(&chunk_state, key, flags); + chunk_state.chunk_counter = counter; + chunk_state_update(&chunk_state, &input[input_position], input_len - input_position); + output_t output = chunk_state_output(&chunk_state); + output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]); + return chunks_array_len + 1; + } else { + return chunks_array_len; + } +} + +// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time +// on a single thread. Write out the parent chaining values and return the +// number of parents hashed. (If there's an odd input chaining value left over, +// return it as an additional output.) These parents are never the root and +// never empty; those cases use a different codepath. +INLINE size_t compress_parents_parallel( + const uint8_t* child_chaining_values, size_t num_chaining_values, const uint32_t key[8], uint8_t flags, uint8_t* out) +{ + const uint8_t* parents_array[MAX_SIMD_DEGREE_OR_2]; + size_t parents_array_len = 0; + while (num_chaining_values - (2 * parents_array_len) >= 2) { + parents_array[parents_array_len] = &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN]; + parents_array_len += 1; + } + + blake3_hash_many( + parents_array, parents_array_len, 1, key, + 0, // Parents always use counter 0. + false, flags | PARENT, + 0, // Parents have no start flags. + 0, // Parents have no end flags. + out); + + // If there's an odd child left over, it becomes an output. + if (num_chaining_values > 2 * parents_array_len) { + memcpy( + &out[parents_array_len * BLAKE3_OUT_LEN], &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN], + BLAKE3_OUT_LEN); + return parents_array_len + 1; + } else { + return parents_array_len; + } +} + +// The wide helper function returns (writes out) an array of chaining values +// and returns the length of that array. 
The number of chaining values returned +// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, +// if the input is shorter than that many chunks. The reason for maintaining a +// wide array of chaining values going back up the tree, is to allow the +// implementation to hash as many parents in parallel as possible. +// +// As a special case when the SIMD degree is 1, this function will still return +// at least 2 outputs. This guarantees that this function doesn't perform the +// root compression. (If it did, it would use the wrong flags, and also we +// wouldn't be able to implement extendable output.) Note that this function is +// not used when the whole input is only 1 chunk long; that's a different +// codepath. +// +// Why not just have the caller split the input on the first update(), instead +// of implementing this special rule? Because we don't want to limit SIMD or +// multi-threading parallelism for that update(). +static size_t blake3_compress_subtree_wide( + const uint8_t* input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t* out) +{ + // Note that the single chunk case does *not* bump the SIMD degree up to 2 + // when it is 1. If this implementation adds multi-threading in the future, + // this gives us the option of multi-threading even the 2-chunk case, which + // can help performance on smaller platforms. + if (input_len <= blake3_simd_degree() * BLAKE3_CHUNK_LEN) { + return compress_chunks_parallel(input, input_len, key, chunk_counter, flags, out); + } + + // With more than simd_degree chunks, we need to recurse. Start by dividing + // the input into left and right subtrees. (Note that this is only optimal + // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree + // of 3 or something, we'll need a more complicated strategy.) 
+ size_t left_input_len = left_len(input_len); + size_t right_input_len = input_len - left_input_len; + const uint8_t* right_input = &input[left_input_len]; + uint64_t right_chunk_counter = chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN); + + // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to + // account for the special case of returning 2 outputs when the SIMD degree + // is 1. + uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t degree = blake3_simd_degree(); + if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) { + // The special case: We always use a degree of at least two, to make + // sure there are two outputs. Except, as noted above, at the chunk + // level, where we allow degree=1. (Note that the 1-chunk-input case is + // a different codepath.) + degree = 2; + } + uint8_t* right_cvs = &cv_array[degree * BLAKE3_OUT_LEN]; + + // Recurse! If this implementation adds multi-threading support in the + // future, this is where it will go. + size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key, chunk_counter, flags, cv_array); + size_t right_n = + blake3_compress_subtree_wide(right_input, right_input_len, key, right_chunk_counter, flags, right_cvs); + + // The special case again. If simd_degree=1, then we'll have left_n=1 and + // right_n=1. Rather than compressing them into a single output, return + // them directly, to make sure we always have at least two outputs. + if (left_n == 1) { + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); + return 2; + } + + // Otherwise, do one layer of parent node compression. + size_t num_chaining_values = left_n + right_n; + return compress_parents_parallel(cv_array, num_chaining_values, key, flags, out); +} + +// Hash a subtree with compress_subtree_wide(), and then condense the resulting +// list of chaining values down to a single parent node. Don't compress that +// last parent node, however. 
Instead, return its message bytes (the +// concatenated chaining values of its children). This is necessary when the +// first call to update() supplies a complete subtree, because the topmost +// parent node of that subtree could end up being the root. It's also necessary +// for extended output in the general case. +// +// As with compress_subtree_wide(), this function is not used on inputs of 1 +// chunk or less. That's a different codepath. +INLINE void compress_subtree_to_parent_node( + const uint8_t* input, + size_t input_len, + const uint32_t key[8], + uint64_t chunk_counter, + uint8_t flags, + uint8_t out[2 * BLAKE3_OUT_LEN]) +{ + uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key, chunk_counter, flags, cv_array); + assert(num_cvs <= MAX_SIMD_DEGREE_OR_2); + // The following loop never executes when MAX_SIMD_DEGREE_OR_2 is 2, because + // as we just asserted, num_cvs will always be <=2 in that case. But GCC + // (particularly GCC 8.5) can't tell that it never executes, and if NDEBUG is + // set then it emits incorrect warnings here. We tried a few different + // hacks to silence these, but in the end our hacks just produced different + // warnings (see https://github.com/BLAKE3-team/BLAKE3/pull/380). Out of + // desperation, we ifdef out this entire loop when we know it's not needed. +#if MAX_SIMD_DEGREE_OR_2 > 2 + // If MAX_SIMD_DEGREE_OR_2 is greater than 2 and there's enough input, + // compress_subtree_wide() returns more than 2 chaining values. Condense + // them into 2 by forming parent nodes repeatedly. 
+ uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; + while (num_cvs > 2) { + num_cvs = compress_parents_parallel(cv_array, num_cvs, key, flags, out_array); + memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); + } +#endif + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); +} + +INLINE void hasher_init_base(blake3_hasher* self, const uint32_t key[8], uint8_t flags) +{ + memcpy(self->key, key, BLAKE3_KEY_LEN); + chunk_state_init(&self->chunk, key, flags); + self->cv_stack_len = 0; +} + +void blake3_hasher_init(blake3_hasher* self) { hasher_init_base(self, IV, 0); } + +void blake3_hasher_init_keyed(blake3_hasher* self, const uint8_t key[BLAKE3_KEY_LEN]) +{ + uint32_t key_words[8]; + load_key_words(key, key_words); + hasher_init_base(self, key_words, KEYED_HASH); +} + +void blake3_hasher_init_derive_key_raw(blake3_hasher* self, const void* context, size_t context_len) +{ + blake3_hasher context_hasher; + hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT); + blake3_hasher_update(&context_hasher, context, context_len); + uint8_t context_key[BLAKE3_KEY_LEN]; + blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN); + uint32_t context_key_words[8]; + load_key_words(context_key, context_key_words); + hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL); +} + +void blake3_hasher_init_derive_key(blake3_hasher* self, const char* context) +{ + blake3_hasher_init_derive_key_raw(self, context, strlen(context)); +} + +// As described in hasher_push_cv() below, we do "lazy merging", delaying +// merges until right before the next CV is about to be added. This is +// different from the reference implementation. Another difference is that we +// aren't always merging 1 chunk at a time. Instead, each CV might represent +// any power-of-two number of chunks, as long as the smaller-above-larger stack +// order is maintained. 
Instead of the "count the trailing 0-bits" algorithm +// described in the spec, we use a "count the total number of 1-bits" variant +// that doesn't require us to retain the subtree size of the CV on top of the +// stack. The principle is the same: each CV that should remain in the stack is +// represented by a 1-bit in the total number of chunks (or bytes) so far. +INLINE void hasher_merge_cv_stack(blake3_hasher* self, uint64_t total_len) +{ + size_t post_merge_stack_len = (size_t)popcnt(total_len); + while (self->cv_stack_len > post_merge_stack_len) { + uint8_t* parent_node = &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN]; + output_t output = parent_output(parent_node, self->key, self->chunk.flags); + output_chaining_value(&output, parent_node); + self->cv_stack_len -= 1; + } +} + +// In reference_impl.rs, we merge the new CV with existing CVs from the stack +// before pushing it. We can do that because we know more input is coming, so +// we know none of the merges are root. +// +// This setting is different. We want to feed as much input as possible to +// compress_subtree_wide(), without setting aside anything for the chunk_state. +// If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once +// as a single subtree, if at all possible. +// +// This leads to two problems: +// 1) This 64 KiB input might be the only call that ever gets made to update. +// In this case, the root node of the 64 KiB subtree would be the root node +// of the whole tree, and it would need to be ROOT finalized. We can't +// compress it until we know. +// 2) This 64 KiB input might complete a larger tree, whose root node is +// similarly going to be the root of the whole tree. For example, maybe +// we have 196 KiB (that is, 128 + 64) hashed so far. We can't compress the +// node at the root of the 256 KiB subtree until we know how to finalize it. +// +// The second problem is solved with "lazy merging". 
That is, when we're about +// to add a CV to the stack, we don't merge it with anything first, as the +// reference impl does. Instead we do merges using the *previous* CV that was +// added, which is sitting on top of the stack, and we put the new CV +// (unmerged) on top of the stack afterwards. This guarantees that we never +// merge the root node until finalize(). +// +// Solving the first problem requires an additional tool, +// compress_subtree_to_parent_node(). That function always returns the top +// *two* chaining values of the subtree it's compressing. We then do lazy +// merging with each of them separately, so that the second CV will always +// remain unmerged. (That also helps us support extendable output when we're +// hashing an input all-at-once.) +INLINE void hasher_push_cv(blake3_hasher* self, uint8_t new_cv[BLAKE3_OUT_LEN], uint64_t chunk_counter) +{ + hasher_merge_cv_stack(self, chunk_counter); + memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv, BLAKE3_OUT_LEN); + self->cv_stack_len += 1; +} + +void blake3_hasher_update(blake3_hasher* self, const void* input, size_t input_len) +{ + // Explicitly checking for zero avoids causing UB by passing a null pointer + // to memcpy. This comes up in practice with things like: + // std::vector v; + // blake3_hasher_update(&hasher, v.data(), v.size()); + if (input_len == 0) { return; } + + const uint8_t* input_bytes = (const uint8_t*)input; + + // If we have some partial chunk bytes in the internal chunk_state, we need + // to finish that chunk first. + if (chunk_state_len(&self->chunk) > 0) { + size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk); + if (take > input_len) { take = input_len; } + chunk_state_update(&self->chunk, input_bytes, take); + input_bytes += take; + input_len -= take; + // If we've filled the current chunk and there's more coming, finalize this + // chunk and proceed. In this case we know it's not the root. 
+ if (input_len > 0) { + output_t output = chunk_state_output(&self->chunk); + uint8_t chunk_cv[32]; + output_chaining_value(&output, chunk_cv); + hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter); + chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1); + } else { + return; + } + } + + // Now the chunk_state is clear, and we have more input. If there's more than + // a single chunk (so, definitely not the root chunk), hash the largest whole + // subtree we can, with the full benefits of SIMD (and maybe in the future, + // multi-threading) parallelism. Two restrictions: + // - The subtree has to be a power-of-2 number of chunks. Only subtrees along + // the right edge can be incomplete, and we don't know where the right edge + // is going to be until we get to finalize(). + // - The subtree must evenly divide the total number of chunks up until this + // point (if total is not 0). If the current incomplete subtree is only + // waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have + // to complete the current subtree first. + // Because we might need to break up the input to form powers of 2, or to + // evenly divide what we already have, this part runs in a loop. + while (input_len > BLAKE3_CHUNK_LEN) { + size_t subtree_len = round_down_to_power_of_2(input_len); + uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN; + // Shrink the subtree_len until it evenly divides the count so far. We know + // that subtree_len itself is a power of 2, so we can use a bitmasking + // trick instead of an actual remainder operation. (Note that if the caller + // consistently passes power-of-2 inputs of the same size, as is hopefully + // typical, this loop condition will always fail, and subtree_len will + // always be the full length of the input.) + // + // An aside: We don't have to shrink subtree_len quite this much. 
For + // example, if count_so_far is 1, we could pass 2 chunks to + // compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still + // get the right answer in the end, and we might get to use 2-way SIMD + // parallelism. The problem with this optimization, is that it gets us + // stuck always hashing 2 chunks. The total number of chunks will remain + // odd, and we'll never graduate to higher degrees of parallelism. See + // https://github.com/BLAKE3-team/BLAKE3/issues/69. + while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) { + subtree_len /= 2; + } + // The shrunken subtree_len might now be 1 chunk long. If so, hash that one + // chunk by itself. Otherwise, compress the subtree into a pair of CVs. + uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN; + if (subtree_len <= BLAKE3_CHUNK_LEN) { + blake3_chunk_state chunk_state; + chunk_state_init(&chunk_state, self->key, self->chunk.flags); + chunk_state.chunk_counter = self->chunk.chunk_counter; + chunk_state_update(&chunk_state, input_bytes, subtree_len); + output_t output = chunk_state_output(&chunk_state); + uint8_t cv[BLAKE3_OUT_LEN]; + output_chaining_value(&output, cv); + hasher_push_cv(self, cv, chunk_state.chunk_counter); + } else { + // This is the high-performance happy path, though getting here depends + // on the caller giving us a long enough input. + uint8_t cv_pair[2 * BLAKE3_OUT_LEN]; + compress_subtree_to_parent_node( + input_bytes, subtree_len, self->key, self->chunk.chunk_counter, self->chunk.flags, cv_pair); + hasher_push_cv(self, cv_pair, self->chunk.chunk_counter); + hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN], self->chunk.chunk_counter + (subtree_chunks / 2)); + } + self->chunk.chunk_counter += subtree_chunks; + input_bytes += subtree_len; + input_len -= subtree_len; + } + + // If there's any remaining input less than a full chunk, add it to the chunk + // state. 
In that case, also do a final merge loop to make sure the subtree + // stack doesn't contain any unmerged pairs. The remaining input means we + // know these merges are non-root. This merge loop isn't strictly necessary + // here, because hasher_push_chunk_cv already does its own merge loop, but it + // simplifies blake3_hasher_finalize below. + if (input_len > 0) { + chunk_state_update(&self->chunk, input_bytes, input_len); + hasher_merge_cv_stack(self, self->chunk.chunk_counter); + } +} + +void blake3_hasher_finalize(const blake3_hasher* self, uint8_t* out, size_t out_len) +{ + blake3_hasher_finalize_seek(self, 0, out, out_len); +} + +void blake3_hasher_finalize_seek(const blake3_hasher* self, uint64_t seek, uint8_t* out, size_t out_len) +{ + // Explicitly checking for zero avoids causing UB by passing a null pointer + // to memcpy. This comes up in practice with things like: + // std::vector v; + // blake3_hasher_finalize(&hasher, v.data(), v.size()); + if (out_len == 0) { return; } + + // If the subtree stack is empty, then the current chunk is the root. + if (self->cv_stack_len == 0) { + output_t output = chunk_state_output(&self->chunk); + output_root_bytes(&output, seek, out, out_len); + return; + } + // If there are any bytes in the chunk state, finalize that chunk and do a + // roll-up merge between that chunk hash and every subtree in the stack. In + // this case, the extra merge loop at the end of blake3_hasher_update + // guarantees that none of the subtrees in the stack need to be merged with + // each other first. Otherwise, if there are no bytes in the chunk state, + // then the top of the stack is a chunk hash, and we start the merge from + // that. + output_t output; + size_t cvs_remaining; + if (chunk_state_len(&self->chunk) > 0) { + cvs_remaining = self->cv_stack_len; + output = chunk_state_output(&self->chunk); + } else { + // There are always at least 2 CVs in the stack in this case. 
+ cvs_remaining = self->cv_stack_len - 2; + output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key, self->chunk.flags); + } + while (cvs_remaining > 0) { + cvs_remaining -= 1; + uint8_t parent_block[BLAKE3_BLOCK_LEN]; + memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32); + output_chaining_value(&output, &parent_block[32]); + output = parent_output(parent_block, self->key, self->chunk.flags); + } + output_root_bytes(&output, seek, out, out_len); +} + +void blake3_hasher_reset(blake3_hasher* self) +{ + chunk_state_reset(&self->chunk, self->key, 0); + self->cv_stack_len = 0; +} diff --git a/icicle/backend/cpu/src/hash/blake3.h b/icicle/backend/cpu/src/hash/blake3.h new file mode 100644 index 0000000000..55b238f1ac --- /dev/null +++ b/icicle/backend/cpu/src/hash/blake3.h @@ -0,0 +1,57 @@ +/*BLAKE3 Hash function based on the original design by the BLAKE3 team https://github.com/BLAKE3-team/BLAKE3 */ + +#ifndef BLAKE3_H +#define BLAKE3_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define BLAKE3_VERSION_STRING "1.5.5" +#define BLAKE3_KEY_LEN 32 +#define BLAKE3_OUT_LEN 32 +#define BLAKE3_BLOCK_LEN 64 +#define BLAKE3_CHUNK_LEN 1024 +#define BLAKE3_MAX_DEPTH 54 + +// This struct is a private implementation detail. It has to be here because +// it's part of blake3_hasher below. +typedef struct { + uint32_t cv[8]; + uint64_t chunk_counter; + uint8_t buf[BLAKE3_BLOCK_LEN]; + uint8_t buf_len; + uint8_t blocks_compressed; + uint8_t flags; +} blake3_chunk_state; + +typedef struct { + uint32_t key[8]; + blake3_chunk_state chunk; + uint8_t cv_stack_len; + // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, + // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk + // requires a 4th entry, rather than merging everything down to 1, because we + // don't know whether more input is coming. This is different from how the + // reference implementation does things. 
+ uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN]; +} blake3_hasher; + +const char* blake3_version(void); +void blake3_hasher_init(blake3_hasher* self); +void blake3_hasher_init_keyed(blake3_hasher* self, const uint8_t key[BLAKE3_KEY_LEN]); +void blake3_hasher_init_derive_key(blake3_hasher* self, const char* context); +void blake3_hasher_init_derive_key_raw(blake3_hasher* self, const void* context, size_t context_len); +void blake3_hasher_update(blake3_hasher* self, const void* input, size_t input_len); +void blake3_hasher_finalize(const blake3_hasher* self, uint8_t* out, size_t out_len); +void blake3_hasher_finalize_seek(const blake3_hasher* self, uint64_t seek, uint8_t* out, size_t out_len); +void blake3_hasher_reset(blake3_hasher* self); + +#ifdef __cplusplus +} +#endif + +#endif /* BLAKE3_H */ diff --git a/icicle/backend/cpu/src/hash/blake3_dispatch.c b/icicle/backend/cpu/src/hash/blake3_dispatch.c new file mode 100644 index 0000000000..296e769e31 --- /dev/null +++ b/icicle/backend/cpu/src/hash/blake3_dispatch.c @@ -0,0 +1,62 @@ +/*BLAKE3 Hash function based on the original design by the BLAKE3 team https://github.com/BLAKE3-team/BLAKE3 */ + +#include +#include +#include + +#include "blake3_impl.h" + +void blake3_compress_in_place( + uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) +{ + blake3_compress_in_place_portable(cv, block, block_len, counter, flags); +} + +void blake3_compress_xof( + const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, + uint64_t counter, + uint8_t flags, + uint8_t out[64]) +{ + blake3_compress_xof_portable(cv, block, block_len, counter, flags, out); +} + +void blake3_xof_many( + const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, + uint64_t counter, + uint8_t flags, + uint8_t out[64], + size_t outblocks) +{ + if (outblocks == 0) { + // The current assembly implementation always outputs at least 1 block. 
+ return; + } + + for (size_t i = 0; i < outblocks; ++i) { + blake3_compress_xof(cv, block, block_len, counter + i, flags, out + 64 * i); + } +} + +void blake3_hash_many( + const uint8_t* const* inputs, + size_t num_inputs, + size_t blocks, + const uint32_t key[8], + uint64_t counter, + bool increment_counter, + uint8_t flags, + uint8_t flags_start, + uint8_t flags_end, + uint8_t* out) +{ + blake3_hash_many_portable( + inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); +} + +// The dynamically detected SIMD degree of the current platform. +size_t blake3_simd_degree(void) { return 1; } diff --git a/icicle/backend/cpu/src/hash/blake3_impl.h b/icicle/backend/cpu/src/hash/blake3_impl.h new file mode 100644 index 0000000000..1bd2bc0046 --- /dev/null +++ b/icicle/backend/cpu/src/hash/blake3_impl.h @@ -0,0 +1,205 @@ +/*BLAKE3 Hash function based on the original design by the BLAKE3 team https://github.com/BLAKE3-team/BLAKE3 */ + +#ifndef BLAKE3_IMPL_H +#define BLAKE3_IMPL_H + +#include +#include +#include +#include +#include + +#include "blake3.h" + +// internal flags +enum blake3_flags { + CHUNK_START = 1 << 0, + CHUNK_END = 1 << 1, + PARENT = 1 << 2, + ROOT = 1 << 3, + KEYED_HASH = 1 << 4, + DERIVE_KEY_CONTEXT = 1 << 5, + DERIVE_KEY_MATERIAL = 1 << 6, +}; + +// This C implementation tries to support recent versions of GCC, Clang, and +// MSVC. + +#define INLINE static inline __attribute__((always_inline)) +#define MAX_SIMD_DEGREE 1 + +// There are some places where we want a static size that's equal to the +// MAX_SIMD_DEGREE, but also at least 2. +#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? 
MAX_SIMD_DEGREE : 2) + +static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, + 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL}; + +static const uint8_t MSG_SCHEDULE[7][16] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8}, + {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1}, {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6}, + {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4}, {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7}, + {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}, +}; + +/* Find index of the highest set bit */ +/* x is assumed to be nonzero. */ +static unsigned int highest_one(uint64_t x) +{ +#if defined(__GNUC__) || defined(__clang__) + return 63 ^ (unsigned int)__builtin_clzll(x); +#else + unsigned int c = 0; + if (x & 0xffffffff00000000ULL) { + x >>= 32; + c += 32; + } + if (x & 0x00000000ffff0000ULL) { + x >>= 16; + c += 16; + } + if (x & 0x000000000000ff00ULL) { + x >>= 8; + c += 8; + } + if (x & 0x00000000000000f0ULL) { + x >>= 4; + c += 4; + } + if (x & 0x000000000000000cULL) { + x >>= 2; + c += 2; + } + if (x & 0x0000000000000002ULL) { c += 1; } + return c; +#endif +} + +// Count the number of 1 bits. +INLINE unsigned int popcnt(uint64_t x) +{ +#if defined(__GNUC__) || defined(__clang__) + return (unsigned int)__builtin_popcountll(x); +#else + unsigned int count = 0; + while (x != 0) { + count += 1; + x &= x - 1; + } + return count; +#endif +} + +// Largest power of two less than or equal to x. As a special case, returns 1 +// when x is 0. 
+INLINE uint64_t round_down_to_power_of_2(uint64_t x) { return 1ULL << highest_one(x | 1); } + +INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; } + +INLINE uint32_t counter_high(uint64_t counter) { return (uint32_t)(counter >> 32); } + +INLINE uint32_t load32(const void* src) +{ + const uint8_t* p = (const uint8_t*)src; + return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); +} + +INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], uint32_t key_words[8]) +{ + key_words[0] = load32(&key[0 * 4]); + key_words[1] = load32(&key[1 * 4]); + key_words[2] = load32(&key[2 * 4]); + key_words[3] = load32(&key[3 * 4]); + key_words[4] = load32(&key[4 * 4]); + key_words[5] = load32(&key[5 * 4]); + key_words[6] = load32(&key[6 * 4]); + key_words[7] = load32(&key[7 * 4]); +} + +INLINE void load_block_words(const uint8_t block[BLAKE3_BLOCK_LEN], uint32_t block_words[16]) +{ + for (size_t i = 0; i < 16; i++) { + block_words[i] = load32(&block[i * 4]); + } +} + +INLINE void store32(void* dst, uint32_t w) +{ + uint8_t* p = (uint8_t*)dst; + p[0] = (uint8_t)(w >> 0); + p[1] = (uint8_t)(w >> 8); + p[2] = (uint8_t)(w >> 16); + p[3] = (uint8_t)(w >> 24); +} + +INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) +{ + store32(&bytes_out[0 * 4], cv_words[0]); + store32(&bytes_out[1 * 4], cv_words[1]); + store32(&bytes_out[2 * 4], cv_words[2]); + store32(&bytes_out[3 * 4], cv_words[3]); + store32(&bytes_out[4 * 4], cv_words[4]); + store32(&bytes_out[5 * 4], cv_words[5]); + store32(&bytes_out[6 * 4], cv_words[6]); + store32(&bytes_out[7 * 4], cv_words[7]); +} + +void blake3_compress_in_place( + uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags); + +void blake3_compress_xof( + const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, + uint64_t counter, + uint8_t flags, + uint8_t out[64]); + +void 
blake3_xof_many( + const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, + uint64_t counter, + uint8_t flags, + uint8_t out[64], + size_t outblocks); + +void blake3_hash_many( + const uint8_t* const* inputs, + size_t num_inputs, + size_t blocks, + const uint32_t key[8], + uint64_t counter, + bool increment_counter, + uint8_t flags, + uint8_t flags_start, + uint8_t flags_end, + uint8_t* out); + +size_t blake3_simd_degree(void); + +// Declarations for implementation-specific functions. +void blake3_compress_in_place_portable( + uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags); + +void blake3_compress_xof_portable( + const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, + uint64_t counter, + uint8_t flags, + uint8_t out[64]); + +void blake3_hash_many_portable( + const uint8_t* const* inputs, + size_t num_inputs, + size_t blocks, + const uint32_t key[8], + uint64_t counter, + bool increment_counter, + uint8_t flags, + uint8_t flags_start, + uint8_t flags_end, + uint8_t* out); + +#endif /* BLAKE3_IMPL_H */ diff --git a/icicle/backend/cpu/src/hash/blake3_portable.c b/icicle/backend/cpu/src/hash/blake3_portable.c new file mode 100644 index 0000000000..9ef1293418 --- /dev/null +++ b/icicle/backend/cpu/src/hash/blake3_portable.c @@ -0,0 +1,176 @@ +/*BLAKE3 Hash function based on the original design by the BLAKE3 team https://github.com/BLAKE3-team/BLAKE3 */ + +#include "blake3_impl.h" +#include + +INLINE uint32_t rotr32(uint32_t w, uint32_t c) { return (w >> c) | (w << (32 - c)); } + +INLINE void g(uint32_t* state, size_t a, size_t b, size_t c, size_t d, uint32_t x, uint32_t y) +{ + state[a] = state[a] + state[b] + x; + state[d] = rotr32(state[d] ^ state[a], 16); + state[c] = state[c] + state[d]; + state[b] = rotr32(state[b] ^ state[c], 12); + state[a] = state[a] + state[b] + y; + state[d] = rotr32(state[d] ^ state[a], 8); + state[c] = state[c] + state[d]; + 
state[b] = rotr32(state[b] ^ state[c], 7); +} + +INLINE void round_fn(uint32_t state[16], const uint32_t* msg, size_t round) +{ + // Select the message schedule based on the round. + const uint8_t* schedule = MSG_SCHEDULE[round]; + + // Mix the columns. + g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); + g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); + g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); + g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); + + // Mix the rows. + g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); + g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); + g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); + g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); +} + +INLINE void compress_pre( + uint32_t state[16], + const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, + uint64_t counter, + uint8_t flags) +{ + uint32_t block_words[16]; + block_words[0] = load32(block + 4 * 0); + block_words[1] = load32(block + 4 * 1); + block_words[2] = load32(block + 4 * 2); + block_words[3] = load32(block + 4 * 3); + block_words[4] = load32(block + 4 * 4); + block_words[5] = load32(block + 4 * 5); + block_words[6] = load32(block + 4 * 6); + block_words[7] = load32(block + 4 * 7); + block_words[8] = load32(block + 4 * 8); + block_words[9] = load32(block + 4 * 9); + block_words[10] = load32(block + 4 * 10); + block_words[11] = load32(block + 4 * 11); + block_words[12] = load32(block + 4 * 12); + block_words[13] = load32(block + 4 * 13); + block_words[14] = load32(block + 4 * 14); + block_words[15] = load32(block + 4 * 15); + + state[0] = cv[0]; + state[1] = cv[1]; + state[2] = cv[2]; + state[3] = cv[3]; + state[4] = cv[4]; + state[5] = cv[5]; + state[6] = cv[6]; + state[7] = cv[7]; + state[8] = IV[0]; + state[9] = IV[1]; + state[10] = IV[2]; + state[11] = IV[3]; + state[12] = counter_low(counter); + state[13] = counter_high(counter); + state[14] = 
(uint32_t)block_len; + state[15] = (uint32_t)flags; + + round_fn(state, &block_words[0], 0); + round_fn(state, &block_words[0], 1); + round_fn(state, &block_words[0], 2); + round_fn(state, &block_words[0], 3); + round_fn(state, &block_words[0], 4); + round_fn(state, &block_words[0], 5); + round_fn(state, &block_words[0], 6); +} + +void blake3_compress_in_place_portable( + uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) +{ + uint32_t state[16]; + compress_pre(state, cv, block, block_len, counter, flags); + cv[0] = state[0] ^ state[8]; + cv[1] = state[1] ^ state[9]; + cv[2] = state[2] ^ state[10]; + cv[3] = state[3] ^ state[11]; + cv[4] = state[4] ^ state[12]; + cv[5] = state[5] ^ state[13]; + cv[6] = state[6] ^ state[14]; + cv[7] = state[7] ^ state[15]; +} + +void blake3_compress_xof_portable( + const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, + uint64_t counter, + uint8_t flags, + uint8_t out[64]) +{ + uint32_t state[16]; + compress_pre(state, cv, block, block_len, counter, flags); + + store32(&out[0 * 4], state[0] ^ state[8]); + store32(&out[1 * 4], state[1] ^ state[9]); + store32(&out[2 * 4], state[2] ^ state[10]); + store32(&out[3 * 4], state[3] ^ state[11]); + store32(&out[4 * 4], state[4] ^ state[12]); + store32(&out[5 * 4], state[5] ^ state[13]); + store32(&out[6 * 4], state[6] ^ state[14]); + store32(&out[7 * 4], state[7] ^ state[15]); + store32(&out[8 * 4], state[8] ^ cv[0]); + store32(&out[9 * 4], state[9] ^ cv[1]); + store32(&out[10 * 4], state[10] ^ cv[2]); + store32(&out[11 * 4], state[11] ^ cv[3]); + store32(&out[12 * 4], state[12] ^ cv[4]); + store32(&out[13 * 4], state[13] ^ cv[5]); + store32(&out[14 * 4], state[14] ^ cv[6]); + store32(&out[15 * 4], state[15] ^ cv[7]); +} + +INLINE void hash_one_portable( + const uint8_t* input, + size_t blocks, + const uint32_t key[8], + uint64_t counter, + uint8_t flags, + uint8_t flags_start, + uint8_t flags_end, + 
uint8_t out[BLAKE3_OUT_LEN]) +{ + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { block_flags |= flags_end; } + blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter, block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + store_cv_words(out, cv); +} + +void blake3_hash_many_portable( + const uint8_t* const* inputs, + size_t num_inputs, + size_t blocks, + const uint32_t key[8], + uint64_t counter, + bool increment_counter, + uint8_t flags, + uint8_t flags_start, + uint8_t flags_end, + uint8_t* out) +{ + while (num_inputs > 0) { + hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start, flags_end, out); + if (increment_counter) { counter += 1; } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} diff --git a/icicle/backend/cpu/src/hash/cpu_blake3.cpp b/icicle/backend/cpu/src/hash/cpu_blake3.cpp new file mode 100644 index 0000000000..9afe6674b1 --- /dev/null +++ b/icicle/backend/cpu/src/hash/cpu_blake3.cpp @@ -0,0 +1,53 @@ +/*BLAKE3 Hash function based on the original design by the BLAKE3 team https://github.com/BLAKE3-team/BLAKE3 */ + +#include "blake3.h" +#include "icicle/backend/hash/blake3_backend.h" +#include "icicle/utils/modifiers.h" +#include +#include +#include + +namespace icicle { + + class Blake3BackendCPU : public HashBackend + { + public: + Blake3BackendCPU(uint64_t input_chunk_size) : HashBackend("Blake3-CPU", BLAKE3_OUTBYTES, input_chunk_size) {} + + eIcicleError hash(const std::byte* input, uint64_t size, const HashConfig& config, std::byte* output) const override + { + const auto digest_size_in_bytes = output_size(); + const auto single_input_size = get_single_chunk_size(size); + + // Initialize the hasher + blake3_hasher hasher; + + for (unsigned batch_idx = 0; batch_idx < config.batch; ++batch_idx) { + const std::byte* batch_input = input + batch_idx * 
single_input_size; + uint64_t batch_size = single_input_size; + + blake3_hasher_init(&hasher); + blake3_hasher_update(&hasher, reinterpret_cast(batch_input), batch_size); + + uint8_t* batch_output = reinterpret_cast(output + batch_idx * digest_size_in_bytes); + blake3_hasher_finalize(&hasher, batch_output, digest_size_in_bytes); + } + + return eIcicleError::SUCCESS; + } + + private: + static constexpr unsigned int BLAKE3_OUTBYTES = 32; // BLAKE3 default output size in bytes + }; + + /************************ Blake3 registration ************************/ + eIcicleError + create_blake3_hash_backend(const Device& device, uint64_t input_chunk_size, std::shared_ptr& backend) + { + backend = std::make_shared(input_chunk_size); + return eIcicleError::SUCCESS; + } + + REGISTER_BLAKE3_FACTORY_BACKEND("CPU", create_blake3_hash_backend); + +} // namespace icicle \ No newline at end of file diff --git a/icicle/backend/cpu/src/hash/cpu_poseidon2.cpp b/icicle/backend/cpu/src/hash/cpu_poseidon2.cpp index 5f664860e7..df1c519cd6 100644 --- a/icicle/backend/cpu/src/hash/cpu_poseidon2.cpp +++ b/icicle/backend/cpu/src/hash/cpu_poseidon2.cpp @@ -147,7 +147,7 @@ namespace icicle { ICICLE_LOG_ERROR << "cpu_poseidon2_init_default_constants: T (width) must be one of [2, 3, 4, 8, 12, 16, 20, 24]\n"; return eIcicleError::INVALID_ARGUMENT; - } // switch (T) { + } // switch (T) { if (full_rounds == 0 && partial_rounds == 0) { // All arrays are empty in this case. 
continue; } diff --git a/icicle/cmake/hash.cmake b/icicle/cmake/hash.cmake index 6d43c3e034..e8fc0d97c0 100644 --- a/icicle/cmake/hash.cmake +++ b/icicle/cmake/hash.cmake @@ -5,6 +5,7 @@ function(setup_hash_target) target_sources(icicle_hash PRIVATE src/hash/keccak.cpp src/hash/blake2s.cpp + src/hash/blake3.cpp src/hash/merkle_tree.cpp src/hash/hash_c_api.cpp src/hash/merkle_c_api.cpp diff --git a/icicle/include/icicle/backend/hash/blake3_backend.h b/icicle/include/icicle/backend/hash/blake3_backend.h new file mode 100644 index 0000000000..f9b5825946 --- /dev/null +++ b/icicle/include/icicle/backend/hash/blake3_backend.h @@ -0,0 +1,25 @@ +#pragma once + +#include +#include "icicle/utils/utils.h" +#include "icicle/device.h" +#include "icicle/hash/blake3.h" + +namespace icicle { + + /*************************** Backend registration ***************************/ + using Blake3FactoryImpl = std::function& backend /*OUT*/)>; + + // Blake3 256 + void register_blake3_factory(const std::string& deviceType, Blake3FactoryImpl impl); + +#define REGISTER_BLAKE3_FACTORY_BACKEND(DEVICE_TYPE, FUNC) \ + namespace { \ + static bool UNIQUE(_reg_blake3) = []() -> bool { \ + register_blake3_factory(DEVICE_TYPE, FUNC); \ + return true; \ + }(); \ + } + +} // namespace icicle \ No newline at end of file diff --git a/icicle/include/icicle/fields/quartic_extension.h b/icicle/include/icicle/fields/quartic_extension.h index 784298cac9..6478e915c6 100644 --- a/icicle/include/icicle/fields/quartic_extension.h +++ b/icicle/include/icicle/fields/quartic_extension.h @@ -241,7 +241,7 @@ class QuarticExtensionField FF::reduce( (CONFIG::nonresidue_is_negative ? (FF::mul_wide(xs.real, x0) + FF::template mul_unsigned(FF::mul_wide(xs.im2, x2))) - : (FF::mul_wide(xs.real, x0)) - FF::template mul_unsigned(FF::mul_wide(xs.im2, x2)))), + : (FF::mul_wide(xs.real, x0))-FF::template mul_unsigned(FF::mul_wide(xs.im2, x2)))), FF::reduce( (CONFIG::nonresidue_is_negative ? 
FWide::neg(FF::template mul_unsigned(FF::mul_wide(xs.im3, x2))) diff --git a/icicle/include/icicle/fields/storage.h b/icicle/include/icicle/fields/storage.h index 76245db166..097db881fd 100644 --- a/icicle/include/icicle/fields/storage.h +++ b/icicle/include/icicle/fields/storage.h @@ -16,8 +16,7 @@ struct #ifdef __CUDA_ARCH__ __align__(LIMBS_ALIGNMENT(1)) #endif - storage<1> -{ + storage<1> { static constexpr unsigned LC = 1; uint32_t limbs[1]; }; @@ -28,8 +27,7 @@ struct #ifdef __CUDA_ARCH__ __align__(LIMBS_ALIGNMENT(1)) #endif - storage<3> -{ + storage<3> { static constexpr unsigned LC = 3; uint32_t limbs[3]; }; @@ -40,8 +38,7 @@ struct #ifdef __CUDA_ARCH__ __align__(LIMBS_ALIGNMENT(LIMBS_COUNT)) #endif - storage -{ + storage { static_assert(LIMBS_COUNT % 2 == 0, "odd number of limbs is not supported\n"); static constexpr unsigned LC = LIMBS_COUNT; union { // works only with even LIMBS_COUNT @@ -55,7 +52,6 @@ struct #ifdef __CUDA_ARCH__ __align__(LIMBS_ALIGNMENT(LIMBS_COUNT)) #endif - storage_array -{ + storage_array { storage storages[OMEGAS_COUNT]; }; \ No newline at end of file diff --git a/icicle/include/icicle/hash/blake3.h b/icicle/include/icicle/hash/blake3.h new file mode 100644 index 0000000000..089122572d --- /dev/null +++ b/icicle/include/icicle/hash/blake3.h @@ -0,0 +1,20 @@ +#pragma once + +#include "icicle/hash/hash.h" + +namespace icicle { + + /** + * @brief Creates a Blake3 hash object. + * + * This function constructs a Hash object configured for Blake3, with the + * appropriate backend selected based on the current device. + * + * @param input_chunk_size size of input in bytes for the Blake3 hash. + * @return Hash object encapsulating the Blake3 backend. 
+ */ + Hash create_blake3_hash(uint64_t input_chunk_size = 0); + struct Blake3 { + inline static Hash create(uint64_t input_chunk_size = 0) { return create_blake3_hash(input_chunk_size); } + }; +} // namespace icicle \ No newline at end of file diff --git a/icicle/src/hash/blake3.cpp b/icicle/src/hash/blake3.cpp new file mode 100644 index 0000000000..3279fb0155 --- /dev/null +++ b/icicle/src/hash/blake3.cpp @@ -0,0 +1,17 @@ +#include "icicle/errors.h" +#include "icicle/backend/hash/blake3_backend.h" +#include "icicle/dispatcher.h" + +namespace icicle { + + // Blake3 + ICICLE_DISPATCHER_INST(Blake3Dispatcher, blake3_factory, Blake3FactoryImpl); + + Hash create_blake3_hash(uint64_t input_chunk_size) + { + std::shared_ptr backend; + ICICLE_CHECK(Blake3Dispatcher::execute(input_chunk_size, backend)); + Hash blake3{backend}; + return blake3; + } +} // namespace icicle \ No newline at end of file diff --git a/icicle/src/hash/hash_c_api.cpp b/icicle/src/hash/hash_c_api.cpp index 460b83d17c..820af1545f 100644 --- a/icicle/src/hash/hash_c_api.cpp +++ b/icicle/src/hash/hash_c_api.cpp @@ -3,6 +3,7 @@ #include "icicle/errors.h" #include "icicle/hash/keccak.h" #include "icicle/hash/blake2s.h" +#include "icicle/hash/blake3.h" extern "C" { // Define a type for the HasherHandle (which is a pointer to Hash) @@ -123,4 +124,17 @@ HasherHandle icicle_create_blake2s(uint64_t input_chunk_size) { return new icicle::Hash(icicle::create_blake2s_hash(input_chunk_size)); } + +/** + * @brief Creates a Blake3 hash object. + * + * This function constructs a Hash object configured for Blake3. + * + * @param input_chunk_size Size of the input in bytes for the Blake3 hash. + * @return HasherHandle A handle to the created Blake3 Hash object. 
+ */ +HasherHandle icicle_create_blake3(uint64_t input_chunk_size) +{ + return new icicle::Hash(icicle::create_blake3_hash(input_chunk_size)); +} } \ No newline at end of file diff --git a/icicle/tests/test_hash_api.cpp b/icicle/tests/test_hash_api.cpp index c04aaecebf..860e42e358 100644 --- a/icicle/tests/test_hash_api.cpp +++ b/icicle/tests/test_hash_api.cpp @@ -7,6 +7,7 @@ #include "icicle/hash/hash.h" #include "icicle/hash/keccak.h" #include "icicle/hash/blake2s.h" +#include "icicle/hash/blake3.h" #include "icicle/merkle/merkle_tree.h" #include "icicle/fields/field.h" @@ -93,6 +94,30 @@ TEST_F(HashApiTest, Blake2s) } } +TEST_F(HashApiTest, Blake3) +{ + // TODO: Add CUDA test, same as blake2s + auto config = default_hash_config(); + + const std::string input = + "Hello world I am blake3. This is a semi-long C++ test with a lot of characters. " + "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"; + const std::string expected_output = "4b71f2c5cb7c26da2ba67cc742228e55b66c8b64b2b250e7ccce6f7f6d17c9ae"; + + const uint64_t output_size = 32; + auto output = std::make_unique(output_size); + for (const auto& device : s_registered_devices) { + ICICLE_LOG_DEBUG << "Blake2s test on device=" << device; + ICICLE_CHECK(icicle_set_device("CPU")); + + auto blake3 = Blake3::create(); + ICICLE_CHECK(blake3.hash(input.data(), input.size() / config.batch, config, output.get())); + // Convert the output do a hex string and compare to expected output string + std::string output_as_str = voidPtrToHexString(output.get(), output_size); + ASSERT_EQ(output_as_str, expected_output); + } +} + TEST_F(HashApiTest, Keccak256Batch) { auto config = default_hash_config(); @@ -214,6 +239,54 @@ TEST_F(HashApiTest, Blake2sLarge) ICICLE_CHECK(icicle_free(d_output)); } +TEST_F(HashApiTest, Blake3Large) +{ + auto config = default_hash_config(); + config.batch = 1 << 8; + const unsigned chunk_size = 1 << 11; // 2KB chunks + const unsigned total_size = 
chunk_size * config.batch; + auto input = std::make_unique(total_size); + randomize((uint64_t*)input.get(), total_size / sizeof(uint64_t)); + + const uint64_t output_size = 32; + auto output_main = std::make_unique(output_size * config.batch); + auto output_main_case_2 = std::make_unique(output_size * config.batch); + auto output_ref = std::make_unique(output_size * config.batch); + + ICICLE_CHECK(icicle_set_device(IcicleTestBase::reference_device())); + auto blake3CPU = Blake3::create(); + START_TIMER(cpu_timer); + ICICLE_CHECK(blake3CPU.hash(input.get(), chunk_size, config, output_ref.get())); + END_TIMER(cpu_timer, "CPU blake3 large time", true); + + ICICLE_CHECK(icicle_set_device(IcicleTestBase::main_device())); + auto blake3MainDev = Blake3::create(); + + // test with host memory + START_TIMER(mainDev_timer); + config.are_inputs_on_device = false; + config.are_outputs_on_device = false; + ICICLE_CHECK(blake3MainDev.hash(input.get(), chunk_size, config, output_main.get())); + END_TIMER(mainDev_timer, "MainDev blake3 large time (on host memory)", true); + ASSERT_EQ(0, memcmp(output_main.get(), output_ref.get(), output_size * config.batch)); + + // test with device memory + std::byte *d_input = nullptr, *d_output = nullptr; + ICICLE_CHECK(icicle_malloc((void**)&d_input, total_size)); + ICICLE_CHECK(icicle_malloc((void**)&d_output, output_size * config.batch)); + ICICLE_CHECK(icicle_copy(d_input, input.get(), total_size)); + config.are_inputs_on_device = true; + config.are_outputs_on_device = true; + START_TIMER(mainDev_timer_device_mem); + ICICLE_CHECK(blake3MainDev.hash(d_input, chunk_size, config, d_output)); + END_TIMER(mainDev_timer_device_mem, "MainDev blake3 large time (on device memory)", true); + ICICLE_CHECK(icicle_copy(output_main_case_2.get(), d_output, output_size * config.batch)); + ASSERT_EQ(0, memcmp(output_main_case_2.get(), output_ref.get(), output_size * config.batch)); + + ICICLE_CHECK(icicle_free(d_input)); + 
ICICLE_CHECK(icicle_free(d_output)); +} + TEST_F(HashApiTest, sha3) { auto config = default_hash_config(); diff --git a/wrappers/golang/hash/blake3.go b/wrappers/golang/hash/blake3.go new file mode 100644 index 0000000000..bd985c2f3b --- /dev/null +++ b/wrappers/golang/hash/blake3.go @@ -0,0 +1,19 @@ +package hash + +// #cgo CFLAGS: -I./include/ +// #include "blake3.h" +import "C" +import ( + "github.com/ingonyama-zk/icicle/v3/wrappers/golang/runtime" +) + +func NewBlake3Hasher(inputChunkSize uint64) (Hasher, runtime.EIcicleError) { + h := C.icicle_create_blake3((C.ulong)(inputChunkSize)) + if h == nil { + return Hasher{handle: nil}, runtime.UnknownError + } + + return Hasher{ + handle: h, + }, runtime.Success +} diff --git a/wrappers/golang/hash/include/blake3.h b/wrappers/golang/hash/include/blake3.h new file mode 100644 index 0000000000..f855f15b3c --- /dev/null +++ b/wrappers/golang/hash/include/blake3.h @@ -0,0 +1,17 @@ +#include +#include "hash.h" + +#ifndef _BLAKE3_HASH + #define _BLAKE3_HASH + + #ifdef __cplusplus +extern "C" { + #endif + +Hash* icicle_create_blake3(uint64_t default_input_chunk_size); + + #ifdef __cplusplus +} + #endif + +#endif diff --git a/wrappers/golang/hash/tests/hash_test.go b/wrappers/golang/hash/tests/hash_test.go index dbd32786b7..f424d7394d 100644 --- a/wrappers/golang/hash/tests/hash_test.go +++ b/wrappers/golang/hash/tests/hash_test.go @@ -99,6 +99,77 @@ func testBlake2s(s *suite.Suite) { s.NotEqual(outputEmpty, outputMain) } +func testBlake3_cpu_gpu(s *suite.Suite) { + singleHashInputSize := 567 + batch := 11 + const outputBytes = 32 // 32 bytes is output size of Blake3 + + input := make([]byte, singleHashInputSize*batch) + _, err := rand.Read(input) + if err != nil { + fmt.Println("error:", err) + return + } + + Blake3Hasher, error := hash.NewBlake3Hasher(0 /*default chunk size*/) + if error != runtime.Success { + fmt.Println("error:", error) + return + } + + outputRef := make([]byte, outputBytes*batch) + Blake3Hasher.Hash( + 
core.HostSliceFromElements(input), + core.HostSliceFromElements(outputRef), + core.GetDefaultHashConfig(), + ) + + runtime.SetDevice(&devices[1]) + Blake3Hasher, error = hash.NewBlake3Hasher(0 /*default chunk size*/) + if error != runtime.Success { + fmt.Println("error:", error) + return + } + + outputMain := make([]byte, outputBytes*batch) + Blake3Hasher.Hash( + core.HostSliceFromElements(input), + core.HostSliceFromElements(outputMain), + core.GetDefaultHashConfig(), + ) + + outputEmpty := make([]byte, outputBytes*batch) + s.Equal(outputRef, outputMain) + s.NotEqual(outputEmpty, outputMain) +} + +func testBlake3(s *suite.Suite) { + const outputBytes = 32 // 32 bytes is output size of Blake3 + + // Known input string and expected hash + inputString := "Hello world I am blake3. This is a semi-long Go test with a lot of characters. 0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + expectedHash := "a2b794acb5a604bbd2c4c0380e935697e0b934ea6f194b9f5246fbb212ebe549" + + input := []byte(inputString) + + Blake3Hasher, error := hash.NewBlake3Hasher(0 /*default chunk size*/) + if error != runtime.Success { + fmt.Println("error:", error) + return + } + + outputRef := make([]byte, outputBytes) + Blake3Hasher.Hash( + core.HostSliceFromElements(input), + core.HostSliceFromElements(outputRef), + core.GetDefaultHashConfig(), + ) + + outputRefHex := fmt.Sprintf("%x", outputRef) + + s.Equal(expectedHash, outputRefHex, "Hash mismatch: got %s, expected %s", outputRefHex, expectedHash) +} + func testSha3(s *suite.Suite) { singleHashInputSize := 1153 batch := 1 @@ -150,6 +221,8 @@ type HashTestSuite struct { func (s *HashTestSuite) TestHash() { s.Run("TestKeccakBatch", testWrapper(&s.Suite, testKeccakBatch)) s.Run("TestBlake2s", testWrapper(&s.Suite, testBlake2s)) + s.Run("TestBlake3_CPU_GPU", testWrapper(&s.Suite, testBlake3_cpu_gpu)) + s.Run("TestBlake3", testWrapper(&s.Suite, testBlake3)) s.Run("TestSha3", testWrapper(&s.Suite, 
testSha3)) } diff --git a/wrappers/rust/icicle-hash/src/blake3.rs b/wrappers/rust/icicle-hash/src/blake3.rs new file mode 100644 index 0000000000..923be6de22 --- /dev/null +++ b/wrappers/rust/icicle-hash/src/blake3.rs @@ -0,0 +1,18 @@ +use icicle_core::hash::{Hasher, HasherHandle}; +use icicle_runtime::errors::eIcicleError; + +extern "C" { + fn icicle_create_blake3(default_input_chunk_size: u64) -> HasherHandle; +} + +pub struct Blake3; + +impl Blake3 { + pub fn new(default_input_chunk_size: u64) -> Result<Hasher, eIcicleError> { + let handle: HasherHandle = unsafe { icicle_create_blake3(default_input_chunk_size) }; + if handle.is_null() { + return Err(eIcicleError::UnknownError); + } + Ok(Hasher::from_handle(handle)) + } +} diff --git a/wrappers/rust/icicle-hash/src/lib.rs b/wrappers/rust/icicle-hash/src/lib.rs index 67bf4dd183..df84faa31c 100644 --- a/wrappers/rust/icicle-hash/src/lib.rs +++ b/wrappers/rust/icicle-hash/src/lib.rs @@ -1,4 +1,5 @@ pub mod blake2s; +pub mod blake3; pub mod keccak; pub mod sha3; diff --git a/wrappers/rust/icicle-hash/src/tests.rs b/wrappers/rust/icicle-hash/src/tests.rs index 3618185b08..78f223d81b 100644 --- a/wrappers/rust/icicle-hash/src/tests.rs +++ b/wrappers/rust/icicle-hash/src/tests.rs @@ -3,6 +3,7 @@ mod tests { use crate::{ blake2s::Blake2s, + blake3::Blake3, keccak::{Keccak256, Keccak512}, sha3::Sha3_256, }; @@ -88,6 +89,72 @@ mod tests { assert_eq!(output_ref, output_main); } + #[test] + fn blake3_hashing_cpu_gpu() { + initialize(); + let single_hash_input_size = 567; + let batch = 11; + + let mut input = vec![0 as u8; single_hash_input_size * batch]; + rand::thread_rng().fill(&mut input[..]); + let mut output_ref = vec![0 as u8; 32 * batch]; // 32B (=256b) is the output size of blake3 + let mut output_main = vec![0 as u8; 32 * batch]; + + test_utilities::test_set_ref_device(); + let blake3_hasher = Blake3::new(0 /*default chunk size */).unwrap(); + blake3_hasher + .hash( + HostSlice::from_slice(&input), + &HashConfig::default(), + 
HostSlice::from_mut_slice(&mut output_ref), + ) + .unwrap(); + + test_utilities::test_set_main_device(); + let blake3_hasher = Blake3::new(0 /*default chunk size */).unwrap(); + blake3_hasher + .hash( + HostSlice::from_slice(&input), + &HashConfig::default(), + HostSlice::from_mut_slice(&mut output_main), + ) + .unwrap(); + assert_eq!(output_ref, output_main); + } + + #[test] + fn blake3_hashing() { + // Known input string and expected hash + let input_string = "Hello world I am blake3. This is a semi-long Rust test with a lot of characters. 0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"; + let expected_hash = "ee4941ff90437a4fd7489ffa6d559e644a68b2547e95a690949b902da128b273"; + + let input = input_string.as_bytes(); + let mut output_ref = vec![0u8; 32]; // 32B (=256b) is the output size of blake3 + + test_utilities::test_set_ref_device(); + let blake3_hasher = Blake3::new(0 /*default chunk size */).unwrap(); + blake3_hasher + .hash( + HostSlice::from_slice(&input), + &HashConfig::default(), + HostSlice::from_mut_slice(&mut output_ref), + ) + .unwrap(); + + // Convert output_ref to hex for comparison + let output_ref_hex: String = output_ref + .iter() + .map(|b| format!("{:02x}", b)) + .collect(); + assert_eq!( + output_ref_hex, expected_hash, + "Hash mismatch: got {}, expected {}", + output_ref_hex, expected_hash + ); + + println!("Test passed: Computed hash matches expected hash."); + } + #[test] fn sha3_hashing() { initialize(); From cb1cdcb37c7a10f1b19e29d28be4f6d39ac68f06 Mon Sep 17 00:00:00 2001 From: release-bot Date: Tue, 14 Jan 2025 15:00:58 +0000 Subject: [PATCH 12/12] Bump rust crates' version icicle-babybear@3.4.0 icicle-bls12-377@3.4.0 icicle-bls12-381@3.4.0 icicle-bn254@3.4.0 icicle-bw6-761@3.4.0 icicle-core@3.4.0 icicle-grumpkin@3.4.0 icicle-hash@3.4.0 icicle-koalabear@3.4.0 icicle-m31@3.4.0 icicle-runtime@3.4.0 icicle-stark252@3.4.0 Generated by cargo-workspaces --- wrappers/rust/Cargo.toml | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/wrappers/rust/Cargo.toml b/wrappers/rust/Cargo.toml index a1c705e56a..9495b8ca73 100644 --- a/wrappers/rust/Cargo.toml +++ b/wrappers/rust/Cargo.toml @@ -17,7 +17,7 @@ members = [ exclude = [] [workspace.package] -version = "3.3.0" +version = "3.4.0" edition = "2021" authors = [ "Ingonyama" ] homepage = "https://www.ingonyama.com"