Yosys optimizer pass error #1712

ludns · 2025-04-12T00:09:54Z

Runnig the Yosys optimizer pass (--yosys-optimizer=mode=Boolean) on this IR fails

module {
  func.func @matmul(%arg0: !secret.secret<memref<10x10xi16>>, %arg1: !secret.secret<memref<10x10xi16>>, %arg2: !secret.secret<memref<10x10xi16>>) attributes {llvm.linkage = #llvm.linkage<external>} {
    secret.generic ins(%arg0, %arg1, %arg2 : !secret.secret<memref<10x10xi16>>, !secret.secret<memref<10x10xi16>>, !secret.secret<memref<10x10xi16>>) {
    ^body(%input0: memref<10x10xi16>, %input1: memref<10x10xi16>, %input2: memref<10x10xi16>):
      affine.for %arg3 = 0 to 10 {
        affine.for %arg4 = 0 to 10 {
          affine.for %arg5 = 0 to 10 {
            %0 = affine.load %input0[%arg3, %arg5] : memref<10x10xi16>
            %1 = arith.extsi %0 : i16 to i32
            %2 = affine.load %input1[%arg5, %arg4] : memref<10x10xi16>
            %3 = arith.extsi %2 : i16 to i32
            %4 = arith.muli %1, %3 : i32
            %5 = affine.load %input2[%arg3, %arg4] : memref<10x10xi16>
            %6 = arith.trunci %4 : i32 to i16
            %7 = arith.addi %5, %6 : i16
            affine.store %7, %input2[%arg3, %arg4] : memref<10x10xi16>
          }
        }
      }
      secret.yield
    }
    return
  }
}

Error:

<stdin>:5:7: error: 'affine.for' op unable to find printer for op
      affine.for %arg3 = 0 to 10 {
      ^
<stdin>:5:7: note: see current operation: 
"affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (10)>}> ({
^bb0(%arg3: index):
  "affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (10)>}> ({
  ^bb0(%arg4: index):
    "affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (10)>}> ({
    ^bb0(%arg5: index):
      %0 = "affine.load"(%arg0, %arg3, %arg5) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<10x10xi16>, index, index) -> i16
      %1 = "arith.extsi"(%0) : (i16) -> i32
      %2 = "affine.load"(%arg1, %arg5, %arg4) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<10x10xi16>, index, index) -> i16
      %3 = "arith.extsi"(%2) : (i16) -> i32
      %4 = "arith.muli"(%1, %3) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
      %5 = "affine.load"(%arg2, %arg3, %arg4) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<10x10xi16>, index, index) -> i16
      %6 = "arith.trunci"(%4) : (i32) -> i16
      %7 = "arith.addi"(%5, %6) <{overflowFlags = #arith.overflow<none>}> : (i16, i16) -> i16
      "affine.store"(%7, %arg2, %arg3, %arg4) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (i16, memref<10x10xi16>, index, index) -> ()
      "affine.yield"() : () -> ()
    }) : () -> ()
    "affine.yield"() : () -> ()
  }) : () -> ()
  "affine.yield"() : () -> ()
}) : () -> ()
<stdin>:5:7: error: 'affine.for' op Failed to translate op affine.for
      affine.for %arg3 = 0 to 10 {
      ^
<stdin>:5:7: note: see current operation: 
"affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (10)>}> ({
^bb0(%arg3: index):
  "affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (10)>}> ({
  ^bb0(%arg4: index):
    "affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (10)>}> ({
    ^bb0(%arg5: index):
      %0 = "affine.load"(%arg0, %arg3, %arg5) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<10x10xi16>, index, index) -> i16
      %1 = "arith.extsi"(%0) : (i16) -> i32
      %2 = "affine.load"(%arg1, %arg5, %arg4) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<10x10xi16>, index, index) -> i16
      %3 = "arith.extsi"(%2) : (i16) -> i32
      %4 = "arith.muli"(%1, %3) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
      %5 = "affine.load"(%arg2, %arg3, %arg4) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<10x10xi16>, index, index) -> i16
      %6 = "arith.trunci"(%4) : (i32) -> i16
      %7 = "arith.addi"(%5, %6) <{overflowFlags = #arith.overflow<none>}> : (i16, i16) -> i16
      "affine.store"(%7, %arg2, %arg3, %arg4) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (i16, memref<10x10xi16>, index, index) -> ()
      "affine.yield"() : () -> ()
    }) : () -> ()
    "affine.yield"() : () -> ()
  }) : () -> ()
  "affine.yield"() : () -> ()
}) : () -> ()
<stdin>:1:1: error: 'func.func' op Failed to translate op func.func
module {
^
<stdin>:1:1: note: see current operation: 
"func.func"() <{function_type = (memref<10x10xi16>, memref<10x10xi16>, memref<10x10xi16>) -> memref<10x10xi16>, sym_name = "internal_generic_9731623302262804957", sym_visibility = "private"}> ({
^bb0(%arg0: memref<10x10xi16>, %arg1: memref<10x10xi16>, %arg2: memref<10x10xi16>):
  "affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (10)>}> ({
  ^bb0(%arg3: index):
    "affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (10)>}> ({
    ^bb0(%arg4: index):
      "affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (10)>}> ({
      ^bb0(%arg5: index):
        %0 = "affine.load"(%arg0, %arg3, %arg5) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<10x10xi16>, index, index) -> i16
        %1 = "arith.extsi"(%0) : (i16) -> i32
        %2 = "affine.load"(%arg1, %arg5, %arg4) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<10x10xi16>, index, index) -> i16
        %3 = "arith.extsi"(%2) : (i16) -> i32
        %4 = "arith.muli"(%1, %3) <{overflowFlags = #arith.overflow<none>}> : (i32, i32) -> i32
        %5 = "affine.load"(%arg2, %arg3, %arg4) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (memref<10x10xi16>, index, index) -> i16
        %6 = "arith.trunci"(%4) : (i32) -> i16
        %7 = "arith.addi"(%5, %6) <{overflowFlags = #arith.overflow<none>}> : (i16, i16) -> i16
        "affine.store"(%7, %arg2, %arg3, %arg4) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (i16, memref<10x10xi16>, index, index) -> ()
        "affine.yield"() : () -> ()
      }) : () -> ()
      "affine.yield"() : () -> ()
    }) : () -> ()
    "affine.yield"() : () -> ()
  }) : () -> ()
  "func.return"(%arg2) : (memref<10x10xi16>) -> ()
}) : () -> ()
<stdin>:3:5: error: failed to translate function.
    secret.generic ins(%arg0, %arg1, %arg2 : !secret.secret<memref<10x10xi16>>, !secret.secret<memref<10x10xi16>>, !secret.secret<memref<10x10xi16>>) {
    ^
<stdin>:3:5: note: see current operation: 
%0 = "secret.generic"(%arg0, %arg1, %arg2) ({
^bb0(%arg3: memref<10x10xi16>, %arg4: memref<10x10xi16>, %arg5: memref<10x10xi16>):
  %1 = "func.call"(%arg3, %arg4, %arg5) <{callee = @internal_generic_9731623302262804957}> : (memref<10x10xi16>, memref<10x10xi16>, memref<10x10xi16>) -> memref<10x10xi16>
  "secret.yield"(%1) : (memref<10x10xi16>) -> ()
}) : (!secret.secret<memref<10x10xi16>>, !secret.secret<memref<10x10xi16>>, !secret.secret<memref<10x10xi16>>) -> !secret.secret<memref<10x10xi16>>
<stdin>:3:5: error: 'secret.generic' op Failed to translate op secret.generic
    secret.generic ins(%arg0, %arg1, %arg2 : !secret.secret<memref<10x10xi16>>, !secret.secret<memref<10x10xi16>>, !secret.secret<memref<10x10xi16>>) {
    ^
<stdin>:3:5: note: see current operation: 
%0 = "secret.generic"(%arg0, %arg1, %arg2) ({
^bb0(%arg3: memref<10x10xi16>, %arg4: memref<10x10xi16>, %arg5: memref<10x10xi16>):
  %1 = "func.call"(%arg3, %arg4, %arg5) <{callee = @internal_generic_9731623302262804957}> : (memref<10x10xi16>, memref<10x10xi16>, memref<10x10xi16>) -> memref<10x10xi16>
  "secret.yield"(%1) : (memref<10x10xi16>) -> ()
}) : (!secret.secret<memref<10x10xi16>>, !secret.secret<memref<10x10xi16>>, !secret.secret<memref<10x10xi16>>) -> !secret.secret<memref<10x10xi16>>
<stdin>:3:5: error: Failed to translate to verilog
    secret.generic ins(%arg0, %arg1, %arg2 : !secret.secret<memref<10x10xi16>>, !secret.secret<memref<10x10xi16>>, !secret.secret<memref<10x10xi16>>) {
    ^
<stdin>:3:5: note: see current operation: 
%0 = "secret.generic"(%arg0, %arg1, %arg2) ({
^bb0(%arg3: memref<10x10xi16>, %arg4: memref<10x10xi16>, %arg5: memref<10x10xi16>):
  %1 = "func.call"(%arg3, %arg4, %arg5) <{callee = @internal_generic_9731623302262804957}> : (memref<10x10xi16>, memref<10x10xi16>, memref<10x10xi16>) -> memref<10x10xi16>
  "secret.yield"(%1) : (memref<10x10xi16>) -> ()
}) : (!secret.secret<memref<10x10xi16>>, !secret.secret<memref<10x10xi16>>, !secret.secret<memref<10x10xi16>>) -> !secret.secret<memref<10x10xi16>>

The text was updated successfully, but these errors were encountered:

ludns · 2025-04-12T00:50:58Z

This one fails too. This is taken from loop_test.py in the frontend folder.

module {
  func.func @loopa(%arg0: !secret.secret<i64>) -> !secret.secret<i64> {
    %0 = secret.generic ins(%arg0 : !secret.secret<i64>) {
    ^body(%input0: i64):
      %c0_i32 = arith.constant 0 : i32
      %c1_i32 = arith.constant 1 : i32
      %1:2 = affine.for %arg1 = 1 to 5 iter_args(%arg2 = %input0, %arg3 = %c0_i32) -> (i64, i32) {
        %2 = arith.addi %arg2, %arg2 : i64
        %3 = arith.addi %arg3, %c1_i32 : i32
        affine.yield %2, %3 : i64, i32
      }
      secret.yield %1#0 : i64
    } -> !secret.secret<i64>
    return %0 : !secret.secret<i64>
  }
}

j2kun · 2025-04-12T04:14:19Z

The pass is meant to be run after a variety of pre-processing is applied first, corresponding to the --mlir-to-cggi pass pipeline.

However, when I try that on these IRs I get errors. Looks like the early one-shot-bufferize pass doesn't like secret.generic, and actually waits to wrap-generic after doing some bufferization stuff. In that case you should be using this (simpler) IR instead:

  func.func @matmul(%input0: memref<10x10xi16> {secret.secret}, %input1: memref<10x10xi16> {secret.secret}, %input2: memref<10x10xi16> {secret.secret}) attributes {llvm.linkage = #llvm.linkage<external>} {
    affine.for %arg3 = 0 to 10 {
      affine.for %arg4 = 0 to 10 {
        affine.for %arg5 = 0 to 10 {
          %0 = affine.load %input0[%arg3, %arg5] : memref<10x10xi16>
          %1 = arith.extsi %0 : i16 to i32
          %2 = affine.load %input1[%arg5, %arg4] : memref<10x10xi16>
          %3 = arith.extsi %2 : i16 to i32
          %4 = arith.muli %1, %3 : i32
          %5 = affine.load %input2[%arg3, %arg4] : memref<10x10xi16>
          %6 = arith.trunci %4 : i32 to i16
          %7 = arith.addi %5, %6 : i16
          affine.store %7, %input2[%arg3, %arg4] : memref<10x10xi16>
        }
      }
    }
    return
  }
}

But ignoring that I also see a familiar ERROR: Can't open input file _main/lib/Transforms/YosysOptimizer/yosys/techmap.v' for reading: No such file or directory, which I though we had resolved by adding _main when we upgraded bazel, but maybe yosys is not running in the same execroot during bazel run and bazel test?

I think in the next week or two @asraa plans to add CGGI to the python frontend, so in that case Asra could you make sure to kick the tires on these two IRs to iron out these kinks?

ludns · 2025-04-12T04:41:35Z

If that helps, the way I solve ERROR: Can't open input file _main/lib/Transforms/YosysOptimizer/yosys/techmap.v' for reading: No such file or directory is by setting a HEIR_YOSYS_SCRIPTS_DIR variable to points to a directory with techmap.v from Yosys.

j2kun · 2025-04-12T16:17:05Z

If that helps, the way I solve ERROR: Can't open input file _main/lib/Transforms/YosysOptimizer/yosys/techmap.v' for reading: No such file or directory is by setting a HEIR_YOSYS_SCRIPTS_DIR variable to points to a directory with techmap.v from Yosys.

Yes but bazel run is supposed to handle this all for you automatically (you have to do it manually when running the binary as a standalone)

asraa · 2025-04-14T19:52:01Z

Hey! @ludns I'll be working on integrating the whole mlir-to-cggi pipeline into the frontend.

The issue with the one that you posted is like Jeremy said, but the transforms need to happen before the wrap-generic is called.

ludns · 2025-04-14T22:51:47Z

Hi @asraa. Is there a way to break down the the mlir-to-cggi pipeline into multiple passes?

It's easier for me to get a sense of what's going on. As an example, here is how I sequence passes so far (for CGGI):
heir-opt --secretize --wrap-generic --convert-to-data-oblivious --yosys-optimizer=mode=Boolean --secret-distribute-generic --secret-to-cggi --cse --cggi-boolean-vectorize --cggi-to-tfhe-rust-bool --canonicalize --cse

When you say the transforms need to happen before wrap-generic happens, which transform are you referring to?

ludns · 2025-04-14T23:12:05Z

I am having a really hard time getting pretty basic affine dialect to transform all the way down to a Tfhe dialect.
As an example, here are two (simple) targets I use for testing.
You can see I am playing both with pure functions (the variance in this case) and memref.

Matrix multiplication

  func.func @matmul(%input0: memref<10x10xi16>, %input1: memref<10x10xi16>, %input2: memref<10x10xi16>) attributes {llvm.linkage = #llvm.linkage<external>} {
    affine.for %arg3 = 0 to 10 {
      affine.for %arg4 = 0 to 10 {
        affine.for %arg5 = 0 to 10 {
          %0 = affine.load %input0[%arg3, %arg5] : memref<10x10xi16>
          %1 = arith.extsi %0 : i16 to i32
          %2 = affine.load %input1[%arg5, %arg4] : memref<10x10xi16>
          %3 = arith.extsi %2 : i16 to i32
          %4 = arith.muli %1, %3 : i32
          %5 = affine.load %input2[%arg3, %arg4] : memref<10x10xi16>
          %6 = arith.trunci %4 : i32 to i16
          %7 = arith.addi %5, %6 : i16
          affine.store %7, %input2[%arg3, %arg4] : memref<10x10xi16>
        }
      }
    }
    return
  }

Variance calculation

// Calculate Variance of i16 elements, returning i64
// Uses the formula: Var(X) = E[X^2] - (E[X])^2 = (sum_sq / N) - (sum / N)^2
func.func @calculate_variance(%input_tensor: tensor<10xi16>) -> i64 {
    %c0_i64 = arith.constant 0 : i64
    %c10_i64 = arith.constant 10 : i64 // Size N
    %c0_index = arith.constant 0 : index
    %c10_index = arith.constant 10 : index
    %c1_index = arith.constant 1 : index

    // Initialize accumulators for sum and sum of squares
    %sum_init = arith.constant 0 : i64
    %sum_sq_init = arith.constant 0 : i64

    // Loop through the tensor to calculate sum and sum of squares
    %loop_result:2 = scf.for %iv = %c0_index to %c10_index step %c1_index iter_args(%sum_iter = %sum_init, %sum_sq_iter = %sum_sq_init) -> (i64, i64) {
        // Extract element
        %element_i16 = tensor.extract %input_tensor[%iv] : tensor<10xi16>
        // Extend to i64
        %element_i64 = arith.extsi %element_i16 : i16 to i64

        // Accumulate sum
        %next_sum = arith.addi %sum_iter, %element_i64 : i64

        // Calculate square
        %element_sq_i64 = arith.muli %element_i64, %element_i64 : i64
        // Accumulate sum of squares
        %next_sum_sq = arith.addi %sum_sq_iter, %element_sq_i64 : i64

        scf.yield %next_sum, %next_sum_sq : i64, i64
    }

    // Calculate E[X] = sum / N (Use loop result #0 directly)
    %mean = arith.divsi %loop_result#0, %c10_i64 : i64

    // Calculate E[X^2] = sum_sq / N (Use loop result #1 directly)
    %mean_sq_val = arith.divsi %loop_result#1, %c10_i64 : i64

    // Calculate (E[X])^2 = mean * mean
    %mean_pow2 = arith.muli %mean, %mean : i64

    // Calculate Variance = E[X^2] - (E[X])^2
    %variance = arith.subi %mean_sq_val, %mean_pow2 : i64

    return %variance : i64
}

ludns · 2025-04-15T02:04:52Z

(keeping track of what I encounter in case it's useful): I am also unable to get the Yosys pass working with multiple outputs:

heir-opt --secretize --wrap-generic --convert-to-data-oblivious --yosys-optimizer=mode=Boolean --secret-distribute-generic --secret-to-cggi --cse --cggi-to-tfhe-rust-bool --canonicalize --cse applied to

func.func @multi_output(%a: i16) -> (i16, i16) {
    %0 = arith.constant 2 : i16
    %2 = arith.addi %a, %0 : i16
    func.return %2, %2 : i16, i16
}

Fails with

<stdin>:1:1: error: 'func.call' op Failed to translate op func.call
func.func @multi_output(%a: i16) -> (i16, i16) {
^
<stdin>:1:1: note: see current operation: %1:2 = "func.call"(%arg1) <{callee = @internal_generic_8143693811150977432}> : (i16) -> (i16, i16)
<stdin>:1:1: error: 'secret.generic' op Failed to translate op secret.generic
func.func @multi_output(%a: i16) -> (i16, i16) {
^
<stdin>:1:1: note: see current operation: 
%0:2 = "secret.generic"(%arg0) ({
^bb0(%arg1: i16):
  %1:2 = "func.call"(%arg1) <{callee = @internal_generic_8143693811150977432}> : (i16) -> (i16, i16)
  "secret.yield"(%1#0, %1#1) : (i16, i16) -> ()
}) : (!secret.secret<i16>) -> (!secret.secret<i16>, !secret.secret<i16>)
<stdin>:1:1: error: Failed to translate to verilog
func.func @multi_output(%a: i16) -> (i16, i16) {
^
<stdin>:1:1: note: see current operation: 
%0:2 = "secret.generic"(%arg0) ({
^bb0(%arg1: i16):
  %1:2 = "func.call"(%arg1) <{callee = @internal_generic_8143693811150977432}> : (i16) -> (i16, i16)
  "secret.yield"(%1#0, %1#1) : (i16, i16) -> ()
}) : (!secret.secret<i16>) -> (!secret.secret<i16>, !secret.secret<i16>)

asraa · 2025-04-15T15:34:39Z

Hi @asraa. Is there a way to break down the the mlir-to-cggi pipeline into multiple passes?

Oh! You may want to use the --mlir-print-ir-after-all flag. This will emit the IR after each pass in the pipeline, so you can get a sequence of what the IR transforms are. But it emits a lot (and so can be a little slow) so you will want to pipe it into a file to inspect.

When you say the transforms need to happen before wrap-generic happens, which transform are you referring to?

OK for @loopa - you will need to start from the IR with just the secret.secret annotations.

module {
  func.func @loopa(%arg0: i64 {secret.secret}) -> i64 {
    %c0_i32 = arith.constant 0 : i32
    %c1_i32 = arith.constant 1 : i32
    %0:2 = affine.for %arg1 = 1 to 5 iter_args(%arg2 = %arg0, %arg3 = %c0_i32) -> (i64, i32) {
      %1 = arith.addi %arg2, %arg2 : i64
      %2 = arith.addi %arg3, %c1_i32 : i32
      affine.yield %1, %2 : i64, i32
    }
    return %0#0 : i64
  }
}

Running --mlir-to-cggi=mode=Boolean on this will emit the CGGI code (it's just a mul by 4 so really it's just shifting the bits).

For the larger matmul example you posted, the default compilation path will take quite a while since it will (by default) unroll all the loops before it passes the IR to Yosys, and the bitwidths are fairly large. I have a suggestions, but first like before, use the secret annotated version. If you start with one that already has the secret.generic, then the bufferization passes in the start of --mlir-to-cggi will fail because they won't know how to bufferize the generic. That is what I meant by defer wrap-generic until after that. (Your sequence actually looks OK since it skips the bufferization and the program already uses memref semantics).

module {
  func.func @matmul(%arg0: memref<10x10xi16> {secret.secret}, %arg1: memref<10x10xi16> {secret.secret}, %arg2: memref<10x10xi16> {secret.secret}) attributes {llvm.linkage = #llvm.linkage<external>} {
    affine.for %arg3 = 0 to 10 {
      affine.for %arg4 = 0 to 10 {
        affine.for %arg5 = 0 to 10 {
          %0 = affine.load %arg0[%arg3, %arg5] : memref<10x10xi16>
          %1 = arith.extsi %0 : i16 to i32
          %2 = affine.load %arg1[%arg5, %arg4] : memref<10x10xi16>
          %3 = arith.extsi %2 : i16 to i32
          %4 = arith.muli %1, %3 : i32
          %5 = affine.load %arg2[%arg3, %arg4] : memref<10x10xi16>
          %6 = arith.trunci %4 : i32 to i16
          %7 = arith.addi %5, %6 : i16
          affine.store %7, %arg2[%arg3, %arg4] : memref<10x10xi16>
        }
      }
    }
    return
  }
}

If you run --mlir-to-cggi=mode=Boolean on this IR you will have a really long compilation time because of the full loop unroll. Yosys has a configuration for an unroll-factor option, but we haven't properly piped that into the mlir-to-cggi pipeline.

So for a faster compilation, without loop unrolling (which is part of the mlir-to-cggi pre-passes for now), I would want to run something like heir-opt --canonicalize --cse --wrap-generic --secret-distribute-generic="distribute-through=affine.for" --yosys-optimizer="mode=Boolean unroll-factor=0" --secret-distribute-generic --secret-to-cggi --canonicalize --cse but unfortunately out --secret-distribute-generic isn't handling the affine read/writes properly.

To be honest, this pipeline either expects smaller usecases or linalg matvecs for matrix-vector products.

I'm happy to pick apart making this easier to compile.

asraa · 2025-04-15T15:36:24Z

I am also unable to get the Yosys pass working with multiple outputs

That is intended, actually. We won't compile yosys circuits with multiple outputs - maybe you can file a separate issue for that.

There's also the arith-to-cggi passes that are intended to target a high level tfhe-rs API.

asraa · 2025-04-15T15:46:02Z

For your variance example, change the scf loop to an affine loop, and it should work with the arith::DivSIOp printer added (will tag you in the PR):

// Calculate Variance of i16 elements, returning i64
// Uses the formula: Var(X) = E[X^2] - (E[X])^2 = (sum_sq / N) - (sum / N)^2
func.func @calculate_variance(%input_tensor: tensor<10xi16> {secret.secret}) -> i64 {
    %c0_i64 = arith.constant 0 : i64
    %c10_i64 = arith.constant 10 : i64 // Size N
    %c0_index = arith.constant 0 : index
    %c10_index = arith.constant 10 : index
    %c1_index = arith.constant 1 : index

    // Initialize accumulators for sum and sum of squares
    %sum_init = arith.constant 0 : i64
    %sum_sq_init = arith.constant 0 : i64

    // Loop through the tensor to calculate sum and sum of squares
    %loop_result:2 = affine.for %iv = 0 to 10 iter_args(%sum_iter = %sum_init, %sum_sq_iter = %sum_sq_init) -> (i64, i64) {
        // Extract element
        %element_i16 = tensor.extract %input_tensor[%iv] : tensor<10xi16>
        // Extend to i64
        %element_i64 = arith.extsi %element_i16 : i16 to i64

        // Accumulate sum
        %next_sum = arith.addi %sum_iter, %element_i64 : i64

        // Calculate square
        %element_sq_i64 = arith.muli %element_i64, %element_i64 : i64
        // Accumulate sum of squares
        %next_sum_sq = arith.addi %sum_sq_iter, %element_sq_i64 : i64

        affine.yield %next_sum, %next_sum_sq : i64, i64
    }

    // Calculate E[X] = sum / N (Use loop result #0 directly)
    %mean = arith.divsi %loop_result#0, %c10_i64 : i64

    // Calculate E[X^2] = sum_sq / N (Use loop result #1 directly)
    %mean_sq_val = arith.divsi %loop_result#1, %c10_i64 : i64

    // Calculate (E[X])^2 = mean * mean
    %mean_pow2 = arith.muli %mean, %mean : i64

    // Calculate Variance = E[X^2] - (E[X])^2
    %variance = arith.subi %mean_sq_val, %mean_pow2 : i64

    return %variance : i64
}

I ran heir-opt --mlir-to-cggi=abc-fast=True on that input. You can also add the mode=Boolean if you need.

Part of #1712 PiperOrigin-RevId: 748290487

Part of #1712 PiperOrigin-RevId: 748395020

ludns · 2025-04-18T00:47:17Z

Thank you for all this information @asraa, I really appreciate.
A few questions:

Compiling large programs

To be honest, this pipeline either expects smaller usecases or linalg matvecs for matrix-vector products.

What is the recommended way to compile large programs to CGGI then?

Variance program

On the smaller variance example linked in this repo :
The CGGI pipeline succeeds, but the conversion to tfhe-rust-bool doesn't.
heir-opt --secretize --mlir-to-cggi=mode=Boolean --cse --cggi-to-tfhe-rust-bool --canonicalize --cse on the small variance fails with

<stdin>:1:1: error: failed to legalize unresolved materialization from ('memref<20x!lwe.lwe_ciphertext<encoding = #lwe.unspecified_bit_field_encoding<cleartext_bitwidth = 1>>>') to ('memref<20x!tfhe_rust_bool.eb>') that remained live after conversion
func.func @calculate_variance(%input_tensor: tensor<5xi4>) -> i32 {
^
<stdin>:1:1: note: see current operation: %35 = "builtin.unrealized_conversion_cast"(%34) : (memref<20x!lwe.lwe_ciphertext<encoding = #lwe.unspecified_bit_field_encoding<cleartext_bitwidth = 1>>>) -> memref<20x!tfhe_rust_bool.eb>
<stdin>:1:1: note: see existing live user here: %120 = memref.load %1[%c19] : memref<20x!tfhe_rust_bool.eb>

Matmul program

Compiling a much smaller matmul program (attached below) fails.
heir-opt --secretize --mlir-to-cggi=mode=Boolean --cse --cggi-to-tfhe-rust-bool --canonicalize --cse

module {
  func.func @matmul(%arg0: memref<10x10xi4>, %arg1: memref<10x10xi4>, %arg2: memref<10x10xi4>) attributes {llvm.linkage = #llvm.linkage<external>} {
    affine.for %arg3 = 0 to 10 {
      affine.for %arg4 = 0 to 10 {
        affine.for %arg5 = 0 to 10 {
          %0 = affine.load %arg0[%arg3, %arg5] : memref<10x10xi4>
          %1 = arith.extsi %0 : i4 to i8
          %2 = affine.load %arg1[%arg5, %arg4] : memref<10x10xi4>
          %3 = arith.extsi %2 : i4 to i8
          %4 = arith.muli %1, %3 : i8
          %5 = affine.load %arg2[%arg3, %arg4] : memref<10x10xi4>
          %6 = arith.trunci %4 : i8 to i4
          %7 = arith.addi %5, %6 : i4
          affine.store %7, %arg2[%arg3, %arg4] : memref<10x10xi4>
        }
      }
    }
    return
  }
}

Return code: -6
Stderr:
Assertion failed: (wireNameToValue.contains(name)), function getBit, file RTLILImporter.cpp, line 95.
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace.

Things that work now!

The loop example compiles now! Thank you.

j2kun · 2025-04-18T01:32:47Z

The main problem is that Yosys is slow to optimize large programs. We have a separate pipeline planned that, while compilation speed will be faster, the resulting program will be less performant. I'd hope in the long term we can make this tradeoff more finely controllable.

The secondary problem is that CGGI is just poorly suited to large bit width arithmetic. I don't think anyone knows of a good way around this problem, though there is likely more we can do in the compiler if we wanted to port more of the smarts in Zama's tfhe-rs upstream

ludns · 2025-04-18T01:36:44Z

Is there a way to compose MLIR func together in a way that Yosys can optimize individual functions while unoptimized routing is used across functions?

j2kun · 2025-04-18T03:34:26Z

Right now we already support something similar: running Yosys on the body of each secret. 7FC1 generic separately. IMO we haven't thought through a more comprehensive and clean design, say based on splitting apart a computation into separate functions (and letting the user annotate per function which method to use?) is that we haven't had a demand to scale to larger programs yet.

asraa · 2025-04-18T14:38:52Z

The CGGI pipeline succeeds, but the conversion to tfhe-rust-bool doesn't.

Yeah, I can fix that.

Compiling a much smaller matmul program (attached below) fails.

I ran into that as well, and the issue it seemed in the pipeline is a problem with how secret-distribute-generic ends up forming the body to booleanize. It believes there is not return value since we don't return a new array, we just update the arg in place. I have it on my plate to fix that issue as I integrate CGGI into the frontend.

IMO we haven't thought through a more comprehensive and clean design, say based on splitting apart a computation into separate functions (and letting the user annotate per function which method to use?) is that we haven't had a demand to scale to larger programs yet.

In addition to the non-Yosys pipeline that is currently in development (by @WoutLegiest), we also do have a secret.separator (search the tests for it) method that can force boundaries for Yosys compilation. But we haven't invested too much time in testing it because of how slow and unscalable Yosys is regardless.

j2kun added the bug Something isn't working label Apr 12, 2025

asraa mentioned this issue Apr 14, 2025

fix: fix path for yosys share files for bazel run #1718

Merged

This was referenced Apr 15, 2025

cggi: add verilog support for divsi op #1720

Merged

Yosys optimizer clean-ups and refactoring #1721

Open

cggi: support multiple outputs in verilog translation #1725

Merged

copybara-service bot pushed a commit that referenced this issue Apr 16, 2025

cggi: support multiple outputs in verilog translation

ff56dae

Part of #1712 PiperOrigin-RevId: 748290487

copybara-service bot pushed a commit that referenced this issue Apr 16, 2025

cggi: support multiple outputs in verilog translation

43d6287

Part of #1712 PiperOrigin-RevId: 748290487

copybara-service bot pushed a commit that referenced this issue Apr 16, 2025

cggi: support multiple outputs in verilog translation

6d646ef

Part of #1712 PiperOrigin-RevId: 748290487

copybara-service bot pushed a commit that referenced this issue Apr 16, 2025

cggi: support multiple outputs in verilog translation

a986c86

Part of #1712 PiperOrigin-RevId: 748290487

copybara-service bot pushed a commit that referenced this issue Apr 16, 2025

cggi: support multiple outputs in verilog translation

a24838a

Part of #1712 PiperOrigin-RevId: 748290487

copybara-service bot pushed a commit that referenced this issue Apr 16, 2025

cggi: support multiple outputs in verilog translation

d1fa697

Part of #1712 PiperOrigin-RevId: 748290487

copybara-service bot pushed a commit that referenced this issue Apr 16, 2025

cggi: support multiple outputs in verilog translation

cd790c8

Part of #1712 PiperOrigin-RevId: 748395020

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Yosys optimizer pass error #1712

Yosys optimizer pass error #1712

Yosys optimizer pass error #1712

Yosys optimizer pass error #1712

Comments

Compiling large programs

Variance program

Matmul program

Things that work now!