Matrix multiplication: missing values at the end of the output matrix

Good evening. I have run into a strange issue while implementing a non-vectorised matrix multiplication, where the AIE part is taken from the programming_examples. When I upscale an image up to a size of 224x224, everything works fine. But when I try to increase the final size beyond that, the resulting image is missing values at the end, so it has a black area at the bottom of the image. When I moved to the vectorised version, the problem went away, but I would like to understand the reason for this behaviour. Can you help me?

Here is the code I use

test.cpp

#include <algorithm>
#include <cassert>
#include <cfloat>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <exception>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <limits>
#include <sstream>
#include <stdexcept>
#include <stdfloat>
#include <string>
#include <vector>

#include <boost/program_options.hpp>
namespace po = boost::program_options;

// Include XRT libraries
#include "xrt/xrt_bo.h"
#include "xrt/xrt_device.h"
#include "xrt/xrt_kernel.h"

#include "utils.hpp"
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/imgproc/imgproc.hpp>

#include "OpenCVUtils.h"
// #include "test_utils.h"

// Ratio between the output and input matrix side lengths; main() derives
// IN_N as OUT_N / UPSCALE_FACTOR. The #ifndef guard lets an earlier
// definition (e.g. a compiler -D flag) take precedence over this default.
#ifndef UPSCALE_FACTOR
#define UPSCALE_FACTOR 2      // Default value used when nothing else defines it
#endif

// Side length of the square matrices handled by the host code; the device
// buffers are sized as OUT_N * OUT_N elements. Overridable like UPSCALE_FACTOR.
#ifndef OUT_N
#define OUT_N 256      // Default value used when nothing else defines it
#endif

// Element type used for the host-side matrices and the device buffers.
using MATRIX_DATATYPE = float;

/// Command-line options consumed by the host program.
///
/// parse_args() fills every field from the parsed option map. Each field has a
/// default member initializer so that a default-constructed `args` never holds
/// indeterminate values.
struct args {
  int verbosity = 0;            ///< Value of the "verbosity" option.
  int do_verify = 0;            ///< Value of the "verify" option (a bool stored as int).
  int n_iterations = 0;         ///< Value of the "iters" option.
  int n_warmup_iterations = 0;  ///< Value of the "warmup" option.
  int trace_size = 0;           ///< Value of the "trace_sz" option; used to size the trace buffer.
  std::string instr;            ///< Value of the "instr" option; path given to the instruction loader.
  std::string xclbin;           ///< Value of the "xclbin" option.
  std::string kernel;           ///< Value of the "kernel" option.
  std::string trace_file;       ///< Value of the "trace_file" option.
};


/**
 * @brief Build an `args` value from the command line.
 *
 * Registers the default option set through utils::add_default_options, parses
 * argv with utils::parse_options and copies each parsed option into the
 * corresponding field of the result.
 *
 * @param argc Number of command-line arguments.
 * @param argv Array of command-line argument strings.
 * @return The populated option structure.
 */
args parse_args(int argc, const char *argv[]) {
  po::options_description option_spec("Allowed options");
  utils::add_default_options(option_spec);

  po::variables_map parsed;
  utils::parse_options(argc, argv, option_spec, parsed);

  args result;

  // Numeric options.
  result.verbosity = parsed["verbosity"].as<int>();
  result.do_verify = parsed["verify"].as<bool>();
  result.n_iterations = parsed["iters"].as<int>();
  result.n_warmup_iterations = parsed["warmup"].as<int>();
  result.trace_size = parsed["trace_sz"].as<int>();

  // String options.
  result.instr = parsed["instr"].as<std::string>();
  result.xclbin = parsed["xclbin"].as<std::string>();
  result.kernel = parsed["kernel"].as<std::string>();
  result.trace_file = parsed["trace_file"].as<std::string>();

  return result;
}

/// Fill the first SIZE elements of bufIn with pseudo-random whole numbers in
/// [0, 9], drawing one std::rand() value per element in index order.
void initialize_bufIn_random(float *bufIn, int SIZE) {
  int idx = 0;
  while (idx < SIZE) {
    bufIn[idx] = static_cast<float>(std::rand() % 10);
    ++idx;
  }
}

/// Set the first SIZE elements of bufIn to val; does nothing when SIZE <= 0.
void initialize_bufIn(float *bufIn, int SIZE, float val) {
  std::fill_n(bufIn, SIZE, val);
}

/// Print a row-major rows x cols matrix to stdout: a blank line, the name
/// followed by ':', then one text line per row with each value formatted as
/// "%8.3f" and followed by a single space.
void print_matrix(const char* name, float* data, int rows, int cols) {
    printf("\n%s:\n", name);
    for (int r = 0; r < rows; ++r) {
        const int row_offset = r * cols;  // index of the first element of row r
        for (int c = 0; c < cols; ++c) {
            printf("%8.3f ", data[row_offset + c]);
        }
        printf("\n");
    }
}

/// Write a row-major rows x cols matrix to a text file, one row per line with
/// values separated by single spaces (no trailing space). If the file cannot
/// be opened, an error is reported on std::cerr and nothing is written.
void writeMatrixToFile(const std::vector<MATRIX_DATATYPE>& C, int rows, int cols, const std::string& filename) {
    std::ofstream sink(filename);
    if (sink.fail()) {
        std::cerr << "Failed to open file: " << filename << std::endl;
        return;
    }

    for (int r = 0; r < rows; ++r) {
        const int row_offset = r * cols;
        for (int c = 0; c < cols; ++c) {
            // Emit the separator before every value except the first one.
            if (c != 0) {
                sink << " ";
            }
            sink << C[row_offset + c];
        }
        sink << "\n";
    }

    sink.close();
}

/// Copy a flat float vector into a newly created rows x cols CV_32F matrix.
/// Asserts (via CV_Assert) that the vector holds exactly rows * cols elements.
cv::Mat vectorToMat(const std::vector<float>& vec, int rows, int cols) {
    CV_Assert(static_cast<int>(vec.size()) == rows * cols);

    cv::Mat result(rows, cols, CV_32F);
    const std::size_t byte_count = vec.size() * sizeof(float);
    std::memcpy(result.data, vec.data(), byte_count);
    return result;
}

/// Convert a CV_8UC1 image into a flat float vector, row by row, keeping the
/// original 0..255 values (no normalisation). Asserts on any other image type.
std::vector<float> matToFloatVector(const cv::Mat& img) {
    CV_Assert(img.type() == CV_8UC1);  // Ensure grayscale

    std::vector<float> pixels;
    pixels.reserve(img.total());

    for (int r = 0; r < img.rows; ++r) {
        const uchar* line = img.ptr<uchar>(r);
        // Range insert converts each uchar to float while appending the row.
        pixels.insert(pixels.end(), line, line + img.cols);
    }

    return pixels;
}

/// Create an N x N row-major matrix of pseudo-random values in
/// [min_val, max_val].
///
/// The generator is reseeded from the current time on every call, then one
/// std::rand() value is consumed per element. Integral T uses a modulo over
/// the inclusive range; other types scale rand()/RAND_MAX into the range.
template <typename T>
std::vector<T> createRandomMatrix(int N, T min_val = static_cast<T>(0), T max_val = static_cast<T>(10)) {
    std::srand(static_cast<unsigned int>(std::time(nullptr)));

    std::vector<T> cells(N * N);

    for (T& cell : cells) {
        if constexpr (std::is_integral_v<T>) {
            cell = min_val + (std::rand() % (max_val - min_val + 1));
        } else {
            const T unit = static_cast<T>(std::rand()) / static_cast<T>(RAND_MAX);
            cell = min_val + unit * (max_val - min_val);
        }
    }

    return cells;
}

/// Return the transpose of an N x N row-major matrix stored in a flat vector.
template <typename T>
std::vector<T> transposeMatrix(const std::vector<T>& matrix, int N) {
    std::vector<T> result(N * N);

    // Walk the destination in row-major order and pull from the mirrored cell.
    for (int dst_row = 0; dst_row < N; ++dst_row) {
        for (int dst_col = 0; dst_col < N; ++dst_col) {
            result[dst_row * N + dst_col] = matrix[dst_col * N + dst_row];
        }
    }

    return result;
}


/// Build the N x N DCT coefficient matrix as a flat row-major vector.
///
/// Row 0 holds sqrt(1/N) in every column; row i > 0 holds
/// sqrt(2/N) * cos(pi * (2j + 1) * i / (2N)) in column j.
template <typename T>
std::vector<T> createDCTMatrix(int N) {
    std::vector<T> coeffs(N * N);

    // Normalisation factors for the first row and for all remaining rows.
    const T dc_scale = std::sqrt(static_cast<T>(1.0) / static_cast<T>(N));
    const T ac_scale = std::sqrt(static_cast<T>(2.0) / static_cast<T>(N));

    // First row: constant entries.
    for (int col = 0; col < N; ++col) {
        coeffs[col] = dc_scale;
    }

    // Remaining rows: cosine entries.
    for (int row = 1; row < N; ++row) {
        for (int col = 0; col < N; ++col) {
            coeffs[row * N + col] = ac_scale * std::cos((M_PI * (2 * col + 1) * row) / (2.0 * N));
        }
    }

    return coeffs;
}

/// Zero the first SIZE elements of bufOut.
void initialize_bufOut(std::float_t *bufOut, int SIZE) {
  // memset takes a byte count, so scale the element count by the element size.
  const std::size_t byte_count = SIZE * sizeof(std::float_t);
  std::memset(bufOut, 0, byte_count);
}

/// Return the transpose of a rows x cols row-major matrix as a newly allocated
/// cols x rows array. The result comes from new[]; the caller must release it
/// with delete[].
float* transposeMatrix(const float* input, int rows, int cols) {
    float* result = new float[rows * cols];

    // Fill the destination in its own row-major order.
    for (int out_row = 0; out_row < cols; ++out_row) {
        for (int out_col = 0; out_col < rows; ++out_col) {
            result[out_row * rows + out_col] = input[out_col * cols + out_row];
        }
    }

    return result;
}


/// Print an N x N row-major matrix: the header ("\n<name>:\n") goes through
/// printf, the values through std::cout. Every value is written in fixed
/// notation with the given field width and precision, followed by one space;
/// each row ends with a newline.
template <typename T>
void printMatrix(const char* name, const std::vector<T>& matrix, int N, int width = 8, int precision = 3) {
    printf("\n%s:\n", name);
    for (int row = 0; row < N; ++row) {
        const int row_start = row * N;
        for (int col = 0; col < N; ++col) {
            // setw only applies to the next insertion, so it is re-applied per value.
            std::cout << std::setw(width) << std::setprecision(precision) << std::fixed;
            std::cout << matrix[row_start + col] << " ";
        }
        std::cout << "\n";
    }
}


/**
 * @brief Load the instruction stream and kernel, run one matrix product on the
 *        device and dump input and output matrices to text files.
 *
 * Steps performed: load the instruction binary, initialise the XRT device and
 * kernel, allocate OUT_N x OUT_N buffers for two inputs and one output, fill
 * both inputs with the same random matrix, launch the kernel once, then write
 * the input to "A_matrix.txt" and the result to "BA_matrix.txt".
 *
 * @param IN_N   Not referenced in the body.
 * @param debug  Not referenced in the body.
 * @param myargs Parsed command-line options (paths, kernel name, verbosity,
 *               trace size).
 * @return Always 0.
 */
int setup_and_run_aie(int IN_N,int debug, args myargs) {
    srand(time(NULL));
    std::vector<uint32_t> instr_v = utils::load_instr_binary(myargs.instr);
    // NOTE(review): this `if` has no braces, so it only guards the std::cout
    // statement on the next line despite that line not being indented.
    if (myargs.verbosity >= 1)
    std::cout << "Sequence instr count: " << instr_v.size() << "\n";
    
    // Initialize Device and Kernel
    xrt::device device;
    xrt::kernel kernel;
    utils::init_xrt_load_kernel(device, kernel, myargs.verbosity,
                          myargs.xclbin, myargs.kernel);
    
    // Number of elements in each OUT_N x OUT_N matrix.
    int MATRIX_VOLUME = OUT_N*OUT_N;

    // Create the buffer objects: instructions, two inputs, one output, plus a
    // 1-byte placeholder and a trace buffer.
    // NOTE(review): the group_id() indices presumably match the kernel's
    // argument positions — confirm against the kernel signature.
    auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
    auto bo_in1 = xrt::bo(device, MATRIX_VOLUME * sizeof(MATRIX_DATATYPE),
                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
    auto bo_in2 = xrt::bo(device, MATRIX_VOLUME * sizeof(MATRIX_DATATYPE),
                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
    auto bo_out = xrt::bo(device, MATRIX_VOLUME * sizeof(MATRIX_DATATYPE),
                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5));

    auto bo_tmp1 = xrt::bo(device, 1, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(6));
    
    // The trace buffer is given a size of at least 1 even when tracing is off.
    int tmp_trace_size = (myargs.trace_size > 0) ? myargs.trace_size : 1;
    auto bo_trace = xrt::bo(device, tmp_trace_size, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(7));
    
    // Copy the instruction words into the mapped instruction buffer.
    void *bufInstr = bo_instr.map<void *>();
    memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));

    // Host-side views of the data buffers.
    MATRIX_DATATYPE *bufIn1 = bo_in1.map<MATRIX_DATATYPE *>();
    MATRIX_DATATYPE *bufIn2 = bo_in2.map<MATRIX_DATATYPE *>();
    MATRIX_DATATYPE *bufOut = bo_out.map<MATRIX_DATATYPE *>();
    char *bufTrace = bo_trace.map<char *>();
    
    // Both inputs receive the same random matrix with values in [1, 2], so the
    // kernel is given the same matrix as both operands.
    std::vector<MATRIX_DATATYPE> A_out_size2 = createRandomMatrix<MATRIX_DATATYPE>(OUT_N, 1, 2);
    writeMatrixToFile(A_out_size2, OUT_N, OUT_N, "A_matrix.txt");
    memcpy(bufIn2, A_out_size2.data(), (A_out_size2.size() * sizeof(MATRIX_DATATYPE)));
    
    memcpy(bufIn1, A_out_size2.data(), (A_out_size2.size() * sizeof(MATRIX_DATATYPE)));
   
    // Push host contents to the device.
    // NOTE(review): bufOut is never written by the host before bo_out is
    // synced, so regions the kernel does not write keep whatever the buffer
    // held — consider zeroing bufOut first when debugging missing output.
    bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
    bo_in1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
    bo_in2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
    bo_out.sync(XCL_BO_SYNC_BO_TO_DEVICE);

    // NOTE(review): the meaning of opcode 3 is not visible here — confirm
    // against the XRT/NPU kernel calling convention.
    unsigned int opcode = 3;

    // Launch once, block until completion, then pull the result back.
    auto run = kernel(opcode, bo_instr, instr_v.size(), bo_in1, bo_in2, bo_out, bo_tmp1, bo_trace);
    run.wait();
    bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);

    // Copy the device result into a vector and dump it to disk.
    std::vector<MATRIX_DATATYPE> C = std::vector<MATRIX_DATATYPE>(bufOut, bufOut + OUT_N*OUT_N);
    writeMatrixToFile(C, OUT_N, OUT_N, "BA_matrix.txt");

    // No verification is performed, so the status is unconditionally 0.
    int ret_val = 0;
    return ret_val;
}

/**
 * @brief Main function.
 *
 * Parses command-line arguments, prints the parameters, and launches the kernel execution.
 *
 * @param argc Number of command-line arguments.
 * @param argv Array of command-line argument strings.
 * @return 0 on success, 1 on failure.
 */
int main(int argc, const char *argv[]) {
    int debug = 0;

    constexpr int IN_N = OUT_N/UPSCALE_FACTOR;

    std::cout << "Going to run the kernel with the following parameters:\n";
    std::cout << "In matrix size: " << IN_N << " by " << IN_N << "\n";
    std::cout << "Upscale factor: " << UPSCALE_FACTOR << "\n";
    
    std::cout << "Out matrix size: " << OUT_N << " by " << OUT_N << "\n";


  args myargs = parse_args(argc, argv);
  int res = setup_and_run_aie(IN_N , debug, myargs);
  return 0;
}

aie.py

import numpy as np
import argparse
import sys

from aie.iron import Kernel, ObjectFifo, Program, Runtime, Worker
from aie.iron.placers import SequentialPlacer
from aie.iron.device import NPU1Col1, NPU2
from aie.iron.controlflow import range_
from aie.helpers.taplib import TensorAccessSequence, TensorTiler2D

# Need ceildiv to capture partial tiling patterns
def ceildiv(a, b):
    """Return a / b rounded up to the next integer (for a positive integer b)."""
    quotient, _ = divmod(a + b - 1, b)
    return quotient
    
def my_mse(dev, tile_size):
    """Build the single-worker tiled matrix-multiplication design.

    The design multiplies two square float32 matrices of side ``tile_size``
    using 4x4 sub-tiles. Each output tile is zeroed by the ``zero`` kernel and
    then accumulated over the K dimension by the ``matrix_mult_t`` kernel.

    Args:
        dev: Target device object passed to ``Program``.
        tile_size: Side length of the square input and output matrices; must
            be a multiple of the 4-element sub-tile size.

    Returns:
        The result of ``Program(dev, rt).resolve_program(SequentialPlacer())``.

    Raises:
        ValueError: If ``tile_size`` is not a multiple of the sub-tile size.
    """
    # Full matrix dimensions (square problem).
    M = tile_size
    N = M
    K = M
    # Sub-tile dimensions processed by one kernel call.
    m = 4
    n = m
    k = m
    b_col_maj = False

    # The tile counts below use floor division; a remainder would silently
    # drop the partial tile row/column, so reject such sizes up front.
    if M % m != 0:
        raise ValueError(
            "[ERROR] Matrix size {} must be a multiple of the tile size {}".format(
                M, m
            )
        )

    dtype_in = np.float32
    dtype_out = np.float32

    M_div_m = M // m
    K_div_k = K // k
    N_div_n = N // n
    tiles = M_div_m * N_div_n

    # Define tensor types
    A_ty = np.ndarray[(M * K,), np.dtype[dtype_in]]
    B_ty = np.ndarray[(K * N,), np.dtype[dtype_in]]
    C_ty = np.ndarray[(M * N,), np.dtype[dtype_out]]
    a_ty = np.ndarray[(m, k), np.dtype[dtype_in]]
    b_ty = np.ndarray[(k, n), np.dtype[dtype_in]]
    c_ty = np.ndarray[(m, n), np.dtype[dtype_out]]

    matmul_kernel = Kernel(
        "matrix_mult_t",
        "scale.o",
        [a_ty, b_ty, c_ty],
    )

    zero_kernel = Kernel(
        "zero",
        "scale.o",
        [c_ty]
    )

    # AIE-array data movement with object fifos
    # Input A
    inA = ObjectFifo(a_ty, name="inA")
    a_dims = None

    memA = inA.cons().forward(name="memA", dims_to_stream=a_dims)

    # Input B
    inB = ObjectFifo(b_ty, name="inB")
    b_dims = None
    memB = inB.cons().forward(name="memB", dims_to_stream=b_dims)

    # Output C
    memC = ObjectFifo(c_ty, name="memC")
    c_dims = None
    outC = memC.cons().forward(name="outC", dims_to_stream=c_dims)

    # Task each core will run
    def core_fn(of_a, of_b, of_c, zero, matmul):
        # One iteration per output tile.
        for _ in range_(tiles) if tiles > 1 else range(1):  # issue #1547
            elem_out = of_c.acquire(1)
            zero(elem_out)

            # Accumulate the K_div_k partial products into the output tile.
            # issue #1547
            for _ in range_(K_div_k) if K_div_k > 1 else range(1):
                elem_in_a = of_a.acquire(1)
                elem_in_b = of_b.acquire(1)

                matmul(elem_in_a, elem_in_b, elem_out)

                of_a.release(1)
                of_b.release(1)
            of_c.release(1)

    # Create worker from task
    worker = Worker(
        core_fn, [memA.cons(), memB.cons(), memC.prod(), zero_kernel, matmul_kernel]
    )

    # only do 4 tile rows at a time before synchronizing, so we can reuse BDs
    rows_per_block = 4

    # Define tensor access patterns for inputs/outputs
    A_tiles = TensorTiler2D.group_tiler(
        (M, K), (m, k), (1, K_div_k), pattern_repeat=N_div_n
    )
    # There is only one access pattern for B - it tiles the entire matrix in (k x n) tiles.
    if b_col_maj:
        b_tap = TensorTiler2D.group_tiler((K, N), (k, n), (K_div_k, N_div_n))[0]
    else:
        b_tap = TensorTiler2D.group_tiler(
            (K, N), (k, n), (K_div_k, N_div_n), tile_group_col_major=True
        )[0]

    C_tiles = TensorTiler2D.group_tiler((M, N), (m, n), (rows_per_block // 2, N_div_n))
    c_index = 0

    # Runtime operations to move data to/from the AIE-array
    rt = Runtime()
    with rt.sequence(A_ty, B_ty, C_ty) as (A, B, C):
        rt.start(worker)

        tgs = []
        for tile_row_block in range(ceildiv(M_div_m, rows_per_block)):
            # we only sync on half the BDs before reusing them, so the other half can concurrently keep running
            # that's what this loop is for. We can track of this in the task groups for syncing.
            for pingpong in [0, 1]:

                row_base = (
                    tile_row_block * rows_per_block + pingpong * rows_per_block // 2
                )
                num_tile_rows = min([rows_per_block // 2, M_div_m - row_base])
                if num_tile_rows <= 0:
                    # At the very last iteration, we may not need a 'pong' iteration
                    break
                tgs.append(rt.task_group())
                for tile_row in range(num_tile_rows):
                    # -- A --
                    tile_offset = (row_base + tile_row) % len(A_tiles)
                    rt.fill(inA.prod(), A, tap=A_tiles[tile_offset], task_group=tgs[-1])

                    # -- B --
                    rt.fill(inB.prod(), B, tap=b_tap, task_group=tgs[-1])

                # -- C --
                rt.drain(
                    outC.cons(), C, tap=C_tiles[c_index], task_group=tgs[-1], wait=True
                )
                c_index += 1

                # Once a newer task group has been issued, the previous one can be finished.
                if tile_row_block > 0 or (tile_row_block == 0 and pingpong > 0):
                    rt.finish_task_group(tgs[-2])
                    del tgs[-2]

        rt.finish_task_group(tgs[-1])
        del tgs[-1]

    my_program = Program(dev, rt)
    return my_program.resolve_program(SequentialPlacer())

# Script entry: validate the argument count, parse the options, select the
# device, build the design for the requested size and print the module.
if len(sys.argv) < 5:
    raise ValueError(
        "[ERROR] Need at least 4 arguments (dev, in1_size, in2_size, out_size)"
    )

p = argparse.ArgumentParser()
# Required options: (short flag, long flag, destination, help text).
for short_flag, long_flag, dest_name, help_text in (
    ("-d", "--dev", "device", "AIE Device"),
    ("-i1s", "--in1_size", "in1_size", "Input 1 size"),
    ("-i2s", "--in2_size", "in2_size", "Input 2 size"),
    ("-os", "--out_size", "out_size", "Output size"),
):
    p.add_argument(
        short_flag, long_flag, required=True, dest=dest_name, help=help_text
    )
p.add_argument(
    "-t",
    "--trace_size",
    required=False,
    dest="trace_size",
    default=0,
    help="Trace buffer size",
)
opts = p.parse_args(sys.argv[1:])

# Map each accepted device name to the class that is instantiated for it.
device_factories = {"npu": NPU1Col1, "npu2": NPU2}
if opts.device not in device_factories:
    raise ValueError("[ERROR] Device name {} is unknown".format(opts.device))
dev = device_factories[opts.device]()

# Only in1_size is used: it sets the side length of the square matrices.
tile_size = int(opts.in1_size)
module = my_mse(dev, tile_size)
print(module)

mse.cc

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <type_traits>
#include <aie_api/aie.hpp>


#define SUBMATRIX_SIZE 4

extern "C" {

// Scalar tile multiply-accumulate: c += a * b, where a, b and c are
// SUBMATRIX_SIZE x SUBMATRIX_SIZE row-major float tiles. The function only
// accumulates into c; it does not clear it first.
// event0()/event1() bracket the computation (presumably trace markers from
// the AIE API — confirm).
void matrix_mult_t(float *a, float *b, float *c) {
  event0();

  const int dim = SUBMATRIX_SIZE;

  for (int r = 0; r < dim; r++) {
    const float *a_row = a + r * dim;
    float *c_row = c + r * dim;
    for (int col = 0; col < dim; col++) {
      // Dot product of row r of a with column col of b.
      float dot = 0;
      for (int idx = 0; idx < dim; idx++) {
        dot += a_row[idx] * b[idx * dim + col];
      }
      c_row[col] += dot;
    }
  }

  event1();
}

// Set every element of a SUBMATRIX_SIZE x SUBMATRIX_SIZE float tile to zero.
// event0()/event1() bracket the work (presumably trace markers from the AIE
// API — confirm).
void zero(float *c) {
  event0();

  // The tile is contiguous, so it can be cleared as one flat array.
  const int count = SUBMATRIX_SIZE * SUBMATRIX_SIZE;
  for (int idx = 0; idx < count; idx++) {
    c[idx] = 0;
  }

  event1();
}


} // extern "C"

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions