Closed
Description
Good evening, I have some strange issue while implementing non vectorised matrix multiplication, where aie part is taken from the programming_examples. When I want to upscale image up to the size 224x224 everything works fine. But when I try to increase the final size, the resulting image will miss values at the end, so it will have black area at the bottom of the image. When I move to the vectorised version, the problem was solved, but I would like to know the reason of such a behaviour. Can you help me?
Here is the code I use
test.cpp
#include <iostream>
#include <fstream>
#include <sstream>
#include <iomanip>
#include <vector>
#include <string>
#include <stdexcept>
#include <cstdlib>
#include <ctime>
#include <cstring>
#include <cassert>
#include <cmath>
#include <limits>
#include <chrono>
#include <algorithm>
#include <cfloat>
#include <cstdint>
#include <stdfloat>
#include <boost/program_options.hpp>
namespace po = boost::program_options;
// Include XRT libraries
#include "xrt/xrt_bo.h"
#include "xrt/xrt_device.h"
#include "xrt/xrt_kernel.h"
#include "utils.hpp"
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include "OpenCVUtils.h"
// #include "test_utils.h"
#ifndef UPSCALE_FACTOR
#define UPSCALE_FACTOR 2 // Example: change as needed
#endif
#ifndef OUT_N
#define OUT_N 256 // Example: change as needed
#endif
using MATRIX_DATATYPE = float;
struct args {
int verbosity;
int do_verify;
int n_iterations;
int n_warmup_iterations;
int trace_size;
std::string instr;
std::string xclbin;
std::string kernel;
std::string trace_file;
};
args parse_args(int argc, const char *argv[]) {
po::options_description desc("Allowed options");
po::variables_map vm;
utils::add_default_options(desc);
args myargs;
utils::parse_options(argc, argv, desc, vm);
myargs.verbosity = vm["verbosity"].as<int>();
myargs.do_verify = vm["verify"].as<bool>();
myargs.n_iterations = vm["iters"].as<int>();
myargs.n_warmup_iterations = vm["warmup"].as<int>();
myargs.trace_size = vm["trace_sz"].as<int>();
myargs.instr = vm["instr"].as<std::string>();
myargs.xclbin = vm["xclbin"].as<std::string>();
myargs.kernel = vm["kernel"].as<std::string>();
myargs.trace_file = vm["trace_file"].as<std::string>();
return myargs;
}
void initialize_bufIn_random(float *bufIn, int SIZE) {
for (int i = 0; i < SIZE; i++)
bufIn[i] = std::rand() % 10;
}
void initialize_bufIn(float *bufIn, int SIZE, float val) {
for (int i = 0; i < SIZE; i++)
bufIn[i] = val;
}
void print_matrix(const char* name, float* data, int rows, int cols) {
printf("\n%s:\n", name);
for (int i = 0; i < rows; ++i) {
for (int j = 0; j < cols; ++j) {
printf("%8.3f ", data[i * cols + j]);
}
printf("\n");
}
}
void writeMatrixToFile(const std::vector<MATRIX_DATATYPE>& C, int rows, int cols, const std::string& filename) {
std::ofstream outFile(filename);
if (!outFile) {
std::cerr << "Failed to open file: " << filename << std::endl;
return;
}
for (int i = 0; i < rows; ++i) {
for (int j = 0; j < cols; ++j) {
outFile << C[i * cols + j];
if (j < cols - 1)
outFile << " ";
}
outFile << "\n";
}
outFile.close();
}
cv::Mat vectorToMat(const std::vector<float>& vec, int rows, int cols) {
CV_Assert(static_cast<int>(vec.size()) == rows * cols);
cv::Mat mat(rows, cols, CV_32F); // Create CV_32F matrix
std::memcpy(mat.data, vec.data(), vec.size() * sizeof(float));
return mat;
}
std::vector<float> matToFloatVector(const cv::Mat& img) {
CV_Assert(img.type() == CV_8UC1); // Ensure grayscale
std::vector<float> vec;
vec.reserve(img.total());
for (int i = 0; i < img.rows; ++i) {
const uchar* rowPtr = img.ptr<uchar>(i);
for (int j = 0; j < img.cols; ++j) {
vec.push_back(static_cast<float>(rowPtr[j])); // or normalize: /255.0f
}
}
return vec;
}
template <typename T>
std::vector<T> createRandomMatrix(int N, T min_val = static_cast<T>(0), T max_val = static_cast<T>(10)) {
// Initialize random seed
std::srand(static_cast<unsigned int>(std::time(nullptr)));
std::vector<T> matrix(N * N);
for (int i = 0; i < N * N; ++i) {
if constexpr (std::is_integral<T>::value) {
matrix[i] = min_val + (std::rand() % (max_val - min_val + 1));
} else {
T scale = static_cast<T>(std::rand()) / static_cast<T>(RAND_MAX);
matrix[i] = min_val + scale * (max_val - min_val);
}
}
return matrix;
}
template <typename T>
std::vector<T> transposeMatrix(const std::vector<T>& matrix, int N) {
std::vector<T> transposed(N * N);
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) {
transposed[j * N + i] = matrix[i * N + j];
}
}
return transposed;
}
template <typename T>
std::vector<T> createDCTMatrix(int N) {
// Create a vector to hold the DCT matrix (N x N elements)
std::vector<T> T_matrix(N * N);
// Pre-calculate normalization factors
T factor0 = std::sqrt(static_cast<T>(1.0) / static_cast<T>(N));
T factor = std::sqrt(static_cast<T>(2.0) / static_cast<T>(N));
// Fill the vector using the DCT coefficient formula
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) {
if (i == 0) {
T_matrix[i * N + j] = factor0; // Row-major access (flattened index)
} else {
T_matrix[i * N + j] = factor * std::cos((M_PI * (2 * j + 1) * i) / (2.0 * N));
}
}
}
return T_matrix;
}
void initialize_bufOut(std::float_t *bufOut, int SIZE) {
// Assicurati di usare SIZE * sizeof(std::float_t) se inizializzi un buffer di float
memset(bufOut, 0, SIZE * sizeof(std::float_t));
}
float* transposeMatrix(const float* input, int rows, int cols) {
float* output = new float[rows * cols];
for (int i = 0; i < rows; ++i) {
for (int j = 0; j < cols; ++j) {
output[j * rows + i] = input[i * cols + j];
}
}
return output;
}
template <typename T>
void printMatrix(const char* name, const std::vector<T>& matrix, int N, int width = 8, int precision = 3) {
printf("\n%s:\n", name);
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) {
std::cout << std::setw(width) << std::setprecision(precision) << std::fixed
<< matrix[i * N + j] << " ";
}
std::cout << "\n";
}
}
int setup_and_run_aie(int IN_N,int debug, args myargs) {
srand(time(NULL));
std::vector<uint32_t> instr_v = utils::load_instr_binary(myargs.instr);
if (myargs.verbosity >= 1)
std::cout << "Sequence instr count: " << instr_v.size() << "\n";
// Initialize Device and Kernel
xrt::device device;
xrt::kernel kernel;
utils::init_xrt_load_kernel(device, kernel, myargs.verbosity,
myargs.xclbin, myargs.kernel);
int MATRIX_VOLUME = OUT_N*OUT_N;
// Create Buffer
auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
auto bo_in1 = xrt::bo(device, MATRIX_VOLUME * sizeof(MATRIX_DATATYPE),
XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
auto bo_in2 = xrt::bo(device, MATRIX_VOLUME * sizeof(MATRIX_DATATYPE),
XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
auto bo_out = xrt::bo(device, MATRIX_VOLUME * sizeof(MATRIX_DATATYPE),
XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5));
auto bo_tmp1 = xrt::bo(device, 1, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(6));
int tmp_trace_size = (myargs.trace_size > 0) ? myargs.trace_size : 1;
auto bo_trace = xrt::bo(device, tmp_trace_size, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(7));
// Load Instructions
void *bufInstr = bo_instr.map<void *>();
memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
MATRIX_DATATYPE *bufIn1 = bo_in1.map<MATRIX_DATATYPE *>();
MATRIX_DATATYPE *bufIn2 = bo_in2.map<MATRIX_DATATYPE *>();
MATRIX_DATATYPE *bufOut = bo_out.map<MATRIX_DATATYPE *>();
char *bufTrace = bo_trace.map<char *>();
std::vector<MATRIX_DATATYPE> A_out_size2 = createRandomMatrix<MATRIX_DATATYPE>(OUT_N, 1, 2);
writeMatrixToFile(A_out_size2, OUT_N, OUT_N, "A_matrix.txt");
memcpy(bufIn2, A_out_size2.data(), (A_out_size2.size() * sizeof(MATRIX_DATATYPE)));
memcpy(bufIn1, A_out_size2.data(), (A_out_size2.size() * sizeof(MATRIX_DATATYPE)));
bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
bo_in1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
bo_in2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
bo_out.sync(XCL_BO_SYNC_BO_TO_DEVICE);
unsigned int opcode = 3;
auto run = kernel(opcode, bo_instr, instr_v.size(), bo_in1, bo_in2, bo_out, bo_tmp1, bo_trace);
run.wait();
bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
std::vector<MATRIX_DATATYPE> C = std::vector<MATRIX_DATATYPE>(bufOut, bufOut + OUT_N*OUT_N);
writeMatrixToFile(C, OUT_N, OUT_N, "BA_matrix.txt");
int ret_val = 0;
return ret_val;
}
/**
* @brief Main function.
*
* Parses command-line arguments, prints the parameters, and launches the kernel execution.
*
* @param argc Number of command-line arguments.
* @param argv Array of command-line argument strings.
* @return 0 on success, 1 on failure.
*/
int main(int argc, const char *argv[]) {
int debug = 0;
constexpr int IN_N = OUT_N/UPSCALE_FACTOR;
std::cout << "Going to run the kernel with the following parameters:\n";
std::cout << "In matrix size: " << IN_N << " by " << IN_N << "\n";
std::cout << "Upscale factor: " << UPSCALE_FACTOR << "\n";
std::cout << "Out matrix size: " << OUT_N << " by " << OUT_N << "\n";
args myargs = parse_args(argc, argv);
int res = setup_and_run_aie(IN_N , debug, myargs);
return 0;
}
aie.py
import numpy as np
import argparse
import sys
from aie.iron import Kernel, ObjectFifo, Program, Runtime, Worker
from aie.iron.placers import SequentialPlacer
from aie.iron.device import NPU1Col1, NPU2
from aie.iron.controlflow import range_
from aie.helpers.taplib import TensorAccessSequence, TensorTiler2D
# Need ceildiv to capture partial tiling patterns
def ceildiv(a, b):
return (a + b - 1) // b
def my_mse(dev, tile_size):
M = tile_s
82A7
ize
N = M
K = M
m = 4
n = m
k = m
b_col_maj = False
dtype_in = np.float32
dtype_out = np.float32
M_div_m = M // m
K_div_k = K // k
N_div_n = N // n
tiles = M_div_m * N_div_n
# Define tensor types
A_ty = np.ndarray[(M * K,), np.dtype[dtype_in]]
B_ty = np.ndarray[(K * N,), np.dtype[dtype_in]]
C_ty = np.ndarray[(M * N,), np.dtype[dtype_out]]
a_ty = np.ndarray[(m, k), np.dtype[dtype_in]]
b_ty = np.ndarray[(k, n), np.dtype[dtype_in]]
c_ty = np.ndarray[(m, n), np.dtype[dtype_out]]
matmul_kernel = Kernel(
"matrix_mult_t",
"scale.o",
[a_ty, b_ty, c_ty],
)
zero_kernel = Kernel(
"zero",
"scale.o",
[c_ty]
)
# AIE-array data movement with object fifos
# Input A
inA = ObjectFifo(a_ty, name="inA")
a_dims = None
memA = inA.cons().forward(name="memA", dims_to_stream=a_dims)
# Input B
inB = ObjectFifo(b_ty, name="inB")
b_dims = None
memB = inB.cons().forward(name="memB", dims_to_stream=b_dims)
# Output C
memC = ObjectFifo(c_ty, name="memC")
c_dims = None
outC = memC.cons().forward(name="outC", dims_to_stream=c_dims)
# Task each core will run
def core_fn(of_a, of_b, of_c, zero, matmul):
for _ in range_(tiles) if tiles > 1 else range(1): # issue #1547
elem_out = of_c.acquire(1)
zero(elem_out)
# issue #1547
for _ in range_(K_div_k) if K_div_k > 1 else range(1):
elem_in_a = of_a.acquire(1)
elem_in_b = of_b.acquire(1)
matmul(elem_in_a, elem_in_b, elem_out)
of_a.release(1)
of_b.release(1)
of_c.release(1)
# Create worker from task
worker = Worker(
core_fn, [memA.cons(), memB.cons(), memC.prod(),zero_kernel, matmul_kernel]
)
# only do 4 tile rows at a time before synchronizing, so we can reuse BDs
rows_per_block = 4
# Define tensor access patterns for inputs/outputs
A_tiles = TensorTiler2D.group_tiler(
(M, K), (m, k), (1, K_div_k), pattern_repeat=N_div_n
)
# There is only one access pattern for B - it tiles the entire matrix in (k x n) tiles.
if b_col_maj:
b_tap = TensorTiler2D.group_tiler((K, N), (k, n), (K_div_k, N_div_n))[0]
else:
b_tap = TensorTiler2D.group_tiler(
(K, N), (k, n), (K_div_k, N_div_n), tile_group_col_major=True
)[0]
C_tiles = TensorTiler2D.group_tiler((M, N), (m, n), (rows_per_block // 2, N_div_n))
c_index = 0
# Runtime operations to move data to/from the AIE-array
rt = Runtime()
with rt.sequence(A_ty, B_ty, C_ty) as (A, B, C):
rt.start(worker)
tgs = []
for tile_row_block in range(ceildiv(M_div_m, rows_per_block)):
# we only sync on half the BDs before reusing them, so the other half can concurrently keep running
# that's what this loop is for. We can track of this in the task groups for syncing.
for pingpong in [0, 1]:
row_base = (
tile_row_block * rows_per_block + pingpong * rows_per_block // 2
)
num_tile_rows = min([rows_per_block // 2, M_div_m - row_base])
if num_tile_rows <= 0:
# At the very last iteration, we may not need a 'pong' iteration
break
tgs.append(rt.task_group())
for tile_row in range(num_tile_rows):
# -- A --
tile_offset = (row_base + tile_row) % len(A_tiles)
rt.fill(inA.prod(), A, tap=A_tiles[tile_offset], task_group=tgs[-1])
# A_taps.append(A_tiles[tile_offset])
# -- B --
rt.fill(inB.prod(), B, tap=b_tap, task_group=tgs[-1])
# B_taps.append(b_tap)
# -- C --
rt.drain(
outC.cons(), C, tap=C_tiles[c_index], task_group=tgs[-1], wait=True
)
# C_taps.append(C_tiles[c_index])
c_index += 1
if tile_row_block > 0 or (tile_row_block == 0 and pingpong > 0):
rt.finish_task_group(tgs[-2])
del tgs[-2]
rt.finish_task_group(tgs[-1])
del tgs[-1]
my_program = Program(dev, rt)
return my_program.resolve_program(SequentialPlacer())
if len(sys.argv) < 5:
raise ValueError(
"[ERROR] Need at least 4 arguments (dev, in1_size, in2_size, out_size)"
)
p = argparse.ArgumentParser()
p.add_argument("-d", "--dev", required=True, dest="device", help="AIE Device")
p.add_argument("-i1s", "--in1_size", required=True, dest="in1_size", help="Input 1 size")
p.add_argument("-i2s", "--in2_size", required=True, dest="in2_size", help="Input 2 size")
p.add_argument("-os", "--out_size", required=True, dest="out_size", help="Output size")
p.add_argument(
"-t",
"--trace_size",
required=False,
dest="trace_size",
default=0,
help="Trace buffer size",
)
opts = p.parse_args(sys.argv[1:])
if opts.device == "npu":
dev = NPU1Col1()
elif opts.device == "npu2":
dev = NPU2()
else:
raise ValueError("[ERROR] Device name {} is unknown".format(opts.device))
tile_size = int(opts.in1_size)
module = my_mse(dev, tile_size)
print(module)
mse.cc
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <type_traits>
#include <aie_api/aie.hpp>
#define SUBMATRIX_SIZE 4
extern "C" {
void matrix_mult_t(float *a, float *b, float *c) {
event0();
int rowA= SUBMATRIX_SIZE;
for (int row = 0; row < rowA; row++) {
for (int col = 0; col < rowA; col++) {
float running_sum = 0;
for (int i = 0; i < rowA; i++) {
running_sum += a[row * rowA + i] * b[i * rowA + col];
}
c[row * rowA + col] += running_sum;
// c[row * colB + col] += a[row * colB + col] ;
}
}
event1();
}
void zero(float *c) {
event0();
int rowA= SUBMATRIX_SIZE;
for (int row = 0; row < rowA; row++) {
for (int col = 0; col < rowA; col++) {
c[row * rowA + col] = 0;
}
}
event1();
}
} // extern "C"