Skip to content

Commit

Permalink
Add control packet e2e test (#1666)
Browse files Browse the repository at this point in the history
  • Loading branch information
fifield committed Aug 8, 2024
1 parent c8fbbac commit a7c562a
Show file tree
Hide file tree
Showing 3 changed files with 365 additions and 0 deletions.
126 changes: 126 additions & 0 deletions test/npu-xrt/add_one_ctrl_packet/aie.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
//===- aie.mlir ------------------------------------------------*- MLIR -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// (c) Copyright 2024 Advanced Micro Devices, Inc.
//
//===----------------------------------------------------------------------===//

// Control-packet end-to-end test design.
//
// The core on tile (0,2) fills a buffer with 3, then adds 1 to every element
// after each of four lock acquires (two on lock 0, two on lock 2). Both locks
// are initialised to 0 in this file, so the core can only make progress once
// something else raises them. The host program (test.cpp) sends control
// packets through the packet flow below that write the value 2 to the
// registers it labels Lock0_value and Lock2_value. The final buffer content
// (7) is streamed back to the host through @out0.
module {
aie.device(npu1_1col) {
// Symbols referenced by the shim DMA allocations and the runtime sequence.
memref.global "public" @out0 : memref<8xi32>
memref.global "public" @ctrl0 : memref<8xi32>

// Tile (0,0) is the endpoint used for host traffic; tile (0,2) holds the core,
// the locks and the buffers.
%tile_0_0 = aie.tile(0, 0)
%tile_0_2 = aie.tile(0, 2)

// Locks 0 and 2 gate the increment steps in the core and start at 0.
// Locks 4 and 5 hand the output buffer back and forth between the core and
// the tile DMA; lock 5 starts at 1 so the core may write the buffer first.
%input_lock0 = aie.lock(%tile_0_2, 0) {init = 0 : i32, sym_name = "input_lock0"}
%input_lock2 = aie.lock(%tile_0_2, 2) {init = 0 : i32, sym_name = "input_lock2"}
%output_lock4 = aie.lock(%tile_0_2, 4) {init = 0 : i32, sym_name = "output_lock4"}
%output_lock5 = aie.lock(%tile_0_2, 5) {init = 1 : i32, sym_name = "output_lock5"}

// Despite its name, input_buffer is never written by a DMA in this design;
// the core initialises and updates it in place.
%input_buffer = aie.buffer(%tile_0_2) {sym_name = "input_buffer"} : memref<8xi32>
%output_buffer = aie.buffer(%tile_0_2) {sym_name = "output_buffer"} : memref<8xi32>

// Packet-switched route (packet id 0x1) from DMA channel 0 of tile (0,0) to
// the Ctrl port of tile (0,2). This is the path the control packets take.
aie.packet_flow(0x1) {
aie.packet_source<%tile_0_0, DMA : 0>
aie.packet_dest<%tile_0_2, Ctrl : 0>
}

// Route carrying the result from tile (0,2) DMA channel 0 back to tile (0,0).
aie.flow(%tile_0_2, DMA : 0, %tile_0_0, DMA : 0)

%core_0_2 = aie.core(%tile_0_2) {
%c0 = arith.constant 0 : index
%c1_i32 = arith.constant 1 : i32
%c3_i32 = arith.constant 3 : i32
%c1 = arith.constant 1 : index
%c8 = arith.constant 8 : index
// Upper bound of the outer loop; large enough that the core keeps repeating
// the sequence for as long as the design runs.
%c4294967295 = arith.constant 4294967295 : index
scf.for %arg0 = %c0 to %c4294967295 step %c1 {
// initialize to 3
scf.for %arg1 = %c0 to %c8 step %c1 {
memref.store %c3_i32, %input_buffer[%arg1] : memref<8xi32>
}
// First acquire of lock 0.
// NOTE(review): presumably each successful AcquireGreaterEqual consumes 1
// from the lock, which is why a written value of 2 permits exactly two
// acquires per lock - confirm against the AIE lock documentation.
aie.use_lock(%input_lock0, AcquireGreaterEqual, 1)
scf.for %arg1 = %c0 to %c8 step %c1 {
// 3 -> 4
%1 = memref.load %input_buffer[%arg1] : memref<8xi32>
%2 = arith.addi %1, %c1_i32 : i32
memref.store %2, %input_buffer[%arg1] : memref<8xi32>
}
// Second acquire of lock 0.
aie.use_lock(%input_lock0, AcquireGreaterEqual, 1)
scf.for %arg1 = %c0 to %c8 step %c1 {
// 4 -> 5
%1 = memref.load %input_buffer[%arg1] : memref<8xi32>
%2 = arith.addi %1, %c1_i32 : i32
memref.store %2, %input_buffer[%arg1] : memref<8xi32>
}
// First acquire of lock 2.
aie.use_lock(%input_lock2, AcquireGreaterEqual, 1)
scf.for %arg1 = %c0 to %c8 step %c1 {
// 5 -> 6
%1 = memref.load %input_buffer[%arg1] : memref<8xi32>
%2 = arith.addi %1, %c1_i32 : i32
memref.store %2, %input_buffer[%arg1] : memref<8xi32>
}
// Second acquire of lock 2.
aie.use_lock(%input_lock2, AcquireGreaterEqual, 1)
scf.for %arg1 = %c0 to %c8 step %c1 {
// 6 -> 7
%1 = memref.load %input_buffer[%arg1] : memref<8xi32>
%2 = arith.addi %1, %c1_i32 : i32
memref.store %2, %input_buffer[%arg1] : memref<8xi32>
}
// write to output buffer
// Lock 5 guards write access to output_buffer; releasing lock 4 afterwards
// tells the tile DMA below that the data is ready to send.
aie.use_lock(%output_lock5, AcquireGreaterEqual, 1)
scf.for %arg1 = %c0 to %c8 step %c1 {
%1 = memref.load %input_buffer[%arg1] : memref<8xi32>
memref.store %1, %output_buffer[%arg1] : memref<8xi32>
}
aie.use_lock(%output_lock4, Release, 1)
}
aie.end
}

// Tile DMA program: MM2S channel 0 repeatedly waits for lock 4, sends the
// 8 elements of output_buffer, then releases lock 5 back to the core.
%mem_0_2 = aie.mem(%tile_0_2) {
%0 = aie.dma_start(MM2S, 0, ^bb1, ^bb2)
^bb1: // 2 preds: ^bb0, ^bb1
aie.use_lock(%output_lock4, AcquireGreaterEqual, 1)
aie.dma_bd(%output_buffer : memref<8xi32>, 0, 8)
aie.use_lock(%output_lock5, Release, 1)
aie.next_bd ^bb1
^bb2:
aie.end
}

// Bind the public symbols to DMA channels: @ctrl0 is the MM2S channel 0 used
// for control packets, @out0 is the S2MM channel 0 that receives the result.
aie.shim_dma_allocation @ctrl0(MM2S, 0, 0)
aie.shim_dma_allocation @out0(S2MM, 0, 0)

// Eight raw words that the runtime sequence writes to address 0x1d000 as the
// contents of "bd0".
// NOTE(review): the meaning of the individual words (transfer length, packet
// enable/id, ...) cannot be derived from this file - confirm against the
// buffer descriptor register reference before editing them.
memref.global "private" constant @blockwrite_data_0 : memref<8xi32> = dense<[2, 0, 0x40090000, 0, 0x40000000, 0, 0, 0x2000000]>
// Runtime sequence. %arg1 is the host buffer holding the control packets
// (it is the argument patched into bd0 via arg_idx = 1); %arg2 receives the
// result. %arg0 is not referenced by this sequence.
aiex.runtime_sequence @seq(%arg0: memref<8xi32>, %arg1: memref<8xi32>, %arg2: memref<8xi32>) {
%c0_i64 = arith.constant 0 : i64
%c1_i64 = arith.constant 1 : i64
%c8_i64 = arith.constant 8 : i64

// set Ctrl_Pkt_Tlast_Error_Enable=0 in Module_Clock_Control register
// (left disabled in this test)
// aiex.npu.maskwrite32 {address = 0x00060000 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32, mask = 0x8 : ui32}

// write bd0
%0 = memref.get_global @blockwrite_data_0 : memref<8xi32>
aiex.npu.blockwrite(%0) {address = 0x1d000 : ui32, column = 0 : i32, row = 0 : i32} : memref<8xi32>

// patch bd0 address, push to mm2s_0_task_queue, wait
// First control packet: starts at offset 0 of %arg1.
aiex.npu.address_patch {addr = 0x1d004 : ui32, arg_idx = 1 : i32, arg_plus = 0 : i32}
aiex.npu.write32 {address = 0x1d214 : ui32, column = 0 : i32, row = 0 : i32, value = 0x80000000 : ui32}
aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}

// patch bd0 address, push to mm2s_0_task_queue, wait
// Second control packet: arg_plus = 8 skips the first packet, which the host
// lays out as one header word followed by one data word.
aiex.npu.address_patch {addr = 0x1d004 : ui32, arg_idx = 1 : i32, arg_plus = 8 : i32}
aiex.npu.write32 {address = 0x1d214 : ui32, column = 0 : i32, row = 0 : i32, value = 0x80000000 : ui32}
aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}

// Receive the 8 result elements into %arg2 through @out0 and wait for the
// transfer to finish.
aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c8_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 2 : i64, issue_token = true, metadata = @out0} : memref<8xi32>
aiex.npu.dma_wait {symbol = @out0}
}
}
}
10 changes: 10 additions & 0 deletions test/npu-xrt/add_one_ctrl_packet/run.lit
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// (c) Copyright 2024 Advanced Micro Devices, Inc.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// End-to-end test for the add_one_ctrl_packet design. The steps below compile
// aie.mlir into aie.xclbin plus insts.txt, build the host program from
// test.cpp, and execute it on the device; the test passes when the host
// program prints the PASS! line matched at the bottom of this file.
//
// REQUIRES: ryzen_ai
//
// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --basic-alloc-scheme --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir
// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
// CHECK: PASS!

229 changes: 229 additions & 0 deletions test/npu-xrt/add_one_ctrl_packet/test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
//===- test.cpp -------------------------------------------------*- C++ -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// Copyright (C) 2024, Advanced Micro Devices, Inc.
//
//===----------------------------------------------------------------------===//

#include <algorithm>
#include <boost/program_options.hpp>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include "xrt/xrt_bo.h"
#include "xrt/xrt_device.h"
#include "xrt/xrt_kernel.h"

// Number of 32-bit elements allocated for each of the two host input buffer
// objects created in main (bo_inA and bo_inB).
constexpr int IN_SIZE = 64;
// Number of 32-bit elements allocated for the host output buffer object
// created in main (bo_out).
constexpr int OUT_SIZE = 64;

// Short alias for the Boost.Program_options namespace used for CLI parsing.
namespace po = boost::program_options;

// Validates that the command-line option `name` was supplied and that the
// path it holds can be opened for reading.
//
// vm_in: parsed command-line options.
// name:  long name of the option whose value is a file path.
//
// Throws std::runtime_error when the option is absent or when the file
// cannot be opened.
void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
  if (vm_in.count(name) == 0)
    throw std::runtime_error("Error: no " + name + " file was provided\n");

  const std::string path = vm_in[name].as<std::string>();
  std::ifstream probe(path);
  if (probe)
    return;

  throw std::runtime_error("The " + name + " file " + path +
                           " does not exist.\n");
}

// Reads an instruction sequence from a text file that holds one hexadecimal
// 32-bit word per line.
//
// instr_path: path of the instruction file.
//
// Returns the words in file order. Lines that are empty or contain only
// whitespace are skipped, so a trailing blank line does not abort the load.
//
// Throws std::runtime_error when the file cannot be opened (previously this
// silently produced an empty sequence) or when a non-blank line does not
// start with a hexadecimal number.
std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
  std::ifstream instr_file(instr_path);
  if (!instr_file) {
    throw std::runtime_error("Unable to open instruction file " + instr_path +
                             "\n");
  }

  std::vector<uint32_t> instr_v;
  std::string line;
  while (std::getline(instr_file, line)) {
    // Tolerate blank lines (including a stray "\r" from CRLF line endings).
    if (line.find_first_not_of(" \t\r\n") == std::string::npos)
      continue;

    std::istringstream iss(line);
    uint32_t a;
    if (!(iss >> std::hex >> a)) {
      throw std::runtime_error("Unable to parse instruction file\n");
    }
    instr_v.push_back(a);
  }
  return instr_v;
}

// Builds the 32-bit header word of a control packet.
//
// The fields are packed as: stream_id at bit 24, operation at bit 22, beats
// at bit 20 and the register address in the low bits. Bit 31 is then set
// when the remaining bits contain an even number of ones, so the finished
// header always has an odd number of set bits.
//
// All arithmetic is done in uint32_t so that setting bit 31 never shifts into
// the sign bit of a signed int.
static uint32_t make_ctrl_packet_header(uint32_t address, uint32_t beats,
                                        uint32_t operation,
                                        uint32_t stream_id) {
  uint32_t header = stream_id << 24 | operation << 22 | beats << 20 | address;

  uint32_t ones = 0;
  for (uint32_t bits = header; bits != 0; bits >>= 1)
    ones += bits & 1u;

  if (ones % 2 == 0)
    header |= static_cast<uint32_t>(1) << 31;
  return header;
}

// Host side of the control-packet end-to-end test.
//
// Parses the command line, loads the xclbin and the instruction sequence,
// sends two control packets that write the value 2 to the Lock0_value and
// Lock2_value registers, runs the kernel and checks that the eight output
// words all equal 7.
//
// Returns 0 and prints "PASS!" on success; returns 1 on any error, on a
// mismatch in the output, and after printing the help text.
int main(int argc, const char *argv[]) {

  // Program arguments parsing
  po::options_description desc("Allowed options");
  desc.add_options()("help,h", "produce help message")(
      "xclbin,x", po::value<std::string>()->required(),
      "the input xclbin path")(
      "kernel,k", po::value<std::string>()->required(),
      "the kernel name in the XCLBIN (for instance PP_PRE_FD)")(
      "verbosity,v", po::value<int>()->default_value(0),
      "the verbosity of the output")(
      "instr,i", po::value<std::string>()->required(),
      "path of file containing userspace instructions to be sent to the LX6");
  po::variables_map vm;

  try {
    po::store(po::parse_command_line(argc, argv, desc), vm);

    // Handle --help before notify(): notify() throws when a required option
    // is missing, which would otherwise make a plain "--help" unreachable.
    if (vm.count("help")) {
      std::cout << desc << "\n";
      return 1;
    }

    po::notify(vm);
  } catch (const std::exception &ex) {
    std::cerr << ex.what() << "\n\n";
    std::cerr << "Usage:\n" << desc << "\n";
    return 1;
  }

  // Validate the input files and load the instruction sequence. Failures are
  // reported as a normal error exit instead of an uncaught exception.
  std::vector<uint32_t> instr_v;
  try {
    check_arg_file_exists(vm, "xclbin");
    check_arg_file_exists(vm, "instr");
    instr_v = load_instr_sequence(vm["instr"].as<std::string>());
  } catch (const std::exception &ex) {
    std::cerr << ex.what() << "\n";
    return 1;
  }

  int verbosity = vm["verbosity"].as<int>();
  if (verbosity >= 1)
    std::cout << "Sequence instr count: " << instr_v.size() << "\n";

  // Start the XRT test code
  // Get a device handle
  unsigned int device_index = 0;
  auto device = xrt::device(device_index);

  // Load the xclbin
  if (verbosity >= 1)
    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());

  if (verbosity >= 1)
    std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << "\n";
  std::string Node = vm["kernel"].as<std::string>();

  // Get the kernel from the xclbin: the first kernel whose name starts with
  // the requested name.
  auto xkernels = xclbin.get_kernels();
  auto kernel_it = std::find_if(xkernels.begin(), xkernels.end(),
                                [Node](xrt::xclbin::kernel &k) {
                                  auto name = k.get_name();
                                  std::cout << "Name: " << name << std::endl;
                                  return name.rfind(Node, 0) == 0;
                                });
  // Dereferencing end() would be undefined behaviour, so fail explicitly when
  // no kernel matches.
  if (kernel_it == xkernels.end()) {
    std::cerr << "Kernel " << Node << " was not found in the xclbin\n";
    return 1;
  }
  auto xkernel = *kernel_it;
  auto kernelName = xkernel.get_name();

  if (verbosity >= 1)
    std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
              << "\n";

  device.register_xclbin(xclbin);

  // get a hardware context
  if (verbosity >= 1)
    std::cout << "Getting hardware context.\n";
  xrt::hw_context context(device, xclbin.get_uuid());

  // get a kernel handle
  if (verbosity >= 1)
    std::cout << "Getting handle to kernel:" << kernelName << "\n";
  auto kernel = xrt::kernel(context, kernelName);

  // Buffer objects: the instruction stream, two inputs and one output. They
  // are passed to the kernel below in this same order.
  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(uint32_t),
                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
  auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t),
                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
  auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t),
                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
  auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t),
                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5));

  if (verbosity >= 1)
    std::cout << "Writing data into buffer objects.\n";

  // First input buffer: filled with 1..IN_SIZE.
  uint32_t *bufInA = bo_inA.map<uint32_t *>();
  for (int i = 0; i < IN_SIZE; i++)
    bufInA[i] = static_cast<uint32_t>(i + 1);

  // Control packet header fields shared by both packets.
  const uint32_t beats = 1 - 1;
  const uint32_t operation = 0;
  const uint32_t stream_id = 0;

  // Lock0_value
  const uint32_t lock0_address = 0x0001F000;
  const uint32_t header0 =
      make_ctrl_packet_header(lock0_address, beats, operation, stream_id);

  // Lock2_value
  const uint32_t lock2_address = lock0_address + 0x20;
  const uint32_t header1 =
      make_ctrl_packet_header(lock2_address, beats, operation, stream_id);

  // set lock values to 2
  const uint32_t data = 2;
  // Second input buffer: two packets, each one header word followed by one
  // data word.
  std::vector<uint32_t> srcVecB = {
      header0,
      data,
      header1,
      data,
  };
  void *bufInB = bo_inB.map<void *>();
  memcpy(bufInB, srcVecB.data(), srcVecB.size() * sizeof(uint32_t));

  void *bufInstr = bo_instr.map<void *>();
  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(uint32_t));

  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
  bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
  bo_inB.sync(XCL_BO_SYNC_BO_TO_DEVICE);

  if (verbosity >= 1)
    std::cout << "Running Kernel.\n";
  unsigned int opcode = 3;
  auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out);

  ert_cmd_state r = run.wait();
  if (r != ERT_CMD_STATE_COMPLETED) {
    std::cout << "Kernel did not complete. Returned status: " << r << "\n";
    return 1;
  }

  bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);

  uint32_t *bufOut = bo_out.map<uint32_t *>();

  // Only the first eight output words are checked. Every one of them is
  // expected to be 7.
  const uint32_t num_checked = 8;
  const uint32_t ref = 7;

  int errors = 0;

  for (uint32_t i = 0; i < num_checked; i++) {
    const uint32_t got = bufOut[i];
    if (got != ref) {
      std::cout << "Error in output " << got << " != " << ref << std::endl;
      errors++;
    } else {
      std::cout << "Correct output " << got << " == " << ref << std::endl;
    }
  }

  if (!errors) {
    std::cout << "\nPASS!\n\n";
    return 0;
  } else {
    std::cout << "\nfailed.\n\n";
    return 1;
  }
}

0 comments on commit a7c562a

Please sign in to comment.