diff --git a/test/npu-xrt/add_one_ctrl_packet/aie.mlir b/test/npu-xrt/add_one_ctrl_packet/aie.mlir
new file mode 100644
index 0000000000..2c5b5771af
--- /dev/null
+++ b/test/npu-xrt/add_one_ctrl_packet/aie.mlir
@@ -0,0 +1,126 @@
+//===- aie.mlir ------------------------------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+module {
+  aie.device(npu1_1col) {
+    memref.global "public" @out0 : memref<8xi32>
+    memref.global "public" @ctrl0 : memref<8xi32>
+
+    %tile_0_0 = aie.tile(0, 0)
+    %tile_0_2 = aie.tile(0, 2)
+
+    %input_lock0 = aie.lock(%tile_0_2, 0) {init = 0 : i32, sym_name = "input_lock0"} // acquired twice by the core; test.cpp writes 2 to Lock0_value
+    %input_lock2 = aie.lock(%tile_0_2, 2) {init = 0 : i32, sym_name = "input_lock2"} // acquired twice by the core; test.cpp writes 2 to Lock2_value
+    %output_lock4 = aie.lock(%tile_0_2, 4) {init = 0 : i32, sym_name = "output_lock4"} // released by the core, acquired by the MM2S DMA
+    %output_lock5 = aie.lock(%tile_0_2, 5) {init = 1 : i32, sym_name = "output_lock5"} // starts at 1; acquired by the core, released by the MM2S DMA
+
+    %input_buffer = aie.buffer(%tile_0_2) {sym_name = "input_buffer"} : memref<8xi32>
+    %output_buffer = aie.buffer(%tile_0_2) {sym_name = "output_buffer"} : memref<8xi32>
+
+    aie.packet_flow(0x1) { // packet ID 0x1: tile (0,0) DMA 0 -> tile (0,2) Ctrl 0
+      aie.packet_source<%tile_0_0, DMA : 0>
+      aie.packet_dest<%tile_0_2, Ctrl : 0>
+    }
+
+    aie.flow(%tile_0_2, DMA : 0, %tile_0_0, DMA : 0) // output path: tile (0,2) DMA 0 -> tile (0,0) DMA 0
+
+    %core_0_2 = aie.core(%tile_0_2) {
+      %c0 = arith.constant 0 : index
+      %c1_i32 = arith.constant 1 : i32
+      %c3_i32 = arith.constant 3 : i32
+      %c1 = arith.constant 1 : index
+      %c8 = arith.constant 8 : index
+      %c4294967295 = arith.constant 4294967295 : index
+      scf.for %arg0 = %c0 to %c4294967295 step %c1 {
+        // initialize every element of input_buffer to 3
+        scf.for %arg1 = %c0 to %c8 step %c1 {
+          memref.store %c3_i32, %input_buffer[%arg1] : memref<8xi32>
+        }
+        aie.use_lock(%input_lock0, AcquireGreaterEqual, 1)
+        scf.for %arg1 = %c0 to %c8 step %c1 {
+          // 3 -> 4 (after the first acquire of input_lock0)
+          %1 = memref.load %input_buffer[%arg1] : memref<8xi32>
+          %2 = arith.addi %1, %c1_i32 : i32
+          memref.store %2, %input_buffer[%arg1] : memref<8xi32>
+        }
+        aie.use_lock(%input_lock0, AcquireGreaterEqual, 1)
+        scf.for %arg1 = %c0 to %c8 step %c1 {
+          // 4 -> 5 (after the second acquire of input_lock0)
+          %1 = memref.load %input_buffer[%arg1] : memref<8xi32>
+          %2 = arith.addi %1, %c1_i32 : i32
+          memref.store %2, %input_buffer[%arg1] : memref<8xi32>
+        }
+        aie.use_lock(%input_lock2, AcquireGreaterEqual, 1)
+        scf.for %arg1 = %c0 to %c8 step %c1 {
+          // 5 -> 6 (after the first acquire of input_lock2)
+          %1 = memref.load %input_buffer[%arg1] : memref<8xi32>
+          %2 = arith.addi %1, %c1_i32 : i32
+          memref.store %2, %input_buffer[%arg1] : memref<8xi32>
+        }
+        aie.use_lock(%input_lock2, AcquireGreaterEqual, 1)
+        scf.for %arg1 = %c0 to %c8 step %c1 {
+          // 6 -> 7 (after the second acquire of input_lock2); test.cpp expects 7
+          %1 = memref.load %input_buffer[%arg1] : memref<8xi32>
+          %2 = arith.addi %1, %c1_i32 : i32
+          memref.store %2, %input_buffer[%arg1] : memref<8xi32>
+        }
+        // copy input_buffer to output_buffer, then release output_lock4 for the DMA
+        aie.use_lock(%output_lock5, AcquireGreaterEqual, 1)
+        scf.for %arg1 = %c0 to %c8 step %c1 {
+          %1 = memref.load %input_buffer[%arg1] : memref<8xi32>
+          memref.store %1, %output_buffer[%arg1] : memref<8xi32>
+        }
+        aie.use_lock(%output_lock4, Release, 1)
+      }
+      aie.end
+    }
+
+    %mem_0_2 = aie.mem(%tile_0_2) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb2)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%output_lock4, AcquireGreaterEqual, 1)
+      aie.dma_bd(%output_buffer : memref<8xi32>, 0, 8)
+      aie.use_lock(%output_lock5, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:
+      aie.end
+    }
+
+    aie.shim_dma_allocation @ctrl0(MM2S, 0, 0)
+    aie.shim_dma_allocation @out0(S2MM, 0, 0)
+
+    memref.global "private" constant @blockwrite_data_0 : memref<8xi32> = dense<[2, 0, 0x40090000, 0, 0x40000000, 0, 0, 0x2000000]> // 8 words block-written as "bd0" in @seq
+    aiex.runtime_sequence @seq(%arg0: memref<8xi32>, %arg1: memref<8xi32>, %arg2: memref<8xi32>) {
+      %c0_i64 = arith.constant 0 : i64
+      %c1_i64 = arith.constant 1 : i64
+      %c8_i64 = arith.constant 8 : i64
+
+      // set Ctrl_Pkt_Tlast_Error_Enable=0 in Module_Clock_Control register
+      // aiex.npu.maskwrite32 {address = 0x00060000 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32, mask = 0x8 : ui32}
+
+      // write bd0: block-write @blockwrite_data_0 to address 0x1d000 (column 0, row 0)
+      %0 = memref.get_global @blockwrite_data_0 : memref<8xi32>
+      aiex.npu.blockwrite(%0) {address = 0x1d000 : ui32, column = 0 : i32, row = 0 : i32} : memref<8xi32>
+
+      // patch bd0 address (arg_idx = 1, arg_plus = 0), push to mm2s_0_task_queue, wait
+      aiex.npu.address_patch {addr = 0x1d004 : ui32, arg_idx = 1 : i32, arg_plus = 0 : i32}
+      aiex.npu.write32 {address = 0x1d214 : ui32, column = 0 : i32, row = 0 : i32, value = 0x80000000 : ui32}
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+
+      // patch bd0 address again (arg_idx = 1, arg_plus = 8), push to mm2s_0_task_queue, wait; presumably selects the second header/data pair of test.cpp's srcVecB - TODO confirm arg_plus is in bytes
+      aiex.npu.address_patch {addr = 0x1d004 : ui32, arg_idx = 1 : i32, arg_plus = 8 : i32}
+      aiex.npu.write32 {address = 0x1d214 : ui32, column = 0 : i32, row = 0 : i32, value = 0x80000000 : ui32}
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+
+      aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c8_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 2 : i64, issue_token = true, metadata = @out0} : memref<8xi32>
+      aiex.npu.dma_wait {symbol = @out0}
+    }
+  }
+}
diff --git a/test/npu-xrt/add_one_ctrl_packet/run.lit b/test/npu-xrt/add_one_ctrl_packet/run.lit
new file mode 100644
index 0000000000..f08641410e
--- /dev/null
+++ b/test/npu-xrt/add_one_ctrl_packet/run.lit
@@ -0,0 +1,10 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai
+//
+// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --basic-alloc-scheme --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir
+// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// CHECK: PASS!
+
diff --git a/test/npu-xrt/add_one_ctrl_packet/test.cpp b/test/npu-xrt/add_one_ctrl_packet/test.cpp
new file mode 100644
index 0000000000..3a2b012ecc
--- /dev/null
+++ b/test/npu-xrt/add_one_ctrl_packet/test.cpp
@@ -0,0 +1,276 @@
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <boost/program_options.hpp>
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+// Sizes, in 32-bit words, of the host buffer objects allocated in main().
+constexpr int IN_SIZE = 64;
+constexpr int OUT_SIZE = 64;
+
+namespace po = boost::program_options;
+
+/// Check that option `name` was supplied and names a readable file.
+///
+/// @param vm_in parsed command-line options
+/// @param name  long name of the option that holds the file path
+/// @throws std::runtime_error if the option is missing or the file cannot be
+///         opened
+void check_arg_file_exists(po::variables_map &vm_in, const std::string &name) {
+  if (!vm_in.count(name))
+    throw std::runtime_error("Error: no " + name + " file was provided\n");
+
+  const std::string &path = vm_in[name].as<std::string>();
+  std::ifstream test(path);
+  if (!test)
+    throw std::runtime_error("The " + name + " file " + path +
+                             " does not exist.\n");
+}
+
+/// Read an instruction sequence from a text file.
+///
+/// Every line is parsed as one hexadecimal 32-bit word.
+///
+/// @param instr_path path of the instruction file
+/// @return the parsed words in file order
+/// @throws std::runtime_error if the file cannot be opened or a line does not
+///         parse
+std::vector<uint32_t> load_instr_sequence(const std::string &instr_path) {
+  std::ifstream instr_file(instr_path);
+  if (!instr_file)
+    throw std::runtime_error("Unable to open instruction file\n");
+
+  std::vector<uint32_t> instr_v;
+  std::string line;
+  while (std::getline(instr_file, line)) {
+    std::istringstream iss(line);
+    uint32_t a;
+    if (!(iss >> std::hex >> a))
+      throw std::runtime_error("Unable to parse instruction file\n");
+    instr_v.push_back(a);
+  }
+  return instr_v;
+}
+
+/// Return true when `n` has an even number of set bits.
+static bool has_even_bit_count(uint32_t n) {
+  uint32_t ones = 0;
+  for (; n != 0; n >>= 1)
+    ones += n & 1;
+  return (ones % 2) == 0;
+}
+
+/// Build the 32-bit header word of a control packet.
+///
+/// Packs `address` into the low bits, `beats` at bit 20, `operation` at bit 22
+/// and `stream_id` at bit 24, then sets bit 31 when those bits contain an even
+/// number of ones (so the word ends up with an odd number of set bits,
+/// provided bit 31 was clear beforehand).
+static uint32_t make_ctrl_packet_header(uint32_t stream_id, uint32_t operation,
+                                        uint32_t beats, uint32_t address) {
+  uint32_t header = stream_id << 24 | operation << 22 | beats << 20 | address;
+  // Shift an unsigned 1 so that no signed int is shifted into its sign bit.
+  if (has_even_bit_count(header))
+    header |= static_cast<uint32_t>(1) << 31;
+  return header;
+}
+
+int main(int argc, const char *argv[]) {
+
+  // Program arguments parsing
+  po::options_description desc("Allowed options");
+  desc.add_options()("help,h", "produce help message")(
+      "xclbin,x", po::value<std::string>()->required(),
+      "the input xclbin path")(
+      "kernel,k", po::value<std::string>()->required(),
+      "the kernel name in the XCLBIN (for instance PP_PRE_FD)")(
+      "verbosity,v", po::value<int>()->default_value(0),
+      "the verbosity of the output")(
+      "instr,i", po::value<std::string>()->required(),
+      "path of file containing userspace instructions to be sent to the LX6");
+  po::variables_map vm;
+
+  try {
+    po::store(po::parse_command_line(argc, argv, desc), vm);
+
+    // Answer --help before notify(): notify() throws when a required option
+    // is missing, which would turn a bare --help into an error.
+    if (vm.count("help")) {
+      std::cout << desc << "\n";
+      return 1;
+    }
+    po::notify(vm);
+  } catch (const std::exception &ex) {
+    std::cerr << ex.what() << "\n\n";
+    std::cerr << "Usage:\n" << desc << "\n";
+    return 1;
+  }
+
+  check_arg_file_exists(vm, "xclbin");
+  check_arg_file_exists(vm, "instr");
+
+  const std::string xclbin_path = vm["xclbin"].as<std::string>();
+  std::vector<uint32_t> instr_v =
+      load_instr_sequence(vm["instr"].as<std::string>());
+
+  int verbosity = vm["verbosity"].as<int>();
+  if (verbosity >= 1)
+    std::cout << "Sequence instr count: " << instr_v.size() << "\n";
+
+  // Start the XRT test code
+  // Get a device handle
+  unsigned int device_index = 0;
+  auto device = xrt::device(device_index);
+
+  // Load the xclbin
+  if (verbosity >= 1)
+    std::cout << "Loading xclbin: " << xclbin_path << "\n";
+  auto xclbin = xrt::xclbin(xclbin_path);
+
+  if (verbosity >= 1)
+    std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << "\n";
+  std::string Node = vm["kernel"].as<std::string>();
+
+  // Get the kernel from the xclbin: the first one whose name starts with Node
+  auto xkernels = xclbin.get_kernels();
+  auto kernel_it = std::find_if(xkernels.begin(), xkernels.end(),
+                                [&Node](xrt::xclbin::kernel &k) {
+                                  auto name = k.get_name();
+                                  std::cout << "Name: " << name << std::endl;
+                                  return name.rfind(Node, 0) == 0;
+                                });
+  // Dereferencing the end iterator is undefined behaviour, so report a
+  // missing kernel instead.
+  if (kernel_it == xkernels.end()) {
+    std::cerr << "Error: no kernel whose name starts with " << Node
+              << " was found in " << xclbin_path << "\n";
+    return 1;
+  }
+  auto kernelName = kernel_it->get_name();
+
+  if (verbosity >= 1)
+    std::cout << "Registering xclbin: " << xclbin_path << "\n";
+
+  device.register_xclbin(xclbin);
+
+  // get a hardware context
+  if (verbosity >= 1)
+    std::cout << "Getting hardware context.\n";
+  xrt::hw_context context(device, xclbin.get_uuid());
+
+  // get a kernel handle
+  if (verbosity >= 1)
+    std::cout << "Getting handle to kernel:" << kernelName << "\n";
+  auto kernel = xrt::kernel(context, kernelName);
+
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(uint32_t),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t),
+                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t),
+                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+  auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t),
+                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5));
+
+  if (verbosity >= 1)
+    std::cout << "Writing data into buffer objects.\n";
+
+  // Input A holds 1..IN_SIZE
+  uint32_t *bufInA = bo_inA.map<uint32_t *>();
+  std::vector<uint32_t> srcVecA;
+  srcVecA.reserve(IN_SIZE);
+  for (int i = 0; i < IN_SIZE; i++)
+    srcVecA.push_back(i + 1);
+  std::memcpy(bufInA, srcVecA.data(), srcVecA.size() * sizeof(uint32_t));
+
+  // Input B holds two control packets, each a header word followed by one
+  // data word.
+  uint32_t beats = 1 - 1;
+  uint32_t operation = 0;
+  uint32_t stream_id = 0;
+
+  // Lock0_value
+  uint32_t address = 0x0001F000;
+  uint32_t header0 =
+      make_ctrl_packet_header(stream_id, operation, beats, address);
+
+  // Lock2_value
+  address += 0x20;
+  uint32_t header1 =
+      make_ctrl_packet_header(stream_id, operation, beats, address);
+
+  // set lock values to 2
+  uint32_t data = 2;
+  std::vector<uint32_t> srcVecB = {
+      header0,
+      data,
+      header1,
+      data,
+  };
+  void *bufInB = bo_inB.map<void *>();
+  std::memcpy(bufInB, srcVecB.data(), srcVecB.size() * sizeof(uint32_t));
+
+  void *bufInstr = bo_instr.map<void *>();
+  std::memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(uint32_t));
+
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inB.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  if (verbosity >= 1)
+    std::cout << "Running Kernel.\n";
+  unsigned int opcode = 3;
+  auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out);
+
+  ert_cmd_state r = run.wait();
+  if (r != ERT_CMD_STATE_COMPLETED) {
+    std::cout << "Kernel did not complete. Returned status: " << r << "\n";
+    return 1;
+  }
+
+  bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+  uint32_t *bufOut = bo_out.map<uint32_t *>();
+
+  int errors = 0;
+
+  // aie.mlir initialises its 8-element buffer to 3 and adds one four times.
+  for (uint32_t i = 0; i < 8; i++) {
+    uint32_t ref = 7;
+    if (*(bufOut + i) != ref) {
+      std::cout << "Error in output " << *(bufOut + i) << " != " << ref
+                << std::endl;
+      errors++;
+    } else {
+      std::cout << "Correct output " << *(bufOut + i) << " == " << ref
+                << std::endl;
+    }
+  }
+
+  if (!errors) {
+    std::cout << "\nPASS!\n\n";
+    return 0;
+  } else {
+    std::cout << "\nfailed.\n\n";
+    return 1;
+  }
+}