-
Notifications
You must be signed in to change notification settings - Fork 82
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
365 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
//===- aie.mlir ------------------------------------------------*- MLIR -*-===// | ||
// | ||
// This file is licensed under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
// (c) Copyright 2024 Advanced Micro Devices, Inc. | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
module { | ||
aie.device(npu1_1col) { | ||
// Global symbols naming the two host-facing DMA endpoints. They are bound to
// concrete channels by the aie.shim_dma_allocation ops later in this device.
memref.global "public" @out0 : memref<8xi32> | ||
memref.global "public" @ctrl0 : memref<8xi32> | ||
|
||
// Tile (0, 0) is the DMA endpoint of both flows declared below; tile (0, 2)
// owns the locks, the buffers, the core program and the tile DMA program.
%tile_0_0 = aie.tile(0, 0) | ||
%tile_0_2 = aie.tile(0, 2) | ||
|
||
// input_lock0 / input_lock2 (lock ids 0 and 2) start at 0, so the core blocks
// on them until something outside the core raises them. In this test the host
// (test.cpp) sends control packets that set both lock values to 2.
// output_lock4 / output_lock5 form the handshake between the core and the MM2S
// DMA: lock 5 starts at 1 so the core may fill output_buffer first, lock 4
// starts at 0 so the DMA waits until the core releases it.
%input_lock0 = aie.lock(%tile_0_2, 0) {init = 0 : i32, sym_name = "input_lock0"} | ||
%input_lock2 = aie.lock(%tile_0_2, 2) {init = 0 : i32, sym_name = "input_lock2"} | ||
%output_lock4 = aie.lock(%tile_0_2, 4) {init = 0 : i32, sym_name = "output_lock4"} | ||
%output_lock5 = aie.lock(%tile_0_2, 5) {init = 1 : i32, sym_name = "output_lock5"} | ||
|
||
// Two 8 x i32 buffers on tile (0, 2): the core computes in input_buffer and
// copies the final values into output_buffer, which the tile DMA sends out.
%input_buffer = aie.buffer(%tile_0_2) {sym_name = "input_buffer"} : memref<8xi32> | ||
%output_buffer = aie.buffer(%tile_0_2) {sym_name = "output_buffer"} : memref<8xi32> | ||
|
||
// Packet-switched route (packet id 0x1) from DMA channel 0 of tile (0, 0) to
// the Ctrl port of tile (0, 2): the path taken by the control packets.
aie.packet_flow(0x1) { | ||
aie.packet_source<%tile_0_0, DMA : 0> | ||
aie.packet_dest<%tile_0_2, Ctrl : 0> | ||
} | ||
|
||
// Route from DMA channel 0 of tile (0, 2) back to DMA channel 0 of
// tile (0, 0): the path taken by the contents of output_buffer.
aie.flow(%tile_0_2, DMA : 0, %tile_0_0, DMA : 0) | ||
|
||
// Core program for tile (0, 2).
// Each iteration of the outer loop (trip count 4294967295, i.e. effectively
// endless) does the following:
//   1. fills input_buffer with 3;
//   2. four times in a row: acquires an input lock (input_lock0 twice, then
//      input_lock2 twice) and adds 1 to every element, giving 4, 5, 6, 7;
//   3. acquires output_lock5, copies input_buffer into output_buffer and
//      releases output_lock4 so the tile DMA can send the result.
// Both input locks start at 0, so no increment happens until they are raised
// from outside the core. NOTE(review): presumably each AcquireGreaterEqual
// consumes one unit of the lock, which is why the host sets each lock to 2 for
// the two acquires per lock - confirm against the lock semantics.
%core_0_2 = aie.core(%tile_0_2) { | ||
%c0 = arith.constant 0 : index | ||
%c1_i32 = arith.constant 1 : i32 | ||
%c3_i32 = arith.constant 3 : i32 | ||
%c1 = arith.constant 1 : index | ||
%c8 = arith.constant 8 : index | ||
%c4294967295 = arith.constant 4294967295 : index | ||
scf.for %arg0 = %c0 to %c4294967295 step %c1 { | ||
// initialize to 3 | ||
scf.for %arg1 = %c0 to %c8 step %c1 { | ||
memref.store %c3_i32, %input_buffer[%arg1] : memref<8xi32> | ||
} | ||
// first acquire of input_lock0, then increment every element
aie.use_lock(%input_lock0, AcquireGreaterEqual, 1) | ||
scf.for %arg1 = %c0 to %c8 step %c1 { | ||
// 4 | ||
%1 = memref.load %input_buffer[%arg1] : memref<8xi32> | ||
%2 = arith.addi %1, %c1_i32 : i32 | ||
memref.store %2, %input_buffer[%arg1] : memref<8xi32> | ||
} | ||
// second acquire of input_lock0, then increment every element
aie.use_lock(%input_lock0, AcquireGreaterEqual, 1) | ||
scf.for %arg1 = %c0 to %c8 step %c1 { | ||
// 5 | ||
%1 = memref.load %input_buffer[%arg1] : memref<8xi32> | ||
%2 = arith.addi %1, %c1_i32 : i32 | ||
memref.store %2, %input_buffer[%arg1] : memref<8xi32> | ||
} | ||
// first acquire of input_lock2, then increment every element
aie.use_lock(%input_lock2, AcquireGreaterEqual, 1) | ||
scf.for %arg1 = %c0 to %c8 step %c1 { | ||
// 6 | ||
%1 = memref.load %input_buffer[%arg1] : memref<8xi32> | ||
%2 = arith.addi %1, %c1_i32 : i32 | ||
memref.store %2, %input_buffer[%arg1] : memref<8xi32> | ||
} | ||
// second acquire of input_lock2, then increment every element
aie.use_lock(%input_lock2, AcquireGreaterEqual, 1) | ||
scf.for %arg1 = %c0 to %c8 step %c1 { | ||
// 7 | ||
%1 = memref.load %input_buffer[%arg1] : memref<8xi32> | ||
%2 = arith.addi %1, %c1_i32 : i32 | ||
memref.store %2, %input_buffer[%arg1] : memref<8xi32> | ||
} | ||
// write to output buffer | ||
// output_lock5 guards output_buffer against the DMA; output_lock4 hands the
// filled buffer over to the DMA program in aie.mem.
aie.use_lock(%output_lock5, AcquireGreaterEqual, 1) | ||
scf.for %arg1 = %c0 to %c8 step %c1 { | ||
%1 = memref.load %input_buffer[%arg1] : memref<8xi32> | ||
memref.store %1, %output_buffer[%arg1] : memref<8xi32> | ||
} | ||
aie.use_lock(%output_lock4, Release, 1) | ||
} | ||
aie.end | ||
} | ||
|
||
// Tile DMA program for tile (0, 2): MM2S channel 0 repeatedly sends the eight
// i32 words of output_buffer. The single buffer descriptor in ^bb1 chains back
// to itself, so each transfer waits for the core to release output_lock4 and
// then releases output_lock5 to let the core refill the buffer.
%mem_0_2 = aie.mem(%tile_0_2) { | ||
%0 = aie.dma_start(MM2S, 0, ^bb1, ^bb2) | ||
^bb1: // 2 preds: ^bb0, ^bb1 | ||
aie.use_lock(%output_lock4, AcquireGreaterEqual, 1) | ||
// offset 0, length 8 elements of output_buffer
aie.dma_bd(%output_buffer : memref<8xi32>, 0, 8) | ||
aie.use_lock(%output_lock5, Release, 1) | ||
aie.next_bd ^bb1 | ||
^bb2: | ||
aie.end | ||
} | ||
|
||
// Bind the public symbols to DMA channels: @ctrl0 is the MM2S (outgoing)
// channel 0 and @out0 the S2MM (incoming) channel 0, both with a trailing
// operand of 0 (presumably the column - TODO confirm).
aie.shim_dma_allocation @ctrl0(MM2S, 0, 0) | ||
aie.shim_dma_allocation @out0(S2MM, 0, 0) | ||
|
||
// Eight raw words that the runtime sequence block-writes to address 0x1d000
// ("bd0"). The word at 0x1d004 is later overwritten by aiex.npu.address_patch.
// NOTE(review): the meaning of the individual words is not documented here -
// check against the buffer descriptor register layout.
memref.global "private" constant @blockwrite_data_0 : memref<8xi32> = dense<[2, 0, 0x40090000, 0, 0x40000000, 0, 0, 0x2000000]> | ||
// Host-side runtime sequence. It takes three 8 x i32 buffers; test.cpp passes
// bo_inA, bo_inB and bo_out in that order, so %arg1 is presumably the buffer
// holding the control packets and %arg2 the result buffer.
// Steps: program bd0 by hand, send it twice (once per control packet), then
// receive the eight result words through @out0.
aiex.runtime_sequence @seq(%arg0: memref<8xi32>, %arg1: memref<8xi32>, %arg2: memref<8xi32>) { | ||
%c0_i64 = arith.constant 0 : i64 | ||
%c1_i64 = arith.constant 1 : i64 | ||
%c8_i64 = arith.constant 8 : i64 | ||
|
||
// set Ctrl_Pkt_Tlast_Error_Enable=0 in Module_Clock_Control register | ||
// aiex.npu.maskwrite32 {address = 0x00060000 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32, mask = 0x8 : ui32} | ||
|
||
// write bd0 | ||
%0 = memref.get_global @blockwrite_data_0 : memref<8xi32> | ||
aiex.npu.blockwrite(%0) {address = 0x1d000 : ui32, column = 0 : i32, row = 0 : i32} : memref<8xi32> | ||
|
||
// patch bd0 address, push to mm2s_0_task_queue, wait | ||
// First transfer: the address word at 0x1d004 is patched with argument 1 at
// offset 0, i.e. the start of the control-packet buffer.
aiex.npu.address_patch {addr = 0x1d004 : ui32, arg_idx = 1 : i32, arg_plus = 0 : i32} | ||
aiex.npu.write32 {address = 0x1d214 : ui32, column = 0 : i32, row = 0 : i32, value = 0x80000000 : ui32} | ||
aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} | ||
|
||
// patch bd0 address, push to mm2s_0_task_queue, wait | ||
// Second transfer: same descriptor, re-patched with arg_plus = 8 (presumably
// bytes, i.e. two i32 words further - the second header/data pair that
// test.cpp stores in bo_inB).
aiex.npu.address_patch {addr = 0x1d004 : ui32, arg_idx = 1 : i32, arg_plus = 8 : i32} | ||
aiex.npu.write32 {address = 0x1d214 : ui32, column = 0 : i32, row = 0 : i32, value = 0x80000000 : ui32} | ||
aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} | ||
|
||
// Receive the result: sizes [1, 1, 1, 8] with innermost stride 1 describe
// eight consecutive i32 words of %arg2, moved through the @out0 channel;
// dma_wait then waits on @out0 before the sequence ends.
aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c8_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 2 : i64, issue_token = true, metadata = @out0} : memref<8xi32> | ||
aiex.npu.dma_wait {symbol = @out0} | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
// (c) Copyright 2024 Advanced Micro Devices, Inc. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
// REQUIRES: ryzen_ai | ||
// | ||
// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --basic-alloc-scheme --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir | ||
// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem | ||
// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s | ||
// CHECK: PASS! | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,229 @@ | ||
//===- test.cpp -------------------------------------------------*- C++ -*-===// | ||
// | ||
// This file is licensed under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
// Copyright (C) 2024, Advanced Micro Devices, Inc. | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#include <algorithm>
#include <boost/program_options.hpp> | ||
#include <cstdint> | ||
#include <cstring>
#include <fstream> | ||
#include <iostream> | ||
#include <sstream> | ||
#include <stdexcept>
#include <string> | ||
#include <vector> | ||
|
||
#include "xrt/xrt_bo.h" | ||
#include "xrt/xrt_device.h" | ||
#include "xrt/xrt_kernel.h" | ||
|
||
constexpr int IN_SIZE = 64; | ||
constexpr int OUT_SIZE = 64; | ||
|
||
namespace po = boost::program_options; | ||
|
||
// Ensures that the command-line option `name` was given and that the file it
// names can be opened for reading.
//
// vm_in: parsed program options.
// name:  long name of the option holding a file path.
//
// Throws std::runtime_error when the option is missing or the file cannot be
// opened.
void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
  if (vm_in.count(name) == 0)
    throw std::runtime_error("Error: no " + name + " file was provided\n");

  const std::string &path = vm_in[name].as<std::string>();
  std::ifstream probe(path);
  if (probe)
    return;

  throw std::runtime_error("The " + name + " file " + path +
                           " does not exist.\n");
}
|
||
// Reads an instruction file containing one hexadecimal 32-bit word per line
// and returns the words in file order.
//
// instr_path: path of the text file to read.
//
// Lines that are empty or contain only whitespace (for instance a trailing
// blank line or a stray carriage return) are skipped. Only the first token of
// a line is parsed; anything after it is ignored.
//
// Throws std::runtime_error when the file cannot be opened, or when a
// non-blank line does not start with a hexadecimal number.
std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
  std::ifstream instr_file(instr_path);
  // Without this check an unreadable path would silently produce an empty
  // instruction sequence.
  if (!instr_file) {
    throw std::runtime_error("Unable to open instruction file " + instr_path +
                             "\n");
  }

  std::vector<uint32_t> instr_v;
  std::string line;
  while (std::getline(instr_file, line)) {
    if (line.find_first_not_of(" \t\r") == std::string::npos)
      continue;

    std::istringstream iss(line);
    uint32_t word;
    if (!(iss >> std::hex >> word)) {
      throw std::runtime_error("Unable to parse instruction file\n");
    }
    instr_v.push_back(word);
  }
  return instr_v;
}
|
||
int main(int argc, const char *argv[]) { | ||
|
||
// Program arguments parsing | ||
po::options_description desc("Allowed options"); | ||
desc.add_options()("help,h", "produce help message")( | ||
"xclbin,x", po::value<std::string>()->required(), | ||
"the input xclbin path")( | ||
"kernel,k", po::value<std::string>()->required(), | ||
"the kernel name in the XCLBIN (for instance PP_PRE_FD)")( | ||
"verbosity,v", po::value<int>()->default_value(0), | ||
"the verbosity of the output")( | ||
"instr,i", po::value<std::string>()->required(), | ||
"path of file containing userspace instructions to be sent to the LX6"); | ||
po::variables_map vm; | ||
|
||
try { | ||
po::store(po::parse_command_line(argc, argv, desc), vm); | ||
po::notify(vm); | ||
|
||
if (vm.count("help")) { | ||
std::cout << desc << "\n"; | ||
return 1; | ||
} | ||
} catch (const std::exception &ex) { | ||
std::cerr << ex.what() << "\n\n"; | ||
std::cerr << "Usage:\n" << desc << "\n"; | ||
return 1; | ||
} | ||
|
||
check_arg_file_exists(vm, "xclbin"); | ||
check_arg_file_exists(vm, "instr"); | ||
|
||
std::vector<uint32_t> instr_v = | ||
load_instr_sequence(vm["instr"].as<std::string>()); | ||
|
||
int verbosity = vm["verbosity"].as<int>(); | ||
if (verbosity >= 1) | ||
std::cout << "Sequence instr count: " << instr_v.size() << "\n"; | ||
|
||
// Start the XRT test code | ||
// Get a device handle | ||
unsigned int device_index = 0; | ||
auto device = xrt::device(device_index); | ||
|
||
// Load the xclbin | ||
if (verbosity >= 1) | ||
std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n"; | ||
auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>()); | ||
|
||
if (verbosity >= 1) | ||
std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << "\n"; | ||
std::string Node = vm["kernel"].as<std::string>(); | ||
|
||
// Get the kernel from the xclbin | ||
auto xkernels = xclbin.get_kernels(); | ||
auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), | ||
[Node](xrt::xclbin::kernel &k) { | ||
auto name = k.get_name(); | ||
std::cout << "Name: " << name << std::endl; | ||
return name.rfind(Node, 0) == 0; | ||
}); | ||
auto kernelName = xkernel.get_name(); | ||
|
||
if (verbosity >= 1) | ||
std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>() | ||
<< "\n"; | ||
|
||
device.register_xclbin(xclbin); | ||
|
||
// get a hardware context | ||
if (verbosity >= 1) | ||
std::cout << "Getting hardware context.\n"; | ||
xrt::hw_context context(device, xclbin.get_uuid()); | ||
|
||
// get a kernel handle | ||
if (verbosity >= 1) | ||
std::cout << "Getting handle to kernel:" << kernelName << "\n"; | ||
auto kernel = xrt::kernel(context, kernelName); | ||
|
||
auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), | ||
XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); | ||
auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t), | ||
XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); | ||
auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), | ||
XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); | ||
auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), | ||
XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); | ||
|
||
if (verbosity >= 1) | ||
std::cout << "Writing data into buffer objects.\n"; | ||
|
||
uint32_t *bufInA = bo_inA.map<uint32_t *>(); | ||
std::vector<uint32_t> srcVecA; | ||
for (int i = 0; i < IN_SIZE; i++) | ||
srcVecA.push_back(i + 1); | ||
memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(uint32_t))); | ||
|
||
uint32_t beats = 1 - 1; | ||
uint32_t operation = 0; | ||
uint32_t stream_id = 0; | ||
auto parity = [](uint32_t n) { | ||
uint32_t p = 0; | ||
while (n) { | ||
p += n & 1; | ||
n >>= 1; | ||
} | ||
return (p % 2) == 0; | ||
}; | ||
|
||
// Lock0_value | ||
uint32_t address = 0x0001F000; | ||
uint32_t header0 = stream_id << 24 | operation << 22 | beats << 20 | address; | ||
header0 |= (0x1 & parity(header0)) << 31; | ||
|
||
// Lock2_value | ||
address += 0x20; | ||
uint32_t header1 = stream_id << 24 | operation << 22 | beats << 20 | address; | ||
header1 |= (0x1 & parity(header1)) << 31; | ||
|
||
// set lock values to 2 | ||
uint32_t data = 2; | ||
std::vector<uint32_t> srcVecB = { | ||
header0, | ||
data, | ||
header1, | ||
data, | ||
}; | ||
void *bufInB = bo_inB.map<void *>(); | ||
memcpy(bufInB, srcVecB.data(), srcVecB.size() * sizeof(int)); | ||
|
||
void *bufInstr = bo_instr.map<void *>(); | ||
memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); | ||
|
||
bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); | ||
bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); | ||
bo_inB.sync(XCL_BO_SYNC_BO_TO_DEVICE); | ||
|
||
if (verbosity >= 1) | ||
std::cout << "Running Kernel.\n"; | ||
unsigned int opcode = 3; | ||
auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); | ||
|
||
ert_cmd_state r = run.wait(); | ||
if (r != ERT_CMD_STATE_COMPLETED) { | ||
std::cout << "Kernel did not complete. Returned status: " << r << "\n"; | ||
return 1; | ||
} | ||
|
||
bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); | ||
|
||
uint32_t *bufOut = bo_out.map<uint32_t *>(); | ||
|
||
int errors = 0; | ||
|
||
for (uint32_t i = 0; i < 8; i++) { | ||
uint32_t ref = 7; | ||
if (*(bufOut + i) != ref) { | ||
std::cout << "Error in output " << *(bufOut + i) << " != " << ref | ||
<< std::endl; | ||
errors++; | ||
} else { | ||
std::cout << "Correct output " << *(bufOut + i) << " == " << ref | ||
<< std::endl; | ||
} | ||
} | ||
|
||
if (!errors) { | ||
std::cout << "\nPASS!\n\n"; | ||
return 0; | ||
} else { | ||
std::cout << "\nfailed.\n\n"; | ||
return 1; | ||
} | ||
} |