diff --git a/test/npu-xrt/ctrl_packet_reconfig_1x4_cores/aie1.mlir b/test/npu-xrt/ctrl_packet_reconfig_1x4_cores/aie1.mlir new file mode 100644 index 0000000000..c21ae08df2 --- /dev/null +++ b/test/npu-xrt/ctrl_packet_reconfig_1x4_cores/aie1.mlir @@ -0,0 +1,20 @@ +//===- aie1.mlir -----------------------------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// + +module { + aie.device(npu1_1col) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %tile_0_2 = aie.tile(0, 2) + %tile_0_3 = aie.tile(0, 3) + %tile_0_4 = aie.tile(0, 4) + %tile_0_5 = aie.tile(0, 5) + } +} diff --git a/test/npu-xrt/ctrl_packet_reconfig_1x4_cores/aie2.mlir b/test/npu-xrt/ctrl_packet_reconfig_1x4_cores/aie2.mlir new file mode 100644 index 0000000000..483dd9f8bb --- /dev/null +++ b/test/npu-xrt/ctrl_packet_reconfig_1x4_cores/aie2.mlir @@ -0,0 +1,298 @@ +//===- aie2.mlir -----------------------------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates +// +//===----------------------------------------------------------------------===// + +module { + aie.device(npu1_1col) { + memref.global "public" @objFifo_in0 : memref<64x64xi8> + memref.global "public" @objFifo_out0 : memref<64x64xi8> + + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %tile_0_2 = aie.tile(0, 2) + %tile_0_3 = aie.tile(0, 3) + %tile_0_4 = aie.tile(0, 4) + %tile_0_5 = aie.tile(0, 5) + + // Tile 0, 2 buffers and locks + + %tile_0_2_buff_0 = aie.buffer(%tile_0_2) {sym_name = "tile_0_2_buff_0"} : memref<64x64xi8> + %tile_0_2_buff_1 = aie.buffer(%tile_0_2) {sym_name = "tile_0_2_buff_1"} : memref<64x64xi8> + + %tile_0_2_lock_0 = aie.lock(%tile_0_2, 0) {init = 1 : i32, sym_name = "tile_0_2_lock_0"} + %tile_0_2_lock_1 = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "tile_0_2_lock_1"} + %tile_0_2_lock_2 = aie.lock(%tile_0_2, 2) {init = 0 : i32, sym_name = "tile_0_2_lock_2"} + %tile_0_2_lock_3 = aie.lock(%tile_0_2, 3) {init = 1 : i32, sym_name = "tile_0_2_lock_3"} + + // Tile 0, 3 buffers and locks + + %tile_0_3_buff_0 = aie.buffer(%tile_0_3) {sym_name = "tile_0_3_buff_0"} : memref<64x64xi8> + %tile_0_3_buff_1 = aie.buffer(%tile_0_3) {sym_name = "tile_0_3_buff_1"} : memref<64x64xi8> + + %tile_0_3_lock_0 = aie.lock(%tile_0_3, 0) {init = 1 : i32, sym_name = "tile_0_3_lock_0"} + %tile_0_3_lock_1 = aie.lock(%tile_0_3, 1) {init = 0 : i32, sym_name = "tile_0_3_lock_1"} + %tile_0_3_lock_2 = aie.lock(%tile_0_3, 2) {init = 0 : i32, sym_name = "tile_0_3_lock_2"} + %tile_0_3_lock_3 = aie.lock(%tile_0_3, 3) {init = 1 : i32, sym_name = "tile_0_3_lock_3"} + + // Tile 0, 4 buffers and locks + + %tile_0_4_buff_0 = aie.buffer(%tile_0_4) {sym_name = "tile_0_4_buff_0"} : memref<64x64xi8> + %tile_0_4_buff_1 = aie.buffer(%tile_0_4) {sym_name = "tile_0_4_buff_1"} : memref<64x64xi8> + + %tile_0_4_lock_0 = aie.lock(%tile_0_4, 0) {init = 1 : i32, sym_name = "tile_0_4_lock_0"} + %tile_0_4_lock_1 = aie.lock(%tile_0_4, 1) {init = 0 : i32, sym_name = 
"tile_0_4_lock_1"} + %tile_0_4_lock_2 = aie.lock(%tile_0_4, 2) {init = 0 : i32, sym_name = "tile_0_4_lock_2"} + %tile_0_4_lock_3 = aie.lock(%tile_0_4, 3) {init = 1 : i32, sym_name = "tile_0_4_lock_3"} + + // Tile 0, 5 buffers and locks + + %tile_0_5_buff_0 = aie.buffer(%tile_0_5) {sym_name = "tile_0_5_buff_0"} : memref<64x64xi8> + %tile_0_5_buff_1 = aie.buffer(%tile_0_5) {sym_name = "tile_0_5_buff_1"} : memref<64x64xi8> + + %tile_0_5_lock_0 = aie.lock(%tile_0_5, 0) {init = 1 : i32, sym_name = "tile_0_5_lock_0"} + %tile_0_5_lock_1 = aie.lock(%tile_0_5, 1) {init = 0 : i32, sym_name = "tile_0_5_lock_1"} + %tile_0_5_lock_2 = aie.lock(%tile_0_5, 2) {init = 0 : i32, sym_name = "tile_0_5_lock_2"} + %tile_0_5_lock_3 = aie.lock(%tile_0_5, 3) {init = 1 : i32, sym_name = "tile_0_5_lock_3"} + + aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0) + aie.flow(%tile_0_1, DMA : 1, %tile_0_3, DMA : 0) + aie.flow(%tile_0_1, DMA : 2, %tile_0_4, DMA : 0) + aie.flow(%tile_0_1, DMA : 3, %tile_0_5, DMA : 0) + + aie.flow(%tile_0_2, DMA : 0, %tile_0_1, DMA : 1) + aie.flow(%tile_0_3, DMA : 0, %tile_0_1, DMA : 2) + aie.flow(%tile_0_4, DMA : 0, %tile_0_1, DMA : 3) + aie.flow(%tile_0_5, DMA : 0, %tile_0_1, DMA : 4) + + aie.packet_flow(0) { + aie.packet_source<%tile_0_0, DMA : 0> + aie.packet_dest<%tile_0_1, DMA : 0> + } + aie.flow(%tile_0_1, DMA : 4, %tile_0_0, DMA : 0) + + // Tile 0, 2 core and dma + %core_0_2 = aie.core(%tile_0_2) { + %c8 = arith.constant 8 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c12_i8 = arith.constant 12 : i8 + %c2 = arith.constant 2 : index + %c64 = arith.constant 64 : index + aie.use_lock(%tile_0_2_lock_3, AcquireGreaterEqual, 1) + aie.use_lock(%tile_0_2_lock_1, AcquireGreaterEqual, 1) + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + %0 = memref.load %tile_0_2_buff_0[%arg1, %arg2] : memref<64x64xi8> + %1 = arith.addi %0, %c12_i8 : i8 + memref.store %1, %tile_0_2_buff_1[%arg1, %arg2] : memref<64x64xi8> + 
} + } + aie.use_lock(%tile_0_2_lock_0, Release, 1) + aie.use_lock(%tile_0_2_lock_2, Release, 1) + aie.end + } + + %mem_0_2 = aie.mem(%tile_0_2) { + %0 = aie.dma(S2MM, 0) [{ + aie.use_lock(%tile_0_2_lock_0, AcquireGreaterEqual, 1) + aie.dma_bd(%tile_0_2_buff_0 : memref<64x64xi8>) + aie.use_lock(%tile_0_2_lock_1, Release, 1) + }] + %1 = aie.dma(MM2S, 0) [{ + aie.use_lock(%tile_0_2_lock_2, AcquireGreaterEqual, 1) + aie.dma_bd(%tile_0_2_buff_1 : memref<64x64xi8>) + aie.use_lock(%tile_0_2_lock_3, Release, 1) + }] + aie.end + } + + // Tile 0, 3 core and dma + %core_0_3 = aie.core(%tile_0_3) { + %c8 = arith.constant 8 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c12_i8 = arith.constant 12 : i8 + %c2 = arith.constant 2 : index + %c64 = arith.constant 64 : index + aie.use_lock(%tile_0_3_lock_3, AcquireGreaterEqual, 1) + aie.use_lock(%tile_0_3_lock_1, AcquireGreaterEqual, 1) + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + %0 = memref.load %tile_0_3_buff_0[%arg1, %arg2] : memref<64x64xi8> + %1 = arith.addi %0, %c12_i8 : i8 + memref.store %1, %tile_0_3_buff_1[%arg1, %arg2] : memref<64x64xi8> + } + } + aie.use_lock(%tile_0_3_lock_0, Release, 1) + aie.use_lock(%tile_0_3_lock_2, Release, 1) + aie.end + } + + %mem_0_3 = aie.mem(%tile_0_3) { + %0 = aie.dma(S2MM, 0) [{ + aie.use_lock(%tile_0_3_lock_0, AcquireGreaterEqual, 1) + aie.dma_bd(%tile_0_3_buff_0 : memref<64x64xi8>) + aie.use_lock(%tile_0_3_lock_1, Release, 1) + }] + %1 = aie.dma(MM2S, 0) [{ + aie.use_lock(%tile_0_3_lock_2, AcquireGreaterEqual, 1) + aie.dma_bd(%tile_0_3_buff_1 : memref<64x64xi8>) + aie.use_lock(%tile_0_3_lock_3, Release, 1) + }] + aie.end + } + + // Tile 0, 4 core and dma + %core_0_4 = aie.core(%tile_0_4) { + %c8 = arith.constant 8 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c12_i8 = arith.constant 12 : i8 + %c2 = arith.constant 2 : index + %c64 = arith.constant 64 : index + aie.use_lock(%tile_0_4_lock_3, 
AcquireGreaterEqual, 1) + aie.use_lock(%tile_0_4_lock_1, AcquireGreaterEqual, 1) + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + %0 = memref.load %tile_0_4_buff_0[%arg1, %arg2] : memref<64x64xi8> + %1 = arith.addi %0, %c12_i8 : i8 + memref.store %1, %tile_0_4_buff_1[%arg1, %arg2] : memref<64x64xi8> + } + } + aie.use_lock(%tile_0_4_lock_0, Release, 1) + aie.use_lock(%tile_0_4_lock_2, Release, 1) + aie.end + } + + %mem_0_4 = aie.mem(%tile_0_4) { + %0 = aie.dma(S2MM, 0) [{ + aie.use_lock(%tile_0_4_lock_0, AcquireGreaterEqual, 1) + aie.dma_bd(%tile_0_4_buff_0 : memref<64x64xi8>) + aie.use_lock(%tile_0_4_lock_1, Release, 1) + }] + %1 = aie.dma(MM2S, 0) [{ + aie.use_lock(%tile_0_4_lock_2, AcquireGreaterEqual, 1) + aie.dma_bd(%tile_0_4_buff_1 : memref<64x64xi8>) + aie.use_lock(%tile_0_4_lock_3, Release, 1) + }] + aie.end + } + + // Tile 0, 5 core and dma + %core_0_5 = aie.core(%tile_0_5) { + %c8 = arith.constant 8 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c12_i8 = arith.constant 12 : i8 + %c2 = arith.constant 2 : index + %c64 = arith.constant 64 : index + aie.use_lock(%tile_0_5_lock_3, AcquireGreaterEqual, 1) + aie.use_lock(%tile_0_5_lock_1, AcquireGreaterEqual, 1) + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + %0 = memref.load %tile_0_5_buff_0[%arg1, %arg2] : memref<64x64xi8> + %1 = arith.addi %0, %c12_i8 : i8 + memref.store %1, %tile_0_5_buff_1[%arg1, %arg2] : memref<64x64xi8> + } + } + aie.use_lock(%tile_0_5_lock_0, Release, 1) + aie.use_lock(%tile_0_5_lock_2, Release, 1) + aie.end + } + + %mem_0_5 = aie.mem(%tile_0_5) { + %0 = aie.dma(S2MM, 0) [{ + aie.use_lock(%tile_0_5_lock_0, AcquireGreaterEqual, 1) + aie.dma_bd(%tile_0_5_buff_0 : memref<64x64xi8>) + aie.use_lock(%tile_0_5_lock_1, Release, 1) + }] + %1 = aie.dma(MM2S, 0) [{ + aie.use_lock(%tile_0_5_lock_2, AcquireGreaterEqual, 1) + aie.dma_bd(%tile_0_5_buff_1 : memref<64x64xi8>) + 
aie.use_lock(%tile_0_5_lock_3, Release, 1) + }] + aie.end + } + + %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { + %buff_0 = aie.buffer(%tile_0_1) {sym_name = "memtile_buff_0"} : memref<4x64x64xi8> + %buff_1 = aie.buffer(%tile_0_1) {sym_name = "memtile_buff_1"} : memref<4x64x64xi8> + %memtile_lock_0 = aie.lock(%tile_0_1, 0) {init = 4 : i32, sym_name = "memtile_lock_0"} + %memtile_lock_1 = aie.lock(%tile_0_1, 1) {init = 0 : i32, sym_name = "memtile_lock_1"} + %memtile_lock_2 = aie.lock(%tile_0_1, 2) {init = 0 : i32, sym_name = "memtile_lock_2"} + %memtile_lock_3 = aie.lock(%tile_0_1, 3) {init = 4 : i32, sym_name = "memtile_lock_3"} + %0 = aie.dma(S2MM, 0) [{ + aie.use_lock(%memtile_lock_0, AcquireGreaterEqual, 4) + aie.dma_bd(%buff_0 : memref<4x64x64xi8>, 0, 16384) + aie.use_lock(%memtile_lock_1, Release, 4) + }] + %1 = aie.dma(MM2S, 0) [{ + aie.use_lock(%memtile_lock_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buff_0 : memref<4x64x64xi8>, 0, 4096) + aie.use_lock(%memtile_lock_0, Release, 1) + }] + %2 = aie.dma(MM2S, 1) [{ + aie.use_lock(%memtile_lock_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buff_0 : memref<4x64x64xi8>, 4096, 4096) + aie.use_lock(%memtile_lock_0, Release, 1) + }] + %3 = aie.dma(MM2S, 2) [{ + aie.use_lock(%memtile_lock_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buff_0 : memref<4x64x64xi8>, 8192, 4096) + aie.use_lock(%memtile_lock_0, Release, 1) + }] + %4 = aie.dma(MM2S, 3) [{ + aie.use_lock(%memtile_lock_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buff_0 : memref<4x64x64xi8>, 12288, 4096) + aie.use_lock(%memtile_lock_0, Release, 1) + }] + + %5 = aie.dma(S2MM, 1) [{ + aie.use_lock(%memtile_lock_3, AcquireGreaterEqual, 1) + aie.dma_bd(%buff_1 : memref<4x64x64xi8>, 0, 4096) + aie.use_lock(%memtile_lock_2, Release, 1) + }] + %6 = aie.dma(S2MM, 2) [{ + aie.use_lock(%memtile_lock_3, AcquireGreaterEqual, 1) + aie.dma_bd(%buff_1 : memref<4x64x64xi8>, 4096, 4096) + aie.use_lock(%memtile_lock_2, Release, 1) + }] + %7 = aie.dma(S2MM, 3) [{ + 
aie.use_lock(%memtile_lock_3, AcquireGreaterEqual, 1) + aie.dma_bd(%buff_1 : memref<4x64x64xi8>, 8192, 4096) + aie.use_lock(%memtile_lock_2, Release, 1) + }] + %8 = aie.dma(S2MM, 4) [{ + aie.use_lock(%memtile_lock_3, AcquireGreaterEqual, 1) + aie.dma_bd(%buff_1 : memref<4x64x64xi8>, 12288, 4096) + aie.use_lock(%memtile_lock_2, Release, 1) + }] + %9 = aie.dma(MM2S, 4) [{ + aie.use_lock(%memtile_lock_2, AcquireGreaterEqual, 4) + aie.dma_bd(%buff_1 : memref<4x64x64xi8>, 0, 16384) + aie.use_lock(%memtile_lock_3, Release, 4) + }] + aie.end + } + + aie.shim_dma_allocation @objFifo_in0(MM2S, 0, 0) + aie.shim_dma_allocation @objFifo_out0(S2MM, 0, 0) + + aiex.runtime_sequence @run(%arg0: memref<4x64x64xi8>, %arg1: memref<32xi8>, %arg2: memref<4x64x64xi8>) { + %c0_i64 = arith.constant 0 : i64 + %c1_i64 = arith.constant 1 : i64 + %c4_i64 = arith.constant 4 : i64 + %c4096_i64 = arith.constant 4096 : i64 + %c64_i64 = arith.constant 64 : i64 + aiex.npu.dma_memcpy_nd (0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c4_i64, %c64_i64, %c64_i64][%c0_i64, %c4096_i64, %c64_i64, %c1_i64], packet = <pkt_type = 0, pkt_id = 0>) {id = 0 : i64, metadata = @objFifo_in0} : memref<4x64x64xi8> + aiex.npu.dma_memcpy_nd (0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c4_i64, %c64_i64, %c64_i64][%c0_i64, %c4096_i64, %c64_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<4x64x64xi8> + aiex.npu.dma_wait { symbol = @objFifo_out0 } + } + } +} diff --git a/test/npu-xrt/ctrl_packet_reconfig_1x4_cores/run.lit b/test/npu-xrt/ctrl_packet_reconfig_1x4_cores/run.lit new file mode 100644 index 0000000000..7e43d4efbb --- /dev/null +++ b/test/npu-xrt/ctrl_packet_reconfig_1x4_cores/run.lit @@ -0,0 +1,19 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai +// +// RUN: aie-opt -aie-generate-column-control-overlay="route-shim-to-tile-ctrl=true" %S/aie1.mlir -o aie1_overlay.mlir +// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --xclbin-name=aie1.xclbin aie1_overlay.mlir +// +// RUN: aie-opt -aie-generate-column-control-overlay="route-shim-to-tile-ctrl=true" %S/aie2.mlir -o aie2_overlay.mlir +// RUN: %python aiecc.py --no-aiesim --aie-generate-ctrlpkt --aie-generate-npu --no-compile-host --npu-insts-name=aie2_run_seq.txt aie2_overlay.mlir +// +// RUN: aie-translate -aie-ctrlpkt-to-bin -aie-sequence-name=configure aie2_overlay.mlir.prj/ctrlpkt.mlir -o ctrlpkt.txt +// +// RUN: aie-opt -aie-ctrl-packet-to-dma -aie-dma-to-npu aie2_overlay.mlir.prj/ctrlpkt.mlir -o ctrlpkt_dma_seq.mlir +// RUN: aie-translate -aie-npu-instgen -aie-sequence-name=configure ctrlpkt_dma_seq.mlir -o ctrlpkt_dma_seq.txt +// +// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem +// RUN: %run_on_npu ./test.exe | FileCheck %s +// CHECK: PASS! diff --git a/test/npu-xrt/ctrl_packet_reconfig_1x4_cores/test.cpp b/test/npu-xrt/ctrl_packet_reconfig_1x4_cores/test.cpp new file mode 100644 index 0000000000..4bbb4693b9 --- /dev/null +++ b/test/npu-xrt/ctrl_packet_reconfig_1x4_cores/test.cpp @@ -0,0 +1,174 @@ +//===- test.cpp -------------------------------------------000---*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. 
+// +//===----------------------------------------------------------------------===// + +#include <algorithm> +#include <cstdint> +#include <cstring> +#include <fstream> +#include <iostream> +#include <sstream> +#include <vector> + +#include "experimental/xrt_kernel.h" +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +constexpr int IN_SIZE = 4 * 64 * 64; +constexpr int OUT_SIZE = 4 * 64 * 64; + +#define IN_DATATYPE int8_t +#define OUT_DATATYPE int8_t + +std::vector<uint32_t> load_instr_sequence(std::string instr_path) { + std::ifstream instr_file(instr_path); + std::string line; + std::vector<uint32_t> instr_v; + while (std::getline(instr_file, line)) { + std::istringstream iss(line); + uint32_t a; + if (!(iss >> std::hex >> a)) { + throw std::runtime_error("Unable to parse instruction file\n"); + } + instr_v.push_back(a); + } + return instr_v; +} + +int main(int argc, const char *argv[]) { + std::vector<uint32_t> instr_v = load_instr_sequence("aie2_run_seq.txt"); + std::vector<uint32_t> ctrlpkt_instr_v = + load_instr_sequence("ctrlpkt_dma_seq.txt"); + std::vector<uint32_t> ctrlPackets = load_instr_sequence("ctrlpkt.txt"); + + // Start the XRT test code + // Get a device handle + unsigned int device_index = 0; + auto device = xrt::device(device_index); + + // Load the xclbin + auto xclbin = xrt::xclbin("aie1.xclbin"); + + std::string Node = "MLIR_AIE"; + + // Get the kernel from the xclbin + auto xkernels = xclbin.get_kernels(); + auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), + [Node](xrt::xclbin::kernel &k) { + auto name = k.get_name(); + std::cout << "Name: " << name << std::endl; + return name.rfind(Node, 0) == 0; + }); + auto kernelName = xkernel.get_name(); + + device.register_xclbin(xclbin); + + // get a hardware context + xrt::hw_context context(device, xclbin.get_uuid()); + + // get a kernel handle + auto kernel = xrt::kernel(context, kernelName); + + auto bo_ctrlpkt_instr = xrt::bo(device, ctrlpkt_instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + auto bo_ctrlpkt = xrt::bo(device, 
ctrlPackets.size() * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(IN_DATATYPE), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(IN_DATATYPE), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(OUT_DATATYPE), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); + + IN_DATATYPE *bufInA = bo_inA.map<IN_DATATYPE *>(); + std::vector<IN_DATATYPE> srcVecA; + for (int i = 0; i < IN_SIZE; i++) + srcVecA.push_back(1); + memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(IN_DATATYPE))); + + void *bufInstr = bo_instr.map(); + memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + + void *bufCtrlpktInstr = bo_ctrlpkt_instr.map(); + memcpy(bufCtrlpktInstr, ctrlpkt_instr_v.data(), + ctrlpkt_instr_v.size() * sizeof(int)); + + void *bufctrlpkt = bo_ctrlpkt.map(); + memcpy(bufctrlpkt, ctrlPackets.data(), ctrlPackets.size() * sizeof(int)); + + bo_ctrlpkt_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_ctrlpkt.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + unsigned int opcode = 3; + + // Creating a runlist to contain two separate runs + xrt::runlist runlist = xrt::runlist(context); + + // Run 0: configuration + auto run0 = xrt::run(kernel); + run0.set_arg(0, opcode); + run0.set_arg(1, bo_ctrlpkt_instr); + run0.set_arg(2, ctrlpkt_instr_v.size()); + run0.set_arg(3, bo_ctrlpkt); + run0.set_arg(4, 0); + run0.set_arg(5, 0); + run0.set_arg(6, 0); + run0.set_arg(7, 0); + // Run 1: the design + auto run1 = xrt::run(kernel); + run1.set_arg(0, opcode); + run1.set_arg(1, bo_instr); + run1.set_arg(2, instr_v.size()); + run1.set_arg(3, bo_inA); + run1.set_arg(4, bo_inB); + run1.set_arg(5, bo_out); + run1.set_arg(6, 0); + run1.set_arg(7, 0); + + // Executing and waiting on the 
runlist + runlist.add(run0); + runlist.add(run1); + runlist.execute(); + runlist.wait(); + + bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + OUT_DATATYPE *bufOut = bo_out.map<OUT_DATATYPE *>(); + + int errors = 0; + + for (uint32_t core = 0; core < 4; core++) { + for (uint32_t i = 0; i < 64; i++) { + for (uint32_t j = 0; j < 64; j++) { + uint32_t ref = 1 + 12; + if (*(bufOut + core * 4096 + i * 64 + j) != ref) { + std::cout << "Error at i=" << i << " j=" << j << " core=" << core + << " output: " + << std::to_string(bufOut[core * 4096 + i * 64 + j]) + << " != " << ref << std::endl; + errors++; + } + } + } + } + + if (!errors) { + std::cout << "\nPASS!\n\n"; + return 0; + } + + std::cout << "\nfailed.\n\n"; + std::cout << "failed count: " << errors << std::endl; + return 1; +}