diff --git a/test/npu-xrt/add_one_ctrl_packet/aie.mlir b/test/npu-xrt/add_one_ctrl_packet/aie.mlir
new file mode 100644
index 0000000000..2c5b5771af
--- /dev/null
+++ b/test/npu-xrt/add_one_ctrl_packet/aie.mlir
@@ -0,0 +1,126 @@
+//===- aie.mlir ------------------------------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+module { // Test design: a core whose progress is gated by two locks that nothing in this file releases.
+  aie.device(npu1_1col) {
+    memref.global "public" @out0 : memref<8xi32> // symbol used by the S2MM shim_dma_allocation below
+    memref.global "public" @ctrl0 : memref<8xi32> // symbol used by the MM2S shim_dma_allocation below
+
+    %tile_0_0 = aie.tile(0, 0) // source of the control packet flow, destination of the output flow
+    %tile_0_2 = aie.tile(0, 2) // owns the locks, buffers, core and mem block
+    // Locks 0/2 are only ever acquired in this file (test.cpp writes them via control packets); locks 4/5 hand output_buffer between core and DMA.
+    %input_lock0 = aie.lock(%tile_0_2, 0) {init = 0 : i32, sym_name = "input_lock0"}
+    %input_lock2 = aie.lock(%tile_0_2, 2) {init = 0 : i32, sym_name = "input_lock2"}
+    %output_lock4 = aie.lock(%tile_0_2, 4) {init = 0 : i32, sym_name = "output_lock4"}
+    %output_lock5 = aie.lock(%tile_0_2, 5) {init = 1 : i32, sym_name = "output_lock5"}
+    
+    %input_buffer = aie.buffer(%tile_0_2) {sym_name = "input_buffer"} : memref<8xi32>
+    %output_buffer = aie.buffer(%tile_0_2) {sym_name = "output_buffer"} : memref<8xi32>
+    // Packet flow (ID 0x1) from tile (0, 0) DMA channel 0 into the Ctrl port of tile (0, 2).
+    aie.packet_flow(0x1) {
+      aie.packet_source<%tile_0_0, DMA : 0>
+      aie.packet_dest<%tile_0_2, Ctrl : 0>
+    }
+    // Non-packet flow returning data from tile (0, 2) DMA channel 0 to tile (0, 0) DMA channel 0.
+    aie.flow(%tile_0_2, DMA : 0, %tile_0_0, DMA : 0)
+    // Core: seeds input_buffer with 3, adds 1 after each of four lock acquires, then publishes the result.
+    %core_0_2 = aie.core(%tile_0_2) {
+      %c0 = arith.constant 0 : index
+      %c1_i32 = arith.constant 1 : i32
+      %c3_i32 = arith.constant 3 : i32
+      %c1 = arith.constant 1 : index
+      %c8 = arith.constant 8 : index
+      %c4294967295 = arith.constant 4294967295 : index // 2^32 - 1: outer loop bound, effectively "forever"
+      scf.for %arg0 = %c0 to %c4294967295 step %c1 {
+        // seed every element of input_buffer with 3
+        scf.for %arg1 = %c0 to %c8 step %c1 {
+          memref.store %c3_i32, %input_buffer[%arg1] : memref<8xi32>
+        }
+        aie.use_lock(%input_lock0, AcquireGreaterEqual, 1) // first of two acquires on lock 0 (presumably blocks until its value is >= 1)
+        scf.for %arg1 = %c0 to %c8 step %c1 {
+          // after this pass every element is 4
+          %1 = memref.load %input_buffer[%arg1] : memref<8xi32>
+          %2 = arith.addi %1, %c1_i32 : i32
+          memref.store %2, %input_buffer[%arg1] : memref<8xi32>
+        }
+        aie.use_lock(%input_lock0, AcquireGreaterEqual, 1) // second acquire on lock 0
+        scf.for %arg1 = %c0 to %c8 step %c1 {
+          // after this pass every element is 5
+          %1 = memref.load %input_buffer[%arg1] : memref<8xi32>
+          %2 = arith.addi %1, %c1_i32 : i32
+          memref.store %2, %input_buffer[%arg1] : memref<8xi32>
+        }
+        aie.use_lock(%input_lock2, AcquireGreaterEqual, 1) // first of two acquires on lock 2
+        scf.for %arg1 = %c0 to %c8 step %c1 {
+          // after this pass every element is 6
+          %1 = memref.load %input_buffer[%arg1] : memref<8xi32>
+          %2 = arith.addi %1, %c1_i32 : i32
+          memref.store %2, %input_buffer[%arg1] : memref<8xi32>
+        }
+        aie.use_lock(%input_lock2, AcquireGreaterEqual, 1) // second acquire on lock 2
+        scf.for %arg1 = %c0 to %c8 step %c1 {
+          // after this pass every element is 7, the value test.cpp checks for
+          %1 = memref.load %input_buffer[%arg1] : memref<8xi32>
+          %2 = arith.addi %1, %c1_i32 : i32
+          memref.store %2, %input_buffer[%arg1] : memref<8xi32>
+        }
+        // acquire output_lock5 (init 1, re-released by the DMA BD), copy the result out, then release output_lock4 for the DMA
+        aie.use_lock(%output_lock5, AcquireGreaterEqual, 1)
+        scf.for %arg1 = %c0 to %c8 step %c1 {
+            %1 = memref.load %input_buffer[%arg1] : memref<8xi32>
+            memref.store %1, %output_buffer[%arg1] : memref<8xi32>
+        }
+        aie.use_lock(%output_lock4, Release, 1)
+      }
+      aie.end
+    }
+    // Tile DMA: MM2S channel 0 loops a single BD that sends output_buffer once output_lock4 can be acquired.
+    %mem_0_2 = aie.mem(%tile_0_2) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb2)
+    ^bb1:  // 2 preds: ^bb0, ^bb1 (the BD chains back to itself via next_bd)
+      aie.use_lock(%output_lock4, AcquireGreaterEqual, 1)
+      aie.dma_bd(%output_buffer : memref<8xi32>, 0, 8)
+      aie.use_lock(%output_lock5, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:
+      aie.end
+    }
+    // Bind the two public symbols to DMA channels (operands presumably direction, channel, column — confirm).
+    aie.shim_dma_allocation @ctrl0(MM2S, 0, 0)
+    aie.shim_dma_allocation @out0(S2MM, 0, 0)
+    // Eight raw words block-written to 0x1d000 by the sequence below (called bd0 there); the word at 0x1d004 is then patched.
+    memref.global "private" constant @blockwrite_data_0 : memref<8xi32> = dense<[2, 0, 0x40090000, 0, 0x40000000, 0, 0, 0x2000000]>
+    aiex.runtime_sequence @seq(%arg0: memref<8xi32>, %arg1: memref<8xi32>, %arg2: memref<8xi32>) {
+      %c0_i64 = arith.constant 0 : i64
+      %c1_i64 = arith.constant 1 : i64
+      %c8_i64 = arith.constant 8 : i64
+      // %arg0..%arg2 presumably correspond to test.cpp's bo_inA, bo_inB and bo_out in kernel-call order — confirm.
+      // set Ctrl_Pkt_Tlast_Error_Enable=0 in Module_Clock_Control register (currently disabled)
+      // aiex.npu.maskwrite32 {address = 0x00060000 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32, mask = 0x8 : ui32}
+
+      // write bd0: copy the 8 words of @blockwrite_data_0 to address 0x1d000 of tile (0, 0)
+      %0 = memref.get_global @blockwrite_data_0 : memref<8xi32>
+      aiex.npu.blockwrite(%0) {address = 0x1d000 : ui32, column = 0 : i32, row = 0 : i32} : memref<8xi32>
+
+      // patch bd0 address from argument index 1 at offset 0, push to mm2s_0_task_queue, wait
+      aiex.npu.address_patch {addr = 0x1d004 : ui32, arg_idx = 1 : i32, arg_plus = 0 : i32}
+      aiex.npu.write32 {address = 0x1d214 : ui32, column = 0 : i32, row = 0 : i32, value = 0x80000000 : ui32}
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+
+      // patch bd0 address again with arg_plus = 8 (presumably a byte offset to the second header/data pair — confirm), push to mm2s_0_task_queue, wait
+      aiex.npu.address_patch {addr = 0x1d004 : ui32, arg_idx = 1 : i32, arg_plus = 8 : i32}
+      aiex.npu.write32 {address = 0x1d214 : ui32, column = 0 : i32, row = 0 : i32, value = 0x80000000 : ui32}
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32}
+      // receive 8 x i32 into %arg2 through the @out0 allocation, then wait for that transfer
+      aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c8_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 2 : i64, issue_token = true, metadata = @out0} : memref<8xi32>
+      aiex.npu.dma_wait {symbol = @out0}
+    }
+  }
+}
diff --git a/test/npu-xrt/add_one_ctrl_packet/run.lit b/test/npu-xrt/add_one_ctrl_packet/run.lit
new file mode 100644
index 0000000000..f08641410e
--- /dev/null
+++ b/test/npu-xrt/add_one_ctrl_packet/run.lit
@@ -0,0 +1,10 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai
+//
+// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --basic-alloc-scheme --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir
+// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// CHECK: PASS!
+
diff --git a/test/npu-xrt/add_one_ctrl_packet/test.cpp b/test/npu-xrt/add_one_ctrl_packet/test.cpp
new file mode 100644
index 0000000000..3a2b012ecc
--- /dev/null
+++ b/test/npu-xrt/add_one_ctrl_packet/test.cpp
@@ -0,0 +1,229 @@
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <boost/program_options.hpp>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+constexpr int IN_SIZE = 64;
+constexpr int OUT_SIZE = 64;
+
+namespace po = boost::program_options;
+
+// Throws std::runtime_error unless option `name` was supplied in `vm_in` and
+// the file it names can be opened for reading.
+void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
+  if (vm_in.count(name) == 0)
+    throw std::runtime_error("Error: no " + name + " file was provided\n");
+
+  const std::string path = vm_in[name].as<std::string>();
+  std::ifstream probe(path);
+  if (!probe)
+    throw std::runtime_error("The " + name + " file " + path +
+                             " does not exist.\n");
+}
+
+// Parses one hex uint32 per line of instr_path; throws std::runtime_error on open or parse failure.
+std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
+  std::ifstream instr_file(instr_path);
+  if (!instr_file) // an unreadable path used to yield an empty sequence silently
+    throw std::runtime_error("Unable to open instruction file\n");
+  std::vector<uint32_t> instr_v;
+  for (std::string line; std::getline(instr_file, line);) {
+    uint32_t word;
+    if (!(std::istringstream(line) >> std::hex >> word))
+      throw std::runtime_error("Unable to parse instruction file\n");
+    instr_v.push_back(word);
+  }
+  return instr_v;
+}
+
+// Host driver: loads the xclbin and instruction stream, sends two control
+// packets that set Lock0_value and Lock2_value to 2, runs the kernel and
+// checks that the first 8 output words equal 7. Returns 0 on PASS, else 1.
+int main(int argc, const char *argv[]) {
+
+  // Program arguments parsing
+  po::options_description desc("Allowed options");
+  desc.add_options()("help,h", "produce help message")(
+      "xclbin,x", po::value<std::string>()->required(),
+      "the input xclbin path")(
+      "kernel,k", po::value<std::string>()->required(),
+      "the kernel name in the XCLBIN (for instance PP_PRE_FD)")(
+      "verbosity,v", po::value<int>()->default_value(0),
+      "the verbosity of the output")(
+      "instr,i", po::value<std::string>()->required(),
+      "path of file containing userspace instructions to be sent to the LX6");
+  po::variables_map vm;
+
+  try {
+    po::store(po::parse_command_line(argc, argv, desc), vm);
+    // Handle --help before notify(): notify() throws when a required option is
+    // missing, so a bare --help used to print an error instead of the help.
+    if (vm.count("help")) {
+      std::cout << desc << "\n";
+      return 1;
+    }
+    po::notify(vm);
+  } catch (const std::exception &ex) {
+    std::cerr << ex.what() << "\n\n";
+    std::cerr << "Usage:\n" << desc << "\n";
+    return 1;
+  }
+
+  check_arg_file_exists(vm, "xclbin");
+  check_arg_file_exists(vm, "instr");
+
+  std::vector<uint32_t> instr_v =
+      load_instr_sequence(vm["instr"].as<std::string>());
+
+  int verbosity = vm["verbosity"].as<int>();
+  if (verbosity >= 1)
+    std::cout << "Sequence instr count: " << instr_v.size() << "\n";
+
+  // Start the XRT test code
+  // Get a device handle
+  unsigned int device_index = 0;
+  auto device = xrt::device(device_index);
+
+  // Load the xclbin
+  if (verbosity >= 1)
+    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
+  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
+
+  if (verbosity >= 1)
+    std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << "\n";
+  std::string Node = vm["kernel"].as<std::string>();
+
+  // Get the first kernel whose name starts with the requested prefix
+  auto xkernels = xclbin.get_kernels();
+  auto kernelIt = std::find_if(xkernels.begin(), xkernels.end(),
+                               [Node](xrt::xclbin::kernel &k) {
+                                 auto name = k.get_name();
+                                 std::cout << "Name: " << name << std::endl;
+                                 return name.rfind(Node, 0) == 0;
+                               });
+  // Dereferencing end() is undefined behaviour, so fail cleanly on no match.
+  if (kernelIt == xkernels.end()) {
+    std::cerr << "No kernel named " << Node << "* found in the xclbin\n";
+    return 1;
+  }
+  auto kernelName = kernelIt->get_name();
+
+  if (verbosity >= 1)
+    std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
+              << "\n";
+
+  device.register_xclbin(xclbin);
+
+  // get a hardware context
+  if (verbosity >= 1)
+    std::cout << "Getting hardware context.\n";
+  xrt::hw_context context(device, xclbin.get_uuid());
+
+  // get a kernel handle
+  if (verbosity >= 1)
+    std::cout << "Getting handle to kernel:" << kernelName << "\n";
+  auto kernel = xrt::kernel(context, kernelName);
+
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(uint32_t),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t),
+                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t),
+                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+  auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t),
+                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5));
+
+  if (verbosity >= 1)
+    std::cout << "Writing data into buffer objects.\n";
+
+  uint32_t *bufInA = bo_inA.map<uint32_t *>();
+  std::vector<uint32_t> srcVecA;
+  for (int i = 0; i < IN_SIZE; i++)
+    srcVecA.push_back(i + 1);
+  memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(uint32_t)));
+
+  // Control packet header word, packed by make_header below as
+  // parity << 31 | stream_id << 24 | operation << 22 | beats << 20 | address.
+  uint32_t beats = 1 - 1;
+  uint32_t operation = 0;
+  uint32_t stream_id = 0;
+  // 1 when n has an even number of set bits. Returns uint32_t rather than bool
+  // so the shift to bit 31 is done unsigned, not on an int promoted from bool.
+  auto parity = [](uint32_t n) -> uint32_t {
+    uint32_t p = 0;
+    while (n) {
+      p += n & 1;
+      n >>= 1;
+    }
+    return (p % 2) == 0 ? 1u : 0u;
+  };
+  auto make_header = [&](uint32_t address) -> uint32_t {
+    uint32_t header = stream_id << 24 | operation << 22 | beats << 20 | address;
+    return header | (parity(header) << 31);
+  };
+
+  uint32_t header0 = make_header(0x0001F000);        // Lock0_value
+  uint32_t header1 = make_header(0x0001F000 + 0x20); // Lock2_value
+
+  // set lock values to 2
+  uint32_t data = 2;
+  std::vector<uint32_t> srcVecB = {header0, data, header1, data};
+  void *bufInB = bo_inB.map<void *>();
+  memcpy(bufInB, srcVecB.data(), srcVecB.size() * sizeof(uint32_t));
+
+  void *bufInstr = bo_instr.map<void *>();
+  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(uint32_t));
+
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inB.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  if (verbosity >= 1)
+    std::cout << "Running Kernel.\n";
+  unsigned int opcode = 3;
+  auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out);
+
+  ert_cmd_state r = run.wait();
+  if (r != ERT_CMD_STATE_COMPLETED) {
+    std::cout << "Kernel did not complete. Returned status: " << r << "\n";
+    return 1;
+  }
+
+  bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+  uint32_t *bufOut = bo_out.map<uint32_t *>();
+
+  int errors = 0;
+  // aie.mlir seeds each word with 3 and adds 1 four times, hence 7.
+  for (uint32_t i = 0; i < 8; i++) {
+    uint32_t ref = 7;
+    if (*(bufOut + i) != ref) {
+      std::cout << "Error in output " << *(bufOut + i) << " != " << ref
+                << std::endl;
+      errors++;
+    } else {
+      std::cout << "Correct output " << *(bufOut + i) << " == " << ref
+                << std::endl;
+    }
+  }
+
+  std::cout << (errors ? "\nfailed.\n\n" : "\nPASS!\n\n");
+  return errors ? 1 : 0;
+}