From cabe1c213b737a7532a978acbd4cfdc8479a52ba Mon Sep 17 00:00:00 2001 From: Arun George Date: Tue, 7 Nov 2023 02:04:53 +0530 Subject: [PATCH] Adds the primary support for Flexible Data Placement(FDP) over NVMe into Cachelib Summary: This commit adds the device layer support for NVMe-FDP semantics and adds the RUH-awareness feature of NVMe-FDP in the upper layers of Navy. This allows the BlockCache(large items) and BigHash(small items) of Navy to segregate their data streams in physical NAND media by using the FDP placement Identifiers. With these changes, CacheLib can reduce the Device Write Amplification (WAF) significantly even in high SSD utilization scenarios ("nvmCacheSizeMB" above 50% of the SSD capacity) in most of the cachelib workloads. This commit introduces a 'placementHandle' concept for data placement, which can be used by both BC and BH of Navy on device write() calls, especially for FDP placements. The 'placementHandle' has to be allocated from the device. The io_uring_cmd interface (through the NVMe char device) is used to send FDP directives to the Linux kernel, as sending them through the conventional block interfaces is not supported yet. The user can select the NVMe block device (Namespace/partition) as usual (Ex: "nvmCachePaths": ["/dev/nvme0n1p1"]), and CacheLib will pick the corresponding NVMe char device internally. This commit adds a new config 'deviceEnableFDP' to enable FDP. The user needs to select this along with the io_uring I/O Engine options. ("navyEnableIoUring": true, "navyQDepth": 1, "deviceEnableFDP": true). 
Signed-off-by: Arun George Signed-off-by: Vikash Kumar --- cachelib/allocator/nvmcache/NavyConfig.cpp | 1 + cachelib/allocator/nvmcache/NavyConfig.h | 7 + cachelib/allocator/nvmcache/NavySetup.cpp | 1 + .../nvmcache/tests/NavyConfigTest.cpp | 1 + cachelib/cachebench/cache/Cache-inl.h | 1 + cachelib/cachebench/util/CacheConfig.cpp | 3 +- cachelib/cachebench/util/CacheConfig.h | 3 + cachelib/navy/CMakeLists.txt | 1 + cachelib/navy/Factory.cpp | 3 + cachelib/navy/Factory.h | 2 + cachelib/navy/bighash/BigHash.cpp | 6 +- cachelib/navy/bighash/BigHash.h | 2 + cachelib/navy/bighash/tests/BigHashTest.cpp | 18 +- cachelib/navy/block_cache/RegionManager.cpp | 7 +- cachelib/navy/block_cache/RegionManager.h | 1 + .../navy/block_cache/tests/BlockCacheTest.cpp | 25 +- .../block_cache/tests/RegionManagerTest.cpp | 4 +- cachelib/navy/common/Device.cpp | 300 ++++++++++++---- cachelib/navy/common/Device.h | 42 ++- cachelib/navy/common/FdpNvme.cpp | 322 ++++++++++++++++++ cachelib/navy/common/FdpNvme.h | 216 ++++++++++++ cachelib/navy/common/tests/DeviceTest.cpp | 8 +- cachelib/navy/testing/MockDevice.cpp | 10 +- cachelib/navy/testing/MockDevice.h | 8 +- 24 files changed, 881 insertions(+), 111 deletions(-) create mode 100644 cachelib/navy/common/FdpNvme.cpp create mode 100644 cachelib/navy/common/FdpNvme.h diff --git a/cachelib/allocator/nvmcache/NavyConfig.cpp b/cachelib/allocator/nvmcache/NavyConfig.cpp index 00cc8c736..a0dd094bc 100644 --- a/cachelib/allocator/nvmcache/NavyConfig.cpp +++ b/cachelib/allocator/nvmcache/NavyConfig.cpp @@ -278,6 +278,7 @@ std::map NavyConfig::serialize() const { folly::to(deviceMaxWriteSize_); configMap["navyConfig::ioEngine"] = getIoEngineName(ioEngine_).str(); configMap["navyConfig::QDepth"] = folly::to(qDepth_); + configMap["navyConfig::enableFDP"] = folly::to(enableFDP_); // Job scheduler settings configMap["navyConfig::readerThreads"] = diff --git a/cachelib/allocator/nvmcache/NavyConfig.h b/cachelib/allocator/nvmcache/NavyConfig.h index 
7c9dcdb94..abd60f923 100644 --- a/cachelib/allocator/nvmcache/NavyConfig.h +++ b/cachelib/allocator/nvmcache/NavyConfig.h @@ -485,6 +485,8 @@ class NavyConfig { bool isBigHashEnabled() const { return enginesConfigs_[0].bigHash().getSizePct() > 0; } + bool isFDPEnabled() const { return enableFDP_; } + std::map serialize() const; // Getters: @@ -549,6 +551,8 @@ class NavyConfig { // ============ Device settings ============= // Set the device block size, i.e., minimum unit of IO void setBlockSize(uint64_t blockSize) noexcept { blockSize_ = blockSize; } + // Set the NVMe FDP Device data placement mode in the Cachelib + void setEnableFDP(bool enable) noexcept { enableFDP_ = enable; } // Set the parameters for a simple file. // @throw std::invalid_argument if RAID files have been already set. void setSimpleFile(const std::string& fileName, @@ -694,6 +698,9 @@ class NavyConfig { // Whether to use write size (instead of parcel size) for Navy admission // policy. bool useEstimatedWriteSize_{false}; + // Whether Navy support the NVMe FDP data placement(TP4146) directives or not. + // Reference: https://nvmexpress.org/nvmeflexible-data-placement-fdp-blog/ + bool enableFDP_{false}; }; } // namespace navy } // namespace cachelib diff --git a/cachelib/allocator/nvmcache/NavySetup.cpp b/cachelib/allocator/nvmcache/NavySetup.cpp index c06eda0e6..6d989cb00 100644 --- a/cachelib/allocator/nvmcache/NavySetup.cpp +++ b/cachelib/allocator/nvmcache/NavySetup.cpp @@ -349,6 +349,7 @@ std::unique_ptr createDevice( maxDeviceWriteSize > 0 ? 
alignDown(maxDeviceWriteSize, blockSize) : 0, config.getIoEngine(), config.getQDepth(), + config.isFDPEnabled(), std::move(encryptor)); } else { return cachelib::navy::createMemoryDevice(config.getFileSize(), diff --git a/cachelib/allocator/nvmcache/tests/NavyConfigTest.cpp b/cachelib/allocator/nvmcache/tests/NavyConfigTest.cpp index 764d51646..0fabc91e7 100644 --- a/cachelib/allocator/nvmcache/tests/NavyConfigTest.cpp +++ b/cachelib/allocator/nvmcache/tests/NavyConfigTest.cpp @@ -189,6 +189,7 @@ TEST(NavyConfigTest, Serialization) { expectedConfigMap["navyConfig::deviceMaxWriteSize"] = "4194304"; expectedConfigMap["navyConfig::ioEngine"] = "io_uring"; expectedConfigMap["navyConfig::QDepth"] = "64"; + expectedConfigMap["navyConfig::enableFDP"] = "0"; expectedConfigMap["navyConfig::blockCacheLru"] = "false"; expectedConfigMap["navyConfig::blockCacheRegionSize"] = "16777216"; diff --git a/cachelib/cachebench/cache/Cache-inl.h b/cachelib/cachebench/cache/Cache-inl.h index 55d867725..4d9b1f88d 100644 --- a/cachelib/cachebench/cache/Cache-inl.h +++ b/cachelib/cachebench/cache/Cache-inl.h @@ -173,6 +173,7 @@ Cache::Cache(const CacheConfig& config, config_.navyReqOrderShardsPower); } nvmConfig.navyConfig.setBlockSize(config_.navyBlockSize); + nvmConfig.navyConfig.setEnableFDP(config_.deviceEnableFDP); // configure BlockCache auto& bcConfig = nvmConfig.navyConfig.blockCache() diff --git a/cachelib/cachebench/util/CacheConfig.cpp b/cachelib/cachebench/util/CacheConfig.cpp index a79186598..6d8f40874 100644 --- a/cachelib/cachebench/util/CacheConfig.cpp +++ b/cachelib/cachebench/util/CacheConfig.cpp @@ -88,6 +88,7 @@ CacheConfig::CacheConfig(const folly::dynamic& configJson) { JSONSetVal(configJson, truncateItemToOriginalAllocSizeInNvm); JSONSetVal(configJson, navyEncryption); JSONSetVal(configJson, deviceMaxWriteSize); + JSONSetVal(configJson, deviceEnableFDP); JSONSetVal(configJson, memoryOnlyTTL); @@ -111,7 +112,7 @@ CacheConfig::CacheConfig(const folly::dynamic& 
configJson) { // if you added new fields to the configuration, update the JSONSetVal // to make them available for the json configs and increment the size // below - checkCorrectSize(); + checkCorrectSize(); if (numPools != poolSizes.size()) { throw std::invalid_argument(folly::sformat( diff --git a/cachelib/cachebench/util/CacheConfig.h b/cachelib/cachebench/util/CacheConfig.h index 2a9d5f225..0a1569615 100644 --- a/cachelib/cachebench/util/CacheConfig.h +++ b/cachelib/cachebench/util/CacheConfig.h @@ -227,6 +227,9 @@ struct CacheConfig : public JSONConfig { // Navy will split it into multiple IOs. uint32_t deviceMaxWriteSize{1024 * 1024}; + // Enable the FDP Data placement mode in the device, if it is capable. + bool deviceEnableFDP{false}; + // Don't write to flash if cache TTL is smaller than this value. // Not used when its value is 0. In seconds. uint32_t memoryOnlyTTL{0}; diff --git a/cachelib/navy/CMakeLists.txt b/cachelib/navy/CMakeLists.txt index f53f3bad8..c23862e08 100644 --- a/cachelib/navy/CMakeLists.txt +++ b/cachelib/navy/CMakeLists.txt @@ -31,6 +31,7 @@ add_library (cachelib_navy block_cache/RegionManager.cpp common/Buffer.cpp common/Device.cpp + common/FdpNvme.cpp common/Hash.cpp common/SizeDistribution.cpp common/Types.cpp diff --git a/cachelib/navy/Factory.cpp b/cachelib/navy/Factory.cpp index 87cf3f712..c2735c85f 100644 --- a/cachelib/navy/Factory.cpp +++ b/cachelib/navy/Factory.cpp @@ -454,6 +454,7 @@ std::unique_ptr createFileDevice( uint32_t maxDeviceWriteSize, IoEngine ioEngine, uint32_t qDepth, + bool isFDPEnabled, std::shared_ptr encryptor) { // File paths are opened in the increasing order of the // path string. 
This ensures that RAID0 stripes aren't @@ -476,12 +477,14 @@ std::unique_ptr createFileDevice( } return createDirectIoFileDevice(std::move(fileVec), + std::move(filePaths), fdSize, blockSize, stripeSize, maxDeviceWriteSize, ioEngine, qDepth, + isFDPEnabled, std::move(encryptor)); } diff --git a/cachelib/navy/Factory.h b/cachelib/navy/Factory.h index 96bb168b2..9f0848a0a 100644 --- a/cachelib/navy/Factory.h +++ b/cachelib/navy/Factory.h @@ -209,6 +209,7 @@ std::unique_ptr createCache(std::unique_ptr proto); // @param maxDeviceWriteSize device maximum granularity of writes // @param ioEngine IoEngine to be used for IO // @param qDepth queue depth for async IO; 0 for sync IO +// @param isFDPEnabled whether FDP placement mode enabled or not // @param encryptor encryption object std::unique_ptr createFileDevice( std::vector filePaths, @@ -219,6 +220,7 @@ std::unique_ptr createFileDevice( uint32_t maxDeviceWriteSize, IoEngine ioEngine, uint32_t qDepth, + bool isFDPEnabled, std::shared_ptr encryptor); } // namespace navy diff --git a/cachelib/navy/bighash/BigHash.cpp b/cachelib/navy/bighash/BigHash.cpp index 4d3021d8f..1815a225d 100644 --- a/cachelib/navy/bighash/BigHash.cpp +++ b/cachelib/navy/bighash/BigHash.cpp @@ -88,7 +88,8 @@ BigHash::BigHash(Config&& config, ValidConfigTag) cacheBaseOffset_{config.cacheBaseOffset}, numBuckets_{config.numBuckets()}, bloomFilter_{std::move(config.bloomFilter)}, - device_{*config.device} { + device_{*config.device}, + placementHandle_{device_.allocatePlacementHandle()} { XLOGF(INFO, "BigHash created: buckets: {}, bucket size: {}, base offset: {}", numBuckets_, @@ -550,6 +551,7 @@ Buffer BigHash::readBucket(BucketId bid) { bool BigHash::writeBucket(BucketId bid, Buffer buffer) { auto* bucket = reinterpret_cast(buffer.data()); bucket->setChecksum(Bucket::computeChecksum(buffer.view())); - return device_.write(getBucketOffset(bid), std::move(buffer)); + return device_.write( + getBucketOffset(bid), std::move(buffer), placementHandle_); } 
} // namespace facebook::cachelib::navy diff --git a/cachelib/navy/bighash/BigHash.h b/cachelib/navy/bighash/BigHash.h index 3d468ca88..6c60e4013 100644 --- a/cachelib/navy/bighash/BigHash.h +++ b/cachelib/navy/bighash/BigHash.h @@ -212,6 +212,8 @@ class BigHash final : public Engine { std::unique_ptr bloomFilter_; std::chrono::nanoseconds generationTime_{}; Device& device_; + // handle for data placement technologies like FDP + int placementHandle_; std::unique_ptr mutex_{new SharedMutex[kNumMutexes]}; // Spinlocks for bloom filter operations // We use spinlock in addition to the mutex to avoid contentions of diff --git a/cachelib/navy/bighash/tests/BigHashTest.cpp b/cachelib/navy/bighash/tests/BigHashTest.cpp index 4868d58a8..258dfaba0 100644 --- a/cachelib/navy/bighash/tests/BigHashTest.cpp +++ b/cachelib/navy/bighash/tests/BigHashTest.cpp @@ -213,7 +213,7 @@ TEST(BigHash, DeviceErrorStats) { BigHash bh(std::move(config)); EXPECT_EQ(Status::Ok, bh.insert(makeHK("key1"), makeView("1"))); - EXPECT_CALL(*device, writeImpl(0, 64, _)).WillOnce(Return(false)); + EXPECT_CALL(*device, writeImpl(0, 64, _, _)).WillOnce(Return(false)); EXPECT_EQ(Status::DeviceError, bh.insert(makeHK("key2"), makeView("1"))); { MockCounterVisitor helper; @@ -351,12 +351,13 @@ TEST(BigHash, WriteInTwoBuckets) { config.cacheBaseOffset + config.cacheSize, 128); { InSequence inSeq; + EXPECT_CALL(*device, allocatePlacementHandle()); EXPECT_CALL(*device, readImpl(256, 128, _)); - EXPECT_CALL(*device, writeImpl(256, 128, _)); + EXPECT_CALL(*device, writeImpl(256, 128, _, _)); EXPECT_CALL(*device, readImpl(384, 128, _)); - EXPECT_CALL(*device, writeImpl(384, 128, _)); + EXPECT_CALL(*device, writeImpl(384, 128, _, _)); EXPECT_CALL(*device, readImpl(256, 128, _)); - EXPECT_CALL(*device, writeImpl(256, 128, _)); + EXPECT_CALL(*device, writeImpl(256, 128, _, _)); } config.device = device.get(); @@ -375,10 +376,11 @@ TEST(BigHash, RemoveNotFound) { auto device = std::make_unique>(config.cacheSize, 128); 
{ InSequence inSeq; + EXPECT_CALL(*device, allocatePlacementHandle()); EXPECT_CALL(*device, readImpl(0, 128, _)); - EXPECT_CALL(*device, writeImpl(0, 128, _)); + EXPECT_CALL(*device, writeImpl(0, 128, _, _)); EXPECT_CALL(*device, readImpl(0, 128, _)); - EXPECT_CALL(*device, writeImpl(0, 128, _)); + EXPECT_CALL(*device, writeImpl(0, 128, _, _)); EXPECT_CALL(*device, readImpl(0, 128, _)); } config.device = device.get(); @@ -541,6 +543,7 @@ TEST(BigHash, BloomFilterRecoveryFail) { BigHash::Config config; setLayout(config, 128, 2); auto device = std::make_unique>(config.cacheSize, 128); + EXPECT_CALL(*device, allocatePlacementHandle()); EXPECT_CALL(*device, readImpl(_, _, _)).Times(0); config.device = device.get(); config.bloomFilter = std::make_unique(2, 1, 4); @@ -635,8 +638,9 @@ TEST(BigHash, BloomFilterRecovery) { setLayout(config, 128, 2); auto device = std::make_unique>(config.cacheSize, 128); + EXPECT_CALL(*device, allocatePlacementHandle()); EXPECT_CALL(*device, readImpl(0, 128, _)); - EXPECT_CALL(*device, writeImpl(0, 128, _)); + EXPECT_CALL(*device, writeImpl(0, 128, _, _)); config.device = device.get(); config.bloomFilter = std::make_unique(2, 1, 4); diff --git a/cachelib/navy/block_cache/RegionManager.cpp b/cachelib/navy/block_cache/RegionManager.cpp index b14f669e4..8a246feb7 100644 --- a/cachelib/navy/block_cache/RegionManager.cpp +++ b/cachelib/navy/block_cache/RegionManager.cpp @@ -45,7 +45,8 @@ RegionManager::RegionManager(uint32_t numRegions, numCleanRegions_{numCleanRegions}, evictCb_{evictCb}, cleanupCb_{cleanupCb}, - numInMemBuffers_{numInMemBuffers} { + numInMemBuffers_{numInMemBuffers}, + placementHandle_{device_.allocatePlacementHandle()} { XLOGF(INFO, "{} regions, {} bytes each", numRegions_, regionSize_); for (uint32_t i = 0; i < numRegions; i++) { regions_[i] = std::make_unique(RegionId{i}, regionSize_); @@ -526,7 +527,7 @@ bool RegionManager::deviceWrite(RelAddress addr, Buffer buf) { const auto bufSize = buf.size(); 
XDCHECK(isValidIORange(addr.offset(), bufSize)); auto physOffset = physicalOffset(addr); - if (!device_.write(physOffset, std::move(buf))) { + if (!device_.write(physOffset, std::move(buf), placementHandle_)) { return false; } physicalWrittenCount_.add(bufSize); @@ -537,7 +538,7 @@ bool RegionManager::deviceWrite(RelAddress addr, BufferView view) { const auto bufSize = view.size(); XDCHECK(isValidIORange(addr.offset(), bufSize)); auto physOffset = physicalOffset(addr); - if (!device_.write(physOffset, view)) { + if (!device_.write(physOffset, view, placementHandle_)) { return false; } physicalWrittenCount_.add(bufSize); diff --git a/cachelib/navy/block_cache/RegionManager.h b/cachelib/navy/block_cache/RegionManager.h index 470cef06e..45cd8d875 100644 --- a/cachelib/navy/block_cache/RegionManager.h +++ b/cachelib/navy/block_cache/RegionManager.h @@ -339,6 +339,7 @@ class RegionManager { mutable TimedMutex bufferMutex_; mutable util::ConditionVariable bufferCond_; std::vector> buffers_; + int placementHandle_; }; } // namespace navy } // namespace cachelib diff --git a/cachelib/navy/block_cache/tests/BlockCacheTest.cpp b/cachelib/navy/block_cache/tests/BlockCacheTest.cpp index 2dab75b7c..6131e24a2 100644 --- a/cachelib/navy/block_cache/tests/BlockCacheTest.cpp +++ b/cachelib/navy/block_cache/tests/BlockCacheTest.cpp @@ -592,14 +592,14 @@ TEST(BlockCache, ReclaimCorruption) { auto driver = makeDriver(std::move(engine), std::move(ex)); // Allow any number of writes in between and after our expected writes - EXPECT_CALL(*device, writeImpl(_, _, _)).Times(testing::AtLeast(0)); + EXPECT_CALL(*device, writeImpl(_, _, _, _)).Times(testing::AtLeast(0)); // Note even tho this item's value is corrupted, we would have aborted // the reclaim before we got here. So we will not bump the value checksum // error stat on this. 
- EXPECT_CALL(*device, writeImpl(0, 16384, _)) + EXPECT_CALL(*device, writeImpl(0, 16384, _, _)) .WillOnce(testing::Invoke( - [&device](uint64_t offset, uint32_t size, const void* data) { + [&device](uint64_t offset, uint32_t size, const void* data, int) { // Note that all items are aligned to 512 bytes in in-mem buffer // stacked mode, and we write around 800 bytes, so each is aligned // to 1024 bytes @@ -703,7 +703,7 @@ TEST(BlockCache, RegionUnderflow) { std::vector hits(4); auto policy = std::make_unique>(&hits); auto device = std::make_unique>(kDeviceSize, 1024); - EXPECT_CALL(*device, writeImpl(0, 16 * 1024, _)); + EXPECT_CALL(*device, writeImpl(0, 16 * 1024, _, _)); // Although 2k read buffer, shouldn't underflow the region! EXPECT_CALL(*device, readImpl(0, 1024, _)); auto ex = makeJobScheduler(); @@ -730,7 +730,7 @@ TEST(BlockCache, SmallReadBuffer) { auto policy = std::make_unique>(&hits); auto device = std::make_unique>( kDeviceSize, 4096 /* io alignment size */); - EXPECT_CALL(*device, writeImpl(0, 16 * 1024, _)); + EXPECT_CALL(*device, writeImpl(0, 16 * 1024, _, _)); EXPECT_CALL(*device, readImpl(0, 8192, _)); auto ex = makeJobScheduler(); auto config = makeConfig(*ex, std::move(policy), *device); @@ -1057,10 +1057,11 @@ TEST(BlockCache, DeviceFailure) { auto device = std::make_unique>(kDeviceSize, 1024); { testing::InSequence seq; - EXPECT_CALL(*device, writeImpl(0, kRegionSize, _)).WillOnce(Return(false)); - EXPECT_CALL(*device, writeImpl(0, kRegionSize, _)); - EXPECT_CALL(*device, writeImpl(kRegionSize, kRegionSize, _)); - EXPECT_CALL(*device, writeImpl(kRegionSize * 2, kRegionSize, _)); + EXPECT_CALL(*device, writeImpl(0, kRegionSize, _, _)) + .WillOnce(Return(false)); + EXPECT_CALL(*device, writeImpl(0, kRegionSize, _, _)); + EXPECT_CALL(*device, writeImpl(kRegionSize, kRegionSize, _, _)); + EXPECT_CALL(*device, writeImpl(kRegionSize * 2, kRegionSize, _, _)); EXPECT_CALL(*device, readImpl(0, 1024, _)); EXPECT_CALL(*device, readImpl(kRegionSize, 
1024, _)) @@ -1116,7 +1117,7 @@ namespace { std::unique_ptr setupResetTestDevice(uint32_t size) { auto device = std::make_unique>(size, 512); for (uint32_t i = 0; i < 2; i++) { - EXPECT_CALL(*device, writeImpl(i * 16 * 1024, 16 * 1024, _)); + EXPECT_CALL(*device, writeImpl(i * 16 * 1024, 16 * 1024, _, _)); } return device; } @@ -2080,7 +2081,7 @@ TEST(BlockCache, DeviceFlushFailureSync) { auto device = std::make_unique(kDeviceSize, 1024); testing::InSequence inSeq; - EXPECT_CALL(*device, writeImpl(_, _, _)).WillRepeatedly(Return(false)); + EXPECT_CALL(*device, writeImpl(_, _, _, _)).WillRepeatedly(Return(false)); auto ex = makeJobScheduler(); auto config = makeConfig(*ex, std::move(policy), *device); @@ -2122,7 +2123,7 @@ TEST(BlockCache, DeviceFlushFailureAsync) { auto device = std::make_unique(kDeviceSize, 1024); testing::InSequence inSeq; - EXPECT_CALL(*device, writeImpl(_, _, _)).WillRepeatedly(Return(false)); + EXPECT_CALL(*device, writeImpl(_, _, _, _)).WillRepeatedly(Return(false)); auto ex = makeJobScheduler(); auto config = makeConfig(*ex, std::move(policy), *device); diff --git a/cachelib/navy/block_cache/tests/RegionManagerTest.cpp b/cachelib/navy/block_cache/tests/RegionManagerTest.cpp index a5319f82c..61141e937 100644 --- a/cachelib/navy/block_cache/tests/RegionManagerTest.cpp +++ b/cachelib/navy/block_cache/tests/RegionManagerTest.cpp @@ -396,7 +396,7 @@ TEST(RegionManager, cleanupRegionFailureSync) { std::thread flushThread{[&sp, &device, &rm, &rid] { // Make sure flush will fail - EXPECT_CALL(*device, writeImpl(_, _, _)).WillRepeatedly(Return(false)); + EXPECT_CALL(*device, writeImpl(_, _, _, _)).WillRepeatedly(Return(false)); sp.wait(0); // Flush after active reader rm->doFlush(rid, false /* async */); }}; @@ -505,7 +505,7 @@ TEST(RegionManager, cleanupRegionFailureAsync) { std::thread flushThread{[&sp, &device, &rm, &rid] { // Make sure flush will fail - EXPECT_CALL(*device, writeImpl(_, _, _)).WillRepeatedly(Return(false)); + EXPECT_CALL(*device, 
writeImpl(_, _, _, _)).WillRepeatedly(Return(false)); sp.wait(0); // Flush after active reader rm->doFlush(rid, true /* async */); }}; diff --git a/cachelib/navy/common/Device.cpp b/cachelib/navy/common/Device.cpp index 80554885c..6d2b4a56b 100644 --- a/cachelib/navy/common/Device.cpp +++ b/cachelib/navy/common/Device.cpp @@ -31,6 +31,7 @@ #include #include +#include "cachelib/navy/common/FdpNvme.h" #include "cachelib/navy/common/Utils.h" namespace facebook::cachelib::navy { @@ -57,13 +58,15 @@ struct IOOp { int fd, uint64_t offset, uint32_t size, - void* data) + void* data, + int handle) : parent_(parent), idx_(idx), fd_(fd), offset_(offset), size_(size), - data_(data) {} + data_(data), + handle_(handle) {} std::string toString() const; @@ -78,6 +81,7 @@ struct IOOp { const uint64_t offset_ = 0; const uint32_t size_ = 0; void* const data_; + int handle_; // The number of resubmission on EAGAIN error uint8_t resubmitted_ = 0; @@ -97,7 +101,8 @@ struct IOReq { OpType opType, uint64_t offset, uint32_t size, - void* data); + void* data, + int handle); const char* getOpName() const { switch (opType_) { @@ -129,6 +134,7 @@ struct IOReq { const uint64_t offset_ = 0; const uint32_t size_ = 0; void* const data_; + int handle_; // Aggregate result of operations bool result_ = true; @@ -167,7 +173,8 @@ class IoContext { uint32_t stripeSize, uint64_t offset, uint32_t size, - const void* data); + const void* data, + int handle); // Submit a IOOp to the device; should not fail for AsyncIoContext virtual bool submitIo(IOOp& op) = 0; @@ -217,7 +224,8 @@ class AsyncIoContext : public IoContext { size_t id, folly::EventBase* evb, size_t capacity, - bool useIoUring); + bool useIoUring, + std::vector> fdpNvmeVec); ~AsyncIoContext() override = default; @@ -234,6 +242,11 @@ class AsyncIoContext : public IoContext { private: void handleCompletion(folly::Range& completed); + std::unique_ptr prepAsyncIo(IOOp& op); + + // Prepare an Nvme CMD IO through IOUring + std::unique_ptr 
prepNvmeIo(IOOp& op); + // The maximum number of retries when IO failed with EBUSY. 5 is arbitrary static constexpr size_t kRetryLimit = 5; @@ -260,6 +273,11 @@ class AsyncIoContext : public IoContext { size_t numOutstanding_ = 0; size_t numSubmitted_ = 0; size_t numCompleted_ = 0; + + // Device info vector for FDP support + const std::vector> fdpNvmeVec_{}; + // As of now, only one FDP enabled Device is supported + static constexpr uint16_t kDefaultFdpIdx = 0u; }; // An FileDevice manages direct I/O to either a single or multiple (RAID0) @@ -267,9 +285,11 @@ class AsyncIoContext : public IoContext { class FileDevice : public Device { public: FileDevice(std::vector&& fvec, + std::vector>&& fdpNvmeVec, uint64_t size, uint32_t blockSize, uint32_t stripeSize, + uint32_t maxIOSize, uint32_t maxDeviceWriteSize, IoEngine ioEngine, uint32_t qDepthPerContext, @@ -281,15 +301,20 @@ class FileDevice : public Device { private: IoContext* getIoContext(); - bool writeImpl(uint64_t, uint32_t, const void*) override; + bool writeImpl(uint64_t, uint32_t, const void*, int) override; bool readImpl(uint64_t, uint32_t, void*) override; void flushImpl() override; + int allocatePlacementHandle() override; + // File vector for devices or regular files const std::vector fvec_{}; + // Device info vector for FDP support + const std::vector> fdpNvmeVec_{}; + // RAID stripe size when multiple devices are used const uint32_t stripeSize_; @@ -321,7 +346,7 @@ class MemoryDevice final : public Device { explicit MemoryDevice(uint64_t size, std::shared_ptr encryptor, uint32_t ioAlignSize) - : Device{size, std::move(encryptor), ioAlignSize, + : Device{size, std::move(encryptor), ioAlignSize, 0 /* max IO size */, 0 /* max device write size */}, buffer_{std::make_unique(size)} {} MemoryDevice(const MemoryDevice&) = delete; @@ -331,7 +356,8 @@ class MemoryDevice final : public Device { private: bool writeImpl(uint64_t offset, uint32_t size, - const void* value) noexcept override { + const void* value, + 
int /* unused */) noexcept override { XDCHECK_LE(offset + size, getSize()); std::memcpy(buffer_.get() + offset, value, size); return true; @@ -343,6 +369,8 @@ class MemoryDevice final : public Device { return true; } + int allocatePlacementHandle() override { return -1; } + void flushImpl() override { // Noop } @@ -351,20 +379,20 @@ class MemoryDevice final : public Device { }; } // namespace -bool Device::write(uint64_t offset, BufferView view) { +bool Device::write(uint64_t offset, BufferView view, int handle) { if (encryptor_) { auto writeBuffer = makeIOBuffer(view.size()); writeBuffer.copyFrom(0, view); - return write(offset, std::move(writeBuffer)); + return write(offset, std::move(writeBuffer), handle); } const auto size = view.size(); XDCHECK_LE(offset + size, size_); const uint8_t* data = reinterpret_cast(view.data()); - return writeInternal(offset, data, size); + return writeInternal(offset, data, size, handle); } -bool Device::write(uint64_t offset, Buffer buffer) { +bool Device::write(uint64_t offset, Buffer buffer, int handle) { const auto size = buffer.size(); XDCHECK_LE(offset + buffer.size(), size_); uint8_t* data = reinterpret_cast(buffer.data()); @@ -377,10 +405,13 @@ bool Device::write(uint64_t offset, Buffer buffer) { return false; } } - return writeInternal(offset, data, size); + return writeInternal(offset, data, size, handle); } -bool Device::writeInternal(uint64_t offset, const uint8_t* data, size_t size) { +bool Device::writeInternal(uint64_t offset, + const uint8_t* data, + size_t size, + int handle) { auto remainingSize = size; auto maxWriteSize = (maxWriteSize_ == 0) ? 
remainingSize : maxWriteSize_; bool result = true; @@ -390,7 +421,7 @@ bool Device::writeInternal(uint64_t offset, const uint8_t* data, size_t size) { XDCHECK_EQ(writeSize % ioAlignmentSize_, 0ul); auto timeBegin = getSteadyClock(); - result = writeImpl(offset, writeSize, data); + result = writeImpl(offset, writeSize, data, handle); writeLatencyEstimator_.trackValue( toMicros((getSteadyClock() - timeBegin)).count()); @@ -418,18 +449,31 @@ bool Device::writeInternal(uint64_t offset, const uint8_t* data, size_t size) { // returns true if successful, false otherwise. bool Device::readInternal(uint64_t offset, uint32_t size, void* value) { XDCHECK_EQ(reinterpret_cast(value) % ioAlignmentSize_, 0ul); - XDCHECK_EQ(offset % ioAlignmentSize_, 0ul); - XDCHECK_EQ(size % ioAlignmentSize_, 0ul); XDCHECK_LE(offset + size, size_); - auto timeBegin = getSteadyClock(); - bool result = readImpl(offset, size, value); - readLatencyEstimator_.trackValue( - toMicros(getSteadyClock() - timeBegin).count()); - if (!result) { - readIOErrors_.inc(); - return result; + uint8_t* data = reinterpret_cast(value); + auto remainingSize = size; + auto maxReadSize = (maxIOSize_ == 0) ? 
remainingSize : maxIOSize_; + bool result = true; + uint64_t curOffset = offset; + while (remainingSize > 0) { + auto readSize = std::min(maxReadSize, remainingSize); + XDCHECK_EQ(curOffset % ioAlignmentSize_, 0ul); + XDCHECK_EQ(size % ioAlignmentSize_, 0ul); + + auto timeBegin = getSteadyClock(); + result = readImpl(curOffset, readSize, data); + readLatencyEstimator_.trackValue( + toMicros(getSteadyClock() - timeBegin).count()); + + if (!result) { + readIOErrors_.inc(); + return false; + } + bytesRead_.add(readSize); + curOffset += readSize; + data += readSize; + remainingSize -= readSize; } - bytesRead_.add(size); if (encryptor_) { XCHECK_EQ(offset % encryptor_->encryptionBlockSize(), 0ul); auto res = encryptor_->decrypt( @@ -440,7 +484,7 @@ bool Device::readInternal(uint64_t offset, uint32_t size, void* value) { return false; } } - return true; + return result; } // This API reads size bytes from the Device from offset into a Buffer and @@ -504,15 +548,15 @@ std::string IOOp::toString() const { offset_, size_, data_, resubmitted_); } -bool IOOp::done(ssize_t status) { +bool IOOp::done(ssize_t size) { XDCHECK(parent_.opType_ == READ || parent_.opType_ == WRITE); - bool result = (status == size_); + bool result = (size == size_); if (!result) { // Report IO errors XLOG_N_PER_MS(ERR, 10, 1000) << folly::sformat( "[{}] IO error: {} ret={} errno={} ({})", parent_.context_.getName(), - toString(), status, errno, std::strerror(errno)); + toString(), size, errno, std::strerror(errno)); } // Check for timeout @@ -540,12 +584,14 @@ IOReq::IOReq(IoContext& context, OpType opType, uint64_t offset, uint32_t size, - void* data) + void* data, + int handle) : context_(context), opType_(opType), offset_(offset), size_(size), - data_(data) { + data_(data), + handle_(handle) { uint8_t* buf = reinterpret_cast(data_); uint32_t idx = 0; if (fvec.size() > 1) { @@ -559,14 +605,15 @@ IOReq::IOReq(IoContext& context, ops_.emplace_back(*this, idx++, fvec[fdIdx].fd(), stripeStartOffset + 
ioOffsetInStripe, allowedIOSize, - buf); + buf, handle_); size -= allowedIOSize; offset += allowedIOSize; buf += allowedIOSize; } } else { - ops_.emplace_back(*this, idx++, fvec[0].fd(), offset_, size_, data_); + ops_.emplace_back(*this, idx++, fvec[0].fd(), offset_, size_, data_, + handle_); } numRemaining_ = ops_.size(); @@ -632,7 +679,7 @@ std::shared_ptr IoContext::submitRead( uint32_t size, void* data) { auto req = std::make_shared(*this, fvec, stripeSize, OpType::READ, - offset, size, data); + offset, size, data, -1); submitReq(req); return req; } @@ -642,9 +689,11 @@ std::shared_ptr IoContext::submitWrite( uint32_t stripeSize, uint64_t offset, uint32_t size, - const void* data) { - auto req = std::make_shared(*this, fvec, stripeSize, OpType::WRITE, - offset, size, const_cast(data)); + const void* data, + int handle) { + auto req = + std::make_shared(*this, fvec, stripeSize, OpType::WRITE, offset, + size, const_cast(data), handle); submitReq(req); return req; } @@ -700,14 +749,16 @@ AsyncIoContext::AsyncIoContext(std::unique_ptr&& asyncBase, size_t id, folly::EventBase* evb, size_t capacity, - bool useIoUring) + bool useIoUring, + std::vector> fdpNvmeVec) : asyncBase_(std::move(asyncBase)), id_(id), qDepth_(capacity), - useIoUring_(useIoUring) { + useIoUring_(useIoUring), + fdpNvmeVec_(fdpNvmeVec) { #ifdef CACHELIB_IOURING_DISABLE // io_uring is not available on the system - XDCHECK(!useIoUring_); + XDCHECK(!useIoUring_ && !(fdpNvmeVec_.size() > 0)); useIoUring_ = false; #endif if (evb) { @@ -722,9 +773,10 @@ AsyncIoContext::AsyncIoContext(std::unique_ptr&& asyncBase, } XLOGF(INFO, - "[{}] Created new async io context with qdepth {}{} io_engine {} ", + "[{}] Created new async io context with qdepth {}{} io_engine {} {}", getName(), qDepth_, qDepth_ == 1 ? " (sync wait)" : "", - useIoUring_ ? "io_uring" : "libaio"); + useIoUring_ ? "io_uring" : "libaio", + (fdpNvmeVec_.size() > 0) ? 
"FDP enabled" : ""); } void AsyncIoContext::pollCompletion() { @@ -762,7 +814,12 @@ void AsyncIoContext::handleCompletion( aop->result(), iop->toString()); } - iop->done(aop->result()); + auto result = aop->result(); + if (fdpNvmeVec_.size() > 0) { + // 0 means success here, so get the completed size from iop + result = !result ? iop->size_ : 0; + } + iop->done(result); if (!waitList_.empty()) { auto& waiter = waitList_.front(); @@ -773,9 +830,8 @@ void AsyncIoContext::handleCompletion( } bool AsyncIoContext::submitIo(IOOp& op) { - IOReq& req = op.parent_; - op.startTime_ = getSteadyClock(); + while (numOutstanding_ >= qDepth_) { if (qDepth_ > 1) { XLOG_EVERY_MS(ERR, 10000) << fmt::format( @@ -788,6 +844,32 @@ bool AsyncIoContext::submitIo(IOOp& op) { } std::unique_ptr asyncOp; + asyncOp = prepAsyncIo(op); + asyncOp->setUserData(&op); + asyncBase_->submit(asyncOp.release()); + + op.submitTime_ = getSteadyClock(); + + numOutstanding_++; + numSubmitted_++; + + if (!compHandler_) { + // Wait completion synchronously if completion handler is not available. 
+ // i.e., when async io is used with non-epoll mode + auto completed = asyncBase_->wait(1); + handleCompletion(completed); + } + + return true; +} + +std::unique_ptr AsyncIoContext::prepAsyncIo(IOOp& op) { + if (fdpNvmeVec_.size() > 0) { + return prepNvmeIo(op); + } + + std::unique_ptr asyncOp; + IOReq& req = op.parent_; if (useIoUring_) { #ifndef CACHELIB_IOURING_DISABLE asyncOp = std::make_unique(); @@ -797,37 +879,49 @@ bool AsyncIoContext::submitIo(IOOp& op) { } if (req.opType_ == OpType::READ) { - asyncOp->setUserData(&op); asyncOp->pread(op.fd_, op.data_, op.size_, op.offset_); } else { XDCHECK_EQ(req.opType_, OpType::WRITE); - asyncOp->setUserData(&op); asyncOp->pwrite(op.fd_, op.data_, op.size_, op.offset_); } - asyncBase_->submit(asyncOp.release()); - op.submitTime_ = getSteadyClock(); + return asyncOp; +} - numOutstanding_++; - numSubmitted_++; +std::unique_ptr AsyncIoContext::prepNvmeIo(IOOp& op) { +#ifndef CACHELIB_IOURING_DISABLE + std::unique_ptr iouringCmdOp; + IOReq& req = op.parent_; - if (!compHandler_) { - // Wait completion synchronously if completion handler is not available. 
- // i.e., when async io is used with non-epoll mode - auto completed = asyncBase_->wait(1); - handleCompletion(completed); - } + auto& options = static_cast(asyncBase_.get())->getOptions(); + iouringCmdOp = std::make_unique( + folly::AsyncBaseOp::NotificationCallback(), options); - return true; + iouringCmdOp->initBase(); + struct io_uring_sqe& sqe = iouringCmdOp->getSqe(); + if (req.opType_ == OpType::READ) { + fdpNvmeVec_[kDefaultFdpIdx]->prepReadUringCmdSqe(sqe, op.data_, op.size_, + op.offset_); + } else { + fdpNvmeVec_[kDefaultFdpIdx]->prepWriteUringCmdSqe(sqe, op.data_, op.size_, + op.offset_, op.handle_); + } + io_uring_sqe_set_data(&sqe, iouringCmdOp.get()); + return std::move(iouringCmdOp); +#else + return nullptr; +#endif } /* * FileDevice */ FileDevice::FileDevice(std::vector&& fvec, + std::vector>&& fdpNvmeVec, uint64_t fileSize, uint32_t blockSize, uint32_t stripeSize, + uint32_t maxIOSize, uint32_t maxDeviceWriteSize, IoEngine ioEngine, uint32_t qDepthPerContext, @@ -835,8 +929,10 @@ FileDevice::FileDevice(std::vector&& fvec, : Device(fileSize * fvec.size(), std::move(encryptor), blockSize, + maxIOSize, maxDeviceWriteSize), fvec_(std::move(fvec)), + fdpNvmeVec_(std::move(fdpNvmeVec)), stripeSize_(stripeSize), ioEngine_(ioEngine), qDepthPerContext_(qDepthPerContext) { @@ -866,11 +962,14 @@ FileDevice::FileDevice(std::vector&& fvec, // (e.g., recovery path, read random alloc path) syncIoContext_ = std::make_unique(); - XLOGF(INFO, - "Created device with num_devices {} size {} block_size {}," - "stripe_size {} max_write_size {} io_engine {} qdepth {}", - fvec_.size(), getSize(), blockSize, stripeSize, maxDeviceWriteSize, - getIoEngineName(ioEngine_), qDepthPerContext_); + XLOGF( + INFO, + "Created device with num_devices {} size {} block_size {}," + "stripe_size {} max_write_size {} max_io_size {} io_engine {} qdepth {}," + "num_fdp_devices {}", + fvec_.size(), getSize(), blockSize, stripeSize, maxDeviceWriteSize, + maxIOSize, 
getIoEngineName(ioEngine_), qDepthPerContext_, + fdpNvmeVec_.size()); } bool FileDevice::readImpl(uint64_t offset, uint32_t size, void* value) { @@ -879,9 +978,12 @@ bool FileDevice::readImpl(uint64_t offset, uint32_t size, void* value) { return req->waitCompletion(); } -bool FileDevice::writeImpl(uint64_t offset, uint32_t size, const void* value) { - auto req = - getIoContext()->submitWrite(fvec_, stripeSize_, offset, size, value); +bool FileDevice::writeImpl(uint64_t offset, + uint32_t size, + const void* value, + int handle) { + auto req = getIoContext()->submitWrite(fvec_, stripeSize_, offset, size, + value, handle); return req->waitCompletion(); } @@ -923,8 +1025,16 @@ IoContext* FileDevice::getIoContext() { std::unique_ptr asyncBase; if (useIoUring) { #ifndef CACHELIB_IOURING_DISABLE - asyncBase = std::make_unique(qDepthPerContext_, pollMode, - qDepthPerContext_); + if (fdpNvmeVec_.size() > 0) { + folly::IoUringOp::Options options; + options.sqe128 = true; + options.cqe32 = true; + asyncBase = std::make_unique( + qDepthPerContext_, pollMode, qDepthPerContext_, options); + } else { + asyncBase = std::make_unique( + qDepthPerContext_, pollMode, qDepthPerContext_); + } #endif } else { XDCHECK_EQ(ioEngine_, IoEngine::LibAio); @@ -933,7 +1043,8 @@ IoContext* FileDevice::getIoContext() { auto idx = incrementalIdx_++; tlContext_.reset(new AsyncIoContext(std::move(asyncBase), idx, evb, - qDepthPerContext_, useIoUring)); + qDepthPerContext_, useIoUring, + fdpNvmeVec_)); { // Keep pointers in a vector to ease the gdb debugging @@ -948,6 +1059,16 @@ IoContext* FileDevice::getIoContext() { return tlContext_.get(); } +int FileDevice::allocatePlacementHandle() { + static constexpr uint16_t kDefaultFdpIdx = 0u; +#ifndef CACHELIB_IOURING_DISABLE + if (fdpNvmeVec_.size() > 0) { + return fdpNvmeVec_[kDefaultFdpIdx]->allocateFdpHandle(); + } +#endif + return -1; +} + } // namespace std::unique_ptr createMemoryDevice( @@ -960,19 +1081,58 @@ std::unique_ptr createMemoryDevice( 
std::unique_ptr createDirectIoFileDevice( std::vector fVec, + std::vector filePaths, uint64_t fileSize, uint32_t blockSize, uint32_t stripeSize, uint32_t maxDeviceWriteSize, IoEngine ioEngine, uint32_t qDepthPerContext, + bool isFDPEnabled, std::shared_ptr encryptor) { XDCHECK(folly::isPowTwo(blockSize)); + uint32_t maxIOSize = maxDeviceWriteSize; + std::vector> fdpNvmeVec{}; +#ifndef CACHELIB_IOURING_DISABLE + if (isFDPEnabled) { + try { + if (filePaths.size() > 1) { + throw std::invalid_argument(folly::sformat( + "{} input files; but FDP mode does not support RAID files yet", + filePaths.size())); + } + + for (const auto& path : filePaths) { + auto fdpNvme = std::make_shared(path); + + auto maxDevIOSize = fdpNvme->getMaxIOSize(); + if (maxDevIOSize != 0u && + (maxIOSize == 0u || maxDevIOSize < maxIOSize)) { + maxIOSize = maxDevIOSize; + } + + fdpNvmeVec.push_back(std::move(fdpNvme)); + } + } catch (const std::exception& e) { + XLOGF(ERR, "NVMe FDP mode could not be enabled {}, Errno: {}", e.what(), + errno); + fdpNvmeVec.clear(); + maxIOSize = 0u; + } + } +#endif + + if (maxIOSize != 0u) { + maxDeviceWriteSize = std::min(maxDeviceWriteSize, maxIOSize); + } + return std::make_unique(std::move(fVec), + std::move(fdpNvmeVec), fileSize, blockSize, stripeSize, + maxIOSize, maxDeviceWriteSize, ioEngine, qDepthPerContext, @@ -987,12 +1147,14 @@ std::unique_ptr createDirectIoFileDevice( uint32_t maxDeviceWriteSize, std::shared_ptr encryptor) { return createDirectIoFileDevice(std::move(fVec), + {}, fileSize, blockSize, stripeSize, maxDeviceWriteSize, IoEngine::Sync, 0, + false, encryptor); } diff --git a/cachelib/navy/common/Device.h b/cachelib/navy/common/Device.h index c024e0735..51cc1899f 100644 --- a/cachelib/navy/common/Device.h +++ b/cachelib/navy/common/Device.h @@ -69,19 +69,25 @@ class Device { Device(uint64_t size, std::shared_ptr encryptor, uint32_t maxWriteSize) - : Device( - size, std::move(encryptor), kDefaultAlignmentSize, maxWriteSize) {} + : Device(size, 
+ std::move(encryptor), + kDefaultAlignmentSize, + 0 /* max device IO size */, + maxWriteSize) {} // @param size total size of the device // @param encryptor encryption object // @param ioAlignSize alignment size for IO operations + // @param maxIOSize max device IO size // @param maxWriteSize max device write size Device(uint64_t size, std::shared_ptr encryptor, uint32_t ioAlignSize, + uint32_t maxIOSize, uint32_t maxWriteSize) : size_(size), ioAlignmentSize_{ioAlignSize}, + maxIOSize_(maxIOSize), maxWriteSize_(maxWriteSize), encryptor_{std::move(encryptor)} { if (ioAlignSize == 0) { @@ -97,6 +103,11 @@ class Device { throw std::invalid_argument(folly::sformat( "Invalid max write size {} ioAlignSize {}", maxWriteSize_, size)); } + if (maxIOSize_ % ioAlignmentSize_ != 0) { + throw std::invalid_argument( + folly::sformat("Invalid max io size {} ioAlignSize {}", maxIOSize_, + ioAlignmentSize_)); + } } virtual ~Device() = default; @@ -116,11 +127,15 @@ class Device { // @param buffer Data to write to the device. It must be aligned the same // way as `makeIOBuffer` would return. // @param offset Must be ioAlignmentSize_ aligned - bool write(uint64_t offset, Buffer buffer); + // @param handle Placement Handle for data placement technology like FDP + bool write(uint64_t offset, Buffer buffer, int handle = -1); // Write buffer view to the device. This call makes a copy of the buffer if // entryptor is present. - bool write(uint64_t offset, BufferView bufferView); + bool write(uint64_t offset, BufferView bufferView, int handle = -1); + + // Allocate a new stream and return the handle for Placement capable devices. + virtual int allocatePlacementHandle() = 0; // Reads @size bytes from device at @deviceOffset and copys to @value // There must be sufficient space allocated already in the mutableView. 
@@ -155,7 +170,10 @@ class Device { uint32_t getIOAlignmentSize() const { return ioAlignmentSize_; } protected: - virtual bool writeImpl(uint64_t offset, uint32_t size, const void* value) = 0; + virtual bool writeImpl(uint64_t offset, + uint32_t size, + const void* value, + int handle = -1) = 0; virtual bool readImpl(uint64_t offset, uint32_t size, void* value) = 0; virtual void flushImpl() = 0; @@ -172,7 +190,10 @@ class Device { bool readInternal(uint64_t offset, uint32_t size, void* value); - bool writeInternal(uint64_t offset, const uint8_t* data, size_t size); + bool writeInternal(uint64_t offset, + const uint8_t* data, + size_t size, + int handle = -1); // size of the device. All offsets for write/read should be contained // below this. @@ -181,6 +202,10 @@ class Device { // alignment granularity for the offsets and size to read/write calls. const uint32_t ioAlignmentSize_{kDefaultAlignmentSize}; + // Some devices have this transfer size limit due to DMA size limitations. + // This limit is applicable for both writes and reads. + const uint32_t maxIOSize_{0}; + // When write-io is issued, it is broken down into writeImpl calls at // this granularity. maxWriteSize_ 0 means no maximum write size. // maxWriteSize_ option allows splitting the large writes to smaller @@ -205,6 +230,7 @@ std::unique_ptr createMemoryDevice( // provided. If qDepth = 0, sync IO will be used all the time // // @param fVec vector of file descriptor(s) +// @param filePaths vector of file path(s) // @param fileSize size of the file(s) // @param blockSize device block size // @param stripeSize RAID stripe size if applicable @@ -214,15 +240,18 @@ std::unique_ptr createMemoryDevice( // @param ioEngine IO engine to be used // @param qDepth queue depth per each IO thread. // If 0, sync IO will be used +// @param isFDPEnabled Whether FDP placement mode is enabled or not. 
// @param encryptor encryption object std::unique_ptr createDirectIoFileDevice( std::vector fVec, + std::vector filePaths, uint64_t fileSize, uint32_t blockSize, uint32_t stripeSize, uint32_t maxDeviceWriteSize, IoEngine ioEngine, uint32_t qDepth, + bool isFDPEnabled, std::shared_ptr encryptor); // A convenient wrapper for creating Device with a sync IO @@ -240,7 +269,6 @@ std::unique_ptr createDirectIoFileDevice( uint32_t stripeSize, uint32_t maxDeviceWriteSize, std::shared_ptr encryptor); - } // namespace navy } // namespace cachelib } // namespace facebook diff --git a/cachelib/navy/common/FdpNvme.cpp b/cachelib/navy/common/FdpNvme.cpp new file mode 100644 index 000000000..3190c3028 --- /dev/null +++ b/cachelib/navy/common/FdpNvme.cpp @@ -0,0 +1,322 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cachelib/navy/common/FdpNvme.h" + +#include +#include +#include + +#include +#include + +#ifndef CACHELIB_IOURING_DISABLE + +namespace facebook { +namespace cachelib { +namespace navy { + +FdpNvme::FdpNvme(const std::string& bdevName) + : file_(openNvmeCharFile(bdevName)) { + initializeFDP(bdevName); + XLOGF(INFO, "Initialized NVMe FDP Device on file: {}", bdevName); +} + +int FdpNvme::allocateFdpHandle() { + uint16_t phndl; + + // Get NS specific Fdp Placement Handle(PHNDL) + if (nextPIDIdx_ <= maxPIDIdx_) { + phndl = nextPIDIdx_++; + } else { + phndl = kDefaultPIDIdx; + } + + XLOGF(INFO, "Allocated an FDP handle {}", phndl); + return static_cast(phndl); +} + +void FdpNvme::initializeFDP(const std::string& bdevName) { + nvmeData_ = readNvmeInfo(bdevName); + + Buffer buffer = nvmeFdpStatus(); + struct nvme_fdp_ruh_status* ruh_status = + reinterpret_cast(buffer.data()); + + if (!ruh_status->nruhsd) { + throw std::invalid_argument("Failed to initialize FDP; nruhsd is 0"); + } + placementIDs_.reserve(ruh_status->nruhsd); + maxPIDIdx_ = ruh_status->nruhsd - 1; + for (uint16_t i = 0; i <= maxPIDIdx_; ++i) { + placementIDs_[i] = ruh_status->ruhss[i].pid; + } + + XLOGF(DBG, "Creating NvmeFdp, fd: {}, Num_PID: {}, 1st PID: {}, Last PID: {}", + file_.fd(), maxPIDIdx_ + 1, placementIDs_[0], + placementIDs_[maxPIDIdx_]); +} + +// NVMe IO Mnagement Receive fn for specific config reading +int FdpNvme::nvmeIOMgmtRecv(uint32_t nsid, + void* data, + uint32_t data_len, + uint8_t op, + uint16_t op_specific) { + // IO management command details + uint32_t cdw10 = (op & 0xf) | (op_specific & 0xff << 16); + uint32_t cdw11 = (data_len >> 2) - 1; // cdw11 is 0 based + + struct nvme_passthru_cmd cmd = { + .opcode = nvme_cmd_io_mgmt_recv, + .nsid = nsid, + .addr = (uint64_t)(uintptr_t)data, + .data_len = data_len, + .cdw10 = cdw10, + .cdw11 = cdw11, + .timeout_ms = NVME_DEFAULT_IOCTL_TIMEOUT, + }; + + return ioctl(file_.fd(), NVME_IOCTL_IO_CMD, &cmd); +} + +// struct 
nvme_fdp_ruh_status is a variable sized object; so using Buffer. +Buffer FdpNvme::nvmeFdpStatus() { + struct nvme_fdp_ruh_status hdr; + int err; + + // Read FDP ruh status header to get Num_RUH Status Descriptors + err = nvmeIOMgmtRecv(nvmeData_.nsId(), &hdr, sizeof(hdr), + NVME_IO_MGMT_RECV_RUH_STATUS, 0); + if (err) { + throw std::system_error( + errno, + std::system_category(), + folly::sformat("failed to get ruh status header, fd: {}", file_.fd())); + } + + auto size = sizeof(struct nvme_fdp_ruh_status) + + (hdr.nruhsd * sizeof(struct nvme_fdp_ruh_status_desc)); + auto buffer = Buffer(size); + + // Read FDP RUH Status + err = nvmeIOMgmtRecv(nvmeData_.nsId(), buffer.data(), size, + NVME_IO_MGMT_RECV_RUH_STATUS, 0); + if (err) { + throw std::system_error( + errno, + std::system_category(), + folly::sformat("failed to get ruh status, fd: {}", file_.fd())); + } + + return buffer; +} + +void FdpNvme::prepFdpUringCmdSqe(struct io_uring_sqe& sqe, + void* buf, + size_t size, + off_t start, + uint8_t opcode, + uint8_t dtype, + uint16_t dspec) { + uint32_t maxTfrSize = nvmeData_.getMaxTfrSize(); + if ((maxTfrSize != 0) && (size > maxTfrSize)) { + throw std::invalid_argument("Exceeds max Transfer size"); + } + // Clear the SQE entry to avoid some arbitrary flags being set. 
+ memset(&sqe, 0, sizeof(struct io_uring_sqe)); + + sqe.fd = file_.fd(); + sqe.opcode = IORING_OP_URING_CMD; + sqe.cmd_op = NVME_URING_CMD_IO; + + struct nvme_uring_cmd* cmd = (struct nvme_uring_cmd*)&sqe.cmd; + if (cmd == nullptr) { + throw std::invalid_argument("Uring cmd is NULL!"); + } + memset(cmd, 0, sizeof(struct nvme_uring_cmd)); + cmd->opcode = opcode; + + // start LBA of the IO = Req_start (offset in partition) + Partition_start + uint64_t sLba = (start >> nvmeData_.lbaShift()) + nvmeData_.partStartLba(); + uint32_t nLb = (size >> nvmeData_.lbaShift()) - 1; // nLb is 0 based + + /* cdw10 and cdw11 represent starting lba */ + cmd->cdw10 = sLba & 0xffffffff; + cmd->cdw11 = sLba >> 32; + /* cdw12 represent number of lba's for read/write */ + cmd->cdw12 = (dtype & 0xFF) << 20 | nLb; + cmd->cdw13 = (dspec << 16); + cmd->addr = (uint64_t)buf; + cmd->data_len = size; + + cmd->nsid = nvmeData_.nsId(); +} + +void FdpNvme::prepReadUringCmdSqe(struct io_uring_sqe& sqe, + void* buf, + size_t size, + off_t start) { + // Placement Handle is not used for read. 
+ prepFdpUringCmdSqe(sqe, buf, size, start, nvme_cmd_read, 0, 0); +} + +void FdpNvme::prepWriteUringCmdSqe( + struct io_uring_sqe& sqe, void* buf, size_t size, off_t start, int handle) { + static constexpr uint8_t kPlacementMode = 2; + uint16_t pid; + + if (handle == -1) { + pid = getFdpPID(kDefaultPIDIdx); // Use the default stream + } else if (handle >= 0 && handle <= maxPIDIdx_) { + pid = getFdpPID(static_cast(handle)); + } else { + throw std::invalid_argument("Invalid placement identifier"); + } + + prepFdpUringCmdSqe(sqe, buf, size, start, nvme_cmd_write, kPlacementMode, + pid); +} + +// Read the /sys/block/xx entry for any block device +std::string readDevAttr(const std::string& bName, const std::string& attr) { + std::string path = "/sys/block/" + bName + '/' + attr; + std::string entry; + if (!folly::readFile(path.c_str(), entry)) { + throw std::runtime_error(folly::sformat("Unable to read {}", path)); + } + return entry; +} + +// Get the Namespace ID of an NVMe block device +int getNvmeNsId(const std::string& nsName) { + return (folly::to(readDevAttr(nsName, "nsid"))); +} + +// Get the Max Transfer size in bytes for an NVMe block device +uint32_t getMaxTfrSize(const std::string& nsName) { + // max_hw_sectors_kb : This is the maximum number of kilobytes supported in a + // single data transfer. 
+ // (https://www.kernel.org/doc/Documentation/block/queue-sysfs.txt) + return (1024u * /* multiply by kb */ + folly::to(readDevAttr(nsName, "queue/max_hw_sectors_kb"))); +} + +// Get LBA shift of an NVMe block device +uint32_t getLbaShift(const std::string& nsName) { + return folly::constexpr_log2( + folly::to(readDevAttr(nsName, "queue/logical_block_size"))); +} + +// Get the partition start in bytes +uint64_t getPartStart(const std::string& nsName, const std::string& partName) { + return (512u * /* sysfs size is in terms of linux sector size */ + folly::to(readDevAttr(nsName + "/" + partName, "start"))); +} + +// It returns nsName = "nvme0n1" for both "/dev/nvme0n1" and "/dev/nvme0n1p1". +// Also partName = "nvme0n1p1" for "/dev/nvme0n1p1" (partitioned NS), +// and "" for "/dev/nvme0n1" (non-partitioned NS) +void getNsAndPartition(const std::string& bdevName, + std::string& nsName, + std::string& partName) { + size_t lastSlashPos = bdevName.find_last_of('/'); + if (lastSlashPos == std::string::npos) { + throw std::invalid_argument("Invalid block dev name"); + } + + std::string baseName = bdevName.substr(lastSlashPos + 1); + size_t pPos = baseName.find_last_of('p'); + + if (pPos == std::string::npos) { + nsName = baseName; + partName = {}; + } else { + nsName = baseName.substr(0, pPos); + partName = baseName; + } +} + +// Reads the NVMe related info from a valid NVMe device path +NvmeData FdpNvme::readNvmeInfo(const std::string& bdevName) { + std::string nsName, partName; + + try { + getNsAndPartition(bdevName, nsName, partName); + int namespace_id = getNvmeNsId(nsName); + uint32_t lbaShift = getLbaShift(nsName); + uint32_t maxTfrSize = getMaxTfrSize(nsName); + + uint64_t startLba{0}; + if (!partName.empty()) { + startLba = getPartStart(nsName, partName) >> lbaShift; + } + + XLOGF(INFO, + "Nvme Device Info, NS Id: {}, lbaShift: {}," + " Max Transfer size: {}, start Lba: {}", + namespace_id, lbaShift, maxTfrSize, startLba); + + return NvmeData{namespace_id, 
lbaShift, maxTfrSize, startLba}; + } catch (const std::system_error& e) { + XLOGF(ERR, "Exception in readNvmeInfo for: {}", bdevName); + throw; + } +} + +bool isValidNvmeDevice(const std::string& bdevName) { + return std::regex_match(bdevName, std::regex("^/dev/nvme\\d+n\\d+(p\\d+)?$")); +} + +// Converts an nvme block device name (ex: /dev/nvme0n1p1) to corresponding +// nvme char device name (ex: /dev/ng0n1), to use Nvme FDP directives. +std::string getNvmeCharDevice(const std::string& bdevName) { + // Extract dev and NS IDs, and ignore partition ID. + // Example: extract the string '0n1' from '/dev/nvme0n1p1' + size_t devPos = bdevName.find_first_of("0123456789"); + size_t pPos = bdevName.find('p', devPos); + + return "/dev/ng" + bdevName.substr(devPos, pPos - devPos); +} + +// Open Nvme Character device for the given block dev @bdevName. +// Throws std::system_error if failed. +folly::File FdpNvme::openNvmeCharFile(const std::string& bdevName) { + if (!isValidNvmeDevice(bdevName)) { + throw std::invalid_argument("Invalid NVMe device name"); + } + + int flags{O_RDONLY}; + folly::File f; + + try { + auto cdevName = getNvmeCharDevice(bdevName); + XLOGF(INFO, "Opening NVMe Char Dev file: {}", cdevName); + f = folly::File(cdevName.c_str(), flags); + } catch (const std::system_error& e) { + XLOGF(ERR, "Exception in openNvmeCharFile for: {}", bdevName); + throw; + } + XDCHECK_GE(f.fd(), 0); + + return f; +} + +} // namespace navy +} // namespace cachelib +} // namespace facebook + +#endif diff --git a/cachelib/navy/common/FdpNvme.h b/cachelib/navy/common/FdpNvme.h new file mode 100644 index 000000000..23855f36f --- /dev/null +++ b/cachelib/navy/common/FdpNvme.h @@ -0,0 +1,216 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include "cachelib/navy/common/Buffer.h" +#include "cachelib/navy/common/Device.h" + +#ifndef CACHELIB_IOURING_DISABLE +#include +#endif + +namespace facebook { +namespace cachelib { +namespace navy { +#ifndef CACHELIB_IOURING_DISABLE + +// Reference: https://github.com/axboe/fio/blob/master/engines/nvme.h +// If the uapi headers installed on the system lacks nvme uring command +// support, use the local version to prevent compilation issues. +#ifndef CONFIG_NVME_URING_CMD +struct nvme_uring_cmd { + __u8 opcode; + __u8 flags; + __u16 rsvd1; + __u32 nsid; + __u32 cdw2; + __u32 cdw3; + __u64 metadata; + __u64 addr; + __u32 metadata_len; + __u32 data_len; + __u32 cdw10; + __u32 cdw11; + __u32 cdw12; + __u32 cdw13; + __u32 cdw14; + __u32 cdw15; + __u32 timeout_ms; + __u32 rsvd2; +}; + +#define NVME_URING_CMD_IO _IOWR('N', 0x80, struct nvme_uring_cmd) +#define NVME_URING_CMD_IO_VEC _IOWR('N', 0x81, struct nvme_uring_cmd) +#endif /* CONFIG_NVME_URING_CMD */ + +#define NVME_DEFAULT_IOCTL_TIMEOUT 0 + +enum nvme_io_mgmt_recv_mo { + NVME_IO_MGMT_RECV_RUH_STATUS = 0x1, +}; + +struct nvme_fdp_ruh_status_desc { + uint16_t pid; + uint16_t ruhid; + uint32_t earutr; + uint64_t ruamw; + uint8_t rsvd16[16]; +}; + +struct nvme_fdp_ruh_status { + uint8_t rsvd0[14]; + uint16_t nruhsd; + struct nvme_fdp_ruh_status_desc ruhss[]; +}; + +enum nvme_io_opcode { + nvme_cmd_write = 0x01, + nvme_cmd_read = 0x02, + nvme_cmd_io_mgmt_recv = 0x12, + nvme_cmd_io_mgmt_send = 0x1d, +}; + +// NVMe specific data for a device +// 
+// This is needed because FDP-IO have to be sent through Io_Uring_Cmd interface. +// So NVMe data is needed for initialization and IO cmd formation. +class NvmeData { + public: + NvmeData() = default; + NvmeData& operator=(const NvmeData&) = default; + + explicit NvmeData(int nsId, + uint32_t lbaShift, + uint32_t maxTfrSize, + uint64_t startLba) + : nsId_(nsId), + lbaShift_(lbaShift), + maxTfrSize_(maxTfrSize), + startLba_(startLba) {} + + // NVMe Namespace ID + int nsId() const { return nsId_; } + + // LBA shift number to calculate blocksize + uint32_t lbaShift() const { return lbaShift_; } + + // Get the max transfer size of NVMe device. + uint32_t getMaxTfrSize() { return maxTfrSize_; } + + // Start LBA of the disk partition. + // It will be 0, if there is no partition and just an NS. + uint64_t partStartLba() const { return startLba_; } + + private: + int nsId_; + uint32_t lbaShift_; + uint32_t maxTfrSize_; + uint64_t startLba_; +}; +#endif + +// FDP specific info and handling +// +// This embeds the FDP semantics and specific io-handling. +// Note: IO with FDP semantics need to be sent through Io_Uring_cmd interface +// as of now; and not supported through conventional block interfaces. +class FdpNvme { + public: + explicit FdpNvme(const std::string& fileName); + + FdpNvme(const FdpNvme&) = delete; + FdpNvme& operator=(const FdpNvme&) = delete; + +#ifndef CACHELIB_IOURING_DISABLE + // Allocates an FDP specific placement handle. This handle will be + // interpreted by the device for data placement. + int allocateFdpHandle(); + + // Get the max IO transfer size of NVMe device. + uint32_t getMaxIOSize() { return nvmeData_.getMaxTfrSize(); } + + // Get the NVMe specific info on this device. + NvmeData& getNvmeData() { return nvmeData_; } + + // Reads FDP status descriptor into Buffer + Buffer nvmeFdpStatus(); + + // Prepares the Uring_Cmd sqe for read command. 
+ void prepReadUringCmdSqe(struct io_uring_sqe& sqe, + void* buf, + size_t size, + off_t start); + + // Prepares the Uring_Cmd sqe for write command with FDP handle. + void prepWriteUringCmdSqe(struct io_uring_sqe& sqe, + void* buf, + size_t size, + off_t start, + int handle); + + private: + // Open Nvme Character device for the given block dev @fileName. + folly::File openNvmeCharFile(const std::string& fileName); + + // Prepares the Uring_Cmd sqe for read/write command with FDP directives. + void prepFdpUringCmdSqe(struct io_uring_sqe& sqe, + void* buf, + size_t size, + off_t start, + uint8_t opcode, + uint8_t dtype, + uint16_t dspec); + + // Get FDP PlacementID for a NVMe NS specific PHNDL + uint16_t getFdpPID(uint16_t fdpPHNDL) { return placementIDs_[fdpPHNDL]; } + + // Reads NvmeData for a NVMe device + NvmeData readNvmeInfo(const std::string& blockDevice); + + // Initialize the FDP device and populate necessary info. + void initializeFDP(const std::string& blockDevice); + + // Generic NVMe IO mgmnt receive cmd + int nvmeIOMgmtRecv(uint32_t nsid, + void* data, + uint32_t data_len, + uint8_t op, + uint16_t op_specific); + + // 0u is considered as the default placement ID + static constexpr uint16_t kDefaultPIDIdx = 0u; + + // The mapping table of PHNDL: PID in a Namespace + std::vector placementIDs_{}; + + uint16_t maxPIDIdx_{0}; + uint16_t nextPIDIdx_{kDefaultPIDIdx + 1}; + NvmeData nvmeData_{}; + // File handle for IO with FDP directives. Since FDP IO requires the use of + // NVMe character device interface, a separate file instance is kept from + // that of FileDevice. 
+ folly::File file_; +#endif +}; + +} // namespace navy +} // namespace cachelib +} // namespace facebook diff --git a/cachelib/navy/common/tests/DeviceTest.cpp b/cachelib/navy/common/tests/DeviceTest.cpp index d7cfb77da..97532986b 100644 --- a/cachelib/navy/common/tests/DeviceTest.cpp +++ b/cachelib/navy/common/tests/DeviceTest.cpp @@ -33,7 +33,7 @@ using testing::_; namespace facebook::cachelib::navy::tests { TEST(Device, BytesWritten) { MockDevice device{100, 1}; - EXPECT_CALL(device, writeImpl(_, _, _)) + EXPECT_CALL(device, writeImpl(_, _, _, _)) .WillOnce(testing::Return(true)) .WillOnce(testing::Return(true)) .WillOnce(testing::Return(false)); @@ -109,7 +109,7 @@ TEST(Device, Latency) { std::this_thread::sleep_for(std::chrono::milliseconds{100}); return true; })); - EXPECT_CALL(device, writeImpl(0, 1, _)) + EXPECT_CALL(device, writeImpl(0, 1, _, _)) .WillOnce(testing::InvokeWithoutArgs([] { std::this_thread::sleep_for(std::chrono::milliseconds{100}); return true; @@ -135,7 +135,7 @@ TEST(Device, IOError) { MockDevice device{1, 1}; EXPECT_CALL(device, readImpl(0, 1, _)) .WillOnce(testing::InvokeWithoutArgs([] { return false; })); - EXPECT_CALL(device, writeImpl(0, 1, _)) + EXPECT_CALL(device, writeImpl(0, 1, _, _)) .WillOnce(testing::InvokeWithoutArgs([] { return false; })); Buffer buf{1}; @@ -213,12 +213,14 @@ struct DeviceParamTest uint32_t maxDeviceWriteSize, std::shared_ptr encryptor) { device_ = createDirectIoFileDevice(std::move(fVec), + {}, fileSize, blockSize, stripeSize, maxDeviceWriteSize, ioEngine_, qDepth_, + false, std::move(encryptor)); return device_; } diff --git a/cachelib/navy/testing/MockDevice.cpp b/cachelib/navy/testing/MockDevice.cpp index 182875421..3be70981a 100644 --- a/cachelib/navy/testing/MockDevice.cpp +++ b/cachelib/navy/testing/MockDevice.cpp @@ -22,7 +22,7 @@ namespace navy { MockDevice::MockDevice(uint64_t deviceSize, uint32_t ioAlignSize, std::shared_ptr encryptor) - : Device{deviceSize, nullptr, ioAlignSize, 0}, + : 
Device{deviceSize, nullptr, ioAlignSize, 0, 0}, device_{deviceSize == 0 ? nullptr : createMemoryDevice( @@ -35,9 +35,9 @@ MockDevice::MockDevice(uint64_t deviceSize, return device_->read(offset, size, buffer); })); - ON_CALL(*this, writeImpl(testing::_, testing::_, testing::_)) + ON_CALL(*this, writeImpl(testing::_, testing::_, testing::_, testing::_)) .WillByDefault(testing::Invoke( - [this](uint64_t offset, uint32_t size, const void* data) { + [this](uint64_t offset, uint32_t size, const void* data, int) { XDCHECK_EQ(size % getIOAlignmentSize(), 0u); XDCHECK_EQ(offset % getIOAlignmentSize(), 0u); Buffer buffer = device_->makeIOBuffer(size); @@ -48,6 +48,10 @@ MockDevice::MockDevice(uint64_t deviceSize, ON_CALL(*this, flushImpl()).WillByDefault(testing::Invoke([this]() { device_->flush(); })); + + ON_CALL(*this, allocatePlacementHandle()).WillByDefault(testing::Invoke([]() { + return -1; + })); } } // namespace navy } // namespace cachelib diff --git a/cachelib/navy/testing/MockDevice.h b/cachelib/navy/testing/MockDevice.h index d4933b33a..3333b429f 100644 --- a/cachelib/navy/testing/MockDevice.h +++ b/cachelib/navy/testing/MockDevice.h @@ -42,8 +42,9 @@ class MockDevice : public Device { std::shared_ptr encryptor = nullptr); MOCK_METHOD3(readImpl, bool(uint64_t, uint32_t, void*)); - MOCK_METHOD3(writeImpl, bool(uint64_t, uint32_t, const void*)); + MOCK_METHOD4(writeImpl, bool(uint64_t, uint32_t, const void*, int)); MOCK_METHOD0(flushImpl, void()); + MOCK_METHOD0(allocatePlacementHandle, int()); // Returns pointer to the device backing this mock object. 
This is // useful if user wants to bypass the mock to access the real device @@ -71,9 +72,12 @@ class SizeMockDevice : public Device { public: explicit SizeMockDevice(uint64_t deviceSize) : Device(deviceSize) {} - bool writeImpl(uint64_t, uint32_t, const void*) override { return false; } + bool writeImpl(uint64_t, uint32_t, const void*, int) override { + return false; + } bool readImpl(uint64_t, uint32_t, void*) override { return false; } void flushImpl() override {} + int allocatePlacementHandle() override { return -1; } }; } // namespace navy } // namespace cachelib