(PR2) Add support for Flexible Data Placement (FDP) over NVMe in CacheLib #277

Closed

Changes from all commits
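In brief: this PR plumbs a new enableFDP flag from the cachebench JSON config (deviceEnableFDP) through NavyConfig into Navy's device factory, adds FdpNvme.cpp to the Navy build, and extends the device write path with placement handles: BigHash and RegionManager each allocate a handle at construction and tag every write with it, so FDP-capable NVMe devices can place the two write streams separately.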
1 change: 1 addition & 0 deletions cachelib/allocator/nvmcache/NavyConfig.cpp
@@ -278,6 +278,7 @@ std::map<std::string, std::string> NavyConfig::serialize() const {
folly::to<std::string>(deviceMaxWriteSize_);
configMap["navyConfig::ioEngine"] = getIoEngineName(ioEngine_).str();
configMap["navyConfig::QDepth"] = folly::to<std::string>(qDepth_);
configMap["navyConfig::enableFDP"] = folly::to<std::string>(enableFDP_);

// Job scheduler settings
configMap["navyConfig::readerThreads"] =
7 changes: 7 additions & 0 deletions cachelib/allocator/nvmcache/NavyConfig.h
@@ -485,6 +485,8 @@ class NavyConfig {
bool isBigHashEnabled() const {
return enginesConfigs_[0].bigHash().getSizePct() > 0;
}
+ bool isFDPEnabled() const { return enableFDP_; }

std::map<std::string, std::string> serialize() const;

// Getters:
@@ -549,6 +551,8 @@ class NavyConfig {
// ============ Device settings =============
// Set the device block size, i.e., minimum unit of IO
void setBlockSize(uint64_t blockSize) noexcept { blockSize_ = blockSize; }
+ // Set whether to enable the NVMe FDP data placement mode for the device
+ void setEnableFDP(bool enable) noexcept { enableFDP_ = enable; }
// Set the parameters for a simple file.
// @throw std::invalid_argument if RAID files have been already set.
void setSimpleFile(const std::string& fileName,
@@ -694,6 +698,9 @@ class NavyConfig {
// Whether to use write size (instead of parcel size) for Navy admission
// policy.
bool useEstimatedWriteSize_{false};
+ // Whether Navy supports the NVMe FDP data placement (TP4146) directives or not.
+ // Reference: https://nvmexpress.org/nvmeflexible-data-placement-fdp-blog/
+ bool enableFDP_{false};
};
} // namespace navy
} // namespace cachelib
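A minimal sketch of exercising the new accessors (only setEnableFDP, isFDPEnabled, and serialize are from this diff; the namespace qualification and surrounding code are assumptions):

#include <cassert>
#include "cachelib/allocator/nvmcache/NavyConfig.h"

using facebook::cachelib::navy::NavyConfig;

NavyConfig makeFdpNavyConfig() {
  NavyConfig config;
  config.setEnableFDP(true);  // opt in to FDP placement on capable devices
  assert(config.isFDPEnabled());
  // serialize() now carries the flag, e.g. map["navyConfig::enableFDP"] == "1".
  auto map = config.serialize();
  (void)map;
  return config;
}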
1 change: 1 addition & 0 deletions cachelib/allocator/nvmcache/NavySetup.cpp
@@ -349,6 +349,7 @@ std::unique_ptr<cachelib::navy::Device> createDevice(
maxDeviceWriteSize > 0 ? alignDown(maxDeviceWriteSize, blockSize) : 0,
config.getIoEngine(),
config.getQDepth(),
+ config.isFDPEnabled(),
std::move(encryptor));
} else {
return cachelib::navy::createMemoryDevice(config.getFileSize(),
1 change: 1 addition & 0 deletions cachelib/allocator/nvmcache/tests/NavyConfigTest.cpp
@@ -189,6 +189,7 @@ TEST(NavyConfigTest, Serialization) {
expectedConfigMap["navyConfig::deviceMaxWriteSize"] = "4194304";
expectedConfigMap["navyConfig::ioEngine"] = "io_uring";
expectedConfigMap["navyConfig::QDepth"] = "64";
expectedConfigMap["navyConfig::enableFDP"] = "0";

expectedConfigMap["navyConfig::blockCacheLru"] = "false";
expectedConfigMap["navyConfig::blockCacheRegionSize"] = "16777216";
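The expected value is "0" because folly::to<std::string> renders bool as "0"/"1" and enableFDP_ defaults to false; a quick illustration:

#include <cassert>
#include <string>
#include <folly/Conv.h>

int main() {
  assert(folly::to<std::string>(false) == "0");  // matches the serialized default
  assert(folly::to<std::string>(true) == "1");
  return 0;
}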
1 change: 1 addition & 0 deletions cachelib/cachebench/cache/Cache-inl.h
@@ -173,6 +173,7 @@ Cache<Allocator>::Cache(const CacheConfig& config,
config_.navyReqOrderShardsPower);
}
nvmConfig.navyConfig.setBlockSize(config_.navyBlockSize);
+ nvmConfig.navyConfig.setEnableFDP(config_.deviceEnableFDP);

// configure BlockCache
auto& bcConfig = nvmConfig.navyConfig.blockCache()
3 changes: 2 additions & 1 deletion cachelib/cachebench/util/CacheConfig.cpp
@@ -88,6 +88,7 @@ CacheConfig::CacheConfig(const folly::dynamic& configJson) {
JSONSetVal(configJson, truncateItemToOriginalAllocSizeInNvm);
JSONSetVal(configJson, navyEncryption);
JSONSetVal(configJson, deviceMaxWriteSize);
+ JSONSetVal(configJson, deviceEnableFDP);

JSONSetVal(configJson, memoryOnlyTTL);

@@ -111,7 +112,7 @@ CacheConfig::CacheConfig(const folly::dynamic& configJson) {
// if you added new fields to the configuration, update the JSONSetVal
// to make them available for the json configs and increment the size
// below
- checkCorrectSize<CacheConfig, 752>();
+ checkCorrectSize<CacheConfig, 760>();

if (numPools != poolSizes.size()) {
throw std::invalid_argument(folly::sformat(
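The size constant moves from 752 to 760 rather than 753 because the new bool lands after 8-byte-aligned members, so alignment padding rounds the struct up by a full word. An illustration (the member layout here is invented; only the arithmetic matters):

#include <cstddef>

struct Old { double members[94]; };             // 94 * 8 = 752 bytes
struct New { double members[94]; bool flag; };  // 753 bytes of data, padded to 760
static_assert(sizeof(Old) == 752, "baseline");
static_assert(sizeof(New) == 760, "one bool costs a full alignment quantum here");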
3 changes: 3 additions & 0 deletions cachelib/cachebench/util/CacheConfig.h
@@ -227,6 +227,9 @@ struct CacheConfig : public JSONConfig {
// Navy will split it into multiple IOs.
uint32_t deviceMaxWriteSize{1024 * 1024};

+ // Enable the FDP data placement mode in the device, if it is capable.
+ bool deviceEnableFDP{false};

// Don't write to flash if cache TTL is smaller than this value.
// Not used when its value is 0. In seconds.
uint32_t memoryOnlyTTL{0};
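With JSONSetVal wired up above, a cachebench run can opt in through its JSON config. A sketch, shown parsed the way CacheConfig's constructor consumes configJson (deviceEnableFDP is from this diff; the other key is an illustrative cachebench field):

#include <folly/dynamic.h>
#include <folly/json.h>

folly::dynamic const cacheConfigJson = folly::parseJson(R"JSON({
  "deviceEnableFDP": true,
  "navyBlockSize": 4096
})JSON");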
1 change: 1 addition & 0 deletions cachelib/navy/CMakeLists.txt
@@ -31,6 +31,7 @@ add_library (cachelib_navy
block_cache/RegionManager.cpp
common/Buffer.cpp
common/Device.cpp
+ common/FdpNvme.cpp
common/Hash.cpp
common/SizeDistribution.cpp
common/Types.cpp
3 changes: 3 additions & 0 deletions cachelib/navy/Factory.cpp
@@ -454,6 +454,7 @@ std::unique_ptr<Device> createFileDevice(
uint32_t maxDeviceWriteSize,
IoEngine ioEngine,
uint32_t qDepth,
+ bool isFDPEnabled,
std::shared_ptr<navy::DeviceEncryptor> encryptor) {
// File paths are opened in the increasing order of the
// path string. This ensures that RAID0 stripes aren't
@@ -476,12 +477,14 @@
}

return createDirectIoFileDevice(std::move(fileVec),
+ std::move(filePaths),
fdSize,
blockSize,
stripeSize,
maxDeviceWriteSize,
ioEngine,
qDepth,
+ isFDPEnabled,
std::move(encryptor));
}

2 changes: 2 additions & 0 deletions cachelib/navy/Factory.h
@@ -209,6 +209,7 @@ std::unique_ptr<AbstractCache> createCache(std::unique_ptr<CacheProto> proto);
// @param maxDeviceWriteSize device maximum granularity of writes
// @param ioEngine IoEngine to be used for IO
// @param qDepth queue depth for async IO; 0 for sync IO
+ // @param isFDPEnabled whether the FDP placement mode is enabled or not
// @param encryptor encryption object
std::unique_ptr<Device> createFileDevice(
std::vector<std::string> filePaths,
@@ -219,6 +220,7 @@ std::unique_ptr<Device> createFileDevice(
uint32_t maxDeviceWriteSize,
IoEngine ioEngine,
uint32_t qDepth,
+ bool isFDPEnabled,
std::shared_ptr<navy::DeviceEncryptor> encryptor);

} // namespace navy
6 changes: 4 additions & 2 deletions cachelib/navy/bighash/BigHash.cpp
@@ -88,7 +88,8 @@ BigHash::BigHash(Config&& config, ValidConfigTag)
cacheBaseOffset_{config.cacheBaseOffset},
numBuckets_{config.numBuckets()},
bloomFilter_{std::move(config.bloomFilter)},
- device_{*config.device} {
+ device_{*config.device},
+ placementHandle_{device_.allocatePlacementHandle()} {
XLOGF(INFO,
"BigHash created: buckets: {}, bucket size: {}, base offset: {}",
numBuckets_,
@@ -550,6 +551,7 @@ Buffer BigHash::readBucket(BucketId bid) {
bool BigHash::writeBucket(BucketId bid, Buffer buffer) {
auto* bucket = reinterpret_cast<Bucket*>(buffer.data());
bucket->setChecksum(Bucket::computeChecksum(buffer.view()));
- return device_.write(getBucketOffset(bid), std::move(buffer));
+ return device_.write(
+     getBucketOffset(bid), std::move(buffer), placementHandle_);
}
} // namespace facebook::cachelib::navy
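The write-path change above is the pattern this PR applies throughout: an engine allocates a placement handle once at construction, then tags every write with it. A hedged reconstruction of the Device side, inferred from the call sites and mocks in this diff rather than from the real header:

#include <cstdint>

namespace facebook::cachelib::navy {
class Buffer;  // Navy's owning buffer type (common/Buffer.h)

class Device {
 public:
  virtual ~Device() = default;

  // Hands out a placement handle; engines call this once at construction.
  // A non-FDP device can return a default "no placement" value.
  virtual int allocatePlacementHandle() = 0;

  // Writes carry the handle; an FDP-capable device can map distinct handles
  // to distinct NVMe placement identifiers, others simply ignore the argument.
  bool write(uint64_t offset, Buffer buffer, int placementHandle);
};
} // namespace facebook::cachelib::navy

Because BigHash buckets and BlockCache regions hold separate handles (see RegionManager below), an FDP device can physically segregate the small-object and large-region write streams, which is the usual FDP motivation of reducing device-level write amplification.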
2 changes: 2 additions & 0 deletions cachelib/navy/bighash/BigHash.h
@@ -212,6 +212,8 @@ class BigHash final : public Engine {
std::unique_ptr<BloomFilter> bloomFilter_;
std::chrono::nanoseconds generationTime_{};
Device& device_;
+ // Handle for data placement technologies like FDP
+ int placementHandle_;
std::unique_ptr<SharedMutex[]> mutex_{new SharedMutex[kNumMutexes]};
// Spinlocks for bloom filter operations
// We use spinlock in addition to the mutex to avoid contentions of
18 changes: 11 additions & 7 deletions cachelib/navy/bighash/tests/BigHashTest.cpp
@@ -213,7 +213,7 @@ TEST(BigHash, DeviceErrorStats) {
BigHash bh(std::move(config));

EXPECT_EQ(Status::Ok, bh.insert(makeHK("key1"), makeView("1")));
- EXPECT_CALL(*device, writeImpl(0, 64, _)).WillOnce(Return(false));
+ EXPECT_CALL(*device, writeImpl(0, 64, _, _)).WillOnce(Return(false));
EXPECT_EQ(Status::DeviceError, bh.insert(makeHK("key2"), makeView("1")));
{
MockCounterVisitor helper;
@@ -351,12 +351,13 @@ TEST(BigHash, WriteInTwoBuckets) {
config.cacheBaseOffset + config.cacheSize, 128);
{
InSequence inSeq;
+ EXPECT_CALL(*device, allocatePlacementHandle());
EXPECT_CALL(*device, readImpl(256, 128, _));
- EXPECT_CALL(*device, writeImpl(256, 128, _));
+ EXPECT_CALL(*device, writeImpl(256, 128, _, _));
EXPECT_CALL(*device, readImpl(384, 128, _));
- EXPECT_CALL(*device, writeImpl(384, 128, _));
+ EXPECT_CALL(*device, writeImpl(384, 128, _, _));
EXPECT_CALL(*device, readImpl(256, 128, _));
- EXPECT_CALL(*device, writeImpl(256, 128, _));
+ EXPECT_CALL(*device, writeImpl(256, 128, _, _));
}
config.device = device.get();

@@ -375,10 +376,11 @@ TEST(BigHash, RemoveNotFound) {
auto device = std::make_unique<StrictMock<MockDevice>>(config.cacheSize, 128);
{
InSequence inSeq;
+ EXPECT_CALL(*device, allocatePlacementHandle());
EXPECT_CALL(*device, readImpl(0, 128, _));
- EXPECT_CALL(*device, writeImpl(0, 128, _));
+ EXPECT_CALL(*device, writeImpl(0, 128, _, _));
EXPECT_CALL(*device, readImpl(0, 128, _));
- EXPECT_CALL(*device, writeImpl(0, 128, _));
+ EXPECT_CALL(*device, writeImpl(0, 128, _, _));
EXPECT_CALL(*device, readImpl(0, 128, _));
}
config.device = device.get();
@@ -541,6 +543,7 @@ TEST(BigHash, BloomFilterRecoveryFail) {
BigHash::Config config;
setLayout(config, 128, 2);
auto device = std::make_unique<StrictMock<MockDevice>>(config.cacheSize, 128);
+ EXPECT_CALL(*device, allocatePlacementHandle());
EXPECT_CALL(*device, readImpl(_, _, _)).Times(0);
config.device = device.get();
config.bloomFilter = std::make_unique<BloomFilter>(2, 1, 4);
@@ -635,8 +638,9 @@ TEST(BigHash, BloomFilterRecovery) {
setLayout(config, 128, 2);
auto device =
std::make_unique<StrictMock<MockDevice>>(config.cacheSize, 128);
+ EXPECT_CALL(*device, allocatePlacementHandle());
EXPECT_CALL(*device, readImpl(0, 128, _));
- EXPECT_CALL(*device, writeImpl(0, 128, _));
+ EXPECT_CALL(*device, writeImpl(0, 128, _, _));
config.device = device.get();
config.bloomFilter = std::make_unique<BloomFilter>(2, 1, 4);

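For anyone updating similar tests: engines now call allocatePlacementHandle() in their constructors, so strict mocks need one extra expectation, and writeImpl matchers gain a fourth argument for the handle. A two-line sketch of the pattern used above (gMock conventions as in this file):

EXPECT_CALL(*device, allocatePlacementHandle());  // satisfied during engine construction
EXPECT_CALL(*device, writeImpl(0, 128, _, _));    // trailing _ matches the placement handle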
7 changes: 4 additions & 3 deletions cachelib/navy/block_cache/RegionManager.cpp
@@ -45,7 +45,8 @@ RegionManager::RegionManager(uint32_t numRegions,
numCleanRegions_{numCleanRegions},
evictCb_{evictCb},
cleanupCb_{cleanupCb},
- numInMemBuffers_{numInMemBuffers} {
+ numInMemBuffers_{numInMemBuffers},
+ placementHandle_{device_.allocatePlacementHandle()} {
XLOGF(INFO, "{} regions, {} bytes each", numRegions_, regionSize_);
for (uint32_t i = 0; i < numRegions; i++) {
regions_[i] = std::make_unique<Region>(RegionId{i}, regionSize_);
@@ -526,7 +527,7 @@ bool RegionManager::deviceWrite(RelAddress addr, Buffer buf) {
const auto bufSize = buf.size();
XDCHECK(isValidIORange(addr.offset(), bufSize));
auto physOffset = physicalOffset(addr);
- if (!device_.write(physOffset, std::move(buf))) {
+ if (!device_.write(physOffset, std::move(buf), placementHandle_)) {
return false;
}
physicalWrittenCount_.add(bufSize);
@@ -537,7 +538,7 @@ bool RegionManager::deviceWrite(RelAddress addr, BufferView view) {
const auto bufSize = view.size();
XDCHECK(isValidIORange(addr.offset(), bufSize));
auto physOffset = physicalOffset(addr);
- if (!device_.write(physOffset, view)) {
+ if (!device_.write(physOffset, view, placementHandle_)) {
return false;
}
physicalWrittenCount_.add(bufSize);
1 change: 1 addition & 0 deletions cachelib/navy/block_cache/RegionManager.h
@@ -339,6 +339,7 @@ class RegionManager {
mutable TimedMutex bufferMutex_;
mutable util::ConditionVariable bufferCond_;
std::vector<std::unique_ptr<Buffer>> buffers_;
+ int placementHandle_;
};
} // namespace navy
} // namespace cachelib
25 changes: 13 additions & 12 deletions cachelib/navy/block_cache/tests/BlockCacheTest.cpp
@@ -592,14 +592,14 @@ TEST(BlockCache, ReclaimCorruption) {
auto driver = makeDriver(std::move(engine), std::move(ex));

// Allow any number of writes in between and after our expected writes
- EXPECT_CALL(*device, writeImpl(_, _, _)).Times(testing::AtLeast(0));
+ EXPECT_CALL(*device, writeImpl(_, _, _, _)).Times(testing::AtLeast(0));

// Note even tho this item's value is corrupted, we would have aborted
// the reclaim before we got here. So we will not bump the value checksum
// error stat on this.
- EXPECT_CALL(*device, writeImpl(0, 16384, _))
+ EXPECT_CALL(*device, writeImpl(0, 16384, _, _))
.WillOnce(testing::Invoke(
- [&device](uint64_t offset, uint32_t size, const void* data) {
+ [&device](uint64_t offset, uint32_t size, const void* data, int) {
// Note that all items are aligned to 512 bytes in in-mem buffer
// stacked mode, and we write around 800 bytes, so each is aligned
// to 1024 bytes
@@ -703,7 +703,7 @@ TEST(BlockCache, RegionUnderflow) {
std::vector<uint32_t> hits(4);
auto policy = std::make_unique<NiceMock<MockPolicy>>(&hits);
auto device = std::make_unique<NiceMock<MockDevice>>(kDeviceSize, 1024);
- EXPECT_CALL(*device, writeImpl(0, 16 * 1024, _));
+ EXPECT_CALL(*device, writeImpl(0, 16 * 1024, _, _));
// Although 2k read buffer, shouldn't underflow the region!
EXPECT_CALL(*device, readImpl(0, 1024, _));
auto ex = makeJobScheduler();
@@ -730,7 +730,7 @@ TEST(BlockCache, SmallReadBuffer) {
auto policy = std::make_unique<NiceMock<MockPolicy>>(&hits);
auto device = std::make_unique<NiceMock<MockDevice>>(
kDeviceSize, 4096 /* io alignment size */);
- EXPECT_CALL(*device, writeImpl(0, 16 * 1024, _));
+ EXPECT_CALL(*device, writeImpl(0, 16 * 1024, _, _));
EXPECT_CALL(*device, readImpl(0, 8192, _));
auto ex = makeJobScheduler();
auto config = makeConfig(*ex, std::move(policy), *device);
@@ -1057,10 +1057,11 @@ TEST(BlockCache, DeviceFailure) {
auto device = std::make_unique<NiceMock<MockDevice>>(kDeviceSize, 1024);
{
testing::InSequence seq;
- EXPECT_CALL(*device, writeImpl(0, kRegionSize, _)).WillOnce(Return(false));
- EXPECT_CALL(*device, writeImpl(0, kRegionSize, _));
- EXPECT_CALL(*device, writeImpl(kRegionSize, kRegionSize, _));
- EXPECT_CALL(*device, writeImpl(kRegionSize * 2, kRegionSize, _));
+ EXPECT_CALL(*device, writeImpl(0, kRegionSize, _, _))
+     .WillOnce(Return(false));
+ EXPECT_CALL(*device, writeImpl(0, kRegionSize, _, _));
+ EXPECT_CALL(*device, writeImpl(kRegionSize, kRegionSize, _, _));
+ EXPECT_CALL(*device, writeImpl(kRegionSize * 2, kRegionSize, _, _));

EXPECT_CALL(*device, readImpl(0, 1024, _));
EXPECT_CALL(*device, readImpl(kRegionSize, 1024, _))
@@ -1116,7 +1117,7 @@ namespace {
std::unique_ptr<Device> setupResetTestDevice(uint32_t size) {
auto device = std::make_unique<NiceMock<MockDevice>>(size, 512);
for (uint32_t i = 0; i < 2; i++) {
- EXPECT_CALL(*device, writeImpl(i * 16 * 1024, 16 * 1024, _));
+ EXPECT_CALL(*device, writeImpl(i * 16 * 1024, 16 * 1024, _, _));
}
return device;
}
@@ -2080,7 +2081,7 @@ TEST(BlockCache, DeviceFlushFailureSync) {
auto device = std::make_unique<MockDevice>(kDeviceSize, 1024);

testing::InSequence inSeq;
- EXPECT_CALL(*device, writeImpl(_, _, _)).WillRepeatedly(Return(false));
+ EXPECT_CALL(*device, writeImpl(_, _, _, _)).WillRepeatedly(Return(false));

auto ex = makeJobScheduler();
auto config = makeConfig(*ex, std::move(policy), *device);
@@ -2122,7 +2123,7 @@ TEST(BlockCache, DeviceFlushFailureAsync) {
auto device = std::make_unique<MockDevice>(kDeviceSize, 1024);

testing::InSequence inSeq;
- EXPECT_CALL(*device, writeImpl(_, _, _)).WillRepeatedly(Return(false));
+ EXPECT_CALL(*device, writeImpl(_, _, _, _)).WillRepeatedly(Return(false));

auto ex = makeJobScheduler();
auto config = makeConfig(*ex, std::move(policy), *device);
4 changes: 2 additions & 2 deletions cachelib/navy/block_cache/tests/RegionManagerTest.cpp
@@ -396,7 +396,7 @@ TEST(RegionManager, cleanupRegionFailureSync) {

std::thread flushThread{[&sp, &device, &rm, &rid] {
// Make sure flush will fail
- EXPECT_CALL(*device, writeImpl(_, _, _)).WillRepeatedly(Return(false));
+ EXPECT_CALL(*device, writeImpl(_, _, _, _)).WillRepeatedly(Return(false));
sp.wait(0); // Flush after active reader
rm->doFlush(rid, false /* async */);
}};
@@ -505,7 +505,7 @@ TEST(RegionManager, cleanupRegionFailureAsync) {

std::thread flushThread{[&sp, &device, &rm, &rid] {
// Make sure flush will fail
- EXPECT_CALL(*device, writeImpl(_, _, _)).WillRepeatedly(Return(false));
+ EXPECT_CALL(*device, writeImpl(_, _, _, _)).WillRepeatedly(Return(false));
sp.wait(0); // Flush after active reader
rm->doFlush(rid, true /* async */);
}};