feat: add zstd compression support
The zstd experimental feature is enabled so that the upper bound of the Vec capacity to allocate while decompressing data can be calculated using zstd::bulk::Decompressor::upper_bound.

Supported levels are 1 to 22 (plus zstd's negative fast levels down to -128), with 0 defaulting to level 3.
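
A minimal sketch of the intended sizing (a hypothetical helper, not part of this diff; zstd::bulk::compress/decompress are the calls used in the diff below, while Decompressor::upper_bound is assumed to take the compressed bytes and return Option<usize>, as exposed behind the crate's experimental feature):

fn zstd_roundtrip(data: &[u8], level: i32) -> std::io::Result<Vec<u8>> {
    let compressed = zstd::bulk::compress(data, level)?;

    // Upper bound on the decompressed size, read from the frame when available;
    // fall back to the fixed 4 GiB cap used by the decompression paths in this commit.
    let capacity = zstd::bulk::Decompressor::upper_bound(&compressed)
        .unwrap_or(u32::MAX as usize);

    zstd::bulk::decompress(&compressed, capacity)
}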
shanipribadi committed Sep 25, 2024
1 parent 6000602 commit d566645
Showing 7 changed files with 141 additions and 21 deletions.
8 changes: 5 additions & 3 deletions Cargo.toml
@@ -20,8 +20,9 @@ path = "src/lib.rs"
default = []
lz4 = ["dep:lz4_flex"]
miniz = ["dep:miniz_oxide"]
zstd = ["dep:zstd"]
bloom = []
all = ["bloom", "lz4", "miniz"]
all = ["bloom", "lz4", "miniz", "zstd"]

[dependencies]
byteorder = "1.5.0"
@@ -40,6 +41,7 @@ tempfile = "3.12.0"
value-log = "1.0.0"
varint-rs = "2.2.0"
xxhash-rust = { version = "0.8.12", features = ["xxh3"] }
zstd = { version = "0.13.2", optional = true, features = ["experimental"] }

[dev-dependencies]
criterion = { version = "0.5.1", features = ["html_reports"] }
@@ -72,13 +74,13 @@ required-features = ["bloom"]
name = "block"
harness = false
path = "benches/block.rs"
required-features = ["lz4", "miniz"]
required-features = ["lz4", "miniz", "zstd"]

[[bench]]
name = "tree"
harness = false
path = "benches/tree.rs"
required-features = ["lz4", "miniz"]
required-features = ["lz4", "miniz", "zstd"]

[[bench]]
name = "level_manifest"
7 changes: 6 additions & 1 deletion benches/block.rs
@@ -1,11 +1,11 @@
use criterion::{criterion_group, criterion_main, Criterion};
use lsm_tree::{
coding::Encode,
segment::{
block::{header::Header as BlockHeader, ItemSize},
meta::CompressionType,
value_block::ValueBlock,
},
serde::Serializable,
Checksum, InternalValue,
};
use std::io::Write;
@@ -106,6 +106,11 @@ fn load_value_block_from_disk(c: &mut Criterion) {
CompressionType::None,
CompressionType::Lz4,
CompressionType::Miniz(6),
CompressionType::Zstd(-3),
CompressionType::Zstd(-1),
CompressionType::Zstd(1),
CompressionType::Zstd(3),
CompressionType::Zstd(12),
] {
for block_size in [1, 4, 8, 16, 32, 64, 128] {
let block_size = block_size * 1_024;
8 changes: 4 additions & 4 deletions benches/tree.rs
@@ -181,7 +181,7 @@ fn tree_get_pairs(c: &mut Criterion) {
{
let folder = tempfile::tempdir().unwrap();
let tree = Config::new(folder)
.block_size(1_024)
.data_block_size(1_024)
.block_cache(Arc::new(BlockCache::with_capacity_bytes(0)))
.open()
.unwrap();
@@ -219,7 +219,7 @@ fn tree_get_pairs(c: &mut Criterion) {
{
let folder = tempfile::tempdir().unwrap();
let tree = Config::new(folder)
.block_size(1_024)
.data_block_size(1_024)
.block_cache(Arc::new(BlockCache::with_capacity_bytes(0)))
.open()
.unwrap();
@@ -262,7 +262,7 @@ fn disk_point_read(c: &mut Criterion) {
let folder = tempdir().unwrap();

let tree = Config::new(folder)
.block_size(1_024)
.data_block_size(1_024)
.block_cache(Arc::new(BlockCache::with_capacity_bytes(0)))
.open()
.unwrap();
@@ -300,7 +300,7 @@ fn disjoint_tree_minmax(c: &mut Criterion) {
let folder = tempfile::tempdir().unwrap();

let tree = Config::new(folder)
.block_size(1_024)
.data_block_size(1_024)
.block_cache(Arc::new(BlockCache::with_capacity_bytes(0)))
.open()
.unwrap();
11 changes: 11 additions & 0 deletions src/blob_tree/compression.rs
@@ -24,6 +24,9 @@ impl Compressor for MyCompressor {

#[cfg(feature = "miniz")]
CompressionType::Miniz(lvl) => miniz_oxide::deflate::compress_to_vec(bytes, lvl),

#[cfg(feature = "zstd")]
CompressionType::Zstd(level) => zstd::bulk::compress(bytes, level)?,
})
}

@@ -39,6 +42,14 @@ impl Compressor for MyCompressor {
#[cfg(feature = "miniz")]
CompressionType::Miniz(_) => miniz_oxide::inflate::decompress_to_vec(bytes)
.map_err(|_| value_log::Error::Decompress),

#[cfg(feature = "zstd")]
CompressionType::Zstd(_) => zstd::bulk::decompress(
bytes,
// TODO: assuming 4GB output size max
u32::MAX as usize,
)
.map_err(|_| value_log::Error::Decompress),
}
}
}
9 changes: 9 additions & 0 deletions src/segment/block/mod.rs
@@ -59,6 +59,12 @@ impl<T: Clone + Encode + Decode + ItemSize> Block<T> {
miniz_oxide::inflate::decompress_to_vec(&bytes)
.map_err(|_| crate::Error::Decompress(header.compression))?
}
#[cfg(feature = "zstd")]
super::meta::CompressionType::Zstd(_) => {
// TODO: assuming 4GB output size max
zstd::bulk::decompress(&bytes, u32::MAX as usize)
.map_err(|_| crate::Error::Decompress(header.compression))?
}
};
let mut bytes = Cursor::new(bytes);

@@ -130,6 +136,9 @@ impl<T: Clone + Encode + Decode + ItemSize> Block<T> {

#[cfg(feature = "miniz")]
CompressionType::Miniz(level) => miniz_oxide::deflate::compress_to_vec(&buf, level),

#[cfg(feature = "zstd")]
CompressionType::Zstd(level) => zstd::bulk::compress(&buf, level)?,
})
}
}
78 changes: 65 additions & 13 deletions src/segment/meta/compression.rs
@@ -33,6 +33,17 @@ pub enum CompressionType {
/// - 10 may save even more space than 9, but the speed trade off may not be worth it
#[cfg(feature = "miniz")]
Miniz(u8),

/// zstd compression
///
/// The compression level (-128 to 22) can be adjusted.
///
/// - -128 to -1 are the fast ("negative") compression levels; lower is faster but compresses worse
/// - A level of `0` uses zstd's default (currently `3`).
/// - 1 to 19 are the normal compression levels; higher is slower (1 is fastest, 3 is the default, 12 is roughly as fast as gzip level 6)
/// - 20 to 22 are the ultra compression levels, which increase memory usage during both compression and decompression
#[cfg(feature = "zstd")]
Zstd(i32),
}
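
As a rough usage sketch (CompressionType is re-exported at the crate root, as the test at the bottom of this diff shows; the helper name and the particular levels here are illustrative):

use lsm_tree::CompressionType;

// Requires the `zstd` feature. Negative levels trade compression ratio for
// speed, 0 maps to zstd's default (currently 3), and 20..=22 are the
// memory-hungry "ultra" levels.
fn pick_level(favor_speed: bool) -> CompressionType {
    if favor_speed {
        CompressionType::Zstd(-5)
    } else {
        CompressionType::Zstd(19)
    }
}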

impl Encode for CompressionType {
@@ -56,6 +67,17 @@ impl Encode for CompressionType {
writer.write_u8(2)?;
writer.write_u8(*level)?;
}

#[cfg(feature = "zstd")]
Self::Zstd(level) => {
assert!(
*level >= -128 && *level <= 22,
"invalid zstd compression level"
);
writer.write_u8(3)?;
// the level fits in a single signed byte (-128..=22); `as u8` keeps the two's-complement bit pattern
writer.write_u8(*level as u8)?;
}
};

Ok(())
@@ -87,26 +109,38 @@ impl Decode for CompressionType {
Ok(Self::Miniz(level))
}

#[cfg(feature = "zstd")]
3 => {
// read the level back as a signed byte and sign-extend to i32
let level = reader.read_u8()? as i8 as i32;

assert!(
level >= -128 && level <= 22,
"invalid zstd compression level"
);

Ok(Self::Zstd(level))
}

tag => Err(DecodeError::InvalidTag(("CompressionType", tag))),
}
}
}
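
A sketch of the resulting two-byte wire format, using the encode_into_vec helper the tests below rely on (the import path for Encode follows the bench above and is assumed here):

use lsm_tree::{coding::Encode, CompressionType};

fn main() -> lsm_tree::Result<()> {
    // Tag byte 3 identifies zstd; the level is stored as one signed byte,
    // so the whole -128..=22 range round-trips through the cast.
    assert_eq!(CompressionType::Zstd(3).encode_into_vec()?, vec![3u8, 3u8]);
    assert_eq!(CompressionType::Zstd(-3).encode_into_vec()?, vec![3u8, 0xFD]); // -3i8 as u8
    Ok(())
}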

impl std::fmt::Display for CompressionType {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{}",
match self {
Self::None => "no compression",
match self {
Self::None => write!(f, "no compression"),

#[cfg(feature = "lz4")]
Self::Lz4 => "lz4",
#[cfg(feature = "lz4")]
Self::Lz4 => write!(f, "lz4"),

#[cfg(feature = "miniz")]
Self::Miniz(_) => "miniz",
}
)
#[cfg(feature = "miniz")]
Self::Miniz(_) => write!(f, "miniz"),

#[cfg(feature = "zstd")]
Self::Zstd(level) => write!(f, "zstd({})", level),
}
}
}

@@ -118,6 +152,7 @@ mod tests {
fn compression_serialize_none() -> crate::Result<()> {
let serialized = CompressionType::None.encode_into_vec()?;
assert_eq!(2, serialized.len());
assert_eq!(vec![0u8, 0u8], serialized);
Ok(())
}

@@ -126,9 +161,10 @@
use super::*;

#[test_log::test]
fn compression_serialize_none() -> crate::Result<()> {
fn compression_serialize_lz4() -> crate::Result<()> {
let serialized = CompressionType::Lz4.encode_into_vec()?;
assert_eq!(2, serialized.len());
assert_eq!(vec![1u8, 0u8], serialized);
Ok(())
}
}
@@ -138,10 +174,26 @@
use super::*;

#[test_log::test]
fn compression_serialize_none() -> crate::Result<()> {
fn compression_serialize_miniz() -> crate::Result<()> {
for lvl in 0..10 {
let serialized = CompressionType::Miniz(lvl).encode_into_vec()?;
assert_eq!(2, serialized.len());
assert_eq!(vec![2u8, lvl], serialized);
}
Ok(())
}
}

#[cfg(feature = "zstd")]
mod zstd {
use super::*;

#[test_log::test]
fn compression_serialize_zstd() -> crate::Result<()> {
for lvl in -128..22 {
let serialized = CompressionType::Zstd(lvl).encode_into_vec()?;
assert_eq!(2, serialized.len());
assert_eq!(vec![3u8, lvl as u8], serialized);
}
Ok(())
}
41 changes: 41 additions & 0 deletions tests/blob_simple.rs
@@ -132,3 +132,44 @@ fn blob_tree_simple_compressed_2() -> lsm_tree::Result<()> {

Ok(())
}

#[cfg(feature = "zstd")]
#[test]
fn blob_tree_simple_compressed_zstd() -> lsm_tree::Result<()> {
let folder = tempfile::tempdir()?;
let path = folder.path();

let tree = lsm_tree::Config::new(path)
.compression(lsm_tree::CompressionType::Zstd(3))
.open_as_blob_tree()?;

let big_value = b"neptune!".repeat(128_000);

assert!(tree.get("big")?.is_none());
tree.insert("big", &big_value, 0);
tree.insert("smol", "small value", 0);

let value = tree.get("big")?.expect("should exist");
assert_eq!(&*value, big_value);

tree.flush_active_memtable(0)?;

let value = tree.get("big")?.expect("should exist");
assert_eq!(&*value, big_value);

let value = tree.get("smol")?.expect("should exist");
assert_eq!(&*value, b"small value");

let new_big_value = b"winter!".repeat(128_000);
tree.insert("big", &new_big_value, 1);

let value = tree.get("big")?.expect("should exist");
assert_eq!(&*value, new_big_value);

tree.flush_active_memtable(0)?;

let value = tree.get("big")?.expect("should exist");
assert_eq!(&*value, new_big_value);

Ok(())
}
