diff --git a/appendable-rs/Cargo.lock b/appendable-rs/Cargo.lock index d1722bab..66a24f91 100644 --- a/appendable-rs/Cargo.lock +++ b/appendable-rs/Cargo.lock @@ -5,11 +5,34 @@ version = 3 [[package]] name = "appendable" version = "0.1.0" +dependencies = [ + "protocol", + "tempfile", + "xxhash-rust", +] + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" [[package]] name = "btree" version = "0.1.0" +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + [[package]] name = "cmd" version = "0.1.0" @@ -18,6 +41,141 @@ version = "0.1.0" name = "encoding" version = "0.1.0" +[[package]] +name = "errno" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "fastrand" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" + +[[package]] +name = "libc" +version = "0.2.152" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13e3bf6590cbc649f4d1a3eefc9d5d6eb746f5200ffb04e5e142700b8faa56e7" + +[[package]] +name = "linux-raw-sys" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4cd1a83af159aa67994778be9070f0ae1bd732942279cabb14f86f986a21456" + [[package]] name = "protocol" version = "0.1.0" + +[[package]] +name = "redox_syscall" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +dependencies = [ + "bitflags 1.3.2", +] + +[[package]] +name = "rustix" +version = "0.38.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316" +dependencies = [ + "bitflags 2.4.1", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + +[[package]] +name = "tempfile" +version = "3.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" +dependencies = [ + "cfg-if", + "fastrand", + "redox_syscall", + "rustix", + "windows-sys", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" + +[[package]] +name = "xxhash-rust" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53be06678ed9e83edb1745eb72efc0bbcd7b5c3c35711a860906aed827a13d61" diff --git a/appendable-rs/appendable/Cargo.toml b/appendable-rs/appendable/Cargo.toml index 0d7d91a3..c797260e 100644 --- a/appendable-rs/appendable/Cargo.toml +++ b/appendable-rs/appendable/Cargo.toml @@ -6,3 +6,9 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +protocol = { path="../protocol" } +xxhash-rust = { version = "0.8.8", features = ["xxh3"] } + +[dev-dependencies] +tempfile = "3.9.0" + diff --git a/appendable-rs/appendable/mock_data.jsonl b/appendable-rs/appendable/mock_data.jsonl new file mode 100644 index 00000000..ae3c4543 --- /dev/null +++ b/appendable-rs/appendable/mock_data.jsonl @@ -0,0 +1,2 @@ +{"name": "matteo", "id": 2, "alpha": ["a", "b", "c"]} +{"name": "kevin", "id": 1, "alpha": ["x", "y", "z"]} diff --git a/appendable-rs/appendable/src/handler/jsonl_handler.rs b/appendable-rs/appendable/src/handler/jsonl_handler.rs new file mode 100644 index 00000000..0523c1fb --- /dev/null +++ b/appendable-rs/appendable/src/handler/jsonl_handler.rs @@ -0,0 +1,71 @@ +use crate::index_file::IndexFile; +use crate::io::DataHandler; +use std::io::{BufRead, BufReader, Cursor, Seek, SeekFrom}; +use xxhash_rust::xxh3::Xxh3; + +pub struct JSONLHandler { + // todo! change to borrowed type like &[u8] -- spent too long battling lifetimes + reader: BufReader>>, + xxh3: Xxh3, +} +impl JSONLHandler { + pub fn new(data: Vec) -> Self { + JSONLHandler { + reader: BufReader::new(Cursor::new(data)), + xxh3: Xxh3::new(), + } + } +} +impl Seek for JSONLHandler { + fn seek(&mut self, pos: SeekFrom) -> std::io::Result { + self.reader.seek(pos) + } +} +impl DataHandler for JSONLHandler { + fn synchronize(&mut self, index_file: &mut IndexFile) -> Result<(), String> { + let mut line = String::new(); + let mut start_offset: u64 = 0; + + while self + .reader + .read_line(&mut line) + .map_err(|e| e.to_string())? + > 0 + { + let existing_count = index_file.end_byte_offsets.len(); + // compute byte_offset for current line + let line_length = line.as_bytes().len() as u64; + let current_offset = start_offset + line_length + 1; + index_file.end_byte_offsets.push(current_offset); + + // compute checksum + self.xxh3.update(line.as_bytes()); + let checksum = self.xxh3.digest(); // produce the final hash value + index_file.checksums.push(checksum); + + // Process the JSON line and update indexes + handle_json_object( + line.into_bytes(), + index_file, + &mut vec![], + existing_count as u64, + start_offset, + )?; + + start_offset = current_offset; + line.clear(); + } + + Ok(()) + } +} + +fn handle_json_object( + json_line: Vec, + index_file: &mut IndexFile, + path: &mut Vec, + data_index: u64, + data_offset: u64, +) -> Result { + Ok(1) +} diff --git a/appendable-rs/appendable/src/index_file.rs b/appendable-rs/appendable/src/index_file.rs new file mode 100644 index 00000000..99ab40f8 --- /dev/null +++ b/appendable-rs/appendable/src/index_file.rs @@ -0,0 +1,127 @@ +use crate::io::DataHandler; +use protocol::field_type::FieldFlags; +use protocol::{FieldType, IndexRecord, Version}; +use std::collections::HashMap; +use std::fmt; +use std::fmt::Formatter; + +const CURRENT_VERSION: Version = 1; + +pub(crate) struct Index { + pub(crate) field_name: String, + pub(crate) field_type: FieldFlags, + pub(crate) index_records: HashMap>, +} + +/// `IndexFile` is a representation of the entire index file. +pub struct IndexFile { + version: Version, + pub(crate) indexes: Vec, + pub(crate) end_byte_offsets: Vec, + pub(crate) checksums: Vec, + tail: u32, +} + +impl IndexFile { + pub fn new(mut data_handler: Box) -> Result { + let mut file = IndexFile { + version: CURRENT_VERSION, + indexes: Vec::new(), + end_byte_offsets: Vec::new(), + checksums: Vec::new(), + tail: 0, + }; + + data_handler.synchronize(&mut file)?; + + Ok(file) + } + + pub(crate) fn find_index(&mut self, name: &str, value: &IndexKey) -> usize { + if let Some((position, _)) = self + .indexes + .iter() + .enumerate() + .find(|(_, index)| index.field_name == name) + { + if !self.indexes[position] + .field_type + .contains(value.field_type()) + { + self.indexes[position].field_type.set(value.field_type()); + } + + position + } else { + let mut new_index = Index { + field_name: name.to_string(), + field_type: FieldFlags::new(), + index_records: HashMap::new(), + }; + + new_index.field_type.set(value.field_type()); + self.indexes.push(new_index); + self.indexes.len() - 1 + } + } +} + +/// `IndexKey` addresses the dynamic typing of keys in `IndexRecord` by stating all possible variants +#[derive(Eq, PartialEq, Debug, Clone)] +pub enum IndexKey { + String(String), + Number(String), + Boolean(bool), + Array(Vec), + Object(HashMap), +} + +impl IndexKey { + fn field_type(&self) -> FieldType { + match self { + IndexKey::String(_) => FieldType::String, + IndexKey::Number(_) => FieldType::Number, + IndexKey::Boolean(_) => FieldType::Boolean, + IndexKey::Array(_) => FieldType::Array, + IndexKey::Object(_) => FieldType::Object, + } + } +} + +impl fmt::Display for Index { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!( + f, + "Field\nname: {}\n\ttype: {:?}\n\tindex_records: {:?}", + self.field_name, self.field_type, self.index_records + ) + } +} + +impl fmt::Display for IndexKey { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self { + IndexKey::String(s) => write!(f, "{}", s), + IndexKey::Number(n) => write!(f, "{}", n), + IndexKey::Boolean(b) => write!(f, "{}", b), + IndexKey::Array(v) => { + let elements = v + .iter() + .map(|element| format!("{}", element)) + .collect::>() + .join(", "); + + write!(f, "[{}]", elements) + } + IndexKey::Object(o) => { + let entries = o + .iter() + .map(|(key, value)| format!("{}: {}", key, value)) + .collect::>() + .join(", "); + + write!(f, "{{{}}}", entries) + } + } + } +} diff --git a/appendable-rs/appendable/src/io.rs b/appendable-rs/appendable/src/io.rs new file mode 100644 index 00000000..138aae92 --- /dev/null +++ b/appendable-rs/appendable/src/io.rs @@ -0,0 +1,6 @@ +use crate::index_file::{Index, IndexFile}; +use std::io::Seek; + +pub trait DataHandler: Seek { + fn synchronize(&mut self, index_file: &mut IndexFile) -> Result<(), String>; +} diff --git a/appendable-rs/appendable/src/json_tokenizer.rs b/appendable-rs/appendable/src/json_tokenizer.rs new file mode 100644 index 00000000..74223c17 --- /dev/null +++ b/appendable-rs/appendable/src/json_tokenizer.rs @@ -0,0 +1,87 @@ +pub enum Token { + OpenBracket, + CloseBracket, + Colon, + Comma, + String(String), + Number(String), + Boolean(bool), + OpenArray, + CloseArray, + Null, +} + +pub struct JSONTokenizer { + input: Vec, + position: usize, +} + +impl JSONTokenizer { + pub(crate) fn new(input: Vec) -> Self { + Self { input, position: 0 } + } + + pub(crate) fn next(&mut self) -> Result, String> { + // edge case: check if we've reached the end of line + if self.position >= self.input.len() { + Ok(None) + } else { + let current_byte = self.input[self.position]; + + return match current_byte { + b'{' => { + self.position += 1; + Ok(Some((Token::OpenBracket, self.position - 1))) + } + b'}' => { + self.position += 1; + Ok(Some((Token::CloseBracket, self.position - 1))) + } + b'[' => { + self.position += 1; + Ok(Some((Token::OpenArray, self.position - 1))) + } + b']' => { + self.position += 1; + Ok(Some((Token::CloseArray, self.position - 1))) + } + b'\"' => { + self.position += 1; + self.tokenize_string() + } + b':' => { + self.position += 1; + Ok(Some((Token::Colon, self.position - 1))) + } + _ => Err(format!( + "Unexpected character at position {}", + self.position - 1 + )), + }; + } + } + + fn tokenize_string(&mut self) -> Result, String> { + let start_position = self.position; + + while self.position < self.input.len() { + let current_byte = self.input[start_position]; + + match current_byte { + b'\"' => { + self.position += 1; + Ok(Some((Token::String, start_position))) + } + b'\\' => { + self.position += 2; // skip \n + continue; + } + _ => { + self.position += 1; + } + } + } + + Err("Unterminated string".to_string()) + } +} diff --git a/appendable-rs/appendable/src/lib.rs b/appendable-rs/appendable/src/lib.rs index 7d12d9af..47d37cd1 100644 --- a/appendable-rs/appendable/src/lib.rs +++ b/appendable-rs/appendable/src/lib.rs @@ -1,14 +1,11 @@ -pub fn add(left: usize, right: usize) -> usize { - left + right -} +pub mod index_file; +pub mod io; +mod json_tokenizer; -#[cfg(test)] -mod tests { - use super::*; +pub mod tests { + pub mod jsonl_index_file; +} - #[test] - fn it_works() { - let result = add(2, 2); - assert_eq!(result, 4); - } +pub mod handler { + pub mod jsonl_handler; } diff --git a/appendable-rs/appendable/src/tests/jsonl_index_file.rs b/appendable-rs/appendable/src/tests/jsonl_index_file.rs new file mode 100644 index 00000000..9fb96709 --- /dev/null +++ b/appendable-rs/appendable/src/tests/jsonl_index_file.rs @@ -0,0 +1,39 @@ +#[cfg(test)] +mod tests { + use crate::handler::jsonl_handler::JSONLHandler; + use crate::index_file::IndexFile; + use std::fs; + use std::io::Write; + use std::path::PathBuf; + use tempfile::NamedTempFile; + + fn mock_jsonl_file_to_disk() -> std::io::Result { + let mut temp_file = NamedTempFile::new()?; + + writeln!( + temp_file, + r#"{{"name": "matteo", "id": 2, "alpha": ["a", "b", "c"]}}"# + )?; + writeln!( + temp_file, + r#"{{"name": "kevin", "id": 1, "alpha": ["x", "y", "z"]}}"# + )?; + + let file_path = temp_file.into_temp_path(); + let persisted_file = file_path.keep()?; + Ok(persisted_file) + } + + #[test] + fn create_index_file() { + let file_path = mock_jsonl_file_to_disk().expect("Failed to create mock file"); + let data = fs::read(&file_path).expect("Unable to read mock file"); + + let jsonl_handler = JSONLHandler::new(data); + let index_file = IndexFile::new(Box::new(jsonl_handler)); + + assert!(index_file.is_ok()); + + let index_file = index_file.unwrap(); + } +} diff --git a/appendable-rs/protocol/src/field_type.rs b/appendable-rs/protocol/src/field_type.rs new file mode 100644 index 00000000..0b925e54 --- /dev/null +++ b/appendable-rs/protocol/src/field_type.rs @@ -0,0 +1,76 @@ +/// `FieldType` represents the type of data stored in the field, which follows JSON types excluding Object and null. Object is broken down into subfields and null is not stored. +pub enum FieldType { + String, + Number, + Object, + Array, + Boolean, + Null, +} + +/// `FieldFlags` is left as u64 to avoid shooting ourselves in the foot if we want to support more types in the future via other file formats +#[derive(Debug)] +pub struct FieldFlags { + flags: u64, +} + +impl FieldFlags { + pub fn new() -> Self { + FieldFlags { flags: 0 } + } + + pub fn set(&mut self, field: FieldType) { + match field { + FieldType::String => self.flags |= 1 << 0, + FieldType::Number => self.flags |= 1 << 1, + FieldType::Object => self.flags |= 1 << 2, + FieldType::Array => self.flags |= 1 << 3, + FieldType::Boolean => self.flags |= 1 << 4, + FieldType::Null => self.flags |= 1 << 5, + } + } + + pub fn contains(&self, field: FieldType) -> bool { + match field { + FieldType::String => (self.flags & (1 << 0)) != 0, + FieldType::Number => (self.flags & (1 << 1)) != 0, + FieldType::Object => (self.flags & (1 << 2)) != 0, + FieldType::Array => (self.flags & (1 << 3)) != 0, + FieldType::Boolean => (self.flags & (1 << 4)) != 0, + FieldType::Null => (self.flags & (1 << 5)) != 0, + } + } + + pub fn typescript_type(&self) -> String { + let mut components = Vec::new(); + + if self.contains(FieldType::String) { + components.push("string"); + } + + if self.contains(FieldType::Number) { + components.push("number"); + } + + if self.contains(FieldType::Object) { + components.push("Record"); + } + + if self.contains(FieldType::Array) { + components.push("any[]"); + } + + if self.contains(FieldType::Boolean) { + components.push("boolean"); + } + + if self.contains(FieldType::Null) { + components.push("null"); + } + + match components.is_empty() { + true => "unknown".to_string(), + false => components.join(" | "), + } + } +} diff --git a/appendable-rs/protocol/src/lib.rs b/appendable-rs/protocol/src/lib.rs index 7d12d9af..31c360e0 100644 --- a/appendable-rs/protocol/src/lib.rs +++ b/appendable-rs/protocol/src/lib.rs @@ -1,14 +1,13 @@ -pub fn add(left: usize, right: usize) -> usize { - left + right -} +pub mod field_type; +pub mod protocol; -#[cfg(test)] -mod tests { - use super::*; +pub use protocol::{ + IndexFileHeader, + IndexHeader, + IndexRecord, + Version +}; - #[test] - fn it_works() { - let result = add(2, 2); - assert_eq!(result, 4); - } -} +pub use field_type::{ + FieldType +}; \ No newline at end of file diff --git a/appendable-rs/protocol/src/protocol.rs b/appendable-rs/protocol/src/protocol.rs new file mode 100644 index 00000000..9b7439e4 --- /dev/null +++ b/appendable-rs/protocol/src/protocol.rs @@ -0,0 +1,88 @@ +use crate::field_type::FieldType; +use std::fmt::{Debug, Display, Formatter}; + +/* +The overall index file for AppendableDB is structured as: + ++-----------------------+ +| Version | ++-----------------------+ +| IndexFileHeader | ++-----------------------+ +| IndexHeader | ++-----------------------+ +| ... | ++-----------------------+ +| IndexHeader | ++-----------------------+ +| IndexRecord | ++-----------------------+ +| ... | ++-----------------------+ +| IndexRecord | ++-----------------------+ +| EndByteOffset | ++-----------------------+ +| ... | ++-----------------------+ +| EndByteOffset | ++-----------------------+ +| Checksum | ++-----------------------+ +| ... | ++-----------------------+ +| Checksum | ++-----------------------+ +*/ + +/// `Version` is the version of AppendableDB this library is compatible with. +pub type Version = u8; + +/// `IndexFileHeader` is the header of the index file. +/// +/// # Attributes +/// - `index_length` represents the number of bytes the `IndexHeader` occupy +/// - `data_count` represents the number of data records indexed by this index file +pub struct IndexFileHeader { + index_length: u64, + data_count: u64, +} + +/// `IndexHeader` is the header of each index record. This represents the field available in the data file. +/// +/// # Attributes +/// - `field_type` represents the type of data stored in the field. Note that the field data doesn't need to follow this type, but it is used to determine the Typescript typings for the field. +pub struct IndexHeader { + field_name: String, + field_type: FieldType, + index_record_count: u64, +} + +/// `IndexRecord` +/// +/// # Attributes +/// - `field_start_byte_offset` represents the byte offset of the field in the data file to fetch exactly in the field value. +/// - `field_length` is pessimistic: it is encoded value that is at least as long as the actual field value. +pub struct IndexRecord { + pub data_number: u64, + pub field_start_byte_offset: u64, + pub field_length: u64, +} + +impl Debug for IndexRecord { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self) + } +} +impl Display for IndexRecord { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "data_number: {}\nfield_start_byte_offset: {}\nfield_length: {}\n", + self.data_number, self.field_start_byte_offset, self.field_length + ) + } +} + +// Todo! write out JSON Token() implementation +// Linking: https://github.com/kevmo314/appendable/blob/main/pkg/protocol/protocol.go#L123