From 7b402ab9b6b5f29e69e1809b0355db5fc508aeb8 Mon Sep 17 00:00:00 2001 From: Matthew <38759997+friendlymatthew@users.noreply.github.com> Date: Wed, 10 Jan 2024 08:59:16 -0500 Subject: [PATCH] more work: change to Cursor, start tokenizer --- appendable-rs/appendable/Cargo.toml | 1 - .../appendable/src/handler/jsonl_handler.rs | 44 +++++----- appendable-rs/appendable/src/index_file.rs | 12 +-- appendable-rs/appendable/src/io.rs | 9 +- .../appendable/src/json_tokenizer.rs | 87 +++++++++++++++++++ appendable-rs/appendable/src/lib.rs | 1 + .../appendable/src/tests/jsonl_index_file.rs | 18 ++-- 7 files changed, 124 insertions(+), 48 deletions(-) create mode 100644 appendable-rs/appendable/src/json_tokenizer.rs diff --git a/appendable-rs/appendable/Cargo.toml b/appendable-rs/appendable/Cargo.toml index ff6ec574..c797260e 100644 --- a/appendable-rs/appendable/Cargo.toml +++ b/appendable-rs/appendable/Cargo.toml @@ -7,7 +7,6 @@ edition = "2021" [dependencies] protocol = { path="../protocol" } -serde_json = "1.0.111" xxhash-rust = { version = "0.8.8", features = ["xxh3"] } [dev-dependencies] diff --git a/appendable-rs/appendable/src/handler/jsonl_handler.rs b/appendable-rs/appendable/src/handler/jsonl_handler.rs index 44baff2e..0523c1fb 100644 --- a/appendable-rs/appendable/src/handler/jsonl_handler.rs +++ b/appendable-rs/appendable/src/handler/jsonl_handler.rs @@ -1,18 +1,17 @@ -use crate::index_file::Index; +use crate::index_file::IndexFile; use crate::io::DataHandler; -use serde_json::{Deserializer, Map, Value}; -use std::fs::File; -use std::io::{BufRead, BufReader, Seek, SeekFrom}; +use std::io::{BufRead, BufReader, Cursor, Seek, SeekFrom}; use xxhash_rust::xxh3::Xxh3; pub struct JSONLHandler { - reader: BufReader, + // todo! change to borrowed type like &[u8] -- spent too long battling lifetimes + reader: BufReader>>, xxh3: Xxh3, } impl JSONLHandler { - pub fn new(file: File) -> Self { + pub fn new(data: Vec) -> Self { JSONLHandler { - reader: BufReader::new(file), + reader: BufReader::new(Cursor::new(data)), xxh3: Xxh3::new(), } } @@ -23,12 +22,7 @@ impl Seek for JSONLHandler { } } impl DataHandler for JSONLHandler { - fn synchronize( - &mut self, - indexes: &mut Vec, - end_byte_offsets: &mut Vec, - checksums: &mut Vec, - ) -> Result<(), String> { + fn synchronize(&mut self, index_file: &mut IndexFile) -> Result<(), String> { let mut line = String::new(); let mut start_offset: u64 = 0; @@ -38,19 +32,25 @@ impl DataHandler for JSONLHandler { .map_err(|e| e.to_string())? > 0 { - let existing_count = end_byte_offsets.len(); + let existing_count = index_file.end_byte_offsets.len(); // compute byte_offset for current line let line_length = line.as_bytes().len() as u64; let current_offset = start_offset + line_length + 1; - end_byte_offsets.push(current_offset); + index_file.end_byte_offsets.push(current_offset); // compute checksum self.xxh3.update(line.as_bytes()); let checksum = self.xxh3.digest(); // produce the final hash value - checksums.push(checksum); + index_file.checksums.push(checksum); // Process the JSON line and update indexes - handle_json_object(&line, indexes, vec![], existing_count as u64, start_offset)?; + handle_json_object( + line.into_bytes(), + index_file, + &mut vec![], + existing_count as u64, + start_offset, + )?; start_offset = current_offset; line.clear(); @@ -61,11 +61,11 @@ impl DataHandler for JSONLHandler { } fn handle_json_object( - json_line: &str, - indexes: &mut Vec, - path: Vec, + json_line: Vec, + index_file: &mut IndexFile, + path: &mut Vec, data_index: u64, data_offset: u64, -) -> Result<(), String> { - Ok(()) +) -> Result { + Ok(1) } diff --git a/appendable-rs/appendable/src/index_file.rs b/appendable-rs/appendable/src/index_file.rs index 676ddd28..99ab40f8 100644 --- a/appendable-rs/appendable/src/index_file.rs +++ b/appendable-rs/appendable/src/index_file.rs @@ -8,8 +8,8 @@ use std::fmt::Formatter; const CURRENT_VERSION: Version = 1; pub(crate) struct Index { - field_name: String, - field_type: FieldFlags, + pub(crate) field_name: String, + pub(crate) field_type: FieldFlags, pub(crate) index_records: HashMap>, } @@ -19,7 +19,6 @@ pub struct IndexFile { pub(crate) indexes: Vec, pub(crate) end_byte_offsets: Vec, pub(crate) checksums: Vec, - data: Box, tail: u32, } @@ -28,17 +27,12 @@ impl IndexFile { let mut file = IndexFile { version: CURRENT_VERSION, indexes: Vec::new(), - data: data_handler, end_byte_offsets: Vec::new(), checksums: Vec::new(), tail: 0, }; - file.data.synchronize( - &mut file.indexes, - &mut file.end_byte_offsets, - &mut file.checksums, - )?; + data_handler.synchronize(&mut file)?; Ok(file) } diff --git a/appendable-rs/appendable/src/io.rs b/appendable-rs/appendable/src/io.rs index b259e1bf..138aae92 100644 --- a/appendable-rs/appendable/src/io.rs +++ b/appendable-rs/appendable/src/io.rs @@ -1,11 +1,6 @@ -use crate::index_file::Index; +use crate::index_file::{Index, IndexFile}; use std::io::Seek; pub trait DataHandler: Seek { - fn synchronize( - &mut self, - indexes: &mut Vec, - end_byte_offsets: &mut Vec, - checksums: &mut Vec, - ) -> Result<(), String>; + fn synchronize(&mut self, index_file: &mut IndexFile) -> Result<(), String>; } diff --git a/appendable-rs/appendable/src/json_tokenizer.rs b/appendable-rs/appendable/src/json_tokenizer.rs new file mode 100644 index 00000000..74223c17 --- /dev/null +++ b/appendable-rs/appendable/src/json_tokenizer.rs @@ -0,0 +1,87 @@ +pub enum Token { + OpenBracket, + CloseBracket, + Colon, + Comma, + String(String), + Number(String), + Boolean(bool), + OpenArray, + CloseArray, + Null, +} + +pub struct JSONTokenizer { + input: Vec, + position: usize, +} + +impl JSONTokenizer { + pub(crate) fn new(input: Vec) -> Self { + Self { input, position: 0 } + } + + pub(crate) fn next(&mut self) -> Result, String> { + // edge case: check if we've reached the end of line + if self.position >= self.input.len() { + Ok(None) + } else { + let current_byte = self.input[self.position]; + + return match current_byte { + b'{' => { + self.position += 1; + Ok(Some((Token::OpenBracket, self.position - 1))) + } + b'}' => { + self.position += 1; + Ok(Some((Token::CloseBracket, self.position - 1))) + } + b'[' => { + self.position += 1; + Ok(Some((Token::OpenArray, self.position - 1))) + } + b']' => { + self.position += 1; + Ok(Some((Token::CloseArray, self.position - 1))) + } + b'\"' => { + self.position += 1; + self.tokenize_string() + } + b':' => { + self.position += 1; + Ok(Some((Token::Colon, self.position - 1))) + } + _ => Err(format!( + "Unexpected character at position {}", + self.position - 1 + )), + }; + } + } + + fn tokenize_string(&mut self) -> Result, String> { + let start_position = self.position; + + while self.position < self.input.len() { + let current_byte = self.input[start_position]; + + match current_byte { + b'\"' => { + self.position += 1; + Ok(Some((Token::String, start_position))) + } + b'\\' => { + self.position += 2; // skip \n + continue; + } + _ => { + self.position += 1; + } + } + } + + Err("Unterminated string".to_string()) + } +} diff --git a/appendable-rs/appendable/src/lib.rs b/appendable-rs/appendable/src/lib.rs index d93e4df2..47d37cd1 100644 --- a/appendable-rs/appendable/src/lib.rs +++ b/appendable-rs/appendable/src/lib.rs @@ -1,5 +1,6 @@ pub mod index_file; pub mod io; +mod json_tokenizer; pub mod tests { pub mod jsonl_index_file; diff --git a/appendable-rs/appendable/src/tests/jsonl_index_file.rs b/appendable-rs/appendable/src/tests/jsonl_index_file.rs index 93246578..9fb96709 100644 --- a/appendable-rs/appendable/src/tests/jsonl_index_file.rs +++ b/appendable-rs/appendable/src/tests/jsonl_index_file.rs @@ -2,13 +2,12 @@ mod tests { use crate::handler::jsonl_handler::JSONLHandler; use crate::index_file::IndexFile; - use std::fs::File; + use std::fs; use std::io::Write; - use std::path::Path; + use std::path::PathBuf; use tempfile::NamedTempFile; - fn mock_jsonl_file() -> std::io::Result { - // Create a temporary file + fn mock_jsonl_file_to_disk() -> std::io::Result { let mut temp_file = NamedTempFile::new()?; writeln!( @@ -20,16 +19,17 @@ mod tests { r#"{{"name": "kevin", "id": 1, "alpha": ["x", "y", "z"]}}"# )?; - // Persist the file and return the File handle - let file = temp_file.persist(Path::new("mock_data.jsonl"))?; - Ok(file) + let file_path = temp_file.into_temp_path(); + let persisted_file = file_path.keep()?; + Ok(persisted_file) } #[test] fn create_index_file() { - let file = mock_jsonl_file().expect("Failed to create mock file"); - let jsonl_handler = JSONLHandler::new(file); + let file_path = mock_jsonl_file_to_disk().expect("Failed to create mock file"); + let data = fs::read(&file_path).expect("Unable to read mock file"); + let jsonl_handler = JSONLHandler::new(data); let index_file = IndexFile::new(Box::new(jsonl_handler)); assert!(index_file.is_ok());