From 8695f2e3fd192976bec75c0934e85190545cfba1 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Wed, 14 Feb 2024 10:51:27 -0500 Subject: [PATCH 1/4] fix: use offset as a disambiguator Fixes uniqueness issue --- pkg/btree/bptree.go | 36 +++++++------- pkg/btree/bptree_test.go | 94 +++++++++++++++++++---------------- pkg/btree/node.go | 16 ++++++ pkg/handlers/csv_test.go | 9 ++-- pkg/handlers/equality_test.go | 8 +-- pkg/handlers/jsonl_test.go | 17 ++++--- 6 files changed, 103 insertions(+), 77 deletions(-) diff --git a/pkg/btree/bptree.go b/pkg/btree/bptree.go index 7f720456..4e41895d 100644 --- a/pkg/btree/bptree.go +++ b/pkg/btree/bptree.go @@ -4,6 +4,7 @@ import ( "bytes" "fmt" "io" + "log" "slices" ) @@ -40,19 +41,20 @@ func (t *BPTree) root() (*BPTreeNode, MemoryPointer, error) { return root, mp, nil } -func (t *BPTree) Find(key []byte) (MemoryPointer, bool, error) { +func (t *BPTree) Find(key ReferencedValue) (ReferencedValue, MemoryPointer, error) { root, rootOffset, err := t.root() if err != nil { - return MemoryPointer{}, false, fmt.Errorf("read root node: %w", err) + return ReferencedValue{}, MemoryPointer{}, fmt.Errorf("read root node: %w", err) } if root == nil { - return MemoryPointer{}, false, nil + return ReferencedValue{}, MemoryPointer{}, nil } path, err := t.traverse(key, root, rootOffset) if err != nil { - return MemoryPointer{}, false, err + return ReferencedValue{}, MemoryPointer{}, err } - return path[0].node.Pointer(path[0].index), path[0].found, nil + log.Printf("path %#v", path) + return path[0].node.Keys[path[0].index], path[0].node.Pointer(path[0].index), nil } func (t *BPTree) readNode(ptr MemoryPointer) (*BPTreeNode, error) { @@ -76,15 +78,9 @@ type TraversalRecord struct { // traverse returns the path from root to leaf in reverse order (leaf first) // the last element is always the node passed in -func (t *BPTree) traverse(key []byte, node *BPTreeNode, ptr MemoryPointer) ([]TraversalRecord, error) { - // binary search node.Keys to find the first key greater than key (or gte if leaf) - index, found := slices.BinarySearchFunc(node.Keys, ReferencedValue{Value: key}, func(e ReferencedValue, t ReferencedValue) int { - if cmp := bytes.Compare(e.Value, t.Value); cmp == 0 && !node.leaf() { - return -1 - } else { - return cmp - } - }) +func (t *BPTree) traverse(key ReferencedValue, node *BPTreeNode, ptr MemoryPointer) ([]TraversalRecord, error) { + // binary search node.Keys to find the first key greater than key + index, found := slices.BinarySearchFunc(node.Keys, key, CompareReferencedValues) if node.leaf() { return []TraversalRecord{{node: node, index: index, found: found, ptr: ptr}}, nil @@ -122,16 +118,19 @@ func (t *BPTree) Insert(key ReferencedValue, value MemoryPointer) error { return t.meta.SetRoot(MemoryPointer{Offset: uint64(offset), Length: uint32(len(buf))}) } - path, err := t.traverse(key.Value, root, rootOffset) + path, err := t.traverse(key, root, rootOffset) if err != nil { return err } + log.Printf("path: %v", path) + // insert the key into the leaf n := path[0].node - j, _ := slices.BinarySearchFunc(n.Keys, key, func(e ReferencedValue, t ReferencedValue) int { - return bytes.Compare(e.Value, t.Value) - }) + j, found := slices.BinarySearchFunc(n.Keys, key, CompareReferencedValues) + if found { + return fmt.Errorf("key already exists") + } if j == len(n.Keys) { n.Keys = append(n.Keys, key) n.leafPointers = append(n.leafPointers, value) @@ -147,6 +146,7 @@ func (t *BPTree) Insert(key ReferencedValue, value MemoryPointer) error { tr := path[i] n := tr.node if int(n.Size()) > t.tree.PageSize() { + log.Printf("split!") // split the node // mid is the key that will be inserted into the parent mid := len(n.Keys) / 2 diff --git a/pkg/btree/bptree_test.go b/pkg/btree/bptree_test.go index 6b04d8c6..3bb47970 100644 --- a/pkg/btree/bptree_test.go +++ b/pkg/btree/bptree_test.go @@ -1,6 +1,7 @@ package btree import ( + "bytes" "encoding/binary" "math/rand" "testing" @@ -30,11 +31,11 @@ func TestBPTree(t *testing.T) { } tree := NewBPTree(p, &testMetaPage{}) // find a key that doesn't exist - _, found, err := tree.Find([]byte("hello")) + k, _, err := tree.Find(ReferencedValue{Value: []byte("hello")}) if err != nil { t.Fatal(err) } - if found { + if len(k.Value) != 0 { t.Fatal("expected not found") } }) @@ -49,11 +50,11 @@ func TestBPTree(t *testing.T) { if err := tree.Insert(ReferencedValue{Value: []byte("hello")}, MemoryPointer{Offset: 1}); err != nil { t.Fatal(err) } - v, found, err := tree.Find([]byte("hello")) + k, v, err := tree.Find(ReferencedValue{Value: []byte("hello")}) if err != nil { t.Fatal(err) } - if !found { + if !bytes.Equal(k.Value, []byte("hello")) { t.Fatal("expected to find key") } if v.Offset != 1 { @@ -74,21 +75,21 @@ func TestBPTree(t *testing.T) { if err := tree.Insert(ReferencedValue{Value: []byte("world")}, MemoryPointer{Offset: 2}); err != nil { t.Fatal(err) } - v1, f1, err := tree.Find([]byte("hello")) + k1, v1, err := tree.Find(ReferencedValue{Value: []byte("hello")}) if err != nil { t.Fatal(err) } - if !f1 { + if !bytes.Equal(k1.Value, []byte("hello")) { t.Fatal("expected to find key") } if v1.Offset != 1 { t.Fatalf("expected value 1, got %d", v1) } - v2, f2, err := tree.Find([]byte("world")) + k2, v2, err := tree.Find(ReferencedValue{Value: []byte("world")}) if err != nil { t.Fatal(err) } - if !f2 { + if !bytes.Equal(k2.Value, []byte("world")) { t.Fatal("expected to find key") } if v2.Offset != 2 { @@ -115,41 +116,41 @@ func TestBPTree(t *testing.T) { if err := tree.Insert(ReferencedValue{Value: []byte("cooow")}, MemoryPointer{Offset: 4}); err != nil { t.Fatal(err) } - v1, f1, err := tree.Find([]byte("hello")) + k1, v1, err := tree.Find(ReferencedValue{Value: []byte("hello")}) if err != nil { t.Fatal(err) } - if !f1 { + if !bytes.Equal(k1.Value, []byte("hello")) { t.Fatal("expected to find key") } if v1.Offset != 1 { t.Fatalf("expected value 1, got %d", v1) } - v2, f2, err := tree.Find([]byte("world")) + k2, v2, err := tree.Find(ReferencedValue{Value: []byte("world")}) if err != nil { t.Fatal(err) } - if !f2 { + if !bytes.Equal(k2.Value, []byte("world")) { t.Fatal("expected to find key") } if v2.Offset != 2 { t.Fatalf("expected value 2, got %d", v2) } - v3, f3, err := tree.Find([]byte("moooo")) + k3, v3, err := tree.Find(ReferencedValue{Value: []byte("moooo")}) if err != nil { t.Fatal(err) } - if !f3 { + if !bytes.Equal(k3.Value, []byte("moooo")) { t.Fatal("expected to find key") } if v3.Offset != 3 { t.Fatalf("expected value 3, got %d", v3) } - v4, f4, err := tree.Find([]byte("cooow")) + k4, v4, err := tree.Find(ReferencedValue{Value: []byte("cooow")}) if err != nil { t.Fatal(err) } - if !f4 { + if !bytes.Equal(k4.Value, []byte("cooow")) { t.Fatal("expected to find key") } if v4.Offset != 4 { @@ -180,37 +181,39 @@ func TestBPTree(t *testing.T) { t.Fatal(err) } }) +} - t.Run("insertion test", func(t *testing.T) { - b := buftest.NewSeekableBuffer() - p, err := NewPageFile(b) +func TestBPTree_SequentialInsertionTest(t *testing.T) { + b := buftest.NewSeekableBuffer() + p, err := NewPageFile(b) + if err != nil { + t.Fatal(err) + } + tree := NewBPTree(p, &testMetaPage{}) + for i := 0; i < 256; i++ { + buf := make([]byte, 8) + binary.BigEndian.PutUint64(buf, uint64(i)) + if err := tree.Insert(ReferencedValue{Value: buf}, MemoryPointer{Offset: uint64(i)}); err != nil { + t.Fatal(err) + } + } + for i := 0; i < 256; i++ { + buf := make([]byte, 8) + binary.BigEndian.PutUint64(buf, uint64(i)) + k, v, err := tree.Find(ReferencedValue{Value: buf}) if err != nil { t.Fatal(err) } - tree := NewBPTree(p, &testMetaPage{}) - for i := 0; i < 16384; i++ { - buf := make([]byte, 8) - binary.BigEndian.PutUint64(buf, uint64(i)) - if err := tree.Insert(ReferencedValue{Value: buf}, MemoryPointer{Offset: uint64(i)}); err != nil { - t.Fatal(err) - } + if !bytes.Equal(k.Value, buf) { + t.Fatalf("expected to find key %d", i) } - for i := 0; i < 16384; i++ { - buf := make([]byte, 8) - binary.BigEndian.PutUint64(buf, uint64(i)) - v, found, err := tree.Find(buf) - if err != nil { - t.Fatal(err) - } - if !found { - t.Fatalf("expected to find key %d", i) - } - if v.Offset != uint64(i) { - t.Fatalf("expected value %d, got %d", i, v) - } + if v.Offset != uint64(i) { + t.Fatalf("expected value %d, got %d", i, v) } - }) + } +} +func TestBPTree_RandomTests(t *testing.T) { t.Run("random insertion test", func(t *testing.T) { b := buftest.NewSeekableBuffer() p, err := NewPageFile(b) @@ -234,11 +237,11 @@ func TestBPTree(t *testing.T) { if _, err := s.Read(buf); err != nil { t.Fatal(err) } - v, found, err := tree.Find(buf) + k, v, err := tree.Find(ReferencedValue{Value: buf}) if err != nil { t.Fatal(err) } - if !found { + if !bytes.Equal(k.Value, buf) { t.Fatalf("expected to find key %d", i) } if v.Offset != uint64(i) { @@ -254,8 +257,13 @@ func TestBPTree(t *testing.T) { t.Fatal(err) } tree := NewBPTree(p, &testMetaPage{}) + tree.Data = make([]byte, 65536*4+8) for i := 0; i < 65536*4; i++ { - if err := tree.Insert(ReferencedValue{Value: []byte{1, 2, 3, 4, 5, 6, 7, 8}}, MemoryPointer{Offset: uint64(i)}); err != nil { + if err := tree.Insert(ReferencedValue{ + Value: []byte{1, 2, 3, 4, 5, 6, 7, 8}, + // DataPointer is used as a disambiguator. + DataPointer: MemoryPointer{Offset: uint64(i), Length: 8}, + }, MemoryPointer{Offset: uint64(i)}); err != nil { t.Fatal(err) } } diff --git a/pkg/btree/node.go b/pkg/btree/node.go index d224d275..5f5d8af9 100644 --- a/pkg/btree/node.go +++ b/pkg/btree/node.go @@ -1,6 +1,7 @@ package btree import ( + "bytes" "encoding/binary" "fmt" "io" @@ -17,10 +18,25 @@ type ReferencedValue struct { // value is taken to be unreferenced and is stored directly in the node. // if it is set, the value is used for comparison but the value is stored // as a reference to the DataPointer. + // + // caveat: DataPointer is used as a disambiguator for the value. the b+ tree + // implementation does not support duplicate keys and uses the DataPointer + // to disambiguate between keys that compare as equal. DataPointer MemoryPointer Value []byte } +func CompareReferencedValues(a, b ReferencedValue) int { + cmp := bytes.Compare(a.Value, b.Value) + if cmp != 0 { + return cmp + } + if a.DataPointer.Offset != b.DataPointer.Offset { + return int(a.DataPointer.Offset - b.DataPointer.Offset) + } + return int(a.DataPointer.Length - b.DataPointer.Length) +} + type BPTreeNode struct { Data []byte // contains the offset of the child node or the offset of the record for leaf diff --git a/pkg/handlers/csv_test.go b/pkg/handlers/csv_test.go index e2daefc2..c6d9bf68 100644 --- a/pkg/handlers/csv_test.go +++ b/pkg/handlers/csv_test.go @@ -8,6 +8,7 @@ import ( "testing" "github.com/kevmo314/appendable/pkg/appendable" + "github.com/kevmo314/appendable/pkg/btree" "github.com/kevmo314/appendable/pkg/buftest" ) @@ -98,7 +99,7 @@ func TestCSV(t *testing.T) { t.Errorf("got len(i.Indexes) = %d, want 1", len(collected)) } - mp1, found, err := collected[0].BPTree(r2).Find([]byte("test1")) + mp1, found, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test1")}) if err != nil { t.Fatal(err) } @@ -109,7 +110,7 @@ func TestCSV(t *testing.T) { t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = %+v, want {%d, %d}", mp1, len("test\n"), len("test1")) } - mp2, found, err := collected[0].BPTree(r2).Find([]byte("test2")) + mp2, found, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test2")}) if err != nil { t.Fatal(err) } @@ -157,7 +158,7 @@ func TestCSV(t *testing.T) { t.Errorf("got len(i.Indexes) = %d, want 1", len(collected)) } - mp1, found, err := collected[0].BPTree(r2).Find([]byte("test1")) + mp1, found, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test1")}) if err != nil { t.Fatal(err) } @@ -182,7 +183,7 @@ func TestCSV(t *testing.T) { v2 := make([]byte, 8) binary.BigEndian.PutUint64(v2, math.Float64bits(123)) - mp2, found, err := collected[1].BPTree(r2).Find(v2) + mp2, found, err := collected[1].BPTree(r2).Find(btree.ReferencedValue{Value: v2}) if err != nil { t.Fatal(err) } diff --git a/pkg/handlers/equality_test.go b/pkg/handlers/equality_test.go index fed5c944..b487989e 100644 --- a/pkg/handlers/equality_test.go +++ b/pkg/handlers/equality_test.go @@ -186,7 +186,7 @@ func compareMetaPages(i1, i2 []*btree.LinkedMetaPage, jr, cr []byte) (bool, stri if i == 0 { for _, val := range h1 { - _, found, err := collected1.BPTree(jr).Find([]byte(val)) + _, found, err := collected1.BPTree(jr).Find(btree.ReferencedValue{Value: []byte(val)}) if err != nil { return false, fmt.Sprintf("failed to find btree for jsonl reader %v", val) @@ -195,7 +195,7 @@ func compareMetaPages(i1, i2 []*btree.LinkedMetaPage, jr, cr []byte) (bool, stri return false, fmt.Sprintf("failed to find %v for josnl reader", val) } - _, found, err = collected2.BPTree(cr).Find([]byte(val)) + _, found, err = collected2.BPTree(cr).Find(btree.ReferencedValue{Value: []byte(val)}) if err != nil { return false, fmt.Sprintf("failed to find btree for jsonl reader %v", val) @@ -210,7 +210,7 @@ func compareMetaPages(i1, i2 []*btree.LinkedMetaPage, jr, cr []byte) (bool, stri v2 := make([]byte, 8) binary.BigEndian.PutUint64(v2, math.Float64bits(val)) - _, found, err := collected1.BPTree(jr).Find(v2) + _, found, err := collected1.BPTree(jr).Find(btree.ReferencedValue{Value: v2}) if err != nil { return false, fmt.Sprintf("failed to find btree for jsonl reader %v", val) @@ -219,7 +219,7 @@ func compareMetaPages(i1, i2 []*btree.LinkedMetaPage, jr, cr []byte) (bool, stri return false, fmt.Sprintf("failed to find %v for josnl reader", val) } - _, found, err = collected2.BPTree(cr).Find(v2) + _, found, err = collected2.BPTree(cr).Find(btree.ReferencedValue{Value: v2}) if err != nil { return false, fmt.Sprintf("failed to find btree for jsonl reader %v", val) diff --git a/pkg/handlers/jsonl_test.go b/pkg/handlers/jsonl_test.go index dd2bf3ca..9dd9179d 100644 --- a/pkg/handlers/jsonl_test.go +++ b/pkg/handlers/jsonl_test.go @@ -6,6 +6,7 @@ import ( "testing" "github.com/kevmo314/appendable/pkg/appendable" + "github.com/kevmo314/appendable/pkg/btree" "github.com/kevmo314/appendable/pkg/buftest" ) @@ -85,7 +86,7 @@ func TestJSONL(t *testing.T) { t.Errorf("got len(i.Indexes) = %d, want 1", len(collected)) } - mp1, found, err := collected[0].BPTree(r2).Find([]byte("test1")) + mp1, found, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test1")}) if err != nil { t.Fatal(err) } @@ -96,7 +97,7 @@ func TestJSONL(t *testing.T) { t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = %+v, want {0, %d}", mp1, len("{\"test\":\"test1\"}")) } - mp2, found, err := collected[0].BPTree(r2).Find([]byte("test3")) + mp2, found, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test3")}) if err != nil { t.Fatal(err) } @@ -140,7 +141,7 @@ func TestJSONL(t *testing.T) { t.Errorf("got len(i.Indexes) = %d, want 1", len(collected)) } - mp1, found, err := collected[0].BPTree(r2).Find([]byte("test1")) + mp1, found, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test1")}) if err != nil { t.Fatal(err) } @@ -163,7 +164,7 @@ func TestJSONL(t *testing.T) { t.Errorf("got i.Indexes[0].FieldType = %#v, want FieldTypeString", md1.FieldType) } - mp2, found, err := collected[1].BPTree(r2).Find([]byte("test3")) + mp2, found, err := collected[1].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test3")}) if err != nil { t.Fatal(err) } @@ -215,7 +216,7 @@ func TestJSONL(t *testing.T) { t.Errorf("got len(i.Indexes) = %d, want 1", len(collected)) } - mp1, found, err := collected[0].BPTree(r2).Find([]byte("test1")) + mp1, found, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test1")}) if err != nil { t.Fatal(err) } @@ -240,7 +241,7 @@ func TestJSONL(t *testing.T) { v2 := make([]byte, 8) binary.BigEndian.PutUint64(v2, math.Float64bits(123)) - mp2, found, err := collected[1].BPTree(r2).Find(v2) + mp2, found, err := collected[1].BPTree(r2).Find(btree.ReferencedValue{Value: v2}) if err != nil { t.Fatal(err) } @@ -506,7 +507,7 @@ func TestJSONL(t *testing.T) { t.Errorf("got len(i.Indexes) = %d, want 1", len(collected)) } - mp1, found, err := collected[0].BPTree(r2).Find([]byte("test1")) + mp1, found, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test1")}) if err != nil { t.Fatal(err) } @@ -529,7 +530,7 @@ func TestJSONL(t *testing.T) { t.Errorf("got i.Indexes[0].FieldType = %#v, want FieldTypeString", md1.FieldType) } - mp2, found, err := collected[1].BPTree(r2).Find([]byte{}) + mp2, found, err := collected[1].BPTree(r2).Find(btree.ReferencedValue{}) if err != nil { t.Fatal(err) } From 4963415c0b4b04fcabe93345d9259ce47f6dc15d Mon Sep 17 00:00:00 2001 From: Matthew <38759997+friendlymatthew@users.noreply.github.com> Date: Wed, 14 Feb 2024 15:07:41 -0500 Subject: [PATCH 2/4] fix some tests --- pkg/btree/bptree.go | 5 ++- pkg/btree/node.go | 2 + pkg/handlers/csv_test.go | 36 +++++++++++++---- pkg/handlers/equality_test.go | 32 ++++++++++----- pkg/handlers/jsonl.go | 10 ++++- pkg/handlers/jsonl_test.go | 75 +++++++++++++++++++++++++++-------- 6 files changed, 122 insertions(+), 38 deletions(-) diff --git a/pkg/btree/bptree.go b/pkg/btree/bptree.go index 4e41895d..cde709c3 100644 --- a/pkg/btree/bptree.go +++ b/pkg/btree/bptree.go @@ -98,6 +98,7 @@ func (t *BPTree) traverse(key ReferencedValue, node *BPTreeNode, ptr MemoryPoint } func (t *BPTree) Insert(key ReferencedValue, value MemoryPointer) error { + fmt.Printf("\ninsert parameters look like \nkey: %v\nvalue: %v\n\n", key, value) root, rootOffset, err := t.root() if err != nil { return fmt.Errorf("read root node: %w", err) @@ -123,11 +124,13 @@ func (t *BPTree) Insert(key ReferencedValue, value MemoryPointer) error { return err } - log.Printf("path: %v", path) + log.Printf("path: %v\n", path) // insert the key into the leaf n := path[0].node + fmt.Printf("keys %v and key %v", n.Keys, key) j, found := slices.BinarySearchFunc(n.Keys, key, CompareReferencedValues) + fmt.Printf("binary search results: j: %v and found %v", j, found) if found { return fmt.Errorf("key already exists") } diff --git a/pkg/btree/node.go b/pkg/btree/node.go index 5f5d8af9..7c0b0529 100644 --- a/pkg/btree/node.go +++ b/pkg/btree/node.go @@ -31,6 +31,8 @@ func CompareReferencedValues(a, b ReferencedValue) int { if cmp != 0 { return cmp } + + fmt.Printf("the offsets to compare are: %v %v\n", a.DataPointer.Offset, b.DataPointer.Offset) if a.DataPointer.Offset != b.DataPointer.Offset { return int(a.DataPointer.Offset - b.DataPointer.Offset) } diff --git a/pkg/handlers/csv_test.go b/pkg/handlers/csv_test.go index c6d9bf68..9db94e33 100644 --- a/pkg/handlers/csv_test.go +++ b/pkg/handlers/csv_test.go @@ -1,6 +1,7 @@ package handlers import ( + "bytes" "encoding/binary" "log/slog" "math" @@ -99,25 +100,35 @@ func TestCSV(t *testing.T) { t.Errorf("got len(i.Indexes) = %d, want 1", len(collected)) } - mp1, found, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test1")}) + rv1, mp1, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test1")}) if err != nil { t.Fatal(err) } - if !found { + + if mp1 == (btree.MemoryPointer{}) { t.Errorf("got i.Indexes[0].BPTree().Find(test1) = nil, want non-nil") } + + if !bytes.Equal(rv1.Value, []byte("test1")) { + t.Errorf("incorrect values, got %v, want %v", rv1.Value, []byte("test1")) + } + if mp1.Offset != uint64(len("test\n")) || mp1.Length != uint32(len("test1")) { t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = %+v, want {%d, %d}", mp1, len("test\n"), len("test1")) } - mp2, found, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test2")}) + rv2, mp2, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test2")}) if err != nil { t.Fatal(err) } - if !found { + if mp2 == (btree.MemoryPointer{}) { t.Errorf("got i.Indexes[0].BPTree().Find(\"test2\") = nil, want non-nil") } + if !bytes.Equal(rv2.Value, []byte("test2")) { + t.Errorf("incorrect values, got %v, want %v", rv2.Value, []byte("test2")) + } + if mp2.Offset != uint64(len("test\ntest1\n")) || mp2.Length != uint32(len("test2")) { t.Errorf("got i.Indexes[0].BPTree().Find(\"test2\") = %+v, want {%d, %d}", mp2, len("test\ntest1\n"), len("test2")) } @@ -158,13 +169,18 @@ func TestCSV(t *testing.T) { t.Errorf("got len(i.Indexes) = %d, want 1", len(collected)) } - mp1, found, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test1")}) + rv1, mp1, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test1")}) if err != nil { t.Fatal(err) } - if !found { + if mp1 == (btree.MemoryPointer{}) { t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = nil, want non-nil") } + + if !bytes.Equal(rv1.Value, []byte("test1")) { + t.Errorf("incorrect values, got %v, want %v", rv1.Value, []byte("test1")) + } + if mp1.Offset != uint64(len("test\n")) || mp1.Length != uint32(len("test1")) { t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = %+v, want {%d, %d}", mp1, len("test\n"), len("test1")) } @@ -183,13 +199,17 @@ func TestCSV(t *testing.T) { v2 := make([]byte, 8) binary.BigEndian.PutUint64(v2, math.Float64bits(123)) - mp2, found, err := collected[1].BPTree(r2).Find(btree.ReferencedValue{Value: v2}) + rv2, mp2, err := collected[1].BPTree(r2).Find(btree.ReferencedValue{Value: v2}) if err != nil { t.Fatal(err) } - if !found { + if mp2 == (btree.MemoryPointer{}) { t.Errorf("got i.Indexes[1].BPTree().Find(\"test3\") = nil, want non-nil") } + if !bytes.Equal(rv2.Value, v2) { + t.Errorf("incorrect values, got %v, want %v", rv1.Value, v2) + } + if mp2.Offset != uint64(len("test\ntest1\n")) || mp2.Length != uint32(len("123")) { t.Errorf("got i.Indexes[1].BPTree().Find(\"test3\") = %+v, want {%d, %d}", mp2, len("test\ntest1\n"), len("123")) } diff --git a/pkg/handlers/equality_test.go b/pkg/handlers/equality_test.go index b487989e..bf2f2821 100644 --- a/pkg/handlers/equality_test.go +++ b/pkg/handlers/equality_test.go @@ -1,6 +1,7 @@ package handlers import ( + "bytes" "encoding/binary" "fmt" "log/slog" @@ -186,23 +187,28 @@ func compareMetaPages(i1, i2 []*btree.LinkedMetaPage, jr, cr []byte) (bool, stri if i == 0 { for _, val := range h1 { - _, found, err := collected1.BPTree(jr).Find(btree.ReferencedValue{Value: []byte(val)}) + rv1, mp1, err := collected1.BPTree(jr).Find(btree.ReferencedValue{Value: []byte(val)}) if err != nil { return false, fmt.Sprintf("failed to find btree for jsonl reader %v", val) } - if !found { - return false, fmt.Sprintf("failed to find %v for josnl reader", val) + if mp1 == (btree.MemoryPointer{}) { + return false, fmt.Sprintf("failed to find %v for reader", val) } - _, found, err = collected2.BPTree(cr).Find(btree.ReferencedValue{Value: []byte(val)}) + rv2, mp2, err := collected2.BPTree(cr).Find(btree.ReferencedValue{Value: []byte(val)}) if err != nil { return false, fmt.Sprintf("failed to find btree for jsonl reader %v", val) } - if !found { - return false, fmt.Sprintf("failed to find %v for josnl reader", val) + if mp2 == (btree.MemoryPointer{}) { + return false, fmt.Sprintf("failed to find %v for reader", val) + } + + if !bytes.Equal(rv1.Value, rv2.Value) { + return false, fmt.Sprintf("mismatched keys: %v, %v", rv1.Value, rv2.Value) } + } } else if i == 1 { @@ -210,23 +216,29 @@ func compareMetaPages(i1, i2 []*btree.LinkedMetaPage, jr, cr []byte) (bool, stri v2 := make([]byte, 8) binary.BigEndian.PutUint64(v2, math.Float64bits(val)) - _, found, err := collected1.BPTree(jr).Find(btree.ReferencedValue{Value: v2}) + rv1, mp1, err := collected1.BPTree(jr).Find(btree.ReferencedValue{Value: v2}) if err != nil { return false, fmt.Sprintf("failed to find btree for jsonl reader %v", val) } - if !found { + if mp1 == (btree.MemoryPointer{}) { return false, fmt.Sprintf("failed to find %v for josnl reader", val) } - _, found, err = collected2.BPTree(cr).Find(btree.ReferencedValue{Value: v2}) + fmt.Printf("rv1: %v", rv1) + + rv2, mp2, err := collected2.BPTree(cr).Find(btree.ReferencedValue{Value: v2}) if err != nil { return false, fmt.Sprintf("failed to find btree for jsonl reader %v", val) } - if !found { + if mp2 == (btree.MemoryPointer{}) { return false, fmt.Sprintf("failed to find %v for josnl reader", val) } + + if !bytes.Equal(rv1.Value, rv2.Value) { + return false, fmt.Sprintf("mismatched keys: %v, %v", rv1.Value, rv2.Value) + } } } diff --git a/pkg/handlers/jsonl.go b/pkg/handlers/jsonl.go index ee95beef..1980195f 100644 --- a/pkg/handlers/jsonl.go +++ b/pkg/handlers/jsonl.go @@ -185,10 +185,16 @@ func handleJSONLObject(f *appendable.IndexFile, r []byte, dec *json.Decoder, pat return fmt.Errorf("unexpected token '%v'", value) } case nil: + fmt.Printf("\n\ninserting data with offset: %v\n", data.Offset) // nil values are a bit of a degenerate case, we are essentially using the btree // as a set. we store the value as an empty byte slice. - if err := page.BPTree(r).Insert(btree.ReferencedValue{Value: []byte{}}, data); err != nil { - return fmt.Errorf("failed to insert into b+tree: %w", err) + if err := page.BPTree(r).Insert(btree.ReferencedValue{ + Value: []byte{}, + DataPointer: data, + }, data); err != nil { + return fmt.Errorf("failed to insert into b+tree: %w\nmp: %v\n", err, data.Offset) + } else { + fmt.Printf("mp: %v\n", data.Offset) } default: return fmt.Errorf("unexpected type '%T'", value) diff --git a/pkg/handlers/jsonl_test.go b/pkg/handlers/jsonl_test.go index 9dd9179d..160c7f2d 100644 --- a/pkg/handlers/jsonl_test.go +++ b/pkg/handlers/jsonl_test.go @@ -1,6 +1,7 @@ package handlers import ( + "bytes" "encoding/binary" "math" "testing" @@ -86,24 +87,34 @@ func TestJSONL(t *testing.T) { t.Errorf("got len(i.Indexes) = %d, want 1", len(collected)) } - mp1, found, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test1")}) + rv1, mp1, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test1")}) if err != nil { t.Fatal(err) } - if !found { + if mp1 == (btree.MemoryPointer{}) { t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = nil, want non-nil") } + + if !bytes.Equal(rv1.Value, []byte("test1")) { + t.Errorf("incorrect values, got %v, want %v", rv1.Value, []byte("test1")) + } + if mp1.Offset != 0 || mp1.Length != uint32(len("{\"test\":\"test1\"}")) { t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = %+v, want {0, %d}", mp1, len("{\"test\":\"test1\"}")) } - mp2, found, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test3")}) + rv2, mp2, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test3")}) if err != nil { t.Fatal(err) } - if !found { + if mp2 == (btree.MemoryPointer{}) { t.Errorf("got i.Indexes[0].BPTree().Find(\"test3\") = nil, want non-nil") } + + if !bytes.Equal(rv2.Value, []byte("test3")) { + t.Errorf("incorrect values, got %v, want %v", rv2.Value, []byte("test3")) + } + if mp2.Offset != uint64(len("{\"test\":\"test1\"}\n")) || mp2.Length != uint32(len("{\"test\":\"test3\"}")) { t.Errorf("got i.Indexes[0].BPTree().Find(\"test3\") = %+v, want {%d, %d}", mp2, len("{\"test\":\"test1\"}\n"), len("{\"test\":\"test3\"}")) } @@ -141,13 +152,18 @@ func TestJSONL(t *testing.T) { t.Errorf("got len(i.Indexes) = %d, want 1", len(collected)) } - mp1, found, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test1")}) + rv1, mp1, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test1")}) if err != nil { t.Fatal(err) } - if !found { + if mp1 == (btree.MemoryPointer{}) { t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = nil, want non-nil") } + + if !bytes.Equal(rv1.Value, []byte("test1")) { + t.Errorf("incorrect values, got %v, want %v", rv1.Value, []byte("test1")) + } + if mp1.Offset != 0 || mp1.Length != uint32(len("{\"test\":\"test1\"}")) { t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = %+v, want {0, %d}", mp1, len("{\"test\":\"test1\"}")) } @@ -164,13 +180,18 @@ func TestJSONL(t *testing.T) { t.Errorf("got i.Indexes[0].FieldType = %#v, want FieldTypeString", md1.FieldType) } - mp2, found, err := collected[1].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test3")}) + rv2, mp2, err := collected[1].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test3")}) if err != nil { t.Fatal(err) } - if !found { + if mp2 == (btree.MemoryPointer{}) { t.Errorf("got i.Indexes[1].BPTree().Find(\"test3\") = nil, want non-nil") } + + if !bytes.Equal(rv2.Value, []byte("test3")) { + t.Errorf("incorrect values, got %v, want %v", rv2.Value, []byte("test3")) + } + if mp2.Offset != uint64(len("{\"test\":\"test1\"}\n")) || mp2.Length != uint32(len("{\"test2\":\"test3\"}")) { t.Errorf("got i.Indexes[1].BPTree().Find(\"test3\") = %+v, want {%d, %d}", mp2, len("{\"test\":\"test1\"}\n"), len("{\"test2\":\"test3\"}")) } @@ -216,13 +237,18 @@ func TestJSONL(t *testing.T) { t.Errorf("got len(i.Indexes) = %d, want 1", len(collected)) } - mp1, found, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test1")}) + rv1, mp1, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test1")}) if err != nil { t.Fatal(err) } - if !found { + if mp1 == (btree.MemoryPointer{}) { t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = nil, want non-nil") } + + if !bytes.Equal(rv1.Value, []byte("test1")) { + t.Errorf("incorrect values, got %v, want %v", rv1.Value, []byte("test1")) + } + if mp1.Offset != 0 || mp1.Length != uint32(len("{\"test\":\"test1\"}")) { t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = %+v, want {0, %d}", mp1, len("{\"test\":\"test1\"}")) } @@ -241,13 +267,18 @@ func TestJSONL(t *testing.T) { v2 := make([]byte, 8) binary.BigEndian.PutUint64(v2, math.Float64bits(123)) - mp2, found, err := collected[1].BPTree(r2).Find(btree.ReferencedValue{Value: v2}) + rv2, mp2, err := collected[1].BPTree(r2).Find(btree.ReferencedValue{Value: v2}) if err != nil { t.Fatal(err) } - if !found { + if mp2 == (btree.MemoryPointer{}) { t.Errorf("got i.Indexes[1].BPTree().Find(123) = nil, want non-nil") } + + if !bytes.Equal(rv2.Value, v2) { + t.Errorf("incorrect values, got %v, want %v", rv1.Value, v2) + } + if mp2.Offset != uint64(len("{\"test\":\"test1\"}\n")) || mp2.Length != uint32(len("{\"test\":123}")) { t.Errorf("got i.Indexes[1].BPTree().Find(123)= %+v, want {%d, %d}", mp2, len("{\"test\":\"test1\"}\n"), len("{\"test\":123}")) } @@ -507,13 +538,18 @@ func TestJSONL(t *testing.T) { t.Errorf("got len(i.Indexes) = %d, want 1", len(collected)) } - mp1, found, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test1")}) + rv1, mp1, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test1")}) if err != nil { t.Fatal(err) } - if !found { + if mp1 == (btree.MemoryPointer{}) { t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = nil, want non-nil") } + + if !bytes.Equal(rv1.Value, []byte("test1")) { + t.Errorf("incorrect values, got %v, want %v", rv1.Value, []byte("test1")) + } + if mp1.Offset != 0 || mp1.Length != uint32(len("{\"test\":\"test1\"}")) { t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = %+v, want {0, %d}", mp1, len("{\"test\":\"test1\"}")) } @@ -530,13 +566,18 @@ func TestJSONL(t *testing.T) { t.Errorf("got i.Indexes[0].FieldType = %#v, want FieldTypeString", md1.FieldType) } - mp2, found, err := collected[1].BPTree(r2).Find(btree.ReferencedValue{}) + rv2, mp2, err := collected[1].BPTree(r2).Find(btree.ReferencedValue{}) if err != nil { t.Fatal(err) } - if !found { - t.Errorf("got i.Indexes[1].BPTree().Find(\"test3\") = nil, want non-nil") + if mp2 == (btree.MemoryPointer{}) { + t.Errorf("got i.Indexes[1].BPTree().Find(null) = nil, want non-nil") } + + if len(rv2.Value) != 0 { + t.Errorf("incorrect values, got %v, want %v", rv2.Value, "null") + } + if mp2.Offset != uint64(len("{\"test\":\"test1\"}\n")) || mp2.Length != uint32(len("{\"test\":null}")) { t.Errorf("got i.Indexes[1].BPTree().Find(\"test3\") = %+v, want {%d, %d}", mp2, len("{\"test\":\"test1\"}\n"), len("{\"test\":null}")) } From f658fde0c34f9afa833cb8c264fcc1d8955de927 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Fri, 16 Feb 2024 01:08:56 -0500 Subject: [PATCH 3/4] fix off by one error --- pkg/btree/bptree.go | 17 ++++++----------- pkg/btree/node.go | 1 - 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/pkg/btree/bptree.go b/pkg/btree/bptree.go index e816efbf..b54364b8 100644 --- a/pkg/btree/bptree.go +++ b/pkg/btree/bptree.go @@ -4,7 +4,6 @@ import ( "bytes" "fmt" "io" - "log" "slices" ) @@ -51,7 +50,6 @@ func (t *BPTree) Find(key ReferencedValue) (ReferencedValue, MemoryPointer, erro if err != nil { return ReferencedValue{}, MemoryPointer{}, err } - log.Printf("path %#v", path) return path[0].node.Keys[path[0].index], path[0].node.Pointer(path[0].index), nil } @@ -69,7 +67,6 @@ func (t *BPTree) readNode(ptr MemoryPointer) (*BPTreeNode, error) { type TraversalRecord struct { node *BPTreeNode index int - found bool // the offset is useful so we know which page to free when we split ptr MemoryPointer } @@ -81,9 +78,13 @@ func (t *BPTree) traverse(key ReferencedValue, node *BPTreeNode, ptr MemoryPoint index, found := slices.BinarySearchFunc(node.Keys, key, CompareReferencedValues) if node.leaf() { - return []TraversalRecord{{node: node, index: index, found: found, ptr: ptr}}, nil + return []TraversalRecord{{node: node, index: index, ptr: ptr}}, nil } + if found { + // if the key is found, we need to go to the right child + index++ + } child, err := t.readNode(node.Pointer(index)) if err != nil { return nil, err @@ -92,11 +93,10 @@ func (t *BPTree) traverse(key ReferencedValue, node *BPTreeNode, ptr MemoryPoint if err != nil { return nil, err } - return append(path, TraversalRecord{node: node, index: index, found: found, ptr: ptr}), nil + return append(path, TraversalRecord{node: node, index: index, ptr: ptr}), nil } func (t *BPTree) Insert(key ReferencedValue, value MemoryPointer) error { - fmt.Printf("\ninsert parameters look like \nkey: %v\nvalue: %v\n\n", key, value) root, rootOffset, err := t.root() if err != nil { return fmt.Errorf("read root node: %w", err) @@ -122,13 +122,9 @@ func (t *BPTree) Insert(key ReferencedValue, value MemoryPointer) error { return err } - log.Printf("path: %v\n", path) - // insert the key into the leaf n := path[0].node - fmt.Printf("keys %v and key %v", n.Keys, key) j, found := slices.BinarySearchFunc(n.Keys, key, CompareReferencedValues) - fmt.Printf("binary search results: j: %v and found %v", j, found) if found { return fmt.Errorf("key already exists") } @@ -147,7 +143,6 @@ func (t *BPTree) Insert(key ReferencedValue, value MemoryPointer) error { tr := path[i] n := tr.node if int(n.Size()) > t.tree.PageSize() { - log.Printf("split!") // split the node // mid is the key that will be inserted into the parent mid := len(n.Keys) / 2 diff --git a/pkg/btree/node.go b/pkg/btree/node.go index 2267a068..36733872 100644 --- a/pkg/btree/node.go +++ b/pkg/btree/node.go @@ -32,7 +32,6 @@ func CompareReferencedValues(a, b ReferencedValue) int { return cmp } - fmt.Printf("the offsets to compare are: %v %v\n", a.DataPointer.Offset, b.DataPointer.Offset) if a.DataPointer.Offset != b.DataPointer.Offset { return int(a.DataPointer.Offset - b.DataPointer.Offset) } From 9108fbcf77615c968e30e6d6efe42b9866dad23f Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Fri, 16 Feb 2024 01:43:52 -0500 Subject: [PATCH 4/4] wire data parser --- pkg/appendable/index_file.go | 1 + pkg/btree/bptree.go | 15 ++++--- pkg/btree/bptree_test.go | 9 +++- pkg/btree/multi.go | 3 +- pkg/btree/node.go | 17 +++++++- pkg/handlers/csv.go | 80 +++++++++++++++-------------------- pkg/handlers/csv_test.go | 8 ++-- pkg/handlers/equality_test.go | 8 ++-- pkg/handlers/jsonl.go | 76 ++++++++++++++++++++++++++------- pkg/handlers/jsonl_test.go | 16 +++---- 10 files changed, 145 insertions(+), 88 deletions(-) diff --git a/pkg/appendable/index_file.go b/pkg/appendable/index_file.go index 2d0490dd..20bffca3 100644 --- a/pkg/appendable/index_file.go +++ b/pkg/appendable/index_file.go @@ -10,6 +10,7 @@ import ( const CurrentVersion = 1 type DataHandler interface { + btree.DataParser Synchronize(f *IndexFile, df []byte) error Format() Format } diff --git a/pkg/btree/bptree.go b/pkg/btree/bptree.go index b54364b8..be58f1fd 100644 --- a/pkg/btree/bptree.go +++ b/pkg/btree/bptree.go @@ -19,13 +19,18 @@ type BPTree struct { tree ReadWriteSeekPager meta MetaPage - Data []byte + Data []byte + DataParser DataParser } func NewBPTree(tree ReadWriteSeekPager, meta MetaPage) *BPTree { return &BPTree{tree: tree, meta: meta} } +func NewBPTreeWithData(tree ReadWriteSeekPager, meta MetaPage, data []byte, parser DataParser) *BPTree { + return &BPTree{tree: tree, meta: meta, Data: data, DataParser: parser} +} + func (t *BPTree) root() (*BPTreeNode, MemoryPointer, error) { mp, err := t.meta.Root() if err != nil || mp.Length == 0 { @@ -57,7 +62,7 @@ func (t *BPTree) readNode(ptr MemoryPointer) (*BPTreeNode, error) { if _, err := t.tree.Seek(int64(ptr.Offset), io.SeekStart); err != nil { return nil, err } - node := &BPTreeNode{Data: t.Data} + node := &BPTreeNode{Data: t.Data, DataParser: t.DataParser} if _, err := node.ReadFrom(t.tree); err != nil { return nil, err } @@ -103,7 +108,7 @@ func (t *BPTree) Insert(key ReferencedValue, value MemoryPointer) error { } if root == nil { // special case, create the root as the first node - node := &BPTreeNode{Data: t.Data} + node := &BPTreeNode{Data: t.Data, DataParser: t.DataParser} node.Keys = []ReferencedValue{key} node.leafPointers = []MemoryPointer{value} buf, err := node.MarshalBinary() @@ -149,7 +154,7 @@ func (t *BPTree) Insert(key ReferencedValue, value MemoryPointer) error { midKey := n.Keys[mid] // n is the left node, m the right node - m := &BPTreeNode{Data: t.Data} + m := &BPTreeNode{Data: t.Data, DataParser: t.DataParser} if n.leaf() { m.leafPointers = n.leafPointers[mid:] m.Keys = n.Keys[mid:] @@ -203,7 +208,7 @@ func (t *BPTree) Insert(key ReferencedValue, value MemoryPointer) error { // the parent will be written to disk in the next iteration } else { // the root split, so create a new root - p := &BPTreeNode{Data: t.Data} + p := &BPTreeNode{Data: t.Data, DataParser: t.DataParser} p.Keys = []ReferencedValue{midKey} p.internalPointers = []uint64{ uint64(noffset), uint64(moffset), diff --git a/pkg/btree/bptree_test.go b/pkg/btree/bptree_test.go index 3bb47970..789f3863 100644 --- a/pkg/btree/bptree_test.go +++ b/pkg/btree/bptree_test.go @@ -213,6 +213,12 @@ func TestBPTree_SequentialInsertionTest(t *testing.T) { } } +type StubDataParser struct{} + +func (s *StubDataParser) Parse(value []byte) []byte { + return []byte{1, 2, 3, 4, 5, 6, 7, 8} +} + func TestBPTree_RandomTests(t *testing.T) { t.Run("random insertion test", func(t *testing.T) { b := buftest.NewSeekableBuffer() @@ -256,8 +262,7 @@ func TestBPTree_RandomTests(t *testing.T) { if err != nil { t.Fatal(err) } - tree := NewBPTree(p, &testMetaPage{}) - tree.Data = make([]byte, 65536*4+8) + tree := NewBPTreeWithData(p, &testMetaPage{}, make([]byte, 65536*4+8), &StubDataParser{}) for i := 0; i < 65536*4; i++ { if err := tree.Insert(ReferencedValue{ Value: []byte{1, 2, 3, 4, 5, 6, 7, 8}, diff --git a/pkg/btree/multi.go b/pkg/btree/multi.go index e3c17d8e..3cb1e286 100644 --- a/pkg/btree/multi.go +++ b/pkg/btree/multi.go @@ -44,10 +44,11 @@ func (m *LinkedMetaPage) SetRoot(mp MemoryPointer) error { // // Generally, passing data is required, however if the tree // consists of only inlined values, it is not necessary. -func (m *LinkedMetaPage) BPTree(data []byte) *BPTree { +func (m *LinkedMetaPage) BPTree(data []byte, parser DataParser) *BPTree { t := NewBPTree(m.rws, m) if data != nil { t.Data = data + t.DataParser = parser } return t } diff --git a/pkg/btree/node.go b/pkg/btree/node.go index 36733872..e037a98b 100644 --- a/pkg/btree/node.go +++ b/pkg/btree/node.go @@ -13,6 +13,10 @@ type MemoryPointer struct { Length uint32 } +func (mp MemoryPointer) String() string { + return fmt.Sprintf("Pointer[%08x:%08x]", mp.Offset, mp.Offset+uint64(mp.Length)) +} + type ReferencedValue struct { // it is generally optional to set the DataPointer. if it is not set, the // value is taken to be unreferenced and is stored directly in the node. @@ -26,6 +30,10 @@ type ReferencedValue struct { Value []byte } +func (rv ReferencedValue) String() string { + return fmt.Sprintf("ReferencedValue@%s{%s}", rv.DataPointer, rv.Value) +} + func CompareReferencedValues(a, b ReferencedValue) int { cmp := bytes.Compare(a.Value, b.Value) if cmp != 0 { @@ -38,8 +46,13 @@ func CompareReferencedValues(a, b ReferencedValue) int { return int(a.DataPointer.Length - b.DataPointer.Length) } +type DataParser interface { + Parse([]byte) []byte +} + type BPTreeNode struct { - Data []byte + Data []byte + DataParser DataParser // contains the offset of the child node or the offset of the record for leaf // if the node is a leaf, the last pointer is the offset of the next leaf leafPointers []MemoryPointer @@ -150,7 +163,7 @@ func (n *BPTreeNode) UnmarshalBinary(buf []byte) error { n.Keys[i].DataPointer.Offset = binary.BigEndian.Uint64(buf[m+4 : m+12]) n.Keys[i].DataPointer.Length = binary.BigEndian.Uint32(buf[m+12 : m+16]) dp := n.Keys[i].DataPointer - n.Keys[i].Value = n.Data[dp.Offset : dp.Offset+uint64(dp.Length)] // resolving the data-file + n.Keys[i].Value = n.DataParser.Parse(n.Data[dp.Offset : dp.Offset+uint64(dp.Length)]) // resolving the data-file m += 4 + 12 } else { n.Keys[i].Value = buf[m+4 : m+4+int(l)] diff --git a/pkg/handlers/csv.go b/pkg/handlers/csv.go index 3c123c28..cab06fc3 100644 --- a/pkg/handlers/csv.go +++ b/pkg/handlers/csv.go @@ -62,7 +62,7 @@ func (c CSVHandler) Synchronize(f *appendable.IndexFile, df []byte) error { dec := csv.NewReader(bytes.NewReader(df[metadata.ReadOffset : metadata.ReadOffset+uint64(i)])) - if err := handleCSVLine(f, df, dec, headers, []string{}, btree.MemoryPointer{ + if err := c.handleCSVLine(f, df, dec, headers, []string{}, btree.MemoryPointer{ Offset: metadata.ReadOffset, Length: uint32(i), }); err != nil { @@ -125,7 +125,31 @@ func InferCSVField(fieldValue string) (interface{}, appendable.FieldType) { return fieldValue, appendable.FieldTypeString } -func handleCSVLine(f *appendable.IndexFile, df []byte, dec *csv.Reader, headers []string, path []string, data btree.MemoryPointer) error { +func (c CSVHandler) Parse(value []byte) []byte { + parsed, fieldType := InferCSVField(string(value)) + + switch fieldType { + case appendable.FieldTypeFloat64: + buf := make([]byte, 8) + binary.BigEndian.PutUint64(buf, math.Float64bits(parsed.(float64))) + return buf + case appendable.FieldTypeBoolean: + if parsed.(bool) { + return []byte{1} + } else { + return []byte{0} + } + case appendable.FieldTypeString: + return []byte(parsed.(string)) + case appendable.FieldTypeNull: + // nil values are a bit of a degenerate case, we are essentially using the btree + // as a set. we store the value as an empty byte slice. + return []byte{} + } + panic("unknown type") +} + +func (c CSVHandler) handleCSVLine(f *appendable.IndexFile, df []byte, dec *csv.Reader, headers []string, path []string, data btree.MemoryPointer) error { record, err := dec.Read() if err != nil { slog.Error("Failed to read CSV record at index", "error", err) @@ -147,58 +171,20 @@ func handleCSVLine(f *appendable.IndexFile, df []byte, dec *csv.Reader, headers fieldOffset := data.Offset + cumulativeLength fieldLength := uint32(len(fieldValue)) - value, fieldType := InferCSVField(fieldValue) + _, fieldType := InferCSVField(fieldValue) page, err := f.FindOrCreateIndex(name, fieldType) if err != nil { return fmt.Errorf("failed to find or create index: %w", err) } - switch fieldType { - case appendable.FieldTypeFloat64: - buf := make([]byte, 8) - binary.BigEndian.PutUint64(buf, math.Float64bits(value.(float64))) - if err := page.BPTree(df).Insert(btree.ReferencedValue{Value: buf}, data); err != nil { - return fmt.Errorf("failed to insert into b+tree: %w", err) - } - case appendable.FieldTypeBoolean: - if value.(bool) { - if err := page.BPTree(df).Insert(btree.ReferencedValue{Value: []byte{1}}, data); err != nil { - return fmt.Errorf("failed to insert into b+tree: %w", err) - } - } else { - if err := page.BPTree(df).Insert(btree.ReferencedValue{Value: []byte{0}}, data); err != nil { - return fmt.Errorf("failed to insert into b+tree: %w", err) - } - } - case appendable.FieldTypeString: - if err := page.BPTree(df).Insert(btree.ReferencedValue{ - DataPointer: btree.MemoryPointer{ - Offset: fieldOffset, - Length: fieldLength, - }, - // trim the quotes - Value: []byte(value.(string)), - }, data); err != nil { - return fmt.Errorf("failed to insert into b+tree: %w", err) - } - - slog.Debug("Appended index record", - slog.String("field", name), - slog.Any("value", value), - slog.Int("start", int(fieldOffset))) - - case appendable.FieldTypeNull: - // nil values are a bit of a degenerate case, we are essentially using the btree - // as a set. we store the value as an empty byte slice. - if err := page.BPTree(df).Insert(btree.ReferencedValue{Value: []byte{}}, data); err != nil { - return fmt.Errorf("failed to insert into b+tree: %w", err) - } - slog.Debug("Marked field", "name", name) + mp := btree.MemoryPointer{ + Offset: fieldOffset, + Length: fieldLength, + } - default: - slog.Error("Encountered unexpected type '%T' for field '%s'", value, name) - return fmt.Errorf("unexpected type '%T'", value) + if err := page.BPTree(df, CSVHandler{}).Insert(btree.ReferencedValue{Value: c.Parse([]byte(fieldValue)), DataPointer: mp}, data); err != nil { + return fmt.Errorf("failed to insert into b+tree: %w", err) } cumulativeLength += uint64(fieldLength + 1) diff --git a/pkg/handlers/csv_test.go b/pkg/handlers/csv_test.go index 9db94e33..21ae398a 100644 --- a/pkg/handlers/csv_test.go +++ b/pkg/handlers/csv_test.go @@ -100,7 +100,7 @@ func TestCSV(t *testing.T) { t.Errorf("got len(i.Indexes) = %d, want 1", len(collected)) } - rv1, mp1, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test1")}) + rv1, mp1, err := collected[0].BPTree(r2, CSVHandler{}).Find(btree.ReferencedValue{Value: []byte("test1")}) if err != nil { t.Fatal(err) } @@ -117,7 +117,7 @@ func TestCSV(t *testing.T) { t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = %+v, want {%d, %d}", mp1, len("test\n"), len("test1")) } - rv2, mp2, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test2")}) + rv2, mp2, err := collected[0].BPTree(r2, CSVHandler{}).Find(btree.ReferencedValue{Value: []byte("test2")}) if err != nil { t.Fatal(err) } @@ -169,7 +169,7 @@ func TestCSV(t *testing.T) { t.Errorf("got len(i.Indexes) = %d, want 1", len(collected)) } - rv1, mp1, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test1")}) + rv1, mp1, err := collected[0].BPTree(r2, CSVHandler{}).Find(btree.ReferencedValue{Value: []byte("test1")}) if err != nil { t.Fatal(err) } @@ -199,7 +199,7 @@ func TestCSV(t *testing.T) { v2 := make([]byte, 8) binary.BigEndian.PutUint64(v2, math.Float64bits(123)) - rv2, mp2, err := collected[1].BPTree(r2).Find(btree.ReferencedValue{Value: v2}) + rv2, mp2, err := collected[1].BPTree(r2, CSVHandler{}).Find(btree.ReferencedValue{Value: v2}) if err != nil { t.Fatal(err) } diff --git a/pkg/handlers/equality_test.go b/pkg/handlers/equality_test.go index bf2f2821..75c7cc0e 100644 --- a/pkg/handlers/equality_test.go +++ b/pkg/handlers/equality_test.go @@ -187,7 +187,7 @@ func compareMetaPages(i1, i2 []*btree.LinkedMetaPage, jr, cr []byte) (bool, stri if i == 0 { for _, val := range h1 { - rv1, mp1, err := collected1.BPTree(jr).Find(btree.ReferencedValue{Value: []byte(val)}) + rv1, mp1, err := collected1.BPTree(jr, JSONLHandler{}).Find(btree.ReferencedValue{Value: []byte(val)}) if err != nil { return false, fmt.Sprintf("failed to find btree for jsonl reader %v", val) @@ -196,7 +196,7 @@ func compareMetaPages(i1, i2 []*btree.LinkedMetaPage, jr, cr []byte) (bool, stri return false, fmt.Sprintf("failed to find %v for reader", val) } - rv2, mp2, err := collected2.BPTree(cr).Find(btree.ReferencedValue{Value: []byte(val)}) + rv2, mp2, err := collected2.BPTree(cr, CSVHandler{}).Find(btree.ReferencedValue{Value: []byte(val)}) if err != nil { return false, fmt.Sprintf("failed to find btree for jsonl reader %v", val) @@ -216,7 +216,7 @@ func compareMetaPages(i1, i2 []*btree.LinkedMetaPage, jr, cr []byte) (bool, stri v2 := make([]byte, 8) binary.BigEndian.PutUint64(v2, math.Float64bits(val)) - rv1, mp1, err := collected1.BPTree(jr).Find(btree.ReferencedValue{Value: v2}) + rv1, mp1, err := collected1.BPTree(jr, JSONLHandler{}).Find(btree.ReferencedValue{Value: v2}) if err != nil { return false, fmt.Sprintf("failed to find btree for jsonl reader %v", val) @@ -227,7 +227,7 @@ func compareMetaPages(i1, i2 []*btree.LinkedMetaPage, jr, cr []byte) (bool, stri fmt.Printf("rv1: %v", rv1) - rv2, mp2, err := collected2.BPTree(cr).Find(btree.ReferencedValue{Value: v2}) + rv2, mp2, err := collected2.BPTree(cr, CSVHandler{}).Find(btree.ReferencedValue{Value: v2}) if err != nil { return false, fmt.Sprintf("failed to find btree for jsonl reader %v", val) diff --git a/pkg/handlers/jsonl.go b/pkg/handlers/jsonl.go index 1980195f..3d86b754 100644 --- a/pkg/handlers/jsonl.go +++ b/pkg/handlers/jsonl.go @@ -5,6 +5,7 @@ import ( "encoding/binary" "encoding/json" "fmt" + "log/slog" "math" "strings" @@ -40,7 +41,7 @@ func (j JSONLHandler) Synchronize(f *appendable.IndexFile, df []byte) error { return fmt.Errorf("expected '%U', got '%U' (only json objects are supported at the root)", '{', t) } - if err := handleJSONLObject(f, df, dec, []string{}, btree.MemoryPointer{ + if err := j.handleJSONLObject(f, df, dec, []string{}, btree.MemoryPointer{ Offset: metadata.ReadOffset, Length: uint32(i), }); err != nil { @@ -86,7 +87,43 @@ func jsonTypeToFieldType(t json.Token) appendable.FieldType { panic(fmt.Sprintf("unexpected token '%v'", t)) } -func handleJSONLObject(f *appendable.IndexFile, r []byte, dec *json.Decoder, path []string, data btree.MemoryPointer) error { +func (j JSONLHandler) Parse(value []byte) []byte { + token, err := json.NewDecoder(bytes.NewReader(value)).Token() + if err != nil { + slog.Error("failed to parse token", "err", err) + return nil + } + switch token := token.(type) { + case string: + return []byte(token) + case json.Number, float64: + buf := make([]byte, 8) + switch token := token.(type) { + case json.Number: + f, err := token.Float64() + if err != nil { + slog.Error("failed to parse float", "err", err) + return nil + } + binary.BigEndian.PutUint64(buf, math.Float64bits(f)) + case float64: + binary.BigEndian.PutUint64(buf, math.Float64bits(token)) + } + return buf + case bool: + if token { + return []byte{1} + } + return []byte{0} + case json.Delim: + panic("unexpected delimiter, objects should not be indexed!") + case nil: + return []byte{} + } + panic(fmt.Sprintf("unexpected token '%v'", token)) +} + +func (j JSONLHandler) handleJSONLObject(f *appendable.IndexFile, r []byte, dec *json.Decoder, path []string, data btree.MemoryPointer) error { // while the next token is not }, read the key for dec.More() { key, err := dec.Token() @@ -111,15 +148,17 @@ func handleJSONLObject(f *appendable.IndexFile, r []byte, dec *json.Decoder, pat if err != nil { return fmt.Errorf("failed to find or create index: %w", err) } + + mp := btree.MemoryPointer{ + Offset: data.Offset + uint64(fieldOffset), + Length: uint32(dec.InputOffset() - fieldOffset), + } + switch value := value.(type) { case string: - if err := page.BPTree(r).Insert(btree.ReferencedValue{ - DataPointer: btree.MemoryPointer{ - Offset: data.Offset + uint64(fieldOffset) + 1, - Length: uint32(dec.InputOffset()-fieldOffset) - 2, - }, - // trim the quotes - Value: []byte(value), + if err := page.BPTree(r, j).Insert(btree.ReferencedValue{ + DataPointer: mp, + Value: []byte(value), }, data); err != nil { return fmt.Errorf("failed to insert into b+tree: %w", err) } @@ -135,16 +174,23 @@ func handleJSONLObject(f *appendable.IndexFile, r []byte, dec *json.Decoder, pat case float64: binary.BigEndian.PutUint64(buf, math.Float64bits(value)) } - if err := page.BPTree(r).Insert(btree.ReferencedValue{Value: buf}, data); err != nil { + if err := page.BPTree(r, j).Insert(btree.ReferencedValue{ + DataPointer: mp, Value: buf}, data); err != nil { return fmt.Errorf("failed to insert into b+tree: %w", err) } case bool: if value { - if err := page.BPTree(r).Insert(btree.ReferencedValue{Value: []byte{1}}, data); err != nil { + if err := page.BPTree(r, j).Insert(btree.ReferencedValue{ + DataPointer: mp, + Value: []byte{1}, + }, data); err != nil { return fmt.Errorf("failed to insert into b+tree: %w", err) } } else { - if err := page.BPTree(r).Insert(btree.ReferencedValue{Value: []byte{0}}, data); err != nil { + if err := page.BPTree(r, j).Insert(btree.ReferencedValue{ + DataPointer: mp, + Value: []byte{0}, + }, data); err != nil { return fmt.Errorf("failed to insert into b+tree: %w", err) } } @@ -174,7 +220,7 @@ func handleJSONLObject(f *appendable.IndexFile, r []byte, dec *json.Decoder, pat } case json.Delim('{'): // find the index to set the field type to unknown. - if err := handleJSONLObject(f, r, dec, append(path, key), data); err != nil { + if err := j.handleJSONLObject(f, r, dec, append(path, key), data); err != nil { return fmt.Errorf("failed to handle object: %w", err) } // read the } @@ -188,9 +234,9 @@ func handleJSONLObject(f *appendable.IndexFile, r []byte, dec *json.Decoder, pat fmt.Printf("\n\ninserting data with offset: %v\n", data.Offset) // nil values are a bit of a degenerate case, we are essentially using the btree // as a set. we store the value as an empty byte slice. - if err := page.BPTree(r).Insert(btree.ReferencedValue{ + if err := page.BPTree(r, j).Insert(btree.ReferencedValue{ Value: []byte{}, - DataPointer: data, + DataPointer: mp, }, data); err != nil { return fmt.Errorf("failed to insert into b+tree: %w\nmp: %v\n", err, data.Offset) } else { diff --git a/pkg/handlers/jsonl_test.go b/pkg/handlers/jsonl_test.go index 160c7f2d..fcb13b12 100644 --- a/pkg/handlers/jsonl_test.go +++ b/pkg/handlers/jsonl_test.go @@ -87,7 +87,7 @@ func TestJSONL(t *testing.T) { t.Errorf("got len(i.Indexes) = %d, want 1", len(collected)) } - rv1, mp1, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test1")}) + rv1, mp1, err := collected[0].BPTree(r2, JSONLHandler{}).Find(btree.ReferencedValue{Value: []byte("test1")}) if err != nil { t.Fatal(err) } @@ -103,7 +103,7 @@ func TestJSONL(t *testing.T) { t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = %+v, want {0, %d}", mp1, len("{\"test\":\"test1\"}")) } - rv2, mp2, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test3")}) + rv2, mp2, err := collected[0].BPTree(r2, JSONLHandler{}).Find(btree.ReferencedValue{Value: []byte("test3")}) if err != nil { t.Fatal(err) } @@ -152,7 +152,7 @@ func TestJSONL(t *testing.T) { t.Errorf("got len(i.Indexes) = %d, want 1", len(collected)) } - rv1, mp1, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test1")}) + rv1, mp1, err := collected[0].BPTree(r2, JSONLHandler{}).Find(btree.ReferencedValue{Value: []byte("test1")}) if err != nil { t.Fatal(err) } @@ -180,7 +180,7 @@ func TestJSONL(t *testing.T) { t.Errorf("got i.Indexes[0].FieldType = %#v, want FieldTypeString", md1.FieldType) } - rv2, mp2, err := collected[1].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test3")}) + rv2, mp2, err := collected[1].BPTree(r2, JSONLHandler{}).Find(btree.ReferencedValue{Value: []byte("test3")}) if err != nil { t.Fatal(err) } @@ -237,7 +237,7 @@ func TestJSONL(t *testing.T) { t.Errorf("got len(i.Indexes) = %d, want 1", len(collected)) } - rv1, mp1, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test1")}) + rv1, mp1, err := collected[0].BPTree(r2, JSONLHandler{}).Find(btree.ReferencedValue{Value: []byte("test1")}) if err != nil { t.Fatal(err) } @@ -267,7 +267,7 @@ func TestJSONL(t *testing.T) { v2 := make([]byte, 8) binary.BigEndian.PutUint64(v2, math.Float64bits(123)) - rv2, mp2, err := collected[1].BPTree(r2).Find(btree.ReferencedValue{Value: v2}) + rv2, mp2, err := collected[1].BPTree(r2, JSONLHandler{}).Find(btree.ReferencedValue{Value: v2}) if err != nil { t.Fatal(err) } @@ -538,7 +538,7 @@ func TestJSONL(t *testing.T) { t.Errorf("got len(i.Indexes) = %d, want 1", len(collected)) } - rv1, mp1, err := collected[0].BPTree(r2).Find(btree.ReferencedValue{Value: []byte("test1")}) + rv1, mp1, err := collected[0].BPTree(r2, JSONLHandler{}).Find(btree.ReferencedValue{Value: []byte("test1")}) if err != nil { t.Fatal(err) } @@ -566,7 +566,7 @@ func TestJSONL(t *testing.T) { t.Errorf("got i.Indexes[0].FieldType = %#v, want FieldTypeString", md1.FieldType) } - rv2, mp2, err := collected[1].BPTree(r2).Find(btree.ReferencedValue{}) + rv2, mp2, err := collected[1].BPTree(r2, JSONLHandler{}).Find(btree.ReferencedValue{}) if err != nil { t.Fatal(err) }