From 3f3e7b20f1e09cd27d8139d7c8b5a0aec35e3de1 Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Wed, 26 Jun 2024 11:05:53 -0400 Subject: [PATCH 1/4] feat: stage work --- pkg/bptree/bptree.go | 3 +- pkg/btree/btree.go | 161 +++++++++++++++++++++++++++++++++++++++- pkg/btree/btree_test.go | 93 +++++++++++++++++++++++ pkg/btree/node.go | 11 ++- 4 files changed, 262 insertions(+), 6 deletions(-) diff --git a/pkg/bptree/bptree.go b/pkg/bptree/bptree.go index 9e9fd9c..4e42099 100644 --- a/pkg/bptree/bptree.go +++ b/pkg/bptree/bptree.go @@ -36,8 +36,7 @@ func (t *BPTree) root() (*BPTreeNode, pointer.MemoryPointer, error) { type TraversalRecord struct { node *BPTreeNode index int - // the offset is useful so we know which page to free when we split - ptr pointer.MemoryPointer + ptr pointer.MemoryPointer } type TraversalIterator struct { diff --git a/pkg/btree/btree.go b/pkg/btree/btree.go index e8c72de..3ee6e8e 100644 --- a/pkg/btree/btree.go +++ b/pkg/btree/btree.go @@ -1,10 +1,13 @@ package btree import ( + "fmt" + "github.com/kevmo314/appendable/pkg/hnsw" "github.com/kevmo314/appendable/pkg/metapage" "github.com/kevmo314/appendable/pkg/pagefile" "github.com/kevmo314/appendable/pkg/pointer" "io" + "slices" ) type BTree struct { @@ -16,7 +19,7 @@ type BTree struct { func (t *BTree) root() (*BTreeNode, pointer.MemoryPointer, error) { mp, err := t.MetaPage.Root() - if err != nil { + if err != nil || mp.Length == 0 { return nil, mp, err } @@ -46,3 +49,159 @@ func (t *BTree) readNode(offset uint64) (*BTreeNode, error) { return node, nil } + +func (t *BTree) Insert(key pointer.ReferencedValue, vector hnsw.Point) error { + root, rootOffset, err := t.root() + if err != nil { + return fmt.Errorf("read root node: %w", err) + } + + if root == nil { + node := &BTreeNode{Width: t.Width} + node.Keys = []pointer.ReferencedValue{key} + node.Vectors = []hnsw.Point{vector} + + buf, err := node.MarshalBinary() + if err != nil { + return err + } + offset, err := t.PageFile.NewPage(buf) + if err != nil { + return err + } + return t.MetaPage.SetRoot(pointer.MemoryPointer{Offset: uint64(offset), Length: uint32(len(buf))}) + } + + parent, parentOffset := root, rootOffset.Offset + for len(parent.Offsets) != 0 { + index, found := slices.BinarySearchFunc(parent.Keys, key, pointer.CompareReferencedValues) + + if found { + panic("cannot insert duplicate key") + + } + + loffset := parent.Offsets[index] + child, err := t.readNode(loffset) + if err != nil { + return err + } + + if int(child.Size()) > t.PageFile.PageSize() { + // split node here + mid := len(child.Keys) / 2 + midKey := child.Keys[mid] + + rightChild := &BTreeNode{Width: t.Width} + if !child.Leaf() { + rightChild.Offsets = child.Offsets[mid+1:] + child.Offsets = child.Offsets[:mid] + } + rightChild.Vectors = child.Vectors[mid+1:] + rightChild.Keys = child.Keys[mid+1:] + + rbuf, err := rightChild.MarshalBinary() + if err != nil { + return err + } + roffset, err := t.PageFile.NewPage(rbuf) + if err != nil { + return err + } + + // shrink left child (child) + child.Keys = child.Keys[:mid] + child.Vectors = child.Vectors[:mid] + if _, err := t.PageFile.Seek(int64(loffset), io.SeekStart); err != nil { + return err + } + + if _, err := child.WriteTo(t.PageFile); err != nil { + return err + } + + // update parent to include new key and store left right offsets + if index == len(parent.Keys) { + parent.Keys = append(parent.Keys, midKey) + } else { + parent.Keys = append(parent.Keys[:index+1], parent.Keys[index:]...) + parent.Keys[index] = midKey + } + + parent.Offsets = append(parent.Offsets[:index+2], parent.Offsets[:index+1]...) + parent.Offsets[index+1] = uint64(roffset) + if _, err := t.PageFile.Seek(int64(parentOffset), io.SeekStart); err != nil { + return err + } + if _, err := parent.WriteTo(t.PageFile); err != nil { + return err + } + + if pointer.CompareReferencedValues(midKey, key) == 1 { + parent, parentOffset = child, loffset + } else { + parent, parentOffset = rightChild, uint64(roffset) + } + } else { + if _, err := t.PageFile.Seek(int64(parentOffset), io.SeekStart); err != nil { + return err + } + if _, err := parent.WriteTo(t.PageFile); err != nil { + return err + } + parent, parentOffset = child, loffset + } + } + + index, found := slices.BinarySearchFunc(parent.Keys, key, pointer.CompareReferencedValues) + if found { + panic("cannot insert duplicate key") + } + + parent.Keys = append(parent.Keys[:index+1], parent.Keys[index:]...) + parent.Keys[index] = key + + parent.Vectors = append(parent.Vectors[:index+1], parent.Vectors[index:]...) + parent.Vectors[index] = vector + + if _, err := t.PageFile.Seek(int64(parentOffset), io.SeekStart); err != nil { + return err + } + if _, err := parent.WriteTo(t.PageFile); err != nil { + return err + } + + return nil +} + +func (t *BTree) Find(key pointer.ReferencedValue) (pointer.ReferencedValue, pointer.MemoryPointer, error) { + node, _, err := t.root() + if err != nil { + return pointer.ReferencedValue{}, pointer.MemoryPointer{}, err + } + + for { + if node == nil { + return pointer.ReferencedValue{}, pointer.MemoryPointer{}, nil + } + + index, found := slices.BinarySearchFunc(node.Keys, key, pointer.CompareReferencedValues) + + if found { + return node.Keys[index], pointer.MemoryPointer{Offset: node.Offsets[index]}, nil + } + + // no key found + if node.Leaf() { + return pointer.ReferencedValue{}, pointer.MemoryPointer{}, nil + } + + newOffset := node.Offsets[index] + newNode, err := t.readNode(newOffset) + if err != nil { + return pointer.ReferencedValue{}, pointer.MemoryPointer{}, err + } + + node = newNode + } +} diff --git a/pkg/btree/btree_test.go b/pkg/btree/btree_test.go index 3c38a25..b184630 100644 --- a/pkg/btree/btree_test.go +++ b/pkg/btree/btree_test.go @@ -1 +1,94 @@ package btree + +import ( + "bytes" + "encoding/binary" + "github.com/kevmo314/appendable/pkg/buftest" + "github.com/kevmo314/appendable/pkg/hnsw" + "github.com/kevmo314/appendable/pkg/pagefile" + "github.com/kevmo314/appendable/pkg/pointer" + "io" + "testing" +) + +type testMetaPage struct { + pf *pagefile.PageFile + root pointer.MemoryPointer +} + +func (m *testMetaPage) SetRoot(mp pointer.MemoryPointer) error { + m.root = mp + return m.write() +} + +func (m *testMetaPage) Root() (pointer.MemoryPointer, error) { + return m.root, nil +} + +func (m *testMetaPage) write() error { + buf := make([]byte, 8) + binary.LittleEndian.PutUint64(buf, m.root.Offset) + if _, err := m.pf.Seek(4096, io.SeekStart); err != nil { + return err + } + if _, err := m.pf.Write(buf); err != nil { + return err + } + return nil +} + +func newTestMetaPage(t *testing.T, pf *pagefile.PageFile) *testMetaPage { + meta := &testMetaPage{pf: pf} + offset, err := pf.NewPage([]byte{0, 0, 0, 0, 0, 0, 0, 0}) + if err != nil { + t.Fatal(err) + } + // first page is garbage collection + if offset != 4096 { + t.Fatalf("expected offset 0, got %d", offset) + } + return meta +} + +func TestBPTree(t *testing.T) { + t.Run("empty tree", func(t *testing.T) { + b := buftest.NewSeekableBuffer() + p, err := pagefile.NewPageFile(b) + if err != nil { + t.Fatal(err) + } + tree := &BTree{PageFile: p, MetaPage: newTestMetaPage(t, p)} + // find a key that doesn't exist + k, _, err := tree.Find(pointer.ReferencedValue{Value: []byte{1}}) + if err != nil { + t.Fatal(err) + } + if len(k.Value) != 0 { + t.Fatal("expected not found") + } + }) + + t.Run("insert creates a root", func(t *testing.T) { + b := buftest.NewSeekableBuffer() + p, err := pagefile.NewPageFile(b) + if err != nil { + t.Fatal(err) + } + tree := &BTree{PageFile: p, MetaPage: newTestMetaPage(t, p), Width: uint16(6)} + if err := tree.Insert(pointer.ReferencedValue{Value: []byte{1}}, hnsw.Point{1}); err != nil { + t.Fatal(err) + } + k, v, err := tree.Find(pointer.ReferencedValue{Value: []byte{1}}) + + if err != nil { + t.Fatal(err) + } + if !bytes.Equal(k.Value, []byte{1}) { + t.Fatalf("expected to find key %v, got %v", []byte{1}, k.Value) + } + if v.Offset != 1 { + t.Fatalf("expected value 1, got %d", v) + } + }) + +} diff --git a/pkg/btree/node.go b/pkg/btree/node.go index c3653dc..acc126b 100644 --- a/pkg/btree/node.go +++ b/pkg/btree/node.go @@ -2,15 +2,16 @@ package btree import ( "github.com/kevmo314/appendable/pkg/hnsw" + "github.com/kevmo314/appendable/pkg/pointer" "io" ) type BTreeNode struct { - Ids []hnsw.Id + Keys []pointer.ReferencedValue Vectors []hnsw.Point - Pointers []uint64 - Width uint16 + Offsets []uint64 + Width uint16 } func (n *BTreeNode) Size() int64 { @@ -37,3 +38,7 @@ func (n *BTreeNode) WriteTo(w io.Writer) (int64, error) { m, err := w.Write(buf) return int64(m), err } + +func (n *BTreeNode) Leaf() bool { + return len(n.Offsets) == 0 +} From 1f1d18c9f594d30c0cbdae3110b8bda4d70a7e27 Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Thu, 27 Jun 2024 12:02:12 -0400 Subject: [PATCH 2/4] stage --- pkg/bptree/bptree.go | 1 + pkg/bptree/bptree_test.go | 3 + pkg/bptree/node.go | 2 +- pkg/btree/btree.go | 27 ++++----- pkg/btree/btree_test.go | 23 +++---- pkg/btree/node.go | 89 +++++++++++++++++++++++++--- pkg/pointer/referenced_value.go | 20 +++++++ pkg/pointer/referenced_value_test.go | 49 +++++++++++++++ 8 files changed, 182 insertions(+), 32 deletions(-) create mode 100644 pkg/pointer/referenced_value_test.go diff --git a/pkg/bptree/bptree.go b/pkg/bptree/bptree.go index 4e42099..85523c4 100644 --- a/pkg/bptree/bptree.go +++ b/pkg/bptree/bptree.go @@ -273,6 +273,7 @@ func (t *BPTree) Insert(key pointer.ReferencedValue, value pointer.MemoryPointer if err != nil { return err } + return t.MetaPage.SetRoot(pointer.MemoryPointer{Offset: uint64(offset), Length: uint32(len(buf))}) } diff --git a/pkg/bptree/bptree_test.go b/pkg/bptree/bptree_test.go index ac63aee..6fe16f8 100644 --- a/pkg/bptree/bptree_test.go +++ b/pkg/bptree/bptree_test.go @@ -82,6 +82,9 @@ func TestBPTree(t *testing.T) { if err := tree.Insert(pointer.ReferencedValue{Value: []byte("hello")}, pointer.MemoryPointer{Offset: 1, Length: 5}); err != nil { t.Fatal(err) } + + fmt.Printf("buffer: %v", b.Bytes()[4096*2:]) + k, v, err := tree.Find(pointer.ReferencedValue{Value: []byte("hello")}) if err != nil { t.Fatal(err) diff --git a/pkg/bptree/node.go b/pkg/bptree/node.go index db8c0ae..c06c085 100644 --- a/pkg/bptree/node.go +++ b/pkg/bptree/node.go @@ -44,7 +44,7 @@ func (n *BPTreeNode) Size() int64 { size := 4 // number of keys for _, k := range n.Keys { - o := encoding.SizeVarint(uint64(k.DataPointer.Offset)) + o := encoding.SizeVarint(k.DataPointer.Offset) l := encoding.SizeVarint(uint64(k.DataPointer.Length)) size += l + o diff --git a/pkg/btree/btree.go b/pkg/btree/btree.go index 3ee6e8e..f56327c 100644 --- a/pkg/btree/btree.go +++ b/pkg/btree/btree.go @@ -50,7 +50,7 @@ func (t *BTree) readNode(offset uint64) (*BTreeNode, error) { return node, nil } -func (t *BTree) Insert(key pointer.ReferencedValue, vector hnsw.Point) error { +func (t *BTree) Insert(key pointer.ReferencedId, vector hnsw.Point) error { root, rootOffset, err := t.root() if err != nil { return fmt.Errorf("read root node: %w", err) @@ -58,7 +58,7 @@ func (t *BTree) Insert(key pointer.ReferencedValue, vector hnsw.Point) error { if root == nil { node := &BTreeNode{Width: t.Width} - node.Keys = []pointer.ReferencedValue{key} + node.Keys = []pointer.ReferencedId{key} node.Vectors = []hnsw.Point{vector} buf, err := node.MarshalBinary() @@ -73,12 +73,11 @@ func (t *BTree) Insert(key pointer.ReferencedValue, vector hnsw.Point) error { } parent, parentOffset := root, rootOffset.Offset - for len(parent.Offsets) != 0 { - index, found := slices.BinarySearchFunc(parent.Keys, key, pointer.CompareReferencedValues) + for !parent.Leaf() { + index, found := slices.BinarySearchFunc(parent.Keys, key, pointer.CompareReferencedIds) if found { panic("cannot insert duplicate key") - } loffset := parent.Offsets[index] @@ -137,7 +136,7 @@ func (t *BTree) Insert(key pointer.ReferencedValue, vector hnsw.Point) error { return err } - if pointer.CompareReferencedValues(midKey, key) == 1 { + if pointer.CompareReferencedIds(midKey, key) == 1 { parent, parentOffset = child, loffset } else { parent, parentOffset = rightChild, uint64(roffset) @@ -153,7 +152,7 @@ func (t *BTree) Insert(key pointer.ReferencedValue, vector hnsw.Point) error { } } - index, found := slices.BinarySearchFunc(parent.Keys, key, pointer.CompareReferencedValues) + index, found := slices.BinarySearchFunc(parent.Keys, key, pointer.CompareReferencedIds) if found { panic("cannot insert duplicate key") } @@ -174,32 +173,32 @@ func (t *BTree) Insert(key pointer.ReferencedValue, vector hnsw.Point) error { return nil } -func (t *BTree) Find(key pointer.ReferencedValue) (pointer.ReferencedValue, pointer.MemoryPointer, error) { +func (t *BTree) Find(key pointer.ReferencedId) (pointer.ReferencedId, pointer.MemoryPointer, error) { node, _, err := t.root() if err != nil { - return pointer.ReferencedValue{}, pointer.MemoryPointer{}, err + return pointer.ReferencedId{}, pointer.MemoryPointer{}, err } for { if node == nil { - return pointer.ReferencedValue{}, pointer.MemoryPointer{}, nil + return pointer.ReferencedId{}, pointer.MemoryPointer{}, nil } - index, found := slices.BinarySearchFunc(node.Keys, key, pointer.CompareReferencedValues) + index, found := slices.BinarySearchFunc(node.Keys, key, pointer.CompareReferencedIds) if found { - return node.Keys[index], pointer.MemoryPointer{Offset: node.Offsets[index]}, nil + return node.Keys[index-1], pointer.MemoryPointer{Offset: node.Offsets[index]}, nil } // no key found if node.Leaf() { - return pointer.ReferencedValue{}, pointer.MemoryPointer{}, nil + return pointer.ReferencedId{}, pointer.MemoryPointer{}, nil } newOffset := node.Offsets[index] newNode, err := t.readNode(newOffset) if err != nil { - return pointer.ReferencedValue{}, pointer.MemoryPointer{}, err + return pointer.ReferencedId{}, pointer.MemoryPointer{}, err } node = newNode diff --git a/pkg/btree/btree_test.go b/pkg/btree/btree_test.go index b184630..495f0e9 100644 --- a/pkg/btree/btree_test.go +++ b/pkg/btree/btree_test.go @@ -1,7 +1,6 @@ package btree import ( - "bytes" "encoding/binary" "github.com/kevmo314/appendable/pkg/buftest" "github.com/kevmo314/appendable/pkg/hnsw" @@ -50,7 +49,7 @@ func newTestMetaPage(t *testing.T, pf *pagefile.PageFile) *testMetaPage { return meta } -func TestBPTree(t *testing.T) { +func TestBTree(t *testing.T) { t.Run("empty tree", func(t *testing.T) { b := buftest.NewSeekableBuffer() p, err := pagefile.NewPageFile(b) @@ -59,13 +58,15 @@ func TestBPTree(t *testing.T) { } tree := &BTree{PageFile: p, MetaPage: newTestMetaPage(t, p)} // find a key that doesn't exist - k, _, err := tree.Find(pointer.ReferencedValue{Value: []byte{1}}) + k, _, err := tree.Find(pointer.ReferencedId{Id: hnsw.Id(0)}) if err != nil { t.Fatal(err) } - if len(k.Value) != 0 { - t.Fatal("expected not found") + + if k.Id != hnsw.Id(0) { + t.Fatalf("expected id 0, got %d", k) } + }) t.Run("insert creates a root", func(t *testing.T) { @@ -74,18 +75,20 @@ func TestBPTree(t *testing.T) { if err != nil { t.Fatal(err) } - tree := &BTree{PageFile: p, MetaPage: newTestMetaPage(t, p), Width: uint16(6)} - if err := tree.Insert(pointer.ReferencedValue{Value: []byte{1}}, hnsw.Point{1}); err != nil { + tree := &BTree{PageFile: p, MetaPage: newTestMetaPage(t, p), Width: uint16(0)} + if err := tree.Insert(pointer.ReferencedId{Id: 1}, hnsw.Point{1}); err != nil { t.Fatal(err) } - k, v, err := tree.Find(pointer.ReferencedValue{Value: []byte{1}}) + k, v, err := tree.Find(pointer.ReferencedId{Id: 1}) if err != nil { t.Fatal(err) } - if !bytes.Equal(k.Value, []byte{1}) { - t.Fatalf("expected to find key %v, got %v", []byte{1}, k.Value) + + if k.Id != hnsw.Id(1) { + t.Fatalf("expected id 1, got %d", k) } + if v.Offset != 1 { t.Fatalf("expected value 1, got %d", v) } diff --git a/pkg/btree/node.go b/pkg/btree/node.go index acc126b..bd21c16 100644 --- a/pkg/btree/node.go +++ b/pkg/btree/node.go @@ -1,32 +1,107 @@ package btree import ( + "encoding/binary" + "github.com/kevmo314/appendable/pkg/encoding" "github.com/kevmo314/appendable/pkg/hnsw" "github.com/kevmo314/appendable/pkg/pointer" "io" ) type BTreeNode struct { - Keys []pointer.ReferencedValue + Keys []pointer.ReferencedId Vectors []hnsw.Point Offsets []uint64 - Width uint16 + + // Width should be 0 for varint + Width uint16 } func (n *BTreeNode) Size() int64 { - return 0 + size := 4 + + for _, k := range n.Keys { + size += encoding.SizeVarint(k.DataPointer.Offset) + size += encoding.SizeVarint(uint64(k.Id)) + } + + for _, n := range n.Offsets { + size += encoding.SizeVarint(n) + } + + return int64(size) } -// MarshalBinary TODO! func (n *BTreeNode) MarshalBinary() ([]byte, error) { - b := []byte{} + size := int32(len(n.Keys)) + + if size == 0 { + panic("writing empty node") + } - return b, nil + buf := make([]byte, n.Size()) + + if n.Leaf() { + binary.LittleEndian.PutUint32(buf[:4], uint32(-size)) + } else { + binary.LittleEndian.PutUint32(buf[:4], uint32(size)) + } + + ct := 4 + for _, k := range n.Keys { + on := binary.PutUvarint(buf[ct:], k.DataPointer.Offset) + vn := binary.PutUvarint(buf[ct+on:], uint64(k.Id)) + ct += on + vn + + } + + for _, o := range n.Offsets { + on := binary.PutUvarint(buf[ct:], o) + ct += on + } + + if ct != int(n.Size()) { + panic("size mismatch") + } + + return buf, nil } -// UnmarshalBinary TODO! func (n *BTreeNode) UnmarshalBinary(buf []byte) error { + size := int32(binary.LittleEndian.Uint32(buf[:4])) + leaf := size < 0 + + if leaf { + n.Offsets = make([]uint64, (-size)+1) + n.Keys = make([]pointer.ReferencedId, -size) + n.Vectors = make([]hnsw.Point, -size) + } else { + n.Keys = make([]pointer.ReferencedId, size) + n.Vectors = make([]hnsw.Point, size) + } + + if size == 0 { + panic("empty node") + } + + m := 4 + for i := range n.Keys { + o, on := binary.Uvarint(buf[m:]) + v, vn := binary.Uvarint(buf[m+on:]) + + n.Keys[i].Id = hnsw.Id(v) + n.Keys[i].DataPointer.Offset = o + + m += on + vn + } + + for i := range n.Offsets { + o, on := binary.Uvarint(buf[m:]) + n.Offsets[i] = o + m += on + } + return nil } diff --git a/pkg/pointer/referenced_value.go b/pkg/pointer/referenced_value.go index 6ff83b0..f51c1d9 100644 --- a/pkg/pointer/referenced_value.go +++ b/pkg/pointer/referenced_value.go @@ -3,6 +3,7 @@ package pointer import ( "bytes" "fmt" + "github.com/kevmo314/appendable/pkg/hnsw" ) type ReferencedValue struct { @@ -18,10 +19,19 @@ type ReferencedValue struct { Value []byte } +type ReferencedId struct { + DataPointer MemoryPointer + Id hnsw.Id +} + func (rv ReferencedValue) String() string { return fmt.Sprintf("ReferencedValue@%s{%s}", rv.DataPointer, rv.Value) } +func (ri ReferencedId) String() string { + return fmt.Sprintf("ReferencedValue@%d{%d}", ri.Id, ri.Id) +} + func CompareReferencedValues(a, b ReferencedValue) int { if cmp := bytes.Compare(a.Value, b.Value); cmp != 0 { return cmp @@ -36,3 +46,13 @@ func CompareReferencedValues(a, b ReferencedValue) int { } return 0 } + +func CompareReferencedIds(a, b ReferencedId) int { + if a.Id > b.Id { + return 1 + } else if a.Id < b.Id { + return -1 + } + + return 0 +} diff --git a/pkg/pointer/referenced_value_test.go b/pkg/pointer/referenced_value_test.go new file mode 100644 index 0000000..767d43c --- /dev/null +++ b/pkg/pointer/referenced_value_test.go @@ -0,0 +1,49 @@ +package pointer + +import ( + "slices" + "testing" +) + +func TestReferencedValue(t *testing.T) { + t.Run("compare referenced value", func(t *testing.T) { + keys := []ReferencedValue{ + { + Value: []byte{1}, + DataPointer: MemoryPointer{ + Offset: 100, + Length: 0, + }, + }, + { + Value: []byte{2}, + DataPointer: MemoryPointer{ + Offset: 200, + Length: 0, + }, + }, + { + Value: []byte{3}, + DataPointer: MemoryPointer{ + Offset: 300, + Length: 0, + }, + }, + } + + index, found := slices.BinarySearchFunc(keys, ReferencedValue{ + DataPointer: MemoryPointer{}, + Value: []byte{1}, + }, CompareUniqueReferencedValues) + + if !found { + t.Fatal("expected to find key 1") + } + + index++ + if index != 1 { + t.Fatalf("expected index to be 1, got: %v", index) + } + }) + +} From d9e740cf9907fb351fbfcc520e9f3874c32a9882 Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Fri, 28 Jun 2024 13:34:44 -0400 Subject: [PATCH 3/4] fix --- pkg/btree/btree.go | 35 ++++++++++++++-------------- pkg/btree/btree_test.go | 16 +++++-------- pkg/btree/node.go | 6 +---- pkg/pointer/referenced_value_test.go | 14 +++++------ 4 files changed, 32 insertions(+), 39 deletions(-) diff --git a/pkg/btree/btree.go b/pkg/btree/btree.go index 7b40827..7984240 100644 --- a/pkg/btree/btree.go +++ b/pkg/btree/btree.go @@ -59,9 +59,10 @@ func (t *BTree) Insert(key pointer.ReferencedId, vector hnsw.Point) error { } if root == nil { - node := &BTreeNode{Width: t.Width} - node.Keys = []pointer.ReferencedId{key} + node := &BTreeNode{Width: t.Width, VectorDim: t.VectorDim} + node.Ids = []pointer.ReferencedId{key} node.Vectors = []hnsw.Point{vector} + node.Offsets = make([]uint64, 0) buf, err := node.MarshalBinary() if err != nil { @@ -76,7 +77,7 @@ func (t *BTree) Insert(key pointer.ReferencedId, vector hnsw.Point) error { parent, parentOffset := root, rootOffset.Offset for !parent.Leaf() { - index, found := slices.BinarySearchFunc(parent.Keys, key, pointer.CompareReferencedIds) + index, found := slices.BinarySearchFunc(parent.Ids, key, pointer.CompareReferencedIds) if found { panic("cannot insert duplicate key") @@ -90,16 +91,16 @@ func (t *BTree) Insert(key pointer.ReferencedId, vector hnsw.Point) error { if int(child.Size()) > t.PageFile.PageSize() { // split node here - mid := len(child.Keys) / 2 - midKey := child.Keys[mid] + mid := len(child.Ids) / 2 + midKey := child.Ids[mid] - rightChild := &BTreeNode{Width: t.Width} + rightChild := &BTreeNode{Width: t.Width, VectorDim: t.VectorDim} if !child.Leaf() { rightChild.Offsets = child.Offsets[mid+1:] child.Offsets = child.Offsets[:mid] } rightChild.Vectors = child.Vectors[mid+1:] - rightChild.Keys = child.Keys[mid+1:] + rightChild.Ids = child.Ids[mid+1:] rbuf, err := rightChild.MarshalBinary() if err != nil { @@ -111,7 +112,7 @@ func (t *BTree) Insert(key pointer.ReferencedId, vector hnsw.Point) error { } // shrink left child (child) - child.Keys = child.Keys[:mid] + child.Ids = child.Ids[:mid] child.Vectors = child.Vectors[:mid] if _, err := t.PageFile.Seek(int64(loffset), io.SeekStart); err != nil { return err @@ -122,11 +123,11 @@ func (t *BTree) Insert(key pointer.ReferencedId, vector hnsw.Point) error { } // update parent to include new key and store left right offsets - if index == len(parent.Keys) { - parent.Keys = append(parent.Keys, midKey) + if index == len(parent.Ids) { + parent.Ids = append(parent.Ids, midKey) } else { - parent.Keys = append(parent.Keys[:index+1], parent.Keys[index:]...) - parent.Keys[index] = midKey + parent.Ids = append(parent.Ids[:index+1], parent.Ids[index:]...) + parent.Ids[index] = midKey } parent.Offsets = append(parent.Offsets[:index+2], parent.Offsets[:index+1]...) @@ -154,13 +155,13 @@ func (t *BTree) Insert(key pointer.ReferencedId, vector hnsw.Point) error { } } - index, found := slices.BinarySearchFunc(parent.Keys, key, pointer.CompareReferencedIds) + index, found := slices.BinarySearchFunc(parent.Ids, key, pointer.CompareReferencedIds) if found { panic("cannot insert duplicate key") } - parent.Keys = append(parent.Keys[:index+1], parent.Keys[index:]...) - parent.Keys[index] = key + parent.Ids = append(parent.Ids[:index+1], parent.Ids[index:]...) + parent.Ids[index] = key parent.Vectors = append(parent.Vectors[:index+1], parent.Vectors[index:]...) parent.Vectors[index] = vector @@ -186,10 +187,10 @@ func (t *BTree) Find(key pointer.ReferencedId) (pointer.ReferencedId, pointer.Me return pointer.ReferencedId{}, pointer.MemoryPointer{}, nil } - index, found := slices.BinarySearchFunc(node.Keys, key, pointer.CompareReferencedIds) + index, found := slices.BinarySearchFunc(node.Ids, key, pointer.CompareReferencedIds) if found { - return node.Keys[index-1], pointer.MemoryPointer{Offset: node.Offsets[index]}, nil + return node.Ids[index], pointer.MemoryPointer{Offset: node.Ids[index].DataPointer.Offset}, nil } // no key found diff --git a/pkg/btree/btree_test.go b/pkg/btree/btree_test.go index 495f0e9..7768e92 100644 --- a/pkg/btree/btree_test.go +++ b/pkg/btree/btree_test.go @@ -58,12 +58,12 @@ func TestBTree(t *testing.T) { } tree := &BTree{PageFile: p, MetaPage: newTestMetaPage(t, p)} // find a key that doesn't exist - k, _, err := tree.Find(pointer.ReferencedId{Id: hnsw.Id(0)}) + k, _, err := tree.Find(pointer.ReferencedId{Value: hnsw.Id(0)}) if err != nil { t.Fatal(err) } - if k.Id != hnsw.Id(0) { + if k.Value != hnsw.Id(0) { t.Fatalf("expected id 0, got %d", k) } @@ -75,23 +75,19 @@ func TestBTree(t *testing.T) { if err != nil { t.Fatal(err) } - tree := &BTree{PageFile: p, MetaPage: newTestMetaPage(t, p), Width: uint16(0)} - if err := tree.Insert(pointer.ReferencedId{Id: 1}, hnsw.Point{1}); err != nil { + tree := &BTree{PageFile: p, MetaPage: newTestMetaPage(t, p), Width: uint16(0), VectorDim: 1} + if err := tree.Insert(pointer.ReferencedId{Value: 1}, hnsw.Point{1}); err != nil { t.Fatal(err) } - k, v, err := tree.Find(pointer.ReferencedId{Id: 1}) + k, _, err := tree.Find(pointer.ReferencedId{Value: 1}) if err != nil { t.Fatal(err) } - if k.Id != hnsw.Id(1) { + if k.Value != hnsw.Id(1) { t.Fatalf("expected id 1, got %d", k) } - - if v.Offset != 1 { - t.Fatalf("expected value 1, got %d", v) - } }) } diff --git a/pkg/btree/node.go b/pkg/btree/node.go index 74d3337..c8fdb46 100644 --- a/pkg/btree/node.go +++ b/pkg/btree/node.go @@ -39,7 +39,7 @@ func (n *BTreeNode) Size() int64 { } func (n *BTreeNode) Leaf() bool { - return n.Offsets == nil || len(n.Offsets) == 0 + return len(n.Offsets) == 0 } func (n *BTreeNode) MarshalBinary() ([]byte, error) { @@ -155,7 +155,3 @@ func (n *BTreeNode) WriteTo(w io.Writer) (int64, error) { m, err := w.Write(buf) return int64(m), err } - -func (n *BTreeNode) Leaf() bool { - return len(n.Offsets) == 0 -} diff --git a/pkg/pointer/referenced_value_test.go b/pkg/pointer/referenced_value_test.go index 767d43c..f70f650 100644 --- a/pkg/pointer/referenced_value_test.go +++ b/pkg/pointer/referenced_value_test.go @@ -7,23 +7,23 @@ import ( func TestReferencedValue(t *testing.T) { t.Run("compare referenced value", func(t *testing.T) { - keys := []ReferencedValue{ + keys := []ReferencedId{ { - Value: []byte{1}, + Value: 1, DataPointer: MemoryPointer{ Offset: 100, Length: 0, }, }, { - Value: []byte{2}, + Value: 2, DataPointer: MemoryPointer{ Offset: 200, Length: 0, }, }, { - Value: []byte{3}, + Value: 3, DataPointer: MemoryPointer{ Offset: 300, Length: 0, @@ -31,10 +31,10 @@ func TestReferencedValue(t *testing.T) { }, } - index, found := slices.BinarySearchFunc(keys, ReferencedValue{ + index, found := slices.BinarySearchFunc(keys, ReferencedId{ DataPointer: MemoryPointer{}, - Value: []byte{1}, - }, CompareUniqueReferencedValues) + Value: 1, + }, CompareReferencedIds) if !found { t.Fatal("expected to find key 1") From 7b18a46a055df990255c1a0b96997a907dd10d86 Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Fri, 28 Jun 2024 13:36:03 -0400 Subject: [PATCH 4/4] clean --- pkg/bptree/bptree_test.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/pkg/bptree/bptree_test.go b/pkg/bptree/bptree_test.go index 6fe16f8..cd1e207 100644 --- a/pkg/bptree/bptree_test.go +++ b/pkg/bptree/bptree_test.go @@ -83,8 +83,6 @@ func TestBPTree(t *testing.T) { t.Fatal(err) } - fmt.Printf("buffer: %v", b.Bytes()[4096*2:]) - k, v, err := tree.Find(pointer.ReferencedValue{Value: []byte("hello")}) if err != nil { t.Fatal(err)