Skip to content

Commit

Permalink
Merge branch 'main' into refactor-btreejs
Browse files Browse the repository at this point in the history
  • Loading branch information
friendlymatthew authored Jan 21, 2024
2 parents 4cc65fe + 7aa3cff commit dd80cba
Show file tree
Hide file tree
Showing 7 changed files with 364 additions and 259 deletions.
18 changes: 18 additions & 0 deletions pkg/btree/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,21 @@

This package implements an on-disk B+ tree, taking some inspiration from
https://github.com/spy16/kiwi/tree/master/index/bptree.

## On the significance of the 4kB page size

The B+ tree is stored on disk, so its nodes are laid out to take advantage of
the 4kB page size of most disks. In practice, however, we don't see a material
impact on performance when using alternative sizes. So why do we choose to use
4kB pages?

In order to garbage collect old B+ tree nodes, we want to keep pointers to
freed pages so that we can deallocate them entirely. If we did not use
fixed-size pages and instead stored nodes contiguously, it would be difficult
to garbage collect the exact number of bytes a node occupied, and we would end
up with fragmentation. By using fixed-size pages, we can simply store a list of
freed pages, deallocate each one in its entirety, and be sure that a freed page
will always be large enough to store a new node.

Therefore, we must choose a page size that is large enough to store a node.
In practice, the choice of 4kB specifically is arbitrary, but it is a nice way
to align with the page size of most disks.
197 changes: 12 additions & 185 deletions pkg/btree/bptree.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import (
"bytes"
"fmt"
"io"
"slices"
)

// MetaPage is an abstract interface over the root page of a btree
Expand All @@ -15,20 +14,15 @@ type MetaPage interface {
SetRoot(MemoryPointer) error
}

// ReadWriteSeekTruncater is an io.ReadWriteSeeker that can additionally be
// truncated to a given size in bytes.
type ReadWriteSeekTruncater interface {
io.ReadWriteSeeker
// Truncate resizes the underlying storage to size bytes.
Truncate(size int64) error
}

type BPTree struct {
tree ReadWriteSeekTruncater
tree ReadWriteSeekPager
meta MetaPage

maxPageSize int
}

func NewBPTree(tree ReadWriteSeekTruncater, meta MetaPage, maxPageSize int) *BPTree {
return &BPTree{tree: tree, meta: meta, maxPageSize: maxPageSize}
func NewBPTree(tree ReadWriteSeekPager, meta MetaPage) *BPTree {
return &BPTree{tree: tree, meta: meta}
}

func (t *BPTree) root() (*BPTreeNode, MemoryPointer, error) {
Expand Down Expand Up @@ -116,7 +110,7 @@ func (t *BPTree) Insert(key ReferencedValue, value MemoryPointer) error {
}
if root == nil {
// special case, create the root as the first node
offset, err := t.tree.Seek(0, io.SeekEnd)
offset, err := t.tree.NewPage()
if err != nil {
return err
}
Expand Down Expand Up @@ -151,9 +145,9 @@ func (t *BPTree) Insert(key ReferencedValue, value MemoryPointer) error {
for i := 0; i < len(path); i++ {
tr := path[i]
n := tr.node
if len(n.Keys) > t.maxPageSize {
if len(n.Keys) > t.tree.PageSize() {
// split the node
moffset, err := t.tree.Seek(0, io.SeekEnd)
moffset, err := t.tree.NewPage()
if err != nil {
return err
}
Expand Down Expand Up @@ -184,7 +178,11 @@ func (t *BPTree) Insert(key ReferencedValue, value MemoryPointer) error {
n.Pointers = n.Pointers[:mid+1]
n.Keys = n.Keys[:mid]
}
noffset := moffset + msize

noffset, err := t.tree.NewPage()
if err != nil {
return err
}
nsize, err := n.WriteTo(t.tree)
if err != nil {
return err
Expand Down Expand Up @@ -227,7 +225,7 @@ func (t *BPTree) Insert(key ReferencedValue, value MemoryPointer) error {
}
} else {
// write this node to disk and update the parent
offset, err := t.tree.Seek(0, io.SeekEnd)
offset, err := t.tree.NewPage()
if err != nil {
return err
}
Expand Down Expand Up @@ -337,174 +335,3 @@ type Entry struct {
// }
// }
// }

// compact rewrites the tree storage so that only nodes still referenced by the
// root pointer or by an internal node remain, packed back to back from offset
// 0. It then truncates the storage, rewrites every internal node's child
// pointers to the relocated offsets, and finally updates the meta page's root
// pointer.
//
// An empty tree (nil root) has nothing to compact and is left untouched.
//
// NOTE(review): the sequential scans assume nodes are stored contiguously
// starting at offset 0, so that repeated ReadFrom calls visit every node until
// io.EOF — confirm against the node encoding.
func (t *BPTree) compact() error {
	root, rootOffset, err := t.root()
	if err != nil {
		return err
	}
	if root == nil {
		// No live nodes: without this guard the zero-valued root pointer would
		// be treated as a reference to offset 0.
		return nil
	}

	// Pass 1: scan every node and collect all pointers held by internal nodes.
	if _, err := t.tree.Seek(0, io.SeekStart); err != nil {
		return err
	}

	references := []MemoryPointer{rootOffset}
	for {
		node := &BPTreeNode{}
		if _, err := node.ReadFrom(t.tree); err != nil {
			if err == io.EOF {
				break
			}
			return err
		}
		if !node.leaf() {
			// all pointers of an internal node are references to other nodes
			references = append(references, node.Pointers...)
		}
	}

	// Pass 2: copy the referenced nodes towards the front of the storage.
	if _, err := t.tree.Seek(0, io.SeekStart); err != nil {
		return err
	}

	// Order by ascending offset so that each node is moved to a position at or
	// before its current one. Compare explicitly: subtracting unsigned offsets
	// and narrowing to int can wrap or truncate and yield the wrong sign.
	slices.SortFunc(references, func(x, y MemoryPointer) int {
		switch {
		case x.Offset < y.Offset:
			return -1
		case x.Offset > y.Offset:
			return 1
		default:
			return 0
		}
	})

	// referenceMap maps a node's old offset to its relocated pointer.
	referenceMap := make(map[uint64]MemoryPointer)

	offset := 0
	for i, reference := range references {
		// skip duplicates (adjacent after sorting)
		if i > 0 && references[i-1] == reference {
			continue
		}
		// read the referenced node
		if _, err := t.tree.Seek(int64(reference.Offset), io.SeekStart); err != nil {
			return err
		}
		node := &BPTreeNode{}
		if _, err := node.ReadFrom(t.tree); err != nil {
			return err
		}
		// write the node to the new offset
		if _, err := t.tree.Seek(int64(offset), io.SeekStart); err != nil {
			return err
		}
		n, err := node.WriteTo(t.tree)
		if err != nil {
			return err
		}
		// update the reference map
		referenceMap[reference.Offset] = MemoryPointer{Offset: uint64(offset), Length: uint32(n)}
		offset += int(n)
	}

	// truncate the file to drop everything past the last relocated node
	if err := t.tree.Truncate(int64(offset)); err != nil {
		return err
	}

	// Pass 3: rewrite each node in place so that internal nodes point at the
	// relocated offsets.
	if _, err := t.tree.Seek(0, io.SeekStart); err != nil {
		return err
	}
	for {
		// remember where this node starts so it can be overwritten in place
		pos, err := t.tree.Seek(0, io.SeekCurrent)
		if err != nil {
			return err
		}
		node := &BPTreeNode{}
		if _, err := node.ReadFrom(t.tree); err != nil {
			if err == io.EOF {
				break
			}
			return err
		}
		if !node.leaf() {
			// all pointers are references
			for i, p := range node.Pointers {
				node.Pointers[i] = referenceMap[p.Offset]
			}
		}
		if _, err := t.tree.Seek(pos, io.SeekStart); err != nil {
			return err
		}
		if _, err := node.WriteTo(t.tree); err != nil {
			return err
		}
	}

	// update the meta pointer
	return t.meta.SetRoot(referenceMap[rootOffset.Offset])
}

// String returns a textual dump of the tree for debugging. The first line is
// the root pointer; every following line describes one node read sequentially
// from the start of the storage: the node's offset (followed by "| " for leaf
// nodes), then its pointers interleaved with its keys.
//
// The storage's seek position is restored before returning; failing to restore
// it panics. Any other error is returned as the error's message, and an empty
// tree yields "empty tree".
func (t *BPTree) String() string {
	// Remember where the caller left the cursor so the dump leaves it intact.
	saved, err := t.tree.Seek(0, io.SeekCurrent)
	if err != nil {
		return err.Error()
	}
	defer func() {
		if _, seekErr := t.tree.Seek(saved, io.SeekStart); seekErr != nil {
			panic(seekErr)
		}
	}()

	root, rootOffset, err := t.root()
	if err != nil {
		return err.Error()
	}
	if root == nil {
		return "empty tree"
	}

	// Writes to a bytes.Buffer never return a non-nil error, so the results of
	// Fprintf are not checked below.
	var out bytes.Buffer
	fmt.Fprintf(&out, "root: %d\n", rootOffset)

	// rewind to the start of the storage and walk every node in order
	if _, err := t.tree.Seek(0, io.SeekStart); err != nil {
		return err.Error()
	}
	for {
		pos, err := t.tree.Seek(0, io.SeekCurrent)
		if err != nil {
			return err.Error()
		}
		node := &BPTreeNode{}
		if _, err := node.ReadFrom(t.tree); err != nil {
			if err == io.EOF {
				break
			}
			return err.Error()
		}

		// leaf nodes are marked with a bar after their offset
		if node.leaf() {
			fmt.Fprintf(&out, "%04d | ", pos)
		} else {
			fmt.Fprintf(&out, "%04d ", pos)
		}

		for i, ptr := range node.Pointers {
			fmt.Fprintf(&out, "%04d ", ptr)
			if i < len(node.Keys) {
				fmt.Fprintf(&out, "%02d ", node.Keys[i])
			}
		}
		out.WriteByte('\n')
	}
	return out.String()
}
Loading

0 comments on commit dd80cba

Please sign in to comment.