From 2760dd46be9e638fd41e69980f830c9a9551f928 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Sun, 21 Jan 2024 19:23:58 -0500 Subject: [PATCH 1/5] feat: store metadata in linked pages (#51) --- pkg/btree/multi.go | 47 +++++++++++++++---------- pkg/btree/multi_test.go | 77 ++++++++++++++++++++++++++++------------- 2 files changed, 81 insertions(+), 43 deletions(-) diff --git a/pkg/btree/multi.go b/pkg/btree/multi.go index 90024c65..f52cbc10 100644 --- a/pkg/btree/multi.go +++ b/pkg/btree/multi.go @@ -26,23 +26,36 @@ func (m *LinkedMetaPage) SetRoot(mp MemoryPointer) error { return binary.Write(m.rws, binary.LittleEndian, mp) } -func (m *LinkedMetaPage) Metadata() (MemoryPointer, error) { - if _, err := m.rws.Seek(int64(m.offset)+12, io.SeekStart); err != nil { - return MemoryPointer{}, err +func (m *LinkedMetaPage) Metadata() ([]byte, error) { + if _, err := m.rws.Seek(int64(m.offset)+24, io.SeekStart); err != nil { + return nil, err } - var mp MemoryPointer - return mp, binary.Read(m.rws, binary.LittleEndian, &mp) + buf := make([]byte, m.rws.PageSize()-24) + if _, err := m.rws.Read(buf); err != nil { + return nil, err + } + // the first four bytes represents the length + length := binary.LittleEndian.Uint32(buf[:4]) + return buf[4 : 4+length], nil } -func (m *LinkedMetaPage) SetMetadata(mp MemoryPointer) error { - if _, err := m.rws.Seek(int64(m.offset)+12, io.SeekStart); err != nil { +func (m *LinkedMetaPage) SetMetadata(data []byte) error { + if len(data) > m.rws.PageSize()-24 { + return errors.New("metadata too large") + } + if _, err := m.rws.Seek(int64(m.offset)+24, io.SeekStart); err != nil { return err } - return binary.Write(m.rws, binary.LittleEndian, mp) + buf := append(make([]byte, 4), data...) 
+ binary.LittleEndian.PutUint32(buf, uint32(len(data))) + if _, err := m.rws.Write(buf); err != nil { + return err + } + return nil } func (m *LinkedMetaPage) Next() (*LinkedMetaPage, error) { - if _, err := m.rws.Seek(int64(m.offset)+24, io.SeekStart); err != nil { + if _, err := m.rws.Seek(int64(m.offset)+12, io.SeekStart); err != nil { return nil, err } var next MemoryPointer @@ -73,7 +86,7 @@ func (m *LinkedMetaPage) AddNext() (*LinkedMetaPage, error) { return nil, err } // save the next pointer - if _, err := m.rws.Seek(int64(m.offset)+24, io.SeekStart); err != nil { + if _, err := m.rws.Seek(int64(m.offset)+12, io.SeekStart); err != nil { return nil, err } if err := binary.Write(m.rws, binary.LittleEndian, next.offset); err != nil { @@ -83,7 +96,7 @@ func (m *LinkedMetaPage) AddNext() (*LinkedMetaPage, error) { } func (m *LinkedMetaPage) MemoryPointer() MemoryPointer { - return MemoryPointer{Offset: m.offset, Length: 36} + return MemoryPointer{Offset: m.offset, Length: 24} } func (m *LinkedMetaPage) Exists() (bool, error) { @@ -98,17 +111,13 @@ func (m *LinkedMetaPage) Reset() error { if _, err := m.rws.Seek(int64(m.offset), io.SeekStart); err != nil { return err } - // write 36 bytes of zeros - if _, err := m.rws.Write(make([]byte, 36)); err != nil { + // write 28 bytes of zeros + if _, err := m.rws.Write(make([]byte, 28)); err != nil { return err } return nil } -func NewMultiBPTree(t ReadWriteSeekPager) (*LinkedMetaPage, error) { - offset, err := t.NewPage() - if err != nil { - return nil, err - } - return &LinkedMetaPage{rws: t, offset: uint64(offset)}, nil +func NewMultiBPTree(t ReadWriteSeekPager, offset uint64) *LinkedMetaPage { + return &LinkedMetaPage{rws: t, offset: offset} } diff --git a/pkg/btree/multi_test.go b/pkg/btree/multi_test.go index cca2228b..e656507e 100644 --- a/pkg/btree/multi_test.go +++ b/pkg/btree/multi_test.go @@ -12,10 +12,7 @@ func TestMultiBPTree(t *testing.T) { if err != nil { t.Fatal(err) } - tree, err := NewMultiBPTree(p) - if 
err != nil { - t.Fatal(err) - } + tree := NewMultiBPTree(p, uint64(p.PageSize())) exists, err := tree.Exists() if err != nil { t.Fatal(err) @@ -31,10 +28,7 @@ func TestMultiBPTree(t *testing.T) { if err != nil { t.Fatal(err) } - tree, err := NewMultiBPTree(p) - if err != nil { - t.Fatal(err) - } + tree := NewMultiBPTree(p, uint64(p.PageSize())) if err := tree.Reset(); err != nil { t.Fatal(err) } @@ -46,8 +40,8 @@ func TestMultiBPTree(t *testing.T) { t.Fatal("expected found") } mp := tree.MemoryPointer() - if mp.Length != 36 { - t.Fatalf("expected length 36, got %d", mp.Length) + if mp.Length != 24 { + t.Fatalf("expected length 24, got %d", mp.Length) } }) @@ -57,10 +51,7 @@ func TestMultiBPTree(t *testing.T) { if err != nil { t.Fatal(err) } - tree, err := NewMultiBPTree(p) - if err != nil { - t.Fatal(err) - } + tree := NewMultiBPTree(p, uint64(p.PageSize())) if err := tree.Reset(); err != nil { t.Fatal(err) } @@ -68,15 +59,15 @@ func TestMultiBPTree(t *testing.T) { if err != nil { t.Fatal(err) } - if next1.MemoryPointer().Length != 36 { - t.Fatalf("expected length 36, got %d", next1) + if next1.MemoryPointer().Length != 24 { + t.Fatalf("expected length 24, got %d", next1) } next2, err := next1.AddNext() if err != nil { t.Fatal(err) } - if next2.MemoryPointer().Length != 36 { - t.Fatalf("expected length 36, got %d", next2) + if next2.MemoryPointer().Length != 24 { + t.Fatalf("expected length 24, got %d", next2) } if next1.MemoryPointer().Offset == next2.MemoryPointer().Offset { @@ -99,10 +90,7 @@ func TestMultiBPTree(t *testing.T) { if err != nil { t.Fatal(err) } - tree, err := NewMultiBPTree(p) - if err != nil { - t.Fatal(err) - } + tree := NewMultiBPTree(p, uint64(p.PageSize())) if err := tree.Reset(); err != nil { t.Fatal(err) } @@ -110,12 +98,53 @@ func TestMultiBPTree(t *testing.T) { if err != nil { t.Fatal(err) } - if next1.MemoryPointer().Length != 36 { - t.Fatalf("expected length 36, got %d", next1) + if next1.MemoryPointer().Length != 24 { + 
t.Fatalf("expected length 24, got %d", next1) } _, err = tree.AddNext() if err == nil { t.Fatal("expected error") } }) + + t.Run("starts with empty metadata", func(t *testing.T) { + b := newSeekableBuffer() + p, err := NewPageFile(b) + if err != nil { + t.Fatal(err) + } + tree := NewMultiBPTree(p, uint64(p.PageSize())) + if err := tree.Reset(); err != nil { + t.Fatal(err) + } + metadata, err := tree.Metadata() + if err != nil { + t.Fatal(err) + } + if len(metadata) != 0 { + t.Fatalf("expected empty metadata, got %v", metadata) + } + }) + + t.Run("storing metadata works", func(t *testing.T) { + b := newSeekableBuffer() + p, err := NewPageFile(b) + if err != nil { + t.Fatal(err) + } + tree := NewMultiBPTree(p, uint64(p.PageSize())) + if err := tree.Reset(); err != nil { + t.Fatal(err) + } + if err := tree.SetMetadata([]byte("hello")); err != nil { + t.Fatal(err) + } + metadata, err := tree.Metadata() + if err != nil { + t.Fatal(err) + } + if !reflect.DeepEqual(metadata, []byte("hello")) { + t.Fatalf("got %v want %v", metadata, []byte("hello")) + } + }) } From 2d84074533d7c3cbf117512a7e77ff553a7c9b7b Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Sun, 21 Jan 2024 22:07:47 -0500 Subject: [PATCH 2/5] fix: use the node byte size to determine splits (#53) This also improves performance by quite a bit since it avoids linear scans. 
--- pkg/btree/bptree.go | 5 +++-- pkg/btree/multi.go | 4 ++++ pkg/btree/node.go | 22 ++++++++++++++++++++++ pkg/btree/pagefile.go | 12 ++++++++++-- 4 files changed, 39 insertions(+), 4 deletions(-) diff --git a/pkg/btree/bptree.go b/pkg/btree/bptree.go index 4f26152e..459e05a9 100644 --- a/pkg/btree/bptree.go +++ b/pkg/btree/bptree.go @@ -145,7 +145,7 @@ func (t *BPTree) Insert(key ReferencedValue, value MemoryPointer) error { for i := 0; i < len(path); i++ { tr := path[i] n := tr.node - if len(n.Keys) > t.tree.PageSize() { + if int(n.Size()) > t.tree.PageSize() { // split the node moffset, err := t.tree.NewPage() if err != nil { @@ -194,7 +194,8 @@ func (t *BPTree) Insert(key ReferencedValue, value MemoryPointer) error { j, _ := p.node.bsearch(midKey.Value) if j != p.index { // j should be equal to p.index...? - // panic("aww") + // if this panic never happens then we can probably remove the above bsearch. + panic("this assumption apparently isn't true") } // insert the key into the parent if j == len(p.node.Keys) { diff --git a/pkg/btree/multi.go b/pkg/btree/multi.go index f52cbc10..cdc00db7 100644 --- a/pkg/btree/multi.go +++ b/pkg/btree/multi.go @@ -26,6 +26,10 @@ func (m *LinkedMetaPage) SetRoot(mp MemoryPointer) error { return binary.Write(m.rws, binary.LittleEndian, mp) } +func (m *LinkedMetaPage) BPTree() *BPTree { + return NewBPTree(m.rws, m) +} + func (m *LinkedMetaPage) Metadata() ([]byte, error) { if _, err := m.rws.Seek(int64(m.offset)+24, io.SeekStart); err != nil { return nil, err diff --git a/pkg/btree/node.go b/pkg/btree/node.go index 6f2f9ab6..b5b6e521 100644 --- a/pkg/btree/node.go +++ b/pkg/btree/node.go @@ -13,6 +13,10 @@ type MemoryPointer struct { } type ReferencedValue struct { + // it is generally optional to set the DataPointer. if it is not set, the + // value is taken to be unreferenced and is stored directly in the node. + // if it is set, the value is used for comparison but the value is stored + // as a reference to the DataPointer. 
DataPointer MemoryPointer Value []byte } @@ -30,6 +34,21 @@ func (n *BPTreeNode) leaf() bool { return len(n.Pointers) == len(n.Keys) } +func (n *BPTreeNode) Size() int64 { + size := 4 // number of keys + for _, k := range n.Keys { + if k.DataPointer.Length > 0 { + size += 4 + 12 // length of key + length of pointer + } else { + size += 4 + len(k.Value) + } + } + for range n.Pointers { + size += 12 + } + return int64(size) +} + func (n *BPTreeNode) WriteTo(w io.Writer) (int64, error) { size := int32(len(n.Keys)) // set the first bit to 1 if it's a leaf @@ -69,6 +88,9 @@ func (n *BPTreeNode) WriteTo(w io.Writer) (int64, error) { } ct += 12 } + if ct != int(n.Size()) { + panic("size mismatch") + } return int64(ct), nil } diff --git a/pkg/btree/pagefile.go b/pkg/btree/pagefile.go index a1ff316b..906b4e4a 100644 --- a/pkg/btree/pagefile.go +++ b/pkg/btree/pagefile.go @@ -4,12 +4,12 @@ import ( "encoding/binary" "errors" "io" - "log" ) type ReadWriteSeekPager interface { io.ReadWriteSeeker + Page(int) (int64, error) NewPage() (int64, error) FreePage(int64) error @@ -24,6 +24,8 @@ type PageFile struct { freePageIndexes [512]int64 } +var _ ReadWriteSeekPager = &PageFile{} + const maxFreePageIndices = 512 const pageSizeBytes = 4096 // 4kB by default. 
@@ -55,11 +57,17 @@ func NewPageFile(rws io.ReadWriteSeeker) (*PageFile, error) { return pf, nil } +func (pf *PageFile) Page(i int) (int64, error) { + if i < 0 { + return 0, errors.New("page index cannot be negative") + } + return int64(i) * int64(pf.pageSize), nil +} + func (pf *PageFile) NewPage() (int64, error) { // if there are free pages, return the first one for i := 0; i < len(pf.freePageIndexes); i++ { if pf.freePageIndexes[i] != 0 { - log.Printf("found free page at index %d", i) offset := pf.freePageIndexes[i] // zero out this free page index on disk if _, err := pf.ReadWriteSeeker.Seek(int64(i*8), io.SeekStart); err != nil { From 1a6008d9daef9a1b8e2edfe45f19412a780238b3 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Sun, 21 Jan 2024 22:56:45 -0500 Subject: [PATCH 3/5] fix: add random test, fix off by one error (#54) --- pkg/btree/bptree.go | 41 +++++++++++++++++++++++++++++++++++++++- pkg/btree/bptree_test.go | 37 ++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/pkg/btree/bptree.go b/pkg/btree/bptree.go index 459e05a9..b116e8a4 100644 --- a/pkg/btree/bptree.go +++ b/pkg/btree/bptree.go @@ -202,7 +202,7 @@ func (t *BPTree) Insert(key ReferencedValue, value MemoryPointer) error { p.node.Keys = append(p.node.Keys, midKey) } else { p.node.Keys = append(p.node.Keys[:j+1], p.node.Keys[j:]...) - p.node.Keys[j+1] = midKey + p.node.Keys[j] = midKey } p.node.Pointers = append(p.node.Pointers[:j+1], p.node.Pointers[j:]...) 
p.node.Pointers[j] = MemoryPointer{Offset: uint64(noffset), Length: uint32(nsize)} @@ -336,3 +336,42 @@ type Entry struct { // } // } // } + +func (t *BPTree) recursiveString(n *BPTreeNode, indent int) string { + // print the node itself + var buf bytes.Buffer + if !n.leaf() { + for i := range n.Pointers { + child, err := t.readNode(n.Pointers[i]) + if err != nil { + return fmt.Sprintf("error: failed to read child node: %v", err) + } + buf.WriteString(t.recursiveString(child, indent+1)) + if i < len(n.Pointers)-1 { + for i := 0; i < indent; i++ { + buf.WriteString(" ") + } + buf.WriteString(fmt.Sprintf("key %v\n", n.Keys[i])) + } + } + } else { + for i := range n.Pointers { + for i := 0; i < indent; i++ { + buf.WriteString(" ") + } + buf.WriteString(fmt.Sprintf("%v\n", n.Keys[i])) + } + } + return buf.String() +} + +func (t *BPTree) String() string { + root, _, err := t.root() + if err != nil { + return fmt.Sprintf("error: failed to read root node: %v", err) + } + if root == nil { + return "empty tree" + } + return "b+ tree ---\n" + t.recursiveString(root, 0) +} diff --git a/pkg/btree/bptree_test.go b/pkg/btree/bptree_test.go index ef1e6cd2..34772c8a 100644 --- a/pkg/btree/bptree_test.go +++ b/pkg/btree/bptree_test.go @@ -2,6 +2,7 @@ package btree import ( "encoding/binary" + "math/rand" "testing" ) @@ -208,6 +209,42 @@ func TestBPTree(t *testing.T) { } }) + t.Run("random insertion test", func(t *testing.T) { + b := newSeekableBuffer() + p, err := NewPageFile(b) + if err != nil { + t.Fatal(err) + } + tree := NewBPTree(p, &testMetaPage{}) + r := rand.New(rand.NewSource(12345)) + for i := 0; i < 65536; i++ { + buf := make([]byte, 8) + if _, err := r.Read(buf); err != nil { + t.Fatal(err) + } + if err := tree.Insert(ReferencedValue{Value: buf}, MemoryPointer{Offset: uint64(i)}); err != nil { + t.Fatal(err) + } + } + s := rand.New(rand.NewSource(12345)) + for i := 0; i < 65536; i++ { + buf := make([]byte, 8) + if _, err := s.Read(buf); err != nil { + t.Fatal(err) + } + 
v, found, err := tree.Find(buf) + if err != nil { + t.Fatal(err) + } + if !found { + t.Fatalf("expected to find key %d", i) + } + if v.Offset != uint64(i) { + t.Fatalf("expected value %d, got %d", i, v) + } + } + }) + // t.Run("bulk insert", func(t *testing.T) { // b := newSeekableBuffer() // tree :=NewBPTree(b, 2) From 2d00e0b6cff10a052cba54c44f65bac235906857 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 22 Jan 2024 03:57:35 +0000 Subject: [PATCH 4/5] build(deps-dev): bump follow-redirects from 1.15.3 to 1.15.5 (#55) Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- package-lock.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/package-lock.json b/package-lock.json index 8979a983..6d7f67d4 100644 --- a/package-lock.json +++ b/package-lock.json @@ -2446,9 +2446,9 @@ } }, "node_modules/follow-redirects": { - "version": "1.15.3", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.3.tgz", - "integrity": "sha512-1VzOtuEM8pC9SFU1E+8KfTjZyMztRsgEfwQl44z8A25uy13jSzTj6dyK2Df52iV0vgHCfBwLhDWevLn95w5v6Q==", + "version": "1.15.5", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.5.tgz", + "integrity": "sha512-vSFWUON1B+yAw1VN4xMfxgn5fTUiaOzAJCKBwIIgT/+7CuGy9+r+5gITvP62j3RmaD5Ph65UaERdOSRGUzZtgw==", "dev": true, "funding": [ { From 79a8ff2e3de6d0f81bdba9655249bb702c5718f5 Mon Sep 17 00:00:00 2001 From: Matthew <38759997+friendlymatthew@users.noreply.github.com> Date: Mon, 22 Jan 2024 08:41:52 -0500 Subject: [PATCH 5/5] Enhance demo (#52) * enhance demo * prettier --- examples/client/index.html | 449 ++++++++++++++++++++++++++----------- package.json | 2 +- 2 files changed, 318 insertions(+), 133 deletions(-) diff --git a/examples/client/index.html b/examples/client/index.html index 11c4bb25..caedefd2 100644 --- a/examples/client/index.html +++ b/examples/client/index.html @@ -1,134 
+1,319 @@ - + - - - - - - - -

- Appendable - NYC - Green Cab Trip Data in 01/2023 -

-
- Download the raw data here: - JSONL - - Appendable Index - - Source -
-

- Appendable is querying the JSONL and index files that GitHub pages hosts - directly. There is no server involved here! -

-

- Keep in mind that while the query syntax supports a lot of different - operations, Appendable doesn't support composite indexes yet. Therefore, - only one field at a time can be filtered on and that field must be used - for sorting. -

-
-
-

Fields

-

-      
-
-

Query

-
- -

Results

-

-        
-      
-
- - + + + + + + + +
+

+ Appendable - NYC + Green Cab Trip Data in 01/2023 +

+
+ Download the raw data here: + JSONL - + Appendable Index - + Source +
+

+ Appendable is querying the JSONL and index files that GitHub Pages hosts + directly. There is no server involved here! +

+

+ Keep in mind that while the query syntax supports a lot of different + operations, Appendable doesn't support composite indexes yet. Therefore, + only one field at a time can be filtered on and that field must be used + for sorting. +

+
+
+
+

Fields

+

+			
+
+

Query

+ +
+

Results

+ +

+			
+
+ + diff --git a/package.json b/package.json index f11a1827..44638d14 100644 --- a/package.json +++ b/package.json @@ -5,7 +5,7 @@ "main": "index.js", "scripts": { "build": "esbuild src/index.ts --bundle --minify --sourcemap --outfile=dist/appendable.min.js", - "build-index": "go run cmd/main.go examples/workspace/green_tripdata_2023-01.jsonl", + "build-index": "go run cmd/main.go -jsonl examples/workspace/green_tripdata_2023-01.jsonl", "serve:example": "cd examples/client && npx http-server", "test": "jest" },