- Appendable is querying the JSONL and index files that GitHub pages hosts
- directly. There is no server involved here!
-
-
- Keep in mind that while the query syntax supports a lot of different
- operations, Appendable doesn't support composite indexes yet. Therefore,
- only one field at a time can be filtered on and that field must be used
- for sorting.
-
-
-
-
Fields
-
-
-
-
Query
-
-
-
Results
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+ Appendable - NYC
+ Green Cab Trip Data in 01/2023
+
+ Appendable is querying the JSONL and index files that GitHub Pages hosts
+ directly. There is no server involved here!
+
+
+ Keep in mind that while the query syntax supports a lot of different
+ operations, Appendable doesn't support composite indexes yet. Therefore,
+ only one field at a time can be filtered on and that field must be used
+ for sorting.
+
+
+
+
+
Fields
+
+
+
+
Query
+
+
+
Results
+
+
+
+
+
+
diff --git a/package-lock.json b/package-lock.json
index 8979a983..6d7f67d4 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -2446,9 +2446,9 @@
}
},
"node_modules/follow-redirects": {
- "version": "1.15.3",
- "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.3.tgz",
- "integrity": "sha512-1VzOtuEM8pC9SFU1E+8KfTjZyMztRsgEfwQl44z8A25uy13jSzTj6dyK2Df52iV0vgHCfBwLhDWevLn95w5v6Q==",
+ "version": "1.15.5",
+ "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.5.tgz",
+ "integrity": "sha512-vSFWUON1B+yAw1VN4xMfxgn5fTUiaOzAJCKBwIIgT/+7CuGy9+r+5gITvP62j3RmaD5Ph65UaERdOSRGUzZtgw==",
"dev": true,
"funding": [
{
diff --git a/package.json b/package.json
index f11a1827..44638d14 100644
--- a/package.json
+++ b/package.json
@@ -5,7 +5,7 @@
"main": "index.js",
"scripts": {
"build": "esbuild src/index.ts --bundle --minify --sourcemap --outfile=dist/appendable.min.js",
- "build-index": "go run cmd/main.go examples/workspace/green_tripdata_2023-01.jsonl",
+ "build-index": "go run cmd/main.go -jsonl examples/workspace/green_tripdata_2023-01.jsonl",
"serve:example": "cd examples/client && npx http-server",
"test": "jest"
},
diff --git a/pkg/btree/bptree.go b/pkg/btree/bptree.go
index 4f26152e..b116e8a4 100644
--- a/pkg/btree/bptree.go
+++ b/pkg/btree/bptree.go
@@ -145,7 +145,7 @@ func (t *BPTree) Insert(key ReferencedValue, value MemoryPointer) error {
for i := 0; i < len(path); i++ {
tr := path[i]
n := tr.node
- if len(n.Keys) > t.tree.PageSize() {
+ if int(n.Size()) > t.tree.PageSize() {
// split the node
moffset, err := t.tree.NewPage()
if err != nil {
@@ -194,14 +194,15 @@ func (t *BPTree) Insert(key ReferencedValue, value MemoryPointer) error {
j, _ := p.node.bsearch(midKey.Value)
if j != p.index {
// j should be equal to p.index...?
- // panic("aww")
+ // if this panic never happens then we can probably remove the above bsearch.
+ panic("this assumption apparently isn't true")
}
// insert the key into the parent
if j == len(p.node.Keys) {
p.node.Keys = append(p.node.Keys, midKey)
} else {
p.node.Keys = append(p.node.Keys[:j+1], p.node.Keys[j:]...)
- p.node.Keys[j+1] = midKey
+ p.node.Keys[j] = midKey
}
p.node.Pointers = append(p.node.Pointers[:j+1], p.node.Pointers[j:]...)
p.node.Pointers[j] = MemoryPointer{Offset: uint64(noffset), Length: uint32(nsize)}
@@ -335,3 +336,42 @@ type Entry struct {
// }
// }
// }
+
+// recursiveString renders the subtree rooted at n as text, one entry per line.
+// For a non-leaf node it reads every child with t.readNode, appends the child's
+// rendering at indent+1, and writes a "key <k>" line for n.Keys[i] after each
+// child except the last. For a leaf node it writes each key on its own line.
+// Lines written at this level are preceded by one WriteString of padding per
+// indent level. If a child cannot be read, the error text is returned in place
+// of this node's rendering and whatever was buffered so far is dropped.
+func (t *BPTree) recursiveString(n *BPTreeNode, indent int) string {
+ // accumulates the rendering of this node and its children
+ var buf bytes.Buffer
+ if !n.leaf() {
+ for i := range n.Pointers {
+ child, err := t.readNode(n.Pointers[i])
+ if err != nil {
+ return fmt.Sprintf("error: failed to read child node: %v", err)
+ }
+ buf.WriteString(t.recursiveString(child, indent+1))
+ // a separator key sits between consecutive children, so none follows the last one
+ if i < len(n.Pointers)-1 {
+ // this inner i shadows the outer index only for the padding loop
+ for i := 0; i < indent; i++ {
+ buf.WriteString(" ")
+ }
+ buf.WriteString(fmt.Sprintf("key %v\n", n.Keys[i]))
+ }
+ }
+ } else {
+ for i := range n.Pointers {
+ // this inner i shadows the outer index only for the padding loop
+ for i := 0; i < indent; i++ {
+ buf.WriteString(" ")
+ }
+ buf.WriteString(fmt.Sprintf("%v\n", n.Keys[i]))
+ }
+ }
+ return buf.String()
+}
+
+// String returns a textual dump of the tree. It loads the root with t.root():
+// on failure it returns the error text, for a nil root it returns "empty tree",
+// and otherwise it returns a "b+ tree ---" header line followed by the output
+// of recursiveString(root, 0).
+func (t *BPTree) String() string {
+ root, _, err := t.root()
+ if err != nil {
+ return fmt.Sprintf("error: failed to read root node: %v", err)
+ }
+ if root == nil {
+ return "empty tree"
+ }
+ return "b+ tree ---\n" + t.recursiveString(root, 0)
+}
diff --git a/pkg/btree/bptree_test.go b/pkg/btree/bptree_test.go
index ef1e6cd2..34772c8a 100644
--- a/pkg/btree/bptree_test.go
+++ b/pkg/btree/bptree_test.go
@@ -2,6 +2,7 @@ package btree
import (
"encoding/binary"
+ "math/rand"
"testing"
)
@@ -208,6 +209,42 @@ func TestBPTree(t *testing.T) {
}
})
+ t.Run("random insertion test", func(t *testing.T) {
+ b := newSeekableBuffer()
+ p, err := NewPageFile(b)
+ if err != nil {
+ t.Fatal(err)
+ }
+ tree := NewBPTree(p, &testMetaPage{})
+ r := rand.New(rand.NewSource(12345))
+ for i := 0; i < 65536; i++ {
+ buf := make([]byte, 8)
+ if _, err := r.Read(buf); err != nil {
+ t.Fatal(err)
+ }
+ if err := tree.Insert(ReferencedValue{Value: buf}, MemoryPointer{Offset: uint64(i)}); err != nil {
+ t.Fatal(err)
+ }
+ }
+ s := rand.New(rand.NewSource(12345))
+ for i := 0; i < 65536; i++ {
+ buf := make([]byte, 8)
+ if _, err := s.Read(buf); err != nil {
+ t.Fatal(err)
+ }
+ v, found, err := tree.Find(buf)
+ if err != nil {
+ t.Fatal(err)
+ }
+ if !found {
+ t.Fatalf("expected to find key %d", i)
+ }
+ if v.Offset != uint64(i) {
+ t.Fatalf("expected value %d, got %d", i, v)
+ }
+ }
+ })
+
// t.Run("bulk insert", func(t *testing.T) {
// b := newSeekableBuffer()
// tree :=NewBPTree(b, 2)
diff --git a/pkg/btree/multi.go b/pkg/btree/multi.go
index 90024c65..cdc00db7 100644
--- a/pkg/btree/multi.go
+++ b/pkg/btree/multi.go
@@ -26,23 +26,40 @@ func (m *LinkedMetaPage) SetRoot(mp MemoryPointer) error {
return binary.Write(m.rws, binary.LittleEndian, mp)
}
-func (m *LinkedMetaPage) Metadata() (MemoryPointer, error) {
- if _, err := m.rws.Seek(int64(m.offset)+12, io.SeekStart); err != nil {
- return MemoryPointer{}, err
+// BPTree returns a new BPTree built with NewBPTree from this page's pager
+// (m.rws), passing the page itself as the second argument.
+func (m *LinkedMetaPage) BPTree() *BPTree {
+ return NewBPTree(m.rws, m)
+}
+
+// Metadata returns the metadata bytes stored in this page. The metadata area
+// starts 24 bytes past the page offset: a 4-byte little-endian length followed
+// by that many bytes of data. It returns an error if seeking or reading fails,
+// or if the stored length does not fit in the remainder of the page.
+func (m *LinkedMetaPage) Metadata() ([]byte, error) {
+ if _, err := m.rws.Seek(int64(m.offset)+24, io.SeekStart); err != nil {
+ return nil, err
}
- var mp MemoryPointer
- return mp, binary.Read(m.rws, binary.LittleEndian, &mp)
+ buf := make([]byte, m.rws.PageSize()-24)
+ if _, err := m.rws.Read(buf); err != nil {
+ return nil, err
+ }
+ // the first four bytes represent the length
+ length := binary.LittleEndian.Uint32(buf[:4])
+ // reject a stored length that would slice past the buffer instead of panicking
+ if uint64(length) > uint64(len(buf)-4) {
+ return nil, errors.New("metadata length exceeds page size")
+ }
+ return buf[4 : 4+length], nil
}
-func (m *LinkedMetaPage) SetMetadata(mp MemoryPointer) error {
- if _, err := m.rws.Seek(int64(m.offset)+12, io.SeekStart); err != nil {
+// SetMetadata stores data in this page's metadata area, which starts 24 bytes
+// past the page offset. The data is written as a 4-byte little-endian length
+// followed by the bytes themselves. It returns an error if the data does not
+// fit in the page, or if seeking or writing fails.
+func (m *LinkedMetaPage) SetMetadata(data []byte) error {
+ // 24 bytes of header plus the 4-byte length prefix are not available for data
+ if len(data) > m.rws.PageSize()-28 {
+ return errors.New("metadata too large")
+ }
+ if _, err := m.rws.Seek(int64(m.offset)+24, io.SeekStart); err != nil {
return err
}
- return binary.Write(m.rws, binary.LittleEndian, mp)
+ // reserve four leading bytes for the length, then fill them in
+ buf := append(make([]byte, 4), data...)
+ binary.LittleEndian.PutUint32(buf, uint32(len(data)))
+ if _, err := m.rws.Write(buf); err != nil {
+ return err
+ }
+ return nil
}
func (m *LinkedMetaPage) Next() (*LinkedMetaPage, error) {
- if _, err := m.rws.Seek(int64(m.offset)+24, io.SeekStart); err != nil {
+ if _, err := m.rws.Seek(int64(m.offset)+12, io.SeekStart); err != nil {
return nil, err
}
var next MemoryPointer
@@ -73,7 +90,7 @@ func (m *LinkedMetaPage) AddNext() (*LinkedMetaPage, error) {
return nil, err
}
// save the next pointer
- if _, err := m.rws.Seek(int64(m.offset)+24, io.SeekStart); err != nil {
+ if _, err := m.rws.Seek(int64(m.offset)+12, io.SeekStart); err != nil {
return nil, err
}
if err := binary.Write(m.rws, binary.LittleEndian, next.offset); err != nil {
@@ -83,7 +100,7 @@ func (m *LinkedMetaPage) AddNext() (*LinkedMetaPage, error) {
}
+// MemoryPointer returns a MemoryPointer whose Offset is this page's offset and
+// whose Length is fixed at 24.
func (m *LinkedMetaPage) MemoryPointer() MemoryPointer {
- return MemoryPointer{Offset: m.offset, Length: 36}
+ return MemoryPointer{Offset: m.offset, Length: 24}
}
func (m *LinkedMetaPage) Exists() (bool, error) {
@@ -98,17 +115,13 @@ func (m *LinkedMetaPage) Reset() error {
if _, err := m.rws.Seek(int64(m.offset), io.SeekStart); err != nil {
return err
}
- // write 36 bytes of zeros
- if _, err := m.rws.Write(make([]byte, 36)); err != nil {
+ // write 28 bytes of zeros
+ if _, err := m.rws.Write(make([]byte, 28)); err != nil {
return err
}
return nil
}
-func NewMultiBPTree(t ReadWriteSeekPager) (*LinkedMetaPage, error) {
- offset, err := t.NewPage()
- if err != nil {
- return nil, err
- }
- return &LinkedMetaPage{rws: t, offset: uint64(offset)}, nil
+// NewMultiBPTree returns a LinkedMetaPage handle for pager t at the given byte
+// offset. It only fills in the struct: no page is allocated and nothing is
+// read or written.
+func NewMultiBPTree(t ReadWriteSeekPager, offset uint64) *LinkedMetaPage {
+ return &LinkedMetaPage{rws: t, offset: offset}
+}
diff --git a/pkg/btree/multi_test.go b/pkg/btree/multi_test.go
index cca2228b..e656507e 100644
--- a/pkg/btree/multi_test.go
+++ b/pkg/btree/multi_test.go
@@ -12,10 +12,7 @@ func TestMultiBPTree(t *testing.T) {
if err != nil {
t.Fatal(err)
}
- tree, err := NewMultiBPTree(p)
- if err != nil {
- t.Fatal(err)
- }
+ tree := NewMultiBPTree(p, uint64(p.PageSize()))
exists, err := tree.Exists()
if err != nil {
t.Fatal(err)
@@ -31,10 +28,7 @@ func TestMultiBPTree(t *testing.T) {
if err != nil {
t.Fatal(err)
}
- tree, err := NewMultiBPTree(p)
- if err != nil {
- t.Fatal(err)
- }
+ tree := NewMultiBPTree(p, uint64(p.PageSize()))
if err := tree.Reset(); err != nil {
t.Fatal(err)
}
@@ -46,8 +40,8 @@ func TestMultiBPTree(t *testing.T) {
t.Fatal("expected found")
}
mp := tree.MemoryPointer()
- if mp.Length != 36 {
- t.Fatalf("expected length 36, got %d", mp.Length)
+ if mp.Length != 24 {
+ t.Fatalf("expected length 24, got %d", mp.Length)
}
})
@@ -57,10 +51,7 @@ func TestMultiBPTree(t *testing.T) {
if err != nil {
t.Fatal(err)
}
- tree, err := NewMultiBPTree(p)
- if err != nil {
- t.Fatal(err)
- }
+ tree := NewMultiBPTree(p, uint64(p.PageSize()))
if err := tree.Reset(); err != nil {
t.Fatal(err)
}
@@ -68,15 +59,15 @@ func TestMultiBPTree(t *testing.T) {
if err != nil {
t.Fatal(err)
}
- if next1.MemoryPointer().Length != 36 {
- t.Fatalf("expected length 36, got %d", next1)
+ if next1.MemoryPointer().Length != 24 {
+ t.Fatalf("expected length 24, got %d", next1)
}
next2, err := next1.AddNext()
if err != nil {
t.Fatal(err)
}
- if next2.MemoryPointer().Length != 36 {
- t.Fatalf("expected length 36, got %d", next2)
+ if next2.MemoryPointer().Length != 24 {
+ t.Fatalf("expected length 24, got %d", next2)
}
if next1.MemoryPointer().Offset == next2.MemoryPointer().Offset {
@@ -99,10 +90,7 @@ func TestMultiBPTree(t *testing.T) {
if err != nil {
t.Fatal(err)
}
- tree, err := NewMultiBPTree(p)
- if err != nil {
- t.Fatal(err)
- }
+ tree := NewMultiBPTree(p, uint64(p.PageSize()))
if err := tree.Reset(); err != nil {
t.Fatal(err)
}
@@ -110,12 +98,53 @@ func TestMultiBPTree(t *testing.T) {
if err != nil {
t.Fatal(err)
}
- if next1.MemoryPointer().Length != 36 {
- t.Fatalf("expected length 36, got %d", next1)
+ if next1.MemoryPointer().Length != 24 {
+ t.Fatalf("expected length 24, got %d", next1)
}
_, err = tree.AddNext()
if err == nil {
t.Fatal("expected error")
}
})
+
+ t.Run("starts with empty metadata", func(t *testing.T) {
+ b := newSeekableBuffer()
+ p, err := NewPageFile(b)
+ if err != nil {
+ t.Fatal(err)
+ }
+ tree := NewMultiBPTree(p, uint64(p.PageSize()))
+ if err := tree.Reset(); err != nil {
+ t.Fatal(err)
+ }
+ metadata, err := tree.Metadata()
+ if err != nil {
+ t.Fatal(err)
+ }
+ if len(metadata) != 0 {
+ t.Fatalf("expected empty metadata, got %v", metadata)
+ }
+ })
+
+ t.Run("storing metadata works", func(t *testing.T) {
+ b := newSeekableBuffer()
+ p, err := NewPageFile(b)
+ if err != nil {
+ t.Fatal(err)
+ }
+ tree := NewMultiBPTree(p, uint64(p.PageSize()))
+ if err := tree.Reset(); err != nil {
+ t.Fatal(err)
+ }
+ if err := tree.SetMetadata([]byte("hello")); err != nil {
+ t.Fatal(err)
+ }
+ metadata, err := tree.Metadata()
+ if err != nil {
+ t.Fatal(err)
+ }
+ if !reflect.DeepEqual(metadata, []byte("hello")) {
+ t.Fatalf("got %v want %v", metadata, []byte("hello"))
+ }
+ })
}
diff --git a/pkg/btree/node.go b/pkg/btree/node.go
index 6f2f9ab6..b5b6e521 100644
--- a/pkg/btree/node.go
+++ b/pkg/btree/node.go
@@ -13,6 +13,10 @@ type MemoryPointer struct {
}
type ReferencedValue struct {
+ // Setting DataPointer is generally optional. If it is not set, the value is
+ // taken to be unreferenced and is stored directly in the node. If it is set,
+ // Value is still used for comparison, but what gets stored is a reference
+ // to DataPointer rather than the value itself.
DataPointer MemoryPointer
Value []byte
}
@@ -30,6 +34,21 @@ func (n *BPTreeNode) leaf() bool {
return len(n.Pointers) == len(n.Keys)
}
+// Size returns the number of bytes this node is expected to occupy when
+// written out: a 4-byte key count, then for each key either 4+12 bytes (when
+// its DataPointer.Length is greater than zero) or 4+len(Value) bytes, plus 12
+// bytes for every pointer.
+func (n *BPTreeNode) Size() int64 {
+ size := 4 // number of keys
+ for _, k := range n.Keys {
+ if k.DataPointer.Length > 0 {
+ size += 4 + 12 // length of key + length of pointer
+ } else {
+ size += 4 + len(k.Value)
+ }
+ }
+ for range n.Pointers {
+ size += 12
+ }
+ return int64(size)
+}
+
func (n *BPTreeNode) WriteTo(w io.Writer) (int64, error) {
size := int32(len(n.Keys))
// set the first bit to 1 if it's a leaf
@@ -69,6 +88,9 @@ func (n *BPTreeNode) WriteTo(w io.Writer) (int64, error) {
}
ct += 12
}
+ if ct != int(n.Size()) {
+ panic("size mismatch")
+ }
return int64(ct), nil
}
diff --git a/pkg/btree/pagefile.go b/pkg/btree/pagefile.go
index a1ff316b..906b4e4a 100644
--- a/pkg/btree/pagefile.go
+++ b/pkg/btree/pagefile.go
@@ -4,12 +4,12 @@ import (
"encoding/binary"
"errors"
"io"
- "log"
)
type ReadWriteSeekPager interface {
io.ReadWriteSeeker
+ Page(int) (int64, error)
NewPage() (int64, error)
FreePage(int64) error
@@ -24,6 +24,8 @@ type PageFile struct {
freePageIndexes [512]int64
}
+var _ ReadWriteSeekPager = &PageFile{}
+
const maxFreePageIndices = 512
const pageSizeBytes = 4096 // 4kB by default.
@@ -55,11 +57,17 @@ func NewPageFile(rws io.ReadWriteSeeker) (*PageFile, error) {
return pf, nil
}
+// Page returns the byte offset of page i, computed as i multiplied by
+// pf.pageSize. It returns an error when i is negative.
+func (pf *PageFile) Page(i int) (int64, error) {
+ if i < 0 {
+ return 0, errors.New("page index cannot be negative")
+ }
+ return int64(i) * int64(pf.pageSize), nil
+}
+
func (pf *PageFile) NewPage() (int64, error) {
// if there are free pages, return the first one
for i := 0; i < len(pf.freePageIndexes); i++ {
if pf.freePageIndexes[i] != 0 {
- log.Printf("found free page at index %d", i)
offset := pf.freePageIndexes[i]
// zero out this free page index on disk
if _, err := pf.ReadWriteSeeker.Seek(int64(i*8), io.SeekStart); err != nil {