From 32fdf87ebd4632be9dab71b3185e568d25a3e092 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Tue, 30 Jan 2024 08:36:59 -0500 Subject: [PATCH 01/15] feat: switch to b+ tree --- cmd/main.go | 59 +-- pkg/appendable/appendable.go | 135 ++++++ pkg/appendable/csv_handler.go | 180 -------- pkg/appendable/index_file.go | 166 +++++-- pkg/appendable/index_file_csv_test.go | 308 ------------- pkg/appendable/index_file_jsonl_test.go | 307 ------------- pkg/appendable/index_file_test.go | 336 +++++++------- pkg/appendable/io.go | 318 -------------- pkg/appendable/io_test.go | 17 - pkg/appendable/jsonl_handler.go | 162 ------- pkg/appendable/typescript.go | 42 +- pkg/btree/bptree.go | 126 +++--- pkg/btree/bptree_test.go | 14 + pkg/btree/multi.go | 33 +- pkg/btree/node.go | 96 ++-- pkg/btree/node_test.go | 76 ++++ pkg/btree/pagefile.go | 131 ++++-- pkg/btree/pagefile_test.go | 103 ++++- pkg/handlers/csv.go | 252 +++++++++++ pkg/handlers/csv_test.go | 308 +++++++++++++ pkg/handlers/jsonl.go | 212 +++++++++ pkg/handlers/jsonl_test.go | 556 ++++++++++++++++++++++++ pkg/protocol/protocol.go | 193 -------- 23 files changed, 2214 insertions(+), 1916 deletions(-) create mode 100644 pkg/appendable/appendable.go delete mode 100644 pkg/appendable/csv_handler.go delete mode 100644 pkg/appendable/index_file_csv_test.go delete mode 100644 pkg/appendable/index_file_jsonl_test.go delete mode 100644 pkg/appendable/io.go delete mode 100644 pkg/appendable/io_test.go delete mode 100644 pkg/appendable/jsonl_handler.go create mode 100644 pkg/btree/node_test.go create mode 100644 pkg/handlers/csv.go create mode 100644 pkg/handlers/csv_test.go create mode 100644 pkg/handlers/jsonl.go create mode 100644 pkg/handlers/jsonl_test.go delete mode 100644 pkg/protocol/protocol.go diff --git a/cmd/main.go b/cmd/main.go index 63dbf849..42b2362d 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -1,25 +1,41 @@ package main import ( - "bufio" "flag" "fmt" "log/slog" "os" + "runtime/pprof" "time" "github.com/kevmo314/appendable/pkg/appendable" + "github.com/kevmo314/appendable/pkg/handlers" ) func main() { var debugFlag, jsonlFlag, csvFlag, showTimings bool + var indexFilename string flag.BoolVar(&debugFlag, "debug", false, "Use logger that prints at the debug-level") flag.BoolVar(&jsonlFlag, "jsonl", false, "Use JSONL handler") flag.BoolVar(&csvFlag, "csv", false, "Use CSV handler") flag.BoolVar(&showTimings, "t", false, "Show time-related metrics") + flag.StringVar(&indexFilename, "i", "", "Specify the existing index of the file to be opened, writing to stdout") flag.Parse() + f, err := os.Create("pprof.out") + if err != nil { + panic(err) + } + defer f.Close() // error handling omitted for example + if err := pprof.StartCPUProfile(f); err != nil { + panic(err) + } + go func() { + <-time.After(30 * time.Second) + pprof.StopCPUProfile() + os.Exit(0) + }() logLevel := &slog.LevelVar{} @@ -35,7 +51,6 @@ func main() { totalStart = time.Now() } - // index := flag.String("i", "", "Specify the existing index of the file to be opened, writing to stdout") flag.Usage = func() { fmt.Printf("Usage: %s [-t] [-i index] [-I index] filename\n", os.Args[0]) flag.PrintDefaults() @@ -48,7 +63,13 @@ func main() { flag.Usage() } - // Open the file + // open the index file + indexFile, err := os.OpenFile(indexFilename, os.O_RDWR|os.O_CREATE, 0666) + if err != nil { + panic(err) + } + + // Open the data file file, err := os.Open(args[0]) if err != nil { panic(err) @@ -58,13 +79,9 @@ func main() { switch { case jsonlFlag: - dataHandler = appendable.JSONLHandler{ - 
ReadSeeker: file, - } - case csvFlag: - dataHandler = appendable.CSVHandler{ - ReadSeeker: file, - } + dataHandler = handlers.JSONLHandler{} + // case csvFlag: + // dataHandler = handlers.CSVHandler{} default: logger.Error("Please specify the file type with -jsonl or -csv.") os.Exit(1) @@ -73,12 +90,15 @@ func main() { readStart = time.Now() } // Open the index file - indexFile, err := appendable.NewIndexFile(dataHandler) - + i, err := appendable.NewIndexFile(indexFile, dataHandler) if err != nil { panic(err) } + if err := i.Synchronize(file); err != nil { + panic(err) + } + if showTimings { readDuration := time.Since(readStart) logger.Info("Opening + synchronizing index file took", slog.Duration("duration", readDuration)) @@ -88,21 +108,6 @@ func main() { if showTimings { writeStart = time.Now() } - of, err := os.Create(args[0] + ".index") - if err != nil { - panic(err) - } - logger.Info("Writing index file to", slog.String("path", args[0]+".index")) - bufof := bufio.NewWriter(of) - if err := indexFile.Serialize(bufof); err != nil { - panic(err) - } - if err := bufof.Flush(); err != nil { - panic(err) - } - if err := of.Close(); err != nil { - panic(err) - } if showTimings { writeDuration := time.Since(writeStart) diff --git a/pkg/appendable/appendable.go b/pkg/appendable/appendable.go new file mode 100644 index 00000000..6389e6ca --- /dev/null +++ b/pkg/appendable/appendable.go @@ -0,0 +1,135 @@ +package appendable + +import ( + "encoding/binary" + "fmt" + "strings" +) + +/** + * The structure of an index file is characterized by some pages that point + * to other pages. Each box below represents a (typically 4kB) page and + * the arrows indicate that there is a pointer to the next page. + * + * +-----------+-----------+ +-------------+ +-------------+ +-------------+ + * | Page GC | File Meta | -> | Index Meta | -> | Index Meta | -> | Index Meta | + * +-----------+-----------+ +-------------+ +-------------+ +-------------+ + * | | | + * v v v + * +~~~~~~~~~~~~~+ +~~~~~~~~~~~~~+ +~~~~~~~~~~~~~+ + * | B+ Tree | | B+ Tree | | B+ Tree | + * +~~~~~~~~~~~~~+ +~~~~~~~~~~~~~+ +~~~~~~~~~~~~~+ + * + * Note: By convention, the first FileMeta does not have a pointer to the + * B+ tree. Instead, the first FileMeta is used to store metadata about the + * file itself and only contains a next pointer. + * + * Additionally, the Page GC page is used by the page file to store free page + * indexes for garbage collection. + * + * Consequentially, the index file cannot be smaller than two pages (typically 8kB). + */ + +type Version byte + +type Format byte + +const ( + FormatJSONL Format = iota + FormatCSV +) + +// FieldType represents the type of data stored in the field, which follows +// JSON types excluding Object and null. Object is broken down into subfields +// and null is not stored. 
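
To make the page layout sketched above concrete, here is a minimal usage sketch of the API introduced by this patch (illustrative only, not part of the change itself); the file names are hypothetical, and any JSONL data file with a writable index file would do:

	package main

	import (
		"fmt"
		"os"

		"github.com/kevmo314/appendable/pkg/appendable"
		"github.com/kevmo314/appendable/pkg/handlers"
	)

	func main() {
		// The index file holds the pages described above; it is created if absent.
		indexFile, err := os.OpenFile("data.jsonl.index", os.O_RDWR|os.O_CREATE, 0666)
		if err != nil {
			panic(err)
		}
		// The data file is the JSONL source being indexed.
		dataFile, err := os.Open("data.jsonl")
		if err != nil {
			panic(err)
		}
		i, err := appendable.NewIndexFile(indexFile, handlers.JSONLHandler{})
		if err != nil {
			panic(err)
		}
		// Synchronize delegates to the handler, which populates the per-field indexes.
		if err := i.Synchronize(dataFile); err != nil {
			panic(err)
		}
		// The first meta page stores the FileMeta: version and read offset.
		meta, err := i.Metadata()
		if err != nil {
			panic(err)
		}
		fmt.Printf("indexed %d bytes\n", meta.ReadOffset)
	}
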
+type FieldType byte + +const ( + FieldTypeString FieldType = iota + FieldTypeInt64 + FieldTypeUint64 + FieldTypeFloat64 + FieldTypeObject + FieldTypeArray + FieldTypeBoolean + FieldTypeNull +) + +func (t FieldType) TypescriptType() string { + components := []string{} + if t&FieldTypeString != 0 { + components = append(components, "string") + } + if t&FieldTypeInt64 != 0 || t&FieldTypeFloat64 != 0 { + components = append(components, "number") + } + if t&FieldTypeObject != 0 { + components = append(components, "Record") + } + if t&FieldTypeArray != 0 { + components = append(components, "any[]") + } + if t&FieldTypeBoolean != 0 { + components = append(components, "boolean") + } + if t&FieldTypeNull != 0 { + components = append(components, "null") + } + if len(components) == 0 { + return "unknown" + } + return strings.Join(components, " | ") +} + +type FileMeta struct { + Version + Format + // An offset to indicate how much data is contained within + // this index. Note that this is implementation-dependent, + // so it is not guaranteed to have any uniform interpretation. + // For example, in JSONL, this is the number of bytes read + // and indexed so far. + ReadOffset uint64 +} + +func (m *FileMeta) MarshalBinary() ([]byte, error) { + buf := make([]byte, 9) + buf[0] = byte(m.Version) + binary.LittleEndian.PutUint64(buf[1:], m.ReadOffset) + return buf, nil +} + +func (m *FileMeta) UnmarshalBinary(buf []byte) error { + if len(buf) < 9 { + return fmt.Errorf("invalid metadata size: %d", len(buf)) + } + m.Version = Version(buf[0]) + m.ReadOffset = binary.LittleEndian.Uint64(buf[1:]) + return nil +} + +type IndexMeta struct { + FieldName string + FieldType FieldType +} + +func (m *IndexMeta) MarshalBinary() ([]byte, error) { + buf := make([]byte, 8+len(m.FieldName)+2) + binary.BigEndian.PutUint64(buf[0:], uint64(m.FieldType)) + binary.BigEndian.PutUint16(buf[8:], uint16(len(m.FieldName))) + copy(buf[10:], m.FieldName) + return buf, nil +} + +func (m *IndexMeta) UnmarshalBinary(buf []byte) error { + if len(buf) < 10 { + return fmt.Errorf("invalid metadata size: %d", len(buf)) + } + m.FieldType = FieldType(binary.BigEndian.Uint64(buf[0:])) + nameLength := binary.BigEndian.Uint16(buf[8:]) + if len(buf) < 10+int(nameLength) { + return fmt.Errorf("invalid metadata size: %d", len(buf)) + } + m.FieldName = string(buf[10 : 10+nameLength]) + return nil +} diff --git a/pkg/appendable/csv_handler.go b/pkg/appendable/csv_handler.go deleted file mode 100644 index d6bd50b7..00000000 --- a/pkg/appendable/csv_handler.go +++ /dev/null @@ -1,180 +0,0 @@ -package appendable - -import ( - "bufio" - "bytes" - "encoding/csv" - "fmt" - "io" - "log/slog" - "strings" - - "github.com/cespare/xxhash/v2" - "github.com/kevmo314/appendable/pkg/protocol" -) - -type CSVHandler struct { - io.ReadSeeker -} - -func (c CSVHandler) Synchronize(f *IndexFile) error { - slog.Debug("Starting CSV synchronization") - - var headers []string - var err error - - fromNewIndexFile := false - - isHeader := false - - if len(f.Indexes) == 0 { - isHeader = true - fromNewIndexFile = true - } else { - slog.Debug("indexes already exist, not parsing headers") - for _, index := range f.Indexes { - isHeader = false - headers = append(headers, index.FieldName) - } - } - - scanner := bufio.NewScanner(f.data) - - for i := 0; scanner.Scan(); i++ { - line := scanner.Bytes() - - existingCount := len(f.EndByteOffsets) - - // append a data range - var start uint64 - if len(f.EndByteOffsets) > 0 { - start = f.EndByteOffsets[existingCount-1] - } - - slog.Debug("", 
slog.Uint64("start", start)) - - slog.Debug("adding", slog.Any("endbyteoffset", start+uint64(len(line))), slog.Any("line", line)) - f.EndByteOffsets = append(f.EndByteOffsets, start+uint64(len(line))+1) - f.Checksums = append(f.Checksums, xxhash.Sum64(line)) - - if isHeader { - slog.Info("Parsing CSV headers") - dec := csv.NewReader(bytes.NewReader(line)) - headers, err = dec.Read() - if err != nil { - slog.Error("failed to parse CSV header", "error", err) - return fmt.Errorf("failed to parse CSV header: %w", err) - } - isHeader = false - continue - } - - dec := csv.NewReader(bytes.NewReader(line)) - slog.Debug("Handling csv", "line", i) - - if fromNewIndexFile { - - f.handleCSVLine(dec, headers, []string{}, uint64(existingCount)-1, start) - } else { - - f.handleCSVLine(dec, headers, []string{}, uint64(existingCount), start) - } - - slog.Info("Succesfully processed", "line", i) - } - - if fromNewIndexFile && len(f.EndByteOffsets) > 0 { - f.EndByteOffsets = f.EndByteOffsets[1:] - f.Checksums = f.Checksums[1:] - - slog.Debug("Trimming endbyte offsets and checksums", "endByteOffsets", slog.Any("endByteOffsets", f.EndByteOffsets), "checksums", slog.Any("checksums", f.Checksums)) - } - - slog.Debug("indexes", slog.Any("", f.Indexes)) - slog.Debug("Ending CSV synchronization") - slog.Debug("=========") - return nil -} - -func fieldRankCsvField(fieldValue any) int { - slog.Debug("serialize", slog.Any("fieldValue", fieldValue)) - switch fieldValue.(type) { - case nil: - slog.Debug("nil", slog.Any("fieldValue", fieldValue)) - return 1 - case bool: - slog.Debug("bool", slog.Any("fieldValue", fieldValue)) - return 2 - case int, int8, int16, int32, int64, float32, float64: - slog.Debug("number", slog.Any("fieldValue", fieldValue)) - return 3 - case string: - slog.Debug("string", slog.Any("fieldValue", fieldValue)) - return 4 - default: - panic("unknown type") - } -} - -func (i *IndexFile) handleCSVLine(dec *csv.Reader, headers []string, path []string, dataIndex, dataOffset uint64) error { - slog.Debug("Processing CSV line", slog.Int("dataIndex", int(dataIndex)), slog.Int("dataOffset", int(dataOffset))) - - record, err := dec.Read() - - if err != nil { - slog.Error("Failed to read CSV record at index", "dataIndex", dataIndex, "error", err) - return fmt.Errorf("failed to read CSV record at index %d: %w", dataIndex, err) - } - - slog.Debug("CSV line read successfully", "record", record) - - cumulativeLength := uint64(0) - - for fieldIndex, fieldValue := range record { - if fieldIndex >= len(headers) { - slog.Error("Field index is out of bounds with headers", "fieldIndex", fieldIndex, "headers", slog.Any("headers", headers)) - return fmt.Errorf("field index %d is out of bounds with header", fieldIndex) - } - - fieldName := headers[fieldIndex] - name := strings.Join(append(path, fieldName), ".") - - fieldOffset := dataOffset + cumulativeLength - fieldLength := uint64(len(fieldValue)) - - value, fieldType := protocol.InferCSVField(fieldValue) - - switch fieldType { - case protocol.FieldTypeBoolean, protocol.FieldTypeString, protocol.FieldTypeNumber: - - tree := i.Indexes[i.findIndex(name, value)].IndexRecords - - tree[value] = append(tree[value], protocol.IndexRecord{ - DataNumber: dataIndex, - FieldStartByteOffset: uint64(fieldOffset), - FieldLength: int(fieldLength), - }) - - slog.Debug("Appended index record", - slog.String("field", name), - slog.Any("value", value), - slog.Int("start", int(fieldOffset))) - - case protocol.FieldTypeNull: - for j := range i.Indexes { - if i.Indexes[j].FieldName == name { 
- i.Indexes[j].FieldType |= protocol.FieldTypeNull - } - } - slog.Debug("Marked field", "name", name) - - default: - slog.Error("Encountered unexpected type '%T' for field '%s'", value, name) - return fmt.Errorf("unexpected type '%T'", value) - } - - cumulativeLength += fieldLength + 1 - } - - return nil -} diff --git a/pkg/appendable/index_file.go b/pkg/appendable/index_file.go index 821bee69..ef1d7fb4 100644 --- a/pkg/appendable/index_file.go +++ b/pkg/appendable/index_file.go @@ -1,72 +1,148 @@ package appendable import ( + "fmt" "io" - "github.com/kevmo314/appendable/pkg/protocol" + "github.com/kevmo314/appendable/pkg/btree" ) const CurrentVersion = 1 +type DataFile interface { + io.ReadSeeker + io.ReaderAt +} + +type DataHandler interface { + Synchronize(f *IndexFile, df DataFile) error + Format() Format +} + // IndexFile is a representation of the entire index file. type IndexFile struct { - Version protocol.Version + tree *btree.LinkedMetaPage + dataHandler DataHandler +} - // There is exactly one IndexHeader for each field in the data file. - Indexes []Index +func NewIndexFile(f io.ReadWriteSeeker, dataHandler DataHandler) (*IndexFile, error) { + pf, err := btree.NewPageFile(f) + if err != nil { + return nil, fmt.Errorf("failed to create page file: %w", err) + } + tree, err := btree.NewMultiBPTree(pf, 0) + if err != nil { + return nil, fmt.Errorf("failed to create multi b+ tree: %w", err) + } + // ensure the first page is written. + for i := 0; ; i++ { + exists, err := tree.Exists() + if err != nil { + return nil, fmt.Errorf("failed to check if meta page exists: %w", err) + } + if !exists { + if err := tree.Reset(); err != nil { + return nil, fmt.Errorf("failed to reset meta page: %w", err) + } + metadata := &FileMeta{ + Version: CurrentVersion, + Format: dataHandler.Format(), + } + buf, err := metadata.MarshalBinary() + if err != nil { + return nil, fmt.Errorf("failed to marshal metadata: %w", err) + } + if err := tree.SetMetadata(buf); err != nil { + return nil, fmt.Errorf("failed to set metadata: %w", err) + } + } else if i > 1 { + panic("expected to only reset the first page once") + } else { + return &IndexFile{tree: tree, dataHandler: dataHandler}, nil + } + } +} - EndByteOffsets []uint64 - Checksums []uint64 +func (i *IndexFile) Metadata() (*FileMeta, error) { + // the first page consists of associated metadata for the tree + buf, err := i.tree.Metadata() + if err != nil { + return nil, fmt.Errorf("failed to read metadata: %w", err) + } + metadata := &FileMeta{} + return metadata, metadata.UnmarshalBinary(buf) +} - data io.ReadSeeker - tail int +func (i *IndexFile) SetMetadata(metadata *FileMeta) error { + buf, err := metadata.MarshalBinary() + if err != nil { + return fmt.Errorf("failed to marshal metadata: %w", err) + } + return i.tree.SetMetadata(buf) } -// Index is a representation of a single index. 
-type Index struct { - FieldName string - FieldType protocol.FieldType - IndexRecords map[any][]protocol.IndexRecord +func (i *IndexFile) Indexes() (*btree.LinkedMetaPage, error) { + return i.tree.Next() } -func fieldType(data any) protocol.FieldType { - switch data.(type) { - case string: - return protocol.FieldTypeString - case int, int8, int16, int32, int64, float32, float64: - return protocol.FieldTypeNumber - case bool: - return protocol.FieldTypeBoolean - case []any: - return protocol.FieldTypeArray - default: - return protocol.FieldTypeObject +func (i *IndexFile) IsEmpty() (bool, error) { + n, err := i.tree.Next() + if err != nil { + return false, fmt.Errorf("failed to get next meta page: %w", err) + } + exists, err := n.Exists() + if err != nil { + return false, fmt.Errorf("failed to check if meta page exists: %w", err) } + return !exists, nil } -func (i *IndexFile) findIndex(name string, value any) int { - // find the index for the field - match := -1 - for j := range i.Indexes { - if i.Indexes[j].FieldName == name { - match = j +func (i *IndexFile) FindOrCreateIndex(name string, fieldType FieldType) (*btree.LinkedMetaPage, error) { + mp := i.tree + for { + // this is done in an odd order to avoid needing to keep track of the previous page + next, err := mp.Next() + if err != nil { + return nil, fmt.Errorf("failed to get next meta page: %w", err) + } + exists, err := next.Exists() + if err != nil { + return nil, fmt.Errorf("failed to check if meta page exists: %w", err) + } + if !exists { break } - } - // if the index doesn't exist, create it - ft := fieldType(value) - if match == -1 { - index := Index{ - FieldName: name, - FieldType: ft, + buf, err := next.Metadata() + if err != nil { + return nil, fmt.Errorf("failed to read metadata: %w", err) } - index.IndexRecords = make(map[any][]protocol.IndexRecord) - i.Indexes = append(i.Indexes, index) - return len(i.Indexes) - 1 - } else if i.Indexes[match].FieldType != ft { - // update the field type if necessary - i.Indexes[match].FieldType |= ft + metadata := &IndexMeta{} + if err := metadata.UnmarshalBinary(buf); err != nil { + return nil, fmt.Errorf("failed to unmarshal metadata: %w", err) + } + if metadata.FieldName == name && metadata.FieldType == fieldType { + return next, nil + } + mp = next + } + // we haven't found the index, so we need to create it + next, err := mp.AddNext() + if err != nil { + return nil, fmt.Errorf("failed to add next meta page: %w", err) } - return match + metadata := &IndexMeta{} + metadata.FieldName = name + metadata.FieldType = fieldType + buf, err := metadata.MarshalBinary() + if err != nil { + return nil, fmt.Errorf("failed to marshal metadata: %w", err) + } + return next, next.SetMetadata(buf) +} +// Synchronize will synchronize the index file with the data file. +// This is a convenience method and is equivalent to calling +// Synchronize() on the data handler itself. 
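
For illustration, a minimal DataHandler that this convenience method could delegate to might look like the sketch below (same package; it is not the real JSONL/CSV handler in pkg/handlers, and the field name is made up):

	// sketchHandler only demonstrates the expected flow: a handler's Synchronize
	// looks up (or creates) the meta page for each field it encounters via
	// FindOrCreateIndex, and would then insert records into that page's B+ tree.
	type sketchHandler struct{}

	func (sketchHandler) Format() Format { return FormatJSONL }

	func (sketchHandler) Synchronize(f *IndexFile, df DataFile) error {
		mp, err := f.FindOrCreateIndex("example", FieldTypeString)
		if err != nil {
			return fmt.Errorf("failed to find or create index: %w", err)
		}
		_ = mp // a real handler would now write keys and offsets into the B+ tree
		return nil
	}
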
+func (i *IndexFile) Synchronize(df DataFile) error { + return i.dataHandler.Synchronize(i, df) } diff --git a/pkg/appendable/index_file_csv_test.go b/pkg/appendable/index_file_csv_test.go deleted file mode 100644 index 34556c9c..00000000 --- a/pkg/appendable/index_file_csv_test.go +++ /dev/null @@ -1,308 +0,0 @@ -package appendable - -import ( - "bytes" - "fmt" - "log/slog" - "os" - "reflect" - "strings" - "testing" - - "github.com/kevmo314/appendable/pkg/protocol" -) - -func TestAppendDataRowCSV(t *testing.T) { - - originalLogger := slog.Default() - - // Create a logger with Debug on - debugLevel := &slog.LevelVar{} - debugLevel.Set(slog.LevelDebug) - debugLogger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{ - Level: debugLevel, - })) - - slog.SetDefault(debugLogger) - - defer slog.SetDefault(originalLogger) - - var mockCsv string = "header1\ntest1\n" - var mockCsv2 string = "header1\ntest1\ntest3\n" - - t.Run("no schema changes", func(t *testing.T) { - - i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader(mockCsv)}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(buf, CSVHandler{ReadSeeker: strings.NewReader(mockCsv2)}) - if err != nil { - t.Fatal(err) - } - - // check that the index file now has the additional data ranges but same number of indices - if len(j.Indexes) != 1 { - t.Errorf("got len(i.Indexes) = %d, want 1", len(i.Indexes)) - } - - if len(j.EndByteOffsets) != 2 { - t.Errorf("got len(i.DataRanges) = %d, want 2", len(i.EndByteOffsets)) - } - - // check that the first data range is untouched despite being incorrect - if j.EndByteOffsets[0] != uint64(len(mockCsv)) { - t.Errorf("got i.DataRanges[0].EndByteOffset = %d, want %d", j.EndByteOffsets[0], uint64(len(mockCsv))) - } - - // check that the second data range has properly set offsets - if j.EndByteOffsets[1] != uint64(len(mockCsv2)) { - t.Errorf("got i.DataRanges[1].EndByteOffset = %d, want %d", j.EndByteOffsets[1], uint64(len(mockCsv2))) - } - }) - - t.Run("check end + start byte offsets multiple", func(t *testing.T) { - i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader(mockCsv2)}) - if err != nil { - t.Fatal(err) - } - - if len(i.Indexes) != 1 { - t.Errorf("got len(i.Indexes) = %d, want 1", len(i.Indexes)) - } - - if len(i.Indexes[0].IndexRecords) != 2 { - t.Errorf("got len(i.Indexes[0].IndexRecords) = %d, want 2", len(i.Indexes[0].IndexRecords)) - } - - if i.Indexes[0].IndexRecords["test1"][0].FieldStartByteOffset != uint64(len("header1\n")) { - t.Errorf("got i.Indexes[0].IndexRecords[\"test1\"][0].FieldStartByteOffset = %d, want 7", i.Indexes[0].IndexRecords["test1"][0].FieldStartByteOffset) - } - - if i.Indexes[0].IndexRecords["test3"][0].FieldStartByteOffset != uint64(len("header1\ntest1\n")) { - t.Errorf("got i.Indexes[0].IndexRecords[\"test3\"][0].FieldStartByteOffset = %d, want %d", i.Indexes[0].IndexRecords["test3"][0].FieldStartByteOffset, uint64(len("header\ntest1\n"))) - } - - }) - - t.Run("append index to existing", func(t *testing.T) { - i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader(mockCsv)}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(buf, CSVHandler{ReadSeeker: strings.NewReader(mockCsv2)}) - if err != nil { - t.Fatal(err) - } - - // check that the index file now has the additional data ranges but same number of indices - if 
len(j.Indexes) != 1 { - t.Errorf("got len(j.Indexes) = %d, want 1", len(j.Indexes)) - } - - if len(j.Indexes[0].IndexRecords) != 2 { - fmt.Printf("index records look like %v", j.Indexes[0].IndexRecords) - t.Errorf("got len(j.Indexes[0].IndexRecords) = %d, want 2", len(j.Indexes[0].IndexRecords)) - } - - if len(j.Indexes[0].IndexRecords["test1"]) != 1 { - t.Errorf("got len(j.Indexes[0].IndexRecords[\"test1\"]) = %d, want 1", len(j.Indexes[0].IndexRecords["test1"])) - } - if len(j.Indexes[0].IndexRecords["test3"]) != 1 { - for key, records := range j.Indexes[0].IndexRecords { - t.Errorf("\n\n\nKey: %v, Records: %+v", key, records) - } - t.Errorf("got len(j.Indexes[0].IndexRecords[\"test3\"]) = %d, want 1", len(j.Indexes[0].IndexRecords["test3"])) - } - - if j.Indexes[0].IndexRecords["test1"][0].DataNumber != 0 { - t.Errorf("got i.Indexes[0].IndexRecords[\"test1\"][0].DataNumber = %d, want 0", j.Indexes[0].IndexRecords["test1"][0].DataNumber) - } - if j.Indexes[0].IndexRecords["test1"][0].FieldStartByteOffset != uint64(len("header1\n")) { - t.Errorf("got i.Indexes[0].IndexRecords[\"test1\"][0].FieldStartByteOffset = %d, want %d", j.Indexes[0].IndexRecords["test1"][0].FieldStartByteOffset, uint64(len("header\n"))) - } - - if j.Indexes[0].IndexRecords["test3"][0].DataNumber != 1 { - t.Errorf("got i.Indexes[0].IndexRecords[\"test3\"][0].DataNumber = %d, want 1", j.Indexes[0].IndexRecords["test3"][0].DataNumber) - } - - // verify byte offset calculation - if j.Indexes[0].IndexRecords["test3"][0].FieldStartByteOffset != uint64(len("header1\ntest1\n")) { - t.Errorf("got i.Indexes[0].IndexRecords[\"test3\"][0].FieldStartByteOffset = %d, want %d", j.Indexes[0].IndexRecords["test3"][0].FieldStartByteOffset, uint64(len("header\ntest1\n"))) - } - }) - - t.Run("assert correct types", func(t *testing.T) { - i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader("n1,n2\n3.4,3\n")}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(buf, CSVHandler{ReadSeeker: strings.NewReader("n1,n2\n3.4,3\n4.3,4")}) - if err != nil { - t.Fatal(err) - } - - for _, index := range i.Indexes { - for key := range index.IndexRecords { - keyType := reflect.TypeOf(key).String() - if keyType != "float64" { - t.Errorf("i keytype is %v", keyType) - } - - if index.FieldType != protocol.FieldTypeNumber { - t.Errorf("index field type is not number. actual: %v", index.FieldType) - } - } - } - - for _, index := range j.Indexes { - for key := range index.IndexRecords { - keyType := reflect.TypeOf(key).String() - if keyType != "float64" { - t.Errorf("j keytype is %v", keyType) - } - - if index.FieldType != protocol.FieldTypeNumber { - t.Errorf("index field type is not number. 
actual: %v", index.FieldType) - } - } - } - - }) - - t.Run("multiple headers", func(t *testing.T) { - - i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader("name,move\nmica,coyote\n")}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(buf, CSVHandler{ReadSeeker: strings.NewReader("name,move\nmica,coyote\ngalvao,mount\n")}) - if err != nil { - t.Fatal(err) - } - - // check that the index file now has the additional data ranges but same number of indices - if len(j.Indexes) != 2 { - t.Errorf("got len(i.Indexes) = %d, want 2", len(i.Indexes)) - } - - if len(j.EndByteOffsets) != 2 { - t.Errorf("got len(i.DataRanges) = %d, want 2", len(i.EndByteOffsets)) - } - - // check that the first data range is untouched despite being incorrect - if j.EndByteOffsets[0] != uint64(len("name,move\nmica,coyote\n")) { - t.Errorf("got i.DataRanges[0].EndByteOffset = %d, want %d", j.EndByteOffsets[0], uint64(len("name,move\nmica,coyote\n"))) - } - - // check that the second data range has properly set offsets - if j.EndByteOffsets[1] != uint64(len("name,move\nmica,coyote\ngalvao,mount\n")) { - t.Errorf("got i.DataRanges[1].EndByteOffset = %d, want %d", j.EndByteOffsets[1], uint64(len("name,move\nmica,coyote\ngalvao,mount\n"))) - } - - fmt.Printf("index file looks like: %v", j.Indexes) - }) - - t.Run("generate index file", func(t *testing.T) { - i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader("")}) - - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - _, err = ReadIndexFile(buf, CSVHandler{ReadSeeker: strings.NewReader(mockCsv2)}) - if err != nil { - t.Fatal(err) - } - - }) - - t.Run("existing index but different type", func(t *testing.T) { - i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader("test\ntest1\n")}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(buf, CSVHandler{ReadSeeker: strings.NewReader("test\ntest1\n123\n")}) - if err != nil { - t.Fatal(err) - } - - // check that the index file now has the additional data ranges but same number of indices - if len(j.Indexes) != 1 { - t.Errorf("got len(i.Indexes) = %d, want 1", len(j.Indexes)) - } - - if j.Indexes[0].FieldType != protocol.FieldTypeString|protocol.FieldTypeNumber { - t.Errorf("got i.Indexes[0].FieldType = %#v, want protocol.FieldTypeUnknown", j.Indexes[0].FieldType) - } - }) - - t.Run("existing index but nullable type", func(t *testing.T) { - i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader("test,test2\nomoplata,armbar\n")}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(buf, CSVHandler{ReadSeeker: strings.NewReader("test,test2\nomoplata,armbar\n,singlelegx\n")}) - if err != nil { - t.Fatal(err) - } - - // check that the index file now has the additional data ranges but same number of indices - if len(j.Indexes) != 2 { - t.Errorf("got len(i.Indexes) = %d, want 2", len(j.Indexes)) - } - - if j.Indexes[0].FieldType != protocol.FieldTypeNull|protocol.FieldTypeString { - t.Errorf("got i.Indexes[0].FieldType = %#v, want protocol.FieldTypeNullableString", j.Indexes[0].FieldType) - } - }) - -} diff --git a/pkg/appendable/index_file_jsonl_test.go b/pkg/appendable/index_file_jsonl_test.go deleted file mode 
100644 index cd015a69..00000000 --- a/pkg/appendable/index_file_jsonl_test.go +++ /dev/null @@ -1,307 +0,0 @@ -package appendable - -import ( - "bytes" - "strings" - "testing" - - "github.com/kevmo314/appendable/pkg/protocol" -) - -func TestAppendDataRowJSONL(t *testing.T) { - - t.Run("no schema changes", func(t *testing.T) { - - i, err := NewIndexFile(JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n")}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(buf, JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n{\"test\":\"test3\"}\n")}) - if err != nil { - t.Fatal(err) - } - - // check that the index file now has the additional data ranges but same number of indices - if len(j.Indexes) != 1 { - t.Errorf("got len(i.Indexes) = %d, want 1", len(i.Indexes)) - } - - if len(j.EndByteOffsets) != 2 { - t.Errorf("got len(i.DataRanges) = %d, want 2", len(i.EndByteOffsets)) - } - - // check that the first data range is untouched despite being incorrect - if j.EndByteOffsets[0] != uint64(len("{\"test\":\"test1\"}\n")) { - t.Errorf("got i.DataRanges[0].EndByteOffset = %d, want %d", j.EndByteOffsets[0], uint64(len("{\"test\":\"test1\"}\n"))) - } - - // check that the second data range has properly set offsets - if j.EndByteOffsets[1] != uint64(len("{\"test\":\"test1\"}\n{\"test\":\"test3\"}\n")) { - t.Errorf("got i.DataRanges[1].EndByteOffset = %d, want %d", j.EndByteOffsets[1], uint64(len("{\"test\":\"test1\"}\n{\"test\":\"test3\"}\n"))) - } - }) - - t.Run("correctly sets field offset", func(t *testing.T) { - i, err := NewIndexFile(JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n")}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(buf, JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n{\"test\":\"test3\"}\n")}) - if err != nil { - t.Fatal(err) - } - - // check that the index file now has the additional data ranges but same number of indices - if len(j.Indexes) != 1 { - t.Errorf("got len(j.Indexes) = %d, want 1", len(j.Indexes)) - } - - if len(j.Indexes[0].IndexRecords) != 2 { - t.Errorf("got len(j.Indexes[0].IndexRecords) = %d, want 2", len(j.Indexes[0].IndexRecords)) - } - - if len(j.Indexes[0].IndexRecords["test1"]) != 1 { - t.Errorf("got len(j.Indexes[0].IndexRecords[\"test1\"]) = %d, want 1", len(j.Indexes[0].IndexRecords["test1"])) - } - if len(j.Indexes[0].IndexRecords["test3"]) != 1 { - t.Errorf("got len(j.Indexes[0].IndexRecords[\"test3\"]) = %d, want 1", len(j.Indexes[0].IndexRecords["test3"])) - } - - if j.Indexes[0].IndexRecords["test1"][0].DataNumber != 0 { - t.Errorf("got i.Indexes[0].IndexRecords[\"test1\"][0].DataNumber = %d, want 0", j.Indexes[0].IndexRecords["test1"][0].DataNumber) - } - if j.Indexes[0].IndexRecords["test1"][0].FieldStartByteOffset != uint64(len("{\"test\":")) { - t.Errorf("got i.Indexes[0].IndexRecords[\"test1\"][0].FieldStartByteOffset = %d, want 10", j.Indexes[0].IndexRecords["test1"][0].FieldStartByteOffset) - } - - if j.Indexes[0].IndexRecords["test3"][0].DataNumber != 1 { - t.Errorf("got i.Indexes[0].IndexRecords[\"test3\"][1].DataNumber = %d, want 1", j.Indexes[0].IndexRecords["test3"][1].DataNumber) - } - if j.Indexes[0].IndexRecords["test3"][0].FieldStartByteOffset != uint64(len("{\"test\":\"test1\"}\n{\"test\":")) { - t.Errorf("got 
i.Indexes[0].IndexRecords[\"test3\"][1].FieldStartByteOffset = %d, want 10", j.Indexes[0].IndexRecords["test3"][1].FieldStartByteOffset) - } - }) - - t.Run("new index", func(t *testing.T) { - i, err := NewIndexFile(JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n")}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(buf, JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n{\"test2\":\"test3\"}\n")}) - if err != nil { - t.Fatal(err) - } - - // check that the index file now has the additional index - if len(j.Indexes) != 2 { - t.Errorf("got len(i.Indexes) = %d, want 2", len(j.Indexes)) - } - - if j.Indexes[1].FieldName != "test2" { - t.Errorf("got i.Indexes[1].FieldName = %s, want \"test2\"", j.Indexes[1].FieldName) - } - - if j.Indexes[1].FieldType != protocol.FieldTypeString { - t.Errorf("got i.Indexes[1].FieldType = %+v, want protocol.FieldTypeString", j.Indexes[1].FieldType) - } - }) - - t.Run("existing index but different type", func(t *testing.T) { - i, err := NewIndexFile(JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n")}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(buf, JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n{\"test\":123}\n")}) - if err != nil { - t.Fatal(err) - } - - // check that the index file now has the additional data ranges but same number of indices - if len(j.Indexes) != 1 { - t.Errorf("got len(i.Indexes) = %d, want 1", len(j.Indexes)) - } - - if j.Indexes[0].FieldType != protocol.FieldTypeString|protocol.FieldTypeNumber { - t.Errorf("got i.Indexes[0].FieldType = %#v, want protocol.FieldTypeUnknown", j.Indexes[0].FieldType) - } - }) - - t.Run("creates nested indices", func(t *testing.T) { - i, err := NewIndexFile(JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n")}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(buf, JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n{\"test2\":{\"a\":1,\"b\":\"2\"}}\n")}) - if err != nil { - t.Fatal(err) - } - - // check that the index file now has the additional data ranges but same number of indices - if len(j.Indexes) != 3 { - t.Errorf("got len(i.Indexes) = %d, want 3", len(j.Indexes)) - } - - if j.Indexes[0].FieldType != protocol.FieldTypeString { - t.Errorf("got i.Indexes[0].FieldType = %#v, want protocol.FieldTypeUnknown", j.Indexes[0].FieldType) - } - - if j.Indexes[1].FieldType != protocol.FieldTypeNumber { - t.Errorf("got i.Indexes[1].FieldType = %#v, want protocol.FieldTypeNumber", j.Indexes[1].FieldType) - } - - if j.Indexes[2].FieldType != protocol.FieldTypeString { - t.Errorf("got i.Indexes[2].FieldType = %#v, want protocol.FieldTypeString", j.Indexes[2].FieldType) - } - - if j.Indexes[0].FieldName != "test" { - t.Errorf("got i.Indexes[0].FieldName = %s, want \"test\"", j.Indexes[0].FieldName) - } - - if j.Indexes[1].FieldName != "test2.a" { - t.Errorf("got i.Indexes[1].FieldName = %s, want \"test2.a\"", j.Indexes[1].FieldName) - } - - if j.Indexes[2].FieldName != "test2.b" { - t.Errorf("got i.Indexes[2].FieldName = %s, want \"test2.b\"", j.Indexes[2].FieldName) - } - }) - - t.Run("creates nested indices but also erases parent", func(t *testing.T) { - i, err := NewIndexFile(JSONLHandler{ReadSeeker: 
strings.NewReader("{\"test\":\"test1\"}\n")}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(buf, JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n{\"test\":{\"a\":1,\"b\":\"2\"}}\n")}) - if err != nil { - t.Fatal(err) - } - - // check that the index file now has the additional data ranges but same number of indices - if len(j.Indexes) != 3 { - t.Errorf("got len(i.Indexes) = %d, want 3", len(j.Indexes)) - } - - if j.Indexes[0].FieldType != protocol.FieldTypeString|protocol.FieldTypeObject { - t.Errorf("got i.Indexes[0].FieldType = %#v, want protocol.FieldTypeUnknown", j.Indexes[0].FieldType) - } - }) - - t.Run("ignores arrays", func(t *testing.T) { - i, err := NewIndexFile(JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n")}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(buf, JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n{\"test2\":[[1,2,3],4]}\n")}) - if err != nil { - t.Fatal(err) - } - - // check that the index file now has the additional data ranges but same number of indices - if len(j.Indexes) != 1 { - t.Errorf("got len(i.Indexes) = %d, want 3", len(j.Indexes)) - } - }) - - t.Run("ignores arrays but downgrades type", func(t *testing.T) { - i, err := NewIndexFile(JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n")}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(buf, JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n{\"test\":[[1,2,3],4]}\n")}) - if err != nil { - t.Fatal(err) - } - - // check that the index file now has the additional data ranges but same number of indices - if len(j.Indexes) != 1 { - t.Errorf("got len(i.Indexes) = %d, want 3", len(j.Indexes)) - } - - if j.Indexes[0].FieldType != protocol.FieldTypeString|protocol.FieldTypeArray { - t.Errorf("got i.Indexes[0].FieldType = %#v, want protocol.FieldTypeUnknown", j.Indexes[0].FieldType) - } - }) - - t.Run("existing index but nullable type", func(t *testing.T) { - i, err := NewIndexFile(JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n")}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(buf, JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n{\"test\":null}\n")}) - if err != nil { - t.Fatal(err) - } - - // check that the index file now has the additional data ranges but same number of indices - if len(j.Indexes) != 1 { - t.Errorf("got len(i.Indexes) = %d, want 1", len(j.Indexes)) - } - - if j.Indexes[0].FieldType != protocol.FieldTypeNull|protocol.FieldTypeString { - t.Errorf("got i.Indexes[0].FieldType = %#v, want protocol.FieldTypeNullableString", j.Indexes[0].FieldType) - } - }) -} diff --git a/pkg/appendable/index_file_test.go b/pkg/appendable/index_file_test.go index 7219fdf6..207bf869 100644 --- a/pkg/appendable/index_file_test.go +++ b/pkg/appendable/index_file_test.go @@ -1,17 +1,5 @@ package appendable -import ( - "bytes" - "fmt" - "log/slog" - "os" - "reflect" - "strings" - "testing" - - "github.com/kevmo314/appendable/pkg/protocol" -) - /* This test file performs deep checks between two Index files. 
@@ -23,186 +11,186 @@ Current findings when comparing: jsonl <---> csv > the field length doesn't align, it seems like JSONL is accounting for "" for strings, while CSV measures raw string values */ -func TestIndexFile(t *testing.T) { +// func TestIndexFile(t *testing.T) { - originalLogger := slog.Default() +// originalLogger := slog.Default() - // Create a logger with Debug on - debugLevel := &slog.LevelVar{} - debugLevel.Set(slog.LevelDebug) - debugLogger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{ - Level: debugLevel, - })) +// // Create a logger with Debug on +// debugLevel := &slog.LevelVar{} +// debugLevel.Set(slog.LevelDebug) +// debugLogger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{ +// Level: debugLevel, +// })) - slog.SetDefault(debugLogger) +// slog.SetDefault(debugLogger) - defer slog.SetDefault(originalLogger) +// defer slog.SetDefault(originalLogger) - mockJsonl := "{\"h1\":\"test1\", \"h2\":37.3}\n" - mockJsonl2 := "{\"h1\":\"test1\", \"h2\":37.3}\n{\"h1\":\"test3\", \"h2\":4}\n" +// mockJsonl := "{\"h1\":\"test1\", \"h2\":37.3}\n" +// mockJsonl2 := "{\"h1\":\"test1\", \"h2\":37.3}\n{\"h1\":\"test3\", \"h2\":4}\n" - mockCsv := "h1,h2\ntest1,37.3\n" - mockCsv2 := "h1,h2\ntest1,37.3\ntest3,4\n" +// mockCsv := "h1,h2\ntest1,37.3\n" +// mockCsv2 := "h1,h2\ntest1,37.3\ntest3,4\n" - t.Run("compare mock index file", func(t *testing.T) { - // jsonl - jif, err := NewIndexFile(JSONLHandler{ReadSeeker: strings.NewReader(mockJsonl2)}) +// t.Run("compare mock index file", func(t *testing.T) { +// // jsonl +// jif, err := NewIndexFile(JSONLHandler{ReadSeeker: strings.NewReader(mockJsonl2)}) - if err != nil { - t.Fatal(err) - } +// if err != nil { +// t.Fatal(err) +// } - civ, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader(mockCsv2)}) +// civ, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader(mockCsv2)}) - if err != nil { - t.Fatal(err) - } +// if err != nil { +// t.Fatal(err) +// } - status, res := jif.compareTo(civ) +// status, res := jif.compareTo(civ) - if !status { - t.Errorf("Not equal\n%v", res) - } +// if !status { +// t.Errorf("Not equal\n%v", res) +// } - }) +// }) - t.Run("compare mock index file after appending", func(t *testing.T) { - jif, err := NewIndexFile(JSONLHandler{ReadSeeker: strings.NewReader(mockJsonl)}) - if err != nil { - t.Fatal(err) - } - - jbuf := &bytes.Buffer{} +// t.Run("compare mock index file after appending", func(t *testing.T) { +// jif, err := NewIndexFile(JSONLHandler{ReadSeeker: strings.NewReader(mockJsonl)}) +// if err != nil { +// t.Fatal(err) +// } + +// jbuf := &bytes.Buffer{} - if err := jif.Serialize(jbuf); err != nil { - t.Fatal(err) - } - - civ, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader(mockCsv)}) - if err != nil { - t.Fatal(err) - } - - cbuf := &bytes.Buffer{} - if err := civ.Serialize(cbuf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(jbuf, JSONLHandler{ReadSeeker: strings.NewReader(mockJsonl2)}) - if err != nil { - t.Fatal(err) - } - - c, err := ReadIndexFile(cbuf, CSVHandler{ReadSeeker: strings.NewReader(mockCsv2)}) - if err != nil { - t.Fatal(err) - } - status, res := j.compareTo(c) - - fmt.Printf("%v", c) +// if err := jif.Serialize(jbuf); err != nil { +// t.Fatal(err) +// } + +// civ, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader(mockCsv)}) +// if err != nil { +// t.Fatal(err) +// } + +// cbuf := &bytes.Buffer{} +// if err := civ.Serialize(cbuf); err != nil { +// t.Fatal(err) +// } + +// j, err := ReadIndexFile(jbuf, 
JSONLHandler{ReadSeeker: strings.NewReader(mockJsonl2)}) +// if err != nil { +// t.Fatal(err) +// } + +// c, err := ReadIndexFile(cbuf, CSVHandler{ReadSeeker: strings.NewReader(mockCsv2)}) +// if err != nil { +// t.Fatal(err) +// } +// status, res := j.compareTo(c) + +// fmt.Printf("%v", c) - if !status { - t.Errorf("Not equal\n%v", res) - } +// if !status { +// t.Errorf("Not equal\n%v", res) +// } - }) +// }) -} +// } -func compareIndexRecord(ir1, ir2 *protocol.IndexRecord, fieldType protocol.FieldType) (bool, string) { - if ir1.DataNumber != ir2.DataNumber { - return false, fmt.Sprintf("Index record data numbers do not align\ti1: %v, i2: %v", ir1.DataNumber, ir2.DataNumber) - } +// func compareIndexRecord(ir1, ir2 *protocol.IndexRecord, fieldType protocol.FieldType) (bool, string) { +// if ir1.DataNumber != ir2.DataNumber { +// return false, fmt.Sprintf("Index record data numbers do not align\ti1: %v, i2: %v", ir1.DataNumber, ir2.DataNumber) +// } - if fieldType&protocol.FieldTypeString != protocol.FieldTypeString { - if ir1.FieldLength != ir2.FieldLength { - return false, fmt.Sprintf("Field Length do not align\ti1: %v, i2: %v", ir1.FieldLength, ir2.FieldLength) - } - } else { - if ir1.FieldLength != ir2.FieldLength+2 { - return false, fmt.Sprintf("Field Length do not align\ti1: %v, i2: %v", ir1.FieldLength, ir2.FieldLength) - } - } - return true, "" -} - -func (i1 *Index) compareIndex(i2 *Index) (bool, string) { - // compare fieldname - if i1.FieldName != i2.FieldName { - return false, fmt.Sprintf("field names do not align\ti1: %v, i2: %v", i1.FieldName, i2.FieldName) - } +// if fieldType&protocol.FieldTypeString != protocol.FieldTypeString { +// if ir1.FieldLength != ir2.FieldLength { +// return false, fmt.Sprintf("Field Length do not align\ti1: %v, i2: %v", ir1.FieldLength, ir2.FieldLength) +// } +// } else { +// if ir1.FieldLength != ir2.FieldLength+2 { +// return false, fmt.Sprintf("Field Length do not align\ti1: %v, i2: %v", ir1.FieldLength, ir2.FieldLength) +// } +// } +// return true, "" +// } + +// func (i1 *Index) compareIndex(i2 *Index) (bool, string) { +// // compare fieldname +// if i1.FieldName != i2.FieldName { +// return false, fmt.Sprintf("field names do not align\ti1: %v, i2: %v", i1.FieldName, i2.FieldName) +// } - // compare fieldtype - if i1.FieldType != i2.FieldType { - return false, fmt.Sprintf("field types do not align\ti1: %v, i2: %v", i1.FieldType, i2.FieldType) - } - - // compare index records - if len(i1.IndexRecords) != len(i2.IndexRecords) { - return false, fmt.Sprintf("index record lengths do not line up\ti1: %v, i2: %v", len(i1.IndexRecords), len(i2.IndexRecords)) - } - - for key, records1 := range i1.IndexRecords { - records2, ok := i2.IndexRecords[key] - - var keysAndTypesI1, keysAndTypesI2 []string - - // Iterate through i1.IndexRecords to collect keys and their types - for key := range i1.IndexRecords { - keyType := reflect.TypeOf(key).String() // Get the type of the key as a string - keysAndTypesI1 = append(keysAndTypesI1, fmt.Sprintf("%v (%s)", key, keyType)) - } - - // Iterate through i2.IndexRecords to collect keys and their types - for key := range i2.IndexRecords { - keyType := reflect.TypeOf(key).String() // Get the type of the key as a string - keysAndTypesI2 = append(keysAndTypesI2, fmt.Sprintf("%v (%s)", key, keyType)) - } - - if !ok { - return false, fmt.Sprintf("key doesn't exist in i2\tkey found in i1: %v\n%v\t%v\n%v\t%v", key, i1.IndexRecords, i2.IndexRecords, keysAndTypesI1, keysAndTypesI2) - } - - for i := range records1 { - status, res 
:= compareIndexRecord(&records1[i], &records2[i], i1.FieldType) - if !status { - return false, res - } - } - } - - return true, "" -} - -func (i1 *IndexFile) compareTo(i2 *IndexFile) (bool, string) { - // check versions - if i1.Version != i2.Version { - return false, fmt.Sprintf("versions mismatched\ti1: %v, i2: %v", i1.Version, i2.Version) - } - - if len(i1.Indexes) != len(i2.Indexes) { - return false, fmt.Sprintf("indexes length not equal\ti1: %v, i2: %v", len(i1.Indexes), len(i2.Indexes)) - } - - for i, index1 := range i1.Indexes { - index2 := i2.Indexes[i] - - status, res := index1.compareIndex(&index2) - - if !status { - return false, res - } - } - - if len(i1.EndByteOffsets) != len(i2.EndByteOffsets) { - return false, fmt.Sprintf("endbyteoffsets length not equal\ti1: %v, i2: %v", len(i1.EndByteOffsets), len(i2.EndByteOffsets)) - } - - fmt.Printf("endbyteoffsets equal") - - if len(i1.Checksums) != len(i2.Checksums) { - return false, fmt.Sprintf("checksums length not equal\ti1: %v, i2: %v", len(i1.Checksums), len(i2.Checksums)) - } - - return true, "great success!" -} +// // compare fieldtype +// if i1.FieldType != i2.FieldType { +// return false, fmt.Sprintf("field types do not align\ti1: %v, i2: %v", i1.FieldType, i2.FieldType) +// } + +// // compare index records +// if len(i1.IndexRecords) != len(i2.IndexRecords) { +// return false, fmt.Sprintf("index record lengths do not line up\ti1: %v, i2: %v", len(i1.IndexRecords), len(i2.IndexRecords)) +// } + +// for key, records1 := range i1.IndexRecords { +// records2, ok := i2.IndexRecords[key] + +// var keysAndTypesI1, keysAndTypesI2 []string + +// // Iterate through i1.IndexRecords to collect keys and their types +// for key := range i1.IndexRecords { +// keyType := reflect.TypeOf(key).String() // Get the type of the key as a string +// keysAndTypesI1 = append(keysAndTypesI1, fmt.Sprintf("%v (%s)", key, keyType)) +// } + +// // Iterate through i2.IndexRecords to collect keys and their types +// for key := range i2.IndexRecords { +// keyType := reflect.TypeOf(key).String() // Get the type of the key as a string +// keysAndTypesI2 = append(keysAndTypesI2, fmt.Sprintf("%v (%s)", key, keyType)) +// } + +// if !ok { +// return false, fmt.Sprintf("key doesn't exist in i2\tkey found in i1: %v\n%v\t%v\n%v\t%v", key, i1.IndexRecords, i2.IndexRecords, keysAndTypesI1, keysAndTypesI2) +// } + +// for i := range records1 { +// status, res := compareIndexRecord(&records1[i], &records2[i], i1.FieldType) +// if !status { +// return false, res +// } +// } +// } + +// return true, "" +// } + +// func (i1 *IndexFile) compareTo(i2 *IndexFile) (bool, string) { +// // check versions +// if i1.Version != i2.Version { +// return false, fmt.Sprintf("versions mismatched\ti1: %v, i2: %v", i1.Version, i2.Version) +// } + +// if len(i1.Indexes) != len(i2.Indexes) { +// return false, fmt.Sprintf("indexes length not equal\ti1: %v, i2: %v", len(i1.Indexes), len(i2.Indexes)) +// } + +// for i, index1 := range i1.Indexes { +// index2 := i2.Indexes[i] + +// status, res := index1.compareIndex(&index2) + +// if !status { +// return false, res +// } +// } + +// if len(i1.EndByteOffsets) != len(i2.EndByteOffsets) { +// return false, fmt.Sprintf("endbyteoffsets length not equal\ti1: %v, i2: %v", len(i1.EndByteOffsets), len(i2.EndByteOffsets)) +// } + +// fmt.Printf("endbyteoffsets equal") + +// if len(i1.Checksums) != len(i2.Checksums) { +// return false, fmt.Sprintf("checksums length not equal\ti1: %v, i2: %v", len(i1.Checksums), len(i2.Checksums)) +// } + +// return true, 
"great success!" +// } diff --git a/pkg/appendable/io.go b/pkg/appendable/io.go deleted file mode 100644 index 86739cbf..00000000 --- a/pkg/appendable/io.go +++ /dev/null @@ -1,318 +0,0 @@ -package appendable - -import ( - "bytes" - "encoding/binary" - "fmt" - "io" - "log/slog" - "sort" - "strings" - - "github.com/cespare/xxhash/v2" - "github.com/kevmo314/appendable/pkg/encoding" - "github.com/kevmo314/appendable/pkg/protocol" -) - -type DataHandler interface { - io.ReadSeeker - Synchronize(f *IndexFile) error -} - -func NewIndexFile(data DataHandler) (*IndexFile, error) { - f := &IndexFile{ - Version: CurrentVersion, - Indexes: []Index{}, - data: data, - } - return f, data.Synchronize(f) -} - -func ReadIndexFile(r io.Reader, data DataHandler) (*IndexFile, error) { - f := &IndexFile{} - - f.data = data - - slog.Debug("Starting ReadIndexFile") - - // read the version - version, err := encoding.ReadByte(r) - if err != nil { - return nil, fmt.Errorf("failed to read version: %w", err) - } - f.Version = protocol.Version(version) - - switch version { - case 1: - // read the index file header - ifh := protocol.IndexFileHeader{} - if ifh.IndexLength, err = encoding.ReadUint64(r); err != nil { - return nil, fmt.Errorf("failed to read index file header: %w", err) - } - if ifh.DataCount, err = encoding.ReadUint64(r); err != nil { - return nil, fmt.Errorf("failed to read index file header: %w", err) - } - - slog.Debug("headers", slog.Any("ifh", ifh)) - - // read the index headers - f.Indexes = []Index{} - br := 0 - recordCounts := []uint64{} - for br < int(ifh.IndexLength) { - var index Index - if index.FieldName, err = encoding.ReadString(r); err != nil { - return nil, fmt.Errorf("failed to read index header: %w", err) - } - ft, err := encoding.ReadUint64(r) - if err != nil { - return nil, fmt.Errorf("failed to read index header: %w", err) - } - index.FieldType = protocol.FieldType(ft) - recordCount, err := encoding.ReadUint64(r) - if err != nil { - return nil, fmt.Errorf("failed to read index header: %w", err) - } - recordCounts = append(recordCounts, recordCount) - index.IndexRecords = make(map[any][]protocol.IndexRecord) - f.Indexes = append(f.Indexes, index) - br += encoding.SizeString(index.FieldName) + binary.Size(ft) + binary.Size(uint64(0)) - - slog.Debug("", slog.Any("ih", index), slog.Any("recordCount", recordCount)) - } - if br != int(ifh.IndexLength) { - return nil, fmt.Errorf("expected to read %d bytes, read %d bytes", ifh.IndexLength, br) - } - - slog.Debug("Reading index headers done") - - // read the index records - for i, index := range f.Indexes { - - for j := 0; j < int(recordCounts[i]); j++ { - var ir protocol.IndexRecord - if ir.DataNumber, err = encoding.ReadUint64(r); err != nil { - return nil, fmt.Errorf("failed to read index record: %w", err) - } - if ir.FieldStartByteOffset, err = encoding.ReadUint64(r); err != nil { - return nil, fmt.Errorf("failed to read index record: %w", err) - } - if ir.FieldLength, err = encoding.UnpackFint16(r); err != nil { - return nil, fmt.Errorf("failed to read index record: %w", err) - } - - var value any - switch handler := data.(type) { - case JSONLHandler: - value, err = ir.Token(handler) - case CSVHandler: - value, err = ir.CSVField(handler) - default: - err = fmt.Errorf("unrecognized data handler type: %T", handler) - } - - if err != nil { - return nil, fmt.Errorf("failed to read index record: %w", err) - } - - slog.Debug("read index record", slog.Any("index", index.FieldName), slog.Any("any", value), slog.Any("record", ir)) - - switch 
value.(type) { - case nil, bool, int, int8, int16, int32, int64, float32, float64, string: - fmt.Printf("appending: %v", value) - index.IndexRecords[value] = append(index.IndexRecords[value], ir) - default: - return nil, fmt.Errorf("unsupported type: %T", value) - } - } - } - - // read the data ranges - f.EndByteOffsets = make([]uint64, ifh.DataCount) - for i := 0; i < int(ifh.DataCount); i++ { - if f.EndByteOffsets[i], err = encoding.ReadUint64(r); err != nil { - return nil, fmt.Errorf("failed to read data range: %w", err) - } - } - - // read the checksums - f.Checksums = make([]uint64, ifh.DataCount) - for i := 0; i < int(ifh.DataCount); i++ { - if f.Checksums[i], err = encoding.ReadUint64(r); err != nil { - return nil, fmt.Errorf("failed to read checksum: %w", err) - } - } - - startIndex := 0 - start := uint64(0) - if _, isCSV := data.(CSVHandler); isCSV && len(f.EndByteOffsets) > 0 { - start = f.EndByteOffsets[0] - startIndex = 1 - } - - for i := startIndex; i < int(ifh.DataCount); i++ { - - // this is a hotfix solution. It works great B) - if _, isCsv := data.(CSVHandler); isCsv { - if i > 1 { - start -= 1 - } - } - - // read the range from the data file to verify the checksum - if _, err := data.Seek(int64(start), io.SeekStart); err != nil { - return nil, fmt.Errorf("failed to seek data file: %w", err) - } - buf := &bytes.Buffer{} - - if _, err := io.CopyN(buf, data, int64(f.EndByteOffsets[i]-start-1)); err != nil { - return nil, fmt.Errorf("failed to read data file: %w", err) - } - - if xxhash.Sum64(buf.Bytes()) != f.Checksums[i] { - return nil, fmt.Errorf("checksum mismatch a %d, b %d", xxhash.Sum64(buf.Bytes()), f.Checksums[i]) - } - start = f.EndByteOffsets[i] + 1 - } - default: - return nil, fmt.Errorf("unsupported version: %d", version) - } - - // we've deserialized the underlying file, seek to the end of the last data range to prepare for appending - if len(f.EndByteOffsets) > 0 { - if _, err := data.Seek(int64(f.EndByteOffsets[len(f.EndByteOffsets)-1]), io.SeekStart); err != nil { - return nil, fmt.Errorf("failed to seek data file: %w", err) - } - } - slog.Debug("======") - return f, data.Synchronize(f) -} - -func (f *IndexFile) Serialize(w io.Writer) error { - // write the version - if err := encoding.WriteByte(w, byte(f.Version)); err != nil { - return fmt.Errorf("failed to write version: %w", err) - } - - dataCount := uint64(len(f.EndByteOffsets)) - indexLength := 0 - for _, index := range f.Indexes { - indexLength += encoding.SizeString(index.FieldName) - indexLength += binary.Size(index.FieldType) - indexLength += binary.Size(uint64(0)) - } - - // write the index file header - if err := encoding.WriteUint64(w, uint64(indexLength)); err != nil { - return fmt.Errorf("failed to write index length: %w", err) - } - if err := encoding.WriteUint64(w, dataCount); err != nil { - return fmt.Errorf("failed to write data count: %w", err) - } - - // write the index headers - for _, index := range f.Indexes { - if err := encoding.WriteString(w, index.FieldName); err != nil { - return fmt.Errorf("failed to write index field name: %w", err) - } - if err := encoding.WriteUint64(w, uint64(index.FieldType)); err != nil { - return fmt.Errorf("failed to write index field type: %w", err) - } - // total the number of index records - count := 0 - for _, records := range index.IndexRecords { - count += len(records) - } - if err := encoding.WriteUint64(w, uint64(count)); err != nil { - return fmt.Errorf("failed to write index record count: %w", err) - } - } - - // write the index records - for _, 
index := range f.Indexes { - var err error - keys := make([]any, len(index.IndexRecords)) - i := 0 - for key := range index.IndexRecords { - keys[i] = key - i++ - } - - sort.Slice(keys, func(i, j int) bool { - at, bt := keys[i], keys[j] - - switch f.data.(type) { - case CSVHandler: - - if atr, btr := fieldRankCsvField(at), fieldRankCsvField(bt); atr != btr { - return atr < btr - } - - switch at.(type) { - case nil: - return false - case bool: - return !at.(bool) && bt.(bool) - case int, int8, int16, int32, int64: - return at.(int) < bt.(int) - case float32, float64: - return at.(float64) < bt.(float64) - case string: - return strings.Compare(at.(string), bt.(string)) < 0 - default: - panic("unknown type") - } - - case JSONLHandler: - if atr, btr := fieldRank(at), fieldRank(bt); atr != btr { - return atr < btr - } - switch at.(type) { - case nil: - return false - case bool: - return !at.(bool) && bt.(bool) - case int, int8, int16, int32, int64, float32, float64: - return at.(float64) < bt.(float64) - case string: - return strings.Compare(at.(string), bt.(string)) < 0 - default: - panic("unknown type") - } - default: - panic("unknown handler") - } - }) - // iterate in key-ascending order - for _, key := range keys { - for _, item := range index.IndexRecords[key] { - if err = encoding.WriteUint64(w, item.DataNumber); err != nil { - return fmt.Errorf("failed to write index record: %w", err) - } - if err = encoding.WriteUint64(w, item.FieldStartByteOffset); err != nil { - return fmt.Errorf("failed to write index record: %w", err) - } - if err = encoding.PackFint16(w, item.FieldLength); err != nil { - return fmt.Errorf("failed to write index record: %w", err) - } - } - } - if err != nil { - return fmt.Errorf("failed to write index record: %w", err) - } - } - - // write the data ranges - for _, offset := range f.EndByteOffsets { - if err := encoding.WriteUint64(w, offset); err != nil { - return fmt.Errorf("failed to write data range: %w", err) - } - } - for _, checksum := range f.Checksums { - if err := encoding.WriteUint64(w, checksum); err != nil { - return fmt.Errorf("failed to write data range: %w", err) - } - } - - return nil -} diff --git a/pkg/appendable/io_test.go b/pkg/appendable/io_test.go deleted file mode 100644 index bf71b2ca..00000000 --- a/pkg/appendable/io_test.go +++ /dev/null @@ -1,17 +0,0 @@ -package appendable - -import ( - "errors" - "io" - "strings" - "testing" -) - -func TestReadIndexFile(t *testing.T) { - - t.Run("empty index file", func(t *testing.T) { - if _, err := ReadIndexFile(strings.NewReader(""), JSONLHandler{ReadSeeker: strings.NewReader("")}); !errors.Is(err, io.EOF) { - t.Errorf("expected EOF, got %v", err) - } - }) -} diff --git a/pkg/appendable/jsonl_handler.go b/pkg/appendable/jsonl_handler.go deleted file mode 100644 index 8565e8eb..00000000 --- a/pkg/appendable/jsonl_handler.go +++ /dev/null @@ -1,162 +0,0 @@ -package appendable - -import ( - "bufio" - "bytes" - "encoding/json" - "fmt" - "io" - "strings" - - "github.com/cespare/xxhash/v2" - "github.com/kevmo314/appendable/pkg/protocol" -) - -type JSONLHandler struct { - io.ReadSeeker -} - -func (j JSONLHandler) Synchronize(f *IndexFile) error { - - // read until the next newline - scanner := bufio.NewScanner(f.data) - for i := 0; scanner.Scan(); i++ { - line := scanner.Bytes() - - // create a new json decoder - dec := json.NewDecoder(bytes.NewReader(line)) - - existingCount := len(f.EndByteOffsets) - - // append a data range - var start uint64 - if len(f.EndByteOffsets) > 0 { - start = 
f.EndByteOffsets[existingCount-1] - } - f.EndByteOffsets = append(f.EndByteOffsets, start+uint64(len(line))+1) - f.Checksums = append(f.Checksums, xxhash.Sum64(line)) - - // if the first token is not {, then return an error - if t, err := dec.Token(); err != nil || t != json.Delim('{') { - return fmt.Errorf("expected '%U', got '%U' (only json objects are supported at the root)", '{', t) - } - - if err := f.handleJSONLObject(dec, []string{}, uint64(existingCount), start); err != nil { - return fmt.Errorf("failed to handle object: %w", err) - } - - // the next token must be a } - if t, err := dec.Token(); err != nil || t != json.Delim('}') { - return fmt.Errorf("expected '}', got '%v'", t) - } - } - - return nil -} - -func fieldRank(token json.Token) int { - switch token.(type) { - case nil: - return 1 - case bool: - return 2 - case int, int8, int16, int32, int64, float32, float64: - return 3 - case string: - return 4 - default: - panic("unknown type") - } -} - -func (i *IndexFile) handleJSONLObject(dec *json.Decoder, path []string, dataIndex, dataOffset uint64) error { - // while the next token is not }, read the key - for dec.More() { - key, err := dec.Token() - if err != nil { - return fmt.Errorf("failed to read token at index %d: %w", dataIndex, err) - } - - // key must be a string - if key, ok := key.(string); !ok { - return fmt.Errorf("expected string key, got '%v'", key) - } else { - fieldOffset := dec.InputOffset() + 1 // skip the : - - value, err := dec.Token() - if err != nil { - return fmt.Errorf("failed to read token: %w", err) - } - - name := strings.Join(append(path, key), ".") - - switch value := value.(type) { - case string, int, int8, int16, int32, int64, float32, float64, bool: - tree := i.Indexes[i.findIndex(name, value)].IndexRecords - // append this record to the list of records for this value - tree[value] = append(tree[value], protocol.IndexRecord{ - DataNumber: dataIndex, - FieldStartByteOffset: dataOffset + uint64(fieldOffset), - FieldLength: int(dec.InputOffset() - fieldOffset), - }) - - case json.Token: - switch value { - case json.Delim('['): - for j := range i.Indexes { - if i.Indexes[j].FieldName == name { - i.Indexes[j].FieldType |= protocol.FieldTypeArray - } - } - // arrays are not indexed yet because we need to incorporate - // subindexing into the specification. however, we have to - // skip tokens until we reach the end of the array. - depth := 1 - for { - t, err := dec.Token() - if err != nil { - return fmt.Errorf("failed to read token: %w", err) - } - - switch t { - case json.Delim('['): - depth++ - case json.Delim(']'): - depth-- - } - - if depth == 0 { - break - } - } - case json.Delim('{'): - // find the index to set the field type to unknown. 
- for j := range i.Indexes { - if i.Indexes[j].FieldName == name { - i.Indexes[j].FieldType |= protocol.FieldTypeObject - } - } - if err := i.handleJSONLObject(dec, append(path, key), dataIndex, dataOffset); err != nil { - return fmt.Errorf("failed to handle object: %w", err) - } - // read the } - if t, err := dec.Token(); err != nil || t != json.Delim('}') { - return fmt.Errorf("expected '}', got '%v'", t) - } - default: - return fmt.Errorf("unexpected token '%v'", value) - } - case nil: - // set the field to nullable if it's not already - for j := range i.Indexes { - if i.Indexes[j].FieldName == name { - i.Indexes[j].FieldType |= protocol.FieldTypeNull - } - } - default: - return fmt.Errorf("unexpected type '%T'", value) - } - } - } - return nil -} diff --git a/pkg/appendable/typescript.go b/pkg/appendable/typescript.go index 89fdbc9f..3a6b77d7 100644 --- a/pkg/appendable/typescript.go +++ b/pkg/appendable/typescript.go @@ -1,25 +1,23 @@ package appendable -import "io" +// func (f *IndexFile) WriteTypescriptDefinitions(w io.Writer) error { +// _, err := w.Write([]byte(`// This file was generated by github.com/kevmo314/appendable/pkg/appendable/typescript.go`)) +// if err != nil { +// return err +// } +// if _, err := w.Write([]byte("\n\nexport type Record = {\n")); err != nil { +// return err +// } +// // iterate over each field in the index header and generate a field for it +// for _, index := range f.Indexes { +// _, err := w.Write([]byte("\t\"" + index.FieldName + "\": " + index.FieldType.TypescriptType() + ";\n")) +// if err != nil { +// return err +// } +// } +// if _, err := w.Write([]byte("}\n")); err != nil { +// return err +// } -func (f *IndexFile) WriteTypescriptDefinitions(w io.Writer) error { - _, err := w.Write([]byte(`// This file was generated by github.com/kevmo314/appendable/pkg/appendable/typescript.go`)) - if err != nil { - return err - } - if _, err := w.Write([]byte("\n\nexport type Record = {\n")); err != nil { - return err - } - // iterate over each field in the index header and generate a field for it - for _, index := range f.Indexes { - _, err := w.Write([]byte("\t\"" + index.FieldName + "\": " + index.FieldType.TypescriptType() + ";\n")) - if err != nil { - return err - } - } - if _, err := w.Write([]byte("}\n")); err != nil { - return err - } - - return nil -} +// return nil +// } diff --git a/pkg/btree/bptree.go b/pkg/btree/bptree.go index b116e8a4..f895dacc 100644 --- a/pkg/btree/bptree.go +++ b/pkg/btree/bptree.go @@ -4,6 +4,7 @@ import ( "bytes" "fmt" "io" + "log" ) // MetaPage is an abstract interface over the root page of a btree @@ -19,6 +20,8 @@ type BPTree struct { meta MetaPage maxPageSize int + + Data io.ReaderAt } func NewBPTree(tree ReadWriteSeekPager, meta MetaPage) *BPTree { @@ -38,14 +41,15 @@ func (t *BPTree) root() (*BPTreeNode, MemoryPointer, error) { } func (t *BPTree) Find(key []byte) (MemoryPointer, bool, error) { - root, _, err := t.root() + root, rootOffset, err := t.root() if err != nil { return MemoryPointer{}, false, fmt.Errorf("read root node: %w", err) } if root == nil { return MemoryPointer{}, false, nil } - path, err := t.traverse(key, root) + log.Printf("finding for key %v at root %#v", key, rootOffset) + path, err := t.traverse(key, root, rootOffset) if err != nil { return MemoryPointer{}, false, err } @@ -61,7 +65,7 @@ func (t *BPTree) readNode(ptr MemoryPointer) (*BPTreeNode, error) { if _, err := t.tree.Seek(int64(ptr.Offset), io.SeekStart); err != nil { return nil, err } - node := &BPTreeNode{} + node := &BPTreeNode{Data: 
t.Data} if _, err := node.ReadFrom(t.tree); err != nil { return nil, err } @@ -71,36 +75,47 @@ func (t *BPTree) readNode(ptr MemoryPointer) (*BPTreeNode, error) { type TraversalRecord struct { node *BPTreeNode index int + // the offset is useful so we know which page to free when we split + ptr MemoryPointer } // traverse returns the path from root to leaf in reverse order (leaf first) // the last element is always the node passed in -func (t *BPTree) traverse(key []byte, node *BPTreeNode) ([]*TraversalRecord, error) { +func (t *BPTree) traverse(key []byte, node *BPTreeNode, ptr MemoryPointer) ([]*TraversalRecord, error) { if node.leaf() { - return []*TraversalRecord{{node: node}}, nil + return []*TraversalRecord{{node: node, ptr: ptr}}, nil } for i, k := range node.Keys { if bytes.Compare(key, k.Value) < 0 { + if node.Pointers[i].Offset == ptr.Offset { + log.Printf("infinite loop index %d", i) + log.Printf("%#v", node) + log.Printf("node offset %#v ptr offset %#v", node.Pointers[i].Offset, ptr.Offset) + panic("infinite loop") + } child, err := t.readNode(node.Pointers[i]) if err != nil { return nil, err } - path, err := t.traverse(key, child) + path, err := t.traverse(key, child, node.Pointers[i]) if err != nil { return nil, err } - return append(path, &TraversalRecord{node: node, index: i}), nil + return append(path, &TraversalRecord{node: node, index: i, ptr: node.Pointers[i]}), nil } } + if node.Pointers[len(node.Pointers)-1].Offset == ptr.Offset { + panic("infinite loop 2") + } child, err := t.readNode(node.Pointers[len(node.Pointers)-1]) if err != nil { return nil, err } - path, err := t.traverse(key, child) + path, err := t.traverse(key, child, node.Pointers[len(node.Pointers)-1]) if err != nil { return nil, err } - return append(path, &TraversalRecord{node: node, index: len(node.Keys)}), nil + return append(path, &TraversalRecord{node: node, index: len(node.Keys), ptr: node.Pointers[len(node.Pointers)-1]}), nil } func (t *BPTree) Insert(key ReferencedValue, value MemoryPointer) error { @@ -110,20 +125,22 @@ func (t *BPTree) Insert(key ReferencedValue, value MemoryPointer) error { } if root == nil { // special case, create the root as the first node - offset, err := t.tree.NewPage() + node := &BPTreeNode{Data: t.Data} + node.Keys = []ReferencedValue{key} + node.Pointers = []MemoryPointer{value} + buf, err := node.MarshalBinary() if err != nil { return err } - node := &BPTreeNode{} - node.Keys = []ReferencedValue{key} - node.Pointers = []MemoryPointer{value} - length, err := node.WriteTo(t.tree) + offset, err := t.tree.NewPage(buf) if err != nil { return err } - return t.meta.SetRoot(MemoryPointer{Offset: uint64(offset), Length: uint32(length)}) + return t.meta.SetRoot(MemoryPointer{Offset: uint64(offset), Length: uint32(len(buf))}) } - path, err := t.traverse(key.Value, root) + + log.Printf("traversing for key %v at root %v", key.Value, rootOffset) + path, err := t.traverse(key.Value, root, rootOffset) if err != nil { return err } @@ -147,17 +164,12 @@ func (t *BPTree) Insert(key ReferencedValue, value MemoryPointer) error { n := tr.node if int(n.Size()) > t.tree.PageSize() { // split the node - moffset, err := t.tree.NewPage() - if err != nil { - return err - } - // mid is the key that will be inserted into the parent mid := len(n.Keys) / 2 midKey := n.Keys[mid] // n is the left node, m the right node - m := &BPTreeNode{} + m := &BPTreeNode{Data: t.Data} if n.leaf() { m.Pointers = n.Pointers[mid:] m.Keys = n.Keys[mid:] @@ -166,7 +178,11 @@ func (t *BPTree) Insert(key 
ReferencedValue, value MemoryPointer) error { m.Pointers = n.Pointers[mid+1:] m.Keys = n.Keys[mid+1:] } - msize, err := m.WriteTo(t.tree) + mbuf, err := m.MarshalBinary() + if err != nil { + return err + } + moffset, err := t.tree.NewPage(mbuf) if err != nil { return err } @@ -179,70 +195,68 @@ func (t *BPTree) Insert(key ReferencedValue, value MemoryPointer) error { n.Keys = n.Keys[:mid] } - noffset, err := t.tree.NewPage() + nbuf, err := n.MarshalBinary() if err != nil { return err } - nsize, err := n.WriteTo(t.tree) - if err != nil { + noffset := tr.ptr.Offset + if _, err := t.tree.Seek(int64(noffset), io.SeekStart); err != nil { + return err + } + if _, err := t.tree.Write(nbuf); err != nil { return err } // update the parent if i < len(path)-1 { p := path[i+1] - j, _ := p.node.bsearch(midKey.Value) - if j != p.index { - // j should be equal to p.index...? - // if this panic never happens then we can probably remove the above bsearch. - panic("this assumption apparently isn't true") - } // insert the key into the parent - if j == len(p.node.Keys) { + if p.index == len(p.node.Keys) { p.node.Keys = append(p.node.Keys, midKey) } else { - p.node.Keys = append(p.node.Keys[:j+1], p.node.Keys[j:]...) - p.node.Keys[j] = midKey + p.node.Keys = append(p.node.Keys[:p.index+1], p.node.Keys[p.index:]...) + p.node.Keys[p.index] = midKey } - p.node.Pointers = append(p.node.Pointers[:j+1], p.node.Pointers[j:]...) - p.node.Pointers[j] = MemoryPointer{Offset: uint64(noffset), Length: uint32(nsize)} - p.node.Pointers[j+1] = MemoryPointer{Offset: uint64(moffset), Length: uint32(msize)} + p.node.Pointers = append(p.node.Pointers[:p.index+1], p.node.Pointers[p.index:]...) + p.node.Pointers[p.index] = MemoryPointer{Offset: uint64(noffset), Length: uint32(len(nbuf))} + p.node.Pointers[p.index+1] = MemoryPointer{Offset: uint64(moffset), Length: uint32(len(mbuf))} // the parent will be written to disk in the next iteration } else { - poffset := noffset + nsize - // create a new root - p := &BPTreeNode{Pointers: []MemoryPointer{rootOffset}} + // the root split, so create a new root + p := &BPTreeNode{Data: t.Data, Pointers: []MemoryPointer{rootOffset}} p.Keys = []ReferencedValue{midKey} p.Pointers = []MemoryPointer{ - {Offset: uint64(noffset), Length: uint32(nsize)}, - {Offset: uint64(moffset), Length: uint32(msize)}, + {Offset: uint64(noffset), Length: uint32(len(nbuf))}, + {Offset: uint64(moffset), Length: uint32(len(mbuf))}, } - psize, err := p.WriteTo(t.tree) + pbuf, err := p.MarshalBinary() + if err != nil { + return err + } + poffset, err := t.tree.NewPage(pbuf) if err != nil { return err } - return t.meta.SetRoot(MemoryPointer{Offset: uint64(poffset), Length: uint32(psize)}) + if err := t.meta.SetRoot(MemoryPointer{Offset: uint64(poffset), Length: uint32(len(pbuf))}); err != nil { + return err + } + return nil } } else { // write this node to disk and update the parent - offset, err := t.tree.NewPage() + buf, err := tr.node.MarshalBinary() if err != nil { return err } - length, err := tr.node.WriteTo(t.tree) - if err != nil { + if _, err := t.tree.Seek(int64(tr.ptr.Offset), io.SeekStart); err != nil { return err } - - if i < len(path)-1 { - p := path[i+1] - // update the parent at the index - p.node.Pointers[p.index] = MemoryPointer{Offset: uint64(offset), Length: uint32(length)} - } else { - // update the root - return t.meta.SetRoot(MemoryPointer{Offset: uint64(offset), Length: uint32(length)}) + if _, err := t.tree.Write(buf); err != nil { + return err } + // no new nodes were produced, so we can 
return here + return nil } } panic("unreachable") diff --git a/pkg/btree/bptree_test.go b/pkg/btree/bptree_test.go index 0af1dd4f..6b04d8c6 100644 --- a/pkg/btree/bptree_test.go +++ b/pkg/btree/bptree_test.go @@ -247,6 +247,20 @@ func TestBPTree(t *testing.T) { } }) + t.Run("identical insertion test", func(t *testing.T) { + b := buftest.NewSeekableBuffer() + p, err := NewPageFile(b) + if err != nil { + t.Fatal(err) + } + tree := NewBPTree(p, &testMetaPage{}) + for i := 0; i < 65536*4; i++ { + if err := tree.Insert(ReferencedValue{Value: []byte{1, 2, 3, 4, 5, 6, 7, 8}}, MemoryPointer{Offset: uint64(i)}); err != nil { + t.Fatal(err) + } + } + }) + // t.Run("bulk insert", func(t *testing.T) { // b := buftest.NewSeekableBuffer() // tree :=NewBPTree(b, 2) diff --git a/pkg/btree/multi.go b/pkg/btree/multi.go index 0e98a93c..af081255 100644 --- a/pkg/btree/multi.go +++ b/pkg/btree/multi.go @@ -1,6 +1,7 @@ package btree import ( + "encoding" "encoding/binary" "errors" "fmt" @@ -37,8 +38,18 @@ func (m *LinkedMetaPage) SetRoot(mp MemoryPointer) error { return binary.Write(m.rws, binary.LittleEndian, mp) } -func (m *LinkedMetaPage) BPTree() *BPTree { - return NewBPTree(m.rws, m) +// BPTree returns a B+ tree that uses this meta page as the root +// of the tree. If data is not nil, then it will be used as the +// data source for the tree. +// +// Generally, passing data is required, however if the tree +// consists of only inlined values, it is not necessary. +func (m *LinkedMetaPage) BPTree(data io.ReaderAt) *BPTree { + t := NewBPTree(m.rws, m) + if data != nil { + t.Data = data + } + return t } func (m *LinkedMetaPage) Metadata() ([]byte, error) { @@ -54,6 +65,14 @@ func (m *LinkedMetaPage) Metadata() ([]byte, error) { return buf[4 : 4+length], nil } +func (m *LinkedMetaPage) UnmarshalMetadata(bu encoding.BinaryUnmarshaler) error { + md, err := m.Metadata() + if err != nil { + return err + } + return bu.UnmarshalBinary(md) +} + func (m *LinkedMetaPage) SetMetadata(data []byte) error { if len(data) > m.rws.PageSize()-24 { return errors.New("metadata too large") @@ -69,6 +88,14 @@ func (m *LinkedMetaPage) SetMetadata(data []byte) error { return nil } +func (m *LinkedMetaPage) MarshalMetadata(bm encoding.BinaryMarshaler) error { + buf, err := bm.MarshalBinary() + if err != nil { + return err + } + return m.SetMetadata(buf) +} + func (m *LinkedMetaPage) Next() (*LinkedMetaPage, error) { if _, err := m.rws.Seek(int64(m.offset)+12, io.SeekStart); err != nil { return nil, err @@ -88,7 +115,7 @@ func (m *LinkedMetaPage) AddNext() (*LinkedMetaPage, error) { if curr.offset != ^uint64(0) { return nil, errors.New("next pointer already exists") } - offset, err := m.rws.NewPage() + offset, err := m.rws.NewPage(nil) if err != nil { return nil, err } diff --git a/pkg/btree/node.go b/pkg/btree/node.go index b5b6e521..4ea66c33 100644 --- a/pkg/btree/node.go +++ b/pkg/btree/node.go @@ -3,6 +3,7 @@ package btree import ( "bytes" "encoding/binary" + "fmt" "io" ) @@ -49,56 +50,56 @@ func (n *BPTreeNode) Size() int64 { return int64(size) } -func (n *BPTreeNode) WriteTo(w io.Writer) (int64, error) { +func (n *BPTreeNode) MarshalBinary() ([]byte, error) { size := int32(len(n.Keys)) + buf := make([]byte, n.Size()) // set the first bit to 1 if it's a leaf if n.leaf() { - if err := binary.Write(w, binary.BigEndian, -size); err != nil { - return 0, err - } + binary.BigEndian.PutUint32(buf[:4], uint32(-size)) } else { - if err := binary.Write(w, binary.BigEndian, size); err != nil { - return 0, err - } + 
binary.BigEndian.PutUint32(buf[:4], uint32(size)) + } + if size == 0 { + panic("writing empty node") } ct := 4 for _, k := range n.Keys { if k.DataPointer.Length > 0 { - if err := binary.Write(w, binary.BigEndian, uint32(0)); err != nil { - return 0, err - } - if err := binary.Write(w, binary.BigEndian, k.DataPointer); err != nil { - return 0, err - } + binary.BigEndian.PutUint32(buf[ct:ct+4], ^uint32(0)) + binary.BigEndian.PutUint64(buf[ct+4:ct+12], k.DataPointer.Offset) + binary.BigEndian.PutUint32(buf[ct+12:ct+16], k.DataPointer.Length) ct += 4 + 12 } else { - if err := binary.Write(w, binary.BigEndian, uint32(len(k.Value))); err != nil { - return 0, err - } - m, err := w.Write(k.Value) - if err != nil { - return 0, err + binary.BigEndian.PutUint32(buf[ct:ct+4], uint32(len(k.Value))) + m := copy(buf[ct+4:ct+4+len(k.Value)], k.Value) + if m != len(k.Value) { + return nil, fmt.Errorf("failed to copy key: %w", io.ErrShortWrite) } ct += m + 4 } } for _, p := range n.Pointers { - if err := binary.Write(w, binary.BigEndian, p); err != nil { - return 0, err - } + binary.BigEndian.PutUint64(buf[ct:ct+8], p.Offset) + binary.BigEndian.PutUint32(buf[ct+8:ct+12], p.Length) ct += 12 } if ct != int(n.Size()) { panic("size mismatch") } - return int64(ct), nil + return buf, nil } -func (n *BPTreeNode) ReadFrom(r io.Reader) (int64, error) { - var size int32 - if err := binary.Read(r, binary.BigEndian, &size); err != nil { +func (n *BPTreeNode) WriteTo(w io.Writer) (int64, error) { + buf, err := n.MarshalBinary() + if err != nil { return 0, err } + m, err := w.Write(buf) + return int64(m), err +} + +func (n *BPTreeNode) UnmarshalBinary(buf []byte) error { + size := int32(binary.BigEndian.Uint32(buf[:4])) leaf := size < 0 if leaf { n.Pointers = make([]MemoryPointer, -size) @@ -107,40 +108,47 @@ func (n *BPTreeNode) ReadFrom(r io.Reader) (int64, error) { n.Pointers = make([]MemoryPointer, size+1) n.Keys = make([]ReferencedValue, size) } + if size == 0 { + panic("empty node") + } + m := 4 for i := range n.Keys { - var l uint32 - if err := binary.Read(r, binary.BigEndian, &l); err != nil { - return 0, err - } - if l == 0 { + l := binary.BigEndian.Uint32(buf[m : m+4]) + if l == ^uint32(0) { // read the key out of the memory pointer stored at this position - if err := binary.Read(r, binary.BigEndian, n.Keys[i].DataPointer); err != nil { - return 0, err - } + n.Keys[i].DataPointer.Offset = binary.BigEndian.Uint64(buf[m+4 : m+12]) + n.Keys[i].DataPointer.Length = binary.BigEndian.Uint32(buf[m+12 : m+16]) n.Keys[i].Value = make([]byte, n.Keys[i].DataPointer.Length) if _, err := n.Data.ReadAt(n.Keys[i].Value, int64(n.Keys[i].DataPointer.Offset)); err != nil { - return 0, err + return fmt.Errorf("failed to read key: %w", err) } m += 4 + 12 } else { n.Keys[i].Value = make([]byte, l) - if _, err := io.ReadFull(r, n.Keys[i].Value); err != nil { - return 0, err + if o := copy(n.Keys[i].Value, buf[m+4:m+4+int(l)]); o != int(l) { + return fmt.Errorf("failed to copy key: %w", io.ErrShortWrite) } m += 4 + int(l) } } for i := range n.Pointers { - if err := binary.Read(r, binary.BigEndian, &n.Pointers[i].Offset); err != nil { - return 0, err - } - if err := binary.Read(r, binary.BigEndian, &n.Pointers[i].Length); err != nil { - return 0, err - } + n.Pointers[i].Offset = binary.BigEndian.Uint64(buf[m : m+8]) + n.Pointers[i].Length = binary.BigEndian.Uint32(buf[m+8 : m+12]) m += 12 } - return int64(m), nil + return nil +} + +func (n *BPTreeNode) ReadFrom(r io.Reader) (int64, error) { + buf := make([]byte, pageSizeBytes) + if 
_, err := r.Read(buf); err != nil && err != io.EOF { + return 0, err + } + if err := n.UnmarshalBinary(buf); err != nil { + return 0, err + } + return pageSizeBytes, nil } func (n *BPTreeNode) bsearch(key []byte) (int, bool) { diff --git a/pkg/btree/node_test.go b/pkg/btree/node_test.go new file mode 100644 index 00000000..b87a288d --- /dev/null +++ b/pkg/btree/node_test.go @@ -0,0 +1,76 @@ +package btree + +import ( + "bytes" + "reflect" + "testing" +) + +func TestBPTreeNode_ReadWriteLeaf(t *testing.T) { + // Create a test BPTreeNode + node1 := &BPTreeNode{ + Pointers: []MemoryPointer{ + {Offset: 0, Length: 1}, + {Offset: 1, Length: 2}, + {Offset: 2, Length: 3}, + }, + Keys: []ReferencedValue{ + {Value: []byte{0}}, + {Value: []byte{1, 2}}, + {Value: []byte{3, 4, 5}}, + }, + } + + buf := &bytes.Buffer{} + if _, err := node1.WriteTo(buf); err != nil { + t.Fatal(err) + } + + node2 := &BPTreeNode{} + if _, err := node2.ReadFrom(buf); err != nil { + t.Fatal(err) + } + + if !node2.leaf() { + t.Fatal("expected leaf node") + } + + if !reflect.DeepEqual(node1, node2) { + t.Fatalf("expected %#v, got %#v", node1, node2) + } +} + +func TestBPTreeNode_ReadWriteIntermediate(t *testing.T) { + // Create a test BPTreeNode + node1 := &BPTreeNode{ + Pointers: []MemoryPointer{ + {Offset: 0, Length: 1}, + {Offset: 1, Length: 2}, + {Offset: 2, Length: 3}, + {Offset: 3, Length: 4}, + }, + Keys: []ReferencedValue{ + {Value: []byte{0}}, + {Value: []byte{1, 2}}, + {Value: []byte{3, 4, 5}}, + }, + } + + buf := &bytes.Buffer{} + if _, err := node1.WriteTo(buf); err != nil { + t.Fatal(err) + } + + node2 := &BPTreeNode{} + if _, err := node2.ReadFrom(buf); err != nil { + t.Fatal(err) + } + + if node2.leaf() { + t.Fatal("expected intermediate node") + } + + if !reflect.DeepEqual(node1, node2) { + t.Fatalf("expected %#v, got %#v", node1, node2) + } +} diff --git a/pkg/btree/pagefile.go b/pkg/btree/pagefile.go index fe316dc9..b777efa1 100644 --- a/pkg/btree/pagefile.go +++ b/pkg/btree/pagefile.go @@ -10,7 +10,7 @@ type ReadWriteSeekPager interface { io.ReadWriteSeeker Page(int) (int64, error) - NewPage() (int64, error) + NewPage([]byte) (int64, error) FreePage(int64) error PageSize() int @@ -21,7 +21,8 @@ type PageFile struct { pageSize int // local cache of free pages to avoid reading from disk too often. 
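+ // freePageIndexes is used as a FIFO ring buffer: FreePage pushes a freed offset at freePageHead, FreePageIndex pops the oldest entry from the tail, and freePageCount tracks how many slots are currently occupied.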
- freePageIndexes [512]int64 + freePageIndexes [512]int64 + freePageHead, freePageCount int } var _ ReadWriteSeekPager = &PageFile{} @@ -46,12 +47,19 @@ func NewPageFile(rws io.ReadWriteSeeker) (*PageFile, error) { } if err == io.EOF { // allocate one page for the free page indexes - if _, err := rws.Write(make([]byte, pageSizeBytes)); err != nil { + if _, err := rws.Write(buf); err != nil { return nil, err } } else { for i := 0; i < len(pf.freePageIndexes); i++ { - pf.freePageIndexes[i] = int64(binary.BigEndian.Uint64(buf[i*8 : (i+1)*8])) + offset := int64(binary.BigEndian.Uint64(buf[i*8 : (i+1)*8])) + if offset != 0 { + pf.freePageIndexes[pf.freePageHead] = offset + pf.freePageHead = (pf.freePageHead + 1) % len(pf.freePageIndexes) + pf.freePageCount++ + } else { + break + } } } return pf, nil @@ -65,65 +73,106 @@ func (pf *PageFile) Page(i int) (int64, error) { return int64(i+1) * int64(pf.pageSize), nil } -func (pf *PageFile) NewPage() (int64, error) { - // if there are free pages, return the first one - for i := 0; i < len(pf.freePageIndexes); i++ { - if pf.freePageIndexes[i] != 0 { - offset := pf.freePageIndexes[i] - // zero out this free page index on disk - if _, err := pf.ReadWriteSeeker.Seek(int64(i*8), io.SeekStart); err != nil { - return 0, err - } - if _, err := pf.ReadWriteSeeker.Write(make([]byte, 8)); err != nil { - return 0, err - } - // seek to the free page - if _, err := pf.ReadWriteSeeker.Seek(offset, io.SeekStart); err != nil { - return 0, err - } - return offset, nil - } +func (pf *PageFile) writeFreePageIndices() error { + buf := make([]byte, len(pf.freePageIndexes)*8) + tail := (pf.freePageHead - pf.freePageCount + len(pf.freePageIndexes)) % len(pf.freePageIndexes) + for i := 0; i < pf.freePageCount; i++ { + offset := pf.freePageIndexes[tail+i] + binary.BigEndian.PutUint64(buf[i*8:(i+1)*8], uint64(offset)) + } + if _, err := pf.ReadWriteSeeker.Seek(0, io.SeekStart); err != nil { + return err + } + if _, err := pf.ReadWriteSeeker.Write(buf); err != nil { + return err + } + return nil +} + +func (pf *PageFile) FreePageIndex() (int64, error) { + // find the first free page index and return it + if pf.freePageCount == 0 { + return -1, nil + } + // pop from the tail + tail := (pf.freePageHead - pf.freePageCount + len(pf.freePageIndexes)) % len(pf.freePageIndexes) + offset := pf.freePageIndexes[tail] + pf.freePageIndexes[tail] = 0 + pf.freePageCount-- + + if err := pf.writeFreePageIndices(); err != nil { + return 0, err + } + + return offset, nil +} + +func (pf *PageFile) NewPage(buf []byte) (int64, error) { + if buf != nil && len(buf) > pf.pageSize { + return 0, errors.New("buffer is too large") } - // seek to the end of the file - offset, err := pf.ReadWriteSeeker.Seek(0, io.SeekEnd) + // if there are free pages, return the first one + offset, err := pf.FreePageIndex() if err != nil { return 0, err } + if offset != -1 { + // seek to the free page + if _, err := pf.ReadWriteSeeker.Seek(offset, io.SeekStart); err != nil { + return 0, err + } + } else { + n, err := pf.ReadWriteSeeker.Seek(0, io.SeekEnd) + if err != nil { + return 0, err + } + offset = n + } + // if the offset is not a multiple of the page size, we need to pad the file // with zeros to the next page boundary. 
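+ // for example, if the page size were 4096 bytes and the file ended at offset 5000, pad would be 4096 - (5000 % 4096) = 3192 and the new page would begin at offset 8192.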
+ var pad int64 if pf.pageSize > 0 && offset%int64(pf.pageSize) != 0 { // Calculate the number of bytes to pad - pad := int64(pf.pageSize) - (offset % int64(pf.pageSize)) + pad = int64(pf.pageSize) - (offset % int64(pf.pageSize)) // Write the padding if _, err := pf.Write(make([]byte, pad)); err != nil { return 0, err } - return offset + pad, nil } - return offset, nil + page := make([]byte, pf.pageSize) + if buf != nil { + copy(page, buf) + } + if _, err := pf.ReadWriteSeeker.Write(page); err != nil { + return 0, err + } + if _, err := pf.ReadWriteSeeker.Seek(offset, io.SeekStart); err != nil { + return 0, err + } + return offset + pad, nil } func (pf *PageFile) FreePage(offset int64) error { if offset%int64(pf.pageSize) != 0 { return errors.New("offset is not a multiple of the page size") } - // find the last nonzero free page index and insert it after that - for i := len(pf.freePageIndexes) - 1; i >= 0; i-- { - if pf.freePageIndexes[i] == 0 { - j := (i + 1) % len(pf.freePageIndexes) - pf.freePageIndexes[j] = offset - - // write the free page index to the last page - buf := make([]byte, 8) - binary.BigEndian.PutUint64(buf, uint64(offset)) - if _, err := pf.ReadWriteSeeker.Seek(int64(j*8), io.SeekStart); err != nil { - return err - } - return nil + if pf.freePageCount == len(pf.freePageIndexes) { + return errors.New("free page index is full") + } + // ensure it's not already in the free page index + for i := 0; i < pf.freePageCount; i++ { + if pf.freePageIndexes[(pf.freePageHead-pf.freePageCount+i+len(pf.freePageIndexes))%len(pf.freePageIndexes)] == offset { + return errors.New("page is already free") } } - return errors.New("too many free pages") + // push to the head + pf.freePageIndexes[pf.freePageHead] = offset + pf.freePageHead = (pf.freePageHead + 1) % len(pf.freePageIndexes) + pf.freePageCount++ + + return pf.writeFreePageIndices() } func (pf *PageFile) PageSize() int { diff --git a/pkg/btree/pagefile_test.go b/pkg/btree/pagefile_test.go index 7d99704c..e4257196 100644 --- a/pkg/btree/pagefile_test.go +++ b/pkg/btree/pagefile_test.go @@ -14,7 +14,7 @@ func TestPageFile(t *testing.T) { if err != nil { t.Fatal(err) } - offset, err := pf.NewPage() + offset, err := pf.NewPage(nil) if err != nil { t.Fatal(err) } @@ -23,52 +23,66 @@ func TestPageFile(t *testing.T) { } }) - t.Run("page size reuses page without allocation", func(t *testing.T) { + t.Run("page size allocates pages", func(t *testing.T) { buf := buftest.NewSeekableBuffer() pf, err := NewPageFile(buf) if err != nil { t.Fatal(err) } - offset1, err := pf.NewPage() + offset1, err := pf.NewPage(nil) if err != nil { t.Fatal(err) } if offset1 != pageSizeBytes { t.Fatalf("expected offset %d, got %d", pageSizeBytes, offset1) } - // since no data has been written, this page should be reused. 
- offset2, err := pf.NewPage() + // check the seek location + n, err := buf.Seek(0, io.SeekCurrent) if err != nil { t.Fatal(err) } - if offset2 != pageSizeBytes { + if n != pageSizeBytes { + t.Fatalf("expected offset %d, got %d", pageSizeBytes, n) + } + offset2, err := pf.NewPage(nil) + if err != nil { + t.Fatal(err) + } + if offset2 != pageSizeBytes*2 { t.Fatalf("expected offset %d, got %d", pageSizeBytes*2, offset2) } + m, err := buf.Seek(0, io.SeekCurrent) + if err != nil { + t.Fatal(err) + } + if m != pageSizeBytes*2 { + t.Fatalf("expected offset %d, got %d", pageSizeBytes*2, m) + } }) - t.Run("page size allocates second page", func(t *testing.T) { + t.Run("page size allocates page with data", func(t *testing.T) { buf := buftest.NewSeekableBuffer() pf, err := NewPageFile(buf) if err != nil { t.Fatal(err) } - offset1, err := pf.NewPage() + data := []byte("hello") + offset1, err := pf.NewPage(data) if err != nil { t.Fatal(err) } if offset1 != pageSizeBytes { t.Fatalf("expected offset %d, got %d", pageSizeBytes, offset1) } - // need to write at least one byte to trigger a new page. - if _, err := pf.Write(make([]byte, 1)); err != nil { + if _, err := pf.Seek(offset1, io.SeekStart); err != nil { t.Fatal(err) } - offset2, err := pf.NewPage() - if err != nil { + buf2 := make([]byte, len(data)) + if _, err := pf.Read(buf2); err != nil { t.Fatal(err) } - if offset2 != pageSizeBytes*2 { - t.Fatalf("expected offset %d, got %d", pageSizeBytes*2, offset2) + if string(buf2) != string(data) { + t.Fatalf("expected %s, got %s", string(data), string(buf2)) } }) @@ -78,7 +92,7 @@ func TestPageFile(t *testing.T) { if err != nil { t.Fatal(err) } - offset1, err := pf.NewPage() + offset1, err := pf.NewPage(nil) if err != nil { t.Fatal(err) } @@ -97,7 +111,7 @@ func TestPageFile(t *testing.T) { if err != nil { t.Fatal(err) } - offset1, err := pf.NewPage() + offset1, err := pf.NewPage(nil) if err != nil { t.Fatal(err) } @@ -108,18 +122,18 @@ func TestPageFile(t *testing.T) { if _, err := pf.Write(make([]byte, 1)); err != nil { t.Fatal(err) } - offset2, err := pf.NewPage() + offset2, err := pf.NewPage(nil) if err != nil { t.Fatal(err) } if offset2 != pageSizeBytes*2 { - t.Fatalf("expected offset %d, got %d", pageSizeBytes, offset2) + t.Fatalf("expected offset %d, got %d", 2*pageSizeBytes, offset2) } if err := pf.FreePage(offset1); err != nil { t.Fatal(err) } - offset3, err := pf.NewPage() + offset3, err := pf.NewPage(nil) if err != nil { t.Fatal(err) } @@ -127,4 +141,55 @@ func TestPageFile(t *testing.T) { t.Fatalf("expected offset %d, got %d", offset2, offset3) } }) + + t.Run("free page behaves like a circular buffer", func(t *testing.T) { + buf := buftest.NewSeekableBuffer() + pf, err := NewPageFile(buf) + if err != nil { + t.Fatal(err) + } + offsets := make([]int64, 0, 10) + for i := 0; i < 10; i++ { + offset, err := pf.NewPage(nil) + if err != nil { + t.Fatal(err) + } + if i > 0 && offset != offsets[i-1]+pageSizeBytes { + t.Fatalf("expected offset %d, got %d", offsets[i-1]+pageSizeBytes, offset) + } + offsets = append(offsets, offset) + } + for i := 0; i < 10; i++ { + if err := pf.FreePage(offsets[i]); err != nil { + t.Fatal(err) + } + } + for i := 0; i < 10; i++ { + offset, err := pf.NewPage(nil) + if err != nil { + t.Fatal(err) + } + if offset != offsets[i] { + t.Fatalf("expected offset %d, got %d", offsets[i], offset) + } + } + }) + + t.Run("cannot double free a page", func(t *testing.T) { + buf := buftest.NewSeekableBuffer() + pf, err := NewPageFile(buf) + if err != nil { + t.Fatal(err) + } + offset, 
err := pf.NewPage(nil) + if err != nil { + t.Fatal(err) + } + if err := pf.FreePage(offset); err != nil { + t.Fatal(err) + } + if err := pf.FreePage(offset); err == nil { + t.Fatal("expected error") + } + }) } diff --git a/pkg/handlers/csv.go b/pkg/handlers/csv.go new file mode 100644 index 00000000..c779cea8 --- /dev/null +++ b/pkg/handlers/csv.go @@ -0,0 +1,252 @@ +package handlers + +// import ( +// "bufio" +// "bytes" +// "encoding/binary" +// "encoding/csv" +// "fmt" +// "io" +// "log/slog" +// "math" +// "strconv" +// "strings" + +// "github.com/kevmo314/appendable/pkg/appendable" +// "github.com/kevmo314/appendable/pkg/btree" +// ) + +// type CSVHandler struct { +// io.ReadSeeker +// } + +// var _ appendable.DataHandler = (*CSVHandler)(nil) + +// func (c CSVHandler) Format() appendable.Format { +// return appendable.FormatCSV +// } + +// func (c CSVHandler) Synchronize(f *appendable.IndexFile, df appendable.DataFile) error { +// slog.Debug("Starting CSV synchronization") + +// var headers []string +// var err error + +// metadata, err := f.Metadata() +// if err != nil { +// return fmt.Errorf("failed to read metadata: %w", err) +// } +// if _, err := df.Seek(int64(metadata.ReadOffset), io.SeekStart); err != nil { +// return fmt.Errorf("failed to seek: %w", err) +// } + +// fromNewIndexFile := false + +// isHeader := false + +// isEmpty, err := f.IsEmpty() +// if err != nil { +// return fmt.Errorf("failed to check if index file is empty: %w", err) +// } + +// if isEmpty { +// isHeader = true +// fromNewIndexFile = true +// } else { +// slog.Debug("indexes already exist, not parsing headers") +// for _, index := range f.Indexes { +// isHeader = false +// headers = append(headers, index.FieldName) +// } +// } + +// scanner := bufio.NewScanner(df) + +// for scanner.Scan() { +// line := scanner.Bytes() + +// existingCount := len(f.EndByteOffsets) + +// // append a data range +// var start uint64 +// if len(f.EndByteOffsets) > 0 { +// start = f.EndByteOffsets[existingCount-1] +// } + +// slog.Debug("", slog.Uint64("start", start)) + +// slog.Debug("adding", slog.Any("endbyteoffset", start+uint64(len(line))), slog.Any("line", line)) + +// if isHeader { +// slog.Info("Parsing CSV headers") +// dec := csv.NewReader(bytes.NewReader(line)) +// headers, err = dec.Read() +// if err != nil { +// slog.Error("failed to parse CSV header", "error", err) +// return fmt.Errorf("failed to parse CSV header: %w", err) +// } +// isHeader = false +// continue +// } + +// dec := csv.NewReader(bytes.NewReader(line)) +// slog.Debug("Handling csv", "line", i) + +// if fromNewIndexFile { + +// handleCSVLine(f, df, dec, headers, []string{}, btree.MemoryPointer{ +// Offset: start, +// Length: uint32(len(line)), +// }) +// } else { + +// handleCSVLine(f, df, dec, headers, []string{}, uint64(existingCount) +// } + +// metadata.ReadOffset += uint64(len(line)) + 1 // include the newline +// } + +// if err := scanner.Err(); err != nil { +// return fmt.Errorf("failed to scan: %w", err) +// } + +// // update the metadata +// if err := f.SetMetadata(metadata); err != nil { +// return fmt.Errorf("failed to set metadata: %w", err) +// } + +// slog.Debug("indexes", slog.Any("", f.Indexes)) +// slog.Debug("Ending CSV synchronization") +// slog.Debug("=========") +// return nil +// } + +// func fieldRankCsvField(fieldValue any) int { +// slog.Debug("serialize", slog.Any("fieldValue", fieldValue)) +// switch fieldValue.(type) { +// case nil: +// slog.Debug("nil", slog.Any("fieldValue", fieldValue)) +// return 1 +// case bool: 
+// slog.Debug("bool", slog.Any("fieldValue", fieldValue)) +// return 2 +// case int, int8, int16, int32, int64, float32, float64: +// slog.Debug("number", slog.Any("fieldValue", fieldValue)) +// return 3 +// case string: +// slog.Debug("string", slog.Any("fieldValue", fieldValue)) +// return 4 +// default: +// panic("unknown type") +// } +// } + +// func InferCSVField(fieldValue string) (interface{}, appendable.FieldType) { +// if fieldValue == "" { +// return nil, appendable.FieldTypeNull +// } + +// if i, err := strconv.Atoi(fieldValue); err == nil { + +// fmt.Printf("\n%v is a integer\n", fieldValue) +// return float64(i), appendable.FieldTypeFloat64 +// } + +// if f, err := strconv.ParseFloat(fieldValue, 64); err == nil { + +// fmt.Printf("\n%v is a float\n", fieldValue) +// return float64(f), appendable.FieldTypeFloat64 +// } + +// if b, err := strconv.ParseBool(fieldValue); err == nil { +// return b, appendable.FieldTypeBoolean +// } + +// return fieldValue, appendable.FieldTypeString +// } + +// func handleCSVLine(f *appendable.IndexFile, r io.ReaderAt, dec *csv.Reader, headers []string, path []string, data btree.MemoryPointer) error { +// slog.Debug("Processing CSV line", slog.Int("dataIndex", int(dataIndex)), slog.Int("dataOffset", int(dataOffset))) + +// record, err := dec.Read() + +// if err != nil { +// slog.Error("Failed to read CSV record at index", "dataIndex", dataIndex, "error", err) +// return fmt.Errorf("failed to read CSV record at index %d: %w", dataIndex, err) +// } + +// slog.Debug("CSV line read successfully", "record", record) + +// cumulativeLength := uint64(0) + +// for fieldIndex, fieldValue := range record { +// if fieldIndex >= len(headers) { +// slog.Error("Field index is out of bounds with headers", "fieldIndex", fieldIndex, "headers", slog.Any("headers", headers)) +// return fmt.Errorf("field index %d is out of bounds with header", fieldIndex) +// } + +// fieldName := headers[fieldIndex] +// name := strings.Join(append(path, fieldName), ".") + +// fieldOffset := data.Offset + cumulativeLength +// fieldLength := uint32(len(fieldValue)) + +// value, fieldType := InferCSVField(fieldValue) + +// page, err := f.FindOrCreateIndex(name, fieldType) +// if err != nil { +// return fmt.Errorf("failed to find or create index: %w", err) +// } + +// switch fieldType { +// case appendable.FieldTypeFloat64: +// buf := make([]byte, 8) +// binary.LittleEndian.PutUint64(buf, math.Float64bits(value.(float64))) +// if err := page.BPTree(r).Insert(btree.ReferencedValue{Value: buf}, data); err != nil { +// return fmt.Errorf("failed to insert into b+tree: %w", err) +// } +// case appendable.FieldTypeBoolean: +// if value.(bool) { +// if err := page.BPTree(r).Insert(btree.ReferencedValue{Value: []byte{1}}, data); err != nil { +// return fmt.Errorf("failed to insert into b+tree: %w", err) +// } +// } else { +// if err := page.BPTree(r).Insert(btree.ReferencedValue{Value: []byte{0}}, data); err != nil { +// return fmt.Errorf("failed to insert into b+tree: %w", err) +// } +// } +// case appendable.FieldTypeString: +// if err := page.BPTree(r).Insert(btree.ReferencedValue{ +// DataPointer: btree.MemoryPointer{ +// Offset: fieldOffset, +// Length: fieldLength, +// }, +// // trim the quotes +// Value: []byte(value.(string)), +// }, data); err != nil { +// return fmt.Errorf("failed to insert into b+tree: %w", err) +// } + +// slog.Debug("Appended index record", +// slog.String("field", name), +// slog.Any("value", value), +// slog.Int("start", int(fieldOffset))) + +// case 
appendable.FieldTypeNull: +// // nil values are a bit of a degenerate case, we are essentially using the btree +// // as a set. we store the value as an empty byte slice. +// if err := page.BPTree(r).Insert(btree.ReferencedValue{Value: []byte{}}, data); err != nil { +// return fmt.Errorf("failed to insert into b+tree: %w", err) +// } +// slog.Debug("Marked field", "name", name) + +// default: +// slog.Error("Encountered unexpected type '%T' for field '%s'", value, name) +// return fmt.Errorf("unexpected type '%T'", value) +// } + +// cumulativeLength += uint64(fieldLength + 1) +// } + +// return nil +// } diff --git a/pkg/handlers/csv_test.go b/pkg/handlers/csv_test.go new file mode 100644 index 00000000..ca11c5ff --- /dev/null +++ b/pkg/handlers/csv_test.go @@ -0,0 +1,308 @@ +package handlers + +// import ( +// "bytes" +// "fmt" +// "log/slog" +// "os" +// "reflect" +// "strings" +// "testing" + +// "github.com/kevmo314/appendable/pkg/protocol" +// ) + +// func TestAppendDataRowCSV(t *testing.T) { + +// originalLogger := slog.Default() + +// // Create a logger with Debug on +// debugLevel := &slog.LevelVar{} +// debugLevel.Set(slog.LevelDebug) +// debugLogger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{ +// Level: debugLevel, +// })) + +// slog.SetDefault(debugLogger) + +// defer slog.SetDefault(originalLogger) + +// var mockCsv string = "header1\ntest1\n" +// var mockCsv2 string = "header1\ntest1\ntest3\n" + +// t.Run("no schema changes", func(t *testing.T) { + +// i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader(mockCsv)}) +// if err != nil { +// t.Fatal(err) +// } + +// buf := &bytes.Buffer{} + +// if err := i.Serialize(buf); err != nil { +// t.Fatal(err) +// } + +// j, err := ReadIndexFile(buf, CSVHandler{ReadSeeker: strings.NewReader(mockCsv2)}) +// if err != nil { +// t.Fatal(err) +// } + +// // check that the index file now has the additional data ranges but same number of indices +// if len(j.Indexes) != 1 { +// t.Errorf("got len(i.Indexes) = %d, want 1", len(i.Indexes)) +// } + +// if len(j.EndByteOffsets) != 2 { +// t.Errorf("got len(i.DataRanges) = %d, want 2", len(i.EndByteOffsets)) +// } + +// // check that the first data range is untouched despite being incorrect +// if j.EndByteOffsets[0] != uint64(len(mockCsv)) { +// t.Errorf("got i.DataRanges[0].EndByteOffset = %d, want %d", j.EndByteOffsets[0], uint64(len(mockCsv))) +// } + +// // check that the second data range has properly set offsets +// if j.EndByteOffsets[1] != uint64(len(mockCsv2)) { +// t.Errorf("got i.DataRanges[1].EndByteOffset = %d, want %d", j.EndByteOffsets[1], uint64(len(mockCsv2))) +// } +// }) + +// t.Run("check end + start byte offsets multiple", func(t *testing.T) { +// i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader(mockCsv2)}) +// if err != nil { +// t.Fatal(err) +// } + +// if len(i.Indexes) != 1 { +// t.Errorf("got len(i.Indexes) = %d, want 1", len(i.Indexes)) +// } + +// if len(i.Indexes[0].IndexRecords) != 2 { +// t.Errorf("got len(i.Indexes[0].IndexRecords) = %d, want 2", len(i.Indexes[0].IndexRecords)) +// } + +// if i.Indexes[0].IndexRecords["test1"][0].FieldStartByteOffset != uint64(len("header1\n")) { +// t.Errorf("got i.Indexes[0].IndexRecords[\"test1\"][0].FieldStartByteOffset = %d, want 7", i.Indexes[0].IndexRecords["test1"][0].FieldStartByteOffset) +// } + +// if i.Indexes[0].IndexRecords["test3"][0].FieldStartByteOffset != uint64(len("header1\ntest1\n")) { +// t.Errorf("got i.Indexes[0].IndexRecords[\"test3\"][0].FieldStartByteOffset = 
%d, want %d", i.Indexes[0].IndexRecords["test3"][0].FieldStartByteOffset, uint64(len("header\ntest1\n"))) +// } + +// }) + +// t.Run("append index to existing", func(t *testing.T) { +// i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader(mockCsv)}) +// if err != nil { +// t.Fatal(err) +// } + +// buf := &bytes.Buffer{} + +// if err := i.Serialize(buf); err != nil { +// t.Fatal(err) +// } + +// j, err := ReadIndexFile(buf, CSVHandler{ReadSeeker: strings.NewReader(mockCsv2)}) +// if err != nil { +// t.Fatal(err) +// } + +// // check that the index file now has the additional data ranges but same number of indices +// if len(j.Indexes) != 1 { +// t.Errorf("got len(j.Indexes) = %d, want 1", len(j.Indexes)) +// } + +// if len(j.Indexes[0].IndexRecords) != 2 { +// fmt.Printf("index records look like %v", j.Indexes[0].IndexRecords) +// t.Errorf("got len(j.Indexes[0].IndexRecords) = %d, want 2", len(j.Indexes[0].IndexRecords)) +// } + +// if len(j.Indexes[0].IndexRecords["test1"]) != 1 { +// t.Errorf("got len(j.Indexes[0].IndexRecords[\"test1\"]) = %d, want 1", len(j.Indexes[0].IndexRecords["test1"])) +// } +// if len(j.Indexes[0].IndexRecords["test3"]) != 1 { +// for key, records := range j.Indexes[0].IndexRecords { +// t.Errorf("\n\n\nKey: %v, Records: %+v", key, records) +// } +// t.Errorf("got len(j.Indexes[0].IndexRecords[\"test3\"]) = %d, want 1", len(j.Indexes[0].IndexRecords["test3"])) +// } + +// if j.Indexes[0].IndexRecords["test1"][0].DataNumber != 0 { +// t.Errorf("got i.Indexes[0].IndexRecords[\"test1\"][0].DataNumber = %d, want 0", j.Indexes[0].IndexRecords["test1"][0].DataNumber) +// } +// if j.Indexes[0].IndexRecords["test1"][0].FieldStartByteOffset != uint64(len("header1\n")) { +// t.Errorf("got i.Indexes[0].IndexRecords[\"test1\"][0].FieldStartByteOffset = %d, want %d", j.Indexes[0].IndexRecords["test1"][0].FieldStartByteOffset, uint64(len("header\n"))) +// } + +// if j.Indexes[0].IndexRecords["test3"][0].DataNumber != 1 { +// t.Errorf("got i.Indexes[0].IndexRecords[\"test3\"][0].DataNumber = %d, want 1", j.Indexes[0].IndexRecords["test3"][0].DataNumber) +// } + +// // verify byte offset calculation +// if j.Indexes[0].IndexRecords["test3"][0].FieldStartByteOffset != uint64(len("header1\ntest1\n")) { +// t.Errorf("got i.Indexes[0].IndexRecords[\"test3\"][0].FieldStartByteOffset = %d, want %d", j.Indexes[0].IndexRecords["test3"][0].FieldStartByteOffset, uint64(len("header\ntest1\n"))) +// } +// }) + +// t.Run("assert correct types", func(t *testing.T) { +// i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader("n1,n2\n3.4,3\n")}) +// if err != nil { +// t.Fatal(err) +// } + +// buf := &bytes.Buffer{} + +// if err := i.Serialize(buf); err != nil { +// t.Fatal(err) +// } + +// j, err := ReadIndexFile(buf, CSVHandler{ReadSeeker: strings.NewReader("n1,n2\n3.4,3\n4.3,4")}) +// if err != nil { +// t.Fatal(err) +// } + +// for _, index := range i.Indexes { +// for key := range index.IndexRecords { +// keyType := reflect.TypeOf(key).String() +// if keyType != "float64" { +// t.Errorf("i keytype is %v", keyType) +// } + +// if index.FieldType != protocol.FieldTypeNumber { +// t.Errorf("index field type is not number. actual: %v", index.FieldType) +// } +// } +// } + +// for _, index := range j.Indexes { +// for key := range index.IndexRecords { +// keyType := reflect.TypeOf(key).String() +// if keyType != "float64" { +// t.Errorf("j keytype is %v", keyType) +// } + +// if index.FieldType != protocol.FieldTypeNumber { +// t.Errorf("index field type is not number. 
actual: %v", index.FieldType) +// } +// } +// } + +// }) + +// t.Run("multiple headers", func(t *testing.T) { + +// i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader("name,move\nmica,coyote\n")}) +// if err != nil { +// t.Fatal(err) +// } + +// buf := &bytes.Buffer{} + +// if err := i.Serialize(buf); err != nil { +// t.Fatal(err) +// } + +// j, err := ReadIndexFile(buf, CSVHandler{ReadSeeker: strings.NewReader("name,move\nmica,coyote\ngalvao,mount\n")}) +// if err != nil { +// t.Fatal(err) +// } + +// // check that the index file now has the additional data ranges but same number of indices +// if len(j.Indexes) != 2 { +// t.Errorf("got len(i.Indexes) = %d, want 2", len(i.Indexes)) +// } + +// if len(j.EndByteOffsets) != 2 { +// t.Errorf("got len(i.DataRanges) = %d, want 2", len(i.EndByteOffsets)) +// } + +// // check that the first data range is untouched despite being incorrect +// if j.EndByteOffsets[0] != uint64(len("name,move\nmica,coyote\n")) { +// t.Errorf("got i.DataRanges[0].EndByteOffset = %d, want %d", j.EndByteOffsets[0], uint64(len("name,move\nmica,coyote\n"))) +// } + +// // check that the second data range has properly set offsets +// if j.EndByteOffsets[1] != uint64(len("name,move\nmica,coyote\ngalvao,mount\n")) { +// t.Errorf("got i.DataRanges[1].EndByteOffset = %d, want %d", j.EndByteOffsets[1], uint64(len("name,move\nmica,coyote\ngalvao,mount\n"))) +// } + +// fmt.Printf("index file looks like: %v", j.Indexes) +// }) + +// t.Run("generate index file", func(t *testing.T) { +// i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader("")}) + +// if err != nil { +// t.Fatal(err) +// } + +// buf := &bytes.Buffer{} + +// if err := i.Serialize(buf); err != nil { +// t.Fatal(err) +// } + +// _, err = ReadIndexFile(buf, CSVHandler{ReadSeeker: strings.NewReader(mockCsv2)}) +// if err != nil { +// t.Fatal(err) +// } + +// }) + +// t.Run("existing index but different type", func(t *testing.T) { +// i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader("test\ntest1\n")}) +// if err != nil { +// t.Fatal(err) +// } + +// buf := &bytes.Buffer{} + +// if err := i.Serialize(buf); err != nil { +// t.Fatal(err) +// } + +// j, err := ReadIndexFile(buf, CSVHandler{ReadSeeker: strings.NewReader("test\ntest1\n123\n")}) +// if err != nil { +// t.Fatal(err) +// } + +// // check that the index file now has the additional data ranges but same number of indices +// if len(j.Indexes) != 1 { +// t.Errorf("got len(i.Indexes) = %d, want 1", len(j.Indexes)) +// } + +// if j.Indexes[0].FieldType != protocol.FieldTypeString|protocol.FieldTypeNumber { +// t.Errorf("got i.Indexes[0].FieldType = %#v, want protocol.FieldTypeUnknown", j.Indexes[0].FieldType) +// } +// }) + +// t.Run("existing index but nullable type", func(t *testing.T) { +// i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader("test,test2\nomoplata,armbar\n")}) +// if err != nil { +// t.Fatal(err) +// } + +// buf := &bytes.Buffer{} + +// if err := i.Serialize(buf); err != nil { +// t.Fatal(err) +// } + +// j, err := ReadIndexFile(buf, CSVHandler{ReadSeeker: strings.NewReader("test,test2\nomoplata,armbar\n,singlelegx\n")}) +// if err != nil { +// t.Fatal(err) +// } + +// // check that the index file now has the additional data ranges but same number of indices +// if len(j.Indexes) != 2 { +// t.Errorf("got len(i.Indexes) = %d, want 2", len(j.Indexes)) +// } + +// if j.Indexes[0].FieldType != protocol.FieldTypeNull|protocol.FieldTypeString { +// t.Errorf("got i.Indexes[0].FieldType = %#v, want 
protocol.FieldTypeNullableString", j.Indexes[0].FieldType) +// } +// }) + +// } diff --git a/pkg/handlers/jsonl.go b/pkg/handlers/jsonl.go new file mode 100644 index 00000000..19ae9ce1 --- /dev/null +++ b/pkg/handlers/jsonl.go @@ -0,0 +1,212 @@ +package handlers + +import ( + "bufio" + "bytes" + "encoding/binary" + "encoding/json" + "fmt" + "io" + "log/slog" + "math" + "strings" + + "github.com/kevmo314/appendable/pkg/appendable" + "github.com/kevmo314/appendable/pkg/btree" +) + +type JSONLHandler struct { +} + +var _ appendable.DataHandler = (*JSONLHandler)(nil) + +func (j JSONLHandler) Format() appendable.Format { + return appendable.FormatJSONL +} + +func (j JSONLHandler) Synchronize(f *appendable.IndexFile, df appendable.DataFile) error { + // read until the next newline + metadata, err := f.Metadata() + if err != nil { + return fmt.Errorf("failed to read metadata: %w", err) + } + if _, err := df.Seek(int64(metadata.ReadOffset), io.SeekStart); err != nil { + return fmt.Errorf("failed to seek: %w", err) + } + scanner := bufio.NewScanner(df) + for i := 0; scanner.Scan(); i++ { + line := scanner.Bytes() + + // create a new json decoder + dec := json.NewDecoder(bytes.NewReader(line)) + + // if the first token is not {, then return an error + if t, err := dec.Token(); err != nil || t != json.Delim('{') { + return fmt.Errorf("expected '%U', got '%U' (only json objects are supported at the root)", '{', t) + } + + if err := handleJSONLObject(f, df, dec, []string{}, btree.MemoryPointer{ + Offset: metadata.ReadOffset, + Length: uint32(len(line)), + }); err != nil { + return fmt.Errorf("failed to handle object: %w", err) + } + + // the next token must be a } + if t, err := dec.Token(); err != nil || t != json.Delim('}') { + return fmt.Errorf("expected '}', got '%v'", t) + } + + metadata.ReadOffset += uint64(len(line)) + 1 // include the newline + + slog.Info("read line", "i", i, "offset", metadata.ReadOffset) + } + + if err := scanner.Err(); err != nil { + return fmt.Errorf("failed to scan: %w", err) + } + + // update the metadata + if err := f.SetMetadata(metadata); err != nil { + return fmt.Errorf("failed to set metadata: %w", err) + } + + return nil +} + +func jsonTypeToFieldType(t json.Token) appendable.FieldType { + switch t.(type) { + case json.Delim: + switch t { + case json.Delim('{'): + return appendable.FieldTypeObject + case json.Delim('['): + return appendable.FieldTypeArray + } + case json.Number, float64: + return appendable.FieldTypeFloat64 + case string: + return appendable.FieldTypeString + case bool: + return appendable.FieldTypeBoolean + case nil: + return appendable.FieldTypeNull + } + panic(fmt.Sprintf("unexpected token '%v'", t)) +} + +func handleJSONLObject(f *appendable.IndexFile, r io.ReaderAt, dec *json.Decoder, path []string, data btree.MemoryPointer) error { + // while the next token is not }, read the key + for dec.More() { + key, err := dec.Token() + if err != nil { + return fmt.Errorf("failed to read token: %w", err) + } + + // key must be a string + if key, ok := key.(string); !ok { + return fmt.Errorf("expected string key, got '%v'", key) + } else { + fieldOffset := dec.InputOffset() + 1 // skip the : + + value, err := dec.Token() + if err != nil { + return fmt.Errorf("failed to read token: %w", err) + } + + name := strings.Join(append(path, key), ".") + + if name != "VendorID" { + continue + } + + page, err := f.FindOrCreateIndex(name, jsonTypeToFieldType(value)) + if err != nil { + return fmt.Errorf("failed to find or create index: %w", err) + } + switch value 
:= value.(type) { + case string: + if err := page.BPTree(r).Insert(btree.ReferencedValue{ + DataPointer: btree.MemoryPointer{ + Offset: data.Offset + uint64(fieldOffset) + 1, + Length: uint32(dec.InputOffset()-fieldOffset) - 2, + }, + // trim the quotes + Value: []byte(value), + }, data); err != nil { + return fmt.Errorf("failed to insert into b+tree: %w", err) + } + case json.Number, float64: + buf := make([]byte, 8) + switch value := value.(type) { + case json.Number: + f, err := value.Float64() + if err != nil { + return fmt.Errorf("failed to parse float: %w", err) + } + binary.BigEndian.PutUint64(buf, math.Float64bits(f)) + case float64: + binary.BigEndian.PutUint64(buf, math.Float64bits(value)) + } + if err := page.BPTree(r).Insert(btree.ReferencedValue{Value: buf}, data); err != nil { + return fmt.Errorf("failed to insert into b+tree: %w", err) + } + case bool: + if value { + if err := page.BPTree(r).Insert(btree.ReferencedValue{Value: []byte{1}}, data); err != nil { + return fmt.Errorf("failed to insert into b+tree: %w", err) + } + } else { + if err := page.BPTree(r).Insert(btree.ReferencedValue{Value: []byte{0}}, data); err != nil { + return fmt.Errorf("failed to insert into b+tree: %w", err) + } + } + case json.Token: + switch value { + case json.Delim('['): + // arrays are not indexed yet because we need to incorporate + // subindexing into the specification. however, we have to + // skip tokens until we reach the end of the array. + depth := 1 + for { + t, err := dec.Token() + if err != nil { + return fmt.Errorf("failed to read token: %w", err) + } + + switch t { + case json.Delim('['): + depth++ + case json.Delim(']'): + depth-- + } + + if depth == 0 { + break + } + } + case json.Delim('{'): + // find the index to set the field type to unknown. + if err := handleJSONLObject(f, r, dec, append(path, key), data); err != nil { + return fmt.Errorf("failed to handle object: %w", err) + } + // read the } + if t, err := dec.Token(); err != nil || t != json.Delim('}') { + return fmt.Errorf("expected '}', got '%v'", t) + } + default: + return fmt.Errorf("unexpected token '%v'", value) + } + case nil: + // nil values are a bit of a degenerate case, we are essentially using the btree + // as a set. we store the value as an empty byte slice. 
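// As a point of reference for the cases in this switch, a minimal sketch of
// how each JSON value ends up encoded as a B+ tree key (illustrative only;
// encodeKey is a hypothetical helper mirroring the conventions above, not a
// function in this patch):
//
//	func encodeKey(v any) []byte {
//		switch v := v.(type) {
//		case string:
//			return []byte(v) // raw bytes of the string
//		case float64:
//			buf := make([]byte, 8)
//			binary.BigEndian.PutUint64(buf, math.Float64bits(v))
//			return buf // IEEE-754 bit pattern, as in the json.Number case above
//		case bool:
//			if v {
//				return []byte{1}
//			}
//			return []byte{0}
//		default: // nil
//			return []byte{} // presence only: the tree acts as a set
//		}
//	}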
+ if err := page.BPTree(r).Insert(btree.ReferencedValue{Value: []byte{}}, data); err != nil { + return fmt.Errorf("failed to insert into b+tree: %w", err) + } + default: + return fmt.Errorf("unexpected type '%T'", value) + } + } + } + return nil +} diff --git a/pkg/handlers/jsonl_test.go b/pkg/handlers/jsonl_test.go new file mode 100644 index 00000000..0453f6b9 --- /dev/null +++ b/pkg/handlers/jsonl_test.go @@ -0,0 +1,556 @@ +package handlers + +import ( + "encoding/binary" + "math" + "strings" + "testing" + + "github.com/kevmo314/appendable/pkg/appendable" + "github.com/kevmo314/appendable/pkg/buftest" +) + +func TestJSONL(t *testing.T) { + t.Run("no schema changes", func(t *testing.T) { + f := buftest.NewSeekableBuffer() + g := strings.NewReader("{\"test\":\"test1\"}\n") + + i, err := appendable.NewIndexFile(f, JSONLHandler{}) + if err != nil { + t.Fatal(err) + } + + // check that the index file now has the additional data ranges but same number of indices + indexes1, err := i.Indexes() + if err != nil { + t.Fatal(err) + } + + collected1, err := indexes1.Collect() + if err != nil { + t.Fatal(err) + } + + if len(collected1) != 0 { + t.Errorf("got len(i.Indexes) = %d, want 0", len(collected1)) + } + + if err := i.Synchronize(g); err != nil { + t.Fatal(err) + } + + indexes2, err := i.Indexes() + if err != nil { + t.Fatal(err) + } + + collected2, err := indexes2.Collect() + if err != nil { + t.Fatal(err) + } + + if len(collected2) != 1 { + t.Errorf("got len(i.Indexes) = %d, want 1", len(collected2)) + } + }) + + t.Run("correctly sets field offset", func(t *testing.T) { + f := buftest.NewSeekableBuffer() + + i, err := appendable.NewIndexFile(f, JSONLHandler{}) + if err != nil { + t.Fatal(err) + } + + if err := i.Synchronize(strings.NewReader("{\"test\":\"test1\"}\n")); err != nil { + t.Fatal(err) + } + + r2 := strings.NewReader("{\"test\":\"test1\"}\n{\"test\":\"test3\"}\n") + if err := i.Synchronize(r2); err != nil { + t.Fatal(err) + } + + // check that the index file now has the additional data ranges but same number of indices + indexes, err := i.Indexes() + if err != nil { + t.Fatal(err) + } + + collected, err := indexes.Collect() + if err != nil { + t.Fatal(err) + } + + if len(collected) != 1 { + t.Errorf("got len(i.Indexes) = %d, want 1", len(collected)) + } + + mp1, found, err := collected[0].BPTree(r2).Find([]byte("test1")) + if err != nil { + t.Fatal(err) + } + if !found { + t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = nil, want non-nil") + } + if mp1.Offset != 0 || mp1.Length != uint32(len("{\"test\":\"test1\"}")) { + t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = %+v, want {0, %d}", mp1, len("{\"test\":\"test1\"}")) + } + + mp2, found, err := collected[0].BPTree(r2).Find([]byte("test3")) + if err != nil { + t.Fatal(err) + } + if !found { + t.Errorf("got i.Indexes[0].BPTree().Find(\"test3\") = nil, want non-nil") + } + if mp2.Offset != uint64(len("{\"test\":\"test1\"}\n")) || mp2.Length != uint32(len("{\"test\":\"test3\"}")) { + t.Errorf("got i.Indexes[0].BPTree().Find(\"test3\") = %+v, want {%d, %d}", mp2, len("{\"test\":\"test1\"}\n"), len("{\"test\":\"test3\"}")) + } + }) + + t.Run("new index", func(t *testing.T) { + f := buftest.NewSeekableBuffer() + + i, err := appendable.NewIndexFile(f, JSONLHandler{}) + if err != nil { + t.Fatal(err) + } + + if err := i.Synchronize(strings.NewReader("{\"test\":\"test1\"}\n")); err != nil { + t.Fatal(err) + } + + r2 := strings.NewReader("{\"test\":\"test1\"}\n{\"test2\":\"test3\"}\n") + if err := i.Synchronize(r2); err != nil { 
+ t.Fatal(err) + } + + indexes, err := i.Indexes() + if err != nil { + t.Fatal(err) + } + + collected, err := indexes.Collect() + if err != nil { + t.Fatal(err) + } + + // check that the index file now has the additional index + if len(collected) != 2 { + t.Errorf("got len(i.Indexes) = %d, want 1", len(collected)) + } + + mp1, found, err := collected[0].BPTree(r2).Find([]byte("test1")) + if err != nil { + t.Fatal(err) + } + if !found { + t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = nil, want non-nil") + } + if mp1.Offset != 0 || mp1.Length != uint32(len("{\"test\":\"test1\"}")) { + t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = %+v, want {0, %d}", mp1, len("{\"test\":\"test1\"}")) + } + + buf1, err := collected[0].Metadata() + if err != nil { + t.Fatal(err) + } + md1 := &appendable.IndexMeta{} + if err := md1.UnmarshalBinary(buf1); err != nil { + t.Fatal(err) + } + if md1.FieldType != appendable.FieldTypeString { + t.Errorf("got i.Indexes[0].FieldType = %#v, want FieldTypeString", md1.FieldType) + } + + mp2, found, err := collected[1].BPTree(r2).Find([]byte("test3")) + if err != nil { + t.Fatal(err) + } + if !found { + t.Errorf("got i.Indexes[1].BPTree().Find(\"test3\") = nil, want non-nil") + } + if mp2.Offset != uint64(len("{\"test\":\"test1\"}\n")) || mp2.Length != uint32(len("{\"test2\":\"test3\"}")) { + t.Errorf("got i.Indexes[1].BPTree().Find(\"test3\") = %+v, want {%d, %d}", mp2, len("{\"test\":\"test1\"}\n"), len("{\"test2\":\"test3\"}")) + } + + md2 := &appendable.IndexMeta{} + if err := collected[1].UnmarshalMetadata(md2); err != nil { + t.Fatal(err) + } + if md2.FieldType != appendable.FieldTypeString { + t.Errorf("got i.Indexes[1].FieldType = %#v, want FieldTypeString", md2.FieldType) + } + }) + + t.Run("existing index but different type", func(t *testing.T) { + f := buftest.NewSeekableBuffer() + + i, err := appendable.NewIndexFile(f, JSONLHandler{}) + if err != nil { + t.Fatal(err) + } + + if err := i.Synchronize(strings.NewReader("{\"test\":\"test1\"}\n")); err != nil { + t.Fatal(err) + } + + r2 := strings.NewReader("{\"test\":\"test1\"}\n{\"test\":123}\n") + if err := i.Synchronize(r2); err != nil { + t.Fatal(err) + } + + indexes, err := i.Indexes() + if err != nil { + t.Fatal(err) + } + + collected, err := indexes.Collect() + if err != nil { + t.Fatal(err) + } + + // check that the index file now has the additional index + if len(collected) != 2 { + t.Errorf("got len(i.Indexes) = %d, want 1", len(collected)) + } + + mp1, found, err := collected[0].BPTree(r2).Find([]byte("test1")) + if err != nil { + t.Fatal(err) + } + if !found { + t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = nil, want non-nil") + } + if mp1.Offset != 0 || mp1.Length != uint32(len("{\"test\":\"test1\"}")) { + t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = %+v, want {0, %d}", mp1, len("{\"test\":\"test1\"}")) + } + + buf1, err := collected[0].Metadata() + if err != nil { + t.Fatal(err) + } + md1 := &appendable.IndexMeta{} + if err := md1.UnmarshalBinary(buf1); err != nil { + t.Fatal(err) + } + if md1.FieldType != appendable.FieldTypeString { + t.Errorf("got i.Indexes[0].FieldType = %#v, want FieldTypeString", md1.FieldType) + } + + v2 := make([]byte, 8) + binary.LittleEndian.PutUint64(v2, math.Float64bits(123)) + mp2, found, err := collected[1].BPTree(r2).Find(v2) + if err != nil { + t.Fatal(err) + } + if !found { + t.Errorf("got i.Indexes[1].BPTree().Find(\"test3\") = nil, want non-nil") + } + if mp2.Offset != uint64(len("{\"test\":\"test1\"}\n")) || mp2.Length != 
uint32(len("{\"test\":123}")) { + t.Errorf("got i.Indexes[1].BPTree().Find(\"test3\") = %+v, want {%d, %d}", mp2, len("{\"test\":\"test1\"}\n"), len("{\"test\":123}")) + } + + md2 := &appendable.IndexMeta{} + if err := collected[1].UnmarshalMetadata(md2); err != nil { + t.Fatal(err) + } + if md2.FieldType != appendable.FieldTypeFloat64 { + t.Errorf("got i.Indexes[1].FieldType = %#v, want FieldTypeFloat64", md2.FieldType) + } + }) + + t.Run("creates nested indices", func(t *testing.T) { + f := buftest.NewSeekableBuffer() + + i, err := appendable.NewIndexFile(f, JSONLHandler{}) + if err != nil { + t.Fatal(err) + } + + if err := i.Synchronize(strings.NewReader("{\"test\":\"test1\"}\n{\"test2\":{\"a\":1,\"b\":\"2\"}}\n")); err != nil { + t.Fatal(err) + } + + indexes, err := i.Indexes() + if err != nil { + t.Fatal(err) + } + + collected, err := indexes.Collect() + if err != nil { + t.Fatal(err) + } + + // check that the index file now has the additional data ranges but same number of indices + if len(collected) != 4 { + t.Errorf("got len(i.Indexes) = %d, want 4", len(collected)) + } + + md0 := &appendable.IndexMeta{} + if err := collected[0].UnmarshalMetadata(md0); err != nil { + t.Fatal(err) + } + + md1 := &appendable.IndexMeta{} + if err := collected[1].UnmarshalMetadata(md1); err != nil { + t.Fatal(err) + } + + md2 := &appendable.IndexMeta{} + if err := collected[2].UnmarshalMetadata(md2); err != nil { + t.Fatal(err) + } + + md3 := &appendable.IndexMeta{} + if err := collected[3].UnmarshalMetadata(md3); err != nil { + t.Fatal(err) + } + + if md0.FieldType != appendable.FieldTypeString { + t.Errorf("got i.Indexes[0].FieldType = %#v, want FieldTypeString", md0.FieldType) + } + + if md1.FieldType != appendable.FieldTypeObject { + t.Errorf("got i.Indexes[1].FieldType = %#v, want FieldTypeObject", md1.FieldType) + } + + if md2.FieldType != appendable.FieldTypeFloat64 { + t.Errorf("got i.Indexes[2].FieldType = %#v, want FieldTypeFloat64", md2.FieldType) + } + + if md3.FieldType != appendable.FieldTypeString { + t.Errorf("got i.Indexes[3].FieldType = %#v, want FieldTypeString", md3.FieldType) + } + + if md0.FieldName != "test" { + t.Errorf("got i.Indexes[0].FieldName = %s, want \"test\"", md0.FieldName) + } + + if md1.FieldName != "test2" { + t.Errorf("got i.Indexes[1].FieldName = %s, want \"test2\"", md1.FieldName) + } + + if md2.FieldName != "test2.a" { + t.Errorf("got i.Indexes[2].FieldName = %s, want \"a\"", md2.FieldName) + } + + if md3.FieldName != "test2.b" { + t.Errorf("got i.Indexes[3].FieldName = %s, want \"b\"", md3.FieldName) + } + }) + + t.Run("creates second indices with same parent", func(t *testing.T) { + f := buftest.NewSeekableBuffer() + + i, err := appendable.NewIndexFile(f, JSONLHandler{}) + if err != nil { + t.Fatal(err) + } + + if err := i.Synchronize(strings.NewReader("{\"test\":\"test1\"}\n{\"test\":{\"a\":1,\"b\":\"2\"}}\n")); err != nil { + t.Fatal(err) + } + + indexes, err := i.Indexes() + if err != nil { + t.Fatal(err) + } + + collected, err := indexes.Collect() + if err != nil { + t.Fatal(err) + } + + // check that the index file now has the additional data ranges but same number of indices + if len(collected) != 4 { + t.Errorf("got len(i.Indexes) = %d, want 4", len(collected)) + } + + md0 := &appendable.IndexMeta{} + if err := collected[0].UnmarshalMetadata(md0); err != nil { + t.Fatal(err) + } + + md1 := &appendable.IndexMeta{} + if err := collected[1].UnmarshalMetadata(md1); err != nil { + t.Fatal(err) + } + + md2 := &appendable.IndexMeta{} + if err := 
collected[2].UnmarshalMetadata(md2); err != nil { + t.Fatal(err) + } + + md3 := &appendable.IndexMeta{} + if err := collected[3].UnmarshalMetadata(md3); err != nil { + t.Fatal(err) + } + + if md0.FieldType != appendable.FieldTypeString { + t.Errorf("got i.Indexes[0].FieldType = %#v, want FieldTypeString", md0.FieldType) + } + + if md1.FieldType != appendable.FieldTypeObject { + t.Errorf("got i.Indexes[1].FieldType = %#v, want FieldTypeObject", md1.FieldType) + } + + if md2.FieldType != appendable.FieldTypeFloat64 { + t.Errorf("got i.Indexes[2].FieldType = %#v, want FieldTypeFloat64", md2.FieldType) + } + + if md3.FieldType != appendable.FieldTypeString { + t.Errorf("got i.Indexes[3].FieldType = %#v, want FieldTypeString", md3.FieldType) + } + + if md0.FieldName != "test" { + t.Errorf("got i.Indexes[0].FieldName = %s, want \"test\"", md0.FieldName) + } + + if md1.FieldName != "test" { + t.Errorf("got i.Indexes[1].FieldName = %s, want \"test2\"", md1.FieldName) + } + + if md2.FieldName != "test.a" { + t.Errorf("got i.Indexes[2].FieldName = %s, want \"a\"", md2.FieldName) + } + + if md3.FieldName != "test.b" { + t.Errorf("got i.Indexes[3].FieldName = %s, want \"b\"", md3.FieldName) + } + }) + + t.Run("creates index for arrays", func(t *testing.T) { + f := buftest.NewSeekableBuffer() + + i, err := appendable.NewIndexFile(f, JSONLHandler{}) + if err != nil { + t.Fatal(err) + } + + if err := i.Synchronize(strings.NewReader("{\"test\":\"test1\"}\n{\"test2\":[[1,2,3],4]}\n")); err != nil { + t.Fatal(err) + } + + indexes, err := i.Indexes() + if err != nil { + t.Fatal(err) + } + + collected, err := indexes.Collect() + if err != nil { + t.Fatal(err) + } + + // check that the index file now has the additional data ranges but same number of indices + if len(collected) != 2 { + t.Errorf("got len(i.Indexes) = %d, want 2", len(collected)) + } + + md0 := &appendable.IndexMeta{} + if err := collected[0].UnmarshalMetadata(md0); err != nil { + t.Fatal(err) + } + + md1 := &appendable.IndexMeta{} + if err := collected[1].UnmarshalMetadata(md1); err != nil { + t.Fatal(err) + } + + if md0.FieldType != appendable.FieldTypeString { + t.Errorf("got i.Indexes[0].FieldType = %#v, want FieldTypeString", md0.FieldType) + } + + if md1.FieldType != appendable.FieldTypeArray { + t.Errorf("got i.Indexes[1].FieldType = %#v, want FieldTypeObject", md1.FieldType) + } + + if md0.FieldName != "test" { + t.Errorf("got i.Indexes[0].FieldName = %s, want \"test\"", md0.FieldName) + } + + if md1.FieldName != "test2" { + t.Errorf("got i.Indexes[1].FieldName = %s, want \"test2\"", md1.FieldName) + } + }) + + t.Run("existing index but nullable type", func(t *testing.T) { + f := buftest.NewSeekableBuffer() + + i, err := appendable.NewIndexFile(f, JSONLHandler{}) + if err != nil { + t.Fatal(err) + } + + if err := i.Synchronize(strings.NewReader("{\"test\":\"test1\"}\n")); err != nil { + t.Fatal(err) + } + + r2 := strings.NewReader("{\"test\":\"test1\"}\n{\"test\":null}\n") + if err := i.Synchronize(r2); err != nil { + t.Fatal(err) + } + + indexes, err := i.Indexes() + if err != nil { + t.Fatal(err) + } + + collected, err := indexes.Collect() + if err != nil { + t.Fatal(err) + } + + // check that the index file now has the additional index + if len(collected) != 2 { + t.Errorf("got len(i.Indexes) = %d, want 1", len(collected)) + } + + mp1, found, err := collected[0].BPTree(r2).Find([]byte("test1")) + if err != nil { + t.Fatal(err) + } + if !found { + t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = nil, want non-nil") + } + if 
mp1.Offset != 0 || mp1.Length != uint32(len("{\"test\":\"test1\"}")) { + t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = %+v, want {0, %d}", mp1, len("{\"test\":\"test1\"}")) + } + + buf1, err := collected[0].Metadata() + if err != nil { + t.Fatal(err) + } + md1 := &appendable.IndexMeta{} + if err := md1.UnmarshalBinary(buf1); err != nil { + t.Fatal(err) + } + if md1.FieldType != appendable.FieldTypeString { + t.Errorf("got i.Indexes[0].FieldType = %#v, want FieldTypeString", md1.FieldType) + } + + mp2, found, err := collected[1].BPTree(r2).Find([]byte{}) + if err != nil { + t.Fatal(err) + } + if !found { + t.Errorf("got i.Indexes[1].BPTree().Find(\"test3\") = nil, want non-nil") + } + if mp2.Offset != uint64(len("{\"test\":\"test1\"}\n")) || mp2.Length != uint32(len("{\"test\":null}")) { + t.Errorf("got i.Indexes[1].BPTree().Find(\"test3\") = %+v, want {%d, %d}", mp2, len("{\"test\":\"test1\"}\n"), len("{\"test\":null}")) + } + + buf2, err := collected[1].Metadata() + if err != nil { + t.Fatal(err) + } + md2 := &appendable.IndexMeta{} + if err := md2.UnmarshalBinary(buf2); err != nil { + t.Fatal(err) + } + if md2.FieldType != appendable.FieldTypeNull { + t.Errorf("got i.Indexes[1].FieldType = %#v, want FieldTypeNull", md2.FieldType) + } + }) +} diff --git a/pkg/protocol/protocol.go b/pkg/protocol/protocol.go deleted file mode 100644 index 12b64e55..00000000 --- a/pkg/protocol/protocol.go +++ /dev/null @@ -1,193 +0,0 @@ -package protocol - -import ( - "encoding/csv" - "encoding/json" - "fmt" - "io" - "log/slog" - "reflect" - "strconv" - "strings" -) - -/* -The overall index file for AppendableDB is structured as: - -+-----------------------+ -| Version | -+-----------------------+ -| IndexFileHeader | -+-----------------------+ -| IndexHeader | -+-----------------------+ -| ... | -+-----------------------+ -| IndexHeader | -+-----------------------+ -| IndexRecord | -+-----------------------+ -| ... | -+-----------------------+ -| IndexRecord | -+-----------------------+ -| EndByteOffset | -+-----------------------+ -| ... | -+-----------------------+ -| EndByteOffset | -+-----------------------+ -| Checksum | -+-----------------------+ -| ... | -+-----------------------+ -| Checksum | -+-----------------------+ -*/ - -// Version is the version of AppendableDB this library is compatible with. -type Version byte - -// IndexFileHeader is the header of the index file. -type IndexFileHeader struct { - // IndexLength represents the number of bytes the IndexHeaders occupy. - IndexLength uint64 - - // DataCount represents the number of data records indexed by this index - // file. - DataCount uint64 -} - -// IndexHeader is the header of each index record. This represents the fields -// available in the data file. -type IndexHeader struct { - FieldName string - - // FieldType represents the type of data stored in the field. Note that the - // field data doesn't need to follow this type, but it is used to determine - // the TypeScript typings for the field. - FieldType FieldType - - IndexRecordCount uint64 -} - -// FieldType represents the type of data stored in the field, which follows -// JSON types excluding Object and null. Object is broken down into subfields -// and null is not stored. -// -// FieldType is left as a uint64 to avoid shooting ourselves in the foot if we -// want to support more types in the future via other file formats. 
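-// (Illustration, not part of the original comment: because FieldType is a bit
-// set, a column that has held both strings and numbers carries
-// FieldTypeString|FieldTypeNumber, which TypescriptType below renders as
-// "string | number".)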
-type FieldType uint64 - -const ( - FieldTypeString FieldType = (1 << iota) - FieldTypeNumber - FieldTypeObject - FieldTypeArray - FieldTypeBoolean - FieldTypeNull -) - -func (t FieldType) TypescriptType() string { - components := []string{} - if t&FieldTypeString != 0 { - components = append(components, "string") - } - if t&FieldTypeNumber != 0 { - components = append(components, "number") - } - if t&FieldTypeObject != 0 { - components = append(components, "Record") - } - if t&FieldTypeArray != 0 { - components = append(components, "any[]") - } - if t&FieldTypeBoolean != 0 { - components = append(components, "boolean") - } - if t&FieldTypeNull != 0 { - components = append(components, "null") - } - if len(components) == 0 { - return "unknown" - } - return strings.Join(components, " | ") -} - -type IndexRecord struct { - DataNumber uint64 - // FieldByteOffset represents the byte offset of the field in the data - // file to fetch exactly the field value. - FieldStartByteOffset uint64 - // FieldLength is pessimistic: it is an encoded value that is at least as - // long as the actual field value. - FieldLength int -} - -func InferCSVField(fieldValue string) (interface{}, FieldType) { - if fieldValue == "" { - return nil, FieldTypeNull - } - - if i, err := strconv.Atoi(fieldValue); err == nil { - - fmt.Printf("\n%v is a integer\n", fieldValue) - return float64(i), FieldTypeNumber - } - - if f, err := strconv.ParseFloat(fieldValue, 64); err == nil { - - fmt.Printf("\n%v is a float\n", fieldValue) - return float64(f), FieldTypeNumber - } - - if b, err := strconv.ParseBool(fieldValue); err == nil { - return b, FieldTypeBoolean - } - - return fieldValue, FieldTypeString -} - -func (i IndexRecord) CSVField(r io.ReadSeeker) (any, error) { - offset, err := r.Seek(0, io.SeekCurrent) - if err != nil { - return nil, fmt.Errorf("failed to get current offset: %w", err) - } - - if _, err := r.Seek(int64(i.FieldStartByteOffset), io.SeekStart); err != nil { - return nil, fmt.Errorf("failed to seek to field start byte offset: %w", err) - } - - fields, err := csv.NewReader(io.LimitReader(r, int64(i.FieldLength))).Read() - if err != nil { - return nil, fmt.Errorf("failed to decode field: %w", err) - } - - if _, err := r.Seek(offset, io.SeekStart); err != nil { - return nil, fmt.Errorf("failed to seek to original offset: %w", err) - } - - slog.Debug("fields", slog.Any("F", fields), slog.Any("len", len(fields))) - slog.Debug("CSVField", slog.Any("fields", fields), slog.Any("firstField", fields[0]), slog.Any("firstFieldType", reflect.TypeOf(fields[0]).String())) - - field, _ := InferCSVField(fields[0]) - return field, nil -} - -func (i IndexRecord) Token(r io.ReadSeeker) (json.Token, error) { - offset, err := r.Seek(0, io.SeekCurrent) - if err != nil { - return nil, fmt.Errorf("failed to get current offset: %w", err) - } - if _, err := r.Seek(int64(i.FieldStartByteOffset), io.SeekStart); err != nil { - return nil, fmt.Errorf("failed to seek to field start byte offset: %w", err) - } - token, err := json.NewDecoder(io.LimitReader(r, int64(i.FieldLength))).Token() - if err != nil { - return nil, fmt.Errorf("failed to decode field: %w", err) - } - if _, err := r.Seek(offset, io.SeekStart); err != nil { - return nil, fmt.Errorf("failed to seek to original offset: %w", err) - } - return token, nil -} From 388724834895601943cb5bbb1681733077d35954 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Tue, 30 Jan 2024 09:46:45 -0500 Subject: [PATCH 02/15] fixes --- .gitignore | 5 ++++- cmd/main.go | 15 --------------- 
pkg/btree/bptree.go | 12 +++++------- pkg/btree/pagefile.go | 6 ------ pkg/handlers/jsonl.go | 4 ---- 5 files changed, 9 insertions(+), 33 deletions(-) diff --git a/.gitignore b/.gitignore index db4c6d9b..52c23dac 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ dist -node_modules \ No newline at end of file +node_modules + +examples/workspace/data.index +examples/workspace/green_tripdata_2023-01.jsonl \ No newline at end of file diff --git a/cmd/main.go b/cmd/main.go index 42b2362d..b1808078 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -5,7 +5,6 @@ import ( "fmt" "log/slog" "os" - "runtime/pprof" "time" "github.com/kevmo314/appendable/pkg/appendable" @@ -23,20 +22,6 @@ func main() { flag.StringVar(&indexFilename, "i", "", "Specify the existing index of the file to be opened, writing to stdout") flag.Parse() - f, err := os.Create("pprof.out") - if err != nil { - panic(err) - } - defer f.Close() // error handling omitted for example - if err := pprof.StartCPUProfile(f); err != nil { - panic(err) - } - go func() { - <-time.After(30 * time.Second) - pprof.StopCPUProfile() - os.Exit(0) - }() - logLevel := &slog.LevelVar{} if debugFlag { diff --git a/pkg/btree/bptree.go b/pkg/btree/bptree.go index f895dacc..3241e618 100644 --- a/pkg/btree/bptree.go +++ b/pkg/btree/bptree.go @@ -48,7 +48,6 @@ func (t *BPTree) Find(key []byte) (MemoryPointer, bool, error) { if root == nil { return MemoryPointer{}, false, nil } - log.Printf("finding for key %v at root %#v", key, rootOffset) path, err := t.traverse(key, root, rootOffset) if err != nil { return MemoryPointer{}, false, err @@ -81,9 +80,9 @@ type TraversalRecord struct { // traverse returns the path from root to leaf in reverse order (leaf first) // the last element is always the node passed in -func (t *BPTree) traverse(key []byte, node *BPTreeNode, ptr MemoryPointer) ([]*TraversalRecord, error) { +func (t *BPTree) traverse(key []byte, node *BPTreeNode, ptr MemoryPointer) ([]TraversalRecord, error) { if node.leaf() { - return []*TraversalRecord{{node: node, ptr: ptr}}, nil + return []TraversalRecord{{node: node, ptr: ptr}}, nil } for i, k := range node.Keys { if bytes.Compare(key, k.Value) < 0 { @@ -101,7 +100,7 @@ func (t *BPTree) traverse(key []byte, node *BPTreeNode, ptr MemoryPointer) ([]*T if err != nil { return nil, err } - return append(path, &TraversalRecord{node: node, index: i, ptr: node.Pointers[i]}), nil + return append(path, TraversalRecord{node: node, index: i, ptr: ptr}), nil } } if node.Pointers[len(node.Pointers)-1].Offset == ptr.Offset { @@ -115,7 +114,7 @@ func (t *BPTree) traverse(key []byte, node *BPTreeNode, ptr MemoryPointer) ([]*T if err != nil { return nil, err } - return append(path, &TraversalRecord{node: node, index: len(node.Keys), ptr: node.Pointers[len(node.Pointers)-1]}), nil + return append(path, TraversalRecord{node: node, index: len(node.Keys), ptr: ptr}), nil } func (t *BPTree) Insert(key ReferencedValue, value MemoryPointer) error { @@ -139,7 +138,6 @@ func (t *BPTree) Insert(key ReferencedValue, value MemoryPointer) error { return t.meta.SetRoot(MemoryPointer{Offset: uint64(offset), Length: uint32(len(buf))}) } - log.Printf("traversing for key %v at root %v", key.Value, rootOffset) path, err := t.traverse(key.Value, root, rootOffset) if err != nil { return err @@ -223,7 +221,7 @@ func (t *BPTree) Insert(key ReferencedValue, value MemoryPointer) error { // the parent will be written to disk in the next iteration } else { // the root split, so create a new root - p := &BPTreeNode{Data: t.Data, Pointers: 
[]MemoryPointer{rootOffset}} + p := &BPTreeNode{Data: t.Data} p.Keys = []ReferencedValue{midKey} p.Pointers = []MemoryPointer{ {Offset: uint64(noffset), Length: uint32(len(nbuf))}, diff --git a/pkg/btree/pagefile.go b/pkg/btree/pagefile.go index b777efa1..8edbf0e7 100644 --- a/pkg/btree/pagefile.go +++ b/pkg/btree/pagefile.go @@ -161,12 +161,6 @@ func (pf *PageFile) FreePage(offset int64) error { if pf.freePageCount == len(pf.freePageIndexes) { return errors.New("free page index is full") } - // ensure it's not already in the free page index - for i := 0; i < pf.freePageCount; i++ { - if pf.freePageIndexes[(pf.freePageHead-pf.freePageCount+i+len(pf.freePageIndexes))%len(pf.freePageIndexes)] == offset { - return errors.New("page is already free") - } - } // push to the head pf.freePageIndexes[pf.freePageHead] = offset pf.freePageHead = (pf.freePageHead + 1) % len(pf.freePageIndexes) diff --git a/pkg/handlers/jsonl.go b/pkg/handlers/jsonl.go index 19ae9ce1..03880d7f 100644 --- a/pkg/handlers/jsonl.go +++ b/pkg/handlers/jsonl.go @@ -116,10 +116,6 @@ func handleJSONLObject(f *appendable.IndexFile, r io.ReaderAt, dec *json.Decoder name := strings.Join(append(path, key), ".") - if name != "VendorID" { - continue - } - page, err := f.FindOrCreateIndex(name, jsonTypeToFieldType(value)) if err != nil { return fmt.Errorf("failed to find or create index: %w", err) From 693154ba40081c5a9bf070fe47675ff9f7ab0afd Mon Sep 17 00:00:00 2001 From: Matthew <38759997+friendlymatthew@users.noreply.github.com> Date: Thu, 1 Feb 2024 13:58:54 -0500 Subject: [PATCH 03/15] init: csv handler --- pkg/appendable/index_file.go | 38 +++ pkg/handlers/csv.go | 477 ++++++++++++++++---------------- pkg/handlers/csv_test.go | 509 ++++++++++++++--------------------- 3 files changed, 468 insertions(+), 556 deletions(-) diff --git a/pkg/appendable/index_file.go b/pkg/appendable/index_file.go index ef1d7fb4..ed865464 100644 --- a/pkg/appendable/index_file.go +++ b/pkg/appendable/index_file.go @@ -97,6 +97,44 @@ func (i *IndexFile) IsEmpty() (bool, error) { return !exists, nil } +func (i *IndexFile) IndexFieldNames() ([]string, error) { + uniqueFieldNames := make(map[string]bool) + + mp := i.tree + + for { + next, err := mp.Next() + if err != nil { + return nil, fmt.Errorf("failed to get next meta page: %w", err) + } + exists, err := next.Exists() + if err != nil { + return nil, fmt.Errorf("failed to check if meta page exists: %w", err) + } + if !exists { + break + } + buf, err := next.Metadata() + if err != nil { + return nil, fmt.Errorf("failed to read metadata: %w", err) + } + metadata := &IndexMeta{} + if err := metadata.UnmarshalBinary(buf); err != nil { + return nil, fmt.Errorf("failed to unmarshal metadata: %w", err) + } + + uniqueFieldNames[metadata.FieldName] = true + mp = next + } + + var fieldNames []string + for fieldName := range uniqueFieldNames { + fieldNames = append(fieldNames, fieldName) + } + + return fieldNames, nil +} + func (i *IndexFile) FindOrCreateIndex(name string, fieldType FieldType) (*btree.LinkedMetaPage, error) { mp := i.tree for { diff --git a/pkg/handlers/csv.go b/pkg/handlers/csv.go index c779cea8..76d1e4b9 100644 --- a/pkg/handlers/csv.go +++ b/pkg/handlers/csv.go @@ -1,252 +1,229 @@ package handlers -// import ( -// "bufio" -// "bytes" -// "encoding/binary" -// "encoding/csv" -// "fmt" -// "io" -// "log/slog" -// "math" -// "strconv" -// "strings" - -// "github.com/kevmo314/appendable/pkg/appendable" -// "github.com/kevmo314/appendable/pkg/btree" -// ) - -// type CSVHandler struct { -// 
io.ReadSeeker -// } - -// var _ appendable.DataHandler = (*CSVHandler)(nil) - -// func (c CSVHandler) Format() appendable.Format { -// return appendable.FormatCSV -// } - -// func (c CSVHandler) Synchronize(f *appendable.IndexFile, df appendable.DataFile) error { -// slog.Debug("Starting CSV synchronization") - -// var headers []string -// var err error - -// metadata, err := f.Metadata() -// if err != nil { -// return fmt.Errorf("failed to read metadata: %w", err) -// } -// if _, err := df.Seek(int64(metadata.ReadOffset), io.SeekStart); err != nil { -// return fmt.Errorf("failed to seek: %w", err) -// } - -// fromNewIndexFile := false - -// isHeader := false - -// isEmpty, err := f.IsEmpty() -// if err != nil { -// return fmt.Errorf("failed to check if index file is empty: %w", err) -// } - -// if isEmpty { -// isHeader = true -// fromNewIndexFile = true -// } else { -// slog.Debug("indexes already exist, not parsing headers") -// for _, index := range f.Indexes { -// isHeader = false -// headers = append(headers, index.FieldName) -// } -// } - -// scanner := bufio.NewScanner(df) - -// for scanner.Scan() { -// line := scanner.Bytes() - -// existingCount := len(f.EndByteOffsets) - -// // append a data range -// var start uint64 -// if len(f.EndByteOffsets) > 0 { -// start = f.EndByteOffsets[existingCount-1] -// } - -// slog.Debug("", slog.Uint64("start", start)) - -// slog.Debug("adding", slog.Any("endbyteoffset", start+uint64(len(line))), slog.Any("line", line)) - -// if isHeader { -// slog.Info("Parsing CSV headers") -// dec := csv.NewReader(bytes.NewReader(line)) -// headers, err = dec.Read() -// if err != nil { -// slog.Error("failed to parse CSV header", "error", err) -// return fmt.Errorf("failed to parse CSV header: %w", err) -// } -// isHeader = false -// continue -// } - -// dec := csv.NewReader(bytes.NewReader(line)) -// slog.Debug("Handling csv", "line", i) - -// if fromNewIndexFile { - -// handleCSVLine(f, df, dec, headers, []string{}, btree.MemoryPointer{ -// Offset: start, -// Length: uint32(len(line)), -// }) -// } else { - -// handleCSVLine(f, df, dec, headers, []string{}, uint64(existingCount) -// } - -// metadata.ReadOffset += uint64(len(line)) + 1 // include the newline -// } - -// if err := scanner.Err(); err != nil { -// return fmt.Errorf("failed to scan: %w", err) -// } - -// // update the metadata -// if err := f.SetMetadata(metadata); err != nil { -// return fmt.Errorf("failed to set metadata: %w", err) -// } - -// slog.Debug("indexes", slog.Any("", f.Indexes)) -// slog.Debug("Ending CSV synchronization") -// slog.Debug("=========") -// return nil -// } - -// func fieldRankCsvField(fieldValue any) int { -// slog.Debug("serialize", slog.Any("fieldValue", fieldValue)) -// switch fieldValue.(type) { -// case nil: -// slog.Debug("nil", slog.Any("fieldValue", fieldValue)) -// return 1 -// case bool: -// slog.Debug("bool", slog.Any("fieldValue", fieldValue)) -// return 2 -// case int, int8, int16, int32, int64, float32, float64: -// slog.Debug("number", slog.Any("fieldValue", fieldValue)) -// return 3 -// case string: -// slog.Debug("string", slog.Any("fieldValue", fieldValue)) -// return 4 -// default: -// panic("unknown type") -// } -// } - -// func InferCSVField(fieldValue string) (interface{}, appendable.FieldType) { -// if fieldValue == "" { -// return nil, appendable.FieldTypeNull -// } - -// if i, err := strconv.Atoi(fieldValue); err == nil { - -// fmt.Printf("\n%v is a integer\n", fieldValue) -// return float64(i), appendable.FieldTypeFloat64 -// } - -// if f, err 
:= strconv.ParseFloat(fieldValue, 64); err == nil { - -// fmt.Printf("\n%v is a float\n", fieldValue) -// return float64(f), appendable.FieldTypeFloat64 -// } - -// if b, err := strconv.ParseBool(fieldValue); err == nil { -// return b, appendable.FieldTypeBoolean -// } - -// return fieldValue, appendable.FieldTypeString -// } - -// func handleCSVLine(f *appendable.IndexFile, r io.ReaderAt, dec *csv.Reader, headers []string, path []string, data btree.MemoryPointer) error { -// slog.Debug("Processing CSV line", slog.Int("dataIndex", int(dataIndex)), slog.Int("dataOffset", int(dataOffset))) - -// record, err := dec.Read() - -// if err != nil { -// slog.Error("Failed to read CSV record at index", "dataIndex", dataIndex, "error", err) -// return fmt.Errorf("failed to read CSV record at index %d: %w", dataIndex, err) -// } - -// slog.Debug("CSV line read successfully", "record", record) - -// cumulativeLength := uint64(0) - -// for fieldIndex, fieldValue := range record { -// if fieldIndex >= len(headers) { -// slog.Error("Field index is out of bounds with headers", "fieldIndex", fieldIndex, "headers", slog.Any("headers", headers)) -// return fmt.Errorf("field index %d is out of bounds with header", fieldIndex) -// } - -// fieldName := headers[fieldIndex] -// name := strings.Join(append(path, fieldName), ".") - -// fieldOffset := data.Offset + cumulativeLength -// fieldLength := uint32(len(fieldValue)) - -// value, fieldType := InferCSVField(fieldValue) - -// page, err := f.FindOrCreateIndex(name, fieldType) -// if err != nil { -// return fmt.Errorf("failed to find or create index: %w", err) -// } - -// switch fieldType { -// case appendable.FieldTypeFloat64: -// buf := make([]byte, 8) -// binary.LittleEndian.PutUint64(buf, math.Float64bits(value.(float64))) -// if err := page.BPTree(r).Insert(btree.ReferencedValue{Value: buf}, data); err != nil { -// return fmt.Errorf("failed to insert into b+tree: %w", err) -// } -// case appendable.FieldTypeBoolean: -// if value.(bool) { -// if err := page.BPTree(r).Insert(btree.ReferencedValue{Value: []byte{1}}, data); err != nil { -// return fmt.Errorf("failed to insert into b+tree: %w", err) -// } -// } else { -// if err := page.BPTree(r).Insert(btree.ReferencedValue{Value: []byte{0}}, data); err != nil { -// return fmt.Errorf("failed to insert into b+tree: %w", err) -// } -// } -// case appendable.FieldTypeString: -// if err := page.BPTree(r).Insert(btree.ReferencedValue{ -// DataPointer: btree.MemoryPointer{ -// Offset: fieldOffset, -// Length: fieldLength, -// }, -// // trim the quotes -// Value: []byte(value.(string)), -// }, data); err != nil { -// return fmt.Errorf("failed to insert into b+tree: %w", err) -// } - -// slog.Debug("Appended index record", -// slog.String("field", name), -// slog.Any("value", value), -// slog.Int("start", int(fieldOffset))) - -// case appendable.FieldTypeNull: -// // nil values are a bit of a degenerate case, we are essentially using the btree -// // as a set. we store the value as an empty byte slice. 
-// if err := page.BPTree(r).Insert(btree.ReferencedValue{Value: []byte{}}, data); err != nil { -// return fmt.Errorf("failed to insert into b+tree: %w", err) -// } -// slog.Debug("Marked field", "name", name) - -// default: -// slog.Error("Encountered unexpected type '%T' for field '%s'", value, name) -// return fmt.Errorf("unexpected type '%T'", value) -// } - -// cumulativeLength += uint64(fieldLength + 1) -// } - -// return nil -// } +import ( + "bufio" + "bytes" + "encoding/binary" + "encoding/csv" + "fmt" + "io" + "log/slog" + "math" + "strconv" + "strings" + + "github.com/kevmo314/appendable/pkg/appendable" + "github.com/kevmo314/appendable/pkg/btree" +) + +type CSVHandler struct { + io.ReadSeeker +} + +var _ appendable.DataHandler = (*CSVHandler)(nil) + +func (c CSVHandler) Format() appendable.Format { + return appendable.FormatCSV +} + +func (c CSVHandler) Synchronize(f *appendable.IndexFile, df appendable.DataFile) error { + slog.Debug("Starting CSV synchronization") + + var headers []string + var err error + + metadata, err := f.Metadata() + if err != nil { + return fmt.Errorf("failed to read metadata: %w", err) + } + if _, err := df.Seek(int64(metadata.ReadOffset), io.SeekStart); err != nil { + return fmt.Errorf("failed to seek: %w", err) + } + + isHeader := false + + isEmpty, err := f.IsEmpty() + if err != nil { + return fmt.Errorf("failed to check if index file is empty: %w", err) + } + + if isEmpty { + isHeader = true + } else { + fieldNames, err := f.IndexFieldNames() + if err != nil { + return fmt.Errorf("failed to retrieve index field names: %w", err) + } + headers = fieldNames + } + + scanner := bufio.NewScanner(df) + + for scanner.Scan() { + line := scanner.Bytes() + + if isHeader { + slog.Info("Parsing CSV headers") + dec := csv.NewReader(bytes.NewReader(line)) + headers, err = dec.Read() + if err != nil { + slog.Error("failed to parse CSV header", "error", err) + return fmt.Errorf("failed to parse CSV header: %w", err) + } + metadata.ReadOffset += uint64(len(line)) + 1 + isHeader = false + continue + } + + dec := csv.NewReader(bytes.NewReader(line)) + + if err := handleCSVLine(f, df, dec, headers, []string{}, btree.MemoryPointer{ + Offset: metadata.ReadOffset, + Length: uint32(len(line)), + }); err != nil { + return fmt.Errorf("failed to handle object: %w", err) + } + + metadata.ReadOffset += uint64(len(line)) + 1 // include the newline + } + + if err := scanner.Err(); err != nil { + return fmt.Errorf("failed to scan: %w", err) + } + + // update the metadata + if err := f.SetMetadata(metadata); err != nil { + return fmt.Errorf("failed to set metadata: %w", err) + } + + slog.Debug("indexes", slog.Any("", f.Indexes)) + slog.Debug("Ending CSV synchronization") + slog.Debug("=========") + return nil +} + +func fieldRankCsvField(fieldValue any) int { + slog.Debug("serialize", slog.Any("fieldValue", fieldValue)) + switch fieldValue.(type) { + case nil: + slog.Debug("nil", slog.Any("fieldValue", fieldValue)) + return 1 + case bool: + slog.Debug("bool", slog.Any("fieldValue", fieldValue)) + return 2 + case int, int8, int16, int32, int64, float32, float64: + slog.Debug("number", slog.Any("fieldValue", fieldValue)) + return 3 + case string: + slog.Debug("string", slog.Any("fieldValue", fieldValue)) + return 4 + default: + panic("unknown type") + } +} + +func InferCSVField(fieldValue string) (interface{}, appendable.FieldType) { + if fieldValue == "" { + return nil, appendable.FieldTypeNull + } + + if i, err := strconv.Atoi(fieldValue); err == nil { + + fmt.Printf("\n%v is a 
integer\n", fieldValue) + return float64(i), appendable.FieldTypeFloat64 + } + + if f, err := strconv.ParseFloat(fieldValue, 64); err == nil { + + fmt.Printf("\n%v is a float\n", fieldValue) + return float64(f), appendable.FieldTypeFloat64 + } + + if b, err := strconv.ParseBool(fieldValue); err == nil { + return b, appendable.FieldTypeBoolean + } + + return fieldValue, appendable.FieldTypeString +} + +func handleCSVLine(f *appendable.IndexFile, r io.ReaderAt, dec *csv.Reader, headers []string, path []string, data btree.MemoryPointer) error { + record, err := dec.Read() + if err != nil { + slog.Error("Failed to read CSV record at index", "error", err) + return fmt.Errorf("failed to read CSV record: %w", err) + } + + cumulativeLength := uint64(0) + + for fieldIndex, fieldValue := range record { + if fieldIndex >= len(headers) { + slog.Error("Field index is out of bounds with headers", "fieldIndex", fieldIndex, "headers", slog.Any("headers", headers)) + return fmt.Errorf("field index %d is out of bounds with header", fieldIndex) + } + + fieldName := headers[fieldIndex] + + name := strings.Join(append(path, fieldName), ".") + + fieldOffset := data.Offset + cumulativeLength + fieldLength := uint32(len(fieldValue)) + + value, fieldType := InferCSVField(fieldValue) + page, err := f.FindOrCreateIndex(name, fieldType) + + if err != nil { + return fmt.Errorf("failed to find or create index: %w", err) + } + + switch fieldType { + case appendable.FieldTypeFloat64: + buf := make([]byte, 8) + binary.LittleEndian.PutUint64(buf, math.Float64bits(value.(float64))) + if err := page.BPTree(r).Insert(btree.ReferencedValue{Value: buf}, data); err != nil { + return fmt.Errorf("failed to insert into b+tree: %w", err) + } + case appendable.FieldTypeBoolean: + if value.(bool) { + if err := page.BPTree(r).Insert(btree.ReferencedValue{Value: []byte{1}}, data); err != nil { + return fmt.Errorf("failed to insert into b+tree: %w", err) + } + } else { + if err := page.BPTree(r).Insert(btree.ReferencedValue{Value: []byte{0}}, data); err != nil { + return fmt.Errorf("failed to insert into b+tree: %w", err) + } + } + case appendable.FieldTypeString: + if err := page.BPTree(r).Insert(btree.ReferencedValue{ + DataPointer: btree.MemoryPointer{ + Offset: fieldOffset, + Length: fieldLength, + }, + // trim the quotes + Value: []byte(value.(string)), + }, data); err != nil { + return fmt.Errorf("failed to insert into b+tree: %w", err) + } + + slog.Debug("Appended index record", + slog.String("field", name), + slog.Any("value", value), + slog.Int("start", int(fieldOffset))) + + case appendable.FieldTypeNull: + // nil values are a bit of a degenerate case, we are essentially using the btree + // as a set. we store the value as an empty byte slice. 
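// For context, this case is reached when InferCSVField above returned
// FieldTypeNull for an empty cell. A rough sketch of that inference on a few
// representative inputs (derived from its definition; not an exhaustive table):
//
//	InferCSVField("")     // -> nil,    FieldTypeNull
//	InferCSVField("3")    // -> 3.0,    FieldTypeFloat64 (integers widen to float64)
//	InferCSVField("3.4")  // -> 3.4,    FieldTypeFloat64
//	InferCSVField("true") // -> true,   FieldTypeBoolean
//	InferCSVField("mica") // -> "mica", FieldTypeString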
+ if err := page.BPTree(r).Insert(btree.ReferencedValue{Value: []byte{}}, data); err != nil { + return fmt.Errorf("failed to insert into b+tree: %w", err) + } + slog.Debug("Marked field", "name", name) + + default: + slog.Error("Encountered unexpected type '%T' for field '%s'", value, name) + return fmt.Errorf("unexpected type '%T'", value) + } + + cumulativeLength += uint64(fieldLength + 1) + } + + return nil +} diff --git a/pkg/handlers/csv_test.go b/pkg/handlers/csv_test.go index ca11c5ff..14e40d28 100644 --- a/pkg/handlers/csv_test.go +++ b/pkg/handlers/csv_test.go @@ -1,308 +1,205 @@ package handlers -// import ( -// "bytes" -// "fmt" -// "log/slog" -// "os" -// "reflect" -// "strings" -// "testing" - -// "github.com/kevmo314/appendable/pkg/protocol" -// ) - -// func TestAppendDataRowCSV(t *testing.T) { - -// originalLogger := slog.Default() - -// // Create a logger with Debug on -// debugLevel := &slog.LevelVar{} -// debugLevel.Set(slog.LevelDebug) -// debugLogger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{ -// Level: debugLevel, -// })) - -// slog.SetDefault(debugLogger) - -// defer slog.SetDefault(originalLogger) - -// var mockCsv string = "header1\ntest1\n" -// var mockCsv2 string = "header1\ntest1\ntest3\n" - -// t.Run("no schema changes", func(t *testing.T) { - -// i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader(mockCsv)}) -// if err != nil { -// t.Fatal(err) -// } - -// buf := &bytes.Buffer{} - -// if err := i.Serialize(buf); err != nil { -// t.Fatal(err) -// } - -// j, err := ReadIndexFile(buf, CSVHandler{ReadSeeker: strings.NewReader(mockCsv2)}) -// if err != nil { -// t.Fatal(err) -// } - -// // check that the index file now has the additional data ranges but same number of indices -// if len(j.Indexes) != 1 { -// t.Errorf("got len(i.Indexes) = %d, want 1", len(i.Indexes)) -// } - -// if len(j.EndByteOffsets) != 2 { -// t.Errorf("got len(i.DataRanges) = %d, want 2", len(i.EndByteOffsets)) -// } - -// // check that the first data range is untouched despite being incorrect -// if j.EndByteOffsets[0] != uint64(len(mockCsv)) { -// t.Errorf("got i.DataRanges[0].EndByteOffset = %d, want %d", j.EndByteOffsets[0], uint64(len(mockCsv))) -// } - -// // check that the second data range has properly set offsets -// if j.EndByteOffsets[1] != uint64(len(mockCsv2)) { -// t.Errorf("got i.DataRanges[1].EndByteOffset = %d, want %d", j.EndByteOffsets[1], uint64(len(mockCsv2))) -// } -// }) - -// t.Run("check end + start byte offsets multiple", func(t *testing.T) { -// i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader(mockCsv2)}) -// if err != nil { -// t.Fatal(err) -// } - -// if len(i.Indexes) != 1 { -// t.Errorf("got len(i.Indexes) = %d, want 1", len(i.Indexes)) -// } - -// if len(i.Indexes[0].IndexRecords) != 2 { -// t.Errorf("got len(i.Indexes[0].IndexRecords) = %d, want 2", len(i.Indexes[0].IndexRecords)) -// } - -// if i.Indexes[0].IndexRecords["test1"][0].FieldStartByteOffset != uint64(len("header1\n")) { -// t.Errorf("got i.Indexes[0].IndexRecords[\"test1\"][0].FieldStartByteOffset = %d, want 7", i.Indexes[0].IndexRecords["test1"][0].FieldStartByteOffset) -// } - -// if i.Indexes[0].IndexRecords["test3"][0].FieldStartByteOffset != uint64(len("header1\ntest1\n")) { -// t.Errorf("got i.Indexes[0].IndexRecords[\"test3\"][0].FieldStartByteOffset = %d, want %d", i.Indexes[0].IndexRecords["test3"][0].FieldStartByteOffset, uint64(len("header\ntest1\n"))) -// } - -// }) - -// t.Run("append index to existing", func(t *testing.T) { -// i, err := 
NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader(mockCsv)}) -// if err != nil { -// t.Fatal(err) -// } - -// buf := &bytes.Buffer{} - -// if err := i.Serialize(buf); err != nil { -// t.Fatal(err) -// } - -// j, err := ReadIndexFile(buf, CSVHandler{ReadSeeker: strings.NewReader(mockCsv2)}) -// if err != nil { -// t.Fatal(err) -// } - -// // check that the index file now has the additional data ranges but same number of indices -// if len(j.Indexes) != 1 { -// t.Errorf("got len(j.Indexes) = %d, want 1", len(j.Indexes)) -// } - -// if len(j.Indexes[0].IndexRecords) != 2 { -// fmt.Printf("index records look like %v", j.Indexes[0].IndexRecords) -// t.Errorf("got len(j.Indexes[0].IndexRecords) = %d, want 2", len(j.Indexes[0].IndexRecords)) -// } - -// if len(j.Indexes[0].IndexRecords["test1"]) != 1 { -// t.Errorf("got len(j.Indexes[0].IndexRecords[\"test1\"]) = %d, want 1", len(j.Indexes[0].IndexRecords["test1"])) -// } -// if len(j.Indexes[0].IndexRecords["test3"]) != 1 { -// for key, records := range j.Indexes[0].IndexRecords { -// t.Errorf("\n\n\nKey: %v, Records: %+v", key, records) -// } -// t.Errorf("got len(j.Indexes[0].IndexRecords[\"test3\"]) = %d, want 1", len(j.Indexes[0].IndexRecords["test3"])) -// } - -// if j.Indexes[0].IndexRecords["test1"][0].DataNumber != 0 { -// t.Errorf("got i.Indexes[0].IndexRecords[\"test1\"][0].DataNumber = %d, want 0", j.Indexes[0].IndexRecords["test1"][0].DataNumber) -// } -// if j.Indexes[0].IndexRecords["test1"][0].FieldStartByteOffset != uint64(len("header1\n")) { -// t.Errorf("got i.Indexes[0].IndexRecords[\"test1\"][0].FieldStartByteOffset = %d, want %d", j.Indexes[0].IndexRecords["test1"][0].FieldStartByteOffset, uint64(len("header\n"))) -// } - -// if j.Indexes[0].IndexRecords["test3"][0].DataNumber != 1 { -// t.Errorf("got i.Indexes[0].IndexRecords[\"test3\"][0].DataNumber = %d, want 1", j.Indexes[0].IndexRecords["test3"][0].DataNumber) -// } - -// // verify byte offset calculation -// if j.Indexes[0].IndexRecords["test3"][0].FieldStartByteOffset != uint64(len("header1\ntest1\n")) { -// t.Errorf("got i.Indexes[0].IndexRecords[\"test3\"][0].FieldStartByteOffset = %d, want %d", j.Indexes[0].IndexRecords["test3"][0].FieldStartByteOffset, uint64(len("header\ntest1\n"))) -// } -// }) - -// t.Run("assert correct types", func(t *testing.T) { -// i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader("n1,n2\n3.4,3\n")}) -// if err != nil { -// t.Fatal(err) -// } - -// buf := &bytes.Buffer{} - -// if err := i.Serialize(buf); err != nil { -// t.Fatal(err) -// } - -// j, err := ReadIndexFile(buf, CSVHandler{ReadSeeker: strings.NewReader("n1,n2\n3.4,3\n4.3,4")}) -// if err != nil { -// t.Fatal(err) -// } - -// for _, index := range i.Indexes { -// for key := range index.IndexRecords { -// keyType := reflect.TypeOf(key).String() -// if keyType != "float64" { -// t.Errorf("i keytype is %v", keyType) -// } - -// if index.FieldType != protocol.FieldTypeNumber { -// t.Errorf("index field type is not number. actual: %v", index.FieldType) -// } -// } -// } - -// for _, index := range j.Indexes { -// for key := range index.IndexRecords { -// keyType := reflect.TypeOf(key).String() -// if keyType != "float64" { -// t.Errorf("j keytype is %v", keyType) -// } - -// if index.FieldType != protocol.FieldTypeNumber { -// t.Errorf("index field type is not number. 
actual: %v", index.FieldType) -// } -// } -// } - -// }) - -// t.Run("multiple headers", func(t *testing.T) { - -// i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader("name,move\nmica,coyote\n")}) -// if err != nil { -// t.Fatal(err) -// } - -// buf := &bytes.Buffer{} - -// if err := i.Serialize(buf); err != nil { -// t.Fatal(err) -// } - -// j, err := ReadIndexFile(buf, CSVHandler{ReadSeeker: strings.NewReader("name,move\nmica,coyote\ngalvao,mount\n")}) -// if err != nil { -// t.Fatal(err) -// } - -// // check that the index file now has the additional data ranges but same number of indices -// if len(j.Indexes) != 2 { -// t.Errorf("got len(i.Indexes) = %d, want 2", len(i.Indexes)) -// } - -// if len(j.EndByteOffsets) != 2 { -// t.Errorf("got len(i.DataRanges) = %d, want 2", len(i.EndByteOffsets)) -// } - -// // check that the first data range is untouched despite being incorrect -// if j.EndByteOffsets[0] != uint64(len("name,move\nmica,coyote\n")) { -// t.Errorf("got i.DataRanges[0].EndByteOffset = %d, want %d", j.EndByteOffsets[0], uint64(len("name,move\nmica,coyote\n"))) -// } - -// // check that the second data range has properly set offsets -// if j.EndByteOffsets[1] != uint64(len("name,move\nmica,coyote\ngalvao,mount\n")) { -// t.Errorf("got i.DataRanges[1].EndByteOffset = %d, want %d", j.EndByteOffsets[1], uint64(len("name,move\nmica,coyote\ngalvao,mount\n"))) -// } - -// fmt.Printf("index file looks like: %v", j.Indexes) -// }) - -// t.Run("generate index file", func(t *testing.T) { -// i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader("")}) - -// if err != nil { -// t.Fatal(err) -// } - -// buf := &bytes.Buffer{} - -// if err := i.Serialize(buf); err != nil { -// t.Fatal(err) -// } - -// _, err = ReadIndexFile(buf, CSVHandler{ReadSeeker: strings.NewReader(mockCsv2)}) -// if err != nil { -// t.Fatal(err) -// } - -// }) - -// t.Run("existing index but different type", func(t *testing.T) { -// i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader("test\ntest1\n")}) -// if err != nil { -// t.Fatal(err) -// } - -// buf := &bytes.Buffer{} - -// if err := i.Serialize(buf); err != nil { -// t.Fatal(err) -// } - -// j, err := ReadIndexFile(buf, CSVHandler{ReadSeeker: strings.NewReader("test\ntest1\n123\n")}) -// if err != nil { -// t.Fatal(err) -// } - -// // check that the index file now has the additional data ranges but same number of indices -// if len(j.Indexes) != 1 { -// t.Errorf("got len(i.Indexes) = %d, want 1", len(j.Indexes)) -// } - -// if j.Indexes[0].FieldType != protocol.FieldTypeString|protocol.FieldTypeNumber { -// t.Errorf("got i.Indexes[0].FieldType = %#v, want protocol.FieldTypeUnknown", j.Indexes[0].FieldType) -// } -// }) - -// t.Run("existing index but nullable type", func(t *testing.T) { -// i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader("test,test2\nomoplata,armbar\n")}) -// if err != nil { -// t.Fatal(err) -// } - -// buf := &bytes.Buffer{} - -// if err := i.Serialize(buf); err != nil { -// t.Fatal(err) -// } - -// j, err := ReadIndexFile(buf, CSVHandler{ReadSeeker: strings.NewReader("test,test2\nomoplata,armbar\n,singlelegx\n")}) -// if err != nil { -// t.Fatal(err) -// } - -// // check that the index file now has the additional data ranges but same number of indices -// if len(j.Indexes) != 2 { -// t.Errorf("got len(i.Indexes) = %d, want 2", len(j.Indexes)) -// } - -// if j.Indexes[0].FieldType != protocol.FieldTypeNull|protocol.FieldTypeString { -// t.Errorf("got i.Indexes[0].FieldType = %#v, want 
protocol.FieldTypeNullableString", j.Indexes[0].FieldType) -// } -// }) - -// } +import ( + "encoding/binary" + "github.com/kevmo314/appendable/pkg/appendable" + "github.com/kevmo314/appendable/pkg/buftest" + "log/slog" + "math" + "os" + "strings" + "testing" +) + +func TestCSV(t *testing.T) { + originalLogger := slog.Default() + + debugLevel := &slog.LevelVar{} + debugLevel.Set(slog.LevelDebug) + debugLogger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{ + Level: debugLevel, + })) + + slog.SetDefault(debugLogger) + + defer slog.SetDefault(originalLogger) + + t.Run("no schema changes", func(t *testing.T) { + f := buftest.NewSeekableBuffer() + g := strings.NewReader("test\ntest1\n") + + i, err := appendable.NewIndexFile(f, CSVHandler{}) + if err != nil { + t.Fatal(err) + } + + indexes1, err := i.Indexes() + if err != nil { + t.Fatal(err) + } + + collected1, err := indexes1.Collect() + if err != nil { + t.Fatal(err) + } + + if len(collected1) != 0 { + t.Errorf("got len(i.Indexes) = %d, want 0", len(collected1)) + } + + if err := i.Synchronize(g); err != nil { + t.Fatal(err) + } + + indexes2, err := i.Indexes() + if err != nil { + t.Fatal(err) + } + + collected2, err := indexes2.Collect() + if err != nil { + t.Fatal(err) + } + + if len(collected2) != 1 { + t.Errorf("got len(i.Indexes) = %d, want 1", len(collected2)) + } + }) + t.Run("correctly sets field offset", func(t *testing.T) { + r1 := strings.NewReader("test\ntest1\n") + r2 := strings.NewReader("test\ntest1\ntest2\n") + + f := buftest.NewSeekableBuffer() + + i, err := appendable.NewIndexFile(f, CSVHandler{}) + if err != nil { + t.Fatal(err) + } + + if err := i.Synchronize(r1); err != nil { + t.Fatal(err) + } + + if err := i.Synchronize(r2); err != nil { + t.Fatal(err) + } + + indexes, err := i.Indexes() + if err != nil { + t.Fatal(err) + } + + collected, err := indexes.Collect() + if err != nil { + t.Fatal(err) + } + + if len(collected) != 1 { + t.Errorf("got len(i.Indexes) = %d, want 1", len(collected)) + } + + mp1, found, err := collected[0].BPTree(r2).Find([]byte("test1")) + if err != nil { + t.Fatal(err) + } + if !found { + t.Errorf("got i.Indexes[0].BPTree().Find(test1) = nil, want non-nil") + } + if mp1.Offset != 0 || mp1.Length != uint32(len("test\ntest1\n")) { + // t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = %+v, want {%d, %d}", mp1, len("test\n"), len("test\ntest1\n")) + } + + mp2, found, err := collected[0].BPTree(r2).Find([]byte("test2")) + if err != nil { + t.Fatal(err) + } + if !found { + t.Errorf("got i.Indexes[0].BPTree().Find(\"test2\") = nil, want non-nil") + } + + if mp2.Offset != uint64(len("test\ntest1\n")) || mp2.Length != uint32(len("test\ntest1\ntest2\n")) { + // t.Errorf("got i.Indexes[0].BPTree().Find(\"test2\") = %+v, want {%d, %d}", mp2, len("test\ntest1\n"), len("test\ntest1\ntest2\n")) + } + }) + + t.Run("existing index but different type", func(t *testing.T) { + + s1 := "test\ntest1\n" + s2 := "test\ntest1\n123\n" + + f := buftest.NewSeekableBuffer() + + i, err := appendable.NewIndexFile(f, CSVHandler{}) + if err != nil { + t.Fatal(err) + } + + if err := i.Synchronize(strings.NewReader(s1)); err != nil { + t.Fatal(err) + } + + r2 := strings.NewReader(s2) + if err := i.Synchronize(r2); err != nil { + t.Fatal(err) + } + + indexes, err := i.Indexes() + if err != nil { + t.Fatal(err) + } + + collected, err := indexes.Collect() + if err != nil { + t.Fatal(err) + } + + // check that the index file now has the additional index + if len(collected) != 2 { + t.Errorf("got len(i.Indexes) = 
%d, want 1", len(collected)) + } + + mp1, found, err := collected[0].BPTree(r2).Find([]byte("test1")) + if err != nil { + t.Fatal(err) + } + if !found { + t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = nil, want non-nil") + } + if mp1.Offset != 0 || mp1.Length != uint32(len("{\"test\":\"test1\"}")) { + // t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = %+v, want {0, %d}", mp1, len("{\"test\":\"test1\"}")) + } + + buf1, err := collected[0].Metadata() + if err != nil { + t.Fatal(err) + } + md1 := &appendable.IndexMeta{} + if err := md1.UnmarshalBinary(buf1); err != nil { + t.Fatal(err) + } + if md1.FieldType != appendable.FieldTypeString { + t.Errorf("got i.Indexes[0].FieldType = %#v, want FieldTypeString", md1.FieldType) + } + + v2 := make([]byte, 8) + binary.LittleEndian.PutUint64(v2, math.Float64bits(123)) + mp2, found, err := collected[1].BPTree(r2).Find(v2) + if err != nil { + t.Fatal(err) + } + if !found { + t.Errorf("got i.Indexes[1].BPTree().Find(\"test3\") = nil, want non-nil") + } + if mp2.Offset != uint64(len("{\"test\":\"test1\"}\n")) || mp2.Length != uint32(len("{\"test\":123}")) { + // t.Errorf("got i.Indexes[1].BPTree().Find(\"test3\") = %+v, want {%d, %d}", mp2, len("{\"test\":\"test1\"}\n"), len("{\"test\":123}")) + } + + md2 := &appendable.IndexMeta{} + if err := collected[1].UnmarshalMetadata(md2); err != nil { + t.Fatal(err) + } + if md2.FieldType != appendable.FieldTypeFloat64 { + t.Errorf("got i.Indexes[1].FieldType = %#v, want FieldTypeFloat64", md2.FieldType) + } + }) +} From a465acc87f45790fdc2bbddb35fadd5a8939e27d Mon Sep 17 00:00:00 2001 From: Matthew <38759997+friendlymatthew@users.noreply.github.com> Date: Thu, 1 Feb 2024 15:00:36 -0500 Subject: [PATCH 04/15] coverage: recognize null fields test --- pkg/handlers/csv_test.go | 65 ++++++++++++++++++++++++++++++++------ pkg/handlers/jsonl_test.go | 47 +++++++++++++++++++++++++++ 2 files changed, 103 insertions(+), 9 deletions(-) diff --git a/pkg/handlers/csv_test.go b/pkg/handlers/csv_test.go index 14e40d28..978cd858 100644 --- a/pkg/handlers/csv_test.go +++ b/pkg/handlers/csv_test.go @@ -105,8 +105,8 @@ func TestCSV(t *testing.T) { if !found { t.Errorf("got i.Indexes[0].BPTree().Find(test1) = nil, want non-nil") } - if mp1.Offset != 0 || mp1.Length != uint32(len("test\ntest1\n")) { - // t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = %+v, want {%d, %d}", mp1, len("test\n"), len("test\ntest1\n")) + if mp1.Offset != uint64(len("test\n")) || mp1.Length != uint32(len("test1")) { + t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = %+v, want {%d, %d}", mp1, len("test\n"), len("test1")) } mp2, found, err := collected[0].BPTree(r2).Find([]byte("test2")) @@ -117,11 +117,10 @@ func TestCSV(t *testing.T) { t.Errorf("got i.Indexes[0].BPTree().Find(\"test2\") = nil, want non-nil") } - if mp2.Offset != uint64(len("test\ntest1\n")) || mp2.Length != uint32(len("test\ntest1\ntest2\n")) { - // t.Errorf("got i.Indexes[0].BPTree().Find(\"test2\") = %+v, want {%d, %d}", mp2, len("test\ntest1\n"), len("test\ntest1\ntest2\n")) + if mp2.Offset != uint64(len("test\ntest1\n")) || mp2.Length != uint32(len("test2")) { + t.Errorf("got i.Indexes[0].BPTree().Find(\"test2\") = %+v, want {%d, %d}", mp2, len("test\ntest1\n"), len("test2")) } }) - t.Run("existing index but different type", func(t *testing.T) { s1 := "test\ntest1\n" @@ -165,8 +164,8 @@ func TestCSV(t *testing.T) { if !found { t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = nil, want non-nil") } - if mp1.Offset != 0 || mp1.Length != 
uint32(len("{\"test\":\"test1\"}")) { - // t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = %+v, want {0, %d}", mp1, len("{\"test\":\"test1\"}")) + if mp1.Offset != uint64(len("test\n")) || mp1.Length != uint32(len("test1")) { + t.Errorf("got i.Indexes[0].BPTree().Find(\"test1\") = %+v, want {%d, %d}", mp1, len("test\n"), len("test1")) } buf1, err := collected[0].Metadata() @@ -190,8 +189,8 @@ func TestCSV(t *testing.T) { if !found { t.Errorf("got i.Indexes[1].BPTree().Find(\"test3\") = nil, want non-nil") } - if mp2.Offset != uint64(len("{\"test\":\"test1\"}\n")) || mp2.Length != uint32(len("{\"test\":123}")) { - // t.Errorf("got i.Indexes[1].BPTree().Find(\"test3\") = %+v, want {%d, %d}", mp2, len("{\"test\":\"test1\"}\n"), len("{\"test\":123}")) + if mp2.Offset != uint64(len("test\ntest1\n")) || mp2.Length != uint32(len("123")) { + t.Errorf("got i.Indexes[1].BPTree().Find(\"test3\") = %+v, want {%d, %d}", mp2, len("test\ntest1\n"), len("123")) } md2 := &appendable.IndexMeta{} @@ -202,4 +201,52 @@ func TestCSV(t *testing.T) { t.Errorf("got i.Indexes[1].FieldType = %#v, want FieldTypeFloat64", md2.FieldType) } }) + + t.Run("recognize null fields", func(t *testing.T) { + r1 := strings.NewReader("nullheader,header1\n,wef\n") + r2 := strings.NewReader("nullheader,header1\n,wef\n,howdy\n") + + f := buftest.NewSeekableBuffer() + + i, err := appendable.NewIndexFile(f, CSVHandler{}) + if err != nil { + t.Fatal(err) + } + + if err := i.Synchronize(r1); err != nil { + t.Fatal(err) + } + + if err := i.Synchronize(r2); err != nil { + t.Fatal(err) + } + + indexes, err := i.Indexes() + if err != nil { + t.Fatal(err) + } + + collected, err := indexes.Collect() + if err != nil { + t.Fatal(err) + } + + if len(collected) != 2 { + t.Errorf("got len(i.Indexes) = %d, want 1", len(collected)) + } + buf1, err := collected[0].Metadata() + if err != nil { + t.Fatal(err) + } + md1 := &appendable.IndexMeta{} + + if err := md1.UnmarshalBinary(buf1); err != nil { + t.Fatal(err) + } + + if md1.FieldName != "nullheader" || md1.FieldType != appendable.FieldTypeNull { + t.Errorf("expected md1.FieldName nullheader, got: %v\nexpected field type to be null, got: %v", md1.FieldName, md1.FieldType) + } + }) + } diff --git a/pkg/handlers/jsonl_test.go b/pkg/handlers/jsonl_test.go index 0453f6b9..4310adda 100644 --- a/pkg/handlers/jsonl_test.go +++ b/pkg/handlers/jsonl_test.go @@ -553,4 +553,51 @@ func TestJSONL(t *testing.T) { t.Errorf("got i.Indexes[1].FieldType = %#v, want FieldTypeNull", md2.FieldType) } }) + + t.Run("recognize null fields", func(t *testing.T) { + r1 := strings.NewReader("{\"nullheader\":null}\n") + r2 := strings.NewReader("{\"nullheader\":null}\n{\"nullheader\":null}\n") + + f := buftest.NewSeekableBuffer() + + i, err := appendable.NewIndexFile(f, JSONLHandler{}) + if err != nil { + t.Fatal(err) + } + + if err := i.Synchronize(r1); err != nil { + t.Fatal(err) + } + + if err := i.Synchronize(r2); err != nil { + t.Fatal(err) + } + + indexes, err := i.Indexes() + if err != nil { + t.Fatal(err) + } + + collected, err := indexes.Collect() + if err != nil { + t.Fatal(err) + } + + if len(collected) != 1 { + t.Errorf("got len(i.Indexes) = %d, want 1", len(collected)) + } + buf1, err := collected[0].Metadata() + if err != nil { + t.Fatal(err) + } + md1 := &appendable.IndexMeta{} + + if err := md1.UnmarshalBinary(buf1); err != nil { + t.Fatal(err) + } + + if md1.FieldName != "nullheader" || md1.FieldType != appendable.FieldTypeNull { + t.Errorf("expected md1.FieldName nullheader, got: %v\nexpected field type 
to be null, got: %v", md1.FieldName, md1.FieldType) + } + }) } From 0bc0dfab968288bd06b148217a915ebaabc3ddb0 Mon Sep 17 00:00:00 2001 From: Matthew <38759997+friendlymatthew@users.noreply.github.com> Date: Thu, 1 Feb 2024 17:44:48 -0500 Subject: [PATCH 05/15] fix: ci --- .github/workflows/test.yml | 6 +++--- pkg/handlers/jsonl_test.go | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index def5e119..2356f4fb 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -10,8 +10,8 @@ jobs: - uses: actions/setup-go@v4 with: go-version: "1.21" - - run: go test -v ./... - - run: go vet -v ./... + - run: go test -v ./pkg/... + - run: go vet -v ./pkg/... node-test: runs-on: ubuntu-latest @@ -23,4 +23,4 @@ jobs: node-version: '18' - run: npm ci - run: npm run build - - run: npm test \ No newline at end of file + - run: npm test diff --git a/pkg/handlers/jsonl_test.go b/pkg/handlers/jsonl_test.go index 4310adda..d6b74c50 100644 --- a/pkg/handlers/jsonl_test.go +++ b/pkg/handlers/jsonl_test.go @@ -246,10 +246,10 @@ func TestJSONL(t *testing.T) { t.Fatal(err) } if !found { - t.Errorf("got i.Indexes[1].BPTree().Find(\"test3\") = nil, want non-nil") + t.Errorf("got i.Indexes[1].BPTree().Find(123) = nil, want non-nil") } if mp2.Offset != uint64(len("{\"test\":\"test1\"}\n")) || mp2.Length != uint32(len("{\"test\":123}")) { - t.Errorf("got i.Indexes[1].BPTree().Find(\"test3\") = %+v, want {%d, %d}", mp2, len("{\"test\":\"test1\"}\n"), len("{\"test\":123}")) + t.Errorf("got i.Indexes[1].BPTree().Find(123)= %+v, want {%d, %d}", mp2, len("{\"test\":\"test1\"}\n"), len("{\"test\":123}")) } md2 := &appendable.IndexMeta{} From 0c5279494f2fb5497329f5d7e2da9efda3a6cd9b Mon Sep 17 00:00:00 2001 From: Matthew <38759997+friendlymatthew@users.noreply.github.com> Date: Mon, 5 Feb 2024 09:44:42 -0500 Subject: [PATCH 06/15] pass tests --- .github/workflows/test.yml | 4 ++-- pkg/appendable/appendable.go | 4 ++-- pkg/btree/multi.go | 12 ++++++------ pkg/btree/pagefile.go | 7 +++++++ pkg/btree/pagefile_test.go | 3 +++ pkg/handlers/csv.go | 2 +- pkg/handlers/csv_test.go | 2 +- pkg/handlers/jsonl_test.go | 2 +- 8 files changed, 23 insertions(+), 13 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2356f4fb..aa672b79 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -10,8 +10,8 @@ jobs: - uses: actions/setup-go@v4 with: go-version: "1.21" - - run: go test -v ./pkg/... - - run: go vet -v ./pkg/... + - run: go test -v ./... + - run: go vet -v ./... 
node-test: runs-on: ubuntu-latest diff --git a/pkg/appendable/appendable.go b/pkg/appendable/appendable.go index 6389e6ca..1e4168d9 100644 --- a/pkg/appendable/appendable.go +++ b/pkg/appendable/appendable.go @@ -95,7 +95,7 @@ type FileMeta struct { func (m *FileMeta) MarshalBinary() ([]byte, error) { buf := make([]byte, 9) buf[0] = byte(m.Version) - binary.LittleEndian.PutUint64(buf[1:], m.ReadOffset) + binary.BigEndian.PutUint64(buf[1:], m.ReadOffset) return buf, nil } @@ -104,7 +104,7 @@ func (m *FileMeta) UnmarshalBinary(buf []byte) error { return fmt.Errorf("invalid metadata size: %d", len(buf)) } m.Version = Version(buf[0]) - m.ReadOffset = binary.LittleEndian.Uint64(buf[1:]) + m.ReadOffset = binary.BigEndian.Uint64(buf[1:]) return nil } diff --git a/pkg/btree/multi.go b/pkg/btree/multi.go index af081255..13c1c952 100644 --- a/pkg/btree/multi.go +++ b/pkg/btree/multi.go @@ -28,14 +28,14 @@ func (m *LinkedMetaPage) Root() (MemoryPointer, error) { return MemoryPointer{}, err } var mp MemoryPointer - return mp, binary.Read(m.rws, binary.LittleEndian, &mp) + return mp, binary.Read(m.rws, binary.BigEndian, &mp) } func (m *LinkedMetaPage) SetRoot(mp MemoryPointer) error { if _, err := m.rws.Seek(int64(m.offset), io.SeekStart); err != nil { return err } - return binary.Write(m.rws, binary.LittleEndian, mp) + return binary.Write(m.rws, binary.BigEndian, mp) } // BPTree returns a B+ tree that uses this meta page as the root @@ -61,7 +61,7 @@ func (m *LinkedMetaPage) Metadata() ([]byte, error) { return nil, err } // the first four bytes represents the length - length := binary.LittleEndian.Uint32(buf[:4]) + length := binary.BigEndian.Uint32(buf[:4]) return buf[4 : 4+length], nil } @@ -81,7 +81,7 @@ func (m *LinkedMetaPage) SetMetadata(data []byte) error { return err } buf := append(make([]byte, 4), data...) 
- binary.LittleEndian.PutUint32(buf, uint32(len(data))) + binary.BigEndian.PutUint32(buf, uint32(len(data))) if _, err := m.rws.Write(buf); err != nil { return err } @@ -101,7 +101,7 @@ func (m *LinkedMetaPage) Next() (*LinkedMetaPage, error) { return nil, err } var next MemoryPointer - if err := binary.Read(m.rws, binary.LittleEndian, &next); err != nil { + if err := binary.Read(m.rws, binary.BigEndian, &next); err != nil { return nil, err } return &LinkedMetaPage{rws: m.rws, offset: next.Offset}, nil @@ -127,7 +127,7 @@ func (m *LinkedMetaPage) AddNext() (*LinkedMetaPage, error) { if _, err := m.rws.Seek(int64(m.offset)+12, io.SeekStart); err != nil { return nil, err } - if err := binary.Write(m.rws, binary.LittleEndian, next.offset); err != nil { + if err := binary.Write(m.rws, binary.BigEndian, next.offset); err != nil { return nil, err } return next, nil diff --git a/pkg/btree/pagefile.go b/pkg/btree/pagefile.go index 8edbf0e7..e92b29f1 100644 --- a/pkg/btree/pagefile.go +++ b/pkg/btree/pagefile.go @@ -161,6 +161,13 @@ func (pf *PageFile) FreePage(offset int64) error { if pf.freePageCount == len(pf.freePageIndexes) { return errors.New("free page index is full") } + + for i, _ := range pf.freePageIndexes { + if pf.freePageIndexes[i] == offset { + return errors.New("offset already exists") + } + } + // push to the head pf.freePageIndexes[pf.freePageHead] = offset pf.freePageHead = (pf.freePageHead + 1) % len(pf.freePageIndexes) diff --git a/pkg/btree/pagefile_test.go b/pkg/btree/pagefile_test.go index e4257196..f04f7c0b 100644 --- a/pkg/btree/pagefile_test.go +++ b/pkg/btree/pagefile_test.go @@ -1,6 +1,7 @@ package btree import ( + "fmt" "io" "testing" @@ -185,9 +186,11 @@ func TestPageFile(t *testing.T) { if err != nil { t.Fatal(err) } + fmt.Printf("pageCount: %v\nindexes: %v", pf.freePageCount, len(pf.freePageIndexes)) if err := pf.FreePage(offset); err != nil { t.Fatal(err) } + fmt.Printf("pageCount: %v\nindexes: %v", pf.freePageCount, len(pf.freePageIndexes)) if err := pf.FreePage(offset); err == nil { t.Fatal("expected error") } diff --git a/pkg/handlers/csv.go b/pkg/handlers/csv.go index 76d1e4b9..2269025c 100644 --- a/pkg/handlers/csv.go +++ b/pkg/handlers/csv.go @@ -178,7 +178,7 @@ func handleCSVLine(f *appendable.IndexFile, r io.ReaderAt, dec *csv.Reader, head switch fieldType { case appendable.FieldTypeFloat64: buf := make([]byte, 8) - binary.LittleEndian.PutUint64(buf, math.Float64bits(value.(float64))) + binary.BigEndian.PutUint64(buf, math.Float64bits(value.(float64))) if err := page.BPTree(r).Insert(btree.ReferencedValue{Value: buf}, data); err != nil { return fmt.Errorf("failed to insert into b+tree: %w", err) } diff --git a/pkg/handlers/csv_test.go b/pkg/handlers/csv_test.go index 978cd858..c39b8135 100644 --- a/pkg/handlers/csv_test.go +++ b/pkg/handlers/csv_test.go @@ -181,7 +181,7 @@ func TestCSV(t *testing.T) { } v2 := make([]byte, 8) - binary.LittleEndian.PutUint64(v2, math.Float64bits(123)) + binary.BigEndian.PutUint64(v2, math.Float64bits(123)) mp2, found, err := collected[1].BPTree(r2).Find(v2) if err != nil { t.Fatal(err) diff --git a/pkg/handlers/jsonl_test.go b/pkg/handlers/jsonl_test.go index d6b74c50..362cf1c9 100644 --- a/pkg/handlers/jsonl_test.go +++ b/pkg/handlers/jsonl_test.go @@ -240,7 +240,7 @@ func TestJSONL(t *testing.T) { } v2 := make([]byte, 8) - binary.LittleEndian.PutUint64(v2, math.Float64bits(123)) + binary.BigEndian.PutUint64(v2, math.Float64bits(123)) mp2, found, err := collected[1].BPTree(r2).Find(v2) if err != nil { t.Fatal(err) From 
b71ecc07c1cd7987164d9a01169ec4105788df2c Mon Sep 17 00:00:00 2001 From: Matthew <38759997+friendlymatthew@users.noreply.github.com> Date: Mon, 29 Jan 2024 11:02:57 -0500 Subject: [PATCH 07/15] Add `selectFields` to query language (#71) * add support fields * rename + convert to Pick type --- examples/client/index.html | 23 ++++++++++++++++++++++- src/database.ts | 25 ++++++++++++++++++++++++- src/tests/database.test.ts | 35 +++++++++++++++++++++-------------- 3 files changed, 67 insertions(+), 16 deletions(-) diff --git a/examples/client/index.html b/examples/client/index.html index b4ab68ab..8ca2590c 100644 --- a/examples/client/index.html +++ b/examples/client/index.html @@ -141,6 +141,19 @@ } } + if (query.select) { + // validate the `selectFields` clause + if (!Array.isArray(query.select) || query.select.length === 0) { + return "Error: Invalid 'selectFields' clause."; + } + + for (const field of query.select) { + if (!dbFields.has(field)) { + return `Error: 'key': ${field} in 'selectFields' clause does not exist in dataset.`; + } + } + } + return "Valid Query"; } @@ -238,7 +251,7 @@

[context lines garbled in extraction: HTML markup for the "Query" and "Results" panels of examples/client/index.html lost]
@@ -340,6 +353,14 @@
[context lines garbled in extraction: HTML markup for the "Results" panel lost]

direction: "ASC", }, ], + select: [ + "VendorID", + "trip_distance", + "passenger_count", + "fare_amount", + "tip_amount", + "mta_tax", + ], }, null, 2 diff --git a/src/database.ts b/src/database.ts index fa56b9e9..731b9708 100644 --- a/src/database.ts +++ b/src/database.ts @@ -17,9 +17,12 @@ type OrderBy = { direction: "ASC" | "DESC"; }; +type SelectField = keyof T; + export type Query = { where?: WhereNode[]; orderBy?: OrderBy[]; + select?: SelectField[]; }; export enum FieldType { @@ -249,13 +252,33 @@ export class Database { ); console.log(`Data record: `, dataRecord); - const dataFieldValue = parseIgnoringSuffix( + const parsedFieldValue = parseIgnoringSuffix( await this.dataFile.get( dataRecord.startByteOffset, dataRecord.endByteOffset ), this.formatType ); + + let dataFieldValue = parsedFieldValue; + + if (query.select && query.select.length > 0) { + if ( + typeof parsedFieldValue === "object" && + parsedFieldValue !== null + ) { + dataFieldValue = query.select.reduce( + (acc, field) => { + if (field in parsedFieldValue) { + acc[field] = parsedFieldValue[field]; + } + return acc; + }, + {} as Pick + ); + } + } + yield dataFieldValue; } } diff --git a/src/tests/database.test.ts b/src/tests/database.test.ts index a28e9955..c53fe666 100644 --- a/src/tests/database.test.ts +++ b/src/tests/database.test.ts @@ -1,4 +1,10 @@ -import { Database, FieldType, Query, containsType, parseCsvLine } from "../database"; +import { + Database, + FieldType, + Query, + containsType, + parseCsvLine, +} from "../database"; import { DataFile } from "../data-file"; import { IndexFile, VersionedIndexFile } from "../index-file"; import { FormatType } from ".."; @@ -26,7 +32,11 @@ describe("test query relation", () => { } as jest.Mocked>; // instantiate a Database object with given mocked data file and index file - database = Database.forDataFileAndIndexFile(mockDataFile, mockIndexFile, FormatType.Jsonl); + database = Database.forDataFileAndIndexFile( + mockDataFile, + mockIndexFile, + FormatType.Jsonl + ); }); /* @@ -113,19 +123,16 @@ describe("test field type", () => { }); }); - describe("test parsing csv", () => { - - it("check csv parse", async() => { + it("check csv parse", async () => { const testCases = [ - { data: "151,1", expected: 151}, - { data: ",95,5", expected: 95} + { data: "151,1", expected: 151 }, + { data: ",95,5", expected: 95 }, ]; - testCases.forEach(({ data, expected}) => { - let csv = parseCsvLine(data) - console.log(csv) - }) - - }) -}) \ No newline at end of file + testCases.forEach(({ data, expected }) => { + let csv = parseCsvLine(data); + console.log(csv); + }); + }); +}); From d0b22e79ac5d1e4ea576bce1a3737d3463451a47 Mon Sep 17 00:00:00 2001 From: Matthew <38759997+friendlymatthew@users.noreply.github.com> Date: Mon, 29 Jan 2024 12:38:49 -0500 Subject: [PATCH 08/15] CSV demo + support null fields (#67) * support csv, account for null * cleanup + refactor csv + include jsonl * cleanup + direct workflow to csv --- .github/workflows/example-client.yml | 8 +- examples/README.md | 12 +- examples/client/index.html | 6 +- package.json | 4 +- pkg/appendable/csv_handler.go | 192 +++++++++++ pkg/appendable/index_file.go | 36 ++ pkg/appendable/index_file_csv_test.go | 332 ++++++++++++++++++ pkg/appendable/index_file_jsonl_test.go | 332 ++++++++++++++++++ pkg/appendable/jsonl_handler.go | 177 ++++++++++ pkg/protocol/protocol.go | 189 +++++++++++ src/database.ts | 72 ++-- src/index-file.ts | 434 ++++++++++++------------ src/tests/database.test.ts | 23 +- 13 files changed, 1540 insertions(+), 
277 deletions(-) create mode 100644 pkg/appendable/csv_handler.go create mode 100644 pkg/appendable/index_file_csv_test.go create mode 100644 pkg/appendable/index_file_jsonl_test.go create mode 100644 pkg/appendable/jsonl_handler.go create mode 100644 pkg/protocol/protocol.go diff --git a/.github/workflows/example-client.yml b/.github/workflows/example-client.yml index 180ee865..951a1e2d 100644 --- a/.github/workflows/example-client.yml +++ b/.github/workflows/example-client.yml @@ -29,15 +29,15 @@ jobs: # Fetch the data in workspace cd examples/workspace python3 -m pip install -r requirements.txt - python3 fetch_jsonl.py + python3 fetch_csv.py cd - # Build the index - go run cmd/main.go -jsonl examples/workspace/green_tripdata_2023-01.jsonl + go run cmd/main.go -csv examples/workspace/green_tripdata_2023-01.csv # Copy to client - cp examples/workspace/green_tripdata_2023-01.jsonl examples/client - cp examples/workspace/green_tripdata_2023-01.jsonl.index examples/client + cp examples/workspace/green_tripdata_2023-01.csv examples/client + cp examples/workspace/green_tripdata_2023-01.csv.index examples/client # Build the js lib npm ci diff --git a/examples/README.md b/examples/README.md index b93bbd7c..e65a5a4d 100644 --- a/examples/README.md +++ b/examples/README.md @@ -25,7 +25,11 @@ python3 fetch_jsonl.py Then run the indexing process: ```sh -npm run build-index +# for jsonl: +npm run build-index-jsonl + +# or for csv: +npm run build-index-csv ``` Copy the `.jsonl` and index file to `/client` @@ -35,6 +39,12 @@ cp green_tripdata_2023-01.jsonl ../client cp green_tripdata_2023-01.jsonl.index ../client ``` +or for csv: +```sh +cp green_tripdata_2023-01.csv ../client +cp green_tripdata_2023-01.csv.index ../client +``` + Build the AppendableDB client library: ```sh diff --git a/examples/client/index.html b/examples/client/index.html index 8ca2590c..eff0b437 100644 --- a/examples/client/index.html +++ b/examples/client/index.html @@ -9,9 +9,9 @@ + -
@@ -188,12 +178,14 @@

[diff lines garbled in extraction: HTML markup for the "Fields", "Query", and "Results" panels of examples/client/index.html lost; only stray +/- markers remain]
@@ -201,115 +193,6 @@
[context lines garbled in extraction: HTML markup for the "Results" panel lost]

- + diff --git a/examples/client/styles.css b/examples/client/styles.css new file mode 100644 index 00000000..659c238c --- /dev/null +++ b/examples/client/styles.css @@ -0,0 +1,39 @@ +body, +html { + margin: 0; + padding: 0px 0px 4px 4px; +} +.flex-1 { + flex: 1; + display: flex; + gap: 0 30px; + height: 100vh; + width: 100vw; +} +.result-row { + cursor: pointer; +} +.result-row:hover { + background-color: yellow; +} +#fields { + max-height: calc(100vh - 50px); + overflow-y: auto; +} +#results { + overflow-y: auto; + max-height: calc(100vh - 670px); +} +#results-header { + width: max-content; +} +.header-item, +.result-cell { + padding: 4px; + text-align: left; + min-width: 200px; +} +.header-item { + background-color: #f0f0f0; + font-weight: bold; +} diff --git a/src/db/database.ts b/src/db/database.ts index 8cf90c50..86e79a12 100644 --- a/src/db/database.ts +++ b/src/db/database.ts @@ -1,6 +1,7 @@ import { FormatType } from ".."; import { DataFile } from "../data-file"; import { IndexFile, VersionedIndexFile } from "../index-file"; +import { QueryBuilder } from "./query-builder"; import { validateQuery } from "./query-validation"; export type Schema = { @@ -24,6 +25,7 @@ export type Query = { where?: WhereNode[]; orderBy?: OrderBy[]; select?: SelectField[]; + limit?: number; }; export enum FieldType { @@ -294,4 +296,12 @@ export class Database { } } } + + where( + key: keyof T, + operation: WhereNode["operation"], + value: T[keyof T] + ): QueryBuilder { + return new QueryBuilder(this).where(key, operation, value); + } } diff --git a/src/db/query-builder.ts b/src/db/query-builder.ts new file mode 100644 index 00000000..44127c85 --- /dev/null +++ b/src/db/query-builder.ts @@ -0,0 +1,81 @@ +import { Database, OrderBy, Query, Schema, WhereNode } from "./database"; + +/** + * A class for building and executing database queries in a flexible API style. + * Allows chaining methods for 'where', 'orderBy', 'select', and 'limit' clauses. + */ +export class QueryBuilder { + private queryObject: Query = { + where: [], + orderBy: undefined, + select: undefined, + limit: undefined, + }; + + /** + * Initializes a new instance of the QueryBuilder class. + * @param {Database} database - An Appendable database instance to execute queries against. + */ + constructor(private database: Database) {} + + /** + * Executes the constructed query + */ + get() { + return this.database.query(this.queryObject); + } + + /** + * Adds a 'where' clause to the query. + * + * @param {keyof T} key - The index header's field name to apply the 'where' condition. + * @param {WhereNode["operation"]} operation - The comparison relation (e.g., >=, <=, ==, >=, >). + * @param {T[keyof T]} value - The value to compare against. + * @returns {QueryBuilder} The QueryBuilder instance. + */ + where( + key: keyof T, + operation: WhereNode["operation"], + value: T[keyof T] + ): QueryBuilder { + this.queryObject.where?.push({ key, operation, value }); + return this; + } + + /** + * Adds an 'orderBy' clause to the query. + * + * @param {keyof T} key - The index header's field name to order by. + * @param {OrderBy["direction"]} direction - The sorting direction (e.g., ASC, DESC). + * @returns {QueryBuilder} The QueryBuilder instance. + */ + orderBy(key: keyof T, direction: OrderBy["direction"]): QueryBuilder { + this.queryObject.orderBy + ? this.queryObject.orderBy.push({ key, direction }) + : (this.queryObject.orderBy = [{ key, direction }]); + + return this; + } + + /** + * Specifies the fields to be selected in the query. 
+ * + * @param {(keyof T)[]} keys - A list of index header field names. + * @returns {QueryBuilder} The QueryBuilder instance. + */ + select(keys: (keyof T)[]): QueryBuilder { + this.queryObject.select = keys; + return this; + } + + /** + * Limits the number of records returned by the query. + * + * @param {number} limit - The maximum number of records to return. + * @returns {QueryBuilder} The QueryBuilder instance. + */ + limit(limit: number): QueryBuilder { + this.queryObject.limit = limit; + return this; + } +} diff --git a/src/db/query-validation.ts b/src/db/query-validation.ts index d000c27d..f779fdab 100644 --- a/src/db/query-validation.ts +++ b/src/db/query-validation.ts @@ -35,6 +35,8 @@ function validateWhere( throw new Error("Missing 'where' clause."); } + console.log("validating where: ", where); + for (const whereNode of where) { if (!["<", "<=", "==", ">=", ">"].includes(whereNode.operation)) { throw new Error("Invalid operation in 'where' clause."); @@ -114,6 +116,7 @@ function validateOrderBy( whereKey: string ): void { if (orderBy) { + console.log("validating orderBy: ", orderBy); if (!Array.isArray(orderBy) || orderBy.length === 0) { throw new Error("Invalid 'orderBy' clause."); } @@ -143,8 +146,9 @@ function validateSelect( headers: Header[] ): void { if (select) { + console.log("validating select: ", select); if (!Array.isArray(select) || select.length === 0) { - throw new Error("Invalid 'selectFields' clause"); + throw new Error("Invalid 'select' clause"); } for (const field of select) { @@ -152,7 +156,7 @@ function validateSelect( if (!header) { throw new Error( - `'key': ${field as string} in 'selectFields' clause does not exist in dataset.` + `'key': ${field as string} in 'select' clause does not exist in dataset.` ); } } diff --git a/src/index-file.ts b/src/index-file.ts index f7e35bbe..377b0939 100644 --- a/src/index-file.ts +++ b/src/index-file.ts @@ -140,7 +140,6 @@ class IndexFileV1 implements VersionedIndexFile { throw new Error("offset out of range"); } const headers = await this.indexHeaders(); - console.log("headers: ", headers); const headerIndex = headers.findIndex( (header) => header.fieldName === field ); From 276f3704adfd716ed2880ed60964961b70ef6fce Mon Sep 17 00:00:00 2001 From: Matthew <38759997+friendlymatthew@users.noreply.github.com> Date: Thu, 1 Feb 2024 11:16:55 -0500 Subject: [PATCH 12/15] Coverage: Assert query chain behavior (#79) * coverage * memory conservation --- src/db/query-builder.ts | 54 ++++-- src/tests/query-builder.test.ts | 175 ++++++++++++++++++ ...query.test.ts => query-validation.test.ts} | 0 3 files changed, 217 insertions(+), 12 deletions(-) create mode 100644 src/tests/query-builder.test.ts rename src/tests/{query.test.ts => query-validation.test.ts} (100%) diff --git a/src/db/query-builder.ts b/src/db/query-builder.ts index 44127c85..f5e2c4dc 100644 --- a/src/db/query-builder.ts +++ b/src/db/query-builder.ts @@ -18,6 +18,24 @@ export class QueryBuilder { */ constructor(private database: Database) {} + /** + * Retrieves an immutable copy of the current query. + * + * @returns {Query} The Query instance. + */ + toQuery(): Query { + return { + where: this.queryObject.where ? [...this.queryObject.where] : [], + orderBy: this.queryObject.orderBy + ? [...this.queryObject.orderBy] + : undefined, + select: this.queryObject.select + ? 
[...this.queryObject.select] + : undefined, + limit: this.queryObject.limit, + }; + } + /** * Executes the constructed query */ @@ -38,10 +56,13 @@ export class QueryBuilder { operation: WhereNode["operation"], value: T[keyof T] ): QueryBuilder { - this.queryObject.where?.push({ key, operation, value }); - return this; + const newQuery = new QueryBuilder(this.database); + newQuery.queryObject = { + ...this.queryObject, + where: [...(this.queryObject.where || []), { key, operation, value }], + }; + return newQuery; } - /** * Adds an 'orderBy' clause to the query. * @@ -50,11 +71,12 @@ export class QueryBuilder { * @returns {QueryBuilder} The QueryBuilder instance. */ orderBy(key: keyof T, direction: OrderBy["direction"]): QueryBuilder { - this.queryObject.orderBy - ? this.queryObject.orderBy.push({ key, direction }) - : (this.queryObject.orderBy = [{ key, direction }]); - - return this; + const newQuery = new QueryBuilder(this.database); + newQuery.queryObject = { + ...this.queryObject, + orderBy: [...(this.queryObject.orderBy || []), { key, direction }], + }; + return newQuery; } /** @@ -64,8 +86,12 @@ export class QueryBuilder { * @returns {QueryBuilder} The QueryBuilder instance. */ select(keys: (keyof T)[]): QueryBuilder { - this.queryObject.select = keys; - return this; + const newQuery = new QueryBuilder(this.database); + newQuery.queryObject = { + ...this.queryObject, + select: keys, + }; + return newQuery; } /** @@ -75,7 +101,11 @@ export class QueryBuilder { * @returns {QueryBuilder} The QueryBuilder instance. */ limit(limit: number): QueryBuilder { - this.queryObject.limit = limit; - return this; + const newQuery = new QueryBuilder(this.database); + newQuery.queryObject = { + ...this.queryObject, + limit: limit, + }; + return newQuery; } } diff --git a/src/tests/query-builder.test.ts b/src/tests/query-builder.test.ts new file mode 100644 index 00000000..7c0201a9 --- /dev/null +++ b/src/tests/query-builder.test.ts @@ -0,0 +1,175 @@ +import { DataFile } from "../data-file"; +import { Database, Query } from "../db/database"; +import { QueryBuilder } from "../db/query-builder"; +import { validateQuery } from "../db/query-validation"; +import { Header } from "../index-file"; + +describe("test validate queries", () => { + interface MockSchema { + [key: string]: {}; + VendorID: {}; + store_and_fwd_flag: {}; + fare_amount: {}; + payment_type: {}; + } + + const headers: Header[] = [ + { + fieldName: "VendorID", + fieldType: BigInt(2), + indexRecordCount: BigInt(683211), + }, + { + fieldName: "store_and_fwd_flag", + fieldType: BigInt(33), + indexRecordCount: BigInt(423), + }, + { + fieldName: "fare_amount", + fieldType: BigInt(2), + indexRecordCount: BigInt(68211), + }, + { + fieldName: "payment_type", + fieldType: BigInt(33), + indexRecordCount: BigInt(63887), + }, + ]; + + let database: Database; + + it(`test query builder`, async () => { + let qb = new QueryBuilder(database); + + let qb1 = qb.where("VendorID", "<=", 1); + + expect(async () => { + await validateQuery(qb1.toQuery(), headers); + }).not.toThrow(); + }); + + it(`test basic query chain`, async () => { + let q = new QueryBuilder(database).where("VendorID", "<=", 1); + let query = q.toQuery(); + + expect(query.where).not.toBeNull(); + expect(query.where).toEqual([ + { key: "VendorID", operation: "<=", value: 1 }, + ]); + + expect(async () => { + await validateQuery(query, headers); + }).not.toThrow(); + + q = q.orderBy("VendorID", "ASC"); + query = q.toQuery(); + + expect(query.where).not.toBeNull(); + 
expect(query.where).toEqual([ + { key: "VendorID", operation: "<=", value: 1 }, + ]); + expect(query.orderBy).not.toBeNull(); + expect(query.orderBy).toEqual([{ key: "VendorID", direction: "ASC" }]); + expect(async () => { + await validateQuery(query, headers); + }).not.toThrow(); + + q = q.select(["VendorID", "store_and_fwd_flag", "fare_amount"]); + query = q.toQuery(); + expect(query.where).not.toBeNull(); + expect(query.where).toEqual([ + { key: "VendorID", operation: "<=", value: 1 }, + ]); + expect(query.orderBy).not.toBeNull(); + expect(query.orderBy).toEqual([{ key: "VendorID", direction: "ASC" }]); + expect(query.select).not.toBeNull(); + expect(query.select).toEqual([ + "VendorID", + "store_and_fwd_flag", + "fare_amount", + ]); + }); + + it(`test basic derived query chain`, async () => { + const q0 = new QueryBuilder(database).where("fare_amount", "==", 1); + let query = q0.toQuery(); + + expect(query.where).not.toBeNull(); + expect(query.where).toEqual([ + { key: "fare_amount", operation: "==", value: 1 }, + ]); + + let q1 = q0.orderBy("fare_amount", "DESC"); + query = q1.toQuery(); + + expect(query.where).not.toBeNull(); + expect(query.where).toEqual([ + { key: "fare_amount", operation: "==", value: 1 }, + ]); + expect(query.orderBy).not.toBeNull(); + expect(query.orderBy).toEqual([{ key: "fare_amount", direction: "DESC" }]); + + let q2 = q1.select(["fare_amount"]); + query = q2.toQuery(); + expect(query.where).not.toBeNull(); + expect(query.where).toEqual([ + { key: "fare_amount", operation: "==", value: 1 }, + ]); + expect(query.orderBy).not.toBeNull(); + expect(query.orderBy).toEqual([{ key: "fare_amount", direction: "DESC" }]); + expect(query.select).not.toBeNull(); + expect(query.select).toEqual(["fare_amount"]); + }); + + it(`test multi derived query chain`, async () => { + const q0 = new QueryBuilder(database).where("fare_amount", "==", 1); + let query = q0.toQuery(); + + expect(query.where).not.toBeNull(); + expect(query.where).toEqual([ + { key: "fare_amount", operation: "==", value: 1 }, + ]); + + let q1 = q0.where("VendorID", "==", 3); + query = q1.toQuery(); + + expect(query.where).not.toBeNull(); + expect(query.where).toEqual([ + { key: "fare_amount", operation: "==", value: 1 }, + { key: "VendorID", operation: "==", value: 3 }, + ]); + }); + + it(`test green + red queries`, async () => { + const q0 = new QueryBuilder(database).where("payment_type", ">", ""); + const failQuery = q0.orderBy("VendorID", "ASC"); + expect(failQuery.toQuery().orderBy).toEqual([ + { key: "VendorID", direction: "ASC" }, + ]); + + const passQuery = q0.orderBy("payment_type", "DESC"); + expect(passQuery.toQuery().orderBy).toEqual([ + { key: "payment_type", direction: "DESC" }, + ]); + + const failQuery2 = passQuery.select(["wef"]); + const passQuery2 = passQuery.select([ + "VendorID", + "payment_type", + "fare_amount", + ]); + + // red queries + [failQuery, failQuery2].forEach(async (query) => { + await expect(() => + validateQuery(query.toQuery(), headers) + ).rejects.toThrow(); + }); + + // green queries + [passQuery, passQuery2].forEach(async (query) => { + console.log(query.toQuery()); + await expect(() => validateQuery(query.toQuery(), headers)).not.toThrow(); + }); + }); +}); diff --git a/src/tests/query.test.ts b/src/tests/query-validation.test.ts similarity index 100% rename from src/tests/query.test.ts rename to src/tests/query-validation.test.ts From 532f173a1f392447e077dae30a32a222c083cd08 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Mon, 5 Feb 2024 10:58:55 -0500 Subject: 
[PATCH 13/15] bad merge --- pkg/appendable/index_file.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/appendable/index_file.go b/pkg/appendable/index_file.go index be4e85da..72e319d3 100644 --- a/pkg/appendable/index_file.go +++ b/pkg/appendable/index_file.go @@ -25,7 +25,6 @@ type IndexFile struct { dataHandler DataHandler } -<<<<<<< HEAD func NewIndexFile(f io.ReadWriteSeeker, dataHandler DataHandler) (*IndexFile, error) { pf, err := btree.NewPageFile(f) if err != nil { From 6d9180ad4fc2f4058d6d78090079479e77aa30f6 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Mon, 5 Feb 2024 11:11:17 -0500 Subject: [PATCH 14/15] merge --- pkg/appendable/index_file.go | 35 ----------------------------------- 1 file changed, 35 deletions(-) diff --git a/pkg/appendable/index_file.go b/pkg/appendable/index_file.go index 72e319d3..ed865464 100644 --- a/pkg/appendable/index_file.go +++ b/pkg/appendable/index_file.go @@ -60,29 +60,6 @@ func NewIndexFile(f io.ReadWriteSeeker, dataHandler DataHandler) (*IndexFile, er } else { return &IndexFile{tree: tree, dataHandler: dataHandler}, nil } -======= -// Index is a representation of a single index. -type Index struct { - FieldName string - FieldType protocol.FieldType - IndexRecords map[any][]protocol.IndexRecord -} - -func fieldType(data any) protocol.FieldType { - switch data.(type) { - case string: - return protocol.FieldTypeString - case int, int8, int16, int32, int64, float32, float64: - return protocol.FieldTypeNumber - case bool: - return protocol.FieldTypeBoolean - case []any: - return protocol.FieldTypeArray - case nil: - return protocol.FieldTypeNull - default: - return protocol.FieldTypeObject ->>>>>>> 6e50d4a (CSV demo + support null fields (#67)) } } @@ -137,7 +114,6 @@ func (i *IndexFile) IndexFieldNames() ([]string, error) { if !exists { break } -<<<<<<< HEAD buf, err := next.Metadata() if err != nil { return nil, fmt.Errorf("failed to read metadata: %w", err) @@ -145,17 +121,6 @@ func (i *IndexFile) IndexFieldNames() ([]string, error) { metadata := &IndexMeta{} if err := metadata.UnmarshalBinary(buf); err != nil { return nil, fmt.Errorf("failed to unmarshal metadata: %w", err) -======= - } - - // if the index doesn't exist, create it - ft := fieldType(value) - - if match == -1 { - index := Index{ - FieldName: name, - FieldType: ft, ->>>>>>> 6e50d4a (CSV demo + support null fields (#67)) } uniqueFieldNames[metadata.FieldName] = true From b33399ac4b122ee11bd7262f2ce66ac4f682d9ac Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Mon, 5 Feb 2024 11:16:28 -0500 Subject: [PATCH 15/15] merge --- pkg/appendable/csv_handler.go | 192 -------------- pkg/appendable/index_file_csv_test.go | 332 ------------------------ pkg/appendable/index_file_jsonl_test.go | 332 ------------------------ pkg/appendable/jsonl_handler.go | 177 ------------- 4 files changed, 1033 deletions(-) delete mode 100644 pkg/appendable/csv_handler.go delete mode 100644 pkg/appendable/index_file_csv_test.go delete mode 100644 pkg/appendable/index_file_jsonl_test.go delete mode 100644 pkg/appendable/jsonl_handler.go diff --git a/pkg/appendable/csv_handler.go b/pkg/appendable/csv_handler.go deleted file mode 100644 index cd4156d2..00000000 --- a/pkg/appendable/csv_handler.go +++ /dev/null @@ -1,192 +0,0 @@ -package appendable - -import ( - "bufio" - "bytes" - "encoding/csv" - "fmt" - "io" - "log/slog" - "strings" - - "github.com/cespare/xxhash/v2" - "github.com/kevmo314/appendable/pkg/protocol" -) - -type CSVHandler struct { - io.ReadSeeker -} - -func (c CSVHandler) Synchronize(f 
*IndexFile) error { - slog.Debug("Starting CSV synchronization") - - var headers []string - var err error - - fromNewIndexFile := false - - isHeader := false - - if len(f.Indexes) == 0 { - isHeader = true - fromNewIndexFile = true - } else { - slog.Debug("indexes already exist, not parsing headers") - for _, index := range f.Indexes { - isHeader = false - headers = append(headers, index.FieldName) - } - } - - scanner := bufio.NewScanner(f.data) - - for i := 0; scanner.Scan(); i++ { - line := scanner.Bytes() - - existingCount := len(f.EndByteOffsets) - - // append a data range - var start uint64 - if len(f.EndByteOffsets) > 0 { - start = f.EndByteOffsets[existingCount-1] - } - - slog.Debug("", slog.Uint64("start", start)) - - slog.Debug("adding", slog.Any("endbyteoffset", start+uint64(len(line))), slog.Any("line", line)) - f.EndByteOffsets = append(f.EndByteOffsets, start+uint64(len(line))+1) - f.Checksums = append(f.Checksums, xxhash.Sum64(line)) - - if isHeader { - slog.Info("Parsing CSV headers") - dec := csv.NewReader(bytes.NewReader(line)) - headers, err = dec.Read() - if err != nil { - slog.Error("failed to parse CSV header", "error", err) - return fmt.Errorf("failed to parse CSV header: %w", err) - } - isHeader = false - continue - } - - dec := csv.NewReader(bytes.NewReader(line)) - slog.Debug("Handling csv", "line", i) - - if fromNewIndexFile { - - f.handleCSVLine(dec, headers, []string{}, uint64(existingCount)-1, start) - } else { - - f.handleCSVLine(dec, headers, []string{}, uint64(existingCount), start) - } - - slog.Debug("Succesfully processed", "line", i) - } - - if fromNewIndexFile && len(f.EndByteOffsets) > 0 { - f.EndByteOffsets = f.EndByteOffsets[1:] - f.Checksums = f.Checksums[1:] - - slog.Debug("Trimming endbyte offsets and checksums", "endByteOffsets", slog.Any("endByteOffsets", f.EndByteOffsets), "checksums", slog.Any("checksums", f.Checksums)) - } - - slog.Debug("indexes", slog.Any("", f.Indexes)) - slog.Debug("Ending CSV synchronization") - slog.Debug("=========") - return nil -} - -func fieldRankCsvField(fieldValue any) int { - slog.Debug("serialize", slog.Any("fieldValue", fieldValue)) - switch fieldValue.(type) { - case nil: - slog.Debug("nil", slog.Any("fieldValue", fieldValue)) - return 1 - case bool: - slog.Debug("bool", slog.Any("fieldValue", fieldValue)) - return 2 - case int, int8, int16, int32, int64, float32, float64: - slog.Debug("number", slog.Any("fieldValue", fieldValue)) - return 3 - case string: - slog.Debug("string", slog.Any("fieldValue", fieldValue)) - return 4 - default: - panic("unknown type") - } -} - -func (i *IndexFile) handleCSVLine(dec *csv.Reader, headers []string, path []string, dataIndex, dataOffset uint64) error { - slog.Debug("Processing CSV line", slog.Int("dataIndex", int(dataIndex)), slog.Int("dataOffset", int(dataOffset))) - - record, err := dec.Read() - - if err != nil { - slog.Error("Failed to read CSV record at index", "dataIndex", dataIndex, "error", err) - return fmt.Errorf("failed to read CSV record at index %d: %w", dataIndex, err) - } - - slog.Debug("CSV line read successfully", "record", record) - - cumulativeLength := uint64(0) - - for fieldIndex, fieldValue := range record { - if fieldIndex >= len(headers) { - slog.Error("Field index is out of bounds with headers", "fieldIndex", fieldIndex, "headers", slog.Any("headers", headers)) - return fmt.Errorf("field index %d is out of bounds with header", fieldIndex) - } - - fieldName := headers[fieldIndex] - - name := strings.Join(append(path, fieldName), ".") - - fieldOffset := 
dataOffset + cumulativeLength - fieldLength := uint64(len(fieldValue)) - - value, fieldType := protocol.InferCSVField(fieldValue) - - switch fieldType { - case protocol.FieldTypeBoolean, protocol.FieldTypeString, protocol.FieldTypeNumber: - tree := i.Indexes[i.findIndex(name, value)].IndexRecords - tree[value] = append(tree[value], protocol.IndexRecord{ - DataNumber: dataIndex, - FieldStartByteOffset: uint64(fieldOffset), - FieldLength: int(fieldLength), - }) - slog.Debug("Appended index record", - slog.String("field", name), - slog.Any("value", value), - slog.Int("start", int(fieldOffset))) - - case protocol.FieldTypeNull: - - found := false - for j := range i.Indexes { - if i.Indexes[j].FieldName == name { - i.Indexes[j].FieldType |= protocol.FieldTypeNull - - found = true - } - } - - if !found { - tree := i.Indexes[i.findIndex(name, value)].IndexRecords - tree[value] = append(tree[value], protocol.IndexRecord{ - DataNumber: dataIndex, - FieldStartByteOffset: uint64(fieldOffset), - FieldLength: int(fieldLength), - }) - } - - slog.Debug("Marked field", "name", name) - - default: - slog.Error("Encountered unexpected type '%T' for field '%s'", value, name) - return fmt.Errorf("unexpected type '%T'", value) - } - - cumulativeLength += fieldLength + 1 - } - - return nil -} diff --git a/pkg/appendable/index_file_csv_test.go b/pkg/appendable/index_file_csv_test.go deleted file mode 100644 index 5ac7522f..00000000 --- a/pkg/appendable/index_file_csv_test.go +++ /dev/null @@ -1,332 +0,0 @@ -package appendable - -import ( - "bytes" - "fmt" - "log/slog" - "os" - "reflect" - "strings" - "testing" - - "github.com/kevmo314/appendable/pkg/protocol" -) - -func TestAppendDataRowCSV(t *testing.T) { - - originalLogger := slog.Default() - - // Create a logger with Debug on - debugLevel := &slog.LevelVar{} - debugLevel.Set(slog.LevelDebug) - debugLogger := slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{ - Level: debugLevel, - })) - - slog.SetDefault(debugLogger) - - defer slog.SetDefault(originalLogger) - - var mockCsv string = "header1\ntest1\n" - var mockCsv2 string = "header1\ntest1\ntest3\n" - - t.Run("no schema changes", func(t *testing.T) { - - i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader(mockCsv)}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(buf, CSVHandler{ReadSeeker: strings.NewReader(mockCsv2)}) - if err != nil { - t.Fatal(err) - } - - // check that the index file now has the additional data ranges but same number of indices - if len(j.Indexes) != 1 { - t.Errorf("got len(i.Indexes) = %d, want 1", len(i.Indexes)) - } - - if len(j.EndByteOffsets) != 2 { - t.Errorf("got len(i.DataRanges) = %d, want 2", len(i.EndByteOffsets)) - } - - // check that the first data range is untouched despite being incorrect - if j.EndByteOffsets[0] != uint64(len(mockCsv)) { - t.Errorf("got i.DataRanges[0].EndByteOffset = %d, want %d", j.EndByteOffsets[0], uint64(len(mockCsv))) - } - - // check that the second data range has properly set offsets - if j.EndByteOffsets[1] != uint64(len(mockCsv2)) { - t.Errorf("got i.DataRanges[1].EndByteOffset = %d, want %d", j.EndByteOffsets[1], uint64(len(mockCsv2))) - } - }) - - t.Run("check end + start byte offsets multiple", func(t *testing.T) { - i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader(mockCsv2)}) - if err != nil { - t.Fatal(err) - } - - if len(i.Indexes) != 1 { - t.Errorf("got len(i.Indexes) = %d, want 1", 
len(i.Indexes)) - } - - if len(i.Indexes[0].IndexRecords) != 2 { - t.Errorf("got len(i.Indexes[0].IndexRecords) = %d, want 2", len(i.Indexes[0].IndexRecords)) - } - - if i.Indexes[0].IndexRecords["test1"][0].FieldStartByteOffset != uint64(len("header1\n")) { - t.Errorf("got i.Indexes[0].IndexRecords[\"test1\"][0].FieldStartByteOffset = %d, want 7", i.Indexes[0].IndexRecords["test1"][0].FieldStartByteOffset) - } - - if i.Indexes[0].IndexRecords["test3"][0].FieldStartByteOffset != uint64(len("header1\ntest1\n")) { - t.Errorf("got i.Indexes[0].IndexRecords[\"test3\"][0].FieldStartByteOffset = %d, want %d", i.Indexes[0].IndexRecords["test3"][0].FieldStartByteOffset, uint64(len("header\ntest1\n"))) - } - - }) - - t.Run("append index to existing", func(t *testing.T) { - i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader(mockCsv)}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(buf, CSVHandler{ReadSeeker: strings.NewReader(mockCsv2)}) - if err != nil { - t.Fatal(err) - } - - // check that the index file now has the additional data ranges but same number of indices - if len(j.Indexes) != 1 { - t.Errorf("got len(j.Indexes) = %d, want 1", len(j.Indexes)) - } - - if len(j.Indexes[0].IndexRecords) != 2 { - fmt.Printf("index records look like %v", j.Indexes[0].IndexRecords) - t.Errorf("got len(j.Indexes[0].IndexRecords) = %d, want 2", len(j.Indexes[0].IndexRecords)) - } - - if len(j.Indexes[0].IndexRecords["test1"]) != 1 { - t.Errorf("got len(j.Indexes[0].IndexRecords[\"test1\"]) = %d, want 1", len(j.Indexes[0].IndexRecords["test1"])) - } - if len(j.Indexes[0].IndexRecords["test3"]) != 1 { - for key, records := range j.Indexes[0].IndexRecords { - t.Errorf("\n\n\nKey: %v, Records: %+v", key, records) - } - t.Errorf("got len(j.Indexes[0].IndexRecords[\"test3\"]) = %d, want 1", len(j.Indexes[0].IndexRecords["test3"])) - } - - if j.Indexes[0].IndexRecords["test1"][0].DataNumber != 0 { - t.Errorf("got i.Indexes[0].IndexRecords[\"test1\"][0].DataNumber = %d, want 0", j.Indexes[0].IndexRecords["test1"][0].DataNumber) - } - if j.Indexes[0].IndexRecords["test1"][0].FieldStartByteOffset != uint64(len("header1\n")) { - t.Errorf("got i.Indexes[0].IndexRecords[\"test1\"][0].FieldStartByteOffset = %d, want %d", j.Indexes[0].IndexRecords["test1"][0].FieldStartByteOffset, uint64(len("header\n"))) - } - - if j.Indexes[0].IndexRecords["test3"][0].DataNumber != 1 { - t.Errorf("got i.Indexes[0].IndexRecords[\"test3\"][0].DataNumber = %d, want 1", j.Indexes[0].IndexRecords["test3"][0].DataNumber) - } - - // verify byte offset calculation - if j.Indexes[0].IndexRecords["test3"][0].FieldStartByteOffset != uint64(len("header1\ntest1\n")) { - t.Errorf("got i.Indexes[0].IndexRecords[\"test3\"][0].FieldStartByteOffset = %d, want %d", j.Indexes[0].IndexRecords["test3"][0].FieldStartByteOffset, uint64(len("header\ntest1\n"))) - } - }) - - t.Run("assert correct types", func(t *testing.T) { - i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader("n1,n2\n3.4,3\n")}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(buf, CSVHandler{ReadSeeker: strings.NewReader("n1,n2\n3.4,3\n4.3,4")}) - if err != nil { - t.Fatal(err) - } - - for _, index := range i.Indexes { - for key := range index.IndexRecords { - keyType := reflect.TypeOf(key).String() - if keyType != "float64" { - t.Errorf("i keytype is %v", 
keyType) - } - - if index.FieldType != protocol.FieldTypeNumber { - t.Errorf("index field type is not number. actual: %v", index.FieldType) - } - } - } - - for _, index := range j.Indexes { - for key := range index.IndexRecords { - keyType := reflect.TypeOf(key).String() - if keyType != "float64" { - t.Errorf("j keytype is %v", keyType) - } - - if index.FieldType != protocol.FieldTypeNumber { - t.Errorf("index field type is not number. actual: %v", index.FieldType) - } - } - } - - }) - - t.Run("record null columns", func(t *testing.T) { - - i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader("null1,null2\n,\n,\n,\n,\n")}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - fmt.Printf("index file looks like: %v", i.Indexes) - - if len(i.Indexes) != 2 { - t.Errorf("got len(i.Indexes) = %d, want 2", len(i.Indexes)) - } - - if i.Indexes[0].FieldType != protocol.FieldTypeNull { - t.Errorf("got %d, wanted protocol.FieldTypeNull", i.Indexes[0].FieldType) - } - }) - - t.Run("multiple headers", func(t *testing.T) { - - i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader("name,move\nmica,coyote\n")}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(buf, CSVHandler{ReadSeeker: strings.NewReader("name,move\nmica,coyote\ngalvao,mount\n")}) - if err != nil { - t.Fatal(err) - } - - // check that the index file now has the additional data ranges but same number of indices - if len(j.Indexes) != 2 { - t.Errorf("got len(i.Indexes) = %d, want 2", len(i.Indexes)) - } - - if len(j.EndByteOffsets) != 2 { - t.Errorf("got len(i.DataRanges) = %d, want 2", len(i.EndByteOffsets)) - } - - // check that the first data range is untouched despite being incorrect - if j.EndByteOffsets[0] != uint64(len("name,move\nmica,coyote\n")) { - t.Errorf("got i.DataRanges[0].EndByteOffset = %d, want %d", j.EndByteOffsets[0], uint64(len("name,move\nmica,coyote\n"))) - } - - // check that the second data range has properly set offsets - if j.EndByteOffsets[1] != uint64(len("name,move\nmica,coyote\ngalvao,mount\n")) { - t.Errorf("got i.DataRanges[1].EndByteOffset = %d, want %d", j.EndByteOffsets[1], uint64(len("name,move\nmica,coyote\ngalvao,mount\n"))) - } - - fmt.Printf("index file looks like: %v", j.Indexes) - }) - - t.Run("generate index file", func(t *testing.T) { - i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader("")}) - - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - _, err = ReadIndexFile(buf, CSVHandler{ReadSeeker: strings.NewReader(mockCsv2)}) - if err != nil { - t.Fatal(err) - } - - }) - - t.Run("existing index but different type", func(t *testing.T) { - i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader("test\ntest1\n")}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(buf, CSVHandler{ReadSeeker: strings.NewReader("test\ntest1\n123\n")}) - if err != nil { - t.Fatal(err) - } - - // check that the index file now has the additional data ranges but same number of indices - if len(j.Indexes) != 1 { - t.Errorf("got len(i.Indexes) = %d, want 1", len(j.Indexes)) - } - - if j.Indexes[0].FieldType != protocol.FieldTypeString|protocol.FieldTypeNumber { - t.Errorf("got i.Indexes[0].FieldType = %#v, want 
protocol.FieldTypeUnknown", j.Indexes[0].FieldType) - } - }) - - t.Run("existing index but nullable type", func(t *testing.T) { - i, err := NewIndexFile(CSVHandler{ReadSeeker: strings.NewReader("test,test2\nomoplata,armbar\n")}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(buf, CSVHandler{ReadSeeker: strings.NewReader("test,test2\nomoplata,armbar\n,singlelegx\n")}) - if err != nil { - t.Fatal(err) - } - - // check that the index file now has the additional data ranges but same number of indices - if len(j.Indexes) != 2 { - t.Errorf("got len(i.Indexes) = %d, want 2", len(j.Indexes)) - } - - if j.Indexes[0].FieldType != protocol.FieldTypeNull|protocol.FieldTypeString { - t.Errorf("got i.Indexes[0].FieldType = %#v, want protocol.FieldTypeNullableString", j.Indexes[0].FieldType) - } - }) - -} diff --git a/pkg/appendable/index_file_jsonl_test.go b/pkg/appendable/index_file_jsonl_test.go deleted file mode 100644 index dd4b614b..00000000 --- a/pkg/appendable/index_file_jsonl_test.go +++ /dev/null @@ -1,332 +0,0 @@ -package appendable - -import ( - "bytes" - "fmt" - "strings" - "testing" - - "github.com/kevmo314/appendable/pkg/protocol" -) - -func TestAppendDataRowJSONL(t *testing.T) { - - t.Run("no schema changes", func(t *testing.T) { - - i, err := NewIndexFile(JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n")}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(buf, JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n{\"test\":\"test3\"}\n")}) - if err != nil { - t.Fatal(err) - } - - // check that the index file now has the additional data ranges but same number of indices - if len(j.Indexes) != 1 { - t.Errorf("got len(i.Indexes) = %d, want 1", len(i.Indexes)) - } - - if len(j.EndByteOffsets) != 2 { - t.Errorf("got len(i.DataRanges) = %d, want 2", len(i.EndByteOffsets)) - } - - // check that the first data range is untouched despite being incorrect - if j.EndByteOffsets[0] != uint64(len("{\"test\":\"test1\"}\n")) { - t.Errorf("got i.DataRanges[0].EndByteOffset = %d, want %d", j.EndByteOffsets[0], uint64(len("{\"test\":\"test1\"}\n"))) - } - - // check that the second data range has properly set offsets - if j.EndByteOffsets[1] != uint64(len("{\"test\":\"test1\"}\n{\"test\":\"test3\"}\n")) { - t.Errorf("got i.DataRanges[1].EndByteOffset = %d, want %d", j.EndByteOffsets[1], uint64(len("{\"test\":\"test1\"}\n{\"test\":\"test3\"}\n"))) - } - }) - - t.Run("correctly sets field offset", func(t *testing.T) { - i, err := NewIndexFile(JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n")}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(buf, JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n{\"test\":\"test3\"}\n")}) - if err != nil { - t.Fatal(err) - } - - // check that the index file now has the additional data ranges but same number of indices - if len(j.Indexes) != 1 { - t.Errorf("got len(j.Indexes) = %d, want 1", len(j.Indexes)) - } - - if len(j.Indexes[0].IndexRecords) != 2 { - t.Errorf("got len(j.Indexes[0].IndexRecords) = %d, want 2", len(j.Indexes[0].IndexRecords)) - } - - if len(j.Indexes[0].IndexRecords["test1"]) != 1 { - t.Errorf("got len(j.Indexes[0].IndexRecords[\"test1\"]) = %d, want 1", 
len(j.Indexes[0].IndexRecords["test1"])) - } - if len(j.Indexes[0].IndexRecords["test3"]) != 1 { - t.Errorf("got len(j.Indexes[0].IndexRecords[\"test3\"]) = %d, want 1", len(j.Indexes[0].IndexRecords["test3"])) - } - - if j.Indexes[0].IndexRecords["test1"][0].DataNumber != 0 { - t.Errorf("got i.Indexes[0].IndexRecords[\"test1\"][0].DataNumber = %d, want 0", j.Indexes[0].IndexRecords["test1"][0].DataNumber) - } - if j.Indexes[0].IndexRecords["test1"][0].FieldStartByteOffset != uint64(len("{\"test\":")) { - t.Errorf("got i.Indexes[0].IndexRecords[\"test1\"][0].FieldStartByteOffset = %d, want 10", j.Indexes[0].IndexRecords["test1"][0].FieldStartByteOffset) - } - - if j.Indexes[0].IndexRecords["test3"][0].DataNumber != 1 { - t.Errorf("got i.Indexes[0].IndexRecords[\"test3\"][1].DataNumber = %d, want 1", j.Indexes[0].IndexRecords["test3"][1].DataNumber) - } - if j.Indexes[0].IndexRecords["test3"][0].FieldStartByteOffset != uint64(len("{\"test\":\"test1\"}\n{\"test\":")) { - t.Errorf("got i.Indexes[0].IndexRecords[\"test3\"][1].FieldStartByteOffset = %d, want 10", j.Indexes[0].IndexRecords["test3"][1].FieldStartByteOffset) - } - }) - - t.Run("new index", func(t *testing.T) { - i, err := NewIndexFile(JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n")}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(buf, JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n{\"test2\":\"test3\"}\n")}) - if err != nil { - t.Fatal(err) - } - - // check that the index file now has the additional index - if len(j.Indexes) != 2 { - t.Errorf("got len(i.Indexes) = %d, want 2", len(j.Indexes)) - } - - if j.Indexes[1].FieldName != "test2" { - t.Errorf("got i.Indexes[1].FieldName = %s, want \"test2\"", j.Indexes[1].FieldName) - } - - if j.Indexes[1].FieldType != protocol.FieldTypeString { - t.Errorf("got i.Indexes[1].FieldType = %+v, want protocol.FieldTypeString", j.Indexes[1].FieldType) - } - }) - - t.Run("existing index but different type", func(t *testing.T) { - i, err := NewIndexFile(JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n")}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(buf, JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n{\"test\":123}\n")}) - if err != nil { - t.Fatal(err) - } - - // check that the index file now has the additional data ranges but same number of indices - if len(j.Indexes) != 1 { - t.Errorf("got len(i.Indexes) = %d, want 1", len(j.Indexes)) - } - - if j.Indexes[0].FieldType != protocol.FieldTypeString|protocol.FieldTypeNumber { - t.Errorf("got i.Indexes[0].FieldType = %#v, want protocol.FieldTypeUnknown", j.Indexes[0].FieldType) - } - }) - - t.Run("creates nested indices", func(t *testing.T) { - i, err := NewIndexFile(JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n")}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(buf, JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n{\"test2\":{\"a\":1,\"b\":\"2\"}}\n")}) - if err != nil { - t.Fatal(err) - } - - // check that the index file now has the additional data ranges but same number of indices - if len(j.Indexes) != 3 { - t.Errorf("got len(i.Indexes) = %d, want 3", len(j.Indexes)) - } - - if j.Indexes[0].FieldType != 
protocol.FieldTypeString { - t.Errorf("got i.Indexes[0].FieldType = %#v, want protocol.FieldTypeUnknown", j.Indexes[0].FieldType) - } - - if j.Indexes[1].FieldType != protocol.FieldTypeNumber { - t.Errorf("got i.Indexes[1].FieldType = %#v, want protocol.FieldTypeNumber", j.Indexes[1].FieldType) - } - - if j.Indexes[2].FieldType != protocol.FieldTypeString { - t.Errorf("got i.Indexes[2].FieldType = %#v, want protocol.FieldTypeString", j.Indexes[2].FieldType) - } - - if j.Indexes[0].FieldName != "test" { - t.Errorf("got i.Indexes[0].FieldName = %s, want \"test\"", j.Indexes[0].FieldName) - } - - if j.Indexes[1].FieldName != "test2.a" { - t.Errorf("got i.Indexes[1].FieldName = %s, want \"test2.a\"", j.Indexes[1].FieldName) - } - - if j.Indexes[2].FieldName != "test2.b" { - t.Errorf("got i.Indexes[2].FieldName = %s, want \"test2.b\"", j.Indexes[2].FieldName) - } - }) - - t.Run("creates nested indices but also erases parent", func(t *testing.T) { - i, err := NewIndexFile(JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n")}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(buf, JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n{\"test\":{\"a\":1,\"b\":\"2\"}}\n")}) - if err != nil { - t.Fatal(err) - } - - // check that the index file now has the additional data ranges but same number of indices - if len(j.Indexes) != 3 { - t.Errorf("got len(i.Indexes) = %d, want 3", len(j.Indexes)) - } - - if j.Indexes[0].FieldType != protocol.FieldTypeString|protocol.FieldTypeObject { - t.Errorf("got i.Indexes[0].FieldType = %#v, want protocol.FieldTypeUnknown", j.Indexes[0].FieldType) - } - }) - - t.Run("ignores arrays", func(t *testing.T) { - i, err := NewIndexFile(JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n")}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(buf, JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n{\"test2\":[[1,2,3],4]}\n")}) - if err != nil { - t.Fatal(err) - } - - // check that the index file now has the additional data ranges but same number of indices - if len(j.Indexes) != 1 { - t.Errorf("got len(i.Indexes) = %d, want 3", len(j.Indexes)) - } - }) - - t.Run("ignores arrays but downgrades type", func(t *testing.T) { - i, err := NewIndexFile(JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n")}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(buf, JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n{\"test\":[[1,2,3],4]}\n")}) - if err != nil { - t.Fatal(err) - } - - // check that the index file now has the additional data ranges but same number of indices - if len(j.Indexes) != 1 { - t.Errorf("got len(i.Indexes) = %d, want 3", len(j.Indexes)) - } - - if j.Indexes[0].FieldType != protocol.FieldTypeString|protocol.FieldTypeArray { - t.Errorf("got i.Indexes[0].FieldType = %#v, want protocol.FieldTypeUnknown", j.Indexes[0].FieldType) - } - }) - - t.Run("existing index but nullable type", func(t *testing.T) { - i, err := NewIndexFile(JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n")}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - j, err := ReadIndexFile(buf, 
JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":\"test1\"}\n{\"test\":null}\n")}) - if err != nil { - t.Fatal(err) - } - - // check that the index file now has the additional data ranges but same number of indices - if len(j.Indexes) != 1 { - t.Errorf("got len(i.Indexes) = %d, want 1", len(j.Indexes)) - } - - if j.Indexes[0].FieldType != protocol.FieldTypeNull|protocol.FieldTypeString { - t.Errorf("got i.Indexes[0].FieldType = %#v, want protocol.FieldTypeNullableString", j.Indexes[0].FieldType) - } - }) - - t.Run("record null columns", func(t *testing.T) { - - i, err := NewIndexFile(JSONLHandler{ReadSeeker: strings.NewReader("{\"test\":null}\n{\"test\":null}")}) - if err != nil { - t.Fatal(err) - } - - buf := &bytes.Buffer{} - - if err := i.Serialize(buf); err != nil { - t.Fatal(err) - } - - fmt.Printf("index file looks like: %v", i.Indexes) - - if len(i.Indexes) != 1 { - t.Errorf("got len(i.Indexes) = %d, want 1", len(i.Indexes)) - } - - if i.Indexes[0].FieldType != protocol.FieldTypeNull { - t.Errorf("got %d, wanted protocol.FieldTypeNull", i.Indexes[0].FieldType) - } - }) -} diff --git a/pkg/appendable/jsonl_handler.go b/pkg/appendable/jsonl_handler.go deleted file mode 100644 index e89a8969..00000000 --- a/pkg/appendable/jsonl_handler.go +++ /dev/null @@ -1,177 +0,0 @@ -package appendable - -import ( - "bufio" - "bytes" - "encoding/json" - "fmt" - "io" - "strings" - - "github.com/cespare/xxhash/v2" - "github.com/kevmo314/appendable/pkg/protocol" -) - -type JSONLHandler struct { - io.ReadSeeker -} - -func (j JSONLHandler) Synchronize(f *IndexFile) error { - - // read until the next newline - scanner := bufio.NewScanner(f.data) - for i := 0; scanner.Scan(); i++ { - line := scanner.Bytes() - - // create a new json decoder - dec := json.NewDecoder(bytes.NewReader(line)) - - existingCount := len(f.EndByteOffsets) - - // append a data range - var start uint64 - if len(f.EndByteOffsets) > 0 { - start = f.EndByteOffsets[existingCount-1] - } - f.EndByteOffsets = append(f.EndByteOffsets, start+uint64(len(line))+1) - f.Checksums = append(f.Checksums, xxhash.Sum64(line)) - - // if the first token is not {, then return an error - if t, err := dec.Token(); err != nil || t != json.Delim('{') { - return fmt.Errorf("expected '%U', got '%U' (only json objects are supported at the root)", '{', t) - } - - if err := f.handleJSONLObject(dec, []string{}, uint64(existingCount), start); err != nil { - return fmt.Errorf("failed to handle object: %w", err) - } - - // the next token must be a } - if t, err := dec.Token(); err != nil || t != json.Delim('}') { - return fmt.Errorf("expected '}', got '%v'", t) - } - } - - return nil -} - -func fieldRank(token json.Token) int { - switch token.(type) { - case nil: - return 1 - case bool: - return 2 - case int, int8, int16, int32, int64, float32, float64: - return 3 - case string: - return 4 - default: - panic("unknown type") - } -} - -func (i *IndexFile) handleJSONLObject(dec *json.Decoder, path []string, dataIndex, dataOffset uint64) error { - // while the next token is not }, read the key - for dec.More() { - key, err := dec.Token() - if err != nil { - return fmt.Errorf("failed to read token at index %d: %w", dataIndex, err) - } - - // key must be a string - if key, ok := key.(string); !ok { - return fmt.Errorf("expected string key, got '%v'", key) - } else { - fieldOffset := dec.InputOffset() + 1 // skip the : - - value, err := dec.Token() - if err != nil { - return fmt.Errorf("failed to read token: %w", err) - } - - name := strings.Join(append(path, key), 
".") - - switch value := value.(type) { - case string, int, int8, int16, int32, int64, float32, float64, bool: - tree := i.Indexes[i.findIndex(name, value)].IndexRecords - // append this record to the list of records for this value - tree[value] = append(tree[value], protocol.IndexRecord{ - DataNumber: dataIndex, - FieldStartByteOffset: dataOffset + uint64(fieldOffset), - FieldLength: int(dec.InputOffset() - fieldOffset), - }) - - case json.Token: - switch value { - case json.Delim('['): - for j := range i.Indexes { - if i.Indexes[j].FieldName == name { - i.Indexes[j].FieldType |= protocol.FieldTypeArray - } - } - // arrays are not indexed yet because we need to incorporate - // subindexing into the specification. however, we have to - // skip tokens until we reach the end of the array. - depth := 1 - for { - t, err := dec.Token() - if err != nil { - return fmt.Errorf("failed to read token: %w", err) - } - - switch t { - case json.Delim('['): - depth++ - case json.Delim(']'): - depth-- - } - - if depth == 0 { - break - } - } - case json.Delim('{'): - // find the index to set the field type to unknown. - for j := range i.Indexes { - if i.Indexes[j].FieldName == name { - i.Indexes[j].FieldType |= protocol.FieldTypeObject - } - } - if err := i.handleJSONLObject(dec, append(path, key), dataIndex, dataOffset); err != nil { - return fmt.Errorf("failed to handle object: %w", err) - } - // read the } - if t, err := dec.Token(); err != nil || t != json.Delim('}') { - return fmt.Errorf("expected '}', got '%v'", t) - } - default: - return fmt.Errorf("unexpected token '%v'", value) - } - case nil: - - found := false - - // set the field to nullable if it's not already - for j := range i.Indexes { - if i.Indexes[j].FieldName == name { - i.Indexes[j].FieldType |= protocol.FieldTypeNull - found = true - } - } - - if !found { - tree := i.Indexes[i.findIndex(name, value)].IndexRecords - // append this record to the list of records for this value - tree[value] = append(tree[value], protocol.IndexRecord{ - DataNumber: dataIndex, - FieldStartByteOffset: dataOffset + uint64(fieldOffset), - FieldLength: int(dec.InputOffset() - fieldOffset), - }) - } - - default: - return fmt.Errorf("unexpected type '%T'", value) - } - } - } - return nil -}