# db.go: reorders the fields of the DB struct; no field is added or removed.
# `stats` stays the first field of DB. Its own comment requires that position
# so the field is 64-bit aligned (see the sync/atomic bug note it links to), so
# the relocated fields are inserted after `stats` instead of before it.
# NOTE(review): the post-image blob id on the `index` line was produced for the
# earlier field order and is stale. Column alignment inside the lines below was
# restored following gofmt conventions; run `git apply --check` before use.
diff --git a/db.go b/db.go
index db9dbafed..40252a720 100644
--- a/db.go
+++ b/db.go
@@ -36,12 +36,81 @@ const (
 // All data access is performed through transactions which can be obtained through the DB.
 // All the functions on DB will return a ErrDatabaseNotOpen if accessed before Open() is called.
 type DB struct {
 	// Put `stats` at the first field to ensure it's 64-bit aligned. Note that
 	// the first word in an allocated struct can be relied upon to be 64-bit
 	// aligned. Refer to https://pkg.go.dev/sync/atomic#pkg-note-BUG. Also
 	// refer to discussion in https://github.com/etcd-io/bbolt/issues/577.
 	stats Stats
 
+	pagePool sync.Pool
+
+	logger Logger
+
+	openFile func(string, int, os.FileMode) (*os.File, error)
+	file     *os.File
+	data     *[maxMapSize]byte
+	meta0    *common.Meta
+	meta1    *common.Meta
+	rwtx     *Tx
+
+	freelist *freelist
+	batch    *batch
+
+	ops struct {
+		writeAt func(b []byte, off int64) (n int, err error)
+	}
+
+	// FreelistType sets the backend freelist type. There are two options. Array which is simple but endures
+	// dramatic performance degradation if database is large and fragmentation in freelist is common.
+	// The alternative one is using hashmap, it is faster in almost all circumstances
+	// but it doesn't guarantee that it offers the smallest page id available. In normal case it is safe.
+	// The default type is array
+	FreelistType FreelistType
+
+	path string
+	// `dataref` isn't used at all on Windows, and the golangci-lint
+	// always fails on Windows platform.
+	//nolint
+	dataref []byte // mmap'ed readonly, write throws SEGV
+	txs     []*Tx
+
+	// If you want to read the entire database fast, you can set MmapFlag to
+	// syscall.MAP_POPULATE on Linux 2.6.23+ for sequential read-ahead.
+	MmapFlags int
+
+	// MaxBatchSize is the maximum size of a batch. Default value is
+	// copied from DefaultMaxBatchSize in Open.
+	//
+	// If <=0, disables batching.
+	//
+	// Do not change concurrently with calls to Batch.
+	MaxBatchSize int
+
+	// MaxBatchDelay is the maximum delay before a batch starts.
+	// Default value is copied from DefaultMaxBatchDelay in Open.
+	//
+	// If <=0, effectively disables batching.
+	//
+	// Do not change concurrently with calls to Batch.
+	MaxBatchDelay time.Duration
+
+	// AllocSize is the amount of space allocated when the database
+	// needs to create new pages. This is done to amortize the cost
+	// of truncate() and fsync() when growing the data file.
+	AllocSize int
+
+	datasz   int
+	pageSize int
+	mmaplock sync.RWMutex // Protects mmap access during remapping.
+	statlock sync.RWMutex // Protects stats access.
+
+	freelistLoad sync.Once
+
+	batchMu sync.Mutex
+
+	rwlock   sync.Mutex // Allows only one writer at a time.
+	metalock sync.Mutex // Protects meta page access.
+
 	// When enabled, the database will perform a Check() after every commit.
 	// A panic is issued if the database is in an inconsistent state. This
 	// flag has a large performance impact so it should only be used for
@@ -65,13 +134,6 @@ type DB struct {
 	// re-sync during recovery.
 	NoFreelistSync bool
 
-	// FreelistType sets the backend freelist type. There are two options. Array which is simple but endures
-	// dramatic performance degradation if database is large and fragmentation in freelist is common.
-	// The alternative one is using hashmap, it is faster in almost all circumstances
-	// but it doesn't guarantee that it offers the smallest page id available. In normal case it is safe.
-	// The default type is array
-	FreelistType FreelistType
-
 	// When true, skips the truncate call when growing the database.
 	// Setting this to true is only safe on non-ext3/ext4 systems.
 	// Skipping truncation avoids preallocation of hard drive space and
@@ -85,71 +147,13 @@ type DB struct {
 	// set to `true`.
 	PreLoadFreelist bool
 
-	// If you want to read the entire database fast, you can set MmapFlag to
-	// syscall.MAP_POPULATE on Linux 2.6.23+ for sequential read-ahead.
-	MmapFlags int
-
-	// MaxBatchSize is the maximum size of a batch. Default value is
-	// copied from DefaultMaxBatchSize in Open.
-	//
-	// If <=0, disables batching.
-	//
-	// Do not change concurrently with calls to Batch.
-	MaxBatchSize int
-
-	// MaxBatchDelay is the maximum delay before a batch starts.
-	// Default value is copied from DefaultMaxBatchDelay in Open.
-	//
-	// If <=0, effectively disables batching.
-	//
-	// Do not change concurrently with calls to Batch.
-	MaxBatchDelay time.Duration
-
-	// AllocSize is the amount of space allocated when the database
-	// needs to create new pages. This is done to amortize the cost
-	// of truncate() and fsync() when growing the data file.
-	AllocSize int
-
 	// Mlock locks database file in memory when set to true.
 	// It prevents major page faults, however used memory can't be reclaimed.
 	//
 	// Supported only on Unix via mlock/munlock syscalls.
 	Mlock bool
 
-	logger Logger
-
-	path     string
-	openFile func(string, int, os.FileMode) (*os.File, error)
-	file     *os.File
-	// `dataref` isn't used at all on Windows, and the golangci-lint
-	// always fails on Windows platform.
-	//nolint
-	dataref  []byte // mmap'ed readonly, write throws SEGV
-	data     *[maxMapSize]byte
-	datasz   int
-	meta0    *common.Meta
-	meta1    *common.Meta
-	pageSize int
-	opened   bool
-	rwtx     *Tx
-	txs      []*Tx
-
-	freelist     *freelist
-	freelistLoad sync.Once
-
-	pagePool sync.Pool
-
-	batchMu sync.Mutex
-	batch   *batch
-
-	rwlock   sync.Mutex   // Allows only one writer at a time.
-	metalock sync.Mutex   // Protects meta page access.
-	mmaplock sync.RWMutex // Protects mmap access during remapping.
-	statlock sync.RWMutex // Protects stats access.
-
-	ops struct {
-		writeAt func(b []byte, off int64) (n int, err error)
-	}
+	opened bool
 
 	// Read only mode.
 	// When true, Update() and Begin(true) return ErrDatabaseReadOnly immediately.
@@ -1012,8 +1016,8 @@ type call struct {
 type batch struct {
 	db    *DB
 	timer *time.Timer
-	start sync.Once
 	calls []call
+	start sync.Once
 }
 
 // trigger runs the batch if it hasn't already been run.
@@ -1278,21 +1282,13 @@ func (db *DB) freepages() []common.Pgid {
 
 // Options represents the options that can be set when opening a database.
 type Options struct {
-	// Timeout is the amount of time to wait to obtain a file lock.
-	// When set to zero it will wait indefinitely.
-	Timeout time.Duration
-
-	// Sets the DB.NoGrowSync flag before memory mapping the file.
-	NoGrowSync bool
 
-	// Do not sync freelist to disk. This improves the database write performance
-	// under normal operation, but requires a full database re-sync during recovery.
-	NoFreelistSync bool
+	// Logger is the logger used for bbolt.
+	Logger Logger
 
-	// PreLoadFreelist sets whether to load the free pages when opening
-	// the db file. Note when opening db in write mode, bbolt will always
-	// load the free pages.
-	PreLoadFreelist bool
+	// OpenFile is used to open files. It defaults to os.OpenFile. This option
+	// is useful for writing hermetic tests.
+	OpenFile func(string, int, os.FileMode) (*os.File, error)
 
 	// FreelistType sets the backend freelist type. There are two options. Array which is simple but endures
 	// dramatic performance degradation if database is large and fragmentation in freelist is common.
@@ -1301,9 +1297,9 @@ type Options struct {
 	// The default type is array
 	FreelistType FreelistType
 
-	// Open database in read-only mode. Uses flock(..., LOCK_SH |LOCK_NB) to
-	// grab a shared lock (UNIX).
-	ReadOnly bool
+	// Timeout is the amount of time to wait to obtain a file lock.
+	// When set to zero it will wait indefinitely.
+	Timeout time.Duration
 
 	// Sets the DB.MmapFlags flag before memory mapping the file.
 	MmapFlags int
@@ -1321,22 +1317,31 @@ type Options struct {
 	// PageSize overrides the default OS page size.
 	PageSize int
 
+	// Sets the DB.NoGrowSync flag before memory mapping the file.
+	NoGrowSync bool
+
+	// Do not sync freelist to disk. This improves the database write performance
+	// under normal operation, but requires a full database re-sync during recovery.
+	NoFreelistSync bool
+
+	// PreLoadFreelist sets whether to load the free pages when opening
+	// the db file. Note when opening db in write mode, bbolt will always
+	// load the free pages.
+	PreLoadFreelist bool
+
+	// Open database in read-only mode. Uses flock(..., LOCK_SH |LOCK_NB) to
+	// grab a shared lock (UNIX).
+	ReadOnly bool
+
 	// NoSync sets the initial value of DB.NoSync. Normally this can just be
 	// set directly on the DB itself when returned from Open(), but this option
 	// is useful in APIs which expose Options but not the underlying DB.
 	NoSync bool
 
-	// OpenFile is used to open files. It defaults to os.OpenFile. This option
-	// is useful for writing hermetic tests.
-	OpenFile func(string, int, os.FileMode) (*os.File, error)
-
 	// Mlock locks database file in memory when set to true.
 	// It prevents potential page faults, however
 	// used memory can't be reclaimed. (UNIX only)
 	Mlock bool
-
-	// Logger is the logger used for bbolt.
-	Logger Logger
 }
 
 func (o *Options) String() string {
# freelist.go: moves freelistType, ids and freePagesCount to the end of the
# freelist struct; the remaining fields keep their relative order and no field
# is added or removed.
# NOTE(review): column alignment in this file's lines and in the node.go and
# tx.go sections below was restored following gofmt conventions; run
# `git apply --check` before use.
diff --git a/freelist.go b/freelist.go
index 731d75c46..7c45fa01a 100644
--- a/freelist.go
+++ b/freelist.go
@@ -22,20 +22,20 @@ type pidSet map[common.Pgid]struct{}
 // freelist represents a list of all pages that are available for allocation.
 // It also tracks pages that have been freed but are still in use by open transactions.
 type freelist struct {
-	freelistType   FreelistType                              // freelist type
-	ids            []common.Pgid                             // all free and available free page ids.
 	allocs         map[common.Pgid]common.Txid               // mapping of Txid that allocated a pgid.
 	pending        map[common.Txid]*txPending                // mapping of soon-to-be free page ids by tx.
 	cache          map[common.Pgid]struct{}                  // fast lookup of all free and pending page ids.
 	freemaps       map[uint64]pidSet                         // key is the size of continuous pages(span), value is a set which contains the starting pgids of same size
 	forwardMap     map[common.Pgid]uint64                    // key is start pgid, value is its span size
 	backwardMap    map[common.Pgid]uint64                    // key is end pgid, value is its span size
-	freePagesCount uint64                                    // count of free pages(hashmap version)
 	allocate       func(txid common.Txid, n int) common.Pgid // the freelist allocate func
 	free_count     func() int                                // the function which gives you free page number
 	mergeSpans     func(ids common.Pgids)                    // the mergeSpan func
 	getFreePageIDs func() []common.Pgid                      // get free pgids func
 	readIDs        func(pgids []common.Pgid)                 // readIDs func reads list of pages and init the freelist
+	freelistType   FreelistType                              // freelist type
+	ids            []common.Pgid                             // all free and available free page ids.
+	freePagesCount uint64                                    // count of free pages(hashmap version)
 }
 
 // newFreelist returns an empty, initialized freelist.
# node.go: reorders the fields of node. key now follows parent, and pgid plus
# the three bool flags (isLeaf, unbalanced, spilled) move to the end of the
# struct. No field is added or removed.
diff --git a/node.go b/node.go
index fe67c3c89..ec7675a7b 100644
--- a/node.go
+++ b/node.go
@@ -11,14 +11,14 @@ import (
 // node represents an in-memory, deserialized page.
 type node struct {
 	bucket     *Bucket
-	isLeaf     bool
-	unbalanced bool
-	spilled    bool
-	key        []byte
-	pgid       common.Pgid
 	parent     *node
+	key        []byte
 	children   nodes
 	inodes     common.Inodes
+	pgid       common.Pgid
+	isLeaf     bool
+	unbalanced bool
+	spilled    bool
 }
 
 // root returns the top-level node this node is attached to.
# tx.go: reorders the fields of Tx. root moves after pages, stats moves below
# commitHandlers into its own group, and writable/managed move to the end of
# the struct, directly after WriteFlag. No field is added or removed.
# NOTE(review): TxStats is not visible in this patch. If its counters are
# updated with 64-bit atomic operations, confirm that the new offset of `stats`
# is still 64-bit aligned on 32-bit targets.
diff --git a/tx.go b/tx.go
index 81913b0fe..1ca24a5eb 100644
--- a/tx.go
+++ b/tx.go
@@ -25,15 +25,14 @@ import (
 // are using them. A long running read transaction can cause the database to
 // quickly grow.
 type Tx struct {
-	writable       bool
-	managed        bool
 	db             *DB
 	meta           *common.Meta
-	root           Bucket
 	pages          map[common.Pgid]*common.Page
-	stats          TxStats
+	root           Bucket
 	commitHandlers []func()
 
+	stats TxStats
+
 	// WriteFlag specifies the flag for write-related methods like WriteTo().
 	// Tx opens the database file with the specified flag to copy the data.
 	//
@@ -41,6 +40,8 @@ type Tx struct {
 	// workloads. For databases that are much larger than available RAM,
 	// set the flag to syscall.O_DIRECT to avoid trashing the page cache.
 	WriteFlag int
+	writable  bool
+	managed   bool
 }
 
 // init initializes the transaction.