github.com/bhojpur/cache@v0.0.4/pkg/memory/db.go (about) 1 package memory 2 3 // Copyright (c) 2018 Bhojpur Consulting Private Limited, India. All rights reserved. 4 5 // Permission is hereby granted, free of charge, to any person obtaining a copy 6 // of this software and associated documentation files (the "Software"), to deal 7 // in the Software without restriction, including without limitation the rights 8 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 // copies of the Software, and to permit persons to whom the Software is 10 // furnished to do so, subject to the following conditions: 11 12 // The above copyright notice and this permission notice shall be included in 13 // all copies or substantial portions of the Software. 14 15 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 // THE SOFTWARE. 22 23 import ( 24 "errors" 25 "fmt" 26 "hash/fnv" 27 "log" 28 "os" 29 "runtime" 30 "sort" 31 "sync" 32 "time" 33 "unsafe" 34 ) 35 36 // The largest step that can be taken when remapping the mmap. 37 const maxMmapStep = 1 << 30 // 1GB 38 39 // The data file format version. 40 const version = 2 41 42 // Represents a marker value to indicate that a file is a Bhojpur Cache 43 // in-memory database DB. 44 const magic uint32 = 0xED0CDAED 45 46 const pgidNoFreelist pgid = 0xffffffffffffffff 47 48 // IgnoreNoSync specifies whether the NoSync field of a DB is ignored when 49 // syncing changes to a file. 
// This is required as some operating systems,
// such as OpenBSD, do not have a unified buffer cache (UBC) and writes
// must be synchronized using the msync(2) syscall.
const IgnoreNoSync = runtime.GOOS == "openbsd"

// Default values if not set in a DB instance.
const (
	DefaultMaxBatchSize  int = 1000
	DefaultMaxBatchDelay     = 10 * time.Millisecond
	DefaultAllocSize         = 16 * 1024 * 1024
)

// defaultPageSize is the page size used for a database when Options.PageSize
// is zero; it is set to the OS page size.
var defaultPageSize = os.Getpagesize()

// flockRetryTimeout is the time elapsed between consecutive file locking attempts.
const flockRetryTimeout = 50 * time.Millisecond

// FreelistType is the type of the freelist backend.
type FreelistType string

const (
	// FreelistArrayType indicates backend freelist type is array.
	FreelistArrayType = FreelistType("array")
	// FreelistMapType indicates backend freelist type is hashmap.
	FreelistMapType = FreelistType("hashmap")
)

// DB represents a collection of buckets persisted to a file on disk.
// All data access is performed through transactions which can be obtained through the DB.
// All the functions on DB will return an ErrDatabaseNotOpen if accessed before Open() is called.
type DB struct {
	// When enabled, the database will perform a Check() after every commit.
	// A panic is issued if the database is in an inconsistent state. This
	// flag has a large performance impact so it should only be used for
	// debugging purposes.
	StrictMode bool

	// Setting the NoSync flag will cause the database to skip fsync()
	// calls after each commit. This can be useful when bulk loading data
	// into a database and you can restart the bulk load in the event of
	// a system failure or database corruption. Do not set this flag for
	// normal use.
	//
	// If the package global IgnoreNoSync constant is true, this value is
	// ignored. See the comment on that constant for more details.
	//
	// THIS IS UNSAFE. PLEASE USE WITH CAUTION.
	NoSync bool

	// When true, skips syncing freelist to disk. This improves the database
	// write performance under normal operation, but requires a full database
	// re-sync during recovery.
	NoFreelistSync bool

	// FreelistType sets the backend freelist type. There are two options. Array which is simple but endures
	// dramatic performance degradation if database is large and fragmentation in freelist is common.
	// The alternative one is using hashmap, it is faster in almost all circumstances
	// but it doesn't guarantee that it offers the smallest page id available. In normal case it is safe.
	// The default type is array.
	FreelistType FreelistType

	// When true, skips the truncate call when growing the database.
	// Setting this to true is only safe on non-ext3/ext4 systems.
	// Skipping truncation avoids preallocation of hard drive space and
	// bypasses a truncate() and fsync() syscall on remapping.
	NoGrowSync bool

	// If you want to read the entire database fast, you can set MmapFlags to
	// syscall.MAP_POPULATE on Linux 2.6.23+ for sequential read-ahead.
	MmapFlags int

	// MaxBatchSize is the maximum size of a batch. Default value is
	// copied from DefaultMaxBatchSize in Open.
	//
	// If <=0, disables batching.
	//
	// Do not change concurrently with calls to Batch.
	MaxBatchSize int

	// MaxBatchDelay is the maximum delay before a batch starts.
	// Default value is copied from DefaultMaxBatchDelay in Open.
	//
	// If <=0, effectively disables batching.
	//
	// Do not change concurrently with calls to Batch.
	MaxBatchDelay time.Duration

	// AllocSize is the amount of space allocated when the database
	// needs to create new pages. This is done to amortize the cost
	// of truncate() and fsync() when growing the data file.
	AllocSize int

	// Mlock locks database file in memory when set to true.
	// It prevents major page faults, however used memory can't be reclaimed.
	//
	// Supported only on Unix via mlock/munlock syscalls.
	Mlock bool

	path     string                                            // name of the open data file; set from file.Name() in Open, cleared by close
	openFile func(string, int, os.FileMode) (*os.File, error) // file opener; Options.OpenFile, or os.OpenFile when that is nil
	file     *os.File                                          // handle of the open data file
	dataref  []byte                                            // mmap'ed readonly, write throws SEGV
	data     *[maxMapSize]byte                                 // base of the mapped data that page() indexes into
	datasz   int                                               // presumably the size of the current mmap — maintained outside this file, confirm there
	filesz   int                                               // current on disk file size
	meta0    *meta                                             // meta page stored in page 0 (set in mmap)
	meta1    *meta                                             // meta page stored in page 1 (set in mmap)
	pageSize int                                               // page size in bytes used to locate pages
	opened   bool                                              // true from Open until close
	rwtx     *Tx                                               // the current read-write transaction, set in beginRWTx
	txs      []*Tx                                             // read-only transactions that are currently open
	stats    Stats                                             // accumulated statistics; guarded by statlock

	freelist     *freelist // page freelist; populated by loadFreelist
	freelistLoad sync.Once // ensures the freelist is loaded only once

	pagePool sync.Pool // pool of pageSize-byte buffers used for single-page allocations

	batchMu sync.Mutex // guards batch
	batch   *batch     // batch currently accepting calls from Batch, if any

	rwlock   sync.Mutex   // Allows only one writer at a time.
	metalock sync.Mutex   // Protects meta page access.
	mmaplock sync.RWMutex // Protects mmap access during remapping.
	statlock sync.RWMutex // Protects stats access.

	// ops holds operations that Open points at the data file by default
	// (described there as test hooks).
	ops struct {
		writeAt func(b []byte, off int64) (n int, err error)
	}

	// Read only mode.
	// When true, Update() and Begin(true) return ErrDatabaseReadOnly immediately.
	readOnly bool
}

// Path returns the path to currently open database file.
func (db *DB) Path() string {
	return db.path
}

// GoString returns the Go string representation of the database.
func (db *DB) GoString() string {
	return fmt.Sprintf("memcache.DB{path:%q}", db.path)
}

// String returns the string representation of the database.
func (db *DB) String() string {
	return fmt.Sprintf("DB<%q>", db.path)
}

// Open creates and opens an In-Memory database at the given path.
// If the file does not exist then it will be created automatically.
// Passing in nil options will cause Bhojpur Cache to open the database
// with the default options.
204 func Open(path string, mode os.FileMode, options *Options) (*DB, error) { 205 db := &DB{ 206 opened: true, 207 } 208 // Set default options if no options are provided. 209 if options == nil { 210 options = DefaultOptions 211 } 212 db.NoSync = options.NoSync 213 db.NoGrowSync = options.NoGrowSync 214 db.MmapFlags = options.MmapFlags 215 db.NoFreelistSync = options.NoFreelistSync 216 db.FreelistType = options.FreelistType 217 db.Mlock = options.Mlock 218 219 // Set default values for later DB operations. 220 db.MaxBatchSize = DefaultMaxBatchSize 221 db.MaxBatchDelay = DefaultMaxBatchDelay 222 db.AllocSize = DefaultAllocSize 223 224 flag := os.O_RDWR 225 if options.ReadOnly { 226 flag = os.O_RDONLY 227 db.readOnly = true 228 } 229 230 db.openFile = options.OpenFile 231 if db.openFile == nil { 232 db.openFile = os.OpenFile 233 } 234 235 // Open data file and separate sync handler for metadata writes. 236 var err error 237 if db.file, err = db.openFile(path, flag|os.O_CREATE, mode); err != nil { 238 _ = db.close() 239 return nil, err 240 } 241 db.path = db.file.Name() 242 243 // Lock file so that other processes using Bhojpur Cache in read-write mode cannot 244 // use the database at the same time. This would cause corruption since 245 // the two processes would write meta pages and free pages separately. 246 // The database file is locked exclusively (only one process can grab the lock) 247 // if !options.ReadOnly. 248 // The database file is locked using the shared lock (more than one process may 249 // hold a lock at the same time) otherwise (options.ReadOnly is set). 250 if err := flock(db, !db.readOnly, options.Timeout); err != nil { 251 _ = db.close() 252 return nil, err 253 } 254 255 // Default values for test hooks 256 db.ops.writeAt = db.file.WriteAt 257 258 if db.pageSize = options.PageSize; db.pageSize == 0 { 259 // Set the default page size to the OS page size. 
260 db.pageSize = defaultPageSize 261 } 262 263 // Initialize the database if it doesn't exist. 264 if info, err := db.file.Stat(); err != nil { 265 _ = db.close() 266 return nil, err 267 } else if info.Size() == 0 { 268 // Initialize new files with meta pages. 269 if err := db.init(); err != nil { 270 // clean up file descriptor on initialization fail 271 _ = db.close() 272 return nil, err 273 } 274 } else { 275 // Read the first meta page to determine the page size. 276 var buf [0x1000]byte 277 // If we can't read the page size, but can read a page, assume 278 // it's the same as the OS or one given -- since that's how the 279 // page size was chosen in the first place. 280 // 281 // If the first page is invalid and this OS uses a different 282 // page size than what the database was created with then we 283 // are out of luck and cannot access the database. 284 // 285 // TODO: scan for next page 286 if bw, err := db.file.ReadAt(buf[:], 0); err == nil && bw == len(buf) { 287 if m := db.pageInBuffer(buf[:], 0).meta(); m.validate() == nil { 288 db.pageSize = int(m.pageSize) 289 } 290 } else { 291 _ = db.close() 292 return nil, ErrInvalid 293 } 294 } 295 296 // Initialize page pool. 297 db.pagePool = sync.Pool{ 298 New: func() interface{} { 299 return make([]byte, db.pageSize) 300 }, 301 } 302 303 // Memory map the data file. 304 if err := db.mmap(options.InitialMmapSize); err != nil { 305 _ = db.close() 306 return nil, err 307 } 308 309 if db.readOnly { 310 return db, nil 311 } 312 313 db.loadFreelist() 314 315 // Flush freelist when transitioning from no sync to sync so 316 // NoFreelistSync unaware In-Memory db can open the db later. 317 if !db.NoFreelistSync && !db.hasSyncedFreelist() { 318 tx, err := db.Begin(true) 319 if tx != nil { 320 err = tx.Commit() 321 } 322 if err != nil { 323 _ = db.close() 324 return nil, err 325 } 326 } 327 328 // Mark the database as opened and return. 
329 return db, nil 330 } 331 332 // loadFreelist reads the freelist if it is synced, or reconstructs it 333 // by scanning the DB if it is not synced. It assumes there are no 334 // concurrent accesses being made to the freelist. 335 func (db *DB) loadFreelist() { 336 db.freelistLoad.Do(func() { 337 db.freelist = newFreelist(db.FreelistType) 338 if !db.hasSyncedFreelist() { 339 // Reconstruct free list by scanning the DB. 340 db.freelist.readIDs(db.freepages()) 341 } else { 342 // Read free list from freelist page. 343 db.freelist.read(db.page(db.meta().freelist)) 344 } 345 db.stats.FreePageN = db.freelist.free_count() 346 }) 347 } 348 349 func (db *DB) hasSyncedFreelist() bool { 350 return db.meta().freelist != pgidNoFreelist 351 } 352 353 // mmap opens the underlying memory-mapped file and initializes the meta references. 354 // minsz is the minimum size that the new mmap can be. 355 func (db *DB) mmap(minsz int) error { 356 db.mmaplock.Lock() 357 defer db.mmaplock.Unlock() 358 359 info, err := db.file.Stat() 360 if err != nil { 361 return fmt.Errorf("mmap stat error: %s", err) 362 } else if int(info.Size()) < db.pageSize*2 { 363 return fmt.Errorf("file size too small") 364 } 365 366 // Ensure the size is at least the minimum size. 367 fileSize := int(info.Size()) 368 var size = fileSize 369 if size < minsz { 370 size = minsz 371 } 372 size, err = db.mmapSize(size) 373 if err != nil { 374 return err 375 } 376 377 if db.Mlock { 378 // Unlock db memory 379 if err := db.munlock(fileSize); err != nil { 380 return err 381 } 382 } 383 384 // Dereference all mmap references before unmapping. 385 if db.rwtx != nil { 386 db.rwtx.root.dereference() 387 } 388 389 // Unmap existing data before continuing. 390 if err := db.munmap(); err != nil { 391 return err 392 } 393 394 // Memory-map the data file as a byte slice. 
395 if err := mmap(db, size); err != nil { 396 return err 397 } 398 399 if db.Mlock { 400 // Don't allow swapping of data file 401 if err := db.mlock(fileSize); err != nil { 402 return err 403 } 404 } 405 406 // Save references to the meta pages. 407 db.meta0 = db.page(0).meta() 408 db.meta1 = db.page(1).meta() 409 410 // Validate the meta pages. We only return an error if both meta pages fail 411 // validation, since meta0 failing validation means that it wasn't saved 412 // properly -- but we can recover using meta1. And vice-versa. 413 err0 := db.meta0.validate() 414 err1 := db.meta1.validate() 415 if err0 != nil && err1 != nil { 416 return err0 417 } 418 419 return nil 420 } 421 422 // munmap unmaps the data file from memory. 423 func (db *DB) munmap() error { 424 if err := munmap(db); err != nil { 425 return fmt.Errorf("unmap error: " + err.Error()) 426 } 427 return nil 428 } 429 430 // mmapSize determines the appropriate size for the mmap given the current size 431 // of the database. The minimum size is 32KB and doubles until it reaches 1GB. 432 // Returns an error if the new mmap size is greater than the max allowed. 433 func (db *DB) mmapSize(size int) (int, error) { 434 // Double the size from 32KB until 1GB. 435 for i := uint(15); i <= 30; i++ { 436 if size <= 1<<i { 437 return 1 << i, nil 438 } 439 } 440 441 // Verify the requested size is not above the maximum allowed. 442 if size > maxMapSize { 443 return 0, fmt.Errorf("mmap too large") 444 } 445 446 // If larger than 1GB then grow by 1GB at a time. 447 sz := int64(size) 448 if remainder := sz % int64(maxMmapStep); remainder > 0 { 449 sz += int64(maxMmapStep) - remainder 450 } 451 452 // Ensure that the mmap size is a multiple of the page size. 453 // This should always be true since we're incrementing in MBs. 454 pageSize := int64(db.pageSize) 455 if (sz % pageSize) != 0 { 456 sz = ((sz / pageSize) + 1) * pageSize 457 } 458 459 // If we've exceeded the max size then only grow up to the max size. 
460 if sz > maxMapSize { 461 sz = maxMapSize 462 } 463 464 return int(sz), nil 465 } 466 467 func (db *DB) munlock(fileSize int) error { 468 if err := munlock(db, fileSize); err != nil { 469 return fmt.Errorf("munlock error: " + err.Error()) 470 } 471 return nil 472 } 473 474 func (db *DB) mlock(fileSize int) error { 475 if err := mlock(db, fileSize); err != nil { 476 return fmt.Errorf("mlock error: " + err.Error()) 477 } 478 return nil 479 } 480 481 func (db *DB) mrelock(fileSizeFrom, fileSizeTo int) error { 482 if err := db.munlock(fileSizeFrom); err != nil { 483 return err 484 } 485 if err := db.mlock(fileSizeTo); err != nil { 486 return err 487 } 488 return nil 489 } 490 491 // init creates a new database file and initializes its meta pages. 492 func (db *DB) init() error { 493 // Create two meta pages on a buffer. 494 buf := make([]byte, db.pageSize*4) 495 for i := 0; i < 2; i++ { 496 p := db.pageInBuffer(buf, pgid(i)) 497 p.id = pgid(i) 498 p.flags = metaPageFlag 499 500 // Initialize the meta page. 501 m := p.meta() 502 m.magic = magic 503 m.version = version 504 m.pageSize = uint32(db.pageSize) 505 m.freelist = 2 506 m.root = bucket{root: 3} 507 m.pgid = 4 508 m.txid = txid(i) 509 m.checksum = m.sum64() 510 } 511 512 // Write an empty freelist at page 3. 513 p := db.pageInBuffer(buf, pgid(2)) 514 p.id = pgid(2) 515 p.flags = freelistPageFlag 516 p.count = 0 517 518 // Write an empty leaf page at page 4. 519 p = db.pageInBuffer(buf, pgid(3)) 520 p.id = pgid(3) 521 p.flags = leafPageFlag 522 p.count = 0 523 524 // Write the buffer to our data file. 525 if _, err := db.ops.writeAt(buf, 0); err != nil { 526 return err 527 } 528 if err := fdatasync(db); err != nil { 529 return err 530 } 531 db.filesz = len(buf) 532 533 return nil 534 } 535 536 // Close releases all database resources. 537 // It will block waiting for any open transactions to finish 538 // before closing the database and returning. 
539 func (db *DB) Close() error { 540 db.rwlock.Lock() 541 defer db.rwlock.Unlock() 542 543 db.metalock.Lock() 544 defer db.metalock.Unlock() 545 546 db.mmaplock.Lock() 547 defer db.mmaplock.Unlock() 548 549 return db.close() 550 } 551 552 func (db *DB) close() error { 553 if !db.opened { 554 return nil 555 } 556 557 db.opened = false 558 559 db.freelist = nil 560 561 // Clear ops. 562 db.ops.writeAt = nil 563 564 // Close the mmap. 565 if err := db.munmap(); err != nil { 566 return err 567 } 568 569 // Close file handles. 570 if db.file != nil { 571 // No need to unlock read-only file. 572 if !db.readOnly { 573 // Unlock the file. 574 if err := funlock(db); err != nil { 575 log.Printf("memcache.Close(): funlock error: %s", err) 576 } 577 } 578 579 // Close the file descriptor. 580 if err := db.file.Close(); err != nil { 581 return fmt.Errorf("db file close: %s", err) 582 } 583 db.file = nil 584 } 585 586 db.path = "" 587 return nil 588 } 589 590 // Begin starts a new transaction. 591 // Multiple read-only transactions can be used concurrently but only one 592 // write transaction can be used at a time. Starting multiple write transactions 593 // will cause the calls to block and be serialized until the current write 594 // transaction finishes. 595 // 596 // Transactions should not be dependent on one another. Opening a read 597 // transaction and a write transaction in the same goroutine can cause the 598 // writer to deadlock because the database periodically needs to re-mmap itself 599 // as it grows and it cannot do that while a read transaction is open. 600 // 601 // If a long running read transaction (for example, a snapshot transaction) is 602 // needed, you might want to set DB.InitialMmapSize to a large enough value 603 // to avoid potential blocking of write transaction. 604 // 605 // IMPORTANT: You must close read-only transactions after you are finished or 606 // else the database will not reclaim old pages. 
func (db *DB) Begin(writable bool) (*Tx, error) {
	if writable {
		return db.beginRWTx()
	}
	return db.beginTx()
}

// beginTx starts a read-only transaction and registers it in db.txs.
// On success the mmap read lock taken here stays held; removeTx releases it.
// It returns ErrDatabaseNotOpen if the database is not open.
func (db *DB) beginTx() (*Tx, error) {
	// Lock the meta pages while we initialize the transaction. We obtain
	// the meta lock before the mmap lock because that's the order that the
	// write transaction will obtain them.
	db.metalock.Lock()

	// Obtain a read-only lock on the mmap. When the mmap is remapped it will
	// obtain a write lock so all transactions must finish before it can be
	// remapped.
	db.mmaplock.RLock()

	// Exit if the database is not open yet.
	if !db.opened {
		db.mmaplock.RUnlock()
		db.metalock.Unlock()
		return nil, ErrDatabaseNotOpen
	}

	// Create a transaction associated with the database.
	t := &Tx{}
	t.init(db)

	// Keep track of transaction until it closes.
	db.txs = append(db.txs, t)
	n := len(db.txs)

	// Unlock the meta pages.
	db.metalock.Unlock()

	// Update the transaction stats.
	db.statlock.Lock()
	db.stats.TxN++
	db.stats.OpenTxN = n
	db.statlock.Unlock()

	return t, nil
}

// beginRWTx starts the single read-write transaction. It returns
// ErrDatabaseReadOnly for read-only databases and ErrDatabaseNotOpen if the
// database is not open. On success db.rwlock remains held.
func (db *DB) beginRWTx() (*Tx, error) {
	// If the database was opened with Options.ReadOnly, return an error.
	if db.readOnly {
		return nil, ErrDatabaseReadOnly
	}

	// Obtain writer lock. This is released by the transaction when it closes.
	// This enforces only one writer transaction at a time.
	db.rwlock.Lock()

	// Once we have the writer lock then we can lock the meta pages so that
	// we can set up the transaction.
	db.metalock.Lock()
	defer db.metalock.Unlock()

	// Exit if the database is not open yet.
	if !db.opened {
		db.rwlock.Unlock()
		return nil, ErrDatabaseNotOpen
	}

	// Create a transaction associated with the database.
	t := &Tx{writable: true}
	t.init(db)
	db.rwtx = t
	// Reclaim pages that no open read-only transaction can still see.
	db.freePages()
	return t, nil
}

// freePages releases any pages associated with closed read-only transactions.
// It sorts db.txs in place by transaction id as a side effect.
func (db *DB) freePages() {
	// Free all pending pages prior to earliest open transaction.
	sort.Sort(txsById(db.txs))
	// With no open read transactions minid stays at the maximum txid.
	minid := txid(0xFFFFFFFFFFFFFFFF)
	if len(db.txs) > 0 {
		minid = db.txs[0].meta.txid
	}
	if minid > 0 {
		db.freelist.release(minid - 1)
	}
	// Release unused txid extents.
	for _, t := range db.txs {
		db.freelist.releaseRange(minid, t.meta.txid-1)
		minid = t.meta.txid + 1
	}
	db.freelist.releaseRange(minid, txid(0xFFFFFFFFFFFFFFFF))
	// Any page both allocated and freed in an extent is safe to release.
}

// txsById implements sort.Interface, ordering transactions by ascending
// meta.txid.
type txsById []*Tx

func (t txsById) Len() int           { return len(t) }
func (t txsById) Swap(i, j int)      { t[i], t[j] = t[j], t[i] }
func (t txsById) Less(i, j int) bool { return t[i].meta.txid < t[j].meta.txid }

// removeTx removes a transaction from the database.
// It releases the mmap read lock taken in beginTx, drops tx from db.txs and
// merges the transaction's stats into the database stats.
func (db *DB) removeTx(tx *Tx) {
	// Release the read lock on the mmap.
	db.mmaplock.RUnlock()

	// Use the meta lock to restrict access to the DB object.
	db.metalock.Lock()

	// Remove the transaction.
	for i, t := range db.txs {
		if t == tx {
			// Swap-remove: move the last entry into slot i and clear the
			// vacated tail slot before shrinking the slice.
			last := len(db.txs) - 1
			db.txs[i] = db.txs[last]
			db.txs[last] = nil
			db.txs = db.txs[:last]
			break
		}
	}
	n := len(db.txs)

	// Unlock the meta pages.
	db.metalock.Unlock()

	// Merge statistics.
	db.statlock.Lock()
	db.stats.OpenTxN = n
	db.stats.TxStats.add(&tx.stats)
	db.statlock.Unlock()
}

// Update executes a function within the context of a read-write managed transaction.
// If no error is returned from the function then the transaction is committed.
// If an error is returned then the entire transaction is rolled back.
740 // Any error that is returned from the function or returned from the commit is 741 // returned from the Update() method. 742 // 743 // Attempting to manually commit or rollback within the function will cause a panic. 744 func (db *DB) Update(fn func(*Tx) error) error { 745 t, err := db.Begin(true) 746 if err != nil { 747 return err 748 } 749 750 // Make sure the transaction rolls back in the event of a panic. 751 defer func() { 752 if t.db != nil { 753 t.rollback() 754 } 755 }() 756 757 // Mark as a managed tx so that the inner function cannot manually commit. 758 t.managed = true 759 760 // If an error is returned from the function then rollback and return error. 761 err = fn(t) 762 t.managed = false 763 if err != nil { 764 _ = t.Rollback() 765 return err 766 } 767 768 return t.Commit() 769 } 770 771 // View executes a function within the context of a managed read-only transaction. 772 // Any error that is returned from the function is returned from the View() method. 773 // 774 // Attempting to manually rollback within the function will cause a panic. 775 func (db *DB) View(fn func(*Tx) error) error { 776 t, err := db.Begin(false) 777 if err != nil { 778 return err 779 } 780 781 // Make sure the transaction rolls back in the event of a panic. 782 defer func() { 783 if t.db != nil { 784 t.rollback() 785 } 786 }() 787 788 // Mark as a managed tx so that the inner function cannot manually rollback. 789 t.managed = true 790 791 // If an error is returned from the function then pass it through. 792 err = fn(t) 793 t.managed = false 794 if err != nil { 795 _ = t.Rollback() 796 return err 797 } 798 799 return t.Rollback() 800 } 801 802 // Batch calls fn as part of a batch. It behaves similar to Update, 803 // except: 804 // 805 // 1. concurrent Batch calls can be combined into a single Bhojpur Cache 806 // transaction. 807 // 808 // 2. the function passed to Batch may be called multiple times, 809 // regardless of whether it returns error or not. 
810 // 811 // This means that Batch function side effects must be idempotent and 812 // take permanent effect only after a successful return is seen in 813 // caller. 814 // 815 // The maximum batch size and delay can be adjusted with DB.MaxBatchSize 816 // and DB.MaxBatchDelay, respectively. 817 // 818 // Batch is only useful when there are multiple goroutines calling it. 819 func (db *DB) Batch(fn func(*Tx) error) error { 820 errCh := make(chan error, 1) 821 822 db.batchMu.Lock() 823 if (db.batch == nil) || (db.batch != nil && len(db.batch.calls) >= db.MaxBatchSize) { 824 // There is no existing batch, or the existing batch is full; start a new one. 825 db.batch = &batch{ 826 db: db, 827 } 828 db.batch.timer = time.AfterFunc(db.MaxBatchDelay, db.batch.trigger) 829 } 830 db.batch.calls = append(db.batch.calls, call{fn: fn, err: errCh}) 831 if len(db.batch.calls) >= db.MaxBatchSize { 832 // wake up batch, it's ready to run 833 go db.batch.trigger() 834 } 835 db.batchMu.Unlock() 836 837 err := <-errCh 838 if err == trySolo { 839 err = db.Update(fn) 840 } 841 return err 842 } 843 844 type call struct { 845 fn func(*Tx) error 846 err chan<- error 847 } 848 849 type batch struct { 850 db *DB 851 timer *time.Timer 852 start sync.Once 853 calls []call 854 } 855 856 // trigger runs the batch if it hasn't already been run. 857 func (b *batch) trigger() { 858 b.start.Do(b.run) 859 } 860 861 // run performs the transactions in the batch and communicates results 862 // back to DB.Batch. 863 func (b *batch) run() { 864 b.db.batchMu.Lock() 865 b.timer.Stop() 866 // Make sure no new work is added to this batch, but don't break 867 // other batches. 
868 if b.db.batch == b { 869 b.db.batch = nil 870 } 871 b.db.batchMu.Unlock() 872 873 retry: 874 for len(b.calls) > 0 { 875 var failIdx = -1 876 err := b.db.Update(func(tx *Tx) error { 877 for i, c := range b.calls { 878 if err := safelyCall(c.fn, tx); err != nil { 879 failIdx = i 880 return err 881 } 882 } 883 return nil 884 }) 885 886 if failIdx >= 0 { 887 // take the failing transaction out of the batch. it's 888 // safe to shorten b.calls here because db.batch no longer 889 // points to us, and we hold the mutex anyway. 890 c := b.calls[failIdx] 891 b.calls[failIdx], b.calls = b.calls[len(b.calls)-1], b.calls[:len(b.calls)-1] 892 // tell the submitter re-run it solo, continue with the rest of the batch 893 c.err <- trySolo 894 continue retry 895 } 896 897 // pass success, or Bhojpur Cache in-memory storage engine internal 898 // errors, to all callers 899 for _, c := range b.calls { 900 c.err <- err 901 } 902 break retry 903 } 904 } 905 906 // trySolo is a special sentinel error value used for signaling that a 907 // transaction function should be re-run. It should never be seen by 908 // callers. 909 var trySolo = errors.New("batch function returned an error and should be re-run solo") 910 911 type panicked struct { 912 reason interface{} 913 } 914 915 func (p panicked) Error() string { 916 if err, ok := p.reason.(error); ok { 917 return err.Error() 918 } 919 return fmt.Sprintf("panic: %v", p.reason) 920 } 921 922 func safelyCall(fn func(*Tx) error, tx *Tx) (err error) { 923 defer func() { 924 if p := recover(); p != nil { 925 err = panicked{p} 926 } 927 }() 928 return fn(tx) 929 } 930 931 // Sync executes fdatasync() against the database file handle. 932 // 933 // This is not necessary under normal operation, however, if you use NoSync 934 // then it allows you to force the database file to sync against the disk. 935 func (db *DB) Sync() error { return fdatasync(db) } 936 937 // Stats retrieves ongoing performance stats for the database. 
938 // This is only updated when a transaction closes. 939 func (db *DB) Stats() Stats { 940 db.statlock.RLock() 941 defer db.statlock.RUnlock() 942 return db.stats 943 } 944 945 // This is for internal access to the raw data bytes from the C cursor, use 946 // carefully, or not at all. 947 func (db *DB) Info() *Info { 948 return &Info{uintptr(unsafe.Pointer(&db.data[0])), db.pageSize} 949 } 950 951 // page retrieves a page reference from the mmap based on the current page size. 952 func (db *DB) page(id pgid) *page { 953 pos := id * pgid(db.pageSize) 954 return (*page)(unsafe.Pointer(&db.data[pos])) 955 } 956 957 // pageInBuffer retrieves a page reference from a given byte array based on the current page size. 958 func (db *DB) pageInBuffer(b []byte, id pgid) *page { 959 return (*page)(unsafe.Pointer(&b[id*pgid(db.pageSize)])) 960 } 961 962 // meta retrieves the current meta page reference. 963 func (db *DB) meta() *meta { 964 // We have to return the meta with the highest txid which doesn't fail 965 // validation. Otherwise, we can cause errors when in fact the database is 966 // in a consistent state. metaA is the one with the higher txid. 967 metaA := db.meta0 968 metaB := db.meta1 969 if db.meta1.txid > db.meta0.txid { 970 metaA = db.meta1 971 metaB = db.meta0 972 } 973 974 // Use higher meta page if valid. Otherwise fallback to previous, if valid. 975 if err := metaA.validate(); err == nil { 976 return metaA 977 } else if err := metaB.validate(); err == nil { 978 return metaB 979 } 980 981 // This should never be reached, because both meta1 and meta0 were validated 982 // on mmap() and we do fsync() on every write. 983 panic("memcache.DB.meta(): invalid meta pages") 984 } 985 986 // allocate returns a contiguous block of memory starting at a given page. 987 func (db *DB) allocate(txid txid, count int) (*page, error) { 988 // Allocate a temporary buffer for the page. 
989 var buf []byte 990 if count == 1 { 991 buf = db.pagePool.Get().([]byte) 992 } else { 993 buf = make([]byte, count*db.pageSize) 994 } 995 p := (*page)(unsafe.Pointer(&buf[0])) 996 p.overflow = uint32(count - 1) 997 998 // Use pages from the freelist if they are available. 999 if p.id = db.freelist.allocate(txid, count); p.id != 0 { 1000 return p, nil 1001 } 1002 1003 // Resize mmap() if we're at the end. 1004 p.id = db.rwtx.meta.pgid 1005 var minsz = int((p.id+pgid(count))+1) * db.pageSize 1006 if minsz >= db.datasz { 1007 if err := db.mmap(minsz); err != nil { 1008 return nil, fmt.Errorf("mmap allocate error: %s", err) 1009 } 1010 } 1011 1012 // Move the page id high water mark. 1013 db.rwtx.meta.pgid += pgid(count) 1014 1015 return p, nil 1016 } 1017 1018 // grow grows the size of the database to the given sz. 1019 func (db *DB) grow(sz int) error { 1020 // Ignore if the new size is less than available file size. 1021 if sz <= db.filesz { 1022 return nil 1023 } 1024 1025 // If the data is smaller than the alloc size then only allocate what's needed. 1026 // Once it goes over the allocation size then allocate in chunks. 1027 if db.datasz < db.AllocSize { 1028 sz = db.datasz 1029 } else { 1030 sz += db.AllocSize 1031 } 1032 1033 // Truncate and fsync to ensure file size metadata is flushed. 
1034 if !db.NoGrowSync && !db.readOnly { 1035 if runtime.GOOS != "windows" { 1036 if err := db.file.Truncate(int64(sz)); err != nil { 1037 return fmt.Errorf("file resize error: %s", err) 1038 } 1039 } 1040 if err := db.file.Sync(); err != nil { 1041 return fmt.Errorf("file sync error: %s", err) 1042 } 1043 if db.Mlock { 1044 // unlock old file and lock new one 1045 if err := db.mrelock(db.filesz, sz); err != nil { 1046 return fmt.Errorf("mlock/munlock error: %s", err) 1047 } 1048 } 1049 } 1050 1051 db.filesz = sz 1052 return nil 1053 } 1054 1055 func (db *DB) IsReadOnly() bool { 1056 return db.readOnly 1057 } 1058 1059 func (db *DB) freepages() []pgid { 1060 tx, err := db.beginTx() 1061 defer func() { 1062 err = tx.Rollback() 1063 if err != nil { 1064 panic("freepages: failed to rollback tx") 1065 } 1066 }() 1067 if err != nil { 1068 panic("freepages: failed to open read only tx") 1069 } 1070 1071 reachable := make(map[pgid]*page) 1072 nofreed := make(map[pgid]bool) 1073 ech := make(chan error) 1074 go func() { 1075 for e := range ech { 1076 panic(fmt.Sprintf("freepages: failed to get all reachable pages (%v)", e)) 1077 } 1078 }() 1079 tx.checkBucket(&tx.root, reachable, nofreed, ech) 1080 close(ech) 1081 1082 var fids []pgid 1083 for i := pgid(2); i < db.meta().pgid; i++ { 1084 if _, ok := reachable[i]; !ok { 1085 fids = append(fids, i) 1086 } 1087 } 1088 return fids 1089 } 1090 1091 // Options represents the options that can be set when opening a database. 1092 type Options struct { 1093 // Timeout is the amount of time to wait to obtain a file lock. 1094 // When set to zero it will wait indefinitely. This option is only 1095 // available on Darwin and Linux. 1096 Timeout time.Duration 1097 1098 // Sets the DB.NoGrowSync flag before memory mapping the file. 1099 NoGrowSync bool 1100 1101 // Do not sync freelist to disk. This improves the database write performance 1102 // under normal operation, but requires a full database re-sync during recovery. 
1103 NoFreelistSync bool 1104 1105 // FreelistType sets the backend freelist type. There are two options. Array which is simple but endures 1106 // dramatic performance degradation if database is large and framentation in freelist is common. 1107 // The alternative one is using hashmap, it is faster in almost all circumstances 1108 // but it doesn't guarantee that it offers the smallest page id available. In normal case it is safe. 1109 // The default type is array 1110 FreelistType FreelistType 1111 1112 // Open database in read-only mode. Uses flock(..., LOCK_SH |LOCK_NB) to 1113 // grab a shared lock (UNIX). 1114 ReadOnly bool 1115 1116 // Sets the DB.MmapFlags flag before memory mapping the file. 1117 MmapFlags int 1118 1119 // InitialMmapSize is the initial mmap size of the database 1120 // in bytes. Read transactions won't block write transaction 1121 // if the InitialMmapSize is large enough to hold database mmap 1122 // size. (See DB.Begin for more information) 1123 // 1124 // If <=0, the initial map size is 0. 1125 // If initialMmapSize is smaller than the previous database size, 1126 // it takes no effect. 1127 InitialMmapSize int 1128 1129 // PageSize overrides the default OS page size. 1130 PageSize int 1131 1132 // NoSync sets the initial value of DB.NoSync. Normally this can just be 1133 // set directly on the DB itself when returned from Open(), but this option 1134 // is useful in APIs which expose Options but not the underlying DB. 1135 NoSync bool 1136 1137 // OpenFile is used to open files. It defaults to os.OpenFile. This option 1138 // is useful for writing hermetic tests. 1139 OpenFile func(string, int, os.FileMode) (*os.File, error) 1140 1141 // Mlock locks database file in memory when set to true. 1142 // It prevents potential page faults, however 1143 // used memory can't be reclaimed. (UNIX only) 1144 Mlock bool 1145 } 1146 1147 // DefaultOptions represent the options used if nil options are passed into Open(). 
1148 // No timeout is used which will cause Bhojpur Cache to wait indefinitely for a lock. 1149 var DefaultOptions = &Options{ 1150 Timeout: 0, 1151 NoGrowSync: false, 1152 FreelistType: FreelistArrayType, 1153 } 1154 1155 // Stats represents statistics about the database. 1156 type Stats struct { 1157 // Freelist stats 1158 FreePageN int // total number of free pages on the freelist 1159 PendingPageN int // total number of pending pages on the freelist 1160 FreeAlloc int // total bytes allocated in free pages 1161 FreelistInuse int // total bytes used by the freelist 1162 1163 // Transaction stats 1164 TxN int // total number of started read transactions 1165 OpenTxN int // number of currently open read transactions 1166 1167 TxStats TxStats // global, ongoing stats. 1168 } 1169 1170 // Sub calculates and returns the difference between two sets of database stats. 1171 // This is useful when obtaining stats at two different points and time and 1172 // you need the performance counters that occurred within that time span. 1173 func (s *Stats) Sub(other *Stats) Stats { 1174 if other == nil { 1175 return *s 1176 } 1177 var diff Stats 1178 diff.FreePageN = s.FreePageN 1179 diff.PendingPageN = s.PendingPageN 1180 diff.FreeAlloc = s.FreeAlloc 1181 diff.FreelistInuse = s.FreelistInuse 1182 diff.TxN = s.TxN - other.TxN 1183 diff.TxStats = s.TxStats.Sub(&other.TxStats) 1184 return diff 1185 } 1186 1187 type Info struct { 1188 Data uintptr 1189 PageSize int 1190 } 1191 1192 type meta struct { 1193 magic uint32 1194 version uint32 1195 pageSize uint32 1196 flags uint32 1197 root bucket 1198 freelist pgid 1199 pgid pgid 1200 txid txid 1201 checksum uint64 1202 } 1203 1204 // validate checks the marker bytes and version of the meta page to ensure it matches this binary. 
1205 func (m *meta) validate() error { 1206 if m.magic != magic { 1207 return ErrInvalid 1208 } else if m.version != version { 1209 return ErrVersionMismatch 1210 } else if m.checksum != 0 && m.checksum != m.sum64() { 1211 return ErrChecksum 1212 } 1213 return nil 1214 } 1215 1216 // copy copies one meta object to another. 1217 func (m *meta) copy(dest *meta) { 1218 *dest = *m 1219 } 1220 1221 // write writes the meta onto a page. 1222 func (m *meta) write(p *page) { 1223 if m.root.root >= m.pgid { 1224 panic(fmt.Sprintf("root bucket pgid (%d) above high water mark (%d)", m.root.root, m.pgid)) 1225 } else if m.freelist >= m.pgid && m.freelist != pgidNoFreelist { 1226 // TODO: reject pgidNoFreeList if !NoFreelistSync 1227 panic(fmt.Sprintf("freelist pgid (%d) above high water mark (%d)", m.freelist, m.pgid)) 1228 } 1229 1230 // Page id is either going to be 0 or 1 which we can determine by the transaction ID. 1231 p.id = pgid(m.txid % 2) 1232 p.flags |= metaPageFlag 1233 1234 // Calculate the checksum. 1235 m.checksum = m.sum64() 1236 1237 m.copy(p.meta()) 1238 } 1239 1240 // generates the checksum for the meta. 1241 func (m *meta) sum64() uint64 { 1242 var h = fnv.New64a() 1243 _, _ = h.Write((*[unsafe.Offsetof(meta{}.checksum)]byte)(unsafe.Pointer(m))[:]) 1244 return h.Sum64() 1245 } 1246 1247 // _assert will panic with a given formatted message if the given condition is false. 1248 func _assert(condition bool, msg string, v ...interface{}) { 1249 if !condition { 1250 panic(fmt.Sprintf("assertion failed: "+msg, v...)) 1251 } 1252 }