github.com/ledgerwatch/erigon-lib@v1.0.0/kv/mdbx/kv_mdbx.go (about) 1 /* 2 Copyright 2021 Erigon contributors 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package mdbx 18 19 import ( 20 "bytes" 21 "context" 22 "encoding/binary" 23 "fmt" 24 "os" 25 "runtime" 26 "sort" 27 "strings" 28 "sync" 29 "sync/atomic" 30 "time" 31 32 "github.com/c2h5oh/datasize" 33 "github.com/erigontech/mdbx-go/mdbx" 34 stack2 "github.com/go-stack/stack" 35 "github.com/ledgerwatch/erigon-lib/common/dbg" 36 "github.com/ledgerwatch/erigon-lib/kv" 37 "github.com/ledgerwatch/erigon-lib/kv/iter" 38 "github.com/ledgerwatch/erigon-lib/kv/order" 39 "github.com/ledgerwatch/log/v3" 40 "github.com/pbnjay/memory" 41 "golang.org/x/exp/maps" 42 "golang.org/x/sync/semaphore" 43 ) 44 45 const NonExistingDBI kv.DBI = 999_999_999 46 47 type TableCfgFunc func(defaultBuckets kv.TableCfg) kv.TableCfg 48 49 func WithChaindataTables(defaultBuckets kv.TableCfg) kv.TableCfg { 50 return defaultBuckets 51 } 52 53 type MdbxOpts struct { 54 // must be in the range from 12.5% (almost empty) to 50% (half empty) 55 // which corresponds to the range from 8192 and to 32768 in units respectively 56 log log.Logger 57 roTxsLimiter *semaphore.Weighted 58 bucketsCfg TableCfgFunc 59 path string 60 syncPeriod time.Duration 61 mapSize datasize.ByteSize 62 growthStep datasize.ByteSize 63 shrinkThreshold int 64 flags uint 65 pageSize uint64 66 dirtySpace uint64 // if exeed this space, modified pages will `spill` to disk 67 
mergeThreshold uint64 68 verbosity kv.DBVerbosityLvl 69 label kv.Label // marker to distinct db instances - one process may open many databases. for example to collect metrics of only 1 database 70 inMem bool 71 } 72 73 func NewMDBX(log log.Logger) MdbxOpts { 74 opts := MdbxOpts{ 75 bucketsCfg: WithChaindataTables, 76 flags: mdbx.NoReadahead | mdbx.Coalesce | mdbx.Durable, 77 log: log, 78 pageSize: kv.DefaultPageSize(), 79 80 // default is (TOTAL_RAM+AVAILABLE_RAM)/42/pageSize 81 // but for reproducibility of benchmarks - please don't rely on Available RAM 82 dirtySpace: 2 * (memory.TotalMemory() / 42), 83 84 mapSize: 2 * datasize.TB, 85 growthStep: 2 * datasize.GB, 86 mergeThreshold: 3 * 8192, 87 shrinkThreshold: -1, // default 88 label: kv.InMem, 89 } 90 return opts 91 } 92 93 func (opts MdbxOpts) GetLabel() kv.Label { return opts.label } 94 func (opts MdbxOpts) GetInMem() bool { return opts.inMem } 95 func (opts MdbxOpts) GetPageSize() uint64 { return opts.pageSize } 96 97 func (opts MdbxOpts) Label(label kv.Label) MdbxOpts { 98 opts.label = label 99 return opts 100 } 101 102 func (opts MdbxOpts) DirtySpace(s uint64) MdbxOpts { 103 opts.dirtySpace = s 104 return opts 105 } 106 107 func (opts MdbxOpts) RoTxsLimiter(l *semaphore.Weighted) MdbxOpts { 108 opts.roTxsLimiter = l 109 return opts 110 } 111 112 func (opts MdbxOpts) PageSize(v uint64) MdbxOpts { 113 opts.pageSize = v 114 return opts 115 } 116 117 func (opts MdbxOpts) GrowthStep(v datasize.ByteSize) MdbxOpts { 118 opts.growthStep = v 119 return opts 120 } 121 122 func (opts MdbxOpts) Path(path string) MdbxOpts { 123 opts.path = path 124 return opts 125 } 126 127 func (opts MdbxOpts) Set(opt MdbxOpts) MdbxOpts { 128 return opt 129 } 130 131 func (opts MdbxOpts) InMem(tmpDir string) MdbxOpts { 132 if tmpDir != "" { 133 if err := os.MkdirAll(tmpDir, 0755); err != nil { 134 panic(err) 135 } 136 } 137 path, err := os.MkdirTemp(tmpDir, "erigon-memdb-") 138 if err != nil { 139 panic(err) 140 } 141 opts.path = 
path 142 opts.inMem = true 143 opts.flags = mdbx.UtterlyNoSync | mdbx.NoMetaSync | mdbx.LifoReclaim | mdbx.NoMemInit 144 opts.growthStep = 2 * datasize.MB 145 opts.mapSize = 512 * datasize.MB 146 opts.shrinkThreshold = 0 // disable 147 opts.label = kv.InMem 148 return opts 149 } 150 151 func (opts MdbxOpts) Exclusive() MdbxOpts { 152 opts.flags = opts.flags | mdbx.Exclusive 153 return opts 154 } 155 156 func (opts MdbxOpts) Flags(f func(uint) uint) MdbxOpts { 157 opts.flags = f(opts.flags) 158 return opts 159 } 160 161 func (opts MdbxOpts) HasFlag(flag uint) bool { return opts.flags&flag != 0 } 162 func (opts MdbxOpts) Readonly() MdbxOpts { 163 opts.flags = opts.flags | mdbx.Readonly 164 return opts 165 } 166 167 func (opts MdbxOpts) SyncPeriod(period time.Duration) MdbxOpts { 168 opts.syncPeriod = period 169 return opts 170 } 171 172 func (opts MdbxOpts) DBVerbosity(v kv.DBVerbosityLvl) MdbxOpts { 173 opts.verbosity = v 174 return opts 175 } 176 177 func (opts MdbxOpts) MapSize(sz datasize.ByteSize) MdbxOpts { 178 opts.mapSize = sz 179 return opts 180 } 181 182 func (opts MdbxOpts) WriteMap() MdbxOpts { 183 opts.flags |= mdbx.WriteMap 184 return opts 185 } 186 187 func (opts MdbxOpts) WriteMergeThreshold(v uint64) MdbxOpts { 188 opts.mergeThreshold = v 189 return opts 190 } 191 192 func (opts MdbxOpts) WithTableCfg(f TableCfgFunc) MdbxOpts { 193 opts.bucketsCfg = f 194 return opts 195 } 196 197 var pathDbMap = map[string]kv.RoDB{} 198 var pathDbMapLock sync.Mutex 199 200 func addToPathDbMap(path string, db kv.RoDB) { 201 pathDbMapLock.Lock() 202 defer pathDbMapLock.Unlock() 203 pathDbMap[path] = db 204 } 205 206 func removeFromPathDbMap(path string) { 207 pathDbMapLock.Lock() 208 defer pathDbMapLock.Unlock() 209 delete(pathDbMap, path) 210 } 211 212 func PathDbMap() map[string]kv.RoDB { 213 pathDbMapLock.Lock() 214 defer pathDbMapLock.Unlock() 215 return maps.Clone(pathDbMap) 216 } 217 218 func (opts MdbxOpts) Open() (kv.RwDB, error) { 219 if dbg.WriteMap() { 220 
opts = opts.WriteMap() //nolint 221 } 222 if dbg.DirtySpace() > 0 { 223 opts = opts.DirtySpace(dbg.DirtySpace()) //nolint 224 } 225 if dbg.NoSync() { 226 opts = opts.Flags(func(u uint) uint { return u | mdbx.SafeNoSync }) //nolint 227 } 228 if dbg.MergeTr() > 0 { 229 opts = opts.WriteMergeThreshold(uint64(dbg.MergeTr() * 8192)) //nolint 230 } 231 if dbg.MdbxReadAhead() { 232 opts = opts.Flags(func(u uint) uint { return u &^ mdbx.NoReadahead }) //nolint 233 } 234 env, err := mdbx.NewEnv() 235 if err != nil { 236 return nil, err 237 } 238 if opts.verbosity != -1 { 239 err = env.SetDebug(mdbx.LogLvl(opts.verbosity), mdbx.DbgDoNotChange, mdbx.LoggerDoNotChange) // temporary disable error, because it works if call it 1 time, but returns error if call it twice in same process (what often happening in tests) 240 if err != nil { 241 return nil, fmt.Errorf("db verbosity set: %w", err) 242 } 243 } 244 if err = env.SetOption(mdbx.OptMaxDB, 200); err != nil { 245 return nil, err 246 } 247 if err = env.SetOption(mdbx.OptMaxReaders, kv.ReadersLimit); err != nil { 248 return nil, err 249 } 250 251 if opts.flags&mdbx.Accede == 0 { 252 if err = env.SetGeometry(-1, -1, int(opts.mapSize), int(opts.growthStep), opts.shrinkThreshold, int(opts.pageSize)); err != nil { 253 return nil, err 254 } 255 if err = os.MkdirAll(opts.path, 0744); err != nil { 256 return nil, fmt.Errorf("could not create dir: %s, %w", opts.path, err) 257 } 258 } 259 260 err = env.Open(opts.path, opts.flags, 0664) 261 if err != nil { 262 if err != nil { 263 return nil, fmt.Errorf("%w, label: %s, trace: %s", err, opts.label.String(), stack2.Trace().String()) 264 } 265 } 266 267 // mdbx will not change pageSize if db already exists. 
means need read real value after env.open() 268 in, err := env.Info(nil) 269 if err != nil { 270 if err != nil { 271 return nil, fmt.Errorf("%w, label: %s, trace: %s", err, opts.label.String(), stack2.Trace().String()) 272 } 273 } 274 275 opts.pageSize = uint64(in.PageSize) 276 277 //nolint 278 if opts.flags&mdbx.Accede == 0 && opts.flags&mdbx.Readonly == 0 { 279 } 280 // erigon using big transactions 281 // increase "page measured" options. need do it after env.Open() because default are depend on pageSize known only after env.Open() 282 if opts.flags&mdbx.Readonly == 0 { 283 // 1/8 is good for transactions with a lot of modifications - to reduce invalidation size. 284 // But Erigon app now using Batch and etl.Collectors to avoid writing to DB frequently changing data. 285 // It means most of our writes are: APPEND or "single UPSERT per key during transaction" 286 //if err = env.SetOption(mdbx.OptSpillMinDenominator, 8); err != nil { 287 // return nil, err 288 //} 289 290 txnDpInitial, err := env.GetOption(mdbx.OptTxnDpInitial) 291 if err != nil { 292 return nil, err 293 } 294 if err = env.SetOption(mdbx.OptTxnDpInitial, txnDpInitial*2); err != nil { 295 return nil, err 296 } 297 dpReserveLimit, err := env.GetOption(mdbx.OptDpReverseLimit) 298 if err != nil { 299 return nil, err 300 } 301 if err = env.SetOption(mdbx.OptDpReverseLimit, dpReserveLimit*2); err != nil { 302 return nil, err 303 } 304 305 if err = env.SetOption(mdbx.OptTxnDpLimit, opts.dirtySpace/opts.pageSize); err != nil { 306 return nil, err 307 } 308 // must be in the range from 12.5% (almost empty) to 50% (half empty) 309 // which corresponds to the range from 8192 and to 32768 in units respectively 310 if err = env.SetOption(mdbx.OptMergeThreshold16dot16Percent, opts.mergeThreshold); err != nil { 311 return nil, err 312 } 313 } 314 315 dirtyPagesLimit, err := env.GetOption(mdbx.OptTxnDpLimit) 316 if err != nil { 317 return nil, err 318 } 319 320 if opts.syncPeriod != 0 { 321 if err = 
env.SetSyncPeriod(opts.syncPeriod); err != nil { 322 env.Close() 323 return nil, err 324 } 325 } 326 //if err := env.SetOption(mdbx.OptSyncBytes, uint64(math2.MaxUint64)); err != nil { 327 // return nil, err 328 //} 329 330 if opts.roTxsLimiter == nil { 331 targetSemCount := int64(runtime.GOMAXPROCS(-1) * 16) 332 opts.roTxsLimiter = semaphore.NewWeighted(targetSemCount) // 1 less than max to allow unlocking to happen 333 } 334 db := &MdbxKV{ 335 opts: opts, 336 env: env, 337 log: opts.log, 338 wg: &sync.WaitGroup{}, 339 buckets: kv.TableCfg{}, 340 txSize: dirtyPagesLimit * opts.pageSize, 341 roTxsLimiter: opts.roTxsLimiter, 342 343 leakDetector: dbg.NewLeakDetector("db."+opts.label.String(), dbg.SlowTx()), 344 } 345 346 customBuckets := opts.bucketsCfg(kv.ChaindataTablesCfg) 347 for name, cfg := range customBuckets { // copy map to avoid changing global variable 348 db.buckets[name] = cfg 349 } 350 351 buckets := bucketSlice(db.buckets) 352 if err := db.openDBIs(buckets); err != nil { 353 return nil, err 354 } 355 356 // Configure buckets and open deprecated buckets 357 if err := env.View(func(tx *mdbx.Txn) error { 358 for _, name := range buckets { 359 // Open deprecated buckets if they exist, don't create 360 if !db.buckets[name].IsDeprecated { 361 continue 362 } 363 cnfCopy := db.buckets[name] 364 dbi, createErr := tx.OpenDBI(name, mdbx.DBAccede, nil, nil) 365 if createErr != nil { 366 if mdbx.IsNotFound(createErr) { 367 cnfCopy.DBI = NonExistingDBI 368 db.buckets[name] = cnfCopy 369 continue // if deprecated bucket couldn't be open - then it's deleted and it's fine 370 } else { 371 return fmt.Errorf("bucket: %s, %w", name, createErr) 372 } 373 } 374 cnfCopy.DBI = kv.DBI(dbi) 375 db.buckets[name] = cnfCopy 376 } 377 return nil 378 }); err != nil { 379 return nil, err 380 } 381 382 if !opts.inMem { 383 if staleReaders, err := db.env.ReaderCheck(); err != nil { 384 db.log.Error("failed ReaderCheck", "err", err) 385 } else if staleReaders > 0 { 386 
db.log.Info("cleared reader slots from dead processes", "amount", staleReaders) 387 } 388 389 } 390 db.path = opts.path 391 addToPathDbMap(opts.path, db) 392 return db, nil 393 } 394 395 func (opts MdbxOpts) MustOpen() kv.RwDB { 396 db, err := opts.Open() 397 if err != nil { 398 panic(fmt.Errorf("fail to open mdbx: %w", err)) 399 } 400 return db 401 } 402 403 type MdbxKV struct { 404 log log.Logger 405 env *mdbx.Env 406 wg *sync.WaitGroup 407 buckets kv.TableCfg 408 roTxsLimiter *semaphore.Weighted // does limit amount of concurrent Ro transactions - in most casess runtime.NumCPU() is good value for this channel capacity - this channel can be shared with other components (like Decompressor) 409 opts MdbxOpts 410 txSize uint64 411 closed atomic.Bool 412 path string 413 414 leakDetector *dbg.LeakDetector 415 } 416 417 func (db *MdbxKV) PageSize() uint64 { return db.opts.pageSize } 418 func (db *MdbxKV) ReadOnly() bool { return db.opts.HasFlag(mdbx.Readonly) } 419 420 // openDBIs - first trying to open existing DBI's in RO transaction 421 // otherwise re-try by RW transaction 422 // it allow open DB from another process - even if main process holding long RW transaction 423 func (db *MdbxKV) openDBIs(buckets []string) error { 424 if db.ReadOnly() { 425 if err := db.View(context.Background(), func(tx kv.Tx) error { 426 for _, name := range buckets { 427 if db.buckets[name].IsDeprecated { 428 continue 429 } 430 if err := tx.(kv.BucketMigrator).CreateBucket(name); err != nil { 431 return err 432 } 433 } 434 return tx.Commit() // when open db as read-only, commit of this RO transaction is required 435 }); err != nil { 436 return err 437 } 438 } else { 439 if err := db.Update(context.Background(), func(tx kv.RwTx) error { 440 for _, name := range buckets { 441 if db.buckets[name].IsDeprecated { 442 continue 443 } 444 if err := tx.(kv.BucketMigrator).CreateBucket(name); err != nil { 445 return err 446 } 447 } 448 return nil 449 }); err != nil { 450 return err 451 } 452 } 
453 return nil 454 } 455 456 // Close closes db 457 // All transactions must be closed before closing the database. 458 func (db *MdbxKV) Close() { 459 if ok := db.closed.CompareAndSwap(false, true); !ok { 460 return 461 } 462 db.wg.Wait() 463 db.env.Close() 464 db.env = nil 465 466 if db.opts.inMem { 467 if err := os.RemoveAll(db.opts.path); err != nil { 468 db.log.Warn("failed to remove in-mem db file", "err", err) 469 } 470 } 471 removeFromPathDbMap(db.path) 472 } 473 474 func (db *MdbxKV) BeginRo(ctx context.Context) (txn kv.Tx, err error) { 475 if db.closed.Load() { 476 return nil, fmt.Errorf("db closed") 477 } 478 479 // don't try to acquire if the context is already done 480 select { 481 case <-ctx.Done(): 482 return nil, ctx.Err() 483 default: 484 // otherwise carry on 485 } 486 487 // will return nil err if context is cancelled (may appear to acquire the semaphore) 488 if semErr := db.roTxsLimiter.Acquire(ctx, 1); semErr != nil { 489 return nil, semErr 490 } 491 492 defer func() { 493 if txn == nil { 494 // on error, or if there is whatever reason that we don't return a tx, 495 // we need to free up the limiter slot, otherwise it could lead to deadlocks 496 db.roTxsLimiter.Release(1) 497 } 498 }() 499 500 tx, err := db.env.BeginTxn(nil, mdbx.Readonly) 501 if err != nil { 502 return nil, fmt.Errorf("%w, label: %s, trace: %s", err, db.opts.label.String(), stack2.Trace().String()) 503 } 504 db.wg.Add(1) 505 return &MdbxTx{ 506 ctx: ctx, 507 db: db, 508 tx: tx, 509 readOnly: true, 510 id: db.leakDetector.Add(), 511 }, nil 512 } 513 514 func (db *MdbxKV) BeginRw(ctx context.Context) (kv.RwTx, error) { 515 return db.beginRw(ctx, 0) 516 } 517 func (db *MdbxKV) BeginRwNosync(ctx context.Context) (kv.RwTx, error) { 518 return db.beginRw(ctx, mdbx.TxNoSync) 519 } 520 521 func (db *MdbxKV) beginRw(ctx context.Context, flags uint) (txn kv.RwTx, err error) { 522 select { 523 case <-ctx.Done(): 524 return nil, ctx.Err() 525 default: 526 } 527 528 if db.closed.Load() { 
529 return nil, fmt.Errorf("db closed") 530 } 531 runtime.LockOSThread() 532 tx, err := db.env.BeginTxn(nil, flags) 533 if err != nil { 534 runtime.UnlockOSThread() // unlock only in case of error. normal flow is "defer .Rollback()" 535 return nil, fmt.Errorf("%w, lable: %s, trace: %s", err, db.opts.label.String(), stack2.Trace().String()) 536 } 537 db.wg.Add(1) 538 return &MdbxTx{ 539 db: db, 540 tx: tx, 541 ctx: ctx, 542 id: db.leakDetector.Add(), 543 }, nil 544 } 545 546 type MdbxTx struct { 547 tx *mdbx.Txn 548 db *MdbxKV 549 cursors map[uint64]*mdbx.Cursor 550 streams []kv.Closer 551 statelessCursors map[string]kv.RwCursor 552 readOnly bool 553 cursorID uint64 554 ctx context.Context 555 id uint64 // set only if TRACE_TX=true 556 } 557 558 type MdbxCursor struct { 559 tx *MdbxTx 560 c *mdbx.Cursor 561 bucketName string 562 bucketCfg kv.TableCfgItem 563 dbi mdbx.DBI 564 id uint64 565 } 566 567 func (db *MdbxKV) Env() *mdbx.Env { 568 return db.env 569 } 570 571 func (db *MdbxKV) AllDBI() map[string]kv.DBI { 572 res := map[string]kv.DBI{} 573 for name, cfg := range db.buckets { 574 res[name] = cfg.DBI 575 } 576 return res 577 } 578 579 func (db *MdbxKV) AllTables() kv.TableCfg { 580 return db.buckets 581 } 582 583 func (tx *MdbxTx) ViewID() uint64 { return tx.tx.ID() } 584 585 func (tx *MdbxTx) CollectMetrics() { 586 if tx.db.opts.label != kv.ChainDB { 587 return 588 } 589 590 info, err := tx.db.env.Info(tx.tx) 591 if err != nil { 592 return 593 } 594 if info.SinceReaderCheck.Hours() > 1 { 595 if staleReaders, err := tx.db.env.ReaderCheck(); err != nil { 596 tx.db.log.Error("failed ReaderCheck", "err", err) 597 } else if staleReaders > 0 { 598 tx.db.log.Info("cleared reader slots from dead processes", "amount", staleReaders) 599 } 600 } 601 602 kv.DbSize.Set(info.Geo.Current) 603 kv.DbPgopsNewly.Set(info.PageOps.Newly) 604 kv.DbPgopsCow.Set(info.PageOps.Cow) 605 kv.DbPgopsClone.Set(info.PageOps.Clone) 606 kv.DbPgopsSplit.Set(info.PageOps.Split) 607 
// CollectMetrics publishes environment/transaction gauges for the chain
// database only; any error while gathering stats silently aborts (metrics
// collection must never fail the caller).
func (tx *MdbxTx) CollectMetrics() {
	if tx.db.opts.label != kv.ChainDB {
		return
	}

	info, err := tx.db.env.Info(tx.tx)
	if err != nil {
		return
	}
	// opportunistically clear reader slots of dead processes at most ~hourly
	if info.SinceReaderCheck.Hours() > 1 {
		if staleReaders, err := tx.db.env.ReaderCheck(); err != nil {
			tx.db.log.Error("failed ReaderCheck", "err", err)
		} else if staleReaders > 0 {
			tx.db.log.Info("cleared reader slots from dead processes", "amount", staleReaders)
		}
	}

	kv.DbSize.Set(info.Geo.Current)
	kv.DbPgopsNewly.Set(info.PageOps.Newly)
	kv.DbPgopsCow.Set(info.PageOps.Cow)
	kv.DbPgopsClone.Set(info.PageOps.Clone)
	kv.DbPgopsSplit.Set(info.PageOps.Split)
	kv.DbPgopsMerge.Set(info.PageOps.Merge)
	kv.DbPgopsSpill.Set(info.PageOps.Spill)
	kv.DbPgopsUnspill.Set(info.PageOps.Unspill)
	kv.DbPgopsWops.Set(info.PageOps.Wops)

	txInfo, err := tx.tx.Info(true)
	if err != nil {
		return
	}

	kv.TxDirty.Set(txInfo.SpaceDirty)
	kv.TxLimit.Set(tx.db.txSize)
	kv.TxSpill.Set(txInfo.Spill)
	kv.TxUnspill.Set(txInfo.Unspill)

	gc, err := tx.BucketStat("gc")
	if err != nil {
		return
	}
	kv.GcLeafMetric.Set(gc.LeafPages)
	kv.GcOverflowMetric.Set(gc.OverflowPages)
	// NOTE(review): the /8 scaling looks like a unit adjustment - confirm
	// against the metric's expected unit before changing.
	kv.GcPagesMetric.Set((gc.LeafPages + gc.OverflowPages) * tx.db.opts.pageSize / 8)
}

// ListBuckets - all buckets stored as keys of un-named bucket
func (tx *MdbxTx) ListBuckets() ([]string, error) {
	return tx.tx.ListDBI()
}

// View runs f inside a read-only transaction that is always rolled back.
func (db *MdbxKV) View(ctx context.Context, f func(tx kv.Tx) error) (err error) {
	// can't use db.env.View method - because it calls commit for read transactions - it conflicts with write transactions.
	tx, err := db.BeginRo(ctx)
	if err != nil {
		return err
	}
	defer tx.Rollback()

	return f(tx)
}

// UpdateNosync runs f inside a read-write transaction committed without fsync.
func (db *MdbxKV) UpdateNosync(ctx context.Context, f func(tx kv.RwTx) error) (err error) {
	tx, err := db.BeginRwNosync(ctx)
	if err != nil {
		return err
	}
	defer tx.Rollback()
	err = f(tx)
	if err != nil {
		return err
	}
	err = tx.Commit()
	if err != nil {
		return err
	}
	return nil
}

// Update runs f inside a durable read-write transaction; Rollback after a
// successful Commit is a harmless no-op.
func (db *MdbxKV) Update(ctx context.Context, f func(tx kv.RwTx) error) (err error) {
	tx, err := db.BeginRw(ctx)
	if err != nil {
		return err
	}
	defer tx.Rollback()
	err = f(tx)
	if err != nil {
		return err
	}
	err = tx.Commit()
	if err != nil {
		return err
	}
	return nil
}

// CreateBucket opens the named bucket, attaching to an existing DBI when
// possible (DBAccede) and otherwise creating it with the configured flags.
func (tx *MdbxTx) CreateBucket(name string) error {
	cnfCopy := tx.db.buckets[name]
	dbi, err := tx.tx.OpenDBI(name, mdbx.DBAccede, nil, nil)
	if err != nil && !mdbx.IsNotFound(err) {
		return fmt.Errorf("create table: %s, %w", name, err)
	}
	if err == nil {
		// bucket already exists - record its handle and actual flags
		cnfCopy.DBI = kv.DBI(dbi)
		var flags uint
		flags, err = tx.tx.Flags(dbi)
		if err != nil {
			return err
		}
		cnfCopy.Flags = kv.TableFlags(flags)

		tx.db.buckets[name] = cnfCopy
		return nil
	}

	// if bucket doesn't exists - create it

	var flags = tx.db.buckets[name].Flags
	var nativeFlags uint
	if !tx.db.ReadOnly() {
		nativeFlags |= mdbx.Create
	}

	if flags&kv.DupSort != 0 {
		nativeFlags |= mdbx.DupSort
		flags ^= kv.DupSort
	}
	// any flag bits left over are ones this driver doesn't support
	if flags != 0 {
		return fmt.Errorf("some not supported flag provided for bucket")
	}

	dbi, err = tx.tx.OpenDBI(name, nativeFlags, nil, nil)

	if err != nil {
		return fmt.Errorf("create table: %s, %w", name, err)
	}
	cnfCopy.DBI = kv.DBI(dbi)

	tx.db.buckets[name] = cnfCopy
	return nil
}
728 dbi := tx.db.buckets[name].DBI 729 // if bucket was not open on db start, then it's may be deprecated 730 // try to open it now without `Create` flag, and if fail then nothing to drop 731 if dbi == NonExistingDBI { 732 nativeDBI, err := tx.tx.OpenDBI(name, 0, nil, nil) 733 if err != nil { 734 if mdbx.IsNotFound(err) { 735 return nil // DBI doesn't exists means no drop needed 736 } 737 return fmt.Errorf("bucket: %s, %w", name, err) 738 } 739 dbi = kv.DBI(nativeDBI) 740 } 741 742 if err := tx.tx.Drop(mdbx.DBI(dbi), true); err != nil { 743 return err 744 } 745 cnfCopy := tx.db.buckets[name] 746 cnfCopy.DBI = NonExistingDBI 747 tx.db.buckets[name] = cnfCopy 748 return nil 749 } 750 751 func (tx *MdbxTx) ClearBucket(bucket string) error { 752 dbi := tx.db.buckets[bucket].DBI 753 if dbi == NonExistingDBI { 754 return nil 755 } 756 return tx.tx.Drop(mdbx.DBI(dbi), false) 757 } 758 759 func (tx *MdbxTx) DropBucket(bucket string) error { 760 if cfg, ok := tx.db.buckets[bucket]; !(ok && cfg.IsDeprecated) { 761 return fmt.Errorf("%w, bucket: %s", kv.ErrAttemptToDeleteNonDeprecatedBucket, bucket) 762 } 763 764 return tx.dropEvenIfBucketIsNotDeprecated(bucket) 765 } 766 767 func (tx *MdbxTx) ExistsBucket(bucket string) (bool, error) { 768 if cfg, ok := tx.db.buckets[bucket]; ok { 769 return cfg.DBI != NonExistingDBI, nil 770 } 771 return false, nil 772 } 773 774 func (tx *MdbxTx) Commit() error { 775 if tx.tx == nil { 776 return nil 777 } 778 defer func() { 779 tx.tx = nil 780 tx.db.wg.Done() 781 if tx.readOnly { 782 tx.db.roTxsLimiter.Release(1) 783 } else { 784 runtime.UnlockOSThread() 785 } 786 tx.db.leakDetector.Del(tx.id) 787 }() 788 tx.closeCursors() 789 790 //slowTx := 10 * time.Second 791 //if debug.SlowCommit() > 0 { 792 // slowTx = debug.SlowCommit() 793 //} 794 // 795 //if debug.BigRoTxKb() > 0 || debug.BigRwTxKb() > 0 { 796 // tx.PrintDebugInfo() 797 //} 798 tx.CollectMetrics() 799 800 latency, err := tx.tx.Commit() 801 if err != nil { 802 return err 803 } 804 
805 if tx.db.opts.label == kv.ChainDB { 806 kv.DbCommitPreparation.Update(latency.Preparation.Seconds()) 807 //kv.DbCommitAudit.Update(latency.Audit.Seconds()) 808 kv.DbCommitWrite.Update(latency.Write.Seconds()) 809 kv.DbCommitSync.Update(latency.Sync.Seconds()) 810 kv.DbCommitEnding.Update(latency.Ending.Seconds()) 811 kv.DbCommitTotal.Update(latency.Whole.Seconds()) 812 813 //kv.DbGcWorkPnlMergeTime.Update(latency.GCDetails.WorkPnlMergeTime.Seconds()) 814 //kv.DbGcWorkPnlMergeVolume.Set(uint64(latency.GCDetails.WorkPnlMergeVolume)) 815 //kv.DbGcWorkPnlMergeCalls.Set(uint64(latency.GCDetails.WorkPnlMergeCalls)) 816 // 817 //kv.DbGcSelfPnlMergeTime.Update(latency.GCDetails.SelfPnlMergeTime.Seconds()) 818 //kv.DbGcSelfPnlMergeVolume.Set(uint64(latency.GCDetails.SelfPnlMergeVolume)) 819 //kv.DbGcSelfPnlMergeCalls.Set(uint64(latency.GCDetails.SelfPnlMergeCalls)) 820 } 821 822 return nil 823 } 824 825 func (tx *MdbxTx) Rollback() { 826 if tx.tx == nil { 827 return 828 } 829 defer func() { 830 tx.tx = nil 831 tx.db.wg.Done() 832 if tx.readOnly { 833 tx.db.roTxsLimiter.Release(1) 834 } else { 835 runtime.UnlockOSThread() 836 } 837 tx.db.leakDetector.Del(tx.id) 838 }() 839 tx.closeCursors() 840 //tx.printDebugInfo() 841 tx.tx.Abort() 842 } 843 844 func (tx *MdbxTx) SpaceDirty() (uint64, uint64, error) { 845 txInfo, err := tx.tx.Info(true) 846 if err != nil { 847 return 0, 0, err 848 } 849 850 return txInfo.SpaceDirty, tx.db.txSize, nil 851 } 852 853 func (tx *MdbxTx) PrintDebugInfo() { 854 /* 855 txInfo, err := tx.tx.Info(true) 856 if err != nil { 857 panic(err) 858 } 859 860 txSize := uint(txInfo.SpaceDirty / 1024) 861 doPrint := debug.BigRoTxKb() == 0 && debug.BigRwTxKb() == 0 || 862 tx.readOnly && debug.BigRoTxKb() > 0 && txSize > debug.BigRoTxKb() || 863 (!tx.readOnly && debug.BigRwTxKb() > 0 && txSize > debug.BigRwTxKb()) 864 if doPrint { 865 tx.db.log.Info("Tx info", 866 "id", txInfo.Id, 867 "read_lag", txInfo.ReadLag, 868 "ro", tx.readOnly, 869 
//"space_retired_mb", txInfo.SpaceRetired/1024/1024, 870 "space_dirty_mb", txInfo.SpaceDirty/1024/1024, 871 //"callers", debug.Callers(7), 872 ) 873 } 874 */ 875 } 876 877 func (tx *MdbxTx) closeCursors() { 878 for _, c := range tx.cursors { 879 if c != nil { 880 c.Close() 881 } 882 } 883 tx.cursors = nil 884 for _, c := range tx.streams { 885 if c != nil { 886 c.Close() 887 } 888 } 889 tx.statelessCursors = nil 890 } 891 892 func (tx *MdbxTx) statelessCursor(bucket string) (kv.RwCursor, error) { 893 if tx.statelessCursors == nil { 894 tx.statelessCursors = make(map[string]kv.RwCursor) 895 } 896 c, ok := tx.statelessCursors[bucket] 897 if !ok { 898 var err error 899 c, err = tx.RwCursor(bucket) 900 if err != nil { 901 return nil, err 902 } 903 tx.statelessCursors[bucket] = c 904 } 905 return c, nil 906 } 907 908 func (tx *MdbxTx) Put(table string, k, v []byte) error { 909 c, err := tx.statelessCursor(table) 910 if err != nil { 911 return err 912 } 913 return c.Put(k, v) 914 } 915 916 func (tx *MdbxTx) Delete(table string, k []byte) error { 917 c, err := tx.statelessCursor(table) 918 if err != nil { 919 return err 920 } 921 return c.Delete(k) 922 } 923 924 func (tx *MdbxTx) GetOne(bucket string, k []byte) ([]byte, error) { 925 c, err := tx.statelessCursor(bucket) 926 if err != nil { 927 return nil, err 928 } 929 _, v, err := c.SeekExact(k) 930 return v, err 931 } 932 933 func (tx *MdbxTx) Has(bucket string, key []byte) (bool, error) { 934 c, err := tx.statelessCursor(bucket) 935 if err != nil { 936 return false, err 937 } 938 k, _, err := c.Seek(key) 939 if err != nil { 940 return false, err 941 } 942 return bytes.Equal(key, k), nil 943 } 944 945 func (tx *MdbxTx) Append(bucket string, k, v []byte) error { 946 c, err := tx.statelessCursor(bucket) 947 if err != nil { 948 return err 949 } 950 return c.Append(k, v) 951 } 952 func (tx *MdbxTx) AppendDup(bucket string, k, v []byte) error { 953 c, err := tx.statelessCursor(bucket) 954 if err != nil { 955 return err 956 } 
// IncrementSequence atomically bumps the named sequence by amount and returns
// the value BEFORE the increment (i.e. the first id of the reserved range).
func (tx *MdbxTx) IncrementSequence(bucket string, amount uint64) (uint64, error) {
	c, err := tx.statelessCursor(kv.Sequence)
	if err != nil {
		return 0, err
	}
	_, v, err := c.SeekExact([]byte(bucket))
	if err != nil {
		return 0, err
	}

	var currentV uint64 = 0
	if len(v) > 0 {
		currentV = binary.BigEndian.Uint64(v)
	}

	newVBytes := make([]byte, 8)
	binary.BigEndian.PutUint64(newVBytes, currentV+amount)
	err = c.Put([]byte(bucket), newVBytes)
	if err != nil {
		return 0, err
	}
	return currentV, nil
}

// ReadSequence returns the current value of the named sequence (0 if unset).
func (tx *MdbxTx) ReadSequence(bucket string) (uint64, error) {
	c, err := tx.statelessCursor(kv.Sequence)
	if err != nil {
		return 0, err
	}
	_, v, err := c.SeekExact([]byte(bucket))
	if err != nil && !mdbx.IsNotFound(err) {
		return 0, err
	}

	var currentV uint64
	if len(v) > 0 {
		currentV = binary.BigEndian.Uint64(v)
	}

	return currentV, nil
}

// BucketSize returns the approximate on-disk size of a bucket in bytes
// (page counts times page size).
func (tx *MdbxTx) BucketSize(name string) (uint64, error) {
	st, err := tx.BucketStat(name)
	if err != nil {
		return 0, err
	}
	return (st.LeafPages + st.BranchPages + st.OverflowPages) * tx.db.opts.pageSize, nil
}

// BucketStat returns MDBX statistics for a bucket; the special names
// "freelist"/"gc"/"free_list" map to DBI 0 and "root" to DBI 1.
func (tx *MdbxTx) BucketStat(name string) (*mdbx.Stat, error) {
	if name == "freelist" || name == "gc" || name == "free_list" {
		return tx.tx.StatDBI(mdbx.DBI(0))
	}
	if name == "root" {
		return tx.tx.StatDBI(mdbx.DBI(1))
	}
	st, err := tx.tx.StatDBI(mdbx.DBI(tx.db.buckets[name].DBI))
	if err != nil {
		return nil, fmt.Errorf("bucket: %s, %w", name, err)
	}
	return st, nil
}

// DBSize returns the current size of the database map in bytes.
func (tx *MdbxTx) DBSize() (uint64, error) {
	info, err := tx.db.env.Info(tx.tx)
	if err != nil {
		return 0, err
	}
	return info.Geo.Current, err
}

// RwCursor returns the cursor flavor matching the bucket's configuration:
// plain cursor for auto-dupsort-converted buckets, dupsort cursor otherwise
// when the DupSort flag is set.
func (tx *MdbxTx) RwCursor(bucket string) (kv.RwCursor, error) {
	b := tx.db.buckets[bucket]
	if b.AutoDupSortKeysConversion {
		return tx.stdCursor(bucket)
	}

	if b.Flags&kv.DupSort != 0 {
		return tx.RwCursorDupSort(bucket)
	}

	return tx.stdCursor(bucket)
}

func (tx *MdbxTx) Cursor(bucket string) (kv.Cursor, error) {
	return tx.RwCursor(bucket)
}

// stdCursor opens a plain MDBX cursor on the bucket and registers it for
// auto-cleanup when the transaction ends.
func (tx *MdbxTx) stdCursor(bucket string) (kv.RwCursor, error) {
	b := tx.db.buckets[bucket]
	c := &MdbxCursor{bucketName: bucket, tx: tx, bucketCfg: b, dbi: mdbx.DBI(tx.db.buckets[bucket].DBI), id: tx.cursorID}
	tx.cursorID++

	var err error
	c.c, err = tx.tx.OpenCursor(c.dbi)
	if err != nil {
		return nil, fmt.Errorf("table: %s, %w, stack: %s", c.bucketName, err, dbg.Stack())
	}

	// add to auto-cleanup on end of transactions
	if tx.cursors == nil {
		tx.cursors = map[uint64]*mdbx.Cursor{}
	}
	tx.cursors[c.id] = c.c
	return c, nil
}

func (tx *MdbxTx) RwCursorDupSort(bucket string) (kv.RwCursorDupSort, error) {
	basicCursor, err := tx.stdCursor(bucket)
	if err != nil {
		return nil, err
	}
	return &MdbxDupSortCursor{MdbxCursor: basicCursor.(*MdbxCursor)}, nil
}

func (tx *MdbxTx) CursorDupSort(bucket string) (kv.CursorDupSort, error) {
	return tx.RwCursorDupSort(bucket)
}

// methods here help to see better pprof picture
func (c *MdbxCursor) set(k []byte) ([]byte, []byte, error)  { return c.c.Get(k, nil, mdbx.Set) }
func (c *MdbxCursor) getCurrent() ([]byte, []byte, error)   { return c.c.Get(nil, nil, mdbx.GetCurrent) }
func (c *MdbxCursor) first() ([]byte, []byte, error)        { return c.c.Get(nil, nil, mdbx.First) }
func (c *MdbxCursor) next() ([]byte, []byte, error)         { return c.c.Get(nil, nil, mdbx.Next) }
func (c *MdbxCursor) nextDup() ([]byte, []byte, error)      { return c.c.Get(nil, nil, mdbx.NextDup) }
func (c *MdbxCursor) nextNoDup() ([]byte, []byte, error)    { return c.c.Get(nil, nil, mdbx.NextNoDup) }
func (c *MdbxCursor) prev() ([]byte, []byte, error)         { return c.c.Get(nil, nil, mdbx.Prev) }
func (c *MdbxCursor) prevDup() ([]byte, []byte, error)      { return c.c.Get(nil, nil, mdbx.PrevDup) }
func (c *MdbxCursor) prevNoDup() ([]byte, []byte, error)    { return c.c.Get(nil, nil, mdbx.PrevNoDup) }
func (c *MdbxCursor) last() ([]byte, []byte, error)         { return c.c.Get(nil, nil, mdbx.Last) }
func (c *MdbxCursor) delCurrent() error                     { return c.c.Del(mdbx.Current) }
func (c *MdbxCursor) delAllDupData() error                  { return c.c.Del(mdbx.AllDups) }
func (c *MdbxCursor) put(k, v []byte) error                 { return c.c.Put(k, v, 0) }
func (c *MdbxCursor) putCurrent(k, v []byte) error          { return c.c.Put(k, v, mdbx.Current) }
func (c *MdbxCursor) putNoOverwrite(k, v []byte) error      { return c.c.Put(k, v, mdbx.NoOverwrite) }
func (c *MdbxCursor) getBoth(k, v []byte) ([]byte, error) {
	_, v, err := c.c.Get(k, v, mdbx.GetBoth)
	return v, err
}
func (c *MdbxCursor) setRange(k []byte) ([]byte, []byte, error) {
	return c.c.Get(k, nil, mdbx.SetRange)
}
func (c *MdbxCursor) getBothRange(k, v []byte) ([]byte, error) {
	_, v, err := c.c.Get(k, v, mdbx.GetBothRange)
	return v, err
}
func (c *MdbxCursor) firstDup() ([]byte, error) {
	_, v, err := c.c.Get(nil, nil, mdbx.FirstDup)
	return v, err
}
func (c *MdbxCursor) lastDup() ([]byte, error) {
	_, v, err := c.c.Get(nil, nil, mdbx.LastDup)
	return v, err
}

// Count returns the number of entries in the cursor's bucket.
func (c *MdbxCursor) Count() (uint64, error) {
	st, err := c.tx.tx.StatDBI(c.dbi)
	if err != nil {
		return 0, err
	}
	return st.Entries, nil
}

func (c *MdbxCursor) First() ([]byte, []byte, error) {
	return c.Seek(nil)
}
= fmt.Errorf("failed MdbxKV cursor.Last(): %w, bucket: %s", err, c.bucketName) 1135 return []byte{}, nil, err 1136 } 1137 1138 b := c.bucketCfg 1139 if b.AutoDupSortKeysConversion && len(k) == b.DupToLen { 1140 keyPart := b.DupFromLen - b.DupToLen 1141 k = append(k, v[:keyPart]...) 1142 v = v[keyPart:] 1143 } 1144 1145 return k, v, nil 1146 } 1147 1148 func (c *MdbxCursor) Seek(seek []byte) (k, v []byte, err error) { 1149 if c.bucketCfg.AutoDupSortKeysConversion { 1150 return c.seekDupSort(seek) 1151 } 1152 1153 if len(seek) == 0 { 1154 k, v, err = c.first() 1155 } else { 1156 k, v, err = c.setRange(seek) 1157 } 1158 if err != nil { 1159 if mdbx.IsNotFound(err) { 1160 return nil, nil, nil 1161 } 1162 err = fmt.Errorf("failed MdbxKV cursor.Seek(): %w, bucket: %s, key: %x", err, c.bucketName, seek) 1163 return []byte{}, nil, err 1164 } 1165 1166 return k, v, nil 1167 } 1168 1169 func (c *MdbxCursor) seekDupSort(seek []byte) (k, v []byte, err error) { 1170 b := c.bucketCfg 1171 from, to := b.DupFromLen, b.DupToLen 1172 if len(seek) == 0 { 1173 k, v, err = c.first() 1174 if err != nil { 1175 if mdbx.IsNotFound(err) { 1176 return nil, nil, nil 1177 } 1178 return []byte{}, nil, err 1179 } 1180 1181 if len(k) == to { 1182 k2 := make([]byte, 0, len(k)+from-to) 1183 k2 = append(append(k2, k...), v[:from-to]...) 
1184 v = v[from-to:] 1185 k = k2 1186 } 1187 return k, v, nil 1188 } 1189 1190 var seek1, seek2 []byte 1191 if len(seek) > to { 1192 seek1, seek2 = seek[:to], seek[to:] 1193 } else { 1194 seek1 = seek 1195 } 1196 k, v, err = c.setRange(seek1) 1197 if err != nil { 1198 if mdbx.IsNotFound(err) { 1199 return nil, nil, nil 1200 } 1201 1202 return []byte{}, nil, err 1203 } 1204 1205 if seek2 != nil && bytes.Equal(seek1, k) { 1206 v, err = c.getBothRange(seek1, seek2) 1207 if err != nil && mdbx.IsNotFound(err) { 1208 k, v, err = c.next() 1209 if err != nil { 1210 if mdbx.IsNotFound(err) { 1211 return nil, nil, nil 1212 } 1213 return []byte{}, nil, err 1214 } 1215 } else if err != nil { 1216 return []byte{}, nil, err 1217 } 1218 } 1219 if len(k) == to { 1220 k2 := make([]byte, 0, len(k)+from-to) 1221 k2 = append(append(k2, k...), v[:from-to]...) 1222 v = v[from-to:] 1223 k = k2 1224 } 1225 1226 return k, v, nil 1227 } 1228 1229 func (c *MdbxCursor) Next() (k, v []byte, err error) { 1230 k, v, err = c.next() 1231 if err != nil { 1232 if mdbx.IsNotFound(err) { 1233 return nil, nil, nil 1234 } 1235 return []byte{}, nil, fmt.Errorf("failed MdbxKV cursor.Next(): %w", err) 1236 } 1237 1238 b := c.bucketCfg 1239 if b.AutoDupSortKeysConversion && len(k) == b.DupToLen { 1240 keyPart := b.DupFromLen - b.DupToLen 1241 k = append(k, v[:keyPart]...) 1242 v = v[keyPart:] 1243 } 1244 1245 return k, v, nil 1246 } 1247 1248 func (c *MdbxCursor) Prev() (k, v []byte, err error) { 1249 k, v, err = c.prev() 1250 if err != nil { 1251 if mdbx.IsNotFound(err) { 1252 return nil, nil, nil 1253 } 1254 return []byte{}, nil, fmt.Errorf("failed MdbxKV cursor.Prev(): %w", err) 1255 } 1256 1257 b := c.bucketCfg 1258 if b.AutoDupSortKeysConversion && len(k) == b.DupToLen { 1259 keyPart := b.DupFromLen - b.DupToLen 1260 k = append(k, v[:keyPart]...) 
		v = v[keyPart:]
	}

	return k, v, nil
}

// Current - return key/data at current cursor position.
// For AutoDupSortKeysConversion buckets the full key is re-assembled
// from the short table key plus the value prefix.
func (c *MdbxCursor) Current() ([]byte, []byte, error) {
	k, v, err := c.getCurrent()
	if err != nil {
		if mdbx.IsNotFound(err) {
			return nil, nil, nil
		}
		return []byte{}, nil, err
	}

	b := c.bucketCfg
	if b.AutoDupSortKeysConversion && len(k) == b.DupToLen {
		keyPart := b.DupFromLen - b.DupToLen
		k = append(k, v[:keyPart]...)
		v = v[keyPart:]
	}

	return k, v, nil
}

// Delete removes the entry for k. A missing key is not an error.
// For plain DupSort buckets all duplicates of k are removed.
func (c *MdbxCursor) Delete(k []byte) error {
	if c.bucketCfg.AutoDupSortKeysConversion {
		return c.deleteDupSort(k)
	}

	_, _, err := c.set(k)
	if err != nil {
		if mdbx.IsNotFound(err) {
			return nil // nothing to delete
		}
		return err
	}

	if c.bucketCfg.Flags&mdbx.DupSort != 0 {
		return c.delAllDupData()
	}
	return c.delCurrent()
}

// DeleteCurrent This function deletes the key/data pair to which the cursor refers.
// This does not invalidate the cursor, so operations such as MDB_NEXT
// can still be used on it.
// Both MDB_NEXT and MDB_GET_CURRENT will return the same record after
// this operation.
func (c *MdbxCursor) DeleteCurrent() error {
	return c.delCurrent()
}

// deleteDupSort is Delete for AutoDupSortKeysConversion buckets.
// Valid key lengths are exactly DupFromLen (full logical key) or < DupToLen.
// A full key is split into (short key, value prefix) and only the exactly
// matching duplicate is removed.
func (c *MdbxCursor) deleteDupSort(key []byte) error {
	b := c.bucketCfg
	from, to := b.DupFromLen, b.DupToLen
	if len(key) != from && len(key) >= to {
		return fmt.Errorf("delete from dupsort bucket: %s, can have keys of len==%d and len<%d. key: %x,%d", c.bucketName, from, to, key, len(key))
	}

	if len(key) == from {
		v, err := c.getBothRange(key[:to], key[to:])
		if err != nil { // if key not found, or found another one - then nothing to delete
			if mdbx.IsNotFound(err) {
				return nil
			}
			return err
		}
		if !bytes.Equal(v[:from-to], key[to:]) {
			// GetBothRange landed on a larger duplicate - exact pair absent
			return nil
		}
		return c.delCurrent()
	}

	_, _, err := c.set(key)
	if err != nil {
		if mdbx.IsNotFound(err) {
			return nil
		}
		return err
	}

	return c.delCurrent()
}

// PutNoOverwrite inserts (key, value) only if key is absent.
// Not supported for AutoDupSortKeysConversion buckets.
func (c *MdbxCursor) PutNoOverwrite(key []byte, value []byte) error {
	if c.bucketCfg.AutoDupSortKeysConversion {
		panic("not implemented")
	}

	return c.putNoOverwrite(key, value)
}

// Put inserts or updates (key, value) in the table.
func (c *MdbxCursor) Put(key []byte, value []byte) error {
	b := c.bucketCfg
	if b.AutoDupSortKeysConversion {
		if err := c.putDupSort(key, value); err != nil {
			return err
		}
		return nil
	}
	if err := c.put(key, value); err != nil {
		return fmt.Errorf("table: %s, err: %w", c.bucketName, err)
	}
	return nil
}

// putDupSort is Put for AutoDupSortKeysConversion buckets: a full key of
// DupFromLen bytes is stored as key[:DupToLen] with key[DupToLen:] prepended
// to the value; shorter keys are stored verbatim (insert-or-update via
// NoOverwrite falling back to Current).
func (c *MdbxCursor) putDupSort(key []byte, value []byte) error {
	b := c.bucketCfg
	from, to := b.DupFromLen, b.DupToLen
	if len(key) != from && len(key) >= to {
		return fmt.Errorf("put dupsort bucket: %s, can have keys of len==%d and len<%d. key: %x,%d", c.bucketName, from, to, key, len(key))
	}

	if len(key) != from {
		err := c.putNoOverwrite(key, value)
		if err != nil {
			if mdbx.IsKeyExists(err) {
				return c.putCurrent(key, value)
			}
			return fmt.Errorf("putNoOverwrite, bucket: %s, key: %x, val: %x, err: %w", c.bucketName, key, value, err)
		}
		return nil
	}

	// NOTE(review): appending to key[to:] can write into the caller's key
	// backing array past len(key) when it has spare capacity - assumes callers
	// don't reuse that buffer; confirm before changing.
	value = append(key[to:], value...)
	key = key[:to]
	v, err := c.getBothRange(key, value[:from-to])
	if err != nil { // if key not found, or found another one - then just insert
		if mdbx.IsNotFound(err) {
			return c.put(key, value)
		}
		return err
	}

	if bytes.Equal(v[:from-to], value[:from-to]) {
		if len(v) == len(value) { // in DupSort case mdbx.Current works only with values of same length
			return c.putCurrent(key, value)
		}
		// length changed: replace by delete + insert
		err = c.delCurrent()
		if err != nil {
			return err
		}
	}

	return c.put(key, value)
}

// SeekExact returns the entry for exactly this key, or (nil, nil, nil) when absent.
// Handles the AutoDupSortKeysConversion key split the same way as putDupSort.
func (c *MdbxCursor) SeekExact(key []byte) ([]byte, []byte, error) {
	b := c.bucketCfg
	if b.AutoDupSortKeysConversion && len(key) == b.DupFromLen {
		from, to := b.DupFromLen, b.DupToLen
		v, err := c.getBothRange(key[:to], key[to:])
		if err != nil {
			if mdbx.IsNotFound(err) {
				return nil, nil, nil
			}
			return []byte{}, nil, err
		}
		if !bytes.Equal(key[to:], v[:from-to]) {
			// landed on a larger duplicate - exact key absent
			return nil, nil, nil
		}
		return key[:to], v[from-to:], nil
	}

	k, v, err := c.set(key)
	if err != nil {
		if mdbx.IsNotFound(err) {
			return nil, nil, nil
		}
		return []byte{}, nil, err
	}
	return k, v, nil
}

// Append - speedy feature of mdbx which is not part of KV interface.
// Cast your cursor to *MdbxCursor to use this method.
// Return error - if provided data will not sorted (or bucket have old records which mess with new in sorting manner).
func (c *MdbxCursor) Append(k []byte, v []byte) error {
	if c.bucketCfg.AutoDupSortKeysConversion {
		b := c.bucketCfg
		from, to := b.DupFromLen, b.DupToLen
		if len(k) != from && len(k) >= to {
			return fmt.Errorf("append dupsort bucket: %s, can have keys of len==%d and len<%d. key: %x,%d", c.bucketName, from, to, k, len(k))
		}

		if len(k) == from {
			// NOTE(review): same k[to:] aliasing pattern as putDupSort - may
			// touch the caller's key buffer past len(k); confirm callers.
			v = append(k[to:], v...)
			k = k[:to]
		}
	}

	if c.bucketCfg.Flags&mdbx.DupSort != 0 {
		if err := c.c.Put(k, v, mdbx.AppendDup); err != nil {
			return fmt.Errorf("bucket: %s, %w", c.bucketName, err)
		}
		return nil
	}

	if err := c.c.Put(k, v, mdbx.Append); err != nil {
		return fmt.Errorf("bucket: %s, %w", c.bucketName, err)
	}
	return nil
}

// Close releases the underlying mdbx cursor and unregisters it from the tx.
// Safe to call more than once (c.c is nilled on first call).
func (c *MdbxCursor) Close() {
	if c.c != nil {
		c.c.Close()
		delete(c.tx.cursors, c.id)
		c.c = nil
	}
}

// MdbxDupSortCursor extends MdbxCursor with duplicate-aware operations
// for DupSort tables.
type MdbxDupSortCursor struct {
	*MdbxCursor
}

// Internal exposes the raw mdbx cursor.
func (c *MdbxDupSortCursor) Internal() *mdbx.Cursor {
	return c.c
}

// DeleteExact - does delete
func (c *MdbxDupSortCursor) DeleteExact(k1, k2 []byte) error {
	_, err := c.getBoth(k1, k2)
	if err != nil { // if key not found, or found another one - then nothing to delete
		if mdbx.IsNotFound(err) {
			return nil
		}
		return err
	}
	return c.delCurrent()
}

// SeekBothExact positions at the exact (key, value) pair;
// (nil, nil, nil) when the pair is absent.
func (c *MdbxDupSortCursor) SeekBothExact(key, value []byte) ([]byte, []byte, error) {
	v, err := c.getBoth(key, value)
	if err != nil {
		if mdbx.IsNotFound(err) {
			return nil, nil, nil
		}
		return []byte{}, nil, fmt.Errorf("in SeekBothExact: %w", err)
	}
	return key, v, nil
}

// SeekBothRange positions at key and the first duplicate >= value;
// nil when no such duplicate exists.
func (c *MdbxDupSortCursor) SeekBothRange(key, value []byte) ([]byte, error) {
	v, err := c.getBothRange(key, value)
	if err != nil {
		if mdbx.IsNotFound(err) {
			return nil, nil
		}
		return nil, fmt.Errorf("in SeekBothRange, table=%s: %w", c.bucketName, err)
	}
	return v, nil
}

// FirstDup returns the first duplicate value of the current key.
func (c *MdbxDupSortCursor) FirstDup() ([]byte, error) {
	v, err := c.firstDup()
	if err != nil {
		if mdbx.IsNotFound(err) {
			return nil, nil
		}
		return nil, fmt.Errorf("in FirstDup: %w", err)
	}
	return v, nil
}

// NextDup - iterate only over duplicates of current key
func (c *MdbxDupSortCursor) NextDup() ([]byte, []byte, error) {
	k, v, err := c.nextDup()
	if err != nil {
		if mdbx.IsNotFound(err) {
			return nil, nil, nil
		}
		return []byte{}, nil, fmt.Errorf("in NextDup: %w", err)
	}
	return k, v, nil
}

// NextNoDup - iterate with skipping all duplicates
func (c *MdbxDupSortCursor) NextNoDup() ([]byte, []byte, error) {
	k, v, err := c.nextNoDup()
	if err != nil {
		if mdbx.IsNotFound(err) {
			return nil, nil, nil
		}
		return []byte{}, nil, fmt.Errorf("in NextNoDup: %w", err)
	}
	return k, v, nil
}

// PrevDup steps back within the duplicates of the current key.
func (c *MdbxDupSortCursor) PrevDup() ([]byte, []byte, error) {
	k, v, err := c.prevDup()
	if err != nil {
		if mdbx.IsNotFound(err) {
			return nil, nil, nil
		}
		return []byte{}, nil, fmt.Errorf("in PrevDup: %w", err)
	}
	return k, v, nil
}

// PrevNoDup steps back to the previous distinct key, skipping duplicates.
func (c *MdbxDupSortCursor) PrevNoDup() ([]byte, []byte, error) {
	k, v, err := c.prevNoDup()
	if err != nil {
		if mdbx.IsNotFound(err) {
			return nil, nil, nil
		}
		return []byte{}, nil, fmt.Errorf("in PrevNoDup: %w", err)
	}
	return k, v, nil
}

// LastDup returns the last duplicate value of the current key.
func (c *MdbxDupSortCursor) LastDup() ([]byte, error) {
	v, err := c.lastDup()
	if err != nil {
		if mdbx.IsNotFound(err) {
			return nil, nil
		}
		return nil, fmt.Errorf("in LastDup: %w", err)
	}
	return v, nil
}

// Append requires both keys and duplicates to arrive in sorted order
// (mdbx.Append | mdbx.AppendDup).
func (c *MdbxDupSortCursor) Append(k []byte, v []byte) error {
	if err := c.c.Put(k, v, mdbx.Append|mdbx.AppendDup); err != nil {
		return fmt.Errorf("in Append: bucket=%s, %w", c.bucketName, err)
	}
	return nil
}

// AppendDup requires duplicates of the current key to arrive in sorted order.
func (c *MdbxDupSortCursor) AppendDup(k []byte, v []byte) error {
	if err := c.c.Put(k, v, mdbx.AppendDup); err != nil {
		return fmt.Errorf("in AppendDup: bucket=%s, %w", c.bucketName, err)
	}
	return nil
}

// PutNoDupData stores the pair only if the exact duplicate is absent (mdbx.NoDupData).
func (c *MdbxDupSortCursor) PutNoDupData(k, v []byte) error {
	if err := c.c.Put(k, v, mdbx.NoDupData); err != nil {
		return fmt.Errorf("in PutNoDupData: %w", err)
	}

	return nil
}

// DeleteCurrentDuplicates - delete all of the data items for the current key.
func (c *MdbxDupSortCursor) DeleteCurrentDuplicates() error {
	if err := c.delAllDupData(); err != nil {
		return fmt.Errorf("in DeleteCurrentDuplicates: %w", err)
	}
	return nil
}

// CountDuplicates returns the number of duplicates for the current key. See mdb_cursor_count
func (c *MdbxDupSortCursor) CountDuplicates() (uint64, error) {
	res, err := c.c.Count()
	if err != nil {
		return 0, fmt.Errorf("in CountDuplicates: %w", err)
	}
	return res, nil
}

// bucketSlice returns the table names of cfg sorted ascending
// (strings.Compare(...) < 0 is plain lexicographic order).
func bucketSlice(b kv.TableCfg) []string {
	buckets := make([]string, 0, len(b))
	for name := range b {
		buckets = append(buckets, name)
	}
	sort.Slice(buckets, func(i, j int) bool {
		return strings.Compare(buckets[i], buckets[j]) < 0
	})
	return buckets
}

// ForEach walks the table from fromPrefix to the end, calling walker for each pair.
func (tx *MdbxTx) ForEach(bucket string, fromPrefix []byte, walker func(k, v []byte) error) error {
	c, err := tx.Cursor(bucket)
	if err != nil {
		return err
	}
	defer c.Close()

	for k, v, err := c.Seek(fromPrefix); k != nil; k, v, err = c.Next() {
		if err != nil {
			return err
		}
		if err := walker(k, v); err != nil {
			return err
		}
	}
	return nil
}

// ForPrefix walks only the keys that start with prefix, stopping at the first
// key that does not.
func (tx *MdbxTx) ForPrefix(bucket string, prefix []byte, walker func(k, v []byte) error) error {
	c, err := tx.Cursor(bucket)
	if err != nil {
		return err
	}
	defer c.Close()

	for k, v, err := c.Seek(prefix); k != nil; k, v, err = c.Next() {
		if err != nil {
			return err
		}
		if !bytes.HasPrefix(k, prefix) {
			break
		}
		if err := walker(k, v); err != nil {
			return err
		}
	}
	return nil
}

func (tx
*MdbxTx) Prefix(table string, prefix []byte) (iter.KV, error) {
	// Prefix is Range(prefix, nextSubtree(prefix)); when prefix has no
	// successor (all 0xff) the range is unbounded on the right.
	nextPrefix, ok := kv.NextSubtree(prefix)
	if !ok {
		return tx.Range(table, prefix, nil)
	}
	return tx.Range(table, prefix, nextPrefix)
}

// Range iterates [fromPrefix, toPrefix) ascending, without limit.
func (tx *MdbxTx) Range(table string, fromPrefix, toPrefix []byte) (iter.KV, error) {
	return tx.RangeAscend(table, fromPrefix, toPrefix, -1)
}
func (tx *MdbxTx) RangeAscend(table string, fromPrefix, toPrefix []byte, limit int) (iter.KV, error) {
	return tx.rangeOrderLimit(table, fromPrefix, toPrefix, order.Asc, limit)
}
func (tx *MdbxTx) RangeDescend(table string, fromPrefix, toPrefix []byte, limit int) (iter.KV, error) {
	return tx.rangeOrderLimit(table, fromPrefix, toPrefix, order.Desc, limit)
}

// cursor2iter adapts a kv.Cursor to the iter.KV streaming interface.
// It prefetches one pair (nextK/nextV/err) so HasNext can answer without
// advancing; limit < 0 means unlimited.
type cursor2iter struct {
	c                                  kv.Cursor
	fromPrefix, toPrefix, nextK, nextV []byte
	err                                error
	orderAscend                        order.By
	limit                              int64
	ctx                                context.Context
}

func (tx *MdbxTx) rangeOrderLimit(table string, fromPrefix, toPrefix []byte, orderAscend order.By, limit int) (*cursor2iter, error) {
	s := &cursor2iter{ctx: tx.ctx, fromPrefix: fromPrefix, toPrefix: toPrefix, orderAscend: orderAscend, limit: int64(limit)}
	// registered on the tx so the stream is closed with the transaction
	tx.streams = append(tx.streams, s)
	return s.init(table, tx)
}

// init validates the bounds, opens the cursor and prefetches the first pair.
func (s *cursor2iter) init(table string, tx kv.Tx) (*cursor2iter, error) {
	if s.orderAscend && s.fromPrefix != nil && s.toPrefix != nil && bytes.Compare(s.fromPrefix, s.toPrefix) >= 0 {
		return s, fmt.Errorf("tx.Dual: %x must be lexicographicaly before %x", s.fromPrefix, s.toPrefix)
	}
	if !s.orderAscend && s.fromPrefix != nil && s.toPrefix != nil && bytes.Compare(s.fromPrefix, s.toPrefix) <= 0 {
		return s, fmt.Errorf("tx.Dual: %x must be lexicographicaly before %x", s.toPrefix, s.fromPrefix)
	}
	c, err := tx.Cursor(table)
	if err != nil {
		return s, err
	}
	s.c = c

	if s.fromPrefix == nil { // no initial position
		if s.orderAscend {
			s.nextK, s.nextV, s.err = s.c.First()
		} else {
			s.nextK, s.nextV, s.err = s.c.Last()
		}
		return s, s.err
	}

	if s.orderAscend {
		s.nextK, s.nextV, s.err = s.c.Seek(s.fromPrefix)
		return s, s.err
	} else {
		// seek exactly to given key or previous one
		s.nextK, s.nextV, s.err = s.c.SeekExact(s.fromPrefix)
		if s.err != nil {
			return s, s.err
		}
		if s.nextK != nil { // go to last value of this key
			if casted, ok := s.c.(kv.CursorDupSort); ok {
				s.nextV, s.err = casted.LastDup()
			}
		} else { // key not found, go to prev one
			s.nextK, s.nextV, s.err = s.c.Prev()
		}
		return s, s.err
	}
}

func (s *cursor2iter) Close() {
	if s.c != nil {
		s.c.Close()
	}
}
func (s *cursor2iter) HasNext() bool {
	if s.err != nil { // always true, then .Next() call will return this error
		return true
	}
	if s.limit == 0 { // limit reached
		return false
	}
	if s.nextK == nil { // EndOfTable
		return false
	}
	if s.toPrefix == nil { // s.nextK == nil check is above
		return true
	}

	// half-open interval: ascending iterates [from, to), descending (to, from]
	cmp := bytes.Compare(s.nextK, s.toPrefix)
	return (bool(s.orderAscend) && cmp < 0) || (!bool(s.orderAscend) && cmp > 0)
}
func (s *cursor2iter) Next() (k, v []byte, err error) {
	select {
	case <-s.ctx.Done():
		return nil, nil, s.ctx.Err()
	default:
	}
	s.limit--
	// return the prefetched pair, then prefetch the next one
	k, v, err = s.nextK, s.nextV, s.err
	if s.orderAscend {
		s.nextK, s.nextV, s.err = s.c.Next()
	} else {
		s.nextK, s.nextV, s.err = s.c.Prev()
	}
	return k, v, err
}

// RangeDupSort streams the duplicates of one key in [fromPrefix, toPrefix),
// ascending or descending, up to limit entries.
func (tx *MdbxTx) RangeDupSort(table string, key []byte, fromPrefix, toPrefix []byte, asc order.By, limit int) (iter.KV, error) {
	s := &cursorDup2iter{ctx: tx.ctx, key: key, fromPrefix: fromPrefix, toPrefix: toPrefix, orderAscend: bool(asc), limit: int64(limit)}
	tx.streams = append(tx.streams, s)
	return s.init(table, tx)
}

// cursorDup2iter is cursor2iter's sibling for iterating the duplicate values
// of a single fixed key; the prefetched state is nextV only.
type cursorDup2iter struct {
	c                           kv.CursorDupSort
	key                         []byte
	fromPrefix, toPrefix, nextV []byte
	err                         error
	orderAscend                 bool
	limit                       int64
	ctx                         context.Context
}

func (s *cursorDup2iter) init(table string, tx kv.Tx) (*cursorDup2iter, error) {
	if s.orderAscend && s.fromPrefix != nil && s.toPrefix != nil && bytes.Compare(s.fromPrefix, s.toPrefix) >= 0 {
		return s, fmt.Errorf("tx.Dual: %x must be lexicographicaly before %x", s.fromPrefix, s.toPrefix)
	}
	if !s.orderAscend && s.fromPrefix != nil && s.toPrefix != nil && bytes.Compare(s.fromPrefix, s.toPrefix) <= 0 {
		return s, fmt.Errorf("tx.Dual: %x must be lexicographicaly before %x", s.toPrefix, s.fromPrefix)
	}
	c, err := tx.CursorDupSort(table)
	if err != nil {
		return s, err
	}
	s.c = c
	k, _, err := c.SeekExact(s.key)
	if err != nil {
		return s, err
	}
	if k == nil {
		// key absent: empty stream (nextV stays nil)
		return s, nil
	}

	if s.fromPrefix == nil { // no initial position
		if s.orderAscend {
			s.nextV, s.err = s.c.FirstDup()
		} else {
			s.nextV, s.err = s.c.LastDup()
		}
		return s, s.err
	}

	if s.orderAscend {
		s.nextV, s.err = s.c.SeekBothRange(s.key, s.fromPrefix)
		return s, s.err
	} else {
		// seek exactly to given key or previous one
		_, s.nextV, s.err = s.c.SeekBothExact(s.key, s.fromPrefix)
		if s.nextV == nil { // no such key
			_, s.nextV, s.err = s.c.PrevDup()
		}
		return s, s.err
	}
}

func (s *cursorDup2iter) Close() {
	if s.c != nil {
		s.c.Close()
	}
}
func (s *cursorDup2iter) HasNext() bool {
	if s.err != nil { // always true, then .Next() call will return this error
		return true
	}
	if s.limit == 0 { // limit reached
		return false
	}
	if s.nextV == nil { // EndOfTable
		return false
	}
	if s.toPrefix == nil { // s.nextV == nil check is above
		return true
	}

	// half-open interval over duplicate values, same convention as cursor2iter
	cmp := bytes.Compare(s.nextV, s.toPrefix)
	return (s.orderAscend && cmp < 0) || (!s.orderAscend && cmp > 0)
}
func (s *cursorDup2iter) Next() (k, v []byte, err error) {
	select {
	case <-s.ctx.Done():
		return nil, nil, s.ctx.Err()
	default:
	}
	s.limit--
	v, err = s.nextV, s.err
	if s.orderAscend {
		_, s.nextV, s.err = s.c.NextDup()
	} else {
		_, s.nextV, s.err = s.c.PrevDup()
	}
	// the key is fixed for the whole stream
	return s.key, v, err
}

// ForAmount walks at most amount entries starting at fromPrefix.
func (tx *MdbxTx) ForAmount(bucket string, fromPrefix []byte, amount uint32, walker func(k, v []byte) error) error {
	if amount == 0 {
		return nil
	}
	c, err := tx.Cursor(bucket)
	if err != nil {
		return err
	}
	defer c.Close()

	for k, v, err := c.Seek(fromPrefix); k != nil && amount > 0; k, v, err = c.Next() {
		if err != nil {
			return err
		}
		if err := walker(k, v); err != nil {
			return err
		}
		amount--
	}
	return nil
}