github.com/pingcap/badger@v1.5.1-0.20230103063557-828f39b09b6d/transaction.go (about) 1 /* 2 * Copyright 2017 Dgraph Labs, Inc. and Contributors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package badger 18 19 import ( 20 "bytes" 21 "fmt" 22 "math" 23 "sort" 24 "strconv" 25 "sync" 26 "sync/atomic" 27 28 "github.com/dgryski/go-farm" 29 "github.com/pingcap/badger/epoch" 30 "github.com/pingcap/badger/y" 31 "github.com/pingcap/errors" 32 ) 33 34 type oracle struct { 35 // curRead must be at the Top for memory alignment. See issue #311. 36 curRead uint64 // Managed by the mutex. 37 refCount int64 38 isManaged bool // Does not change value, so no locking required. 39 40 sync.Mutex 41 writeLock sync.Mutex 42 nextCommit uint64 43 44 // commits stores a key fingerprint and latest commit counter for it. 45 // refCount is used to clear out commits map to avoid a memory blowup. 46 commits map[uint64]uint64 47 } 48 49 func (o *oracle) addRef() { 50 atomic.AddInt64(&o.refCount, 1) 51 } 52 53 func (o *oracle) decrRef() { 54 if count := atomic.AddInt64(&o.refCount, -1); count == 0 { 55 // Clear out commits maps to release memory. 56 o.Lock() 57 // Avoids the race where something new is added to commitsMap 58 // after we check refCount and before we take Lock. 59 if atomic.LoadInt64(&o.refCount) != 0 { 60 o.Unlock() 61 return 62 } 63 if len(o.commits) >= 1000 { // If the map is still small, let it slide. 64 o.commits = make(map[uint64]uint64) 65 } 66 o.Unlock() 67 } 68 } 69 70 func (o *oracle) readTs() uint64 { 71 if o.isManaged { 72 return math.MaxUint64 73 } 74 return atomic.LoadUint64(&o.curRead) 75 } 76 77 func (o *oracle) commitTs() uint64 { 78 o.Lock() 79 defer o.Unlock() 80 return o.nextCommit 81 } 82 83 // hasConflict must be called while having a lock. 84 func (o *oracle) hasConflict(txn *Txn) bool { 85 if len(txn.reads) == 0 { 86 return false 87 } 88 for _, ro := range txn.reads { 89 if ts, has := o.commits[ro]; has && ts > txn.readTs { 90 return true 91 } 92 } 93 return false 94 } 95 96 func (o *oracle) newCommitTs(txn *Txn) uint64 { 97 o.Lock() 98 defer o.Unlock() 99 100 if o.hasConflict(txn) { 101 return 0 102 } 103 104 var ts uint64 105 if !o.isManaged { 106 // This is the general case, when user doesn't specify the read and commit ts. 107 ts = o.nextCommit 108 o.nextCommit++ 109 110 } else { 111 // If commitTs is set, use it instead. 112 ts = txn.commitTs 113 } 114 115 for _, w := range txn.writes { 116 o.commits[w] = ts // Update the commitTs. 117 } 118 return ts 119 } 120 121 func (o *oracle) allocTs() uint64 { 122 o.Lock() 123 ts := o.nextCommit 124 o.nextCommit++ 125 o.Unlock() 126 return ts 127 } 128 129 func (o *oracle) doneCommit(cts uint64) { 130 if o.isManaged { 131 // No need to update anything. 132 return 133 } 134 135 for { 136 curRead := atomic.LoadUint64(&o.curRead) 137 if cts <= curRead { 138 return 139 } 140 atomic.CompareAndSwapUint64(&o.curRead, curRead, cts) 141 } 142 } 143 144 // Txn represents a Badger transaction. 145 type Txn struct { 146 readTs uint64 147 commitTs uint64 148 149 update bool // update is used to conditionally keep track of reads. 150 reads []uint64 // contains fingerprints of keys read. 151 writes []uint64 // contains fingerprints of keys written. 152 153 pendingWrites map[string]*Entry // cache stores any writes done by txn. 154 155 db *DB 156 discarded bool 157 guard *epoch.Guard 158 159 size int64 160 count int64 161 numIterators int32 162 blobCache map[uint32]*blobCache 163 } 164 165 type pendingWritesIterator struct { 166 entries []*Entry 167 nextIdx int 168 readTs uint64 169 reversed bool 170 } 171 172 func (pi *pendingWritesIterator) Next() { 173 pi.nextIdx++ 174 } 175 176 func (pi *pendingWritesIterator) NextVersion() bool { 177 // We do not support adding multiple versions in a transaction. 178 return false 179 } 180 181 func (pi *pendingWritesIterator) Rewind() { 182 pi.nextIdx = 0 183 } 184 185 func (pi *pendingWritesIterator) Seek(key []byte) { 186 pi.nextIdx = sort.Search(len(pi.entries), func(idx int) bool { 187 cmp := bytes.Compare(pi.entries[idx].Key.UserKey, key) 188 if !pi.reversed { 189 return cmp >= 0 190 } 191 return cmp <= 0 192 }) 193 } 194 195 func (pi *pendingWritesIterator) Key() y.Key { 196 y.Assert(pi.Valid()) 197 entry := pi.entries[pi.nextIdx] 198 return y.KeyWithTs(entry.Key.UserKey, pi.readTs) 199 } 200 201 func (pi *pendingWritesIterator) Value() y.ValueStruct { 202 y.Assert(pi.Valid()) 203 entry := pi.entries[pi.nextIdx] 204 return y.ValueStruct{ 205 Value: entry.Value, 206 Meta: entry.meta, 207 UserMeta: entry.UserMeta, 208 Version: pi.readTs, 209 } 210 } 211 212 func (pi *pendingWritesIterator) FillValue(vs *y.ValueStruct) { 213 entry := pi.entries[pi.nextIdx] 214 vs.Value = entry.Value 215 vs.Meta = entry.meta 216 vs.UserMeta = entry.UserMeta 217 vs.Version = pi.readTs 218 } 219 220 func (pi *pendingWritesIterator) Valid() bool { 221 return pi.nextIdx < len(pi.entries) 222 } 223 224 func (pi *pendingWritesIterator) Close() error { 225 return nil 226 } 227 228 func (txn *Txn) newPendingWritesIterator(reversed bool) *pendingWritesIterator { 229 if !txn.update || len(txn.pendingWrites) == 0 { 230 return nil 231 } 232 entries := make([]*Entry, 0, len(txn.pendingWrites)) 233 for _, e := range txn.pendingWrites { 234 entries = append(entries, e) 235 } 236 // Number of pending writes per transaction shouldn't be too big in general. 237 sort.Slice(entries, func(i, j int) bool { 238 cmp := entries[i].Key.Compare(entries[j].Key) 239 if !reversed { 240 return cmp < 0 241 } 242 return cmp > 0 243 }) 244 return &pendingWritesIterator{ 245 readTs: txn.readTs, 246 entries: entries, 247 reversed: reversed, 248 } 249 } 250 251 func (txn *Txn) checkSize(e *Entry) error { 252 if len(e.UserMeta) > 255 { 253 return ErrUserMetaTooLarge 254 } 255 // Extra bytes for version in key. 256 size := int64(e.estimateSize()) + 10 257 if size >= txn.db.opt.MaxMemTableSize { 258 return ErrTxnTooBig 259 } 260 txn.count++ 261 txn.size += size 262 return nil 263 } 264 265 // Set adds a key-value pair to the database. 266 // 267 // It will return ErrReadOnlyTxn if update flag was set to false when creating the 268 // transaction. 269 func (txn *Txn) Set(key, val []byte) error { 270 if txn.db.IsManaged() { 271 return ErrManagedTxn 272 } 273 e := &Entry{ 274 Key: y.KeyWithTs(key, 0), 275 Value: val, 276 } 277 return txn.SetEntry(e) 278 } 279 280 // SetWithMeta adds a key-value pair to the database, along with a metadata 281 // byte. This byte is stored alongside the key, and can be used as an aid to 282 // interpret the value or store other contextual bits corresponding to the 283 // key-value pair. 284 func (txn *Txn) SetWithMeta(key, val []byte, meta byte) error { 285 if txn.db.IsManaged() { 286 return ErrManagedTxn 287 } 288 e := &Entry{Key: y.KeyWithTs(key, 0), Value: val, UserMeta: []byte{meta}} 289 return txn.SetEntry(e) 290 } 291 292 func (txn *Txn) SetWithMetaSlice(key, val, meta []byte) error { 293 if txn.db.IsManaged() { 294 return ErrManagedTxn 295 } 296 e := &Entry{Key: y.KeyWithTs(key, 0), Value: val, UserMeta: meta} 297 return txn.SetEntry(e) 298 } 299 300 func (txn *Txn) modify(e *Entry) error { 301 if !txn.update { 302 return ErrReadOnlyTxn 303 } else if txn.discarded { 304 return ErrDiscardedTxn 305 } else if e.Key.IsEmpty() { 306 return ErrEmptyKey 307 } else if e.Key.Len() > maxKeySize { 308 return exceedsMaxKeySizeError(e.Key.UserKey) 309 } else if int64(len(e.Value)) > txn.db.opt.ValueLogFileSize { 310 return exceedsMaxValueSizeError(e.Value, txn.db.opt.ValueLogFileSize) 311 } 312 if err := txn.checkSize(e); err != nil { 313 return err 314 } 315 316 fp := farm.Fingerprint64(e.Key.UserKey) // Avoid dealing with byte arrays. 317 txn.writes = append(txn.writes, fp) 318 txn.pendingWrites[string(e.Key.UserKey)] = e 319 return nil 320 } 321 322 // SetEntry takes an Entry struct and adds the key-value pair in the struct, along 323 // with other metadata to the database. 324 func (txn *Txn) SetEntry(e *Entry) error { 325 return txn.modify(e) 326 } 327 328 // Delete deletes a key. This is done by adding a delete marker for the key at commit timestamp. 329 // Any reads happening before this timestamp would be unaffected. Any reads after this commit would 330 // see the deletion. 331 func (txn *Txn) Delete(key []byte) error { 332 e := &Entry{ 333 Key: y.KeyWithTs(key, 0), 334 meta: bitDelete, 335 } 336 return txn.modify(e) 337 } 338 339 // Get looks for key and returns corresponding Item. 340 // If key is not found, ErrKeyNotFound is returned. 341 func (txn *Txn) Get(key []byte) (item *Item, rerr error) { 342 if len(key) == 0 { 343 return nil, ErrEmptyKey 344 } else if txn.discarded { 345 return nil, ErrDiscardedTxn 346 } 347 348 item = new(Item) 349 if txn.update { 350 if e, has := txn.pendingWrites[string(key)]; has && bytes.Equal(key, e.Key.UserKey) { 351 if isDeleted(e.meta) { 352 return nil, ErrKeyNotFound 353 } 354 // Fulfill from cache. 355 item.meta = e.meta 356 item.vptr = e.Value 357 item.userMeta = e.UserMeta 358 item.key.UserKey = key 359 item.key.Version = txn.readTs 360 // We probably don't need to set db on item here. 361 return item, nil 362 } 363 // Only track reads if this is update txn. No need to track read if txn serviced it 364 // internally. 365 fp := farm.Fingerprint64(key) 366 txn.reads = append(txn.reads, fp) 367 } 368 369 seek := y.KeyWithTs(key, txn.readTs) 370 var vs y.ValueStruct 371 for { 372 vs = txn.db.get(seek) 373 if !vs.Valid() { 374 return nil, ErrKeyNotFound 375 } 376 if isDeleted(vs.Meta) { 377 return nil, ErrKeyNotFound 378 } 379 break 380 } 381 382 item.key.UserKey = key 383 item.key.Version = vs.Version 384 item.meta = vs.Meta 385 item.userMeta = vs.UserMeta 386 item.db = txn.db 387 item.vptr = vs.Value 388 item.txn = txn 389 return item, nil 390 } 391 392 type keyValuePair struct { 393 key y.Key 394 hash uint64 395 val y.ValueStruct 396 found bool 397 } 398 399 // MultiGet gets items for keys, if not found, the corresponding item will be nil. 400 // It only supports read-only transaction for simplicity. 401 func (txn *Txn) MultiGet(keys [][]byte) (items []*Item, err error) { 402 if txn.update { 403 return nil, errors.New("not supported") 404 } 405 if txn.discarded { 406 return nil, ErrDiscardedTxn 407 } 408 keyValuePairs := make([]keyValuePair, len(keys)) 409 for i, key := range keys { 410 if len(key) == 0 { 411 return nil, ErrEmptyKey 412 } 413 keyValuePairs[i].hash = farm.Fingerprint64(key) 414 keyValuePairs[i].key = y.KeyWithTs(key, txn.readTs) 415 } 416 txn.db.multiGet(keyValuePairs) 417 items = make([]*Item, len(keys)) 418 for i, pair := range keyValuePairs { 419 if pair.found && !isDeleted(pair.val.Meta) { 420 items[i] = &Item{ 421 key: y.Key{ 422 UserKey: keys[i], 423 Version: pair.val.Version, 424 }, 425 meta: pair.val.Meta, 426 userMeta: pair.val.UserMeta, 427 db: txn.db, 428 vptr: pair.val.Value, 429 txn: txn, 430 } 431 } 432 } 433 return items, nil 434 } 435 436 // Discard discards a created transaction. This method is very important and must be called. Commit 437 // method calls this internally, however, calling this multiple times doesn't cause any issues. So, 438 // this can safely be called via a defer right when transaction is created. 439 // 440 // NOTE: If any operations are run on a discarded transaction, ErrDiscardedTxn is returned. 441 func (txn *Txn) Discard() { 442 if txn.discarded { // Avoid a re-run. 443 return 444 } 445 if atomic.LoadInt32(&txn.numIterators) > 0 { 446 panic("Unclosed iterator at time of Txn.Discard.") 447 } 448 txn.discarded = true 449 txn.blobCache = nil 450 if txn.update { 451 txn.db.orc.decrRef() 452 } 453 txn.guard.Done() 454 } 455 456 // Commit commits the transaction, following these steps: 457 // 458 // 1. If there are no writes, return immediately. 459 // 460 // 2. Check if read rows were updated since txn started. If so, return ErrConflict. 461 // 462 // 3. If no conflict, generate a commit timestamp and update written rows' commit ts. 463 // 464 // 4. Batch up all writes, write them to value log and LSM tree. 465 // 466 // If error is nil, the transaction is successfully committed. In case of a non-nil error, the LSM 467 // tree won't be updated, so there's no need for any rollback. 468 func (txn *Txn) Commit() error { 469 if txn.discarded { 470 return ErrDiscardedTxn 471 } 472 defer txn.Discard() 473 if len(txn.writes) == 0 { 474 return nil // Nothing to do. 475 } 476 managed := txn.db.IsManaged() 477 entries := make([]*Entry, 0, len(txn.pendingWrites)+1) 478 for _, e := range txn.pendingWrites { 479 if managed && e.Key.Version == 0 { 480 return fmt.Errorf("version of key %x not specified for managed db", e.Key.UserKey) 481 } 482 e.meta |= bitTxn 483 entries = append(entries, e) 484 } 485 sort.Slice(entries, func(i, j int) bool { 486 return entries[i].Key.Compare(entries[j].Key) < 0 487 }) 488 var commitTs uint64 489 state := txn.db.orc 490 state.writeLock.Lock() 491 if !managed { 492 commitTs = state.newCommitTs(txn) 493 if commitTs == 0 { 494 state.writeLock.Unlock() 495 return ErrConflict 496 } 497 for _, e := range entries { 498 // Suffix the keys with commit ts, so the key versions are sorted in 499 // descending order of commit timestamp. 500 e.Key.Version = commitTs 501 } 502 } 503 // The txnKey entry is used for mark the transaction boundary, the value here is used for assertion. 504 e := &Entry{ 505 Key: y.KeyWithTs(txnKey, commitTs), 506 Value: []byte(strconv.FormatUint(commitTs, 10)), 507 meta: bitFinTxn, 508 } 509 entries = append(entries, e) 510 511 req, err := txn.db.sendToWriteCh(entries) 512 state.writeLock.Unlock() 513 if err != nil { 514 return err 515 } 516 517 req.Wait() 518 state.doneCommit(commitTs) 519 520 return nil 521 } 522 523 // NewTransaction creates a new transaction. Badger supports concurrent execution of transactions, 524 // providing serializable snapshot isolation, avoiding write skews. Badger achieves this by tracking 525 // the keys read and at Commit time, ensuring that these read keys weren't concurrently modified by 526 // another transaction. 527 // 528 // For read-only transactions, set update to false. In this mode, we don't track the rows read for 529 // any changes. Thus, any long running iterations done in this mode wouldn't pay this overhead. 530 // 531 // Running transactions concurrently is OK. However, a transaction itself isn't thread safe, and 532 // should only be run serially. It doesn't matter if a transaction is created by one goroutine and 533 // passed down to other, as long as the Txn APIs are called serially. 534 // 535 // When you create a new transaction, it is absolutely essential to call 536 // Discard(). This should be done irrespective of what the update param is set 537 // to. Commit API internally runs Discard, but running it twice wouldn't cause 538 // any issues. 539 // 540 // txn := db.NewTransaction(false) 541 // defer txn.Discard() 542 // // Call various APIs. 543 func (db *DB) NewTransaction(update bool) *Txn { 544 if db.opt.ReadOnly { 545 // DB is read-only, force read-only transaction. 546 update = false 547 } 548 readTs := db.orc.readTs() 549 txn := &Txn{ 550 update: update, 551 db: db, 552 count: 1, // One extra entry for BitFin. 553 size: int64(len(txnKey) + 10), // Some buffer for the extra entry. 554 readTs: readTs, 555 } 556 if !db.IsManaged() { 557 txn.guard = db.resourceMgr.AcquireWithPayload(readTs) 558 } else { 559 txn.guard = db.resourceMgr.Acquire() 560 } 561 if update { 562 txn.pendingWrites = make(map[string]*Entry) 563 txn.db.orc.addRef() 564 } 565 return txn 566 } 567 568 // View executes a function creating and managing a read-only transaction for the user. Error 569 // returned by the function is relayed by the View method. 570 func (db *DB) View(fn func(txn *Txn) error) error { 571 txn := db.NewTransaction(false) 572 if db.IsManaged() { 573 txn.SetReadTS(math.MaxUint64) 574 } 575 defer txn.Discard() 576 577 return fn(txn) 578 } 579 580 // SetReadTS reads the DB with a given TS, it can only be used in a managed DB. 581 func (txn *Txn) SetReadTS(readTS uint64) { 582 y.Assert(txn.db.IsManaged()) 583 txn.readTs = readTS 584 } 585 586 // Update executes a function, creating and managing a read-write transaction 587 // for the user. Error returned by the function is relayed by the Update method. 588 func (db *DB) Update(fn func(txn *Txn) error) error { 589 txn := db.NewTransaction(true) 590 defer txn.Discard() 591 592 if err := fn(txn); err != nil { 593 return err 594 } 595 596 return txn.Commit() 597 }