github.com/coocood/badger@v1.5.1-0.20200528065104-c02ac3616d04/transaction.go (about) 1 /* 2 * Copyright 2017 Dgraph Labs, Inc. and Contributors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package badger 18 19 import ( 20 "bytes" 21 "fmt" 22 "math" 23 "sort" 24 "strconv" 25 "sync" 26 "sync/atomic" 27 28 "github.com/coocood/badger/epoch" 29 "github.com/coocood/badger/y" 30 "github.com/dgryski/go-farm" 31 "github.com/pingcap/errors" 32 ) 33 34 type oracle struct { 35 // curRead must be at the top for memory alignment. See issue #311. 36 curRead uint64 // Managed by the mutex. 37 refCount int64 38 isManaged bool // Does not change value, so no locking required. 39 40 sync.Mutex 41 writeLock sync.Mutex 42 nextCommit uint64 43 44 // commits stores a key fingerprint and latest commit counter for it. 45 // refCount is used to clear out commits map to avoid a memory blowup. 46 commits map[uint64]uint64 47 } 48 49 func (o *oracle) addRef() { 50 atomic.AddInt64(&o.refCount, 1) 51 } 52 53 func (o *oracle) decrRef() { 54 if count := atomic.AddInt64(&o.refCount, -1); count == 0 { 55 // Clear out commits maps to release memory. 56 o.Lock() 57 // Avoids the race where something new is added to commitsMap 58 // after we check refCount and before we take Lock. 59 if atomic.LoadInt64(&o.refCount) != 0 { 60 o.Unlock() 61 return 62 } 63 if len(o.commits) >= 1000 { // If the map is still small, let it slide. 64 o.commits = make(map[uint64]uint64) 65 } 66 o.Unlock() 67 } 68 } 69 70 func (o *oracle) readTs() uint64 { 71 if o.isManaged { 72 return math.MaxUint64 73 } 74 return atomic.LoadUint64(&o.curRead) 75 } 76 77 func (o *oracle) commitTs() uint64 { 78 o.Lock() 79 defer o.Unlock() 80 return o.nextCommit 81 } 82 83 // hasConflict must be called while having a lock. 84 func (o *oracle) hasConflict(txn *Txn) bool { 85 if len(txn.reads) == 0 { 86 return false 87 } 88 for _, ro := range txn.reads { 89 if ts, has := o.commits[ro]; has && ts > txn.readTs { 90 return true 91 } 92 } 93 return false 94 } 95 96 func (o *oracle) newCommitTs(txn *Txn) uint64 { 97 o.Lock() 98 defer o.Unlock() 99 100 if o.hasConflict(txn) { 101 return 0 102 } 103 104 var ts uint64 105 if !o.isManaged { 106 // This is the general case, when user doesn't specify the read and commit ts. 107 ts = o.nextCommit 108 o.nextCommit++ 109 110 } else { 111 // If commitTs is set, use it instead. 112 ts = txn.commitTs 113 } 114 115 for _, w := range txn.writes { 116 o.commits[w] = ts // Update the commitTs. 117 } 118 return ts 119 } 120 121 func (o *oracle) allocTs() uint64 { 122 o.Lock() 123 ts := o.nextCommit 124 o.nextCommit++ 125 o.Unlock() 126 return ts 127 } 128 129 func (o *oracle) doneCommit(cts uint64) { 130 if o.isManaged { 131 // No need to update anything. 132 return 133 } 134 135 for { 136 curRead := atomic.LoadUint64(&o.curRead) 137 if cts <= curRead { 138 return 139 } 140 atomic.CompareAndSwapUint64(&o.curRead, curRead, cts) 141 } 142 } 143 144 // Txn represents a Badger transaction. 145 type Txn struct { 146 readTs uint64 147 commitTs uint64 148 149 update bool // update is used to conditionally keep track of reads. 150 reads []uint64 // contains fingerprints of keys read. 151 writes []uint64 // contains fingerprints of keys written. 152 153 pendingWrites map[string]*Entry // cache stores any writes done by txn. 154 155 db *DB 156 discarded bool 157 guard *epoch.Guard 158 159 size int64 160 count int64 161 numIterators int32 162 blobCache map[uint32]*blobCache 163 } 164 165 type pendingWritesIterator struct { 166 entries []*Entry 167 nextIdx int 168 readTs uint64 169 reversed bool 170 } 171 172 func (pi *pendingWritesIterator) Next() { 173 pi.nextIdx++ 174 } 175 176 func (pi *pendingWritesIterator) NextVersion() bool { 177 // We do not support adding multiple versions in a transaction. 178 return false 179 } 180 181 func (pi *pendingWritesIterator) Rewind() { 182 pi.nextIdx = 0 183 } 184 185 func (pi *pendingWritesIterator) Seek(key []byte) { 186 pi.nextIdx = sort.Search(len(pi.entries), func(idx int) bool { 187 cmp := bytes.Compare(pi.entries[idx].Key.UserKey, key) 188 if !pi.reversed { 189 return cmp >= 0 190 } 191 return cmp <= 0 192 }) 193 } 194 195 func (pi *pendingWritesIterator) Key() y.Key { 196 y.Assert(pi.Valid()) 197 entry := pi.entries[pi.nextIdx] 198 return y.KeyWithTs(entry.Key.UserKey, pi.readTs) 199 } 200 201 func (pi *pendingWritesIterator) Value() y.ValueStruct { 202 y.Assert(pi.Valid()) 203 entry := pi.entries[pi.nextIdx] 204 return y.ValueStruct{ 205 Value: entry.Value, 206 Meta: entry.meta, 207 UserMeta: entry.UserMeta, 208 Version: pi.readTs, 209 } 210 } 211 212 func (pi *pendingWritesIterator) FillValue(vs *y.ValueStruct) { 213 entry := pi.entries[pi.nextIdx] 214 vs.Value = entry.Value 215 vs.Meta = entry.meta 216 vs.UserMeta = entry.UserMeta 217 vs.Version = pi.readTs 218 } 219 220 func (pi *pendingWritesIterator) Valid() bool { 221 return pi.nextIdx < len(pi.entries) 222 } 223 224 func (txn *Txn) newPendingWritesIterator(reversed bool) *pendingWritesIterator { 225 if !txn.update || len(txn.pendingWrites) == 0 { 226 return nil 227 } 228 entries := make([]*Entry, 0, len(txn.pendingWrites)) 229 for _, e := range txn.pendingWrites { 230 entries = append(entries, e) 231 } 232 // Number of pending writes per transaction shouldn't be too big in general. 233 sort.Slice(entries, func(i, j int) bool { 234 cmp := entries[i].Key.Compare(entries[j].Key) 235 if !reversed { 236 return cmp < 0 237 } 238 return cmp > 0 239 }) 240 return &pendingWritesIterator{ 241 readTs: txn.readTs, 242 entries: entries, 243 reversed: reversed, 244 } 245 } 246 247 func (txn *Txn) checkSize(e *Entry) error { 248 if len(e.UserMeta) > 255 { 249 return ErrUserMetaTooLarge 250 } 251 // Extra bytes for version in key. 252 size := int64(e.estimateSize()) + 10 253 if size >= txn.db.opt.MaxMemTableSize { 254 return ErrTxnTooBig 255 } 256 txn.count++ 257 txn.size += size 258 return nil 259 } 260 261 // Set adds a key-value pair to the database. 262 // 263 // It will return ErrReadOnlyTxn if update flag was set to false when creating the 264 // transaction. 265 func (txn *Txn) Set(key, val []byte) error { 266 if txn.db.IsManaged() { 267 return ErrManagedTxn 268 } 269 e := &Entry{ 270 Key: y.KeyWithTs(key, 0), 271 Value: val, 272 } 273 return txn.SetEntry(e) 274 } 275 276 // SetWithMeta adds a key-value pair to the database, along with a metadata 277 // byte. This byte is stored alongside the key, and can be used as an aid to 278 // interpret the value or store other contextual bits corresponding to the 279 // key-value pair. 280 func (txn *Txn) SetWithMeta(key, val []byte, meta byte) error { 281 if txn.db.IsManaged() { 282 return ErrManagedTxn 283 } 284 e := &Entry{Key: y.KeyWithTs(key, 0), Value: val, UserMeta: []byte{meta}} 285 return txn.SetEntry(e) 286 } 287 288 func (txn *Txn) SetWithMetaSlice(key, val, meta []byte) error { 289 if txn.db.IsManaged() { 290 return ErrManagedTxn 291 } 292 e := &Entry{Key: y.KeyWithTs(key, 0), Value: val, UserMeta: meta} 293 return txn.SetEntry(e) 294 } 295 296 func (txn *Txn) modify(e *Entry) error { 297 if !txn.update { 298 return ErrReadOnlyTxn 299 } else if txn.discarded { 300 return ErrDiscardedTxn 301 } else if e.Key.IsEmpty() { 302 return ErrEmptyKey 303 } else if e.Key.Len() > maxKeySize { 304 return exceedsMaxKeySizeError(e.Key.UserKey) 305 } else if int64(len(e.Value)) > txn.db.opt.ValueLogFileSize { 306 return exceedsMaxValueSizeError(e.Value, txn.db.opt.ValueLogFileSize) 307 } 308 if err := txn.checkSize(e); err != nil { 309 return err 310 } 311 312 fp := farm.Fingerprint64(e.Key.UserKey) // Avoid dealing with byte arrays. 313 txn.writes = append(txn.writes, fp) 314 txn.pendingWrites[string(e.Key.UserKey)] = e 315 return nil 316 } 317 318 // SetEntry takes an Entry struct and adds the key-value pair in the struct, along 319 // with other metadata to the database. 320 func (txn *Txn) SetEntry(e *Entry) error { 321 return txn.modify(e) 322 } 323 324 // Delete deletes a key. This is done by adding a delete marker for the key at commit timestamp. 325 // Any reads happening before this timestamp would be unaffected. Any reads after this commit would 326 // see the deletion. 327 func (txn *Txn) Delete(key []byte) error { 328 e := &Entry{ 329 Key: y.KeyWithTs(key, 0), 330 meta: bitDelete, 331 } 332 return txn.modify(e) 333 } 334 335 // Get looks for key and returns corresponding Item. 336 // If key is not found, ErrKeyNotFound is returned. 337 func (txn *Txn) Get(key []byte) (item *Item, rerr error) { 338 if len(key) == 0 { 339 return nil, ErrEmptyKey 340 } else if txn.discarded { 341 return nil, ErrDiscardedTxn 342 } 343 344 item = new(Item) 345 if txn.update { 346 if e, has := txn.pendingWrites[string(key)]; has && bytes.Equal(key, e.Key.UserKey) { 347 if isDeleted(e.meta) { 348 return nil, ErrKeyNotFound 349 } 350 // Fulfill from cache. 351 item.meta = e.meta 352 item.vptr = e.Value 353 item.userMeta = e.UserMeta 354 item.key.UserKey = key 355 item.key.Version = txn.readTs 356 // We probably don't need to set db on item here. 357 return item, nil 358 } 359 // Only track reads if this is update txn. No need to track read if txn serviced it 360 // internally. 361 fp := farm.Fingerprint64(key) 362 txn.reads = append(txn.reads, fp) 363 } 364 365 seek := y.KeyWithTs(key, txn.readTs) 366 var vs y.ValueStruct 367 for { 368 vs = txn.db.get(seek) 369 if !vs.Valid() { 370 return nil, ErrKeyNotFound 371 } 372 if isDeleted(vs.Meta) { 373 return nil, ErrKeyNotFound 374 } 375 break 376 } 377 378 item.key.UserKey = key 379 item.key.Version = vs.Version 380 item.meta = vs.Meta 381 item.userMeta = vs.UserMeta 382 item.db = txn.db 383 item.vptr = vs.Value 384 item.txn = txn 385 return item, nil 386 } 387 388 type keyValuePair struct { 389 key y.Key 390 hash uint64 391 val y.ValueStruct 392 found bool 393 } 394 395 // MultiGet gets items for keys, if not found, the corresponding item will be nil. 396 // It only supports read-only transaction for simplicity. 397 func (txn *Txn) MultiGet(keys [][]byte) (items []*Item, err error) { 398 if txn.update { 399 return nil, errors.New("not supported") 400 } 401 if txn.discarded { 402 return nil, ErrDiscardedTxn 403 } 404 keyValuePairs := make([]keyValuePair, len(keys)) 405 for i, key := range keys { 406 if len(key) == 0 { 407 return nil, ErrEmptyKey 408 } 409 keyValuePairs[i].hash = farm.Fingerprint64(key) 410 keyValuePairs[i].key = y.KeyWithTs(key, txn.readTs) 411 } 412 txn.db.multiGet(keyValuePairs) 413 items = make([]*Item, len(keys)) 414 for i, pair := range keyValuePairs { 415 if pair.found && !isDeleted(pair.val.Meta) { 416 items[i] = &Item{ 417 key: y.Key{ 418 UserKey: keys[i], 419 Version: pair.val.Version, 420 }, 421 meta: pair.val.Meta, 422 userMeta: pair.val.UserMeta, 423 db: txn.db, 424 vptr: pair.val.Value, 425 txn: txn, 426 } 427 } 428 } 429 return items, nil 430 } 431 432 // Discard discards a created transaction. This method is very important and must be called. Commit 433 // method calls this internally, however, calling this multiple times doesn't cause any issues. So, 434 // this can safely be called via a defer right when transaction is created. 435 // 436 // NOTE: If any operations are run on a discarded transaction, ErrDiscardedTxn is returned. 437 func (txn *Txn) Discard() { 438 if txn.discarded { // Avoid a re-run. 439 return 440 } 441 if atomic.LoadInt32(&txn.numIterators) > 0 { 442 panic("Unclosed iterator at time of Txn.Discard.") 443 } 444 txn.discarded = true 445 txn.blobCache = nil 446 if txn.update { 447 txn.db.orc.decrRef() 448 } 449 txn.guard.Done() 450 } 451 452 // Commit commits the transaction, following these steps: 453 // 454 // 1. If there are no writes, return immediately. 455 // 456 // 2. Check if read rows were updated since txn started. If so, return ErrConflict. 457 // 458 // 3. If no conflict, generate a commit timestamp and update written rows' commit ts. 459 // 460 // 4. Batch up all writes, write them to value log and LSM tree. 461 // 462 // If error is nil, the transaction is successfully committed. In case of a non-nil error, the LSM 463 // tree won't be updated, so there's no need for any rollback. 464 func (txn *Txn) Commit() error { 465 if txn.discarded { 466 return ErrDiscardedTxn 467 } 468 defer txn.Discard() 469 if len(txn.writes) == 0 { 470 return nil // Nothing to do. 471 } 472 managed := txn.db.IsManaged() 473 entries := make([]*Entry, 0, len(txn.pendingWrites)+1) 474 for _, e := range txn.pendingWrites { 475 if managed && e.Key.Version == 0 { 476 return fmt.Errorf("version of key %x not specified for managed db", e.Key.UserKey) 477 } 478 e.meta |= bitTxn 479 entries = append(entries, e) 480 } 481 sort.Slice(entries, func(i, j int) bool { 482 return entries[i].Key.Compare(entries[j].Key) < 0 483 }) 484 var commitTs uint64 485 state := txn.db.orc 486 state.writeLock.Lock() 487 if !managed { 488 commitTs = state.newCommitTs(txn) 489 if commitTs == 0 { 490 state.writeLock.Unlock() 491 return ErrConflict 492 } 493 for _, e := range entries { 494 // Suffix the keys with commit ts, so the key versions are sorted in 495 // descending order of commit timestamp. 496 e.Key.Version = commitTs 497 } 498 } 499 // The txnKey entry is used for mark the transaction boundary, the value here is used for assertion. 500 e := &Entry{ 501 Key: y.KeyWithTs(txnKey, commitTs), 502 Value: []byte(strconv.FormatUint(commitTs, 10)), 503 meta: bitFinTxn, 504 } 505 entries = append(entries, e) 506 507 req, err := txn.db.sendToWriteCh(entries) 508 state.writeLock.Unlock() 509 if err != nil { 510 return err 511 } 512 513 req.Wait() 514 state.doneCommit(commitTs) 515 516 return nil 517 } 518 519 // NewTransaction creates a new transaction. Badger supports concurrent execution of transactions, 520 // providing serializable snapshot isolation, avoiding write skews. Badger achieves this by tracking 521 // the keys read and at Commit time, ensuring that these read keys weren't concurrently modified by 522 // another transaction. 523 // 524 // For read-only transactions, set update to false. In this mode, we don't track the rows read for 525 // any changes. Thus, any long running iterations done in this mode wouldn't pay this overhead. 526 // 527 // Running transactions concurrently is OK. However, a transaction itself isn't thread safe, and 528 // should only be run serially. It doesn't matter if a transaction is created by one goroutine and 529 // passed down to other, as long as the Txn APIs are called serially. 530 // 531 // When you create a new transaction, it is absolutely essential to call 532 // Discard(). This should be done irrespective of what the update param is set 533 // to. Commit API internally runs Discard, but running it twice wouldn't cause 534 // any issues. 535 // 536 // txn := db.NewTransaction(false) 537 // defer txn.Discard() 538 // // Call various APIs. 539 func (db *DB) NewTransaction(update bool) *Txn { 540 if db.opt.ReadOnly { 541 // DB is read-only, force read-only transaction. 542 update = false 543 } 544 readTs := db.orc.readTs() 545 txn := &Txn{ 546 update: update, 547 db: db, 548 count: 1, // One extra entry for BitFin. 549 size: int64(len(txnKey) + 10), // Some buffer for the extra entry. 550 readTs: readTs, 551 } 552 if !db.IsManaged() { 553 txn.guard = db.resourceMgr.AcquireWithPayload(readTs) 554 } else { 555 txn.guard = db.resourceMgr.Acquire() 556 } 557 if update { 558 txn.pendingWrites = make(map[string]*Entry) 559 txn.db.orc.addRef() 560 } 561 return txn 562 } 563 564 // View executes a function creating and managing a read-only transaction for the user. Error 565 // returned by the function is relayed by the View method. 566 func (db *DB) View(fn func(txn *Txn) error) error { 567 txn := db.NewTransaction(false) 568 if db.IsManaged() { 569 txn.SetReadTS(math.MaxUint64) 570 } 571 defer txn.Discard() 572 573 return fn(txn) 574 } 575 576 // SetReadTS reads the DB with a given TS, it can only be used in a managed DB. 577 func (txn *Txn) SetReadTS(readTS uint64) { 578 y.Assert(txn.db.IsManaged()) 579 txn.readTs = readTS 580 } 581 582 // Update executes a function, creating and managing a read-write transaction 583 // for the user. Error returned by the function is relayed by the Update method. 584 func (db *DB) Update(fn func(txn *Txn) error) error { 585 txn := db.NewTransaction(true) 586 defer txn.Discard() 587 588 if err := fn(txn); err != nil { 589 return err 590 } 591 592 return txn.Commit() 593 }