github.com/vescale/zgraph@v0.0.0-20230410094002-959c02d50f95/storage/transaction.go (about) 1 // Copyright 2022 zGraph Authors. All rights reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package storage 16 17 import ( 18 "context" 19 "fmt" 20 "math" 21 "sync" 22 "time" 23 24 "github.com/cenkalti/backoff" 25 "github.com/cockroachdb/pebble" 26 "github.com/vescale/zgraph/storage/kv" 27 "github.com/vescale/zgraph/storage/latch" 28 "github.com/vescale/zgraph/storage/mvcc" 29 "github.com/vescale/zgraph/storage/resolver" 30 "go.uber.org/atomic" 31 ) 32 33 // Txn represents a transaction implemented beyond the low-level key/value storage. 34 type Txn struct { 35 mu sync.Mutex 36 vp kv.VersionProvider 37 db *pebble.DB 38 us *UnionStore 39 latches *latch.LatchesScheduler 40 resolver *resolver.Scheduler 41 valid bool 42 snapshot kv.Snapshot 43 startTime time.Time 44 startVer kv.Version 45 commitVer kv.Version 46 setCnt int64 47 lockedCnt int 48 } 49 50 // Get implements the Transaction interface. 51 func (txn *Txn) Get(ctx context.Context, k kv.Key) ([]byte, error) { 52 return txn.us.Get(ctx, k) 53 } 54 55 // Iter creates an Iterator positioned on the first entry that k <= entry's key. 56 // If such entry is not found, it returns an invalid Iterator with no error. 57 // It yields only keys that < upperBound. If upperBound is nil, it means the upperBound is unbounded. 58 // The Iterator must be Closed after use. 59 func (txn *Txn) Iter(lowerBound, upperBound kv.Key) (kv.Iterator, error) { 60 return txn.us.Iter(lowerBound, upperBound) 61 } 62 63 // IterReverse creates a reversed Iterator positioned on the first entry which key is less than k. 64 func (txn *Txn) IterReverse(lowerBound, upperBound kv.Key) (kv.Iterator, error) { 65 return txn.us.IterReverse(lowerBound, upperBound) 66 } 67 68 // Set implements the Transaction interface. 69 // It sets the value for key k as v into kv store. 70 // v must NOT be nil or empty, otherwise it returns ErrCannotSetNilValue. 71 func (txn *Txn) Set(k kv.Key, v []byte) error { 72 txn.setCnt++ 73 return txn.us.MemBuffer().Set(k, v) 74 } 75 76 // Delete implements the Transaction interface. It removes the entry for key k from kv store. 77 func (txn *Txn) Delete(k kv.Key) error { 78 return txn.us.MemBuffer().Delete(k) 79 } 80 81 // StartVer implements the Transaction interface. 82 func (txn *Txn) StartVer() kv.Version { 83 return txn.startVer 84 } 85 86 // Snapshot implements the Transaction interface. 87 func (txn *Txn) Snapshot() kv.Snapshot { 88 return txn.snapshot 89 } 90 91 // BatchGet implements the Transaction interface. 92 // It gets kv from the memory buffer of statement and transaction, and the kv storage. 93 // Do not use len(value) == 0 or value == nil to represent non-exist. 94 // If a key doesn't exist, there shouldn't be any corresponding entry in the result map. 95 func (txn *Txn) BatchGet(ctx context.Context, keys []kv.Key) (map[string][]byte, error) { 96 return NewBufferBatchGetter(txn.us.MemBuffer(), txn.Snapshot()).BatchGet(ctx, keys) 97 } 98 99 // Size implements the Transaction interface. It returns sum of keys and values length. 100 func (txn *Txn) Size() int { 101 return txn.us.MemBuffer().Size() 102 } 103 104 // Len implements the Transaction interface. It returns the number of entries in the DB. 105 func (txn *Txn) Len() int { 106 return txn.us.MemBuffer().Len() 107 } 108 109 // Reset implements the Transaction interface. It resets the Transaction to initial states. 110 func (txn *Txn) Reset() { 111 txn.us.MemBuffer().Reset() 112 } 113 114 func (txn *Txn) Commit(_ context.Context) error { 115 if !txn.valid { 116 return kv.ErrInvalidTxn 117 } 118 defer txn.close() 119 120 // Sanity check for start timestamp of the current transaction. 121 if txn.startVer == mvcc.LockVer { 122 return kv.ErrInvalidStartVer 123 } 124 125 committer := &committer{ 126 db: txn.db, 127 memDB: txn.us.MemBuffer(), 128 resolver: txn.resolver, 129 startVer: txn.startVer, 130 } 131 err := committer.init(txn.startTime) 132 if err != nil { 133 return err 134 } 135 if committer.length() == 0 { 136 return nil 137 } 138 keys := committer.keys() 139 140 err = backoff.RetryNotify(func() error { 141 // Note: don't use `defer txn.latches.UnLock(lock)` here. we need to keep the 142 // lock fine-grain. 143 // Because the subsequent routine may time-consumed: 144 // - CheckTxnStatus: will be slow if the IO usage is high. 145 // - Resolve: will block if the worker queue full. 146 lock := txn.latches.Lock(txn.startVer, keys) 147 err := committer.prepare() 148 errg, ok := err.(*kv.ErrGroup) 149 if !ok { 150 txn.latches.UnLock(lock) 151 return err 152 } 153 // Prepare transaction successfully means all lock are written into the low-level 154 // storage. 155 if len(errg.Errors) == 0 { 156 commitVer := txn.vp.CurrentVersion() 157 txn.commitVer = commitVer 158 committer.commitVer = commitVer 159 lock.SetCommitVer(commitVer) 160 txn.latches.UnLock(lock) 161 return nil 162 } 163 txn.latches.UnLock(lock) 164 165 rollbacks := map[kv.Version][]kv.Key{} 166 committed := map[kv.VersionPair][]kv.Key{} 167 for _, err := range errg.Errors { 168 // Try to resolve keys locked error. 169 lockedErr, ok := err.(*mvcc.LockedError) 170 if !ok { 171 return &backoff.PermanentError{Err: err} 172 } 173 174 status, err := resolver.CheckTxnStatus(txn.db, txn.vp, lockedErr.Primary, lockedErr.StartVer) 175 if err != nil { 176 return &backoff.PermanentError{Err: err} 177 } 178 switch status.Action { 179 case resolver.TxnActionNone: 180 // Transaction is still alive and try it letter. 181 continue 182 183 case resolver.TxnActionTTLExpireRollback, 184 resolver.TxnActionLockNotExistRollback: 185 // Resolve the current key. 186 rollbacks[lockedErr.StartVer] = append(rollbacks[lockedErr.StartVer], lockedErr.Key) 187 continue 188 189 default: 190 // TxnActionLockNotExistDoNothing 191 // Transaction committed: we try to resolve the current key and backoff. 192 pair := kv.VersionPair{StartVer: lockedErr.StartVer, CommitVer: status.CommitVer} 193 committed[pair] = append(committed[pair], lockedErr.Key) 194 continue 195 } 196 } 197 198 if len(rollbacks) > 0 { 199 for startVer, keys := range rollbacks { 200 txn.resolver.Resolve(keys, startVer, 0, nil) 201 committer.resolved = append(committer.resolved, startVer) 202 } 203 } 204 if len(committed) > 0 { 205 for pair, keys := range committed { 206 txn.resolver.Resolve(keys, pair.StartVer, pair.CommitVer, nil) 207 } 208 } 209 210 return resolver.ErrRetryable("resolving locks in transaction prepare staging") 211 }, expoBackoff(), BackoffErrReporter("committer.execute")) 212 if err != nil { 213 return err 214 } 215 216 return committer.commit() 217 } 218 219 // Rollback implements the Transaction interface. It undoes the transaction operations to KV store. 220 func (txn *Txn) Rollback() error { 221 if !txn.valid { 222 return kv.ErrInvalidTxn 223 } 224 txn.close() 225 return nil 226 } 227 228 // String implements fmt.Stringer interface. 229 func (txn *Txn) String() string { 230 return fmt.Sprintf("%d", txn.startVer) 231 } 232 233 func (txn *Txn) close() { 234 txn.valid = false 235 } 236 237 // committer represents the transaction 2 phase committer. It will calculate the 238 // mutations and apply to the low-level storage. 239 type committer struct { 240 db *pebble.DB 241 memDB *MemDB 242 resolver *resolver.Scheduler 243 startVer kv.Version 244 commitVer kv.Version 245 resolved []kv.Version 246 primaryIdx int 247 primaryKey kv.Key 248 lockTTL uint64 249 handles []MemKeyHandle 250 251 // counter of mutations 252 size, putCnt, delCnt, lockCnt, checkCnt int 253 254 // The commit status 255 mu struct { 256 sync.RWMutex 257 undeterminedErr error // undeterminedErr saves the rpc error we encounter when commit primary key. 258 committed bool 259 } 260 } 261 262 // init initializes the keys and mutations. 263 func (c *committer) init(startTime time.Time) error { 264 // Foreach all the changes cached in the memory buffer and build the mutations. 265 var err error 266 for it := c.memDB.IterWithFlags(nil, nil); it.Valid(); err = it.Next() { 267 // TODO: handle error properly 268 _ = err 269 270 var ( 271 key = it.Key() 272 flags = it.Flags() 273 value []byte 274 op mvcc.Op 275 ) 276 277 if !it.HasValue() { 278 if !flags.HasLocked() { 279 continue 280 } 281 op = mvcc.Op_Lock 282 c.lockCnt++ 283 } else { 284 value = it.Value() 285 if len(value) > 0 { 286 op = mvcc.Op_Put 287 if flags.HasPresumeKeyNotExists() { 288 op = mvcc.Op_Insert 289 } 290 c.putCnt++ 291 } else if flags.HasPresumeKeyNotExists() { 292 // delete-your-writes keys in optimistic txn need check not exists in prewrite-phase 293 // due to `Op_CheckNotExists` doesn't prewrite lock, so mark those keys should not be used in commit-phase. 294 op = mvcc.Op_CheckNotExists 295 c.checkCnt++ 296 c.memDB.UpdateFlags(key, kv.SetPrewriteOnly) 297 } else if flags.HasNewlyInserted() { 298 // The delete-your-write keys in pessimistic transactions, only lock needed keys and skip 299 // other deletes for example the secondary index delete. 300 // Here if `tidb_constraint_check_in_place` is enabled and the transaction is in optimistic mode, 301 // the logic is same as the pessimistic mode. 302 if flags.HasLocked() { 303 op = mvcc.Op_Lock 304 c.lockCnt++ 305 } else { 306 continue 307 } 308 } else { 309 op = mvcc.Op_Del 310 c.delCnt++ 311 } 312 313 handle := it.Handle() 314 handle.op = op 315 handle.flags = flags 316 c.handles = append(c.handles, handle) 317 c.size += len(key) + len(value) 318 } 319 320 // Choose the first valid key as the primary key of the current transaction. 321 if len(c.primaryKey) == 0 && op != mvcc.Op_CheckNotExists { 322 c.primaryIdx = len(c.handles) - 1 323 c.primaryKey = key 324 } 325 } 326 327 if len(c.handles) == 0 { 328 return nil 329 } 330 c.lockTTL = txnLockTTL(startTime, c.size) 331 332 return nil 333 } 334 335 func (c *committer) length() int { 336 return len(c.handles) 337 } 338 339 // keys returns keys of all mutations in the current transaction. 340 func (c *committer) keys() []kv.Key { 341 keys := make([]kv.Key, len(c.handles)) 342 for i, h := range c.handles { 343 keys[i] = c.memDB.GetKeyByHandle(h) 344 } 345 return keys 346 } 347 348 // prepare implements the first stage of 2PC transaction model. 349 func (c *committer) prepare() error { 350 var ( 351 errs []error 352 batch = c.db.NewBatch() 353 primaryKey = c.primaryKey 354 startVer = c.startVer 355 resolved = c.resolved 356 ) 357 defer batch.Close() 358 359 for _, h := range c.handles { 360 op := h.op 361 key := c.memDB.GetKeyByHandle(h) 362 enc := mvcc.Encode(key, mvcc.LockVer) 363 opt := pebble.IterOptions{ 364 LowerBound: enc, 365 } 366 if op == mvcc.Op_Insert || op == mvcc.Op_CheckNotExists { 367 iter := c.db.NewIter(&opt) 368 iter.First() 369 val, err := getValue(iter, key, startVer, resolved) 370 _ = iter.Close() 371 if err != nil { 372 errs = append(errs, err) 373 continue 374 } 375 if val != nil { 376 err = &kv.ErrKeyAlreadyExist{ 377 Key: key, 378 } 379 errs = append(errs, err) 380 continue 381 } 382 } 383 if op == mvcc.Op_CheckNotExists { 384 continue 385 } 386 387 err := func() error { 388 iter := c.db.NewIter(&opt) 389 iter.First() 390 defer iter.Close() 391 392 decoder := mvcc.LockDecoder{ExpectKey: key} 393 exists, err := decoder.Decode(iter) 394 if err != nil { 395 return err 396 } 397 398 // There is a lock exists. 399 if exists && decoder.Lock.StartVer != startVer { 400 return decoder.Lock.LockErr(key) 401 } 402 403 // Check conflicts 404 vdecoder := mvcc.ValueDecoder{ExpectKey: key} 405 exists, err = vdecoder.Decode(iter) 406 if err != nil { 407 return err 408 } 409 if exists && vdecoder.Value.CommitVer > startVer { 410 return &kv.ErrConflict{ 411 StartVer: startVer, 412 ConflictStartVer: vdecoder.Value.StartVer, 413 ConflictCommitVer: vdecoder.Value.CommitVer, 414 Key: key, 415 } 416 } 417 return nil 418 }() 419 if err != nil { 420 errs = append(errs, err) 421 continue 422 } 423 424 // Append the current row key into the write batch. 425 if op == mvcc.Op_Insert { 426 op = mvcc.Op_Put 427 } 428 val, _ := c.memDB.GetValueByHandle(h) 429 l := mvcc.Lock{ 430 StartVer: startVer, 431 Primary: primaryKey, 432 Value: val, 433 Op: op, 434 TTL: c.lockTTL, 435 } 436 writeVal, err := l.MarshalBinary() 437 if err != nil { 438 errs = append(errs, err) 439 continue 440 } 441 err = batch.Set(enc, writeVal, nil) 442 if err != nil { 443 errs = append(errs, err) 444 continue 445 } 446 } 447 448 // Commit the current write batch into the low-level storage engine. 449 if err := batch.Commit(nil); err != nil { 450 return err 451 } 452 453 return &kv.ErrGroup{Errors: errs} 454 } 455 456 // commit implements the second stage of 2PC transaction model. 457 func (c *committer) commit() error { 458 batch := c.db.NewBatch() 459 defer batch.Close() 460 461 // Commit primary key first. 462 err := resolver.Resolve(c.db, batch, c.primaryKey, c.startVer, c.commitVer) 463 if err != nil { 464 return err 465 } 466 err = batch.Commit(nil) 467 if err != nil { 468 return err 469 } 470 471 // The remained keys submit to resolver to resolve them asynchronously. 472 var remainedKeys []kv.Key 473 for i, h := range c.handles { 474 // The primary key had been committed. 475 if i == c.primaryIdx { 476 continue 477 } 478 if h.op == mvcc.Op_CheckNotExists { 479 continue 480 } 481 482 // Note: the keys stored in MemDB are reference to MemDB and its lifetime 483 // bound to the MemDB. We will release MemDB instance after the transaction 484 // committed. So we need to copy the keys, then submit them to the resolver. 485 key := c.memDB.GetKeyByHandle(h) 486 cpy := make(kv.Key, len(key)) 487 copy(cpy, key) 488 remainedKeys = append(remainedKeys, cpy) 489 } 490 c.resolver.Resolve(remainedKeys, c.startVer, c.commitVer, nil) 491 492 return nil 493 } 494 495 const bytesPerMiB = 1024 * 1024 496 497 // ttl = ttlFactor * sqrt(writeSizeInMiB) 498 var ttlFactor = 6000 499 500 // By default, locks after 3000ms is considered unusual (the client created the 501 // lock might be dead). Other client may clean up this kind of lock. 502 // For locks created recently, we will do backoff and retry. 503 var defaultLockTTL uint64 = 3000 504 505 // Global variable set by config file. 506 var ( 507 ManagedLockTTL uint64 = 20000 // 20s 508 ) 509 510 var ( 511 // PrewriteMaxBackoff is max sleep time of the `pre-write` command. 512 PrewriteMaxBackoff = atomic.NewUint64(40000) 513 // CommitMaxBackoff is max sleep time of the 'commit' command 514 CommitMaxBackoff = uint64(40000) 515 ) 516 517 func txnLockTTL(startTime time.Time, txnSize int) uint64 { 518 // Increase lockTTL for large transactions. 519 // The formula is `ttl = ttlFactor * sqrt(sizeInMiB)`. 520 // When writeSize is less than 256KB, the base ttl is defaultTTL (3s); 521 // When writeSize is 1MiB, 4MiB, or 10MiB, ttl is 6s, 12s, 20s correspondingly; 522 lockTTL := defaultLockTTL 523 if txnSize >= int(kv.TxnCommitBatchSize.Load()) { 524 sizeMiB := float64(txnSize) / bytesPerMiB 525 lockTTL = uint64(float64(ttlFactor) * math.Sqrt(sizeMiB)) 526 if lockTTL < defaultLockTTL { 527 lockTTL = defaultLockTTL 528 } 529 if lockTTL > ManagedLockTTL { 530 lockTTL = ManagedLockTTL 531 } 532 } 533 534 // Increase lockTTL by the transaction's read time. 535 // When resolving a lock, we compare current ver and startVer+lockTTL to decide whether to clean up. If a txn 536 // takes a long time to read, increasing its TTL will help to prevent it from been aborted soon after prewrite. 537 elapsed := time.Since(startTime) / time.Millisecond 538 return lockTTL + uint64(elapsed) 539 }