go.etcd.io/etcd@v3.3.27+incompatible/mvcc/kvstore.go

// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mvcc

import (
	"context"
	"encoding/binary"
	"errors"
	"hash/crc32"
	"math"
	"sync"
	"sync/atomic"
	"time"

	"github.com/coreos/etcd/lease"
	"github.com/coreos/etcd/mvcc/backend"
	"github.com/coreos/etcd/mvcc/mvccpb"
	"github.com/coreos/etcd/pkg/schedule"
	"github.com/coreos/pkg/capnslog"
)

var (
	keyBucketName  = []byte("key")
	metaBucketName = []byte("meta")

	consistentIndexKeyName  = []byte("consistent_index")
	scheduledCompactKeyName = []byte("scheduledCompactRev")
	finishedCompactKeyName  = []byte("finishedCompactRev")

	ErrCompacted = errors.New("mvcc: required revision has been compacted")
	ErrFutureRev = errors.New("mvcc: required revision is a future revision")
	ErrCanceled  = errors.New("mvcc: watcher is canceled")
	ErrClosed    = errors.New("mvcc: closed")

	plog = capnslog.NewPackageLogger("github.com/coreos/etcd", "mvcc")
)

const (
	// markedRevBytesLen is the byte length of a marked revision.
	// The first `revBytesLen` bytes represent a normal revision; the last
	// byte is the mark.
	markedRevBytesLen      = revBytesLen + 1
	markBytePosition       = markedRevBytesLen - 1
	markTombstone     byte = 't'
)

var restoreChunkKeys = 10000 // non-const for testing

// ConsistentIndexGetter is an interface that wraps the ConsistentIndex method.
// The consistent index is the offset of an entry in a consistent replicated log.
type ConsistentIndexGetter interface {
	// ConsistentIndex returns the consistent index of the currently executing entry.
	ConsistentIndex() uint64
}

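// A minimal sketch (hypothetical stub, names not taken from this file): a
// fixed-value ConsistentIndexGetter of the kind a test might pass to NewStore
// when no raft log is driving the store.
type fakeConsistentIndexGetter struct{ index uint64 }

func (g *fakeConsistentIndexGetter) ConsistentIndex() uint64 { return g.index }
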
type store struct {
	ReadView
	WriteView

	// consistentIndex caches the "consistent_index" key's value. It is
	// accessed through atomics, so it must be 64-bit aligned.
	consistentIndex uint64

	// mu is read-locked for txns and write-locked for non-txn store changes.
	mu sync.RWMutex

	ig ConsistentIndexGetter

	b       backend.Backend
	kvindex index

	le lease.Lessor

	// revMu protects currentRev and compactMainRev.
	// Locked at the end of a write txn and released after the write txn unlocks.
	// Locked before locking a read txn and released after locking.
	revMu sync.RWMutex
	// currentRev is the revision of the last completed transaction.
	currentRev int64
	// compactMainRev is the main revision of the last compaction.
	compactMainRev int64

	// bytesBuf8 is a byte slice of length 8
	// to avoid a repetitive allocation in saveIndex.
	bytesBuf8 []byte

	fifoSched schedule.Scheduler

	stopc chan struct{}
}

// NewStore returns a new store. It is useful for creating a store inside the
// mvcc package. Externally it should only be used for testing.
func NewStore(b backend.Backend, le lease.Lessor, ig ConsistentIndexGetter) *store {
	s := &store{
		b:       b,
		ig:      ig,
		kvindex: newTreeIndex(),

		le: le,

		currentRev:     1,
		compactMainRev: -1,

		bytesBuf8: make([]byte, 8),
		fifoSched: schedule.NewFIFOScheduler(),

		stopc: make(chan struct{}),
	}
	s.ReadView = &readView{s}
	s.WriteView = &writeView{s}
	if s.le != nil {
		s.le.SetRangeDeleter(func() lease.TxnDelete { return s.Write() })
	}

	tx := s.b.BatchTx()
	tx.Lock()
	tx.UnsafeCreateBucket(keyBucketName)
	tx.UnsafeCreateBucket(metaBucketName)
	tx.Unlock()
	s.b.ForceCommit()

	if err := s.restore(); err != nil {
		// TODO: return the error instead of panicking here?
		panic("failed to recover store from backend")
	}

	return s
}

func (s *store) compactBarrier(ctx context.Context, ch chan struct{}) {
	if ctx == nil || ctx.Err() != nil {
		select {
		case <-s.stopc:
		default:
			// Fix deadlock in mvcc; for more information, refer to PR 11817.
			// s.stopc is only updated in the restore operation, which is called by
			// the apply-snapshot path. Compaction and apply-snapshot requests are
			// serialized by raft and do not happen at the same time.
			s.mu.Lock()
			f := func(ctx context.Context) { s.compactBarrier(ctx, ch) }
			s.fifoSched.Schedule(f)
			s.mu.Unlock()
		}
		return
	}
	close(ch)
}

func (s *store) Hash() (hash uint32, revision int64, err error) {
	start := time.Now()

	s.b.ForceCommit()
	h, err := s.b.Hash(DefaultIgnores)

	hashDurations.Observe(time.Since(start).Seconds())
	return h, s.currentRev, err
}

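// A minimal usage sketch (hypothetical: the bolt file path is made up, and the
// nil lessor and nil ConsistentIndexGetter rely on NewStore and saveIndex
// tolerating them on a fresh backend): construct a store, hash the keyspace,
// and shut it down.
func exampleNewStoreUsage() {
	b := backend.NewDefaultBackend("kvstore-example.db") // hypothetical path
	defer b.Close()

	s := NewStore(b, nil, nil)
	defer s.Close()

	// Hash over the whole keyspace at the current revision.
	if _, rev, err := s.Hash(); err == nil {
		plog.Printf("hashed keyspace at revision %d", rev)
	}
}
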
func (s *store) HashByRev(rev int64) (hash uint32, currentRev int64, compactRev int64, err error) {
	start := time.Now()

	s.mu.RLock()
	s.revMu.RLock()
	compactRev, currentRev = s.compactMainRev, s.currentRev
	s.revMu.RUnlock()

	if rev > 0 && rev <= compactRev {
		s.mu.RUnlock()
		return 0, 0, compactRev, ErrCompacted
	} else if rev > 0 && rev > currentRev {
		s.mu.RUnlock()
		return 0, currentRev, 0, ErrFutureRev
	}

	if rev == 0 {
		rev = currentRev
	}
	keep := s.kvindex.Keep(rev)

	tx := s.b.ReadTx()
	tx.Lock()
	defer tx.Unlock()
	s.mu.RUnlock()

	upper := revision{main: rev + 1}
	lower := revision{main: compactRev + 1}
	h := crc32.New(crc32.MakeTable(crc32.Castagnoli))

	h.Write(keyBucketName)
	err = tx.UnsafeForEach(keyBucketName, func(k, v []byte) error {
		kr := bytesToRev(k)
		if !upper.GreaterThan(kr) {
			return nil
		}
		// Skip revisions that are scheduled for deletion due to compaction;
		// if no compaction is in progress (keep is empty), skip nothing.
		if lower.GreaterThan(kr) && len(keep) > 0 {
			if _, ok := keep[kr]; !ok {
				return nil
			}
		}
		h.Write(k)
		h.Write(v)
		return nil
	})
	hash = h.Sum32()

	hashRevDurations.Observe(time.Since(start).Seconds())
	return hash, currentRev, compactRev, err
}

func (s *store) Compact(rev int64) (<-chan struct{}, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.revMu.Lock()
	defer s.revMu.Unlock()

	if rev <= s.compactMainRev {
		ch := make(chan struct{})
		f := func(ctx context.Context) { s.compactBarrier(ctx, ch) }
		s.fifoSched.Schedule(f)
		return ch, ErrCompacted
	}
	if rev > s.currentRev {
		return nil, ErrFutureRev
	}

	start := time.Now()

	s.compactMainRev = rev

	rbytes := newRevBytes()
	revToBytes(revision{main: rev}, rbytes)

	tx := s.b.BatchTx()
	tx.Lock()
	tx.UnsafePut(metaBucketName, scheduledCompactKeyName, rbytes)
	tx.Unlock()
	// ensure that the desired compaction is persisted
	s.b.ForceCommit()

	keep := s.kvindex.Compact(rev)
	ch := make(chan struct{})
	var j = func(ctx context.Context) {
		if ctx.Err() != nil {
			s.compactBarrier(ctx, ch)
			return
		}
		if !s.scheduleCompaction(rev, keep) {
			s.compactBarrier(nil, ch)
			return
		}
		close(ch)
	}

	s.fifoSched.Schedule(j)

	indexCompactionPauseDurations.Observe(float64(time.Since(start) / time.Millisecond))
	return ch, nil
}

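// A minimal sketch (hypothetical helper): request a compaction and block until
// the scheduled work has actually run, treating ErrCompacted as a no-op because
// the revision was already compacted.
func compactAndWait(s *store, rev int64) error {
	ch, err := s.Compact(rev)
	if err != nil {
		if err == ErrCompacted {
			return nil
		}
		return err // e.g. ErrFutureRev
	}
	select {
	case <-ch: // closed once scheduleCompaction has finished
		return nil
	case <-s.stopc:
		return ErrClosed
	}
}
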
// DefaultIgnores is a map of keys to ignore in hash checking.
var DefaultIgnores map[backend.IgnoreKey]struct{}

func init() {
	DefaultIgnores = map[backend.IgnoreKey]struct{}{
		// The consistent index might change due to a v2 internal sync, which
		// is not controllable by the user.
		{Bucket: string(metaBucketName), Key: string(consistentIndexKeyName)}: {},
	}
}

func (s *store) Commit() {
	s.mu.Lock()
	defer s.mu.Unlock()

	tx := s.b.BatchTx()
	tx.Lock()
	s.saveIndex(tx)
	tx.Unlock()
	s.b.ForceCommit()
}

func (s *store) Restore(b backend.Backend) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	close(s.stopc)
	s.fifoSched.Stop()

	atomic.StoreUint64(&s.consistentIndex, 0)
	s.b = b
	s.kvindex = newTreeIndex()
	s.currentRev = 1
	s.compactMainRev = -1
	s.fifoSched = schedule.NewFIFOScheduler()
	s.stopc = make(chan struct{})

	return s.restore()
}

func (s *store) restore() error {
	s.setupMetricsReporter()

	min, max := newRevBytes(), newRevBytes()
	revToBytes(revision{main: 1}, min)
	revToBytes(revision{main: math.MaxInt64, sub: math.MaxInt64}, max)

	keyToLease := make(map[string]lease.LeaseID)

	// restore index
	tx := s.b.BatchTx()
	tx.Lock()

	_, finishedCompactBytes := tx.UnsafeRange(metaBucketName, finishedCompactKeyName, nil, 0)
	if len(finishedCompactBytes) != 0 {
		s.compactMainRev = bytesToRev(finishedCompactBytes[0]).main
		plog.Printf("restore compact to %d", s.compactMainRev)
	}
	_, scheduledCompactBytes := tx.UnsafeRange(metaBucketName, scheduledCompactKeyName, nil, 0)
	scheduledCompact := int64(0)
	if len(scheduledCompactBytes) != 0 {
		scheduledCompact = bytesToRev(scheduledCompactBytes[0]).main
	}

	// index keys concurrently as they're loaded in from tx
	keysGauge.Set(0)
	rkvc, revc := restoreIntoIndex(s.kvindex)
	for {
		keys, vals := tx.UnsafeRange(keyBucketName, min, max, int64(restoreChunkKeys))
		if len(keys) == 0 {
			break
		}
		// rkvc blocks if the total number of pending keys exceeds the restore
		// chunk size, to keep keys from consuming too much memory.
		restoreChunk(rkvc, keys, vals, keyToLease)
		if len(keys) < restoreChunkKeys {
			// a partial set implies the final set
			break
		}
		// the next set begins after where this one ended
		newMin := bytesToRev(keys[len(keys)-1][:revBytesLen])
		newMin.sub++
		revToBytes(newMin, min)
	}
	close(rkvc)
	s.currentRev = <-revc

	// Keys in the range [compacted revision - N, compaction] might all be deleted
	// due to compaction. In that case the correct current revision is the
	// compaction revision, not the largest revision we have seen.
	if s.currentRev < s.compactMainRev {
		s.currentRev = s.compactMainRev
	}
	if scheduledCompact <= s.compactMainRev {
		scheduledCompact = 0
	}

	for key, lid := range keyToLease {
		if s.le == nil {
			panic("no lessor to attach lease")
		}
		err := s.le.Attach(lid, []lease.LeaseItem{{Key: key}})
		if err != nil {
			plog.Errorf("unexpected Attach error: %v", err)
		}
	}

	tx.Unlock()

	if scheduledCompact != 0 {
		s.Compact(scheduledCompact)
		plog.Printf("resume scheduled compaction at %d", scheduledCompact)
	}

	return nil
}

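// A minimal sketch (hypothetical helper, mirroring the meta-bucket reads in
// restore above): read the scheduled compaction revision back out of the meta
// bucket. The caller is assumed to hold the batch-tx lock, as restore does.
func unsafeReadScheduledCompact(tx backend.BatchTx) int64 {
	_, vals := tx.UnsafeRange(metaBucketName, scheduledCompactKeyName, nil, 0)
	if len(vals) == 0 {
		return 0
	}
	// Values under scheduledCompactKeyName are revision bytes, not raw integers.
	return bytesToRev(vals[0]).main
}
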
type revKeyValue struct {
	key  []byte
	kv   mvccpb.KeyValue
	kstr string
}

func restoreIntoIndex(idx index) (chan<- revKeyValue, <-chan int64) {
	rkvc, revc := make(chan revKeyValue, restoreChunkKeys), make(chan int64, 1)
	go func() {
		currentRev := int64(1)
		defer func() { revc <- currentRev }()
		// restore the tree index by streaming the unordered index entries
		kiCache := make(map[string]*keyIndex, restoreChunkKeys)
		for rkv := range rkvc {
			ki, ok := kiCache[rkv.kstr]
			// purge kiCache if it is full but the key is still missing
			if !ok && len(kiCache) >= restoreChunkKeys {
				i := 10
				for k := range kiCache {
					delete(kiCache, k)
					if i--; i == 0 {
						break
					}
				}
			}
			// cache miss; fetch from the tree index if it is already there
			if !ok {
				ki = &keyIndex{key: rkv.kv.Key}
				if idxKey := idx.KeyIndex(ki); idxKey != nil {
					kiCache[rkv.kstr], ki = idxKey, idxKey
					ok = true
				}
			}
			rev := bytesToRev(rkv.key)
			currentRev = rev.main
			if ok {
				if isTombstone(rkv.key) {
					ki.tombstone(rev.main, rev.sub)
					continue
				}
				ki.put(rev.main, rev.sub)
			} else if !isTombstone(rkv.key) {
				ki.restore(revision{rkv.kv.CreateRevision, 0}, rev, rkv.kv.Version)
				idx.Insert(ki)
				kiCache[rkv.kstr] = ki
			}
		}
	}()
	return rkvc, revc
}

func restoreChunk(kvc chan<- revKeyValue, keys, vals [][]byte, keyToLease map[string]lease.LeaseID) {
	for i, key := range keys {
		rkv := revKeyValue{key: key}
		if err := rkv.kv.Unmarshal(vals[i]); err != nil {
			plog.Fatalf("cannot unmarshal event: %v", err)
		}
		rkv.kstr = string(rkv.kv.Key)
		if isTombstone(key) {
			delete(keyToLease, rkv.kstr)
		} else if lid := lease.LeaseID(rkv.kv.Lease); lid != lease.NoLease {
			keyToLease[rkv.kstr] = lid
		} else {
			delete(keyToLease, rkv.kstr)
		}
		kvc <- rkv
	}
}

func (s *store) Close() error {
	close(s.stopc)
	s.fifoSched.Stop()
	return nil
}

func (s *store) saveIndex(tx backend.BatchTx) {
	if s.ig == nil {
		return
	}
	bs := s.bytesBuf8
	ci := s.ig.ConsistentIndex()
	binary.BigEndian.PutUint64(bs, ci)
	// put the index into the underlying backend;
	// tx has been locked in TxnBegin, so there is no need to lock it again
	tx.UnsafePut(metaBucketName, consistentIndexKeyName, bs)
	atomic.StoreUint64(&s.consistentIndex, ci)
}

func (s *store) ConsistentIndex() uint64 {
	if ci := atomic.LoadUint64(&s.consistentIndex); ci > 0 {
		return ci
	}
	tx := s.b.BatchTx()
	tx.Lock()
	defer tx.Unlock()
	_, vs := tx.UnsafeRange(metaBucketName, consistentIndexKeyName, nil, 0)
	if len(vs) == 0 {
		return 0
	}
	v := binary.BigEndian.Uint64(vs[0])
	atomic.StoreUint64(&s.consistentIndex, v)
	return v
}

func (s *store) setupMetricsReporter() {
	b := s.b
	reportDbTotalSizeInBytesMu.Lock()
	reportDbTotalSizeInBytes = func() float64 { return float64(b.Size()) }
	reportDbTotalSizeInBytesMu.Unlock()
	reportDbTotalSizeInUseInBytesMu.Lock()
	reportDbTotalSizeInUseInBytes = func() float64 { return float64(b.SizeInUse()) }
	reportDbTotalSizeInUseInBytesMu.Unlock()
	reportCurrentRevMu.Lock()
	reportCurrentRev = func() float64 {
		s.revMu.RLock()
		defer s.revMu.RUnlock()
		return float64(s.currentRev)
	}
	reportCurrentRevMu.Unlock()
	reportCompactRevMu.Lock()
	reportCompactRev = func() float64 {
		s.revMu.RLock()
		defer s.revMu.RUnlock()
		return float64(s.compactMainRev)
	}
	reportCompactRevMu.Unlock()
}

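// A minimal sketch (hypothetical helper, mirroring the fallback path of
// ConsistentIndex above): decode the 8-byte big-endian value stored by saveIndex
// under consistentIndexKeyName, without updating the cached atomic. The caller
// is assumed to hold the batch-tx lock.
func unsafeReadConsistentIndex(tx backend.BatchTx) uint64 {
	_, vs := tx.UnsafeRange(metaBucketName, consistentIndexKeyName, nil, 0)
	if len(vs) == 0 {
		return 0
	}
	return binary.BigEndian.Uint64(vs[0])
}
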
// appendMarkTombstone appends the tombstone mark to normal revision bytes.
func appendMarkTombstone(b []byte) []byte {
	if len(b) != revBytesLen {
		plog.Panicf("cannot append mark to non normal revision bytes")
	}
	return append(b, markTombstone)
}

// isTombstone checks whether the given revision bytes are marked as a tombstone.
func isTombstone(b []byte) bool {
	return len(b) == markedRevBytesLen && b[markBytePosition] == markTombstone
}
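
// A minimal sketch (hypothetical, assuming newRevBytes and revToBytes from
// revision.go): a revision key marked by appendMarkTombstone is recognized by
// isTombstone, while the unmarked key is not.
func exampleTombstoneMark() (marked, unmarked bool) {
	b := newRevBytes()
	revToBytes(revision{main: 3}, b)
	return isTombstone(appendMarkTombstone(b)), isTombstone(b)
}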