go.etcd.io/etcd@v3.3.27+incompatible/mvcc/kvstore.go

// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mvcc

import (
	"context"
	"encoding/binary"
	"errors"
	"hash/crc32"
	"math"
	"sync"
	"sync/atomic"
	"time"

	"github.com/coreos/etcd/lease"
	"github.com/coreos/etcd/mvcc/backend"
	"github.com/coreos/etcd/mvcc/mvccpb"
	"github.com/coreos/etcd/pkg/schedule"
	"github.com/coreos/pkg/capnslog"
)

var (
	keyBucketName  = []byte("key")
	metaBucketName = []byte("meta")

	consistentIndexKeyName  = []byte("consistent_index")
	scheduledCompactKeyName = []byte("scheduledCompactRev")
	finishedCompactKeyName  = []byte("finishedCompactRev")

	ErrCompacted = errors.New("mvcc: required revision has been compacted")
	ErrFutureRev = errors.New("mvcc: required revision is a future revision")
	ErrCanceled  = errors.New("mvcc: watcher is canceled")
	ErrClosed    = errors.New("mvcc: closed")

	plog = capnslog.NewPackageLogger("github.com/coreos/etcd", "mvcc")
)

const (
	// markedRevBytesLen is the byte length of a marked revision.
	// The first `revBytesLen` bytes represent a normal revision. The last
	// byte is the mark.
	markedRevBytesLen      = revBytesLen + 1
	markBytePosition       = markedRevBytesLen - 1
	markTombstone     byte = 't'
)

var restoreChunkKeys = 10000 // non-const for testing

// ConsistentIndexGetter is an interface that wraps the ConsistentIndex method.
// A consistent index is the offset of an entry in a consistent replicated log.
type ConsistentIndexGetter interface {
	// ConsistentIndex returns the consistent index of the currently executing entry.
	ConsistentIndex() uint64
}
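
// A minimal sketch of a ConsistentIndexGetter implementation (hypothetical
// type, for illustration only; the name is not part of this package): a
// uint64 read atomically, mirroring how a test would typically stub the
// getter.
//
//	type fakeConsistentIndex uint64
//
//	func (i *fakeConsistentIndex) ConsistentIndex() uint64 {
//		return atomic.LoadUint64((*uint64)(i))
//	}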

type store struct {
	ReadView
	WriteView

	// consistentIndex caches the "consistent_index" key's value. Accessed
	// through atomics so must be 64-bit aligned.
	consistentIndex uint64

	// mu is read-locked for txns and write-locked for non-txn store changes.
	mu sync.RWMutex

	ig ConsistentIndexGetter

	b       backend.Backend
	kvindex index

	le lease.Lessor

	// revMu protects currentRev and compactMainRev.
	// Locked at the end of a write txn and released after the write txn unlocks.
	// Locked before locking a read txn and released after locking.
	revMu sync.RWMutex
	// currentRev is the revision of the last completed transaction.
	currentRev int64
	// compactMainRev is the main revision of the last compaction.
	compactMainRev int64

	// bytesBuf8 is a byte slice of length 8
	// to avoid a repetitive allocation in saveIndex.
	bytesBuf8 []byte

	fifoSched schedule.Scheduler

	stopc chan struct{}
}

// NewStore returns a new store. It is useful for creating a store inside the
// mvcc package; externally it should only be used for testing.
func NewStore(b backend.Backend, le lease.Lessor, ig ConsistentIndexGetter) *store {
	s := &store{
		b:       b,
		ig:      ig,
		kvindex: newTreeIndex(),

		le: le,

		currentRev:     1,
		compactMainRev: -1,

		bytesBuf8: make([]byte, 8),
		fifoSched: schedule.NewFIFOScheduler(),

		stopc: make(chan struct{}),
	}
	s.ReadView = &readView{s}
	s.WriteView = &writeView{s}
	if s.le != nil {
		s.le.SetRangeDeleter(func() lease.TxnDelete { return s.Write() })
	}

	tx := s.b.BatchTx()
	tx.Lock()
	tx.UnsafeCreateBucket(keyBucketName)
	tx.UnsafeCreateBucket(metaBucketName)
	tx.Unlock()
	s.b.ForceCommit()

	if err := s.restore(); err != nil {
		// TODO: return the error instead of panicking here?
		panic("failed to recover store from backend")
	}

	return s
}
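
// Usage sketch for NewStore (testing only, per the comment above). The helper
// names backend.NewDefaultBackend and lease.FakeLessor are assumptions drawn
// from this module's backend and lease packages:
//
//	b := backend.NewDefaultBackend("test.db")
//	s := NewStore(b, &lease.FakeLessor{}, nil)
//	defer s.Close()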

func (s *store) compactBarrier(ctx context.Context, ch chan struct{}) {
	if ctx == nil || ctx.Err() != nil {
		select {
		case <-s.stopc:
		default:
			// Fix deadlock in mvcc; for more information, please refer to PR 11817.
			// s.stopc is only updated in the restore operation, which is called by the
			// apply-snapshot call; compaction and apply-snapshot requests are serialized
			// by raft and do not happen at the same time.
			s.mu.Lock()
			f := func(ctx context.Context) { s.compactBarrier(ctx, ch) }
			s.fifoSched.Schedule(f)
			s.mu.Unlock()
		}
		return
	}
	close(ch)
}

func (s *store) Hash() (hash uint32, revision int64, err error) {
	start := time.Now()

	s.b.ForceCommit()
	h, err := s.b.Hash(DefaultIgnores)

	hashDurations.Observe(time.Since(start).Seconds())
	return h, s.currentRev, err
}

func (s *store) HashByRev(rev int64) (hash uint32, currentRev int64, compactRev int64, err error) {
	start := time.Now()

	s.mu.RLock()
	s.revMu.RLock()
	compactRev, currentRev = s.compactMainRev, s.currentRev
	s.revMu.RUnlock()

	if rev > 0 && rev <= compactRev {
		s.mu.RUnlock()
		return 0, 0, compactRev, ErrCompacted
	} else if rev > 0 && rev > currentRev {
		s.mu.RUnlock()
		return 0, currentRev, 0, ErrFutureRev
	}

	if rev == 0 {
		rev = currentRev
	}
	keep := s.kvindex.Keep(rev)

	tx := s.b.ReadTx()
	tx.Lock()
	defer tx.Unlock()
	s.mu.RUnlock()

	upper := revision{main: rev + 1}
	lower := revision{main: compactRev + 1}
	h := crc32.New(crc32.MakeTable(crc32.Castagnoli))

	h.Write(keyBucketName)
	err = tx.UnsafeForEach(keyBucketName, func(k, v []byte) error {
		kr := bytesToRev(k)
		if !upper.GreaterThan(kr) {
			return nil
		}
		// skip revisions that are scheduled for deletion due to
		// compaction, unless they are in the keep set; if there is
		// no keep set, nothing is skipped.
		if lower.GreaterThan(kr) && len(keep) > 0 {
			if _, ok := keep[kr]; !ok {
				return nil
			}
		}
		h.Write(k)
		h.Write(v)
		return nil
	})
	hash = h.Sum32()

	hashRevDurations.Observe(time.Since(start).Seconds())
	return hash, currentRev, compactRev, err
}
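
// Caller-side sketch for HashByRev: rev == 0 hashes at the current revision;
// otherwise rev must lie in (compactRev, currentRev]:
//
//	hash, currentRev, compactRev, err := s.HashByRev(rev)
//	if err == ErrCompacted {
//		// rev <= compactRev: retry with a revision newer than compactRev
//	}
//	_ = hash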

func (s *store) Compact(rev int64) (<-chan struct{}, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.revMu.Lock()
	defer s.revMu.Unlock()

	if rev <= s.compactMainRev {
		ch := make(chan struct{})
		f := func(ctx context.Context) { s.compactBarrier(ctx, ch) }
		s.fifoSched.Schedule(f)
		return ch, ErrCompacted
	}
	if rev > s.currentRev {
		return nil, ErrFutureRev
	}

	start := time.Now()

	s.compactMainRev = rev

	rbytes := newRevBytes()
	revToBytes(revision{main: rev}, rbytes)

	tx := s.b.BatchTx()
	tx.Lock()
	tx.UnsafePut(metaBucketName, scheduledCompactKeyName, rbytes)
	tx.Unlock()
	// ensure that desired compaction is persisted
	s.b.ForceCommit()

	keep := s.kvindex.Compact(rev)
	ch := make(chan struct{})
	var j = func(ctx context.Context) {
		if ctx.Err() != nil {
			s.compactBarrier(ctx, ch)
			return
		}
		if !s.scheduleCompaction(rev, keep) {
			s.compactBarrier(nil, ch)
			return
		}
		close(ch)
	}

	s.fifoSched.Schedule(j)

	indexCompactionPauseDurations.Observe(float64(time.Since(start) / time.Millisecond))
	return ch, nil
}
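
// Caller-side sketch for Compact: on success the returned channel is closed
// once the scheduled compaction job has finished, so callers that need to
// observe completion can block on it (the channel may stay open if the store
// is stopped first):
//
//	ch, err := s.Compact(rev)
//	if err != nil {
//		// ErrCompacted (rev already compacted) or ErrFutureRev
//	} else {
//		<-ch
//	}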

// DefaultIgnores is a map of keys to ignore in hash checking.
var DefaultIgnores map[backend.IgnoreKey]struct{}

func init() {
	DefaultIgnores = map[backend.IgnoreKey]struct{}{
		// consistent index might be changed due to v2 internal sync, which
		// is not controllable by the user.
		{Bucket: string(metaBucketName), Key: string(consistentIndexKeyName)}: {},
	}
}

func (s *store) Commit() {
	s.mu.Lock()
	defer s.mu.Unlock()

	tx := s.b.BatchTx()
	tx.Lock()
	s.saveIndex(tx)
	tx.Unlock()
	s.b.ForceCommit()
}

func (s *store) Restore(b backend.Backend) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	close(s.stopc)
	s.fifoSched.Stop()

	atomic.StoreUint64(&s.consistentIndex, 0)
	s.b = b
	s.kvindex = newTreeIndex()
	s.currentRev = 1
	s.compactMainRev = -1
	s.fifoSched = schedule.NewFIFOScheduler()
	s.stopc = make(chan struct{})

	return s.restore()
}

func (s *store) restore() error {
	s.setupMetricsReporter()

	min, max := newRevBytes(), newRevBytes()
	revToBytes(revision{main: 1}, min)
	revToBytes(revision{main: math.MaxInt64, sub: math.MaxInt64}, max)

	keyToLease := make(map[string]lease.LeaseID)

	// restore index
	tx := s.b.BatchTx()
	tx.Lock()

	_, finishedCompactBytes := tx.UnsafeRange(metaBucketName, finishedCompactKeyName, nil, 0)
	if len(finishedCompactBytes) != 0 {
		s.compactMainRev = bytesToRev(finishedCompactBytes[0]).main
		plog.Printf("restore compact to %d", s.compactMainRev)
	}
	_, scheduledCompactBytes := tx.UnsafeRange(metaBucketName, scheduledCompactKeyName, nil, 0)
	scheduledCompact := int64(0)
	if len(scheduledCompactBytes) != 0 {
		scheduledCompact = bytesToRev(scheduledCompactBytes[0]).main
	}

	// index keys concurrently as they're loaded in from tx
	keysGauge.Set(0)
	rkvc, revc := restoreIntoIndex(s.kvindex)
	for {
		keys, vals := tx.UnsafeRange(keyBucketName, min, max, int64(restoreChunkKeys))
		if len(keys) == 0 {
			break
		}
		// rkvc blocks if the number of pending keys exceeds the restore
		// chunk size, to keep keys from consuming too much memory.
		restoreChunk(rkvc, keys, vals, keyToLease)
		if len(keys) < restoreChunkKeys {
			// partial set implies final set
			break
		}
		// next set begins after where this one ended
		newMin := bytesToRev(keys[len(keys)-1][:revBytesLen])
		newMin.sub++
		revToBytes(newMin, min)
	}
	close(rkvc)
	s.currentRev = <-revc

	// Keys in the range [compacted revision - N, compaction revision] might all
	// have been deleted due to compaction. In that case the current revision
	// should be set to the compaction revision, not the largest revision seen.
	if s.currentRev < s.compactMainRev {
		s.currentRev = s.compactMainRev
	}
	if scheduledCompact <= s.compactMainRev {
		scheduledCompact = 0
	}

	for key, lid := range keyToLease {
		if s.le == nil {
			panic("no lessor to attach lease")
		}
		err := s.le.Attach(lid, []lease.LeaseItem{{Key: key}})
		if err != nil {
			plog.Errorf("unexpected Attach error: %v", err)
		}
	}

	tx.Unlock()

	if scheduledCompact != 0 {
		s.Compact(scheduledCompact)
		plog.Printf("resume scheduled compaction at %d", scheduledCompact)
	}

	return nil
}

type revKeyValue struct {
	key  []byte
	kv   mvccpb.KeyValue
	kstr string
}

func restoreIntoIndex(idx index) (chan<- revKeyValue, <-chan int64) {
	rkvc, revc := make(chan revKeyValue, restoreChunkKeys), make(chan int64, 1)
	go func() {
		currentRev := int64(1)
		defer func() { revc <- currentRev }()
		// restore the tree index from the stream of unordered key-value revisions.
		kiCache := make(map[string]*keyIndex, restoreChunkKeys)
		for rkv := range rkvc {
			ki, ok := kiCache[rkv.kstr]
			// purge kiCache if the cache has grown large but the key is still missing
			if !ok && len(kiCache) >= restoreChunkKeys {
				i := 10
				for k := range kiCache {
					delete(kiCache, k)
					if i--; i == 0 {
						break
					}
				}
			}
			// cache miss; fetch from the tree index if present
			if !ok {
				ki = &keyIndex{key: rkv.kv.Key}
				if idxKey := idx.KeyIndex(ki); idxKey != nil {
					kiCache[rkv.kstr], ki = idxKey, idxKey
					ok = true
				}
			}
			rev := bytesToRev(rkv.key)
			currentRev = rev.main
			if ok {
				if isTombstone(rkv.key) {
					ki.tombstone(rev.main, rev.sub)
					continue
				}
				ki.put(rev.main, rev.sub)
			} else if !isTombstone(rkv.key) {
				ki.restore(revision{rkv.kv.CreateRevision, 0}, rev, rkv.kv.Version)
				idx.Insert(ki)
				kiCache[rkv.kstr] = ki
			}
		}
	}()
	return rkvc, revc
}
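
// Pairing sketch: restoreIntoIndex is the consumer half of the pipeline used
// by restore above. A producer pushes decoded chunks with restoreChunk,
// closes the channel, and then receives the largest main revision seen:
//
//	rkvc, revc := restoreIntoIndex(s.kvindex)
//	restoreChunk(rkvc, keys, vals, keyToLease)
//	close(rkvc)
//	currentRev := <-revc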

func restoreChunk(kvc chan<- revKeyValue, keys, vals [][]byte, keyToLease map[string]lease.LeaseID) {
	for i, key := range keys {
		rkv := revKeyValue{key: key}
		if err := rkv.kv.Unmarshal(vals[i]); err != nil {
			plog.Fatalf("cannot unmarshal event: %v", err)
		}
		rkv.kstr = string(rkv.kv.Key)
		if isTombstone(key) {
			delete(keyToLease, rkv.kstr)
		} else if lid := lease.LeaseID(rkv.kv.Lease); lid != lease.NoLease {
			keyToLease[rkv.kstr] = lid
		} else {
			delete(keyToLease, rkv.kstr)
		}
		kvc <- rkv
	}
}

func (s *store) Close() error {
	close(s.stopc)
	s.fifoSched.Stop()
	return nil
}

func (s *store) saveIndex(tx backend.BatchTx) {
	if s.ig == nil {
		return
	}
	bs := s.bytesBuf8
	ci := s.ig.ConsistentIndex()
	binary.BigEndian.PutUint64(bs, ci)
	// put the index into the underlying backend
	// tx has been locked in TxnBegin, so there is no need to lock it again
	tx.UnsafePut(metaBucketName, consistentIndexKeyName, bs)
	atomic.StoreUint64(&s.consistentIndex, ci)
}

func (s *store) ConsistentIndex() uint64 {
	if ci := atomic.LoadUint64(&s.consistentIndex); ci > 0 {
		return ci
	}
	tx := s.b.BatchTx()
	tx.Lock()
	defer tx.Unlock()
	_, vs := tx.UnsafeRange(metaBucketName, consistentIndexKeyName, nil, 0)
	if len(vs) == 0 {
		return 0
	}
	v := binary.BigEndian.Uint64(vs[0])
	atomic.StoreUint64(&s.consistentIndex, v)
	return v
}
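
// Encoding sketch: saveIndex writes the consistent index as 8 big-endian
// bytes and ConsistentIndex reads back the same layout, a plain
// encoding/binary round trip:
//
//	buf := make([]byte, 8)
//	binary.BigEndian.PutUint64(buf, 42)
//	_ = binary.BigEndian.Uint64(buf) // == 42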

func (s *store) setupMetricsReporter() {
	b := s.b
	reportDbTotalSizeInBytesMu.Lock()
	reportDbTotalSizeInBytes = func() float64 { return float64(b.Size()) }
	reportDbTotalSizeInBytesMu.Unlock()
	reportDbTotalSizeInUseInBytesMu.Lock()
	reportDbTotalSizeInUseInBytes = func() float64 { return float64(b.SizeInUse()) }
	reportDbTotalSizeInUseInBytesMu.Unlock()
	reportCurrentRevMu.Lock()
	reportCurrentRev = func() float64 {
		s.revMu.RLock()
		defer s.revMu.RUnlock()
		return float64(s.currentRev)
	}
	reportCurrentRevMu.Unlock()
	reportCompactRevMu.Lock()
	reportCompactRev = func() float64 {
		s.revMu.RLock()
		defer s.revMu.RUnlock()
		return float64(s.compactMainRev)
	}
	reportCompactRevMu.Unlock()
}

// appendMarkTombstone appends a tombstone mark to normal revision bytes.
func appendMarkTombstone(b []byte) []byte {
	if len(b) != revBytesLen {
		plog.Panicf("cannot append mark to non normal revision bytes")
	}
	return append(b, markTombstone)
}

// isTombstone checks whether the given revision bytes represent a tombstone.
func isTombstone(b []byte) bool {
	return len(b) == markedRevBytesLen && b[markBytePosition] == markTombstone
}
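
// Round-trip sketch using helpers from this package's revision encoding
// (newRevBytes and revToBytes are defined alongside this file):
//
//	rb := newRevBytes()
//	revToBytes(revision{main: 3}, rb)
//	tb := appendMarkTombstone(rb) // len(tb) == markedRevBytesLen
//	_ = isTombstone(tb)           // true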