github.com/whtcorpsinc/MilevaDB-Prod@v0.0.0-20211104133533-f57f4be3b597/causetstore/ekv/ekv.go (about)

     1  // Copyright 2020 WHTCORPS INC, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package ekv
    15  
    16  import (
    17  	"context"
    18  	"sync"
    19  	"time"
    20  
    21  	"github.com/whtcorpsinc/milevadb/causetstore/einsteindb/oracle"
    22  	"github.com/whtcorpsinc/milevadb/config"
    23  	"github.com/whtcorpsinc/milevadb/soliton/execdetails"
    24  	"github.com/whtcorpsinc/milevadb/soliton/memory"
    25  )
    26  
    27  // Transaction options
    28  const (
    29  	// BinlogInfo contains the binlog data and client.
    30  	BinlogInfo Option = iota + 1
    31  	// SchemaChecker is used for checking schemaReplicant-validity.
    32  	SchemaChecker
    33  	// IsolationLevel sets isolation level for current transaction. The default level is SI.
    34  	IsolationLevel
    35  	// Priority marks the priority of this transaction.
    36  	Priority
    37  	// NotFillCache makes this request do not touch the LRU cache of the underlying storage.
    38  	NotFillCache
    39  	// SyncLog decides whether the WAL(write-ahead log) of this request should be synchronized.
    40  	SyncLog
    41  	// KeyOnly retrieve only keys, it can be used in scan now.
    42  	KeyOnly
    43  	// Pessimistic is defined for pessimistic dagger
    44  	Pessimistic
    45  	// SnapshotTS is defined to set snapshot ts.
    46  	SnapshotTS
    47  	// Set replica read
    48  	ReplicaRead
    49  	// Set task ID
    50  	TaskID
    51  	// SchemaReplicant is schemaReplicant version used by txn startTS.
    52  	SchemaReplicant
    53  	// DefCauslectRuntimeStats is used to enable collect runtime stats.
    54  	DefCauslectRuntimeStats
    55  	// SchemaAmender is used to amend mutations for pessimistic transactions
    56  	SchemaAmender
    57  	// SampleStep skips 'SampleStep - 1' number of keys after each returned key.
    58  	SampleStep
    59  	// CommitHook is a callback function called right after the transaction gets committed
    60  	CommitHook
    61  )
    62  
    63  // Priority value for transaction priority.
    64  const (
    65  	PriorityNormal = iota
    66  	PriorityLow
    67  	PriorityHigh
    68  )
    69  
    70  // UnCommitIndexKVFlag uses to indicate the index key/value is no need to commit.
    71  // This is used in the situation of the index key/value was unchanged when do uFIDelate.
    72  // Usage:
    73  // 1. For non-unique index: normally, the index value is '0'.
    74  // Change the value to '1' indicate the index key/value is no need to commit.
    75  // 2. For unique index: normally, the index value is the record handle ID, 8 bytes.
    76  // Append UnCommitIndexKVFlag to the value indicate the index key/value is no need to commit.
    77  const UnCommitIndexKVFlag byte = '1'
    78  
    79  // MaxTxnTimeUse is the max time a Txn may use (in ms) from its begin to commit.
    80  // We use it to abort the transaction to guarantee GC worker will not influence it.
    81  const MaxTxnTimeUse = 24 * 60 * 60 * 1000
    82  
    83  // IsoLevel is the transaction's isolation level.
    84  type IsoLevel int
    85  
    86  const (
    87  	// SI stands for 'snapshot isolation'.
    88  	SI IsoLevel = iota
    89  	// RC stands for 'read committed'.
    90  	RC
    91  )
    92  
    93  // ReplicaReadType is the type of replica to read data from
    94  type ReplicaReadType byte
    95  
    96  const (
    97  	// ReplicaReadLeader stands for 'read from leader'.
    98  	ReplicaReadLeader ReplicaReadType = 1 << iota
    99  	// ReplicaReadFollower stands for 'read from follower'.
   100  	ReplicaReadFollower
   101  	// ReplicaReadMixed stands for 'read from leader and follower and learner'.
   102  	ReplicaReadMixed
   103  )
   104  
   105  // IsFollowerRead checks if leader is going to be used to read data.
   106  func (r ReplicaReadType) IsFollowerRead() bool {
   107  	// In some cases the default value is 0, which should be treated as `ReplicaReadLeader`.
   108  	return r != ReplicaReadLeader && r != 0
   109  }
   110  
   111  // Those limits is enforced to make sure the transaction can be well handled by EinsteinDB.
   112  var (
   113  	// TxnEntrySizeLimit is limit of single entry size (len(key) + len(value)).
   114  	TxnEntrySizeLimit uint64 = config.DefTxnEntrySizeLimit
   115  	// TxnTotalSizeLimit is limit of the sum of all entry size.
   116  	TxnTotalSizeLimit uint64 = config.DefTxnTotalSizeLimit
   117  )
   118  
   119  // Getter is the interface for the Get method.
   120  type Getter interface {
   121  	// Get gets the value for key k from ekv causetstore.
   122  	// If corresponding ekv pair does not exist, it returns nil and ErrNotExist.
   123  	Get(ctx context.Context, k Key) ([]byte, error)
   124  }
   125  
   126  // Retriever is the interface wraps the basic Get and Seek methods.
   127  type Retriever interface {
   128  	Getter
   129  	// Iter creates an Iterator positioned on the first entry that k <= entry's key.
   130  	// If such entry is not found, it returns an invalid Iterator with no error.
   131  	// It yields only keys that < upperBound. If upperBound is nil, it means the upperBound is unbounded.
   132  	// The Iterator must be Closed after use.
   133  	Iter(k Key, upperBound Key) (Iterator, error)
   134  
   135  	// IterReverse creates a reversed Iterator positioned on the first entry which key is less than k.
   136  	// The returned iterator will iterate from greater key to smaller key.
   137  	// If k is nil, the returned iterator will be positioned at the last key.
   138  	// TODO: Add lower bound limit
   139  	IterReverse(k Key) (Iterator, error)
   140  }
   141  
   142  // Mutator is the interface wraps the basic Set and Delete methods.
   143  type Mutator interface {
   144  	// Set sets the value for key k as v into ekv causetstore.
   145  	// v must NOT be nil or empty, otherwise it returns ErrCannotSetNilValue.
   146  	Set(k Key, v []byte) error
   147  	// Delete removes the entry for key k from ekv causetstore.
   148  	Delete(k Key) error
   149  }
   150  
   151  // StagingHandle is the reference of a staging buffer.
   152  type StagingHandle int
   153  
   154  var (
   155  	// InvalidStagingHandle is an invalid handler, MemBuffer will check handler to ensure safety.
   156  	InvalidStagingHandle StagingHandle = 0
   157  	// LastActiveStagingHandle is an special handler which always point to the last active staging buffer.
   158  	LastActiveStagingHandle StagingHandle = -1
   159  )
   160  
   161  // RetrieverMutator is the interface that groups Retriever and Mutator interfaces.
   162  type RetrieverMutator interface {
   163  	Retriever
   164  	Mutator
   165  }
   166  
   167  // MemBufferIterator is an Iterator with KeyFlags related functions.
   168  type MemBufferIterator interface {
   169  	Iterator
   170  	HasValue() bool
   171  	Flags() KeyFlags
   172  }
   173  
   174  // MemBuffer is an in-memory ekv collection, can be used to buffer write operations.
   175  type MemBuffer interface {
   176  	RetrieverMutator
   177  
   178  	// RLock locks the MemBuffer for shared read.
   179  	// In the most case, MemBuffer will only used by single goroutine,
   180  	// but it will be read by multiple goroutine when combined with interlock.UnionScanInterDirc.
   181  	// To avoid race introduced by interlock.UnionScanInterDirc, MemBuffer expose read dagger for it.
   182  	RLock()
   183  	// RUnlock unlocks the MemBuffer.
   184  	RUnlock()
   185  
   186  	// GetFlags returns the latest flags associated with key.
   187  	GetFlags(Key) (KeyFlags, error)
   188  	// IterWithFlags returns a MemBufferIterator.
   189  	IterWithFlags(k Key, upperBound Key) MemBufferIterator
   190  	// IterReverseWithFlags returns a reversed MemBufferIterator.
   191  	IterReverseWithFlags(k Key) MemBufferIterator
   192  	// SetWithFlags put key-value into the last active staging buffer with the given KeyFlags.
   193  	SetWithFlags(Key, []byte, ...FlagsOp) error
   194  	// UFIDelateFlags uFIDelate the flags associated with key.
   195  	UFIDelateFlags(Key, ...FlagsOp)
   196  
   197  	// Reset reset the MemBuffer to initial states.
   198  	Reset()
   199  	// DiscardValues releases the memory used by all values.
   200  	// NOTE: any operation need value will panic after this function.
   201  	DiscardValues()
   202  
   203  	// Staging create a new staging buffer inside the MemBuffer.
   204  	// Subsequent writes will be temporarily stored in this new staging buffer.
   205  	// When you think all modifications looks good, you can call `Release` to public all of them to the upper level buffer.
   206  	Staging() StagingHandle
   207  	// Release publish all modifications in the latest staging buffer to upper level.
   208  	Release(StagingHandle)
   209  	// Cleanup cleanup the resources referenced by the StagingHandle.
   210  	// If the changes are not published by `Release`, they will be discarded.
   211  	Cleanup(StagingHandle)
   212  	// InspectStage used to inspect the value uFIDelates in the given stage.
   213  	InspectStage(StagingHandle, func(Key, KeyFlags, []byte))
   214  
   215  	// SnapshotGetter returns a Getter for a snapshot of MemBuffer.
   216  	SnapshotGetter() Getter
   217  	// SnapshotIter returns a Iterator for a snapshot of MemBuffer.
   218  	SnapshotIter(k, upperbound Key) Iterator
   219  
   220  	// Size returns sum of keys and values length.
   221  	Size() int
   222  	// Len returns the number of entries in the EDB.
   223  	Len() int
   224  	// Dirty returns whether the root staging buffer is uFIDelated.
   225  	Dirty() bool
   226  }
   227  
   228  // Transaction defines the interface for operations inside a Transaction.
   229  // This is not thread safe.
   230  type Transaction interface {
   231  	RetrieverMutator
   232  	// Size returns sum of keys and values length.
   233  	Size() int
   234  	// Len returns the number of entries in the EDB.
   235  	Len() int
   236  	// Reset reset the Transaction to initial states.
   237  	Reset()
   238  	// Commit commits the transaction operations to KV causetstore.
   239  	Commit(context.Context) error
   240  	// Rollback undoes the transaction operations to KV causetstore.
   241  	Rollback() error
   242  	// String implements fmt.Stringer interface.
   243  	String() string
   244  	// LockKeys tries to dagger the entries with the keys in KV causetstore.
   245  	LockKeys(ctx context.Context, lockCtx *LockCtx, keys ...Key) error
   246  	// SetOption sets an option with a value, when val is nil, uses the default
   247  	// value of this option.
   248  	SetOption(opt Option, val interface{})
   249  	// DelOption deletes an option.
   250  	DelOption(opt Option)
   251  	// IsReadOnly checks if the transaction has only performed read operations.
   252  	IsReadOnly() bool
   253  	// StartTS returns the transaction start timestamp.
   254  	StartTS() uint64
   255  	// Valid returns if the transaction is valid.
   256  	// A transaction become invalid after commit or rollback.
   257  	Valid() bool
   258  	// GetMemBuffer return the MemBuffer binding to this transaction.
   259  	GetMemBuffer() MemBuffer
   260  	// GetSnapshot returns the Snapshot binding to this transaction.
   261  	GetSnapshot() Snapshot
   262  	// GetUnionStore returns the UnionStore binding to this transaction.
   263  	GetUnionStore() UnionStore
   264  	// SetVars sets variables to the transaction.
   265  	SetVars(vars *Variables)
   266  	// GetVars gets variables from the transaction.
   267  	GetVars() *Variables
   268  	// BatchGet gets ekv from the memory buffer of memex and transaction, and the ekv storage.
   269  	// Do not use len(value) == 0 or value == nil to represent non-exist.
   270  	// If a key doesn't exist, there shouldn't be any corresponding entry in the result map.
   271  	BatchGet(ctx context.Context, keys []Key) (map[string][]byte, error)
   272  	IsPessimistic() bool
   273  }
   274  
   275  // LockCtx contains information for LockKeys method.
   276  type LockCtx struct {
   277  	Killed                *uint32
   278  	ForUFIDelateTS        uint64
   279  	LockWaitTime          int64
   280  	WaitStartTime         time.Time
   281  	PessimisticLockWaited *int32
   282  	LockKeysDuration      *int64
   283  	LockKeysCount         *int32
   284  	ReturnValues          bool
   285  	Values                map[string]ReturnedValue
   286  	ValuesLock            sync.Mutex
   287  	LockExpired           *uint32
   288  	Stats                 *execdetails.LockKeysDetails
   289  }
   290  
   291  // ReturnedValue pairs the Value and AlreadyLocked flag for PessimisticLock return values result.
   292  type ReturnedValue struct {
   293  	Value         []byte
   294  	AlreadyLocked bool
   295  }
   296  
   297  // Client is used to send request to KV layer.
   298  type Client interface {
   299  	// Send sends request to KV layer, returns a Response.
   300  	Send(ctx context.Context, req *Request, vars *Variables) Response
   301  
   302  	// IsRequestTypeSupported checks if reqType and subType is supported.
   303  	IsRequestTypeSupported(reqType, subType int64) bool
   304  }
   305  
   306  // ReqTypes.
   307  const (
   308  	ReqTypeSelect   = 101
   309  	ReqTypeIndex    = 102
   310  	ReqTypePosetDag = 103
   311  	ReqTypeAnalyze  = 104
   312  	ReqTypeChecksum = 105
   313  
   314  	ReqSubTypeBasic          = 0
   315  	ReqSubTypeDesc           = 10000
   316  	ReqSubTypeGroupBy        = 10001
   317  	ReqSubTypeTopN           = 10002
   318  	ReqSubTypeSignature      = 10003
   319  	ReqSubTypeAnalyzeIdx     = 10004
   320  	ReqSubTypeAnalyzeDefCaus = 10005
   321  )
   322  
   323  // StoreType represents the type of a causetstore.
   324  type StoreType uint8
   325  
   326  const (
   327  	// EinsteinDB means the type of a causetstore is EinsteinDB.
   328  	EinsteinDB StoreType = iota
   329  	// TiFlash means the type of a causetstore is TiFlash.
   330  	TiFlash
   331  	// MilevaDB means the type of a causetstore is MilevaDB.
   332  	MilevaDB
   333  	// UnSpecified means the causetstore type is unknown
   334  	UnSpecified = 255
   335  )
   336  
   337  // Name returns the name of causetstore type.
   338  func (t StoreType) Name() string {
   339  	if t == TiFlash {
   340  		return "tiflash"
   341  	} else if t == MilevaDB {
   342  		return "milevadb"
   343  	} else if t == EinsteinDB {
   344  		return "einsteindb"
   345  	}
   346  	return "unspecified"
   347  }
   348  
   349  // Request represents a ekv request.
   350  type Request struct {
   351  	// Tp is the request type.
   352  	Tp        int64
   353  	StartTs   uint64
   354  	Data      []byte
   355  	KeyRanges []KeyRange
   356  
   357  	// Concurrency is 1, if it only sends the request to a single storage unit when
   358  	// ResponseIterator.Next is called. If concurrency is greater than 1, the request will be
   359  	// sent to multiple storage units concurrently.
   360  	Concurrency int
   361  	// IsolationLevel is the isolation level, default is SI.
   362  	IsolationLevel IsoLevel
   363  	// Priority is the priority of this KV request, its value may be PriorityNormal/PriorityLow/PriorityHigh.
   364  	Priority int
   365  	// memTracker is used to trace and control memory usage in co-processor layer.
   366  	MemTracker *memory.Tracker
   367  	// KeepOrder is true, if the response should be returned in order.
   368  	KeepOrder bool
   369  	// Desc is true, if the request is sent in descending order.
   370  	Desc bool
   371  	// NotFillCache makes this request do not touch the LRU cache of the underlying storage.
   372  	NotFillCache bool
   373  	// SyncLog decides whether the WAL(write-ahead log) of this request should be synchronized.
   374  	SyncLog bool
   375  	// Streaming indicates using streaming API for this request, result in that one Next()
   376  	// call would not corresponds to a whole region result.
   377  	Streaming bool
   378  	// ReplicaRead is used for reading data from replicas, only follower is supported at this time.
   379  	ReplicaRead ReplicaReadType
   380  	// StoreType represents this request is sent to the which type of causetstore.
   381  	StoreType StoreType
   382  	// Cacheable is true if the request can be cached. Currently only deterministic PosetDag requests can be cached.
   383  	Cacheable bool
   384  	// SchemaVer is for any schemaReplicant-ful storage to validate schemaReplicant correctness if necessary.
   385  	SchemaVar int64
   386  	// BatchCop indicates whether send batch interlock request to tiflash.
   387  	BatchCop bool
   388  	// TaskID is an unique ID for an execution of a memex
   389  	TaskID uint64
   390  }
   391  
   392  // ResultSubset represents a result subset from a single storage unit.
   393  // TODO: Find a better interface for ResultSubset that can reuse bytes.
   394  type ResultSubset interface {
   395  	// GetData gets the data.
   396  	GetData() []byte
   397  	// GetStartKey gets the start key.
   398  	GetStartKey() Key
   399  	// MemSize returns how many bytes of memory this result use for tracing memory usage.
   400  	MemSize() int64
   401  	// RespTime returns the response time for the request.
   402  	RespTime() time.Duration
   403  }
   404  
   405  // Response represents the response returned from KV layer.
   406  type Response interface {
   407  	// Next returns a resultSubset from a single storage unit.
   408  	// When full result set is returned, nil is returned.
   409  	Next(ctx context.Context) (resultSubset ResultSubset, err error)
   410  	// Close response.
   411  	Close() error
   412  }
   413  
   414  // Snapshot defines the interface for the snapshot fetched from KV causetstore.
   415  type Snapshot interface {
   416  	Retriever
   417  	// BatchGet gets a batch of values from snapshot.
   418  	BatchGet(ctx context.Context, keys []Key) (map[string][]byte, error)
   419  	// SetOption sets an option with a value, when val is nil, uses the default
   420  	// value of this option. Only ReplicaRead is supported for snapshot
   421  	SetOption(opt Option, val interface{})
   422  	// DelOption deletes an option.
   423  	DelOption(opt Option)
   424  }
   425  
   426  // BatchGetter is the interface for BatchGet.
   427  type BatchGetter interface {
   428  	// BatchGet gets a batch of values.
   429  	BatchGet(ctx context.Context, keys []Key) (map[string][]byte, error)
   430  }
   431  
   432  // Driver is the interface that must be implemented by a KV storage.
   433  type Driver interface {
   434  	// Open returns a new CausetStorage.
   435  	// The path is the string for storage specific format.
   436  	Open(path string) (CausetStorage, error)
   437  }
   438  
   439  // CausetStorage defines the interface for storage.
   440  // Isolation should be at least SI(SNAPSHOT ISOLATION)
   441  type CausetStorage interface {
   442  	// Begin transaction
   443  	Begin() (Transaction, error)
   444  	// BeginWithStartTS begins transaction with startTS.
   445  	BeginWithStartTS(startTS uint64) (Transaction, error)
   446  	// GetSnapshot gets a snapshot that is able to read any data which data is <= ver.
   447  	// if ver is MaxVersion or > current max committed version, we will use current version for this snapshot.
   448  	GetSnapshot(ver Version) (Snapshot, error)
   449  	// GetClient gets a client instance.
   450  	GetClient() Client
   451  	// Close causetstore
   452  	Close() error
   453  	// UUID return a unique ID which represents a CausetStorage.
   454  	UUID() string
   455  	// CurrentVersion returns current max committed version.
   456  	CurrentVersion() (Version, error)
   457  	// GetOracle gets a timestamp oracle client.
   458  	GetOracle() oracle.Oracle
   459  	// SupportDeleteRange gets the storage support delete range or not.
   460  	SupportDeleteRange() (supported bool)
   461  	// Name gets the name of the storage engine
   462  	Name() string
   463  	// Describe returns of brief introduction of the storage
   464  	Describe() string
   465  	// ShowStatus returns the specified status of the storage
   466  	ShowStatus(ctx context.Context, key string) (interface{}, error)
   467  }
   468  
   469  // FnKeyCmp is the function for iterator the keys
   470  type FnKeyCmp func(key Key) bool
   471  
   472  // Iterator is the interface for a iterator on KV causetstore.
   473  type Iterator interface {
   474  	Valid() bool
   475  	Key() Key
   476  	Value() []byte
   477  	Next() error
   478  	Close()
   479  }
   480  
   481  // SplitblockStore is the ekv causetstore which supports split regions.
   482  type SplitblockStore interface {
   483  	SplitRegions(ctx context.Context, splitKey [][]byte, scatter bool, blockID *int64) (regionID []uint64, err error)
   484  	WaitScatterRegionFinish(ctx context.Context, regionID uint64, backOff int) error
   485  	CheckRegionInScattering(regionID uint64) (bool, error)
   486  }
   487  
   488  // Used for pessimistic dagger wait time
   489  // these two constants are special for dagger protocol with einsteindb
   490  // 0 means always wait, -1 means nowait, others meaning dagger wait in milliseconds
   491  var (
   492  	LockAlwaysWait = int64(0)
   493  	LockNoWait     = int64(-1)
   494  )