github.com/whtcorpsinc/milevadb-prod@v0.0.0-20211104133533-f57f4be3b597/causetstore/ekv/ekv.go (about) 1 // Copyright 2020 WHTCORPS INC, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package ekv 15 16 import ( 17 "context" 18 "sync" 19 "time" 20 21 "github.com/whtcorpsinc/milevadb/causetstore/einsteindb/oracle" 22 "github.com/whtcorpsinc/milevadb/config" 23 "github.com/whtcorpsinc/milevadb/soliton/execdetails" 24 "github.com/whtcorpsinc/milevadb/soliton/memory" 25 ) 26 27 // Transaction options 28 const ( 29 // BinlogInfo contains the binlog data and client. 30 BinlogInfo Option = iota + 1 31 // SchemaChecker is used for checking schemaReplicant-validity. 32 SchemaChecker 33 // IsolationLevel sets isolation level for current transaction. The default level is SI. 34 IsolationLevel 35 // Priority marks the priority of this transaction. 36 Priority 37 // NotFillCache makes this request do not touch the LRU cache of the underlying storage. 38 NotFillCache 39 // SyncLog decides whether the WAL(write-ahead log) of this request should be synchronized. 40 SyncLog 41 // KeyOnly retrieve only keys, it can be used in scan now. 42 KeyOnly 43 // Pessimistic is defined for pessimistic dagger 44 Pessimistic 45 // SnapshotTS is defined to set snapshot ts. 46 SnapshotTS 47 // Set replica read 48 ReplicaRead 49 // Set task ID 50 TaskID 51 // SchemaReplicant is schemaReplicant version used by txn startTS. 52 SchemaReplicant 53 // DefCauslectRuntimeStats is used to enable collect runtime stats. 
54 DefCauslectRuntimeStats 55 // SchemaAmender is used to amend mutations for pessimistic transactions 56 SchemaAmender 57 // SampleStep skips 'SampleStep - 1' number of keys after each returned key. 58 SampleStep 59 // CommitHook is a callback function called right after the transaction gets committed 60 CommitHook 61 ) 62 63 // Priority value for transaction priority. 64 const ( 65 PriorityNormal = iota 66 PriorityLow 67 PriorityHigh 68 ) 69 70 // UnCommitIndexKVFlag uses to indicate the index key/value is no need to commit. 71 // This is used in the situation of the index key/value was unchanged when do uFIDelate. 72 // Usage: 73 // 1. For non-unique index: normally, the index value is '0'. 74 // Change the value to '1' indicate the index key/value is no need to commit. 75 // 2. For unique index: normally, the index value is the record handle ID, 8 bytes. 76 // Append UnCommitIndexKVFlag to the value indicate the index key/value is no need to commit. 77 const UnCommitIndexKVFlag byte = '1' 78 79 // MaxTxnTimeUse is the max time a Txn may use (in ms) from its begin to commit. 80 // We use it to abort the transaction to guarantee GC worker will not influence it. 81 const MaxTxnTimeUse = 24 * 60 * 60 * 1000 82 83 // IsoLevel is the transaction's isolation level. 84 type IsoLevel int 85 86 const ( 87 // SI stands for 'snapshot isolation'. 88 SI IsoLevel = iota 89 // RC stands for 'read committed'. 90 RC 91 ) 92 93 // ReplicaReadType is the type of replica to read data from 94 type ReplicaReadType byte 95 96 const ( 97 // ReplicaReadLeader stands for 'read from leader'. 98 ReplicaReadLeader ReplicaReadType = 1 << iota 99 // ReplicaReadFollower stands for 'read from follower'. 100 ReplicaReadFollower 101 // ReplicaReadMixed stands for 'read from leader and follower and learner'. 102 ReplicaReadMixed 103 ) 104 105 // IsFollowerRead checks if leader is going to be used to read data. 
106 func (r ReplicaReadType) IsFollowerRead() bool { 107 // In some cases the default value is 0, which should be treated as `ReplicaReadLeader`. 108 return r != ReplicaReadLeader && r != 0 109 } 110 111 // Those limits is enforced to make sure the transaction can be well handled by EinsteinDB. 112 var ( 113 // TxnEntrySizeLimit is limit of single entry size (len(key) + len(value)). 114 TxnEntrySizeLimit uint64 = config.DefTxnEntrySizeLimit 115 // TxnTotalSizeLimit is limit of the sum of all entry size. 116 TxnTotalSizeLimit uint64 = config.DefTxnTotalSizeLimit 117 ) 118 119 // Getter is the interface for the Get method. 120 type Getter interface { 121 // Get gets the value for key k from ekv causetstore. 122 // If corresponding ekv pair does not exist, it returns nil and ErrNotExist. 123 Get(ctx context.Context, k Key) ([]byte, error) 124 } 125 126 // Retriever is the interface wraps the basic Get and Seek methods. 127 type Retriever interface { 128 Getter 129 // Iter creates an Iterator positioned on the first entry that k <= entry's key. 130 // If such entry is not found, it returns an invalid Iterator with no error. 131 // It yields only keys that < upperBound. If upperBound is nil, it means the upperBound is unbounded. 132 // The Iterator must be Closed after use. 133 Iter(k Key, upperBound Key) (Iterator, error) 134 135 // IterReverse creates a reversed Iterator positioned on the first entry which key is less than k. 136 // The returned iterator will iterate from greater key to smaller key. 137 // If k is nil, the returned iterator will be positioned at the last key. 138 // TODO: Add lower bound limit 139 IterReverse(k Key) (Iterator, error) 140 } 141 142 // Mutator is the interface wraps the basic Set and Delete methods. 143 type Mutator interface { 144 // Set sets the value for key k as v into ekv causetstore. 145 // v must NOT be nil or empty, otherwise it returns ErrCannotSetNilValue. 
146 Set(k Key, v []byte) error 147 // Delete removes the entry for key k from ekv causetstore. 148 Delete(k Key) error 149 } 150 151 // StagingHandle is the reference of a staging buffer. 152 type StagingHandle int 153 154 var ( 155 // InvalidStagingHandle is an invalid handler, MemBuffer will check handler to ensure safety. 156 InvalidStagingHandle StagingHandle = 0 157 // LastActiveStagingHandle is an special handler which always point to the last active staging buffer. 158 LastActiveStagingHandle StagingHandle = -1 159 ) 160 161 // RetrieverMutator is the interface that groups Retriever and Mutator interfaces. 162 type RetrieverMutator interface { 163 Retriever 164 Mutator 165 } 166 167 // MemBufferIterator is an Iterator with KeyFlags related functions. 168 type MemBufferIterator interface { 169 Iterator 170 HasValue() bool 171 Flags() KeyFlags 172 } 173 174 // MemBuffer is an in-memory ekv collection, can be used to buffer write operations. 175 type MemBuffer interface { 176 RetrieverMutator 177 178 // RLock locks the MemBuffer for shared read. 179 // In the most case, MemBuffer will only used by single goroutine, 180 // but it will be read by multiple goroutine when combined with interlock.UnionScanInterDirc. 181 // To avoid race introduced by interlock.UnionScanInterDirc, MemBuffer expose read dagger for it. 182 RLock() 183 // RUnlock unlocks the MemBuffer. 184 RUnlock() 185 186 // GetFlags returns the latest flags associated with key. 187 GetFlags(Key) (KeyFlags, error) 188 // IterWithFlags returns a MemBufferIterator. 189 IterWithFlags(k Key, upperBound Key) MemBufferIterator 190 // IterReverseWithFlags returns a reversed MemBufferIterator. 191 IterReverseWithFlags(k Key) MemBufferIterator 192 // SetWithFlags put key-value into the last active staging buffer with the given KeyFlags. 193 SetWithFlags(Key, []byte, ...FlagsOp) error 194 // UFIDelateFlags uFIDelate the flags associated with key. 
195 UFIDelateFlags(Key, ...FlagsOp) 196 197 // Reset reset the MemBuffer to initial states. 198 Reset() 199 // DiscardValues releases the memory used by all values. 200 // NOTE: any operation need value will panic after this function. 201 DiscardValues() 202 203 // Staging create a new staging buffer inside the MemBuffer. 204 // Subsequent writes will be temporarily stored in this new staging buffer. 205 // When you think all modifications looks good, you can call `Release` to public all of them to the upper level buffer. 206 Staging() StagingHandle 207 // Release publish all modifications in the latest staging buffer to upper level. 208 Release(StagingHandle) 209 // Cleanup cleanup the resources referenced by the StagingHandle. 210 // If the changes are not published by `Release`, they will be discarded. 211 Cleanup(StagingHandle) 212 // InspectStage used to inspect the value uFIDelates in the given stage. 213 InspectStage(StagingHandle, func(Key, KeyFlags, []byte)) 214 215 // SnapshotGetter returns a Getter for a snapshot of MemBuffer. 216 SnapshotGetter() Getter 217 // SnapshotIter returns a Iterator for a snapshot of MemBuffer. 218 SnapshotIter(k, upperbound Key) Iterator 219 220 // Size returns sum of keys and values length. 221 Size() int 222 // Len returns the number of entries in the EDB. 223 Len() int 224 // Dirty returns whether the root staging buffer is uFIDelated. 225 Dirty() bool 226 } 227 228 // Transaction defines the interface for operations inside a Transaction. 229 // This is not thread safe. 230 type Transaction interface { 231 RetrieverMutator 232 // Size returns sum of keys and values length. 233 Size() int 234 // Len returns the number of entries in the EDB. 235 Len() int 236 // Reset reset the Transaction to initial states. 237 Reset() 238 // Commit commits the transaction operations to KV causetstore. 239 Commit(context.Context) error 240 // Rollback undoes the transaction operations to KV causetstore. 
241 Rollback() error 242 // String implements fmt.Stringer interface. 243 String() string 244 // LockKeys tries to dagger the entries with the keys in KV causetstore. 245 LockKeys(ctx context.Context, lockCtx *LockCtx, keys ...Key) error 246 // SetOption sets an option with a value, when val is nil, uses the default 247 // value of this option. 248 SetOption(opt Option, val interface{}) 249 // DelOption deletes an option. 250 DelOption(opt Option) 251 // IsReadOnly checks if the transaction has only performed read operations. 252 IsReadOnly() bool 253 // StartTS returns the transaction start timestamp. 254 StartTS() uint64 255 // Valid returns if the transaction is valid. 256 // A transaction become invalid after commit or rollback. 257 Valid() bool 258 // GetMemBuffer return the MemBuffer binding to this transaction. 259 GetMemBuffer() MemBuffer 260 // GetSnapshot returns the Snapshot binding to this transaction. 261 GetSnapshot() Snapshot 262 // GetUnionStore returns the UnionStore binding to this transaction. 263 GetUnionStore() UnionStore 264 // SetVars sets variables to the transaction. 265 SetVars(vars *Variables) 266 // GetVars gets variables from the transaction. 267 GetVars() *Variables 268 // BatchGet gets ekv from the memory buffer of memex and transaction, and the ekv storage. 269 // Do not use len(value) == 0 or value == nil to represent non-exist. 270 // If a key doesn't exist, there shouldn't be any corresponding entry in the result map. 271 BatchGet(ctx context.Context, keys []Key) (map[string][]byte, error) 272 IsPessimistic() bool 273 } 274 275 // LockCtx contains information for LockKeys method. 
276 type LockCtx struct { 277 Killed *uint32 278 ForUFIDelateTS uint64 279 LockWaitTime int64 280 WaitStartTime time.Time 281 PessimisticLockWaited *int32 282 LockKeysDuration *int64 283 LockKeysCount *int32 284 ReturnValues bool 285 Values map[string]ReturnedValue 286 ValuesLock sync.Mutex 287 LockExpired *uint32 288 Stats *execdetails.LockKeysDetails 289 } 290 291 // ReturnedValue pairs the Value and AlreadyLocked flag for PessimisticLock return values result. 292 type ReturnedValue struct { 293 Value []byte 294 AlreadyLocked bool 295 } 296 297 // Client is used to send request to KV layer. 298 type Client interface { 299 // Send sends request to KV layer, returns a Response. 300 Send(ctx context.Context, req *Request, vars *Variables) Response 301 302 // IsRequestTypeSupported checks if reqType and subType is supported. 303 IsRequestTypeSupported(reqType, subType int64) bool 304 } 305 306 // ReqTypes. 307 const ( 308 ReqTypeSelect = 101 309 ReqTypeIndex = 102 310 ReqTypePosetDag = 103 311 ReqTypeAnalyze = 104 312 ReqTypeChecksum = 105 313 314 ReqSubTypeBasic = 0 315 ReqSubTypeDesc = 10000 316 ReqSubTypeGroupBy = 10001 317 ReqSubTypeTopN = 10002 318 ReqSubTypeSignature = 10003 319 ReqSubTypeAnalyzeIdx = 10004 320 ReqSubTypeAnalyzeDefCaus = 10005 321 ) 322 323 // StoreType represents the type of a causetstore. 324 type StoreType uint8 325 326 const ( 327 // EinsteinDB means the type of a causetstore is EinsteinDB. 328 EinsteinDB StoreType = iota 329 // TiFlash means the type of a causetstore is TiFlash. 330 TiFlash 331 // MilevaDB means the type of a causetstore is MilevaDB. 332 MilevaDB 333 // UnSpecified means the causetstore type is unknown 334 UnSpecified = 255 335 ) 336 337 // Name returns the name of causetstore type. 
338 func (t StoreType) Name() string { 339 if t == TiFlash { 340 return "tiflash" 341 } else if t == MilevaDB { 342 return "milevadb" 343 } else if t == EinsteinDB { 344 return "einsteindb" 345 } 346 return "unspecified" 347 } 348 349 // Request represents a ekv request. 350 type Request struct { 351 // Tp is the request type. 352 Tp int64 353 StartTs uint64 354 Data []byte 355 KeyRanges []KeyRange 356 357 // Concurrency is 1, if it only sends the request to a single storage unit when 358 // ResponseIterator.Next is called. If concurrency is greater than 1, the request will be 359 // sent to multiple storage units concurrently. 360 Concurrency int 361 // IsolationLevel is the isolation level, default is SI. 362 IsolationLevel IsoLevel 363 // Priority is the priority of this KV request, its value may be PriorityNormal/PriorityLow/PriorityHigh. 364 Priority int 365 // memTracker is used to trace and control memory usage in co-processor layer. 366 MemTracker *memory.Tracker 367 // KeepOrder is true, if the response should be returned in order. 368 KeepOrder bool 369 // Desc is true, if the request is sent in descending order. 370 Desc bool 371 // NotFillCache makes this request do not touch the LRU cache of the underlying storage. 372 NotFillCache bool 373 // SyncLog decides whether the WAL(write-ahead log) of this request should be synchronized. 374 SyncLog bool 375 // Streaming indicates using streaming API for this request, result in that one Next() 376 // call would not corresponds to a whole region result. 377 Streaming bool 378 // ReplicaRead is used for reading data from replicas, only follower is supported at this time. 379 ReplicaRead ReplicaReadType 380 // StoreType represents this request is sent to the which type of causetstore. 381 StoreType StoreType 382 // Cacheable is true if the request can be cached. Currently only deterministic PosetDag requests can be cached. 
383 Cacheable bool 384 // SchemaVer is for any schemaReplicant-ful storage to validate schemaReplicant correctness if necessary. 385 SchemaVar int64 386 // BatchCop indicates whether send batch interlock request to tiflash. 387 BatchCop bool 388 // TaskID is an unique ID for an execution of a memex 389 TaskID uint64 390 } 391 392 // ResultSubset represents a result subset from a single storage unit. 393 // TODO: Find a better interface for ResultSubset that can reuse bytes. 394 type ResultSubset interface { 395 // GetData gets the data. 396 GetData() []byte 397 // GetStartKey gets the start key. 398 GetStartKey() Key 399 // MemSize returns how many bytes of memory this result use for tracing memory usage. 400 MemSize() int64 401 // RespTime returns the response time for the request. 402 RespTime() time.Duration 403 } 404 405 // Response represents the response returned from KV layer. 406 type Response interface { 407 // Next returns a resultSubset from a single storage unit. 408 // When full result set is returned, nil is returned. 409 Next(ctx context.Context) (resultSubset ResultSubset, err error) 410 // Close response. 411 Close() error 412 } 413 414 // Snapshot defines the interface for the snapshot fetched from KV causetstore. 415 type Snapshot interface { 416 Retriever 417 // BatchGet gets a batch of values from snapshot. 418 BatchGet(ctx context.Context, keys []Key) (map[string][]byte, error) 419 // SetOption sets an option with a value, when val is nil, uses the default 420 // value of this option. Only ReplicaRead is supported for snapshot 421 SetOption(opt Option, val interface{}) 422 // DelOption deletes an option. 423 DelOption(opt Option) 424 } 425 426 // BatchGetter is the interface for BatchGet. 427 type BatchGetter interface { 428 // BatchGet gets a batch of values. 429 BatchGet(ctx context.Context, keys []Key) (map[string][]byte, error) 430 } 431 432 // Driver is the interface that must be implemented by a KV storage. 
433 type Driver interface { 434 // Open returns a new CausetStorage. 435 // The path is the string for storage specific format. 436 Open(path string) (CausetStorage, error) 437 } 438 439 // CausetStorage defines the interface for storage. 440 // Isolation should be at least SI(SNAPSHOT ISOLATION) 441 type CausetStorage interface { 442 // Begin transaction 443 Begin() (Transaction, error) 444 // BeginWithStartTS begins transaction with startTS. 445 BeginWithStartTS(startTS uint64) (Transaction, error) 446 // GetSnapshot gets a snapshot that is able to read any data which data is <= ver. 447 // if ver is MaxVersion or > current max committed version, we will use current version for this snapshot. 448 GetSnapshot(ver Version) (Snapshot, error) 449 // GetClient gets a client instance. 450 GetClient() Client 451 // Close causetstore 452 Close() error 453 // UUID return a unique ID which represents a CausetStorage. 454 UUID() string 455 // CurrentVersion returns current max committed version. 456 CurrentVersion() (Version, error) 457 // GetOracle gets a timestamp oracle client. 458 GetOracle() oracle.Oracle 459 // SupportDeleteRange gets the storage support delete range or not. 460 SupportDeleteRange() (supported bool) 461 // Name gets the name of the storage engine 462 Name() string 463 // Describe returns of brief introduction of the storage 464 Describe() string 465 // ShowStatus returns the specified status of the storage 466 ShowStatus(ctx context.Context, key string) (interface{}, error) 467 } 468 469 // FnKeyCmp is the function for iterator the keys 470 type FnKeyCmp func(key Key) bool 471 472 // Iterator is the interface for a iterator on KV causetstore. 473 type Iterator interface { 474 Valid() bool 475 Key() Key 476 Value() []byte 477 Next() error 478 Close() 479 } 480 481 // SplitblockStore is the ekv causetstore which supports split regions. 
type SplitblockStore interface {
	SplitRegions(ctx context.Context, splitKey [][]byte, scatter bool, blockID *int64) (regionID []uint64, err error)
	WaitScatterRegionFinish(ctx context.Context, regionID uint64, backOff int) error
	CheckRegionInScattering(regionID uint64) (bool, error)
}

// Wait-time values used for pessimistic locks.
// These two values are special in the lock protocol with einsteindb:
// 0 means always wait, -1 means no wait, and any other value is the lock wait time in milliseconds.
var (
	// LockAlwaysWait means always wait for the lock.
	LockAlwaysWait int64 = 0
	// LockNoWait means do not wait for the lock.
	LockNoWait int64 = -1
)