// Copyright 2017 Andy Kimball
// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package tscache

import (
	"bytes"
	"container/list"
	"context"
	"encoding/binary"
	"fmt"
	"sync/atomic"
	"time"
	"unsafe"

	"github.com/andy-kimball/arenaskl"
	"github.com/cockroachdb/cockroach/pkg/util"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
	"github.com/cockroachdb/errors"
)

// rangeOptions are passed to AddRange to indicate the bounds of the range. By
// default, the "from" and "to" keys are inclusive. Setting these bit flags
// indicates that one or both is exclusive instead.
type rangeOptions int

const (
	// excludeFrom indicates that the range does not include the starting key.
	excludeFrom = rangeOptions(1 << iota)

	// excludeTo indicates that the range does not include the ending key.
	excludeTo
)

// nodeOptions are meta tags on skiplist nodes that indicate the status and role
// of that node in the intervalSkl. The options are bit flags that can be
// independently added and removed.
//
// Each node in the intervalSkl holds a key and, optionally, the latest read
// timestamp for that key. In addition, the node optionally holds the latest
// read timestamp for the range of keys between itself and the next key that is
// present in the skiplist. This space between keys is called the "gap", and the
// timestamp for that range is called the "gap timestamp". Here is a simplified
// representation that would result after these ranges were added to an empty
// intervalSkl:
//
//   ["apple", "orange") = 200
//   ["kiwi", "raspberry"] = 100
//
//   "apple"    "orange"   "raspberry"
//   keyts=200  keyts=100  keyts=100
//   gapts=200  gapts=100  gapts=0
//
// That is, the range from apple (inclusive) to orange (exclusive) has a read
// timestamp of 200. The range from orange (inclusive) to raspberry (inclusive)
// has a read timestamp of 100. All other keys have a read timestamp of 0.
type nodeOptions int

const (
	// initialized indicates that the node has been created and fully
	// initialized. Key and gap values are final, and can now be used.
	initialized = 1 << iota

	// cantInit indicates that the node should never be allowed to initialize.
	// This is set on nodes which were unable to ratchet their values at some
	// point because of a full arena. In this case, the node's values should
	// never become final and any goroutines trying to initialize it will be
	// forced to create it again in a new page when they notice this flag.
	cantInit

	// hasKey indicates that the node has an associated key value. If this is
	// not set, then the key timestamp is assumed to be zero and the key is
	// assumed to not have a corresponding txnID.
	hasKey

	// hasGap indicates that the node has an associated gap value. If this is
	// not set, then the gap timestamp is assumed to be zero and the gap is
	// assumed to not have a corresponding txnID.
	hasGap
)

const (
	encodedTsSize    = int(unsafe.Sizeof(int64(0)) + unsafe.Sizeof(int32(0)))
	encodedTxnIDSize = int(unsafe.Sizeof(uuid.UUID{}))
	encodedValSize   = encodedTsSize + encodedTxnIDSize

	// initialSklPageSize is the initial size of each page in the sklImpl's
	// intervalSkl. The pages start small to limit the memory footprint of
	// the data structure for short-lived tests. Reducing this size can hurt
	// performance but it decreases the risk of OOM failures when many tests
	// are running concurrently.
	initialSklPageSize = 128 << 10 // 128 KB
	// maximumSklPageSize is the maximum size of each page in the sklImpl's
	// intervalSkl. A long-running server is expected to settle on pages of
	// this size under steady-state load.
	maximumSklPageSize = 32 << 20 // 32 MB

	defaultMinSklPages = 2
)

// initialSklAllocSize is the amount of space in its arena that an empty
// arenaskl.Skiplist consumes.
var initialSklAllocSize = func() int {
	a := arenaskl.NewArena(1000)
	_ = arenaskl.NewSkiplist(a)
	return int(a.Size())
}()

// intervalSkl efficiently tracks the latest logical time at which any key or
// range of keys has been accessed. Keys are binary values of any length, and
// times are represented as hybrid logical timestamps (see hlc package). The
// data structure guarantees that the read timestamp of any given key or range
// will never decrease. In other words, if a lookup returns timestamp A and
// repeating the same lookup returns timestamp B, then B >= A.
//
// Add and lookup operations do not block or interfere with one another, which
// enables predictable operation latencies. Also, the impact of the structure on
// the GC is virtually nothing, even when the structure is very large. These
// properties are enabled by employing a lock-free skiplist implementation that
// uses an arena allocator. Skiplist nodes refer to one another by offset into
// the arena rather than by pointer, so the GC has very few objects to track.
//
// The data structure can conceptually be thought of as being parameterized over
// a key and a value type, such that the key implements a Comparable interface
// (see interval.Comparable) and the value implements a Ratchetable interface:
//
//   type Ratchetable interface {
//     Ratchet(other Ratchetable) (changed bool)
//   }
//
// In other words, if Go supported zero-cost abstractions, this type might look
// like:
//
//   type intervalSkl<K: Comparable, V: Ratchetable>
//
type intervalSkl struct {
	// rotMutex synchronizes page rotation with all other operations. The read
	// lock is acquired by the Add and Lookup operations. The write lock is
	// acquired only when the pages are rotated. Since that is very rare, the
	// vast majority of operations can proceed without blocking.
	rotMutex syncutil.RWMutex

	// The following fields are used to enforce a minimum retention window on
	// all timestamp intervals. intervalSkl promises to retain all timestamp
	// intervals until they are at least this old before allowing the floor
	// timestamp to ratchet and subsume them. If clock is nil then no minimum
	// retention policy will be employed.
	clock  *hlc.Clock
	minRet time.Duration

	// The size of the last allocated page in the data structure, in bytes. When
	// a page fills, a new page will be allocated, the pages will be rotated, and
	// older entries will be discarded. Page sizes grow exponentially as pages
	// are allocated up to a maximum of maximumSklPageSize. The value will never
	// regress over the lifetime of an intervalSkl instance.
	//
	// The entire data structure is typically bound to a maximum size of
	// maximumSklPageSize*minPages. However, this limit can be violated if the
	// intervalSkl needs to grow larger to enforce a minimum retention policy.
	pageSize      uint32
	pageSizeFixed bool // testing only

	// The linked list maintains fixed-size skiplist pages, ordered by creation
	// time such that the first page is the one most recently created. When the
	// first page fills, a new empty page is prepended to the front of the list
	// and all others are pushed back. This first page is the only sklPage that
	// is written to, all others are immutable after they have left the front of
	// the list. However, earlier pages are accessed whenever necessary during
	// lookups. Pages are evicted when they become too old, subject to the
	// minimum retention policy described above.
	pages    list.List // List<*sklPage>
	minPages int

	// In order to ensure that timestamps never decrease, intervalSkl maintains
	// a floor timestamp, which is the minimum timestamp that can be returned by
	// the lookup operations. When the earliest page is discarded, its current
	// maximum timestamp becomes the new floor timestamp for the overall
	// intervalSkl.
	floorTS hlc.Timestamp

	metrics sklMetrics
}

// newIntervalSkl creates a new interval skiplist with the given minimum
// retention duration and the maximum size.
func newIntervalSkl(clock *hlc.Clock, minRet time.Duration, metrics sklMetrics) *intervalSkl {
	s := intervalSkl{
		clock:    clock,
		minRet:   minRet,
		pageSize: initialSklPageSize / 2, // doubled in pushNewPage
		minPages: defaultMinSklPages,
		metrics:  metrics,
	}
	s.pushNewPage(0 /* maxWallTime */, nil /* arena */)
	s.metrics.Pages.Update(1)
	return &s
}

// Add marks a single key as having been read at the given timestamp. Once
// Add completes, future lookups of this key are guaranteed to return an equal
// or greater timestamp.
func (s *intervalSkl) Add(key []byte, val cacheValue) {
	s.AddRange(nil, key, 0, val)
}

// AddRange marks the given range of keys [from, to] as having been read at the
// given timestamp. The starting and ending points of the range are inclusive by
// default, but can be excluded by passing the applicable range options. nil can
// be passed as the "from" key, in which case only the end key will be added.
// nil can also be passed as the "to" key, in which case an open range will be
// added spanning [from, infinity). However, it is illegal to pass nil for both
// "from" and "to". It is also illegal for "from" > "to", which would be an
// inverted range.
//
// intervalSkl defines the domain of possible keys to span ["", nil). A range
// with a starting key of []byte("") is treated as a closed range beginning at
// the minimum key. A range with an ending key of []byte(nil) is treated as an
// open range extending to infinity (as such, excludeTo has no effect on it). A
// range starting at []byte("") and ending at []byte(nil) will span all keys.
//
// If some or all of the range was previously read at a higher timestamp, then
// the range is split into sub-ranges that are each marked with the maximum read
// timestamp for that sub-range. Once AddRange completes, future lookups at any
// point in the range are guaranteed to return an equal or greater timestamp.
func (s *intervalSkl) AddRange(from, to []byte, opt rangeOptions, val cacheValue) {
	if from == nil && to == nil {
		panic("from and to keys cannot be nil")
	}
	if encodedRangeSize(from, to, opt) > int(s.maximumPageSize())-initialSklAllocSize {
		// Without this check, we could fall into an infinite page rotation loop
		// if a range would take up more space than available in an empty page.
		panic("key range too large to fit in any page")
	}

	if to != nil {
		cmp := 0
		if from != nil {
			cmp = bytes.Compare(from, to)
		}

		switch {
		case cmp > 0:
			// Starting key is after ending key. This shouldn't happen. Determine
			// the index where the keys diverged and panic.
			d := 0
			for d < len(from) && d < len(to) {
				if from[d] != to[d] {
					break
				}
				d++
			}
			msg := fmt.Sprintf("inverted range (issue #32149): key lens = [%d,%d), diff @ index %d",
				len(from), len(to), d)
			log.Errorf(context.Background(), "%s, [%s,%s)", msg, from, to)
			panic(log.Safe(msg))
		case cmp == 0:
			// Starting key is same as ending key, so just add a single node.
			if opt == (excludeFrom | excludeTo) {
				// Both from and to keys are excluded, so range is zero length.
				return
			}

			// Just add the ending key.
			from = nil
			opt = 0
		}
	}

	for {
		// Try to add the range to the later page.
		filledPage := s.addRange(from, to, opt, val)
		if filledPage == nil {
			break
		}

		// The page was filled up, so rotate the pages and then try again.
		s.rotatePages(filledPage)
	}
}

// addRange marks the given range of keys [from, to] as having been read at the
// given timestamp. The key range and the rangeOptions observe the same behavior
// as is specified for AddRange above. Notably, addRange treats nil "from" and
// "to" arguments in accordance with AddRange's contract. It returns nil if the
// operation was successful, or a pointer to an sklPage if the operation failed
// because that page was full.
func (s *intervalSkl) addRange(from, to []byte, opt rangeOptions, val cacheValue) *sklPage {
	// Acquire the rotation mutex read lock so that the page will not be rotated
	// while add or lookup operations are in progress.
	s.rotMutex.RLock()
	defer s.rotMutex.RUnlock()

	// If floor ts is >= requested timestamp, then no need to perform a search
	// or add any records.
	if val.ts.LessEq(s.floorTS) {
		return nil
	}

	fp := s.frontPage()

	var it arenaskl.Iterator
	it.Init(fp.list)

	// Start by ensuring that the ending node has been created (unless "to" is
	// nil, in which case the range extends indefinitely). Do this before creating
	// the start node, so that the range won't extend past the end point during
	// the period between creating the two endpoints. Since we need the ending node
	// to be initialized before creating the starting node, we pass mustInit = true.
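	//
	// For example, when adding the range ["b", "d"], the gap value installed
	// on the "b" node covers all keys up to the next node present in the
	// skiplist. If "d" did not already exist, that gap could temporarily
	// extend past "d" to whichever node happens to follow, so "d" is created
	// (and initialized) first to bound the gap.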
	var err error
	if to != nil {
		if (opt & excludeTo) == 0 {
			err = fp.addNode(&it, to, val, hasKey, true /* mustInit */)
		} else {
			err = fp.addNode(&it, to, val, 0, true /* mustInit */)
		}

		if errors.Is(err, arenaskl.ErrArenaFull) {
			return fp
		}
	}

	// If from is nil, then the "range" is just a single key. We already
	// asserted above that if from == nil then to != nil.
	if from == nil {
		return nil
	}

	// Ensure that the starting node has been created.
	if (opt & excludeFrom) == 0 {
		err = fp.addNode(&it, from, val, hasKey|hasGap, false /* mustInit */)
	} else {
		err = fp.addNode(&it, from, val, hasGap, false /* mustInit */)
	}

	if errors.Is(err, arenaskl.ErrArenaFull) {
		return fp
	}

	// Seek to the node immediately after the "from" node.
	//
	// If there are no nodes after the "from" node (only possible if to == nil),
	// then ensureFloorValue below will be a no-op because no other nodes need
	// to be adjusted.
	if !it.Valid() || !bytes.Equal(it.Key(), from) {
		// We will only reach this state if we didn't need to add a node at
		// "from" due to the previous gap value being larger than val. The fast
		// path for this case is in sklPage.addNode. For all other times, adding
		// the new node will have positioned the iterator at "from".
		//
		// If Seek returns false then we're already at the following node, so
		// there's no need to call Next.
		if it.Seek(from) {
			it.Next()
		}
	} else {
		it.Next()
	}

	// Now iterate forwards and ensure that all nodes between the start and
	// end (exclusive) have timestamps that are >= the range timestamp. end
	// is exclusive because we already added a node at that key.
	if !fp.ensureFloorValue(&it, to, val) {
		// Page is filled up, so rotate pages and try again.
		return fp
	}

	return nil
}

// frontPage returns the front page of the intervalSkl.
func (s *intervalSkl) frontPage() *sklPage {
	return s.pages.Front().Value.(*sklPage)
}

// pushNewPage prepends a new empty page to the front of the pages list. It
// accepts an optional arena argument to facilitate re-use.
func (s *intervalSkl) pushNewPage(maxWallTime int64, arena *arenaskl.Arena) {
	size := s.nextPageSize()
	if arena != nil && arena.Cap() == size {
		// Re-use the provided arena, if possible.
		arena.Reset()
	} else {
		// Otherwise, construct a new memory arena.
		arena = arenaskl.NewArena(size)
	}
	p := newSklPage(arena)
	p.maxWallTime = maxWallTime
	s.pages.PushFront(p)
}

// nextPageSize returns the size that the next allocated page should use.
func (s *intervalSkl) nextPageSize() uint32 {
	if s.pageSizeFixed || s.pageSize == maximumSklPageSize {
		return s.pageSize
	}
	s.pageSize *= 2
	if s.pageSize > maximumSklPageSize {
		s.pageSize = maximumSklPageSize
	}
	return s.pageSize
}

// maximumPageSize returns the maximum page size that this instance of the
// intervalSkl will be able to accommodate. The method takes into consideration
// whether the page size is fixed or dynamic.
func (s *intervalSkl) maximumPageSize() uint32 {
	if s.pageSizeFixed {
		return s.pageSize
	}
	return maximumSklPageSize
}

// rotatePages makes the later page the earlier page, and then discards the
// earlier page. The max timestamp of the earlier page becomes the new floor
// timestamp, in order to guarantee that timestamp lookups never return
// decreasing values.
func (s *intervalSkl) rotatePages(filledPage *sklPage) {
	// Acquire the rotation mutex write lock to lock the entire intervalSkl.
	s.rotMutex.Lock()
	defer s.rotMutex.Unlock()

	fp := s.frontPage()
	if filledPage != fp {
		// Another thread already rotated the pages, so don't do anything more.
		return
	}

	// Determine the minimum timestamp a page must contain to be within the
	// minimum retention window. If clock is nil, we have no minimum retention
	// window.
	minTSToRetain := hlc.MaxTimestamp
	if s.clock != nil {
		minTSToRetain = s.clock.Now()
		minTSToRetain.WallTime -= s.minRet.Nanoseconds()
	}

	// Iterate over the pages in reverse, evicting pages that are no longer
	// needed and ratcheting up the floor timestamp in the process.
	//
	// If possible, keep a reference to an evicted page's arena so that we can
	// re-use it. This is safe because we're holding the rotation mutex write
	// lock, so there cannot be concurrent readers and no reader will ever
	// access evicted pages once we unlock.
	back := s.pages.Back()
	var oldArena *arenaskl.Arena
	for s.pages.Len() >= s.minPages {
		bp := back.Value.(*sklPage)
		bpMaxTS := hlc.Timestamp{WallTime: bp.maxWallTime}
		if minTSToRetain.LessEq(bpMaxTS) {
			// The back page's maximum timestamp is within the time
			// window we've promised to retain, so we can't evict it.
			break
		}

		// Max timestamp of the back page becomes the new floor timestamp.
		s.floorTS.Forward(bpMaxTS)

		// Evict the page.
		oldArena = bp.list.Arena()
		evict := back
		back = back.Prev()
		s.pages.Remove(evict)
	}

	// Push a new empty page on the front of the pages list. We give this page
	// the maxWallTime of the old front page. This assures that the maxWallTime
	// for a page is always equal to or greater than that for all earlier pages.
	// In other words, it assures that the maxWallTime for a page is not only
	// the maximum timestamp for all values it contains, but also for all values
	// any earlier pages contain.
	s.pushNewPage(fp.maxWallTime, oldArena)

	// Update metrics.
	s.metrics.Pages.Update(int64(s.pages.Len()))
	s.metrics.PageRotations.Inc(1)
}

// LookupTimestamp returns the latest timestamp value at which the given key was
// read. If this operation is repeated with the same key, it will always result
// in an equal or greater timestamp.
func (s *intervalSkl) LookupTimestamp(key []byte) cacheValue {
	return s.LookupTimestampRange(nil, key, 0)
}

// LookupTimestampRange returns the latest timestamp value of any key within the
// specified range. If this operation is repeated with the same range, it will
// always result in an equal or greater timestamp.
func (s *intervalSkl) LookupTimestampRange(from, to []byte, opt rangeOptions) cacheValue {
	if from == nil && to == nil {
		panic("from and to keys cannot be nil")
	}

	// Acquire the rotation mutex read lock so that the page will not be rotated
	// while add or lookup operations are in progress.
	s.rotMutex.RLock()
	defer s.rotMutex.RUnlock()

	// Iterate over the pages, performing the lookup on each and remembering the
	// maximum value we've seen so far.
	var val cacheValue
	for e := s.pages.Front(); e != nil; e = e.Next() {
		p := e.Value.(*sklPage)

		// If the maximum value's timestamp is greater than the max timestamp in
		// the current page, then there's no need to do the lookup in this page.
		// There's also no reason to do the lookup in any earlier pages either,
		// because rotatePages assures that a page will never have a max
		// timestamp smaller than that of any page earlier than it.
		//
		// NB: if the max timestamp of the current page is equal to the maximum
		// value's timestamp, then we still need to perform the lookup. This is
		// because the current page's max timestamp _may_ (if the hlc.Timestamp
		// ceil operation in sklPage.ratchetMaxTimestamp was a no-op) correspond
		// to a real range's timestamp, and this range _may_ overlap with our
		// lookup range. If that is the case and that other range has a
		// different txnID than our current cacheValue result (val), then we
		// need to remove the txnID from our result, per the ratcheting policy
		// for cacheValues. This is tested in TestIntervalSklMaxPageTS.
		maxTS := hlc.Timestamp{WallTime: atomic.LoadInt64(&p.maxWallTime)}
		if maxTS.Less(val.ts) {
			break
		}

		val2 := p.lookupTimestampRange(from, to, opt)
		val, _ = ratchetValue(val, val2)
	}

	// Return the higher value from the page lookups and the floor timestamp.
	floorVal := cacheValue{ts: s.floorTS, txnID: noTxnID}
	val, _ = ratchetValue(val, floorVal)

	return val
}

// FloorTS returns the receiver's floor timestamp.
func (s *intervalSkl) FloorTS() hlc.Timestamp {
	s.rotMutex.RLock()
	defer s.rotMutex.RUnlock()
	return s.floorTS
}

// sklPage maintains a skiplist based on a fixed-size arena. When the arena has
// filled up, it returns arenaskl.ErrArenaFull. At that point, a new fixed page
// must be allocated and used instead.
type sklPage struct {
	list        *arenaskl.Skiplist
	maxWallTime int64 // accessed atomically
	isFull      int32 // accessed atomically
}

func newSklPage(arena *arenaskl.Arena) *sklPage {
	return &sklPage{list: arenaskl.NewSkiplist(arena)}
}

func (p *sklPage) lookupTimestampRange(from, to []byte, opt rangeOptions) cacheValue {
	if to != nil {
		cmp := 0
		if from != nil {
			cmp = bytes.Compare(from, to)
		}

		if cmp > 0 {
			// Starting key is after ending key, so range is zero length.
			return cacheValue{}
		}
		if cmp == 0 {
			// Starting key is same as ending key.
			if opt == (excludeFrom | excludeTo) {
				// Both from and to keys are excluded, so range is zero length.
				return cacheValue{}
			}

			// Scan over a single key.
			from = to
			opt = 0
		}
	}

	var it arenaskl.Iterator
	it.Init(p.list)
	it.SeekForPrev(from)

	return p.maxInRange(&it, from, to, opt)
}

// addNode adds a new node at key with the provided value if one does not exist.
// If one does exist, it ratchets the existing node's value instead.
//
// If the mustInit flag is set, the function will ensure that the node is
// initialized by the time the method returns, even if a different goroutine
// created the node. If the flag is not set and a different goroutine created
// the node, the method won't try to help.
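//
// If the arena backing this page is too full to create or ratchet the node,
// addNode returns arenaskl.ErrArenaFull so that the caller can rotate to a
// new page and retry.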
func (p *sklPage) addNode(
	it *arenaskl.Iterator, key []byte, val cacheValue, opt nodeOptions, mustInit bool,
) error {
	// Array with constant size will remain on the stack.
	var arr [encodedValSize * 2]byte
	var keyVal, gapVal cacheValue

	if (opt & hasKey) != 0 {
		keyVal = val
	}

	if (opt & hasGap) != 0 {
		gapVal = val
	}

	if !it.SeekForPrev(key) {
		// The key was not found. Scan for the previous gap value.
		prevGapVal := p.incomingGapVal(it, key)

		var err error
		if it.Valid() && bytes.Equal(it.Key(), key) {
			// Another thread raced and added a node at key while we were
			// scanning backwards. Ratchet the new node.
			err = arenaskl.ErrRecordExists
		} else {
			// There is still no node at key. If the previous node has a gap
			// value that would not be updated with the new value, then there is
			// no need to add another node, since its timestamp would be the
			// same as the gap timestamp and its txnID would be the same as the
			// gap txnID.
			if _, update := ratchetValue(prevGapVal, val); !update {
				return nil
			}

			// Ratchet max timestamp before adding the node.
			p.ratchetMaxTimestamp(val.ts)

			// Ensure that a new node is created. It needs to stay in the
			// initializing state until the gap value of its preceding node
			// has been found and used to ratchet this node's value. During
			// the search for the gap value, this node acts as a sentinel
			// for other ongoing operations - when they see this node they're
			// forced to stop and ratchet its value before they can continue.
			b, meta := encodeValueSet(arr[:0], keyVal, gapVal)
			err = it.Add(key, b, meta)
		}

		switch {
		case errors.Is(err, arenaskl.ErrArenaFull):
			atomic.StoreInt32(&p.isFull, 1)
			return err
		case errors.Is(err, arenaskl.ErrRecordExists):
			// Another thread raced and added the node, so just ratchet its
			// values instead (down below).
		case err == nil:
			// Add was successful, so finish initialization by scanning for the
			// gap value and using it to ratchet the new node's values.
			return p.ensureInitialized(it, key)
		default:
			panic(fmt.Sprintf("unexpected error: %v", err))
		}
	}

	// If mustInit is set to true then we're promising that the node will be
	// initialized by the time this method returns. Ensure this by helping out
	// the goroutine that created the node.
	if (it.Meta()&initialized) == 0 && mustInit {
		if err := p.ensureInitialized(it, key); err != nil {
			return err
		}
	}

	// Ratchet up the timestamps on the existing node, but don't set the
	// initialized bit. If mustInit is set then we already made sure the node
	// was initialized. If mustInit is not set then we don't require it to be
	// initialized.
	if opt == 0 {
		// Don't need to set either key or gap value, so done.
		return nil
	}
	return p.ratchetValueSet(it, always, keyVal, gapVal, false /* setInit */)
}

// ensureInitialized ensures that the node at the specified key is initialized.
// It does so by first scanning backwards to the first initialized node and
// using its gap value as the initial "previous gap value". It then scans
// forward until it reaches the desired key, ratcheting any uninitialized nodes
// it encounters (but not initializing them), and updating the candidate
// "previous gap value" as it goes. Finally, it initializes the node with the
// "previous gap value".
//
// Iterating backwards and then forwards solves potential race conditions with
// other threads. During backwards iteration, other goroutines can be inserting
// new nodes between the previous node and the lookup node, which could change
// the choice for the "previous gap value". The solution is two-fold:
//
// 1. Add new nodes in two phases - initializing and then initialized. Nodes in
//    the initializing state act as a synchronization point between goroutines
//    that are adding a particular node and goroutines that are scanning for gap
//    values. Scanning goroutines encounter the initializing nodes and are
//    forced to ratchet them before continuing. If they fail to ratchet them
//    because an arena is full, the nodes must never be initialized so they are
//    set to cantInit. This is critical for correctness, because if one of these
//    initializing nodes was not ratcheted when encountered during a forward
//    scan and later initialized, we could see a ratchet inversion. For example,
//    the inversion would occur if:
//    - 1: a goroutine is scanning forwards after finding a previous gap value
//         from node A in which it plans to initialize node C.
//    - 2: node B is created and initialized between node A and node C with a
//         larger value than either.
//    - 1: the iterator scanning forwards to node C is already past node B when
//         it is created.
//    - 3: a lookup for the timestamp of node C comes in. Since it's not
//         initialized, it uses node B's gap value.
//    - 1: the iterator reaches node C and initializes it with node A's gap
//         value, which is smaller than node B's.
//    - 4: another lookup for the timestamp of node C comes in. It returns the
//         node's newly initialized value, which is smaller than the one it
//         reported before.
//    Ratcheting initializing nodes with the current gap value when they are
//    encountered avoids this race.
//
//    However, only a goroutine that saw a node in an uninitialized state before
//    scanning backwards can switch it from initializing to initialized. This
//    enforces a "happens-before" relationship between the creation of a node
//    and the discovery of the gap value that is used when initializing it. If
//    any goroutine was able to initialize a node, then this relationship would
//    not exist and we could experience races where a newly inserted node A's
//    call to ensureFloorValue could come before the insertion of a node B, but
//    node B could be initialized with a gap value discovered before the
//    insertion of node A. For more on this, see the discussion in #19672.
//
// 2. After the gap value of the first initialized node with a key less than or
//    equal to the desired key has been found, the scanning goroutine will scan
//    forwards until it reaches the original key. It will ratchet any
//    uninitialized nodes along the way and inherit the gap value from them as
//    it goes. By the time it reaches the original key, it has a valid gap
//    value, which we have called the "previous gap value". At this point, if
//    the node at key is uninitialized, the node can be initialized with the
//    "previous gap value".
//
// It is an error to call ensureInitialized on a key without a node. When
// finished, the iterator will be positioned the same as if it.Seek(key) had
// been called.
func (p *sklPage) ensureInitialized(it *arenaskl.Iterator, key []byte) error {
	// Determine the incoming gap value.
	prevGapVal := p.incomingGapVal(it, key)

	// Make sure we're on the right key again.
	if util.RaceEnabled && !bytes.Equal(it.Key(), key) {
		panic("no node found")
	}

	// If the node isn't initialized, initialize it.
	return p.ratchetValueSet(it, onlyIfUninitialized, prevGapVal, prevGapVal, true /* setInit */)
}

// ensureFloorValue scans from the current position of the iterator to the
// provided key, ratcheting all initialized or uninitialized nodes as it goes
// with the provided value. It returns a boolean indicating whether it was
// successful (true) or whether it saw an ErrArenaFull while ratcheting (false).
func (p *sklPage) ensureFloorValue(it *arenaskl.Iterator, to []byte, val cacheValue) bool {
	for it.Valid() {
		util.RacePreempt()

		// If "to" is not nil (nil would mean an open range), then it is
		// treated as an exclusive bound.
		if to != nil && bytes.Compare(it.Key(), to) >= 0 {
			break
		}

		if atomic.LoadInt32(&p.isFull) == 1 {
			// Page is full, so stop iterating. The caller will then be able to
			// release the read lock and rotate the pages. Not doing this could
			// result in forcing all other operations to wait for this thread to
			// completely finish iteration. That could take a long time if this
			// range is very large.
			return false
		}

		// Don't clear the initialization bit, since we don't have the gap
		// timestamp from the previous node, and don't need an initialized node
		// for this operation anyway.
		err := p.ratchetValueSet(it, always, val, val, false /* setInit */)
		switch {
		case err == nil:
			// Continue scanning.
		case errors.Is(err, arenaskl.ErrArenaFull):
			// Page is too full to ratchet value, so stop iterating.
			return false
		default:
			panic(fmt.Sprintf("unexpected error: %v", err))
		}

		it.Next()
	}

	return true
}

func (p *sklPage) ratchetMaxTimestamp(ts hlc.Timestamp) {
	// Cheat and just use the max wall time portion of the timestamp, since it's
	// fine for the max timestamp to be a bit too large. This is the case
	// because it's always safe to increase the timestamp in a range. It's also
	// always safe to remove the transaction ID from a range. Either of these
	// changes may force a transaction to lose "ownership" over a range of keys,
	// but they'll never allow a transaction to gain "ownership" over a range of
	// keys that it wouldn't otherwise have. In other words, it's ok for the
	// intervalSkl to produce false negatives but never ok for it to produce
	// false positives.
	//
	// We could use an atomic.Value to store a "MaxValue" cacheValue for a given
	// page, but this would be more expensive and it's not clear that it would
	// be worth it.
	new := ts.WallTime
	if ts.Logical > 0 {
		new++
	}

	for {
		old := atomic.LoadInt64(&p.maxWallTime)
		if new <= old {
			break
		}

		if atomic.CompareAndSwapInt64(&p.maxWallTime, old, new) {
			break
		}
	}
}

// ratchetPolicy defines the behavior that a ratcheting attempt should take when
// trying to ratchet a node. Certain operations require nodes to be ratcheted
// regardless of whether they're already initialized or not. Other operations
// only want nodes that are uninitialized to be ratcheted.
type ratchetPolicy bool

const (
	// always is a policy to ratchet a node regardless of whether it is already
	// initialized or not.
	always ratchetPolicy = false
	// onlyIfUninitialized is a policy to only ratchet a node if it has not been
	// initialized yet.
	onlyIfUninitialized ratchetPolicy = true
)

// ratchetValueSet will update the current node's key and gap values to the
// maximum of their current values or the given values. If setInit is true, then
// the initialized bit will be set, indicating that the node is now fully
// initialized and its values can now be relied upon.
//
// The method will return ErrArenaFull if the arena was too full to ratchet the
// node's value set. In that case, the node will be marked with the "cantInit"
// flag because its values should never be trusted in isolation.
func (p *sklPage) ratchetValueSet(
	it *arenaskl.Iterator, policy ratchetPolicy, keyVal, gapVal cacheValue, setInit bool,
) error {
	// Array with constant size will remain on the stack.
	var arr [encodedValSize * 2]byte

	for {
		util.RacePreempt()

		meta := it.Meta()
		inited := (meta & initialized) != 0
		if inited && policy == onlyIfUninitialized {
			// If the node is already initialized and the policy is
			// onlyIfUninitialized, return. If this isn't the first ratcheting
			// attempt then we must have raced with node initialization before.
			return nil
		}
		if (meta & cantInit) != 0 {
			// If the meta has the cantInit flag set to true, we fail with an
			// ErrArenaFull error to force the current goroutine to retry on a
			// new page.
			return arenaskl.ErrArenaFull
		}

		newMeta := meta
		updateInit := setInit && !inited
		if updateInit {
			newMeta |= initialized
		}

		var keyValUpdate, gapValUpdate bool
		oldKeyVal, oldGapVal := decodeValueSet(it.Value(), meta)
		keyVal, keyValUpdate = ratchetValue(oldKeyVal, keyVal)
		gapVal, gapValUpdate = ratchetValue(oldGapVal, gapVal)
		updateVals := keyValUpdate || gapValUpdate

		if updateVals {
			// If we're updating the values (and maybe the init flag) then we
			// need to call it.Set. This can return an ErrArenaFull, which we
			// must handle with care.

			// Ratchet the max timestamp.
			keyTs, gapTs := keyVal.ts, gapVal.ts
			if gapTs.Less(keyTs) {
				p.ratchetMaxTimestamp(keyTs)
			} else {
				p.ratchetMaxTimestamp(gapTs)
			}

			// Remove the hasKey and hasGap flags from the meta. These will be
			// replaced below.
			newMeta &^= (hasKey | hasGap)

			// Update the values, possibly preserving the init bit.
			b, valMeta := encodeValueSet(arr[:0], keyVal, gapVal)
			newMeta |= valMeta

			err := it.Set(b, newMeta)
			switch {
			case err == nil:
				// Success.
				return nil
			case errors.Is(err, arenaskl.ErrRecordUpdated):
				// Record was updated by another thread, so restart ratchet attempt.
				continue
			case errors.Is(err, arenaskl.ErrArenaFull):
				// The arena was full which means that we were unable to ratchet
				// the value of this node. Mark the page as full and make sure
				// that the node is moved to the "cantInit" state if it hasn't
				// been initialized yet. This is critical because if the node
				// was initialized after this, its value set would be relied
				// upon to stand on its own even though it would be missing the
				// ratcheting we tried to perform here.
				atomic.StoreInt32(&p.isFull, 1)

				if !inited && (meta&cantInit) == 0 {
					err := it.SetMeta(meta | cantInit)
					switch {
					case errors.Is(err, arenaskl.ErrRecordUpdated):
						// Record was updated by another thread, so restart
						// ratchet attempt.
						continue
					case errors.Is(err, arenaskl.ErrArenaFull):
						panic(fmt.Sprintf("SetMeta with larger meta should not return %v", err))
					}
				}
				return arenaskl.ErrArenaFull
			default:
				panic(fmt.Sprintf("unexpected error: %v", err))
			}
		} else if updateInit {
			// If we're only updating the init flag and not the values, we can
			// use it.SetMeta instead of it.Set, which avoids allocating new
			// chunks in the arena.
			err := it.SetMeta(newMeta)
			switch {
			case err == nil:
				// Success.
				return nil
			case errors.Is(err, arenaskl.ErrRecordUpdated):
				// Record was updated by another thread, so restart ratchet attempt.
				continue
			case errors.Is(err, arenaskl.ErrArenaFull):
				panic(fmt.Sprintf("SetMeta with larger meta should not return %v", err))
			default:
				panic(fmt.Sprintf("unexpected error: %v", err))
			}
		} else {
			return nil
		}
	}
}

// maxInRange scans the range of keys between from and to and returns the
// maximum (initialized or uninitialized) value found. When finished, the
// iterator will be positioned the same as if it.Seek(to) had been called.
func (p *sklPage) maxInRange(it *arenaskl.Iterator, from, to []byte, opt rangeOptions) cacheValue {
	// Determine the previous gap value. This will move the iterator to the
	// first node >= from.
	prevGapVal := p.incomingGapVal(it, from)

	if !it.Valid() {
		// No more nodes.
		return prevGapVal
	} else if bytes.Equal(it.Key(), from) {
		// Found a node at from.
		if (it.Meta() & initialized) != 0 {
			// The node was initialized. Ignore the previous gap value.
			prevGapVal = cacheValue{}
		}
	} else {
		// No node at from. Remove the excludeFrom option.
		opt &^= excludeFrom
	}

	// Scan the rest of the way. Notice that we provide the previous gap value.
	// This is important for two reasons:
	//  1. it will be counted towards the maxVal result.
	//  2. it will be used to ratchet uninitialized nodes that the scan sees
	//     before any initialized nodes.
	_, maxVal := p.scanTo(it, to, opt, prevGapVal)
	return maxVal
}

// incomingGapVal determines the gap value active at the specified key by first
// scanning backwards to the first initialized node and then scanning forwards
// to the specified key. If there is already a node at key then the previous gap
// value will be returned. When finished, the iterator will be positioned the
// same as if it.Seek(key) had been called.
//
// During forward iteration, if another goroutine inserts a new gap node in the
// interval between the previous node and the original key, then either:
//
// 1. The forward iteration finds it and looks up its gap value. That node's gap
//    value now becomes the new "previous gap value", and iteration continues.
//
// 2. The new node is created after the iterator has moved past its position. As
//    part of node creation, the creator had to scan backwards to find the gap
//    value of the previous node. It is guaranteed to find a gap value that is
//    >= the gap value found by the original goroutine.
//
// This means that no matter what gets inserted, or when it gets inserted, the
// scanning goroutine is guaranteed to end up with a value that will never
// decrease on future lookups, which is the critical invariant.
func (p *sklPage) incomingGapVal(it *arenaskl.Iterator, key []byte) cacheValue {
	// Iterate backwards to the nearest initialized node.
	prevInitNode(it)

	// Iterate forwards to key, remembering the last gap value.
	prevGapVal, _ := p.scanTo(it, key, 0, cacheValue{})
	return prevGapVal
}

// scanTo scans from the current iterator position until the key "to". While
// scanning, any uninitialized values are ratcheted with the current gap value,
// which is essential to avoiding ratchet inversions (see the comment on
// ensureInitialized).
//
// The function then returns the maximum value seen along with the gap value at
// the end of the scan. If the iterator is positioned at a key > "to", the
// function will return zero values. The function takes an optional initial gap
// value argument, which is used to initialize the running maximum and gap
// values. When finished, the iterator will be positioned the same as if
// it.Seek(to) had been called.
func (p *sklPage) scanTo(
	it *arenaskl.Iterator, to []byte, opt rangeOptions, initGapVal cacheValue,
) (prevGapVal, maxVal cacheValue) {
	prevGapVal, maxVal = initGapVal, initGapVal
	first := true
	for {
		util.RacePreempt()

		if !it.Valid() {
			// No more nodes, which can happen for open ranges.
			return
		}

		toCmp := bytes.Compare(it.Key(), to)
		if to == nil {
			// to == nil means open range, so toCmp will always be -1.
			toCmp = -1
		}
		if toCmp > 0 || (toCmp == 0 && (opt&excludeTo) != 0) {
			// Past the end key or we don't want to consider the end key.
			return
		}

		// Ratchet uninitialized nodes. We pass onlyIfUninitialized, so if
		// the node is already initialized then this is a no-op.
		ratchetErr := p.ratchetValueSet(it, onlyIfUninitialized,
			prevGapVal, prevGapVal, false /* setInit */)

		// Decode the current node's value set.
		keyVal, gapVal := decodeValueSet(it.Value(), it.Meta())
		if errors.Is(ratchetErr, arenaskl.ErrArenaFull) {
			// If we failed to ratchet an uninitialized node above, the desired
			// ratcheting won't be reflected in the decoded values. Perform the
			// ratcheting manually.
			keyVal, _ = ratchetValue(keyVal, prevGapVal)
			gapVal, _ = ratchetValue(gapVal, prevGapVal)
		}

		if !(first && (opt&excludeFrom) != 0) {
			// Unless this is the first key and opt says to exclude the first
			// key, ratchet the maxVal with the key value.
			maxVal, _ = ratchetValue(maxVal, keyVal)
		}

		if toCmp == 0 {
			// We're on the scan's end key, so return the max value seen.
			return
		}

		// Ratchet the maxVal by the current gapVal.
		maxVal, _ = ratchetValue(maxVal, gapVal)

		// Haven't yet reached the scan's end key, so keep iterating.
		prevGapVal = gapVal
		first = false
		it.Next()
	}
}

// prevInitNode moves the iterator backwards to the nearest initialized node. If
// the iterator is already positioned on an initialized node then this function
// is a no-op.
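//
// If backward iteration runs past the front of the list without finding an
// initialized node, the iterator is repositioned at the first node in the
// skiplist and callers fall back to a zero gap value.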
func prevInitNode(it *arenaskl.Iterator) {
	for {
		util.RacePreempt()

		if !it.Valid() {
			// No more previous nodes, so use the zero value.
			it.SeekToFirst()
			break
		}

		if (it.Meta() & initialized) != 0 {
			// Found an initialized node.
			break
		}

		// Haven't yet reached an initialized node, so keep iterating.
		it.Prev()
	}
}

func decodeValueSet(b []byte, meta uint16) (keyVal, gapVal cacheValue) {
	if (meta & hasKey) != 0 {
		b, keyVal = decodeValue(b)
	}

	if (meta & hasGap) != 0 {
		_, gapVal = decodeValue(b)
	}

	return
}

func encodeValueSet(b []byte, keyVal, gapVal cacheValue) (ret []byte, meta uint16) {
	if keyVal.ts.WallTime != 0 || keyVal.ts.Logical != 0 {
		b = encodeValue(b, keyVal)
		meta |= hasKey
	}

	if gapVal.ts.WallTime != 0 || gapVal.ts.Logical != 0 {
		b = encodeValue(b, gapVal)
		meta |= hasGap
	}

	ret = b
	return
}

func decodeValue(b []byte) (ret []byte, val cacheValue) {
	val.ts.WallTime = int64(binary.BigEndian.Uint64(b))
	val.ts.Logical = int32(binary.BigEndian.Uint32(b[8:]))
	var err error
	if val.txnID, err = uuid.FromBytes(b[encodedTsSize:encodedValSize]); err != nil {
		panic(err)
	}
	ret = b[encodedValSize:]
	return
}

func encodeValue(b []byte, val cacheValue) []byte {
	l := len(b)
	b = b[:l+encodedValSize]
	binary.BigEndian.PutUint64(b[l:], uint64(val.ts.WallTime))
	binary.BigEndian.PutUint32(b[l+8:], uint32(val.ts.Logical))
	if _, err := val.txnID.MarshalTo(b[l+encodedTsSize:]); err != nil {
		panic(err)
	}
	return b
}

func encodedRangeSize(from, to []byte, opt rangeOptions) int {
	vals := 1
	if (opt & excludeTo) == 0 {
		vals++
	}
	if (opt & excludeFrom) == 0 {
		vals++
	}
	// This will be an overestimate because nodes will almost
	// always be smaller than arenaskl.MaxNodeSize.
	return len(from) + len(to) + (vals * encodedValSize) + (2 * arenaskl.MaxNodeSize)
}
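// NB: for reference, each encoded cacheValue occupies encodedValSize (28)
// bytes in a node's value buffer:
//
//   b[0:8]   ts.WallTime (int64, big-endian)
//   b[8:12]  ts.Logical  (int32, big-endian)
//   b[12:28] txnID       (16-byte UUID)
//
// encodeValueSet appends up to two such values (the key value followed by the
// gap value) to a single buffer, with the hasKey and hasGap meta bits
// recording which of the two are present.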