// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package tscache

import (
	"fmt"
	"unsafe"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/util/cache"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/interval"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
)

const (
	// defaultTreeImplSize is the default size in bytes for a treeImpl timestamp
	// cache. Note that the timestamp cache can use more memory than this
	// because it holds on to all entries that are younger than
	// MinRetentionWindow.
	defaultTreeImplSize = 64 << 20 // 64 MB
)

// makeCacheEntry returns a cache.Entry whose Key and Value point at copies of
// the supplied key and value. The key, value and entry are declared as fields
// of one local struct, so a single allocation backs all three instead of one
// allocation each.
func makeCacheEntry(key cache.IntervalKey, value cacheValue) *cache.Entry {
	alloc := struct {
		key   cache.IntervalKey
		value cacheValue
		entry cache.Entry
	}{
		key:   key,
		value: value,
	}
	alloc.entry.Key = &alloc.key
	alloc.entry.Value = &alloc.value
	return &alloc.entry
}

// cacheEntryOverhead is the combined size in bytes of the fixed-size parts of
// a cache entry (the IntervalKey, cacheValue and Entry structs) as reported by
// unsafe.Sizeof. It does not include memory those structs merely reference,
// such as the key bytes; cacheEntrySize accounts for those separately.
var cacheEntryOverhead = uint64(unsafe.Sizeof(cache.IntervalKey{}) +
	unsafe.Sizeof(cacheValue{}) + unsafe.Sizeof(cache.Entry{}))

// cacheEntrySize returns the number of bytes charged to the cache for an entry
// spanning [start, end): the capacity of start, plus the capacity of end when
// end does not begin at the same address as start, plus cacheEntryOverhead.
func cacheEntrySize(start, end interval.Comparable) uint64 {
	n := uint64(cap(start))
	if end != nil && len(start) > 0 && len(end) > 0 && &end[0] != &start[0] {
		// If the end key exists and is not sharing memory with the start key,
		// account for its memory usage.
		n += uint64(cap(end))
	}
	n += cacheEntryOverhead
	return n
}

// treeImpl implements the Cache interface. It maintains an interval tree FIFO
// cache of keys or key ranges and the timestamps at which they were most
// recently read or written. If a timestamp was read or written by a
// transaction, the txn ID is stored with the timestamp to avoid advancing
// timestamps on successive requests from the same transaction.
type treeImpl struct {
	syncutil.RWMutex

	cache *cache.IntervalCache
	// lowWater is the floor of the cache: Add ignores timestamps that are not
	// above it, getMax never returns less than it, and shouldEvict raises it
	// to the timestamp of an entry it evicts from the retention window.
	// latest is forwarded to every timestamp passed to Add and is the
	// reference point from which shouldEvict measures MinRetentionWindow.
	lowWater, latest hlc.Timestamp

	// bytes is the running total of cacheEntrySize over the entries in cache;
	// Add increases it and onEvicted decreases it.
	bytes uint64
	// maxBytes is the threshold below which shouldEvict never evicts.
	maxBytes uint64
	metrics  Metrics
}

var _ Cache = &treeImpl{}

// newTreeImpl returns a new treeImpl whose low-water mark is initialized to
// the current time of the supplied hybrid clock.
func newTreeImpl(clock *hlc.Clock) *treeImpl {
	tc := &treeImpl{
		cache:    cache.NewIntervalCache(cache.Config{Policy: cache.CacheFIFO}),
		maxBytes: uint64(defaultTreeImplSize),
		metrics:  makeMetrics(),
	}
	tc.clear(clock.Now())
	// Hook eviction into the byte accounting and retention-window policy.
	tc.cache.Config.ShouldEvict = tc.shouldEvict
	tc.cache.Config.OnEvicted = tc.onEvicted
	return tc
}

// clear clears the cache and resets the low-water mark. The latest timestamp
// is reset to the same value.
func (tc *treeImpl) clear(lowWater hlc.Timestamp) {
	tc.Lock()
	defer tc.Unlock()
	tc.cache.Clear()
	tc.lowWater = lowWater
	tc.latest = tc.lowWater
}

// len returns the total number of read and write intervals in the cache.
func (tc *treeImpl) len() int {
	tc.RLock()
	defer tc.RUnlock()
	return tc.cache.Len()
}

// Add implements the Cache interface. It records that the span [start, end)
// was accessed at timestamp ts by transaction txnID; an empty end denotes the
// single key start. The latest timestamp is always forwarded to ts, but the
// span is only inserted when ts is above the low-water mark.
//
// Existing entries that overlap the span are reconciled so that entries never
// overlap each other: older entries are truncated, split or deleted; newer
// entries carve pieces out of the span being added; and where an existing
// entry has the same timestamp but a different txn ID, the overlapping
// portion ends up owned by no transaction (noTxnID).
func (tc *treeImpl) Add(start, end roachpb.Key, ts hlc.Timestamp, txnID uuid.UUID) {
	// This gives us a memory-efficient end key if end is empty: start is
	// re-sliced out of end so both share one backing array, which
	// cacheEntrySize then only counts once.
	if len(end) == 0 {
		end = start.Next()
		start = end[:len(start)]
	}

	tc.Lock()
	defer tc.Unlock()
	tc.latest.Forward(ts)

	// Only add to the cache if the timestamp is more recent than the
	// low water mark.
	if tc.lowWater.Less(ts) {

		// addRange inserts a brand new entry covering r with the current ts
		// and txnID and charges its size to tc.bytes. Note that txnID is
		// captured by reference: cases below that assign txnID = noTxnID
		// change what the final addRange(r) call inserts.
		addRange := func(r interval.Range) {
			value := cacheValue{ts: ts, txnID: txnID}
			key := tc.cache.MakeKey(r.Start, r.End)
			entry := makeCacheEntry(key, value)
			tc.bytes += cacheEntrySize(r.Start, r.End)
			tc.cache.AddEntry(entry)
		}
		// addEntryAfter inserts an already-constructed entry after the given
		// existing entry and charges its size to tc.bytes.
		addEntryAfter := func(entry, after *cache.Entry) {
			ck := entry.Key.(*cache.IntervalKey)
			tc.bytes += cacheEntrySize(ck.Start, ck.End)
			tc.cache.AddEntryAfter(entry, after)
		}

		// r is the portion of the new span that still needs to be added. It
		// shrinks as overlapping entries with newer or equal timestamps are
		// encountered.
		r := interval.Range{
			Start: interval.Comparable(start),
			End:   interval.Comparable(end),
		}

		// Check existing, overlapping entries and truncate/split/remove if
		// superseded and in the past. If existing entries are in the future,
		// subtract from the range/ranges that need to be added to cache.
		for _, entry := range tc.cache.GetOverlaps(r.Start, r.End) {
			cv := entry.Value.(*cacheValue)
			key := entry.Key.(*cache.IntervalKey)
			// sCmp and eCmp order the new range's start and end relative to
			// the existing entry's start and end respectively.
			sCmp := r.Start.Compare(key.Start)
			eCmp := r.End.Compare(key.End)
			// Some of the cases below adjust cv and key in-place (in a manner that
			// maintains the IntervalCache invariants). These in-place modifications
			// change the size of the entry. To capture all of these modifications we
			// compute the current size of the entry and then use the new size at the
			// end of this iteration to update tc.bytes.
			oldSize := cacheEntrySize(key.Start, key.End)
			if cv.ts.Less(ts) {
				// The existing interval has a timestamp less than the new
				// interval. Compare interval ranges to determine how to
				// modify existing interval.
				switch {
				case sCmp == 0 && eCmp == 0:
					// New and old are equal; replace old with new and avoid the need to insert new.
					// Only the value changes, so the entry's size is unchanged
					// and tc.bytes needs no adjustment.
					//
					// New: ------------
					// Old: ------------
					//
					// New: ------------
					// Old:
					*cv = cacheValue{ts: ts, txnID: txnID}
					tc.cache.MoveToEnd(entry)
					return
				case sCmp <= 0 && eCmp >= 0:
					// New contains or is equal to old; delete old.
					//
					// New: ------------      ------------      ------------
					// Old:   --------    or    ----------  or  ----------
					//
					// New: ------------      ------------      ------------
					// Old:
					tc.cache.DelEntry(entry)
					continue // DelEntry adjusted tc.bytes, don't do it again
				case sCmp > 0 && eCmp < 0:
					// Old contains new; split up old into two.
					//
					// New:     ----
					// Old: ------------
					//
					// New:     ----
					// Old: ----    ----
					oldEnd := key.End
					key.End = r.Start

					newKey := tc.cache.MakeKey(r.End, oldEnd)
					newEntry := makeCacheEntry(newKey, *cv)
					addEntryAfter(newEntry, entry)
				case eCmp >= 0:
					// Left partial overlap; truncate old end.
					//
					// New:     --------          --------
					// Old: --------      or  ------------
					//
					// New:     --------          --------
					// Old: ----              ----
					key.End = r.Start
				case sCmp <= 0:
					// Right partial overlap; truncate old start.
					//
					// New: --------          --------
					// Old:     --------  or  ------------
					//
					// New: --------          --------
					// Old:         ----              ----
					key.Start = r.End
				default:
					panic(fmt.Sprintf("no overlap between %v and %v", key.Range, r))
				}
			} else if ts.Less(cv.ts) {
				// The existing interval has a timestamp greater than the new interval.
				// Compare interval ranges to determine how to modify new interval before
				// adding it to the timestamp cache.
				switch {
				case sCmp >= 0 && eCmp <= 0:
					// Old contains or is equal to new; no need to add.
					//
					// Old: -----------      -----------      -----------      -----------
					// New:    -----     or  -----------  or  --------     or     --------
					//
					// Old: -----------      -----------      -----------      -----------
					// New:
					return
				case sCmp < 0 && eCmp > 0:
					// New contains old; split up new into two. We can add the left piece
					// immediately because it is guaranteed to be before the rest of the
					// overlaps.
					//
					// Old:    ------
					// New: ------------
					//
					// Old:    ------
					// New: ---      ---
					lr := interval.Range{Start: r.Start, End: key.Start}
					addRange(lr)

					r.Start = key.End
				case eCmp > 0:
					// Left partial overlap; truncate new start.
					//
					// Old: --------          --------
					// New:     --------  or  ------------
					//
					// Old: --------          --------
					// New:         ----              ----
					r.Start = key.End
				case sCmp < 0:
					// Right partial overlap; truncate new end.
					//
					// Old:     --------          --------
					// New: --------      or  ------------
					//
					// Old:     --------          --------
					// New: ----              ----
					r.End = key.Start
				default:
					panic(fmt.Sprintf("no overlap between %v and %v", key.Range, r))
				}
			} else if cv.txnID == txnID {
				// The existing interval has a timestamp equal to the new
				// interval, and the same transaction ID.
				switch {
				case sCmp >= 0 && eCmp <= 0:
					// Old contains or is equal to new; no need to add.
					//
					// New:    -----     or  -----------  or  --------     or     --------
					// Old: -----------      -----------      -----------      -----------
					//
					// New:
					// Old: -----------      -----------      -----------      -----------
					return
				case sCmp <= 0 && eCmp >= 0:
					// New contains old; delete old.
					//
					// New: ------------      ------------      ------------
					// Old:   --------    or    ----------  or  ----------
					//
					// New: ------------      ------------      ------------
					// Old:
					tc.cache.DelEntry(entry)
					continue // DelEntry adjusted tc.bytes, don't do it again
				case eCmp >= 0:
					// Left partial overlap; truncate old end.
					//
					// New:     --------          --------
					// Old: --------      or  ------------
					//
					// New:     --------          --------
					// Old: ----              ----
					key.End = r.Start
				case sCmp <= 0:
					// Right partial overlap; truncate old start.
					//
					// New: --------          --------
					// Old:     --------  or  ------------
					//
					// New: --------          --------
					// Old:         ----              ----
					key.Start = r.End
				default:
					panic(fmt.Sprintf("no overlap between %v and %v", key.Range, r))
				}
			} else {
				// The existing interval has a timestamp equal to the new
				// interval and a different transaction ID.
				switch {
				case sCmp == 0 && eCmp == 0:
					// New and old are equal. Segment is no longer owned by any
					// transaction.
					//
					// New: ------------
					// Old: ------------
					//
					// New:
					// Nil: ============
					// Old:
					cv.txnID = noTxnID
					tc.bytes += cacheEntrySize(key.Start, key.End) - oldSize
					return
				case sCmp == 0 && eCmp > 0:
					// New contains old, left-aligned. Clear ownership of the
					// existing segment and truncate new.
					//
					// New: ------------
					// Old: ----------
					//
					// New:           --
					// Nil: ==========
					// Old:
					cv.txnID = noTxnID
					r.Start = key.End
				case sCmp < 0 && eCmp == 0:
					// New contains old, right-aligned. Clear ownership of the
					// existing segment and truncate new.
					//
					// New: ------------
					// Old:   ----------
					//
					// New: --
					// Nil:   ==========
					// Old:
					cv.txnID = noTxnID
					r.End = key.Start
				case sCmp < 0 && eCmp > 0:
					// New contains old; split into three segments with the
					// overlap owned by no txn. The left piece is inserted
					// here; the right piece remains in r for later.
					//
					// New: ------------
					// Old:   --------
					//
					// New: --        --
					// Nil:   ========
					// Old:
					cv.txnID = noTxnID

					newKey := tc.cache.MakeKey(r.Start, key.Start)
					newEntry := makeCacheEntry(newKey, cacheValue{ts: ts, txnID: txnID})
					addEntryAfter(newEntry, entry)
					r.Start = key.End
				case sCmp > 0 && eCmp < 0:
					// Old contains new; split up old into two. New segment is
					// owned by no txn. Assigning txnID here makes the final
					// addRange(r) insert the new segment with noTxnID.
					//
					// New:     ----
					// Old: ------------
					//
					// New:
					// Nil:     ====
					// Old: ----    ----
					txnID = noTxnID
					oldEnd := key.End
					key.End = r.Start

					newKey := tc.cache.MakeKey(r.End, oldEnd)
					newEntry := makeCacheEntry(newKey, *cv)
					addEntryAfter(newEntry, entry)
				case eCmp == 0:
					// Old contains new, right-aligned; truncate old end and clear
					// ownership of new segment.
					//
					// New:     --------
					// Old: ------------
					//
					// New:
					// Nil:     ========
					// Old: ----
					txnID = noTxnID
					key.End = r.Start
				case sCmp == 0:
					// Old contains new, left-aligned; truncate old start and
					// clear ownership of new segment.
					//
					// New: --------
					// Old: ------------
					//
					// New:
					// Nil: ========
					// Old:         ----
					txnID = noTxnID
					key.Start = r.End
				case eCmp > 0:
					// Left partial overlap; truncate old end and split new into
					// segments owned by no txn (the overlap) and the new txn.
					// After the swap, [key.End, r.Start) is exactly the
					// overlapping region.
					//
					// New:     --------
					// Old: --------
					//
					// New:         ----
					// Nil:     ====
					// Old: ----
					key.End, r.Start = r.Start, key.End

					newKey := tc.cache.MakeKey(key.End, r.Start)
					newCV := cacheValue{ts: cv.ts}
					newEntry := makeCacheEntry(newKey, newCV)
					addEntryAfter(newEntry, entry)
				case sCmp < 0:
					// Right partial overlap; truncate old start and split new into
					// segments owned by no txn (the overlap) and the new txn.
					// After the swap, [r.End, key.Start) is exactly the
					// overlapping region.
					//
					// New: --------
					// Old:     --------
					//
					// New: ----
					// Nil:     ====
					// Old:         ----
					key.Start, r.End = r.End, key.Start

					newKey := tc.cache.MakeKey(r.End, key.Start)
					newCV := cacheValue{ts: cv.ts}
					newEntry := makeCacheEntry(newKey, newCV)
					addEntryAfter(newEntry, entry)
				default:
					panic(fmt.Sprintf("no overlap between %v and %v", key.Range, r))
				}
			}
			// Account for any in-place change to the existing entry's key.
			tc.bytes += cacheEntrySize(key.Start, key.End) - oldSize
		}
		// Insert whatever remains of the new range.
		addRange(r)
	}
}

// SetLowWater implements the Cache interface. It is implemented as an Add of
// the span at ts that is owned by no transaction.
func (tc *treeImpl) SetLowWater(start, end roachpb.Key, ts hlc.Timestamp) {
	tc.Add(start, end, ts, noTxnID)
}

// getLowWater implements the Cache interface. It returns the current
// low-water mark.
func (tc *treeImpl) getLowWater() hlc.Timestamp {
	tc.RLock()
	defer tc.RUnlock()
	return tc.lowWater
}

// GetMax implements the Cache interface.
func (tc *treeImpl) GetMax(start, end roachpb.Key) (hlc.Timestamp, uuid.UUID) {
	return tc.getMax(start, end)
}

// getMax returns the highest timestamp among the low-water mark and all
// entries overlapping [start, end), together with the txn ID stored with that
// timestamp. An empty end denotes the single key start. If more than one
// overlapping entry carries the maximum timestamp and their txn IDs differ,
// noTxnID is returned; noTxnID is also returned when the low-water mark is
// the maximum.
//
// NOTE(review): this takes the exclusive lock although it only reads treeImpl
// fields - presumably because IntervalCache.GetOverlaps is not safe for
// concurrent callers; confirm before relaxing it to RLock.
func (tc *treeImpl) getMax(start, end roachpb.Key) (hlc.Timestamp, uuid.UUID) {
	tc.Lock()
	defer tc.Unlock()
	if len(end) == 0 {
		end = start.Next()
	}
	maxTS := tc.lowWater
	maxTxnID := noTxnID
	for _, o := range tc.cache.GetOverlaps(start, end) {
		ce := o.Value.(*cacheValue)
		if maxTS.Less(ce.ts) {
			maxTS = ce.ts
			maxTxnID = ce.txnID
		} else if maxTS == ce.ts && maxTxnID != ce.txnID {
			// A tie between different owners means no single transaction
			// owns the maximum timestamp.
			maxTxnID = noTxnID
		}
	}
	return maxTS, maxTxnID
}

// shouldEvict returns true if the cache entry's timestamp is no
// longer within the MinRetentionWindow. Nothing is evicted while the cache is
// at or below maxBytes. When an entry is evicted for having aged out of the
// window, the low-water mark is raised to that entry's timestamp. The size
// and key arguments are unused.
func (tc *treeImpl) shouldEvict(size int, key, value interface{}) bool {
	if tc.bytes <= tc.maxBytes {
		return false
	}
	ce := value.(*cacheValue)
	// In case low water mark was set higher, evict any entries
	// which occurred before it.
	if ce.ts.Less(tc.lowWater) {
		return true
	}
	// Compute the edge of the cache window.
	edge := tc.latest
	edge.WallTime -= MinRetentionWindow.Nanoseconds()
	// We evict and update the low water mark if the proposed evictee's
	// timestamp is <= the edge of the window.
	if ce.ts.LessEq(edge) {
		tc.lowWater = ce.ts
		return true
	}
	return false
}

// onEvicted is called when an entry is evicted from the cache. It subtracts
// the entry's size from tc.bytes and panics if doing so would underflow,
// since that indicates the byte accounting has gone wrong.
func (tc *treeImpl) onEvicted(k, v interface{}) {
	ck := k.(*cache.IntervalKey)
	reqSize := cacheEntrySize(ck.Start, ck.End)
	if tc.bytes < reqSize {
		panic(fmt.Sprintf("bad reqSize: %d < %d", tc.bytes, reqSize))
	}
	tc.bytes -= reqSize
}

// Metrics implements the Cache interface.
func (tc *treeImpl) Metrics() Metrics {
	return tc.metrics
}