github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/metrics/matcher/cache/cache.go

// Copyright (c) 2017 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package cache

import (
	"errors"
	"math/rand"
	"sync"
	"time"

	"github.com/m3db/m3/src/metrics/matcher/namespace"
	"github.com/m3db/m3/src/metrics/metric/id"
	"github.com/m3db/m3/src/metrics/rules"
	"github.com/m3db/m3/src/x/clock"

	"github.com/uber-go/tally"
)

const (
	numOngoingTasks          = 2
	deletionThrottleInterval = 100 * time.Millisecond
)

var (
	errCacheClosed = errors.New("cache is already closed")
)

// Cache caches the rule matching result associated with metrics.
type Cache interface {
	rules.Matcher

	// Register sets the source for a given namespace.
	Register(namespace []byte, source rules.Matcher)

	// Refresh clears the cached results for the given source in a given namespace.
	Refresh(namespace []byte, source rules.Matcher)

	// Unregister deletes the cached results for a given namespace.
	Unregister(namespace []byte)

	// Close closes the cache.
	Close() error
}
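// A minimal usage sketch (illustrative only; ruleMatcher, metricID, fromNanos,
// toNanos and matchOpts are assumed stand-ins for a caller-supplied
// rules.Matcher, id.ID, int64 time range and rules.MatchOptions, and NewOptions
// is assumed to be this package's options constructor):
//
//	c := NewCache(NewOptions())
//	c.Register([]byte("my-namespace"), ruleMatcher)
//
//	res, err := c.ForwardMatch(metricID, fromNanos, toNanos, matchOpts)
//	if err != nil {
//		// handle the match error
//	}
//	_ = res
//
//	c.Unregister([]byte("my-namespace"))
//	_ = c.Close()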
type setType int

const (
	dontSetIfNotFound setType = iota
	setIfNotFound
)

type sleepFn func(time.Duration)

type elementPtr *element

type results struct {
	elems  *elemMap
	source rules.Matcher
}

func newResults(source rules.Matcher) results {
	return results{
		elems:  newElemMap(elemMapOptions{}),
		source: source,
	}
}

type cacheMetrics struct {
	hits                tally.Counter
	misses              tally.Counter
	expires             tally.Counter
	registers           tally.Counter
	registerExists      tally.Counter
	updates             tally.Counter
	updateNotExists     tally.Counter
	updateStaleSource   tally.Counter
	unregisters         tally.Counter
	unregisterNotExists tally.Counter
	promotions          tally.Counter
	evictions           tally.Counter
	deletions           tally.Counter
}

func newCacheMetrics(scope tally.Scope) cacheMetrics {
	return cacheMetrics{
		hits:                scope.Counter("hits"),
		misses:              scope.Counter("misses"),
		expires:             scope.Counter("expires"),
		registers:           scope.Counter("registers"),
		registerExists:      scope.Counter("register-exists"),
		updates:             scope.Counter("updates"),
		updateNotExists:     scope.Counter("update-not-exists"),
		updateStaleSource:   scope.Counter("update-stale-source"),
		unregisters:         scope.Counter("unregisters"),
		unregisterNotExists: scope.Counter("unregister-not-exists"),
		promotions:          scope.Counter("promotions"),
		evictions:           scope.Counter("evictions"),
		deletions:           scope.Counter("deletions"),
	}
}

// cache is an LRU-based read-through cache.
type cache struct {
	sync.RWMutex

	capacity          int
	nowFn             clock.NowFn
	freshDuration     time.Duration
	stutterDuration   time.Duration
	evictionBatchSize int
	deletionBatchSize int
	invalidationMode  InvalidationMode
	sleepFn           sleepFn
	nsResolver        namespace.Resolver

	namespaces *namespaceResultsMap
	list       lockedList
	evictCh    chan struct{}
	deleteCh   chan struct{}
	toDelete   []*elemMap
	wgWorker   sync.WaitGroup
	closed     bool
	closedCh   chan struct{}
	metrics    cacheMetrics
}

// NewCache creates a new cache.
func NewCache(opts Options) Cache {
	clockOpts := opts.ClockOptions()
	instrumentOpts := opts.InstrumentOptions()
	c := &cache{
		capacity:          opts.Capacity(),
		nowFn:             clockOpts.NowFn(),
		freshDuration:     opts.FreshDuration(),
		stutterDuration:   opts.StutterDuration(),
		evictionBatchSize: opts.EvictionBatchSize(),
		deletionBatchSize: opts.DeletionBatchSize(),
		invalidationMode:  opts.InvalidationMode(),
		sleepFn:           time.Sleep,
		namespaces:        newNamespaceResultsMap(namespaceResultsMapOptions{}),
		evictCh:           make(chan struct{}, 1),
		deleteCh:          make(chan struct{}, 1),
		closedCh:          make(chan struct{}),
		metrics:           newCacheMetrics(instrumentOpts.MetricsScope()),
		nsResolver:        opts.NamespaceResolver(),
	}

	c.wgWorker.Add(numOngoingTasks)
	go c.evict()
	go c.delete()

	return c
}

func (c *cache) ForwardMatch(id id.ID, fromNanos, toNanos int64,
	opts rules.MatchOptions) (rules.MatchResult, error) {
	namespace := c.nsResolver.Resolve(id)
	// Optimistically try to serve the result under the shared read lock; fall
	// back to the exclusive lock only when the result needs to be computed and
	// cached.
	c.RLock()
	res, found, err := c.tryGetWithLock(namespace, id, fromNanos, toNanos, dontSetIfNotFound, opts)
	c.RUnlock()
	if err != nil {
		return rules.MatchResult{}, err
	}
	if found {
		return res, nil
	}

	c.Lock()
	res, _, err = c.tryGetWithLock(namespace, id, fromNanos, toNanos, setIfNotFound, opts)
	c.Unlock()
	if err != nil {
		return rules.MatchResult{}, err
	}
	return res, nil
}

func (c *cache) Register(namespace []byte, source rules.Matcher) {
	c.Lock()
	defer c.Unlock()

	if results, exist := c.namespaces.Get(namespace); !exist {
		c.namespaces.Set(namespace, newResults(source))
		c.metrics.registers.Inc(1)
	} else {
		c.refreshWithLock(namespace, source, results)
		c.metrics.registerExists.Inc(1)
	}
}

func (c *cache) Refresh(namespace []byte, source rules.Matcher) {
	c.Lock()
	defer c.Unlock()

	results, exist := c.namespaces.Get(namespace)
	// NB: The namespace does not exist yet. This could happen if the source update
	// arrives before its namespace is registered. It is safe to ignore this premature
	// update because the namespace will eventually register itself and refresh the cache.
	if !exist {
		c.metrics.updateNotExists.Inc(1)
		return
	}
	// NB: The source to update is different from what's stored in the cache. This could
	// happen if the namespace is changed, removed, and then revived before the rule change
	// could be processed. It is safe to ignore this stale update because the last rule
	// change update will eventually be processed and the cache will be refreshed.
	if results.source != source {
		c.metrics.updateStaleSource.Inc(1)
		return
	}
	c.refreshWithLock(namespace, source, results)
	c.metrics.updates.Inc(1)
}

func (c *cache) Unregister(namespace []byte) {
	c.Lock()
	defer c.Unlock()

	results, exists := c.namespaces.Get(namespace)
	if !exists {
		c.metrics.unregisterNotExists.Inc(1)
		return
	}
	c.namespaces.Delete(namespace)
	c.toDelete = append(c.toDelete, results.elems)
	c.notifyDeletion()
	c.metrics.unregisters.Inc(1)
}

func (c *cache) Close() error {
	c.Lock()
	if c.closed {
		c.Unlock()
		return errCacheClosed
	}
	c.closed = true
	c.Unlock()

	close(c.closedCh)
	c.wgWorker.Wait()
	return nil
}

// tryGetWithLock attempts to get the match result, returning true if a match
// result is successfully determined and no further processing is required,
// and false otherwise.
func (c *cache) tryGetWithLock(
	namespace []byte,
	id id.ID,
	fromNanos, toNanos int64,
	setType setType,
	matchOpts rules.MatchOptions,
) (rules.MatchResult, bool, error) {
	res := rules.EmptyMatchResult
	results, exists := c.namespaces.Get(namespace)
	if !exists {
		c.metrics.hits.Inc(1)
		return res, true, nil
	}
	entry, exists := results.elems.Get(id.Bytes())
	if exists {
		elem := (*element)(entry)
		res = elem.result
		// NB(xichen): the cached match result expires when a new rule takes effect.
		// Therefore we need to check if the cached result is valid up to the end
		// of the match time range, a.k.a. toNanos.
		if !res.HasExpired(toNanos) {
			// NB(xichen): in order to avoid the overhead of acquiring an exclusive
			// lock to perform a promotion that moves the element to the front of the
			// list, we set an expiry time for each promotion and do not perform
			// another promotion if the previous one is still fresh. This should be
			// good enough because if the cache is sufficiently large, the frequently
			// accessed items should still be near the front of the list. Additionally,
			// we can still achieve exact LRU semantics by setting the fresh duration
			// and stutter duration to 0. A worked example follows this function.
			now := c.nowFn()
			if elem.ShouldPromote(now) {
				c.promote(now, elem)
			}
			c.metrics.hits.Inc(1)
			return res, true, nil
		}
		c.metrics.expires.Inc(1)
	}
	if setType == dontSetIfNotFound {
		return res, false, nil
	}
	// NB(xichen): the result is either not cached, or cached but invalid; in both
	// cases we should use the source to compute the result and set it in the cache.
	res, err := c.setWithLock(namespace, id, fromNanos, toNanos, results, exists, matchOpts)
	if err != nil {
		return rules.MatchResult{}, false, err
	}
	return res, true, nil
}
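// A worked example of the promotion policy used in tryGetWithLock (the numbers
// are hypothetical, not defaults): with freshDuration = 10s and
// stutterDuration = 5s, an element promoted at time t is assigned a promotion
// expiry somewhere in [t+10s, t+15s), so even an element that is hit on every
// request is moved to the front of the list at most once per fresh window, and
// the random stutter spreads the promotions of different hot elements over
// time instead of clustering them.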
func (c *cache) setWithLock(
	namespace []byte,
	id id.ID,
	fromNanos, toNanos int64,
	results results,
	invalidate bool,
	matchOpts rules.MatchOptions,
) (rules.MatchResult, error) {
	// NB(xichen): if a cached result is invalid, it's very likely that we've reached
	// a new cutover time and the old cached results are now invalid, therefore it's
	// preferable to invalidate everything to save the overhead of multiple invalidations.
	if invalidate {
		results = c.invalidateWithLock(namespace, id.Bytes(), results)
	}
	res, err := results.source.ForwardMatch(id, fromNanos, toNanos, matchOpts)
	if err != nil {
		return rules.MatchResult{}, err
	}
	newElem := newElement(namespace, id.Bytes(), res)
	newElem.SetPromotionExpiry(c.newPromotionExpiry(c.nowFn()))
	results.elems.Set(id.Bytes(), newElem)
	// NB(xichen): we don't evict until the number of cached items goes
	// above the capacity by at least the eviction batch size to amortize
	// the eviction overhead. A worked example follows doEvict below.
	if newSize := c.add(newElem); newSize > c.capacity+c.evictionBatchSize {
		c.notifyEviction()
	}
	c.metrics.misses.Inc(1)
	return res, nil
}

// refreshWithLock clears the existing cached results for the given namespace
// and associates the namespace results with a new source.
func (c *cache) refreshWithLock(namespace []byte, source rules.Matcher, results results) {
	c.toDelete = append(c.toDelete, results.elems)
	c.notifyDeletion()
	results.source = source
	results.elems = newElemMap(elemMapOptions{})
	c.namespaces.Set(namespace, results)
}

// add pushes elem to the front of the LRU list and returns the new list size.
func (c *cache) add(elem *element) int {
	c.list.Lock()
	c.list.PushFront(elem)
	size := c.list.Len()
	c.list.Unlock()
	return size
}

// promote moves elem to the front of the LRU list if it is still due for promotion.
func (c *cache) promote(now time.Time, elem *element) {
	c.list.Lock()
	// Bail if someone else got ahead of us and promoted this element.
	if !elem.ShouldPromote(now) {
		c.list.Unlock()
		return
	}
	// Otherwise proceed with promotion.
	elem.SetPromotionExpiry(c.newPromotionExpiry(now))
	c.list.MoveToFront(elem)
	c.list.Unlock()
	c.metrics.promotions.Inc(1)
}

func (c *cache) invalidateWithLock(namespace, id []byte, results results) results {
	if c.invalidationMode == InvalidateAll {
		c.toDelete = append(c.toDelete, results.elems)
		c.notifyDeletion()
		results.elems = newElemMap(elemMapOptions{})
		c.namespaces.Set(namespace, results)
	} else {
		// Guaranteed to be in the map when invalidateWithLock is called.
		elem, _ := results.elems.Get(id)
		results.elems.Delete(id)
		c.list.Lock()
		c.list.Remove(elem)
		c.list.Unlock()
	}
	return results
}

// evict runs the background eviction loop until the cache is closed.
func (c *cache) evict() {
	defer c.wgWorker.Done()

	for {
		select {
		case <-c.evictCh:
			c.doEvict()
		case <-c.closedCh:
			return
		}
	}
}

// doEvict trims the LRU list from the back until it is within capacity again.
func (c *cache) doEvict() {
	c.Lock()
	c.list.Lock()
	numEvicted := 0
	for c.list.Len() > c.capacity {
		elem := c.list.Back()
		c.list.Remove(elem)
		numEvicted++
		// NB(xichen): the namespace owning this element may have been deleted,
		// in which case we simply continue. This is okay because the deleted element
		// will be marked as deleted, so when the deletion goroutine sees it and tries
		// to delete it again, the operation becomes a no-op, at which point the element
		// is removed from the owning map as well.
		results, exists := c.namespaces.Get(elem.namespace)
		if !exists {
			continue
		}
		results.elems.Delete(elem.id)
	}
	c.list.Unlock()
	c.Unlock()
	c.metrics.evictions.Inc(int64(numEvicted))
}
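// A worked example of the amortized eviction above (hypothetical numbers, not
// defaults): with capacity = 1000 and evictionBatchSize = 100, setWithLock only
// signals the eviction goroutine once the list grows past 1100 entries, and
// doEvict then trims the list back down to 1000, so the eviction cost is paid
// in batches of roughly 100 elements rather than once per cached insert.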
// delete runs the background deletion loop until the cache is closed.
func (c *cache) delete() {
	defer c.wgWorker.Done()

	for {
		select {
		case <-c.deleteCh:
			c.doDelete()
		case <-c.closedCh:
			return
		}
	}
}

func (c *cache) doDelete() {
	c.Lock()
	if len(c.toDelete) == 0 {
		c.Unlock()
		return
	}

	// NB(xichen): add pooling if deletion happens frequently enough.
	toDelete := c.toDelete
	c.toDelete = nil
	c.Unlock()

	allDeleted := 0
	deleted := 0
	c.list.Lock()
	for _, elems := range toDelete {
		for _, entry := range elems.Iter() {
			elem := entry.Value()
			c.list.Remove(elem)
			allDeleted++
			deleted++
			// If we have deleted enough elements, release the lock
			// and give other goroutines a chance to acquire it,
			// since deletion does not need to be fast.
			if deleted >= c.deletionBatchSize {
				c.list.Unlock()
				c.sleepFn(deletionThrottleInterval)
				deleted = 0
				c.list.Lock()
			}
		}
	}
	c.list.Unlock()
	c.metrics.deletions.Inc(int64(allDeleted))
}

// notifyEviction signals the eviction goroutine without blocking.
func (c *cache) notifyEviction() {
	select {
	case c.evictCh <- struct{}{}:
	default:
	}
}

// notifyDeletion signals the deletion goroutine without blocking.
func (c *cache) notifyDeletion() {
	select {
	case c.deleteCh <- struct{}{}:
	default:
	}
}

// newPromotionExpiry returns the next promotion expiry: now plus the fresh
// duration, plus up to stutterDuration of random jitter.
func (c *cache) newPromotionExpiry(now time.Time) time.Time {
	expiry := now.Add(c.freshDuration)
	if c.stutterDuration > 0 {
		expiry = expiry.Add(time.Duration(rand.Int63n(int64(c.stutterDuration))))
	}
	return expiry
}