github.com/authzed/spicedb@v1.32.1-0.20240520085336-ebda56537386/internal/datastore/proxy/schemacaching/watchingcache.go (about) 1 package schemacaching 2 3 import ( 4 "context" 5 "errors" 6 "sync" 7 "time" 8 9 "github.com/prometheus/client_golang/prometheus" 10 11 pgxcommon "github.com/authzed/spicedb/internal/datastore/postgres/common" 12 "github.com/authzed/spicedb/internal/datastore/revisions" 13 log "github.com/authzed/spicedb/internal/logging" 14 "github.com/authzed/spicedb/pkg/cache" 15 "github.com/authzed/spicedb/pkg/datastore" 16 "github.com/authzed/spicedb/pkg/datastore/options" 17 "github.com/authzed/spicedb/pkg/genutil/mapz" 18 core "github.com/authzed/spicedb/pkg/proto/core/v1" 19 "github.com/authzed/spicedb/pkg/spiceerrors" 20 ) 21 22 var namespacesFallbackModeGauge = prometheus.NewGauge(prometheus.GaugeOpts{ 23 Namespace: "spicedb", 24 Subsystem: "datastore", 25 Name: "watching_schema_cache_namespaces_fallback_mode", 26 Help: "value of 1 if the cache is in fallback mode and 0 otherwise", 27 }) 28 29 var caveatsFallbackModeGauge = prometheus.NewGauge(prometheus.GaugeOpts{ 30 Namespace: "spicedb", 31 Subsystem: "datastore", 32 Name: "watching_schema_cache_caveats_fallback_mode", 33 Help: "value of 1 if the cache is in fallback mode and 0 otherwise", 34 }) 35 36 var schemaCacheRevisionGauge = prometheus.NewGauge(prometheus.GaugeOpts{ 37 Namespace: "spicedb", 38 Subsystem: "datastore", 39 Name: "watching_schema_cache_tracked_revision", 40 Help: "the currently tracked max revision for the schema cache", 41 }) 42 43 var definitionsReadCachedCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ 44 Namespace: "spicedb", 45 Subsystem: "datastore", 46 Name: "watching_schema_cache_definitions_read_cached_total", 47 Help: "cached number of definitions read from the watching cache", 48 }, []string{"definition_kind"}) 49 50 var definitionsReadTotalCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ 51 Namespace: "spicedb", 52 Subsystem: "datastore", 53 Name: "watching_schema_cache_definitions_read_total", 54 Help: "total number of definitions read from the watching cache", 55 }, []string{"definition_kind"}) 56 57 const maximumRetryCount = 10 58 59 func init() { 60 prometheus.MustRegister(namespacesFallbackModeGauge, caveatsFallbackModeGauge, schemaCacheRevisionGauge, definitionsReadCachedCounter, definitionsReadTotalCounter) 61 } 62 63 // watchingCachingProxy is a datastore proxy that caches schema (namespaces and caveat definitions) 64 // and updates its cache via a WatchSchema call. If the supplied datastore to be wrapped does not support 65 // this API, or the data is not available in this case or an error occurs, the updating cache fallsback 66 // to the standard schema cache. 67 type watchingCachingProxy struct { 68 datastore.Datastore 69 70 fallbackCache *definitionCachingProxy 71 gcWindow time.Duration 72 watchHeartbeat time.Duration 73 closed chan bool 74 75 namespaceCache *schemaWatchCache[*core.NamespaceDefinition] 76 caveatCache *schemaWatchCache[*core.CaveatDefinition] 77 } 78 79 // createWatchingCacheProxy creates and returns a watching cache proxy. 80 func createWatchingCacheProxy(delegate datastore.Datastore, c cache.Cache, gcWindow time.Duration, watchHeartbeat time.Duration) *watchingCachingProxy { 81 fallbackCache := &definitionCachingProxy{ 82 Datastore: delegate, 83 c: c, 84 } 85 86 proxy := &watchingCachingProxy{ 87 Datastore: delegate, 88 fallbackCache: fallbackCache, 89 90 gcWindow: gcWindow, 91 watchHeartbeat: watchHeartbeat, 92 closed: make(chan bool, 2), 93 94 namespaceCache: newSchemaWatchCache[*core.NamespaceDefinition]( 95 "namespace", 96 datastore.NewNamespaceNotFoundErr, 97 func(ctx context.Context, name string, revision datastore.Revision) (*core.NamespaceDefinition, datastore.Revision, error) { 98 return fallbackCache.SnapshotReader(revision).ReadNamespaceByName(ctx, name) 99 }, 100 func(ctx context.Context, names []string, revision datastore.Revision) ([]datastore.RevisionedDefinition[*core.NamespaceDefinition], error) { 101 return fallbackCache.SnapshotReader(revision).LookupNamespacesWithNames(ctx, names) 102 }, 103 definitionsReadCachedCounter, 104 definitionsReadTotalCounter, 105 namespacesFallbackModeGauge, 106 ), 107 caveatCache: newSchemaWatchCache[*core.CaveatDefinition]( 108 "caveat", 109 datastore.NewCaveatNameNotFoundErr, 110 func(ctx context.Context, name string, revision datastore.Revision) (*core.CaveatDefinition, datastore.Revision, error) { 111 return fallbackCache.SnapshotReader(revision).ReadCaveatByName(ctx, name) 112 }, 113 func(ctx context.Context, names []string, revision datastore.Revision) ([]datastore.RevisionedDefinition[*core.CaveatDefinition], error) { 114 return fallbackCache.SnapshotReader(revision).LookupCaveatsWithNames(ctx, names) 115 }, 116 definitionsReadCachedCounter, 117 definitionsReadTotalCounter, 118 caveatsFallbackModeGauge, 119 ), 120 } 121 return proxy 122 } 123 124 func (p *watchingCachingProxy) SnapshotReader(rev datastore.Revision) datastore.Reader { 125 delegateReader := p.Datastore.SnapshotReader(rev) 126 return &watchingCachingReader{delegateReader, rev, p} 127 } 128 129 func (p *watchingCachingProxy) ReadWriteTx( 130 ctx context.Context, 131 f datastore.TxUserFunc, 132 opts ...options.RWTOptionsOption, 133 ) (datastore.Revision, error) { 134 // NOTE: we always use the standard approach cache here, as it stores changes within the transaction 135 // itself, and should not impact the overall updating cache. 136 return p.fallbackCache.ReadWriteTx(ctx, f, opts...) 137 } 138 139 func (p *watchingCachingProxy) Start(ctx context.Context) error { 140 // Start async so that prepopulating doesn't block the server start. 141 go func() { 142 _ = p.startSync(ctx) 143 }() 144 145 return nil 146 } 147 148 func (p *watchingCachingProxy) startSync(ctx context.Context) error { 149 log.Info().Msg("starting watching cache") 150 headRev, err := p.Datastore.HeadRevision(context.Background()) 151 if err != nil { 152 p.namespaceCache.setFallbackMode() 153 p.caveatCache.setFallbackMode() 154 log.Warn().Err(err).Msg("received error in schema watch") 155 return err 156 } 157 158 // Start watching for expired entries to be GCed. 159 go (func() { 160 log.Debug().Str("revision", headRev.String()).Msg("starting watching cache GC goroutine") 161 162 for { 163 select { 164 case <-ctx.Done(): 165 log.Debug().Msg("GC routine for watch closed due to context cancelation") 166 return 167 168 case <-p.closed: 169 log.Debug().Msg("GC routine for watch closed") 170 return 171 172 case <-time.After(time.Hour): 173 log.Debug().Msg("beginning GC operation for schema watch") 174 p.namespaceCache.gcStaleEntries(p.gcWindow) 175 p.caveatCache.gcStaleEntries(p.gcWindow) 176 log.Debug().Msg("schema watch gc operation completed") 177 } 178 } 179 })() 180 181 var wg sync.WaitGroup 182 wg.Add(1) 183 184 // Start watching for schema changes. 185 go (func() { 186 retryCount := 0 187 188 restartWatch: 189 for { 190 p.namespaceCache.reset() 191 p.caveatCache.reset() 192 193 log.Debug().Str("revision", headRev.String()).Msg("starting watching cache watch operation") 194 reader := p.Datastore.SnapshotReader(headRev) 195 196 // Populate the cache with all definitions at the head revision. 197 log.Info().Str("revision", headRev.String()).Msg("prepopulating namespace watching cache") 198 namespaces, err := reader.ListAllNamespaces(ctx) 199 if err != nil { 200 p.namespaceCache.setFallbackMode() 201 p.caveatCache.setFallbackMode() 202 log.Warn().Err(err).Msg("received error in schema watch") 203 wg.Done() 204 return 205 } 206 207 for _, namespaceDef := range namespaces { 208 err := p.namespaceCache.updateDefinition(namespaceDef.Definition.Name, namespaceDef.Definition, false, headRev) 209 if err != nil { 210 p.namespaceCache.setFallbackMode() 211 p.caveatCache.setFallbackMode() 212 log.Warn().Err(err).Msg("received error in schema watch") 213 wg.Done() 214 return 215 } 216 } 217 log.Info().Str("revision", headRev.String()).Int("count", len(namespaces)).Msg("populated namespace watching cache") 218 219 log.Info().Str("revision", headRev.String()).Msg("prepopulating caveat watching cache") 220 caveats, err := reader.ListAllCaveats(ctx) 221 if err != nil { 222 p.namespaceCache.setFallbackMode() 223 p.caveatCache.setFallbackMode() 224 log.Warn().Err(err).Msg("received error in schema watch") 225 wg.Done() 226 return 227 } 228 229 for _, caveatDef := range caveats { 230 err := p.caveatCache.updateDefinition(caveatDef.Definition.Name, caveatDef.Definition, false, headRev) 231 if err != nil { 232 p.namespaceCache.setFallbackMode() 233 p.caveatCache.setFallbackMode() 234 log.Warn().Err(err).Msg("received error in schema watch") 235 wg.Done() 236 return 237 } 238 } 239 log.Info().Str("revision", headRev.String()).Int("count", len(caveats)).Msg("populated caveat watching cache") 240 241 log.Debug().Str("revision", headRev.String()).Dur("watch-heartbeat", p.watchHeartbeat).Msg("beginning schema watch") 242 ssc, serrc := p.Datastore.Watch(ctx, headRev, datastore.WatchOptions{ 243 Content: datastore.WatchSchema | datastore.WatchCheckpoints, 244 CheckpointInterval: p.watchHeartbeat, 245 }) 246 log.Debug().Msg("schema watch started") 247 248 p.namespaceCache.startAtRevision(headRev) 249 p.caveatCache.startAtRevision(headRev) 250 251 wg.Done() 252 253 for { 254 select { 255 case <-ctx.Done(): 256 log.Debug().Msg("schema watch closed due to context cancelation") 257 return 258 259 case <-p.closed: 260 log.Debug().Msg("schema watch closed") 261 return 262 263 case ss := <-ssc: 264 log.Trace().Object("update", ss).Msg("received update from schema watch") 265 266 if ss.IsCheckpoint { 267 if converted, ok := ss.Revision.(revisions.WithInexactFloat64); ok { 268 schemaCacheRevisionGauge.Set(converted.InexactFloat64()) 269 } 270 271 p.namespaceCache.setCheckpointRevision(ss.Revision) 272 p.caveatCache.setCheckpointRevision(ss.Revision) 273 continue 274 } 275 276 // Apply the change to the interval tree entry. 277 for _, changeDef := range ss.ChangedDefinitions { 278 switch t := changeDef.(type) { 279 case *core.NamespaceDefinition: 280 err := p.namespaceCache.updateDefinition(t.Name, t, false, ss.Revision) 281 if err != nil { 282 p.namespaceCache.setFallbackMode() 283 log.Warn().Err(err).Msg("received error in schema watch") 284 } 285 286 case *core.CaveatDefinition: 287 err := p.caveatCache.updateDefinition(t.Name, t, false, ss.Revision) 288 if err != nil { 289 p.caveatCache.setFallbackMode() 290 log.Warn().Err(err).Msg("received error in schema watch") 291 } 292 293 default: 294 p.namespaceCache.setFallbackMode() 295 p.caveatCache.setFallbackMode() 296 log.Error().Msg("unknown change definition type") 297 return 298 } 299 } 300 301 for _, deletedNamespaceName := range ss.DeletedNamespaces { 302 err := p.namespaceCache.updateDefinition(deletedNamespaceName, nil, true, ss.Revision) 303 if err != nil { 304 p.namespaceCache.setFallbackMode() 305 log.Warn().Err(err).Msg("received error in schema watch") 306 break 307 } 308 } 309 310 for _, deletedCaveatName := range ss.DeletedCaveats { 311 err := p.caveatCache.updateDefinition(deletedCaveatName, nil, true, ss.Revision) 312 if err != nil { 313 p.caveatCache.setFallbackMode() 314 log.Warn().Err(err).Msg("received error in schema watch") 315 break 316 } 317 } 318 319 case err := <-serrc: 320 var retryable datastore.ErrWatchRetryable 321 if errors.As(err, &retryable) && retryCount <= maximumRetryCount { 322 log.Warn().Err(err).Msg("received retryable error in schema watch; sleeping for a bit and restarting watch") 323 retryCount++ 324 wg.Add(1) 325 pgxcommon.SleepOnErr(ctx, err, uint8(retryCount)) 326 continue restartWatch 327 } 328 329 p.namespaceCache.setFallbackMode() 330 p.caveatCache.setFallbackMode() 331 log.Warn().Err(err).Msg("received terminal error in schema watch; setting to permanent fallback mode") 332 return 333 } 334 } 335 } 336 })() 337 338 wg.Wait() 339 return nil 340 } 341 342 func (p *watchingCachingProxy) Close() error { 343 p.caveatCache.setFallbackMode() 344 p.namespaceCache.setFallbackMode() 345 346 // Close both goroutines 347 p.closed <- true 348 p.closed <- true 349 350 return errors.Join(p.fallbackCache.Close(), p.Datastore.Close()) 351 } 352 353 // schemaWatchCache is a schema cache which updates based on changes received via the WatchSchema 354 // call. 355 type schemaWatchCache[T datastore.SchemaDefinition] struct { 356 // kind is a descriptive label of the kind of definitions in the cache. 357 kind string 358 359 notFoundError notFoundErrorFn 360 readDefinition readDefinitionFn[T] 361 lookupDefinitions lookupDefinitionsFn[T] 362 363 // inFallbackMode, if true, indicates that an error occurred with the WatchSchema call and that 364 // all further calls to this cache should passthrough, rather than using the cache itself (which 365 // is likely out of date). 366 // *Must* be accessed under the lock. 367 inFallbackMode bool 368 369 // checkpointRevision is the current revision at which the cache has been given *all* possible 370 // changes. 371 // *Must* be accessed under the lock. 372 checkpointRevision datastore.Revision 373 374 // entries are the entries in the cache, by name of the namespace or caveat. 375 // *Must* be accessed under the lock. 376 entries map[string]*intervalTracker[revisionedEntry[T]] 377 378 // definitionsReadCachedCounter is a counter of the number of cached definitions 379 // returned by the cache directly (without fallback) 380 definitionsReadCachedCounter *prometheus.CounterVec 381 382 // definitionsReadTotalCounter is a counter of the total number of definitions 383 // returned. 384 definitionsReadTotalCounter *prometheus.CounterVec 385 386 // fallbackGauge is a gauge holding a value of whether the cache is in fallback mode. 387 fallbackGauge prometheus.Gauge 388 389 lock sync.RWMutex 390 } 391 392 type revisionedEntry[T datastore.SchemaDefinition] struct { 393 revisionedDefinition datastore.RevisionedDefinition[T] 394 wasNotFound bool 395 } 396 397 type ( 398 notFoundErrorFn func(name string) error 399 readDefinitionFn[T datastore.SchemaDefinition] func(ctx context.Context, name string, revision datastore.Revision) (T, datastore.Revision, error) 400 lookupDefinitionsFn[T datastore.SchemaDefinition] func(ctx context.Context, names []string, revision datastore.Revision) ([]datastore.RevisionedDefinition[T], error) 401 ) 402 403 // newSchemaWatchCache creates a new schema watch cache, starting in fallback mode. 404 // To bring out of fallback mode, call startAtRevision to indicate that a watch loop 405 // has begun at that revision. 406 func newSchemaWatchCache[T datastore.SchemaDefinition]( 407 kind string, 408 notFoundError notFoundErrorFn, 409 readDefinition readDefinitionFn[T], 410 lookupDefinitions lookupDefinitionsFn[T], 411 definitionsReadCachedCounter *prometheus.CounterVec, 412 definitionsReadTotalCounter *prometheus.CounterVec, 413 fallbackGauge prometheus.Gauge, 414 ) *schemaWatchCache[T] { 415 fallbackGauge.Set(1) 416 417 return &schemaWatchCache[T]{ 418 kind: kind, 419 420 notFoundError: notFoundError, 421 readDefinition: readDefinition, 422 lookupDefinitions: lookupDefinitions, 423 424 inFallbackMode: true, 425 entries: map[string]*intervalTracker[revisionedEntry[T]]{}, 426 checkpointRevision: nil, 427 428 lock: sync.RWMutex{}, 429 430 definitionsReadCachedCounter: definitionsReadCachedCounter, 431 definitionsReadTotalCounter: definitionsReadTotalCounter, 432 fallbackGauge: fallbackGauge, 433 } 434 } 435 436 func (swc *schemaWatchCache[T]) startAtRevision(revision datastore.Revision) { 437 swc.lock.Lock() 438 defer swc.lock.Unlock() 439 440 swc.checkpointRevision = revision 441 swc.inFallbackMode = false 442 443 swc.fallbackGauge.Set(0) 444 } 445 446 func (swc *schemaWatchCache[T]) gcStaleEntries(gcWindow time.Duration) { 447 swc.lock.Lock() 448 defer swc.lock.Unlock() 449 450 for entryName, entry := range swc.entries { 451 fullyRemoved := entry.removeStaleIntervals(gcWindow) 452 if fullyRemoved { 453 delete(swc.entries, entryName) 454 } 455 } 456 } 457 458 func (swc *schemaWatchCache[T]) setFallbackMode() { 459 swc.lock.Lock() 460 defer swc.lock.Unlock() 461 462 swc.inFallbackMode = true 463 swc.fallbackGauge.Set(1) 464 } 465 466 func (swc *schemaWatchCache[T]) reset() { 467 swc.lock.Lock() 468 defer swc.lock.Unlock() 469 470 swc.inFallbackMode = false 471 swc.fallbackGauge.Set(0) 472 swc.entries = map[string]*intervalTracker[revisionedEntry[T]]{} 473 swc.checkpointRevision = nil 474 } 475 476 func (swc *schemaWatchCache[T]) setCheckpointRevision(revision datastore.Revision) { 477 swc.lock.Lock() 478 defer swc.lock.Unlock() 479 480 swc.checkpointRevision = revision 481 } 482 483 func (swc *schemaWatchCache[T]) getTrackerForName(name string) *intervalTracker[revisionedEntry[T]] { 484 swc.lock.RLock() 485 tracker, ok := swc.entries[name] 486 swc.lock.RUnlock() 487 488 if ok { 489 return tracker 490 } 491 492 tracker = newIntervalTracker[revisionedEntry[T]]() 493 swc.lock.Lock() 494 swc.entries[name] = tracker 495 swc.lock.Unlock() 496 return tracker 497 } 498 499 func (swc *schemaWatchCache[T]) updateDefinition(name string, definition T, isDeletion bool, revision datastore.Revision) error { 500 tracker := swc.getTrackerForName(name) 501 result := tracker.add(revisionedEntry[T]{ 502 revisionedDefinition: datastore.RevisionedDefinition[T]{ 503 Definition: definition, 504 LastWrittenRevision: revision, 505 }, 506 wasNotFound: isDeletion, 507 }, revision) 508 if !result { 509 return spiceerrors.MustBugf("received out of order insertion for definition %s", name) 510 } 511 return nil 512 } 513 514 func (swc *schemaWatchCache[T]) readDefinitionByName(ctx context.Context, name string, revision datastore.Revision) (T, datastore.Revision, error) { 515 swc.definitionsReadTotalCounter.WithLabelValues(swc.kind).Inc() 516 517 swc.lock.RLock() 518 inFallbackMode := swc.inFallbackMode 519 lastCheckpointRevision := swc.checkpointRevision 520 swc.lock.RUnlock() 521 522 // If in fallback mode, just read the definition directly from the fallback cache. 523 if inFallbackMode { 524 return swc.readDefinition(ctx, name, revision) 525 } 526 527 // Lookup the tracker for the definition name and then find the associated definition for the specified revision, 528 // if any. 529 tracker := swc.getTrackerForName(name) 530 found, ok := tracker.lookup(revision, lastCheckpointRevision) 531 if ok { 532 swc.definitionsReadCachedCounter.WithLabelValues(swc.kind).Inc() 533 534 // If an entry was found, return the stored information. 535 if found.wasNotFound { 536 return *new(T), nil, swc.notFoundError(name) 537 } 538 539 return found.revisionedDefinition.Definition, found.revisionedDefinition.LastWrittenRevision, nil 540 } 541 542 // Otherwise, read the definition from the fallback cache. 543 return swc.readDefinition(ctx, name, revision) 544 } 545 546 func (swc *schemaWatchCache[T]) readDefinitionsWithNames(ctx context.Context, names []string, revision datastore.Revision) ([]datastore.RevisionedDefinition[T], error) { 547 swc.definitionsReadTotalCounter.WithLabelValues(swc.kind).Add(float64(len(names))) 548 549 swc.lock.RLock() 550 inFallbackMode := swc.inFallbackMode 551 lastCheckpointRevision := swc.checkpointRevision 552 swc.lock.RUnlock() 553 554 // If in fallback mode, just read the definition directly from the fallback cache. 555 if inFallbackMode { 556 return swc.lookupDefinitions(ctx, names, revision) 557 } 558 559 // Find whichever trackers are cached. 560 remainingNames := mapz.NewSet(names...) 561 foundDefs := make([]datastore.RevisionedDefinition[T], 0, len(names)) 562 for _, name := range names { 563 tracker := swc.getTrackerForName(name) 564 found, ok := tracker.lookup(revision, lastCheckpointRevision) 565 if !ok { 566 continue 567 } 568 569 swc.definitionsReadCachedCounter.WithLabelValues(swc.kind).Inc() 570 remainingNames.Delete(name) 571 if !found.wasNotFound { 572 foundDefs = append(foundDefs, found.revisionedDefinition) 573 } 574 } 575 576 // If there are still remaining definition names to be looked up, look them up and then cache them. 577 if !remainingNames.IsEmpty() { 578 additionalDefs, err := swc.lookupDefinitions(ctx, remainingNames.AsSlice(), revision) 579 if err != nil { 580 return nil, err 581 } 582 583 foundDefs = append(foundDefs, additionalDefs...) 584 } 585 586 return foundDefs, nil 587 } 588 589 type watchingCachingReader struct { 590 datastore.Reader 591 rev datastore.Revision 592 p *watchingCachingProxy 593 } 594 595 func (r *watchingCachingReader) ReadNamespaceByName( 596 ctx context.Context, 597 name string, 598 ) (*core.NamespaceDefinition, datastore.Revision, error) { 599 return r.p.namespaceCache.readDefinitionByName(ctx, name, r.rev) 600 } 601 602 func (r *watchingCachingReader) LookupNamespacesWithNames( 603 ctx context.Context, 604 nsNames []string, 605 ) ([]datastore.RevisionedNamespace, error) { 606 return r.p.namespaceCache.readDefinitionsWithNames(ctx, nsNames, r.rev) 607 } 608 609 func (r *watchingCachingReader) ReadCaveatByName( 610 ctx context.Context, 611 name string, 612 ) (*core.CaveatDefinition, datastore.Revision, error) { 613 return r.p.caveatCache.readDefinitionByName(ctx, name, r.rev) 614 } 615 616 func (r *watchingCachingReader) LookupCaveatsWithNames( 617 ctx context.Context, 618 caveatNames []string, 619 ) ([]datastore.RevisionedCaveat, error) { 620 return r.p.caveatCache.readDefinitionsWithNames(ctx, caveatNames, r.rev) 621 }