github.com/thanos-io/thanos@v0.32.5/pkg/cache/groupcache.go (about) 1 // Copyright (c) The Thanos Authors. 2 // Licensed under the Apache License 2.0. 3 4 package cache 5 6 import ( 7 "context" 8 "crypto/tls" 9 "encoding/json" 10 "fmt" 11 "io" 12 "net" 13 "net/http" 14 "path/filepath" 15 "strconv" 16 "strings" 17 "time" 18 19 "github.com/go-kit/log" 20 "github.com/go-kit/log/level" 21 "github.com/prometheus/client_golang/prometheus" 22 "github.com/prometheus/common/route" 23 "github.com/thanos-io/objstore" 24 "github.com/vimeo/galaxycache" 25 galaxyhttp "github.com/vimeo/galaxycache/http" 26 "golang.org/x/net/http2" 27 "gopkg.in/yaml.v2" 28 29 "github.com/thanos-io/thanos/pkg/discovery/dns" 30 "github.com/thanos-io/thanos/pkg/extprom" 31 "github.com/thanos-io/thanos/pkg/model" 32 "github.com/thanos-io/thanos/pkg/runutil" 33 "github.com/thanos-io/thanos/pkg/store/cache/cachekey" 34 ) 35 36 type Groupcache struct { 37 galaxy *galaxycache.Galaxy 38 universe *galaxycache.Universe 39 logger log.Logger 40 timeout time.Duration 41 } 42 43 // GroupcacheConfig holds the in-memory cache config. 44 type GroupcacheConfig struct { 45 // Addresses of statically configured peers (repeatable). The scheme may be prefixed with 'dns+' or 'dnssrv+' to detect store API servers through respective DNS lookups. 46 // Typically, you'd want something like `dns+http://thanos-store:42/`. 47 Peers []string `yaml:"peers"` 48 49 // Address of ourselves in the peer list. This needs to be set to `http://external-ip:HTTP_PORT` 50 // of the current instance. 51 SelfURL string `yaml:"self_url"` 52 53 // Maximum size of the hot in-memory cache. 54 MaxSize model.Bytes `yaml:"max_size"` 55 56 // Group's name. All of the instances need to be using the same group and point to the same bucket. 57 GroupcacheGroup string `yaml:"groupcache_group"` 58 59 // DNS SD resolver to use. 60 DNSSDResolver dns.ResolverType `yaml:"dns_sd_resolver"` 61 62 // How often we should resolve the addresses. 63 DNSInterval time.Duration `yaml:"dns_interval"` 64 65 // Timeout specifies the read/write timeout. 66 Timeout time.Duration `yaml:"timeout"` 67 } 68 69 var ( 70 DefaultGroupcacheConfig = GroupcacheConfig{ 71 MaxSize: 250 * 1024 * 1024, 72 DNSSDResolver: dns.GolangResolverType, 73 DNSInterval: 1 * time.Minute, 74 Timeout: 2 * time.Second, 75 } 76 ) 77 78 // parseGroupcacheConfig unmarshals a buffer into a GroupcacheConfig with default values. 79 func parseGroupcacheConfig(conf []byte) (GroupcacheConfig, error) { 80 config := DefaultGroupcacheConfig 81 if err := yaml.Unmarshal(conf, &config); err != nil { 82 return GroupcacheConfig{}, err 83 } 84 85 if len(config.Peers) == 0 { 86 config.Peers = append(config.Peers, config.SelfURL) 87 } 88 89 for i, peer := range config.Peers { 90 // Workaround for https://github.com/thanos-community/galaxycache/blob/master/http/http.go#L205-L210. 91 // If the peer has a slash at the end then the router redirects 92 // and then the request fails. 93 if strings.HasSuffix(peer, "/") { 94 return GroupcacheConfig{}, fmt.Errorf("peer %d must not have a trailing slash (%s)", i, peer) 95 } 96 } 97 if strings.HasSuffix(config.SelfURL, "/") { 98 return GroupcacheConfig{}, fmt.Errorf("self URL %s must not have a trailing slash", config.SelfURL) 99 } 100 101 return config, nil 102 } 103 104 // NewGroupcache creates a new Groupcache instance. 105 func NewGroupcache(logger log.Logger, reg prometheus.Registerer, conf []byte, basepath string, r *route.Router, bucket objstore.Bucket, cfg *CachingBucketConfig) (*Groupcache, error) { 106 config, err := parseGroupcacheConfig(conf) 107 if err != nil { 108 return nil, err 109 } 110 111 return NewGroupcacheWithConfig(logger, reg, config, basepath, r, bucket, cfg) 112 } 113 114 // NewGroupcacheWithConfig creates a new Groupcache instance with the given config. 115 func NewGroupcacheWithConfig(logger log.Logger, reg prometheus.Registerer, conf GroupcacheConfig, basepath string, r *route.Router, bucket objstore.Bucket, 116 cfg *CachingBucketConfig) (*Groupcache, error) { 117 httpProto := galaxyhttp.NewHTTPFetchProtocol(&galaxyhttp.HTTPOptions{ 118 BasePath: basepath, 119 Transport: &http2.Transport{ 120 AllowHTTP: true, 121 DialTLS: func(network, addr string, cfg *tls.Config) (net.Conn, error) { 122 return net.Dial(network, addr) 123 }, 124 }, 125 }) 126 universe := galaxycache.NewUniverse(httpProto, conf.SelfURL) 127 128 dnsGroupcacheProvider := dns.NewProvider( 129 logger, 130 extprom.WrapRegistererWithPrefix("thanos_store_groupcache_", reg), 131 dns.ResolverType(conf.DNSSDResolver), 132 ) 133 ticker := time.NewTicker(conf.DNSInterval) 134 135 go func() { 136 for { 137 if err := dnsGroupcacheProvider.Resolve(context.Background(), conf.Peers); err != nil { 138 level.Error(logger).Log("msg", "failed to resolve addresses for groupcache", "err", err) 139 } else { 140 err := universe.Set(dnsGroupcacheProvider.Addresses()...) 141 if err != nil { 142 level.Error(logger).Log("msg", "failed to set peers for groupcache", "err", err) 143 } 144 } 145 146 <-ticker.C 147 } 148 }() 149 150 mux := http.NewServeMux() 151 galaxyhttp.RegisterHTTPHandler(universe, &galaxyhttp.HTTPOptions{ 152 BasePath: basepath, 153 }, mux) 154 r.Get(filepath.Join(basepath, conf.GroupcacheGroup, "*key"), mux.ServeHTTP) 155 156 galaxy := universe.NewGalaxy(conf.GroupcacheGroup, int64(conf.MaxSize), galaxycache.GetterFunc( 157 func(ctx context.Context, id string, dest galaxycache.Codec) error { 158 parsedData, err := cachekey.ParseBucketCacheKey(id) 159 if err != nil { 160 return err 161 } 162 163 switch parsedData.Verb { 164 case cachekey.AttributesVerb: 165 _, attrCfg := cfg.FindAttributesConfig(parsedData.Name) 166 if attrCfg == nil { 167 panic("caching bucket layer must not call on unconfigured paths") 168 } 169 170 attrs, err := bucket.Attributes(ctx, parsedData.Name) 171 if err != nil { 172 return err 173 } 174 175 finalAttrs, err := json.Marshal(attrs) 176 if err != nil { 177 return err 178 } 179 180 return dest.UnmarshalBinary(finalAttrs, time.Now().Add(attrCfg.TTL)) 181 case cachekey.IterVerb: 182 _, iterCfg := cfg.FindIterConfig(parsedData.Name) 183 if iterCfg == nil { 184 panic("caching bucket layer must not call on unconfigured paths") 185 } 186 187 var list []string 188 if err := bucket.Iter(ctx, parsedData.Name, func(s string) error { 189 list = append(list, s) 190 return nil 191 }); err != nil { 192 return err 193 } 194 195 encodedList, err := json.Marshal(list) 196 if err != nil { 197 return err 198 } 199 200 return dest.UnmarshalBinary(encodedList, time.Now().Add(iterCfg.TTL)) 201 case cachekey.IterRecursiveVerb: 202 _, iterCfg := cfg.FindIterConfig(parsedData.Name) 203 if iterCfg == nil { 204 panic("caching bucket layer must not call on unconfigured paths") 205 } 206 207 var list []string 208 if err := bucket.Iter(ctx, parsedData.Name, func(s string) error { 209 list = append(list, s) 210 return nil 211 }, objstore.WithRecursiveIter); err != nil { 212 return err 213 } 214 215 encodedList, err := json.Marshal(list) 216 if err != nil { 217 return err 218 } 219 220 return dest.UnmarshalBinary(encodedList, time.Now().Add(iterCfg.TTL)) 221 case cachekey.ContentVerb: 222 _, contentCfg := cfg.FindGetConfig(parsedData.Name) 223 if contentCfg == nil { 224 panic("caching bucket layer must not call on unconfigured paths") 225 } 226 rc, err := bucket.Get(ctx, parsedData.Name) 227 if err != nil { 228 return err 229 } 230 defer runutil.CloseWithLogOnErr(logger, rc, "closing get") 231 232 b, err := io.ReadAll(rc) 233 if err != nil { 234 return err 235 } 236 237 return dest.UnmarshalBinary(b, time.Now().Add(contentCfg.ContentTTL)) 238 case cachekey.ExistsVerb: 239 _, existsCfg := cfg.FindExistConfig(parsedData.Name) 240 if existsCfg == nil { 241 panic("caching bucket layer must not call on unconfigured paths") 242 } 243 exists, err := bucket.Exists(ctx, parsedData.Name) 244 if err != nil { 245 return err 246 } 247 248 if exists { 249 return dest.UnmarshalBinary([]byte(strconv.FormatBool(exists)), time.Now().Add(existsCfg.ExistsTTL)) 250 } else { 251 return dest.UnmarshalBinary([]byte(strconv.FormatBool(exists)), time.Now().Add(existsCfg.DoesntExistTTL)) 252 } 253 254 case cachekey.SubrangeVerb: 255 _, subrangeCfg := cfg.FindGetRangeConfig(parsedData.Name) 256 if subrangeCfg == nil { 257 panic("caching bucket layer must not call on unconfigured paths") 258 } 259 rc, err := bucket.GetRange(ctx, parsedData.Name, parsedData.Start, parsedData.End-parsedData.Start) 260 if err != nil { 261 return err 262 } 263 defer runutil.CloseWithLogOnErr(logger, rc, "closing get_range") 264 265 b, err := io.ReadAll(rc) 266 if err != nil { 267 return err 268 } 269 270 return dest.UnmarshalBinary(b, time.Now().Add(subrangeCfg.SubrangeTTL)) 271 272 } 273 274 return nil 275 }, 276 )) 277 278 RegisterCacheStatsCollector(galaxy, &conf, reg) 279 280 return &Groupcache{ 281 logger: logger, 282 galaxy: galaxy, 283 universe: universe, 284 timeout: conf.Timeout, 285 }, nil 286 } 287 288 // unsafeByteCodec is a byte slice type that implements Codec. 289 type unsafeByteCodec struct { 290 bytes []byte 291 expire time.Time 292 } 293 294 // MarshalBinary returns the contained byte-slice. 295 func (c *unsafeByteCodec) MarshalBinary() ([]byte, time.Time, error) { 296 return c.bytes, c.expire, nil 297 } 298 299 // UnmarshalBinary to provided data so they share the same backing array 300 // this is a generally unsafe performance optimization, but safe in our 301 // case because we always use io.ReadAll(). That is fine though 302 // because later that slice remains in our local cache. 303 // Used https://github.com/vimeo/galaxycache/pull/23/files as inspiration. 304 // TODO(GiedriusS): figure out if pooling could be used somehow by hooking into 305 // eviction. 306 func (c *unsafeByteCodec) UnmarshalBinary(data []byte, expire time.Time) error { 307 c.bytes = data 308 c.expire = expire 309 return nil 310 } 311 312 func (c *Groupcache) Store(data map[string][]byte, ttl time.Duration) { 313 // Noop since cache is already filled during fetching. 314 } 315 316 func (c *Groupcache) Fetch(ctx context.Context, keys []string) map[string][]byte { 317 data := map[string][]byte{} 318 319 if c.timeout != 0 { 320 timeoutCtx, cancel := context.WithTimeout(ctx, c.timeout) 321 ctx = timeoutCtx 322 defer cancel() 323 } 324 325 for _, k := range keys { 326 codec := unsafeByteCodec{} 327 328 if err := c.galaxy.Get(ctx, k, &codec); err != nil { 329 level.Debug(c.logger).Log("msg", "failed fetching data from groupcache", "err", err, "key", k) 330 continue 331 } 332 333 retrievedData, _, err := codec.MarshalBinary() 334 if err != nil { 335 level.Debug(c.logger).Log("msg", "failed retrieving data", "err", err, "key", k) 336 continue 337 } 338 339 if len(retrievedData) > 0 { 340 data[k] = retrievedData 341 } 342 } 343 344 return data 345 } 346 347 func (c *Groupcache) Name() string { 348 return c.galaxy.Name() 349 } 350 351 type CacheStatsCollector struct { 352 galaxy *galaxycache.Galaxy 353 conf *GroupcacheConfig 354 355 // GalaxyCache Metric descriptions. 356 bytes *prometheus.Desc 357 evictions *prometheus.Desc 358 items *prometheus.Desc 359 maxBytes *prometheus.Desc 360 gets *prometheus.Desc 361 loads *prometheus.Desc 362 peerLoads *prometheus.Desc 363 peerLoadErrors *prometheus.Desc 364 backendLoads *prometheus.Desc 365 backendLoadErrors *prometheus.Desc 366 cacheHits *prometheus.Desc 367 } 368 369 // RegisterCacheStatsCollector registers a groupcache metrics collector. 370 func RegisterCacheStatsCollector(galaxy *galaxycache.Galaxy, conf *GroupcacheConfig, reg prometheus.Registerer) { 371 // Cache metrics. 372 bytes := prometheus.NewDesc("thanos_cache_groupcache_bytes", "The number of bytes in the main cache.", []string{"cache"}, nil) 373 evictions := prometheus.NewDesc("thanos_cache_groupcache_evictions_total", "The number items evicted from the cache.", []string{"cache"}, nil) 374 items := prometheus.NewDesc("thanos_cache_groupcache_items", "The number of items in the cache.", []string{"cache"}, nil) 375 376 // Configuration Metrics. 377 maxBytes := prometheus.NewDesc("thanos_cache_groupcache_max_bytes", "The max number of bytes in the cache.", nil, nil) 378 379 // GroupCache metrics. 380 gets := prometheus.NewDesc("thanos_cache_groupcache_get_requests_total", "Total number of get requests, including from peers.", nil, nil) 381 loads := prometheus.NewDesc("thanos_cache_groupcache_loads_total", "Total number of loads from backend (gets - cacheHits).", nil, nil) 382 peerLoads := prometheus.NewDesc("thanos_cache_groupcache_peer_loads_total", "Total number of loads from peers (remote load or remote cache hit).", nil, nil) 383 peerLoadErrors := prometheus.NewDesc("thanos_cache_groupcache_peer_load_errors_total", "Total number of errors from peer loads.", nil, nil) 384 backendLoads := prometheus.NewDesc("thanos_cache_groupcache_backend_loads_total", "Total number of direct backend loads.", nil, nil) 385 backendLoadErrors := prometheus.NewDesc("thanos_cache_groupcache_backend_load_errors_total", "Total number of errors on direct backend loads.", nil, nil) 386 cacheHits := prometheus.NewDesc("thanos_cache_groupcache_hits_total", "Total number of cache hits.", []string{"type"}, nil) 387 388 collector := &CacheStatsCollector{ 389 galaxy: galaxy, 390 conf: conf, 391 bytes: bytes, 392 evictions: evictions, 393 items: items, 394 maxBytes: maxBytes, 395 gets: gets, 396 loads: loads, 397 peerLoads: peerLoads, 398 peerLoadErrors: peerLoadErrors, 399 backendLoads: backendLoads, 400 backendLoadErrors: backendLoadErrors, 401 cacheHits: cacheHits, 402 } 403 reg.MustRegister(collector) 404 } 405 406 func (s *CacheStatsCollector) Collect(ch chan<- prometheus.Metric) { 407 for _, cache := range []galaxycache.CacheType{galaxycache.MainCache, galaxycache.HotCache} { 408 cacheStats := s.galaxy.CacheStats(cache) 409 ch <- prometheus.MustNewConstMetric(s.bytes, prometheus.GaugeValue, float64(cacheStats.Bytes), cache.String()) 410 ch <- prometheus.MustNewConstMetric(s.evictions, prometheus.GaugeValue, float64(cacheStats.Evictions), cache.String()) 411 ch <- prometheus.MustNewConstMetric(s.items, prometheus.GaugeValue, float64(cacheStats.Items), cache.String()) 412 } 413 414 ch <- prometheus.MustNewConstMetric(s.maxBytes, prometheus.GaugeValue, float64(s.conf.MaxSize)) 415 ch <- prometheus.MustNewConstMetric(s.gets, prometheus.CounterValue, float64(s.galaxy.Stats.Gets.Get())) 416 ch <- prometheus.MustNewConstMetric(s.loads, prometheus.CounterValue, float64(s.galaxy.Stats.Loads.Get())) 417 ch <- prometheus.MustNewConstMetric(s.peerLoads, prometheus.CounterValue, float64(s.galaxy.Stats.PeerLoads.Get())) 418 ch <- prometheus.MustNewConstMetric(s.peerLoadErrors, prometheus.CounterValue, float64(s.galaxy.Stats.PeerLoadErrors.Get())) 419 ch <- prometheus.MustNewConstMetric(s.backendLoads, prometheus.CounterValue, float64(s.galaxy.Stats.BackendLoads.Get())) 420 ch <- prometheus.MustNewConstMetric(s.backendLoadErrors, prometheus.CounterValue, float64(s.galaxy.Stats.BackendLoadErrors.Get())) 421 ch <- prometheus.MustNewConstMetric(s.cacheHits, prometheus.CounterValue, float64(s.galaxy.Stats.MaincacheHits.Get()), galaxycache.MainCache.String()) 422 ch <- prometheus.MustNewConstMetric(s.cacheHits, prometheus.CounterValue, float64(s.galaxy.Stats.HotcacheHits.Get()), galaxycache.HotCache.String()) 423 } 424 425 func (s *CacheStatsCollector) Describe(ch chan<- *prometheus.Desc) { 426 prometheus.DescribeByCollect(s, ch) 427 }