github.com/thanos-io/thanos@v0.32.5/pkg/cache/groupcache.go

// Copyright (c) The Thanos Authors.
// Licensed under the Apache License 2.0.

package cache

import (
	"context"
	"crypto/tls"
	"encoding/json"
	"fmt"
	"io"
	"net"
	"net/http"
	"path/filepath"
	"strconv"
	"strings"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/common/route"
	"github.com/thanos-io/objstore"
	"github.com/vimeo/galaxycache"
	galaxyhttp "github.com/vimeo/galaxycache/http"
	"golang.org/x/net/http2"
	"gopkg.in/yaml.v2"

	"github.com/thanos-io/thanos/pkg/discovery/dns"
	"github.com/thanos-io/thanos/pkg/extprom"
	"github.com/thanos-io/thanos/pkg/model"
	"github.com/thanos-io/thanos/pkg/runutil"
	"github.com/thanos-io/thanos/pkg/store/cache/cachekey"
)

type Groupcache struct {
	galaxy   *galaxycache.Galaxy
	universe *galaxycache.Universe
	logger   log.Logger
	timeout  time.Duration
}

// GroupcacheConfig holds the in-memory cache config.
type GroupcacheConfig struct {
	// Addresses of statically configured peers (repeatable). The scheme may be prefixed with 'dns+' or 'dnssrv+' to resolve the peer addresses through the respective DNS lookups.
	// Typically, you'd want something like `dns+http://thanos-store:42`.
	Peers []string `yaml:"peers"`

	// Address of ourselves in the peer list. This needs to be set to `http://external-ip:HTTP_PORT`
	// of the current instance.
	SelfURL string `yaml:"self_url"`

	// Maximum size of the hot in-memory cache.
	MaxSize model.Bytes `yaml:"max_size"`

	// Name of the group. All instances must use the same group name and point to the same bucket.
	GroupcacheGroup string `yaml:"groupcache_group"`

	// DNS SD resolver to use.
	DNSSDResolver dns.ResolverType `yaml:"dns_sd_resolver"`

	// How often we should resolve the addresses.
	DNSInterval time.Duration `yaml:"dns_interval"`

	// Timeout specifies the read/write timeout.
	Timeout time.Duration `yaml:"timeout"`
}
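
// For orientation, a YAML configuration for this cache could look roughly like
// the following (a minimal sketch; the addresses, port, group name, and sizes
// are placeholders, and any omitted field keeps its value from
// DefaultGroupcacheConfig below):
//
//	self_url: http://10.0.0.1:8080
//	peers:
//	  - dns+http://thanos-store:8080
//	groupcache_group: test_group
//	max_size: 250MB
//	dns_sd_resolver: golang
//	dns_interval: 1m
//	timeout: 2s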

var (
	DefaultGroupcacheConfig = GroupcacheConfig{
		MaxSize:       250 * 1024 * 1024,
		DNSSDResolver: dns.GolangResolverType,
		DNSInterval:   1 * time.Minute,
		Timeout:       2 * time.Second,
	}
)

// parseGroupcacheConfig unmarshals a buffer into a GroupcacheConfig with default values.
func parseGroupcacheConfig(conf []byte) (GroupcacheConfig, error) {
	config := DefaultGroupcacheConfig
	if err := yaml.Unmarshal(conf, &config); err != nil {
		return GroupcacheConfig{}, err
	}

	if len(config.Peers) == 0 {
		config.Peers = append(config.Peers, config.SelfURL)
	}

	for i, peer := range config.Peers {
		// Workaround for https://github.com/thanos-community/galaxycache/blob/master/http/http.go#L205-L210.
		// If a peer address has a trailing slash, the router issues a redirect
		// and the request then fails.
		if strings.HasSuffix(peer, "/") {
			return GroupcacheConfig{}, fmt.Errorf("peer %d must not have a trailing slash (%s)", i, peer)
		}
	}
	if strings.HasSuffix(config.SelfURL, "/") {
		return GroupcacheConfig{}, fmt.Errorf("self URL %s must not have a trailing slash", config.SelfURL)
	}

	return config, nil
}
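
// A quick illustration of the behaviour above (the addresses are placeholders):
// unset fields keep their defaults, an empty peer list falls back to self_url,
// and a trailing slash on any peer (or on self_url) is rejected.
//
//	cfg, err := parseGroupcacheConfig([]byte("self_url: http://10.0.0.1:8080\ngroupcache_group: test"))
//	// err == nil; cfg.Peers == []string{"http://10.0.0.1:8080"}; other fields keep their defaults.
//
//	_, err = parseGroupcacheConfig([]byte("peers: ['http://10.0.0.1:8080/']\nself_url: http://10.0.0.1:8080"))
//	// err: peer 0 must not have a trailing slash (http://10.0.0.1:8080/)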

// NewGroupcache creates a new Groupcache instance.
func NewGroupcache(logger log.Logger, reg prometheus.Registerer, conf []byte, basepath string, r *route.Router, bucket objstore.Bucket, cfg *CachingBucketConfig) (*Groupcache, error) {
	config, err := parseGroupcacheConfig(conf)
	if err != nil {
		return nil, err
	}

	return NewGroupcacheWithConfig(logger, reg, config, basepath, r, bucket, cfg)
}

// NewGroupcacheWithConfig creates a new Groupcache instance with the given config.
func NewGroupcacheWithConfig(logger log.Logger, reg prometheus.Registerer, conf GroupcacheConfig, basepath string, r *route.Router, bucket objstore.Bucket,
	cfg *CachingBucketConfig) (*Groupcache, error) {
	httpProto := galaxyhttp.NewHTTPFetchProtocol(&galaxyhttp.HTTPOptions{
		BasePath: basepath,
		Transport: &http2.Transport{
			AllowHTTP: true,
			DialTLS: func(network, addr string, cfg *tls.Config) (net.Conn, error) {
				return net.Dial(network, addr)
			},
		},
	})
	universe := galaxycache.NewUniverse(httpProto, conf.SelfURL)

	dnsGroupcacheProvider := dns.NewProvider(
		logger,
		extprom.WrapRegistererWithPrefix("thanos_store_groupcache_", reg),
		dns.ResolverType(conf.DNSSDResolver),
	)
	ticker := time.NewTicker(conf.DNSInterval)

	go func() {
		for {
			if err := dnsGroupcacheProvider.Resolve(context.Background(), conf.Peers); err != nil {
				level.Error(logger).Log("msg", "failed to resolve addresses for groupcache", "err", err)
			} else {
				err := universe.Set(dnsGroupcacheProvider.Addresses()...)
				if err != nil {
					level.Error(logger).Log("msg", "failed to set peers for groupcache", "err", err)
				}
			}

			<-ticker.C
		}
	}()

	mux := http.NewServeMux()
	galaxyhttp.RegisterHTTPHandler(universe, &galaxyhttp.HTTPOptions{
		BasePath: basepath,
	}, mux)
	r.Get(filepath.Join(basepath, conf.GroupcacheGroup, "*key"), mux.ServeHTTP)

	galaxy := universe.NewGalaxy(conf.GroupcacheGroup, int64(conf.MaxSize), galaxycache.GetterFunc(
		func(ctx context.Context, id string, dest galaxycache.Codec) error {
			parsedData, err := cachekey.ParseBucketCacheKey(id)
			if err != nil {
				return err
			}

			switch parsedData.Verb {
			case cachekey.AttributesVerb:
				_, attrCfg := cfg.FindAttributesConfig(parsedData.Name)
				if attrCfg == nil {
					panic("caching bucket layer must not call on unconfigured paths")
				}

				attrs, err := bucket.Attributes(ctx, parsedData.Name)
				if err != nil {
					return err
				}

				finalAttrs, err := json.Marshal(attrs)
				if err != nil {
					return err
				}

				return dest.UnmarshalBinary(finalAttrs, time.Now().Add(attrCfg.TTL))
			case cachekey.IterVerb:
				_, iterCfg := cfg.FindIterConfig(parsedData.Name)
				if iterCfg == nil {
					panic("caching bucket layer must not call on unconfigured paths")
				}

				var list []string
				if err := bucket.Iter(ctx, parsedData.Name, func(s string) error {
					list = append(list, s)
					return nil
				}); err != nil {
					return err
				}

				encodedList, err := json.Marshal(list)
				if err != nil {
					return err
				}

				return dest.UnmarshalBinary(encodedList, time.Now().Add(iterCfg.TTL))
			case cachekey.IterRecursiveVerb:
				_, iterCfg := cfg.FindIterConfig(parsedData.Name)
				if iterCfg == nil {
					panic("caching bucket layer must not call on unconfigured paths")
				}

				var list []string
				if err := bucket.Iter(ctx, parsedData.Name, func(s string) error {
					list = append(list, s)
					return nil
				}, objstore.WithRecursiveIter); err != nil {
					return err
				}

				encodedList, err := json.Marshal(list)
				if err != nil {
					return err
				}

				return dest.UnmarshalBinary(encodedList, time.Now().Add(iterCfg.TTL))
			case cachekey.ContentVerb:
				_, contentCfg := cfg.FindGetConfig(parsedData.Name)
				if contentCfg == nil {
					panic("caching bucket layer must not call on unconfigured paths")
				}
				rc, err := bucket.Get(ctx, parsedData.Name)
				if err != nil {
					return err
				}
				defer runutil.CloseWithLogOnErr(logger, rc, "closing get")

				b, err := io.ReadAll(rc)
				if err != nil {
					return err
				}

				return dest.UnmarshalBinary(b, time.Now().Add(contentCfg.ContentTTL))
			case cachekey.ExistsVerb:
				_, existsCfg := cfg.FindExistConfig(parsedData.Name)
				if existsCfg == nil {
					panic("caching bucket layer must not call on unconfigured paths")
				}
				exists, err := bucket.Exists(ctx, parsedData.Name)
				if err != nil {
					return err
				}

				if exists {
					return dest.UnmarshalBinary([]byte(strconv.FormatBool(exists)), time.Now().Add(existsCfg.ExistsTTL))
				} else {
					return dest.UnmarshalBinary([]byte(strconv.FormatBool(exists)), time.Now().Add(existsCfg.DoesntExistTTL))
				}

			case cachekey.SubrangeVerb:
				_, subrangeCfg := cfg.FindGetRangeConfig(parsedData.Name)
				if subrangeCfg == nil {
					panic("caching bucket layer must not call on unconfigured paths")
				}
				rc, err := bucket.GetRange(ctx, parsedData.Name, parsedData.Start, parsedData.End-parsedData.Start)
				if err != nil {
					return err
				}
				defer runutil.CloseWithLogOnErr(logger, rc, "closing get_range")

				b, err := io.ReadAll(rc)
				if err != nil {
					return err
				}

				return dest.UnmarshalBinary(b, time.Now().Add(subrangeCfg.SubrangeTTL))

			}

			return nil
		},
	))

	RegisterCacheStatsCollector(galaxy, &conf, reg)

	return &Groupcache{
		logger:   logger,
		galaxy:   galaxy,
		universe: universe,
		timeout:  conf.Timeout,
	}, nil
}
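
// A rough wiring sketch for the constructor above. It is illustrative only:
// logger, reg, and bkt are assumed to already exist in the caller, the
// addresses and basepath are placeholders, and the caching-bucket config is
// built with this package's NewCachingBucketConfig and must have its
// per-operation caching configured before being passed in.
//
//	conf := DefaultGroupcacheConfig
//	conf.SelfURL = "http://10.0.0.1:8080"
//	conf.Peers = []string{"dns+http://thanos-store:8080"}
//	conf.GroupcacheGroup = "test_group"
//
//	r := route.New()
//	cachingCfg := NewCachingBucketConfig()
//	cache, err := NewGroupcacheWithConfig(logger, reg, conf, "/_galaxycache/", r, bkt, cachingCfg)
//	if err != nil {
//		return err
//	}
//	// The peer-to-peer HTTP handler is now registered on r; serve r on the
//	// address advertised in conf.SelfURL so other peers can reach this instance.
//	_ = cache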

// unsafeByteCodec is a galaxycache.Codec that wraps a byte slice and its
// expiration time.
type unsafeByteCodec struct {
	bytes  []byte
	expire time.Time
}

// MarshalBinary returns the contained byte slice.
func (c *unsafeByteCodec) MarshalBinary() ([]byte, time.Time, error) {
	return c.bytes, c.expire, nil
}

// UnmarshalBinary aliases the provided data instead of copying it, so the codec
// and the caller share the same backing array. This is generally an unsafe
// performance optimization, but it is safe here because the data always comes
// from io.ReadAll() and the resulting slice then lives only in our local cache.
// Used https://github.com/vimeo/galaxycache/pull/23/files as inspiration.
// TODO(GiedriusS): figure out if pooling could be used somehow by hooking into
// eviction.
func (c *unsafeByteCodec) UnmarshalBinary(data []byte, expire time.Time) error {
	c.bytes = data
	c.expire = expire
	return nil
}
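
// A contrived sketch of the aliasing described above (not code used anywhere
// in this package): after UnmarshalBinary the codec holds the caller's slice
// rather than a copy, so the caller must hand over ownership of that buffer.
//
//	var c unsafeByteCodec
//	buf := []byte("payload")
//	_ = c.UnmarshalBinary(buf, time.Now().Add(time.Minute))
//	b, _, _ := c.MarshalBinary()
//	buf[0] = 'X' // b[0] is now 'X' too, because b and buf share the same backing array.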

func (c *Groupcache) Store(data map[string][]byte, ttl time.Duration) {
	// Noop since cache is already filled during fetching.
}

func (c *Groupcache) Fetch(ctx context.Context, keys []string) map[string][]byte {
	data := map[string][]byte{}

	if c.timeout != 0 {
		timeoutCtx, cancel := context.WithTimeout(ctx, c.timeout)
		ctx = timeoutCtx
		defer cancel()
	}

	for _, k := range keys {
		codec := unsafeByteCodec{}

		if err := c.galaxy.Get(ctx, k, &codec); err != nil {
			level.Debug(c.logger).Log("msg", "failed fetching data from groupcache", "err", err, "key", k)
			continue
		}

		retrievedData, _, err := codec.MarshalBinary()
		if err != nil {
			level.Debug(c.logger).Log("msg", "failed retrieving data", "err", err, "key", k)
			continue
		}

		if len(retrievedData) > 0 {
			data[k] = retrievedData
		}
	}

	return data
}
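
// A minimal caller-side sketch for Fetch (keyA and keyB are placeholders; in
// practice the caching bucket layer builds keys via the cachekey package).
// Keys that fail to load are simply left out of the returned map.
//
//	hits := cache.Fetch(context.Background(), []string{keyA, keyB})
//	if raw, ok := hits[keyA]; ok {
//		// keyA was served by groupcache: from the local or hot cache, from a
//		// peer, or loaded from the bucket on demand by the getter above.
//		_ = raw
//	}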

func (c *Groupcache) Name() string {
	return c.galaxy.Name()
}

type CacheStatsCollector struct {
	galaxy *galaxycache.Galaxy
	conf   *GroupcacheConfig

	// GalaxyCache metric descriptions.
	bytes             *prometheus.Desc
	evictions         *prometheus.Desc
	items             *prometheus.Desc
	maxBytes          *prometheus.Desc
	gets              *prometheus.Desc
	loads             *prometheus.Desc
	peerLoads         *prometheus.Desc
	peerLoadErrors    *prometheus.Desc
	backendLoads      *prometheus.Desc
	backendLoadErrors *prometheus.Desc
	cacheHits         *prometheus.Desc
}

// RegisterCacheStatsCollector registers a groupcache metrics collector.
func RegisterCacheStatsCollector(galaxy *galaxycache.Galaxy, conf *GroupcacheConfig, reg prometheus.Registerer) {
	// Cache metrics.
	bytes := prometheus.NewDesc("thanos_cache_groupcache_bytes", "The number of bytes in the main cache.", []string{"cache"}, nil)
	evictions := prometheus.NewDesc("thanos_cache_groupcache_evictions_total", "The number of items evicted from the cache.", []string{"cache"}, nil)
	items := prometheus.NewDesc("thanos_cache_groupcache_items", "The number of items in the cache.", []string{"cache"}, nil)

	// Configuration metrics.
	maxBytes := prometheus.NewDesc("thanos_cache_groupcache_max_bytes", "The max number of bytes in the cache.", nil, nil)

	// GroupCache metrics.
	gets := prometheus.NewDesc("thanos_cache_groupcache_get_requests_total", "Total number of get requests, including from peers.", nil, nil)
	loads := prometheus.NewDesc("thanos_cache_groupcache_loads_total", "Total number of loads from backend (gets - cacheHits).", nil, nil)
	peerLoads := prometheus.NewDesc("thanos_cache_groupcache_peer_loads_total", "Total number of loads from peers (remote load or remote cache hit).", nil, nil)
	peerLoadErrors := prometheus.NewDesc("thanos_cache_groupcache_peer_load_errors_total", "Total number of errors from peer loads.", nil, nil)
	backendLoads := prometheus.NewDesc("thanos_cache_groupcache_backend_loads_total", "Total number of direct backend loads.", nil, nil)
	backendLoadErrors := prometheus.NewDesc("thanos_cache_groupcache_backend_load_errors_total", "Total number of errors on direct backend loads.", nil, nil)
	cacheHits := prometheus.NewDesc("thanos_cache_groupcache_hits_total", "Total number of cache hits.", []string{"type"}, nil)

	collector := &CacheStatsCollector{
		galaxy:            galaxy,
		conf:              conf,
		bytes:             bytes,
		evictions:         evictions,
		items:             items,
		maxBytes:          maxBytes,
		gets:              gets,
		loads:             loads,
		peerLoads:         peerLoads,
		peerLoadErrors:    peerLoadErrors,
		backendLoads:      backendLoads,
		backendLoadErrors: backendLoadErrors,
		cacheHits:         cacheHits,
	}
	reg.MustRegister(collector)
}

func (s *CacheStatsCollector) Collect(ch chan<- prometheus.Metric) {
	for _, cache := range []galaxycache.CacheType{galaxycache.MainCache, galaxycache.HotCache} {
		cacheStats := s.galaxy.CacheStats(cache)
		ch <- prometheus.MustNewConstMetric(s.bytes, prometheus.GaugeValue, float64(cacheStats.Bytes), cache.String())
		ch <- prometheus.MustNewConstMetric(s.evictions, prometheus.GaugeValue, float64(cacheStats.Evictions), cache.String())
		ch <- prometheus.MustNewConstMetric(s.items, prometheus.GaugeValue, float64(cacheStats.Items), cache.String())
	}

	ch <- prometheus.MustNewConstMetric(s.maxBytes, prometheus.GaugeValue, float64(s.conf.MaxSize))
	ch <- prometheus.MustNewConstMetric(s.gets, prometheus.CounterValue, float64(s.galaxy.Stats.Gets.Get()))
	ch <- prometheus.MustNewConstMetric(s.loads, prometheus.CounterValue, float64(s.galaxy.Stats.Loads.Get()))
	ch <- prometheus.MustNewConstMetric(s.peerLoads, prometheus.CounterValue, float64(s.galaxy.Stats.PeerLoads.Get()))
	ch <- prometheus.MustNewConstMetric(s.peerLoadErrors, prometheus.CounterValue, float64(s.galaxy.Stats.PeerLoadErrors.Get()))
	ch <- prometheus.MustNewConstMetric(s.backendLoads, prometheus.CounterValue, float64(s.galaxy.Stats.BackendLoads.Get()))
	ch <- prometheus.MustNewConstMetric(s.backendLoadErrors, prometheus.CounterValue, float64(s.galaxy.Stats.BackendLoadErrors.Get()))
	ch <- prometheus.MustNewConstMetric(s.cacheHits, prometheus.CounterValue, float64(s.galaxy.Stats.MaincacheHits.Get()), galaxycache.MainCache.String())
	ch <- prometheus.MustNewConstMetric(s.cacheHits, prometheus.CounterValue, float64(s.galaxy.Stats.HotcacheHits.Get()), galaxycache.HotCache.String())
}

func (s *CacheStatsCollector) Describe(ch chan<- *prometheus.Desc) {
	prometheus.DescribeByCollect(s, ch)
}