github.com/thanos-io/thanos@v0.32.5/pkg/query/internal/test-storeset-pre-v0.8.0/storeset.go (about)

     1  // Copyright (c) The Thanos Authors.
     2  // Licensed under the Apache License 2.0.
     3  
     4  /*
This package is for compatibility testing purposes. It contains code from the v0.7.0 Querier.
     6  */
     7  
     8  package testoldstoreset
     9  
    10  import (
    11  	"context"
    12  	"fmt"
    13  	"sort"
    14  	"strings"
    15  	"sync"
    16  	"time"
    17  
    18  	"github.com/go-kit/log"
    19  	"github.com/go-kit/log/level"
    20  	"github.com/pkg/errors"
    21  	"github.com/prometheus/client_golang/prometheus"
    22  	"github.com/prometheus/client_golang/prometheus/promauto"
    23  	"github.com/prometheus/prometheus/model/labels"
    24  	"google.golang.org/grpc"
    25  
    26  	"github.com/thanos-io/thanos/pkg/component"
    27  	"github.com/thanos-io/thanos/pkg/info/infopb"
    28  	"github.com/thanos-io/thanos/pkg/runutil"
    29  	"github.com/thanos-io/thanos/pkg/store"
    30  	"github.com/thanos-io/thanos/pkg/store/labelpb"
    31  	"github.com/thanos-io/thanos/pkg/store/storepb"
    32  )
    33  
const (
	// unhealthyStoreMessage is logged, and recorded as the store's last error,
	// when Update removes a store that is not in the current healthy set.
	unhealthyStoreMessage = "removing store because it's unhealthy or does not exist"
	// droppingStoreMessage is logged, and recorded as the store's last error,
	// when Update refuses a new store because its external labels are also
	// advertised by another healthy store.
	droppingStoreMessage  = "dropping store, external labels are not unique"
)
    38  
// StoreSpec describes one store endpoint that the StoreSet should track.
type StoreSpec interface {
	// Addr returns the StoreAPI address for the store spec. It is used as the ID of the store.
	Addr() string
	// Metadata returns the current label sets and the min and max time range of the store.
	// The values can change on every call of this method.
	// If the metadata call fails we assume that the store is no longer accessible and we should not use it.
	// NOTE: It is the implementation's responsibility to retry until the context times out, but the caller's
	// responsibility to manage the given store connection.
	Metadata(ctx context.Context, client storepb.StoreClient) (labelSets []labels.Labels, mint int64, maxt int64, err error)
}
    49  
// StoreStatus is the last recorded state of one store, as returned by
// StoreSet.GetStoreStatus.
type StoreStatus struct {
	// Name is the address of the store.
	Name      string
	// LastCheck is the time the status was last updated.
	LastCheck time.Time
	// LastError is the error recorded by the last update; nil when it succeeded.
	LastError error
	// LabelSets, StoreType, MinTime and MaxTime are copied from the store on
	// every successful update; a failed update leaves the previous values.
	LabelSets []labels.Labels
	StoreType component.StoreAPI
	MinTime   int64
	MaxTime   int64
}
    59  
// grpcStoreSpec is the StoreSpec implementation returned by NewGRPCStoreSpec.
type grpcStoreSpec struct {
	// addr is the store address, returned unchanged by Addr.
	addr string
}
    63  
    64  // NewGRPCStoreSpec creates store pure gRPC spec.
    65  // It uses Info gRPC call to get Metadata.
    66  func NewGRPCStoreSpec(addr string) StoreSpec {
    67  	return &grpcStoreSpec{addr: addr}
    68  }
    69  
// Addr returns the address the spec was created with.
func (s *grpcStoreSpec) Addr() string {
	// API addr should not change between state changes.
	return s.addr
}
    74  
    75  // Metadata method for gRPC store API tries to reach host Info method until context timeout. If we are unable to get metadata after
    76  // that time, we assume that the host is unhealthy and return error.
    77  func (s *grpcStoreSpec) Metadata(ctx context.Context, client storepb.StoreClient) (labelSets []labels.Labels, mint, maxt int64, err error) {
    78  	resp, err := client.Info(ctx, &storepb.InfoRequest{}, grpc.WaitForReady(true))
    79  	if err != nil {
    80  		return nil, 0, 0, errors.Wrapf(err, "fetching store info from %s", s.addr)
    81  	}
    82  	if len(resp.LabelSets) == 0 && len(resp.Labels) > 0 {
    83  		resp.LabelSets = []labelpb.ZLabelSet{{Labels: resp.Labels}}
    84  	}
    85  
    86  	return labelpb.ZLabelSetsToPromLabelSets(resp.LabelSets...), resp.MinTime, resp.MaxTime, nil
    87  }
    88  
// StoreSet maintains a set of active stores. It is backed by store specifications that are dynamically fetched on
// every Update() call.
type StoreSet struct {
	logger log.Logger

	// Store specifications can change dynamically. If a store is missing from the list, we assume it is no longer
	// accessible and we close the gRPC client for it.
	storeSpecs          func() []StoreSpec
	// dialOpts are passed to grpc.DialContext when connecting to a store that is not yet in the set.
	dialOpts            []grpc.DialOption
	// gRPCInfoCallTimeout is the timeout of the per-store context used for dialing and metadata calls.
	gRPCInfoCallTimeout time.Duration

	// mtx is held by Update while it modifies stores and externalLabelOccurrencesInStores, and by Get and
	// externalLabelOccurrences while they read them.
	mtx                              sync.RWMutex
	// storesStatusesMtx guards storeStatuses.
	storesStatusesMtx                sync.RWMutex
	// stores holds the active stores keyed by address.
	stores                           map[string]*storeRef
	// storeNodeConnections is set to len(stores) at the end of every Update.
	storeNodeConnections             prometheus.Gauge
	// externalLabelOccurrencesInStores counts stores per external label string, as computed by the last Update.
	externalLabelOccurrencesInStores map[string]int
	// storeStatuses holds the last recorded status per store address, including stores that are not active.
	storeStatuses                    map[string]*StoreStatus
	// unhealthyStoreTimeout is how long the status of an inactive store is kept after its last check.
	unhealthyStoreTimeout            time.Duration
}
   108  
// storeSetNodeCollector is a Prometheus collector that reports one
// nodeInfoDesc gauge per external label string.
type storeSetNodeCollector struct {
	// externalLabelOccurrences is called on every Collect to obtain the
	// current counts keyed by external label string.
	externalLabelOccurrences func() map[string]int
}
   112  
// nodeInfoDesc describes the thanos_store_node_info metric emitted by
// storeSetNodeCollector; it has a single variable label, external_labels.
var nodeInfoDesc = prometheus.NewDesc(
	"thanos_store_node_info",
	"Number of nodes with the same external labels identified by their hash. If any time-series is larger than 1, external label uniqueness is not true",
	[]string{"external_labels"}, nil,
)
   118  
// Describe sends the single descriptor this collector emits, nodeInfoDesc.
func (c *storeSetNodeCollector) Describe(ch chan<- *prometheus.Desc) {
	ch <- nodeInfoDesc
}
   122  
   123  func (c *storeSetNodeCollector) Collect(ch chan<- prometheus.Metric) {
   124  	externalLabelOccurrences := c.externalLabelOccurrences()
   125  	for externalLabels, occurrences := range externalLabelOccurrences {
   126  		ch <- prometheus.MustNewConstMetric(nodeInfoDesc, prometheus.GaugeValue, float64(occurrences), externalLabels)
   127  	}
   128  }
   129  
   130  // NewStoreSet returns a new set of stores from cluster peers and statically configured ones.
   131  func NewStoreSet(
   132  	logger log.Logger,
   133  	reg prometheus.Registerer,
   134  	storeSpecs func() []StoreSpec,
   135  	dialOpts []grpc.DialOption,
   136  	unhealthyStoreTimeout time.Duration,
   137  ) *StoreSet {
   138  	storeNodeConnections := promauto.With(reg).NewGauge(prometheus.GaugeOpts{
   139  		Name: "thanos_store_nodes_grpc_connections",
   140  		Help: "Number indicating current number of gRPC connection to store nodes. This indicates also to how many stores query node have access to.",
   141  	})
   142  
   143  	if logger == nil {
   144  		logger = log.NewNopLogger()
   145  	}
   146  	if storeSpecs == nil {
   147  		storeSpecs = func() []StoreSpec { return nil }
   148  	}
   149  
   150  	ss := &StoreSet{
   151  		logger:                           log.With(logger, "component", "storeset"),
   152  		storeSpecs:                       storeSpecs,
   153  		dialOpts:                         dialOpts,
   154  		storeNodeConnections:             storeNodeConnections,
   155  		gRPCInfoCallTimeout:              10 * time.Second,
   156  		externalLabelOccurrencesInStores: map[string]int{},
   157  		stores:                           make(map[string]*storeRef),
   158  		storeStatuses:                    make(map[string]*StoreStatus),
   159  		unhealthyStoreTimeout:            unhealthyStoreTimeout,
   160  	}
   161  
   162  	storeNodeCollector := &storeSetNodeCollector{externalLabelOccurrences: ss.externalLabelOccurrences}
   163  	if reg != nil {
   164  		reg.MustRegister(storeNodeCollector)
   165  	}
   166  
   167  	return ss
   168  }
   169  
// storeRef is the StoreSet's handle on one store: its gRPC client and
// connection plus the most recently fetched metadata.
type storeRef struct {
	storepb.StoreClient

	// mtx is taken by Update, LabelSets and TimeRange around the metadata fields.
	mtx  sync.RWMutex
	// cc is the underlying connection; close() closes it.
	cc   *grpc.ClientConn
	// addr is the store address, used as the key in the StoreSet maps.
	addr string

	// Meta (can change during runtime).
	labelSets []labels.Labels
	// storeType is taken from the Info response when the store is first created;
	// Update does not change it.
	storeType component.StoreAPI
	minTime   int64
	maxTime   int64

	logger log.Logger
}
   185  
   186  func (s *storeRef) Update(labelSets []labels.Labels, minTime, maxTime int64) {
   187  	s.mtx.Lock()
   188  	defer s.mtx.Unlock()
   189  
   190  	s.labelSets = labelSets
   191  	s.minTime = minTime
   192  	s.maxTime = maxTime
   193  }
   194  
   195  func (s *storeRef) LabelSets() []labels.Labels {
   196  	s.mtx.RLock()
   197  	defer s.mtx.RUnlock()
   198  	return s.labelSets
   199  }
   200  
// TSDBInfos always returns nil for this store reference.
func (s *storeRef) TSDBInfos() []infopb.TSDBInfo { return nil }
   202  
   203  func (s *storeRef) TimeRange() (int64, int64) {
   204  	s.mtx.RLock()
   205  	defer s.mtx.RUnlock()
   206  
   207  	return s.minTime, s.maxTime
   208  }
   209  
// SupportsSharding always reports false for this store reference.
func (s *storeRef) SupportsSharding() bool {
	return false
}
   213  
// SupportsWithoutReplicaLabels always reports false for this store reference.
func (s *storeRef) SupportsWithoutReplicaLabels() bool {
	return false
}
   217  
   218  func (s *storeRef) String() string {
   219  	mint, maxt := s.TimeRange()
   220  	return fmt.Sprintf(
   221  		"Addr: %s LabelSets: %v MinTime: %d MaxTime: %d",
   222  		s.addr, labelpb.PromLabelSetsToString(s.LabelSets()), mint, maxt,
   223  	)
   224  }
   225  
// Addr returns the store address. The second result is always false here.
// NOTE(review): presumably the bool tells callers whether this is a local
// (in-process) client — confirm against the store.Client interface.
func (s *storeRef) Addr() (string, bool) {
	return s.addr, false
}
   229  
   230  func (s *storeRef) close() {
   231  	runutil.CloseWithLogOnErr(s.logger, s.cc, fmt.Sprintf("store %v connection close", s.addr))
   232  }
   233  
   234  // Update updates the store set. It fetches current list of store specs from function and updates the fresh metadata
   235  // from all stores.
   236  func (s *StoreSet) Update(ctx context.Context) {
   237  	healthyStores := s.getHealthyStores(ctx)
   238  
   239  	// Record the number of occurrences of external label combinations for current store slice.
   240  	externalLabelOccurrencesInStores := map[string]int{}
   241  	for _, st := range healthyStores {
   242  		externalLabelOccurrencesInStores[externalLabelsFromStore(st)]++
   243  	}
   244  	level.Debug(s.logger).Log("msg", "updating healthy stores", "externalLabelOccurrencesInStores", fmt.Sprintf("%#+v", externalLabelOccurrencesInStores))
   245  
   246  	s.mtx.Lock()
   247  	defer s.mtx.Unlock()
   248  
   249  	// Close stores that where not healthy this time (are not in healthy stores map).
   250  	for addr, store := range s.stores {
   251  		if _, ok := healthyStores[addr]; ok {
   252  			continue
   253  		}
   254  
   255  		// Peer does not exists anymore.
   256  		store.close()
   257  		delete(s.stores, addr)
   258  		s.updateStoreStatus(store, errors.New(unhealthyStoreMessage))
   259  		level.Info(s.logger).Log("msg", unhealthyStoreMessage, "address", addr)
   260  	}
   261  
   262  	// Add stores that are not yet in s.stores.
   263  	for addr, store := range healthyStores {
   264  		if _, ok := s.stores[addr]; ok {
   265  			s.updateStoreStatus(store, nil)
   266  			continue
   267  		}
   268  
   269  		externalLabels := externalLabelsFromStore(store)
   270  		if len(store.LabelSets()) > 0 &&
   271  			externalLabelOccurrencesInStores[externalLabels] != 1 {
   272  			store.close()
   273  			s.updateStoreStatus(store, errors.New(droppingStoreMessage))
   274  			level.Warn(s.logger).Log("msg", droppingStoreMessage, "address", addr, "extLset", externalLabels, "duplicates", externalLabelOccurrencesInStores[externalLabels])
   275  			// We don't want to block all of them. Leave one to not disrupt in terms of migration.
   276  			externalLabelOccurrencesInStores[externalLabels]--
   277  			continue
   278  		}
   279  
   280  		s.stores[addr] = store
   281  		s.updateStoreStatus(store, nil)
   282  		level.Info(s.logger).Log("msg", "adding new store to query storeset", "address", addr)
   283  	}
   284  
   285  	s.externalLabelOccurrencesInStores = externalLabelOccurrencesInStores
   286  	s.storeNodeConnections.Set(float64(len(s.stores)))
   287  	s.cleanUpStoreStatuses()
   288  }
   289  
   290  func (s *StoreSet) getHealthyStores(ctx context.Context) map[string]*storeRef {
   291  	var (
   292  		unique = make(map[string]struct{})
   293  
   294  		healthyStores = make(map[string]*storeRef, len(s.stores))
   295  		mtx           sync.Mutex
   296  		wg            sync.WaitGroup
   297  	)
   298  
   299  	// Gather healthy stores map concurrently. Build new store if does not exist already.
   300  	for _, storeSpec := range s.storeSpecs() {
   301  		if _, ok := unique[storeSpec.Addr()]; ok {
   302  			level.Warn(s.logger).Log("msg", "duplicated address in store nodes", "address", storeSpec.Addr())
   303  			continue
   304  		}
   305  		unique[storeSpec.Addr()] = struct{}{}
   306  
   307  		wg.Add(1)
   308  		go func(spec StoreSpec) {
   309  			defer wg.Done()
   310  
   311  			addr := spec.Addr()
   312  
   313  			ctx, cancel := context.WithTimeout(ctx, s.gRPCInfoCallTimeout)
   314  			defer cancel()
   315  
   316  			store, ok := s.stores[addr]
   317  			if ok {
   318  				// Check existing store. Is it healthy? What are current metadata?
   319  				labelSets, minTime, maxTime, err := spec.Metadata(ctx, store.StoreClient)
   320  				if err != nil {
   321  					// Peer unhealthy. Do not include in healthy stores.
   322  					s.updateStoreStatus(store, err)
   323  					level.Warn(s.logger).Log("msg", "update of store node failed", "err", err, "address", addr)
   324  					return
   325  				}
   326  				store.Update(labelSets, minTime, maxTime)
   327  			} else {
   328  				// New store or was unhealthy and was removed in the past - create new one.
   329  				conn, err := grpc.DialContext(ctx, addr, s.dialOpts...)
   330  				if err != nil {
   331  					s.updateStoreStatus(&storeRef{addr: addr}, err)
   332  					level.Warn(s.logger).Log("msg", "update of store node failed", "err", errors.Wrap(err, "dialing connection"), "address", addr)
   333  					return
   334  				}
   335  				store = &storeRef{StoreClient: storepb.NewStoreClient(conn), cc: conn, addr: addr, logger: s.logger}
   336  
   337  				// Initial info call for all types of stores to check gRPC StoreAPI.
   338  				resp, err := store.StoreClient.Info(ctx, &storepb.InfoRequest{}, grpc.WaitForReady(true))
   339  				if err != nil {
   340  					store.close()
   341  					s.updateStoreStatus(store, err)
   342  					level.Warn(s.logger).Log("msg", "update of store node failed", "err", errors.Wrap(err, "initial store client info fetch"), "address", addr)
   343  					return
   344  				}
   345  				if len(resp.LabelSets) == 0 && len(resp.Labels) > 0 {
   346  					resp.LabelSets = []labelpb.ZLabelSet{{Labels: resp.Labels}}
   347  				}
   348  				store.storeType = component.FromProto(resp.StoreType)
   349  				store.Update(labelpb.ZLabelSetsToPromLabelSets(resp.LabelSets...), resp.MinTime, resp.MaxTime)
   350  			}
   351  
   352  			mtx.Lock()
   353  			defer mtx.Unlock()
   354  
   355  			healthyStores[addr] = store
   356  		}(storeSpec)
   357  	}
   358  
   359  	wg.Wait()
   360  
   361  	return healthyStores
   362  }
   363  
   364  func externalLabelsFromStore(store *storeRef) string {
   365  	tsdbLabelSetStrings := make([]string, 0, len(store.labelSets))
   366  	for _, ls := range store.labelSets {
   367  		sort.Sort(ls)
   368  		tsdbLabelSetStrings = append(tsdbLabelSetStrings, ls.String())
   369  	}
   370  	sort.Strings(tsdbLabelSetStrings)
   371  	return strings.Join(tsdbLabelSetStrings, ",")
   372  }
   373  
   374  func (s *StoreSet) updateStoreStatus(store *storeRef, err error) {
   375  	s.storesStatusesMtx.Lock()
   376  	defer s.storesStatusesMtx.Unlock()
   377  
   378  	status := StoreStatus{Name: store.addr}
   379  	prev, ok := s.storeStatuses[store.addr]
   380  	if ok {
   381  		status = *prev
   382  	}
   383  
   384  	status.LastError = err
   385  	status.LastCheck = time.Now()
   386  
   387  	if err == nil {
   388  		status.LabelSets = store.labelSets
   389  		status.StoreType = store.storeType
   390  		status.MinTime = store.minTime
   391  		status.MaxTime = store.maxTime
   392  	}
   393  
   394  	s.storeStatuses[store.addr] = &status
   395  }
   396  
   397  func (s *StoreSet) GetStoreStatus() []StoreStatus {
   398  	s.storesStatusesMtx.RLock()
   399  	defer s.storesStatusesMtx.RUnlock()
   400  
   401  	statuses := make([]StoreStatus, 0, len(s.storeStatuses))
   402  	for _, v := range s.storeStatuses {
   403  		statuses = append(statuses, *v)
   404  	}
   405  
   406  	sort.Slice(statuses, func(i, j int) bool {
   407  		return statuses[i].Name < statuses[j].Name
   408  	})
   409  	return statuses
   410  }
   411  
   412  func (s *StoreSet) externalLabelOccurrences() map[string]int {
   413  	s.mtx.RLock()
   414  	defer s.mtx.RUnlock()
   415  
   416  	r := make(map[string]int, len(s.externalLabelOccurrencesInStores))
   417  	for k, v := range s.externalLabelOccurrencesInStores {
   418  		r[k] = v
   419  	}
   420  
   421  	return r
   422  }
   423  
   424  // Get returns a list of all active stores.
   425  func (s *StoreSet) Get() []store.Client {
   426  	s.mtx.RLock()
   427  	defer s.mtx.RUnlock()
   428  
   429  	stores := make([]store.Client, 0, len(s.stores))
   430  	for _, st := range s.stores {
   431  		stores = append(stores, st)
   432  	}
   433  	return stores
   434  }
   435  
   436  func (s *StoreSet) Close() {
   437  	for _, st := range s.stores {
   438  		st.close()
   439  	}
   440  }
   441  
   442  func (s *StoreSet) cleanUpStoreStatuses() {
   443  	s.storesStatusesMtx.Lock()
   444  	defer s.storesStatusesMtx.Unlock()
   445  
   446  	now := time.Now()
   447  	for addr, status := range s.storeStatuses {
   448  		if _, ok := s.stores[addr]; !ok {
   449  			if now.Sub(status.LastCheck) >= s.unhealthyStoreTimeout {
   450  				delete(s.storeStatuses, addr)
   451  			}
   452  		}
   453  	}
   454  }