github.com/thanos-io/thanos@v0.32.5/pkg/query/internal/test-storeset-pre-v0.8.0/storeset.go (about) 1 // Copyright (c) The Thanos Authors. 2 // Licensed under the Apache License 2.0. 3 4 /* 5 This package is for compatibility testing purposes. It is a code from v0.7.0 Querier. 6 */ 7 8 package testoldstoreset 9 10 import ( 11 "context" 12 "fmt" 13 "sort" 14 "strings" 15 "sync" 16 "time" 17 18 "github.com/go-kit/log" 19 "github.com/go-kit/log/level" 20 "github.com/pkg/errors" 21 "github.com/prometheus/client_golang/prometheus" 22 "github.com/prometheus/client_golang/prometheus/promauto" 23 "github.com/prometheus/prometheus/model/labels" 24 "google.golang.org/grpc" 25 26 "github.com/thanos-io/thanos/pkg/component" 27 "github.com/thanos-io/thanos/pkg/info/infopb" 28 "github.com/thanos-io/thanos/pkg/runutil" 29 "github.com/thanos-io/thanos/pkg/store" 30 "github.com/thanos-io/thanos/pkg/store/labelpb" 31 "github.com/thanos-io/thanos/pkg/store/storepb" 32 ) 33 34 const ( 35 unhealthyStoreMessage = "removing store because it's unhealthy or does not exist" 36 droppingStoreMessage = "dropping store, external labels are not unique" 37 ) 38 39 type StoreSpec interface { 40 // Addr returns StoreAPI Address for the store spec. It is used as ID for store. 41 Addr() string 42 // Metadata returns current labels, store type and min, max ranges for store. 43 // It can change for every call for this method. 44 // If metadata call fails we assume that store is no longer accessible and we should not use it. 45 // NOTE: It is implementation responsibility to retry until context timeout, but a caller responsibility to manage 46 // given store connection. 47 Metadata(ctx context.Context, client storepb.StoreClient) (labelSets []labels.Labels, mint int64, maxt int64, err error) 48 } 49 50 type StoreStatus struct { 51 Name string 52 LastCheck time.Time 53 LastError error 54 LabelSets []labels.Labels 55 StoreType component.StoreAPI 56 MinTime int64 57 MaxTime int64 58 } 59 60 type grpcStoreSpec struct { 61 addr string 62 } 63 64 // NewGRPCStoreSpec creates store pure gRPC spec. 65 // It uses Info gRPC call to get Metadata. 66 func NewGRPCStoreSpec(addr string) StoreSpec { 67 return &grpcStoreSpec{addr: addr} 68 } 69 70 func (s *grpcStoreSpec) Addr() string { 71 // API addr should not change between state changes. 72 return s.addr 73 } 74 75 // Metadata method for gRPC store API tries to reach host Info method until context timeout. If we are unable to get metadata after 76 // that time, we assume that the host is unhealthy and return error. 77 func (s *grpcStoreSpec) Metadata(ctx context.Context, client storepb.StoreClient) (labelSets []labels.Labels, mint, maxt int64, err error) { 78 resp, err := client.Info(ctx, &storepb.InfoRequest{}, grpc.WaitForReady(true)) 79 if err != nil { 80 return nil, 0, 0, errors.Wrapf(err, "fetching store info from %s", s.addr) 81 } 82 if len(resp.LabelSets) == 0 && len(resp.Labels) > 0 { 83 resp.LabelSets = []labelpb.ZLabelSet{{Labels: resp.Labels}} 84 } 85 86 return labelpb.ZLabelSetsToPromLabelSets(resp.LabelSets...), resp.MinTime, resp.MaxTime, nil 87 } 88 89 // StoreSet maintains a set of active stores. It is backed up by Store Specifications that are dynamically fetched on 90 // every Update() call. 91 type StoreSet struct { 92 logger log.Logger 93 94 // Store specifications can change dynamically. If some store is missing from the list, we assuming it is no longer 95 // accessible and we close gRPC client for it. 96 storeSpecs func() []StoreSpec 97 dialOpts []grpc.DialOption 98 gRPCInfoCallTimeout time.Duration 99 100 mtx sync.RWMutex 101 storesStatusesMtx sync.RWMutex 102 stores map[string]*storeRef 103 storeNodeConnections prometheus.Gauge 104 externalLabelOccurrencesInStores map[string]int 105 storeStatuses map[string]*StoreStatus 106 unhealthyStoreTimeout time.Duration 107 } 108 109 type storeSetNodeCollector struct { 110 externalLabelOccurrences func() map[string]int 111 } 112 113 var nodeInfoDesc = prometheus.NewDesc( 114 "thanos_store_node_info", 115 "Number of nodes with the same external labels identified by their hash. If any time-series is larger than 1, external label uniqueness is not true", 116 []string{"external_labels"}, nil, 117 ) 118 119 func (c *storeSetNodeCollector) Describe(ch chan<- *prometheus.Desc) { 120 ch <- nodeInfoDesc 121 } 122 123 func (c *storeSetNodeCollector) Collect(ch chan<- prometheus.Metric) { 124 externalLabelOccurrences := c.externalLabelOccurrences() 125 for externalLabels, occurrences := range externalLabelOccurrences { 126 ch <- prometheus.MustNewConstMetric(nodeInfoDesc, prometheus.GaugeValue, float64(occurrences), externalLabels) 127 } 128 } 129 130 // NewStoreSet returns a new set of stores from cluster peers and statically configured ones. 131 func NewStoreSet( 132 logger log.Logger, 133 reg prometheus.Registerer, 134 storeSpecs func() []StoreSpec, 135 dialOpts []grpc.DialOption, 136 unhealthyStoreTimeout time.Duration, 137 ) *StoreSet { 138 storeNodeConnections := promauto.With(reg).NewGauge(prometheus.GaugeOpts{ 139 Name: "thanos_store_nodes_grpc_connections", 140 Help: "Number indicating current number of gRPC connection to store nodes. This indicates also to how many stores query node have access to.", 141 }) 142 143 if logger == nil { 144 logger = log.NewNopLogger() 145 } 146 if storeSpecs == nil { 147 storeSpecs = func() []StoreSpec { return nil } 148 } 149 150 ss := &StoreSet{ 151 logger: log.With(logger, "component", "storeset"), 152 storeSpecs: storeSpecs, 153 dialOpts: dialOpts, 154 storeNodeConnections: storeNodeConnections, 155 gRPCInfoCallTimeout: 10 * time.Second, 156 externalLabelOccurrencesInStores: map[string]int{}, 157 stores: make(map[string]*storeRef), 158 storeStatuses: make(map[string]*StoreStatus), 159 unhealthyStoreTimeout: unhealthyStoreTimeout, 160 } 161 162 storeNodeCollector := &storeSetNodeCollector{externalLabelOccurrences: ss.externalLabelOccurrences} 163 if reg != nil { 164 reg.MustRegister(storeNodeCollector) 165 } 166 167 return ss 168 } 169 170 type storeRef struct { 171 storepb.StoreClient 172 173 mtx sync.RWMutex 174 cc *grpc.ClientConn 175 addr string 176 177 // Meta (can change during runtime). 178 labelSets []labels.Labels 179 storeType component.StoreAPI 180 minTime int64 181 maxTime int64 182 183 logger log.Logger 184 } 185 186 func (s *storeRef) Update(labelSets []labels.Labels, minTime, maxTime int64) { 187 s.mtx.Lock() 188 defer s.mtx.Unlock() 189 190 s.labelSets = labelSets 191 s.minTime = minTime 192 s.maxTime = maxTime 193 } 194 195 func (s *storeRef) LabelSets() []labels.Labels { 196 s.mtx.RLock() 197 defer s.mtx.RUnlock() 198 return s.labelSets 199 } 200 201 func (s *storeRef) TSDBInfos() []infopb.TSDBInfo { return nil } 202 203 func (s *storeRef) TimeRange() (int64, int64) { 204 s.mtx.RLock() 205 defer s.mtx.RUnlock() 206 207 return s.minTime, s.maxTime 208 } 209 210 func (s *storeRef) SupportsSharding() bool { 211 return false 212 } 213 214 func (s *storeRef) SupportsWithoutReplicaLabels() bool { 215 return false 216 } 217 218 func (s *storeRef) String() string { 219 mint, maxt := s.TimeRange() 220 return fmt.Sprintf( 221 "Addr: %s LabelSets: %v MinTime: %d MaxTime: %d", 222 s.addr, labelpb.PromLabelSetsToString(s.LabelSets()), mint, maxt, 223 ) 224 } 225 226 func (s *storeRef) Addr() (string, bool) { 227 return s.addr, false 228 } 229 230 func (s *storeRef) close() { 231 runutil.CloseWithLogOnErr(s.logger, s.cc, fmt.Sprintf("store %v connection close", s.addr)) 232 } 233 234 // Update updates the store set. It fetches current list of store specs from function and updates the fresh metadata 235 // from all stores. 236 func (s *StoreSet) Update(ctx context.Context) { 237 healthyStores := s.getHealthyStores(ctx) 238 239 // Record the number of occurrences of external label combinations for current store slice. 240 externalLabelOccurrencesInStores := map[string]int{} 241 for _, st := range healthyStores { 242 externalLabelOccurrencesInStores[externalLabelsFromStore(st)]++ 243 } 244 level.Debug(s.logger).Log("msg", "updating healthy stores", "externalLabelOccurrencesInStores", fmt.Sprintf("%#+v", externalLabelOccurrencesInStores)) 245 246 s.mtx.Lock() 247 defer s.mtx.Unlock() 248 249 // Close stores that where not healthy this time (are not in healthy stores map). 250 for addr, store := range s.stores { 251 if _, ok := healthyStores[addr]; ok { 252 continue 253 } 254 255 // Peer does not exists anymore. 256 store.close() 257 delete(s.stores, addr) 258 s.updateStoreStatus(store, errors.New(unhealthyStoreMessage)) 259 level.Info(s.logger).Log("msg", unhealthyStoreMessage, "address", addr) 260 } 261 262 // Add stores that are not yet in s.stores. 263 for addr, store := range healthyStores { 264 if _, ok := s.stores[addr]; ok { 265 s.updateStoreStatus(store, nil) 266 continue 267 } 268 269 externalLabels := externalLabelsFromStore(store) 270 if len(store.LabelSets()) > 0 && 271 externalLabelOccurrencesInStores[externalLabels] != 1 { 272 store.close() 273 s.updateStoreStatus(store, errors.New(droppingStoreMessage)) 274 level.Warn(s.logger).Log("msg", droppingStoreMessage, "address", addr, "extLset", externalLabels, "duplicates", externalLabelOccurrencesInStores[externalLabels]) 275 // We don't want to block all of them. Leave one to not disrupt in terms of migration. 276 externalLabelOccurrencesInStores[externalLabels]-- 277 continue 278 } 279 280 s.stores[addr] = store 281 s.updateStoreStatus(store, nil) 282 level.Info(s.logger).Log("msg", "adding new store to query storeset", "address", addr) 283 } 284 285 s.externalLabelOccurrencesInStores = externalLabelOccurrencesInStores 286 s.storeNodeConnections.Set(float64(len(s.stores))) 287 s.cleanUpStoreStatuses() 288 } 289 290 func (s *StoreSet) getHealthyStores(ctx context.Context) map[string]*storeRef { 291 var ( 292 unique = make(map[string]struct{}) 293 294 healthyStores = make(map[string]*storeRef, len(s.stores)) 295 mtx sync.Mutex 296 wg sync.WaitGroup 297 ) 298 299 // Gather healthy stores map concurrently. Build new store if does not exist already. 300 for _, storeSpec := range s.storeSpecs() { 301 if _, ok := unique[storeSpec.Addr()]; ok { 302 level.Warn(s.logger).Log("msg", "duplicated address in store nodes", "address", storeSpec.Addr()) 303 continue 304 } 305 unique[storeSpec.Addr()] = struct{}{} 306 307 wg.Add(1) 308 go func(spec StoreSpec) { 309 defer wg.Done() 310 311 addr := spec.Addr() 312 313 ctx, cancel := context.WithTimeout(ctx, s.gRPCInfoCallTimeout) 314 defer cancel() 315 316 store, ok := s.stores[addr] 317 if ok { 318 // Check existing store. Is it healthy? What are current metadata? 319 labelSets, minTime, maxTime, err := spec.Metadata(ctx, store.StoreClient) 320 if err != nil { 321 // Peer unhealthy. Do not include in healthy stores. 322 s.updateStoreStatus(store, err) 323 level.Warn(s.logger).Log("msg", "update of store node failed", "err", err, "address", addr) 324 return 325 } 326 store.Update(labelSets, minTime, maxTime) 327 } else { 328 // New store or was unhealthy and was removed in the past - create new one. 329 conn, err := grpc.DialContext(ctx, addr, s.dialOpts...) 330 if err != nil { 331 s.updateStoreStatus(&storeRef{addr: addr}, err) 332 level.Warn(s.logger).Log("msg", "update of store node failed", "err", errors.Wrap(err, "dialing connection"), "address", addr) 333 return 334 } 335 store = &storeRef{StoreClient: storepb.NewStoreClient(conn), cc: conn, addr: addr, logger: s.logger} 336 337 // Initial info call for all types of stores to check gRPC StoreAPI. 338 resp, err := store.StoreClient.Info(ctx, &storepb.InfoRequest{}, grpc.WaitForReady(true)) 339 if err != nil { 340 store.close() 341 s.updateStoreStatus(store, err) 342 level.Warn(s.logger).Log("msg", "update of store node failed", "err", errors.Wrap(err, "initial store client info fetch"), "address", addr) 343 return 344 } 345 if len(resp.LabelSets) == 0 && len(resp.Labels) > 0 { 346 resp.LabelSets = []labelpb.ZLabelSet{{Labels: resp.Labels}} 347 } 348 store.storeType = component.FromProto(resp.StoreType) 349 store.Update(labelpb.ZLabelSetsToPromLabelSets(resp.LabelSets...), resp.MinTime, resp.MaxTime) 350 } 351 352 mtx.Lock() 353 defer mtx.Unlock() 354 355 healthyStores[addr] = store 356 }(storeSpec) 357 } 358 359 wg.Wait() 360 361 return healthyStores 362 } 363 364 func externalLabelsFromStore(store *storeRef) string { 365 tsdbLabelSetStrings := make([]string, 0, len(store.labelSets)) 366 for _, ls := range store.labelSets { 367 sort.Sort(ls) 368 tsdbLabelSetStrings = append(tsdbLabelSetStrings, ls.String()) 369 } 370 sort.Strings(tsdbLabelSetStrings) 371 return strings.Join(tsdbLabelSetStrings, ",") 372 } 373 374 func (s *StoreSet) updateStoreStatus(store *storeRef, err error) { 375 s.storesStatusesMtx.Lock() 376 defer s.storesStatusesMtx.Unlock() 377 378 status := StoreStatus{Name: store.addr} 379 prev, ok := s.storeStatuses[store.addr] 380 if ok { 381 status = *prev 382 } 383 384 status.LastError = err 385 status.LastCheck = time.Now() 386 387 if err == nil { 388 status.LabelSets = store.labelSets 389 status.StoreType = store.storeType 390 status.MinTime = store.minTime 391 status.MaxTime = store.maxTime 392 } 393 394 s.storeStatuses[store.addr] = &status 395 } 396 397 func (s *StoreSet) GetStoreStatus() []StoreStatus { 398 s.storesStatusesMtx.RLock() 399 defer s.storesStatusesMtx.RUnlock() 400 401 statuses := make([]StoreStatus, 0, len(s.storeStatuses)) 402 for _, v := range s.storeStatuses { 403 statuses = append(statuses, *v) 404 } 405 406 sort.Slice(statuses, func(i, j int) bool { 407 return statuses[i].Name < statuses[j].Name 408 }) 409 return statuses 410 } 411 412 func (s *StoreSet) externalLabelOccurrences() map[string]int { 413 s.mtx.RLock() 414 defer s.mtx.RUnlock() 415 416 r := make(map[string]int, len(s.externalLabelOccurrencesInStores)) 417 for k, v := range s.externalLabelOccurrencesInStores { 418 r[k] = v 419 } 420 421 return r 422 } 423 424 // Get returns a list of all active stores. 425 func (s *StoreSet) Get() []store.Client { 426 s.mtx.RLock() 427 defer s.mtx.RUnlock() 428 429 stores := make([]store.Client, 0, len(s.stores)) 430 for _, st := range s.stores { 431 stores = append(stores, st) 432 } 433 return stores 434 } 435 436 func (s *StoreSet) Close() { 437 for _, st := range s.stores { 438 st.close() 439 } 440 } 441 442 func (s *StoreSet) cleanUpStoreStatuses() { 443 s.storesStatusesMtx.Lock() 444 defer s.storesStatusesMtx.Unlock() 445 446 now := time.Now() 447 for addr, status := range s.storeStatuses { 448 if _, ok := s.stores[addr]; !ok { 449 if now.Sub(status.LastCheck) >= s.unhealthyStoreTimeout { 450 delete(s.storeStatuses, addr) 451 } 452 } 453 } 454 }