github.com/thanos-io/thanos@v0.32.5/pkg/receive/hashring.go (about)

     1  // Copyright (c) The Thanos Authors.
     2  // Licensed under the Apache License 2.0.
     3  
     4  package receive
     5  
     6  import (
     7  	"fmt"
     8  	"math"
     9  	"sort"
    10  	"strconv"
    11  	"sync"
    12  
    13  	"github.com/cespare/xxhash"
    14  
    15  	"github.com/go-kit/log"
    16  	"github.com/go-kit/log/level"
    17  
    18  	"github.com/pkg/errors"
    19  
    20  	"github.com/thanos-io/thanos/pkg/store/labelpb"
    21  
    22  	"github.com/thanos-io/thanos/pkg/store/storepb/prompb"
    23  )
    24  
// HashringAlgorithm is the algorithm used to distribute series in the ring.
// Its values are the strings accepted by newHashring when selecting a
// hashring implementation.
type HashringAlgorithm string

const (
	// AlgorithmHashmod and AlgorithmKetama are the algorithm names that
	// newHashring recognizes: hashmod builds a simpleHashring and ketama
	// builds a ketamaHashring. Any other value falls back to hashmod.
	AlgorithmHashmod HashringAlgorithm = "hashmod"
	AlgorithmKetama  HashringAlgorithm = "ketama"

	// SectionsPerNode is the number of sections in the ring assigned to each node
	// in the ketama hashring. A higher number yields a better series distribution,
	// but also comes with a higher memory cost.
	SectionsPerNode = 1000
)
    37  
    38  // insufficientNodesError is returned when a hashring does not
    39  // have enough nodes to satisfy a request for a node.
    40  type insufficientNodesError struct {
    41  	have uint64
    42  	want uint64
    43  }
    44  
    45  // Error implements the error interface.
    46  func (i *insufficientNodesError) Error() string {
    47  	return fmt.Sprintf("insufficient nodes; have %d, want %d", i.have, i.want)
    48  }
    49  
// Hashring finds the correct node to handle a given time series
// for a specified tenant.
// It returns the node and any error encountered.
type Hashring interface {
	// Get returns the first node that should handle the given tenant and time series.
	// The implementations in this file define it as GetN with n set to 0.
	Get(tenant string, timeSeries *prompb.TimeSeries) (string, error)
	// GetN returns the nth node that should handle the given tenant and time series.
	// n is zero-based. The implementations in this file return an error
	// when n is not smaller than the number of nodes they hold.
	GetN(tenant string, timeSeries *prompb.TimeSeries, n uint64) (string, error)
}
    59  
    60  // SingleNodeHashring always returns the same node.
    61  type SingleNodeHashring string
    62  
    63  // Get implements the Hashring interface.
    64  func (s SingleNodeHashring) Get(tenant string, ts *prompb.TimeSeries) (string, error) {
    65  	return s.GetN(tenant, ts, 0)
    66  }
    67  
    68  // GetN implements the Hashring interface.
    69  func (s SingleNodeHashring) GetN(_ string, _ *prompb.TimeSeries, n uint64) (string, error) {
    70  	if n > 0 {
    71  		return "", &insufficientNodesError{have: 1, want: n + 1}
    72  	}
    73  	return string(s), nil
    74  }
    75  
    76  // simpleHashring represents a group of nodes handling write requests by hashmoding individual series.
    77  type simpleHashring []string
    78  
    79  func newSimpleHashring(endpoints []Endpoint) (Hashring, error) {
    80  	addresses := make([]string, len(endpoints))
    81  	for i := range endpoints {
    82  		if endpoints[i].AZ != "" {
    83  			return nil, errors.New("Hashmod algorithm does not support AZ aware hashring configuration. Either use Ketama or remove AZ configuration.")
    84  		}
    85  		addresses[i] = endpoints[i].Address
    86  	}
    87  	return simpleHashring(addresses), nil
    88  }
    89  
    90  // Get returns a target to handle the given tenant and time series.
    91  func (s simpleHashring) Get(tenant string, ts *prompb.TimeSeries) (string, error) {
    92  	return s.GetN(tenant, ts, 0)
    93  }
    94  
    95  // GetN returns the nth target to handle the given tenant and time series.
    96  func (s simpleHashring) GetN(tenant string, ts *prompb.TimeSeries, n uint64) (string, error) {
    97  	if n >= uint64(len(s)) {
    98  		return "", &insufficientNodesError{have: uint64(len(s)), want: n + 1}
    99  	}
   100  
   101  	return s[(labelpb.HashWithPrefix(tenant, ts.Labels)+n)%uint64(len(s))], nil
   102  }
   103  
   104  type section struct {
   105  	az            string
   106  	endpointIndex uint64
   107  	hash          uint64
   108  	replicas      []uint64
   109  }
   110  
   111  type sections []*section
   112  
   113  func (p sections) Len() int           { return len(p) }
   114  func (p sections) Less(i, j int) bool { return p[i].hash < p[j].hash }
   115  func (p sections) Swap(i, j int)      { p[i], p[j] = p[j], p[i] }
   116  func (p sections) Sort()              { sort.Sort(p) }
   117  
   118  // ketamaHashring represents a group of nodes handling write requests with consistent hashing.
   119  type ketamaHashring struct {
   120  	endpoints    []Endpoint
   121  	sections     sections
   122  	numEndpoints uint64
   123  }
   124  
   125  func newKetamaHashring(endpoints []Endpoint, sectionsPerNode int, replicationFactor uint64) (*ketamaHashring, error) {
   126  	numSections := len(endpoints) * sectionsPerNode
   127  
   128  	if len(endpoints) < int(replicationFactor) {
   129  		return nil, errors.New("ketama: amount of endpoints needs to be larger than replication factor")
   130  
   131  	}
   132  	hash := xxhash.New()
   133  	availabilityZones := make(map[string]struct{})
   134  	ringSections := make(sections, 0, numSections)
   135  	for endpointIndex, endpoint := range endpoints {
   136  		availabilityZones[endpoint.AZ] = struct{}{}
   137  		for i := 1; i <= sectionsPerNode; i++ {
   138  			_, _ = hash.Write([]byte(endpoint.Address + ":" + strconv.Itoa(i)))
   139  			n := &section{
   140  				az:            endpoint.AZ,
   141  				endpointIndex: uint64(endpointIndex),
   142  				hash:          hash.Sum64(),
   143  				replicas:      make([]uint64, 0, replicationFactor),
   144  			}
   145  
   146  			ringSections = append(ringSections, n)
   147  			hash.Reset()
   148  		}
   149  	}
   150  	sort.Sort(ringSections)
   151  	calculateSectionReplicas(ringSections, replicationFactor, availabilityZones)
   152  
   153  	return &ketamaHashring{
   154  		endpoints:    endpoints,
   155  		sections:     ringSections,
   156  		numEndpoints: uint64(len(endpoints)),
   157  	}, nil
   158  }
   159  
   160  func sizeOfLeastOccupiedAZ(azSpread map[string]int64) int64 {
   161  	minValue := int64(math.MaxInt64)
   162  	for _, value := range azSpread {
   163  		if value < minValue {
   164  			minValue = value
   165  		}
   166  	}
   167  	return minValue
   168  }
   169  
   170  // calculateSectionReplicas pre-calculates replicas for each section,
   171  // ensuring that replicas for each ring section are owned by different endpoints.
   172  func calculateSectionReplicas(ringSections sections, replicationFactor uint64, availabilityZones map[string]struct{}) {
   173  	for i, s := range ringSections {
   174  		replicas := make(map[uint64]struct{})
   175  		azSpread := make(map[string]int64)
   176  		for az := range availabilityZones {
   177  			// This is to make sure each az is initially represented
   178  			azSpread[az] = 0
   179  		}
   180  		j := i - 1
   181  		for uint64(len(replicas)) < replicationFactor {
   182  			j = (j + 1) % len(ringSections)
   183  			rep := ringSections[j]
   184  			if _, ok := replicas[rep.endpointIndex]; ok {
   185  				continue
   186  			}
   187  			if len(azSpread) > 1 && azSpread[rep.az] > 0 && azSpread[rep.az] > sizeOfLeastOccupiedAZ(azSpread) {
   188  				// We want to ensure even AZ spread before we add more replicas within the same AZ
   189  				continue
   190  			}
   191  			replicas[rep.endpointIndex] = struct{}{}
   192  			azSpread[rep.az]++
   193  			s.replicas = append(s.replicas, rep.endpointIndex)
   194  		}
   195  	}
   196  }
   197  
   198  func (c ketamaHashring) Get(tenant string, ts *prompb.TimeSeries) (string, error) {
   199  	return c.GetN(tenant, ts, 0)
   200  }
   201  
   202  func (c ketamaHashring) GetN(tenant string, ts *prompb.TimeSeries, n uint64) (string, error) {
   203  	if n >= c.numEndpoints {
   204  		return "", &insufficientNodesError{have: c.numEndpoints, want: n + 1}
   205  	}
   206  
   207  	v := labelpb.HashWithPrefix(tenant, ts.Labels)
   208  
   209  	var i uint64
   210  	i = uint64(sort.Search(len(c.sections), func(i int) bool {
   211  		return c.sections[i].hash >= v
   212  	}))
   213  
   214  	numSections := uint64(len(c.sections))
   215  	if i == numSections {
   216  		i = 0
   217  	}
   218  
   219  	endpointIndex := c.sections[i].replicas[n]
   220  	return c.endpoints[endpointIndex].Address, nil
   221  }
   222  
   223  // multiHashring represents a set of hashrings.
   224  // Which hashring to use for a tenant is determined
   225  // by the tenants field of the hashring configuration.
   226  type multiHashring struct {
   227  	cache      map[string]Hashring
   228  	hashrings  []Hashring
   229  	tenantSets []map[string]struct{}
   230  
   231  	// We need a mutex to guard concurrent access
   232  	// to the cache map, as this is both written to
   233  	// and read from.
   234  	mu sync.RWMutex
   235  }
   236  
   237  // Get returns a target to handle the given tenant and time series.
   238  func (m *multiHashring) Get(tenant string, ts *prompb.TimeSeries) (string, error) {
   239  	return m.GetN(tenant, ts, 0)
   240  }
   241  
   242  // GetN returns the nth target to handle the given tenant and time series.
   243  func (m *multiHashring) GetN(tenant string, ts *prompb.TimeSeries, n uint64) (string, error) {
   244  	m.mu.RLock()
   245  	h, ok := m.cache[tenant]
   246  	m.mu.RUnlock()
   247  	if ok {
   248  		return h.GetN(tenant, ts, n)
   249  	}
   250  	var found bool
   251  	// If the tenant is not in the cache, then we need to check
   252  	// every tenant in the configuration.
   253  	for i, t := range m.tenantSets {
   254  		// If the hashring has no tenants, then it is
   255  		// considered a default hashring and matches everything.
   256  		if t == nil {
   257  			found = true
   258  		} else if _, ok := t[tenant]; ok {
   259  			found = true
   260  		}
   261  		if found {
   262  			m.mu.Lock()
   263  			m.cache[tenant] = m.hashrings[i]
   264  			m.mu.Unlock()
   265  
   266  			return m.hashrings[i].GetN(tenant, ts, n)
   267  		}
   268  	}
   269  	return "", errors.New("no matching hashring to handle tenant")
   270  }
   271  
   272  // newMultiHashring creates a multi-tenant hashring for a given slice of
   273  // groups.
   274  // Which hashring to use for a tenant is determined
   275  // by the tenants field of the hashring configuration.
   276  func NewMultiHashring(algorithm HashringAlgorithm, replicationFactor uint64, cfg []HashringConfig) (Hashring, error) {
   277  	m := &multiHashring{
   278  		cache: make(map[string]Hashring),
   279  	}
   280  
   281  	for _, h := range cfg {
   282  		var hashring Hashring
   283  		var err error
   284  		activeAlgorithm := algorithm
   285  		if h.Algorithm != "" {
   286  			activeAlgorithm = h.Algorithm
   287  		}
   288  		hashring, err = newHashring(activeAlgorithm, h.Endpoints, replicationFactor, h.Hashring, h.Tenants)
   289  		if err != nil {
   290  			return nil, err
   291  		}
   292  		m.hashrings = append(m.hashrings, hashring)
   293  		var t map[string]struct{}
   294  		if len(h.Tenants) != 0 {
   295  			t = make(map[string]struct{})
   296  		}
   297  		for _, tenant := range h.Tenants {
   298  			t[tenant] = struct{}{}
   299  		}
   300  		m.tenantSets = append(m.tenantSets, t)
   301  	}
   302  	return m, nil
   303  }
   304  
   305  func newHashring(algorithm HashringAlgorithm, endpoints []Endpoint, replicationFactor uint64, hashring string, tenants []string) (Hashring, error) {
   306  	switch algorithm {
   307  	case AlgorithmHashmod:
   308  		return newSimpleHashring(endpoints)
   309  	case AlgorithmKetama:
   310  		return newKetamaHashring(endpoints, SectionsPerNode, replicationFactor)
   311  	default:
   312  		l := log.NewNopLogger()
   313  		level.Warn(l).Log("msg", "Unrecognizable hashring algorithm. Fall back to hashmod algorithm.",
   314  			"hashring", hashring,
   315  			"tenants", tenants)
   316  		return newSimpleHashring(endpoints)
   317  	}
   318  }