github.com/thanos-io/thanos@v0.32.5/pkg/receive/hashring.go

// Copyright (c) The Thanos Authors.
// Licensed under the Apache License 2.0.

package receive

import (
	"fmt"
	"math"
	"sort"
	"strconv"
	"sync"

	"github.com/cespare/xxhash"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"

	"github.com/pkg/errors"

	"github.com/thanos-io/thanos/pkg/store/labelpb"

	"github.com/thanos-io/thanos/pkg/store/storepb/prompb"
)

// HashringAlgorithm is the algorithm used to distribute series in the ring.
type HashringAlgorithm string

const (
	AlgorithmHashmod HashringAlgorithm = "hashmod"
	AlgorithmKetama  HashringAlgorithm = "ketama"

	// SectionsPerNode is the number of sections in the ring assigned to each node
	// in the ketama hashring. A higher number yields a better series distribution,
	// but also comes with a higher memory cost.
	SectionsPerNode = 1000
)

// insufficientNodesError is returned when a hashring does not
// have enough nodes to satisfy a request for a node.
type insufficientNodesError struct {
	have uint64
	want uint64
}

// Error implements the error interface.
func (i *insufficientNodesError) Error() string {
	return fmt.Sprintf("insufficient nodes; have %d, want %d", i.have, i.want)
}

// Hashring finds the correct node to handle a given time series
// for a specified tenant.
// It returns the node and any error encountered.
type Hashring interface {
	// Get returns the first node that should handle the given tenant and time series.
	Get(tenant string, timeSeries *prompb.TimeSeries) (string, error)
	// GetN returns the nth node that should handle the given tenant and time series.
	GetN(tenant string, timeSeries *prompb.TimeSeries, n uint64) (string, error)
}

// SingleNodeHashring always returns the same node.
type SingleNodeHashring string

// Get implements the Hashring interface.
func (s SingleNodeHashring) Get(tenant string, ts *prompb.TimeSeries) (string, error) {
	return s.GetN(tenant, ts, 0)
}

// GetN implements the Hashring interface.
func (s SingleNodeHashring) GetN(_ string, _ *prompb.TimeSeries, n uint64) (string, error) {
	if n > 0 {
		return "", &insufficientNodesError{have: 1, want: n + 1}
	}
	return string(s), nil
}

// simpleHashring represents a group of nodes handling write requests by hashmoding individual series.
type simpleHashring []string

func newSimpleHashring(endpoints []Endpoint) (Hashring, error) {
	addresses := make([]string, len(endpoints))
	for i := range endpoints {
		if endpoints[i].AZ != "" {
			return nil, errors.New("Hashmod algorithm does not support AZ aware hashring configuration. Either use Ketama or remove AZ configuration.")
		}
		addresses[i] = endpoints[i].Address
	}
	return simpleHashring(addresses), nil
}

// Get returns a target to handle the given tenant and time series.
func (s simpleHashring) Get(tenant string, ts *prompb.TimeSeries) (string, error) {
	return s.GetN(tenant, ts, 0)
}

// GetN returns the nth target to handle the given tenant and time series.
func (s simpleHashring) GetN(tenant string, ts *prompb.TimeSeries, n uint64) (string, error) {
	if n >= uint64(len(s)) {
		return "", &insufficientNodesError{have: uint64(len(s)), want: n + 1}
	}

	return s[(labelpb.HashWithPrefix(tenant, ts.Labels)+n)%uint64(len(s))], nil
}
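// hashmodReplicaSet is a hypothetical sketch, not part of the upstream file.
// It shows how a caller could use GetN to collect every replica target for one
// series under the hashmod scheme: because GetN adds n to the series hash
// before taking the modulo, the replicas land on consecutive ring positions.
// The function name is illustrative only.
func hashmodReplicaSet(ring simpleHashring, tenant string, ts *prompb.TimeSeries, replicationFactor uint64) ([]string, error) {
	targets := make([]string, 0, replicationFactor)
	for n := uint64(0); n < replicationFactor; n++ {
		// GetN fails once n reaches the ring size, so replicationFactor
		// must not exceed the number of endpoints.
		addr, err := ring.GetN(tenant, ts, n)
		if err != nil {
			return nil, err
		}
		targets = append(targets, addr)
	}
	return targets, nil
}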
type section struct {
	az            string
	endpointIndex uint64
	hash          uint64
	replicas      []uint64
}

type sections []*section

func (p sections) Len() int           { return len(p) }
func (p sections) Less(i, j int) bool { return p[i].hash < p[j].hash }
func (p sections) Swap(i, j int)      { p[i], p[j] = p[j], p[i] }
func (p sections) Sort()              { sort.Sort(p) }

// ketamaHashring represents a group of nodes handling write requests with consistent hashing.
type ketamaHashring struct {
	endpoints    []Endpoint
	sections     sections
	numEndpoints uint64
}

func newKetamaHashring(endpoints []Endpoint, sectionsPerNode int, replicationFactor uint64) (*ketamaHashring, error) {
	numSections := len(endpoints) * sectionsPerNode

	if len(endpoints) < int(replicationFactor) {
		return nil, errors.New("ketama: number of endpoints needs to be at least the replication factor")
	}
	hash := xxhash.New()
	availabilityZones := make(map[string]struct{})
	ringSections := make(sections, 0, numSections)
	for endpointIndex, endpoint := range endpoints {
		availabilityZones[endpoint.AZ] = struct{}{}
		for i := 1; i <= sectionsPerNode; i++ {
			_, _ = hash.Write([]byte(endpoint.Address + ":" + strconv.Itoa(i)))
			n := &section{
				az:            endpoint.AZ,
				endpointIndex: uint64(endpointIndex),
				hash:          hash.Sum64(),
				replicas:      make([]uint64, 0, replicationFactor),
			}

			ringSections = append(ringSections, n)
			hash.Reset()
		}
	}
	sort.Sort(ringSections)
	calculateSectionReplicas(ringSections, replicationFactor, availabilityZones)

	return &ketamaHashring{
		endpoints:    endpoints,
		sections:     ringSections,
		numEndpoints: uint64(len(endpoints)),
	}, nil
}

func sizeOfLeastOccupiedAZ(azSpread map[string]int64) int64 {
	minValue := int64(math.MaxInt64)
	for _, value := range azSpread {
		if value < minValue {
			minValue = value
		}
	}
	return minValue
}
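// ketamaSectionHashes is a hypothetical sketch, not part of the upstream file.
// It isolates the hashing step of newKetamaHashring for a single endpoint:
// each of the sectionsPerNode ring positions is the xxhash of "<address>:<i>",
// which is what spreads one node's ownership across the ring. The function
// name is illustrative only.
func ketamaSectionHashes(endpoint Endpoint, sectionsPerNode int) []uint64 {
	hashes := make([]uint64, 0, sectionsPerNode)
	h := xxhash.New()
	for i := 1; i <= sectionsPerNode; i++ {
		_, _ = h.Write([]byte(endpoint.Address + ":" + strconv.Itoa(i)))
		hashes = append(hashes, h.Sum64())
		h.Reset()
	}
	return hashes
}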
// calculateSectionReplicas pre-calculates replicas for each section,
// ensuring that replicas for each ring section are owned by different endpoints.
func calculateSectionReplicas(ringSections sections, replicationFactor uint64, availabilityZones map[string]struct{}) {
	for i, s := range ringSections {
		replicas := make(map[uint64]struct{})
		azSpread := make(map[string]int64)
		for az := range availabilityZones {
			// This is to make sure each az is initially represented
			azSpread[az] = 0
		}
		j := i - 1
		for uint64(len(replicas)) < replicationFactor {
			j = (j + 1) % len(ringSections)
			rep := ringSections[j]
			if _, ok := replicas[rep.endpointIndex]; ok {
				continue
			}
			if len(azSpread) > 1 && azSpread[rep.az] > 0 && azSpread[rep.az] > sizeOfLeastOccupiedAZ(azSpread) {
				// We want to ensure even AZ spread before we add more replicas within the same AZ
				continue
			}
			replicas[rep.endpointIndex] = struct{}{}
			azSpread[rep.az]++
			s.replicas = append(s.replicas, rep.endpointIndex)
		}
	}
}

func (c ketamaHashring) Get(tenant string, ts *prompb.TimeSeries) (string, error) {
	return c.GetN(tenant, ts, 0)
}

func (c ketamaHashring) GetN(tenant string, ts *prompb.TimeSeries, n uint64) (string, error) {
	if n >= c.numEndpoints {
		return "", &insufficientNodesError{have: c.numEndpoints, want: n + 1}
	}

	v := labelpb.HashWithPrefix(tenant, ts.Labels)

	var i uint64
	i = uint64(sort.Search(len(c.sections), func(i int) bool {
		return c.sections[i].hash >= v
	}))

	numSections := uint64(len(c.sections))
	if i == numSections {
		i = 0
	}

	endpointIndex := c.sections[i].replicas[n]
	return c.endpoints[endpointIndex].Address, nil
}
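// ketamaOwningSection is a hypothetical sketch, not part of the upstream file.
// It restates the lookup step of ketamaHashring.GetN on its own: the owning
// section is the first one whose hash is >= the series hash, wrapping around
// to section 0 when the series hashes past the last section; replica n is then
// read from that section's precomputed replicas slice. The function name is
// illustrative only.
func ketamaOwningSection(ring *ketamaHashring, tenant string, ts *prompb.TimeSeries) *section {
	v := labelpb.HashWithPrefix(tenant, ts.Labels)
	i := sort.Search(len(ring.sections), func(i int) bool {
		return ring.sections[i].hash >= v
	})
	if i == len(ring.sections) {
		// The series hash is larger than every section hash, so it wraps
		// around to the first section of the ring.
		i = 0
	}
	return ring.sections[i]
}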
// multiHashring represents a set of hashrings.
// Which hashring to use for a tenant is determined
// by the tenants field of the hashring configuration.
type multiHashring struct {
	cache      map[string]Hashring
	hashrings  []Hashring
	tenantSets []map[string]struct{}

	// We need a mutex to guard concurrent access
	// to the cache map, as this is both written to
	// and read from.
	mu sync.RWMutex
}

// Get returns a target to handle the given tenant and time series.
func (m *multiHashring) Get(tenant string, ts *prompb.TimeSeries) (string, error) {
	return m.GetN(tenant, ts, 0)
}

// GetN returns the nth target to handle the given tenant and time series.
func (m *multiHashring) GetN(tenant string, ts *prompb.TimeSeries, n uint64) (string, error) {
	m.mu.RLock()
	h, ok := m.cache[tenant]
	m.mu.RUnlock()
	if ok {
		return h.GetN(tenant, ts, n)
	}
	var found bool
	// If the tenant is not in the cache, then we need to check
	// every tenant in the configuration.
	for i, t := range m.tenantSets {
		// If the hashring has no tenants, then it is
		// considered a default hashring and matches everything.
		if t == nil {
			found = true
		} else if _, ok := t[tenant]; ok {
			found = true
		}
		if found {
			m.mu.Lock()
			m.cache[tenant] = m.hashrings[i]
			m.mu.Unlock()

			return m.hashrings[i].GetN(tenant, ts, n)
		}
	}
	return "", errors.New("no matching hashring to handle tenant")
}

// NewMultiHashring creates a multi-tenant hashring for a given slice of
// groups.
// Which hashring to use for a tenant is determined
// by the tenants field of the hashring configuration.
func NewMultiHashring(algorithm HashringAlgorithm, replicationFactor uint64, cfg []HashringConfig) (Hashring, error) {
	m := &multiHashring{
		cache: make(map[string]Hashring),
	}

	for _, h := range cfg {
		var hashring Hashring
		var err error
		activeAlgorithm := algorithm
		if h.Algorithm != "" {
			activeAlgorithm = h.Algorithm
		}
		hashring, err = newHashring(activeAlgorithm, h.Endpoints, replicationFactor, h.Hashring, h.Tenants)
		if err != nil {
			return nil, err
		}
		m.hashrings = append(m.hashrings, hashring)
		var t map[string]struct{}
		if len(h.Tenants) != 0 {
			t = make(map[string]struct{})
		}
		for _, tenant := range h.Tenants {
			t[tenant] = struct{}{}
		}
		m.tenantSets = append(m.tenantSets, t)
	}
	return m, nil
}

func newHashring(algorithm HashringAlgorithm, endpoints []Endpoint, replicationFactor uint64, hashring string, tenants []string) (Hashring, error) {
	switch algorithm {
	case AlgorithmHashmod:
		return newSimpleHashring(endpoints)
	case AlgorithmKetama:
		return newKetamaHashring(endpoints, SectionsPerNode, replicationFactor)
	default:
		l := log.NewNopLogger()
		level.Warn(l).Log("msg", "Unrecognizable hashring algorithm. Fall back to hashmod algorithm.",
			"hashring", hashring,
			"tenants", tenants)
		return newSimpleHashring(endpoints)
	}
}
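// routeSeriesExample is a hypothetical sketch, not part of the upstream file.
// It shows one way a caller might build a multi-tenant hashring from
// configuration and route a series. The endpoint addresses, tenant name, and
// the function name are made up for illustration; replication factor 2 with
// three ketama endpoints satisfies the endpoints >= replication factor check
// in newKetamaHashring.
func routeSeriesExample(ts *prompb.TimeSeries) (string, error) {
	cfg := []HashringConfig{
		{
			Hashring: "tenant-a-ring",
			Tenants:  []string{"tenant-a"},
			Endpoints: []Endpoint{
				{Address: "receive-0:10901", AZ: "az-1"},
				{Address: "receive-1:10901", AZ: "az-2"},
				{Address: "receive-2:10901", AZ: "az-3"},
			},
		},
	}
	ring, err := NewMultiHashring(AlgorithmKetama, 2, cfg)
	if err != nil {
		return "", err
	}
	// Get returns the first replica for this tenant's series;
	// GetN(tenant, ts, 1) would return the second replica.
	return ring.Get("tenant-a", ts)
}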