dubbo.apache.org/dubbo-go/v3@v3.1.1/xds/client/load/store.go

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 *
 * Copyright 2020 gRPC authors.
 *
 */

// Package load provides functionality to record and maintain load data.
package load

import (
    "sync"
    "sync/atomic"
    "time"
)

const negativeOneUInt64 = ^uint64(0)

// Store keeps the loads for multiple clusters and services to be reported via
// LRS. It contains loads to be reported to one LRS server. Create multiple
// stores for multiple servers.
//
// It is safe for concurrent use.
type Store struct {
    // mu only protects the map (2 layers). The read/write to *perClusterStore
    // doesn't need to hold the mu.
    mu sync.Mutex
    // clusters is a map with cluster name as the key. The second layer is a map
    // with service name as the key. Each value (perClusterStore) contains data
    // for a (cluster, service) pair.
    //
    // Note that new entries are added to this map, but never removed. This is
    // potentially a memory leak. But the memory is allocated for each new
    // (cluster, service) pair, and the memory allocated is just pointers and
    // maps. So this shouldn't get too bad.
    clusters map[string]map[string]*perClusterStore
}

// NewStore creates a Store.
func NewStore() *Store {
    return &Store{
        clusters: make(map[string]map[string]*perClusterStore),
    }
}

// Stats returns the load data for the given cluster names. Data is returned in
// a slice with no specific order.
//
// If no clusterName is given (an empty slice), all data for all known clusters
// is returned.
//
// If a cluster's Data is empty (no load to report), it's not appended to the
// returned slice.
func (s *Store) Stats(clusterNames []string) []*Data {
    var ret []*Data
    s.mu.Lock()
    defer s.mu.Unlock()

    if len(clusterNames) == 0 {
        for _, c := range s.clusters {
            ret = appendClusterStats(ret, c)
        }
        return ret
    }

    for _, n := range clusterNames {
        if c, ok := s.clusters[n]; ok {
            ret = appendClusterStats(ret, c)
        }
    }
    return ret
}

// appendClusterStats gets Data for the given cluster, appends it to ret, and
// returns the new slice.
//
// Data is only appended to ret if it's not empty.
func appendClusterStats(ret []*Data, cluster map[string]*perClusterStore) []*Data {
    for _, d := range cluster {
        data := d.stats()
        if data == nil {
            // Skip this data if it doesn't contain any information.
            continue
        }
        ret = append(ret, data)
    }
    return ret
}
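
// The function below is an illustrative sketch, not part of the original file:
// it shows roughly what an LRS reporting loop would do once per interval,
// assuming the PerClusterReporter interface (defined elsewhere in this package)
// exposes the Call* methods that perClusterStore implements below. The helper
// name recordAndSnapshot and the literal cluster/service/locality names are
// made up for illustration.
func recordAndSnapshot(s *Store) []*Data {
    cs := s.PerCluster("cluster-a", "eds-service-a")

    // Record one RPC's worth of load; these per-call updates are the hot path.
    cs.CallStarted("locality-1")
    cs.CallServerLoad("locality-1", "cpu_utilization", 0.35)
    cs.CallFinished("locality-1", nil)
    cs.CallDropped("rate_limit")

    // A nil/empty slice of cluster names means "all known clusters". The
    // returned Data also records the interval since the previous snapshot.
    return s.Stats(nil)
}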

// PerCluster returns the perClusterStore for the given clusterName +
// serviceName.
func (s *Store) PerCluster(clusterName, serviceName string) PerClusterReporter {
    if s == nil {
        return nil
    }

    s.mu.Lock()
    defer s.mu.Unlock()
    c, ok := s.clusters[clusterName]
    if !ok {
        c = make(map[string]*perClusterStore)
        s.clusters[clusterName] = c
    }

    if p, ok := c[serviceName]; ok {
        return p
    }
    p := &perClusterStore{
        cluster: clusterName,
        service: serviceName,
    }
    c[serviceName] = p
    return p
}

// perClusterStore is a repository for LB policy implementations to report store
// load data. It contains load for a (cluster, edsService) pair.
//
// It is safe for concurrent use.
//
// TODO(easwars): Use regular maps with mutexes instead of sync.Map here. The
// latter is optimized for two common use cases: (1) when the entry for a given
// key is only ever written once but read many times, as in caches that only
// grow, or (2) when multiple goroutines read, write, and overwrite entries for
// disjoint sets of keys. In these two cases, use of a Map may significantly
// reduce lock contention compared to a Go map paired with a separate Mutex or
// RWMutex.
// Neither of these conditions is met here, and we should transition to a
// regular map with a mutex for better type safety.
type perClusterStore struct {
    cluster, service string
    drops            sync.Map // map[string]*uint64
    localityRPCCount sync.Map // map[string]*rpcCountData

    mu               sync.Mutex
    lastLoadReportAt time.Time
}

// Update functions are called by the picker for each RPC. To avoid contention,
// all updates are done atomically.

// CallDropped adds one drop record with the given category to the store.
func (ls *perClusterStore) CallDropped(category string) {
    if ls == nil {
        return
    }

    p, ok := ls.drops.Load(category)
    if !ok {
        tp := new(uint64)
        p, _ = ls.drops.LoadOrStore(category, tp)
    }
    atomic.AddUint64(p.(*uint64), 1)
}

// CallStarted adds one call started record for the given locality.
func (ls *perClusterStore) CallStarted(locality string) {
    if ls == nil {
        return
    }

    p, ok := ls.localityRPCCount.Load(locality)
    if !ok {
        tp := newRPCCountData()
        p, _ = ls.localityRPCCount.LoadOrStore(locality, tp)
    }
    p.(*rpcCountData).incrInProgress()
}

// CallFinished adds one call finished record for the given locality.
// For successful calls, err needs to be nil.
func (ls *perClusterStore) CallFinished(locality string, err error) {
    if ls == nil {
        return
    }

    p, ok := ls.localityRPCCount.Load(locality)
    if !ok {
        // The map is never cleared, only values in the map are reset. So the
        // case where the entry for a call-finish is not found should never
        // happen.
        return
    }
    p.(*rpcCountData).decrInProgress()
    if err == nil {
        p.(*rpcCountData).incrSucceeded()
    } else {
        p.(*rpcCountData).incrErrored()
    }
}
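
// Illustrative sketch (hypothetical helper, not part of the original file):
// the Load-then-LoadOrStore pattern used by CallDropped and CallStarted above.
// The initial Load avoids an allocation on the common path; when two goroutines
// race to create an entry, LoadOrStore guarantees both end up incrementing the
// same counter.
func incrCounterSketch(m *sync.Map, key string) {
    p, ok := m.Load(key)
    if !ok {
        p, _ = m.LoadOrStore(key, new(uint64))
    }
    atomic.AddUint64(p.(*uint64), 1)
}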

// CallServerLoad adds one server load record for the given locality. The
// load type is specified by name, and its value by d.
func (ls *perClusterStore) CallServerLoad(locality, name string, d float64) {
    if ls == nil {
        return
    }

    p, ok := ls.localityRPCCount.Load(locality)
    if !ok {
        // The map is never cleared, only values in the map are reset. So the
        // case where the entry for callServerLoad is not found should never
        // happen.
        return
    }
    p.(*rpcCountData).addServerLoad(name, d)
}

// Data contains all load data reported to the Store since the most recent call
// to stats().
type Data struct {
    // Cluster is the name of the cluster this data is for.
    Cluster string
    // Service is the name of the EDS service this data is for.
    Service string
    // TotalDrops is the total number of dropped requests.
    TotalDrops uint64
    // Drops is the number of dropped requests per category.
    Drops map[string]uint64
    // LocalityStats contains load reports per locality.
    LocalityStats map[string]LocalityData
    // ReportInterval is the duration since the last time load was reported
    // (stats() was called).
    ReportInterval time.Duration
}

// LocalityData contains load data for a single locality.
type LocalityData struct {
    // RequestStats contains counts of requests made to the locality.
    RequestStats RequestData
    // LoadStats contains server load data for requests made to the locality,
    // indexed by the load type.
    LoadStats map[string]ServerLoadData
}

// RequestData contains request counts.
type RequestData struct {
    // Succeeded is the number of succeeded requests.
    Succeeded uint64
    // Errored is the number of requests which ran into errors.
    Errored uint64
    // InProgress is the number of requests in flight.
    InProgress uint64
}

// ServerLoadData contains server load data.
type ServerLoadData struct {
    // Count is the number of load reports.
    Count uint64
    // Sum is the total value of all load reports.
    Sum float64
}

func newData(cluster, service string) *Data {
    return &Data{
        Cluster:       cluster,
        Service:       service,
        Drops:         make(map[string]uint64),
        LocalityStats: make(map[string]LocalityData),
    }
}
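
// Illustrative sketch (hypothetical helper, not part of the original file):
// walking one Data snapshot returned by Store.Stats, e.g. totalling finished
// requests across all localities.
func finishedRequestsSketch(d *Data) uint64 {
    var total uint64
    for _, ld := range d.LocalityStats {
        total += ld.RequestStats.Succeeded + ld.RequestStats.Errored
    }
    return total
}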

// stats returns and resets all loads reported to the store, except inProgress
// rpc counts.
//
// It returns nil if the store doesn't contain any (new) data.
func (ls *perClusterStore) stats() *Data {
    if ls == nil {
        return nil
    }

    sd := newData(ls.cluster, ls.service)
    ls.drops.Range(func(key, val interface{}) bool {
        d := atomic.SwapUint64(val.(*uint64), 0)
        if d == 0 {
            return true
        }
        sd.TotalDrops += d
        keyStr := key.(string)
        if keyStr != "" {
            // Skip drops without a category. They are counted in TotalDrops,
            // but not in the per-category map. One example is drops by
            // circuit breaking.
            sd.Drops[keyStr] = d
        }
        return true
    })
    ls.localityRPCCount.Range(func(key, val interface{}) bool {
        countData := val.(*rpcCountData)
        succeeded := countData.loadAndClearSucceeded()
        inProgress := countData.loadInProgress()
        errored := countData.loadAndClearErrored()
        if succeeded == 0 && inProgress == 0 && errored == 0 {
            return true
        }

        ld := LocalityData{
            RequestStats: RequestData{
                Succeeded:  succeeded,
                Errored:    errored,
                InProgress: inProgress,
            },
            LoadStats: make(map[string]ServerLoadData),
        }
        countData.serverLoads.Range(func(key, val interface{}) bool {
            sum, count := val.(*rpcLoadData).loadAndClear()
            if count == 0 {
                return true
            }
            ld.LoadStats[key.(string)] = ServerLoadData{
                Count: count,
                Sum:   sum,
            }
            return true
        })
        sd.LocalityStats[key.(string)] = ld
        return true
    })

    ls.mu.Lock()
    sd.ReportInterval = time.Since(ls.lastLoadReportAt)
    ls.lastLoadReportAt = time.Now()
    ls.mu.Unlock()

    if sd.TotalDrops == 0 && len(sd.Drops) == 0 && len(sd.LocalityStats) == 0 {
        return nil
    }
    return sd
}

type rpcCountData struct {
    // Only atomic accesses are allowed for the fields.
    succeeded  *uint64
    errored    *uint64
    inProgress *uint64

    // Map from load desc to load data (sum+count). Loading data from the map
    // is atomic, but updating data takes a lock, which could cause contention
    // when multiple RPCs try to report loads for the same desc.
    //
    // To fix the contention, shard this map.
    serverLoads sync.Map // map[string]*rpcLoadData
}

func newRPCCountData() *rpcCountData {
    return &rpcCountData{
        succeeded:  new(uint64),
        errored:    new(uint64),
        inProgress: new(uint64),
    }
}

func (rcd *rpcCountData) incrSucceeded() {
    atomic.AddUint64(rcd.succeeded, 1)
}

func (rcd *rpcCountData) loadAndClearSucceeded() uint64 {
    return atomic.SwapUint64(rcd.succeeded, 0)
}

func (rcd *rpcCountData) incrErrored() {
    atomic.AddUint64(rcd.errored, 1)
}

func (rcd *rpcCountData) loadAndClearErrored() uint64 {
    return atomic.SwapUint64(rcd.errored, 0)
}

func (rcd *rpcCountData) incrInProgress() {
    atomic.AddUint64(rcd.inProgress, 1)
}

func (rcd *rpcCountData) decrInProgress() {
    atomic.AddUint64(rcd.inProgress, negativeOneUInt64) // atomic.Add(x, -1)
}

func (rcd *rpcCountData) loadInProgress() uint64 {
    return atomic.LoadUint64(rcd.inProgress) // InProgress count is not cleared when reading.
}

func (rcd *rpcCountData) addServerLoad(name string, d float64) {
    loads, ok := rcd.serverLoads.Load(name)
    if !ok {
        tl := newRPCLoadData()
        loads, _ = rcd.serverLoads.LoadOrStore(name, tl)
    }
    loads.(*rpcLoadData).add(d)
}
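
// Illustrative note (hypothetical helper, not part of the original file):
// adding ^uint64(0), i.e. a value with all bits set, to a uint64 wraps around
// modulo 2^64 and is therefore equivalent to subtracting 1. That is the trick
// behind negativeOneUInt64 and decrInProgress above.
func decrUint64Sketch(p *uint64) {
    atomic.AddUint64(p, ^uint64(0)) // same effect as atomic.AddUint64(p, negativeOneUInt64)
}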

// Data for server loads (from trailers or oob). Fields in this struct must be
// updated consistently.
//
// The current solution is to hold a lock, which could cause contention. To fix,
// shard the serverLoads map in rpcCountData.
type rpcLoadData struct {
    mu    sync.Mutex
    sum   float64
    count uint64
}

func newRPCLoadData() *rpcLoadData {
    return &rpcLoadData{}
}

func (rld *rpcLoadData) add(v float64) {
    rld.mu.Lock()
    rld.sum += v
    rld.count++
    rld.mu.Unlock()
}

func (rld *rpcLoadData) loadAndClear() (s float64, c uint64) {
    rld.mu.Lock()
    s = rld.sum
    rld.sum = 0
    c = rld.count
    rld.count = 0
    rld.mu.Unlock()
    return
}
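
// Illustrative sketch (hypothetical helper, not part of the original file):
// because loadAndClear returns sum and count from a single critical section,
// the pair is mutually consistent, so a caller can derive the mean load for
// the elapsed interval before both values are reset to zero.
func drainMeanSketch(rld *rpcLoadData) (mean float64, ok bool) {
    sum, count := rld.loadAndClear()
    if count == 0 {
        return 0, false
    }
    return sum / float64(count), true
}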