gitee.com/ks-custle/core-gm@v0.0.0-20230922171213-b83bdd97b62c/grpc/xds/internal/xdsclient/load/store.go

/*
 * Copyright 2020 gRPC authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Package load provides functionality to record and maintain load data.
package load

import (
	"sync"
	"sync/atomic"
	"time"
)

const negativeOneUInt64 = ^uint64(0)

// Store keeps the loads for multiple clusters and services to be reported via
// LRS. It contains loads to be reported to one LRS server. Create multiple
// stores for multiple servers.
//
// It is safe for concurrent use.
type Store struct {
	// mu only protects the map (2 layers). Reads and writes to a
	// *perClusterStore don't need to hold mu.
	mu sync.Mutex
	// clusters is a map with cluster name as the key. The second layer is a map
	// with service name as the key. Each value (perClusterStore) contains data
	// for a (cluster, service) pair.
	//
	// Note that new entries are added to this map but never removed. This is a
	// potential memory leak, but since the memory allocated for each new
	// (cluster, service) pair is just pointers and maps, it shouldn't grow too
	// large.
	clusters map[string]map[string]*perClusterStore
}

// NewStore creates a Store.
func NewStore() *Store {
	return &Store{
		clusters: make(map[string]map[string]*perClusterStore),
	}
}

// Stats returns the load data for the given cluster names. Data is returned in
// a slice with no specific order.
//
// If no clusterName is given (an empty slice), all data for all known clusters
// is returned.
//
// If a cluster's Data is empty (no load to report), it's not appended to the
// returned slice.
func (s *Store) Stats(clusterNames []string) []*Data {
	var ret []*Data
	s.mu.Lock()
	defer s.mu.Unlock()

	if len(clusterNames) == 0 {
		for _, c := range s.clusters {
			ret = appendClusterStats(ret, c)
		}
		return ret
	}

	for _, n := range clusterNames {
		if c, ok := s.clusters[n]; ok {
			ret = appendClusterStats(ret, c)
		}
	}
	return ret
}

// appendClusterStats gets Data for the given cluster, appends it to ret, and
// returns the new slice.
//
// Data is only appended to ret if it's not empty.
func appendClusterStats(ret []*Data, cluster map[string]*perClusterStore) []*Data {
	for _, d := range cluster {
		data := d.stats()
		if data == nil {
			// Skip this data if it doesn't contain any information.
			continue
		}
		ret = append(ret, data)
	}
	return ret
}
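
// The sketch below is illustrative and not part of this file's API surface: it
// shows, assuming a single LRS server, how a caller might feed a Store and
// later drain it with Stats. The cluster and service names are hypothetical.
//
//	store := NewStore()
//	store.PerCluster("cluster-a", "eds-service-a").CallDropped("rate_limit")
//	for _, d := range store.Stats(nil) {
//		// d.Cluster and d.Service identify the (cluster, service) pair.
//		// d.TotalDrops, d.Drops and d.LocalityStats carry the load recorded
//		// since the previous Stats call, and d.ReportInterval is the time
//		// elapsed since then.
//		_ = d
//	}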

// PerCluster returns the perClusterStore for the given clusterName +
// serviceName.
func (s *Store) PerCluster(clusterName, serviceName string) PerClusterReporter {
	if s == nil {
		return nil
	}

	s.mu.Lock()
	defer s.mu.Unlock()
	c, ok := s.clusters[clusterName]
	if !ok {
		c = make(map[string]*perClusterStore)
		s.clusters[clusterName] = c
	}

	if p, ok := c[serviceName]; ok {
		return p
	}
	p := &perClusterStore{
		cluster: clusterName,
		service: serviceName,
	}
	c[serviceName] = p
	return p
}

// perClusterStore is a repository for LB policy implementations to report
// load data. It contains load for a (cluster, edsService) pair.
//
// It is safe for concurrent use.
//
// TODO(easwars): Use regular maps with mutexes instead of sync.Map here. The
// latter is optimized for two common use cases: (1) when the entry for a given
// key is only ever written once but read many times, as in caches that only
// grow, or (2) when multiple goroutines read, write, and overwrite entries for
// disjoint sets of keys. In these two cases, use of a Map may significantly
// reduce lock contention compared to a Go map paired with a separate Mutex or
// RWMutex. Neither of these conditions is met here, and we should transition
// to a regular map with a mutex for better type safety.
type perClusterStore struct {
	cluster, service string
	drops            sync.Map // map[string]*uint64
	localityRPCCount sync.Map // map[string]*rpcCountData

	mu               sync.Mutex
	lastLoadReportAt time.Time
}

// Update functions are called by the picker for each RPC. To avoid contention,
// all updates are done atomically.

// CallDropped adds one drop record with the given category to the store.
func (ls *perClusterStore) CallDropped(category string) {
	if ls == nil {
		return
	}

	p, ok := ls.drops.Load(category)
	if !ok {
		tp := new(uint64)
		p, _ = ls.drops.LoadOrStore(category, tp)
	}
	atomic.AddUint64(p.(*uint64), 1)
}

// CallStarted adds one call started record for the given locality.
func (ls *perClusterStore) CallStarted(locality string) {
	if ls == nil {
		return
	}

	p, ok := ls.localityRPCCount.Load(locality)
	if !ok {
		tp := newRPCCountData()
		p, _ = ls.localityRPCCount.LoadOrStore(locality, tp)
	}
	p.(*rpcCountData).incrInProgress()
}

// CallFinished adds one call finished record for the given locality.
// For successful calls, err needs to be nil.
func (ls *perClusterStore) CallFinished(locality string, err error) {
	if ls == nil {
		return
	}

	p, ok := ls.localityRPCCount.Load(locality)
	if !ok {
		// The map is never cleared, only values in the map are reset. So the
		// case where an entry for a finished call is not found should never
		// happen.
		return
	}
	p.(*rpcCountData).decrInProgress()
	if err == nil {
		p.(*rpcCountData).incrSucceeded()
	} else {
		p.(*rpcCountData).incrErrored()
	}
}
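
// Illustrative sketch (not part of this file): the expected per-RPC reporting
// sequence from a picker. The names below are hypothetical. Drops never reach
// a locality, so they are reported via CallDropped instead.
//
//	reporter := store.PerCluster("cluster-a", "eds-service-a")
//	reporter.CallStarted("locality-1")          // when the RPC is picked
//	// ... RPC runs ...
//	reporter.CallFinished("locality-1", rpcErr) // rpcErr is the RPC's final error; nil counts as success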

// CallServerLoad adds one server load record for the given locality. The
// load type is specified by name, and its value by d.
func (ls *perClusterStore) CallServerLoad(locality, name string, d float64) {
	if ls == nil {
		return
	}

	p, ok := ls.localityRPCCount.Load(locality)
	if !ok {
		// The map is never cleared, only values in the map are reset. So the
		// case where an entry for a server load report is not found should
		// never happen.
		return
	}
	p.(*rpcCountData).addServerLoad(name, d)
}

// Data contains all load data reported to the Store since the most recent call
// to stats().
type Data struct {
	// Cluster is the name of the cluster this data is for.
	Cluster string
	// Service is the name of the EDS service this data is for.
	Service string
	// TotalDrops is the total number of dropped requests.
	TotalDrops uint64
	// Drops is the number of dropped requests per category.
	Drops map[string]uint64
	// LocalityStats contains load reports per locality.
	LocalityStats map[string]LocalityData
	// ReportInterval is the duration since the last time load was reported
	// (stats() was called).
	ReportInterval time.Duration
}

// LocalityData contains load data for a single locality.
type LocalityData struct {
	// RequestStats contains counts of requests made to the locality.
	RequestStats RequestData
	// LoadStats contains server load data for requests made to the locality,
	// indexed by the load type.
	LoadStats map[string]ServerLoadData
}

// RequestData contains request counts.
type RequestData struct {
	// Succeeded is the number of succeeded requests.
	Succeeded uint64
	// Errored is the number of requests which ran into errors.
	Errored uint64
	// InProgress is the number of requests in flight.
	InProgress uint64
}

// ServerLoadData contains server load data.
type ServerLoadData struct {
	// Count is the number of load reports.
	Count uint64
	// Sum is the total value of all load reports.
	Sum float64
}

func newData(cluster, service string) *Data {
	return &Data{
		Cluster:       cluster,
		Service:       service,
		Drops:         make(map[string]uint64),
		LocalityStats: make(map[string]LocalityData),
	}
}
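
// As an illustration only (all names and numbers below are hypothetical), a
// single snapshot returned by stats() might look like:
//
//	&Data{
//		Cluster:    "cluster-a",
//		Service:    "eds-service-a",
//		TotalDrops: 7, // includes 2 drops reported without a category
//		Drops:      map[string]uint64{"rate_limit": 5},
//		LocalityStats: map[string]LocalityData{
//			"locality-1": {
//				RequestStats: RequestData{Succeeded: 90, Errored: 3, InProgress: 4},
//				LoadStats: map[string]ServerLoadData{
//					"cpu_utilization": {Count: 93, Sum: 46.5},
//				},
//			},
//		},
//		ReportInterval: 10 * time.Second,
//	}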

// stats returns and resets all loads reported to the store, except inProgress
// rpc counts.
//
// It returns nil if the store doesn't contain any (new) data.
func (ls *perClusterStore) stats() *Data {
	if ls == nil {
		return nil
	}

	sd := newData(ls.cluster, ls.service)
	ls.drops.Range(func(key, val interface{}) bool {
		d := atomic.SwapUint64(val.(*uint64), 0)
		if d == 0 {
			return true
		}
		sd.TotalDrops += d
		keyStr := key.(string)
		if keyStr != "" {
			// Skip drops without a category. They are counted in TotalDrops,
			// but not in the per-category map. One example is drops caused by
			// circuit breaking.
			sd.Drops[keyStr] = d
		}
		return true
	})
	ls.localityRPCCount.Range(func(key, val interface{}) bool {
		countData := val.(*rpcCountData)
		succeeded := countData.loadAndClearSucceeded()
		inProgress := countData.loadInProgress()
		errored := countData.loadAndClearErrored()
		if succeeded == 0 && inProgress == 0 && errored == 0 {
			return true
		}

		ld := LocalityData{
			RequestStats: RequestData{
				Succeeded:  succeeded,
				Errored:    errored,
				InProgress: inProgress,
			},
			LoadStats: make(map[string]ServerLoadData),
		}
		countData.serverLoads.Range(func(key, val interface{}) bool {
			sum, count := val.(*rpcLoadData).loadAndClear()
			if count == 0 {
				return true
			}
			ld.LoadStats[key.(string)] = ServerLoadData{
				Count: count,
				Sum:   sum,
			}
			return true
		})
		sd.LocalityStats[key.(string)] = ld
		return true
	})

	ls.mu.Lock()
	sd.ReportInterval = time.Since(ls.lastLoadReportAt)
	ls.lastLoadReportAt = time.Now()
	ls.mu.Unlock()

	if sd.TotalDrops == 0 && len(sd.Drops) == 0 && len(sd.LocalityStats) == 0 {
		return nil
	}
	return sd
}

type rpcCountData struct {
	// Only atomic accesses are allowed for the fields.
	succeeded  *uint64
	errored    *uint64
	inProgress *uint64

	// Map from load desc to load data (sum+count). Loading data from the map
	// is atomic, but updating data takes a lock, which could cause contention
	// when multiple RPCs try to report loads for the same desc.
	//
	// To fix the contention, shard this map.
	serverLoads sync.Map // map[string]*rpcLoadData
}

func newRPCCountData() *rpcCountData {
	return &rpcCountData{
		succeeded:  new(uint64),
		errored:    new(uint64),
		inProgress: new(uint64),
	}
}

func (rcd *rpcCountData) incrSucceeded() {
	atomic.AddUint64(rcd.succeeded, 1)
}

func (rcd *rpcCountData) loadAndClearSucceeded() uint64 {
	return atomic.SwapUint64(rcd.succeeded, 0)
}

func (rcd *rpcCountData) incrErrored() {
	atomic.AddUint64(rcd.errored, 1)
}

func (rcd *rpcCountData) loadAndClearErrored() uint64 {
	return atomic.SwapUint64(rcd.errored, 0)
}

func (rcd *rpcCountData) incrInProgress() {
	atomic.AddUint64(rcd.inProgress, 1)
}

func (rcd *rpcCountData) decrInProgress() {
	atomic.AddUint64(rcd.inProgress, negativeOneUInt64) // atomic.Add(x, -1)
}

func (rcd *rpcCountData) loadInProgress() uint64 {
	return atomic.LoadUint64(rcd.inProgress) // InProgress count is not cleared when reading.
}

func (rcd *rpcCountData) addServerLoad(name string, d float64) {
	loads, ok := rcd.serverLoads.Load(name)
	if !ok {
		tl := newRPCLoadData()
		loads, _ = rcd.serverLoads.LoadOrStore(name, tl)
	}
	loads.(*rpcLoadData).add(d)
}
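
// Illustrative only: decrInProgress relies on unsigned wrap-around, since
// atomic.AddUint64 takes a uint64 delta and has no signed variant. Adding
// negativeOneUInt64 (all bits set, i.e. 2^64-1) is equivalent to subtracting 1
// modulo 2^64:
//
//	var n uint64 = 3
//	atomic.AddUint64(&n, ^uint64(0)) // n is now 2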

// rpcLoadData holds data for server loads (from trailers or oob). Fields in
// this struct must be updated consistently.
//
// The current solution is to hold a lock, which could cause contention. To fix
// this, shard the serverLoads map in rpcCountData.
type rpcLoadData struct {
	mu    sync.Mutex
	sum   float64
	count uint64
}

func newRPCLoadData() *rpcLoadData {
	return &rpcLoadData{}
}

func (rld *rpcLoadData) add(v float64) {
	rld.mu.Lock()
	rld.sum += v
	rld.count++
	rld.mu.Unlock()
}

func (rld *rpcLoadData) loadAndClear() (s float64, c uint64) {
	rld.mu.Lock()
	s = rld.sum
	rld.sum = 0
	c = rld.count
	rld.count = 0
	rld.mu.Unlock()
	return
}
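
// A minimal sketch (hypothetical values) of how add and loadAndClear interact
// between two stats() calls:
//
//	rld := newRPCLoadData()
//	rld.add(0.5)
//	rld.add(1.5)
//	rld.add(2.0)
//	sum, count := rld.loadAndClear() // sum == 4, count == 3
//	// A consumer can derive the mean reported load as sum/float64(count).
//	// A second loadAndClear with no new add calls returns (0, 0), and such
//	// entries are skipped when stats() builds Data.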