dubbo.apache.org/dubbo-go/v3@v3.1.1/xds/client/load/store.go

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 *
 * Copyright 2020 gRPC authors.
 *
 */

// Package load provides functionality to record and maintain load data.
package load

import (
	"sync"
	"sync/atomic"
	"time"
)

const negativeOneUInt64 = ^uint64(0)

// Store keeps the loads for multiple clusters and services to be reported via
// LRS. It contains the loads to be reported to one LRS server. Create multiple
// stores for multiple servers.
//
// It is safe for concurrent use.
type Store struct {
	// mu only protects the map (2 layers). Reads and writes to a
	// *perClusterStore do not need to hold mu.
	mu sync.Mutex
	// clusters is a map with cluster name as the key. The second layer is a map
	// with service name as the key. Each value (perClusterStore) contains data
	// for a (cluster, service) pair.
	//
	// Note that new entries are added to this map but never removed, which is
	// potentially a memory leak. However, memory is allocated only once per new
	// (cluster, service) pair, and what is allocated is just pointers and maps,
	// so this shouldn't get too bad.
	clusters map[string]map[string]*perClusterStore
}

// NewStore creates a Store.
func NewStore() *Store {
	return &Store{
		clusters: make(map[string]map[string]*perClusterStore),
	}
}
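
// A minimal usage sketch (illustrative, not part of the original file): one
// Store is created per LRS server, per-(cluster, service) reporters are
// obtained from it, and the collected data is periodically drained with Stats.
// The cluster, service, and locality names below are hypothetical.
//
//	store := NewStore()
//	reporter := store.PerCluster("cluster-a", "eds-service-1")
//	reporter.CallStarted("locality-x")
//	reporter.CallFinished("locality-x", nil)
//	data := store.Stats(nil) // drain loads for all known clusters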

// Stats returns the load data for the given cluster names. Data is returned in
// a slice with no specific order.
//
// If no clusterName is given (an empty slice), all data for all known clusters
// is returned.
//
// If a cluster's Data is empty (no load to report), it's not appended to the
// returned slice.
func (s *Store) Stats(clusterNames []string) []*Data {
	var ret []*Data
	s.mu.Lock()
	defer s.mu.Unlock()

	if len(clusterNames) == 0 {
		for _, c := range s.clusters {
			ret = appendClusterStats(ret, c)
		}
		return ret
	}

	for _, n := range clusterNames {
		if c, ok := s.clusters[n]; ok {
			ret = appendClusterStats(ret, c)
		}
	}
	return ret
}
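
// For illustration (a hedged sketch continuing the example above; names are
// hypothetical): passing specific cluster names limits the returned Data,
// while nil or an empty slice drains everything.
//
//	all := store.Stats(nil)                    // every known cluster
//	some := store.Stats([]string{"cluster-a"}) // only cluster-a, if present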

// appendClusterStats gets Data for the given cluster, appends it to ret, and
// returns the new slice.
//
// Data is only appended to ret if it's not empty.
func appendClusterStats(ret []*Data, cluster map[string]*perClusterStore) []*Data {
	for _, d := range cluster {
		data := d.stats()
		if data == nil {
			// Skip this data if it doesn't contain any information.
			continue
		}
		ret = append(ret, data)
	}
	return ret
}

// PerCluster returns the perClusterStore for the given clusterName +
// serviceName.
func (s *Store) PerCluster(clusterName, serviceName string) PerClusterReporter {
	if s == nil {
		return nil
	}

	s.mu.Lock()
	defer s.mu.Unlock()
	c, ok := s.clusters[clusterName]
	if !ok {
		c = make(map[string]*perClusterStore)
		s.clusters[clusterName] = c
	}

	if p, ok := c[serviceName]; ok {
		return p
	}
	p := &perClusterStore{
		cluster: clusterName,
		service: serviceName,
	}
	c[serviceName] = p
	return p
}
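
// Note that PerCluster is effectively get-or-create: calling it twice with the
// same names yields the same underlying store. A small sketch (illustrative;
// the names are hypothetical):
//
//	r1 := store.PerCluster("cluster-a", "svc")
//	r2 := store.PerCluster("cluster-a", "svc")
//	// r1 and r2 report into the same perClusterStore.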

// perClusterStore is a repository for LB policy implementations to report load
// data. It contains load for a (cluster, edsService) pair.
//
// It is safe for concurrent use.
//
// TODO(easwars): Use regular maps with mutexes instead of sync.Map here. The
// latter is optimized for two common use cases: (1) when the entry for a given
// key is only ever written once but read many times, as in caches that only
// grow, or (2) when multiple goroutines read, write, and overwrite entries for
// disjoint sets of keys. In these two cases, use of a Map may significantly
// reduce lock contention compared to a Go map paired with a separate Mutex or
// RWMutex. Neither of these conditions is met here, and we should transition
// to a regular map with a mutex for better type safety.
type perClusterStore struct {
	cluster, service string
	drops            sync.Map // map[string]*uint64
	localityRPCCount sync.Map // map[string]*rpcCountData

	mu               sync.Mutex
	lastLoadReportAt time.Time
}

// Update functions are called by the picker for each RPC. To avoid contention,
// all updates are done atomically.

// CallDropped adds one drop record with the given category to the store.
func (ls *perClusterStore) CallDropped(category string) {
	if ls == nil {
		return
	}

	p, ok := ls.drops.Load(category)
	if !ok {
		tp := new(uint64)
		p, _ = ls.drops.LoadOrStore(category, tp)
	}
	atomic.AddUint64(p.(*uint64), 1)
}
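
// The Load-then-LoadOrStore sequence above is the standard get-or-create idiom
// for sync.Map: if two goroutines race to create the counter for a new
// category, LoadOrStore guarantees both end up incrementing the same *uint64,
// and the loser's freshly allocated counter is simply discarded. A standalone
// sketch of the same idiom (illustrative; the key is hypothetical):
//
//	var m sync.Map
//	v, ok := m.Load("key")
//	if !ok {
//		v, _ = m.LoadOrStore("key", new(uint64)) // existing value or our new one
//	}
//	atomic.AddUint64(v.(*uint64), 1)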

// CallStarted adds one call started record for the given locality.
func (ls *perClusterStore) CallStarted(locality string) {
	if ls == nil {
		return
	}

	p, ok := ls.localityRPCCount.Load(locality)
	if !ok {
		tp := newRPCCountData()
		p, _ = ls.localityRPCCount.LoadOrStore(locality, tp)
	}
	p.(*rpcCountData).incrInProgress()
}

// CallFinished adds one call finished record for the given locality.
// For successful calls, err must be nil.
func (ls *perClusterStore) CallFinished(locality string, err error) {
	if ls == nil {
		return
	}

	p, ok := ls.localityRPCCount.Load(locality)
	if !ok {
		// The map is never cleared, only values in the map are reset. So the
		// case where the entry for a call-finish is not found should never
		// happen.
		return
	}
	p.(*rpcCountData).decrInProgress()
	if err == nil {
		p.(*rpcCountData).incrSucceeded()
	} else {
		p.(*rpcCountData).incrErrored()
	}
}
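
// A typical picker-side pairing (illustrative sketch; the locality name and
// invokeRPC helper are hypothetical): every CallStarted is matched by exactly
// one CallFinished so the in-progress gauge stays balanced.
//
//	reporter.CallStarted("locality-x")
//	err := invokeRPC() // hypothetical RPC invocation
//	reporter.CallFinished("locality-x", err)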

// CallServerLoad adds one server load record for the given locality. The load
// type is specified by name, and its value by d.
func (ls *perClusterStore) CallServerLoad(locality, name string, d float64) {
	if ls == nil {
		return
	}

	p, ok := ls.localityRPCCount.Load(locality)
	if !ok {
		// The map is never cleared, only values in the map are reset. So the
		// case where the entry for a server-load report is not found should
		// never happen.
		return
	}
	p.(*rpcCountData).addServerLoad(name, d)
}
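
// For example (illustrative; the load name and value are hypothetical), a
// backend-reported CPU utilization of 0.3 for a call routed to "locality-x"
// would be recorded as:
//
//	reporter.CallServerLoad("locality-x", "cpu_utilization", 0.3)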

// Data contains all load data reported to the Store since the most recent call
// to stats().
type Data struct {
	// Cluster is the name of the cluster this data is for.
	Cluster string
	// Service is the name of the EDS service this data is for.
	Service string
	// TotalDrops is the total number of dropped requests.
	TotalDrops uint64
	// Drops is the number of dropped requests per category.
	Drops map[string]uint64
	// LocalityStats contains load reports per locality.
	LocalityStats map[string]LocalityData
	// ReportInterval is the duration since the last time load was reported
	// (i.e. since stats() was called).
	ReportInterval time.Duration
}

// LocalityData contains load data for a single locality.
type LocalityData struct {
	// RequestStats contains counts of requests made to the locality.
	RequestStats RequestData
	// LoadStats contains server load data for requests made to the locality,
	// indexed by the load type.
	LoadStats map[string]ServerLoadData
}

// RequestData contains request counts.
type RequestData struct {
	// Succeeded is the number of succeeded requests.
	Succeeded uint64
	// Errored is the number of requests which ran into errors.
	Errored uint64
	// InProgress is the number of requests in flight.
	InProgress uint64
}

// ServerLoadData contains server load data.
type ServerLoadData struct {
	// Count is the number of load reports.
	Count uint64
	// Sum is the total value of all load reports.
	Sum float64
}
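
// Since ServerLoadData carries both the count and the sum of reports, a
// consumer can derive the mean load over a reporting interval. A hedged
// sketch (meanLoad is a hypothetical helper, not part of this package):
//
//	func meanLoad(d ServerLoadData) float64 {
//		if d.Count == 0 {
//			return 0
//		}
//		return d.Sum / float64(d.Count)
//	}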

func newData(cluster, service string) *Data {
	return &Data{
		Cluster:       cluster,
		Service:       service,
		Drops:         make(map[string]uint64),
		LocalityStats: make(map[string]LocalityData),
	}
}

// stats returns and resets all loads reported to the store, except in-progress
// RPC counts.
//
// It returns nil if the store doesn't contain any (new) data.
func (ls *perClusterStore) stats() *Data {
	if ls == nil {
		return nil
	}

	sd := newData(ls.cluster, ls.service)
	ls.drops.Range(func(key, val interface{}) bool {
		d := atomic.SwapUint64(val.(*uint64), 0)
		if d == 0 {
			return true
		}
		sd.TotalDrops += d
		keyStr := key.(string)
		if keyStr != "" {
			// Drops without a category are skipped here: they are counted in
			// TotalDrops, but not per category. One example is drops caused by
			// circuit breaking.
			sd.Drops[keyStr] = d
		}
		return true
	})
	ls.localityRPCCount.Range(func(key, val interface{}) bool {
		countData := val.(*rpcCountData)
		succeeded := countData.loadAndClearSucceeded()
		inProgress := countData.loadInProgress()
		errored := countData.loadAndClearErrored()
		if succeeded == 0 && inProgress == 0 && errored == 0 {
			return true
		}

		ld := LocalityData{
			RequestStats: RequestData{
				Succeeded:  succeeded,
				Errored:    errored,
				InProgress: inProgress,
			},
			LoadStats: make(map[string]ServerLoadData),
		}
		countData.serverLoads.Range(func(key, val interface{}) bool {
			sum, count := val.(*rpcLoadData).loadAndClear()
			if count == 0 {
				return true
			}
			ld.LoadStats[key.(string)] = ServerLoadData{
				Count: count,
				Sum:   sum,
			}
			return true
		})
		sd.LocalityStats[key.(string)] = ld
		return true
	})

	ls.mu.Lock()
	sd.ReportInterval = time.Since(ls.lastLoadReportAt)
	ls.lastLoadReportAt = time.Now()
	ls.mu.Unlock()

	if sd.TotalDrops == 0 && len(sd.Drops) == 0 && len(sd.LocalityStats) == 0 {
		return nil
	}
	return sd
}
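
// stats drains counters as it reads them, so two back-to-back calls behave as
// in this sketch (illustrative; assumes no other activity and no in-progress
// calls):
//
//	ls.CallDropped("rate_limit")
//	d1 := ls.stats() // d1.TotalDrops == 1; counters are now reset
//	d2 := ls.stats() // nil: nothing new was recorded in between
//
// Only in-progress counts survive a call to stats, since those RPCs are still
// outstanding.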

type rpcCountData struct {
	// Only atomic accesses are allowed for these fields.
	succeeded  *uint64
	errored    *uint64
	inProgress *uint64

	// Map from load desc to load data (sum+count). Loading data from the map
	// is atomic, but updating data takes a lock, which could cause contention
	// when multiple RPCs try to report loads for the same desc.
	//
	// To fix the contention, shard this map.
	serverLoads sync.Map // map[string]*rpcLoadData
}

func newRPCCountData() *rpcCountData {
	return &rpcCountData{
		succeeded:  new(uint64),
		errored:    new(uint64),
		inProgress: new(uint64),
	}
}

func (rcd *rpcCountData) incrSucceeded() {
	atomic.AddUint64(rcd.succeeded, 1)
}

func (rcd *rpcCountData) loadAndClearSucceeded() uint64 {
	return atomic.SwapUint64(rcd.succeeded, 0)
}

func (rcd *rpcCountData) incrErrored() {
	atomic.AddUint64(rcd.errored, 1)
}

func (rcd *rpcCountData) loadAndClearErrored() uint64 {
	return atomic.SwapUint64(rcd.errored, 0)
}

func (rcd *rpcCountData) incrInProgress() {
	atomic.AddUint64(rcd.inProgress, 1)
}

func (rcd *rpcCountData) decrInProgress() {
	atomic.AddUint64(rcd.inProgress, negativeOneUInt64) // atomic.Add(x, -1)
}

func (rcd *rpcCountData) loadInProgress() uint64 {
	return atomic.LoadUint64(rcd.inProgress) // The in-progress count is not cleared when read.
}
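
// The decrement works because unsigned addition wraps around: adding
// negativeOneUInt64 (^uint64(0), the two's-complement bit pattern of -1) to a
// uint64 is equivalent to subtracting one. A minimal demonstration
// (illustrative):
//
//	var n uint64 = 5
//	atomic.AddUint64(&n, ^uint64(0)) // n is now 4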

func (rcd *rpcCountData) addServerLoad(name string, d float64) {
	loads, ok := rcd.serverLoads.Load(name)
	if !ok {
		tl := newRPCLoadData()
		loads, _ = rcd.serverLoads.LoadOrStore(name, tl)
	}
	loads.(*rpcLoadData).add(d)
}

// rpcLoadData holds data for server loads (from trailers or OOB reports).
// Fields in this struct must be updated consistently.
//
// The current solution is to hold a lock, which could cause contention. To fix
// this, shard the serverLoads map in rpcCountData.
type rpcLoadData struct {
	mu    sync.Mutex
	sum   float64
	count uint64
}

func newRPCLoadData() *rpcLoadData {
	return &rpcLoadData{}
}

func (rld *rpcLoadData) add(v float64) {
	rld.mu.Lock()
	rld.sum += v
	rld.count++
	rld.mu.Unlock()
}

func (rld *rpcLoadData) loadAndClear() (s float64, c uint64) {
	rld.mu.Lock()
	s = rld.sum
	rld.sum = 0
	c = rld.count
	rld.count = 0
	rld.mu.Unlock()
	return
}