go.temporal.io/server@v1.23.0/common/persistence/health_signal_aggregator.go

// The MIT License
//
// Copyright (c) 2020 Temporal Technologies Inc.  All rights reserved.
//
// Copyright (c) 2020 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package persistence

import (
	"sync"
	"sync/atomic"
	"time"

	"go.temporal.io/server/common"
	"go.temporal.io/server/common/aggregate"
	"go.temporal.io/server/common/dynamicconfig"
	"go.temporal.io/server/common/log"
	"go.temporal.io/server/common/log/tag"
	"go.temporal.io/server/common/metrics"
)

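// emitMetricsInterval controls how often the background loop flushes the
// per-shard request counts and emits RPS metrics and warning logs.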
const (
	emitMetricsInterval = 30 * time.Second
)

type (
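	// HealthSignalAggregator collects per-request health signals from the
	// persistence layer: request latency, error outcomes, and per-shard
	// request counts used to detect shards exceeding their RPS warn limits.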
	HealthSignalAggregator interface {
		Record(callerSegment int32, namespace string, latency time.Duration, err error)
		AverageLatency() float64
		ErrorRatio() float64
		Start()
		Stop()
	}

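	// HealthSignalAggregatorImpl is the default HealthSignalAggregator. It is
	// safe for concurrent use: latency and error signals feed moving-window
	// averages, and request counts are guarded by requestsLock until they are
	// drained by the metrics loop.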
	HealthSignalAggregatorImpl struct {
		status     int32
		shutdownCh chan struct{}

		// map of shardID -> map of namespace -> request count
		requestCounts map[int32]map[string]int64
		requestsLock  sync.Mutex

		aggregationEnabled bool
		latencyAverage     aggregate.MovingWindowAverage
		errorRatio         aggregate.MovingWindowAverage

		metricsHandler            metrics.Handler
		emitMetricsTimer          *time.Ticker
		perShardRPSWarnLimit      dynamicconfig.IntPropertyFn
		perShardPerNsRPSWarnLimit dynamicconfig.FloatPropertyFn

		logger log.Logger
	}
)

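// NewHealthSignalAggregatorImpl returns an aggregator in the initialized
// state. When aggregationEnabled is false, latency and error-ratio tracking
// are no-ops and only per-shard request counting remains active.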
func NewHealthSignalAggregatorImpl(
	aggregationEnabled bool,
	windowSize time.Duration,
	maxBufferSize int,
	metricsHandler metrics.Handler,
	perShardRPSWarnLimit dynamicconfig.IntPropertyFn,
	perShardPerNsRPSWarnLimit dynamicconfig.FloatPropertyFn,
	logger log.Logger,
) *HealthSignalAggregatorImpl {
	ret := &HealthSignalAggregatorImpl{
		status:                    common.DaemonStatusInitialized,
		shutdownCh:                make(chan struct{}),
		requestCounts:             make(map[int32]map[string]int64),
		metricsHandler:            metricsHandler,
		emitMetricsTimer:          time.NewTicker(emitMetricsInterval),
		perShardRPSWarnLimit:      perShardRPSWarnLimit,
		perShardPerNsRPSWarnLimit: perShardPerNsRPSWarnLimit,
		logger:                    logger,
		aggregationEnabled:        aggregationEnabled,
	}

	if aggregationEnabled {
		ret.latencyAverage = aggregate.NewMovingWindowAvgImpl(windowSize, maxBufferSize)
		ret.errorRatio = aggregate.NewMovingWindowAvgImpl(windowSize, maxBufferSize)
	} else {
		ret.latencyAverage = aggregate.NoopMovingWindowAverage
		ret.errorRatio = aggregate.NoopMovingWindowAverage
	}

	return ret
}
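
// A minimal wiring sketch, assuming no-op metrics/logging and static dynamic
// config values; the window size, buffer size, and warn limits below are
// illustrative assumptions, not values taken from this file:
//
//	agg := NewHealthSignalAggregatorImpl(
//		true,            // aggregationEnabled: track latency and error ratio
//		10*time.Second,  // windowSize of each moving-window average
//		5000,            // maxBufferSize per moving window
//		metrics.NoopMetricsHandler,
//		dynamicconfig.GetIntPropertyFn(100),   // per-shard RPS warn limit
//		dynamicconfig.GetFloatPropertyFn(0.5), // per-shard-per-namespace fraction of that limit
//		log.NewNoopLogger(),
//	)
//	agg.Start()
//	defer agg.Stop()
//	agg.Record(1, "default", 12*time.Millisecond, nil)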
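// Start launches the metrics-emission loop. It is a no-op unless the
// aggregator is transitioning out of the initialized state.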
func (s *HealthSignalAggregatorImpl) Start() {
	if !atomic.CompareAndSwapInt32(&s.status, common.DaemonStatusInitialized, common.DaemonStatusStarted) {
		return
	}
	go s.emitMetricsLoop()
}

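// Stop signals the metrics loop to exit and stops the ticker. It is a no-op
// unless the aggregator is currently started.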
func (s *HealthSignalAggregatorImpl) Stop() {
	if !atomic.CompareAndSwapInt32(&s.status, common.DaemonStatusStarted, common.DaemonStatusStopped) {
		return
	}
	close(s.shutdownCh)
	s.emitMetricsTimer.Stop()
}

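// Record feeds one persistence request outcome into the aggregator. Latency
// and error signals are recorded only when aggregation is enabled; request
// counts are tracked per shard unless the caller's shard is unknown.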
func (s *HealthSignalAggregatorImpl) Record(callerSegment int32, namespace string, latency time.Duration, err error) {
	if s.aggregationEnabled {
		s.latencyAverage.Record(latency.Milliseconds())

		if isUnhealthyError(err) {
			s.errorRatio.Record(1)
		} else {
			s.errorRatio.Record(0)
		}
	}

	if callerSegment != CallerSegmentMissing {
		s.incrementShardRequestCount(callerSegment, namespace)
	}
}

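// AverageLatency returns the moving-window average request latency in
// milliseconds (backed by a no-op aggregate when aggregation is disabled).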
func (s *HealthSignalAggregatorImpl) AverageLatency() float64 {
	return s.latencyAverage.Average()
}

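// ErrorRatio returns the moving-window fraction of requests that failed with
// an unhealthy error (see isUnhealthyError).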
func (s *HealthSignalAggregatorImpl) ErrorRatio() float64 {
	return s.errorRatio.Average()
}

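// incrementShardRequestCount bumps the request count for the given shard and
// namespace under requestsLock.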
func (s *HealthSignalAggregatorImpl) incrementShardRequestCount(shardID int32, namespace string) {
	s.requestsLock.Lock()
	defer s.requestsLock.Unlock()
	if s.requestCounts[shardID] == nil {
		s.requestCounts[shardID] = make(map[string]int64)
	}
	s.requestCounts[shardID][namespace]++
}

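// emitMetricsLoop runs until Stop is called. On every tick it swaps the
// request-count map for a fresh one, converts the drained counts into
// per-shard and per-shard-per-namespace RPS figures, and emits metrics and
// warning logs for any shard over its configured limits.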
func (s *HealthSignalAggregatorImpl) emitMetricsLoop() {
	for {
		select {
		case <-s.shutdownCh:
			return
		case <-s.emitMetricsTimer.C:
			// Swap in a fresh map under the lock so Record can keep counting
			// while this tick's counts are aggregated below.
			s.requestsLock.Lock()
			requestCounts := s.requestCounts
			s.requestCounts = make(map[int32]map[string]int64, len(requestCounts))
			s.requestsLock.Unlock()

			for shardID, requestCountPerNS := range requestCounts {
				shardRequestCount := int64(0)
				for namespace, count := range requestCountPerNS {
					shardRequestCount += count
					shardRPSPerNS := int64(float64(count) / emitMetricsInterval.Seconds())
					if s.perShardPerNsRPSWarnLimit() > 0.0 && shardRPSPerNS > int64(s.perShardPerNsRPSWarnLimit()*float64(s.perShardRPSWarnLimit())) {
						s.logger.Warn("Per shard per namespace RPS warn limit exceeded", tag.ShardID(shardID), tag.WorkflowNamespace(namespace), tag.RPS(shardRPSPerNS))
					}
				}

				shardRPS := int64(float64(shardRequestCount) / emitMetricsInterval.Seconds())
				s.metricsHandler.Histogram(metrics.PersistenceShardRPS.Name(), metrics.PersistenceShardRPS.Unit()).Record(shardRPS)
				if shardRPS > int64(s.perShardRPSWarnLimit()) {
					s.logger.Warn("Per shard RPS warn limit exceeded", tag.ShardID(shardID), tag.RPS(shardRPS))
				}
			}
		}
	}
}

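// isUnhealthyError reports whether err should count toward the error ratio:
// context cancellations and persistence timeouts do; nil and all other errors
// do not.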
func isUnhealthyError(err error) bool {
	if err == nil {
		return false
	}
	if common.IsContextCanceledErr(err) {
		return true
	}

	switch err.(type) {
	case *AppendHistoryTimeoutError,
		*TimeoutError:
		return true

	default:
		return false
	}
}