go.temporal.io/server@v1.23.0/common/persistence/health_signal_aggregator.go

// The MIT License
//
// Copyright (c) 2020 Temporal Technologies Inc. All rights reserved.
//
// Copyright (c) 2020 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package persistence

import (
	"sync"
	"sync/atomic"
	"time"

	"go.temporal.io/server/common"
	"go.temporal.io/server/common/aggregate"
	"go.temporal.io/server/common/dynamicconfig"
	"go.temporal.io/server/common/log"
	"go.temporal.io/server/common/log/tag"
	"go.temporal.io/server/common/metrics"
)

const (
	emitMetricsInterval = 30 * time.Second
)

type (
	HealthSignalAggregator interface {
		Record(callerSegment int32, namespace string, latency time.Duration, err error)
		AverageLatency() float64
		ErrorRatio() float64
		Start()
		Stop()
	}

	HealthSignalAggregatorImpl struct {
		status     int32
		shutdownCh chan struct{}

		// map of shardID -> map of namespace -> request count
		requestCounts map[int32]map[string]int64
		requestsLock  sync.Mutex

		aggregationEnabled bool
		latencyAverage     aggregate.MovingWindowAverage
		errorRatio         aggregate.MovingWindowAverage

		metricsHandler            metrics.Handler
		emitMetricsTimer          *time.Ticker
		perShardRPSWarnLimit      dynamicconfig.IntPropertyFn
		perShardPerNsRPSWarnLimit dynamicconfig.FloatPropertyFn

		logger log.Logger
	}
)

func NewHealthSignalAggregatorImpl(
	aggregationEnabled bool,
	windowSize time.Duration,
	maxBufferSize int,
	metricsHandler metrics.Handler,
	perShardRPSWarnLimit dynamicconfig.IntPropertyFn,
	perShardPerNsRPSWarnLimit dynamicconfig.FloatPropertyFn,
	logger log.Logger,
) *HealthSignalAggregatorImpl {
	ret := &HealthSignalAggregatorImpl{
		status:                    common.DaemonStatusInitialized,
		shutdownCh:                make(chan struct{}),
		requestCounts:             make(map[int32]map[string]int64),
		metricsHandler:            metricsHandler,
		emitMetricsTimer:          time.NewTicker(emitMetricsInterval),
		perShardRPSWarnLimit:      perShardRPSWarnLimit,
		perShardPerNsRPSWarnLimit: perShardPerNsRPSWarnLimit,
		logger:                    logger,
		aggregationEnabled:        aggregationEnabled,
	}

	if aggregationEnabled {
		ret.latencyAverage = aggregate.NewMovingWindowAvgImpl(windowSize, maxBufferSize)
		ret.errorRatio = aggregate.NewMovingWindowAvgImpl(windowSize, maxBufferSize)
	} else {
		ret.latencyAverage = aggregate.NoopMovingWindowAverage
		ret.errorRatio = aggregate.NoopMovingWindowAverage
	}

	return ret
}

func (s *HealthSignalAggregatorImpl) Start() {
	if !atomic.CompareAndSwapInt32(&s.status, common.DaemonStatusInitialized, common.DaemonStatusStarted) {
		return
	}
	go s.emitMetricsLoop()
}

func (s *HealthSignalAggregatorImpl) Stop() {
	if !atomic.CompareAndSwapInt32(&s.status, common.DaemonStatusStarted, common.DaemonStatusStopped) {
		return
	}
	close(s.shutdownCh)
	s.emitMetricsTimer.Stop()
}

func (s *HealthSignalAggregatorImpl) Record(callerSegment int32, namespace string, latency time.Duration, err error) {
	if s.aggregationEnabled {
		s.latencyAverage.Record(latency.Milliseconds())

		if isUnhealthyError(err) {
			s.errorRatio.Record(1)
		} else {
			s.errorRatio.Record(0)
		}
	}

	if callerSegment != CallerSegmentMissing {
		s.incrementShardRequestCount(callerSegment, namespace)
	}
}

func (s *HealthSignalAggregatorImpl) AverageLatency() float64 {
	return s.latencyAverage.Average()
}

func (s *HealthSignalAggregatorImpl) ErrorRatio() float64 {
	return s.errorRatio.Average()
}

func (s *HealthSignalAggregatorImpl) incrementShardRequestCount(shardID int32, namespace string) {
	s.requestsLock.Lock()
	defer s.requestsLock.Unlock()
	if s.requestCounts[shardID] == nil {
		s.requestCounts[shardID] = make(map[string]int64)
	}
	s.requestCounts[shardID][namespace]++
}

func (s *HealthSignalAggregatorImpl) emitMetricsLoop() {
	for {
		select {
		case <-s.shutdownCh:
			return
		case <-s.emitMetricsTimer.C:
			s.requestsLock.Lock()
			requestCounts := s.requestCounts
			s.requestCounts = make(map[int32]map[string]int64, len(requestCounts))
			s.requestsLock.Unlock()

			for shardID, requestCountPerNS := range requestCounts {
				shardRequestCount := int64(0)
				for namespace, count := range requestCountPerNS {
					shardRequestCount += count
					shardRPSPerNS := int64(float64(count) / emitMetricsInterval.Seconds())
					if s.perShardPerNsRPSWarnLimit() > 0.0 && shardRPSPerNS > int64(s.perShardPerNsRPSWarnLimit()*float64(s.perShardRPSWarnLimit())) {
						s.logger.Warn("Per shard per namespace RPS warn limit exceeded", tag.ShardID(shardID), tag.WorkflowNamespace(namespace), tag.RPS(shardRPSPerNS))
					}
				}

				shardRPS := int64(float64(shardRequestCount) / emitMetricsInterval.Seconds())
				s.metricsHandler.Histogram(metrics.PersistenceShardRPS.Name(), metrics.PersistenceShardRPS.Unit()).Record(shardRPS)
				if shardRPS > int64(s.perShardRPSWarnLimit()) {
					s.logger.Warn("Per shard RPS warn limit exceeded", tag.ShardID(shardID), tag.RPS(shardRPS))
				}
			}
		}
	}
}

func isUnhealthyError(err error) bool {
	if err == nil {
		return false
	}
	if common.IsContextCanceledErr(err) {
		return true
	}

	switch err.(type) {
	case *AppendHistoryTimeoutError,
		*TimeoutError:
		return true

	default:
		return false
	}
}
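Below is a minimal usage sketch, separate from the file above, showing how the aggregator might be wired up and queried. It assumes metrics.NoopMetricsHandler, log.NewNoopLogger, and the TimeoutError struct literal are available in this module version; the window size, buffer size, warn limits, and the dynamic-config closures are illustrative values only, not Temporal defaults.

package persistence_test

import (
	"time"

	"go.temporal.io/server/common/dynamicconfig"
	"go.temporal.io/server/common/log"
	"go.temporal.io/server/common/metrics"
	"go.temporal.io/server/common/persistence"
)

// wireHealthSignals sketches construction and use of the aggregator.
func wireHealthSignals() {
	aggregator := persistence.NewHealthSignalAggregatorImpl(
		true,           // aggregationEnabled: keep moving-window latency/error averages
		10*time.Second, // windowSize for the moving-window aggregates (illustrative)
		200,            // maxBufferSize for the moving-window aggregates (illustrative)
		metrics.NoopMetricsHandler,
		dynamicconfig.IntPropertyFn(func() int { return 100 }),       // per-shard RPS warn limit
		dynamicconfig.FloatPropertyFn(func() float64 { return 0.5 }), // per-shard-per-namespace fraction of that limit
		log.NewNoopLogger(),
	)
	aggregator.Start() // starts the 30s metrics-emission loop
	defer aggregator.Stop()

	// Record one healthy and one unhealthy persistence call against shard 1.
	aggregator.Record(1, "default", 15*time.Millisecond, nil)
	aggregator.Record(1, "default", 2*time.Second, &persistence.TimeoutError{Msg: "persistence timeout"})

	_ = aggregator.AverageLatency() // mean recorded latency in milliseconds over the window
	_ = aggregator.ErrorRatio()     // fraction of recorded calls classified as unhealthy
}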