go.temporal.io/server@v1.23.0/common/rpc/interceptor/telemetry.go (about) 1 // The MIT License 2 // 3 // Copyright (c) 2020 Temporal Technologies Inc. All rights reserved. 4 // 5 // Copyright (c) 2020 Uber Technologies, Inc. 6 // 7 // Permission is hereby granted, free of charge, to any person obtaining a copy 8 // of this software and associated documentation files (the "Software"), to deal 9 // in the Software without restriction, including without limitation the rights 10 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 // copies of the Software, and to permit persons to whom the Software is 12 // furnished to do so, subject to the following conditions: 13 // 14 // The above copyright notice and this permission notice shall be included in 15 // all copies or substantial portions of the Software. 16 // 17 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23 // THE SOFTWARE. 24 25 package interceptor 26 27 import ( 28 "context" 29 "strings" 30 "time" 31 32 "go.temporal.io/api/enums/v1" 33 "go.temporal.io/api/serviceerror" 34 "go.temporal.io/api/workflowservice/v1" 35 "google.golang.org/grpc" 36 37 "go.temporal.io/server/common" 38 "go.temporal.io/server/common/api" 39 "go.temporal.io/server/common/log" 40 "go.temporal.io/server/common/log/tag" 41 "go.temporal.io/server/common/metrics" 42 "go.temporal.io/server/common/namespace" 43 serviceerrors "go.temporal.io/server/common/serviceerror" 44 ) 45 46 type ( 47 metricsContextKey struct{} 48 49 TelemetryInterceptor struct { 50 namespaceRegistry namespace.Registry 51 metricsHandler metrics.Handler 52 logger log.Logger 53 } 54 ) 55 56 var ( 57 metricsCtxKey = metricsContextKey{} 58 59 _ grpc.UnaryServerInterceptor = (*TelemetryInterceptor)(nil).UnaryIntercept 60 _ grpc.StreamServerInterceptor = (*TelemetryInterceptor)(nil).StreamIntercept 61 ) 62 63 // static variables used to emit action metrics. 64 var ( 65 respondWorkflowTaskCompleted = "RespondWorkflowTaskCompleted" 66 pollActivityTaskQueue = "PollActivityTaskQueue" 67 68 grpcActions = map[string]struct{}{ 69 "QueryWorkflow": {}, 70 "RecordActivityTaskHeartbeat": {}, 71 "RecordActivityTaskHeartbeatById": {}, 72 "ResetWorkflowExecution": {}, 73 "StartWorkflowExecution": {}, 74 "SignalWorkflowExecution": {}, 75 "SignalWithStartWorkflowExecution": {}, 76 "RespondWorkflowTaskCompleted": {}, 77 "PollActivityTaskQueue": {}, 78 "CreateSchedule": {}, 79 "UpdateSchedule": {}, 80 "DeleteSchedule": {}, 81 "PatchSchedule": {}, 82 "UpdateWorkflowExecution": {}, 83 } 84 85 commandActions = map[enums.CommandType]struct{}{ 86 enums.COMMAND_TYPE_RECORD_MARKER: {}, 87 enums.COMMAND_TYPE_START_TIMER: {}, 88 enums.COMMAND_TYPE_SCHEDULE_ACTIVITY_TASK: {}, 89 enums.COMMAND_TYPE_START_CHILD_WORKFLOW_EXECUTION: {}, 90 enums.COMMAND_TYPE_SIGNAL_EXTERNAL_WORKFLOW_EXECUTION: {}, 91 enums.COMMAND_TYPE_UPSERT_WORKFLOW_SEARCH_ATTRIBUTES: {}, 92 enums.COMMAND_TYPE_MODIFY_WORKFLOW_PROPERTIES: {}, 93 enums.COMMAND_TYPE_CONTINUE_AS_NEW_WORKFLOW_EXECUTION: {}, 94 } 95 ) 96 97 func NewTelemetryInterceptor( 98 namespaceRegistry namespace.Registry, 99 metricsHandler metrics.Handler, 100 logger log.Logger, 101 ) *TelemetryInterceptor { 102 return &TelemetryInterceptor{ 103 namespaceRegistry: namespaceRegistry, 104 metricsHandler: metricsHandler, 105 logger: logger, 106 } 107 } 108 109 // Use this method to override scope used for reporting a metric. 110 // Ideally this method should never be used. 111 func (ti *TelemetryInterceptor) unaryOverrideOperationTag(fullName, operation string, req interface{}) string { 112 if strings.HasPrefix(fullName, api.WorkflowServicePrefix) { 113 // GetWorkflowExecutionHistory method handles both long poll and regular calls. 114 // Current plan is to eventually split GetWorkflowExecutionHistory into two APIs, 115 // remove this "if" case when that is done. 116 if operation == metrics.FrontendGetWorkflowExecutionHistoryScope { 117 request := req.(*workflowservice.GetWorkflowExecutionHistoryRequest) 118 if request.GetWaitNewEvent() { 119 return metrics.FrontendPollWorkflowExecutionHistoryScope 120 } 121 } 122 return operation 123 } 124 return ti.overrideOperationTag(fullName, operation) 125 } 126 127 // Use this method to override scope used for reporting a metric. 128 // Ideally this method should never be used. 129 func (ti *TelemetryInterceptor) overrideOperationTag(fullName, operation string) string { 130 // prepend Operator prefix to Operator APIs 131 if strings.HasPrefix(fullName, api.OperatorServicePrefix) { 132 return "Operator" + operation 133 } 134 // prepend Admin prefix to Admin APIs 135 if strings.HasPrefix(fullName, api.AdminServicePrefix) { 136 return "Admin" + operation 137 } 138 return operation 139 } 140 141 func (ti *TelemetryInterceptor) UnaryIntercept( 142 ctx context.Context, 143 req interface{}, 144 info *grpc.UnaryServerInfo, 145 handler grpc.UnaryHandler, 146 ) (interface{}, error) { 147 _, methodName := SplitMethodName(info.FullMethod) 148 metricsHandler, logTags := ti.unaryMetricsHandlerLogTags(req, info.FullMethod, methodName) 149 150 ctx = context.WithValue(ctx, metricsCtxKey, metricsHandler) 151 metrics.ServiceRequests.With(metricsHandler).Record(1) 152 153 startTime := time.Now().UTC() 154 userLatencyDuration := time.Duration(0) 155 defer func() { 156 latency := time.Since(startTime) 157 metricsHandler.Timer(metrics.ServiceLatency.Name()).Record(latency) 158 noUserLatency := latency - userLatencyDuration 159 if noUserLatency < 0 { 160 noUserLatency = 0 161 } 162 metricsHandler.Timer(metrics.ServiceLatencyNoUserLatency.Name()).Record(noUserLatency) 163 }() 164 165 resp, err := handler(ctx, req) 166 167 if val, ok := metrics.ContextCounterGet(ctx, metrics.HistoryWorkflowExecutionCacheLatency.Name()); ok { 168 userLatencyDuration = time.Duration(val) 169 startTime.Add(userLatencyDuration) 170 metricsHandler.Timer(metrics.ServiceLatencyUserLatency.Name()).Record(userLatencyDuration) 171 } 172 173 if err != nil { 174 ti.handleError(metricsHandler, logTags, err) 175 return nil, err 176 } 177 178 // emit action metrics only after successful calls 179 ti.emitActionMetric(methodName, info.FullMethod, req, metricsHandler, resp) 180 181 return resp, nil 182 } 183 184 func (ti *TelemetryInterceptor) StreamIntercept( 185 service interface{}, 186 serverStream grpc.ServerStream, 187 info *grpc.StreamServerInfo, 188 handler grpc.StreamHandler, 189 ) error { 190 _, methodName := SplitMethodName(info.FullMethod) 191 metricsHandler, logTags := ti.streamMetricsHandlerLogTags(info.FullMethod, methodName) 192 metricsHandler.Counter(metrics.ServiceRequests.Name()).Record(1) 193 194 err := handler(service, serverStream) 195 if err != nil { 196 ti.handleError(metricsHandler, logTags, err) 197 return err 198 } 199 return nil 200 } 201 202 func (ti *TelemetryInterceptor) emitActionMetric( 203 methodName string, 204 fullName string, 205 req interface{}, 206 metricsHandler metrics.Handler, 207 result interface{}, 208 ) { 209 if _, ok := grpcActions[methodName]; !ok || !strings.HasPrefix(fullName, api.WorkflowServicePrefix) { 210 // grpcActions checks that methodName is the one that we care about, and we only care about WorkflowService. 211 return 212 } 213 214 switch methodName { 215 case respondWorkflowTaskCompleted: 216 // handle commands 217 completedRequest, ok := req.(*workflowservice.RespondWorkflowTaskCompletedRequest) 218 if !ok { 219 return 220 } 221 222 hasMarker := false 223 for _, command := range completedRequest.Commands { 224 if _, ok := commandActions[command.CommandType]; ok { 225 switch command.CommandType { 226 case enums.COMMAND_TYPE_RECORD_MARKER: 227 // handle RecordMarker command, they are used for localActivity, sideEffect, versioning etc. 228 hasMarker = true 229 case enums.COMMAND_TYPE_START_CHILD_WORKFLOW_EXECUTION: 230 // Each child workflow counts as 2 actions. We use separate tags to track them separately. 231 metricsHandler.Counter(metrics.ActionCounter.Name()).Record(1, metrics.ActionType("command_"+command.CommandType.String())) 232 metricsHandler.Counter(metrics.ActionCounter.Name()).Record(1, metrics.ActionType("command_"+command.CommandType.String()+"_Extra")) 233 default: 234 // handle all other command action 235 metricsHandler.Counter(metrics.ActionCounter.Name()).Record(1, metrics.ActionType("command_"+command.CommandType.String())) 236 } 237 } 238 } 239 if hasMarker { 240 // Emit separate action metric for batch of markers. 241 // One workflow task response may contain multiple marker commands. Each marker will emit one 242 // command_RecordMarker_Xxx action metric. Depending on pricing model, you may want to ignore all individual 243 // command_RecordMarker_Xxx and use command_BatchMarkers instead. 244 metricsHandler.Counter(metrics.ActionCounter.Name()).Record(1, metrics.ActionType("command_BatchMarkers")) 245 } 246 247 case pollActivityTaskQueue: 248 // handle activity retries 249 activityPollResponse, ok := result.(*workflowservice.PollActivityTaskQueueResponse) 250 if !ok { 251 return 252 } 253 if activityPollResponse == nil || len(activityPollResponse.TaskToken) == 0 { 254 // empty response 255 return 256 } 257 if activityPollResponse.Attempt > 1 { 258 metricsHandler.Counter(metrics.ActionCounter.Name()).Record(1, metrics.ActionType("activity_retry")) 259 } 260 261 default: 262 // grpc action 263 metricsHandler.Counter(metrics.ActionCounter.Name()).Record(1, metrics.ActionType("grpc_"+methodName)) 264 } 265 } 266 267 func (ti *TelemetryInterceptor) unaryMetricsHandlerLogTags( 268 req interface{}, 269 fullMethod string, 270 methodName string, 271 ) (metrics.Handler, []tag.Tag) { 272 overridedMethodName := ti.unaryOverrideOperationTag(fullMethod, methodName, req) 273 274 nsName := MustGetNamespaceName(ti.namespaceRegistry, req) 275 if nsName == "" { 276 return ti.metricsHandler.WithTags(metrics.OperationTag(overridedMethodName), metrics.NamespaceUnknownTag()), 277 []tag.Tag{tag.Operation(overridedMethodName)} 278 } 279 return ti.metricsHandler.WithTags(metrics.OperationTag(overridedMethodName), metrics.NamespaceTag(nsName.String())), 280 []tag.Tag{tag.Operation(overridedMethodName), tag.WorkflowNamespace(nsName.String())} 281 } 282 283 func (ti *TelemetryInterceptor) streamMetricsHandlerLogTags( 284 fullMethod string, 285 methodName string, 286 ) (metrics.Handler, []tag.Tag) { 287 overridedMethodName := ti.overrideOperationTag(fullMethod, methodName) 288 return ti.metricsHandler.WithTags( 289 metrics.OperationTag(overridedMethodName), 290 metrics.NamespaceUnknownTag(), 291 ), []tag.Tag{tag.Operation(overridedMethodName)} 292 } 293 294 func (ti *TelemetryInterceptor) handleError( 295 metricsHandler metrics.Handler, 296 logTags []tag.Tag, 297 err error, 298 ) { 299 300 metricsHandler.Counter(metrics.ServiceErrorWithType.Name()).Record(1, metrics.ServiceErrorTypeTag(err)) 301 302 if common.IsContextDeadlineExceededErr(err) || common.IsContextCanceledErr(err) { 303 return 304 } 305 306 switch err := err.(type) { 307 // we emit service_error_with_type metrics, no need to emit specific metric for these known error types. 308 case *serviceerror.AlreadyExists, 309 *serviceerror.CancellationAlreadyRequested, 310 *serviceerror.FailedPrecondition, 311 *serviceerror.NamespaceInvalidState, 312 *serviceerror.NamespaceNotActive, 313 *serviceerror.NamespaceNotFound, 314 *serviceerror.NamespaceAlreadyExists, 315 *serviceerror.InvalidArgument, 316 *serviceerror.WorkflowExecutionAlreadyStarted, 317 *serviceerror.WorkflowNotReady, 318 *serviceerror.NotFound, 319 *serviceerror.QueryFailed, 320 *serviceerror.ClientVersionNotSupported, 321 *serviceerror.ServerVersionNotSupported, 322 *serviceerror.PermissionDenied, 323 *serviceerror.NewerBuildExists, 324 *serviceerrors.StickyWorkerUnavailable, 325 *serviceerrors.ShardOwnershipLost, 326 *serviceerrors.TaskAlreadyStarted, 327 *serviceerrors.RetryReplication: 328 // no-op 329 330 // specific metric for resource exhausted error with throttle reason 331 case *serviceerror.ResourceExhausted: 332 metricsHandler.Counter(metrics.ServiceErrResourceExhaustedCounter.Name()).Record(1, metrics.ResourceExhaustedCauseTag(err.Cause)) 333 334 // Any other errors are treated as ServiceFailures against SLA. 335 // Including below known errors and any other unknown errors. 336 // *serviceerror.DataLoss, 337 // *serviceerror.Internal 338 // *serviceerror.Unavailable: 339 default: 340 metricsHandler.Counter(metrics.ServiceFailures.Name()).Record(1) 341 ti.logger.Error("service failures", append(logTags, tag.Error(err))...) 342 } 343 } 344 345 func GetMetricsHandlerFromContext( 346 ctx context.Context, 347 logger log.Logger, 348 ) metrics.Handler { 349 handler, ok := ctx.Value(metricsCtxKey).(metrics.Handler) 350 if !ok { 351 logger.Error("unable to get metrics scope") 352 return metrics.NoopMetricsHandler 353 } 354 return handler 355 }