go.temporal.io/server@v1.23.0/common/rpc/interceptor/telemetry.go (about)

     1  // The MIT License
     2  //
     3  // Copyright (c) 2020 Temporal Technologies Inc.  All rights reserved.
     4  //
     5  // Copyright (c) 2020 Uber Technologies, Inc.
     6  //
     7  // Permission is hereby granted, free of charge, to any person obtaining a copy
     8  // of this software and associated documentation files (the "Software"), to deal
     9  // in the Software without restriction, including without limitation the rights
    10  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    11  // copies of the Software, and to permit persons to whom the Software is
    12  // furnished to do so, subject to the following conditions:
    13  //
    14  // The above copyright notice and this permission notice shall be included in
    15  // all copies or substantial portions of the Software.
    16  //
    17  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    18  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    19  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    20  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    21  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    22  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    23  // THE SOFTWARE.
    24  
    25  package interceptor
    26  
    27  import (
    28  	"context"
    29  	"strings"
    30  	"time"
    31  
    32  	"go.temporal.io/api/enums/v1"
    33  	"go.temporal.io/api/serviceerror"
    34  	"go.temporal.io/api/workflowservice/v1"
    35  	"google.golang.org/grpc"
    36  
    37  	"go.temporal.io/server/common"
    38  	"go.temporal.io/server/common/api"
    39  	"go.temporal.io/server/common/log"
    40  	"go.temporal.io/server/common/log/tag"
    41  	"go.temporal.io/server/common/metrics"
    42  	"go.temporal.io/server/common/namespace"
    43  	serviceerrors "go.temporal.io/server/common/serviceerror"
    44  )
    45  
    46  type (
    47  	metricsContextKey struct{}
    48  
    49  	TelemetryInterceptor struct {
    50  		namespaceRegistry namespace.Registry
    51  		metricsHandler    metrics.Handler
    52  		logger            log.Logger
    53  	}
    54  )
    55  
    56  var (
    57  	metricsCtxKey = metricsContextKey{}
    58  
    59  	_ grpc.UnaryServerInterceptor  = (*TelemetryInterceptor)(nil).UnaryIntercept
    60  	_ grpc.StreamServerInterceptor = (*TelemetryInterceptor)(nil).StreamIntercept
    61  )
    62  
    63  // static variables used to emit action metrics.
    64  var (
    65  	respondWorkflowTaskCompleted = "RespondWorkflowTaskCompleted"
    66  	pollActivityTaskQueue        = "PollActivityTaskQueue"
    67  
    68  	grpcActions = map[string]struct{}{
    69  		"QueryWorkflow":                    {},
    70  		"RecordActivityTaskHeartbeat":      {},
    71  		"RecordActivityTaskHeartbeatById":  {},
    72  		"ResetWorkflowExecution":           {},
    73  		"StartWorkflowExecution":           {},
    74  		"SignalWorkflowExecution":          {},
    75  		"SignalWithStartWorkflowExecution": {},
    76  		"RespondWorkflowTaskCompleted":     {},
    77  		"PollActivityTaskQueue":            {},
    78  		"CreateSchedule":                   {},
    79  		"UpdateSchedule":                   {},
    80  		"DeleteSchedule":                   {},
    81  		"PatchSchedule":                    {},
    82  		"UpdateWorkflowExecution":          {},
    83  	}
    84  
    85  	commandActions = map[enums.CommandType]struct{}{
    86  		enums.COMMAND_TYPE_RECORD_MARKER:                      {},
    87  		enums.COMMAND_TYPE_START_TIMER:                        {},
    88  		enums.COMMAND_TYPE_SCHEDULE_ACTIVITY_TASK:             {},
    89  		enums.COMMAND_TYPE_START_CHILD_WORKFLOW_EXECUTION:     {},
    90  		enums.COMMAND_TYPE_SIGNAL_EXTERNAL_WORKFLOW_EXECUTION: {},
    91  		enums.COMMAND_TYPE_UPSERT_WORKFLOW_SEARCH_ATTRIBUTES:  {},
    92  		enums.COMMAND_TYPE_MODIFY_WORKFLOW_PROPERTIES:         {},
    93  		enums.COMMAND_TYPE_CONTINUE_AS_NEW_WORKFLOW_EXECUTION: {},
    94  	}
    95  )
    96  
    97  func NewTelemetryInterceptor(
    98  	namespaceRegistry namespace.Registry,
    99  	metricsHandler metrics.Handler,
   100  	logger log.Logger,
   101  ) *TelemetryInterceptor {
   102  	return &TelemetryInterceptor{
   103  		namespaceRegistry: namespaceRegistry,
   104  		metricsHandler:    metricsHandler,
   105  		logger:            logger,
   106  	}
   107  }
   108  
   109  // Use this method to override scope used for reporting a metric.
   110  // Ideally this method should never be used.
   111  func (ti *TelemetryInterceptor) unaryOverrideOperationTag(fullName, operation string, req interface{}) string {
   112  	if strings.HasPrefix(fullName, api.WorkflowServicePrefix) {
   113  		// GetWorkflowExecutionHistory method handles both long poll and regular calls.
   114  		// Current plan is to eventually split GetWorkflowExecutionHistory into two APIs,
   115  		// remove this "if" case when that is done.
   116  		if operation == metrics.FrontendGetWorkflowExecutionHistoryScope {
   117  			request := req.(*workflowservice.GetWorkflowExecutionHistoryRequest)
   118  			if request.GetWaitNewEvent() {
   119  				return metrics.FrontendPollWorkflowExecutionHistoryScope
   120  			}
   121  		}
   122  		return operation
   123  	}
   124  	return ti.overrideOperationTag(fullName, operation)
   125  }
   126  
   127  // Use this method to override scope used for reporting a metric.
   128  // Ideally this method should never be used.
   129  func (ti *TelemetryInterceptor) overrideOperationTag(fullName, operation string) string {
   130  	// prepend Operator prefix to Operator APIs
   131  	if strings.HasPrefix(fullName, api.OperatorServicePrefix) {
   132  		return "Operator" + operation
   133  	}
   134  	// prepend Admin prefix to Admin APIs
   135  	if strings.HasPrefix(fullName, api.AdminServicePrefix) {
   136  		return "Admin" + operation
   137  	}
   138  	return operation
   139  }
   140  
   141  func (ti *TelemetryInterceptor) UnaryIntercept(
   142  	ctx context.Context,
   143  	req interface{},
   144  	info *grpc.UnaryServerInfo,
   145  	handler grpc.UnaryHandler,
   146  ) (interface{}, error) {
   147  	_, methodName := SplitMethodName(info.FullMethod)
   148  	metricsHandler, logTags := ti.unaryMetricsHandlerLogTags(req, info.FullMethod, methodName)
   149  
   150  	ctx = context.WithValue(ctx, metricsCtxKey, metricsHandler)
   151  	metrics.ServiceRequests.With(metricsHandler).Record(1)
   152  
   153  	startTime := time.Now().UTC()
   154  	userLatencyDuration := time.Duration(0)
   155  	defer func() {
   156  		latency := time.Since(startTime)
   157  		metricsHandler.Timer(metrics.ServiceLatency.Name()).Record(latency)
   158  		noUserLatency := latency - userLatencyDuration
   159  		if noUserLatency < 0 {
   160  			noUserLatency = 0
   161  		}
   162  		metricsHandler.Timer(metrics.ServiceLatencyNoUserLatency.Name()).Record(noUserLatency)
   163  	}()
   164  
   165  	resp, err := handler(ctx, req)
   166  
   167  	if val, ok := metrics.ContextCounterGet(ctx, metrics.HistoryWorkflowExecutionCacheLatency.Name()); ok {
   168  		userLatencyDuration = time.Duration(val)
   169  		startTime.Add(userLatencyDuration)
   170  		metricsHandler.Timer(metrics.ServiceLatencyUserLatency.Name()).Record(userLatencyDuration)
   171  	}
   172  
   173  	if err != nil {
   174  		ti.handleError(metricsHandler, logTags, err)
   175  		return nil, err
   176  	}
   177  
   178  	// emit action metrics only after successful calls
   179  	ti.emitActionMetric(methodName, info.FullMethod, req, metricsHandler, resp)
   180  
   181  	return resp, nil
   182  }
   183  
   184  func (ti *TelemetryInterceptor) StreamIntercept(
   185  	service interface{},
   186  	serverStream grpc.ServerStream,
   187  	info *grpc.StreamServerInfo,
   188  	handler grpc.StreamHandler,
   189  ) error {
   190  	_, methodName := SplitMethodName(info.FullMethod)
   191  	metricsHandler, logTags := ti.streamMetricsHandlerLogTags(info.FullMethod, methodName)
   192  	metricsHandler.Counter(metrics.ServiceRequests.Name()).Record(1)
   193  
   194  	err := handler(service, serverStream)
   195  	if err != nil {
   196  		ti.handleError(metricsHandler, logTags, err)
   197  		return err
   198  	}
   199  	return nil
   200  }
   201  
   202  func (ti *TelemetryInterceptor) emitActionMetric(
   203  	methodName string,
   204  	fullName string,
   205  	req interface{},
   206  	metricsHandler metrics.Handler,
   207  	result interface{},
   208  ) {
   209  	if _, ok := grpcActions[methodName]; !ok || !strings.HasPrefix(fullName, api.WorkflowServicePrefix) {
   210  		// grpcActions checks that methodName is the one that we care about, and we only care about WorkflowService.
   211  		return
   212  	}
   213  
   214  	switch methodName {
   215  	case respondWorkflowTaskCompleted:
   216  		// handle commands
   217  		completedRequest, ok := req.(*workflowservice.RespondWorkflowTaskCompletedRequest)
   218  		if !ok {
   219  			return
   220  		}
   221  
   222  		hasMarker := false
   223  		for _, command := range completedRequest.Commands {
   224  			if _, ok := commandActions[command.CommandType]; ok {
   225  				switch command.CommandType {
   226  				case enums.COMMAND_TYPE_RECORD_MARKER:
   227  					// handle RecordMarker command, they are used for localActivity, sideEffect, versioning etc.
   228  					hasMarker = true
   229  				case enums.COMMAND_TYPE_START_CHILD_WORKFLOW_EXECUTION:
   230  					// Each child workflow counts as 2 actions. We use separate tags to track them separately.
   231  					metricsHandler.Counter(metrics.ActionCounter.Name()).Record(1, metrics.ActionType("command_"+command.CommandType.String()))
   232  					metricsHandler.Counter(metrics.ActionCounter.Name()).Record(1, metrics.ActionType("command_"+command.CommandType.String()+"_Extra"))
   233  				default:
   234  					// handle all other command action
   235  					metricsHandler.Counter(metrics.ActionCounter.Name()).Record(1, metrics.ActionType("command_"+command.CommandType.String()))
   236  				}
   237  			}
   238  		}
   239  		if hasMarker {
   240  			// Emit separate action metric for batch of markers.
   241  			// One workflow task response may contain multiple marker commands. Each marker will emit one
   242  			// command_RecordMarker_Xxx action metric. Depending on pricing model, you may want to ignore all individual
   243  			// command_RecordMarker_Xxx and use command_BatchMarkers instead.
   244  			metricsHandler.Counter(metrics.ActionCounter.Name()).Record(1, metrics.ActionType("command_BatchMarkers"))
   245  		}
   246  
   247  	case pollActivityTaskQueue:
   248  		// handle activity retries
   249  		activityPollResponse, ok := result.(*workflowservice.PollActivityTaskQueueResponse)
   250  		if !ok {
   251  			return
   252  		}
   253  		if activityPollResponse == nil || len(activityPollResponse.TaskToken) == 0 {
   254  			// empty response
   255  			return
   256  		}
   257  		if activityPollResponse.Attempt > 1 {
   258  			metricsHandler.Counter(metrics.ActionCounter.Name()).Record(1, metrics.ActionType("activity_retry"))
   259  		}
   260  
   261  	default:
   262  		// grpc action
   263  		metricsHandler.Counter(metrics.ActionCounter.Name()).Record(1, metrics.ActionType("grpc_"+methodName))
   264  	}
   265  }
   266  
   267  func (ti *TelemetryInterceptor) unaryMetricsHandlerLogTags(
   268  	req interface{},
   269  	fullMethod string,
   270  	methodName string,
   271  ) (metrics.Handler, []tag.Tag) {
   272  	overridedMethodName := ti.unaryOverrideOperationTag(fullMethod, methodName, req)
   273  
   274  	nsName := MustGetNamespaceName(ti.namespaceRegistry, req)
   275  	if nsName == "" {
   276  		return ti.metricsHandler.WithTags(metrics.OperationTag(overridedMethodName), metrics.NamespaceUnknownTag()),
   277  			[]tag.Tag{tag.Operation(overridedMethodName)}
   278  	}
   279  	return ti.metricsHandler.WithTags(metrics.OperationTag(overridedMethodName), metrics.NamespaceTag(nsName.String())),
   280  		[]tag.Tag{tag.Operation(overridedMethodName), tag.WorkflowNamespace(nsName.String())}
   281  }
   282  
   283  func (ti *TelemetryInterceptor) streamMetricsHandlerLogTags(
   284  	fullMethod string,
   285  	methodName string,
   286  ) (metrics.Handler, []tag.Tag) {
   287  	overridedMethodName := ti.overrideOperationTag(fullMethod, methodName)
   288  	return ti.metricsHandler.WithTags(
   289  		metrics.OperationTag(overridedMethodName),
   290  		metrics.NamespaceUnknownTag(),
   291  	), []tag.Tag{tag.Operation(overridedMethodName)}
   292  }
   293  
   294  func (ti *TelemetryInterceptor) handleError(
   295  	metricsHandler metrics.Handler,
   296  	logTags []tag.Tag,
   297  	err error,
   298  ) {
   299  
   300  	metricsHandler.Counter(metrics.ServiceErrorWithType.Name()).Record(1, metrics.ServiceErrorTypeTag(err))
   301  
   302  	if common.IsContextDeadlineExceededErr(err) || common.IsContextCanceledErr(err) {
   303  		return
   304  	}
   305  
   306  	switch err := err.(type) {
   307  	// we emit service_error_with_type metrics, no need to emit specific metric for these known error types.
   308  	case *serviceerror.AlreadyExists,
   309  		*serviceerror.CancellationAlreadyRequested,
   310  		*serviceerror.FailedPrecondition,
   311  		*serviceerror.NamespaceInvalidState,
   312  		*serviceerror.NamespaceNotActive,
   313  		*serviceerror.NamespaceNotFound,
   314  		*serviceerror.NamespaceAlreadyExists,
   315  		*serviceerror.InvalidArgument,
   316  		*serviceerror.WorkflowExecutionAlreadyStarted,
   317  		*serviceerror.WorkflowNotReady,
   318  		*serviceerror.NotFound,
   319  		*serviceerror.QueryFailed,
   320  		*serviceerror.ClientVersionNotSupported,
   321  		*serviceerror.ServerVersionNotSupported,
   322  		*serviceerror.PermissionDenied,
   323  		*serviceerror.NewerBuildExists,
   324  		*serviceerrors.StickyWorkerUnavailable,
   325  		*serviceerrors.ShardOwnershipLost,
   326  		*serviceerrors.TaskAlreadyStarted,
   327  		*serviceerrors.RetryReplication:
   328  		// no-op
   329  
   330  	// specific metric for resource exhausted error with throttle reason
   331  	case *serviceerror.ResourceExhausted:
   332  		metricsHandler.Counter(metrics.ServiceErrResourceExhaustedCounter.Name()).Record(1, metrics.ResourceExhaustedCauseTag(err.Cause))
   333  
   334  	// Any other errors are treated as ServiceFailures against SLA.
   335  	// Including below known errors and any other unknown errors.
   336  	//  *serviceerror.DataLoss,
   337  	//  *serviceerror.Internal
   338  	//	*serviceerror.Unavailable:
   339  	default:
   340  		metricsHandler.Counter(metrics.ServiceFailures.Name()).Record(1)
   341  		ti.logger.Error("service failures", append(logTags, tag.Error(err))...)
   342  	}
   343  }
   344  
   345  func GetMetricsHandlerFromContext(
   346  	ctx context.Context,
   347  	logger log.Logger,
   348  ) metrics.Handler {
   349  	handler, ok := ctx.Value(metricsCtxKey).(metrics.Handler)
   350  	if !ok {
   351  		logger.Error("unable to get metrics scope")
   352  		return metrics.NoopMetricsHandler
   353  	}
   354  	return handler
   355  }