github.com/Azure/aad-pod-identity@v1.8.17/pkg/metrics/metrics.go (about)

     1  package metrics
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"sync"
     7  	"time"
     8  
     9  	"go.opencensus.io/stats"
    10  	"go.opencensus.io/stats/view"
    11  	"go.opencensus.io/tag"
    12  	"k8s.io/klog/v2"
    13  )
    14  
// This const block defines the metric names and the exported operation names.
const (
	// Unexported metric names. Each one is used as the name of the
	// corresponding measure declared in this package.
	assignedIdentityAdditionDurationName   = "assigned_identity_addition_duration_seconds"
	assignedIdentityAdditionCountName      = "assigned_identity_addition_count"
	assignedIdentityDeletionDurationName   = "assigned_identity_deletion_duration_seconds"
	assignedIdentityDeletionCountName      = "assigned_identity_deletion_count"
	assignedIdentityUpdateDurationName     = "assigned_identity_update_duration_seconds"
	assignedIdentityUpdateCountName        = "assigned_identity_update_count"
	nmiOperationsDurationName              = "nmi_operations_duration_seconds"
	micCycleDurationName                   = "mic_cycle_duration_seconds"
	micCycleCountName                      = "mic_cycle_count"
	micNewLeaderElectionCountName          = "mic_new_leader_election_count"
	cloudProviderOperationsErrorsCountName = "cloud_provider_operations_errors_count"
	cloudProviderOperationsDurationName    = "cloud_provider_operations_duration_seconds"
	kubernetesAPIOperationsErrorsCountName = "kubernetes_api_operations_errors_count"
	imdsOperationsErrorsCountName          = "imds_operations_errors_count"
	imdsOperationsDurationName             = "imds_operations_duration_seconds"

	// Exported operation names follow.
	// NOTE(review): presumably these are passed by callers as the operation /
	// operationType argument of the Reporter methods — confirm against callers.

	// AdalTokenFromMSIOperationName represents the duration of obtaining a token with MSI.
	AdalTokenFromMSIOperationName = "adal_token_msi" // #nosec

	// AdalTokenFromMSIWithUserAssignedIDOperationName represents the duration of obtaining a token with a user-assigned identity.
	AdalTokenFromMSIWithUserAssignedIDOperationName = "adal_token_msi_userassignedid" // #nosec

	// AdalTokenOperationName represents the duration of obtaining a token.
	AdalTokenOperationName = "adal_token"

	// GetVmssOperationName represents the duration of a GET request to a VMSS instance.
	GetVmssOperationName = "vmss_get"

	// UpdateVMSSOperationName represents the duration of a PATCH request to a VMSS instance.
	UpdateVMSSOperationName = "vmss_update"

	// GetVMOperationName represents the duration of a GET request to a VM instance.
	GetVMOperationName = "vm_get"

	// UpdateVMOperationName represents the duration of a PATCH request to a VM instance.
	UpdateVMOperationName = "vm_update"

	// AssignedIdentityDeletionOperationName represents the duration of an AzureAssignedIdentity deletion.
	AssignedIdentityDeletionOperationName = "assigned_identity_deletion"

	// AssignedIdentityAdditionOperationName represents the duration of an AzureAssignedIdentity addition.
	AssignedIdentityAdditionOperationName = "assigned_identity_addition"

	// AssignedIdentityUpdateOperationName represents the duration of an AzureAssignedIdentity update.
	AssignedIdentityUpdateOperationName = "assigned_identity_update"

	// UpdateAzureAssignedIdentityStatusOperationName represents the status of an AzureAssignedIdentity update operation.
	UpdateAzureAssignedIdentityStatusOperationName = "update_azure_assigned_identity_status"

	// GetPodListOperationName represents the status of a pod list operation.
	GetPodListOperationName = "get_pod_list"

	// GetSecretOperationName represents the status of a secret get operation.
	GetSecretOperationName = "get_secret"
)
    72  
    73  // The following variables are measures
    74  var (
    75  	// AssignedIdentityAdditionDurationM is a measure that tracks the duration in seconds of assigned_identity_addition operations.
    76  	AssignedIdentityAdditionDurationM = stats.Float64(
    77  		assignedIdentityAdditionDurationName,
    78  		"Duration in seconds of assigned identity addition operations",
    79  		stats.UnitMilliseconds)
    80  
    81  	// AssignedIdentityAdditionCountM is a measure that tracks the cumulative number of assigned identity addition operations.
    82  	AssignedIdentityAdditionCountM = stats.Int64(
    83  		assignedIdentityAdditionCountName,
    84  		"Total number of assigned identity addition operations",
    85  		stats.UnitDimensionless)
    86  
    87  	// AssignedIdentityDeletionDurationM is a measure that tracks the duration in seconds of assigned_identity_deletion operations.
    88  	AssignedIdentityDeletionDurationM = stats.Float64(
    89  		assignedIdentityDeletionDurationName,
    90  		"Duration in seconds of assigned identity deletion operations",
    91  		stats.UnitMilliseconds)
    92  
    93  	// AssignedIdentityDeletionCountM is a measure that tracks the cumulative number of assigned identity deletion operations.
    94  	AssignedIdentityDeletionCountM = stats.Int64(assignedIdentityDeletionCountName,
    95  		"Total number of assigned identity deletion operations",
    96  		stats.UnitDimensionless)
    97  
    98  	// NMIOperationsDurationM is a measure that tracks the duration in seconds of nmi operations.
    99  	NMIOperationsDurationM = stats.Float64(
   100  		nmiOperationsDurationName,
   101  		"Duration in seconds for nmi operations",
   102  		stats.UnitMilliseconds)
   103  
   104  	// MICCycleDurationM is a measure that tracks the duration in seconds for single mic sync cycle.
   105  	MICCycleDurationM = stats.Float64(
   106  		micCycleDurationName,
   107  		"Duration in seconds for single mic sync cycle",
   108  		stats.UnitMilliseconds)
   109  
   110  	// MICCycleCountM is a measure that tracks the cumulative number of cycles executed in mic.
   111  	MICCycleCountM = stats.Int64(
   112  		micCycleCountName,
   113  		"Total number of cycles executed in mic",
   114  		stats.UnitDimensionless)
   115  
   116  	// MICNewLeaderElectionCountM is a measure that tracks the cumulative number of new leader election in mic.
   117  	MICNewLeaderElectionCountM = stats.Int64(
   118  		micNewLeaderElectionCountName,
   119  		"Total number of new leader election in mic",
   120  		stats.UnitDimensionless)
   121  
   122  	// CloudProviderOperationsErrorsCountM is a measure that tracks the cumulative number of errors in cloud provider operations.
   123  	CloudProviderOperationsErrorsCountM = stats.Int64(
   124  		cloudProviderOperationsErrorsCountName,
   125  		"Total number of errors in cloud provider operations",
   126  		stats.UnitDimensionless)
   127  
   128  	// CloudProviderOperationsDurationM is a measure that tracks the duration in seconds of CloudProviderOperations operations.
   129  	CloudProviderOperationsDurationM = stats.Float64(
   130  		cloudProviderOperationsDurationName,
   131  		"Duration in seconds of cloudprovider operations",
   132  		stats.UnitMilliseconds)
   133  
   134  	// KubernetesAPIOperationsErrorsCountM is a measure that tracks the cumulative number of errors in cloud provider operations.
   135  	KubernetesAPIOperationsErrorsCountM = stats.Int64(
   136  		kubernetesAPIOperationsErrorsCountName,
   137  		"Total number of errors in kubernetes api operations",
   138  		stats.UnitDimensionless)
   139  
   140  	// ImdsOperationsErrorsCountM is a measure that tracks the cumulative number of errors in imds operations.
   141  	ImdsOperationsErrorsCountM = stats.Int64(
   142  		imdsOperationsErrorsCountName,
   143  		"Total number of errors in imds token operations",
   144  		stats.UnitDimensionless)
   145  
   146  	// ImdsOperationsDurationM is a measure that tracks the duration in seconds of imds operations.
   147  	ImdsOperationsDurationM = stats.Float64(
   148  		imdsOperationsDurationName,
   149  		"Duration in seconds of imds token operations",
   150  		stats.UnitMilliseconds)
   151  
   152  	// AssignedIdentityUpdateDurationM is a measure that tracks the duration in seconds of assigned_identity_update operations.
   153  	AssignedIdentityUpdateDurationM = stats.Float64(
   154  		assignedIdentityUpdateDurationName,
   155  		"Duration in seconds of assigned identity update operations",
   156  		stats.UnitMilliseconds)
   157  
   158  	// AssignedIdentityUpdateCountM is a measure that tracks the cumulative number of assigned identity update operations.
   159  	AssignedIdentityUpdateCountM = stats.Int64(
   160  		assignedIdentityUpdateCountName,
   161  		"Total number of assigned identity update operations",
   162  		stats.UnitDimensionless)
   163  )
   164  
// Tag keys used to label recorded measurements. They appear in the TagKeys
// of the views built by registerViews and are populated by ReportOperation
// (operation_type only) and ReportOperationAndStatus (all four keys).
var (
	operationTypeKey = tag.MustNewKey("operation_type")
	statusCodeKey    = tag.MustNewKey("status_code")
	namespaceKey     = tag.MustNewKey("namespace")
	resourceKey      = tag.MustNewKey("resource")
)
   171  
// componentNamespace is passed as the first argument to newPrometheusExporter
// by RegisterAndExport.
// NOTE(review): presumably used as the Prometheus metric namespace/prefix — confirm.
const componentNamespace = "aadpodidentity"
   173  
   174  // SinceInSeconds gets the time since the specified start in seconds.
   175  func SinceInSeconds(start time.Time) float64 {
   176  	return time.Since(start).Seconds()
   177  }
   178  
   179  // registerViews register views to be collected by exporter
   180  func registerViews() error {
   181  	views := []*view.View{
   182  		{
   183  			Description: AssignedIdentityAdditionDurationM.Description(),
   184  			Measure:     AssignedIdentityAdditionDurationM,
   185  			Aggregation: view.Distribution(0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 10),
   186  		},
   187  		{
   188  			Description: AssignedIdentityAdditionCountM.Description(),
   189  			Measure:     AssignedIdentityAdditionCountM,
   190  			Aggregation: view.Count(),
   191  		},
   192  		{
   193  			Description: AssignedIdentityDeletionDurationM.Description(),
   194  			Measure:     AssignedIdentityDeletionDurationM,
   195  			Aggregation: view.Distribution(0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 10),
   196  		},
   197  		{
   198  			Description: AssignedIdentityDeletionCountM.Description(),
   199  			Measure:     AssignedIdentityDeletionCountM,
   200  			Aggregation: view.Count(),
   201  		},
   202  		{
   203  			Description: NMIOperationsDurationM.Description(),
   204  			Measure:     NMIOperationsDurationM,
   205  			Aggregation: view.Distribution(0.5, 1, 2, 3, 4, 5, 10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 90, 100),
   206  			TagKeys:     []tag.Key{operationTypeKey, statusCodeKey, namespaceKey, resourceKey},
   207  		},
   208  		{
   209  			Description: MICCycleDurationM.Description(),
   210  			Measure:     MICCycleDurationM,
   211  			Aggregation: view.Distribution(0.5, 1, 5, 10, 30, 60, 120, 300, 600, 900, 1200),
   212  		},
   213  		{
   214  			Description: MICCycleCountM.Description(),
   215  			Measure:     MICCycleCountM,
   216  			Aggregation: view.Count(),
   217  		},
   218  		{
   219  			Description: MICNewLeaderElectionCountM.Description(),
   220  			Measure:     MICNewLeaderElectionCountM,
   221  			Aggregation: view.Count(),
   222  		},
   223  		{
   224  			Description: CloudProviderOperationsErrorsCountM.Description(),
   225  			Measure:     CloudProviderOperationsErrorsCountM,
   226  			Aggregation: view.Count(),
   227  			TagKeys:     []tag.Key{operationTypeKey},
   228  		},
   229  		{
   230  			Description: CloudProviderOperationsDurationM.Description(),
   231  			Measure:     CloudProviderOperationsDurationM,
   232  			Aggregation: view.Distribution(0.5, 1, 5, 10, 30, 60, 120, 300, 600, 900, 1200),
   233  			TagKeys:     []tag.Key{operationTypeKey},
   234  		},
   235  		{
   236  			Description: KubernetesAPIOperationsErrorsCountM.Description(),
   237  			Measure:     KubernetesAPIOperationsErrorsCountM,
   238  			Aggregation: view.Count(),
   239  			TagKeys:     []tag.Key{operationTypeKey},
   240  		},
   241  		{
   242  			Description: ImdsOperationsErrorsCountM.Description(),
   243  			Measure:     ImdsOperationsErrorsCountM,
   244  			Aggregation: view.Count(),
   245  			TagKeys:     []tag.Key{operationTypeKey},
   246  		},
   247  		{
   248  			Description: ImdsOperationsDurationM.Description(),
   249  			Measure:     ImdsOperationsDurationM,
   250  			Aggregation: view.Distribution(0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 10),
   251  			TagKeys:     []tag.Key{operationTypeKey},
   252  		},
   253  		{
   254  			Description: AssignedIdentityUpdateDurationM.Description(),
   255  			Measure:     AssignedIdentityUpdateDurationM,
   256  			Aggregation: view.Distribution(0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 10),
   257  		},
   258  		{
   259  			Description: AssignedIdentityUpdateCountM.Description(),
   260  			Measure:     AssignedIdentityUpdateCountM,
   261  			Aggregation: view.Count(),
   262  		},
   263  	}
   264  	err := view.Register(views...)
   265  	return err
   266  }
   267  
// record records the given measurements against ctx by delegating directly
// to stats.Record. It is the single place where this package calls
// stats.Record; all Reporter methods funnel through it.
func record(ctx context.Context, ms ...stats.Measurement) {
	stats.Record(ctx, ms...)
}
   272  
// Reporter is stats reporter in the context.
// It records measurements against a base context created in NewReporter.
type Reporter struct {
	// adding mutex lock to ensure thread safety
	// TODO (aramase) remove this lock after confirming opencensus report
	// call is thread-safe
	//
	// mu is held by every reporting method of Reporter while it records.
	mu sync.Mutex
	// ctx is the base context that measurements are recorded against;
	// the tagging methods derive a per-call context from it.
	ctx context.Context
}
   281  
   282  // NewReporter creates a reporter with new context
   283  func NewReporter() (*Reporter, error) {
   284  	ctx, err := tag.New(
   285  		context.Background(),
   286  	)
   287  	if err != nil {
   288  		return nil, err
   289  	}
   290  	return &Reporter{ctx: ctx, mu: sync.Mutex{}}, nil
   291  }
   292  
   293  // Report records the given measure
   294  func (r *Reporter) Report(ms ...stats.Measurement) {
   295  	r.mu.Lock()
   296  	record(r.ctx, ms...)
   297  	r.mu.Unlock()
   298  }
   299  
   300  // ReportOperationAndStatus records given measurements by operation type, status code for the given namespace and resource.
   301  func (r *Reporter) ReportOperationAndStatus(operationType, statusCode, namespace, resource string, ms ...stats.Measurement) error {
   302  	r.mu.Lock()
   303  	defer r.mu.Unlock()
   304  
   305  	ctx, err := tag.New(
   306  		r.ctx,
   307  		tag.Insert(operationTypeKey, operationType),
   308  		tag.Insert(statusCodeKey, statusCode),
   309  		tag.Insert(namespaceKey, namespace),
   310  		tag.Insert(resourceKey, resource),
   311  	)
   312  	if err != nil {
   313  		return err
   314  	}
   315  	record(ctx, ms...)
   316  	return nil
   317  }
   318  
   319  // ReportOperation records given measurement by operation type.
   320  func (r *Reporter) ReportOperation(operationType string, measurement stats.Measurement) error {
   321  	r.mu.Lock()
   322  	defer r.mu.Unlock()
   323  
   324  	ctx, err := tag.New(
   325  		r.ctx,
   326  		tag.Insert(operationTypeKey, operationType),
   327  	)
   328  	if err != nil {
   329  		return err
   330  	}
   331  	record(ctx, measurement)
   332  	return nil
   333  }
   334  
   335  // RegisterAndExport register the views for the measures and expose via prometheus exporter
   336  func RegisterAndExport(port string) error {
   337  	err := registerViews()
   338  	if err != nil {
   339  		return fmt.Errorf("failed to register views for metrics, error:%v", err)
   340  	}
   341  	klog.Infof("registered views for metric")
   342  	exporter, err := newPrometheusExporter(componentNamespace, port)
   343  	if err != nil {
   344  		return fmt.Errorf("failed to create Prometheus exporter, error: %+v", err)
   345  	}
   346  	view.RegisterExporter(exporter)
   347  	klog.Infof("registered and exported metrics on port %s", port)
   348  	return nil
   349  }
   350  
   351  // ReportIMDSOperationError reports IMDS error count
   352  func (r *Reporter) ReportIMDSOperationError(operation string) error {
   353  	return r.ReportOperation(operation, ImdsOperationsErrorsCountM.M(1))
   354  }
   355  
   356  // ReportIMDSOperationDuration reports IMDS operation duration
   357  func (r *Reporter) ReportIMDSOperationDuration(operation string, duration time.Duration) error {
   358  	return r.ReportOperation(operation, ImdsOperationsDurationM.M(duration.Seconds()))
   359  }
   360  
   361  // ReportCloudProviderOperationError reports cloud provider operation error count
   362  func (r *Reporter) ReportCloudProviderOperationError(operation string) error {
   363  	return r.ReportOperation(operation, CloudProviderOperationsErrorsCountM.M(1))
   364  }
   365  
   366  // ReportCloudProviderOperationDuration reports cloud provider operation duration
   367  func (r *Reporter) ReportCloudProviderOperationDuration(operation string, duration time.Duration) error {
   368  	return r.ReportOperation(operation, CloudProviderOperationsDurationM.M(duration.Seconds()))
   369  }
   370  
   371  // ReportKubernetesAPIOperationError reports kubernetes operation error count
   372  func (r *Reporter) ReportKubernetesAPIOperationError(operation string) error {
   373  	return r.ReportOperation(operation, KubernetesAPIOperationsErrorsCountM.M(1))
   374  }