github.com/Azure/aad-pod-identity@v1.8.17/pkg/metrics/metrics.go (about) 1 package metrics 2 3 import ( 4 "context" 5 "fmt" 6 "sync" 7 "time" 8 9 "go.opencensus.io/stats" 10 "go.opencensus.io/stats/view" 11 "go.opencensus.io/tag" 12 "k8s.io/klog/v2" 13 ) 14 15 // This const block defines the metric names. 16 const ( 17 assignedIdentityAdditionDurationName = "assigned_identity_addition_duration_seconds" 18 assignedIdentityAdditionCountName = "assigned_identity_addition_count" 19 assignedIdentityDeletionDurationName = "assigned_identity_deletion_duration_seconds" 20 assignedIdentityDeletionCountName = "assigned_identity_deletion_count" 21 assignedIdentityUpdateDurationName = "assigned_identity_update_duration_seconds" 22 assignedIdentityUpdateCountName = "assigned_identity_update_count" 23 nmiOperationsDurationName = "nmi_operations_duration_seconds" 24 micCycleDurationName = "mic_cycle_duration_seconds" 25 micCycleCountName = "mic_cycle_count" 26 micNewLeaderElectionCountName = "mic_new_leader_election_count" 27 cloudProviderOperationsErrorsCountName = "cloud_provider_operations_errors_count" 28 cloudProviderOperationsDurationName = "cloud_provider_operations_duration_seconds" 29 kubernetesAPIOperationsErrorsCountName = "kubernetes_api_operations_errors_count" 30 imdsOperationsErrorsCountName = "imds_operations_errors_count" 31 imdsOperationsDurationName = "imds_operations_duration_seconds" 32 33 // AdalTokenFromMSIOperationName represents the duration of obtaining a token with MSI. 34 AdalTokenFromMSIOperationName = "adal_token_msi" // #nosec 35 36 // AdalTokenFromMSIWithUserAssignedIDOperationName represents the duration of obtaining a token with a user-assigned identity. 37 AdalTokenFromMSIWithUserAssignedIDOperationName = "adal_token_msi_userassignedid" // #nosec 38 39 // AdalTokenOperationName represents the duration of obtaining a token. 40 AdalTokenOperationName = "adal_token" 41 42 // GetVmssOperationName represents the duration of a GET request to a VMSS instance. 43 GetVmssOperationName = "vmss_get" 44 45 // UpdateVMSSOperationName represents the duration of a PATCH request to a VMSS instance. 46 UpdateVMSSOperationName = "vmss_update" 47 48 // GetVMOperationName represents the duration of a GET request to a VM instance. 49 GetVMOperationName = "vm_get" 50 51 // UpdateVMOperationName represents the duration of a PATCH request to a VM instance. 52 UpdateVMOperationName = "vm_update" 53 54 // AssignedIdentityDeletionOperationName represents the duration of an AzureAssignedIdentity deletion. 55 AssignedIdentityDeletionOperationName = "assigned_identity_deletion" 56 57 // AssignedIdentityAdditionOperationName represents the duration of an AzureAssignedIdentity addition. 58 AssignedIdentityAdditionOperationName = "assigned_identity_addition" 59 60 // AssignedIdentityUpdateOperationName represents the duration of an AzureAssignedIdentity update. 61 AssignedIdentityUpdateOperationName = "assigned_identity_update" 62 63 // UpdateAzureAssignedIdentityStatusOperationName represents the status of an AzureAssignedIdentity update operation. 64 UpdateAzureAssignedIdentityStatusOperationName = "update_azure_assigned_identity_status" 65 66 // GetPodListOperationName represents the status of a pod list operation. 67 GetPodListOperationName = "get_pod_list" 68 69 // GetSecretOperationName represents the status of a secret get operation. 70 GetSecretOperationName = "get_secret" 71 ) 72 73 // The following variables are measures 74 var ( 75 // AssignedIdentityAdditionDurationM is a measure that tracks the duration in seconds of assigned_identity_addition operations. 76 AssignedIdentityAdditionDurationM = stats.Float64( 77 assignedIdentityAdditionDurationName, 78 "Duration in seconds of assigned identity addition operations", 79 stats.UnitMilliseconds) 80 81 // AssignedIdentityAdditionCountM is a measure that tracks the cumulative number of assigned identity addition operations. 82 AssignedIdentityAdditionCountM = stats.Int64( 83 assignedIdentityAdditionCountName, 84 "Total number of assigned identity addition operations", 85 stats.UnitDimensionless) 86 87 // AssignedIdentityDeletionDurationM is a measure that tracks the duration in seconds of assigned_identity_deletion operations. 88 AssignedIdentityDeletionDurationM = stats.Float64( 89 assignedIdentityDeletionDurationName, 90 "Duration in seconds of assigned identity deletion operations", 91 stats.UnitMilliseconds) 92 93 // AssignedIdentityDeletionCountM is a measure that tracks the cumulative number of assigned identity deletion operations. 94 AssignedIdentityDeletionCountM = stats.Int64(assignedIdentityDeletionCountName, 95 "Total number of assigned identity deletion operations", 96 stats.UnitDimensionless) 97 98 // NMIOperationsDurationM is a measure that tracks the duration in seconds of nmi operations. 99 NMIOperationsDurationM = stats.Float64( 100 nmiOperationsDurationName, 101 "Duration in seconds for nmi operations", 102 stats.UnitMilliseconds) 103 104 // MICCycleDurationM is a measure that tracks the duration in seconds for single mic sync cycle. 105 MICCycleDurationM = stats.Float64( 106 micCycleDurationName, 107 "Duration in seconds for single mic sync cycle", 108 stats.UnitMilliseconds) 109 110 // MICCycleCountM is a measure that tracks the cumulative number of cycles executed in mic. 111 MICCycleCountM = stats.Int64( 112 micCycleCountName, 113 "Total number of cycles executed in mic", 114 stats.UnitDimensionless) 115 116 // MICNewLeaderElectionCountM is a measure that tracks the cumulative number of new leader election in mic. 117 MICNewLeaderElectionCountM = stats.Int64( 118 micNewLeaderElectionCountName, 119 "Total number of new leader election in mic", 120 stats.UnitDimensionless) 121 122 // CloudProviderOperationsErrorsCountM is a measure that tracks the cumulative number of errors in cloud provider operations. 123 CloudProviderOperationsErrorsCountM = stats.Int64( 124 cloudProviderOperationsErrorsCountName, 125 "Total number of errors in cloud provider operations", 126 stats.UnitDimensionless) 127 128 // CloudProviderOperationsDurationM is a measure that tracks the duration in seconds of CloudProviderOperations operations. 129 CloudProviderOperationsDurationM = stats.Float64( 130 cloudProviderOperationsDurationName, 131 "Duration in seconds of cloudprovider operations", 132 stats.UnitMilliseconds) 133 134 // KubernetesAPIOperationsErrorsCountM is a measure that tracks the cumulative number of errors in cloud provider operations. 135 KubernetesAPIOperationsErrorsCountM = stats.Int64( 136 kubernetesAPIOperationsErrorsCountName, 137 "Total number of errors in kubernetes api operations", 138 stats.UnitDimensionless) 139 140 // ImdsOperationsErrorsCountM is a measure that tracks the cumulative number of errors in imds operations. 141 ImdsOperationsErrorsCountM = stats.Int64( 142 imdsOperationsErrorsCountName, 143 "Total number of errors in imds token operations", 144 stats.UnitDimensionless) 145 146 // ImdsOperationsDurationM is a measure that tracks the duration in seconds of imds operations. 147 ImdsOperationsDurationM = stats.Float64( 148 imdsOperationsDurationName, 149 "Duration in seconds of imds token operations", 150 stats.UnitMilliseconds) 151 152 // AssignedIdentityUpdateDurationM is a measure that tracks the duration in seconds of assigned_identity_update operations. 153 AssignedIdentityUpdateDurationM = stats.Float64( 154 assignedIdentityUpdateDurationName, 155 "Duration in seconds of assigned identity update operations", 156 stats.UnitMilliseconds) 157 158 // AssignedIdentityUpdateCountM is a measure that tracks the cumulative number of assigned identity update operations. 159 AssignedIdentityUpdateCountM = stats.Int64( 160 assignedIdentityUpdateCountName, 161 "Total number of assigned identity update operations", 162 stats.UnitDimensionless) 163 ) 164 165 var ( 166 operationTypeKey = tag.MustNewKey("operation_type") 167 statusCodeKey = tag.MustNewKey("status_code") 168 namespaceKey = tag.MustNewKey("namespace") 169 resourceKey = tag.MustNewKey("resource") 170 ) 171 172 const componentNamespace = "aadpodidentity" 173 174 // SinceInSeconds gets the time since the specified start in seconds. 175 func SinceInSeconds(start time.Time) float64 { 176 return time.Since(start).Seconds() 177 } 178 179 // registerViews register views to be collected by exporter 180 func registerViews() error { 181 views := []*view.View{ 182 { 183 Description: AssignedIdentityAdditionDurationM.Description(), 184 Measure: AssignedIdentityAdditionDurationM, 185 Aggregation: view.Distribution(0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 10), 186 }, 187 { 188 Description: AssignedIdentityAdditionCountM.Description(), 189 Measure: AssignedIdentityAdditionCountM, 190 Aggregation: view.Count(), 191 }, 192 { 193 Description: AssignedIdentityDeletionDurationM.Description(), 194 Measure: AssignedIdentityDeletionDurationM, 195 Aggregation: view.Distribution(0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 10), 196 }, 197 { 198 Description: AssignedIdentityDeletionCountM.Description(), 199 Measure: AssignedIdentityDeletionCountM, 200 Aggregation: view.Count(), 201 }, 202 { 203 Description: NMIOperationsDurationM.Description(), 204 Measure: NMIOperationsDurationM, 205 Aggregation: view.Distribution(0.5, 1, 2, 3, 4, 5, 10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 90, 100), 206 TagKeys: []tag.Key{operationTypeKey, statusCodeKey, namespaceKey, resourceKey}, 207 }, 208 { 209 Description: MICCycleDurationM.Description(), 210 Measure: MICCycleDurationM, 211 Aggregation: view.Distribution(0.5, 1, 5, 10, 30, 60, 120, 300, 600, 900, 1200), 212 }, 213 { 214 Description: MICCycleCountM.Description(), 215 Measure: MICCycleCountM, 216 Aggregation: view.Count(), 217 }, 218 { 219 Description: MICNewLeaderElectionCountM.Description(), 220 Measure: MICNewLeaderElectionCountM, 221 Aggregation: view.Count(), 222 }, 223 { 224 Description: CloudProviderOperationsErrorsCountM.Description(), 225 Measure: CloudProviderOperationsErrorsCountM, 226 Aggregation: view.Count(), 227 TagKeys: []tag.Key{operationTypeKey}, 228 }, 229 { 230 Description: CloudProviderOperationsDurationM.Description(), 231 Measure: CloudProviderOperationsDurationM, 232 Aggregation: view.Distribution(0.5, 1, 5, 10, 30, 60, 120, 300, 600, 900, 1200), 233 TagKeys: []tag.Key{operationTypeKey}, 234 }, 235 { 236 Description: KubernetesAPIOperationsErrorsCountM.Description(), 237 Measure: KubernetesAPIOperationsErrorsCountM, 238 Aggregation: view.Count(), 239 TagKeys: []tag.Key{operationTypeKey}, 240 }, 241 { 242 Description: ImdsOperationsErrorsCountM.Description(), 243 Measure: ImdsOperationsErrorsCountM, 244 Aggregation: view.Count(), 245 TagKeys: []tag.Key{operationTypeKey}, 246 }, 247 { 248 Description: ImdsOperationsDurationM.Description(), 249 Measure: ImdsOperationsDurationM, 250 Aggregation: view.Distribution(0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 10), 251 TagKeys: []tag.Key{operationTypeKey}, 252 }, 253 { 254 Description: AssignedIdentityUpdateDurationM.Description(), 255 Measure: AssignedIdentityUpdateDurationM, 256 Aggregation: view.Distribution(0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 10), 257 }, 258 { 259 Description: AssignedIdentityUpdateCountM.Description(), 260 Measure: AssignedIdentityUpdateCountM, 261 Aggregation: view.Count(), 262 }, 263 } 264 err := view.Register(views...) 265 return err 266 } 267 268 // record records the given measure 269 func record(ctx context.Context, ms ...stats.Measurement) { 270 stats.Record(ctx, ms...) 271 } 272 273 // Reporter is stats reporter in the context 274 type Reporter struct { 275 // adding mutex lock to ensure thread safety 276 // TODO (aramase) remove this lock after confirming opencensus report 277 // call is thread-safe 278 mu sync.Mutex 279 ctx context.Context 280 } 281 282 // NewReporter creates a reporter with new context 283 func NewReporter() (*Reporter, error) { 284 ctx, err := tag.New( 285 context.Background(), 286 ) 287 if err != nil { 288 return nil, err 289 } 290 return &Reporter{ctx: ctx, mu: sync.Mutex{}}, nil 291 } 292 293 // Report records the given measure 294 func (r *Reporter) Report(ms ...stats.Measurement) { 295 r.mu.Lock() 296 record(r.ctx, ms...) 297 r.mu.Unlock() 298 } 299 300 // ReportOperationAndStatus records given measurements by operation type, status code for the given namespace and resource. 301 func (r *Reporter) ReportOperationAndStatus(operationType, statusCode, namespace, resource string, ms ...stats.Measurement) error { 302 r.mu.Lock() 303 defer r.mu.Unlock() 304 305 ctx, err := tag.New( 306 r.ctx, 307 tag.Insert(operationTypeKey, operationType), 308 tag.Insert(statusCodeKey, statusCode), 309 tag.Insert(namespaceKey, namespace), 310 tag.Insert(resourceKey, resource), 311 ) 312 if err != nil { 313 return err 314 } 315 record(ctx, ms...) 316 return nil 317 } 318 319 // ReportOperation records given measurement by operation type. 320 func (r *Reporter) ReportOperation(operationType string, measurement stats.Measurement) error { 321 r.mu.Lock() 322 defer r.mu.Unlock() 323 324 ctx, err := tag.New( 325 r.ctx, 326 tag.Insert(operationTypeKey, operationType), 327 ) 328 if err != nil { 329 return err 330 } 331 record(ctx, measurement) 332 return nil 333 } 334 335 // RegisterAndExport register the views for the measures and expose via prometheus exporter 336 func RegisterAndExport(port string) error { 337 err := registerViews() 338 if err != nil { 339 return fmt.Errorf("failed to register views for metrics, error:%v", err) 340 } 341 klog.Infof("registered views for metric") 342 exporter, err := newPrometheusExporter(componentNamespace, port) 343 if err != nil { 344 return fmt.Errorf("failed to create Prometheus exporter, error: %+v", err) 345 } 346 view.RegisterExporter(exporter) 347 klog.Infof("registered and exported metrics on port %s", port) 348 return nil 349 } 350 351 // ReportIMDSOperationError reports IMDS error count 352 func (r *Reporter) ReportIMDSOperationError(operation string) error { 353 return r.ReportOperation(operation, ImdsOperationsErrorsCountM.M(1)) 354 } 355 356 // ReportIMDSOperationDuration reports IMDS operation duration 357 func (r *Reporter) ReportIMDSOperationDuration(operation string, duration time.Duration) error { 358 return r.ReportOperation(operation, ImdsOperationsDurationM.M(duration.Seconds())) 359 } 360 361 // ReportCloudProviderOperationError reports cloud provider operation error count 362 func (r *Reporter) ReportCloudProviderOperationError(operation string) error { 363 return r.ReportOperation(operation, CloudProviderOperationsErrorsCountM.M(1)) 364 } 365 366 // ReportCloudProviderOperationDuration reports cloud provider operation duration 367 func (r *Reporter) ReportCloudProviderOperationDuration(operation string, duration time.Duration) error { 368 return r.ReportOperation(operation, CloudProviderOperationsDurationM.M(duration.Seconds())) 369 } 370 371 // ReportKubernetesAPIOperationError reports kubernetes operation error count 372 func (r *Reporter) ReportKubernetesAPIOperationError(operation string) error { 373 return r.ReportOperation(operation, KubernetesAPIOperationsErrorsCountM.M(1)) 374 }