k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/controller/job/metrics/metrics.go (about) 1 /* 2 Copyright 2021 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package metrics 18 19 import ( 20 "sync" 21 22 "k8s.io/component-base/metrics" 23 "k8s.io/component-base/metrics/legacyregistry" 24 ) 25 26 // JobControllerSubsystem - subsystem name used for this controller. 27 const JobControllerSubsystem = "job_controller" 28 29 var ( 30 // JobSyncDurationSeconds tracks the latency of Job syncs. Possible label 31 // values: 32 // completion_mode: Indexed, NonIndexed 33 // result: success, error 34 // action: reconciling, tracking, pods_created, pods_deleted 35 JobSyncDurationSeconds = metrics.NewHistogramVec( 36 &metrics.HistogramOpts{ 37 Subsystem: JobControllerSubsystem, 38 Name: "job_sync_duration_seconds", 39 Help: "The time it took to sync a job", 40 StabilityLevel: metrics.STABLE, 41 Buckets: metrics.ExponentialBuckets(0.004, 2, 15), 42 }, 43 []string{"completion_mode", "result", "action"}, 44 ) 45 // JobSyncNum tracks the number of Job syncs. Possible label values: 46 // completion_mode: Indexed, NonIndexed 47 // result: success, error 48 // action: reconciling, tracking, pods_created, pods_deleted 49 JobSyncNum = metrics.NewCounterVec( 50 &metrics.CounterOpts{ 51 Subsystem: JobControllerSubsystem, 52 Name: "job_syncs_total", 53 Help: "The number of job syncs", 54 StabilityLevel: metrics.STABLE, 55 }, 56 []string{"completion_mode", "result", "action"}, 57 ) 58 // JobFinishedNum tracks the number of Jobs that finish. Empty reason label 59 // is used to count successful jobs. 60 // Possible label values: 61 // completion_mode: Indexed, NonIndexed 62 // result: failed, succeeded 63 // reason: "BackoffLimitExceeded", "DeadlineExceeded", "PodFailurePolicy", "" 64 JobFinishedNum = metrics.NewCounterVec( 65 &metrics.CounterOpts{ 66 Subsystem: JobControllerSubsystem, 67 Name: "jobs_finished_total", 68 Help: "The number of finished jobs", 69 StabilityLevel: metrics.STABLE, 70 }, 71 []string{"completion_mode", "result", "reason"}, 72 ) 73 74 // JobByExternalControllerTotal tracks the number of Jobs that were created 75 // as managed by an external controller. 76 // The value of the label controller_name corresponds to the value of the 77 // managedBy field. 78 JobByExternalControllerTotal = metrics.NewCounterVec( 79 &metrics.CounterOpts{ 80 Subsystem: JobControllerSubsystem, 81 Name: "jobs_by_external_controller_total", 82 Help: "The number of Jobs managed by an external controller", 83 StabilityLevel: metrics.ALPHA, 84 }, 85 []string{"controller_name"}, 86 ) 87 88 // JobPodsFinished records the number of finished Pods that the job controller 89 // finished tracking. 90 // It only applies to Jobs that were created while the feature gate 91 // JobTrackingWithFinalizers was enabled. 92 // Possible label values: 93 // completion_mode: Indexed, NonIndexed 94 // result: failed, succeeded 95 JobPodsFinished = metrics.NewCounterVec( 96 &metrics.CounterOpts{ 97 Subsystem: JobControllerSubsystem, 98 Name: "job_pods_finished_total", 99 Help: "The number of finished Pods that are fully tracked", 100 StabilityLevel: metrics.STABLE, 101 }, 102 []string{"completion_mode", "result"}) 103 104 // PodFailuresHandledByFailurePolicy records the number of finished Pods 105 // handled by pod failure policy. 106 // Possible label values: 107 // action: FailJob, Ignore, Count 108 PodFailuresHandledByFailurePolicy = metrics.NewCounterVec( 109 &metrics.CounterOpts{ 110 Subsystem: JobControllerSubsystem, 111 Name: "pod_failures_handled_by_failure_policy_total", 112 Help: `The number of failed Pods handled by failure policy with 113 respect to the failure policy action applied based on the matched 114 rule. Possible values of the action label correspond to the 115 possible values for the failure policy rule action, which are: 116 "FailJob", "Ignore" and "Count".`, 117 }, 118 []string{"action"}) 119 120 // TerminatedPodsTrackingFinalizerTotal records the addition and removal of 121 // terminated pods that have the finalizer batch.kubernetes.io/job-tracking, 122 // regardless of whether they are owned by a Job. 123 TerminatedPodsTrackingFinalizerTotal = metrics.NewCounterVec( 124 &metrics.CounterOpts{ 125 Subsystem: JobControllerSubsystem, 126 Name: "terminated_pods_tracking_finalizer_total", 127 Help: `The number of terminated pods (phase=Failed|Succeeded) 128 that have the finalizer batch.kubernetes.io/job-tracking 129 The event label can be "add" or "delete".`, 130 }, []string{"event"}) 131 132 // JobFinishedIndexesTotal records the number of finished indexes. 133 JobFinishedIndexesTotal = metrics.NewCounterVec( 134 &metrics.CounterOpts{ 135 Subsystem: JobControllerSubsystem, 136 Name: "job_finished_indexes_total", 137 Help: `The number of finished indexes. Possible values for the 138 status label are: "succeeded", "failed". Possible values for the 139 backoffLimit label are: "perIndex" and "global"`, 140 }, 141 []string{"status", "backoffLimit"}) 142 143 // JobPodsCreationTotal records the number of pods created by the job controller 144 // based on the reason for their creation (i.e. if PodReplacementPolicy was specified) 145 // and the status of the creation (i.e. if the Pod creation succeeded or failed). 146 // Possible label values: 147 // reason: new, recreate_terminating_or_failed, recreate_failed 148 // status: succeeded, failed 149 JobPodsCreationTotal = metrics.NewCounterVec( 150 &metrics.CounterOpts{ 151 Subsystem: JobControllerSubsystem, 152 Name: "job_pods_creation_total", 153 Help: `The number of Pods created by the Job controller labelled with a reason for the Pod creation. 154 This metric also distinguishes between Pods created using different PodReplacementPolicy settings. 155 Possible values of the "reason" label are: 156 "new", "recreate_terminating_or_failed", "recreate_failed". 157 Possible values of the "status" label are: 158 "succeeded", "failed".`, 159 }, []string{"reason", "status"}) 160 ) 161 162 const ( 163 // Possible values for the "action" label in the above metrics. 164 165 // JobSyncActionReconciling when the Job's pod creation/deletion expectations 166 // are unsatisfied and the controller is waiting for issued Pod 167 // creation/deletions to complete. 168 JobSyncActionReconciling = "reconciling" 169 // JobSyncActionTracking when the Job's pod creation/deletion expectations 170 // are satisfied and the number of active Pods matches expectations (i.e. no 171 // pod creation/deletions issued in this sync). This is expected to be the 172 // action in most of the syncs. 173 JobSyncActionTracking = "tracking" 174 // JobSyncActionPodsCreated when the controller creates Pods. This can happen 175 // when the number of active Pods is less than the wanted Job parallelism. 176 JobSyncActionPodsCreated = "pods_created" 177 // JobSyncActionPodsDeleted when the controller deletes Pods. This can happen 178 // if a Job is suspended or if the number of active Pods is more than 179 // parallelism. 180 JobSyncActionPodsDeleted = "pods_deleted" 181 182 // Possible values for "result" and "status" (job_pods_creation_total) labels in the above metrics. 183 184 Succeeded = "succeeded" 185 Failed = "failed" 186 187 // Possible values for "event" label in the terminated_pods_tracking_finalizer 188 // metric. 189 Add = "add" 190 Delete = "delete" 191 192 // Possible values for "reason" label in the job_pods_creation_total metric. 193 194 PodCreateNew = "new" 195 PodRecreateTerminatingOrFailed = "recreate_terminating_or_failed" 196 PodRecreateFailed = "recreate_failed" 197 ) 198 199 var registerMetrics sync.Once 200 201 // Register registers Job controller metrics. 202 func Register() { 203 registerMetrics.Do(func() { 204 legacyregistry.MustRegister(JobSyncDurationSeconds) 205 legacyregistry.MustRegister(JobSyncNum) 206 legacyregistry.MustRegister(JobFinishedNum) 207 legacyregistry.MustRegister(JobPodsFinished) 208 legacyregistry.MustRegister(PodFailuresHandledByFailurePolicy) 209 legacyregistry.MustRegister(TerminatedPodsTrackingFinalizerTotal) 210 legacyregistry.MustRegister(JobFinishedIndexesTotal) 211 legacyregistry.MustRegister(JobPodsCreationTotal) 212 legacyregistry.MustRegister(JobByExternalControllerTotal) 213 }) 214 }