k8s.io/kubernetes@v1.29.3/pkg/controller/job/metrics/metrics.go (about) 1 /* 2 Copyright 2021 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package metrics 18 19 import ( 20 "sync" 21 22 "k8s.io/component-base/metrics" 23 "k8s.io/component-base/metrics/legacyregistry" 24 ) 25 26 // JobControllerSubsystem - subsystem name used for this controller. 27 const JobControllerSubsystem = "job_controller" 28 29 var ( 30 // JobSyncDurationSeconds tracks the latency of Job syncs. Possible label 31 // values: 32 // completion_mode: Indexed, NonIndexed 33 // result: success, error 34 // action: reconciling, tracking, pods_created, pods_deleted 35 JobSyncDurationSeconds = metrics.NewHistogramVec( 36 &metrics.HistogramOpts{ 37 Subsystem: JobControllerSubsystem, 38 Name: "job_sync_duration_seconds", 39 Help: "The time it took to sync a job", 40 StabilityLevel: metrics.STABLE, 41 Buckets: metrics.ExponentialBuckets(0.004, 2, 15), 42 }, 43 []string{"completion_mode", "result", "action"}, 44 ) 45 // JobSyncNum tracks the number of Job syncs. Possible label values: 46 // completion_mode: Indexed, NonIndexed 47 // result: success, error 48 // action: reconciling, tracking, pods_created, pods_deleted 49 JobSyncNum = metrics.NewCounterVec( 50 &metrics.CounterOpts{ 51 Subsystem: JobControllerSubsystem, 52 Name: "job_syncs_total", 53 Help: "The number of job syncs", 54 StabilityLevel: metrics.STABLE, 55 }, 56 []string{"completion_mode", "result", "action"}, 57 ) 58 // JobFinishedNum tracks the number of Jobs that finish. Empty reason label 59 // is used to count successful jobs. 60 // Possible label values: 61 // completion_mode: Indexed, NonIndexed 62 // result: failed, succeeded 63 // reason: "BackoffLimitExceeded", "DeadlineExceeded", "PodFailurePolicy", "" 64 JobFinishedNum = metrics.NewCounterVec( 65 &metrics.CounterOpts{ 66 Subsystem: JobControllerSubsystem, 67 Name: "jobs_finished_total", 68 Help: "The number of finished jobs", 69 StabilityLevel: metrics.STABLE, 70 }, 71 []string{"completion_mode", "result", "reason"}, 72 ) 73 74 // JobPodsFinished records the number of finished Pods that the job controller 75 // finished tracking. 76 // It only applies to Jobs that were created while the feature gate 77 // JobTrackingWithFinalizers was enabled. 78 // Possible label values: 79 // completion_mode: Indexed, NonIndexed 80 // result: failed, succeeded 81 JobPodsFinished = metrics.NewCounterVec( 82 &metrics.CounterOpts{ 83 Subsystem: JobControllerSubsystem, 84 Name: "job_pods_finished_total", 85 Help: "The number of finished Pods that are fully tracked", 86 StabilityLevel: metrics.STABLE, 87 }, 88 []string{"completion_mode", "result"}) 89 90 // PodFailuresHandledByFailurePolicy records the number of finished Pods 91 // handled by pod failure policy. 92 // Possible label values: 93 // action: FailJob, Ignore, Count 94 PodFailuresHandledByFailurePolicy = metrics.NewCounterVec( 95 &metrics.CounterOpts{ 96 Subsystem: JobControllerSubsystem, 97 Name: "pod_failures_handled_by_failure_policy_total", 98 Help: `The number of failed Pods handled by failure policy with 99 respect to the failure policy action applied based on the matched 100 rule. Possible values of the action label correspond to the 101 possible values for the failure policy rule action, which are: 102 "FailJob", "Ignore" and "Count".`, 103 }, 104 []string{"action"}) 105 106 // TerminatedPodsTrackingFinalizerTotal records the addition and removal of 107 // terminated pods that have the finalizer batch.kubernetes.io/job-tracking, 108 // regardless of whether they are owned by a Job. 109 TerminatedPodsTrackingFinalizerTotal = metrics.NewCounterVec( 110 &metrics.CounterOpts{ 111 Subsystem: JobControllerSubsystem, 112 Name: "terminated_pods_tracking_finalizer_total", 113 Help: `The number of terminated pods (phase=Failed|Succeeded) 114 that have the finalizer batch.kubernetes.io/job-tracking 115 The event label can be "add" or "delete".`, 116 }, []string{"event"}) 117 118 // JobFinishedIndexesTotal records the number of finished indexes. 119 JobFinishedIndexesTotal = metrics.NewCounterVec( 120 &metrics.CounterOpts{ 121 Subsystem: JobControllerSubsystem, 122 Name: "job_finished_indexes_total", 123 Help: `The number of finished indexes. Possible values for the 124 status label are: "succeeded", "failed". Possible values for the 125 backoffLimit label are: "perIndex" and "global"`, 126 }, 127 []string{"status", "backoffLimit"}) 128 129 // JobPodsCreationTotal records the number of pods created by the job controller 130 // based on the reason for their creation (i.e. if PodReplacementPolicy was specified) 131 // and the status of the creation (i.e. if the Pod creation succeeded or failed). 132 // Possible label values: 133 // reason: new, recreate_terminating_or_failed, recreate_failed 134 // status: succeeded, failed 135 JobPodsCreationTotal = metrics.NewCounterVec( 136 &metrics.CounterOpts{ 137 Subsystem: JobControllerSubsystem, 138 Name: "job_pods_creation_total", 139 Help: `The number of Pods created by the Job controller labelled with a reason for the Pod creation. 140 This metric also distinguishes between Pods created using different PodReplacementPolicy settings. 141 Possible values of the "reason" label are: 142 "new", "recreate_terminating_or_failed", "recreate_failed". 143 Possible values of the "status" label are: 144 "succeeded", "failed".`, 145 }, []string{"reason", "status"}) 146 ) 147 148 const ( 149 // Possible values for the "action" label in the above metrics. 150 151 // JobSyncActionReconciling when the Job's pod creation/deletion expectations 152 // are unsatisfied and the controller is waiting for issued Pod 153 // creation/deletions to complete. 154 JobSyncActionReconciling = "reconciling" 155 // JobSyncActionTracking when the Job's pod creation/deletion expectations 156 // are satisfied and the number of active Pods matches expectations (i.e. no 157 // pod creation/deletions issued in this sync). This is expected to be the 158 // action in most of the syncs. 159 JobSyncActionTracking = "tracking" 160 // JobSyncActionPodsCreated when the controller creates Pods. This can happen 161 // when the number of active Pods is less than the wanted Job parallelism. 162 JobSyncActionPodsCreated = "pods_created" 163 // JobSyncActionPodsDeleted when the controller deletes Pods. This can happen 164 // if a Job is suspended or if the number of active Pods is more than 165 // parallelism. 166 JobSyncActionPodsDeleted = "pods_deleted" 167 168 // Possible values for "result" and "status" (job_pods_creation_total) labels in the above metrics. 169 170 Succeeded = "succeeded" 171 Failed = "failed" 172 173 // Possible values for "event" label in the terminated_pods_tracking_finalizer 174 // metric. 175 Add = "add" 176 Delete = "delete" 177 178 // Possible values for "reason" label in the job_pods_creation_total metric. 179 180 PodCreateNew = "new" 181 PodRecreateTerminatingOrFailed = "recreate_terminating_or_failed" 182 PodRecreateFailed = "recreate_failed" 183 ) 184 185 var registerMetrics sync.Once 186 187 // Register registers Job controller metrics. 188 func Register() { 189 registerMetrics.Do(func() { 190 legacyregistry.MustRegister(JobSyncDurationSeconds) 191 legacyregistry.MustRegister(JobSyncNum) 192 legacyregistry.MustRegister(JobFinishedNum) 193 legacyregistry.MustRegister(JobPodsFinished) 194 legacyregistry.MustRegister(PodFailuresHandledByFailurePolicy) 195 legacyregistry.MustRegister(TerminatedPodsTrackingFinalizerTotal) 196 legacyregistry.MustRegister(JobFinishedIndexesTotal) 197 legacyregistry.MustRegister(JobPodsCreationTotal) 198 }) 199 }