github.com/kubeflow/training-operator@v1.7.0/pkg/common/metrics.go (about) 1 // Copyright 2021 The Kubeflow Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License 14 15 package common 16 17 import ( 18 "github.com/prometheus/client_golang/prometheus" 19 "github.com/prometheus/client_golang/prometheus/promauto" 20 "sigs.k8s.io/controller-runtime/pkg/metrics" 21 ) 22 23 // Define all the prometheus counters for all jobs 24 var ( 25 jobsCreatedCount = promauto.NewCounterVec( 26 prometheus.CounterOpts{ 27 Name: "training_operator_jobs_created_total", 28 Help: "Counts number of jobs created", 29 }, 30 []string{"job_namespace", "framework"}, 31 ) 32 jobsDeletedCount = promauto.NewCounterVec( 33 prometheus.CounterOpts{ 34 Name: "training_operator_jobs_deleted_total", 35 Help: "Counts number of jobs deleted", 36 }, 37 []string{"job_namespace", "framework"}, 38 ) 39 jobsSuccessfulCount = promauto.NewCounterVec( 40 prometheus.CounterOpts{ 41 Name: "training_operator_jobs_successful_total", 42 Help: "Counts number of jobs successful", 43 }, 44 []string{"job_namespace", "framework"}, 45 ) 46 jobsFailedCount = promauto.NewCounterVec( 47 prometheus.CounterOpts{ 48 Name: "training_operator_jobs_failed_total", 49 Help: "Counts number of jobs failed", 50 }, 51 []string{"job_namespace", "framework"}, 52 ) 53 jobsRestartedCount = promauto.NewCounterVec( 54 prometheus.CounterOpts{ 55 Name: "training_operator_jobs_restarted_total", 56 Help: "Counts number of jobs restarted", 57 }, 58 []string{"job_namespace", "framework"}, 59 ) 60 ) 61 62 func init() { 63 // Register custom metrics with the global prometheus registry 64 metrics.Registry.MustRegister(jobsCreatedCount, 65 jobsDeletedCount, 66 jobsSuccessfulCount, 67 jobsFailedCount, 68 jobsRestartedCount) 69 } 70 71 func CreatedJobsCounterInc(job_namespace, framework string) { 72 jobsCreatedCount.WithLabelValues(job_namespace, framework).Inc() 73 } 74 75 func DeletedJobsCounterInc(job_namespace, framework string) { 76 jobsDeletedCount.WithLabelValues(job_namespace, framework).Inc() 77 } 78 79 func SuccessfulJobsCounterInc(job_namespace, framework string) { 80 jobsSuccessfulCount.WithLabelValues(job_namespace, framework).Inc() 81 } 82 83 func FailedJobsCounterInc(job_namespace, framework string) { 84 jobsFailedCount.WithLabelValues(job_namespace, framework).Inc() 85 } 86 87 func RestartedJobsCounterInc(job_namespace, framework string) { 88 jobsRestartedCount.WithLabelValues(job_namespace, framework).Inc() 89 }