github.com/kubeflow/training-operator@v1.7.0/pkg/common/metrics.go (about)

     1  // Copyright 2021 The Kubeflow Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License
    14  
    15  package common
    16  
    17  import (
    18  	"github.com/prometheus/client_golang/prometheus"
    19  	"github.com/prometheus/client_golang/prometheus/promauto"
    20  	"sigs.k8s.io/controller-runtime/pkg/metrics"
    21  )
    22  
    23  // Define all the prometheus counters for all jobs
    24  var (
    25  	jobsCreatedCount = promauto.NewCounterVec(
    26  		prometheus.CounterOpts{
    27  			Name: "training_operator_jobs_created_total",
    28  			Help: "Counts number of jobs created",
    29  		},
    30  		[]string{"job_namespace", "framework"},
    31  	)
    32  	jobsDeletedCount = promauto.NewCounterVec(
    33  		prometheus.CounterOpts{
    34  			Name: "training_operator_jobs_deleted_total",
    35  			Help: "Counts number of jobs deleted",
    36  		},
    37  		[]string{"job_namespace", "framework"},
    38  	)
    39  	jobsSuccessfulCount = promauto.NewCounterVec(
    40  		prometheus.CounterOpts{
    41  			Name: "training_operator_jobs_successful_total",
    42  			Help: "Counts number of jobs successful",
    43  		},
    44  		[]string{"job_namespace", "framework"},
    45  	)
    46  	jobsFailedCount = promauto.NewCounterVec(
    47  		prometheus.CounterOpts{
    48  			Name: "training_operator_jobs_failed_total",
    49  			Help: "Counts number of jobs failed",
    50  		},
    51  		[]string{"job_namespace", "framework"},
    52  	)
    53  	jobsRestartedCount = promauto.NewCounterVec(
    54  		prometheus.CounterOpts{
    55  			Name: "training_operator_jobs_restarted_total",
    56  			Help: "Counts number of jobs restarted",
    57  		},
    58  		[]string{"job_namespace", "framework"},
    59  	)
    60  )
    61  
    62  func init() {
    63  	// Register custom metrics with the global prometheus registry
    64  	metrics.Registry.MustRegister(jobsCreatedCount,
    65  		jobsDeletedCount,
    66  		jobsSuccessfulCount,
    67  		jobsFailedCount,
    68  		jobsRestartedCount)
    69  }
    70  
    71  func CreatedJobsCounterInc(job_namespace, framework string) {
    72  	jobsCreatedCount.WithLabelValues(job_namespace, framework).Inc()
    73  }
    74  
    75  func DeletedJobsCounterInc(job_namespace, framework string) {
    76  	jobsDeletedCount.WithLabelValues(job_namespace, framework).Inc()
    77  }
    78  
    79  func SuccessfulJobsCounterInc(job_namespace, framework string) {
    80  	jobsSuccessfulCount.WithLabelValues(job_namespace, framework).Inc()
    81  }
    82  
    83  func FailedJobsCounterInc(job_namespace, framework string) {
    84  	jobsFailedCount.WithLabelValues(job_namespace, framework).Inc()
    85  }
    86  
    87  func RestartedJobsCounterInc(job_namespace, framework string) {
    88  	jobsRestartedCount.WithLabelValues(job_namespace, framework).Inc()
    89  }