istio.io/istio@v0.0.0-20240520182934-d79c90f27776/operator/pkg/metrics/monitoring.go (about)

     1  // Copyright Istio Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package metrics defines metrics and monitoring functionality
    16  // used throughout operator.
    17  package metrics
    18  
    19  import (
    20  	"istio.io/istio/pkg/monitoring"
    21  )
    22  
    23  var (
    24  	// OperatorVersionLabel describes version of running binary.
    25  	OperatorVersionLabel = monitoring.CreateLabel("version")
    26  
    27  	// MergeErrorLabel describes the type of merge error.
    28  	MergeErrorLabel = monitoring.CreateLabel("error_type")
    29  
    30  	// RenderErrorLabel describes the type of the error while rendering.
    31  	RenderErrorLabel = monitoring.CreateLabel("render_error")
    32  
    33  	// CRFetchErrorReasonLabel describes the reason/HTTP code
    34  	// for failing to fetch CR.
    35  	CRFetchErrorReasonLabel = monitoring.CreateLabel("reason")
    36  
    37  	// ComponentNameLabel represents istio component name - like
    38  	// core, pilot, istio-cni etc.
    39  	ComponentNameLabel = monitoring.CreateLabel("component")
    40  
    41  	// ResourceKindLabel indicates the kind of resource owned
    42  	// or created or updated or deleted or pruned by operator.
    43  	ResourceKindLabel = monitoring.CreateLabel("kind")
    44  
    45  	// ReconcileRequestReasonLabel describes reason of reconcile request.
    46  	ReconcileRequestReasonLabel = monitoring.CreateLabel("reason")
    47  )
    48  
    49  // MergeErrorType describes the class of errors that could
    50  // occur while merging profile, user supplied YAML, values
    51  // overridden by --set and so on.
    52  type MergeErrorType string
    53  
    54  const (
    55  	// CannotFetchProfileError occurs when profile cannot be found.
    56  	CannotFetchProfileError MergeErrorType = "cannot_fetch_profile"
    57  
    58  	// OverlayError overlaying YAMLs to combine profile, user
    59  	// defined settings in CR, Hub-tag etc. fails.
    60  	OverlayError MergeErrorType = "overlay"
    61  
    62  	// IOPFormatError occurs when supplied CR cannot be marshaled
    63  	// or unmarshaled to/from YAML.
    64  	IOPFormatError MergeErrorType = "iop_format"
    65  
    66  	// TranslateValuesError occurs when translating from legacy API fails.
    67  	TranslateValuesError MergeErrorType = "translate_values"
    68  
    69  	// InternalYAMLParseError occurs when spec section in merged CR
    70  	// cannot be accessed for some reason (either missing or multiple).
    71  	InternalYAMLParseError MergeErrorType = "internal_yaml_parse"
    72  )
    73  
    74  // RenderErrorType describes the class of errors that could
    75  // occur while rendering Kubernetes manifest from given CR.
    76  type RenderErrorType string
    77  
    78  const (
    79  	RenderNotStartedError RenderErrorType = "render_not_started"
    80  
    81  	// HelmTranslateIOPToValuesError describes render error where renderer for
    82  	// a component cannot create values.yaml tree from given CR.
    83  	HelmTranslateIOPToValuesError RenderErrorType = "helm_translate_iop_to_values"
    84  
    85  	// HelmChartRenderError describes error where Helm charts cannot be rendered
    86  	// for the generated values.yaml tree.
    87  	HelmChartRenderError RenderErrorType = "helm_chart_render"
    88  
    89  	// K8SSettingsOverlayError describes the K8s overlay error after
    90  	// rendering Helm charts successfully.
    91  	K8SSettingsOverlayError RenderErrorType = "k8s_settings_overlay"
    92  
    93  	// K8SManifestPatchError describes errors while patching generated manifest.
    94  	K8SManifestPatchError RenderErrorType = "k8s_manifest_patch"
    95  )
    96  
    97  var (
    98  	// Version is the version of the operator binary running currently.
    99  	Version = monitoring.NewGauge(
   100  		"version",
   101  		"Version of operator binary",
   102  	)
   103  
   104  	ReconcileRequestTotal = monitoring.NewSum(
   105  		"reconcile_request_total",
   106  		"Number of times requesting Reconcile",
   107  	)
   108  
   109  	// GetCRErrorTotal counts the number of times fetching
   110  	// CR fails from API server.
   111  	GetCRErrorTotal = monitoring.NewSum(
   112  		"get_cr_error_total",
   113  		"Number of times fetching CR from apiserver failed",
   114  	)
   115  
   116  	// CRMergeFailureTotal counts number of CR merge failures.
   117  	CRMergeFailureTotal = monitoring.NewSum(
   118  		"cr_merge_failure_total",
   119  		"Number of IstioOperator CR merge failures",
   120  	)
   121  
   122  	// CRDeletionTotal counts the number of times
   123  	// IstioOperator CR was deleted.
   124  	CRDeletionTotal = monitoring.NewSum(
   125  		"cr_deletion_total",
   126  		"Number of IstioOperator CR deleted",
   127  	)
   128  
   129  	// CRValidationErrorTotal counts the number of CR
   130  	// validation failures.
   131  	CRValidationErrorTotal = monitoring.NewSum(
   132  		"cr_validation_error_total",
   133  		"Number of IstioOperator CR validation failures",
   134  	)
   135  
   136  	// RenderManifestTotal counts the number of manifest
   137  	// renders at each component level.
   138  	RenderManifestTotal = monitoring.NewSum(
   139  		"render_manifest_total",
   140  		"Number of component manifests rendered",
   141  	)
   142  
   143  	// OwnedResourceTotal indicates the number of resources
   144  	// currently owned by the CR with given name and revision.
   145  	OwnedResourceTotal = monitoring.NewGauge(
   146  		"owned_resource_total",
   147  		"Number of resources currently owned by the operator",
   148  	)
   149  
   150  	// ResourceCreationTotal indicates the number of resources
   151  	// created by the operator for a CR and revision.
   152  	ResourceCreationTotal = monitoring.NewSum(
   153  		"resource_creation_total",
   154  		"Number of resources created by the operator",
   155  	)
   156  
   157  	// ResourceUpdateTotal indicates the number of resources updated by
   158  	// the operator in response to CR updates for a revision.
   159  	ResourceUpdateTotal = monitoring.NewSum(
   160  		"resource_update_total",
   161  		"Number of resources updated by the operator",
   162  	)
   163  
   164  	// ResourceDeletionTotal indicates the number of resources deleted
   165  	// by the operator in response to CR update or delete operation (like
   166  	// ingress-gateway which was enabled could be disabled and this requires
   167  	// deleting ingress-gateway deployment).
   168  	ResourceDeletionTotal = monitoring.NewSum(
   169  		"resource_deletion_total",
   170  		"Number of resources deleted by the operator",
   171  	)
   172  
   173  	// ResourcePruneTotal indicates the resources pruned as a result of update.
   174  	ResourcePruneTotal = monitoring.NewSum(
   175  		"resource_prune_total",
   176  		"Number of resources pruned by the operator",
   177  	)
   178  
   179  	// ManifestPatchErrorTotal counts the total number of K8S patch errors.
   180  	ManifestPatchErrorTotal = monitoring.NewSum(
   181  		"manifest_patch_error_total",
   182  		"Number of times K8S patch overlays failed",
   183  	)
   184  
   185  	// ManifestRenderErrorTotal counts errors occurred while rendering manifest.
   186  	ManifestRenderErrorTotal = monitoring.NewSum(
   187  		"manifest_render_error_total",
   188  		"Number of times error occurred during rendering output manifest",
   189  	)
   190  
   191  	// LegacyPathTranslationTotal counts the translations from legacy API to new one.
   192  	LegacyPathTranslationTotal = monitoring.NewSum(
   193  		"legacy_path_translation_total",
   194  		"Number of times a legacy API path is translated",
   195  	)
   196  
   197  	// CacheFlushTotal counts number of cache flushes.
   198  	CacheFlushTotal = monitoring.NewSum(
   199  		"cache_flush_total",
   200  		"number of times operator cache was flushed",
   201  	)
   202  )
   203  
   204  func init() {
   205  	initOperatorCrdResourceMetrics()
   206  }
   207  
   208  func IncrementReconcileRequest(reason string) {
   209  	ReconcileRequestTotal.With(ReconcileRequestReasonLabel.Value(reason)).Increment()
   210  }