istio.io/istio@v0.0.0-20240520182934-d79c90f27776/operator/pkg/metrics/monitoring.go (about) 1 // Copyright Istio Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package metrics defines metrics and monitoring functionality 16 // used throughout operator. 17 package metrics 18 19 import ( 20 "istio.io/istio/pkg/monitoring" 21 ) 22 23 var ( 24 // OperatorVersionLabel describes version of running binary. 25 OperatorVersionLabel = monitoring.CreateLabel("version") 26 27 // MergeErrorLabel describes the type of merge error. 28 MergeErrorLabel = monitoring.CreateLabel("error_type") 29 30 // RenderErrorLabel describes the type of the error while rendering. 31 RenderErrorLabel = monitoring.CreateLabel("render_error") 32 33 // CRFetchErrorReasonLabel describes the reason/HTTP code 34 // for failing to fetch CR. 35 CRFetchErrorReasonLabel = monitoring.CreateLabel("reason") 36 37 // ComponentNameLabel represents istio component name - like 38 // core, pilot, istio-cni etc. 39 ComponentNameLabel = monitoring.CreateLabel("component") 40 41 // ResourceKindLabel indicates the kind of resource owned 42 // or created or updated or deleted or pruned by operator. 43 ResourceKindLabel = monitoring.CreateLabel("kind") 44 45 // ReconcileRequestReasonLabel describes reason of reconcile request. 46 ReconcileRequestReasonLabel = monitoring.CreateLabel("reason") 47 ) 48 49 // MergeErrorType describes the class of errors that could 50 // occur while merging profile, user supplied YAML, values 51 // overridden by --set and so on. 52 type MergeErrorType string 53 54 const ( 55 // CannotFetchProfileError occurs when profile cannot be found. 56 CannotFetchProfileError MergeErrorType = "cannot_fetch_profile" 57 58 // OverlayError overlaying YAMLs to combine profile, user 59 // defined settings in CR, Hub-tag etc. fails. 60 OverlayError MergeErrorType = "overlay" 61 62 // IOPFormatError occurs when supplied CR cannot be marshaled 63 // or unmarshaled to/from YAML. 64 IOPFormatError MergeErrorType = "iop_format" 65 66 // TranslateValuesError occurs when translating from legacy API fails. 67 TranslateValuesError MergeErrorType = "translate_values" 68 69 // InternalYAMLParseError occurs when spec section in merged CR 70 // cannot be accessed for some reason (either missing or multiple). 71 InternalYAMLParseError MergeErrorType = "internal_yaml_parse" 72 ) 73 74 // RenderErrorType describes the class of errors that could 75 // occur while rendering Kubernetes manifest from given CR. 76 type RenderErrorType string 77 78 const ( 79 RenderNotStartedError RenderErrorType = "render_not_started" 80 81 // HelmTranslateIOPToValuesError describes render error where renderer for 82 // a component cannot create values.yaml tree from given CR. 83 HelmTranslateIOPToValuesError RenderErrorType = "helm_translate_iop_to_values" 84 85 // HelmChartRenderError describes error where Helm charts cannot be rendered 86 // for the generated values.yaml tree. 87 HelmChartRenderError RenderErrorType = "helm_chart_render" 88 89 // K8SSettingsOverlayError describes the K8s overlay error after 90 // rendering Helm charts successfully. 91 K8SSettingsOverlayError RenderErrorType = "k8s_settings_overlay" 92 93 // K8SManifestPatchError describes errors while patching generated manifest. 94 K8SManifestPatchError RenderErrorType = "k8s_manifest_patch" 95 ) 96 97 var ( 98 // Version is the version of the operator binary running currently. 99 Version = monitoring.NewGauge( 100 "version", 101 "Version of operator binary", 102 ) 103 104 ReconcileRequestTotal = monitoring.NewSum( 105 "reconcile_request_total", 106 "Number of times requesting Reconcile", 107 ) 108 109 // GetCRErrorTotal counts the number of times fetching 110 // CR fails from API server. 111 GetCRErrorTotal = monitoring.NewSum( 112 "get_cr_error_total", 113 "Number of times fetching CR from apiserver failed", 114 ) 115 116 // CRMergeFailureTotal counts number of CR merge failures. 117 CRMergeFailureTotal = monitoring.NewSum( 118 "cr_merge_failure_total", 119 "Number of IstioOperator CR merge failures", 120 ) 121 122 // CRDeletionTotal counts the number of times 123 // IstioOperator CR was deleted. 124 CRDeletionTotal = monitoring.NewSum( 125 "cr_deletion_total", 126 "Number of IstioOperator CR deleted", 127 ) 128 129 // CRValidationErrorTotal counts the number of CR 130 // validation failures. 131 CRValidationErrorTotal = monitoring.NewSum( 132 "cr_validation_error_total", 133 "Number of IstioOperator CR validation failures", 134 ) 135 136 // RenderManifestTotal counts the number of manifest 137 // renders at each component level. 138 RenderManifestTotal = monitoring.NewSum( 139 "render_manifest_total", 140 "Number of component manifests rendered", 141 ) 142 143 // OwnedResourceTotal indicates the number of resources 144 // currently owned by the CR with given name and revision. 145 OwnedResourceTotal = monitoring.NewGauge( 146 "owned_resource_total", 147 "Number of resources currently owned by the operator", 148 ) 149 150 // ResourceCreationTotal indicates the number of resources 151 // created by the operator for a CR and revision. 152 ResourceCreationTotal = monitoring.NewSum( 153 "resource_creation_total", 154 "Number of resources created by the operator", 155 ) 156 157 // ResourceUpdateTotal indicates the number of resources updated by 158 // the operator in response to CR updates for a revision. 159 ResourceUpdateTotal = monitoring.NewSum( 160 "resource_update_total", 161 "Number of resources updated by the operator", 162 ) 163 164 // ResourceDeletionTotal indicates the number of resources deleted 165 // by the operator in response to CR update or delete operation (like 166 // ingress-gateway which was enabled could be disabled and this requires 167 // deleting ingress-gateway deployment). 168 ResourceDeletionTotal = monitoring.NewSum( 169 "resource_deletion_total", 170 "Number of resources deleted by the operator", 171 ) 172 173 // ResourcePruneTotal indicates the resources pruned as a result of update. 174 ResourcePruneTotal = monitoring.NewSum( 175 "resource_prune_total", 176 "Number of resources pruned by the operator", 177 ) 178 179 // ManifestPatchErrorTotal counts the total number of K8S patch errors. 180 ManifestPatchErrorTotal = monitoring.NewSum( 181 "manifest_patch_error_total", 182 "Number of times K8S patch overlays failed", 183 ) 184 185 // ManifestRenderErrorTotal counts errors occurred while rendering manifest. 186 ManifestRenderErrorTotal = monitoring.NewSum( 187 "manifest_render_error_total", 188 "Number of times error occurred during rendering output manifest", 189 ) 190 191 // LegacyPathTranslationTotal counts the translations from legacy API to new one. 192 LegacyPathTranslationTotal = monitoring.NewSum( 193 "legacy_path_translation_total", 194 "Number of times a legacy API path is translated", 195 ) 196 197 // CacheFlushTotal counts number of cache flushes. 198 CacheFlushTotal = monitoring.NewSum( 199 "cache_flush_total", 200 "number of times operator cache was flushed", 201 ) 202 ) 203 204 func init() { 205 initOperatorCrdResourceMetrics() 206 } 207 208 func IncrementReconcileRequest(reason string) { 209 ReconcileRequestTotal.With(ReconcileRequestReasonLabel.Value(reason)).Increment() 210 }