github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/alertmanager/alertmanager_metrics.go (about) 1 package alertmanager 2 3 import ( 4 "github.com/prometheus/client_golang/prometheus" 5 6 "github.com/cortexproject/cortex/pkg/util" 7 ) 8 9 // This struct aggregates metrics exported by Alertmanager 10 // and re-exports those aggregates as Cortex metrics. 11 type alertmanagerMetrics struct { 12 regs *util.UserRegistries 13 14 // exported metrics, gathered from Alertmanager API 15 alertsReceived *prometheus.Desc 16 alertsInvalid *prometheus.Desc 17 18 // exported metrics, gathered from Alertmanager PipelineBuilder 19 numNotifications *prometheus.Desc 20 numFailedNotifications *prometheus.Desc 21 numNotificationRequestsTotal *prometheus.Desc 22 numNotificationRequestsFailedTotal *prometheus.Desc 23 notificationLatencySeconds *prometheus.Desc 24 25 // exported metrics, gathered from Alertmanager nflog 26 nflogGCDuration *prometheus.Desc 27 nflogSnapshotDuration *prometheus.Desc 28 nflogSnapshotSize *prometheus.Desc 29 nflogQueriesTotal *prometheus.Desc 30 nflogQueryErrorsTotal *prometheus.Desc 31 nflogQueryDuration *prometheus.Desc 32 nflogPropagatedMessagesTotal *prometheus.Desc 33 34 // exported metrics, gathered from Alertmanager Marker 35 markerAlerts *prometheus.Desc 36 37 // exported metrics, gathered from Alertmanager Silences 38 silencesGCDuration *prometheus.Desc 39 silencesSnapshotDuration *prometheus.Desc 40 silencesSnapshotSize *prometheus.Desc 41 silencesQueriesTotal *prometheus.Desc 42 silencesQueryErrorsTotal *prometheus.Desc 43 silencesQueryDuration *prometheus.Desc 44 silences *prometheus.Desc 45 silencesPropagatedMessagesTotal *prometheus.Desc 46 47 // The alertmanager config hash. 48 configHashValue *prometheus.Desc 49 50 partialMerges *prometheus.Desc 51 partialMergesFailed *prometheus.Desc 52 replicationTotal *prometheus.Desc 53 replicationFailed *prometheus.Desc 54 fetchReplicaStateTotal *prometheus.Desc 55 fetchReplicaStateFailed *prometheus.Desc 56 initialSyncTotal *prometheus.Desc 57 initialSyncCompleted *prometheus.Desc 58 initialSyncDuration *prometheus.Desc 59 persistTotal *prometheus.Desc 60 persistFailed *prometheus.Desc 61 62 notificationRateLimited *prometheus.Desc 63 dispatcherAggregationGroupsLimitReached *prometheus.Desc 64 insertAlertFailures *prometheus.Desc 65 alertsLimiterAlertsCount *prometheus.Desc 66 alertsLimiterAlertsSize *prometheus.Desc 67 } 68 69 func newAlertmanagerMetrics() *alertmanagerMetrics { 70 return &alertmanagerMetrics{ 71 regs: util.NewUserRegistries(), 72 alertsReceived: prometheus.NewDesc( 73 "cortex_alertmanager_alerts_received_total", 74 "The total number of received alerts.", 75 []string{"user"}, nil), 76 alertsInvalid: prometheus.NewDesc( 77 "cortex_alertmanager_alerts_invalid_total", 78 "The total number of received alerts that were invalid.", 79 []string{"user"}, nil), 80 numNotifications: prometheus.NewDesc( 81 "cortex_alertmanager_notifications_total", 82 "The total number of attempted notifications.", 83 []string{"user", "integration"}, nil), 84 numFailedNotifications: prometheus.NewDesc( 85 "cortex_alertmanager_notifications_failed_total", 86 "The total number of failed notifications.", 87 []string{"user", "integration"}, nil), 88 numNotificationRequestsTotal: prometheus.NewDesc( 89 "cortex_alertmanager_notification_requests_total", 90 "The total number of attempted notification requests.", 91 []string{"user", "integration"}, nil), 92 numNotificationRequestsFailedTotal: prometheus.NewDesc( 93 "cortex_alertmanager_notification_requests_failed_total", 94 "The total number of failed notification requests.", 95 []string{"user", "integration"}, nil), 96 notificationLatencySeconds: prometheus.NewDesc( 97 "cortex_alertmanager_notification_latency_seconds", 98 "The latency of notifications in seconds.", 99 nil, nil), 100 nflogGCDuration: prometheus.NewDesc( 101 "cortex_alertmanager_nflog_gc_duration_seconds", 102 "Duration of the last notification log garbage collection cycle.", 103 nil, nil), 104 nflogSnapshotDuration: prometheus.NewDesc( 105 "cortex_alertmanager_nflog_snapshot_duration_seconds", 106 "Duration of the last notification log snapshot.", 107 nil, nil), 108 nflogSnapshotSize: prometheus.NewDesc( 109 "cortex_alertmanager_nflog_snapshot_size_bytes", 110 "Size of the last notification log snapshot in bytes.", 111 nil, nil), 112 nflogQueriesTotal: prometheus.NewDesc( 113 "cortex_alertmanager_nflog_queries_total", 114 "Number of notification log queries were received.", 115 nil, nil), 116 nflogQueryErrorsTotal: prometheus.NewDesc( 117 "cortex_alertmanager_nflog_query_errors_total", 118 "Number notification log received queries that failed.", 119 nil, nil), 120 nflogQueryDuration: prometheus.NewDesc( 121 "cortex_alertmanager_nflog_query_duration_seconds", 122 "Duration of notification log query evaluation.", 123 nil, nil), 124 nflogPropagatedMessagesTotal: prometheus.NewDesc( 125 "cortex_alertmanager_nflog_gossip_messages_propagated_total", 126 "Number of received gossip messages that have been further gossiped.", 127 nil, nil), 128 markerAlerts: prometheus.NewDesc( 129 "cortex_alertmanager_alerts", 130 "How many alerts by state.", 131 []string{"user", "state"}, nil), 132 silencesGCDuration: prometheus.NewDesc( 133 "cortex_alertmanager_silences_gc_duration_seconds", 134 "Duration of the last silence garbage collection cycle.", 135 nil, nil), 136 silencesSnapshotDuration: prometheus.NewDesc( 137 "cortex_alertmanager_silences_snapshot_duration_seconds", 138 "Duration of the last silence snapshot.", 139 nil, nil), 140 silencesSnapshotSize: prometheus.NewDesc( 141 "cortex_alertmanager_silences_snapshot_size_bytes", 142 "Size of the last silence snapshot in bytes.", 143 nil, nil), 144 silencesQueriesTotal: prometheus.NewDesc( 145 "cortex_alertmanager_silences_queries_total", 146 "How many silence queries were received.", 147 nil, nil), 148 silencesQueryErrorsTotal: prometheus.NewDesc( 149 "cortex_alertmanager_silences_query_errors_total", 150 "How many silence received queries did not succeed.", 151 nil, nil), 152 silencesQueryDuration: prometheus.NewDesc( 153 "cortex_alertmanager_silences_query_duration_seconds", 154 "Duration of silence query evaluation.", 155 nil, nil), 156 silencesPropagatedMessagesTotal: prometheus.NewDesc( 157 "cortex_alertmanager_silences_gossip_messages_propagated_total", 158 "Number of received gossip messages that have been further gossiped.", 159 nil, nil), 160 silences: prometheus.NewDesc( 161 "cortex_alertmanager_silences", 162 "How many silences by state.", 163 []string{"user", "state"}, nil), 164 configHashValue: prometheus.NewDesc( 165 "cortex_alertmanager_config_hash", 166 "Hash of the currently loaded alertmanager configuration.", 167 []string{"user"}, nil), 168 partialMerges: prometheus.NewDesc( 169 "cortex_alertmanager_partial_state_merges_total", 170 "Number of times we have received a partial state to merge for a key.", 171 []string{"user"}, nil), 172 partialMergesFailed: prometheus.NewDesc( 173 "cortex_alertmanager_partial_state_merges_failed_total", 174 "Number of times we have failed to merge a partial state received for a key.", 175 []string{"user"}, nil), 176 replicationTotal: prometheus.NewDesc( 177 "cortex_alertmanager_state_replication_total", 178 "Number of times we have tried to replicate a state to other alertmanagers", 179 []string{"user"}, nil), 180 replicationFailed: prometheus.NewDesc( 181 "cortex_alertmanager_state_replication_failed_total", 182 "Number of times we have failed to replicate a state to other alertmanagers", 183 []string{"user"}, nil), 184 fetchReplicaStateTotal: prometheus.NewDesc( 185 "cortex_alertmanager_state_fetch_replica_state_total", 186 "Number of times we have tried to read and merge the full state from another replica.", 187 nil, nil), 188 fetchReplicaStateFailed: prometheus.NewDesc( 189 "cortex_alertmanager_state_fetch_replica_state_failed_total", 190 "Number of times we have failed to read and merge the full state from another replica.", 191 nil, nil), 192 initialSyncTotal: prometheus.NewDesc( 193 "cortex_alertmanager_state_initial_sync_total", 194 "Number of times we have tried to sync initial state from peers or storage.", 195 nil, nil), 196 initialSyncCompleted: prometheus.NewDesc( 197 "cortex_alertmanager_state_initial_sync_completed_total", 198 "Number of times we have completed syncing initial state for each possible outcome.", 199 []string{"outcome"}, nil), 200 initialSyncDuration: prometheus.NewDesc( 201 "cortex_alertmanager_state_initial_sync_duration_seconds", 202 "Time spent syncing initial state from peers or storage.", 203 nil, nil), 204 persistTotal: prometheus.NewDesc( 205 "cortex_alertmanager_state_persist_total", 206 "Number of times we have tried to persist the running state to storage.", 207 nil, nil), 208 persistFailed: prometheus.NewDesc( 209 "cortex_alertmanager_state_persist_failed_total", 210 "Number of times we have failed to persist the running state to storage.", 211 nil, nil), 212 notificationRateLimited: prometheus.NewDesc( 213 "cortex_alertmanager_notification_rate_limited_total", 214 "Total number of rate-limited notifications per integration.", 215 []string{"user", "integration"}, nil), 216 dispatcherAggregationGroupsLimitReached: prometheus.NewDesc( 217 "cortex_alertmanager_dispatcher_aggregation_group_limit_reached_total", 218 "Number of times when dispatcher failed to create new aggregation group due to limit.", 219 []string{"user"}, nil), 220 insertAlertFailures: prometheus.NewDesc( 221 "cortex_alertmanager_alerts_insert_limited_total", 222 "Total number of failures to store alert due to hitting alertmanager limits.", 223 []string{"user"}, nil), 224 alertsLimiterAlertsCount: prometheus.NewDesc( 225 "cortex_alertmanager_alerts_limiter_current_alerts", 226 "Number of alerts tracked by alerts limiter.", 227 []string{"user"}, nil), 228 alertsLimiterAlertsSize: prometheus.NewDesc( 229 "cortex_alertmanager_alerts_limiter_current_alerts_size_bytes", 230 "Total size of alerts tracked by alerts limiter.", 231 []string{"user"}, nil), 232 } 233 } 234 235 func (m *alertmanagerMetrics) addUserRegistry(user string, reg *prometheus.Registry) { 236 m.regs.AddUserRegistry(user, reg) 237 } 238 239 func (m *alertmanagerMetrics) removeUserRegistry(user string) { 240 // We need to go for a soft deletion here, as hard deletion requires 241 // that _all_ metrics except gauges are per-user. 242 m.regs.RemoveUserRegistry(user, false) 243 } 244 245 func (m *alertmanagerMetrics) Describe(out chan<- *prometheus.Desc) { 246 out <- m.alertsReceived 247 out <- m.alertsInvalid 248 out <- m.numNotifications 249 out <- m.numFailedNotifications 250 out <- m.numNotificationRequestsTotal 251 out <- m.numNotificationRequestsFailedTotal 252 out <- m.notificationLatencySeconds 253 out <- m.markerAlerts 254 out <- m.nflogGCDuration 255 out <- m.nflogSnapshotDuration 256 out <- m.nflogSnapshotSize 257 out <- m.nflogQueriesTotal 258 out <- m.nflogQueryErrorsTotal 259 out <- m.nflogQueryDuration 260 out <- m.nflogPropagatedMessagesTotal 261 out <- m.silencesGCDuration 262 out <- m.silencesSnapshotDuration 263 out <- m.silencesSnapshotSize 264 out <- m.silencesQueriesTotal 265 out <- m.silencesQueryErrorsTotal 266 out <- m.silencesQueryDuration 267 out <- m.silencesPropagatedMessagesTotal 268 out <- m.silences 269 out <- m.configHashValue 270 out <- m.partialMerges 271 out <- m.partialMergesFailed 272 out <- m.replicationTotal 273 out <- m.replicationFailed 274 out <- m.fetchReplicaStateTotal 275 out <- m.fetchReplicaStateFailed 276 out <- m.initialSyncTotal 277 out <- m.initialSyncCompleted 278 out <- m.initialSyncDuration 279 out <- m.persistTotal 280 out <- m.persistFailed 281 out <- m.notificationRateLimited 282 out <- m.dispatcherAggregationGroupsLimitReached 283 out <- m.insertAlertFailures 284 out <- m.alertsLimiterAlertsCount 285 out <- m.alertsLimiterAlertsSize 286 } 287 288 func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) { 289 data := m.regs.BuildMetricFamiliesPerUser() 290 291 data.SendSumOfCountersPerUser(out, m.alertsReceived, "alertmanager_alerts_received_total") 292 data.SendSumOfCountersPerUser(out, m.alertsInvalid, "alertmanager_alerts_invalid_total") 293 294 data.SendSumOfCountersPerUserWithLabels(out, m.numNotifications, "alertmanager_notifications_total", "integration") 295 data.SendSumOfCountersPerUserWithLabels(out, m.numFailedNotifications, "alertmanager_notifications_failed_total", "integration") 296 data.SendSumOfCountersPerUserWithLabels(out, m.numNotificationRequestsTotal, "alertmanager_notification_requests_total", "integration") 297 data.SendSumOfCountersPerUserWithLabels(out, m.numNotificationRequestsFailedTotal, "alertmanager_notification_requests_failed_total", "integration") 298 data.SendSumOfHistograms(out, m.notificationLatencySeconds, "alertmanager_notification_latency_seconds") 299 data.SendSumOfGaugesPerUserWithLabels(out, m.markerAlerts, "alertmanager_alerts", "state") 300 301 data.SendSumOfSummaries(out, m.nflogGCDuration, "alertmanager_nflog_gc_duration_seconds") 302 data.SendSumOfSummaries(out, m.nflogSnapshotDuration, "alertmanager_nflog_snapshot_duration_seconds") 303 data.SendSumOfGauges(out, m.nflogSnapshotSize, "alertmanager_nflog_snapshot_size_bytes") 304 data.SendSumOfCounters(out, m.nflogQueriesTotal, "alertmanager_nflog_queries_total") 305 data.SendSumOfCounters(out, m.nflogQueryErrorsTotal, "alertmanager_nflog_query_errors_total") 306 data.SendSumOfHistograms(out, m.nflogQueryDuration, "alertmanager_nflog_query_duration_seconds") 307 data.SendSumOfCounters(out, m.nflogPropagatedMessagesTotal, "alertmanager_nflog_gossip_messages_propagated_total") 308 309 data.SendSumOfSummaries(out, m.silencesGCDuration, "alertmanager_silences_gc_duration_seconds") 310 data.SendSumOfSummaries(out, m.silencesSnapshotDuration, "alertmanager_silences_snapshot_duration_seconds") 311 data.SendSumOfGauges(out, m.silencesSnapshotSize, "alertmanager_silences_snapshot_size_bytes") 312 data.SendSumOfCounters(out, m.silencesQueriesTotal, "alertmanager_silences_queries_total") 313 data.SendSumOfCounters(out, m.silencesQueryErrorsTotal, "alertmanager_silences_query_errors_total") 314 data.SendSumOfHistograms(out, m.silencesQueryDuration, "alertmanager_silences_query_duration_seconds") 315 data.SendSumOfCounters(out, m.silencesPropagatedMessagesTotal, "alertmanager_silences_gossip_messages_propagated_total") 316 data.SendSumOfGaugesPerUserWithLabels(out, m.silences, "alertmanager_silences", "state") 317 318 data.SendMaxOfGaugesPerUser(out, m.configHashValue, "alertmanager_config_hash") 319 320 data.SendSumOfCountersPerUser(out, m.partialMerges, "alertmanager_partial_state_merges_total") 321 data.SendSumOfCountersPerUser(out, m.partialMergesFailed, "alertmanager_partial_state_merges_failed_total") 322 data.SendSumOfCountersPerUser(out, m.replicationTotal, "alertmanager_state_replication_total") 323 data.SendSumOfCountersPerUser(out, m.replicationFailed, "alertmanager_state_replication_failed_total") 324 data.SendSumOfCounters(out, m.fetchReplicaStateTotal, "alertmanager_state_fetch_replica_state_total") 325 data.SendSumOfCounters(out, m.fetchReplicaStateFailed, "alertmanager_state_fetch_replica_state_failed_total") 326 data.SendSumOfCounters(out, m.initialSyncTotal, "alertmanager_state_initial_sync_total") 327 data.SendSumOfCountersWithLabels(out, m.initialSyncCompleted, "alertmanager_state_initial_sync_completed_total", "outcome") 328 data.SendSumOfHistograms(out, m.initialSyncDuration, "alertmanager_state_initial_sync_duration_seconds") 329 data.SendSumOfCounters(out, m.persistTotal, "alertmanager_state_persist_total") 330 data.SendSumOfCounters(out, m.persistFailed, "alertmanager_state_persist_failed_total") 331 332 data.SendSumOfCountersPerUserWithLabels(out, m.notificationRateLimited, "alertmanager_notification_rate_limited_total", "integration") 333 data.SendSumOfCountersPerUser(out, m.dispatcherAggregationGroupsLimitReached, "alertmanager_dispatcher_aggregation_group_limit_reached_total") 334 data.SendSumOfCountersPerUser(out, m.insertAlertFailures, "alertmanager_alerts_insert_limited_total") 335 data.SendSumOfGaugesPerUser(out, m.alertsLimiterAlertsCount, "alertmanager_alerts_limiter_current_alerts") 336 data.SendSumOfGaugesPerUser(out, m.alertsLimiterAlertsSize, "alertmanager_alerts_limiter_current_alerts_size_bytes") 337 }