github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/alertmanager/alertmanager_metrics.go (about)

     1  package alertmanager
     2  
     3  import (
     4  	"github.com/prometheus/client_golang/prometheus"
     5  
     6  	"github.com/cortexproject/cortex/pkg/util"
     7  )
     8  
     9  // This struct aggregates metrics exported by Alertmanager
    10  // and re-exports those aggregates as Cortex metrics.
    11  type alertmanagerMetrics struct {
    12  	regs *util.UserRegistries
    13  
    14  	// exported metrics, gathered from Alertmanager API
    15  	alertsReceived *prometheus.Desc
    16  	alertsInvalid  *prometheus.Desc
    17  
    18  	// exported metrics, gathered from Alertmanager PipelineBuilder
    19  	numNotifications                   *prometheus.Desc
    20  	numFailedNotifications             *prometheus.Desc
    21  	numNotificationRequestsTotal       *prometheus.Desc
    22  	numNotificationRequestsFailedTotal *prometheus.Desc
    23  	notificationLatencySeconds         *prometheus.Desc
    24  
    25  	// exported metrics, gathered from Alertmanager nflog
    26  	nflogGCDuration              *prometheus.Desc
    27  	nflogSnapshotDuration        *prometheus.Desc
    28  	nflogSnapshotSize            *prometheus.Desc
    29  	nflogQueriesTotal            *prometheus.Desc
    30  	nflogQueryErrorsTotal        *prometheus.Desc
    31  	nflogQueryDuration           *prometheus.Desc
    32  	nflogPropagatedMessagesTotal *prometheus.Desc
    33  
    34  	// exported metrics, gathered from Alertmanager Marker
    35  	markerAlerts *prometheus.Desc
    36  
    37  	// exported metrics, gathered from Alertmanager Silences
    38  	silencesGCDuration              *prometheus.Desc
    39  	silencesSnapshotDuration        *prometheus.Desc
    40  	silencesSnapshotSize            *prometheus.Desc
    41  	silencesQueriesTotal            *prometheus.Desc
    42  	silencesQueryErrorsTotal        *prometheus.Desc
    43  	silencesQueryDuration           *prometheus.Desc
    44  	silences                        *prometheus.Desc
    45  	silencesPropagatedMessagesTotal *prometheus.Desc
    46  
    47  	// The alertmanager config hash.
    48  	configHashValue *prometheus.Desc
    49  
    50  	partialMerges           *prometheus.Desc
    51  	partialMergesFailed     *prometheus.Desc
    52  	replicationTotal        *prometheus.Desc
    53  	replicationFailed       *prometheus.Desc
    54  	fetchReplicaStateTotal  *prometheus.Desc
    55  	fetchReplicaStateFailed *prometheus.Desc
    56  	initialSyncTotal        *prometheus.Desc
    57  	initialSyncCompleted    *prometheus.Desc
    58  	initialSyncDuration     *prometheus.Desc
    59  	persistTotal            *prometheus.Desc
    60  	persistFailed           *prometheus.Desc
    61  
    62  	notificationRateLimited                 *prometheus.Desc
    63  	dispatcherAggregationGroupsLimitReached *prometheus.Desc
    64  	insertAlertFailures                     *prometheus.Desc
    65  	alertsLimiterAlertsCount                *prometheus.Desc
    66  	alertsLimiterAlertsSize                 *prometheus.Desc
    67  }
    68  
    69  func newAlertmanagerMetrics() *alertmanagerMetrics {
    70  	return &alertmanagerMetrics{
    71  		regs: util.NewUserRegistries(),
    72  		alertsReceived: prometheus.NewDesc(
    73  			"cortex_alertmanager_alerts_received_total",
    74  			"The total number of received alerts.",
    75  			[]string{"user"}, nil),
    76  		alertsInvalid: prometheus.NewDesc(
    77  			"cortex_alertmanager_alerts_invalid_total",
    78  			"The total number of received alerts that were invalid.",
    79  			[]string{"user"}, nil),
    80  		numNotifications: prometheus.NewDesc(
    81  			"cortex_alertmanager_notifications_total",
    82  			"The total number of attempted notifications.",
    83  			[]string{"user", "integration"}, nil),
    84  		numFailedNotifications: prometheus.NewDesc(
    85  			"cortex_alertmanager_notifications_failed_total",
    86  			"The total number of failed notifications.",
    87  			[]string{"user", "integration"}, nil),
    88  		numNotificationRequestsTotal: prometheus.NewDesc(
    89  			"cortex_alertmanager_notification_requests_total",
    90  			"The total number of attempted notification requests.",
    91  			[]string{"user", "integration"}, nil),
    92  		numNotificationRequestsFailedTotal: prometheus.NewDesc(
    93  			"cortex_alertmanager_notification_requests_failed_total",
    94  			"The total number of failed notification requests.",
    95  			[]string{"user", "integration"}, nil),
    96  		notificationLatencySeconds: prometheus.NewDesc(
    97  			"cortex_alertmanager_notification_latency_seconds",
    98  			"The latency of notifications in seconds.",
    99  			nil, nil),
   100  		nflogGCDuration: prometheus.NewDesc(
   101  			"cortex_alertmanager_nflog_gc_duration_seconds",
   102  			"Duration of the last notification log garbage collection cycle.",
   103  			nil, nil),
   104  		nflogSnapshotDuration: prometheus.NewDesc(
   105  			"cortex_alertmanager_nflog_snapshot_duration_seconds",
   106  			"Duration of the last notification log snapshot.",
   107  			nil, nil),
   108  		nflogSnapshotSize: prometheus.NewDesc(
   109  			"cortex_alertmanager_nflog_snapshot_size_bytes",
   110  			"Size of the last notification log snapshot in bytes.",
   111  			nil, nil),
   112  		nflogQueriesTotal: prometheus.NewDesc(
   113  			"cortex_alertmanager_nflog_queries_total",
   114  			"Number of notification log queries were received.",
   115  			nil, nil),
   116  		nflogQueryErrorsTotal: prometheus.NewDesc(
   117  			"cortex_alertmanager_nflog_query_errors_total",
   118  			"Number notification log received queries that failed.",
   119  			nil, nil),
   120  		nflogQueryDuration: prometheus.NewDesc(
   121  			"cortex_alertmanager_nflog_query_duration_seconds",
   122  			"Duration of notification log query evaluation.",
   123  			nil, nil),
   124  		nflogPropagatedMessagesTotal: prometheus.NewDesc(
   125  			"cortex_alertmanager_nflog_gossip_messages_propagated_total",
   126  			"Number of received gossip messages that have been further gossiped.",
   127  			nil, nil),
   128  		markerAlerts: prometheus.NewDesc(
   129  			"cortex_alertmanager_alerts",
   130  			"How many alerts by state.",
   131  			[]string{"user", "state"}, nil),
   132  		silencesGCDuration: prometheus.NewDesc(
   133  			"cortex_alertmanager_silences_gc_duration_seconds",
   134  			"Duration of the last silence garbage collection cycle.",
   135  			nil, nil),
   136  		silencesSnapshotDuration: prometheus.NewDesc(
   137  			"cortex_alertmanager_silences_snapshot_duration_seconds",
   138  			"Duration of the last silence snapshot.",
   139  			nil, nil),
   140  		silencesSnapshotSize: prometheus.NewDesc(
   141  			"cortex_alertmanager_silences_snapshot_size_bytes",
   142  			"Size of the last silence snapshot in bytes.",
   143  			nil, nil),
   144  		silencesQueriesTotal: prometheus.NewDesc(
   145  			"cortex_alertmanager_silences_queries_total",
   146  			"How many silence queries were received.",
   147  			nil, nil),
   148  		silencesQueryErrorsTotal: prometheus.NewDesc(
   149  			"cortex_alertmanager_silences_query_errors_total",
   150  			"How many silence received queries did not succeed.",
   151  			nil, nil),
   152  		silencesQueryDuration: prometheus.NewDesc(
   153  			"cortex_alertmanager_silences_query_duration_seconds",
   154  			"Duration of silence query evaluation.",
   155  			nil, nil),
   156  		silencesPropagatedMessagesTotal: prometheus.NewDesc(
   157  			"cortex_alertmanager_silences_gossip_messages_propagated_total",
   158  			"Number of received gossip messages that have been further gossiped.",
   159  			nil, nil),
   160  		silences: prometheus.NewDesc(
   161  			"cortex_alertmanager_silences",
   162  			"How many silences by state.",
   163  			[]string{"user", "state"}, nil),
   164  		configHashValue: prometheus.NewDesc(
   165  			"cortex_alertmanager_config_hash",
   166  			"Hash of the currently loaded alertmanager configuration.",
   167  			[]string{"user"}, nil),
   168  		partialMerges: prometheus.NewDesc(
   169  			"cortex_alertmanager_partial_state_merges_total",
   170  			"Number of times we have received a partial state to merge for a key.",
   171  			[]string{"user"}, nil),
   172  		partialMergesFailed: prometheus.NewDesc(
   173  			"cortex_alertmanager_partial_state_merges_failed_total",
   174  			"Number of times we have failed to merge a partial state received for a key.",
   175  			[]string{"user"}, nil),
   176  		replicationTotal: prometheus.NewDesc(
   177  			"cortex_alertmanager_state_replication_total",
   178  			"Number of times we have tried to replicate a state to other alertmanagers",
   179  			[]string{"user"}, nil),
   180  		replicationFailed: prometheus.NewDesc(
   181  			"cortex_alertmanager_state_replication_failed_total",
   182  			"Number of times we have failed to replicate a state to other alertmanagers",
   183  			[]string{"user"}, nil),
   184  		fetchReplicaStateTotal: prometheus.NewDesc(
   185  			"cortex_alertmanager_state_fetch_replica_state_total",
   186  			"Number of times we have tried to read and merge the full state from another replica.",
   187  			nil, nil),
   188  		fetchReplicaStateFailed: prometheus.NewDesc(
   189  			"cortex_alertmanager_state_fetch_replica_state_failed_total",
   190  			"Number of times we have failed to read and merge the full state from another replica.",
   191  			nil, nil),
   192  		initialSyncTotal: prometheus.NewDesc(
   193  			"cortex_alertmanager_state_initial_sync_total",
   194  			"Number of times we have tried to sync initial state from peers or storage.",
   195  			nil, nil),
   196  		initialSyncCompleted: prometheus.NewDesc(
   197  			"cortex_alertmanager_state_initial_sync_completed_total",
   198  			"Number of times we have completed syncing initial state for each possible outcome.",
   199  			[]string{"outcome"}, nil),
   200  		initialSyncDuration: prometheus.NewDesc(
   201  			"cortex_alertmanager_state_initial_sync_duration_seconds",
   202  			"Time spent syncing initial state from peers or storage.",
   203  			nil, nil),
   204  		persistTotal: prometheus.NewDesc(
   205  			"cortex_alertmanager_state_persist_total",
   206  			"Number of times we have tried to persist the running state to storage.",
   207  			nil, nil),
   208  		persistFailed: prometheus.NewDesc(
   209  			"cortex_alertmanager_state_persist_failed_total",
   210  			"Number of times we have failed to persist the running state to storage.",
   211  			nil, nil),
   212  		notificationRateLimited: prometheus.NewDesc(
   213  			"cortex_alertmanager_notification_rate_limited_total",
   214  			"Total number of rate-limited notifications per integration.",
   215  			[]string{"user", "integration"}, nil),
   216  		dispatcherAggregationGroupsLimitReached: prometheus.NewDesc(
   217  			"cortex_alertmanager_dispatcher_aggregation_group_limit_reached_total",
   218  			"Number of times when dispatcher failed to create new aggregation group due to limit.",
   219  			[]string{"user"}, nil),
   220  		insertAlertFailures: prometheus.NewDesc(
   221  			"cortex_alertmanager_alerts_insert_limited_total",
   222  			"Total number of failures to store alert due to hitting alertmanager limits.",
   223  			[]string{"user"}, nil),
   224  		alertsLimiterAlertsCount: prometheus.NewDesc(
   225  			"cortex_alertmanager_alerts_limiter_current_alerts",
   226  			"Number of alerts tracked by alerts limiter.",
   227  			[]string{"user"}, nil),
   228  		alertsLimiterAlertsSize: prometheus.NewDesc(
   229  			"cortex_alertmanager_alerts_limiter_current_alerts_size_bytes",
   230  			"Total size of alerts tracked by alerts limiter.",
   231  			[]string{"user"}, nil),
   232  	}
   233  }
   234  
   235  func (m *alertmanagerMetrics) addUserRegistry(user string, reg *prometheus.Registry) {
   236  	m.regs.AddUserRegistry(user, reg)
   237  }
   238  
   239  func (m *alertmanagerMetrics) removeUserRegistry(user string) {
   240  	// We need to go for a soft deletion here, as hard deletion requires
   241  	// that _all_ metrics except gauges are per-user.
   242  	m.regs.RemoveUserRegistry(user, false)
   243  }
   244  
   245  func (m *alertmanagerMetrics) Describe(out chan<- *prometheus.Desc) {
   246  	out <- m.alertsReceived
   247  	out <- m.alertsInvalid
   248  	out <- m.numNotifications
   249  	out <- m.numFailedNotifications
   250  	out <- m.numNotificationRequestsTotal
   251  	out <- m.numNotificationRequestsFailedTotal
   252  	out <- m.notificationLatencySeconds
   253  	out <- m.markerAlerts
   254  	out <- m.nflogGCDuration
   255  	out <- m.nflogSnapshotDuration
   256  	out <- m.nflogSnapshotSize
   257  	out <- m.nflogQueriesTotal
   258  	out <- m.nflogQueryErrorsTotal
   259  	out <- m.nflogQueryDuration
   260  	out <- m.nflogPropagatedMessagesTotal
   261  	out <- m.silencesGCDuration
   262  	out <- m.silencesSnapshotDuration
   263  	out <- m.silencesSnapshotSize
   264  	out <- m.silencesQueriesTotal
   265  	out <- m.silencesQueryErrorsTotal
   266  	out <- m.silencesQueryDuration
   267  	out <- m.silencesPropagatedMessagesTotal
   268  	out <- m.silences
   269  	out <- m.configHashValue
   270  	out <- m.partialMerges
   271  	out <- m.partialMergesFailed
   272  	out <- m.replicationTotal
   273  	out <- m.replicationFailed
   274  	out <- m.fetchReplicaStateTotal
   275  	out <- m.fetchReplicaStateFailed
   276  	out <- m.initialSyncTotal
   277  	out <- m.initialSyncCompleted
   278  	out <- m.initialSyncDuration
   279  	out <- m.persistTotal
   280  	out <- m.persistFailed
   281  	out <- m.notificationRateLimited
   282  	out <- m.dispatcherAggregationGroupsLimitReached
   283  	out <- m.insertAlertFailures
   284  	out <- m.alertsLimiterAlertsCount
   285  	out <- m.alertsLimiterAlertsSize
   286  }
   287  
   288  func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) {
   289  	data := m.regs.BuildMetricFamiliesPerUser()
   290  
   291  	data.SendSumOfCountersPerUser(out, m.alertsReceived, "alertmanager_alerts_received_total")
   292  	data.SendSumOfCountersPerUser(out, m.alertsInvalid, "alertmanager_alerts_invalid_total")
   293  
   294  	data.SendSumOfCountersPerUserWithLabels(out, m.numNotifications, "alertmanager_notifications_total", "integration")
   295  	data.SendSumOfCountersPerUserWithLabels(out, m.numFailedNotifications, "alertmanager_notifications_failed_total", "integration")
   296  	data.SendSumOfCountersPerUserWithLabels(out, m.numNotificationRequestsTotal, "alertmanager_notification_requests_total", "integration")
   297  	data.SendSumOfCountersPerUserWithLabels(out, m.numNotificationRequestsFailedTotal, "alertmanager_notification_requests_failed_total", "integration")
   298  	data.SendSumOfHistograms(out, m.notificationLatencySeconds, "alertmanager_notification_latency_seconds")
   299  	data.SendSumOfGaugesPerUserWithLabels(out, m.markerAlerts, "alertmanager_alerts", "state")
   300  
   301  	data.SendSumOfSummaries(out, m.nflogGCDuration, "alertmanager_nflog_gc_duration_seconds")
   302  	data.SendSumOfSummaries(out, m.nflogSnapshotDuration, "alertmanager_nflog_snapshot_duration_seconds")
   303  	data.SendSumOfGauges(out, m.nflogSnapshotSize, "alertmanager_nflog_snapshot_size_bytes")
   304  	data.SendSumOfCounters(out, m.nflogQueriesTotal, "alertmanager_nflog_queries_total")
   305  	data.SendSumOfCounters(out, m.nflogQueryErrorsTotal, "alertmanager_nflog_query_errors_total")
   306  	data.SendSumOfHistograms(out, m.nflogQueryDuration, "alertmanager_nflog_query_duration_seconds")
   307  	data.SendSumOfCounters(out, m.nflogPropagatedMessagesTotal, "alertmanager_nflog_gossip_messages_propagated_total")
   308  
   309  	data.SendSumOfSummaries(out, m.silencesGCDuration, "alertmanager_silences_gc_duration_seconds")
   310  	data.SendSumOfSummaries(out, m.silencesSnapshotDuration, "alertmanager_silences_snapshot_duration_seconds")
   311  	data.SendSumOfGauges(out, m.silencesSnapshotSize, "alertmanager_silences_snapshot_size_bytes")
   312  	data.SendSumOfCounters(out, m.silencesQueriesTotal, "alertmanager_silences_queries_total")
   313  	data.SendSumOfCounters(out, m.silencesQueryErrorsTotal, "alertmanager_silences_query_errors_total")
   314  	data.SendSumOfHistograms(out, m.silencesQueryDuration, "alertmanager_silences_query_duration_seconds")
   315  	data.SendSumOfCounters(out, m.silencesPropagatedMessagesTotal, "alertmanager_silences_gossip_messages_propagated_total")
   316  	data.SendSumOfGaugesPerUserWithLabels(out, m.silences, "alertmanager_silences", "state")
   317  
   318  	data.SendMaxOfGaugesPerUser(out, m.configHashValue, "alertmanager_config_hash")
   319  
   320  	data.SendSumOfCountersPerUser(out, m.partialMerges, "alertmanager_partial_state_merges_total")
   321  	data.SendSumOfCountersPerUser(out, m.partialMergesFailed, "alertmanager_partial_state_merges_failed_total")
   322  	data.SendSumOfCountersPerUser(out, m.replicationTotal, "alertmanager_state_replication_total")
   323  	data.SendSumOfCountersPerUser(out, m.replicationFailed, "alertmanager_state_replication_failed_total")
   324  	data.SendSumOfCounters(out, m.fetchReplicaStateTotal, "alertmanager_state_fetch_replica_state_total")
   325  	data.SendSumOfCounters(out, m.fetchReplicaStateFailed, "alertmanager_state_fetch_replica_state_failed_total")
   326  	data.SendSumOfCounters(out, m.initialSyncTotal, "alertmanager_state_initial_sync_total")
   327  	data.SendSumOfCountersWithLabels(out, m.initialSyncCompleted, "alertmanager_state_initial_sync_completed_total", "outcome")
   328  	data.SendSumOfHistograms(out, m.initialSyncDuration, "alertmanager_state_initial_sync_duration_seconds")
   329  	data.SendSumOfCounters(out, m.persistTotal, "alertmanager_state_persist_total")
   330  	data.SendSumOfCounters(out, m.persistFailed, "alertmanager_state_persist_failed_total")
   331  
   332  	data.SendSumOfCountersPerUserWithLabels(out, m.notificationRateLimited, "alertmanager_notification_rate_limited_total", "integration")
   333  	data.SendSumOfCountersPerUser(out, m.dispatcherAggregationGroupsLimitReached, "alertmanager_dispatcher_aggregation_group_limit_reached_total")
   334  	data.SendSumOfCountersPerUser(out, m.insertAlertFailures, "alertmanager_alerts_insert_limited_total")
   335  	data.SendSumOfGaugesPerUser(out, m.alertsLimiterAlertsCount, "alertmanager_alerts_limiter_current_alerts")
   336  	data.SendSumOfGaugesPerUser(out, m.alertsLimiterAlertsSize, "alertmanager_alerts_limiter_current_alerts_size_bytes")
   337  }