github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/ingester/metrics.go

github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/ingester/metrics.go (about)

     1  package ingester
     2  
     3  import (
     4  	"github.com/prometheus/client_golang/prometheus"
     5  	"github.com/prometheus/client_golang/prometheus/promauto"
     6  
     7  	"github.com/grafana/loki/pkg/usagestats"
     8  	"github.com/grafana/loki/pkg/validation"
     9  )
    10  
    11  type ingesterMetrics struct {
    12  	checkpointDeleteFail       prometheus.Counter
    13  	checkpointDeleteTotal      prometheus.Counter
    14  	checkpointCreationFail     prometheus.Counter
    15  	checkpointCreationTotal    prometheus.Counter
    16  	checkpointDuration         prometheus.Summary
    17  	checkpointLoggedBytesTotal prometheus.Counter
    18  
    19  	walDiskFullFailures     prometheus.Counter
    20  	walReplayActive         prometheus.Gauge
    21  	walReplayDuration       prometheus.Gauge
    22  	walReplaySamplesDropped *prometheus.CounterVec
    23  	walReplayBytesDropped   *prometheus.CounterVec
    24  	walCorruptionsTotal     *prometheus.CounterVec
    25  	walLoggedBytesTotal     prometheus.Counter
    26  	walRecordsLogged        prometheus.Counter
    27  
    28  	recoveredStreamsTotal prometheus.Counter
    29  	recoveredChunksTotal  prometheus.Counter
    30  	recoveredEntriesTotal prometheus.Counter
    31  	duplicateEntriesTotal prometheus.Counter
    32  	recoveredBytesTotal   prometheus.Counter
    33  	recoveryBytesInUse    prometheus.Gauge
    34  	recoveryIsFlushing    prometheus.Gauge
    35  
    36  	limiterEnabled prometheus.Gauge
    37  
    38  	autoForgetUnhealthyIngestersTotal prometheus.Counter
    39  
    40  	chunkUtilization              prometheus.Histogram
    41  	memoryChunks                  prometheus.Gauge
    42  	chunkEntries                  prometheus.Histogram
    43  	chunkSize                     prometheus.Histogram
    44  	chunkCompressionRatio         prometheus.Histogram
    45  	chunksPerTenant               *prometheus.CounterVec
    46  	chunkSizePerTenant            *prometheus.CounterVec
    47  	chunkAge                      prometheus.Histogram
    48  	chunkEncodeTime               prometheus.Histogram
    49  	chunksFlushedPerReason        *prometheus.CounterVec
    50  	chunkLifespan                 prometheus.Histogram
    51  	flushedChunksStats            *usagestats.Counter
    52  	flushedChunksBytesStats       *usagestats.Statistics
    53  	flushedChunksLinesStats       *usagestats.Statistics
    54  	flushedChunksAgeStats         *usagestats.Statistics
    55  	flushedChunksLifespanStats    *usagestats.Statistics
    56  	flushedChunksUtilizationStats *usagestats.Statistics
    57  
    58  	chunksCreatedTotal prometheus.Counter
    59  	samplesPerChunk    prometheus.Histogram
    60  	blocksPerChunk     prometheus.Histogram
    61  	chunkCreatedStats  *usagestats.Counter
    62  }
    63  
    64  // setRecoveryBytesInUse bounds the bytes reports to >= 0.
    65  // TODO(owen-d): we can gain some efficiency by having the flusher never update this after recovery ends.
    66  func (m *ingesterMetrics) setRecoveryBytesInUse(v int64) {
    67  	if v < 0 {
    68  		v = 0
    69  	}
    70  	m.recoveryBytesInUse.Set(float64(v))
    71  }
    72  
    73  const (
    74  	walTypeCheckpoint = "checkpoint"
    75  	walTypeSegment    = "segment"
    76  
    77  	duplicateReason = "duplicate"
    78  )
    79  
    80  func newIngesterMetrics(r prometheus.Registerer) *ingesterMetrics {
    81  	return &ingesterMetrics{
    82  		walDiskFullFailures: promauto.With(r).NewCounter(prometheus.CounterOpts{
    83  			Name: "loki_ingester_wal_disk_full_failures_total",
    84  			Help: "Total number of wal write failures due to full disk.",
    85  		}),
    86  		walReplayActive: promauto.With(r).NewGauge(prometheus.GaugeOpts{
    87  			Name: "loki_ingester_wal_replay_active",
    88  			Help: "Whether the WAL is replaying",
    89  		}),
    90  		walReplayDuration: promauto.With(r).NewGauge(prometheus.GaugeOpts{
    91  			Name: "loki_ingester_wal_replay_duration_seconds",
    92  			Help: "Time taken to replay the checkpoint and the WAL.",
    93  		}),
    94  		walReplaySamplesDropped: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
    95  			Name: "loki_ingester_wal_discarded_samples_total",
    96  			Help: "WAL segment entries discarded during replay",
    97  		}, []string{validation.ReasonLabel}),
    98  		walReplayBytesDropped: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
    99  			Name: "loki_ingester_wal_discarded_bytes_total",
   100  			Help: "WAL segment bytes discarded during replay",
   101  		}, []string{validation.ReasonLabel}),
   102  		walCorruptionsTotal: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
   103  			Name: "loki_ingester_wal_corruptions_total",
   104  			Help: "Total number of WAL corruptions encountered.",
   105  		}, []string{"type"}),
   106  		checkpointDeleteFail: promauto.With(r).NewCounter(prometheus.CounterOpts{
   107  			Name: "loki_ingester_checkpoint_deletions_failed_total",
   108  			Help: "Total number of checkpoint deletions that failed.",
   109  		}),
   110  		checkpointDeleteTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
   111  			Name: "loki_ingester_checkpoint_deletions_total",
   112  			Help: "Total number of checkpoint deletions attempted.",
   113  		}),
   114  		checkpointCreationFail: promauto.With(r).NewCounter(prometheus.CounterOpts{
   115  			Name: "loki_ingester_checkpoint_creations_failed_total",
   116  			Help: "Total number of checkpoint creations that failed.",
   117  		}),
   118  		checkpointCreationTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
   119  			Name: "loki_ingester_checkpoint_creations_total",
   120  			Help: "Total number of checkpoint creations attempted.",
   121  		}),
   122  		checkpointDuration: promauto.With(r).NewSummary(prometheus.SummaryOpts{
   123  			Name:       "loki_ingester_checkpoint_duration_seconds",
   124  			Help:       "Time taken to create a checkpoint.",
   125  			Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
   126  		}),
   127  		walRecordsLogged: promauto.With(r).NewCounter(prometheus.CounterOpts{
   128  			Name: "loki_ingester_wal_records_logged_total",
   129  			Help: "Total number of WAL records logged.",
   130  		}),
   131  		checkpointLoggedBytesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
   132  			Name: "loki_ingester_checkpoint_logged_bytes_total",
   133  			Help: "Total number of bytes written to disk for checkpointing.",
   134  		}),
   135  		walLoggedBytesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
   136  			Name: "loki_ingester_wal_logged_bytes_total",
   137  			Help: "Total number of bytes written to disk for WAL records.",
   138  		}),
   139  		recoveredStreamsTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
   140  			Name: "loki_ingester_wal_recovered_streams_total",
   141  			Help: "Total number of streams recovered from the WAL.",
   142  		}),
   143  		recoveredChunksTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
   144  			Name: "loki_ingester_wal_recovered_chunks_total",
   145  			Help: "Total number of chunks recovered from the WAL checkpoints.",
   146  		}),
   147  		recoveredEntriesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
   148  			Name: "loki_ingester_wal_recovered_entries_total",
   149  			Help: "Total number of entries recovered from the WAL.",
   150  		}),
   151  		duplicateEntriesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
   152  			Name: "loki_ingester_wal_duplicate_entries_total",
   153  			Help: "Entries discarded during WAL replay due to existing in checkpoints.",
   154  		}),
   155  		recoveredBytesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
   156  			Name: "loki_ingester_wal_recovered_bytes_total",
   157  			Help: "Total number of bytes recovered from the WAL.",
   158  		}),
   159  		recoveryBytesInUse: promauto.With(r).NewGauge(prometheus.GaugeOpts{
   160  			Name: "loki_ingester_wal_bytes_in_use",
   161  			Help: "Total number of bytes in use by the WAL recovery process.",
   162  		}),
   163  		recoveryIsFlushing: promauto.With(r).NewGauge(prometheus.GaugeOpts{
   164  			Name: "loki_ingester_wal_replay_flushing",
   165  			Help: "Whether the wal replay is in a flushing phase due to backpressure",
   166  		}),
   167  		limiterEnabled: promauto.With(r).NewGauge(prometheus.GaugeOpts{
   168  			Name: "loki_ingester_limiter_enabled",
   169  			Help: "Whether the ingester's limiter is enabled",
   170  		}),
   171  		autoForgetUnhealthyIngestersTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
   172  			Name: "loki_ingester_autoforget_unhealthy_ingesters_total",
   173  			Help: "Total number of ingesters automatically forgotten",
   174  		}),
   175  		chunkUtilization: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
   176  			Namespace: "loki",
   177  			Name:      "ingester_chunk_utilization",
   178  			Help:      "Distribution of stored chunk utilization (when stored).",
   179  			Buckets:   prometheus.LinearBuckets(0, 0.2, 6),
   180  		}),
   181  		memoryChunks: promauto.With(r).NewGauge(prometheus.GaugeOpts{
   182  			Namespace: "loki",
   183  			Name:      "ingester_memory_chunks",
   184  			Help:      "The total number of chunks in memory.",
   185  		}),
   186  		chunkEntries: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
   187  			Namespace: "loki",
   188  			Name:      "ingester_chunk_entries",
   189  			Help:      "Distribution of stored lines per chunk (when stored).",
   190  			Buckets:   prometheus.ExponentialBuckets(200, 2, 9), // biggest bucket is 200*2^(9-1) = 51200
   191  		}),
   192  		chunkSize: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
   193  			Namespace: "loki",
   194  			Name:      "ingester_chunk_size_bytes",
   195  			Help:      "Distribution of stored chunk sizes (when stored).",
   196  			Buckets:   prometheus.ExponentialBuckets(20000, 2, 10), // biggest bucket is 20000*2^(10-1) = 10,240,000 (~10.2MB)
   197  		}),
   198  		chunkCompressionRatio: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
   199  			Namespace: "loki",
   200  			Name:      "ingester_chunk_compression_ratio",
   201  			Help:      "Compression ratio of chunks (when stored).",
   202  			Buckets:   prometheus.LinearBuckets(.75, 2, 10),
   203  		}),
   204  		chunksPerTenant: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
   205  			Namespace: "loki",
   206  			Name:      "ingester_chunks_stored_total",
   207  			Help:      "Total stored chunks per tenant.",
   208  		}, []string{"tenant"}),
   209  		chunkSizePerTenant: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
   210  			Namespace: "loki",
   211  			Name:      "ingester_chunk_stored_bytes_total",
   212  			Help:      "Total bytes stored in chunks per tenant.",
   213  		}, []string{"tenant"}),
   214  		chunkAge: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
   215  			Namespace: "loki",
   216  			Name:      "ingester_chunk_age_seconds",
   217  			Help:      "Distribution of chunk ages (when stored).",
   218  			// with default settings chunks should flush between 5 min and 12 hours
   219  			// so buckets at 1min, 5min, 10min, 30min, 1hr, 2hr, 4hr, 10hr, 12hr, 16hr
   220  			Buckets: []float64{60, 300, 600, 1800, 3600, 7200, 14400, 36000, 43200, 57600},
   221  		}),
   222  		chunkEncodeTime: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
   223  			Namespace: "loki",
   224  			Name:      "ingester_chunk_encode_time_seconds",
   225  			Help:      "Distribution of chunk encode times.",
   226  			// 10ms to 10s.
   227  			Buckets: prometheus.ExponentialBuckets(0.01, 4, 6),
   228  		}),
   229  		chunksFlushedPerReason: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
   230  			Namespace: "loki",
   231  			Name:      "ingester_chunks_flushed_total",
   232  			Help:      "Total flushed chunks per reason.",
   233  		}, []string{"reason"}),
   234  		chunkLifespan: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
   235  			Namespace: "loki",
   236  			Name:      "ingester_chunk_bounds_hours",
   237  			Help:      "Distribution of chunk end-start durations.",
   238  			// 1h -> 8hr
   239  			Buckets: prometheus.LinearBuckets(1, 1, 8),
   240  		}),
   241  		flushedChunksStats:            usagestats.NewCounter("ingester_flushed_chunks"),
   242  		flushedChunksBytesStats:       usagestats.NewStatistics("ingester_flushed_chunks_bytes"),
   243  		flushedChunksLinesStats:       usagestats.NewStatistics("ingester_flushed_chunks_lines"),
   244  		flushedChunksAgeStats:         usagestats.NewStatistics("ingester_flushed_chunks_age_seconds"),
   245  		flushedChunksLifespanStats:    usagestats.NewStatistics("ingester_flushed_chunks_lifespan_seconds"),
   246  		flushedChunksUtilizationStats: usagestats.NewStatistics("ingester_flushed_chunks_utilization"),
   247  		chunksCreatedTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
   248  			Namespace: "loki",
   249  			Name:      "ingester_chunks_created_total",
   250  			Help:      "The total number of chunks created in the ingester.",
   251  		}),
   252  		samplesPerChunk: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
   253  			Namespace: "loki",
   254  			Subsystem: "ingester",
   255  			Name:      "samples_per_chunk",
   256  			Help:      "The number of samples in a chunk.",
   257  
   258  			Buckets: prometheus.LinearBuckets(4096, 2048, 6),
   259  		}),
   260  		blocksPerChunk: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
   261  			Namespace: "loki",
   262  			Subsystem: "ingester",
   263  			Name:      "blocks_per_chunk",
   264  			Help:      "The number of blocks in a chunk.",
   265  
   266  			Buckets: prometheus.ExponentialBuckets(5, 2, 6),
   267  		}),
   268  
   269  		chunkCreatedStats: usagestats.NewCounter("ingester_chunk_created"),
   270  	}
   271  }