github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/ingester/metrics.go (about) 1 package ingester 2 3 import ( 4 "github.com/prometheus/client_golang/prometheus" 5 "github.com/prometheus/client_golang/prometheus/promauto" 6 7 "github.com/grafana/loki/pkg/usagestats" 8 "github.com/grafana/loki/pkg/validation" 9 ) 10 11 type ingesterMetrics struct { 12 checkpointDeleteFail prometheus.Counter 13 checkpointDeleteTotal prometheus.Counter 14 checkpointCreationFail prometheus.Counter 15 checkpointCreationTotal prometheus.Counter 16 checkpointDuration prometheus.Summary 17 checkpointLoggedBytesTotal prometheus.Counter 18 19 walDiskFullFailures prometheus.Counter 20 walReplayActive prometheus.Gauge 21 walReplayDuration prometheus.Gauge 22 walReplaySamplesDropped *prometheus.CounterVec 23 walReplayBytesDropped *prometheus.CounterVec 24 walCorruptionsTotal *prometheus.CounterVec 25 walLoggedBytesTotal prometheus.Counter 26 walRecordsLogged prometheus.Counter 27 28 recoveredStreamsTotal prometheus.Counter 29 recoveredChunksTotal prometheus.Counter 30 recoveredEntriesTotal prometheus.Counter 31 duplicateEntriesTotal prometheus.Counter 32 recoveredBytesTotal prometheus.Counter 33 recoveryBytesInUse prometheus.Gauge 34 recoveryIsFlushing prometheus.Gauge 35 36 limiterEnabled prometheus.Gauge 37 38 autoForgetUnhealthyIngestersTotal prometheus.Counter 39 40 chunkUtilization prometheus.Histogram 41 memoryChunks prometheus.Gauge 42 chunkEntries prometheus.Histogram 43 chunkSize prometheus.Histogram 44 chunkCompressionRatio prometheus.Histogram 45 chunksPerTenant *prometheus.CounterVec 46 chunkSizePerTenant *prometheus.CounterVec 47 chunkAge prometheus.Histogram 48 chunkEncodeTime prometheus.Histogram 49 chunksFlushedPerReason *prometheus.CounterVec 50 chunkLifespan prometheus.Histogram 51 flushedChunksStats *usagestats.Counter 52 flushedChunksBytesStats *usagestats.Statistics 53 flushedChunksLinesStats *usagestats.Statistics 54 flushedChunksAgeStats *usagestats.Statistics 55 flushedChunksLifespanStats *usagestats.Statistics 56 flushedChunksUtilizationStats *usagestats.Statistics 57 58 chunksCreatedTotal prometheus.Counter 59 samplesPerChunk prometheus.Histogram 60 blocksPerChunk prometheus.Histogram 61 chunkCreatedStats *usagestats.Counter 62 } 63 64 // setRecoveryBytesInUse bounds the bytes reports to >= 0. 65 // TODO(owen-d): we can gain some efficiency by having the flusher never update this after recovery ends. 66 func (m *ingesterMetrics) setRecoveryBytesInUse(v int64) { 67 if v < 0 { 68 v = 0 69 } 70 m.recoveryBytesInUse.Set(float64(v)) 71 } 72 73 const ( 74 walTypeCheckpoint = "checkpoint" 75 walTypeSegment = "segment" 76 77 duplicateReason = "duplicate" 78 ) 79 80 func newIngesterMetrics(r prometheus.Registerer) *ingesterMetrics { 81 return &ingesterMetrics{ 82 walDiskFullFailures: promauto.With(r).NewCounter(prometheus.CounterOpts{ 83 Name: "loki_ingester_wal_disk_full_failures_total", 84 Help: "Total number of wal write failures due to full disk.", 85 }), 86 walReplayActive: promauto.With(r).NewGauge(prometheus.GaugeOpts{ 87 Name: "loki_ingester_wal_replay_active", 88 Help: "Whether the WAL is replaying", 89 }), 90 walReplayDuration: promauto.With(r).NewGauge(prometheus.GaugeOpts{ 91 Name: "loki_ingester_wal_replay_duration_seconds", 92 Help: "Time taken to replay the checkpoint and the WAL.", 93 }), 94 walReplaySamplesDropped: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ 95 Name: "loki_ingester_wal_discarded_samples_total", 96 Help: "WAL segment entries discarded during replay", 97 }, []string{validation.ReasonLabel}), 98 walReplayBytesDropped: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ 99 Name: "loki_ingester_wal_discarded_bytes_total", 100 Help: "WAL segment bytes discarded during replay", 101 }, []string{validation.ReasonLabel}), 102 walCorruptionsTotal: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ 103 Name: "loki_ingester_wal_corruptions_total", 104 Help: "Total number of WAL corruptions encountered.", 105 }, []string{"type"}), 106 checkpointDeleteFail: promauto.With(r).NewCounter(prometheus.CounterOpts{ 107 Name: "loki_ingester_checkpoint_deletions_failed_total", 108 Help: "Total number of checkpoint deletions that failed.", 109 }), 110 checkpointDeleteTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{ 111 Name: "loki_ingester_checkpoint_deletions_total", 112 Help: "Total number of checkpoint deletions attempted.", 113 }), 114 checkpointCreationFail: promauto.With(r).NewCounter(prometheus.CounterOpts{ 115 Name: "loki_ingester_checkpoint_creations_failed_total", 116 Help: "Total number of checkpoint creations that failed.", 117 }), 118 checkpointCreationTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{ 119 Name: "loki_ingester_checkpoint_creations_total", 120 Help: "Total number of checkpoint creations attempted.", 121 }), 122 checkpointDuration: promauto.With(r).NewSummary(prometheus.SummaryOpts{ 123 Name: "loki_ingester_checkpoint_duration_seconds", 124 Help: "Time taken to create a checkpoint.", 125 Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, 126 }), 127 walRecordsLogged: promauto.With(r).NewCounter(prometheus.CounterOpts{ 128 Name: "loki_ingester_wal_records_logged_total", 129 Help: "Total number of WAL records logged.", 130 }), 131 checkpointLoggedBytesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{ 132 Name: "loki_ingester_checkpoint_logged_bytes_total", 133 Help: "Total number of bytes written to disk for checkpointing.", 134 }), 135 walLoggedBytesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{ 136 Name: "loki_ingester_wal_logged_bytes_total", 137 Help: "Total number of bytes written to disk for WAL records.", 138 }), 139 recoveredStreamsTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{ 140 Name: "loki_ingester_wal_recovered_streams_total", 141 Help: "Total number of streams recovered from the WAL.", 142 }), 143 recoveredChunksTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{ 144 Name: "loki_ingester_wal_recovered_chunks_total", 145 Help: "Total number of chunks recovered from the WAL checkpoints.", 146 }), 147 recoveredEntriesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{ 148 Name: "loki_ingester_wal_recovered_entries_total", 149 Help: "Total number of entries recovered from the WAL.", 150 }), 151 duplicateEntriesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{ 152 Name: "loki_ingester_wal_duplicate_entries_total", 153 Help: "Entries discarded during WAL replay due to existing in checkpoints.", 154 }), 155 recoveredBytesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{ 156 Name: "loki_ingester_wal_recovered_bytes_total", 157 Help: "Total number of bytes recovered from the WAL.", 158 }), 159 recoveryBytesInUse: promauto.With(r).NewGauge(prometheus.GaugeOpts{ 160 Name: "loki_ingester_wal_bytes_in_use", 161 Help: "Total number of bytes in use by the WAL recovery process.", 162 }), 163 recoveryIsFlushing: promauto.With(r).NewGauge(prometheus.GaugeOpts{ 164 Name: "loki_ingester_wal_replay_flushing", 165 Help: "Whether the wal replay is in a flushing phase due to backpressure", 166 }), 167 limiterEnabled: promauto.With(r).NewGauge(prometheus.GaugeOpts{ 168 Name: "loki_ingester_limiter_enabled", 169 Help: "Whether the ingester's limiter is enabled", 170 }), 171 autoForgetUnhealthyIngestersTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{ 172 Name: "loki_ingester_autoforget_unhealthy_ingesters_total", 173 Help: "Total number of ingesters automatically forgotten", 174 }), 175 chunkUtilization: promauto.With(r).NewHistogram(prometheus.HistogramOpts{ 176 Namespace: "loki", 177 Name: "ingester_chunk_utilization", 178 Help: "Distribution of stored chunk utilization (when stored).", 179 Buckets: prometheus.LinearBuckets(0, 0.2, 6), 180 }), 181 memoryChunks: promauto.With(r).NewGauge(prometheus.GaugeOpts{ 182 Namespace: "loki", 183 Name: "ingester_memory_chunks", 184 Help: "The total number of chunks in memory.", 185 }), 186 chunkEntries: promauto.With(r).NewHistogram(prometheus.HistogramOpts{ 187 Namespace: "loki", 188 Name: "ingester_chunk_entries", 189 Help: "Distribution of stored lines per chunk (when stored).", 190 Buckets: prometheus.ExponentialBuckets(200, 2, 9), // biggest bucket is 200*2^(9-1) = 51200 191 }), 192 chunkSize: promauto.With(r).NewHistogram(prometheus.HistogramOpts{ 193 Namespace: "loki", 194 Name: "ingester_chunk_size_bytes", 195 Help: "Distribution of stored chunk sizes (when stored).", 196 Buckets: prometheus.ExponentialBuckets(20000, 2, 10), // biggest bucket is 20000*2^(10-1) = 10,240,000 (~10.2MB) 197 }), 198 chunkCompressionRatio: promauto.With(r).NewHistogram(prometheus.HistogramOpts{ 199 Namespace: "loki", 200 Name: "ingester_chunk_compression_ratio", 201 Help: "Compression ratio of chunks (when stored).", 202 Buckets: prometheus.LinearBuckets(.75, 2, 10), 203 }), 204 chunksPerTenant: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ 205 Namespace: "loki", 206 Name: "ingester_chunks_stored_total", 207 Help: "Total stored chunks per tenant.", 208 }, []string{"tenant"}), 209 chunkSizePerTenant: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ 210 Namespace: "loki", 211 Name: "ingester_chunk_stored_bytes_total", 212 Help: "Total bytes stored in chunks per tenant.", 213 }, []string{"tenant"}), 214 chunkAge: promauto.With(r).NewHistogram(prometheus.HistogramOpts{ 215 Namespace: "loki", 216 Name: "ingester_chunk_age_seconds", 217 Help: "Distribution of chunk ages (when stored).", 218 // with default settings chunks should flush between 5 min and 12 hours 219 // so buckets at 1min, 5min, 10min, 30min, 1hr, 2hr, 4hr, 10hr, 12hr, 16hr 220 Buckets: []float64{60, 300, 600, 1800, 3600, 7200, 14400, 36000, 43200, 57600}, 221 }), 222 chunkEncodeTime: promauto.With(r).NewHistogram(prometheus.HistogramOpts{ 223 Namespace: "loki", 224 Name: "ingester_chunk_encode_time_seconds", 225 Help: "Distribution of chunk encode times.", 226 // 10ms to 10s. 227 Buckets: prometheus.ExponentialBuckets(0.01, 4, 6), 228 }), 229 chunksFlushedPerReason: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ 230 Namespace: "loki", 231 Name: "ingester_chunks_flushed_total", 232 Help: "Total flushed chunks per reason.", 233 }, []string{"reason"}), 234 chunkLifespan: promauto.With(r).NewHistogram(prometheus.HistogramOpts{ 235 Namespace: "loki", 236 Name: "ingester_chunk_bounds_hours", 237 Help: "Distribution of chunk end-start durations.", 238 // 1h -> 8hr 239 Buckets: prometheus.LinearBuckets(1, 1, 8), 240 }), 241 flushedChunksStats: usagestats.NewCounter("ingester_flushed_chunks"), 242 flushedChunksBytesStats: usagestats.NewStatistics("ingester_flushed_chunks_bytes"), 243 flushedChunksLinesStats: usagestats.NewStatistics("ingester_flushed_chunks_lines"), 244 flushedChunksAgeStats: usagestats.NewStatistics("ingester_flushed_chunks_age_seconds"), 245 flushedChunksLifespanStats: usagestats.NewStatistics("ingester_flushed_chunks_lifespan_seconds"), 246 flushedChunksUtilizationStats: usagestats.NewStatistics("ingester_flushed_chunks_utilization"), 247 chunksCreatedTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{ 248 Namespace: "loki", 249 Name: "ingester_chunks_created_total", 250 Help: "The total number of chunks created in the ingester.", 251 }), 252 samplesPerChunk: promauto.With(r).NewHistogram(prometheus.HistogramOpts{ 253 Namespace: "loki", 254 Subsystem: "ingester", 255 Name: "samples_per_chunk", 256 Help: "The number of samples in a chunk.", 257 258 Buckets: prometheus.LinearBuckets(4096, 2048, 6), 259 }), 260 blocksPerChunk: promauto.With(r).NewHistogram(prometheus.HistogramOpts{ 261 Namespace: "loki", 262 Subsystem: "ingester", 263 Name: "blocks_per_chunk", 264 Help: "The number of blocks in a chunk.", 265 266 Buckets: prometheus.ExponentialBuckets(5, 2, 6), 267 }), 268 269 chunkCreatedStats: usagestats.NewCounter("ingester_chunk_created"), 270 } 271 }