//                           _       _
// __      _____  __ ___   ___  __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
//  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
//   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
//  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
//  CONTACT: hello@weaviate.io
//

package monitoring

import (
	"sync"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/weaviate/weaviate/usecases/config"
)

// PrometheusMetrics bundles every Prometheus collector exposed by this
// package. A single instance is built by newPrometheusMetrics and handed out
// via GetMetrics. Most collectors are label vectors keyed by "class_name"
// and, where collected per shard, "shard_name"; DeleteShard and DeleteClass
// rely on those labels to drop stale series.
type PrometheusMetrics struct {
	// Batch, object, LSM store, query and backup/restore metrics.
	BatchTime                         *prometheus.HistogramVec
	BatchDeleteTime                   *prometheus.SummaryVec
	ObjectsTime                       *prometheus.SummaryVec
	LSMBloomFilters                   *prometheus.SummaryVec
	AsyncOperations                   *prometheus.GaugeVec
	LSMSegmentCount                   *prometheus.GaugeVec
	LSMSegmentCountByLevel            *prometheus.GaugeVec
	LSMSegmentObjects                 *prometheus.GaugeVec
	LSMSegmentSize                    *prometheus.GaugeVec
	LSMMemtableSize                   *prometheus.GaugeVec
	LSMMemtableDurations              *prometheus.SummaryVec
	ObjectCount                       *prometheus.GaugeVec
	QueriesCount                      *prometheus.GaugeVec
	RequestsTotal                     *prometheus.GaugeVec
	QueriesDurations                  *prometheus.HistogramVec
	QueriesFilteredVectorDurations    *prometheus.SummaryVec
	QueryDimensions                   *prometheus.CounterVec
	QueryDimensionsCombined           prometheus.Counter
	GoroutinesCount                   *prometheus.GaugeVec
	BackupRestoreDurations            *prometheus.SummaryVec
	BackupStoreDurations              *prometheus.SummaryVec
	BucketPauseDurations              *prometheus.SummaryVec
	BackupRestoreClassDurations       *prometheus.SummaryVec
	BackupRestoreBackupInitDurations  *prometheus.SummaryVec
	BackupRestoreFromStorageDurations *prometheus.SummaryVec
	BackupRestoreDataTransferred      *prometheus.CounterVec
	BackupStoreDataTransferred        *prometheus.CounterVec

	// Vector index metrics.
	VectorIndexTombstones              *prometheus.GaugeVec
	VectorIndexTombstoneCleanupThreads *prometheus.GaugeVec
	VectorIndexTombstoneCleanedCount   *prometheus.CounterVec
	VectorIndexOperations              *prometheus.GaugeVec
	VectorIndexDurations               *prometheus.SummaryVec
	VectorIndexSize                    *prometheus.GaugeVec
	VectorIndexMaintenanceDurations    *prometheus.SummaryVec
	VectorDimensionsSum                *prometheus.GaugeVec
	VectorSegmentsSum                  *prometheus.GaugeVec
	VectorDimensionsSumByVector        *prometheus.GaugeVec
	VectorSegmentsSumByVector          *prometheus.GaugeVec

	// Startup metrics.
	StartupProgress  *prometheus.GaugeVec
	StartupDurations *prometheus.SummaryVec
	StartupDiskIO    *prometheus.SummaryVec

	// Shard load-state metrics, labelled by class_name only.
	ShardsLoaded    *prometheus.GaugeVec
	ShardsUnloaded  *prometheus.GaugeVec
	ShardsLoading   *prometheus.GaugeVec
	ShardsUnloading *prometheus.GaugeVec

	// Group is copied from config.Monitoring.Group by InitConfig. How it is
	// consumed is not visible in this file.
	Group bool
}

// DeleteShard deletes existing label combinations that match both
// the shard and class name. If a metric is not collected at the shard
// level it is unaffected. This is to make sure that deleting a single
// shard (e.g. multi-tenancy) does not affect metrics for existing
// shards.
//
// In addition, there are some metrics that we explicitly keep, such
// as vector_dimensions_sum as they can be used in billing decisions.
83 func (pm *PrometheusMetrics) DeleteShard(className, shardName string) error { 84 if pm == nil { 85 return nil 86 } 87 88 labels := prometheus.Labels{ 89 "class_name": className, 90 "shard_name": shardName, 91 } 92 pm.BatchTime.DeletePartialMatch(labels) 93 pm.BatchDeleteTime.DeletePartialMatch(labels) 94 pm.ObjectsTime.DeletePartialMatch(labels) 95 pm.ObjectCount.DeletePartialMatch(labels) 96 pm.QueriesFilteredVectorDurations.DeletePartialMatch(labels) 97 pm.AsyncOperations.DeletePartialMatch(labels) 98 pm.LSMBloomFilters.DeletePartialMatch(labels) 99 pm.LSMMemtableDurations.DeletePartialMatch(labels) 100 pm.LSMMemtableSize.DeletePartialMatch(labels) 101 pm.LSMMemtableDurations.DeletePartialMatch(labels) 102 pm.LSMSegmentCount.DeletePartialMatch(labels) 103 pm.LSMSegmentSize.DeletePartialMatch(labels) 104 pm.LSMSegmentCountByLevel.DeletePartialMatch(labels) 105 pm.VectorIndexTombstones.DeletePartialMatch(labels) 106 pm.VectorIndexTombstoneCleanupThreads.DeletePartialMatch(labels) 107 pm.VectorIndexTombstoneCleanedCount.DeletePartialMatch(labels) 108 pm.VectorIndexOperations.DeletePartialMatch(labels) 109 pm.VectorIndexMaintenanceDurations.DeletePartialMatch(labels) 110 pm.VectorIndexDurations.DeletePartialMatch(labels) 111 pm.VectorIndexSize.DeletePartialMatch(labels) 112 pm.StartupProgress.DeletePartialMatch(labels) 113 pm.StartupDurations.DeletePartialMatch(labels) 114 pm.StartupDiskIO.DeletePartialMatch(labels) 115 return nil 116 } 117 118 // DeleteClass deletes all metrics that match the class name, but do 119 // not have a shard-specific label. See [DeleteShard] for more 120 // information. 
121 func (pm *PrometheusMetrics) DeleteClass(className string) error { 122 if pm == nil { 123 return nil 124 } 125 126 labels := prometheus.Labels{ 127 "class_name": className, 128 } 129 pm.QueriesCount.DeletePartialMatch(labels) 130 pm.QueriesDurations.DeletePartialMatch(labels) 131 pm.GoroutinesCount.DeletePartialMatch(labels) 132 pm.BackupRestoreClassDurations.DeletePartialMatch(labels) 133 pm.BackupRestoreBackupInitDurations.DeletePartialMatch(labels) 134 pm.BackupRestoreFromStorageDurations.DeletePartialMatch(labels) 135 pm.BackupStoreDurations.DeletePartialMatch(labels) 136 pm.BackupRestoreDataTransferred.DeletePartialMatch(labels) 137 pm.BackupStoreDataTransferred.DeletePartialMatch(labels) 138 pm.QueriesFilteredVectorDurations.DeletePartialMatch(labels) 139 140 return nil 141 } 142 143 var ( 144 msBuckets = []float64{10, 50, 100, 500, 1000, 5000} 145 metrics *PrometheusMetrics = nil 146 ) 147 148 func init() { 149 metrics = newPrometheusMetrics() 150 } 151 152 func InitConfig(cfg config.Monitoring) { 153 metrics.Group = cfg.Group 154 } 155 156 func GetMetrics() *PrometheusMetrics { 157 return metrics 158 } 159 160 func newPrometheusMetrics() *PrometheusMetrics { 161 return &PrometheusMetrics{ 162 BatchTime: promauto.NewHistogramVec(prometheus.HistogramOpts{ 163 Name: "batch_durations_ms", 164 Help: "Duration in ms of a single batch", 165 Buckets: msBuckets, 166 }, []string{"operation", "class_name", "shard_name"}), 167 BatchDeleteTime: promauto.NewSummaryVec(prometheus.SummaryOpts{ 168 Name: "batch_delete_durations_ms", 169 Help: "Duration in ms of a single delete batch", 170 }, []string{"operation", "class_name", "shard_name"}), 171 172 ObjectsTime: promauto.NewSummaryVec(prometheus.SummaryOpts{ 173 Name: "objects_durations_ms", 174 Help: "Duration of an individual object operation. 
Also as part of batches.", 175 }, []string{"operation", "step", "class_name", "shard_name"}), 176 ObjectCount: promauto.NewGaugeVec(prometheus.GaugeOpts{ 177 Name: "object_count", 178 Help: "Number of currently ongoing async operations", 179 }, []string{"class_name", "shard_name"}), 180 181 QueriesCount: promauto.NewGaugeVec(prometheus.GaugeOpts{ 182 Name: "concurrent_queries_count", 183 Help: "Number of concurrently running query operations", 184 }, []string{"class_name", "query_type"}), 185 186 RequestsTotal: promauto.NewGaugeVec(prometheus.GaugeOpts{ 187 Name: "requests_total", 188 Help: "Number of all requests made", 189 }, []string{"status", "class_name", "api", "query_type"}), 190 191 QueriesDurations: promauto.NewHistogramVec(prometheus.HistogramOpts{ 192 Name: "queries_durations_ms", 193 Help: "Duration of queries in milliseconds", 194 Buckets: msBuckets, 195 }, []string{"class_name", "query_type"}), 196 197 QueriesFilteredVectorDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{ 198 Name: "queries_filtered_vector_durations_ms", 199 Help: "Duration of queries in milliseconds", 200 }, []string{"class_name", "shard_name", "operation"}), 201 202 GoroutinesCount: promauto.NewGaugeVec(prometheus.GaugeOpts{ 203 Name: "concurrent_goroutines", 204 Help: "Number of concurrently running goroutines", 205 }, []string{"class_name", "query_type"}), 206 207 AsyncOperations: promauto.NewGaugeVec(prometheus.GaugeOpts{ 208 Name: "async_operations_running", 209 Help: "Number of currently ongoing async operations", 210 }, []string{"operation", "class_name", "shard_name", "path"}), 211 212 // LSM metrics 213 LSMSegmentCount: promauto.NewGaugeVec(prometheus.GaugeOpts{ 214 Name: "lsm_active_segments", 215 Help: "Number of currently present segments per shard", 216 }, []string{"strategy", "class_name", "shard_name", "path"}), 217 LSMBloomFilters: promauto.NewSummaryVec(prometheus.SummaryOpts{ 218 Name: "lsm_bloom_filters_duration_ms", 219 Help: "Duration of bloom filter 
operations", 220 }, []string{"operation", "strategy", "class_name", "shard_name"}), 221 LSMSegmentObjects: promauto.NewGaugeVec(prometheus.GaugeOpts{ 222 Name: "lsm_segment_objects", 223 Help: "Number of objects/entries of segment by level", 224 }, []string{"strategy", "class_name", "shard_name", "path", "level"}), 225 LSMSegmentSize: promauto.NewGaugeVec(prometheus.GaugeOpts{ 226 Name: "lsm_segment_size", 227 Help: "Size of segment by level and unit", 228 }, []string{"strategy", "class_name", "shard_name", "path", "level", "unit"}), 229 LSMSegmentCountByLevel: promauto.NewGaugeVec(prometheus.GaugeOpts{ 230 Name: "lsm_segment_count", 231 Help: "Number of segments by level", 232 }, []string{"strategy", "class_name", "shard_name", "path", "level"}), 233 LSMMemtableSize: promauto.NewGaugeVec(prometheus.GaugeOpts{ 234 Name: "lsm_memtable_size", 235 Help: "Size of memtable by path", 236 }, []string{"strategy", "class_name", "shard_name", "path"}), 237 LSMMemtableDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{ 238 Name: "lsm_memtable_durations_ms", 239 Help: "Time in ms for a bucket operation to complete", 240 }, []string{"strategy", "class_name", "shard_name", "path", "operation"}), 241 242 // Vector index metrics 243 VectorIndexTombstones: promauto.NewGaugeVec(prometheus.GaugeOpts{ 244 Name: "vector_index_tombstones", 245 Help: "Number of active vector index tombstones", 246 }, []string{"class_name", "shard_name"}), 247 VectorIndexTombstoneCleanupThreads: promauto.NewGaugeVec(prometheus.GaugeOpts{ 248 Name: "vector_index_tombstone_cleanup_threads", 249 Help: "Number of threads in use to clean up tombstones", 250 }, []string{"class_name", "shard_name"}), 251 VectorIndexTombstoneCleanedCount: promauto.NewCounterVec(prometheus.CounterOpts{ 252 Name: "vector_index_tombstone_cleaned", 253 Help: "Total number of deleted objects that have been cleaned up", 254 }, []string{"class_name", "shard_name"}), 255 VectorIndexOperations: 
promauto.NewGaugeVec(prometheus.GaugeOpts{ 256 Name: "vector_index_operations", 257 Help: "Total number of mutating operations on the vector index", 258 }, []string{"operation", "class_name", "shard_name"}), 259 VectorIndexSize: promauto.NewGaugeVec(prometheus.GaugeOpts{ 260 Name: "vector_index_size", 261 Help: "The size of the vector index. Typically larger than number of vectors, as it grows proactively.", 262 }, []string{"class_name", "shard_name"}), 263 VectorIndexMaintenanceDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{ 264 Name: "vector_index_maintenance_durations_ms", 265 Help: "Duration of a sync or async vector index maintenance operation", 266 }, []string{"operation", "class_name", "shard_name"}), 267 VectorIndexDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{ 268 Name: "vector_index_durations_ms", 269 Help: "Duration of typical vector index operations (insert, delete)", 270 }, []string{"operation", "step", "class_name", "shard_name"}), 271 VectorDimensionsSum: promauto.NewGaugeVec(prometheus.GaugeOpts{ 272 Name: "vector_dimensions_sum", 273 Help: "Total dimensions in a shard", 274 }, []string{"class_name", "shard_name"}), 275 VectorSegmentsSum: promauto.NewGaugeVec(prometheus.GaugeOpts{ 276 Name: "vector_segments_sum", 277 Help: "Total segments in a shard if quantization enabled", 278 }, []string{"class_name", "shard_name"}), 279 VectorDimensionsSumByVector: promauto.NewGaugeVec(prometheus.GaugeOpts{ 280 Name: "vector_dimensions_sum_by_vector", 281 Help: "Total dimensions in a shard for target vector", 282 }, []string{"class_name", "shard_name", "target_vector"}), 283 VectorSegmentsSumByVector: promauto.NewGaugeVec(prometheus.GaugeOpts{ 284 Name: "vector_segments_sum_by_vector", 285 Help: "Total segments in a shard for target vector if quantization enabled", 286 }, []string{"class_name", "shard_name", "target_vector"}), 287 288 // Startup metrics 289 StartupProgress: promauto.NewGaugeVec(prometheus.GaugeOpts{ 290 Name: 
"startup_progress", 291 Help: "A ratio (percentage) of startup progress for a particular component in a shard", 292 }, []string{"operation", "class_name", "shard_name"}), 293 StartupDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{ 294 Name: "startup_durations_ms", 295 Help: "Duration of individual startup operations in ms", 296 }, []string{"operation", "class_name", "shard_name"}), 297 StartupDiskIO: promauto.NewSummaryVec(prometheus.SummaryOpts{ 298 Name: "startup_diskio_throughput", 299 Help: "Disk I/O throuhput in bytes per second", 300 }, []string{"operation", "class_name", "shard_name"}), 301 QueryDimensions: promauto.NewCounterVec(prometheus.CounterOpts{ 302 Name: "query_dimensions_total", 303 Help: "The vector dimensions used by any read-query that involves vectors", 304 }, []string{"query_type", "operation", "class_name"}), 305 QueryDimensionsCombined: promauto.NewCounter(prometheus.CounterOpts{ 306 Name: "query_dimensions_combined_total", 307 Help: "The vector dimensions used by any read-query that involves vectors, aggregated across all classes and shards. 
The sum of all labels for query_dimensions_total should always match this labelless metric", 308 }), 309 310 // Backup/restore metrics 311 BackupRestoreDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{ 312 Name: "backup_restore_ms", 313 Help: "Duration of a backup restore", 314 }, []string{"backend_name", "class_name"}), 315 BackupRestoreClassDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{ 316 Name: "backup_restore_class_ms", 317 Help: "Duration restoring class", 318 }, []string{"class_name"}), 319 BackupRestoreBackupInitDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{ 320 Name: "backup_restore_init_ms", 321 Help: "startup phase of a backup restore", 322 }, []string{"backend_name", "class_name"}), 323 BackupRestoreFromStorageDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{ 324 Name: "backup_restore_from_backend_ms", 325 Help: "file transfer stage of a backup restore", 326 }, []string{"backend_name", "class_name"}), 327 BackupStoreDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{ 328 Name: "backup_store_to_backend_ms", 329 Help: "file transfer stage of a backup restore", 330 }, []string{"backend_name", "class_name"}), 331 BucketPauseDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{ 332 Name: "bucket_pause_durations_ms", 333 Help: "bucket pause durations", 334 }, []string{"bucket_dir"}), 335 BackupRestoreDataTransferred: promauto.NewCounterVec(prometheus.CounterOpts{ 336 Name: "backup_restore_data_transferred", 337 Help: "Total number of bytes transferred during a backup restore", 338 }, []string{"backend_name", "class_name"}), 339 BackupStoreDataTransferred: promauto.NewCounterVec(prometheus.CounterOpts{ 340 Name: "backup_store_data_transferred", 341 Help: "Total number of bytes transferred during a backup store", 342 }, []string{"backend_name", "class_name"}), 343 344 // Shard metrics 345 ShardsLoaded: promauto.NewGaugeVec(prometheus.GaugeOpts{ 346 Name: "shards_loaded", 347 Help: "Number of shards loaded", 348 }, 
[]string{"class_name"}), 349 ShardsUnloaded: promauto.NewGaugeVec(prometheus.GaugeOpts{ 350 Name: "shards_unloaded", 351 Help: "Number of shards on not loaded", 352 }, []string{"class_name"}), 353 ShardsLoading: promauto.NewGaugeVec(prometheus.GaugeOpts{ 354 Name: "shards_loading", 355 Help: "Number of shards in process of loading", 356 }, []string{"class_name"}), 357 ShardsUnloading: promauto.NewGaugeVec(prometheus.GaugeOpts{ 358 Name: "shards_unloading", 359 Help: "Number of shards in process of unloading", 360 }, []string{"class_name"}), 361 } 362 } 363 364 type OnceUponATimer struct { 365 sync.Once 366 Timer *prometheus.Timer 367 } 368 369 func NewOnceTimer(promTimer *prometheus.Timer) *OnceUponATimer { 370 o := OnceUponATimer{} 371 o.Timer = promTimer 372 return &o 373 } 374 375 func (o *OnceUponATimer) ObserveDurationOnce() { 376 o.Do(func() { 377 o.Timer.ObserveDuration() 378 }) 379 }