github.com/weaviate/weaviate@v1.24.6/usecases/monitoring/prometheus.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package monitoring
    13  
    14  import (
    15  	"sync"
    16  
    17  	"github.com/prometheus/client_golang/prometheus"
    18  	"github.com/prometheus/client_golang/prometheus/promauto"
    19  	"github.com/weaviate/weaviate/usecases/config"
    20  )
    21  
    22  type PrometheusMetrics struct {
    23  	BatchTime                         *prometheus.HistogramVec
    24  	BatchDeleteTime                   *prometheus.SummaryVec
    25  	ObjectsTime                       *prometheus.SummaryVec
    26  	LSMBloomFilters                   *prometheus.SummaryVec
    27  	AsyncOperations                   *prometheus.GaugeVec
    28  	LSMSegmentCount                   *prometheus.GaugeVec
    29  	LSMSegmentCountByLevel            *prometheus.GaugeVec
    30  	LSMSegmentObjects                 *prometheus.GaugeVec
    31  	LSMSegmentSize                    *prometheus.GaugeVec
    32  	LSMMemtableSize                   *prometheus.GaugeVec
    33  	LSMMemtableDurations              *prometheus.SummaryVec
    34  	ObjectCount                       *prometheus.GaugeVec
    35  	QueriesCount                      *prometheus.GaugeVec
    36  	RequestsTotal                     *prometheus.GaugeVec
    37  	QueriesDurations                  *prometheus.HistogramVec
    38  	QueriesFilteredVectorDurations    *prometheus.SummaryVec
    39  	QueryDimensions                   *prometheus.CounterVec
    40  	QueryDimensionsCombined           prometheus.Counter
    41  	GoroutinesCount                   *prometheus.GaugeVec
    42  	BackupRestoreDurations            *prometheus.SummaryVec
    43  	BackupStoreDurations              *prometheus.SummaryVec
    44  	BucketPauseDurations              *prometheus.SummaryVec
    45  	BackupRestoreClassDurations       *prometheus.SummaryVec
    46  	BackupRestoreBackupInitDurations  *prometheus.SummaryVec
    47  	BackupRestoreFromStorageDurations *prometheus.SummaryVec
    48  	BackupRestoreDataTransferred      *prometheus.CounterVec
    49  	BackupStoreDataTransferred        *prometheus.CounterVec
    50  
    51  	VectorIndexTombstones              *prometheus.GaugeVec
    52  	VectorIndexTombstoneCleanupThreads *prometheus.GaugeVec
    53  	VectorIndexTombstoneCleanedCount   *prometheus.CounterVec
    54  	VectorIndexOperations              *prometheus.GaugeVec
    55  	VectorIndexDurations               *prometheus.SummaryVec
    56  	VectorIndexSize                    *prometheus.GaugeVec
    57  	VectorIndexMaintenanceDurations    *prometheus.SummaryVec
    58  	VectorDimensionsSum                *prometheus.GaugeVec
    59  	VectorSegmentsSum                  *prometheus.GaugeVec
    60  	VectorDimensionsSumByVector        *prometheus.GaugeVec
    61  	VectorSegmentsSumByVector          *prometheus.GaugeVec
    62  
    63  	StartupProgress  *prometheus.GaugeVec
    64  	StartupDurations *prometheus.SummaryVec
    65  	StartupDiskIO    *prometheus.SummaryVec
    66  
    67  	ShardsLoaded    *prometheus.GaugeVec
    68  	ShardsUnloaded  *prometheus.GaugeVec
    69  	ShardsLoading   *prometheus.GaugeVec
    70  	ShardsUnloading *prometheus.GaugeVec
    71  
    72  	Group bool
    73  }
    74  
    75  // Delete Shard deletes existing label combinations that match both
    76  // the shard and class name. If a metric is not collected at the shard
    77  // level it is unaffected. This is to make sure that deleting a single
    78  // shard (e.g. multi-tenancy) does not affect metrics for existing
    79  // shards.
    80  //
    81  // In addition, there are some metrics that we explicitly keep, such
    82  // as vector_dimensions_sum as they can be used in billing decisions.
    83  func (pm *PrometheusMetrics) DeleteShard(className, shardName string) error {
    84  	if pm == nil {
    85  		return nil
    86  	}
    87  
    88  	labels := prometheus.Labels{
    89  		"class_name": className,
    90  		"shard_name": shardName,
    91  	}
    92  	pm.BatchTime.DeletePartialMatch(labels)
    93  	pm.BatchDeleteTime.DeletePartialMatch(labels)
    94  	pm.ObjectsTime.DeletePartialMatch(labels)
    95  	pm.ObjectCount.DeletePartialMatch(labels)
    96  	pm.QueriesFilteredVectorDurations.DeletePartialMatch(labels)
    97  	pm.AsyncOperations.DeletePartialMatch(labels)
    98  	pm.LSMBloomFilters.DeletePartialMatch(labels)
    99  	pm.LSMMemtableDurations.DeletePartialMatch(labels)
   100  	pm.LSMMemtableSize.DeletePartialMatch(labels)
   101  	pm.LSMMemtableDurations.DeletePartialMatch(labels)
   102  	pm.LSMSegmentCount.DeletePartialMatch(labels)
   103  	pm.LSMSegmentSize.DeletePartialMatch(labels)
   104  	pm.LSMSegmentCountByLevel.DeletePartialMatch(labels)
   105  	pm.VectorIndexTombstones.DeletePartialMatch(labels)
   106  	pm.VectorIndexTombstoneCleanupThreads.DeletePartialMatch(labels)
   107  	pm.VectorIndexTombstoneCleanedCount.DeletePartialMatch(labels)
   108  	pm.VectorIndexOperations.DeletePartialMatch(labels)
   109  	pm.VectorIndexMaintenanceDurations.DeletePartialMatch(labels)
   110  	pm.VectorIndexDurations.DeletePartialMatch(labels)
   111  	pm.VectorIndexSize.DeletePartialMatch(labels)
   112  	pm.StartupProgress.DeletePartialMatch(labels)
   113  	pm.StartupDurations.DeletePartialMatch(labels)
   114  	pm.StartupDiskIO.DeletePartialMatch(labels)
   115  	return nil
   116  }
   117  
   118  // DeleteClass deletes all metrics that match the class name, but do
   119  // not have a shard-specific label. See [DeleteShard] for more
   120  // information.
   121  func (pm *PrometheusMetrics) DeleteClass(className string) error {
   122  	if pm == nil {
   123  		return nil
   124  	}
   125  
   126  	labels := prometheus.Labels{
   127  		"class_name": className,
   128  	}
   129  	pm.QueriesCount.DeletePartialMatch(labels)
   130  	pm.QueriesDurations.DeletePartialMatch(labels)
   131  	pm.GoroutinesCount.DeletePartialMatch(labels)
   132  	pm.BackupRestoreClassDurations.DeletePartialMatch(labels)
   133  	pm.BackupRestoreBackupInitDurations.DeletePartialMatch(labels)
   134  	pm.BackupRestoreFromStorageDurations.DeletePartialMatch(labels)
   135  	pm.BackupStoreDurations.DeletePartialMatch(labels)
   136  	pm.BackupRestoreDataTransferred.DeletePartialMatch(labels)
   137  	pm.BackupStoreDataTransferred.DeletePartialMatch(labels)
   138  	pm.QueriesFilteredVectorDurations.DeletePartialMatch(labels)
   139  
   140  	return nil
   141  }
   142  
   143  var (
   144  	msBuckets                    = []float64{10, 50, 100, 500, 1000, 5000}
   145  	metrics   *PrometheusMetrics = nil
   146  )
   147  
   148  func init() {
   149  	metrics = newPrometheusMetrics()
   150  }
   151  
   152  func InitConfig(cfg config.Monitoring) {
   153  	metrics.Group = cfg.Group
   154  }
   155  
   156  func GetMetrics() *PrometheusMetrics {
   157  	return metrics
   158  }
   159  
   160  func newPrometheusMetrics() *PrometheusMetrics {
   161  	return &PrometheusMetrics{
   162  		BatchTime: promauto.NewHistogramVec(prometheus.HistogramOpts{
   163  			Name:    "batch_durations_ms",
   164  			Help:    "Duration in ms of a single batch",
   165  			Buckets: msBuckets,
   166  		}, []string{"operation", "class_name", "shard_name"}),
   167  		BatchDeleteTime: promauto.NewSummaryVec(prometheus.SummaryOpts{
   168  			Name: "batch_delete_durations_ms",
   169  			Help: "Duration in ms of a single delete batch",
   170  		}, []string{"operation", "class_name", "shard_name"}),
   171  
   172  		ObjectsTime: promauto.NewSummaryVec(prometheus.SummaryOpts{
   173  			Name: "objects_durations_ms",
   174  			Help: "Duration of an individual object operation. Also as part of batches.",
   175  		}, []string{"operation", "step", "class_name", "shard_name"}),
   176  		ObjectCount: promauto.NewGaugeVec(prometheus.GaugeOpts{
   177  			Name: "object_count",
   178  			Help: "Number of currently ongoing async operations",
   179  		}, []string{"class_name", "shard_name"}),
   180  
   181  		QueriesCount: promauto.NewGaugeVec(prometheus.GaugeOpts{
   182  			Name: "concurrent_queries_count",
   183  			Help: "Number of concurrently running query operations",
   184  		}, []string{"class_name", "query_type"}),
   185  
   186  		RequestsTotal: promauto.NewGaugeVec(prometheus.GaugeOpts{
   187  			Name: "requests_total",
   188  			Help: "Number of all requests made",
   189  		}, []string{"status", "class_name", "api", "query_type"}),
   190  
   191  		QueriesDurations: promauto.NewHistogramVec(prometheus.HistogramOpts{
   192  			Name:    "queries_durations_ms",
   193  			Help:    "Duration of queries in milliseconds",
   194  			Buckets: msBuckets,
   195  		}, []string{"class_name", "query_type"}),
   196  
   197  		QueriesFilteredVectorDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{
   198  			Name: "queries_filtered_vector_durations_ms",
   199  			Help: "Duration of queries in milliseconds",
   200  		}, []string{"class_name", "shard_name", "operation"}),
   201  
   202  		GoroutinesCount: promauto.NewGaugeVec(prometheus.GaugeOpts{
   203  			Name: "concurrent_goroutines",
   204  			Help: "Number of concurrently running goroutines",
   205  		}, []string{"class_name", "query_type"}),
   206  
   207  		AsyncOperations: promauto.NewGaugeVec(prometheus.GaugeOpts{
   208  			Name: "async_operations_running",
   209  			Help: "Number of currently ongoing async operations",
   210  		}, []string{"operation", "class_name", "shard_name", "path"}),
   211  
   212  		// LSM metrics
   213  		LSMSegmentCount: promauto.NewGaugeVec(prometheus.GaugeOpts{
   214  			Name: "lsm_active_segments",
   215  			Help: "Number of currently present segments per shard",
   216  		}, []string{"strategy", "class_name", "shard_name", "path"}),
   217  		LSMBloomFilters: promauto.NewSummaryVec(prometheus.SummaryOpts{
   218  			Name: "lsm_bloom_filters_duration_ms",
   219  			Help: "Duration of bloom filter operations",
   220  		}, []string{"operation", "strategy", "class_name", "shard_name"}),
   221  		LSMSegmentObjects: promauto.NewGaugeVec(prometheus.GaugeOpts{
   222  			Name: "lsm_segment_objects",
   223  			Help: "Number of objects/entries of segment by level",
   224  		}, []string{"strategy", "class_name", "shard_name", "path", "level"}),
   225  		LSMSegmentSize: promauto.NewGaugeVec(prometheus.GaugeOpts{
   226  			Name: "lsm_segment_size",
   227  			Help: "Size of segment by level and unit",
   228  		}, []string{"strategy", "class_name", "shard_name", "path", "level", "unit"}),
   229  		LSMSegmentCountByLevel: promauto.NewGaugeVec(prometheus.GaugeOpts{
   230  			Name: "lsm_segment_count",
   231  			Help: "Number of segments by level",
   232  		}, []string{"strategy", "class_name", "shard_name", "path", "level"}),
   233  		LSMMemtableSize: promauto.NewGaugeVec(prometheus.GaugeOpts{
   234  			Name: "lsm_memtable_size",
   235  			Help: "Size of memtable by path",
   236  		}, []string{"strategy", "class_name", "shard_name", "path"}),
   237  		LSMMemtableDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{
   238  			Name: "lsm_memtable_durations_ms",
   239  			Help: "Time in ms for a bucket operation to complete",
   240  		}, []string{"strategy", "class_name", "shard_name", "path", "operation"}),
   241  
   242  		// Vector index metrics
   243  		VectorIndexTombstones: promauto.NewGaugeVec(prometheus.GaugeOpts{
   244  			Name: "vector_index_tombstones",
   245  			Help: "Number of active vector index tombstones",
   246  		}, []string{"class_name", "shard_name"}),
   247  		VectorIndexTombstoneCleanupThreads: promauto.NewGaugeVec(prometheus.GaugeOpts{
   248  			Name: "vector_index_tombstone_cleanup_threads",
   249  			Help: "Number of threads in use to clean up tombstones",
   250  		}, []string{"class_name", "shard_name"}),
   251  		VectorIndexTombstoneCleanedCount: promauto.NewCounterVec(prometheus.CounterOpts{
   252  			Name: "vector_index_tombstone_cleaned",
   253  			Help: "Total number of deleted objects that have been cleaned up",
   254  		}, []string{"class_name", "shard_name"}),
   255  		VectorIndexOperations: promauto.NewGaugeVec(prometheus.GaugeOpts{
   256  			Name: "vector_index_operations",
   257  			Help: "Total number of mutating operations on the vector index",
   258  		}, []string{"operation", "class_name", "shard_name"}),
   259  		VectorIndexSize: promauto.NewGaugeVec(prometheus.GaugeOpts{
   260  			Name: "vector_index_size",
   261  			Help: "The size of the vector index. Typically larger than number of vectors, as it grows proactively.",
   262  		}, []string{"class_name", "shard_name"}),
   263  		VectorIndexMaintenanceDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{
   264  			Name: "vector_index_maintenance_durations_ms",
   265  			Help: "Duration of a sync or async vector index maintenance operation",
   266  		}, []string{"operation", "class_name", "shard_name"}),
   267  		VectorIndexDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{
   268  			Name: "vector_index_durations_ms",
   269  			Help: "Duration of typical vector index operations (insert, delete)",
   270  		}, []string{"operation", "step", "class_name", "shard_name"}),
   271  		VectorDimensionsSum: promauto.NewGaugeVec(prometheus.GaugeOpts{
   272  			Name: "vector_dimensions_sum",
   273  			Help: "Total dimensions in a shard",
   274  		}, []string{"class_name", "shard_name"}),
   275  		VectorSegmentsSum: promauto.NewGaugeVec(prometheus.GaugeOpts{
   276  			Name: "vector_segments_sum",
   277  			Help: "Total segments in a shard if quantization enabled",
   278  		}, []string{"class_name", "shard_name"}),
   279  		VectorDimensionsSumByVector: promauto.NewGaugeVec(prometheus.GaugeOpts{
   280  			Name: "vector_dimensions_sum_by_vector",
   281  			Help: "Total dimensions in a shard for target vector",
   282  		}, []string{"class_name", "shard_name", "target_vector"}),
   283  		VectorSegmentsSumByVector: promauto.NewGaugeVec(prometheus.GaugeOpts{
   284  			Name: "vector_segments_sum_by_vector",
   285  			Help: "Total segments in a shard for target vector if quantization enabled",
   286  		}, []string{"class_name", "shard_name", "target_vector"}),
   287  
   288  		// Startup metrics
   289  		StartupProgress: promauto.NewGaugeVec(prometheus.GaugeOpts{
   290  			Name: "startup_progress",
   291  			Help: "A ratio (percentage) of startup progress for a particular component in a shard",
   292  		}, []string{"operation", "class_name", "shard_name"}),
   293  		StartupDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{
   294  			Name: "startup_durations_ms",
   295  			Help: "Duration of individual startup operations in ms",
   296  		}, []string{"operation", "class_name", "shard_name"}),
   297  		StartupDiskIO: promauto.NewSummaryVec(prometheus.SummaryOpts{
   298  			Name: "startup_diskio_throughput",
   299  			Help: "Disk I/O throuhput in bytes per second",
   300  		}, []string{"operation", "class_name", "shard_name"}),
   301  		QueryDimensions: promauto.NewCounterVec(prometheus.CounterOpts{
   302  			Name: "query_dimensions_total",
   303  			Help: "The vector dimensions used by any read-query that involves vectors",
   304  		}, []string{"query_type", "operation", "class_name"}),
   305  		QueryDimensionsCombined: promauto.NewCounter(prometheus.CounterOpts{
   306  			Name: "query_dimensions_combined_total",
   307  			Help: "The vector dimensions used by any read-query that involves vectors, aggregated across all classes and shards. The sum of all labels for query_dimensions_total should always match this labelless metric",
   308  		}),
   309  
   310  		// Backup/restore metrics
   311  		BackupRestoreDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{
   312  			Name: "backup_restore_ms",
   313  			Help: "Duration of a backup restore",
   314  		}, []string{"backend_name", "class_name"}),
   315  		BackupRestoreClassDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{
   316  			Name: "backup_restore_class_ms",
   317  			Help: "Duration restoring class",
   318  		}, []string{"class_name"}),
   319  		BackupRestoreBackupInitDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{
   320  			Name: "backup_restore_init_ms",
   321  			Help: "startup phase of a backup restore",
   322  		}, []string{"backend_name", "class_name"}),
   323  		BackupRestoreFromStorageDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{
   324  			Name: "backup_restore_from_backend_ms",
   325  			Help: "file transfer stage of a backup restore",
   326  		}, []string{"backend_name", "class_name"}),
   327  		BackupStoreDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{
   328  			Name: "backup_store_to_backend_ms",
   329  			Help: "file transfer stage of a backup restore",
   330  		}, []string{"backend_name", "class_name"}),
   331  		BucketPauseDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{
   332  			Name: "bucket_pause_durations_ms",
   333  			Help: "bucket pause durations",
   334  		}, []string{"bucket_dir"}),
   335  		BackupRestoreDataTransferred: promauto.NewCounterVec(prometheus.CounterOpts{
   336  			Name: "backup_restore_data_transferred",
   337  			Help: "Total number of bytes transferred during a backup restore",
   338  		}, []string{"backend_name", "class_name"}),
   339  		BackupStoreDataTransferred: promauto.NewCounterVec(prometheus.CounterOpts{
   340  			Name: "backup_store_data_transferred",
   341  			Help: "Total number of bytes transferred during a backup store",
   342  		}, []string{"backend_name", "class_name"}),
   343  
   344  		// Shard metrics
   345  		ShardsLoaded: promauto.NewGaugeVec(prometheus.GaugeOpts{
   346  			Name: "shards_loaded",
   347  			Help: "Number of shards loaded",
   348  		}, []string{"class_name"}),
   349  		ShardsUnloaded: promauto.NewGaugeVec(prometheus.GaugeOpts{
   350  			Name: "shards_unloaded",
   351  			Help: "Number of shards on not loaded",
   352  		}, []string{"class_name"}),
   353  		ShardsLoading: promauto.NewGaugeVec(prometheus.GaugeOpts{
   354  			Name: "shards_loading",
   355  			Help: "Number of shards in process of loading",
   356  		}, []string{"class_name"}),
   357  		ShardsUnloading: promauto.NewGaugeVec(prometheus.GaugeOpts{
   358  			Name: "shards_unloading",
   359  			Help: "Number of shards in process of unloading",
   360  		}, []string{"class_name"}),
   361  	}
   362  }
   363  
   364  type OnceUponATimer struct {
   365  	sync.Once
   366  	Timer *prometheus.Timer
   367  }
   368  
   369  func NewOnceTimer(promTimer *prometheus.Timer) *OnceUponATimer {
   370  	o := OnceUponATimer{}
   371  	o.Timer = promTimer
   372  	return &o
   373  }
   374  
   375  func (o *OnceUponATimer) ObserveDurationOnce() {
   376  	o.Do(func() {
   377  		o.Timer.ObserveDuration()
   378  	})
   379  }