github.com/google/fleetspeak@v0.1.15-0.20240426164851-4f31f62c1aea/fleetspeak/src/server/components/prometheus/prometheus.go (about)

     1  // Copyright 2020 Google Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     https://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package prometheus
    16  
    17  import (
    18  	"fmt"
    19  	"sort"
    20  	"strconv"
    21  	"strings"
    22  	"time"
    23  
    24  	"github.com/google/fleetspeak/fleetspeak/src/server/db"
    25  	"github.com/google/fleetspeak/fleetspeak/src/server/stats"
    26  	"github.com/prometheus/client_golang/prometheus"
    27  	"github.com/prometheus/client_golang/prometheus/promauto"
    28  
    29  	fspb "github.com/google/fleetspeak/fleetspeak/src/common/proto/fleetspeak"
    30  	mpb "github.com/google/fleetspeak/fleetspeak/src/common/proto/fleetspeak_monitoring"
    31  )
    32  
    33  var (
    34  	// Metric collectors for PrometheusStatsCollector struct
    35  	messagesIngested = promauto.NewCounterVec(prometheus.CounterOpts{
    36  		Name: "fleetspeak_messages_ingested_total",
    37  		Help: "The total number of messages ingested by Fleetspeak server",
    38  	},
    39  		[]string{"backlogged", "source_service", "destination_service", "message_type", "client_labels"},
    40  	)
    41  
    42  	messagesIngestedSize = promauto.NewCounterVec(prometheus.CounterOpts{
    43  		Name: "fleetspeak_messages_ingested_payload_bytes_size",
    44  		Help: "The total payload size of messages ingested by Fleetspeak server (in bytes)",
    45  	},
    46  		[]string{"backlogged", "source_service", "destination_service", "message_type", "client_labels"},
    47  	)
    48  
    49  	messagesSaved = promauto.NewCounterVec(prometheus.CounterOpts{
    50  		Name: "fleetspeak_messages_saved_total",
    51  		Help: "The total number of messages saved by Fleetspeak server",
    52  	},
    53  		[]string{"service", "message_type", "for_client", "client_labels"},
    54  	)
    55  
    56  	messagesSavedSize = promauto.NewCounterVec(prometheus.CounterOpts{
    57  		Name: "fleetspeak_messages_saved_payload_bytes_size",
    58  		Help: "The total payload size of messages saved by Fleetspeak server (in bytes)",
    59  	},
    60  		[]string{"service", "message_type", "for_client", "client_labels"},
    61  	)
    62  
    63  	messagesProcessed = promauto.NewHistogramVec(prometheus.HistogramOpts{
    64  		Name: "fleetspeak_server_messages_processed_latency",
    65  		Help: "The latency distribution of messages processed by Fleetspeak server",
    66  	},
    67  		[]string{"message_type", "service", "is_first_try", "client_labels"},
    68  	)
    69  
    70  	messagesErrored = promauto.NewHistogramVec(prometheus.HistogramOpts{
    71  		Name: "fleetspeak_server_messages_errored_latency",
    72  		Help: "The latency distribution of message processings that returned an error",
    73  	},
    74  		[]string{"message_type", "is_temp", "is_first_try", "client_labels"},
    75  	)
    76  
    77  	messagesDropped = promauto.NewCounterVec(prometheus.CounterOpts{
    78  		Name: "fleetspeak_server_messages_dropped_total",
    79  		Help: "The total number of messages dropped by Fleetspeak server when too many messages for the sevices are being processed.",
    80  	},
    81  		[]string{"service", "message_type", "is_first_try", "client_labels"},
    82  	)
    83  
    84  	clientPolls = promauto.NewCounterVec(prometheus.CounterOpts{
    85  		Name: "fleetspeak_server_client_polls_total",
    86  		Help: "The total number of times a client polls the Fleetspeak server.",
    87  	},
    88  		[]string{"http_status_code", "poll_type", "cache_hit"},
    89  	)
    90  
    91  	clientPollsOpTime = promauto.NewHistogramVec(prometheus.HistogramOpts{
    92  		Name: "fleetspeak_server_client_polls_operation_time_latency",
    93  		Help: "The latency distribution of times a client polls the Fleetspeak server (based on when the operation started and ended).",
    94  	},
    95  		[]string{"http_status_code", "poll_type", "cache_hit"},
    96  	)
    97  
    98  	clientPollsReadTime = promauto.NewHistogramVec(prometheus.HistogramOpts{
    99  		Name: "fleetspeak_server_client_polls_read_time_latency",
   100  		Help: "The latency distribution of times a client polls the Fleetspeak server (based on the time spent reading messages).",
   101  	},
   102  		[]string{"http_status_code", "poll_type", "cache_hit"},
   103  	)
   104  
   105  	clientPollsWriteTime = promauto.NewHistogramVec(prometheus.HistogramOpts{
   106  		Name: "fleetspeak_server_client_polls_write_time_latency",
   107  		Help: "The latency distribution of times a client polls the Fleetspeak server (based on the time spent writing messages).",
   108  	},
   109  		[]string{"http_status_code", "poll_type", "cache_hit"},
   110  	)
   111  
   112  	clientPollsReadMegabytes = promauto.NewHistogramVec(prometheus.HistogramOpts{
   113  		Name: "fleetspeak_server_client_polls_read_megabytes_size_distribution",
   114  		Help: "The size distribution of times a client polls the Fleetspeak server (based on Megabytes read).",
   115  	},
   116  		[]string{"http_status_code", "poll_type", "cache_hit"},
   117  	)
   118  
   119  	clientPollsWriteMegabytes = promauto.NewHistogramVec(prometheus.HistogramOpts{
   120  		Name: "fleetspeak_server_client_polls_write_megabytes_size_distribution",
   121  		Help: "The size distribution of times a client polls the Fleetspeak server (based on Megabytes written).",
   122  	},
   123  		[]string{"http_status_code", "poll_type", "cache_hit"},
   124  	)
   125  
   126  	datastoreOperationsCompleted = promauto.NewHistogramVec(prometheus.HistogramOpts{
   127  		Name: "fleetspeak_server_datastore_operations_completed_latency",
   128  		Help: "The latency distribution of datastore operations completed.",
   129  	},
   130  		[]string{"operation", "errored"},
   131  	)
   132  
   133  	resourcesUsageDataReceivedCount = promauto.NewCounterVec(prometheus.CounterOpts{
   134  		Name: "fleetspeak_server_resource_usage_data_received_total",
   135  		Help: "The total number of times a client-resource-usage proto is received.",
   136  	},
   137  		[]string{"client_data_labels", "blacklisted", "scope", "version"},
   138  	)
   139  
   140  	resourcesUsageDataReceivedByMeanUserCPURate = promauto.NewHistogramVec(prometheus.HistogramOpts{
   141  		Name: "fleetspeak_server_resource_usage_data_received_mean_user_cpu_rate_distribution",
   142  		Help: "The distribution of times a client-resource-usage proto is received (based on mean user CPU rate).",
   143  	},
   144  		[]string{"client_data_labels", "blacklisted", "scope", "version"},
   145  	)
   146  
   147  	resourcesUsageDataReceivedByMaxUserCPURate = promauto.NewHistogramVec(prometheus.HistogramOpts{
   148  		Name: "fleetspeak_server_resource_usage_data_received_max_user_cpu_rate_distribution",
   149  		Help: "The distribution of times a client-resource-usage proto is received (based on max user CPU rate).",
   150  	},
   151  		[]string{"client_data_labels", "blacklisted", "scope", "version"},
   152  	)
   153  
   154  	resourcesUsageDataReceivedByMeanSystemCPURate = promauto.NewHistogramVec(prometheus.HistogramOpts{
   155  		Name: "fleetspeak_server_resource_usage_data_received_mean_system_cpu_rate_distribution",
   156  		Help: "The distribution of times a client-resource-usage proto is received (based on mean system CPU rate).",
   157  	},
   158  		[]string{"client_data_labels", "blacklisted", "scope", "version"},
   159  	)
   160  
   161  	resourcesUsageDataReceivedByMaxSystemCPURate = promauto.NewHistogramVec(prometheus.HistogramOpts{
   162  		Name: "fleetspeak_server_resource_usage_data_received_max_system_cpu_rate",
   163  		Help: "The total number of times a client-resource-usage proto is received (based on max system CPU rate).",
   164  	},
   165  		[]string{"client_data_labels", "blacklisted", "scope", "version"},
   166  	)
   167  
   168  	resourcesUsageDataReceivedByMeanResidentMemory = promauto.NewHistogramVec(prometheus.HistogramOpts{
   169  		Name: "fleetspeak_server_resource_usage_data_received_mean_resident_memory_bytes_distribution",
   170  		Help: "The distribution of times a client-resource-usage proto is received (based on mean resident memory).",
   171  	},
   172  		[]string{"client_data_labels", "blacklisted", "scope", "version"},
   173  	)
   174  
   175  	resourcesUsageDataReceivedByMaxResidentMemory = promauto.NewHistogramVec(prometheus.HistogramOpts{
   176  		Name: "fleetspeak_server_resource_usage_data_received_max_resident_memory_bytes_distribution",
   177  		Help: "The distribution of times a client-resource-usage proto is received (based on max resident memory).",
   178  	},
   179  		[]string{"client_data_labels", "blacklisted", "scope", "version"},
   180  	)
   181  
   182  	killNotificationsReceived = promauto.NewCounterVec(prometheus.CounterOpts{
   183  		Name: "fleetspeak_server_kill_notifications_received_total",
   184  		Help: "The total number of times a kill notification is received from a client.",
   185  	},
   186  		[]string{"client_data_labels", "blacklisted", "service", "reason"},
   187  	)
   188  )
   189  
   190  // Returns a stable unambiguous string representation of labels from a given client.
   191  func clientLabels(cd *db.ClientData) string {
   192  	if cd == nil {
   193  		return ""
   194  	}
   195  
   196  	label_pairs := make([]string, 0, len(cd.Labels))
   197  	for _, l := range cd.Labels {
   198  		label_pairs = append(label_pairs, fmt.Sprintf("%s:%s", l.ServiceName, l.Label))
   199  	}
   200  
   201  	sort.Strings(label_pairs)
   202  	return strings.Join(label_pairs, ",")
   203  }
   204  
   205  // A PrometheusStatsCollector is an implementation of a Collector interface.
   206  // It exports stats to a Prometheus HTTP handler, which are exposed at :<configured_port>/metrics
   207  // and are scrapable by Prometheus (The port is configured in the server components config file).
   208  type StatsCollector struct{}
   209  
   210  func (s StatsCollector) MessageIngested(backlogged bool, m *fspb.Message, cd *db.ClientData) {
   211  	messagesIngested.WithLabelValues(strconv.FormatBool(backlogged), m.Source.ServiceName, m.Destination.ServiceName, m.MessageType, clientLabels(cd)).Inc()
   212  	payloadBytes := calculatePayloadBytes(m)
   213  	messagesIngestedSize.WithLabelValues(strconv.FormatBool(backlogged), m.Source.ServiceName, m.Destination.ServiceName, m.MessageType, clientLabels(cd)).Add(float64(payloadBytes))
   214  }
   215  
   216  func calculatePayloadBytes(m *fspb.Message) int {
   217  	payloadBytes := 0
   218  	if m.Data != nil {
   219  		payloadBytes = len(m.Data.TypeUrl) + len(m.Data.Value)
   220  	}
   221  	return payloadBytes
   222  }
   223  
   224  func (s StatsCollector) MessageSaved(forClient bool, m *fspb.Message, cd *db.ClientData) {
   225  	messagesSaved.WithLabelValues(m.Destination.ServiceName, m.MessageType, strconv.FormatBool(forClient), clientLabels(cd)).Inc()
   226  	savedPayloadBytes := calculatePayloadBytes(m)
   227  	messagesSavedSize.WithLabelValues(m.Destination.ServiceName, m.MessageType, strconv.FormatBool(forClient), clientLabels(cd)).Add(float64(savedPayloadBytes))
   228  }
   229  
   230  func (s StatsCollector) MessageProcessed(start, end time.Time, m *fspb.Message, isFirstTry bool, cd *db.ClientData) {
   231  	messagesProcessed.WithLabelValues(m.MessageType, m.Destination.ServiceName, strconv.FormatBool(isFirstTry), clientLabels(cd)).Observe(end.Sub(start).Seconds())
   232  }
   233  
   234  func (s StatsCollector) MessageErrored(start, end time.Time, isTemp bool, m *fspb.Message, isFirstTry bool, cd *db.ClientData) {
   235  	messagesErrored.WithLabelValues(m.MessageType, strconv.FormatBool(isTemp), strconv.FormatBool(isFirstTry), clientLabels(cd)).Observe(end.Sub(start).Seconds())
   236  }
   237  
   238  func (s StatsCollector) MessageDropped(m *fspb.Message, isFirstTry bool, cd *db.ClientData) {
   239  	messagesDropped.WithLabelValues(m.Destination.ServiceName, m.MessageType, strconv.FormatBool(isFirstTry), clientLabels(cd)).Inc()
   240  }
   241  
   242  func (s StatsCollector) ClientPoll(info stats.PollInfo) {
   243  	httpStatusCode := strconv.Itoa(info.Status)
   244  	pollType := info.Type.String()
   245  	cacheHit := strconv.FormatBool(info.CacheHit)
   246  
   247  	// CounterVec
   248  	clientPolls.WithLabelValues(httpStatusCode, pollType, cacheHit).Inc()
   249  
   250  	// HistogramVecs
   251  	clientPollsOpTime.WithLabelValues(httpStatusCode, pollType, cacheHit).Observe(info.End.Sub(info.Start).Seconds())
   252  	clientPollsReadTime.WithLabelValues(httpStatusCode, pollType, cacheHit).Observe(info.ReadTime.Seconds())
   253  	clientPollsWriteTime.WithLabelValues(httpStatusCode, pollType, cacheHit).Observe(info.WriteTime.Seconds())
   254  	clientPollsReadMegabytes.WithLabelValues(httpStatusCode, pollType, cacheHit).Observe(convertBytesToMegabytes(info.ReadBytes))
   255  	clientPollsWriteMegabytes.WithLabelValues(httpStatusCode, pollType, cacheHit).Observe(convertBytesToMegabytes(info.WriteBytes))
   256  }
   257  
   258  func convertBytesToMegabytes(bytes int) float64 {
   259  	return float64(bytes) / 1000000.0
   260  }
   261  
   262  func (s StatsCollector) DatastoreOperation(start, end time.Time, operation string, result error) {
   263  	datastoreOperationsCompleted.WithLabelValues(operation, strconv.FormatBool(result != nil)).Observe(end.Sub(start).Seconds())
   264  }
   265  
   266  func getClientDataLabelsConcatenated(cd *db.ClientData) string {
   267  	var clientDataLabels []string
   268  	for _, labelStruct := range cd.Labels {
   269  		clientDataLabels = append(clientDataLabels, labelStruct.GetLabel())
   270  	}
   271  	sort.Strings(clientDataLabels)
   272  	return strings.Join(clientDataLabels[:], ",")
   273  }
   274  
   275  func (s StatsCollector) ResourceUsageDataReceived(cd *db.ClientData, rud *mpb.ResourceUsageData, v *fspb.ValidationInfo) {
   276  	clientDataLabels := getClientDataLabelsConcatenated(cd)
   277  	blacklisted := strconv.FormatBool(cd.Blacklisted)
   278  	scope := rud.Scope
   279  	version := rud.Version
   280  
   281  	// CounterVec
   282  	resourcesUsageDataReceivedCount.WithLabelValues(clientDataLabels, strconv.FormatBool(cd.Blacklisted), scope, rud.Version).Inc()
   283  
   284  	// HistorgramVecs
   285  	resourcesUsageDataReceivedByMeanUserCPURate.WithLabelValues(clientDataLabels, blacklisted, scope, version).Observe(rud.ResourceUsage.GetMeanUserCpuRate())
   286  	resourcesUsageDataReceivedByMaxUserCPURate.WithLabelValues(clientDataLabels, blacklisted, scope, version).Observe(rud.ResourceUsage.GetMaxUserCpuRate())
   287  	resourcesUsageDataReceivedByMeanSystemCPURate.WithLabelValues(clientDataLabels, blacklisted, scope, version).Observe(rud.ResourceUsage.GetMeanSystemCpuRate())
   288  	resourcesUsageDataReceivedByMaxSystemCPURate.WithLabelValues(clientDataLabels, blacklisted, scope, version).Observe(rud.ResourceUsage.GetMaxSystemCpuRate())
   289  	resourcesUsageDataReceivedByMeanResidentMemory.WithLabelValues(clientDataLabels, blacklisted, scope, version).Observe(rud.ResourceUsage.GetMeanResidentMemory())
   290  	resourcesUsageDataReceivedByMaxResidentMemory.WithLabelValues(clientDataLabels, blacklisted, scope, version).Observe(float64(rud.ResourceUsage.GetMaxResidentMemory()))
   291  }
   292  
   293  func (s StatsCollector) KillNotificationReceived(cd *db.ClientData, kn *mpb.KillNotification) {
   294  	clientDataLabels := getClientDataLabelsConcatenated(cd)
   295  	killNotificationsReceived.WithLabelValues(clientDataLabels, strconv.FormatBool(cd.Blacklisted), kn.Service, kn.Reason.String()).Inc()
   296  }