github.com/google/fleetspeak@v0.1.15-0.20240426164851-4f31f62c1aea/fleetspeak/src/server/components/prometheus/prometheus.go (about) 1 // Copyright 2020 Google Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // https://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package prometheus 16 17 import ( 18 "fmt" 19 "sort" 20 "strconv" 21 "strings" 22 "time" 23 24 "github.com/google/fleetspeak/fleetspeak/src/server/db" 25 "github.com/google/fleetspeak/fleetspeak/src/server/stats" 26 "github.com/prometheus/client_golang/prometheus" 27 "github.com/prometheus/client_golang/prometheus/promauto" 28 29 fspb "github.com/google/fleetspeak/fleetspeak/src/common/proto/fleetspeak" 30 mpb "github.com/google/fleetspeak/fleetspeak/src/common/proto/fleetspeak_monitoring" 31 ) 32 33 var ( 34 // Metric collectors for PrometheusStatsCollector struct 35 messagesIngested = promauto.NewCounterVec(prometheus.CounterOpts{ 36 Name: "fleetspeak_messages_ingested_total", 37 Help: "The total number of messages ingested by Fleetspeak server", 38 }, 39 []string{"backlogged", "source_service", "destination_service", "message_type", "client_labels"}, 40 ) 41 42 messagesIngestedSize = promauto.NewCounterVec(prometheus.CounterOpts{ 43 Name: "fleetspeak_messages_ingested_payload_bytes_size", 44 Help: "The total payload size of messages ingested by Fleetspeak server (in bytes)", 45 }, 46 []string{"backlogged", "source_service", "destination_service", "message_type", "client_labels"}, 47 ) 48 49 messagesSaved = promauto.NewCounterVec(prometheus.CounterOpts{ 50 Name: "fleetspeak_messages_saved_total", 51 Help: "The total number of messages saved by Fleetspeak server", 52 }, 53 []string{"service", "message_type", "for_client", "client_labels"}, 54 ) 55 56 messagesSavedSize = promauto.NewCounterVec(prometheus.CounterOpts{ 57 Name: "fleetspeak_messages_saved_payload_bytes_size", 58 Help: "The total payload size of messages saved by Fleetspeak server (in bytes)", 59 }, 60 []string{"service", "message_type", "for_client", "client_labels"}, 61 ) 62 63 messagesProcessed = promauto.NewHistogramVec(prometheus.HistogramOpts{ 64 Name: "fleetspeak_server_messages_processed_latency", 65 Help: "The latency distribution of messages processed by Fleetspeak server", 66 }, 67 []string{"message_type", "service", "is_first_try", "client_labels"}, 68 ) 69 70 messagesErrored = promauto.NewHistogramVec(prometheus.HistogramOpts{ 71 Name: "fleetspeak_server_messages_errored_latency", 72 Help: "The latency distribution of message processings that returned an error", 73 }, 74 []string{"message_type", "is_temp", "is_first_try", "client_labels"}, 75 ) 76 77 messagesDropped = promauto.NewCounterVec(prometheus.CounterOpts{ 78 Name: "fleetspeak_server_messages_dropped_total", 79 Help: "The total number of messages dropped by Fleetspeak server when too many messages for the sevices are being processed.", 80 }, 81 []string{"service", "message_type", "is_first_try", "client_labels"}, 82 ) 83 84 clientPolls = promauto.NewCounterVec(prometheus.CounterOpts{ 85 Name: "fleetspeak_server_client_polls_total", 86 Help: "The total number of times a client polls the Fleetspeak server.", 87 }, 88 []string{"http_status_code", "poll_type", "cache_hit"}, 89 ) 90 91 clientPollsOpTime = promauto.NewHistogramVec(prometheus.HistogramOpts{ 92 Name: "fleetspeak_server_client_polls_operation_time_latency", 93 Help: "The latency distribution of times a client polls the Fleetspeak server (based on when the operation started and ended).", 94 }, 95 []string{"http_status_code", "poll_type", "cache_hit"}, 96 ) 97 98 clientPollsReadTime = promauto.NewHistogramVec(prometheus.HistogramOpts{ 99 Name: "fleetspeak_server_client_polls_read_time_latency", 100 Help: "The latency distribution of times a client polls the Fleetspeak server (based on the time spent reading messages).", 101 }, 102 []string{"http_status_code", "poll_type", "cache_hit"}, 103 ) 104 105 clientPollsWriteTime = promauto.NewHistogramVec(prometheus.HistogramOpts{ 106 Name: "fleetspeak_server_client_polls_write_time_latency", 107 Help: "The latency distribution of times a client polls the Fleetspeak server (based on the time spent writing messages).", 108 }, 109 []string{"http_status_code", "poll_type", "cache_hit"}, 110 ) 111 112 clientPollsReadMegabytes = promauto.NewHistogramVec(prometheus.HistogramOpts{ 113 Name: "fleetspeak_server_client_polls_read_megabytes_size_distribution", 114 Help: "The size distribution of times a client polls the Fleetspeak server (based on Megabytes read).", 115 }, 116 []string{"http_status_code", "poll_type", "cache_hit"}, 117 ) 118 119 clientPollsWriteMegabytes = promauto.NewHistogramVec(prometheus.HistogramOpts{ 120 Name: "fleetspeak_server_client_polls_write_megabytes_size_distribution", 121 Help: "The size distribution of times a client polls the Fleetspeak server (based on Megabytes written).", 122 }, 123 []string{"http_status_code", "poll_type", "cache_hit"}, 124 ) 125 126 datastoreOperationsCompleted = promauto.NewHistogramVec(prometheus.HistogramOpts{ 127 Name: "fleetspeak_server_datastore_operations_completed_latency", 128 Help: "The latency distribution of datastore operations completed.", 129 }, 130 []string{"operation", "errored"}, 131 ) 132 133 resourcesUsageDataReceivedCount = promauto.NewCounterVec(prometheus.CounterOpts{ 134 Name: "fleetspeak_server_resource_usage_data_received_total", 135 Help: "The total number of times a client-resource-usage proto is received.", 136 }, 137 []string{"client_data_labels", "blacklisted", "scope", "version"}, 138 ) 139 140 resourcesUsageDataReceivedByMeanUserCPURate = promauto.NewHistogramVec(prometheus.HistogramOpts{ 141 Name: "fleetspeak_server_resource_usage_data_received_mean_user_cpu_rate_distribution", 142 Help: "The distribution of times a client-resource-usage proto is received (based on mean user CPU rate).", 143 }, 144 []string{"client_data_labels", "blacklisted", "scope", "version"}, 145 ) 146 147 resourcesUsageDataReceivedByMaxUserCPURate = promauto.NewHistogramVec(prometheus.HistogramOpts{ 148 Name: "fleetspeak_server_resource_usage_data_received_max_user_cpu_rate_distribution", 149 Help: "The distribution of times a client-resource-usage proto is received (based on max user CPU rate).", 150 }, 151 []string{"client_data_labels", "blacklisted", "scope", "version"}, 152 ) 153 154 resourcesUsageDataReceivedByMeanSystemCPURate = promauto.NewHistogramVec(prometheus.HistogramOpts{ 155 Name: "fleetspeak_server_resource_usage_data_received_mean_system_cpu_rate_distribution", 156 Help: "The distribution of times a client-resource-usage proto is received (based on mean system CPU rate).", 157 }, 158 []string{"client_data_labels", "blacklisted", "scope", "version"}, 159 ) 160 161 resourcesUsageDataReceivedByMaxSystemCPURate = promauto.NewHistogramVec(prometheus.HistogramOpts{ 162 Name: "fleetspeak_server_resource_usage_data_received_max_system_cpu_rate", 163 Help: "The total number of times a client-resource-usage proto is received (based on max system CPU rate).", 164 }, 165 []string{"client_data_labels", "blacklisted", "scope", "version"}, 166 ) 167 168 resourcesUsageDataReceivedByMeanResidentMemory = promauto.NewHistogramVec(prometheus.HistogramOpts{ 169 Name: "fleetspeak_server_resource_usage_data_received_mean_resident_memory_bytes_distribution", 170 Help: "The distribution of times a client-resource-usage proto is received (based on mean resident memory).", 171 }, 172 []string{"client_data_labels", "blacklisted", "scope", "version"}, 173 ) 174 175 resourcesUsageDataReceivedByMaxResidentMemory = promauto.NewHistogramVec(prometheus.HistogramOpts{ 176 Name: "fleetspeak_server_resource_usage_data_received_max_resident_memory_bytes_distribution", 177 Help: "The distribution of times a client-resource-usage proto is received (based on max resident memory).", 178 }, 179 []string{"client_data_labels", "blacklisted", "scope", "version"}, 180 ) 181 182 killNotificationsReceived = promauto.NewCounterVec(prometheus.CounterOpts{ 183 Name: "fleetspeak_server_kill_notifications_received_total", 184 Help: "The total number of times a kill notification is received from a client.", 185 }, 186 []string{"client_data_labels", "blacklisted", "service", "reason"}, 187 ) 188 ) 189 190 // Returns a stable unambiguous string representation of labels from a given client. 191 func clientLabels(cd *db.ClientData) string { 192 if cd == nil { 193 return "" 194 } 195 196 label_pairs := make([]string, 0, len(cd.Labels)) 197 for _, l := range cd.Labels { 198 label_pairs = append(label_pairs, fmt.Sprintf("%s:%s", l.ServiceName, l.Label)) 199 } 200 201 sort.Strings(label_pairs) 202 return strings.Join(label_pairs, ",") 203 } 204 205 // A PrometheusStatsCollector is an implementation of a Collector interface. 206 // It exports stats to a Prometheus HTTP handler, which are exposed at :<configured_port>/metrics 207 // and are scrapable by Prometheus (The port is configured in the server components config file). 208 type StatsCollector struct{} 209 210 func (s StatsCollector) MessageIngested(backlogged bool, m *fspb.Message, cd *db.ClientData) { 211 messagesIngested.WithLabelValues(strconv.FormatBool(backlogged), m.Source.ServiceName, m.Destination.ServiceName, m.MessageType, clientLabels(cd)).Inc() 212 payloadBytes := calculatePayloadBytes(m) 213 messagesIngestedSize.WithLabelValues(strconv.FormatBool(backlogged), m.Source.ServiceName, m.Destination.ServiceName, m.MessageType, clientLabels(cd)).Add(float64(payloadBytes)) 214 } 215 216 func calculatePayloadBytes(m *fspb.Message) int { 217 payloadBytes := 0 218 if m.Data != nil { 219 payloadBytes = len(m.Data.TypeUrl) + len(m.Data.Value) 220 } 221 return payloadBytes 222 } 223 224 func (s StatsCollector) MessageSaved(forClient bool, m *fspb.Message, cd *db.ClientData) { 225 messagesSaved.WithLabelValues(m.Destination.ServiceName, m.MessageType, strconv.FormatBool(forClient), clientLabels(cd)).Inc() 226 savedPayloadBytes := calculatePayloadBytes(m) 227 messagesSavedSize.WithLabelValues(m.Destination.ServiceName, m.MessageType, strconv.FormatBool(forClient), clientLabels(cd)).Add(float64(savedPayloadBytes)) 228 } 229 230 func (s StatsCollector) MessageProcessed(start, end time.Time, m *fspb.Message, isFirstTry bool, cd *db.ClientData) { 231 messagesProcessed.WithLabelValues(m.MessageType, m.Destination.ServiceName, strconv.FormatBool(isFirstTry), clientLabels(cd)).Observe(end.Sub(start).Seconds()) 232 } 233 234 func (s StatsCollector) MessageErrored(start, end time.Time, isTemp bool, m *fspb.Message, isFirstTry bool, cd *db.ClientData) { 235 messagesErrored.WithLabelValues(m.MessageType, strconv.FormatBool(isTemp), strconv.FormatBool(isFirstTry), clientLabels(cd)).Observe(end.Sub(start).Seconds()) 236 } 237 238 func (s StatsCollector) MessageDropped(m *fspb.Message, isFirstTry bool, cd *db.ClientData) { 239 messagesDropped.WithLabelValues(m.Destination.ServiceName, m.MessageType, strconv.FormatBool(isFirstTry), clientLabels(cd)).Inc() 240 } 241 242 func (s StatsCollector) ClientPoll(info stats.PollInfo) { 243 httpStatusCode := strconv.Itoa(info.Status) 244 pollType := info.Type.String() 245 cacheHit := strconv.FormatBool(info.CacheHit) 246 247 // CounterVec 248 clientPolls.WithLabelValues(httpStatusCode, pollType, cacheHit).Inc() 249 250 // HistogramVecs 251 clientPollsOpTime.WithLabelValues(httpStatusCode, pollType, cacheHit).Observe(info.End.Sub(info.Start).Seconds()) 252 clientPollsReadTime.WithLabelValues(httpStatusCode, pollType, cacheHit).Observe(info.ReadTime.Seconds()) 253 clientPollsWriteTime.WithLabelValues(httpStatusCode, pollType, cacheHit).Observe(info.WriteTime.Seconds()) 254 clientPollsReadMegabytes.WithLabelValues(httpStatusCode, pollType, cacheHit).Observe(convertBytesToMegabytes(info.ReadBytes)) 255 clientPollsWriteMegabytes.WithLabelValues(httpStatusCode, pollType, cacheHit).Observe(convertBytesToMegabytes(info.WriteBytes)) 256 } 257 258 func convertBytesToMegabytes(bytes int) float64 { 259 return float64(bytes) / 1000000.0 260 } 261 262 func (s StatsCollector) DatastoreOperation(start, end time.Time, operation string, result error) { 263 datastoreOperationsCompleted.WithLabelValues(operation, strconv.FormatBool(result != nil)).Observe(end.Sub(start).Seconds()) 264 } 265 266 func getClientDataLabelsConcatenated(cd *db.ClientData) string { 267 var clientDataLabels []string 268 for _, labelStruct := range cd.Labels { 269 clientDataLabels = append(clientDataLabels, labelStruct.GetLabel()) 270 } 271 sort.Strings(clientDataLabels) 272 return strings.Join(clientDataLabels[:], ",") 273 } 274 275 func (s StatsCollector) ResourceUsageDataReceived(cd *db.ClientData, rud *mpb.ResourceUsageData, v *fspb.ValidationInfo) { 276 clientDataLabels := getClientDataLabelsConcatenated(cd) 277 blacklisted := strconv.FormatBool(cd.Blacklisted) 278 scope := rud.Scope 279 version := rud.Version 280 281 // CounterVec 282 resourcesUsageDataReceivedCount.WithLabelValues(clientDataLabels, strconv.FormatBool(cd.Blacklisted), scope, rud.Version).Inc() 283 284 // HistorgramVecs 285 resourcesUsageDataReceivedByMeanUserCPURate.WithLabelValues(clientDataLabels, blacklisted, scope, version).Observe(rud.ResourceUsage.GetMeanUserCpuRate()) 286 resourcesUsageDataReceivedByMaxUserCPURate.WithLabelValues(clientDataLabels, blacklisted, scope, version).Observe(rud.ResourceUsage.GetMaxUserCpuRate()) 287 resourcesUsageDataReceivedByMeanSystemCPURate.WithLabelValues(clientDataLabels, blacklisted, scope, version).Observe(rud.ResourceUsage.GetMeanSystemCpuRate()) 288 resourcesUsageDataReceivedByMaxSystemCPURate.WithLabelValues(clientDataLabels, blacklisted, scope, version).Observe(rud.ResourceUsage.GetMaxSystemCpuRate()) 289 resourcesUsageDataReceivedByMeanResidentMemory.WithLabelValues(clientDataLabels, blacklisted, scope, version).Observe(rud.ResourceUsage.GetMeanResidentMemory()) 290 resourcesUsageDataReceivedByMaxResidentMemory.WithLabelValues(clientDataLabels, blacklisted, scope, version).Observe(float64(rud.ResourceUsage.GetMaxResidentMemory())) 291 } 292 293 func (s StatsCollector) KillNotificationReceived(cd *db.ClientData, kn *mpb.KillNotification) { 294 clientDataLabels := getClientDataLabelsConcatenated(cd) 295 killNotificationsReceived.WithLabelValues(clientDataLabels, strconv.FormatBool(cd.Blacklisted), kn.Service, kn.Reason.String()).Inc() 296 }