github.com/nginxinc/kubernetes-ingress@v1.12.5/internal/metrics/collectors/latency.go (about) 1 package collectors 2 3 import ( 4 "encoding/json" 5 "fmt" 6 "strconv" 7 "strings" 8 "sync" 9 10 "github.com/golang/glog" 11 "github.com/prometheus/client_golang/prometheus" 12 ) 13 14 const nginxSeparator = "nginx:" 15 16 var latencyBucketsMilliSeconds = []float64{ 17 1, 18 2, 19 3, 20 4, 21 5, 22 10, 23 20, 24 30, 25 40, 26 50, 27 100, 28 200, 29 300, 30 400, 31 500, 32 1000, 33 2000, 34 3000, 35 4000, 36 5000, 37 10000, 38 20000, 39 30000, 40 40000, 41 50000, 42 } 43 44 // LatencyCollector is an interface for latency metrics 45 type LatencyCollector interface { 46 RecordLatency(string) 47 UpdateUpstreamServerLabels(map[string][]string) 48 DeleteUpstreamServerLabels([]string) 49 UpdateUpstreamServerPeerLabels(map[string][]string) 50 DeleteUpstreamServerPeerLabels([]string) 51 DeleteMetrics([]string) 52 Register(*prometheus.Registry) error 53 } 54 55 // metricsPublishedMap is a map of upstream server peers (upstream/server) to a metricsSet. 56 // This map is used to keep track of all the metrics published for each upstream server peer, 57 // so that the metrics can be deleted when the upstream server peers are deleted. 58 type metricsPublishedMap map[string]metricsSet 59 60 // metricsSet is a set of metrics published. 61 // The keys are string representations of the lists of label values for a published metric. 62 // The list of label values is joined with the "+" symbol. For example, a metric produced with the label values 63 // ["one", "two", "three"] is added to the set with the key "one+two+three". 64 type metricsSet map[string]struct{} 65 66 // LatencyMetricsCollector implements the LatencyCollector interface and prometheus.Collector interface 67 type LatencyMetricsCollector struct { 68 httpLatency *prometheus.HistogramVec 69 upstreamServerLabelNames []string 70 upstreamServerPeerLabelNames []string 71 upstreamServerLabels map[string][]string 72 upstreamServerPeerLabels map[string][]string 73 metricsPublishedMap metricsPublishedMap 74 metricsPublishedMutex sync.Mutex 75 variableLabelsMutex sync.RWMutex 76 } 77 78 // NewLatencyMetricsCollector creates a new LatencyMetricsCollector 79 func NewLatencyMetricsCollector( 80 constLabels map[string]string, 81 upstreamServerLabelNames []string, 82 upstreamServerPeerLabelNames []string, 83 ) *LatencyMetricsCollector { 84 return &LatencyMetricsCollector{ 85 httpLatency: prometheus.NewHistogramVec(prometheus.HistogramOpts{ 86 Namespace: metricsNamespace, 87 Name: "upstream_server_response_latency_ms", 88 Help: "Bucketed response times from when NGINX establishes a connection to an upstream server to when the last byte of the response body is received by NGINX", 89 ConstLabels: constLabels, 90 Buckets: latencyBucketsMilliSeconds, 91 }, 92 createLatencyLabelNames(upstreamServerLabelNames, upstreamServerPeerLabelNames), 93 ), 94 upstreamServerLabels: make(map[string][]string), 95 upstreamServerPeerLabels: make(map[string][]string), 96 metricsPublishedMap: make(metricsPublishedMap), 97 upstreamServerLabelNames: upstreamServerLabelNames, 98 upstreamServerPeerLabelNames: upstreamServerPeerLabelNames, 99 } 100 } 101 102 // UpdateUpstreamServerPeerLabels updates the Upstream Server Peer Labels 103 func (l *LatencyMetricsCollector) UpdateUpstreamServerPeerLabels(upstreamServerPeerLabels map[string][]string) { 104 l.variableLabelsMutex.Lock() 105 for k, v := range upstreamServerPeerLabels { 106 l.upstreamServerPeerLabels[k] = v 107 } 108 l.variableLabelsMutex.Unlock() 109 } 110 111 // DeleteUpstreamServerPeerLabels deletes the Upstream Server Peer Labels 112 func (l *LatencyMetricsCollector) DeleteUpstreamServerPeerLabels(peers []string) { 113 l.variableLabelsMutex.Lock() 114 for _, k := range peers { 115 delete(l.upstreamServerPeerLabels, k) 116 } 117 l.variableLabelsMutex.Unlock() 118 } 119 120 // UpdateUpstreamServerLabels updates the upstream server label map 121 func (l *LatencyMetricsCollector) UpdateUpstreamServerLabels(newLabelValues map[string][]string) { 122 l.variableLabelsMutex.Lock() 123 for k, v := range newLabelValues { 124 l.upstreamServerLabels[k] = v 125 } 126 l.variableLabelsMutex.Unlock() 127 } 128 129 // DeleteUpstreamServerLabels deletes upstream server labels 130 func (l *LatencyMetricsCollector) DeleteUpstreamServerLabels(upstreamNames []string) { 131 l.variableLabelsMutex.Lock() 132 for _, k := range upstreamNames { 133 delete(l.upstreamServerLabels, k) 134 } 135 l.variableLabelsMutex.Unlock() 136 } 137 138 // DeleteMetrics deletes all metrics published associated with the given upstream server peer names. 139 func (l *LatencyMetricsCollector) DeleteMetrics(upstreamServerPeerNames []string) { 140 for _, name := range upstreamServerPeerNames { 141 for _, labelValues := range l.listAndDeleteMetricsPublished(name) { 142 success := l.httpLatency.DeleteLabelValues(labelValues...) 143 if !success { 144 glog.Warningf("could not delete metric for upstream server peer: %s with values: %v", name, labelValues) 145 } 146 } 147 } 148 } 149 150 func (l *LatencyMetricsCollector) getUpstreamServerPeerLabelValues(peer string) []string { 151 l.variableLabelsMutex.RLock() 152 defer l.variableLabelsMutex.RUnlock() 153 return l.upstreamServerPeerLabels[peer] 154 } 155 156 func (l *LatencyMetricsCollector) getUpstreamServerLabels(upstreamName string) []string { 157 l.variableLabelsMutex.RLock() 158 defer l.variableLabelsMutex.RUnlock() 159 return l.upstreamServerLabels[upstreamName] 160 } 161 162 // Register registers all the metrics of the collector 163 func (l *LatencyMetricsCollector) Register(registry *prometheus.Registry) error { 164 return registry.Register(l) 165 } 166 167 // Describe implements prometheus.Collector interface Describe method 168 func (l *LatencyMetricsCollector) Describe(ch chan<- *prometheus.Desc) { 169 l.httpLatency.Describe(ch) 170 } 171 172 // Collect implements the prometheus.Collector interface Collect method 173 func (l *LatencyMetricsCollector) Collect(ch chan<- prometheus.Metric) { 174 l.httpLatency.Collect(ch) 175 } 176 177 // RecordLatency parses a syslog message and records latency 178 func (l *LatencyMetricsCollector) RecordLatency(syslogMsg string) { 179 lm, err := parseMessage(syslogMsg) 180 if err != nil { 181 glog.V(3).Infof("could not parse syslog message: %v", err) 182 return 183 } 184 labelValues, err := l.createLatencyLabelValues(lm) 185 if err != nil { 186 glog.Errorf("cannot record latency for upstream %s and server %s: %v", lm.Upstream, lm.Server, err) 187 return 188 } 189 l.httpLatency.WithLabelValues(labelValues...).Observe(lm.Latency * 1000) 190 l.updateMetricsPublished(lm.Upstream, lm.Server, labelValues) 191 } 192 193 func (l *LatencyMetricsCollector) updateMetricsPublished(upstreamName, server string, labelValues []string) { 194 l.metricsPublishedMutex.Lock() 195 key := fmt.Sprintf("%s/%s", upstreamName, server) 196 if _, ok := l.metricsPublishedMap[key]; !ok { 197 l.metricsPublishedMap[key] = make(metricsSet) 198 } 199 l.metricsPublishedMap[key][strings.Join(labelValues, "+")] = struct{}{} 200 l.metricsPublishedMutex.Unlock() 201 } 202 203 func (l *LatencyMetricsCollector) listAndDeleteMetricsPublished(key string) (metricsPublished [][]string) { 204 l.metricsPublishedMutex.Lock() 205 defer l.metricsPublishedMutex.Unlock() 206 for labelValues := range l.metricsPublishedMap[key] { 207 metricsPublished = append(metricsPublished, strings.Split(labelValues, "+")) 208 } 209 delete(l.metricsPublishedMap, key) 210 return metricsPublished 211 } 212 213 func (l *LatencyMetricsCollector) createLatencyLabelValues(lm latencyMetric) ([]string, error) { 214 labelValues := []string{lm.Upstream, lm.Server, lm.Code} 215 upstreamServerLabelValues := l.getUpstreamServerLabels(lm.Upstream) 216 if len(l.upstreamServerLabelNames) != len(upstreamServerLabelValues) { 217 return nil, fmt.Errorf("wrong number of labels for upstream %v. For labels %v, got values: %v", 218 lm.Upstream, l.upstreamServerLabelNames, upstreamServerLabelValues) 219 } 220 labelValues = append(labelValues, upstreamServerLabelValues...) 221 peerServerLabelValues := l.getUpstreamServerPeerLabelValues(fmt.Sprintf("%v/%v", lm.Upstream, lm.Server)) 222 if len(l.upstreamServerPeerLabelNames) != len(peerServerLabelValues) { 223 return nil, fmt.Errorf("wrong number of labels for upstream peer %v. For labels %v, got values: %v", 224 lm.Server, l.upstreamServerPeerLabelNames, peerServerLabelValues) 225 } 226 labelValues = append(labelValues, peerServerLabelValues...) 227 return labelValues, nil 228 } 229 230 func createLatencyLabelNames(upstreamServerLabelNames, upstreamServerPeerLabelNames []string) []string { 231 return append(append([]string{"upstream", "server", "code"}, upstreamServerLabelNames...), upstreamServerPeerLabelNames...) 232 } 233 234 type syslogMsg struct { 235 ProxyHost string `json:"proxyHost"` 236 UpstreamAddr string `json:"upstreamAddress"` 237 UpstreamStatus string `json:"upstreamStatus"` 238 UpstreamResponseTime string `json:"upstreamResponseTime"` 239 } 240 241 type latencyMetric struct { 242 Upstream string 243 Server string 244 Code string 245 Latency float64 246 } 247 248 func parseMessage(msg string) (latencyMetric, error) { 249 msgParts := strings.Split(msg, nginxSeparator) 250 if len(msgParts) != 2 { 251 return latencyMetric{}, fmt.Errorf("wrong message format: %s, expected message to start with \"%s\"", msg, nginxSeparator) 252 } 253 var sm syslogMsg 254 info := msgParts[1] 255 if err := json.Unmarshal([]byte(info), &sm); err != nil { 256 return latencyMetric{}, fmt.Errorf("could not unmarshal %s: %w", msg, err) 257 } 258 if sm.UpstreamAddr == sm.ProxyHost { 259 // no upstream connected so don't publish a metric 260 return latencyMetric{}, fmt.Errorf("nginx could not connect to upstream") 261 } 262 server := parseMultipartResponse(sm.UpstreamAddr) 263 latency, err := strconv.ParseFloat(parseMultipartResponse(sm.UpstreamResponseTime), 64) 264 if err != nil { 265 return latencyMetric{}, fmt.Errorf("could not parse float from upstream response time %s: %w", sm.UpstreamResponseTime, err) 266 } 267 code := parseMultipartResponse(sm.UpstreamStatus) 268 lm := latencyMetric{ 269 Upstream: sm.ProxyHost, 270 Server: server, 271 Code: code, 272 Latency: latency, 273 } 274 275 return lm, nil 276 } 277 278 // parseMutlipartResponse checks if the input string contains commas. 279 // If it does it returns the last item of the list, otherwise it returns input. 280 func parseMultipartResponse(input string) string { 281 parts := strings.Split(input, ",") 282 if l := len(parts); l > 1 { 283 return strings.TrimLeft(parts[l-1], " ") 284 } 285 return input 286 } 287 288 // LatencyFakeCollector is a fake collector that implements the LatencyCollector interface 289 type LatencyFakeCollector struct{} 290 291 // DeleteMetrics implements a fake DeleteMetrics 292 func (l *LatencyFakeCollector) DeleteMetrics([]string) {} 293 294 // UpdateUpstreamServerPeerLabels implements a fake UpdateUpstreamServerPeerLabels 295 func (l *LatencyFakeCollector) UpdateUpstreamServerPeerLabels(map[string][]string) {} 296 297 // DeleteUpstreamServerPeerLabels implements a fake DeleteUpstreamServerPeerLabels 298 func (l *LatencyFakeCollector) DeleteUpstreamServerPeerLabels([]string) {} 299 300 // UpdateUpstreamServerLabels implements a fake UpdateUpstreamServerLabels 301 func (l *LatencyFakeCollector) UpdateUpstreamServerLabels(map[string][]string) {} 302 303 // DeleteUpstreamServerLabels implements a fake DeleteUpstreamServerLabels 304 func (l *LatencyFakeCollector) DeleteUpstreamServerLabels([]string) {} 305 306 // NewLatencyFakeCollector creates a fake collector that implements the LatencyCollector interface 307 func NewLatencyFakeCollector() *LatencyFakeCollector { 308 return &LatencyFakeCollector{} 309 } 310 311 // Register implements a fake Register 312 func (l *LatencyFakeCollector) Register(_ *prometheus.Registry) error { return nil } 313 314 // RecordLatency implements a fake RecordLatency 315 func (l *LatencyFakeCollector) RecordLatency(_ string) {}