github.com/nginxinc/kubernetes-ingress@v1.12.5/internal/metrics/collectors/latency.go

package collectors

import (
	"encoding/json"
	"fmt"
	"strconv"
	"strings"
	"sync"

	"github.com/golang/glog"
	"github.com/prometheus/client_golang/prometheus"
)

const nginxSeparator = "nginx:"

var latencyBucketsMilliSeconds = []float64{
	1,
	2,
	3,
	4,
	5,
	10,
	20,
	30,
	40,
	50,
	100,
	200,
	300,
	400,
	500,
	1000,
	2000,
	3000,
	4000,
	5000,
	10000,
	20000,
	30000,
	40000,
	50000,
}

// LatencyCollector is an interface for latency metrics
type LatencyCollector interface {
	RecordLatency(string)
	UpdateUpstreamServerLabels(map[string][]string)
	DeleteUpstreamServerLabels([]string)
	UpdateUpstreamServerPeerLabels(map[string][]string)
	DeleteUpstreamServerPeerLabels([]string)
	DeleteMetrics([]string)
	Register(*prometheus.Registry) error
}

// metricsPublishedMap is a map of upstream server peers (upstream/server) to a metricsSet.
// This map is used to keep track of all the metrics published for each upstream server peer,
// so that the metrics can be deleted when the upstream server peers are deleted.
type metricsPublishedMap map[string]metricsSet

// metricsSet is a set of metrics published.
// The keys are string representations of the lists of label values for a published metric.
// The list of label values is joined with the "+" symbol. For example, a metric produced with the label values
// ["one", "two", "three"] is added to the set with the key "one+two+three".
type metricsSet map[string]struct{}

// LatencyMetricsCollector implements the LatencyCollector interface and prometheus.Collector interface
type LatencyMetricsCollector struct {
	httpLatency                  *prometheus.HistogramVec
	upstreamServerLabelNames     []string
	upstreamServerPeerLabelNames []string
	upstreamServerLabels         map[string][]string
	upstreamServerPeerLabels     map[string][]string
	metricsPublishedMap          metricsPublishedMap
	metricsPublishedMutex        sync.Mutex
	variableLabelsMutex          sync.RWMutex
}

// NewLatencyMetricsCollector creates a new LatencyMetricsCollector
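//
// A minimal usage sketch (illustrative only; the label names, const labels, and
// registry wiring below are assumptions, not values taken from this file):
//
//	labelNames := []string{"service", "resource_type", "resource_name", "resource_namespace"}
//	peerLabelNames := []string{"pod_name"}
//	registry := prometheus.NewRegistry()
//	lc := NewLatencyMetricsCollector(map[string]string{"class": "nginx"}, labelNames, peerLabelNames)
//	if err := lc.Register(registry); err != nil {
//		glog.Errorf("could not register latency collector: %v", err)
//	}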
func NewLatencyMetricsCollector(
	constLabels map[string]string,
	upstreamServerLabelNames []string,
	upstreamServerPeerLabelNames []string,
) *LatencyMetricsCollector {
	return &LatencyMetricsCollector{
		httpLatency: prometheus.NewHistogramVec(prometheus.HistogramOpts{
			Namespace:   metricsNamespace,
			Name:        "upstream_server_response_latency_ms",
			Help:        "Bucketed response times from when NGINX establishes a connection to an upstream server to when the last byte of the response body is received by NGINX",
			ConstLabels: constLabels,
			Buckets:     latencyBucketsMilliSeconds,
		},
			createLatencyLabelNames(upstreamServerLabelNames, upstreamServerPeerLabelNames),
		),
		upstreamServerLabels:         make(map[string][]string),
		upstreamServerPeerLabels:     make(map[string][]string),
		metricsPublishedMap:          make(metricsPublishedMap),
		upstreamServerLabelNames:     upstreamServerLabelNames,
		upstreamServerPeerLabelNames: upstreamServerPeerLabelNames,
	}
}

// UpdateUpstreamServerPeerLabels updates the Upstream Server Peer Labels
func (l *LatencyMetricsCollector) UpdateUpstreamServerPeerLabels(upstreamServerPeerLabels map[string][]string) {
	l.variableLabelsMutex.Lock()
	for k, v := range upstreamServerPeerLabels {
		l.upstreamServerPeerLabels[k] = v
	}
	l.variableLabelsMutex.Unlock()
}

// DeleteUpstreamServerPeerLabels deletes the Upstream Server Peer Labels
func (l *LatencyMetricsCollector) DeleteUpstreamServerPeerLabels(peers []string) {
	l.variableLabelsMutex.Lock()
	for _, k := range peers {
		delete(l.upstreamServerPeerLabels, k)
	}
	l.variableLabelsMutex.Unlock()
}

// UpdateUpstreamServerLabels updates the upstream server label map
func (l *LatencyMetricsCollector) UpdateUpstreamServerLabels(newLabelValues map[string][]string) {
	l.variableLabelsMutex.Lock()
	for k, v := range newLabelValues {
		l.upstreamServerLabels[k] = v
	}
	l.variableLabelsMutex.Unlock()
}

// DeleteUpstreamServerLabels deletes upstream server labels
func (l *LatencyMetricsCollector) DeleteUpstreamServerLabels(upstreamNames []string) {
	l.variableLabelsMutex.Lock()
	for _, k := range upstreamNames {
		delete(l.upstreamServerLabels, k)
	}
	l.variableLabelsMutex.Unlock()
}

// DeleteMetrics deletes all published metrics associated with the given upstream server peer names.
func (l *LatencyMetricsCollector) DeleteMetrics(upstreamServerPeerNames []string) {
	for _, name := range upstreamServerPeerNames {
		for _, labelValues := range l.listAndDeleteMetricsPublished(name) {
			success := l.httpLatency.DeleteLabelValues(labelValues...)
			if !success {
				glog.Warningf("could not delete metric for upstream server peer: %s with values: %v", name, labelValues)
			}
		}
	}
}

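// getUpstreamServerPeerLabelValues returns the stored label values for the given upstream server peer, keyed as "upstream/server".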
func (l *LatencyMetricsCollector) getUpstreamServerPeerLabelValues(peer string) []string {
	l.variableLabelsMutex.RLock()
	defer l.variableLabelsMutex.RUnlock()
	return l.upstreamServerPeerLabels[peer]
}

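// getUpstreamServerLabels returns the stored label values for the given upstream name.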
func (l *LatencyMetricsCollector) getUpstreamServerLabels(upstreamName string) []string {
	l.variableLabelsMutex.RLock()
	defer l.variableLabelsMutex.RUnlock()
	return l.upstreamServerLabels[upstreamName]
}

// Register registers all the metrics of the collector
func (l *LatencyMetricsCollector) Register(registry *prometheus.Registry) error {
	return registry.Register(l)
}

// Describe implements prometheus.Collector interface Describe method
func (l *LatencyMetricsCollector) Describe(ch chan<- *prometheus.Desc) {
	l.httpLatency.Describe(ch)
}

// Collect implements the prometheus.Collector interface Collect method
func (l *LatencyMetricsCollector) Collect(ch chan<- prometheus.Metric) {
	l.httpLatency.Collect(ch)
}

// RecordLatency parses a syslog message and records latency
func (l *LatencyMetricsCollector) RecordLatency(syslogMsg string) {
	lm, err := parseMessage(syslogMsg)
	if err != nil {
		glog.V(3).Infof("could not parse syslog message: %v", err)
		return
	}
	labelValues, err := l.createLatencyLabelValues(lm)
	if err != nil {
		glog.Errorf("cannot record latency for upstream %s and server %s: %v", lm.Upstream, lm.Server, err)
		return
	}
	l.httpLatency.WithLabelValues(labelValues...).Observe(lm.Latency * 1000)
	l.updateMetricsPublished(lm.Upstream, lm.Server, labelValues)
}

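// updateMetricsPublished records the label values published for an upstream server peer
// (keyed as "upstream/server") so that the corresponding histogram series can be deleted later.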
func (l *LatencyMetricsCollector) updateMetricsPublished(upstreamName, server string, labelValues []string) {
	l.metricsPublishedMutex.Lock()
	key := fmt.Sprintf("%s/%s", upstreamName, server)
	if _, ok := l.metricsPublishedMap[key]; !ok {
		l.metricsPublishedMap[key] = make(metricsSet)
	}
	l.metricsPublishedMap[key][strings.Join(labelValues, "+")] = struct{}{}
	l.metricsPublishedMutex.Unlock()
}

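// listAndDeleteMetricsPublished returns every list of label values recorded for the given
// upstream server peer key and removes that key from the published-metrics map.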
func (l *LatencyMetricsCollector) listAndDeleteMetricsPublished(key string) (metricsPublished [][]string) {
	l.metricsPublishedMutex.Lock()
	defer l.metricsPublishedMutex.Unlock()
	for labelValues := range l.metricsPublishedMap[key] {
		metricsPublished = append(metricsPublished, strings.Split(labelValues, "+"))
	}
	delete(l.metricsPublishedMap, key)
	return metricsPublished
}

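// createLatencyLabelValues assembles the label values for one latency observation: the upstream
// name, server address and status code, followed by the configured upstream server label values
// and upstream server peer label values. It returns an error if the number of stored values does
// not match the number of configured label names.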
func (l *LatencyMetricsCollector) createLatencyLabelValues(lm latencyMetric) ([]string, error) {
	labelValues := []string{lm.Upstream, lm.Server, lm.Code}
	upstreamServerLabelValues := l.getUpstreamServerLabels(lm.Upstream)
	if len(l.upstreamServerLabelNames) != len(upstreamServerLabelValues) {
		return nil, fmt.Errorf("wrong number of labels for upstream %v. For labels %v, got values: %v",
			lm.Upstream, l.upstreamServerLabelNames, upstreamServerLabelValues)
	}
	labelValues = append(labelValues, upstreamServerLabelValues...)
	peerServerLabelValues := l.getUpstreamServerPeerLabelValues(fmt.Sprintf("%v/%v", lm.Upstream, lm.Server))
	if len(l.upstreamServerPeerLabelNames) != len(peerServerLabelValues) {
		return nil, fmt.Errorf("wrong number of labels for upstream peer %v. For labels %v, got values: %v",
			lm.Server, l.upstreamServerPeerLabelNames, peerServerLabelValues)
	}
	labelValues = append(labelValues, peerServerLabelValues...)
	return labelValues, nil
}

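// createLatencyLabelNames returns the label names for the latency histogram: the fixed labels
// "upstream", "server" and "code", followed by the upstream server label names and the upstream
// server peer label names. For example (illustrative names), upstreamServerLabelNames of
// ["service"] and upstreamServerPeerLabelNames of ["pod_name"] produce
// ["upstream", "server", "code", "service", "pod_name"].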
func createLatencyLabelNames(upstreamServerLabelNames, upstreamServerPeerLabelNames []string) []string {
	return append(append([]string{"upstream", "server", "code"}, upstreamServerLabelNames...), upstreamServerPeerLabelNames...)
}

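// syslogMsg holds the fields extracted from the JSON portion of a latency syslog message.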
type syslogMsg struct {
	ProxyHost            string `json:"proxyHost"`
	UpstreamAddr         string `json:"upstreamAddress"`
	UpstreamStatus       string `json:"upstreamStatus"`
	UpstreamResponseTime string `json:"upstreamResponseTime"`
}

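// latencyMetric is the parsed result of a latency syslog message. Latency is the upstream
// response time in seconds; RecordLatency multiplies it by 1000 to observe milliseconds.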
type latencyMetric struct {
	Upstream string
	Server   string
	Code     string
	Latency  float64
}

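// parseMessage parses a latency syslog message into a latencyMetric. The message is expected to
// contain the "nginx:" separator followed by a JSON payload, for example (illustrative values):
//
//	nginx:{"proxyHost":"upstream_name","upstreamAddress":"10.0.0.1:8080","upstreamStatus":"200","upstreamResponseTime":"0.025"}
//
// An error is returned if the message cannot be parsed or if NGINX could not connect to an
// upstream (in which case upstreamAddress equals proxyHost).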
func parseMessage(msg string) (latencyMetric, error) {
	msgParts := strings.Split(msg, nginxSeparator)
	if len(msgParts) != 2 {
		return latencyMetric{}, fmt.Errorf("wrong message format: %s, expected message to start with \"%s\"", msg, nginxSeparator)
	}
	var sm syslogMsg
	info := msgParts[1]
	if err := json.Unmarshal([]byte(info), &sm); err != nil {
		return latencyMetric{}, fmt.Errorf("could not unmarshal %s: %w", msg, err)
	}
	if sm.UpstreamAddr == sm.ProxyHost {
		// no upstream connected so don't publish a metric
		return latencyMetric{}, fmt.Errorf("nginx could not connect to upstream")
	}
	server := parseMultipartResponse(sm.UpstreamAddr)
	latency, err := strconv.ParseFloat(parseMultipartResponse(sm.UpstreamResponseTime), 64)
	if err != nil {
		return latencyMetric{}, fmt.Errorf("could not parse float from upstream response time %s: %w", sm.UpstreamResponseTime, err)
	}
	code := parseMultipartResponse(sm.UpstreamStatus)
	lm := latencyMetric{
		Upstream: sm.ProxyHost,
		Server:   server,
		Code:     code,
		Latency:  latency,
	}

	return lm, nil
}

// parseMultipartResponse checks whether the input string contains commas.
// If it does, it returns the last item of the comma-separated list; otherwise it returns the input unchanged.
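// For example, "10.0.0.1:80, 10.0.0.2:80" yields "10.0.0.2:80".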
func parseMultipartResponse(input string) string {
	parts := strings.Split(input, ",")
	if l := len(parts); l > 1 {
		return strings.TrimLeft(parts[l-1], " ")
	}
	return input
}

// LatencyFakeCollector is a fake collector that implements the LatencyCollector interface
type LatencyFakeCollector struct{}

// DeleteMetrics implements a fake DeleteMetrics
func (l *LatencyFakeCollector) DeleteMetrics([]string) {}

// UpdateUpstreamServerPeerLabels implements a fake UpdateUpstreamServerPeerLabels
func (l *LatencyFakeCollector) UpdateUpstreamServerPeerLabels(map[string][]string) {}

// DeleteUpstreamServerPeerLabels implements a fake DeleteUpstreamServerPeerLabels
func (l *LatencyFakeCollector) DeleteUpstreamServerPeerLabels([]string) {}

// UpdateUpstreamServerLabels implements a fake UpdateUpstreamServerLabels
func (l *LatencyFakeCollector) UpdateUpstreamServerLabels(map[string][]string) {}

// DeleteUpstreamServerLabels implements a fake DeleteUpstreamServerLabels
func (l *LatencyFakeCollector) DeleteUpstreamServerLabels([]string) {}

// NewLatencyFakeCollector creates a fake collector that implements the LatencyCollector interface
func NewLatencyFakeCollector() *LatencyFakeCollector {
	return &LatencyFakeCollector{}
}

// Register implements a fake Register
func (l *LatencyFakeCollector) Register(_ *prometheus.Registry) error { return nil }

// RecordLatency implements a fake RecordLatency
func (l *LatencyFakeCollector) RecordLatency(_ string) {}