github.com/google/cloudprober@v0.11.3/surfacers/prometheus/prometheus.go (about)

     1  // Copyright 2017-2020 The Cloudprober Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  /*
    16  Package prometheus provides a prometheus surfacer for Cloudprober. Prometheus
    17  surfacer exports incoming metrics over a web interface in a format that
    18  prometheus understands (http://prometheus.io).
    19  
    20  This surfacer processes each incoming EventMetrics and holds the latest value
    21  and timestamp for each metric in memory. These metrics are made available
    22  through a web URL (default: /metrics), which Prometheus scrapes at a regular
    23  interval.
    24  
    25  Example /metrics page:
    26  #TYPE sent counter
    27  sent{ptype="dns",probe="vm-to-public-dns",dst="8.8.8.8"} 181299 1497330037000
    28  sent{ptype="ping",probe="vm-to-public-dns",dst="8.8.4.4"} 362600 1497330037000
    29  #TYPE rcvd counter
    30  rcvd{ptype="dns",probe="vm-to-public-dns",dst="8.8.8.8"} 181234 1497330037000
    31  rcvd{ptype="ping",probe="vm-to-public-dns",dst="8.8.4.4"} 362600 1497330037000
    32  */
    33  package prometheus
    34  
    35  import (
    36  	"context"
    37  	"fmt"
    38  	"io"
    39  	"net/http"
    40  	"regexp"
    41  	"strconv"
    42  	"strings"
    43  	"time"
    44  
    45  	"github.com/google/cloudprober/logger"
    46  	"github.com/google/cloudprober/metrics"
    47  	"github.com/google/cloudprober/surfacers/common/options"
    48  	configpb "github.com/google/cloudprober/surfacers/prometheus/proto"
    49  )
    50  
// ValidMetricNameRegex and ValidLabelNameRegex are the regular expressions
// that prometheus metric and label names, respectively, should match. Since
// "-" is commonly used in metric and label names, we replace it with "_"
// before matching. If a name still doesn't match the regular expression, we
// ignore it with a warning log message.
const (
	ValidMetricNameRegex = "^[a-zA-Z_:]([a-zA-Z0-9_:])*$"
	ValidLabelNameRegex  = "^[a-zA-Z_]([a-zA-Z0-9_])*$"
)

// histogram is the prometheus metric type recorded for distribution values.
const histogram = "histogram"

// queriesQueueSize defines how many queries we can queue before we start
// blocking on previous queries to finish.
const queriesQueueSize = 10
    65  
// NOTE(review): both caches below are package-level maps that are read and
// written without a lock. That looks safe only while a single goroutine
// touches them -- confirm before running more than one surfacer per process.
var (
	// Cache of EventMetric label to prometheus label mapping. We use it to
	// quickly lookup if we have already seen a label and we have a prometheus
	// label corresponding to it. An empty string value marks a label name
	// that was found to be invalid.
	promLabelNames = make(map[string]string)

	// Cache of EventMetric metric to prometheus metric mapping. We use it to
	// quickly lookup if we have already seen a metric and we have a prometheus
	// metric name corresponding to it. Keys include the configured metrics
	// prefix; an empty string value marks a metric name that was found to be
	// invalid.
	promMetricNames = make(map[string]string)
)
    77  
// promMetric holds everything recorded for a single prometheus metric name:
// its type and the latest data point for each data key seen so far.
type promMetric struct {
	// Prometheus metric type written on the "#TYPE" line, e.g. "counter",
	// "gauge" or "histogram".
	typ string
	// Data key to latest data point mapping.
	data     map[string]*dataPoint
	dataKeys []string // To keep data keys ordered
}
    83  
// dataPoint is the latest recorded sample for a data key.
type dataPoint struct {
	// Value, already formatted as the string that is written to the output.
	value string
	// Sample time in Unix milliseconds (see promTime).
	timestamp int64
}
    88  
// httpWriter is a wrapper for http.ResponseWriter that includes a channel
// to signal the completion of the writing of the response. The goroutine that
// writes the response closes doneChan once it is finished with w.
type httpWriter struct {
	w        http.ResponseWriter
	doneChan chan struct{}
}
    95  
// PromSurfacer implements a prometheus surfacer for Cloudprober. PromSurfacer
// organizes metrics into a two-level data structure:
//
//  1. Metric name -> promMetric data structure map.
//  2. A promMetric organizes data associated with a metric in a
//     data key -> data point map, where a data point consists of a value
//     and a timestamp.
//
// Data key represents a unique combination of metric name and labels.
type PromSurfacer struct {
	c           *configpb.SurfacerConf // Configuration
	// Surfacer options; consulted (AllowMetric) to decide which metrics are
	// recorded.
	opts        *options.Options
	prefix      string                     // Metrics prefix, e.g. "cloudprober_"
	emChan      chan *metrics.EventMetrics // Buffered channel to store incoming EventMetrics
	metrics     map[string]*promMetric     // Metric name to promMetric mapping
	metricNames []string                   // Metric names, to keep names ordered.
	queryChan   chan *httpWriter           // Query channel
	l           *logger.Logger

	// A handler that takes a promMetric and a dataKey and writes the
	// corresponding metric string to the provided io.Writer.
	dataWriter func(w io.Writer, pm *promMetric, dataKey string)

	// Regexes for metric and label names.
	metricNameRe *regexp.Regexp
	labelNameRe  *regexp.Regexp
}
   121  
   122  // New returns a prometheus surfacer based on the config provided. It sets up a
   123  // goroutine to process both the incoming EventMetrics and the web requests for
   124  // the URL handler /metrics.
   125  func New(ctx context.Context, config *configpb.SurfacerConf, opts *options.Options, l *logger.Logger) (*PromSurfacer, error) {
   126  	if config == nil {
   127  		config = &configpb.SurfacerConf{}
   128  	}
   129  	ps := &PromSurfacer{
   130  		c:            config,
   131  		opts:         opts,
   132  		emChan:       make(chan *metrics.EventMetrics, config.GetMetricsBufferSize()),
   133  		queryChan:    make(chan *httpWriter, queriesQueueSize),
   134  		metrics:      make(map[string]*promMetric),
   135  		prefix:       config.GetMetricsPrefix(),
   136  		metricNameRe: regexp.MustCompile(ValidMetricNameRegex),
   137  		labelNameRe:  regexp.MustCompile(ValidLabelNameRegex),
   138  		l:            l,
   139  	}
   140  
   141  	if ps.c.GetIncludeTimestamp() {
   142  		ps.dataWriter = func(w io.Writer, pm *promMetric, k string) {
   143  			fmt.Fprintf(w, "%s %s %d\n", k, pm.data[k].value, pm.data[k].timestamp)
   144  		}
   145  	} else {
   146  		ps.dataWriter = func(w io.Writer, pm *promMetric, k string) {
   147  			fmt.Fprintf(w, "%s %s\n", k, pm.data[k].value)
   148  		}
   149  	}
   150  
   151  	// Start a goroutine to process the incoming EventMetrics as well as
   152  	// the incoming web queries. To avoid data access race conditions, we do
   153  	// one thing at a time.
   154  	go func() {
   155  		for {
   156  			select {
   157  			case <-ctx.Done():
   158  				ps.l.Infof("Context canceled, stopping the input/output processing loop.")
   159  				return
   160  			case em := <-ps.emChan:
   161  				ps.record(em)
   162  			case hw := <-ps.queryChan:
   163  				ps.writeData(hw.w)
   164  				close(hw.doneChan)
   165  			}
   166  		}
   167  	}()
   168  
   169  	http.HandleFunc(ps.c.GetMetricsUrl(), func(w http.ResponseWriter, r *http.Request) {
   170  		// doneChan is used to track the completion of the response writing. This is
   171  		// required as response is written in a different goroutine.
   172  		doneChan := make(chan struct{}, 1)
   173  		ps.queryChan <- &httpWriter{w, doneChan}
   174  		<-doneChan
   175  	})
   176  
   177  	l.Infof("Initialized prometheus exporter at the URL: %s", ps.c.GetMetricsUrl())
   178  	return ps, nil
   179  }
   180  
   181  // Write queues the incoming data into a channel. This channel is watched by a
   182  // goroutine that actually processes the data and updates the in-memory
   183  // database.
   184  func (ps *PromSurfacer) Write(_ context.Context, em *metrics.EventMetrics) {
   185  	select {
   186  	case ps.emChan <- em:
   187  	default:
   188  		ps.l.Errorf("PromSurfacer's write channel is full, dropping new data.")
   189  	}
   190  }
   191  
   192  func promType(em *metrics.EventMetrics) string {
   193  	switch em.Kind {
   194  	case metrics.CUMULATIVE:
   195  		return "counter"
   196  	case metrics.GAUGE:
   197  		return "gauge"
   198  	default:
   199  		return "unknown"
   200  	}
   201  }
   202  
   203  // promTime converts time.Time to Unix milliseconds.
   204  func promTime(t time.Time) int64 {
   205  	return t.UnixNano() / (1000 * 1000)
   206  }
   207  
   208  func (ps *PromSurfacer) recordMetric(metricName, key, value string, em *metrics.EventMetrics, typ string) {
   209  	// Recognized metric
   210  	if pm := ps.metrics[metricName]; pm != nil {
   211  		// Recognized metric name and labels combination.
   212  		if pm.data[key] != nil {
   213  			pm.data[key].value = value
   214  			pm.data[key].timestamp = promTime(em.Timestamp)
   215  			return
   216  		}
   217  		pm.data[key] = &dataPoint{
   218  			value:     value,
   219  			timestamp: promTime(em.Timestamp),
   220  		}
   221  		pm.dataKeys = append(pm.dataKeys, key)
   222  	} else {
   223  		// Newly discovered metric name.
   224  		if typ == "" {
   225  			typ = promType(em)
   226  		}
   227  		ps.metrics[metricName] = &promMetric{
   228  			typ: typ,
   229  			data: map[string]*dataPoint{
   230  				key: &dataPoint{
   231  					value:     value,
   232  					timestamp: promTime(em.Timestamp),
   233  				},
   234  			},
   235  			dataKeys: []string{key},
   236  		}
   237  		ps.metricNames = append(ps.metricNames, metricName)
   238  	}
   239  	return
   240  }
   241  
   242  // checkLabelName finds a prometheus label name for an incoming label. If label
   243  // is found to be invalid even after some basic conversions, a zero string is
   244  // returned.
   245  func (ps *PromSurfacer) checkLabelName(k string) string {
   246  	// Before checking with regex, see if this label name is
   247  	// already known. This block will be entered only once per
   248  	// label name.
   249  	if promLabel, ok := promLabelNames[k]; ok {
   250  		return promLabel
   251  	}
   252  
   253  	ps.l.Infof("Checking validity of new label: %s", k)
   254  	// We'll come here only once per label name.
   255  
   256  	// Prometheus doesn't support "-" in metric names.
   257  	labelName := strings.Replace(k, "-", "_", -1)
   258  	if !ps.labelNameRe.MatchString(labelName) {
   259  		// Explicitly store a zero string so that we don't check it again.
   260  		promLabelNames[k] = ""
   261  		ps.l.Warningf("Ignoring invalid prometheus label name: %s", k)
   262  		return ""
   263  	}
   264  	promLabelNames[k] = labelName
   265  	return labelName
   266  }
   267  
   268  // promMetricName finds a prometheus metric name for an incoming metric. If metric
   269  // is found to be invalid even after some basic conversions, a zero string is
   270  // returned.
   271  func (ps *PromSurfacer) promMetricName(k string) string {
   272  	k = ps.prefix + k
   273  
   274  	// Before checking with regex, see if this metric name is
   275  	// already known. This block will be entered only once per
   276  	// metric name.
   277  	if metricName, ok := promMetricNames[k]; ok {
   278  		return metricName
   279  	}
   280  
   281  	ps.l.Infof("Checking validity of new metric: %s", k)
   282  	// We'll come here only once per metric name.
   283  
   284  	// Prometheus doesn't support "-" in metric names.
   285  	metricName := strings.Replace(k, "-", "_", -1)
   286  	if !ps.metricNameRe.MatchString(metricName) {
   287  		// Explicitly store a zero string so that we don't check it again.
   288  		promMetricNames[k] = ""
   289  		ps.l.Warningf("Ignoring invalid prometheus metric name: %s", k)
   290  		return ""
   291  	}
   292  	promMetricNames[k] = metricName
   293  	return metricName
   294  }
   295  
   296  func dataKey(metricName string, labels []string) string {
   297  	return metricName + "{" + strings.Join(labels, ",") + "}"
   298  }
   299  
   300  // record processes the incoming EventMetrics and updates the in-memory
   301  // database.
   302  //
   303  // Since prometheus doesn't support certain metrics.Value types, we handle them
   304  // differently.
   305  //
   306  // metrics.Map value type:  We break Map values into multiple data keys, with
   307  // each map key corresponding to a label in the data key.
   308  // For example, "resp-code map:code 200:45 500:2" gets converted into:
   309  //   resp-code{code=200} 45
   310  //   resp-code{code=500}  2
   311  //
   312  // metrics.String value type: We convert string value type into a data key with
   313  // val="value" label.
   314  // For example, "version cloudprober-20170608-RC00" gets converted into:
   315  //   version{val=cloudprober-20170608-RC00} 1
   316  func (ps *PromSurfacer) record(em *metrics.EventMetrics) {
   317  	var labels []string
   318  	for _, k := range em.LabelsKeys() {
   319  		if labelName := ps.checkLabelName(k); labelName != "" {
   320  			labels = append(labels, labelName+"=\""+em.Label(k)+"\"")
   321  		}
   322  	}
   323  
   324  	for _, metricName := range em.MetricsKeys() {
   325  		if !ps.opts.AllowMetric(metricName) {
   326  			continue
   327  		}
   328  		pMetricName := ps.promMetricName(metricName)
   329  		if pMetricName == "" {
   330  			// No prometheus metric name found for this metric.
   331  			continue
   332  		}
   333  		val := em.Metric(metricName)
   334  
   335  		// Map values get expanded into metrics with extra label.
   336  		if mapVal, ok := val.(*metrics.Map); ok {
   337  			labelName := ps.checkLabelName(mapVal.MapName)
   338  			if labelName == "" {
   339  				continue
   340  			}
   341  			for _, k := range mapVal.Keys() {
   342  				labelsWithMap := append(labels, labelName+"=\""+k+"\"")
   343  				ps.recordMetric(pMetricName, dataKey(pMetricName, labelsWithMap), mapVal.GetKey(k).String(), em, "")
   344  			}
   345  			continue
   346  		}
   347  
   348  		// Distribution values get expanded into metrics with extra label "le".
   349  		if distVal, ok := val.(*metrics.Distribution); ok {
   350  			d := distVal.Data()
   351  			var val int64
   352  			ps.recordMetric(pMetricName, dataKey(pMetricName+"_sum", labels), strconv.FormatFloat(d.Sum, 'f', -1, 64), em, histogram)
   353  			ps.recordMetric(pMetricName, dataKey(pMetricName+"_count", labels), strconv.FormatInt(d.Count, 10), em, histogram)
   354  			for i := range d.LowerBounds {
   355  				val += d.BucketCounts[i]
   356  				var lb string
   357  				if i == len(d.LowerBounds)-1 {
   358  					lb = "+Inf"
   359  				} else {
   360  					lb = strconv.FormatFloat(d.LowerBounds[i+1], 'f', -1, 64)
   361  				}
   362  				labelsWithBucket := append(labels, "le=\""+lb+"\"")
   363  				ps.recordMetric(pMetricName, dataKey(pMetricName+"_bucket", labelsWithBucket), strconv.FormatInt(val, 10), em, histogram)
   364  			}
   365  			continue
   366  		}
   367  
   368  		// String values get converted into a label.
   369  		if _, ok := val.(metrics.String); ok {
   370  			newLabels := append(labels, "val="+val.String())
   371  			ps.recordMetric(pMetricName, dataKey(pMetricName, newLabels), "1", em, "")
   372  			continue
   373  		}
   374  
   375  		// All other value types, mostly numerical types.
   376  		ps.recordMetric(pMetricName, dataKey(pMetricName, labels), val.String(), em, "")
   377  	}
   378  }
   379  
   380  // writeData writes metrics data on w io.Writer
   381  func (ps *PromSurfacer) writeData(w io.Writer) {
   382  	for _, name := range ps.metricNames {
   383  		pm := ps.metrics[name]
   384  		fmt.Fprintf(w, "#TYPE %s %s\n", name, pm.typ)
   385  		for _, k := range pm.dataKeys {
   386  			ps.dataWriter(w, pm, k)
   387  		}
   388  	}
   389  }