github.com/google/cloudprober@v0.11.3/surfacers/stackdriver/stackdriver.go (about)

     1  // Copyright 2017-2021 The Cloudprober Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  /*
    16  Package stackdriver implements the Stackdriver version of the Surfacer
    17  object. This package allows users to create an initialized Stack Driver
    18  Surfacer and use it to write custom metrics data.
    19  */
    20  package stackdriver
    21  
    22  import (
    23  	"context"
    24  	"fmt"
    25  	"math/rand"
    26  	"regexp"
    27  	"strings"
    28  	"time"
    29  
    30  	"cloud.google.com/go/compute/metadata"
    31  	"github.com/google/cloudprober/logger"
    32  	"golang.org/x/oauth2/google"
    33  	monitoring "google.golang.org/api/monitoring/v3"
    34  
    35  	"github.com/google/cloudprober/metrics"
    36  	"github.com/google/cloudprober/surfacers/common/options"
    37  	configpb "github.com/google/cloudprober/surfacers/stackdriver/proto"
    38  )
    39  
// batchSize is the maximum number of timeseries entries packed into a
// single TimeSeries Create request; the Stackdriver API caps how many
// timeseries one write call may carry, so larger caches are flushed in
// multiple requests (see writeBatch).
const batchSize = 200
    41  
    42  //-----------------------------------------------------------------------------
    43  // Stack Driver Surfacer Specific Code
    44  //-----------------------------------------------------------------------------
    45  
// SDSurfacer is the Stackdriver (Cloud Monitoring) implementation of
// the Surfacer interface. It holds an authenticated client for making
// Stackdriver API calls and a registry that keeps track of which
// metric descriptors have already been registered.
type SDSurfacer struct {
	c    *configpb.SurfacerConf
	opts *options.Options

	// Compiled form of the deprecated allowed_metrics_regex option;
	// nil when the option is unset.
	allowedMetricsRegex *regexp.Regexp

	// Internal cache for saving metric data until a batch is sent;
	// knownMetrics records metric types whose descriptors have been
	// created so each descriptor is created only once.
	cache        map[string]*monitoring.TimeSeries
	knownMetrics map[string]bool

	// Channel for writing the data without blocking
	writeChan chan *metrics.EventMetrics

	// VM Information; resource is populated only when running on GCE.
	onGCE       bool
	projectName string
	resource    *monitoring.MonitoredResource

	// Time when stackdriver module was initialized. This is used as start time
	// for cumulative metrics.
	startTime time.Time

	// Cloud logger; failCnt counts failed TimeSeries Create calls.
	l       *logger.Logger
	failCnt int64

	// Monitoring client
	client *monitoring.Service
}
    79  
    80  // New initializes a SDSurfacer for Stack Driver with all its necessary internal
    81  // variables for call references (project and instances variables) as well
    82  // as provisioning it with clients for making the necessary API calls. New
    83  // requires you to pass in a valid stackdriver surfacer configuration.
    84  func New(ctx context.Context, config *configpb.SurfacerConf, opts *options.Options, l *logger.Logger) (*SDSurfacer, error) {
    85  	// Create a cache, which is used for batching write requests together,
    86  	// and a channel for writing data.
    87  	s := SDSurfacer{
    88  		cache:        make(map[string]*monitoring.TimeSeries),
    89  		knownMetrics: make(map[string]bool),
    90  		writeChan:    make(chan *metrics.EventMetrics, config.GetMetricsBufferSize()),
    91  		c:            config,
    92  		opts:         opts,
    93  		projectName:  config.GetProject(),
    94  		startTime:    time.Now(),
    95  		l:            l,
    96  	}
    97  
    98  	if s.c.GetAllowedMetricsRegex() != "" {
    99  		l.Warning("allowed_metrics_regex is now deprecated. Please use the common surfacer options: allow_metrics, ignore_metrics.")
   100  		r, err := regexp.Compile(s.c.GetAllowedMetricsRegex())
   101  		if err != nil {
   102  			return nil, err
   103  		}
   104  		s.allowedMetricsRegex = r
   105  	}
   106  
   107  	// Find all the necessary information for writing metrics to Stack
   108  	// Driver.
   109  	var err error
   110  
   111  	if metadata.OnGCE() {
   112  		s.onGCE = true
   113  
   114  		if s.projectName == "" {
   115  			if s.projectName, err = metadata.ProjectID(); err != nil {
   116  				return nil, fmt.Errorf("unable to retrieve project name: %v", err)
   117  			}
   118  		}
   119  
   120  		mr, err := monitoredResourceOnGCE(s.projectName)
   121  		if err != nil {
   122  			return nil, fmt.Errorf("error initializing monitored resource for stackdriver on GCE: %v", err)
   123  		}
   124  
   125  		s.resource = mr
   126  
   127  	}
   128  
   129  	// Create monitoring client
   130  	httpClient, err := google.DefaultClient(ctx, monitoring.CloudPlatformScope)
   131  	if err != nil {
   132  		return nil, err
   133  	}
   134  	s.client, err = monitoring.New(httpClient)
   135  	if err != nil {
   136  		return nil, err
   137  	}
   138  
   139  	// Start either the writeAsync or the writeBatch, depending on if we are
   140  	// batching or not.
   141  	go s.writeBatch(ctx)
   142  
   143  	s.l.Info("Created a new stackdriver surfacer")
   144  	return &s, nil
   145  }
   146  
// Write queues an EventMetrics object for asynchronous export to
// Stackdriver. It never blocks: the data is pushed onto writeChan
// (consumed by the writeBatch goroutine, which makes the actual API
// calls); if the channel buffer (metrics_buffer_size) is full, the new
// data is dropped and an error is logged.
func (s *SDSurfacer) Write(_ context.Context, em *metrics.EventMetrics) {
	// Non-blocking send: the default case fires only when writeChan
	// is full.
	select {
	case s.writeChan <- em:
	default:
		s.l.Errorf("SDSurfacer's write channel is full, dropping new data.")
	}
}
   158  
   159  // createMetricDescriptor creates metric descriptor for the given timeseries.
   160  // We create metric descriptors explicitly, instead of relying on auto-
   161  // creation by creating timeseries, because auto-creation doesn't add units to
   162  // the metric.
   163  func (s *SDSurfacer) createMetricDescriptor(ts *monitoring.TimeSeries) error {
   164  	var labels []*monitoring.LabelDescriptor
   165  	for k := range ts.Metric.Labels {
   166  		labels = append(labels, &monitoring.LabelDescriptor{
   167  			Key:       k,
   168  			ValueType: "STRING",
   169  		})
   170  	}
   171  
   172  	_, err := s.client.Projects.MetricDescriptors.Create("projects/"+s.projectName, &monitoring.MetricDescriptor{
   173  		Name:       "projects/" + s.projectName + "/metricDescriptors/" + ts.Metric.Type,
   174  		Type:       ts.Metric.Type,
   175  		MetricKind: ts.MetricKind,
   176  		Labels:     labels,
   177  		Unit:       ts.Unit,
   178  		ValueType:  ts.ValueType,
   179  	}).Do()
   180  
   181  	return err
   182  }
   183  
   184  // writeBatch polls the writeChan and the sendChan waiting for either a new
   185  // write packet or a new context. If data comes in on the writeChan, then
   186  // the data is pulled off and put into the cache (if there is already an
   187  // entry into the cache for the same metric, it updates the metric to the
   188  // new data). If ticker fires, then the metrics in the cache
   189  // are batched together. The Stackdriver API has a limit on the maximum number
   190  // of metrics that can be sent in a single request, so we may have to make
   191  // multiple requests to the Stackdriver API to send the full cache of metrics.
   192  //
   193  // writeBatch is set up to run as an infinite goroutine call in the New function
   194  // to allow it to write asynchronously to Stack Driver.
   195  func (s *SDSurfacer) writeBatch(ctx context.Context) {
   196  	// Introduce a random delay before starting the loop.
   197  	rand.Seed(time.Now().UnixNano())
   198  	randomDelay := time.Duration(rand.Int63n(int64(s.c.GetBatchTimerSec()))) * time.Second
   199  	time.Sleep(randomDelay)
   200  
   201  	batchTicker := time.NewTicker(time.Duration(s.c.GetBatchTimerSec()) * time.Second)
   202  	for {
   203  		select {
   204  		case <-ctx.Done():
   205  			s.l.Infof("Context canceled, stopping the input processing loop.")
   206  			batchTicker.Stop()
   207  			return
   208  		case em := <-s.writeChan:
   209  			// Process EventMetrics to build timeseries using them and cache the timeseries
   210  			// objects.
   211  			s.recordEventMetrics(em)
   212  		case <-batchTicker.C:
   213  			// Empty time series writes cause an error to be returned, so
   214  			// we skip any calls that write but wouldn't set any data.
   215  			if len(s.cache) == 0 {
   216  				break
   217  			}
   218  
   219  			var ts []*monitoring.TimeSeries
   220  			for _, v := range s.cache {
   221  				if !s.knownMetrics[v.Metric.Type] && v.Unit != "" {
   222  					if err := s.createMetricDescriptor(v); err != nil {
   223  						s.l.Warningf("Error creating metric descriptor for: %s, err: %v", v.Metric.Type, err)
   224  						continue
   225  					}
   226  					s.knownMetrics[v.Metric.Type] = true
   227  				}
   228  				ts = append(ts, v)
   229  			}
   230  
   231  			// We batch the time series into appropriately-sized sets
   232  			// and write them
   233  			for i := 0; i < len(ts); i += batchSize {
   234  				endIndex := min(len(ts), i+batchSize)
   235  
   236  				s.l.Infof("Sending entries %d through %d of %d", i, endIndex, len(ts))
   237  
   238  				// Now that we've created the new metric, we can write the data. Making
   239  				// a time series create call will automatically register a new metric
   240  				// with the correct information if it does not already exist.
   241  				// Ref: https://cloud.google.com/monitoring/custom-metrics/creating-metrics#auto-creation
   242  				requestBody := monitoring.CreateTimeSeriesRequest{
   243  					TimeSeries: ts[i:endIndex],
   244  				}
   245  				if _, err := s.client.Projects.TimeSeries.Create("projects/"+s.projectName, &requestBody).Do(); err != nil {
   246  					s.failCnt++
   247  					s.l.Warningf("Unable to fulfill TimeSeries Create call. Err: %v", err)
   248  				}
   249  			}
   250  
   251  			// Flush the cache after we've finished writing so we don't accidentally
   252  			// re-write metric values that haven't been written over several write
   253  			// cycles.
   254  			for k := range s.cache {
   255  				delete(s.cache, k)
   256  			}
   257  		}
   258  	}
   259  
   260  }
   261  
   262  //-----------------------------------------------------------------------------
   263  // StackDriver Object Creation and Helper Functions
   264  //-----------------------------------------------------------------------------
   265  
   266  // recordTimeSeries forms a timeseries object from the given arguments, records
   267  // it in the cache if batch processing is enabled, and returns it.
   268  //
   269  // More information on the object and specific fields can be found here:
   270  //	https://cloud.google.com/monitoring/api/ref_v3/rest/v3/TimeSeries
   271  func (s *SDSurfacer) recordTimeSeries(metricKind, metricName, msgType string, labels map[string]string, timestamp time.Time, tv *monitoring.TypedValue, unit, cacheKey string) *monitoring.TimeSeries {
   272  	startTime := s.startTime.Format(time.RFC3339Nano)
   273  	if metricKind == "GAUGE" {
   274  		startTime = timestamp.Format(time.RFC3339Nano)
   275  	}
   276  
   277  	ts := &monitoring.TimeSeries{
   278  		// The URL address for our custom metric, must match the
   279  		// name we used in the MetricDescriptor.
   280  		Metric: &monitoring.Metric{
   281  			Type:   s.c.GetMonitoringUrl() + metricName,
   282  			Labels: labels,
   283  		},
   284  
   285  		// Must match the MetricKind and ValueType of the MetricDescriptor.
   286  		MetricKind: metricKind,
   287  		ValueType:  msgType,
   288  		Unit:       unit,
   289  
   290  		// Create a single data point, this could be utilized to create
   291  		// a batch of points instead of a single point if the write
   292  		// rate is too high.
   293  		Points: []*monitoring.Point{
   294  			{
   295  				Interval: &monitoring.TimeInterval{
   296  					StartTime: startTime,
   297  					EndTime:   timestamp.Format(time.RFC3339Nano),
   298  				},
   299  				Value: tv,
   300  			},
   301  		},
   302  	}
   303  
   304  	if s.resource != nil {
   305  		ts.Resource = s.resource
   306  	}
   307  
   308  	// We create a key that is a composite of both the name and the
   309  	// labels so we can make sure that the cache holds all distinct
   310  	// values and not just the ones with different names.
   311  	s.cache[metricName+","+cacheKey] = ts
   312  
   313  	return ts
   314  
   315  }
   316  
   317  // sdKind converts EventMetrics kind to StackDriver kind string.
   318  func (s *SDSurfacer) sdKind(kind metrics.Kind) string {
   319  	switch kind {
   320  	case metrics.GAUGE:
   321  		return "GAUGE"
   322  	case metrics.CUMULATIVE:
   323  		return "CUMULATIVE"
   324  	default:
   325  		return ""
   326  	}
   327  }
   328  
   329  // processLabels processes EventMetrics labels to generate:
   330  //	- a map of label key values to use in StackDriver timeseries,
   331  //	- a labels key of the form label1_key=label1_val,label2_key=label2_val,
   332  //	  used for caching.
   333  //	- prefix for metric names, usually <ptype>/<probe>.
   334  func processLabels(em *metrics.EventMetrics) (labels map[string]string, labelsKey, metricPrefix string) {
   335  	labels = make(map[string]string)
   336  	var sortedLabels []string // we use this for cache key below
   337  	var ptype, probe string
   338  	for _, k := range em.LabelsKeys() {
   339  		if k == "ptype" {
   340  			ptype = em.Label(k)
   341  			continue
   342  		}
   343  		if k == "probe" {
   344  			probe = em.Label(k)
   345  			continue
   346  		}
   347  		labels[k] = em.Label(k)
   348  		sortedLabels = append(sortedLabels, k+"="+labels[k])
   349  	}
   350  	labelsKey = strings.Join(sortedLabels, ",")
   351  
   352  	if ptype != "" {
   353  		metricPrefix += ptype + "/"
   354  	}
   355  	if probe != "" {
   356  		metricPrefix += probe + "/"
   357  	}
   358  	return
   359  }
   360  
   361  func (s *SDSurfacer) ignoreMetric(name string) bool {
   362  	if s.allowedMetricsRegex != nil {
   363  		if !s.allowedMetricsRegex.MatchString(name) {
   364  			return true
   365  		}
   366  	}
   367  
   368  	if !validMetricLength(name, s.c.GetMonitoringUrl()) {
   369  		s.l.Warningf("Message name %q is greater than the 100 character limit, skipping write", name)
   370  		return true
   371  	}
   372  
   373  	return false
   374  }
   375  
   376  // recordEventMetrics processes the incoming EventMetrics objects and builds
   377  // TimeSeries from it.
   378  //
   379  // Since stackdriver doesn't support metrics.String and metrics.Map value types,
   380  // it converts them to a numerical types (stackdriver type Double) with
   381  // additional labels. See the inline comments for this conversion is done.
   382  func (s *SDSurfacer) recordEventMetrics(em *metrics.EventMetrics) (ts []*monitoring.TimeSeries) {
   383  	metricKind := s.sdKind(em.Kind)
   384  	if metricKind == "" {
   385  		s.l.Warningf("Unknown event metrics type (not CUMULATIVE or GAUGE): %v", em.Kind)
   386  		return
   387  	}
   388  
   389  	emLabels, cacheKey, metricPrefix := processLabels(em)
   390  
   391  	for _, k := range em.MetricsKeys() {
   392  		if !s.opts.AllowMetric(k) {
   393  			continue
   394  		}
   395  
   396  		// Create a copy of emLabels for use in timeseries object.
   397  		mLabels := make(map[string]string)
   398  		for k, v := range emLabels {
   399  			mLabels[k] = v
   400  		}
   401  		name := metricPrefix + k
   402  
   403  		if s.ignoreMetric(name) {
   404  			continue
   405  		}
   406  
   407  		// Create the correct TimeSeries object based on the incoming data
   408  		val := em.Metric(k)
   409  
   410  		unit := "1" // "1" is the default unit for numbers.
   411  		if k == "latency" {
   412  			unit = map[time.Duration]string{
   413  				time.Second:      "s",
   414  				time.Millisecond: "ms",
   415  				time.Microsecond: "us",
   416  				time.Nanosecond:  "ns",
   417  			}[em.LatencyUnit]
   418  		}
   419  
   420  		// If metric value is of type numerical value.
   421  		if v, ok := val.(metrics.NumValue); ok {
   422  			f := float64(v.Int64())
   423  			ts = append(ts, s.recordTimeSeries(metricKind, name, "DOUBLE", mLabels, em.Timestamp, &monitoring.TypedValue{DoubleValue: &f}, unit, cacheKey))
   424  			continue
   425  		}
   426  
   427  		// If metric value is of type String.
   428  		if v, ok := val.(metrics.String); ok {
   429  			// Since StackDriver doesn't support string value type for custom metrics,
   430  			// we convert string metrics into a numeric metric with an additional label
   431  			// val="string-val".
   432  			//
   433  			// metrics.String stringer wraps string values in a single "". Remove those
   434  			// for stackdriver.
   435  			mLabels["val"] = strings.Trim(v.String(), "\"")
   436  			f := float64(1)
   437  			ts = append(ts, s.recordTimeSeries(metricKind, name, "DOUBLE", mLabels, em.Timestamp, &monitoring.TypedValue{DoubleValue: &f}, unit, cacheKey))
   438  			continue
   439  		}
   440  
   441  		// If metric value is of type Map.
   442  		if mapValue, ok := val.(*metrics.Map); ok {
   443  			// Since StackDriver doesn't support Map value type, we convert Map values
   444  			// to multiple timeseries with map's KeyName and key as labels.
   445  			for _, mapKey := range mapValue.Keys() {
   446  				mmLabels := make(map[string]string)
   447  				for lk, lv := range mLabels {
   448  					mmLabels[lk] = lv
   449  				}
   450  				mmLabels[mapValue.MapName] = mapKey
   451  				f := float64(mapValue.GetKey(mapKey).Int64())
   452  				ts = append(ts, s.recordTimeSeries(metricKind, name, "DOUBLE", mmLabels, em.Timestamp, &monitoring.TypedValue{DoubleValue: &f}, unit, cacheKey))
   453  			}
   454  			continue
   455  		}
   456  
   457  		// If metric value is of type Distribution.
   458  		if distValue, ok := val.(*metrics.Distribution); ok {
   459  			ts = append(ts, s.recordTimeSeries(metricKind, name, "DISTRIBUTION", mLabels, em.Timestamp, distValue.StackdriverTypedValue(), unit, cacheKey))
   460  			continue
   461  		}
   462  
   463  		// We'll reach here only if encounter an unsupported value type.
   464  		s.l.Warningf("Unsupported value type: %v", val)
   465  	}
   466  	return ts
   467  }
   468  
   469  //-----------------------------------------------------------------------------
   470  // Non-stackdriver Helper Functions
   471  //-----------------------------------------------------------------------------
   472  
   473  // checkMetricLength checks if the combination of the metricName and the url
   474  // prefix are longer than 100 characters, which is illegal in a Stackdriver
   475  // call. Stack Driver doesn't allow custom metrics with more than 100 character
   476  // names, so we have a check to see if we are going over the limit.
   477  //	Ref: https://cloud.google.com/monitoring/api/v3/metrics#metric_names
   478  func validMetricLength(metricName string, monitoringURL string) bool {
   479  	if len(metricName)+len(monitoringURL) > 100 {
   480  		return false
   481  	}
   482  	return true
   483  }
   484  
   485  // Function to return the min of two integers
   486  func min(a, b int) int {
   487  	if a < b {
   488  		return a
   489  	}
   490  	return b
   491  }