github.com/google/cloudprober@v0.11.3/probes/http/http.go (about)

     1  // Copyright 2017-2020 The Cloudprober Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package http implements HTTP probe type.
    16  package http
    17  
    18  import (
    19  	"bytes"
    20  	"context"
    21  	"crypto/tls"
    22  	"fmt"
    23  	"io/ioutil"
    24  	"math/rand"
    25  	"net"
    26  	"net/http"
    27  	"net/http/httptrace"
    28  	"net/url"
    29  	"strconv"
    30  	"strings"
    31  	"sync"
    32  	"time"
    33  
    34  	"github.com/google/cloudprober/common/oauth"
    35  	"github.com/google/cloudprober/common/tlsconfig"
    36  	"github.com/google/cloudprober/logger"
    37  	"github.com/google/cloudprober/metrics"
    38  	configpb "github.com/google/cloudprober/probes/http/proto"
    39  	"github.com/google/cloudprober/probes/options"
    40  	"github.com/google/cloudprober/targets/endpoint"
    41  	"github.com/google/cloudprober/validators"
    42  	"golang.org/x/oauth2"
    43  )
    44  
    45  // DefaultTargetsUpdateInterval defines default frequency for target updates.
    46  // Actual targets update interval is:
    47  // max(DefaultTargetsUpdateInterval, probe_interval)
    48  var DefaultTargetsUpdateInterval = 1 * time.Minute
    49  
    50  // maxGapBetweenTargets defines the maximum gap between probe loops for each
    51  // target. Actual gap is either configured or determined by the probe interval
    52  // and number of targets.
        //
        // NOTE(review): this constant is not referenced anywhere in the visible
        // code; gapBetweenTargets() does not cap its result with it. Confirm
        // whether the cap is still intended.
    53  const maxGapBetweenTargets = 1 * time.Second
    54  
    55  const (
        	// maxResponseSizeForMetrics is the largest response body, in bytes, that
        	// doHTTPRequest records in the per-target response-body map; longer
        	// bodies are not recorded there.
    56  	maxResponseSizeForMetrics = 128
        	// NOTE(review): targetsUpdateInterval is not referenced in the visible
        	// code; the Probe struct has a field of the same name that Init sets
        	// from DefaultTargetsUpdateInterval. Confirm whether this constant is
        	// still needed.
    57  	targetsUpdateInterval     = 1 * time.Minute
        	// largeBodyThreshold is the request-body size, in bytes, at or above
        	// which doHTTPRequest clones the request and attaches a fresh body
        	// reader before sending it.
    58  	largeBodyThreshold        = bytes.MinRead // 512.
    59  )
    60  
    61  // Probe holds aggregate information about all probe runs, per-target.
    62  type Probe struct {
        	// name is the probe name passed to Init; exportMetrics attaches it to
        	// every exported metric as the "probe" label.
    63  	name   string
        	// opts holds the generic probe options passed to Init.
    64  	opts   *options.Options
        	// c is the HTTP-probe-specific config extracted from opts.ProbeConf.
    65  	c      *configpb.ProbeConf
        	// l is the probe's logger; Init falls back to an empty logger.Logger
        	// when opts.Logger is nil.
    66  	l      *logger.Logger
        	// client is the HTTP client, built in Init, that is shared by all
        	// requests of this probe.
    67  	client *http.Client
    68  
    69  	// book-keeping params
        	// targets is the most recent list returned by opts.Targets.ListEndpoints().
    70  	targets     []endpoint.Endpoint
        	// protocol is the lower-cased protocol name from the config.
    71  	protocol    string
        	// method is the HTTP method name from the config.
    72  	method      string
        	// url is the relative URL from the config; when non-empty it must begin
        	// with '/' (enforced in Init).
    73  	url         string
        	// oauthTS is the OAuth token source; it is nil unless an OAuth config is
        	// present.
    74  	oauthTS     oauth2.TokenSource
        	// bearerToken is the token most recently obtained by updateOauthToken.
    75  	bearerToken string
    76  
    77  	// Run counter, used to decide when to update targets or export
    78  	// stats.
        	// NOTE(review): this field is not referenced in the visible code;
        	// startForTarget keeps its own local run counter. Confirm it is still
        	// needed.
    79  	runCnt int64
    80  
    81  	// How often to refresh the list of targets. Init sets it to the larger of
        	// DefaultTargetsUpdateInterval and the probe interval.
    82  	targetsUpdateInterval time.Duration
    83  
    84  	// How often to export metrics (in probe counts), initialized to
    85  	// statsExportInterval / p.opts.Interval. Metrics are exported when
    86  	// (runCnt % statsExportFrequency) == 0
    87  	statsExportFrequency int64
    88  
    89  	// Cancel functions for per-target probe loop
        	// The map is keyed by the target's Key().
    90  	cancelFuncs map[string]context.CancelFunc
        	// waitGroup tracks the per-target goroutines started by
        	// updateTargetsAndStartProbes.
    91  	waitGroup   sync.WaitGroup
    92  
        	// requestBody is the request body from the config, as bytes.
    93  	requestBody []byte
    94  }
    95  
    // probeResult accumulates the counters for a single target. One instance is
    // created per target probe loop (see newResult) and is updated by
    // doHTTPRequest and read by exportMetrics.
    96  type probeResult struct {
        	// total counts every request attempted; success counts requests that
        	// completed and passed all validators; timeouts counts requests that
        	// failed with a client timeout.
    97  	total, success, timeouts int64
        	// connEvent counts connection attempts reported through the HTTP client
        	// trace; it is only updated and exported when keep-alive is enabled.
    98  	connEvent                int64
        	// latency accumulates the latency of successful requests, expressed in
        	// units of opts.LatencyUnit.
    99  	latency                  metrics.Value
        	// respCodes counts responses keyed by HTTP status code.
   100  	respCodes                *metrics.Map
        	// respBodies counts successful responses keyed by response body; it is
        	// nil unless exporting responses as metrics is enabled in the config.
   101  	respBodies               *metrics.Map
        	// validationFailure is nil when no validators are configured; otherwise
        	// it is passed to validators.RunValidators on every response.
   102  	validationFailure        *metrics.Map
   103  }
   104  
   105  func (p *Probe) updateOauthToken() {
   106  	if p.oauthTS == nil {
   107  		return
   108  	}
   109  
   110  	tok, err := p.oauthTS.Token()
   111  	if err != nil {
   112  		p.l.Error("Error getting OAuth token: ", err.Error(), ". Skipping updating the token.")
   113  	} else {
   114  		if tok.AccessToken != "" {
   115  			p.bearerToken = tok.AccessToken
   116  		} else {
   117  			idToken, ok := tok.Extra("id_token").(string)
   118  			if ok {
   119  				p.bearerToken = idToken
   120  			}
   121  		}
   122  		p.l.Debug("Got OAuth token, len: ", strconv.FormatInt(int64(len(p.bearerToken)), 10), ", expirationTime: ", tok.Expiry.String())
   123  	}
   124  }
   125  
   126  // Init initializes the probe with the given params.
   127  func (p *Probe) Init(name string, opts *options.Options) error {
   128  	c, ok := opts.ProbeConf.(*configpb.ProbeConf)
   129  	if !ok {
   130  		return fmt.Errorf("not http config")
   131  	}
   132  	p.name = name
   133  	p.opts = opts
   134  	if p.l = opts.Logger; p.l == nil {
   135  		p.l = &logger.Logger{}
   136  	}
   137  	p.c = c
   138  
   139  	p.protocol = strings.ToLower(p.c.GetProtocol().String())
   140  	p.method = p.c.GetMethod().String()
   141  
   142  	p.url = p.c.GetRelativeUrl()
   143  	if len(p.url) > 0 && p.url[0] != '/' {
   144  		return fmt.Errorf("Invalid Relative URL: %s, must begin with '/'", p.url)
   145  	}
   146  
   147  	p.requestBody = []byte(p.c.GetBody())
   148  
   149  	// Create a transport for our use. This is mostly based on
   150  	// http.DefaultTransport with some timeouts changed.
   151  	// TODO(manugarg): Considering cloning DefaultTransport once
   152  	// https://github.com/golang/go/issues/26013 is fixed.
   153  	dialer := &net.Dialer{
   154  		Timeout:   p.opts.Timeout,
   155  		KeepAlive: 30 * time.Second, // TCP keep-alive
   156  	}
   157  
   158  	if p.opts.SourceIP != nil {
   159  		dialer.LocalAddr = &net.TCPAddr{
   160  			IP: p.opts.SourceIP,
   161  		}
   162  	}
   163  
   164  	transport := &http.Transport{
   165  		Proxy:               http.ProxyFromEnvironment,
   166  		DialContext:         dialer.DialContext,
   167  		MaxIdleConns:        256, // http.DefaultTransport.MaxIdleConns: 100.
   168  		TLSHandshakeTimeout: p.opts.Timeout,
   169  	}
   170  
   171  	if p.c.GetProxyUrl() != "" {
   172  		url, err := url.Parse(p.c.GetProxyUrl())
   173  		if err != nil {
   174  			return fmt.Errorf("error parsing proxy URL (%s): %v", p.c.GetProxyUrl(), err)
   175  		}
   176  		transport.Proxy = http.ProxyURL(url)
   177  	}
   178  
   179  	if p.c.GetDisableCertValidation() || p.c.GetTlsConfig() != nil {
   180  		if transport.TLSClientConfig == nil {
   181  			transport.TLSClientConfig = &tls.Config{}
   182  		}
   183  
   184  		if p.c.GetDisableCertValidation() {
   185  			p.l.Warning("disable_cert_validation is deprecated as of v0.10.6. Instead of this, please use \"tls_config {disable_cert_validation: true}\"")
   186  			transport.TLSClientConfig.InsecureSkipVerify = true
   187  		}
   188  
   189  		if p.c.GetTlsConfig() != nil {
   190  			if err := tlsconfig.UpdateTLSConfig(transport.TLSClientConfig, p.c.GetTlsConfig(), false); err != nil {
   191  				return err
   192  			}
   193  		}
   194  	}
   195  
   196  	// If HTTP keep-alives are not enabled (default), disable HTTP keep-alive in
   197  	// transport.
   198  	if !p.c.GetKeepAlive() {
   199  		transport.DisableKeepAlives = true
   200  	} else {
   201  		// If it's been more than 2 probe intervals since connection was used, close it.
   202  		transport.IdleConnTimeout = 2 * p.opts.Interval
   203  		if p.c.GetRequestsPerProbe() > 1 {
   204  			transport.MaxIdleConnsPerHost = int(p.c.GetRequestsPerProbe())
   205  		}
   206  	}
   207  
   208  	if p.c.GetOauthConfig() != nil {
   209  		oauthTS, err := oauth.TokenSourceFromConfig(p.c.GetOauthConfig(), p.l)
   210  		if err != nil {
   211  			return err
   212  		}
   213  		p.oauthTS = oauthTS
   214  		p.updateOauthToken() // This is also called periodically.
   215  	}
   216  
   217  	if p.c.GetDisableHttp2() {
   218  		// HTTP/2 is enabled by default if server supports it. Setting TLSNextProto
   219  		// to an empty dict is the only to disable it.
   220  		transport.TLSNextProto = make(map[string]func(string, *tls.Conn) http.RoundTripper)
   221  	}
   222  
   223  	// Clients are safe for concurrent use by multiple goroutines.
   224  	p.client = &http.Client{
   225  		Transport: transport,
   226  	}
   227  
   228  	p.statsExportFrequency = p.opts.StatsExportInterval.Nanoseconds() / p.opts.Interval.Nanoseconds()
   229  	if p.statsExportFrequency == 0 {
   230  		p.statsExportFrequency = 1
   231  	}
   232  
   233  	p.targets = p.opts.Targets.ListEndpoints()
   234  	p.cancelFuncs = make(map[string]context.CancelFunc, len(p.targets))
   235  
   236  	p.targetsUpdateInterval = DefaultTargetsUpdateInterval
   237  	// There is no point refreshing targets before probe interval.
   238  	if p.targetsUpdateInterval < p.opts.Interval {
   239  		p.targetsUpdateInterval = p.opts.Interval
   240  	}
   241  	p.l.Infof("Targets update interval: %v", p.targetsUpdateInterval)
   242  
   243  	return nil
   244  }
   245  
   // isClientTimeout reports whether err is a *url.Error whose underlying error
   // is a net.Error that identifies itself as a timeout.
   //
   // It is meant for errors returned from http.Client methods (Do, Get, Post).
   func isClientTimeout(err error) bool {
   	urlErr, isURLErr := err.(*url.Error)
   	if !isURLErr {
   		return false
   	}
   	netErr, isNetErr := urlErr.Err.(net.Error)
   	return isNetErr && netErr.Timeout()
   }
   257  
   258  // httpRequest executes an HTTP request and updates the provided result struct.
   259  func (p *Probe) doHTTPRequest(req *http.Request, targetName string, result *probeResult, resultMu *sync.Mutex) {
   260  
   261  	if len(p.requestBody) >= largeBodyThreshold {
   262  		req = req.Clone(req.Context())
   263  		req.Body = ioutil.NopCloser(bytes.NewReader(p.requestBody))
   264  	}
   265  
   266  	if p.c.GetKeepAlive() {
   267  		trace := &httptrace.ClientTrace{
   268  			ConnectDone: func(_, addr string, err error) {
   269  				result.connEvent++
   270  				if err != nil {
   271  					p.l.Warning("Error establishing a new connection to: ", addr, ". Err: ", err.Error())
   272  					return
   273  				}
   274  				p.l.Info("Established a new connection to: ", addr)
   275  			},
   276  		}
   277  		req = req.WithContext(httptrace.WithClientTrace(req.Context(), trace))
   278  	}
   279  
   280  	start := time.Now()
   281  	resp, err := p.client.Do(req)
   282  	latency := time.Since(start)
   283  
   284  	if resultMu != nil {
   285  		// Note that we take lock on result object outside of the actual request.
   286  		resultMu.Lock()
   287  		defer resultMu.Unlock()
   288  	}
   289  
   290  	result.total++
   291  
   292  	if err != nil {
   293  		if isClientTimeout(err) {
   294  			p.l.Warning("Target:", targetName, ", URL:", req.URL.String(), ", http.doHTTPRequest: timeout error: ", err.Error())
   295  			result.timeouts++
   296  			return
   297  		}
   298  		p.l.Warning("Target:", targetName, ", URL:", req.URL.String(), ", http.doHTTPRequest: ", err.Error())
   299  		return
   300  	}
   301  
   302  	respBody, err := ioutil.ReadAll(resp.Body)
   303  	if err != nil {
   304  		p.l.Warning("Target:", targetName, ", URL:", req.URL.String(), ", http.doHTTPRequest: ", err.Error())
   305  		return
   306  	}
   307  
   308  	p.l.Debug("Target:", targetName, ", URL:", req.URL.String(), ", response: ", string(respBody))
   309  
   310  	// Calling Body.Close() allows the TCP connection to be reused.
   311  	resp.Body.Close()
   312  	result.respCodes.IncKey(strconv.FormatInt(int64(resp.StatusCode), 10))
   313  
   314  	if p.opts.Validators != nil {
   315  		failedValidations := validators.RunValidators(p.opts.Validators, &validators.Input{Response: resp, ResponseBody: respBody}, result.validationFailure, p.l)
   316  
   317  		// If any validation failed, return now, leaving the success and latency
   318  		// counters unchanged.
   319  		if len(failedValidations) > 0 {
   320  			p.l.Debug("Target:", targetName, ", URL:", req.URL.String(), ", http.doHTTPRequest: failed validations: ", strings.Join(failedValidations, ","))
   321  			return
   322  		}
   323  	}
   324  
   325  	result.success++
   326  	result.latency.AddFloat64(latency.Seconds() / p.opts.LatencyUnit.Seconds())
   327  	if result.respBodies != nil && len(respBody) <= maxResponseSizeForMetrics {
   328  		result.respBodies.IncKey(string(respBody))
   329  	}
   330  }
   331  
   332  func (p *Probe) runProbe(ctx context.Context, target endpoint.Endpoint, req *http.Request, result *probeResult) {
   333  	reqCtx, cancelReqCtx := context.WithTimeout(ctx, p.opts.Timeout)
   334  	defer cancelReqCtx()
   335  
   336  	if p.c.GetRequestsPerProbe() == 1 {
   337  		p.doHTTPRequest(req.WithContext(reqCtx), target.Name, result, nil)
   338  		return
   339  	}
   340  
   341  	// For multiple requests per probe, we launch a separate goroutine for each
   342  	// HTTP request. We use a mutex to protect access to per-target result object
   343  	// in doHTTPRequest. Note that result object is not accessed concurrently
   344  	// anywhere else -- export of metrics happens when probe is not running.
   345  	var resultMu sync.Mutex
   346  
   347  	wg := sync.WaitGroup{}
   348  	for numReq := int32(0); numReq < p.c.GetRequestsPerProbe(); numReq++ {
   349  		wg.Add(1)
   350  		go func(req *http.Request, targetName string, result *probeResult) {
   351  			defer wg.Done()
   352  			p.doHTTPRequest(req.WithContext(reqCtx), targetName, result, &resultMu)
   353  		}(req, target.Name, result)
   354  	}
   355  	wg.Wait()
   356  }
   357  
   358  func (p *Probe) newResult() *probeResult {
   359  	result := &probeResult{
   360  		respCodes: metrics.NewMap("code", metrics.NewInt(0)),
   361  	}
   362  
   363  	if p.opts.Validators != nil {
   364  		result.validationFailure = validators.ValidationFailureMap(p.opts.Validators)
   365  	}
   366  
   367  	if p.opts.LatencyDist != nil {
   368  		result.latency = p.opts.LatencyDist.Clone()
   369  	} else {
   370  		result.latency = metrics.NewFloat(0)
   371  	}
   372  
   373  	if p.c.GetExportResponseAsMetrics() {
   374  		result.respBodies = metrics.NewMap("resp", metrics.NewInt(0))
   375  	}
   376  
   377  	return result
   378  }
   379  
   380  func (p *Probe) exportMetrics(ts time.Time, result *probeResult, targetName string, dataChan chan *metrics.EventMetrics) {
   381  	em := metrics.NewEventMetrics(ts).
   382  		AddMetric("total", metrics.NewInt(result.total)).
   383  		AddMetric("success", metrics.NewInt(result.success)).
   384  		AddMetric("latency", result.latency).
   385  		AddMetric("timeouts", metrics.NewInt(result.timeouts)).
   386  		AddMetric("resp-code", result.respCodes).
   387  		AddLabel("ptype", "http").
   388  		AddLabel("probe", p.name).
   389  		AddLabel("dst", targetName)
   390  
   391  	if result.respBodies != nil {
   392  		em.AddMetric("resp-body", result.respBodies)
   393  	}
   394  
   395  	if p.c.GetKeepAlive() {
   396  		em.AddMetric("connect_event", metrics.NewInt(result.connEvent))
   397  	}
   398  
   399  	em.LatencyUnit = p.opts.LatencyUnit
   400  
   401  	for _, al := range p.opts.AdditionalLabels {
   402  		em.AddLabel(al.KeyValueForTarget(targetName))
   403  	}
   404  
   405  	if result.validationFailure != nil {
   406  		em.AddMetric("validation_failure", result.validationFailure)
   407  	}
   408  
   409  	p.opts.LogMetrics(em)
   410  	dataChan <- em
   411  }
   412  
   413  func (p *Probe) startForTarget(ctx context.Context, target endpoint.Endpoint, dataChan chan *metrics.EventMetrics) {
   414  	p.l.Debug("Starting probing for the target ", target.Name)
   415  
   416  	// We use this counter to decide when to export stats.
   417  	var runCnt int64
   418  
   419  	for _, al := range p.opts.AdditionalLabels {
   420  		al.UpdateForTarget(target)
   421  	}
   422  	result := p.newResult()
   423  	req := p.httpRequestForTarget(target, nil)
   424  
   425  	ticker := time.NewTicker(p.opts.Interval)
   426  	defer ticker.Stop()
   427  
   428  	for ts := time.Now(); true; ts = <-ticker.C {
   429  		// Don't run another probe if context is canceled already.
   430  		if ctxDone(ctx) {
   431  			return
   432  		}
   433  
   434  		// If request is nil (most likely because target resolving failed or it
   435  		// was an invalid target), skip this probe cycle. Note that request
   436  		// creation gets retried at a regular interval (stats export interval).
   437  		if req != nil {
   438  			p.runProbe(ctx, target, req, result)
   439  		}
   440  
   441  		// Export stats if it's the time to do so.
   442  		runCnt++
   443  		if (runCnt % p.statsExportFrequency) == 0 {
   444  			p.exportMetrics(ts, result, target.Name, dataChan)
   445  
   446  			// If we are resolving first, this is also a good time to recreate HTTP
   447  			// request in case target's IP has changed.
   448  			if p.c.GetResolveFirst() {
   449  				req = p.httpRequestForTarget(target, nil)
   450  			}
   451  		}
   452  	}
   453  }
   454  
   455  func (p *Probe) gapBetweenTargets() time.Duration {
   456  	interTargetGap := time.Duration(p.c.GetIntervalBetweenTargetsMsec()) * time.Millisecond
   457  
   458  	// If not configured by user, determine based on probe interval and number of
   459  	// targets.
   460  	if interTargetGap == 0 && len(p.targets) != 0 {
   461  		// Use 1/10th of the probe interval to spread out target groroutines.
   462  		interTargetGap = p.opts.Interval / time.Duration(10*len(p.targets))
   463  	}
   464  
   465  	return interTargetGap
   466  }
   467  
   468  // updateTargetsAndStartProbes refreshes targets and starts probe loop for
   469  // new targets and cancels probe loops for targets that are no longer active.
   470  // Note that this function is not concurrency safe. It is never called
   471  // concurrently by Start().
   472  func (p *Probe) updateTargetsAndStartProbes(ctx context.Context, dataChan chan *metrics.EventMetrics) {
   473  	p.targets = p.opts.Targets.ListEndpoints()
   474  
   475  	p.l.Debugf("Probe(%s) got %d targets", p.name, len(p.targets))
   476  
   477  	// updatedTargets is used only for logging.
   478  	updatedTargets := make(map[string]string)
   479  	defer func() {
   480  		if len(updatedTargets) > 0 {
   481  			p.l.Infof("Probe(%s) targets updated: %v", p.name, updatedTargets)
   482  		}
   483  	}()
   484  
   485  	activeTargets := make(map[string]endpoint.Endpoint)
   486  	for _, target := range p.targets {
   487  		key := target.Key()
   488  		activeTargets[key] = target
   489  	}
   490  
   491  	// Stop probing for deleted targets by invoking cancelFunc.
   492  	for targetKey, cancelF := range p.cancelFuncs {
   493  		if _, ok := activeTargets[targetKey]; ok {
   494  			continue
   495  		}
   496  		cancelF()
   497  		updatedTargets[targetKey] = "DELETE"
   498  		delete(p.cancelFuncs, targetKey)
   499  	}
   500  
   501  	gapBetweenTargets := p.gapBetweenTargets()
   502  	var startWaitTime time.Duration
   503  
   504  	// Start probe loop for new targets.
   505  	for key, target := range activeTargets {
   506  		// This target is already initialized.
   507  		if _, ok := p.cancelFuncs[key]; ok {
   508  			continue
   509  		}
   510  		updatedTargets[key] = "ADD"
   511  
   512  		probeCtx, cancelF := context.WithCancel(ctx)
   513  		p.waitGroup.Add(1)
   514  
   515  		go func(target endpoint.Endpoint, waitTime time.Duration) {
   516  			defer p.waitGroup.Done()
   517  			// Wait for wait time + some jitter before starting this probe loop.
   518  			time.Sleep(waitTime + time.Duration(rand.Int63n(gapBetweenTargets.Microseconds()/10))*time.Microsecond)
   519  			p.startForTarget(probeCtx, target, dataChan)
   520  		}(target, startWaitTime)
   521  
   522  		startWaitTime += gapBetweenTargets
   523  
   524  		p.cancelFuncs[key] = cancelF
   525  	}
   526  }
   527  
   // ctxDone reports, without blocking, whether ctx has already been canceled
   // or has expired.
   func ctxDone(ctx context.Context) bool {
   	finished := false
   	select {
   	case <-ctx.Done():
   		finished = true
   	default:
   		// Context is still live; fall through without waiting.
   	}
   	return finished
   }
   536  
   537  // wait blocks until every per-target goroutine registered on p.waitGroup
        // (by updateTargetsAndStartProbes) has returned. Start defers it, so Start
        // does not return while per-target probe loops are still running.
   538  func (p *Probe) wait() {
   539  	p.waitGroup.Wait()
   540  }
   541  
   542  // Start starts and runs the probe indefinitely.
   543  func (p *Probe) Start(ctx context.Context, dataChan chan *metrics.EventMetrics) {
   544  	defer p.wait()
   545  
   546  	p.updateTargetsAndStartProbes(ctx, dataChan)
   547  
   548  	// Do more frequent listing of targets until we get a non-zero list of
   549  	// targets.
   550  	initialRefreshInterval := p.opts.Interval
   551  	// Don't wait too long if p.opts.Interval is large.
   552  	if initialRefreshInterval > time.Second {
   553  		initialRefreshInterval = time.Second
   554  	}
   555  
   556  	for {
   557  		if ctxDone(ctx) {
   558  			return
   559  		}
   560  		if len(p.targets) != 0 {
   561  			break
   562  		}
   563  		p.updateTargetsAndStartProbes(ctx, dataChan)
   564  		time.Sleep(initialRefreshInterval)
   565  	}
   566  
   567  	targetsUpdateTicker := time.NewTicker(p.targetsUpdateInterval)
   568  	defer targetsUpdateTicker.Stop()
   569  
   570  	for {
   571  		select {
   572  		case <-ctx.Done():
   573  			return
   574  		case <-targetsUpdateTicker.C:
   575  			p.updateOauthToken()
   576  			p.updateTargetsAndStartProbes(ctx, dataChan)
   577  		}
   578  	}
   579  }