github.com/google/cloudprober@v0.11.3/probes/grpc/grpc.go (about)

     1  // Copyright 2020 The Cloudprober Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  /*
    16  Package grpc implements a gRPC probe.
    17  
    18  This probes a cloudprober gRPC server and reports success rate, latency, and
    19  validation failures.
    20  */
    21  package grpc
    22  
    23  import (
    24  	"context"
    25  	"errors"
    26  	"fmt"
    27  	"net"
    28  	"strconv"
    29  	"sync"
    30  	"time"
    31  
    32  	"github.com/golang/protobuf/proto"
    33  	"github.com/google/cloudprober/common/oauth"
    34  	"github.com/google/cloudprober/logger"
    35  	"github.com/google/cloudprober/metrics"
    36  	configpb "github.com/google/cloudprober/probes/grpc/proto"
    37  	"github.com/google/cloudprober/probes/options"
    38  	"github.com/google/cloudprober/probes/probeutils"
    39  	"github.com/google/cloudprober/sysvars"
    40  	"github.com/google/cloudprober/targets/endpoint"
    41  
    42  	grpcprobepb "github.com/google/cloudprober/servers/grpc/proto"
    43  	servicepb "github.com/google/cloudprober/servers/grpc/proto"
    44  	"google.golang.org/grpc"
    45  	"google.golang.org/grpc/credentials/alts"
    46  	grpcoauth "google.golang.org/grpc/credentials/oauth"
    47  	"google.golang.org/grpc/peer"
    48  	"google.golang.org/grpc/resolver"
    49  
    50  	// Import grpclb module so it can be used by name for DirectPath connections.
    51  	_ "google.golang.org/grpc/balancer/grpclb"
    52  )
    53  
// loadBalancingPolicy is the default gRPC service config: route through
// grpclb, with pick_first as the child policy for the returned backends.
const loadBalancingPolicy = `{"loadBalancingConfig":[{"grpclb":{"childPolicy":[{"pick_first":{}}]}}]}`

// TargetsUpdateInterval controls frequency of target updates.
var (
	TargetsUpdateInterval = 1 * time.Minute
)
    60  
// Probe holds aggregate information about all probe runs, per-target.
type Probe struct {
	name     string            // probe name from config
	src      string            // local hostname (from sysvars), used in the payload pattern
	opts     *options.Options  // common probe options (interval, timeout, latency unit, ...)
	c        *configpb.ProbeConf
	l        *logger.Logger
	dialOpts []grpc.DialOption // built once in setupDialOpts, shared by all connections

	// Targets and cancellation function for each target.
	// cancelFuncs lets updateTargetsAndStartProbes stop the probe loops of
	// targets that disappear from the target list.
	targets     []endpoint.Endpoint
	cancelFuncs map[string]context.CancelFunc
	targetsMu   sync.Mutex

	// Results by target, keyed by "host:port". Written only from the Start
	// goroutine; individual entries are shared with probe loops and guarded
	// by the entry's own mutex.
	results map[string]*probeRunResult
}
    78  
// probeRunResult captures the metrics for a single target. Multiple threads
// can update metrics at the same time and the main thread periodically
// outputs the values in this struct.
type probeRunResult struct {
	sync.Mutex
	target        string        // "host:port" this result belongs to
	total         metrics.Int   // total attempts: requests + failed connects
	success       metrics.Int   // successful requests
	latency       metrics.Value // either a distribution clone or a float, see newResult
	connectErrors metrics.Int   // failed connection attempts
}
    90  
    91  func (p *Probe) setupDialOpts() error {
    92  	oauthCfg := p.c.GetOauthConfig()
    93  	if oauthCfg != nil {
    94  		oauthTS, err := oauth.TokenSourceFromConfig(oauthCfg, p.l)
    95  		if err != nil {
    96  			return err
    97  		}
    98  		p.dialOpts = append(p.dialOpts, grpc.WithPerRPCCredentials(grpcoauth.TokenSource{oauthTS}))
    99  	}
   100  	altsCfg := p.c.GetAltsConfig()
   101  	if altsCfg != nil {
   102  		altsOpts := &alts.ClientOptions{
   103  			TargetServiceAccounts:    altsCfg.GetTargetServiceAccount(),
   104  			HandshakerServiceAddress: altsCfg.GetHandshakerServiceAddress(),
   105  		}
   106  		p.dialOpts = append(p.dialOpts, grpc.WithTransportCredentials(alts.NewClientCreds(altsOpts)))
   107  	}
   108  
   109  	if oauthCfg == nil && altsCfg == nil {
   110  		p.dialOpts = append(p.dialOpts, grpc.WithInsecure())
   111  	}
   112  	p.dialOpts = append(p.dialOpts, grpc.WithDefaultServiceConfig(loadBalancingPolicy))
   113  	p.dialOpts = append(p.dialOpts, grpc.WithBlock())
   114  	return nil
   115  }
   116  
   117  // Init initializes the probe with the given params.
   118  func (p *Probe) Init(name string, opts *options.Options) error {
   119  	c, ok := opts.ProbeConf.(*configpb.ProbeConf)
   120  	if !ok {
   121  		return errors.New("not a gRPC probe config")
   122  	}
   123  	p.c = c
   124  	p.name = name
   125  	p.opts = opts
   126  	if p.l = opts.Logger; p.l == nil {
   127  		p.l = &logger.Logger{}
   128  	}
   129  	p.targets = p.opts.Targets.ListEndpoints()
   130  	p.cancelFuncs = make(map[string]context.CancelFunc)
   131  	p.src = sysvars.Vars()["hostname"]
   132  	if err := p.setupDialOpts(); err != nil {
   133  		return err
   134  	}
   135  	resolver.SetDefaultScheme("dns")
   136  	return nil
   137  }
   138  
   139  func (p *Probe) updateTargetsAndStartProbes(ctx context.Context) {
   140  	newTargets := p.opts.Targets.ListEndpoints()
   141  	numNewTargets := len(newTargets)
   142  
   143  	p.targetsMu.Lock()
   144  	defer p.targetsMu.Unlock()
   145  	if numNewTargets == 0 || numNewTargets < (len(p.targets)/2) {
   146  		p.l.Errorf("Too few new targets, retaining old targets. New targets: %v, old count: %d", newTargets, len(p.targets))
   147  		return
   148  	}
   149  
   150  	updatedTargets := make(map[string]string)
   151  	defer func() {
   152  		if len(updatedTargets) > 0 {
   153  			p.l.Infof("Probe(%s) targets updated: %v", p.name, updatedTargets)
   154  		}
   155  	}()
   156  
   157  	activeTargets := make(map[string]bool)
   158  	// Create results structure and start probe loop for new targets.
   159  	for _, tgtEp := range newTargets {
   160  		tgt := net.JoinHostPort(tgtEp.Name, strconv.Itoa(tgtEp.Port))
   161  		activeTargets[tgt] = true
   162  		if _, ok := p.results[tgt]; ok {
   163  			continue
   164  		}
   165  		updatedTargets[tgt] = "ADD"
   166  		p.results[tgt] = p.newResult(tgt)
   167  		probeCtx, probeCancelFunc := context.WithCancel(ctx)
   168  		for i := 0; i < int(p.c.GetNumConns()); i++ {
   169  			go p.oneTargetLoop(probeCtx, tgt, i, p.results[tgt])
   170  		}
   171  		p.cancelFuncs[tgt] = probeCancelFunc
   172  	}
   173  
   174  	// Stop probing for deleted targets by invoking cancelFunc.
   175  	for tgt := range p.results {
   176  		if activeTargets[tgt] {
   177  			continue
   178  		}
   179  		p.cancelFuncs[tgt]()
   180  		updatedTargets[tgt] = "DELETE"
   181  		delete(p.results, tgt)
   182  		delete(p.cancelFuncs, tgt)
   183  	}
   184  	p.targets = newTargets
   185  }
   186  
   187  // connectWithRetry attempts to connect to a target. On failure, it retries in
   188  // an infinite loop until successful, incrementing connectErrors for every
   189  // connection error. On success, it returns a client immediately.
   190  // Interval between connects is controlled by connect_timeout_msec, defaulting
   191  // to probe timeout.
   192  func (p *Probe) connectWithRetry(ctx context.Context, tgt, msgPattern string, result *probeRunResult) *grpc.ClientConn {
   193  	connectTimeout := p.opts.Timeout
   194  	if p.c.GetConnectTimeoutMsec() > 0 {
   195  		connectTimeout = time.Duration(p.c.GetConnectTimeoutMsec()) * time.Millisecond
   196  	}
   197  	var conn *grpc.ClientConn
   198  	var err error
   199  	for {
   200  		select {
   201  		case <-ctx.Done():
   202  			p.l.Warningf("ProbeId(%s): context cancelled in connect loop.", msgPattern)
   203  			return nil
   204  		default:
   205  		}
   206  		connCtx, cancelFunc := context.WithTimeout(ctx, connectTimeout)
   207  		conn, err = grpc.DialContext(connCtx, tgt, p.dialOpts...)
   208  		cancelFunc()
   209  		if err != nil {
   210  			p.l.Warningf("ProbeId(%v) connect error: %v", msgPattern, err)
   211  		} else {
   212  			p.l.Infof("ProbeId(%v) connection established.", msgPattern)
   213  			break
   214  		}
   215  		result.Lock()
   216  		result.total.Inc()
   217  		result.connectErrors.Inc()
   218  		result.Unlock()
   219  	}
   220  	return conn
   221  }
   222  
   223  // oneTargetLoop connects to and then continuously probes a single target.
   224  func (p *Probe) oneTargetLoop(ctx context.Context, tgt string, index int, result *probeRunResult) {
   225  	msgPattern := fmt.Sprintf("%s,%s,%03d", p.src, tgt, index)
   226  
   227  	conn := p.connectWithRetry(ctx, tgt, msgPattern, result)
   228  	if conn == nil {
   229  		return
   230  	}
   231  	defer conn.Close()
   232  
   233  	client := servicepb.NewProberClient(conn)
   234  	timeout := p.opts.Timeout
   235  	method := p.c.GetMethod()
   236  
   237  	msgSize := p.c.GetBlobSize()
   238  	msg := make([]byte, msgSize)
   239  	probeutils.PatternPayload(msg, []byte(msgPattern))
   240  	ticker := time.NewTicker(p.opts.Interval)
   241  	for {
   242  		select {
   243  		case <-ctx.Done():
   244  			p.l.Warningf("ProbeId(%s): context cancelled in request loop.", msgPattern)
   245  			ticker.Stop()
   246  			return
   247  		case <-ticker.C:
   248  		}
   249  
   250  		reqCtx, cancelFunc := context.WithTimeout(ctx, timeout)
   251  		var success int64
   252  		var delta time.Duration
   253  		start := time.Now()
   254  		var err error
   255  		var peer peer.Peer
   256  		opts := []grpc.CallOption{
   257  			grpc.WaitForReady(true),
   258  			grpc.Peer(&peer),
   259  		}
   260  		switch method {
   261  		case configpb.ProbeConf_ECHO:
   262  			req := &grpcprobepb.EchoMessage{
   263  				Blob: []byte(msg),
   264  			}
   265  			_, err = client.Echo(reqCtx, req, opts...)
   266  		case configpb.ProbeConf_READ:
   267  			req := &grpcprobepb.BlobReadRequest{
   268  				Size: proto.Int32(msgSize),
   269  			}
   270  			_, err = client.BlobRead(reqCtx, req, opts...)
   271  		case configpb.ProbeConf_WRITE:
   272  			req := &grpcprobepb.BlobWriteRequest{
   273  				Blob: []byte(msg),
   274  			}
   275  			_, err = client.BlobWrite(reqCtx, req, opts...)
   276  		default:
   277  			p.l.Criticalf("Method %v not implemented", method)
   278  		}
   279  		cancelFunc()
   280  		if err != nil {
   281  			peerAddr := "unknown"
   282  			if peer.Addr != nil {
   283  				peerAddr = peer.Addr.String()
   284  			}
   285  			p.l.Warningf("ProbeId(%s) request failed: %v. ConnState: %v. Peer: %v", msgPattern, err, conn.GetState(), peerAddr)
   286  		} else {
   287  			success = 1
   288  			delta = time.Since(start)
   289  		}
   290  		// TODO(ls692): add validators for probe result.
   291  		result.Lock()
   292  		result.total.Inc()
   293  		result.success.AddInt64(success)
   294  		result.latency.AddFloat64(delta.Seconds() / p.opts.LatencyUnit.Seconds())
   295  		result.Unlock()
   296  	}
   297  }
   298  
   299  func (p *Probe) newResult(tgt string) *probeRunResult {
   300  	var latencyValue metrics.Value
   301  	if p.opts.LatencyDist != nil {
   302  		latencyValue = p.opts.LatencyDist.Clone()
   303  	} else {
   304  		latencyValue = metrics.NewFloat(0)
   305  	}
   306  	return &probeRunResult{
   307  		target:  tgt,
   308  		latency: latencyValue,
   309  	}
   310  }
   311  
   312  // Start starts and runs the probe indefinitely.
   313  func (p *Probe) Start(ctx context.Context, dataChan chan *metrics.EventMetrics) {
   314  	p.results = make(map[string]*probeRunResult)
   315  	p.updateTargetsAndStartProbes(ctx)
   316  
   317  	ticker := time.NewTicker(p.opts.StatsExportInterval)
   318  	defer ticker.Stop()
   319  
   320  	targetsUpdateTicker := time.NewTicker(TargetsUpdateInterval)
   321  	defer targetsUpdateTicker.Stop()
   322  
   323  	for ts := range ticker.C {
   324  		// Stop further processing and exit if context is canceled.
   325  		// Same context is used by probe loops.
   326  		select {
   327  		case <-ctx.Done():
   328  			return
   329  		default:
   330  		}
   331  
   332  		// Output results.
   333  		for targetName, result := range p.results {
   334  			result.Lock()
   335  			em := metrics.NewEventMetrics(ts).
   336  				AddMetric("total", result.total.Clone()).
   337  				AddMetric("success", result.success.Clone()).
   338  				AddMetric("latency", result.latency.Clone()).
   339  				AddMetric("connecterrors", result.connectErrors.Clone()).
   340  				AddLabel("ptype", "grpc").
   341  				AddLabel("probe", p.name).
   342  				AddLabel("dst", targetName)
   343  			result.Unlock()
   344  			em.LatencyUnit = p.opts.LatencyUnit
   345  			for _, al := range p.opts.AdditionalLabels {
   346  				em.AddLabel(al.KeyValueForTarget(targetName))
   347  			}
   348  			p.opts.LogMetrics(em)
   349  			dataChan <- em
   350  		}
   351  
   352  		// Finally, update targets and start new probe loops if necessary.
   353  		// Executing this as the last step in the loop also ensures that new
   354  		// targets have at least one cycle of probes before next output cycle.
   355  		select {
   356  		case <-targetsUpdateTicker.C:
   357  			p.updateTargetsAndStartProbes(ctx)
   358  		default:
   359  		}
   360  	}
   361  }