github.com/google/cloudprober@v0.11.3/probes/udp/udp.go (about)

     1  // Copyright 2017-2019 The Cloudprober Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  /*
    16  Package udp implements a UDP prober. It sends UDP queries to a list of
    17  targets and reports statistics on queries sent, queries received, and latency
    18  experienced.
    19  
    20  Queries to each target are sent in parallel.
    21  */
    22  package udp
    23  
    24  import (
    25  	"context"
    26  	"errors"
    27  	"fmt"
    28  	"math"
    29  	"net"
    30  	"sync"
    31  	"time"
    32  
    33  	"github.com/google/cloudprober/common/message"
    34  	"github.com/google/cloudprober/logger"
    35  	"github.com/google/cloudprober/metrics"
    36  	"github.com/google/cloudprober/probes/options"
    37  	"github.com/google/cloudprober/probes/probeutils"
    38  	configpb "github.com/google/cloudprober/probes/udp/proto"
    39  	udpsrv "github.com/google/cloudprober/servers/udp"
    40  	"github.com/google/cloudprober/sysvars"
    41  	"github.com/google/cloudprober/targets/endpoint"
    42  )
    43  
const (
	// maxMsgSize is the receive buffer size for incoming UDP messages;
	// 65536 covers the largest possible UDP payload.
	maxMsgSize = 65536
	// maxTargets is the maximum number of targets supported by this probe type.
	// If there are more targets, they are pruned from the list to bring targets
	// list under maxTargets.
	// TODO(manugarg): Make it configurable with documentation on its implication
	// on resource consumption.
	maxTargets = 500
	// payloadPattern is repeated to fill the probe payload when a payload
	// size is configured (see Init / probeutils.PatternPayload).
	payloadPattern = "cloudprober"
)
    54  
// flow represents a UDP flow.
// Since src address and dst port are constant for a probe, src-port and target
// are sufficient to uniquely identify a flow.
type flow struct {
	srcPort string // local (source) port the probe packet is sent from
	target  string // name of the target the packet is sent to
}
    62  
// Probe holds aggregate information about all probe runs, per-target.
type Probe struct {
	name string           // probe name
	opts *options.Options // common probe options (interval, timeout, targets, ...)
	src  string           // local hostname, used as the message source
	c    *configpb.ProbeConf
	l    *logger.Logger

	// List of UDP connections to use.
	connList    []*net.UDPConn
	srcPortList []string // local port (as a string) of each connection in connList
	numConn     int32    // number of successfully opened connections
	runID       uint64   // incremented per run; selects the starting connection when rotating ports
	ipVer       int      // IP version passed to target resolution (0, 4 or 6)

	targets []endpoint.Endpoint   // List of targets for a probe iteration.
	res     map[flow]*probeResult // Results by flow.
	fsm     *message.FlowStateMap // Map flow parameters to flow state.
	payload []byte                // pre-built payload, set only if payload_size is configured

	// Intermediate buffers of sent and received packets
	sentPackets, rcvdPackets chan packetID
	sPackets, rPackets       []packetID      // packets deferred for late processing
	highestSeq               map[flow]uint64 // highest sent sequence processed per flow
	flushIntv                time.Duration   // how often buffered packets are processed
}
    89  
// probeResult stores the probe results for a target. The way we work with
// stats makes sure that probeResult and its fields are not accessed concurrently
// That's the reason we use metrics.Int types instead of metrics.AtomicInt.
type probeResult struct {
	total, success, delayed int64         // packets sent, received in time, and received late
	latency                 metrics.Value // latency distribution or cumulative float, per config
}
    97  
    98  // Metrics converts probeResult into metrics.EventMetrics object
    99  func (prr probeResult) eventMetrics(probeName string, opts *options.Options, f flow, c *configpb.ProbeConf) *metrics.EventMetrics {
   100  	var suffix string
   101  	if c.GetExportMetricsByPort() {
   102  		suffix = "-per-port"
   103  	}
   104  	m := metrics.NewEventMetrics(time.Now()).
   105  		AddMetric("total"+suffix, metrics.NewInt(prr.total)).
   106  		AddMetric("success"+suffix, metrics.NewInt(prr.success)).
   107  		AddMetric("latency"+suffix, prr.latency.Clone()).
   108  		AddMetric("delayed"+suffix, metrics.NewInt(prr.delayed)).
   109  		AddLabel("ptype", "udp").
   110  		AddLabel("probe", probeName).
   111  		AddLabel("dst", f.target)
   112  
   113  	for _, al := range opts.AdditionalLabels {
   114  		m.AddLabel(al.KeyValueForTarget(f.target))
   115  	}
   116  
   117  	if c.GetExportMetricsByPort() {
   118  		m.AddLabel("src_port", f.srcPort).
   119  			AddLabel("dst_port", fmt.Sprintf("%d", c.GetPort()))
   120  	}
   121  
   122  	return m
   123  }
   124  
   125  func (p *Probe) newProbeResult() *probeResult {
   126  	var latVal metrics.Value
   127  	if p.opts.LatencyDist != nil {
   128  		latVal = p.opts.LatencyDist.Clone()
   129  	} else {
   130  		latVal = metrics.NewFloat(0)
   131  	}
   132  	return &probeResult{
   133  		latency: latVal,
   134  	}
   135  }
   136  
   137  // Init initializes the probe with the given params.
   138  func (p *Probe) Init(name string, opts *options.Options) error {
   139  	c, ok := opts.ProbeConf.(*configpb.ProbeConf)
   140  	if !ok {
   141  		return errors.New("not a UDP config")
   142  	}
   143  	p.name = name
   144  	p.opts = opts
   145  	if p.l = opts.Logger; p.l == nil {
   146  		p.l = &logger.Logger{}
   147  	}
   148  	p.src = sysvars.Vars()["hostname"]
   149  	p.c = c
   150  	p.fsm = message.NewFlowStateMap()
   151  	p.res = make(map[flow]*probeResult)
   152  
   153  	if p.c.GetPayloadSize() != 0 {
   154  		p.payload = make([]byte, p.c.GetPayloadSize())
   155  		probeutils.PatternPayload(p.payload, []byte(payloadPattern))
   156  	}
   157  
   158  	// Initialize intermediate buffers of sent and received packets
   159  	p.flushIntv = 2 * p.opts.Interval
   160  	if p.opts.Timeout > p.opts.Interval {
   161  		p.flushIntv = 2 * p.opts.Timeout
   162  	}
   163  
   164  	if p.opts.StatsExportInterval < p.flushIntv {
   165  		return fmt.Errorf("UDP probe: stats_export_interval_msec (%s) is too low. It should be at least twice of the interval (%s) and timeout (%s), whichever is bigger", p.opts.StatsExportInterval, p.opts.Interval, p.opts.Timeout)
   166  	}
   167  
   168  	// #send/recv-channel-buffer = #targets * #sources * #probing-intervals-between-flushes
   169  	minChanLen := maxTargets * int(p.c.GetNumTxPorts()) * int(math.Ceil(float64(p.flushIntv/p.opts.Interval)))
   170  	p.l.Infof("Creating sent, rcvd channels of length: %d", 2*minChanLen)
   171  	p.sentPackets = make(chan packetID, 2*minChanLen)
   172  	p.rcvdPackets = make(chan packetID, 2*minChanLen)
   173  	p.highestSeq = make(map[flow]uint64)
   174  
   175  	// For one-way connections, we use a pool of sockets.
   176  	wantConn := p.c.GetNumTxPorts()
   177  	triesRemaining := wantConn * 2
   178  	p.numConn = 0
   179  	p.connList = make([]*net.UDPConn, wantConn)
   180  	p.srcPortList = make([]string, wantConn)
   181  
   182  	udpAddr := &net.UDPAddr{Port: 0}
   183  	if p.opts.SourceIP != nil {
   184  		udpAddr.IP = p.opts.SourceIP
   185  	}
   186  	p.ipVer = p.opts.IPVersion
   187  
   188  	for p.numConn < wantConn && triesRemaining > 0 {
   189  		triesRemaining--
   190  		udpConn, err := udpsrv.Listen(udpAddr, p.l)
   191  		if err != nil {
   192  			p.l.Warningf("Opening UDP socket failed: %v", err)
   193  			continue
   194  		}
   195  		p.l.Infof("UDP socket id %d, addr %v", p.numConn, udpConn.LocalAddr())
   196  		p.connList[p.numConn] = udpConn
   197  		_, p.srcPortList[p.numConn], err = net.SplitHostPort(udpConn.LocalAddr().String())
   198  		if err != nil {
   199  			return err
   200  		}
   201  		p.numConn++
   202  	}
   203  	if p.numConn < wantConn {
   204  		for _, c := range p.connList {
   205  			c.Close()
   206  		}
   207  		return fmt.Errorf("UDP socket creation failed: got %d connections, want %d", p.numConn, wantConn)
   208  	}
   209  	return nil
   210  }
   211  
   212  // initProbeRunResults initializes missing probe results objects.
   213  func (p *Probe) initProbeRunResults() error {
   214  	for _, target := range p.targets {
   215  		if !p.c.GetExportMetricsByPort() {
   216  			f := flow{"", target.Name}
   217  			if p.res[f] == nil {
   218  				p.res[f] = p.newProbeResult()
   219  			}
   220  			continue
   221  		}
   222  
   223  		for _, srcPort := range p.srcPortList {
   224  			f := flow{srcPort, target.Name}
   225  			if p.res[f] == nil {
   226  				p.res[f] = p.newProbeResult()
   227  			}
   228  		}
   229  	}
   230  	return nil
   231  }
   232  
// packetID records attributes of the packets sent and received, by runProbe
// and recvLoop respectively. These packetIDs are communicated over channels
// and are eventually processed by the processPackets() loop (below).
type packetID struct {
	f    flow      // flow this packet belongs to
	seq  uint64    // sequence number within the flow
	txTS time.Time // transmit timestamp (taken from the message for received packets)
	rxTS time.Time // receive timestamp; zero for sent-packet records
}
   242  
   243  func (p *Probe) resultsKey(f flow) flow {
   244  	if p.c.GetExportMetricsByPort() {
   245  		return f
   246  	}
   247  	return flow{"", f.target}
   248  }
   249  
   250  func (p *Probe) processRcvdPacket(rpkt packetID) {
   251  	p.l.Debugf("rpkt seq: %d, target: %s", rpkt.seq, rpkt.f)
   252  	res, ok := p.res[p.resultsKey(rpkt.f)]
   253  	if !ok {
   254  		return
   255  	}
   256  	latency := rpkt.rxTS.Sub(rpkt.txTS)
   257  	if latency < 0 {
   258  		p.l.Errorf("Got negative time delta %v for flow %v seq %d", latency, rpkt.f, rpkt.seq)
   259  		return
   260  	}
   261  	if latency > p.opts.Timeout {
   262  		p.l.Debugf("Packet delayed. Seq: %d, flow: %v, delay: %v", rpkt.seq, rpkt.f, latency)
   263  		res.delayed++
   264  		return
   265  	}
   266  	res.success++
   267  	res.latency.AddFloat64(latency.Seconds() / p.opts.LatencyUnit.Seconds())
   268  }
   269  
   270  func (p *Probe) processSentPacket(spkt packetID) {
   271  	p.l.Debugf("spkt seq: %d, flow: %v", spkt.seq, spkt.f)
   272  	res, ok := p.res[p.resultsKey(spkt.f)]
   273  	if !ok {
   274  		return
   275  	}
   276  	res.total++
   277  }
   278  
// processPackets processes packets on the sentPackets and rcvdPackets
// channels. Packets are inserted into a lookup map as soon as they are
// received. At every "statsExportInterval" interval, we go through the maps
// and update the probe results.
func (p *Probe) processPackets() {
	// Process packets that we queued earlier (mostly from the last timeout
	// interval)
	for _, rpkt := range p.rPackets {
		p.processRcvdPacket(rpkt)
	}
	for _, spkt := range p.sPackets {
		p.processSentPacket(spkt)
	}
	// Reset the deferred buffers, keeping their capacity for reuse.
	p.rPackets = p.rPackets[0:0]
	p.sPackets = p.sPackets[0:0]

	// Snapshot the queue lengths once: runProbe/recvLoop may keep adding
	// packets while we drain, and we only handle what was queued at entry.
	lenRcvdPackets := len(p.rcvdPackets)
	p.l.Debugf("rcvd queue length: %d", lenRcvdPackets)
	lenSentPackets := len(p.sentPackets)
	p.l.Debugf("sent queue length: %d", lenSentPackets)

	now := time.Now()
	for i := 0; i < lenSentPackets; i++ {
		pkt := <-p.sentPackets
		// Too recent: its response may still be in flight, so defer to the
		// next flush.
		if now.Sub(pkt.txTS) < p.opts.Timeout {
			p.l.Debugf("Inserting spacket (seq %d) for late processing", pkt.seq)
			p.sPackets = append(p.sPackets, pkt)
			continue
		}
		p.processSentPacket(pkt)
		// Track the highest sent sequence accounted for on this flow; used
		// below to keep received packets from being processed before their
		// corresponding sent record.
		if pkt.seq > p.highestSeq[pkt.f] {
			p.highestSeq[pkt.f] = pkt.seq
		}
	}

	for i := 0; i < lenRcvdPackets; i++ {
		pkt := <-p.rcvdPackets
		// Too recent: defer, same as sent packets above.
		if now.Sub(pkt.txTS) < p.opts.Timeout {
			p.l.Debugf("Inserting rpacket (seq %d) for late processing", pkt.seq)
			p.rPackets = append(p.rPackets, pkt)
			continue
		}
		// The matching sent record hasn't been processed yet; defer so that
		// "total" gets incremented before "success"/"delayed".
		if pkt.seq > p.highestSeq[pkt.f] {
			p.l.Debugf("Inserting rpacket for late processing as seq (%d) > highestSeq (%d)", pkt.seq, p.highestSeq[pkt.f])
			p.rPackets = append(p.rPackets, pkt)
			continue
		}
		p.processRcvdPacket(pkt)
	}
}
   329  
   330  // Return true if the underlying error indicates a udp.Client timeout.
   331  // In our case, we're using the ReadTimeout- time until response is read.
   332  func isClientTimeout(err error) bool {
   333  	e, ok := err.(*net.OpError)
   334  	return ok && e != nil && e.Timeout()
   335  }
   336  
// recvLoop receives all packets over a UDP socket and updates
// flowStates accordingly. It runs until ctx is cancelled; the per-read
// deadline guarantees the cancellation check is reached at least once per
// timeout period even when no packets arrive.
func (p *Probe) recvLoop(ctx context.Context, conn *net.UDPConn) {
	b := make([]byte, maxMsgSize)
	for {
		select {
		case <-ctx.Done():
			return
		default:
		}
		// Bound the blocking read so the loop can re-check ctx above.
		conn.SetReadDeadline(time.Now().Add(p.opts.Timeout))
		msgLen, raddr, err := conn.ReadFromUDP(b)
		if err != nil {
			// Deadline expiries are expected and silent; log anything else.
			if !isClientTimeout(err) {
				p.l.Errorf("Receive error on %s (from %v): %v", conn.LocalAddr(), raddr, err)
			}
			continue
		}

		rxTS := time.Now()
		msg, err := message.NewMessage(b[:msgLen])
		if err != nil {
			p.l.Errorf("Incoming message error from %s: %v", raddr, err)
			continue
		}
		// Non-blocking send: drop (and log) instead of stalling the receive
		// loop when the channel is full.
		select {
		case p.rcvdPackets <- packetID{flow{msg.SrcPort(), msg.Dst()}, msg.Seq(), msg.SrcTS(), rxTS}:
		default:
			p.l.Errorf("rcvdPackets channel full")
		}
	}
}
   369  
// runSingleProbe resolves the target, builds the next message for this flow,
// and sends it over the given connection. On success, a record of the sent
// packet is queued on the sentPackets channel for later accounting by
// processPackets.
func (p *Probe) runSingleProbe(f flow, conn *net.UDPConn, maxLen, dstPort int) error {
	ip, err := p.opts.Targets.Resolve(f.target, p.ipVer)
	if err != nil {
		return fmt.Errorf("unable to resolve %s: %v", f.target, err)
	}
	raddr := &net.UDPAddr{
		IP:   ip,
		Port: dstPort,
	}

	flowState := p.fsm.FlowState(p.src, f.srcPort, f.target)
	now := time.Now()
	msg, seq, err := flowState.CreateMessage(now, p.payload, maxLen)
	if err != nil {
		return fmt.Errorf("error creating new message to probe target(%s): %v", f.target, err)
	}

	if _, err := conn.WriteToUDP(msg, raddr); err != nil {
		// Roll back the flow state so this sequence number isn't counted as sent.
		flowState.WithdrawMessage(seq)
		return fmt.Errorf("unable to send to %s(%v): %v", f.target, raddr, err)
	}
	// Send packet over sentPackets channel
	// May need to make a longer buffer for the channel.
	select {
	case p.sentPackets <- packetID{f, seq, now, time.Time{}}:
		return nil
	default:
		return fmt.Errorf("sentPackets channel full")
	}
}
   400  
// runProbe performs a single probe run. The main thread launches one goroutine
// per target to probe. It manages a sync.WaitGroup and Wait's until all probes
// have finished, then exits the runProbe method.
//
// Each per-target goroutine sends a UDP message and on success waits for
// "timeout" duration before exiting. "recvLoop" function is expected to
// capture the responses before "timeout" and the main loop will flush the
// results.
func (p *Probe) runProbe() {
	if len(p.targets) == 0 {
		return
	}
	maxLen := int(p.c.GetMaxLength())
	dstPort := int(p.c.GetPort())

	// Either send from every source port in each run, or rotate through the
	// ports one per run (rotation position derived from runID).
	var packetsPerTarget, initialConn int
	if p.c.GetUseAllTxPortsPerProbe() {
		packetsPerTarget = len(p.connList)
		initialConn = 0
	} else {
		packetsPerTarget = 1
		initialConn = int(p.runID % uint64(len(p.connList)))
	}

	var wg sync.WaitGroup
	wg.Add(len(p.targets) * packetsPerTarget)

	// Bound how long sends may block so a run can't bleed into the next interval.
	for _, conn := range p.connList {
		conn.SetWriteDeadline(time.Now().Add(p.opts.Interval / 2))
	}
	for _, target := range p.targets {
		for i := 0; i < packetsPerTarget; i++ {
			connID := (initialConn + i) % len(p.connList)
			conn := p.connList[connID]
			// conn and the flow are passed as arguments to avoid loop-variable
			// capture by the goroutine closure.
			go func(conn *net.UDPConn, f flow) {
				defer wg.Done()
				if err := p.runSingleProbe(f, conn, maxLen, dstPort); err != nil {
					p.l.Errorf("Probing %+v failed: %v", f, err)
				}
			}(conn, flow{p.srcPortList[connID], target.Name})
		}
	}
	wg.Wait()
	p.runID++
}
   446  
   447  func (p *Probe) updateTargets() {
   448  	p.targets = p.opts.Targets.ListEndpoints()
   449  	if len(p.targets) > maxTargets {
   450  		p.l.Warningf("Number of targets (%d) > maxTargets (%d). Truncating the targets list.", len(p.targets), maxTargets)
   451  		p.targets = p.targets[:maxTargets]
   452  	}
   453  	for _, target := range p.targets {
   454  		for _, al := range p.opts.AdditionalLabels {
   455  			al.UpdateForTarget(target)
   456  		}
   457  	}
   458  	p.initProbeRunResults()
   459  }
   460  
// Start starts and runs the probe indefinitely, until ctx is cancelled.
// Probe runs, buffered-packet processing, and stats export each happen on
// their own ticker; exported results are written to dataChan.
func (p *Probe) Start(ctx context.Context, dataChan chan *metrics.EventMetrics) {
	p.updateTargets()

	// One receive loop per socket in the pool; each exits on ctx cancellation.
	for _, conn := range p.connList {
		go p.recvLoop(ctx, conn)
	}

	probeTicker := time.NewTicker(p.opts.Interval)
	statsExportTicker := time.NewTicker(p.opts.StatsExportInterval)
	flushTicker := time.NewTicker(p.flushIntv)

	for {
		select {
		case <-ctx.Done():
			flushTicker.Stop()
			probeTicker.Stop()
			statsExportTicker.Stop()
			return
		case <-probeTicker.C:
			p.runProbe()
		case <-flushTicker.C:
			p.processPackets()
		case <-statsExportTicker.C:
			for f, result := range p.res {
				em := result.eventMetrics(p.name, p.opts, f, p.c)
				em.LatencyUnit = p.opts.LatencyUnit
				p.opts.LogMetrics(em)
				dataChan <- em
			}
			// Use this opportunity to refresh targets as well.
			p.updateTargets()
		}
	}
}