github.com/google/cloudprober@v0.11.3/probes/udplistener/udplistener.go

github.com/google/cloudprober@v0.11.3/probes/udplistener/udplistener.go (about)

     1  // Copyright 2018 The Cloudprober Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
/*
Package udplistener implements a UDP listener. Given a target list, it listens
for packets from each of the targets and reports the number of packets
successfully received in order, lost or delayed. It also uses the probe
interval as an indicator for the number of packets we expect from each target.
Use the "udp" probe as the counterpart with the same targets list and probe
interval as the sender.

Notes:

Each probe has 3 goroutines:
- A recvLoop that keeps handling incoming packets and updates metrics.
- An outputLoop that ticks twice every statsExportInterval and outputs metrics.
- An echoLoop that receives incoming packets from recvLoop over a channel and
  echoes back the packets.

- Targets list determines which packet sources are valid sources. It is
  updated in the outputLoop routine.
- We use the probe interval to determine the estimated number of packets that
  should be received. This number is the lower bound of the total number of
  packets "sent" by each source.
*/
    37  package udplistener
    38  
    39  import (
    40  	"context"
    41  	"fmt"
    42  	"io"
    43  	"net"
    44  	"sync"
    45  	"sync/atomic"
    46  	"time"
    47  
    48  	"github.com/google/cloudprober/common/message"
    49  	"github.com/google/cloudprober/logger"
    50  	"github.com/google/cloudprober/metrics"
    51  	"github.com/google/cloudprober/probes/common/statskeeper"
    52  	"github.com/google/cloudprober/probes/options"
    53  	"github.com/google/cloudprober/targets/endpoint"
    54  
    55  	configpb "github.com/google/cloudprober/probes/udplistener/proto"
    56  	udpsrv "github.com/google/cloudprober/servers/udp"
    57  )
    58  
    59  const (
    60  	maxMsgSize           = 65536
    61  	maxTargets           = 1024
    62  	logThrottleThreshold = 10
    63  )
    64  
    65  // Probe holds aggregate information about all probe runs.
    66  type Probe struct {
    67  	name     string
    68  	opts     *options.Options
    69  	c        *configpb.ProbeConf
    70  	l        *logger.Logger
    71  	conn     *net.UDPConn
    72  	echoMode bool
    73  
    74  	// map target name to flow state.
    75  	targets []endpoint.Endpoint
    76  	fsm     *message.FlowStateMap
    77  
    78  	// Process and output results synchronization.
    79  	mu   sync.Mutex
    80  	errs *probeErr
    81  	res  map[string]*probeRunResult
    82  }
    83  
    84  // proberErr stores error stats and counters for throttled logging.
    85  type probeErr struct {
    86  	throttleCt     int32
    87  	invalidMsgErrs map[string]string // addr -> error string
    88  	missingTargets map[string]int    // sender -> count
    89  }
    90  
    91  // echoMsg is a struct that is passed between rx thread and echo thread.
    92  type echoMsg struct {
    93  	addr   *net.UDPAddr
    94  	bufLen int
    95  	buf    []byte
    96  }
    97  
    98  func (p *Probe) logErrs() {
    99  	// atomic inc throttleCt so that we don't grab p.mu.Lock() when not logging.
   100  	newVal := atomic.AddInt32(&p.errs.throttleCt, 1)
   101  	if newVal != int32(logThrottleThreshold) {
   102  		return
   103  	}
   104  	defer atomic.StoreInt32(&p.errs.throttleCt, 0)
   105  
   106  	p.mu.Lock()
   107  	defer p.mu.Unlock()
   108  
   109  	pe := p.errs
   110  	if len(pe.invalidMsgErrs) > 0 {
   111  		p.l.Warningf("Invalid messages received: %v", pe.invalidMsgErrs)
   112  		pe.invalidMsgErrs = make(map[string]string)
   113  	}
   114  	if len(pe.missingTargets) > 0 {
   115  		p.l.Warningf("Unknown targets sending messages: %v", pe.missingTargets)
   116  		pe.missingTargets = make(map[string]int)
   117  	}
   118  }
   119  
   120  // probeRunResult captures the results of a single probe run. The way we work with
   121  // stats makes sure that probeRunResult and its fields are not accessed concurrently
   122  // (see documentation with statsKeeper below). That's the reason we use metrics.Int
   123  // types instead of metrics.AtomicInt.
   124  type probeRunResult struct {
   125  	target  string
   126  	total   metrics.Int
   127  	success metrics.Int
   128  	ipdUS   metrics.Int // inter-packet distance in microseconds
   129  	lost    metrics.Int // lost += (currSeq - prevSeq - 1)
   130  	delayed metrics.Int // delayed += (currSeq < prevSeq)
   131  }
   132  
   133  // Target returns the p.target.
   134  func (prr probeRunResult) Target() string {
   135  	return prr.target
   136  }
   137  
   138  // Metrics converts probeRunResult into metrics.EventMetrics object
   139  func (prr probeRunResult) Metrics() *metrics.EventMetrics {
   140  	return metrics.NewEventMetrics(time.Now()).
   141  		AddMetric("total", &prr.total).
   142  		AddMetric("success", &prr.success).
   143  		AddMetric("ipd_us", &prr.ipdUS).
   144  		AddMetric("lost", &prr.lost).
   145  		AddMetric("delayed", &prr.delayed)
   146  }
   147  
   148  func (p *Probe) updateTargets() {
   149  	p.targets = p.opts.Targets.ListEndpoints()
   150  
   151  	for _, target := range p.targets {
   152  		for _, al := range p.opts.AdditionalLabels {
   153  			al.UpdateForTarget(target)
   154  		}
   155  	}
   156  }
   157  
   158  // Init initializes the probe with the given params.
   159  func (p *Probe) Init(name string, opts *options.Options) error {
   160  	c, ok := opts.ProbeConf.(*configpb.ProbeConf)
   161  	if !ok {
   162  		return fmt.Errorf("not a UDP Listener config: %v", opts.ProbeConf)
   163  	}
   164  	p.name = name
   165  	p.opts = opts
   166  	if p.l = opts.Logger; p.l == nil {
   167  		p.l = &logger.Logger{}
   168  	}
   169  	p.c = c
   170  	p.echoMode = p.c.GetType() == configpb.ProbeConf_ECHO
   171  
   172  	p.fsm = message.NewFlowStateMap()
   173  
   174  	udpAddr := &net.UDPAddr{Port: int(p.c.GetPort())}
   175  	if p.opts.SourceIP != nil {
   176  		udpAddr.IP = p.opts.SourceIP
   177  	}
   178  
   179  	conn, err := udpsrv.Listen(udpAddr, p.l)
   180  	if err != nil {
   181  		p.l.Warningf("Opening a listen UDP socket on port %d failed: %v", p.c.GetPort(), err)
   182  		return err
   183  	}
   184  	p.conn = conn
   185  
   186  	p.res = make(map[string]*probeRunResult)
   187  	p.errs = &probeErr{
   188  		invalidMsgErrs: make(map[string]string),
   189  		missingTargets: make(map[string]int),
   190  	}
   191  	return nil
   192  }
   193  
   194  // cleanup closes the udp socket
   195  func (p *Probe) cleanup() {
   196  	if p.conn != nil {
   197  		p.conn.Close()
   198  	}
   199  }
   200  
   201  // initProbeRunResults empties the current probe results objects, updates the
   202  // list of targets and builds a new result object for each target.
   203  func (p *Probe) initProbeRunResults() {
   204  	p.updateTargets()
   205  	if p.echoMode && len(p.targets) > maxTargets {
   206  		p.l.Warningf("too many targets (got %d > max %d), responses might be slow.", len(p.targets), maxTargets)
   207  	}
   208  
   209  	p.res = make(map[string]*probeRunResult)
   210  	for _, target := range p.targets {
   211  		p.res[target.Name] = &probeRunResult{
   212  			target: target.Name,
   213  		}
   214  	}
   215  }
   216  
   217  // processMessage processes an incoming message and updates metrics.
   218  func (p *Probe) processMessage(buf []byte, rxTS time.Time, srcAddr *net.UDPAddr) {
   219  	p.mu.Lock()
   220  	defer p.mu.Unlock()
   221  
   222  	msg, err := message.NewMessage(buf)
   223  	if err != nil {
   224  		p.errs.invalidMsgErrs[srcAddr.String()] = err.Error()
   225  		return
   226  	}
   227  	src := msg.Src()
   228  	probeRes, ok := p.res[src]
   229  	if !ok {
   230  		p.errs.missingTargets[src]++
   231  		return
   232  	}
   233  
   234  	msgRes := msg.ProcessOneWay(p.fsm, rxTS)
   235  	probeRes.total.Inc()
   236  	if msgRes.Success {
   237  		probeRes.success.Inc()
   238  		probeRes.ipdUS.IncBy(metrics.NewInt(msgRes.InterPktDelay.Nanoseconds() / 1000))
   239  	} else if msgRes.LostCount > 0 {
   240  		probeRes.lost.IncBy(metrics.NewInt(int64(msgRes.LostCount)))
   241  	} else if msgRes.Delayed {
   242  		probeRes.delayed.Inc()
   243  	}
   244  }
   245  
   246  // outputResults writes results to the output channel.
   247  func (p *Probe) outputResults(expectedCt int64, stats chan<- statskeeper.ProbeResult) {
   248  	p.mu.Lock()
   249  	defer p.mu.Unlock()
   250  	for _, r := range p.res {
   251  		delta := expectedCt - r.total.Int64()
   252  		if delta > 0 {
   253  			r.total.AddInt64(delta)
   254  		}
   255  		stats <- *r
   256  	}
   257  	p.initProbeRunResults()
   258  }
   259  
   260  func (p *Probe) outputLoop(ctx context.Context, stats chan<- statskeeper.ProbeResult) {
   261  	// Use a ticker to control stats output and error logging.
   262  	// ticker should be a multiple of interval between pkts (i.e., p.opts.Interval).
   263  	pktsPerExportInterval := int64(p.opts.StatsExportInterval / p.opts.Interval)
   264  	tick := p.opts.Interval
   265  	if pktsPerExportInterval > 1 {
   266  		tick = (p.opts.StatsExportInterval / 2).Round(p.opts.Interval)
   267  	}
   268  	ticker := time.NewTicker(tick)
   269  
   270  	// #packets-in-an-interval = #sending-ports * (timeDelta + interval - 1ns) / interval
   271  	// We add (interval/2 - 1ns) because int64 takes the floor, whereas we want
   272  	// to round the expression.
   273  	lastExport := time.Now()
   274  	roundAdd := p.opts.Interval/2 - time.Nanosecond
   275  	for {
   276  		select {
   277  		case <-ctx.Done():
   278  			ticker.Stop()
   279  			return
   280  		case <-ticker.C:
   281  			// Number of probes received from a single sender should equal the number of
   282  			// sending intervals in the period times the number of sending ports.
   283  			numIntervals := int64((time.Since(lastExport) + roundAdd) / p.opts.Interval)
   284  			expectedCt := numIntervals * int64(p.c.GetPacketsPerProbe())
   285  			p.outputResults(expectedCt, stats)
   286  			p.logErrs()
   287  			lastExport = time.Now()
   288  		}
   289  	}
   290  }
   291  
   292  // echoLoop transmits packets received in the msgChan.
   293  func (p *Probe) echoLoop(ctx context.Context, msgChan chan *echoMsg) {
   294  	for {
   295  		select {
   296  		case <-ctx.Done():
   297  			return
   298  		case msg := <-msgChan:
   299  			n, err := p.conn.WriteToUDP(msg.buf, msg.addr)
   300  			if err == io.EOF { // socket closed. exit the loop.
   301  				return
   302  			}
   303  			if err != nil {
   304  				p.l.Errorf("Error writing echo response to %v: %v", msg.addr, err)
   305  			} else if n < msg.bufLen {
   306  				p.l.Warningf("Reply truncated: sent %d out of %d bytes to %v.", n, msg.bufLen, msg.addr)
   307  			}
   308  		}
   309  	}
   310  }
   311  
   312  // recvLoop loops over the listener socket for incoming messages and update stats.
   313  // TODO: Move processMessage to the outputLoop and remove probe mutex.
   314  func (p *Probe) recvLoop(ctx context.Context, echoChan chan<- *echoMsg) {
   315  	conn := p.conn
   316  	// Accommodate the largest UDP message.
   317  	b := make([]byte, maxMsgSize)
   318  
   319  	p.initProbeRunResults()
   320  
   321  	for {
   322  		select {
   323  		case <-ctx.Done():
   324  			return
   325  		default:
   326  		}
   327  		conn.SetReadDeadline(time.Now().Add(time.Second))
   328  		n, srcAddr, err := conn.ReadFromUDP(b)
   329  		if err != nil {
   330  			p.l.Debugf("Error receiving on UDP socket: %v", err)
   331  			continue
   332  		}
   333  		rxTS := time.Now()
   334  		if p.echoMode {
   335  			e := &echoMsg{
   336  				buf:  make([]byte, n),
   337  				addr: srcAddr,
   338  			}
   339  			copy(e.buf, b[:n])
   340  			echoChan <- e
   341  		}
   342  		p.processMessage(b[:n], rxTS, srcAddr)
   343  	}
   344  }
   345  
   346  // probeLoop starts the necessary threads and waits for them to exit.
   347  func (p *Probe) probeLoop(ctx context.Context, resultsChan chan<- statskeeper.ProbeResult) {
   348  	var wg sync.WaitGroup
   349  
   350  	// Output Loop for metrics
   351  	wg.Add(1)
   352  	go func() {
   353  		p.outputLoop(ctx, resultsChan)
   354  		wg.Done()
   355  	}()
   356  
   357  	// Echo loop to respond to incoming messages in echo mode.
   358  	var echoChan chan *echoMsg
   359  	if p.echoMode {
   360  		echoChan = make(chan *echoMsg, maxTargets)
   361  		wg.Add(1)
   362  		go func() {
   363  			p.echoLoop(ctx, echoChan)
   364  			wg.Done()
   365  		}()
   366  	}
   367  
   368  	p.recvLoop(ctx, echoChan)
   369  	wg.Wait()
   370  }
   371  
   372  // Start starts and runs the probe indefinitely.
   373  func (p *Probe) Start(ctx context.Context, dataChan chan *metrics.EventMetrics) {
   374  	p.updateTargets()
   375  
   376  	// Make sure we don't create zero length results channel.
   377  	minResultsChLen := 10
   378  	resultsChLen := len(p.targets)
   379  	if resultsChLen < minResultsChLen {
   380  		resultsChLen = minResultsChLen
   381  	}
   382  	resultsChan := make(chan statskeeper.ProbeResult, resultsChLen)
   383  	targetsFunc := func() []endpoint.Endpoint {
   384  		return p.targets
   385  	}
   386  
   387  	go statskeeper.StatsKeeper(ctx, "udp", p.name, p.opts, targetsFunc, resultsChan, dataChan)
   388  
   389  	// probeLoop runs forever and returns only when the probe has to exit.
   390  	// So, it is safe to cleanup (in the "Start" function) once probeLoop returns.
   391  	p.probeLoop(ctx, resultsChan)
   392  	p.cleanup()
   393  	return
   394  }