github.com/google/cloudprober@v0.11.3/probes/udplistener/udplistener.go (about) 1 // Copyright 2018 The Cloudprober Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 /* 16 Package udplistener implements a UDP listener. Given a target list, it listens 17 for packets from each of the targets and reports number of packets successfully 18 received in order, lost or delayed. It also uses the probe interval as an 19 indicator for the number of packets we expect from each target. Use the "udp" 20 probe as the counterpart with the same targets list and probe interval as the 21 sender. 22 23 Notes: 24 25 Each probe has 3 goroutines: 26 - A recvLoop that keeps handling incoming packets and updates metrics. 27 - An outputLoop that ticks twice every statsExportInterval and outputs metrics. 28 - An echoLoop that receives incoming packets from recvLoop over a channel and 29 echos back the packets. 30 31 - Targets list determines which packet sources are valid sources. It is 32 updated in the outputLoop routine. 33 - We use the probe interval to determine the estimated number of packets that 34 should be received. This number is the lower bound of the total number of 35 packets "sent" by each source. 36 */ 37 package udplistener 38 39 import ( 40 "context" 41 "fmt" 42 "io" 43 "net" 44 "sync" 45 "sync/atomic" 46 "time" 47 48 "github.com/google/cloudprober/common/message" 49 "github.com/google/cloudprober/logger" 50 "github.com/google/cloudprober/metrics" 51 "github.com/google/cloudprober/probes/common/statskeeper" 52 "github.com/google/cloudprober/probes/options" 53 "github.com/google/cloudprober/targets/endpoint" 54 55 configpb "github.com/google/cloudprober/probes/udplistener/proto" 56 udpsrv "github.com/google/cloudprober/servers/udp" 57 ) 58 59 const ( 60 maxMsgSize = 65536 61 maxTargets = 1024 62 logThrottleThreshold = 10 63 ) 64 65 // Probe holds aggregate information about all probe runs. 66 type Probe struct { 67 name string 68 opts *options.Options 69 c *configpb.ProbeConf 70 l *logger.Logger 71 conn *net.UDPConn 72 echoMode bool 73 74 // map target name to flow state. 75 targets []endpoint.Endpoint 76 fsm *message.FlowStateMap 77 78 // Process and output results synchronization. 79 mu sync.Mutex 80 errs *probeErr 81 res map[string]*probeRunResult 82 } 83 84 // proberErr stores error stats and counters for throttled logging. 85 type probeErr struct { 86 throttleCt int32 87 invalidMsgErrs map[string]string // addr -> error string 88 missingTargets map[string]int // sender -> count 89 } 90 91 // echoMsg is a struct that is passed between rx thread and echo thread. 92 type echoMsg struct { 93 addr *net.UDPAddr 94 bufLen int 95 buf []byte 96 } 97 98 func (p *Probe) logErrs() { 99 // atomic inc throttleCt so that we don't grab p.mu.Lock() when not logging. 100 newVal := atomic.AddInt32(&p.errs.throttleCt, 1) 101 if newVal != int32(logThrottleThreshold) { 102 return 103 } 104 defer atomic.StoreInt32(&p.errs.throttleCt, 0) 105 106 p.mu.Lock() 107 defer p.mu.Unlock() 108 109 pe := p.errs 110 if len(pe.invalidMsgErrs) > 0 { 111 p.l.Warningf("Invalid messages received: %v", pe.invalidMsgErrs) 112 pe.invalidMsgErrs = make(map[string]string) 113 } 114 if len(pe.missingTargets) > 0 { 115 p.l.Warningf("Unknown targets sending messages: %v", pe.missingTargets) 116 pe.missingTargets = make(map[string]int) 117 } 118 } 119 120 // probeRunResult captures the results of a single probe run. The way we work with 121 // stats makes sure that probeRunResult and its fields are not accessed concurrently 122 // (see documentation with statsKeeper below). That's the reason we use metrics.Int 123 // types instead of metrics.AtomicInt. 124 type probeRunResult struct { 125 target string 126 total metrics.Int 127 success metrics.Int 128 ipdUS metrics.Int // inter-packet distance in microseconds 129 lost metrics.Int // lost += (currSeq - prevSeq - 1) 130 delayed metrics.Int // delayed += (currSeq < prevSeq) 131 } 132 133 // Target returns the p.target. 134 func (prr probeRunResult) Target() string { 135 return prr.target 136 } 137 138 // Metrics converts probeRunResult into metrics.EventMetrics object 139 func (prr probeRunResult) Metrics() *metrics.EventMetrics { 140 return metrics.NewEventMetrics(time.Now()). 141 AddMetric("total", &prr.total). 142 AddMetric("success", &prr.success). 143 AddMetric("ipd_us", &prr.ipdUS). 144 AddMetric("lost", &prr.lost). 145 AddMetric("delayed", &prr.delayed) 146 } 147 148 func (p *Probe) updateTargets() { 149 p.targets = p.opts.Targets.ListEndpoints() 150 151 for _, target := range p.targets { 152 for _, al := range p.opts.AdditionalLabels { 153 al.UpdateForTarget(target) 154 } 155 } 156 } 157 158 // Init initializes the probe with the given params. 159 func (p *Probe) Init(name string, opts *options.Options) error { 160 c, ok := opts.ProbeConf.(*configpb.ProbeConf) 161 if !ok { 162 return fmt.Errorf("not a UDP Listener config: %v", opts.ProbeConf) 163 } 164 p.name = name 165 p.opts = opts 166 if p.l = opts.Logger; p.l == nil { 167 p.l = &logger.Logger{} 168 } 169 p.c = c 170 p.echoMode = p.c.GetType() == configpb.ProbeConf_ECHO 171 172 p.fsm = message.NewFlowStateMap() 173 174 udpAddr := &net.UDPAddr{Port: int(p.c.GetPort())} 175 if p.opts.SourceIP != nil { 176 udpAddr.IP = p.opts.SourceIP 177 } 178 179 conn, err := udpsrv.Listen(udpAddr, p.l) 180 if err != nil { 181 p.l.Warningf("Opening a listen UDP socket on port %d failed: %v", p.c.GetPort(), err) 182 return err 183 } 184 p.conn = conn 185 186 p.res = make(map[string]*probeRunResult) 187 p.errs = &probeErr{ 188 invalidMsgErrs: make(map[string]string), 189 missingTargets: make(map[string]int), 190 } 191 return nil 192 } 193 194 // cleanup closes the udp socket 195 func (p *Probe) cleanup() { 196 if p.conn != nil { 197 p.conn.Close() 198 } 199 } 200 201 // initProbeRunResults empties the current probe results objects, updates the 202 // list of targets and builds a new result object for each target. 203 func (p *Probe) initProbeRunResults() { 204 p.updateTargets() 205 if p.echoMode && len(p.targets) > maxTargets { 206 p.l.Warningf("too many targets (got %d > max %d), responses might be slow.", len(p.targets), maxTargets) 207 } 208 209 p.res = make(map[string]*probeRunResult) 210 for _, target := range p.targets { 211 p.res[target.Name] = &probeRunResult{ 212 target: target.Name, 213 } 214 } 215 } 216 217 // processMessage processes an incoming message and updates metrics. 218 func (p *Probe) processMessage(buf []byte, rxTS time.Time, srcAddr *net.UDPAddr) { 219 p.mu.Lock() 220 defer p.mu.Unlock() 221 222 msg, err := message.NewMessage(buf) 223 if err != nil { 224 p.errs.invalidMsgErrs[srcAddr.String()] = err.Error() 225 return 226 } 227 src := msg.Src() 228 probeRes, ok := p.res[src] 229 if !ok { 230 p.errs.missingTargets[src]++ 231 return 232 } 233 234 msgRes := msg.ProcessOneWay(p.fsm, rxTS) 235 probeRes.total.Inc() 236 if msgRes.Success { 237 probeRes.success.Inc() 238 probeRes.ipdUS.IncBy(metrics.NewInt(msgRes.InterPktDelay.Nanoseconds() / 1000)) 239 } else if msgRes.LostCount > 0 { 240 probeRes.lost.IncBy(metrics.NewInt(int64(msgRes.LostCount))) 241 } else if msgRes.Delayed { 242 probeRes.delayed.Inc() 243 } 244 } 245 246 // outputResults writes results to the output channel. 247 func (p *Probe) outputResults(expectedCt int64, stats chan<- statskeeper.ProbeResult) { 248 p.mu.Lock() 249 defer p.mu.Unlock() 250 for _, r := range p.res { 251 delta := expectedCt - r.total.Int64() 252 if delta > 0 { 253 r.total.AddInt64(delta) 254 } 255 stats <- *r 256 } 257 p.initProbeRunResults() 258 } 259 260 func (p *Probe) outputLoop(ctx context.Context, stats chan<- statskeeper.ProbeResult) { 261 // Use a ticker to control stats output and error logging. 262 // ticker should be a multiple of interval between pkts (i.e., p.opts.Interval). 263 pktsPerExportInterval := int64(p.opts.StatsExportInterval / p.opts.Interval) 264 tick := p.opts.Interval 265 if pktsPerExportInterval > 1 { 266 tick = (p.opts.StatsExportInterval / 2).Round(p.opts.Interval) 267 } 268 ticker := time.NewTicker(tick) 269 270 // #packets-in-an-interval = #sending-ports * (timeDelta + interval - 1ns) / interval 271 // We add (interval/2 - 1ns) because int64 takes the floor, whereas we want 272 // to round the expression. 273 lastExport := time.Now() 274 roundAdd := p.opts.Interval/2 - time.Nanosecond 275 for { 276 select { 277 case <-ctx.Done(): 278 ticker.Stop() 279 return 280 case <-ticker.C: 281 // Number of probes received from a single sender should equal the number of 282 // sending intervals in the period times the number of sending ports. 283 numIntervals := int64((time.Since(lastExport) + roundAdd) / p.opts.Interval) 284 expectedCt := numIntervals * int64(p.c.GetPacketsPerProbe()) 285 p.outputResults(expectedCt, stats) 286 p.logErrs() 287 lastExport = time.Now() 288 } 289 } 290 } 291 292 // echoLoop transmits packets received in the msgChan. 293 func (p *Probe) echoLoop(ctx context.Context, msgChan chan *echoMsg) { 294 for { 295 select { 296 case <-ctx.Done(): 297 return 298 case msg := <-msgChan: 299 n, err := p.conn.WriteToUDP(msg.buf, msg.addr) 300 if err == io.EOF { // socket closed. exit the loop. 301 return 302 } 303 if err != nil { 304 p.l.Errorf("Error writing echo response to %v: %v", msg.addr, err) 305 } else if n < msg.bufLen { 306 p.l.Warningf("Reply truncated: sent %d out of %d bytes to %v.", n, msg.bufLen, msg.addr) 307 } 308 } 309 } 310 } 311 312 // recvLoop loops over the listener socket for incoming messages and update stats. 313 // TODO: Move processMessage to the outputLoop and remove probe mutex. 314 func (p *Probe) recvLoop(ctx context.Context, echoChan chan<- *echoMsg) { 315 conn := p.conn 316 // Accommodate the largest UDP message. 317 b := make([]byte, maxMsgSize) 318 319 p.initProbeRunResults() 320 321 for { 322 select { 323 case <-ctx.Done(): 324 return 325 default: 326 } 327 conn.SetReadDeadline(time.Now().Add(time.Second)) 328 n, srcAddr, err := conn.ReadFromUDP(b) 329 if err != nil { 330 p.l.Debugf("Error receiving on UDP socket: %v", err) 331 continue 332 } 333 rxTS := time.Now() 334 if p.echoMode { 335 e := &echoMsg{ 336 buf: make([]byte, n), 337 addr: srcAddr, 338 } 339 copy(e.buf, b[:n]) 340 echoChan <- e 341 } 342 p.processMessage(b[:n], rxTS, srcAddr) 343 } 344 } 345 346 // probeLoop starts the necessary threads and waits for them to exit. 347 func (p *Probe) probeLoop(ctx context.Context, resultsChan chan<- statskeeper.ProbeResult) { 348 var wg sync.WaitGroup 349 350 // Output Loop for metrics 351 wg.Add(1) 352 go func() { 353 p.outputLoop(ctx, resultsChan) 354 wg.Done() 355 }() 356 357 // Echo loop to respond to incoming messages in echo mode. 358 var echoChan chan *echoMsg 359 if p.echoMode { 360 echoChan = make(chan *echoMsg, maxTargets) 361 wg.Add(1) 362 go func() { 363 p.echoLoop(ctx, echoChan) 364 wg.Done() 365 }() 366 } 367 368 p.recvLoop(ctx, echoChan) 369 wg.Wait() 370 } 371 372 // Start starts and runs the probe indefinitely. 373 func (p *Probe) Start(ctx context.Context, dataChan chan *metrics.EventMetrics) { 374 p.updateTargets() 375 376 // Make sure we don't create zero length results channel. 377 minResultsChLen := 10 378 resultsChLen := len(p.targets) 379 if resultsChLen < minResultsChLen { 380 resultsChLen = minResultsChLen 381 } 382 resultsChan := make(chan statskeeper.ProbeResult, resultsChLen) 383 targetsFunc := func() []endpoint.Endpoint { 384 return p.targets 385 } 386 387 go statskeeper.StatsKeeper(ctx, "udp", p.name, p.opts, targetsFunc, resultsChan, dataChan) 388 389 // probeLoop runs forever and returns only when the probe has to exit. 390 // So, it is safe to cleanup (in the "Start" function) once probeLoop returns. 391 p.probeLoop(ctx, resultsChan) 392 p.cleanup() 393 return 394 }