// Copyright 2017-2019 The Cloudprober Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/*
Package udp implements a UDP prober. It sends UDP queries to a list of
targets and reports statistics on queries sent, queries received, and latency
experienced.

Queries to each target are sent in parallel.
*/
package udp

import (
	"context"
	"errors"
	"fmt"
	"math"
	"net"
	"sync"
	"time"

	"github.com/google/cloudprober/common/message"
	"github.com/google/cloudprober/logger"
	"github.com/google/cloudprober/metrics"
	"github.com/google/cloudprober/probes/options"
	"github.com/google/cloudprober/probes/probeutils"
	configpb "github.com/google/cloudprober/probes/udp/proto"
	udpsrv "github.com/google/cloudprober/servers/udp"
	"github.com/google/cloudprober/sysvars"
	"github.com/google/cloudprober/targets/endpoint"
)

const (
	// maxMsgSize is the size of the receive buffer used by recvLoop; it
	// covers the largest possible UDP payload.
	maxMsgSize = 65536
	// maxTargets is the maximum number of targets supported by this probe type.
	// If there are more targets, they are pruned from the list to bring targets
	// list under maxTargets.
	// TODO(manugarg): Make it configurable with documentation on its implication
	// on resource consumption.
	maxTargets = 500
	// payloadPattern is the repeating pattern used to fill the probe payload
	// (see probeutils.PatternPayload in Init).
	payloadPattern = "cloudprober"
)

// flow represents a UDP flow.
56 // Since src address and dst port are constant for a probe, src-port and target 57 // are sufficient to uniquely identify a flow. 58 type flow struct { 59 srcPort string 60 target string 61 } 62 63 // Probe holds aggregate information about all probe runs, per-target. 64 type Probe struct { 65 name string 66 opts *options.Options 67 src string 68 c *configpb.ProbeConf 69 l *logger.Logger 70 71 // List of UDP connections to use. 72 connList []*net.UDPConn 73 srcPortList []string 74 numConn int32 75 runID uint64 76 ipVer int 77 78 targets []endpoint.Endpoint // List of targets for a probe iteration. 79 res map[flow]*probeResult // Results by flow. 80 fsm *message.FlowStateMap // Map flow parameters to flow state. 81 payload []byte 82 83 // Intermediate buffers of sent and received packets 84 sentPackets, rcvdPackets chan packetID 85 sPackets, rPackets []packetID 86 highestSeq map[flow]uint64 87 flushIntv time.Duration 88 } 89 90 // probeResult stores the probe results for a target. The way we work with 91 // stats makes sure that probeResult and its fields are not accessed concurrently 92 // That's the reason we use metrics.Int types instead of metrics.AtomicInt. 93 type probeResult struct { 94 total, success, delayed int64 95 latency metrics.Value 96 } 97 98 // Metrics converts probeResult into metrics.EventMetrics object 99 func (prr probeResult) eventMetrics(probeName string, opts *options.Options, f flow, c *configpb.ProbeConf) *metrics.EventMetrics { 100 var suffix string 101 if c.GetExportMetricsByPort() { 102 suffix = "-per-port" 103 } 104 m := metrics.NewEventMetrics(time.Now()). 105 AddMetric("total"+suffix, metrics.NewInt(prr.total)). 106 AddMetric("success"+suffix, metrics.NewInt(prr.success)). 107 AddMetric("latency"+suffix, prr.latency.Clone()). 108 AddMetric("delayed"+suffix, metrics.NewInt(prr.delayed)). 109 AddLabel("ptype", "udp"). 110 AddLabel("probe", probeName). 
111 AddLabel("dst", f.target) 112 113 for _, al := range opts.AdditionalLabels { 114 m.AddLabel(al.KeyValueForTarget(f.target)) 115 } 116 117 if c.GetExportMetricsByPort() { 118 m.AddLabel("src_port", f.srcPort). 119 AddLabel("dst_port", fmt.Sprintf("%d", c.GetPort())) 120 } 121 122 return m 123 } 124 125 func (p *Probe) newProbeResult() *probeResult { 126 var latVal metrics.Value 127 if p.opts.LatencyDist != nil { 128 latVal = p.opts.LatencyDist.Clone() 129 } else { 130 latVal = metrics.NewFloat(0) 131 } 132 return &probeResult{ 133 latency: latVal, 134 } 135 } 136 137 // Init initializes the probe with the given params. 138 func (p *Probe) Init(name string, opts *options.Options) error { 139 c, ok := opts.ProbeConf.(*configpb.ProbeConf) 140 if !ok { 141 return errors.New("not a UDP config") 142 } 143 p.name = name 144 p.opts = opts 145 if p.l = opts.Logger; p.l == nil { 146 p.l = &logger.Logger{} 147 } 148 p.src = sysvars.Vars()["hostname"] 149 p.c = c 150 p.fsm = message.NewFlowStateMap() 151 p.res = make(map[flow]*probeResult) 152 153 if p.c.GetPayloadSize() != 0 { 154 p.payload = make([]byte, p.c.GetPayloadSize()) 155 probeutils.PatternPayload(p.payload, []byte(payloadPattern)) 156 } 157 158 // Initialize intermediate buffers of sent and received packets 159 p.flushIntv = 2 * p.opts.Interval 160 if p.opts.Timeout > p.opts.Interval { 161 p.flushIntv = 2 * p.opts.Timeout 162 } 163 164 if p.opts.StatsExportInterval < p.flushIntv { 165 return fmt.Errorf("UDP probe: stats_export_interval_msec (%s) is too low. 
It should be at least twice of the interval (%s) and timeout (%s), whichever is bigger", p.opts.StatsExportInterval, p.opts.Interval, p.opts.Timeout) 166 } 167 168 // #send/recv-channel-buffer = #targets * #sources * #probing-intervals-between-flushes 169 minChanLen := maxTargets * int(p.c.GetNumTxPorts()) * int(math.Ceil(float64(p.flushIntv/p.opts.Interval))) 170 p.l.Infof("Creating sent, rcvd channels of length: %d", 2*minChanLen) 171 p.sentPackets = make(chan packetID, 2*minChanLen) 172 p.rcvdPackets = make(chan packetID, 2*minChanLen) 173 p.highestSeq = make(map[flow]uint64) 174 175 // For one-way connections, we use a pool of sockets. 176 wantConn := p.c.GetNumTxPorts() 177 triesRemaining := wantConn * 2 178 p.numConn = 0 179 p.connList = make([]*net.UDPConn, wantConn) 180 p.srcPortList = make([]string, wantConn) 181 182 udpAddr := &net.UDPAddr{Port: 0} 183 if p.opts.SourceIP != nil { 184 udpAddr.IP = p.opts.SourceIP 185 } 186 p.ipVer = p.opts.IPVersion 187 188 for p.numConn < wantConn && triesRemaining > 0 { 189 triesRemaining-- 190 udpConn, err := udpsrv.Listen(udpAddr, p.l) 191 if err != nil { 192 p.l.Warningf("Opening UDP socket failed: %v", err) 193 continue 194 } 195 p.l.Infof("UDP socket id %d, addr %v", p.numConn, udpConn.LocalAddr()) 196 p.connList[p.numConn] = udpConn 197 _, p.srcPortList[p.numConn], err = net.SplitHostPort(udpConn.LocalAddr().String()) 198 if err != nil { 199 return err 200 } 201 p.numConn++ 202 } 203 if p.numConn < wantConn { 204 for _, c := range p.connList { 205 c.Close() 206 } 207 return fmt.Errorf("UDP socket creation failed: got %d connections, want %d", p.numConn, wantConn) 208 } 209 return nil 210 } 211 212 // initProbeRunResults initializes missing probe results objects. 
func (p *Probe) initProbeRunResults() error {
	for _, target := range p.targets {
		if !p.c.GetExportMetricsByPort() {
			// Per-port metrics are disabled: keep a single aggregate result
			// per target, keyed by an empty source port.
			f := flow{"", target.Name}
			if p.res[f] == nil {
				p.res[f] = p.newProbeResult()
			}
			continue
		}

		// Per-port metrics: one result object per (src-port, target) flow.
		for _, srcPort := range p.srcPortList {
			f := flow{srcPort, target.Name}
			if p.res[f] == nil {
				p.res[f] = p.newProbeResult()
			}
		}
	}
	// Never fails today; the error return leaves room for future checks.
	return nil
}

// packetID records attributes of the packets sent and received, by runProbe
// and recvLoop respectively. These packetIDs are communicated over channels
// and are eventually processed by the processPackets() loop (below).
type packetID struct {
	f    flow
	seq  uint64
	txTS time.Time // when the probe message was created/sent
	rxTS time.Time // when the response was received (zero value for sent records)
}

// resultsKey returns the map key under which results for flow f are
// aggregated: the full flow when exporting metrics per port, otherwise
// just the target (with an empty src port).
func (p *Probe) resultsKey(f flow) flow {
	if p.c.GetExportMetricsByPort() {
		return f
	}
	return flow{"", f.target}
}

// processRcvdPacket accounts for one received packet: a response that arrived
// within the timeout counts as a success and contributes to latency; one that
// arrived later counts as delayed.
func (p *Probe) processRcvdPacket(rpkt packetID) {
	p.l.Debugf("rpkt seq: %d, target: %s", rpkt.seq, rpkt.f)
	res, ok := p.res[p.resultsKey(rpkt.f)]
	if !ok {
		// Unknown flow (e.g. target no longer in the list); drop silently.
		return
	}
	latency := rpkt.rxTS.Sub(rpkt.txTS)
	if latency < 0 {
		p.l.Errorf("Got negative time delta %v for flow %v seq %d", latency, rpkt.f, rpkt.seq)
		return
	}
	if latency > p.opts.Timeout {
		p.l.Debugf("Packet delayed. Seq: %d, flow: %v, delay: %v", rpkt.seq, rpkt.f, latency)
		res.delayed++
		return
	}
	res.success++
	// Latency is exported in units of opts.LatencyUnit.
	res.latency.AddFloat64(latency.Seconds() / p.opts.LatencyUnit.Seconds())
}

// processSentPacket increments the "total" counter for the packet's flow.
func (p *Probe) processSentPacket(spkt packetID) {
	p.l.Debugf("spkt seq: %d, flow: %v", spkt.seq, spkt.f)
	res, ok := p.res[p.resultsKey(spkt.f)]
	if !ok {
		return
	}
	res.total++
}

// processPackets processes packets on the sentPackets and rcvdPackets
// channels. Packets are inserted into a lookup map as soon as they are
// received.
At every "statsExportInterval" interval, we go through the maps 282 // and update the probe results. 283 func (p *Probe) processPackets() { 284 // Process packets that we queued earlier (mostly from the last timeout 285 // interval) 286 for _, rpkt := range p.rPackets { 287 p.processRcvdPacket(rpkt) 288 } 289 for _, spkt := range p.sPackets { 290 p.processSentPacket(spkt) 291 } 292 p.rPackets = p.rPackets[0:0] 293 p.sPackets = p.sPackets[0:0] 294 295 lenRcvdPackets := len(p.rcvdPackets) 296 p.l.Debugf("rcvd queue length: %d", lenRcvdPackets) 297 lenSentPackets := len(p.sentPackets) 298 p.l.Debugf("sent queue length: %d", lenSentPackets) 299 300 now := time.Now() 301 for i := 0; i < lenSentPackets; i++ { 302 pkt := <-p.sentPackets 303 if now.Sub(pkt.txTS) < p.opts.Timeout { 304 p.l.Debugf("Inserting spacket (seq %d) for late processing", pkt.seq) 305 p.sPackets = append(p.sPackets, pkt) 306 continue 307 } 308 p.processSentPacket(pkt) 309 if pkt.seq > p.highestSeq[pkt.f] { 310 p.highestSeq[pkt.f] = pkt.seq 311 } 312 } 313 314 for i := 0; i < lenRcvdPackets; i++ { 315 pkt := <-p.rcvdPackets 316 if now.Sub(pkt.txTS) < p.opts.Timeout { 317 p.l.Debugf("Inserting rpacket (seq %d) for late processing", pkt.seq) 318 p.rPackets = append(p.rPackets, pkt) 319 continue 320 } 321 if pkt.seq > p.highestSeq[pkt.f] { 322 p.l.Debugf("Inserting rpacket for late processing as seq (%d) > highestSeq (%d)", pkt.seq, p.highestSeq[pkt.f]) 323 p.rPackets = append(p.rPackets, pkt) 324 continue 325 } 326 p.processRcvdPacket(pkt) 327 } 328 } 329 330 // Return true if the underlying error indicates a udp.Client timeout. 331 // In our case, we're using the ReadTimeout- time until response is read. 332 func isClientTimeout(err error) bool { 333 e, ok := err.(*net.OpError) 334 return ok && e != nil && e.Timeout() 335 } 336 337 // recvLoop receives all packets over a UDP socket and updates 338 // flowStates accordingly. 
// recvLoop runs until ctx is done. Each iteration sets a read deadline of
// opts.Timeout, reads one datagram, parses it as a cloudprober message, and
// queues a packetID on rcvdPackets for processPackets to account for.
func (p *Probe) recvLoop(ctx context.Context, conn *net.UDPConn) {
	b := make([]byte, maxMsgSize)
	for {
		// Non-blocking check for cancellation between reads.
		select {
		case <-ctx.Done():
			return
		default:
		}
		// Bound the read so the loop re-checks ctx at least every Timeout.
		conn.SetReadDeadline(time.Now().Add(p.opts.Timeout))
		msgLen, raddr, err := conn.ReadFromUDP(b)
		if err != nil {
			// Deadline expiries are expected; log anything else.
			if !isClientTimeout(err) {
				p.l.Errorf("Receive error on %s (from %v): %v", conn.LocalAddr(), raddr, err)
			}
			continue
		}

		rxTS := time.Now()
		msg, err := message.NewMessage(b[:msgLen])
		if err != nil {
			p.l.Errorf("Incoming message error from %s: %v", raddr, err)
			continue
		}
		// Hand off to processPackets via the buffered channel; drop (with an
		// error log) rather than block the receive loop if the channel is full.
		select {
		case p.rcvdPackets <- packetID{flow{msg.SrcPort(), msg.Dst()}, msg.Seq(), msg.SrcTS(), rxTS}:
		default:
			p.l.Errorf("rcvdPackets channel full")
		}
	}
}

// runSingleProbe resolves the target, builds the next message for the flow's
// state machine, and sends it over conn. On send failure the message is
// withdrawn from the flow state so sequence accounting stays consistent.
// On success, a sent-packet record is queued for processPackets.
func (p *Probe) runSingleProbe(f flow, conn *net.UDPConn, maxLen, dstPort int) error {
	ip, err := p.opts.Targets.Resolve(f.target, p.ipVer)
	if err != nil {
		return fmt.Errorf("unable to resolve %s: %v", f.target, err)
	}
	raddr := &net.UDPAddr{
		IP:   ip,
		Port: dstPort,
	}

	flowState := p.fsm.FlowState(p.src, f.srcPort, f.target)
	now := time.Now()
	msg, seq, err := flowState.CreateMessage(now, p.payload, maxLen)
	if err != nil {
		return fmt.Errorf("error creating new message to probe target(%s): %v", f.target, err)
	}

	if _, err := conn.WriteToUDP(msg, raddr); err != nil {
		flowState.WithdrawMessage(seq)
		return fmt.Errorf("unable to send to %s(%v): %v", f.target, raddr, err)
	}
	// Send packet over sentPackets channel
	// May need to make a longer buffer for the channel.
	select {
	case p.sentPackets <- packetID{f, seq, now, time.Time{}}:
		return nil
	default:
		return fmt.Errorf("sentPackets channel full")
	}
}

// runProbe performs a single probe run. The main thread launches one goroutine
// per target to probe.
// It manages a sync.WaitGroup and Wait's until all probes
// have finished, then exits the runProbe method.
//
// Each per-target goroutine sends a UDP message and exits; the "recvLoop"
// goroutines are expected to capture the responses before "timeout", and
// processPackets later flushes the results.
func (p *Probe) runProbe() {
	if len(p.targets) == 0 {
		return
	}
	maxLen := int(p.c.GetMaxLength())
	dstPort := int(p.c.GetPort())

	// Either send on every source port per run, or send one packet per run,
	// rotating through the source ports across runs (via runID).
	var packetsPerTarget, initialConn int
	if p.c.GetUseAllTxPortsPerProbe() {
		packetsPerTarget = len(p.connList)
		initialConn = 0
	} else {
		packetsPerTarget = 1
		initialConn = int(p.runID % uint64(len(p.connList)))
	}

	var wg sync.WaitGroup
	wg.Add(len(p.targets) * packetsPerTarget)

	// Bound how long sends can block, so a run finishes well within the
	// probe interval.
	for _, conn := range p.connList {
		conn.SetWriteDeadline(time.Now().Add(p.opts.Interval / 2))
	}
	for _, target := range p.targets {
		for i := 0; i < packetsPerTarget; i++ {
			connID := (initialConn + i) % len(p.connList)
			conn := p.connList[connID]
			go func(conn *net.UDPConn, f flow) {
				defer wg.Done()
				if err := p.runSingleProbe(f, conn, maxLen, dstPort); err != nil {
					p.l.Errorf("Probing %+v failed: %v", f, err)
				}
			}(conn, flow{p.srcPortList[connID], target.Name})
		}
	}
	wg.Wait()
	p.runID++
}

// updateTargets refreshes the targets list (truncating it to maxTargets),
// updates additional labels for each target, and makes sure every flow has
// a results object.
func (p *Probe) updateTargets() {
	p.targets = p.opts.Targets.ListEndpoints()
	if len(p.targets) > maxTargets {
		p.l.Warningf("Number of targets (%d) > maxTargets (%d). Truncating the targets list.", len(p.targets), maxTargets)
		p.targets = p.targets[:maxTargets]
	}
	for _, target := range p.targets {
		for _, al := range p.opts.AdditionalLabels {
			al.UpdateForTarget(target)
		}
	}
	// initProbeRunResults never returns a non-nil error today (see its
	// definition), so the error is deliberately ignored here.
	p.initProbeRunResults()
}

// Start starts and runs the probe indefinitely.
462 func (p *Probe) Start(ctx context.Context, dataChan chan *metrics.EventMetrics) { 463 p.updateTargets() 464 465 for _, conn := range p.connList { 466 go p.recvLoop(ctx, conn) 467 } 468 469 probeTicker := time.NewTicker(p.opts.Interval) 470 statsExportTicker := time.NewTicker(p.opts.StatsExportInterval) 471 flushTicker := time.NewTicker(p.flushIntv) 472 473 for { 474 select { 475 case <-ctx.Done(): 476 flushTicker.Stop() 477 probeTicker.Stop() 478 statsExportTicker.Stop() 479 return 480 case <-probeTicker.C: 481 p.runProbe() 482 case <-flushTicker.C: 483 p.processPackets() 484 case <-statsExportTicker.C: 485 for f, result := range p.res { 486 em := result.eventMetrics(p.name, p.opts, f, p.c) 487 em.LatencyUnit = p.opts.LatencyUnit 488 p.opts.LogMetrics(em) 489 dataChan <- em 490 } 491 // Use this opportunity to refresh targets as well. 492 p.updateTargets() 493 } 494 } 495 }