github.com/google/cloudprober@v0.11.3/probes/dns/dns.go (about) 1 // Copyright 2017-2019 The Cloudprober Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 /* 16 Package dns implements a DNS prober. It sends UDP DNS queries to a list of 17 targets and reports statistics on queries sent, queries received, and latency 18 experienced. 19 20 This prober uses the DNS library in /third_party/golang/dns/dns to construct, 21 send, and receive DNS messages. Every message is sent on a different UDP port. 22 Queries to each target are sent in parallel. 23 */ 24 package dns 25 26 import ( 27 "context" 28 "errors" 29 "fmt" 30 "net" 31 "strings" 32 "sync" 33 "time" 34 35 "github.com/google/cloudprober/logger" 36 "github.com/google/cloudprober/metrics" 37 "github.com/google/cloudprober/probes/common/statskeeper" 38 configpb "github.com/google/cloudprober/probes/dns/proto" 39 "github.com/google/cloudprober/probes/options" 40 "github.com/google/cloudprober/targets/endpoint" 41 "github.com/google/cloudprober/validators" 42 "github.com/miekg/dns" 43 ) 44 45 // Client provides a DNS client interface for required functionality. 46 // This makes it possible to mock. 47 type Client interface { 48 Exchange(*dns.Msg, string) (*dns.Msg, time.Duration, error) 49 setReadTimeout(time.Duration) 50 setSourceIP(net.IP) 51 } 52 53 // ClientImpl is a concrete DNS client that can be instantiated. 54 type clientImpl struct { 55 dns.Client 56 } 57 58 // setReadTimeout allows write-access to the underlying ReadTimeout variable. 59 func (c *clientImpl) setReadTimeout(d time.Duration) { 60 c.ReadTimeout = d 61 } 62 63 // setSourceIP allows write-access to the underlying ReadTimeout variable. 64 func (c *clientImpl) setSourceIP(ip net.IP) { 65 c.Dialer = &net.Dialer{ 66 LocalAddr: &net.UDPAddr{IP: ip}, 67 } 68 } 69 70 // Probe holds aggregate information about all probe runs, per-target. 71 type Probe struct { 72 name string 73 opts *options.Options 74 c *configpb.ProbeConf 75 l *logger.Logger 76 77 // book-keeping params 78 targets []endpoint.Endpoint 79 msg *dns.Msg 80 client Client 81 } 82 83 // probeRunResult captures the results of a single probe run. The way we work with 84 // stats makes sure that probeRunResult and its fields are not accessed concurrently 85 // (see documentation with statsKeeper below). That's the reason we use metrics.Int 86 // types instead of metrics.AtomicInt. 87 type probeRunResult struct { 88 target string 89 total metrics.Int 90 success metrics.Int 91 latency metrics.Value 92 timeouts metrics.Int 93 validationFailure *metrics.Map 94 } 95 96 // Metrics converts probeRunResult into metrics.EventMetrics object 97 func (prr probeRunResult) Metrics() *metrics.EventMetrics { 98 return metrics.NewEventMetrics(time.Now()). 99 AddMetric("total", &prr.total). 100 AddMetric("success", &prr.success). 101 AddMetric("latency", prr.latency). 102 AddMetric("timeouts", &prr.timeouts). 103 AddMetric("validation_failure", prr.validationFailure) 104 } 105 106 // Target returns the p.target. 107 func (prr probeRunResult) Target() string { 108 return prr.target 109 } 110 111 func (p *Probe) updateTargets() { 112 p.targets = p.opts.Targets.ListEndpoints() 113 114 for _, target := range p.targets { 115 for _, al := range p.opts.AdditionalLabels { 116 al.UpdateForTarget(target) 117 } 118 } 119 } 120 121 // Init initializes the probe with the given params. 122 func (p *Probe) Init(name string, opts *options.Options) error { 123 c, ok := opts.ProbeConf.(*configpb.ProbeConf) 124 if !ok { 125 return errors.New("no dns config") 126 } 127 p.c = c 128 p.name = name 129 p.opts = opts 130 if p.l = opts.Logger; p.l == nil { 131 p.l = &logger.Logger{} 132 } 133 p.updateTargets() 134 135 // I believe these objects are safe for concurrent use by multiple goroutines 136 // (although the documentation doesn't explicitly say so). It uses locks 137 // internally and the underlying net.Conn declares that multiple goroutines 138 // may invoke methods on a net.Conn simultaneously. 139 p.msg = new(dns.Msg) 140 queryType := p.c.GetQueryType() 141 if queryType == configpb.QueryType_NONE || int32(queryType) >= int32(dns.TypeReserved) { 142 return fmt.Errorf("dns_probe(%v): invalid query type %v", name, queryType) 143 } 144 p.msg.SetQuestion(dns.Fqdn(p.c.GetResolvedDomain()), uint16(queryType)) 145 146 p.client = new(clientImpl) 147 if p.opts.SourceIP != nil { 148 p.client.setSourceIP(p.opts.SourceIP) 149 } 150 // Use ReadTimeout because DialTimeout for UDP is not the RTT. 151 p.client.setReadTimeout(p.opts.Timeout) 152 153 return nil 154 } 155 156 // Return true if the underlying error indicates a dns.Client timeout. 157 // In our case, we're using the ReadTimeout- time until response is read. 158 func isClientTimeout(err error) bool { 159 e, ok := err.(*net.OpError) 160 return ok && e != nil && e.Timeout() 161 } 162 163 // validateResponse checks status code and answer section for correctness and 164 // returns true if the response is valid. In case of validation failures, it 165 // also updates the result structure. 166 func (p *Probe) validateResponse(resp *dns.Msg, target string, result *probeRunResult) bool { 167 if resp == nil || resp.Rcode != dns.RcodeSuccess { 168 p.l.Warningf("Target(%s): error in response %v", target, resp) 169 return false 170 } 171 172 // Validate number of answers in response. 173 // TODO: Move this logic to validators. 174 minAnswers := p.c.GetMinAnswers() 175 if minAnswers > 0 && uint32(len(resp.Answer)) < minAnswers { 176 p.l.Warningf("Target(%s): too few answers - got %d want %d.\n\tAnswerBlock: %v", 177 target, len(resp.Answer), minAnswers, resp.Answer) 178 return false 179 } 180 181 if p.opts.Validators != nil { 182 answers := []string{} 183 for _, rr := range resp.Answer { 184 if rr != nil { 185 answers = append(answers, rr.String()) 186 } 187 } 188 respBytes := []byte(strings.Join(answers, "\n")) 189 190 failedValidations := validators.RunValidators(p.opts.Validators, &validators.Input{ResponseBody: respBytes}, result.validationFailure, p.l) 191 if len(failedValidations) > 0 { 192 p.l.Debugf("Target(%s): validators %v failed. Resp: %v", target, failedValidations, answers) 193 return false 194 } 195 } 196 197 return true 198 } 199 200 // resolveFunc resolves the given host for the IP version. 201 // This type is mainly used for testing. For all other cases, a nil function 202 // should be passed to the runProbe function. 203 type resolveFunc func(host string, ipVer int) (net.IP, error) 204 205 func (p *Probe) runProbe(resultsChan chan<- statskeeper.ProbeResult, resolveF resolveFunc) { 206 // Refresh the list of targets to probe. 207 p.updateTargets() 208 209 wg := sync.WaitGroup{} 210 for _, target := range p.targets { 211 wg.Add(1) 212 213 // Launch a separate goroutine for each target. 214 // Write probe results to the "resultsChan" channel. 215 go func(target endpoint.Endpoint, resultsChan chan<- statskeeper.ProbeResult) { 216 defer wg.Done() 217 218 result := probeRunResult{ 219 target: target.Name, 220 validationFailure: validators.ValidationFailureMap(p.opts.Validators), 221 } 222 223 if p.opts.LatencyDist != nil { 224 result.latency = p.opts.LatencyDist.Clone() 225 } else { 226 result.latency = metrics.NewFloat(0) 227 } 228 229 result.total.Inc() 230 231 fullTarget := net.JoinHostPort(target.Name, "53") 232 if p.c.GetResolveFirst() { 233 if resolveF == nil { 234 resolveF = p.opts.Targets.Resolve 235 } 236 ip, err := resolveF(target.Name, p.opts.IPVersion) 237 if err != nil { 238 p.l.Warningf("Target(%s): Resolve error: %v", target.Name, err) 239 resultsChan <- result 240 return 241 } 242 fullTarget = net.JoinHostPort(ip.String(), "53") 243 } 244 245 resp, latency, err := p.client.Exchange(p.msg, fullTarget) 246 247 if err != nil { 248 if isClientTimeout(err) { 249 p.l.Warningf("Target(%s): client.Exchange: Timeout error: %v", fullTarget, err) 250 result.timeouts.Inc() 251 } else { 252 p.l.Warningf("Target(%s): client.Exchange: %v", fullTarget, err) 253 } 254 } else if p.validateResponse(resp, fullTarget, &result) { 255 result.success.Inc() 256 result.latency.AddFloat64(latency.Seconds() / p.opts.LatencyUnit.Seconds()) 257 } 258 resultsChan <- result 259 }(target, resultsChan) 260 } 261 262 // Wait until all probes are done. 263 wg.Wait() 264 } 265 266 // Start starts and runs the probe indefinitely. 267 func (p *Probe) Start(ctx context.Context, dataChan chan *metrics.EventMetrics) { 268 resultsChan := make(chan statskeeper.ProbeResult, len(p.targets)) 269 270 // This function is used by StatsKeeper to get the latest list of targets. 271 // TODO(manugarg): Make p.targets mutex protected as it's read and written by concurrent goroutines. 272 targetsFunc := func() []endpoint.Endpoint { 273 return p.targets 274 } 275 276 go statskeeper.StatsKeeper(ctx, "dns", p.name, p.opts, targetsFunc, resultsChan, dataChan) 277 278 ticker := time.NewTicker(p.opts.Interval) 279 defer ticker.Stop() 280 281 for range ticker.C { 282 // Don't run another probe if context is canceled already. 283 select { 284 case <-ctx.Done(): 285 return 286 default: 287 } 288 p.runProbe(resultsChan, nil) 289 } 290 }