github.com/google/cloudprober@v0.11.3/probes/dns/dns.go (about)

     1  // Copyright 2017-2019 The Cloudprober Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  /*
    16  Package dns implements a DNS prober. It sends UDP DNS queries to a list of
    17  targets and reports statistics on queries sent, queries received, and latency
    18  experienced.
    19  
    20  This prober uses the github.com/miekg/dns library to construct,
    21  send, and receive DNS messages. Every message is sent on a different UDP port.
    22  Queries to each target are sent in parallel.
    23  */
    24  package dns
    25  
    26  import (
    27  	"context"
    28  	"errors"
    29  	"fmt"
    30  	"net"
    31  	"strings"
    32  	"sync"
    33  	"time"
    34  
    35  	"github.com/google/cloudprober/logger"
    36  	"github.com/google/cloudprober/metrics"
    37  	"github.com/google/cloudprober/probes/common/statskeeper"
    38  	configpb "github.com/google/cloudprober/probes/dns/proto"
    39  	"github.com/google/cloudprober/probes/options"
    40  	"github.com/google/cloudprober/targets/endpoint"
    41  	"github.com/google/cloudprober/validators"
    42  	"github.com/miekg/dns"
    43  )
    44  
    45  // Client provides a DNS client interface for required functionality.
    46  // This makes it possible to mock.
    47  type Client interface {
    48  	Exchange(*dns.Msg, string) (*dns.Msg, time.Duration, error)
    49  	setReadTimeout(time.Duration)
    50  	setSourceIP(net.IP)
    51  }
    52  
    53  // ClientImpl is a concrete DNS client that can be instantiated.
    54  type clientImpl struct {
    55  	dns.Client
    56  }
    57  
    58  // setReadTimeout allows write-access to the underlying ReadTimeout variable.
    59  func (c *clientImpl) setReadTimeout(d time.Duration) {
    60  	c.ReadTimeout = d
    61  }
    62  
    63  // setSourceIP allows write-access to the underlying ReadTimeout variable.
    64  func (c *clientImpl) setSourceIP(ip net.IP) {
    65  	c.Dialer = &net.Dialer{
    66  		LocalAddr: &net.UDPAddr{IP: ip},
    67  	}
    68  }
    69  
    70  // Probe holds aggregate information about all probe runs, per-target.
    71  type Probe struct {
    72  	name string
    73  	opts *options.Options
    74  	c    *configpb.ProbeConf
    75  	l    *logger.Logger
    76  
    77  	// book-keeping params
    78  	targets []endpoint.Endpoint
    79  	msg     *dns.Msg
    80  	client  Client
    81  }
    82  
    83  // probeRunResult captures the results of a single probe run. The way we work with
    84  // stats makes sure that probeRunResult and its fields are not accessed concurrently
    85  // (see documentation with statsKeeper below). That's the reason we use metrics.Int
    86  // types instead of metrics.AtomicInt.
    87  type probeRunResult struct {
    88  	target            string
    89  	total             metrics.Int
    90  	success           metrics.Int
    91  	latency           metrics.Value
    92  	timeouts          metrics.Int
    93  	validationFailure *metrics.Map
    94  }
    95  
    96  // Metrics converts probeRunResult into metrics.EventMetrics object
    97  func (prr probeRunResult) Metrics() *metrics.EventMetrics {
    98  	return metrics.NewEventMetrics(time.Now()).
    99  		AddMetric("total", &prr.total).
   100  		AddMetric("success", &prr.success).
   101  		AddMetric("latency", prr.latency).
   102  		AddMetric("timeouts", &prr.timeouts).
   103  		AddMetric("validation_failure", prr.validationFailure)
   104  }
   105  
   106  // Target returns the p.target.
   107  func (prr probeRunResult) Target() string {
   108  	return prr.target
   109  }
   110  
   111  func (p *Probe) updateTargets() {
   112  	p.targets = p.opts.Targets.ListEndpoints()
   113  
   114  	for _, target := range p.targets {
   115  		for _, al := range p.opts.AdditionalLabels {
   116  			al.UpdateForTarget(target)
   117  		}
   118  	}
   119  }
   120  
   121  // Init initializes the probe with the given params.
   122  func (p *Probe) Init(name string, opts *options.Options) error {
   123  	c, ok := opts.ProbeConf.(*configpb.ProbeConf)
   124  	if !ok {
   125  		return errors.New("no dns config")
   126  	}
   127  	p.c = c
   128  	p.name = name
   129  	p.opts = opts
   130  	if p.l = opts.Logger; p.l == nil {
   131  		p.l = &logger.Logger{}
   132  	}
   133  	p.updateTargets()
   134  
   135  	// I believe these objects are safe for concurrent use by multiple goroutines
   136  	// (although the documentation doesn't explicitly say so). It uses locks
   137  	// internally and the underlying net.Conn declares that multiple goroutines
   138  	// may invoke methods on a net.Conn simultaneously.
   139  	p.msg = new(dns.Msg)
   140  	queryType := p.c.GetQueryType()
   141  	if queryType == configpb.QueryType_NONE || int32(queryType) >= int32(dns.TypeReserved) {
   142  		return fmt.Errorf("dns_probe(%v): invalid query type %v", name, queryType)
   143  	}
   144  	p.msg.SetQuestion(dns.Fqdn(p.c.GetResolvedDomain()), uint16(queryType))
   145  
   146  	p.client = new(clientImpl)
   147  	if p.opts.SourceIP != nil {
   148  		p.client.setSourceIP(p.opts.SourceIP)
   149  	}
   150  	// Use ReadTimeout because DialTimeout for UDP is not the RTT.
   151  	p.client.setReadTimeout(p.opts.Timeout)
   152  
   153  	return nil
   154  }
   155  
   156  // Return true if the underlying error indicates a dns.Client timeout.
   157  // In our case, we're using the ReadTimeout- time until response is read.
   158  func isClientTimeout(err error) bool {
   159  	e, ok := err.(*net.OpError)
   160  	return ok && e != nil && e.Timeout()
   161  }
   162  
   163  // validateResponse checks status code and answer section for correctness and
   164  // returns true if the response is valid. In case of validation failures, it
   165  // also updates the result structure.
   166  func (p *Probe) validateResponse(resp *dns.Msg, target string, result *probeRunResult) bool {
   167  	if resp == nil || resp.Rcode != dns.RcodeSuccess {
   168  		p.l.Warningf("Target(%s): error in response %v", target, resp)
   169  		return false
   170  	}
   171  
   172  	// Validate number of answers in response.
   173  	// TODO: Move this logic to validators.
   174  	minAnswers := p.c.GetMinAnswers()
   175  	if minAnswers > 0 && uint32(len(resp.Answer)) < minAnswers {
   176  		p.l.Warningf("Target(%s): too few answers - got %d want %d.\n\tAnswerBlock: %v",
   177  			target, len(resp.Answer), minAnswers, resp.Answer)
   178  		return false
   179  	}
   180  
   181  	if p.opts.Validators != nil {
   182  		answers := []string{}
   183  		for _, rr := range resp.Answer {
   184  			if rr != nil {
   185  				answers = append(answers, rr.String())
   186  			}
   187  		}
   188  		respBytes := []byte(strings.Join(answers, "\n"))
   189  
   190  		failedValidations := validators.RunValidators(p.opts.Validators, &validators.Input{ResponseBody: respBytes}, result.validationFailure, p.l)
   191  		if len(failedValidations) > 0 {
   192  			p.l.Debugf("Target(%s): validators %v failed. Resp: %v", target, failedValidations, answers)
   193  			return false
   194  		}
   195  	}
   196  
   197  	return true
   198  }
   199  
   200  // resolveFunc resolves the given host for the IP version.
   201  // This type is mainly used for testing. For all other cases, a nil function
   202  // should be passed to the runProbe function.
   203  type resolveFunc func(host string, ipVer int) (net.IP, error)
   204  
   205  func (p *Probe) runProbe(resultsChan chan<- statskeeper.ProbeResult, resolveF resolveFunc) {
   206  	// Refresh the list of targets to probe.
   207  	p.updateTargets()
   208  
   209  	wg := sync.WaitGroup{}
   210  	for _, target := range p.targets {
   211  		wg.Add(1)
   212  
   213  		// Launch a separate goroutine for each target.
   214  		// Write probe results to the "resultsChan" channel.
   215  		go func(target endpoint.Endpoint, resultsChan chan<- statskeeper.ProbeResult) {
   216  			defer wg.Done()
   217  
   218  			result := probeRunResult{
   219  				target:            target.Name,
   220  				validationFailure: validators.ValidationFailureMap(p.opts.Validators),
   221  			}
   222  
   223  			if p.opts.LatencyDist != nil {
   224  				result.latency = p.opts.LatencyDist.Clone()
   225  			} else {
   226  				result.latency = metrics.NewFloat(0)
   227  			}
   228  
   229  			result.total.Inc()
   230  
   231  			fullTarget := net.JoinHostPort(target.Name, "53")
   232  			if p.c.GetResolveFirst() {
   233  				if resolveF == nil {
   234  					resolveF = p.opts.Targets.Resolve
   235  				}
   236  				ip, err := resolveF(target.Name, p.opts.IPVersion)
   237  				if err != nil {
   238  					p.l.Warningf("Target(%s): Resolve error: %v", target.Name, err)
   239  					resultsChan <- result
   240  					return
   241  				}
   242  				fullTarget = net.JoinHostPort(ip.String(), "53")
   243  			}
   244  
   245  			resp, latency, err := p.client.Exchange(p.msg, fullTarget)
   246  
   247  			if err != nil {
   248  				if isClientTimeout(err) {
   249  					p.l.Warningf("Target(%s): client.Exchange: Timeout error: %v", fullTarget, err)
   250  					result.timeouts.Inc()
   251  				} else {
   252  					p.l.Warningf("Target(%s): client.Exchange: %v", fullTarget, err)
   253  				}
   254  			} else if p.validateResponse(resp, fullTarget, &result) {
   255  				result.success.Inc()
   256  				result.latency.AddFloat64(latency.Seconds() / p.opts.LatencyUnit.Seconds())
   257  			}
   258  			resultsChan <- result
   259  		}(target, resultsChan)
   260  	}
   261  
   262  	// Wait until all probes are done.
   263  	wg.Wait()
   264  }
   265  
   266  // Start starts and runs the probe indefinitely.
   267  func (p *Probe) Start(ctx context.Context, dataChan chan *metrics.EventMetrics) {
   268  	resultsChan := make(chan statskeeper.ProbeResult, len(p.targets))
   269  
   270  	// This function is used by StatsKeeper to get the latest list of targets.
   271  	// TODO(manugarg): Make p.targets mutex protected as it's read and written by concurrent goroutines.
   272  	targetsFunc := func() []endpoint.Endpoint {
   273  		return p.targets
   274  	}
   275  
   276  	go statskeeper.StatsKeeper(ctx, "dns", p.name, p.opts, targetsFunc, resultsChan, dataChan)
   277  
   278  	ticker := time.NewTicker(p.opts.Interval)
   279  	defer ticker.Stop()
   280  
   281  	for range ticker.C {
   282  		// Don't run another probe if context is canceled already.
   283  		select {
   284  		case <-ctx.Done():
   285  			return
   286  		default:
   287  		}
   288  		p.runProbe(resultsChan, nil)
   289  	}
   290  }