github.com/google/cloudprober@v0.11.3/prober/prober.go (about)

     1  // Copyright 2017-2019 The Cloudprober Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  /*
    16  Package prober provides a prober for running a set of probes.
    17  
    18  Prober takes in a config proto which dictates what probes should be created
    19  with what configuration, and manages the asynchronous fan-in/fan-out of the
    20  metrics data from these probes.
    21  */
    22  package prober
    23  
    24  import (
    25  	"context"
    26  	"fmt"
    27  	"math/rand"
    28  	"regexp"
    29  	"sync"
    30  	"time"
    31  
    32  	"github.com/golang/glog"
    33  	configpb "github.com/google/cloudprober/config/proto"
    34  	"github.com/google/cloudprober/config/runconfig"
    35  	"github.com/google/cloudprober/logger"
    36  	"github.com/google/cloudprober/metrics"
    37  	spb "github.com/google/cloudprober/prober/proto"
    38  	"github.com/google/cloudprober/probes"
    39  	"github.com/google/cloudprober/probes/options"
    40  	probes_configpb "github.com/google/cloudprober/probes/proto"
    41  	rdsserver "github.com/google/cloudprober/rds/server"
    42  	"github.com/google/cloudprober/servers"
    43  	"github.com/google/cloudprober/surfacers"
    44  	"github.com/google/cloudprober/sysvars"
    45  	"github.com/google/cloudprober/targets"
    46  	"github.com/google/cloudprober/targets/endpoint"
    47  	"github.com/google/cloudprober/targets/lameduck"
    48  	"google.golang.org/grpc/codes"
    49  	"google.golang.org/grpc/status"
    50  )
    51  
// Prober represents a collection of probes where each probe implements the Probe interface.
//
// Field overview, as used in this file:
//   - Probes maps a probe's name to its ProbeInfo; entries are added by addProbe.
//   - Servers and Surfacers are populated by Init; Start launches the servers
//     and writes every received EventMetrics to each surfacer.
//   - c and l are the config and logger handed to Init.
//   - mu is held by addProbe and startProbe while they touch Probes and
//     probeCancelFunc.
//   - ldLister is the lameduck lister; Init sets it only when lame-duck options
//     are present in the global targets options, otherwise it stays nil.
type Prober struct {
	Probes    map[string]*probes.ProbeInfo
	Servers   []*servers.ServerInfo
	c         *configpb.ProberConfig
	l         *logger.Logger
	mu        sync.Mutex
	ldLister  endpoint.Lister
	Surfacers []*surfacers.SurfacerInfo

	// Probe channel to handle starting of the new probes. It carries probe
	// names and is created in Init only when a default gRPC server is
	// configured; Start reads names from it and starts those probes.
	grpcStartProbeCh chan string

	// Per-probe cancelFunc map, keyed by probe name. startProbe stores the
	// cancel function of the context it derives for each probe here.
	probeCancelFunc map[string]context.CancelFunc

	// dataChan for passing metrics between probes and main goroutine. It is
	// created (buffered) in Start and handed to sysvars, servers and probes.
	dataChan chan *metrics.EventMetrics

	// Used by GetConfig for /config handler.
	TextConfig string
}
    74  
    75  func runOnThisHost(runOn string, hostname string) (bool, error) {
    76  	if runOn == "" {
    77  		return true, nil
    78  	}
    79  	r, err := regexp.Compile(runOn)
    80  	if err != nil {
    81  		return false, err
    82  	}
    83  	return r.MatchString(hostname), nil
    84  }
    85  
    86  func (pr *Prober) addProbe(p *probes_configpb.ProbeDef) error {
    87  	pr.mu.Lock()
    88  	defer pr.mu.Unlock()
    89  
    90  	// Check if this probe is supposed to run here.
    91  	runHere, err := runOnThisHost(p.GetRunOn(), sysvars.Vars()["hostname"])
    92  	if err != nil {
    93  		return err
    94  	}
    95  	if !runHere {
    96  		return nil
    97  	}
    98  
    99  	if pr.Probes[p.GetName()] != nil {
   100  		return status.Errorf(codes.AlreadyExists, "probe %s is already defined", p.GetName())
   101  	}
   102  
   103  	opts, err := options.BuildProbeOptions(p, pr.ldLister, pr.c.GetGlobalTargetsOptions(), pr.l)
   104  	if err != nil {
   105  		return status.Errorf(codes.Unknown, err.Error())
   106  	}
   107  
   108  	pr.l.Infof("Creating a %s probe: %s", p.GetType(), p.GetName())
   109  	probeInfo, err := probes.CreateProbe(p, opts)
   110  	if err != nil {
   111  		return status.Errorf(codes.Unknown, err.Error())
   112  	}
   113  	pr.Probes[p.GetName()] = probeInfo
   114  
   115  	return nil
   116  }
   117  
   118  // Init initialize prober with the given config file.
   119  func (pr *Prober) Init(ctx context.Context, cfg *configpb.ProberConfig, l *logger.Logger) error {
   120  	pr.c = cfg
   121  	pr.l = l
   122  
   123  	// Initialize cloudprober gRPC service if configured.
   124  	srv := runconfig.DefaultGRPCServer()
   125  	if srv != nil {
   126  		pr.grpcStartProbeCh = make(chan string)
   127  		spb.RegisterCloudproberServer(srv, pr)
   128  	}
   129  
   130  	// Initialize RDS server, if configured and attach to the default gRPC server.
   131  	// Note that we can still attach services to the default gRPC server as it's
   132  	// started later in Start().
   133  	if c := pr.c.GetRdsServer(); c != nil {
   134  		l, err := logger.NewCloudproberLog("rds-server")
   135  		if err != nil {
   136  			return err
   137  		}
   138  		rdsServer, err := rdsserver.New(ctx, c, nil, l)
   139  		if err != nil {
   140  			return err
   141  		}
   142  
   143  		runconfig.SetLocalRDSServer(rdsServer)
   144  		if srv != nil {
   145  			rdsServer.RegisterWithGRPC(srv)
   146  		}
   147  	}
   148  
   149  	// Initialize lameduck lister
   150  	globalTargetsOpts := pr.c.GetGlobalTargetsOptions()
   151  
   152  	if globalTargetsOpts.GetLameDuckOptions() != nil {
   153  		ldLogger, err := logger.NewCloudproberLog("lame-duck")
   154  		if err != nil {
   155  			return fmt.Errorf("error in initializing lame-duck logger: %v", err)
   156  		}
   157  
   158  		if err := lameduck.InitDefaultLister(globalTargetsOpts, nil, ldLogger); err != nil {
   159  			return err
   160  		}
   161  
   162  		pr.ldLister, err = lameduck.GetDefaultLister()
   163  		if err != nil {
   164  			pr.l.Warningf("Error while getting default lameduck lister, lameduck behavior will be disabled. Err: %v", err)
   165  		}
   166  	}
   167  
   168  	var err error
   169  
   170  	// Initialize shared targets
   171  	for _, st := range pr.c.GetSharedTargets() {
   172  		tgts, err := targets.New(st.GetTargets(), pr.ldLister, globalTargetsOpts, pr.l, pr.l)
   173  		if err != nil {
   174  			return err
   175  		}
   176  		targets.SetSharedTargets(st.GetName(), tgts)
   177  	}
   178  
   179  	// Initiliaze probes
   180  	pr.Probes = make(map[string]*probes.ProbeInfo)
   181  	pr.probeCancelFunc = make(map[string]context.CancelFunc)
   182  	for _, p := range pr.c.GetProbe() {
   183  		if err := pr.addProbe(p); err != nil {
   184  			return err
   185  		}
   186  	}
   187  
   188  	// Initialize servers
   189  	pr.Servers, err = servers.Init(ctx, pr.c.GetServer())
   190  	if err != nil {
   191  		return err
   192  	}
   193  
   194  	pr.Surfacers, err = surfacers.Init(ctx, pr.c.GetSurfacer())
   195  	if err != nil {
   196  		return err
   197  	}
   198  
   199  	return nil
   200  }
   201  
   202  // Start starts a previously initialized Cloudprober.
   203  func (pr *Prober) Start(ctx context.Context) {
   204  	pr.dataChan = make(chan *metrics.EventMetrics, 100000)
   205  
   206  	go func() {
   207  		var em *metrics.EventMetrics
   208  		for {
   209  			em = <-pr.dataChan
   210  			var s = em.String()
   211  			if len(s) > logger.MaxLogEntrySize {
   212  				glog.Warningf("Metric entry for timestamp %v dropped due to large size: %d", em.Timestamp, len(s))
   213  				continue
   214  			}
   215  
   216  			// Replicate the surfacer message to every surfacer we have
   217  			// registered. Note that s.Write() is expected to be
   218  			// non-blocking to avoid blocking of EventMetrics message
   219  			// processing.
   220  			for _, surfacer := range pr.Surfacers {
   221  				surfacer.Write(context.Background(), em)
   222  			}
   223  		}
   224  	}()
   225  
   226  	// Start a goroutine to export system variables
   227  	go sysvars.Start(ctx, pr.dataChan, time.Millisecond*time.Duration(pr.c.GetSysvarsIntervalMsec()), pr.c.GetSysvarsEnvVar())
   228  
   229  	// Start servers, each in its own goroutine
   230  	for _, s := range pr.Servers {
   231  		go s.Start(ctx, pr.dataChan)
   232  	}
   233  
   234  	if pr.c.GetDisableJitter() {
   235  		for name := range pr.Probes {
   236  			go pr.startProbe(ctx, name)
   237  		}
   238  		return
   239  	}
   240  	pr.startProbesWithJitter(ctx)
   241  
   242  	if runconfig.DefaultGRPCServer() != nil {
   243  		// Start a goroutine to handle starting of the probes added through gRPC.
   244  		// AddProbe adds new probes to the pr.grpcStartProbeCh channel and this
   245  		// goroutine reads from that channel and starts the probe using the overall
   246  		// Start context.
   247  		go func() {
   248  			for {
   249  				select {
   250  				case name := <-pr.grpcStartProbeCh:
   251  					pr.startProbe(ctx, name)
   252  				}
   253  			}
   254  		}()
   255  	}
   256  }
   257  
   258  func (pr *Prober) startProbe(ctx context.Context, name string) {
   259  	pr.mu.Lock()
   260  	defer pr.mu.Unlock()
   261  
   262  	probeCtx, cancelFunc := context.WithCancel(ctx)
   263  	pr.probeCancelFunc[name] = cancelFunc
   264  	go pr.Probes[name].Start(probeCtx, pr.dataChan)
   265  }
   266  
   267  // startProbesWithJitter try to space out probes over time, as much as possible,
   268  // without making it too complicated. We arrange probes into interval buckets -
   269  // all probes with the same interval will be part of the same bucket, and we
   270  // then spread out probes within that interval by introducing a delay of
   271  // interval / len(probes) between probes. We also introduce a random jitter
   272  // between different interval buckets.
   273  func (pr *Prober) startProbesWithJitter(ctx context.Context) {
   274  	// Seed random number generator.
   275  	rand.Seed(time.Now().UnixNano())
   276  
   277  	// Make interval -> [probe1, probe2, probe3..] map
   278  	intervalBuckets := make(map[time.Duration][]*probes.ProbeInfo)
   279  	for _, p := range pr.Probes {
   280  		intervalBuckets[p.Options.Interval] = append(intervalBuckets[p.Options.Interval], p)
   281  	}
   282  
   283  	for interval, probeInfos := range intervalBuckets {
   284  		go func(interval time.Duration, probeInfos []*probes.ProbeInfo) {
   285  			// Introduce a random jitter between interval buckets.
   286  			randomDelayMsec := rand.Int63n(int64(interval.Seconds() * 1000))
   287  			time.Sleep(time.Duration(randomDelayMsec) * time.Millisecond)
   288  
   289  			interProbeDelay := interval / time.Duration(len(probeInfos))
   290  
   291  			// Spread out probes evenly with an interval bucket.
   292  			for _, p := range probeInfos {
   293  				pr.l.Info("Starting probe: ", p.Name)
   294  				go pr.startProbe(ctx, p.Name)
   295  				time.Sleep(interProbeDelay)
   296  			}
   297  		}(interval, probeInfos)
   298  	}
   299  }