github.com/google/cloudprober@v0.11.3/prober/prober.go (about) 1 // Copyright 2017-2019 The Cloudprober Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 /* 16 Package prober provides a prober for running a set of probes. 17 18 Prober takes in a config proto which dictates what probes should be created 19 with what configuration, and manages the asynchronous fan-in/fan-out of the 20 metrics data from these probes. 21 */ 22 package prober 23 24 import ( 25 "context" 26 "fmt" 27 "math/rand" 28 "regexp" 29 "sync" 30 "time" 31 32 "github.com/golang/glog" 33 configpb "github.com/google/cloudprober/config/proto" 34 "github.com/google/cloudprober/config/runconfig" 35 "github.com/google/cloudprober/logger" 36 "github.com/google/cloudprober/metrics" 37 spb "github.com/google/cloudprober/prober/proto" 38 "github.com/google/cloudprober/probes" 39 "github.com/google/cloudprober/probes/options" 40 probes_configpb "github.com/google/cloudprober/probes/proto" 41 rdsserver "github.com/google/cloudprober/rds/server" 42 "github.com/google/cloudprober/servers" 43 "github.com/google/cloudprober/surfacers" 44 "github.com/google/cloudprober/sysvars" 45 "github.com/google/cloudprober/targets" 46 "github.com/google/cloudprober/targets/endpoint" 47 "github.com/google/cloudprober/targets/lameduck" 48 "google.golang.org/grpc/codes" 49 "google.golang.org/grpc/status" 50 ) 51 52 // Prober represents a collection of probes where each probe implements the Probe interface. 53 type Prober struct { 54 Probes map[string]*probes.ProbeInfo 55 Servers []*servers.ServerInfo 56 c *configpb.ProberConfig 57 l *logger.Logger 58 mu sync.Mutex 59 ldLister endpoint.Lister 60 Surfacers []*surfacers.SurfacerInfo 61 62 // Probe channel to handle starting of the new probes. 63 grpcStartProbeCh chan string 64 65 // Per-probe cancelFunc map. 66 probeCancelFunc map[string]context.CancelFunc 67 68 // dataChan for passing metrics between probes and main goroutine. 69 dataChan chan *metrics.EventMetrics 70 71 // Used by GetConfig for /config handler. 72 TextConfig string 73 } 74 75 func runOnThisHost(runOn string, hostname string) (bool, error) { 76 if runOn == "" { 77 return true, nil 78 } 79 r, err := regexp.Compile(runOn) 80 if err != nil { 81 return false, err 82 } 83 return r.MatchString(hostname), nil 84 } 85 86 func (pr *Prober) addProbe(p *probes_configpb.ProbeDef) error { 87 pr.mu.Lock() 88 defer pr.mu.Unlock() 89 90 // Check if this probe is supposed to run here. 91 runHere, err := runOnThisHost(p.GetRunOn(), sysvars.Vars()["hostname"]) 92 if err != nil { 93 return err 94 } 95 if !runHere { 96 return nil 97 } 98 99 if pr.Probes[p.GetName()] != nil { 100 return status.Errorf(codes.AlreadyExists, "probe %s is already defined", p.GetName()) 101 } 102 103 opts, err := options.BuildProbeOptions(p, pr.ldLister, pr.c.GetGlobalTargetsOptions(), pr.l) 104 if err != nil { 105 return status.Errorf(codes.Unknown, err.Error()) 106 } 107 108 pr.l.Infof("Creating a %s probe: %s", p.GetType(), p.GetName()) 109 probeInfo, err := probes.CreateProbe(p, opts) 110 if err != nil { 111 return status.Errorf(codes.Unknown, err.Error()) 112 } 113 pr.Probes[p.GetName()] = probeInfo 114 115 return nil 116 } 117 118 // Init initialize prober with the given config file. 119 func (pr *Prober) Init(ctx context.Context, cfg *configpb.ProberConfig, l *logger.Logger) error { 120 pr.c = cfg 121 pr.l = l 122 123 // Initialize cloudprober gRPC service if configured. 124 srv := runconfig.DefaultGRPCServer() 125 if srv != nil { 126 pr.grpcStartProbeCh = make(chan string) 127 spb.RegisterCloudproberServer(srv, pr) 128 } 129 130 // Initialize RDS server, if configured and attach to the default gRPC server. 131 // Note that we can still attach services to the default gRPC server as it's 132 // started later in Start(). 133 if c := pr.c.GetRdsServer(); c != nil { 134 l, err := logger.NewCloudproberLog("rds-server") 135 if err != nil { 136 return err 137 } 138 rdsServer, err := rdsserver.New(ctx, c, nil, l) 139 if err != nil { 140 return err 141 } 142 143 runconfig.SetLocalRDSServer(rdsServer) 144 if srv != nil { 145 rdsServer.RegisterWithGRPC(srv) 146 } 147 } 148 149 // Initialize lameduck lister 150 globalTargetsOpts := pr.c.GetGlobalTargetsOptions() 151 152 if globalTargetsOpts.GetLameDuckOptions() != nil { 153 ldLogger, err := logger.NewCloudproberLog("lame-duck") 154 if err != nil { 155 return fmt.Errorf("error in initializing lame-duck logger: %v", err) 156 } 157 158 if err := lameduck.InitDefaultLister(globalTargetsOpts, nil, ldLogger); err != nil { 159 return err 160 } 161 162 pr.ldLister, err = lameduck.GetDefaultLister() 163 if err != nil { 164 pr.l.Warningf("Error while getting default lameduck lister, lameduck behavior will be disabled. Err: %v", err) 165 } 166 } 167 168 var err error 169 170 // Initialize shared targets 171 for _, st := range pr.c.GetSharedTargets() { 172 tgts, err := targets.New(st.GetTargets(), pr.ldLister, globalTargetsOpts, pr.l, pr.l) 173 if err != nil { 174 return err 175 } 176 targets.SetSharedTargets(st.GetName(), tgts) 177 } 178 179 // Initiliaze probes 180 pr.Probes = make(map[string]*probes.ProbeInfo) 181 pr.probeCancelFunc = make(map[string]context.CancelFunc) 182 for _, p := range pr.c.GetProbe() { 183 if err := pr.addProbe(p); err != nil { 184 return err 185 } 186 } 187 188 // Initialize servers 189 pr.Servers, err = servers.Init(ctx, pr.c.GetServer()) 190 if err != nil { 191 return err 192 } 193 194 pr.Surfacers, err = surfacers.Init(ctx, pr.c.GetSurfacer()) 195 if err != nil { 196 return err 197 } 198 199 return nil 200 } 201 202 // Start starts a previously initialized Cloudprober. 203 func (pr *Prober) Start(ctx context.Context) { 204 pr.dataChan = make(chan *metrics.EventMetrics, 100000) 205 206 go func() { 207 var em *metrics.EventMetrics 208 for { 209 em = <-pr.dataChan 210 var s = em.String() 211 if len(s) > logger.MaxLogEntrySize { 212 glog.Warningf("Metric entry for timestamp %v dropped due to large size: %d", em.Timestamp, len(s)) 213 continue 214 } 215 216 // Replicate the surfacer message to every surfacer we have 217 // registered. Note that s.Write() is expected to be 218 // non-blocking to avoid blocking of EventMetrics message 219 // processing. 220 for _, surfacer := range pr.Surfacers { 221 surfacer.Write(context.Background(), em) 222 } 223 } 224 }() 225 226 // Start a goroutine to export system variables 227 go sysvars.Start(ctx, pr.dataChan, time.Millisecond*time.Duration(pr.c.GetSysvarsIntervalMsec()), pr.c.GetSysvarsEnvVar()) 228 229 // Start servers, each in its own goroutine 230 for _, s := range pr.Servers { 231 go s.Start(ctx, pr.dataChan) 232 } 233 234 if pr.c.GetDisableJitter() { 235 for name := range pr.Probes { 236 go pr.startProbe(ctx, name) 237 } 238 return 239 } 240 pr.startProbesWithJitter(ctx) 241 242 if runconfig.DefaultGRPCServer() != nil { 243 // Start a goroutine to handle starting of the probes added through gRPC. 244 // AddProbe adds new probes to the pr.grpcStartProbeCh channel and this 245 // goroutine reads from that channel and starts the probe using the overall 246 // Start context. 247 go func() { 248 for { 249 select { 250 case name := <-pr.grpcStartProbeCh: 251 pr.startProbe(ctx, name) 252 } 253 } 254 }() 255 } 256 } 257 258 func (pr *Prober) startProbe(ctx context.Context, name string) { 259 pr.mu.Lock() 260 defer pr.mu.Unlock() 261 262 probeCtx, cancelFunc := context.WithCancel(ctx) 263 pr.probeCancelFunc[name] = cancelFunc 264 go pr.Probes[name].Start(probeCtx, pr.dataChan) 265 } 266 267 // startProbesWithJitter try to space out probes over time, as much as possible, 268 // without making it too complicated. We arrange probes into interval buckets - 269 // all probes with the same interval will be part of the same bucket, and we 270 // then spread out probes within that interval by introducing a delay of 271 // interval / len(probes) between probes. We also introduce a random jitter 272 // between different interval buckets. 273 func (pr *Prober) startProbesWithJitter(ctx context.Context) { 274 // Seed random number generator. 275 rand.Seed(time.Now().UnixNano()) 276 277 // Make interval -> [probe1, probe2, probe3..] map 278 intervalBuckets := make(map[time.Duration][]*probes.ProbeInfo) 279 for _, p := range pr.Probes { 280 intervalBuckets[p.Options.Interval] = append(intervalBuckets[p.Options.Interval], p) 281 } 282 283 for interval, probeInfos := range intervalBuckets { 284 go func(interval time.Duration, probeInfos []*probes.ProbeInfo) { 285 // Introduce a random jitter between interval buckets. 286 randomDelayMsec := rand.Int63n(int64(interval.Seconds() * 1000)) 287 time.Sleep(time.Duration(randomDelayMsec) * time.Millisecond) 288 289 interProbeDelay := interval / time.Duration(len(probeInfos)) 290 291 // Spread out probes evenly with an interval bucket. 292 for _, p := range probeInfos { 293 pr.l.Info("Starting probe: ", p.Name) 294 go pr.startProbe(ctx, p.Name) 295 time.Sleep(interProbeDelay) 296 } 297 }(interval, probeInfos) 298 } 299 }