github.com/google/cloudprober@v0.11.3/probes/grpc/grpc.go (about) 1 // Copyright 2020 The Cloudprober Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 /* 16 Package grpc implements a gRPC probe. 17 18 This probes a cloudprober gRPC server and reports success rate, latency, and 19 validation failures. 20 */ 21 package grpc 22 23 import ( 24 "context" 25 "errors" 26 "fmt" 27 "net" 28 "strconv" 29 "sync" 30 "time" 31 32 "github.com/golang/protobuf/proto" 33 "github.com/google/cloudprober/common/oauth" 34 "github.com/google/cloudprober/logger" 35 "github.com/google/cloudprober/metrics" 36 configpb "github.com/google/cloudprober/probes/grpc/proto" 37 "github.com/google/cloudprober/probes/options" 38 "github.com/google/cloudprober/probes/probeutils" 39 "github.com/google/cloudprober/sysvars" 40 "github.com/google/cloudprober/targets/endpoint" 41 42 grpcprobepb "github.com/google/cloudprober/servers/grpc/proto" 43 servicepb "github.com/google/cloudprober/servers/grpc/proto" 44 "google.golang.org/grpc" 45 "google.golang.org/grpc/credentials/alts" 46 grpcoauth "google.golang.org/grpc/credentials/oauth" 47 "google.golang.org/grpc/peer" 48 "google.golang.org/grpc/resolver" 49 50 // Import grpclb module so it can be used by name for DirectPath connections. 
51 _ "google.golang.org/grpc/balancer/grpclb" 52 ) 53 54 const loadBalancingPolicy = `{"loadBalancingConfig":[{"grpclb":{"childPolicy":[{"pick_first":{}}]}}]}` 55 56 // TargetsUpdateInterval controls frequency of target updates. 57 var ( 58 TargetsUpdateInterval = 1 * time.Minute 59 ) 60 61 // Probe holds aggregate information about all probe runs, per-target. 62 type Probe struct { 63 name string 64 src string 65 opts *options.Options 66 c *configpb.ProbeConf 67 l *logger.Logger 68 dialOpts []grpc.DialOption 69 70 // Targets and cancellation function for each target. 71 targets []endpoint.Endpoint 72 cancelFuncs map[string]context.CancelFunc 73 targetsMu sync.Mutex 74 75 // Results by target. 76 results map[string]*probeRunResult 77 } 78 79 // probeRunResult captures the metrics for a single target. Multiple threads 80 // can update metrics at the same time and the main thread periodically 81 // outputs the values in this struct. 82 type probeRunResult struct { 83 sync.Mutex 84 target string 85 total metrics.Int 86 success metrics.Int 87 latency metrics.Value 88 connectErrors metrics.Int 89 } 90 91 func (p *Probe) setupDialOpts() error { 92 oauthCfg := p.c.GetOauthConfig() 93 if oauthCfg != nil { 94 oauthTS, err := oauth.TokenSourceFromConfig(oauthCfg, p.l) 95 if err != nil { 96 return err 97 } 98 p.dialOpts = append(p.dialOpts, grpc.WithPerRPCCredentials(grpcoauth.TokenSource{oauthTS})) 99 } 100 altsCfg := p.c.GetAltsConfig() 101 if altsCfg != nil { 102 altsOpts := &alts.ClientOptions{ 103 TargetServiceAccounts: altsCfg.GetTargetServiceAccount(), 104 HandshakerServiceAddress: altsCfg.GetHandshakerServiceAddress(), 105 } 106 p.dialOpts = append(p.dialOpts, grpc.WithTransportCredentials(alts.NewClientCreds(altsOpts))) 107 } 108 109 if oauthCfg == nil && altsCfg == nil { 110 p.dialOpts = append(p.dialOpts, grpc.WithInsecure()) 111 } 112 p.dialOpts = append(p.dialOpts, grpc.WithDefaultServiceConfig(loadBalancingPolicy)) 113 p.dialOpts = append(p.dialOpts, 
grpc.WithBlock()) 114 return nil 115 } 116 117 // Init initializes the probe with the given params. 118 func (p *Probe) Init(name string, opts *options.Options) error { 119 c, ok := opts.ProbeConf.(*configpb.ProbeConf) 120 if !ok { 121 return errors.New("not a gRPC probe config") 122 } 123 p.c = c 124 p.name = name 125 p.opts = opts 126 if p.l = opts.Logger; p.l == nil { 127 p.l = &logger.Logger{} 128 } 129 p.targets = p.opts.Targets.ListEndpoints() 130 p.cancelFuncs = make(map[string]context.CancelFunc) 131 p.src = sysvars.Vars()["hostname"] 132 if err := p.setupDialOpts(); err != nil { 133 return err 134 } 135 resolver.SetDefaultScheme("dns") 136 return nil 137 } 138 139 func (p *Probe) updateTargetsAndStartProbes(ctx context.Context) { 140 newTargets := p.opts.Targets.ListEndpoints() 141 numNewTargets := len(newTargets) 142 143 p.targetsMu.Lock() 144 defer p.targetsMu.Unlock() 145 if numNewTargets == 0 || numNewTargets < (len(p.targets)/2) { 146 p.l.Errorf("Too few new targets, retaining old targets. New targets: %v, old count: %d", newTargets, len(p.targets)) 147 return 148 } 149 150 updatedTargets := make(map[string]string) 151 defer func() { 152 if len(updatedTargets) > 0 { 153 p.l.Infof("Probe(%s) targets updated: %v", p.name, updatedTargets) 154 } 155 }() 156 157 activeTargets := make(map[string]bool) 158 // Create results structure and start probe loop for new targets. 159 for _, tgtEp := range newTargets { 160 tgt := net.JoinHostPort(tgtEp.Name, strconv.Itoa(tgtEp.Port)) 161 activeTargets[tgt] = true 162 if _, ok := p.results[tgt]; ok { 163 continue 164 } 165 updatedTargets[tgt] = "ADD" 166 p.results[tgt] = p.newResult(tgt) 167 probeCtx, probeCancelFunc := context.WithCancel(ctx) 168 for i := 0; i < int(p.c.GetNumConns()); i++ { 169 go p.oneTargetLoop(probeCtx, tgt, i, p.results[tgt]) 170 } 171 p.cancelFuncs[tgt] = probeCancelFunc 172 } 173 174 // Stop probing for deleted targets by invoking cancelFunc. 
175 for tgt := range p.results { 176 if activeTargets[tgt] { 177 continue 178 } 179 p.cancelFuncs[tgt]() 180 updatedTargets[tgt] = "DELETE" 181 delete(p.results, tgt) 182 delete(p.cancelFuncs, tgt) 183 } 184 p.targets = newTargets 185 } 186 187 // connectWithRetry attempts to connect to a target. On failure, it retries in 188 // an infinite loop until successful, incrementing connectErrors for every 189 // connection error. On success, it returns a client immediately. 190 // Interval between connects is controlled by connect_timeout_msec, defaulting 191 // to probe timeout. 192 func (p *Probe) connectWithRetry(ctx context.Context, tgt, msgPattern string, result *probeRunResult) *grpc.ClientConn { 193 connectTimeout := p.opts.Timeout 194 if p.c.GetConnectTimeoutMsec() > 0 { 195 connectTimeout = time.Duration(p.c.GetConnectTimeoutMsec()) * time.Millisecond 196 } 197 var conn *grpc.ClientConn 198 var err error 199 for { 200 select { 201 case <-ctx.Done(): 202 p.l.Warningf("ProbeId(%s): context cancelled in connect loop.", msgPattern) 203 return nil 204 default: 205 } 206 connCtx, cancelFunc := context.WithTimeout(ctx, connectTimeout) 207 conn, err = grpc.DialContext(connCtx, tgt, p.dialOpts...) 208 cancelFunc() 209 if err != nil { 210 p.l.Warningf("ProbeId(%v) connect error: %v", msgPattern, err) 211 } else { 212 p.l.Infof("ProbeId(%v) connection established.", msgPattern) 213 break 214 } 215 result.Lock() 216 result.total.Inc() 217 result.connectErrors.Inc() 218 result.Unlock() 219 } 220 return conn 221 } 222 223 // oneTargetLoop connects to and then continuously probes a single target. 
func (p *Probe) oneTargetLoop(ctx context.Context, tgt string, index int, result *probeRunResult) {
	// msgPattern identifies this probe loop in payloads and logs:
	// "<source-host>,<target>,<conn-index>".
	msgPattern := fmt.Sprintf("%s,%s,%03d", p.src, tgt, index)

	conn := p.connectWithRetry(ctx, tgt, msgPattern, result)
	if conn == nil {
		// Context was cancelled while connecting; nothing more to do.
		return
	}
	defer conn.Close()

	client := servicepb.NewProberClient(conn)
	timeout := p.opts.Timeout
	method := p.c.GetMethod()

	// Fill the request blob with a repeating pattern derived from msgPattern.
	msgSize := p.c.GetBlobSize()
	msg := make([]byte, msgSize)
	probeutils.PatternPayload(msg, []byte(msgPattern))
	ticker := time.NewTicker(p.opts.Interval)
	for {
		select {
		case <-ctx.Done():
			p.l.Warningf("ProbeId(%s): context cancelled in request loop.", msgPattern)
			ticker.Stop()
			return
		case <-ctx.Done():
		case <-ticker.C:
		}

		// Each request gets its own deadline derived from the probe timeout.
		reqCtx, cancelFunc := context.WithTimeout(ctx, timeout)
		var success int64
		var delta time.Duration
		start := time.Now()
		var err error
		var peer peer.Peer // populated by the grpc.Peer call option; used in error logs
		opts := []grpc.CallOption{
			grpc.WaitForReady(true),
			grpc.Peer(&peer),
		}
		// Issue one RPC of the configured method type.
		switch method {
		case configpb.ProbeConf_ECHO:
			req := &grpcprobepb.EchoMessage{
				Blob: []byte(msg),
			}
			_, err = client.Echo(reqCtx, req, opts...)
		case configpb.ProbeConf_READ:
			req := &grpcprobepb.BlobReadRequest{
				Size: proto.Int32(msgSize),
			}
			_, err = client.BlobRead(reqCtx, req, opts...)
		case configpb.ProbeConf_WRITE:
			req := &grpcprobepb.BlobWriteRequest{
				Blob: []byte(msg),
			}
			_, err = client.BlobWrite(reqCtx, req, opts...)
		default:
			p.l.Criticalf("Method %v not implemented", method)
		}
		cancelFunc()
		if err != nil {
			peerAddr := "unknown"
			if peer.Addr != nil {
				peerAddr = peer.Addr.String()
			}
			p.l.Warningf("ProbeId(%s) request failed: %v. ConnState: %v. Peer: %v", msgPattern, err, conn.GetState(), peerAddr)
		} else {
			success = 1
			delta = time.Since(start)
		}
		// TODO(ls692): add validators for probe result.
		// Note: on failure delta stays 0 and is still folded into latency.
		result.Lock()
		result.total.Inc()
		result.success.AddInt64(success)
		result.latency.AddFloat64(delta.Seconds() / p.opts.LatencyUnit.Seconds())
		result.Unlock()
	}
}

// newResult allocates a fresh per-target result struct, using a clone of the
// configured latency distribution if there is one, or a plain float value
// otherwise.
func (p *Probe) newResult(tgt string) *probeRunResult {
	var latencyValue metrics.Value
	if p.opts.LatencyDist != nil {
		latencyValue = p.opts.LatencyDist.Clone()
	} else {
		latencyValue = metrics.NewFloat(0)
	}
	return &probeRunResult{
		target:  tgt,
		latency: latencyValue,
	}
}

// Start starts and runs the probe indefinitely.
func (p *Probe) Start(ctx context.Context, dataChan chan *metrics.EventMetrics) {
	p.results = make(map[string]*probeRunResult)
	p.updateTargetsAndStartProbes(ctx)

	// Export accumulated metrics on every stats-export tick.
	ticker := time.NewTicker(p.opts.StatsExportInterval)
	defer ticker.Stop()

	targetsUpdateTicker := time.NewTicker(TargetsUpdateInterval)
	defer targetsUpdateTicker.Stop()

	for ts := range ticker.C {
		// Stop further processing and exit if context is canceled.
		// Same context is used by probe loops.
		select {
		case <-ctx.Done():
			return
		default:
		}

		// Output results. Counters are cloned under the result lock so the
		// exported snapshot is consistent even while probe loops keep
		// updating them.
		for targetName, result := range p.results {
			result.Lock()
			em := metrics.NewEventMetrics(ts).
				AddMetric("total", result.total.Clone()).
				AddMetric("success", result.success.Clone()).
				AddMetric("latency", result.latency.Clone()).
				AddMetric("connecterrors", result.connectErrors.Clone()).
				AddLabel("ptype", "grpc").
				AddLabel("probe", p.name).
				AddLabel("dst", targetName)
			result.Unlock()
			em.LatencyUnit = p.opts.LatencyUnit
			for _, al := range p.opts.AdditionalLabels {
				em.AddLabel(al.KeyValueForTarget(targetName))
			}
			p.opts.LogMetrics(em)
			dataChan <- em
		}

		// Finally, update targets and start new probe loops if necessary.
		// Executing this as the last step in the loop also ensures that new
		// targets have at least one cycle of probes before next output cycle.
		select {
		case <-targetsUpdateTicker.C:
			p.updateTargetsAndStartProbes(ctx)
		default:
		}
	}
}