github.com/google/cloudprober@v0.11.3/probes/http/http.go (about) 1 // Copyright 2017-2020 The Cloudprober Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package http implements HTTP probe type. 16 package http 17 18 import ( 19 "bytes" 20 "context" 21 "crypto/tls" 22 "fmt" 23 "io/ioutil" 24 "math/rand" 25 "net" 26 "net/http" 27 "net/http/httptrace" 28 "net/url" 29 "strconv" 30 "strings" 31 "sync" 32 "time" 33 34 "github.com/google/cloudprober/common/oauth" 35 "github.com/google/cloudprober/common/tlsconfig" 36 "github.com/google/cloudprober/logger" 37 "github.com/google/cloudprober/metrics" 38 configpb "github.com/google/cloudprober/probes/http/proto" 39 "github.com/google/cloudprober/probes/options" 40 "github.com/google/cloudprober/targets/endpoint" 41 "github.com/google/cloudprober/validators" 42 "golang.org/x/oauth2" 43 ) 44 45 // DefaultTargetsUpdateInterval defines default frequency for target updates. 46 // Actual targets update interval is: 47 // max(DefaultTargetsUpdateInterval, probe_interval) 48 var DefaultTargetsUpdateInterval = 1 * time.Minute 49 50 // maxGapBetweenTargets defines the maximum gap between probe loops for each 51 // target. Actual gap is either configured or determined by the probe interval 52 // and number of targets. 53 const maxGapBetweenTargets = 1 * time.Second 54 55 const ( 56 maxResponseSizeForMetrics = 128 57 targetsUpdateInterval = 1 * time.Minute 58 largeBodyThreshold = bytes.MinRead // 512. 59 ) 60 61 // Probe holds aggregate information about all probe runs, per-target. 62 type Probe struct { 63 name string 64 opts *options.Options 65 c *configpb.ProbeConf 66 l *logger.Logger 67 client *http.Client 68 69 // book-keeping params 70 targets []endpoint.Endpoint 71 protocol string 72 method string 73 url string 74 oauthTS oauth2.TokenSource 75 bearerToken string 76 77 // Run counter, used to decide when to update targets or export 78 // stats. 79 runCnt int64 80 81 // How often to resolve targets (in probe counts), it's the minimum of 82 targetsUpdateInterval time.Duration 83 84 // How often to export metrics (in probe counts), initialized to 85 // statsExportInterval / p.opts.Interval. Metrics are exported when 86 // (runCnt % statsExportFrequency) == 0 87 statsExportFrequency int64 88 89 // Cancel functions for per-target probe loop 90 cancelFuncs map[string]context.CancelFunc 91 waitGroup sync.WaitGroup 92 93 requestBody []byte 94 } 95 96 type probeResult struct { 97 total, success, timeouts int64 98 connEvent int64 99 latency metrics.Value 100 respCodes *metrics.Map 101 respBodies *metrics.Map 102 validationFailure *metrics.Map 103 } 104 105 func (p *Probe) updateOauthToken() { 106 if p.oauthTS == nil { 107 return 108 } 109 110 tok, err := p.oauthTS.Token() 111 if err != nil { 112 p.l.Error("Error getting OAuth token: ", err.Error(), ". Skipping updating the token.") 113 } else { 114 if tok.AccessToken != "" { 115 p.bearerToken = tok.AccessToken 116 } else { 117 idToken, ok := tok.Extra("id_token").(string) 118 if ok { 119 p.bearerToken = idToken 120 } 121 } 122 p.l.Debug("Got OAuth token, len: ", strconv.FormatInt(int64(len(p.bearerToken)), 10), ", expirationTime: ", tok.Expiry.String()) 123 } 124 } 125 126 // Init initializes the probe with the given params. 127 func (p *Probe) Init(name string, opts *options.Options) error { 128 c, ok := opts.ProbeConf.(*configpb.ProbeConf) 129 if !ok { 130 return fmt.Errorf("not http config") 131 } 132 p.name = name 133 p.opts = opts 134 if p.l = opts.Logger; p.l == nil { 135 p.l = &logger.Logger{} 136 } 137 p.c = c 138 139 p.protocol = strings.ToLower(p.c.GetProtocol().String()) 140 p.method = p.c.GetMethod().String() 141 142 p.url = p.c.GetRelativeUrl() 143 if len(p.url) > 0 && p.url[0] != '/' { 144 return fmt.Errorf("Invalid Relative URL: %s, must begin with '/'", p.url) 145 } 146 147 p.requestBody = []byte(p.c.GetBody()) 148 149 // Create a transport for our use. This is mostly based on 150 // http.DefaultTransport with some timeouts changed. 151 // TODO(manugarg): Considering cloning DefaultTransport once 152 // https://github.com/golang/go/issues/26013 is fixed. 153 dialer := &net.Dialer{ 154 Timeout: p.opts.Timeout, 155 KeepAlive: 30 * time.Second, // TCP keep-alive 156 } 157 158 if p.opts.SourceIP != nil { 159 dialer.LocalAddr = &net.TCPAddr{ 160 IP: p.opts.SourceIP, 161 } 162 } 163 164 transport := &http.Transport{ 165 Proxy: http.ProxyFromEnvironment, 166 DialContext: dialer.DialContext, 167 MaxIdleConns: 256, // http.DefaultTransport.MaxIdleConns: 100. 168 TLSHandshakeTimeout: p.opts.Timeout, 169 } 170 171 if p.c.GetProxyUrl() != "" { 172 url, err := url.Parse(p.c.GetProxyUrl()) 173 if err != nil { 174 return fmt.Errorf("error parsing proxy URL (%s): %v", p.c.GetProxyUrl(), err) 175 } 176 transport.Proxy = http.ProxyURL(url) 177 } 178 179 if p.c.GetDisableCertValidation() || p.c.GetTlsConfig() != nil { 180 if transport.TLSClientConfig == nil { 181 transport.TLSClientConfig = &tls.Config{} 182 } 183 184 if p.c.GetDisableCertValidation() { 185 p.l.Warning("disable_cert_validation is deprecated as of v0.10.6. Instead of this, please use \"tls_config {disable_cert_validation: true}\"") 186 transport.TLSClientConfig.InsecureSkipVerify = true 187 } 188 189 if p.c.GetTlsConfig() != nil { 190 if err := tlsconfig.UpdateTLSConfig(transport.TLSClientConfig, p.c.GetTlsConfig(), false); err != nil { 191 return err 192 } 193 } 194 } 195 196 // If HTTP keep-alives are not enabled (default), disable HTTP keep-alive in 197 // transport. 198 if !p.c.GetKeepAlive() { 199 transport.DisableKeepAlives = true 200 } else { 201 // If it's been more than 2 probe intervals since connection was used, close it. 202 transport.IdleConnTimeout = 2 * p.opts.Interval 203 if p.c.GetRequestsPerProbe() > 1 { 204 transport.MaxIdleConnsPerHost = int(p.c.GetRequestsPerProbe()) 205 } 206 } 207 208 if p.c.GetOauthConfig() != nil { 209 oauthTS, err := oauth.TokenSourceFromConfig(p.c.GetOauthConfig(), p.l) 210 if err != nil { 211 return err 212 } 213 p.oauthTS = oauthTS 214 p.updateOauthToken() // This is also called periodically. 215 } 216 217 if p.c.GetDisableHttp2() { 218 // HTTP/2 is enabled by default if server supports it. Setting TLSNextProto 219 // to an empty dict is the only to disable it. 220 transport.TLSNextProto = make(map[string]func(string, *tls.Conn) http.RoundTripper) 221 } 222 223 // Clients are safe for concurrent use by multiple goroutines. 224 p.client = &http.Client{ 225 Transport: transport, 226 } 227 228 p.statsExportFrequency = p.opts.StatsExportInterval.Nanoseconds() / p.opts.Interval.Nanoseconds() 229 if p.statsExportFrequency == 0 { 230 p.statsExportFrequency = 1 231 } 232 233 p.targets = p.opts.Targets.ListEndpoints() 234 p.cancelFuncs = make(map[string]context.CancelFunc, len(p.targets)) 235 236 p.targetsUpdateInterval = DefaultTargetsUpdateInterval 237 // There is no point refreshing targets before probe interval. 238 if p.targetsUpdateInterval < p.opts.Interval { 239 p.targetsUpdateInterval = p.opts.Interval 240 } 241 p.l.Infof("Targets update interval: %v", p.targetsUpdateInterval) 242 243 return nil 244 } 245 246 // Return true if the underlying error indicates a http.Client timeout. 247 // 248 // Use for errors returned from http.Client methods (Get, Post). 249 func isClientTimeout(err error) bool { 250 if uerr, ok := err.(*url.Error); ok { 251 if nerr, ok := uerr.Err.(net.Error); ok && nerr.Timeout() { 252 return true 253 } 254 } 255 return false 256 } 257 258 // httpRequest executes an HTTP request and updates the provided result struct. 259 func (p *Probe) doHTTPRequest(req *http.Request, targetName string, result *probeResult, resultMu *sync.Mutex) { 260 261 if len(p.requestBody) >= largeBodyThreshold { 262 req = req.Clone(req.Context()) 263 req.Body = ioutil.NopCloser(bytes.NewReader(p.requestBody)) 264 } 265 266 if p.c.GetKeepAlive() { 267 trace := &httptrace.ClientTrace{ 268 ConnectDone: func(_, addr string, err error) { 269 result.connEvent++ 270 if err != nil { 271 p.l.Warning("Error establishing a new connection to: ", addr, ". Err: ", err.Error()) 272 return 273 } 274 p.l.Info("Established a new connection to: ", addr) 275 }, 276 } 277 req = req.WithContext(httptrace.WithClientTrace(req.Context(), trace)) 278 } 279 280 start := time.Now() 281 resp, err := p.client.Do(req) 282 latency := time.Since(start) 283 284 if resultMu != nil { 285 // Note that we take lock on result object outside of the actual request. 286 resultMu.Lock() 287 defer resultMu.Unlock() 288 } 289 290 result.total++ 291 292 if err != nil { 293 if isClientTimeout(err) { 294 p.l.Warning("Target:", targetName, ", URL:", req.URL.String(), ", http.doHTTPRequest: timeout error: ", err.Error()) 295 result.timeouts++ 296 return 297 } 298 p.l.Warning("Target:", targetName, ", URL:", req.URL.String(), ", http.doHTTPRequest: ", err.Error()) 299 return 300 } 301 302 respBody, err := ioutil.ReadAll(resp.Body) 303 if err != nil { 304 p.l.Warning("Target:", targetName, ", URL:", req.URL.String(), ", http.doHTTPRequest: ", err.Error()) 305 return 306 } 307 308 p.l.Debug("Target:", targetName, ", URL:", req.URL.String(), ", response: ", string(respBody)) 309 310 // Calling Body.Close() allows the TCP connection to be reused. 311 resp.Body.Close() 312 result.respCodes.IncKey(strconv.FormatInt(int64(resp.StatusCode), 10)) 313 314 if p.opts.Validators != nil { 315 failedValidations := validators.RunValidators(p.opts.Validators, &validators.Input{Response: resp, ResponseBody: respBody}, result.validationFailure, p.l) 316 317 // If any validation failed, return now, leaving the success and latency 318 // counters unchanged. 319 if len(failedValidations) > 0 { 320 p.l.Debug("Target:", targetName, ", URL:", req.URL.String(), ", http.doHTTPRequest: failed validations: ", strings.Join(failedValidations, ",")) 321 return 322 } 323 } 324 325 result.success++ 326 result.latency.AddFloat64(latency.Seconds() / p.opts.LatencyUnit.Seconds()) 327 if result.respBodies != nil && len(respBody) <= maxResponseSizeForMetrics { 328 result.respBodies.IncKey(string(respBody)) 329 } 330 } 331 332 func (p *Probe) runProbe(ctx context.Context, target endpoint.Endpoint, req *http.Request, result *probeResult) { 333 reqCtx, cancelReqCtx := context.WithTimeout(ctx, p.opts.Timeout) 334 defer cancelReqCtx() 335 336 if p.c.GetRequestsPerProbe() == 1 { 337 p.doHTTPRequest(req.WithContext(reqCtx), target.Name, result, nil) 338 return 339 } 340 341 // For multiple requests per probe, we launch a separate goroutine for each 342 // HTTP request. We use a mutex to protect access to per-target result object 343 // in doHTTPRequest. Note that result object is not accessed concurrently 344 // anywhere else -- export of metrics happens when probe is not running. 345 var resultMu sync.Mutex 346 347 wg := sync.WaitGroup{} 348 for numReq := int32(0); numReq < p.c.GetRequestsPerProbe(); numReq++ { 349 wg.Add(1) 350 go func(req *http.Request, targetName string, result *probeResult) { 351 defer wg.Done() 352 p.doHTTPRequest(req.WithContext(reqCtx), targetName, result, &resultMu) 353 }(req, target.Name, result) 354 } 355 wg.Wait() 356 } 357 358 func (p *Probe) newResult() *probeResult { 359 result := &probeResult{ 360 respCodes: metrics.NewMap("code", metrics.NewInt(0)), 361 } 362 363 if p.opts.Validators != nil { 364 result.validationFailure = validators.ValidationFailureMap(p.opts.Validators) 365 } 366 367 if p.opts.LatencyDist != nil { 368 result.latency = p.opts.LatencyDist.Clone() 369 } else { 370 result.latency = metrics.NewFloat(0) 371 } 372 373 if p.c.GetExportResponseAsMetrics() { 374 result.respBodies = metrics.NewMap("resp", metrics.NewInt(0)) 375 } 376 377 return result 378 } 379 380 func (p *Probe) exportMetrics(ts time.Time, result *probeResult, targetName string, dataChan chan *metrics.EventMetrics) { 381 em := metrics.NewEventMetrics(ts). 382 AddMetric("total", metrics.NewInt(result.total)). 383 AddMetric("success", metrics.NewInt(result.success)). 384 AddMetric("latency", result.latency). 385 AddMetric("timeouts", metrics.NewInt(result.timeouts)). 386 AddMetric("resp-code", result.respCodes). 387 AddLabel("ptype", "http"). 388 AddLabel("probe", p.name). 389 AddLabel("dst", targetName) 390 391 if result.respBodies != nil { 392 em.AddMetric("resp-body", result.respBodies) 393 } 394 395 if p.c.GetKeepAlive() { 396 em.AddMetric("connect_event", metrics.NewInt(result.connEvent)) 397 } 398 399 em.LatencyUnit = p.opts.LatencyUnit 400 401 for _, al := range p.opts.AdditionalLabels { 402 em.AddLabel(al.KeyValueForTarget(targetName)) 403 } 404 405 if result.validationFailure != nil { 406 em.AddMetric("validation_failure", result.validationFailure) 407 } 408 409 p.opts.LogMetrics(em) 410 dataChan <- em 411 } 412 413 func (p *Probe) startForTarget(ctx context.Context, target endpoint.Endpoint, dataChan chan *metrics.EventMetrics) { 414 p.l.Debug("Starting probing for the target ", target.Name) 415 416 // We use this counter to decide when to export stats. 417 var runCnt int64 418 419 for _, al := range p.opts.AdditionalLabels { 420 al.UpdateForTarget(target) 421 } 422 result := p.newResult() 423 req := p.httpRequestForTarget(target, nil) 424 425 ticker := time.NewTicker(p.opts.Interval) 426 defer ticker.Stop() 427 428 for ts := time.Now(); true; ts = <-ticker.C { 429 // Don't run another probe if context is canceled already. 430 if ctxDone(ctx) { 431 return 432 } 433 434 // If request is nil (most likely because target resolving failed or it 435 // was an invalid target), skip this probe cycle. Note that request 436 // creation gets retried at a regular interval (stats export interval). 437 if req != nil { 438 p.runProbe(ctx, target, req, result) 439 } 440 441 // Export stats if it's the time to do so. 442 runCnt++ 443 if (runCnt % p.statsExportFrequency) == 0 { 444 p.exportMetrics(ts, result, target.Name, dataChan) 445 446 // If we are resolving first, this is also a good time to recreate HTTP 447 // request in case target's IP has changed. 448 if p.c.GetResolveFirst() { 449 req = p.httpRequestForTarget(target, nil) 450 } 451 } 452 } 453 } 454 455 func (p *Probe) gapBetweenTargets() time.Duration { 456 interTargetGap := time.Duration(p.c.GetIntervalBetweenTargetsMsec()) * time.Millisecond 457 458 // If not configured by user, determine based on probe interval and number of 459 // targets. 460 if interTargetGap == 0 && len(p.targets) != 0 { 461 // Use 1/10th of the probe interval to spread out target groroutines. 462 interTargetGap = p.opts.Interval / time.Duration(10*len(p.targets)) 463 } 464 465 return interTargetGap 466 } 467 468 // updateTargetsAndStartProbes refreshes targets and starts probe loop for 469 // new targets and cancels probe loops for targets that are no longer active. 470 // Note that this function is not concurrency safe. It is never called 471 // concurrently by Start(). 472 func (p *Probe) updateTargetsAndStartProbes(ctx context.Context, dataChan chan *metrics.EventMetrics) { 473 p.targets = p.opts.Targets.ListEndpoints() 474 475 p.l.Debugf("Probe(%s) got %d targets", p.name, len(p.targets)) 476 477 // updatedTargets is used only for logging. 478 updatedTargets := make(map[string]string) 479 defer func() { 480 if len(updatedTargets) > 0 { 481 p.l.Infof("Probe(%s) targets updated: %v", p.name, updatedTargets) 482 } 483 }() 484 485 activeTargets := make(map[string]endpoint.Endpoint) 486 for _, target := range p.targets { 487 key := target.Key() 488 activeTargets[key] = target 489 } 490 491 // Stop probing for deleted targets by invoking cancelFunc. 492 for targetKey, cancelF := range p.cancelFuncs { 493 if _, ok := activeTargets[targetKey]; ok { 494 continue 495 } 496 cancelF() 497 updatedTargets[targetKey] = "DELETE" 498 delete(p.cancelFuncs, targetKey) 499 } 500 501 gapBetweenTargets := p.gapBetweenTargets() 502 var startWaitTime time.Duration 503 504 // Start probe loop for new targets. 505 for key, target := range activeTargets { 506 // This target is already initialized. 507 if _, ok := p.cancelFuncs[key]; ok { 508 continue 509 } 510 updatedTargets[key] = "ADD" 511 512 probeCtx, cancelF := context.WithCancel(ctx) 513 p.waitGroup.Add(1) 514 515 go func(target endpoint.Endpoint, waitTime time.Duration) { 516 defer p.waitGroup.Done() 517 // Wait for wait time + some jitter before starting this probe loop. 518 time.Sleep(waitTime + time.Duration(rand.Int63n(gapBetweenTargets.Microseconds()/10))*time.Microsecond) 519 p.startForTarget(probeCtx, target, dataChan) 520 }(target, startWaitTime) 521 522 startWaitTime += gapBetweenTargets 523 524 p.cancelFuncs[key] = cancelF 525 } 526 } 527 528 func ctxDone(ctx context.Context) bool { 529 select { 530 case <-ctx.Done(): 531 return true 532 default: 533 return false 534 } 535 } 536 537 // wait waits for child go-routines (one per target) to clean up. 538 func (p *Probe) wait() { 539 p.waitGroup.Wait() 540 } 541 542 // Start starts and runs the probe indefinitely. 543 func (p *Probe) Start(ctx context.Context, dataChan chan *metrics.EventMetrics) { 544 defer p.wait() 545 546 p.updateTargetsAndStartProbes(ctx, dataChan) 547 548 // Do more frequent listing of targets until we get a non-zero list of 549 // targets. 550 initialRefreshInterval := p.opts.Interval 551 // Don't wait too long if p.opts.Interval is large. 552 if initialRefreshInterval > time.Second { 553 initialRefreshInterval = time.Second 554 } 555 556 for { 557 if ctxDone(ctx) { 558 return 559 } 560 if len(p.targets) != 0 { 561 break 562 } 563 p.updateTargetsAndStartProbes(ctx, dataChan) 564 time.Sleep(initialRefreshInterval) 565 } 566 567 targetsUpdateTicker := time.NewTicker(p.targetsUpdateInterval) 568 defer targetsUpdateTicker.Stop() 569 570 for { 571 select { 572 case <-ctx.Done(): 573 return 574 case <-targetsUpdateTicker.C: 575 p.updateOauthToken() 576 p.updateTargetsAndStartProbes(ctx, dataChan) 577 } 578 } 579 }