github.com/google/cloudprober@v0.11.3/probes/external/external.go (about) 1 // Copyright 2017-2020 The Cloudprober Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 /* 16 Package external implements an external probe type for cloudprober. 17 18 External probe type executes an external process for actual probing. These probes 19 can have two modes: "once" and "server". In "once" mode, the external process is 20 started for each probe run cycle, while in "server" mode, external process is 21 started only if it's not running already and Cloudprober communicates with it 22 over stdin/stdout for each probe cycle. 23 24 TODO(manugarg): Add a way to test this program. Write another program that 25 implements the probe server protocol and use that for testing. 26 */ 27 package external 28 29 import ( 30 "bufio" 31 "context" 32 "fmt" 33 "io" 34 "os" 35 "os/exec" 36 "regexp" 37 "strconv" 38 "strings" 39 "sync" 40 "time" 41 42 "github.com/golang/protobuf/proto" 43 "github.com/google/cloudprober/logger" 44 "github.com/google/cloudprober/metrics" 45 "github.com/google/cloudprober/metrics/payload" 46 configpb "github.com/google/cloudprober/probes/external/proto" 47 serverpb "github.com/google/cloudprober/probes/external/proto" 48 "github.com/google/cloudprober/probes/external/serverutils" 49 "github.com/google/cloudprober/probes/options" 50 "github.com/google/cloudprober/targets/endpoint" 51 "github.com/google/cloudprober/validators" 52 "github.com/google/shlex" 53 ) 54 55 var ( 56 // TimeBetweenRequests is the time interval between probe requests for 57 // multiple targets. In server mode, probe requests for multiple targets are 58 // sent to the same external probe process. Sleeping between requests provides 59 // some time buffer for the probe process to dequeue the incoming requests and 60 // avoids filling up the communication pipe. 61 // 62 // Note that this value impacts the effective timeout for a target as timeout 63 // is applied for all the targets in aggregate. For example, 100th target in 64 // the targets list will have the effective timeout of (timeout - 1ms). 65 // TODO(manugarg): Make sure that the last target in the list has an impact of 66 // less than 1% on its timeout. 67 TimeBetweenRequests = 10 * time.Microsecond 68 validLabelRe = regexp.MustCompile(`@(target|address|port|probe|target\.label\.[^@]+)@`) 69 ) 70 71 type result struct { 72 total, success int64 73 latency metrics.Value 74 validationFailure *metrics.Map 75 payloadMetrics *metrics.EventMetrics 76 } 77 78 // Probe holds aggregate information about all probe runs, per-target. 79 type Probe struct { 80 name string 81 mode string 82 cmdName string 83 cmdArgs []string 84 opts *options.Options 85 c *configpb.ProbeConf 86 l *logger.Logger 87 88 // book-keeping params 89 labelKeys map[string]bool // Labels for substitution 90 requestID int32 91 cmdRunning bool 92 cmdStdin io.Writer 93 cmdStdout io.ReadCloser 94 cmdStderr io.ReadCloser 95 replyChan chan *serverpb.ProbeReply 96 targets []endpoint.Endpoint 97 results map[string]*result // probe results keyed by targets 98 dataChan chan *metrics.EventMetrics 99 100 // default payload metrics that we clone from to build per-target payload 101 // metrics. 102 payloadParser *payload.Parser 103 } 104 105 func (p *Probe) updateLabelKeys() { 106 p.labelKeys = make(map[string]bool) 107 108 updateLabelKeysFn := func(s string) { 109 matches := validLabelRe.FindAllStringSubmatch(s, -1) 110 for _, m := range matches { 111 if len(m) >= 2 { 112 // Pick the match within outer parentheses. 113 p.labelKeys[m[1]] = true 114 } 115 } 116 } 117 118 for _, opt := range p.c.GetOptions() { 119 updateLabelKeysFn(opt.GetValue()) 120 } 121 for _, arg := range p.cmdArgs { 122 updateLabelKeysFn(arg) 123 } 124 } 125 126 // Init initializes the probe with the given params. 127 func (p *Probe) Init(name string, opts *options.Options) error { 128 c, ok := opts.ProbeConf.(*configpb.ProbeConf) 129 if !ok { 130 return fmt.Errorf("not external probe config") 131 } 132 p.name = name 133 p.opts = opts 134 if p.l = opts.Logger; p.l == nil { 135 p.l = &logger.Logger{} 136 } 137 p.c = c 138 p.replyChan = make(chan *serverpb.ProbeReply) 139 140 cmdParts, err := shlex.Split(p.c.GetCommand()) 141 if err != nil { 142 return fmt.Errorf("error parsing command line (%s): %v", p.c.GetCommand(), err) 143 } 144 p.cmdName = cmdParts[0] 145 p.cmdArgs = cmdParts[1:len(cmdParts)] 146 147 // Figure out labels we are interested in 148 p.updateLabelKeys() 149 150 switch p.c.GetMode() { 151 case configpb.ProbeConf_ONCE: 152 p.mode = "once" 153 case configpb.ProbeConf_SERVER: 154 p.mode = "server" 155 default: 156 return fmt.Errorf("invalid mode: %s", p.c.GetMode()) 157 } 158 159 p.results = make(map[string]*result) 160 161 if !p.c.GetOutputAsMetrics() { 162 return nil 163 } 164 165 defaultKind := metrics.CUMULATIVE 166 if p.c.GetMode() == configpb.ProbeConf_ONCE { 167 defaultKind = metrics.GAUGE 168 } 169 170 p.payloadParser, err = payload.NewParser(p.c.GetOutputMetricsOptions(), "external", p.name, metrics.Kind(defaultKind), p.l) 171 if err != nil { 172 return fmt.Errorf("error initializing payload metrics: %v", err) 173 } 174 175 return nil 176 } 177 178 // substituteLabels replaces occurrences of @label@ with the values from 179 // labels. It returns the substituted string and a bool indicating if there 180 // was a @label@ that did not exist in the labels map. 181 func substituteLabels(in string, labels map[string]string) (string, bool) { 182 if len(labels) == 0 { 183 return in, !strings.Contains(in, "@") 184 } 185 delimiter := "@" 186 output := "" 187 words := strings.Split(in, delimiter) 188 count := len(words) 189 foundAll := true 190 for j, kwd := range words { 191 // Even number of words => just copy out. 192 if j%2 == 0 { 193 output += kwd 194 continue 195 } 196 // Special case: If there are an odd number of '@' (unbalanced), the last 197 // odd index doesn't actually have a closing '@', so we just append it as it 198 // is. 199 if j == count-1 { 200 output += delimiter 201 output += kwd 202 continue 203 } 204 205 // Special case: "@@" => "@" 206 if kwd == "" { 207 output += delimiter 208 continue 209 } 210 211 // Finally, the labels. 212 replace, ok := labels[kwd] 213 if ok { 214 output += replace 215 continue 216 } 217 218 // Nothing - put the token back in. 219 foundAll = false 220 output += delimiter 221 output += kwd 222 output += delimiter 223 } 224 return output, foundAll 225 } 226 227 type command interface { 228 Wait() error 229 } 230 231 // monitorCommand waits for the process to terminate and sets cmdRunning to 232 // false when that happens. 233 func (p *Probe) monitorCommand(startCtx context.Context, cmd command) error { 234 err := cmd.Wait() 235 236 // Spare logging error message if killed explicitly. 237 select { 238 case <-startCtx.Done(): 239 return nil 240 default: 241 } 242 243 if exitErr, ok := err.(*exec.ExitError); ok { 244 return fmt.Errorf("external probe process died with the status: %s. Stderr: %s", exitErr.Error(), string(exitErr.Stderr)) 245 } 246 return err 247 } 248 249 func (p *Probe) startCmdIfNotRunning(startCtx context.Context) error { 250 // Start external probe command if it's not running already. Note that here we 251 // are trusting the cmdRunning to be set correctly. It can be false for 3 reasons: 252 // 1) This is the first call and the process has actually never been started. 253 // 2) cmd.Start() started the process but still returned an error. 254 // 3) cmd.Wait() returned incorrectly, while the process was still running. 255 // 256 // 2 or 3 should never happen as per design, but managing processes can be tricky. 257 // Documenting here to help with debugging if we run into an issue. 258 if p.cmdRunning { 259 return nil 260 } 261 p.l.Infof("Starting external command: %s %s", p.cmdName, strings.Join(p.cmdArgs, " ")) 262 cmd := exec.CommandContext(startCtx, p.cmdName, p.cmdArgs...) 263 var err error 264 if p.cmdStdin, err = cmd.StdinPipe(); err != nil { 265 return err 266 } 267 if p.cmdStdout, err = cmd.StdoutPipe(); err != nil { 268 return err 269 } 270 if p.cmdStderr, err = cmd.StderrPipe(); err != nil { 271 return err 272 } 273 274 go func() { 275 scanner := bufio.NewScanner(p.cmdStderr) 276 for scanner.Scan() { 277 p.l.Warningf("Stderr of %s: %s", cmd.Path, scanner.Text()) 278 } 279 }() 280 281 if err = cmd.Start(); err != nil { 282 p.l.Errorf("error while starting the cmd: %s %s. Err: %v", cmd.Path, cmd.Args, err) 283 return fmt.Errorf("error while starting the cmd: %s %s. Err: %v", cmd.Path, cmd.Args, err) 284 } 285 286 doneChan := make(chan struct{}) 287 // This goroutine waits for the process to terminate and sets cmdRunning to 288 // false when that happens. 289 go func() { 290 if err := p.monitorCommand(startCtx, cmd); err != nil { 291 p.l.Error(err.Error()) 292 } 293 close(doneChan) 294 p.cmdRunning = false 295 }() 296 go p.readProbeReplies(doneChan) 297 p.cmdRunning = true 298 return nil 299 } 300 301 func (p *Probe) readProbeReplies(done chan struct{}) error { 302 bufReader := bufio.NewReader(p.cmdStdout) 303 // Start a background goroutine to read probe replies from the probe server 304 // process's stdout and put them on the probe's replyChan. Note that replyChan 305 // is a one element channel. Idea is that we won't need buffering other than 306 // the one provided by Unix pipes. 307 for { 308 select { 309 case <-done: 310 return nil 311 default: 312 } 313 rep, err := serverutils.ReadProbeReply(bufReader) 314 if err != nil { 315 // Return if external probe process pipe has closed. We get: 316 // io.EOF: when other process has closed the pipe. 317 // os.ErrClosed: when we have closed the pipe (through cmd.Wait()). 318 // *os.PathError: deferred close of the pipe. 319 _, isPathError := err.(*os.PathError) 320 if err == os.ErrClosed || err == io.EOF || isPathError { 321 p.l.Errorf("External probe process pipe is closed. Err: %s", err.Error()) 322 return err 323 } 324 p.l.Errorf("Error reading probe reply: %s", err.Error()) 325 continue 326 } 327 p.replyChan <- rep 328 } 329 330 } 331 332 func (p *Probe) withAdditionalLabels(em *metrics.EventMetrics, target string) *metrics.EventMetrics { 333 for _, al := range p.opts.AdditionalLabels { 334 em.AddLabel(al.KeyValueForTarget(target)) 335 } 336 return em 337 } 338 339 func (p *Probe) defaultMetrics(target string, result *result) *metrics.EventMetrics { 340 em := metrics.NewEventMetrics(time.Now()). 341 AddMetric("success", metrics.NewInt(result.success)). 342 AddMetric("total", metrics.NewInt(result.total)). 343 AddMetric("latency", result.latency). 344 AddLabel("ptype", "external"). 345 AddLabel("probe", p.name). 346 AddLabel("dst", target) 347 348 em.LatencyUnit = p.opts.LatencyUnit 349 350 if p.opts.Validators != nil { 351 em.AddMetric("validation_failure", result.validationFailure) 352 } 353 354 return p.withAdditionalLabels(em, target) 355 } 356 357 func (p *Probe) labels(ep endpoint.Endpoint) map[string]string { 358 labels := make(map[string]string) 359 if p.labelKeys["probe"] { 360 labels["probe"] = p.name 361 } 362 if p.labelKeys["target"] { 363 labels["target"] = ep.Name 364 } 365 if p.labelKeys["port"] { 366 labels["port"] = strconv.Itoa(ep.Port) 367 } 368 if p.labelKeys["address"] { 369 addr, err := p.opts.Targets.Resolve(ep.Name, p.opts.IPVersion) 370 if err != nil { 371 p.l.Warningf("Targets.Resolve(%v, %v) failed: %v ", ep.Name, p.opts.IPVersion, err) 372 } else if !addr.IsUnspecified() { 373 labels["address"] = addr.String() 374 } 375 } 376 for lk, lv := range ep.Labels { 377 k := "target.label." + lk 378 if p.labelKeys[k] { 379 labels[k] = lv 380 } 381 } 382 return labels 383 } 384 385 func (p *Probe) sendRequest(requestID int32, ep endpoint.Endpoint) error { 386 req := &serverpb.ProbeRequest{ 387 RequestId: proto.Int32(requestID), 388 TimeLimit: proto.Int32(int32(p.opts.Timeout / time.Millisecond)), 389 Options: []*serverpb.ProbeRequest_Option{}, 390 } 391 for _, opt := range p.c.GetOptions() { 392 value, found := substituteLabels(opt.GetValue(), p.labels(ep)) 393 if !found { 394 p.l.Warningf("Missing substitution in option %q", value) 395 } 396 req.Options = append(req.Options, &serverpb.ProbeRequest_Option{ 397 Name: opt.Name, 398 Value: proto.String(value), 399 }) 400 } 401 402 p.l.Debugf("Sending a probe request %v to the external probe server for target %v", requestID, ep.Name) 403 return serverutils.WriteMessage(req, p.cmdStdin) 404 } 405 406 type requestInfo struct { 407 target string 408 timestamp time.Time 409 } 410 411 // probeStatus captures the single probe status. It's only used by runProbe 412 // functions to pass a probe's status to processProbeResult method. 413 type probeStatus struct { 414 target string 415 success bool 416 latency time.Duration 417 payload string 418 } 419 420 func (p *Probe) processProbeResult(ps *probeStatus, result *result) { 421 if ps.success && p.opts.Validators != nil { 422 failedValidations := validators.RunValidators(p.opts.Validators, &validators.Input{ResponseBody: []byte(ps.payload)}, result.validationFailure, p.l) 423 424 // If any validation failed, log and set success to false. 425 if len(failedValidations) > 0 { 426 p.l.Debug("Target:", ps.target, " failed validations: ", strings.Join(failedValidations, ","), ".") 427 ps.success = false 428 } 429 } 430 431 if ps.success { 432 result.success++ 433 result.latency.AddFloat64(ps.latency.Seconds() / p.opts.LatencyUnit.Seconds()) 434 } 435 436 em := p.defaultMetrics(ps.target, result) 437 p.opts.LogMetrics(em) 438 p.dataChan <- em 439 440 // If probe is configured to use the external process output (or reply payload 441 // in case of server probe) as metrics. 442 if p.c.GetOutputAsMetrics() { 443 if p.c.GetOutputMetricsOptions().GetAggregateInCloudprober() { 444 result.payloadMetrics = p.payloadParser.AggregatedPayloadMetrics(result.payloadMetrics, ps.payload, ps.target) 445 p.opts.LogMetrics(result.payloadMetrics) 446 p.dataChan <- p.withAdditionalLabels(result.payloadMetrics, ps.target) 447 } else { 448 for _, em := range p.payloadParser.PayloadMetrics(ps.payload, ps.target) { 449 p.opts.LogMetrics(em) 450 p.dataChan <- p.withAdditionalLabels(em, ps.target) 451 } 452 } 453 } 454 } 455 456 func (p *Probe) runServerProbe(ctx, startCtx context.Context) { 457 requests := make(map[int32]requestInfo) 458 var requestsMu sync.RWMutex 459 doneChan := make(chan struct{}) 460 461 if err := p.startCmdIfNotRunning(startCtx); err != nil { 462 p.l.Error(err.Error()) 463 return 464 } 465 466 var wg sync.WaitGroup 467 wg.Add(1) 468 go func() { 469 defer wg.Done() 470 471 // Read probe replies until we have no outstanding requests or context has 472 // run out. 473 for { 474 _, ok := <-doneChan 475 if !ok { 476 // It is safe to access requests without lock here as it won't be accessed 477 // by the send loop after doneChan is closed. 478 p.l.Debugf("Number of outstanding requests: %d", len(requests)) 479 if len(requests) == 0 { 480 return 481 } 482 } 483 select { 484 case <-ctx.Done(): 485 p.l.Error(ctx.Err().Error()) 486 return 487 case rep := <-p.replyChan: 488 requestsMu.Lock() 489 reqInfo, ok := requests[rep.GetRequestId()] 490 if ok { 491 delete(requests, rep.GetRequestId()) 492 } 493 requestsMu.Unlock() 494 if !ok { 495 // Not our reply, could be from the last timed out probe. 496 p.l.Warningf("Got a reply that doesn't match any outstading request: Request id from reply: %v. Ignoring.", rep.GetRequestId()) 497 continue 498 } 499 success := true 500 if rep.GetErrorMessage() != "" { 501 p.l.Errorf("Probe for target %v failed with error message: %s", reqInfo.target, rep.GetErrorMessage()) 502 success = false 503 } 504 p.processProbeResult(&probeStatus{ 505 target: reqInfo.target, 506 success: success, 507 latency: time.Since(reqInfo.timestamp), 508 payload: rep.GetPayload(), 509 }, p.results[reqInfo.target]) 510 } 511 } 512 }() 513 514 // Send probe requests 515 for _, target := range p.targets { 516 p.requestID++ 517 p.results[target.Name].total++ 518 requestsMu.Lock() 519 requests[p.requestID] = requestInfo{ 520 target: target.Name, 521 timestamp: time.Now(), 522 } 523 requestsMu.Unlock() 524 p.sendRequest(p.requestID, target) 525 time.Sleep(TimeBetweenRequests) 526 } 527 528 // Send signal to receiver loop that we are done sending request. 529 close(doneChan) 530 531 // Wait for receiver goroutine to exit. 532 wg.Wait() 533 534 // Handle requests that we have not yet received replies for: "requests" will 535 // contain only outstanding requests by this point. 536 requestsMu.Lock() 537 defer requestsMu.Unlock() 538 for _, req := range requests { 539 p.processProbeResult(&probeStatus{ 540 target: req.target, 541 success: false, 542 }, p.results[req.target]) 543 } 544 } 545 546 // runCommand encapsulates command executor in a variable so that we can 547 // override it for testing. 548 var runCommand = func(ctx context.Context, cmd string, args []string) ([]byte, error) { 549 return exec.CommandContext(ctx, cmd, args...).Output() 550 } 551 552 func (p *Probe) runOnceProbe(ctx context.Context) { 553 var wg sync.WaitGroup 554 555 for _, target := range p.targets { 556 wg.Add(1) 557 go func(target endpoint.Endpoint, result *result) { 558 defer wg.Done() 559 args := make([]string, len(p.cmdArgs)) 560 for i, arg := range p.cmdArgs { 561 res, found := substituteLabels(arg, p.labels(target)) 562 if !found { 563 p.l.Warningf("Substitution not found in %q", arg) 564 } 565 args[i] = res 566 } 567 568 p.l.Infof("Running external command: %s %s", p.cmdName, strings.Join(args, " ")) 569 result.total++ 570 startTime := time.Now() 571 b, err := runCommand(ctx, p.cmdName, args) 572 573 success := true 574 if err != nil { 575 success = false 576 if exitErr, ok := err.(*exec.ExitError); ok { 577 p.l.Errorf("external probe process died with the status: %s. Stderr: %s", exitErr.Error(), exitErr.Stderr) 578 } else { 579 p.l.Errorf("Error executing the external program. Err: %v", err) 580 } 581 } 582 583 p.processProbeResult(&probeStatus{ 584 target: target.Name, 585 success: success, 586 latency: time.Since(startTime), 587 payload: string(b), 588 }, result) 589 }(target, p.results[target.Name]) 590 } 591 wg.Wait() 592 } 593 594 func (p *Probe) updateTargets() { 595 p.targets = p.opts.Targets.ListEndpoints() 596 597 for _, target := range p.targets { 598 if _, ok := p.results[target.Name]; ok { 599 continue 600 } 601 602 var latencyValue metrics.Value 603 if p.opts.LatencyDist != nil { 604 latencyValue = p.opts.LatencyDist.Clone() 605 } else { 606 latencyValue = metrics.NewFloat(0) 607 } 608 609 p.results[target.Name] = &result{ 610 latency: latencyValue, 611 validationFailure: validators.ValidationFailureMap(p.opts.Validators), 612 } 613 614 for _, al := range p.opts.AdditionalLabels { 615 al.UpdateForTarget(target) 616 } 617 } 618 } 619 620 func (p *Probe) runProbe(startCtx context.Context) { 621 probeCtx, cancelFunc := context.WithTimeout(startCtx, p.opts.Timeout) 622 defer cancelFunc() 623 624 p.updateTargets() 625 626 if p.mode == "server" { 627 p.runServerProbe(probeCtx, startCtx) 628 } else { 629 p.runOnceProbe(probeCtx) 630 } 631 } 632 633 // Start starts and runs the probe indefinitely. 634 func (p *Probe) Start(startCtx context.Context, dataChan chan *metrics.EventMetrics) { 635 p.dataChan = dataChan 636 637 ticker := time.NewTicker(p.opts.Interval) 638 defer ticker.Stop() 639 640 for range ticker.C { 641 // Don't run another probe if context is canceled already. 642 select { 643 case <-startCtx.Done(): 644 return 645 default: 646 } 647 648 p.runProbe(startCtx) 649 } 650 }