github.com/netdata/go.d.plugin@v0.58.1/agent/module/job.go (about) 1 // SPDX-License-Identifier: GPL-3.0-or-later 2 3 package module 4 5 import ( 6 "bytes" 7 "fmt" 8 "io" 9 "log/slog" 10 "os" 11 "regexp" 12 "runtime/debug" 13 "strings" 14 "sync" 15 "time" 16 17 "github.com/netdata/go.d.plugin/agent/netdataapi" 18 "github.com/netdata/go.d.plugin/agent/vnodes" 19 "github.com/netdata/go.d.plugin/logger" 20 ) 21 22 var obsoleteLock = &sync.Mutex{} 23 var obsoleteCharts = true 24 25 func DontObsoleteCharts() { 26 obsoleteLock.Lock() 27 obsoleteCharts = false 28 obsoleteLock.Unlock() 29 } 30 31 func shouldObsoleteCharts() bool { 32 obsoleteLock.Lock() 33 defer obsoleteLock.Unlock() 34 return obsoleteCharts 35 } 36 37 var reSpace = regexp.MustCompile(`\s+`) 38 39 var ndInternalMonitoringDisabled = os.Getenv("NETDATA_INTERNALS_MONITORING") == "NO" 40 41 func newRuntimeChart(pluginName string) *Chart { 42 // this is needed to keep the same name as we had before https://github.com/netdata/go.d.plugin/issues/650 43 ctxName := pluginName 44 if ctxName == "go.d" { 45 ctxName = "go" 46 } 47 ctxName = reSpace.ReplaceAllString(ctxName, "_") 48 return &Chart{ 49 typ: "netdata", 50 Title: "Execution time", 51 Units: "ms", 52 Fam: pluginName, 53 Ctx: fmt.Sprintf("netdata.%s_plugin_execution_time", ctxName), 54 Priority: 145000, 55 Dims: Dims{ 56 {ID: "time"}, 57 }, 58 } 59 } 60 61 type JobConfig struct { 62 PluginName string 63 Name string 64 ModuleName string 65 FullName string 66 Module Module 67 Labels map[string]string 68 Out io.Writer 69 UpdateEvery int 70 AutoDetectEvery int 71 Priority int 72 IsStock bool 73 74 VnodeGUID string 75 VnodeHostname string 76 VnodeLabels map[string]string 77 } 78 79 const ( 80 penaltyStep = 5 81 maxPenalty = 600 82 infTries = -1 83 ) 84 85 func NewJob(cfg JobConfig) *Job { 86 var buf bytes.Buffer 87 88 j := &Job{ 89 AutoDetectEvery: cfg.AutoDetectEvery, 90 AutoDetectTries: infTries, 91 92 pluginName: cfg.PluginName, 93 name: cfg.Name, 94 moduleName: cfg.ModuleName, 95 fullName: cfg.FullName, 96 updateEvery: cfg.UpdateEvery, 97 priority: cfg.Priority, 98 isStock: cfg.IsStock, 99 module: cfg.Module, 100 labels: cfg.Labels, 101 out: cfg.Out, 102 runChart: newRuntimeChart(cfg.PluginName), 103 stop: make(chan struct{}), 104 tick: make(chan int), 105 buf: &buf, 106 api: netdataapi.New(&buf), 107 108 vnodeGUID: cfg.VnodeGUID, 109 vnodeHostname: cfg.VnodeHostname, 110 vnodeLabels: cfg.VnodeLabels, 111 } 112 113 log := logger.New().With( 114 slog.String("collector", j.ModuleName()), 115 slog.String("job", j.Name()), 116 ) 117 118 j.Logger = log 119 if j.module != nil { 120 j.module.GetBase().Logger = log 121 } 122 123 return j 124 } 125 126 // Job represents a job. It's a module wrapper. 127 type Job struct { 128 pluginName string 129 name string 130 moduleName string 131 fullName string 132 133 updateEvery int 134 AutoDetectEvery int 135 AutoDetectTries int 136 priority int 137 labels map[string]string 138 139 *logger.Logger 140 141 isStock bool 142 143 module Module 144 145 initialized bool 146 panicked bool 147 148 runChart *Chart 149 charts *Charts 150 tick chan int 151 out io.Writer 152 buf *bytes.Buffer 153 api *netdataapi.API 154 155 retries int 156 prevRun time.Time 157 158 stop chan struct{} 159 160 vnodeCreated bool 161 vnodeGUID string 162 vnodeHostname string 163 vnodeLabels map[string]string 164 } 165 166 // NetdataChartIDMaxLength is the chart ID max length. See RRD_ID_LENGTH_MAX in the netdata source code. 167 const NetdataChartIDMaxLength = 1000 168 169 // FullName returns job full name. 170 func (j Job) FullName() string { 171 return j.fullName 172 } 173 174 // ModuleName returns job module name. 175 func (j Job) ModuleName() string { 176 return j.moduleName 177 } 178 179 // Name returns job name. 180 func (j Job) Name() string { 181 return j.name 182 } 183 184 // Panicked returns 'panicked' flag value. 185 func (j Job) Panicked() bool { 186 return j.panicked 187 } 188 189 // AutoDetectionEvery returns value of AutoDetectEvery. 190 func (j Job) AutoDetectionEvery() int { 191 return j.AutoDetectEvery 192 } 193 194 // RetryAutoDetection returns whether it is needed to retry autodetection. 195 func (j Job) RetryAutoDetection() bool { 196 return j.AutoDetectEvery > 0 && (j.AutoDetectTries == infTries || j.AutoDetectTries > 0) 197 } 198 199 // AutoDetection invokes init, check and postCheck. It handles panic. 200 func (j *Job) AutoDetection() (ok bool) { 201 defer func() { 202 if r := recover(); r != nil { 203 ok = false 204 j.panicked = true 205 j.disableAutoDetection() 206 207 j.Errorf("PANIC %v", r) 208 if logger.Level.Enabled(slog.LevelDebug) { 209 j.Errorf("STACK: %s", debug.Stack()) 210 } 211 } 212 if !ok { 213 j.module.Cleanup() 214 } 215 }() 216 217 if j.isStock { 218 j.Mute() 219 } 220 221 if ok = j.init(); !ok { 222 j.Error("init failed") 223 j.Unmute() 224 j.disableAutoDetection() 225 return 226 } 227 228 if ok = j.check(); !ok { 229 j.Error("check failed") 230 j.Unmute() 231 return 232 } 233 234 j.Unmute() 235 236 j.Info("check success") 237 if ok = j.postCheck(); !ok { 238 j.Error("postCheck failed") 239 j.disableAutoDetection() 240 return 241 } 242 243 return true 244 } 245 246 // Tick Tick. 247 func (j *Job) Tick(clock int) { 248 select { 249 case j.tick <- clock: 250 default: 251 j.Debug("skip the tick due to previous run hasn't been finished") 252 } 253 } 254 255 // Start starts job main loop. 256 func (j *Job) Start() { 257 j.Infof("started, data collection interval %ds", j.updateEvery) 258 defer func() { j.Info("stopped") }() 259 260 LOOP: 261 for { 262 select { 263 case <-j.stop: 264 break LOOP 265 case t := <-j.tick: 266 if t%(j.updateEvery+j.penalty()) == 0 { 267 j.runOnce() 268 } 269 } 270 } 271 j.module.Cleanup() 272 j.Cleanup() 273 j.stop <- struct{}{} 274 } 275 276 // Stop stops job main loop. It blocks until the job is stopped. 277 func (j *Job) Stop() { 278 // TODO: should have blocking and non blocking stop 279 j.stop <- struct{}{} 280 <-j.stop 281 } 282 283 func (j *Job) disableAutoDetection() { 284 j.AutoDetectEvery = 0 285 } 286 287 func (j *Job) Cleanup() { 288 j.buf.Reset() 289 if !shouldObsoleteCharts() { 290 return 291 } 292 293 if !vnodes.Disabled { 294 if !j.vnodeCreated && j.vnodeGUID != "" { 295 _ = j.api.HOSTINFO(j.vnodeGUID, j.vnodeHostname, j.vnodeLabels) 296 j.vnodeCreated = true 297 } 298 _ = j.api.HOST(j.vnodeGUID) 299 } 300 301 if j.runChart.created { 302 j.runChart.MarkRemove() 303 j.createChart(j.runChart) 304 } 305 if j.charts != nil { 306 for _, chart := range *j.charts { 307 if chart.created { 308 chart.MarkRemove() 309 j.createChart(chart) 310 } 311 } 312 } 313 314 if j.buf.Len() > 0 { 315 _, _ = io.Copy(j.out, j.buf) 316 } 317 } 318 319 func (j *Job) init() bool { 320 if j.initialized { 321 return true 322 } 323 324 j.initialized = j.module.Init() 325 326 return j.initialized 327 } 328 329 func (j *Job) check() bool { 330 ok := j.module.Check() 331 if !ok && j.AutoDetectTries != infTries { 332 j.AutoDetectTries-- 333 } 334 return ok 335 } 336 337 func (j *Job) postCheck() bool { 338 if j.charts = j.module.Charts(); j.charts == nil { 339 j.Error("nil charts") 340 return false 341 } 342 if err := checkCharts(*j.charts...); err != nil { 343 j.Errorf("charts check: %v", err) 344 return false 345 } 346 return true 347 } 348 349 func (j *Job) runOnce() { 350 curTime := time.Now() 351 sinceLastRun := calcSinceLastRun(curTime, j.prevRun) 352 j.prevRun = curTime 353 354 metrics := j.collect() 355 356 if j.panicked { 357 return 358 } 359 360 if j.processMetrics(metrics, curTime, sinceLastRun) { 361 j.retries = 0 362 } else { 363 j.retries++ 364 } 365 366 _, _ = io.Copy(j.out, j.buf) 367 j.buf.Reset() 368 } 369 370 func (j *Job) collect() (result map[string]int64) { 371 j.panicked = false 372 defer func() { 373 if r := recover(); r != nil { 374 j.panicked = true 375 j.Errorf("PANIC: %v", r) 376 if logger.Level.Enabled(slog.LevelDebug) { 377 j.Errorf("STACK: %s", debug.Stack()) 378 } 379 } 380 }() 381 return j.module.Collect() 382 } 383 384 func (j *Job) processMetrics(metrics map[string]int64, startTime time.Time, sinceLastRun int) bool { 385 if !vnodes.Disabled { 386 if !j.vnodeCreated && j.vnodeGUID != "" { 387 _ = j.api.HOSTINFO(j.vnodeGUID, j.vnodeHostname, j.vnodeLabels) 388 j.vnodeCreated = true 389 } 390 391 _ = j.api.HOST(j.vnodeGUID) 392 } 393 394 if !ndInternalMonitoringDisabled && !j.runChart.created { 395 j.runChart.ID = fmt.Sprintf("execution_time_of_%s", j.FullName()) 396 j.createChart(j.runChart) 397 } 398 399 elapsed := int64(durationTo(time.Since(startTime), time.Millisecond)) 400 401 var i, updated int 402 for _, chart := range *j.charts { 403 if !chart.created { 404 typeID := fmt.Sprintf("%s.%s", j.FullName(), chart.ID) 405 if len(typeID) >= NetdataChartIDMaxLength { 406 j.Warningf("chart 'type.id' length (%d) >= max allowed (%d), the chart is ignored (%s)", 407 len(typeID), NetdataChartIDMaxLength, typeID) 408 chart.ignore = true 409 } 410 j.createChart(chart) 411 } 412 if chart.remove { 413 continue 414 } 415 (*j.charts)[i] = chart 416 i++ 417 if len(metrics) == 0 || chart.Obsolete { 418 continue 419 } 420 if j.updateChart(chart, metrics, sinceLastRun) { 421 updated++ 422 } 423 } 424 *j.charts = (*j.charts)[:i] 425 426 if updated == 0 { 427 return false 428 } 429 if !ndInternalMonitoringDisabled { 430 j.updateChart(j.runChart, map[string]int64{"time": elapsed}, sinceLastRun) 431 } 432 433 return true 434 } 435 436 func (j *Job) createChart(chart *Chart) { 437 defer func() { chart.created = true }() 438 if chart.ignore { 439 return 440 } 441 442 if chart.Priority == 0 { 443 chart.Priority = j.priority 444 j.priority++ 445 } 446 _ = j.api.CHART( 447 getChartType(chart, j), 448 getChartID(chart), 449 chart.OverID, 450 chart.Title, 451 chart.Units, 452 chart.Fam, 453 chart.Ctx, 454 chart.Type.String(), 455 chart.Priority, 456 j.updateEvery, 457 chart.Opts.String(), 458 j.pluginName, 459 j.moduleName, 460 ) 461 462 if chart.Obsolete { 463 _ = j.api.EMPTYLINE() 464 return 465 } 466 467 seen := make(map[string]bool) 468 for _, l := range chart.Labels { 469 if l.Key != "" { 470 seen[l.Key] = true 471 ls := l.Source 472 // the default should be auto 473 // https://github.com/netdata/netdata/blob/cc2586de697702f86a3c34e60e23652dd4ddcb42/database/rrd.h#L205 474 if ls == 0 { 475 ls = LabelSourceAuto 476 } 477 _ = j.api.CLABEL(l.Key, l.Value, ls) 478 } 479 } 480 for k, v := range j.labels { 481 if !seen[k] { 482 _ = j.api.CLABEL(k, v, LabelSourceConf) 483 } 484 } 485 _ = j.api.CLABEL("_collect_job", j.Name(), LabelSourceAuto) 486 _ = j.api.CLABELCOMMIT() 487 488 for _, dim := range chart.Dims { 489 _ = j.api.DIMENSION( 490 firstNotEmpty(dim.Name, dim.ID), 491 dim.Name, 492 dim.Algo.String(), 493 handleZero(dim.Mul), 494 handleZero(dim.Div), 495 dim.DimOpts.String(), 496 ) 497 } 498 for _, v := range chart.Vars { 499 if v.Name != "" { 500 _ = j.api.VARIABLE(v.Name, v.Value) 501 } else { 502 _ = j.api.VARIABLE(v.ID, v.Value) 503 } 504 } 505 _ = j.api.EMPTYLINE() 506 } 507 508 func (j *Job) updateChart(chart *Chart, collected map[string]int64, sinceLastRun int) bool { 509 if chart.ignore { 510 dims := chart.Dims[:0] 511 for _, dim := range chart.Dims { 512 if !dim.remove { 513 dims = append(dims, dim) 514 } 515 } 516 chart.Dims = dims 517 return false 518 } 519 520 if !chart.updated { 521 sinceLastRun = 0 522 } 523 524 _ = j.api.BEGIN( 525 getChartType(chart, j), 526 getChartID(chart), 527 sinceLastRun, 528 ) 529 var i, updated int 530 for _, dim := range chart.Dims { 531 if dim.remove { 532 continue 533 } 534 chart.Dims[i] = dim 535 i++ 536 if v, ok := collected[dim.ID]; !ok { 537 _ = j.api.SETEMPTY(firstNotEmpty(dim.Name, dim.ID)) 538 } else { 539 _ = j.api.SET(firstNotEmpty(dim.Name, dim.ID), v) 540 updated++ 541 } 542 } 543 chart.Dims = chart.Dims[:i] 544 545 for _, vr := range chart.Vars { 546 if v, ok := collected[vr.ID]; ok { 547 if vr.Name != "" { 548 _ = j.api.VARIABLE(vr.Name, v) 549 } else { 550 _ = j.api.VARIABLE(vr.ID, v) 551 } 552 } 553 554 } 555 _ = j.api.END() 556 557 if chart.updated = updated > 0; chart.updated { 558 chart.Retries = 0 559 } else { 560 chart.Retries++ 561 } 562 return chart.updated 563 } 564 565 func (j Job) penalty() int { 566 v := j.retries / penaltyStep * penaltyStep * j.updateEvery / 2 567 if v > maxPenalty { 568 return maxPenalty 569 } 570 return v 571 } 572 573 func getChartType(chart *Chart, j *Job) string { 574 if chart.typ != "" { 575 return chart.typ 576 } 577 if !chart.IDSep { 578 chart.typ = j.FullName() 579 } else if i := strings.IndexByte(chart.ID, '.'); i != -1 { 580 chart.typ = j.FullName() + "_" + chart.ID[:i] 581 } else { 582 chart.typ = j.FullName() 583 } 584 if chart.OverModule != "" { 585 if v := strings.TrimPrefix(chart.typ, j.ModuleName()); v != chart.typ { 586 chart.typ = chart.OverModule + v 587 } 588 } 589 return chart.typ 590 } 591 592 func getChartID(chart *Chart) string { 593 if chart.id != "" { 594 return chart.id 595 } 596 if !chart.IDSep { 597 return chart.ID 598 } 599 if i := strings.IndexByte(chart.ID, '.'); i != -1 { 600 chart.id = chart.ID[i+1:] 601 } else { 602 chart.id = chart.ID 603 } 604 return chart.id 605 } 606 607 func calcSinceLastRun(curTime, prevRun time.Time) int { 608 if prevRun.IsZero() { 609 return 0 610 } 611 return int((curTime.UnixNano() - prevRun.UnixNano()) / 1000) 612 } 613 614 func durationTo(duration time.Duration, to time.Duration) int { 615 return int(int64(duration) / (int64(to) / int64(time.Nanosecond))) 616 } 617 618 func firstNotEmpty(val1, val2 string) string { 619 if val1 != "" { 620 return val1 621 } 622 return val2 623 } 624 625 func handleZero(v int) int { 626 if v == 0 { 627 return 1 628 } 629 return v 630 }