github.com/netdata/go.d.plugin@v0.58.1/agent/module/job.go (about)

     1  // SPDX-License-Identifier: GPL-3.0-or-later
     2  
     3  package module
     4  
     5  import (
     6  	"bytes"
     7  	"fmt"
     8  	"io"
     9  	"log/slog"
    10  	"os"
    11  	"regexp"
    12  	"runtime/debug"
    13  	"strings"
    14  	"sync"
    15  	"time"
    16  
    17  	"github.com/netdata/go.d.plugin/agent/netdataapi"
    18  	"github.com/netdata/go.d.plugin/agent/vnodes"
    19  	"github.com/netdata/go.d.plugin/logger"
    20  )
    21  
    22  var obsoleteLock = &sync.Mutex{}
    23  var obsoleteCharts = true
    24  
    25  func DontObsoleteCharts() {
    26  	obsoleteLock.Lock()
    27  	obsoleteCharts = false
    28  	obsoleteLock.Unlock()
    29  }
    30  
    31  func shouldObsoleteCharts() bool {
    32  	obsoleteLock.Lock()
    33  	defer obsoleteLock.Unlock()
    34  	return obsoleteCharts
    35  }
    36  
    37  var reSpace = regexp.MustCompile(`\s+`)
    38  
    39  var ndInternalMonitoringDisabled = os.Getenv("NETDATA_INTERNALS_MONITORING") == "NO"
    40  
    41  func newRuntimeChart(pluginName string) *Chart {
    42  	// this is needed to keep the same name as we had before https://github.com/netdata/go.d.plugin/issues/650
    43  	ctxName := pluginName
    44  	if ctxName == "go.d" {
    45  		ctxName = "go"
    46  	}
    47  	ctxName = reSpace.ReplaceAllString(ctxName, "_")
    48  	return &Chart{
    49  		typ:      "netdata",
    50  		Title:    "Execution time",
    51  		Units:    "ms",
    52  		Fam:      pluginName,
    53  		Ctx:      fmt.Sprintf("netdata.%s_plugin_execution_time", ctxName),
    54  		Priority: 145000,
    55  		Dims: Dims{
    56  			{ID: "time"},
    57  		},
    58  	}
    59  }
    60  
    61  type JobConfig struct {
    62  	PluginName      string
    63  	Name            string
    64  	ModuleName      string
    65  	FullName        string
    66  	Module          Module
    67  	Labels          map[string]string
    68  	Out             io.Writer
    69  	UpdateEvery     int
    70  	AutoDetectEvery int
    71  	Priority        int
    72  	IsStock         bool
    73  
    74  	VnodeGUID     string
    75  	VnodeHostname string
    76  	VnodeLabels   map[string]string
    77  }
    78  
    79  const (
    80  	penaltyStep = 5
    81  	maxPenalty  = 600
    82  	infTries    = -1
    83  )
    84  
    85  func NewJob(cfg JobConfig) *Job {
    86  	var buf bytes.Buffer
    87  
    88  	j := &Job{
    89  		AutoDetectEvery: cfg.AutoDetectEvery,
    90  		AutoDetectTries: infTries,
    91  
    92  		pluginName:  cfg.PluginName,
    93  		name:        cfg.Name,
    94  		moduleName:  cfg.ModuleName,
    95  		fullName:    cfg.FullName,
    96  		updateEvery: cfg.UpdateEvery,
    97  		priority:    cfg.Priority,
    98  		isStock:     cfg.IsStock,
    99  		module:      cfg.Module,
   100  		labels:      cfg.Labels,
   101  		out:         cfg.Out,
   102  		runChart:    newRuntimeChart(cfg.PluginName),
   103  		stop:        make(chan struct{}),
   104  		tick:        make(chan int),
   105  		buf:         &buf,
   106  		api:         netdataapi.New(&buf),
   107  
   108  		vnodeGUID:     cfg.VnodeGUID,
   109  		vnodeHostname: cfg.VnodeHostname,
   110  		vnodeLabels:   cfg.VnodeLabels,
   111  	}
   112  
   113  	log := logger.New().With(
   114  		slog.String("collector", j.ModuleName()),
   115  		slog.String("job", j.Name()),
   116  	)
   117  
   118  	j.Logger = log
   119  	if j.module != nil {
   120  		j.module.GetBase().Logger = log
   121  	}
   122  
   123  	return j
   124  }
   125  
   126  // Job represents a job. It's a module wrapper.
   127  type Job struct {
   128  	pluginName string
   129  	name       string
   130  	moduleName string
   131  	fullName   string
   132  
   133  	updateEvery     int
   134  	AutoDetectEvery int
   135  	AutoDetectTries int
   136  	priority        int
   137  	labels          map[string]string
   138  
   139  	*logger.Logger
   140  
   141  	isStock bool
   142  
   143  	module Module
   144  
   145  	initialized bool
   146  	panicked    bool
   147  
   148  	runChart *Chart
   149  	charts   *Charts
   150  	tick     chan int
   151  	out      io.Writer
   152  	buf      *bytes.Buffer
   153  	api      *netdataapi.API
   154  
   155  	retries int
   156  	prevRun time.Time
   157  
   158  	stop chan struct{}
   159  
   160  	vnodeCreated  bool
   161  	vnodeGUID     string
   162  	vnodeHostname string
   163  	vnodeLabels   map[string]string
   164  }
   165  
   166  // NetdataChartIDMaxLength is the chart ID max length. See RRD_ID_LENGTH_MAX in the netdata source code.
   167  const NetdataChartIDMaxLength = 1000
   168  
   169  // FullName returns job full name.
   170  func (j Job) FullName() string {
   171  	return j.fullName
   172  }
   173  
   174  // ModuleName returns job module name.
   175  func (j Job) ModuleName() string {
   176  	return j.moduleName
   177  }
   178  
   179  // Name returns job name.
   180  func (j Job) Name() string {
   181  	return j.name
   182  }
   183  
   184  // Panicked returns 'panicked' flag value.
   185  func (j Job) Panicked() bool {
   186  	return j.panicked
   187  }
   188  
   189  // AutoDetectionEvery returns value of AutoDetectEvery.
   190  func (j Job) AutoDetectionEvery() int {
   191  	return j.AutoDetectEvery
   192  }
   193  
   194  // RetryAutoDetection returns whether it is needed to retry autodetection.
   195  func (j Job) RetryAutoDetection() bool {
   196  	return j.AutoDetectEvery > 0 && (j.AutoDetectTries == infTries || j.AutoDetectTries > 0)
   197  }
   198  
   199  // AutoDetection invokes init, check and postCheck. It handles panic.
   200  func (j *Job) AutoDetection() (ok bool) {
   201  	defer func() {
   202  		if r := recover(); r != nil {
   203  			ok = false
   204  			j.panicked = true
   205  			j.disableAutoDetection()
   206  
   207  			j.Errorf("PANIC %v", r)
   208  			if logger.Level.Enabled(slog.LevelDebug) {
   209  				j.Errorf("STACK: %s", debug.Stack())
   210  			}
   211  		}
   212  		if !ok {
   213  			j.module.Cleanup()
   214  		}
   215  	}()
   216  
   217  	if j.isStock {
   218  		j.Mute()
   219  	}
   220  
   221  	if ok = j.init(); !ok {
   222  		j.Error("init failed")
   223  		j.Unmute()
   224  		j.disableAutoDetection()
   225  		return
   226  	}
   227  
   228  	if ok = j.check(); !ok {
   229  		j.Error("check failed")
   230  		j.Unmute()
   231  		return
   232  	}
   233  
   234  	j.Unmute()
   235  
   236  	j.Info("check success")
   237  	if ok = j.postCheck(); !ok {
   238  		j.Error("postCheck failed")
   239  		j.disableAutoDetection()
   240  		return
   241  	}
   242  
   243  	return true
   244  }
   245  
   246  // Tick Tick.
   247  func (j *Job) Tick(clock int) {
   248  	select {
   249  	case j.tick <- clock:
   250  	default:
   251  		j.Debug("skip the tick due to previous run hasn't been finished")
   252  	}
   253  }
   254  
   255  // Start starts job main loop.
   256  func (j *Job) Start() {
   257  	j.Infof("started, data collection interval %ds", j.updateEvery)
   258  	defer func() { j.Info("stopped") }()
   259  
   260  LOOP:
   261  	for {
   262  		select {
   263  		case <-j.stop:
   264  			break LOOP
   265  		case t := <-j.tick:
   266  			if t%(j.updateEvery+j.penalty()) == 0 {
   267  				j.runOnce()
   268  			}
   269  		}
   270  	}
   271  	j.module.Cleanup()
   272  	j.Cleanup()
   273  	j.stop <- struct{}{}
   274  }
   275  
   276  // Stop stops job main loop. It blocks until the job is stopped.
   277  func (j *Job) Stop() {
   278  	// TODO: should have blocking and non blocking stop
   279  	j.stop <- struct{}{}
   280  	<-j.stop
   281  }
   282  
   283  func (j *Job) disableAutoDetection() {
   284  	j.AutoDetectEvery = 0
   285  }
   286  
   287  func (j *Job) Cleanup() {
   288  	j.buf.Reset()
   289  	if !shouldObsoleteCharts() {
   290  		return
   291  	}
   292  
   293  	if !vnodes.Disabled {
   294  		if !j.vnodeCreated && j.vnodeGUID != "" {
   295  			_ = j.api.HOSTINFO(j.vnodeGUID, j.vnodeHostname, j.vnodeLabels)
   296  			j.vnodeCreated = true
   297  		}
   298  		_ = j.api.HOST(j.vnodeGUID)
   299  	}
   300  
   301  	if j.runChart.created {
   302  		j.runChart.MarkRemove()
   303  		j.createChart(j.runChart)
   304  	}
   305  	if j.charts != nil {
   306  		for _, chart := range *j.charts {
   307  			if chart.created {
   308  				chart.MarkRemove()
   309  				j.createChart(chart)
   310  			}
   311  		}
   312  	}
   313  
   314  	if j.buf.Len() > 0 {
   315  		_, _ = io.Copy(j.out, j.buf)
   316  	}
   317  }
   318  
   319  func (j *Job) init() bool {
   320  	if j.initialized {
   321  		return true
   322  	}
   323  
   324  	j.initialized = j.module.Init()
   325  
   326  	return j.initialized
   327  }
   328  
   329  func (j *Job) check() bool {
   330  	ok := j.module.Check()
   331  	if !ok && j.AutoDetectTries != infTries {
   332  		j.AutoDetectTries--
   333  	}
   334  	return ok
   335  }
   336  
   337  func (j *Job) postCheck() bool {
   338  	if j.charts = j.module.Charts(); j.charts == nil {
   339  		j.Error("nil charts")
   340  		return false
   341  	}
   342  	if err := checkCharts(*j.charts...); err != nil {
   343  		j.Errorf("charts check: %v", err)
   344  		return false
   345  	}
   346  	return true
   347  }
   348  
   349  func (j *Job) runOnce() {
   350  	curTime := time.Now()
   351  	sinceLastRun := calcSinceLastRun(curTime, j.prevRun)
   352  	j.prevRun = curTime
   353  
   354  	metrics := j.collect()
   355  
   356  	if j.panicked {
   357  		return
   358  	}
   359  
   360  	if j.processMetrics(metrics, curTime, sinceLastRun) {
   361  		j.retries = 0
   362  	} else {
   363  		j.retries++
   364  	}
   365  
   366  	_, _ = io.Copy(j.out, j.buf)
   367  	j.buf.Reset()
   368  }
   369  
   370  func (j *Job) collect() (result map[string]int64) {
   371  	j.panicked = false
   372  	defer func() {
   373  		if r := recover(); r != nil {
   374  			j.panicked = true
   375  			j.Errorf("PANIC: %v", r)
   376  			if logger.Level.Enabled(slog.LevelDebug) {
   377  				j.Errorf("STACK: %s", debug.Stack())
   378  			}
   379  		}
   380  	}()
   381  	return j.module.Collect()
   382  }
   383  
   384  func (j *Job) processMetrics(metrics map[string]int64, startTime time.Time, sinceLastRun int) bool {
   385  	if !vnodes.Disabled {
   386  		if !j.vnodeCreated && j.vnodeGUID != "" {
   387  			_ = j.api.HOSTINFO(j.vnodeGUID, j.vnodeHostname, j.vnodeLabels)
   388  			j.vnodeCreated = true
   389  		}
   390  
   391  		_ = j.api.HOST(j.vnodeGUID)
   392  	}
   393  
   394  	if !ndInternalMonitoringDisabled && !j.runChart.created {
   395  		j.runChart.ID = fmt.Sprintf("execution_time_of_%s", j.FullName())
   396  		j.createChart(j.runChart)
   397  	}
   398  
   399  	elapsed := int64(durationTo(time.Since(startTime), time.Millisecond))
   400  
   401  	var i, updated int
   402  	for _, chart := range *j.charts {
   403  		if !chart.created {
   404  			typeID := fmt.Sprintf("%s.%s", j.FullName(), chart.ID)
   405  			if len(typeID) >= NetdataChartIDMaxLength {
   406  				j.Warningf("chart 'type.id' length (%d) >= max allowed (%d), the chart is ignored (%s)",
   407  					len(typeID), NetdataChartIDMaxLength, typeID)
   408  				chart.ignore = true
   409  			}
   410  			j.createChart(chart)
   411  		}
   412  		if chart.remove {
   413  			continue
   414  		}
   415  		(*j.charts)[i] = chart
   416  		i++
   417  		if len(metrics) == 0 || chart.Obsolete {
   418  			continue
   419  		}
   420  		if j.updateChart(chart, metrics, sinceLastRun) {
   421  			updated++
   422  		}
   423  	}
   424  	*j.charts = (*j.charts)[:i]
   425  
   426  	if updated == 0 {
   427  		return false
   428  	}
   429  	if !ndInternalMonitoringDisabled {
   430  		j.updateChart(j.runChart, map[string]int64{"time": elapsed}, sinceLastRun)
   431  	}
   432  
   433  	return true
   434  }
   435  
   436  func (j *Job) createChart(chart *Chart) {
   437  	defer func() { chart.created = true }()
   438  	if chart.ignore {
   439  		return
   440  	}
   441  
   442  	if chart.Priority == 0 {
   443  		chart.Priority = j.priority
   444  		j.priority++
   445  	}
   446  	_ = j.api.CHART(
   447  		getChartType(chart, j),
   448  		getChartID(chart),
   449  		chart.OverID,
   450  		chart.Title,
   451  		chart.Units,
   452  		chart.Fam,
   453  		chart.Ctx,
   454  		chart.Type.String(),
   455  		chart.Priority,
   456  		j.updateEvery,
   457  		chart.Opts.String(),
   458  		j.pluginName,
   459  		j.moduleName,
   460  	)
   461  
   462  	if chart.Obsolete {
   463  		_ = j.api.EMPTYLINE()
   464  		return
   465  	}
   466  
   467  	seen := make(map[string]bool)
   468  	for _, l := range chart.Labels {
   469  		if l.Key != "" {
   470  			seen[l.Key] = true
   471  			ls := l.Source
   472  			// the default should be auto
   473  			// https://github.com/netdata/netdata/blob/cc2586de697702f86a3c34e60e23652dd4ddcb42/database/rrd.h#L205
   474  			if ls == 0 {
   475  				ls = LabelSourceAuto
   476  			}
   477  			_ = j.api.CLABEL(l.Key, l.Value, ls)
   478  		}
   479  	}
   480  	for k, v := range j.labels {
   481  		if !seen[k] {
   482  			_ = j.api.CLABEL(k, v, LabelSourceConf)
   483  		}
   484  	}
   485  	_ = j.api.CLABEL("_collect_job", j.Name(), LabelSourceAuto)
   486  	_ = j.api.CLABELCOMMIT()
   487  
   488  	for _, dim := range chart.Dims {
   489  		_ = j.api.DIMENSION(
   490  			firstNotEmpty(dim.Name, dim.ID),
   491  			dim.Name,
   492  			dim.Algo.String(),
   493  			handleZero(dim.Mul),
   494  			handleZero(dim.Div),
   495  			dim.DimOpts.String(),
   496  		)
   497  	}
   498  	for _, v := range chart.Vars {
   499  		if v.Name != "" {
   500  			_ = j.api.VARIABLE(v.Name, v.Value)
   501  		} else {
   502  			_ = j.api.VARIABLE(v.ID, v.Value)
   503  		}
   504  	}
   505  	_ = j.api.EMPTYLINE()
   506  }
   507  
   508  func (j *Job) updateChart(chart *Chart, collected map[string]int64, sinceLastRun int) bool {
   509  	if chart.ignore {
   510  		dims := chart.Dims[:0]
   511  		for _, dim := range chart.Dims {
   512  			if !dim.remove {
   513  				dims = append(dims, dim)
   514  			}
   515  		}
   516  		chart.Dims = dims
   517  		return false
   518  	}
   519  
   520  	if !chart.updated {
   521  		sinceLastRun = 0
   522  	}
   523  
   524  	_ = j.api.BEGIN(
   525  		getChartType(chart, j),
   526  		getChartID(chart),
   527  		sinceLastRun,
   528  	)
   529  	var i, updated int
   530  	for _, dim := range chart.Dims {
   531  		if dim.remove {
   532  			continue
   533  		}
   534  		chart.Dims[i] = dim
   535  		i++
   536  		if v, ok := collected[dim.ID]; !ok {
   537  			_ = j.api.SETEMPTY(firstNotEmpty(dim.Name, dim.ID))
   538  		} else {
   539  			_ = j.api.SET(firstNotEmpty(dim.Name, dim.ID), v)
   540  			updated++
   541  		}
   542  	}
   543  	chart.Dims = chart.Dims[:i]
   544  
   545  	for _, vr := range chart.Vars {
   546  		if v, ok := collected[vr.ID]; ok {
   547  			if vr.Name != "" {
   548  				_ = j.api.VARIABLE(vr.Name, v)
   549  			} else {
   550  				_ = j.api.VARIABLE(vr.ID, v)
   551  			}
   552  		}
   553  
   554  	}
   555  	_ = j.api.END()
   556  
   557  	if chart.updated = updated > 0; chart.updated {
   558  		chart.Retries = 0
   559  	} else {
   560  		chart.Retries++
   561  	}
   562  	return chart.updated
   563  }
   564  
   565  func (j Job) penalty() int {
   566  	v := j.retries / penaltyStep * penaltyStep * j.updateEvery / 2
   567  	if v > maxPenalty {
   568  		return maxPenalty
   569  	}
   570  	return v
   571  }
   572  
   573  func getChartType(chart *Chart, j *Job) string {
   574  	if chart.typ != "" {
   575  		return chart.typ
   576  	}
   577  	if !chart.IDSep {
   578  		chart.typ = j.FullName()
   579  	} else if i := strings.IndexByte(chart.ID, '.'); i != -1 {
   580  		chart.typ = j.FullName() + "_" + chart.ID[:i]
   581  	} else {
   582  		chart.typ = j.FullName()
   583  	}
   584  	if chart.OverModule != "" {
   585  		if v := strings.TrimPrefix(chart.typ, j.ModuleName()); v != chart.typ {
   586  			chart.typ = chart.OverModule + v
   587  		}
   588  	}
   589  	return chart.typ
   590  }
   591  
   592  func getChartID(chart *Chart) string {
   593  	if chart.id != "" {
   594  		return chart.id
   595  	}
   596  	if !chart.IDSep {
   597  		return chart.ID
   598  	}
   599  	if i := strings.IndexByte(chart.ID, '.'); i != -1 {
   600  		chart.id = chart.ID[i+1:]
   601  	} else {
   602  		chart.id = chart.ID
   603  	}
   604  	return chart.id
   605  }
   606  
   607  func calcSinceLastRun(curTime, prevRun time.Time) int {
   608  	if prevRun.IsZero() {
   609  		return 0
   610  	}
   611  	return int((curTime.UnixNano() - prevRun.UnixNano()) / 1000)
   612  }
   613  
   614  func durationTo(duration time.Duration, to time.Duration) int {
   615  	return int(int64(duration) / (int64(to) / int64(time.Nanosecond)))
   616  }
   617  
   618  func firstNotEmpty(val1, val2 string) string {
   619  	if val1 != "" {
   620  		return val1
   621  	}
   622  	return val2
   623  }
   624  
   625  func handleZero(v int) int {
   626  	if v == 0 {
   627  		return 1
   628  	}
   629  	return v
   630  }