github.com/netdata/go.d.plugin@v0.58.1/agent/jobmgr/manager.go (about)

     1  // SPDX-License-Identifier: GPL-3.0-or-later
     2  
     3  package jobmgr
     4  
     5  import (
     6  	"context"
     7  	"fmt"
     8  	"io"
     9  	"log/slog"
    10  	"os"
    11  	"strings"
    12  	"sync"
    13  	"time"
    14  
    15  	"github.com/netdata/go.d.plugin/agent/confgroup"
    16  	"github.com/netdata/go.d.plugin/agent/module"
    17  	"github.com/netdata/go.d.plugin/logger"
    18  
    19  	"gopkg.in/yaml.v2"
    20  )
    21  
// Job is the runtime representation of a single data collection job.
// The Manager drives the full lifecycle: detection (AutoDetection /
// RetryAutoDetection), starting, ticking, stopping and cleanup.
type Job interface {
	// Name returns the job name (presumably the config 'name' — confirm in module package).
	Name() string
	// ModuleName returns the name of the module this job collects for.
	ModuleName() string
	// FullName returns the unique job identifier; the Manager uses it as the
	// file-lock key and as the running-jobs cache key.
	FullName() string
	// AutoDetection reports whether the job's check/detection succeeded
	// (see detection(), which maps it to jobStatusRunning).
	AutoDetection() bool
	// AutoDetectionEvery returns the retry interval in seconds; 0 means
	// "no retry configured" (see addConfig's recovery-settings logic).
	AutoDetectionEvery() int
	// RetryAutoDetection reports whether a failed detection should be retried later.
	RetryAutoDetection() bool
	// Tick advances the job's internal clock — NOTE(review): driver not visible
	// in this file, presumably called from runRunningJobsHandling.
	Tick(clock int)
	// Start begins data collection.
	Start()
	// Stop terminates data collection.
	Stop()
	// Cleanup releases job resources; called by the Manager when a created
	// job is not started (see addConfig's cleanupJob defer).
	Cleanup()
}
    34  
// jobStatus is a human-readable job lifecycle state. It is persisted via
// StatusSaver.Save and queried back via StatusStore.Contains. A string alias
// (not a defined type) so values interoperate directly with string APIs.
type jobStatus = string

const (
	jobStatusRunning          jobStatus = "running"                    // Check() succeeded
	jobStatusRetrying         jobStatus = "retrying"                   // Check() failed, but we need keep trying auto-detection
	jobStatusStoppedFailed    jobStatus = "stopped_failed"             // Check() failed
	jobStatusStoppedDupLocal  jobStatus = "stopped_duplicate_local"    // a job with the same FullName is running
	jobStatusStoppedDupGlobal jobStatus = "stopped_duplicate_global"   // a job with the same FullName is registered by another plugin
	jobStatusStoppedRegErr    jobStatus = "stopped_registration_error" // an error during registration (only 'too many open files')
	jobStatusStoppedCreateErr jobStatus = "stopped_creation_error"     // an error during creation (yaml unmarshal)
)
    46  
    47  func NewManager() *Manager {
    48  	np := noop{}
    49  	mgr := &Manager{
    50  		Logger: logger.New().With(
    51  			slog.String("component", "job manager"),
    52  		),
    53  		Out:         io.Discard,
    54  		FileLock:    np,
    55  		StatusSaver: np,
    56  		StatusStore: np,
    57  		Vnodes:      np,
    58  		Dyncfg:      np,
    59  
    60  		confGroupCache: confgroup.NewCache(),
    61  
    62  		runningJobs:  newRunningJobsCache(),
    63  		retryingJobs: newRetryingJobsCache(),
    64  
    65  		addCh:    make(chan confgroup.Config),
    66  		removeCh: make(chan confgroup.Config),
    67  	}
    68  
    69  	return mgr
    70  }
    71  
// Manager owns the lifecycle of data collection jobs: it consumes config
// groups, creates/starts/retries jobs, deduplicates them locally (runningJobs)
// and across plugins (FileLock), and reports their status.
type Manager struct {
	*logger.Logger

	// Collaborators supplied by the caller; NewManager sets no-op defaults
	// for the interfaces below and io.Discard for Out.
	PluginName string
	Out        io.Writer
	Modules    module.Registry

	FileLock    FileLocker
	StatusSaver StatusSaver
	StatusStore StatusStore
	Vnodes      Vnodes
	Dyncfg      Dyncfg

	// Internal caches: confGroupCache diffs incoming config groups into
	// add/remove sets; runningJobs is keyed by FullName; retryingJobs holds
	// pending retry tasks with their cancel funcs.
	confGroupCache *confgroup.Cache
	runningJobs    *runningJobsCache
	retryingJobs   *retryingJobsCache

	// Unbuffered channels feeding runConfigsHandling.
	addCh    chan confgroup.Config
	removeCh chan confgroup.Config

	// queueMux guards queue — NOTE(review): both appear unused in this file;
	// presumably used by runRunningJobsHandling / startJob elsewhere.
	queueMux sync.Mutex
	queue    []Job
}
    95  
    96  func (m *Manager) Run(ctx context.Context, in chan []*confgroup.Group) {
    97  	m.Info("instance is started")
    98  	defer func() { m.cleanup(); m.Info("instance is stopped") }()
    99  
   100  	var wg sync.WaitGroup
   101  
   102  	wg.Add(1)
   103  	go func() { defer wg.Done(); m.runConfigGroupsHandling(ctx, in) }()
   104  
   105  	wg.Add(1)
   106  	go func() { defer wg.Done(); m.runConfigsHandling(ctx) }()
   107  
   108  	wg.Add(1)
   109  	go func() { defer wg.Done(); m.runRunningJobsHandling(ctx) }()
   110  
   111  	wg.Wait()
   112  	<-ctx.Done()
   113  }
   114  
   115  func (m *Manager) runConfigGroupsHandling(ctx context.Context, in chan []*confgroup.Group) {
   116  	for {
   117  		select {
   118  		case <-ctx.Done():
   119  			return
   120  		case groups := <-in:
   121  			for _, gr := range groups {
   122  				select {
   123  				case <-ctx.Done():
   124  					return
   125  				default:
   126  					a, r := m.confGroupCache.Add(gr)
   127  					m.Debugf("received config group ('%s'): %d jobs (added: %d, removed: %d)", gr.Source, len(gr.Configs), len(a), len(r))
   128  					sendConfigs(ctx, m.removeCh, r)
   129  					sendConfigs(ctx, m.addCh, a)
   130  				}
   131  			}
   132  		}
   133  	}
   134  }
   135  
   136  func (m *Manager) runConfigsHandling(ctx context.Context) {
   137  	for {
   138  		select {
   139  		case <-ctx.Done():
   140  			return
   141  		case cfg := <-m.addCh:
   142  			m.addConfig(ctx, cfg)
   143  		case cfg := <-m.removeCh:
   144  			m.removeConfig(cfg)
   145  		}
   146  	}
   147  }
   148  
   149  func (m *Manager) cleanup() {
   150  	for _, task := range *m.retryingJobs {
   151  		task.cancel()
   152  	}
   153  	for name := range *m.runningJobs {
   154  		_ = m.FileLock.Unlock(name)
   155  	}
   156  	// TODO: m.Dyncfg.Register() ?
   157  	m.stopRunningJobs()
   158  }
   159  
// addConfig handles a single config addition (or a scheduled detection
// retry): it creates the job, runs detection, and either starts the job,
// schedules a retry, or records a terminal failure status.
func (m *Manager) addConfig(ctx context.Context, cfg confgroup.Config) {
	task, isRetry := m.retryingJobs.lookup(cfg)
	if isRetry {
		// This add was re-sent by a retry task: drop the pending task so it
		// cannot fire again while we re-run detection.
		task.cancel()
		m.retryingJobs.remove(cfg)
	} else {
		// First time this config is seen: expose it via dyncfg.
		m.Dyncfg.Register(cfg)
	}

	// Local dedup: another job with the same FullName is already running
	// inside this plugin.
	if m.runningJobs.has(cfg) {
		m.Infof("%s[%s] job is being served by another job, skipping it", cfg.Module(), cfg.Name())
		m.StatusSaver.Save(cfg, jobStatusStoppedDupLocal)
		m.Dyncfg.UpdateStatus(cfg, "error", "duplicate, served by another job")
		return
	}

	job, err := m.createJob(cfg)
	if err != nil {
		m.Warningf("couldn't create %s[%s]: %v", cfg.Module(), cfg.Name(), err)
		m.StatusSaver.Save(cfg, jobStatusStoppedCreateErr)
		m.Dyncfg.UpdateStatus(cfg, "error", fmt.Sprintf("build error: %s", err))
		return
	}

	// Release job resources on every exit path except a successful start,
	// which flips cleanupJob to false below.
	cleanupJob := true
	defer func() {
		if cleanupJob {
			job.Cleanup()
		}
	}()

	if isRetry {
		// Carry over the remaining retry budget from the cancelled task.
		job.AutoDetectEvery = task.timeout
		job.AutoDetectTries = task.retries
	} else if job.AutoDetectionEvery() == 0 {
		// No retry configured: grant a recovery retry budget in situations
		// where the target is likely to come back (plugin restart, k8s pod churn).
		switch {
		case m.StatusStore.Contains(cfg, jobStatusRunning, jobStatusRetrying):
			m.Infof("%s[%s] job last status is running/retrying, applying recovering settings", cfg.Module(), cfg.Name())
			job.AutoDetectEvery = 30
			job.AutoDetectTries = 11
		case isInsideK8sCluster() && cfg.Provider() == "file watcher":
			m.Infof("%s[%s] is k8s job, applying recovering settings", cfg.Module(), cfg.Name())
			job.AutoDetectEvery = 10
			job.AutoDetectTries = 7
		}
	}

	switch detection(job) {
	case jobStatusRunning:
		// NOTE: && binds tighter than ||, so this condition reads
		// ok || (err != nil && !isTooManyOpenFiles(err)): the job is started
		// when the cross-plugin lock was acquired, and also (best-effort)
		// when locking failed with any error other than 'too many open files'.
		if ok, err := m.FileLock.Lock(cfg.FullName()); ok || err != nil && !isTooManyOpenFiles(err) {
			cleanupJob = false
			m.runningJobs.put(cfg)
			m.StatusSaver.Save(cfg, jobStatusRunning)
			m.Dyncfg.UpdateStatus(cfg, "running", "")
			m.startJob(job)
		} else if isTooManyOpenFiles(err) {
			m.Error(err)
			m.StatusSaver.Save(cfg, jobStatusStoppedRegErr)
			m.Dyncfg.UpdateStatus(cfg, "error", "too many open files")
		} else {
			// Lock not acquired, no error: held by another plugin process.
			m.Infof("%s[%s] job is being served by another plugin, skipping it", cfg.Module(), cfg.Name())
			m.StatusSaver.Save(cfg, jobStatusStoppedDupGlobal)
			m.Dyncfg.UpdateStatus(cfg, "error", "duplicate, served by another plugin")
		}
	case jobStatusRetrying:
		m.Infof("%s[%s] job detection failed, will retry in %d seconds", cfg.Module(), cfg.Name(), job.AutoDetectionEvery())
		// Child context so removeConfig/cleanup can cancel the pending retry.
		ctx, cancel := context.WithCancel(ctx)
		m.retryingJobs.put(cfg, retryTask{
			cancel:  cancel,
			timeout: job.AutoDetectionEvery(),
			retries: job.AutoDetectTries,
		})
		// The retry task re-sends cfg on addCh after the timeout, which
		// lands back in this function with isRetry == true.
		go runRetryTask(ctx, m.addCh, cfg, time.Second*time.Duration(job.AutoDetectionEvery()))
		m.StatusSaver.Save(cfg, jobStatusRetrying)
		m.Dyncfg.UpdateStatus(cfg, "error", "job detection failed, will retry later")
	case jobStatusStoppedFailed:
		m.StatusSaver.Save(cfg, jobStatusStoppedFailed)
		m.Dyncfg.UpdateStatus(cfg, "error", "job detection failed, stopping it")
	default:
		m.Warningf("%s[%s] job detection: unknown state", cfg.Module(), cfg.Name())
	}
}
   242  
   243  func (m *Manager) removeConfig(cfg confgroup.Config) {
   244  	if m.runningJobs.has(cfg) {
   245  		m.stopJob(cfg.FullName())
   246  		_ = m.FileLock.Unlock(cfg.FullName())
   247  		m.runningJobs.remove(cfg)
   248  	}
   249  
   250  	if task, ok := m.retryingJobs.lookup(cfg); ok {
   251  		task.cancel()
   252  		m.retryingJobs.remove(cfg)
   253  	}
   254  
   255  	m.StatusSaver.Remove(cfg)
   256  	m.Dyncfg.Unregister(cfg)
   257  }
   258  
   259  func (m *Manager) createJob(cfg confgroup.Config) (*module.Job, error) {
   260  	creator, ok := m.Modules[cfg.Module()]
   261  	if !ok {
   262  		return nil, fmt.Errorf("can not find %s module", cfg.Module())
   263  	}
   264  
   265  	m.Debugf("creating %s[%s] job, config: %v", cfg.Module(), cfg.Name(), cfg)
   266  
   267  	mod := creator.Create()
   268  	if err := unmarshal(cfg, mod); err != nil {
   269  		return nil, err
   270  	}
   271  
   272  	labels := make(map[string]string)
   273  	for name, value := range cfg.Labels() {
   274  		n, ok1 := name.(string)
   275  		v, ok2 := value.(string)
   276  		if ok1 && ok2 {
   277  			labels[n] = v
   278  		}
   279  	}
   280  
   281  	jobCfg := module.JobConfig{
   282  		PluginName:      m.PluginName,
   283  		Name:            cfg.Name(),
   284  		ModuleName:      cfg.Module(),
   285  		FullName:        cfg.FullName(),
   286  		UpdateEvery:     cfg.UpdateEvery(),
   287  		AutoDetectEvery: cfg.AutoDetectionRetry(),
   288  		Priority:        cfg.Priority(),
   289  		Labels:          labels,
   290  		IsStock:         isStockConfig(cfg),
   291  		Module:          mod,
   292  		Out:             m.Out,
   293  	}
   294  
   295  	if cfg.Vnode() != "" {
   296  		n, ok := m.Vnodes.Lookup(cfg.Vnode())
   297  		if !ok {
   298  			return nil, fmt.Errorf("vnode '%s' is not found", cfg.Vnode())
   299  		}
   300  
   301  		jobCfg.VnodeGUID = n.GUID
   302  		jobCfg.VnodeHostname = n.Hostname
   303  		jobCfg.VnodeLabels = n.Labels
   304  	}
   305  
   306  	job := module.NewJob(jobCfg)
   307  
   308  	return job, nil
   309  }
   310  
   311  func detection(job Job) jobStatus {
   312  	if !job.AutoDetection() {
   313  		if job.RetryAutoDetection() {
   314  			return jobStatusRetrying
   315  		} else {
   316  			return jobStatusStoppedFailed
   317  		}
   318  	}
   319  	return jobStatusRunning
   320  }
   321  
   322  func runRetryTask(ctx context.Context, out chan<- confgroup.Config, cfg confgroup.Config, timeout time.Duration) {
   323  	t := time.NewTimer(timeout)
   324  	defer t.Stop()
   325  
   326  	select {
   327  	case <-ctx.Done():
   328  	case <-t.C:
   329  		sendConfig(ctx, out, cfg)
   330  	}
   331  }
   332  
   333  func sendConfigs(ctx context.Context, out chan<- confgroup.Config, cfgs []confgroup.Config) {
   334  	for _, cfg := range cfgs {
   335  		sendConfig(ctx, out, cfg)
   336  	}
   337  }
   338  
   339  func sendConfig(ctx context.Context, out chan<- confgroup.Config, cfg confgroup.Config) {
   340  	select {
   341  	case <-ctx.Done():
   342  		return
   343  	case out <- cfg:
   344  	}
   345  }
   346  
   347  func unmarshal(conf interface{}, module interface{}) error {
   348  	bs, err := yaml.Marshal(conf)
   349  	if err != nil {
   350  		return err
   351  	}
   352  	return yaml.Unmarshal(bs, module)
   353  }
   354  
   355  func isInsideK8sCluster() bool {
   356  	host, port := os.Getenv("KUBERNETES_SERVICE_HOST"), os.Getenv("KUBERNETES_SERVICE_PORT")
   357  	return host != "" && port != ""
   358  }
   359  
   360  func isTooManyOpenFiles(err error) bool {
   361  	return err != nil && strings.Contains(err.Error(), "too many open files")
   362  }
   363  
   364  func isStockConfig(cfg confgroup.Config) bool {
   365  	if !strings.HasPrefix(cfg.Provider(), "file") {
   366  		return false
   367  	}
   368  	return !strings.Contains(cfg.Source(), "/etc/netdata")
   369  }