// SPDX-License-Identifier: GPL-3.0-or-later

// Package jobmgr manages the lifecycle of data collection jobs:
// it receives configuration groups, creates jobs, runs auto-detection,
// handles retries, de-duplicates jobs locally (within this plugin) and
// globally (across plugins, via file locks), and reports job status.
package jobmgr

import (
	"context"
	"fmt"
	"io"
	"log/slog"
	"os"
	"strings"
	"sync"
	"time"

	"github.com/netdata/go.d.plugin/agent/confgroup"
	"github.com/netdata/go.d.plugin/agent/module"
	"github.com/netdata/go.d.plugin/logger"

	"gopkg.in/yaml.v2"
)

// Job is the contract a runnable collection job must satisfy.
// module.Job (created by createJob) is the concrete implementation used here.
type Job interface {
	Name() string
	ModuleName() string
	FullName() string
	AutoDetection() bool
	AutoDetectionEvery() int
	RetryAutoDetection() bool
	Tick(clock int)
	Start()
	Stop()
	Cleanup()
}

// jobStatus is an alias (not a defined type) so values interoperate
// directly with APIs that take plain strings (e.g. StatusSaver.Save).
type jobStatus = string

const (
	jobStatusRunning          jobStatus = "running"                     // Check() succeeded
	jobStatusRetrying         jobStatus = "retrying"                    // Check() failed, but we need keep trying auto-detection
	jobStatusStoppedFailed    jobStatus = "stopped_failed"              // Check() failed
	jobStatusStoppedDupLocal  jobStatus = "stopped_duplicate_local"     // a job with the same FullName is running
	jobStatusStoppedDupGlobal jobStatus = "stopped_duplicate_global"    // a job with the same FullName is registered by another plugin
	jobStatusStoppedRegErr    jobStatus = "stopped_registration_error"  // an error during registration (only 'too many open files')
	jobStatusStoppedCreateErr jobStatus = "stopped_creation_error"      // an error during creation (yaml unmarshal)
)

// NewManager returns a Manager with all optional collaborators set to
// no-op implementations and its internal caches/channels initialized.
// Callers are expected to overwrite the exported fields before Run.
func NewManager() *Manager {
	np := noop{}
	mgr := &Manager{
		Logger: logger.New().With(
			slog.String("component", "job manager"),
		),
		Out:         io.Discard,
		FileLock:    np,
		StatusSaver: np,
		StatusStore: np,
		Vnodes:      np,
		Dyncfg:      np,

		confGroupCache: confgroup.NewCache(),

		runningJobs:  newRunningJobsCache(),
		retryingJobs: newRetryingJobsCache(),

		// Unbuffered: senders block until runConfigsHandling picks a config up,
		// which keeps add/remove processing strictly ordered.
		addCh:    make(chan confgroup.Config),
		removeCh: make(chan confgroup.Config),
	}

	return mgr
}

// Manager owns job creation, detection, retry scheduling and teardown.
type Manager struct {
	*logger.Logger

	PluginName string
	Out        io.Writer // destination for job chart output
	Modules    module.Registry

	FileLock    FileLocker  // cross-plugin dedup via per-job file locks
	StatusSaver StatusSaver // persists per-job status
	StatusStore StatusStore // reads previously persisted status
	Vnodes      Vnodes      // virtual node lookup for cfg.Vnode()
	Dyncfg      Dyncfg      // dynamic configuration registration/status

	confGroupCache *confgroup.Cache
	runningJobs    *runningJobsCache
	retryingJobs   *retryingJobsCache

	addCh    chan confgroup.Config
	removeCh chan confgroup.Config

	queueMux sync.Mutex
	queue    []Job
}

// Run starts the three processing loops (config groups, config add/remove,
// running jobs) and blocks until ctx is canceled and all loops have returned.
func (m *Manager) Run(ctx context.Context, in chan []*confgroup.Group) {
	m.Info("instance is started")
	defer func() { m.cleanup(); m.Info("instance is stopped") }()

	var wg sync.WaitGroup

	wg.Add(1)
	go func() { defer wg.Done(); m.runConfigGroupsHandling(ctx, in) }()

	wg.Add(1)
	go func() { defer wg.Done(); m.runConfigsHandling(ctx) }()

	wg.Add(1)
	go func() { defer wg.Done(); m.runRunningJobsHandling(ctx) }()

	wg.Wait()
	<-ctx.Done()
}

// runConfigGroupsHandling consumes config groups from 'in', diffs them
// against the cache, and forwards removed configs before added ones so a
// replaced job is torn down before its replacement is created.
func (m *Manager) runConfigGroupsHandling(ctx context.Context, in chan []*confgroup.Group) {
	for {
		select {
		case <-ctx.Done():
			return
		case groups := <-in:
			for _, gr := range groups {
				select {
				case <-ctx.Done():
					return
				default:
					a, r := m.confGroupCache.Add(gr)
					m.Debugf("received config group ('%s'): %d jobs (added: %d, removed: %d)", gr.Source, len(gr.Configs), len(a), len(r))
					sendConfigs(ctx, m.removeCh, r)
					sendConfigs(ctx, m.addCh, a)
				}
			}
		}
	}
}

// runConfigsHandling serializes all add/remove operations onto a single
// goroutine, so addConfig/removeConfig never run concurrently.
func (m *Manager) runConfigsHandling(ctx context.Context) {
	for {
		select {
		case <-ctx.Done():
			return
		case cfg := <-m.addCh:
			m.addConfig(ctx, cfg)
		case cfg := <-m.removeCh:
			m.removeConfig(cfg)
		}
	}
}

// cleanup cancels pending retry tasks, releases every held file lock,
// and stops all running jobs. Called once from Run's deferred teardown.
func (m *Manager) cleanup() {
	for _, task := range *m.retryingJobs {
		task.cancel()
	}
	for name := range *m.runningJobs {
		_ = m.FileLock.Unlock(name)
	}
	// TODO: m.Dyncfg.Register() ?
	m.stopRunningJobs()
}

// addConfig creates and (auto-)detects a job for cfg. Depending on the
// detection outcome the job is started, scheduled for a retry, or stopped.
func (m *Manager) addConfig(ctx context.Context, cfg confgroup.Config) {
	// A config arriving while a retry task is pending is the retry firing:
	// cancel the task and skip re-registration with Dyncfg.
	task, isRetry := m.retryingJobs.lookup(cfg)
	if isRetry {
		task.cancel()
		m.retryingJobs.remove(cfg)
	} else {
		m.Dyncfg.Register(cfg)
	}

	// Local dedup: a job with the same FullName already runs in this plugin.
	if m.runningJobs.has(cfg) {
		m.Infof("%s[%s] job is being served by another job, skipping it", cfg.Module(), cfg.Name())
		m.StatusSaver.Save(cfg, jobStatusStoppedDupLocal)
		m.Dyncfg.UpdateStatus(cfg, "error", "duplicate, served by another job")
		return
	}

	job, err := m.createJob(cfg)
	if err != nil {
		m.Warningf("couldn't create %s[%s]: %v", cfg.Module(), cfg.Name(), err)
		m.StatusSaver.Save(cfg, jobStatusStoppedCreateErr)
		m.Dyncfg.UpdateStatus(cfg, "error", fmt.Sprintf("build error: %s", err))
		return
	}

	// Cleanup the job on every exit path except the one that starts it.
	cleanupJob := true
	defer func() {
		if cleanupJob {
			job.Cleanup()
		}
	}()

	if isRetry {
		// Carry the remaining retry budget over to the fresh job instance.
		job.AutoDetectEvery = task.timeout
		job.AutoDetectTries = task.retries
	} else if job.AutoDetectionEvery() == 0 {
		// No retry interval configured: apply recovery heuristics for jobs
		// that were previously healthy or are discovered inside k8s.
		switch {
		case m.StatusStore.Contains(cfg, jobStatusRunning, jobStatusRetrying):
			m.Infof("%s[%s] job last status is running/retrying, applying recovering settings", cfg.Module(), cfg.Name())
			job.AutoDetectEvery = 30
			job.AutoDetectTries = 11
		case isInsideK8sCluster() && cfg.Provider() == "file watcher":
			m.Infof("%s[%s] is k8s job, applying recovering settings", cfg.Module(), cfg.Name())
			job.AutoDetectEvery = 10
			job.AutoDetectTries = 7
		}
	}

	switch detection(job) {
	case jobStatusRunning:
		// Global dedup via file lock. Note the condition: the job also starts
		// when locking failed with any error OTHER than 'too many open files'
		// (a deliberate best-effort — lock infrastructure failure does not
		// prevent the job from running).
		if ok, err := m.FileLock.Lock(cfg.FullName()); ok || err != nil && !isTooManyOpenFiles(err) {
			cleanupJob = false
			m.runningJobs.put(cfg)
			m.StatusSaver.Save(cfg, jobStatusRunning)
			m.Dyncfg.UpdateStatus(cfg, "running", "")
			m.startJob(job)
		} else if isTooManyOpenFiles(err) {
			m.Error(err)
			m.StatusSaver.Save(cfg, jobStatusStoppedRegErr)
			m.Dyncfg.UpdateStatus(cfg, "error", "too many open files")
		} else {
			// Lock held by another process: same FullName runs in another plugin.
			m.Infof("%s[%s] job is being served by another plugin, skipping it", cfg.Module(), cfg.Name())
			m.StatusSaver.Save(cfg, jobStatusStoppedDupGlobal)
			m.Dyncfg.UpdateStatus(cfg, "error", "duplicate, served by another plugin")
		}
	case jobStatusRetrying:
		m.Infof("%s[%s] job detection failed, will retry in %d seconds", cfg.Module(), cfg.Name(), job.AutoDetectionEvery())
		// Child context so removeConfig/cleanup can cancel this retry alone.
		ctx, cancel := context.WithCancel(ctx)
		m.retryingJobs.put(cfg, retryTask{
			cancel:  cancel,
			timeout: job.AutoDetectionEvery(),
			retries: job.AutoDetectTries,
		})
		go runRetryTask(ctx, m.addCh, cfg, time.Second*time.Duration(job.AutoDetectionEvery()))
		m.StatusSaver.Save(cfg, jobStatusRetrying)
		m.Dyncfg.UpdateStatus(cfg, "error", "job detection failed, will retry later")
	case jobStatusStoppedFailed:
		m.StatusSaver.Save(cfg, jobStatusStoppedFailed)
		m.Dyncfg.UpdateStatus(cfg, "error", "job detection failed, stopping it")
	default:
		m.Warningf("%s[%s] job detection: unknown state", cfg.Module(), cfg.Name())
	}
}

// removeConfig stops a running job and/or cancels a pending retry for cfg,
// then clears its persisted status and Dyncfg registration.
func (m *Manager) removeConfig(cfg confgroup.Config) {
	if m.runningJobs.has(cfg) {
		m.stopJob(cfg.FullName())
		_ = m.FileLock.Unlock(cfg.FullName())
		m.runningJobs.remove(cfg)
	}

	if task, ok := m.retryingJobs.lookup(cfg); ok {
		task.cancel()
		m.retryingJobs.remove(cfg)
	}

	m.StatusSaver.Remove(cfg)
	m.Dyncfg.Unregister(cfg)
}

// createJob builds a module.Job from cfg: it instantiates the module,
// applies the config via a yaml round-trip, extracts string labels, and
// resolves the virtual node if one is referenced.
func (m *Manager) createJob(cfg confgroup.Config) (*module.Job, error) {
	creator, ok := m.Modules[cfg.Module()]
	if !ok {
		return nil, fmt.Errorf("can not find %s module", cfg.Module())
	}

	m.Debugf("creating %s[%s] job, config: %v", cfg.Module(), cfg.Name(), cfg)

	mod := creator.Create()
	if err := unmarshal(cfg, mod); err != nil {
		return nil, err
	}

	// Labels come from yaml as interface{} pairs; keep only string->string,
	// silently dropping anything else.
	labels := make(map[string]string)
	for name, value := range cfg.Labels() {
		n, ok1 := name.(string)
		v, ok2 := value.(string)
		if ok1 && ok2 {
			labels[n] = v
		}
	}

	jobCfg := module.JobConfig{
		PluginName:      m.PluginName,
		Name:            cfg.Name(),
		ModuleName:      cfg.Module(),
		FullName:        cfg.FullName(),
		UpdateEvery:     cfg.UpdateEvery(),
		AutoDetectEvery: cfg.AutoDetectionRetry(),
		Priority:        cfg.Priority(),
		Labels:          labels,
		IsStock:         isStockConfig(cfg),
		Module:          mod,
		Out:             m.Out,
	}

	if cfg.Vnode() != "" {
		n, ok := m.Vnodes.Lookup(cfg.Vnode())
		if !ok {
			return nil, fmt.Errorf("vnode '%s' is not found", cfg.Vnode())
		}

		jobCfg.VnodeGUID = n.GUID
		jobCfg.VnodeHostname = n.Hostname
		jobCfg.VnodeLabels = n.Labels
	}

	job := module.NewJob(jobCfg)

	return job, nil
}

// detection maps the outcome of a job's auto-detection to a status:
// success -> running, failure with retries left -> retrying,
// failure without retries -> stopped_failed.
func detection(job Job) jobStatus {
	if !job.AutoDetection() {
		if job.RetryAutoDetection() {
			return jobStatusRetrying
		} else {
			return jobStatusStoppedFailed
		}
	}
	return jobStatusRunning
}

// runRetryTask re-sends cfg on 'out' after 'timeout' elapses, unless the
// task's context is canceled first (by removeConfig, cleanup, or a newer add).
func runRetryTask(ctx context.Context, out chan<- confgroup.Config, cfg confgroup.Config, timeout time.Duration) {
	t := time.NewTimer(timeout)
	defer t.Stop()

	select {
	case <-ctx.Done():
	case <-t.C:
		sendConfig(ctx, out, cfg)
	}
}

// sendConfigs sends each config in order, stopping early on ctx cancellation.
func sendConfigs(ctx context.Context, out chan<- confgroup.Config, cfgs []confgroup.Config) {
	for _, cfg := range cfgs {
		sendConfig(ctx, out, cfg)
	}
}

// sendConfig sends cfg on 'out' unless ctx is canceled first
// (the channels are unbuffered, so a plain send could block forever).
func sendConfig(ctx context.Context, out chan<- confgroup.Config, cfg confgroup.Config) {
	select {
	case <-ctx.Done():
		return
	case out <- cfg:
	}
}

// unmarshal applies conf onto module by marshaling conf to yaml and
// unmarshaling it into module — a cheap generic way to copy matching fields.
func unmarshal(conf interface{}, module interface{}) error {
	bs, err := yaml.Marshal(conf)
	if err != nil {
		return err
	}
	return yaml.Unmarshal(bs, module)
}

// isInsideK8sCluster reports whether the process runs inside a Kubernetes
// pod, detected via the service-account environment variables k8s injects.
func isInsideK8sCluster() bool {
	host, port := os.Getenv("KUBERNETES_SERVICE_HOST"), os.Getenv("KUBERNETES_SERVICE_PORT")
	return host != "" && port != ""
}

// isTooManyOpenFiles reports whether err looks like an EMFILE/ENFILE
// failure, matched by message substring.
func isTooManyOpenFiles(err error) bool {
	return err != nil && strings.Contains(err.Error(), "too many open files")
}

// isStockConfig reports whether cfg came from a stock (shipped) config file:
// a file-based provider whose source path is outside /etc/netdata.
func isStockConfig(cfg confgroup.Config) bool {
	if !strings.HasPrefix(cfg.Provider(), "file") {
		return false
	}
	return !strings.Contains(cfg.Source(), "/etc/netdata")
}