github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/client/allocrunner/taskrunner/stats_hook.go

package taskrunner

import (
	"context"
	"sync"
	"time"

	hclog "github.com/hashicorp/go-hclog"
	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/nomad/structs"
	bstructs "github.com/hashicorp/nomad/plugins/base/structs"
)

// StatsUpdater is the interface required by the StatsHook to update stats.
// Satisfied by TaskRunner.
type StatsUpdater interface {
	UpdateStats(*cstructs.TaskResourceUsage)
}

// statsHook manages the task stats collection goroutine.
type statsHook struct {
	updater  StatsUpdater
	interval time.Duration

	// cancel is called by Exited
	cancel context.CancelFunc

	mu sync.Mutex

	logger hclog.Logger
}

func newStatsHook(su StatsUpdater, interval time.Duration, logger hclog.Logger) *statsHook {
	h := &statsHook{
		updater:  su,
		interval: interval,
	}
	h.logger = logger.Named(h.Name())
	return h
}

func (*statsHook) Name() string {
	return "stats_hook"
}

// Poststart starts the stats collection goroutine once the task has started.
func (h *statsHook) Poststart(_ context.Context, req *interfaces.TaskPoststartRequest, _ *interfaces.TaskPoststartResponse) error {
	h.mu.Lock()
	defer h.mu.Unlock()

	// This shouldn't happen, but better safe than risk leaking a goroutine
	if h.cancel != nil {
		h.logger.Debug("poststart called twice without exiting between")
		h.cancel()
	}

	// Using a new context here because the existing context is for the scope of
	// the Poststart request. If that context were used, stats collection would
	// stop when the task was killed. It makes for more readable code and better
	// follows the taskrunner hook model to create a new context that can be
	// canceled on the Exited hook.
	ctx, cancel := context.WithCancel(context.Background())
	h.cancel = cancel
	go h.collectResourceUsageStats(ctx, req.DriverStats)

	return nil
}

// Exited stops stats collection when the task exits.
func (h *statsHook) Exited(context.Context, *interfaces.TaskExitedRequest, *interfaces.TaskExitedResponse) error {
	h.mu.Lock()
	defer h.mu.Unlock()

	if h.cancel == nil {
		// No stats running
		return nil
	}

	// Call cancel to stop stats collection
	h.cancel()

	// Clear cancel func so we don't double call for any reason
	h.cancel = nil

	return nil
}

// collectResourceUsageStats starts collecting resource usage stats of a Task.
// Collection ends when the passed context is canceled.
func (h *statsHook) collectResourceUsageStats(ctx context.Context, handle interfaces.DriverStats) {

MAIN:
	ch, err := h.callStatsWithRetry(ctx, handle)
	if err != nil {
		return
	}

	for {
		select {
		case ru, ok := <-ch:
			// if the channel closes, re-establish a new one
			if !ok {
				// backoff if the driver closes the channel, potentially
				// because the task shut down or because the driver
				// doesn't implement channel interval checking
				select {
				case <-time.After(h.interval):
					goto MAIN
				case <-ctx.Done():
					return
				}
			}

			// Update stats on TaskRunner and emit them
			h.updater.UpdateStats(ru)

		case <-ctx.Done():
			return
		}
	}
}

// callStatsWithRetry invokes the driver handle's Stats() function and retries
// until a channel is established successfully. Returns an error if it
// encounters a permanent error.
//
// It logs errors at the appropriate level; callers should not log the
// returned error.
func (h *statsHook) callStatsWithRetry(ctx context.Context, handle interfaces.DriverStats) (<-chan *cstructs.TaskResourceUsage, error) {
	var retry int

MAIN:
	if ctx.Err() != nil {
		return nil, ctx.Err()
	}

	ch, err := handle.Stats(ctx, h.interval)
	if err == nil {
		return ch, nil
	}

	// Check if the driver doesn't implement stats
	if err.Error() == cstructs.DriverStatsNotImplemented.Error() {
		h.logger.Debug("driver does not support stats")
		return nil, err
	}

	// Check if the error is terminal, otherwise it's likely a
	// transport error and we should retry
	if re, ok := err.(*structs.RecoverableError); ok && re.IsUnrecoverable() {
		h.logger.Debug("failed to start stats collection for task with unrecoverable error", "error", err)
		return nil, err
	}

	// We do not warn when the plugin is shutdown since this is
	// likely because the driver plugin has unexpectedly exited,
	// in which case sleeping and trying again or returning based
	// on the stop channel is the correct behavior
	if err == bstructs.ErrPluginShutdown {
		h.logger.Debug("failed to fetch stats of task", "error", err)
	} else {
		h.logger.Error("failed to start stats collection for task", "error", err)
	}

	limit := time.Second * 5
	backoff := 1 << (2 * uint64(retry)) * time.Second
	if backoff > limit || retry > 5 {
		backoff = limit
	}

	// Increment retry counter
	retry++

	time.Sleep(backoff)
	goto MAIN
}

// Shutdown stops stats collection when the client is shutting down.
func (h *statsHook) Shutdown() {
	h.mu.Lock()
	defer h.mu.Unlock()

	if h.cancel == nil {
		return
	}

	h.cancel()
}
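
The following is a minimal, standalone sketch (not part of the Nomad source) that prints the retry schedule produced by the backoff expression in callStatsWithRetry above; the package name, loop bound, and main function are hypothetical, added only for illustration. It shows the effective waits: 1s on the first retry, 4s on the second, then the 5-second cap for every later attempt.

package main

import (
	"fmt"
	"time"
)

func main() {
	limit := 5 * time.Second
	for retry := 0; retry < 8; retry++ {
		// Same backoff expression and cap as in callStatsWithRetry.
		backoff := 1 << (2 * uint64(retry)) * time.Second
		if backoff > limit || retry > 5 {
			backoff = limit
		}
		fmt.Printf("retry %d -> wait %s\n", retry, backoff)
	}
}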