github.com/bigcommerce/nomad@v0.9.3-bc/client/allocrunner/taskrunner/stats_hook.go (about) 1 package taskrunner 2 3 import ( 4 "context" 5 "sync" 6 "time" 7 8 hclog "github.com/hashicorp/go-hclog" 9 "github.com/hashicorp/nomad/client/allocrunner/interfaces" 10 cstructs "github.com/hashicorp/nomad/client/structs" 11 "github.com/hashicorp/nomad/nomad/structs" 12 bstructs "github.com/hashicorp/nomad/plugins/base/structs" 13 ) 14 15 // StatsUpdater is the interface required by the StatsHook to update stats. 16 // Satisfied by TaskRunner. 17 type StatsUpdater interface { 18 UpdateStats(*cstructs.TaskResourceUsage) 19 } 20 21 // statsHook manages the task stats collection goroutine. 22 type statsHook struct { 23 updater StatsUpdater 24 interval time.Duration 25 26 // cancel is called by Exited 27 cancel context.CancelFunc 28 29 mu sync.Mutex 30 31 logger hclog.Logger 32 } 33 34 func newStatsHook(su StatsUpdater, interval time.Duration, logger hclog.Logger) *statsHook { 35 h := &statsHook{ 36 updater: su, 37 interval: interval, 38 } 39 h.logger = logger.Named(h.Name()) 40 return h 41 } 42 43 func (*statsHook) Name() string { 44 return "stats_hook" 45 } 46 47 func (h *statsHook) Poststart(ctx context.Context, req *interfaces.TaskPoststartRequest, _ *interfaces.TaskPoststartResponse) error { 48 h.mu.Lock() 49 defer h.mu.Unlock() 50 51 // This shouldn't happen, but better safe than risk leaking a goroutine 52 if h.cancel != nil { 53 h.logger.Debug("poststart called twice without exiting between") 54 h.cancel() 55 } 56 57 // Using a new context here because the existing context is for the scope of 58 // the Poststart request. If that context was used, stats collection would 59 // stop when the task was killed. It makes for more readable code and better 60 // follows the taskrunner hook model to create a new context that can be 61 // canceled on the Exited hook. 62 ctx, cancel := context.WithCancel(context.Background()) 63 h.cancel = cancel 64 go h.collectResourceUsageStats(ctx, req.DriverStats) 65 66 return nil 67 } 68 69 func (h *statsHook) Exited(context.Context, *interfaces.TaskExitedRequest, *interfaces.TaskExitedResponse) error { 70 h.mu.Lock() 71 defer h.mu.Unlock() 72 73 if h.cancel == nil { 74 // No stats running 75 return nil 76 } 77 78 // Call cancel to stop stats collection 79 h.cancel() 80 81 // Clear cancel func so we don't double call for any reason 82 h.cancel = nil 83 84 return nil 85 } 86 87 // collectResourceUsageStats starts collecting resource usage stats of a Task. 88 // Collection ends when the passed channel is closed 89 func (h *statsHook) collectResourceUsageStats(ctx context.Context, handle interfaces.DriverStats) { 90 91 ch, err := handle.Stats(ctx, h.interval) 92 if err != nil { 93 // Check if the driver doesn't implement stats 94 if err.Error() == cstructs.DriverStatsNotImplemented.Error() { 95 h.logger.Debug("driver does not support stats") 96 return 97 } 98 h.logger.Error("failed to start stats collection for task", "error", err) 99 } 100 101 var backoff time.Duration 102 var retry int 103 limit := time.Second * 5 104 for { 105 time.Sleep(backoff) 106 select { 107 case ru, ok := <-ch: 108 // Channel is closed 109 if !ok { 110 var re *structs.RecoverableError 111 ch, err = handle.Stats(ctx, h.interval) 112 if err == nil { 113 goto RETRY 114 } 115 116 // We do not log when the plugin is shutdown since this is 117 // likely because the driver plugin has unexpectedly exited, 118 // in which case sleeping and trying again or returning based 119 // on the stop channel is the correct behavior 120 if err != bstructs.ErrPluginShutdown { 121 h.logger.Debug("error fetching stats of task", "error", err) 122 goto RETRY 123 } 124 // check if the error is terminal otherwise it's likely a 125 // transport error and we should retry 126 re, ok = err.(*structs.RecoverableError) 127 if ok && re.IsUnrecoverable() { 128 return 129 } 130 h.logger.Warn("stats collection for task failed", "error", err) 131 RETRY: 132 // Calculate the new backoff 133 backoff = (1 << (2 * uint64(retry))) * time.Second 134 if backoff > limit { 135 backoff = limit 136 } 137 // Increment retry counter 138 retry++ 139 140 continue 141 } 142 143 // Update stats on TaskRunner and emit them 144 h.updater.UpdateStats(ru) 145 146 case <-ctx.Done(): 147 return 148 } 149 } 150 } 151 152 func (h *statsHook) Shutdown() { 153 h.mu.Lock() 154 defer h.mu.Unlock() 155 156 if h.cancel == nil { 157 return 158 } 159 160 h.cancel() 161 }