github.com/bigcommerce/nomad@v0.9.3-bc/client/allocrunner/taskrunner/stats_hook.go (about)

     1  package taskrunner
     2  
     3  import (
     4  	"context"
     5  	"sync"
     6  	"time"
     7  
     8  	hclog "github.com/hashicorp/go-hclog"
     9  	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
    10  	cstructs "github.com/hashicorp/nomad/client/structs"
    11  	"github.com/hashicorp/nomad/nomad/structs"
    12  	bstructs "github.com/hashicorp/nomad/plugins/base/structs"
    13  )
    14  
    15  // StatsUpdater is the interface required by the StatsHook to update stats.
    16  // Satisfied by TaskRunner.
    17  type StatsUpdater interface {
    18  	UpdateStats(*cstructs.TaskResourceUsage)
    19  }
    20  
    21  // statsHook manages the task stats collection goroutine.
    22  type statsHook struct {
    23  	updater  StatsUpdater
    24  	interval time.Duration
    25  
    26  	// cancel is called by Exited
    27  	cancel context.CancelFunc
    28  
    29  	mu sync.Mutex
    30  
    31  	logger hclog.Logger
    32  }
    33  
    34  func newStatsHook(su StatsUpdater, interval time.Duration, logger hclog.Logger) *statsHook {
    35  	h := &statsHook{
    36  		updater:  su,
    37  		interval: interval,
    38  	}
    39  	h.logger = logger.Named(h.Name())
    40  	return h
    41  }
    42  
    43  func (*statsHook) Name() string {
    44  	return "stats_hook"
    45  }
    46  
    47  func (h *statsHook) Poststart(ctx context.Context, req *interfaces.TaskPoststartRequest, _ *interfaces.TaskPoststartResponse) error {
    48  	h.mu.Lock()
    49  	defer h.mu.Unlock()
    50  
    51  	// This shouldn't happen, but better safe than risk leaking a goroutine
    52  	if h.cancel != nil {
    53  		h.logger.Debug("poststart called twice without exiting between")
    54  		h.cancel()
    55  	}
    56  
    57  	// Using a new context here because the existing context is for the scope of
    58  	// the Poststart request. If that context was used, stats collection would
    59  	// stop when the task was killed. It makes for more readable code and better
    60  	// follows the taskrunner hook model to create a new context that can be
    61  	// canceled on the Exited hook.
    62  	ctx, cancel := context.WithCancel(context.Background())
    63  	h.cancel = cancel
    64  	go h.collectResourceUsageStats(ctx, req.DriverStats)
    65  
    66  	return nil
    67  }
    68  
    69  func (h *statsHook) Exited(context.Context, *interfaces.TaskExitedRequest, *interfaces.TaskExitedResponse) error {
    70  	h.mu.Lock()
    71  	defer h.mu.Unlock()
    72  
    73  	if h.cancel == nil {
    74  		// No stats running
    75  		return nil
    76  	}
    77  
    78  	// Call cancel to stop stats collection
    79  	h.cancel()
    80  
    81  	// Clear cancel func so we don't double call for any reason
    82  	h.cancel = nil
    83  
    84  	return nil
    85  }
    86  
    87  // collectResourceUsageStats starts collecting resource usage stats of a Task.
    88  // Collection ends when the passed channel is closed
    89  func (h *statsHook) collectResourceUsageStats(ctx context.Context, handle interfaces.DriverStats) {
    90  
    91  	ch, err := handle.Stats(ctx, h.interval)
    92  	if err != nil {
    93  		// Check if the driver doesn't implement stats
    94  		if err.Error() == cstructs.DriverStatsNotImplemented.Error() {
    95  			h.logger.Debug("driver does not support stats")
    96  			return
    97  		}
    98  		h.logger.Error("failed to start stats collection for task", "error", err)
    99  	}
   100  
   101  	var backoff time.Duration
   102  	var retry int
   103  	limit := time.Second * 5
   104  	for {
   105  		time.Sleep(backoff)
   106  		select {
   107  		case ru, ok := <-ch:
   108  			// Channel is closed
   109  			if !ok {
   110  				var re *structs.RecoverableError
   111  				ch, err = handle.Stats(ctx, h.interval)
   112  				if err == nil {
   113  					goto RETRY
   114  				}
   115  
   116  				// We do not log when the plugin is shutdown since this is
   117  				// likely because the driver plugin has unexpectedly exited,
   118  				// in which case sleeping and trying again or returning based
   119  				// on the stop channel is the correct behavior
   120  				if err != bstructs.ErrPluginShutdown {
   121  					h.logger.Debug("error fetching stats of task", "error", err)
   122  					goto RETRY
   123  				}
   124  				// check if the error is terminal otherwise it's likely a
   125  				// transport error and we should retry
   126  				re, ok = err.(*structs.RecoverableError)
   127  				if ok && re.IsUnrecoverable() {
   128  					return
   129  				}
   130  				h.logger.Warn("stats collection for task failed", "error", err)
   131  			RETRY:
   132  				// Calculate the new backoff
   133  				backoff = (1 << (2 * uint64(retry))) * time.Second
   134  				if backoff > limit {
   135  					backoff = limit
   136  				}
   137  				// Increment retry counter
   138  				retry++
   139  
   140  				continue
   141  			}
   142  
   143  			// Update stats on TaskRunner and emit them
   144  			h.updater.UpdateStats(ru)
   145  
   146  		case <-ctx.Done():
   147  			return
   148  		}
   149  	}
   150  }
   151  
   152  func (h *statsHook) Shutdown() {
   153  	h.mu.Lock()
   154  	defer h.mu.Unlock()
   155  
   156  	if h.cancel == nil {
   157  		return
   158  	}
   159  
   160  	h.cancel()
   161  }