github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/drivers/docker/stats.go (about)

     1  package docker
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"io"
     7  	"sync"
     8  	"time"
     9  
    10  	docker "github.com/fsouza/go-dockerclient"
    11  	cstructs "github.com/hashicorp/nomad/client/structs"
    12  	"github.com/hashicorp/nomad/drivers/docker/util"
    13  	"github.com/hashicorp/nomad/helper"
    14  	nstructs "github.com/hashicorp/nomad/nomad/structs"
    15  )
    16  
const (
	// statsCollectorBackoffBaseline is the baseline time for exponential
	// backoff while calling the docker stats api. It is the delay used before
	// the first retry; each further retry multiplies the delay by four.
	statsCollectorBackoffBaseline = 5 * time.Second

	// statsCollectorBackoffLimit is the limit of the exponential backoff for
	// calling the docker stats api. Computed delays above this value are
	// clamped to it.
	statsCollectorBackoffLimit = 2 * time.Minute
)
    26  
    27  // usageSender wraps a TaskResourceUsage chan such that it supports concurrent
    28  // sending and closing, and backpressures by dropping events if necessary.
    29  type usageSender struct {
    30  	closed bool
    31  	destCh chan<- *cstructs.TaskResourceUsage
    32  	mu     sync.Mutex
    33  }
    34  
    35  // newStatsChanPipe returns a chan wrapped in a struct that supports concurrent
    36  // sending and closing, and the receiver end of the chan.
    37  func newStatsChanPipe() (*usageSender, <-chan *cstructs.TaskResourceUsage) {
    38  	destCh := make(chan *cstructs.TaskResourceUsage, 1)
    39  	return &usageSender{
    40  		destCh: destCh,
    41  	}, destCh
    42  
    43  }
    44  
    45  // send resource usage to the receiver unless the chan is already full or
    46  // closed.
    47  func (u *usageSender) send(tru *cstructs.TaskResourceUsage) {
    48  	u.mu.Lock()
    49  	defer u.mu.Unlock()
    50  
    51  	if u.closed {
    52  		return
    53  	}
    54  
    55  	select {
    56  	case u.destCh <- tru:
    57  	default:
    58  		// Backpressure caused missed interval
    59  	}
    60  }
    61  
    62  // close resource usage. Any further sends will be dropped.
    63  func (u *usageSender) close() {
    64  	u.mu.Lock()
    65  	defer u.mu.Unlock()
    66  
    67  	if u.closed {
    68  		// already closed
    69  		return
    70  	}
    71  
    72  	u.closed = true
    73  	close(u.destCh)
    74  }
    75  
    76  // Stats starts collecting stats from the docker daemon and sends them on the
    77  // returned channel.
    78  func (h *taskHandle) Stats(ctx context.Context, interval time.Duration) (<-chan *cstructs.TaskResourceUsage, error) {
    79  	select {
    80  	case <-h.doneCh:
    81  		return nil, nstructs.NewRecoverableError(fmt.Errorf("container stopped"), false)
    82  	default:
    83  	}
    84  
    85  	destCh, recvCh := newStatsChanPipe()
    86  	go h.collectStats(ctx, destCh, interval)
    87  	return recvCh, nil
    88  }
    89  
    90  // collectStats starts collecting resource usage stats of a docker container
    91  func (h *taskHandle) collectStats(ctx context.Context, destCh *usageSender, interval time.Duration) {
    92  	defer destCh.close()
    93  
    94  	// backoff and retry used if the docker stats API returns an error
    95  	var backoff time.Duration = 0
    96  	var retry int
    97  
    98  	// create an interval timer
    99  	timer, stop := helper.NewSafeTimer(backoff)
   100  	defer stop()
   101  
   102  	// loops until doneCh is closed
   103  	for {
   104  		timer.Reset(backoff)
   105  
   106  		if backoff > 0 {
   107  			select {
   108  			case <-timer.C:
   109  			case <-ctx.Done():
   110  				return
   111  			case <-h.doneCh:
   112  				return
   113  			}
   114  		}
   115  
   116  		// make a channel for docker stats structs and start a collector to
   117  		// receive stats from docker and emit nomad stats
   118  		// statsCh will always be closed by docker client.
   119  		statsCh := make(chan *docker.Stats)
   120  		go dockerStatsCollector(destCh, statsCh, interval)
   121  
   122  		statsOpts := docker.StatsOptions{
   123  			ID:      h.containerID,
   124  			Context: ctx,
   125  			Done:    h.doneCh,
   126  			Stats:   statsCh,
   127  			Stream:  true,
   128  		}
   129  
   130  		// Stats blocks until an error has occurred, or doneCh has been closed
   131  		if err := h.client.Stats(statsOpts); err != nil && err != io.ErrClosedPipe {
   132  			// An error occurred during stats collection, retry with backoff
   133  			h.logger.Debug("error collecting stats from container", "error", err)
   134  
   135  			// Calculate the new backoff
   136  			backoff = (1 << (2 * uint64(retry))) * statsCollectorBackoffBaseline
   137  			if backoff > statsCollectorBackoffLimit {
   138  				backoff = statsCollectorBackoffLimit
   139  			}
   140  			// Increment retry counter
   141  			retry++
   142  			continue
   143  		}
   144  		// Stats finished either because context was canceled, doneCh was closed
   145  		// or the container stopped. Stop stats collections.
   146  		return
   147  	}
   148  }
   149  
   150  func dockerStatsCollector(destCh *usageSender, statsCh <-chan *docker.Stats, interval time.Duration) {
   151  	var resourceUsage *cstructs.TaskResourceUsage
   152  
   153  	// hasSentInitialStats is used so as to emit the first stats received from
   154  	// the docker daemon
   155  	var hasSentInitialStats bool
   156  
   157  	// timer is used to send nomad status at the specified interval
   158  	timer := time.NewTimer(interval)
   159  	for {
   160  		select {
   161  		case <-timer.C:
   162  			// it is possible for the timer to go off before the first stats
   163  			// has been emitted from docker
   164  			if resourceUsage == nil {
   165  				continue
   166  			}
   167  
   168  			// sending to destCh could block, drop this interval if it does
   169  			destCh.send(resourceUsage)
   170  
   171  			timer.Reset(interval)
   172  
   173  		case s, ok := <-statsCh:
   174  			// if statsCh is closed stop collection
   175  			if !ok {
   176  				return
   177  			}
   178  			// s should always be set, but check and skip just in case
   179  			if s != nil {
   180  				resourceUsage = util.DockerStatsToTaskResourceUsage(s)
   181  				// send stats next interation if this is the first time received
   182  				// from docker
   183  				if !hasSentInitialStats {
   184  					timer.Reset(0)
   185  					hasSentInitialStats = true
   186  				}
   187  			}
   188  		}
   189  	}
   190  }