github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/drivers/docker/stats.go (about)

     1  package docker
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"io"
     7  	"sync"
     8  	"time"
     9  
    10  	docker "github.com/fsouza/go-dockerclient"
    11  	"github.com/hashicorp/nomad/client/structs"
    12  	cstructs "github.com/hashicorp/nomad/client/structs"
    13  	"github.com/hashicorp/nomad/drivers/docker/util"
    14  	nstructs "github.com/hashicorp/nomad/nomad/structs"
    15  )
    16  
    17  const (
    18  	// statsCollectorBackoffBaseline is the baseline time for exponential
    19  	// backoff while calling the docker stats api.
    20  	statsCollectorBackoffBaseline = 5 * time.Second
    21  
    22  	// statsCollectorBackoffLimit is the limit of the exponential backoff for
    23  	// calling the docker stats api.
    24  	statsCollectorBackoffLimit = 2 * time.Minute
    25  )
    26  
    27  // usageSender wraps a TaskResourceUsage chan such that it supports concurrent
    28  // sending and closing, and backpressures by dropping events if necessary.
    29  type usageSender struct {
    30  	closed bool
    31  	destCh chan<- *structs.TaskResourceUsage
    32  	mu     sync.Mutex
    33  }
    34  
    35  // newStatsChanPipe returns a chan wrapped in a struct that supports concurrent
    36  // sending and closing, and the receiver end of the chan.
    37  func newStatsChanPipe() (*usageSender, <-chan *structs.TaskResourceUsage) {
    38  	destCh := make(chan *cstructs.TaskResourceUsage, 1)
    39  	return &usageSender{
    40  		destCh: destCh,
    41  	}, destCh
    42  
    43  }
    44  
    45  // send resource usage to the receiver unless the chan is already full or
    46  // closed.
    47  func (u *usageSender) send(tru *cstructs.TaskResourceUsage) {
    48  	u.mu.Lock()
    49  	defer u.mu.Unlock()
    50  
    51  	if u.closed {
    52  		return
    53  	}
    54  
    55  	select {
    56  	case u.destCh <- tru:
    57  	default:
    58  		// Backpressure caused missed interval
    59  	}
    60  }
    61  
    62  // close resource usage. Any further sends will be dropped.
    63  func (u *usageSender) close() {
    64  	u.mu.Lock()
    65  	defer u.mu.Unlock()
    66  
    67  	if u.closed {
    68  		// already closed
    69  		return
    70  	}
    71  
    72  	u.closed = true
    73  	close(u.destCh)
    74  }
    75  
    76  // Stats starts collecting stats from the docker daemon and sends them on the
    77  // returned channel.
    78  func (h *taskHandle) Stats(ctx context.Context, interval time.Duration) (<-chan *cstructs.TaskResourceUsage, error) {
    79  	select {
    80  	case <-h.doneCh:
    81  		return nil, nstructs.NewRecoverableError(fmt.Errorf("container stopped"), false)
    82  	default:
    83  	}
    84  
    85  	destCh, recvCh := newStatsChanPipe()
    86  	go h.collectStats(ctx, destCh, interval)
    87  	return recvCh, nil
    88  }
    89  
    90  // collectStats starts collecting resource usage stats of a docker container
    91  func (h *taskHandle) collectStats(ctx context.Context, destCh *usageSender, interval time.Duration) {
    92  	defer destCh.close()
    93  
    94  	// backoff and retry used if the docker stats API returns an error
    95  	var backoff time.Duration
    96  	var retry int
    97  	// loops until doneCh is closed
    98  	for {
    99  		if backoff > 0 {
   100  			select {
   101  			case <-time.After(backoff):
   102  			case <-ctx.Done():
   103  				return
   104  			case <-h.doneCh:
   105  				return
   106  			}
   107  		}
   108  		// make a channel for docker stats structs and start a collector to
   109  		// receive stats from docker and emit nomad stats
   110  		// statsCh will always be closed by docker client.
   111  		statsCh := make(chan *docker.Stats)
   112  		go dockerStatsCollector(destCh, statsCh, interval)
   113  
   114  		statsOpts := docker.StatsOptions{
   115  			ID:      h.containerID,
   116  			Context: ctx,
   117  			Done:    h.doneCh,
   118  			Stats:   statsCh,
   119  			Stream:  true,
   120  		}
   121  
   122  		// Stats blocks until an error has occurred, or doneCh has been closed
   123  		if err := h.client.Stats(statsOpts); err != nil && err != io.ErrClosedPipe {
   124  			// An error occurred during stats collection, retry with backoff
   125  			h.logger.Debug("error collecting stats from container", "error", err)
   126  
   127  			// Calculate the new backoff
   128  			backoff = (1 << (2 * uint64(retry))) * statsCollectorBackoffBaseline
   129  			if backoff > statsCollectorBackoffLimit {
   130  				backoff = statsCollectorBackoffLimit
   131  			}
   132  			// Increment retry counter
   133  			retry++
   134  			continue
   135  		}
   136  		// Stats finished either because context was canceled, doneCh was closed
   137  		// or the container stopped. Stop stats collections.
   138  		return
   139  	}
   140  }
   141  
   142  func dockerStatsCollector(destCh *usageSender, statsCh <-chan *docker.Stats, interval time.Duration) {
   143  	var resourceUsage *cstructs.TaskResourceUsage
   144  
   145  	// hasSentInitialStats is used so as to emit the first stats received from
   146  	// the docker daemon
   147  	var hasSentInitialStats bool
   148  
   149  	// timer is used to send nomad status at the specified interval
   150  	timer := time.NewTimer(interval)
   151  	for {
   152  		select {
   153  		case <-timer.C:
   154  			// it is possible for the timer to go off before the first stats
   155  			// has been emitted from docker
   156  			if resourceUsage == nil {
   157  				continue
   158  			}
   159  
   160  			// sending to destCh could block, drop this interval if it does
   161  			destCh.send(resourceUsage)
   162  
   163  			timer.Reset(interval)
   164  
   165  		case s, ok := <-statsCh:
   166  			// if statsCh is closed stop collection
   167  			if !ok {
   168  				return
   169  			}
   170  			// s should always be set, but check and skip just in case
   171  			if s != nil {
   172  				resourceUsage = util.DockerStatsToTaskResourceUsage(s)
   173  				// send stats next interation if this is the first time received
   174  				// from docker
   175  				if !hasSentInitialStats {
   176  					timer.Reset(0)
   177  					hasSentInitialStats = true
   178  				}
   179  			}
   180  		}
   181  	}
   182  }