github.com/hernad/nomad@v1.6.112/drivers/docker/stats.go (about)

     1  // Copyright (c) HashiCorp, Inc.
     2  // SPDX-License-Identifier: MPL-2.0
     3  
     4  package docker
     5  
     6  import (
     7  	"context"
     8  	"fmt"
     9  	"io"
    10  	"sync"
    11  	"time"
    12  
    13  	docker "github.com/fsouza/go-dockerclient"
    14  	cstructs "github.com/hernad/nomad/client/structs"
    15  	"github.com/hernad/nomad/drivers/docker/util"
    16  	"github.com/hernad/nomad/helper"
    17  	nstructs "github.com/hernad/nomad/nomad/structs"
    18  )
    19  
    20  const (
    21  	// statsCollectorBackoffBaseline is the baseline time for exponential
    22  	// backoff while calling the docker stats api.
    23  	statsCollectorBackoffBaseline = 5 * time.Second
    24  
    25  	// statsCollectorBackoffLimit is the limit of the exponential backoff for
    26  	// calling the docker stats api.
    27  	statsCollectorBackoffLimit = 2 * time.Minute
    28  )
    29  
    30  // usageSender wraps a TaskResourceUsage chan such that it supports concurrent
    31  // sending and closing, and backpressures by dropping events if necessary.
    32  type usageSender struct {
    33  	closed bool
    34  	destCh chan<- *cstructs.TaskResourceUsage
    35  	mu     sync.Mutex
    36  }
    37  
    38  // newStatsChanPipe returns a chan wrapped in a struct that supports concurrent
    39  // sending and closing, and the receiver end of the chan.
    40  func newStatsChanPipe() (*usageSender, <-chan *cstructs.TaskResourceUsage) {
    41  	destCh := make(chan *cstructs.TaskResourceUsage, 1)
    42  	return &usageSender{
    43  		destCh: destCh,
    44  	}, destCh
    45  
    46  }
    47  
    48  // send resource usage to the receiver unless the chan is already full or
    49  // closed.
    50  func (u *usageSender) send(tru *cstructs.TaskResourceUsage) {
    51  	u.mu.Lock()
    52  	defer u.mu.Unlock()
    53  
    54  	if u.closed {
    55  		return
    56  	}
    57  
    58  	select {
    59  	case u.destCh <- tru:
    60  	default:
    61  		// Backpressure caused missed interval
    62  	}
    63  }
    64  
    65  // close resource usage. Any further sends will be dropped.
    66  func (u *usageSender) close() {
    67  	u.mu.Lock()
    68  	defer u.mu.Unlock()
    69  
    70  	if u.closed {
    71  		// already closed
    72  		return
    73  	}
    74  
    75  	u.closed = true
    76  	close(u.destCh)
    77  }
    78  
    79  // Stats starts collecting stats from the docker daemon and sends them on the
    80  // returned channel.
    81  func (h *taskHandle) Stats(ctx context.Context, interval time.Duration) (<-chan *cstructs.TaskResourceUsage, error) {
    82  	select {
    83  	case <-h.doneCh:
    84  		return nil, nstructs.NewRecoverableError(fmt.Errorf("container stopped"), false)
    85  	default:
    86  	}
    87  
    88  	destCh, recvCh := newStatsChanPipe()
    89  	go h.collectStats(ctx, destCh, interval)
    90  	return recvCh, nil
    91  }
    92  
    93  // collectStats starts collecting resource usage stats of a docker container
    94  func (h *taskHandle) collectStats(ctx context.Context, destCh *usageSender, interval time.Duration) {
    95  	defer destCh.close()
    96  
    97  	// backoff and retry used if the docker stats API returns an error
    98  	var backoff time.Duration = 0
    99  	var retry int
   100  
   101  	// create an interval timer
   102  	timer, stop := helper.NewSafeTimer(backoff)
   103  	defer stop()
   104  
   105  	// loops until doneCh is closed
   106  	for {
   107  		timer.Reset(backoff)
   108  
   109  		if backoff > 0 {
   110  			select {
   111  			case <-timer.C:
   112  			case <-ctx.Done():
   113  				return
   114  			case <-h.doneCh:
   115  				return
   116  			}
   117  		}
   118  
   119  		// make a channel for docker stats structs and start a collector to
   120  		// receive stats from docker and emit nomad stats
   121  		// statsCh will always be closed by docker client.
   122  		statsCh := make(chan *docker.Stats)
   123  		go dockerStatsCollector(destCh, statsCh, interval)
   124  
   125  		statsOpts := docker.StatsOptions{
   126  			ID:      h.containerID,
   127  			Context: ctx,
   128  			Done:    h.doneCh,
   129  			Stats:   statsCh,
   130  			Stream:  true,
   131  		}
   132  
   133  		// Stats blocks until an error has occurred, or doneCh has been closed
   134  		if err := h.dockerClient.Stats(statsOpts); err != nil && err != io.ErrClosedPipe {
   135  			// An error occurred during stats collection, retry with backoff
   136  			h.logger.Debug("error collecting stats from container", "error", err)
   137  
   138  			// Calculate the new backoff
   139  			backoff = (1 << (2 * uint64(retry))) * statsCollectorBackoffBaseline
   140  			if backoff > statsCollectorBackoffLimit {
   141  				backoff = statsCollectorBackoffLimit
   142  			}
   143  			// Increment retry counter
   144  			retry++
   145  			continue
   146  		}
   147  		// Stats finished either because context was canceled, doneCh was closed
   148  		// or the container stopped. Stop stats collections.
   149  		return
   150  	}
   151  }
   152  
   153  func dockerStatsCollector(destCh *usageSender, statsCh <-chan *docker.Stats, interval time.Duration) {
   154  	var resourceUsage *cstructs.TaskResourceUsage
   155  
   156  	// hasSentInitialStats is used so as to emit the first stats received from
   157  	// the docker daemon
   158  	var hasSentInitialStats bool
   159  
   160  	// timer is used to send nomad status at the specified interval
   161  	timer := time.NewTimer(interval)
   162  	for {
   163  		select {
   164  		case <-timer.C:
   165  			// it is possible for the timer to go off before the first stats
   166  			// has been emitted from docker
   167  			if resourceUsage == nil {
   168  				continue
   169  			}
   170  
   171  			// sending to destCh could block, drop this interval if it does
   172  			destCh.send(resourceUsage)
   173  
   174  			timer.Reset(interval)
   175  
   176  		case s, ok := <-statsCh:
   177  			// if statsCh is closed stop collection
   178  			if !ok {
   179  				return
   180  			}
   181  			// s should always be set, but check and skip just in case
   182  			if s != nil {
   183  				resourceUsage = util.DockerStatsToTaskResourceUsage(s)
   184  				// send stats next interation if this is the first time received
   185  				// from docker
   186  				if !hasSentInitialStats {
   187  					timer.Reset(0)
   188  					hasSentInitialStats = true
   189  				}
   190  			}
   191  		}
   192  	}
   193  }