github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/drivers/docker/stats.go (about) 1 package docker 2 3 import ( 4 "context" 5 "fmt" 6 "io" 7 "sync" 8 "time" 9 10 docker "github.com/fsouza/go-dockerclient" 11 cstructs "github.com/hashicorp/nomad/client/structs" 12 "github.com/hashicorp/nomad/drivers/docker/util" 13 "github.com/hashicorp/nomad/helper" 14 nstructs "github.com/hashicorp/nomad/nomad/structs" 15 ) 16 17 const ( 18 // statsCollectorBackoffBaseline is the baseline time for exponential 19 // backoff while calling the docker stats api. 20 statsCollectorBackoffBaseline = 5 * time.Second 21 22 // statsCollectorBackoffLimit is the limit of the exponential backoff for 23 // calling the docker stats api. 24 statsCollectorBackoffLimit = 2 * time.Minute 25 ) 26 27 // usageSender wraps a TaskResourceUsage chan such that it supports concurrent 28 // sending and closing, and backpressures by dropping events if necessary. 29 type usageSender struct { 30 closed bool 31 destCh chan<- *cstructs.TaskResourceUsage 32 mu sync.Mutex 33 } 34 35 // newStatsChanPipe returns a chan wrapped in a struct that supports concurrent 36 // sending and closing, and the receiver end of the chan. 37 func newStatsChanPipe() (*usageSender, <-chan *cstructs.TaskResourceUsage) { 38 destCh := make(chan *cstructs.TaskResourceUsage, 1) 39 return &usageSender{ 40 destCh: destCh, 41 }, destCh 42 43 } 44 45 // send resource usage to the receiver unless the chan is already full or 46 // closed. 47 func (u *usageSender) send(tru *cstructs.TaskResourceUsage) { 48 u.mu.Lock() 49 defer u.mu.Unlock() 50 51 if u.closed { 52 return 53 } 54 55 select { 56 case u.destCh <- tru: 57 default: 58 // Backpressure caused missed interval 59 } 60 } 61 62 // close resource usage. Any further sends will be dropped. 63 func (u *usageSender) close() { 64 u.mu.Lock() 65 defer u.mu.Unlock() 66 67 if u.closed { 68 // already closed 69 return 70 } 71 72 u.closed = true 73 close(u.destCh) 74 } 75 76 // Stats starts collecting stats from the docker daemon and sends them on the 77 // returned channel. 78 func (h *taskHandle) Stats(ctx context.Context, interval time.Duration) (<-chan *cstructs.TaskResourceUsage, error) { 79 select { 80 case <-h.doneCh: 81 return nil, nstructs.NewRecoverableError(fmt.Errorf("container stopped"), false) 82 default: 83 } 84 85 destCh, recvCh := newStatsChanPipe() 86 go h.collectStats(ctx, destCh, interval) 87 return recvCh, nil 88 } 89 90 // collectStats starts collecting resource usage stats of a docker container 91 func (h *taskHandle) collectStats(ctx context.Context, destCh *usageSender, interval time.Duration) { 92 defer destCh.close() 93 94 // backoff and retry used if the docker stats API returns an error 95 var backoff time.Duration = 0 96 var retry int 97 98 // create an interval timer 99 timer, stop := helper.NewSafeTimer(backoff) 100 defer stop() 101 102 // loops until doneCh is closed 103 for { 104 timer.Reset(backoff) 105 106 if backoff > 0 { 107 select { 108 case <-timer.C: 109 case <-ctx.Done(): 110 return 111 case <-h.doneCh: 112 return 113 } 114 } 115 116 // make a channel for docker stats structs and start a collector to 117 // receive stats from docker and emit nomad stats 118 // statsCh will always be closed by docker client. 119 statsCh := make(chan *docker.Stats) 120 go dockerStatsCollector(destCh, statsCh, interval) 121 122 statsOpts := docker.StatsOptions{ 123 ID: h.containerID, 124 Context: ctx, 125 Done: h.doneCh, 126 Stats: statsCh, 127 Stream: true, 128 } 129 130 // Stats blocks until an error has occurred, or doneCh has been closed 131 if err := h.client.Stats(statsOpts); err != nil && err != io.ErrClosedPipe { 132 // An error occurred during stats collection, retry with backoff 133 h.logger.Debug("error collecting stats from container", "error", err) 134 135 // Calculate the new backoff 136 backoff = (1 << (2 * uint64(retry))) * statsCollectorBackoffBaseline 137 if backoff > statsCollectorBackoffLimit { 138 backoff = statsCollectorBackoffLimit 139 } 140 // Increment retry counter 141 retry++ 142 continue 143 } 144 // Stats finished either because context was canceled, doneCh was closed 145 // or the container stopped. Stop stats collections. 146 return 147 } 148 } 149 150 func dockerStatsCollector(destCh *usageSender, statsCh <-chan *docker.Stats, interval time.Duration) { 151 var resourceUsage *cstructs.TaskResourceUsage 152 153 // hasSentInitialStats is used so as to emit the first stats received from 154 // the docker daemon 155 var hasSentInitialStats bool 156 157 // timer is used to send nomad status at the specified interval 158 timer := time.NewTimer(interval) 159 for { 160 select { 161 case <-timer.C: 162 // it is possible for the timer to go off before the first stats 163 // has been emitted from docker 164 if resourceUsage == nil { 165 continue 166 } 167 168 // sending to destCh could block, drop this interval if it does 169 destCh.send(resourceUsage) 170 171 timer.Reset(interval) 172 173 case s, ok := <-statsCh: 174 // if statsCh is closed stop collection 175 if !ok { 176 return 177 } 178 // s should always be set, but check and skip just in case 179 if s != nil { 180 resourceUsage = util.DockerStatsToTaskResourceUsage(s) 181 // send stats next interation if this is the first time received 182 // from docker 183 if !hasSentInitialStats { 184 timer.Reset(0) 185 hasSentInitialStats = true 186 } 187 } 188 } 189 } 190 }