github.com/hernad/nomad@v1.6.112/drivers/docker/stats.go (about) 1 // Copyright (c) HashiCorp, Inc. 2 // SPDX-License-Identifier: MPL-2.0 3 4 package docker 5 6 import ( 7 "context" 8 "fmt" 9 "io" 10 "sync" 11 "time" 12 13 docker "github.com/fsouza/go-dockerclient" 14 cstructs "github.com/hernad/nomad/client/structs" 15 "github.com/hernad/nomad/drivers/docker/util" 16 "github.com/hernad/nomad/helper" 17 nstructs "github.com/hernad/nomad/nomad/structs" 18 ) 19 20 const ( 21 // statsCollectorBackoffBaseline is the baseline time for exponential 22 // backoff while calling the docker stats api. 23 statsCollectorBackoffBaseline = 5 * time.Second 24 25 // statsCollectorBackoffLimit is the limit of the exponential backoff for 26 // calling the docker stats api. 27 statsCollectorBackoffLimit = 2 * time.Minute 28 ) 29 30 // usageSender wraps a TaskResourceUsage chan such that it supports concurrent 31 // sending and closing, and backpressures by dropping events if necessary. 32 type usageSender struct { 33 closed bool 34 destCh chan<- *cstructs.TaskResourceUsage 35 mu sync.Mutex 36 } 37 38 // newStatsChanPipe returns a chan wrapped in a struct that supports concurrent 39 // sending and closing, and the receiver end of the chan. 40 func newStatsChanPipe() (*usageSender, <-chan *cstructs.TaskResourceUsage) { 41 destCh := make(chan *cstructs.TaskResourceUsage, 1) 42 return &usageSender{ 43 destCh: destCh, 44 }, destCh 45 46 } 47 48 // send resource usage to the receiver unless the chan is already full or 49 // closed. 50 func (u *usageSender) send(tru *cstructs.TaskResourceUsage) { 51 u.mu.Lock() 52 defer u.mu.Unlock() 53 54 if u.closed { 55 return 56 } 57 58 select { 59 case u.destCh <- tru: 60 default: 61 // Backpressure caused missed interval 62 } 63 } 64 65 // close resource usage. Any further sends will be dropped. 66 func (u *usageSender) close() { 67 u.mu.Lock() 68 defer u.mu.Unlock() 69 70 if u.closed { 71 // already closed 72 return 73 } 74 75 u.closed = true 76 close(u.destCh) 77 } 78 79 // Stats starts collecting stats from the docker daemon and sends them on the 80 // returned channel. 81 func (h *taskHandle) Stats(ctx context.Context, interval time.Duration) (<-chan *cstructs.TaskResourceUsage, error) { 82 select { 83 case <-h.doneCh: 84 return nil, nstructs.NewRecoverableError(fmt.Errorf("container stopped"), false) 85 default: 86 } 87 88 destCh, recvCh := newStatsChanPipe() 89 go h.collectStats(ctx, destCh, interval) 90 return recvCh, nil 91 } 92 93 // collectStats starts collecting resource usage stats of a docker container 94 func (h *taskHandle) collectStats(ctx context.Context, destCh *usageSender, interval time.Duration) { 95 defer destCh.close() 96 97 // backoff and retry used if the docker stats API returns an error 98 var backoff time.Duration = 0 99 var retry int 100 101 // create an interval timer 102 timer, stop := helper.NewSafeTimer(backoff) 103 defer stop() 104 105 // loops until doneCh is closed 106 for { 107 timer.Reset(backoff) 108 109 if backoff > 0 { 110 select { 111 case <-timer.C: 112 case <-ctx.Done(): 113 return 114 case <-h.doneCh: 115 return 116 } 117 } 118 119 // make a channel for docker stats structs and start a collector to 120 // receive stats from docker and emit nomad stats 121 // statsCh will always be closed by docker client. 122 statsCh := make(chan *docker.Stats) 123 go dockerStatsCollector(destCh, statsCh, interval) 124 125 statsOpts := docker.StatsOptions{ 126 ID: h.containerID, 127 Context: ctx, 128 Done: h.doneCh, 129 Stats: statsCh, 130 Stream: true, 131 } 132 133 // Stats blocks until an error has occurred, or doneCh has been closed 134 if err := h.dockerClient.Stats(statsOpts); err != nil && err != io.ErrClosedPipe { 135 // An error occurred during stats collection, retry with backoff 136 h.logger.Debug("error collecting stats from container", "error", err) 137 138 // Calculate the new backoff 139 backoff = (1 << (2 * uint64(retry))) * statsCollectorBackoffBaseline 140 if backoff > statsCollectorBackoffLimit { 141 backoff = statsCollectorBackoffLimit 142 } 143 // Increment retry counter 144 retry++ 145 continue 146 } 147 // Stats finished either because context was canceled, doneCh was closed 148 // or the container stopped. Stop stats collections. 149 return 150 } 151 } 152 153 func dockerStatsCollector(destCh *usageSender, statsCh <-chan *docker.Stats, interval time.Duration) { 154 var resourceUsage *cstructs.TaskResourceUsage 155 156 // hasSentInitialStats is used so as to emit the first stats received from 157 // the docker daemon 158 var hasSentInitialStats bool 159 160 // timer is used to send nomad status at the specified interval 161 timer := time.NewTimer(interval) 162 for { 163 select { 164 case <-timer.C: 165 // it is possible for the timer to go off before the first stats 166 // has been emitted from docker 167 if resourceUsage == nil { 168 continue 169 } 170 171 // sending to destCh could block, drop this interval if it does 172 destCh.send(resourceUsage) 173 174 timer.Reset(interval) 175 176 case s, ok := <-statsCh: 177 // if statsCh is closed stop collection 178 if !ok { 179 return 180 } 181 // s should always be set, but check and skip just in case 182 if s != nil { 183 resourceUsage = util.DockerStatsToTaskResourceUsage(s) 184 // send stats next interation if this is the first time received 185 // from docker 186 if !hasSentInitialStats { 187 timer.Reset(0) 188 hasSentInitialStats = true 189 } 190 } 191 } 192 } 193 }