github.com/bigcommerce/nomad@v0.9.3-bc/drivers/docker/coordinator.go (about) 1 package docker 2 3 import ( 4 "context" 5 "fmt" 6 "regexp" 7 "sync" 8 "time" 9 10 docker "github.com/fsouza/go-dockerclient" 11 hclog "github.com/hashicorp/go-hclog" 12 "github.com/hashicorp/nomad/nomad/structs" 13 ) 14 15 var ( 16 // imageNotFoundMatcher is a regex expression that matches the image not 17 // found error Docker returns. 18 imageNotFoundMatcher = regexp.MustCompile(`Error: image .+ not found`) 19 ) 20 21 // pullFuture is a sharable future for retrieving a pulled images ID and any 22 // error that may have occurred during the pull. 23 type pullFuture struct { 24 waitCh chan struct{} 25 26 err error 27 imageID string 28 } 29 30 // newPullFuture returns a new pull future 31 func newPullFuture() *pullFuture { 32 return &pullFuture{ 33 waitCh: make(chan struct{}), 34 } 35 } 36 37 // wait waits till the future has a result 38 func (p *pullFuture) wait() *pullFuture { 39 <-p.waitCh 40 return p 41 } 42 43 // result returns the results of the future and should only ever be called after 44 // wait returns. 45 func (p *pullFuture) result() (imageID string, err error) { 46 return p.imageID, p.err 47 } 48 49 // set is used to set the results and unblock any waiter. This may only be 50 // called once. 51 func (p *pullFuture) set(imageID string, err error) { 52 p.imageID = imageID 53 p.err = err 54 close(p.waitCh) 55 } 56 57 // DockerImageClient provides the methods required to do CRUD operations on the 58 // Docker images 59 type DockerImageClient interface { 60 PullImage(opts docker.PullImageOptions, auth docker.AuthConfiguration) error 61 InspectImage(id string) (*docker.Image, error) 62 RemoveImage(id string) error 63 } 64 65 // LogEventFn is a callback which allows Drivers to emit task events. 66 type LogEventFn func(message string, annotations map[string]string) 67 68 // dockerCoordinatorConfig is used to configure the Docker coordinator. 69 type dockerCoordinatorConfig struct { 70 // logger is the logger the coordinator should use 71 logger hclog.Logger 72 73 // cleanup marks whether images should be deleted when the reference count 74 // is zero 75 cleanup bool 76 77 // client is the Docker client to use for communicating with Docker 78 client DockerImageClient 79 80 // removeDelay is the delay between an image's reference count going to 81 // zero and the image actually being deleted. 82 removeDelay time.Duration 83 } 84 85 // dockerCoordinator is used to coordinate actions against images to prevent 86 // racy deletions. It can be thought of as a reference counter on images. 87 type dockerCoordinator struct { 88 *dockerCoordinatorConfig 89 90 // imageLock is used to lock access to all images 91 imageLock sync.Mutex 92 93 // pullFutures is used to allow multiple callers to pull the same image but 94 // only have one request be sent to Docker 95 pullFutures map[string]*pullFuture 96 97 // pullLoggers is used to track the LogEventFn for each alloc pulling an image. 98 // If multiple alloc's are attempting to pull the same image, each will need 99 // to register its own LogEventFn with the coordinator. 100 pullLoggers map[string][]LogEventFn 101 102 // pullLoggerLock is used to sync access to the pullLoggers map 103 pullLoggerLock sync.RWMutex 104 105 // imageRefCount is the reference count of image IDs 106 imageRefCount map[string]map[string]struct{} 107 108 // deleteFuture is indexed by image ID and has a cancelable delete future 109 deleteFuture map[string]context.CancelFunc 110 } 111 112 // newDockerCoordinator returns a new Docker coordinator 113 func newDockerCoordinator(config *dockerCoordinatorConfig) *dockerCoordinator { 114 if config.client == nil { 115 return nil 116 } 117 118 return &dockerCoordinator{ 119 dockerCoordinatorConfig: config, 120 pullFutures: make(map[string]*pullFuture), 121 pullLoggers: make(map[string][]LogEventFn), 122 imageRefCount: make(map[string]map[string]struct{}), 123 deleteFuture: make(map[string]context.CancelFunc), 124 } 125 } 126 127 // PullImage is used to pull an image. It returns the pulled imaged ID or an 128 // error that occurred during the pull 129 func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConfiguration, callerID string, emitFn LogEventFn) (imageID string, err error) { 130 // Get the future 131 d.imageLock.Lock() 132 future, ok := d.pullFutures[image] 133 d.registerPullLogger(image, emitFn) 134 if !ok { 135 // Make the future 136 future = newPullFuture() 137 d.pullFutures[image] = future 138 go d.pullImageImpl(image, authOptions, future) 139 } 140 d.imageLock.Unlock() 141 142 // We unlock while we wait since this can take a while 143 id, err := future.wait().result() 144 145 d.imageLock.Lock() 146 defer d.imageLock.Unlock() 147 148 // Delete the future since we don't need it and we don't want to cache an 149 // image being there if it has possibly been manually deleted (outside of 150 // Nomad). 151 if _, ok := d.pullFutures[image]; ok { 152 delete(d.pullFutures, image) 153 } 154 155 // If we are cleaning up, we increment the reference count on the image 156 if err == nil && d.cleanup { 157 d.incrementImageReferenceImpl(id, image, callerID) 158 } 159 160 return id, err 161 } 162 163 // pullImageImpl is the implementation of pulling an image. The results are 164 // returned via the passed future 165 func (d *dockerCoordinator) pullImageImpl(image string, authOptions *docker.AuthConfiguration, future *pullFuture) { 166 defer d.clearPullLogger(image) 167 // Parse the repo and tag 168 repo, tag := parseDockerImage(image) 169 ctx, cancel := context.WithCancel(context.Background()) 170 defer cancel() 171 172 pm := newImageProgressManager(image, cancel, d.handlePullInactivity, 173 d.handlePullProgressReport, d.handleSlowPullProgressReport) 174 defer pm.stop() 175 176 pullOptions := docker.PullImageOptions{ 177 Repository: repo, 178 Tag: tag, 179 OutputStream: pm, 180 RawJSONStream: true, 181 Context: ctx, 182 } 183 184 // Attempt to pull the image 185 var auth docker.AuthConfiguration 186 if authOptions != nil { 187 auth = *authOptions 188 } 189 190 err := d.client.PullImage(pullOptions, auth) 191 192 if ctxErr := ctx.Err(); ctxErr == context.DeadlineExceeded { 193 d.logger.Error("timeout pulling container", "image_ref", dockerImageRef(repo, tag)) 194 future.set("", recoverablePullError(ctxErr, image)) 195 return 196 } 197 198 if err != nil { 199 d.logger.Error("failed pulling container", "image_ref", dockerImageRef(repo, tag), 200 "error", err) 201 future.set("", recoverablePullError(err, image)) 202 return 203 } 204 205 d.logger.Debug("docker pull succeeded", "image_ref", dockerImageRef(repo, tag)) 206 207 dockerImage, err := d.client.InspectImage(image) 208 if err != nil { 209 d.logger.Error("failed getting image id", "image_name", image, "error", err) 210 future.set("", recoverableErrTimeouts(err)) 211 return 212 } 213 214 future.set(dockerImage.ID, nil) 215 return 216 } 217 218 // IncrementImageReference is used to increment an image reference count 219 func (d *dockerCoordinator) IncrementImageReference(imageID, imageName, callerID string) { 220 d.imageLock.Lock() 221 defer d.imageLock.Unlock() 222 if d.cleanup { 223 d.incrementImageReferenceImpl(imageID, imageName, callerID) 224 } 225 } 226 227 // incrementImageReferenceImpl assumes the lock is held 228 func (d *dockerCoordinator) incrementImageReferenceImpl(imageID, imageName, callerID string) { 229 // Cancel any pending delete 230 if cancel, ok := d.deleteFuture[imageID]; ok { 231 d.logger.Debug("cancelling removal of container image", "image_name", imageName) 232 cancel() 233 delete(d.deleteFuture, imageID) 234 } 235 236 // Increment the reference 237 references, ok := d.imageRefCount[imageID] 238 if !ok { 239 references = make(map[string]struct{}) 240 d.imageRefCount[imageID] = references 241 } 242 243 if _, ok := references[callerID]; !ok { 244 references[callerID] = struct{}{} 245 d.logger.Debug("image reference count incremented", "image_name", imageName, "image_id", imageID, "references", len(references)) 246 } 247 } 248 249 // RemoveImage removes the given image. If there are any errors removing the 250 // image, the remove is retried internally. 251 func (d *dockerCoordinator) RemoveImage(imageID, callerID string) { 252 d.imageLock.Lock() 253 defer d.imageLock.Unlock() 254 255 if !d.cleanup { 256 return 257 } 258 259 references, ok := d.imageRefCount[imageID] 260 if !ok { 261 d.logger.Warn("RemoveImage on non-referenced counted image id", "image_id", imageID) 262 return 263 } 264 265 // Decrement the reference count 266 delete(references, callerID) 267 count := len(references) 268 d.logger.Debug("image id reference count decremented", "image_id", imageID, "references", count) 269 270 // Nothing to do 271 if count != 0 { 272 return 273 } 274 275 // This should never be the case but we safety guard so we don't leak a 276 // cancel. 277 if cancel, ok := d.deleteFuture[imageID]; ok { 278 d.logger.Error("image id has lingering delete future", "image_id", imageID) 279 cancel() 280 } 281 282 // Setup a future to delete the image 283 ctx, cancel := context.WithCancel(context.Background()) 284 d.deleteFuture[imageID] = cancel 285 go d.removeImageImpl(imageID, ctx) 286 287 // Delete the key from the reference count 288 delete(d.imageRefCount, imageID) 289 } 290 291 // removeImageImpl is used to remove an image. It wil wait the specified remove 292 // delay to remove the image. If the context is cancelled before that the image 293 // removal will be cancelled. 294 func (d *dockerCoordinator) removeImageImpl(id string, ctx context.Context) { 295 // Wait for the delay or a cancellation event 296 select { 297 case <-ctx.Done(): 298 // We have been cancelled 299 return 300 case <-time.After(d.removeDelay): 301 } 302 303 // Ensure we are suppose to delete. Do a short check while holding the lock 304 // so there can't be interleaving. There is still the smallest chance that 305 // the delete occurs after the image has been pulled but before it has been 306 // incremented. For handling that we just treat it as a recoverable error in 307 // the docker driver. 308 d.imageLock.Lock() 309 select { 310 case <-ctx.Done(): 311 d.imageLock.Unlock() 312 return 313 default: 314 } 315 d.imageLock.Unlock() 316 317 for i := 0; i < 3; i++ { 318 err := d.client.RemoveImage(id) 319 if err == nil { 320 break 321 } 322 323 if err == docker.ErrNoSuchImage { 324 d.logger.Debug("unable to cleanup image, does not exist", "image_id", id) 325 return 326 } 327 if derr, ok := err.(*docker.Error); ok && derr.Status == 409 { 328 d.logger.Debug("unable to cleanup image, still in use", "image_id", id) 329 return 330 } 331 332 // Retry on unknown errors 333 d.logger.Debug("failed to remove image", "image_id", id, "attempt", i+1, "error", err) 334 335 select { 336 case <-ctx.Done(): 337 // We have been cancelled 338 return 339 case <-time.After(3 * time.Second): 340 } 341 } 342 343 d.logger.Debug("cleanup removed downloaded image", "image_id", id) 344 345 // Cleanup the future from the map and free the context by cancelling it 346 d.imageLock.Lock() 347 if cancel, ok := d.deleteFuture[id]; ok { 348 delete(d.deleteFuture, id) 349 cancel() 350 } 351 d.imageLock.Unlock() 352 } 353 354 func (d *dockerCoordinator) registerPullLogger(image string, logger LogEventFn) { 355 d.pullLoggerLock.Lock() 356 defer d.pullLoggerLock.Unlock() 357 if _, ok := d.pullLoggers[image]; !ok { 358 d.pullLoggers[image] = []LogEventFn{} 359 } 360 d.pullLoggers[image] = append(d.pullLoggers[image], logger) 361 } 362 363 func (d *dockerCoordinator) clearPullLogger(image string) { 364 d.pullLoggerLock.Lock() 365 defer d.pullLoggerLock.Unlock() 366 delete(d.pullLoggers, image) 367 } 368 369 func (d *dockerCoordinator) emitEvent(image, message string, annotations map[string]string) { 370 d.pullLoggerLock.RLock() 371 defer d.pullLoggerLock.RUnlock() 372 for i := range d.pullLoggers[image] { 373 go d.pullLoggers[image][i](message, annotations) 374 } 375 } 376 377 func (d *dockerCoordinator) handlePullInactivity(image, msg string, timestamp time.Time) { 378 d.logger.Error("image pull aborted due to inactivity", "image_name", image, 379 "last_event_timestamp", timestamp.String(), "last_event", msg) 380 } 381 382 func (d *dockerCoordinator) handlePullProgressReport(image, msg string, _ time.Time) { 383 d.logger.Debug("image pull progress", "image_name", image, "message", msg) 384 } 385 386 func (d *dockerCoordinator) handleSlowPullProgressReport(image, msg string, _ time.Time) { 387 d.emitEvent(image, fmt.Sprintf("Docker image pull progress: %s", msg), map[string]string{ 388 "image": image, 389 }) 390 } 391 392 // recoverablePullError wraps the error gotten when trying to pull and image if 393 // the error is recoverable. 394 func recoverablePullError(err error, image string) error { 395 recoverable := true 396 if imageNotFoundMatcher.MatchString(err.Error()) { 397 recoverable = false 398 } 399 return structs.NewRecoverableError(fmt.Errorf("Failed to pull `%s`: %s", image, err), recoverable) 400 }