github.com/ilhicas/nomad@v0.11.8/drivers/docker/coordinator.go (about) 1 package docker 2 3 import ( 4 "context" 5 "fmt" 6 "regexp" 7 "sync" 8 "time" 9 10 docker "github.com/fsouza/go-dockerclient" 11 hclog "github.com/hashicorp/go-hclog" 12 "github.com/hashicorp/nomad/nomad/structs" 13 ) 14 15 var ( 16 // imageNotFoundMatcher is a regex expression that matches the image not 17 // found error Docker returns. 18 imageNotFoundMatcher = regexp.MustCompile(`Error: image .+ not found`) 19 ) 20 21 // pullFuture is a sharable future for retrieving a pulled images ID and any 22 // error that may have occurred during the pull. 23 type pullFuture struct { 24 waitCh chan struct{} 25 26 err error 27 imageID string 28 } 29 30 // newPullFuture returns a new pull future 31 func newPullFuture() *pullFuture { 32 return &pullFuture{ 33 waitCh: make(chan struct{}), 34 } 35 } 36 37 // wait waits till the future has a result 38 func (p *pullFuture) wait() *pullFuture { 39 <-p.waitCh 40 return p 41 } 42 43 // result returns the results of the future and should only ever be called after 44 // wait returns. 45 func (p *pullFuture) result() (imageID string, err error) { 46 return p.imageID, p.err 47 } 48 49 // set is used to set the results and unblock any waiter. This may only be 50 // called once. 51 func (p *pullFuture) set(imageID string, err error) { 52 p.imageID = imageID 53 p.err = err 54 close(p.waitCh) 55 } 56 57 // DockerImageClient provides the methods required to do CRUD operations on the 58 // Docker images 59 type DockerImageClient interface { 60 PullImage(opts docker.PullImageOptions, auth docker.AuthConfiguration) error 61 InspectImage(id string) (*docker.Image, error) 62 RemoveImage(id string) error 63 } 64 65 // LogEventFn is a callback which allows Drivers to emit task events. 66 type LogEventFn func(message string, annotations map[string]string) 67 68 // noopLogEventFn satisfies the LogEventFn type but noops when called 69 func noopLogEventFn(string, map[string]string) {} 70 71 // dockerCoordinatorConfig is used to configure the Docker coordinator. 72 type dockerCoordinatorConfig struct { 73 ctx context.Context 74 75 // logger is the logger the coordinator should use 76 logger hclog.Logger 77 78 // cleanup marks whether images should be deleted when the reference count 79 // is zero 80 cleanup bool 81 82 // client is the Docker client to use for communicating with Docker 83 client DockerImageClient 84 85 // removeDelay is the delay between an image's reference count going to 86 // zero and the image actually being deleted. 87 removeDelay time.Duration 88 } 89 90 // dockerCoordinator is used to coordinate actions against images to prevent 91 // racy deletions. It can be thought of as a reference counter on images. 92 type dockerCoordinator struct { 93 *dockerCoordinatorConfig 94 95 // imageLock is used to lock access to all images 96 imageLock sync.Mutex 97 98 // pullFutures is used to allow multiple callers to pull the same image but 99 // only have one request be sent to Docker 100 pullFutures map[string]*pullFuture 101 102 // pullLoggers is used to track the LogEventFn for each alloc pulling an image. 103 // If multiple alloc's are attempting to pull the same image, each will need 104 // to register its own LogEventFn with the coordinator. 105 pullLoggers map[string][]LogEventFn 106 107 // pullLoggerLock is used to sync access to the pullLoggers map 108 pullLoggerLock sync.RWMutex 109 110 // imageRefCount is the reference count of image IDs 111 imageRefCount map[string]map[string]struct{} 112 113 // deleteFuture is indexed by image ID and has a cancelable delete future 114 deleteFuture map[string]context.CancelFunc 115 } 116 117 // newDockerCoordinator returns a new Docker coordinator 118 func newDockerCoordinator(config *dockerCoordinatorConfig) *dockerCoordinator { 119 if config.client == nil { 120 return nil 121 } 122 123 return &dockerCoordinator{ 124 dockerCoordinatorConfig: config, 125 pullFutures: make(map[string]*pullFuture), 126 pullLoggers: make(map[string][]LogEventFn), 127 imageRefCount: make(map[string]map[string]struct{}), 128 deleteFuture: make(map[string]context.CancelFunc), 129 } 130 } 131 132 // PullImage is used to pull an image. It returns the pulled imaged ID or an 133 // error that occurred during the pull 134 func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConfiguration, callerID string, emitFn LogEventFn, pullActivityTimeout time.Duration) (imageID string, err error) { 135 // Get the future 136 d.imageLock.Lock() 137 future, ok := d.pullFutures[image] 138 d.registerPullLogger(image, emitFn) 139 if !ok { 140 // Make the future 141 future = newPullFuture() 142 d.pullFutures[image] = future 143 go d.pullImageImpl(image, authOptions, pullActivityTimeout, future) 144 } 145 d.imageLock.Unlock() 146 147 // We unlock while we wait since this can take a while 148 id, err := future.wait().result() 149 150 d.imageLock.Lock() 151 defer d.imageLock.Unlock() 152 153 // Delete the future since we don't need it and we don't want to cache an 154 // image being there if it has possibly been manually deleted (outside of 155 // Nomad). 156 if _, ok := d.pullFutures[image]; ok { 157 delete(d.pullFutures, image) 158 } 159 160 // If we are cleaning up, we increment the reference count on the image 161 if err == nil && d.cleanup { 162 d.incrementImageReferenceImpl(id, image, callerID) 163 } 164 165 return id, err 166 } 167 168 // pullImageImpl is the implementation of pulling an image. The results are 169 // returned via the passed future 170 func (d *dockerCoordinator) pullImageImpl(image string, authOptions *docker.AuthConfiguration, pullActivityTimeout time.Duration, future *pullFuture) { 171 defer d.clearPullLogger(image) 172 // Parse the repo and tag 173 repo, tag := parseDockerImage(image) 174 ctx, cancel := context.WithCancel(context.Background()) 175 defer cancel() 176 177 pm := newImageProgressManager(image, cancel, pullActivityTimeout, d.handlePullInactivity, 178 d.handlePullProgressReport, d.handleSlowPullProgressReport) 179 defer pm.stop() 180 181 pullOptions := docker.PullImageOptions{ 182 Repository: repo, 183 Tag: tag, 184 OutputStream: pm, 185 RawJSONStream: true, 186 Context: ctx, 187 } 188 189 // Attempt to pull the image 190 var auth docker.AuthConfiguration 191 if authOptions != nil { 192 auth = *authOptions 193 } 194 195 err := d.client.PullImage(pullOptions, auth) 196 197 if ctxErr := ctx.Err(); ctxErr == context.DeadlineExceeded { 198 d.logger.Error("timeout pulling container", "image_ref", dockerImageRef(repo, tag)) 199 future.set("", recoverablePullError(ctxErr, image)) 200 return 201 } 202 203 if err != nil { 204 d.logger.Error("failed pulling container", "image_ref", dockerImageRef(repo, tag), 205 "error", err) 206 future.set("", recoverablePullError(err, image)) 207 return 208 } 209 210 d.logger.Debug("docker pull succeeded", "image_ref", dockerImageRef(repo, tag)) 211 212 dockerImage, err := d.client.InspectImage(image) 213 if err != nil { 214 d.logger.Error("failed getting image id", "image_name", image, "error", err) 215 future.set("", recoverableErrTimeouts(err)) 216 return 217 } 218 219 future.set(dockerImage.ID, nil) 220 return 221 } 222 223 // IncrementImageReference is used to increment an image reference count 224 func (d *dockerCoordinator) IncrementImageReference(imageID, imageName, callerID string) { 225 d.imageLock.Lock() 226 defer d.imageLock.Unlock() 227 if d.cleanup { 228 d.incrementImageReferenceImpl(imageID, imageName, callerID) 229 } 230 } 231 232 // incrementImageReferenceImpl assumes the lock is held 233 func (d *dockerCoordinator) incrementImageReferenceImpl(imageID, imageName, callerID string) { 234 // Cancel any pending delete 235 if cancel, ok := d.deleteFuture[imageID]; ok { 236 d.logger.Debug("cancelling removal of container image", "image_name", imageName) 237 cancel() 238 delete(d.deleteFuture, imageID) 239 } 240 241 // Increment the reference 242 references, ok := d.imageRefCount[imageID] 243 if !ok { 244 references = make(map[string]struct{}) 245 d.imageRefCount[imageID] = references 246 } 247 248 if _, ok := references[callerID]; !ok { 249 references[callerID] = struct{}{} 250 d.logger.Debug("image reference count incremented", "image_name", imageName, "image_id", imageID, "references", len(references)) 251 } 252 } 253 254 // RemoveImage removes the given image. If there are any errors removing the 255 // image, the remove is retried internally. 256 func (d *dockerCoordinator) RemoveImage(imageID, callerID string) { 257 d.imageLock.Lock() 258 defer d.imageLock.Unlock() 259 260 if !d.cleanup { 261 return 262 } 263 264 references, ok := d.imageRefCount[imageID] 265 if !ok { 266 d.logger.Warn("RemoveImage on non-referenced counted image id", "image_id", imageID) 267 return 268 } 269 270 // Decrement the reference count 271 delete(references, callerID) 272 count := len(references) 273 d.logger.Debug("image id reference count decremented", "image_id", imageID, "references", count) 274 275 // Nothing to do 276 if count != 0 { 277 return 278 } 279 280 // This should never be the case but we safety guard so we don't leak a 281 // cancel. 282 if cancel, ok := d.deleteFuture[imageID]; ok { 283 d.logger.Error("image id has lingering delete future", "image_id", imageID) 284 cancel() 285 } 286 287 // Setup a future to delete the image 288 ctx, cancel := context.WithCancel(d.ctx) 289 d.deleteFuture[imageID] = cancel 290 go d.removeImageImpl(imageID, ctx) 291 292 // Delete the key from the reference count 293 delete(d.imageRefCount, imageID) 294 } 295 296 // removeImageImpl is used to remove an image. It wil wait the specified remove 297 // delay to remove the image. If the context is cancelled before that the image 298 // removal will be cancelled. 299 func (d *dockerCoordinator) removeImageImpl(id string, ctx context.Context) { 300 // Wait for the delay or a cancellation event 301 select { 302 case <-ctx.Done(): 303 // We have been cancelled 304 return 305 case <-time.After(d.removeDelay): 306 } 307 308 // Ensure we are suppose to delete. Do a short check while holding the lock 309 // so there can't be interleaving. There is still the smallest chance that 310 // the delete occurs after the image has been pulled but before it has been 311 // incremented. For handling that we just treat it as a recoverable error in 312 // the docker driver. 313 d.imageLock.Lock() 314 select { 315 case <-ctx.Done(): 316 d.imageLock.Unlock() 317 return 318 default: 319 } 320 d.imageLock.Unlock() 321 322 for i := 0; i < 3; i++ { 323 err := d.client.RemoveImage(id) 324 if err == nil { 325 break 326 } 327 328 if err == docker.ErrNoSuchImage { 329 d.logger.Debug("unable to cleanup image, does not exist", "image_id", id) 330 return 331 } 332 if derr, ok := err.(*docker.Error); ok && derr.Status == 409 { 333 d.logger.Debug("unable to cleanup image, still in use", "image_id", id) 334 return 335 } 336 337 // Retry on unknown errors 338 d.logger.Debug("failed to remove image", "image_id", id, "attempt", i+1, "error", err) 339 340 select { 341 case <-ctx.Done(): 342 // We have been cancelled 343 return 344 case <-time.After(3 * time.Second): 345 } 346 } 347 348 d.logger.Debug("cleanup removed downloaded image", "image_id", id) 349 350 // Cleanup the future from the map and free the context by cancelling it 351 d.imageLock.Lock() 352 if cancel, ok := d.deleteFuture[id]; ok { 353 delete(d.deleteFuture, id) 354 cancel() 355 } 356 d.imageLock.Unlock() 357 } 358 359 func (d *dockerCoordinator) registerPullLogger(image string, logger LogEventFn) { 360 d.pullLoggerLock.Lock() 361 defer d.pullLoggerLock.Unlock() 362 if _, ok := d.pullLoggers[image]; !ok { 363 d.pullLoggers[image] = []LogEventFn{} 364 } 365 d.pullLoggers[image] = append(d.pullLoggers[image], logger) 366 } 367 368 func (d *dockerCoordinator) clearPullLogger(image string) { 369 d.pullLoggerLock.Lock() 370 defer d.pullLoggerLock.Unlock() 371 delete(d.pullLoggers, image) 372 } 373 374 func (d *dockerCoordinator) emitEvent(image, message string, annotations map[string]string) { 375 d.pullLoggerLock.RLock() 376 defer d.pullLoggerLock.RUnlock() 377 for i := range d.pullLoggers[image] { 378 go d.pullLoggers[image][i](message, annotations) 379 } 380 } 381 382 func (d *dockerCoordinator) handlePullInactivity(image, msg string, timestamp time.Time) { 383 d.logger.Error("image pull aborted due to inactivity", "image_name", image, 384 "last_event_timestamp", timestamp.String(), "last_event", msg) 385 } 386 387 func (d *dockerCoordinator) handlePullProgressReport(image, msg string, _ time.Time) { 388 d.logger.Debug("image pull progress", "image_name", image, "message", msg) 389 } 390 391 func (d *dockerCoordinator) handleSlowPullProgressReport(image, msg string, _ time.Time) { 392 d.emitEvent(image, fmt.Sprintf("Docker image pull progress: %s", msg), map[string]string{ 393 "image": image, 394 }) 395 } 396 397 // recoverablePullError wraps the error gotten when trying to pull and image if 398 // the error is recoverable. 399 func recoverablePullError(err error, image string) error { 400 recoverable := true 401 if imageNotFoundMatcher.MatchString(err.Error()) { 402 recoverable = false 403 } 404 return structs.NewRecoverableError(fmt.Errorf("Failed to pull `%s`: %s", image, err), recoverable) 405 }