github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/drivers/docker/coordinator.go (about) 1 package docker 2 3 import ( 4 "context" 5 "fmt" 6 "regexp" 7 "sync" 8 "time" 9 10 docker "github.com/fsouza/go-dockerclient" 11 hclog "github.com/hashicorp/go-hclog" 12 "github.com/hashicorp/nomad/nomad/structs" 13 ) 14 15 var ( 16 // imageNotFoundMatcher is a regex expression that matches the image not 17 // found error Docker returns. 18 imageNotFoundMatcher = regexp.MustCompile(`Error: image .+ not found`) 19 ) 20 21 // pullFuture is a sharable future for retrieving a pulled images ID and any 22 // error that may have occurred during the pull. 23 type pullFuture struct { 24 waitCh chan struct{} 25 26 err error 27 imageID string 28 } 29 30 // newPullFuture returns a new pull future 31 func newPullFuture() *pullFuture { 32 return &pullFuture{ 33 waitCh: make(chan struct{}), 34 } 35 } 36 37 // wait waits till the future has a result 38 func (p *pullFuture) wait() *pullFuture { 39 <-p.waitCh 40 return p 41 } 42 43 // result returns the results of the future and should only ever be called after 44 // wait returns. 45 func (p *pullFuture) result() (imageID string, err error) { 46 return p.imageID, p.err 47 } 48 49 // set is used to set the results and unblock any waiter. This may only be 50 // called once. 51 func (p *pullFuture) set(imageID string, err error) { 52 p.imageID = imageID 53 p.err = err 54 close(p.waitCh) 55 } 56 57 // DockerImageClient provides the methods required to do CRUD operations on the 58 // Docker images 59 type DockerImageClient interface { 60 PullImage(opts docker.PullImageOptions, auth docker.AuthConfiguration) error 61 InspectImage(id string) (*docker.Image, error) 62 RemoveImage(id string) error 63 } 64 65 // LogEventFn is a callback which allows Drivers to emit task events. 66 type LogEventFn func(message string, annotations map[string]string) 67 68 // noopLogEventFn satisfies the LogEventFn type but noops when called 69 func noopLogEventFn(string, map[string]string) {} 70 71 // dockerCoordinatorConfig is used to configure the Docker coordinator. 72 type dockerCoordinatorConfig struct { 73 ctx context.Context 74 75 // logger is the logger the coordinator should use 76 logger hclog.Logger 77 78 // cleanup marks whether images should be deleted when the reference count 79 // is zero 80 cleanup bool 81 82 // client is the Docker client to use for communicating with Docker 83 client DockerImageClient 84 85 // removeDelay is the delay between an image's reference count going to 86 // zero and the image actually being deleted. 87 removeDelay time.Duration 88 } 89 90 // dockerCoordinator is used to coordinate actions against images to prevent 91 // racy deletions. It can be thought of as a reference counter on images. 92 type dockerCoordinator struct { 93 *dockerCoordinatorConfig 94 95 // imageLock is used to lock access to all images 96 imageLock sync.Mutex 97 98 // pullFutures is used to allow multiple callers to pull the same image but 99 // only have one request be sent to Docker 100 pullFutures map[string]*pullFuture 101 102 // pullLoggers is used to track the LogEventFn for each alloc pulling an image. 103 // If multiple alloc's are attempting to pull the same image, each will need 104 // to register its own LogEventFn with the coordinator. 105 pullLoggers map[string][]LogEventFn 106 107 // pullLoggerLock is used to sync access to the pullLoggers map 108 pullLoggerLock sync.RWMutex 109 110 // imageRefCount is the reference count of image IDs 111 imageRefCount map[string]map[string]struct{} 112 113 // deleteFuture is indexed by image ID and has a cancelable delete future 114 deleteFuture map[string]context.CancelFunc 115 } 116 117 // newDockerCoordinator returns a new Docker coordinator 118 func newDockerCoordinator(config *dockerCoordinatorConfig) *dockerCoordinator { 119 if config.client == nil { 120 return nil 121 } 122 123 return &dockerCoordinator{ 124 dockerCoordinatorConfig: config, 125 pullFutures: make(map[string]*pullFuture), 126 pullLoggers: make(map[string][]LogEventFn), 127 imageRefCount: make(map[string]map[string]struct{}), 128 deleteFuture: make(map[string]context.CancelFunc), 129 } 130 } 131 132 // PullImage is used to pull an image. It returns the pulled imaged ID or an 133 // error that occurred during the pull 134 func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConfiguration, callerID string, 135 emitFn LogEventFn, pullTimeout, pullActivityTimeout time.Duration) (imageID string, err error) { 136 // Get the future 137 d.imageLock.Lock() 138 future, ok := d.pullFutures[image] 139 d.registerPullLogger(image, emitFn) 140 if !ok { 141 // Make the future 142 future = newPullFuture() 143 d.pullFutures[image] = future 144 go d.pullImageImpl(image, authOptions, pullTimeout, pullActivityTimeout, future) 145 } 146 d.imageLock.Unlock() 147 148 // We unlock while we wait since this can take a while 149 id, err := future.wait().result() 150 151 d.imageLock.Lock() 152 defer d.imageLock.Unlock() 153 154 // Delete the future since we don't need it and we don't want to cache an 155 // image being there if it has possibly been manually deleted (outside of 156 // Nomad). 157 delete(d.pullFutures, image) 158 159 // If we are cleaning up, we increment the reference count on the image 160 if err == nil && d.cleanup { 161 d.incrementImageReferenceImpl(id, image, callerID) 162 } 163 164 return id, err 165 } 166 167 // pullImageImpl is the implementation of pulling an image. The results are 168 // returned via the passed future 169 func (d *dockerCoordinator) pullImageImpl(image string, authOptions *docker.AuthConfiguration, 170 pullTimeout, pullActivityTimeout time.Duration, future *pullFuture) { 171 172 defer d.clearPullLogger(image) 173 // Parse the repo and tag 174 repo, tag := parseDockerImage(image) 175 ctx, cancel := context.WithTimeout(context.Background(), pullTimeout) 176 defer cancel() 177 178 pm := newImageProgressManager(image, cancel, pullActivityTimeout, d.handlePullInactivity, 179 d.handlePullProgressReport, d.handleSlowPullProgressReport) 180 defer pm.stop() 181 182 pullOptions := docker.PullImageOptions{ 183 Repository: repo, 184 Tag: tag, 185 OutputStream: pm, 186 RawJSONStream: true, 187 Context: ctx, 188 } 189 190 // Attempt to pull the image 191 var auth docker.AuthConfiguration 192 if authOptions != nil { 193 auth = *authOptions 194 } 195 196 err := d.client.PullImage(pullOptions, auth) 197 198 if ctxErr := ctx.Err(); ctxErr == context.DeadlineExceeded { 199 d.logger.Error("timeout pulling container", "image_ref", dockerImageRef(repo, tag)) 200 future.set("", recoverablePullError(ctxErr, image)) 201 return 202 } 203 204 if err != nil { 205 d.logger.Error("failed pulling container", "image_ref", dockerImageRef(repo, tag), 206 "error", err) 207 future.set("", recoverablePullError(err, image)) 208 return 209 } 210 211 d.logger.Debug("docker pull succeeded", "image_ref", dockerImageRef(repo, tag)) 212 213 dockerImage, err := d.client.InspectImage(image) 214 if err != nil { 215 d.logger.Error("failed getting image id", "image_name", image, "error", err) 216 future.set("", recoverableErrTimeouts(err)) 217 return 218 } 219 220 future.set(dockerImage.ID, nil) 221 } 222 223 // IncrementImageReference is used to increment an image reference count 224 func (d *dockerCoordinator) IncrementImageReference(imageID, imageName, callerID string) { 225 d.imageLock.Lock() 226 defer d.imageLock.Unlock() 227 if d.cleanup { 228 d.incrementImageReferenceImpl(imageID, imageName, callerID) 229 } 230 } 231 232 // incrementImageReferenceImpl assumes the lock is held 233 func (d *dockerCoordinator) incrementImageReferenceImpl(imageID, imageName, callerID string) { 234 // Cancel any pending delete 235 if cancel, ok := d.deleteFuture[imageID]; ok { 236 d.logger.Debug("cancelling removal of container image", "image_name", imageName) 237 cancel() 238 delete(d.deleteFuture, imageID) 239 } 240 241 // Increment the reference 242 references, ok := d.imageRefCount[imageID] 243 if !ok { 244 references = make(map[string]struct{}) 245 d.imageRefCount[imageID] = references 246 } 247 248 if _, ok := references[callerID]; !ok { 249 references[callerID] = struct{}{} 250 d.logger.Debug("image reference count incremented", "image_name", imageName, "image_id", imageID, "references", len(references)) 251 } 252 } 253 254 // RemoveImage removes the given image. If there are any errors removing the 255 // image, the remove is retried internally. 256 func (d *dockerCoordinator) RemoveImage(imageID, callerID string) { 257 d.imageLock.Lock() 258 defer d.imageLock.Unlock() 259 260 if !d.cleanup { 261 return 262 } 263 264 references, ok := d.imageRefCount[imageID] 265 if !ok { 266 d.logger.Warn("RemoveImage on non-referenced counted image id", "image_id", imageID) 267 return 268 } 269 270 // Decrement the reference count 271 delete(references, callerID) 272 count := len(references) 273 d.logger.Debug("image id reference count decremented", "image_id", imageID, "references", count) 274 275 // Nothing to do 276 if count != 0 { 277 return 278 } 279 280 // This should never be the case but we safety guard so we don't leak a 281 // cancel. 282 if cancel, ok := d.deleteFuture[imageID]; ok { 283 d.logger.Error("image id has lingering delete future", "image_id", imageID) 284 cancel() 285 } 286 287 // Setup a future to delete the image 288 ctx, cancel := context.WithCancel(d.ctx) 289 d.deleteFuture[imageID] = cancel 290 go d.removeImageImpl(imageID, ctx) 291 292 // Delete the key from the reference count 293 delete(d.imageRefCount, imageID) 294 } 295 296 // removeImageImpl is used to remove an image. It wil wait the specified remove 297 // delay to remove the image. If the context is cancelled before that the image 298 // removal will be cancelled. 299 func (d *dockerCoordinator) removeImageImpl(id string, ctx context.Context) { 300 // Wait for the delay or a cancellation event 301 select { 302 case <-ctx.Done(): 303 // We have been cancelled 304 return 305 case <-time.After(d.removeDelay): 306 } 307 308 // Ensure we are suppose to delete. Do a short check while holding the lock 309 // so there can't be interleaving. There is still the smallest chance that 310 // the delete occurs after the image has been pulled but before it has been 311 // incremented. For handling that we just treat it as a recoverable error in 312 // the docker driver. 313 d.imageLock.Lock() 314 select { 315 case <-ctx.Done(): 316 d.imageLock.Unlock() 317 return 318 default: 319 } 320 d.imageLock.Unlock() 321 322 for i := 0; i < 3; i++ { 323 err := d.client.RemoveImage(id) 324 if err == nil { 325 break 326 } 327 328 if err == docker.ErrNoSuchImage { 329 d.logger.Debug("unable to cleanup image, does not exist", "image_id", id) 330 return 331 } 332 if derr, ok := err.(*docker.Error); ok && derr.Status == 409 { 333 d.logger.Debug("unable to cleanup image, still in use", "image_id", id) 334 return 335 } 336 337 // Retry on unknown errors 338 d.logger.Debug("failed to remove image", "image_id", id, "attempt", i+1, "error", err) 339 340 select { 341 case <-ctx.Done(): 342 // We have been cancelled 343 return 344 case <-time.After(3 * time.Second): 345 } 346 } 347 348 d.logger.Debug("cleanup removed downloaded image", "image_id", id) 349 350 // Cleanup the future from the map and free the context by cancelling it 351 d.imageLock.Lock() 352 if cancel, ok := d.deleteFuture[id]; ok { 353 delete(d.deleteFuture, id) 354 cancel() 355 } 356 d.imageLock.Unlock() 357 } 358 359 func (d *dockerCoordinator) registerPullLogger(image string, logger LogEventFn) { 360 d.pullLoggerLock.Lock() 361 defer d.pullLoggerLock.Unlock() 362 if _, ok := d.pullLoggers[image]; !ok { 363 d.pullLoggers[image] = []LogEventFn{} 364 } 365 d.pullLoggers[image] = append(d.pullLoggers[image], logger) 366 } 367 368 func (d *dockerCoordinator) clearPullLogger(image string) { 369 d.pullLoggerLock.Lock() 370 defer d.pullLoggerLock.Unlock() 371 delete(d.pullLoggers, image) 372 } 373 374 func (d *dockerCoordinator) emitEvent(image, message string, annotations map[string]string) { 375 d.pullLoggerLock.RLock() 376 defer d.pullLoggerLock.RUnlock() 377 for i := range d.pullLoggers[image] { 378 go d.pullLoggers[image][i](message, annotations) 379 } 380 } 381 382 func (d *dockerCoordinator) handlePullInactivity(image, msg string, timestamp time.Time) { 383 d.logger.Error("image pull aborted due to inactivity", "image_name", image, 384 "last_event_timestamp", timestamp.String(), "last_event", msg) 385 } 386 387 func (d *dockerCoordinator) handlePullProgressReport(image, msg string, _ time.Time) { 388 d.logger.Debug("image pull progress", "image_name", image, "message", msg) 389 } 390 391 func (d *dockerCoordinator) handleSlowPullProgressReport(image, msg string, _ time.Time) { 392 d.emitEvent(image, fmt.Sprintf("Docker image pull progress: %s", msg), map[string]string{ 393 "image": image, 394 }) 395 } 396 397 // recoverablePullError wraps the error gotten when trying to pull and image if 398 // the error is recoverable. 399 func recoverablePullError(err error, image string) error { 400 recoverable := true 401 if imageNotFoundMatcher.MatchString(err.Error()) { 402 recoverable = false 403 } 404 return structs.NewRecoverableError(fmt.Errorf("Failed to pull `%s`: %s", image, err), recoverable) 405 }