github.com/anuvu/nomad@v0.8.7-atom1/client/driver/docker_coordinator.go (about) 1 package driver 2 3 import ( 4 "context" 5 "fmt" 6 "log" 7 "regexp" 8 "sync" 9 "time" 10 11 docker "github.com/fsouza/go-dockerclient" 12 "github.com/hashicorp/nomad/nomad/structs" 13 ) 14 15 var ( 16 // createCoordinator allows us to only create a single coordinator 17 createCoordinator sync.Once 18 19 // globalCoordinator is the shared coordinator and should only be retrieved 20 // using the GetDockerCoordinator() method. 21 globalCoordinator *dockerCoordinator 22 23 // imageNotFoundMatcher is a regex expression that matches the image not 24 // found error Docker returns. 25 imageNotFoundMatcher = regexp.MustCompile(`Error: image .+ not found`) 26 ) 27 28 // pullFuture is a sharable future for retrieving a pulled images ID and any 29 // error that may have occurred during the pull. 30 type pullFuture struct { 31 waitCh chan struct{} 32 33 err error 34 imageID string 35 } 36 37 // newPullFuture returns a new pull future 38 func newPullFuture() *pullFuture { 39 return &pullFuture{ 40 waitCh: make(chan struct{}), 41 } 42 } 43 44 // wait waits till the future has a result 45 func (p *pullFuture) wait() *pullFuture { 46 <-p.waitCh 47 return p 48 } 49 50 // result returns the results of the future and should only ever be called after 51 // wait returns. 52 func (p *pullFuture) result() (imageID string, err error) { 53 return p.imageID, p.err 54 } 55 56 // set is used to set the results and unblock any waiter. This may only be 57 // called once. 58 func (p *pullFuture) set(imageID string, err error) { 59 p.imageID = imageID 60 p.err = err 61 close(p.waitCh) 62 } 63 64 // DockerImageClient provides the methods required to do CRUD operations on the 65 // Docker images 66 type DockerImageClient interface { 67 PullImage(opts docker.PullImageOptions, auth docker.AuthConfiguration) error 68 InspectImage(id string) (*docker.Image, error) 69 RemoveImage(id string) error 70 } 71 72 // dockerCoordinatorConfig is used to configure the Docker coordinator. 73 type dockerCoordinatorConfig struct { 74 // logger is the logger the coordinator should use 75 logger *log.Logger 76 77 // cleanup marks whether images should be deleting when the reference count 78 // is zero 79 cleanup bool 80 81 // client is the Docker client to use for communicating with Docker 82 client DockerImageClient 83 84 // removeDelay is the delay between an image's reference count going to 85 // zero and the image actually being deleted. 86 removeDelay time.Duration 87 } 88 89 // dockerCoordinator is used to coordinate actions against images to prevent 90 // racy deletions. It can be thought of as a reference counter on images. 91 type dockerCoordinator struct { 92 *dockerCoordinatorConfig 93 94 // imageLock is used to lock access to all images 95 imageLock sync.Mutex 96 97 // pullFutures is used to allow multiple callers to pull the same image but 98 // only have one request be sent to Docker 99 pullFutures map[string]*pullFuture 100 101 // pullLoggers is used to track the LogEventFn for each alloc pulling an image. 102 // If multiple alloc's are attempting to pull the same image, each will need 103 // to register its own LogEventFn with the coordinator. 104 pullLoggers map[string][]LogEventFn 105 106 // pullLoggerLock is used to sync access to the pullLoggers map 107 pullLoggerLock sync.RWMutex 108 109 // imageRefCount is the reference count of image IDs 110 imageRefCount map[string]map[string]struct{} 111 112 // deleteFuture is indexed by image ID and has a cancelable delete future 113 deleteFuture map[string]context.CancelFunc 114 } 115 116 // NewDockerCoordinator returns a new Docker coordinator 117 func NewDockerCoordinator(config *dockerCoordinatorConfig) *dockerCoordinator { 118 if config.client == nil { 119 return nil 120 } 121 122 return &dockerCoordinator{ 123 dockerCoordinatorConfig: config, 124 pullFutures: make(map[string]*pullFuture), 125 pullLoggers: make(map[string][]LogEventFn), 126 imageRefCount: make(map[string]map[string]struct{}), 127 deleteFuture: make(map[string]context.CancelFunc), 128 } 129 } 130 131 // GetDockerCoordinator returns the shared dockerCoordinator instance 132 func GetDockerCoordinator(config *dockerCoordinatorConfig) *dockerCoordinator { 133 createCoordinator.Do(func() { 134 globalCoordinator = NewDockerCoordinator(config) 135 }) 136 137 return globalCoordinator 138 } 139 140 // PullImage is used to pull an image. It returns the pulled imaged ID or an 141 // error that occurred during the pull 142 func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConfiguration, callerID string, emitFn LogEventFn) (imageID string, err error) { 143 // Get the future 144 d.imageLock.Lock() 145 future, ok := d.pullFutures[image] 146 d.registerPullLogger(image, emitFn) 147 if !ok { 148 // Make the future 149 future = newPullFuture() 150 d.pullFutures[image] = future 151 go d.pullImageImpl(image, authOptions, future) 152 } 153 d.imageLock.Unlock() 154 155 // We unlock while we wait since this can take a while 156 id, err := future.wait().result() 157 158 d.imageLock.Lock() 159 defer d.imageLock.Unlock() 160 161 // Delete the future since we don't need it and we don't want to cache an 162 // image being there if it has possibly been manually deleted (outside of 163 // Nomad). 164 if _, ok := d.pullFutures[image]; ok { 165 delete(d.pullFutures, image) 166 } 167 168 // If we are cleaning up, we increment the reference count on the image 169 if err == nil && d.cleanup { 170 d.incrementImageReferenceImpl(id, image, callerID) 171 } 172 173 return id, err 174 } 175 176 // pullImageImpl is the implementation of pulling an image. The results are 177 // returned via the passed future 178 func (d *dockerCoordinator) pullImageImpl(image string, authOptions *docker.AuthConfiguration, future *pullFuture) { 179 defer d.clearPullLogger(image) 180 // Parse the repo and tag 181 repo, tag := parseDockerImage(image) 182 ctx, cancel := context.WithCancel(context.Background()) 183 defer cancel() 184 185 pm := newImageProgressManager(image, cancel, d.handlePullInactivity, 186 d.handlePullProgressReport, d.handleSlowPullProgressReport) 187 defer pm.stop() 188 189 pullOptions := docker.PullImageOptions{ 190 Repository: repo, 191 Tag: tag, 192 OutputStream: pm, 193 RawJSONStream: true, 194 Context: ctx, 195 } 196 197 // Attempt to pull the image 198 var auth docker.AuthConfiguration 199 if authOptions != nil { 200 auth = *authOptions 201 } 202 203 err := d.client.PullImage(pullOptions, auth) 204 205 if ctxErr := ctx.Err(); ctxErr == context.DeadlineExceeded { 206 d.logger.Printf("[ERR] driver.docker: timeout pulling container %s", dockerImageRef(repo, tag)) 207 future.set("", recoverablePullError(ctxErr, image)) 208 return 209 } 210 211 if err != nil { 212 d.logger.Printf("[ERR] driver.docker: failed pulling container %s: %s", dockerImageRef(repo, tag), err) 213 future.set("", recoverablePullError(err, image)) 214 return 215 } 216 217 d.logger.Printf("[DEBUG] driver.docker: docker pull %s succeeded", dockerImageRef(repo, tag)) 218 219 dockerImage, err := d.client.InspectImage(image) 220 if err != nil { 221 d.logger.Printf("[ERR] driver.docker: failed getting image id for %q: %v", image, err) 222 future.set("", recoverableErrTimeouts(err)) 223 return 224 } 225 226 future.set(dockerImage.ID, nil) 227 return 228 } 229 230 // IncrementImageReference is used to increment an image reference count 231 func (d *dockerCoordinator) IncrementImageReference(imageID, imageName, callerID string) { 232 d.imageLock.Lock() 233 defer d.imageLock.Unlock() 234 if d.cleanup { 235 d.incrementImageReferenceImpl(imageID, imageName, callerID) 236 } 237 } 238 239 // incrementImageReferenceImpl assumes the lock is held 240 func (d *dockerCoordinator) incrementImageReferenceImpl(imageID, imageName, callerID string) { 241 // Cancel any pending delete 242 if cancel, ok := d.deleteFuture[imageID]; ok { 243 d.logger.Printf("[DEBUG] driver.docker: cancelling removal of image %q", imageName) 244 cancel() 245 delete(d.deleteFuture, imageID) 246 } 247 248 // Increment the reference 249 references, ok := d.imageRefCount[imageID] 250 if !ok { 251 references = make(map[string]struct{}) 252 d.imageRefCount[imageID] = references 253 } 254 255 if _, ok := references[callerID]; !ok { 256 references[callerID] = struct{}{} 257 d.logger.Printf("[DEBUG] driver.docker: image %q (%v) reference count incremented: %d", imageName, imageID, len(references)) 258 } 259 } 260 261 // RemoveImage removes the given image. If there are any errors removing the 262 // image, the remove is retried internally. 263 func (d *dockerCoordinator) RemoveImage(imageID, callerID string) { 264 d.imageLock.Lock() 265 defer d.imageLock.Unlock() 266 267 if !d.cleanup { 268 return 269 } 270 271 references, ok := d.imageRefCount[imageID] 272 if !ok { 273 d.logger.Printf("[WARN] driver.docker: RemoveImage on non-referenced counted image id %q", imageID) 274 return 275 } 276 277 // Decrement the reference count 278 delete(references, callerID) 279 count := len(references) 280 d.logger.Printf("[DEBUG] driver.docker: image id %q reference count decremented: %d", imageID, count) 281 282 // Nothing to do 283 if count != 0 { 284 return 285 } 286 287 // This should never be the case but we safety guard so we don't leak a 288 // cancel. 289 if cancel, ok := d.deleteFuture[imageID]; ok { 290 d.logger.Printf("[ERR] driver.docker: image id %q has lingering delete future", imageID) 291 cancel() 292 } 293 294 // Setup a future to delete the image 295 ctx, cancel := context.WithCancel(context.Background()) 296 d.deleteFuture[imageID] = cancel 297 go d.removeImageImpl(imageID, ctx) 298 299 // Delete the key from the reference count 300 delete(d.imageRefCount, imageID) 301 } 302 303 // removeImageImpl is used to remove an image. It wil wait the specified remove 304 // delay to remove the image. If the context is cancelled before that the image 305 // removal will be cancelled. 306 func (d *dockerCoordinator) removeImageImpl(id string, ctx context.Context) { 307 // Wait for the delay or a cancellation event 308 select { 309 case <-ctx.Done(): 310 // We have been cancelled 311 return 312 case <-time.After(d.removeDelay): 313 } 314 315 // Ensure we are suppose to delete. Do a short check while holding the lock 316 // so there can't be interleaving. There is still the smallest chance that 317 // the delete occurs after the image has been pulled but before it has been 318 // incremented. For handling that we just treat it as a recoverable error in 319 // the docker driver. 320 d.imageLock.Lock() 321 select { 322 case <-ctx.Done(): 323 d.imageLock.Unlock() 324 return 325 default: 326 } 327 d.imageLock.Unlock() 328 329 for i := 0; i < 3; i++ { 330 err := d.client.RemoveImage(id) 331 if err == nil { 332 break 333 } 334 335 if err == docker.ErrNoSuchImage { 336 d.logger.Printf("[DEBUG] driver.docker: unable to cleanup image %q: does not exist", id) 337 return 338 } 339 if derr, ok := err.(*docker.Error); ok && derr.Status == 409 { 340 d.logger.Printf("[DEBUG] driver.docker: unable to cleanup image %q: still in use", id) 341 return 342 } 343 344 // Retry on unknown errors 345 d.logger.Printf("[DEBUG] driver.docker: failed to remove image %q (attempt %d): %v", id, i+1, err) 346 347 select { 348 case <-ctx.Done(): 349 // We have been cancelled 350 return 351 case <-time.After(3 * time.Second): 352 } 353 } 354 355 d.logger.Printf("[DEBUG] driver.docker: cleanup removed downloaded image: %q", id) 356 357 // Cleanup the future from the map and free the context by cancelling it 358 d.imageLock.Lock() 359 if cancel, ok := d.deleteFuture[id]; ok { 360 delete(d.deleteFuture, id) 361 cancel() 362 } 363 d.imageLock.Unlock() 364 } 365 366 func (d *dockerCoordinator) registerPullLogger(image string, logger LogEventFn) { 367 d.pullLoggerLock.Lock() 368 defer d.pullLoggerLock.Unlock() 369 if _, ok := d.pullLoggers[image]; !ok { 370 d.pullLoggers[image] = []LogEventFn{} 371 } 372 d.pullLoggers[image] = append(d.pullLoggers[image], logger) 373 } 374 375 func (d *dockerCoordinator) clearPullLogger(image string) { 376 d.pullLoggerLock.Lock() 377 defer d.pullLoggerLock.Unlock() 378 delete(d.pullLoggers, image) 379 } 380 381 func (d *dockerCoordinator) emitEvent(image, message string, args ...interface{}) { 382 d.pullLoggerLock.RLock() 383 defer d.pullLoggerLock.RUnlock() 384 for i := range d.pullLoggers[image] { 385 go d.pullLoggers[image][i](message, args...) 386 } 387 } 388 389 func (d *dockerCoordinator) handlePullInactivity(image, msg string, timestamp time.Time) { 390 d.logger.Printf("[ERR] driver.docker: image %s pull aborted due to inactivity, last message recevieved at [%s]: %s", image, timestamp.String(), msg) 391 } 392 393 func (d *dockerCoordinator) handlePullProgressReport(image, msg string, _ time.Time) { 394 d.logger.Printf("[DEBUG] driver.docker: image %s pull progress: %s", image, msg) 395 } 396 397 func (d *dockerCoordinator) handleSlowPullProgressReport(image, msg string, _ time.Time) { 398 d.emitEvent(image, "Docker image %s pull progress: %s", image, msg) 399 } 400 401 // recoverablePullError wraps the error gotten when trying to pull and image if 402 // the error is recoverable. 403 func recoverablePullError(err error, image string) error { 404 recoverable := true 405 if imageNotFoundMatcher.MatchString(err.Error()) { 406 recoverable = false 407 } 408 return structs.NewRecoverableError(fmt.Errorf("Failed to pull `%s`: %s", image, err), recoverable) 409 }