github.com/hhrutter/nomad@v0.6.0-rc2.0.20170723054333-80c4b03f0705/client/driver/docker_coordinator.go (about) 1 package driver 2 3 import ( 4 "context" 5 "fmt" 6 "log" 7 "regexp" 8 "sync" 9 "time" 10 11 docker "github.com/fsouza/go-dockerclient" 12 "github.com/hashicorp/nomad/nomad/structs" 13 ) 14 15 var ( 16 // createCoordinator allows us to only create a single coordinator 17 createCoordinator sync.Once 18 19 // globalCoordinator is the shared coordinator and should only be retreived 20 // using the GetDockerCoordinator() method. 21 globalCoordinator *dockerCoordinator 22 23 // imageNotFoundMatcher is a regex expression that matches the image not 24 // found error Docker returns. 25 imageNotFoundMatcher = regexp.MustCompile(`Error: image .+ not found`) 26 ) 27 28 // pullFuture is a sharable future for retrieving a pulled images ID and any 29 // error that may have occured during the pull. 30 type pullFuture struct { 31 waitCh chan struct{} 32 33 err error 34 imageID string 35 } 36 37 // newPullFuture returns a new pull future 38 func newPullFuture() *pullFuture { 39 return &pullFuture{ 40 waitCh: make(chan struct{}), 41 } 42 } 43 44 // wait waits till the future has a result 45 func (p *pullFuture) wait() *pullFuture { 46 <-p.waitCh 47 return p 48 } 49 50 // result returns the results of the future and should only ever be called after 51 // wait returns. 52 func (p *pullFuture) result() (imageID string, err error) { 53 return p.imageID, p.err 54 } 55 56 // set is used to set the results and unblock any waiter. This may only be 57 // called once. 58 func (p *pullFuture) set(imageID string, err error) { 59 p.imageID = imageID 60 p.err = err 61 close(p.waitCh) 62 } 63 64 // DockerImageClient provides the methods required to do CRUD operations on the 65 // Docker images 66 type DockerImageClient interface { 67 PullImage(opts docker.PullImageOptions, auth docker.AuthConfiguration) error 68 InspectImage(id string) (*docker.Image, error) 69 RemoveImage(id string) error 70 } 71 72 // dockerCoordinatorConfig is used to configure the Docker coordinator. 73 type dockerCoordinatorConfig struct { 74 // logger is the logger the coordinator should use 75 logger *log.Logger 76 77 // cleanup marks whether images should be deleting when the reference count 78 // is zero 79 cleanup bool 80 81 // client is the Docker client to use for communicating with Docker 82 client DockerImageClient 83 84 // removeDelay is the delay between an image's reference count going to 85 // zero and the image actually being deleted. 86 removeDelay time.Duration 87 } 88 89 // dockerCoordinator is used to coordinate actions against images to prevent 90 // racy deletions. It can be thought of as a reference counter on images. 91 type dockerCoordinator struct { 92 *dockerCoordinatorConfig 93 94 // imageLock is used to lock access to all images 95 imageLock sync.Mutex 96 97 // pullFutures is used to allow multiple callers to pull the same image but 98 // only have one request be sent to Docker 99 pullFutures map[string]*pullFuture 100 101 // imageRefCount is the reference count of image IDs 102 imageRefCount map[string]map[string]struct{} 103 104 // deleteFuture is indexed by image ID and has a cancable delete future 105 deleteFuture map[string]context.CancelFunc 106 } 107 108 // NewDockerCoordinator returns a new Docker coordinator 109 func NewDockerCoordinator(config *dockerCoordinatorConfig) *dockerCoordinator { 110 if config.client == nil { 111 return nil 112 } 113 114 return &dockerCoordinator{ 115 dockerCoordinatorConfig: config, 116 pullFutures: make(map[string]*pullFuture), 117 imageRefCount: make(map[string]map[string]struct{}), 118 deleteFuture: make(map[string]context.CancelFunc), 119 } 120 } 121 122 // GetDockerCoordinator returns the shared dockerCoordinator instance 123 func GetDockerCoordinator(config *dockerCoordinatorConfig) *dockerCoordinator { 124 createCoordinator.Do(func() { 125 globalCoordinator = NewDockerCoordinator(config) 126 }) 127 128 return globalCoordinator 129 } 130 131 // PullImage is used to pull an image. It returns the pulled imaged ID or an 132 // error that occured during the pull 133 func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConfiguration, callerID string) (imageID string, err error) { 134 // Get the future 135 d.imageLock.Lock() 136 future, ok := d.pullFutures[image] 137 if !ok { 138 // Make the future 139 future = newPullFuture() 140 d.pullFutures[image] = future 141 go d.pullImageImpl(image, authOptions, future) 142 } 143 d.imageLock.Unlock() 144 145 // We unlock while we wait since this can take a while 146 id, err := future.wait().result() 147 148 d.imageLock.Lock() 149 defer d.imageLock.Unlock() 150 151 // Delete the future since we don't need it and we don't want to cache an 152 // image being there if it has possibly been manually deleted (outside of 153 // Nomad). 154 if _, ok := d.pullFutures[image]; ok { 155 delete(d.pullFutures, image) 156 } 157 158 // If we are cleaning up, we increment the reference count on the image 159 if err == nil && d.cleanup { 160 d.incrementImageReferenceImpl(id, image, callerID) 161 } 162 163 return id, err 164 } 165 166 // pullImageImpl is the implementation of pulling an image. The results are 167 // returned via the passed future 168 func (d *dockerCoordinator) pullImageImpl(image string, authOptions *docker.AuthConfiguration, future *pullFuture) { 169 // Parse the repo and tag 170 repo, tag := docker.ParseRepositoryTag(image) 171 if tag == "" { 172 tag = "latest" 173 } 174 pullOptions := docker.PullImageOptions{ 175 Repository: repo, 176 Tag: tag, 177 } 178 179 // Attempt to pull the image 180 var auth docker.AuthConfiguration 181 if authOptions != nil { 182 auth = *authOptions 183 } 184 err := d.client.PullImage(pullOptions, auth) 185 if err != nil { 186 d.logger.Printf("[ERR] driver.docker: failed pulling container %s:%s: %s", repo, tag, err) 187 future.set("", recoverablePullError(err, image)) 188 return 189 } 190 191 d.logger.Printf("[DEBUG] driver.docker: docker pull %s:%s succeeded", repo, tag) 192 193 dockerImage, err := d.client.InspectImage(image) 194 if err != nil { 195 d.logger.Printf("[ERR] driver.docker: failed getting image id for %q: %v", image, err) 196 future.set("", recoverableErrTimeouts(err)) 197 return 198 } 199 200 future.set(dockerImage.ID, nil) 201 return 202 } 203 204 // IncrementImageReference is used to increment an image reference count 205 func (d *dockerCoordinator) IncrementImageReference(imageID, imageName, callerID string) { 206 d.imageLock.Lock() 207 defer d.imageLock.Unlock() 208 if d.cleanup { 209 d.incrementImageReferenceImpl(imageID, imageName, callerID) 210 } 211 } 212 213 // incrementImageReferenceImpl assumes the lock is held 214 func (d *dockerCoordinator) incrementImageReferenceImpl(imageID, imageName, callerID string) { 215 // Cancel any pending delete 216 if cancel, ok := d.deleteFuture[imageID]; ok { 217 d.logger.Printf("[DEBUG] driver.docker: cancelling removal of image %q", imageName) 218 cancel() 219 delete(d.deleteFuture, imageID) 220 } 221 222 // Increment the reference 223 references, ok := d.imageRefCount[imageID] 224 if !ok { 225 references = make(map[string]struct{}) 226 d.imageRefCount[imageID] = references 227 } 228 229 if _, ok := references[callerID]; !ok { 230 references[callerID] = struct{}{} 231 d.logger.Printf("[DEBUG] driver.docker: image %q (%v) reference count incremented: %d", imageName, imageID, len(references)) 232 } 233 } 234 235 // RemoveImage removes the given image. If there are any errors removing the 236 // image, the remove is retried internally. 237 func (d *dockerCoordinator) RemoveImage(imageID, callerID string) { 238 d.imageLock.Lock() 239 defer d.imageLock.Unlock() 240 241 if !d.cleanup { 242 return 243 } 244 245 references, ok := d.imageRefCount[imageID] 246 if !ok { 247 d.logger.Printf("[WARN] driver.docker: RemoveImage on non-referenced counted image id %q", imageID) 248 return 249 } 250 251 // Decrement the reference count 252 delete(references, callerID) 253 count := len(references) 254 d.logger.Printf("[DEBUG] driver.docker: image id %q reference count decremented: %d", imageID, count) 255 256 // Nothing to do 257 if count != 0 { 258 return 259 } 260 261 // This should never be the case but we safefty guard so we don't leak a 262 // cancel. 263 if cancel, ok := d.deleteFuture[imageID]; ok { 264 d.logger.Printf("[ERR] driver.docker: image id %q has lingering delete future", imageID) 265 cancel() 266 } 267 268 // Setup a future to delete the image 269 ctx, cancel := context.WithCancel(context.Background()) 270 d.deleteFuture[imageID] = cancel 271 go d.removeImageImpl(imageID, ctx) 272 273 // Delete the key from the reference count 274 delete(d.imageRefCount, imageID) 275 } 276 277 // removeImageImpl is used to remove an image. It wil wait the specified remove 278 // delay to remove the image. If the context is cancalled before that the image 279 // removal will be cancelled. 280 func (d *dockerCoordinator) removeImageImpl(id string, ctx context.Context) { 281 // Wait for the delay or a cancellation event 282 select { 283 case <-ctx.Done(): 284 // We have been cancelled 285 return 286 case <-time.After(d.removeDelay): 287 } 288 289 // Ensure we are suppose to delete. Do a short check while holding the lock 290 // so there can't be interleaving. There is still the smallest chance that 291 // the delete occurs after the image has been pulled but before it has been 292 // incremented. For handling that we just treat it as a recoverable error in 293 // the docker driver. 294 d.imageLock.Lock() 295 select { 296 case <-ctx.Done(): 297 d.imageLock.Unlock() 298 return 299 default: 300 } 301 d.imageLock.Unlock() 302 303 for i := 0; i < 3; i++ { 304 err := d.client.RemoveImage(id) 305 if err == nil { 306 break 307 } 308 309 if err == docker.ErrNoSuchImage { 310 d.logger.Printf("[DEBUG] driver.docker: unable to cleanup image %q: does not exist", id) 311 return 312 } 313 if derr, ok := err.(*docker.Error); ok && derr.Status == 409 { 314 d.logger.Printf("[DEBUG] driver.docker: unable to cleanup image %q: still in use", id) 315 return 316 } 317 318 // Retry on unknown errors 319 d.logger.Printf("[DEBUG] driver.docker: failed to remove image %q (attempt %d): %v", id, i+1, err) 320 321 select { 322 case <-ctx.Done(): 323 // We have been cancelled 324 return 325 case <-time.After(3 * time.Second): 326 } 327 } 328 329 d.logger.Printf("[DEBUG] driver.docker: cleanup removed downloaded image: %q", id) 330 331 // Cleanup the future from the map and free the context by cancelling it 332 d.imageLock.Lock() 333 if cancel, ok := d.deleteFuture[id]; ok { 334 delete(d.deleteFuture, id) 335 cancel() 336 } 337 d.imageLock.Unlock() 338 } 339 340 // recoverablePullError wraps the error gotten when trying to pull and image if 341 // the error is recoverable. 342 func recoverablePullError(err error, image string) error { 343 recoverable := true 344 if imageNotFoundMatcher.MatchString(err.Error()) { 345 recoverable = false 346 } 347 return structs.NewRecoverableError(fmt.Errorf("Failed to pull `%s`: %s", image, err), recoverable) 348 }