github.com/anuvu/nomad@v0.8.7-atom1/client/driver/docker_coordinator.go (about)

     1  package driver
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"log"
     7  	"regexp"
     8  	"sync"
     9  	"time"
    10  
    11  	docker "github.com/fsouza/go-dockerclient"
    12  	"github.com/hashicorp/nomad/nomad/structs"
    13  )
    14  
    15  var (
    16  	// createCoordinator allows us to only create a single coordinator
    17  	createCoordinator sync.Once
    18  
    19  	// globalCoordinator is the shared coordinator and should only be retrieved
    20  	// using the GetDockerCoordinator() method.
    21  	globalCoordinator *dockerCoordinator
    22  
    23  	// imageNotFoundMatcher is a regex expression that matches the image not
    24  	// found error Docker returns.
    25  	imageNotFoundMatcher = regexp.MustCompile(`Error: image .+ not found`)
    26  )
    27  
    28  // pullFuture is a sharable future for retrieving a pulled images ID and any
    29  // error that may have occurred during the pull.
    30  type pullFuture struct {
    31  	waitCh chan struct{}
    32  
    33  	err     error
    34  	imageID string
    35  }
    36  
    37  // newPullFuture returns a new pull future
    38  func newPullFuture() *pullFuture {
    39  	return &pullFuture{
    40  		waitCh: make(chan struct{}),
    41  	}
    42  }
    43  
    44  // wait waits till the future has a result
    45  func (p *pullFuture) wait() *pullFuture {
    46  	<-p.waitCh
    47  	return p
    48  }
    49  
    50  // result returns the results of the future and should only ever be called after
    51  // wait returns.
    52  func (p *pullFuture) result() (imageID string, err error) {
    53  	return p.imageID, p.err
    54  }
    55  
    56  // set is used to set the results and unblock any waiter. This may only be
    57  // called once.
    58  func (p *pullFuture) set(imageID string, err error) {
    59  	p.imageID = imageID
    60  	p.err = err
    61  	close(p.waitCh)
    62  }
    63  
    64  // DockerImageClient provides the methods required to do CRUD operations on the
    65  // Docker images
    66  type DockerImageClient interface {
    67  	PullImage(opts docker.PullImageOptions, auth docker.AuthConfiguration) error
    68  	InspectImage(id string) (*docker.Image, error)
    69  	RemoveImage(id string) error
    70  }
    71  
    72  // dockerCoordinatorConfig is used to configure the Docker coordinator.
    73  type dockerCoordinatorConfig struct {
    74  	// logger is the logger the coordinator should use
    75  	logger *log.Logger
    76  
    77  	// cleanup marks whether images should be deleting when the reference count
    78  	// is zero
    79  	cleanup bool
    80  
    81  	// client is the Docker client to use for communicating with Docker
    82  	client DockerImageClient
    83  
    84  	// removeDelay is the delay between an image's reference count going to
    85  	// zero and the image actually being deleted.
    86  	removeDelay time.Duration
    87  }
    88  
    89  // dockerCoordinator is used to coordinate actions against images to prevent
    90  // racy deletions. It can be thought of as a reference counter on images.
    91  type dockerCoordinator struct {
    92  	*dockerCoordinatorConfig
    93  
    94  	// imageLock is used to lock access to all images
    95  	imageLock sync.Mutex
    96  
    97  	// pullFutures is used to allow multiple callers to pull the same image but
    98  	// only have one request be sent to Docker
    99  	pullFutures map[string]*pullFuture
   100  
   101  	// pullLoggers is used to track the LogEventFn for each alloc pulling an image.
   102  	// If multiple alloc's are attempting to pull the same image, each will need
   103  	// to register its own LogEventFn with the coordinator.
   104  	pullLoggers map[string][]LogEventFn
   105  
   106  	// pullLoggerLock is used to sync access to the pullLoggers map
   107  	pullLoggerLock sync.RWMutex
   108  
   109  	// imageRefCount is the reference count of image IDs
   110  	imageRefCount map[string]map[string]struct{}
   111  
   112  	// deleteFuture is indexed by image ID and has a cancelable delete future
   113  	deleteFuture map[string]context.CancelFunc
   114  }
   115  
   116  // NewDockerCoordinator returns a new Docker coordinator
   117  func NewDockerCoordinator(config *dockerCoordinatorConfig) *dockerCoordinator {
   118  	if config.client == nil {
   119  		return nil
   120  	}
   121  
   122  	return &dockerCoordinator{
   123  		dockerCoordinatorConfig: config,
   124  		pullFutures:             make(map[string]*pullFuture),
   125  		pullLoggers:             make(map[string][]LogEventFn),
   126  		imageRefCount:           make(map[string]map[string]struct{}),
   127  		deleteFuture:            make(map[string]context.CancelFunc),
   128  	}
   129  }
   130  
   131  // GetDockerCoordinator returns the shared dockerCoordinator instance
   132  func GetDockerCoordinator(config *dockerCoordinatorConfig) *dockerCoordinator {
   133  	createCoordinator.Do(func() {
   134  		globalCoordinator = NewDockerCoordinator(config)
   135  	})
   136  
   137  	return globalCoordinator
   138  }
   139  
   140  // PullImage is used to pull an image. It returns the pulled imaged ID or an
   141  // error that occurred during the pull
   142  func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConfiguration, callerID string, emitFn LogEventFn) (imageID string, err error) {
   143  	// Get the future
   144  	d.imageLock.Lock()
   145  	future, ok := d.pullFutures[image]
   146  	d.registerPullLogger(image, emitFn)
   147  	if !ok {
   148  		// Make the future
   149  		future = newPullFuture()
   150  		d.pullFutures[image] = future
   151  		go d.pullImageImpl(image, authOptions, future)
   152  	}
   153  	d.imageLock.Unlock()
   154  
   155  	// We unlock while we wait since this can take a while
   156  	id, err := future.wait().result()
   157  
   158  	d.imageLock.Lock()
   159  	defer d.imageLock.Unlock()
   160  
   161  	// Delete the future since we don't need it and we don't want to cache an
   162  	// image being there if it has possibly been manually deleted (outside of
   163  	// Nomad).
   164  	if _, ok := d.pullFutures[image]; ok {
   165  		delete(d.pullFutures, image)
   166  	}
   167  
   168  	// If we are cleaning up, we increment the reference count on the image
   169  	if err == nil && d.cleanup {
   170  		d.incrementImageReferenceImpl(id, image, callerID)
   171  	}
   172  
   173  	return id, err
   174  }
   175  
   176  // pullImageImpl is the implementation of pulling an image. The results are
   177  // returned via the passed future
   178  func (d *dockerCoordinator) pullImageImpl(image string, authOptions *docker.AuthConfiguration, future *pullFuture) {
   179  	defer d.clearPullLogger(image)
   180  	// Parse the repo and tag
   181  	repo, tag := parseDockerImage(image)
   182  	ctx, cancel := context.WithCancel(context.Background())
   183  	defer cancel()
   184  
   185  	pm := newImageProgressManager(image, cancel, d.handlePullInactivity,
   186  		d.handlePullProgressReport, d.handleSlowPullProgressReport)
   187  	defer pm.stop()
   188  
   189  	pullOptions := docker.PullImageOptions{
   190  		Repository:    repo,
   191  		Tag:           tag,
   192  		OutputStream:  pm,
   193  		RawJSONStream: true,
   194  		Context:       ctx,
   195  	}
   196  
   197  	// Attempt to pull the image
   198  	var auth docker.AuthConfiguration
   199  	if authOptions != nil {
   200  		auth = *authOptions
   201  	}
   202  
   203  	err := d.client.PullImage(pullOptions, auth)
   204  
   205  	if ctxErr := ctx.Err(); ctxErr == context.DeadlineExceeded {
   206  		d.logger.Printf("[ERR] driver.docker: timeout pulling container %s", dockerImageRef(repo, tag))
   207  		future.set("", recoverablePullError(ctxErr, image))
   208  		return
   209  	}
   210  
   211  	if err != nil {
   212  		d.logger.Printf("[ERR] driver.docker: failed pulling container %s: %s", dockerImageRef(repo, tag), err)
   213  		future.set("", recoverablePullError(err, image))
   214  		return
   215  	}
   216  
   217  	d.logger.Printf("[DEBUG] driver.docker: docker pull %s succeeded", dockerImageRef(repo, tag))
   218  
   219  	dockerImage, err := d.client.InspectImage(image)
   220  	if err != nil {
   221  		d.logger.Printf("[ERR] driver.docker: failed getting image id for %q: %v", image, err)
   222  		future.set("", recoverableErrTimeouts(err))
   223  		return
   224  	}
   225  
   226  	future.set(dockerImage.ID, nil)
   227  	return
   228  }
   229  
   230  // IncrementImageReference is used to increment an image reference count
   231  func (d *dockerCoordinator) IncrementImageReference(imageID, imageName, callerID string) {
   232  	d.imageLock.Lock()
   233  	defer d.imageLock.Unlock()
   234  	if d.cleanup {
   235  		d.incrementImageReferenceImpl(imageID, imageName, callerID)
   236  	}
   237  }
   238  
   239  // incrementImageReferenceImpl assumes the lock is held
   240  func (d *dockerCoordinator) incrementImageReferenceImpl(imageID, imageName, callerID string) {
   241  	// Cancel any pending delete
   242  	if cancel, ok := d.deleteFuture[imageID]; ok {
   243  		d.logger.Printf("[DEBUG] driver.docker: cancelling removal of image %q", imageName)
   244  		cancel()
   245  		delete(d.deleteFuture, imageID)
   246  	}
   247  
   248  	// Increment the reference
   249  	references, ok := d.imageRefCount[imageID]
   250  	if !ok {
   251  		references = make(map[string]struct{})
   252  		d.imageRefCount[imageID] = references
   253  	}
   254  
   255  	if _, ok := references[callerID]; !ok {
   256  		references[callerID] = struct{}{}
   257  		d.logger.Printf("[DEBUG] driver.docker: image %q (%v) reference count incremented: %d", imageName, imageID, len(references))
   258  	}
   259  }
   260  
   261  // RemoveImage removes the given image. If there are any errors removing the
   262  // image, the remove is retried internally.
   263  func (d *dockerCoordinator) RemoveImage(imageID, callerID string) {
   264  	d.imageLock.Lock()
   265  	defer d.imageLock.Unlock()
   266  
   267  	if !d.cleanup {
   268  		return
   269  	}
   270  
   271  	references, ok := d.imageRefCount[imageID]
   272  	if !ok {
   273  		d.logger.Printf("[WARN] driver.docker: RemoveImage on non-referenced counted image id %q", imageID)
   274  		return
   275  	}
   276  
   277  	// Decrement the reference count
   278  	delete(references, callerID)
   279  	count := len(references)
   280  	d.logger.Printf("[DEBUG] driver.docker: image id %q reference count decremented: %d", imageID, count)
   281  
   282  	// Nothing to do
   283  	if count != 0 {
   284  		return
   285  	}
   286  
   287  	// This should never be the case but we safety guard so we don't leak a
   288  	// cancel.
   289  	if cancel, ok := d.deleteFuture[imageID]; ok {
   290  		d.logger.Printf("[ERR] driver.docker: image id %q has lingering delete future", imageID)
   291  		cancel()
   292  	}
   293  
   294  	// Setup a future to delete the image
   295  	ctx, cancel := context.WithCancel(context.Background())
   296  	d.deleteFuture[imageID] = cancel
   297  	go d.removeImageImpl(imageID, ctx)
   298  
   299  	// Delete the key from the reference count
   300  	delete(d.imageRefCount, imageID)
   301  }
   302  
   303  // removeImageImpl is used to remove an image. It wil wait the specified remove
   304  // delay to remove the image. If the context is cancelled before that the image
   305  // removal will be cancelled.
   306  func (d *dockerCoordinator) removeImageImpl(id string, ctx context.Context) {
   307  	// Wait for the delay or a cancellation event
   308  	select {
   309  	case <-ctx.Done():
   310  		// We have been cancelled
   311  		return
   312  	case <-time.After(d.removeDelay):
   313  	}
   314  
   315  	// Ensure we are suppose to delete. Do a short check while holding the lock
   316  	// so there can't be interleaving. There is still the smallest chance that
   317  	// the delete occurs after the image has been pulled but before it has been
   318  	// incremented. For handling that we just treat it as a recoverable error in
   319  	// the docker driver.
   320  	d.imageLock.Lock()
   321  	select {
   322  	case <-ctx.Done():
   323  		d.imageLock.Unlock()
   324  		return
   325  	default:
   326  	}
   327  	d.imageLock.Unlock()
   328  
   329  	for i := 0; i < 3; i++ {
   330  		err := d.client.RemoveImage(id)
   331  		if err == nil {
   332  			break
   333  		}
   334  
   335  		if err == docker.ErrNoSuchImage {
   336  			d.logger.Printf("[DEBUG] driver.docker: unable to cleanup image %q: does not exist", id)
   337  			return
   338  		}
   339  		if derr, ok := err.(*docker.Error); ok && derr.Status == 409 {
   340  			d.logger.Printf("[DEBUG] driver.docker: unable to cleanup image %q: still in use", id)
   341  			return
   342  		}
   343  
   344  		// Retry on unknown errors
   345  		d.logger.Printf("[DEBUG] driver.docker: failed to remove image %q (attempt %d): %v", id, i+1, err)
   346  
   347  		select {
   348  		case <-ctx.Done():
   349  			// We have been cancelled
   350  			return
   351  		case <-time.After(3 * time.Second):
   352  		}
   353  	}
   354  
   355  	d.logger.Printf("[DEBUG] driver.docker: cleanup removed downloaded image: %q", id)
   356  
   357  	// Cleanup the future from the map and free the context by cancelling it
   358  	d.imageLock.Lock()
   359  	if cancel, ok := d.deleteFuture[id]; ok {
   360  		delete(d.deleteFuture, id)
   361  		cancel()
   362  	}
   363  	d.imageLock.Unlock()
   364  }
   365  
   366  func (d *dockerCoordinator) registerPullLogger(image string, logger LogEventFn) {
   367  	d.pullLoggerLock.Lock()
   368  	defer d.pullLoggerLock.Unlock()
   369  	if _, ok := d.pullLoggers[image]; !ok {
   370  		d.pullLoggers[image] = []LogEventFn{}
   371  	}
   372  	d.pullLoggers[image] = append(d.pullLoggers[image], logger)
   373  }
   374  
   375  func (d *dockerCoordinator) clearPullLogger(image string) {
   376  	d.pullLoggerLock.Lock()
   377  	defer d.pullLoggerLock.Unlock()
   378  	delete(d.pullLoggers, image)
   379  }
   380  
   381  func (d *dockerCoordinator) emitEvent(image, message string, args ...interface{}) {
   382  	d.pullLoggerLock.RLock()
   383  	defer d.pullLoggerLock.RUnlock()
   384  	for i := range d.pullLoggers[image] {
   385  		go d.pullLoggers[image][i](message, args...)
   386  	}
   387  }
   388  
   389  func (d *dockerCoordinator) handlePullInactivity(image, msg string, timestamp time.Time) {
   390  	d.logger.Printf("[ERR] driver.docker: image %s pull aborted due to inactivity, last message recevieved at [%s]: %s", image, timestamp.String(), msg)
   391  }
   392  
   393  func (d *dockerCoordinator) handlePullProgressReport(image, msg string, _ time.Time) {
   394  	d.logger.Printf("[DEBUG] driver.docker: image %s pull progress: %s", image, msg)
   395  }
   396  
   397  func (d *dockerCoordinator) handleSlowPullProgressReport(image, msg string, _ time.Time) {
   398  	d.emitEvent(image, "Docker image %s pull progress: %s", image, msg)
   399  }
   400  
   401  // recoverablePullError wraps the error gotten when trying to pull and image if
   402  // the error is recoverable.
   403  func recoverablePullError(err error, image string) error {
   404  	recoverable := true
   405  	if imageNotFoundMatcher.MatchString(err.Error()) {
   406  		recoverable = false
   407  	}
   408  	return structs.NewRecoverableError(fmt.Errorf("Failed to pull `%s`: %s", image, err), recoverable)
   409  }