github.com/bigcommerce/nomad@v0.9.3-bc/drivers/docker/coordinator.go (about)

     1  package docker
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"regexp"
     7  	"sync"
     8  	"time"
     9  
    10  	docker "github.com/fsouza/go-dockerclient"
    11  	hclog "github.com/hashicorp/go-hclog"
    12  	"github.com/hashicorp/nomad/nomad/structs"
    13  )
    14  
    15  var (
    16  	// imageNotFoundMatcher is a regex expression that matches the image not
    17  	// found error Docker returns.
    18  	imageNotFoundMatcher = regexp.MustCompile(`Error: image .+ not found`)
    19  )
    20  
    21  // pullFuture is a sharable future for retrieving a pulled images ID and any
    22  // error that may have occurred during the pull.
    23  type pullFuture struct {
    24  	waitCh chan struct{}
    25  
    26  	err     error
    27  	imageID string
    28  }
    29  
    30  // newPullFuture returns a new pull future
    31  func newPullFuture() *pullFuture {
    32  	return &pullFuture{
    33  		waitCh: make(chan struct{}),
    34  	}
    35  }
    36  
    37  // wait waits till the future has a result
    38  func (p *pullFuture) wait() *pullFuture {
    39  	<-p.waitCh
    40  	return p
    41  }
    42  
    43  // result returns the results of the future and should only ever be called after
    44  // wait returns.
    45  func (p *pullFuture) result() (imageID string, err error) {
    46  	return p.imageID, p.err
    47  }
    48  
    49  // set is used to set the results and unblock any waiter. This may only be
    50  // called once.
    51  func (p *pullFuture) set(imageID string, err error) {
    52  	p.imageID = imageID
    53  	p.err = err
    54  	close(p.waitCh)
    55  }
    56  
    57  // DockerImageClient provides the methods required to do CRUD operations on the
    58  // Docker images
    59  type DockerImageClient interface {
    60  	PullImage(opts docker.PullImageOptions, auth docker.AuthConfiguration) error
    61  	InspectImage(id string) (*docker.Image, error)
    62  	RemoveImage(id string) error
    63  }
    64  
    65  // LogEventFn is a callback which allows Drivers to emit task events.
    66  type LogEventFn func(message string, annotations map[string]string)
    67  
    68  // dockerCoordinatorConfig is used to configure the Docker coordinator.
    69  type dockerCoordinatorConfig struct {
    70  	// logger is the logger the coordinator should use
    71  	logger hclog.Logger
    72  
    73  	// cleanup marks whether images should be deleted when the reference count
    74  	// is zero
    75  	cleanup bool
    76  
    77  	// client is the Docker client to use for communicating with Docker
    78  	client DockerImageClient
    79  
    80  	// removeDelay is the delay between an image's reference count going to
    81  	// zero and the image actually being deleted.
    82  	removeDelay time.Duration
    83  }
    84  
    85  // dockerCoordinator is used to coordinate actions against images to prevent
    86  // racy deletions. It can be thought of as a reference counter on images.
    87  type dockerCoordinator struct {
    88  	*dockerCoordinatorConfig
    89  
    90  	// imageLock is used to lock access to all images
    91  	imageLock sync.Mutex
    92  
    93  	// pullFutures is used to allow multiple callers to pull the same image but
    94  	// only have one request be sent to Docker
    95  	pullFutures map[string]*pullFuture
    96  
    97  	// pullLoggers is used to track the LogEventFn for each alloc pulling an image.
    98  	// If multiple alloc's are attempting to pull the same image, each will need
    99  	// to register its own LogEventFn with the coordinator.
   100  	pullLoggers map[string][]LogEventFn
   101  
   102  	// pullLoggerLock is used to sync access to the pullLoggers map
   103  	pullLoggerLock sync.RWMutex
   104  
   105  	// imageRefCount is the reference count of image IDs
   106  	imageRefCount map[string]map[string]struct{}
   107  
   108  	// deleteFuture is indexed by image ID and has a cancelable delete future
   109  	deleteFuture map[string]context.CancelFunc
   110  }
   111  
   112  // newDockerCoordinator returns a new Docker coordinator
   113  func newDockerCoordinator(config *dockerCoordinatorConfig) *dockerCoordinator {
   114  	if config.client == nil {
   115  		return nil
   116  	}
   117  
   118  	return &dockerCoordinator{
   119  		dockerCoordinatorConfig: config,
   120  		pullFutures:             make(map[string]*pullFuture),
   121  		pullLoggers:             make(map[string][]LogEventFn),
   122  		imageRefCount:           make(map[string]map[string]struct{}),
   123  		deleteFuture:            make(map[string]context.CancelFunc),
   124  	}
   125  }
   126  
   127  // PullImage is used to pull an image. It returns the pulled imaged ID or an
   128  // error that occurred during the pull
   129  func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConfiguration, callerID string, emitFn LogEventFn) (imageID string, err error) {
   130  	// Get the future
   131  	d.imageLock.Lock()
   132  	future, ok := d.pullFutures[image]
   133  	d.registerPullLogger(image, emitFn)
   134  	if !ok {
   135  		// Make the future
   136  		future = newPullFuture()
   137  		d.pullFutures[image] = future
   138  		go d.pullImageImpl(image, authOptions, future)
   139  	}
   140  	d.imageLock.Unlock()
   141  
   142  	// We unlock while we wait since this can take a while
   143  	id, err := future.wait().result()
   144  
   145  	d.imageLock.Lock()
   146  	defer d.imageLock.Unlock()
   147  
   148  	// Delete the future since we don't need it and we don't want to cache an
   149  	// image being there if it has possibly been manually deleted (outside of
   150  	// Nomad).
   151  	if _, ok := d.pullFutures[image]; ok {
   152  		delete(d.pullFutures, image)
   153  	}
   154  
   155  	// If we are cleaning up, we increment the reference count on the image
   156  	if err == nil && d.cleanup {
   157  		d.incrementImageReferenceImpl(id, image, callerID)
   158  	}
   159  
   160  	return id, err
   161  }
   162  
   163  // pullImageImpl is the implementation of pulling an image. The results are
   164  // returned via the passed future
   165  func (d *dockerCoordinator) pullImageImpl(image string, authOptions *docker.AuthConfiguration, future *pullFuture) {
   166  	defer d.clearPullLogger(image)
   167  	// Parse the repo and tag
   168  	repo, tag := parseDockerImage(image)
   169  	ctx, cancel := context.WithCancel(context.Background())
   170  	defer cancel()
   171  
   172  	pm := newImageProgressManager(image, cancel, d.handlePullInactivity,
   173  		d.handlePullProgressReport, d.handleSlowPullProgressReport)
   174  	defer pm.stop()
   175  
   176  	pullOptions := docker.PullImageOptions{
   177  		Repository:    repo,
   178  		Tag:           tag,
   179  		OutputStream:  pm,
   180  		RawJSONStream: true,
   181  		Context:       ctx,
   182  	}
   183  
   184  	// Attempt to pull the image
   185  	var auth docker.AuthConfiguration
   186  	if authOptions != nil {
   187  		auth = *authOptions
   188  	}
   189  
   190  	err := d.client.PullImage(pullOptions, auth)
   191  
   192  	if ctxErr := ctx.Err(); ctxErr == context.DeadlineExceeded {
   193  		d.logger.Error("timeout pulling container", "image_ref", dockerImageRef(repo, tag))
   194  		future.set("", recoverablePullError(ctxErr, image))
   195  		return
   196  	}
   197  
   198  	if err != nil {
   199  		d.logger.Error("failed pulling container", "image_ref", dockerImageRef(repo, tag),
   200  			"error", err)
   201  		future.set("", recoverablePullError(err, image))
   202  		return
   203  	}
   204  
   205  	d.logger.Debug("docker pull succeeded", "image_ref", dockerImageRef(repo, tag))
   206  
   207  	dockerImage, err := d.client.InspectImage(image)
   208  	if err != nil {
   209  		d.logger.Error("failed getting image id", "image_name", image, "error", err)
   210  		future.set("", recoverableErrTimeouts(err))
   211  		return
   212  	}
   213  
   214  	future.set(dockerImage.ID, nil)
   215  	return
   216  }
   217  
   218  // IncrementImageReference is used to increment an image reference count
   219  func (d *dockerCoordinator) IncrementImageReference(imageID, imageName, callerID string) {
   220  	d.imageLock.Lock()
   221  	defer d.imageLock.Unlock()
   222  	if d.cleanup {
   223  		d.incrementImageReferenceImpl(imageID, imageName, callerID)
   224  	}
   225  }
   226  
   227  // incrementImageReferenceImpl assumes the lock is held
   228  func (d *dockerCoordinator) incrementImageReferenceImpl(imageID, imageName, callerID string) {
   229  	// Cancel any pending delete
   230  	if cancel, ok := d.deleteFuture[imageID]; ok {
   231  		d.logger.Debug("cancelling removal of container image", "image_name", imageName)
   232  		cancel()
   233  		delete(d.deleteFuture, imageID)
   234  	}
   235  
   236  	// Increment the reference
   237  	references, ok := d.imageRefCount[imageID]
   238  	if !ok {
   239  		references = make(map[string]struct{})
   240  		d.imageRefCount[imageID] = references
   241  	}
   242  
   243  	if _, ok := references[callerID]; !ok {
   244  		references[callerID] = struct{}{}
   245  		d.logger.Debug("image reference count incremented", "image_name", imageName, "image_id", imageID, "references", len(references))
   246  	}
   247  }
   248  
   249  // RemoveImage removes the given image. If there are any errors removing the
   250  // image, the remove is retried internally.
   251  func (d *dockerCoordinator) RemoveImage(imageID, callerID string) {
   252  	d.imageLock.Lock()
   253  	defer d.imageLock.Unlock()
   254  
   255  	if !d.cleanup {
   256  		return
   257  	}
   258  
   259  	references, ok := d.imageRefCount[imageID]
   260  	if !ok {
   261  		d.logger.Warn("RemoveImage on non-referenced counted image id", "image_id", imageID)
   262  		return
   263  	}
   264  
   265  	// Decrement the reference count
   266  	delete(references, callerID)
   267  	count := len(references)
   268  	d.logger.Debug("image id reference count decremented", "image_id", imageID, "references", count)
   269  
   270  	// Nothing to do
   271  	if count != 0 {
   272  		return
   273  	}
   274  
   275  	// This should never be the case but we safety guard so we don't leak a
   276  	// cancel.
   277  	if cancel, ok := d.deleteFuture[imageID]; ok {
   278  		d.logger.Error("image id has lingering delete future", "image_id", imageID)
   279  		cancel()
   280  	}
   281  
   282  	// Setup a future to delete the image
   283  	ctx, cancel := context.WithCancel(context.Background())
   284  	d.deleteFuture[imageID] = cancel
   285  	go d.removeImageImpl(imageID, ctx)
   286  
   287  	// Delete the key from the reference count
   288  	delete(d.imageRefCount, imageID)
   289  }
   290  
   291  // removeImageImpl is used to remove an image. It wil wait the specified remove
   292  // delay to remove the image. If the context is cancelled before that the image
   293  // removal will be cancelled.
   294  func (d *dockerCoordinator) removeImageImpl(id string, ctx context.Context) {
   295  	// Wait for the delay or a cancellation event
   296  	select {
   297  	case <-ctx.Done():
   298  		// We have been cancelled
   299  		return
   300  	case <-time.After(d.removeDelay):
   301  	}
   302  
   303  	// Ensure we are suppose to delete. Do a short check while holding the lock
   304  	// so there can't be interleaving. There is still the smallest chance that
   305  	// the delete occurs after the image has been pulled but before it has been
   306  	// incremented. For handling that we just treat it as a recoverable error in
   307  	// the docker driver.
   308  	d.imageLock.Lock()
   309  	select {
   310  	case <-ctx.Done():
   311  		d.imageLock.Unlock()
   312  		return
   313  	default:
   314  	}
   315  	d.imageLock.Unlock()
   316  
   317  	for i := 0; i < 3; i++ {
   318  		err := d.client.RemoveImage(id)
   319  		if err == nil {
   320  			break
   321  		}
   322  
   323  		if err == docker.ErrNoSuchImage {
   324  			d.logger.Debug("unable to cleanup image, does not exist", "image_id", id)
   325  			return
   326  		}
   327  		if derr, ok := err.(*docker.Error); ok && derr.Status == 409 {
   328  			d.logger.Debug("unable to cleanup image, still in use", "image_id", id)
   329  			return
   330  		}
   331  
   332  		// Retry on unknown errors
   333  		d.logger.Debug("failed to remove image", "image_id", id, "attempt", i+1, "error", err)
   334  
   335  		select {
   336  		case <-ctx.Done():
   337  			// We have been cancelled
   338  			return
   339  		case <-time.After(3 * time.Second):
   340  		}
   341  	}
   342  
   343  	d.logger.Debug("cleanup removed downloaded image", "image_id", id)
   344  
   345  	// Cleanup the future from the map and free the context by cancelling it
   346  	d.imageLock.Lock()
   347  	if cancel, ok := d.deleteFuture[id]; ok {
   348  		delete(d.deleteFuture, id)
   349  		cancel()
   350  	}
   351  	d.imageLock.Unlock()
   352  }
   353  
   354  func (d *dockerCoordinator) registerPullLogger(image string, logger LogEventFn) {
   355  	d.pullLoggerLock.Lock()
   356  	defer d.pullLoggerLock.Unlock()
   357  	if _, ok := d.pullLoggers[image]; !ok {
   358  		d.pullLoggers[image] = []LogEventFn{}
   359  	}
   360  	d.pullLoggers[image] = append(d.pullLoggers[image], logger)
   361  }
   362  
   363  func (d *dockerCoordinator) clearPullLogger(image string) {
   364  	d.pullLoggerLock.Lock()
   365  	defer d.pullLoggerLock.Unlock()
   366  	delete(d.pullLoggers, image)
   367  }
   368  
   369  func (d *dockerCoordinator) emitEvent(image, message string, annotations map[string]string) {
   370  	d.pullLoggerLock.RLock()
   371  	defer d.pullLoggerLock.RUnlock()
   372  	for i := range d.pullLoggers[image] {
   373  		go d.pullLoggers[image][i](message, annotations)
   374  	}
   375  }
   376  
   377  func (d *dockerCoordinator) handlePullInactivity(image, msg string, timestamp time.Time) {
   378  	d.logger.Error("image pull aborted due to inactivity", "image_name", image,
   379  		"last_event_timestamp", timestamp.String(), "last_event", msg)
   380  }
   381  
   382  func (d *dockerCoordinator) handlePullProgressReport(image, msg string, _ time.Time) {
   383  	d.logger.Debug("image pull progress", "image_name", image, "message", msg)
   384  }
   385  
   386  func (d *dockerCoordinator) handleSlowPullProgressReport(image, msg string, _ time.Time) {
   387  	d.emitEvent(image, fmt.Sprintf("Docker image pull progress: %s", msg), map[string]string{
   388  		"image": image,
   389  	})
   390  }
   391  
   392  // recoverablePullError wraps the error gotten when trying to pull and image if
   393  // the error is recoverable.
   394  func recoverablePullError(err error, image string) error {
   395  	recoverable := true
   396  	if imageNotFoundMatcher.MatchString(err.Error()) {
   397  		recoverable = false
   398  	}
   399  	return structs.NewRecoverableError(fmt.Errorf("Failed to pull `%s`: %s", image, err), recoverable)
   400  }