github.com/ilhicas/nomad@v0.11.8/drivers/docker/coordinator.go (about)

     1  package docker
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"regexp"
     7  	"sync"
     8  	"time"
     9  
    10  	docker "github.com/fsouza/go-dockerclient"
    11  	hclog "github.com/hashicorp/go-hclog"
    12  	"github.com/hashicorp/nomad/nomad/structs"
    13  )
    14  
    15  var (
    16  	// imageNotFoundMatcher is a regex expression that matches the image not
    17  	// found error Docker returns.
    18  	imageNotFoundMatcher = regexp.MustCompile(`Error: image .+ not found`)
    19  )
    20  
    21  // pullFuture is a sharable future for retrieving a pulled images ID and any
    22  // error that may have occurred during the pull.
    23  type pullFuture struct {
    24  	waitCh chan struct{}
    25  
    26  	err     error
    27  	imageID string
    28  }
    29  
    30  // newPullFuture returns a new pull future
    31  func newPullFuture() *pullFuture {
    32  	return &pullFuture{
    33  		waitCh: make(chan struct{}),
    34  	}
    35  }
    36  
    37  // wait waits till the future has a result
    38  func (p *pullFuture) wait() *pullFuture {
    39  	<-p.waitCh
    40  	return p
    41  }
    42  
    43  // result returns the results of the future and should only ever be called after
    44  // wait returns.
    45  func (p *pullFuture) result() (imageID string, err error) {
    46  	return p.imageID, p.err
    47  }
    48  
    49  // set is used to set the results and unblock any waiter. This may only be
    50  // called once.
    51  func (p *pullFuture) set(imageID string, err error) {
    52  	p.imageID = imageID
    53  	p.err = err
    54  	close(p.waitCh)
    55  }
    56  
    57  // DockerImageClient provides the methods required to do CRUD operations on the
    58  // Docker images
    59  type DockerImageClient interface {
    60  	PullImage(opts docker.PullImageOptions, auth docker.AuthConfiguration) error
    61  	InspectImage(id string) (*docker.Image, error)
    62  	RemoveImage(id string) error
    63  }
    64  
    65  // LogEventFn is a callback which allows Drivers to emit task events.
    66  type LogEventFn func(message string, annotations map[string]string)
    67  
    68  // noopLogEventFn satisfies the LogEventFn type but noops when called
    69  func noopLogEventFn(string, map[string]string) {}
    70  
    71  // dockerCoordinatorConfig is used to configure the Docker coordinator.
    72  type dockerCoordinatorConfig struct {
    73  	ctx context.Context
    74  
    75  	// logger is the logger the coordinator should use
    76  	logger hclog.Logger
    77  
    78  	// cleanup marks whether images should be deleted when the reference count
    79  	// is zero
    80  	cleanup bool
    81  
    82  	// client is the Docker client to use for communicating with Docker
    83  	client DockerImageClient
    84  
    85  	// removeDelay is the delay between an image's reference count going to
    86  	// zero and the image actually being deleted.
    87  	removeDelay time.Duration
    88  }
    89  
    90  // dockerCoordinator is used to coordinate actions against images to prevent
    91  // racy deletions. It can be thought of as a reference counter on images.
    92  type dockerCoordinator struct {
    93  	*dockerCoordinatorConfig
    94  
    95  	// imageLock is used to lock access to all images
    96  	imageLock sync.Mutex
    97  
    98  	// pullFutures is used to allow multiple callers to pull the same image but
    99  	// only have one request be sent to Docker
   100  	pullFutures map[string]*pullFuture
   101  
   102  	// pullLoggers is used to track the LogEventFn for each alloc pulling an image.
   103  	// If multiple alloc's are attempting to pull the same image, each will need
   104  	// to register its own LogEventFn with the coordinator.
   105  	pullLoggers map[string][]LogEventFn
   106  
   107  	// pullLoggerLock is used to sync access to the pullLoggers map
   108  	pullLoggerLock sync.RWMutex
   109  
   110  	// imageRefCount is the reference count of image IDs
   111  	imageRefCount map[string]map[string]struct{}
   112  
   113  	// deleteFuture is indexed by image ID and has a cancelable delete future
   114  	deleteFuture map[string]context.CancelFunc
   115  }
   116  
   117  // newDockerCoordinator returns a new Docker coordinator
   118  func newDockerCoordinator(config *dockerCoordinatorConfig) *dockerCoordinator {
   119  	if config.client == nil {
   120  		return nil
   121  	}
   122  
   123  	return &dockerCoordinator{
   124  		dockerCoordinatorConfig: config,
   125  		pullFutures:             make(map[string]*pullFuture),
   126  		pullLoggers:             make(map[string][]LogEventFn),
   127  		imageRefCount:           make(map[string]map[string]struct{}),
   128  		deleteFuture:            make(map[string]context.CancelFunc),
   129  	}
   130  }
   131  
   132  // PullImage is used to pull an image. It returns the pulled imaged ID or an
   133  // error that occurred during the pull
   134  func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConfiguration, callerID string, emitFn LogEventFn, pullActivityTimeout time.Duration) (imageID string, err error) {
   135  	// Get the future
   136  	d.imageLock.Lock()
   137  	future, ok := d.pullFutures[image]
   138  	d.registerPullLogger(image, emitFn)
   139  	if !ok {
   140  		// Make the future
   141  		future = newPullFuture()
   142  		d.pullFutures[image] = future
   143  		go d.pullImageImpl(image, authOptions, pullActivityTimeout, future)
   144  	}
   145  	d.imageLock.Unlock()
   146  
   147  	// We unlock while we wait since this can take a while
   148  	id, err := future.wait().result()
   149  
   150  	d.imageLock.Lock()
   151  	defer d.imageLock.Unlock()
   152  
   153  	// Delete the future since we don't need it and we don't want to cache an
   154  	// image being there if it has possibly been manually deleted (outside of
   155  	// Nomad).
   156  	if _, ok := d.pullFutures[image]; ok {
   157  		delete(d.pullFutures, image)
   158  	}
   159  
   160  	// If we are cleaning up, we increment the reference count on the image
   161  	if err == nil && d.cleanup {
   162  		d.incrementImageReferenceImpl(id, image, callerID)
   163  	}
   164  
   165  	return id, err
   166  }
   167  
   168  // pullImageImpl is the implementation of pulling an image. The results are
   169  // returned via the passed future
   170  func (d *dockerCoordinator) pullImageImpl(image string, authOptions *docker.AuthConfiguration, pullActivityTimeout time.Duration, future *pullFuture) {
   171  	defer d.clearPullLogger(image)
   172  	// Parse the repo and tag
   173  	repo, tag := parseDockerImage(image)
   174  	ctx, cancel := context.WithCancel(context.Background())
   175  	defer cancel()
   176  
   177  	pm := newImageProgressManager(image, cancel, pullActivityTimeout, d.handlePullInactivity,
   178  		d.handlePullProgressReport, d.handleSlowPullProgressReport)
   179  	defer pm.stop()
   180  
   181  	pullOptions := docker.PullImageOptions{
   182  		Repository:    repo,
   183  		Tag:           tag,
   184  		OutputStream:  pm,
   185  		RawJSONStream: true,
   186  		Context:       ctx,
   187  	}
   188  
   189  	// Attempt to pull the image
   190  	var auth docker.AuthConfiguration
   191  	if authOptions != nil {
   192  		auth = *authOptions
   193  	}
   194  
   195  	err := d.client.PullImage(pullOptions, auth)
   196  
   197  	if ctxErr := ctx.Err(); ctxErr == context.DeadlineExceeded {
   198  		d.logger.Error("timeout pulling container", "image_ref", dockerImageRef(repo, tag))
   199  		future.set("", recoverablePullError(ctxErr, image))
   200  		return
   201  	}
   202  
   203  	if err != nil {
   204  		d.logger.Error("failed pulling container", "image_ref", dockerImageRef(repo, tag),
   205  			"error", err)
   206  		future.set("", recoverablePullError(err, image))
   207  		return
   208  	}
   209  
   210  	d.logger.Debug("docker pull succeeded", "image_ref", dockerImageRef(repo, tag))
   211  
   212  	dockerImage, err := d.client.InspectImage(image)
   213  	if err != nil {
   214  		d.logger.Error("failed getting image id", "image_name", image, "error", err)
   215  		future.set("", recoverableErrTimeouts(err))
   216  		return
   217  	}
   218  
   219  	future.set(dockerImage.ID, nil)
   220  	return
   221  }
   222  
   223  // IncrementImageReference is used to increment an image reference count
   224  func (d *dockerCoordinator) IncrementImageReference(imageID, imageName, callerID string) {
   225  	d.imageLock.Lock()
   226  	defer d.imageLock.Unlock()
   227  	if d.cleanup {
   228  		d.incrementImageReferenceImpl(imageID, imageName, callerID)
   229  	}
   230  }
   231  
   232  // incrementImageReferenceImpl assumes the lock is held
   233  func (d *dockerCoordinator) incrementImageReferenceImpl(imageID, imageName, callerID string) {
   234  	// Cancel any pending delete
   235  	if cancel, ok := d.deleteFuture[imageID]; ok {
   236  		d.logger.Debug("cancelling removal of container image", "image_name", imageName)
   237  		cancel()
   238  		delete(d.deleteFuture, imageID)
   239  	}
   240  
   241  	// Increment the reference
   242  	references, ok := d.imageRefCount[imageID]
   243  	if !ok {
   244  		references = make(map[string]struct{})
   245  		d.imageRefCount[imageID] = references
   246  	}
   247  
   248  	if _, ok := references[callerID]; !ok {
   249  		references[callerID] = struct{}{}
   250  		d.logger.Debug("image reference count incremented", "image_name", imageName, "image_id", imageID, "references", len(references))
   251  	}
   252  }
   253  
   254  // RemoveImage removes the given image. If there are any errors removing the
   255  // image, the remove is retried internally.
   256  func (d *dockerCoordinator) RemoveImage(imageID, callerID string) {
   257  	d.imageLock.Lock()
   258  	defer d.imageLock.Unlock()
   259  
   260  	if !d.cleanup {
   261  		return
   262  	}
   263  
   264  	references, ok := d.imageRefCount[imageID]
   265  	if !ok {
   266  		d.logger.Warn("RemoveImage on non-referenced counted image id", "image_id", imageID)
   267  		return
   268  	}
   269  
   270  	// Decrement the reference count
   271  	delete(references, callerID)
   272  	count := len(references)
   273  	d.logger.Debug("image id reference count decremented", "image_id", imageID, "references", count)
   274  
   275  	// Nothing to do
   276  	if count != 0 {
   277  		return
   278  	}
   279  
   280  	// This should never be the case but we safety guard so we don't leak a
   281  	// cancel.
   282  	if cancel, ok := d.deleteFuture[imageID]; ok {
   283  		d.logger.Error("image id has lingering delete future", "image_id", imageID)
   284  		cancel()
   285  	}
   286  
   287  	// Setup a future to delete the image
   288  	ctx, cancel := context.WithCancel(d.ctx)
   289  	d.deleteFuture[imageID] = cancel
   290  	go d.removeImageImpl(imageID, ctx)
   291  
   292  	// Delete the key from the reference count
   293  	delete(d.imageRefCount, imageID)
   294  }
   295  
   296  // removeImageImpl is used to remove an image. It wil wait the specified remove
   297  // delay to remove the image. If the context is cancelled before that the image
   298  // removal will be cancelled.
   299  func (d *dockerCoordinator) removeImageImpl(id string, ctx context.Context) {
   300  	// Wait for the delay or a cancellation event
   301  	select {
   302  	case <-ctx.Done():
   303  		// We have been cancelled
   304  		return
   305  	case <-time.After(d.removeDelay):
   306  	}
   307  
   308  	// Ensure we are suppose to delete. Do a short check while holding the lock
   309  	// so there can't be interleaving. There is still the smallest chance that
   310  	// the delete occurs after the image has been pulled but before it has been
   311  	// incremented. For handling that we just treat it as a recoverable error in
   312  	// the docker driver.
   313  	d.imageLock.Lock()
   314  	select {
   315  	case <-ctx.Done():
   316  		d.imageLock.Unlock()
   317  		return
   318  	default:
   319  	}
   320  	d.imageLock.Unlock()
   321  
   322  	for i := 0; i < 3; i++ {
   323  		err := d.client.RemoveImage(id)
   324  		if err == nil {
   325  			break
   326  		}
   327  
   328  		if err == docker.ErrNoSuchImage {
   329  			d.logger.Debug("unable to cleanup image, does not exist", "image_id", id)
   330  			return
   331  		}
   332  		if derr, ok := err.(*docker.Error); ok && derr.Status == 409 {
   333  			d.logger.Debug("unable to cleanup image, still in use", "image_id", id)
   334  			return
   335  		}
   336  
   337  		// Retry on unknown errors
   338  		d.logger.Debug("failed to remove image", "image_id", id, "attempt", i+1, "error", err)
   339  
   340  		select {
   341  		case <-ctx.Done():
   342  			// We have been cancelled
   343  			return
   344  		case <-time.After(3 * time.Second):
   345  		}
   346  	}
   347  
   348  	d.logger.Debug("cleanup removed downloaded image", "image_id", id)
   349  
   350  	// Cleanup the future from the map and free the context by cancelling it
   351  	d.imageLock.Lock()
   352  	if cancel, ok := d.deleteFuture[id]; ok {
   353  		delete(d.deleteFuture, id)
   354  		cancel()
   355  	}
   356  	d.imageLock.Unlock()
   357  }
   358  
   359  func (d *dockerCoordinator) registerPullLogger(image string, logger LogEventFn) {
   360  	d.pullLoggerLock.Lock()
   361  	defer d.pullLoggerLock.Unlock()
   362  	if _, ok := d.pullLoggers[image]; !ok {
   363  		d.pullLoggers[image] = []LogEventFn{}
   364  	}
   365  	d.pullLoggers[image] = append(d.pullLoggers[image], logger)
   366  }
   367  
   368  func (d *dockerCoordinator) clearPullLogger(image string) {
   369  	d.pullLoggerLock.Lock()
   370  	defer d.pullLoggerLock.Unlock()
   371  	delete(d.pullLoggers, image)
   372  }
   373  
   374  func (d *dockerCoordinator) emitEvent(image, message string, annotations map[string]string) {
   375  	d.pullLoggerLock.RLock()
   376  	defer d.pullLoggerLock.RUnlock()
   377  	for i := range d.pullLoggers[image] {
   378  		go d.pullLoggers[image][i](message, annotations)
   379  	}
   380  }
   381  
   382  func (d *dockerCoordinator) handlePullInactivity(image, msg string, timestamp time.Time) {
   383  	d.logger.Error("image pull aborted due to inactivity", "image_name", image,
   384  		"last_event_timestamp", timestamp.String(), "last_event", msg)
   385  }
   386  
   387  func (d *dockerCoordinator) handlePullProgressReport(image, msg string, _ time.Time) {
   388  	d.logger.Debug("image pull progress", "image_name", image, "message", msg)
   389  }
   390  
   391  func (d *dockerCoordinator) handleSlowPullProgressReport(image, msg string, _ time.Time) {
   392  	d.emitEvent(image, fmt.Sprintf("Docker image pull progress: %s", msg), map[string]string{
   393  		"image": image,
   394  	})
   395  }
   396  
   397  // recoverablePullError wraps the error gotten when trying to pull and image if
   398  // the error is recoverable.
   399  func recoverablePullError(err error, image string) error {
   400  	recoverable := true
   401  	if imageNotFoundMatcher.MatchString(err.Error()) {
   402  		recoverable = false
   403  	}
   404  	return structs.NewRecoverableError(fmt.Errorf("Failed to pull `%s`: %s", image, err), recoverable)
   405  }