github.com/ilhicas/nomad@v0.11.8/drivers/docker/reconciler.go (about)

     1  package docker
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"regexp"
     7  	"sync"
     8  	"time"
     9  
    10  	docker "github.com/fsouza/go-dockerclient"
    11  	hclog "github.com/hashicorp/go-hclog"
    12  )
    13  
    14  // containerReconciler detects and kills unexpectedly running containers.
    15  //
    16  // Due to Docker architecture and network based communication, it is
    17  // possible for Docker to start a container successfully, but have the
    18  // creation API call fail with a network error.  containerReconciler
    19  // scans for these untracked containers and kill them.
    20  type containerReconciler struct {
    21  	ctx    context.Context
    22  	config *ContainerGCConfig
    23  	client *docker.Client
    24  	logger hclog.Logger
    25  
    26  	isDriverHealthy   func() bool
    27  	trackedContainers func() map[string]bool
    28  	isNomadContainer  func(c docker.APIContainers) bool
    29  
    30  	once sync.Once
    31  }
    32  
    33  func newReconciler(d *Driver) *containerReconciler {
    34  	return &containerReconciler{
    35  		ctx:    d.ctx,
    36  		config: &d.config.GC.DanglingContainers,
    37  		client: client,
    38  		logger: d.logger,
    39  
    40  		isDriverHealthy:   func() bool { return d.previouslyDetected() && d.fingerprintSuccessful() },
    41  		trackedContainers: d.trackedContainers,
    42  		isNomadContainer:  isNomadContainer,
    43  	}
    44  }
    45  
    46  func (r *containerReconciler) Start() {
    47  	if !r.config.Enabled {
    48  		r.logger.Debug("skipping dangling containers handling; is disabled")
    49  		return
    50  	}
    51  
    52  	r.once.Do(func() {
    53  		go r.removeDanglingContainersGoroutine()
    54  	})
    55  }
    56  
    57  func (r *containerReconciler) removeDanglingContainersGoroutine() {
    58  	period := r.config.period
    59  
    60  	lastIterSucceeded := true
    61  
    62  	// ensure that we wait for at least a period or creation timeout
    63  	// for first container GC iteration
    64  	// The initial period is a grace period for restore allocation
    65  	// before a driver may kill containers launched by an earlier nomad
    66  	// process.
    67  	initialDelay := period
    68  	if r.config.CreationGrace > initialDelay {
    69  		initialDelay = r.config.CreationGrace
    70  	}
    71  
    72  	timer := time.NewTimer(initialDelay)
    73  	for {
    74  		select {
    75  		case <-timer.C:
    76  			if r.isDriverHealthy() {
    77  				err := r.removeDanglingContainersIteration()
    78  				if err != nil && lastIterSucceeded {
    79  					r.logger.Warn("failed to remove dangling containers", "error", err)
    80  				}
    81  				lastIterSucceeded = (err == nil)
    82  			}
    83  
    84  			timer.Reset(period)
    85  		case <-r.ctx.Done():
    86  			return
    87  		}
    88  	}
    89  }
    90  
    91  func (r *containerReconciler) removeDanglingContainersIteration() error {
    92  	cutoff := time.Now().Add(-r.config.CreationGrace)
    93  	tracked := r.trackedContainers()
    94  	untracked, err := r.untrackedContainers(tracked, cutoff)
    95  	if err != nil {
    96  		return fmt.Errorf("failed to find untracked containers: %v", err)
    97  	}
    98  
    99  	if len(untracked) == 0 {
   100  		return nil
   101  	}
   102  
   103  	if r.config.DryRun {
   104  		r.logger.Info("detected untracked containers", "container_ids", untracked)
   105  		return nil
   106  	}
   107  
   108  	for _, id := range untracked {
   109  		ctx, cancel := r.dockerAPIQueryContext()
   110  		err := client.RemoveContainer(docker.RemoveContainerOptions{
   111  			Context: ctx,
   112  			ID:      id,
   113  			Force:   true,
   114  		})
   115  		cancel()
   116  		if err != nil {
   117  			r.logger.Warn("failed to remove untracked container", "container_id", id, "error", err)
   118  		} else {
   119  			r.logger.Info("removed untracked container", "container_id", id)
   120  		}
   121  	}
   122  
   123  	return nil
   124  }
   125  
   126  // untrackedContainers returns the ids of containers that suspected
   127  // to have been started by Nomad but aren't tracked by this driver
   128  func (r *containerReconciler) untrackedContainers(tracked map[string]bool, cutoffTime time.Time) ([]string, error) {
   129  	result := []string{}
   130  
   131  	ctx, cancel := r.dockerAPIQueryContext()
   132  	defer cancel()
   133  
   134  	cc, err := client.ListContainers(docker.ListContainersOptions{
   135  		Context: ctx,
   136  		All:     false, // only reconcile running containers
   137  	})
   138  	if err != nil {
   139  		return nil, fmt.Errorf("failed to list containers: %v", err)
   140  	}
   141  
   142  	cutoff := cutoffTime.Unix()
   143  
   144  	for _, c := range cc {
   145  		if tracked[c.ID] {
   146  			continue
   147  		}
   148  
   149  		if c.Created > cutoff {
   150  			continue
   151  		}
   152  
   153  		if !r.isNomadContainer(c) {
   154  			continue
   155  		}
   156  
   157  		result = append(result, c.ID)
   158  	}
   159  
   160  	return result, nil
   161  }
   162  
   163  // dockerAPIQueryTimeout returns a context for docker API response with an appropriate timeout
   164  // to protect against wedged locked-up API call.
   165  //
   166  // We'll try hitting Docker API on subsequent iteration.
   167  func (r *containerReconciler) dockerAPIQueryContext() (context.Context, context.CancelFunc) {
   168  	// use a reasoanble floor to avoid very small limit
   169  	timeout := 30 * time.Second
   170  
   171  	if timeout < r.config.period {
   172  		timeout = r.config.period
   173  	}
   174  
   175  	return context.WithTimeout(context.Background(), timeout)
   176  }
   177  
   178  func isNomadContainer(c docker.APIContainers) bool {
   179  	if _, ok := c.Labels[dockerLabelAllocID]; ok {
   180  		return true
   181  	}
   182  
   183  	// pre-0.10 containers aren't tagged or labeled in any way,
   184  	// so use cheap heuristic based on mount paths
   185  	// before inspecting container details
   186  	if !hasMount(c, "/alloc") ||
   187  		!hasMount(c, "/local") ||
   188  		!hasMount(c, "/secrets") ||
   189  		!hasNomadName(c) {
   190  		return false
   191  	}
   192  
   193  	return true
   194  }
   195  
   196  func hasMount(c docker.APIContainers, p string) bool {
   197  	for _, m := range c.Mounts {
   198  		if m.Destination == p {
   199  			return true
   200  		}
   201  	}
   202  
   203  	return false
   204  }
   205  
   206  var nomadContainerNamePattern = regexp.MustCompile(`\/.*-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}`)
   207  
   208  func hasNomadName(c docker.APIContainers) bool {
   209  	for _, n := range c.Names {
   210  		if nomadContainerNamePattern.MatchString(n) {
   211  			return true
   212  		}
   213  	}
   214  
   215  	return false
   216  }
   217  
   218  func (d *Driver) trackedContainers() map[string]bool {
   219  	d.tasks.lock.RLock()
   220  	defer d.tasks.lock.RUnlock()
   221  
   222  	r := make(map[string]bool, len(d.tasks.store))
   223  	for _, h := range d.tasks.store {
   224  		r[h.containerID] = true
   225  	}
   226  
   227  	return r
   228  }