github.com/hernad/nomad@v1.6.112/drivers/docker/reconcile_dangling.go (about)

     1  // Copyright (c) HashiCorp, Inc.
     2  // SPDX-License-Identifier: MPL-2.0
     3  
     4  package docker
     5  
     6  import (
     7  	"context"
     8  	"fmt"
     9  	"regexp"
    10  	"sync"
    11  	"time"
    12  
    13  	docker "github.com/fsouza/go-dockerclient"
    14  	hclog "github.com/hashicorp/go-hclog"
    15  	"github.com/hashicorp/go-set"
    16  )
    17  
// containerReconciler detects and kills unexpectedly running containers.
//
// Due to Docker's architecture and network-based communication, it is
// possible for Docker to start a container successfully but have the
// creation API call fail with a network error. containerReconciler
// scans for these untracked containers and kills them.
type containerReconciler struct {
	// ctx is watched by the background loop, which returns once it is done.
	// config supplies the settings read by the reconciler (Enabled, DryRun,
	// CreationGrace and period). getClient yields the Docker client used to
	// list and remove containers.
	ctx       context.Context
	config    *ContainerGCConfig
	logger    hclog.Logger
	getClient func() (*docker.Client, error)

	// Callbacks wired up by newReconciler:
	//   - isDriverHealthy gates each iteration; nothing is reconciled while
	//     it returns false.
	//   - trackedContainers returns the IDs that must never be removed.
	//   - isNomadContainer decides whether a listed container is a removal
	//     candidate at all.
	isDriverHealthy   func() bool
	trackedContainers func() *set.Set[string]
	isNomadContainer  func(c docker.APIContainers) bool

	// once guarantees Start launches the background goroutine at most once.
	once sync.Once
}
    36  
    37  func newReconciler(d *Driver) *containerReconciler {
    38  	return &containerReconciler{
    39  		ctx:       d.ctx,
    40  		config:    &d.config.GC.DanglingContainers,
    41  		getClient: d.getDockerClient,
    42  		logger:    d.logger,
    43  
    44  		isDriverHealthy:   func() bool { return d.previouslyDetected() && d.fingerprintSuccessful() },
    45  		trackedContainers: d.trackedContainers,
    46  		isNomadContainer:  isNomadContainer,
    47  	}
    48  }
    49  
    50  func (r *containerReconciler) Start() {
    51  	if !r.config.Enabled {
    52  		r.logger.Debug("skipping dangling containers handling; is disabled")
    53  		return
    54  	}
    55  
    56  	r.once.Do(func() {
    57  		go r.removeDanglingContainersGoroutine()
    58  	})
    59  }
    60  
    61  func (r *containerReconciler) removeDanglingContainersGoroutine() {
    62  	period := r.config.period
    63  
    64  	lastIterSucceeded := true
    65  
    66  	// ensure that we wait for at least a period or creation timeout
    67  	// for first container GC iteration
    68  	// The initial period is a grace period for restore allocation
    69  	// before a driver may kill containers launched by an earlier nomad
    70  	// process.
    71  	initialDelay := period
    72  	if r.config.CreationGrace > initialDelay {
    73  		initialDelay = r.config.CreationGrace
    74  	}
    75  
    76  	timer := time.NewTimer(initialDelay)
    77  	for {
    78  		select {
    79  		case <-timer.C:
    80  			if r.isDriverHealthy() {
    81  				err := r.removeDanglingContainersIteration()
    82  				if err != nil && lastIterSucceeded {
    83  					r.logger.Warn("failed to remove dangling containers", "error", err)
    84  				}
    85  				lastIterSucceeded = (err == nil)
    86  			}
    87  
    88  			timer.Reset(period)
    89  		case <-r.ctx.Done():
    90  			return
    91  		}
    92  	}
    93  }
    94  
    95  func (r *containerReconciler) removeDanglingContainersIteration() error {
    96  	cutoff := time.Now().Add(-r.config.CreationGrace)
    97  	tracked := r.trackedContainers()
    98  	untracked, err := r.untrackedContainers(tracked, cutoff)
    99  	if err != nil {
   100  		return fmt.Errorf("failed to find untracked containers: %v", err)
   101  	}
   102  
   103  	if untracked.Empty() {
   104  		return nil
   105  	}
   106  
   107  	if r.config.DryRun {
   108  		r.logger.Info("detected untracked containers", "container_ids", untracked)
   109  		return nil
   110  	}
   111  
   112  	dockerClient, err := r.getClient()
   113  	if err != nil {
   114  		return err
   115  	}
   116  
   117  	for _, id := range untracked.Slice() {
   118  		ctx, cancel := r.dockerAPIQueryContext()
   119  		err := dockerClient.RemoveContainer(docker.RemoveContainerOptions{
   120  			Context: ctx,
   121  			ID:      id,
   122  			Force:   true,
   123  		})
   124  		cancel()
   125  		if err != nil {
   126  			r.logger.Warn("failed to remove untracked container", "container_id", id, "error", err)
   127  		} else {
   128  			r.logger.Info("removed untracked container", "container_id", id)
   129  		}
   130  	}
   131  
   132  	return nil
   133  }
   134  
   135  // untrackedContainers returns the ids of containers that suspected
   136  // to have been started by Nomad but aren't tracked by this driver
   137  func (r *containerReconciler) untrackedContainers(tracked *set.Set[string], cutoffTime time.Time) (*set.Set[string], error) {
   138  	result := set.New[string](10)
   139  
   140  	ctx, cancel := r.dockerAPIQueryContext()
   141  	defer cancel()
   142  
   143  	dockerClient, err := r.getClient()
   144  	if err != nil {
   145  		return nil, err
   146  	}
   147  
   148  	cc, err := dockerClient.ListContainers(docker.ListContainersOptions{
   149  		Context: ctx,
   150  		All:     false, // only reconcile running containers
   151  	})
   152  	if err != nil {
   153  		return nil, fmt.Errorf("failed to list containers: %v", err)
   154  	}
   155  
   156  	cutoff := cutoffTime.Unix()
   157  
   158  	for _, c := range cc {
   159  		if tracked.Contains(c.ID) {
   160  			continue
   161  		}
   162  
   163  		if c.Created > cutoff {
   164  			continue
   165  		}
   166  
   167  		if !r.isNomadContainer(c) {
   168  			continue
   169  		}
   170  
   171  		result.Insert(c.ID)
   172  	}
   173  	return result, nil
   174  }
   175  
   176  // dockerAPIQueryTimeout returns a context for docker API response with an appropriate timeout
   177  // to protect against wedged locked-up API call.
   178  //
   179  // We'll try hitting Docker API on subsequent iteration.
   180  func (r *containerReconciler) dockerAPIQueryContext() (context.Context, context.CancelFunc) {
   181  	// use a reasonable floor to avoid very small limit
   182  	timeout := 30 * time.Second
   183  
   184  	if timeout < r.config.period {
   185  		timeout = r.config.period
   186  	}
   187  
   188  	return context.WithTimeout(context.Background(), timeout)
   189  }
   190  
   191  func isNomadContainer(c docker.APIContainers) bool {
   192  	if _, ok := c.Labels[dockerLabelAllocID]; ok {
   193  		return true
   194  	}
   195  
   196  	// pre-0.10 containers aren't tagged or labeled in any way,
   197  	// so use cheap heuristic based on mount paths
   198  	// before inspecting container details
   199  	if !hasMount(c, "/alloc") ||
   200  		!hasMount(c, "/local") ||
   201  		!hasMount(c, "/secrets") ||
   202  		!hasNomadName(c) {
   203  		return false
   204  	}
   205  
   206  	return true
   207  }
   208  
   209  func hasMount(c docker.APIContainers, p string) bool {
   210  	for _, m := range c.Mounts {
   211  		if m.Destination == p {
   212  			return true
   213  		}
   214  	}
   215  
   216  	return false
   217  }
   218  
// nomadContainerNamePattern matches a container name made of a slash, an
// arbitrary prefix, a dash, and then a lowercase-hex UUID-shaped sequence
// (8-4-4-4-12 hex digits). The pattern is unanchored, so a match anywhere in
// the name counts.
var nomadContainerNamePattern = regexp.MustCompile(`\/.*-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}`)
   220  
   221  func hasNomadName(c docker.APIContainers) bool {
   222  	for _, n := range c.Names {
   223  		if nomadContainerNamePattern.MatchString(n) {
   224  			return true
   225  		}
   226  	}
   227  	return false
   228  }
   229  
   230  // trackedContainers returns the set of container IDs of containers that were
   231  // started by Driver and are expected to be running. This includes both normal
   232  // Task containers, as well as infra pause containers.
   233  func (d *Driver) trackedContainers() *set.Set[string] {
   234  	// collect the task containers
   235  	ids := d.tasks.IDs()
   236  	// now also accumulate pause containers
   237  	return d.pauseContainers.union(ids)
   238  }