github.com/hernad/nomad@v1.6.112/drivers/docker/reconcile_dangling.go (about) 1 // Copyright (c) HashiCorp, Inc. 2 // SPDX-License-Identifier: MPL-2.0 3 4 package docker 5 6 import ( 7 "context" 8 "fmt" 9 "regexp" 10 "sync" 11 "time" 12 13 docker "github.com/fsouza/go-dockerclient" 14 hclog "github.com/hashicorp/go-hclog" 15 "github.com/hashicorp/go-set" 16 ) 17 18 // containerReconciler detects and kills unexpectedly running containers. 19 // 20 // Due to Docker architecture and network based communication, it is 21 // possible for Docker to start a container successfully, but have the 22 // creation API call fail with a network error. containerReconciler 23 // scans for these untracked containers and kill them. 24 type containerReconciler struct { 25 ctx context.Context 26 config *ContainerGCConfig 27 logger hclog.Logger 28 getClient func() (*docker.Client, error) 29 30 isDriverHealthy func() bool 31 trackedContainers func() *set.Set[string] 32 isNomadContainer func(c docker.APIContainers) bool 33 34 once sync.Once 35 } 36 37 func newReconciler(d *Driver) *containerReconciler { 38 return &containerReconciler{ 39 ctx: d.ctx, 40 config: &d.config.GC.DanglingContainers, 41 getClient: d.getDockerClient, 42 logger: d.logger, 43 44 isDriverHealthy: func() bool { return d.previouslyDetected() && d.fingerprintSuccessful() }, 45 trackedContainers: d.trackedContainers, 46 isNomadContainer: isNomadContainer, 47 } 48 } 49 50 func (r *containerReconciler) Start() { 51 if !r.config.Enabled { 52 r.logger.Debug("skipping dangling containers handling; is disabled") 53 return 54 } 55 56 r.once.Do(func() { 57 go r.removeDanglingContainersGoroutine() 58 }) 59 } 60 61 func (r *containerReconciler) removeDanglingContainersGoroutine() { 62 period := r.config.period 63 64 lastIterSucceeded := true 65 66 // ensure that we wait for at least a period or creation timeout 67 // for first container GC iteration 68 // The initial period is a grace period for restore allocation 69 // before a driver may kill containers launched by an earlier nomad 70 // process. 71 initialDelay := period 72 if r.config.CreationGrace > initialDelay { 73 initialDelay = r.config.CreationGrace 74 } 75 76 timer := time.NewTimer(initialDelay) 77 for { 78 select { 79 case <-timer.C: 80 if r.isDriverHealthy() { 81 err := r.removeDanglingContainersIteration() 82 if err != nil && lastIterSucceeded { 83 r.logger.Warn("failed to remove dangling containers", "error", err) 84 } 85 lastIterSucceeded = (err == nil) 86 } 87 88 timer.Reset(period) 89 case <-r.ctx.Done(): 90 return 91 } 92 } 93 } 94 95 func (r *containerReconciler) removeDanglingContainersIteration() error { 96 cutoff := time.Now().Add(-r.config.CreationGrace) 97 tracked := r.trackedContainers() 98 untracked, err := r.untrackedContainers(tracked, cutoff) 99 if err != nil { 100 return fmt.Errorf("failed to find untracked containers: %v", err) 101 } 102 103 if untracked.Empty() { 104 return nil 105 } 106 107 if r.config.DryRun { 108 r.logger.Info("detected untracked containers", "container_ids", untracked) 109 return nil 110 } 111 112 dockerClient, err := r.getClient() 113 if err != nil { 114 return err 115 } 116 117 for _, id := range untracked.Slice() { 118 ctx, cancel := r.dockerAPIQueryContext() 119 err := dockerClient.RemoveContainer(docker.RemoveContainerOptions{ 120 Context: ctx, 121 ID: id, 122 Force: true, 123 }) 124 cancel() 125 if err != nil { 126 r.logger.Warn("failed to remove untracked container", "container_id", id, "error", err) 127 } else { 128 r.logger.Info("removed untracked container", "container_id", id) 129 } 130 } 131 132 return nil 133 } 134 135 // untrackedContainers returns the ids of containers that suspected 136 // to have been started by Nomad but aren't tracked by this driver 137 func (r *containerReconciler) untrackedContainers(tracked *set.Set[string], cutoffTime time.Time) (*set.Set[string], error) { 138 result := set.New[string](10) 139 140 ctx, cancel := r.dockerAPIQueryContext() 141 defer cancel() 142 143 dockerClient, err := r.getClient() 144 if err != nil { 145 return nil, err 146 } 147 148 cc, err := dockerClient.ListContainers(docker.ListContainersOptions{ 149 Context: ctx, 150 All: false, // only reconcile running containers 151 }) 152 if err != nil { 153 return nil, fmt.Errorf("failed to list containers: %v", err) 154 } 155 156 cutoff := cutoffTime.Unix() 157 158 for _, c := range cc { 159 if tracked.Contains(c.ID) { 160 continue 161 } 162 163 if c.Created > cutoff { 164 continue 165 } 166 167 if !r.isNomadContainer(c) { 168 continue 169 } 170 171 result.Insert(c.ID) 172 } 173 return result, nil 174 } 175 176 // dockerAPIQueryTimeout returns a context for docker API response with an appropriate timeout 177 // to protect against wedged locked-up API call. 178 // 179 // We'll try hitting Docker API on subsequent iteration. 180 func (r *containerReconciler) dockerAPIQueryContext() (context.Context, context.CancelFunc) { 181 // use a reasonable floor to avoid very small limit 182 timeout := 30 * time.Second 183 184 if timeout < r.config.period { 185 timeout = r.config.period 186 } 187 188 return context.WithTimeout(context.Background(), timeout) 189 } 190 191 func isNomadContainer(c docker.APIContainers) bool { 192 if _, ok := c.Labels[dockerLabelAllocID]; ok { 193 return true 194 } 195 196 // pre-0.10 containers aren't tagged or labeled in any way, 197 // so use cheap heuristic based on mount paths 198 // before inspecting container details 199 if !hasMount(c, "/alloc") || 200 !hasMount(c, "/local") || 201 !hasMount(c, "/secrets") || 202 !hasNomadName(c) { 203 return false 204 } 205 206 return true 207 } 208 209 func hasMount(c docker.APIContainers, p string) bool { 210 for _, m := range c.Mounts { 211 if m.Destination == p { 212 return true 213 } 214 } 215 216 return false 217 } 218 219 var nomadContainerNamePattern = regexp.MustCompile(`\/.*-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}`) 220 221 func hasNomadName(c docker.APIContainers) bool { 222 for _, n := range c.Names { 223 if nomadContainerNamePattern.MatchString(n) { 224 return true 225 } 226 } 227 return false 228 } 229 230 // trackedContainers returns the set of container IDs of containers that were 231 // started by Driver and are expected to be running. This includes both normal 232 // Task containers, as well as infra pause containers. 233 func (d *Driver) trackedContainers() *set.Set[string] { 234 // collect the task containers 235 ids := d.tasks.IDs() 236 // now also accumulate pause containers 237 return d.pauseContainers.union(ids) 238 }