github.com/ilhicas/nomad@v0.11.8/drivers/docker/reconciler.go (about) 1 package docker 2 3 import ( 4 "context" 5 "fmt" 6 "regexp" 7 "sync" 8 "time" 9 10 docker "github.com/fsouza/go-dockerclient" 11 hclog "github.com/hashicorp/go-hclog" 12 ) 13 14 // containerReconciler detects and kills unexpectedly running containers. 15 // 16 // Due to Docker architecture and network based communication, it is 17 // possible for Docker to start a container successfully, but have the 18 // creation API call fail with a network error. containerReconciler 19 // scans for these untracked containers and kill them. 20 type containerReconciler struct { 21 ctx context.Context 22 config *ContainerGCConfig 23 client *docker.Client 24 logger hclog.Logger 25 26 isDriverHealthy func() bool 27 trackedContainers func() map[string]bool 28 isNomadContainer func(c docker.APIContainers) bool 29 30 once sync.Once 31 } 32 33 func newReconciler(d *Driver) *containerReconciler { 34 return &containerReconciler{ 35 ctx: d.ctx, 36 config: &d.config.GC.DanglingContainers, 37 client: client, 38 logger: d.logger, 39 40 isDriverHealthy: func() bool { return d.previouslyDetected() && d.fingerprintSuccessful() }, 41 trackedContainers: d.trackedContainers, 42 isNomadContainer: isNomadContainer, 43 } 44 } 45 46 func (r *containerReconciler) Start() { 47 if !r.config.Enabled { 48 r.logger.Debug("skipping dangling containers handling; is disabled") 49 return 50 } 51 52 r.once.Do(func() { 53 go r.removeDanglingContainersGoroutine() 54 }) 55 } 56 57 func (r *containerReconciler) removeDanglingContainersGoroutine() { 58 period := r.config.period 59 60 lastIterSucceeded := true 61 62 // ensure that we wait for at least a period or creation timeout 63 // for first container GC iteration 64 // The initial period is a grace period for restore allocation 65 // before a driver may kill containers launched by an earlier nomad 66 // process. 67 initialDelay := period 68 if r.config.CreationGrace > initialDelay { 69 initialDelay = r.config.CreationGrace 70 } 71 72 timer := time.NewTimer(initialDelay) 73 for { 74 select { 75 case <-timer.C: 76 if r.isDriverHealthy() { 77 err := r.removeDanglingContainersIteration() 78 if err != nil && lastIterSucceeded { 79 r.logger.Warn("failed to remove dangling containers", "error", err) 80 } 81 lastIterSucceeded = (err == nil) 82 } 83 84 timer.Reset(period) 85 case <-r.ctx.Done(): 86 return 87 } 88 } 89 } 90 91 func (r *containerReconciler) removeDanglingContainersIteration() error { 92 cutoff := time.Now().Add(-r.config.CreationGrace) 93 tracked := r.trackedContainers() 94 untracked, err := r.untrackedContainers(tracked, cutoff) 95 if err != nil { 96 return fmt.Errorf("failed to find untracked containers: %v", err) 97 } 98 99 if len(untracked) == 0 { 100 return nil 101 } 102 103 if r.config.DryRun { 104 r.logger.Info("detected untracked containers", "container_ids", untracked) 105 return nil 106 } 107 108 for _, id := range untracked { 109 ctx, cancel := r.dockerAPIQueryContext() 110 err := client.RemoveContainer(docker.RemoveContainerOptions{ 111 Context: ctx, 112 ID: id, 113 Force: true, 114 }) 115 cancel() 116 if err != nil { 117 r.logger.Warn("failed to remove untracked container", "container_id", id, "error", err) 118 } else { 119 r.logger.Info("removed untracked container", "container_id", id) 120 } 121 } 122 123 return nil 124 } 125 126 // untrackedContainers returns the ids of containers that suspected 127 // to have been started by Nomad but aren't tracked by this driver 128 func (r *containerReconciler) untrackedContainers(tracked map[string]bool, cutoffTime time.Time) ([]string, error) { 129 result := []string{} 130 131 ctx, cancel := r.dockerAPIQueryContext() 132 defer cancel() 133 134 cc, err := client.ListContainers(docker.ListContainersOptions{ 135 Context: ctx, 136 All: false, // only reconcile running containers 137 }) 138 if err != nil { 139 return nil, fmt.Errorf("failed to list containers: %v", err) 140 } 141 142 cutoff := cutoffTime.Unix() 143 144 for _, c := range cc { 145 if tracked[c.ID] { 146 continue 147 } 148 149 if c.Created > cutoff { 150 continue 151 } 152 153 if !r.isNomadContainer(c) { 154 continue 155 } 156 157 result = append(result, c.ID) 158 } 159 160 return result, nil 161 } 162 163 // dockerAPIQueryTimeout returns a context for docker API response with an appropriate timeout 164 // to protect against wedged locked-up API call. 165 // 166 // We'll try hitting Docker API on subsequent iteration. 167 func (r *containerReconciler) dockerAPIQueryContext() (context.Context, context.CancelFunc) { 168 // use a reasoanble floor to avoid very small limit 169 timeout := 30 * time.Second 170 171 if timeout < r.config.period { 172 timeout = r.config.period 173 } 174 175 return context.WithTimeout(context.Background(), timeout) 176 } 177 178 func isNomadContainer(c docker.APIContainers) bool { 179 if _, ok := c.Labels[dockerLabelAllocID]; ok { 180 return true 181 } 182 183 // pre-0.10 containers aren't tagged or labeled in any way, 184 // so use cheap heuristic based on mount paths 185 // before inspecting container details 186 if !hasMount(c, "/alloc") || 187 !hasMount(c, "/local") || 188 !hasMount(c, "/secrets") || 189 !hasNomadName(c) { 190 return false 191 } 192 193 return true 194 } 195 196 func hasMount(c docker.APIContainers, p string) bool { 197 for _, m := range c.Mounts { 198 if m.Destination == p { 199 return true 200 } 201 } 202 203 return false 204 } 205 206 var nomadContainerNamePattern = regexp.MustCompile(`\/.*-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}`) 207 208 func hasNomadName(c docker.APIContainers) bool { 209 for _, n := range c.Names { 210 if nomadContainerNamePattern.MatchString(n) { 211 return true 212 } 213 } 214 215 return false 216 } 217 218 func (d *Driver) trackedContainers() map[string]bool { 219 d.tasks.lock.RLock() 220 defer d.tasks.lock.RUnlock() 221 222 r := make(map[string]bool, len(d.tasks.store)) 223 for _, h := range d.tasks.store { 224 r[h.containerID] = true 225 } 226 227 return r 228 }