github.com/djenriquez/nomad-1@v0.8.1/client/gc.go (about) 1 package client 2 3 import ( 4 "container/heap" 5 "fmt" 6 "log" 7 "sync" 8 "time" 9 10 "github.com/hashicorp/nomad/client/stats" 11 "github.com/hashicorp/nomad/nomad/structs" 12 ) 13 14 const ( 15 // MB is a constant which converts values in bytes to MB 16 MB = 1024 * 1024 17 ) 18 19 // GCConfig allows changing the behaviour of the garbage collector 20 type GCConfig struct { 21 // MaxAllocs is the maximum number of allocations to track before a GC 22 // is triggered. 23 MaxAllocs int 24 DiskUsageThreshold float64 25 InodeUsageThreshold float64 26 Interval time.Duration 27 ReservedDiskMB int 28 ParallelDestroys int 29 } 30 31 // AllocCounter is used by AllocGarbageCollector to discover how many un-GC'd 32 // allocations a client has and is generally fulfilled by the Client. 33 type AllocCounter interface { 34 NumAllocs() int 35 } 36 37 // AllocGarbageCollector garbage collects terminated allocations on a node 38 type AllocGarbageCollector struct { 39 config *GCConfig 40 41 // allocRunners marked for GC 42 allocRunners *IndexedGCAllocPQ 43 44 // statsCollector for node based thresholds (eg disk) 45 statsCollector stats.NodeStatsCollector 46 47 // allocCounter return the number of un-GC'd allocs on this node 48 allocCounter AllocCounter 49 50 // destroyCh is a semaphore for rate limiting concurrent garbage 51 // collections 52 destroyCh chan struct{} 53 54 // shutdownCh is closed when the GC's run method should exit 55 shutdownCh chan struct{} 56 57 // triggerCh is ticked by the Trigger method to cause a GC 58 triggerCh chan struct{} 59 60 logger *log.Logger 61 } 62 63 // NewAllocGarbageCollector returns a garbage collector for terminated 64 // allocations on a node. Must call Run() in a goroutine enable periodic 65 // garbage collection. 66 func NewAllocGarbageCollector(logger *log.Logger, statsCollector stats.NodeStatsCollector, ac AllocCounter, config *GCConfig) *AllocGarbageCollector { 67 // Require at least 1 to make progress 68 if config.ParallelDestroys <= 0 { 69 logger.Printf("[WARN] client.gc: garbage collector defaulting parallelism to 1 due to invalid input value of %d", config.ParallelDestroys) 70 config.ParallelDestroys = 1 71 } 72 73 gc := &AllocGarbageCollector{ 74 allocRunners: NewIndexedGCAllocPQ(), 75 statsCollector: statsCollector, 76 allocCounter: ac, 77 config: config, 78 logger: logger, 79 destroyCh: make(chan struct{}, config.ParallelDestroys), 80 shutdownCh: make(chan struct{}), 81 triggerCh: make(chan struct{}, 1), 82 } 83 84 return gc 85 } 86 87 // Run the periodic garbage collector. 88 func (a *AllocGarbageCollector) Run() { 89 ticker := time.NewTicker(a.config.Interval) 90 for { 91 select { 92 case <-a.triggerCh: 93 case <-ticker.C: 94 case <-a.shutdownCh: 95 ticker.Stop() 96 return 97 } 98 99 if err := a.keepUsageBelowThreshold(); err != nil { 100 a.logger.Printf("[ERR] client.gc: error garbage collecting allocation: %v", err) 101 } 102 } 103 } 104 105 // Force the garbage collector to run. 106 func (a *AllocGarbageCollector) Trigger() { 107 select { 108 case a.triggerCh <- struct{}{}: 109 default: 110 // already triggered 111 } 112 } 113 114 // keepUsageBelowThreshold collects disk usage information and garbage collects 115 // allocations to make disk space available. 116 func (a *AllocGarbageCollector) keepUsageBelowThreshold() error { 117 for { 118 select { 119 case <-a.shutdownCh: 120 return nil 121 default: 122 } 123 124 // Check if we have enough free space 125 if err := a.statsCollector.Collect(); err != nil { 126 return err 127 } 128 129 // See if we are below thresholds for used disk space and inode usage 130 diskStats := a.statsCollector.Stats().AllocDirStats 131 reason := "" 132 level := "WARN" 133 134 liveAllocs := a.allocCounter.NumAllocs() 135 136 switch { 137 case diskStats.UsedPercent > a.config.DiskUsageThreshold: 138 reason = fmt.Sprintf("disk usage of %.0f is over gc threshold of %.0f", 139 diskStats.UsedPercent, a.config.DiskUsageThreshold) 140 case diskStats.InodesUsedPercent > a.config.InodeUsageThreshold: 141 reason = fmt.Sprintf("inode usage of %.0f is over gc threshold of %.0f", 142 diskStats.InodesUsedPercent, a.config.InodeUsageThreshold) 143 case liveAllocs > a.config.MaxAllocs: 144 // if we're unable to gc, don't WARN until at least 2x over limit 145 if liveAllocs < (a.config.MaxAllocs * 2) { 146 level = "INFO" 147 } 148 reason = fmt.Sprintf("number of allocations (%d) is over the limit (%d)", liveAllocs, a.config.MaxAllocs) 149 } 150 151 if reason == "" { 152 // No reason to gc, exit 153 break 154 } 155 156 // Collect an allocation 157 gcAlloc := a.allocRunners.Pop() 158 if gcAlloc == nil { 159 a.logger.Printf("[%s] client.gc: garbage collection due to %s skipped because no terminal allocations", level, reason) 160 break 161 } 162 163 // Destroy the alloc runner and wait until it exits 164 a.destroyAllocRunner(gcAlloc.allocRunner, reason) 165 } 166 return nil 167 } 168 169 // destroyAllocRunner is used to destroy an allocation runner. It will acquire a 170 // lock to restrict parallelism and then destroy the alloc runner, returning 171 // once the allocation has been destroyed. 172 func (a *AllocGarbageCollector) destroyAllocRunner(ar *AllocRunner, reason string) { 173 id := "<nil>" 174 if alloc := ar.Alloc(); alloc != nil { 175 id = alloc.ID 176 } 177 a.logger.Printf("[INFO] client.gc: garbage collecting allocation %s due to %s", id, reason) 178 179 // Acquire the destroy lock 180 select { 181 case <-a.shutdownCh: 182 return 183 case a.destroyCh <- struct{}{}: 184 } 185 186 ar.Destroy() 187 188 select { 189 case <-ar.WaitCh(): 190 case <-a.shutdownCh: 191 } 192 193 a.logger.Printf("[DEBUG] client.gc: garbage collected %q", ar.Alloc().ID) 194 195 // Release the lock 196 <-a.destroyCh 197 } 198 199 func (a *AllocGarbageCollector) Stop() { 200 close(a.shutdownCh) 201 } 202 203 // Collect garbage collects a single allocation on a node. Returns true if 204 // alloc was found and garbage collected; otherwise false. 205 func (a *AllocGarbageCollector) Collect(allocID string) bool { 206 if gcAlloc := a.allocRunners.Remove(allocID); gcAlloc != nil { 207 a.destroyAllocRunner(gcAlloc.allocRunner, "forced collection") 208 return true 209 } 210 211 a.logger.Printf("[DEBUG] client.gc: alloc %s is invalid or was already garbage collected", allocID) 212 return false 213 } 214 215 // CollectAll garbage collects all terminated allocations on a node 216 func (a *AllocGarbageCollector) CollectAll() { 217 for { 218 select { 219 case <-a.shutdownCh: 220 return 221 default: 222 } 223 224 gcAlloc := a.allocRunners.Pop() 225 if gcAlloc == nil { 226 return 227 } 228 229 go a.destroyAllocRunner(gcAlloc.allocRunner, "forced full node collection") 230 } 231 } 232 233 // MakeRoomFor garbage collects enough number of allocations in the terminal 234 // state to make room for new allocations 235 func (a *AllocGarbageCollector) MakeRoomFor(allocations []*structs.Allocation) error { 236 if len(allocations) == 0 { 237 // Nothing to make room for! 238 return nil 239 } 240 241 // GC allocs until below the max limit + the new allocations 242 max := a.config.MaxAllocs - len(allocations) 243 for a.allocCounter.NumAllocs() > max { 244 select { 245 case <-a.shutdownCh: 246 return nil 247 default: 248 } 249 250 gcAlloc := a.allocRunners.Pop() 251 if gcAlloc == nil { 252 // It's fine if we can't lower below the limit here as 253 // we'll keep trying to drop below the limit with each 254 // periodic gc 255 break 256 } 257 258 // Destroy the alloc runner and wait until it exits 259 a.destroyAllocRunner(gcAlloc.allocRunner, fmt.Sprintf("new allocations and over max (%d)", a.config.MaxAllocs)) 260 } 261 262 totalResource := &structs.Resources{} 263 for _, alloc := range allocations { 264 if err := totalResource.Add(alloc.Resources); err != nil { 265 return err 266 } 267 } 268 269 // If the host has enough free space to accommodate the new allocations then 270 // we don't need to garbage collect terminated allocations 271 if hostStats := a.statsCollector.Stats(); hostStats != nil { 272 var availableForAllocations uint64 273 if hostStats.AllocDirStats.Available < uint64(a.config.ReservedDiskMB*MB) { 274 availableForAllocations = 0 275 } else { 276 availableForAllocations = hostStats.AllocDirStats.Available - uint64(a.config.ReservedDiskMB*MB) 277 } 278 if uint64(totalResource.DiskMB*MB) < availableForAllocations { 279 return nil 280 } 281 } 282 283 var diskCleared int 284 for { 285 select { 286 case <-a.shutdownCh: 287 return nil 288 default: 289 } 290 291 // Collect host stats and see if we still need to remove older 292 // allocations 293 var allocDirStats *stats.DiskStats 294 if err := a.statsCollector.Collect(); err == nil { 295 if hostStats := a.statsCollector.Stats(); hostStats != nil { 296 allocDirStats = hostStats.AllocDirStats 297 } 298 } 299 300 if allocDirStats != nil { 301 if allocDirStats.Available >= uint64(totalResource.DiskMB*MB) { 302 break 303 } 304 } else { 305 // Falling back to a simpler model to know if we have enough disk 306 // space if stats collection fails 307 if diskCleared >= totalResource.DiskMB { 308 break 309 } 310 } 311 312 gcAlloc := a.allocRunners.Pop() 313 if gcAlloc == nil { 314 break 315 } 316 317 ar := gcAlloc.allocRunner 318 alloc := ar.Alloc() 319 320 // Destroy the alloc runner and wait until it exits 321 a.destroyAllocRunner(ar, fmt.Sprintf("freeing %d MB for new allocations", alloc.Resources.DiskMB)) 322 323 // Call stats collect again 324 diskCleared += alloc.Resources.DiskMB 325 } 326 return nil 327 } 328 329 // MarkForCollection starts tracking an allocation for Garbage Collection 330 func (a *AllocGarbageCollector) MarkForCollection(ar *AllocRunner) { 331 if ar.Alloc() == nil { 332 a.destroyAllocRunner(ar, "alloc is nil") 333 return 334 } 335 336 if a.allocRunners.Push(ar) { 337 a.logger.Printf("[INFO] client.gc: marking allocation %v for GC", ar.Alloc().ID) 338 } 339 } 340 341 // GCAlloc wraps an allocation runner and an index enabling it to be used within 342 // a PQ 343 type GCAlloc struct { 344 timeStamp time.Time 345 allocRunner *AllocRunner 346 index int 347 } 348 349 type GCAllocPQImpl []*GCAlloc 350 351 func (pq GCAllocPQImpl) Len() int { 352 return len(pq) 353 } 354 355 func (pq GCAllocPQImpl) Less(i, j int) bool { 356 return pq[i].timeStamp.Before(pq[j].timeStamp) 357 } 358 359 func (pq GCAllocPQImpl) Swap(i, j int) { 360 pq[i], pq[j] = pq[j], pq[i] 361 pq[i].index = i 362 pq[j].index = j 363 } 364 365 func (pq *GCAllocPQImpl) Push(x interface{}) { 366 n := len(*pq) 367 item := x.(*GCAlloc) 368 item.index = n 369 *pq = append(*pq, item) 370 } 371 372 func (pq *GCAllocPQImpl) Pop() interface{} { 373 old := *pq 374 n := len(old) 375 item := old[n-1] 376 item.index = -1 // for safety 377 *pq = old[0 : n-1] 378 return item 379 } 380 381 // IndexedGCAllocPQ is an indexed PQ which maintains a list of allocation runner 382 // based on their termination time. 383 type IndexedGCAllocPQ struct { 384 index map[string]*GCAlloc 385 heap GCAllocPQImpl 386 387 pqLock sync.Mutex 388 } 389 390 func NewIndexedGCAllocPQ() *IndexedGCAllocPQ { 391 return &IndexedGCAllocPQ{ 392 index: make(map[string]*GCAlloc), 393 heap: make(GCAllocPQImpl, 0), 394 } 395 } 396 397 // Push an alloc runner into the GC queue. Returns true if alloc was added, 398 // false if the alloc already existed. 399 func (i *IndexedGCAllocPQ) Push(ar *AllocRunner) bool { 400 i.pqLock.Lock() 401 defer i.pqLock.Unlock() 402 403 alloc := ar.Alloc() 404 if _, ok := i.index[alloc.ID]; ok { 405 // No work to do 406 return false 407 } 408 gcAlloc := &GCAlloc{ 409 timeStamp: time.Now(), 410 allocRunner: ar, 411 } 412 i.index[alloc.ID] = gcAlloc 413 heap.Push(&i.heap, gcAlloc) 414 return true 415 } 416 417 func (i *IndexedGCAllocPQ) Pop() *GCAlloc { 418 i.pqLock.Lock() 419 defer i.pqLock.Unlock() 420 421 if len(i.heap) == 0 { 422 return nil 423 } 424 425 gcAlloc := heap.Pop(&i.heap).(*GCAlloc) 426 delete(i.index, gcAlloc.allocRunner.Alloc().ID) 427 return gcAlloc 428 } 429 430 // Remove alloc from GC. Returns nil if alloc doesn't exist. 431 func (i *IndexedGCAllocPQ) Remove(allocID string) *GCAlloc { 432 i.pqLock.Lock() 433 defer i.pqLock.Unlock() 434 435 if gcAlloc, ok := i.index[allocID]; ok { 436 heap.Remove(&i.heap, gcAlloc.index) 437 delete(i.index, allocID) 438 return gcAlloc 439 } 440 441 return nil 442 } 443 444 func (i *IndexedGCAllocPQ) Length() int { 445 i.pqLock.Lock() 446 defer i.pqLock.Unlock() 447 448 return len(i.heap) 449 }