github.com/ferranbt/nomad@v0.9.3-0.20190607002617-85c449b7667c/client/gc.go (about) 1 package client 2 3 import ( 4 "container/heap" 5 "fmt" 6 "sync" 7 "time" 8 9 hclog "github.com/hashicorp/go-hclog" 10 "github.com/hashicorp/nomad/client/stats" 11 "github.com/hashicorp/nomad/nomad/structs" 12 ) 13 14 const ( 15 // MB is a constant which converts values in bytes to MB 16 MB = 1024 * 1024 17 ) 18 19 // GCConfig allows changing the behaviour of the garbage collector 20 type GCConfig struct { 21 // MaxAllocs is the maximum number of allocations to track before a GC 22 // is triggered. 23 MaxAllocs int 24 DiskUsageThreshold float64 25 InodeUsageThreshold float64 26 Interval time.Duration 27 ReservedDiskMB int 28 ParallelDestroys int 29 } 30 31 // AllocCounter is used by AllocGarbageCollector to discover how many un-GC'd 32 // allocations a client has and is generally fulfilled by the Client. 33 type AllocCounter interface { 34 NumAllocs() int 35 } 36 37 // AllocGarbageCollector garbage collects terminated allocations on a node 38 type AllocGarbageCollector struct { 39 config *GCConfig 40 41 // allocRunners marked for GC 42 allocRunners *IndexedGCAllocPQ 43 44 // statsCollector for node based thresholds (eg disk) 45 statsCollector stats.NodeStatsCollector 46 47 // allocCounter return the number of un-GC'd allocs on this node 48 allocCounter AllocCounter 49 50 // destroyCh is a semaphore for rate limiting concurrent garbage 51 // collections 52 destroyCh chan struct{} 53 54 // shutdownCh is closed when the GC's run method should exit 55 shutdownCh chan struct{} 56 57 // triggerCh is ticked by the Trigger method to cause a GC 58 triggerCh chan struct{} 59 60 logger hclog.Logger 61 } 62 63 // NewAllocGarbageCollector returns a garbage collector for terminated 64 // allocations on a node. Must call Run() in a goroutine enable periodic 65 // garbage collection. 66 func NewAllocGarbageCollector(logger hclog.Logger, statsCollector stats.NodeStatsCollector, ac AllocCounter, config *GCConfig) *AllocGarbageCollector { 67 logger = logger.Named("gc") 68 // Require at least 1 to make progress 69 if config.ParallelDestroys <= 0 { 70 logger.Warn("garbage collector defaulting parallelism to 1 due to invalid input value", "gc_parallel_destroys", config.ParallelDestroys) 71 config.ParallelDestroys = 1 72 } 73 74 gc := &AllocGarbageCollector{ 75 allocRunners: NewIndexedGCAllocPQ(), 76 statsCollector: statsCollector, 77 allocCounter: ac, 78 config: config, 79 logger: logger, 80 destroyCh: make(chan struct{}, config.ParallelDestroys), 81 shutdownCh: make(chan struct{}), 82 triggerCh: make(chan struct{}, 1), 83 } 84 85 return gc 86 } 87 88 // Run the periodic garbage collector. 89 func (a *AllocGarbageCollector) Run() { 90 ticker := time.NewTicker(a.config.Interval) 91 for { 92 select { 93 case <-a.triggerCh: 94 case <-ticker.C: 95 case <-a.shutdownCh: 96 ticker.Stop() 97 return 98 } 99 100 if err := a.keepUsageBelowThreshold(); err != nil { 101 a.logger.Error("error garbage collecting allocations", "error", err) 102 } 103 } 104 } 105 106 // Force the garbage collector to run. 107 func (a *AllocGarbageCollector) Trigger() { 108 select { 109 case a.triggerCh <- struct{}{}: 110 default: 111 // already triggered 112 } 113 } 114 115 // keepUsageBelowThreshold collects disk usage information and garbage collects 116 // allocations to make disk space available. 117 func (a *AllocGarbageCollector) keepUsageBelowThreshold() error { 118 for { 119 select { 120 case <-a.shutdownCh: 121 return nil 122 default: 123 } 124 125 // Check if we have enough free space 126 if err := a.statsCollector.Collect(); err != nil { 127 return err 128 } 129 130 // See if we are below thresholds for used disk space and inode usage 131 diskStats := a.statsCollector.Stats().AllocDirStats 132 reason := "" 133 logf := a.logger.Warn 134 135 liveAllocs := a.allocCounter.NumAllocs() 136 137 switch { 138 case diskStats.UsedPercent > a.config.DiskUsageThreshold: 139 reason = fmt.Sprintf("disk usage of %.0f is over gc threshold of %.0f", 140 diskStats.UsedPercent, a.config.DiskUsageThreshold) 141 case diskStats.InodesUsedPercent > a.config.InodeUsageThreshold: 142 reason = fmt.Sprintf("inode usage of %.0f is over gc threshold of %.0f", 143 diskStats.InodesUsedPercent, a.config.InodeUsageThreshold) 144 case liveAllocs > a.config.MaxAllocs: 145 // if we're unable to gc, don't WARN until at least 2x over limit 146 if liveAllocs < (a.config.MaxAllocs * 2) { 147 logf = a.logger.Info 148 } 149 reason = fmt.Sprintf("number of allocations (%d) is over the limit (%d)", liveAllocs, a.config.MaxAllocs) 150 } 151 152 if reason == "" { 153 // No reason to gc, exit 154 break 155 } 156 157 // Collect an allocation 158 gcAlloc := a.allocRunners.Pop() 159 if gcAlloc == nil { 160 logf("garbage collection skipped because no terminal allocations", "reason", reason) 161 break 162 } 163 164 // Destroy the alloc runner and wait until it exits 165 a.destroyAllocRunner(gcAlloc.allocID, gcAlloc.allocRunner, reason) 166 } 167 return nil 168 } 169 170 // destroyAllocRunner is used to destroy an allocation runner. It will acquire a 171 // lock to restrict parallelism and then destroy the alloc runner, returning 172 // once the allocation has been destroyed. 173 func (a *AllocGarbageCollector) destroyAllocRunner(allocID string, ar AllocRunner, reason string) { 174 a.logger.Info("garbage collecting allocation", "alloc_id", allocID, "reason", reason) 175 176 // Acquire the destroy lock 177 select { 178 case <-a.shutdownCh: 179 return 180 case a.destroyCh <- struct{}{}: 181 } 182 183 ar.Destroy() 184 185 select { 186 case <-ar.DestroyCh(): 187 case <-a.shutdownCh: 188 } 189 190 a.logger.Debug("alloc garbage collected", "alloc_id", allocID) 191 192 // Release the lock 193 <-a.destroyCh 194 } 195 196 func (a *AllocGarbageCollector) Stop() { 197 close(a.shutdownCh) 198 } 199 200 // Collect garbage collects a single allocation on a node. Returns true if 201 // alloc was found and garbage collected; otherwise false. 202 func (a *AllocGarbageCollector) Collect(allocID string) bool { 203 gcAlloc := a.allocRunners.Remove(allocID) 204 if gcAlloc == nil { 205 a.logger.Debug("alloc was already garbage collected", "alloc_id", allocID) 206 return false 207 } 208 209 a.destroyAllocRunner(allocID, gcAlloc.allocRunner, "forced collection") 210 return true 211 } 212 213 // CollectAll garbage collects all terminated allocations on a node 214 func (a *AllocGarbageCollector) CollectAll() { 215 for { 216 select { 217 case <-a.shutdownCh: 218 return 219 default: 220 } 221 222 gcAlloc := a.allocRunners.Pop() 223 if gcAlloc == nil { 224 return 225 } 226 227 go a.destroyAllocRunner(gcAlloc.allocID, gcAlloc.allocRunner, "forced full node collection") 228 } 229 } 230 231 // MakeRoomFor garbage collects enough number of allocations in the terminal 232 // state to make room for new allocations 233 func (a *AllocGarbageCollector) MakeRoomFor(allocations []*structs.Allocation) error { 234 if len(allocations) == 0 { 235 // Nothing to make room for! 236 return nil 237 } 238 239 // GC allocs until below the max limit + the new allocations 240 max := a.config.MaxAllocs - len(allocations) 241 for a.allocCounter.NumAllocs() > max { 242 select { 243 case <-a.shutdownCh: 244 return nil 245 default: 246 } 247 248 gcAlloc := a.allocRunners.Pop() 249 if gcAlloc == nil { 250 // It's fine if we can't lower below the limit here as 251 // we'll keep trying to drop below the limit with each 252 // periodic gc 253 break 254 } 255 256 // Destroy the alloc runner and wait until it exits 257 a.destroyAllocRunner(gcAlloc.allocID, gcAlloc.allocRunner, fmt.Sprintf("new allocations and over max (%d)", a.config.MaxAllocs)) 258 } 259 260 totalResource := &structs.AllocatedSharedResources{} 261 for _, alloc := range allocations { 262 // COMPAT(0.11): Remove in 0.11 263 if alloc.AllocatedResources != nil { 264 totalResource.Add(&alloc.AllocatedResources.Shared) 265 } else { 266 totalResource.DiskMB += int64(alloc.Resources.DiskMB) 267 } 268 } 269 270 // If the host has enough free space to accommodate the new allocations then 271 // we don't need to garbage collect terminated allocations 272 if hostStats := a.statsCollector.Stats(); hostStats != nil { 273 var availableForAllocations uint64 274 if hostStats.AllocDirStats.Available < uint64(a.config.ReservedDiskMB*MB) { 275 availableForAllocations = 0 276 } else { 277 availableForAllocations = hostStats.AllocDirStats.Available - uint64(a.config.ReservedDiskMB*MB) 278 } 279 if uint64(totalResource.DiskMB*MB) < availableForAllocations { 280 return nil 281 } 282 } 283 284 var diskCleared int64 285 for { 286 select { 287 case <-a.shutdownCh: 288 return nil 289 default: 290 } 291 292 // Collect host stats and see if we still need to remove older 293 // allocations 294 var allocDirStats *stats.DiskStats 295 if err := a.statsCollector.Collect(); err == nil { 296 if hostStats := a.statsCollector.Stats(); hostStats != nil { 297 allocDirStats = hostStats.AllocDirStats 298 } 299 } 300 301 if allocDirStats != nil { 302 if allocDirStats.Available >= uint64(totalResource.DiskMB*MB) { 303 break 304 } 305 } else { 306 // Falling back to a simpler model to know if we have enough disk 307 // space if stats collection fails 308 if diskCleared >= totalResource.DiskMB { 309 break 310 } 311 } 312 313 gcAlloc := a.allocRunners.Pop() 314 if gcAlloc == nil { 315 break 316 } 317 318 ar := gcAlloc.allocRunner 319 alloc := ar.Alloc() 320 321 // COMPAT(0.11): Remove in 0.11 322 var allocDiskMB int64 323 if alloc.AllocatedResources != nil { 324 allocDiskMB = alloc.AllocatedResources.Shared.DiskMB 325 } else { 326 allocDiskMB = int64(alloc.Resources.DiskMB) 327 } 328 329 // Destroy the alloc runner and wait until it exits 330 a.destroyAllocRunner(gcAlloc.allocID, ar, fmt.Sprintf("freeing %d MB for new allocations", allocDiskMB)) 331 332 diskCleared += allocDiskMB 333 } 334 return nil 335 } 336 337 // MarkForCollection starts tracking an allocation for Garbage Collection 338 func (a *AllocGarbageCollector) MarkForCollection(allocID string, ar AllocRunner) { 339 if a.allocRunners.Push(allocID, ar) { 340 a.logger.Info("marking allocation for GC", "alloc_id", allocID) 341 } 342 } 343 344 // GCAlloc wraps an allocation runner and an index enabling it to be used within 345 // a PQ 346 type GCAlloc struct { 347 timeStamp time.Time 348 allocID string 349 allocRunner AllocRunner 350 index int 351 } 352 353 type GCAllocPQImpl []*GCAlloc 354 355 func (pq GCAllocPQImpl) Len() int { 356 return len(pq) 357 } 358 359 func (pq GCAllocPQImpl) Less(i, j int) bool { 360 return pq[i].timeStamp.Before(pq[j].timeStamp) 361 } 362 363 func (pq GCAllocPQImpl) Swap(i, j int) { 364 pq[i], pq[j] = pq[j], pq[i] 365 pq[i].index = i 366 pq[j].index = j 367 } 368 369 func (pq *GCAllocPQImpl) Push(x interface{}) { 370 n := len(*pq) 371 item := x.(*GCAlloc) 372 item.index = n 373 *pq = append(*pq, item) 374 } 375 376 func (pq *GCAllocPQImpl) Pop() interface{} { 377 old := *pq 378 n := len(old) 379 item := old[n-1] 380 item.index = -1 // for safety 381 *pq = old[0 : n-1] 382 return item 383 } 384 385 // IndexedGCAllocPQ is an indexed PQ which maintains a list of allocation runner 386 // based on their termination time. 387 type IndexedGCAllocPQ struct { 388 index map[string]*GCAlloc 389 heap GCAllocPQImpl 390 391 pqLock sync.Mutex 392 } 393 394 func NewIndexedGCAllocPQ() *IndexedGCAllocPQ { 395 return &IndexedGCAllocPQ{ 396 index: make(map[string]*GCAlloc), 397 heap: make(GCAllocPQImpl, 0), 398 } 399 } 400 401 // Push an alloc runner into the GC queue. Returns true if alloc was added, 402 // false if the alloc already existed. 403 func (i *IndexedGCAllocPQ) Push(allocID string, ar AllocRunner) bool { 404 i.pqLock.Lock() 405 defer i.pqLock.Unlock() 406 407 if _, ok := i.index[allocID]; ok { 408 // No work to do 409 return false 410 } 411 gcAlloc := &GCAlloc{ 412 timeStamp: time.Now(), 413 allocID: allocID, 414 allocRunner: ar, 415 } 416 i.index[allocID] = gcAlloc 417 heap.Push(&i.heap, gcAlloc) 418 return true 419 } 420 421 func (i *IndexedGCAllocPQ) Pop() *GCAlloc { 422 i.pqLock.Lock() 423 defer i.pqLock.Unlock() 424 425 if len(i.heap) == 0 { 426 return nil 427 } 428 429 gcAlloc := heap.Pop(&i.heap).(*GCAlloc) 430 delete(i.index, gcAlloc.allocRunner.Alloc().ID) 431 return gcAlloc 432 } 433 434 // Remove alloc from GC. Returns nil if alloc doesn't exist. 435 func (i *IndexedGCAllocPQ) Remove(allocID string) *GCAlloc { 436 i.pqLock.Lock() 437 defer i.pqLock.Unlock() 438 439 if gcAlloc, ok := i.index[allocID]; ok { 440 heap.Remove(&i.heap, gcAlloc.index) 441 delete(i.index, allocID) 442 return gcAlloc 443 } 444 445 return nil 446 } 447 448 func (i *IndexedGCAllocPQ) Length() int { 449 i.pqLock.Lock() 450 defer i.pqLock.Unlock() 451 452 return len(i.heap) 453 }