github.com/hhrutter/nomad@v0.6.0-rc2.0.20170723054333-80c4b03f0705/client/gc.go

package client

import (
	"container/heap"
	"fmt"
	"log"
	"sync"
	"time"

	"github.com/hashicorp/nomad/client/stats"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// MB is a constant which converts values in bytes to MB
	MB = 1024 * 1024
)

// GCConfig allows changing the behaviour of the garbage collector
type GCConfig struct {
	// MaxAllocs is the maximum number of allocations to track before a GC
	// is triggered.
	MaxAllocs int

	// DiskUsageThreshold is the disk usage percentage above which
	// allocations are garbage collected.
	DiskUsageThreshold float64

	// InodeUsageThreshold is the inode usage percentage above which
	// allocations are garbage collected.
	InodeUsageThreshold float64

	// Interval is how often the periodic garbage collector runs.
	Interval time.Duration

	// ReservedDiskMB is the amount of disk space, in MB, that is reserved
	// for the host and not counted as available to new allocations.
	ReservedDiskMB int

	// ParallelDestroys is the maximum number of allocation runners that
	// may be destroyed concurrently.
	ParallelDestroys int
}

// AllocCounter is used by AllocGarbageCollector to discover how many
// allocations a node has and is generally fulfilled by the Client.
type AllocCounter interface {
	NumAllocs() int
}

// AllocGarbageCollector garbage collects terminated allocations on a node
type AllocGarbageCollector struct {
	allocRunners   *IndexedGCAllocPQ
	statsCollector stats.NodeStatsCollector
	allocCounter   AllocCounter
	config         *GCConfig
	logger         *log.Logger
	destroyCh      chan struct{}
	shutdownCh     chan struct{}
}

// NewAllocGarbageCollector returns a garbage collector for terminated
// allocations on a node. Run must be called in a goroutine to enable
// periodic garbage collection.
func NewAllocGarbageCollector(logger *log.Logger, statsCollector stats.NodeStatsCollector, ac AllocCounter, config *GCConfig) *AllocGarbageCollector {
	// Require at least 1 to make progress
	if config.ParallelDestroys <= 0 {
		logger.Printf("[WARN] client: garbage collector defaulting parallelism to 1 due to invalid input value of %d", config.ParallelDestroys)
		config.ParallelDestroys = 1
	}

	gc := &AllocGarbageCollector{
		allocRunners:   NewIndexedGCAllocPQ(),
		statsCollector: statsCollector,
		allocCounter:   ac,
		config:         config,
		logger:         logger,
		destroyCh:      make(chan struct{}, config.ParallelDestroys),
		shutdownCh:     make(chan struct{}),
	}

	return gc
}

// Run the periodic garbage collector.
func (a *AllocGarbageCollector) Run() {
	ticker := time.NewTicker(a.config.Interval)
	for {
		select {
		case <-ticker.C:
			if err := a.keepUsageBelowThreshold(); err != nil {
				a.logger.Printf("[ERR] client: error garbage collecting allocation: %v", err)
			}
		case <-a.shutdownCh:
			ticker.Stop()
			return
		}
	}
}
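
// A minimal wiring sketch (illustrative, not part of the original file)
// showing how the collector is constructed and started. The logger,
// statsCollector, and client values are assumed to exist elsewhere, and the
// threshold numbers are placeholders:
//
//	gcConfig := &GCConfig{
//		MaxAllocs:           50,
//		DiskUsageThreshold:  80, // percent
//		InodeUsageThreshold: 70, // percent
//		Interval:            time.Minute,
//		ReservedDiskMB:      1024,
//		ParallelDestroys:    2,
//	}
//	gc := NewAllocGarbageCollector(logger, statsCollector, client, gcConfig)
//	go gc.Run() // periodic GC until Stop is called
//	defer gc.Stop()
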
// keepUsageBelowThreshold collects disk usage information and garbage
// collects allocations to make disk space available.
func (a *AllocGarbageCollector) keepUsageBelowThreshold() error {
	for {
		select {
		case <-a.shutdownCh:
			return nil
		default:
		}

		// Check if we have enough free space
		err := a.statsCollector.Collect()
		if err != nil {
			return err
		}

		// See if we are below thresholds for used disk space and inode usage
		// TODO(diptanu) figure out why this is nil
		stats := a.statsCollector.Stats()
		if stats == nil {
			break
		}

		diskStats := stats.AllocDirStats
		if diskStats == nil {
			break
		}

		reason := ""

		switch {
		case diskStats.UsedPercent > a.config.DiskUsageThreshold:
			reason = fmt.Sprintf("disk usage of %.0f is over gc threshold of %.0f",
				diskStats.UsedPercent, a.config.DiskUsageThreshold)
		case diskStats.InodesUsedPercent > a.config.InodeUsageThreshold:
			reason = fmt.Sprintf("inode usage of %.0f is over gc threshold of %.0f",
				diskStats.InodesUsedPercent, a.config.InodeUsageThreshold)
		case a.numAllocs() > a.config.MaxAllocs:
			reason = fmt.Sprintf("number of allocations is over the limit (%d)", a.config.MaxAllocs)
		}

		// No reason to gc, exit
		if reason == "" {
			break
		}

		// Collect an allocation
		gcAlloc := a.allocRunners.Pop()
		if gcAlloc == nil {
			a.logger.Printf("[WARN] client: garbage collection due to %s skipped because no terminal allocations", reason)
			break
		}

		// Destroy the alloc runner and wait until it exits
		a.destroyAllocRunner(gcAlloc.allocRunner, reason)
	}
	return nil
}
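
// destroyCh above is used as a counting semaphore: a send acquires one of
// the config.ParallelDestroys slots and a receive releases it. The same
// pattern in isolation (a sketch, assuming some doWork function exists):
//
//	sem := make(chan struct{}, n) // at most n concurrent workers
//	sem <- struct{}{}             // acquire a slot (blocks when full)
//	doWork()
//	<-sem                         // release the slot
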
// destroyAllocRunner is used to destroy an allocation runner. It will acquire
// a lock to restrict parallelism and then destroy the alloc runner, returning
// once the allocation has been destroyed.
func (a *AllocGarbageCollector) destroyAllocRunner(ar *AllocRunner, reason string) {
	id := "<nil>"
	if alloc := ar.Alloc(); alloc != nil {
		id = alloc.ID
	}
	a.logger.Printf("[INFO] client: garbage collecting allocation %s due to %s", id, reason)

	// Acquire the destroy lock
	select {
	case <-a.shutdownCh:
		return
	case a.destroyCh <- struct{}{}:
	}

	ar.Destroy()

	select {
	case <-ar.WaitCh():
	case <-a.shutdownCh:
	}

	a.logger.Printf("[DEBUG] client: garbage collected %q", id)

	// Release the lock
	<-a.destroyCh
}

// Stop stops the garbage collector, halting the periodic loop and unblocking
// any in-flight destroys.
func (a *AllocGarbageCollector) Stop() {
	close(a.shutdownCh)
}

// Collect garbage collects a single allocation on a node
func (a *AllocGarbageCollector) Collect(allocID string) error {
	gcAlloc, err := a.allocRunners.Remove(allocID)
	if err != nil {
		return fmt.Errorf("unable to collect allocation %q: %v", allocID, err)
	}
	a.destroyAllocRunner(gcAlloc.allocRunner, "forced collection")
	return nil
}

// CollectAll garbage collects all terminated allocations on a node
func (a *AllocGarbageCollector) CollectAll() error {
	for {
		select {
		case <-a.shutdownCh:
			return nil
		default:
		}

		gcAlloc := a.allocRunners.Pop()
		if gcAlloc == nil {
			break
		}

		go a.destroyAllocRunner(gcAlloc.allocRunner, "forced full collection")
	}
	return nil
}
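
// Forced collection is driven from outside the periodic loop, for example by
// an operator request. A usage sketch (illustrative; the gc and allocID
// values are assumed to exist):
//
//	if err := gc.Collect(allocID); err != nil {
//		log.Printf("[ERR] client: %v", err)
//	}
//	// or reclaim every terminal allocation at once:
//	_ = gc.CollectAll()
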
// MakeRoomFor garbage collects enough allocations in the terminal state to
// make room for new allocations
func (a *AllocGarbageCollector) MakeRoomFor(allocations []*structs.Allocation) error {
	// GC allocs until below the max limit + the new allocations
	max := a.config.MaxAllocs - len(allocations)
	for a.numAllocs() > max {
		select {
		case <-a.shutdownCh:
			return nil
		default:
		}

		gcAlloc := a.allocRunners.Pop()
		if gcAlloc == nil {
			// It's fine if we can't lower below the limit here as
			// we'll keep trying to drop below the limit with each
			// periodic gc
			break
		}

		// Destroy the alloc runner and wait until it exits
		a.destroyAllocRunner(gcAlloc.allocRunner, "new allocations")
	}

	totalResource := &structs.Resources{}
	for _, alloc := range allocations {
		if err := totalResource.Add(alloc.Resources); err != nil {
			return err
		}
	}

	// If the host has enough free space to accommodate the new allocations
	// then we don't need to garbage collect terminated allocations
	if hostStats := a.statsCollector.Stats(); hostStats != nil {
		var availableForAllocations uint64
		if hostStats.AllocDirStats.Available < uint64(a.config.ReservedDiskMB*MB) {
			availableForAllocations = 0
		} else {
			availableForAllocations = hostStats.AllocDirStats.Available - uint64(a.config.ReservedDiskMB*MB)
		}
		if uint64(totalResource.DiskMB*MB) < availableForAllocations {
			return nil
		}
	}

	var diskCleared int
	for {
		select {
		case <-a.shutdownCh:
			return nil
		default:
		}

		// Collect host stats and see if we still need to remove older
		// allocations
		var allocDirStats *stats.DiskStats
		if err := a.statsCollector.Collect(); err == nil {
			if hostStats := a.statsCollector.Stats(); hostStats != nil {
				allocDirStats = hostStats.AllocDirStats
			}
		}

		if allocDirStats != nil {
			if allocDirStats.Available >= uint64(totalResource.DiskMB*MB) {
				break
			}
		} else {
			// Fall back to a simpler model for deciding whether enough
			// disk space has been freed when stats collection fails
			if diskCleared >= totalResource.DiskMB {
				break
			}
		}

		gcAlloc := a.allocRunners.Pop()
		if gcAlloc == nil {
			break
		}

		ar := gcAlloc.allocRunner
		alloc := ar.Alloc()

		// Destroy the alloc runner and wait until it exits
		a.destroyAllocRunner(ar, fmt.Sprintf("freeing %d MB for new allocations", alloc.Resources.DiskMB))

		// Track the disk space freed for the fallback model above
		diskCleared += alloc.Resources.DiskMB
	}
	return nil
}

// MarkForCollection starts tracking an allocation for Garbage Collection
func (a *AllocGarbageCollector) MarkForCollection(ar *AllocRunner) error {
	if ar == nil {
		return fmt.Errorf("nil allocation runner inserted for garbage collection")
	}
	if ar.Alloc() == nil {
		a.destroyAllocRunner(ar, "alloc is nil")
		return fmt.Errorf("nil allocation in alloc runner inserted for garbage collection")
	}

	a.logger.Printf("[INFO] client: marking allocation %v for GC", ar.Alloc().ID)
	return a.allocRunners.Push(ar)
}

// Remove removes an alloc runner without garbage collecting it
func (a *AllocGarbageCollector) Remove(ar *AllocRunner) {
	if ar == nil || ar.Alloc() == nil {
		return
	}

	alloc := ar.Alloc()
	if _, err := a.allocRunners.Remove(alloc.ID); err == nil {
		a.logger.Printf("[INFO] client: removed alloc runner %v from garbage collector", alloc.ID)
	}
}

// numAllocs returns the total number of allocs tracked by the client as well
// as those marked for GC.
func (a *AllocGarbageCollector) numAllocs() int {
	return a.allocRunners.Length() + a.allocCounter.NumAllocs()
}

// GCAlloc wraps an allocation runner and an index enabling it to be used
// within a PQ
type GCAlloc struct {
	timeStamp   time.Time
	allocRunner *AllocRunner
	index       int
}

// GCAllocPQImpl implements heap.Interface, ordering GCAllocs by the time they
// were marked for collection (oldest first).
type GCAllocPQImpl []*GCAlloc

func (pq GCAllocPQImpl) Len() int {
	return len(pq)
}

func (pq GCAllocPQImpl) Less(i, j int) bool {
	return pq[i].timeStamp.Before(pq[j].timeStamp)
}

func (pq GCAllocPQImpl) Swap(i, j int) {
	pq[i], pq[j] = pq[j], pq[i]
	pq[i].index = i
	pq[j].index = j
}

func (pq *GCAllocPQImpl) Push(x interface{}) {
	n := len(*pq)
	item := x.(*GCAlloc)
	item.index = n
	*pq = append(*pq, item)
}

func (pq *GCAllocPQImpl) Pop() interface{} {
	old := *pq
	n := len(old)
	item := old[n-1]
	item.index = -1 // for safety
	*pq = old[0 : n-1]
	return item
}
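
// GCAllocPQImpl only supplies the heap.Interface methods; callers are
// expected to go through the container/heap package, which maintains the
// oldest-first ordering. A minimal illustration (not part of the original
// file; the pushed GCAlloc has no alloc runner attached):
//
//	var pq GCAllocPQImpl
//	heap.Init(&pq)
//	heap.Push(&pq, &GCAlloc{timeStamp: time.Now()})
//	oldest := heap.Pop(&pq).(*GCAlloc) // smallest timeStamp comes out first
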
// IndexedGCAllocPQ is an indexed PQ which maintains a list of allocation
// runners based on their termination time.
type IndexedGCAllocPQ struct {
	index map[string]*GCAlloc
	heap  GCAllocPQImpl

	pqLock sync.Mutex
}

// NewIndexedGCAllocPQ returns an empty indexed priority queue.
func NewIndexedGCAllocPQ() *IndexedGCAllocPQ {
	return &IndexedGCAllocPQ{
		index: make(map[string]*GCAlloc),
		heap:  make(GCAllocPQImpl, 0),
	}
}

// Push adds an alloc runner to the queue, timestamped now. Pushing an
// allocation that is already tracked is a no-op.
func (i *IndexedGCAllocPQ) Push(ar *AllocRunner) error {
	i.pqLock.Lock()
	defer i.pqLock.Unlock()

	alloc := ar.Alloc()
	if _, ok := i.index[alloc.ID]; ok {
		// No work to do
		return nil
	}
	gcAlloc := &GCAlloc{
		timeStamp:   time.Now(),
		allocRunner: ar,
	}
	i.index[alloc.ID] = gcAlloc
	heap.Push(&i.heap, gcAlloc)
	return nil
}

// Pop removes and returns the oldest tracked alloc runner, or nil if the
// queue is empty.
func (i *IndexedGCAllocPQ) Pop() *GCAlloc {
	i.pqLock.Lock()
	defer i.pqLock.Unlock()

	if len(i.heap) == 0 {
		return nil
	}

	gcAlloc := heap.Pop(&i.heap).(*GCAlloc)
	delete(i.index, gcAlloc.allocRunner.Alloc().ID)
	return gcAlloc
}

// Remove removes the alloc runner with the given allocation ID from the
// queue, returning an error if it is not tracked.
func (i *IndexedGCAllocPQ) Remove(allocID string) (*GCAlloc, error) {
	i.pqLock.Lock()
	defer i.pqLock.Unlock()

	if gcAlloc, ok := i.index[allocID]; ok {
		heap.Remove(&i.heap, gcAlloc.index)
		delete(i.index, allocID)
		return gcAlloc, nil
	}

	return nil, fmt.Errorf("alloc %q not present", allocID)
}

// Length returns the number of tracked allocations.
func (i *IndexedGCAllocPQ) Length() int {
	i.pqLock.Lock()
	defer i.pqLock.Unlock()

	return len(i.heap)
}
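
// Usage sketch for the indexed queue (illustrative, not part of the original
// file; ar is assumed to be a terminal *AllocRunner):
//
//	pq := NewIndexedGCAllocPQ()
//	_ = pq.Push(ar) // duplicate allocation IDs are ignored
//	if gcAlloc := pq.Pop(); gcAlloc != nil {
//		// gcAlloc.allocRunner is the runner tracked the longest
//	}
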