github.com/taylorchu/nomad@v0.5.3-rc1.0.20170407200202-db11e7dd7b55/client/gc.go

package client

import (
	"container/heap"
	"fmt"
	"log"
	"sync"
	"time"

	"github.com/hashicorp/nomad/client/stats"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// MB is the number of bytes in one megabyte, used to convert MB values
	// to bytes
	MB = 1024 * 1024
)

// GCConfig allows changing the behaviour of the garbage collector
type GCConfig struct {
	DiskUsageThreshold  float64
	InodeUsageThreshold float64
	Interval            time.Duration
	ReservedDiskMB      int
	ParallelDestroys    int
}

// AllocGarbageCollector garbage collects terminated allocations on a node
type AllocGarbageCollector struct {
	allocRunners   *IndexedGCAllocPQ
	statsCollector stats.NodeStatsCollector
	config         *GCConfig
	logger         *log.Logger
	destroyCh      chan struct{}
	shutdownCh     chan struct{}
}

// NewAllocGarbageCollector returns a garbage collector for terminated
// allocations on a node.
func NewAllocGarbageCollector(logger *log.Logger, statsCollector stats.NodeStatsCollector, config *GCConfig) *AllocGarbageCollector {
	// Require at least 1 to make progress
	if config.ParallelDestroys <= 0 {
		logger.Printf("[WARN] client: garbage collector defaulting parallelism to 1 due to invalid input value of %d", config.ParallelDestroys)
		config.ParallelDestroys = 1
	}

	gc := &AllocGarbageCollector{
		allocRunners:   NewIndexedGCAllocPQ(),
		statsCollector: statsCollector,
		config:         config,
		logger:         logger,
		destroyCh:      make(chan struct{}, config.ParallelDestroys),
		shutdownCh:     make(chan struct{}),
	}

	go gc.run()
	return gc
}
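// Illustrative construction of the collector (a sketch, not code from this
// package). The values below are assumptions chosen for the example, and
// logger/statsCollector are assumed to come from the owning client:
//
//	gc := NewAllocGarbageCollector(logger, statsCollector, &GCConfig{
//		DiskUsageThreshold:  80,          // GC when the alloc dir is >80% full
//		InodeUsageThreshold: 70,          // or >70% of its inodes are used
//		Interval:            time.Minute, // how often run() checks usage
//		ReservedDiskMB:      200,         // disk never counted as available
//		ParallelDestroys:    2,           // at most two concurrent destroys
//	})
//	defer gc.Stop()
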
func (a *AllocGarbageCollector) run() {
	ticker := time.NewTicker(a.config.Interval)
	for {
		select {
		case <-ticker.C:
			if err := a.keepUsageBelowThreshold(); err != nil {
				a.logger.Printf("[ERR] client: error garbage collecting allocation: %v", err)
			}
		case <-a.shutdownCh:
			ticker.Stop()
			return
		}
	}
}

// keepUsageBelowThreshold collects disk usage information and garbage collects
// allocations to make disk space available.
func (a *AllocGarbageCollector) keepUsageBelowThreshold() error {
	for {
		select {
		case <-a.shutdownCh:
			return nil
		default:
		}

		// Check if we have enough free space
		err := a.statsCollector.Collect()
		if err != nil {
			return err
		}

		// See if we are below thresholds for used disk space and inode usage
		// TODO(diptanu) figure out why this is nil
		stats := a.statsCollector.Stats()
		if stats == nil {
			break
		}

		diskStats := stats.AllocDirStats
		if diskStats == nil {
			break
		}

		if diskStats.UsedPercent <= a.config.DiskUsageThreshold &&
			diskStats.InodesUsedPercent <= a.config.InodeUsageThreshold {
			break
		}

		// Collect an allocation
		gcAlloc := a.allocRunners.Pop()
		if gcAlloc == nil {
			break
		}

		ar := gcAlloc.allocRunner
		alloc := ar.Alloc()
		a.logger.Printf("[INFO] client: garbage collecting allocation %v", alloc.ID)

		// Destroy the alloc runner and wait until it exits
		a.destroyAllocRunner(ar)
	}
	return nil
}

// destroyAllocRunner is used to destroy an allocation runner. It will acquire a
// lock to restrict parallelism and then destroy the alloc runner, returning
// once the allocation has been destroyed.
func (a *AllocGarbageCollector) destroyAllocRunner(ar *AllocRunner) {
	// Acquire the destroy lock
	select {
	case <-a.shutdownCh:
		return
	case a.destroyCh <- struct{}{}:
	}

	ar.Destroy()

	select {
	case <-ar.WaitCh():
	case <-a.shutdownCh:
	}

	a.logger.Printf("[DEBUG] client: garbage collected %q", ar.Alloc().ID)

	// Release the lock
	<-a.destroyCh
}

// Stop stops the garbage collector's run loop.
func (a *AllocGarbageCollector) Stop() {
	close(a.shutdownCh)
}

// Collect garbage collects a single allocation on a node
func (a *AllocGarbageCollector) Collect(allocID string) error {
	gcAlloc, err := a.allocRunners.Remove(allocID)
	if err != nil {
		return fmt.Errorf("unable to collect allocation %q: %v", allocID, err)
	}

	ar := gcAlloc.allocRunner
	a.logger.Printf("[INFO] client: garbage collecting allocation %q", ar.Alloc().ID)

	a.destroyAllocRunner(ar)
	return nil
}

// CollectAll garbage collects all terminated allocations on a node
func (a *AllocGarbageCollector) CollectAll() error {
	for {
		select {
		case <-a.shutdownCh:
			return nil
		default:
		}

		gcAlloc := a.allocRunners.Pop()
		if gcAlloc == nil {
			break
		}

		ar := gcAlloc.allocRunner
		a.logger.Printf("[INFO] client: garbage collecting alloc runner for alloc %q", ar.Alloc().ID)
		go a.destroyAllocRunner(ar)
	}
	return nil
}

// MakeRoomFor garbage collects enough terminal-state allocations to make room
// for the new allocations
func (a *AllocGarbageCollector) MakeRoomFor(allocations []*structs.Allocation) error {
	totalResource := &structs.Resources{}
	for _, alloc := range allocations {
		if err := totalResource.Add(alloc.Resources); err != nil {
			return err
		}
	}

	// If the host has enough free space to accommodate the new allocations then
	// we don't need to garbage collect terminated allocations
	if hostStats := a.statsCollector.Stats(); hostStats != nil {
		var availableForAllocations uint64
		if hostStats.AllocDirStats.Available < uint64(a.config.ReservedDiskMB*MB) {
			availableForAllocations = 0
		} else {
			availableForAllocations = hostStats.AllocDirStats.Available - uint64(a.config.ReservedDiskMB*MB)
		}
		if uint64(totalResource.DiskMB*MB) < availableForAllocations {
			return nil
		}
	}

	var diskCleared int
	for {
		select {
		case <-a.shutdownCh:
			return nil
		default:
		}

		// Collect host stats and see if we still need to remove older
		// allocations
		var allocDirStats *stats.DiskStats
		if err := a.statsCollector.Collect(); err == nil {
			if hostStats := a.statsCollector.Stats(); hostStats != nil {
				allocDirStats = hostStats.AllocDirStats
			}
		}

		if allocDirStats != nil {
			if allocDirStats.Available >= uint64(totalResource.DiskMB*MB) {
				break
			}
		} else {
			// Fall back to a simpler model if stats collection fails: assume
			// each destroyed allocation freed the disk it had reserved
			if diskCleared >= totalResource.DiskMB {
				break
			}
		}

		gcAlloc := a.allocRunners.Pop()
		if gcAlloc == nil {
			break
		}

		ar := gcAlloc.allocRunner
		alloc := ar.Alloc()
		a.logger.Printf("[INFO] client: garbage collecting allocation %v", alloc.ID)

		// Destroy the alloc runner and wait until it exits
		a.destroyAllocRunner(ar)

		// Track the disk space this allocation had reserved
		diskCleared += alloc.Resources.DiskMB
	}
	return nil
}
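// Worked example of the reservation arithmetic above (all sizes are
// assumptions for illustration): with ReservedDiskMB = 200 and 1024 MB free
// in the alloc dir, only 1024 MB - 200 MB = 824 MB counts as available, so
// placing allocations requesting a combined 900 MB of disk triggers GC of
// terminal allocations until enough space is free or the PQ is empty. A
// hypothetical call site:
//
//	// newAllocs are the allocations about to be placed on this node.
//	if err := gc.MakeRoomFor(newAllocs); err != nil {
//		return err
//	}
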
// MarkForCollection starts tracking an allocation for Garbage Collection
func (a *AllocGarbageCollector) MarkForCollection(ar *AllocRunner) error {
	if ar == nil {
		return fmt.Errorf("nil allocation runner inserted for garbage collection")
	}
	if ar.Alloc() == nil {
		a.logger.Printf("[INFO] client: alloc is nil, so garbage collecting")
		a.destroyAllocRunner(ar)
		// Return early: without an alloc there is no ID to track below
		return nil
	}

	a.logger.Printf("[INFO] client: marking allocation %v for GC", ar.Alloc().ID)
	return a.allocRunners.Push(ar)
}

// Remove removes an alloc runner without garbage collecting it
func (a *AllocGarbageCollector) Remove(ar *AllocRunner) {
	if ar == nil || ar.Alloc() == nil {
		return
	}

	alloc := ar.Alloc()
	if _, err := a.allocRunners.Remove(alloc.ID); err == nil {
		a.logger.Printf("[INFO] client: removed alloc runner %v from garbage collector", alloc.ID)
	}
}

// GCAlloc wraps an allocation runner and an index enabling it to be used within
// a PQ
type GCAlloc struct {
	timeStamp   time.Time
	allocRunner *AllocRunner
	index       int
}

type GCAllocPQImpl []*GCAlloc

func (pq GCAllocPQImpl) Len() int {
	return len(pq)
}

func (pq GCAllocPQImpl) Less(i, j int) bool {
	return pq[i].timeStamp.Before(pq[j].timeStamp)
}

func (pq GCAllocPQImpl) Swap(i, j int) {
	pq[i], pq[j] = pq[j], pq[i]
	pq[i].index = i
	pq[j].index = j
}

func (pq *GCAllocPQImpl) Push(x interface{}) {
	n := len(*pq)
	item := x.(*GCAlloc)
	item.index = n
	*pq = append(*pq, item)
}

func (pq *GCAllocPQImpl) Pop() interface{} {
	old := *pq
	n := len(old)
	item := old[n-1]
	item.index = -1 // for safety
	*pq = old[0 : n-1]
	return item
}

// IndexedGCAllocPQ is an indexed PQ which maintains allocation runners ordered
// by the time they were marked for collection (their termination time).
type IndexedGCAllocPQ struct {
	index map[string]*GCAlloc
	heap  GCAllocPQImpl

	pqLock sync.Mutex
}

func NewIndexedGCAllocPQ() *IndexedGCAllocPQ {
	return &IndexedGCAllocPQ{
		index: make(map[string]*GCAlloc),
		heap:  make(GCAllocPQImpl, 0),
	}
}

// Push adds an alloc runner to the PQ, keyed by alloc ID; pushing an ID that
// is already tracked is a no-op.
func (i *IndexedGCAllocPQ) Push(ar *AllocRunner) error {
	i.pqLock.Lock()
	defer i.pqLock.Unlock()

	alloc := ar.Alloc()
	if _, ok := i.index[alloc.ID]; ok {
		// No work to do
		return nil
	}
	gcAlloc := &GCAlloc{
		timeStamp:   time.Now(),
		allocRunner: ar,
	}
	i.index[alloc.ID] = gcAlloc
	heap.Push(&i.heap, gcAlloc)
	return nil
}

// Pop removes and returns the entry with the oldest timestamp, or nil if the
// PQ is empty.
func (i *IndexedGCAllocPQ) Pop() *GCAlloc {
	i.pqLock.Lock()
	defer i.pqLock.Unlock()

	if len(i.heap) == 0 {
		return nil
	}

	gcAlloc := heap.Pop(&i.heap).(*GCAlloc)
	delete(i.index, gcAlloc.allocRunner.Alloc().ID)
	return gcAlloc
}

// Remove removes the entry with the given alloc ID regardless of its position
// in the PQ.
func (i *IndexedGCAllocPQ) Remove(allocID string) (*GCAlloc, error) {
	i.pqLock.Lock()
	defer i.pqLock.Unlock()

	if gcAlloc, ok := i.index[allocID]; ok {
		heap.Remove(&i.heap, gcAlloc.index)
		delete(i.index, allocID)
		return gcAlloc, nil
	}

	return nil, fmt.Errorf("alloc %q not present", allocID)
}

// Length returns the number of alloc runners waiting for collection.
func (i *IndexedGCAllocPQ) Length() int {
	i.pqLock.Lock()
	defer i.pqLock.Unlock()

	return len(i.heap)
}
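
// Illustrative use of the indexed PQ (a sketch; ar1 and ar2 stand for
// *AllocRunner values marked for collection in that order):
//
//	pq := NewIndexedGCAllocPQ()
//	pq.Push(ar1)       // indexed by alloc ID and timestamped now
//	pq.Push(ar2)       // timestamped later, so it sorts after ar1
//	pq.Push(ar1)       // no-op: ar1's alloc ID is already indexed
//	oldest := pq.Pop() // the GCAlloc wrapping ar1, the earliest push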