github.com/taylorchu/nomad@v0.5.3-rc1.0.20170407200202-db11e7dd7b55/client/gc.go (about)

     1  package client
     2  
     3  import (
     4  	"container/heap"
     5  	"fmt"
     6  	"log"
     7  	"sync"
     8  	"time"
     9  
    10  	"github.com/hashicorp/nomad/client/stats"
    11  	"github.com/hashicorp/nomad/nomad/structs"
    12  )
    13  
    14  const (
    15  	// MB is a constant which converts values in bytes to MB
    16  	MB = 1024 * 1024
    17  )
    18  
// GCConfig allows changing the behaviour of the garbage collector
type GCConfig struct {
	// DiskUsageThreshold is compared against the alloc dir's UsedPercent;
	// the periodic collector keeps destroying terminated allocations while
	// usage is above this value.
	DiskUsageThreshold  float64
	// InodeUsageThreshold is compared against the alloc dir's
	// InodesUsedPercent in the same way as DiskUsageThreshold.
	InodeUsageThreshold float64
	// Interval is the period of the ticker that drives the periodic
	// usage check.
	Interval            time.Duration
	// ReservedDiskMB is the amount of disk, in MB, subtracted from the
	// available space before deciding whether new allocations fit.
	ReservedDiskMB      int
	// ParallelDestroys bounds how many alloc runners may be destroyed at the
	// same time. Values <= 0 are replaced with 1 by NewAllocGarbageCollector.
	ParallelDestroys    int
}
    27  
// AllocGarbageCollector garbage collects terminated allocations on a node
type AllocGarbageCollector struct {
	// allocRunners holds the alloc runners that have been marked for
	// collection, ordered by the time they were marked.
	allocRunners   *IndexedGCAllocPQ
	// statsCollector supplies the host disk statistics used to decide
	// whether collection is needed.
	statsCollector stats.NodeStatsCollector
	// config holds the thresholds, interval and parallelism settings.
	config         *GCConfig
	logger         *log.Logger
	// destroyCh is a buffered channel used as a semaphore: a value is sent
	// before destroying an alloc runner and received afterwards, so its
	// capacity (config.ParallelDestroys) bounds concurrent destroys.
	destroyCh      chan struct{}
	// shutdownCh is closed by Stop to terminate the run loop and abort any
	// in-progress collection.
	shutdownCh     chan struct{}
}
    37  
    38  // NewAllocGarbageCollector returns a garbage collector for terminated
    39  // allocations on a node.
    40  func NewAllocGarbageCollector(logger *log.Logger, statsCollector stats.NodeStatsCollector, config *GCConfig) *AllocGarbageCollector {
    41  	// Require at least 1 to make progress
    42  	if config.ParallelDestroys <= 0 {
    43  		logger.Printf("[WARN] client: garbage collector defaulting parallism to 1 due to invalid input value of %d", config.ParallelDestroys)
    44  		config.ParallelDestroys = 1
    45  	}
    46  
    47  	gc := &AllocGarbageCollector{
    48  		allocRunners:   NewIndexedGCAllocPQ(),
    49  		statsCollector: statsCollector,
    50  		config:         config,
    51  		logger:         logger,
    52  		destroyCh:      make(chan struct{}, config.ParallelDestroys),
    53  		shutdownCh:     make(chan struct{}),
    54  	}
    55  
    56  	go gc.run()
    57  	return gc
    58  }
    59  
    60  func (a *AllocGarbageCollector) run() {
    61  	ticker := time.NewTicker(a.config.Interval)
    62  	for {
    63  		select {
    64  		case <-ticker.C:
    65  			if err := a.keepUsageBelowThreshold(); err != nil {
    66  				a.logger.Printf("[ERR] client: error garbage collecting allocation: %v", err)
    67  			}
    68  		case <-a.shutdownCh:
    69  			ticker.Stop()
    70  			return
    71  		}
    72  	}
    73  }
    74  
    75  // keepUsageBelowThreshold collects disk usage information and garbage collects
    76  // allocations to make disk space available.
    77  func (a *AllocGarbageCollector) keepUsageBelowThreshold() error {
    78  	for {
    79  		select {
    80  		case <-a.shutdownCh:
    81  			return nil
    82  		default:
    83  		}
    84  
    85  		// Check if we have enough free space
    86  		err := a.statsCollector.Collect()
    87  		if err != nil {
    88  			return err
    89  		}
    90  
    91  		// See if we are below thresholds for used disk space and inode usage
    92  		// TODO(diptanu) figure out why this is nil
    93  		stats := a.statsCollector.Stats()
    94  		if stats == nil {
    95  			break
    96  		}
    97  
    98  		diskStats := stats.AllocDirStats
    99  		if diskStats == nil {
   100  			break
   101  		}
   102  
   103  		if diskStats.UsedPercent <= a.config.DiskUsageThreshold &&
   104  			diskStats.InodesUsedPercent <= a.config.InodeUsageThreshold {
   105  			break
   106  		}
   107  
   108  		// Collect an allocation
   109  		gcAlloc := a.allocRunners.Pop()
   110  		if gcAlloc == nil {
   111  			break
   112  		}
   113  
   114  		ar := gcAlloc.allocRunner
   115  		alloc := ar.Alloc()
   116  		a.logger.Printf("[INFO] client: garbage collecting allocation %v", alloc.ID)
   117  
   118  		// Destroy the alloc runner and wait until it exits
   119  		a.destroyAllocRunner(ar)
   120  	}
   121  	return nil
   122  }
   123  
   124  // destroyAllocRunner is used to destroy an allocation runner. It will acquire a
   125  // lock to restrict parallelism and then destroy the alloc runner, returning
   126  // once the allocation has been destroyed.
   127  func (a *AllocGarbageCollector) destroyAllocRunner(ar *AllocRunner) {
   128  	// Acquire the destroy lock
   129  	select {
   130  	case <-a.shutdownCh:
   131  		return
   132  	case a.destroyCh <- struct{}{}:
   133  	}
   134  
   135  	ar.Destroy()
   136  
   137  	select {
   138  	case <-ar.WaitCh():
   139  	case <-a.shutdownCh:
   140  	}
   141  
   142  	a.logger.Printf("[DEBUG] client: garbage collected %q", ar.Alloc().ID)
   143  
   144  	// Release the lock
   145  	<-a.destroyCh
   146  }
   147  
// Stop shuts the garbage collector down by closing shutdownCh, which ends the
// run loop and makes in-progress collections return early. It must be called
// at most once: closing an already-closed channel panics.
func (a *AllocGarbageCollector) Stop() {
	close(a.shutdownCh)
}
   151  
   152  // Collect garbage collects a single allocation on a node
   153  func (a *AllocGarbageCollector) Collect(allocID string) error {
   154  	gcAlloc, err := a.allocRunners.Remove(allocID)
   155  	if err != nil {
   156  		return fmt.Errorf("unable to collect allocation %q: %v", allocID, err)
   157  	}
   158  
   159  	ar := gcAlloc.allocRunner
   160  	a.logger.Printf("[INFO] client: garbage collecting allocation %q", ar.Alloc().ID)
   161  
   162  	a.destroyAllocRunner(ar)
   163  	return nil
   164  }
   165  
   166  // CollectAll garbage collects all termianated allocations on a node
   167  func (a *AllocGarbageCollector) CollectAll() error {
   168  	for {
   169  		select {
   170  		case <-a.shutdownCh:
   171  			return nil
   172  		default:
   173  		}
   174  
   175  		gcAlloc := a.allocRunners.Pop()
   176  		if gcAlloc == nil {
   177  			break
   178  		}
   179  
   180  		ar := gcAlloc.allocRunner
   181  		a.logger.Printf("[INFO] client: garbage collecting alloc runner for alloc %q", ar.Alloc().ID)
   182  		go a.destroyAllocRunner(ar)
   183  	}
   184  	return nil
   185  }
   186  
   187  // MakeRoomFor garbage collects enough number of allocations in the terminal
   188  // state to make room for new allocations
   189  func (a *AllocGarbageCollector) MakeRoomFor(allocations []*structs.Allocation) error {
   190  	totalResource := &structs.Resources{}
   191  	for _, alloc := range allocations {
   192  		if err := totalResource.Add(alloc.Resources); err != nil {
   193  			return err
   194  		}
   195  	}
   196  
   197  	// If the host has enough free space to accomodate the new allocations then
   198  	// we don't need to garbage collect terminated allocations
   199  	if hostStats := a.statsCollector.Stats(); hostStats != nil {
   200  		var availableForAllocations uint64
   201  		if hostStats.AllocDirStats.Available < uint64(a.config.ReservedDiskMB*MB) {
   202  			availableForAllocations = 0
   203  		} else {
   204  			availableForAllocations = hostStats.AllocDirStats.Available - uint64(a.config.ReservedDiskMB*MB)
   205  		}
   206  		if uint64(totalResource.DiskMB*MB) < availableForAllocations {
   207  			return nil
   208  		}
   209  	}
   210  
   211  	var diskCleared int
   212  	for {
   213  		select {
   214  		case <-a.shutdownCh:
   215  			return nil
   216  		default:
   217  		}
   218  
   219  		// Collect host stats and see if we still need to remove older
   220  		// allocations
   221  		var allocDirStats *stats.DiskStats
   222  		if err := a.statsCollector.Collect(); err == nil {
   223  			if hostStats := a.statsCollector.Stats(); hostStats != nil {
   224  				allocDirStats = hostStats.AllocDirStats
   225  			}
   226  		}
   227  
   228  		if allocDirStats != nil {
   229  			if allocDirStats.Available >= uint64(totalResource.DiskMB*MB) {
   230  				break
   231  			}
   232  		} else {
   233  			// Falling back to a simpler model to know if we have enough disk
   234  			// space if stats collection fails
   235  			if diskCleared >= totalResource.DiskMB {
   236  				break
   237  			}
   238  		}
   239  
   240  		gcAlloc := a.allocRunners.Pop()
   241  		if gcAlloc == nil {
   242  			break
   243  		}
   244  
   245  		ar := gcAlloc.allocRunner
   246  		alloc := ar.Alloc()
   247  		a.logger.Printf("[INFO] client: garbage collecting allocation %v", alloc.ID)
   248  
   249  		// Destroy the alloc runner and wait until it exits
   250  		a.destroyAllocRunner(ar)
   251  
   252  		// Call stats collect again
   253  		diskCleared += alloc.Resources.DiskMB
   254  	}
   255  	return nil
   256  }
   257  
   258  // MarkForCollection starts tracking an allocation for Garbage Collection
   259  func (a *AllocGarbageCollector) MarkForCollection(ar *AllocRunner) error {
   260  	if ar == nil {
   261  		return fmt.Errorf("nil allocation runner inserted for garbage collection")
   262  	}
   263  	if ar.Alloc() == nil {
   264  		a.logger.Printf("[INFO] client: alloc is nil, so garbage collecting")
   265  		a.destroyAllocRunner(ar)
   266  	}
   267  
   268  	a.logger.Printf("[INFO] client: marking allocation %v for GC", ar.Alloc().ID)
   269  	return a.allocRunners.Push(ar)
   270  }
   271  
   272  // Remove removes an alloc runner without garbage collecting it
   273  func (a *AllocGarbageCollector) Remove(ar *AllocRunner) {
   274  	if ar == nil || ar.Alloc() == nil {
   275  		return
   276  	}
   277  
   278  	alloc := ar.Alloc()
   279  	if _, err := a.allocRunners.Remove(alloc.ID); err == nil {
   280  		a.logger.Printf("[INFO] client: removed alloc runner %v from garbage collector", alloc.ID)
   281  	}
   282  }
   283  
// GCAlloc wraps an allocation runner and an index enabling it to be used within
// a PQ
type GCAlloc struct {
	// timeStamp is the ordering key of the PQ; entries with earlier
	// timestamps are popped first.
	timeStamp   time.Time
	// allocRunner is the runner to destroy when this entry is collected.
	allocRunner *AllocRunner
	// index is the entry's current position in the heap slice, maintained by
	// the heap's Push and Swap; it is set to -1 once the entry is popped.
	index       int
}
   291  
   292  type GCAllocPQImpl []*GCAlloc
   293  
   294  func (pq GCAllocPQImpl) Len() int {
   295  	return len(pq)
   296  }
   297  
   298  func (pq GCAllocPQImpl) Less(i, j int) bool {
   299  	return pq[i].timeStamp.Before(pq[j].timeStamp)
   300  }
   301  
   302  func (pq GCAllocPQImpl) Swap(i, j int) {
   303  	pq[i], pq[j] = pq[j], pq[i]
   304  	pq[i].index = i
   305  	pq[j].index = j
   306  }
   307  
   308  func (pq *GCAllocPQImpl) Push(x interface{}) {
   309  	n := len(*pq)
   310  	item := x.(*GCAlloc)
   311  	item.index = n
   312  	*pq = append(*pq, item)
   313  }
   314  
   315  func (pq *GCAllocPQImpl) Pop() interface{} {
   316  	old := *pq
   317  	n := len(old)
   318  	item := old[n-1]
   319  	item.index = -1 // for safety
   320  	*pq = old[0 : n-1]
   321  	return item
   322  }
   323  
// IndexedGCAllocPQ is an indexed PQ which maintains a list of allocation runner
// based on their termination time.
type IndexedGCAllocPQ struct {
	// index maps an allocation ID to its entry in the heap, allowing
	// duplicate detection on Push and removal by ID.
	index map[string]*GCAlloc
	// heap holds the same entries ordered by timestamp.
	heap  GCAllocPQImpl

	// pqLock is held by every method while it reads or modifies index and
	// heap.
	pqLock sync.Mutex
}
   332  
   333  func NewIndexedGCAllocPQ() *IndexedGCAllocPQ {
   334  	return &IndexedGCAllocPQ{
   335  		index: make(map[string]*GCAlloc),
   336  		heap:  make(GCAllocPQImpl, 0),
   337  	}
   338  }
   339  
   340  func (i *IndexedGCAllocPQ) Push(ar *AllocRunner) error {
   341  	i.pqLock.Lock()
   342  	defer i.pqLock.Unlock()
   343  
   344  	alloc := ar.Alloc()
   345  	if _, ok := i.index[alloc.ID]; ok {
   346  		// No work to do
   347  		return nil
   348  	}
   349  	gcAlloc := &GCAlloc{
   350  		timeStamp:   time.Now(),
   351  		allocRunner: ar,
   352  	}
   353  	i.index[alloc.ID] = gcAlloc
   354  	heap.Push(&i.heap, gcAlloc)
   355  	return nil
   356  }
   357  
   358  func (i *IndexedGCAllocPQ) Pop() *GCAlloc {
   359  	i.pqLock.Lock()
   360  	defer i.pqLock.Unlock()
   361  
   362  	if len(i.heap) == 0 {
   363  		return nil
   364  	}
   365  
   366  	gcAlloc := heap.Pop(&i.heap).(*GCAlloc)
   367  	delete(i.index, gcAlloc.allocRunner.Alloc().ID)
   368  	return gcAlloc
   369  }
   370  
   371  func (i *IndexedGCAllocPQ) Remove(allocID string) (*GCAlloc, error) {
   372  	i.pqLock.Lock()
   373  	defer i.pqLock.Unlock()
   374  
   375  	if gcAlloc, ok := i.index[allocID]; ok {
   376  		heap.Remove(&i.heap, gcAlloc.index)
   377  		delete(i.index, allocID)
   378  		return gcAlloc, nil
   379  	}
   380  
   381  	return nil, fmt.Errorf("alloc %q not present", allocID)
   382  }
   383  
   384  func (i *IndexedGCAllocPQ) Length() int {
   385  	i.pqLock.Lock()
   386  	defer i.pqLock.Unlock()
   387  
   388  	return len(i.heap)
   389  }