github.com/ferranbt/nomad@v0.9.3-0.20190607002617-85c449b7667c/client/gc.go (about)

     1  package client
     2  
     3  import (
     4  	"container/heap"
     5  	"fmt"
     6  	"sync"
     7  	"time"
     8  
     9  	hclog "github.com/hashicorp/go-hclog"
    10  	"github.com/hashicorp/nomad/client/stats"
    11  	"github.com/hashicorp/nomad/nomad/structs"
    12  )
    13  
    14  const (
    15  	// MB is a constant which converts values in bytes to MB
    16  	MB = 1024 * 1024
    17  )
    18  
    19  // GCConfig allows changing the behaviour of the garbage collector
    20  type GCConfig struct {
    21  	// MaxAllocs is the maximum number of allocations to track before a GC
    22  	// is triggered.
    23  	MaxAllocs           int
    24  	DiskUsageThreshold  float64
    25  	InodeUsageThreshold float64
    26  	Interval            time.Duration
    27  	ReservedDiskMB      int
    28  	ParallelDestroys    int
    29  }
    30  
    31  // AllocCounter is used by AllocGarbageCollector to discover how many un-GC'd
    32  // allocations a client has and is generally fulfilled by the Client.
    33  type AllocCounter interface {
    34  	NumAllocs() int
    35  }
    36  
    37  // AllocGarbageCollector garbage collects terminated allocations on a node
    38  type AllocGarbageCollector struct {
    39  	config *GCConfig
    40  
    41  	// allocRunners marked for GC
    42  	allocRunners *IndexedGCAllocPQ
    43  
    44  	// statsCollector for node based thresholds (eg disk)
    45  	statsCollector stats.NodeStatsCollector
    46  
    47  	// allocCounter return the number of un-GC'd allocs on this node
    48  	allocCounter AllocCounter
    49  
    50  	// destroyCh is a semaphore for rate limiting concurrent garbage
    51  	// collections
    52  	destroyCh chan struct{}
    53  
    54  	// shutdownCh is closed when the GC's run method should exit
    55  	shutdownCh chan struct{}
    56  
    57  	// triggerCh is ticked by the Trigger method to cause a GC
    58  	triggerCh chan struct{}
    59  
    60  	logger hclog.Logger
    61  }
    62  
    63  // NewAllocGarbageCollector returns a garbage collector for terminated
    64  // allocations on a node. Must call Run() in a goroutine enable periodic
    65  // garbage collection.
    66  func NewAllocGarbageCollector(logger hclog.Logger, statsCollector stats.NodeStatsCollector, ac AllocCounter, config *GCConfig) *AllocGarbageCollector {
    67  	logger = logger.Named("gc")
    68  	// Require at least 1 to make progress
    69  	if config.ParallelDestroys <= 0 {
    70  		logger.Warn("garbage collector defaulting parallelism to 1 due to invalid input value", "gc_parallel_destroys", config.ParallelDestroys)
    71  		config.ParallelDestroys = 1
    72  	}
    73  
    74  	gc := &AllocGarbageCollector{
    75  		allocRunners:   NewIndexedGCAllocPQ(),
    76  		statsCollector: statsCollector,
    77  		allocCounter:   ac,
    78  		config:         config,
    79  		logger:         logger,
    80  		destroyCh:      make(chan struct{}, config.ParallelDestroys),
    81  		shutdownCh:     make(chan struct{}),
    82  		triggerCh:      make(chan struct{}, 1),
    83  	}
    84  
    85  	return gc
    86  }
    87  
    88  // Run the periodic garbage collector.
    89  func (a *AllocGarbageCollector) Run() {
    90  	ticker := time.NewTicker(a.config.Interval)
    91  	for {
    92  		select {
    93  		case <-a.triggerCh:
    94  		case <-ticker.C:
    95  		case <-a.shutdownCh:
    96  			ticker.Stop()
    97  			return
    98  		}
    99  
   100  		if err := a.keepUsageBelowThreshold(); err != nil {
   101  			a.logger.Error("error garbage collecting allocations", "error", err)
   102  		}
   103  	}
   104  }
   105  
   106  // Force the garbage collector to run.
   107  func (a *AllocGarbageCollector) Trigger() {
   108  	select {
   109  	case a.triggerCh <- struct{}{}:
   110  	default:
   111  		// already triggered
   112  	}
   113  }
   114  
   115  // keepUsageBelowThreshold collects disk usage information and garbage collects
   116  // allocations to make disk space available.
   117  func (a *AllocGarbageCollector) keepUsageBelowThreshold() error {
   118  	for {
   119  		select {
   120  		case <-a.shutdownCh:
   121  			return nil
   122  		default:
   123  		}
   124  
   125  		// Check if we have enough free space
   126  		if err := a.statsCollector.Collect(); err != nil {
   127  			return err
   128  		}
   129  
   130  		// See if we are below thresholds for used disk space and inode usage
   131  		diskStats := a.statsCollector.Stats().AllocDirStats
   132  		reason := ""
   133  		logf := a.logger.Warn
   134  
   135  		liveAllocs := a.allocCounter.NumAllocs()
   136  
   137  		switch {
   138  		case diskStats.UsedPercent > a.config.DiskUsageThreshold:
   139  			reason = fmt.Sprintf("disk usage of %.0f is over gc threshold of %.0f",
   140  				diskStats.UsedPercent, a.config.DiskUsageThreshold)
   141  		case diskStats.InodesUsedPercent > a.config.InodeUsageThreshold:
   142  			reason = fmt.Sprintf("inode usage of %.0f is over gc threshold of %.0f",
   143  				diskStats.InodesUsedPercent, a.config.InodeUsageThreshold)
   144  		case liveAllocs > a.config.MaxAllocs:
   145  			// if we're unable to gc, don't WARN until at least 2x over limit
   146  			if liveAllocs < (a.config.MaxAllocs * 2) {
   147  				logf = a.logger.Info
   148  			}
   149  			reason = fmt.Sprintf("number of allocations (%d) is over the limit (%d)", liveAllocs, a.config.MaxAllocs)
   150  		}
   151  
   152  		if reason == "" {
   153  			// No reason to gc, exit
   154  			break
   155  		}
   156  
   157  		// Collect an allocation
   158  		gcAlloc := a.allocRunners.Pop()
   159  		if gcAlloc == nil {
   160  			logf("garbage collection skipped because no terminal allocations", "reason", reason)
   161  			break
   162  		}
   163  
   164  		// Destroy the alloc runner and wait until it exits
   165  		a.destroyAllocRunner(gcAlloc.allocID, gcAlloc.allocRunner, reason)
   166  	}
   167  	return nil
   168  }
   169  
   170  // destroyAllocRunner is used to destroy an allocation runner. It will acquire a
   171  // lock to restrict parallelism and then destroy the alloc runner, returning
   172  // once the allocation has been destroyed.
   173  func (a *AllocGarbageCollector) destroyAllocRunner(allocID string, ar AllocRunner, reason string) {
   174  	a.logger.Info("garbage collecting allocation", "alloc_id", allocID, "reason", reason)
   175  
   176  	// Acquire the destroy lock
   177  	select {
   178  	case <-a.shutdownCh:
   179  		return
   180  	case a.destroyCh <- struct{}{}:
   181  	}
   182  
   183  	ar.Destroy()
   184  
   185  	select {
   186  	case <-ar.DestroyCh():
   187  	case <-a.shutdownCh:
   188  	}
   189  
   190  	a.logger.Debug("alloc garbage collected", "alloc_id", allocID)
   191  
   192  	// Release the lock
   193  	<-a.destroyCh
   194  }
   195  
   196  func (a *AllocGarbageCollector) Stop() {
   197  	close(a.shutdownCh)
   198  }
   199  
   200  // Collect garbage collects a single allocation on a node. Returns true if
   201  // alloc was found and garbage collected; otherwise false.
   202  func (a *AllocGarbageCollector) Collect(allocID string) bool {
   203  	gcAlloc := a.allocRunners.Remove(allocID)
   204  	if gcAlloc == nil {
   205  		a.logger.Debug("alloc was already garbage collected", "alloc_id", allocID)
   206  		return false
   207  	}
   208  
   209  	a.destroyAllocRunner(allocID, gcAlloc.allocRunner, "forced collection")
   210  	return true
   211  }
   212  
   213  // CollectAll garbage collects all terminated allocations on a node
   214  func (a *AllocGarbageCollector) CollectAll() {
   215  	for {
   216  		select {
   217  		case <-a.shutdownCh:
   218  			return
   219  		default:
   220  		}
   221  
   222  		gcAlloc := a.allocRunners.Pop()
   223  		if gcAlloc == nil {
   224  			return
   225  		}
   226  
   227  		go a.destroyAllocRunner(gcAlloc.allocID, gcAlloc.allocRunner, "forced full node collection")
   228  	}
   229  }
   230  
   231  // MakeRoomFor garbage collects enough number of allocations in the terminal
   232  // state to make room for new allocations
   233  func (a *AllocGarbageCollector) MakeRoomFor(allocations []*structs.Allocation) error {
   234  	if len(allocations) == 0 {
   235  		// Nothing to make room for!
   236  		return nil
   237  	}
   238  
   239  	// GC allocs until below the max limit + the new allocations
   240  	max := a.config.MaxAllocs - len(allocations)
   241  	for a.allocCounter.NumAllocs() > max {
   242  		select {
   243  		case <-a.shutdownCh:
   244  			return nil
   245  		default:
   246  		}
   247  
   248  		gcAlloc := a.allocRunners.Pop()
   249  		if gcAlloc == nil {
   250  			// It's fine if we can't lower below the limit here as
   251  			// we'll keep trying to drop below the limit with each
   252  			// periodic gc
   253  			break
   254  		}
   255  
   256  		// Destroy the alloc runner and wait until it exits
   257  		a.destroyAllocRunner(gcAlloc.allocID, gcAlloc.allocRunner, fmt.Sprintf("new allocations and over max (%d)", a.config.MaxAllocs))
   258  	}
   259  
   260  	totalResource := &structs.AllocatedSharedResources{}
   261  	for _, alloc := range allocations {
   262  		// COMPAT(0.11): Remove in 0.11
   263  		if alloc.AllocatedResources != nil {
   264  			totalResource.Add(&alloc.AllocatedResources.Shared)
   265  		} else {
   266  			totalResource.DiskMB += int64(alloc.Resources.DiskMB)
   267  		}
   268  	}
   269  
   270  	// If the host has enough free space to accommodate the new allocations then
   271  	// we don't need to garbage collect terminated allocations
   272  	if hostStats := a.statsCollector.Stats(); hostStats != nil {
   273  		var availableForAllocations uint64
   274  		if hostStats.AllocDirStats.Available < uint64(a.config.ReservedDiskMB*MB) {
   275  			availableForAllocations = 0
   276  		} else {
   277  			availableForAllocations = hostStats.AllocDirStats.Available - uint64(a.config.ReservedDiskMB*MB)
   278  		}
   279  		if uint64(totalResource.DiskMB*MB) < availableForAllocations {
   280  			return nil
   281  		}
   282  	}
   283  
   284  	var diskCleared int64
   285  	for {
   286  		select {
   287  		case <-a.shutdownCh:
   288  			return nil
   289  		default:
   290  		}
   291  
   292  		// Collect host stats and see if we still need to remove older
   293  		// allocations
   294  		var allocDirStats *stats.DiskStats
   295  		if err := a.statsCollector.Collect(); err == nil {
   296  			if hostStats := a.statsCollector.Stats(); hostStats != nil {
   297  				allocDirStats = hostStats.AllocDirStats
   298  			}
   299  		}
   300  
   301  		if allocDirStats != nil {
   302  			if allocDirStats.Available >= uint64(totalResource.DiskMB*MB) {
   303  				break
   304  			}
   305  		} else {
   306  			// Falling back to a simpler model to know if we have enough disk
   307  			// space if stats collection fails
   308  			if diskCleared >= totalResource.DiskMB {
   309  				break
   310  			}
   311  		}
   312  
   313  		gcAlloc := a.allocRunners.Pop()
   314  		if gcAlloc == nil {
   315  			break
   316  		}
   317  
   318  		ar := gcAlloc.allocRunner
   319  		alloc := ar.Alloc()
   320  
   321  		// COMPAT(0.11): Remove in 0.11
   322  		var allocDiskMB int64
   323  		if alloc.AllocatedResources != nil {
   324  			allocDiskMB = alloc.AllocatedResources.Shared.DiskMB
   325  		} else {
   326  			allocDiskMB = int64(alloc.Resources.DiskMB)
   327  		}
   328  
   329  		// Destroy the alloc runner and wait until it exits
   330  		a.destroyAllocRunner(gcAlloc.allocID, ar, fmt.Sprintf("freeing %d MB for new allocations", allocDiskMB))
   331  
   332  		diskCleared += allocDiskMB
   333  	}
   334  	return nil
   335  }
   336  
   337  // MarkForCollection starts tracking an allocation for Garbage Collection
   338  func (a *AllocGarbageCollector) MarkForCollection(allocID string, ar AllocRunner) {
   339  	if a.allocRunners.Push(allocID, ar) {
   340  		a.logger.Info("marking allocation for GC", "alloc_id", allocID)
   341  	}
   342  }
   343  
   344  // GCAlloc wraps an allocation runner and an index enabling it to be used within
   345  // a PQ
   346  type GCAlloc struct {
   347  	timeStamp   time.Time
   348  	allocID     string
   349  	allocRunner AllocRunner
   350  	index       int
   351  }
   352  
   353  type GCAllocPQImpl []*GCAlloc
   354  
   355  func (pq GCAllocPQImpl) Len() int {
   356  	return len(pq)
   357  }
   358  
   359  func (pq GCAllocPQImpl) Less(i, j int) bool {
   360  	return pq[i].timeStamp.Before(pq[j].timeStamp)
   361  }
   362  
   363  func (pq GCAllocPQImpl) Swap(i, j int) {
   364  	pq[i], pq[j] = pq[j], pq[i]
   365  	pq[i].index = i
   366  	pq[j].index = j
   367  }
   368  
   369  func (pq *GCAllocPQImpl) Push(x interface{}) {
   370  	n := len(*pq)
   371  	item := x.(*GCAlloc)
   372  	item.index = n
   373  	*pq = append(*pq, item)
   374  }
   375  
   376  func (pq *GCAllocPQImpl) Pop() interface{} {
   377  	old := *pq
   378  	n := len(old)
   379  	item := old[n-1]
   380  	item.index = -1 // for safety
   381  	*pq = old[0 : n-1]
   382  	return item
   383  }
   384  
   385  // IndexedGCAllocPQ is an indexed PQ which maintains a list of allocation runner
   386  // based on their termination time.
   387  type IndexedGCAllocPQ struct {
   388  	index map[string]*GCAlloc
   389  	heap  GCAllocPQImpl
   390  
   391  	pqLock sync.Mutex
   392  }
   393  
   394  func NewIndexedGCAllocPQ() *IndexedGCAllocPQ {
   395  	return &IndexedGCAllocPQ{
   396  		index: make(map[string]*GCAlloc),
   397  		heap:  make(GCAllocPQImpl, 0),
   398  	}
   399  }
   400  
   401  // Push an alloc runner into the GC queue. Returns true if alloc was added,
   402  // false if the alloc already existed.
   403  func (i *IndexedGCAllocPQ) Push(allocID string, ar AllocRunner) bool {
   404  	i.pqLock.Lock()
   405  	defer i.pqLock.Unlock()
   406  
   407  	if _, ok := i.index[allocID]; ok {
   408  		// No work to do
   409  		return false
   410  	}
   411  	gcAlloc := &GCAlloc{
   412  		timeStamp:   time.Now(),
   413  		allocID:     allocID,
   414  		allocRunner: ar,
   415  	}
   416  	i.index[allocID] = gcAlloc
   417  	heap.Push(&i.heap, gcAlloc)
   418  	return true
   419  }
   420  
   421  func (i *IndexedGCAllocPQ) Pop() *GCAlloc {
   422  	i.pqLock.Lock()
   423  	defer i.pqLock.Unlock()
   424  
   425  	if len(i.heap) == 0 {
   426  		return nil
   427  	}
   428  
   429  	gcAlloc := heap.Pop(&i.heap).(*GCAlloc)
   430  	delete(i.index, gcAlloc.allocRunner.Alloc().ID)
   431  	return gcAlloc
   432  }
   433  
   434  // Remove alloc from GC. Returns nil if alloc doesn't exist.
   435  func (i *IndexedGCAllocPQ) Remove(allocID string) *GCAlloc {
   436  	i.pqLock.Lock()
   437  	defer i.pqLock.Unlock()
   438  
   439  	if gcAlloc, ok := i.index[allocID]; ok {
   440  		heap.Remove(&i.heap, gcAlloc.index)
   441  		delete(i.index, allocID)
   442  		return gcAlloc
   443  	}
   444  
   445  	return nil
   446  }
   447  
   448  func (i *IndexedGCAllocPQ) Length() int {
   449  	i.pqLock.Lock()
   450  	defer i.pqLock.Unlock()
   451  
   452  	return len(i.heap)
   453  }