github.com/hhrutter/nomad@v0.6.0-rc2.0.20170723054333-80c4b03f0705/client/gc.go (about)

     1  package client
     2  
     3  import (
     4  	"container/heap"
     5  	"fmt"
     6  	"log"
     7  	"sync"
     8  	"time"
     9  
    10  	"github.com/hashicorp/nomad/client/stats"
    11  	"github.com/hashicorp/nomad/nomad/structs"
    12  )
    13  
const (
	// MB is the number of bytes in one megabyte; multiply an MB-denominated
	// value (e.g. ReservedDiskMB, Resources.DiskMB) by MB to get bytes.
	MB = 1024 * 1024
)
    18  
// GCConfig allows changing the behaviour of the garbage collector
type GCConfig struct {
	// MaxAllocs is the maximum number of allocations to track before a GC
	// is triggered.
	MaxAllocs           int
	// DiskUsageThreshold is the used-disk percentage above which terminal
	// allocations are collected.
	DiskUsageThreshold  float64
	// InodeUsageThreshold is the used-inode percentage above which terminal
	// allocations are collected.
	InodeUsageThreshold float64
	// Interval is how often the periodic collector (Run) wakes up.
	Interval            time.Duration
	// ReservedDiskMB is disk space (in MB) subtracted from the alloc dir's
	// available space before deciding whether new allocations fit.
	ReservedDiskMB      int
	// ParallelDestroys bounds how many alloc runners may be destroyed
	// concurrently; values <= 0 are coerced to 1 by the constructor.
	ParallelDestroys    int
}
    30  
// AllocCounter is used by AllocGarbageCollector to discover how many
// allocations a node has and is generally fulfilled by the Client.
type AllocCounter interface {
	// NumAllocs returns the number of allocations currently on the node.
	NumAllocs() int
}
    36  
// AllocGarbageCollector garbage collects terminated allocations on a node
type AllocGarbageCollector struct {
	// allocRunners is the time-ordered queue of terminal alloc runners
	// awaiting collection (oldest popped first).
	allocRunners   *IndexedGCAllocPQ
	// statsCollector supplies host disk/inode usage stats for the alloc dir.
	statsCollector stats.NodeStatsCollector
	// allocCounter reports how many allocs the client currently has.
	allocCounter   AllocCounter
	config         *GCConfig
	logger         *log.Logger
	// destroyCh acts as a semaphore (capacity ParallelDestroys) bounding
	// concurrent alloc runner destroys.
	destroyCh      chan struct{}
	// shutdownCh is closed by Stop to halt Run and in-flight destroys.
	shutdownCh     chan struct{}
}
    47  
    48  // NewAllocGarbageCollector returns a garbage collector for terminated
    49  // allocations on a node. Must call Run() in a goroutine enable periodic
    50  // garbage collection.
    51  func NewAllocGarbageCollector(logger *log.Logger, statsCollector stats.NodeStatsCollector, ac AllocCounter, config *GCConfig) *AllocGarbageCollector {
    52  	// Require at least 1 to make progress
    53  	if config.ParallelDestroys <= 0 {
    54  		logger.Printf("[WARN] client: garbage collector defaulting parallism to 1 due to invalid input value of %d", config.ParallelDestroys)
    55  		config.ParallelDestroys = 1
    56  	}
    57  
    58  	gc := &AllocGarbageCollector{
    59  		allocRunners:   NewIndexedGCAllocPQ(),
    60  		statsCollector: statsCollector,
    61  		allocCounter:   ac,
    62  		config:         config,
    63  		logger:         logger,
    64  		destroyCh:      make(chan struct{}, config.ParallelDestroys),
    65  		shutdownCh:     make(chan struct{}),
    66  	}
    67  
    68  	return gc
    69  }
    70  
    71  // Run the periodic garbage collector.
    72  func (a *AllocGarbageCollector) Run() {
    73  	ticker := time.NewTicker(a.config.Interval)
    74  	for {
    75  		select {
    76  		case <-ticker.C:
    77  			if err := a.keepUsageBelowThreshold(); err != nil {
    78  				a.logger.Printf("[ERR] client: error garbage collecting allocation: %v", err)
    79  			}
    80  		case <-a.shutdownCh:
    81  			ticker.Stop()
    82  			return
    83  		}
    84  	}
    85  }
    86  
// keepUsageBelowThreshold collects disk usage information and garbage collects
// allocations to make disk space available. It loops, destroying one terminal
// allocation per iteration, until no GC trigger fires or there is nothing
// left to collect. Returns an error only when stats collection fails.
func (a *AllocGarbageCollector) keepUsageBelowThreshold() error {
	for {
		// Bail out promptly once Stop has been called.
		select {
		case <-a.shutdownCh:
			return nil
		default:
		}

		// Check if we have enough free space
		err := a.statsCollector.Collect()
		if err != nil {
			return err
		}

		// See if we are below thresholds for used disk space and inode usage
		// TODO(diptanu) figure out why this is nil
		stats := a.statsCollector.Stats()
		if stats == nil {
			break
		}

		diskStats := stats.AllocDirStats
		if diskStats == nil {
			break
		}

		// An empty reason after the switch means no trigger fired.
		reason := ""

		// Triggers are checked in priority order: disk, inodes, alloc count.
		switch {
		case diskStats.UsedPercent > a.config.DiskUsageThreshold:
			reason = fmt.Sprintf("disk usage of %.0f is over gc threshold of %.0f",
				diskStats.UsedPercent, a.config.DiskUsageThreshold)
		case diskStats.InodesUsedPercent > a.config.InodeUsageThreshold:
			reason = fmt.Sprintf("inode usage of %.0f is over gc threshold of %.0f",
				diskStats.InodesUsedPercent, a.config.InodeUsageThreshold)
		case a.numAllocs() > a.config.MaxAllocs:
			reason = fmt.Sprintf("number of allocations is over the limit (%d)", a.config.MaxAllocs)
		}

		// No reason to gc, exit
		if reason == "" {
			break
		}

		// Collect an allocation; Pop yields the oldest terminal alloc or
		// nil when the queue is empty.
		gcAlloc := a.allocRunners.Pop()
		if gcAlloc == nil {
			a.logger.Printf("[WARN] client: garbage collection due to %s skipped because no terminal allocations", reason)
			break
		}

		// Destroy the alloc runner and wait until it exits
		a.destroyAllocRunner(gcAlloc.allocRunner, reason)
	}
	return nil
}
   145  
   146  // destroyAllocRunner is used to destroy an allocation runner. It will acquire a
   147  // lock to restrict parallelism and then destroy the alloc runner, returning
   148  // once the allocation has been destroyed.
   149  func (a *AllocGarbageCollector) destroyAllocRunner(ar *AllocRunner, reason string) {
   150  	id := "<nil>"
   151  	if alloc := ar.Alloc(); alloc != nil {
   152  		id = alloc.ID
   153  	}
   154  	a.logger.Printf("[INFO] client: garbage collecting allocation %s due to %s", id, reason)
   155  
   156  	// Acquire the destroy lock
   157  	select {
   158  	case <-a.shutdownCh:
   159  		return
   160  	case a.destroyCh <- struct{}{}:
   161  	}
   162  
   163  	ar.Destroy()
   164  
   165  	select {
   166  	case <-ar.WaitCh():
   167  	case <-a.shutdownCh:
   168  	}
   169  
   170  	a.logger.Printf("[DEBUG] client: garbage collected %q", ar.Alloc().ID)
   171  
   172  	// Release the lock
   173  	<-a.destroyCh
   174  }
   175  
// Stop halts the garbage collector and any in-flight destroys by closing
// shutdownCh. It must be called at most once; a second call would panic on
// the double close.
func (a *AllocGarbageCollector) Stop() {
	close(a.shutdownCh)
}
   179  
   180  // Collect garbage collects a single allocation on a node
   181  func (a *AllocGarbageCollector) Collect(allocID string) error {
   182  	gcAlloc, err := a.allocRunners.Remove(allocID)
   183  	if err != nil {
   184  		return fmt.Errorf("unable to collect allocation %q: %v", allocID, err)
   185  	}
   186  	a.destroyAllocRunner(gcAlloc.allocRunner, "forced collection")
   187  	return nil
   188  }
   189  
   190  // CollectAll garbage collects all termianated allocations on a node
   191  func (a *AllocGarbageCollector) CollectAll() error {
   192  	for {
   193  		select {
   194  		case <-a.shutdownCh:
   195  			return nil
   196  		default:
   197  		}
   198  
   199  		gcAlloc := a.allocRunners.Pop()
   200  		if gcAlloc == nil {
   201  			break
   202  		}
   203  
   204  		go a.destroyAllocRunner(gcAlloc.allocRunner, "forced full collection")
   205  	}
   206  	return nil
   207  }
   208  
   209  // MakeRoomFor garbage collects enough number of allocations in the terminal
   210  // state to make room for new allocations
   211  func (a *AllocGarbageCollector) MakeRoomFor(allocations []*structs.Allocation) error {
   212  	// GC allocs until below the max limit + the new allocations
   213  	max := a.config.MaxAllocs - len(allocations)
   214  	for a.numAllocs() > max {
   215  		select {
   216  		case <-a.shutdownCh:
   217  			return nil
   218  		default:
   219  		}
   220  
   221  		gcAlloc := a.allocRunners.Pop()
   222  		if gcAlloc == nil {
   223  			// It's fine if we can't lower below the limit here as
   224  			// we'll keep trying to drop below the limit with each
   225  			// periodic gc
   226  			break
   227  		}
   228  
   229  		// Destroy the alloc runner and wait until it exits
   230  		a.destroyAllocRunner(gcAlloc.allocRunner, "new allocations")
   231  	}
   232  	totalResource := &structs.Resources{}
   233  	for _, alloc := range allocations {
   234  		if err := totalResource.Add(alloc.Resources); err != nil {
   235  			return err
   236  		}
   237  	}
   238  
   239  	// If the host has enough free space to accomodate the new allocations then
   240  	// we don't need to garbage collect terminated allocations
   241  	if hostStats := a.statsCollector.Stats(); hostStats != nil {
   242  		var availableForAllocations uint64
   243  		if hostStats.AllocDirStats.Available < uint64(a.config.ReservedDiskMB*MB) {
   244  			availableForAllocations = 0
   245  		} else {
   246  			availableForAllocations = hostStats.AllocDirStats.Available - uint64(a.config.ReservedDiskMB*MB)
   247  		}
   248  		if uint64(totalResource.DiskMB*MB) < availableForAllocations {
   249  			return nil
   250  		}
   251  	}
   252  
   253  	var diskCleared int
   254  	for {
   255  		select {
   256  		case <-a.shutdownCh:
   257  			return nil
   258  		default:
   259  		}
   260  
   261  		// Collect host stats and see if we still need to remove older
   262  		// allocations
   263  		var allocDirStats *stats.DiskStats
   264  		if err := a.statsCollector.Collect(); err == nil {
   265  			if hostStats := a.statsCollector.Stats(); hostStats != nil {
   266  				allocDirStats = hostStats.AllocDirStats
   267  			}
   268  		}
   269  
   270  		if allocDirStats != nil {
   271  			if allocDirStats.Available >= uint64(totalResource.DiskMB*MB) {
   272  				break
   273  			}
   274  		} else {
   275  			// Falling back to a simpler model to know if we have enough disk
   276  			// space if stats collection fails
   277  			if diskCleared >= totalResource.DiskMB {
   278  				break
   279  			}
   280  		}
   281  
   282  		gcAlloc := a.allocRunners.Pop()
   283  		if gcAlloc == nil {
   284  			break
   285  		}
   286  
   287  		ar := gcAlloc.allocRunner
   288  		alloc := ar.Alloc()
   289  
   290  		// Destroy the alloc runner and wait until it exits
   291  		a.destroyAllocRunner(ar, fmt.Sprintf("freeing %d MB for new allocations", alloc.Resources.DiskMB))
   292  
   293  		// Call stats collect again
   294  		diskCleared += alloc.Resources.DiskMB
   295  	}
   296  	return nil
   297  }
   298  
   299  // MarkForCollection starts tracking an allocation for Garbage Collection
   300  func (a *AllocGarbageCollector) MarkForCollection(ar *AllocRunner) error {
   301  	if ar == nil {
   302  		return fmt.Errorf("nil allocation runner inserted for garbage collection")
   303  	}
   304  	if ar.Alloc() == nil {
   305  		a.destroyAllocRunner(ar, "alloc is nil")
   306  	}
   307  
   308  	a.logger.Printf("[INFO] client: marking allocation %v for GC", ar.Alloc().ID)
   309  	return a.allocRunners.Push(ar)
   310  }
   311  
   312  // Remove removes an alloc runner without garbage collecting it
   313  func (a *AllocGarbageCollector) Remove(ar *AllocRunner) {
   314  	if ar == nil || ar.Alloc() == nil {
   315  		return
   316  	}
   317  
   318  	alloc := ar.Alloc()
   319  	if _, err := a.allocRunners.Remove(alloc.ID); err == nil {
   320  		a.logger.Printf("[INFO] client: removed alloc runner %v from garbage collector", alloc.ID)
   321  	}
   322  }
   323  
   324  // numAllocs returns the total number of allocs tracked by the client as well
   325  // as those marked for GC.
   326  func (a *AllocGarbageCollector) numAllocs() int {
   327  	return a.allocRunners.Length() + a.allocCounter.NumAllocs()
   328  }
   329  
// GCAlloc wraps an allocation runner and an index enabling it to be used within
// a PQ
type GCAlloc struct {
	timeStamp   time.Time // when the alloc was pushed; orders the min-heap
	allocRunner *AllocRunner
	index       int // position in the heap slice, maintained by Swap/Push/Pop
}
   337  
// GCAllocPQImpl is a min-heap of GCAllocs keyed by timeStamp; it implements
// heap.Interface and is driven via container/heap.
type GCAllocPQImpl []*GCAlloc
   339  
   340  func (pq GCAllocPQImpl) Len() int {
   341  	return len(pq)
   342  }
   343  
   344  func (pq GCAllocPQImpl) Less(i, j int) bool {
   345  	return pq[i].timeStamp.Before(pq[j].timeStamp)
   346  }
   347  
   348  func (pq GCAllocPQImpl) Swap(i, j int) {
   349  	pq[i], pq[j] = pq[j], pq[i]
   350  	pq[i].index = i
   351  	pq[j].index = j
   352  }
   353  
   354  func (pq *GCAllocPQImpl) Push(x interface{}) {
   355  	n := len(*pq)
   356  	item := x.(*GCAlloc)
   357  	item.index = n
   358  	*pq = append(*pq, item)
   359  }
   360  
   361  func (pq *GCAllocPQImpl) Pop() interface{} {
   362  	old := *pq
   363  	n := len(old)
   364  	item := old[n-1]
   365  	item.index = -1 // for safety
   366  	*pq = old[0 : n-1]
   367  	return item
   368  }
   369  
// IndexedGCAllocPQ is an indexed PQ which maintains a list of allocation runner
// based on their termination time.
type IndexedGCAllocPQ struct {
	index map[string]*GCAlloc // alloc ID -> entry, for O(1) lookup in Remove
	heap  GCAllocPQImpl       // min-heap ordered by timeStamp (oldest first)

	// pqLock guards both index and heap.
	pqLock sync.Mutex
}
   378  
   379  func NewIndexedGCAllocPQ() *IndexedGCAllocPQ {
   380  	return &IndexedGCAllocPQ{
   381  		index: make(map[string]*GCAlloc),
   382  		heap:  make(GCAllocPQImpl, 0),
   383  	}
   384  }
   385  
   386  func (i *IndexedGCAllocPQ) Push(ar *AllocRunner) error {
   387  	i.pqLock.Lock()
   388  	defer i.pqLock.Unlock()
   389  
   390  	alloc := ar.Alloc()
   391  	if _, ok := i.index[alloc.ID]; ok {
   392  		// No work to do
   393  		return nil
   394  	}
   395  	gcAlloc := &GCAlloc{
   396  		timeStamp:   time.Now(),
   397  		allocRunner: ar,
   398  	}
   399  	i.index[alloc.ID] = gcAlloc
   400  	heap.Push(&i.heap, gcAlloc)
   401  	return nil
   402  }
   403  
   404  func (i *IndexedGCAllocPQ) Pop() *GCAlloc {
   405  	i.pqLock.Lock()
   406  	defer i.pqLock.Unlock()
   407  
   408  	if len(i.heap) == 0 {
   409  		return nil
   410  	}
   411  
   412  	gcAlloc := heap.Pop(&i.heap).(*GCAlloc)
   413  	delete(i.index, gcAlloc.allocRunner.Alloc().ID)
   414  	return gcAlloc
   415  }
   416  
   417  func (i *IndexedGCAllocPQ) Remove(allocID string) (*GCAlloc, error) {
   418  	i.pqLock.Lock()
   419  	defer i.pqLock.Unlock()
   420  
   421  	if gcAlloc, ok := i.index[allocID]; ok {
   422  		heap.Remove(&i.heap, gcAlloc.index)
   423  		delete(i.index, allocID)
   424  		return gcAlloc, nil
   425  	}
   426  
   427  	return nil, fmt.Errorf("alloc %q not present", allocID)
   428  }
   429  
   430  func (i *IndexedGCAllocPQ) Length() int {
   431  	i.pqLock.Lock()
   432  	defer i.pqLock.Unlock()
   433  
   434  	return len(i.heap)
   435  }