github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/client/lib/cgutil/cpuset_manager_v2.go (about)

     1  //go:build linux
     2  
     3  package cgutil
     4  
     5  import (
     6  	"context"
     7  	"fmt"
     8  	"os"
     9  	"path/filepath"
    10  	"strings"
    11  	"sync"
    12  	"time"
    13  
    14  	"github.com/hashicorp/go-hclog"
    15  	"github.com/hashicorp/go-set"
    16  	"github.com/hashicorp/nomad/helper"
    17  	"github.com/hashicorp/nomad/lib/cpuset"
    18  	"github.com/hashicorp/nomad/nomad/structs"
    19  	"github.com/opencontainers/runc/libcontainer/cgroups"
    20  	"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
    21  	"github.com/opencontainers/runc/libcontainer/configs"
    22  )
    23  
const (
	// CreationPID is a special PID in libcontainer used to denote that a
	// cgroup should be created, but with no process added to it.
	//
	// https://github.com/opencontainers/runc/blob/v1.0.3/libcontainer/cgroups/utils.go#L372
	CreationPID = -1

	// DefaultCgroupParentV2 is the name of Nomad's default parent cgroup, under
	// which all other cgroups are managed. This can be changed with client
	// configuration, e.g. when Nomad tasks should be further constrained by an
	// externally configured systemd cgroup.
	DefaultCgroupParentV2 = "nomad.slice"
)
    37  
// nothing is the empty value type used for treating a map like a set, where
// only the keys carry information.
type nothing struct{}

// present is the value stored in a set-like map to indicate a key exists.
var present = nothing{}
    43  
// cpusetManagerV2 tracks which tasks share the common pool of cores and which
// tasks additionally hold reserved cores, and writes the resulting cpuset of
// each tracked task into a per-task cgroup under the parent cgroup.
type cpusetManagerV2 struct {
	logger hclog.Logger

	parent    string        // relative to cgroup root (e.g. "nomad.slice")
	parentAbs string        // absolute path (e.g. "/sys/fs/cgroup/nomad.slice")
	initial   cpuset.CPUSet // set of initial cores (never changes)

	lock      sync.Mutex                 // hold this when managing pool / sharing / isolating
	pool      cpuset.CPUSet              // pool of cores being shared among all tasks
	sharing   map[identity]nothing       // sharing tasks using cores only from the pool
	isolating map[identity]cpuset.CPUSet // isolating tasks using cores from the pool + reserved cores
}
    56  
    57  func NewCpusetManagerV2(parent string, reservable []uint16, logger hclog.Logger) CpusetManager {
    58  	if err := minimumRootControllers(); err != nil {
    59  		logger.Error("failed to enabled minimum set of cgroup controllers; disabling cpuset management", "error", err)
    60  		return new(NoopCpusetManager)
    61  	}
    62  
    63  	parentAbs := filepath.Join(CgroupRoot, parent)
    64  	if err := os.MkdirAll(parentAbs, 0o755); err != nil {
    65  		logger.Error("failed to ensure nomad parent cgroup exists; disabling cpuset management", "error", err)
    66  		return new(NoopCpusetManager)
    67  	}
    68  
    69  	if len(reservable) == 0 {
    70  		// read from group
    71  		if cpus, err := GetCPUsFromCgroup(parent); err != nil {
    72  			logger.Error("failed to lookup cpus from parent cgroup; disabling cpuset management", "error", err)
    73  			return new(NoopCpusetManager)
    74  		} else {
    75  			reservable = cpus
    76  		}
    77  	}
    78  
    79  	return &cpusetManagerV2{
    80  		initial:   cpuset.New(reservable...),
    81  		parent:    parent,
    82  		parentAbs: parentAbs,
    83  		logger:    logger,
    84  		sharing:   make(map[identity]nothing),
    85  		isolating: make(map[identity]cpuset.CPUSet),
    86  	}
    87  }
    88  
    89  // minimumControllers sets the minimum set of required controllers on the
    90  // /sys/fs/cgroup/cgroup.subtree_control file - ensuring [cpuset, cpu, io, memory, pids]
    91  // are enabled.
    92  func minimumRootControllers() error {
    93  	e := new(editor)
    94  	s, err := e.read("cgroup.subtree_control")
    95  	if err != nil {
    96  		return err
    97  	}
    98  
    99  	required := set.From[string]([]string{"cpuset", "cpu", "io", "memory", "pids"})
   100  	enabled := set.From[string](strings.Fields(s))
   101  	needed := required.Difference(enabled)
   102  
   103  	if needed.Size() == 0 {
   104  		return nil // already sufficient
   105  	}
   106  
   107  	sb := new(strings.Builder)
   108  	for _, controller := range needed.List() {
   109  		sb.WriteString("+" + controller + " ")
   110  	}
   111  
   112  	activation := strings.TrimSpace(sb.String())
   113  	return e.write("cgroup.subtree_control", activation)
   114  }
   115  
   116  func (c *cpusetManagerV2) Init() {
   117  	c.logger.Debug("initializing with", "cores", c.initial)
   118  }
   119  
   120  func (c *cpusetManagerV2) AddAlloc(alloc *structs.Allocation) {
   121  	if alloc == nil || alloc.AllocatedResources == nil {
   122  		return
   123  	}
   124  	c.logger.Trace("add allocation", "name", alloc.Name, "id", alloc.ID)
   125  
   126  	// grab write lock while we recompute and apply changes
   127  	c.lock.Lock()
   128  	defer c.lock.Unlock()
   129  
   130  	// first update our tracking of isolating and sharing tasks
   131  	for task, resources := range alloc.AllocatedResources.Tasks {
   132  		id := makeID(alloc.ID, task)
   133  		if len(resources.Cpu.ReservedCores) > 0 {
   134  			c.isolating[id] = cpuset.New(resources.Cpu.ReservedCores...)
   135  		} else {
   136  			c.sharing[id] = present
   137  		}
   138  	}
   139  
   140  	// recompute the available sharable cpu cores
   141  	c.recalculate()
   142  
   143  	// now write out the entire cgroups space
   144  	c.reconcile()
   145  
   146  	// no need to cleanup on adds, we did not remove a task
   147  }
   148  
   149  func (c *cpusetManagerV2) RemoveAlloc(allocID string) {
   150  	c.logger.Trace("remove allocation", "id", allocID)
   151  
   152  	// grab write lock while we recompute and apply changes.
   153  	c.lock.Lock()
   154  	defer c.lock.Unlock()
   155  
   156  	// remove tasks of allocID from the sharing set
   157  	for id := range c.sharing {
   158  		if strings.HasPrefix(string(id), allocID) {
   159  			delete(c.sharing, id)
   160  		}
   161  	}
   162  
   163  	// remove tasks of allocID from the isolating set
   164  	for id := range c.isolating {
   165  		if strings.HasPrefix(string(id), allocID) {
   166  			delete(c.isolating, id)
   167  		}
   168  	}
   169  
   170  	// recompute available sharable cpu cores
   171  	c.recalculate()
   172  
   173  	// now write out the entire cgroups space
   174  	c.reconcile()
   175  
   176  	// now remove any tasks no longer running
   177  	c.cleanup()
   178  }
   179  
   180  func (c *cpusetManagerV2) CgroupPathFor(allocID, task string) CgroupPathGetter {
   181  	// The CgroupPathFor implementation must block until cgroup for allocID.task
   182  	// exists [and can accept a PID].
   183  	return func(ctx context.Context) (string, error) {
   184  		ticks, cancel := helper.NewSafeTimer(100 * time.Millisecond)
   185  		defer cancel()
   186  
   187  		for {
   188  			path := c.pathOf(makeID(allocID, task))
   189  			mgr, err := fs2.NewManager(nil, path)
   190  			if err != nil {
   191  				return "", err
   192  			}
   193  
   194  			if mgr.Exists() {
   195  				return path, nil
   196  			}
   197  
   198  			select {
   199  			case <-ctx.Done():
   200  				return "", ctx.Err()
   201  			case <-ticks.C:
   202  				continue
   203  			}
   204  		}
   205  	}
   206  }
   207  
   208  // recalculate the number of cores sharable by non-isolating tasks (and isolating tasks)
   209  //
   210  // must be called while holding c.lock
   211  func (c *cpusetManagerV2) recalculate() {
   212  	remaining := c.initial.Copy()
   213  	for _, set := range c.isolating {
   214  		remaining = remaining.Difference(set)
   215  	}
   216  	c.pool = remaining
   217  }
   218  
   219  // reconcile will actually write the cpuset values for all tracked tasks.
   220  //
   221  // must be called while holding c.lock
   222  func (c *cpusetManagerV2) reconcile() {
   223  	for id := range c.sharing {
   224  		c.write(id, c.pool)
   225  	}
   226  
   227  	for id, set := range c.isolating {
   228  		c.write(id, c.pool.Union(set))
   229  	}
   230  }
   231  
   232  // cleanup will remove any cgroups for allocations no longer being tracked
   233  //
   234  // must be called while holding c.lock
   235  func (c *cpusetManagerV2) cleanup() {
   236  	// create a map to lookup ids we know about
   237  	size := len(c.sharing) + len(c.isolating)
   238  	ids := make(map[identity]nothing, size)
   239  	for id := range c.sharing {
   240  		ids[id] = present
   241  	}
   242  	for id := range c.isolating {
   243  		ids[id] = present
   244  	}
   245  
   246  	if err := filepath.WalkDir(c.parentAbs, func(path string, entry os.DirEntry, err error) error {
   247  		// a cgroup is a directory
   248  		if !entry.IsDir() {
   249  			return nil
   250  		}
   251  
   252  		dir := filepath.Dir(path)
   253  		base := filepath.Base(path)
   254  
   255  		// only manage scopes directly under nomad.slice
   256  		if dir != c.parentAbs || !strings.HasSuffix(base, ".scope") {
   257  			return nil
   258  		}
   259  
   260  		// only remove the scope if we do not track it
   261  		id := identity(strings.TrimSuffix(base, ".scope"))
   262  		_, exists := ids[id]
   263  		if !exists {
   264  			c.remove(path)
   265  		}
   266  
   267  		return nil
   268  	}); err != nil {
   269  		c.logger.Error("failed to cleanup cgroup", "error", err)
   270  	}
   271  }
   272  
   273  // pathOf returns the absolute path to a task with identity id.
   274  func (c *cpusetManagerV2) pathOf(id identity) string {
   275  	return filepath.Join(c.parentAbs, makeScope(id))
   276  }
   277  
   278  // remove does the actual fs delete of the cgroup
   279  //
   280  // We avoid removing a cgroup if it still contains a PID, as the cpuset manager
   281  // may be initially empty on a Nomad client restart.
   282  func (c *cpusetManagerV2) remove(path string) {
   283  	mgr, err := fs2.NewManager(nil, path)
   284  	if err != nil {
   285  		c.logger.Warn("failed to create manager", "path", path, "error", err)
   286  		return
   287  	}
   288  
   289  	// get the list of pids managed by this scope (should be 0 or 1)
   290  	pids, _ := mgr.GetPids()
   291  
   292  	// do not destroy the scope if a PID is still present
   293  	// this is a normal condition when an agent restarts with running tasks
   294  	// and the v2 manager is still rebuilding its tracked tasks
   295  	if len(pids) > 0 {
   296  		return
   297  	}
   298  
   299  	// remove the cgroup
   300  	if err3 := mgr.Destroy(); err3 != nil {
   301  		c.logger.Warn("failed to cleanup cgroup", "path", path, "error", err)
   302  		return
   303  	}
   304  }
   305  
   306  // write does the actual write of cpuset set for cgroup id
   307  func (c *cpusetManagerV2) write(id identity, set cpuset.CPUSet) {
   308  	path := c.pathOf(id)
   309  
   310  	// make a manager for the cgroup
   311  	m, err := fs2.NewManager(new(configs.Cgroup), path)
   312  	if err != nil {
   313  		c.logger.Error("failed to manage cgroup", "path", path, "error", err)
   314  		return
   315  	}
   316  
   317  	// create the cgroup
   318  	if err = m.Apply(CreationPID); err != nil {
   319  		c.logger.Error("failed to apply cgroup", "path", path, "error", err)
   320  		return
   321  	}
   322  
   323  	// set the cpuset value for the cgroup
   324  	if err = m.Set(&configs.Resources{
   325  		CpusetCpus: set.String(),
   326  	}); err != nil {
   327  		c.logger.Error("failed to set cgroup", "path", path, "error", err)
   328  		return
   329  	}
   330  }
   331  
   332  // fromRoot returns the joined filepath of group on the CgroupRoot
   333  func fromRoot(group string) string {
   334  	return filepath.Join(CgroupRoot, group)
   335  }
   336  
   337  // getCPUsFromCgroupV2 retrieves the effective cpuset for the group, which must
   338  // be directly under the cgroup root (i.e. the parent, like nomad.slice).
   339  func getCPUsFromCgroupV2(group string) ([]uint16, error) {
   340  	path := fromRoot(group)
   341  	effective, err := cgroups.ReadFile(path, "cpuset.cpus.effective")
   342  	if err != nil {
   343  		return nil, err
   344  	}
   345  	set, err := cpuset.Parse(effective)
   346  	if err != nil {
   347  		return nil, err
   348  	}
   349  	return set.ToSlice(), nil
   350  }
   351  
   352  // identity is the "<allocID>.<taskName>" string that uniquely identifies an
   353  // individual instance of a task within the flat cgroup namespace
   354  type identity string
   355  
   356  func makeID(allocID, task string) identity {
   357  	return identity(fmt.Sprintf("%s.%s", allocID, task))
   358  }
   359  
   360  func makeScope(id identity) string {
   361  	return string(id) + ".scope"
   362  }