github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/kernel/cgroup.go (about)

     1  // Copyright 2021 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  import (
    18  	"bytes"
    19  	"fmt"
    20  	"sort"
    21  
    22  	"github.com/MerlinKodo/gvisor/pkg/atomicbitops"
    23  	"github.com/MerlinKodo/gvisor/pkg/context"
    24  	"github.com/MerlinKodo/gvisor/pkg/errors/linuxerr"
    25  	"github.com/MerlinKodo/gvisor/pkg/fspath"
    26  	"github.com/MerlinKodo/gvisor/pkg/log"
    27  	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/kernfs"
    28  	"github.com/MerlinKodo/gvisor/pkg/sentry/vfs"
    29  )
    30  
// InvalidCgroupHierarchyID indicates an uninitialized hierarchy ID. It is also
// the value returned alongside an error when hierarchy ID allocation
// overflows.
const InvalidCgroupHierarchyID uint32 = 0

// InvalidCgroupID indicates an uninitialized cgroup ID. It is also the value
// returned alongside an error when cgroup ID allocation overflows.
const InvalidCgroupID uint32 = 0
    36  
    37  // CgroupControllerType is the name of a cgroup controller.
    38  type CgroupControllerType string
    39  
    40  // Available cgroup controllers.
    41  const (
    42  	CgroupControllerCPU     = CgroupControllerType("cpu")
    43  	CgroupControllerCPUAcct = CgroupControllerType("cpuacct")
    44  	CgroupControllerCPUSet  = CgroupControllerType("cpuset")
    45  	CgroupControllerDevices = CgroupControllerType("devices")
    46  	CgroupControllerJob     = CgroupControllerType("job")
    47  	CgroupControllerMemory  = CgroupControllerType("memory")
    48  	CgroupControllerPIDs    = CgroupControllerType("pids")
    49  )
    50  
    51  // ParseCgroupController parses a string as a CgroupControllerType.
    52  func ParseCgroupController(val string) (CgroupControllerType, error) {
    53  	switch val {
    54  	case "cpu":
    55  		return CgroupControllerCPU, nil
    56  	case "cpuacct":
    57  		return CgroupControllerCPUAcct, nil
    58  	case "cpuset":
    59  		return CgroupControllerCPUSet, nil
    60  	case "devices":
    61  		return CgroupControllerDevices, nil
    62  	case "job":
    63  		return CgroupControllerJob, nil
    64  	case "memory":
    65  		return CgroupControllerMemory, nil
    66  	case "pids":
    67  		return CgroupControllerPIDs, nil
    68  	default:
    69  		return "", fmt.Errorf("no such cgroup controller")
    70  	}
    71  }
    72  
// CgroupResourceType represents a resource type tracked by a particular
// controller.
type CgroupResourceType int

// Resources that may be charged to a controller.
//
// NOTE(review): this group was previously labelled as belonging to the cpuacct
// controller, but its only member is described as a charge for pids.current,
// which presumably belongs to the pids controller — confirm against cgroupfs.
const (
	// CgroupResourcePID represents a charge for pids.current.
	CgroupResourcePID CgroupResourceType = iota
)
    82  
// CgroupController is the common interface to cgroup controllers available to
// the entire sentry. The controllers themselves are defined by cgroupfs.
//
// Callers of this interface are often unable to access the synchronization
// needed to ensure returned values remain valid. Some of the values returned
// from this interface are thus snapshots in time, and may become stale. This
// is ok for many callers like procfs.
type CgroupController interface {
	// Type returns the type of this cgroup controller (ex "memory", "cpu").
	// Returned value is valid for the lifetime of the controller.
	Type() CgroupControllerType

	// HierarchyID returns the ID of the hierarchy this cgroup controller is
	// attached to. Returned value is valid for the lifetime of the controller.
	HierarchyID() uint32

	// EffectiveRootCgroup returns the effective root cgroup for this
	// controller. This is either the actual root of the underlying cgroupfs
	// filesystem, or the override root configured at sandbox startup. Returned
	// value is valid for the lifetime of the controller.
	EffectiveRootCgroup() Cgroup

	// NumCgroups returns the number of cgroups managed by this controller.
	// Returned value is a snapshot in time.
	NumCgroups() uint64

	// Enabled returns whether this controller is enabled. Returned value is a
	// snapshot in time.
	Enabled() bool
}
   113  
// Cgroup represents a named pointer to a cgroup in cgroupfs. When a task enters
// a cgroup, it holds a reference on the underlying dentry pointing to the
// cgroup.
//
// Cgroup values are used as map keys elsewhere in this file (see
// CgroupRegistry.computeInitialGroups), so two Cgroups are the same key only
// when both embedded fields compare equal.
//
// +stateify savable
type Cgroup struct {
	// Dentry is the kernfs dentry for the cgroup; reference counting on the
	// cgroup goes through it.
	*kernfs.Dentry
	// CgroupImpl is the cgroup implementation. Walk populates it from the
	// inode of the dentry it resolved.
	CgroupImpl
}
   123  
   124  // decRef drops a reference on the cgroup. This must happen outside a Task.mu
   125  // critical section.
   126  func (c *Cgroup) decRef() {
   127  	c.Dentry.DecRef(context.Background())
   128  }
   129  
   130  // Path returns the absolute path of c, relative to its hierarchy root.
   131  func (c *Cgroup) Path() string {
   132  	return c.FSLocalPath()
   133  }
   134  
   135  // Walk returns the cgroup at p, starting from c.
   136  func (c *Cgroup) Walk(ctx context.Context, vfsObj *vfs.VirtualFilesystem, p fspath.Path) (Cgroup, error) {
   137  	d, err := c.Dentry.WalkDentryTree(ctx, vfsObj, p)
   138  	if err != nil {
   139  		return Cgroup{}, err
   140  	}
   141  	return Cgroup{
   142  		Dentry:     d,
   143  		CgroupImpl: d.Inode().(CgroupImpl),
   144  	}, nil
   145  }
   146  
// CgroupMigrationContext represents an in-flight cgroup migration for
// a single task. A migration is finished by calling exactly one of Commit or
// Abort.
type CgroupMigrationContext struct {
	// src is the cgroup the task is leaving.
	src Cgroup
	// dst is the cgroup the task is moving into.
	dst Cgroup
	// t is the task being migrated.
	t *Task
}
   154  
   155  // Abort cancels a migration.
   156  func (ctx *CgroupMigrationContext) Abort() {
   157  	ctx.dst.AbortMigrate(ctx.t, &ctx.src)
   158  }
   159  
   160  // Commit completes a migration.
   161  func (ctx *CgroupMigrationContext) Commit() {
   162  	ctx.dst.CommitMigrate(ctx.t, &ctx.src)
   163  
   164  	ctx.t.mu.Lock()
   165  	delete(ctx.t.cgroups, ctx.src)
   166  	ctx.src.DecRef(ctx.t)
   167  	ctx.dst.IncRef()
   168  	ctx.t.cgroups[ctx.dst] = struct{}{}
   169  	ctx.t.mu.Unlock()
   170  }
   171  
// CgroupImpl is the common interface to cgroups.
type CgroupImpl interface {
	// Controllers lists the controllers associated with this cgroup.
	Controllers() []CgroupController

	// HierarchyID returns the id of the hierarchy that contains this cgroup.
	HierarchyID() uint32

	// Name returns the name for this cgroup, if any. If no name was provided
	// when the hierarchy was created, returns "".
	Name() string

	// Enter moves t into this cgroup.
	Enter(t *Task)

	// Leave moves t out of this cgroup.
	Leave(t *Task)

	// PrepareMigrate initiates a migration of t from src to this cgroup. See
	// cgroupfs.controller.PrepareMigrate.
	PrepareMigrate(t *Task, src *Cgroup) error

	// CommitMigrate completes an in-flight migration. See
	// cgroupfs.controller.CommitMigrate.
	CommitMigrate(t *Task, src *Cgroup)

	// AbortMigrate cancels an in-flight migration. See
	// cgroupfs.controller.AbortMigrate.
	AbortMigrate(t *Task, src *Cgroup)

	// Charge charges a controller in this cgroup for a particular resource. res
	// must match a valid resource for the specified controller type.
	//
	// The implementer should silently succeed if no matching controllers are
	// found.
	//
	// The underlying implementation will panic if passed an incompatible
	// resource type for a given controller.
	//
	// See cgroupfs.controller.Charge.
	Charge(t *Task, d *kernfs.Dentry, ctl CgroupControllerType, res CgroupResourceType, value int64) error

	// ReadControl allows a background context to read a cgroup's control
	// values.
	ReadControl(ctx context.Context, name string) (string, error)

	// WriteControl allows a background context to write a cgroup's control
	// values.
	WriteControl(ctx context.Context, name string, val string) error

	// ID returns the id of this cgroup.
	ID() uint32
}
   225  
// hierarchy represents a cgroupfs filesystem instance, with a unique set of
// controllers attached to it. Multiple cgroupfs mounts may reference the same
// hierarchy.
//
// +stateify savable
type hierarchy struct {
	// id is the hierarchy ID allocated by the registry when the hierarchy was
	// registered.
	id uint32
	// name is the optional hierarchy name; it is empty for unnamed
	// hierarchies.
	name string
	// These are a subset of the controllers in CgroupRegistry.controllers,
	// grouped here by hierarchy for convenient lookup.
	controllers map[CgroupControllerType]CgroupController
	// fs is not owned by hierarchy. The FS is responsible for unregistering the
	// hierarchy on destruction, which removes this association.
	fs *vfs.Filesystem
}
   241  
   242  func (h *hierarchy) match(ctypes []CgroupControllerType) bool {
   243  	if len(ctypes) != len(h.controllers) {
   244  		return false
   245  	}
   246  	for _, ty := range ctypes {
   247  		if _, ok := h.controllers[ty]; !ok {
   248  			return false
   249  		}
   250  	}
   251  	return true
   252  }
   253  
// cgroupFS is the public interface to cgroupfs. This lets the kernel package
// refer to cgroupfs.filesystem methods without directly depending on the
// cgroupfs package, which would lead to a circular dependency.
type cgroupFS interface {
	// VFSFilesystem returns the vfs.Filesystem for the cgroupfs.
	VFSFilesystem() *vfs.Filesystem

	// InitializeHierarchyID sets the hierarchy ID for this filesystem during
	// filesystem creation. May only be called before the filesystem is visible
	// to the vfs layer.
	InitializeHierarchyID(hid uint32)

	// RootCgroup returns the root cgroup of this instance. This returns the
	// actual root, and ignores any overrides setting an effective root.
	RootCgroup() Cgroup
}
   270  
// CgroupRegistry tracks the active set of cgroup controllers, hierarchies and
// cgroups on the system.
//
// +stateify savable
type CgroupRegistry struct {
	// lastHierarchyID is the id of the last allocated cgroup hierarchy. Valid
	// ids are from 1 to math.MaxUint32; 0 is InvalidCgroupHierarchyID.
	lastHierarchyID atomicbitops.Uint32

	// lastCgroupID is the id of the last allocated cgroup. Valid ids are
	// from 1 to math.MaxUint32; 0 is InvalidCgroupID.
	lastCgroupID atomicbitops.Uint32

	// mu protects the maps below. It is not saved.
	mu cgroupMutex `state:"nosave"`

	// controllers is the set of currently known cgroup controllers on the
	// system.
	//
	// +checklocks:mu
	controllers map[CgroupControllerType]CgroupController

	// hierarchies is the active set of cgroup hierarchies, keyed by hierarchy
	// ID. This contains all hierarchies on the system.
	//
	// +checklocks:mu
	hierarchies map[uint32]hierarchy

	// hierarchiesByName is a map of named hierarchies. Only named hierarchies
	// are tracked on this map.
	//
	// +checklocks:mu
	hierarchiesByName map[string]hierarchy

	// cgroups is the active set of cgroups, keyed by cgroup ID. This contains
	// all the cgroups on the system.
	//
	// +checklocks:mu
	cgroups map[uint32]CgroupImpl
}
   311  
   312  func newCgroupRegistry() *CgroupRegistry {
   313  	return &CgroupRegistry{
   314  		controllers:       make(map[CgroupControllerType]CgroupController),
   315  		hierarchies:       make(map[uint32]hierarchy),
   316  		hierarchiesByName: make(map[string]hierarchy),
   317  		cgroups:           make(map[uint32]CgroupImpl),
   318  	}
   319  }
   320  
   321  // nextHierarchyID returns a newly allocated, unique hierarchy ID.
   322  func (r *CgroupRegistry) nextHierarchyID() (uint32, error) {
   323  	if hid := r.lastHierarchyID.Add(1); hid != 0 {
   324  		return hid, nil
   325  	}
   326  	return InvalidCgroupHierarchyID, fmt.Errorf("cgroup hierarchy ID overflow")
   327  }
   328  
   329  // FindHierarchy returns a cgroup filesystem containing exactly the set of
   330  // controllers named in ctypes, and optionally the name specified in name if it
   331  // isn't empty. If no such FS is found, FindHierarchy return nil. FindHierarchy
   332  // takes a reference on the returned FS, which is transferred to the caller.
   333  func (r *CgroupRegistry) FindHierarchy(name string, ctypes []CgroupControllerType) (*vfs.Filesystem, error) {
   334  	r.mu.Lock()
   335  	defer r.mu.Unlock()
   336  
   337  	// If we have a hierarchy name, lookup by name.
   338  	if name != "" {
   339  		h, ok := r.hierarchiesByName[name]
   340  		if !ok {
   341  			// Name not found.
   342  			return nil, nil
   343  		}
   344  
   345  		if h.match(ctypes) {
   346  			if !h.fs.TryIncRef() {
   347  				// May be racing with filesystem destruction, see below.
   348  				r.unregisterLocked(h.id)
   349  				return nil, nil
   350  			}
   351  			return h.fs, nil
   352  		}
   353  
   354  		// Name matched, but controllers didn't. Fail per linux
   355  		// kernel/cgroup.c:cgroup_mount().
   356  		log.Debugf("cgroupfs: Registry lookup for name=%s controllers=%v failed; named matched but controllers didn't (have controllers=%v)", name, ctypes, h.controllers)
   357  		return nil, linuxerr.EBUSY
   358  	}
   359  
   360  	for _, h := range r.hierarchies {
   361  		if h.match(ctypes) {
   362  			if !h.fs.TryIncRef() {
   363  				// Racing with filesystem destruction, namely h.fs.Release.
   364  				// Since we hold r.mu, we know the hierarchy hasn't been
   365  				// unregistered yet, but its associated filesystem is tearing
   366  				// down.
   367  				//
   368  				// If we simply indicate the hierarchy wasn't found without
   369  				// cleaning up the registry, the caller can race with the
   370  				// unregister and find itself temporarily unable to create a new
   371  				// hierarchy with a subset of the relevant controllers.
   372  				//
   373  				// To keep the result of FindHierarchy consistent with the
   374  				// uniqueness of controllers enforced by Register, drop the
   375  				// dying hierarchy now. The eventual unregister by the FS
   376  				// teardown will become a no-op.
   377  				r.unregisterLocked(h.id)
   378  				return nil, nil
   379  			}
   380  			return h.fs, nil
   381  		}
   382  	}
   383  
   384  	return nil, nil
   385  }
   386  
   387  // FindCgroup locates a cgroup with the given parameters.
   388  //
   389  // A cgroup is considered a match even if it contains other controllers on the
   390  // same hierarchy.
   391  func (r *CgroupRegistry) FindCgroup(ctx context.Context, ctype CgroupControllerType, path string) (Cgroup, error) {
   392  	p := fspath.Parse(path)
   393  	if !p.Absolute {
   394  		return Cgroup{}, fmt.Errorf("path must be absolute")
   395  	}
   396  	k := KernelFromContext(ctx)
   397  	vfsfs, err := r.FindHierarchy("", []CgroupControllerType{ctype})
   398  	if err != nil {
   399  		return Cgroup{}, err
   400  	}
   401  	if vfsfs == nil {
   402  		return Cgroup{}, fmt.Errorf("controller not active")
   403  	}
   404  
   405  	rootCG := vfsfs.Impl().(cgroupFS).RootCgroup()
   406  
   407  	if !p.HasComponents() {
   408  		// Explicit root '/'.
   409  		return rootCG, nil
   410  	}
   411  
   412  	return rootCG.Walk(ctx, k.VFS(), p)
   413  }
   414  
   415  // Register registers the provided set of controllers with the registry as a new
   416  // hierarchy. If any controller is already registered, the function returns an
   417  // error without modifying the registry. Register sets the hierarchy ID for the
   418  // filesystem on success.
   419  func (r *CgroupRegistry) Register(name string, cs []CgroupController, fs cgroupFS) error {
   420  	r.mu.Lock()
   421  	defer r.mu.Unlock()
   422  
   423  	if name == "" && len(cs) == 0 {
   424  		return fmt.Errorf("can't register hierarchy with both no controllers and no name")
   425  	}
   426  
   427  	for _, c := range cs {
   428  		if _, ok := r.controllers[c.Type()]; ok {
   429  			return fmt.Errorf("controllers may only be mounted on a single hierarchy")
   430  		}
   431  	}
   432  
   433  	if _, ok := r.hierarchiesByName[name]; name != "" && ok {
   434  		return fmt.Errorf("hierarchy named %q already exists", name)
   435  	}
   436  
   437  	hid, err := r.nextHierarchyID()
   438  	if err != nil {
   439  		return err
   440  	}
   441  
   442  	// Must not fail below here, once we publish the hierarchy ID.
   443  
   444  	fs.InitializeHierarchyID(hid)
   445  
   446  	h := hierarchy{
   447  		id:          hid,
   448  		name:        name,
   449  		controllers: make(map[CgroupControllerType]CgroupController),
   450  		fs:          fs.VFSFilesystem(),
   451  	}
   452  	for _, c := range cs {
   453  		n := c.Type()
   454  		r.controllers[n] = c
   455  		h.controllers[n] = c
   456  	}
   457  	r.hierarchies[hid] = h
   458  	if name != "" {
   459  		r.hierarchiesByName[name] = h
   460  	}
   461  	return nil
   462  }
   463  
   464  // Unregister removes a previously registered hierarchy from the registry. If no
   465  // such hierarchy is registered, Unregister is a no-op.
   466  func (r *CgroupRegistry) Unregister(hid uint32) {
   467  	r.mu.Lock()
   468  	r.unregisterLocked(hid)
   469  	r.mu.Unlock()
   470  }
   471  
   472  // Precondition: Caller must hold r.mu.
   473  // +checklocks:r.mu
   474  func (r *CgroupRegistry) unregisterLocked(hid uint32) {
   475  	if h, ok := r.hierarchies[hid]; ok {
   476  		for name := range h.controllers {
   477  			delete(r.controllers, name)
   478  		}
   479  		delete(r.hierarchies, hid)
   480  	}
   481  }
   482  
   483  // computeInitialGroups takes a reference on each of the returned cgroups. The
   484  // caller takes ownership of this returned reference.
   485  func (r *CgroupRegistry) computeInitialGroups(inherit map[Cgroup]struct{}) map[Cgroup]struct{} {
   486  	r.mu.Lock()
   487  	defer r.mu.Unlock()
   488  
   489  	ctlSet := make(map[CgroupControllerType]CgroupController)
   490  	cgset := make(map[Cgroup]struct{})
   491  
   492  	// Remember controllers from the inherited cgroups set...
   493  	for cg := range inherit {
   494  		cg.IncRef() // Ref transferred to caller.
   495  		for _, ctl := range cg.Controllers() {
   496  			ctlSet[ctl.Type()] = ctl
   497  			cgset[cg] = struct{}{}
   498  		}
   499  	}
   500  
   501  	// ... and add the root cgroups of all the missing controllers.
   502  	for name, ctl := range r.controllers {
   503  		if _, ok := ctlSet[name]; !ok {
   504  			cg := ctl.EffectiveRootCgroup()
   505  			// Multiple controllers may share the same hierarchy, so may have
   506  			// the same root cgroup. Grab a single ref per hierarchy root.
   507  			if _, ok := cgset[cg]; ok {
   508  				continue
   509  			}
   510  			cg.IncRef() // Ref transferred to caller.
   511  			cgset[cg] = struct{}{}
   512  		}
   513  	}
   514  	return cgset
   515  }
   516  
   517  // GenerateProcCgroups writes the contents of /proc/cgroups to buf.
   518  func (r *CgroupRegistry) GenerateProcCgroups(buf *bytes.Buffer) {
   519  	r.mu.Lock()
   520  	entries := make([]string, 0, len(r.controllers))
   521  	for _, c := range r.controllers {
   522  		en := 0
   523  		if c.Enabled() {
   524  			en = 1
   525  		}
   526  		entries = append(entries, fmt.Sprintf("%s\t%d\t%d\t%d\n", c.Type(), c.HierarchyID(), c.NumCgroups(), en))
   527  	}
   528  	r.mu.Unlock()
   529  
   530  	sort.Strings(entries)
   531  	fmt.Fprint(buf, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n")
   532  	for _, e := range entries {
   533  		fmt.Fprint(buf, e)
   534  	}
   535  }
   536  
   537  // NextCgroupID returns a newly allocated, unique cgroup ID.
   538  func (r *CgroupRegistry) NextCgroupID() (uint32, error) {
   539  	if cid := r.lastCgroupID.Add(1); cid != 0 {
   540  		return cid, nil
   541  	}
   542  	return InvalidCgroupID, fmt.Errorf("cgroup ID overflow")
   543  }
   544  
   545  // AddCgroup adds the ID and cgroup in the map.
   546  func (r *CgroupRegistry) AddCgroup(cg CgroupImpl) {
   547  	r.mu.Lock()
   548  	r.cgroups[cg.ID()] = cg
   549  	r.mu.Unlock()
   550  }
   551  
   552  // GetCgroup returns the cgroup associated with the cgroup ID.
   553  func (r *CgroupRegistry) GetCgroup(cid uint32) (CgroupImpl, error) {
   554  	r.mu.Lock()
   555  	defer r.mu.Unlock()
   556  	cg, ok := r.cgroups[cid]
   557  	if !ok {
   558  		return nil, fmt.Errorf("cgroup with ID %d does not exist", cid)
   559  	}
   560  	return cg, nil
   561  }