github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/kernel/cgroup.go (about)

     1  // Copyright 2021 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  import (
    18  	"bytes"
    19  	"fmt"
    20  	"sort"
    21  
    22  	"github.com/metacubex/gvisor/pkg/atomicbitops"
    23  	"github.com/metacubex/gvisor/pkg/context"
    24  	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
    25  	"github.com/metacubex/gvisor/pkg/fspath"
    26  	"github.com/metacubex/gvisor/pkg/log"
    27  	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/kernfs"
    28  	"github.com/metacubex/gvisor/pkg/sentry/vfs"
    29  )
    30  
    31  // InvalidCgroupHierarchyID indicates an uninitialized hierarchy ID.
    32  const InvalidCgroupHierarchyID uint32 = 0
    33  
    34  // InvalidCgroupID indicates an uninitialized cgroup ID.
    35  const InvalidCgroupID uint32 = 0
    36  
    37  // CgroupControllerType is the name of a cgroup controller.
    38  type CgroupControllerType string
    39  
    40  // Available cgroup controllers.
    41  const (
    42  	CgroupControllerCPU     = CgroupControllerType("cpu")
    43  	CgroupControllerCPUAcct = CgroupControllerType("cpuacct")
    44  	CgroupControllerCPUSet  = CgroupControllerType("cpuset")
    45  	CgroupControllerDevices = CgroupControllerType("devices")
    46  	CgroupControllerJob     = CgroupControllerType("job")
    47  	CgroupControllerMemory  = CgroupControllerType("memory")
    48  	CgroupControllerPIDs    = CgroupControllerType("pids")
    49  )
    50  
    51  // CgroupCtrls is the list of cgroup controllers.
    52  var CgroupCtrls = []CgroupControllerType{"cpu", "cpuacct", "cpuset", "devices", "job", "memory", "pids"}
    53  
    54  // ParseCgroupController parses a string as a CgroupControllerType.
    55  func ParseCgroupController(val string) (CgroupControllerType, error) {
    56  	switch val {
    57  	case "cpu":
    58  		return CgroupControllerCPU, nil
    59  	case "cpuacct":
    60  		return CgroupControllerCPUAcct, nil
    61  	case "cpuset":
    62  		return CgroupControllerCPUSet, nil
    63  	case "devices":
    64  		return CgroupControllerDevices, nil
    65  	case "job":
    66  		return CgroupControllerJob, nil
    67  	case "memory":
    68  		return CgroupControllerMemory, nil
    69  	case "pids":
    70  		return CgroupControllerPIDs, nil
    71  	default:
    72  		return "", fmt.Errorf("no such cgroup controller")
    73  	}
    74  }
    75  
    76  // CgroupResourceType represents a resource type tracked by a particular
    77  // controller.
    78  type CgroupResourceType int
    79  
    80  // Resources for the cpuacct controller.
    81  const (
    82  	// CgroupResourcePID represents a charge for pids.current.
    83  	CgroupResourcePID CgroupResourceType = iota
    84  )
    85  
    86  // CgroupController is the common interface to cgroup controllers available to
    87  // the entire sentry. The controllers themselves are defined by cgroupfs.
    88  //
    89  // Callers of this interface are often unable access synchronization needed to
    90  // ensure returned values remain valid. Some of values returned from this
    91  // interface are thus snapshots in time, and may become stale. This is ok for
    92  // many callers like procfs.
    93  type CgroupController interface {
    94  	// Returns the type of this cgroup controller (ex "memory", "cpu"). Returned
    95  	// value is valid for the lifetime of the controller.
    96  	Type() CgroupControllerType
    97  
    98  	// Hierarchy returns the ID of the hierarchy this cgroup controller is
    99  	// attached to. Returned value is valid for the lifetime of the controller.
   100  	HierarchyID() uint32
   101  
   102  	// EffectiveRootCgroup returns the effective root cgroup for this
   103  	// controller. This is either the actual root of the underlying cgroupfs
   104  	// filesystem, or the override root configured at sandbox startup. Returned
   105  	// value is valid for the lifetime of the controller.
   106  	EffectiveRootCgroup() Cgroup
   107  
   108  	// NumCgroups returns the number of cgroups managed by this controller.
   109  	// Returned value is a snapshot in time.
   110  	NumCgroups() uint64
   111  
   112  	// Enabled returns whether this controller is enabled. Returned value is a
   113  	// snapshot in time.
   114  	Enabled() bool
   115  }
   116  
   117  // Cgroup represents a named pointer to a cgroup in cgroupfs. When a task enters
   118  // a cgroup, it holds a reference on the underlying dentry pointing to the
   119  // cgroup.
   120  //
   121  // +stateify savable
   122  type Cgroup struct {
   123  	*kernfs.Dentry
   124  	CgroupImpl
   125  }
   126  
   127  // decRef drops a reference on the cgroup. This must happen outside a Task.mu
   128  // critical section.
   129  func (c *Cgroup) decRef() {
   130  	c.Dentry.DecRef(context.Background())
   131  }
   132  
   133  // Path returns the absolute path of c, relative to its hierarchy root.
   134  func (c *Cgroup) Path() string {
   135  	return c.FSLocalPath()
   136  }
   137  
   138  // Walk returns the cgroup at p, starting from c.
   139  func (c *Cgroup) Walk(ctx context.Context, vfsObj *vfs.VirtualFilesystem, p fspath.Path) (Cgroup, error) {
   140  	d, err := c.Dentry.WalkDentryTree(ctx, vfsObj, p)
   141  	if err != nil {
   142  		return Cgroup{}, err
   143  	}
   144  	return Cgroup{
   145  		Dentry:     d,
   146  		CgroupImpl: d.Inode().(CgroupImpl),
   147  	}, nil
   148  }
   149  
   150  // CgroupMigrationContext represents an in-flight cgroup migration for
   151  // a single task.
   152  type CgroupMigrationContext struct {
   153  	src Cgroup
   154  	dst Cgroup
   155  	t   *Task
   156  }
   157  
   158  // Abort cancels a migration.
   159  func (ctx *CgroupMigrationContext) Abort() {
   160  	ctx.dst.AbortMigrate(ctx.t, &ctx.src)
   161  }
   162  
   163  // Commit completes a migration.
   164  func (ctx *CgroupMigrationContext) Commit() {
   165  	ctx.dst.CommitMigrate(ctx.t, &ctx.src)
   166  
   167  	ctx.t.mu.Lock()
   168  	delete(ctx.t.cgroups, ctx.src)
   169  	ctx.src.DecRef(ctx.t)
   170  	ctx.dst.IncRef()
   171  	ctx.t.cgroups[ctx.dst] = struct{}{}
   172  	ctx.t.mu.Unlock()
   173  }
   174  
   175  // CgroupImpl is the common interface to cgroups.
   176  type CgroupImpl interface {
   177  	// Controllers lists the controller associated with this cgroup.
   178  	Controllers() []CgroupController
   179  
   180  	// HierarchyID returns the id of the hierarchy that contains this cgroup.
   181  	HierarchyID() uint32
   182  
   183  	// Name returns the name for this cgroup, if any. If no name was provided
   184  	// when the hierarchy was created, returns "".
   185  	Name() string
   186  
   187  	// Enter moves t into this cgroup.
   188  	Enter(t *Task)
   189  
   190  	// Leave moves t out of this cgroup.
   191  	Leave(t *Task)
   192  
   193  	// PrepareMigrate initiates a migration of t from src to this cgroup. See
   194  	// cgroupfs.controller.PrepareMigrate.
   195  	PrepareMigrate(t *Task, src *Cgroup) error
   196  
   197  	// CommitMigrate completes an in-flight migration. See
   198  	// cgroupfs.controller.CommitMigrate.
   199  	CommitMigrate(t *Task, src *Cgroup)
   200  
   201  	// AbortMigrate cancels an in-flight migration. See
   202  	// cgroupfs.controller.AbortMigrate.
   203  	AbortMigrate(t *Task, src *Cgroup)
   204  
   205  	// Charge charges a controller in this cgroup for a particular resource. key
   206  	// must match a valid resource for the specified controller type.
   207  	//
   208  	// The implementer should silently succeed if no matching controllers are
   209  	// found.
   210  	//
   211  	// The underlying implementation will panic if passed an incompatible
   212  	// resource type for a given controller.
   213  	//
   214  	// See cgroupfs.controller.Charge.
   215  	Charge(t *Task, d *kernfs.Dentry, ctl CgroupControllerType, res CgroupResourceType, value int64) error
   216  
   217  	// ReadControlFromBackground allows a background context to read a cgroup's
   218  	// control values.
   219  	ReadControl(ctx context.Context, name string) (string, error)
   220  
   221  	// WriteControl allows a background context to write a cgroup's control
   222  	// values.
   223  	WriteControl(ctx context.Context, name string, val string) error
   224  
   225  	// ID returns the id of this cgroup.
   226  	ID() uint32
   227  }
   228  
   229  // hierarchy represents a cgroupfs filesystem instance, with a unique set of
   230  // controllers attached to it. Multiple cgroupfs mounts may reference the same
   231  // hierarchy.
   232  //
   233  // +stateify savable
   234  type hierarchy struct {
   235  	id   uint32
   236  	name string
   237  	// These are a subset of the controllers in CgroupRegistry.controllers,
   238  	// grouped here by hierarchy for convenient lookup.
   239  	controllers map[CgroupControllerType]CgroupController
   240  	// fs is not owned by hierarchy. The FS is responsible for unregistering the
   241  	// hierarchy on destruction, which removes this association.
   242  	fs *vfs.Filesystem
   243  }
   244  
   245  func (h *hierarchy) match(ctypes []CgroupControllerType) bool {
   246  	if len(ctypes) != len(h.controllers) {
   247  		return false
   248  	}
   249  	for _, ty := range ctypes {
   250  		if _, ok := h.controllers[ty]; !ok {
   251  			return false
   252  		}
   253  	}
   254  	return true
   255  }
   256  
   257  // cgroupFS is the public interface to cgroupfs. This lets the kernel package
   258  // refer to cgroupfs.filesystem methods without directly depending on the
   259  // cgroupfs package, which would lead to a circular dependency.
   260  type cgroupFS interface {
   261  	// Returns the vfs.Filesystem for the cgroupfs.
   262  	VFSFilesystem() *vfs.Filesystem
   263  
   264  	// InitializeHierarchyID sets the hierarchy ID for this filesystem during
   265  	// filesystem creation. May only be called before the filesystem is visible
   266  	// to the vfs layer.
   267  	InitializeHierarchyID(hid uint32)
   268  
   269  	// RootCgroup returns the root cgroup of this instance. This returns the
   270  	// actual root, and ignores any overrides setting an effective root.
   271  	RootCgroup() Cgroup
   272  }
   273  
   274  // CgroupRegistry tracks the active set of cgroup controllers on the system.
   275  //
   276  // +stateify savable
   277  type CgroupRegistry struct {
   278  	// lastHierarchyID is the id of the last allocated cgroup hierarchy. Valid
   279  	// ids are from 1 to math.MaxUint32.
   280  	//
   281  	lastHierarchyID atomicbitops.Uint32
   282  
   283  	// lastCgroupID is the id of the last allocated cgroup. Valid ids are
   284  	// from 1 to math.MaxUint32.
   285  	//
   286  	lastCgroupID atomicbitops.Uint32
   287  
   288  	mu cgroupMutex `state:"nosave"`
   289  
   290  	// controllers is the set of currently known cgroup controllers on the
   291  	// system.
   292  	//
   293  	// +checklocks:mu
   294  	controllers map[CgroupControllerType]CgroupController
   295  
   296  	// hierarchies is the active set of cgroup hierarchies. This contains all
   297  	// hierarchies on the system.
   298  	//
   299  	// +checklocks:mu
   300  	hierarchies map[uint32]hierarchy
   301  
   302  	// hierarchiesByName is a map of named hierarchies. Only named hierarchies
   303  	// are tracked on this map.
   304  	//
   305  	// +checklocks:mu
   306  	hierarchiesByName map[string]hierarchy
   307  
   308  	// cgroups is the active set of cgroups. This contains all the cgroups
   309  	// on the system.
   310  	//
   311  	// +checklocks:mu
   312  	cgroups map[uint32]CgroupImpl
   313  }
   314  
   315  func newCgroupRegistry() *CgroupRegistry {
   316  	return &CgroupRegistry{
   317  		controllers:       make(map[CgroupControllerType]CgroupController),
   318  		hierarchies:       make(map[uint32]hierarchy),
   319  		hierarchiesByName: make(map[string]hierarchy),
   320  		cgroups:           make(map[uint32]CgroupImpl),
   321  	}
   322  }
   323  
   324  // nextHierarchyID returns a newly allocated, unique hierarchy ID.
   325  func (r *CgroupRegistry) nextHierarchyID() (uint32, error) {
   326  	if hid := r.lastHierarchyID.Add(1); hid != 0 {
   327  		return hid, nil
   328  	}
   329  	return InvalidCgroupHierarchyID, fmt.Errorf("cgroup hierarchy ID overflow")
   330  }
   331  
   332  // FindHierarchy returns a cgroup filesystem containing exactly the set of
   333  // controllers named in ctypes, and optionally the name specified in name if it
   334  // isn't empty. If no such FS is found, FindHierarchy return nil. FindHierarchy
   335  // takes a reference on the returned FS, which is transferred to the caller.
   336  func (r *CgroupRegistry) FindHierarchy(name string, ctypes []CgroupControllerType) (*vfs.Filesystem, error) {
   337  	r.mu.Lock()
   338  	defer r.mu.Unlock()
   339  
   340  	// If we have a hierarchy name, lookup by name.
   341  	if name != "" {
   342  		h, ok := r.hierarchiesByName[name]
   343  		if !ok {
   344  			// Name not found.
   345  			return nil, nil
   346  		}
   347  
   348  		if h.match(ctypes) {
   349  			if !h.fs.TryIncRef() {
   350  				// May be racing with filesystem destruction, see below.
   351  				r.unregisterLocked(h.id)
   352  				return nil, nil
   353  			}
   354  			return h.fs, nil
   355  		}
   356  
   357  		// Name matched, but controllers didn't. Fail per linux
   358  		// kernel/cgroup.c:cgroup_mount().
   359  		log.Debugf("cgroupfs: Registry lookup for name=%s controllers=%v failed; named matched but controllers didn't (have controllers=%v)", name, ctypes, h.controllers)
   360  		return nil, linuxerr.EBUSY
   361  	}
   362  
   363  	for _, h := range r.hierarchies {
   364  		if h.match(ctypes) {
   365  			if !h.fs.TryIncRef() {
   366  				// Racing with filesystem destruction, namely h.fs.Release.
   367  				// Since we hold r.mu, we know the hierarchy hasn't been
   368  				// unregistered yet, but its associated filesystem is tearing
   369  				// down.
   370  				//
   371  				// If we simply indicate the hierarchy wasn't found without
   372  				// cleaning up the registry, the caller can race with the
   373  				// unregister and find itself temporarily unable to create a new
   374  				// hierarchy with a subset of the relevant controllers.
   375  				//
   376  				// To keep the result of FindHierarchy consistent with the
   377  				// uniqueness of controllers enforced by Register, drop the
   378  				// dying hierarchy now. The eventual unregister by the FS
   379  				// teardown will become a no-op.
   380  				r.unregisterLocked(h.id)
   381  				return nil, nil
   382  			}
   383  			return h.fs, nil
   384  		}
   385  	}
   386  
   387  	return nil, nil
   388  }
   389  
   390  // FindCgroup locates a cgroup with the given parameters.
   391  //
   392  // A cgroup is considered a match even if it contains other controllers on the
   393  // same hierarchy.
   394  func (r *CgroupRegistry) FindCgroup(ctx context.Context, ctype CgroupControllerType, path string) (Cgroup, error) {
   395  	p := fspath.Parse(path)
   396  	if !p.Absolute {
   397  		return Cgroup{}, fmt.Errorf("path must be absolute")
   398  	}
   399  	k := KernelFromContext(ctx)
   400  	vfsfs, err := r.FindHierarchy("", []CgroupControllerType{ctype})
   401  	if err != nil {
   402  		return Cgroup{}, err
   403  	}
   404  	if vfsfs == nil {
   405  		return Cgroup{}, fmt.Errorf("controller not active")
   406  	}
   407  	defer vfsfs.DecRef(ctx)
   408  
   409  	rootCG := vfsfs.Impl().(cgroupFS).RootCgroup()
   410  
   411  	if !p.HasComponents() {
   412  		// Explicit root '/'.
   413  		return rootCG, nil
   414  	}
   415  
   416  	return rootCG.Walk(ctx, k.VFS(), p)
   417  }
   418  
   419  // Register registers the provided set of controllers with the registry as a new
   420  // hierarchy. If any controller is already registered, the function returns an
   421  // error without modifying the registry. Register sets the hierarchy ID for the
   422  // filesystem on success.
   423  func (r *CgroupRegistry) Register(name string, cs []CgroupController, fs cgroupFS) error {
   424  	r.mu.Lock()
   425  	defer r.mu.Unlock()
   426  
   427  	if name == "" && len(cs) == 0 {
   428  		return fmt.Errorf("can't register hierarchy with both no controllers and no name")
   429  	}
   430  
   431  	for _, c := range cs {
   432  		if _, ok := r.controllers[c.Type()]; ok {
   433  			return fmt.Errorf("controllers may only be mounted on a single hierarchy")
   434  		}
   435  	}
   436  
   437  	if _, ok := r.hierarchiesByName[name]; name != "" && ok {
   438  		return fmt.Errorf("hierarchy named %q already exists", name)
   439  	}
   440  
   441  	hid, err := r.nextHierarchyID()
   442  	if err != nil {
   443  		return err
   444  	}
   445  
   446  	// Must not fail below here, once we publish the hierarchy ID.
   447  
   448  	fs.InitializeHierarchyID(hid)
   449  
   450  	h := hierarchy{
   451  		id:          hid,
   452  		name:        name,
   453  		controllers: make(map[CgroupControllerType]CgroupController),
   454  		fs:          fs.VFSFilesystem(),
   455  	}
   456  	for _, c := range cs {
   457  		n := c.Type()
   458  		r.controllers[n] = c
   459  		h.controllers[n] = c
   460  	}
   461  	r.hierarchies[hid] = h
   462  	if name != "" {
   463  		r.hierarchiesByName[name] = h
   464  	}
   465  	return nil
   466  }
   467  
   468  // Unregister removes a previously registered hierarchy from the registry. If no
   469  // such hierarchy is registered, Unregister is a no-op.
   470  func (r *CgroupRegistry) Unregister(hid uint32) {
   471  	r.mu.Lock()
   472  	r.unregisterLocked(hid)
   473  	r.mu.Unlock()
   474  }
   475  
   476  // Precondition: Caller must hold r.mu.
   477  // +checklocks:r.mu
   478  func (r *CgroupRegistry) unregisterLocked(hid uint32) {
   479  	if h, ok := r.hierarchies[hid]; ok {
   480  		for name := range h.controllers {
   481  			delete(r.controllers, name)
   482  		}
   483  		delete(r.hierarchies, hid)
   484  	}
   485  }
   486  
   487  // computeInitialGroups takes a reference on each of the returned cgroups. The
   488  // caller takes ownership of this returned reference.
   489  func (r *CgroupRegistry) computeInitialGroups(inherit map[Cgroup]struct{}) map[Cgroup]struct{} {
   490  	r.mu.Lock()
   491  	defer r.mu.Unlock()
   492  
   493  	ctlSet := make(map[CgroupControllerType]CgroupController)
   494  	cgset := make(map[Cgroup]struct{})
   495  
   496  	// Remember controllers from the inherited cgroups set...
   497  	for cg := range inherit {
   498  		cg.IncRef() // Ref transferred to caller.
   499  		for _, ctl := range cg.Controllers() {
   500  			ctlSet[ctl.Type()] = ctl
   501  			cgset[cg] = struct{}{}
   502  		}
   503  	}
   504  
   505  	// ... and add the root cgroups of all the missing controllers.
   506  	for name, ctl := range r.controllers {
   507  		if _, ok := ctlSet[name]; !ok {
   508  			cg := ctl.EffectiveRootCgroup()
   509  			// Multiple controllers may share the same hierarchy, so may have
   510  			// the same root cgroup. Grab a single ref per hierarchy root.
   511  			if _, ok := cgset[cg]; ok {
   512  				continue
   513  			}
   514  			cg.IncRef() // Ref transferred to caller.
   515  			cgset[cg] = struct{}{}
   516  		}
   517  	}
   518  	return cgset
   519  }
   520  
   521  // GenerateProcCgroups writes the contents of /proc/cgroups to buf.
   522  func (r *CgroupRegistry) GenerateProcCgroups(buf *bytes.Buffer) {
   523  	r.mu.Lock()
   524  	entries := make([]string, 0, len(r.controllers))
   525  	for _, c := range r.controllers {
   526  		en := 0
   527  		if c.Enabled() {
   528  			en = 1
   529  		}
   530  		entries = append(entries, fmt.Sprintf("%s\t%d\t%d\t%d\n", c.Type(), c.HierarchyID(), c.NumCgroups(), en))
   531  	}
   532  	r.mu.Unlock()
   533  
   534  	sort.Strings(entries)
   535  	fmt.Fprint(buf, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n")
   536  	for _, e := range entries {
   537  		fmt.Fprint(buf, e)
   538  	}
   539  }
   540  
   541  // NextCgroupID returns a newly allocated, unique cgroup ID.
   542  func (r *CgroupRegistry) NextCgroupID() (uint32, error) {
   543  	if cid := r.lastCgroupID.Add(1); cid != 0 {
   544  		return cid, nil
   545  	}
   546  	return InvalidCgroupID, fmt.Errorf("cgroup ID overflow")
   547  }
   548  
   549  // AddCgroup adds the ID and cgroup in the map.
   550  func (r *CgroupRegistry) AddCgroup(cg CgroupImpl) {
   551  	r.mu.Lock()
   552  	r.cgroups[cg.ID()] = cg
   553  	r.mu.Unlock()
   554  }
   555  
   556  // GetCgroup returns the cgroup associated with the cgroup ID.
   557  func (r *CgroupRegistry) GetCgroup(cid uint32) (CgroupImpl, error) {
   558  	r.mu.Lock()
   559  	defer r.mu.Unlock()
   560  	cg, ok := r.cgroups[cid]
   561  	if !ok {
   562  		return nil, fmt.Errorf("cgroup with ID %d does not exist", cid)
   563  	}
   564  	return cg, nil
   565  }