github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/kernel/cgroup.go (about)

     1  // Copyright 2021 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  import (
    18  	"bytes"
    19  	"fmt"
    20  	"sort"
    21  
    22  	"github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops"
    23  	"github.com/nicocha30/gvisor-ligolo/pkg/context"
    24  	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
    25  	"github.com/nicocha30/gvisor-ligolo/pkg/fspath"
    26  	"github.com/nicocha30/gvisor-ligolo/pkg/log"
    27  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/kernfs"
    28  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs"
    29  )
    30  
    31  // InvalidCgroupHierarchyID indicates an uninitialized hierarchy ID.
    32  const InvalidCgroupHierarchyID uint32 = 0
    33  
    34  // InvalidCgroupID indicates an uninitialized cgroup ID.
    35  const InvalidCgroupID uint32 = 0
    36  
    37  // CgroupControllerType is the name of a cgroup controller.
    38  type CgroupControllerType string
    39  
    40  // Available cgroup controllers.
    41  const (
    42  	CgroupControllerCPU     = CgroupControllerType("cpu")
    43  	CgroupControllerCPUAcct = CgroupControllerType("cpuacct")
    44  	CgroupControllerCPUSet  = CgroupControllerType("cpuset")
    45  	CgroupControllerJob     = CgroupControllerType("job")
    46  	CgroupControllerMemory  = CgroupControllerType("memory")
    47  	CgroupControllerPIDs    = CgroupControllerType("pids")
    48  )
    49  
    50  // ParseCgroupController parses a string as a CgroupControllerType.
    51  func ParseCgroupController(val string) (CgroupControllerType, error) {
    52  	switch val {
    53  	case "cpu":
    54  		return CgroupControllerCPU, nil
    55  	case "cpuacct":
    56  		return CgroupControllerCPUAcct, nil
    57  	case "cpuset":
    58  		return CgroupControllerCPUSet, nil
    59  	case "job":
    60  		return CgroupControllerJob, nil
    61  	case "memory":
    62  		return CgroupControllerMemory, nil
    63  	case "pids":
    64  		return CgroupControllerPIDs, nil
    65  	default:
    66  		return "", fmt.Errorf("no such cgroup controller")
    67  	}
    68  }
    69  
    70  // CgroupResourceType represents a resource type tracked by a particular
    71  // controller.
    72  type CgroupResourceType int
    73  
    74  // Resource types that may be passed to CgroupImpl.Charge.
    75  const (
    76  	// CgroupResourcePID represents a charge for pids.current.
    77  	CgroupResourcePID CgroupResourceType = iota
    78  )
    79  
    80  // CgroupController is the common interface to cgroup controllers available to
    81  // the entire sentry. The controllers themselves are defined by cgroupfs.
    82  //
    83  // Callers of this interface are often unable to access the synchronization
    84  // needed to ensure returned values remain valid. Some of the values returned
    85  // from this interface are thus snapshots in time, and may become stale. This
    86  // is ok for many callers like procfs.
    87  type CgroupController interface {
    88  	// Type returns the type of this cgroup controller (e.g. "memory", "cpu").
    89  	// The returned value is valid for the lifetime of the controller.
    90  	Type() CgroupControllerType
    91  
    92  	// HierarchyID returns the ID of the hierarchy this cgroup controller is
    93  	// attached to. Returned value is valid for the lifetime of the controller.
    94  	HierarchyID() uint32
    95  
    96  	// EffectiveRootCgroup returns the effective root cgroup for this
    97  	// controller. This is either the actual root of the underlying cgroupfs
    98  	// filesystem, or the override root configured at sandbox startup. Returned
    99  	// value is valid for the lifetime of the controller.
   100  	EffectiveRootCgroup() Cgroup
   101  
   102  	// NumCgroups returns the number of cgroups managed by this controller.
   103  	// Returned value is a snapshot in time.
   104  	NumCgroups() uint64
   105  
   106  	// Enabled returns whether this controller is enabled. Returned value is a
   107  	// snapshot in time.
   108  	Enabled() bool
   109  }
   110  
   111  // Cgroup represents a named pointer to a cgroup in cgroupfs, pairing the
   112  // cgroup's dentry with its CgroupImpl. When a task enters a cgroup, it holds
   113  // a reference on the underlying dentry pointing to the cgroup.
   114  //
   115  // +stateify savable
   116  type Cgroup struct {
   117  	*kernfs.Dentry
   118  	CgroupImpl
   119  }
   120  
   121  // decRef drops a reference on the cgroup. This must happen outside a Task.mu
   122  // critical section.
   123  func (c *Cgroup) decRef() {
   124  	c.Dentry.DecRef(context.Background())
   125  }
   126  
   127  // Path returns the absolute path of c, relative to its hierarchy root.
   128  func (c *Cgroup) Path() string {
   129  	return c.FSLocalPath()
   130  }
   131  
   132  // Walk returns the cgroup at p, starting from c.
   133  func (c *Cgroup) Walk(ctx context.Context, vfsObj *vfs.VirtualFilesystem, p fspath.Path) (Cgroup, error) {
   134  	d, err := c.Dentry.WalkDentryTree(ctx, vfsObj, p)
   135  	if err != nil {
   136  		return Cgroup{}, err
   137  	}
   138  	return Cgroup{
   139  		Dentry:     d,
   140  		CgroupImpl: d.Inode().(CgroupImpl),
   141  	}, nil
   142  }
   143  
   144  // CgroupMigrationContext represents an in-flight cgroup migration for
   145  // a single task: task t is being moved from cgroup src to cgroup dst.
   146  type CgroupMigrationContext struct {
   147  	src Cgroup
   148  	dst Cgroup
   149  	t   *Task
   150  }
   151  
   152  // Abort cancels a migration.
   153  func (ctx *CgroupMigrationContext) Abort() {
   154  	ctx.dst.AbortMigrate(ctx.t, &ctx.src)
   155  }
   156  
   157  // Commit completes a migration.
   158  func (ctx *CgroupMigrationContext) Commit() {
   159  	ctx.dst.CommitMigrate(ctx.t, &ctx.src)
   160  
   161  	ctx.t.mu.Lock()
   162  	delete(ctx.t.cgroups, ctx.src)
   163  	ctx.src.DecRef(ctx.t)
   164  	ctx.dst.IncRef()
   165  	ctx.t.cgroups[ctx.dst] = struct{}{}
   166  	ctx.t.mu.Unlock()
   167  }
   168  
   169  // CgroupImpl is the common interface to cgroups.
   170  type CgroupImpl interface {
   171  	// Controllers lists the controllers associated with this cgroup.
   172  	Controllers() []CgroupController
   173  
   174  	// HierarchyID returns the id of the hierarchy that contains this cgroup.
   175  	HierarchyID() uint32
   176  
   177  	// Name returns the name for this cgroup, if any. If no name was provided
   178  	// when the hierarchy was created, returns "".
   179  	Name() string
   180  
   181  	// Enter moves t into this cgroup.
   182  	Enter(t *Task)
   183  
   184  	// Leave moves t out of this cgroup.
   185  	Leave(t *Task)
   186  
   187  	// PrepareMigrate initiates a migration of t from src to this cgroup. See
   188  	// cgroupfs.controller.PrepareMigrate.
   189  	PrepareMigrate(t *Task, src *Cgroup) error
   190  
   191  	// CommitMigrate completes an in-flight migration. See
   192  	// cgroupfs.controller.CommitMigrate.
   193  	CommitMigrate(t *Task, src *Cgroup)
   194  
   195  	// AbortMigrate cancels an in-flight migration. See
   196  	// cgroupfs.controller.AbortMigrate.
   197  	AbortMigrate(t *Task, src *Cgroup)
   198  
   199  	// Charge charges a controller in this cgroup for a particular resource. res
   200  	// must match a valid resource for the specified controller type.
   201  	//
   202  	// The implementer should silently succeed if no matching controllers are
   203  	// found.
   204  	//
   205  	// The underlying implementation will panic if passed an incompatible
   206  	// resource type for a given controller.
   207  	//
   208  	// See cgroupfs.controller.Charge.
   209  	Charge(t *Task, d *kernfs.Dentry, ctl CgroupControllerType, res CgroupResourceType, value int64) error
   210  
   211  	// ReadControl allows a background context to read a cgroup's control
   212  	// values.
   213  	ReadControl(ctx context.Context, name string) (string, error)
   214  
   215  	// WriteControl allows a background context to write a cgroup's control
   216  	// values.
   217  	WriteControl(ctx context.Context, name string, val string) error
   218  
   219  	// ID returns the id of this cgroup.
   220  	ID() uint32
   221  }
   222  
   223  // hierarchy represents a cgroupfs filesystem instance, with a unique set of
   224  // controllers attached to it. Multiple cgroupfs mounts may reference the same
   225  // hierarchy.
   226  //
   227  // +stateify savable
   228  type hierarchy struct {
   229  	id   uint32
   230  	name string
   231  	// These are a subset of the controllers in CgroupRegistry.controllers,
   232  	// grouped here by hierarchy for convenient lookup.
   233  	controllers map[CgroupControllerType]CgroupController
   234  	// fs is not owned by hierarchy. The FS is responsible for unregistering the
   235  	// hierarchy on destruction, which removes this association.
   236  	fs *vfs.Filesystem
   237  }
   238  
   239  func (h *hierarchy) match(ctypes []CgroupControllerType) bool {
   240  	if len(ctypes) != len(h.controllers) {
   241  		return false
   242  	}
   243  	for _, ty := range ctypes {
   244  		if _, ok := h.controllers[ty]; !ok {
   245  			return false
   246  		}
   247  	}
   248  	return true
   249  }
   250  
   251  // cgroupFS is the public interface to cgroupfs. This lets the kernel package
   252  // refer to cgroupfs.filesystem methods without directly depending on the
   253  // cgroupfs package, which would lead to a circular dependency.
   254  type cgroupFS interface {
   255  	// VFSFilesystem returns the vfs.Filesystem for the cgroupfs.
   256  	VFSFilesystem() *vfs.Filesystem
   257  
   258  	// InitializeHierarchyID sets the hierarchy ID for this filesystem during
   259  	// filesystem creation. May only be called before the filesystem is visible
   260  	// to the vfs layer.
   261  	InitializeHierarchyID(hid uint32)
   262  
   263  	// RootCgroup returns the root cgroup of this instance. This returns the
   264  	// actual root, and ignores any overrides setting an effective root.
   265  	RootCgroup() Cgroup
   266  }
   267  
   268  // CgroupRegistry tracks the active set of cgroup controllers on the system.
   269  //
   270  // +stateify savable
   271  type CgroupRegistry struct {
   272  	// lastHierarchyID is the id of the last allocated cgroup hierarchy. Valid
   273  	// ids are from 1 to math.MaxUint32.
   274  	// 0 is reserved as InvalidCgroupHierarchyID.
   275  	lastHierarchyID atomicbitops.Uint32
   276  
   277  	// lastCgroupID is the id of the last allocated cgroup. Valid ids are
   278  	// from 1 to math.MaxUint32.
   279  	// 0 is reserved as InvalidCgroupID.
   280  	lastCgroupID atomicbitops.Uint32
   281  
   282  	mu cgroupMutex `state:"nosave"`
   283  
   284  	// controllers is the set of currently known cgroup controllers on the
   285  	// system.
   286  	//
   287  	// +checklocks:mu
   288  	controllers map[CgroupControllerType]CgroupController
   289  
   290  	// hierarchies is the active set of cgroup hierarchies. This contains all
   291  	// hierarchies on the system.
   292  	//
   293  	// +checklocks:mu
   294  	hierarchies map[uint32]hierarchy
   295  
   296  	// hierarchiesByName is a map of named hierarchies. Only named hierarchies
   297  	// are tracked on this map.
   298  	//
   299  	// +checklocks:mu
   300  	hierarchiesByName map[string]hierarchy
   301  
   302  	// cgroups is the active set of cgroups. This contains all the cgroups
   303  	// on the system.
   304  	//
   305  	// +checklocks:mu
   306  	cgroups map[uint32]CgroupImpl
   307  }
   308  
   309  func newCgroupRegistry() *CgroupRegistry {
   310  	return &CgroupRegistry{
   311  		controllers:       make(map[CgroupControllerType]CgroupController),
   312  		hierarchies:       make(map[uint32]hierarchy),
   313  		hierarchiesByName: make(map[string]hierarchy),
   314  		cgroups:           make(map[uint32]CgroupImpl),
   315  	}
   316  }
   317  
   318  // nextHierarchyID returns a newly allocated, unique hierarchy ID.
   319  func (r *CgroupRegistry) nextHierarchyID() (uint32, error) {
   320  	if hid := r.lastHierarchyID.Add(1); hid != 0 {
   321  		return hid, nil
   322  	}
   323  	return InvalidCgroupHierarchyID, fmt.Errorf("cgroup hierarchy ID overflow")
   324  }
   325  
   326  // FindHierarchy returns a cgroup filesystem containing exactly the set of
   327  // controllers named in ctypes, and optionally the name specified in name if it
   328  // isn't empty. If no such FS is found, FindHierarchy return nil. FindHierarchy
   329  // takes a reference on the returned FS, which is transferred to the caller.
   330  func (r *CgroupRegistry) FindHierarchy(name string, ctypes []CgroupControllerType) (*vfs.Filesystem, error) {
   331  	r.mu.Lock()
   332  	defer r.mu.Unlock()
   333  
   334  	// If we have a hierarchy name, lookup by name.
   335  	if name != "" {
   336  		h, ok := r.hierarchiesByName[name]
   337  		if !ok {
   338  			// Name not found.
   339  			return nil, nil
   340  		}
   341  
   342  		if h.match(ctypes) {
   343  			if !h.fs.TryIncRef() {
   344  				// May be racing with filesystem destruction, see below.
   345  				r.unregisterLocked(h.id)
   346  				return nil, nil
   347  			}
   348  			return h.fs, nil
   349  		}
   350  
   351  		// Name matched, but controllers didn't. Fail per linux
   352  		// kernel/cgroup.c:cgroup_mount().
   353  		log.Debugf("cgroupfs: Registry lookup for name=%s controllers=%v failed; named matched but controllers didn't (have controllers=%v)", name, ctypes, h.controllers)
   354  		return nil, linuxerr.EBUSY
   355  	}
   356  
   357  	for _, h := range r.hierarchies {
   358  		if h.match(ctypes) {
   359  			if !h.fs.TryIncRef() {
   360  				// Racing with filesystem destruction, namely h.fs.Release.
   361  				// Since we hold r.mu, we know the hierarchy hasn't been
   362  				// unregistered yet, but its associated filesystem is tearing
   363  				// down.
   364  				//
   365  				// If we simply indicate the hierarchy wasn't found without
   366  				// cleaning up the registry, the caller can race with the
   367  				// unregister and find itself temporarily unable to create a new
   368  				// hierarchy with a subset of the relevant controllers.
   369  				//
   370  				// To keep the result of FindHierarchy consistent with the
   371  				// uniqueness of controllers enforced by Register, drop the
   372  				// dying hierarchy now. The eventual unregister by the FS
   373  				// teardown will become a no-op.
   374  				r.unregisterLocked(h.id)
   375  				return nil, nil
   376  			}
   377  			return h.fs, nil
   378  		}
   379  	}
   380  
   381  	return nil, nil
   382  }
   383  
   384  // FindCgroup locates a cgroup with the given parameters.
   385  //
   386  // A cgroup is considered a match even if it contains other controllers on the
   387  // same hierarchy.
   388  func (r *CgroupRegistry) FindCgroup(ctx context.Context, ctype CgroupControllerType, path string) (Cgroup, error) {
   389  	p := fspath.Parse(path)
   390  	if !p.Absolute {
   391  		return Cgroup{}, fmt.Errorf("path must be absolute")
   392  	}
   393  	k := KernelFromContext(ctx)
   394  	vfsfs, err := r.FindHierarchy("", []CgroupControllerType{ctype})
   395  	if err != nil {
   396  		return Cgroup{}, err
   397  	}
   398  	if vfsfs == nil {
   399  		return Cgroup{}, fmt.Errorf("controller not active")
   400  	}
   401  
   402  	rootCG := vfsfs.Impl().(cgroupFS).RootCgroup()
   403  
   404  	if !p.HasComponents() {
   405  		// Explicit root '/'.
   406  		return rootCG, nil
   407  	}
   408  
   409  	return rootCG.Walk(ctx, k.VFS(), p)
   410  }
   411  
   412  // Register registers the provided set of controllers with the registry as a new
   413  // hierarchy. If any controller is already registered, the function returns an
   414  // error without modifying the registry. Register sets the hierarchy ID for the
   415  // filesystem on success.
   416  func (r *CgroupRegistry) Register(name string, cs []CgroupController, fs cgroupFS) error {
   417  	r.mu.Lock()
   418  	defer r.mu.Unlock()
   419  
   420  	if name == "" && len(cs) == 0 {
   421  		return fmt.Errorf("can't register hierarchy with both no controllers and no name")
   422  	}
   423  
   424  	for _, c := range cs {
   425  		if _, ok := r.controllers[c.Type()]; ok {
   426  			return fmt.Errorf("controllers may only be mounted on a single hierarchy")
   427  		}
   428  	}
   429  
   430  	if _, ok := r.hierarchiesByName[name]; name != "" && ok {
   431  		return fmt.Errorf("hierarchy named %q already exists", name)
   432  	}
   433  
   434  	hid, err := r.nextHierarchyID()
   435  	if err != nil {
   436  		return err
   437  	}
   438  
   439  	// Must not fail below here, once we publish the hierarchy ID.
   440  
   441  	fs.InitializeHierarchyID(hid)
   442  
   443  	h := hierarchy{
   444  		id:          hid,
   445  		name:        name,
   446  		controllers: make(map[CgroupControllerType]CgroupController),
   447  		fs:          fs.VFSFilesystem(),
   448  	}
   449  	for _, c := range cs {
   450  		n := c.Type()
   451  		r.controllers[n] = c
   452  		h.controllers[n] = c
   453  	}
   454  	r.hierarchies[hid] = h
   455  	if name != "" {
   456  		r.hierarchiesByName[name] = h
   457  	}
   458  	return nil
   459  }
   460  
   461  // Unregister removes a previously registered hierarchy from the registry. If no
   462  // such hierarchy is registered, Unregister is a no-op.
   463  func (r *CgroupRegistry) Unregister(hid uint32) {
   464  	r.mu.Lock()
   465  	r.unregisterLocked(hid)
   466  	r.mu.Unlock()
   467  }
   468  
   469  // Precondition: Caller must hold r.mu.
   470  // +checklocks:r.mu
   471  func (r *CgroupRegistry) unregisterLocked(hid uint32) {
   472  	if h, ok := r.hierarchies[hid]; ok {
   473  		for name := range h.controllers {
   474  			delete(r.controllers, name)
   475  		}
   476  		delete(r.hierarchies, hid)
   477  	}
   478  }
   479  
   480  // computeInitialGroups takes a reference on each of the returned cgroups. The
   481  // caller takes ownership of this returned reference.
   482  func (r *CgroupRegistry) computeInitialGroups(inherit map[Cgroup]struct{}) map[Cgroup]struct{} {
   483  	r.mu.Lock()
   484  	defer r.mu.Unlock()
   485  
   486  	ctlSet := make(map[CgroupControllerType]CgroupController)
   487  	cgset := make(map[Cgroup]struct{})
   488  
   489  	// Remember controllers from the inherited cgroups set...
   490  	for cg := range inherit {
   491  		cg.IncRef() // Ref transferred to caller.
   492  		for _, ctl := range cg.Controllers() {
   493  			ctlSet[ctl.Type()] = ctl
   494  			cgset[cg] = struct{}{}
   495  		}
   496  	}
   497  
   498  	// ... and add the root cgroups of all the missing controllers.
   499  	for name, ctl := range r.controllers {
   500  		if _, ok := ctlSet[name]; !ok {
   501  			cg := ctl.EffectiveRootCgroup()
   502  			// Multiple controllers may share the same hierarchy, so may have
   503  			// the same root cgroup. Grab a single ref per hierarchy root.
   504  			if _, ok := cgset[cg]; ok {
   505  				continue
   506  			}
   507  			cg.IncRef() // Ref transferred to caller.
   508  			cgset[cg] = struct{}{}
   509  		}
   510  	}
   511  	return cgset
   512  }
   513  
   514  // GenerateProcCgroups writes the contents of /proc/cgroups to buf.
   515  func (r *CgroupRegistry) GenerateProcCgroups(buf *bytes.Buffer) {
   516  	r.mu.Lock()
   517  	entries := make([]string, 0, len(r.controllers))
   518  	for _, c := range r.controllers {
   519  		en := 0
   520  		if c.Enabled() {
   521  			en = 1
   522  		}
   523  		entries = append(entries, fmt.Sprintf("%s\t%d\t%d\t%d\n", c.Type(), c.HierarchyID(), c.NumCgroups(), en))
   524  	}
   525  	r.mu.Unlock()
   526  
   527  	sort.Strings(entries)
   528  	fmt.Fprint(buf, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n")
   529  	for _, e := range entries {
   530  		fmt.Fprint(buf, e)
   531  	}
   532  }
   533  
   534  // NextCgroupID returns a newly allocated, unique cgroup ID.
   535  func (r *CgroupRegistry) NextCgroupID() (uint32, error) {
   536  	if cid := r.lastCgroupID.Add(1); cid != 0 {
   537  		return cid, nil
   538  	}
   539  	return InvalidCgroupID, fmt.Errorf("cgroup ID overflow")
   540  }
   541  
   542  // AddCgroup adds the ID and cgroup in the map.
   543  func (r *CgroupRegistry) AddCgroup(cg CgroupImpl) {
   544  	r.mu.Lock()
   545  	r.cgroups[cg.ID()] = cg
   546  	r.mu.Unlock()
   547  }
   548  
   549  // GetCgroup returns the cgroup associated with the cgroup ID.
   550  func (r *CgroupRegistry) GetCgroup(cid uint32) (CgroupImpl, error) {
   551  	r.mu.Lock()
   552  	defer r.mu.Unlock()
   553  	cg, ok := r.cgroups[cid]
   554  	if !ok {
   555  		return nil, fmt.Errorf("cgroup with ID %d does not exist", cid)
   556  	}
   557  	return cg, nil
   558  }