github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/cgroup.go (about)

     1  // Copyright 2021 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  import (
    18  	"bytes"
    19  	"fmt"
    20  	"sort"
    21  	"sync/atomic"
    22  
    23  	"github.com/SagerNet/gvisor/pkg/context"
    24  	"github.com/SagerNet/gvisor/pkg/sentry/fsimpl/kernfs"
    25  	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
    26  	"github.com/SagerNet/gvisor/pkg/sync"
    27  )
    28  
    29  // InvalidCgroupHierarchyID indicates an uninitialized hierarchy ID.
    30  const InvalidCgroupHierarchyID uint32 = 0
    31  
    32  // CgroupControllerType is the name of a cgroup controller.
    33  type CgroupControllerType string
    34  
    35  // CgroupController is the common interface to cgroup controllers available to
    36  // the entire sentry. The controllers themselves are defined by cgroupfs.
    37  //
    38  // Callers of this interface are often unable access synchronization needed to
    39  // ensure returned values remain valid. Some of values returned from this
    40  // interface are thus snapshots in time, and may become stale. This is ok for
    41  // many callers like procfs.
    42  type CgroupController interface {
    43  	// Returns the type of this cgroup controller (ex "memory", "cpu"). Returned
    44  	// value is valid for the lifetime of the controller.
    45  	Type() CgroupControllerType
    46  
    47  	// Hierarchy returns the ID of the hierarchy this cgroup controller is
    48  	// attached to. Returned value is valid for the lifetime of the controller.
    49  	HierarchyID() uint32
    50  
    51  	// RootCgroup returns the root cgroup for this controller. Returned value is
    52  	// valid for the lifetime of the controller.
    53  	RootCgroup() Cgroup
    54  
    55  	// NumCgroups returns the number of cgroups managed by this controller.
    56  	// Returned value is a snapshot in time.
    57  	NumCgroups() uint64
    58  
    59  	// Enabled returns whether this controller is enabled. Returned value is a
    60  	// snapshot in time.
    61  	Enabled() bool
    62  }
    63  
    64  // Cgroup represents a named pointer to a cgroup in cgroupfs. When a task enters
    65  // a cgroup, it holds a reference on the underlying dentry pointing to the
    66  // cgroup.
    67  //
    68  // +stateify savable
    69  type Cgroup struct {
    70  	*kernfs.Dentry
    71  	CgroupImpl
    72  }
    73  
    74  func (c *Cgroup) decRef() {
    75  	c.Dentry.DecRef(context.Background())
    76  }
    77  
    78  // Path returns the absolute path of c, relative to its hierarchy root.
    79  func (c *Cgroup) Path() string {
    80  	return c.FSLocalPath()
    81  }
    82  
    83  // HierarchyID returns the id of the hierarchy that contains this cgroup.
    84  func (c *Cgroup) HierarchyID() uint32 {
    85  	// Note: a cgroup is guaranteed to have at least one controller.
    86  	return c.Controllers()[0].HierarchyID()
    87  }
    88  
    89  // CgroupImpl is the common interface to cgroups.
    90  type CgroupImpl interface {
    91  	Controllers() []CgroupController
    92  	Enter(t *Task)
    93  	Leave(t *Task)
    94  }
    95  
    96  // hierarchy represents a cgroupfs filesystem instance, with a unique set of
    97  // controllers attached to it. Multiple cgroupfs mounts may reference the same
    98  // hierarchy.
    99  //
   100  // +stateify savable
   101  type hierarchy struct {
   102  	id uint32
   103  	// These are a subset of the controllers in CgroupRegistry.controllers,
   104  	// grouped here by hierarchy for conveninent lookup.
   105  	controllers map[CgroupControllerType]CgroupController
   106  	// fs is not owned by hierarchy. The FS is responsible for unregistering the
   107  	// hierarchy on destruction, which removes this association.
   108  	fs *vfs.Filesystem
   109  }
   110  
   111  func (h *hierarchy) match(ctypes []CgroupControllerType) bool {
   112  	if len(ctypes) != len(h.controllers) {
   113  		return false
   114  	}
   115  	for _, ty := range ctypes {
   116  		if _, ok := h.controllers[ty]; !ok {
   117  			return false
   118  		}
   119  	}
   120  	return true
   121  }
   122  
   123  // cgroupFS is the public interface to cgroupfs. This lets the kernel package
   124  // refer to cgroupfs.filesystem methods without directly depending on the
   125  // cgroupfs package, which would lead to a circular dependency.
   126  type cgroupFS interface {
   127  	// Returns the vfs.Filesystem for the cgroupfs.
   128  	VFSFilesystem() *vfs.Filesystem
   129  
   130  	// InitializeHierarchyID sets the hierarchy ID for this filesystem during
   131  	// filesystem creation. May only be called before the filesystem is visible
   132  	// to the vfs layer.
   133  	InitializeHierarchyID(hid uint32)
   134  }
   135  
   136  // CgroupRegistry tracks the active set of cgroup controllers on the system.
   137  //
   138  // +stateify savable
   139  type CgroupRegistry struct {
   140  	// lastHierarchyID is the id of the last allocated cgroup hierarchy. Valid
   141  	// ids are from 1 to math.MaxUint32. Must be accessed through atomic ops.
   142  	//
   143  	lastHierarchyID uint32
   144  
   145  	mu sync.Mutex `state:"nosave"`
   146  
   147  	// controllers is the set of currently known cgroup controllers on the
   148  	// system. Protected by mu.
   149  	//
   150  	// +checklocks:mu
   151  	controllers map[CgroupControllerType]CgroupController
   152  
   153  	// hierarchies is the active set of cgroup hierarchies. Protected by mu.
   154  	//
   155  	// +checklocks:mu
   156  	hierarchies map[uint32]hierarchy
   157  }
   158  
   159  func newCgroupRegistry() *CgroupRegistry {
   160  	return &CgroupRegistry{
   161  		controllers: make(map[CgroupControllerType]CgroupController),
   162  		hierarchies: make(map[uint32]hierarchy),
   163  	}
   164  }
   165  
   166  // nextHierarchyID returns a newly allocated, unique hierarchy ID.
   167  func (r *CgroupRegistry) nextHierarchyID() (uint32, error) {
   168  	if hid := atomic.AddUint32(&r.lastHierarchyID, 1); hid != 0 {
   169  		return hid, nil
   170  	}
   171  	return InvalidCgroupHierarchyID, fmt.Errorf("cgroup hierarchy ID overflow")
   172  }
   173  
   174  // FindHierarchy returns a cgroup filesystem containing exactly the set of
   175  // controllers named in names. If no such FS is found, FindHierarchy return
   176  // nil. FindHierarchy takes a reference on the returned FS, which is transferred
   177  // to the caller.
   178  func (r *CgroupRegistry) FindHierarchy(ctypes []CgroupControllerType) *vfs.Filesystem {
   179  	r.mu.Lock()
   180  	defer r.mu.Unlock()
   181  
   182  	for _, h := range r.hierarchies {
   183  		if h.match(ctypes) {
   184  			if !h.fs.TryIncRef() {
   185  				// Racing with filesystem destruction, namely h.fs.Release.
   186  				// Since we hold r.mu, we know the hierarchy hasn't been
   187  				// unregistered yet, but its associated filesystem is tearing
   188  				// down.
   189  				//
   190  				// If we simply indicate the hierarchy wasn't found without
   191  				// cleaning up the registry, the caller can race with the
   192  				// unregister and find itself temporarily unable to create a new
   193  				// hierarchy with a subset of the relevant controllers.
   194  				//
   195  				// To keep the result of FindHierarchy consistent with the
   196  				// uniqueness of controllers enforced by Register, drop the
   197  				// dying hierarchy now. The eventual unregister by the FS
   198  				// teardown will become a no-op.
   199  				return nil
   200  			}
   201  			return h.fs
   202  		}
   203  	}
   204  
   205  	return nil
   206  }
   207  
   208  // Register registers the provided set of controllers with the registry as a new
   209  // hierarchy. If any controller is already registered, the function returns an
   210  // error without modifying the registry. Register sets the hierarchy ID for the
   211  // filesystem on success.
   212  func (r *CgroupRegistry) Register(cs []CgroupController, fs cgroupFS) error {
   213  	r.mu.Lock()
   214  	defer r.mu.Unlock()
   215  
   216  	if len(cs) == 0 {
   217  		return fmt.Errorf("can't register hierarchy with no controllers")
   218  	}
   219  
   220  	for _, c := range cs {
   221  		if _, ok := r.controllers[c.Type()]; ok {
   222  			return fmt.Errorf("controllers may only be mounted on a single hierarchy")
   223  		}
   224  	}
   225  
   226  	hid, err := r.nextHierarchyID()
   227  	if err != nil {
   228  		return err
   229  	}
   230  
   231  	// Must not fail below here, once we publish the hierarchy ID.
   232  
   233  	fs.InitializeHierarchyID(hid)
   234  
   235  	h := hierarchy{
   236  		id:          hid,
   237  		controllers: make(map[CgroupControllerType]CgroupController),
   238  		fs:          fs.VFSFilesystem(),
   239  	}
   240  	for _, c := range cs {
   241  		n := c.Type()
   242  		r.controllers[n] = c
   243  		h.controllers[n] = c
   244  	}
   245  	r.hierarchies[hid] = h
   246  	return nil
   247  }
   248  
   249  // Unregister removes a previously registered hierarchy from the registry. If no
   250  // such hierarchy is registered, Unregister is a no-op.
   251  func (r *CgroupRegistry) Unregister(hid uint32) {
   252  	r.mu.Lock()
   253  	r.unregisterLocked(hid)
   254  	r.mu.Unlock()
   255  }
   256  
   257  // Precondition: Caller must hold r.mu.
   258  // +checklocks:r.mu
   259  func (r *CgroupRegistry) unregisterLocked(hid uint32) {
   260  	if h, ok := r.hierarchies[hid]; ok {
   261  		for name, _ := range h.controllers {
   262  			delete(r.controllers, name)
   263  		}
   264  		delete(r.hierarchies, hid)
   265  	}
   266  }
   267  
   268  // computeInitialGroups takes a reference on each of the returned cgroups. The
   269  // caller takes ownership of this returned reference.
   270  func (r *CgroupRegistry) computeInitialGroups(inherit map[Cgroup]struct{}) map[Cgroup]struct{} {
   271  	r.mu.Lock()
   272  	defer r.mu.Unlock()
   273  
   274  	ctlSet := make(map[CgroupControllerType]CgroupController)
   275  	cgset := make(map[Cgroup]struct{})
   276  
   277  	// Remember controllers from the inherited cgroups set...
   278  	for cg, _ := range inherit {
   279  		cg.IncRef() // Ref transferred to caller.
   280  		for _, ctl := range cg.Controllers() {
   281  			ctlSet[ctl.Type()] = ctl
   282  			cgset[cg] = struct{}{}
   283  		}
   284  	}
   285  
   286  	// ... and add the root cgroups of all the missing controllers.
   287  	for name, ctl := range r.controllers {
   288  		if _, ok := ctlSet[name]; !ok {
   289  			cg := ctl.RootCgroup()
   290  			// Multiple controllers may share the same hierarchy, so may have
   291  			// the same root cgroup. Grab a single ref per hierarchy root.
   292  			if _, ok := cgset[cg]; ok {
   293  				continue
   294  			}
   295  			cg.IncRef() // Ref transferred to caller.
   296  			cgset[cg] = struct{}{}
   297  		}
   298  	}
   299  	return cgset
   300  }
   301  
   302  // GenerateProcCgroups writes the contents of /proc/cgroups to buf.
   303  func (r *CgroupRegistry) GenerateProcCgroups(buf *bytes.Buffer) {
   304  	r.mu.Lock()
   305  	entries := make([]string, 0, len(r.controllers))
   306  	for _, c := range r.controllers {
   307  		en := 0
   308  		if c.Enabled() {
   309  			en = 1
   310  		}
   311  		entries = append(entries, fmt.Sprintf("%s\t%d\t%d\t%d\n", c.Type(), c.HierarchyID(), c.NumCgroups(), en))
   312  	}
   313  	r.mu.Unlock()
   314  
   315  	sort.Strings(entries)
   316  	fmt.Fprint(buf, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n")
   317  	for _, e := range entries {
   318  		fmt.Fprint(buf, e)
   319  	}
   320  }