gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/fsimpl/cgroupfs/pids.go

// Copyright 2022 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package cgroupfs

import (
	"bytes"
	"fmt"
	"strings"

	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/usermem"
)

// pidMaxLimit is the maximum number of pids allowed on a 64-bit system. The
// practical limit is much lower. See Linux, include/linux/threads.h.
const pidMaxLimit = 4 * 1024 * 1024
const pidLimitUnlimited = pidMaxLimit + 1

// pidsController tracks how many pids are used by tasks in a cgroup. This is
// used to limit the number of tasks per cgroup. The limit is enforced only
// when new tasks are created via Fork/Clone. Task migrations and limit changes
// can cause the current number of pids to exceed the limit.
//
// A task can charge a PIDs cgroup in two ways:
//
//  1. A task created prior to the PIDs controller being enabled, or created
//     through kernel.CreateProcess (i.e. not from userspace), directly adds a
//     committed charge via the Enter method.
//
//  2. A task created through Task.Clone (i.e. userspace fork/clone) first adds
//     a pending charge through the Charge method. This is a temporary
//     reservation which ensures the cgroup has enough space to allow the task
//     to start. Once the task startup succeeds, it calls Enter and consumes
//     the reservation.
//
// +stateify savable
type pidsController struct {
	controllerCommon

	// isRoot indicates whether this is the root cgroup in its hierarchy.
	// Immutable since cgroupfs doesn't allow cross-directory renames.
	isRoot bool

	// mu protects the fields below.
	mu pidsControllerMutex `state:"nosave"`

	// pendingTotal and pendingPool track the charges for processes starting
	// up. During startup, we check whether PIDs are available by charging the
	// cgroup. However, the process actually joins the cgroup at a later point
	// via Enter. We keep a count of the charges allocated via Charge, and use
	// this pool to convert pending charges into committed ones in Enter
	// rather than double counting them.
	//
	// We also track which task owns the pending charge so we can cancel the
	// charge if task creation fails after the Charge call.
	//
	// pendingTotal and pendingPool are both protected by mu.
	pendingTotal int64
	pendingPool  map[*kernel.Task]int64

	// committed represents the charges for tasks that have already started
	// and called Enter. Protected by mu.
	committed int64

	// max is the PID limit for this cgroup. Protected by mu.
	max int64
}
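
// For illustration only (not part of the original file), the fork/clone
// charging lifecycle described above roughly plays out as below; t, d,
// startTask, and newTask are assumed stand-ins for the caller's task, the
// cgroup's dentry, the task-creation step, and the created task:
//
//	// Reserve a pending charge before the new task exists.
//	if err := c.Charge(t, d, kernel.CgroupResourcePID, 1); err != nil {
//		return err // EAGAIN: the cgroup is already at its pids.max limit.
//	}
//	if err := startTask(); err != nil {
//		// Startup failed; release the reservation with a matching -1.
//		c.Charge(t, d, kernel.CgroupResourcePID, -1)
//		return err
//	}
//	// Startup succeeded; Enter converts one pending charge to committed.
//	c.Enter(newTask)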

var _ controller = (*pidsController)(nil)

// newRootPIDsController creates the root node for a PIDs cgroup. Child
// directories should be created through Clone.
func newRootPIDsController(fs *filesystem) *pidsController {
	c := &pidsController{
		isRoot:      true,
		max:         pidLimitUnlimited,
		pendingPool: make(map[*kernel.Task]int64),
	}
	c.controllerCommon.init(kernel.CgroupControllerPIDs, fs)
	return c
}

// Clone implements controller.Clone.
func (c *pidsController) Clone() controller {
	c.mu.Lock()
	defer c.mu.Unlock()
	newC := &pidsController{
		isRoot:      false,
		max:         pidLimitUnlimited,
		pendingPool: make(map[*kernel.Task]int64),
	}
	newC.controllerCommon.cloneFromParent(c)
	return newC
}

// AddControlFiles implements controller.AddControlFiles.
func (c *pidsController) AddControlFiles(ctx context.Context, creds *auth.Credentials, _ *cgroupInode, contents map[string]kernfs.Inode) {
	contents["pids.current"] = c.fs.newControllerFile(ctx, creds, &pidsCurrentData{c: c}, true)
	if !c.isRoot {
		// "This is not available in the root cgroup for obvious reasons" --
		// Linux, Documentation/cgroup-v1/pids.txt.
		contents["pids.max"] = c.fs.newControllerWritableFile(ctx, creds, &pidsMaxData{c: c}, true)
	}
}
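
// Illustrative layout (not part of the original file): a pids cgroup
// directory inside the sandbox then exposes, for example:
//
//	/sys/fs/cgroup/pids/<group>/pids.current  // current charge, read-only
//	/sys/fs/cgroup/pids/<group>/pids.max      // limit; writable, absent at the root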

// Enter implements controller.Enter.
//
// Enter attempts to commit a charge from the pending pool. If at least one
// charge is pending for t, one pending charge is converted to a committed
// charge, and the net change in total charges is zero. If no charge is
// pending, a new charge is added directly to the committed pool.
func (c *pidsController) Enter(t *kernel.Task) {
	c.mu.Lock()
	defer c.mu.Unlock()

	if pending, ok := c.pendingPool[t]; ok {
		if pending == 1 {
			delete(c.pendingPool, t)
		} else {
			c.pendingPool[t] = pending - 1
		}
		c.pendingTotal--
		if c.pendingTotal < 0 {
			panic(fmt.Sprintf("cgroupfs: pids controller has negative pending charge: %v\n", c.pendingTotal))
		}
	}

	// Either we're converting a pending charge from above, or generating a new
	// committed charge directly here. Either way, we don't enforce the limit
	// on Enter.
	c.committed++
}

// Leave implements controller.Leave.
func (c *pidsController) Leave(t *kernel.Task) {
	c.mu.Lock()
	defer c.mu.Unlock()

	if c.committed <= 0 {
		panic(fmt.Sprintf("cgroupfs: pids controller committed charge underflow on Leave for task %+v", t))
	}
	c.committed--
}

// PrepareMigrate implements controller.PrepareMigrate.
func (c *pidsController) PrepareMigrate(t *kernel.Task, src controller) error {
	srcC := src.(*pidsController)
	srcC.mu.Lock()
	defer srcC.mu.Unlock()

	if _, ok := srcC.pendingPool[t]; ok {
		// The migrating task isn't fully initialized yet; return a transient
		// failure.
		return linuxerr.EAGAIN
	}

	return nil
}

// CommitMigrate implements controller.CommitMigrate.
//
// Migrations can cause a cgroup to exceed its limit. CommitMigrate can only be
// called for tasks with committed charges; PrepareMigrate denies migration for
// tasks that haven't yet called Enter.
func (c *pidsController) CommitMigrate(t *kernel.Task, src controller) {
	// Note: The charge is allowed to exceed max on migration. The charge may
	// not exceed max when incurred due to a fork/clone, which will call
	// pidsController.Charge().
	c.mu.Lock()
	c.committed++
	c.mu.Unlock()

	srcC := src.(*pidsController)
	srcC.mu.Lock()
	if srcC.committed <= 0 {
		panic(fmt.Sprintf("cgroupfs: pids controller committed charge underflow on CommitMigrate for task %+v on the source cgroup", t))
	}
	srcC.committed--
	srcC.mu.Unlock()
}
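
// Putting the migration hooks together (an illustrative sketch using only the
// methods defined in this file; dst and src are assumed placeholder
// controllers):
//
//	if err := dst.PrepareMigrate(t, src); err != nil {
//		return err // e.g. EAGAIN while t still holds a pending charge.
//	}
//	// On success, the migration machinery then calls either:
//	dst.CommitMigrate(t, src) // move t's committed charge from src to dst
//	// ...or, if the migration is rolled back:
//	dst.AbortMigrate(t, src) // a no-op for the pids controller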

// AbortMigrate implements controller.AbortMigrate.
func (c *pidsController) AbortMigrate(t *kernel.Task, src controller) {}

// Charge implements controller.Charge. This manipulates the pending pool.
// Charges are committed from the pending pool by Enter. The caller is
// responsible for ensuring negative charges correspond to previous positive
// charges. Negative charges that cause an underflow result in a panic.
func (c *pidsController) Charge(t *kernel.Task, d *kernfs.Dentry, res kernel.CgroupResourceType, value int64) error {
	if res != kernel.CgroupResourcePID {
		panic(fmt.Sprintf("cgroupfs: pids controller invalid resource type %v", res))
	}

	c.mu.Lock()
	defer c.mu.Unlock()

	// Negative charge.
	if value < 0 {
		if c.pendingTotal+value < 0 {
			panic(fmt.Sprintf("cgroupfs: pids controller pending pool would be negative if charge was allowed: current pool: %d, proposed charge: %d, path: %q, task: %p", c.pendingTotal, value, d.FSLocalPath(), t))
		}

		pending, ok := c.pendingPool[t]
		if !ok {
			panic(fmt.Sprintf("cgroupfs: pids controller attempted to remove pending charge for Task %p, but task didn't have pending charges, path: %q", t, d.FSLocalPath()))
		}
		if pending+value < 0 {
			panic(fmt.Sprintf("cgroupfs: pids controller attempted to remove pending charge for Task %p, but task didn't have enough pending charges; current charges: %d, proposed charge: %d, path: %q", t, pending, value, d.FSLocalPath()))
		}

		c.pendingPool[t] += value
		c.pendingTotal += value
		return nil
	}

	// Positive charge.
	newTotal := c.committed + c.pendingTotal + value
	if newTotal > c.max {
		log.Debugf("cgroupfs: pids controller charge denied due to limit: path: %q, requested: %d, current: %d (pending: %v, committed: %v), max: %v",
			d.FSLocalPath(), value, c.committed+c.pendingTotal, c.pendingTotal, c.committed, c.max)
		return linuxerr.EAGAIN
	}

	c.pendingPool[t] += value
	c.pendingTotal += value
	return nil
}
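
// As a worked example (illustrative, not part of the original file): with
// max = 3, committed = 2, and pendingTotal = 0, a Charge of +1 succeeds
// (2+0+1 <= 3) and raises pendingTotal to 1; a second Charge of +1 is denied
// with EAGAIN (2+1+1 > 3). If the first task's startup then fails, a Charge
// of -1 for the same task releases the reservation.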

// +stateify savable
type pidsCurrentData struct {
	c *pidsController
}

// Generate implements vfs.DynamicBytesSource.Generate.
func (d *pidsCurrentData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	d.c.mu.Lock()
	defer d.c.mu.Unlock()
	fmt.Fprintf(buf, "%d\n", d.c.committed+d.c.pendingTotal)
	return nil
}

// +stateify savable
type pidsMaxData struct {
	c *pidsController
}

// Generate implements vfs.DynamicBytesSource.Generate.
func (d *pidsMaxData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	d.c.mu.Lock()
	defer d.c.mu.Unlock()

	if d.c.max > pidMaxLimit {
		fmt.Fprintf(buf, "max\n")
	} else {
		fmt.Fprintf(buf, "%d\n", d.c.max)
	}

	return nil
}
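
// Illustrative reads (not part of the original file): with max ==
// pidLimitUnlimited, reading pids.max yields "max\n"; after a successful
// write of "42", it yields "42\n". Reading pids.current always yields the
// combined pending and committed charge, e.g. "3\n".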

// Write implements vfs.WritableDynamicBytesSource.Write.
func (d *pidsMaxData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) {
	return d.WriteBackground(ctx, src)
}

// WriteBackground implements writableControllerFileImpl.WriteBackground.
func (d *pidsMaxData) WriteBackground(ctx context.Context, src usermem.IOSequence) (int64, error) {
	buf := copyScratchBufferFromContext(ctx, hostarch.PageSize)
	ncpy, err := src.CopyIn(ctx, buf)
	if err != nil {
		return 0, err
	}
	// Only the ncpy bytes actually copied in are valid; the rest of the
	// scratch buffer may contain stale data.
	if strings.TrimSpace(string(buf[:ncpy])) == "max" {
		d.c.mu.Lock()
		defer d.c.mu.Unlock()
		d.c.max = pidLimitUnlimited
		return int64(ncpy), nil
	}

	val, n, err := parseInt64FromString(ctx, src)
	if err != nil {
		return 0, linuxerr.EINVAL
	}
	if val < 0 || val > pidMaxLimit {
		return 0, linuxerr.EINVAL
	}

	d.c.mu.Lock()
	defer d.c.mu.Unlock()
	d.c.max = val
	return int64(n), nil
}
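
// Illustrative write semantics (not part of the original file): writing "max"
// lifts the limit, a decimal value in [0, pidMaxLimit] sets it, and anything
// else fails with EINVAL. For example, from inside the sandbox:
//
//	echo max > pids.max   # max = pidLimitUnlimited
//	echo 42 > pids.max    # max = 42
//	echo -1 > pids.max    # rejected with EINVAL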