gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/fsimpl/cgroupfs/pids.go

// Copyright 2022 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package cgroupfs

import (
	"bytes"
	"fmt"
	"strings"

	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/usermem"
)

// pidMaxLimit is the maximum number of pids allowed on a 64-bit system. The
// practical limit is much lower. See Linux, include/linux/threads.h.
const pidMaxLimit = 4 * 1024 * 1024
const pidLimitUnlimited = pidMaxLimit + 1

// pidsController tracks how many pids are used by tasks in a cgroup. This is
// used to limit the number of tasks per cgroup. The limit is enforced only
// when new tasks are created via Fork/Clone. Task migrations and limit changes
// can cause the current number of pids to exceed the limit.
//
// A task can charge a PIDs cgroup in two ways:
//
//  1. A task created prior to the PIDs controller being enabled, or created
//     through kernel.CreateProcess (i.e. not from userspace), directly adds a
//     committed charge via the Enter method.
//
//  2. A task created through Task.Clone (i.e. userspace fork/clone) first adds
//     a pending charge through the Charge method. This is a temporary
//     reservation which ensures the cgroup has enough space to allow the task
//     to start. Once task startup succeeds, it calls Enter and consumes the
//     reservation.
//
// +stateify savable
type pidsController struct {
	controllerCommon

	// isRoot indicates if this is the root cgroup in its hierarchy. Immutable
	// since cgroupfs doesn't allow cross directory renames.
	isRoot bool

	// mu protects the fields below.
	mu pidsControllerMutex `state:"nosave"`

	// pendingTotal and pendingPool track the charges for processes starting
	// up. During startup, we check if PIDs are available by charging the
	// cgroup. However, the process actually joins the cgroup at a later point
	// via Enter. We keep a count of the charges we allocated via Charge, and
	// draw from this pool when Enter later commits a charge, so the charge
	// isn't double counted.
	//
	// We also track which task owns the pending charge so we can cancel the
	// charge if task creation fails after the Charge call.
	//
	// pendingTotal and pendingPool are both protected by mu.
	pendingTotal int64
	pendingPool  map[*kernel.Task]int64

	// committed represents charges for tasks that have already started and
	// called Enter. Protected by mu.
	committed int64

	// max is the PID limit for this cgroup. Protected by mu.
	max int64
}
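
// A minimal sketch of the two charge paths described above, assuming a
// *pidsController c, a *kernel.Task t that is being created, and a placeholder
// *kernfs.Dentry cgroupDentry for the task's cgroup directory:
//
//	// Userspace fork/clone: reserve first, commit on successful startup.
//	if err := c.Charge(t, cgroupDentry, kernel.CgroupResourcePID, 1); err != nil {
//		return err // EAGAIN: the cgroup is already at its pids.max limit.
//	}
//	// ... task startup ...
//	c.Enter(t) // Consumes the pending charge, making it committed.
//
//	// Tasks not created from userspace (e.g. via kernel.CreateProcess) skip
//	// the reservation and call Enter directly; no limit is enforced here.
//	c.Enter(t)
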
var _ controller = (*pidsController)(nil)

// newRootPIDsController creates the root node for a PIDs cgroup. Child
// directories should be created through Clone.
func newRootPIDsController(fs *filesystem) *pidsController {
	c := &pidsController{
		isRoot:      true,
		max:         pidLimitUnlimited,
		pendingPool: make(map[*kernel.Task]int64),
	}
	c.controllerCommon.init(kernel.CgroupControllerPIDs, fs)
	return c
}

// Clone implements controller.Clone.
func (c *pidsController) Clone() controller {
	c.mu.Lock()
	defer c.mu.Unlock()
	new := &pidsController{
		isRoot:      false,
		max:         pidLimitUnlimited,
		pendingPool: make(map[*kernel.Task]int64),
	}
	new.controllerCommon.cloneFromParent(c)
	return new
}

// AddControlFiles implements controller.AddControlFiles.
func (c *pidsController) AddControlFiles(ctx context.Context, creds *auth.Credentials, _ *cgroupInode, contents map[string]kernfs.Inode) {
	contents["pids.current"] = c.fs.newControllerFile(ctx, creds, &pidsCurrentData{c: c}, true)
	if !c.isRoot {
		// "This is not available in the root cgroup for obvious reasons" --
		// Linux, Documentation/cgroup-v1/pids.txt.
		contents["pids.max"] = c.fs.newControllerWritableFile(ctx, creds, &pidsMaxData{c: c}, true)
	}
}

// Enter implements controller.Enter.
//
// Enter attempts to commit a charge from the pending pool. If at least one
// charge is pending for t, one pending charge is converted to a committed
// charge, and the net change in total charges is zero. If no charge is
// pending, a new charge is added directly to the committed pool.
func (c *pidsController) Enter(t *kernel.Task) {
	c.mu.Lock()
	defer c.mu.Unlock()

	if pending, ok := c.pendingPool[t]; ok {
		if pending == 1 {
			delete(c.pendingPool, t)
		} else {
			c.pendingPool[t] = pending - 1
		}
		c.pendingTotal--
		if c.pendingTotal < 0 {
			panic(fmt.Sprintf("cgroupfs: pids controller has negative pending charge: %v\n", c.pendingTotal))
		}
	}

	// Either we're converting a pending charge from above, or generating a new
	// committed charge directly here. Either way, we don't enforce the limit
	// on Enter.
	c.committed++
}

// Leave implements controller.Leave.
func (c *pidsController) Leave(t *kernel.Task) {
	c.mu.Lock()
	defer c.mu.Unlock()

	if c.committed <= 0 {
		panic(fmt.Sprintf("cgroupfs: pids controller committed charge underflow on Leave for task %+v", t))
	}
	c.committed--
}

// PrepareMigrate implements controller.PrepareMigrate.
func (c *pidsController) PrepareMigrate(t *kernel.Task, src controller) error {
	srcC := src.(*pidsController)
	srcC.mu.Lock()
	defer srcC.mu.Unlock()

	if _, ok := srcC.pendingPool[t]; ok {
		// Migrating task isn't fully initialized, return transient failure.
		return linuxerr.EAGAIN
	}

	return nil
}

// CommitMigrate implements controller.CommitMigrate.
//
// Migrations can cause a cgroup to exceed its limit. CommitMigrate can only be
// called for tasks with committed charges; PrepareMigrate will deny migrations
// prior to Enter.
func (c *pidsController) CommitMigrate(t *kernel.Task, src controller) {
	// Note: The charge is allowed to exceed max on migration. The charge may
	// not exceed max when incurred due to a fork/clone, which will call
	// pidsController.Charge().
	c.mu.Lock()
	c.committed++
	c.mu.Unlock()

	srcC := src.(*pidsController)
	srcC.mu.Lock()
	if srcC.committed <= 0 {
		panic(fmt.Sprintf("cgroupfs: pids controller committed charge underflow on CommitMigrate for task %+v on the source cgroup", t))
	}
	srcC.committed--
	srcC.mu.Unlock()
}

// AbortMigrate implements controller.AbortMigrate.
func (c *pidsController) AbortMigrate(t *kernel.Task, src controller) {}
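
// A worked example of the migration accounting above: suppose the destination
// cgroup has max = 2 and committed = 2. Migrating a third task into it calls
// CommitMigrate, which raises the destination's committed count to 3 and
// lowers the source's, so pids.current can legitimately read 3 while pids.max
// reads 2. Only subsequent fork/clone attempts, which go through Charge below,
// are rejected with EAGAIN.
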
// Charge implements controller.Charge. This manipulates the pending pool.
// Charges are committed from the pending pool by Enter. The caller is
// responsible for ensuring negative charges correspond to previous positive
// charges. Negative charges that cause an underflow result in a panic.
func (c *pidsController) Charge(t *kernel.Task, d *kernfs.Dentry, res kernel.CgroupResourceType, value int64) error {
	if res != kernel.CgroupResourcePID {
		panic(fmt.Sprintf("cgroupfs: pids controller invalid resource type %v", res))
	}

	c.mu.Lock()
	defer c.mu.Unlock()

	// Negative charge.
	if value < 0 {
		if c.pendingTotal+value < 0 {
			panic(fmt.Sprintf("cgroupfs: pids controller pending pool would be negative if charge was allowed: current pool: %d, proposed charge: %d, path: %q, task: %p", c.pendingTotal, value, d.FSLocalPath(), t))
		}

		pending, ok := c.pendingPool[t]
		if !ok {
			panic(fmt.Sprintf("cgroupfs: pids controller attempted to remove pending charge for Task %p, but task didn't have pending charges, path: %q", t, d.FSLocalPath()))
		}
		if pending+value < 0 {
			panic(fmt.Sprintf("cgroupfs: pids controller attempted to remove pending charge for Task %p, but task didn't have enough pending charges; current charges: %d, proposed charge: %d, path: %q", t, pending, value, d.FSLocalPath()))
		}

		c.pendingPool[t] += value
		c.pendingTotal += value
		return nil
	}

	// Positive charge.
	new := c.committed + c.pendingTotal + value
	if new > c.max {
		log.Debugf("cgroupfs: pids controller charge denied due to limit: path: %q, requested: %d, current: %d (pending: %v, committed: %v), max: %v",
			d.FSLocalPath(), value, c.committed+c.pendingTotal, c.pendingTotal, c.committed, c.max)
		return linuxerr.EAGAIN
	}

	c.pendingPool[t] += value
	c.pendingTotal += value
	return nil
}
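
// Note that the positive-charge check above counts pending reservations as
// well as committed charges. For example, with max = 10, committed = 8 and
// pendingTotal = 2, a Charge of +1 is denied because 8 + 2 + 1 > 10, even
// though only 8 tasks have actually entered the cgroup.
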
// +stateify savable
type pidsCurrentData struct {
	c *pidsController
}

// Generate implements vfs.DynamicBytesSource.Generate.
func (d *pidsCurrentData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	d.c.mu.Lock()
	defer d.c.mu.Unlock()
	fmt.Fprintf(buf, "%d\n", d.c.committed+d.c.pendingTotal)
	return nil
}

// +stateify savable
type pidsMaxData struct {
	c *pidsController
}

// Generate implements vfs.DynamicBytesSource.Generate.
func (d *pidsMaxData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	d.c.mu.Lock()
	defer d.c.mu.Unlock()

	if d.c.max > pidMaxLimit {
		fmt.Fprintf(buf, "max\n")
	} else {
		fmt.Fprintf(buf, "%d\n", d.c.max)
	}

	return nil
}

// Write implements vfs.WritableDynamicBytesSource.Write.
func (d *pidsMaxData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) {
	return d.WriteBackground(ctx, src)
}

// WriteBackground implements writableControllerFileImpl.WriteBackground.
func (d *pidsMaxData) WriteBackground(ctx context.Context, src usermem.IOSequence) (int64, error) {
	buf := copyScratchBufferFromContext(ctx, hostarch.PageSize)
	ncpy, err := src.CopyIn(ctx, buf)
	if err != nil {
		return 0, err
	}
	// Only compare the bytes actually copied in; the scratch buffer may be
	// larger than the write and contain stale data past ncpy.
	if strings.TrimSpace(string(buf[:ncpy])) == "max" {
		d.c.mu.Lock()
		defer d.c.mu.Unlock()
		d.c.max = pidLimitUnlimited
		return int64(ncpy), nil
	}

	val, n, err := parseInt64FromString(ctx, src)
	if err != nil {
		return 0, linuxerr.EINVAL
	}
	if val < 0 || val > pidMaxLimit {
		return 0, linuxerr.EINVAL
	}

	d.c.mu.Lock()
	defer d.c.mu.Unlock()
	d.c.max = val
	return int64(n), nil
}
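
// To summarize the write handling above: writing the literal string "max" to
// pids.max sets the limit to pidLimitUnlimited (read back as "max" by
// Generate), writing an integer in [0, pidMaxLimit] sets the limit to that
// value, and any other input fails with EINVAL.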