github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/kernel/cgroup.go (about) 1 // Copyright 2021 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package kernel 16 17 import ( 18 "bytes" 19 "fmt" 20 "sort" 21 22 "github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops" 23 "github.com/nicocha30/gvisor-ligolo/pkg/context" 24 "github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr" 25 "github.com/nicocha30/gvisor-ligolo/pkg/fspath" 26 "github.com/nicocha30/gvisor-ligolo/pkg/log" 27 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/kernfs" 28 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs" 29 ) 30 31 // InvalidCgroupHierarchyID indicates an uninitialized hierarchy ID. 32 const InvalidCgroupHierarchyID uint32 = 0 33 34 // InvalidCgroupID indicates an uninitialized cgroup ID. 35 const InvalidCgroupID uint32 = 0 36 37 // CgroupControllerType is the name of a cgroup controller. 38 type CgroupControllerType string 39 40 // Available cgroup controllers. 41 const ( 42 CgroupControllerCPU = CgroupControllerType("cpu") 43 CgroupControllerCPUAcct = CgroupControllerType("cpuacct") 44 CgroupControllerCPUSet = CgroupControllerType("cpuset") 45 CgroupControllerJob = CgroupControllerType("job") 46 CgroupControllerMemory = CgroupControllerType("memory") 47 CgroupControllerPIDs = CgroupControllerType("pids") 48 ) 49 50 // ParseCgroupController parses a string as a CgroupControllerType. 51 func ParseCgroupController(val string) (CgroupControllerType, error) { 52 switch val { 53 case "cpu": 54 return CgroupControllerCPU, nil 55 case "cpuacct": 56 return CgroupControllerCPUAcct, nil 57 case "cpuset": 58 return CgroupControllerCPUSet, nil 59 case "job": 60 return CgroupControllerJob, nil 61 case "memory": 62 return CgroupControllerMemory, nil 63 case "pids": 64 return CgroupControllerPIDs, nil 65 default: 66 return "", fmt.Errorf("no such cgroup controller") 67 } 68 } 69 70 // CgroupResourceType represents a resource type tracked by a particular 71 // controller. 72 type CgroupResourceType int 73 74 // Resources for the cpuacct controller. 75 const ( 76 // CgroupResourcePID represents a charge for pids.current. 77 CgroupResourcePID CgroupResourceType = iota 78 ) 79 80 // CgroupController is the common interface to cgroup controllers available to 81 // the entire sentry. The controllers themselves are defined by cgroupfs. 82 // 83 // Callers of this interface are often unable access synchronization needed to 84 // ensure returned values remain valid. Some of values returned from this 85 // interface are thus snapshots in time, and may become stale. This is ok for 86 // many callers like procfs. 87 type CgroupController interface { 88 // Returns the type of this cgroup controller (ex "memory", "cpu"). Returned 89 // value is valid for the lifetime of the controller. 90 Type() CgroupControllerType 91 92 // Hierarchy returns the ID of the hierarchy this cgroup controller is 93 // attached to. Returned value is valid for the lifetime of the controller. 94 HierarchyID() uint32 95 96 // EffectiveRootCgroup returns the effective root cgroup for this 97 // controller. This is either the actual root of the underlying cgroupfs 98 // filesystem, or the override root configured at sandbox startup. Returned 99 // value is valid for the lifetime of the controller. 100 EffectiveRootCgroup() Cgroup 101 102 // NumCgroups returns the number of cgroups managed by this controller. 103 // Returned value is a snapshot in time. 104 NumCgroups() uint64 105 106 // Enabled returns whether this controller is enabled. Returned value is a 107 // snapshot in time. 108 Enabled() bool 109 } 110 111 // Cgroup represents a named pointer to a cgroup in cgroupfs. When a task enters 112 // a cgroup, it holds a reference on the underlying dentry pointing to the 113 // cgroup. 114 // 115 // +stateify savable 116 type Cgroup struct { 117 *kernfs.Dentry 118 CgroupImpl 119 } 120 121 // decRef drops a reference on the cgroup. This must happen outside a Task.mu 122 // critical section. 123 func (c *Cgroup) decRef() { 124 c.Dentry.DecRef(context.Background()) 125 } 126 127 // Path returns the absolute path of c, relative to its hierarchy root. 128 func (c *Cgroup) Path() string { 129 return c.FSLocalPath() 130 } 131 132 // Walk returns the cgroup at p, starting from c. 133 func (c *Cgroup) Walk(ctx context.Context, vfsObj *vfs.VirtualFilesystem, p fspath.Path) (Cgroup, error) { 134 d, err := c.Dentry.WalkDentryTree(ctx, vfsObj, p) 135 if err != nil { 136 return Cgroup{}, err 137 } 138 return Cgroup{ 139 Dentry: d, 140 CgroupImpl: d.Inode().(CgroupImpl), 141 }, nil 142 } 143 144 // CgroupMigrationContext represents an in-flight cgroup migration for 145 // a single task. 146 type CgroupMigrationContext struct { 147 src Cgroup 148 dst Cgroup 149 t *Task 150 } 151 152 // Abort cancels a migration. 153 func (ctx *CgroupMigrationContext) Abort() { 154 ctx.dst.AbortMigrate(ctx.t, &ctx.src) 155 } 156 157 // Commit completes a migration. 158 func (ctx *CgroupMigrationContext) Commit() { 159 ctx.dst.CommitMigrate(ctx.t, &ctx.src) 160 161 ctx.t.mu.Lock() 162 delete(ctx.t.cgroups, ctx.src) 163 ctx.src.DecRef(ctx.t) 164 ctx.dst.IncRef() 165 ctx.t.cgroups[ctx.dst] = struct{}{} 166 ctx.t.mu.Unlock() 167 } 168 169 // CgroupImpl is the common interface to cgroups. 170 type CgroupImpl interface { 171 // Controllers lists the controller associated with this cgroup. 172 Controllers() []CgroupController 173 174 // HierarchyID returns the id of the hierarchy that contains this cgroup. 175 HierarchyID() uint32 176 177 // Name returns the name for this cgroup, if any. If no name was provided 178 // when the hierarchy was created, returns "". 179 Name() string 180 181 // Enter moves t into this cgroup. 182 Enter(t *Task) 183 184 // Leave moves t out of this cgroup. 185 Leave(t *Task) 186 187 // PrepareMigrate initiates a migration of t from src to this cgroup. See 188 // cgroupfs.controller.PrepareMigrate. 189 PrepareMigrate(t *Task, src *Cgroup) error 190 191 // CommitMigrate completes an in-flight migration. See 192 // cgroupfs.controller.CommitMigrate. 193 CommitMigrate(t *Task, src *Cgroup) 194 195 // AbortMigrate cancels an in-flight migration. See 196 // cgroupfs.controller.AbortMigrate. 197 AbortMigrate(t *Task, src *Cgroup) 198 199 // Charge charges a controller in this cgroup for a particular resource. key 200 // must match a valid resource for the specified controller type. 201 // 202 // The implementer should silently succeed if no matching controllers are 203 // found. 204 // 205 // The underlying implementaion will panic if passed an incompatible 206 // resource type for a given controller. 207 // 208 // See cgroupfs.controller.Charge. 209 Charge(t *Task, d *kernfs.Dentry, ctl CgroupControllerType, res CgroupResourceType, value int64) error 210 211 // ReadControlFromBackground allows a background context to read a cgroup's 212 // control values. 213 ReadControl(ctx context.Context, name string) (string, error) 214 215 // WriteControl allows a background context to write a cgroup's control 216 // values. 217 WriteControl(ctx context.Context, name string, val string) error 218 219 // ID returns the id of this cgroup. 220 ID() uint32 221 } 222 223 // hierarchy represents a cgroupfs filesystem instance, with a unique set of 224 // controllers attached to it. Multiple cgroupfs mounts may reference the same 225 // hierarchy. 226 // 227 // +stateify savable 228 type hierarchy struct { 229 id uint32 230 name string 231 // These are a subset of the controllers in CgroupRegistry.controllers, 232 // grouped here by hierarchy for conveninent lookup. 233 controllers map[CgroupControllerType]CgroupController 234 // fs is not owned by hierarchy. The FS is responsible for unregistering the 235 // hierarchy on destruction, which removes this association. 236 fs *vfs.Filesystem 237 } 238 239 func (h *hierarchy) match(ctypes []CgroupControllerType) bool { 240 if len(ctypes) != len(h.controllers) { 241 return false 242 } 243 for _, ty := range ctypes { 244 if _, ok := h.controllers[ty]; !ok { 245 return false 246 } 247 } 248 return true 249 } 250 251 // cgroupFS is the public interface to cgroupfs. This lets the kernel package 252 // refer to cgroupfs.filesystem methods without directly depending on the 253 // cgroupfs package, which would lead to a circular dependency. 254 type cgroupFS interface { 255 // Returns the vfs.Filesystem for the cgroupfs. 256 VFSFilesystem() *vfs.Filesystem 257 258 // InitializeHierarchyID sets the hierarchy ID for this filesystem during 259 // filesystem creation. May only be called before the filesystem is visible 260 // to the vfs layer. 261 InitializeHierarchyID(hid uint32) 262 263 // RootCgroup returns the root cgroup of this instance. This returns the 264 // actual root, and ignores any overrides setting an effective root. 265 RootCgroup() Cgroup 266 } 267 268 // CgroupRegistry tracks the active set of cgroup controllers on the system. 269 // 270 // +stateify savable 271 type CgroupRegistry struct { 272 // lastHierarchyID is the id of the last allocated cgroup hierarchy. Valid 273 // ids are from 1 to math.MaxUint32. 274 // 275 lastHierarchyID atomicbitops.Uint32 276 277 // lastCgroupID is the id of the last allocated cgroup. Valid ids are 278 // from 1 to math.MaxUint32. 279 // 280 lastCgroupID atomicbitops.Uint32 281 282 mu cgroupMutex `state:"nosave"` 283 284 // controllers is the set of currently known cgroup controllers on the 285 // system. 286 // 287 // +checklocks:mu 288 controllers map[CgroupControllerType]CgroupController 289 290 // hierarchies is the active set of cgroup hierarchies. This contains all 291 // hierarchies on the system. 292 // 293 // +checklocks:mu 294 hierarchies map[uint32]hierarchy 295 296 // hierarchiesByName is a map of named hierarchies. Only named hierarchies 297 // are tracked on this map. 298 // 299 // +checklocks:mu 300 hierarchiesByName map[string]hierarchy 301 302 // cgroups is the active set of cgroups. This contains all the cgroups 303 // on the system. 304 // 305 // +checklocks:mu 306 cgroups map[uint32]CgroupImpl 307 } 308 309 func newCgroupRegistry() *CgroupRegistry { 310 return &CgroupRegistry{ 311 controllers: make(map[CgroupControllerType]CgroupController), 312 hierarchies: make(map[uint32]hierarchy), 313 hierarchiesByName: make(map[string]hierarchy), 314 cgroups: make(map[uint32]CgroupImpl), 315 } 316 } 317 318 // nextHierarchyID returns a newly allocated, unique hierarchy ID. 319 func (r *CgroupRegistry) nextHierarchyID() (uint32, error) { 320 if hid := r.lastHierarchyID.Add(1); hid != 0 { 321 return hid, nil 322 } 323 return InvalidCgroupHierarchyID, fmt.Errorf("cgroup hierarchy ID overflow") 324 } 325 326 // FindHierarchy returns a cgroup filesystem containing exactly the set of 327 // controllers named in ctypes, and optionally the name specified in name if it 328 // isn't empty. If no such FS is found, FindHierarchy return nil. FindHierarchy 329 // takes a reference on the returned FS, which is transferred to the caller. 330 func (r *CgroupRegistry) FindHierarchy(name string, ctypes []CgroupControllerType) (*vfs.Filesystem, error) { 331 r.mu.Lock() 332 defer r.mu.Unlock() 333 334 // If we have a hierarchy name, lookup by name. 335 if name != "" { 336 h, ok := r.hierarchiesByName[name] 337 if !ok { 338 // Name not found. 339 return nil, nil 340 } 341 342 if h.match(ctypes) { 343 if !h.fs.TryIncRef() { 344 // May be racing with filesystem destruction, see below. 345 r.unregisterLocked(h.id) 346 return nil, nil 347 } 348 return h.fs, nil 349 } 350 351 // Name matched, but controllers didn't. Fail per linux 352 // kernel/cgroup.c:cgroup_mount(). 353 log.Debugf("cgroupfs: Registry lookup for name=%s controllers=%v failed; named matched but controllers didn't (have controllers=%v)", name, ctypes, h.controllers) 354 return nil, linuxerr.EBUSY 355 } 356 357 for _, h := range r.hierarchies { 358 if h.match(ctypes) { 359 if !h.fs.TryIncRef() { 360 // Racing with filesystem destruction, namely h.fs.Release. 361 // Since we hold r.mu, we know the hierarchy hasn't been 362 // unregistered yet, but its associated filesystem is tearing 363 // down. 364 // 365 // If we simply indicate the hierarchy wasn't found without 366 // cleaning up the registry, the caller can race with the 367 // unregister and find itself temporarily unable to create a new 368 // hierarchy with a subset of the relevant controllers. 369 // 370 // To keep the result of FindHierarchy consistent with the 371 // uniqueness of controllers enforced by Register, drop the 372 // dying hierarchy now. The eventual unregister by the FS 373 // teardown will become a no-op. 374 r.unregisterLocked(h.id) 375 return nil, nil 376 } 377 return h.fs, nil 378 } 379 } 380 381 return nil, nil 382 } 383 384 // FindCgroup locates a cgroup with the given parameters. 385 // 386 // A cgroup is considered a match even if it contains other controllers on the 387 // same hierarchy. 388 func (r *CgroupRegistry) FindCgroup(ctx context.Context, ctype CgroupControllerType, path string) (Cgroup, error) { 389 p := fspath.Parse(path) 390 if !p.Absolute { 391 return Cgroup{}, fmt.Errorf("path must be absolute") 392 } 393 k := KernelFromContext(ctx) 394 vfsfs, err := r.FindHierarchy("", []CgroupControllerType{ctype}) 395 if err != nil { 396 return Cgroup{}, err 397 } 398 if vfsfs == nil { 399 return Cgroup{}, fmt.Errorf("controller not active") 400 } 401 402 rootCG := vfsfs.Impl().(cgroupFS).RootCgroup() 403 404 if !p.HasComponents() { 405 // Explicit root '/'. 406 return rootCG, nil 407 } 408 409 return rootCG.Walk(ctx, k.VFS(), p) 410 } 411 412 // Register registers the provided set of controllers with the registry as a new 413 // hierarchy. If any controller is already registered, the function returns an 414 // error without modifying the registry. Register sets the hierarchy ID for the 415 // filesystem on success. 416 func (r *CgroupRegistry) Register(name string, cs []CgroupController, fs cgroupFS) error { 417 r.mu.Lock() 418 defer r.mu.Unlock() 419 420 if name == "" && len(cs) == 0 { 421 return fmt.Errorf("can't register hierarchy with both no controllers and no name") 422 } 423 424 for _, c := range cs { 425 if _, ok := r.controllers[c.Type()]; ok { 426 return fmt.Errorf("controllers may only be mounted on a single hierarchy") 427 } 428 } 429 430 if _, ok := r.hierarchiesByName[name]; name != "" && ok { 431 return fmt.Errorf("hierarchy named %q already exists", name) 432 } 433 434 hid, err := r.nextHierarchyID() 435 if err != nil { 436 return err 437 } 438 439 // Must not fail below here, once we publish the hierarchy ID. 440 441 fs.InitializeHierarchyID(hid) 442 443 h := hierarchy{ 444 id: hid, 445 name: name, 446 controllers: make(map[CgroupControllerType]CgroupController), 447 fs: fs.VFSFilesystem(), 448 } 449 for _, c := range cs { 450 n := c.Type() 451 r.controllers[n] = c 452 h.controllers[n] = c 453 } 454 r.hierarchies[hid] = h 455 if name != "" { 456 r.hierarchiesByName[name] = h 457 } 458 return nil 459 } 460 461 // Unregister removes a previously registered hierarchy from the registry. If no 462 // such hierarchy is registered, Unregister is a no-op. 463 func (r *CgroupRegistry) Unregister(hid uint32) { 464 r.mu.Lock() 465 r.unregisterLocked(hid) 466 r.mu.Unlock() 467 } 468 469 // Precondition: Caller must hold r.mu. 470 // +checklocks:r.mu 471 func (r *CgroupRegistry) unregisterLocked(hid uint32) { 472 if h, ok := r.hierarchies[hid]; ok { 473 for name := range h.controllers { 474 delete(r.controllers, name) 475 } 476 delete(r.hierarchies, hid) 477 } 478 } 479 480 // computeInitialGroups takes a reference on each of the returned cgroups. The 481 // caller takes ownership of this returned reference. 482 func (r *CgroupRegistry) computeInitialGroups(inherit map[Cgroup]struct{}) map[Cgroup]struct{} { 483 r.mu.Lock() 484 defer r.mu.Unlock() 485 486 ctlSet := make(map[CgroupControllerType]CgroupController) 487 cgset := make(map[Cgroup]struct{}) 488 489 // Remember controllers from the inherited cgroups set... 490 for cg := range inherit { 491 cg.IncRef() // Ref transferred to caller. 492 for _, ctl := range cg.Controllers() { 493 ctlSet[ctl.Type()] = ctl 494 cgset[cg] = struct{}{} 495 } 496 } 497 498 // ... and add the root cgroups of all the missing controllers. 499 for name, ctl := range r.controllers { 500 if _, ok := ctlSet[name]; !ok { 501 cg := ctl.EffectiveRootCgroup() 502 // Multiple controllers may share the same hierarchy, so may have 503 // the same root cgroup. Grab a single ref per hierarchy root. 504 if _, ok := cgset[cg]; ok { 505 continue 506 } 507 cg.IncRef() // Ref transferred to caller. 508 cgset[cg] = struct{}{} 509 } 510 } 511 return cgset 512 } 513 514 // GenerateProcCgroups writes the contents of /proc/cgroups to buf. 515 func (r *CgroupRegistry) GenerateProcCgroups(buf *bytes.Buffer) { 516 r.mu.Lock() 517 entries := make([]string, 0, len(r.controllers)) 518 for _, c := range r.controllers { 519 en := 0 520 if c.Enabled() { 521 en = 1 522 } 523 entries = append(entries, fmt.Sprintf("%s\t%d\t%d\t%d\n", c.Type(), c.HierarchyID(), c.NumCgroups(), en)) 524 } 525 r.mu.Unlock() 526 527 sort.Strings(entries) 528 fmt.Fprint(buf, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n") 529 for _, e := range entries { 530 fmt.Fprint(buf, e) 531 } 532 } 533 534 // NextCgroupID returns a newly allocated, unique cgroup ID. 535 func (r *CgroupRegistry) NextCgroupID() (uint32, error) { 536 if cid := r.lastCgroupID.Add(1); cid != 0 { 537 return cid, nil 538 } 539 return InvalidCgroupID, fmt.Errorf("cgroup ID overflow") 540 } 541 542 // AddCgroup adds the ID and cgroup in the map. 543 func (r *CgroupRegistry) AddCgroup(cg CgroupImpl) { 544 r.mu.Lock() 545 r.cgroups[cg.ID()] = cg 546 r.mu.Unlock() 547 } 548 549 // GetCgroup returns the cgroup associated with the cgroup ID. 550 func (r *CgroupRegistry) GetCgroup(cid uint32) (CgroupImpl, error) { 551 r.mu.Lock() 552 defer r.mu.Unlock() 553 cg, ok := r.cgroups[cid] 554 if !ok { 555 return nil, fmt.Errorf("cgroup with ID %d does not exist", cid) 556 } 557 return cg, nil 558 }