// github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fsimpl/cgroupfs/cgroupfs.go

// Copyright 2021 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package cgroupfs implements cgroupfs.
//
// A cgroup is a collection of tasks on the system, organized into a tree-like
// structure similar to a filesystem directory tree. In fact, each cgroup is
// represented by a directory on cgroupfs, and is manipulated through control
// files in the directory.
//
// All cgroups on a system are organized into hierarchies. Hierarchies are a
// distinct tree of cgroups, with a common set of controllers. One or more
// cgroupfs mounts may point to each hierarchy. These mounts provide a common
// view into the same tree of cgroups.
//
// A controller (also known as a "resource controller", or a cgroup "subsystem")
// determines the behaviour of each cgroup.
//
// In addition to cgroupfs, the kernel has a cgroup registry that tracks
// system-wide state related to cgroups such as active hierarchies and the
// controllers associated with them.
//
// Since cgroupfs doesn't allow hardlinks, there is a unique mapping between
// cgroupfs dentries and inodes.
//
// # Synchronization
//
// Cgroup hierarchy creation and destruction is protected by the
// kernel.CgroupRegistry.mu.
Once created, a hierarchy's set of controllers, the 41 // filesystem associated with it, and the root cgroup for the hierarchy are 42 // immutable. 43 // 44 // Membership of tasks within cgroups is protected by 45 // cgroupfs.filesystem.tasksMu. Tasks also maintain a set of all cgroups they're 46 // in, and this list is protected by Task.mu. 47 // 48 // Lock order: 49 // 50 // kernel.CgroupRegistry.mu 51 // cgroupfs.filesystem.mu 52 // kernel.TaskSet.mu 53 // kernel.Task.mu 54 // cgroupfs.filesystem.tasksMu. 55 package cgroupfs 56 57 import ( 58 "fmt" 59 "sort" 60 "strconv" 61 "strings" 62 63 "github.com/SagerNet/gvisor/pkg/abi/linux" 64 "github.com/SagerNet/gvisor/pkg/context" 65 "github.com/SagerNet/gvisor/pkg/errors/linuxerr" 66 "github.com/SagerNet/gvisor/pkg/sentry/fsimpl/kernfs" 67 "github.com/SagerNet/gvisor/pkg/sentry/kernel" 68 "github.com/SagerNet/gvisor/pkg/sentry/kernel/auth" 69 "github.com/SagerNet/gvisor/pkg/sentry/vfs" 70 "github.com/SagerNet/gvisor/pkg/sync" 71 ) 72 73 const ( 74 // Name is the default filesystem name. 75 Name = "cgroup" 76 readonlyFileMode = linux.FileMode(0444) 77 writableFileMode = linux.FileMode(0644) 78 defaultMaxCachedDentries = uint64(1000) 79 ) 80 81 const ( 82 controllerCPU = kernel.CgroupControllerType("cpu") 83 controllerCPUAcct = kernel.CgroupControllerType("cpuacct") 84 controllerCPUSet = kernel.CgroupControllerType("cpuset") 85 controllerJob = kernel.CgroupControllerType("job") 86 controllerMemory = kernel.CgroupControllerType("memory") 87 ) 88 89 var allControllers = []kernel.CgroupControllerType{ 90 controllerCPU, 91 controllerCPUAcct, 92 controllerCPUSet, 93 controllerJob, 94 controllerMemory, 95 } 96 97 // SupportedMountOptions is the set of supported mount options for cgroupfs. 98 var SupportedMountOptions = []string{"all", "cpu", "cpuacct", "cpuset", "job", "memory"} 99 100 // FilesystemType implements vfs.FilesystemType. 
101 // 102 // +stateify savable 103 type FilesystemType struct{} 104 105 // InternalData contains internal data passed in to the cgroupfs mount via 106 // vfs.GetFilesystemOptions.InternalData. 107 // 108 // +stateify savable 109 type InternalData struct { 110 DefaultControlValues map[string]int64 111 } 112 113 // filesystem implements vfs.FilesystemImpl and kernel.cgroupFS. 114 // 115 // +stateify savable 116 type filesystem struct { 117 kernfs.Filesystem 118 devMinor uint32 119 120 // hierarchyID is the id the cgroup registry assigns to this hierarchy. Has 121 // the value kernel.InvalidCgroupHierarchyID until the FS is fully 122 // initialized. 123 // 124 // hierarchyID is immutable after initialization. 125 hierarchyID uint32 126 127 // controllers and kcontrollers are both the list of controllers attached to 128 // this cgroupfs. Both lists are the same set of controllers, but typecast 129 // to different interfaces for convenience. Both must stay in sync, and are 130 // immutable. 131 controllers []controller 132 kcontrollers []kernel.CgroupController 133 134 numCgroups uint64 // Protected by atomic ops. 135 136 root *kernfs.Dentry 137 138 // tasksMu serializes task membership changes across all cgroups within a 139 // filesystem. 140 tasksMu sync.RWMutex `state:"nosave"` 141 } 142 143 // InitializeHierarchyID implements kernel.cgroupFS.InitializeHierarchyID. 144 func (fs *filesystem) InitializeHierarchyID(hid uint32) { 145 fs.hierarchyID = hid 146 } 147 148 // Name implements vfs.FilesystemType.Name. 149 func (FilesystemType) Name() string { 150 return Name 151 } 152 153 // Release implements vfs.FilesystemType.Release. 154 func (FilesystemType) Release(ctx context.Context) {} 155 156 // GetFilesystem implements vfs.FilesystemType.GetFilesystem. 
157 func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { 158 devMinor, err := vfsObj.GetAnonBlockDevMinor() 159 if err != nil { 160 return nil, nil, err 161 } 162 163 mopts := vfs.GenericParseMountOptions(opts.Data) 164 maxCachedDentries := defaultMaxCachedDentries 165 if str, ok := mopts["dentry_cache_limit"]; ok { 166 delete(mopts, "dentry_cache_limit") 167 maxCachedDentries, err = strconv.ParseUint(str, 10, 64) 168 if err != nil { 169 ctx.Warningf("sys.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str) 170 return nil, nil, linuxerr.EINVAL 171 } 172 } 173 174 var wantControllers []kernel.CgroupControllerType 175 if _, ok := mopts["cpu"]; ok { 176 delete(mopts, "cpu") 177 wantControllers = append(wantControllers, controllerCPU) 178 } 179 if _, ok := mopts["cpuacct"]; ok { 180 delete(mopts, "cpuacct") 181 wantControllers = append(wantControllers, controllerCPUAcct) 182 } 183 if _, ok := mopts["cpuset"]; ok { 184 delete(mopts, "cpuset") 185 wantControllers = append(wantControllers, controllerCPUSet) 186 } 187 if _, ok := mopts["job"]; ok { 188 delete(mopts, "job") 189 wantControllers = append(wantControllers, controllerJob) 190 } 191 if _, ok := mopts["memory"]; ok { 192 delete(mopts, "memory") 193 wantControllers = append(wantControllers, controllerMemory) 194 } 195 if _, ok := mopts["all"]; ok { 196 if len(wantControllers) > 0 { 197 ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: other controllers specified with all: %v", wantControllers) 198 return nil, nil, linuxerr.EINVAL 199 } 200 201 delete(mopts, "all") 202 wantControllers = allControllers 203 } 204 205 if len(wantControllers) == 0 { 206 // Specifying no controllers implies all controllers. 
207 wantControllers = allControllers 208 } 209 210 if len(mopts) != 0 { 211 ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: unknown options: %v", mopts) 212 return nil, nil, linuxerr.EINVAL 213 } 214 215 k := kernel.KernelFromContext(ctx) 216 r := k.CgroupRegistry() 217 218 // "It is not possible to mount the same controller against multiple 219 // cgroup hierarchies. For example, it is not possible to mount both 220 // the cpu and cpuacct controllers against one hierarchy, and to mount 221 // the cpu controller alone against another hierarchy." - man cgroups(7) 222 // 223 // Is there a hierarchy available with all the controllers we want? If so, 224 // this mount is a view into the same hierarchy. 225 // 226 // Note: we're guaranteed to have at least one requested controller, since 227 // no explicit controller name implies all controllers. 228 if vfsfs := r.FindHierarchy(wantControllers); vfsfs != nil { 229 fs := vfsfs.Impl().(*filesystem) 230 ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: mounting new view to hierarchy %v", fs.hierarchyID) 231 fs.root.IncRef() 232 return vfsfs, fs.root.VFSDentry(), nil 233 } 234 235 // No existing hierarchy with the exactly controllers found. Make a new 236 // one. Note that it's possible this mount creation is unsatisfiable, if one 237 // or more of the requested controllers are already on existing 238 // hierarchies. We'll find out about such collisions when we try to register 239 // the new hierarchy later. 
240 fs := &filesystem{ 241 devMinor: devMinor, 242 } 243 fs.MaxCachedDentries = maxCachedDentries 244 fs.VFSFilesystem().Init(vfsObj, &fsType, fs) 245 246 var defaults map[string]int64 247 if opts.InternalData != nil { 248 ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: default control values: %v", defaults) 249 defaults = opts.InternalData.(*InternalData).DefaultControlValues 250 } 251 252 for _, ty := range wantControllers { 253 var c controller 254 switch ty { 255 case controllerCPU: 256 c = newCPUController(fs, defaults) 257 case controllerCPUAcct: 258 c = newCPUAcctController(fs) 259 case controllerCPUSet: 260 c = newCPUSetController(fs) 261 case controllerJob: 262 c = newJobController(fs) 263 case controllerMemory: 264 c = newMemoryController(fs, defaults) 265 default: 266 panic(fmt.Sprintf("Unreachable: unknown cgroup controller %q", ty)) 267 } 268 fs.controllers = append(fs.controllers, c) 269 } 270 271 if len(defaults) != 0 { 272 // Internal data is always provided at sentry startup and unused values 273 // indicate a problem with the sandbox config. Fail fast. 274 panic(fmt.Sprintf("cgroupfs.FilesystemType.GetFilesystem: unknown internal mount data: %v", defaults)) 275 } 276 277 // Controllers usually appear in alphabetical order when displayed. Sort it 278 // here now, so it never needs to be sorted elsewhere. 279 sort.Slice(fs.controllers, func(i, j int) bool { return fs.controllers[i].Type() < fs.controllers[j].Type() }) 280 fs.kcontrollers = make([]kernel.CgroupController, 0, len(fs.controllers)) 281 for _, c := range fs.controllers { 282 fs.kcontrollers = append(fs.kcontrollers, c) 283 } 284 285 root := fs.newCgroupInode(ctx, creds) 286 var rootD kernfs.Dentry 287 rootD.InitRoot(&fs.Filesystem, root) 288 fs.root = &rootD 289 290 // Register controllers. The registry may be modified concurrently, so if we 291 // get an error, we raced with someone else who registered the same 292 // controllers first. 
293 if err := r.Register(fs.kcontrollers, fs); err != nil { 294 ctx.Infof("cgroupfs.FilesystemType.GetFilesystem: failed to register new hierarchy with controllers %v: %v", wantControllers, err) 295 rootD.DecRef(ctx) 296 fs.VFSFilesystem().DecRef(ctx) 297 return nil, nil, linuxerr.EBUSY 298 } 299 300 // Move all existing tasks to the root of the new hierarchy. 301 k.PopulateNewCgroupHierarchy(fs.rootCgroup()) 302 303 return fs.VFSFilesystem(), rootD.VFSDentry(), nil 304 } 305 306 func (fs *filesystem) rootCgroup() kernel.Cgroup { 307 return kernel.Cgroup{ 308 Dentry: fs.root, 309 CgroupImpl: fs.root.Inode().(kernel.CgroupImpl), 310 } 311 } 312 313 // Release implements vfs.FilesystemImpl.Release. 314 func (fs *filesystem) Release(ctx context.Context) { 315 k := kernel.KernelFromContext(ctx) 316 r := k.CgroupRegistry() 317 318 if fs.hierarchyID != kernel.InvalidCgroupHierarchyID { 319 k.ReleaseCgroupHierarchy(fs.hierarchyID) 320 r.Unregister(fs.hierarchyID) 321 } 322 323 fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) 324 fs.Filesystem.Release(ctx) 325 } 326 327 // MountOptions implements vfs.FilesystemImpl.MountOptions. 328 func (fs *filesystem) MountOptions() string { 329 var cnames []string 330 for _, c := range fs.controllers { 331 cnames = append(cnames, string(c.Type())) 332 } 333 return strings.Join(cnames, ",") 334 } 335 336 // +stateify savable 337 type implStatFS struct{} 338 339 // StatFS implements kernfs.Inode.StatFS. 340 func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { 341 return vfs.GenericStatFS(linux.CGROUP_SUPER_MAGIC), nil 342 } 343 344 // dir implements kernfs.Inode for a generic cgroup resource controller 345 // directory. Specific controllers extend this to add their own functionality. 
346 // 347 // +stateify savable 348 type dir struct { 349 dirRefs 350 kernfs.InodeAlwaysValid 351 kernfs.InodeAttrs 352 kernfs.InodeNotSymlink 353 kernfs.InodeDirectoryNoNewChildren // TODO(b/183137098): Implement mkdir. 354 kernfs.OrderedChildren 355 implStatFS 356 357 locks vfs.FileLocks 358 } 359 360 // Keep implements kernfs.Inode.Keep. 361 func (*dir) Keep() bool { 362 return true 363 } 364 365 // SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. 366 func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { 367 return linuxerr.EPERM 368 } 369 370 // Open implements kernfs.Inode.Open. 371 func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { 372 fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{ 373 SeekEnd: kernfs.SeekEndStaticEntries, 374 }) 375 if err != nil { 376 return nil, err 377 } 378 return fd.VFSFileDescription(), nil 379 } 380 381 // DecRef implements kernfs.Inode.DecRef. 382 func (d *dir) DecRef(ctx context.Context) { 383 d.dirRefs.DecRef(func() { d.Destroy(ctx) }) 384 } 385 386 // StatFS implements kernfs.Inode.StatFS. 387 func (d *dir) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) { 388 return vfs.GenericStatFS(linux.CGROUP_SUPER_MAGIC), nil 389 } 390 391 // controllerFile represents a generic control file that appears within a cgroup 392 // directory. 
393 // 394 // +stateify savable 395 type controllerFile struct { 396 kernfs.DynamicBytesFile 397 } 398 399 func (fs *filesystem) newControllerFile(ctx context.Context, creds *auth.Credentials, data vfs.DynamicBytesSource) kernfs.Inode { 400 f := &controllerFile{} 401 f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), data, readonlyFileMode) 402 return f 403 } 404 405 func (fs *filesystem) newControllerWritableFile(ctx context.Context, creds *auth.Credentials, data vfs.WritableDynamicBytesSource) kernfs.Inode { 406 f := &controllerFile{} 407 f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), data, writableFileMode) 408 return f 409 } 410 411 // staticControllerFile represents a generic control file that appears within a 412 // cgroup directory which always returns the same data when read. 413 // staticControllerFiles are not writable. 414 // 415 // +stateify savable 416 type staticControllerFile struct { 417 kernfs.DynamicBytesFile 418 vfs.StaticData 419 } 420 421 // Note: We let the caller provide the mode so that static files may be used to 422 // fake both readable and writable control files. However, static files are 423 // effectively readonly, as attempting to write to them will return EIO 424 // regardless of the mode. 425 func (fs *filesystem) newStaticControllerFile(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, data string) kernfs.Inode { 426 f := &staticControllerFile{StaticData: vfs.StaticData{Data: data}} 427 f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), f, mode) 428 return f 429 }