github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/fsimpl/proc/tasks.go (about) 1 // Copyright 2019 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package proc 16 17 import ( 18 "bytes" 19 "sort" 20 "strconv" 21 22 "github.com/metacubex/gvisor/pkg/abi/linux" 23 "github.com/metacubex/gvisor/pkg/context" 24 "github.com/metacubex/gvisor/pkg/errors/linuxerr" 25 "github.com/metacubex/gvisor/pkg/sentry/fsimpl/kernfs" 26 "github.com/metacubex/gvisor/pkg/sentry/kernel" 27 "github.com/metacubex/gvisor/pkg/sentry/kernel/auth" 28 "github.com/metacubex/gvisor/pkg/sentry/vfs" 29 ) 30 31 const ( 32 selfName = "self" 33 threadSelfName = "thread-self" 34 ) 35 36 // tasksInode represents the inode for /proc/ directory. 37 // 38 // +stateify savable 39 type tasksInode struct { 40 implStatFS 41 kernfs.InodeAlwaysValid 42 kernfs.InodeAttrs 43 kernfs.InodeDirectoryNoNewChildren 44 kernfs.InodeNotAnonymous 45 kernfs.InodeNotSymlink 46 kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid. 47 kernfs.InodeWatches 48 kernfs.OrderedChildren 49 tasksInodeRefs 50 51 locks vfs.FileLocks 52 53 fs *filesystem 54 pidns *kernel.PIDNamespace 55 56 // '/proc/self' and '/proc/thread-self' have custom directory offsets in 57 // Linux. So handle them outside of OrderedChildren. 58 59 // fakeCgroupControllers is a map of controller name to directory in the 60 // cgroup hierarchy. These controllers are immutable and will be listed 61 // in /proc/pid/cgroup if not nil. 62 fakeCgroupControllers map[string]string 63 } 64 65 var _ kernfs.Inode = (*tasksInode)(nil) 66 67 func (fs *filesystem) newTasksInode(ctx context.Context, k *kernel.Kernel, pidns *kernel.PIDNamespace, fakeCgroupControllers map[string]string) *tasksInode { 68 root := auth.NewRootCredentials(pidns.UserNamespace()) 69 contents := map[string]kernfs.Inode{ 70 "cmdline": fs.newInode(ctx, root, 0444, &cmdLineData{}), 71 "cpuinfo": fs.newInode(ctx, root, 0444, newStaticFileSetStat(cpuInfoData(k))), 72 "filesystems": fs.newInode(ctx, root, 0444, &filesystemsData{}), 73 "loadavg": fs.newInode(ctx, root, 0444, &loadavgData{}), 74 "sys": fs.newSysDir(ctx, root, k), 75 "bus": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{}), 76 "fs": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{}), 77 "irq": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{}), 78 "meminfo": fs.newInode(ctx, root, 0444, &meminfoData{}), 79 "mounts": kernfs.NewStaticSymlink(ctx, root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/mounts"), 80 "net": kernfs.NewStaticSymlink(ctx, root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/net"), 81 "sentry-meminfo": fs.newInode(ctx, root, 0444, &sentryMeminfoData{}), 82 "stat": fs.newInode(ctx, root, 0444, &statData{}), 83 "sysrq-trigger": fs.newInode(ctx, root, 0200, newStaticFile("")), 84 "uptime": fs.newInode(ctx, root, 0444, &uptimeData{}), 85 "version": fs.newInode(ctx, root, 0444, &versionData{}), 86 } 87 // If fakeCgroupControllers are provided, don't create a cgroupfs backed 88 // /proc/cgroup as it will not match the fake controllers. 89 if len(fakeCgroupControllers) == 0 { 90 contents["cgroups"] = fs.newInode(ctx, root, 0444, &cgroupsData{}) 91 } 92 93 inode := &tasksInode{ 94 pidns: pidns, 95 fs: fs, 96 fakeCgroupControllers: fakeCgroupControllers, 97 } 98 inode.InodeAttrs.Init(ctx, root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) 99 inode.InitRefs() 100 101 inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) 102 links := inode.OrderedChildren.Populate(contents) 103 inode.IncLinks(links) 104 105 return inode 106 } 107 108 // Lookup implements kernfs.inodeDirectory.Lookup. 109 func (i *tasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { 110 // Check if a static entry was looked up. 111 if d, err := i.OrderedChildren.Lookup(ctx, name); err == nil { 112 return d, nil 113 } 114 115 // Not a static entry. Try to lookup a corresponding task. 116 tid, err := strconv.ParseUint(name, 10, 64) 117 if err != nil { 118 root := auth.NewRootCredentials(i.pidns.UserNamespace()) 119 // If it failed to parse, check if it's one of the special handled files. 120 switch name { 121 case selfName: 122 return i.newSelfSymlink(ctx, root), nil 123 case threadSelfName: 124 return i.newThreadSelfSymlink(ctx, root), nil 125 } 126 return nil, linuxerr.ENOENT 127 } 128 129 task := i.pidns.TaskWithID(kernel.ThreadID(tid)) 130 if task == nil { 131 return nil, linuxerr.ENOENT 132 } 133 134 return i.fs.newTaskInode(ctx, task, i.pidns, true, i.fakeCgroupControllers) 135 } 136 137 // IterDirents implements kernfs.inodeDirectory.IterDirents. 138 func (i *tasksInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) { 139 // fs/proc/internal.h: #define FIRST_PROCESS_ENTRY 256 140 const FIRST_PROCESS_ENTRY = 256 141 142 // Use maxTaskID to shortcut searches that will result in 0 entries. 143 const maxTaskID = kernel.TasksLimit + 1 144 if offset >= maxTaskID { 145 return offset, nil 146 } 147 148 // According to Linux (fs/proc/base.c:proc_pid_readdir()), process directories 149 // start at offset FIRST_PROCESS_ENTRY with '/proc/self', followed by 150 // '/proc/thread-self' and then '/proc/[pid]'. 151 if offset < FIRST_PROCESS_ENTRY { 152 offset = FIRST_PROCESS_ENTRY 153 } 154 155 if offset == FIRST_PROCESS_ENTRY { 156 dirent := vfs.Dirent{ 157 Name: selfName, 158 Type: linux.DT_LNK, 159 Ino: i.fs.NextIno(), 160 NextOff: offset + 1, 161 } 162 if err := cb.Handle(dirent); err != nil { 163 return offset, err 164 } 165 offset++ 166 } 167 if offset == FIRST_PROCESS_ENTRY+1 { 168 dirent := vfs.Dirent{ 169 Name: threadSelfName, 170 Type: linux.DT_LNK, 171 Ino: i.fs.NextIno(), 172 NextOff: offset + 1, 173 } 174 if err := cb.Handle(dirent); err != nil { 175 return offset, err 176 } 177 offset++ 178 } 179 180 // Collect all tasks that TGIDs are greater than the offset specified. Per 181 // Linux we only include in directory listings if it's the leader. But for 182 // whatever crazy reason, you can still walk to the given node. 183 var tids []int 184 startTid := offset - FIRST_PROCESS_ENTRY - 2 185 for _, tg := range i.pidns.ThreadGroups() { 186 tid := i.pidns.IDOfThreadGroup(tg) 187 if int64(tid) < startTid { 188 continue 189 } 190 if leader := tg.Leader(); leader != nil { 191 tids = append(tids, int(tid)) 192 } 193 } 194 195 if len(tids) == 0 { 196 return offset, nil 197 } 198 199 sort.Ints(tids) 200 for _, tid := range tids { 201 dirent := vfs.Dirent{ 202 Name: strconv.FormatUint(uint64(tid), 10), 203 Type: linux.DT_DIR, 204 Ino: i.fs.NextIno(), 205 NextOff: FIRST_PROCESS_ENTRY + 2 + int64(tid) + 1, 206 } 207 if err := cb.Handle(dirent); err != nil { 208 return offset, err 209 } 210 offset++ 211 } 212 return maxTaskID, nil 213 } 214 215 // Open implements kernfs.Inode.Open. 216 func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { 217 fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ 218 SeekEnd: kernfs.SeekEndZero, 219 }) 220 if err != nil { 221 return nil, err 222 } 223 return fd.VFSFileDescription(), nil 224 } 225 226 func (i *tasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { 227 stat, err := i.InodeAttrs.Stat(ctx, vsfs, opts) 228 if err != nil { 229 return linux.Statx{}, err 230 } 231 232 if opts.Mask&linux.STATX_NLINK != 0 { 233 // Add dynamic children to link count. 234 for _, tg := range i.pidns.ThreadGroups() { 235 if leader := tg.Leader(); leader != nil { 236 stat.Nlink++ 237 } 238 } 239 } 240 241 return stat, nil 242 } 243 244 // DecRef implements kernfs.Inode.DecRef. 245 func (i *tasksInode) DecRef(ctx context.Context) { 246 i.tasksInodeRefs.DecRef(func() { i.Destroy(ctx) }) 247 } 248 249 // staticFileSetStat implements a special static file that allows inode 250 // attributes to be set. This is to support /proc files that are readonly, but 251 // allow attributes to be set. 252 // 253 // +stateify savable 254 type staticFileSetStat struct { 255 dynamicBytesFileSetAttr 256 vfs.StaticData 257 } 258 259 var _ dynamicInode = (*staticFileSetStat)(nil) 260 261 func newStaticFileSetStat(data string) *staticFileSetStat { 262 return &staticFileSetStat{StaticData: vfs.StaticData{Data: data}} 263 } 264 265 func cpuInfoData(k *kernel.Kernel) string { 266 features := k.FeatureSet() 267 var buf bytes.Buffer 268 for i, max := uint(0), k.ApplicationCores(); i < max; i++ { 269 features.WriteCPUInfoTo(i, &buf) 270 } 271 return buf.String() 272 } 273 274 func ipcData(v uint64) dynamicInode { 275 return newStaticFile(strconv.FormatUint(v, 10)) 276 }