github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fsimpl/proc/task_files.go (about) 1 // Copyright 2019 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package proc 16 17 import ( 18 "bytes" 19 "fmt" 20 "io" 21 22 "github.com/SagerNet/gvisor/pkg/abi/linux" 23 "github.com/SagerNet/gvisor/pkg/context" 24 "github.com/SagerNet/gvisor/pkg/errors/linuxerr" 25 "github.com/SagerNet/gvisor/pkg/hostarch" 26 "github.com/SagerNet/gvisor/pkg/safemem" 27 "github.com/SagerNet/gvisor/pkg/sentry/fsbridge" 28 "github.com/SagerNet/gvisor/pkg/sentry/fsimpl/kernfs" 29 "github.com/SagerNet/gvisor/pkg/sentry/kernel" 30 "github.com/SagerNet/gvisor/pkg/sentry/kernel/auth" 31 "github.com/SagerNet/gvisor/pkg/sentry/limits" 32 "github.com/SagerNet/gvisor/pkg/sentry/mm" 33 "github.com/SagerNet/gvisor/pkg/sentry/usage" 34 "github.com/SagerNet/gvisor/pkg/sentry/vfs" 35 "github.com/SagerNet/gvisor/pkg/sync" 36 "github.com/SagerNet/gvisor/pkg/syserror" 37 "github.com/SagerNet/gvisor/pkg/usermem" 38 ) 39 40 // "There is an (arbitrary) limit on the number of lines in the file. As at 41 // Linux 3.18, the limit is five lines." - user_namespaces(7) 42 const maxIDMapLines = 5 43 44 // mm gets the kernel task's MemoryManager. No additional reference is taken on 45 // mm here. This is safe because MemoryManager.destroy is required to leave the 46 // MemoryManager in a state where it's still usable as a DynamicBytesSource. 47 func getMM(task *kernel.Task) *mm.MemoryManager { 48 var tmm *mm.MemoryManager 49 task.WithMuLocked(func(t *kernel.Task) { 50 if mm := t.MemoryManager(); mm != nil { 51 tmm = mm 52 } 53 }) 54 return tmm 55 } 56 57 // getMMIncRef returns t's MemoryManager. If getMMIncRef succeeds, the 58 // MemoryManager's users count is incremented, and must be decremented by the 59 // caller when it is no longer in use. 60 func getMMIncRef(task *kernel.Task) (*mm.MemoryManager, error) { 61 var m *mm.MemoryManager 62 task.WithMuLocked(func(t *kernel.Task) { 63 m = t.MemoryManager() 64 }) 65 if m == nil || !m.IncUsers() { 66 return nil, io.EOF 67 } 68 return m, nil 69 } 70 71 func checkTaskState(t *kernel.Task) error { 72 switch t.ExitState() { 73 case kernel.TaskExitZombie: 74 return linuxerr.EACCES 75 case kernel.TaskExitDead: 76 return syserror.ESRCH 77 } 78 return nil 79 } 80 81 type bufferWriter struct { 82 buf *bytes.Buffer 83 } 84 85 // WriteFromBlocks writes up to srcs.NumBytes() bytes from srcs and returns 86 // the number of bytes written. It may return a partial write without an 87 // error (i.e. (n, nil) where 0 < n < srcs.NumBytes()). It should not 88 // return a full write with an error (i.e. srcs.NumBytes(), err) where err 89 // != nil). 90 func (w *bufferWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { 91 written := srcs.NumBytes() 92 for !srcs.IsEmpty() { 93 w.buf.Write(srcs.Head().ToSlice()) 94 srcs = srcs.Tail() 95 } 96 return written, nil 97 } 98 99 // auxvData implements vfs.DynamicBytesSource for /proc/[pid]/auxv. 100 // 101 // +stateify savable 102 type auxvData struct { 103 kernfs.DynamicBytesFile 104 105 task *kernel.Task 106 } 107 108 var _ dynamicInode = (*auxvData)(nil) 109 110 // Generate implements vfs.DynamicBytesSource.Generate. 111 func (d *auxvData) Generate(ctx context.Context, buf *bytes.Buffer) error { 112 if d.task.ExitState() == kernel.TaskExitDead { 113 return syserror.ESRCH 114 } 115 m, err := getMMIncRef(d.task) 116 if err != nil { 117 // Return empty file. 118 return nil 119 } 120 defer m.DecUsers(ctx) 121 122 auxv := m.Auxv() 123 // Space for buffer with AT_NULL (0) terminator at the end. 124 buf.Grow((len(auxv) + 1) * 16) 125 for _, e := range auxv { 126 var tmp [16]byte 127 hostarch.ByteOrder.PutUint64(tmp[:8], e.Key) 128 hostarch.ByteOrder.PutUint64(tmp[8:], uint64(e.Value)) 129 buf.Write(tmp[:]) 130 } 131 var atNull [16]byte 132 buf.Write(atNull[:]) 133 134 return nil 135 } 136 137 // execArgType enumerates the types of exec arguments that are exposed through 138 // proc. 139 type execArgType int 140 141 const ( 142 cmdlineDataArg execArgType = iota 143 environDataArg 144 ) 145 146 // cmdlineData implements vfs.DynamicBytesSource for /proc/[pid]/cmdline. 147 // 148 // +stateify savable 149 type cmdlineData struct { 150 kernfs.DynamicBytesFile 151 152 task *kernel.Task 153 154 // arg is the type of exec argument this file contains. 155 arg execArgType 156 } 157 158 var _ dynamicInode = (*cmdlineData)(nil) 159 160 // Generate implements vfs.DynamicBytesSource.Generate. 161 func (d *cmdlineData) Generate(ctx context.Context, buf *bytes.Buffer) error { 162 if d.task.ExitState() == kernel.TaskExitDead { 163 return syserror.ESRCH 164 } 165 m, err := getMMIncRef(d.task) 166 if err != nil { 167 // Return empty file. 168 return nil 169 } 170 defer m.DecUsers(ctx) 171 172 // Figure out the bounds of the exec arg we are trying to read. 173 var ar hostarch.AddrRange 174 switch d.arg { 175 case cmdlineDataArg: 176 ar = hostarch.AddrRange{ 177 Start: m.ArgvStart(), 178 End: m.ArgvEnd(), 179 } 180 case environDataArg: 181 ar = hostarch.AddrRange{ 182 Start: m.EnvvStart(), 183 End: m.EnvvEnd(), 184 } 185 default: 186 panic(fmt.Sprintf("unknown exec arg type %v", d.arg)) 187 } 188 if ar.Start == 0 || ar.End == 0 { 189 // Don't attempt to read before the start/end are set up. 190 return io.EOF 191 } 192 193 // N.B. Technically this should be usermem.IOOpts.IgnorePermissions = true 194 // until Linux 4.9 (272ddc8b3735 "proc: don't use FOLL_FORCE for reading 195 // cmdline and environment"). 196 writer := &bufferWriter{buf: buf} 197 if n, err := m.CopyInTo(ctx, hostarch.AddrRangeSeqOf(ar), writer, usermem.IOOpts{}); n == 0 || err != nil { 198 // Nothing to copy or something went wrong. 199 return err 200 } 201 202 // On Linux, if the NULL byte at the end of the argument vector has been 203 // overwritten, it continues reading the environment vector as part of 204 // the argument vector. 205 if d.arg == cmdlineDataArg && buf.Bytes()[buf.Len()-1] != 0 { 206 if end := bytes.IndexByte(buf.Bytes(), 0); end != -1 { 207 // If we found a NULL character somewhere else in argv, truncate the 208 // return up to the NULL terminator (including it). 209 buf.Truncate(end) 210 return nil 211 } 212 213 // There is no NULL terminator in the string, return into envp. 214 arEnvv := hostarch.AddrRange{ 215 Start: m.EnvvStart(), 216 End: m.EnvvEnd(), 217 } 218 219 // Upstream limits the returned amount to one page of slop. 220 // https://elixir.bootlin.com/linux/v4.20/source/fs/proc/base.c#L208 221 // we'll return one page total between argv and envp because of the 222 // above page restrictions. 223 if buf.Len() >= hostarch.PageSize { 224 // Returned at least one page already, nothing else to add. 225 return nil 226 } 227 remaining := hostarch.PageSize - buf.Len() 228 if int(arEnvv.Length()) > remaining { 229 end, ok := arEnvv.Start.AddLength(uint64(remaining)) 230 if !ok { 231 return syserror.EFAULT 232 } 233 arEnvv.End = end 234 } 235 if _, err := m.CopyInTo(ctx, hostarch.AddrRangeSeqOf(arEnvv), writer, usermem.IOOpts{}); err != nil { 236 return err 237 } 238 239 // Linux will return envp up to and including the first NULL character, 240 // so find it. 241 envStart := int(ar.Length()) 242 if nullIdx := bytes.IndexByte(buf.Bytes()[envStart:], 0); nullIdx != -1 { 243 buf.Truncate(envStart + nullIdx) 244 } 245 } 246 247 return nil 248 } 249 250 // +stateify savable 251 type commInode struct { 252 kernfs.DynamicBytesFile 253 254 task *kernel.Task 255 } 256 257 func (fs *filesystem) newComm(ctx context.Context, task *kernel.Task, ino uint64, perm linux.FileMode) kernfs.Inode { 258 inode := &commInode{task: task} 259 inode.DynamicBytesFile.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, &commData{task: task}, perm) 260 return inode 261 } 262 263 func (i *commInode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { 264 // This file can always be read or written by members of the same thread 265 // group. See fs/proc/base.c:proc_tid_comm_permission. 266 // 267 // N.B. This check is currently a no-op as we don't yet support writing and 268 // this file is world-readable anyways. 269 t := kernel.TaskFromContext(ctx) 270 if t != nil && t.ThreadGroup() == i.task.ThreadGroup() && !ats.MayExec() { 271 return nil 272 } 273 274 return i.DynamicBytesFile.CheckPermissions(ctx, creds, ats) 275 } 276 277 // commData implements vfs.DynamicBytesSource for /proc/[pid]/comm. 278 // 279 // +stateify savable 280 type commData struct { 281 kernfs.DynamicBytesFile 282 283 task *kernel.Task 284 } 285 286 var _ dynamicInode = (*commData)(nil) 287 288 // Generate implements vfs.DynamicBytesSource.Generate. 289 func (d *commData) Generate(ctx context.Context, buf *bytes.Buffer) error { 290 buf.WriteString(d.task.Name()) 291 buf.WriteString("\n") 292 return nil 293 } 294 295 // idMapData implements vfs.WritableDynamicBytesSource for 296 // /proc/[pid]/{gid_map|uid_map}. 297 // 298 // +stateify savable 299 type idMapData struct { 300 kernfs.DynamicBytesFile 301 302 task *kernel.Task 303 gids bool 304 } 305 306 var _ dynamicInode = (*idMapData)(nil) 307 308 // Generate implements vfs.WritableDynamicBytesSource.Generate. 309 func (d *idMapData) Generate(ctx context.Context, buf *bytes.Buffer) error { 310 var entries []auth.IDMapEntry 311 if d.gids { 312 entries = d.task.UserNamespace().GIDMap() 313 } else { 314 entries = d.task.UserNamespace().UIDMap() 315 } 316 for _, e := range entries { 317 fmt.Fprintf(buf, "%10d %10d %10d\n", e.FirstID, e.FirstParentID, e.Length) 318 } 319 return nil 320 } 321 322 // Write implements vfs.WritableDynamicBytesSource.Write. 323 func (d *idMapData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { 324 // "In addition, the number of bytes written to the file must be less than 325 // the system page size, and the write must be performed at the start of 326 // the file ..." - user_namespaces(7) 327 srclen := src.NumBytes() 328 if srclen >= hostarch.PageSize || offset != 0 { 329 return 0, linuxerr.EINVAL 330 } 331 b := make([]byte, srclen) 332 if _, err := src.CopyIn(ctx, b); err != nil { 333 return 0, err 334 } 335 336 // Truncate from the first NULL byte. 337 var nul int64 338 nul = int64(bytes.IndexByte(b, 0)) 339 if nul == -1 { 340 nul = srclen 341 } 342 b = b[:nul] 343 // Remove the last \n. 344 if nul >= 1 && b[nul-1] == '\n' { 345 b = b[:nul-1] 346 } 347 lines := bytes.SplitN(b, []byte("\n"), maxIDMapLines+1) 348 if len(lines) > maxIDMapLines { 349 return 0, linuxerr.EINVAL 350 } 351 352 entries := make([]auth.IDMapEntry, len(lines)) 353 for i, l := range lines { 354 var e auth.IDMapEntry 355 _, err := fmt.Sscan(string(l), &e.FirstID, &e.FirstParentID, &e.Length) 356 if err != nil { 357 return 0, linuxerr.EINVAL 358 } 359 entries[i] = e 360 } 361 var err error 362 if d.gids { 363 err = d.task.UserNamespace().SetGIDMap(ctx, entries) 364 } else { 365 err = d.task.UserNamespace().SetUIDMap(ctx, entries) 366 } 367 if err != nil { 368 return 0, err 369 } 370 371 // On success, Linux's kernel/user_namespace.c:map_write() always returns 372 // count, even if fewer bytes were used. 373 return int64(srclen), nil 374 } 375 376 var _ kernfs.Inode = (*memInode)(nil) 377 378 // memInode implements kernfs.Inode for /proc/[pid]/mem. 379 // 380 // +stateify savable 381 type memInode struct { 382 kernfs.InodeAttrs 383 kernfs.InodeNoStatFS 384 kernfs.InodeNoopRefCount 385 kernfs.InodeNotDirectory 386 kernfs.InodeNotSymlink 387 388 task *kernel.Task 389 locks vfs.FileLocks 390 } 391 392 func (fs *filesystem) newMemInode(ctx context.Context, task *kernel.Task, ino uint64, perm linux.FileMode) kernfs.Inode { 393 // Note: credentials are overridden by taskOwnedInode. 394 inode := &memInode{task: task} 395 inode.init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm) 396 return &taskOwnedInode{Inode: inode, owner: task} 397 } 398 399 func (f *memInode) init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) { 400 if perm&^linux.PermissionsMask != 0 { 401 panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) 402 } 403 f.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeRegular|perm) 404 } 405 406 // Open implements kernfs.Inode.Open. 407 func (f *memInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { 408 // TODO(github.com/SagerNet/issue/260): Add check for PTRACE_MODE_ATTACH_FSCREDS 409 // Permission to read this file is governed by PTRACE_MODE_ATTACH_FSCREDS 410 // Since we dont implement setfsuid/setfsgid we can just use PTRACE_MODE_ATTACH 411 if !kernel.ContextCanTrace(ctx, f.task, true) { 412 return nil, linuxerr.EACCES 413 } 414 if err := checkTaskState(f.task); err != nil { 415 return nil, err 416 } 417 fd := &memFD{} 418 if err := fd.Init(rp.Mount(), d, f, opts.Flags); err != nil { 419 return nil, err 420 } 421 return &fd.vfsfd, nil 422 } 423 424 // SetStat implements kernfs.Inode.SetStat. 425 func (*memInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { 426 return linuxerr.EPERM 427 } 428 429 var _ vfs.FileDescriptionImpl = (*memFD)(nil) 430 431 // memFD implements vfs.FileDescriptionImpl for /proc/[pid]/mem. 432 // 433 // +stateify savable 434 type memFD struct { 435 vfsfd vfs.FileDescription 436 vfs.FileDescriptionDefaultImpl 437 vfs.LockFD 438 439 inode *memInode 440 441 // mu guards the fields below. 442 mu sync.Mutex `state:"nosave"` 443 offset int64 444 } 445 446 // Init initializes memFD. 447 func (fd *memFD) Init(m *vfs.Mount, d *kernfs.Dentry, inode *memInode, flags uint32) error { 448 fd.LockFD.Init(&inode.locks) 449 if err := fd.vfsfd.Init(fd, flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { 450 return err 451 } 452 fd.inode = inode 453 return nil 454 } 455 456 // Seek implements vfs.FileDescriptionImpl.Seek. 457 func (fd *memFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { 458 fd.mu.Lock() 459 defer fd.mu.Unlock() 460 switch whence { 461 case linux.SEEK_SET: 462 case linux.SEEK_CUR: 463 offset += fd.offset 464 default: 465 return 0, linuxerr.EINVAL 466 } 467 if offset < 0 { 468 return 0, linuxerr.EINVAL 469 } 470 fd.offset = offset 471 return offset, nil 472 } 473 474 // PRead implements vfs.FileDescriptionImpl.PRead. 475 func (fd *memFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { 476 if dst.NumBytes() == 0 { 477 return 0, nil 478 } 479 m, err := getMMIncRef(fd.inode.task) 480 if err != nil { 481 return 0, err 482 } 483 defer m.DecUsers(ctx) 484 // Buffer the read data because of MM locks 485 buf := make([]byte, dst.NumBytes()) 486 n, readErr := m.CopyIn(ctx, hostarch.Addr(offset), buf, usermem.IOOpts{IgnorePermissions: true}) 487 if n > 0 { 488 if _, err := dst.CopyOut(ctx, buf[:n]); err != nil { 489 return 0, syserror.EFAULT 490 } 491 return int64(n), nil 492 } 493 if readErr != nil { 494 return 0, syserror.EIO 495 } 496 return 0, nil 497 } 498 499 // Read implements vfs.FileDescriptionImpl.Read. 500 func (fd *memFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { 501 fd.mu.Lock() 502 n, err := fd.PRead(ctx, dst, fd.offset, opts) 503 fd.offset += n 504 fd.mu.Unlock() 505 return n, err 506 } 507 508 // Stat implements vfs.FileDescriptionImpl.Stat. 509 func (fd *memFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { 510 fs := fd.vfsfd.VirtualDentry().Mount().Filesystem() 511 return fd.inode.Stat(ctx, fs, opts) 512 } 513 514 // SetStat implements vfs.FileDescriptionImpl.SetStat. 515 func (fd *memFD) SetStat(context.Context, vfs.SetStatOptions) error { 516 return linuxerr.EPERM 517 } 518 519 // Release implements vfs.FileDescriptionImpl.Release. 520 func (fd *memFD) Release(context.Context) {} 521 522 // mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps. 523 // 524 // +stateify savable 525 type mapsData struct { 526 kernfs.DynamicBytesFile 527 528 task *kernel.Task 529 } 530 531 var _ dynamicInode = (*mapsData)(nil) 532 533 // Generate implements vfs.DynamicBytesSource.Generate. 534 func (d *mapsData) Generate(ctx context.Context, buf *bytes.Buffer) error { 535 if mm := getMM(d.task); mm != nil { 536 mm.ReadMapsDataInto(ctx, buf) 537 } 538 return nil 539 } 540 541 // smapsData implements vfs.DynamicBytesSource for /proc/[pid]/smaps. 542 // 543 // +stateify savable 544 type smapsData struct { 545 kernfs.DynamicBytesFile 546 547 task *kernel.Task 548 } 549 550 var _ dynamicInode = (*smapsData)(nil) 551 552 // Generate implements vfs.DynamicBytesSource.Generate. 553 func (d *smapsData) Generate(ctx context.Context, buf *bytes.Buffer) error { 554 if mm := getMM(d.task); mm != nil { 555 mm.ReadSmapsDataInto(ctx, buf) 556 } 557 return nil 558 } 559 560 // +stateify savable 561 type taskStatData struct { 562 kernfs.DynamicBytesFile 563 564 task *kernel.Task 565 566 // If tgstats is true, accumulate fault stats (not implemented) and CPU 567 // time across all tasks in t's thread group. 568 tgstats bool 569 570 // pidns is the PID namespace associated with the proc filesystem that 571 // includes the file using this statData. 572 pidns *kernel.PIDNamespace 573 } 574 575 var _ dynamicInode = (*taskStatData)(nil) 576 577 // Generate implements vfs.DynamicBytesSource.Generate. 578 func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error { 579 fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.task)) 580 fmt.Fprintf(buf, "(%s) ", s.task.Name()) 581 fmt.Fprintf(buf, "%c ", s.task.StateStatus()[0]) 582 ppid := kernel.ThreadID(0) 583 if parent := s.task.Parent(); parent != nil { 584 ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) 585 } 586 fmt.Fprintf(buf, "%d ", ppid) 587 fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.task.ThreadGroup().ProcessGroup())) 588 fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.task.ThreadGroup().Session())) 589 fmt.Fprintf(buf, "0 0 " /* tty_nr tpgid */) 590 fmt.Fprintf(buf, "0 " /* flags */) 591 fmt.Fprintf(buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */) 592 var cputime usage.CPUStats 593 if s.tgstats { 594 cputime = s.task.ThreadGroup().CPUStats() 595 } else { 596 cputime = s.task.CPUStats() 597 } 598 fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) 599 cputime = s.task.ThreadGroup().JoinedChildCPUStats() 600 fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) 601 fmt.Fprintf(buf, "%d %d ", s.task.Priority(), s.task.Niceness()) 602 fmt.Fprintf(buf, "%d ", s.task.ThreadGroup().Count()) 603 604 // itrealvalue. Since kernel 2.6.17, this field is no longer 605 // maintained, and is hard coded as 0. 606 fmt.Fprintf(buf, "0 ") 607 608 // Start time is relative to boot time, expressed in clock ticks. 609 fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.task.StartTime().Sub(s.task.Kernel().Timekeeper().BootTime()))) 610 611 var vss, rss uint64 612 s.task.WithMuLocked(func(t *kernel.Task) { 613 if mm := t.MemoryManager(); mm != nil { 614 vss = mm.VirtualMemorySize() 615 rss = mm.ResidentSetSize() 616 } 617 }) 618 fmt.Fprintf(buf, "%d %d ", vss, rss/hostarch.PageSize) 619 620 // rsslim. 621 fmt.Fprintf(buf, "%d ", s.task.ThreadGroup().Limits().Get(limits.Rss).Cur) 622 623 fmt.Fprintf(buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */) 624 fmt.Fprintf(buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */) 625 fmt.Fprintf(buf, "0 0 " /* nswap cnswap */) 626 terminationSignal := linux.Signal(0) 627 if s.task == s.task.ThreadGroup().Leader() { 628 terminationSignal = s.task.ThreadGroup().TerminationSignal() 629 } 630 fmt.Fprintf(buf, "%d ", terminationSignal) 631 fmt.Fprintf(buf, "0 0 0 " /* processor rt_priority policy */) 632 fmt.Fprintf(buf, "0 0 0 " /* delayacct_blkio_ticks guest_time cguest_time */) 633 fmt.Fprintf(buf, "0 0 0 0 0 0 0 " /* start_data end_data start_brk arg_start arg_end env_start env_end */) 634 fmt.Fprintf(buf, "0\n" /* exit_code */) 635 636 return nil 637 } 638 639 // statmData implements vfs.DynamicBytesSource for /proc/[pid]/statm. 640 // 641 // +stateify savable 642 type statmData struct { 643 kernfs.DynamicBytesFile 644 645 task *kernel.Task 646 } 647 648 var _ dynamicInode = (*statmData)(nil) 649 650 // Generate implements vfs.DynamicBytesSource.Generate. 651 func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error { 652 var vss, rss uint64 653 s.task.WithMuLocked(func(t *kernel.Task) { 654 if mm := t.MemoryManager(); mm != nil { 655 vss = mm.VirtualMemorySize() 656 rss = mm.ResidentSetSize() 657 } 658 }) 659 660 fmt.Fprintf(buf, "%d %d 0 0 0 0 0\n", vss/hostarch.PageSize, rss/hostarch.PageSize) 661 return nil 662 } 663 664 // statusData implements vfs.DynamicBytesSource for /proc/[pid]/status. 665 // 666 // +stateify savable 667 type statusData struct { 668 kernfs.DynamicBytesFile 669 670 task *kernel.Task 671 pidns *kernel.PIDNamespace 672 } 673 674 var _ dynamicInode = (*statusData)(nil) 675 676 // Generate implements vfs.DynamicBytesSource.Generate. 677 func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error { 678 fmt.Fprintf(buf, "Name:\t%s\n", s.task.Name()) 679 fmt.Fprintf(buf, "State:\t%s\n", s.task.StateStatus()) 680 fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.task.ThreadGroup())) 681 fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.task)) 682 ppid := kernel.ThreadID(0) 683 if parent := s.task.Parent(); parent != nil { 684 ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) 685 } 686 fmt.Fprintf(buf, "PPid:\t%d\n", ppid) 687 tpid := kernel.ThreadID(0) 688 if tracer := s.task.Tracer(); tracer != nil { 689 tpid = s.pidns.IDOfTask(tracer) 690 } 691 fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid) 692 var fds int 693 var vss, rss, data uint64 694 s.task.WithMuLocked(func(t *kernel.Task) { 695 if fdTable := t.FDTable(); fdTable != nil { 696 fds = fdTable.CurrentMaxFDs() 697 } 698 if mm := t.MemoryManager(); mm != nil { 699 vss = mm.VirtualMemorySize() 700 rss = mm.ResidentSetSize() 701 data = mm.VirtualDataSize() 702 } 703 }) 704 fmt.Fprintf(buf, "FDSize:\t%d\n", fds) 705 fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10) 706 fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10) 707 fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10) 708 fmt.Fprintf(buf, "Threads:\t%d\n", s.task.ThreadGroup().Count()) 709 creds := s.task.Credentials() 710 fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps) 711 fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps) 712 fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps) 713 fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps) 714 fmt.Fprintf(buf, "Seccomp:\t%d\n", s.task.SeccompMode()) 715 // We unconditionally report a single NUMA node. See 716 // pkg/sentry/syscalls/linux/sys_mempolicy.go. 717 fmt.Fprintf(buf, "Mems_allowed:\t1\n") 718 fmt.Fprintf(buf, "Mems_allowed_list:\t0\n") 719 return nil 720 } 721 722 // ioUsage is the /proc/[pid]/io and /proc/[pid]/task/[tid]/io data provider. 723 type ioUsage interface { 724 // IOUsage returns the io usage data. 725 IOUsage() *usage.IO 726 } 727 728 // +stateify savable 729 type ioData struct { 730 kernfs.DynamicBytesFile 731 732 ioUsage 733 } 734 735 var _ dynamicInode = (*ioData)(nil) 736 737 // Generate implements vfs.DynamicBytesSource.Generate. 738 func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error { 739 io := usage.IO{} 740 io.Accumulate(i.IOUsage()) 741 742 fmt.Fprintf(buf, "char: %d\n", io.CharsRead) 743 fmt.Fprintf(buf, "wchar: %d\n", io.CharsWritten) 744 fmt.Fprintf(buf, "syscr: %d\n", io.ReadSyscalls) 745 fmt.Fprintf(buf, "syscw: %d\n", io.WriteSyscalls) 746 fmt.Fprintf(buf, "read_bytes: %d\n", io.BytesRead) 747 fmt.Fprintf(buf, "write_bytes: %d\n", io.BytesWritten) 748 fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled) 749 return nil 750 } 751 752 // oomScoreAdj is a stub of the /proc/<pid>/oom_score_adj file. 753 // 754 // +stateify savable 755 type oomScoreAdj struct { 756 kernfs.DynamicBytesFile 757 758 task *kernel.Task 759 } 760 761 var _ vfs.WritableDynamicBytesSource = (*oomScoreAdj)(nil) 762 763 // Generate implements vfs.DynamicBytesSource.Generate. 764 func (o *oomScoreAdj) Generate(ctx context.Context, buf *bytes.Buffer) error { 765 if o.task.ExitState() == kernel.TaskExitDead { 766 return syserror.ESRCH 767 } 768 fmt.Fprintf(buf, "%d\n", o.task.OOMScoreAdj()) 769 return nil 770 } 771 772 // Write implements vfs.WritableDynamicBytesSource.Write. 773 func (o *oomScoreAdj) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { 774 if src.NumBytes() == 0 { 775 return 0, nil 776 } 777 778 // Limit input size so as not to impact performance if input size is large. 779 src = src.TakeFirst(hostarch.PageSize - 1) 780 781 var v int32 782 n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) 783 if err != nil { 784 return 0, err 785 } 786 787 if o.task.ExitState() == kernel.TaskExitDead { 788 return 0, syserror.ESRCH 789 } 790 if err := o.task.SetOOMScoreAdj(v); err != nil { 791 return 0, err 792 } 793 794 return n, nil 795 } 796 797 // exeSymlink is an symlink for the /proc/[pid]/exe file. 798 // 799 // +stateify savable 800 type exeSymlink struct { 801 implStatFS 802 kernfs.InodeAttrs 803 kernfs.InodeNoopRefCount 804 kernfs.InodeSymlink 805 806 task *kernel.Task 807 } 808 809 var _ kernfs.Inode = (*exeSymlink)(nil) 810 811 func (fs *filesystem) newExeSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode { 812 inode := &exeSymlink{task: task} 813 inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) 814 return inode 815 } 816 817 // Readlink implements kernfs.Inode.Readlink. 818 func (s *exeSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { 819 exec, _, err := s.Getlink(ctx, nil) 820 if err != nil { 821 return "", err 822 } 823 defer exec.DecRef(ctx) 824 825 root := vfs.RootFromContext(ctx) 826 if !root.Ok() { 827 // It could have raced with process deletion. 828 return "", syserror.ESRCH 829 } 830 defer root.DecRef(ctx) 831 832 vfsObj := exec.Mount().Filesystem().VirtualFilesystem() 833 name, _ := vfsObj.PathnameWithDeleted(ctx, root, exec) 834 return name, nil 835 } 836 837 // Getlink implements kernfs.Inode.Getlink. 838 func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { 839 if !kernel.ContextCanTrace(ctx, s.task, false) { 840 return vfs.VirtualDentry{}, "", linuxerr.EACCES 841 } 842 if err := checkTaskState(s.task); err != nil { 843 return vfs.VirtualDentry{}, "", err 844 } 845 846 var err error 847 var exec fsbridge.File 848 s.task.WithMuLocked(func(t *kernel.Task) { 849 mm := t.MemoryManager() 850 if mm == nil { 851 err = linuxerr.EACCES 852 return 853 } 854 855 // The MemoryManager may be destroyed, in which case 856 // MemoryManager.destroy will simply set the executable to nil 857 // (with locks held). 858 exec = mm.Executable() 859 if exec == nil { 860 err = syserror.ESRCH 861 } 862 }) 863 if err != nil { 864 return vfs.VirtualDentry{}, "", err 865 } 866 defer exec.DecRef(ctx) 867 868 vd := exec.(*fsbridge.VFSFile).FileDescription().VirtualDentry() 869 vd.IncRef() 870 return vd, "", nil 871 } 872 873 // cwdSymlink is an symlink for the /proc/[pid]/cwd file. 874 // 875 // +stateify savable 876 type cwdSymlink struct { 877 implStatFS 878 kernfs.InodeAttrs 879 kernfs.InodeNoopRefCount 880 kernfs.InodeSymlink 881 882 task *kernel.Task 883 } 884 885 var _ kernfs.Inode = (*cwdSymlink)(nil) 886 887 func (fs *filesystem) newCwdSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode { 888 inode := &cwdSymlink{task: task} 889 inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) 890 return inode 891 } 892 893 // Readlink implements kernfs.Inode.Readlink. 894 func (s *cwdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { 895 cwd, _, err := s.Getlink(ctx, nil) 896 if err != nil { 897 return "", err 898 } 899 defer cwd.DecRef(ctx) 900 901 root := vfs.RootFromContext(ctx) 902 if !root.Ok() { 903 // It could have raced with process deletion. 904 return "", syserror.ESRCH 905 } 906 defer root.DecRef(ctx) 907 908 vfsObj := cwd.Mount().Filesystem().VirtualFilesystem() 909 name, _ := vfsObj.PathnameWithDeleted(ctx, root, cwd) 910 return name, nil 911 } 912 913 // Getlink implements kernfs.Inode.Getlink. 914 func (s *cwdSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { 915 if !kernel.ContextCanTrace(ctx, s.task, false) { 916 return vfs.VirtualDentry{}, "", linuxerr.EACCES 917 } 918 if err := checkTaskState(s.task); err != nil { 919 return vfs.VirtualDentry{}, "", err 920 } 921 cwd := s.task.FSContext().WorkingDirectoryVFS2() 922 if !cwd.Ok() { 923 // It could have raced with process deletion. 924 return vfs.VirtualDentry{}, "", syserror.ESRCH 925 } 926 return cwd, "", nil 927 } 928 929 // mountInfoData is used to implement /proc/[pid]/mountinfo. 930 // 931 // +stateify savable 932 type mountInfoData struct { 933 kernfs.DynamicBytesFile 934 935 task *kernel.Task 936 } 937 938 var _ dynamicInode = (*mountInfoData)(nil) 939 940 // Generate implements vfs.DynamicBytesSource.Generate. 941 func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { 942 var fsctx *kernel.FSContext 943 i.task.WithMuLocked(func(t *kernel.Task) { 944 fsctx = t.FSContext() 945 }) 946 if fsctx == nil { 947 // The task has been destroyed. Nothing to show here. 948 return nil 949 } 950 rootDir := fsctx.RootDirectoryVFS2() 951 if !rootDir.Ok() { 952 // Root has been destroyed. Don't try to read mounts. 953 return nil 954 } 955 defer rootDir.DecRef(ctx) 956 i.task.Kernel().VFS().GenerateProcMountInfo(ctx, rootDir, buf) 957 return nil 958 } 959 960 // mountsData is used to implement /proc/[pid]/mounts. 961 // 962 // +stateify savable 963 type mountsData struct { 964 kernfs.DynamicBytesFile 965 966 task *kernel.Task 967 } 968 969 var _ dynamicInode = (*mountsData)(nil) 970 971 // Generate implements vfs.DynamicBytesSource.Generate. 972 func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error { 973 var fsctx *kernel.FSContext 974 i.task.WithMuLocked(func(t *kernel.Task) { 975 fsctx = t.FSContext() 976 }) 977 if fsctx == nil { 978 // The task has been destroyed. Nothing to show here. 979 return nil 980 } 981 rootDir := fsctx.RootDirectoryVFS2() 982 if !rootDir.Ok() { 983 // Root has been destroyed. Don't try to read mounts. 984 return nil 985 } 986 defer rootDir.DecRef(ctx) 987 i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf) 988 return nil 989 } 990 991 // +stateify savable 992 type namespaceSymlink struct { 993 kernfs.StaticSymlink 994 995 task *kernel.Task 996 } 997 998 func (fs *filesystem) newNamespaceSymlink(ctx context.Context, task *kernel.Task, ino uint64, ns string) kernfs.Inode { 999 // Namespace symlinks should contain the namespace name and the inode number 1000 // for the namespace instance, so for example user:[123456]. We currently fake 1001 // the inode number by sticking the symlink inode in its place. 1002 target := fmt.Sprintf("%s:[%d]", ns, ino) 1003 1004 inode := &namespaceSymlink{task: task} 1005 // Note: credentials are overridden by taskOwnedInode. 1006 inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target) 1007 1008 taskInode := &taskOwnedInode{Inode: inode, owner: task} 1009 return taskInode 1010 } 1011 1012 // Readlink implements kernfs.Inode.Readlink. 1013 func (s *namespaceSymlink) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { 1014 if err := checkTaskState(s.task); err != nil { 1015 return "", err 1016 } 1017 return s.StaticSymlink.Readlink(ctx, mnt) 1018 } 1019 1020 // Getlink implements kernfs.Inode.Getlink. 1021 func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { 1022 if err := checkTaskState(s.task); err != nil { 1023 return vfs.VirtualDentry{}, "", err 1024 } 1025 1026 // Create a synthetic inode to represent the namespace. 1027 fs := mnt.Filesystem().Impl().(*filesystem) 1028 nsInode := &namespaceInode{} 1029 nsInode.Init(ctx, auth.CredentialsFromContext(ctx), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0444) 1030 dentry := &kernfs.Dentry{} 1031 dentry.Init(&fs.Filesystem, nsInode) 1032 vd := vfs.MakeVirtualDentry(mnt, dentry.VFSDentry()) 1033 // Only IncRef vd.Mount() because vd.Dentry() already holds a ref of 1. 1034 mnt.IncRef() 1035 return vd, "", nil 1036 } 1037 1038 // namespaceInode is a synthetic inode created to represent a namespace in 1039 // /proc/[pid]/ns/*. 1040 // 1041 // +stateify savable 1042 type namespaceInode struct { 1043 implStatFS 1044 kernfs.InodeAttrs 1045 kernfs.InodeNoopRefCount 1046 kernfs.InodeNotDirectory 1047 kernfs.InodeNotSymlink 1048 1049 locks vfs.FileLocks 1050 } 1051 1052 var _ kernfs.Inode = (*namespaceInode)(nil) 1053 1054 // Init initializes a namespace inode. 1055 func (i *namespaceInode) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) { 1056 if perm&^linux.PermissionsMask != 0 { 1057 panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) 1058 } 1059 i.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeRegular|perm) 1060 } 1061 1062 // Open implements kernfs.Inode.Open. 1063 func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { 1064 fd := &namespaceFD{inode: i} 1065 i.IncRef() 1066 fd.LockFD.Init(&i.locks) 1067 if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { 1068 return nil, err 1069 } 1070 return &fd.vfsfd, nil 1071 } 1072 1073 // namespace FD is a synthetic file that represents a namespace in 1074 // /proc/[pid]/ns/*. 1075 // 1076 // +stateify savable 1077 type namespaceFD struct { 1078 vfs.FileDescriptionDefaultImpl 1079 vfs.LockFD 1080 1081 vfsfd vfs.FileDescription 1082 inode *namespaceInode 1083 } 1084 1085 var _ vfs.FileDescriptionImpl = (*namespaceFD)(nil) 1086 1087 // Stat implements vfs.FileDescriptionImpl.Stat. 1088 func (fd *namespaceFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { 1089 vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem() 1090 return fd.inode.Stat(ctx, vfs, opts) 1091 } 1092 1093 // SetStat implements vfs.FileDescriptionImpl.SetStat. 1094 func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { 1095 vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem() 1096 creds := auth.CredentialsFromContext(ctx) 1097 return fd.inode.SetStat(ctx, vfs, creds, opts) 1098 } 1099 1100 // Release implements vfs.FileDescriptionImpl.Release. 1101 func (fd *namespaceFD) Release(ctx context.Context) { 1102 fd.inode.DecRef(ctx) 1103 } 1104 1105 // taskCgroupData generates data for /proc/[pid]/cgroup. 1106 // 1107 // +stateify savable 1108 type taskCgroupData struct { 1109 dynamicBytesFileSetAttr 1110 task *kernel.Task 1111 } 1112 1113 var _ dynamicInode = (*taskCgroupData)(nil) 1114 1115 // Generate implements vfs.DynamicBytesSource.Generate. 1116 func (d *taskCgroupData) Generate(ctx context.Context, buf *bytes.Buffer) error { 1117 // When a task is existing on Linux, a task's cgroup set is cleared and 1118 // reset to the initial cgroup set, which is essentially the set of root 1119 // cgroups. Because of this, the /proc/<pid>/cgroup file is always readable 1120 // on Linux throughout a task's lifetime. 1121 // 1122 // The sentry removes tasks from cgroups during the exit process, but 1123 // doesn't move them into an initial cgroup set, so partway through task 1124 // exit this file show a task is in no cgroups, which is incorrect. Instead, 1125 // once a task has left its cgroups, we return an error. 1126 if d.task.ExitState() >= kernel.TaskExitInitiated { 1127 return syserror.ESRCH 1128 } 1129 1130 d.task.GenerateProcTaskCgroup(buf) 1131 return nil 1132 }