gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/fsimpl/proc/task_files.go (about) 1 // Copyright 2019 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package proc 16 17 import ( 18 "bytes" 19 "fmt" 20 "io" 21 "sort" 22 "strconv" 23 "strings" 24 25 "gvisor.dev/gvisor/pkg/abi/linux" 26 "gvisor.dev/gvisor/pkg/context" 27 "gvisor.dev/gvisor/pkg/errors/linuxerr" 28 "gvisor.dev/gvisor/pkg/hostarch" 29 "gvisor.dev/gvisor/pkg/safemem" 30 "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" 31 "gvisor.dev/gvisor/pkg/sentry/fsimpl/nsfs" 32 "gvisor.dev/gvisor/pkg/sentry/kernel" 33 "gvisor.dev/gvisor/pkg/sentry/kernel/auth" 34 "gvisor.dev/gvisor/pkg/sentry/limits" 35 "gvisor.dev/gvisor/pkg/sentry/mm" 36 "gvisor.dev/gvisor/pkg/sentry/usage" 37 "gvisor.dev/gvisor/pkg/sentry/vfs" 38 "gvisor.dev/gvisor/pkg/sync" 39 "gvisor.dev/gvisor/pkg/usermem" 40 ) 41 42 // "There is an (arbitrary) limit on the number of lines in the file. As at 43 // Linux 3.18, the limit is five lines." - user_namespaces(7) 44 const maxIDMapLines = 5 45 46 // getMM gets the kernel task's MemoryManager. No additional reference is taken on 47 // mm here. This is safe because MemoryManager.destroy is required to leave the 48 // MemoryManager in a state where it's still usable as a DynamicBytesSource. 49 func getMM(task *kernel.Task) *mm.MemoryManager { 50 var tmm *mm.MemoryManager 51 task.WithMuLocked(func(t *kernel.Task) { 52 if mm := t.MemoryManager(); mm != nil { 53 tmm = mm 54 } 55 }) 56 return tmm 57 } 58 59 // getMMIncRef returns t's MemoryManager. If getMMIncRef succeeds, the 60 // MemoryManager's users count is incremented, and must be decremented by the 61 // caller when it is no longer in use. 62 func getMMIncRef(task *kernel.Task) (*mm.MemoryManager, error) { 63 var m *mm.MemoryManager 64 task.WithMuLocked(func(t *kernel.Task) { 65 m = t.MemoryManager() 66 }) 67 if m == nil || !m.IncUsers() { 68 return nil, io.EOF 69 } 70 return m, nil 71 } 72 73 func checkTaskState(t *kernel.Task) error { 74 switch t.ExitState() { 75 case kernel.TaskExitZombie: 76 return linuxerr.EACCES 77 case kernel.TaskExitDead: 78 return linuxerr.ESRCH 79 } 80 return nil 81 } 82 83 type bufferWriter struct { 84 buf *bytes.Buffer 85 } 86 87 // WriteFromBlocks writes up to srcs.NumBytes() bytes from srcs and returns 88 // the number of bytes written. It may return a partial write without an 89 // error (i.e. (n, nil) where 0 < n < srcs.NumBytes()). It should not 90 // return a full write with an error (i.e. srcs.NumBytes(), err) where err 91 // != nil). 92 func (w *bufferWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { 93 written := srcs.NumBytes() 94 for !srcs.IsEmpty() { 95 w.buf.Write(srcs.Head().ToSlice()) 96 srcs = srcs.Tail() 97 } 98 return written, nil 99 } 100 101 // auxvData implements vfs.DynamicBytesSource for /proc/[pid]/auxv. 102 // 103 // +stateify savable 104 type auxvData struct { 105 kernfs.DynamicBytesFile 106 107 task *kernel.Task 108 } 109 110 var _ dynamicInode = (*auxvData)(nil) 111 112 // Generate implements vfs.DynamicBytesSource.Generate. 113 func (d *auxvData) Generate(ctx context.Context, buf *bytes.Buffer) error { 114 if d.task.ExitState() == kernel.TaskExitDead { 115 return linuxerr.ESRCH 116 } 117 m, err := getMMIncRef(d.task) 118 if err != nil { 119 // Return empty file. 120 return nil 121 } 122 defer m.DecUsers(ctx) 123 124 auxv := m.Auxv() 125 // Space for buffer with AT_NULL (0) terminator at the end. 126 buf.Grow((len(auxv) + 1) * 16) 127 for _, e := range auxv { 128 var tmp [16]byte 129 hostarch.ByteOrder.PutUint64(tmp[:8], e.Key) 130 hostarch.ByteOrder.PutUint64(tmp[8:], uint64(e.Value)) 131 buf.Write(tmp[:]) 132 } 133 var atNull [16]byte 134 buf.Write(atNull[:]) 135 136 return nil 137 } 138 139 // MetadataType enumerates the types of metadata that is exposed through proc. 140 type MetadataType int 141 142 const ( 143 // Cmdline represents /proc/[pid]/cmdline. 144 Cmdline MetadataType = iota 145 146 // Environ represents /proc/[pid]/environ. 147 Environ 148 ) 149 150 // GetMetadata fetches the process's metadata of type t and writes it into 151 // buf. The process is identified by mm. 152 func GetMetadata(ctx context.Context, mm *mm.MemoryManager, buf *bytes.Buffer, t MetadataType) error { 153 // Figure out the bounds of the exec arg we are trying to read. 154 var ar hostarch.AddrRange 155 switch t { 156 case Cmdline: 157 ar = hostarch.AddrRange{ 158 Start: mm.ArgvStart(), 159 End: mm.ArgvEnd(), 160 } 161 case Environ: 162 ar = hostarch.AddrRange{ 163 Start: mm.EnvvStart(), 164 End: mm.EnvvEnd(), 165 } 166 default: 167 panic(fmt.Sprintf("unknown exec arg type %v", t)) 168 } 169 if ar.Start == 0 || ar.End == 0 { 170 // Don't attempt to read before the start/end are set up. 171 return io.EOF 172 } 173 174 // N.B. Technically this should be usermem.IOOpts.IgnorePermissions = true 175 // until Linux 4.9 (272ddc8b3735 "proc: don't use FOLL_FORCE for reading 176 // cmdline and environment"). 177 writer := &bufferWriter{buf: buf} 178 if n, err := mm.CopyInTo(ctx, hostarch.AddrRangeSeqOf(ar), writer, usermem.IOOpts{}); n == 0 || err != nil { 179 // Nothing to copy or something went wrong. 180 return err 181 } 182 183 // On Linux, if the NULL byte at the end of the argument vector has been 184 // overwritten, it continues reading the environment vector as part of 185 // the argument vector. 186 if t == Cmdline && buf.Bytes()[buf.Len()-1] != 0 { 187 if end := bytes.IndexByte(buf.Bytes(), 0); end != -1 { 188 // If we found a NULL character somewhere else in argv, truncate the 189 // return up to the NULL terminator (including it). 190 buf.Truncate(end) 191 return nil 192 } 193 194 // There is no NULL terminator in the string, return into envp. 195 arEnvv := hostarch.AddrRange{ 196 Start: mm.EnvvStart(), 197 End: mm.EnvvEnd(), 198 } 199 200 // Upstream limits the returned amount to one page of slop. 201 // https://elixir.bootlin.com/linux/v4.20/source/fs/proc/base.c#L208 202 // we'll return one page total between argv and envp because of the 203 // above page restrictions. 204 if buf.Len() >= hostarch.PageSize { 205 // Returned at least one page already, nothing else to add. 206 return nil 207 } 208 remaining := hostarch.PageSize - buf.Len() 209 if int(arEnvv.Length()) > remaining { 210 end, ok := arEnvv.Start.AddLength(uint64(remaining)) 211 if !ok { 212 return linuxerr.EFAULT 213 } 214 arEnvv.End = end 215 } 216 if _, err := mm.CopyInTo(ctx, hostarch.AddrRangeSeqOf(arEnvv), writer, usermem.IOOpts{}); err != nil { 217 return err 218 } 219 220 // Linux will return envp up to and including the first NULL character, 221 // so find it. 222 envStart := int(ar.Length()) 223 if nullIdx := bytes.IndexByte(buf.Bytes()[envStart:], 0); nullIdx != -1 { 224 buf.Truncate(envStart + nullIdx) 225 } 226 } 227 228 return nil 229 } 230 231 // metadataData implements vfs.DynamicBytesSource for proc metadata fields like: 232 // 233 // - /proc/[pid]/cmdline 234 // - /proc/[pid]/environ 235 // 236 // +stateify savable 237 type metadataData struct { 238 kernfs.DynamicBytesFile 239 240 task *kernel.Task 241 242 // arg is the type of exec argument this file contains. 243 metaType MetadataType 244 } 245 246 var _ dynamicInode = (*metadataData)(nil) 247 248 // Generate implements vfs.DynamicBytesSource.Generate. 249 func (d *metadataData) Generate(ctx context.Context, buf *bytes.Buffer) error { 250 if d.task.ExitState() == kernel.TaskExitDead { 251 return linuxerr.ESRCH 252 } 253 m, err := getMMIncRef(d.task) 254 if err != nil { 255 // Return empty file. 256 return nil 257 } 258 defer m.DecUsers(ctx) 259 return GetMetadata(ctx, m, buf, d.metaType) 260 } 261 262 // +stateify savable 263 type commInode struct { 264 kernfs.DynamicBytesFile 265 266 task *kernel.Task 267 } 268 269 func (fs *filesystem) newComm(ctx context.Context, task *kernel.Task, ino uint64, perm linux.FileMode) kernfs.Inode { 270 inode := &commInode{task: task} 271 inode.DynamicBytesFile.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, &commData{task: task}, perm) 272 return inode 273 } 274 275 func (i *commInode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { 276 // This file can always be read or written by members of the same thread 277 // group. See fs/proc/base.c:proc_tid_comm_permission. 278 t := kernel.TaskFromContext(ctx) 279 if t != nil && t.ThreadGroup() == i.task.ThreadGroup() && !ats.MayExec() { 280 return nil 281 } 282 283 return i.DynamicBytesFile.CheckPermissions(ctx, creds, ats) 284 } 285 286 // commData implements vfs.WritableDynamicBytesSource for /proc/[pid]/comm. 287 // 288 // +stateify savable 289 type commData struct { 290 kernfs.DynamicBytesFile 291 292 task *kernel.Task 293 } 294 295 var _ dynamicInode = (*commData)(nil) 296 var _ vfs.WritableDynamicBytesSource = (*commData)(nil) 297 298 // Generate implements vfs.DynamicBytesSource.Generate. 299 func (d *commData) Generate(ctx context.Context, buf *bytes.Buffer) error { 300 buf.WriteString(d.task.Name()) 301 buf.WriteString("\n") 302 return nil 303 } 304 305 // Write implements vfs.WritableDynamicBytesSource.Write. 306 func (d *commData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { 307 srclen := src.NumBytes() 308 nameLen := int64(linux.TASK_COMM_LEN - 1) 309 if srclen < nameLen { 310 nameLen = srclen 311 } 312 name := make([]byte, nameLen) 313 if _, err := src.CopyIn(ctx, name); err != nil { 314 return 0, err 315 } 316 317 // Only allow writes from the same thread group, otherwise return 318 // EINVAL. See fs/proc/base.c:comm_write. 319 // 320 // Note that this check exists in addition to the same-thread-group 321 // check in CheckPermissions. 322 t := kernel.TaskFromContext(ctx) 323 if t == nil || t.ThreadGroup() != d.task.ThreadGroup() { 324 return 0, linuxerr.EINVAL 325 } 326 d.task.SetName(string(name)) 327 return int64(srclen), nil 328 } 329 330 // idMapData implements vfs.WritableDynamicBytesSource for 331 // /proc/[pid]/{gid_map|uid_map}. 332 // 333 // +stateify savable 334 type idMapData struct { 335 kernfs.DynamicBytesFile 336 337 task *kernel.Task 338 gids bool 339 } 340 341 var _ dynamicInode = (*idMapData)(nil) 342 var _ vfs.WritableDynamicBytesSource = (*idMapData)(nil) 343 344 // Generate implements vfs.WritableDynamicBytesSource.Generate. 345 func (d *idMapData) Generate(ctx context.Context, buf *bytes.Buffer) error { 346 var entries []auth.IDMapEntry 347 if d.gids { 348 entries = d.task.UserNamespace().GIDMap() 349 } else { 350 entries = d.task.UserNamespace().UIDMap() 351 } 352 for _, e := range entries { 353 fmt.Fprintf(buf, "%10d %10d %10d\n", e.FirstID, e.FirstParentID, e.Length) 354 } 355 return nil 356 } 357 358 // Write implements vfs.WritableDynamicBytesSource.Write. 359 func (d *idMapData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { 360 // "In addition, the number of bytes written to the file must be less than 361 // the system page size, and the write must be performed at the start of 362 // the file ..." - user_namespaces(7) 363 srclen := src.NumBytes() 364 if srclen >= hostarch.PageSize || offset != 0 { 365 return 0, linuxerr.EINVAL 366 } 367 b := make([]byte, srclen) 368 if _, err := src.CopyIn(ctx, b); err != nil { 369 return 0, err 370 } 371 372 // Truncate from the first NULL byte. 373 var nul int64 374 nul = int64(bytes.IndexByte(b, 0)) 375 if nul == -1 { 376 nul = srclen 377 } 378 b = b[:nul] 379 // Remove the last \n. 380 if nul >= 1 && b[nul-1] == '\n' { 381 b = b[:nul-1] 382 } 383 lines := bytes.SplitN(b, []byte("\n"), maxIDMapLines+1) 384 if len(lines) > maxIDMapLines { 385 return 0, linuxerr.EINVAL 386 } 387 388 entries := make([]auth.IDMapEntry, len(lines)) 389 for i, l := range lines { 390 var e auth.IDMapEntry 391 _, err := fmt.Sscan(string(l), &e.FirstID, &e.FirstParentID, &e.Length) 392 if err != nil { 393 return 0, linuxerr.EINVAL 394 } 395 entries[i] = e 396 } 397 var err error 398 if d.gids { 399 err = d.task.UserNamespace().SetGIDMap(ctx, entries) 400 } else { 401 err = d.task.UserNamespace().SetUIDMap(ctx, entries) 402 } 403 if err != nil { 404 return 0, err 405 } 406 407 // On success, Linux's kernel/user_namespace.c:map_write() always returns 408 // count, even if fewer bytes were used. 409 return int64(srclen), nil 410 } 411 412 var _ kernfs.Inode = (*memInode)(nil) 413 414 // memInode implements kernfs.Inode for /proc/[pid]/mem. 415 // 416 // +stateify savable 417 type memInode struct { 418 kernfs.InodeAttrs 419 kernfs.InodeNoStatFS 420 kernfs.InodeNoopRefCount 421 kernfs.InodeNotAnonymous 422 kernfs.InodeNotDirectory 423 kernfs.InodeNotSymlink 424 kernfs.InodeWatches 425 426 task *kernel.Task 427 locks vfs.FileLocks 428 } 429 430 func (fs *filesystem) newMemInode(ctx context.Context, task *kernel.Task, ino uint64, perm linux.FileMode) kernfs.Inode { 431 // Note: credentials are overridden by taskOwnedInode. 432 inode := &memInode{task: task} 433 inode.init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm) 434 return &taskOwnedInode{Inode: inode, owner: task} 435 } 436 437 func (f *memInode) init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) { 438 if perm&^linux.PermissionsMask != 0 { 439 panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) 440 } 441 f.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeRegular|perm) 442 } 443 444 // Open implements kernfs.Inode.Open. 445 func (f *memInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { 446 // TODO(gvisor.dev/issue/260): Add check for PTRACE_MODE_ATTACH_FSCREDS 447 // Permission to read this file is governed by PTRACE_MODE_ATTACH_FSCREDS 448 // Since we dont implement setfsuid/setfsgid we can just use PTRACE_MODE_ATTACH 449 if !kernel.ContextCanTrace(ctx, f.task, true) { 450 return nil, linuxerr.EACCES 451 } 452 if err := checkTaskState(f.task); err != nil { 453 return nil, err 454 } 455 fd := &memFD{} 456 if err := fd.Init(rp.Mount(), d, f, opts.Flags); err != nil { 457 return nil, err 458 } 459 return &fd.vfsfd, nil 460 } 461 462 // SetStat implements kernfs.Inode.SetStat. 463 func (*memInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { 464 return linuxerr.EPERM 465 } 466 467 var _ vfs.FileDescriptionImpl = (*memFD)(nil) 468 469 // memFD implements vfs.FileDescriptionImpl for /proc/[pid]/mem. 470 // 471 // +stateify savable 472 type memFD struct { 473 vfsfd vfs.FileDescription 474 vfs.FileDescriptionDefaultImpl 475 vfs.LockFD 476 477 inode *memInode 478 479 // mu guards the fields below. 480 mu sync.Mutex `state:"nosave"` 481 offset int64 482 } 483 484 // Init initializes memFD. 485 func (fd *memFD) Init(m *vfs.Mount, d *kernfs.Dentry, inode *memInode, flags uint32) error { 486 fd.LockFD.Init(&inode.locks) 487 if err := fd.vfsfd.Init(fd, flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { 488 return err 489 } 490 fd.inode = inode 491 return nil 492 } 493 494 // Seek implements vfs.FileDescriptionImpl.Seek. 495 func (fd *memFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { 496 fd.mu.Lock() 497 defer fd.mu.Unlock() 498 switch whence { 499 case linux.SEEK_SET: 500 case linux.SEEK_CUR: 501 offset += fd.offset 502 default: 503 return 0, linuxerr.EINVAL 504 } 505 if offset < 0 { 506 return 0, linuxerr.EINVAL 507 } 508 fd.offset = offset 509 return offset, nil 510 } 511 512 // PRead implements vfs.FileDescriptionImpl.PRead. 513 func (fd *memFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { 514 if dst.NumBytes() == 0 { 515 return 0, nil 516 } 517 m, err := getMMIncRef(fd.inode.task) 518 if err != nil { 519 return 0, err 520 } 521 defer m.DecUsers(ctx) 522 // Buffer the read data because of MM locks 523 buf := make([]byte, dst.NumBytes()) 524 n, readErr := m.CopyIn(ctx, hostarch.Addr(offset), buf, usermem.IOOpts{IgnorePermissions: true}) 525 if n > 0 { 526 if _, err := dst.CopyOut(ctx, buf[:n]); err != nil { 527 return 0, linuxerr.EFAULT 528 } 529 return int64(n), nil 530 } 531 if readErr != nil { 532 return 0, linuxerr.EIO 533 } 534 return 0, nil 535 } 536 537 // Read implements vfs.FileDescriptionImpl.Read. 538 func (fd *memFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { 539 fd.mu.Lock() 540 n, err := fd.PRead(ctx, dst, fd.offset, opts) 541 fd.offset += n 542 fd.mu.Unlock() 543 return n, err 544 } 545 546 // Stat implements vfs.FileDescriptionImpl.Stat. 547 func (fd *memFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { 548 fs := fd.vfsfd.VirtualDentry().Mount().Filesystem() 549 return fd.inode.Stat(ctx, fs, opts) 550 } 551 552 // SetStat implements vfs.FileDescriptionImpl.SetStat. 553 func (fd *memFD) SetStat(context.Context, vfs.SetStatOptions) error { 554 return linuxerr.EPERM 555 } 556 557 // Release implements vfs.FileDescriptionImpl.Release. 558 func (fd *memFD) Release(context.Context) {} 559 560 // limitsData implements vfs.DynamicBytesSource for /proc/[pid]/limits. 561 // 562 // +stateify savable 563 type limitsData struct { 564 kernfs.DynamicBytesFile 565 566 task *kernel.Task 567 } 568 569 func (d *limitsData) Generate(ctx context.Context, buf *bytes.Buffer) error { 570 taskLimits := d.task.Limits() 571 // formatting matches the kernel output from linux/fs/proc/base.c:proc_pid_limits() 572 fmt.Fprintf(buf, "Limit Soft Limit Hard Limit Units \n") 573 for _, lt := range limits.AllLimitTypes { 574 fmt.Fprintf(buf, "%-25s ", lt.Name()) 575 576 l := taskLimits.Get(lt) 577 if l.Cur == limits.Infinity { 578 fmt.Fprintf(buf, "%-20s ", "unlimited") 579 } else { 580 fmt.Fprintf(buf, "%-20d ", l.Cur) 581 } 582 583 if l.Max == limits.Infinity { 584 fmt.Fprintf(buf, "%-20s ", "unlimited") 585 } else { 586 fmt.Fprintf(buf, "%-20d ", l.Max) 587 } 588 589 if u := lt.Unit(); u != "" { 590 fmt.Fprintf(buf, "%-10s", u) 591 } 592 593 buf.WriteByte('\n') 594 } 595 return nil 596 } 597 598 // mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps. 599 // 600 // +stateify savable 601 type mapsData struct { 602 kernfs.DynamicBytesFile 603 604 task *kernel.Task 605 } 606 607 var _ dynamicInode = (*mapsData)(nil) 608 609 // Generate implements vfs.DynamicBytesSource.Generate. 610 func (d *mapsData) Generate(ctx context.Context, buf *bytes.Buffer) error { 611 if mm := getMM(d.task); mm != nil { 612 mm.ReadMapsDataInto(ctx, mm.MapsCallbackFuncForBuffer(buf)) 613 } 614 return nil 615 } 616 617 // smapsData implements vfs.DynamicBytesSource for /proc/[pid]/smaps. 618 // 619 // +stateify savable 620 type smapsData struct { 621 kernfs.DynamicBytesFile 622 623 task *kernel.Task 624 } 625 626 var _ dynamicInode = (*smapsData)(nil) 627 628 // Generate implements vfs.DynamicBytesSource.Generate. 629 func (d *smapsData) Generate(ctx context.Context, buf *bytes.Buffer) error { 630 if mm := getMM(d.task); mm != nil { 631 mm.ReadSmapsDataInto(ctx, buf) 632 } 633 return nil 634 } 635 636 // +stateify savable 637 type taskStatData struct { 638 kernfs.DynamicBytesFile 639 640 task *kernel.Task 641 642 // If tgstats is true, accumulate fault stats (not implemented) and CPU 643 // time across all tasks in t's thread group. 644 tgstats bool 645 646 // pidns is the PID namespace associated with the proc filesystem that 647 // includes the file using this statData. 648 pidns *kernel.PIDNamespace 649 } 650 651 var _ dynamicInode = (*taskStatData)(nil) 652 653 // Generate implements vfs.DynamicBytesSource.Generate. 654 func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error { 655 fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.task)) 656 fmt.Fprintf(buf, "(%s) ", s.task.Name()) 657 fmt.Fprintf(buf, "%c ", s.task.StateStatus()[0]) 658 ppid := kernel.ThreadID(0) 659 if parent := s.task.Parent(); parent != nil { 660 ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) 661 } 662 fmt.Fprintf(buf, "%d ", ppid) 663 fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.task.ThreadGroup().ProcessGroup())) 664 fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.task.ThreadGroup().Session())) 665 fmt.Fprintf(buf, "0 0 " /* tty_nr tpgid */) 666 fmt.Fprintf(buf, "0 " /* flags */) 667 fmt.Fprintf(buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */) 668 var cputime usage.CPUStats 669 if s.tgstats { 670 cputime = s.task.ThreadGroup().CPUStats() 671 } else { 672 cputime = s.task.CPUStats() 673 } 674 fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) 675 cputime = s.task.ThreadGroup().JoinedChildCPUStats() 676 fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) 677 fmt.Fprintf(buf, "%d %d ", s.task.Priority(), s.task.Niceness()) 678 fmt.Fprintf(buf, "%d ", s.task.ThreadGroup().Count()) 679 680 // itrealvalue. Since kernel 2.6.17, this field is no longer 681 // maintained, and is hard coded as 0. 682 fmt.Fprintf(buf, "0 ") 683 684 // Start time is relative to boot time, expressed in clock ticks. 685 fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.task.StartTime().Sub(s.task.Kernel().Timekeeper().BootTime()))) 686 687 var vss, rss uint64 688 if mm := getMM(s.task); mm != nil { 689 vss = mm.VirtualMemorySize() 690 rss = mm.ResidentSetSize() 691 } 692 fmt.Fprintf(buf, "%d %d ", vss, rss/hostarch.PageSize) 693 694 // rsslim. 695 fmt.Fprintf(buf, "%d ", s.task.ThreadGroup().Limits().Get(limits.Rss).Cur) 696 697 fmt.Fprintf(buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */) 698 fmt.Fprintf(buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */) 699 fmt.Fprintf(buf, "0 0 " /* nswap cnswap */) 700 terminationSignal := linux.Signal(0) 701 if s.task == s.task.ThreadGroup().Leader() { 702 terminationSignal = s.task.ThreadGroup().TerminationSignal() 703 } 704 fmt.Fprintf(buf, "%d ", terminationSignal) 705 fmt.Fprintf(buf, "0 0 0 " /* processor rt_priority policy */) 706 fmt.Fprintf(buf, "0 0 0 " /* delayacct_blkio_ticks guest_time cguest_time */) 707 fmt.Fprintf(buf, "0 0 0 0 0 0 0 " /* start_data end_data start_brk arg_start arg_end env_start env_end */) 708 fmt.Fprintf(buf, "0\n" /* exit_code */) 709 710 return nil 711 } 712 713 // statmData implements vfs.DynamicBytesSource for /proc/[pid]/statm. 714 // 715 // +stateify savable 716 type statmData struct { 717 kernfs.DynamicBytesFile 718 719 task *kernel.Task 720 } 721 722 var _ dynamicInode = (*statmData)(nil) 723 724 // Generate implements vfs.DynamicBytesSource.Generate. 725 func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error { 726 var vss, rss uint64 727 if mm := getMM(s.task); mm != nil { 728 vss = mm.VirtualMemorySize() 729 rss = mm.ResidentSetSize() 730 } 731 fmt.Fprintf(buf, "%d %d 0 0 0 0 0\n", vss/hostarch.PageSize, rss/hostarch.PageSize) 732 return nil 733 } 734 735 // statusInode implements kernfs.Inode for /proc/[pid]/status. 736 // 737 // +stateify savable 738 type statusInode struct { 739 kernfs.InodeAttrs 740 kernfs.InodeNoStatFS 741 kernfs.InodeNoopRefCount 742 kernfs.InodeNotAnonymous 743 kernfs.InodeNotDirectory 744 kernfs.InodeNotSymlink 745 kernfs.InodeWatches 746 747 task *kernel.Task 748 pidns *kernel.PIDNamespace 749 locks vfs.FileLocks 750 } 751 752 // statusFD implements vfs.FileDescriptionImpl and vfs.DynamicByteSource for 753 // /proc/[pid]/status. 754 // 755 // +stateify savable 756 type statusFD struct { 757 statusFDLowerBase 758 vfs.DynamicBytesFileDescriptionImpl 759 vfs.LockFD 760 761 vfsfd vfs.FileDescription 762 763 inode *statusInode 764 task *kernel.Task 765 pidns *kernel.PIDNamespace 766 userns *auth.UserNamespace // equivalent to struct file::f_cred::user_ns 767 } 768 769 // statusFDLowerBase is a dumb hack to ensure that statusFD prefers 770 // vfs.DynamicBytesFileDescriptionImpl methods to vfs.FileDescriptinDefaultImpl 771 // methods. 772 // 773 // +stateify savable 774 type statusFDLowerBase struct { 775 vfs.FileDescriptionDefaultImpl 776 } 777 778 func (fs *filesystem) newStatusInode(ctx context.Context, task *kernel.Task, pidns *kernel.PIDNamespace, ino uint64, perm linux.FileMode) kernfs.Inode { 779 // Note: credentials are overridden by taskOwnedInode. 780 inode := &statusInode{ 781 task: task, 782 pidns: pidns, 783 } 784 inode.InodeAttrs.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeRegular|perm) 785 return &taskOwnedInode{Inode: inode, owner: task} 786 } 787 788 // Open implements kernfs.Inode.Open. 789 func (s *statusInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { 790 fd := &statusFD{ 791 inode: s, 792 task: s.task, 793 pidns: s.pidns, 794 userns: rp.Credentials().UserNamespace, 795 } 796 fd.LockFD.Init(&s.locks) 797 if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { 798 return nil, err 799 } 800 fd.DynamicBytesFileDescriptionImpl.Init(&fd.vfsfd, fd) 801 return &fd.vfsfd, nil 802 } 803 804 // SetStat implements kernfs.Inode.SetStat. 805 func (*statusInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { 806 return linuxerr.EPERM 807 } 808 809 // Release implements vfs.FileDescriptionImpl.Release. 810 func (s *statusFD) Release(ctx context.Context) { 811 } 812 813 // Stat implements vfs.FileDescriptionImpl.Stat. 814 func (s *statusFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { 815 fs := s.vfsfd.VirtualDentry().Mount().Filesystem() 816 return s.inode.Stat(ctx, fs, opts) 817 } 818 819 // SetStat implements vfs.FileDescriptionImpl.SetStat. 820 func (s *statusFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { 821 return linuxerr.EPERM 822 } 823 824 // Generate implements vfs.DynamicBytesSource.Generate. 825 func (s *statusFD) Generate(ctx context.Context, buf *bytes.Buffer) error { 826 fmt.Fprintf(buf, "Name:\t%s\n", s.task.Name()) 827 fmt.Fprintf(buf, "State:\t%s\n", s.task.StateStatus()) 828 fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.task.ThreadGroup())) 829 fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.task)) 830 831 ppid := kernel.ThreadID(0) 832 if parent := s.task.Parent(); parent != nil { 833 ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) 834 } 835 fmt.Fprintf(buf, "PPid:\t%d\n", ppid) 836 837 tpid := kernel.ThreadID(0) 838 if tracer := s.task.Tracer(); tracer != nil { 839 tpid = s.pidns.IDOfTask(tracer) 840 } 841 fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid) 842 843 creds := s.task.Credentials() 844 ruid := creds.RealKUID.In(s.userns).OrOverflow() 845 euid := creds.EffectiveKUID.In(s.userns).OrOverflow() 846 suid := creds.SavedKUID.In(s.userns).OrOverflow() 847 rgid := creds.RealKGID.In(s.userns).OrOverflow() 848 egid := creds.EffectiveKGID.In(s.userns).OrOverflow() 849 sgid := creds.SavedKGID.In(s.userns).OrOverflow() 850 var fds int 851 var vss, rss, data uint64 852 s.task.WithMuLocked(func(t *kernel.Task) { 853 if fdTable := t.FDTable(); fdTable != nil { 854 fds = fdTable.CurrentMaxFDs() 855 } 856 }) 857 if mm := getMM(s.task); mm != nil { 858 vss = mm.VirtualMemorySize() 859 rss = mm.ResidentSetSize() 860 data = mm.VirtualDataSize() 861 } 862 // Filesystem user/group IDs aren't implemented; effective UID/GID are used 863 // instead. 864 fmt.Fprintf(buf, "Uid:\t%d\t%d\t%d\t%d\n", ruid, euid, suid, euid) 865 fmt.Fprintf(buf, "Gid:\t%d\t%d\t%d\t%d\n", rgid, egid, sgid, egid) 866 fmt.Fprintf(buf, "FDSize:\t%d\n", fds) 867 buf.WriteString("Groups:\t") 868 // There is a space between each pair of supplemental GIDs, as well as an 869 // unconditional trailing space that some applications actually depend on. 870 var sep string 871 for _, kgid := range creds.ExtraKGIDs { 872 fmt.Fprintf(buf, "%s%d", sep, kgid.In(s.userns).OrOverflow()) 873 sep = " " 874 } 875 buf.WriteString(" \n") 876 877 fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10) 878 fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10) 879 fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10) 880 881 fmt.Fprintf(buf, "Threads:\t%d\n", s.task.ThreadGroup().Count()) 882 fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps) 883 fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps) 884 fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps) 885 fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps) 886 fmt.Fprintf(buf, "Seccomp:\t%d\n", s.task.SeccompMode()) 887 // We unconditionally report a single NUMA node. See 888 // pkg/sentry/syscalls/linux/sys_mempolicy.go. 889 fmt.Fprintf(buf, "Mems_allowed:\t1\n") 890 fmt.Fprintf(buf, "Mems_allowed_list:\t0\n") 891 return nil 892 } 893 894 // ioUsage is the /proc/[pid]/io and /proc/[pid]/task/[tid]/io data provider. 895 type ioUsage interface { 896 // IOUsage returns the io usage data. 897 IOUsage() *usage.IO 898 } 899 900 // +stateify savable 901 type ioData struct { 902 kernfs.DynamicBytesFile 903 904 ioUsage 905 } 906 907 var _ dynamicInode = (*ioData)(nil) 908 909 // Generate implements vfs.DynamicBytesSource.Generate. 910 func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error { 911 io := usage.IO{} 912 io.Accumulate(i.IOUsage()) 913 914 fmt.Fprintf(buf, "char: %d\n", io.CharsRead.RacyLoad()) 915 fmt.Fprintf(buf, "wchar: %d\n", io.CharsWritten.RacyLoad()) 916 fmt.Fprintf(buf, "syscr: %d\n", io.ReadSyscalls.RacyLoad()) 917 fmt.Fprintf(buf, "syscw: %d\n", io.WriteSyscalls.RacyLoad()) 918 fmt.Fprintf(buf, "read_bytes: %d\n", io.BytesRead.RacyLoad()) 919 fmt.Fprintf(buf, "write_bytes: %d\n", io.BytesWritten.RacyLoad()) 920 fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled.RacyLoad()) 921 return nil 922 } 923 924 // oomScoreAdj is a stub of the /proc/<pid>/oom_score_adj file. 925 // 926 // +stateify savable 927 type oomScoreAdj struct { 928 kernfs.DynamicBytesFile 929 930 task *kernel.Task 931 } 932 933 var _ vfs.WritableDynamicBytesSource = (*oomScoreAdj)(nil) 934 935 // Generate implements vfs.DynamicBytesSource.Generate. 936 func (o *oomScoreAdj) Generate(ctx context.Context, buf *bytes.Buffer) error { 937 if o.task.ExitState() == kernel.TaskExitDead { 938 return linuxerr.ESRCH 939 } 940 fmt.Fprintf(buf, "%d\n", o.task.OOMScoreAdj()) 941 return nil 942 } 943 944 // Write implements vfs.WritableDynamicBytesSource.Write. 945 func (o *oomScoreAdj) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { 946 if src.NumBytes() == 0 { 947 return 0, nil 948 } 949 950 // Limit input size so as not to impact performance if input size is large. 951 src = src.TakeFirst(hostarch.PageSize - 1) 952 953 str, err := usermem.CopyStringIn(ctx, src.IO, src.Addrs.Head().Start, int(src.Addrs.Head().Length()), src.Opts) 954 if err != nil && err != linuxerr.ENAMETOOLONG { 955 return 0, err 956 } 957 958 str = strings.TrimSpace(str) 959 v, err := strconv.ParseInt(str, 0, 32) 960 if err != nil { 961 return 0, linuxerr.EINVAL 962 } 963 964 if o.task.ExitState() == kernel.TaskExitDead { 965 return 0, linuxerr.ESRCH 966 } 967 if err := o.task.SetOOMScoreAdj(int32(v)); err != nil { 968 return 0, err 969 } 970 971 return src.NumBytes(), nil 972 } 973 974 // exeSymlink is an symlink for the /proc/[pid]/exe file. 975 // 976 // +stateify savable 977 type exeSymlink struct { 978 implStatFS 979 kernfs.InodeAttrs 980 kernfs.InodeNoopRefCount 981 kernfs.InodeNotAnonymous 982 kernfs.InodeSymlink 983 kernfs.InodeWatches 984 985 fs *filesystem 986 task *kernel.Task 987 } 988 989 var _ kernfs.Inode = (*exeSymlink)(nil) 990 991 func (fs *filesystem) newExeSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode { 992 inode := &exeSymlink{ 993 fs: fs, 994 task: task, 995 } 996 inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) 997 return inode 998 } 999 1000 // Readlink implements kernfs.Inode.Readlink. 1001 func (s *exeSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { 1002 exec, _, err := s.Getlink(ctx, nil) 1003 if err != nil { 1004 return "", err 1005 } 1006 defer s.fs.SafeDecRef(ctx, exec) 1007 1008 root := vfs.RootFromContext(ctx) 1009 if !root.Ok() { 1010 panic("procfs Readlink requires context with root value") 1011 } 1012 defer s.fs.SafeDecRef(ctx, root) 1013 1014 vfsObj := exec.Mount().Filesystem().VirtualFilesystem() 1015 name, _ := vfsObj.PathnameWithDeleted(ctx, root, exec) 1016 return name, nil 1017 } 1018 1019 // Getlink implements kernfs.Inode.Getlink. 1020 func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { 1021 if !kernel.ContextCanTrace(ctx, s.task, false) { 1022 return vfs.VirtualDentry{}, "", linuxerr.EACCES 1023 } 1024 if err := checkTaskState(s.task); err != nil { 1025 return vfs.VirtualDentry{}, "", err 1026 } 1027 1028 mm := getMM(s.task) 1029 if mm == nil { 1030 return vfs.VirtualDentry{}, "", linuxerr.EACCES 1031 } 1032 1033 // The MemoryManager may be destroyed, in which case 1034 // MemoryManager.destroy will simply set the executable to nil 1035 // (with locks held). 1036 exec := mm.Executable() 1037 if exec == nil { 1038 return vfs.VirtualDentry{}, "", linuxerr.ESRCH 1039 } 1040 defer exec.DecRef(ctx) 1041 1042 vd := exec.VirtualDentry() 1043 vd.IncRef() 1044 return vd, "", nil 1045 } 1046 1047 // cwdSymlink is an symlink for the /proc/[pid]/cwd file. 1048 // 1049 // +stateify savable 1050 type cwdSymlink struct { 1051 implStatFS 1052 kernfs.InodeAttrs 1053 kernfs.InodeNoopRefCount 1054 kernfs.InodeNotAnonymous 1055 kernfs.InodeSymlink 1056 kernfs.InodeWatches 1057 1058 fs *filesystem 1059 task *kernel.Task 1060 } 1061 1062 var _ kernfs.Inode = (*cwdSymlink)(nil) 1063 1064 func (fs *filesystem) newCwdSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode { 1065 inode := &cwdSymlink{ 1066 fs: fs, 1067 task: task, 1068 } 1069 inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) 1070 return inode 1071 } 1072 1073 // Readlink implements kernfs.Inode.Readlink. 1074 func (s *cwdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { 1075 cwd, _, err := s.Getlink(ctx, nil) 1076 if err != nil { 1077 return "", err 1078 } 1079 defer s.fs.SafeDecRef(ctx, cwd) 1080 1081 root := vfs.RootFromContext(ctx) 1082 if !root.Ok() { 1083 panic("procfs Readlink requires context with root value") 1084 } 1085 defer s.fs.SafeDecRef(ctx, root) 1086 1087 vfsObj := cwd.Mount().Filesystem().VirtualFilesystem() 1088 name, _ := vfsObj.PathnameWithDeleted(ctx, root, cwd) 1089 return name, nil 1090 } 1091 1092 // Getlink implements kernfs.Inode.Getlink. 1093 func (s *cwdSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { 1094 if !kernel.ContextCanTrace(ctx, s.task, false) { 1095 return vfs.VirtualDentry{}, "", linuxerr.EACCES 1096 } 1097 if err := checkTaskState(s.task); err != nil { 1098 return vfs.VirtualDentry{}, "", err 1099 } 1100 cwd := s.task.FSContext().WorkingDirectory() 1101 if !cwd.Ok() { 1102 // It could have raced with process deletion. 1103 return vfs.VirtualDentry{}, "", linuxerr.ESRCH 1104 } 1105 // The reference is transferred to the caller. 1106 return cwd, "", nil 1107 } 1108 1109 // rootSymlink is an symlink for the /proc/[pid]/root file. 1110 // 1111 // +stateify savable 1112 type rootSymlink struct { 1113 implStatFS 1114 kernfs.InodeAttrs 1115 kernfs.InodeNoopRefCount 1116 kernfs.InodeNotAnonymous 1117 kernfs.InodeSymlink 1118 kernfs.InodeWatches 1119 1120 fs *filesystem 1121 task *kernel.Task 1122 } 1123 1124 var _ kernfs.Inode = (*rootSymlink)(nil) 1125 1126 func (fs *filesystem) newRootSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode { 1127 inode := &rootSymlink{ 1128 fs: fs, 1129 task: task, 1130 } 1131 inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) 1132 return inode 1133 } 1134 1135 // Readlink implements kernfs.Inode.Readlink. 1136 func (s *rootSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { 1137 root, _, err := s.Getlink(ctx, nil) 1138 if err != nil { 1139 return "", err 1140 } 1141 defer s.fs.SafeDecRef(ctx, root) 1142 1143 vfsRoot := vfs.RootFromContext(ctx) 1144 if !vfsRoot.Ok() { 1145 panic("procfs Readlink requires context with root value") 1146 } 1147 defer s.fs.SafeDecRef(ctx, vfsRoot) 1148 1149 vfsObj := root.Mount().Filesystem().VirtualFilesystem() 1150 name, _ := vfsObj.PathnameWithDeleted(ctx, vfsRoot, root) 1151 return name, nil 1152 } 1153 1154 // Getlink implements kernfs.Inode.Getlink. 1155 func (s *rootSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { 1156 if !kernel.ContextCanTrace(ctx, s.task, false) { 1157 return vfs.VirtualDentry{}, "", linuxerr.EACCES 1158 } 1159 if err := checkTaskState(s.task); err != nil { 1160 return vfs.VirtualDentry{}, "", err 1161 } 1162 root := s.task.FSContext().RootDirectory() 1163 if !root.Ok() { 1164 // It could have raced with process deletion. 1165 return vfs.VirtualDentry{}, "", linuxerr.ESRCH 1166 } 1167 // The reference is transferred to the caller. 1168 return root, "", nil 1169 } 1170 1171 // mountInfoData is used to implement /proc/[pid]/mountinfo. 1172 // 1173 // +stateify savable 1174 type mountInfoData struct { 1175 kernfs.DynamicBytesFile 1176 1177 fs *filesystem 1178 task *kernel.Task 1179 } 1180 1181 var _ dynamicInode = (*mountInfoData)(nil) 1182 1183 // Generate implements vfs.DynamicBytesSource.Generate. 1184 func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { 1185 var fsctx *kernel.FSContext 1186 i.task.WithMuLocked(func(t *kernel.Task) { 1187 fsctx = t.FSContext() 1188 }) 1189 if fsctx == nil { 1190 // The task has been destroyed. Nothing to show here. 1191 return nil 1192 } 1193 rootDir := fsctx.RootDirectory() 1194 if !rootDir.Ok() { 1195 // Root has been destroyed. Don't try to read mounts. 1196 return nil 1197 } 1198 defer i.fs.SafeDecRef(ctx, rootDir) 1199 i.task.Kernel().VFS().GenerateProcMountInfo(ctx, rootDir, buf) 1200 return nil 1201 } 1202 1203 // mountsData is used to implement /proc/[pid]/mounts. 1204 // 1205 // +stateify savable 1206 type mountsData struct { 1207 kernfs.DynamicBytesFile 1208 1209 fs *filesystem 1210 task *kernel.Task 1211 } 1212 1213 var _ dynamicInode = (*mountsData)(nil) 1214 1215 // Generate implements vfs.DynamicBytesSource.Generate. 1216 func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error { 1217 var fsctx *kernel.FSContext 1218 i.task.WithMuLocked(func(t *kernel.Task) { 1219 fsctx = t.FSContext() 1220 }) 1221 if fsctx == nil { 1222 // The task has been destroyed. Nothing to show here. 1223 return nil 1224 } 1225 rootDir := fsctx.RootDirectory() 1226 if !rootDir.Ok() { 1227 // Root has been destroyed. Don't try to read mounts. 1228 return nil 1229 } 1230 defer i.fs.SafeDecRef(ctx, rootDir) 1231 i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf) 1232 return nil 1233 } 1234 1235 // +stateify savable 1236 type namespaceSymlink struct { 1237 kernfs.StaticSymlink 1238 1239 task *kernel.Task 1240 nsType int 1241 } 1242 1243 func (fs *filesystem) newNamespaceSymlink(ctx context.Context, task *kernel.Task, ino uint64, nsType int) kernfs.Inode { 1244 inode := &namespaceSymlink{task: task, nsType: nsType} 1245 1246 // Note: credentials are overridden by taskOwnedInode. 1247 inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, "") 1248 1249 taskInode := &taskOwnedInode{Inode: inode, owner: task} 1250 return taskInode 1251 } 1252 1253 func (fs *filesystem) newPIDNamespaceSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode { 1254 target := fmt.Sprintf("pid:[%d]", task.PIDNamespace().ID()) 1255 1256 inode := &namespaceSymlink{task: task} 1257 // Note: credentials are overridden by taskOwnedInode. 1258 inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target) 1259 1260 taskInode := &taskOwnedInode{Inode: inode, owner: task} 1261 return taskInode 1262 } 1263 1264 func (fs *filesystem) newFakeNamespaceSymlink(ctx context.Context, task *kernel.Task, ino uint64, ns string) kernfs.Inode { 1265 // Namespace symlinks should contain the namespace name and the inode number 1266 // for the namespace instance, so for example user:[123456]. We currently fake 1267 // the inode number by sticking the symlink inode in its place. 1268 target := fmt.Sprintf("%s:[%d]", ns, ino) 1269 1270 inode := &namespaceSymlink{task: task} 1271 // Note: credentials are overridden by taskOwnedInode. 1272 inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target) 1273 1274 taskInode := &taskOwnedInode{Inode: inode, owner: task} 1275 return taskInode 1276 } 1277 1278 func (s *namespaceSymlink) getInode(t *kernel.Task) *nsfs.Inode { 1279 switch s.nsType { 1280 case linux.CLONE_NEWNET: 1281 netns := t.GetNetworkNamespace() 1282 if netns == nil { 1283 return nil 1284 } 1285 return netns.GetInode() 1286 case linux.CLONE_NEWIPC: 1287 if ipcns := t.GetIPCNamespace(); ipcns != nil { 1288 return ipcns.GetInode() 1289 } 1290 return nil 1291 case linux.CLONE_NEWUTS: 1292 if utsns := t.GetUTSNamespace(); utsns != nil { 1293 return utsns.GetInode() 1294 } 1295 return nil 1296 case linux.CLONE_NEWNS: 1297 mntns := t.GetMountNamespace() 1298 if mntns == nil { 1299 return nil 1300 } 1301 inode, _ := mntns.Refs.(*nsfs.Inode) 1302 return inode 1303 default: 1304 panic("unknown namespace") 1305 } 1306 } 1307 1308 // Readlink implements kernfs.Inode.Readlink. 1309 func (s *namespaceSymlink) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { 1310 if err := checkTaskState(s.task); err != nil { 1311 return "", err 1312 } 1313 if s.nsType != 0 { 1314 inode := s.getInode(s.task) 1315 if inode == nil { 1316 return "", linuxerr.ENOENT 1317 } 1318 target := inode.Name() 1319 inode.DecRef(ctx) 1320 return target, nil 1321 } 1322 return s.StaticSymlink.Readlink(ctx, mnt) 1323 } 1324 1325 // Getlink implements kernfs.Inode.Getlink. 1326 func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { 1327 if err := checkTaskState(s.task); err != nil { 1328 return vfs.VirtualDentry{}, "", err 1329 } 1330 1331 if s.nsType != 0 { 1332 inode := s.getInode(s.task) 1333 if inode == nil { 1334 return vfs.VirtualDentry{}, "", linuxerr.ENOENT 1335 } 1336 defer inode.DecRef(ctx) 1337 return inode.VirtualDentry(), "", nil 1338 } 1339 // Create a synthetic inode to represent the namespace. 1340 fs := mnt.Filesystem().Impl().(*filesystem) 1341 nsInode := &namespaceInode{} 1342 nsInode.Init(ctx, auth.CredentialsFromContext(ctx), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0444) 1343 dentry := &kernfs.Dentry{} 1344 dentry.Init(&fs.Filesystem, nsInode) 1345 vd := vfs.MakeVirtualDentry(mnt, dentry.VFSDentry()) 1346 // Only IncRef vd.Mount() because vd.Dentry() already holds a ref of 1. 1347 mnt.IncRef() 1348 return vd, "", nil 1349 } 1350 1351 // namespaceInode is a synthetic inode created to represent a namespace in 1352 // /proc/[pid]/ns/*. 1353 // 1354 // +stateify savable 1355 type namespaceInode struct { 1356 implStatFS 1357 kernfs.InodeAttrs 1358 kernfs.InodeNoopRefCount 1359 kernfs.InodeNotAnonymous 1360 kernfs.InodeNotDirectory 1361 kernfs.InodeNotSymlink 1362 kernfs.InodeWatches 1363 1364 locks vfs.FileLocks 1365 } 1366 1367 var _ kernfs.Inode = (*namespaceInode)(nil) 1368 1369 // Init initializes a namespace inode. 1370 func (i *namespaceInode) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) { 1371 if perm&^linux.PermissionsMask != 0 { 1372 panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) 1373 } 1374 i.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeRegular|perm) 1375 } 1376 1377 // Open implements kernfs.Inode.Open. 1378 func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { 1379 fd := &namespaceFD{inode: i} 1380 i.IncRef() 1381 fd.LockFD.Init(&i.locks) 1382 if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { 1383 return nil, err 1384 } 1385 return &fd.vfsfd, nil 1386 } 1387 1388 // namespace FD is a synthetic file that represents a namespace in 1389 // /proc/[pid]/ns/*. 1390 // 1391 // +stateify savable 1392 type namespaceFD struct { 1393 vfs.FileDescriptionDefaultImpl 1394 vfs.LockFD 1395 1396 vfsfd vfs.FileDescription 1397 inode *namespaceInode 1398 } 1399 1400 var _ vfs.FileDescriptionImpl = (*namespaceFD)(nil) 1401 1402 // Stat implements vfs.FileDescriptionImpl.Stat. 1403 func (fd *namespaceFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { 1404 vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem() 1405 return fd.inode.Stat(ctx, vfs, opts) 1406 } 1407 1408 // SetStat implements vfs.FileDescriptionImpl.SetStat. 1409 func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { 1410 vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem() 1411 creds := auth.CredentialsFromContext(ctx) 1412 return fd.inode.SetStat(ctx, vfs, creds, opts) 1413 } 1414 1415 // Release implements vfs.FileDescriptionImpl.Release. 1416 func (fd *namespaceFD) Release(ctx context.Context) { 1417 fd.inode.DecRef(ctx) 1418 } 1419 1420 // taskCgroupData generates data for /proc/[pid]/cgroup. 1421 // 1422 // +stateify savable 1423 type taskCgroupData struct { 1424 dynamicBytesFileSetAttr 1425 task *kernel.Task 1426 } 1427 1428 var _ dynamicInode = (*taskCgroupData)(nil) 1429 1430 // Generate implements vfs.DynamicBytesSource.Generate. 1431 func (d *taskCgroupData) Generate(ctx context.Context, buf *bytes.Buffer) error { 1432 // When a task is existing on Linux, a task's cgroup set is cleared and 1433 // reset to the initial cgroup set, which is essentially the set of root 1434 // cgroups. Because of this, the /proc/<pid>/cgroup file is always readable 1435 // on Linux throughout a task's lifetime. 1436 // 1437 // The sentry removes tasks from cgroups during the exit process, but 1438 // doesn't move them into an initial cgroup set, so partway through task 1439 // exit this file show a task is in no cgroups, which is incorrect. Instead, 1440 // once a task has left its cgroups, we return an error. 1441 if d.task.ExitState() >= kernel.TaskExitInitiated { 1442 return linuxerr.ESRCH 1443 } 1444 1445 d.task.GenerateProcTaskCgroup(buf) 1446 return nil 1447 } 1448 1449 // childrenData implements vfs.DynamicBytesSource for /proc/[pid]/task/[tid]/children. 1450 // 1451 // +stateify savable 1452 type childrenData struct { 1453 kernfs.DynamicBytesFile 1454 1455 task *kernel.Task 1456 1457 // pidns is the PID namespace associated with the proc filesystem that 1458 // includes the file using this childrenData. 1459 pidns *kernel.PIDNamespace 1460 } 1461 1462 // Generate implements vfs.DynamicBytesSource.Generate. 1463 func (d *childrenData) Generate(ctx context.Context, buf *bytes.Buffer) error { 1464 children := d.task.Children() 1465 var childrenTIDs []int 1466 for childTask := range children { 1467 childrenTIDs = append(childrenTIDs, int(d.pidns.IDOfTask(childTask))) 1468 } 1469 1470 // The TIDs need to be in sorted order in accordance with the Linux implementation. 1471 sort.Ints(childrenTIDs) 1472 1473 for _, childrenTID := range childrenTIDs { 1474 // It contains a space-separated list of child tasks of the `task`. 1475 // Each task is represented by its TID. 1476 fmt.Fprintf(buf, "%d ", childrenTID) 1477 } 1478 1479 return nil 1480 }