github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/fsimpl/proc/task_files.go (about) 1 // Copyright 2019 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package proc 16 17 import ( 18 "bytes" 19 "fmt" 20 "io" 21 "sort" 22 "strconv" 23 "strings" 24 25 "github.com/MerlinKodo/gvisor/pkg/abi/linux" 26 "github.com/MerlinKodo/gvisor/pkg/context" 27 "github.com/MerlinKodo/gvisor/pkg/errors/linuxerr" 28 "github.com/MerlinKodo/gvisor/pkg/hostarch" 29 "github.com/MerlinKodo/gvisor/pkg/safemem" 30 "github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/kernfs" 31 "github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/nsfs" 32 "github.com/MerlinKodo/gvisor/pkg/sentry/kernel" 33 "github.com/MerlinKodo/gvisor/pkg/sentry/kernel/auth" 34 "github.com/MerlinKodo/gvisor/pkg/sentry/limits" 35 "github.com/MerlinKodo/gvisor/pkg/sentry/mm" 36 "github.com/MerlinKodo/gvisor/pkg/sentry/usage" 37 "github.com/MerlinKodo/gvisor/pkg/sentry/vfs" 38 "github.com/MerlinKodo/gvisor/pkg/sync" 39 "github.com/MerlinKodo/gvisor/pkg/usermem" 40 ) 41 42 // "There is an (arbitrary) limit on the number of lines in the file. As at 43 // Linux 3.18, the limit is five lines." - user_namespaces(7) 44 const maxIDMapLines = 5 45 46 // getMM gets the kernel task's MemoryManager. No additional reference is taken on 47 // mm here. This is safe because MemoryManager.destroy is required to leave the 48 // MemoryManager in a state where it's still usable as a DynamicBytesSource. 49 func getMM(task *kernel.Task) *mm.MemoryManager { 50 var tmm *mm.MemoryManager 51 task.WithMuLocked(func(t *kernel.Task) { 52 if mm := t.MemoryManager(); mm != nil { 53 tmm = mm 54 } 55 }) 56 return tmm 57 } 58 59 // getMMIncRef returns t's MemoryManager. If getMMIncRef succeeds, the 60 // MemoryManager's users count is incremented, and must be decremented by the 61 // caller when it is no longer in use. 62 func getMMIncRef(task *kernel.Task) (*mm.MemoryManager, error) { 63 var m *mm.MemoryManager 64 task.WithMuLocked(func(t *kernel.Task) { 65 m = t.MemoryManager() 66 }) 67 if m == nil || !m.IncUsers() { 68 return nil, io.EOF 69 } 70 return m, nil 71 } 72 73 func checkTaskState(t *kernel.Task) error { 74 switch t.ExitState() { 75 case kernel.TaskExitZombie: 76 return linuxerr.EACCES 77 case kernel.TaskExitDead: 78 return linuxerr.ESRCH 79 } 80 return nil 81 } 82 83 type bufferWriter struct { 84 buf *bytes.Buffer 85 } 86 87 // WriteFromBlocks writes up to srcs.NumBytes() bytes from srcs and returns 88 // the number of bytes written. It may return a partial write without an 89 // error (i.e. (n, nil) where 0 < n < srcs.NumBytes()). It should not 90 // return a full write with an error (i.e. srcs.NumBytes(), err) where err 91 // != nil). 92 func (w *bufferWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { 93 written := srcs.NumBytes() 94 for !srcs.IsEmpty() { 95 w.buf.Write(srcs.Head().ToSlice()) 96 srcs = srcs.Tail() 97 } 98 return written, nil 99 } 100 101 // auxvData implements vfs.DynamicBytesSource for /proc/[pid]/auxv. 102 // 103 // +stateify savable 104 type auxvData struct { 105 kernfs.DynamicBytesFile 106 107 task *kernel.Task 108 } 109 110 var _ dynamicInode = (*auxvData)(nil) 111 112 // Generate implements vfs.DynamicBytesSource.Generate. 113 func (d *auxvData) Generate(ctx context.Context, buf *bytes.Buffer) error { 114 if d.task.ExitState() == kernel.TaskExitDead { 115 return linuxerr.ESRCH 116 } 117 m, err := getMMIncRef(d.task) 118 if err != nil { 119 // Return empty file. 120 return nil 121 } 122 defer m.DecUsers(ctx) 123 124 auxv := m.Auxv() 125 // Space for buffer with AT_NULL (0) terminator at the end. 126 buf.Grow((len(auxv) + 1) * 16) 127 for _, e := range auxv { 128 var tmp [16]byte 129 hostarch.ByteOrder.PutUint64(tmp[:8], e.Key) 130 hostarch.ByteOrder.PutUint64(tmp[8:], uint64(e.Value)) 131 buf.Write(tmp[:]) 132 } 133 var atNull [16]byte 134 buf.Write(atNull[:]) 135 136 return nil 137 } 138 139 // MetadataType enumerates the types of metadata that is exposed through proc. 140 type MetadataType int 141 142 const ( 143 // Cmdline represents /proc/[pid]/cmdline. 144 Cmdline MetadataType = iota 145 146 // Environ represents /proc/[pid]/environ. 147 Environ 148 ) 149 150 // GetMetadata fetches the process's metadata of type t and writes it into 151 // buf. The process is identified by mm. 152 func GetMetadata(ctx context.Context, mm *mm.MemoryManager, buf *bytes.Buffer, t MetadataType) error { 153 // Figure out the bounds of the exec arg we are trying to read. 154 var ar hostarch.AddrRange 155 switch t { 156 case Cmdline: 157 ar = hostarch.AddrRange{ 158 Start: mm.ArgvStart(), 159 End: mm.ArgvEnd(), 160 } 161 case Environ: 162 ar = hostarch.AddrRange{ 163 Start: mm.EnvvStart(), 164 End: mm.EnvvEnd(), 165 } 166 default: 167 panic(fmt.Sprintf("unknown exec arg type %v", t)) 168 } 169 if ar.Start == 0 || ar.End == 0 { 170 // Don't attempt to read before the start/end are set up. 171 return io.EOF 172 } 173 174 // N.B. Technically this should be usermem.IOOpts.IgnorePermissions = true 175 // until Linux 4.9 (272ddc8b3735 "proc: don't use FOLL_FORCE for reading 176 // cmdline and environment"). 177 writer := &bufferWriter{buf: buf} 178 if n, err := mm.CopyInTo(ctx, hostarch.AddrRangeSeqOf(ar), writer, usermem.IOOpts{}); n == 0 || err != nil { 179 // Nothing to copy or something went wrong. 180 return err 181 } 182 183 // On Linux, if the NULL byte at the end of the argument vector has been 184 // overwritten, it continues reading the environment vector as part of 185 // the argument vector. 186 if t == Cmdline && buf.Bytes()[buf.Len()-1] != 0 { 187 if end := bytes.IndexByte(buf.Bytes(), 0); end != -1 { 188 // If we found a NULL character somewhere else in argv, truncate the 189 // return up to the NULL terminator (including it). 190 buf.Truncate(end) 191 return nil 192 } 193 194 // There is no NULL terminator in the string, return into envp. 195 arEnvv := hostarch.AddrRange{ 196 Start: mm.EnvvStart(), 197 End: mm.EnvvEnd(), 198 } 199 200 // Upstream limits the returned amount to one page of slop. 201 // https://elixir.bootlin.com/linux/v4.20/source/fs/proc/base.c#L208 202 // we'll return one page total between argv and envp because of the 203 // above page restrictions. 204 if buf.Len() >= hostarch.PageSize { 205 // Returned at least one page already, nothing else to add. 206 return nil 207 } 208 remaining := hostarch.PageSize - buf.Len() 209 if int(arEnvv.Length()) > remaining { 210 end, ok := arEnvv.Start.AddLength(uint64(remaining)) 211 if !ok { 212 return linuxerr.EFAULT 213 } 214 arEnvv.End = end 215 } 216 if _, err := mm.CopyInTo(ctx, hostarch.AddrRangeSeqOf(arEnvv), writer, usermem.IOOpts{}); err != nil { 217 return err 218 } 219 220 // Linux will return envp up to and including the first NULL character, 221 // so find it. 222 envStart := int(ar.Length()) 223 if nullIdx := bytes.IndexByte(buf.Bytes()[envStart:], 0); nullIdx != -1 { 224 buf.Truncate(envStart + nullIdx) 225 } 226 } 227 228 return nil 229 } 230 231 // metadataData implements vfs.DynamicBytesSource for proc metadata fields like: 232 // 233 // - /proc/[pid]/cmdline 234 // - /proc/[pid]/environ 235 // 236 // +stateify savable 237 type metadataData struct { 238 kernfs.DynamicBytesFile 239 240 task *kernel.Task 241 242 // arg is the type of exec argument this file contains. 243 metaType MetadataType 244 } 245 246 var _ dynamicInode = (*metadataData)(nil) 247 248 // Generate implements vfs.DynamicBytesSource.Generate. 249 func (d *metadataData) Generate(ctx context.Context, buf *bytes.Buffer) error { 250 if d.task.ExitState() == kernel.TaskExitDead { 251 return linuxerr.ESRCH 252 } 253 m, err := getMMIncRef(d.task) 254 if err != nil { 255 // Return empty file. 256 return nil 257 } 258 defer m.DecUsers(ctx) 259 return GetMetadata(ctx, m, buf, d.metaType) 260 } 261 262 // +stateify savable 263 type commInode struct { 264 kernfs.DynamicBytesFile 265 266 task *kernel.Task 267 } 268 269 func (fs *filesystem) newComm(ctx context.Context, task *kernel.Task, ino uint64, perm linux.FileMode) kernfs.Inode { 270 inode := &commInode{task: task} 271 inode.DynamicBytesFile.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, &commData{task: task}, perm) 272 return inode 273 } 274 275 func (i *commInode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { 276 // This file can always be read or written by members of the same thread 277 // group. See fs/proc/base.c:proc_tid_comm_permission. 278 t := kernel.TaskFromContext(ctx) 279 if t != nil && t.ThreadGroup() == i.task.ThreadGroup() && !ats.MayExec() { 280 return nil 281 } 282 283 return i.DynamicBytesFile.CheckPermissions(ctx, creds, ats) 284 } 285 286 // commData implements vfs.WritableDynamicBytesSource for /proc/[pid]/comm. 287 // 288 // +stateify savable 289 type commData struct { 290 kernfs.DynamicBytesFile 291 292 task *kernel.Task 293 } 294 295 var _ dynamicInode = (*commData)(nil) 296 var _ vfs.WritableDynamicBytesSource = (*commData)(nil) 297 298 // Generate implements vfs.DynamicBytesSource.Generate. 299 func (d *commData) Generate(ctx context.Context, buf *bytes.Buffer) error { 300 buf.WriteString(d.task.Name()) 301 buf.WriteString("\n") 302 return nil 303 } 304 305 // Write implements vfs.WritableDynamicBytesSource.Write. 306 func (d *commData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { 307 srclen := src.NumBytes() 308 name := make([]byte, srclen) 309 if _, err := src.CopyIn(ctx, name); err != nil { 310 return 0, err 311 } 312 313 // Only allow writes from the same thread group, otherwise return 314 // EINVAL. See fs/proc/base.c:comm_write. 315 // 316 // Note that this check exists in addition to the same-thread-group 317 // check in CheckPermissions. 318 t := kernel.TaskFromContext(ctx) 319 if t == nil || t.ThreadGroup() != d.task.ThreadGroup() { 320 return 0, linuxerr.EINVAL 321 } 322 d.task.SetName(string(name)) 323 return int64(srclen), nil 324 } 325 326 // idMapData implements vfs.WritableDynamicBytesSource for 327 // /proc/[pid]/{gid_map|uid_map}. 328 // 329 // +stateify savable 330 type idMapData struct { 331 kernfs.DynamicBytesFile 332 333 task *kernel.Task 334 gids bool 335 } 336 337 var _ dynamicInode = (*idMapData)(nil) 338 var _ vfs.WritableDynamicBytesSource = (*idMapData)(nil) 339 340 // Generate implements vfs.WritableDynamicBytesSource.Generate. 341 func (d *idMapData) Generate(ctx context.Context, buf *bytes.Buffer) error { 342 var entries []auth.IDMapEntry 343 if d.gids { 344 entries = d.task.UserNamespace().GIDMap() 345 } else { 346 entries = d.task.UserNamespace().UIDMap() 347 } 348 for _, e := range entries { 349 fmt.Fprintf(buf, "%10d %10d %10d\n", e.FirstID, e.FirstParentID, e.Length) 350 } 351 return nil 352 } 353 354 // Write implements vfs.WritableDynamicBytesSource.Write. 355 func (d *idMapData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { 356 // "In addition, the number of bytes written to the file must be less than 357 // the system page size, and the write must be performed at the start of 358 // the file ..." - user_namespaces(7) 359 srclen := src.NumBytes() 360 if srclen >= hostarch.PageSize || offset != 0 { 361 return 0, linuxerr.EINVAL 362 } 363 b := make([]byte, srclen) 364 if _, err := src.CopyIn(ctx, b); err != nil { 365 return 0, err 366 } 367 368 // Truncate from the first NULL byte. 369 var nul int64 370 nul = int64(bytes.IndexByte(b, 0)) 371 if nul == -1 { 372 nul = srclen 373 } 374 b = b[:nul] 375 // Remove the last \n. 376 if nul >= 1 && b[nul-1] == '\n' { 377 b = b[:nul-1] 378 } 379 lines := bytes.SplitN(b, []byte("\n"), maxIDMapLines+1) 380 if len(lines) > maxIDMapLines { 381 return 0, linuxerr.EINVAL 382 } 383 384 entries := make([]auth.IDMapEntry, len(lines)) 385 for i, l := range lines { 386 var e auth.IDMapEntry 387 _, err := fmt.Sscan(string(l), &e.FirstID, &e.FirstParentID, &e.Length) 388 if err != nil { 389 return 0, linuxerr.EINVAL 390 } 391 entries[i] = e 392 } 393 var err error 394 if d.gids { 395 err = d.task.UserNamespace().SetGIDMap(ctx, entries) 396 } else { 397 err = d.task.UserNamespace().SetUIDMap(ctx, entries) 398 } 399 if err != nil { 400 return 0, err 401 } 402 403 // On success, Linux's kernel/user_namespace.c:map_write() always returns 404 // count, even if fewer bytes were used. 405 return int64(srclen), nil 406 } 407 408 var _ kernfs.Inode = (*memInode)(nil) 409 410 // memInode implements kernfs.Inode for /proc/[pid]/mem. 411 // 412 // +stateify savable 413 type memInode struct { 414 kernfs.InodeAttrs 415 kernfs.InodeNoStatFS 416 kernfs.InodeNoopRefCount 417 kernfs.InodeNotAnonymous 418 kernfs.InodeNotDirectory 419 kernfs.InodeNotSymlink 420 kernfs.InodeWatches 421 422 task *kernel.Task 423 locks vfs.FileLocks 424 } 425 426 func (fs *filesystem) newMemInode(ctx context.Context, task *kernel.Task, ino uint64, perm linux.FileMode) kernfs.Inode { 427 // Note: credentials are overridden by taskOwnedInode. 428 inode := &memInode{task: task} 429 inode.init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm) 430 return &taskOwnedInode{Inode: inode, owner: task} 431 } 432 433 func (f *memInode) init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) { 434 if perm&^linux.PermissionsMask != 0 { 435 panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) 436 } 437 f.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeRegular|perm) 438 } 439 440 // Open implements kernfs.Inode.Open. 441 func (f *memInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { 442 // TODO(gvisor.dev/issue/260): Add check for PTRACE_MODE_ATTACH_FSCREDS 443 // Permission to read this file is governed by PTRACE_MODE_ATTACH_FSCREDS 444 // Since we dont implement setfsuid/setfsgid we can just use PTRACE_MODE_ATTACH 445 if !kernel.ContextCanTrace(ctx, f.task, true) { 446 return nil, linuxerr.EACCES 447 } 448 if err := checkTaskState(f.task); err != nil { 449 return nil, err 450 } 451 fd := &memFD{} 452 if err := fd.Init(rp.Mount(), d, f, opts.Flags); err != nil { 453 return nil, err 454 } 455 return &fd.vfsfd, nil 456 } 457 458 // SetStat implements kernfs.Inode.SetStat. 459 func (*memInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { 460 return linuxerr.EPERM 461 } 462 463 var _ vfs.FileDescriptionImpl = (*memFD)(nil) 464 465 // memFD implements vfs.FileDescriptionImpl for /proc/[pid]/mem. 466 // 467 // +stateify savable 468 type memFD struct { 469 vfsfd vfs.FileDescription 470 vfs.FileDescriptionDefaultImpl 471 vfs.LockFD 472 473 inode *memInode 474 475 // mu guards the fields below. 476 mu sync.Mutex `state:"nosave"` 477 offset int64 478 } 479 480 // Init initializes memFD. 481 func (fd *memFD) Init(m *vfs.Mount, d *kernfs.Dentry, inode *memInode, flags uint32) error { 482 fd.LockFD.Init(&inode.locks) 483 if err := fd.vfsfd.Init(fd, flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { 484 return err 485 } 486 fd.inode = inode 487 return nil 488 } 489 490 // Seek implements vfs.FileDescriptionImpl.Seek. 491 func (fd *memFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { 492 fd.mu.Lock() 493 defer fd.mu.Unlock() 494 switch whence { 495 case linux.SEEK_SET: 496 case linux.SEEK_CUR: 497 offset += fd.offset 498 default: 499 return 0, linuxerr.EINVAL 500 } 501 if offset < 0 { 502 return 0, linuxerr.EINVAL 503 } 504 fd.offset = offset 505 return offset, nil 506 } 507 508 // PRead implements vfs.FileDescriptionImpl.PRead. 509 func (fd *memFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { 510 if dst.NumBytes() == 0 { 511 return 0, nil 512 } 513 m, err := getMMIncRef(fd.inode.task) 514 if err != nil { 515 return 0, err 516 } 517 defer m.DecUsers(ctx) 518 // Buffer the read data because of MM locks 519 buf := make([]byte, dst.NumBytes()) 520 n, readErr := m.CopyIn(ctx, hostarch.Addr(offset), buf, usermem.IOOpts{IgnorePermissions: true}) 521 if n > 0 { 522 if _, err := dst.CopyOut(ctx, buf[:n]); err != nil { 523 return 0, linuxerr.EFAULT 524 } 525 return int64(n), nil 526 } 527 if readErr != nil { 528 return 0, linuxerr.EIO 529 } 530 return 0, nil 531 } 532 533 // Read implements vfs.FileDescriptionImpl.Read. 534 func (fd *memFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { 535 fd.mu.Lock() 536 n, err := fd.PRead(ctx, dst, fd.offset, opts) 537 fd.offset += n 538 fd.mu.Unlock() 539 return n, err 540 } 541 542 // Stat implements vfs.FileDescriptionImpl.Stat. 543 func (fd *memFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { 544 fs := fd.vfsfd.VirtualDentry().Mount().Filesystem() 545 return fd.inode.Stat(ctx, fs, opts) 546 } 547 548 // SetStat implements vfs.FileDescriptionImpl.SetStat. 549 func (fd *memFD) SetStat(context.Context, vfs.SetStatOptions) error { 550 return linuxerr.EPERM 551 } 552 553 // Release implements vfs.FileDescriptionImpl.Release. 554 func (fd *memFD) Release(context.Context) {} 555 556 // limitsData implements vfs.DynamicBytesSource for /proc/[pid]/limits. 557 // 558 // +stateify savable 559 type limitsData struct { 560 kernfs.DynamicBytesFile 561 562 task *kernel.Task 563 } 564 565 func (d *limitsData) Generate(ctx context.Context, buf *bytes.Buffer) error { 566 taskLimits := d.task.Limits() 567 // formatting matches the kernel output from linux/fs/proc/base.c:proc_pid_limits() 568 fmt.Fprintf(buf, "Limit Soft Limit Hard Limit Units \n") 569 for _, lt := range limits.AllLimitTypes { 570 fmt.Fprintf(buf, "%-25s ", lt.Name()) 571 572 l := taskLimits.Get(lt) 573 if l.Cur == limits.Infinity { 574 fmt.Fprintf(buf, "%-20s ", "unlimited") 575 } else { 576 fmt.Fprintf(buf, "%-20d ", l.Cur) 577 } 578 579 if l.Max == limits.Infinity { 580 fmt.Fprintf(buf, "%-20s ", "unlimited") 581 } else { 582 fmt.Fprintf(buf, "%-20d ", l.Max) 583 } 584 585 if u := lt.Unit(); u != "" { 586 fmt.Fprintf(buf, "%-10s", u) 587 } 588 589 buf.WriteByte('\n') 590 } 591 return nil 592 } 593 594 // mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps. 595 // 596 // +stateify savable 597 type mapsData struct { 598 kernfs.DynamicBytesFile 599 600 task *kernel.Task 601 } 602 603 var _ dynamicInode = (*mapsData)(nil) 604 605 // Generate implements vfs.DynamicBytesSource.Generate. 606 func (d *mapsData) Generate(ctx context.Context, buf *bytes.Buffer) error { 607 if mm := getMM(d.task); mm != nil { 608 mm.ReadMapsDataInto(ctx, mm.MapsCallbackFuncForBuffer(buf)) 609 } 610 return nil 611 } 612 613 // smapsData implements vfs.DynamicBytesSource for /proc/[pid]/smaps. 614 // 615 // +stateify savable 616 type smapsData struct { 617 kernfs.DynamicBytesFile 618 619 task *kernel.Task 620 } 621 622 var _ dynamicInode = (*smapsData)(nil) 623 624 // Generate implements vfs.DynamicBytesSource.Generate. 625 func (d *smapsData) Generate(ctx context.Context, buf *bytes.Buffer) error { 626 if mm := getMM(d.task); mm != nil { 627 mm.ReadSmapsDataInto(ctx, buf) 628 } 629 return nil 630 } 631 632 // +stateify savable 633 type taskStatData struct { 634 kernfs.DynamicBytesFile 635 636 task *kernel.Task 637 638 // If tgstats is true, accumulate fault stats (not implemented) and CPU 639 // time across all tasks in t's thread group. 640 tgstats bool 641 642 // pidns is the PID namespace associated with the proc filesystem that 643 // includes the file using this statData. 644 pidns *kernel.PIDNamespace 645 } 646 647 var _ dynamicInode = (*taskStatData)(nil) 648 649 // Generate implements vfs.DynamicBytesSource.Generate. 650 func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error { 651 fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.task)) 652 fmt.Fprintf(buf, "(%s) ", s.task.Name()) 653 fmt.Fprintf(buf, "%c ", s.task.StateStatus()[0]) 654 ppid := kernel.ThreadID(0) 655 if parent := s.task.Parent(); parent != nil { 656 ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) 657 } 658 fmt.Fprintf(buf, "%d ", ppid) 659 fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.task.ThreadGroup().ProcessGroup())) 660 fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.task.ThreadGroup().Session())) 661 fmt.Fprintf(buf, "0 0 " /* tty_nr tpgid */) 662 fmt.Fprintf(buf, "0 " /* flags */) 663 fmt.Fprintf(buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */) 664 var cputime usage.CPUStats 665 if s.tgstats { 666 cputime = s.task.ThreadGroup().CPUStats() 667 } else { 668 cputime = s.task.CPUStats() 669 } 670 fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) 671 cputime = s.task.ThreadGroup().JoinedChildCPUStats() 672 fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) 673 fmt.Fprintf(buf, "%d %d ", s.task.Priority(), s.task.Niceness()) 674 fmt.Fprintf(buf, "%d ", s.task.ThreadGroup().Count()) 675 676 // itrealvalue. Since kernel 2.6.17, this field is no longer 677 // maintained, and is hard coded as 0. 678 fmt.Fprintf(buf, "0 ") 679 680 // Start time is relative to boot time, expressed in clock ticks. 681 fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.task.StartTime().Sub(s.task.Kernel().Timekeeper().BootTime()))) 682 683 var vss, rss uint64 684 if mm := getMM(s.task); mm != nil { 685 vss = mm.VirtualMemorySize() 686 rss = mm.ResidentSetSize() 687 } 688 fmt.Fprintf(buf, "%d %d ", vss, rss/hostarch.PageSize) 689 690 // rsslim. 691 fmt.Fprintf(buf, "%d ", s.task.ThreadGroup().Limits().Get(limits.Rss).Cur) 692 693 fmt.Fprintf(buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */) 694 fmt.Fprintf(buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */) 695 fmt.Fprintf(buf, "0 0 " /* nswap cnswap */) 696 terminationSignal := linux.Signal(0) 697 if s.task == s.task.ThreadGroup().Leader() { 698 terminationSignal = s.task.ThreadGroup().TerminationSignal() 699 } 700 fmt.Fprintf(buf, "%d ", terminationSignal) 701 fmt.Fprintf(buf, "0 0 0 " /* processor rt_priority policy */) 702 fmt.Fprintf(buf, "0 0 0 " /* delayacct_blkio_ticks guest_time cguest_time */) 703 fmt.Fprintf(buf, "0 0 0 0 0 0 0 " /* start_data end_data start_brk arg_start arg_end env_start env_end */) 704 fmt.Fprintf(buf, "0\n" /* exit_code */) 705 706 return nil 707 } 708 709 // statmData implements vfs.DynamicBytesSource for /proc/[pid]/statm. 710 // 711 // +stateify savable 712 type statmData struct { 713 kernfs.DynamicBytesFile 714 715 task *kernel.Task 716 } 717 718 var _ dynamicInode = (*statmData)(nil) 719 720 // Generate implements vfs.DynamicBytesSource.Generate. 721 func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error { 722 var vss, rss uint64 723 if mm := getMM(s.task); mm != nil { 724 vss = mm.VirtualMemorySize() 725 rss = mm.ResidentSetSize() 726 } 727 fmt.Fprintf(buf, "%d %d 0 0 0 0 0\n", vss/hostarch.PageSize, rss/hostarch.PageSize) 728 return nil 729 } 730 731 // statusInode implements kernfs.Inode for /proc/[pid]/status. 732 // 733 // +stateify savable 734 type statusInode struct { 735 kernfs.InodeAttrs 736 kernfs.InodeNoStatFS 737 kernfs.InodeNoopRefCount 738 kernfs.InodeNotAnonymous 739 kernfs.InodeNotDirectory 740 kernfs.InodeNotSymlink 741 kernfs.InodeWatches 742 743 task *kernel.Task 744 pidns *kernel.PIDNamespace 745 locks vfs.FileLocks 746 } 747 748 // statusFD implements vfs.FileDescriptionImpl and vfs.DynamicByteSource for 749 // /proc/[pid]/status. 750 // 751 // +stateify savable 752 type statusFD struct { 753 statusFDLowerBase 754 vfs.DynamicBytesFileDescriptionImpl 755 vfs.LockFD 756 757 vfsfd vfs.FileDescription 758 759 inode *statusInode 760 task *kernel.Task 761 pidns *kernel.PIDNamespace 762 userns *auth.UserNamespace // equivalent to struct file::f_cred::user_ns 763 } 764 765 // statusFDLowerBase is a dumb hack to ensure that statusFD prefers 766 // vfs.DynamicBytesFileDescriptionImpl methods to vfs.FileDescriptinDefaultImpl 767 // methods. 768 // 769 // +stateify savable 770 type statusFDLowerBase struct { 771 vfs.FileDescriptionDefaultImpl 772 } 773 774 func (fs *filesystem) newStatusInode(ctx context.Context, task *kernel.Task, pidns *kernel.PIDNamespace, ino uint64, perm linux.FileMode) kernfs.Inode { 775 // Note: credentials are overridden by taskOwnedInode. 776 inode := &statusInode{ 777 task: task, 778 pidns: pidns, 779 } 780 inode.InodeAttrs.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeRegular|perm) 781 return &taskOwnedInode{Inode: inode, owner: task} 782 } 783 784 // Open implements kernfs.Inode.Open. 785 func (s *statusInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { 786 fd := &statusFD{ 787 inode: s, 788 task: s.task, 789 pidns: s.pidns, 790 userns: rp.Credentials().UserNamespace, 791 } 792 fd.LockFD.Init(&s.locks) 793 if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { 794 return nil, err 795 } 796 fd.DynamicBytesFileDescriptionImpl.Init(&fd.vfsfd, fd) 797 return &fd.vfsfd, nil 798 } 799 800 // SetStat implements kernfs.Inode.SetStat. 801 func (*statusInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { 802 return linuxerr.EPERM 803 } 804 805 // Release implements vfs.FileDescriptionImpl.Release. 806 func (s *statusFD) Release(ctx context.Context) { 807 } 808 809 // Stat implements vfs.FileDescriptionImpl.Stat. 810 func (s *statusFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { 811 fs := s.vfsfd.VirtualDentry().Mount().Filesystem() 812 return s.inode.Stat(ctx, fs, opts) 813 } 814 815 // SetStat implements vfs.FileDescriptionImpl.SetStat. 816 func (s *statusFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { 817 return linuxerr.EPERM 818 } 819 820 // Generate implements vfs.DynamicBytesSource.Generate. 821 func (s *statusFD) Generate(ctx context.Context, buf *bytes.Buffer) error { 822 fmt.Fprintf(buf, "Name:\t%s\n", s.task.Name()) 823 fmt.Fprintf(buf, "State:\t%s\n", s.task.StateStatus()) 824 fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.task.ThreadGroup())) 825 fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.task)) 826 827 ppid := kernel.ThreadID(0) 828 if parent := s.task.Parent(); parent != nil { 829 ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) 830 } 831 fmt.Fprintf(buf, "PPid:\t%d\n", ppid) 832 833 tpid := kernel.ThreadID(0) 834 if tracer := s.task.Tracer(); tracer != nil { 835 tpid = s.pidns.IDOfTask(tracer) 836 } 837 fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid) 838 839 creds := s.task.Credentials() 840 ruid := creds.RealKUID.In(s.userns).OrOverflow() 841 euid := creds.EffectiveKUID.In(s.userns).OrOverflow() 842 suid := creds.SavedKUID.In(s.userns).OrOverflow() 843 rgid := creds.RealKGID.In(s.userns).OrOverflow() 844 egid := creds.EffectiveKGID.In(s.userns).OrOverflow() 845 sgid := creds.SavedKGID.In(s.userns).OrOverflow() 846 var fds int 847 var vss, rss, data uint64 848 s.task.WithMuLocked(func(t *kernel.Task) { 849 if fdTable := t.FDTable(); fdTable != nil { 850 fds = fdTable.CurrentMaxFDs() 851 } 852 }) 853 if mm := getMM(s.task); mm != nil { 854 vss = mm.VirtualMemorySize() 855 rss = mm.ResidentSetSize() 856 data = mm.VirtualDataSize() 857 } 858 // Filesystem user/group IDs aren't implemented; effective UID/GID are used 859 // instead. 860 fmt.Fprintf(buf, "Uid:\t%d\t%d\t%d\t%d\n", ruid, euid, suid, euid) 861 fmt.Fprintf(buf, "Gid:\t%d\t%d\t%d\t%d\n", rgid, egid, sgid, egid) 862 fmt.Fprintf(buf, "FDSize:\t%d\n", fds) 863 buf.WriteString("Groups:\t") 864 // There is a space between each pair of supplemental GIDs, as well as an 865 // unconditional trailing space that some applications actually depend on. 866 var sep string 867 for _, kgid := range creds.ExtraKGIDs { 868 fmt.Fprintf(buf, "%s%d", sep, kgid.In(s.userns).OrOverflow()) 869 sep = " " 870 } 871 buf.WriteString(" \n") 872 873 fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10) 874 fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10) 875 fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10) 876 877 fmt.Fprintf(buf, "Threads:\t%d\n", s.task.ThreadGroup().Count()) 878 fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps) 879 fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps) 880 fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps) 881 fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps) 882 fmt.Fprintf(buf, "Seccomp:\t%d\n", s.task.SeccompMode()) 883 // We unconditionally report a single NUMA node. See 884 // pkg/sentry/syscalls/linux/sys_mempolicy.go. 885 fmt.Fprintf(buf, "Mems_allowed:\t1\n") 886 fmt.Fprintf(buf, "Mems_allowed_list:\t0\n") 887 return nil 888 } 889 890 // ioUsage is the /proc/[pid]/io and /proc/[pid]/task/[tid]/io data provider. 891 type ioUsage interface { 892 // IOUsage returns the io usage data. 893 IOUsage() *usage.IO 894 } 895 896 // +stateify savable 897 type ioData struct { 898 kernfs.DynamicBytesFile 899 900 ioUsage 901 } 902 903 var _ dynamicInode = (*ioData)(nil) 904 905 // Generate implements vfs.DynamicBytesSource.Generate. 906 func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error { 907 io := usage.IO{} 908 io.Accumulate(i.IOUsage()) 909 910 fmt.Fprintf(buf, "char: %d\n", io.CharsRead.RacyLoad()) 911 fmt.Fprintf(buf, "wchar: %d\n", io.CharsWritten.RacyLoad()) 912 fmt.Fprintf(buf, "syscr: %d\n", io.ReadSyscalls.RacyLoad()) 913 fmt.Fprintf(buf, "syscw: %d\n", io.WriteSyscalls.RacyLoad()) 914 fmt.Fprintf(buf, "read_bytes: %d\n", io.BytesRead.RacyLoad()) 915 fmt.Fprintf(buf, "write_bytes: %d\n", io.BytesWritten.RacyLoad()) 916 fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled.RacyLoad()) 917 return nil 918 } 919 920 // oomScoreAdj is a stub of the /proc/<pid>/oom_score_adj file. 921 // 922 // +stateify savable 923 type oomScoreAdj struct { 924 kernfs.DynamicBytesFile 925 926 task *kernel.Task 927 } 928 929 var _ vfs.WritableDynamicBytesSource = (*oomScoreAdj)(nil) 930 931 // Generate implements vfs.DynamicBytesSource.Generate. 932 func (o *oomScoreAdj) Generate(ctx context.Context, buf *bytes.Buffer) error { 933 if o.task.ExitState() == kernel.TaskExitDead { 934 return linuxerr.ESRCH 935 } 936 fmt.Fprintf(buf, "%d\n", o.task.OOMScoreAdj()) 937 return nil 938 } 939 940 // Write implements vfs.WritableDynamicBytesSource.Write. 941 func (o *oomScoreAdj) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { 942 if src.NumBytes() == 0 { 943 return 0, nil 944 } 945 946 // Limit input size so as not to impact performance if input size is large. 947 src = src.TakeFirst(hostarch.PageSize - 1) 948 949 str, err := usermem.CopyStringIn(ctx, src.IO, src.Addrs.Head().Start, int(src.Addrs.Head().Length()), src.Opts) 950 if err != nil && err != linuxerr.ENAMETOOLONG { 951 return 0, err 952 } 953 954 str = strings.TrimSpace(str) 955 v, err := strconv.ParseInt(str, 0, 32) 956 if err != nil { 957 return 0, linuxerr.EINVAL 958 } 959 960 if o.task.ExitState() == kernel.TaskExitDead { 961 return 0, linuxerr.ESRCH 962 } 963 if err := o.task.SetOOMScoreAdj(int32(v)); err != nil { 964 return 0, err 965 } 966 967 return src.NumBytes(), nil 968 } 969 970 // exeSymlink is an symlink for the /proc/[pid]/exe file. 971 // 972 // +stateify savable 973 type exeSymlink struct { 974 implStatFS 975 kernfs.InodeAttrs 976 kernfs.InodeNoopRefCount 977 kernfs.InodeNotAnonymous 978 kernfs.InodeSymlink 979 kernfs.InodeWatches 980 981 fs *filesystem 982 task *kernel.Task 983 } 984 985 var _ kernfs.Inode = (*exeSymlink)(nil) 986 987 func (fs *filesystem) newExeSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode { 988 inode := &exeSymlink{ 989 fs: fs, 990 task: task, 991 } 992 inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) 993 return inode 994 } 995 996 // Readlink implements kernfs.Inode.Readlink. 997 func (s *exeSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { 998 exec, _, err := s.Getlink(ctx, nil) 999 if err != nil { 1000 return "", err 1001 } 1002 defer s.fs.SafeDecRef(ctx, exec) 1003 1004 root := vfs.RootFromContext(ctx) 1005 if !root.Ok() { 1006 panic("procfs Readlink requires context with root value") 1007 } 1008 defer s.fs.SafeDecRef(ctx, root) 1009 1010 vfsObj := exec.Mount().Filesystem().VirtualFilesystem() 1011 name, _ := vfsObj.PathnameWithDeleted(ctx, root, exec) 1012 return name, nil 1013 } 1014 1015 // Getlink implements kernfs.Inode.Getlink. 1016 func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { 1017 if !kernel.ContextCanTrace(ctx, s.task, false) { 1018 return vfs.VirtualDentry{}, "", linuxerr.EACCES 1019 } 1020 if err := checkTaskState(s.task); err != nil { 1021 return vfs.VirtualDentry{}, "", err 1022 } 1023 1024 mm := getMM(s.task) 1025 if mm == nil { 1026 return vfs.VirtualDentry{}, "", linuxerr.EACCES 1027 } 1028 1029 // The MemoryManager may be destroyed, in which case 1030 // MemoryManager.destroy will simply set the executable to nil 1031 // (with locks held). 1032 exec := mm.Executable() 1033 if exec == nil { 1034 return vfs.VirtualDentry{}, "", linuxerr.ESRCH 1035 } 1036 defer exec.DecRef(ctx) 1037 1038 vd := exec.VirtualDentry() 1039 vd.IncRef() 1040 return vd, "", nil 1041 } 1042 1043 // cwdSymlink is an symlink for the /proc/[pid]/cwd file. 1044 // 1045 // +stateify savable 1046 type cwdSymlink struct { 1047 implStatFS 1048 kernfs.InodeAttrs 1049 kernfs.InodeNoopRefCount 1050 kernfs.InodeNotAnonymous 1051 kernfs.InodeSymlink 1052 kernfs.InodeWatches 1053 1054 fs *filesystem 1055 task *kernel.Task 1056 } 1057 1058 var _ kernfs.Inode = (*cwdSymlink)(nil) 1059 1060 func (fs *filesystem) newCwdSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode { 1061 inode := &cwdSymlink{ 1062 fs: fs, 1063 task: task, 1064 } 1065 inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) 1066 return inode 1067 } 1068 1069 // Readlink implements kernfs.Inode.Readlink. 1070 func (s *cwdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { 1071 cwd, _, err := s.Getlink(ctx, nil) 1072 if err != nil { 1073 return "", err 1074 } 1075 defer s.fs.SafeDecRef(ctx, cwd) 1076 1077 root := vfs.RootFromContext(ctx) 1078 if !root.Ok() { 1079 panic("procfs Readlink requires context with root value") 1080 } 1081 defer s.fs.SafeDecRef(ctx, root) 1082 1083 vfsObj := cwd.Mount().Filesystem().VirtualFilesystem() 1084 name, _ := vfsObj.PathnameWithDeleted(ctx, root, cwd) 1085 return name, nil 1086 } 1087 1088 // Getlink implements kernfs.Inode.Getlink. 1089 func (s *cwdSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { 1090 if !kernel.ContextCanTrace(ctx, s.task, false) { 1091 return vfs.VirtualDentry{}, "", linuxerr.EACCES 1092 } 1093 if err := checkTaskState(s.task); err != nil { 1094 return vfs.VirtualDentry{}, "", err 1095 } 1096 cwd := s.task.FSContext().WorkingDirectory() 1097 if !cwd.Ok() { 1098 // It could have raced with process deletion. 1099 return vfs.VirtualDentry{}, "", linuxerr.ESRCH 1100 } 1101 // The reference is transferred to the caller. 1102 return cwd, "", nil 1103 } 1104 1105 // rootSymlink is an symlink for the /proc/[pid]/root file. 1106 // 1107 // +stateify savable 1108 type rootSymlink struct { 1109 implStatFS 1110 kernfs.InodeAttrs 1111 kernfs.InodeNoopRefCount 1112 kernfs.InodeNotAnonymous 1113 kernfs.InodeSymlink 1114 kernfs.InodeWatches 1115 1116 fs *filesystem 1117 task *kernel.Task 1118 } 1119 1120 var _ kernfs.Inode = (*rootSymlink)(nil) 1121 1122 func (fs *filesystem) newRootSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode { 1123 inode := &rootSymlink{ 1124 fs: fs, 1125 task: task, 1126 } 1127 inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) 1128 return inode 1129 } 1130 1131 // Readlink implements kernfs.Inode.Readlink. 1132 func (s *rootSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { 1133 root, _, err := s.Getlink(ctx, nil) 1134 if err != nil { 1135 return "", err 1136 } 1137 defer s.fs.SafeDecRef(ctx, root) 1138 1139 vfsRoot := vfs.RootFromContext(ctx) 1140 if !vfsRoot.Ok() { 1141 panic("procfs Readlink requires context with root value") 1142 } 1143 defer s.fs.SafeDecRef(ctx, vfsRoot) 1144 1145 vfsObj := root.Mount().Filesystem().VirtualFilesystem() 1146 name, _ := vfsObj.PathnameWithDeleted(ctx, vfsRoot, root) 1147 return name, nil 1148 } 1149 1150 // Getlink implements kernfs.Inode.Getlink. 1151 func (s *rootSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { 1152 if !kernel.ContextCanTrace(ctx, s.task, false) { 1153 return vfs.VirtualDentry{}, "", linuxerr.EACCES 1154 } 1155 if err := checkTaskState(s.task); err != nil { 1156 return vfs.VirtualDentry{}, "", err 1157 } 1158 root := s.task.FSContext().RootDirectory() 1159 if !root.Ok() { 1160 // It could have raced with process deletion. 1161 return vfs.VirtualDentry{}, "", linuxerr.ESRCH 1162 } 1163 // The reference is transferred to the caller. 1164 return root, "", nil 1165 } 1166 1167 // mountInfoData is used to implement /proc/[pid]/mountinfo. 1168 // 1169 // +stateify savable 1170 type mountInfoData struct { 1171 kernfs.DynamicBytesFile 1172 1173 fs *filesystem 1174 task *kernel.Task 1175 } 1176 1177 var _ dynamicInode = (*mountInfoData)(nil) 1178 1179 // Generate implements vfs.DynamicBytesSource.Generate. 1180 func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { 1181 var fsctx *kernel.FSContext 1182 i.task.WithMuLocked(func(t *kernel.Task) { 1183 fsctx = t.FSContext() 1184 }) 1185 if fsctx == nil { 1186 // The task has been destroyed. Nothing to show here. 1187 return nil 1188 } 1189 rootDir := fsctx.RootDirectory() 1190 if !rootDir.Ok() { 1191 // Root has been destroyed. Don't try to read mounts. 1192 return nil 1193 } 1194 defer i.fs.SafeDecRef(ctx, rootDir) 1195 i.task.Kernel().VFS().GenerateProcMountInfo(ctx, rootDir, buf) 1196 return nil 1197 } 1198 1199 // mountsData is used to implement /proc/[pid]/mounts. 1200 // 1201 // +stateify savable 1202 type mountsData struct { 1203 kernfs.DynamicBytesFile 1204 1205 fs *filesystem 1206 task *kernel.Task 1207 } 1208 1209 var _ dynamicInode = (*mountsData)(nil) 1210 1211 // Generate implements vfs.DynamicBytesSource.Generate. 1212 func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error { 1213 var fsctx *kernel.FSContext 1214 i.task.WithMuLocked(func(t *kernel.Task) { 1215 fsctx = t.FSContext() 1216 }) 1217 if fsctx == nil { 1218 // The task has been destroyed. Nothing to show here. 1219 return nil 1220 } 1221 rootDir := fsctx.RootDirectory() 1222 if !rootDir.Ok() { 1223 // Root has been destroyed. Don't try to read mounts. 1224 return nil 1225 } 1226 defer i.fs.SafeDecRef(ctx, rootDir) 1227 i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf) 1228 return nil 1229 } 1230 1231 // +stateify savable 1232 type namespaceSymlink struct { 1233 kernfs.StaticSymlink 1234 1235 task *kernel.Task 1236 nsType int 1237 } 1238 1239 func (fs *filesystem) newNamespaceSymlink(ctx context.Context, task *kernel.Task, ino uint64, nsType int) kernfs.Inode { 1240 inode := &namespaceSymlink{task: task, nsType: nsType} 1241 1242 // Note: credentials are overridden by taskOwnedInode. 1243 inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, "") 1244 1245 taskInode := &taskOwnedInode{Inode: inode, owner: task} 1246 return taskInode 1247 } 1248 1249 func (fs *filesystem) newPIDNamespaceSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode { 1250 target := fmt.Sprintf("pid:[%d]", task.PIDNamespace().ID()) 1251 1252 inode := &namespaceSymlink{task: task} 1253 // Note: credentials are overridden by taskOwnedInode. 1254 inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target) 1255 1256 taskInode := &taskOwnedInode{Inode: inode, owner: task} 1257 return taskInode 1258 } 1259 1260 func (fs *filesystem) newFakeNamespaceSymlink(ctx context.Context, task *kernel.Task, ino uint64, ns string) kernfs.Inode { 1261 // Namespace symlinks should contain the namespace name and the inode number 1262 // for the namespace instance, so for example user:[123456]. We currently fake 1263 // the inode number by sticking the symlink inode in its place. 1264 target := fmt.Sprintf("%s:[%d]", ns, ino) 1265 1266 inode := &namespaceSymlink{task: task} 1267 // Note: credentials are overridden by taskOwnedInode. 1268 inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target) 1269 1270 taskInode := &taskOwnedInode{Inode: inode, owner: task} 1271 return taskInode 1272 } 1273 1274 func (s *namespaceSymlink) getInode(t *kernel.Task) *nsfs.Inode { 1275 switch s.nsType { 1276 case linux.CLONE_NEWNET: 1277 netns := t.GetNetworkNamespace() 1278 if netns == nil { 1279 return nil 1280 } 1281 return netns.GetInode() 1282 case linux.CLONE_NEWIPC: 1283 if ipcns := t.GetIPCNamespace(); ipcns != nil { 1284 return ipcns.GetInode() 1285 } 1286 return nil 1287 case linux.CLONE_NEWUTS: 1288 if utsns := t.GetUTSNamespace(); utsns != nil { 1289 return utsns.GetInode() 1290 } 1291 return nil 1292 case linux.CLONE_NEWNS: 1293 mntns := t.GetMountNamespace() 1294 if mntns == nil { 1295 return nil 1296 } 1297 inode, _ := mntns.Refs.(*nsfs.Inode) 1298 return inode 1299 default: 1300 panic("unknown namespace") 1301 } 1302 } 1303 1304 // Readlink implements kernfs.Inode.Readlink. 1305 func (s *namespaceSymlink) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { 1306 if err := checkTaskState(s.task); err != nil { 1307 return "", err 1308 } 1309 if s.nsType != 0 { 1310 inode := s.getInode(s.task) 1311 if inode == nil { 1312 return "", linuxerr.ENOENT 1313 } 1314 target := inode.Name() 1315 inode.DecRef(ctx) 1316 return target, nil 1317 } 1318 return s.StaticSymlink.Readlink(ctx, mnt) 1319 } 1320 1321 // Getlink implements kernfs.Inode.Getlink. 1322 func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { 1323 if err := checkTaskState(s.task); err != nil { 1324 return vfs.VirtualDentry{}, "", err 1325 } 1326 1327 if s.nsType != 0 { 1328 inode := s.getInode(s.task) 1329 if inode == nil { 1330 return vfs.VirtualDentry{}, "", linuxerr.ENOENT 1331 } 1332 defer inode.DecRef(ctx) 1333 return inode.VirtualDentry(), "", nil 1334 } 1335 // Create a synthetic inode to represent the namespace. 1336 fs := mnt.Filesystem().Impl().(*filesystem) 1337 nsInode := &namespaceInode{} 1338 nsInode.Init(ctx, auth.CredentialsFromContext(ctx), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0444) 1339 dentry := &kernfs.Dentry{} 1340 dentry.Init(&fs.Filesystem, nsInode) 1341 vd := vfs.MakeVirtualDentry(mnt, dentry.VFSDentry()) 1342 // Only IncRef vd.Mount() because vd.Dentry() already holds a ref of 1. 1343 mnt.IncRef() 1344 return vd, "", nil 1345 } 1346 1347 // namespaceInode is a synthetic inode created to represent a namespace in 1348 // /proc/[pid]/ns/*. 1349 // 1350 // +stateify savable 1351 type namespaceInode struct { 1352 implStatFS 1353 kernfs.InodeAttrs 1354 kernfs.InodeNoopRefCount 1355 kernfs.InodeNotAnonymous 1356 kernfs.InodeNotDirectory 1357 kernfs.InodeNotSymlink 1358 kernfs.InodeWatches 1359 1360 locks vfs.FileLocks 1361 } 1362 1363 var _ kernfs.Inode = (*namespaceInode)(nil) 1364 1365 // Init initializes a namespace inode. 1366 func (i *namespaceInode) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) { 1367 if perm&^linux.PermissionsMask != 0 { 1368 panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) 1369 } 1370 i.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeRegular|perm) 1371 } 1372 1373 // Open implements kernfs.Inode.Open. 1374 func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { 1375 fd := &namespaceFD{inode: i} 1376 i.IncRef() 1377 fd.LockFD.Init(&i.locks) 1378 if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { 1379 return nil, err 1380 } 1381 return &fd.vfsfd, nil 1382 } 1383 1384 // namespace FD is a synthetic file that represents a namespace in 1385 // /proc/[pid]/ns/*. 1386 // 1387 // +stateify savable 1388 type namespaceFD struct { 1389 vfs.FileDescriptionDefaultImpl 1390 vfs.LockFD 1391 1392 vfsfd vfs.FileDescription 1393 inode *namespaceInode 1394 } 1395 1396 var _ vfs.FileDescriptionImpl = (*namespaceFD)(nil) 1397 1398 // Stat implements vfs.FileDescriptionImpl.Stat. 1399 func (fd *namespaceFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { 1400 vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem() 1401 return fd.inode.Stat(ctx, vfs, opts) 1402 } 1403 1404 // SetStat implements vfs.FileDescriptionImpl.SetStat. 1405 func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { 1406 vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem() 1407 creds := auth.CredentialsFromContext(ctx) 1408 return fd.inode.SetStat(ctx, vfs, creds, opts) 1409 } 1410 1411 // Release implements vfs.FileDescriptionImpl.Release. 1412 func (fd *namespaceFD) Release(ctx context.Context) { 1413 fd.inode.DecRef(ctx) 1414 } 1415 1416 // taskCgroupData generates data for /proc/[pid]/cgroup. 1417 // 1418 // +stateify savable 1419 type taskCgroupData struct { 1420 dynamicBytesFileSetAttr 1421 task *kernel.Task 1422 } 1423 1424 var _ dynamicInode = (*taskCgroupData)(nil) 1425 1426 // Generate implements vfs.DynamicBytesSource.Generate. 1427 func (d *taskCgroupData) Generate(ctx context.Context, buf *bytes.Buffer) error { 1428 // When a task is existing on Linux, a task's cgroup set is cleared and 1429 // reset to the initial cgroup set, which is essentially the set of root 1430 // cgroups. Because of this, the /proc/<pid>/cgroup file is always readable 1431 // on Linux throughout a task's lifetime. 1432 // 1433 // The sentry removes tasks from cgroups during the exit process, but 1434 // doesn't move them into an initial cgroup set, so partway through task 1435 // exit this file show a task is in no cgroups, which is incorrect. Instead, 1436 // once a task has left its cgroups, we return an error. 1437 if d.task.ExitState() >= kernel.TaskExitInitiated { 1438 return linuxerr.ESRCH 1439 } 1440 1441 d.task.GenerateProcTaskCgroup(buf) 1442 return nil 1443 } 1444 1445 // childrenData implements vfs.DynamicBytesSource for /proc/[pid]/task/[tid]/children. 1446 // 1447 // +stateify savable 1448 type childrenData struct { 1449 kernfs.DynamicBytesFile 1450 1451 task *kernel.Task 1452 1453 // pidns is the PID namespace associated with the proc filesystem that 1454 // includes the file using this childrenData. 1455 pidns *kernel.PIDNamespace 1456 } 1457 1458 // Generate implements vfs.DynamicBytesSource.Generate. 1459 func (d *childrenData) Generate(ctx context.Context, buf *bytes.Buffer) error { 1460 children := d.task.Children() 1461 var childrenTIDs []int 1462 for childTask := range children { 1463 childrenTIDs = append(childrenTIDs, int(d.pidns.IDOfTask(childTask))) 1464 } 1465 1466 // The TIDs need to be in sorted order in accordance with the Linux implementation. 1467 sort.Ints(childrenTIDs) 1468 1469 for _, childrenTID := range childrenTIDs { 1470 // It contains a space-separated list of child tasks of the `task`. 1471 // Each task is represented by its TID. 1472 fmt.Fprintf(buf, "%d ", childrenTID) 1473 } 1474 1475 return nil 1476 }