github.com/ttpreport/gvisor-ligolo@v0.0.0-20240123134145-a858404967ba/pkg/sentry/fsimpl/proc/task_files.go (about) 1 // Copyright 2019 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package proc 16 17 import ( 18 "bytes" 19 "fmt" 20 "io" 21 "strconv" 22 "strings" 23 24 "github.com/ttpreport/gvisor-ligolo/pkg/abi/linux" 25 "github.com/ttpreport/gvisor-ligolo/pkg/context" 26 "github.com/ttpreport/gvisor-ligolo/pkg/errors/linuxerr" 27 "github.com/ttpreport/gvisor-ligolo/pkg/hostarch" 28 "github.com/ttpreport/gvisor-ligolo/pkg/safemem" 29 "github.com/ttpreport/gvisor-ligolo/pkg/sentry/fsimpl/kernfs" 30 "github.com/ttpreport/gvisor-ligolo/pkg/sentry/fsimpl/nsfs" 31 "github.com/ttpreport/gvisor-ligolo/pkg/sentry/kernel" 32 "github.com/ttpreport/gvisor-ligolo/pkg/sentry/kernel/auth" 33 "github.com/ttpreport/gvisor-ligolo/pkg/sentry/limits" 34 "github.com/ttpreport/gvisor-ligolo/pkg/sentry/mm" 35 "github.com/ttpreport/gvisor-ligolo/pkg/sentry/usage" 36 "github.com/ttpreport/gvisor-ligolo/pkg/sentry/vfs" 37 "github.com/ttpreport/gvisor-ligolo/pkg/sync" 38 "github.com/ttpreport/gvisor-ligolo/pkg/usermem" 39 ) 40 41 // "There is an (arbitrary) limit on the number of lines in the file. As at 42 // Linux 3.18, the limit is five lines." - user_namespaces(7) 43 const maxIDMapLines = 5 44 45 // getMM gets the kernel task's MemoryManager. No additional reference is taken on 46 // mm here. This is safe because MemoryManager.destroy is required to leave the 47 // MemoryManager in a state where it's still usable as a DynamicBytesSource. 48 func getMM(task *kernel.Task) *mm.MemoryManager { 49 var tmm *mm.MemoryManager 50 task.WithMuLocked(func(t *kernel.Task) { 51 if mm := t.MemoryManager(); mm != nil { 52 tmm = mm 53 } 54 }) 55 return tmm 56 } 57 58 // getMMIncRef returns t's MemoryManager. If getMMIncRef succeeds, the 59 // MemoryManager's users count is incremented, and must be decremented by the 60 // caller when it is no longer in use. 61 func getMMIncRef(task *kernel.Task) (*mm.MemoryManager, error) { 62 var m *mm.MemoryManager 63 task.WithMuLocked(func(t *kernel.Task) { 64 m = t.MemoryManager() 65 }) 66 if m == nil || !m.IncUsers() { 67 return nil, io.EOF 68 } 69 return m, nil 70 } 71 72 func checkTaskState(t *kernel.Task) error { 73 switch t.ExitState() { 74 case kernel.TaskExitZombie: 75 return linuxerr.EACCES 76 case kernel.TaskExitDead: 77 return linuxerr.ESRCH 78 } 79 return nil 80 } 81 82 type bufferWriter struct { 83 buf *bytes.Buffer 84 } 85 86 // WriteFromBlocks writes up to srcs.NumBytes() bytes from srcs and returns 87 // the number of bytes written. It may return a partial write without an 88 // error (i.e. (n, nil) where 0 < n < srcs.NumBytes()). It should not 89 // return a full write with an error (i.e. srcs.NumBytes(), err) where err 90 // != nil). 91 func (w *bufferWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { 92 written := srcs.NumBytes() 93 for !srcs.IsEmpty() { 94 w.buf.Write(srcs.Head().ToSlice()) 95 srcs = srcs.Tail() 96 } 97 return written, nil 98 } 99 100 // auxvData implements vfs.DynamicBytesSource for /proc/[pid]/auxv. 101 // 102 // +stateify savable 103 type auxvData struct { 104 kernfs.DynamicBytesFile 105 106 task *kernel.Task 107 } 108 109 var _ dynamicInode = (*auxvData)(nil) 110 111 // Generate implements vfs.DynamicBytesSource.Generate. 112 func (d *auxvData) Generate(ctx context.Context, buf *bytes.Buffer) error { 113 if d.task.ExitState() == kernel.TaskExitDead { 114 return linuxerr.ESRCH 115 } 116 m, err := getMMIncRef(d.task) 117 if err != nil { 118 // Return empty file. 119 return nil 120 } 121 defer m.DecUsers(ctx) 122 123 auxv := m.Auxv() 124 // Space for buffer with AT_NULL (0) terminator at the end. 125 buf.Grow((len(auxv) + 1) * 16) 126 for _, e := range auxv { 127 var tmp [16]byte 128 hostarch.ByteOrder.PutUint64(tmp[:8], e.Key) 129 hostarch.ByteOrder.PutUint64(tmp[8:], uint64(e.Value)) 130 buf.Write(tmp[:]) 131 } 132 var atNull [16]byte 133 buf.Write(atNull[:]) 134 135 return nil 136 } 137 138 // MetadataType enumerates the types of metadata that is exposed through proc. 139 type MetadataType int 140 141 const ( 142 // Cmdline represents /proc/[pid]/cmdline. 143 Cmdline MetadataType = iota 144 145 // Environ represents /proc/[pid]/environ. 146 Environ 147 ) 148 149 // GetMetadata fetches the process's metadata of type t and writes it into 150 // buf. The process is identified by mm. 151 func GetMetadata(ctx context.Context, mm *mm.MemoryManager, buf *bytes.Buffer, t MetadataType) error { 152 // Figure out the bounds of the exec arg we are trying to read. 153 var ar hostarch.AddrRange 154 switch t { 155 case Cmdline: 156 ar = hostarch.AddrRange{ 157 Start: mm.ArgvStart(), 158 End: mm.ArgvEnd(), 159 } 160 case Environ: 161 ar = hostarch.AddrRange{ 162 Start: mm.EnvvStart(), 163 End: mm.EnvvEnd(), 164 } 165 default: 166 panic(fmt.Sprintf("unknown exec arg type %v", t)) 167 } 168 if ar.Start == 0 || ar.End == 0 { 169 // Don't attempt to read before the start/end are set up. 170 return io.EOF 171 } 172 173 // N.B. Technically this should be usermem.IOOpts.IgnorePermissions = true 174 // until Linux 4.9 (272ddc8b3735 "proc: don't use FOLL_FORCE for reading 175 // cmdline and environment"). 176 writer := &bufferWriter{buf: buf} 177 if n, err := mm.CopyInTo(ctx, hostarch.AddrRangeSeqOf(ar), writer, usermem.IOOpts{}); n == 0 || err != nil { 178 // Nothing to copy or something went wrong. 179 return err 180 } 181 182 // On Linux, if the NULL byte at the end of the argument vector has been 183 // overwritten, it continues reading the environment vector as part of 184 // the argument vector. 185 if t == Cmdline && buf.Bytes()[buf.Len()-1] != 0 { 186 if end := bytes.IndexByte(buf.Bytes(), 0); end != -1 { 187 // If we found a NULL character somewhere else in argv, truncate the 188 // return up to the NULL terminator (including it). 189 buf.Truncate(end) 190 return nil 191 } 192 193 // There is no NULL terminator in the string, return into envp. 194 arEnvv := hostarch.AddrRange{ 195 Start: mm.EnvvStart(), 196 End: mm.EnvvEnd(), 197 } 198 199 // Upstream limits the returned amount to one page of slop. 200 // https://elixir.bootlin.com/linux/v4.20/source/fs/proc/base.c#L208 201 // we'll return one page total between argv and envp because of the 202 // above page restrictions. 203 if buf.Len() >= hostarch.PageSize { 204 // Returned at least one page already, nothing else to add. 205 return nil 206 } 207 remaining := hostarch.PageSize - buf.Len() 208 if int(arEnvv.Length()) > remaining { 209 end, ok := arEnvv.Start.AddLength(uint64(remaining)) 210 if !ok { 211 return linuxerr.EFAULT 212 } 213 arEnvv.End = end 214 } 215 if _, err := mm.CopyInTo(ctx, hostarch.AddrRangeSeqOf(arEnvv), writer, usermem.IOOpts{}); err != nil { 216 return err 217 } 218 219 // Linux will return envp up to and including the first NULL character, 220 // so find it. 221 envStart := int(ar.Length()) 222 if nullIdx := bytes.IndexByte(buf.Bytes()[envStart:], 0); nullIdx != -1 { 223 buf.Truncate(envStart + nullIdx) 224 } 225 } 226 227 return nil 228 } 229 230 // metadataData implements vfs.DynamicBytesSource for proc metadata fields like: 231 // 232 // - /proc/[pid]/cmdline 233 // - /proc/[pid]/environ 234 // 235 // +stateify savable 236 type metadataData struct { 237 kernfs.DynamicBytesFile 238 239 task *kernel.Task 240 241 // arg is the type of exec argument this file contains. 242 metaType MetadataType 243 } 244 245 var _ dynamicInode = (*metadataData)(nil) 246 247 // Generate implements vfs.DynamicBytesSource.Generate. 248 func (d *metadataData) Generate(ctx context.Context, buf *bytes.Buffer) error { 249 if d.task.ExitState() == kernel.TaskExitDead { 250 return linuxerr.ESRCH 251 } 252 m, err := getMMIncRef(d.task) 253 if err != nil { 254 // Return empty file. 255 return nil 256 } 257 defer m.DecUsers(ctx) 258 return GetMetadata(ctx, m, buf, d.metaType) 259 } 260 261 // +stateify savable 262 type commInode struct { 263 kernfs.DynamicBytesFile 264 265 task *kernel.Task 266 } 267 268 func (fs *filesystem) newComm(ctx context.Context, task *kernel.Task, ino uint64, perm linux.FileMode) kernfs.Inode { 269 inode := &commInode{task: task} 270 inode.DynamicBytesFile.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, &commData{task: task}, perm) 271 return inode 272 } 273 274 func (i *commInode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { 275 // This file can always be read or written by members of the same thread 276 // group. See fs/proc/base.c:proc_tid_comm_permission. 277 t := kernel.TaskFromContext(ctx) 278 if t != nil && t.ThreadGroup() == i.task.ThreadGroup() && !ats.MayExec() { 279 return nil 280 } 281 282 return i.DynamicBytesFile.CheckPermissions(ctx, creds, ats) 283 } 284 285 // commData implements vfs.WritableDynamicBytesSource for /proc/[pid]/comm. 286 // 287 // +stateify savable 288 type commData struct { 289 kernfs.DynamicBytesFile 290 291 task *kernel.Task 292 } 293 294 var _ dynamicInode = (*commData)(nil) 295 var _ vfs.WritableDynamicBytesSource = (*commData)(nil) 296 297 // Generate implements vfs.DynamicBytesSource.Generate. 298 func (d *commData) Generate(ctx context.Context, buf *bytes.Buffer) error { 299 buf.WriteString(d.task.Name()) 300 buf.WriteString("\n") 301 return nil 302 } 303 304 // Write implements vfs.WritableDynamicBytesSource.Write. 305 func (d *commData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { 306 srclen := src.NumBytes() 307 name := make([]byte, srclen) 308 if _, err := src.CopyIn(ctx, name); err != nil { 309 return 0, err 310 } 311 312 // Only allow writes from the same thread group, otherwise return 313 // EINVAL. See fs/proc/base.c:comm_write. 314 // 315 // Note that this check exists in addition to the same-thread-group 316 // check in CheckPermissions. 317 t := kernel.TaskFromContext(ctx) 318 if t == nil || t.ThreadGroup() != d.task.ThreadGroup() { 319 return 0, linuxerr.EINVAL 320 } 321 d.task.SetName(string(name)) 322 return int64(srclen), nil 323 } 324 325 // idMapData implements vfs.WritableDynamicBytesSource for 326 // /proc/[pid]/{gid_map|uid_map}. 327 // 328 // +stateify savable 329 type idMapData struct { 330 kernfs.DynamicBytesFile 331 332 task *kernel.Task 333 gids bool 334 } 335 336 var _ dynamicInode = (*idMapData)(nil) 337 var _ vfs.WritableDynamicBytesSource = (*idMapData)(nil) 338 339 // Generate implements vfs.WritableDynamicBytesSource.Generate. 340 func (d *idMapData) Generate(ctx context.Context, buf *bytes.Buffer) error { 341 var entries []auth.IDMapEntry 342 if d.gids { 343 entries = d.task.UserNamespace().GIDMap() 344 } else { 345 entries = d.task.UserNamespace().UIDMap() 346 } 347 for _, e := range entries { 348 fmt.Fprintf(buf, "%10d %10d %10d\n", e.FirstID, e.FirstParentID, e.Length) 349 } 350 return nil 351 } 352 353 // Write implements vfs.WritableDynamicBytesSource.Write. 354 func (d *idMapData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { 355 // "In addition, the number of bytes written to the file must be less than 356 // the system page size, and the write must be performed at the start of 357 // the file ..." - user_namespaces(7) 358 srclen := src.NumBytes() 359 if srclen >= hostarch.PageSize || offset != 0 { 360 return 0, linuxerr.EINVAL 361 } 362 b := make([]byte, srclen) 363 if _, err := src.CopyIn(ctx, b); err != nil { 364 return 0, err 365 } 366 367 // Truncate from the first NULL byte. 368 var nul int64 369 nul = int64(bytes.IndexByte(b, 0)) 370 if nul == -1 { 371 nul = srclen 372 } 373 b = b[:nul] 374 // Remove the last \n. 375 if nul >= 1 && b[nul-1] == '\n' { 376 b = b[:nul-1] 377 } 378 lines := bytes.SplitN(b, []byte("\n"), maxIDMapLines+1) 379 if len(lines) > maxIDMapLines { 380 return 0, linuxerr.EINVAL 381 } 382 383 entries := make([]auth.IDMapEntry, len(lines)) 384 for i, l := range lines { 385 var e auth.IDMapEntry 386 _, err := fmt.Sscan(string(l), &e.FirstID, &e.FirstParentID, &e.Length) 387 if err != nil { 388 return 0, linuxerr.EINVAL 389 } 390 entries[i] = e 391 } 392 var err error 393 if d.gids { 394 err = d.task.UserNamespace().SetGIDMap(ctx, entries) 395 } else { 396 err = d.task.UserNamespace().SetUIDMap(ctx, entries) 397 } 398 if err != nil { 399 return 0, err 400 } 401 402 // On success, Linux's kernel/user_namespace.c:map_write() always returns 403 // count, even if fewer bytes were used. 404 return int64(srclen), nil 405 } 406 407 var _ kernfs.Inode = (*memInode)(nil) 408 409 // memInode implements kernfs.Inode for /proc/[pid]/mem. 410 // 411 // +stateify savable 412 type memInode struct { 413 kernfs.InodeAttrs 414 kernfs.InodeNoStatFS 415 kernfs.InodeNoopRefCount 416 kernfs.InodeNotAnonymous 417 kernfs.InodeNotDirectory 418 kernfs.InodeNotSymlink 419 kernfs.InodeWatches 420 421 task *kernel.Task 422 locks vfs.FileLocks 423 } 424 425 func (fs *filesystem) newMemInode(ctx context.Context, task *kernel.Task, ino uint64, perm linux.FileMode) kernfs.Inode { 426 // Note: credentials are overridden by taskOwnedInode. 427 inode := &memInode{task: task} 428 inode.init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm) 429 return &taskOwnedInode{Inode: inode, owner: task} 430 } 431 432 func (f *memInode) init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) { 433 if perm&^linux.PermissionsMask != 0 { 434 panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) 435 } 436 f.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeRegular|perm) 437 } 438 439 // Open implements kernfs.Inode.Open. 440 func (f *memInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { 441 // TODO(gvisor.dev/issue/260): Add check for PTRACE_MODE_ATTACH_FSCREDS 442 // Permission to read this file is governed by PTRACE_MODE_ATTACH_FSCREDS 443 // Since we dont implement setfsuid/setfsgid we can just use PTRACE_MODE_ATTACH 444 if !kernel.ContextCanTrace(ctx, f.task, true) { 445 return nil, linuxerr.EACCES 446 } 447 if err := checkTaskState(f.task); err != nil { 448 return nil, err 449 } 450 fd := &memFD{} 451 if err := fd.Init(rp.Mount(), d, f, opts.Flags); err != nil { 452 return nil, err 453 } 454 return &fd.vfsfd, nil 455 } 456 457 // SetStat implements kernfs.Inode.SetStat. 458 func (*memInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { 459 return linuxerr.EPERM 460 } 461 462 var _ vfs.FileDescriptionImpl = (*memFD)(nil) 463 464 // memFD implements vfs.FileDescriptionImpl for /proc/[pid]/mem. 465 // 466 // +stateify savable 467 type memFD struct { 468 vfsfd vfs.FileDescription 469 vfs.FileDescriptionDefaultImpl 470 vfs.LockFD 471 472 inode *memInode 473 474 // mu guards the fields below. 475 mu sync.Mutex `state:"nosave"` 476 offset int64 477 } 478 479 // Init initializes memFD. 480 func (fd *memFD) Init(m *vfs.Mount, d *kernfs.Dentry, inode *memInode, flags uint32) error { 481 fd.LockFD.Init(&inode.locks) 482 if err := fd.vfsfd.Init(fd, flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { 483 return err 484 } 485 fd.inode = inode 486 return nil 487 } 488 489 // Seek implements vfs.FileDescriptionImpl.Seek. 490 func (fd *memFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { 491 fd.mu.Lock() 492 defer fd.mu.Unlock() 493 switch whence { 494 case linux.SEEK_SET: 495 case linux.SEEK_CUR: 496 offset += fd.offset 497 default: 498 return 0, linuxerr.EINVAL 499 } 500 if offset < 0 { 501 return 0, linuxerr.EINVAL 502 } 503 fd.offset = offset 504 return offset, nil 505 } 506 507 // PRead implements vfs.FileDescriptionImpl.PRead. 508 func (fd *memFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { 509 if dst.NumBytes() == 0 { 510 return 0, nil 511 } 512 m, err := getMMIncRef(fd.inode.task) 513 if err != nil { 514 return 0, err 515 } 516 defer m.DecUsers(ctx) 517 // Buffer the read data because of MM locks 518 buf := make([]byte, dst.NumBytes()) 519 n, readErr := m.CopyIn(ctx, hostarch.Addr(offset), buf, usermem.IOOpts{IgnorePermissions: true}) 520 if n > 0 { 521 if _, err := dst.CopyOut(ctx, buf[:n]); err != nil { 522 return 0, linuxerr.EFAULT 523 } 524 return int64(n), nil 525 } 526 if readErr != nil { 527 return 0, linuxerr.EIO 528 } 529 return 0, nil 530 } 531 532 // Read implements vfs.FileDescriptionImpl.Read. 533 func (fd *memFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { 534 fd.mu.Lock() 535 n, err := fd.PRead(ctx, dst, fd.offset, opts) 536 fd.offset += n 537 fd.mu.Unlock() 538 return n, err 539 } 540 541 // Stat implements vfs.FileDescriptionImpl.Stat. 542 func (fd *memFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { 543 fs := fd.vfsfd.VirtualDentry().Mount().Filesystem() 544 return fd.inode.Stat(ctx, fs, opts) 545 } 546 547 // SetStat implements vfs.FileDescriptionImpl.SetStat. 548 func (fd *memFD) SetStat(context.Context, vfs.SetStatOptions) error { 549 return linuxerr.EPERM 550 } 551 552 // Release implements vfs.FileDescriptionImpl.Release. 553 func (fd *memFD) Release(context.Context) {} 554 555 // limitsData implements vfs.DynamicBytesSource for /proc/[pid]/limits. 556 // 557 // +stateify savable 558 type limitsData struct { 559 kernfs.DynamicBytesFile 560 561 task *kernel.Task 562 } 563 564 func (d *limitsData) Generate(ctx context.Context, buf *bytes.Buffer) error { 565 taskLimits := d.task.Limits() 566 // formatting matches the kernel output from linux/fs/proc/base.c:proc_pid_limits() 567 fmt.Fprintf(buf, "Limit Soft Limit Hard Limit Units \n") 568 for _, lt := range limits.AllLimitTypes { 569 fmt.Fprintf(buf, "%-25s ", lt.Name()) 570 571 l := taskLimits.Get(lt) 572 if l.Cur == limits.Infinity { 573 fmt.Fprintf(buf, "%-20s ", "unlimited") 574 } else { 575 fmt.Fprintf(buf, "%-20d ", l.Cur) 576 } 577 578 if l.Max == limits.Infinity { 579 fmt.Fprintf(buf, "%-20s ", "unlimited") 580 } else { 581 fmt.Fprintf(buf, "%-20d ", l.Max) 582 } 583 584 if u := lt.Unit(); u != "" { 585 fmt.Fprintf(buf, "%-10s", u) 586 } 587 588 buf.WriteByte('\n') 589 } 590 return nil 591 } 592 593 // mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps. 594 // 595 // +stateify savable 596 type mapsData struct { 597 kernfs.DynamicBytesFile 598 599 task *kernel.Task 600 } 601 602 var _ dynamicInode = (*mapsData)(nil) 603 604 // Generate implements vfs.DynamicBytesSource.Generate. 605 func (d *mapsData) Generate(ctx context.Context, buf *bytes.Buffer) error { 606 if mm := getMM(d.task); mm != nil { 607 mm.ReadMapsDataInto(ctx, mm.MapsCallbackFuncForBuffer(buf)) 608 } 609 return nil 610 } 611 612 // smapsData implements vfs.DynamicBytesSource for /proc/[pid]/smaps. 613 // 614 // +stateify savable 615 type smapsData struct { 616 kernfs.DynamicBytesFile 617 618 task *kernel.Task 619 } 620 621 var _ dynamicInode = (*smapsData)(nil) 622 623 // Generate implements vfs.DynamicBytesSource.Generate. 624 func (d *smapsData) Generate(ctx context.Context, buf *bytes.Buffer) error { 625 if mm := getMM(d.task); mm != nil { 626 mm.ReadSmapsDataInto(ctx, buf) 627 } 628 return nil 629 } 630 631 // +stateify savable 632 type taskStatData struct { 633 kernfs.DynamicBytesFile 634 635 task *kernel.Task 636 637 // If tgstats is true, accumulate fault stats (not implemented) and CPU 638 // time across all tasks in t's thread group. 639 tgstats bool 640 641 // pidns is the PID namespace associated with the proc filesystem that 642 // includes the file using this statData. 643 pidns *kernel.PIDNamespace 644 } 645 646 var _ dynamicInode = (*taskStatData)(nil) 647 648 // Generate implements vfs.DynamicBytesSource.Generate. 649 func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error { 650 fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.task)) 651 fmt.Fprintf(buf, "(%s) ", s.task.Name()) 652 fmt.Fprintf(buf, "%c ", s.task.StateStatus()[0]) 653 ppid := kernel.ThreadID(0) 654 if parent := s.task.Parent(); parent != nil { 655 ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) 656 } 657 fmt.Fprintf(buf, "%d ", ppid) 658 fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.task.ThreadGroup().ProcessGroup())) 659 fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.task.ThreadGroup().Session())) 660 fmt.Fprintf(buf, "0 0 " /* tty_nr tpgid */) 661 fmt.Fprintf(buf, "0 " /* flags */) 662 fmt.Fprintf(buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */) 663 var cputime usage.CPUStats 664 if s.tgstats { 665 cputime = s.task.ThreadGroup().CPUStats() 666 } else { 667 cputime = s.task.CPUStats() 668 } 669 fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) 670 cputime = s.task.ThreadGroup().JoinedChildCPUStats() 671 fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) 672 fmt.Fprintf(buf, "%d %d ", s.task.Priority(), s.task.Niceness()) 673 fmt.Fprintf(buf, "%d ", s.task.ThreadGroup().Count()) 674 675 // itrealvalue. Since kernel 2.6.17, this field is no longer 676 // maintained, and is hard coded as 0. 677 fmt.Fprintf(buf, "0 ") 678 679 // Start time is relative to boot time, expressed in clock ticks. 680 fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.task.StartTime().Sub(s.task.Kernel().Timekeeper().BootTime()))) 681 682 var vss, rss uint64 683 if mm := getMM(s.task); mm != nil { 684 vss = mm.VirtualMemorySize() 685 rss = mm.ResidentSetSize() 686 } 687 fmt.Fprintf(buf, "%d %d ", vss, rss/hostarch.PageSize) 688 689 // rsslim. 690 fmt.Fprintf(buf, "%d ", s.task.ThreadGroup().Limits().Get(limits.Rss).Cur) 691 692 fmt.Fprintf(buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */) 693 fmt.Fprintf(buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */) 694 fmt.Fprintf(buf, "0 0 " /* nswap cnswap */) 695 terminationSignal := linux.Signal(0) 696 if s.task == s.task.ThreadGroup().Leader() { 697 terminationSignal = s.task.ThreadGroup().TerminationSignal() 698 } 699 fmt.Fprintf(buf, "%d ", terminationSignal) 700 fmt.Fprintf(buf, "0 0 0 " /* processor rt_priority policy */) 701 fmt.Fprintf(buf, "0 0 0 " /* delayacct_blkio_ticks guest_time cguest_time */) 702 fmt.Fprintf(buf, "0 0 0 0 0 0 0 " /* start_data end_data start_brk arg_start arg_end env_start env_end */) 703 fmt.Fprintf(buf, "0\n" /* exit_code */) 704 705 return nil 706 } 707 708 // statmData implements vfs.DynamicBytesSource for /proc/[pid]/statm. 709 // 710 // +stateify savable 711 type statmData struct { 712 kernfs.DynamicBytesFile 713 714 task *kernel.Task 715 } 716 717 var _ dynamicInode = (*statmData)(nil) 718 719 // Generate implements vfs.DynamicBytesSource.Generate. 720 func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error { 721 var vss, rss uint64 722 if mm := getMM(s.task); mm != nil { 723 vss = mm.VirtualMemorySize() 724 rss = mm.ResidentSetSize() 725 } 726 fmt.Fprintf(buf, "%d %d 0 0 0 0 0\n", vss/hostarch.PageSize, rss/hostarch.PageSize) 727 return nil 728 } 729 730 // statusInode implements kernfs.Inode for /proc/[pid]/status. 731 // 732 // +stateify savable 733 type statusInode struct { 734 kernfs.InodeAttrs 735 kernfs.InodeNoStatFS 736 kernfs.InodeNoopRefCount 737 kernfs.InodeNotAnonymous 738 kernfs.InodeNotDirectory 739 kernfs.InodeNotSymlink 740 kernfs.InodeWatches 741 742 task *kernel.Task 743 pidns *kernel.PIDNamespace 744 locks vfs.FileLocks 745 } 746 747 // statusFD implements vfs.FileDescriptionImpl and vfs.DynamicByteSource for 748 // /proc/[pid]/status. 749 // 750 // +stateify savable 751 type statusFD struct { 752 statusFDLowerBase 753 vfs.DynamicBytesFileDescriptionImpl 754 vfs.LockFD 755 756 vfsfd vfs.FileDescription 757 758 inode *statusInode 759 task *kernel.Task 760 pidns *kernel.PIDNamespace 761 userns *auth.UserNamespace // equivalent to struct file::f_cred::user_ns 762 } 763 764 // statusFDLowerBase is a dumb hack to ensure that statusFD prefers 765 // vfs.DynamicBytesFileDescriptionImpl methods to vfs.FileDescriptinDefaultImpl 766 // methods. 767 // 768 // +stateify savable 769 type statusFDLowerBase struct { 770 vfs.FileDescriptionDefaultImpl 771 } 772 773 func (fs *filesystem) newStatusInode(ctx context.Context, task *kernel.Task, pidns *kernel.PIDNamespace, ino uint64, perm linux.FileMode) kernfs.Inode { 774 // Note: credentials are overridden by taskOwnedInode. 775 inode := &statusInode{ 776 task: task, 777 pidns: pidns, 778 } 779 inode.InodeAttrs.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeRegular|perm) 780 return &taskOwnedInode{Inode: inode, owner: task} 781 } 782 783 // Open implements kernfs.Inode.Open. 784 func (s *statusInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { 785 fd := &statusFD{ 786 inode: s, 787 task: s.task, 788 pidns: s.pidns, 789 userns: rp.Credentials().UserNamespace, 790 } 791 fd.LockFD.Init(&s.locks) 792 if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { 793 return nil, err 794 } 795 fd.DynamicBytesFileDescriptionImpl.Init(&fd.vfsfd, fd) 796 return &fd.vfsfd, nil 797 } 798 799 // SetStat implements kernfs.Inode.SetStat. 800 func (*statusInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { 801 return linuxerr.EPERM 802 } 803 804 // Release implements vfs.FileDescriptionImpl.Release. 805 func (s *statusFD) Release(ctx context.Context) { 806 } 807 808 // Stat implements vfs.FileDescriptionImpl.Stat. 809 func (s *statusFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { 810 fs := s.vfsfd.VirtualDentry().Mount().Filesystem() 811 return s.inode.Stat(ctx, fs, opts) 812 } 813 814 // SetStat implements vfs.FileDescriptionImpl.SetStat. 815 func (s *statusFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { 816 return linuxerr.EPERM 817 } 818 819 // Generate implements vfs.DynamicBytesSource.Generate. 820 func (s *statusFD) Generate(ctx context.Context, buf *bytes.Buffer) error { 821 fmt.Fprintf(buf, "Name:\t%s\n", s.task.Name()) 822 fmt.Fprintf(buf, "State:\t%s\n", s.task.StateStatus()) 823 fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.task.ThreadGroup())) 824 fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.task)) 825 826 ppid := kernel.ThreadID(0) 827 if parent := s.task.Parent(); parent != nil { 828 ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) 829 } 830 fmt.Fprintf(buf, "PPid:\t%d\n", ppid) 831 832 tpid := kernel.ThreadID(0) 833 if tracer := s.task.Tracer(); tracer != nil { 834 tpid = s.pidns.IDOfTask(tracer) 835 } 836 fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid) 837 838 creds := s.task.Credentials() 839 ruid := creds.RealKUID.In(s.userns).OrOverflow() 840 euid := creds.EffectiveKUID.In(s.userns).OrOverflow() 841 suid := creds.SavedKUID.In(s.userns).OrOverflow() 842 rgid := creds.RealKGID.In(s.userns).OrOverflow() 843 egid := creds.EffectiveKGID.In(s.userns).OrOverflow() 844 sgid := creds.SavedKGID.In(s.userns).OrOverflow() 845 var fds int 846 var vss, rss, data uint64 847 s.task.WithMuLocked(func(t *kernel.Task) { 848 if fdTable := t.FDTable(); fdTable != nil { 849 fds = fdTable.CurrentMaxFDs() 850 } 851 }) 852 if mm := getMM(s.task); mm != nil { 853 vss = mm.VirtualMemorySize() 854 rss = mm.ResidentSetSize() 855 data = mm.VirtualDataSize() 856 } 857 // Filesystem user/group IDs aren't implemented; effective UID/GID are used 858 // instead. 859 fmt.Fprintf(buf, "Uid:\t%d\t%d\t%d\t%d\n", ruid, euid, suid, euid) 860 fmt.Fprintf(buf, "Gid:\t%d\t%d\t%d\t%d\n", rgid, egid, sgid, egid) 861 fmt.Fprintf(buf, "FDSize:\t%d\n", fds) 862 buf.WriteString("Groups:\t") 863 // There is a space between each pair of supplemental GIDs, as well as an 864 // unconditional trailing space that some applications actually depend on. 865 var sep string 866 for _, kgid := range creds.ExtraKGIDs { 867 fmt.Fprintf(buf, "%s%d", sep, kgid.In(s.userns).OrOverflow()) 868 sep = " " 869 } 870 buf.WriteString(" \n") 871 872 fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10) 873 fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10) 874 fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10) 875 876 fmt.Fprintf(buf, "Threads:\t%d\n", s.task.ThreadGroup().Count()) 877 fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps) 878 fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps) 879 fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps) 880 fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps) 881 fmt.Fprintf(buf, "Seccomp:\t%d\n", s.task.SeccompMode()) 882 // We unconditionally report a single NUMA node. See 883 // pkg/sentry/syscalls/linux/sys_mempolicy.go. 884 fmt.Fprintf(buf, "Mems_allowed:\t1\n") 885 fmt.Fprintf(buf, "Mems_allowed_list:\t0\n") 886 return nil 887 } 888 889 // ioUsage is the /proc/[pid]/io and /proc/[pid]/task/[tid]/io data provider. 890 type ioUsage interface { 891 // IOUsage returns the io usage data. 892 IOUsage() *usage.IO 893 } 894 895 // +stateify savable 896 type ioData struct { 897 kernfs.DynamicBytesFile 898 899 ioUsage 900 } 901 902 var _ dynamicInode = (*ioData)(nil) 903 904 // Generate implements vfs.DynamicBytesSource.Generate. 905 func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error { 906 io := usage.IO{} 907 io.Accumulate(i.IOUsage()) 908 909 fmt.Fprintf(buf, "char: %d\n", io.CharsRead.RacyLoad()) 910 fmt.Fprintf(buf, "wchar: %d\n", io.CharsWritten.RacyLoad()) 911 fmt.Fprintf(buf, "syscr: %d\n", io.ReadSyscalls.RacyLoad()) 912 fmt.Fprintf(buf, "syscw: %d\n", io.WriteSyscalls.RacyLoad()) 913 fmt.Fprintf(buf, "read_bytes: %d\n", io.BytesRead.RacyLoad()) 914 fmt.Fprintf(buf, "write_bytes: %d\n", io.BytesWritten.RacyLoad()) 915 fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled.RacyLoad()) 916 return nil 917 } 918 919 // oomScoreAdj is a stub of the /proc/<pid>/oom_score_adj file. 920 // 921 // +stateify savable 922 type oomScoreAdj struct { 923 kernfs.DynamicBytesFile 924 925 task *kernel.Task 926 } 927 928 var _ vfs.WritableDynamicBytesSource = (*oomScoreAdj)(nil) 929 930 // Generate implements vfs.DynamicBytesSource.Generate. 931 func (o *oomScoreAdj) Generate(ctx context.Context, buf *bytes.Buffer) error { 932 if o.task.ExitState() == kernel.TaskExitDead { 933 return linuxerr.ESRCH 934 } 935 fmt.Fprintf(buf, "%d\n", o.task.OOMScoreAdj()) 936 return nil 937 } 938 939 // Write implements vfs.WritableDynamicBytesSource.Write. 940 func (o *oomScoreAdj) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) { 941 if src.NumBytes() == 0 { 942 return 0, nil 943 } 944 945 // Limit input size so as not to impact performance if input size is large. 946 src = src.TakeFirst(hostarch.PageSize - 1) 947 948 str, err := usermem.CopyStringIn(ctx, src.IO, src.Addrs.Head().Start, int(src.Addrs.Head().Length()), src.Opts) 949 if err != nil && err != linuxerr.ENAMETOOLONG { 950 return 0, err 951 } 952 953 str = strings.TrimSpace(str) 954 v, err := strconv.ParseInt(str, 0, 32) 955 if err != nil { 956 return 0, linuxerr.EINVAL 957 } 958 959 if o.task.ExitState() == kernel.TaskExitDead { 960 return 0, linuxerr.ESRCH 961 } 962 if err := o.task.SetOOMScoreAdj(int32(v)); err != nil { 963 return 0, err 964 } 965 966 return src.NumBytes(), nil 967 } 968 969 // exeSymlink is an symlink for the /proc/[pid]/exe file. 970 // 971 // +stateify savable 972 type exeSymlink struct { 973 implStatFS 974 kernfs.InodeAttrs 975 kernfs.InodeNoopRefCount 976 kernfs.InodeNotAnonymous 977 kernfs.InodeSymlink 978 kernfs.InodeWatches 979 980 fs *filesystem 981 task *kernel.Task 982 } 983 984 var _ kernfs.Inode = (*exeSymlink)(nil) 985 986 func (fs *filesystem) newExeSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode { 987 inode := &exeSymlink{ 988 fs: fs, 989 task: task, 990 } 991 inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) 992 return inode 993 } 994 995 // Readlink implements kernfs.Inode.Readlink. 996 func (s *exeSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { 997 exec, _, err := s.Getlink(ctx, nil) 998 if err != nil { 999 return "", err 1000 } 1001 defer s.fs.SafeDecRef(ctx, exec) 1002 1003 root := vfs.RootFromContext(ctx) 1004 if !root.Ok() { 1005 panic("procfs Readlink requires context with root value") 1006 } 1007 defer s.fs.SafeDecRef(ctx, root) 1008 1009 vfsObj := exec.Mount().Filesystem().VirtualFilesystem() 1010 name, _ := vfsObj.PathnameWithDeleted(ctx, root, exec) 1011 return name, nil 1012 } 1013 1014 // Getlink implements kernfs.Inode.Getlink. 1015 func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { 1016 if !kernel.ContextCanTrace(ctx, s.task, false) { 1017 return vfs.VirtualDentry{}, "", linuxerr.EACCES 1018 } 1019 if err := checkTaskState(s.task); err != nil { 1020 return vfs.VirtualDentry{}, "", err 1021 } 1022 1023 mm := getMM(s.task) 1024 if mm == nil { 1025 return vfs.VirtualDentry{}, "", linuxerr.EACCES 1026 } 1027 1028 // The MemoryManager may be destroyed, in which case 1029 // MemoryManager.destroy will simply set the executable to nil 1030 // (with locks held). 1031 exec := mm.Executable() 1032 if exec == nil { 1033 return vfs.VirtualDentry{}, "", linuxerr.ESRCH 1034 } 1035 defer exec.DecRef(ctx) 1036 1037 vd := exec.VirtualDentry() 1038 vd.IncRef() 1039 return vd, "", nil 1040 } 1041 1042 // cwdSymlink is an symlink for the /proc/[pid]/cwd file. 1043 // 1044 // +stateify savable 1045 type cwdSymlink struct { 1046 implStatFS 1047 kernfs.InodeAttrs 1048 kernfs.InodeNoopRefCount 1049 kernfs.InodeNotAnonymous 1050 kernfs.InodeSymlink 1051 kernfs.InodeWatches 1052 1053 fs *filesystem 1054 task *kernel.Task 1055 } 1056 1057 var _ kernfs.Inode = (*cwdSymlink)(nil) 1058 1059 func (fs *filesystem) newCwdSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode { 1060 inode := &cwdSymlink{ 1061 fs: fs, 1062 task: task, 1063 } 1064 inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) 1065 return inode 1066 } 1067 1068 // Readlink implements kernfs.Inode.Readlink. 1069 func (s *cwdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { 1070 cwd, _, err := s.Getlink(ctx, nil) 1071 if err != nil { 1072 return "", err 1073 } 1074 defer s.fs.SafeDecRef(ctx, cwd) 1075 1076 root := vfs.RootFromContext(ctx) 1077 if !root.Ok() { 1078 panic("procfs Readlink requires context with root value") 1079 } 1080 defer s.fs.SafeDecRef(ctx, root) 1081 1082 vfsObj := cwd.Mount().Filesystem().VirtualFilesystem() 1083 name, _ := vfsObj.PathnameWithDeleted(ctx, root, cwd) 1084 return name, nil 1085 } 1086 1087 // Getlink implements kernfs.Inode.Getlink. 1088 func (s *cwdSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { 1089 if !kernel.ContextCanTrace(ctx, s.task, false) { 1090 return vfs.VirtualDentry{}, "", linuxerr.EACCES 1091 } 1092 if err := checkTaskState(s.task); err != nil { 1093 return vfs.VirtualDentry{}, "", err 1094 } 1095 cwd := s.task.FSContext().WorkingDirectory() 1096 if !cwd.Ok() { 1097 // It could have raced with process deletion. 1098 return vfs.VirtualDentry{}, "", linuxerr.ESRCH 1099 } 1100 // The reference is transferred to the caller. 1101 return cwd, "", nil 1102 } 1103 1104 // rootSymlink is an symlink for the /proc/[pid]/root file. 1105 // 1106 // +stateify savable 1107 type rootSymlink struct { 1108 implStatFS 1109 kernfs.InodeAttrs 1110 kernfs.InodeNoopRefCount 1111 kernfs.InodeNotAnonymous 1112 kernfs.InodeSymlink 1113 kernfs.InodeWatches 1114 1115 fs *filesystem 1116 task *kernel.Task 1117 } 1118 1119 var _ kernfs.Inode = (*rootSymlink)(nil) 1120 1121 func (fs *filesystem) newRootSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode { 1122 inode := &rootSymlink{ 1123 fs: fs, 1124 task: task, 1125 } 1126 inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) 1127 return inode 1128 } 1129 1130 // Readlink implements kernfs.Inode.Readlink. 1131 func (s *rootSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { 1132 root, _, err := s.Getlink(ctx, nil) 1133 if err != nil { 1134 return "", err 1135 } 1136 defer s.fs.SafeDecRef(ctx, root) 1137 1138 vfsRoot := vfs.RootFromContext(ctx) 1139 if !vfsRoot.Ok() { 1140 panic("procfs Readlink requires context with root value") 1141 } 1142 defer s.fs.SafeDecRef(ctx, vfsRoot) 1143 1144 vfsObj := root.Mount().Filesystem().VirtualFilesystem() 1145 name, _ := vfsObj.PathnameWithDeleted(ctx, vfsRoot, root) 1146 return name, nil 1147 } 1148 1149 // Getlink implements kernfs.Inode.Getlink. 1150 func (s *rootSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { 1151 if !kernel.ContextCanTrace(ctx, s.task, false) { 1152 return vfs.VirtualDentry{}, "", linuxerr.EACCES 1153 } 1154 if err := checkTaskState(s.task); err != nil { 1155 return vfs.VirtualDentry{}, "", err 1156 } 1157 root := s.task.FSContext().RootDirectory() 1158 if !root.Ok() { 1159 // It could have raced with process deletion. 1160 return vfs.VirtualDentry{}, "", linuxerr.ESRCH 1161 } 1162 // The reference is transferred to the caller. 1163 return root, "", nil 1164 } 1165 1166 // mountInfoData is used to implement /proc/[pid]/mountinfo. 1167 // 1168 // +stateify savable 1169 type mountInfoData struct { 1170 kernfs.DynamicBytesFile 1171 1172 fs *filesystem 1173 task *kernel.Task 1174 } 1175 1176 var _ dynamicInode = (*mountInfoData)(nil) 1177 1178 // Generate implements vfs.DynamicBytesSource.Generate. 1179 func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { 1180 var fsctx *kernel.FSContext 1181 i.task.WithMuLocked(func(t *kernel.Task) { 1182 fsctx = t.FSContext() 1183 }) 1184 if fsctx == nil { 1185 // The task has been destroyed. Nothing to show here. 1186 return nil 1187 } 1188 rootDir := fsctx.RootDirectory() 1189 if !rootDir.Ok() { 1190 // Root has been destroyed. Don't try to read mounts. 1191 return nil 1192 } 1193 defer i.fs.SafeDecRef(ctx, rootDir) 1194 i.task.Kernel().VFS().GenerateProcMountInfo(ctx, rootDir, buf) 1195 return nil 1196 } 1197 1198 // mountsData is used to implement /proc/[pid]/mounts. 1199 // 1200 // +stateify savable 1201 type mountsData struct { 1202 kernfs.DynamicBytesFile 1203 1204 fs *filesystem 1205 task *kernel.Task 1206 } 1207 1208 var _ dynamicInode = (*mountsData)(nil) 1209 1210 // Generate implements vfs.DynamicBytesSource.Generate. 1211 func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error { 1212 var fsctx *kernel.FSContext 1213 i.task.WithMuLocked(func(t *kernel.Task) { 1214 fsctx = t.FSContext() 1215 }) 1216 if fsctx == nil { 1217 // The task has been destroyed. Nothing to show here. 1218 return nil 1219 } 1220 rootDir := fsctx.RootDirectory() 1221 if !rootDir.Ok() { 1222 // Root has been destroyed. Don't try to read mounts. 1223 return nil 1224 } 1225 defer i.fs.SafeDecRef(ctx, rootDir) 1226 i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf) 1227 return nil 1228 } 1229 1230 // +stateify savable 1231 type namespaceSymlink struct { 1232 kernfs.StaticSymlink 1233 1234 task *kernel.Task 1235 nsType int 1236 } 1237 1238 func (fs *filesystem) newNamespaceSymlink(ctx context.Context, task *kernel.Task, ino uint64, nsType int) kernfs.Inode { 1239 inode := &namespaceSymlink{task: task, nsType: nsType} 1240 1241 // Note: credentials are overridden by taskOwnedInode. 1242 inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, "") 1243 1244 taskInode := &taskOwnedInode{Inode: inode, owner: task} 1245 return taskInode 1246 } 1247 1248 func (fs *filesystem) newPIDNamespaceSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode { 1249 target := fmt.Sprintf("pid:[%d]", task.PIDNamespace().ID()) 1250 1251 inode := &namespaceSymlink{task: task} 1252 // Note: credentials are overridden by taskOwnedInode. 1253 inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target) 1254 1255 taskInode := &taskOwnedInode{Inode: inode, owner: task} 1256 return taskInode 1257 } 1258 1259 func (fs *filesystem) newFakeNamespaceSymlink(ctx context.Context, task *kernel.Task, ino uint64, ns string) kernfs.Inode { 1260 // Namespace symlinks should contain the namespace name and the inode number 1261 // for the namespace instance, so for example user:[123456]. We currently fake 1262 // the inode number by sticking the symlink inode in its place. 1263 target := fmt.Sprintf("%s:[%d]", ns, ino) 1264 1265 inode := &namespaceSymlink{task: task} 1266 // Note: credentials are overridden by taskOwnedInode. 1267 inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target) 1268 1269 taskInode := &taskOwnedInode{Inode: inode, owner: task} 1270 return taskInode 1271 } 1272 1273 func (s *namespaceSymlink) getInode(t *kernel.Task) *nsfs.Inode { 1274 switch s.nsType { 1275 case linux.CLONE_NEWNET: 1276 return t.GetNetworkNamespace().GetInode() 1277 default: 1278 panic("unknown namespace") 1279 } 1280 } 1281 1282 // Readlink implements kernfs.Inode.Readlink. 1283 func (s *namespaceSymlink) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { 1284 if err := checkTaskState(s.task); err != nil { 1285 return "", err 1286 } 1287 if s.nsType != 0 { 1288 inode := s.getInode(s.task) 1289 if inode == nil { 1290 return "", linuxerr.ENOENT 1291 } 1292 target := inode.Name() 1293 inode.DecRef(ctx) 1294 return target, nil 1295 } 1296 return s.StaticSymlink.Readlink(ctx, mnt) 1297 } 1298 1299 // Getlink implements kernfs.Inode.Getlink. 1300 func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { 1301 if err := checkTaskState(s.task); err != nil { 1302 return vfs.VirtualDentry{}, "", err 1303 } 1304 1305 if s.nsType != 0 { 1306 inode := s.getInode(s.task) 1307 if inode == nil { 1308 return vfs.VirtualDentry{}, "", linuxerr.ENOENT 1309 } 1310 defer inode.DecRef(ctx) 1311 return inode.VirtualDentry(), "", nil 1312 } 1313 // Create a synthetic inode to represent the namespace. 1314 fs := mnt.Filesystem().Impl().(*filesystem) 1315 nsInode := &namespaceInode{} 1316 nsInode.Init(ctx, auth.CredentialsFromContext(ctx), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0444) 1317 dentry := &kernfs.Dentry{} 1318 dentry.Init(&fs.Filesystem, nsInode) 1319 vd := vfs.MakeVirtualDentry(mnt, dentry.VFSDentry()) 1320 // Only IncRef vd.Mount() because vd.Dentry() already holds a ref of 1. 1321 mnt.IncRef() 1322 return vd, "", nil 1323 } 1324 1325 // namespaceInode is a synthetic inode created to represent a namespace in 1326 // /proc/[pid]/ns/*. 1327 // 1328 // +stateify savable 1329 type namespaceInode struct { 1330 implStatFS 1331 kernfs.InodeAttrs 1332 kernfs.InodeNoopRefCount 1333 kernfs.InodeNotAnonymous 1334 kernfs.InodeNotDirectory 1335 kernfs.InodeNotSymlink 1336 kernfs.InodeWatches 1337 1338 locks vfs.FileLocks 1339 } 1340 1341 var _ kernfs.Inode = (*namespaceInode)(nil) 1342 1343 // Init initializes a namespace inode. 1344 func (i *namespaceInode) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) { 1345 if perm&^linux.PermissionsMask != 0 { 1346 panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) 1347 } 1348 i.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeRegular|perm) 1349 } 1350 1351 // Open implements kernfs.Inode.Open. 1352 func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { 1353 fd := &namespaceFD{inode: i} 1354 i.IncRef() 1355 fd.LockFD.Init(&i.locks) 1356 if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { 1357 return nil, err 1358 } 1359 return &fd.vfsfd, nil 1360 } 1361 1362 // namespace FD is a synthetic file that represents a namespace in 1363 // /proc/[pid]/ns/*. 1364 // 1365 // +stateify savable 1366 type namespaceFD struct { 1367 vfs.FileDescriptionDefaultImpl 1368 vfs.LockFD 1369 1370 vfsfd vfs.FileDescription 1371 inode *namespaceInode 1372 } 1373 1374 var _ vfs.FileDescriptionImpl = (*namespaceFD)(nil) 1375 1376 // Stat implements vfs.FileDescriptionImpl.Stat. 1377 func (fd *namespaceFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { 1378 vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem() 1379 return fd.inode.Stat(ctx, vfs, opts) 1380 } 1381 1382 // SetStat implements vfs.FileDescriptionImpl.SetStat. 1383 func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { 1384 vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem() 1385 creds := auth.CredentialsFromContext(ctx) 1386 return fd.inode.SetStat(ctx, vfs, creds, opts) 1387 } 1388 1389 // Release implements vfs.FileDescriptionImpl.Release. 1390 func (fd *namespaceFD) Release(ctx context.Context) { 1391 fd.inode.DecRef(ctx) 1392 } 1393 1394 // taskCgroupData generates data for /proc/[pid]/cgroup. 1395 // 1396 // +stateify savable 1397 type taskCgroupData struct { 1398 dynamicBytesFileSetAttr 1399 task *kernel.Task 1400 } 1401 1402 var _ dynamicInode = (*taskCgroupData)(nil) 1403 1404 // Generate implements vfs.DynamicBytesSource.Generate. 1405 func (d *taskCgroupData) Generate(ctx context.Context, buf *bytes.Buffer) error { 1406 // When a task is existing on Linux, a task's cgroup set is cleared and 1407 // reset to the initial cgroup set, which is essentially the set of root 1408 // cgroups. Because of this, the /proc/<pid>/cgroup file is always readable 1409 // on Linux throughout a task's lifetime. 1410 // 1411 // The sentry removes tasks from cgroups during the exit process, but 1412 // doesn't move them into an initial cgroup set, so partway through task 1413 // exit this file show a task is in no cgroups, which is incorrect. Instead, 1414 // once a task has left its cgroups, we return an error. 1415 if d.task.ExitState() >= kernel.TaskExitInitiated { 1416 return linuxerr.ESRCH 1417 } 1418 1419 d.task.GenerateProcTaskCgroup(buf) 1420 return nil 1421 }