github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/mm/procfs.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mm

import (
	"bytes"
	"fmt"

	"github.com/SagerNet/gvisor/pkg/context"
	"github.com/SagerNet/gvisor/pkg/hostarch"
	"github.com/SagerNet/gvisor/pkg/sentry/fs/proc/seqfile"
	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
)

const (
	// devMinorBits is the number of minor bits in a device number. Linux:
	// include/linux/kdev_t.h:MINORBITS
	devMinorBits = 20

	vsyscallEnd        = hostarch.Addr(0xffffffffff601000)
	vsyscallMapsEntry  = "ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0                  [vsyscall]\n"
	vsyscallSmapsEntry = vsyscallMapsEntry +
		"Size:                  4 kB\n" +
		"Rss:                   0 kB\n" +
		"Pss:                   0 kB\n" +
		"Shared_Clean:          0 kB\n" +
		"Shared_Dirty:          0 kB\n" +
		"Private_Clean:         0 kB\n" +
		"Private_Dirty:         0 kB\n" +
		"Referenced:            0 kB\n" +
		"Anonymous:             0 kB\n" +
		"AnonHugePages:         0 kB\n" +
		"Shared_Hugetlb:        0 kB\n" +
		"Private_Hugetlb:       0 kB\n" +
		"Swap:                  0 kB\n" +
		"SwapPss:               0 kB\n" +
		"KernelPageSize:        4 kB\n" +
		"MMUPageSize:           4 kB\n" +
		"Locked:                0 kB\n" +
		"VmFlags: rd ex \n"
)

// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate.
func (mm *MemoryManager) NeedsUpdate(generation int64) bool {
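	// The address space can change at any time, so data generated for any
	// previous generation may already be stale; always regenerate.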
	return true
}

// ReadMapsDataInto is called by fsimpl/proc.mapsData.Generate to
// implement /proc/[pid]/maps.
func (mm *MemoryManager) ReadMapsDataInto(ctx context.Context, buf *bytes.Buffer) {
	mm.mappingMu.RLock()
	defer mm.mappingMu.RUnlock()
	var start hostarch.Addr

	for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
		mm.appendVMAMapsEntryLocked(ctx, vseg, buf)
	}

	// We always emulate vsyscall, so advertise it here. Everything about a
	// vsyscall region is static, so just hard code the maps entry since we
	// don't have a real vma backing it. The vsyscall region is at the end of
	// the virtual address space, so nothing should be mapped after it (if
	// something is really mapped in the tiny ~10 MiB segment afterwards, we'll
	// get the sorting on the maps file wrong at worst; but that's not possible
	// on any current platform).
	//
	// Artificially adjust the seqfile handle so we only output the vsyscall
	// entry once.
	if start != vsyscallEnd {
		buf.WriteString(vsyscallMapsEntry)
	}
}

// ReadMapsSeqFileData is called by fs/proc.mapsData.ReadSeqFileData to
// implement /proc/[pid]/maps.
func (mm *MemoryManager) ReadMapsSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) {
	mm.mappingMu.RLock()
	defer mm.mappingMu.RUnlock()
	var data []seqfile.SeqData
	var start hostarch.Addr
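	// A non-nil handle holds the end address of the last vma emitted by a
	// previous call to this function; resume iteration from there.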
	if handle != nil {
		start = *handle.(*hostarch.Addr)
	}
	for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
		vmaAddr := vseg.End()
		data = append(data, seqfile.SeqData{
			Buf:    mm.vmaMapsEntryLocked(ctx, vseg),
			Handle: &vmaAddr,
		})
	}

	// We always emulate vsyscall, so advertise it here. Everything about a
	// vsyscall region is static, so just hard code the maps entry since we
	// don't have a real vma backing it. The vsyscall region is at the end of
	// the virtual address space, so nothing should be mapped after it (if
	// something is really mapped in the tiny ~10 MiB segment afterwards, we'll
	// get the sorting on the maps file wrong at worst; but that's not possible
	// on any current platform).
	//
	// Artificially adjust the seqfile handle so we only output the vsyscall
	// entry once.
	if start != vsyscallEnd {
		vmaAddr := vsyscallEnd
		data = append(data, seqfile.SeqData{
			Buf:    []byte(vsyscallMapsEntry),
			Handle: &vmaAddr,
		})
	}
	return data, 1
}

// vmaMapsEntryLocked returns a /proc/[pid]/maps entry for the vma iterated by
// vseg, including the trailing newline.
//
// Preconditions: mm.mappingMu must be locked.
func (mm *MemoryManager) vmaMapsEntryLocked(ctx context.Context, vseg vmaIterator) []byte {
	var b bytes.Buffer
	mm.appendVMAMapsEntryLocked(ctx, vseg, &b)
	return b.Bytes()
}

// Preconditions: mm.mappingMu must be locked.
func (mm *MemoryManager) appendVMAMapsEntryLocked(ctx context.Context, vseg vmaIterator, b *bytes.Buffer) {
	vma := vseg.ValuePtr()
	private := "p"
	if !vma.private {
		private = "s"
	}

	var dev, ino uint64
	if vma.id != nil {
		dev = vma.id.DeviceID()
		ino = vma.id.InodeID()
	}
	devMajor := uint32(dev >> devMinorBits)
	devMinor := uint32(dev & ((1 << devMinorBits) - 1))
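	// For example, dev = 0x2a yields devMajor = 0 and devMinor = 42, which is
	// printed below as "00:2a".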

	// Do not include the guard page: fs/proc/task_mmu.c:show_map_vma() =>
	// stack_guard_page_start().
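	// The fields below are, in order: address range, permissions plus the
	// private/shared flag, file offset, device major:minor, and inode; e.g.
	// "ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0".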
	lineLen, _ := fmt.Fprintf(b, "%08x-%08x %s%s %08x %02x:%02x %d ",
		vseg.Start(), vseg.End(), vma.realPerms, private, vma.off, devMajor, devMinor, ino)

	// Figure out our filename or hint.
	var s string
	if vma.hint != "" {
		s = vma.hint
	} else if vma.id != nil {
		// FIXME(jamieliu): We are holding mm.mappingMu here, which is
		// consistent with Linux's holding mmap_sem in
		// fs/proc/task_mmu.c:show_map_vma() => fs/seq_file.c:seq_file_path().
		// However, it's not clear that fs.File.MappedName() is actually
		// consistent with this lock order.
		s = vma.id.MappedName(ctx)
	}
	if s != "" {
		// Per Linux, we pad until the 74th character.
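		// lineLen is the number of bytes already written for this entry, so a
		// 40-byte prefix gets 33 spaces of padding and the name starts at
		// column 74.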
		for pad := 73 - lineLen; pad > 0; pad-- {
			b.WriteByte(' ')
		}
		b.WriteString(s)
	}
	b.WriteByte('\n')
}

// ReadSmapsDataInto is called by fsimpl/proc.smapsData.Generate to
// implement /proc/[pid]/smaps.
func (mm *MemoryManager) ReadSmapsDataInto(ctx context.Context, buf *bytes.Buffer) {
	mm.mappingMu.RLock()
	defer mm.mappingMu.RUnlock()
	var start hostarch.Addr

	for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
		mm.vmaSmapsEntryIntoLocked(ctx, vseg, buf)
	}

	// We always emulate vsyscall, so advertise it here. See
	// ReadMapsSeqFileData for additional commentary.
	if start != vsyscallEnd {
		buf.WriteString(vsyscallSmapsEntry)
	}
}

// ReadSmapsSeqFileData is called by fs/proc.smapsData.ReadSeqFileData to
// implement /proc/[pid]/smaps.
func (mm *MemoryManager) ReadSmapsSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) {
	mm.mappingMu.RLock()
	defer mm.mappingMu.RUnlock()
	var data []seqfile.SeqData
	var start hostarch.Addr
	if handle != nil {
		start = *handle.(*hostarch.Addr)
	}
	for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
		vmaAddr := vseg.End()
		data = append(data, seqfile.SeqData{
			Buf:    mm.vmaSmapsEntryLocked(ctx, vseg),
			Handle: &vmaAddr,
		})
	}

	// We always emulate vsyscall, so advertise it here. See
	// ReadMapsSeqFileData for additional commentary.
	if start != vsyscallEnd {
		vmaAddr := vsyscallEnd
		data = append(data, seqfile.SeqData{
			Buf:    []byte(vsyscallSmapsEntry),
			Handle: &vmaAddr,
		})
	}
	return data, 1
}

// vmaSmapsEntryLocked returns a /proc/[pid]/smaps entry for the vma iterated
// by vseg, including the trailing newline.
//
// Preconditions: mm.mappingMu must be locked.
func (mm *MemoryManager) vmaSmapsEntryLocked(ctx context.Context, vseg vmaIterator) []byte {
	var b bytes.Buffer
	mm.vmaSmapsEntryIntoLocked(ctx, vseg, &b)
	return b.Bytes()
}

// Preconditions: mm.mappingMu must be locked.
func (mm *MemoryManager) vmaSmapsEntryIntoLocked(ctx context.Context, vseg vmaIterator, b *bytes.Buffer) {
	mm.appendVMAMapsEntryLocked(ctx, vseg, b)
	vma := vseg.ValuePtr()

	// We take mm.activeMu here in each call to vmaSmapsEntryIntoLocked,
	// instead of requiring it to be locked as a precondition, to reduce the
	// latency impact of reading /proc/[pid]/smaps on concurrent
	// performance-sensitive operations requiring activeMu for writing, like
	// faults.
	mm.activeMu.RLock()
	var rss uint64
	var anon uint64
	vsegAR := vseg.Range()
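	// Sum the lengths of all pmas overlapping this vma: every resident byte
	// counts toward Rss, and bytes backed by private pmas also count toward
	// Anonymous.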
	for pseg := mm.pmas.LowerBoundSegment(vsegAR.Start); pseg.Ok() && pseg.Start() < vsegAR.End; pseg = pseg.NextSegment() {
		psegAR := pseg.Range().Intersect(vsegAR)
		size := uint64(psegAR.Length())
		rss += size
		if pseg.ValuePtr().private {
			anon += size
		}
	}
	mm.activeMu.RUnlock()

	fmt.Fprintf(b, "Size:           %8d kB\n", vseg.Range().Length()/1024)
	fmt.Fprintf(b, "Rss:            %8d kB\n", rss/1024)
	// Currently we report PSS = RSS, i.e. we pretend each page mapped by a pma
	// is only mapped by that pma. This avoids having to query memmap.Mappables
	// for reference count information on each page. As a corollary, all pages
	// are accounted as "private" whether or not the vma is private; compare
	// Linux's fs/proc/task_mmu.c:smaps_account().
	fmt.Fprintf(b, "Pss:            %8d kB\n", rss/1024)
	fmt.Fprintf(b, "Shared_Clean:   %8d kB\n", 0)
	fmt.Fprintf(b, "Shared_Dirty:   %8d kB\n", 0)
	// Pretend that all pages are dirty if the vma is writable, and clean otherwise.
	clean := rss
	if vma.effectivePerms.Write {
		clean = 0
	}
	fmt.Fprintf(b, "Private_Clean:  %8d kB\n", clean/1024)
	fmt.Fprintf(b, "Private_Dirty:  %8d kB\n", (rss-clean)/1024)
	// Pretend that all pages are "referenced" (recently touched).
	fmt.Fprintf(b, "Referenced:     %8d kB\n", rss/1024)
	fmt.Fprintf(b, "Anonymous:      %8d kB\n", anon/1024)
	// Hugepages (hugetlb and THP) are not implemented.
	fmt.Fprintf(b, "AnonHugePages:  %8d kB\n", 0)
	fmt.Fprintf(b, "Shared_Hugetlb: %8d kB\n", 0)
	fmt.Fprintf(b, "Private_Hugetlb: %7d kB\n", 0)
	// Swap is not implemented.
	fmt.Fprintf(b, "Swap:           %8d kB\n", 0)
	fmt.Fprintf(b, "SwapPss:        %8d kB\n", 0)
	fmt.Fprintf(b, "KernelPageSize: %8d kB\n", hostarch.PageSize/1024)
	fmt.Fprintf(b, "MMUPageSize:    %8d kB\n", hostarch.PageSize/1024)
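	// If the vma is mlocked in any mode, report the entire resident set as
	// locked.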
	locked := rss
	if vma.mlockMode == memmap.MLockNone {
		locked = 0
	}
	fmt.Fprintf(b, "Locked:         %8d kB\n", locked/1024)

	b.WriteString("VmFlags: ")
	if vma.realPerms.Read {
		b.WriteString("rd ")
	}
	if vma.realPerms.Write {
		b.WriteString("wr ")
	}
	if vma.realPerms.Execute {
		b.WriteString("ex ")
	}
	if vma.canWriteMappableLocked() { // VM_SHARED
		b.WriteString("sh ")
	}
	if vma.maxPerms.Read {
		b.WriteString("mr ")
	}
	if vma.maxPerms.Write {
		b.WriteString("mw ")
	}
	if vma.maxPerms.Execute {
		b.WriteString("me ")
	}
	if !vma.private { // VM_MAYSHARE
		b.WriteString("ms ")
	}
	if vma.growsDown {
		b.WriteString("gd ")
	}
	if vma.mlockMode != memmap.MLockNone { // VM_LOCKED
		b.WriteString("lo ")
	}
	if vma.mlockMode == memmap.MLockLazy { // VM_LOCKONFAULT
		b.WriteString("?? ") // no explicit encoding in fs/proc/task_mmu.c:show_smap_vma_flags()
	}
	if vma.private && vma.effectivePerms.Write { // VM_ACCOUNT
		b.WriteString("ac ")
	}
	b.WriteString("\n")
}