gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/mm/io.go

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package mm
    16  
    17  import (
    18  	"fmt"
    19  
    20  	"gvisor.dev/gvisor/pkg/context"
    21  	"gvisor.dev/gvisor/pkg/errors/linuxerr"
    22  	"gvisor.dev/gvisor/pkg/hostarch"
    23  	"gvisor.dev/gvisor/pkg/safemem"
    24  	"gvisor.dev/gvisor/pkg/sentry/memmap"
    25  	"gvisor.dev/gvisor/pkg/sentry/platform"
    26  	"gvisor.dev/gvisor/pkg/sync"
    27  	"gvisor.dev/gvisor/pkg/usermem"
    28  )
    29  
    30  // There are two supported ways to copy data to/from application virtual
    31  // memory:
    32  //
    33  // 1. Internally-mapped copying: Determine the memmap.File that backs the
    34  // copied-to/from virtual address, obtain a mapping of its pages, and read or
    35  // write to the mapping.
    36  //
    37  // 2. AddressSpace copying: If platform.Platform.SupportsAddressSpaceIO() is
    38  // true, AddressSpace permissions are applicable, and an AddressSpace is
    39  // available, copy directly through the AddressSpace, handling faults as
    40  // needed.
    41  //
    42  // (Given that internally-mapped copying requires that backing memory is always
    43  // implemented using a host file descriptor, we could also preadv/pwritev to it
    44  // instead. But this would incur a host syscall for each use of the mapped
    45  // page, whereas mmap is a one-time cost.)
    46  //
    47  // The fixed overhead of internally-mapped copying is expected to be higher
    48  // than that of AddressSpace copying since the former always needs to translate
    49  // addresses, whereas the latter only needs to do so when faults occur.
    50  // However, the throughput of internally-mapped copying is expected to be
    51  // somewhat higher than that of AddressSpace copying due to the high cost of
    52  // page faults and because implementations of the latter usually rely on
    53  // safecopy, which doesn't use AVX registers. So we prefer to use AddressSpace
    54  // copying (when available) for smaller copies, and switch to internally-mapped
    55  // copying once a size threshold is exceeded.
    56  const (
    57  	// copyMapMinBytes is the size threshold for switching to internally-mapped
    58  	// copying in CopyOut, CopyIn, and ZeroOut.
    59  	copyMapMinBytes = 32 << 10 // 32 KB
    60  
    61  	// rwMapMinBytes is the size threshold for switching to internally-mapped
    62  	// copying in CopyOutFrom and CopyInTo. It's lower than copyMapMinBytes
    63  	// since AddressSpace copying in this case requires additional buffering;
    64  	// see CopyOutFrom for details.
    65  	rwMapMinBytes = 512
    66  )
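
// Illustrative sketch, not part of the original file: the size-based dispatch
// rule that CopyOut, CopyIn, and ZeroOut below apply, pulled out as a
// hypothetical helper for clarity. AddressSpace copying is only chosen when it
// is enabled for the given options and the copy is small enough that its lower
// fixed overhead outweighs the higher throughput of internally-mapped copying.
func (mm *MemoryManager) wouldUseASIOSketch(opts usermem.IOOpts, size int64) bool {
	// Mirrors the "mm.asioEnabled(opts) && size < copyMapMinBytes" checks in
	// the copy methods below; CopyOutFrom and CopyInTo compare against
	// rwMapMinBytes instead.
	return mm.asioEnabled(opts) && size < copyMapMinBytes
}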
    67  
    68  // CheckIORange is similar to hostarch.Addr.ToRange, but applies bounds checks
    69  // consistent with Linux's arch/x86/include/asm/uaccess.h:access_ok().
    70  //
    71  // Preconditions: length >= 0.
    72  func (mm *MemoryManager) CheckIORange(addr hostarch.Addr, length int64) (hostarch.AddrRange, bool) {
    73  	// Note that access_ok() constrains end even if length == 0.
    74  	ar, ok := addr.ToRange(uint64(length))
    75  	return ar, (ok && ar.End <= mm.layout.MaxAddr)
    76  }
    77  
    78  // checkIOVec applies bounds checks consistent with Linux's
    79  // arch/x86/include/asm/uaccess.h:access_ok() to ars.
    80  func (mm *MemoryManager) checkIOVec(ars hostarch.AddrRangeSeq) bool {
    81  	for !ars.IsEmpty() {
    82  		ar := ars.Head()
    83  		if _, ok := mm.CheckIORange(ar.Start, int64(ar.Length())); !ok {
    84  			return false
    85  		}
    86  		ars = ars.Tail()
    87  	}
    88  	return true
    89  }
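
// Hypothetical helper, not in the original file, showing how the two checks
// above compose: a single buffer is validated with CheckIORange, while an
// iovec-style sequence of ranges is validated with checkIOVec. Both mirror
// Linux's access_ok() and reject ranges extending past mm.layout.MaxAddr.
func (mm *MemoryManager) checkUserBufferSketch(addr hostarch.Addr, length int64, ars hostarch.AddrRangeSeq) error {
	if _, ok := mm.CheckIORange(addr, length); !ok {
		return linuxerr.EFAULT
	}
	if !mm.checkIOVec(ars) {
		return linuxerr.EFAULT
	}
	return nil
}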
    90  
    91  func (mm *MemoryManager) asioEnabled(opts usermem.IOOpts) bool {
    92  	return mm.haveASIO && !opts.IgnorePermissions && opts.AddressSpaceActive
    93  }
    94  
    95  // translateIOError converts errors to EFAULT, which is what Linux usually
    96  // reports for I/O errors originating from the MM.
    97  func translateIOError(ctx context.Context, err error) error {
    98  	if err == nil {
    99  		return nil
   100  	}
   101  	if logIOErrors {
   102  		ctx.Debugf("MM I/O error: %v", err)
   103  	}
   104  	return linuxerr.EFAULT
   105  }
   106  
   107  // CopyOut implements usermem.IO.CopyOut.
   108  func (mm *MemoryManager) CopyOut(ctx context.Context, addr hostarch.Addr, src []byte, opts usermem.IOOpts) (int, error) {
   109  	ar, ok := mm.CheckIORange(addr, int64(len(src)))
   110  	if !ok {
   111  		return 0, linuxerr.EFAULT
   112  	}
   113  
   114  	if len(src) == 0 {
   115  		return 0, nil
   116  	}
   117  
   118  	// Do AddressSpace IO if applicable.
   119  	if mm.asioEnabled(opts) && len(src) < copyMapMinBytes {
   120  		return mm.asCopyOut(ctx, addr, src)
   121  	}
   122  
   123  	// Go through internal mappings.
   124  	// NOTE(gvisor.dev/issue/10331): Using mm.withInternalMappings() here means
   125  	// that if we encounter any memmap.BufferedIOFallbackErrs, this copy will
   126  	// traverse an unnecessary layer of buffering. This can be fixed by
   127  	// inlining mm.withInternalMappings() and passing src subslices directly to
   128  	// memmap.File.BufferWriteAt().
   129  	n64, err := mm.withInternalMappings(ctx, ar, hostarch.Write, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
   130  		n, err := safemem.CopySeq(ims, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(src)))
   131  		return n, translateIOError(ctx, err)
   132  	})
   133  	return int(n64), err
   134  }
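
// Hypothetical caller sketch, not in the original file: how code running with
// an active address space might write a result buffer to application memory
// via CopyOut. Per the usermem.IO contract, n < len(src) implies a non-nil
// error, so a short write never goes unnoticed.
func writeResultSketch(ctx context.Context, mm *MemoryManager, addr hostarch.Addr, src []byte) error {
	n, err := mm.CopyOut(ctx, addr, src, usermem.IOOpts{AddressSpaceActive: true})
	if err != nil {
		ctx.Debugf("copied only %d of %d bytes: %v", n, len(src), err)
		return err
	}
	return nil
}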
   135  
   136  func (mm *MemoryManager) asCopyOut(ctx context.Context, addr hostarch.Addr, src []byte) (int, error) {
   137  	var done int
   138  	for {
   139  		n, err := mm.as.CopyOut(addr+hostarch.Addr(done), src[done:])
   140  		done += n
   141  		if err == nil {
   142  			return done, nil
   143  		}
   144  		if f, ok := err.(platform.SegmentationFault); ok {
   145  			ar, _ := addr.ToRange(uint64(len(src)))
   146  			if err := mm.handleASIOFault(ctx, f.Addr, ar, hostarch.Write); err != nil {
   147  				return done, err
   148  			}
   149  			continue
   150  		}
   151  		return done, translateIOError(ctx, err)
   152  	}
   153  }
   154  
   155  // CopyIn implements usermem.IO.CopyIn.
   156  func (mm *MemoryManager) CopyIn(ctx context.Context, addr hostarch.Addr, dst []byte, opts usermem.IOOpts) (int, error) {
   157  	ar, ok := mm.CheckIORange(addr, int64(len(dst)))
   158  	if !ok {
   159  		return 0, linuxerr.EFAULT
   160  	}
   161  
   162  	if len(dst) == 0 {
   163  		return 0, nil
   164  	}
   165  
   166  	// Do AddressSpace IO if applicable.
   167  	if mm.asioEnabled(opts) && len(dst) < copyMapMinBytes {
   168  		return mm.asCopyIn(ctx, addr, dst)
   169  	}
   170  
   171  	// Go through internal mappings.
   172  	// NOTE(gvisor.dev/issue/10331): Using mm.withInternalMappings() here means
   173  	// that if we encounter any memmap.BufferedIOFallbackErrs, this copy will
   174  	// traverse an unnecessary layer of buffering. This can be fixed by
   175  	// inlining mm.withInternalMappings() and passing dst subslices directly to
   176  	// memmap.File.BufferReadAt().
   177  	n64, err := mm.withInternalMappings(ctx, ar, hostarch.Read, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
   178  		n, err := safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(dst)), ims)
   179  		return n, translateIOError(ctx, err)
   180  	})
   181  	return int(n64), err
   182  }
   183  
   184  func (mm *MemoryManager) asCopyIn(ctx context.Context, addr hostarch.Addr, dst []byte) (int, error) {
   185  	var done int
   186  	for {
   187  		n, err := mm.as.CopyIn(addr+hostarch.Addr(done), dst[done:])
   188  		done += n
   189  		if err == nil {
   190  			return done, nil
   191  		}
   192  		if f, ok := err.(platform.SegmentationFault); ok {
   193  			ar, _ := addr.ToRange(uint64(len(dst)))
   194  			if err := mm.handleASIOFault(ctx, f.Addr, ar, hostarch.Read); err != nil {
   195  				return done, err
   196  			}
   197  			continue
   198  		}
   199  		return done, translateIOError(ctx, err)
   200  	}
   201  }
   202  
   203  // ZeroOut implements usermem.IO.ZeroOut.
   204  func (mm *MemoryManager) ZeroOut(ctx context.Context, addr hostarch.Addr, toZero int64, opts usermem.IOOpts) (int64, error) {
   205  	ar, ok := mm.CheckIORange(addr, toZero)
   206  	if !ok {
   207  		return 0, linuxerr.EFAULT
   208  	}
   209  
   210  	if toZero == 0 {
   211  		return 0, nil
   212  	}
   213  
   214  	// Do AddressSpace IO if applicable.
   215  	if mm.asioEnabled(opts) && toZero < copyMapMinBytes {
   216  		return mm.asZeroOut(ctx, addr, toZero)
   217  	}
   218  
   219  	// Go through internal mappings.
   220  	return mm.withInternalMappings(ctx, ar, hostarch.Write, opts.IgnorePermissions, func(dsts safemem.BlockSeq) (uint64, error) {
   221  		n, err := safemem.ZeroSeq(dsts)
   222  		return n, translateIOError(ctx, err)
   223  	})
   224  }
   225  
   226  func (mm *MemoryManager) asZeroOut(ctx context.Context, addr hostarch.Addr, toZero int64) (int64, error) {
   227  	var done int64
   228  	for {
   229  		n, err := mm.as.ZeroOut(addr+hostarch.Addr(done), uintptr(toZero-done))
   230  		done += int64(n)
   231  		if err == nil {
   232  			return done, nil
   233  		}
   234  		if f, ok := err.(platform.SegmentationFault); ok {
   235  			ar, _ := addr.ToRange(uint64(toZero))
   236  			if err := mm.handleASIOFault(ctx, f.Addr, ar, hostarch.Write); err != nil {
   237  				return done, err
   238  			}
   239  			continue
   240  		}
   241  		return done, translateIOError(ctx, err)
   242  	}
   243  }
   244  
   245  // CopyOutFrom implements usermem.IO.CopyOutFrom.
   246  func (mm *MemoryManager) CopyOutFrom(ctx context.Context, ars hostarch.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) {
   247  	if !mm.checkIOVec(ars) {
   248  		return 0, linuxerr.EFAULT
   249  	}
   250  
   251  	if ars.NumBytes() == 0 {
   252  		return 0, nil
   253  	}
   254  
   255  	// Do AddressSpace IO if applicable.
   256  	if mm.asioEnabled(opts) && ars.NumBytes() < rwMapMinBytes {
   257  		// We have to introduce a buffered copy, instead of just passing a
   258  		// safemem.BlockSeq representing addresses in the AddressSpace to src.
   259  		// This is because usermem.IO.CopyOutFrom() guarantees that it calls
   260  		// src.ReadToBlocks() at most once, which is incompatible with handling
   261  		// faults between calls. In the future, this is probably best resolved
   262  		// by introducing a CopyOutFrom variant or option that allows it to
   263  		// call src.ReadToBlocks() any number of times.
   264  		//
   265  		// This issue applies to CopyInTo as well.
   266  		buf := make([]byte, int(ars.NumBytes()))
   267  		bufN, bufErr := src.ReadToBlocks(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)))
   268  		var done int64
   269  		for done < int64(bufN) {
   270  			ar := ars.Head()
   271  			cplen := int64(ar.Length())
   272  			if cplen > int64(bufN)-done {
   273  				cplen = int64(bufN) - done
   274  			}
   275  			n, err := mm.asCopyOut(ctx, ar.Start, buf[int(done):int(done+cplen)])
   276  			done += int64(n)
   277  			if err != nil {
   278  				return done, err
   279  			}
   280  			ars = ars.Tail()
   281  		}
   282  		// Do not convert errors returned by src to EFAULT.
   283  		return done, bufErr
   284  	}
   285  
   286  	// Go through internal mappings.
   287  	return mm.withVecInternalMappings(ctx, ars, hostarch.Write, opts.IgnorePermissions, src.ReadToBlocks)
   288  }
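
// Hypothetical helper, not in the original file, isolating the buffering step
// used by the small-copy path above: because CopyOutFrom may call
// src.ReadToBlocks at most once, the data is first drained into a plain byte
// slice and only then copied out (fault-tolerantly) with asCopyOut. Errors
// returned by src are deliberately left unconverted rather than mapped to
// EFAULT.
func drainReaderOnceSketch(src safemem.Reader, size int) ([]byte, uint64, error) {
	buf := make([]byte, size)
	n, err := src.ReadToBlocks(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)))
	return buf[:n], n, err
}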
   289  
   290  // CopyInTo implements usermem.IO.CopyInTo.
   291  func (mm *MemoryManager) CopyInTo(ctx context.Context, ars hostarch.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) {
   292  	if !mm.checkIOVec(ars) {
   293  		return 0, linuxerr.EFAULT
   294  	}
   295  
   296  	if ars.NumBytes() == 0 {
   297  		return 0, nil
   298  	}
   299  
   300  	// Do AddressSpace IO if applicable.
   301  	if mm.asioEnabled(opts) && ars.NumBytes() < rwMapMinBytes {
   302  		buf := make([]byte, int(ars.NumBytes()))
   303  		var done int
   304  		var bufErr error
   305  		for !ars.IsEmpty() {
   306  			ar := ars.Head()
   307  			var n int
   308  			n, bufErr = mm.asCopyIn(ctx, ar.Start, buf[done:done+int(ar.Length())])
   309  			done += n
   310  			if bufErr != nil {
   311  				break
   312  			}
   313  			ars = ars.Tail()
   314  		}
   315  		n, err := dst.WriteFromBlocks(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:done])))
   316  		if err != nil {
   317  			return int64(n), err
   318  		}
   319  		// Do not convert errors returned by dst to EFAULT.
   320  		return int64(n), bufErr
   321  	}
   322  
   323  	// Go through internal mappings.
   324  	return mm.withVecInternalMappings(ctx, ars, hostarch.Read, opts.IgnorePermissions, dst.WriteFromBlocks)
   325  }
   326  
   327  // EnsurePMAsExist attempts to ensure that PMAs exist for the given addr and
   328  // the requested length. It returns the length for which it was able to either
   329  // initialize PMAs or ascertain that PMAs already exist. If this length is
   330  // smaller than the requested length, it also returns an error explaining why.
   331  func (mm *MemoryManager) EnsurePMAsExist(ctx context.Context, addr hostarch.Addr, length int64, opts usermem.IOOpts) (int64, error) {
   332  	ar, ok := mm.CheckIORange(addr, length)
   333  	if !ok {
   334  		return 0, linuxerr.EFAULT
   335  	}
   336  	n64, err := mm.withInternalMappings(ctx, ar, hostarch.Write, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
   337  		return uint64(ims.NumBytes()), nil
   338  	})
   339  	return int64(n64), err
   340  }
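
// Hypothetical caller sketch, not in the original file: pre-populate PMAs for
// a user buffer before starting work that cannot tolerate partial failure,
// treating a short result as an error.
func prefaultSketch(ctx context.Context, mm *MemoryManager, addr hostarch.Addr, length int64) error {
	n, err := mm.EnsurePMAsExist(ctx, addr, length, usermem.IOOpts{})
	if n < length {
		// err explains why only the first n bytes could be prepared.
		return err
	}
	return nil
}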
   341  
   342  // SwapUint32 implements usermem.IO.SwapUint32.
   343  func (mm *MemoryManager) SwapUint32(ctx context.Context, addr hostarch.Addr, new uint32, opts usermem.IOOpts) (uint32, error) {
   344  	ar, ok := mm.CheckIORange(addr, 4)
   345  	if !ok {
   346  		return 0, linuxerr.EFAULT
   347  	}
   348  
   349  	// Do AddressSpace IO if applicable.
   350  	if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions {
   351  		for {
   352  			old, err := mm.as.SwapUint32(addr, new)
   353  			if err == nil {
   354  				return old, nil
   355  			}
   356  			if f, ok := err.(platform.SegmentationFault); ok {
   357  				if err := mm.handleASIOFault(ctx, f.Addr, ar, hostarch.ReadWrite); err != nil {
   358  					return 0, err
   359  				}
   360  				continue
   361  			}
   362  			return 0, translateIOError(ctx, err)
   363  		}
   364  	}
   365  
   366  	// Go through internal mappings.
   367  	var old uint32
   368  	_, err := mm.withInternalMappings(ctx, ar, hostarch.ReadWrite, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
   369  		if ims.NumBlocks() != 1 || ims.NumBytes() != 4 {
   370  			// Atomicity is unachievable across mappings.
   371  			return 0, linuxerr.EFAULT
   372  		}
   373  		im := ims.Head()
   374  		var err error
   375  		old, err = safemem.SwapUint32(im, new)
   376  		if err != nil {
   377  			return 0, translateIOError(ctx, err)
   378  		}
   379  		// Return the number of bytes read.
   380  		return 4, nil
   381  	})
   382  	return old, err
   383  }
   384  
   385  // CompareAndSwapUint32 implements usermem.IO.CompareAndSwapUint32.
   386  func (mm *MemoryManager) CompareAndSwapUint32(ctx context.Context, addr hostarch.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) {
   387  	ar, ok := mm.CheckIORange(addr, 4)
   388  	if !ok {
   389  		return 0, linuxerr.EFAULT
   390  	}
   391  
   392  	// Do AddressSpace IO if applicable.
   393  	if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions {
   394  		for {
   395  			prev, err := mm.as.CompareAndSwapUint32(addr, old, new)
   396  			if err == nil {
   397  				return prev, nil
   398  			}
   399  			if f, ok := err.(platform.SegmentationFault); ok {
   400  				if err := mm.handleASIOFault(ctx, f.Addr, ar, hostarch.ReadWrite); err != nil {
   401  					return 0, err
   402  				}
   403  				continue
   404  			}
   405  			return 0, translateIOError(ctx, err)
   406  		}
   407  	}
   408  
   409  	// Go through internal mappings.
   410  	var prev uint32
   411  	_, err := mm.withInternalMappings(ctx, ar, hostarch.ReadWrite, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
   412  		if ims.NumBlocks() != 1 || ims.NumBytes() != 4 {
   413  			// Atomicity is unachievable across mappings.
   414  			return 0, linuxerr.EFAULT
   415  		}
   416  		im := ims.Head()
   417  		var err error
   418  		prev, err = safemem.CompareAndSwapUint32(im, old, new)
   419  		if err != nil {
   420  			return 0, translateIOError(ctx, err)
   421  		}
   422  		// Return the number of bytes read.
   423  		return 4, nil
   424  	})
   425  	return prev, err
   426  }
   427  
   428  // LoadUint32 implements usermem.IO.LoadUint32.
   429  func (mm *MemoryManager) LoadUint32(ctx context.Context, addr hostarch.Addr, opts usermem.IOOpts) (uint32, error) {
   430  	ar, ok := mm.CheckIORange(addr, 4)
   431  	if !ok {
   432  		return 0, linuxerr.EFAULT
   433  	}
   434  
   435  	// Do AddressSpace IO if applicable.
   436  	if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions {
   437  		for {
   438  			val, err := mm.as.LoadUint32(addr)
   439  			if err == nil {
   440  				return val, nil
   441  			}
   442  			if f, ok := err.(platform.SegmentationFault); ok {
   443  				if err := mm.handleASIOFault(ctx, f.Addr, ar, hostarch.Read); err != nil {
   444  					return 0, err
   445  				}
   446  				continue
   447  			}
   448  			return 0, translateIOError(ctx, err)
   449  		}
   450  	}
   451  
   452  	// Go through internal mappings.
   453  	var val uint32
   454  	_, err := mm.withInternalMappings(ctx, ar, hostarch.Read, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
   455  		if ims.NumBlocks() != 1 || ims.NumBytes() != 4 {
   456  			// Atomicity is unachievable across mappings.
   457  			return 0, linuxerr.EFAULT
   458  		}
   459  		im := ims.Head()
   460  		var err error
   461  		val, err = safemem.LoadUint32(im)
   462  		if err != nil {
   463  			return 0, translateIOError(ctx, err)
   464  		}
   465  		// Return the number of bytes read.
   466  		return 4, nil
   467  	})
   468  	return val, err
   469  }
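
// Hypothetical caller sketch, not in the original file, loosely futex-style:
// atomically replace a 32-bit word in application memory only if it still
// holds want. The internal-mappings paths above return EFAULT if the word
// straddles mappings, so callers are expected to pass a 4-byte-aligned addr.
func casUserWordSketch(ctx context.Context, mm *MemoryManager, addr hostarch.Addr, want, newVal uint32) (bool, error) {
	prev, err := mm.CompareAndSwapUint32(ctx, addr, want, newVal, usermem.IOOpts{AddressSpaceActive: true})
	if err != nil {
		return false, err
	}
	return prev == want, nil
}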
   470  
   471  // handleASIOFault handles a page fault at address addr for an AddressSpaceIO
   472  // operation spanning ioar.
   473  //
   474  // Preconditions:
   475  //   - mm.as != nil.
   476  //   - ioar.Length() != 0.
   477  //   - ioar.Contains(addr).
   478  func (mm *MemoryManager) handleASIOFault(ctx context.Context, addr hostarch.Addr, ioar hostarch.AddrRange, at hostarch.AccessType) error {
   479  	// Try to map all remaining pages in the I/O operation. This RoundUp can't
   480  	// overflow because otherwise it would have been caught by CheckIORange.
   481  	end, _ := ioar.End.RoundUp()
   482  	ar := hostarch.AddrRange{addr.RoundDown(), end}
   483  
   484  	// Don't bother trying existingPMAsLocked; in most cases, if we did have
   485  	// existing pmas, we wouldn't have faulted.
   486  
   487  	// Ensure that we have usable vmas. Here and below, only return early if we
   488  // can't map the first (faulting) page; failures to map later pages are
   489  	// silently ignored. This maximizes partial success.
   490  	mm.mappingMu.RLock()
   491  	vseg, vend, err := mm.getVMAsLocked(ctx, ar, at, false)
   492  	if vendaddr := vend.Start(); vendaddr < ar.End {
   493  		if vendaddr <= ar.Start {
   494  			mm.mappingMu.RUnlock()
   495  			return translateIOError(ctx, err)
   496  		}
   497  		ar.End = vendaddr
   498  	}
   499  
   500  	// Ensure that we have usable pmas.
   501  	mm.activeMu.Lock()
   502  	pseg, pend, err := mm.getPMAsLocked(ctx, vseg, ar, at)
   503  	mm.mappingMu.RUnlock()
   504  	if pendaddr := pend.Start(); pendaddr < ar.End {
   505  		if pendaddr <= ar.Start {
   506  			mm.activeMu.Unlock()
   507  			return translateIOError(ctx, err)
   508  		}
   509  		ar.End = pendaddr
   510  	}
   511  
   512  	// Downgrade to a read-lock on activeMu since we don't need to mutate pmas
   513  	// anymore.
   514  	mm.activeMu.DowngradeLock()
   515  
   516  	err = mm.mapASLocked(pseg, ar, memmap.PlatformEffectDefault)
   517  	mm.activeMu.RUnlock()
   518  	return translateIOError(ctx, err)
   519  }
   520  
   521  // withInternalMappings ensures that pmas exist for all addresses in ar,
   522  // support access of type (at, ignorePermissions), and have internal mappings
   523  // cached. It then calls f with mm.activeMu locked for reading, passing
   524  // internal mappings for the subrange of ar for which this property holds.
   525  //
   526  // withInternalMappings takes a function returning uint64 since many safemem
   527  // functions have this property, but returns an int64 since this is usually
   528  // more useful for usermem.IO methods.
   529  //
   530  // Preconditions: 0 < ar.Length() <= math.MaxInt64.
   531  func (mm *MemoryManager) withInternalMappings(ctx context.Context, ar hostarch.AddrRange, at hostarch.AccessType, ignorePermissions bool, f func(safemem.BlockSeq) (uint64, error)) (int64, error) {
   532  	// If pmas are already available, we can do IO without touching mm.vmas or
   533  	// mm.mappingMu.
   534  	mm.activeMu.RLock()
   535  	if pseg := mm.existingPMAsLocked(ar, at, ignorePermissions, true /* needInternalMappings */); pseg.Ok() {
   536  		n, err := f(mm.internalMappingsLocked(pseg, ar))
   537  		mm.activeMu.RUnlock()
   538  		// Do not convert errors returned by f to EFAULT.
   539  		return int64(n), err
   540  	}
   541  	mm.activeMu.RUnlock()
   542  
   543  	// Ensure that we have usable vmas.
   544  	mm.mappingMu.RLock()
   545  	vseg, vend, verr := mm.getVMAsLocked(ctx, ar, at, ignorePermissions)
   546  	if vendaddr := vend.Start(); vendaddr < ar.End {
   547  		if vendaddr <= ar.Start {
   548  			mm.mappingMu.RUnlock()
   549  			return 0, translateIOError(ctx, verr)
   550  		}
   551  		ar.End = vendaddr
   552  	}
   553  
   554  	// Ensure that we have usable pmas.
   555  	mm.activeMu.Lock()
   556  	pseg, pend, perr := mm.getPMAsLocked(ctx, vseg, ar, at)
   557  	mm.mappingMu.RUnlock()
   558  	if pendaddr := pend.Start(); pendaddr < ar.End {
   559  		if pendaddr <= ar.Start {
   560  			mm.activeMu.Unlock()
   561  			return 0, translateIOError(ctx, perr)
   562  		}
   563  		ar.End = pendaddr
   564  	}
   565  	imbs, t, imerr := mm.getIOMappingsLocked(pseg, ar, at)
   566  	mm.activeMu.DowngradeLock()
   567  	if imlen := imbs.NumBytes(); imlen < uint64(ar.Length()) {
   568  		if imlen == 0 {
   569  			t.flush(0, nil)
   570  			mm.activeMu.RUnlock()
   571  			return 0, translateIOError(ctx, imerr)
   572  		}
   573  		ar.End = ar.Start + hostarch.Addr(imlen)
   574  	}
   575  
   576  	// Do I/O.
   577  	un, err := t.flush(f(imbs))
   578  	mm.activeMu.RUnlock()
   579  	n := int64(un)
   580  
   581  	// Return the first error in order of progress through ar.
   582  	if err != nil {
   583  		// Do not convert errors returned by f to EFAULT.
   584  		return n, err
   585  	}
   586  	if imerr != nil {
   587  		return n, translateIOError(ctx, imerr)
   588  	}
   589  	if perr != nil {
   590  		return n, translateIOError(ctx, perr)
   591  	}
   592  	return n, translateIOError(ctx, verr)
   593  }
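
// Hypothetical sketch, not in the original file, of the withInternalMappings
// calling convention used throughout this file: the callback receives a
// safemem.BlockSeq covering the usable subrange of ar and returns how many
// bytes it handled. This example sums the bytes of a user range by copying
// them through a scratch buffer with safemem.CopySeq, so the mapped memory is
// never accessed directly.
func (mm *MemoryManager) sumUserBytesSketch(ctx context.Context, ar hostarch.AddrRange) (uint64, int64, error) {
	var sum uint64
	n, err := mm.withInternalMappings(ctx, ar, hostarch.Read, false /* ignorePermissions */, func(ims safemem.BlockSeq) (uint64, error) {
		var done uint64
		scratch := make([]byte, 4096)
		for !ims.IsEmpty() {
			cn, err := safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(scratch)), ims)
			for _, b := range scratch[:cn] {
				sum += uint64(b)
			}
			done += cn
			if err != nil {
				return done, translateIOError(ctx, err)
			}
			if cn == 0 {
				break
			}
			ims = ims.DropFirst64(cn)
		}
		return done, nil
	})
	return sum, n, err
}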
   594  
   595  // withVecInternalMappings ensures that pmas exist for all addresses in ars,
   596  // support access of type (at, ignorePermissions), and have internal mappings
   597  // cached. It then calls f with mm.activeMu locked for reading, passing
   598  // internal mappings for the subset of ars for which this property holds.
   599  //
   600  // Preconditions: !ars.IsEmpty().
   601  func (mm *MemoryManager) withVecInternalMappings(ctx context.Context, ars hostarch.AddrRangeSeq, at hostarch.AccessType, ignorePermissions bool, f func(safemem.BlockSeq) (uint64, error)) (int64, error) {
   602  	// withInternalMappings is faster than withVecInternalMappings because of
   603  	// iterator plumbing (this isn't generally practical in the vector case due
   604  	// to iterator invalidation between AddrRanges). Use it if possible.
   605  	if ars.NumRanges() == 1 {
   606  		return mm.withInternalMappings(ctx, ars.Head(), at, ignorePermissions, f)
   607  	}
   608  
   609  	// If pmas are already available, we can do IO without touching mm.vmas or
   610  	// mm.mappingMu.
   611  	mm.activeMu.RLock()
   612  	if mm.existingVecPMAsLocked(ars, at, ignorePermissions, true /* needInternalMappings */) {
   613  		n, err := f(mm.vecInternalMappingsLocked(ars))
   614  		mm.activeMu.RUnlock()
   615  		// Do not convert errors returned by f to EFAULT.
   616  		return int64(n), err
   617  	}
   618  	mm.activeMu.RUnlock()
   619  
   620  	// Ensure that we have usable vmas.
   621  	mm.mappingMu.RLock()
   622  	vars, verr := mm.getVecVMAsLocked(ctx, ars, at, ignorePermissions)
   623  	if vars.NumBytes() == 0 {
   624  		mm.mappingMu.RUnlock()
   625  		return 0, translateIOError(ctx, verr)
   626  	}
   627  
   628  	// Ensure that we have usable pmas.
   629  	mm.activeMu.Lock()
   630  	pars, perr := mm.getVecPMAsLocked(ctx, vars, at)
   631  	mm.mappingMu.RUnlock()
   632  	if pars.NumBytes() == 0 {
   633  		mm.activeMu.Unlock()
   634  		return 0, translateIOError(ctx, perr)
   635  	}
   636  	imbs, t, imerr := mm.getVecIOMappingsLocked(pars, at)
   637  	mm.activeMu.DowngradeLock()
   638  	if imbs.NumBytes() == 0 {
   639  		t.flush(0, nil)
   640  		mm.activeMu.RUnlock()
   641  		return 0, translateIOError(ctx, imerr)
   642  	}
   643  
   644  	// Do I/O.
   645  	un, err := t.flush(f(imbs))
   646  	mm.activeMu.RUnlock()
   647  	n := int64(un)
   648  
   649  	// Return the first error in order of progress through ars.
   650  	if err != nil {
   651  		// Do not convert errors from f to EFAULT.
   652  		return n, err
   653  	}
   654  	if imerr != nil {
   655  		return n, translateIOError(ctx, imerr)
   656  	}
   657  	if perr != nil {
   658  		return n, translateIOError(ctx, perr)
   659  	}
   660  	return n, translateIOError(ctx, verr)
   661  }
   662  
   663  // getIOMappingsLocked returns internal mappings appropriate for I/O for
   664  // addresses in ar. If mappings are only available for a strict subset of ar,
   665  // the returned error is non-nil.
   666  //
   667  // ioBufTracker.flush() must be called on the returned ioBufTracker when the
   668  // returned mappings are no longer in use, and its return value indicates the
   669  // number of bytes actually completed after buffer flushing. Returned mappings
   670  // are valid until either mm.activeMu is unlocked or ioBufTracker.flush() is
   671  // called.
   672  //
   673  // Preconditions:
   674  //   - mm.activeMu must be locked for writing.
   675  //   - pseg.Range().Contains(ar.Start).
   676  //   - pmas must exist for all addresses in ar.
   677  //   - ar.Length() != 0.
   678  //
   679  // Postconditions: getIOMappingsLocked does not invalidate iterators into mm.pmas.
   680  func (mm *MemoryManager) getIOMappingsLocked(pseg pmaIterator, ar hostarch.AddrRange, at hostarch.AccessType) (safemem.BlockSeq, *ioBufTracker, error) {
   681  	if checkInvariants {
   682  		if !ar.WellFormed() || ar.Length() == 0 {
   683  			panic(fmt.Sprintf("invalid ar: %v", ar))
   684  		}
   685  		if !pseg.Range().Contains(ar.Start) {
   686  			panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar))
   687  		}
   688  	}
   689  
   690  	if ar.End <= pseg.End() {
   691  		// Since only one pma is involved, we can use pma.internalMappings
   692  		// directly, avoiding a slice allocation.
   693  		if err := pseg.getInternalMappingsLocked(); err != nil {
   694  			if _, ok := err.(memmap.BufferedIOFallbackErr); ok {
   695  				goto slowPath
   696  			}
   697  			return safemem.BlockSeq{}, nil, err
   698  		}
   699  		offset := uint64(ar.Start - pseg.Start())
   700  		return pseg.ValuePtr().internalMappings.DropFirst64(offset).TakeFirst64(uint64(ar.Length())), nil, nil
   701  	}
   702  
   703  slowPath:
   704  	ims, t, _, err := mm.getIOMappingsTrackedLocked(pseg, ar, at, nil, nil, 0)
   705  	return safemem.BlockSeqFromSlice(ims), t, err
   706  }
   707  
   708  // getVecIOMappingsLocked returns internal mappings appropriate for I/O for
   709  // addresses in ars. If mappings are only available for a strict subset of ars,
   710  // the returned error is non-nil.
   711  //
   712  // ioBufTracker.flush() must be called on the returned ioBufTracker when the
   713  // returned mappings are no longer in use, and its return value indicates the
   714  // number of bytes actually completed after buffer flushing. Returned mappings
   715  // are valid until either mm.activeMu is unlocked or ioBufTracker.flush() is
   716  // called.
   717  //
   718  // Preconditions:
   719  //   - mm.activeMu must be locked for writing.
   720  //   - pmas must exist for all addresses in ars.
   721  //
   722  // Postconditions: getVecIOMappingsLocked does not invalidate iterators into
   723  // mm.pmas.
   724  func (mm *MemoryManager) getVecIOMappingsLocked(ars hostarch.AddrRangeSeq, at hostarch.AccessType) (safemem.BlockSeq, *ioBufTracker, error) {
   725  	if ars.NumRanges() == 1 {
   726  		ar := ars.Head()
   727  		return mm.getIOMappingsLocked(mm.pmas.FindSegment(ar.Start), ar, at)
   728  	}
   729  
   730  	var ims []safemem.Block
   731  	var t *ioBufTracker
   732  	unbufBytes := uint64(0)
   733  	for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() {
   734  		ar := arsit.Head()
   735  		if ar.Length() == 0 {
   736  			continue
   737  		}
   738  		var err error
   739  		ims, t, unbufBytes, err = mm.getIOMappingsTrackedLocked(mm.pmas.FindSegment(ar.Start), ar, at, ims, t, unbufBytes)
   740  		if err != nil {
   741  			return safemem.BlockSeqFromSlice(ims), t, err
   742  		}
   743  	}
   744  	return safemem.BlockSeqFromSlice(ims), t, nil
   745  }
   746  
   747  // getIOMappingsTrackedLocked collects internal mappings appropriate for I/O
   748  // for addresses in ar, appends them to ims, and returns an updated slice. If
   749  // mappings are only available for a strict subset of ar, the returned error is
   750  // non-nil.
   751  //
   752  // If any iterated memmap.Files require buffering for I/O, they are recorded in
   753  // an ioBufTracker. Since the ioBufTracker pointer is initially nil (to
   754  // minimize overhead for the common case where no memmap.Files require
   755  // buffering for I/O), getIOMappingsTrackedLocked returns an updated
   756  // ioBufTracker pointer.
   757  //
   758  // unbufBytes is the number of bytes of unbuffered mappings that have been
   759  // appended to ims since the last buffered mapping; getIOMappingsTrackedLocked
   760  // also returns an updated value for unbufBytes.
   761  //
   762  // Returned mappings are valid until either mm.activeMu is unlocked or
   763  // ioBufTracker.flush() is called.
   764  //
   765  // Preconditions:
   766  //   - mm.activeMu must be locked for writing.
   767  //   - pseg.Range().Contains(ar.Start).
   768  //   - pmas must exist for all addresses in ar.
   769  //   - ar.Length() != 0.
   770  //
   771  // Postconditions: getIOMappingsTrackedLocked does not invalidate iterators
   772  // into mm.pmas.
   773  func (mm *MemoryManager) getIOMappingsTrackedLocked(pseg pmaIterator, ar hostarch.AddrRange, at hostarch.AccessType, ims []safemem.Block, t *ioBufTracker, unbufBytes uint64) ([]safemem.Block, *ioBufTracker, uint64, error) {
   774  	for {
   775  		pmaAR := ar.Intersect(pseg.Range())
   776  		if err := pseg.getInternalMappingsLocked(); err == nil {
   777  			// Iterate the subset of the PMA's cached internal mappings that
   778  			// correspond to pmaAR, and append them to ims.
   779  			for pims := pseg.ValuePtr().internalMappings.DropFirst64(uint64(pmaAR.Start - pseg.Start())).TakeFirst64(uint64(pmaAR.Length())); !pims.IsEmpty(); pims = pims.Tail() {
   780  				ims = append(ims, pims.Head())
   781  			}
   782  			unbufBytes += uint64(pmaAR.Length())
   783  		} else if _, ok := err.(memmap.BufferedIOFallbackErr); !ok {
   784  			return ims, t, unbufBytes, err
   785  		} else {
   786  			// Fall back to buffered I/O as instructed.
   787  			if t == nil {
   788  				t = getIOBufTracker(at.Write)
   789  			}
   790  			buf := getByteSlicePtr(int(pmaAR.Length()))
   791  			pma := pseg.ValuePtr()
   792  			off := pseg.fileRangeOf(pmaAR).Start
   793  			// If the caller will read from the buffer, fill it from the file;
   794  			// otherwise leave it zeroed.
   795  			if at.Read || at.Execute {
   796  				var n uint64
   797  				n, err = pma.file.BufferReadAt(off, *buf)
   798  				*buf = (*buf)[:n]
   799  			} else {
   800  				err = nil
   801  			}
   802  			if len(*buf) != 0 {
   803  				ims = append(ims, safemem.BlockFromSafeSlice(*buf))
   804  				t.bufs = append(t.bufs, ioBuf{
   805  					unbufBytesBefore: unbufBytes,
   806  					file:             pma.file,
   807  					off:              off,
   808  					buf:              buf,
   809  				})
   810  				unbufBytes = 0
   811  			}
   812  			if err != nil {
   813  				return ims, t, unbufBytes, err
   814  			}
   815  		}
   816  		if ar.End <= pseg.End() {
   817  			return ims, t, unbufBytes, nil
   818  		}
   819  		pseg, _ = pseg.NextNonEmpty()
   820  	}
   821  }
   822  
   823  type ioBuf struct {
   824  	unbufBytesBefore uint64
   825  	file             memmap.File
   826  	off              uint64
   827  	buf              *[]byte
   828  }
   829  
   830  type ioBufTracker struct {
   831  	write bool
   832  	bufs  []ioBuf
   833  }
   834  
   835  var ioBufTrackerPool = sync.Pool{
   836  	New: func() any {
   837  		return &ioBufTracker{}
   838  	},
   839  }
   840  
   841  func getIOBufTracker(write bool) *ioBufTracker {
   842  	t := ioBufTrackerPool.Get().(*ioBufTracker)
   843  	t.write = write
   844  	return t
   845  }
   846  
   847  func putIOBufTracker(t *ioBufTracker) {
   848  	for i := range t.bufs {
   849  		t.bufs[i].file = nil
   850  		putByteSlicePtr(t.bufs[i].buf)
   851  		t.bufs[i].buf = nil
   852  	}
   853  	t.bufs = t.bufs[:0]
   854  	ioBufTrackerPool.Put(t)
   855  }
   856  
   857  func (t *ioBufTracker) flush(prevN uint64, prevErr error) (uint64, error) {
   858  	if t == nil {
   859  		return prevN, prevErr
   860  	}
   861  	return t.flushSlow(prevN, prevErr)
   862  }
   863  
   864  func (t *ioBufTracker) flushSlow(prevN uint64, prevErr error) (uint64, error) {
   865  	defer putIOBufTracker(t)
   866  	if !t.write {
   867  		return prevN, prevErr
   868  	}
   869  	// Flush dirty buffers to underlying memmap.Files.
   870  	rem := prevN
   871  	done := uint64(0)
   872  	for i := range t.bufs {
   873  		buf := &t.bufs[i]
   874  		if rem <= buf.unbufBytesBefore {
   875  			// The write ended before reaching buf.buf.
   876  			break
   877  		}
   878  		rem -= buf.unbufBytesBefore
   879  		done += buf.unbufBytesBefore
   880  		n, err := buf.file.BufferWriteAt(buf.off, (*buf.buf)[:min(len(*buf.buf), int(rem))])
   881  		rem -= n
   882  		done += n
   883  		if err != nil {
   884  			return done, err
   885  		}
   886  	}
   887  	// All buffers covered by prevN were written back successfully.
   888  	return prevN, prevErr
   889  }
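
// Worked example of the accounting above (illustrative, not from the original
// file): suppose the mapping sequence was 100 directly-mapped bytes, then a
// 50-byte buffer, then more directly-mapped bytes, so
// bufs[0].unbufBytesBefore == 100, and the caller's copy reported prevN == 120.
// rem starts at 120; since 120 > 100, rem becomes 20 and done becomes 100, and
// BufferWriteAt is asked to write only the first 20 bytes of the buffer. If
// all 20 are written, done reaches 120 and any later buffer is skipped because
// rem is 0, so flushSlow reports (120, prevErr) as the final result.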
   890  
   891  var byteSlicePtrPool sync.Pool
   892  
   893  // getByteSlicePtr returns a pointer to a byte slice with the given length. The
   894  // slice is either newly-allocated or recycled from a previous call to
   895  // putByteSlicePtr. The pointer should be passed to putByteSlicePtr when the
   896  // slice is no longer in use.
   897  func getByteSlicePtr(l int) *[]byte {
   898  	a := byteSlicePtrPool.Get()
   899  	if a == nil {
   900  		s := make([]byte, l)
   901  		return &s
   902  	}
   903  	sp := a.(*[]byte)
   904  	s := *sp
   905  	if l <= cap(s) {
   906  		s = s[:l]
   907  	} else {
   908  		s = make([]byte, l)
   909  	}
   910  	*sp = s
   911  	return sp
   912  }
   913  
   914  // putByteSlicePtr marks all of the given slice's capacity as reusable by a
   915  // future call to getByteSlicePtr.
   916  func putByteSlicePtr(s *[]byte) {
   917  	byteSlicePtrPool.Put(s)
   918  }
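
// Hypothetical usage sketch, not in the original file: borrow a buffer from
// the pool, use it, and return it so its capacity can be reused by a later
// getByteSlicePtr call. Callers must not retain *bufp after returning it.
func withPooledBufSketch(l int, fn func([]byte)) {
	bufp := getByteSlicePtr(l)
	fn(*bufp)
	putByteSlicePtr(bufp)
}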
   919  
   920  // truncatedAddrRangeSeq returns a copy of ars, but with the end truncated to
   921  // at most address end on AddrRange arsit.Head(). It is used in vector I/O
   922  // paths to truncate a hostarch.AddrRangeSeq when errors occur.
   923  //
   924  // Preconditions:
   925  //   - !arsit.IsEmpty().
   926  //   - end <= arsit.Head().End.
   927  func truncatedAddrRangeSeq(ars, arsit hostarch.AddrRangeSeq, end hostarch.Addr) hostarch.AddrRangeSeq {
   928  	ar := arsit.Head()
   929  	if end <= ar.Start {
   930  		return ars.TakeFirst64(ars.NumBytes() - arsit.NumBytes())
   931  	}
   932  	return ars.TakeFirst64(ars.NumBytes() - arsit.NumBytes() + int64(end-ar.Start))
   933  }