gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/mm/io.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mm

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/safemem"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/platform"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/usermem"
)

// There are two supported ways to copy data to/from application virtual
// memory:
//
// 1. Internally-mapped copying: Determine the memmap.File that backs the
// copied-to/from virtual address, obtain a mapping of its pages, and read or
// write to the mapping.
//
// 2. AddressSpace copying: If platform.Platform.SupportsAddressSpaceIO() is
// true, AddressSpace permissions are applicable, and an AddressSpace is
// available, copy directly through the AddressSpace, handling faults as
// needed.
//
// (Given that internally-mapped copying requires that backing memory is always
// implemented using a host file descriptor, we could also preadv/pwritev to it
// instead. But this would incur a host syscall for each use of the mapped
// page, whereas mmap is a one-time cost.)
//
// The fixed overhead of internally-mapped copying is expected to be higher
// than that of AddressSpace copying since the former always needs to translate
// addresses, whereas the latter only needs to do so when faults occur.
// However, the throughput of internally-mapped copying is expected to be
// somewhat higher than that of AddressSpace copying due to the high cost of
// page faults and because implementations of the latter usually rely on
// safecopy, which doesn't use AVX registers. So we prefer to use AddressSpace
// copying (when available) for smaller copies, and switch to internally-mapped
// copying once a size threshold is exceeded.
const (
	// copyMapMinBytes is the size threshold for switching to internally-mapped
	// copying in CopyOut, CopyIn, and ZeroOut.
	copyMapMinBytes = 32 << 10 // 32 KB

	// rwMapMinBytes is the size threshold for switching to internally-mapped
	// copying in CopyOutFrom and CopyInTo. It's lower than copyMapMinBytes
	// since AddressSpace copying in this case requires additional buffering;
	// see CopyOutFrom for details.
	rwMapMinBytes = 512
)

// CheckIORange is similar to hostarch.Addr.ToRange, but applies bounds checks
// consistent with Linux's arch/x86/include/asm/uaccess.h:access_ok().
//
// Preconditions: length >= 0.
func (mm *MemoryManager) CheckIORange(addr hostarch.Addr, length int64) (hostarch.AddrRange, bool) {
	// Note that access_ok() constrains end even if length == 0.
	ar, ok := addr.ToRange(uint64(length))
	return ar, (ok && ar.End <= mm.layout.MaxAddr)
}
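// The following is an illustrative sketch, not part of the original file: it
// shows the access_ok-style validation that CheckIORange provides before any
// user-memory access. The helper name exampleCheckUserBuffer is hypothetical.
func (mm *MemoryManager) exampleCheckUserBuffer(addr hostarch.Addr, length int64) (hostarch.AddrRange, error) {
	// CheckIORange rejects ranges that wrap around or extend beyond the
	// MemoryManager's maximum address, mirroring Linux's access_ok().
	ar, ok := mm.CheckIORange(addr, length)
	if !ok {
		return hostarch.AddrRange{}, linuxerr.EFAULT
	}
	return ar, nil
}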
// checkIOVec applies bounds checks consistent with Linux's
// arch/x86/include/asm/uaccess.h:access_ok() to ars.
func (mm *MemoryManager) checkIOVec(ars hostarch.AddrRangeSeq) bool {
	for !ars.IsEmpty() {
		ar := ars.Head()
		if _, ok := mm.CheckIORange(ar.Start, int64(ar.Length())); !ok {
			return false
		}
		ars = ars.Tail()
	}
	return true
}

func (mm *MemoryManager) asioEnabled(opts usermem.IOOpts) bool {
	return mm.haveASIO && !opts.IgnorePermissions && opts.AddressSpaceActive
}

// translateIOError converts errors to EFAULT, which is what Linux usually
// reports for I/O errors originating from the MM.
func translateIOError(ctx context.Context, err error) error {
	if err == nil {
		return nil
	}
	if logIOErrors {
		ctx.Debugf("MM I/O error: %v", err)
	}
	return linuxerr.EFAULT
}

// CopyOut implements usermem.IO.CopyOut.
func (mm *MemoryManager) CopyOut(ctx context.Context, addr hostarch.Addr, src []byte, opts usermem.IOOpts) (int, error) {
	ar, ok := mm.CheckIORange(addr, int64(len(src)))
	if !ok {
		return 0, linuxerr.EFAULT
	}

	if len(src) == 0 {
		return 0, nil
	}

	// Do AddressSpace IO if applicable.
	if mm.asioEnabled(opts) && len(src) < copyMapMinBytes {
		return mm.asCopyOut(ctx, addr, src)
	}

	// Go through internal mappings.
	// NOTE(gvisor.dev/issue/10331): Using mm.withInternalMappings() here means
	// that if we encounter any memmap.BufferedIOFallbackErrs, this copy will
	// traverse an unnecessary layer of buffering. This can be fixed by
	// inlining mm.withInternalMappings() and passing src subslices directly to
	// memmap.File.BufferWriteAt().
	n64, err := mm.withInternalMappings(ctx, ar, hostarch.Write, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
		n, err := safemem.CopySeq(ims, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(src)))
		return n, translateIOError(ctx, err)
	})
	return int(n64), err
}

func (mm *MemoryManager) asCopyOut(ctx context.Context, addr hostarch.Addr, src []byte) (int, error) {
	var done int
	for {
		n, err := mm.as.CopyOut(addr+hostarch.Addr(done), src[done:])
		done += n
		if err == nil {
			return done, nil
		}
		if f, ok := err.(platform.SegmentationFault); ok {
			ar, _ := addr.ToRange(uint64(len(src)))
			if err := mm.handleASIOFault(ctx, f.Addr, ar, hostarch.Write); err != nil {
				return done, err
			}
			continue
		}
		return done, translateIOError(ctx, err)
	}
}
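// The following is an illustrative sketch, not part of the original file: it
// shows how a caller (e.g. a syscall handler) typically writes bytes to
// application memory through the usermem.IO methods above. With
// AddressSpaceActive set (and a platform supporting AddressSpace I/O), copies
// smaller than copyMapMinBytes take the AddressSpace path in CopyOut; larger
// ones go through internal mappings. The helper name exampleWriteBytes is
// hypothetical.
func (mm *MemoryManager) exampleWriteBytes(ctx context.Context, addr hostarch.Addr, data []byte) (int, error) {
	return mm.CopyOut(ctx, addr, data, usermem.IOOpts{
		// The calling task is assumed to be running in this MemoryManager's
		// AddressSpace.
		AddressSpaceActive: true,
	})
}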
// CopyIn implements usermem.IO.CopyIn.
func (mm *MemoryManager) CopyIn(ctx context.Context, addr hostarch.Addr, dst []byte, opts usermem.IOOpts) (int, error) {
	ar, ok := mm.CheckIORange(addr, int64(len(dst)))
	if !ok {
		return 0, linuxerr.EFAULT
	}

	if len(dst) == 0 {
		return 0, nil
	}

	// Do AddressSpace IO if applicable.
	if mm.asioEnabled(opts) && len(dst) < copyMapMinBytes {
		return mm.asCopyIn(ctx, addr, dst)
	}

	// Go through internal mappings.
	// NOTE(gvisor.dev/issue/10331): Using mm.withInternalMappings() here means
	// that if we encounter any memmap.BufferedIOFallbackErrs, this copy will
	// traverse an unnecessary layer of buffering. This can be fixed by
	// inlining mm.withInternalMappings() and passing dst subslices directly to
	// memmap.File.BufferReadAt().
	n64, err := mm.withInternalMappings(ctx, ar, hostarch.Read, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
		n, err := safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(dst)), ims)
		return n, translateIOError(ctx, err)
	})
	return int(n64), err
}

func (mm *MemoryManager) asCopyIn(ctx context.Context, addr hostarch.Addr, dst []byte) (int, error) {
	var done int
	for {
		n, err := mm.as.CopyIn(addr+hostarch.Addr(done), dst[done:])
		done += n
		if err == nil {
			return done, nil
		}
		if f, ok := err.(platform.SegmentationFault); ok {
			ar, _ := addr.ToRange(uint64(len(dst)))
			if err := mm.handleASIOFault(ctx, f.Addr, ar, hostarch.Read); err != nil {
				return done, err
			}
			continue
		}
		return done, translateIOError(ctx, err)
	}
}

// ZeroOut implements usermem.IO.ZeroOut.
func (mm *MemoryManager) ZeroOut(ctx context.Context, addr hostarch.Addr, toZero int64, opts usermem.IOOpts) (int64, error) {
	ar, ok := mm.CheckIORange(addr, toZero)
	if !ok {
		return 0, linuxerr.EFAULT
	}

	if toZero == 0 {
		return 0, nil
	}

	// Do AddressSpace IO if applicable.
	if mm.asioEnabled(opts) && toZero < copyMapMinBytes {
		return mm.asZeroOut(ctx, addr, toZero)
	}

	// Go through internal mappings.
	return mm.withInternalMappings(ctx, ar, hostarch.Write, opts.IgnorePermissions, func(dsts safemem.BlockSeq) (uint64, error) {
		n, err := safemem.ZeroSeq(dsts)
		return n, translateIOError(ctx, err)
	})
}

func (mm *MemoryManager) asZeroOut(ctx context.Context, addr hostarch.Addr, toZero int64) (int64, error) {
	var done int64
	for {
		n, err := mm.as.ZeroOut(addr+hostarch.Addr(done), uintptr(toZero-done))
		done += int64(n)
		if err == nil {
			return done, nil
		}
		if f, ok := err.(platform.SegmentationFault); ok {
			ar, _ := addr.ToRange(uint64(toZero))
			if err := mm.handleASIOFault(ctx, f.Addr, ar, hostarch.Write); err != nil {
				return done, err
			}
			continue
		}
		return done, translateIOError(ctx, err)
	}
}
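// The following is an illustrative sketch, not part of the original file: it
// wraps ZeroOut to require that the entire requested range was zeroed. Like
// the copy methods above, ZeroOut may complete only a prefix of the range.
// The helper name exampleClearUserRange is hypothetical.
func (mm *MemoryManager) exampleClearUserRange(ctx context.Context, addr hostarch.Addr, toZero int64, opts usermem.IOOpts) error {
	n, err := mm.ZeroOut(ctx, addr, toZero, opts)
	if err == nil && n < toZero {
		// Defensive: treat an unexpected short count as a fault.
		err = linuxerr.EFAULT
	}
	return err
}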
// CopyOutFrom implements usermem.IO.CopyOutFrom.
func (mm *MemoryManager) CopyOutFrom(ctx context.Context, ars hostarch.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) {
	if !mm.checkIOVec(ars) {
		return 0, linuxerr.EFAULT
	}

	if ars.NumBytes() == 0 {
		return 0, nil
	}

	// Do AddressSpace IO if applicable.
	if mm.asioEnabled(opts) && ars.NumBytes() < rwMapMinBytes {
		// We have to introduce a buffered copy, instead of just passing a
		// safemem.BlockSeq representing addresses in the AddressSpace to src.
		// This is because usermem.IO.CopyOutFrom() guarantees that it calls
		// src.ReadToBlocks() at most once, which is incompatible with handling
		// faults between calls. In the future, this is probably best resolved
		// by introducing a CopyOutFrom variant or option that allows it to
		// call src.ReadToBlocks() any number of times.
		//
		// This issue applies to CopyInTo as well.
		buf := make([]byte, int(ars.NumBytes()))
		bufN, bufErr := src.ReadToBlocks(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)))
		var done int64
		for done < int64(bufN) {
			ar := ars.Head()
			cplen := int64(ar.Length())
			if cplen > int64(bufN)-done {
				cplen = int64(bufN) - done
			}
			n, err := mm.asCopyOut(ctx, ar.Start, buf[int(done):int(done+cplen)])
			done += int64(n)
			if err != nil {
				return done, err
			}
			ars = ars.Tail()
		}
		// Do not convert errors returned by src to EFAULT.
		return done, bufErr
	}

	// Go through internal mappings.
	return mm.withVecInternalMappings(ctx, ars, hostarch.Write, opts.IgnorePermissions, src.ReadToBlocks)
}

// CopyInTo implements usermem.IO.CopyInTo.
func (mm *MemoryManager) CopyInTo(ctx context.Context, ars hostarch.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) {
	if !mm.checkIOVec(ars) {
		return 0, linuxerr.EFAULT
	}

	if ars.NumBytes() == 0 {
		return 0, nil
	}

	// Do AddressSpace IO if applicable.
	if mm.asioEnabled(opts) && ars.NumBytes() < rwMapMinBytes {
		buf := make([]byte, int(ars.NumBytes()))
		var done int
		var bufErr error
		for !ars.IsEmpty() {
			ar := ars.Head()
			var n int
			n, bufErr = mm.asCopyIn(ctx, ar.Start, buf[done:done+int(ar.Length())])
			done += n
			if bufErr != nil {
				break
			}
			ars = ars.Tail()
		}
		n, err := dst.WriteFromBlocks(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:done])))
		if err != nil {
			return int64(n), err
		}
		// Do not convert errors returned by dst to EFAULT.
		return int64(n), bufErr
	}

	// Go through internal mappings.
	return mm.withVecInternalMappings(ctx, ars, hostarch.Read, opts.IgnorePermissions, dst.WriteFromBlocks)
}
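// The following is an illustrative sketch, not part of the original file: a
// minimal safemem.Reader that fills destination blocks with zeroes,
// demonstrating the contract discussed in CopyOutFrom above. Because
// CopyOutFrom calls src.ReadToBlocks() at most once, a Reader must produce
// all of its data in that single call, which is why the AddressSpace path
// buffers first. The type and helper names (exampleZeroReader,
// exampleZeroVec) are hypothetical.
type exampleZeroReader struct{}

// ReadToBlocks implements safemem.Reader.ReadToBlocks.
func (exampleZeroReader) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
	return safemem.ZeroSeq(dsts)
}

// exampleZeroVec behaves like a vectorized ZeroOut built on CopyOutFrom.
func (mm *MemoryManager) exampleZeroVec(ctx context.Context, ars hostarch.AddrRangeSeq, opts usermem.IOOpts) (int64, error) {
	return mm.CopyOutFrom(ctx, ars, exampleZeroReader{}, opts)
}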
// EnsurePMAsExist attempts to ensure that PMAs exist for the given addr with
// the requested length. It returns the length for which it was able to either
// initialize PMAs or ascertain that PMAs already exist. If this length is
// smaller than the requested length, it also returns an error explaining why.
func (mm *MemoryManager) EnsurePMAsExist(ctx context.Context, addr hostarch.Addr, length int64, opts usermem.IOOpts) (int64, error) {
	ar, ok := mm.CheckIORange(addr, length)
	if !ok {
		return 0, linuxerr.EFAULT
	}
	n64, err := mm.withInternalMappings(ctx, ar, hostarch.Write, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
		return uint64(ims.NumBytes()), nil
	})
	return int64(n64), err
}

// SwapUint32 implements usermem.IO.SwapUint32.
func (mm *MemoryManager) SwapUint32(ctx context.Context, addr hostarch.Addr, new uint32, opts usermem.IOOpts) (uint32, error) {
	ar, ok := mm.CheckIORange(addr, 4)
	if !ok {
		return 0, linuxerr.EFAULT
	}

	// Do AddressSpace IO if applicable.
	if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions {
		for {
			old, err := mm.as.SwapUint32(addr, new)
			if err == nil {
				return old, nil
			}
			if f, ok := err.(platform.SegmentationFault); ok {
				if err := mm.handleASIOFault(ctx, f.Addr, ar, hostarch.ReadWrite); err != nil {
					return 0, err
				}
				continue
			}
			return 0, translateIOError(ctx, err)
		}
	}

	// Go through internal mappings.
	var old uint32
	_, err := mm.withInternalMappings(ctx, ar, hostarch.ReadWrite, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
		if ims.NumBlocks() != 1 || ims.NumBytes() != 4 {
			// Atomicity is unachievable across mappings.
			return 0, linuxerr.EFAULT
		}
		im := ims.Head()
		var err error
		old, err = safemem.SwapUint32(im, new)
		if err != nil {
			return 0, translateIOError(ctx, err)
		}
		// Return the number of bytes read.
		return 4, nil
	})
	return old, err
}

// CompareAndSwapUint32 implements usermem.IO.CompareAndSwapUint32.
func (mm *MemoryManager) CompareAndSwapUint32(ctx context.Context, addr hostarch.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) {
	ar, ok := mm.CheckIORange(addr, 4)
	if !ok {
		return 0, linuxerr.EFAULT
	}

	// Do AddressSpace IO if applicable.
	if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions {
		for {
			prev, err := mm.as.CompareAndSwapUint32(addr, old, new)
			if err == nil {
				return prev, nil
			}
			if f, ok := err.(platform.SegmentationFault); ok {
				if err := mm.handleASIOFault(ctx, f.Addr, ar, hostarch.ReadWrite); err != nil {
					return 0, err
				}
				continue
			}
			return 0, translateIOError(ctx, err)
		}
	}

	// Go through internal mappings.
	var prev uint32
	_, err := mm.withInternalMappings(ctx, ar, hostarch.ReadWrite, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
		if ims.NumBlocks() != 1 || ims.NumBytes() != 4 {
			// Atomicity is unachievable across mappings.
			return 0, linuxerr.EFAULT
		}
		im := ims.Head()
		var err error
		prev, err = safemem.CompareAndSwapUint32(im, old, new)
		if err != nil {
			return 0, translateIOError(ctx, err)
		}
		// Return the number of bytes read.
		return 4, nil
	})
	return prev, err
}

// LoadUint32 implements usermem.IO.LoadUint32.
func (mm *MemoryManager) LoadUint32(ctx context.Context, addr hostarch.Addr, opts usermem.IOOpts) (uint32, error) {
	ar, ok := mm.CheckIORange(addr, 4)
	if !ok {
		return 0, linuxerr.EFAULT
	}

	// Do AddressSpace IO if applicable.
	if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions {
		for {
			val, err := mm.as.LoadUint32(addr)
			if err == nil {
				return val, nil
			}
			if f, ok := err.(platform.SegmentationFault); ok {
				if err := mm.handleASIOFault(ctx, f.Addr, ar, hostarch.Read); err != nil {
					return 0, err
				}
				continue
			}
			return 0, translateIOError(ctx, err)
		}
	}

	// Go through internal mappings.
	var val uint32
	_, err := mm.withInternalMappings(ctx, ar, hostarch.Read, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
		if ims.NumBlocks() != 1 || ims.NumBytes() != 4 {
			// Atomicity is unachievable across mappings.
			return 0, linuxerr.EFAULT
		}
		im := ims.Head()
		var err error
		val, err = safemem.LoadUint32(im)
		if err != nil {
			return 0, translateIOError(ctx, err)
		}
		// Return the number of bytes read.
		return 4, nil
	})
	return val, err
}
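// The following is an illustrative sketch, not part of the original file: a
// futex-style read-modify-write on a 32-bit word in application memory,
// composed from LoadUint32 and CompareAndSwapUint32 above. The helper name
// exampleAtomicAddUser is hypothetical.
func (mm *MemoryManager) exampleAtomicAddUser(ctx context.Context, addr hostarch.Addr, delta uint32, opts usermem.IOOpts) (uint32, error) {
	for {
		old, err := mm.LoadUint32(ctx, addr, opts)
		if err != nil {
			return 0, err
		}
		new := old + delta
		// CompareAndSwapUint32 returns the previous value of the word; if
		// another thread modified it since the load, retry.
		prev, err := mm.CompareAndSwapUint32(ctx, addr, old, new, opts)
		if err != nil {
			return 0, err
		}
		if prev == old {
			return new, nil
		}
	}
}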
// handleASIOFault handles a page fault at address addr for an AddressSpaceIO
// operation spanning ioar.
//
// Preconditions:
//   - mm.as != nil.
//   - ioar.Length() != 0.
//   - ioar.Contains(addr).
func (mm *MemoryManager) handleASIOFault(ctx context.Context, addr hostarch.Addr, ioar hostarch.AddrRange, at hostarch.AccessType) error {
	// Try to map all remaining pages in the I/O operation. This RoundUp can't
	// overflow because otherwise it would have been caught by CheckIORange.
	end, _ := ioar.End.RoundUp()
	ar := hostarch.AddrRange{addr.RoundDown(), end}

	// Don't bother trying existingPMAsLocked; in most cases, if we did have
	// existing pmas, we wouldn't have faulted.

	// Ensure that we have usable vmas. Here and below, only return early if we
	// can't map the first (faulting) page; failures to map later pages are
	// silently ignored. This maximizes partial success.
	mm.mappingMu.RLock()
	vseg, vend, err := mm.getVMAsLocked(ctx, ar, at, false)
	if vendaddr := vend.Start(); vendaddr < ar.End {
		if vendaddr <= ar.Start {
			mm.mappingMu.RUnlock()
			return translateIOError(ctx, err)
		}
		ar.End = vendaddr
	}

	// Ensure that we have usable pmas.
	mm.activeMu.Lock()
	pseg, pend, err := mm.getPMAsLocked(ctx, vseg, ar, at)
	mm.mappingMu.RUnlock()
	if pendaddr := pend.Start(); pendaddr < ar.End {
		if pendaddr <= ar.Start {
			mm.activeMu.Unlock()
			return translateIOError(ctx, err)
		}
		ar.End = pendaddr
	}

	// Downgrade to a read-lock on activeMu since we don't need to mutate pmas
	// anymore.
	mm.activeMu.DowngradeLock()

	err = mm.mapASLocked(pseg, ar, memmap.PlatformEffectDefault)
	mm.activeMu.RUnlock()
	return translateIOError(ctx, err)
}
// withInternalMappings ensures that pmas exist for all addresses in ar,
// support access of type (at, ignorePermissions), and have internal mappings
// cached. It then calls f with mm.activeMu locked for reading, passing
// internal mappings for the subrange of ar for which this property holds.
//
// withInternalMappings takes a function returning uint64 since many safemem
// functions have this property, but returns an int64 since this is usually
// more useful for usermem.IO methods.
//
// Preconditions: 0 < ar.Length() <= math.MaxInt64.
func (mm *MemoryManager) withInternalMappings(ctx context.Context, ar hostarch.AddrRange, at hostarch.AccessType, ignorePermissions bool, f func(safemem.BlockSeq) (uint64, error)) (int64, error) {
	// If pmas are already available, we can do IO without touching mm.vmas or
	// mm.mappingMu.
	mm.activeMu.RLock()
	if pseg := mm.existingPMAsLocked(ar, at, ignorePermissions, true /* needInternalMappings */); pseg.Ok() {
		n, err := f(mm.internalMappingsLocked(pseg, ar))
		mm.activeMu.RUnlock()
		// Do not convert errors returned by f to EFAULT.
		return int64(n), err
	}
	mm.activeMu.RUnlock()

	// Ensure that we have usable vmas.
	mm.mappingMu.RLock()
	vseg, vend, verr := mm.getVMAsLocked(ctx, ar, at, ignorePermissions)
	if vendaddr := vend.Start(); vendaddr < ar.End {
		if vendaddr <= ar.Start {
			mm.mappingMu.RUnlock()
			return 0, translateIOError(ctx, verr)
		}
		ar.End = vendaddr
	}

	// Ensure that we have usable pmas.
	mm.activeMu.Lock()
	pseg, pend, perr := mm.getPMAsLocked(ctx, vseg, ar, at)
	mm.mappingMu.RUnlock()
	if pendaddr := pend.Start(); pendaddr < ar.End {
		if pendaddr <= ar.Start {
			mm.activeMu.Unlock()
			return 0, translateIOError(ctx, perr)
		}
		ar.End = pendaddr
	}
	imbs, t, imerr := mm.getIOMappingsLocked(pseg, ar, at)
	mm.activeMu.DowngradeLock()
	if imlen := imbs.NumBytes(); imlen < uint64(ar.Length()) {
		if imlen == 0 {
			t.flush(0, nil)
			mm.activeMu.RUnlock()
			return 0, translateIOError(ctx, imerr)
		}
		ar.End = ar.Start + hostarch.Addr(imlen)
	}

	// Do I/O.
	un, err := t.flush(f(imbs))
	mm.activeMu.RUnlock()
	n := int64(un)

	// Return the first error in order of progress through ar.
	if err != nil {
		// Do not convert errors returned by f to EFAULT.
		return n, err
	}
	if imerr != nil {
		return n, translateIOError(ctx, imerr)
	}
	if perr != nil {
		return n, translateIOError(ctx, perr)
	}
	return n, translateIOError(ctx, verr)
}
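// The following is an illustrative sketch, not part of the original file: it
// shows the withInternalMappings callback contract. f receives a
// safemem.BlockSeq covering the usable subrange of ar and returns the number
// of bytes it completed; errors returned by f are passed through rather than
// converted to EFAULT. This hypothetical helper probes a range without
// touching its contents, similar in spirit to EnsurePMAsExist above (but for
// read access), and also reports how many distinct internal-mapping blocks
// back it.
func (mm *MemoryManager) exampleProbeRange(ctx context.Context, ar hostarch.AddrRange, opts usermem.IOOpts) (int64, int, error) {
	// ar is assumed to be non-empty, per withInternalMappings' preconditions.
	var blocks int
	n, err := mm.withInternalMappings(ctx, ar, hostarch.Read, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
		blocks = ims.NumBlocks()
		// Claim the entire usable subrange as completed.
		return ims.NumBytes(), nil
	})
	return n, blocks, err
}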
// withVecInternalMappings ensures that pmas exist for all addresses in ars,
// support access of type (at, ignorePermissions), and have internal mappings
// cached. It then calls f with mm.activeMu locked for reading, passing
// internal mappings for the subset of ars for which this property holds.
//
// Preconditions: !ars.IsEmpty().
func (mm *MemoryManager) withVecInternalMappings(ctx context.Context, ars hostarch.AddrRangeSeq, at hostarch.AccessType, ignorePermissions bool, f func(safemem.BlockSeq) (uint64, error)) (int64, error) {
	// withInternalMappings is faster than withVecInternalMappings because of
	// iterator plumbing (this isn't generally practical in the vector case due
	// to iterator invalidation between AddrRanges). Use it if possible.
	if ars.NumRanges() == 1 {
		return mm.withInternalMappings(ctx, ars.Head(), at, ignorePermissions, f)
	}

	// If pmas are already available, we can do IO without touching mm.vmas or
	// mm.mappingMu.
	mm.activeMu.RLock()
	if mm.existingVecPMAsLocked(ars, at, ignorePermissions, true /* needInternalMappings */) {
		n, err := f(mm.vecInternalMappingsLocked(ars))
		mm.activeMu.RUnlock()
		// Do not convert errors returned by f to EFAULT.
		return int64(n), err
	}
	mm.activeMu.RUnlock()

	// Ensure that we have usable vmas.
	mm.mappingMu.RLock()
	vars, verr := mm.getVecVMAsLocked(ctx, ars, at, ignorePermissions)
	if vars.NumBytes() == 0 {
		mm.mappingMu.RUnlock()
		return 0, translateIOError(ctx, verr)
	}

	// Ensure that we have usable pmas.
	mm.activeMu.Lock()
	pars, perr := mm.getVecPMAsLocked(ctx, vars, at)
	mm.mappingMu.RUnlock()
	if pars.NumBytes() == 0 {
		mm.activeMu.Unlock()
		return 0, translateIOError(ctx, perr)
	}
	imbs, t, imerr := mm.getVecIOMappingsLocked(pars, at)
	mm.activeMu.DowngradeLock()
	if imbs.NumBytes() == 0 {
		t.flush(0, nil)
		mm.activeMu.RUnlock()
		return 0, translateIOError(ctx, imerr)
	}

	// Do I/O.
	un, err := t.flush(f(imbs))
	mm.activeMu.RUnlock()
	n := int64(un)

	// Return the first error in order of progress through ars.
	if err != nil {
		// Do not convert errors from f to EFAULT.
		return n, err
	}
	if imerr != nil {
		return n, translateIOError(ctx, imerr)
	}
	if perr != nil {
		return n, translateIOError(ctx, perr)
	}
	return n, translateIOError(ctx, verr)
}
// getIOMappingsLocked returns internal mappings appropriate for I/O for
// addresses in ar. If mappings are only available for a strict subset of ar,
// the returned error is non-nil.
//
// ioBufTracker.flush() must be called on the returned ioBufTracker when the
// returned mappings are no longer in use, and its return value indicates the
// number of bytes actually completed after buffer flushing. Returned mappings
// are valid until either mm.activeMu is unlocked or ioBufTracker.flush() is
// called.
//
// Preconditions:
//   - mm.activeMu must be locked for writing.
//   - pseg.Range().Contains(ar.Start).
//   - pmas must exist for all addresses in ar.
//   - ar.Length() != 0.
//
// Postconditions: getIOMappingsLocked does not invalidate iterators into mm.pmas.
func (mm *MemoryManager) getIOMappingsLocked(pseg pmaIterator, ar hostarch.AddrRange, at hostarch.AccessType) (safemem.BlockSeq, *ioBufTracker, error) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
		if !pseg.Range().Contains(ar.Start) {
			panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar))
		}
	}

	if ar.End <= pseg.End() {
		// Since only one pma is involved, we can use pma.internalMappings
		// directly, avoiding a slice allocation.
		if err := pseg.getInternalMappingsLocked(); err != nil {
			if _, ok := err.(memmap.BufferedIOFallbackErr); ok {
				goto slowPath
			}
			return safemem.BlockSeq{}, nil, err
		}
		offset := uint64(ar.Start - pseg.Start())
		return pseg.ValuePtr().internalMappings.DropFirst64(offset).TakeFirst64(uint64(ar.Length())), nil, nil
	}

slowPath:
	ims, t, _, err := mm.getIOMappingsTrackedLocked(pseg, ar, at, nil, nil, 0)
	return safemem.BlockSeqFromSlice(ims), t, err
}
// getVecIOMappingsLocked returns internal mappings appropriate for I/O for
// addresses in ars. If mappings are only available for a strict subset of
// ars, the returned error is non-nil.
//
// ioBufTracker.flush() must be called on the returned ioBufTracker when the
// returned mappings are no longer in use, and its return value indicates the
// number of bytes actually completed after buffer flushing. Returned mappings
// are valid until either mm.activeMu is unlocked or ioBufTracker.flush() is
// called.
//
// Preconditions:
//   - mm.activeMu must be locked for writing.
//   - pmas must exist for all addresses in ars.
//
// Postconditions: getVecIOMappingsLocked does not invalidate iterators into
// mm.pmas.
func (mm *MemoryManager) getVecIOMappingsLocked(ars hostarch.AddrRangeSeq, at hostarch.AccessType) (safemem.BlockSeq, *ioBufTracker, error) {
	if ars.NumRanges() == 1 {
		ar := ars.Head()
		return mm.getIOMappingsLocked(mm.pmas.FindSegment(ar.Start), ar, at)
	}

	var ims []safemem.Block
	var t *ioBufTracker
	unbufBytes := uint64(0)
	for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() {
		ar := arsit.Head()
		if ar.Length() == 0 {
			continue
		}
		var err error
		ims, t, unbufBytes, err = mm.getIOMappingsTrackedLocked(mm.pmas.FindSegment(ar.Start), ar, at, ims, t, unbufBytes)
		if err != nil {
			return safemem.BlockSeqFromSlice(ims), t, err
		}
	}
	return safemem.BlockSeqFromSlice(ims), t, nil
}

// getIOMappingsTrackedLocked collects internal mappings appropriate for I/O
// for addresses in ar, appends them to ims, and returns an updated slice. If
// mappings are only available for a strict subset of ar, the returned error is
// non-nil.
//
// If any iterated memmap.Files require buffering for I/O, they are recorded in
// an ioBufTracker. Since the ioBufTracker pointer is initially nil (to
// minimize overhead for the common case where no memmap.Files require
// buffering for I/O), getIOMappingsTrackedLocked returns an updated
// ioBufTracker pointer.
//
// unbufBytes is the number of bytes of unbuffered mappings that have been
// appended to ims since the last buffered mapping; getIOMappingsTrackedLocked
// also returns an updated value for unbufBytes.
//
// Returned mappings are valid until either mm.activeMu is unlocked or
// ioBufTracker.flush() is called.
//
// Preconditions:
//   - mm.activeMu must be locked for writing.
//   - pseg.Range().Contains(ar.Start).
//   - pmas must exist for all addresses in ar.
//   - ar.Length() != 0.
//
// Postconditions: getIOMappingsTrackedLocked does not invalidate iterators
// into mm.pmas.
func (mm *MemoryManager) getIOMappingsTrackedLocked(pseg pmaIterator, ar hostarch.AddrRange, at hostarch.AccessType, ims []safemem.Block, t *ioBufTracker, unbufBytes uint64) ([]safemem.Block, *ioBufTracker, uint64, error) {
	for {
		pmaAR := ar.Intersect(pseg.Range())
		if err := pseg.getInternalMappingsLocked(); err == nil {
			// Iterate the subset of the PMA's cached internal mappings that
			// correspond to pmaAR, and append them to ims.
			for pims := pseg.ValuePtr().internalMappings.DropFirst64(uint64(pmaAR.Start - pseg.Start())).TakeFirst64(uint64(pmaAR.Length())); !pims.IsEmpty(); pims = pims.Tail() {
				ims = append(ims, pims.Head())
			}
			unbufBytes += uint64(pmaAR.Length())
		} else if _, ok := err.(memmap.BufferedIOFallbackErr); !ok {
			return ims, t, unbufBytes, err
		} else {
			// Fall back to buffered I/O as instructed.
			if t == nil {
				t = getIOBufTracker(at.Write)
			}
			buf := getByteSlicePtr(int(pmaAR.Length()))
			pma := pseg.ValuePtr()
			off := pseg.fileRangeOf(pmaAR).Start
			// If the caller will read from the buffer, fill it from the file;
			// otherwise leave it zeroed.
			if at.Read || at.Execute {
				var n uint64
				n, err = pma.file.BufferReadAt(off, *buf)
				*buf = (*buf)[:n]
			} else {
				err = nil
			}
			if len(*buf) != 0 {
				ims = append(ims, safemem.BlockFromSafeSlice(*buf))
				t.bufs = append(t.bufs, ioBuf{
					unbufBytesBefore: unbufBytes,
					file:             pma.file,
					off:              off,
					buf:              buf,
				})
				unbufBytes = 0
			}
			if err != nil {
				return ims, t, unbufBytes, err
			}
		}
		if ar.End <= pseg.End() {
			return ims, t, unbufBytes, nil
		}
		pseg, _ = pseg.NextNonEmpty()
	}
}
type ioBuf struct {
	unbufBytesBefore uint64
	file             memmap.File
	off              uint64
	buf              *[]byte
}

type ioBufTracker struct {
	write bool
	bufs  []ioBuf
}

var ioBufTrackerPool = sync.Pool{
	New: func() any {
		return &ioBufTracker{}
	},
}

func getIOBufTracker(write bool) *ioBufTracker {
	t := ioBufTrackerPool.Get().(*ioBufTracker)
	t.write = write
	return t
}

func putIOBufTracker(t *ioBufTracker) {
	for i := range t.bufs {
		t.bufs[i].file = nil
		putByteSlicePtr(t.bufs[i].buf)
		t.bufs[i].buf = nil
	}
	t.bufs = t.bufs[:0]
	ioBufTrackerPool.Put(t)
}

func (t *ioBufTracker) flush(prevN uint64, prevErr error) (uint64, error) {
	if t == nil {
		return prevN, prevErr
	}
	return t.flushSlow(prevN, prevErr)
}

func (t *ioBufTracker) flushSlow(prevN uint64, prevErr error) (uint64, error) {
	defer putIOBufTracker(t)
	if !t.write {
		return prevN, prevErr
	}
	// Flush dirty buffers to underlying memmap.Files.
	rem := prevN
	done := uint64(0)
	for i := range t.bufs {
		buf := &t.bufs[i]
		if rem <= buf.unbufBytesBefore {
			// The write ended before reaching buf.buf.
			break
		}
		rem -= buf.unbufBytesBefore
		done += buf.unbufBytesBefore
		n, err := buf.file.BufferWriteAt(buf.off, (*buf.buf)[:min(len(*buf.buf), int(rem))])
		rem -= n
		done += n
		if err != nil {
			return done, err
		}
	}
	// All buffers covered by prevN were written back successfully.
	return prevN, prevErr
}

var byteSlicePtrPool sync.Pool

// getByteSlicePtr returns a pointer to a byte slice with the given length. The
// slice is either newly-allocated or recycled from a previous call to
// putByteSlicePtr. The pointer should be passed to putByteSlicePtr when the
// slice is no longer in use.
func getByteSlicePtr(l int) *[]byte {
	a := byteSlicePtrPool.Get()
	if a == nil {
		s := make([]byte, l)
		return &s
	}
	sp := a.(*[]byte)
	s := *sp
	if l <= cap(s) {
		s = s[:l]
	} else {
		s = make([]byte, l)
	}
	*sp = s
	return sp
}

// putByteSlicePtr marks all of the given slice's capacity as reusable by a
// future call to getByteSlicePtr.
func putByteSlicePtr(s *[]byte) {
	byteSlicePtrPool.Put(s)
}

// truncatedAddrRangeSeq returns a copy of ars, but with the end truncated to
// at most address end on AddrRange arsit.Head(). It is used in vector I/O
// paths to truncate hostarch.AddrRangeSeq when errors occur.
//
// Preconditions:
//   - !arsit.IsEmpty().
//   - end <= arsit.Head().End.
func truncatedAddrRangeSeq(ars, arsit hostarch.AddrRangeSeq, end hostarch.Addr) hostarch.AddrRangeSeq {
	ar := arsit.Head()
	if end <= ar.Start {
		return ars.TakeFirst64(ars.NumBytes() - arsit.NumBytes())
	}
	return ars.TakeFirst64(ars.NumBytes() - arsit.NumBytes() + int64(end-ar.Start))
}
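// The following is an illustrative sketch, not part of the original file: the
// intended borrow/use/return pattern for the byte-slice pool above. The
// function name exampleBorrowBuffer is hypothetical.
func exampleBorrowBuffer(l int, fill byte) {
	// Borrow a slice of length l; its contents may be stale (recycled from an
	// earlier use), so initialize it before use.
	buf := getByteSlicePtr(l)
	defer putByteSlicePtr(buf)
	for i := range *buf {
		(*buf)[i] = fill
	}
	// ... use *buf here; it must not be retained after putByteSlicePtr
	// returns it to the pool.
}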