github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/platform/kvm/address_space.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kvm

import (
	"sync/atomic"

	"github.com/SagerNet/gvisor/pkg/atomicbitops"
	"github.com/SagerNet/gvisor/pkg/hostarch"
	"github.com/SagerNet/gvisor/pkg/ring0/pagetables"
	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
	"github.com/SagerNet/gvisor/pkg/sentry/platform"
	"github.com/SagerNet/gvisor/pkg/sync"
)
// dirtySet tracks vCPUs for invalidation.
type dirtySet struct {
	vCPUMasks []uint64
}

// forEach iterates over all CPUs in the dirty set.
//
//go:nosplit
func (ds *dirtySet) forEach(m *machine, fn func(c *vCPU)) {
	for index := range ds.vCPUMasks {
		mask := atomic.SwapUint64(&ds.vCPUMasks[index], 0)
		if mask != 0 {
			for bit := 0; bit < 64; bit++ {
				if mask&(1<<uint64(bit)) == 0 {
					continue
				}
				id := 64*index + bit
				fn(m.vCPUsByID[id])
			}
		}
	}
}

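// Illustrative note: the swap in forEach snapshots and clears a whole mask
// word atomically, so a vCPU marked concurrently is either visited in this
// pass or left set for the next one. For example, with a hypothetical
//
//	vCPUMasks = []uint64{0b101}
//
// the swap returns 0b101 (leaving the word zero) and fn runs for the vCPUs
// with IDs 0 and 2.
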
// mark marks the given vCPU as dirty and returns whether it was previously
// clean. Being previously clean implies that a flush is needed on entry.
func (ds *dirtySet) mark(c *vCPU) bool {
	index := uint64(c.id) / 64
	bit := uint64(1) << uint(c.id%64)

	oldValue := atomic.LoadUint64(&ds.vCPUMasks[index])
	if oldValue&bit != 0 {
		return false // Not clean.
	}

	// Set the bit unilaterally, and ensure that a flush takes place. Note
	// that it's possible for races to occur here, but since the flush is
	// taking place long after these lines there's no race in practice.
	atomicbitops.OrUint64(&ds.vCPUMasks[index], bit)
	return true // Previously clean.
}

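// Illustrative arithmetic: mark packs vCPU IDs 64 to a mask word. For
// example, a vCPU with id 70 lands in
//
//	index := uint64(70) / 64        // == 1
//	bit := uint64(1) << uint(70%64) // == 1 << 6
//
// and forEach recovers the same ID as 64*1 + 6 == 70.
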
// addressSpace is a wrapper for PageTables.
type addressSpace struct {
	platform.NoAddressSpaceIO

	// mu is the lock for modifications to the address space.
	//
	// Note that the page tables themselves are not locked.
	mu sync.Mutex

	// machine is the underlying machine.
	machine *machine

	// pageTables are for this particular address space.
	pageTables *pagetables.PageTables

	// dirtySet is the set of dirty vCPUs.
	dirtySet *dirtySet
}

// Invalidate interrupts all dirty contexts.
func (as *addressSpace) Invalidate() {
	as.mu.Lock()
	defer as.mu.Unlock()
	as.invalidate()
}

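// invalidate itself is not defined in this file. A plausible sketch of what
// "interrupting all dirty contexts" amounts to, assuming a vCPU helper that
// forces a transition out of guest mode (the helper name and the activity
// check are assumptions, not taken from this file):
//
//	func (as *addressSpace) invalidate() {
//		as.dirtySet.forEach(as.machine, func(c *vCPU) {
//			// Only a vCPU currently running this address space needs
//			// to be kicked back into the kernel; it will flush before
//			// re-entering the guest.
//			c.BounceToKernel() // assumed helper
//		})
//	}
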
// Touch adds the given vCPU to the dirty list.
//
// The return value indicates whether a flush is required.
func (as *addressSpace) Touch(c *vCPU) bool {
	return as.dirtySet.mark(c)
}

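// Hypothetical usage: a context-switch path would consult Touch to decide
// whether the vCPU must flush stale translations before entering the guest,
// along the lines of
//
//	flush := as.Touch(cpu) // true => cpu was previously clean
//	// ... request a flush on guest entry when flush is true.
//
// (The call site above is illustrative, not taken from this file.)
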
type hostMapEntry struct {
	addr   uintptr
	length uintptr
}

// mapLocked maps the given host entry.
//
// +checkescape:hard,stack
//
//go:nosplit
func (as *addressSpace) mapLocked(addr hostarch.Addr, m hostMapEntry, at hostarch.AccessType) (inv bool) {
	for m.length > 0 {
		physical, length, ok := translateToPhysical(m.addr)
		if !ok {
			panic("unable to translate segment")
		}
		if length > m.length {
			length = m.length
		}

		// Ensure that this map has physical mappings. If the page does
		// not have physical mappings, the KVM module may inject
		// spurious exceptions when emulation fails (i.e. it tries to
		// emulate because the RIP is pointed at those pages).
		as.machine.mapPhysical(physical, length, physicalRegions, _KVM_MEM_FLAGS_NONE)

		// Install the page table mappings. Note that the ordering is
		// important: if the page table mappings were installed before
		// ensuring the physical pages were available, then some other
		// thread could theoretically access them. (An illustrative
		// trace of this loop follows the function.)
		inv = as.pageTables.Map(addr, length, pagetables.MapOpts{
			AccessType: at,
			User:       true,
		}, physical) || inv
		m.addr += length
		m.length -= length
		addr += hostarch.Addr(length)
	}

	return inv
}

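// Illustrative trace of mapLocked (the addresses are made up): a host entry
// spanning two physical regions is split across two loop iterations,
//
//	m = hostMapEntry{addr: 0x7f00_0000_0000, length: 0x3000}
//	iteration 1: translateToPhysical covers 0x1000 bytes -> mapPhysical, then Map
//	iteration 2: translateToPhysical covers 0x2000 bytes -> mapPhysical, then Map
//
// and each iteration registers the physical pages with KVM before installing
// the guest page table entries that reference them, which is the ordering
// the comment above insists on.
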
// MapFile implements platform.AddressSpace.MapFile.
func (as *addressSpace) MapFile(addr hostarch.Addr, f memmap.File, fr memmap.FileRange, at hostarch.AccessType, precommit bool) error {
	as.mu.Lock()
	defer as.mu.Unlock()

	// Get mappings in the sentry's address space, which are guaranteed to be
	// valid as long as a reference is held on the mapped pages (which is in
	// turn required by the AddressSpace.MapFile preconditions).
	//
	// If precommit is true, we will touch mappings to commit them, so ensure
	// that mappings are readable from sentry context.
	//
	// We don't execute from application file-mapped memory, and guest page
	// tables don't care if we have execute permission (but they do need pages
	// to be readable). A worked example of the resulting permissions follows
	// this function.
	bs, err := f.MapInternal(fr, hostarch.AccessType{
		Read:  at.Read || at.Execute || precommit,
		Write: at.Write,
	})
	if err != nil {
		return err
	}

	// See block in mapLocked.
	as.pageTables.Allocator.(*allocator).cpu = as.machine.Get()
	defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu)

	// Map the mappings in the sentry's address space (guest physical memory)
	// into the application's address space (guest virtual memory).
	inv := false
	for !bs.IsEmpty() {
		b := bs.Head()
		bs = bs.Tail()
		// Since fr was page-aligned, b should also be page-aligned. We do the
		// lookup in our host page tables for this translation.
		if precommit {
			s := b.ToSlice()
			for i := 0; i < len(s); i += hostarch.PageSize {
				_ = s[i] // Touch to commit.
			}
		}

		// See bluepill_allocator.go.
		bluepill(as.pageTables.Allocator.(*allocator).cpu)

		// Perform the mapping.
		prev := as.mapLocked(addr, hostMapEntry{
			addr:   b.Addr(),
			length: uintptr(b.Len()),
		}, at)
		inv = inv || prev
		addr += hostarch.Addr(b.Len())
	}
	if inv {
		as.invalidate()
	}

	return nil
}

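// Worked example of the MapInternal permissions above (the values are for
// illustration): for an execute-only application mapping with
// precommit=false, at is {Execute: true}, so the sentry-side mapping is
// requested with Read=true (the pages must be readable from sentry context)
// and Write=false, while the guest page tables installed by mapLocked still
// carry the original at, i.e. execute-only for the application. With
// precommit=true, Read is requested regardless of at so that the touch loop
// can fault every page in ahead of time.
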
// unmapLocked is an escape-checked wrapper around Unmap.
//
// +checkescape:hard,stack
//
//go:nosplit
func (as *addressSpace) unmapLocked(addr hostarch.Addr, length uint64) bool {
	return as.pageTables.Unmap(addr, uintptr(length))
}

// Unmap unmaps the given range by calling pagetables.PageTables.Unmap.
func (as *addressSpace) Unmap(addr hostarch.Addr, length uint64) {
	as.mu.Lock()
	defer as.mu.Unlock()

	// See above & bluepill_allocator.go.
	as.pageTables.Allocator.(*allocator).cpu = as.machine.Get()
	defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu)
	bluepill(as.pageTables.Allocator.(*allocator).cpu)

	if prev := as.unmapLocked(addr, length); prev {
		// Invalidate all active vCPUs.
		as.invalidate()

		// Recycle any freed intermediate pages.
		as.pageTables.Allocator.Recycle()
	}
}

// Release releases the page tables.
func (as *addressSpace) Release() {
	as.Unmap(0, ^uint64(0))

	// Free all pages from the allocator.
	as.pageTables.Allocator.(*allocator).base.Drain()

	// Drop all cached machine references.
	as.machine.dropPageTables(as.pageTables)
}

// PreFork implements platform.AddressSpace.PreFork.
func (as *addressSpace) PreFork() {}

// PostFork implements platform.AddressSpace.PostFork.
func (as *addressSpace) PostFork() {}