github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/platform/kvm/address_space.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kvm

import (
	"github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops"
	"github.com/nicocha30/gvisor-ligolo/pkg/hostarch"
	"github.com/nicocha30/gvisor-ligolo/pkg/ring0/pagetables"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/memmap"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/platform"
	"github.com/nicocha30/gvisor-ligolo/pkg/sync"
)

// dirtySet tracks vCPUs for invalidation.
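//
// Each 64-bit word in vCPUMasks holds one bit per vCPU: bit b of word i
// tracks the vCPU with id 64*i+b, mirroring the index/bit math in forEach
// and mark below. For example, the vCPU with id 70 maps to bit 6 of word 1.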
type dirtySet struct {
	vCPUMasks []atomicbitops.Uint64
}

// forEach iterates over all vCPUs in the dirty set, clearing each mask word
// as it is visited.
//
//go:nosplit
func (ds *dirtySet) forEach(m *machine, fn func(c *vCPU)) {
	for index := range ds.vCPUMasks {
		mask := ds.vCPUMasks[index].Swap(0)
		if mask != 0 {
			for bit := 0; bit < 64; bit++ {
				if mask&(1<<uint64(bit)) == 0 {
					continue
				}
				id := 64*index + bit
				fn(m.vCPUsByID[id])
			}
		}
	}
}

// mark marks the given vCPU as dirty and returns whether it was previously
// clean. Being previously clean implies that a flush is needed on entry.
func (ds *dirtySet) mark(c *vCPU) bool {
	index := uint64(c.id) / 64
	bit := uint64(1) << uint(c.id%64)

	oldValue := ds.vCPUMasks[index].Load()
	if oldValue&bit != 0 {
		return false // Not clean.
	}

	// Set the bit unilaterally, and ensure that a flush takes place. Note
	// that it's possible for races to occur here, but since the flush is
	// taking place long after these lines there's no race in practice.
	atomicbitops.OrUint64(&ds.vCPUMasks[index], bit)
	return true // Previously clean.
}

// addressSpace is a wrapper for PageTables.
type addressSpace struct {
	platform.NoAddressSpaceIO

	// mu is the lock for modifications to the address space.
	//
	// Note that the page tables themselves are not locked.
	mu sync.Mutex

	// machine is the underlying machine.
	machine *machine

	// pageTables are for this particular address space.
	pageTables *pagetables.PageTables

	// dirtySet is the set of dirty vCPUs.
	dirtySet *dirtySet
}
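
// A rough sketch of how these methods appear to fit together, based only on
// the signatures and comments in this file (the call sites live elsewhere in
// the kvm package, so treat this as an illustration, not a contract):
//
//	as.MapFile(addr, f, fr, at, precommit) // install guest mappings
//	flush := as.Touch(c)                   // mark vCPU c dirty; flush if it was clean
//	as.Unmap(addr, length)                 // tear down mappings, invalidating dirty vCPUs
//	as.Release()                           // drop the page tables entirely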

// Invalidate interrupts all dirty contexts.
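//
// The unexported invalidate that does the actual work is not defined in this
// file; it lives elsewhere in the kvm package and presumably walks the dirty
// set (via dirtySet.forEach) to interrupt each dirty vCPU.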
func (as *addressSpace) Invalidate() {
	as.mu.Lock()
	defer as.mu.Unlock()
	as.invalidate()
}

// Touch adds the given vCPU to the dirty list.
//
// The return value indicates whether a flush is required.
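// A flush is required when the vCPU was previously clean (see dirtySet.mark).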
func (as *addressSpace) Touch(c *vCPU) bool {
	return as.dirtySet.mark(c)
}

type hostMapEntry struct {
	addr   uintptr
	length uintptr
}

// mapLocked maps the given host entry.
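//
// The host range may not be physically contiguous: each iteration of the loop
// below maps one chunk, using the length returned by translateToPhysical
// (clamped to the remaining length).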
//
// +checkescape:hard,stack
//
//go:nosplit
func (as *addressSpace) mapLocked(addr hostarch.Addr, m hostMapEntry, at hostarch.AccessType) (inv bool) {
	for m.length > 0 {
		physical, length, ok := translateToPhysical(m.addr)
		if !ok {
			panic("unable to translate segment")
		}
		if length > m.length {
			length = m.length
		}

		// Ensure that this map has physical mappings. If the page does
		// not have physical mappings, the KVM module may inject
		// spurious exceptions when emulation fails (i.e. it tries to
		// emulate because the RIP is pointed at those pages).
		as.machine.mapPhysical(physical, length, physicalRegions)

		// Install the page table mappings. Note that the ordering is
		// important; if the pagetable mappings were installed before
		// ensuring the physical pages were available, then some other
		// thread could theoretically access them.
		inv = as.pageTables.Map(addr, length, pagetables.MapOpts{
			AccessType: at,
			User:       true,
		}, physical) || inv
		m.addr += length
		m.length -= length
		addr += hostarch.Addr(length)
	}

	return inv
}

// MapFile implements platform.AddressSpace.MapFile.
func (as *addressSpace) MapFile(addr hostarch.Addr, f memmap.File, fr memmap.FileRange, at hostarch.AccessType, precommit bool) error {
	as.mu.Lock()
	defer as.mu.Unlock()

	// Get mappings in the sentry's address space, which are guaranteed to be
	// valid as long as a reference is held on the mapped pages (which is in
	// turn required by the AddressSpace.MapFile preconditions).
	//
	// If precommit is true, we will touch mappings to commit them, so ensure
	// that mappings are readable from sentry context.
	//
	// We don't execute from application file-mapped memory, and guest page
	// tables don't care if we have execute permission (but they do need pages
	// to be readable).
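	// For example, an execute-only application mapping still requests a
	// readable sentry mapping below (Read includes at.Execute): instruction
	// fetches in the guest are backed by these same sentry pages, so they
	// must be readable here even though the sentry never executes them.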
	bs, err := f.MapInternal(fr, hostarch.AccessType{
		Read:  at.Read || at.Execute || precommit,
		Write: at.Write,
	})
	if err != nil {
		return err
	}

	// See block in mapLocked.
	as.pageTables.Allocator.(*allocator).cpu = as.machine.Get()
	defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu)

	// Map the mappings in the sentry's address space (guest physical memory)
	// into the application's address space (guest virtual memory).
	inv := false
	for !bs.IsEmpty() {
		b := bs.Head()
		bs = bs.Tail()
		// Since fr was page-aligned, b should also be page-aligned. We do the
		// lookup in our host page tables for this translation.
		if precommit {
			s := b.ToSlice()
			for i := 0; i < len(s); i += hostarch.PageSize {
				_ = s[i] // Touch to commit.
			}
		}

		// See bluepill_allocator.go.
		bluepill(as.pageTables.Allocator.(*allocator).cpu)

		// Perform the mapping.
		prev := as.mapLocked(addr, hostMapEntry{
			addr:   b.Addr(),
			length: uintptr(b.Len()),
		}, at)
		inv = inv || prev
		addr += hostarch.Addr(b.Len())
	}
	if inv {
		as.invalidate()
	}

	return nil
}

// unmapLocked is an escape-checked wrapper around Unmap.
//
// +checkescape:hard,stack
//
//go:nosplit
func (as *addressSpace) unmapLocked(addr hostarch.Addr, length uint64) bool {
	return as.pageTables.Unmap(addr, uintptr(length))
}

// Unmap unmaps the given range by calling pagetables.PageTables.Unmap.
func (as *addressSpace) Unmap(addr hostarch.Addr, length uint64) {
	as.mu.Lock()
	defer as.mu.Unlock()

	// See above & bluepill_allocator.go.
	as.pageTables.Allocator.(*allocator).cpu = as.machine.Get()
	defer as.machine.Put(as.pageTables.Allocator.(*allocator).cpu)
	bluepill(as.pageTables.Allocator.(*allocator).cpu)

	if prev := as.unmapLocked(addr, length); prev {
		// Invalidate all active vCPUs.
		as.invalidate()

		// Recycle any freed intermediate pages.
		as.pageTables.Allocator.Recycle()
	}
}

// Release releases the page tables.
func (as *addressSpace) Release() {
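	// Unmapping the entire range tears down all mappings and, if anything
	// was mapped, invalidates dirty vCPUs and recycles intermediate
	// page-table pages (see Unmap above).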
	as.Unmap(0, ^uint64(0))

	// Free all pages from the allocator.
	as.pageTables.Allocator.(*allocator).base.Drain()

	// Drop all cached machine references.
	as.machine.dropPageTables(as.pageTables)
}

// PreFork implements platform.AddressSpace.PreFork.
func (as *addressSpace) PreFork() {}

// PostFork implements platform.AddressSpace.PostFork.
func (as *addressSpace) PostFork() {}