github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/platform/kvm/machine_arm64.go

// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build arm64
// +build arm64

package kvm

import (
	"fmt"
	"runtime"

	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
	"github.com/nicocha30/gvisor-ligolo/pkg/hostarch"
	"github.com/nicocha30/gvisor-ligolo/pkg/ring0"
	"github.com/nicocha30/gvisor-ligolo/pkg/ring0/pagetables"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/platform"
	"golang.org/x/sys/unix"
)

type vCPUArchState struct {
	// PCIDs is the set of PCIDs for this vCPU.
	//
	// This starts above fixedKernelPCID.
	PCIDs *pagetables.PCIDs
}

const (
	// fixedKernelPCID is a fixed kernel PCID used for the kernel page
	// tables. We must start allocating user PCIDs above this in order to
	// avoid any conflict (see below).
	fixedKernelPCID = 1

	// poolPCIDs is the number of PCIDs to record in the database. As this
	// grows, assignment can take longer, since it is a simple linear scan.
	// Beyond a relatively small number, there are likely few performance
	// benefits, since the TLB has likely long since lost any translations
	// from more than a few PCIDs past.
	poolPCIDs = 128
)

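// A sketch of how these constants fit together, based on upstream gVisor's
// vCPU initialization (the exact call site in this fork may differ): each
// vCPU's PCID pool is allocated starting just above the reserved kernel
// PCID, e.g.
//
//	c.PCIDs = pagetables.NewPCIDs(fixedKernelPCID+1, poolPCIDs)
//
// so user PCIDs occupy [fixedKernelPCID+1, fixedKernelPCID+1+poolPCIDs)
// and can never collide with the kernel's fixed PCID.
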
// mapUpperHalf maps all physical regions into the upper (kernel) half of
// the address space, offset by ring0.KernelStartAddress.
func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {
	applyPhysicalRegions(func(pr physicalRegion) bool {
		pageTable.Map(
			hostarch.Addr(ring0.KernelStartAddress|pr.virtual),
			pr.length,
			pagetables.MapOpts{AccessType: hostarch.AnyAccess, Global: true},
			pr.physical)

		return true // Keep iterating.
	})
}

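// For illustration (the actual value of ring0.KernelStartAddress depends on
// the configured virtual address size, so treat these numbers as
// hypothetical): with an upper half starting at 0xffff_0000_0000_0000, a
// physical region at host virtual address 0x0000_0040_0000_0000 is mapped
// at
//
//	0xffff000000000000 | 0x0000004000000000 = 0xffff004000000000
//
// i.e. the same offset, mirrored into the kernel half of the guest
// address space.
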
// archPhysicalRegions splits the physical regions around the read-only
// guest regions, allocating separate physical regions for them.
func archPhysicalRegions(physicalRegions []physicalRegion) []physicalRegion {
	rdRegions := []virtualRegion{}
	if err := applyVirtualRegions(func(vr virtualRegion) {
		if excludeVirtualRegion(vr) {
			return // skip region.
		}
		// Collect read-only mappings. This also skips PROT_NONE
		// mappings, which the Go runtime uses as placeholders for
		// future read-write mappings.
		if !vr.accessType.Write && vr.accessType.Read {
			rdRegions = append(rdRegions, vr)
		}
	}); err != nil {
		panic(fmt.Sprintf("error parsing /proc/self/maps: %v", err))
	}

	// Append an unreachable sentinel region at the top of the address
	// space, so the scan below always has a "next" read-only region to
	// compare against.
	rdRegions = append(rdRegions, virtualRegion{
		region: region{
			virtual: 0xffffffffffffffff,
			length:  0,
		},
	})

	var regions []physicalRegion
	// addValidRegion appends a physical region covering
	// [virtual, virtual+length), translating virtual to its physical
	// counterpart within r.
	addValidRegion := func(r *physicalRegion, virtual, length uintptr, readOnly bool) {
		if length == 0 {
			return
		}
		regions = append(regions, physicalRegion{
			region: region{
				virtual: virtual,
				length:  length,
			},
			physical: r.physical + (virtual - r.virtual),
			readOnly: readOnly,
		})
	}
	i := 0
	for _, pr := range physicalRegions {
		start := pr.virtual
		end := pr.virtual + pr.length
		for start < end {
			rdRegion := rdRegions[i].region
			rdStart := rdRegion.virtual
			rdEnd := rdRegion.virtual + rdRegion.length
			if rdEnd <= start {
				// This read-only region lies entirely behind
				// start; advance to the next one.
				i++
				continue
			}
			if rdStart > start {
				// Emit the writable gap before the next
				// read-only region (or up to the end of pr).
				newEnd := rdStart
				if end < rdStart {
					newEnd = end
				}
				addValidRegion(&pr, start, newEnd-start, false)
				start = rdStart
				continue
			}
			if rdEnd < end {
				// The read-only region ends inside pr; emit
				// the read-only part and continue scanning
				// from its end.
				addValidRegion(&pr, start, rdEnd-start, true)
				start = rdEnd
				continue
			}
			// The remainder of pr is covered by the read-only
			// region.
			addValidRegion(&pr, start, end-start, start >= rdStart && end <= rdEnd)
			start = end
		}
	}

	return regions
}

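// A worked example of the splitting above (hypothetical addresses): given a
// physical region covering virtual [0x1000, 0x6000) and a single read-only
// region [0x2000, 0x4000), the loop emits three physical regions:
//
//	[0x1000, 0x2000) readOnly=false  // gap before the read-only region
//	[0x2000, 0x4000) readOnly=true   // the read-only region itself
//	[0x4000, 0x6000) readOnly=false  // remainder, emitted once the scan
//	                                 // reaches the sentinel region
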
// nonCanonical generates the signal return for a fault at a non-canonical
// address.
//
//go:nosplit
func nonCanonical(addr uint64, signal int32, info *linux.SignalInfo) (hostarch.AccessType, error) {
	*info = linux.SignalInfo{
		Signo: signal,
		Code:  linux.SI_KERNEL,
	}
	info.SetAddr(addr) // Include address.
	return hostarch.NoAccess, platform.ErrContextSignal
}

// isInstructionAbort reports whether the exception syndrome describes an
// instruction abort.
//
//go:nosplit
func isInstructionAbort(code uint64) bool {
	value := (code & _ESR_ELx_EC_MASK) >> _ESR_ELx_EC_SHIFT
	return value == _ESR_ELx_EC_IABT_LOW
}

// isWriteFault reports whether the exception syndrome describes a write
// fault. An instruction abort is never a write fault.
//
//go:nosplit
func isWriteFault(code uint64) bool {
	if isInstructionAbort(code) {
		return false
	}

	return (code & _ESR_ELx_WNR) != 0
}

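// For reference, the ESR_ELx fields consulted by the two helpers above, as
// defined by the Arm architecture (the _ESR_* constants are defined
// elsewhere in this package):
//
//	bits [31:26]  EC   exception class; _ESR_ELx_EC_IABT_LOW is an
//	                   instruction abort taken from a lower exception level
//	bit  [6]      WnR  write-not-read, valid for data aborts; set when the
//	                   faulting access was a write
//	bits [5:0]    FSC  fault status code, used by fault() below to pick the
//	                   si_code reported to the sentry
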
// fault generates an appropriate fault return.
//
//go:nosplit
func (c *vCPU) fault(signal int32, info *linux.SignalInfo) (hostarch.AccessType, error) {
	bluepill(c) // Probably no-op, but may not be.
	faultAddr := c.FaultAddr()
	code, user := c.ErrorCode()
	if !user {
		// The last fault serviced by this CPU was not a user
		// fault, so we can't reliably trust the faultAddr or
		// the code provided here. We need to re-execute.
		return hostarch.NoAccess, platform.ErrContextInterrupt
	}

	// Reset the pointed-to SignalInfo.
	*info = linux.SignalInfo{Signo: signal}
	info.SetAddr(uint64(faultAddr))
	accessType := hostarch.AccessType{}
	if signal == int32(unix.SIGSEGV) {
		accessType = hostarch.AccessType{
			Read:    !isWriteFault(uint64(code)),
			Write:   isWriteFault(uint64(code)),
			Execute: isInstructionAbort(uint64(code)),
		}
	}

	ret := code & _ESR_ELx_FSC
	switch ret {
	case _ESR_SEGV_MAPERR_L0, _ESR_SEGV_MAPERR_L1, _ESR_SEGV_MAPERR_L2, _ESR_SEGV_MAPERR_L3:
		info.Code = 1 // SEGV_MAPERR.
	case _ESR_SEGV_ACCERR_L1, _ESR_SEGV_ACCERR_L2, _ESR_SEGV_ACCERR_L3, _ESR_SEGV_PEMERR_L1, _ESR_SEGV_PEMERR_L2, _ESR_SEGV_PEMERR_L3:
		info.Code = 2 // SEGV_ACCERR.
	default:
		info.Code = 2 // Default to SEGV_ACCERR.
	}

	return accessType, platform.ErrContextSignal
}

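// The FSC switch above follows the architectural fault status encoding:
// translation faults (no valid mapping at some page-table level) are
// reported as SEGV_MAPERR, while access-flag and permission faults (a
// mapping exists but forbids the access) are reported as SEGV_ACCERR. For
// example, dereferencing an unmapped address yields SEGV_MAPERR, whereas
// writing to a page mapped read-only yields SEGV_ACCERR.
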
// getMaxVCPU computes the maximum number of vCPUs, taking the smaller of the
// host CPU count and the limit reported by the KVM_CAP_MAX_VCPUS capability.
func (m *machine) getMaxVCPU() {
	rmaxVCPUs := runtime.NumCPU()
	smaxVCPUs, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
	// Compare the maximum vCPU numbers from the runtime and the syscall,
	// and use the smaller one.
	if errno != 0 {
		m.maxVCPUs = rmaxVCPUs
	} else {
		if rmaxVCPUs < int(smaxVCPUs) {
			m.maxVCPUs = rmaxVCPUs
		} else {
			m.maxVCPUs = int(smaxVCPUs)
		}
	}
}
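
// The KVM_CHECK_EXTENSION ioctl fails with an errno only on a bad request;
// otherwise it returns 0 if the queried capability is unsupported and a
// capability-specific positive value if supported (for KVM_CAP_MAX_VCPUS,
// the maximum vCPU count). KVM_CAP_MAX_VCPUS predates arm64 KVM support
// entirely, so the positive-value path is the expected one here.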