github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/platform/kvm/machine_arm64.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // +build arm64
    16  
    17  package kvm
    18  
    19  import (
    20  	"runtime"
    21  	"sync/atomic"
    22  
    23  	"golang.org/x/sys/unix"
    24  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    25  	"github.com/SagerNet/gvisor/pkg/hostarch"
    26  	"github.com/SagerNet/gvisor/pkg/ring0"
    27  	"github.com/SagerNet/gvisor/pkg/ring0/pagetables"
    28  	"github.com/SagerNet/gvisor/pkg/sentry/arch/fpu"
    29  	"github.com/SagerNet/gvisor/pkg/sentry/platform"
    30  )
    31  
type machineArchState struct {
	// initialvCPUs is the set of machine vCPUs that have been initialized
	// but not yet handed out for use, keyed by vCPU id. Entries are
	// claimed and removed by getNewVCPU.
	initialvCPUs map[int]*vCPU
}
    36  
type vCPUArchState struct {
	// PCIDs is the set of PCIDs for this vCPU.
	//
	// This starts above fixedKernelPCID so user PCIDs never collide with
	// the kernel's reserved PCID.
	PCIDs *pagetables.PCIDs

	// floatingPointState is the floating point state buffer used in guest
	// to host transitions. See usage in bluepill_arm64.go.
	floatingPointState fpu.State
}
    47  
const (
	// fixedKernelPCID is a fixed kernel PCID used for the kernel page
	// tables. We must start allocating user PCIDs above this in order to
	// avoid any conflict (see below).
	fixedKernelPCID = 1

	// poolPCIDs is the number of PCIDs to record in the database. As this
	// grows, assignment can take longer, since it is a simple linear scan.
	// Beyond a relatively small number, there are likely few performance
	// benefits, since the TLB has likely long since lost any translations
	// from more than a few PCIDs past.
	poolPCIDs = 128
)
    61  
    62  func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {
    63  	applyPhysicalRegions(func(pr physicalRegion) bool {
    64  		pageTable.Map(
    65  			hostarch.Addr(ring0.KernelStartAddress|pr.virtual),
    66  			pr.length,
    67  			pagetables.MapOpts{AccessType: hostarch.AnyAccess, Global: true},
    68  			pr.physical)
    69  
    70  		return true // Keep iterating.
    71  	})
    72  }
    73  
    74  // Get all read-only physicalRegions.
    75  func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) {
    76  	var rdonlyRegions []region
    77  
    78  	applyVirtualRegions(func(vr virtualRegion) {
    79  		if excludeVirtualRegion(vr) {
    80  			return
    81  		}
    82  
    83  		if !vr.accessType.Write && vr.accessType.Read {
    84  			rdonlyRegions = append(rdonlyRegions, vr.region)
    85  		}
    86  
    87  		// TODO(github.com/SagerNet/issue/2686): PROT_NONE should be specially treated.
    88  		// Workaround: treated as rdonly temporarily.
    89  		if !vr.accessType.Write && !vr.accessType.Read && !vr.accessType.Execute {
    90  			rdonlyRegions = append(rdonlyRegions, vr.region)
    91  		}
    92  	})
    93  
    94  	for _, r := range rdonlyRegions {
    95  		physical, _, ok := translateToPhysical(r.virtual)
    96  		if !ok {
    97  			continue
    98  		}
    99  
   100  		phyRegions = append(phyRegions, physicalRegion{
   101  			region: region{
   102  				virtual: r.virtual,
   103  				length:  r.length,
   104  			},
   105  			physical: physical,
   106  		})
   107  	}
   108  
   109  	return phyRegions
   110  }
   111  
   112  // Get all available physicalRegions.
   113  func availableRegionsForSetMem() (phyRegions []physicalRegion) {
   114  	var excludeRegions []region
   115  	applyVirtualRegions(func(vr virtualRegion) {
   116  		if !vr.accessType.Write {
   117  			excludeRegions = append(excludeRegions, vr.region)
   118  		}
   119  	})
   120  
   121  	phyRegions = computePhysicalRegions(excludeRegions)
   122  
   123  	return phyRegions
   124  }
   125  
   126  // nonCanonical generates a canonical address return.
   127  //
   128  //go:nosplit
   129  func nonCanonical(addr uint64, signal int32, info *linux.SignalInfo) (hostarch.AccessType, error) {
   130  	*info = linux.SignalInfo{
   131  		Signo: signal,
   132  		Code:  linux.SI_KERNEL,
   133  	}
   134  	info.SetAddr(addr) // Include address.
   135  	return hostarch.NoAccess, platform.ErrContextSignal
   136  }
   137  
   138  // isInstructionAbort returns true if it is an instruction abort.
   139  //
   140  //go:nosplit
   141  func isInstructionAbort(code uint64) bool {
   142  	value := (code & _ESR_ELx_EC_MASK) >> _ESR_ELx_EC_SHIFT
   143  	return value == _ESR_ELx_EC_IABT_LOW
   144  }
   145  
   146  // isWriteFault returns whether it is a write fault.
   147  //
   148  //go:nosplit
   149  func isWriteFault(code uint64) bool {
   150  	if isInstructionAbort(code) {
   151  		return false
   152  	}
   153  
   154  	return (code & _ESR_ELx_WNR) != 0
   155  }
   156  
   157  // fault generates an appropriate fault return.
   158  //
   159  //go:nosplit
   160  func (c *vCPU) fault(signal int32, info *linux.SignalInfo) (hostarch.AccessType, error) {
   161  	bluepill(c) // Probably no-op, but may not be.
   162  	faultAddr := c.GetFaultAddr()
   163  	code, user := c.ErrorCode()
   164  
   165  	if !user {
   166  		// The last fault serviced by this CPU was not a user
   167  		// fault, so we can't reliably trust the faultAddr or
   168  		// the code provided here. We need to re-execute.
   169  		return hostarch.NoAccess, platform.ErrContextInterrupt
   170  	}
   171  
   172  	// Reset the pointed SignalInfo.
   173  	*info = linux.SignalInfo{Signo: signal}
   174  	info.SetAddr(uint64(faultAddr))
   175  
   176  	ret := code & _ESR_ELx_FSC
   177  	switch ret {
   178  	case _ESR_SEGV_MAPERR_L0, _ESR_SEGV_MAPERR_L1, _ESR_SEGV_MAPERR_L2, _ESR_SEGV_MAPERR_L3:
   179  		info.Code = 1 //SEGV_MAPERR
   180  	case _ESR_SEGV_ACCERR_L1, _ESR_SEGV_ACCERR_L2, _ESR_SEGV_ACCERR_L3, _ESR_SEGV_PEMERR_L1, _ESR_SEGV_PEMERR_L2, _ESR_SEGV_PEMERR_L3:
   181  		info.Code = 2 // SEGV_ACCERR.
   182  	default:
   183  		info.Code = 2
   184  	}
   185  
   186  	accessType := hostarch.AccessType{
   187  		Read:    !isWriteFault(uint64(code)),
   188  		Write:   isWriteFault(uint64(code)),
   189  		Execute: isInstructionAbort(uint64(code)),
   190  	}
   191  
   192  	return accessType, platform.ErrContextSignal
   193  }
   194  
   195  // getMaxVCPU get max vCPU number
   196  func (m *machine) getMaxVCPU() {
   197  	rmaxVCPUs := runtime.NumCPU()
   198  	smaxVCPUs, _, errno := unix.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
   199  	// compare the max vcpu number from runtime and syscall, use smaller one.
   200  	if errno != 0 {
   201  		m.maxVCPUs = rmaxVCPUs
   202  	} else {
   203  		if rmaxVCPUs < int(smaxVCPUs) {
   204  			m.maxVCPUs = rmaxVCPUs
   205  		} else {
   206  			m.maxVCPUs = int(smaxVCPUs)
   207  		}
   208  	}
   209  }
   210  
   211  // getNewVCPU() scan for an available vCPU from initialvCPUs
   212  func (m *machine) getNewVCPU() *vCPU {
   213  	for CID, c := range m.initialvCPUs {
   214  		if atomic.CompareAndSwapUint32(&c.state, vCPUReady, vCPUUser) {
   215  			delete(m.initialvCPUs, CID)
   216  			return c
   217  		}
   218  	}
   219  	return nil
   220  }