github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/platform/systrap/usertrap/usertrap_amd64.go

// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build amd64
// +build amd64

package usertrap

import (
	"encoding/binary"
	"fmt"
	"math/rand"

	"github.com/MerlinKodo/gvisor/pkg/context"
	"github.com/MerlinKodo/gvisor/pkg/hostarch"
	"github.com/MerlinKodo/gvisor/pkg/marshal/primitive"
	"github.com/MerlinKodo/gvisor/pkg/sentry/arch"
	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel"
	"github.com/MerlinKodo/gvisor/pkg/sentry/memmap"
	"github.com/MerlinKodo/gvisor/pkg/sync"
	"github.com/MerlinKodo/gvisor/pkg/usermem"
	"golang.org/x/sys/unix"
)

// trapNR is the maximum number of traps that can fit in the trap table.
const trapNR = 256

// trapSize is the size of one trap.
const trapSize = 80

var (
	// jmpInst is the binary code of "jmp *addr".
	jmpInst          = [7]byte{0xff, 0x24, 0x25, 0, 0, 0, 0}
	jmpInstOpcodeLen = 3
	// faultInst is the single-byte invalid instruction.
	faultInst = [1]byte{0x6}
	// faultInstOffset is the offset of the syscall instruction within the
	// patched code.
	faultInstOffset = uintptr(5)
)

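// encodeJmpInst is an illustrative sketch (hypothetical, not part of the
// original file): it assembles the 7-byte "jmp *addr" patch the same way
// PatchSyscall does below, i.e. the 3-byte opcode ff 24 25 followed by a
// 32-bit absolute trap address in little-endian order.
func encodeJmpInst(trapAddr uint32) [7]byte {
	inst := jmpInst // copy of the opcode template; the address bytes are zero
	binary.LittleEndian.PutUint32(inst[jmpInstOpcodeLen:], trapAddr)
	return inst
}
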
type memoryManager interface {
	usermem.IO
	MMap(ctx context.Context, opts memmap.MMapOpts) (hostarch.Addr, error)
	FindVMAByName(ar hostarch.AddrRange, hint string) (hostarch.Addr, uint64, error)
}

// State represents the current state of the trap table.
//
// +stateify savable
type State struct {
	mu        sync.RWMutex `state:"nosave"`
	nextTrap  uint32
	tableAddr hostarch.Addr
}

// New returns a new state structure.
func New() *State {
	return &State{}
}

// +marshal
type header struct {
	nextTrap uint32
}

func (s *State) trapAddr(trap uint32) hostarch.Addr {
	return s.tableAddr + hostarch.Addr(trapSize*trap)
}

// newTrapLocked allocates a new trap entry.
//
// Preconditions: s.mu must be locked.
func (s *State) newTrapLocked(ctx context.Context, mm memoryManager) (hostarch.Addr, error) {
	var hdr header
	task := kernel.TaskFromContext(ctx)
	if task == nil {
		return 0, fmt.Errorf("no task found")
	}

	// s.nextTrap is zero if it hasn't been initialized. There are three
	// cases when this can happen:
	//	* A usertrap vma has not been mapped yet.
	//	* The address space has been forked.
	//	* The address space has been restored.
	// nextTrap is saved on the usertrap vma to handle the second and third
	// cases.
	if s.nextTrap == 0 {
		addr, off, err := mm.FindVMAByName(trapTableAddrRange, tableHint)
		if off != 0 {
			return 0, fmt.Errorf("the usertrap vma has been overmounted")
		}
		if err != nil {
			// The usertrap table has not been mapped yet.
			addr := hostarch.Addr(rand.Int63n(int64(trapTableAddrRange.Length()-trapTableSize))).RoundDown() + trapTableAddrRange.Start
			ctx.Debugf("Map a usertrap vma at %x", addr)
			if err := loadUsertrap(ctx, mm, addr); err != nil {
				return 0, err
			}
			// The first cell in the table is used to save the index
			// of the next unused trap.
			s.nextTrap = 1
			s.tableAddr = addr
		} else if _, err := hdr.CopyIn(task.OwnCopyContext(usermem.IOOpts{AddressSpaceActive: false}), addr); err != nil {
			return 0, err
		} else {
			// Read the index of the next unused trap.
			s.nextTrap = hdr.nextTrap
			s.tableAddr = addr
		}
	}
	ctx.Debugf("Allocate a new trap: %p %d", s, s.nextTrap)
	if s.nextTrap >= trapNR {
		ctx.Warningf("No space in the trap table")
		return 0, fmt.Errorf("no space in the trap table")
	}
	trap := s.nextTrap
	s.nextTrap++

	// An entire trap has to be on the same page to avoid memory faults.
	addr := s.trapAddr(trap)
	if addr/hostarch.PageSize != (addr+trapSize)/hostarch.PageSize {
		trap = s.nextTrap
		s.nextTrap++
	}
	hdr = header{
		nextTrap: s.nextTrap,
	}
	if _, err := hdr.CopyOut(task.OwnCopyContext(usermem.IOOpts{IgnorePermissions: true}), s.tableAddr); err != nil {
		return 0, err
	}
	return s.trapAddr(trap), nil
}

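// trapStraddlesPage is an illustrative sketch (hypothetical, not part of the
// original file) of the check newTrapLocked performs above: a trap slot is
// skipped when the range [addr, addr+trapSize] does not fit on one page, so
// that executing a trap never touches two pages.
func trapStraddlesPage(addr hostarch.Addr) bool {
	return addr/hostarch.PageSize != (addr+trapSize)/hostarch.PageSize
}
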
// trapTableAddrRange is the range where a trap table can be placed.
//
// The value has to be below 2GB, and the high two bytes have to form an
// invalid instruction. In the case of 0x60000, the high two bytes are
// 0x0006; 0x6 is "push es" on 32-bit x86 and an invalid instruction on
// x64.
var trapTableAddrRange = hostarch.AddrRange{Start: 0x60000, End: 0x70000}

const (
	trapTableSize = hostarch.Addr(trapNR * trapSize)

	tableHint = "[usertrap]"
)

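// faultByteOfTrapAddr is an illustrative sketch (hypothetical, not part of
// the original file): for any trap address inside trapTableAddrRange, byte 2
// of the address's little-endian encoding is 0x06, the same value as
// faultInst. Within the 7-byte patch the address occupies bytes 3-6, so the
// byte at faultInstOffset (5) is exactly this 0x06; PatchSyscall relies on
// that below.
func faultByteOfTrapAddr(trapAddr uint32) byte {
	var enc [4]byte
	binary.LittleEndian.PutUint32(enc[:], trapAddr)
	return enc[2] // 0x06 for any address in [0x60000, 0x70000)
}
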
// loadUsertrap maps the usertrap table into the address space.
func loadUsertrap(ctx context.Context, mm memoryManager, addr hostarch.Addr) error {
	size, _ := hostarch.Addr(trapTableSize).RoundUp()
	// Force is true because Addr is below MinUserAddress.
	_, err := mm.MMap(ctx, memmap.MMapOpts{
		Force:     true,
		Unmap:     true,
		Fixed:     true,
		Addr:      addr,
		Length:    uint64(size),
		Private:   true,
		Hint:      tableHint,
		MLockMode: memmap.MLockEager,
		Perms: hostarch.AccessType{
			Write:   false,
			Read:    true,
			Execute: true,
		},
		MaxPerms: hostarch.AccessType{
			Write:   true,
			Read:    true,
			Execute: true,
		},
	})
	if err != nil {
		return err
	}

	return nil
}

// PatchSyscall changes the syscall instruction into a function call.
//
// Returns true if the thread has to be restarted.
func (s *State) PatchSyscall(ctx context.Context, ac *arch.Context64, mm memoryManager) (bool, error) {
	task := kernel.TaskFromContext(ctx)
	if task == nil {
		return false, fmt.Errorf("no task found")
	}

	s.mu.Lock()
	defer s.mu.Unlock()

	sysno := ac.SyscallNo()
	patchAddr := ac.IP() - uintptr(len(jmpInst))

	prevCode := make([]uint8, len(jmpInst))
	if _, err := primitive.CopyUint8SliceIn(task.OwnCopyContext(usermem.IOOpts{AddressSpaceActive: false}), hostarch.Addr(patchAddr), prevCode); err != nil {
		return false, err
	}

	// Check that another thread has not patched this syscall yet.
	// 0xb8 is the first byte of "mov sysno, %eax".
	if prevCode[0] == uint8(0xb8) {
		ctx.Debugf("Found the pattern at ip %x:sysno %d", patchAddr, sysno)

		trapAddr, err := s.addTrapLocked(ctx, ac, mm, uint32(sysno))
		if trapAddr == 0 || err != nil {
			ctx.Warningf("Failed to add a new trap: %v", err)
			return false, nil
		}

		// Replace "mov sysno, %eax; syscall" with "jmp trapAddr".
		newCode := make([]uint8, len(jmpInst))
		copy(newCode[:jmpInstOpcodeLen], jmpInst[:jmpInstOpcodeLen])
		binary.LittleEndian.PutUint32(newCode[jmpInstOpcodeLen:], uint32(trapAddr))

		ctx.Debugf("Apply the binary patch addr %x trap addr %x (%v -> %v)", patchAddr, trapAddr, prevCode, newCode)

		ignorePermContext := task.OwnCopyContext(usermem.IOOpts{IgnorePermissions: true})

		// The patch can't be applied atomically, so we have to
		// guarantee that at every moment other threads read a valid
		// set of instructions, detect any inconsistent state, and
		// restart the patched code if they do.
		//
		// A subtle aspect is that the user trap table is always
		// mapped in the 0x60000-0x70000 range, so the byte of the
		// jump target that lands at faultInstOffset is always 0x06,
		// an invalid opcode. That's why the second step below, which
		// overwrites all bytes but the first, is safe: the jump
		// address writes 0x6 at the location of the first byte of
		// the syscall instruction being removed, so any thread
		// decoding the instructions still faults at the same place.
		//
		// Another subtle aspect is that the second step is a regular,
		// non-atomic write, so a thread decoding the mov instruction
		// could read a garbage immediate operand for "mov sysno,
		// %eax". This doesn't matter: the first byte, which contains
		// the opcode, is unchanged, and such a thread faults on the
		// 0x6 right after and is restarted with the patched code, so
		// the garbage immediate never affects correctness.

		// The patch is applied in three steps:
		//
		// The first step is to replace the first byte of the syscall
		// instruction with the one-byte invalid instruction (0x06),
		// so that other threads which have passed the mov instruction
		// fault on the invalid instruction and restart the patched
		// code.
		faultInstB := primitive.ByteSlice(faultInst[:])
		if _, err := faultInstB.CopyOut(ignorePermContext, hostarch.Addr(patchAddr+faultInstOffset)); err != nil {
			return false, err
		}
		// The second step is to replace all bytes except the first
		// one, which is the opcode of the mov instruction, so that
		// the first five bytes still read as "mov XXX, %eax".
		if _, err := primitive.CopyUint8SliceOut(ignorePermContext, hostarch.Addr(patchAddr+1), newCode[1:]); err != nil {
			return false, err
		}
		// The final step is to replace the first byte of the patch.
		// After this point, all threads will read the valid jmp
		// instruction.
		if _, err := primitive.CopyUint8SliceOut(ignorePermContext, hostarch.Addr(patchAddr), newCode[0:1]); err != nil {
			return false, err
		}
	}
	ac.RestartSyscall()
	ac.SetIP(patchAddr)
	return true, nil
}

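// patchStates is an illustrative sketch (hypothetical, not part of the
// original file) that materializes the three intermediate byte states
// described in PatchSyscall for a given sysno and trap address. From step
// one onward the byte at faultInstOffset is 0x06, so a racing thread always
// faults at the same spot:
//
//	original: b8 NN NN NN NN 0f 05  ("mov sysno, %eax; syscall")
//	step 1:   b8 NN NN NN NN 06 05
//	step 2:   b8 24 25 AA AA 06 00  (AA AA: low bytes of trapAddr)
//	step 3:   ff 24 25 AA AA 06 00  ("jmp *trapAddr")
func patchStates(sysno uint32, trapAddr uint32) [3][7]byte {
	var orig [7]byte
	orig[0] = 0xb8 // opcode of "mov imm32, %eax"
	binary.LittleEndian.PutUint32(orig[1:5], sysno)
	orig[5], orig[6] = 0x0f, 0x05 // syscall

	var jmp [7]byte
	copy(jmp[:jmpInstOpcodeLen], jmpInst[:jmpInstOpcodeLen])
	binary.LittleEndian.PutUint32(jmp[jmpInstOpcodeLen:], trapAddr)

	step1 := orig
	step1[faultInstOffset] = faultInst[0] // fault byte over the first syscall byte
	step2 := step1
	copy(step2[1:], jmp[1:]) // everything except the opcode byte
	step3 := step2
	step3[0] = jmp[0]
	return [3][7]byte{step1, step2, step3}
}
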
// HandleFault handles a fault on a patched syscall instruction.
//
// When we replace a system call with a function call, we replace two
// instructions with one instruction. This means that there can be a thread
// which has executed the first instruction, then another thread applies the
// binary patch, and the first thread goes on to execute the second
// instruction.
//
// To handle this case, the function call (jmp) instruction is constructed
// so that the first byte of the old syscall instruction is overwritten with
// the one-byte invalid instruction (0x6). In case of the race, the first
// thread faults on the invalid instruction and HandleFault restarts the
// function call.
func (s *State) HandleFault(ctx context.Context, ac *arch.Context64, mm memoryManager) error {
	task := kernel.TaskFromContext(ctx)
	if task == nil {
		return fmt.Errorf("no task found")
	}

	s.mu.RLock()
	defer s.mu.RUnlock()

	code := make([]uint8, len(jmpInst))
	ip := ac.IP() - faultInstOffset
	if _, err := primitive.CopyUint8SliceIn(task.OwnCopyContext(usermem.IOOpts{AddressSpaceActive: false}), hostarch.Addr(ip), code); err != nil {
		return err
	}

	for i := 0; i < jmpInstOpcodeLen; i++ {
		if code[i] != jmpInst[i] {
			return nil
		}
	}
	for i := 0; i < len(faultInst); i++ {
		if code[i+int(faultInstOffset)] != faultInst[i] {
			return nil
		}
	}

	regs := &ac.StateData().Regs
	if regs.Rax == uint64(unix.SYS_RESTART_SYSCALL) {
		// restart_syscall is usually set by the Sentry to restart a
		// system call after interruption by a stop signal. The Sentry
		// sets RAX and moves RIP back by the size of the syscall
		// instruction.
		//
		// RAX can't be set to SYS_RESTART_SYSCALL due to a race with
		// injecting a function call, because neither of the first two
		// bytes is equal to the corresponding byte of jmpInst.
		regs.Orig_rax = regs.Rax
		regs.Rip += arch.SyscallWidth
		return ErrFaultSyscall
	}

	ac.SetIP(ip)
	return ErrFaultRestart
}

// PreFork locks the trap table for reading. This call guarantees that the
// trap table will not be changed before the next PostFork call.
// +checklocksacquireread:s.mu
func (s *State) PreFork() {
	s.mu.RLock()
}

// PostFork unlocks the trap table.
// +checklocksreleaseread:s.mu
func (s *State) PostFork() {
	s.mu.RUnlock()
}