gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/platform/systrap/syscall_thread.go (about)

     1  // Copyright 2021 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package systrap
    16  
    17  import (
    18  	"fmt"
    19  	"os"
    20  	"sync/atomic"
    21  
    22  	"golang.org/x/sys/unix"
    23  	"gvisor.dev/gvisor/pkg/abi/linux"
    24  	"gvisor.dev/gvisor/pkg/hostarch"
    25  	"gvisor.dev/gvisor/pkg/seccomp"
    26  	"gvisor.dev/gvisor/pkg/sentry/arch"
    27  	"gvisor.dev/gvisor/pkg/sentry/memmap"
    28  	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
    29  	"gvisor.dev/gvisor/pkg/sentry/platform/systrap/sysmsg"
    30  	"gvisor.dev/gvisor/pkg/sentry/usage"
    31  )
    32  
// syscallThreadMessageSize is the size in bytes of the shared syscall message
// region. The message consists of a sentry message and a stub message, each
// occupying one page.
const syscallThreadMessageSize = hostarch.PageSize * 2
    35  
// syscallThread implements the process of calling syscalls in a stub process.
//
// Each syscall thread owns a shared memory region to communicate with the
// Sentry. This region consists of two pages. The first page, called
// sentryMessage, is mapped as read-only in the stub address space. The second
// page, called stubMessage, is mapped as read-write in the stub process.
//
// Any memory region that is mapped as read-write in a stub address space can
// be changed by user code. This means that we can't trust the content of
// stubMessage, but it is used to receive a syscall return code. Therefore
// syscallThread can be used only in these cases:
//   - If a system call never fails (e.g. munmap).
//   - If a system call has to return only one known value or, if it fails,
//     it doesn't reveal any data (e.g. mmap).
type syscallThread struct {
	// subproc is a link to the subprocess which is used to call native
	// system calls and track when a sysmsg thread has to be recreated.
	// Look at getSysmsgThread() for more details.
	subproc *subprocess

	// thread is the underlying stub thread. Setup and teardown syscalls
	// (mmap, munmap, seccomp) are issued through it.
	thread *thread

	// stackRange is the range allocated in the subprocess memory file that
	// backs the shared syscall message (both pages).
	stackRange memmap.FileRange

	// sentryAddr is the address of the shared memory region in the Sentry
	// address space. It stays zero until init has mapped the region.
	sentryAddr uintptr
	// stubAddr is the address of the shared memory region in the stub
	// address space.
	stubAddr uintptr

	// sentryMessage is the first page of the shared message that can't be
	// modified by the stub thread.
	sentryMessage *syscallSentryMessage
	// stubMessage is the second page of the shared message that can be
	// modified by the stub thread.
	stubMessage *syscallStubMessage

	// seccompNotify, when non-nil, is the seccomp user-notification
	// listener file created by installSeccompNotify; syscall then uses it
	// instead of the futex-based handshake. seccompNotifyResp is not
	// referenced in this part of the file; presumably it is the reusable
	// response buffer for notifications (TODO confirm against
	// kickSeccompNotify/waitForSeccompNotify).
	seccompNotify     *os.File
	seccompNotifyResp linux.SeccompNotifResp
}
    80  
    81  func (t *syscallThread) init(seccompNotify bool) error {
    82  	// Allocate a new shared memory message.
    83  	opts := pgalloc.AllocOpts{
    84  		Kind: usage.System,
    85  		Dir:  pgalloc.TopDown,
    86  	}
    87  	fr, err := t.subproc.memoryFile.Allocate(syscallThreadMessageSize, opts)
    88  	if err != nil {
    89  		return err
    90  	}
    91  
    92  	t.stackRange = fr
    93  	t.stubAddr = stubSysmsgStack + sysmsg.PerThreadMemSize*uintptr(t.thread.sysmsgStackID)
    94  	err = t.mapMessageIntoStub()
    95  	if err != nil {
    96  		t.destroy()
    97  		return err
    98  	}
    99  
   100  	if seccompNotify {
   101  		t.seccompNotify = t.installSeccompNotify()
   102  	}
   103  
   104  	// Map the stack into the sentry.
   105  	sentryAddr, _, errno := unix.RawSyscall6(
   106  		unix.SYS_MMAP,
   107  		0,
   108  		syscallThreadMessageSize,
   109  		unix.PROT_WRITE|unix.PROT_READ,
   110  		unix.MAP_SHARED|unix.MAP_FILE,
   111  		uintptr(t.subproc.memoryFile.FD()), uintptr(fr.Start))
   112  	if errno != 0 {
   113  		t.destroy()
   114  		return fmt.Errorf("mmap failed: %v", errno)
   115  	}
   116  	t.sentryAddr = sentryAddr
   117  
   118  	t.initRequestReplyAddresses(sentryAddr)
   119  	return nil
   120  }
   121  
   122  func (t *syscallThread) destroy() {
   123  	if t.sentryAddr != 0 {
   124  		_, _, errno := unix.RawSyscall6(
   125  			unix.SYS_MUNMAP,
   126  			t.sentryAddr,
   127  			syscallThreadMessageSize,
   128  			0, 0, 0, 0)
   129  		if errno != 0 {
   130  			panic(fmt.Sprintf("mumap failed: %v", errno))
   131  		}
   132  	}
   133  	if t.stubAddr != 0 {
   134  		_, err := t.thread.syscallIgnoreInterrupt(&t.thread.initRegs, unix.SYS_MUNMAP,
   135  			arch.SyscallArgument{Value: t.stubAddr},
   136  			arch.SyscallArgument{Value: uintptr(syscallThreadMessageSize)})
   137  		if err != nil {
   138  			panic(fmt.Sprintf("munmap failed: %v", err))
   139  		}
   140  	}
   141  	t.subproc.memoryFile.DecRef(t.stackRange)
   142  	t.subproc.sysmsgStackPool.Put(t.thread.sysmsgStackID)
   143  }
   144  
   145  func (t *syscallThread) installSeccompNotify() *os.File {
   146  	fd, err := t.thread.syscallIgnoreInterrupt(&t.thread.initRegs, seccomp.SYS_SECCOMP,
   147  		arch.SyscallArgument{Value: uintptr(linux.SECCOMP_SET_MODE_FILTER)},
   148  		arch.SyscallArgument{Value: uintptr(linux.SECCOMP_FILTER_FLAG_NEW_LISTENER)},
   149  		arch.SyscallArgument{Value: stubSyscallRules})
   150  	if err != nil {
   151  		panic(fmt.Sprintf("seccomp failed: %v", err))
   152  	}
   153  	_, _, errno := unix.RawSyscall(unix.SYS_IOCTL, fd, linux.SECCOMP_IOCTL_NOTIF_SET_FLAGS, linux.SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
   154  	if errno != 0 {
   155  		t.thread.Debugf("failed to set SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP")
   156  	}
   157  	return os.NewFile(fd, "seccomp_notify")
   158  }
   159  
   160  // mapMessageIntoStub maps the syscall message into the stub process address space.
   161  func (t *syscallThread) mapMessageIntoStub() error {
   162  	// Map sentryMessage as read-only.
   163  	_, err := t.thread.syscallIgnoreInterrupt(&t.thread.initRegs, unix.SYS_MMAP,
   164  		arch.SyscallArgument{Value: t.stubAddr},
   165  		arch.SyscallArgument{Value: uintptr(hostarch.PageSize)},
   166  		arch.SyscallArgument{Value: uintptr(unix.PROT_READ)},
   167  		arch.SyscallArgument{Value: unix.MAP_SHARED | unix.MAP_FILE | unix.MAP_FIXED},
   168  		arch.SyscallArgument{Value: uintptr(t.subproc.memoryFile.FD())},
   169  		arch.SyscallArgument{Value: uintptr(t.stackRange.Start)})
   170  	if err != nil {
   171  		return err
   172  	}
   173  	// Map stubMessage as read-write.
   174  	_, err = t.thread.syscallIgnoreInterrupt(&t.thread.initRegs, unix.SYS_MMAP,
   175  		arch.SyscallArgument{Value: t.stubAddr + syscallStubMessageOffset},
   176  		arch.SyscallArgument{Value: uintptr(hostarch.PageSize)},
   177  		arch.SyscallArgument{Value: uintptr(unix.PROT_READ | unix.PROT_WRITE)},
   178  		arch.SyscallArgument{Value: unix.MAP_SHARED | unix.MAP_FILE | unix.MAP_FIXED},
   179  		arch.SyscallArgument{Value: uintptr(t.subproc.memoryFile.FD())},
   180  		arch.SyscallArgument{Value: uintptr(t.stackRange.Start + hostarch.PageSize)})
   181  	return err
   182  }
   183  
   184  // attach attaches to the stub thread with ptrace and unlock signals.
   185  func (t *syscallThread) attach() error {
   186  	if err := t.thread.attach(); err != nil {
   187  		return err
   188  	}
   189  	// We need to unblock signals, because the TRAP signal is used to run
   190  	// syscalls via ptrace.
   191  	t.unmaskAllSignalsAttached()
   192  	return nil
   193  }
   194  
// maxErrno is the exclusive upper bound on negated stub return values that
// syscall interprets as errno codes.
const maxErrno = 4095
   196  
   197  func (t *syscallThread) syscall(sysno uintptr, args ...arch.SyscallArgument) (uintptr, error) {
   198  	if t.subproc.dead.Load() {
   199  		return 0, errDeadSubprocess
   200  	}
   201  	sentryMsg := t.sentryMessage
   202  	stubMsg := t.stubMessage
   203  	sentryMsg.sysno = uint64(sysno)
   204  	for i := 0; i < len(sentryMsg.args); i++ {
   205  		if i < len(args) {
   206  			sentryMsg.args[i] = uint64(args[i].Value)
   207  		} else {
   208  			sentryMsg.args[i] = 0
   209  		}
   210  	}
   211  
   212  	if t.seccompNotify != nil {
   213  		if errno := t.kickSeccompNotify(); errno != 0 {
   214  			t.thread.kill()
   215  			t.thread.Warningf("failed sending request to syscall thread: %s", errno)
   216  			return 0, errDeadSubprocess
   217  		}
   218  		if err := t.waitForSeccompNotify(); err != nil {
   219  			t.thread.Warningf("failed waiting for seccomp notify: %s", err)
   220  			return 0, errDeadSubprocess
   221  		}
   222  	} else {
   223  
   224  		// Notify the syscall thread about a new syscall request.
   225  		atomic.AddUint32(&sentryMsg.state, 1)
   226  		futexWakeUint32(&sentryMsg.state)
   227  
   228  		// Wait for reply.
   229  		//
   230  		// futex waits for sentryMsg.state that isn't changed, so it will
   231  		// returns only only when the other side will call FUTEX_WAKE.
   232  		futexWaitWake(&sentryMsg.state, atomic.LoadUint32(&sentryMsg.state))
   233  	}
   234  
   235  	errno := -uintptr(stubMsg.ret)
   236  	if errno > 0 && errno < maxErrno {
   237  		return 0, fmt.Errorf("stub syscall (%x, %#v) failed with %w", sysno, args, unix.Errno(errno))
   238  	}
   239  
   240  	return uintptr(stubMsg.ret), nil
   241  }