gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/platform/systrap/syscall_thread.go (about) 1 // Copyright 2021 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package systrap 16 17 import ( 18 "fmt" 19 "os" 20 "sync/atomic" 21 22 "golang.org/x/sys/unix" 23 "gvisor.dev/gvisor/pkg/abi/linux" 24 "gvisor.dev/gvisor/pkg/hostarch" 25 "gvisor.dev/gvisor/pkg/seccomp" 26 "gvisor.dev/gvisor/pkg/sentry/arch" 27 "gvisor.dev/gvisor/pkg/sentry/memmap" 28 "gvisor.dev/gvisor/pkg/sentry/pgalloc" 29 "gvisor.dev/gvisor/pkg/sentry/platform/systrap/sysmsg" 30 "gvisor.dev/gvisor/pkg/sentry/usage" 31 ) 32 33 // The syscall message consists of sentry and stub messages. 34 const syscallThreadMessageSize = hostarch.PageSize * 2 35 36 // syscallThread implements the process of calling syscalls in a stub process. 37 // 38 // Each syscall thread owns a shared memory region to communicate with the 39 // Sentry. This region consists of two pages. The first page called 40 // sentryMessage is mapped as read-only in the stub address space. The second 41 // page called stubMessage is mapped as read-write in the stub process. 42 // 43 // Any memory regions that are mapped as read-write in a stub address space can 44 // be changed from a user code. This means that we can't trust the content of 45 // stubMessage, but it is used to receive a syscall return code. Therefore 46 // syscallThread can be used only in these cases: 47 // - If a system call never fails (e.g munmap). 48 // - If a system call has to return only one know value or if it fails, 49 // it doesn't not reveal any data (e.g. mmap). 50 type syscallThread struct { 51 // subproc is a link to the subprocess which is used to call native 52 // system calls and track when a sysmsg thread has to be recreated. 53 // Look at getSysmsgThread() for more details. 54 subproc *subprocess 55 56 // thread is a thread identifier. 57 thread *thread 58 59 // stackRange is the range for the sentry syscall message in the memory 60 // file. 61 stackRange memmap.FileRange 62 63 // sentryAddr is the address of the shared memory region in the Sentry 64 // address space. 65 sentryAddr uintptr 66 // stubAddr is the address of the shared memory region in the stub 67 // address space. 68 stubAddr uintptr 69 70 // sentryMessage is the first page of the share message that can't be 71 // modified by the stub thread. 72 sentryMessage *syscallSentryMessage 73 // stubMessage is the second page of the shared message that can be 74 // modified by the stub thread. 75 stubMessage *syscallStubMessage 76 77 seccompNotify *os.File 78 seccompNotifyResp linux.SeccompNotifResp 79 } 80 81 func (t *syscallThread) init(seccompNotify bool) error { 82 // Allocate a new shared memory message. 83 opts := pgalloc.AllocOpts{ 84 Kind: usage.System, 85 Dir: pgalloc.TopDown, 86 } 87 fr, err := t.subproc.memoryFile.Allocate(syscallThreadMessageSize, opts) 88 if err != nil { 89 return err 90 } 91 92 t.stackRange = fr 93 t.stubAddr = stubSysmsgStack + sysmsg.PerThreadMemSize*uintptr(t.thread.sysmsgStackID) 94 err = t.mapMessageIntoStub() 95 if err != nil { 96 t.destroy() 97 return err 98 } 99 100 if seccompNotify { 101 t.seccompNotify = t.installSeccompNotify() 102 } 103 104 // Map the stack into the sentry. 105 sentryAddr, _, errno := unix.RawSyscall6( 106 unix.SYS_MMAP, 107 0, 108 syscallThreadMessageSize, 109 unix.PROT_WRITE|unix.PROT_READ, 110 unix.MAP_SHARED|unix.MAP_FILE, 111 uintptr(t.subproc.memoryFile.FD()), uintptr(fr.Start)) 112 if errno != 0 { 113 t.destroy() 114 return fmt.Errorf("mmap failed: %v", errno) 115 } 116 t.sentryAddr = sentryAddr 117 118 t.initRequestReplyAddresses(sentryAddr) 119 return nil 120 } 121 122 func (t *syscallThread) destroy() { 123 if t.sentryAddr != 0 { 124 _, _, errno := unix.RawSyscall6( 125 unix.SYS_MUNMAP, 126 t.sentryAddr, 127 syscallThreadMessageSize, 128 0, 0, 0, 0) 129 if errno != 0 { 130 panic(fmt.Sprintf("mumap failed: %v", errno)) 131 } 132 } 133 if t.stubAddr != 0 { 134 _, err := t.thread.syscallIgnoreInterrupt(&t.thread.initRegs, unix.SYS_MUNMAP, 135 arch.SyscallArgument{Value: t.stubAddr}, 136 arch.SyscallArgument{Value: uintptr(syscallThreadMessageSize)}) 137 if err != nil { 138 panic(fmt.Sprintf("munmap failed: %v", err)) 139 } 140 } 141 t.subproc.memoryFile.DecRef(t.stackRange) 142 t.subproc.sysmsgStackPool.Put(t.thread.sysmsgStackID) 143 } 144 145 func (t *syscallThread) installSeccompNotify() *os.File { 146 fd, err := t.thread.syscallIgnoreInterrupt(&t.thread.initRegs, seccomp.SYS_SECCOMP, 147 arch.SyscallArgument{Value: uintptr(linux.SECCOMP_SET_MODE_FILTER)}, 148 arch.SyscallArgument{Value: uintptr(linux.SECCOMP_FILTER_FLAG_NEW_LISTENER)}, 149 arch.SyscallArgument{Value: stubSyscallRules}) 150 if err != nil { 151 panic(fmt.Sprintf("seccomp failed: %v", err)) 152 } 153 _, _, errno := unix.RawSyscall(unix.SYS_IOCTL, fd, linux.SECCOMP_IOCTL_NOTIF_SET_FLAGS, linux.SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP) 154 if errno != 0 { 155 t.thread.Debugf("failed to set SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP") 156 } 157 return os.NewFile(fd, "seccomp_notify") 158 } 159 160 // mapMessageIntoStub maps the syscall message into the stub process address space. 161 func (t *syscallThread) mapMessageIntoStub() error { 162 // Map sentryMessage as read-only. 163 _, err := t.thread.syscallIgnoreInterrupt(&t.thread.initRegs, unix.SYS_MMAP, 164 arch.SyscallArgument{Value: t.stubAddr}, 165 arch.SyscallArgument{Value: uintptr(hostarch.PageSize)}, 166 arch.SyscallArgument{Value: uintptr(unix.PROT_READ)}, 167 arch.SyscallArgument{Value: unix.MAP_SHARED | unix.MAP_FILE | unix.MAP_FIXED}, 168 arch.SyscallArgument{Value: uintptr(t.subproc.memoryFile.FD())}, 169 arch.SyscallArgument{Value: uintptr(t.stackRange.Start)}) 170 if err != nil { 171 return err 172 } 173 // Map stubMessage as read-write. 174 _, err = t.thread.syscallIgnoreInterrupt(&t.thread.initRegs, unix.SYS_MMAP, 175 arch.SyscallArgument{Value: t.stubAddr + syscallStubMessageOffset}, 176 arch.SyscallArgument{Value: uintptr(hostarch.PageSize)}, 177 arch.SyscallArgument{Value: uintptr(unix.PROT_READ | unix.PROT_WRITE)}, 178 arch.SyscallArgument{Value: unix.MAP_SHARED | unix.MAP_FILE | unix.MAP_FIXED}, 179 arch.SyscallArgument{Value: uintptr(t.subproc.memoryFile.FD())}, 180 arch.SyscallArgument{Value: uintptr(t.stackRange.Start + hostarch.PageSize)}) 181 return err 182 } 183 184 // attach attaches to the stub thread with ptrace and unlock signals. 185 func (t *syscallThread) attach() error { 186 if err := t.thread.attach(); err != nil { 187 return err 188 } 189 // We need to unblock signals, because the TRAP signal is used to run 190 // syscalls via ptrace. 191 t.unmaskAllSignalsAttached() 192 return nil 193 } 194 195 const maxErrno = 4095 196 197 func (t *syscallThread) syscall(sysno uintptr, args ...arch.SyscallArgument) (uintptr, error) { 198 if t.subproc.dead.Load() { 199 return 0, errDeadSubprocess 200 } 201 sentryMsg := t.sentryMessage 202 stubMsg := t.stubMessage 203 sentryMsg.sysno = uint64(sysno) 204 for i := 0; i < len(sentryMsg.args); i++ { 205 if i < len(args) { 206 sentryMsg.args[i] = uint64(args[i].Value) 207 } else { 208 sentryMsg.args[i] = 0 209 } 210 } 211 212 if t.seccompNotify != nil { 213 if errno := t.kickSeccompNotify(); errno != 0 { 214 t.thread.kill() 215 t.thread.Warningf("failed sending request to syscall thread: %s", errno) 216 return 0, errDeadSubprocess 217 } 218 if err := t.waitForSeccompNotify(); err != nil { 219 t.thread.Warningf("failed waiting for seccomp notify: %s", err) 220 return 0, errDeadSubprocess 221 } 222 } else { 223 224 // Notify the syscall thread about a new syscall request. 225 atomic.AddUint32(&sentryMsg.state, 1) 226 futexWakeUint32(&sentryMsg.state) 227 228 // Wait for reply. 229 // 230 // futex waits for sentryMsg.state that isn't changed, so it will 231 // returns only only when the other side will call FUTEX_WAKE. 232 futexWaitWake(&sentryMsg.state, atomic.LoadUint32(&sentryMsg.state)) 233 } 234 235 errno := -uintptr(stubMsg.ret) 236 if errno > 0 && errno < maxErrno { 237 return 0, fmt.Errorf("stub syscall (%x, %#v) failed with %w", sysno, args, unix.Errno(errno)) 238 } 239 240 return uintptr(stubMsg.ret), nil 241 }