github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/syscalls/linux/sys_futex.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package linux 16 17 import ( 18 "time" 19 20 "github.com/SagerNet/gvisor/pkg/abi/linux" 21 "github.com/SagerNet/gvisor/pkg/errors/linuxerr" 22 "github.com/SagerNet/gvisor/pkg/hostarch" 23 "github.com/SagerNet/gvisor/pkg/sentry/arch" 24 "github.com/SagerNet/gvisor/pkg/sentry/kernel" 25 ktime "github.com/SagerNet/gvisor/pkg/sentry/kernel/time" 26 "github.com/SagerNet/gvisor/pkg/syserror" 27 ) 28 29 // futexWaitRestartBlock encapsulates the state required to restart futex(2) 30 // via restart_syscall(2). 31 // 32 // +stateify savable 33 type futexWaitRestartBlock struct { 34 duration time.Duration 35 36 // addr stored as uint64 since uintptr is not save-able. 37 addr uint64 38 private bool 39 val uint32 40 mask uint32 41 } 42 43 // Restart implements kernel.SyscallRestartBlock.Restart. 44 func (f *futexWaitRestartBlock) Restart(t *kernel.Task) (uintptr, error) { 45 return futexWaitDuration(t, f.duration, false, hostarch.Addr(f.addr), f.private, f.val, f.mask) 46 } 47 48 // futexWaitAbsolute performs a FUTEX_WAIT_BITSET, blocking until the wait is 49 // complete. 50 // 51 // The wait blocks forever if forever is true, otherwise it blocks until ts. 52 // 53 // If blocking is interrupted, the syscall is restarted with the original 54 // arguments. 55 func futexWaitAbsolute(t *kernel.Task, clockRealtime bool, ts linux.Timespec, forever bool, addr hostarch.Addr, private bool, val, mask uint32) (uintptr, error) { 56 w := t.FutexWaiter() 57 err := t.Futex().WaitPrepare(w, t, addr, private, val, mask) 58 if err != nil { 59 return 0, err 60 } 61 62 if forever { 63 err = t.Block(w.C) 64 } else if clockRealtime { 65 notifier, tchan := ktime.NewChannelNotifier() 66 timer := ktime.NewTimer(t.Kernel().RealtimeClock(), notifier) 67 timer.Swap(ktime.Setting{ 68 Enabled: true, 69 Next: ktime.FromTimespec(ts), 70 }) 71 err = t.BlockWithTimer(w.C, tchan) 72 timer.Destroy() 73 } else { 74 err = t.BlockWithDeadline(w.C, true, ktime.FromTimespec(ts)) 75 } 76 77 t.Futex().WaitComplete(w, t) 78 return 0, syserror.ConvertIntr(err, syserror.ERESTARTSYS) 79 } 80 81 // futexWaitDuration performs a FUTEX_WAIT, blocking until the wait is 82 // complete. 83 // 84 // The wait blocks forever if forever is true, otherwise is blocks for 85 // duration. 86 // 87 // If blocking is interrupted, forever determines how to restart the 88 // syscall. If forever is true, the syscall is restarted with the original 89 // arguments. If forever is false, duration is a relative timeout and the 90 // syscall is restarted with the remaining timeout. 91 func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, addr hostarch.Addr, private bool, val, mask uint32) (uintptr, error) { 92 w := t.FutexWaiter() 93 err := t.Futex().WaitPrepare(w, t, addr, private, val, mask) 94 if err != nil { 95 return 0, err 96 } 97 98 remaining, err := t.BlockWithTimeout(w.C, !forever, duration) 99 t.Futex().WaitComplete(w, t) 100 if err == nil { 101 return 0, nil 102 } 103 104 // The wait was unsuccessful for some reason other than interruption. Simply 105 // forward the error. 106 if err != syserror.ErrInterrupted { 107 return 0, err 108 } 109 110 // The wait was interrupted and we need to restart. Decide how. 111 112 // The wait duration was absolute, restart with the original arguments. 113 if forever { 114 return 0, syserror.ERESTARTSYS 115 } 116 117 // The wait duration was relative, restart with the remaining duration. 118 t.SetSyscallRestartBlock(&futexWaitRestartBlock{ 119 duration: remaining, 120 addr: uint64(addr), 121 private: private, 122 val: val, 123 mask: mask, 124 }) 125 return 0, syserror.ERESTART_RESTARTBLOCK 126 } 127 128 func futexLockPI(t *kernel.Task, ts linux.Timespec, forever bool, addr hostarch.Addr, private bool) error { 129 w := t.FutexWaiter() 130 locked, err := t.Futex().LockPI(w, t, addr, uint32(t.ThreadID()), private, false) 131 if err != nil { 132 return err 133 } 134 if locked { 135 // Futex acquired, we're done! 136 return nil 137 } 138 139 if forever { 140 err = t.Block(w.C) 141 } else { 142 notifier, tchan := ktime.NewChannelNotifier() 143 timer := ktime.NewTimer(t.Kernel().RealtimeClock(), notifier) 144 timer.Swap(ktime.Setting{ 145 Enabled: true, 146 Next: ktime.FromTimespec(ts), 147 }) 148 err = t.BlockWithTimer(w.C, tchan) 149 timer.Destroy() 150 } 151 152 t.Futex().WaitComplete(w, t) 153 return syserror.ConvertIntr(err, syserror.ERESTARTSYS) 154 } 155 156 func tryLockPI(t *kernel.Task, addr hostarch.Addr, private bool) error { 157 w := t.FutexWaiter() 158 locked, err := t.Futex().LockPI(w, t, addr, uint32(t.ThreadID()), private, true) 159 if err != nil { 160 return err 161 } 162 if !locked { 163 return linuxerr.EWOULDBLOCK 164 } 165 return nil 166 } 167 168 // Futex implements linux syscall futex(2). 169 // It provides a method for a program to wait for a value at a given address to 170 // change, and a method to wake up anyone waiting on a particular address. 171 func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 172 addr := args[0].Pointer() 173 futexOp := args[1].Int() 174 val := int(args[2].Int()) 175 nreq := int(args[3].Int()) 176 timeout := args[3].Pointer() 177 naddr := args[4].Pointer() 178 val3 := args[5].Int() 179 180 cmd := futexOp &^ (linux.FUTEX_PRIVATE_FLAG | linux.FUTEX_CLOCK_REALTIME) 181 private := (futexOp & linux.FUTEX_PRIVATE_FLAG) != 0 182 clockRealtime := (futexOp & linux.FUTEX_CLOCK_REALTIME) == linux.FUTEX_CLOCK_REALTIME 183 mask := uint32(val3) 184 185 switch cmd { 186 case linux.FUTEX_WAIT, linux.FUTEX_WAIT_BITSET: 187 // WAIT{_BITSET} wait forever if the timeout isn't passed. 188 forever := (timeout == 0) 189 190 var timespec linux.Timespec 191 if !forever { 192 var err error 193 timespec, err = copyTimespecIn(t, timeout) 194 if err != nil { 195 return 0, nil, err 196 } 197 } 198 199 switch cmd { 200 case linux.FUTEX_WAIT: 201 // WAIT uses a relative timeout. 202 mask = linux.FUTEX_BITSET_MATCH_ANY 203 var timeoutDur time.Duration 204 if !forever { 205 timeoutDur = time.Duration(timespec.ToNsecCapped()) * time.Nanosecond 206 } 207 n, err := futexWaitDuration(t, timeoutDur, forever, addr, private, uint32(val), mask) 208 return n, nil, err 209 210 case linux.FUTEX_WAIT_BITSET: 211 // WAIT_BITSET uses an absolute timeout which is either 212 // CLOCK_MONOTONIC or CLOCK_REALTIME. 213 if mask == 0 { 214 return 0, nil, linuxerr.EINVAL 215 } 216 n, err := futexWaitAbsolute(t, clockRealtime, timespec, forever, addr, private, uint32(val), mask) 217 return n, nil, err 218 default: 219 panic("unreachable") 220 } 221 222 case linux.FUTEX_WAKE: 223 mask = ^uint32(0) 224 fallthrough 225 226 case linux.FUTEX_WAKE_BITSET: 227 if mask == 0 { 228 return 0, nil, linuxerr.EINVAL 229 } 230 if val <= 0 { 231 // The Linux kernel wakes one waiter even if val is 232 // non-positive. 233 val = 1 234 } 235 n, err := t.Futex().Wake(t, addr, private, mask, val) 236 return uintptr(n), nil, err 237 238 case linux.FUTEX_REQUEUE: 239 n, err := t.Futex().Requeue(t, addr, naddr, private, val, nreq) 240 return uintptr(n), nil, err 241 242 case linux.FUTEX_CMP_REQUEUE: 243 // 'val3' contains the value to be checked at 'addr' and 244 // 'val' is the number of waiters that should be woken up. 245 nval := uint32(val3) 246 n, err := t.Futex().RequeueCmp(t, addr, naddr, private, nval, val, nreq) 247 return uintptr(n), nil, err 248 249 case linux.FUTEX_WAKE_OP: 250 op := uint32(val3) 251 if val <= 0 { 252 // The Linux kernel wakes one waiter even if val is 253 // non-positive. 254 val = 1 255 } 256 n, err := t.Futex().WakeOp(t, addr, naddr, private, val, nreq, op) 257 return uintptr(n), nil, err 258 259 case linux.FUTEX_LOCK_PI: 260 forever := (timeout == 0) 261 262 var timespec linux.Timespec 263 if !forever { 264 var err error 265 timespec, err = copyTimespecIn(t, timeout) 266 if err != nil { 267 return 0, nil, err 268 } 269 } 270 err := futexLockPI(t, timespec, forever, addr, private) 271 return 0, nil, err 272 273 case linux.FUTEX_TRYLOCK_PI: 274 err := tryLockPI(t, addr, private) 275 return 0, nil, err 276 277 case linux.FUTEX_UNLOCK_PI: 278 err := t.Futex().UnlockPI(t, addr, uint32(t.ThreadID()), private) 279 return 0, nil, err 280 281 case linux.FUTEX_WAIT_REQUEUE_PI, linux.FUTEX_CMP_REQUEUE_PI: 282 t.Kernel().EmitUnimplementedEvent(t) 283 return 0, nil, syserror.ENOSYS 284 285 default: 286 // We don't even know about this command. 287 return 0, nil, syserror.ENOSYS 288 } 289 } 290 291 // SetRobustList implements linux syscall set_robust_list(2). 292 func SetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 293 // Despite the syscall using the name 'pid' for this variable, it is 294 // very much a tid. 295 head := args[0].Pointer() 296 length := args[1].SizeT() 297 298 if length != uint(linux.SizeOfRobustListHead) { 299 return 0, nil, linuxerr.EINVAL 300 } 301 t.SetRobustList(head) 302 return 0, nil, nil 303 } 304 305 // GetRobustList implements linux syscall get_robust_list(2). 306 func GetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 307 // Despite the syscall using the name 'pid' for this variable, it is 308 // very much a tid. 309 tid := args[0].Int() 310 headAddr := args[1].Pointer() 311 sizeAddr := args[2].Pointer() 312 313 if tid < 0 { 314 return 0, nil, linuxerr.EINVAL 315 } 316 317 ot := t 318 if tid != 0 { 319 if ot = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)); ot == nil { 320 return 0, nil, syserror.ESRCH 321 } 322 } 323 324 // Copy out head pointer. 325 head := t.Arch().Native(uintptr(ot.GetRobustList())) 326 if _, err := head.CopyOut(t, headAddr); err != nil { 327 return 0, nil, err 328 } 329 330 // Copy out size, which is a constant. Note that while size isn't 331 // an address, it is defined as the arch-dependent size_t, so it 332 // needs to be converted to a native-sized int. 333 size := t.Arch().Native(uintptr(linux.SizeOfRobustListHead)) 334 if _, err := size.CopyOut(t, sizeAddr); err != nil { 335 return 0, nil, err 336 } 337 338 return 0, nil, nil 339 }