github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/syscalls/linux/sys_futex.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package linux 16 17 import ( 18 "time" 19 20 "github.com/MerlinKodo/gvisor/pkg/abi/linux" 21 "github.com/MerlinKodo/gvisor/pkg/errors/linuxerr" 22 "github.com/MerlinKodo/gvisor/pkg/hostarch" 23 "github.com/MerlinKodo/gvisor/pkg/sentry/arch" 24 "github.com/MerlinKodo/gvisor/pkg/sentry/kernel" 25 ktime "github.com/MerlinKodo/gvisor/pkg/sentry/kernel/time" 26 ) 27 28 // futexWaitRestartBlock encapsulates the state required to restart futex(2) 29 // via restart_syscall(2). 30 // 31 // +stateify savable 32 type futexWaitRestartBlock struct { 33 duration time.Duration 34 35 // addr stored as uint64 since uintptr is not save-able. 36 addr uint64 37 private bool 38 val uint32 39 mask uint32 40 } 41 42 // Restart implements kernel.SyscallRestartBlock.Restart. 43 func (f *futexWaitRestartBlock) Restart(t *kernel.Task) (uintptr, error) { 44 return futexWaitDuration(t, f.duration, false, hostarch.Addr(f.addr), f.private, f.val, f.mask) 45 } 46 47 // futexWaitAbsolute performs a FUTEX_WAIT_BITSET, blocking until the wait is 48 // complete. 49 // 50 // The wait blocks forever if forever is true, otherwise it blocks until ts. 51 // 52 // If blocking is interrupted, the syscall is restarted with the original 53 // arguments. 54 func futexWaitAbsolute(t *kernel.Task, clockRealtime bool, ts linux.Timespec, forever bool, addr hostarch.Addr, private bool, val, mask uint32) (uintptr, error) { 55 w := t.FutexWaiter() 56 err := t.Futex().WaitPrepare(w, t, addr, private, val, mask) 57 if err != nil { 58 return 0, err 59 } 60 61 if forever { 62 err = t.Block(w.C) 63 } else if clockRealtime { 64 notifier, tchan := ktime.NewChannelNotifier() 65 timer := ktime.NewTimer(t.Kernel().RealtimeClock(), notifier) 66 timer.Swap(ktime.Setting{ 67 Enabled: true, 68 Next: ktime.FromTimespec(ts), 69 }) 70 err = t.BlockWithTimer(w.C, tchan) 71 timer.Destroy() 72 } else { 73 err = t.BlockWithDeadline(w.C, true, ktime.FromTimespec(ts)) 74 } 75 76 t.Futex().WaitComplete(w, t) 77 return 0, linuxerr.ConvertIntr(err, linuxerr.ERESTARTSYS) 78 } 79 80 // futexWaitDuration performs a FUTEX_WAIT, blocking until the wait is 81 // complete. 82 // 83 // The wait blocks forever if forever is true, otherwise is blocks for 84 // duration. 85 // 86 // If blocking is interrupted, forever determines how to restart the 87 // syscall. If forever is true, the syscall is restarted with the original 88 // arguments. If forever is false, duration is a relative timeout and the 89 // syscall is restarted with the remaining timeout. 90 func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, addr hostarch.Addr, private bool, val, mask uint32) (uintptr, error) { 91 w := t.FutexWaiter() 92 err := t.Futex().WaitPrepare(w, t, addr, private, val, mask) 93 if err != nil { 94 return 0, err 95 } 96 97 remaining, err := t.BlockWithTimeout(w.C, !forever, duration) 98 t.Futex().WaitComplete(w, t) 99 if err == nil { 100 return 0, nil 101 } 102 103 // The wait was unsuccessful for some reason other than interruption. Simply 104 // forward the error. 105 if err != linuxerr.ErrInterrupted { 106 return 0, err 107 } 108 109 // The wait was interrupted and we need to restart. Decide how. 110 111 // The wait duration was absolute, restart with the original arguments. 112 if forever { 113 return 0, linuxerr.ERESTARTSYS 114 } 115 116 // The wait duration was relative, restart with the remaining duration. 117 t.SetSyscallRestartBlock(&futexWaitRestartBlock{ 118 duration: remaining, 119 addr: uint64(addr), 120 private: private, 121 val: val, 122 mask: mask, 123 }) 124 return 0, linuxerr.ERESTART_RESTARTBLOCK 125 } 126 127 func futexLockPI(t *kernel.Task, ts linux.Timespec, forever bool, addr hostarch.Addr, private bool) error { 128 w := t.FutexWaiter() 129 locked, err := t.Futex().LockPI(w, t, addr, uint32(t.ThreadID()), private, false) 130 if err != nil { 131 return err 132 } 133 if locked { 134 // Futex acquired, we're done! 135 return nil 136 } 137 138 if forever { 139 err = t.Block(w.C) 140 } else { 141 notifier, tchan := ktime.NewChannelNotifier() 142 timer := ktime.NewTimer(t.Kernel().RealtimeClock(), notifier) 143 timer.Swap(ktime.Setting{ 144 Enabled: true, 145 Next: ktime.FromTimespec(ts), 146 }) 147 err = t.BlockWithTimer(w.C, tchan) 148 timer.Destroy() 149 } 150 151 t.Futex().WaitComplete(w, t) 152 return linuxerr.ConvertIntr(err, linuxerr.ERESTARTSYS) 153 } 154 155 func tryLockPI(t *kernel.Task, addr hostarch.Addr, private bool) error { 156 w := t.FutexWaiter() 157 locked, err := t.Futex().LockPI(w, t, addr, uint32(t.ThreadID()), private, true) 158 if err != nil { 159 return err 160 } 161 if !locked { 162 return linuxerr.EWOULDBLOCK 163 } 164 return nil 165 } 166 167 // Futex implements linux syscall futex(2). 168 // It provides a method for a program to wait for a value at a given address to 169 // change, and a method to wake up anyone waiting on a particular address. 170 func Futex(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 171 addr := args[0].Pointer() 172 futexOp := args[1].Int() 173 val := int(args[2].Int()) 174 nreq := int(args[3].Int()) 175 timeout := args[3].Pointer() 176 naddr := args[4].Pointer() 177 val3 := args[5].Int() 178 179 cmd := futexOp &^ (linux.FUTEX_PRIVATE_FLAG | linux.FUTEX_CLOCK_REALTIME) 180 private := (futexOp & linux.FUTEX_PRIVATE_FLAG) != 0 181 clockRealtime := (futexOp & linux.FUTEX_CLOCK_REALTIME) == linux.FUTEX_CLOCK_REALTIME 182 mask := uint32(val3) 183 184 switch cmd { 185 case linux.FUTEX_WAIT, linux.FUTEX_WAIT_BITSET: 186 // WAIT{_BITSET} wait forever if the timeout isn't passed. 187 forever := (timeout == 0) 188 189 var timespec linux.Timespec 190 if !forever { 191 var err error 192 timespec, err = copyTimespecIn(t, timeout) 193 if err != nil { 194 return 0, nil, err 195 } 196 } 197 198 switch cmd { 199 case linux.FUTEX_WAIT: 200 // WAIT uses a relative timeout. 201 mask = linux.FUTEX_BITSET_MATCH_ANY 202 var timeoutDur time.Duration 203 if !forever { 204 timeoutDur = time.Duration(timespec.ToNsecCapped()) * time.Nanosecond 205 } 206 n, err := futexWaitDuration(t, timeoutDur, forever, addr, private, uint32(val), mask) 207 return n, nil, err 208 209 case linux.FUTEX_WAIT_BITSET: 210 // WAIT_BITSET uses an absolute timeout which is either 211 // CLOCK_MONOTONIC or CLOCK_REALTIME. 212 if mask == 0 { 213 return 0, nil, linuxerr.EINVAL 214 } 215 n, err := futexWaitAbsolute(t, clockRealtime, timespec, forever, addr, private, uint32(val), mask) 216 return n, nil, err 217 default: 218 panic("unreachable") 219 } 220 221 case linux.FUTEX_WAKE: 222 mask = ^uint32(0) 223 fallthrough 224 225 case linux.FUTEX_WAKE_BITSET: 226 if mask == 0 { 227 return 0, nil, linuxerr.EINVAL 228 } 229 if val <= 0 { 230 // The Linux kernel wakes one waiter even if val is 231 // non-positive. 232 val = 1 233 } 234 n, err := t.Futex().Wake(t, addr, private, mask, val) 235 return uintptr(n), nil, err 236 237 case linux.FUTEX_REQUEUE: 238 n, err := t.Futex().Requeue(t, addr, naddr, private, val, nreq) 239 return uintptr(n), nil, err 240 241 case linux.FUTEX_CMP_REQUEUE: 242 // 'val3' contains the value to be checked at 'addr' and 243 // 'val' is the number of waiters that should be woken up. 244 nval := uint32(val3) 245 n, err := t.Futex().RequeueCmp(t, addr, naddr, private, nval, val, nreq) 246 return uintptr(n), nil, err 247 248 case linux.FUTEX_WAKE_OP: 249 op := uint32(val3) 250 if val <= 0 { 251 // The Linux kernel wakes one waiter even if val is 252 // non-positive. 253 val = 1 254 } 255 n, err := t.Futex().WakeOp(t, addr, naddr, private, val, nreq, op) 256 return uintptr(n), nil, err 257 258 case linux.FUTEX_LOCK_PI: 259 forever := (timeout == 0) 260 261 var timespec linux.Timespec 262 if !forever { 263 var err error 264 timespec, err = copyTimespecIn(t, timeout) 265 if err != nil { 266 return 0, nil, err 267 } 268 } 269 err := futexLockPI(t, timespec, forever, addr, private) 270 return 0, nil, err 271 272 case linux.FUTEX_TRYLOCK_PI: 273 err := tryLockPI(t, addr, private) 274 return 0, nil, err 275 276 case linux.FUTEX_UNLOCK_PI: 277 err := t.Futex().UnlockPI(t, addr, uint32(t.ThreadID()), private) 278 return 0, nil, err 279 280 case linux.FUTEX_WAIT_REQUEUE_PI, linux.FUTEX_CMP_REQUEUE_PI: 281 t.Kernel().EmitUnimplementedEvent(t, sysno) 282 return 0, nil, linuxerr.ENOSYS 283 284 default: 285 // We don't even know about this command. 286 return 0, nil, linuxerr.ENOSYS 287 } 288 } 289 290 // SetRobustList implements linux syscall set_robust_list(2). 291 func SetRobustList(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 292 // Despite the syscall using the name 'pid' for this variable, it is 293 // very much a tid. 294 head := args[0].Pointer() 295 length := args[1].SizeT() 296 297 if length != uint(linux.SizeOfRobustListHead) { 298 return 0, nil, linuxerr.EINVAL 299 } 300 t.SetRobustList(head) 301 return 0, nil, nil 302 } 303 304 // GetRobustList implements linux syscall get_robust_list(2). 305 func GetRobustList(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 306 // Despite the syscall using the name 'pid' for this variable, it is 307 // very much a tid. 308 tid := args[0].Int() 309 headAddr := args[1].Pointer() 310 sizeAddr := args[2].Pointer() 311 312 if tid < 0 { 313 return 0, nil, linuxerr.EINVAL 314 } 315 316 ot := t 317 if tid != 0 { 318 if ot = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)); ot == nil { 319 return 0, nil, linuxerr.ESRCH 320 } 321 } 322 323 // Copy out head pointer. 324 head := t.Arch().Native(uintptr(ot.GetRobustList())) 325 if _, err := head.CopyOut(t, headAddr); err != nil { 326 return 0, nil, err 327 } 328 329 // Copy out size, which is a constant. Note that while size isn't 330 // an address, it is defined as the arch-dependent size_t, so it 331 // needs to be converted to a native-sized int. 332 size := t.Arch().Native(uintptr(linux.SizeOfRobustListHead)) 333 if _, err := size.CopyOut(t, sizeAddr); err != nil { 334 return 0, nil, err 335 } 336 337 return 0, nil, nil 338 }