github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/syscalls/linux/sys_futex.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package linux 16 17 import ( 18 "time" 19 20 "github.com/metacubex/gvisor/pkg/abi/linux" 21 "github.com/metacubex/gvisor/pkg/errors/linuxerr" 22 "github.com/metacubex/gvisor/pkg/hostarch" 23 "github.com/metacubex/gvisor/pkg/sentry/arch" 24 "github.com/metacubex/gvisor/pkg/sentry/kernel" 25 ktime "github.com/metacubex/gvisor/pkg/sentry/kernel/time" 26 ) 27 28 // futexWaitRestartBlock encapsulates the state required to restart futex(2) 29 // via restart_syscall(2). 30 // 31 // +stateify savable 32 type futexWaitRestartBlock struct { 33 duration time.Duration 34 35 // addr stored as uint64 since uintptr is not save-able. 36 addr uint64 37 private bool 38 val uint32 39 mask uint32 40 } 41 42 // Restart implements kernel.SyscallRestartBlock.Restart. 43 func (f *futexWaitRestartBlock) Restart(t *kernel.Task) (uintptr, error) { 44 return futexWaitDuration(t, f.duration, false, hostarch.Addr(f.addr), f.private, f.val, f.mask) 45 } 46 47 // futexWaitAbsolute performs a FUTEX_WAIT_BITSET, blocking until the wait is 48 // complete. 49 // 50 // The wait blocks forever if forever is true, otherwise it blocks until ts. 51 // 52 // If blocking is interrupted, the syscall is restarted with the original 53 // arguments. 54 func futexWaitAbsolute(t *kernel.Task, clockRealtime bool, ts linux.Timespec, forever bool, addr hostarch.Addr, private bool, val, mask uint32) (uintptr, error) { 55 w := t.FutexWaiter() 56 err := t.Futex().WaitPrepare(w, t, addr, private, val, mask) 57 if err != nil { 58 return 0, err 59 } 60 61 if forever { 62 err = t.Block(w.C) 63 } else if clockRealtime { 64 err = t.BlockWithDeadlineFrom(w.C, t.Kernel().RealtimeClock(), true, ktime.FromTimespec(ts)) 65 } else { 66 err = t.BlockWithDeadline(w.C, true, ktime.FromTimespec(ts)) 67 } 68 69 t.Futex().WaitComplete(w, t) 70 return 0, linuxerr.ConvertIntr(err, linuxerr.ERESTARTSYS) 71 } 72 73 // futexWaitDuration performs a FUTEX_WAIT, blocking until the wait is 74 // complete. 75 // 76 // The wait blocks forever if forever is true, otherwise is blocks for 77 // duration. 78 // 79 // If blocking is interrupted, forever determines how to restart the 80 // syscall. If forever is true, the syscall is restarted with the original 81 // arguments. If forever is false, duration is a relative timeout and the 82 // syscall is restarted with the remaining timeout. 83 func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, addr hostarch.Addr, private bool, val, mask uint32) (uintptr, error) { 84 w := t.FutexWaiter() 85 err := t.Futex().WaitPrepare(w, t, addr, private, val, mask) 86 if err != nil { 87 return 0, err 88 } 89 90 remaining, err := t.BlockWithTimeout(w.C, !forever, duration) 91 t.Futex().WaitComplete(w, t) 92 if err == nil { 93 return 0, nil 94 } 95 96 // The wait was unsuccessful for some reason other than interruption. Simply 97 // forward the error. 98 if err != linuxerr.ErrInterrupted { 99 return 0, err 100 } 101 102 // The wait was interrupted and we need to restart. Decide how. 103 104 // The wait duration was absolute, restart with the original arguments. 105 if forever { 106 return 0, linuxerr.ERESTARTSYS 107 } 108 109 // The wait duration was relative, restart with the remaining duration. 110 t.SetSyscallRestartBlock(&futexWaitRestartBlock{ 111 duration: remaining, 112 addr: uint64(addr), 113 private: private, 114 val: val, 115 mask: mask, 116 }) 117 return 0, linuxerr.ERESTART_RESTARTBLOCK 118 } 119 120 func futexLockPI(t *kernel.Task, ts linux.Timespec, forever bool, addr hostarch.Addr, private bool) error { 121 w := t.FutexWaiter() 122 locked, err := t.Futex().LockPI(w, t, addr, uint32(t.ThreadID()), private, false) 123 if err != nil { 124 return err 125 } 126 if locked { 127 // Futex acquired, we're done! 128 return nil 129 } 130 131 if forever { 132 err = t.Block(w.C) 133 } else { 134 err = t.BlockWithDeadlineFrom(w.C, t.Kernel().RealtimeClock(), true, ktime.FromTimespec(ts)) 135 } 136 137 t.Futex().WaitComplete(w, t) 138 return linuxerr.ConvertIntr(err, linuxerr.ERESTARTSYS) 139 } 140 141 func tryLockPI(t *kernel.Task, addr hostarch.Addr, private bool) error { 142 w := t.FutexWaiter() 143 locked, err := t.Futex().LockPI(w, t, addr, uint32(t.ThreadID()), private, true) 144 if err != nil { 145 return err 146 } 147 if !locked { 148 return linuxerr.EWOULDBLOCK 149 } 150 return nil 151 } 152 153 // Futex implements linux syscall futex(2). 154 // It provides a method for a program to wait for a value at a given address to 155 // change, and a method to wake up anyone waiting on a particular address. 156 func Futex(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 157 addr := args[0].Pointer() 158 futexOp := args[1].Int() 159 val := int(args[2].Int()) 160 nreq := int(args[3].Int()) 161 timeout := args[3].Pointer() 162 naddr := args[4].Pointer() 163 val3 := args[5].Int() 164 165 cmd := futexOp &^ (linux.FUTEX_PRIVATE_FLAG | linux.FUTEX_CLOCK_REALTIME) 166 private := (futexOp & linux.FUTEX_PRIVATE_FLAG) != 0 167 clockRealtime := (futexOp & linux.FUTEX_CLOCK_REALTIME) == linux.FUTEX_CLOCK_REALTIME 168 mask := uint32(val3) 169 170 switch cmd { 171 case linux.FUTEX_WAIT, linux.FUTEX_WAIT_BITSET: 172 // WAIT{_BITSET} wait forever if the timeout isn't passed. 173 forever := (timeout == 0) 174 175 var timespec linux.Timespec 176 if !forever { 177 var err error 178 timespec, err = copyTimespecIn(t, timeout) 179 if err != nil { 180 return 0, nil, err 181 } 182 } 183 184 switch cmd { 185 case linux.FUTEX_WAIT: 186 // WAIT uses a relative timeout. 187 mask = linux.FUTEX_BITSET_MATCH_ANY 188 var timeoutDur time.Duration 189 if !forever { 190 timeoutDur = time.Duration(timespec.ToNsecCapped()) * time.Nanosecond 191 } 192 n, err := futexWaitDuration(t, timeoutDur, forever, addr, private, uint32(val), mask) 193 return n, nil, err 194 195 case linux.FUTEX_WAIT_BITSET: 196 // WAIT_BITSET uses an absolute timeout which is either 197 // CLOCK_MONOTONIC or CLOCK_REALTIME. 198 if mask == 0 { 199 return 0, nil, linuxerr.EINVAL 200 } 201 n, err := futexWaitAbsolute(t, clockRealtime, timespec, forever, addr, private, uint32(val), mask) 202 return n, nil, err 203 default: 204 panic("unreachable") 205 } 206 207 case linux.FUTEX_WAKE: 208 mask = ^uint32(0) 209 fallthrough 210 211 case linux.FUTEX_WAKE_BITSET: 212 if mask == 0 { 213 return 0, nil, linuxerr.EINVAL 214 } 215 if val <= 0 { 216 // The Linux kernel wakes one waiter even if val is 217 // non-positive. 218 val = 1 219 } 220 n, err := t.Futex().Wake(t, addr, private, mask, val) 221 return uintptr(n), nil, err 222 223 case linux.FUTEX_REQUEUE: 224 n, err := t.Futex().Requeue(t, addr, naddr, private, val, nreq) 225 return uintptr(n), nil, err 226 227 case linux.FUTEX_CMP_REQUEUE: 228 // 'val3' contains the value to be checked at 'addr' and 229 // 'val' is the number of waiters that should be woken up. 230 nval := uint32(val3) 231 n, err := t.Futex().RequeueCmp(t, addr, naddr, private, nval, val, nreq) 232 return uintptr(n), nil, err 233 234 case linux.FUTEX_WAKE_OP: 235 op := uint32(val3) 236 if val <= 0 { 237 // The Linux kernel wakes one waiter even if val is 238 // non-positive. 239 val = 1 240 } 241 n, err := t.Futex().WakeOp(t, addr, naddr, private, val, nreq, op) 242 return uintptr(n), nil, err 243 244 case linux.FUTEX_LOCK_PI: 245 forever := (timeout == 0) 246 247 var timespec linux.Timespec 248 if !forever { 249 var err error 250 timespec, err = copyTimespecIn(t, timeout) 251 if err != nil { 252 return 0, nil, err 253 } 254 } 255 err := futexLockPI(t, timespec, forever, addr, private) 256 return 0, nil, err 257 258 case linux.FUTEX_TRYLOCK_PI: 259 err := tryLockPI(t, addr, private) 260 return 0, nil, err 261 262 case linux.FUTEX_UNLOCK_PI: 263 err := t.Futex().UnlockPI(t, addr, uint32(t.ThreadID()), private) 264 return 0, nil, err 265 266 case linux.FUTEX_WAIT_REQUEUE_PI, linux.FUTEX_CMP_REQUEUE_PI: 267 t.Kernel().EmitUnimplementedEvent(t, sysno) 268 return 0, nil, linuxerr.ENOSYS 269 270 default: 271 // We don't even know about this command. 272 return 0, nil, linuxerr.ENOSYS 273 } 274 } 275 276 // SetRobustList implements linux syscall set_robust_list(2). 277 func SetRobustList(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 278 // Despite the syscall using the name 'pid' for this variable, it is 279 // very much a tid. 280 head := args[0].Pointer() 281 length := args[1].SizeT() 282 283 if length != uint(linux.SizeOfRobustListHead) { 284 return 0, nil, linuxerr.EINVAL 285 } 286 t.SetRobustList(head) 287 return 0, nil, nil 288 } 289 290 // GetRobustList implements linux syscall get_robust_list(2). 291 func GetRobustList(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 292 // Despite the syscall using the name 'pid' for this variable, it is 293 // very much a tid. 294 tid := args[0].Int() 295 headAddr := args[1].Pointer() 296 sizeAddr := args[2].Pointer() 297 298 if tid < 0 { 299 return 0, nil, linuxerr.EINVAL 300 } 301 302 ot := t 303 if tid != 0 { 304 if ot = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)); ot == nil { 305 return 0, nil, linuxerr.ESRCH 306 } 307 } 308 309 // Copy out head pointer. 310 head := t.Arch().Native(uintptr(ot.GetRobustList())) 311 if _, err := head.CopyOut(t, headAddr); err != nil { 312 return 0, nil, err 313 } 314 315 // Copy out size, which is a constant. Note that while size isn't 316 // an address, it is defined as the arch-dependent size_t, so it 317 // needs to be converted to a native-sized int. 318 size := t.Arch().Native(uintptr(linux.SizeOfRobustListHead)) 319 if _, err := size.CopyOut(t, sizeAddr); err != nil { 320 return 0, nil, err 321 } 322 323 return 0, nil, nil 324 }