gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/syscalls/linux/sys_aio.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/marshal/primitive"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
	"gvisor.dev/gvisor/pkg/sentry/mm"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/usermem"
)

// IoSetup implements linux syscall io_setup(2).
func IoSetup(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	nrEvents := args[0].Int()
	idAddr := args[1].Pointer()

	// Linux uses the native long as the aio ID.
	//
	// The context pointer _must_ be zero initially.
	var idIn uint64
	if _, err := primitive.CopyUint64In(t, idAddr, &idIn); err != nil {
		return 0, nil, err
	}
	if idIn != 0 {
		return 0, nil, linuxerr.EINVAL
	}

	id, err := t.MemoryManager().NewAIOContext(t, uint32(nrEvents))
	if err != nil {
		return 0, nil, err
	}

	// Copy out the new ID.
	if _, err := primitive.CopyUint64Out(t, idAddr, id); err != nil {
		t.MemoryManager().DestroyAIOContext(t, id)
		return 0, nil, err
	}

	return 0, nil, nil
}

// IoDestroy implements linux syscall io_destroy(2).
func IoDestroy(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	id := args[0].Uint64()

	ctx := t.MemoryManager().DestroyAIOContext(t, id)
	if ctx == nil {
		// Does not exist.
		return 0, nil, linuxerr.EINVAL
	}

	// Drain completed requests and wait for pending requests until there
	// are no more.
	for {
		ctx.Drain()

		ch := ctx.WaitChannel()
		if ch == nil {
			// No more requests, we're done.
			return 0, nil, nil
		}
		// The task cannot be interrupted during the wait. Equivalent to
		// TASK_UNINTERRUPTIBLE in Linux.
		t.UninterruptibleSleepStart(true /* deactivate */)
		<-ch
		t.UninterruptibleSleepFinish(true /* activate */)
	}
}
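
// For illustration only: a guest program would drive the two handlers above
// with raw syscalls roughly as in this hypothetical snippet. It is not part
// of the sentry; the constants come from golang.org/x/sys/unix, which this
// package does not import:
//
//	var id uint64 // Must be zero before io_setup(2), per the check in IoSetup.
//	if _, _, errno := unix.Syscall(unix.SYS_IO_SETUP,
//		128 /* nr_events */, uintptr(unsafe.Pointer(&id)), 0); errno != 0 {
//		return errno
//	}
//	// ... submit and reap requests against id ...
//	// io_destroy(2) blocks, uninterruptibly here, until pending requests drain.
//	unix.Syscall(unix.SYS_IO_DESTROY, uintptr(id), 0, 0)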
// IoGetevents implements linux syscall io_getevents(2).
func IoGetevents(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	id := args[0].Uint64()
	minEvents := args[1].Int()
	events := args[2].Int()
	eventsAddr := args[3].Pointer()
	timespecAddr := args[4].Pointer()

	// Sanity check arguments.
	if minEvents < 0 || minEvents > events {
		return 0, nil, linuxerr.EINVAL
	}

	ctx, ok := t.MemoryManager().LookupAIOContext(t, id)
	if !ok {
		return 0, nil, linuxerr.EINVAL
	}

	// Set up the timeout.
	var haveDeadline bool
	var deadline ktime.Time
	if timespecAddr != 0 {
		d, err := copyTimespecIn(t, timespecAddr)
		if err != nil {
			return 0, nil, err
		}
		if !d.Valid() {
			return 0, nil, linuxerr.EINVAL
		}
		deadline = t.Kernel().MonotonicClock().Now().Add(d.ToDuration())
		haveDeadline = true
	}

	// Loop over all requests.
	for count := int32(0); count < events; count++ {
		// Get a request, per semantics.
		var v any
		if count >= minEvents {
			var ok bool
			v, ok = ctx.PopRequest()
			if !ok {
				return uintptr(count), nil, nil
			}
		} else {
			var err error
			v, err = waitForRequest(ctx, t, haveDeadline, deadline)
			if err != nil {
				if count > 0 || linuxerr.Equals(linuxerr.ETIMEDOUT, err) {
					return uintptr(count), nil, nil
				}
				return 0, nil, linuxerr.ConvertIntr(err, linuxerr.EINTR)
			}
		}

		ev := v.(*linux.IOEvent)

		// Copy out the result.
		if _, err := ev.CopyOut(t, eventsAddr); err != nil {
			if count > 0 {
				return uintptr(count), nil, nil
			}
			// Nothing done.
			return 0, nil, err
		}

		// Keep rolling.
		eventsAddr += hostarch.Addr(linux.IOEventSize)
	}

	// Everything finished.
	return uintptr(events), nil, nil
}

func waitForRequest(ctx *mm.AIOContext, t *kernel.Task, haveDeadline bool, deadline ktime.Time) (any, error) {
	for {
		if v, ok := ctx.PopRequest(); ok {
			// Request was readily available. Just return it.
			return v, nil
		}

		// Need to wait for request completion.
		done := ctx.WaitChannel()
		if done == nil {
			// Context has been destroyed.
			return nil, linuxerr.EINVAL
		}
		if err := t.BlockWithDeadline(done, haveDeadline, deadline); err != nil {
			return nil, err
		}
	}
}

// memoryFor returns appropriate memory for the given callback.
func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) {
	bytes := int(cb.Bytes)
	if bytes < 0 {
		// Linux also requires that this field fit in ssize_t.
		return usermem.IOSequence{}, linuxerr.EINVAL
	}

	// Since this I/O will be asynchronous with respect to t's task goroutine,
	// we have no guarantee that t's AddressSpace will be active during the
	// I/O.
	switch cb.OpCode {
	case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PWRITE:
		return t.SingleIOSequence(hostarch.Addr(cb.Buf), bytes, usermem.IOOpts{
			AddressSpaceActive: false,
		})

	case linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITEV:
		return t.IovecsIOSequence(hostarch.Addr(cb.Buf), bytes, usermem.IOOpts{
			AddressSpaceActive: false,
		})

	case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC, linux.IOCB_CMD_NOOP:
		return usermem.IOSequence{}, nil

	default:
		// Not a supported command.
		return usermem.IOSequence{}, linuxerr.EINVAL
	}
}

// IoCancel implements linux syscall io_cancel(2).
//
// It is not presently supported (ENOSYS indicates no support on this
// architecture).
func IoCancel(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	return 0, nil, linuxerr.ENOSYS
}
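
// For illustration only: IoGetevents blocks (subject to the optional timeout)
// for the first minEvents completions and then pops the rest non-blocking, so
// a guest reap loop typically looks like this hypothetical snippet (constants
// from golang.org/x/sys/unix; events is a guest-side []linux.IOEvent buffer;
// not part of the sentry):
//
//	ts := unix.Timespec{Sec: 1} // Give up after a second with whatever completed.
//	n, _, errno := unix.Syscall6(unix.SYS_IO_GETEVENTS, uintptr(id),
//		1 /* min_nr */, uintptr(len(events)),
//		uintptr(unsafe.Pointer(&events[0])), uintptr(unsafe.Pointer(&ts)), 0)
//	// n is the number of events copied out. A timeout with nothing reaped
//	// yields n == 0 rather than an error, mirroring the ETIMEDOUT branch above.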
// IoSubmit implements linux syscall io_submit(2).
func IoSubmit(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	id := args[0].Uint64()
	nrEvents := args[1].Int()
	addr := args[2].Pointer()

	if nrEvents < 0 {
		return 0, nil, linuxerr.EINVAL
	}

	for i := int32(0); i < nrEvents; i++ {
		// Copy in the callback address.
		var cbAddr hostarch.Addr
		switch t.Arch().Width() {
		case 8:
			var cbAddrP primitive.Uint64
			if _, err := cbAddrP.CopyIn(t, addr); err != nil {
				if i > 0 {
					// Some were successful.
					return uintptr(i), nil, nil
				}
				// Nothing done.
				return 0, nil, err
			}
			cbAddr = hostarch.Addr(cbAddrP)
		default:
			return 0, nil, linuxerr.ENOSYS
		}

		// Copy in this callback.
		var cb linux.IOCallback
		if _, err := cb.CopyIn(t, cbAddr); err != nil {
			if i > 0 {
				// Some were successful.
				return uintptr(i), nil, nil
			}
			// Nothing done.
			return 0, nil, err
		}

		// Process this callback.
		if err := submitCallback(t, id, &cb, cbAddr); err != nil {
			if i > 0 {
				// Partial success.
				return uintptr(i), nil, nil
			}
			// Nothing done.
			return 0, nil, err
		}

		// Advance to the next one.
		addr += hostarch.Addr(t.Arch().Width())
	}

	return uintptr(nrEvents), nil, nil
}

// submitCallback processes a single callback.
func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr hostarch.Addr) error {
	if cb.Reserved2 != 0 {
		return linuxerr.EINVAL
	}

	fd := t.GetFile(cb.FD)
	if fd == nil {
		return linuxerr.EBADF
	}
	defer fd.DecRef(t)

	// Was there an eventFD? Extract it.
	var eventFD *vfs.FileDescription
	if cb.Flags&linux.IOCB_FLAG_RESFD != 0 {
		eventFD = t.GetFile(cb.ResFD)
		if eventFD == nil {
			return linuxerr.EBADF
		}
		defer eventFD.DecRef(t)

		// Check that it is an eventfd.
		if _, ok := eventFD.Impl().(*eventfd.EventFileDescription); !ok {
			return linuxerr.EINVAL
		}
	}

	ioseq, err := memoryFor(t, cb)
	if err != nil {
		return err
	}

	// Check offset for reads/writes.
	switch cb.OpCode {
	case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV:
		if cb.Offset < 0 {
			return linuxerr.EINVAL
		}
	}

	// Prepare the request.
	aioCtx, ok := t.MemoryManager().LookupAIOContext(t, id)
	if !ok {
		return linuxerr.EINVAL
	}
	if err := aioCtx.Prepare(); err != nil {
		return err
	}

	if eventFD != nil {
		// The request is set. Make sure there's a ref on the file.
		//
		// This is necessary when the callback executes on completion,
		// which is also what will release this reference.
		eventFD.IncRef()
	}

	// Perform the request asynchronously.
	fd.IncRef()
	t.QueueAIO(getAIOCallback(t, fd, eventFD, cbAddr, cb, ioseq, aioCtx))
	return nil
}
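
// For illustration only: the fields submitCallback validates correspond to a
// guest iocb filled in roughly like this hypothetical snippet (linux here is
// the abi/linux package imported above; dataFD, eventFD, and buf are
// guest-side values, not sentry state):
//
//	cb := linux.IOCallback{
//		OpCode: linux.IOCB_CMD_PWRITE,
//		FD:     int32(dataFD),         // Must be a valid descriptor, else EBADF.
//		Buf:    uint64(uintptr(unsafe.Pointer(&buf[0]))),
//		Bytes:  uint64(len(buf)),      // Must fit in ssize_t, per memoryFor.
//		Offset: 0,                     // Must be non-negative for reads/writes.
//		Flags:  linux.IOCB_FLAG_RESFD, // Request completion notification...
//		ResFD:  int32(eventFD),        // ...on this eventfd, else EBADF/EINVAL.
//	}
//	// Reserved2 must stay zero or submitCallback rejects the iocb with EINVAL.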
func getAIOCallback(t *kernel.Task, fd, eventFD *vfs.FileDescription, cbAddr hostarch.Addr, cb *linux.IOCallback, ioseq usermem.IOSequence, aioCtx *mm.AIOContext) kernel.AIOCallback {
	return func(ctx context.Context) {
		// Release references after completing the callback.
		defer fd.DecRef(ctx)
		if eventFD != nil {
			defer eventFD.DecRef(ctx)
		}

		if aioCtx.Dead() {
			aioCtx.CancelPendingRequest()
			return
		}
		ev := &linux.IOEvent{
			Data: cb.Data,
			Obj:  uint64(cbAddr),
		}

		var err error
		switch cb.OpCode {
		case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV:
			ev.Result, err = fd.PRead(ctx, ioseq, cb.Offset, vfs.ReadOptions{})
		case linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV:
			ev.Result, err = fd.PWrite(ctx, ioseq, cb.Offset, vfs.WriteOptions{})
		case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC:
			err = fd.Sync(ctx)
		}

		// Update the result.
		if err != nil {
			err = HandleIOError(ctx, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", fd)
			ev.Result = -int64(kernel.ExtractErrno(err, 0))
		}

		// Queue the result for delivery.
		aioCtx.FinishRequest(ev)

		// Notify the event file if one was specified. This needs to happen
		// *after* queueing the result to avoid racing with the thread we may
		// wake up.
		if eventFD != nil {
			eventFD.Impl().(*eventfd.EventFileDescription).Signal(1)
		}
	}
}
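
// For illustration only: because getAIOCallback queues the IOEvent before
// signaling the eventfd, a guest that wakes on the eventfd can reap right
// away. ev.Result carries either a byte count or a negated errno, which a
// guest decodes roughly like this hypothetical snippet (unix is
// golang.org/x/sys/unix):
//
//	if ev.Result < 0 {
//		err := unix.Errno(-ev.Result) // e.g. unix.EIO on a failed write.
//		log.Printf("aio request %#x failed: %v", ev.Obj, err)
//	} else {
//		n := ev.Result // Bytes transferred for PREAD*/PWRITE*, 0 for syncs.
//		log.Printf("aio request %#x completed %d bytes", ev.Obj, n)
//	}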