github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/mm/aio_context.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mm

import (
	"github.com/SagerNet/gvisor/pkg/abi/linux"
	"github.com/SagerNet/gvisor/pkg/context"
	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
	"github.com/SagerNet/gvisor/pkg/hostarch"
	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
	"github.com/SagerNet/gvisor/pkg/sentry/pgalloc"
	"github.com/SagerNet/gvisor/pkg/sentry/usage"
	"github.com/SagerNet/gvisor/pkg/sync"
	"github.com/SagerNet/gvisor/pkg/syserror"
	"github.com/SagerNet/gvisor/pkg/usermem"
)

// aioManager creates and manages asynchronous I/O contexts.
//
// +stateify savable
type aioManager struct {
	// mu protects below.
	mu sync.Mutex `state:"nosave"`

	// contexts is the set of asynchronous I/O contexts.
	contexts map[uint64]*AIOContext
}

// destroyAIOManager destroys all remaining contexts, unmapping their ring
// buffers.
func (mm *MemoryManager) destroyAIOManager(ctx context.Context) {
	mm.aioManager.mu.Lock()
	defer mm.aioManager.mu.Unlock()

	for id := range mm.aioManager.contexts {
		mm.destroyAIOContextLocked(ctx, id)
	}
}

// newAIOContext creates a new context for asynchronous I/O.
//
// Returns false if 'id' is currently in use.
func (a *aioManager) newAIOContext(events uint32, id uint64) bool {
	a.mu.Lock()
	defer a.mu.Unlock()

	if _, ok := a.contexts[id]; ok {
		return false
	}

	a.contexts[id] = &AIOContext{
		requestReady:   make(chan struct{}, 1),
		maxOutstanding: events,
	}
	return true
}

// destroyAIOContextLocked destroys an asynchronous I/O context. It doesn't
// wait for pending requests to complete. Returns the destroyed AIOContext so
// it can be drained.
//
// Nil is returned if the context does not exist.
//
// Precondition: mm.aioManager.mu is locked.
func (mm *MemoryManager) destroyAIOContextLocked(ctx context.Context, id uint64) *AIOContext {
	aioCtx, ok := mm.aioManager.contexts[id]
	if !ok {
		return nil
	}

	// Only unmap after it is assured that the address is a valid AIO context,
	// to prevent random memory from being unmapped.
	//
	// Note: It's possible to unmap this address and map something else into
	// the same address. Then it would be unmapping memory that it doesn't
	// own. This is, however, the way Linux implements AIO. Keep the same
	// [weird] semantics in case anyone relies on it.
	mm.MUnmap(ctx, hostarch.Addr(id), aioRingBufferSize)

	delete(mm.aioManager.contexts, id)
	aioCtx.destroy()
	return aioCtx
}

// lookupAIOContext looks up the given context.
//
// Returns false if the context does not exist.
func (a *aioManager) lookupAIOContext(id uint64) (*AIOContext, bool) {
	a.mu.Lock()
	defer a.mu.Unlock()
	ctx, ok := a.contexts[id]
	return ctx, ok
}
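// Illustrative sketch (not part of the original file): how the manager-level
// helpers above compose. IDs are chosen by the caller (NewAIOContext below
// uses the ring buffer's address), and creation fails rather than clobbering
// a live context that already owns the ID.
func exampleManagerRoundTrip(a *aioManager, events uint32, id uint64) (*AIOContext, bool) {
	if !a.newAIOContext(events, id) {
		return nil, false // id is already in use
	}
	return a.lookupAIOContext(id)
}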
// ioResult is a completed I/O operation.
//
// +stateify savable
type ioResult struct {
	data interface{}
	ioEntry
}

// AIOContext is a single asynchronous I/O context.
//
// +stateify savable
type AIOContext struct {
	// requestReady is the notification channel used for all requests.
	requestReady chan struct{} `state:"nosave"`

	// mu protects below.
	mu sync.Mutex `state:"nosave"`

	// results is the set of completed requests.
	results ioList

	// maxOutstanding is the maximum number of outstanding entries; this value
	// is immutable.
	maxOutstanding uint32

	// outstanding is the number of requests outstanding; this is effectively
	// the number of entries in the results list plus the number of entries
	// expected to be added to it.
	outstanding uint32

	// dead is set when the context is destroyed.
	dead bool `state:"zerovalue"`
}

// destroy marks the context dead.
func (ctx *AIOContext) destroy() {
	ctx.mu.Lock()
	defer ctx.mu.Unlock()
	ctx.dead = true
	ctx.checkForDone()
}

// Preconditions: ctx.mu must be held by caller.
func (ctx *AIOContext) checkForDone() {
	if ctx.dead && ctx.outstanding == 0 {
		close(ctx.requestReady)
		ctx.requestReady = nil
	}
}

// Prepare reserves space for a new request, returning nil if available.
// Returns EAGAIN if the context is busy and EINVAL if the context is dead.
func (ctx *AIOContext) Prepare() error {
	ctx.mu.Lock()
	defer ctx.mu.Unlock()
	if ctx.dead {
		// Context died after the caller looked it up.
		return linuxerr.EINVAL
	}
	if ctx.outstanding >= ctx.maxOutstanding {
		// Context is busy.
		return linuxerr.EAGAIN
	}
	ctx.outstanding++
	return nil
}

// PopRequest pops a completed request if available; it does not block.
// Returns false if no request is available.
func (ctx *AIOContext) PopRequest() (interface{}, bool) {
	ctx.mu.Lock()
	defer ctx.mu.Unlock()

	// Is there anything ready?
	if e := ctx.results.Front(); e != nil {
		if ctx.outstanding == 0 {
			panic("AIOContext outstanding is going negative")
		}
		ctx.outstanding--
		ctx.results.Remove(e)
		ctx.checkForDone()
		return e.data, true
	}
	return nil, false
}

// FinishRequest finishes a pending request. It queues up the data
// and notifies listeners.
func (ctx *AIOContext) FinishRequest(data interface{}) {
	ctx.mu.Lock()
	defer ctx.mu.Unlock()

	// Push to the list and notify opportunistically. The channel notify
	// here is guaranteed to be safe because outstanding must be non-zero.
	// The requestReady channel is only closed when outstanding reaches zero.
	ctx.results.PushBack(&ioResult{data: data})

	select {
	case ctx.requestReady <- struct{}{}:
	default:
	}
}

// WaitChannel returns a channel that is notified when an AIO request is
// completed. Returns nil if the context is destroyed and there are no more
// outstanding requests.
func (ctx *AIOContext) WaitChannel() chan struct{} {
	ctx.mu.Lock()
	defer ctx.mu.Unlock()
	return ctx.requestReady
}

// Dead returns true if the context has been destroyed.
func (ctx *AIOContext) Dead() bool {
	ctx.mu.Lock()
	defer ctx.mu.Unlock()
	return ctx.dead
}

// CancelPendingRequest forgets about a request that hasn't yet completed.
func (ctx *AIOContext) CancelPendingRequest() {
	ctx.mu.Lock()
	defer ctx.mu.Unlock()

	if ctx.outstanding == 0 {
		panic("AIOContext outstanding is going negative")
	}
	ctx.outstanding--
	ctx.checkForDone()
}

// Drain drops all completed requests. Pending requests remain untouched.
func (ctx *AIOContext) Drain() {
	ctx.mu.Lock()
	defer ctx.mu.Unlock()

	if ctx.outstanding == 0 {
		return
	}
	size := uint32(ctx.results.Len())
	if ctx.outstanding < size {
		panic("AIOContext outstanding is going negative")
	}
	ctx.outstanding -= size
	ctx.results.Reset()
	ctx.checkForDone()
}
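// Illustrative sketch (not part of the original file): the accounting
// protocol above from the submitter's side. Prepare reserves an outstanding
// slot; if the request can never complete, CancelPendingRequest must return
// the slot so that outstanding does not leak; otherwise the completion path
// ends with FinishRequest. dispatch is a hypothetical stand-in for the real
// I/O machinery, which invokes the callback once the operation completes.
func exampleSubmit(aioCtx *AIOContext, dispatch func(onDone func(data interface{})) error) error {
	if err := aioCtx.Prepare(); err != nil {
		return err // EINVAL if dead, EAGAIN if at maxOutstanding
	}
	if err := dispatch(aioCtx.FinishRequest); err != nil {
		// The request will never reach FinishRequest; give the slot back.
		aioCtx.CancelPendingRequest()
		return err
	}
	return nil
}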
// aioMappable implements memmap.MappingIdentity and memmap.Mappable for AIO
// ring buffers.
//
// +stateify savable
type aioMappable struct {
	aioMappableRefs

	mfp pgalloc.MemoryFileProvider
	fr  memmap.FileRange
}

var aioRingBufferSize = uint64(hostarch.Addr(linux.AIORingSize).MustRoundUp())

func newAIOMappable(mfp pgalloc.MemoryFileProvider) (*aioMappable, error) {
	fr, err := mfp.MemoryFile().Allocate(aioRingBufferSize, usage.Anonymous)
	if err != nil {
		return nil, err
	}
	m := aioMappable{mfp: mfp, fr: fr}
	m.InitRefs()
	return &m, nil
}

// DecRef implements refs.RefCounter.DecRef.
func (m *aioMappable) DecRef(ctx context.Context) {
	m.aioMappableRefs.DecRef(func() {
		m.mfp.MemoryFile().DecRef(m.fr)
	})
}

// MappedName implements memmap.MappingIdentity.MappedName.
func (m *aioMappable) MappedName(ctx context.Context) string {
	return "[aio]"
}

// DeviceID implements memmap.MappingIdentity.DeviceID.
func (m *aioMappable) DeviceID() uint64 {
	return 0
}

// InodeID implements memmap.MappingIdentity.InodeID.
func (m *aioMappable) InodeID() uint64 {
	return 0
}

// Msync implements memmap.MappingIdentity.Msync.
func (m *aioMappable) Msync(ctx context.Context, mr memmap.MappableRange) error {
	// Linux: aio_ring_fops.fsync == NULL
	return linuxerr.EINVAL
}

// AddMapping implements memmap.Mappable.AddMapping.
func (m *aioMappable) AddMapping(_ context.Context, _ memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, _ bool) error {
	// Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap()
	// sets VM_DONTEXPAND).
	if offset != 0 || uint64(ar.Length()) != aioRingBufferSize {
		return syserror.EFAULT
	}
	return nil
}

// RemoveMapping implements memmap.Mappable.RemoveMapping.
func (m *aioMappable) RemoveMapping(context.Context, memmap.MappingSpace, hostarch.AddrRange, uint64, bool) {
}
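// Illustrative sketch (not part of the original file): the VM_DONTEXPAND-style
// check that AddMapping above and CopyMapping below both apply. A mapping of
// the ring is accepted only if it covers the entire buffer at offset zero.
func exampleRingMappingAllowed(ar hostarch.AddrRange, offset uint64) bool {
	return offset == 0 && uint64(ar.Length()) == aioRingBufferSize
}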
// CopyMapping implements memmap.Mappable.CopyMapping.
func (m *aioMappable) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, _ bool) error {
	// Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap()
	// sets VM_DONTEXPAND).
	if offset != 0 || uint64(dstAR.Length()) != aioRingBufferSize {
		return syserror.EFAULT
	}
	// Require that the mapping correspond to a live AIOContext. Compare
	// Linux's fs/aio.c:aio_ring_mremap().
	mm, ok := ms.(*MemoryManager)
	if !ok {
		return linuxerr.EINVAL
	}
	am := &mm.aioManager
	am.mu.Lock()
	defer am.mu.Unlock()
	oldID := uint64(srcAR.Start)
	aioCtx, ok := am.contexts[oldID]
	if !ok {
		return linuxerr.EINVAL
	}
	aioCtx.mu.Lock()
	defer aioCtx.mu.Unlock()
	if aioCtx.dead {
		return linuxerr.EINVAL
	}
	// Use the new ID for the AIOContext.
	am.contexts[uint64(dstAR.Start)] = aioCtx
	delete(am.contexts, oldID)
	return nil
}

// Translate implements memmap.Mappable.Translate.
func (m *aioMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) {
	var err error
	if required.End > m.fr.Length() {
		err = &memmap.BusError{syserror.EFAULT}
	}
	if source := optional.Intersect(memmap.MappableRange{0, m.fr.Length()}); source.Length() != 0 {
		return []memmap.Translation{
			{
				Source: source,
				File:   m.mfp.MemoryFile(),
				Offset: m.fr.Start + source.Start,
				Perms:  hostarch.AnyAccess,
			},
		}, err
	}
	return nil, err
}

// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
func (m *aioMappable) InvalidateUnsavable(ctx context.Context) error {
	return nil
}

// NewAIOContext creates a new context for asynchronous I/O.
//
// NewAIOContext is analogous to Linux's fs/aio.c:ioctx_alloc().
func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint64, error) {
	// libaio io_getevents() expects the context "handle" to be a valid
	// address: libaio peeks inside looking for a magic number. This function
	// allocates a page per context and keeps it zeroed to ensure it will not
	// match AIO_RING_MAGIC, which keeps libaio happy.
	m, err := newAIOMappable(mm.mfp)
	if err != nil {
		return 0, err
	}
	defer m.DecRef(ctx)
	addr, err := mm.MMap(ctx, memmap.MMapOpts{
		Length:          aioRingBufferSize,
		MappingIdentity: m,
		Mappable:        m,
		// Linux uses "do_mmap_pgoff(..., PROT_READ | PROT_WRITE, ...)" in
		// fs/aio.c:aio_setup_ring(). Since we don't implement AIO_RING_MAGIC,
		// user mode should not write to this page.
		Perms:    hostarch.Read,
		MaxPerms: hostarch.Read,
	})
	if err != nil {
		return 0, err
	}
	id := uint64(addr)
	if !mm.aioManager.newAIOContext(events, id) {
		mm.MUnmap(ctx, addr, aioRingBufferSize)
		return 0, linuxerr.EINVAL
	}
	return id, nil
}

// DestroyAIOContext destroys an asynchronous I/O context. It returns the
// destroyed context, or nil if the context does not exist.
func (mm *MemoryManager) DestroyAIOContext(ctx context.Context, id uint64) *AIOContext {
	if !mm.isValidAddr(ctx, id) {
		return nil
	}

	mm.aioManager.mu.Lock()
	defer mm.aioManager.mu.Unlock()
	return mm.destroyAIOContextLocked(ctx, id)
}

// LookupAIOContext looks up the given context. It returns false if the
// context does not exist.
func (mm *MemoryManager) LookupAIOContext(ctx context.Context, id uint64) (*AIOContext, bool) {
	aioCtx, ok := mm.aioManager.lookupAIOContext(id)
	if !ok {
		return nil, false
	}

	// Protect against 'id' that is inaccessible.
	if !mm.isValidAddr(ctx, id) {
		return nil, false
	}

	return aioCtx, true
}
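// Illustrative sketch (not part of the original file): how io_setup(2)- and
// io_destroy(2)-style paths would drive the MemoryManager API above. The ID
// doubles as the ring buffer's address, which is why isValidAddr below can
// vet an ID by probing the address.
func exampleSetupTeardown(ctx context.Context, mm *MemoryManager, nrEvents uint32) error {
	id, err := mm.NewAIOContext(ctx, nrEvents)
	if err != nil {
		return err
	}
	aioCtx := mm.DestroyAIOContext(ctx, id)
	if aioCtx == nil {
		return linuxerr.EINVAL // already destroyed or never existed
	}
	// Drop results that completed but were never read; in-flight requests
	// still hold the context until they finish or are cancelled.
	aioCtx.Drain()
	return nil
}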
// isValidAddr determines if the address `id` is valid. (Linux also reads 4
// bytes from id.)
func (mm *MemoryManager) isValidAddr(ctx context.Context, id uint64) bool {
	var buf [4]byte
	_, err := mm.CopyIn(ctx, hostarch.Addr(id), buf[:], usermem.IOOpts{})
	return err == nil
}
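// Illustrative sketch (not part of the original file): an io_getevents(2)-style
// read combining LookupAIOContext with the completion-wait pattern. The bare
// channel receive stands in for the sentry's task-blocking primitives, which
// also handle deadlines and interruption. Because FinishRequest buffers a
// token in requestReady, the recheck-then-wait loop cannot miss a wakeup.
func exampleGetEvent(ctx context.Context, mm *MemoryManager, id uint64) (interface{}, error) {
	aioCtx, ok := mm.LookupAIOContext(ctx, id)
	if !ok {
		return nil, linuxerr.EINVAL
	}
	for {
		if v, ok := aioCtx.PopRequest(); ok {
			return v, nil
		}
		ch := aioCtx.WaitChannel()
		if ch == nil {
			return nil, linuxerr.EINVAL // destroyed, nothing outstanding
		}
		<-ch // wait for FinishRequest (or for the channel to be closed)
	}
}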