github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/mm/aio_context.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mm

import (
	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
	"github.com/nicocha30/gvisor-ligolo/pkg/context"
	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
	"github.com/nicocha30/gvisor-ligolo/pkg/hostarch"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/memmap"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/pgalloc"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/usage"
	"github.com/nicocha30/gvisor-ligolo/pkg/usermem"
)

// aioManager creates and manages asynchronous I/O contexts.
//
// +stateify savable
type aioManager struct {
	// mu protects below.
	mu aioManagerMutex `state:"nosave"`

	// contexts is the set of asynchronous I/O contexts.
	contexts map[uint64]*AIOContext
}

func (mm *MemoryManager) destroyAIOManager(ctx context.Context) {
	mm.aioManager.mu.Lock()
	defer mm.aioManager.mu.Unlock()

	for id := range mm.aioManager.contexts {
		mm.destroyAIOContextLocked(ctx, id)
	}
}

// newAIOContext creates a new context for asynchronous I/O.
//
// Returns false if 'id' is currently in use.
func (a *aioManager) newAIOContext(events uint32, id uint64) bool {
	a.mu.Lock()
	defer a.mu.Unlock()

	if _, ok := a.contexts[id]; ok {
		return false
	}

	a.contexts[id] = &AIOContext{
		requestReady:   make(chan struct{}, 1),
		maxOutstanding: events,
	}
	return true
}

// destroyAIOContextLocked destroys an asynchronous I/O context. It doesn't
// wait for pending requests to complete. Returns the destroyed AIOContext so
// it can be drained.
//
// Nil is returned if the context does not exist.
//
// Precondition: mm.aioManager.mu is locked.
func (mm *MemoryManager) destroyAIOContextLocked(ctx context.Context, id uint64) *AIOContext {
	aioCtx, ok := mm.aioManager.contexts[id]
	if !ok {
		return nil
	}

	delete(mm.aioManager.contexts, id)
	aioCtx.destroy()
	return aioCtx
}

// lookupAIOContext looks up the given context.
//
// Returns false if the context does not exist.
func (a *aioManager) lookupAIOContext(id uint64) (*AIOContext, bool) {
	a.mu.Lock()
	defer a.mu.Unlock()
	ctx, ok := a.contexts[id]
	return ctx, ok
}

// ioResult is a completed I/O operation.
//
// +stateify savable
type ioResult struct {
	data any
	ioEntry
}

// AIOContext is a single asynchronous I/O context.
//
// +stateify savable
type AIOContext struct {
	// requestReady is the notification channel used for all requests.
	requestReady chan struct{} `state:"nosave"`

	// mu protects below.
	mu aioContextMutex `state:"nosave"`

	// results is the set of completed requests.
	results ioList

	// maxOutstanding is the maximum number of outstanding entries; this value
	// is immutable.
	maxOutstanding uint32

	// outstanding is the number of requests outstanding; this will effectively
	// be the number of entries in the result list or that are expected to be
	// added to the result list.
	outstanding uint32

	// dead is set when the context is destroyed.
	dead bool `state:"zerovalue"`
}

// destroy marks the context dead.
func (ctx *AIOContext) destroy() {
	ctx.mu.Lock()
	defer ctx.mu.Unlock()
	ctx.dead = true
	ctx.checkForDone()
}

// Preconditions: ctx.mu must be held by caller.
func (ctx *AIOContext) checkForDone() {
	if ctx.dead && ctx.outstanding == 0 {
		close(ctx.requestReady)
		ctx.requestReady = nil
	}
}

// Prepare reserves space for a new request, returning nil if available.
// Returns EAGAIN if the context is busy and EINVAL if the context is dead.
func (ctx *AIOContext) Prepare() error {
	ctx.mu.Lock()
	defer ctx.mu.Unlock()
	if ctx.dead {
		// Context died after the caller looked it up.
		return linuxerr.EINVAL
	}
	if ctx.outstanding >= ctx.maxOutstanding {
		// Context is busy.
		return linuxerr.EAGAIN
	}
	ctx.outstanding++
	return nil
}

// PopRequest pops a completed request if available; it does not block.
// Returns false if no request is available.
func (ctx *AIOContext) PopRequest() (any, bool) {
	ctx.mu.Lock()
	defer ctx.mu.Unlock()

	// Is there anything ready?
	if e := ctx.results.Front(); e != nil {
		if ctx.outstanding == 0 {
			panic("AIOContext outstanding is going negative")
		}
		ctx.outstanding--
		ctx.results.Remove(e)
		ctx.checkForDone()
		return e.data, true
	}
	return nil, false
}

// FinishRequest finishes a pending request. It queues up the data
// and notifies listeners.
func (ctx *AIOContext) FinishRequest(data any) {
	ctx.mu.Lock()
	defer ctx.mu.Unlock()

	// Push to the list and notify opportunistically. The channel notify
	// here is guaranteed to be safe because outstanding must be non-zero.
	// The requestReady channel is only closed when outstanding reaches zero.
	ctx.results.PushBack(&ioResult{data: data})

	select {
	case ctx.requestReady <- struct{}{}:
	default:
	}
}

// WaitChannel returns a channel that is notified when an AIO request is
// completed. Returns nil if the context is destroyed and there are no more
// outstanding requests.
func (ctx *AIOContext) WaitChannel() chan struct{} {
	ctx.mu.Lock()
	defer ctx.mu.Unlock()
	return ctx.requestReady
}

// Dead returns true if the context has been destroyed.
func (ctx *AIOContext) Dead() bool {
	ctx.mu.Lock()
	defer ctx.mu.Unlock()
	return ctx.dead
}

// CancelPendingRequest forgets about a request that hasn't yet completed.
func (ctx *AIOContext) CancelPendingRequest() {
	ctx.mu.Lock()
	defer ctx.mu.Unlock()

	if ctx.outstanding == 0 {
		panic("AIOContext outstanding is going negative")
	}
	ctx.outstanding--
	ctx.checkForDone()
}

// Drain drops all completed requests. Pending requests remain untouched.
func (ctx *AIOContext) Drain() {
	ctx.mu.Lock()
	defer ctx.mu.Unlock()

	if ctx.outstanding == 0 {
		return
	}
	size := uint32(ctx.results.Len())
	if ctx.outstanding < size {
		panic("AIOContext outstanding is going negative")
	}
	ctx.outstanding -= size
	ctx.results.Reset()
	ctx.checkForDone()
}
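// The two helpers below are an illustrative sketch and are not part of the
// original file: they show how a submitter and a reaper are expected to drive
// an AIOContext, assuming the caller (e.g. an io_submit/io_getevents-style
// syscall layer) already holds the context. The names exampleSubmit and
// exampleReap are hypothetical, not gVisor APIs.

// exampleSubmit reserves a slot for one request and completes it
// asynchronously. Prepare fails with EAGAIN when maxOutstanding is reached
// and with EINVAL once the context has been destroyed.
func exampleSubmit(aioCtx *AIOContext, issue func() any) error {
	if err := aioCtx.Prepare(); err != nil {
		return err
	}
	go func() {
		// FinishRequest queues the completion and wakes any waiter on
		// WaitChannel. A submitter that fails to issue the request at all
		// would call CancelPendingRequest instead to release the slot.
		aioCtx.FinishRequest(issue())
	}()
	return nil
}

// exampleReap returns one completed request, blocking until one is
// available. It returns false once the context is dead and fully drained.
func exampleReap(aioCtx *AIOContext) (any, bool) {
	for {
		if data, ok := aioCtx.PopRequest(); ok {
			return data, true
		}
		ch := aioCtx.WaitChannel()
		if ch == nil {
			// Context was destroyed and has no outstanding requests.
			return nil, false
		}
		// Wait for the next completion (or for the channel to be closed by
		// checkForDone when the context dies with nothing outstanding).
		<-ch
	}
}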
// aioMappable implements memmap.MappingIdentity and memmap.Mappable for AIO
// ring buffers.
//
// +stateify savable
type aioMappable struct {
	aioMappableRefs

	mfp pgalloc.MemoryFileProvider
	fr  memmap.FileRange
}

var aioRingBufferSize = uint64(hostarch.Addr(linux.AIORingSize).MustRoundUp())

func newAIOMappable(ctx context.Context, mfp pgalloc.MemoryFileProvider) (*aioMappable, error) {
	fr, err := mfp.MemoryFile().Allocate(aioRingBufferSize, pgalloc.AllocOpts{Kind: usage.Anonymous, MemCgID: pgalloc.MemoryCgroupIDFromContext(ctx)})
	if err != nil {
		return nil, err
	}
	m := aioMappable{mfp: mfp, fr: fr}
	m.InitRefs()
	return &m, nil
}

// DecRef implements refs.RefCounter.DecRef.
func (m *aioMappable) DecRef(ctx context.Context) {
	m.aioMappableRefs.DecRef(func() {
		m.mfp.MemoryFile().DecRef(m.fr)
	})
}

// MappedName implements memmap.MappingIdentity.MappedName.
func (m *aioMappable) MappedName(ctx context.Context) string {
	return "[aio]"
}

// DeviceID implements memmap.MappingIdentity.DeviceID.
func (m *aioMappable) DeviceID() uint64 {
	return 0
}

// InodeID implements memmap.MappingIdentity.InodeID.
func (m *aioMappable) InodeID() uint64 {
	return 0
}

// Msync implements memmap.MappingIdentity.Msync.
func (m *aioMappable) Msync(ctx context.Context, mr memmap.MappableRange) error {
	// Linux: aio_ring_fops.fsync == NULL
	return linuxerr.EINVAL
}

// AddMapping implements memmap.Mappable.AddMapping.
func (m *aioMappable) AddMapping(_ context.Context, _ memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, _ bool) error {
	// Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap()
	// sets VM_DONTEXPAND).
	if offset != 0 || uint64(ar.Length()) != aioRingBufferSize {
		return linuxerr.EFAULT
	}
	return nil
}

// RemoveMapping implements memmap.Mappable.RemoveMapping.
func (m *aioMappable) RemoveMapping(context.Context, memmap.MappingSpace, hostarch.AddrRange, uint64, bool) {
}

// CopyMapping implements memmap.Mappable.CopyMapping.
func (m *aioMappable) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, _ bool) error {
	// Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap()
	// sets VM_DONTEXPAND).
	if offset != 0 || uint64(dstAR.Length()) != aioRingBufferSize {
		return linuxerr.EFAULT
	}
	// Require that the mapping correspond to a live AIOContext. Compare
	// Linux's fs/aio.c:aio_ring_mremap().
	mm, ok := ms.(*MemoryManager)
	if !ok {
		return linuxerr.EINVAL
	}
	am := &mm.aioManager
	am.mu.Lock()
	defer am.mu.Unlock()
	oldID := uint64(srcAR.Start)
	aioCtx, ok := am.contexts[oldID]
	if !ok {
		return linuxerr.EINVAL
	}
	aioCtx.mu.Lock()
	defer aioCtx.mu.Unlock()
	if aioCtx.dead {
		return linuxerr.EINVAL
	}
	// Use the new ID for the AIOContext.
	am.contexts[uint64(dstAR.Start)] = aioCtx
	delete(am.contexts, oldID)
	return nil
}

// Translate implements memmap.Mappable.Translate.
func (m *aioMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) {
	var err error
	if required.End > m.fr.Length() {
		err = &memmap.BusError{linuxerr.EFAULT}
	}
	if source := optional.Intersect(memmap.MappableRange{0, m.fr.Length()}); source.Length() != 0 {
		return []memmap.Translation{
			{
				Source: source,
				File:   m.mfp.MemoryFile(),
				Offset: m.fr.Start + source.Start,
				Perms:  hostarch.AnyAccess,
			},
		}, err
	}
	return nil, err
}

// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
func (m *aioMappable) InvalidateUnsavable(ctx context.Context) error {
	return nil
}

// NewAIOContext creates a new context for asynchronous I/O.
//
// NewAIOContext is analogous to Linux's fs/aio.c:ioctx_alloc().
func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint64, error) {
	// libaio get_ioevents() expects the context "handle" to be a valid
	// address, and peeks inside looking for a magic number. This function
	// allocates a page per context and keeps it zeroed so that it will not
	// match AIO_RING_MAGIC, which keeps libaio happy.
	m, err := newAIOMappable(ctx, mm.mfp)
	if err != nil {
		return 0, err
	}
	defer m.DecRef(ctx)
	addr, err := mm.MMap(ctx, memmap.MMapOpts{
		Length:          aioRingBufferSize,
		MappingIdentity: m,
		Mappable:        m,
		// Linux uses "do_mmap_pgoff(..., PROT_READ | PROT_WRITE, ...)" in
		// fs/aio.c:aio_setup_ring(). Since we don't implement AIO_RING_MAGIC,
		// user mode should not write to this page.
		Perms:    hostarch.Read,
		MaxPerms: hostarch.Read,
	})
	if err != nil {
		return 0, err
	}
	id := uint64(addr)
	if !mm.aioManager.newAIOContext(events, id) {
		mm.MUnmap(ctx, addr, aioRingBufferSize)
		return 0, linuxerr.EINVAL
	}
	return id, nil
}
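// The helper below is an illustrative sketch, not part of the original file:
// it shows how a caller (e.g. an io_setup-style syscall handler) might pair
// NewAIOContext with LookupAIOContext, relying on the fact that the returned
// id is simply the address of the read-only ring-buffer mapping. The name
// exampleIoSetup is hypothetical.
func exampleIoSetup(ctx context.Context, mm *MemoryManager, nrEvents uint32) (uint64, error) {
	id, err := mm.NewAIOContext(ctx, nrEvents)
	if err != nil {
		return 0, err
	}
	// The id doubles as the lookup key for the context, so it can be
	// resolved back to its AIOContext immediately.
	if _, ok := mm.LookupAIOContext(ctx, id); !ok {
		return 0, linuxerr.EINVAL
	}
	return id, nil
}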
// DestroyAIOContext destroys an asynchronous I/O context. It returns the
// destroyed context, or nil if the context does not exist.
func (mm *MemoryManager) DestroyAIOContext(ctx context.Context, id uint64) *AIOContext {
	if !mm.isValidAddr(ctx, id) {
		return nil
	}

	// Only unmap once it is assured that the address is a valid AIO context,
	// to prevent random memory from being unmapped.
	//
	// Note: It's possible to unmap this address and map something else into
	// the same address. Then it would be unmapping memory that it doesn't own.
	// This is, however, the way Linux implements AIO. Keep the same [weird]
	// semantics in case anyone relies on it.
	mm.MUnmap(ctx, hostarch.Addr(id), aioRingBufferSize)

	mm.aioManager.mu.Lock()
	defer mm.aioManager.mu.Unlock()
	return mm.destroyAIOContextLocked(ctx, id)
}
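// The helper below is an illustrative sketch, not part of the original file:
// it shows how a caller (e.g. an io_destroy-style syscall handler) might wait
// for a destroyed context to quiesce, since DestroyAIOContext itself does not
// wait for pending requests. The name exampleDrainDestroyed is hypothetical.
func exampleDrainDestroyed(ctx context.Context, mm *MemoryManager, id uint64) error {
	aioCtx := mm.DestroyAIOContext(ctx, id)
	if aioCtx == nil {
		// Unknown id.
		return linuxerr.EINVAL
	}
	for {
		// Drop completed results, then check whether anything is still
		// outstanding; WaitChannel returns nil once the dead context has
		// fully quiesced.
		aioCtx.Drain()
		ch := aioCtx.WaitChannel()
		if ch == nil {
			return nil
		}
		<-ch
	}
}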
// LookupAIOContext looks up the given context. It returns false if the
// context does not exist.
func (mm *MemoryManager) LookupAIOContext(ctx context.Context, id uint64) (*AIOContext, bool) {
	aioCtx, ok := mm.aioManager.lookupAIOContext(id)
	if !ok {
		return nil, false
	}

	// Protect against 'id' that is inaccessible.
	if !mm.isValidAddr(ctx, id) {
		return nil, false
	}

	return aioCtx, true
}

// isValidAddr determines if the address `id` is valid. (Linux also reads 4
// bytes from id.)
func (mm *MemoryManager) isValidAddr(ctx context.Context, id uint64) bool {
	var buf [4]byte
	_, err := mm.CopyIn(ctx, hostarch.Addr(id), buf[:], usermem.IOOpts{})
	return err == nil
}