// github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/shm/shm.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package shm implements sysv shared memory segments.
//
// Known missing features:
//
// - SHM_LOCK/SHM_UNLOCK are no-ops. The sentry currently doesn't implement
//   memory locking in general.
//
// - SHM_HUGETLB and related flags for shmget(2) are ignored. There's no easy
//   way to implement hugetlb support on a per-map basis, and it has no impact
//   on correctness.
//
// - SHM_NORESERVE for shmget(2) is ignored, the sentry doesn't implement swap
//   so it's meaningless to reserve space for swap.
//
// - No per-process segment size enforcement. This feature probably isn't used
//   much anyways, since Linux sets the per-process limits to the system-wide
//   limits by default.
32 // 33 // Lock ordering: mm.mappingMu -> shm registry lock -> shm lock 34 package shm 35 36 import ( 37 "fmt" 38 39 "github.com/SagerNet/gvisor/pkg/abi/linux" 40 "github.com/SagerNet/gvisor/pkg/context" 41 "github.com/SagerNet/gvisor/pkg/errors/linuxerr" 42 "github.com/SagerNet/gvisor/pkg/hostarch" 43 "github.com/SagerNet/gvisor/pkg/log" 44 "github.com/SagerNet/gvisor/pkg/sentry/fs" 45 "github.com/SagerNet/gvisor/pkg/sentry/kernel/auth" 46 ktime "github.com/SagerNet/gvisor/pkg/sentry/kernel/time" 47 "github.com/SagerNet/gvisor/pkg/sentry/memmap" 48 "github.com/SagerNet/gvisor/pkg/sentry/pgalloc" 49 "github.com/SagerNet/gvisor/pkg/sentry/usage" 50 "github.com/SagerNet/gvisor/pkg/sync" 51 "github.com/SagerNet/gvisor/pkg/syserror" 52 ) 53 54 // Key represents a shm segment key. Analogous to a file name. 55 type Key int32 56 57 // ID represents the opaque handle for a shm segment. Analogous to an fd. 58 type ID int32 59 60 // Registry tracks all shared memory segments in an IPC namespace. The registry 61 // provides the mechanisms for creating and finding segments, and reporting 62 // global shm parameters. 63 // 64 // +stateify savable 65 type Registry struct { 66 // userNS owns the IPC namespace this registry belong to. Immutable. 67 userNS *auth.UserNamespace 68 69 // mu protects all fields below. 70 mu sync.Mutex `state:"nosave"` 71 72 // shms maps segment ids to segments. 73 // 74 // shms holds all referenced segments, which are removed on the last 75 // DecRef. Thus, it cannot itself hold a reference on the Shm. 76 // 77 // Since removal only occurs after the last (unlocked) DecRef, there 78 // exists a short window during which a Shm still exists in Shm, but is 79 // unreferenced. Users must use TryIncRef to determine if the Shm is 80 // still valid. 81 shms map[ID]*Shm 82 83 // keysToShms maps segment keys to segments. 84 // 85 // Shms in keysToShms are guaranteed to be referenced, as they are 86 // removed by disassociateKey before the last DecRef. 
87 keysToShms map[Key]*Shm 88 89 // Sum of the sizes of all existing segments rounded up to page size, in 90 // units of page size. 91 totalPages uint64 92 93 // ID assigned to the last created segment. Used to quickly find the next 94 // unused ID. 95 lastIDUsed ID 96 } 97 98 // NewRegistry creates a new shm registry. 99 func NewRegistry(userNS *auth.UserNamespace) *Registry { 100 return &Registry{ 101 userNS: userNS, 102 shms: make(map[ID]*Shm), 103 keysToShms: make(map[Key]*Shm), 104 } 105 } 106 107 // FindByID looks up a segment given an ID. 108 // 109 // FindByID returns a reference on Shm. 110 func (r *Registry) FindByID(id ID) *Shm { 111 r.mu.Lock() 112 defer r.mu.Unlock() 113 s := r.shms[id] 114 // Take a reference on s. If TryIncRef fails, s has reached the last 115 // DecRef, but hasn't quite been removed from r.shms yet. 116 if s != nil && s.TryIncRef() { 117 return s 118 } 119 return nil 120 } 121 122 // dissociateKey removes the association between a segment and its key, 123 // preventing it from being discovered in the registry. This doesn't necessarily 124 // mean the segment is about to be destroyed. This is analogous to unlinking a 125 // file; the segment can still be used by a process already referencing it, but 126 // cannot be discovered by a new process. 127 func (r *Registry) dissociateKey(s *Shm) { 128 r.mu.Lock() 129 defer r.mu.Unlock() 130 s.mu.Lock() 131 defer s.mu.Unlock() 132 if s.key != linux.IPC_PRIVATE { 133 delete(r.keysToShms, s.key) 134 s.key = linux.IPC_PRIVATE 135 } 136 } 137 138 // FindOrCreate looks up or creates a segment in the registry. It's functionally 139 // analogous to open(2). 140 // 141 // FindOrCreate returns a reference on Shm. 
142 func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size uint64, mode linux.FileMode, private, create, exclusive bool) (*Shm, error) { 143 if (create || private) && (size < linux.SHMMIN || size > linux.SHMMAX) { 144 // "A new segment was to be created and size is less than SHMMIN or 145 // greater than SHMMAX." - man shmget(2) 146 // 147 // Note that 'private' always implies the creation of a new segment 148 // whether IPC_CREAT is specified or not. 149 return nil, linuxerr.EINVAL 150 } 151 152 r.mu.Lock() 153 defer r.mu.Unlock() 154 155 if len(r.shms) >= linux.SHMMNI { 156 // "All possible shared memory IDs have been taken (SHMMNI) ..." 157 // - man shmget(2) 158 return nil, syserror.ENOSPC 159 } 160 161 if !private { 162 // Look up an existing segment. 163 if shm := r.keysToShms[key]; shm != nil { 164 shm.mu.Lock() 165 defer shm.mu.Unlock() 166 167 // Check that caller can access the segment. 168 if !shm.checkPermissions(ctx, fs.PermsFromMode(mode)) { 169 // "The user does not have permission to access the shared 170 // memory segment, and does not have the CAP_IPC_OWNER 171 // capability in the user namespace that governs its IPC 172 // namespace." - man shmget(2) 173 return nil, linuxerr.EACCES 174 } 175 176 if size > shm.size { 177 // "A segment for the given key exists, but size is greater than 178 // the size of that segment." - man shmget(2) 179 return nil, linuxerr.EINVAL 180 } 181 182 if create && exclusive { 183 // "IPC_CREAT and IPC_EXCL were specified in shmflg, but a 184 // shared memory segment already exists for key." 185 // - man shmget(2) 186 return nil, syserror.EEXIST 187 } 188 189 shm.IncRef() 190 return shm, nil 191 } 192 193 if !create { 194 // "No segment exists for the given key, and IPC_CREAT was not 195 // specified." 
- man shmget(2) 196 return nil, syserror.ENOENT 197 } 198 } 199 200 var sizeAligned uint64 201 if val, ok := hostarch.Addr(size).RoundUp(); ok { 202 sizeAligned = uint64(val) 203 } else { 204 return nil, linuxerr.EINVAL 205 } 206 207 if numPages := sizeAligned / hostarch.PageSize; r.totalPages+numPages > linux.SHMALL { 208 // "... allocating a segment of the requested size would cause the 209 // system to exceed the system-wide limit on shared memory (SHMALL)." 210 // - man shmget(2) 211 return nil, syserror.ENOSPC 212 } 213 214 // Need to create a new segment. 215 creator := fs.FileOwnerFromContext(ctx) 216 perms := fs.FilePermsFromMode(mode) 217 s, err := r.newShm(ctx, pid, key, creator, perms, size) 218 if err != nil { 219 return nil, err 220 } 221 // The initial reference is held by s itself. Take another to return to 222 // the caller. 223 s.IncRef() 224 return s, nil 225 } 226 227 // newShm creates a new segment in the registry. 228 // 229 // Precondition: Caller must hold r.mu. 230 func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.FileOwner, perms fs.FilePermissions, size uint64) (*Shm, error) { 231 mfp := pgalloc.MemoryFileProviderFromContext(ctx) 232 if mfp == nil { 233 panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, pgalloc.CtxMemoryFileProvider)) 234 } 235 236 effectiveSize := uint64(hostarch.Addr(size).MustRoundUp()) 237 fr, err := mfp.MemoryFile().Allocate(effectiveSize, usage.Anonymous) 238 if err != nil { 239 return nil, err 240 } 241 242 shm := &Shm{ 243 mfp: mfp, 244 registry: r, 245 creator: creator, 246 size: size, 247 effectiveSize: effectiveSize, 248 fr: fr, 249 key: key, 250 perms: perms, 251 owner: creator, 252 creatorPID: pid, 253 changeTime: ktime.NowFromContext(ctx), 254 } 255 shm.InitRefs() 256 257 // Find the next available ID. 258 for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ { 259 // Handle wrap around. 
260 if id < 0 { 261 id = 0 262 continue 263 } 264 if r.shms[id] == nil { 265 r.lastIDUsed = id 266 267 shm.ID = id 268 r.shms[id] = shm 269 r.keysToShms[key] = shm 270 271 r.totalPages += effectiveSize / hostarch.PageSize 272 273 return shm, nil 274 } 275 } 276 277 log.Warningf("Shm ids exhuasted, they may be leaking") 278 return nil, syserror.ENOSPC 279 } 280 281 // IPCInfo reports global parameters for sysv shared memory segments on this 282 // system. See shmctl(IPC_INFO). 283 func (r *Registry) IPCInfo() *linux.ShmParams { 284 return &linux.ShmParams{ 285 ShmMax: linux.SHMMAX, 286 ShmMin: linux.SHMMIN, 287 ShmMni: linux.SHMMNI, 288 ShmSeg: linux.SHMSEG, 289 ShmAll: linux.SHMALL, 290 } 291 } 292 293 // ShmInfo reports linux-specific global parameters for sysv shared memory 294 // segments on this system. See shmctl(SHM_INFO). 295 func (r *Registry) ShmInfo() *linux.ShmInfo { 296 r.mu.Lock() 297 defer r.mu.Unlock() 298 299 return &linux.ShmInfo{ 300 UsedIDs: int32(r.lastIDUsed), 301 ShmTot: r.totalPages, 302 ShmRss: r.totalPages, // We could probably get a better estimate from memory accounting. 303 ShmSwp: 0, // No reclaim at the moment. 304 } 305 } 306 307 // remove deletes a segment from this registry, deaccounting the memory used by 308 // the segment. 309 // 310 // Precondition: Must follow a call to r.dissociateKey(s). 311 func (r *Registry) remove(s *Shm) { 312 r.mu.Lock() 313 defer r.mu.Unlock() 314 s.mu.Lock() 315 defer s.mu.Unlock() 316 317 if s.key != linux.IPC_PRIVATE { 318 panic(fmt.Sprintf("Attempted to remove %s from the registry whose key is still associated", s.debugLocked())) 319 } 320 321 delete(r.shms, s.ID) 322 r.totalPages -= s.effectiveSize / hostarch.PageSize 323 } 324 325 // Release drops the self-reference of each active shm segment in the registry. 326 // It is called when the kernel.IPCNamespace containing r is being destroyed. 
327 func (r *Registry) Release(ctx context.Context) { 328 // Because Shm.DecRef() may acquire the same locks, collect the segments to 329 // release first. Note that this should not race with any updates to r, since 330 // the IPC namespace containing it has no more references. 331 toRelease := make([]*Shm, 0) 332 r.mu.Lock() 333 for _, s := range r.keysToShms { 334 s.mu.Lock() 335 if !s.pendingDestruction { 336 toRelease = append(toRelease, s) 337 } 338 s.mu.Unlock() 339 } 340 r.mu.Unlock() 341 342 for _, s := range toRelease { 343 r.dissociateKey(s) 344 s.DecRef(ctx) 345 } 346 } 347 348 // Shm represents a single shared memory segment. 349 // 350 // Shm segments are backed directly by an allocation from platform memory. 351 // Segments are always mapped as a whole, greatly simplifying how mappings are 352 // tracked. However note that mremap and munmap calls may cause the vma for a 353 // segment to become fragmented; which requires special care when unmapping a 354 // segment. See mm/shm.go. 355 // 356 // Segments persist until they are explicitly marked for destruction via 357 // MarkDestroyed(). 358 // 359 // Shm implements memmap.Mappable and memmap.MappingIdentity. 360 // 361 // +stateify savable 362 type Shm struct { 363 // ShmRefs tracks the number of references to this segment. 364 // 365 // A segment holds a reference to itself until it is marked for 366 // destruction. 367 // 368 // In addition to direct users, the MemoryManager will hold references 369 // via MappingIdentity. 370 ShmRefs 371 372 mfp pgalloc.MemoryFileProvider 373 374 // registry points to the shm registry containing this segment. Immutable. 375 registry *Registry 376 377 // ID is the kernel identifier for this segment. Immutable. 378 ID ID 379 380 // creator is the user that created the segment. Immutable. 381 creator fs.FileOwner 382 383 // size is the requested size of the segment at creation, in 384 // bytes. Immutable. 
385 size uint64 386 387 // effectiveSize of the segment, rounding up to the next page 388 // boundary. Immutable. 389 // 390 // Invariant: effectiveSize must be a multiple of hostarch.PageSize. 391 effectiveSize uint64 392 393 // fr is the offset into mfp.MemoryFile() that backs this contents of this 394 // segment. Immutable. 395 fr memmap.FileRange 396 397 // mu protects all fields below. 398 mu sync.Mutex `state:"nosave"` 399 400 // key is the public identifier for this segment. 401 key Key 402 403 // perms is the access permissions for the segment. 404 perms fs.FilePermissions 405 406 // owner of this segment. 407 owner fs.FileOwner 408 // attachTime is updated on every successful shmat. 409 attachTime ktime.Time 410 // detachTime is updated on every successful shmdt. 411 detachTime ktime.Time 412 // changeTime is updated on every successful changes to the segment via 413 // shmctl(IPC_SET). 414 changeTime ktime.Time 415 416 // creatorPID is the PID of the process that created the segment. 417 creatorPID int32 418 // lastAttachDetachPID is the pid of the process that issued the last shmat 419 // or shmdt syscall. 420 lastAttachDetachPID int32 421 422 // pendingDestruction indicates the segment was marked as destroyed through 423 // shmctl(IPC_RMID). When marked as destroyed, the segment will not be found 424 // in the registry and can no longer be attached. When the last user 425 // detaches from the segment, it is destroyed. 426 pendingDestruction bool 427 } 428 429 // Precondition: Caller must hold s.mu. 430 func (s *Shm) debugLocked() string { 431 return fmt.Sprintf("Shm{id: %d, key: %d, size: %d bytes, refs: %d, destroyed: %v}", 432 s.ID, s.key, s.size, s.ReadRefs(), s.pendingDestruction) 433 } 434 435 // MappedName implements memmap.MappingIdentity.MappedName. 
436 func (s *Shm) MappedName(ctx context.Context) string { 437 s.mu.Lock() 438 defer s.mu.Unlock() 439 return fmt.Sprintf("SYSV%08d", s.key) 440 } 441 442 // DeviceID implements memmap.MappingIdentity.DeviceID. 443 func (s *Shm) DeviceID() uint64 { 444 return shmDevice.DeviceID() 445 } 446 447 // InodeID implements memmap.MappingIdentity.InodeID. 448 func (s *Shm) InodeID() uint64 { 449 // "shmid gets reported as "inode#" in /proc/pid/maps. proc-ps tools use 450 // this. Changing this will break them." -- Linux, ipc/shm.c:newseg() 451 return uint64(s.ID) 452 } 453 454 // DecRef drops a reference on s. 455 // 456 // Precondition: Caller must not hold s.mu. 457 func (s *Shm) DecRef(ctx context.Context) { 458 s.ShmRefs.DecRef(func() { 459 s.mfp.MemoryFile().DecRef(s.fr) 460 s.registry.remove(s) 461 }) 462 } 463 464 // Msync implements memmap.MappingIdentity.Msync. Msync is a no-op for shm 465 // segments. 466 func (s *Shm) Msync(context.Context, memmap.MappableRange) error { 467 return nil 468 } 469 470 // AddMapping implements memmap.Mappable.AddMapping. 471 func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ hostarch.AddrRange, _ uint64, _ bool) error { 472 s.mu.Lock() 473 defer s.mu.Unlock() 474 s.attachTime = ktime.NowFromContext(ctx) 475 if pid, ok := context.ThreadGroupIDFromContext(ctx); ok { 476 s.lastAttachDetachPID = pid 477 } else { 478 // AddMapping is called during a syscall, so ctx should always be a task 479 // context. 480 log.Warningf("Adding mapping to %s but couldn't get the current pid; not updating the last attach pid", s.debugLocked()) 481 } 482 return nil 483 } 484 485 // RemoveMapping implements memmap.Mappable.RemoveMapping. 486 func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ hostarch.AddrRange, _ uint64, _ bool) { 487 s.mu.Lock() 488 defer s.mu.Unlock() 489 // RemoveMapping may be called during task exit, when ctx 490 // is context.Background. Gracefully handle missing clocks. 
Failing to 491 // update the detach time in these cases is ok, since no one can observe the 492 // omission. 493 if clock := ktime.RealtimeClockFromContext(ctx); clock != nil { 494 s.detachTime = clock.Now() 495 } 496 497 // If called from a non-task context we also won't have a threadgroup 498 // id. Silently skip updating the lastAttachDetachPid in that case. 499 if pid, ok := context.ThreadGroupIDFromContext(ctx); ok { 500 s.lastAttachDetachPID = pid 501 } else { 502 log.Debugf("Couldn't obtain pid when removing mapping to %s, not updating the last detach pid.", s.debugLocked()) 503 } 504 } 505 506 // CopyMapping implements memmap.Mappable.CopyMapping. 507 func (*Shm) CopyMapping(context.Context, memmap.MappingSpace, hostarch.AddrRange, hostarch.AddrRange, uint64, bool) error { 508 return nil 509 } 510 511 // Translate implements memmap.Mappable.Translate. 512 func (s *Shm) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { 513 var err error 514 if required.End > s.fr.Length() { 515 err = &memmap.BusError{syserror.EFAULT} 516 } 517 if source := optional.Intersect(memmap.MappableRange{0, s.fr.Length()}); source.Length() != 0 { 518 return []memmap.Translation{ 519 { 520 Source: source, 521 File: s.mfp.MemoryFile(), 522 Offset: s.fr.Start + source.Start, 523 Perms: hostarch.AnyAccess, 524 }, 525 }, err 526 } 527 return nil, err 528 } 529 530 // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. 531 func (s *Shm) InvalidateUnsavable(ctx context.Context) error { 532 return nil 533 } 534 535 // AttachOpts describes various flags passed to shmat(2). 536 type AttachOpts struct { 537 Execute bool 538 Readonly bool 539 Remap bool 540 } 541 542 // ConfigureAttach creates an mmap configuration for the segment with the 543 // requested attach options. 544 // 545 // Postconditions: The returned MMapOpts are valid only as long as a reference 546 // continues to be held on s. 
547 func (s *Shm) ConfigureAttach(ctx context.Context, addr hostarch.Addr, opts AttachOpts) (memmap.MMapOpts, error) { 548 s.mu.Lock() 549 defer s.mu.Unlock() 550 if s.pendingDestruction && s.ReadRefs() == 0 { 551 return memmap.MMapOpts{}, syserror.EIDRM 552 } 553 554 if !s.checkPermissions(ctx, fs.PermMask{ 555 Read: true, 556 Write: !opts.Readonly, 557 Execute: opts.Execute, 558 }) { 559 // "The calling process does not have the required permissions for the 560 // requested attach type, and does not have the CAP_IPC_OWNER capability 561 // in the user namespace that governs its IPC namespace." - man shmat(2) 562 return memmap.MMapOpts{}, linuxerr.EACCES 563 } 564 return memmap.MMapOpts{ 565 Length: s.size, 566 Offset: 0, 567 Addr: addr, 568 Fixed: opts.Remap, 569 Perms: hostarch.AccessType{ 570 Read: true, 571 Write: !opts.Readonly, 572 Execute: opts.Execute, 573 }, 574 MaxPerms: hostarch.AnyAccess, 575 Mappable: s, 576 MappingIdentity: s, 577 }, nil 578 } 579 580 // EffectiveSize returns the size of the underlying shared memory segment. This 581 // may be larger than the requested size at creation, due to rounding to page 582 // boundaries. 583 func (s *Shm) EffectiveSize() uint64 { 584 return s.effectiveSize 585 } 586 587 // IPCStat returns information about a shm. See shmctl(IPC_STAT). 588 func (s *Shm) IPCStat(ctx context.Context) (*linux.ShmidDS, error) { 589 s.mu.Lock() 590 defer s.mu.Unlock() 591 592 // "The caller must have read permission on the shared memory segment." 593 // - man shmctl(2) 594 if !s.checkPermissions(ctx, fs.PermMask{Read: true}) { 595 // "IPC_STAT or SHM_STAT is requested and shm_perm.mode does not allow 596 // read access for shmid, and the calling process does not have the 597 // CAP_IPC_OWNER capability in the user namespace that governs its IPC 598 // namespace." 
- man shmctl(2) 599 return nil, linuxerr.EACCES 600 } 601 602 var mode uint16 603 if s.pendingDestruction { 604 mode |= linux.SHM_DEST 605 } 606 creds := auth.CredentialsFromContext(ctx) 607 608 // Use the reference count as a rudimentary count of the number of 609 // attaches. We exclude: 610 // 611 // 1. The reference the caller holds. 612 // 2. The self-reference held by s prior to destruction. 613 // 614 // Note that this may still overcount by including transient references 615 // used in concurrent calls. 616 nattach := uint64(s.ReadRefs()) - 1 617 if !s.pendingDestruction { 618 nattach-- 619 } 620 621 ds := &linux.ShmidDS{ 622 ShmPerm: linux.IPCPerm{ 623 Key: uint32(s.key), 624 UID: uint32(creds.UserNamespace.MapFromKUID(s.owner.UID)), 625 GID: uint32(creds.UserNamespace.MapFromKGID(s.owner.GID)), 626 CUID: uint32(creds.UserNamespace.MapFromKUID(s.creator.UID)), 627 CGID: uint32(creds.UserNamespace.MapFromKGID(s.creator.GID)), 628 Mode: mode | uint16(s.perms.LinuxMode()), 629 Seq: 0, // IPC sequences not supported. 630 }, 631 ShmSegsz: s.size, 632 ShmAtime: s.attachTime.TimeT(), 633 ShmDtime: s.detachTime.TimeT(), 634 ShmCtime: s.changeTime.TimeT(), 635 ShmCpid: s.creatorPID, 636 ShmLpid: s.lastAttachDetachPID, 637 ShmNattach: nattach, 638 } 639 640 return ds, nil 641 } 642 643 // Set modifies attributes for a segment. See shmctl(IPC_SET). 644 func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error { 645 s.mu.Lock() 646 defer s.mu.Unlock() 647 648 if !s.checkOwnership(ctx) { 649 return linuxerr.EPERM 650 } 651 652 creds := auth.CredentialsFromContext(ctx) 653 uid := creds.UserNamespace.MapToKUID(auth.UID(ds.ShmPerm.UID)) 654 gid := creds.UserNamespace.MapToKGID(auth.GID(ds.ShmPerm.GID)) 655 if !uid.Ok() || !gid.Ok() { 656 return linuxerr.EINVAL 657 } 658 659 // User may only modify the lower 9 bits of the mode. All the other bits are 660 // always 0 for the underlying inode. 
661 mode := linux.FileMode(ds.ShmPerm.Mode & 0x1ff) 662 s.perms = fs.FilePermsFromMode(mode) 663 664 s.owner.UID = uid 665 s.owner.GID = gid 666 667 s.changeTime = ktime.NowFromContext(ctx) 668 return nil 669 } 670 671 // MarkDestroyed marks a segment for destruction. The segment is actually 672 // destroyed once it has no references. MarkDestroyed may be called multiple 673 // times, and is safe to call after a segment has already been destroyed. See 674 // shmctl(IPC_RMID). 675 func (s *Shm) MarkDestroyed(ctx context.Context) { 676 s.registry.dissociateKey(s) 677 678 s.mu.Lock() 679 if s.pendingDestruction { 680 s.mu.Unlock() 681 return 682 } 683 s.pendingDestruction = true 684 s.mu.Unlock() 685 686 // Drop the self-reference so destruction occurs when all 687 // external references are gone. 688 // 689 // N.B. This cannot be the final DecRef, as the caller also 690 // holds a reference. 691 s.DecRef(ctx) 692 return 693 } 694 695 // checkOwnership verifies whether a segment may be accessed by ctx as an 696 // owner. See ipc/util.c:ipcctl_pre_down_nolock() in Linux. 697 // 698 // Precondition: Caller must hold s.mu. 699 func (s *Shm) checkOwnership(ctx context.Context) bool { 700 creds := auth.CredentialsFromContext(ctx) 701 if s.owner.UID == creds.EffectiveKUID || s.creator.UID == creds.EffectiveKUID { 702 return true 703 } 704 705 // Tasks with CAP_SYS_ADMIN may bypass ownership checks. Strangely, Linux 706 // doesn't use CAP_IPC_OWNER for this despite CAP_IPC_OWNER being documented 707 // for use to "override IPC ownership checks". 708 return creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, s.registry.userNS) 709 } 710 711 // checkPermissions verifies whether a segment is accessible by ctx for access 712 // described by req. See ipc/util.c:ipcperms() in Linux. 713 // 714 // Precondition: Caller must hold s.mu. 
715 func (s *Shm) checkPermissions(ctx context.Context, req fs.PermMask) bool { 716 creds := auth.CredentialsFromContext(ctx) 717 718 p := s.perms.Other 719 if s.owner.UID == creds.EffectiveKUID { 720 p = s.perms.User 721 } else if creds.InGroup(s.owner.GID) { 722 p = s.perms.Group 723 } 724 if p.SupersetOf(req) { 725 return true 726 } 727 728 // Tasks with CAP_IPC_OWNER may bypass permission checks. 729 return creds.HasCapabilityIn(linux.CAP_IPC_OWNER, s.registry.userNS) 730 }