github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/kernel/shm/shm.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package shm implements sysv shared memory segments.
    16  //
    17  // Known missing features:
    18  //
    19  //   - SHM_LOCK/SHM_UNLOCK are no-ops. The sentry currently doesn't implement
    20  //     memory locking in general.
    21  //
    22  //   - SHM_HUGETLB and related flags for shmget(2) are ignored. There's no easy
    23  //     way to implement hugetlb support on a per-map basis, and it has no impact
    24  //     on correctness.
    25  //
    26  //   - SHM_NORESERVE for shmget(2) is ignored, the sentry doesn't implement swap
    27  //     so it's meaningless to reserve space for swap.
    28  //
    29  //   - No per-process segment size enforcement. This feature probably isn't used
    30  //     much anyways, since Linux sets the per-process limits to the system-wide
    31  //     limits by default.
    32  //
    33  // Lock ordering: mm.mappingMu -> shm registry lock -> shm lock
    34  package shm
    35  
    36  import (
    37  	"fmt"
    38  
    39  	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
    40  	"github.com/MerlinKodo/gvisor/pkg/context"
    41  	"github.com/MerlinKodo/gvisor/pkg/errors/linuxerr"
    42  	"github.com/MerlinKodo/gvisor/pkg/hostarch"
    43  	"github.com/MerlinKodo/gvisor/pkg/log"
    44  	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/auth"
    45  	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/ipc"
    46  	ktime "github.com/MerlinKodo/gvisor/pkg/sentry/kernel/time"
    47  	"github.com/MerlinKodo/gvisor/pkg/sentry/memmap"
    48  	"github.com/MerlinKodo/gvisor/pkg/sentry/pgalloc"
    49  	"github.com/MerlinKodo/gvisor/pkg/sentry/usage"
    50  	"github.com/MerlinKodo/gvisor/pkg/sentry/vfs"
    51  	"github.com/MerlinKodo/gvisor/pkg/sync"
    52  )
    53  
// Registry tracks all shared memory segments in an IPC namespace. The registry
// provides the mechanisms for creating and finding segments, and reporting
// global shm parameters.
//
// +stateify savable
type Registry struct {
	// userNS owns the IPC namespace this registry belongs to. Immutable.
	userNS *auth.UserNamespace

	// mu protects all fields below.
	mu sync.Mutex `state:"nosave"`

	// reg defines basic fields and operations needed for all SysV registries.
	//
	// Within reg, there are two maps, objects and keysToIDs.
	//
	// reg.objects holds all referenced segments, which are removed on the last
	// DecRef. Thus, it cannot itself hold a reference on the Shm.
	//
	// Since removal only occurs after the last (unlocked) DecRef, there
	// exists a short window during which a Shm still exists in reg.objects,
	// but is unreferenced. Users must use TryIncRef to determine if the Shm
	// is still valid.
	//
	// reg.keysToIDs maps segment keys to IDs.
	//
	// Shms in reg.keysToIDs are guaranteed to be referenced, as they are
	// removed by dissociateKey before the last DecRef.
	reg *ipc.Registry

	// totalPages is the sum of the sizes of all existing segments rounded up
	// to page size, in units of page size.
	totalPages uint64
}
    88  
    89  // NewRegistry creates a new shm registry.
    90  func NewRegistry(userNS *auth.UserNamespace) *Registry {
    91  	return &Registry{
    92  		userNS: userNS,
    93  		reg:    ipc.NewRegistry(userNS),
    94  	}
    95  }
    96  
    97  // FindByID looks up a segment given an ID.
    98  //
    99  // FindByID returns a reference on Shm.
   100  func (r *Registry) FindByID(id ipc.ID) *Shm {
   101  	r.mu.Lock()
   102  	defer r.mu.Unlock()
   103  	mech := r.reg.FindByID(id)
   104  	if mech == nil {
   105  		return nil
   106  	}
   107  	s := mech.(*Shm)
   108  
   109  	// Take a reference on s. If TryIncRef fails, s has reached the last
   110  	// DecRef, but hasn't quite been removed from r.reg.objects yet.
   111  	if s != nil && s.TryIncRef() {
   112  		return s
   113  	}
   114  	return nil
   115  }
   116  
   117  // dissociateKey removes the association between a segment and its key,
   118  // preventing it from being discovered in the registry. This doesn't necessarily
   119  // mean the segment is about to be destroyed. This is analogous to unlinking a
   120  // file; the segment can still be used by a process already referencing it, but
   121  // cannot be discovered by a new process.
   122  func (r *Registry) dissociateKey(s *Shm) {
   123  	r.mu.Lock()
   124  	defer r.mu.Unlock()
   125  	s.mu.Lock()
   126  	defer s.mu.Unlock()
   127  	if s.obj.Key != linux.IPC_PRIVATE {
   128  		r.reg.DissociateKey(s.obj.Key)
   129  		s.obj.Key = linux.IPC_PRIVATE
   130  	}
   131  }
   132  
   133  // FindOrCreate looks up or creates a segment in the registry. It's functionally
   134  // analogous to open(2).
   135  //
   136  // FindOrCreate returns a reference on Shm.
   137  func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key ipc.Key, size uint64, mode linux.FileMode, private, create, exclusive bool) (*Shm, error) {
   138  	if (create || private) && (size < linux.SHMMIN || size > linux.SHMMAX) {
   139  		// "A new segment was to be created and size is less than SHMMIN or
   140  		// greater than SHMMAX." - man shmget(2)
   141  		//
   142  		// Note that 'private' always implies the creation of a new segment
   143  		// whether IPC_CREAT is specified or not.
   144  		return nil, linuxerr.EINVAL
   145  	}
   146  
   147  	r.mu.Lock()
   148  	defer r.mu.Unlock()
   149  
   150  	if r.reg.ObjectCount() >= linux.SHMMNI {
   151  		// "All possible shared memory IDs have been taken (SHMMNI) ..."
   152  		//   - man shmget(2)
   153  		return nil, linuxerr.ENOSPC
   154  	}
   155  
   156  	if !private {
   157  		shm, err := r.reg.Find(ctx, key, mode, create, exclusive)
   158  		if err != nil {
   159  			return nil, err
   160  		}
   161  
   162  		// Validate shm-specific parameters.
   163  		if shm != nil {
   164  			shm := shm.(*Shm)
   165  			if size > shm.size {
   166  				// "A segment for the given key exists, but size is greater than
   167  				// the size of that segment." - man shmget(2)
   168  				return nil, linuxerr.EINVAL
   169  			}
   170  			shm.IncRef()
   171  			return shm, nil
   172  		}
   173  	}
   174  
   175  	var sizeAligned uint64
   176  	if val, ok := hostarch.Addr(size).RoundUp(); ok {
   177  		sizeAligned = uint64(val)
   178  	} else {
   179  		return nil, linuxerr.EINVAL
   180  	}
   181  
   182  	if numPages := sizeAligned / hostarch.PageSize; r.totalPages+numPages > linux.SHMALL {
   183  		// "... allocating a segment of the requested size would cause the
   184  		// system to exceed the system-wide limit on shared memory (SHMALL)."
   185  		//   - man shmget(2)
   186  		return nil, linuxerr.ENOSPC
   187  	}
   188  
   189  	// Need to create a new segment.
   190  	s, err := r.newShmLocked(ctx, pid, key, auth.CredentialsFromContext(ctx), mode, size)
   191  	if err != nil {
   192  		return nil, err
   193  	}
   194  	// The initial reference is held by s itself. Take another to return to
   195  	// the caller.
   196  	s.IncRef()
   197  	return s, nil
   198  }
   199  
   200  // newShmLocked creates a new segment in the registry.
   201  //
   202  // Precondition: Caller must hold r.mu.
   203  func (r *Registry) newShmLocked(ctx context.Context, pid int32, key ipc.Key, creator *auth.Credentials, mode linux.FileMode, size uint64) (*Shm, error) {
   204  	mfp := pgalloc.MemoryFileProviderFromContext(ctx)
   205  	if mfp == nil {
   206  		panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, pgalloc.CtxMemoryFileProvider))
   207  	}
   208  	devID, ok := deviceIDFromContext(ctx)
   209  	if !ok {
   210  		panic(fmt.Sprintf("context.Context %T lacks value for key %T", ctx, CtxDeviceID))
   211  	}
   212  
   213  	effectiveSize := uint64(hostarch.Addr(size).MustRoundUp())
   214  	fr, err := mfp.MemoryFile().Allocate(effectiveSize, pgalloc.AllocOpts{Kind: usage.Anonymous, MemCgID: pgalloc.MemoryCgroupIDFromContext(ctx)})
   215  	if err != nil {
   216  		return nil, err
   217  	}
   218  
   219  	shm := &Shm{
   220  		mfp:           mfp,
   221  		registry:      r,
   222  		devID:         devID,
   223  		size:          size,
   224  		effectiveSize: effectiveSize,
   225  		obj:           ipc.NewObject(r.reg.UserNS, ipc.Key(key), creator, creator, mode),
   226  		fr:            fr,
   227  		creatorPID:    pid,
   228  		changeTime:    ktime.NowFromContext(ctx),
   229  	}
   230  	shm.InitRefs()
   231  
   232  	if err := r.reg.Register(shm); err != nil {
   233  		return nil, err
   234  	}
   235  	r.totalPages += effectiveSize / hostarch.PageSize
   236  
   237  	return shm, nil
   238  }
   239  
   240  // IPCInfo reports global parameters for sysv shared memory segments on this
   241  // system. See shmctl(IPC_INFO).
   242  func (r *Registry) IPCInfo() *linux.ShmParams {
   243  	return &linux.ShmParams{
   244  		ShmMax: linux.SHMMAX,
   245  		ShmMin: linux.SHMMIN,
   246  		ShmMni: linux.SHMMNI,
   247  		ShmSeg: linux.SHMSEG,
   248  		ShmAll: linux.SHMALL,
   249  	}
   250  }
   251  
   252  // ShmInfo reports linux-specific global parameters for sysv shared memory
   253  // segments on this system. See shmctl(SHM_INFO).
   254  func (r *Registry) ShmInfo() *linux.ShmInfo {
   255  	r.mu.Lock()
   256  	defer r.mu.Unlock()
   257  
   258  	return &linux.ShmInfo{
   259  		UsedIDs: int32(r.reg.LastIDUsed()),
   260  		ShmTot:  r.totalPages,
   261  		ShmRss:  r.totalPages, // We could probably get a better estimate from memory accounting.
   262  		ShmSwp:  0,            // No reclaim at the moment.
   263  	}
   264  }
   265  
   266  // remove deletes a segment from this registry, deaccounting the memory used by
   267  // the segment.
   268  //
   269  // Precondition: Must follow a call to r.dissociateKey(s).
   270  func (r *Registry) remove(s *Shm) {
   271  	r.mu.Lock()
   272  	defer r.mu.Unlock()
   273  	s.mu.Lock()
   274  	defer s.mu.Unlock()
   275  
   276  	if s.obj.Key != linux.IPC_PRIVATE {
   277  		panic(fmt.Sprintf("Attempted to remove %s from the registry whose key is still associated", s.debugLocked()))
   278  	}
   279  
   280  	r.reg.DissociateID(s.obj.ID)
   281  	r.totalPages -= s.effectiveSize / hostarch.PageSize
   282  }
   283  
   284  // Release drops the self-reference of each active shm segment in the registry.
   285  // It is called when the kernel.IPCNamespace containing r is being destroyed.
   286  func (r *Registry) Release(ctx context.Context) {
   287  	// Because Shm.DecRef() may acquire the same locks, collect the segments to
   288  	// release first. Note that this should not race with any updates to r, since
   289  	// the IPC namespace containing it has no more references.
   290  	toRelease := make([]*Shm, 0)
   291  	r.mu.Lock()
   292  	r.reg.ForAllObjects(
   293  		func(o ipc.Mechanism) {
   294  			s := o.(*Shm)
   295  			s.mu.Lock()
   296  			if !s.pendingDestruction {
   297  				toRelease = append(toRelease, s)
   298  			}
   299  			s.mu.Unlock()
   300  		},
   301  	)
   302  	r.mu.Unlock()
   303  
   304  	for _, s := range toRelease {
   305  		r.dissociateKey(s)
   306  		s.DecRef(ctx)
   307  	}
   308  }
   309  
// Shm represents a single shared memory segment.
//
// Shm segments are backed directly by an allocation from platform memory.
// Segments are always mapped as a whole, greatly simplifying how mappings are
// tracked. However note that mremap and munmap calls may cause the vma for a
// segment to become fragmented; which requires special care when unmapping a
// segment. See mm/shm.go.
//
// Segments persist until they are explicitly marked for destruction via
// MarkDestroyed().
//
// Shm implements memmap.Mappable and memmap.MappingIdentity.
//
// +stateify savable
type Shm struct {
	// ShmRefs tracks the number of references to this segment.
	//
	// A segment holds a reference to itself until it is marked for
	// destruction.
	//
	// In addition to direct users, the MemoryManager will hold references
	// via MappingIdentity.
	ShmRefs

	// mfp provides the MemoryFile that fr was allocated from and is released
	// back to on the final DecRef.
	mfp pgalloc.MemoryFileProvider

	// registry points to the shm registry containing this segment. Immutable.
	registry *Registry

	// devID is the segment's device ID. Immutable.
	devID uint32

	// size is the requested size of the segment at creation, in
	// bytes. Immutable.
	size uint64

	// effectiveSize of the segment, rounding up to the next page
	// boundary. Immutable.
	//
	// Invariant: effectiveSize must be a multiple of hostarch.PageSize.
	effectiveSize uint64

	// fr is the range within mfp.MemoryFile() that backs the contents of this
	// segment. Immutable.
	fr memmap.FileRange

	// mu protects all fields below.
	mu sync.Mutex `state:"nosave"`

	// obj holds the generic SysV IPC object state of this segment, such as
	// its ID, key, ownership and mode.
	obj *ipc.Object

	// attachTime is updated on every successful shmat.
	attachTime ktime.Time
	// detachTime is updated on every successful shmdt.
	detachTime ktime.Time
	// changeTime is set when the segment is created and updated on every
	// successful change to the segment via shmctl(IPC_SET).
	changeTime ktime.Time

	// creatorPID is the PID of the process that created the segment.
	creatorPID int32
	// lastAttachDetachPID is the pid of the process that issued the last shmat
	// or shmdt syscall.
	lastAttachDetachPID int32

	// pendingDestruction indicates the segment was marked as destroyed through
	// shmctl(IPC_RMID). When marked as destroyed, the segment will not be found
	// in the registry and can no longer be attached. When the last user
	// detaches from the segment, it is destroyed.
	pendingDestruction bool
}
   381  
   382  // ID returns object's ID.
   383  func (s *Shm) ID() ipc.ID {
   384  	return s.obj.ID
   385  }
   386  
// Object implements ipc.Mechanism.Object. It returns the segment's underlying
// IPC object; the pointer is returned without acquiring s.mu.
func (s *Shm) Object() *ipc.Object {
	return s.obj
}
   391  
// Destroy implements ipc.Mechanism.Destroy. It is intentionally a no-op:
// shm uses a different removal mechanism, see Shm.MarkDestroyed.
func (s *Shm) Destroy() {
}
   396  
// Lock implements ipc.Mechanism.Lock by acquiring s.mu.
func (s *Shm) Lock() {
	s.mu.Lock()
}
   401  
// Unlock implements ipc.Mechanism.Unlock by releasing s.mu.
//
// +checklocksignore
func (s *Shm) Unlock() {
	s.mu.Unlock()
}
   408  
   409  // Precondition: Caller must hold s.mu.
   410  func (s *Shm) debugLocked() string {
   411  	return fmt.Sprintf("Shm{id: %d, key: %d, size: %d bytes, refs: %d, destroyed: %v}",
   412  		s.obj.ID, s.obj.Key, s.size, s.ReadRefs(), s.pendingDestruction)
   413  }
   414  
   415  // MappedName implements memmap.MappingIdentity.MappedName.
   416  func (s *Shm) MappedName(ctx context.Context) string {
   417  	s.mu.Lock()
   418  	defer s.mu.Unlock()
   419  	return fmt.Sprintf("SYSV%08d", s.obj.Key)
   420  }
   421  
   422  // DeviceID implements memmap.MappingIdentity.DeviceID.
   423  func (s *Shm) DeviceID() uint64 {
   424  	return uint64(s.devID)
   425  }
   426  
   427  // InodeID implements memmap.MappingIdentity.InodeID.
   428  func (s *Shm) InodeID() uint64 {
   429  	// "shmid gets reported as "inode#" in /proc/pid/maps. proc-ps tools use
   430  	// this. Changing this will break them." -- Linux, ipc/shm.c:newseg()
   431  	return uint64(s.obj.ID)
   432  }
   433  
   434  // DecRef drops a reference on s.
   435  //
   436  // Precondition: Caller must not hold s.mu.
   437  func (s *Shm) DecRef(ctx context.Context) {
   438  	s.ShmRefs.DecRef(func() {
   439  		s.mfp.MemoryFile().DecRef(s.fr)
   440  		s.registry.remove(s)
   441  	})
   442  }
   443  
// Msync implements memmap.MappingIdentity.Msync. Msync is a no-op for shm
// segments and always returns nil.
func (s *Shm) Msync(context.Context, memmap.MappableRange) error {
	return nil
}
   449  
   450  // AddMapping implements memmap.Mappable.AddMapping.
   451  func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ hostarch.AddrRange, _ uint64, _ bool) error {
   452  	s.mu.Lock()
   453  	defer s.mu.Unlock()
   454  	s.attachTime = ktime.NowFromContext(ctx)
   455  	if pid, ok := auth.ThreadGroupIDFromContext(ctx); ok {
   456  		s.lastAttachDetachPID = pid
   457  	} else {
   458  		// AddMapping is called during a syscall, so ctx should always be a task
   459  		// context.
   460  		log.Warningf("Adding mapping to %s but couldn't get the current pid; not updating the last attach pid", s.debugLocked())
   461  	}
   462  	return nil
   463  }
   464  
   465  // RemoveMapping implements memmap.Mappable.RemoveMapping.
   466  func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ hostarch.AddrRange, _ uint64, _ bool) {
   467  	s.mu.Lock()
   468  	defer s.mu.Unlock()
   469  	// RemoveMapping may be called during task exit, when ctx
   470  	// is context.Background. Gracefully handle missing clocks. Failing to
   471  	// update the detach time in these cases is ok, since no one can observe the
   472  	// omission.
   473  	if clock := ktime.RealtimeClockFromContext(ctx); clock != nil {
   474  		s.detachTime = clock.Now()
   475  	}
   476  
   477  	// If called from a non-task context we also won't have a threadgroup
   478  	// id. Silently skip updating the lastAttachDetachPid in that case.
   479  	if pid, ok := auth.ThreadGroupIDFromContext(ctx); ok {
   480  		s.lastAttachDetachPID = pid
   481  	} else {
   482  		log.Debugf("Couldn't obtain pid when removing mapping to %s, not updating the last detach pid.", s.debugLocked())
   483  	}
   484  }
   485  
// CopyMapping implements memmap.Mappable.CopyMapping. It performs no
// bookkeeping for shm segments and always returns nil.
func (*Shm) CopyMapping(context.Context, memmap.MappingSpace, hostarch.AddrRange, hostarch.AddrRange, uint64, bool) error {
	return nil
}
   490  
   491  // Translate implements memmap.Mappable.Translate.
   492  func (s *Shm) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) {
   493  	var err error
   494  	if required.End > s.fr.Length() {
   495  		err = &memmap.BusError{linuxerr.EFAULT}
   496  	}
   497  	if source := optional.Intersect(memmap.MappableRange{0, s.fr.Length()}); source.Length() != 0 {
   498  		return []memmap.Translation{
   499  			{
   500  				Source: source,
   501  				File:   s.mfp.MemoryFile(),
   502  				Offset: s.fr.Start + source.Start,
   503  				Perms:  hostarch.AnyAccess,
   504  			},
   505  		}, err
   506  	}
   507  	return nil, err
   508  }
   509  
// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. It does
// nothing for shm segments and always returns nil.
func (s *Shm) InvalidateUnsavable(ctx context.Context) error {
	return nil
}
   514  
// AttachOpts describes various flags passed to shmat(2).
type AttachOpts struct {
	// Execute requests execute permission on the mapping.
	Execute bool
	// Readonly requests a mapping without write permission.
	Readonly bool
	// Remap requests that the mapping be placed at a fixed address.
	Remap bool
}
   521  
   522  // ConfigureAttach creates an mmap configuration for the segment with the
   523  // requested attach options.
   524  //
   525  // Postconditions: The returned MMapOpts are valid only as long as a reference
   526  // continues to be held on s.
   527  func (s *Shm) ConfigureAttach(ctx context.Context, addr hostarch.Addr, opts AttachOpts) (memmap.MMapOpts, error) {
   528  	s.mu.Lock()
   529  	defer s.mu.Unlock()
   530  	if s.pendingDestruction && s.ReadRefs() == 0 {
   531  		return memmap.MMapOpts{}, linuxerr.EIDRM
   532  	}
   533  
   534  	creds := auth.CredentialsFromContext(ctx)
   535  	ats := vfs.MayRead
   536  	if !opts.Readonly {
   537  		ats |= vfs.MayWrite
   538  	}
   539  	if opts.Execute {
   540  		ats |= vfs.MayExec
   541  	}
   542  	if !s.obj.CheckPermissions(creds, ats) {
   543  		// "The calling process does not have the required permissions for the
   544  		// requested attach type, and does not have the CAP_IPC_OWNER capability
   545  		// in the user namespace that governs its IPC namespace." - man shmat(2)
   546  		return memmap.MMapOpts{}, linuxerr.EACCES
   547  	}
   548  	return memmap.MMapOpts{
   549  		Length: s.size,
   550  		Offset: 0,
   551  		Addr:   addr,
   552  		Fixed:  opts.Remap,
   553  		Perms: hostarch.AccessType{
   554  			Read:    true,
   555  			Write:   !opts.Readonly,
   556  			Execute: opts.Execute,
   557  		},
   558  		MaxPerms:        hostarch.AnyAccess,
   559  		Mappable:        s,
   560  		MappingIdentity: s,
   561  	}, nil
   562  }
   563  
// EffectiveSize returns the size of the underlying shared memory segment in
// bytes. This may be larger than the requested size at creation, due to
// rounding up to page boundaries.
func (s *Shm) EffectiveSize() uint64 {
	return s.effectiveSize
}
   570  
   571  // IPCStat returns information about a shm. See shmctl(IPC_STAT).
   572  func (s *Shm) IPCStat(ctx context.Context) (*linux.ShmidDS, error) {
   573  	s.mu.Lock()
   574  	defer s.mu.Unlock()
   575  
   576  	// "The caller must have read permission on the shared memory segment."
   577  	//   - man shmctl(2)
   578  	creds := auth.CredentialsFromContext(ctx)
   579  	if !s.obj.CheckPermissions(creds, vfs.MayRead) {
   580  		// "IPC_STAT or SHM_STAT is requested and shm_perm.mode does not allow
   581  		// read access for shmid, and the calling process does not have the
   582  		// CAP_IPC_OWNER capability in the user namespace that governs its IPC
   583  		// namespace." - man shmctl(2)
   584  		return nil, linuxerr.EACCES
   585  	}
   586  
   587  	var mode uint16
   588  	if s.pendingDestruction {
   589  		mode |= linux.SHM_DEST
   590  	}
   591  
   592  	// Use the reference count as a rudimentary count of the number of
   593  	// attaches. We exclude:
   594  	//
   595  	// 1. The reference the caller holds.
   596  	// 2. The self-reference held by s prior to destruction.
   597  	//
   598  	// Note that this may still overcount by including transient references
   599  	// used in concurrent calls.
   600  	nattach := uint64(s.ReadRefs()) - 1
   601  	if !s.pendingDestruction {
   602  		nattach--
   603  	}
   604  
   605  	ds := &linux.ShmidDS{
   606  		ShmPerm: linux.IPCPerm{
   607  			Key:  uint32(s.obj.Key),
   608  			UID:  uint32(creds.UserNamespace.MapFromKUID(s.obj.OwnerUID)),
   609  			GID:  uint32(creds.UserNamespace.MapFromKGID(s.obj.OwnerGID)),
   610  			CUID: uint32(creds.UserNamespace.MapFromKUID(s.obj.CreatorUID)),
   611  			CGID: uint32(creds.UserNamespace.MapFromKGID(s.obj.CreatorGID)),
   612  			Mode: mode | uint16(s.obj.Mode),
   613  			Seq:  0, // IPC sequences not supported.
   614  		},
   615  		ShmSegsz:   s.size,
   616  		ShmAtime:   s.attachTime.TimeT(),
   617  		ShmDtime:   s.detachTime.TimeT(),
   618  		ShmCtime:   s.changeTime.TimeT(),
   619  		ShmCpid:    s.creatorPID,
   620  		ShmLpid:    s.lastAttachDetachPID,
   621  		ShmNattach: nattach,
   622  	}
   623  
   624  	return ds, nil
   625  }
   626  
   627  // Set modifies attributes for a segment. See shmctl(IPC_SET).
   628  func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error {
   629  	s.mu.Lock()
   630  	defer s.mu.Unlock()
   631  
   632  	if err := s.obj.Set(ctx, &ds.ShmPerm); err != nil {
   633  		return err
   634  	}
   635  
   636  	s.changeTime = ktime.NowFromContext(ctx)
   637  	return nil
   638  }
   639  
   640  // MarkDestroyed marks a segment for destruction. The segment is actually
   641  // destroyed once it has no references. MarkDestroyed may be called multiple
   642  // times, and is safe to call after a segment has already been destroyed. See
   643  // shmctl(IPC_RMID).
   644  func (s *Shm) MarkDestroyed(ctx context.Context) {
   645  	s.registry.dissociateKey(s)
   646  
   647  	s.mu.Lock()
   648  	if s.pendingDestruction {
   649  		s.mu.Unlock()
   650  		return
   651  	}
   652  	s.pendingDestruction = true
   653  	s.mu.Unlock()
   654  
   655  	// Drop the self-reference so destruction occurs when all
   656  	// external references are gone.
   657  	//
   658  	// N.B. This cannot be the final DecRef, as the caller also
   659  	// holds a reference.
   660  	s.DecRef(ctx)
   661  	return
   662  }