gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/kernel/shm/shm.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package shm implements sysv shared memory segments.
    16  //
    17  // Known missing features:
    18  //
    19  //   - SHM_LOCK/SHM_UNLOCK are no-ops. The sentry currently doesn't implement
    20  //     memory locking in general.
    21  //
    22  //   - SHM_HUGETLB and related flags for shmget(2) are ignored. There's no easy
    23  //     way to implement hugetlb support on a per-map basis, and it has no impact
    24  //     on correctness.
    25  //
    26  //   - SHM_NORESERVE for shmget(2) is ignored, the sentry doesn't implement swap
    27  //     so it's meaningless to reserve space for swap.
    28  //
    29  //   - No per-process segment size enforcement. This feature probably isn't used
    30  //     much anyways, since Linux sets the per-process limits to the system-wide
    31  //     limits by default.
    32  //
    33  // Lock ordering: mm.mappingMu -> shm registry lock -> shm lock
    34  package shm
    35  
    36  import (
    37  	goContext "context"
    38  	"fmt"
    39  
    40  	"gvisor.dev/gvisor/pkg/abi/linux"
    41  	"gvisor.dev/gvisor/pkg/context"
    42  	"gvisor.dev/gvisor/pkg/errors/linuxerr"
    43  	"gvisor.dev/gvisor/pkg/hostarch"
    44  	"gvisor.dev/gvisor/pkg/log"
    45  	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
    46  	"gvisor.dev/gvisor/pkg/sentry/kernel/ipc"
    47  	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
    48  	"gvisor.dev/gvisor/pkg/sentry/memmap"
    49  	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
    50  	"gvisor.dev/gvisor/pkg/sentry/usage"
    51  	"gvisor.dev/gvisor/pkg/sentry/vfs"
    52  	"gvisor.dev/gvisor/pkg/sync"
    53  )
    54  
// Registry tracks all shared memory segments in an IPC namespace. The registry
// provides the mechanisms for creating and finding segments, and reporting
// global shm parameters.
//
// +stateify savable
type Registry struct {
	// userNS owns the IPC namespace this registry belongs to. Immutable.
	userNS *auth.UserNamespace

	// mu protects all fields below.
	mu sync.Mutex `state:"nosave"`

	// reg defines basic fields and operations needed for all SysV registries.
	//
	// Within reg, there are two maps, Objects and KeysToIDs.
	//
	// reg.objects holds all referenced segments, which are removed on the last
	// DecRef. Thus, it cannot itself hold a reference on the Shm.
	//
	// Since removal only occurs after the last (unlocked) DecRef, there
	// exists a short window during which a Shm still exists in reg.objects,
	// but is unreferenced. Users must use TryIncRef to determine if the Shm
	// is still valid.
	//
	// keysToIDs maps segment keys to IDs.
	//
	// Shms in keysToIDs are guaranteed to be referenced, as they are
	// removed by dissociateKey before the last DecRef.
	reg *ipc.Registry

	// totalPages is the sum of the sizes of all existing segments rounded up
	// to page size, in units of page size.
	totalPages uint64
}
    89  
    90  // NewRegistry creates a new shm registry.
    91  func NewRegistry(userNS *auth.UserNamespace) *Registry {
    92  	return &Registry{
    93  		userNS: userNS,
    94  		reg:    ipc.NewRegistry(userNS),
    95  	}
    96  }
    97  
    98  // FindByID looks up a segment given an ID.
    99  //
   100  // FindByID returns a reference on Shm.
   101  func (r *Registry) FindByID(id ipc.ID) *Shm {
   102  	r.mu.Lock()
   103  	defer r.mu.Unlock()
   104  	mech := r.reg.FindByID(id)
   105  	if mech == nil {
   106  		return nil
   107  	}
   108  	s := mech.(*Shm)
   109  
   110  	// Take a reference on s. If TryIncRef fails, s has reached the last
   111  	// DecRef, but hasn't quite been removed from r.reg.objects yet.
   112  	if s != nil && s.TryIncRef() {
   113  		return s
   114  	}
   115  	return nil
   116  }
   117  
   118  // dissociateKey removes the association between a segment and its key,
   119  // preventing it from being discovered in the registry. This doesn't necessarily
   120  // mean the segment is about to be destroyed. This is analogous to unlinking a
   121  // file; the segment can still be used by a process already referencing it, but
   122  // cannot be discovered by a new process.
   123  func (r *Registry) dissociateKey(s *Shm) {
   124  	r.mu.Lock()
   125  	defer r.mu.Unlock()
   126  	s.mu.Lock()
   127  	defer s.mu.Unlock()
   128  	if s.obj.Key != linux.IPC_PRIVATE {
   129  		r.reg.DissociateKey(s.obj.Key)
   130  		s.obj.Key = linux.IPC_PRIVATE
   131  	}
   132  }
   133  
   134  // FindOrCreate looks up or creates a segment in the registry. It's functionally
   135  // analogous to open(2).
   136  //
   137  // FindOrCreate returns a reference on Shm.
   138  func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key ipc.Key, size uint64, mode linux.FileMode, private, create, exclusive bool) (*Shm, error) {
   139  	if (create || private) && (size < linux.SHMMIN || size > linux.SHMMAX) {
   140  		// "A new segment was to be created and size is less than SHMMIN or
   141  		// greater than SHMMAX." - man shmget(2)
   142  		//
   143  		// Note that 'private' always implies the creation of a new segment
   144  		// whether IPC_CREAT is specified or not.
   145  		return nil, linuxerr.EINVAL
   146  	}
   147  
   148  	r.mu.Lock()
   149  	defer r.mu.Unlock()
   150  
   151  	if r.reg.ObjectCount() >= linux.SHMMNI {
   152  		// "All possible shared memory IDs have been taken (SHMMNI) ..."
   153  		//   - man shmget(2)
   154  		return nil, linuxerr.ENOSPC
   155  	}
   156  
   157  	if !private {
   158  		shm, err := r.reg.Find(ctx, key, mode, create, exclusive)
   159  		if err != nil {
   160  			return nil, err
   161  		}
   162  
   163  		// Validate shm-specific parameters.
   164  		if shm != nil {
   165  			shm := shm.(*Shm)
   166  			if size > shm.size {
   167  				// "A segment for the given key exists, but size is greater than
   168  				// the size of that segment." - man shmget(2)
   169  				return nil, linuxerr.EINVAL
   170  			}
   171  			shm.IncRef()
   172  			return shm, nil
   173  		}
   174  	}
   175  
   176  	var sizeAligned uint64
   177  	if val, ok := hostarch.Addr(size).RoundUp(); ok {
   178  		sizeAligned = uint64(val)
   179  	} else {
   180  		return nil, linuxerr.EINVAL
   181  	}
   182  
   183  	if numPages := sizeAligned / hostarch.PageSize; r.totalPages+numPages > linux.SHMALL {
   184  		// "... allocating a segment of the requested size would cause the
   185  		// system to exceed the system-wide limit on shared memory (SHMALL)."
   186  		//   - man shmget(2)
   187  		return nil, linuxerr.ENOSPC
   188  	}
   189  
   190  	// Need to create a new segment.
   191  	s, err := r.newShmLocked(ctx, pid, key, auth.CredentialsFromContext(ctx), mode, size)
   192  	if err != nil {
   193  		return nil, err
   194  	}
   195  	// The initial reference is held by s itself. Take another to return to
   196  	// the caller.
   197  	s.IncRef()
   198  	return s, nil
   199  }
   200  
   201  // newShmLocked creates a new segment in the registry.
   202  //
   203  // Precondition: Caller must hold r.mu.
   204  func (r *Registry) newShmLocked(ctx context.Context, pid int32, key ipc.Key, creator *auth.Credentials, mode linux.FileMode, size uint64) (*Shm, error) {
   205  	mf := pgalloc.MemoryFileFromContext(ctx)
   206  	if mf == nil {
   207  		panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, pgalloc.CtxMemoryFile))
   208  	}
   209  	devID, ok := deviceIDFromContext(ctx)
   210  	if !ok {
   211  		panic(fmt.Sprintf("context.Context %T lacks value for key %T", ctx, CtxDeviceID))
   212  	}
   213  
   214  	effectiveSize := uint64(hostarch.Addr(size).MustRoundUp())
   215  	fr, err := mf.Allocate(effectiveSize, pgalloc.AllocOpts{Kind: usage.Anonymous, MemCgID: pgalloc.MemoryCgroupIDFromContext(ctx)})
   216  	if err != nil {
   217  		return nil, err
   218  	}
   219  
   220  	shm := &Shm{
   221  		mf:            mf,
   222  		registry:      r,
   223  		devID:         devID,
   224  		size:          size,
   225  		effectiveSize: effectiveSize,
   226  		obj:           ipc.NewObject(r.reg.UserNS, ipc.Key(key), creator, creator, mode),
   227  		fr:            fr,
   228  		creatorPID:    pid,
   229  		changeTime:    ktime.NowFromContext(ctx),
   230  	}
   231  	shm.InitRefs()
   232  
   233  	if err := r.reg.Register(shm); err != nil {
   234  		return nil, err
   235  	}
   236  	r.totalPages += effectiveSize / hostarch.PageSize
   237  
   238  	return shm, nil
   239  }
   240  
   241  // IPCInfo reports global parameters for sysv shared memory segments on this
   242  // system. See shmctl(IPC_INFO).
   243  func (r *Registry) IPCInfo() *linux.ShmParams {
   244  	return &linux.ShmParams{
   245  		ShmMax: linux.SHMMAX,
   246  		ShmMin: linux.SHMMIN,
   247  		ShmMni: linux.SHMMNI,
   248  		ShmSeg: linux.SHMSEG,
   249  		ShmAll: linux.SHMALL,
   250  	}
   251  }
   252  
   253  // ShmInfo reports linux-specific global parameters for sysv shared memory
   254  // segments on this system. See shmctl(SHM_INFO).
   255  func (r *Registry) ShmInfo() *linux.ShmInfo {
   256  	r.mu.Lock()
   257  	defer r.mu.Unlock()
   258  
   259  	return &linux.ShmInfo{
   260  		UsedIDs: int32(r.reg.LastIDUsed()),
   261  		ShmTot:  r.totalPages,
   262  		ShmRss:  r.totalPages, // We could probably get a better estimate from memory accounting.
   263  		ShmSwp:  0,            // No reclaim at the moment.
   264  	}
   265  }
   266  
   267  // remove deletes a segment from this registry, deaccounting the memory used by
   268  // the segment.
   269  //
   270  // Precondition: Must follow a call to r.dissociateKey(s).
   271  func (r *Registry) remove(s *Shm) {
   272  	r.mu.Lock()
   273  	defer r.mu.Unlock()
   274  	s.mu.Lock()
   275  	defer s.mu.Unlock()
   276  
   277  	if s.obj.Key != linux.IPC_PRIVATE {
   278  		panic(fmt.Sprintf("Attempted to remove %s from the registry whose key is still associated", s.debugLocked()))
   279  	}
   280  
   281  	r.reg.DissociateID(s.obj.ID)
   282  	r.totalPages -= s.effectiveSize / hostarch.PageSize
   283  }
   284  
   285  // Release drops the self-reference of each active shm segment in the registry.
   286  // It is called when the kernel.IPCNamespace containing r is being destroyed.
   287  func (r *Registry) Release(ctx context.Context) {
   288  	// Because Shm.DecRef() may acquire the same locks, collect the segments to
   289  	// release first. Note that this should not race with any updates to r, since
   290  	// the IPC namespace containing it has no more references.
   291  	toRelease := make([]*Shm, 0)
   292  	r.mu.Lock()
   293  	r.reg.ForAllObjects(
   294  		func(o ipc.Mechanism) {
   295  			s := o.(*Shm)
   296  			s.mu.Lock()
   297  			if !s.pendingDestruction {
   298  				toRelease = append(toRelease, s)
   299  			}
   300  			s.mu.Unlock()
   301  		},
   302  	)
   303  	r.mu.Unlock()
   304  
   305  	for _, s := range toRelease {
   306  		r.dissociateKey(s)
   307  		s.DecRef(ctx)
   308  	}
   309  }
   310  
// Shm represents a single shared memory segment.
//
// Shm segments are backed directly by an allocation from platform memory.
// Segments are always mapped as a whole, greatly simplifying how mappings are
// tracked. However note that mremap and munmap calls may cause the vma for a
// segment to become fragmented; which requires special care when unmapping a
// segment. See mm/shm.go.
//
// Segments persist until they are explicitly marked for destruction via
// MarkDestroyed().
//
// Shm implements memmap.Mappable and memmap.MappingIdentity.
//
// +stateify savable
type Shm struct {
	// ShmRefs tracks the number of references to this segment.
	//
	// A segment holds a reference to itself until it is marked for
	// destruction.
	//
	// In addition to direct users, the MemoryManager will hold references
	// via MappingIdentity.
	ShmRefs

	// mf is the memory file that fr refers to. It is not saved; afterLoad
	// restores it from the context.
	mf *pgalloc.MemoryFile `state:"nosave"`

	// registry points to the shm registry containing this segment. Immutable.
	registry *Registry

	// devID is the segment's device ID. Immutable.
	devID uint32

	// size is the requested size of the segment at creation, in
	// bytes. Immutable.
	size uint64

	// effectiveSize of the segment, rounding up to the next page
	// boundary. Immutable.
	//
	// Invariant: effectiveSize must be a multiple of hostarch.PageSize.
	effectiveSize uint64

	// fr is the range in mf that backs the contents of this segment.
	// Immutable.
	fr memmap.FileRange

	// mu protects all fields below.
	mu sync.Mutex `state:"nosave"`

	// obj holds the segment's IPC object state, such as its ID, key,
	// ownership and mode.
	obj *ipc.Object

	// attachTime is updated on every successful shmat.
	attachTime ktime.Time
	// detachTime is updated on every successful shmdt.
	detachTime ktime.Time
	// changeTime is updated on every successful change to the segment via
	// shmctl(IPC_SET).
	changeTime ktime.Time

	// creatorPID is the PID of the process that created the segment.
	creatorPID int32
	// lastAttachDetachPID is the pid of the process that issued the last shmat
	// or shmdt syscall.
	lastAttachDetachPID int32

	// pendingDestruction indicates the segment was marked as destroyed through
	// shmctl(IPC_RMID). When marked as destroyed, the segment will not be found
	// in the registry and can no longer be attached. When the last user
	// detaches from the segment, it is destroyed.
	pendingDestruction bool
}
   382  
   383  // afterLoad is invoked by stateify.
   384  func (s *Shm) afterLoad(ctx goContext.Context) {
   385  	s.mf = pgalloc.MemoryFileFromContext(ctx)
   386  }
   387  
   388  // ID returns object's ID.
   389  func (s *Shm) ID() ipc.ID {
   390  	return s.obj.ID
   391  }
   392  
   393  // Object implements ipc.Mechanism.Object.
   394  func (s *Shm) Object() *ipc.Object {
   395  	return s.obj
   396  }
   397  
// Destroy implements ipc.Mechanism.Destroy. It is intentionally a no-op:
// shm uses a different removal mechanism. See Shm.MarkDestroyed.
func (s *Shm) Destroy() {
}
   402  
// Lock implements ipc.Mechanism.Lock by acquiring s.mu.
func (s *Shm) Lock() {
	s.mu.Lock()
}
   407  
// Unlock implements ipc.Mechanism.Unlock by releasing s.mu.
//
// +checklocksignore
func (s *Shm) Unlock() {
	s.mu.Unlock()
}
   414  
   415  // Precondition: Caller must hold s.mu.
   416  func (s *Shm) debugLocked() string {
   417  	return fmt.Sprintf("Shm{id: %d, key: %d, size: %d bytes, refs: %d, destroyed: %v}",
   418  		s.obj.ID, s.obj.Key, s.size, s.ReadRefs(), s.pendingDestruction)
   419  }
   420  
   421  // MappedName implements memmap.MappingIdentity.MappedName.
   422  func (s *Shm) MappedName(ctx context.Context) string {
   423  	s.mu.Lock()
   424  	defer s.mu.Unlock()
   425  	return fmt.Sprintf("SYSV%08d", s.obj.Key)
   426  }
   427  
   428  // DeviceID implements memmap.MappingIdentity.DeviceID.
   429  func (s *Shm) DeviceID() uint64 {
   430  	return uint64(s.devID)
   431  }
   432  
   433  // InodeID implements memmap.MappingIdentity.InodeID.
   434  func (s *Shm) InodeID() uint64 {
   435  	// "shmid gets reported as "inode#" in /proc/pid/maps. proc-ps tools use
   436  	// this. Changing this will break them." -- Linux, ipc/shm.c:newseg()
   437  	return uint64(s.obj.ID)
   438  }
   439  
   440  // DecRef drops a reference on s.
   441  //
   442  // Precondition: Caller must not hold s.mu.
   443  func (s *Shm) DecRef(ctx context.Context) {
   444  	s.ShmRefs.DecRef(func() {
   445  		s.mf.DecRef(s.fr)
   446  		s.registry.remove(s)
   447  	})
   448  }
   449  
   450  // Msync implements memmap.MappingIdentity.Msync. Msync is a no-op for shm
   451  // segments.
   452  func (s *Shm) Msync(context.Context, memmap.MappableRange) error {
   453  	return nil
   454  }
   455  
   456  // AddMapping implements memmap.Mappable.AddMapping.
   457  func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ hostarch.AddrRange, _ uint64, _ bool) error {
   458  	s.mu.Lock()
   459  	defer s.mu.Unlock()
   460  	s.attachTime = ktime.NowFromContext(ctx)
   461  	if pid, ok := auth.ThreadGroupIDFromContext(ctx); ok {
   462  		s.lastAttachDetachPID = pid
   463  	} else {
   464  		// AddMapping is called during a syscall, so ctx should always be a task
   465  		// context.
   466  		log.Warningf("Adding mapping to %s but couldn't get the current pid; not updating the last attach pid", s.debugLocked())
   467  	}
   468  	return nil
   469  }
   470  
   471  // RemoveMapping implements memmap.Mappable.RemoveMapping.
   472  func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ hostarch.AddrRange, _ uint64, _ bool) {
   473  	s.mu.Lock()
   474  	defer s.mu.Unlock()
   475  	// RemoveMapping may be called during task exit, when ctx
   476  	// is context.Background. Gracefully handle missing clocks. Failing to
   477  	// update the detach time in these cases is ok, since no one can observe the
   478  	// omission.
   479  	if clock := ktime.RealtimeClockFromContext(ctx); clock != nil {
   480  		s.detachTime = clock.Now()
   481  	}
   482  
   483  	// If called from a non-task context we also won't have a threadgroup
   484  	// id. Silently skip updating the lastAttachDetachPid in that case.
   485  	if pid, ok := auth.ThreadGroupIDFromContext(ctx); ok {
   486  		s.lastAttachDetachPID = pid
   487  	} else {
   488  		log.Debugf("Couldn't obtain pid when removing mapping to %s, not updating the last detach pid.", s.debugLocked())
   489  	}
   490  }
   491  
   492  // CopyMapping implements memmap.Mappable.CopyMapping.
   493  func (*Shm) CopyMapping(context.Context, memmap.MappingSpace, hostarch.AddrRange, hostarch.AddrRange, uint64, bool) error {
   494  	return nil
   495  }
   496  
   497  // Translate implements memmap.Mappable.Translate.
   498  func (s *Shm) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) {
   499  	var err error
   500  	if required.End > s.fr.Length() {
   501  		err = &memmap.BusError{linuxerr.EFAULT}
   502  	}
   503  	if source := optional.Intersect(memmap.MappableRange{0, s.fr.Length()}); source.Length() != 0 {
   504  		return []memmap.Translation{
   505  			{
   506  				Source: source,
   507  				File:   s.mf,
   508  				Offset: s.fr.Start + source.Start,
   509  				Perms:  hostarch.AnyAccess,
   510  			},
   511  		}, err
   512  	}
   513  	return nil, err
   514  }
   515  
   516  // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
   517  func (s *Shm) InvalidateUnsavable(ctx context.Context) error {
   518  	return nil
   519  }
   520  
// AttachOpts describes various flags passed to shmat(2).
type AttachOpts struct {
	// Execute requests execute permission on the mapping.
	Execute bool
	// Readonly requests a mapping without write permission.
	Readonly bool
	// Remap is passed through as the Fixed option of the resulting mmap
	// configuration.
	Remap bool
}
   527  
   528  // ConfigureAttach creates an mmap configuration for the segment with the
   529  // requested attach options.
   530  //
   531  // Postconditions: The returned MMapOpts are valid only as long as a reference
   532  // continues to be held on s.
   533  func (s *Shm) ConfigureAttach(ctx context.Context, addr hostarch.Addr, opts AttachOpts) (memmap.MMapOpts, error) {
   534  	s.mu.Lock()
   535  	defer s.mu.Unlock()
   536  	if s.pendingDestruction && s.ReadRefs() == 0 {
   537  		return memmap.MMapOpts{}, linuxerr.EIDRM
   538  	}
   539  
   540  	creds := auth.CredentialsFromContext(ctx)
   541  	ats := vfs.MayRead
   542  	if !opts.Readonly {
   543  		ats |= vfs.MayWrite
   544  	}
   545  	if opts.Execute {
   546  		ats |= vfs.MayExec
   547  	}
   548  	if !s.obj.CheckPermissions(creds, ats) {
   549  		// "The calling process does not have the required permissions for the
   550  		// requested attach type, and does not have the CAP_IPC_OWNER capability
   551  		// in the user namespace that governs its IPC namespace." - man shmat(2)
   552  		return memmap.MMapOpts{}, linuxerr.EACCES
   553  	}
   554  	return memmap.MMapOpts{
   555  		Length: s.size,
   556  		Offset: 0,
   557  		Addr:   addr,
   558  		Fixed:  opts.Remap,
   559  		Perms: hostarch.AccessType{
   560  			Read:    true,
   561  			Write:   !opts.Readonly,
   562  			Execute: opts.Execute,
   563  		},
   564  		MaxPerms:        hostarch.AnyAccess,
   565  		Mappable:        s,
   566  		MappingIdentity: s,
   567  	}, nil
   568  }
   569  
   570  // EffectiveSize returns the size of the underlying shared memory segment. This
   571  // may be larger than the requested size at creation, due to rounding to page
   572  // boundaries.
   573  func (s *Shm) EffectiveSize() uint64 {
   574  	return s.effectiveSize
   575  }
   576  
   577  // IPCStat returns information about a shm. See shmctl(IPC_STAT).
   578  func (s *Shm) IPCStat(ctx context.Context) (*linux.ShmidDS, error) {
   579  	s.mu.Lock()
   580  	defer s.mu.Unlock()
   581  
   582  	// "The caller must have read permission on the shared memory segment."
   583  	//   - man shmctl(2)
   584  	creds := auth.CredentialsFromContext(ctx)
   585  	if !s.obj.CheckPermissions(creds, vfs.MayRead) {
   586  		// "IPC_STAT or SHM_STAT is requested and shm_perm.mode does not allow
   587  		// read access for shmid, and the calling process does not have the
   588  		// CAP_IPC_OWNER capability in the user namespace that governs its IPC
   589  		// namespace." - man shmctl(2)
   590  		return nil, linuxerr.EACCES
   591  	}
   592  
   593  	var mode uint16
   594  	if s.pendingDestruction {
   595  		mode |= linux.SHM_DEST
   596  	}
   597  
   598  	// Use the reference count as a rudimentary count of the number of
   599  	// attaches. We exclude:
   600  	//
   601  	// 1. The reference the caller holds.
   602  	// 2. The self-reference held by s prior to destruction.
   603  	//
   604  	// Note that this may still overcount by including transient references
   605  	// used in concurrent calls.
   606  	nattach := uint64(s.ReadRefs()) - 1
   607  	if !s.pendingDestruction {
   608  		nattach--
   609  	}
   610  
   611  	ds := &linux.ShmidDS{
   612  		ShmPerm: linux.IPCPerm{
   613  			Key:  uint32(s.obj.Key),
   614  			UID:  uint32(creds.UserNamespace.MapFromKUID(s.obj.OwnerUID)),
   615  			GID:  uint32(creds.UserNamespace.MapFromKGID(s.obj.OwnerGID)),
   616  			CUID: uint32(creds.UserNamespace.MapFromKUID(s.obj.CreatorUID)),
   617  			CGID: uint32(creds.UserNamespace.MapFromKGID(s.obj.CreatorGID)),
   618  			Mode: mode | uint16(s.obj.Mode),
   619  			Seq:  0, // IPC sequences not supported.
   620  		},
   621  		ShmSegsz:   s.size,
   622  		ShmAtime:   s.attachTime.TimeT(),
   623  		ShmDtime:   s.detachTime.TimeT(),
   624  		ShmCtime:   s.changeTime.TimeT(),
   625  		ShmCpid:    s.creatorPID,
   626  		ShmLpid:    s.lastAttachDetachPID,
   627  		ShmNattach: nattach,
   628  	}
   629  
   630  	return ds, nil
   631  }
   632  
   633  // Set modifies attributes for a segment. See shmctl(IPC_SET).
   634  func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error {
   635  	s.mu.Lock()
   636  	defer s.mu.Unlock()
   637  
   638  	if err := s.obj.Set(ctx, &ds.ShmPerm); err != nil {
   639  		return err
   640  	}
   641  
   642  	s.changeTime = ktime.NowFromContext(ctx)
   643  	return nil
   644  }
   645  
   646  // MarkDestroyed marks a segment for destruction. The segment is actually
   647  // destroyed once it has no references. MarkDestroyed may be called multiple
   648  // times, and is safe to call after a segment has already been destroyed. See
   649  // shmctl(IPC_RMID).
   650  func (s *Shm) MarkDestroyed(ctx context.Context) {
   651  	s.registry.dissociateKey(s)
   652  
   653  	s.mu.Lock()
   654  	if s.pendingDestruction {
   655  		s.mu.Unlock()
   656  		return
   657  	}
   658  	s.pendingDestruction = true
   659  	s.mu.Unlock()
   660  
   661  	// Drop the self-reference so destruction occurs when all
   662  	// external references are gone.
   663  	//
   664  	// N.B. This cannot be the final DecRef, as the caller also
   665  	// holds a reference.
   666  	s.DecRef(ctx)
   667  	return
   668  }