github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/shm/shm.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package shm implements sysv shared memory segments.
//
// Known missing features:
//
// - SHM_LOCK/SHM_UNLOCK are no-ops. The sentry currently doesn't implement
//   memory locking in general.
//
// - SHM_HUGETLB and related flags for shmget(2) are ignored. There's no easy
//   way to implement hugetlb support on a per-map basis, and it has no impact
//   on correctness.
//
// - SHM_NORESERVE for shmget(2) is ignored; the sentry doesn't implement
//   swap, so reserving swap space for a segment is meaningless.
//
// - No per-process segment size enforcement. This feature probably isn't used
//   much anyway, since Linux sets the per-process limits to the system-wide
//   limits by default.
//
// Lock ordering: mm.mappingMu -> shm registry lock -> shm lock
package shm

import (
	"fmt"

	"github.com/SagerNet/gvisor/pkg/abi/linux"
	"github.com/SagerNet/gvisor/pkg/context"
	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
	"github.com/SagerNet/gvisor/pkg/hostarch"
	"github.com/SagerNet/gvisor/pkg/log"
	"github.com/SagerNet/gvisor/pkg/sentry/fs"
	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
	ktime "github.com/SagerNet/gvisor/pkg/sentry/kernel/time"
	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
	"github.com/SagerNet/gvisor/pkg/sentry/pgalloc"
	"github.com/SagerNet/gvisor/pkg/sentry/usage"
	"github.com/SagerNet/gvisor/pkg/sync"
	"github.com/SagerNet/gvisor/pkg/syserror"
)

// Key represents a shm segment key. Analogous to a file name.
type Key int32

// ID represents the opaque handle for a shm segment. Analogous to an fd.
type ID int32

// Registry tracks all shared memory segments in an IPC namespace. The registry
// provides the mechanisms for creating and finding segments, and reporting
// global shm parameters.
//
// +stateify savable
type Registry struct {
	// userNS owns the IPC namespace this registry belongs to. Immutable.
	userNS *auth.UserNamespace

	// mu protects all fields below.
	mu sync.Mutex `state:"nosave"`

	// shms maps segment ids to segments.
	//
	// shms holds all referenced segments, which are removed on the last
	// DecRef. Thus, it cannot itself hold a reference on the Shm.
	//
	// Since removal only occurs after the last (unlocked) DecRef, there
	// exists a short window during which a Shm still exists in shms, but is
	// unreferenced. Users must use TryIncRef to determine if the Shm is
	// still valid.
	shms map[ID]*Shm

	// keysToShms maps segment keys to segments.
	//
	// Shms in keysToShms are guaranteed to be referenced, as they are
	// removed by dissociateKey before the last DecRef.
	keysToShms map[Key]*Shm

	// Sum of the sizes of all existing segments, each rounded up to the
	// page size, in units of pages.
	totalPages uint64

	// ID assigned to the last created segment. Used to quickly find the next
	// unused ID.
	lastIDUsed ID
}

// NewRegistry creates a new shm registry.
func NewRegistry(userNS *auth.UserNamespace) *Registry {
	return &Registry{
		userNS:     userNS,
		shms:       make(map[ID]*Shm),
		keysToShms: make(map[Key]*Shm),
	}
}

// FindByID looks up a segment given an ID.
//
// FindByID returns a reference on Shm.
func (r *Registry) FindByID(id ID) *Shm {
	r.mu.Lock()
	defer r.mu.Unlock()
	s := r.shms[id]
	// Take a reference on s. If TryIncRef fails, s has reached the last
	// DecRef, but hasn't quite been removed from r.shms yet.
	if s != nil && s.TryIncRef() {
		return s
	}
	return nil
}
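
// A minimal caller-side sketch of the lookup pattern above (hypothetical
// helper, not part of this package; a real caller would be a shmctl/shmat
// syscall handler running with a task context):
//
//	func statSegment(ctx context.Context, r *Registry, id ID) (*linux.ShmidDS, error) {
//		s := r.FindByID(id) // nil, or a segment with a reference held for us
//		if s == nil {
//			return nil, linuxerr.EINVAL
//		}
//		defer s.DecRef(ctx) // drop the reference taken by FindByID
//		return s.IPCStat(ctx)
//	}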

// dissociateKey removes the association between a segment and its key,
// preventing it from being discovered in the registry. This doesn't necessarily
// mean the segment is about to be destroyed. This is analogous to unlinking a
// file; the segment can still be used by a process already referencing it, but
// cannot be discovered by a new process.
func (r *Registry) dissociateKey(s *Shm) {
	r.mu.Lock()
	defer r.mu.Unlock()
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.key != linux.IPC_PRIVATE {
		delete(r.keysToShms, s.key)
		s.key = linux.IPC_PRIVATE
	}
}

// FindOrCreate looks up or creates a segment in the registry. It's functionally
// analogous to open(2).
//
// FindOrCreate returns a reference on Shm.
func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size uint64, mode linux.FileMode, private, create, exclusive bool) (*Shm, error) {
	if (create || private) && (size < linux.SHMMIN || size > linux.SHMMAX) {
		// "A new segment was to be created and size is less than SHMMIN or
		// greater than SHMMAX." - man shmget(2)
		//
		// Note that 'private' always implies the creation of a new segment
		// whether IPC_CREAT is specified or not.
		return nil, linuxerr.EINVAL
	}

	r.mu.Lock()
	defer r.mu.Unlock()

	if len(r.shms) >= linux.SHMMNI {
		// "All possible shared memory IDs have been taken (SHMMNI) ..."
		//   - man shmget(2)
		return nil, syserror.ENOSPC
	}

	if !private {
		// Look up an existing segment.
		if shm := r.keysToShms[key]; shm != nil {
			shm.mu.Lock()
			defer shm.mu.Unlock()

			// Check that caller can access the segment.
			if !shm.checkPermissions(ctx, fs.PermsFromMode(mode)) {
				// "The user does not have permission to access the shared
				// memory segment, and does not have the CAP_IPC_OWNER
				// capability in the user namespace that governs its IPC
				// namespace." - man shmget(2)
				return nil, linuxerr.EACCES
			}

			if size > shm.size {
				// "A segment for the given key exists, but size is greater than
				// the size of that segment." - man shmget(2)
				return nil, linuxerr.EINVAL
			}

			if create && exclusive {
				// "IPC_CREAT and IPC_EXCL were specified in shmflg, but a
				// shared memory segment already exists for key."
				//  - man shmget(2)
				return nil, syserror.EEXIST
			}

			shm.IncRef()
			return shm, nil
		}

		if !create {
			// "No segment exists for the given key, and IPC_CREAT was not
			// specified." - man shmget(2)
			return nil, syserror.ENOENT
		}
	}

	var sizeAligned uint64
	if val, ok := hostarch.Addr(size).RoundUp(); ok {
		sizeAligned = uint64(val)
	} else {
		return nil, linuxerr.EINVAL
	}

	if numPages := sizeAligned / hostarch.PageSize; r.totalPages+numPages > linux.SHMALL {
		// "... allocating a segment of the requested size would cause the
		// system to exceed the system-wide limit on shared memory (SHMALL)."
		//   - man shmget(2)
		return nil, syserror.ENOSPC
	}

	// Need to create a new segment.
	creator := fs.FileOwnerFromContext(ctx)
	perms := fs.FilePermsFromMode(mode)
	s, err := r.newShm(ctx, pid, key, creator, perms, size)
	if err != nil {
		return nil, err
	}
	// The initial reference is held by s itself. Take another to return to
	// the caller.
	s.IncRef()
	return s, nil
}
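
// A minimal sketch of how a shmget(2)-style handler might drive FindOrCreate
// (hypothetical helper; the flag decoding and pid plumbing are assumptions,
// not part of this package):
//
//	func shmget(ctx context.Context, r *Registry, pid int32, key Key, size uint64, flag int32) (ID, error) {
//		private := key == linux.IPC_PRIVATE
//		create := flag&linux.IPC_CREAT != 0
//		exclusive := flag&linux.IPC_EXCL != 0
//		mode := linux.FileMode(flag & 0777)
//
//		s, err := r.FindOrCreate(ctx, pid, key, size, mode, private, create, exclusive)
//		if err != nil {
//			return 0, err
//		}
//		// FindOrCreate returned a reference; the caller only needs the ID,
//		// so drop it. The segment stays alive via its self-reference.
//		defer s.DecRef(ctx)
//		return s.ID, nil
//	}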

// newShm creates a new segment in the registry.
//
// Precondition: Caller must hold r.mu.
func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.FileOwner, perms fs.FilePermissions, size uint64) (*Shm, error) {
	mfp := pgalloc.MemoryFileProviderFromContext(ctx)
	if mfp == nil {
		panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, pgalloc.CtxMemoryFileProvider))
	}

	effectiveSize := uint64(hostarch.Addr(size).MustRoundUp())
	fr, err := mfp.MemoryFile().Allocate(effectiveSize, usage.Anonymous)
	if err != nil {
		return nil, err
	}

	shm := &Shm{
		mfp:           mfp,
		registry:      r,
		creator:       creator,
		size:          size,
		effectiveSize: effectiveSize,
		fr:            fr,
		key:           key,
		perms:         perms,
		owner:         creator,
		creatorPID:    pid,
		changeTime:    ktime.NowFromContext(ctx),
	}
	shm.InitRefs()

	// Find the next available ID.
	for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ {
		// Handle wrap around.
		if id < 0 {
			id = 0
			continue
		}
		if r.shms[id] == nil {
			r.lastIDUsed = id

			shm.ID = id
			r.shms[id] = shm
			r.keysToShms[key] = shm

			r.totalPages += effectiveSize / hostarch.PageSize

			return shm, nil
		}
	}

	log.Warningf("Shm ids exhausted, they may be leaking")
	return nil, syserror.ENOSPC
}

// IPCInfo reports global parameters for sysv shared memory segments on this
// system. See shmctl(IPC_INFO).
func (r *Registry) IPCInfo() *linux.ShmParams {
	return &linux.ShmParams{
		ShmMax: linux.SHMMAX,
		ShmMin: linux.SHMMIN,
		ShmMni: linux.SHMMNI,
		ShmSeg: linux.SHMSEG,
		ShmAll: linux.SHMALL,
	}
}

// ShmInfo reports linux-specific global parameters for sysv shared memory
// segments on this system. See shmctl(SHM_INFO).
func (r *Registry) ShmInfo() *linux.ShmInfo {
	r.mu.Lock()
	defer r.mu.Unlock()

	return &linux.ShmInfo{
		UsedIDs: int32(r.lastIDUsed),
		ShmTot:  r.totalPages,
		ShmRss:  r.totalPages, // We could probably get a better estimate from memory accounting.
		ShmSwp:  0,            // No reclaim at the moment.
	}
}

// remove deletes a segment from this registry, deaccounting the memory used by
// the segment.
//
// Precondition: Must follow a call to r.dissociateKey(s).
func (r *Registry) remove(s *Shm) {
	r.mu.Lock()
	defer r.mu.Unlock()
	s.mu.Lock()
	defer s.mu.Unlock()

	if s.key != linux.IPC_PRIVATE {
		panic(fmt.Sprintf("Attempted to remove %s from the registry while its key is still associated", s.debugLocked()))
	}

	delete(r.shms, s.ID)
	r.totalPages -= s.effectiveSize / hostarch.PageSize
}

// Release drops the self-reference of each active shm segment in the registry.
// It is called when the kernel.IPCNamespace containing r is being destroyed.
func (r *Registry) Release(ctx context.Context) {
	// Because Shm.DecRef() may acquire the same locks, collect the segments to
	// release first. Note that this should not race with any updates to r, since
	// the IPC namespace containing it has no more references.
	toRelease := make([]*Shm, 0)
	r.mu.Lock()
	for _, s := range r.keysToShms {
		s.mu.Lock()
		if !s.pendingDestruction {
			toRelease = append(toRelease, s)
		}
		s.mu.Unlock()
	}
	r.mu.Unlock()

	for _, s := range toRelease {
		r.dissociateKey(s)
		s.DecRef(ctx)
	}
}

// Shm represents a single shared memory segment.
//
// Shm segments are backed directly by an allocation from platform memory.
// Segments are always mapped as a whole, greatly simplifying how mappings are
// tracked. However note that mremap and munmap calls may cause the vma for a
// segment to become fragmented, which requires special care when unmapping a
// segment. See mm/shm.go.
//
// Segments persist until they are explicitly marked for destruction via
// MarkDestroyed().
//
// Shm implements memmap.Mappable and memmap.MappingIdentity.
//
// +stateify savable
type Shm struct {
	// ShmRefs tracks the number of references to this segment.
	//
	// A segment holds a reference to itself until it is marked for
	// destruction.
	//
	// In addition to direct users, the MemoryManager will hold references
	// via MappingIdentity.
	ShmRefs

	mfp pgalloc.MemoryFileProvider

	// registry points to the shm registry containing this segment. Immutable.
	registry *Registry

	// ID is the kernel identifier for this segment. Immutable.
	ID ID

	// creator is the user that created the segment. Immutable.
	creator fs.FileOwner

	// size is the requested size of the segment at creation, in
	// bytes. Immutable.
	size uint64

	// effectiveSize of the segment, rounded up to the next page
	// boundary. Immutable.
	//
	// Invariant: effectiveSize must be a multiple of hostarch.PageSize.
	effectiveSize uint64

	// fr is the offset into mfp.MemoryFile() that backs the contents of this
	// segment. Immutable.
	fr memmap.FileRange

	// mu protects all fields below.
	mu sync.Mutex `state:"nosave"`

	// key is the public identifier for this segment.
	key Key

	// perms is the access permissions for the segment.
	perms fs.FilePermissions

	// owner of this segment.
	owner fs.FileOwner
	// attachTime is updated on every successful shmat.
	attachTime ktime.Time
	// detachTime is updated on every successful shmdt.
	detachTime ktime.Time
	// changeTime is updated on every successful change to the segment via
	// shmctl(IPC_SET).
	changeTime ktime.Time

	// creatorPID is the PID of the process that created the segment.
	creatorPID int32
	// lastAttachDetachPID is the pid of the process that issued the last shmat
	// or shmdt syscall.
	lastAttachDetachPID int32

	// pendingDestruction indicates the segment was marked as destroyed through
	// shmctl(IPC_RMID). When marked as destroyed, the segment will not be found
	// in the registry and can no longer be attached. When the last user
	// detaches from the segment, it is destroyed.
	pendingDestruction bool
}

// Precondition: Caller must hold s.mu.
func (s *Shm) debugLocked() string {
	return fmt.Sprintf("Shm{id: %d, key: %d, size: %d bytes, refs: %d, destroyed: %v}",
		s.ID, s.key, s.size, s.ReadRefs(), s.pendingDestruction)
}

// MappedName implements memmap.MappingIdentity.MappedName.
func (s *Shm) MappedName(ctx context.Context) string {
	s.mu.Lock()
	defer s.mu.Unlock()
	return fmt.Sprintf("SYSV%08d", s.key)
}

// DeviceID implements memmap.MappingIdentity.DeviceID.
func (s *Shm) DeviceID() uint64 {
	return shmDevice.DeviceID()
}

// InodeID implements memmap.MappingIdentity.InodeID.
func (s *Shm) InodeID() uint64 {
	// "shmid gets reported as "inode#" in /proc/pid/maps. proc-ps tools use
	// this. Changing this will break them." -- Linux, ipc/shm.c:newseg()
	return uint64(s.ID)
}

// DecRef drops a reference on s.
//
// Precondition: Caller must not hold s.mu.
func (s *Shm) DecRef(ctx context.Context) {
	s.ShmRefs.DecRef(func() {
		s.mfp.MemoryFile().DecRef(s.fr)
		s.registry.remove(s)
	})
}

// Msync implements memmap.MappingIdentity.Msync. Msync is a no-op for shm
// segments.
func (s *Shm) Msync(context.Context, memmap.MappableRange) error {
	return nil
}

// AddMapping implements memmap.Mappable.AddMapping.
func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ hostarch.AddrRange, _ uint64, _ bool) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.attachTime = ktime.NowFromContext(ctx)
	if pid, ok := context.ThreadGroupIDFromContext(ctx); ok {
		s.lastAttachDetachPID = pid
	} else {
		// AddMapping is called during a syscall, so ctx should always be a task
		// context.
		log.Warningf("Adding mapping to %s but couldn't get the current pid; not updating the last attach pid", s.debugLocked())
	}
	return nil
}

// RemoveMapping implements memmap.Mappable.RemoveMapping.
func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ hostarch.AddrRange, _ uint64, _ bool) {
	s.mu.Lock()
	defer s.mu.Unlock()
	// RemoveMapping may be called during task exit, when ctx
	// is context.Background. Gracefully handle missing clocks. Failing to
	// update the detach time in these cases is ok, since no one can observe the
	// omission.
	if clock := ktime.RealtimeClockFromContext(ctx); clock != nil {
		s.detachTime = clock.Now()
	}

	// If called from a non-task context, we also won't have a threadgroup
	// id. Silently skip updating the lastAttachDetachPID in that case.
	if pid, ok := context.ThreadGroupIDFromContext(ctx); ok {
		s.lastAttachDetachPID = pid
	} else {
		log.Debugf("Couldn't obtain pid when removing mapping to %s, not updating the last detach pid.", s.debugLocked())
	}
}

// CopyMapping implements memmap.Mappable.CopyMapping.
func (*Shm) CopyMapping(context.Context, memmap.MappingSpace, hostarch.AddrRange, hostarch.AddrRange, uint64, bool) error {
	return nil
}

// Translate implements memmap.Mappable.Translate.
func (s *Shm) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) {
	var err error
	if required.End > s.fr.Length() {
		err = &memmap.BusError{syserror.EFAULT}
	}
	if source := optional.Intersect(memmap.MappableRange{0, s.fr.Length()}); source.Length() != 0 {
		return []memmap.Translation{
			{
				Source: source,
				File:   s.mfp.MemoryFile(),
				Offset: s.fr.Start + source.Start,
				Perms:  hostarch.AnyAccess,
			},
		}, err
	}
	return nil, err
}

// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
func (s *Shm) InvalidateUnsavable(ctx context.Context) error {
	return nil
}

// AttachOpts describes various flags passed to shmat(2).
type AttachOpts struct {
	Execute  bool
	Readonly bool
	Remap    bool
}
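
// A minimal sketch of decoding shmat(2) flags into AttachOpts (hypothetical
// helper; the real syscall handler lives outside this package):
//
//	func attachOptsFromFlags(flag int32) AttachOpts {
//		return AttachOpts{
//			Execute:  flag&linux.SHM_EXEC != 0,
//			Readonly: flag&linux.SHM_RDONLY != 0,
//			Remap:    flag&linux.SHM_REMAP != 0,
//		}
//	}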

// ConfigureAttach creates an mmap configuration for the segment with the
// requested attach options.
//
// Postconditions: The returned MMapOpts are valid only as long as a reference
// continues to be held on s.
func (s *Shm) ConfigureAttach(ctx context.Context, addr hostarch.Addr, opts AttachOpts) (memmap.MMapOpts, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.pendingDestruction && s.ReadRefs() == 0 {
		return memmap.MMapOpts{}, syserror.EIDRM
	}

	if !s.checkPermissions(ctx, fs.PermMask{
		Read:    true,
		Write:   !opts.Readonly,
		Execute: opts.Execute,
	}) {
		// "The calling process does not have the required permissions for the
		// requested attach type, and does not have the CAP_IPC_OWNER capability
		// in the user namespace that governs its IPC namespace." - man shmat(2)
		return memmap.MMapOpts{}, linuxerr.EACCES
	}
	return memmap.MMapOpts{
		Length: s.size,
		Offset: 0,
		Addr:   addr,
		Fixed:  opts.Remap,
		Perms: hostarch.AccessType{
			Read:    true,
			Write:   !opts.Readonly,
			Execute: opts.Execute,
		},
		MaxPerms:        hostarch.AnyAccess,
		Mappable:        s,
		MappingIdentity: s,
	}, nil
}
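
// A minimal sketch of a shmat(2)-style attach built on ConfigureAttach
// (hypothetical; the mmapper interface and the attachOptsFromFlags helper
// from the sketch above are assumptions standing in for the task's memory
// manager and the syscall layer's flag decoding):
//
//	type mmapper interface {
//		MMap(ctx context.Context, opts memmap.MMapOpts) (hostarch.Addr, error)
//	}
//
//	func shmat(ctx context.Context, m mmapper, r *Registry, id ID, addr hostarch.Addr, flag int32) (hostarch.Addr, error) {
//		s := r.FindByID(id)
//		if s == nil {
//			return 0, linuxerr.EINVAL
//		}
//		defer s.DecRef(ctx)
//		opts, err := s.ConfigureAttach(ctx, addr, attachOptsFromFlags(flag))
//		if err != nil {
//			return 0, err
//		}
//		// opts carries s as both Mappable and MappingIdentity, so the new
//		// mapping keeps the segment referenced after our DecRef.
//		return m.MMap(ctx, opts)
//	}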

// EffectiveSize returns the size of the underlying shared memory segment. This
// may be larger than the requested size at creation, due to rounding to page
// boundaries.
func (s *Shm) EffectiveSize() uint64 {
	return s.effectiveSize
}

// IPCStat returns information about a shm. See shmctl(IPC_STAT).
func (s *Shm) IPCStat(ctx context.Context) (*linux.ShmidDS, error) {
	s.mu.Lock()
	defer s.mu.Unlock()

	// "The caller must have read permission on the shared memory segment."
	//   - man shmctl(2)
	if !s.checkPermissions(ctx, fs.PermMask{Read: true}) {
		// "IPC_STAT or SHM_STAT is requested and shm_perm.mode does not allow
		// read access for shmid, and the calling process does not have the
		// CAP_IPC_OWNER capability in the user namespace that governs its IPC
		// namespace." - man shmctl(2)
		return nil, linuxerr.EACCES
	}

	var mode uint16
	if s.pendingDestruction {
		mode |= linux.SHM_DEST
	}
	creds := auth.CredentialsFromContext(ctx)

	// Use the reference count as a rudimentary count of the number of
	// attaches. We exclude:
	//
	// 1. The reference the caller holds.
	// 2. The self-reference held by s prior to destruction.
	//
	// Note that this may still overcount by including transient references
	// used in concurrent calls.
	nattach := uint64(s.ReadRefs()) - 1
	if !s.pendingDestruction {
		nattach--
	}

	ds := &linux.ShmidDS{
		ShmPerm: linux.IPCPerm{
			Key:  uint32(s.key),
			UID:  uint32(creds.UserNamespace.MapFromKUID(s.owner.UID)),
			GID:  uint32(creds.UserNamespace.MapFromKGID(s.owner.GID)),
			CUID: uint32(creds.UserNamespace.MapFromKUID(s.creator.UID)),
			CGID: uint32(creds.UserNamespace.MapFromKGID(s.creator.GID)),
			Mode: mode | uint16(s.perms.LinuxMode()),
			Seq:  0, // IPC sequences not supported.
		},
		ShmSegsz:   s.size,
		ShmAtime:   s.attachTime.TimeT(),
		ShmDtime:   s.detachTime.TimeT(),
		ShmCtime:   s.changeTime.TimeT(),
		ShmCpid:    s.creatorPID,
		ShmLpid:    s.lastAttachDetachPID,
		ShmNattach: nattach,
	}

	return ds, nil
}
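
// A worked example of IPCStat's attach estimate (illustrative numbers): a
// live segment with one mapping typically has ReadRefs() == 3 (the
// self-reference, the caller's FindByID reference, and the mapping's
// MappingIdentity reference), so nattach = 3 - 1 - 1 = 1.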

// Set modifies attributes for a segment. See shmctl(IPC_SET).
func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	if !s.checkOwnership(ctx) {
		return linuxerr.EPERM
	}

	creds := auth.CredentialsFromContext(ctx)
	uid := creds.UserNamespace.MapToKUID(auth.UID(ds.ShmPerm.UID))
	gid := creds.UserNamespace.MapToKGID(auth.GID(ds.ShmPerm.GID))
	if !uid.Ok() || !gid.Ok() {
		return linuxerr.EINVAL
	}

	// User may only modify the lower 9 bits of the mode. All the other bits are
	// always 0 for the underlying inode.
	mode := linux.FileMode(ds.ShmPerm.Mode & 0x1ff)
	s.perms = fs.FilePermsFromMode(mode)

	s.owner.UID = uid
	s.owner.GID = gid

	s.changeTime = ktime.NowFromContext(ctx)
	return nil
}

// MarkDestroyed marks a segment for destruction. The segment is actually
// destroyed once it has no references. MarkDestroyed may be called multiple
// times, and is safe to call after a segment has already been destroyed. See
// shmctl(IPC_RMID).
func (s *Shm) MarkDestroyed(ctx context.Context) {
	s.registry.dissociateKey(s)

	s.mu.Lock()
	if s.pendingDestruction {
		s.mu.Unlock()
		return
	}
	s.pendingDestruction = true
	s.mu.Unlock()

	// Drop the self-reference so destruction occurs when all
	// external references are gone.
	//
	// N.B. This cannot be the final DecRef, as the caller also
	// holds a reference.
	s.DecRef(ctx)
}
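
// A minimal sketch of a shmctl(IPC_RMID)-style handler built on MarkDestroyed
// (hypothetical helper; error mapping is an assumption):
//
//	func shmRemoveID(ctx context.Context, r *Registry, id ID) error {
//		s := r.FindByID(id)
//		if s == nil {
//			return linuxerr.EINVAL
//		}
//		defer s.DecRef(ctx) // drop the lookup reference
//		// A complete handler would also verify ownership, as Linux's
//		// ipcctl_pre_down_nolock() does.
//		s.MarkDestroyed(ctx)
//		// The segment is finally destroyed once the last mapping detaches
//		// and the last reference is dropped.
//		return nil
//	}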

// checkOwnership verifies whether a segment may be accessed by ctx as an
// owner. See ipc/util.c:ipcctl_pre_down_nolock() in Linux.
//
// Precondition: Caller must hold s.mu.
func (s *Shm) checkOwnership(ctx context.Context) bool {
	creds := auth.CredentialsFromContext(ctx)
	if s.owner.UID == creds.EffectiveKUID || s.creator.UID == creds.EffectiveKUID {
		return true
	}

	// Tasks with CAP_SYS_ADMIN may bypass ownership checks. Strangely, Linux
	// doesn't use CAP_IPC_OWNER for this, despite CAP_IPC_OWNER being
	// documented as the capability to "override IPC ownership checks".
	return creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, s.registry.userNS)
}

// checkPermissions verifies whether a segment is accessible by ctx for access
// described by req. See ipc/util.c:ipcperms() in Linux.
//
// Precondition: Caller must hold s.mu.
func (s *Shm) checkPermissions(ctx context.Context, req fs.PermMask) bool {
	creds := auth.CredentialsFromContext(ctx)

	p := s.perms.Other
	if s.owner.UID == creds.EffectiveKUID {
		p = s.perms.User
	} else if creds.InGroup(s.owner.GID) {
		p = s.perms.Group
	}
	if p.SupersetOf(req) {
		return true
	}

	// Tasks with CAP_IPC_OWNER may bypass permission checks.
	return creds.HasCapabilityIn(linux.CAP_IPC_OWNER, s.registry.userNS)
}