github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/kernel/mq/mq.go (about)

     1  // Copyright 2021 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package mq provides an implementation for POSIX message queues.
    16  package mq
    17  
    18  import (
    19  	"bytes"
    20  	"fmt"
    21  	"strings"
    22  
    23  	"github.com/metacubex/gvisor/pkg/abi/linux"
    24  	"github.com/metacubex/gvisor/pkg/context"
    25  	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
    26  	"github.com/metacubex/gvisor/pkg/sentry/kernel/auth"
    27  	"github.com/metacubex/gvisor/pkg/sentry/vfs"
    28  	"github.com/metacubex/gvisor/pkg/sync"
    29  	"github.com/metacubex/gvisor/pkg/waiter"
    30  )
    31  
    32  // AccessType is the access type passed to mq_open. FindOrCreate maps it to
        // the O_RDONLY/O_WRONLY/O_RDWR status flag and NewView maps it to the
        // concrete view type.
    33  type AccessType int
    34  
    35  // Possible access types.
    36  const (
    37  	ReadOnly AccessType = iota // NewView returns a Reader.
    38  	WriteOnly                  // NewView returns a Writer.
    39  	ReadWrite                  // NewView returns a ReaderWriter.
    40  )
    41  
    42  // MaxName is the maximum length, in bytes, of a queue name. FindOrCreate and
        // Remove reject longer names with ENAMETOOLONG.
    43  const MaxName = 255
    44  
    45  const (
    46  	maxPriority = linux.MQ_PRIO_MAX - 1 // Highest possible message priority.
    47  
    48  	maxQueuesDefault = linux.DFLT_QUEUESMAX // Default max number of queues.
    49  
    50  	maxMsgDefault   = linux.DFLT_MSG    // Max number of messages per queue when no attr is given.
    51  	maxMsgMin       = linux.MIN_MSGMAX  // Min value for max number of messages per queue.
    52  	maxMsgLimit     = linux.DFLT_MSGMAX // Max messages per queue a caller without CAP_SYS_RESOURCE may request.
    53  	maxMsgHardLimit = linux.HARD_MSGMAX // Max messages per queue any caller may request.
    54  
    55  	msgSizeDefault   = linux.DFLT_MSGSIZE    // Max message size when no attr is given.
    56  	msgSizeMin       = linux.MIN_MSGSIZEMAX  // Min value for max message size.
    57  	msgSizeLimit     = linux.DFLT_MSGSIZEMAX // Max message size a caller without CAP_SYS_RESOURCE may request.
    58  	msgSizeHardLimit = linux.HARD_MSGSIZEMAX // Hard limit for max message size.
    59  )
    60  
    61  // Registry is a POSIX message queue registry.
    62  //
    63  // Unlike SysV utilities, Registry is not map-based. It uses a provided
    64  // RegistryImpl backed by a virtual filesystem to implement registry operations.
    65  //
    66  // +stateify savable
    67  type Registry struct {
    68  	// userNS is the user namespace containing this registry. Immutable.
        	// newQueueLocked checks CAP_SYS_RESOURCE against this namespace.
    69  	userNS *auth.UserNamespace
    70  
    71  	// mu protects all fields below.
        	// FindOrCreate, Remove and Destroy hold it while calling into impl.
    72  	mu sync.Mutex `state:"nosave"`
    73  
    74  	// impl is an implementation of several message queue utilities needed by
    75  	// the registry. impl should be provided by mqfs.
    76  	impl RegistryImpl
    77  }
    78  
    79  // RegistryImpl defines utilities needed by a Registry to provide actual
    80  // registry implementation. It works mainly as an abstraction layer used by
    81  // Registry to avoid dealing directly with the filesystem. RegistryImpl should
    82  // be implemented by mqfs and provided to Registry at initialization.
    83  type RegistryImpl interface {
    84  	// Get searches for a queue with the given name. If it exists, the queue
    85  	// is used to create a new FD, which is returned together with true. If the
    86  	// queue doesn't exist, false and no error are returned. An error is
        	// returned if creation of the FD fails.
    87  	Get(ctx context.Context, name string, access AccessType, block bool, flags uint32) (*vfs.FileDescription, bool, error)
    88  
    89  	// New creates a new inode and file description using the given queue,
    90  	// inserts the inode into the filesystem tree using the given name, and
    91  	// returns the file description. An error is returned if creation fails, or
    92  	// if the name already exists.
    93  	New(ctx context.Context, name string, q *Queue, access AccessType, block bool, perm linux.FileMode, flags uint32) (*vfs.FileDescription, error)
    94  
    95  	// Unlink removes the queue with given name from the registry, and returns
    96  	// an error if the name doesn't exist.
    97  	Unlink(ctx context.Context, name string) error
    98  
    99  	// Destroy destroys the registry.
   100  	Destroy(context.Context)
   101  }
   102  
   103  // NewRegistry returns a new, initialized message queue registry. NewRegistry
   104  // should be called when a new message queue filesystem is created, once per
   105  // IPCNamespace.
   106  func NewRegistry(userNS *auth.UserNamespace, impl RegistryImpl) *Registry {
   107  	return &Registry{
   108  		userNS: userNS,
   109  		impl:   impl,
   110  	}
   111  }
   112  
   113  // OpenOpts holds the options passed to FindOrCreate.
   114  type OpenOpts struct {
   115  	Name      string     // Queue name; must be non-empty, at most MaxName bytes, and contain no '/'.
   116  	Access    AccessType // Requested access mode.
   117  	Create    bool       // Create the queue if no queue with Name exists (O_CREAT).
   118  	Exclusive bool       // Together with Create, fail with EEXIST if the queue exists (O_EXCL).
   119  	Block     bool       // Forwarded to RegistryImpl as the block argument; also drives O_NONBLOCK in the status flags.
   120  }
   121  
   122  // FindOrCreate creates a new POSIX message queue or opens an existing queue.
   123  // See mq_open(2).
   124  func (r *Registry) FindOrCreate(ctx context.Context, opts OpenOpts, mode linux.FileMode, attr *linux.MqAttr) (*vfs.FileDescription, error) {
   125  	// mq_overview(7) mentions that: "Each message queue is identified by a name
   126  	// of the form '/somename'", but the mq_open(3) man pages mention:
   127  	//   "The mq_open() library function is implemented on top of a system call
   128  	//    of the same name.  The library function performs the check that the
   129  	//    name starts with a slash (/), giving the EINVAL error if it does not.
   130  	//    The kernel system call expects name to contain no preceding slash, so
   131  	//    the C library function passes name without the preceding slash (i.e.,
   132  	//    name+1) to the system call."
   133  	// So we don't need to check it.
   134  
   135  	if len(opts.Name) == 0 {
   136  		return nil, linuxerr.ENOENT
   137  	}
   138  	if len(opts.Name) > MaxName {
   139  		return nil, linuxerr.ENAMETOOLONG
   140  	}
   141  	if strings.ContainsRune(opts.Name, '/') {
   142  		return nil, linuxerr.EACCES
   143  	}
   144  	if opts.Name == "." || opts.Name == ".." {
   145  		return nil, linuxerr.EINVAL
   146  	}
   147  
   148  	// Construct status flags.
   149  	var flags uint32
   150  	if opts.Block {
   151  		flags = linux.O_NONBLOCK
   152  	}
   153  	switch opts.Access {
   154  	case ReadOnly:
   155  		flags = flags | linux.O_RDONLY
   156  	case WriteOnly:
   157  		flags = flags | linux.O_WRONLY
   158  	case ReadWrite:
   159  		flags = flags | linux.O_RDWR
   160  	}
   161  
   162  	r.mu.Lock()
   163  	defer r.mu.Unlock()
   164  	fd, ok, err := r.impl.Get(ctx, opts.Name, opts.Access, opts.Block, flags)
   165  	if err != nil {
   166  		return nil, err
   167  	}
   168  
   169  	if ok {
   170  		if opts.Create && opts.Exclusive {
   171  			// "Both O_CREAT and O_EXCL were specified in oflag, but a queue
   172  			//  with this name already exists."
   173  			fd.DecRef(ctx)
   174  			return nil, linuxerr.EEXIST
   175  		}
   176  		return fd, nil
   177  	}
   178  
   179  	if !opts.Create {
   180  		// "The O_CREAT flag was not specified in oflag, and no queue with this name
   181  		//  exists."
   182  		return nil, linuxerr.ENOENT
   183  	}
   184  
   185  	q, err := r.newQueueLocked(auth.CredentialsFromContext(ctx), mode, attr)
   186  	if err != nil {
   187  		return nil, err
   188  	}
   189  	return r.impl.New(ctx, opts.Name, q, opts.Access, opts.Block, mode.Permissions(), flags)
   190  }
   191  
   192  // newQueueLocked creates a new queue using the given attributes. If attr is nil
   193  // return a queue with default values, otherwise use attr to create a new queue,
   194  // and return an error if attributes are invalid.
   195  func (r *Registry) newQueueLocked(creds *auth.Credentials, mode linux.FileMode, attr *linux.MqAttr) (*Queue, error) {
   196  	if attr == nil {
   197  		return &Queue{
   198  			ownerUID:        creds.EffectiveKUID,
   199  			ownerGID:        creds.EffectiveKGID,
   200  			mode:            mode,
   201  			maxMessageCount: int64(maxMsgDefault),
   202  			maxMessageSize:  uint64(msgSizeDefault),
   203  		}, nil
   204  	}
   205  
   206  	// "O_CREAT was specified in oflag, and attr was not NULL, but
   207  	//  attr->mq_maxmsg or attr->mq_msqsize was invalid.  Both of these fields
   208  	//  these fields must be greater than zero.  In a process that is
   209  	//  unprivileged (does not have the CAP_SYS_RESOURCE capability),
   210  	//  attr->mq_maxmsg must be less than or equal to the msg_max limit, and
   211  	//  attr->mq_msgsize must be less than or equal to the msgsize_max limit.
   212  	//  In addition, even in a privileged process, attr->mq_maxmsg cannot
   213  	//  exceed the HARD_MAX limit." - man mq_open(3).
   214  	if attr.MqMaxmsg <= 0 || attr.MqMsgsize <= 0 {
   215  		return nil, linuxerr.EINVAL
   216  	}
   217  
   218  	if attr.MqMaxmsg > maxMsgHardLimit || (!creds.HasCapabilityIn(linux.CAP_SYS_RESOURCE, r.userNS) && (attr.MqMaxmsg > maxMsgLimit || attr.MqMsgsize > msgSizeLimit)) {
   219  		return nil, linuxerr.EINVAL
   220  	}
   221  
   222  	return &Queue{
   223  		ownerUID:        creds.EffectiveKUID,
   224  		ownerGID:        creds.EffectiveKGID,
   225  		mode:            mode,
   226  		maxMessageCount: attr.MqMaxmsg,
   227  		maxMessageSize:  uint64(attr.MqMsgsize),
   228  	}, nil
   229  }
   230  
   231  // Remove removes the queue with the given name from the registry. See
   232  // mq_unlink(2).
        //
        // Names longer than MaxName are rejected with ENAMETOOLONG before r.mu is
        // taken. Otherwise the result of RegistryImpl.Unlink, called with r.mu held,
        // is returned unchanged.
   233  func (r *Registry) Remove(ctx context.Context, name string) error {
   234  	if len(name) > MaxName {
   235  		return linuxerr.ENAMETOOLONG
   236  	}
   237  
   238  	r.mu.Lock()
   239  	defer r.mu.Unlock()
   240  	return r.impl.Unlink(ctx, name)
   241  }
   242  
   243  // Destroy destroys the registry and releases all held references.
        // The work is delegated to RegistryImpl.Destroy, called with r.mu held.
   244  func (r *Registry) Destroy(ctx context.Context) {
   245  	r.mu.Lock()
   246  	defer r.mu.Unlock()
   247  	r.impl.Destroy(ctx)
   248  }
   249  
   250  // Impl returns the RegistryImpl backing r. The field is read without taking
        // r.mu; in the code shown here it is only assigned by NewRegistry.
   251  func (r *Registry) Impl() RegistryImpl {
   252  	return r.impl
   253  }
   254  
   255  // Queue represents a POSIX message queue.
   256  //
   257  // +stateify savable
   258  type Queue struct {
   259  	// ownerUID is the queue owner's UID (the creator's effective KUID). Immutable.
   260  	ownerUID auth.KUID
   261  
   262  	// ownerGID is the queue owner's GID (the creator's effective KGID). Immutable.
   263  	ownerGID auth.KGID
   264  
   265  	// mode is the queue's file mode; HasPermissions checks its permission bits. Immutable.
   266  	mode linux.FileMode
   267  
   268  	// mu protects all the fields below.
   269  	mu sync.Mutex `state:"nosave"`
   270  
   271  	// queue is the queue of waiters.
   272  	queue waiter.Queue
   273  
   274  	// messages is a list of messages currently in the queue.
   275  	messages msgList
   276  
   277  	// subscriber represents a task registered to receive async notification
   278  	// from this queue. It is nil when no task is registered.
   279  	subscriber *Subscriber
   280  
   281  	// messageCount is the number of messages currently in the queue.
   282  	messageCount int64
   283  
   284  	// maxMessageCount is the maximum number of messages that the queue can
   285  	// hold.
   286  	maxMessageCount int64
   287  
   288  	// maxMessageSize is the maximum size of a message held by the queue.
   289  	maxMessageSize uint64
   290  
   291  	// byteCount is the number of bytes of data in all messages in the queue.
   292  	byteCount uint64
   293  }
   294  
   295  // View is a view into a message queue. Views should only be used in file
   296  // descriptions, but not inodes, because we use inodes to retrieve the actual
   297  // queue, and only FDs are responsible for providing user functionality.
   298  type View interface {
   299  	// TODO: Add Send and Receive when mq_timedsend(2) and mq_timedreceive(2)
   300  	// are implemented.
   301  
   302  	// Flush checks whether the calling process has attached a notification
   303  	// request to this queue. If so, the request is removed so that another
   304  	// process can attach a request.
   305  	Flush(ctx context.Context)
   306  
   307  	waiter.Waitable
   308  }
   309  
   310  // ReaderWriter provides a send and receive view into a queue.
        // NewView returns it for ReadWrite access.
   311  //
   312  // +stateify savable
   313  type ReaderWriter struct {
   314  	*Queue
   315  
   316  	block bool // Value of the block argument passed to NewView.
   317  }
   318  
   319  // Reader provides a receive-only view into a queue.
        // NewView returns it for ReadOnly access.
   320  //
   321  // +stateify savable
   322  type Reader struct {
   323  	*Queue
   324  
   325  	block bool // Value of the block argument passed to NewView.
   326  }
   327  
   328  // Writer provides a send-only view into a queue.
        // NewView returns it for WriteOnly access.
   329  //
   330  // +stateify savable
   331  type Writer struct {
   332  	*Queue
   333  
   334  	block bool // Value of the block argument passed to NewView.
   335  }
   336  
   337  // NewView creates a new view into a queue and returns it.
   338  func NewView(q *Queue, access AccessType, block bool) (View, error) {
   339  	switch access {
   340  	case ReadWrite:
   341  		return ReaderWriter{Queue: q, block: block}, nil
   342  	case WriteOnly:
   343  		return Writer{Queue: q, block: block}, nil
   344  	case ReadOnly:
   345  		return Reader{Queue: q, block: block}, nil
   346  	default:
   347  		// This case can't happen, due to O_RDONLY flag being 0 and O_WRONLY
   348  		// being 1, so one of them must be true.
   349  		return nil, linuxerr.EINVAL
   350  	}
   351  }
   352  
   353  // Message holds a message exchanged through a Queue via mq_timedsend(2) and
   354  // mq_timedreceive(2), and additional info relating to the message.
   355  //
   356  // +stateify savable
   357  type Message struct {
   358  	msgEntry // NOTE(review): presumably the list linkage used by msgList; defined outside this view.
   359  
   360  	// Text is the message's sent content.
   361  	Text string
   362  
   363  	// Size is the message's size in bytes.
   364  	Size uint64
   365  
   366  	// Priority is the message's priority.
   367  	Priority uint32
   368  }
   369  
   370  // Subscriber represents a task registered for async notification from a Queue.
   371  //
   372  // +stateify savable
   373  type Subscriber struct {
   374  	// TODO: Add fields when mq_notify(2) is implemented.
   375  
   376  	// pid is the PID of the registered task. Queue.Flush compares it with the
        	// thread group ID of the calling context, and Queue.Generate reports it
        	// as NOTIFY_PID.
   377  	pid int32
   378  }
   379  
   380  // Generate implements vfs.DynamicBytesSource.Generate. Queue is used as a
   381  // DynamicBytesSource for mqfs's queueInode.
   382  func (q *Queue) Generate(ctx context.Context, buf *bytes.Buffer) error {
   383  	q.mu.Lock()
   384  	defer q.mu.Unlock()
   385  
   386  	var (
   387  		pid       int32
   388  		method    int
   389  		sigNumber int
   390  	)
   391  	if q.subscriber != nil {
   392  		pid = q.subscriber.pid
   393  		// TODO: add method and sigNumber when mq_notify(2) is implemented.
   394  	}
   395  
   396  	buf.WriteString(
   397  		fmt.Sprintf("QSIZE:%-10d NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n",
   398  			q.byteCount, method, sigNumber, pid),
   399  	)
   400  	return nil
   401  }
   402  
   403  // Flush implements View.Flush.
   404  func (q *Queue) Flush(ctx context.Context) {
   405  	q.mu.Lock()
   406  	defer q.mu.Unlock()
   407  
   408  	pid, ok := auth.ThreadGroupIDFromContext(ctx)
   409  	if ok {
   410  		if q.subscriber != nil && pid == q.subscriber.pid {
   411  			q.subscriber = nil
   412  		}
   413  	}
   414  }
   415  
   416  // Readiness implements Waitable.Readiness.
   417  func (q *Queue) Readiness(mask waiter.EventMask) waiter.EventMask {
   418  	q.mu.Lock()
   419  	defer q.mu.Unlock()
   420  
   421  	events := waiter.EventMask(0)
   422  	if q.messageCount > 0 {
   423  		events |= waiter.ReadableEvents
   424  	}
   425  	if q.messageCount < q.maxMessageCount {
   426  		events |= waiter.WritableEvents
   427  	}
   428  	return events & mask
   429  }
   430  
   431  // EventRegister implements Waitable.EventRegister. It registers e with the
        // queue's waiter queue while holding q.mu and always returns nil.
   432  func (q *Queue) EventRegister(e *waiter.Entry) error {
   433  	q.mu.Lock()
   434  	defer q.mu.Unlock()
   435  	q.queue.EventRegister(e)
   436  	return nil
   437  }
   438  
   439  // EventUnregister implements Waitable.EventUnregister. It unregisters e from
        // the queue's waiter queue while holding q.mu.
   440  func (q *Queue) EventUnregister(e *waiter.Entry) {
   441  	q.mu.Lock()
   442  	defer q.mu.Unlock()
   443  	q.queue.EventUnregister(e)
   444  }
   445  
   446  // HasPermissions returns true if the given credentials meet the access
   447  // permissions required by the queue.
   448  func (q *Queue) HasPermissions(creds *auth.Credentials, req vfs.AccessTypes) bool {
   449  	perms := uint16(q.mode.Permissions())
   450  	if q.ownerUID == creds.EffectiveKUID {
   451  		perms >>= 6
   452  	} else if creds.InGroup(q.ownerGID) {
   453  		perms >>= 3
   454  	}
   455  
   456  	return uint16(req)&perms == uint16(req)
   457  }