github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/kernel/mq/mq.go (about)

     1  // Copyright 2021 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package mq provides an implementation for POSIX message queues.
    16  package mq
    17  
    18  import (
    19  	"bytes"
    20  	"fmt"
    21  	"strings"
    22  
    23  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
    24  	"github.com/nicocha30/gvisor-ligolo/pkg/context"
    25  	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
    26  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth"
    27  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs"
    28  	"github.com/nicocha30/gvisor-ligolo/pkg/sync"
    29  	"github.com/nicocha30/gvisor-ligolo/pkg/waiter"
    30  )
    31  
// AccessType is the access type passed to mq_open. It selects which kind of
// View NewView builds and which access-mode status flag FindOrCreate sets.
type AccessType int

// Possible access types.
const (
	// ReadOnly requests a Reader view (O_RDONLY).
	ReadOnly AccessType = iota
	// WriteOnly requests a Writer view (O_WRONLY).
	WriteOnly
	// ReadWrite requests a ReaderWriter view (O_RDWR).
	ReadWrite
)
    41  
// MaxName is the maximum length, in bytes, of a queue name. FindOrCreate and
// Remove reject longer names with ENAMETOOLONG.
const MaxName = 255
    44  
// Limits and defaults for POSIX message queues, taken from the identically
// named constants in the linux ABI package.
const (
	maxPriority = linux.MQ_PRIO_MAX - 1 // Highest possible message priority.

	maxQueuesDefault = linux.DFLT_QUEUESMAX // Default max number of queues.

	maxMsgDefault   = linux.DFLT_MSG    // Default max number of messages per queue.
	maxMsgMin       = linux.MIN_MSGMAX  // Min value for max number of messages per queue.
	maxMsgLimit     = linux.DFLT_MSGMAX // Limit for max number of messages per queue.
	maxMsgHardLimit = linux.HARD_MSGMAX // Hard limit for max number of messages per queue.

	msgSizeDefault   = linux.DFLT_MSGSIZE    // Default max message size.
	msgSizeMin       = linux.MIN_MSGSIZEMAX  // Min value for max message size.
	msgSizeLimit     = linux.DFLT_MSGSIZEMAX // Limit for max message size.
	msgSizeHardLimit = linux.HARD_MSGSIZEMAX // Hard limit for max message size.
)
    60  
// Registry is a POSIX message queue registry.
//
// Unlike SysV utilities, Registry is not map-based. It uses a provided
// RegistryImpl backed by a virtual filesystem to implement registry operations.
//
// +stateify savable
type Registry struct {
	// userNS is the user namespace containing this registry. Immutable.
	// newQueueLocked checks CAP_SYS_RESOURCE against it.
	userNS *auth.UserNamespace

	// mu protects all fields below. FindOrCreate, Remove and Destroy hold it
	// for the whole duration of their calls into impl.
	mu sync.Mutex `state:"nosave"`

	// impl is an implementation of several message queue utilities needed by
	// the registry. impl should be provided by mqfs.
	impl RegistryImpl
}
    78  
// RegistryImpl defines utilities needed by a Registry to provide actual
// registry implementation. It works mainly as an abstraction layer used by
// Registry to avoid dealing directly with the filesystem. RegistryImpl should
// be implemented by mqfs and provided to Registry at initialization.
type RegistryImpl interface {
	// Get searches for a queue with the given name. If it exists, the queue is
	// used to create a new FD, which is returned together with true. If the
	// queue doesn't exist, Get returns false and no error. An error is
	// returned if creating the FD fails.
	Get(ctx context.Context, name string, access AccessType, block bool, flags uint32) (*vfs.FileDescription, bool, error)

	// New creates a new inode and file description using the given queue,
	// inserts the inode into the filesystem tree using the given name, and
	// returns the file description. An error is returned if creation fails, or
	// if the name already exists.
	New(ctx context.Context, name string, q *Queue, access AccessType, block bool, perm linux.FileMode, flags uint32) (*vfs.FileDescription, error)

	// Unlink removes the queue with given name from the registry, and returns
	// an error if the name doesn't exist.
	Unlink(ctx context.Context, name string) error

	// Destroy destroys the registry.
	Destroy(context.Context)
}
   102  
   103  // NewRegistry returns a new, initialized message queue registry. NewRegistry
   104  // should be called when a new message queue filesystem is created, once per
   105  // IPCNamespace.
   106  func NewRegistry(userNS *auth.UserNamespace, impl RegistryImpl) *Registry {
   107  	return &Registry{
   108  		userNS: userNS,
   109  		impl:   impl,
   110  	}
   111  }
   112  
// OpenOpts holds the options passed to FindOrCreate.
type OpenOpts struct {
	// Name is the queue name. FindOrCreate rejects names that are empty,
	// longer than MaxName, contain '/', or equal "." or "..".
	Name string

	// Access is the requested access mode for the returned file description.
	Access AccessType

	// Create allows FindOrCreate to create the queue when no queue with this
	// name exists (O_CREAT).
	Create bool

	// Exclusive, together with Create, makes FindOrCreate fail with EEXIST
	// when the queue already exists (O_EXCL).
	Exclusive bool

	// Block is forwarded to RegistryImpl.Get and RegistryImpl.New as their
	// block argument.
	// NOTE(review): presumably true when the caller did not pass O_NONBLOCK;
	// confirm against the mq_open syscall handler.
	Block bool
}
   121  
   122  // FindOrCreate creates a new POSIX message queue or opens an existing queue.
   123  // See mq_open(2).
   124  func (r *Registry) FindOrCreate(ctx context.Context, opts OpenOpts, mode linux.FileMode, attr *linux.MqAttr) (*vfs.FileDescription, error) {
   125  	// mq_overview(7) mentions that: "Each message queue is identified by a name
   126  	// of the form '/somename'", but the mq_open(3) man pages mention:
   127  	//   "The mq_open() library function is implemented on top of a system call
   128  	//    of the same name.  The library function performs the check that the
   129  	//    name starts with a slash (/), giving the EINVAL error if it does not.
   130  	//    The kernel system call expects name to contain no preceding slash, so
   131  	//    the C library function passes name without the preceding slash (i.e.,
   132  	//    name+1) to the system call."
   133  	// So we don't need to check it.
   134  
   135  	if len(opts.Name) == 0 {
   136  		return nil, linuxerr.ENOENT
   137  	}
   138  	if len(opts.Name) > MaxName {
   139  		return nil, linuxerr.ENAMETOOLONG
   140  	}
   141  	if strings.ContainsRune(opts.Name, '/') {
   142  		return nil, linuxerr.EACCES
   143  	}
   144  	if opts.Name == "." || opts.Name == ".." {
   145  		return nil, linuxerr.EINVAL
   146  	}
   147  
   148  	// Construct status flags.
   149  	var flags uint32
   150  	if opts.Block {
   151  		flags = linux.O_NONBLOCK
   152  	}
   153  	switch opts.Access {
   154  	case ReadOnly:
   155  		flags = flags | linux.O_RDONLY
   156  	case WriteOnly:
   157  		flags = flags | linux.O_WRONLY
   158  	case ReadWrite:
   159  		flags = flags | linux.O_RDWR
   160  	}
   161  
   162  	r.mu.Lock()
   163  	defer r.mu.Unlock()
   164  	fd, ok, err := r.impl.Get(ctx, opts.Name, opts.Access, opts.Block, flags)
   165  	if err != nil {
   166  		return nil, err
   167  	}
   168  
   169  	if ok {
   170  		if opts.Create && opts.Exclusive {
   171  			// "Both O_CREAT and O_EXCL were specified in oflag, but a queue
   172  			//  with this name already exists."
   173  			return nil, linuxerr.EEXIST
   174  		}
   175  		return fd, nil
   176  	}
   177  
   178  	if !opts.Create {
   179  		// "The O_CREAT flag was not specified in oflag, and no queue with this name
   180  		//  exists."
   181  		return nil, linuxerr.ENOENT
   182  	}
   183  
   184  	q, err := r.newQueueLocked(auth.CredentialsFromContext(ctx), mode, attr)
   185  	if err != nil {
   186  		return nil, err
   187  	}
   188  	return r.impl.New(ctx, opts.Name, q, opts.Access, opts.Block, mode.Permissions(), flags)
   189  }
   190  
   191  // newQueueLocked creates a new queue using the given attributes. If attr is nil
   192  // return a queue with default values, otherwise use attr to create a new queue,
   193  // and return an error if attributes are invalid.
   194  func (r *Registry) newQueueLocked(creds *auth.Credentials, mode linux.FileMode, attr *linux.MqAttr) (*Queue, error) {
   195  	if attr == nil {
   196  		return &Queue{
   197  			ownerUID:        creds.EffectiveKUID,
   198  			ownerGID:        creds.EffectiveKGID,
   199  			mode:            mode,
   200  			maxMessageCount: int64(maxMsgDefault),
   201  			maxMessageSize:  uint64(msgSizeDefault),
   202  		}, nil
   203  	}
   204  
   205  	// "O_CREAT was specified in oflag, and attr was not NULL, but
   206  	//  attr->mq_maxmsg or attr->mq_msqsize was invalid.  Both of these fields
   207  	//  these fields must be greater than zero.  In a process that is
   208  	//  unprivileged (does not have the CAP_SYS_RESOURCE capability),
   209  	//  attr->mq_maxmsg must be less than or equal to the msg_max limit, and
   210  	//  attr->mq_msgsize must be less than or equal to the msgsize_max limit.
   211  	//  In addition, even in a privileged process, attr->mq_maxmsg cannot
   212  	//  exceed the HARD_MAX limit." - man mq_open(3).
   213  	if attr.MqMaxmsg <= 0 || attr.MqMsgsize <= 0 {
   214  		return nil, linuxerr.EINVAL
   215  	}
   216  
   217  	if attr.MqMaxmsg > maxMsgHardLimit || (!creds.HasCapabilityIn(linux.CAP_SYS_RESOURCE, r.userNS) && (attr.MqMaxmsg > maxMsgLimit || attr.MqMsgsize > msgSizeLimit)) {
   218  		return nil, linuxerr.EINVAL
   219  	}
   220  
   221  	return &Queue{
   222  		ownerUID:        creds.EffectiveKUID,
   223  		ownerGID:        creds.EffectiveKGID,
   224  		mode:            mode,
   225  		maxMessageCount: attr.MqMaxmsg,
   226  		maxMessageSize:  uint64(attr.MqMsgsize),
   227  	}, nil
   228  }
   229  
   230  // Remove removes the queue with the given name from the registry. See
   231  // mq_unlink(2).
   232  func (r *Registry) Remove(ctx context.Context, name string) error {
   233  	if len(name) > MaxName {
   234  		return linuxerr.ENAMETOOLONG
   235  	}
   236  
   237  	r.mu.Lock()
   238  	defer r.mu.Unlock()
   239  	return r.impl.Unlink(ctx, name)
   240  }
   241  
   242  // Destroy destroys the registry and releases all held references.
   243  func (r *Registry) Destroy(ctx context.Context) {
   244  	r.mu.Lock()
   245  	defer r.mu.Unlock()
   246  	r.impl.Destroy(ctx)
   247  }
   248  
   249  // Impl returns RegistryImpl inside r.
   250  func (r *Registry) Impl() RegistryImpl {
   251  	return r.impl
   252  }
   253  
// Queue represents a POSIX message queue.
//
// +stateify savable
type Queue struct {
	// ownerUID is the queue owner's UID, taken from the creator's effective
	// KUID. Immutable.
	ownerUID auth.KUID

	// ownerGID is the queue owner's GID, taken from the creator's effective
	// KGID. Immutable.
	ownerGID auth.KGID

	// mode is the file mode the queue was created with; HasPermissions checks
	// requests against its permission bits. Immutable.
	mode linux.FileMode

	// mu protects all the fields below.
	mu sync.Mutex `state:"nosave"`

	// queue is the queue of waiters registered through EventRegister.
	queue waiter.Queue

	// messages is a list of messages currently in the queue.
	messages msgList

	// subscriber represents a task registered to receive async notification
	// from this queue. It is nil when no task is registered.
	subscriber *Subscriber

	// messageCount is the number of messages currently in the queue.
	messageCount int64

	// maxMessageCount is the maximum number of messages that the queue can
	// hold.
	maxMessageCount int64

	// maxMessageSize is the maximum size of a message held by the queue.
	maxMessageSize uint64

	// byteCount is the number of bytes of data in all messages in the queue.
	// Generate reports it as QSIZE.
	byteCount uint64
}
   293  
// View is a view into a message queue. Views should only be used in file
// descriptions, but not inodes, because we use inodes to retrieve the actual
// queue, and only FDs are responsible for providing user functionality.
type View interface {
	// TODO: Add Send and Receive when mq_timedsend(2) and mq_timedreceive(2)
	// are implemented.

	// Flush checks if the calling process has attached a notification request
	// to this queue, if yes, then the request is removed, and another process
	// can attach a request.
	Flush(ctx context.Context)

	// Waitable lets callers poll the view for readiness and register for
	// event notifications.
	waiter.Waitable
}
   308  
// ReaderWriter provides a send and receive view into a queue. NewView returns
// it for ReadWrite access.
type ReaderWriter struct {
	*Queue

	// block is the block argument this view was created with.
	block bool
}
   315  
// Reader provides a receive-only view into a queue. NewView returns it for
// ReadOnly access.
type Reader struct {
	*Queue

	// block is the block argument this view was created with.
	block bool
}
   322  
// Writer provides a send-only view into a queue. NewView returns it for
// WriteOnly access.
type Writer struct {
	*Queue

	// block is the block argument this view was created with.
	block bool
}
   329  
   330  // NewView creates a new view into a queue and returns it.
   331  func NewView(q *Queue, access AccessType, block bool) (View, error) {
   332  	switch access {
   333  	case ReadWrite:
   334  		return ReaderWriter{Queue: q, block: block}, nil
   335  	case WriteOnly:
   336  		return Writer{Queue: q, block: block}, nil
   337  	case ReadOnly:
   338  		return Reader{Queue: q, block: block}, nil
   339  	default:
   340  		// This case can't happen, due to O_RDONLY flag being 0 and O_WRONLY
   341  		// being 1, so one of them must be true.
   342  		return nil, linuxerr.EINVAL
   343  	}
   344  }
   345  
// Message holds a message exchanged through a Queue via mq_timedsend(2) and
// mq_timedreceive(2), and additional info relating to the message.
//
// +stateify savable
type Message struct {
	// msgEntry is embedded; presumably it provides the linkage that lets a
	// Message be stored in a msgList — confirm against the msgEntry definition.
	msgEntry

	// Text is the message's sent content.
	Text string

	// Size is the message's size in bytes.
	Size uint64

	// Priority is the message's priority.
	Priority uint32
}
   362  
// Subscriber represents a task registered for async notification from a Queue.
//
// +stateify savable
type Subscriber struct {
	// TODO: Add fields when mq_notify(2) is implemented.

	// pid is the PID of the registered task. Queue.Flush compares it with the
	// caller's thread group ID, and Queue.Generate reports it as NOTIFY_PID.
	pid int32
}
   372  
   373  // Generate implements vfs.DynamicBytesSource.Generate. Queue is used as a
   374  // DynamicBytesSource for mqfs's queueInode.
   375  func (q *Queue) Generate(ctx context.Context, buf *bytes.Buffer) error {
   376  	q.mu.Lock()
   377  	defer q.mu.Unlock()
   378  
   379  	var (
   380  		pid       int32
   381  		method    int
   382  		sigNumber int
   383  	)
   384  	if q.subscriber != nil {
   385  		pid = q.subscriber.pid
   386  		// TODO: add method and sigNumber when mq_notify(2) is implemented.
   387  	}
   388  
   389  	buf.WriteString(
   390  		fmt.Sprintf("QSIZE:%-10d NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n",
   391  			q.byteCount, method, sigNumber, pid),
   392  	)
   393  	return nil
   394  }
   395  
   396  // Flush implements View.Flush.
   397  func (q *Queue) Flush(ctx context.Context) {
   398  	q.mu.Lock()
   399  	defer q.mu.Unlock()
   400  
   401  	pid, ok := auth.ThreadGroupIDFromContext(ctx)
   402  	if ok {
   403  		if q.subscriber != nil && pid == q.subscriber.pid {
   404  			q.subscriber = nil
   405  		}
   406  	}
   407  }
   408  
   409  // Readiness implements Waitable.Readiness.
   410  func (q *Queue) Readiness(mask waiter.EventMask) waiter.EventMask {
   411  	q.mu.Lock()
   412  	defer q.mu.Unlock()
   413  
   414  	events := waiter.EventMask(0)
   415  	if q.messageCount > 0 {
   416  		events |= waiter.ReadableEvents
   417  	}
   418  	if q.messageCount < q.maxMessageCount {
   419  		events |= waiter.WritableEvents
   420  	}
   421  	return events & mask
   422  }
   423  
   424  // EventRegister implements Waitable.EventRegister.
   425  func (q *Queue) EventRegister(e *waiter.Entry) error {
   426  	q.mu.Lock()
   427  	defer q.mu.Unlock()
   428  	q.queue.EventRegister(e)
   429  	return nil
   430  }
   431  
   432  // EventUnregister implements Waitable.EventUnregister.
   433  func (q *Queue) EventUnregister(e *waiter.Entry) {
   434  	q.mu.Lock()
   435  	defer q.mu.Unlock()
   436  	q.queue.EventUnregister(e)
   437  }
   438  
   439  // HasPermissions returns true if the given credentials meet the access
   440  // permissions required by the queue.
   441  func (q *Queue) HasPermissions(creds *auth.Credentials, req vfs.AccessTypes) bool {
   442  	perms := uint16(q.mode.Permissions())
   443  	if q.ownerUID == creds.EffectiveKUID {
   444  		perms >>= 6
   445  	} else if creds.InGroup(q.ownerGID) {
   446  		perms >>= 3
   447  	}
   448  
   449  	return uint16(req)&perms == uint16(req)
   450  }