github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/kernel/mq/mq.go (about) 1 // Copyright 2021 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package mq provides an implementation for POSIX message queues. 16 package mq 17 18 import ( 19 "bytes" 20 "fmt" 21 "strings" 22 23 "github.com/metacubex/gvisor/pkg/abi/linux" 24 "github.com/metacubex/gvisor/pkg/context" 25 "github.com/metacubex/gvisor/pkg/errors/linuxerr" 26 "github.com/metacubex/gvisor/pkg/sentry/kernel/auth" 27 "github.com/metacubex/gvisor/pkg/sentry/vfs" 28 "github.com/metacubex/gvisor/pkg/sync" 29 "github.com/metacubex/gvisor/pkg/waiter" 30 ) 31 32 // AccessType is the access type passed to mq_open. 33 type AccessType int 34 35 // Possible access types. 36 const ( 37 ReadOnly AccessType = iota 38 WriteOnly 39 ReadWrite 40 ) 41 42 // MaxName is the maximum size for a queue name. 43 const MaxName = 255 44 45 const ( 46 maxPriority = linux.MQ_PRIO_MAX - 1 // Highest possible message priority. 47 48 maxQueuesDefault = linux.DFLT_QUEUESMAX // Default max number of queues. 49 50 maxMsgDefault = linux.DFLT_MSG // Default max number of messages per queue. 51 maxMsgMin = linux.MIN_MSGMAX // Min value for max number of messages per queue. 52 maxMsgLimit = linux.DFLT_MSGMAX // Limit for max number of messages per queue. 53 maxMsgHardLimit = linux.HARD_MSGMAX // Hard limit for max number of messages per queue. 54 55 msgSizeDefault = linux.DFLT_MSGSIZE // Default max message size. 56 msgSizeMin = linux.MIN_MSGSIZEMAX // Min value for max message size. 57 msgSizeLimit = linux.DFLT_MSGSIZEMAX // Limit for max message size. 58 msgSizeHardLimit = linux.HARD_MSGSIZEMAX // Hard limit for max message size. 59 ) 60 61 // Registry is a POSIX message queue registry. 62 // 63 // Unlike SysV utilities, Registry is not map-based. It uses a provided 64 // RegistryImpl backed by a virtual filesystem to implement registry operations. 65 // 66 // +stateify savable 67 type Registry struct { 68 // userNS is the user namespace containing this registry. Immutable. 69 userNS *auth.UserNamespace 70 71 // mu protects all fields below. 72 mu sync.Mutex `state:"nosave"` 73 74 // impl is an implementation of several message queue utilities needed by 75 // the registry. impl should be provided by mqfs. 76 impl RegistryImpl 77 } 78 79 // RegistryImpl defines utilities needed by a Registry to provide actual 80 // registry implementation. It works mainly as an abstraction layer used by 81 // Registry to avoid dealing directly with the filesystem. RegistryImpl should 82 // be implemented by mqfs and provided to Registry at initialization. 83 type RegistryImpl interface { 84 // Get searches for a queue with the given name, if it exists, the queue is 85 // used to create a new FD, return it and return true. If the queue doesn't 86 // exist, return false and no error. An error is returned if creation fails. 87 Get(ctx context.Context, name string, access AccessType, block bool, flags uint32) (*vfs.FileDescription, bool, error) 88 89 // New creates a new inode and file description using the given queue, 90 // inserts the inode into the filesystem tree using the given name, and 91 // returns the file description. An error is returned if creation fails, or 92 // if the name already exists. 93 New(ctx context.Context, name string, q *Queue, access AccessType, block bool, perm linux.FileMode, flags uint32) (*vfs.FileDescription, error) 94 95 // Unlink removes the queue with given name from the registry, and returns 96 // an error if the name doesn't exist. 97 Unlink(ctx context.Context, name string) error 98 99 // Destroy destroys the registry. 100 Destroy(context.Context) 101 } 102 103 // NewRegistry returns a new, initialized message queue registry. NewRegistry 104 // should be called when a new message queue filesystem is created, once per 105 // IPCNamespace. 106 func NewRegistry(userNS *auth.UserNamespace, impl RegistryImpl) *Registry { 107 return &Registry{ 108 userNS: userNS, 109 impl: impl, 110 } 111 } 112 113 // OpenOpts holds the options passed to FindOrCreate. 114 type OpenOpts struct { 115 Name string 116 Access AccessType 117 Create bool 118 Exclusive bool 119 Block bool 120 } 121 122 // FindOrCreate creates a new POSIX message queue or opens an existing queue. 123 // See mq_open(2). 124 func (r *Registry) FindOrCreate(ctx context.Context, opts OpenOpts, mode linux.FileMode, attr *linux.MqAttr) (*vfs.FileDescription, error) { 125 // mq_overview(7) mentions that: "Each message queue is identified by a name 126 // of the form '/somename'", but the mq_open(3) man pages mention: 127 // "The mq_open() library function is implemented on top of a system call 128 // of the same name. The library function performs the check that the 129 // name starts with a slash (/), giving the EINVAL error if it does not. 130 // The kernel system call expects name to contain no preceding slash, so 131 // the C library function passes name without the preceding slash (i.e., 132 // name+1) to the system call." 133 // So we don't need to check it. 134 135 if len(opts.Name) == 0 { 136 return nil, linuxerr.ENOENT 137 } 138 if len(opts.Name) > MaxName { 139 return nil, linuxerr.ENAMETOOLONG 140 } 141 if strings.ContainsRune(opts.Name, '/') { 142 return nil, linuxerr.EACCES 143 } 144 if opts.Name == "." || opts.Name == ".." { 145 return nil, linuxerr.EINVAL 146 } 147 148 // Construct status flags. 149 var flags uint32 150 if opts.Block { 151 flags = linux.O_NONBLOCK 152 } 153 switch opts.Access { 154 case ReadOnly: 155 flags = flags | linux.O_RDONLY 156 case WriteOnly: 157 flags = flags | linux.O_WRONLY 158 case ReadWrite: 159 flags = flags | linux.O_RDWR 160 } 161 162 r.mu.Lock() 163 defer r.mu.Unlock() 164 fd, ok, err := r.impl.Get(ctx, opts.Name, opts.Access, opts.Block, flags) 165 if err != nil { 166 return nil, err 167 } 168 169 if ok { 170 if opts.Create && opts.Exclusive { 171 // "Both O_CREAT and O_EXCL were specified in oflag, but a queue 172 // with this name already exists." 173 fd.DecRef(ctx) 174 return nil, linuxerr.EEXIST 175 } 176 return fd, nil 177 } 178 179 if !opts.Create { 180 // "The O_CREAT flag was not specified in oflag, and no queue with this name 181 // exists." 182 return nil, linuxerr.ENOENT 183 } 184 185 q, err := r.newQueueLocked(auth.CredentialsFromContext(ctx), mode, attr) 186 if err != nil { 187 return nil, err 188 } 189 return r.impl.New(ctx, opts.Name, q, opts.Access, opts.Block, mode.Permissions(), flags) 190 } 191 192 // newQueueLocked creates a new queue using the given attributes. If attr is nil 193 // return a queue with default values, otherwise use attr to create a new queue, 194 // and return an error if attributes are invalid. 195 func (r *Registry) newQueueLocked(creds *auth.Credentials, mode linux.FileMode, attr *linux.MqAttr) (*Queue, error) { 196 if attr == nil { 197 return &Queue{ 198 ownerUID: creds.EffectiveKUID, 199 ownerGID: creds.EffectiveKGID, 200 mode: mode, 201 maxMessageCount: int64(maxMsgDefault), 202 maxMessageSize: uint64(msgSizeDefault), 203 }, nil 204 } 205 206 // "O_CREAT was specified in oflag, and attr was not NULL, but 207 // attr->mq_maxmsg or attr->mq_msqsize was invalid. Both of these fields 208 // these fields must be greater than zero. In a process that is 209 // unprivileged (does not have the CAP_SYS_RESOURCE capability), 210 // attr->mq_maxmsg must be less than or equal to the msg_max limit, and 211 // attr->mq_msgsize must be less than or equal to the msgsize_max limit. 212 // In addition, even in a privileged process, attr->mq_maxmsg cannot 213 // exceed the HARD_MAX limit." - man mq_open(3). 214 if attr.MqMaxmsg <= 0 || attr.MqMsgsize <= 0 { 215 return nil, linuxerr.EINVAL 216 } 217 218 if attr.MqMaxmsg > maxMsgHardLimit || (!creds.HasCapabilityIn(linux.CAP_SYS_RESOURCE, r.userNS) && (attr.MqMaxmsg > maxMsgLimit || attr.MqMsgsize > msgSizeLimit)) { 219 return nil, linuxerr.EINVAL 220 } 221 222 return &Queue{ 223 ownerUID: creds.EffectiveKUID, 224 ownerGID: creds.EffectiveKGID, 225 mode: mode, 226 maxMessageCount: attr.MqMaxmsg, 227 maxMessageSize: uint64(attr.MqMsgsize), 228 }, nil 229 } 230 231 // Remove removes the queue with the given name from the registry. See 232 // mq_unlink(2). 233 func (r *Registry) Remove(ctx context.Context, name string) error { 234 if len(name) > MaxName { 235 return linuxerr.ENAMETOOLONG 236 } 237 238 r.mu.Lock() 239 defer r.mu.Unlock() 240 return r.impl.Unlink(ctx, name) 241 } 242 243 // Destroy destroys the registry and releases all held references. 244 func (r *Registry) Destroy(ctx context.Context) { 245 r.mu.Lock() 246 defer r.mu.Unlock() 247 r.impl.Destroy(ctx) 248 } 249 250 // Impl returns RegistryImpl inside r. 251 func (r *Registry) Impl() RegistryImpl { 252 return r.impl 253 } 254 255 // Queue represents a POSIX message queue. 256 // 257 // +stateify savable 258 type Queue struct { 259 // ownerUID is the registry's owner's UID. Immutable. 260 ownerUID auth.KUID 261 262 // ownerGID is the registry's owner's GID. Immutable. 263 ownerGID auth.KGID 264 265 // mode is the registry's access permissions. Immutable. 266 mode linux.FileMode 267 268 // mu protects all the fields below. 269 mu sync.Mutex `state:"nosave"` 270 271 // queue is the queue of waiters. 272 queue waiter.Queue 273 274 // messages is a list of messages currently in the queue. 275 messages msgList 276 277 // subscriber represents a task registered to receive async notification 278 // from this queue. 279 subscriber *Subscriber 280 281 // messageCount is the number of messages currently in the queue. 282 messageCount int64 283 284 // maxMessageCount is the maximum number of messages that the queue can 285 // hold. 286 maxMessageCount int64 287 288 // maxMessageSize is the maximum size of a message held by the queue. 289 maxMessageSize uint64 290 291 // byteCount is the number of bytes of data in all messages in the queue. 292 byteCount uint64 293 } 294 295 // View is a view into a message queue. Views should only be used in file 296 // descriptions, but not inodes, because we use inodes to retrieve the actual 297 // queue, and only FDs are responsible for providing user functionality. 298 type View interface { 299 // TODO: Add Send and Receive when mq_timedsend(2) and mq_timedreceive(2) 300 // are implemented. 301 302 // Flush checks if the calling process has attached a notification request 303 // to this queue, if yes, then the request is removed, and another process 304 // can attach a request. 305 Flush(ctx context.Context) 306 307 waiter.Waitable 308 } 309 310 // ReaderWriter provides a send and receive view into a queue. 311 // 312 // +stateify savable 313 type ReaderWriter struct { 314 *Queue 315 316 block bool 317 } 318 319 // Reader provides a send-only view into a queue. 320 // 321 // +stateify savable 322 type Reader struct { 323 *Queue 324 325 block bool 326 } 327 328 // Writer provides a receive-only view into a queue. 329 // 330 // +stateify savable 331 type Writer struct { 332 *Queue 333 334 block bool 335 } 336 337 // NewView creates a new view into a queue and returns it. 338 func NewView(q *Queue, access AccessType, block bool) (View, error) { 339 switch access { 340 case ReadWrite: 341 return ReaderWriter{Queue: q, block: block}, nil 342 case WriteOnly: 343 return Writer{Queue: q, block: block}, nil 344 case ReadOnly: 345 return Reader{Queue: q, block: block}, nil 346 default: 347 // This case can't happen, due to O_RDONLY flag being 0 and O_WRONLY 348 // being 1, so one of them must be true. 349 return nil, linuxerr.EINVAL 350 } 351 } 352 353 // Message holds a message exchanged through a Queue via mq_timedsend(2) and 354 // mq_timedreceive(2), and additional info relating to the message. 355 // 356 // +stateify savable 357 type Message struct { 358 msgEntry 359 360 // Text is the message's sent content. 361 Text string 362 363 // Size is the message's size in bytes. 364 Size uint64 365 366 // Priority is the message's priority. 367 Priority uint32 368 } 369 370 // Subscriber represents a task registered for async notification from a Queue. 371 // 372 // +stateify savable 373 type Subscriber struct { 374 // TODO: Add fields when mq_notify(2) is implemented. 375 376 // pid is the PID of the registered task. 377 pid int32 378 } 379 380 // Generate implements vfs.DynamicBytesSource.Generate. Queue is used as a 381 // DynamicBytesSource for mqfs's queueInode. 382 func (q *Queue) Generate(ctx context.Context, buf *bytes.Buffer) error { 383 q.mu.Lock() 384 defer q.mu.Unlock() 385 386 var ( 387 pid int32 388 method int 389 sigNumber int 390 ) 391 if q.subscriber != nil { 392 pid = q.subscriber.pid 393 // TODO: add method and sigNumber when mq_notify(2) is implemented. 394 } 395 396 buf.WriteString( 397 fmt.Sprintf("QSIZE:%-10d NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n", 398 q.byteCount, method, sigNumber, pid), 399 ) 400 return nil 401 } 402 403 // Flush implements View.Flush. 404 func (q *Queue) Flush(ctx context.Context) { 405 q.mu.Lock() 406 defer q.mu.Unlock() 407 408 pid, ok := auth.ThreadGroupIDFromContext(ctx) 409 if ok { 410 if q.subscriber != nil && pid == q.subscriber.pid { 411 q.subscriber = nil 412 } 413 } 414 } 415 416 // Readiness implements Waitable.Readiness. 417 func (q *Queue) Readiness(mask waiter.EventMask) waiter.EventMask { 418 q.mu.Lock() 419 defer q.mu.Unlock() 420 421 events := waiter.EventMask(0) 422 if q.messageCount > 0 { 423 events |= waiter.ReadableEvents 424 } 425 if q.messageCount < q.maxMessageCount { 426 events |= waiter.WritableEvents 427 } 428 return events & mask 429 } 430 431 // EventRegister implements Waitable.EventRegister. 432 func (q *Queue) EventRegister(e *waiter.Entry) error { 433 q.mu.Lock() 434 defer q.mu.Unlock() 435 q.queue.EventRegister(e) 436 return nil 437 } 438 439 // EventUnregister implements Waitable.EventUnregister. 440 func (q *Queue) EventUnregister(e *waiter.Entry) { 441 q.mu.Lock() 442 defer q.mu.Unlock() 443 q.queue.EventUnregister(e) 444 } 445 446 // HasPermissions returns true if the given credentials meet the access 447 // permissions required by the queue. 448 func (q *Queue) HasPermissions(creds *auth.Credentials, req vfs.AccessTypes) bool { 449 perms := uint16(q.mode.Permissions()) 450 if q.ownerUID == creds.EffectiveKUID { 451 perms >>= 6 452 } else if creds.InGroup(q.ownerGID) { 453 perms >>= 3 454 } 455 456 return uint16(req)&perms == uint16(req) 457 }