gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/socket/netlink/socket.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package netlink provides core functionality for netlink sockets. 16 package netlink 17 18 import ( 19 "io" 20 "math" 21 "time" 22 23 "gvisor.dev/gvisor/pkg/abi/linux" 24 "gvisor.dev/gvisor/pkg/abi/linux/errno" 25 "gvisor.dev/gvisor/pkg/context" 26 "gvisor.dev/gvisor/pkg/errors/linuxerr" 27 "gvisor.dev/gvisor/pkg/hostarch" 28 "gvisor.dev/gvisor/pkg/marshal" 29 "gvisor.dev/gvisor/pkg/marshal/primitive" 30 "gvisor.dev/gvisor/pkg/sentry/arch" 31 "gvisor.dev/gvisor/pkg/sentry/kernel" 32 "gvisor.dev/gvisor/pkg/sentry/kernel/auth" 33 ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" 34 "gvisor.dev/gvisor/pkg/sentry/socket" 35 "gvisor.dev/gvisor/pkg/sentry/socket/netlink/nlmsg" 36 "gvisor.dev/gvisor/pkg/sentry/socket/netlink/port" 37 "gvisor.dev/gvisor/pkg/sentry/socket/unix" 38 "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" 39 "gvisor.dev/gvisor/pkg/sentry/vfs" 40 "gvisor.dev/gvisor/pkg/sync" 41 "gvisor.dev/gvisor/pkg/syserr" 42 "gvisor.dev/gvisor/pkg/usermem" 43 "gvisor.dev/gvisor/pkg/waiter" 44 ) 45 46 const sizeOfInt32 int = 4 47 48 const ( 49 // minBufferSize is the smallest size of a send buffer. 50 minSendBufferSize = 4 << 10 // 4096 bytes. 51 52 // defaultSendBufferSize is the default size for the send buffer. 53 defaultSendBufferSize = 16 * 1024 54 55 // maxBufferSize is the largest size a send buffer can grow to. 56 maxSendBufferSize = 4 << 20 // 4MB 57 ) 58 59 var errNoFilter = syserr.New("no filter attached", errno.ENOENT) 60 61 // Socket is the base socket type for netlink sockets. 62 // 63 // This implementation only supports userspace sending and receiving messages 64 // to/from the kernel. 65 // 66 // Socket implements socket.Socket and transport.Credentialer. 67 // 68 // +stateify savable 69 type Socket struct { 70 vfsfd vfs.FileDescription 71 vfs.FileDescriptionDefaultImpl 72 vfs.DentryMetadataFileDescriptionImpl 73 vfs.LockFD 74 socket.SendReceiveTimeout 75 76 // ports provides netlink port allocation. 77 ports *port.Manager 78 79 // protocol is the netlink protocol implementation. 80 protocol Protocol 81 82 // skType is the socket type. This is either SOCK_DGRAM or SOCK_RAW for 83 // netlink sockets. 84 skType linux.SockType 85 86 // ep is a datagram unix endpoint used to buffer messages sent from the 87 // kernel to userspace. RecvMsg reads messages from this endpoint. 88 ep transport.Endpoint 89 90 // connection is the kernel's connection to ep, used to write messages 91 // sent to userspace. 92 connection transport.ConnectedEndpoint 93 94 // mu protects the fields below. 95 mu sync.Mutex `state:"nosave"` 96 97 // bound indicates that portid is valid. 98 bound bool 99 100 // portID is the port ID allocated for this socket. 101 portID int32 102 103 // sendBufferSize is the send buffer "size". We don't actually have a 104 // fixed buffer but only consume this many bytes. 105 sendBufferSize uint32 106 107 // filter indicates that this socket has a BPF filter "installed". 108 // 109 // TODO(gvisor.dev/issue/1119): We don't actually support filtering, 110 // this is just bookkeeping for tracking add/remove. 111 filter bool 112 } 113 114 var _ socket.Socket = (*Socket)(nil) 115 var _ transport.Credentialer = (*Socket)(nil) 116 117 // New creates a new Socket. 118 func New(t *kernel.Task, skType linux.SockType, protocol Protocol) (*Socket, *syserr.Error) { 119 // Datagram endpoint used to buffer kernel -> user messages. 120 ep := transport.NewConnectionless(t) 121 122 // Bind the endpoint for good measure so we can connect to it. The 123 // bound address will never be exposed. 124 if err := ep.Bind(transport.Address{Addr: "dummy"}); err != nil { 125 ep.Close(t) 126 return nil, err 127 } 128 129 // Create a connection from which the kernel can write messages. 130 connection, err := ep.(transport.BoundEndpoint).UnidirectionalConnect(t) 131 if err != nil { 132 ep.Close(t) 133 return nil, err 134 } 135 136 fd := &Socket{ 137 ports: t.Kernel().NetlinkPorts(), 138 protocol: protocol, 139 skType: skType, 140 ep: ep, 141 connection: connection, 142 sendBufferSize: defaultSendBufferSize, 143 } 144 fd.LockFD.Init(&vfs.FileLocks{}) 145 return fd, nil 146 } 147 148 // Release implements vfs.FileDescriptionImpl.Release. 149 func (s *Socket) Release(ctx context.Context) { 150 t := kernel.TaskFromContext(ctx) 151 t.Kernel().DeleteSocket(&s.vfsfd) 152 s.connection.Release(ctx) 153 s.ep.Close(ctx) 154 155 if s.bound { 156 s.ports.Release(s.protocol.Protocol(), s.portID) 157 } 158 } 159 160 // Epollable implements FileDescriptionImpl.Epollable. 161 func (s *Socket) Epollable() bool { 162 return true 163 } 164 165 // Ioctl implements vfs.FileDescriptionImpl. 166 func (*Socket) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { 167 // TODO(b/68878065): no ioctls supported. 168 return 0, linuxerr.ENOTTY 169 } 170 171 // PRead implements vfs.FileDescriptionImpl. 172 func (s *Socket) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { 173 return 0, linuxerr.ESPIPE 174 } 175 176 // Read implements vfs.FileDescriptionImpl. 177 func (s *Socket) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { 178 // All flags other than RWF_NOWAIT should be ignored. 179 // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. 180 if opts.Flags != 0 { 181 return 0, linuxerr.EOPNOTSUPP 182 } 183 184 if dst.NumBytes() == 0 { 185 return 0, nil 186 } 187 r := unix.EndpointReader{ 188 Endpoint: s.ep, 189 } 190 n, err := dst.CopyOutFrom(ctx, &r) 191 if r.Notify != nil { 192 r.Notify() 193 } 194 return n, err 195 } 196 197 // PWrite implements vfs.FileDescriptionImpl. 198 func (s *Socket) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { 199 return 0, linuxerr.ESPIPE 200 } 201 202 // Write implements vfs.FileDescriptionImpl. 203 func (s *Socket) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { 204 // All flags other than RWF_NOWAIT should be ignored. 205 // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. 206 if opts.Flags != 0 { 207 return 0, linuxerr.EOPNOTSUPP 208 } 209 210 n, err := s.sendMsg(ctx, src, nil, 0, socket.ControlMessages{}) 211 return int64(n), err.ToError() 212 } 213 214 // Readiness implements waiter.Waitable.Readiness. 215 func (s *Socket) Readiness(mask waiter.EventMask) waiter.EventMask { 216 // ep holds messages to be read and thus handles EventIn readiness. 217 ready := s.ep.Readiness(mask) 218 219 if mask&waiter.WritableEvents != 0 { 220 // sendMsg handles messages synchronously and is thus always 221 // ready for writing. 222 ready |= waiter.WritableEvents 223 } 224 225 return ready 226 } 227 228 // EventRegister implements waiter.Waitable.EventRegister. 229 func (s *Socket) EventRegister(e *waiter.Entry) error { 230 return s.ep.EventRegister(e) 231 // Writable readiness never changes, so no registration is needed. 232 } 233 234 // EventUnregister implements waiter.Waitable.EventUnregister. 235 func (s *Socket) EventUnregister(e *waiter.Entry) { 236 s.ep.EventUnregister(e) 237 } 238 239 // Passcred implements transport.Credentialer.Passcred. 240 func (s *Socket) Passcred() bool { 241 return s.ep.SocketOptions().GetPassCred() 242 } 243 244 // ConnectedPasscred implements transport.Credentialer.ConnectedPasscred. 245 func (s *Socket) ConnectedPasscred() bool { 246 // This socket is connected to the kernel, which doesn't need creds. 247 // 248 // This is arbitrary, as ConnectedPasscred on this type has no callers. 249 return false 250 } 251 252 // ExtractSockAddr extracts the SockAddrNetlink from b. 253 func ExtractSockAddr(b []byte) (*linux.SockAddrNetlink, *syserr.Error) { 254 if len(b) < linux.SockAddrNetlinkSize { 255 return nil, syserr.ErrBadAddress 256 } 257 258 var sa linux.SockAddrNetlink 259 sa.UnmarshalUnsafe(b) 260 261 if sa.Family != linux.AF_NETLINK { 262 return nil, syserr.ErrInvalidArgument 263 } 264 265 return &sa, nil 266 } 267 268 // bindPort binds this socket to a port, preferring 'port' if it is available. 269 // 270 // port of 0 defaults to the ThreadGroup ID. 271 // 272 // Preconditions: mu is held. 273 func (s *Socket) bindPort(t *kernel.Task, port int32) *syserr.Error { 274 if s.bound { 275 // Re-binding is only allowed if the port doesn't change. 276 if port != s.portID { 277 return syserr.ErrInvalidArgument 278 } 279 280 return nil 281 } 282 283 if port == 0 { 284 port = int32(t.ThreadGroup().ID()) 285 } 286 port, ok := s.ports.Allocate(s.protocol.Protocol(), port) 287 if !ok { 288 return syserr.ErrBusy 289 } 290 291 s.portID = port 292 s.bound = true 293 return nil 294 } 295 296 // Bind implements socket.Socket.Bind. 297 func (s *Socket) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { 298 a, err := ExtractSockAddr(sockaddr) 299 if err != nil { 300 return err 301 } 302 303 // No support for multicast groups yet. 304 if a.Groups != 0 { 305 return syserr.ErrPermissionDenied 306 } 307 308 s.mu.Lock() 309 defer s.mu.Unlock() 310 311 return s.bindPort(t, int32(a.PortID)) 312 } 313 314 // Connect implements socket.Socket.Connect. 315 func (s *Socket) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { 316 a, err := ExtractSockAddr(sockaddr) 317 if err != nil { 318 return err 319 } 320 321 // No support for multicast groups yet. 322 if a.Groups != 0 { 323 return syserr.ErrPermissionDenied 324 } 325 326 s.mu.Lock() 327 defer s.mu.Unlock() 328 329 if a.PortID == 0 { 330 // Netlink sockets default to connected to the kernel, but 331 // connecting anyways automatically binds if not already bound. 332 if !s.bound { 333 // Pass port 0 to get an auto-selected port ID. 334 return s.bindPort(t, 0) 335 } 336 return nil 337 } 338 339 // We don't support non-kernel destination ports. Linux returns EPERM 340 // if applications attempt to do this without NL_CFG_F_NONROOT_SEND, so 341 // we emulate that. 342 return syserr.ErrPermissionDenied 343 } 344 345 // Accept implements socket.Socket.Accept. 346 func (s *Socket) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { 347 // Netlink sockets never support accept. 348 return 0, nil, 0, syserr.ErrNotSupported 349 } 350 351 // Listen implements socket.Socket.Listen. 352 func (s *Socket) Listen(t *kernel.Task, backlog int) *syserr.Error { 353 // Netlink sockets never support listen. 354 return syserr.ErrNotSupported 355 } 356 357 // Shutdown implements socket.Socket.Shutdown. 358 func (s *Socket) Shutdown(t *kernel.Task, how int) *syserr.Error { 359 // Netlink sockets never support shutdown. 360 return syserr.ErrNotSupported 361 } 362 363 // GetSockOpt implements socket.Socket.GetSockOpt. 364 func (s *Socket) GetSockOpt(t *kernel.Task, level int, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { 365 switch level { 366 case linux.SOL_SOCKET: 367 switch name { 368 case linux.SO_SNDBUF: 369 if outLen < sizeOfInt32 { 370 return nil, syserr.ErrInvalidArgument 371 } 372 s.mu.Lock() 373 defer s.mu.Unlock() 374 return primitive.AllocateInt32(int32(s.sendBufferSize)), nil 375 376 case linux.SO_RCVBUF: 377 if outLen < sizeOfInt32 { 378 return nil, syserr.ErrInvalidArgument 379 } 380 // We don't have limit on receiving size. 381 return primitive.AllocateInt32(math.MaxInt32), nil 382 383 case linux.SO_PASSCRED: 384 if outLen < sizeOfInt32 { 385 return nil, syserr.ErrInvalidArgument 386 } 387 var passcred primitive.Int32 388 if s.Passcred() { 389 passcred = 1 390 } 391 return &passcred, nil 392 393 case linux.SO_SNDTIMEO: 394 if outLen < linux.SizeOfTimeval { 395 return nil, syserr.ErrInvalidArgument 396 } 397 sendTimeout := linux.NsecToTimeval(s.SendTimeout()) 398 return &sendTimeout, nil 399 400 case linux.SO_RCVTIMEO: 401 if outLen < linux.SizeOfTimeval { 402 return nil, syserr.ErrInvalidArgument 403 } 404 recvTimeout := linux.NsecToTimeval(s.RecvTimeout()) 405 return &recvTimeout, nil 406 } 407 case linux.SOL_NETLINK: 408 switch name { 409 case linux.NETLINK_BROADCAST_ERROR, 410 linux.NETLINK_CAP_ACK, 411 linux.NETLINK_DUMP_STRICT_CHK, 412 linux.NETLINK_EXT_ACK, 413 linux.NETLINK_LIST_MEMBERSHIPS, 414 linux.NETLINK_NO_ENOBUFS, 415 linux.NETLINK_PKTINFO: 416 // Not supported. 417 } 418 } 419 // TODO(b/68878065): other sockopts are not supported. 420 return nil, syserr.ErrProtocolNotAvailable 421 } 422 423 // SetSockOpt implements socket.Socket.SetSockOpt. 424 func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error { 425 switch level { 426 case linux.SOL_SOCKET: 427 switch name { 428 case linux.SO_SNDBUF: 429 if len(opt) < sizeOfInt32 { 430 return syserr.ErrInvalidArgument 431 } 432 size := hostarch.ByteOrder.Uint32(opt) 433 if size < minSendBufferSize { 434 size = minSendBufferSize 435 } else if size > maxSendBufferSize { 436 size = maxSendBufferSize 437 } 438 s.mu.Lock() 439 s.sendBufferSize = size 440 s.mu.Unlock() 441 return nil 442 443 case linux.SO_RCVBUF: 444 if len(opt) < sizeOfInt32 { 445 return syserr.ErrInvalidArgument 446 } 447 // We don't have limit on receiving size. So just accept anything as 448 // valid for compatibility. 449 return nil 450 451 case linux.SO_PASSCRED: 452 if len(opt) < sizeOfInt32 { 453 return syserr.ErrInvalidArgument 454 } 455 passcred := hostarch.ByteOrder.Uint32(opt) 456 457 s.ep.SocketOptions().SetPassCred(passcred != 0) 458 return nil 459 460 case linux.SO_ATTACH_FILTER: 461 // TODO(gvisor.dev/issue/1119): We don't actually 462 // support filtering. If this socket can't ever send 463 // messages, then there is nothing to filter and we can 464 // advertise support. Otherwise, be conservative and 465 // return an error. 466 if s.protocol.CanSend() { 467 return syserr.ErrProtocolNotAvailable 468 } 469 470 s.mu.Lock() 471 s.filter = true 472 s.mu.Unlock() 473 return nil 474 475 case linux.SO_DETACH_FILTER: 476 // TODO(gvisor.dev/issue/1119): See above. 477 if s.protocol.CanSend() { 478 return syserr.ErrProtocolNotAvailable 479 } 480 481 s.mu.Lock() 482 filter := s.filter 483 s.filter = false 484 s.mu.Unlock() 485 486 if !filter { 487 return errNoFilter 488 } 489 490 return nil 491 492 case linux.SO_SNDTIMEO: 493 if len(opt) < linux.SizeOfTimeval { 494 return syserr.ErrInvalidArgument 495 } 496 497 var v linux.Timeval 498 v.UnmarshalBytes(opt) 499 if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) { 500 return syserr.ErrDomain 501 } 502 s.SetSendTimeout(v.ToNsecCapped()) 503 return nil 504 505 case linux.SO_RCVTIMEO: 506 if len(opt) < linux.SizeOfTimeval { 507 return syserr.ErrInvalidArgument 508 } 509 510 var v linux.Timeval 511 v.UnmarshalBytes(opt) 512 if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) { 513 return syserr.ErrDomain 514 } 515 s.SetRecvTimeout(v.ToNsecCapped()) 516 return nil 517 } 518 case linux.SOL_NETLINK: 519 switch name { 520 case linux.NETLINK_ADD_MEMBERSHIP, 521 linux.NETLINK_BROADCAST_ERROR, 522 linux.NETLINK_CAP_ACK, 523 linux.NETLINK_DROP_MEMBERSHIP, 524 linux.NETLINK_DUMP_STRICT_CHK, 525 linux.NETLINK_EXT_ACK, 526 linux.NETLINK_LISTEN_ALL_NSID, 527 linux.NETLINK_NO_ENOBUFS, 528 linux.NETLINK_PKTINFO: 529 // Not supported. 530 } 531 } 532 533 // TODO(b/68878065): other sockopts are not supported. 534 return syserr.ErrProtocolNotAvailable 535 } 536 537 // GetSockName implements socket.Socket.GetSockName. 538 func (s *Socket) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { 539 s.mu.Lock() 540 defer s.mu.Unlock() 541 542 sa := &linux.SockAddrNetlink{ 543 Family: linux.AF_NETLINK, 544 PortID: uint32(s.portID), 545 } 546 return sa, uint32(sa.SizeBytes()), nil 547 } 548 549 // GetPeerName implements socket.Socket.GetPeerName. 550 func (s *Socket) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { 551 sa := &linux.SockAddrNetlink{ 552 Family: linux.AF_NETLINK, 553 // TODO(b/68878065): Support non-kernel peers. For now the peer 554 // must be the kernel. 555 PortID: 0, 556 } 557 return sa, uint32(sa.SizeBytes()), nil 558 } 559 560 // RecvMsg implements socket.Socket.RecvMsg. 561 func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { 562 from := &linux.SockAddrNetlink{ 563 Family: linux.AF_NETLINK, 564 PortID: 0, 565 } 566 fromLen := uint32(from.SizeBytes()) 567 568 trunc := flags&linux.MSG_TRUNC != 0 569 570 r := unix.EndpointReader{ 571 Ctx: t, 572 Endpoint: s.ep, 573 Peek: flags&linux.MSG_PEEK != 0, 574 } 575 576 doRead := func() (int64, error) { 577 return dst.CopyOutFrom(t, &r) 578 } 579 580 // If MSG_TRUNC is set with a zero byte destination then we still need 581 // to read the message and discard it, or in the case where MSG_PEEK is 582 // set, leave it be. In both cases the full message length must be 583 // returned. 584 if trunc && dst.Addrs.NumBytes() == 0 { 585 doRead = func() (int64, error) { 586 err := r.Truncate() 587 // Always return zero for bytes read since the destination size is 588 // zero. 589 return 0, err 590 } 591 } 592 593 if n, err := doRead(); err != linuxerr.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { 594 var mflags int 595 if n < int64(r.MsgSize) { 596 mflags |= linux.MSG_TRUNC 597 } 598 if trunc { 599 n = int64(r.MsgSize) 600 } 601 return int(n), mflags, from, fromLen, socket.ControlMessages{}, syserr.FromError(err) 602 } 603 604 // We'll have to block. Register for notification and keep trying to 605 // receive all the data. 606 e, ch := waiter.NewChannelEntry(waiter.ReadableEvents) 607 if err := s.EventRegister(&e); err != nil { 608 return 0, 0, from, fromLen, socket.ControlMessages{}, syserr.FromError(err) 609 } 610 defer s.EventUnregister(&e) 611 612 for { 613 if n, err := doRead(); err != linuxerr.ErrWouldBlock { 614 var mflags int 615 if n < int64(r.MsgSize) { 616 mflags |= linux.MSG_TRUNC 617 } 618 if trunc { 619 n = int64(r.MsgSize) 620 } 621 return int(n), mflags, from, fromLen, socket.ControlMessages{}, syserr.FromError(err) 622 } 623 624 if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { 625 if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { 626 return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain 627 } 628 return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) 629 } 630 } 631 } 632 633 // kernelSCM implements control.SCMCredentials with credentials that represent 634 // the kernel itself rather than a Task. 635 // 636 // +stateify savable 637 type kernelSCM struct{} 638 639 // Equals implements transport.CredentialsControlMessage.Equals. 640 func (kernelSCM) Equals(oc transport.CredentialsControlMessage) bool { 641 _, ok := oc.(kernelSCM) 642 return ok 643 } 644 645 // Credentials implements control.SCMCredentials.Credentials. 646 func (kernelSCM) Credentials(*kernel.Task) (kernel.ThreadID, auth.UID, auth.GID) { 647 return 0, auth.RootUID, auth.RootGID 648 } 649 650 // kernelCreds is the concrete version of kernelSCM used in all creds. 651 var kernelCreds = &kernelSCM{} 652 653 // sendResponse sends the response messages in ms back to userspace. 654 func (s *Socket) sendResponse(ctx context.Context, ms *nlmsg.MessageSet) *syserr.Error { 655 // Linux combines multiple netlink messages into a single datagram. 656 bufs := make([][]byte, 0, len(ms.Messages)) 657 for _, m := range ms.Messages { 658 bufs = append(bufs, m.Finalize()) 659 } 660 661 // All messages are from the kernel. 662 cms := transport.ControlMessages{ 663 Credentials: kernelCreds, 664 } 665 666 if len(bufs) > 0 { 667 // RecvMsg never receives the address, so we don't need to send 668 // one. 669 _, notify, err := s.connection.Send(ctx, bufs, cms, transport.Address{}) 670 // If the buffer is full, we simply drop messages, just like 671 // Linux. 672 if err != nil && err != syserr.ErrWouldBlock { 673 return err 674 } 675 if notify { 676 s.connection.SendNotify() 677 } 678 } 679 680 // N.B. multi-part messages should still send NLMSG_DONE even if 681 // nlmsg.MessageSet contains no messages. 682 // 683 // N.B. NLMSG_DONE is always sent in a different datagram. See 684 // net/netlink/af_netlink.c:netlink_dump. 685 if ms.Multi { 686 m := nlmsg.NewMessage(linux.NetlinkMessageHeader{ 687 Type: linux.NLMSG_DONE, 688 Flags: linux.NLM_F_MULTI, 689 Seq: ms.Seq, 690 PortID: uint32(ms.PortID), 691 }) 692 693 // Add the dump_done_errno payload. 694 m.Put(primitive.AllocateInt64(0)) 695 696 _, notify, err := s.connection.Send(ctx, [][]byte{m.Finalize()}, cms, transport.Address{}) 697 if err != nil && err != syserr.ErrWouldBlock { 698 return err 699 } 700 if notify { 701 s.connection.SendNotify() 702 } 703 } 704 705 return nil 706 } 707 708 func dumpErrorMessage(hdr linux.NetlinkMessageHeader, ms *nlmsg.MessageSet, err *syserr.Error) { 709 m := ms.AddMessage(linux.NetlinkMessageHeader{ 710 Type: linux.NLMSG_ERROR, 711 }) 712 m.Put(&linux.NetlinkErrorMessage{ 713 Error: int32(-err.ToLinux()), 714 Header: hdr, 715 }) 716 } 717 718 func dumpAckMessage(hdr linux.NetlinkMessageHeader, ms *nlmsg.MessageSet) { 719 m := ms.AddMessage(linux.NetlinkMessageHeader{ 720 Type: linux.NLMSG_ERROR, 721 }) 722 m.Put(&linux.NetlinkErrorMessage{ 723 Error: 0, 724 Header: hdr, 725 }) 726 } 727 728 // processMessages handles each message in buf, passing it to the protocol 729 // handler for final handling. 730 func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error { 731 for len(buf) > 0 { 732 msg, rest, ok := nlmsg.ParseMessage(buf) 733 if !ok { 734 // Linux ignores messages that are too short. See 735 // net/netlink/af_netlink.c:netlink_rcv_skb. 736 break 737 } 738 buf = rest 739 hdr := msg.Header() 740 741 // Ignore control messages. 742 if hdr.Type < linux.NLMSG_MIN_TYPE { 743 continue 744 } 745 746 ms := nlmsg.NewMessageSet(s.portID, hdr.Seq) 747 if err := s.protocol.ProcessMessage(ctx, msg, ms); err != nil { 748 dumpErrorMessage(hdr, ms, err) 749 } else if hdr.Flags&linux.NLM_F_ACK == linux.NLM_F_ACK { 750 dumpAckMessage(hdr, ms) 751 } 752 753 if err := s.sendResponse(ctx, ms); err != nil { 754 return err 755 } 756 } 757 758 return nil 759 } 760 761 // sendMsg is the core of message send, used for SendMsg and Write. 762 func (s *Socket) sendMsg(ctx context.Context, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) { 763 dstPort := int32(0) 764 765 if len(to) != 0 { 766 a, err := ExtractSockAddr(to) 767 if err != nil { 768 return 0, err 769 } 770 771 // No support for multicast groups yet. 772 if a.Groups != 0 { 773 return 0, syserr.ErrPermissionDenied 774 } 775 776 dstPort = int32(a.PortID) 777 } 778 779 if dstPort != 0 { 780 // Non-kernel destinations not supported yet. Treat as if 781 // NL_CFG_F_NONROOT_SEND is not set. 782 return 0, syserr.ErrPermissionDenied 783 } 784 785 s.mu.Lock() 786 defer s.mu.Unlock() 787 788 // For simplicity, and consistency with Linux, we copy in the entire 789 // message up front. 790 if src.NumBytes() > int64(s.sendBufferSize) { 791 return 0, syserr.ErrMessageTooLong 792 } 793 794 buf := make([]byte, src.NumBytes()) 795 n, err := src.CopyIn(ctx, buf) 796 // io.EOF can be only returned if src is a file, this means that 797 // sendMsg is called from splice and the error has to be ignored in 798 // this case. 799 if err == io.EOF { 800 err = nil 801 } 802 if err != nil { 803 // Don't partially consume messages. 804 return 0, syserr.FromError(err) 805 } 806 807 if err := s.processMessages(ctx, buf); err != nil { 808 return 0, err 809 } 810 811 return n, nil 812 } 813 814 // SendMsg implements socket.Socket.SendMsg. 815 func (s *Socket) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { 816 return s.sendMsg(t, src, to, flags, controlMessages) 817 } 818 819 // State implements socket.Socket.State. 820 func (s *Socket) State() uint32 { 821 return s.ep.State() 822 } 823 824 // Type implements socket.Socket.Type. 825 func (s *Socket) Type() (family int, skType linux.SockType, protocol int) { 826 return linux.AF_NETLINK, s.skType, s.protocol.Protocol() 827 }