github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/socket/hostinet/socket.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package hostinet 16 17 import ( 18 "fmt" 19 20 "golang.org/x/sys/unix" 21 "github.com/SagerNet/gvisor/pkg/abi/linux" 22 "github.com/SagerNet/gvisor/pkg/context" 23 "github.com/SagerNet/gvisor/pkg/errors/linuxerr" 24 "github.com/SagerNet/gvisor/pkg/fdnotifier" 25 "github.com/SagerNet/gvisor/pkg/hostarch" 26 "github.com/SagerNet/gvisor/pkg/log" 27 "github.com/SagerNet/gvisor/pkg/marshal" 28 "github.com/SagerNet/gvisor/pkg/marshal/primitive" 29 "github.com/SagerNet/gvisor/pkg/safemem" 30 "github.com/SagerNet/gvisor/pkg/sentry/arch" 31 "github.com/SagerNet/gvisor/pkg/sentry/fs" 32 "github.com/SagerNet/gvisor/pkg/sentry/fs/fsutil" 33 "github.com/SagerNet/gvisor/pkg/sentry/kernel" 34 ktime "github.com/SagerNet/gvisor/pkg/sentry/kernel/time" 35 "github.com/SagerNet/gvisor/pkg/sentry/socket" 36 "github.com/SagerNet/gvisor/pkg/sentry/socket/control" 37 "github.com/SagerNet/gvisor/pkg/syserr" 38 "github.com/SagerNet/gvisor/pkg/syserror" 39 "github.com/SagerNet/gvisor/pkg/usermem" 40 "github.com/SagerNet/gvisor/pkg/waiter" 41 ) 42 43 const ( 44 sizeofInt32 = 4 45 46 // sizeofSockaddr is the size in bytes of the largest sockaddr type 47 // supported by this package. 48 sizeofSockaddr = unix.SizeofSockaddrInet6 // sizeof(sockaddr_in6) > sizeof(sockaddr_in) 49 50 // maxControlLen is the maximum size of a control message buffer used in a 51 // recvmsg or sendmsg unix. 52 maxControlLen = 1024 53 ) 54 55 // LINT.IfChange 56 57 // socketOperations implements fs.FileOperations and socket.Socket for a socket 58 // implemented using a host socket. 59 type socketOperations struct { 60 fsutil.FilePipeSeek `state:"nosave"` 61 fsutil.FileNotDirReaddir `state:"nosave"` 62 fsutil.FileNoFsync `state:"nosave"` 63 fsutil.FileNoMMap `state:"nosave"` 64 fsutil.FileNoSplice `state:"nosave"` 65 fsutil.FileNoopFlush `state:"nosave"` 66 fsutil.FileUseInodeUnstableAttr `state:"nosave"` 67 68 socketOpsCommon 69 } 70 71 var _ = socket.Socket(&socketOperations{}) 72 73 func newSocketFile(ctx context.Context, family int, stype linux.SockType, protocol int, fd int, nonblock bool) (*fs.File, *syserr.Error) { 74 s := &socketOperations{ 75 socketOpsCommon: socketOpsCommon{ 76 family: family, 77 stype: stype, 78 protocol: protocol, 79 fd: fd, 80 }, 81 } 82 if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil { 83 return nil, syserr.FromError(err) 84 } 85 dirent := socket.NewDirent(ctx, socketDevice) 86 defer dirent.DecRef(ctx) 87 return fs.NewFile(ctx, dirent, fs.FileFlags{NonBlocking: nonblock, Read: true, Write: true, NonSeekable: true}, s), nil 88 } 89 90 // Ioctl implements fs.FileOperations.Ioctl. 91 func (s *socketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { 92 return ioctl(ctx, s.fd, io, args) 93 } 94 95 // Read implements fs.FileOperations.Read. 96 func (s *socketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { 97 n, err := dst.CopyOutFrom(ctx, safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) { 98 // Refuse to do anything if any part of dst.Addrs was unusable. 99 if uint64(dst.NumBytes()) != dsts.NumBytes() { 100 return 0, nil 101 } 102 if dsts.IsEmpty() { 103 return 0, nil 104 } 105 if dsts.NumBlocks() == 1 { 106 // Skip allocating []unix.Iovec. 107 n, err := unix.Read(s.fd, dsts.Head().ToSlice()) 108 if err != nil { 109 return 0, translateIOSyscallError(err) 110 } 111 return uint64(n), nil 112 } 113 return readv(s.fd, safemem.IovecsFromBlockSeq(dsts)) 114 })) 115 return int64(n), err 116 } 117 118 // Write implements fs.FileOperations.Write. 119 func (s *socketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { 120 n, err := src.CopyInTo(ctx, safemem.WriterFunc(func(srcs safemem.BlockSeq) (uint64, error) { 121 // Refuse to do anything if any part of src.Addrs was unusable. 122 if uint64(src.NumBytes()) != srcs.NumBytes() { 123 return 0, nil 124 } 125 if srcs.IsEmpty() { 126 return 0, nil 127 } 128 if srcs.NumBlocks() == 1 { 129 // Skip allocating []unix.Iovec. 130 n, err := unix.Write(s.fd, srcs.Head().ToSlice()) 131 if err != nil { 132 return 0, translateIOSyscallError(err) 133 } 134 return uint64(n), nil 135 } 136 return writev(s.fd, safemem.IovecsFromBlockSeq(srcs)) 137 })) 138 return int64(n), err 139 } 140 141 // Socket implements socket.Provider.Socket. 142 func (p *socketProvider) Socket(t *kernel.Task, stypeflags linux.SockType, protocol int) (*fs.File, *syserr.Error) { 143 // Check that we are using the host network stack. 144 stack := t.NetworkContext() 145 if stack == nil { 146 return nil, nil 147 } 148 if _, ok := stack.(*Stack); !ok { 149 return nil, nil 150 } 151 152 // Only accept TCP and UDP. 153 stype := stypeflags & linux.SOCK_TYPE_MASK 154 switch stype { 155 case unix.SOCK_STREAM: 156 switch protocol { 157 case 0, unix.IPPROTO_TCP: 158 // ok 159 default: 160 return nil, nil 161 } 162 case unix.SOCK_DGRAM: 163 switch protocol { 164 case 0, unix.IPPROTO_UDP: 165 // ok 166 default: 167 return nil, nil 168 } 169 default: 170 return nil, nil 171 } 172 173 // Conservatively ignore all flags specified by the application and add 174 // SOCK_NONBLOCK since socketOperations requires it. Pass a protocol of 0 175 // to simplify the syscall filters, since 0 and IPPROTO_* are equivalent. 176 fd, err := unix.Socket(p.family, int(stype)|unix.SOCK_NONBLOCK|unix.SOCK_CLOEXEC, 0) 177 if err != nil { 178 return nil, syserr.FromError(err) 179 } 180 return newSocketFile(t, p.family, stype, protocol, fd, stypeflags&unix.SOCK_NONBLOCK != 0) 181 } 182 183 // Pair implements socket.Provider.Pair. 184 func (p *socketProvider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { 185 // Not supported by AF_INET/AF_INET6. 186 return nil, nil, nil 187 } 188 189 // LINT.ThenChange(./socket_vfs2.go) 190 191 // socketOpsCommon contains the socket operations common to VFS1 and VFS2. 192 // 193 // +stateify savable 194 type socketOpsCommon struct { 195 socket.SendReceiveTimeout 196 197 family int // Read-only. 198 stype linux.SockType // Read-only. 199 protocol int // Read-only. 200 queue waiter.Queue 201 202 // fd is the host socket fd. It must have O_NONBLOCK, so that operations 203 // will return EWOULDBLOCK instead of blocking on the host. This allows us to 204 // handle blocking behavior independently in the sentry. 205 fd int 206 } 207 208 // Release implements fs.FileOperations.Release. 209 func (s *socketOpsCommon) Release(context.Context) { 210 fdnotifier.RemoveFD(int32(s.fd)) 211 unix.Close(s.fd) 212 } 213 214 // Readiness implements waiter.Waitable.Readiness. 215 func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask { 216 return fdnotifier.NonBlockingPoll(int32(s.fd), mask) 217 } 218 219 // EventRegister implements waiter.Waitable.EventRegister. 220 func (s *socketOpsCommon) EventRegister(e *waiter.Entry, mask waiter.EventMask) { 221 s.queue.EventRegister(e, mask) 222 fdnotifier.UpdateFD(int32(s.fd)) 223 } 224 225 // EventUnregister implements waiter.Waitable.EventUnregister. 226 func (s *socketOpsCommon) EventUnregister(e *waiter.Entry) { 227 s.queue.EventUnregister(e) 228 fdnotifier.UpdateFD(int32(s.fd)) 229 } 230 231 // Connect implements socket.Socket.Connect. 232 func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { 233 if len(sockaddr) > sizeofSockaddr { 234 sockaddr = sockaddr[:sizeofSockaddr] 235 } 236 237 _, _, errno := unix.Syscall(unix.SYS_CONNECT, uintptr(s.fd), uintptr(firstBytePtr(sockaddr)), uintptr(len(sockaddr))) 238 239 if errno == 0 { 240 return nil 241 } 242 if errno != unix.EINPROGRESS || !blocking { 243 return syserr.FromError(translateIOSyscallError(errno)) 244 } 245 246 // "EINPROGRESS: The socket is nonblocking and the connection cannot be 247 // completed immediately. It is possible to select(2) or poll(2) for 248 // completion by selecting the socket for writing. After select(2) 249 // indicates writability, use getsockopt(2) to read the SO_ERROR option at 250 // level SOL-SOCKET to determine whether connect() completed successfully 251 // (SO_ERROR is zero) or unsuccessfully (SO_ERROR is one of the usual error 252 // codes listed here, explaining the reason for the failure)." - connect(2) 253 e, ch := waiter.NewChannelEntry(nil) 254 writableMask := waiter.WritableEvents 255 s.EventRegister(&e, writableMask) 256 defer s.EventUnregister(&e) 257 if s.Readiness(writableMask)&writableMask == 0 { 258 if err := t.Block(ch); err != nil { 259 return syserr.FromError(err) 260 } 261 } 262 val, err := unix.GetsockoptInt(s.fd, unix.SOL_SOCKET, unix.SO_ERROR) 263 if err != nil { 264 return syserr.FromError(err) 265 } 266 if val != 0 { 267 return syserr.FromError(unix.Errno(uintptr(val))) 268 } 269 return nil 270 } 271 272 // Accept implements socket.Socket.Accept. 273 func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { 274 var peerAddr linux.SockAddr 275 var peerAddrBuf []byte 276 var peerAddrlen uint32 277 var peerAddrPtr *byte 278 var peerAddrlenPtr *uint32 279 if peerRequested { 280 peerAddrBuf = make([]byte, sizeofSockaddr) 281 peerAddrlen = uint32(len(peerAddrBuf)) 282 peerAddrPtr = &peerAddrBuf[0] 283 peerAddrlenPtr = &peerAddrlen 284 } 285 286 // Conservatively ignore all flags specified by the application and add 287 // SOCK_NONBLOCK since socketOpsCommon requires it. 288 fd, syscallErr := accept4(s.fd, peerAddrPtr, peerAddrlenPtr, unix.SOCK_NONBLOCK|unix.SOCK_CLOEXEC) 289 if blocking { 290 var ch chan struct{} 291 for syscallErr == syserror.ErrWouldBlock { 292 if ch != nil { 293 if syscallErr = t.Block(ch); syscallErr != nil { 294 break 295 } 296 } else { 297 var e waiter.Entry 298 e, ch = waiter.NewChannelEntry(nil) 299 s.EventRegister(&e, waiter.ReadableEvents) 300 defer s.EventUnregister(&e) 301 } 302 fd, syscallErr = accept4(s.fd, peerAddrPtr, peerAddrlenPtr, unix.SOCK_NONBLOCK|unix.SOCK_CLOEXEC) 303 } 304 } 305 306 if peerRequested { 307 peerAddr = socket.UnmarshalSockAddr(s.family, peerAddrBuf[:peerAddrlen]) 308 } 309 if syscallErr != nil { 310 return 0, peerAddr, peerAddrlen, syserr.FromError(syscallErr) 311 } 312 313 var ( 314 kfd int32 315 kerr error 316 ) 317 if kernel.VFS2Enabled { 318 f, err := newVFS2Socket(t, s.family, s.stype, s.protocol, fd, uint32(flags&unix.SOCK_NONBLOCK)) 319 if err != nil { 320 unix.Close(fd) 321 return 0, nil, 0, err 322 } 323 defer f.DecRef(t) 324 325 kfd, kerr = t.NewFDFromVFS2(0, f, kernel.FDFlags{ 326 CloseOnExec: flags&unix.SOCK_CLOEXEC != 0, 327 }) 328 t.Kernel().RecordSocketVFS2(f) 329 } else { 330 f, err := newSocketFile(t, s.family, s.stype, s.protocol, fd, flags&unix.SOCK_NONBLOCK != 0) 331 if err != nil { 332 unix.Close(fd) 333 return 0, nil, 0, err 334 } 335 defer f.DecRef(t) 336 337 kfd, kerr = t.NewFDFrom(0, f, kernel.FDFlags{ 338 CloseOnExec: flags&unix.SOCK_CLOEXEC != 0, 339 }) 340 t.Kernel().RecordSocket(f) 341 } 342 343 return kfd, peerAddr, peerAddrlen, syserr.FromError(kerr) 344 } 345 346 // Bind implements socket.Socket.Bind. 347 func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { 348 if len(sockaddr) > sizeofSockaddr { 349 sockaddr = sockaddr[:sizeofSockaddr] 350 } 351 352 _, _, errno := unix.Syscall(unix.SYS_BIND, uintptr(s.fd), uintptr(firstBytePtr(sockaddr)), uintptr(len(sockaddr))) 353 if errno != 0 { 354 return syserr.FromError(errno) 355 } 356 return nil 357 } 358 359 // Listen implements socket.Socket.Listen. 360 func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error { 361 return syserr.FromError(unix.Listen(s.fd, backlog)) 362 } 363 364 // Shutdown implements socket.Socket.Shutdown. 365 func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error { 366 switch how { 367 case unix.SHUT_RD, unix.SHUT_WR, unix.SHUT_RDWR: 368 return syserr.FromError(unix.Shutdown(s.fd, how)) 369 default: 370 return syserr.ErrInvalidArgument 371 } 372 } 373 374 // GetSockOpt implements socket.Socket.GetSockOpt. 375 func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { 376 if outLen < 0 { 377 return nil, syserr.ErrInvalidArgument 378 } 379 380 // Only allow known and safe options. 381 optlen := getSockOptLen(t, level, name) 382 switch level { 383 case linux.SOL_IP: 384 switch name { 385 case linux.IP_TOS, linux.IP_RECVTOS, linux.IP_PKTINFO, linux.IP_RECVORIGDSTADDR, linux.IP_RECVERR: 386 optlen = sizeofInt32 387 } 388 case linux.SOL_IPV6: 389 switch name { 390 case linux.IPV6_TCLASS, linux.IPV6_RECVTCLASS, linux.IPV6_RECVERR, linux.IPV6_V6ONLY, linux.IPV6_RECVORIGDSTADDR: 391 optlen = sizeofInt32 392 } 393 case linux.SOL_SOCKET: 394 switch name { 395 case linux.SO_ERROR, linux.SO_KEEPALIVE, linux.SO_SNDBUF, linux.SO_RCVBUF, linux.SO_REUSEADDR, linux.SO_TIMESTAMP: 396 optlen = sizeofInt32 397 case linux.SO_LINGER: 398 optlen = unix.SizeofLinger 399 } 400 case linux.SOL_TCP: 401 switch name { 402 case linux.TCP_NODELAY: 403 optlen = sizeofInt32 404 case linux.TCP_INFO: 405 optlen = int(linux.SizeOfTCPInfo) 406 } 407 } 408 409 if optlen == 0 { 410 return nil, syserr.ErrProtocolNotAvailable // ENOPROTOOPT 411 } 412 if outLen < optlen { 413 return nil, syserr.ErrInvalidArgument 414 } 415 416 opt, err := getsockopt(s.fd, level, name, optlen) 417 if err != nil { 418 return nil, syserr.FromError(err) 419 } 420 optP := primitive.ByteSlice(opt) 421 return &optP, nil 422 } 423 424 // SetSockOpt implements socket.Socket.SetSockOpt. 425 func (s *socketOpsCommon) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error { 426 // Only allow known and safe options. 427 optlen := setSockOptLen(t, level, name) 428 switch level { 429 case linux.SOL_IP: 430 switch name { 431 case linux.IP_TOS, linux.IP_RECVTOS, linux.IP_PKTINFO, linux.IP_RECVORIGDSTADDR, linux.IP_RECVERR: 432 optlen = sizeofInt32 433 } 434 case linux.SOL_IPV6: 435 switch name { 436 case linux.IPV6_TCLASS, linux.IPV6_RECVTCLASS, linux.IPV6_RECVERR, linux.IPV6_V6ONLY, linux.IPV6_RECVORIGDSTADDR: 437 optlen = sizeofInt32 438 } 439 case linux.SOL_SOCKET: 440 switch name { 441 case linux.SO_SNDBUF, linux.SO_RCVBUF, linux.SO_REUSEADDR, linux.SO_TIMESTAMP: 442 optlen = sizeofInt32 443 } 444 case linux.SOL_TCP: 445 switch name { 446 case linux.TCP_NODELAY, linux.TCP_INQ: 447 optlen = sizeofInt32 448 } 449 } 450 451 if optlen == 0 { 452 // Pretend to accept socket options we don't understand. This seems 453 // dangerous, but it's what netstack does... 454 return nil 455 } 456 if len(opt) < optlen { 457 return syserr.ErrInvalidArgument 458 } 459 opt = opt[:optlen] 460 461 _, _, errno := unix.Syscall6(unix.SYS_SETSOCKOPT, uintptr(s.fd), uintptr(level), uintptr(name), uintptr(firstBytePtr(opt)), uintptr(len(opt)), 0) 462 if errno != 0 { 463 return syserr.FromError(errno) 464 } 465 return nil 466 } 467 468 func (s *socketOpsCommon) recvMsgFromHost(iovs []unix.Iovec, flags int, senderRequested bool, controlLen uint64) (uint64, int, []byte, []byte, error) { 469 // We always do a non-blocking recv*(). 470 sysflags := flags | unix.MSG_DONTWAIT 471 472 msg := unix.Msghdr{} 473 if len(iovs) > 0 { 474 msg.Iov = &iovs[0] 475 msg.Iovlen = uint64(len(iovs)) 476 } 477 var senderAddrBuf []byte 478 if senderRequested { 479 senderAddrBuf = make([]byte, sizeofSockaddr) 480 msg.Name = &senderAddrBuf[0] 481 msg.Namelen = uint32(sizeofSockaddr) 482 } 483 var controlBuf []byte 484 if controlLen > 0 { 485 if controlLen > maxControlLen { 486 controlLen = maxControlLen 487 } 488 controlBuf = make([]byte, controlLen) 489 msg.Control = &controlBuf[0] 490 msg.Controllen = controlLen 491 } 492 n, err := recvmsg(s.fd, &msg, sysflags) 493 if err != nil { 494 return 0 /* n */, 0 /* mFlags */, nil /* senderAddrBuf */, nil /* controlBuf */, err 495 } 496 return n, int(msg.Flags), senderAddrBuf[:msg.Namelen], controlBuf[:msg.Controllen], err 497 } 498 499 // RecvMsg implements socket.Socket.RecvMsg. 500 func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { 501 // Only allow known and safe flags. 502 if flags&^(unix.MSG_DONTWAIT|unix.MSG_PEEK|unix.MSG_TRUNC|unix.MSG_ERRQUEUE) != 0 { 503 return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrInvalidArgument 504 } 505 506 var senderAddrBuf []byte 507 var controlBuf []byte 508 var msgFlags int 509 copyToDst := func() (int64, error) { 510 var n uint64 511 var err error 512 if dst.NumBytes() == 0 { 513 // We want to make the recvmsg(2) call to the host even if dst is empty 514 // to fetch control messages, sender address or errors if any occur. 515 n, msgFlags, senderAddrBuf, controlBuf, err = s.recvMsgFromHost(nil, flags, senderRequested, controlLen) 516 return int64(n), err 517 } 518 519 recvmsgToBlocks := safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) { 520 // Refuse to do anything if any part of dst.Addrs was unusable. 521 if uint64(dst.NumBytes()) != dsts.NumBytes() { 522 return 0, nil 523 } 524 if dsts.IsEmpty() { 525 return 0, nil 526 } 527 528 n, msgFlags, senderAddrBuf, controlBuf, err = s.recvMsgFromHost(safemem.IovecsFromBlockSeq(dsts), flags, senderRequested, controlLen) 529 return n, err 530 }) 531 return dst.CopyOutFrom(t, recvmsgToBlocks) 532 } 533 534 var ch chan struct{} 535 n, err := copyToDst() 536 // recv*(MSG_ERRQUEUE) never blocks, even without MSG_DONTWAIT. 537 if flags&(unix.MSG_DONTWAIT|unix.MSG_ERRQUEUE) == 0 { 538 for err == syserror.ErrWouldBlock { 539 // We only expect blocking to come from the actual syscall, in which 540 // case it can't have returned any data. 541 if n != 0 { 542 panic(fmt.Sprintf("CopyOutFrom: got (%d, %v), wanted (0, %v)", n, err, err)) 543 } 544 if ch != nil { 545 if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { 546 break 547 } 548 } else { 549 var e waiter.Entry 550 e, ch = waiter.NewChannelEntry(nil) 551 s.EventRegister(&e, waiter.ReadableEvents) 552 defer s.EventUnregister(&e) 553 } 554 n, err = copyToDst() 555 } 556 } 557 if err != nil { 558 return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) 559 } 560 561 var senderAddr linux.SockAddr 562 if senderRequested { 563 senderAddr = socket.UnmarshalSockAddr(s.family, senderAddrBuf) 564 } 565 566 unixControlMessages, err := unix.ParseSocketControlMessage(controlBuf) 567 if err != nil { 568 return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) 569 } 570 return int(n), msgFlags, senderAddr, uint32(len(senderAddrBuf)), parseUnixControlMessages(unixControlMessages), nil 571 } 572 573 func parseUnixControlMessages(unixControlMessages []unix.SocketControlMessage) socket.ControlMessages { 574 controlMessages := socket.ControlMessages{} 575 for _, unixCmsg := range unixControlMessages { 576 switch unixCmsg.Header.Level { 577 case linux.SOL_SOCKET: 578 switch unixCmsg.Header.Type { 579 case linux.SO_TIMESTAMP: 580 controlMessages.IP.HasTimestamp = true 581 ts := linux.Timeval{} 582 ts.UnmarshalUnsafe(unixCmsg.Data[:linux.SizeOfTimeval]) 583 controlMessages.IP.Timestamp = ts.ToNsecCapped() 584 } 585 586 case linux.SOL_IP: 587 switch unixCmsg.Header.Type { 588 case linux.IP_TOS: 589 controlMessages.IP.HasTOS = true 590 var tos primitive.Uint8 591 tos.UnmarshalUnsafe(unixCmsg.Data[:tos.SizeBytes()]) 592 controlMessages.IP.TOS = uint8(tos) 593 594 case linux.IP_PKTINFO: 595 controlMessages.IP.HasIPPacketInfo = true 596 var packetInfo linux.ControlMessageIPPacketInfo 597 packetInfo.UnmarshalUnsafe(unixCmsg.Data[:packetInfo.SizeBytes()]) 598 controlMessages.IP.PacketInfo = packetInfo 599 600 case linux.IP_RECVORIGDSTADDR: 601 var addr linux.SockAddrInet 602 addr.UnmarshalUnsafe(unixCmsg.Data[:addr.SizeBytes()]) 603 controlMessages.IP.OriginalDstAddress = &addr 604 605 case unix.IP_RECVERR: 606 var errCmsg linux.SockErrCMsgIPv4 607 errCmsg.UnmarshalBytes(unixCmsg.Data) 608 controlMessages.IP.SockErr = &errCmsg 609 } 610 611 case linux.SOL_IPV6: 612 switch unixCmsg.Header.Type { 613 case linux.IPV6_TCLASS: 614 controlMessages.IP.HasTClass = true 615 var tclass primitive.Uint32 616 tclass.UnmarshalUnsafe(unixCmsg.Data[:tclass.SizeBytes()]) 617 controlMessages.IP.TClass = uint32(tclass) 618 619 case linux.IPV6_RECVORIGDSTADDR: 620 var addr linux.SockAddrInet6 621 addr.UnmarshalUnsafe(unixCmsg.Data[:addr.SizeBytes()]) 622 controlMessages.IP.OriginalDstAddress = &addr 623 624 case unix.IPV6_RECVERR: 625 var errCmsg linux.SockErrCMsgIPv6 626 errCmsg.UnmarshalBytes(unixCmsg.Data) 627 controlMessages.IP.SockErr = &errCmsg 628 } 629 630 case linux.SOL_TCP: 631 switch unixCmsg.Header.Type { 632 case linux.TCP_INQ: 633 controlMessages.IP.HasInq = true 634 var inq primitive.Int32 635 inq.UnmarshalUnsafe(unixCmsg.Data[:linux.SizeOfControlMessageInq]) 636 controlMessages.IP.Inq = int32(inq) 637 } 638 } 639 } 640 return controlMessages 641 } 642 643 // SendMsg implements socket.Socket.SendMsg. 644 func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { 645 // Only allow known and safe flags. 646 if flags&^(unix.MSG_DONTWAIT|unix.MSG_EOR|unix.MSG_FASTOPEN|unix.MSG_MORE|unix.MSG_NOSIGNAL) != 0 { 647 return 0, syserr.ErrInvalidArgument 648 } 649 650 // If the src is zero-length, call SENDTO directly with a null buffer in 651 // order to generate poll/epoll notifications. 652 if src.NumBytes() == 0 { 653 sysflags := flags | unix.MSG_DONTWAIT 654 n, _, errno := unix.Syscall6(unix.SYS_SENDTO, uintptr(s.fd), 0, 0, uintptr(sysflags), uintptr(firstBytePtr(to)), uintptr(len(to))) 655 if errno != 0 { 656 return 0, syserr.FromError(errno) 657 } 658 return int(n), nil 659 } 660 661 space := uint64(control.CmsgsSpace(t, controlMessages)) 662 if space > maxControlLen { 663 space = maxControlLen 664 } 665 controlBuf := make([]byte, 0, space) 666 // PackControlMessages will append up to space bytes to controlBuf. 667 controlBuf = control.PackControlMessages(t, controlMessages, controlBuf) 668 669 sendmsgFromBlocks := safemem.WriterFunc(func(srcs safemem.BlockSeq) (uint64, error) { 670 // Refuse to do anything if any part of src.Addrs was unusable. 671 if uint64(src.NumBytes()) != srcs.NumBytes() { 672 return 0, nil 673 } 674 if srcs.IsEmpty() && len(controlBuf) == 0 { 675 return 0, nil 676 } 677 678 // We always do a non-blocking send*(). 679 sysflags := flags | unix.MSG_DONTWAIT 680 681 if srcs.NumBlocks() == 1 && len(controlBuf) == 0 { 682 // Skip allocating []unix.Iovec. 683 src := srcs.Head() 684 n, _, errno := unix.Syscall6(unix.SYS_SENDTO, uintptr(s.fd), src.Addr(), uintptr(src.Len()), uintptr(sysflags), uintptr(firstBytePtr(to)), uintptr(len(to))) 685 if errno != 0 { 686 return 0, translateIOSyscallError(errno) 687 } 688 return uint64(n), nil 689 } 690 691 iovs := safemem.IovecsFromBlockSeq(srcs) 692 msg := unix.Msghdr{ 693 Iov: &iovs[0], 694 Iovlen: uint64(len(iovs)), 695 } 696 if len(to) != 0 { 697 msg.Name = &to[0] 698 msg.Namelen = uint32(len(to)) 699 } 700 if len(controlBuf) != 0 { 701 msg.Control = &controlBuf[0] 702 msg.Controllen = uint64(len(controlBuf)) 703 } 704 return sendmsg(s.fd, &msg, sysflags) 705 }) 706 707 var ch chan struct{} 708 n, err := src.CopyInTo(t, sendmsgFromBlocks) 709 if flags&unix.MSG_DONTWAIT == 0 { 710 for err == syserror.ErrWouldBlock { 711 // We only expect blocking to come from the actual syscall, in which 712 // case it can't have returned any data. 713 if n != 0 { 714 panic(fmt.Sprintf("CopyInTo: got (%d, %v), wanted (0, %v)", n, err, err)) 715 } 716 if ch != nil { 717 if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { 718 if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { 719 err = syserror.ErrWouldBlock 720 } 721 break 722 } 723 } else { 724 var e waiter.Entry 725 e, ch = waiter.NewChannelEntry(nil) 726 s.EventRegister(&e, waiter.WritableEvents) 727 defer s.EventUnregister(&e) 728 } 729 n, err = src.CopyInTo(t, sendmsgFromBlocks) 730 } 731 } 732 733 return int(n), syserr.FromError(err) 734 } 735 736 func translateIOSyscallError(err error) error { 737 if err == unix.EAGAIN || err == unix.EWOULDBLOCK { 738 return syserror.ErrWouldBlock 739 } 740 return err 741 } 742 743 // State implements socket.Socket.State. 744 func (s *socketOpsCommon) State() uint32 { 745 info := linux.TCPInfo{} 746 buf, err := getsockopt(s.fd, unix.SOL_TCP, unix.TCP_INFO, linux.SizeOfTCPInfo) 747 if err != nil { 748 if err != unix.ENOPROTOOPT { 749 log.Warningf("Failed to get TCP socket info from %+v: %v", s, err) 750 } 751 // For non-TCP sockets, silently ignore the failure. 752 return 0 753 } 754 if len(buf) != linux.SizeOfTCPInfo { 755 // Unmarshal below will panic if getsockopt returns a buffer of 756 // unexpected size. 757 log.Warningf("Failed to get TCP socket info from %+v: getsockopt(2) returned %d bytes, expecting %d bytes.", s, len(buf), linux.SizeOfTCPInfo) 758 return 0 759 } 760 761 info.UnmarshalUnsafe(buf[:info.SizeBytes()]) 762 return uint32(info.State) 763 } 764 765 // Type implements socket.Socket.Type. 766 func (s *socketOpsCommon) Type() (family int, skType linux.SockType, protocol int) { 767 return s.family, s.stype, s.protocol 768 } 769 770 type socketProvider struct { 771 family int 772 } 773 774 func init() { 775 for _, family := range []int{unix.AF_INET, unix.AF_INET6} { 776 socket.RegisterProvider(family, &socketProvider{family}) 777 socket.RegisterProviderVFS2(family, &socketProviderVFS2{family}) 778 } 779 }