gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/socket/netstack/netstack.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package netstack provides an implementation of the socket.Socket interface
    16  // that is backed by a tcpip.Endpoint.
    17  //
    18  // It does not depend on any particular endpoint implementation, and thus can
    19  // be used to expose certain endpoints to the sentry while leaving others out,
    20  // for example, TCP endpoints and Unix-domain endpoints.
    21  //
    22  // Lock ordering: netstack => mm: ioSequenceReadWriter copies user memory inside
    23  // tcpip.Endpoint.Write(). Netstack is allowed to (and does) hold locks during
    24  // this operation.
    25  package netstack
    26  
    27  import (
    28  	"bytes"
    29  	"encoding/binary"
    30  	"fmt"
    31  	"io"
    32  	"io/ioutil"
    33  	"math"
    34  	"reflect"
    35  	"time"
    36  
    37  	"golang.org/x/sys/unix"
    38  	"google.golang.org/protobuf/proto"
    39  	"gvisor.dev/gvisor/pkg/abi/linux"
    40  	"gvisor.dev/gvisor/pkg/abi/linux/errno"
    41  	"gvisor.dev/gvisor/pkg/context"
    42  	"gvisor.dev/gvisor/pkg/errors/linuxerr"
    43  	"gvisor.dev/gvisor/pkg/eventchannel"
    44  	"gvisor.dev/gvisor/pkg/hostarch"
    45  	"gvisor.dev/gvisor/pkg/log"
    46  	"gvisor.dev/gvisor/pkg/marshal"
    47  	"gvisor.dev/gvisor/pkg/marshal/primitive"
    48  	"gvisor.dev/gvisor/pkg/metric"
    49  	"gvisor.dev/gvisor/pkg/sentry/arch"
    50  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
    51  	"gvisor.dev/gvisor/pkg/sentry/inet"
    52  	"gvisor.dev/gvisor/pkg/sentry/kernel"
    53  	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
    54  	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
    55  	"gvisor.dev/gvisor/pkg/sentry/socket"
    56  	"gvisor.dev/gvisor/pkg/sentry/socket/netfilter"
    57  	epb "gvisor.dev/gvisor/pkg/sentry/socket/netstack/events_go_proto"
    58  	"gvisor.dev/gvisor/pkg/sentry/vfs"
    59  	"gvisor.dev/gvisor/pkg/sync"
    60  	"gvisor.dev/gvisor/pkg/syserr"
    61  	"gvisor.dev/gvisor/pkg/tcpip"
    62  	"gvisor.dev/gvisor/pkg/tcpip/header"
    63  	"gvisor.dev/gvisor/pkg/tcpip/stack"
    64  	"gvisor.dev/gvisor/pkg/tcpip/transport"
    65  	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
    66  	"gvisor.dev/gvisor/pkg/usermem"
    67  	"gvisor.dev/gvisor/pkg/waiter"
    68  )
    69  
// bitsPerUint32 is the number of bits in a uint32.
const bitsPerUint32 = 32
    71  
    72  // statCounterValue returns a function usable as callback function when defining a gVisor Sentry
    73  // metric that contains the value counted by the StatCounter.
    74  // This avoids a dependency loop in the tcpip package.
    75  func statCounterValue(cm *tcpip.StatCounter) func(...*metric.FieldValue) uint64 {
    76  	return func(...*metric.FieldValue) uint64 {
    77  		return cm.Value()
    78  	}
    79  }
    80  
    81  func mustCreateMetric(name, description string) *tcpip.StatCounter {
    82  	var cm tcpip.StatCounter
    83  	metric.MustRegisterCustomUint64Metric(name, true /* cumulative */, false /* sync */, description, statCounterValue(&cm))
    84  	return &cm
    85  }
    86  
    87  func mustCreateGauge(name, description string) *tcpip.StatCounter {
    88  	var cm tcpip.StatCounter
    89  	metric.MustRegisterCustomUint64Metric(name, false /* cumulative */, false /* sync */, description, statCounterValue(&cm))
    90  	return &cm
    91  }
    92  
    93  // Metrics contains metrics exported by netstack.
    94  var Metrics = tcpip.Stats{
    95  	DroppedPackets: mustCreateMetric("/netstack/dropped_packets", "Number of packets dropped at the transport layer."),
    96  	NICs: tcpip.NICStats{
    97  		MalformedL4RcvdPackets: mustCreateMetric("/netstack/nic/malformed_l4_received_packets", "Number of packets received that failed L4 header parsing."),
    98  		Tx: tcpip.NICPacketStats{
    99  			Packets: mustCreateMetric("/netstack/nic/tx/packets", "Number of packets transmitted."),
   100  			Bytes:   mustCreateMetric("/netstack/nic/tx/bytes", "Number of bytes transmitted."),
   101  		},
   102  		TxPacketsDroppedNoBufferSpace: mustCreateMetric("/netstack/nic/tx_packets_dropped_no_buffer_space", "Number of TX packets dropped as a result of no buffer space errors."),
   103  		Rx: tcpip.NICPacketStats{
   104  			Packets: mustCreateMetric("/netstack/nic/rx/packets", "Number of packets received."),
   105  			Bytes:   mustCreateMetric("/netstack/nic/rx/bytes", "Number of bytes received."),
   106  		},
   107  		DisabledRx: tcpip.NICPacketStats{
   108  			Packets: mustCreateMetric("/netstack/nic/disabled_rx/packets", "Number of packets received on disabled NICs."),
   109  			Bytes:   mustCreateMetric("/netstack/nic/disabled_rx/bytes", "Number of bytes received on disabled NICs."),
   110  		},
   111  		Neighbor: tcpip.NICNeighborStats{
   112  			UnreachableEntryLookups:                    mustCreateMetric("/netstack/nic/neighbor/unreachable_entry_loopups", "Number of lookups performed on a neighbor entry in Unreachable state."),
   113  			DroppedConfirmationForNoninitiatedNeighbor: mustCreateMetric("/netstack/nic/neighbor/dropped_confirmation_for_noninitiated_neighbor", "Number of advertisements received that don't match an entry in the neighbor cache."),
   114  			DroppedInvalidLinkAddressConfirmations:     mustCreateMetric("/netstack/nic/neighbor/dropped_invalid_link_address_confirmations", "Number of advertisements dropped because they have empty source link-layer addresses"),
   115  		},
   116  	},
   117  	ICMP: tcpip.ICMPStats{
   118  		V4: tcpip.ICMPv4Stats{
   119  			PacketsSent: tcpip.ICMPv4SentPacketStats{
   120  				ICMPv4PacketStats: tcpip.ICMPv4PacketStats{
   121  					EchoRequest:    mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_request", "Number of ICMPv4 echo request packets sent."),
   122  					EchoReply:      mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_reply", "Number of ICMPv4 echo reply packets sent."),
   123  					DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_sent/dst_unreachable", "Number of ICMPv4 destination unreachable packets sent."),
   124  					SrcQuench:      mustCreateMetric("/netstack/icmp/v4/packets_sent/src_quench", "Number of ICMPv4 source quench packets sent."),
   125  					Redirect:       mustCreateMetric("/netstack/icmp/v4/packets_sent/redirect", "Number of ICMPv4 redirect packets sent."),
   126  					TimeExceeded:   mustCreateMetric("/netstack/icmp/v4/packets_sent/time_exceeded", "Number of ICMPv4 time exceeded packets sent."),
   127  					ParamProblem:   mustCreateMetric("/netstack/icmp/v4/packets_sent/param_problem", "Number of ICMPv4 parameter problem packets sent."),
   128  					Timestamp:      mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp", "Number of ICMPv4 timestamp packets sent."),
   129  					TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp_reply", "Number of ICMPv4 timestamp reply packets sent."),
   130  					InfoRequest:    mustCreateMetric("/netstack/icmp/v4/packets_sent/info_request", "Number of ICMPv4 information request packets sent."),
   131  					InfoReply:      mustCreateMetric("/netstack/icmp/v4/packets_sent/info_reply", "Number of ICMPv4 information reply packets sent."),
   132  				},
   133  				Dropped:     mustCreateMetric("/netstack/icmp/v4/packets_sent/dropped", "Number of ICMPv4 packets dropped due to link layer errors."),
   134  				RateLimited: mustCreateMetric("/netstack/icmp/v4/packets_sent/rate_limited", "Number of ICMPv4 packets dropped due to rate limit being exceeded."),
   135  			},
   136  			PacketsReceived: tcpip.ICMPv4ReceivedPacketStats{
   137  				ICMPv4PacketStats: tcpip.ICMPv4PacketStats{
   138  					EchoRequest:    mustCreateMetric("/netstack/icmp/v4/packets_received/echo_request", "Number of ICMPv4 echo request packets received."),
   139  					EchoReply:      mustCreateMetric("/netstack/icmp/v4/packets_received/echo_reply", "Number of ICMPv4 echo reply packets received."),
   140  					DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_received/dst_unreachable", "Number of ICMPv4 destination unreachable packets received."),
   141  					SrcQuench:      mustCreateMetric("/netstack/icmp/v4/packets_received/src_quench", "Number of ICMPv4 source quench packets received."),
   142  					Redirect:       mustCreateMetric("/netstack/icmp/v4/packets_received/redirect", "Number of ICMPv4 redirect packets received."),
   143  					TimeExceeded:   mustCreateMetric("/netstack/icmp/v4/packets_received/time_exceeded", "Number of ICMPv4 time exceeded packets received."),
   144  					ParamProblem:   mustCreateMetric("/netstack/icmp/v4/packets_received/param_problem", "Number of ICMPv4 parameter problem packets received."),
   145  					Timestamp:      mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp", "Number of ICMPv4 timestamp packets received."),
   146  					TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp_reply", "Number of ICMPv4 timestamp reply packets received."),
   147  					InfoRequest:    mustCreateMetric("/netstack/icmp/v4/packets_received/info_request", "Number of ICMPv4 information request packets received."),
   148  					InfoReply:      mustCreateMetric("/netstack/icmp/v4/packets_received/info_reply", "Number of ICMPv4 information reply packets received."),
   149  				},
   150  				Invalid: mustCreateMetric("/netstack/icmp/v4/packets_received/invalid", "Number of ICMPv4 packets received that the transport layer could not parse."),
   151  			},
   152  		},
   153  		V6: tcpip.ICMPv6Stats{
   154  			PacketsSent: tcpip.ICMPv6SentPacketStats{
   155  				ICMPv6PacketStats: tcpip.ICMPv6PacketStats{
   156  					EchoRequest:             mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_request", "Number of ICMPv6 echo request packets sent."),
   157  					EchoReply:               mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_reply", "Number of ICMPv6 echo reply packets sent."),
   158  					DstUnreachable:          mustCreateMetric("/netstack/icmp/v6/packets_sent/dst_unreachable", "Number of ICMPv6 destination unreachable packets sent."),
   159  					PacketTooBig:            mustCreateMetric("/netstack/icmp/v6/packets_sent/packet_too_big", "Number of ICMPv6 packet too big packets sent."),
   160  					TimeExceeded:            mustCreateMetric("/netstack/icmp/v6/packets_sent/time_exceeded", "Number of ICMPv6 time exceeded packets sent."),
   161  					ParamProblem:            mustCreateMetric("/netstack/icmp/v6/packets_sent/param_problem", "Number of ICMPv6 parameter problem packets sent."),
   162  					RouterSolicit:           mustCreateMetric("/netstack/icmp/v6/packets_sent/router_solicit", "Number of ICMPv6 router solicit packets sent."),
   163  					RouterAdvert:            mustCreateMetric("/netstack/icmp/v6/packets_sent/router_advert", "Number of ICMPv6 router advert packets sent."),
   164  					NeighborSolicit:         mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_solicit", "Number of ICMPv6 neighbor solicit packets sent."),
   165  					NeighborAdvert:          mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_advert", "Number of ICMPv6 neighbor advert packets sent."),
   166  					RedirectMsg:             mustCreateMetric("/netstack/icmp/v6/packets_sent/redirect_msg", "Number of ICMPv6 redirect message packets sent."),
   167  					MulticastListenerQuery:  mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_query", "Number of ICMPv6 multicast listener query packets sent."),
   168  					MulticastListenerReport: mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_report", "Number of ICMPv6 multicast listener report packets sent."),
   169  					MulticastListenerDone:   mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_done", "Number of ICMPv6 multicast listener done packets sent."),
   170  				},
   171  				Dropped:     mustCreateMetric("/netstack/icmp/v6/packets_sent/dropped", "Number of ICMPv6 packets dropped due to link layer errors."),
   172  				RateLimited: mustCreateMetric("/netstack/icmp/v6/packets_sent/rate_limited", "Number of ICMPv6 packets dropped due to rate limit being exceeded."),
   173  			},
   174  			PacketsReceived: tcpip.ICMPv6ReceivedPacketStats{
   175  				ICMPv6PacketStats: tcpip.ICMPv6PacketStats{
   176  					EchoRequest:             mustCreateMetric("/netstack/icmp/v6/packets_received/echo_request", "Number of ICMPv6 echo request packets received."),
   177  					EchoReply:               mustCreateMetric("/netstack/icmp/v6/packets_received/echo_reply", "Number of ICMPv6 echo reply packets received."),
   178  					DstUnreachable:          mustCreateMetric("/netstack/icmp/v6/packets_received/dst_unreachable", "Number of ICMPv6 destination unreachable packets received."),
   179  					PacketTooBig:            mustCreateMetric("/netstack/icmp/v6/packets_received/packet_too_big", "Number of ICMPv6 packet too big packets received."),
   180  					TimeExceeded:            mustCreateMetric("/netstack/icmp/v6/packets_received/time_exceeded", "Number of ICMPv6 time exceeded packets received."),
   181  					ParamProblem:            mustCreateMetric("/netstack/icmp/v6/packets_received/param_problem", "Number of ICMPv6 parameter problem packets received."),
   182  					RouterSolicit:           mustCreateMetric("/netstack/icmp/v6/packets_received/router_solicit", "Number of ICMPv6 router solicit packets received."),
   183  					RouterAdvert:            mustCreateMetric("/netstack/icmp/v6/packets_received/router_advert", "Number of ICMPv6 router advert packets received."),
   184  					NeighborSolicit:         mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_solicit", "Number of ICMPv6 neighbor solicit packets received."),
   185  					NeighborAdvert:          mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_advert", "Number of ICMPv6 neighbor advert packets received."),
   186  					RedirectMsg:             mustCreateMetric("/netstack/icmp/v6/packets_received/redirect_msg", "Number of ICMPv6 redirect message packets received."),
   187  					MulticastListenerQuery:  mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_query", "Number of ICMPv6 multicast listener query packets received."),
   188  					MulticastListenerReport: mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_report", "Number of ICMPv6 multicast listener report packets sent."),
   189  					MulticastListenerDone:   mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_done", "Number of ICMPv6 multicast listener done packets sent."),
   190  				},
   191  				Unrecognized:                   mustCreateMetric("/netstack/icmp/v6/packets_received/unrecognized", "Number of ICMPv6 packets received that the transport layer does not know how to parse."),
   192  				Invalid:                        mustCreateMetric("/netstack/icmp/v6/packets_received/invalid", "Number of ICMPv6 packets received that the transport layer could not parse."),
   193  				RouterOnlyPacketsDroppedByHost: mustCreateMetric("/netstack/icmp/v6/packets_received/router_only_packets_dropped_by_host", "Number of ICMPv6 packets dropped due to being router-specific packets."),
   194  			},
   195  		},
   196  	},
   197  	IGMP: tcpip.IGMPStats{
   198  		PacketsSent: tcpip.IGMPSentPacketStats{
   199  			IGMPPacketStats: tcpip.IGMPPacketStats{
   200  				MembershipQuery:    mustCreateMetric("/netstack/igmp/packets_sent/membership_query", "Number of IGMP Membership Query messages sent."),
   201  				V1MembershipReport: mustCreateMetric("/netstack/igmp/packets_sent/v1_membership_report", "Number of IGMPv1 Membership Report messages sent."),
   202  				V2MembershipReport: mustCreateMetric("/netstack/igmp/packets_sent/v2_membership_report", "Number of IGMPv2 Membership Report messages sent."),
   203  				LeaveGroup:         mustCreateMetric("/netstack/igmp/packets_sent/leave_group", "Number of IGMP Leave Group messages sent."),
   204  			},
   205  			Dropped: mustCreateMetric("/netstack/igmp/packets_sent/dropped", "Number of IGMP packets dropped due to link layer errors."),
   206  		},
   207  		PacketsReceived: tcpip.IGMPReceivedPacketStats{
   208  			IGMPPacketStats: tcpip.IGMPPacketStats{
   209  				MembershipQuery:    mustCreateMetric("/netstack/igmp/packets_received/membership_query", "Number of IGMP Membership Query messages received."),
   210  				V1MembershipReport: mustCreateMetric("/netstack/igmp/packets_received/v1_membership_report", "Number of IGMPv1 Membership Report messages received."),
   211  				V2MembershipReport: mustCreateMetric("/netstack/igmp/packets_received/v2_membership_report", "Number of IGMPv2 Membership Report messages received."),
   212  				LeaveGroup:         mustCreateMetric("/netstack/igmp/packets_received/leave_group", "Number of IGMP Leave Group messages received."),
   213  			},
   214  			Invalid:        mustCreateMetric("/netstack/igmp/packets_received/invalid", "Number of IGMP packets received that could not be parsed."),
   215  			ChecksumErrors: mustCreateMetric("/netstack/igmp/packets_received/checksum_errors", "Number of received IGMP packets with bad checksums."),
   216  			Unrecognized:   mustCreateMetric("/netstack/igmp/packets_received/unrecognized", "Number of unrecognized IGMP packets received."),
   217  		},
   218  	},
   219  	IP: tcpip.IPStats{
   220  		PacketsReceived:                     mustCreateMetric("/netstack/ip/packets_received", "Number of IP packets received from the link layer in nic.DeliverNetworkPacket."),
   221  		DisabledPacketsReceived:             mustCreateMetric("/netstack/ip/disabled_packets_received", "Number of IP packets received from the link layer when the IP layer is disabled."),
   222  		InvalidDestinationAddressesReceived: mustCreateMetric("/netstack/ip/invalid_addresses_received", "Number of IP packets received with an unknown or invalid destination address."),
   223  		InvalidSourceAddressesReceived:      mustCreateMetric("/netstack/ip/invalid_source_addresses_received", "Number of IP packets received with an unknown or invalid source address."),
   224  		PacketsDelivered:                    mustCreateMetric("/netstack/ip/packets_delivered", "Number of incoming IP packets that are successfully delivered to the transport layer via HandlePacket."),
   225  		PacketsSent:                         mustCreateMetric("/netstack/ip/packets_sent", "Number of IP packets sent via WritePacket."),
   226  		OutgoingPacketErrors:                mustCreateMetric("/netstack/ip/outgoing_packet_errors", "Number of IP packets which failed to write to a link-layer endpoint."),
   227  		MalformedPacketsReceived:            mustCreateMetric("/netstack/ip/malformed_packets_received", "Number of IP packets which failed IP header validation checks."),
   228  		MalformedFragmentsReceived:          mustCreateMetric("/netstack/ip/malformed_fragments_received", "Number of IP fragments which failed IP fragment validation checks."),
   229  		IPTablesPreroutingDropped:           mustCreateMetric("/netstack/ip/iptables/prerouting_dropped", "Number of IP packets dropped in the Prerouting chain."),
   230  		IPTablesInputDropped:                mustCreateMetric("/netstack/ip/iptables/input_dropped", "Number of IP packets dropped in the Input chain."),
   231  		IPTablesOutputDropped:               mustCreateMetric("/netstack/ip/iptables/output_dropped", "Number of IP packets dropped in the Output chain."),
   232  		OptionTimestampReceived:             mustCreateMetric("/netstack/ip/options/timestamp_received", "Number of timestamp options found in received IP packets."),
   233  		OptionRecordRouteReceived:           mustCreateMetric("/netstack/ip/options/record_route_received", "Number of record route options found in received IP packets."),
   234  		OptionRouterAlertReceived:           mustCreateMetric("/netstack/ip/options/router_alert_received", "Number of router alert options found in received IP packets."),
   235  		OptionUnknownReceived:               mustCreateMetric("/netstack/ip/options/unknown_received", "Number of unknown options found in received IP packets."),
   236  		Forwarding: tcpip.IPForwardingStats{
   237  			Unrouteable:            mustCreateMetric("/netstack/ip/forwarding/unrouteable", "Number of IP packets received which couldn't be routed and thus were not forwarded."),
   238  			ExhaustedTTL:           mustCreateMetric("/netstack/ip/forwarding/exhausted_ttl", "Number of IP packets received which could not be forwarded due to an exhausted TTL."),
   239  			LinkLocalSource:        mustCreateMetric("/netstack/ip/forwarding/link_local_source_address", "Number of IP packets received which could not be forwarded due to a link-local source address."),
   240  			LinkLocalDestination:   mustCreateMetric("/netstack/ip/forwarding/link_local_destination_address", "Number of IP packets received which could not be forwarded due to a link-local destination address."),
   241  			ExtensionHeaderProblem: mustCreateMetric("/netstack/ip/forwarding/extension_header_problem", "Number of IP packets received which could not be forwarded due to a problem processing their IPv6 extension headers."),
   242  			PacketTooBig:           mustCreateMetric("/netstack/ip/forwarding/packet_too_big", "Number of IP packets received which could not be forwarded because they could not fit within the outgoing MTU."),
   243  			HostUnreachable:        mustCreateMetric("/netstack/ip/forwarding/host_unreachable", "Number of IP packets received which could not be forwarded due to unresolvable next hop."),
   244  			Errors:                 mustCreateMetric("/netstack/ip/forwarding/errors", "Number of IP packets which couldn't be forwarded."),
   245  		},
   246  	},
   247  	ARP: tcpip.ARPStats{
   248  		PacketsReceived:                                 mustCreateMetric("/netstack/arp/packets_received", "Number of ARP packets received from the link layer."),
   249  		DisabledPacketsReceived:                         mustCreateMetric("/netstack/arp/disabled_packets_received", "Number of ARP packets received from the link layer when the ARP layer is disabled."),
   250  		MalformedPacketsReceived:                        mustCreateMetric("/netstack/arp/malformed_packets_received", "Number of ARP packets which failed ARP header validation checks."),
   251  		RequestsReceived:                                mustCreateMetric("/netstack/arp/requests_received", "Number of ARP requests received."),
   252  		RequestsReceivedUnknownTargetAddress:            mustCreateMetric("/netstack/arp/requests_received_unknown_addr", "Number of ARP requests received with an unknown target address."),
   253  		OutgoingRequestInterfaceHasNoLocalAddressErrors: mustCreateMetric("/netstack/arp/outgoing_requests_iface_has_no_addr", "Number of failed attempts to send an ARP request with an interface that has no network address."),
   254  		OutgoingRequestBadLocalAddressErrors:            mustCreateMetric("/netstack/arp/outgoing_requests_invalid_local_addr", "Number of failed attempts to send an ARP request with a provided local address that is invalid."),
   255  		OutgoingRequestsDropped:                         mustCreateMetric("/netstack/arp/outgoing_requests_dropped", "Number of ARP requests which failed to write to a link-layer endpoint."),
   256  		OutgoingRequestsSent:                            mustCreateMetric("/netstack/arp/outgoing_requests_sent", "Number of ARP requests sent."),
   257  		RepliesReceived:                                 mustCreateMetric("/netstack/arp/replies_received", "Number of ARP replies received."),
   258  		OutgoingRepliesDropped:                          mustCreateMetric("/netstack/arp/outgoing_replies_dropped", "Number of ARP replies which failed to write to a link-layer endpoint."),
   259  		OutgoingRepliesSent:                             mustCreateMetric("/netstack/arp/outgoing_replies_sent", "Number of ARP replies sent."),
   260  	},
   261  	TCP: tcpip.TCPStats{
   262  		ActiveConnectionOpenings:           mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."),
   263  		PassiveConnectionOpenings:          mustCreateMetric("/netstack/tcp/passive_connection_openings", "Number of connections opened successfully via Listen."),
   264  		CurrentEstablished:                 mustCreateGauge("/netstack/tcp/current_established", "Number of connections in ESTABLISHED state now."),
   265  		CurrentConnected:                   mustCreateGauge("/netstack/tcp/current_open", "Number of connections that are in connected state."),
   266  		EstablishedResets:                  mustCreateMetric("/netstack/tcp/established_resets", "Number of times TCP connections have made a direct transition to the CLOSED state from either the ESTABLISHED state or the CLOSE-WAIT state"),
   267  		EstablishedClosed:                  mustCreateMetric("/netstack/tcp/established_closed", "Number of times established TCP connections made a transition to CLOSED state."),
   268  		EstablishedTimedout:                mustCreateMetric("/netstack/tcp/established_timedout", "Number of times  an established connection was reset because of keep-alive time out."),
   269  		ListenOverflowSynDrop:              mustCreateMetric("/netstack/tcp/listen_overflow_syn_drop", "Number of times the listen queue overflowed and a SYN was dropped."),
   270  		ListenOverflowAckDrop:              mustCreateMetric("/netstack/tcp/listen_overflow_ack_drop", "Number of times the listen queue overflowed and the final ACK in the handshake was dropped."),
   271  		ListenOverflowSynCookieSent:        mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_sent", "Number of times a SYN cookie was sent."),
   272  		ListenOverflowSynCookieRcvd:        mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_rcvd", "Number of times a SYN cookie was received."),
   273  		ListenOverflowInvalidSynCookieRcvd: mustCreateMetric("/netstack/tcp/listen_overflow_invalid_syn_cookie_rcvd", "Number of times an invalid SYN cookie was received."),
   274  		FailedConnectionAttempts:           mustCreateMetric("/netstack/tcp/failed_connection_attempts", "Number of calls to Connect or Listen (active and passive openings, respectively) that end in an error."),
   275  		ValidSegmentsReceived:              mustCreateMetric("/netstack/tcp/valid_segments_received", "Number of TCP segments received that the transport layer successfully parsed."),
   276  		InvalidSegmentsReceived:            mustCreateMetric("/netstack/tcp/invalid_segments_received", "Number of TCP segments received that the transport layer could not parse."),
   277  		SegmentsSent:                       mustCreateMetric("/netstack/tcp/segments_sent", "Number of TCP segments sent."),
   278  		SegmentSendErrors:                  mustCreateMetric("/netstack/tcp/segment_send_errors", "Number of TCP segments failed to be sent."),
   279  		ResetsSent:                         mustCreateMetric("/netstack/tcp/resets_sent", "Number of TCP resets sent."),
   280  		ResetsReceived:                     mustCreateMetric("/netstack/tcp/resets_received", "Number of TCP resets received."),
   281  		Retransmits:                        mustCreateMetric("/netstack/tcp/retransmits", "Number of TCP segments retransmitted."),
   282  		FastRecovery:                       mustCreateMetric("/netstack/tcp/fast_recovery", "Number of times fast recovery was used to recover from packet loss."),
   283  		SACKRecovery:                       mustCreateMetric("/netstack/tcp/sack_recovery", "Number of times SACK recovery was used to recover from packet loss."),
   284  		TLPRecovery:                        mustCreateMetric("/netstack/tcp/tlp_recovery", "Number of times tail loss probe triggers recovery from tail loss."),
   285  		SlowStartRetransmits:               mustCreateMetric("/netstack/tcp/slow_start_retransmits", "Number of segments retransmitted in slow start mode."),
   286  		FastRetransmit:                     mustCreateMetric("/netstack/tcp/fast_retransmit", "Number of TCP segments which were fast retransmitted."),
   287  		Timeouts:                           mustCreateMetric("/netstack/tcp/timeouts", "Number of times RTO expired."),
   288  		ChecksumErrors:                     mustCreateMetric("/netstack/tcp/checksum_errors", "Number of segments dropped due to bad checksums."),
   289  		FailedPortReservations:             mustCreateMetric("/netstack/tcp/failed_port_reservations", "Number of time TCP failed to reserve a port."),
   290  		SegmentsAckedWithDSACK:             mustCreateMetric("/netstack/tcp/segments_acked_with_dsack", "Number of segments for which DSACK was received."),
   291  		SpuriousRecovery:                   mustCreateMetric("/netstack/tcp/spurious_recovery", "Number of times the connection entered loss recovery spuriously."),
   292  		SpuriousRTORecovery:                mustCreateMetric("/netstack/tcp/spurious_rto_recovery", "Number of times the connection entered RTO spuriously."),
   293  		ForwardMaxInFlightDrop:             mustCreateMetric("/netstack/tcp/forward_max_in_flight_drop", "Number of connection requests dropped due to exceeding in-flight limit."),
   294  	},
   295  	UDP: tcpip.UDPStats{
   296  		PacketsReceived:          mustCreateMetric("/netstack/udp/packets_received", "Number of UDP datagrams received via HandlePacket."),
   297  		UnknownPortErrors:        mustCreateMetric("/netstack/udp/unknown_port_errors", "Number of incoming UDP datagrams dropped because they did not have a known destination port."),
   298  		ReceiveBufferErrors:      mustCreateMetric("/netstack/udp/receive_buffer_errors", "Number of incoming UDP datagrams dropped due to the receiving buffer being in an invalid state."),
   299  		MalformedPacketsReceived: mustCreateMetric("/netstack/udp/malformed_packets_received", "Number of incoming UDP datagrams dropped due to the UDP header being in a malformed state."),
   300  		PacketsSent:              mustCreateMetric("/netstack/udp/packets_sent", "Number of UDP datagrams sent."),
   301  		PacketSendErrors:         mustCreateMetric("/netstack/udp/packet_send_errors", "Number of UDP datagrams failed to be sent."),
   302  		ChecksumErrors:           mustCreateMetric("/netstack/udp/checksum_errors", "Number of UDP datagrams dropped due to bad checksums."),
   303  	},
   304  }
   305  
   306  // DefaultTTL is linux's default TTL. All network protocols in all stacks used
   307  // with this package must have this value set as their default TTL.
   308  const DefaultTTL = 64
   309  
   310  const sizeOfInt32 int = 4
   311  
// errStackType is the EINVAL-mapped error used when a value was expected to
// be a netstack.Stack but was not.
var errStackType = syserr.New("expected but did not receive a netstack.Stack", errno.EINVAL)
   313  
// commonEndpoint represents the intersection of a tcpip.Endpoint and a
// transport.Endpoint.
//
// The package-level socket option helpers accept a commonEndpoint so that they
// can be shared between both kinds of endpoint.
type commonEndpoint interface {
	// Readiness implements tcpip.Endpoint.Readiness and
	// transport.Endpoint.Readiness.
	Readiness(mask waiter.EventMask) waiter.EventMask

	// SetSockOpt implements tcpip.Endpoint.SetSockOpt and
	// transport.Endpoint.SetSockOpt.
	SetSockOpt(tcpip.SettableSocketOption) tcpip.Error

	// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt and
	// transport.Endpoint.SetSockOptInt.
	SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error

	// GetSockOpt implements tcpip.Endpoint.GetSockOpt and
	// transport.Endpoint.GetSockOpt.
	GetSockOpt(tcpip.GettableSocketOption) tcpip.Error

	// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt and
	// transport.Endpoint.GetSockOptInt.
	GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error)

	// State returns a socket's lifecycle state. The returned value is
	// protocol-specific and is primarily used for diagnostics.
	State() uint32

	// LastError implements tcpip.Endpoint.LastError and
	// transport.Endpoint.LastError.
	LastError() tcpip.Error

	// SocketOptions implements tcpip.Endpoint.SocketOptions and
	// transport.Endpoint.SocketOptions.
	SocketOptions() *tcpip.SocketOptions
}
   349  
// sock encapsulates all the state needed to represent a network stack
// endpoint in the kernel context.
//
// +stateify savable
type sock struct {
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.DentryMetadataFileDescriptionImpl
	vfs.LockFD
	socket.SendReceiveTimeout
	*waiter.Queue

	// family, skType and protocol are the values the socket was created
	// with (see New). Endpoint is the tcpip.Endpoint backing the socket.
	family   int
	Endpoint tcpip.Endpoint
	skType   linux.SockType
	protocol int

	// namespace is the network namespace of the task that created the
	// socket. New takes a reference on it that Release drops.
	namespace *inet.Namespace

	// mu protects readWriter.
	mu sync.Mutex `state:"nosave"`
	// readWriter is an optimization to avoid allocations.
	// +checklocks:mu
	readWriter usermem.IOSequenceReadWriter `state:"nosave"`

	// readMu protects access to the below fields.
	readMu sync.Mutex `state:"nosave"`

	// sockOptTimestamp corresponds to SO_TIMESTAMP. When true, timestamps
	// of returned messages can be returned via control messages. When
	// false, the same timestamp is instead stored and can be read via the
	// SIOCGSTAMP ioctl. It is protected by readMu. See socket(7).
	sockOptTimestamp bool
	// timestampValid indicates whether timestamp for SIOCGSTAMP has been
	// set. It is protected by readMu.
	timestampValid bool
	// timestamp holds the timestamp to use with SIOCGSTAMP. It is only
	// valid when timestampValid is true. It is protected by readMu.
	timestamp time.Time `state:".(int64)"`

	// TODO(b/153685824): Move this to SocketOptions.
	// sockOptInq corresponds to TCP_INQ. GetSockOpt and SetSockOpt access
	// it with readMu held.
	sockOptInq bool
}
   393  
// Compile-time check that *sock implements socket.Socket.
var _ = socket.Socket(&sock{})
   395  
   396  // New creates a new endpoint socket.
   397  func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*vfs.FileDescription, *syserr.Error) {
   398  	if skType == linux.SOCK_STREAM {
   399  		endpoint.SocketOptions().SetDelayOption(true)
   400  	}
   401  
   402  	mnt := t.Kernel().SocketMount()
   403  	d := sockfs.NewDentry(t, mnt)
   404  	defer d.DecRef(t)
   405  
   406  	namespace := t.NetworkNamespace()
   407  	s := &sock{
   408  		Queue:     queue,
   409  		family:    family,
   410  		Endpoint:  endpoint,
   411  		skType:    skType,
   412  		protocol:  protocol,
   413  		namespace: namespace,
   414  	}
   415  	s.LockFD.Init(&vfs.FileLocks{})
   416  	vfsfd := &s.vfsfd
   417  	if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{
   418  		DenyPRead:         true,
   419  		DenyPWrite:        true,
   420  		UseDentryMetadata: true,
   421  	}); err != nil {
   422  		return nil, syserr.FromError(err)
   423  	}
   424  	namespace.IncRef()
   425  	return vfsfd, nil
   426  }
   427  
   428  // Release implements vfs.FileDescriptionImpl.Release.
   429  func (s *sock) Release(ctx context.Context) {
   430  	kernel.KernelFromContext(ctx).DeleteSocket(&s.vfsfd)
   431  	e, ch := waiter.NewChannelEntry(waiter.EventHUp | waiter.EventErr)
   432  	s.EventRegister(&e)
   433  	defer s.EventUnregister(&e)
   434  
   435  	s.Endpoint.Close()
   436  
   437  	// SO_LINGER option is valid only for TCP. For other socket types
   438  	// return after endpoint close.
   439  	if family, skType, _ := s.Type(); skType == linux.SOCK_STREAM && (family == linux.AF_INET || family == linux.AF_INET6) {
   440  		v := s.Endpoint.SocketOptions().GetLinger()
   441  		// The case for zero timeout is handled in tcp endpoint close function.
   442  		// Close is blocked until either:
   443  		// 1. The endpoint state is not in any of the states: FIN-WAIT1,
   444  		// CLOSING and LAST_ACK.
   445  		// 2. Timeout is reached.
   446  		if v.Enabled && v.Timeout != 0 {
   447  			t := kernel.TaskFromContext(ctx)
   448  			start := t.Kernel().MonotonicClock().Now()
   449  			deadline := start.Add(v.Timeout)
   450  			_ = t.BlockWithDeadline(ch, true, deadline)
   451  		}
   452  	}
   453  	s.namespace.DecRef(ctx)
   454  }
   455  
// Epollable implements vfs.FileDescriptionImpl.Epollable. It always reports
// true for sockets of this type.
func (s *sock) Epollable() bool {
	return true
}
   460  
   461  // Read implements vfs.FileDescriptionImpl.
   462  func (s *sock) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
   463  	// All flags other than RWF_NOWAIT should be ignored.
   464  	// TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT.
   465  	if opts.Flags != 0 {
   466  		return 0, linuxerr.EOPNOTSUPP
   467  	}
   468  
   469  	if dst.NumBytes() == 0 {
   470  		return 0, nil
   471  	}
   472  	n, _, _, _, _, err := s.nonBlockingRead(ctx, dst, false, false, false)
   473  	if err == syserr.ErrWouldBlock {
   474  		return int64(n), linuxerr.ErrWouldBlock
   475  	}
   476  	if err != nil {
   477  		return 0, err.ToError()
   478  	}
   479  	return int64(n), nil
   480  }
   481  
   482  // Write implements vfs.FileDescriptionImpl.
   483  func (s *sock) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
   484  	// All flags other than RWF_NOWAIT should be ignored.
   485  	// TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT.
   486  	if opts.Flags != 0 {
   487  		return 0, linuxerr.EOPNOTSUPP
   488  	}
   489  
   490  	var n int64
   491  	var err tcpip.Error
   492  	switch s.Endpoint.(type) {
   493  	case *tcp.Endpoint:
   494  		s.mu.Lock()
   495  		s.readWriter.Init(ctx, src)
   496  		n, err = s.Endpoint.Write(&s.readWriter, tcpip.WriteOptions{})
   497  		s.mu.Unlock()
   498  	default:
   499  		n, err = s.Endpoint.Write(src.Reader(ctx), tcpip.WriteOptions{})
   500  	}
   501  	if _, ok := err.(*tcpip.ErrWouldBlock); ok {
   502  		return 0, linuxerr.ErrWouldBlock
   503  	}
   504  	if err != nil {
   505  		return 0, syserr.TranslateNetstackError(err).ToError()
   506  	}
   507  
   508  	if n < src.NumBytes() {
   509  		return n, linuxerr.ErrWouldBlock
   510  	}
   511  
   512  	return n, nil
   513  }
   514  
   515  // Accept implements the linux syscall accept(2) for sockets backed by
   516  // tcpip.Endpoint.
   517  func (s *sock) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
   518  	// Issue the accept request to get the new endpoint.
   519  	var peerAddr *tcpip.FullAddress
   520  	if peerRequested {
   521  		peerAddr = &tcpip.FullAddress{}
   522  	}
   523  	ep, wq, terr := s.Endpoint.Accept(peerAddr)
   524  	if terr != nil {
   525  		if _, ok := terr.(*tcpip.ErrWouldBlock); !ok || !blocking {
   526  			return 0, nil, 0, syserr.TranslateNetstackError(terr)
   527  		}
   528  
   529  		var err *syserr.Error
   530  		ep, wq, err = s.blockingAccept(t, peerAddr)
   531  		if err != nil {
   532  			return 0, nil, 0, err
   533  		}
   534  	}
   535  
   536  	ns, err := New(t, s.family, s.skType, s.protocol, wq, ep)
   537  	if err != nil {
   538  		return 0, nil, 0, err
   539  	}
   540  	defer ns.DecRef(t)
   541  
   542  	if err := ns.SetStatusFlags(t, t.Credentials(), uint32(flags&linux.SOCK_NONBLOCK)); err != nil {
   543  		return 0, nil, 0, syserr.FromError(err)
   544  	}
   545  
   546  	var addr linux.SockAddr
   547  	var addrLen uint32
   548  	if peerAddr != nil {
   549  		// Get address of the peer and write it to peer slice.
   550  		addr, addrLen = socket.ConvertAddress(s.family, *peerAddr)
   551  	}
   552  
   553  	fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{
   554  		CloseOnExec: flags&linux.SOCK_CLOEXEC != 0,
   555  	})
   556  
   557  	t.Kernel().RecordSocket(ns)
   558  
   559  	return fd, addr, addrLen, syserr.FromError(e)
   560  }
   561  
   562  // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
   563  // tcpip.Endpoint.
   564  func (s *sock) GetSockOpt(t *kernel.Task, level, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
   565  	// TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is
   566  	// implemented specifically for netstack.Socket rather than
   567  	// commonEndpoint. commonEndpoint should be extended to support socket
   568  	// options where the implementation is not shared, as unix sockets need
   569  	// their own support for SO_TIMESTAMP.
   570  	if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP {
   571  		if outLen < sizeOfInt32 {
   572  			return nil, syserr.ErrInvalidArgument
   573  		}
   574  		val := primitive.Int32(0)
   575  		s.readMu.Lock()
   576  		defer s.readMu.Unlock()
   577  		if s.sockOptTimestamp {
   578  			val = 1
   579  		}
   580  		return &val, nil
   581  	}
   582  	if level == linux.SOL_TCP && name == linux.TCP_INQ {
   583  		if outLen < sizeOfInt32 {
   584  			return nil, syserr.ErrInvalidArgument
   585  		}
   586  		val := primitive.Int32(0)
   587  		s.readMu.Lock()
   588  		defer s.readMu.Unlock()
   589  		if s.sockOptInq {
   590  			val = 1
   591  		}
   592  		return &val, nil
   593  	}
   594  
   595  	return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outPtr, outLen)
   596  }
   597  
   598  // SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by
   599  // tcpip.Endpoint.
   600  func (s *sock) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error {
   601  	// TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is
   602  	// implemented specifically for netstack.Socket rather than
   603  	// commonEndpoint. commonEndpoint should be extended to support socket
   604  	// options where the implementation is not shared, as unix sockets need
   605  	// their own support for SO_TIMESTAMP.
   606  	if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP {
   607  		if len(optVal) < sizeOfInt32 {
   608  			return syserr.ErrInvalidArgument
   609  		}
   610  		s.readMu.Lock()
   611  		defer s.readMu.Unlock()
   612  		s.sockOptTimestamp = hostarch.ByteOrder.Uint32(optVal) != 0
   613  		return nil
   614  	}
   615  	if level == linux.SOL_TCP && name == linux.TCP_INQ {
   616  		if len(optVal) < sizeOfInt32 {
   617  			return syserr.ErrInvalidArgument
   618  		}
   619  		s.readMu.Lock()
   620  		defer s.readMu.Unlock()
   621  		s.sockOptInq = hostarch.ByteOrder.Uint32(optVal) != 0
   622  		return nil
   623  	}
   624  
   625  	return SetSockOpt(t, s, s.Endpoint, level, name, optVal)
   626  }
   627  
   628  var sockAddrInetSize = (*linux.SockAddrInet)(nil).SizeBytes()
   629  var sockAddrInet6Size = (*linux.SockAddrInet6)(nil).SizeBytes()
   630  var sockAddrLinkSize = (*linux.SockAddrLink)(nil).SizeBytes()
   631  
   632  // minSockAddrLen returns the minimum length in bytes of a socket address for
   633  // the socket's family.
   634  func (s *sock) minSockAddrLen() int {
   635  	const addressFamilySize = 2
   636  
   637  	switch s.family {
   638  	case linux.AF_UNIX:
   639  		return addressFamilySize
   640  	case linux.AF_INET:
   641  		return sockAddrInetSize
   642  	case linux.AF_INET6:
   643  		return sockAddrInet6Size
   644  	case linux.AF_PACKET:
   645  		return sockAddrLinkSize
   646  	case linux.AF_UNSPEC:
   647  		return addressFamilySize
   648  	default:
   649  		panic(fmt.Sprintf("s.family unrecognized = %d", s.family))
   650  	}
   651  }
   652  
   653  func (s *sock) isPacketBased() bool {
   654  	return s.skType == linux.SOCK_DGRAM || s.skType == linux.SOCK_SEQPACKET || s.skType == linux.SOCK_RDM || s.skType == linux.SOCK_RAW
   655  }
   656  
// Readiness returns a mask of ready events for socket s, as reported by the
// underlying endpoint for the requested mask.
func (s *sock) Readiness(mask waiter.EventMask) waiter.EventMask {
	return s.Endpoint.Readiness(mask)
}
   661  
   662  // checkFamily returns true iff the specified address family may be used with
   663  // the socket.
   664  //
   665  // If exact is true, then the specified address family must be an exact match
   666  // with the socket's family.
   667  func (s *sock) checkFamily(family uint16, exact bool) bool {
   668  	if family == uint16(s.family) {
   669  		return true
   670  	}
   671  	if !exact && family == linux.AF_INET && s.family == linux.AF_INET6 {
   672  		if !s.Endpoint.SocketOptions().GetV6Only() {
   673  			return true
   674  		}
   675  	}
   676  	return false
   677  }
   678  
   679  // mapFamily maps the AF_INET ANY address to the IPv4-mapped IPv6 ANY if the
   680  // receiver's family is AF_INET6.
   681  //
   682  // This is a hack to work around the fact that both IPv4 and IPv6 ANY are
   683  // represented by the empty string.
   684  //
   685  // TODO(gvisor.dev/issue/1556): remove this function.
   686  func (s *sock) mapFamily(addr tcpip.FullAddress, family uint16) tcpip.FullAddress {
   687  	if addr.Addr.BitLen() == 0 && s.family == linux.AF_INET6 && family == linux.AF_INET {
   688  		addr.Addr = tcpip.AddrFrom16([16]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00})
   689  	}
   690  	return addr
   691  }
   692  
// Connect implements the linux syscall connect(2) for sockets backed by
// tcpip.Endpoint.
//
// sockaddr is the raw socket address supplied by the caller. An AF_UNSPEC
// address disconnects the endpoint. If blocking is false the result of the
// first connect attempt is returned as is; otherwise the call waits for the
// endpoint to become writable and then reports the connection's outcome.
func (s *sock) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
	addr, family, err := socket.AddressAndFamily(sockaddr)
	if err != nil {
		return err
	}

	if family == linux.AF_UNSPEC {
		err := s.Endpoint.Disconnect()
		// Endpoints that cannot disconnect report EAFNOSUPPORT rather than
		// the generic translation of ErrNotSupported.
		if _, ok := err.(*tcpip.ErrNotSupported); ok {
			return syserr.ErrAddressFamilyNotSupported
		}
		return syserr.TranslateNetstackError(err)
	}

	if !s.checkFamily(family, false /* exact */) {
		return syserr.ErrInvalidArgument
	}
	addr = s.mapFamily(addr, family)

	// Always return right away in the non-blocking case.
	if !blocking {
		return syserr.TranslateNetstackError(s.Endpoint.Connect(addr))
	}

	// Register for notification when the endpoint becomes writable, then
	// initiate the connection.
	e, ch := waiter.NewChannelEntry(waiter.WritableEvents)
	s.EventRegister(&e)
	defer s.EventUnregister(&e)

	switch err := s.Endpoint.Connect(addr); err.(type) {
	case *tcpip.ErrConnectStarted, *tcpip.ErrAlreadyConnecting:
		// The connection is in progress; wait for it below.
	case *tcpip.ErrNoPortAvailable:
		if (s.family == unix.AF_INET || s.family == unix.AF_INET6) && s.skType == linux.SOCK_STREAM {
			// TCP unlike UDP returns EADDRNOTAVAIL when it can't
			// find an available local ephemeral port.
			return syserr.ErrAddressNotAvailable
		}
		return syserr.TranslateNetstackError(err)
	default:
		// Either the connect completed immediately (nil) or it failed with
		// an error that needs no special handling.
		return syserr.TranslateNetstackError(err)
	}

	// It's pending, so we have to wait for a notification, and fetch the
	// result once the wait completes.
	if err := t.Block(ch); err != nil {
		return syserr.FromError(err)
	}

	// Call Connect() again after blocking to find connect's result.
	return syserr.TranslateNetstackError(s.Endpoint.Connect(addr))
}
   747  
// Bind implements the linux syscall bind(2) for sockets backed by
// tcpip.Endpoint.
//
// sockaddr is the raw socket address supplied by the caller; it must contain
// at least the two-byte address family. AF_PACKET addresses are decoded here
// directly, all other families go through socket.AddressAndFamily and must
// match the socket's family exactly.
func (s *sock) Bind(_ *kernel.Task, sockaddr []byte) *syserr.Error {
	if len(sockaddr) < 2 {
		return syserr.ErrInvalidArgument
	}

	family := hostarch.ByteOrder.Uint16(sockaddr)
	var addr tcpip.FullAddress

	// Bind for AF_PACKET requires only family, protocol and ifindex.
	// In function AddressAndFamily, we check the address length which is
	// not needed for AF_PACKET bind.
	if family == linux.AF_PACKET {
		var a linux.SockAddrLink
		if len(sockaddr) < sockAddrLinkSize {
			return syserr.ErrInvalidArgument
		}
		a.UnmarshalBytes(sockaddr)

		// The hardware address is extended with ten zero bytes before being
		// converted by AddrFrom16Slice (presumably to reach 16 bytes,
		// assuming header.EthernetAddressSize is 6 - confirm). The link
		// protocol is carried in the Port field in host byte order.
		addr = tcpip.FullAddress{
			NIC: tcpip.NICID(a.InterfaceIndex),
			Addr: tcpip.AddrFrom16Slice(append(
				a.HardwareAddr[:header.EthernetAddressSize],
				[]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}...,
			)),
			Port: socket.Ntohs(a.Protocol),
		}
	} else {
		if s.minSockAddrLen() > len(sockaddr) {
			return syserr.ErrInvalidArgument
		}

		var err *syserr.Error
		addr, family, err = socket.AddressAndFamily(sockaddr)
		if err != nil {
			return err
		}

		if !s.checkFamily(family, true /* exact */) {
			return syserr.ErrAddressFamilyNotSupported
		}

		addr = s.mapFamily(addr, family)
	}

	// Issue the bind request to the endpoint.
	err := s.Endpoint.Bind(addr)
	if _, ok := err.(*tcpip.ErrNoPortAvailable); ok {
		// Bind always returns EADDRINUSE irrespective of if the specified port was
		// already bound or if an ephemeral port was requested but none were
		// available.
		//
		// *tcpip.ErrNoPortAvailable is mapped to EAGAIN in syserr package because
		// UDP connect returns EAGAIN on ephemeral port exhaustion.
		//
		// TCP connect returns EADDRNOTAVAIL on ephemeral port exhaustion.
		err = &tcpip.ErrPortInUse{}
	}

	return syserr.TranslateNetstackError(err)
}
   810  
   811  // Listen implements the linux syscall listen(2) for sockets backed by
   812  // tcpip.Endpoint.
   813  func (s *sock) Listen(_ *kernel.Task, backlog int) *syserr.Error {
   814  	if err := s.Endpoint.Listen(backlog); err != nil {
   815  		return syserr.TranslateNetstackError(err)
   816  	}
   817  	if !socket.IsTCP(s) {
   818  		return nil
   819  	}
   820  
   821  	// Emit SentryTCPListenEvent with the bound port for tcp sockets.
   822  	addr, err := s.Endpoint.GetLocalAddress()
   823  	if err != nil {
   824  		panic(fmt.Sprintf("GetLocalAddress failed for tcp socket: %s", err))
   825  	}
   826  	eventchannel.Emit(&epb.SentryTcpListenEvent{
   827  		Port: proto.Int32(int32(addr.Port)),
   828  	})
   829  	return nil
   830  }
   831  
// blockingAccept implements a blocking version of accept(2), that is, if no
// connections are ready to be accepted, it will block until one becomes ready.
//
// peerAddr, if non-nil, is passed through to the endpoint's Accept. The
// function returns as soon as Accept yields anything other than
// ErrWouldBlock, or when blocking on the task fails.
func (s *sock) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) {
	// Register for notifications. This happens before the first retry so
	// that a connection arriving between Accept and Block is not missed.
	e, ch := waiter.NewChannelEntry(waiter.ReadableEvents)
	s.EventRegister(&e)
	defer s.EventUnregister(&e)

	// Try to accept the connection again; if it fails, then wait until we
	// get a notification.
	for {
		ep, wq, err := s.Endpoint.Accept(peerAddr)
		if _, ok := err.(*tcpip.ErrWouldBlock); !ok {
			return ep, wq, syserr.TranslateNetstackError(err)
		}

		if err := t.Block(ch); err != nil {
			return nil, nil, syserr.FromError(err)
		}
	}
}
   853  
   854  // ConvertShutdown converts Linux shutdown flags into tcpip shutdown flags.
   855  func ConvertShutdown(how int) (tcpip.ShutdownFlags, *syserr.Error) {
   856  	var f tcpip.ShutdownFlags
   857  	switch how {
   858  	case linux.SHUT_RD:
   859  		f = tcpip.ShutdownRead
   860  	case linux.SHUT_WR:
   861  		f = tcpip.ShutdownWrite
   862  	case linux.SHUT_RDWR:
   863  		f = tcpip.ShutdownRead | tcpip.ShutdownWrite
   864  	default:
   865  		return 0, syserr.ErrInvalidArgument
   866  	}
   867  	return f, nil
   868  }
   869  
   870  // Shutdown implements the linux syscall shutdown(2) for sockets backed by
   871  // tcpip.Endpoint.
   872  func (s *sock) Shutdown(_ *kernel.Task, how int) *syserr.Error {
   873  	f, err := ConvertShutdown(how)
   874  	if err != nil {
   875  		return err
   876  	}
   877  
   878  	// Issue shutdown request.
   879  	return syserr.TranslateNetstackError(s.Endpoint.Shutdown(f))
   880  }
   881  
   882  // GetSockOpt can be used to implement the linux syscall getsockopt(2) for
   883  // sockets backed by a commonEndpoint.
   884  func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType linux.SockType, level, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
   885  	switch level {
   886  	case linux.SOL_SOCKET:
   887  		return getSockOptSocket(t, s, ep, family, skType, name, outLen)
   888  
   889  	case linux.SOL_TCP:
   890  		return getSockOptTCP(t, s, ep, name, outLen)
   891  
   892  	case linux.SOL_IPV6:
   893  		return getSockOptIPv6(t, s, ep, name, outPtr, outLen)
   894  
   895  	case linux.SOL_IP:
   896  		return getSockOptIP(t, s, ep, name, outPtr, outLen, family)
   897  
   898  	case linux.SOL_ICMPV6:
   899  		return getSockOptICMPv6(t, s, ep, name, outLen)
   900  
   901  	case linux.SOL_UDP,
   902  		linux.SOL_RAW,
   903  		linux.SOL_PACKET:
   904  		// Not supported.
   905  	}
   906  
   907  	return nil, syserr.ErrProtocolNotAvailable
   908  }
   909  
   910  func boolToInt32(v bool) int32 {
   911  	if v {
   912  		return 1
   913  	}
   914  	return 0
   915  }
   916  
// getSockOptSocket implements GetSockOpt when level is SOL_SOCKET.
//
// name selects the option and outLen is the size of the caller's output
// buffer. Options with a fixed-size value fail with ErrInvalidArgument when
// outLen is too small for that value. Option names not handled here yield
// ErrProtocolNotAvailable.
func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, _ linux.SockType, name, outLen int) (marshal.Marshallable, *syserr.Error) {
	// TODO(b/124056281): Stop rejecting short optLen values in getsockopt.
	switch name {
	case linux.SO_ERROR:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		// Get the last error and convert it.
		err := ep.SocketOptions().GetLastError()
		if err == nil {
			optP := primitive.Int32(0)
			return &optP, nil
		}

		optP := primitive.Int32(syserr.TranslateNetstackError(err).ToLinux())
		return &optP, nil

	case linux.SO_PEERCRED:
		// Only AF_UNIX sockets support this option.
		if family != linux.AF_UNIX || outLen < unix.SizeofUcred {
			return nil, syserr.ErrInvalidArgument
		}

		// The reported credentials are derived from the calling task t.
		tcred := t.Credentials()
		creds := linux.ControlMessageCredentials{
			PID: int32(t.ThreadGroup().ID()),
			UID: uint32(tcred.EffectiveKUID.In(tcred.UserNamespace).OrOverflow()),
			GID: uint32(tcred.EffectiveKGID.In(tcred.UserNamespace).OrOverflow()),
		}
		return &creds, nil

	case linux.SO_PASSCRED:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetPassCred()))
		return &v, nil

	case linux.SO_SNDBUF:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		size := ep.SocketOptions().GetSendBufferSize()

		// Clamp so that the value fits in the int32 returned to the caller.
		if size > math.MaxInt32 {
			size = math.MaxInt32
		}

		sizeP := primitive.Int32(size)
		return &sizeP, nil

	case linux.SO_RCVBUF:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		size := ep.SocketOptions().GetReceiveBufferSize()

		// Clamp so that the value fits in the int32 returned to the caller.
		if size > math.MaxInt32 {
			size = math.MaxInt32
		}

		sizeP := primitive.Int32(size)
		return &sizeP, nil

	case linux.SO_REUSEADDR:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReuseAddress()))
		return &v, nil

	case linux.SO_REUSEPORT:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReusePort()))
		return &v, nil

	case linux.SO_BINDTODEVICE:
		v := ep.SocketOptions().GetBindToDevice()
		// A value of zero yields an empty name, regardless of outLen.
		if v == 0 {
			var b primitive.ByteSlice
			return &b, nil
		}
		if outLen < linux.IFNAMSIZ {
			return nil, syserr.ErrInvalidArgument
		}
		// Note: s and name below shadow the function's parameters for the
		// remainder of this case.
		s := t.NetworkContext()
		if s == nil {
			return nil, syserr.ErrNoDevice
		}
		nic, ok := s.Interfaces()[int32(v)]
		if !ok {
			// The NICID no longer indicates a valid interface, probably because that
			// interface was removed.
			return nil, syserr.ErrUnknownDevice
		}

		// The interface name is returned NUL-terminated.
		name := primitive.ByteSlice(append([]byte(nic.Name), 0))
		return &name, nil

	case linux.SO_BROADCAST:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetBroadcast()))
		return &v, nil

	case linux.SO_KEEPALIVE:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetKeepAlive()))
		return &v, nil

	case linux.SO_LINGER:
		if outLen < linux.SizeOfLinger {
			return nil, syserr.ErrInvalidArgument
		}

		var linger linux.Linger
		v := ep.SocketOptions().GetLinger()

		if v.Enabled {
			linger.OnOff = 1
		}
		// The timeout is reported in whole seconds.
		linger.Linger = int32(v.Timeout.Seconds())
		return &linger, nil

	case linux.SO_SNDTIMEO:
		// TODO(igudger): Linux allows shorter lengths for partial results.
		if outLen < linux.SizeOfTimeval {
			return nil, syserr.ErrInvalidArgument
		}

		sendTimeout := linux.NsecToTimeval(s.SendTimeout())
		return &sendTimeout, nil

	case linux.SO_RCVTIMEO:
		// TODO(igudger): Linux allows shorter lengths for partial results.
		if outLen < linux.SizeOfTimeval {
			return nil, syserr.ErrInvalidArgument
		}

		recvTimeout := linux.NsecToTimeval(s.RecvTimeout())
		return &recvTimeout, nil

	case linux.SO_OOBINLINE:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetOutOfBandInline()))
		return &v, nil

	case linux.SO_NO_CHECK:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetNoChecksum()))
		return &v, nil

	case linux.SO_ACCEPTCONN:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetAcceptConn()))
		return &v, nil

	case linux.SO_RCVLOWAT:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		v := primitive.Int32(ep.SocketOptions().GetRcvlowat())
		return &v, nil
	}
	// Any other SOL_SOCKET option is not handled here.
	return nil, syserr.ErrProtocolNotAvailable
}
  1106  
  1107  // getSockOptTCP implements GetSockOpt when level is SOL_TCP.
  1108  func getSockOptTCP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name, outLen int) (marshal.Marshallable, *syserr.Error) {
  1109  	if !socket.IsTCP(s) {
  1110  		return nil, syserr.ErrUnknownProtocolOption
  1111  	}
  1112  
  1113  	switch name {
  1114  	case linux.TCP_NODELAY:
  1115  		if outLen < sizeOfInt32 {
  1116  			return nil, syserr.ErrInvalidArgument
  1117  		}
  1118  
  1119  		v := primitive.Int32(boolToInt32(!ep.SocketOptions().GetDelayOption()))
  1120  		return &v, nil
  1121  
  1122  	case linux.TCP_CORK:
  1123  		if outLen < sizeOfInt32 {
  1124  			return nil, syserr.ErrInvalidArgument
  1125  		}
  1126  
  1127  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetCorkOption()))
  1128  		return &v, nil
  1129  
  1130  	case linux.TCP_QUICKACK:
  1131  		if outLen < sizeOfInt32 {
  1132  			return nil, syserr.ErrInvalidArgument
  1133  		}
  1134  
  1135  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetQuickAck()))
  1136  		return &v, nil
  1137  
  1138  	case linux.TCP_MAXSEG:
  1139  		if outLen < sizeOfInt32 {
  1140  			return nil, syserr.ErrInvalidArgument
  1141  		}
  1142  
  1143  		v, err := ep.GetSockOptInt(tcpip.MaxSegOption)
  1144  		if err != nil {
  1145  			return nil, syserr.TranslateNetstackError(err)
  1146  		}
  1147  		vP := primitive.Int32(v)
  1148  		return &vP, nil
  1149  
  1150  	case linux.TCP_KEEPIDLE:
  1151  		if outLen < sizeOfInt32 {
  1152  			return nil, syserr.ErrInvalidArgument
  1153  		}
  1154  
  1155  		var v tcpip.KeepaliveIdleOption
  1156  		if err := ep.GetSockOpt(&v); err != nil {
  1157  			return nil, syserr.TranslateNetstackError(err)
  1158  		}
  1159  		keepAliveIdle := primitive.Int32(time.Duration(v) / time.Second)
  1160  		return &keepAliveIdle, nil
  1161  
  1162  	case linux.TCP_KEEPINTVL:
  1163  		if outLen < sizeOfInt32 {
  1164  			return nil, syserr.ErrInvalidArgument
  1165  		}
  1166  
  1167  		var v tcpip.KeepaliveIntervalOption
  1168  		if err := ep.GetSockOpt(&v); err != nil {
  1169  			return nil, syserr.TranslateNetstackError(err)
  1170  		}
  1171  		keepAliveInterval := primitive.Int32(time.Duration(v) / time.Second)
  1172  		return &keepAliveInterval, nil
  1173  
  1174  	case linux.TCP_KEEPCNT:
  1175  		if outLen < sizeOfInt32 {
  1176  			return nil, syserr.ErrInvalidArgument
  1177  		}
  1178  
  1179  		v, err := ep.GetSockOptInt(tcpip.KeepaliveCountOption)
  1180  		if err != nil {
  1181  			return nil, syserr.TranslateNetstackError(err)
  1182  		}
  1183  		vP := primitive.Int32(v)
  1184  		return &vP, nil
  1185  
  1186  	case linux.TCP_USER_TIMEOUT:
  1187  		if outLen < sizeOfInt32 {
  1188  			return nil, syserr.ErrInvalidArgument
  1189  		}
  1190  
  1191  		var v tcpip.TCPUserTimeoutOption
  1192  		if err := ep.GetSockOpt(&v); err != nil {
  1193  			return nil, syserr.TranslateNetstackError(err)
  1194  		}
  1195  		tcpUserTimeout := primitive.Int32(time.Duration(v) / time.Millisecond)
  1196  		return &tcpUserTimeout, nil
  1197  
  1198  	case linux.TCP_INFO:
  1199  		var v tcpip.TCPInfoOption
  1200  		if err := ep.GetSockOpt(&v); err != nil {
  1201  			return nil, syserr.TranslateNetstackError(err)
  1202  		}
  1203  
  1204  		info := linux.TCPInfo{
  1205  			State:       uint8(v.State),
  1206  			RTO:         uint32(v.RTO / time.Microsecond),
  1207  			RTT:         uint32(v.RTT / time.Microsecond),
  1208  			RTTVar:      uint32(v.RTTVar / time.Microsecond),
  1209  			SndSsthresh: v.SndSsthresh,
  1210  			SndCwnd:     v.SndCwnd,
  1211  		}
  1212  		switch v.CcState {
  1213  		case tcpip.RTORecovery:
  1214  			info.CaState = linux.TCP_CA_Loss
  1215  		case tcpip.FastRecovery, tcpip.SACKRecovery:
  1216  			info.CaState = linux.TCP_CA_Recovery
  1217  		case tcpip.Disorder:
  1218  			info.CaState = linux.TCP_CA_Disorder
  1219  		case tcpip.Open:
  1220  			info.CaState = linux.TCP_CA_Open
  1221  		}
  1222  
  1223  		// In netstack reorderSeen is updated only when RACK is enabled.
  1224  		// We only track whether the reordering is seen, which is
  1225  		// different than Linux where reorderSeen is not specific to
  1226  		// RACK and is incremented when a reordering event is seen.
  1227  		if v.ReorderSeen {
  1228  			info.ReordSeen = 1
  1229  		}
  1230  
  1231  		// Linux truncates the output binary to outLen.
  1232  		buf := t.CopyScratchBuffer(info.SizeBytes())
  1233  		info.MarshalUnsafe(buf)
  1234  		if len(buf) > outLen {
  1235  			buf = buf[:outLen]
  1236  		}
  1237  		bufP := primitive.ByteSlice(buf)
  1238  		return &bufP, nil
  1239  
  1240  	case linux.TCP_CC_INFO,
  1241  		linux.TCP_NOTSENT_LOWAT,
  1242  		linux.TCP_ZEROCOPY_RECEIVE:
  1243  
  1244  		// Not supported.
  1245  
  1246  	case linux.TCP_CONGESTION:
  1247  		if outLen <= 0 {
  1248  			return nil, syserr.ErrInvalidArgument
  1249  		}
  1250  
  1251  		var v tcpip.CongestionControlOption
  1252  		if err := ep.GetSockOpt(&v); err != nil {
  1253  			return nil, syserr.TranslateNetstackError(err)
  1254  		}
  1255  
  1256  		// We match linux behaviour here where it returns the lower of
  1257  		// TCP_CA_NAME_MAX bytes or the value of the option length.
  1258  		//
  1259  		// This is Linux's net/tcp.h TCP_CA_NAME_MAX.
  1260  		const tcpCANameMax = 16
  1261  
  1262  		toCopy := tcpCANameMax
  1263  		if outLen < tcpCANameMax {
  1264  			toCopy = outLen
  1265  		}
  1266  		b := make([]byte, toCopy)
  1267  		copy(b, v)
  1268  
  1269  		bP := primitive.ByteSlice(b)
  1270  		return &bP, nil
  1271  
  1272  	case linux.TCP_LINGER2:
  1273  		if outLen < sizeOfInt32 {
  1274  			return nil, syserr.ErrInvalidArgument
  1275  		}
  1276  
  1277  		var v tcpip.TCPLingerTimeoutOption
  1278  		if err := ep.GetSockOpt(&v); err != nil {
  1279  			return nil, syserr.TranslateNetstackError(err)
  1280  		}
  1281  		var lingerTimeout primitive.Int32
  1282  		if v >= 0 {
  1283  			lingerTimeout = primitive.Int32(time.Duration(v) / time.Second)
  1284  		} else {
  1285  			lingerTimeout = -1
  1286  		}
  1287  		return &lingerTimeout, nil
  1288  
  1289  	case linux.TCP_DEFER_ACCEPT:
  1290  		if outLen < sizeOfInt32 {
  1291  			return nil, syserr.ErrInvalidArgument
  1292  		}
  1293  
  1294  		var v tcpip.TCPDeferAcceptOption
  1295  		if err := ep.GetSockOpt(&v); err != nil {
  1296  			return nil, syserr.TranslateNetstackError(err)
  1297  		}
  1298  
  1299  		tcpDeferAccept := primitive.Int32(time.Duration(v) / time.Second)
  1300  		return &tcpDeferAccept, nil
  1301  
  1302  	case linux.TCP_SYNCNT:
  1303  		if outLen < sizeOfInt32 {
  1304  			return nil, syserr.ErrInvalidArgument
  1305  		}
  1306  
  1307  		v, err := ep.GetSockOptInt(tcpip.TCPSynCountOption)
  1308  		if err != nil {
  1309  			return nil, syserr.TranslateNetstackError(err)
  1310  		}
  1311  		vP := primitive.Int32(v)
  1312  		return &vP, nil
  1313  
  1314  	case linux.TCP_WINDOW_CLAMP:
  1315  		if outLen < sizeOfInt32 {
  1316  			return nil, syserr.ErrInvalidArgument
  1317  		}
  1318  
  1319  		v, err := ep.GetSockOptInt(tcpip.TCPWindowClampOption)
  1320  		if err != nil {
  1321  			return nil, syserr.TranslateNetstackError(err)
  1322  		}
  1323  		vP := primitive.Int32(v)
  1324  		return &vP, nil
  1325  	}
  1326  	return nil, syserr.ErrProtocolNotAvailable
  1327  }
  1328  
  1329  func getSockOptICMPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, outLen int) (marshal.Marshallable, *syserr.Error) {
  1330  	if _, ok := ep.(tcpip.Endpoint); !ok {
  1331  		log.Warningf("SOL_ICMPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
  1332  		return nil, syserr.ErrUnknownProtocolOption
  1333  	}
  1334  
  1335  	if family, _, _ := s.Type(); family != linux.AF_INET6 {
  1336  		return nil, syserr.ErrNotSupported
  1337  	}
  1338  
  1339  	switch name {
  1340  	case linux.ICMPV6_FILTER:
  1341  		var v tcpip.ICMPv6Filter
  1342  		if err := ep.GetSockOpt(&v); err != nil {
  1343  			return nil, syserr.TranslateNetstackError(err)
  1344  		}
  1345  
  1346  		filter := linux.ICMP6Filter{Filter: v.DenyType}
  1347  
  1348  		// Linux truncates the output to outLen.
  1349  		buf := t.CopyScratchBuffer(filter.SizeBytes())
  1350  		filter.MarshalUnsafe(buf)
  1351  		if len(buf) > outLen {
  1352  			buf = buf[:outLen]
  1353  		}
  1354  		bufP := primitive.ByteSlice(buf)
  1355  		return &bufP, nil
  1356  	}
  1357  	return nil, syserr.ErrProtocolNotAvailable
  1358  }
  1359  
  1360  func defaultTTL(t *kernel.Task, network tcpip.NetworkProtocolNumber) (primitive.Int32, tcpip.Error) {
  1361  	var opt tcpip.DefaultTTLOption
  1362  	stack := inet.StackFromContext(t)
  1363  	if err := stack.(*Stack).Stack.NetworkProtocolOption(network, &opt); err != nil {
  1364  		return 0, err
  1365  	}
  1366  	return primitive.Int32(opt), nil
  1367  }
  1368  
  1369  // getSockOptIPv6 implements GetSockOpt when level is SOL_IPV6.
  1370  func getSockOptIPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
  1371  	if _, ok := ep.(tcpip.Endpoint); !ok {
  1372  		log.Warningf("SOL_IPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
  1373  		return nil, syserr.ErrUnknownProtocolOption
  1374  	}
  1375  
  1376  	family, skType, _ := s.Type()
  1377  	if family != linux.AF_INET6 {
  1378  		return nil, syserr.ErrNotSupported
  1379  	}
  1380  
  1381  	switch name {
  1382  	case linux.IPV6_CHECKSUM:
  1383  		if outLen < sizeOfInt32 {
  1384  			return nil, syserr.ErrInvalidArgument
  1385  		}
  1386  
  1387  		v, err := ep.GetSockOptInt(tcpip.IPv6Checksum)
  1388  		if err != nil {
  1389  			return nil, syserr.TranslateNetstackError(err)
  1390  		}
  1391  
  1392  		vP := primitive.Int32(v)
  1393  		return &vP, nil
  1394  
  1395  	case linux.IPV6_V6ONLY:
  1396  		if outLen < sizeOfInt32 {
  1397  			return nil, syserr.ErrInvalidArgument
  1398  		}
  1399  
  1400  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetV6Only()))
  1401  		return &v, nil
  1402  
  1403  	case linux.IPV6_UNICAST_HOPS:
  1404  		if outLen < sizeOfInt32 {
  1405  			return nil, syserr.ErrInvalidArgument
  1406  		}
  1407  
  1408  		v, err := ep.GetSockOptInt(tcpip.IPv6HopLimitOption)
  1409  		if err != nil {
  1410  			return nil, syserr.TranslateNetstackError(err)
  1411  		}
  1412  
  1413  		// Fill in the default value, if needed.
  1414  		vP := primitive.Int32(v)
  1415  		if vP == -1 {
  1416  			vP, err = defaultTTL(t, header.IPv6ProtocolNumber)
  1417  			if err != nil {
  1418  				return nil, syserr.TranslateNetstackError(err)
  1419  			}
  1420  		}
  1421  
  1422  		return &vP, nil
  1423  
  1424  	case linux.IPV6_RECVHOPLIMIT:
  1425  		if outLen < sizeOfInt32 {
  1426  			return nil, syserr.ErrInvalidArgument
  1427  		}
  1428  
  1429  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveHopLimit()))
  1430  		return &v, nil
  1431  
  1432  	case linux.IPV6_PATHMTU:
  1433  		// Not supported.
  1434  
  1435  	case linux.IPV6_TCLASS:
  1436  		// Length handling for parity with Linux.
  1437  		if outLen == 0 {
  1438  			var b primitive.ByteSlice
  1439  			return &b, nil
  1440  		}
  1441  		v, err := ep.GetSockOptInt(tcpip.IPv6TrafficClassOption)
  1442  		if err != nil {
  1443  			return nil, syserr.TranslateNetstackError(err)
  1444  		}
  1445  
  1446  		uintv := primitive.Uint32(v)
  1447  		// Linux truncates the output binary to outLen.
  1448  		ib := t.CopyScratchBuffer(uintv.SizeBytes())
  1449  		uintv.MarshalUnsafe(ib)
  1450  		// Handle cases where outLen is lesser than sizeOfInt32.
  1451  		if len(ib) > outLen {
  1452  			ib = ib[:outLen]
  1453  		}
  1454  		ibP := primitive.ByteSlice(ib)
  1455  		return &ibP, nil
  1456  
  1457  	case linux.IPV6_RECVTCLASS:
  1458  		if outLen < sizeOfInt32 {
  1459  			return nil, syserr.ErrInvalidArgument
  1460  		}
  1461  
  1462  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTClass()))
  1463  		return &v, nil
  1464  	case linux.IPV6_RECVERR:
  1465  		if outLen < sizeOfInt32 {
  1466  			return nil, syserr.ErrInvalidArgument
  1467  		}
  1468  
  1469  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetIPv6RecvError()))
  1470  		return &v, nil
  1471  
  1472  	case linux.IPV6_RECVORIGDSTADDR:
  1473  		if outLen < sizeOfInt32 {
  1474  			return nil, syserr.ErrInvalidArgument
  1475  		}
  1476  
  1477  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveOriginalDstAddress()))
  1478  		return &v, nil
  1479  
  1480  	case linux.IPV6_RECVPKTINFO:
  1481  		if outLen < sizeOfInt32 {
  1482  			return nil, syserr.ErrInvalidArgument
  1483  		}
  1484  
  1485  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetIPv6ReceivePacketInfo()))
  1486  		return &v, nil
  1487  
  1488  	case linux.IP6T_ORIGINAL_DST:
  1489  		if outLen < sockAddrInet6Size {
  1490  			return nil, syserr.ErrInvalidArgument
  1491  		}
  1492  
  1493  		var v tcpip.OriginalDestinationOption
  1494  		if err := ep.GetSockOpt(&v); err != nil {
  1495  			return nil, syserr.TranslateNetstackError(err)
  1496  		}
  1497  
  1498  		a, _ := socket.ConvertAddress(linux.AF_INET6, tcpip.FullAddress(v))
  1499  		return a.(*linux.SockAddrInet6), nil
  1500  
  1501  	case linux.IP6T_SO_GET_INFO:
  1502  		if outLen < linux.SizeOfIPTGetinfo {
  1503  			return nil, syserr.ErrInvalidArgument
  1504  		}
  1505  
  1506  		// Only valid for raw IPv6 sockets.
  1507  		if skType != linux.SOCK_RAW {
  1508  			return nil, syserr.ErrProtocolNotAvailable
  1509  		}
  1510  
  1511  		stk := inet.StackFromContext(t)
  1512  		if stk == nil {
  1513  			return nil, syserr.ErrNoDevice
  1514  		}
  1515  		info, err := netfilter.GetInfo(t, stk.(*Stack).Stack, outPtr, true)
  1516  		if err != nil {
  1517  			return nil, err
  1518  		}
  1519  		return &info, nil
  1520  
  1521  	case linux.IP6T_SO_GET_ENTRIES:
  1522  		// IPTGetEntries is reused for IPv6.
  1523  		if outLen < linux.SizeOfIPTGetEntries {
  1524  			return nil, syserr.ErrInvalidArgument
  1525  		}
  1526  		// Only valid for raw IPv6 sockets.
  1527  		if skType != linux.SOCK_RAW {
  1528  			return nil, syserr.ErrProtocolNotAvailable
  1529  		}
  1530  
  1531  		stk := inet.StackFromContext(t)
  1532  		if stk == nil {
  1533  			return nil, syserr.ErrNoDevice
  1534  		}
  1535  		entries, err := netfilter.GetEntries6(t, stk.(*Stack).Stack, outPtr, outLen)
  1536  		if err != nil {
  1537  			return nil, err
  1538  		}
  1539  		return &entries, nil
  1540  
  1541  	case linux.IP6T_SO_GET_REVISION_TARGET:
  1542  		if outLen < linux.SizeOfXTGetRevision {
  1543  			return nil, syserr.ErrInvalidArgument
  1544  		}
  1545  
  1546  		// Only valid for raw IPv6 sockets.
  1547  		if skType != linux.SOCK_RAW {
  1548  			return nil, syserr.ErrProtocolNotAvailable
  1549  		}
  1550  
  1551  		stk := inet.StackFromContext(t)
  1552  		if stk == nil {
  1553  			return nil, syserr.ErrNoDevice
  1554  		}
  1555  		ret, err := netfilter.TargetRevision(t, outPtr, header.IPv6ProtocolNumber)
  1556  		if err != nil {
  1557  			return nil, err
  1558  		}
  1559  		return &ret, nil
  1560  	}
  1561  	return nil, syserr.ErrProtocolNotAvailable
  1562  }
  1563  
  1564  // getSockOptIP implements GetSockOpt when level is SOL_IP.
  1565  func getSockOptIP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, outPtr hostarch.Addr, outLen int, _ int) (marshal.Marshallable, *syserr.Error) {
  1566  	if _, ok := ep.(tcpip.Endpoint); !ok {
  1567  		log.Warningf("SOL_IP options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
  1568  		return nil, syserr.ErrUnknownProtocolOption
  1569  	}
  1570  
  1571  	switch name {
  1572  	case linux.IP_TTL:
  1573  		if outLen < sizeOfInt32 {
  1574  			return nil, syserr.ErrInvalidArgument
  1575  		}
  1576  
  1577  		v, err := ep.GetSockOptInt(tcpip.IPv4TTLOption)
  1578  		if err != nil {
  1579  			return nil, syserr.TranslateNetstackError(err)
  1580  		}
  1581  
  1582  		// Fill in the default value, if needed.
  1583  		vP := primitive.Int32(v)
  1584  		if vP == 0 {
  1585  			vP, err = defaultTTL(t, header.IPv4ProtocolNumber)
  1586  			if err != nil {
  1587  				return nil, syserr.TranslateNetstackError(err)
  1588  			}
  1589  		}
  1590  
  1591  		return &vP, nil
  1592  
  1593  	case linux.IP_RECVTTL:
  1594  		if outLen < sizeOfInt32 {
  1595  			return nil, syserr.ErrInvalidArgument
  1596  		}
  1597  
  1598  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTTL()))
  1599  		return &v, nil
  1600  
  1601  	case linux.IP_MULTICAST_TTL:
  1602  		if outLen < sizeOfInt32 {
  1603  			return nil, syserr.ErrInvalidArgument
  1604  		}
  1605  
  1606  		v, err := ep.GetSockOptInt(tcpip.MulticastTTLOption)
  1607  		if err != nil {
  1608  			return nil, syserr.TranslateNetstackError(err)
  1609  		}
  1610  
  1611  		vP := primitive.Int32(v)
  1612  		return &vP, nil
  1613  
  1614  	case linux.IP_MULTICAST_IF:
  1615  		if outLen < len(linux.InetAddr{}) {
  1616  			return nil, syserr.ErrInvalidArgument
  1617  		}
  1618  
  1619  		var v tcpip.MulticastInterfaceOption
  1620  		if err := ep.GetSockOpt(&v); err != nil {
  1621  			return nil, syserr.TranslateNetstackError(err)
  1622  		}
  1623  
  1624  		a, _ := socket.ConvertAddress(linux.AF_INET, tcpip.FullAddress{Addr: v.InterfaceAddr})
  1625  
  1626  		return &a.(*linux.SockAddrInet).Addr, nil
  1627  
  1628  	case linux.IP_MULTICAST_LOOP:
  1629  		if outLen < sizeOfInt32 {
  1630  			return nil, syserr.ErrInvalidArgument
  1631  		}
  1632  
  1633  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetMulticastLoop()))
  1634  		return &v, nil
  1635  
  1636  	case linux.IP_TOS:
  1637  		// Length handling for parity with Linux.
  1638  		if outLen == 0 {
  1639  			var b primitive.ByteSlice
  1640  			return &b, nil
  1641  		}
  1642  		v, err := ep.GetSockOptInt(tcpip.IPv4TOSOption)
  1643  		if err != nil {
  1644  			return nil, syserr.TranslateNetstackError(err)
  1645  		}
  1646  		if outLen < sizeOfInt32 {
  1647  			vP := primitive.Uint8(v)
  1648  			return &vP, nil
  1649  		}
  1650  		vP := primitive.Int32(v)
  1651  		return &vP, nil
  1652  
  1653  	case linux.IP_RECVTOS:
  1654  		if outLen < sizeOfInt32 {
  1655  			return nil, syserr.ErrInvalidArgument
  1656  		}
  1657  
  1658  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTOS()))
  1659  		return &v, nil
  1660  
  1661  	case linux.IP_RECVERR:
  1662  		if outLen < sizeOfInt32 {
  1663  			return nil, syserr.ErrInvalidArgument
  1664  		}
  1665  
  1666  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetIPv4RecvError()))
  1667  		return &v, nil
  1668  
  1669  	case linux.IP_PKTINFO:
  1670  		if outLen < sizeOfInt32 {
  1671  			return nil, syserr.ErrInvalidArgument
  1672  		}
  1673  
  1674  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceivePacketInfo()))
  1675  		return &v, nil
  1676  
  1677  	case linux.IP_HDRINCL:
  1678  		if outLen < sizeOfInt32 {
  1679  			return nil, syserr.ErrInvalidArgument
  1680  		}
  1681  
  1682  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetHeaderIncluded()))
  1683  		return &v, nil
  1684  
  1685  	case linux.IP_RECVORIGDSTADDR:
  1686  		if outLen < sizeOfInt32 {
  1687  			return nil, syserr.ErrInvalidArgument
  1688  		}
  1689  
  1690  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveOriginalDstAddress()))
  1691  		return &v, nil
  1692  
  1693  	case linux.SO_ORIGINAL_DST:
  1694  		if outLen < sockAddrInetSize {
  1695  			return nil, syserr.ErrInvalidArgument
  1696  		}
  1697  
  1698  		var v tcpip.OriginalDestinationOption
  1699  		if err := ep.GetSockOpt(&v); err != nil {
  1700  			return nil, syserr.TranslateNetstackError(err)
  1701  		}
  1702  
  1703  		a, _ := socket.ConvertAddress(linux.AF_INET, tcpip.FullAddress(v))
  1704  		return a.(*linux.SockAddrInet), nil
  1705  
  1706  	case linux.IPT_SO_GET_INFO:
  1707  		if outLen < linux.SizeOfIPTGetinfo {
  1708  			return nil, syserr.ErrInvalidArgument
  1709  		}
  1710  
  1711  		// Only valid for raw IPv4 sockets.
  1712  		if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
  1713  			return nil, syserr.ErrProtocolNotAvailable
  1714  		}
  1715  
  1716  		stk := inet.StackFromContext(t)
  1717  		if stk == nil {
  1718  			return nil, syserr.ErrNoDevice
  1719  		}
  1720  		info, err := netfilter.GetInfo(t, stk.(*Stack).Stack, outPtr, false)
  1721  		if err != nil {
  1722  			return nil, err
  1723  		}
  1724  		return &info, nil
  1725  
  1726  	case linux.IPT_SO_GET_ENTRIES:
  1727  		if outLen < linux.SizeOfIPTGetEntries {
  1728  			return nil, syserr.ErrInvalidArgument
  1729  		}
  1730  
  1731  		// Only valid for raw IPv4 sockets.
  1732  		if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
  1733  			return nil, syserr.ErrProtocolNotAvailable
  1734  		}
  1735  
  1736  		stk := inet.StackFromContext(t)
  1737  		if stk == nil {
  1738  			return nil, syserr.ErrNoDevice
  1739  		}
  1740  		entries, err := netfilter.GetEntries4(t, stk.(*Stack).Stack, outPtr, outLen)
  1741  		if err != nil {
  1742  			return nil, err
  1743  		}
  1744  		return &entries, nil
  1745  
  1746  	case linux.IPT_SO_GET_REVISION_TARGET:
  1747  		if outLen < linux.SizeOfXTGetRevision {
  1748  			return nil, syserr.ErrInvalidArgument
  1749  		}
  1750  
  1751  		// Only valid for raw IPv4 sockets.
  1752  		if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
  1753  			return nil, syserr.ErrProtocolNotAvailable
  1754  		}
  1755  
  1756  		stk := inet.StackFromContext(t)
  1757  		if stk == nil {
  1758  			return nil, syserr.ErrNoDevice
  1759  		}
  1760  		ret, err := netfilter.TargetRevision(t, outPtr, header.IPv4ProtocolNumber)
  1761  		if err != nil {
  1762  			return nil, err
  1763  		}
  1764  		return &ret, nil
  1765  
  1766  	case linux.IP_MTU_DISCOVER:
  1767  		if outLen < sizeOfInt32 {
  1768  			return nil, syserr.ErrInvalidArgument
  1769  		}
  1770  
  1771  		v, err := ep.GetSockOptInt(tcpip.MTUDiscoverOption)
  1772  		if err != nil {
  1773  			return nil, syserr.TranslateNetstackError(err)
  1774  		}
  1775  		switch tcpip.PMTUDStrategy(v) {
  1776  		case tcpip.PMTUDiscoveryWant:
  1777  			v = linux.IP_PMTUDISC_WANT
  1778  		case tcpip.PMTUDiscoveryDont:
  1779  			v = linux.IP_PMTUDISC_DONT
  1780  		case tcpip.PMTUDiscoveryDo:
  1781  			v = linux.IP_PMTUDISC_DO
  1782  		case tcpip.PMTUDiscoveryProbe:
  1783  			v = linux.IP_PMTUDISC_PROBE
  1784  		default:
  1785  			panic(fmt.Errorf("unknown PMTUD option: %d", v))
  1786  		}
  1787  		vP := primitive.Int32(v)
  1788  		return &vP, nil
  1789  	}
  1790  	return nil, syserr.ErrProtocolNotAvailable
  1791  }
  1792  
  1793  // SetSockOpt can be used to implement the linux syscall setsockopt(2) for
  1794  // sockets backed by a commonEndpoint.
  1795  func SetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, level int, name int, optVal []byte) *syserr.Error {
  1796  	switch level {
  1797  	case linux.SOL_SOCKET:
  1798  		return setSockOptSocket(t, s, ep, name, optVal)
  1799  
  1800  	case linux.SOL_TCP:
  1801  		return setSockOptTCP(t, s, ep, name, optVal)
  1802  
  1803  	case linux.SOL_ICMPV6:
  1804  		return setSockOptICMPv6(t, s, ep, name, optVal)
  1805  
  1806  	case linux.SOL_IPV6:
  1807  		return setSockOptIPv6(t, s, ep, name, optVal)
  1808  
  1809  	case linux.SOL_IP:
  1810  		return setSockOptIP(t, s, ep, name, optVal)
  1811  
  1812  	case linux.SOL_PACKET:
  1813  		// gVisor doesn't support any SOL_PACKET options just return not
  1814  		// supported. Returning nil here will result in tcpdump thinking AF_PACKET
  1815  		// features are supported and proceed to use them and break.
  1816  		return syserr.ErrProtocolNotAvailable
  1817  
  1818  	case linux.SOL_UDP,
  1819  		linux.SOL_RAW:
  1820  		// Not supported.
  1821  	}
  1822  
  1823  	return nil
  1824  }
  1825  
  1826  func clampBufSize(newSz, min, max int64, ignoreMax bool) int64 {
  1827  	// packetOverheadFactor is used to multiply the value provided by the user on
  1828  	// a setsockopt(2) for setting the send/receive buffer sizes sockets.
  1829  	const packetOverheadFactor = 2
  1830  
  1831  	if !ignoreMax && newSz > max {
  1832  		newSz = max
  1833  	}
  1834  
  1835  	if newSz < math.MaxInt32/packetOverheadFactor {
  1836  		newSz *= packetOverheadFactor
  1837  		if newSz < min {
  1838  			newSz = min
  1839  		}
  1840  	} else {
  1841  		newSz = math.MaxInt32
  1842  	}
  1843  	return newSz
  1844  }
  1845  
  1846  // setSockOptSocket implements SetSockOpt when level is SOL_SOCKET.
  1847  func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
  1848  	switch name {
  1849  	case linux.SO_SNDBUF:
  1850  		if len(optVal) < sizeOfInt32 {
  1851  			return syserr.ErrInvalidArgument
  1852  		}
  1853  
  1854  		v := hostarch.ByteOrder.Uint32(optVal)
  1855  		min, max := ep.SocketOptions().SendBufferLimits()
  1856  		clamped := clampBufSize(int64(v), min, max, false /* ignoreMax */)
  1857  		ep.SocketOptions().SetSendBufferSize(clamped, true /* notify */)
  1858  		return nil
  1859  
  1860  	case linux.SO_RCVBUF:
  1861  		if len(optVal) < sizeOfInt32 {
  1862  			return syserr.ErrInvalidArgument
  1863  		}
  1864  
  1865  		v := hostarch.ByteOrder.Uint32(optVal)
  1866  		min, max := ep.SocketOptions().ReceiveBufferLimits()
  1867  		clamped := clampBufSize(int64(v), min, max, false /* ignoreMax */)
  1868  		ep.SocketOptions().SetReceiveBufferSize(clamped, true /* notify */)
  1869  		return nil
  1870  
  1871  	case linux.SO_RCVBUFFORCE:
  1872  		if len(optVal) < sizeOfInt32 {
  1873  			return syserr.ErrInvalidArgument
  1874  		}
  1875  
  1876  		if creds := auth.CredentialsFromContext(t); !creds.HasCapability(linux.CAP_NET_ADMIN) {
  1877  			return syserr.ErrNotPermitted
  1878  		}
  1879  
  1880  		v := hostarch.ByteOrder.Uint32(optVal)
  1881  		min, max := ep.SocketOptions().ReceiveBufferLimits()
  1882  		clamped := clampBufSize(int64(v), min, max, true /* ignoreMax */)
  1883  		ep.SocketOptions().SetReceiveBufferSize(clamped, true /* notify */)
  1884  		return nil
  1885  
  1886  	case linux.SO_REUSEADDR:
  1887  		if len(optVal) < sizeOfInt32 {
  1888  			return syserr.ErrInvalidArgument
  1889  		}
  1890  
  1891  		v := hostarch.ByteOrder.Uint32(optVal)
  1892  		ep.SocketOptions().SetReuseAddress(v != 0)
  1893  		return nil
  1894  
  1895  	case linux.SO_REUSEPORT:
  1896  		if len(optVal) < sizeOfInt32 {
  1897  			return syserr.ErrInvalidArgument
  1898  		}
  1899  
  1900  		v := hostarch.ByteOrder.Uint32(optVal)
  1901  		ep.SocketOptions().SetReusePort(v != 0)
  1902  		return nil
  1903  
  1904  	case linux.SO_BINDTODEVICE:
  1905  		n := bytes.IndexByte(optVal, 0)
  1906  		if n == -1 {
  1907  			n = len(optVal)
  1908  		}
  1909  		name := string(optVal[:n])
  1910  		if name == "" {
  1911  			return syserr.TranslateNetstackError(ep.SocketOptions().SetBindToDevice(0))
  1912  		}
  1913  		s := t.NetworkContext()
  1914  		if s == nil {
  1915  			return syserr.ErrNoDevice
  1916  		}
  1917  		for nicID, nic := range s.Interfaces() {
  1918  			if nic.Name == name {
  1919  				return syserr.TranslateNetstackError(ep.SocketOptions().SetBindToDevice(nicID))
  1920  			}
  1921  		}
  1922  		return syserr.ErrUnknownDevice
  1923  
  1924  	case linux.SO_BROADCAST:
  1925  		if len(optVal) < sizeOfInt32 {
  1926  			return syserr.ErrInvalidArgument
  1927  		}
  1928  
  1929  		v := hostarch.ByteOrder.Uint32(optVal)
  1930  		ep.SocketOptions().SetBroadcast(v != 0)
  1931  		return nil
  1932  
  1933  	case linux.SO_PASSCRED:
  1934  		if len(optVal) < sizeOfInt32 {
  1935  			return syserr.ErrInvalidArgument
  1936  		}
  1937  
  1938  		v := hostarch.ByteOrder.Uint32(optVal)
  1939  		ep.SocketOptions().SetPassCred(v != 0)
  1940  		return nil
  1941  
  1942  	case linux.SO_KEEPALIVE:
  1943  		if len(optVal) < sizeOfInt32 {
  1944  			return syserr.ErrInvalidArgument
  1945  		}
  1946  
  1947  		v := hostarch.ByteOrder.Uint32(optVal)
  1948  		ep.SocketOptions().SetKeepAlive(v != 0)
  1949  		return nil
  1950  
  1951  	case linux.SO_SNDTIMEO:
  1952  		if len(optVal) < linux.SizeOfTimeval {
  1953  			return syserr.ErrInvalidArgument
  1954  		}
  1955  
  1956  		var v linux.Timeval
  1957  		v.UnmarshalBytes(optVal)
  1958  		if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) {
  1959  			return syserr.ErrDomain
  1960  		}
  1961  		s.SetSendTimeout(v.ToNsecCapped())
  1962  		return nil
  1963  
  1964  	case linux.SO_RCVTIMEO:
  1965  		if len(optVal) < linux.SizeOfTimeval {
  1966  			return syserr.ErrInvalidArgument
  1967  		}
  1968  
  1969  		var v linux.Timeval
  1970  		v.UnmarshalBytes(optVal)
  1971  		if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) {
  1972  			return syserr.ErrDomain
  1973  		}
  1974  		s.SetRecvTimeout(v.ToNsecCapped())
  1975  		return nil
  1976  
  1977  	case linux.SO_OOBINLINE:
  1978  		if len(optVal) < sizeOfInt32 {
  1979  			return syserr.ErrInvalidArgument
  1980  		}
  1981  
  1982  		v := hostarch.ByteOrder.Uint32(optVal)
  1983  		ep.SocketOptions().SetOutOfBandInline(v != 0)
  1984  		return nil
  1985  
  1986  	case linux.SO_NO_CHECK:
  1987  		if len(optVal) < sizeOfInt32 {
  1988  			return syserr.ErrInvalidArgument
  1989  		}
  1990  
  1991  		v := hostarch.ByteOrder.Uint32(optVal)
  1992  		ep.SocketOptions().SetNoChecksum(v != 0)
  1993  		return nil
  1994  
  1995  	case linux.SO_LINGER:
  1996  		if len(optVal) < linux.SizeOfLinger {
  1997  			return syserr.ErrInvalidArgument
  1998  		}
  1999  
  2000  		var v linux.Linger
  2001  		v.UnmarshalBytes(optVal)
  2002  
  2003  		ep.SocketOptions().SetLinger(tcpip.LingerOption{
  2004  			Enabled: v.OnOff != 0,
  2005  			Timeout: time.Second * time.Duration(v.Linger),
  2006  		})
  2007  		return nil
  2008  
  2009  	case linux.SO_DETACH_FILTER:
  2010  		// optval is ignored.
  2011  		var v tcpip.SocketDetachFilterOption
  2012  		return syserr.TranslateNetstackError(ep.SetSockOpt(&v))
  2013  
  2014  	// TODO(b/226603727): Add support for SO_RCVLOWAT option. For now, only
  2015  	// the unsupported syscall message is removed.
  2016  	case linux.SO_RCVLOWAT:
  2017  		if len(optVal) < sizeOfInt32 {
  2018  			return syserr.ErrInvalidArgument
  2019  		}
  2020  
  2021  		v := hostarch.ByteOrder.Uint32(optVal)
  2022  		ep.SocketOptions().SetRcvlowat(int32(v))
  2023  		return nil
  2024  	}
  2025  
  2026  	return nil
  2027  }
  2028  
  2029  // setSockOptTCP implements SetSockOpt when level is SOL_TCP.
  2030  func setSockOptTCP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
  2031  	if !socket.IsTCP(s) {
  2032  		return syserr.ErrUnknownProtocolOption
  2033  	}
  2034  
  2035  	switch name {
  2036  	case linux.TCP_NODELAY:
  2037  		if len(optVal) < sizeOfInt32 {
  2038  			return syserr.ErrInvalidArgument
  2039  		}
  2040  
  2041  		v := hostarch.ByteOrder.Uint32(optVal)
  2042  		ep.SocketOptions().SetDelayOption(v == 0)
  2043  		return nil
  2044  
  2045  	case linux.TCP_CORK:
  2046  		if len(optVal) < sizeOfInt32 {
  2047  			return syserr.ErrInvalidArgument
  2048  		}
  2049  
  2050  		v := hostarch.ByteOrder.Uint32(optVal)
  2051  		ep.SocketOptions().SetCorkOption(v != 0)
  2052  		return nil
  2053  
  2054  	case linux.TCP_QUICKACK:
  2055  		if len(optVal) < sizeOfInt32 {
  2056  			return syserr.ErrInvalidArgument
  2057  		}
  2058  
  2059  		v := hostarch.ByteOrder.Uint32(optVal)
  2060  		ep.SocketOptions().SetQuickAck(v != 0)
  2061  		return nil
  2062  
  2063  	case linux.TCP_MAXSEG:
  2064  		if len(optVal) < sizeOfInt32 {
  2065  			return syserr.ErrInvalidArgument
  2066  		}
  2067  
  2068  		v := hostarch.ByteOrder.Uint32(optVal)
  2069  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MaxSegOption, int(v)))
  2070  
  2071  	case linux.TCP_KEEPIDLE:
  2072  		if len(optVal) < sizeOfInt32 {
  2073  			return syserr.ErrInvalidArgument
  2074  		}
  2075  
  2076  		v := hostarch.ByteOrder.Uint32(optVal)
  2077  		if v < 1 || v > linux.MAX_TCP_KEEPIDLE {
  2078  			return syserr.ErrInvalidArgument
  2079  		}
  2080  		opt := tcpip.KeepaliveIdleOption(time.Second * time.Duration(v))
  2081  		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
  2082  
  2083  	case linux.TCP_KEEPINTVL:
  2084  		if len(optVal) < sizeOfInt32 {
  2085  			return syserr.ErrInvalidArgument
  2086  		}
  2087  
  2088  		v := hostarch.ByteOrder.Uint32(optVal)
  2089  		if v < 1 || v > linux.MAX_TCP_KEEPINTVL {
  2090  			return syserr.ErrInvalidArgument
  2091  		}
  2092  		opt := tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v))
  2093  		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
  2094  
  2095  	case linux.TCP_KEEPCNT:
  2096  		if len(optVal) < sizeOfInt32 {
  2097  			return syserr.ErrInvalidArgument
  2098  		}
  2099  
  2100  		v := hostarch.ByteOrder.Uint32(optVal)
  2101  		if v < 1 || v > linux.MAX_TCP_KEEPCNT {
  2102  			return syserr.ErrInvalidArgument
  2103  		}
  2104  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.KeepaliveCountOption, int(v)))
  2105  
  2106  	case linux.TCP_USER_TIMEOUT:
  2107  		if len(optVal) < sizeOfInt32 {
  2108  			return syserr.ErrInvalidArgument
  2109  		}
  2110  
  2111  		v := int32(hostarch.ByteOrder.Uint32(optVal))
  2112  		if v < 0 {
  2113  			return syserr.ErrInvalidArgument
  2114  		}
  2115  		opt := tcpip.TCPUserTimeoutOption(time.Millisecond * time.Duration(v))
  2116  		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
  2117  
  2118  	case linux.TCP_CONGESTION:
  2119  		v := tcpip.CongestionControlOption(optVal)
  2120  		if err := ep.SetSockOpt(&v); err != nil {
  2121  			return syserr.TranslateNetstackError(err)
  2122  		}
  2123  		return nil
  2124  
  2125  	case linux.TCP_LINGER2:
  2126  		if len(optVal) < sizeOfInt32 {
  2127  			return syserr.ErrInvalidArgument
  2128  		}
  2129  
  2130  		v := int32(hostarch.ByteOrder.Uint32(optVal))
  2131  		opt := tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v))
  2132  		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
  2133  
  2134  	case linux.TCP_DEFER_ACCEPT:
  2135  		if len(optVal) < sizeOfInt32 {
  2136  			return syserr.ErrInvalidArgument
  2137  		}
  2138  		v := int32(hostarch.ByteOrder.Uint32(optVal))
  2139  		if v < 0 {
  2140  			v = 0
  2141  		}
  2142  		opt := tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v))
  2143  		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
  2144  
  2145  	case linux.TCP_SYNCNT:
  2146  		if len(optVal) < sizeOfInt32 {
  2147  			return syserr.ErrInvalidArgument
  2148  		}
  2149  		v := hostarch.ByteOrder.Uint32(optVal)
  2150  
  2151  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPSynCountOption, int(v)))
  2152  
  2153  	case linux.TCP_WINDOW_CLAMP:
  2154  		if len(optVal) < sizeOfInt32 {
  2155  			return syserr.ErrInvalidArgument
  2156  		}
  2157  		v := hostarch.ByteOrder.Uint32(optVal)
  2158  
  2159  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPWindowClampOption, int(v)))
  2160  
  2161  	case linux.TCP_REPAIR_OPTIONS:
  2162  		// Not supported.
  2163  	}
  2164  
  2165  	return nil
  2166  }
  2167  
  2168  func setSockOptICMPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
  2169  	if _, ok := ep.(tcpip.Endpoint); !ok {
  2170  		log.Warningf("SOL_ICMPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
  2171  		return syserr.ErrUnknownProtocolOption
  2172  	}
  2173  
  2174  	if family, _, _ := s.Type(); family != linux.AF_INET6 {
  2175  		return syserr.ErrUnknownProtocolOption
  2176  	}
  2177  
  2178  	switch name {
  2179  	case linux.ICMPV6_FILTER:
  2180  		var req linux.ICMP6Filter
  2181  		if len(optVal) < req.SizeBytes() {
  2182  			return syserr.ErrInvalidArgument
  2183  		}
  2184  
  2185  		req.UnmarshalUnsafe(optVal)
  2186  		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.ICMPv6Filter{DenyType: req.Filter}))
  2187  	}
  2188  
  2189  	return nil
  2190  }
  2191  
  2192  // setSockOptIPv6 implements SetSockOpt when level is SOL_IPV6.
  2193  func setSockOptIPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
  2194  	if _, ok := ep.(tcpip.Endpoint); !ok {
  2195  		log.Warningf("SOL_IPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
  2196  		return syserr.ErrUnknownProtocolOption
  2197  	}
  2198  
  2199  	family, _, _ := s.Type()
  2200  	if family != linux.AF_INET6 {
  2201  		return syserr.ErrUnknownProtocolOption
  2202  	}
  2203  
  2204  	switch name {
  2205  	case linux.IPV6_CHECKSUM:
  2206  		if len(optVal) < sizeOfInt32 {
  2207  			return syserr.ErrInvalidArgument
  2208  		}
  2209  
  2210  		// int may not be 32-bits so we cast the uint32 to an int32 before casting
  2211  		// to an int.
  2212  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6Checksum, int(int32(hostarch.ByteOrder.Uint32(optVal)))))
  2213  
  2214  	case linux.IPV6_V6ONLY:
  2215  		if len(optVal) < sizeOfInt32 {
  2216  			return syserr.ErrInvalidArgument
  2217  		}
  2218  
  2219  		if socket.IsTCP(s) && tcp.EndpointState(ep.State()) != tcp.StateInitial {
  2220  			return syserr.ErrInvalidEndpointState
  2221  		} else if socket.IsUDP(s) && transport.DatagramEndpointState(ep.State()) != transport.DatagramEndpointStateInitial {
  2222  			return syserr.ErrInvalidEndpointState
  2223  		}
  2224  
  2225  		v := hostarch.ByteOrder.Uint32(optVal)
  2226  		ep.SocketOptions().SetV6Only(v != 0)
  2227  		return nil
  2228  
  2229  	case linux.IPV6_ADD_MEMBERSHIP:
  2230  		req, err := copyInMulticastV6Request(optVal)
  2231  		if err != nil {
  2232  			return err
  2233  		}
  2234  
  2235  		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.AddMembershipOption{
  2236  			NIC:           tcpip.NICID(req.InterfaceIndex),
  2237  			MulticastAddr: tcpip.AddrFrom16(req.MulticastAddr),
  2238  		}))
  2239  
  2240  	case linux.IPV6_DROP_MEMBERSHIP:
  2241  		req, err := copyInMulticastV6Request(optVal)
  2242  		if err != nil {
  2243  			return err
  2244  		}
  2245  
  2246  		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.RemoveMembershipOption{
  2247  			NIC:           tcpip.NICID(req.InterfaceIndex),
  2248  			MulticastAddr: tcpip.AddrFrom16(req.MulticastAddr),
  2249  		}))
  2250  
  2251  	case linux.IPV6_IPSEC_POLICY,
  2252  		linux.IPV6_JOIN_ANYCAST,
  2253  		linux.IPV6_LEAVE_ANYCAST,
  2254  		// TODO(b/148887420): Add support for IPV6_PKTINFO.
  2255  		linux.IPV6_PKTINFO,
  2256  		linux.IPV6_ROUTER_ALERT,
  2257  		linux.IPV6_XFRM_POLICY,
  2258  		linux.MCAST_BLOCK_SOURCE,
  2259  		linux.MCAST_JOIN_GROUP,
  2260  		linux.MCAST_JOIN_SOURCE_GROUP,
  2261  		linux.MCAST_LEAVE_GROUP,
  2262  		linux.MCAST_LEAVE_SOURCE_GROUP,
  2263  		linux.MCAST_UNBLOCK_SOURCE:
  2264  		// Not supported.
  2265  
  2266  	case linux.IPV6_RECVORIGDSTADDR:
  2267  		if len(optVal) < sizeOfInt32 {
  2268  			return syserr.ErrInvalidArgument
  2269  		}
  2270  		v := int32(hostarch.ByteOrder.Uint32(optVal))
  2271  
  2272  		ep.SocketOptions().SetReceiveOriginalDstAddress(v != 0)
  2273  		return nil
  2274  
  2275  	case linux.IPV6_RECVPKTINFO:
  2276  		if len(optVal) < sizeOfInt32 {
  2277  			return syserr.ErrInvalidArgument
  2278  		}
  2279  		v := int32(hostarch.ByteOrder.Uint32(optVal))
  2280  
  2281  		ep.SocketOptions().SetIPv6ReceivePacketInfo(v != 0)
  2282  		return nil
  2283  
  2284  	case linux.IPV6_UNICAST_HOPS:
  2285  		if len(optVal) < sizeOfInt32 {
  2286  			return syserr.ErrInvalidArgument
  2287  		}
  2288  		v := int32(hostarch.ByteOrder.Uint32(optVal))
  2289  		if v < -1 || v > 255 {
  2290  			return syserr.ErrInvalidArgument
  2291  		}
  2292  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6HopLimitOption, int(v)))
  2293  
  2294  	case linux.IPV6_RECVHOPLIMIT:
  2295  		v, err := parseIntOrChar(optVal)
  2296  		if err != nil {
  2297  			return err
  2298  		}
  2299  
  2300  		ep.SocketOptions().SetReceiveHopLimit(v != 0)
  2301  		return nil
  2302  
  2303  	case linux.IPV6_TCLASS:
  2304  		if len(optVal) < sizeOfInt32 {
  2305  			return syserr.ErrInvalidArgument
  2306  		}
  2307  		v := int32(hostarch.ByteOrder.Uint32(optVal))
  2308  		if v < -1 || v > 255 {
  2309  			return syserr.ErrInvalidArgument
  2310  		}
  2311  		if v == -1 {
  2312  			v = 0
  2313  		}
  2314  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6TrafficClassOption, int(v)))
  2315  
  2316  	case linux.IPV6_RECVTCLASS:
  2317  		v, err := parseIntOrChar(optVal)
  2318  		if err != nil {
  2319  			return err
  2320  		}
  2321  
  2322  		ep.SocketOptions().SetReceiveTClass(v != 0)
  2323  		return nil
  2324  	case linux.IPV6_RECVERR:
  2325  		if len(optVal) == 0 {
  2326  			return nil
  2327  		}
  2328  		v, err := parseIntOrChar(optVal)
  2329  		if err != nil {
  2330  			return err
  2331  		}
  2332  		ep.SocketOptions().SetIPv6RecvError(v != 0)
  2333  		return nil
  2334  
  2335  	case linux.IP6T_SO_SET_REPLACE:
  2336  		if len(optVal) < linux.SizeOfIP6TReplace {
  2337  			return syserr.ErrInvalidArgument
  2338  		}
  2339  
  2340  		// Only valid for raw IPv6 sockets.
  2341  		if !socket.IsRaw(s) {
  2342  			return syserr.ErrProtocolNotAvailable
  2343  		}
  2344  
  2345  		stk := inet.StackFromContext(t)
  2346  		if stk == nil {
  2347  			return syserr.ErrNoDevice
  2348  		}
  2349  		// Stack must be a netstack stack.
  2350  		return netfilter.SetEntries(t.Credentials().UserNamespace, stk.(*Stack).Stack, optVal, true)
  2351  
  2352  	case linux.IP6T_SO_SET_ADD_COUNTERS:
  2353  		log.Infof("IP6T_SO_SET_ADD_COUNTERS is not supported")
  2354  		return nil
  2355  	}
  2356  
  2357  	return nil
  2358  }
  2359  
// Marshalled sizes, as reported by SizeBytes, of the multicast request
// structures accepted by the multicast socket options. SizeBytes is invoked on
// a typed nil pointer, so no value of the structure is constructed here.
var (
	// inetMulticastRequestSize is the size of linux.InetMulticastRequest.
	inetMulticastRequestSize        = (*linux.InetMulticastRequest)(nil).SizeBytes()
	// inetMulticastRequestWithNICSize is the size of
	// linux.InetMulticastRequestWithNIC.
	inetMulticastRequestWithNICSize = (*linux.InetMulticastRequestWithNIC)(nil).SizeBytes()
	// inet6MulticastRequestSize is the size of linux.Inet6MulticastRequest.
	inet6MulticastRequestSize       = (*linux.Inet6MulticastRequest)(nil).SizeBytes()
)
  2365  
  2366  // copyInMulticastRequest copies in a variable-size multicast request. The
  2367  // kernel determines which structure was passed by its length. IP_MULTICAST_IF
  2368  // supports ip_mreqn, ip_mreq and in_addr, while IP_ADD_MEMBERSHIP and
  2369  // IP_DROP_MEMBERSHIP only support ip_mreqn and ip_mreq. To handle this,
  2370  // allowAddr controls whether in_addr is accepted or rejected.
  2371  func copyInMulticastRequest(optVal []byte, allowAddr bool) (linux.InetMulticastRequestWithNIC, *syserr.Error) {
  2372  	if len(optVal) < len(linux.InetAddr{}) {
  2373  		return linux.InetMulticastRequestWithNIC{}, syserr.ErrInvalidArgument
  2374  	}
  2375  
  2376  	if len(optVal) < inetMulticastRequestSize {
  2377  		if !allowAddr {
  2378  			return linux.InetMulticastRequestWithNIC{}, syserr.ErrInvalidArgument
  2379  		}
  2380  
  2381  		var req linux.InetMulticastRequestWithNIC
  2382  		copy(req.InterfaceAddr[:], optVal)
  2383  		return req, nil
  2384  	}
  2385  
  2386  	if len(optVal) >= inetMulticastRequestWithNICSize {
  2387  		var req linux.InetMulticastRequestWithNIC
  2388  		req.UnmarshalUnsafe(optVal)
  2389  		return req, nil
  2390  	}
  2391  
  2392  	var req linux.InetMulticastRequestWithNIC
  2393  	req.InetMulticastRequest.UnmarshalUnsafe(optVal)
  2394  	return req, nil
  2395  }
  2396  
  2397  func copyInMulticastV6Request(optVal []byte) (linux.Inet6MulticastRequest, *syserr.Error) {
  2398  	if len(optVal) < inet6MulticastRequestSize {
  2399  		return linux.Inet6MulticastRequest{}, syserr.ErrInvalidArgument
  2400  	}
  2401  
  2402  	var req linux.Inet6MulticastRequest
  2403  	req.UnmarshalUnsafe(optVal)
  2404  	return req, nil
  2405  }
  2406  
  2407  // parseIntOrChar copies either a 32-bit int or an 8-bit uint out of buf.
  2408  //
  2409  // net/ipv4/ip_sockglue.c:do_ip_setsockopt does this for its socket options.
  2410  func parseIntOrChar(buf []byte) (int32, *syserr.Error) {
  2411  	if len(buf) == 0 {
  2412  		return 0, syserr.ErrInvalidArgument
  2413  	}
  2414  
  2415  	if len(buf) >= sizeOfInt32 {
  2416  		return int32(hostarch.ByteOrder.Uint32(buf)), nil
  2417  	}
  2418  
  2419  	return int32(buf[0]), nil
  2420  }
  2421  
  2422  // setSockOptIP implements SetSockOpt when level is SOL_IP.
  2423  func setSockOptIP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
  2424  	if _, ok := ep.(tcpip.Endpoint); !ok {
  2425  		log.Warningf("SOL_IP options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
  2426  		return syserr.ErrUnknownProtocolOption
  2427  	}
  2428  
  2429  	switch name {
  2430  	case linux.IP_MULTICAST_TTL:
  2431  		v, err := parseIntOrChar(optVal)
  2432  		if err != nil {
  2433  			return err
  2434  		}
  2435  
  2436  		if v == -1 {
  2437  			// Linux translates -1 to 1.
  2438  			v = 1
  2439  		}
  2440  		if v < 0 || v > 255 {
  2441  			return syserr.ErrInvalidArgument
  2442  		}
  2443  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MulticastTTLOption, int(v)))
  2444  
  2445  	case linux.IP_ADD_MEMBERSHIP:
  2446  		req, err := copyInMulticastRequest(optVal, false /* allowAddr */)
  2447  		if err != nil {
  2448  			return err
  2449  		}
  2450  
  2451  		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.AddMembershipOption{
  2452  			NIC: tcpip.NICID(req.InterfaceIndex),
  2453  			// TODO(igudger): Change AddMembership to use the standard
  2454  			// any address representation.
  2455  			InterfaceAddr: tcpip.AddrFrom4(req.InterfaceAddr),
  2456  			MulticastAddr: tcpip.AddrFrom4(req.MulticastAddr),
  2457  		}))
  2458  
  2459  	case linux.IP_DROP_MEMBERSHIP:
  2460  		req, err := copyInMulticastRequest(optVal, false /* allowAddr */)
  2461  		if err != nil {
  2462  			return err
  2463  		}
  2464  
  2465  		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.RemoveMembershipOption{
  2466  			NIC: tcpip.NICID(req.InterfaceIndex),
  2467  			// TODO(igudger): Change DropMembership to use the standard
  2468  			// any address representation.
  2469  			InterfaceAddr: tcpip.AddrFrom4(req.InterfaceAddr),
  2470  			MulticastAddr: tcpip.AddrFrom4(req.MulticastAddr),
  2471  		}))
  2472  
  2473  	case linux.IP_MULTICAST_IF:
  2474  		req, err := copyInMulticastRequest(optVal, true /* allowAddr */)
  2475  		if err != nil {
  2476  			return err
  2477  		}
  2478  
  2479  		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.MulticastInterfaceOption{
  2480  			NIC:           tcpip.NICID(req.InterfaceIndex),
  2481  			InterfaceAddr: socket.BytesToIPAddress(req.InterfaceAddr[:]),
  2482  		}))
  2483  
  2484  	case linux.IP_MULTICAST_LOOP:
  2485  		v, err := parseIntOrChar(optVal)
  2486  		if err != nil {
  2487  			return err
  2488  		}
  2489  
  2490  		ep.SocketOptions().SetMulticastLoop(v != 0)
  2491  		return nil
  2492  
  2493  	case linux.MCAST_JOIN_GROUP:
  2494  		// FIXME(b/124219304): Implement MCAST_JOIN_GROUP.
  2495  		return syserr.ErrInvalidArgument
  2496  
  2497  	case linux.IP_TTL:
  2498  		v, err := parseIntOrChar(optVal)
  2499  		if err != nil {
  2500  			return err
  2501  		}
  2502  
  2503  		// -1 means default TTL.
  2504  		if v == -1 {
  2505  			v = 0
  2506  		} else if v < 1 || v > 255 {
  2507  			return syserr.ErrInvalidArgument
  2508  		}
  2509  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv4TTLOption, int(v)))
  2510  
  2511  	case linux.IP_RECVTTL:
  2512  		v, err := parseIntOrChar(optVal)
  2513  		if err != nil {
  2514  			return err
  2515  		}
  2516  		ep.SocketOptions().SetReceiveTTL(v != 0)
  2517  		return nil
  2518  
  2519  	case linux.IP_TOS:
  2520  		if len(optVal) == 0 {
  2521  			return nil
  2522  		}
  2523  		v, err := parseIntOrChar(optVal)
  2524  		if err != nil {
  2525  			return err
  2526  		}
  2527  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv4TOSOption, int(v)))
  2528  
  2529  	case linux.IP_RECVTOS:
  2530  		v, err := parseIntOrChar(optVal)
  2531  		if err != nil {
  2532  			return err
  2533  		}
  2534  		ep.SocketOptions().SetReceiveTOS(v != 0)
  2535  		return nil
  2536  
  2537  	case linux.IP_RECVERR:
  2538  		if len(optVal) == 0 {
  2539  			return nil
  2540  		}
  2541  		v, err := parseIntOrChar(optVal)
  2542  		if err != nil {
  2543  			return err
  2544  		}
  2545  		ep.SocketOptions().SetIPv4RecvError(v != 0)
  2546  		return nil
  2547  
  2548  	case linux.IP_PKTINFO:
  2549  		if len(optVal) == 0 {
  2550  			return nil
  2551  		}
  2552  		v, err := parseIntOrChar(optVal)
  2553  		if err != nil {
  2554  			return err
  2555  		}
  2556  		ep.SocketOptions().SetReceivePacketInfo(v != 0)
  2557  		return nil
  2558  
  2559  	case linux.IP_HDRINCL:
  2560  		if len(optVal) == 0 {
  2561  			return nil
  2562  		}
  2563  		v, err := parseIntOrChar(optVal)
  2564  		if err != nil {
  2565  			return err
  2566  		}
  2567  		ep.SocketOptions().SetHeaderIncluded(v != 0)
  2568  		return nil
  2569  
  2570  	case linux.IP_RECVORIGDSTADDR:
  2571  		if len(optVal) == 0 {
  2572  			return nil
  2573  		}
  2574  		v, err := parseIntOrChar(optVal)
  2575  		if err != nil {
  2576  			return err
  2577  		}
  2578  
  2579  		ep.SocketOptions().SetReceiveOriginalDstAddress(v != 0)
  2580  		return nil
  2581  
  2582  	case linux.IPT_SO_SET_REPLACE:
  2583  		if len(optVal) < linux.SizeOfIPTReplace {
  2584  			return syserr.ErrInvalidArgument
  2585  		}
  2586  
  2587  		// Only valid for raw IPv4 sockets.
  2588  		if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
  2589  			return syserr.ErrProtocolNotAvailable
  2590  		}
  2591  
  2592  		stk := inet.StackFromContext(t)
  2593  		if stk == nil {
  2594  			return syserr.ErrNoDevice
  2595  		}
  2596  		// Stack must be a netstack stack.
  2597  		return netfilter.SetEntries(t.Credentials().UserNamespace, stk.(*Stack).Stack, optVal, false)
  2598  
  2599  	case linux.IPT_SO_SET_ADD_COUNTERS:
  2600  		log.Infof("IPT_SO_SET_ADD_COUNTERS is not supported")
  2601  		return nil
  2602  
  2603  	case linux.IP_MTU_DISCOVER:
  2604  		if len(optVal) == 0 {
  2605  			return nil
  2606  		}
  2607  		v, err := parseIntOrChar(optVal)
  2608  		if err != nil {
  2609  			return err
  2610  		}
  2611  		switch v {
  2612  		case linux.IP_PMTUDISC_DONT:
  2613  			v = int32(tcpip.PMTUDiscoveryDont)
  2614  		case linux.IP_PMTUDISC_WANT:
  2615  			v = int32(tcpip.PMTUDiscoveryWant)
  2616  		case linux.IP_PMTUDISC_DO:
  2617  			v = int32(tcpip.PMTUDiscoveryDo)
  2618  		case linux.IP_PMTUDISC_PROBE:
  2619  			v = int32(tcpip.PMTUDiscoveryProbe)
  2620  		default:
  2621  			return syserr.ErrNotSupported
  2622  		}
  2623  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MTUDiscoverOption, int(v)))
  2624  
  2625  	case linux.IP_ADD_SOURCE_MEMBERSHIP,
  2626  		linux.IP_BIND_ADDRESS_NO_PORT,
  2627  		linux.IP_BLOCK_SOURCE,
  2628  		linux.IP_CHECKSUM,
  2629  		linux.IP_DROP_SOURCE_MEMBERSHIP,
  2630  		linux.IP_FREEBIND,
  2631  		linux.IP_IPSEC_POLICY,
  2632  		linux.IP_MINTTL,
  2633  		linux.IP_MSFILTER,
  2634  		linux.IP_MULTICAST_ALL,
  2635  		linux.IP_NODEFRAG,
  2636  		linux.IP_OPTIONS,
  2637  		linux.IP_PASSSEC,
  2638  		linux.IP_RECVFRAGSIZE,
  2639  		linux.IP_RECVOPTS,
  2640  		linux.IP_RETOPTS,
  2641  		linux.IP_TRANSPARENT,
  2642  		linux.IP_UNBLOCK_SOURCE,
  2643  		linux.IP_UNICAST_IF,
  2644  		linux.IP_XFRM_POLICY,
  2645  		linux.MCAST_BLOCK_SOURCE,
  2646  		linux.MCAST_JOIN_SOURCE_GROUP,
  2647  		linux.MCAST_LEAVE_GROUP,
  2648  		linux.MCAST_LEAVE_SOURCE_GROUP,
  2649  		linux.MCAST_MSFILTER,
  2650  		linux.MCAST_UNBLOCK_SOURCE:
  2651  		// Not supported.
  2652  	}
  2653  
  2654  	return nil
  2655  }
  2656  
  2657  // GetSockName implements the linux syscall getsockname(2) for sockets backed by
  2658  // tcpip.Endpoint.
  2659  func (s *sock) GetSockName(*kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
  2660  	addr, err := s.Endpoint.GetLocalAddress()
  2661  	if err != nil {
  2662  		return nil, 0, syserr.TranslateNetstackError(err)
  2663  	}
  2664  
  2665  	a, l := socket.ConvertAddress(s.family, addr)
  2666  	return a, l, nil
  2667  }
  2668  
  2669  // GetPeerName implements the linux syscall getpeername(2) for sockets backed by
  2670  // tcpip.Endpoint.
  2671  func (s *sock) GetPeerName(*kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
  2672  	addr, err := s.Endpoint.GetRemoteAddress()
  2673  	if err != nil {
  2674  		return nil, 0, syserr.TranslateNetstackError(err)
  2675  	}
  2676  
  2677  	a, l := socket.ConvertAddress(s.family, addr)
  2678  	return a, l, nil
  2679  }
  2680  
  2681  func (s *sock) fillCmsgInq(cmsg *socket.ControlMessages) {
  2682  	if !s.sockOptInq {
  2683  		return
  2684  	}
  2685  	rcvBufUsed, err := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption)
  2686  	if err != nil {
  2687  		return
  2688  	}
  2689  	cmsg.IP.HasInq = true
  2690  	cmsg.IP.Inq = int32(rcvBufUsed)
  2691  }
  2692  
  2693  func toLinuxPacketType(pktType tcpip.PacketType) uint8 {
  2694  	switch pktType {
  2695  	case tcpip.PacketHost:
  2696  		return linux.PACKET_HOST
  2697  	case tcpip.PacketOtherHost:
  2698  		return linux.PACKET_OTHERHOST
  2699  	case tcpip.PacketOutgoing:
  2700  		return linux.PACKET_OUTGOING
  2701  	case tcpip.PacketBroadcast:
  2702  		return linux.PACKET_BROADCAST
  2703  	case tcpip.PacketMulticast:
  2704  		return linux.PACKET_MULTICAST
  2705  	default:
  2706  		panic(fmt.Sprintf("unknown packet type: %d", pktType))
  2707  	}
  2708  }
  2709  
// nonBlockingRead issues a non-blocking read.
//
// peek and senderRequested are forwarded to the endpoint as the Peek and
// NeedRemoteAddr read options. trunc selects MSG_TRUNC behavior, which differs
// between packet-based and stream sockets (see below).
//
// The results are, in order: the byte count to report to the caller, the
// message flags (only MSG_TRUNC is ever set here), the sender address and its
// length (only populated for packet-based sockets when senderRequested is
// true), the control messages, and an error.
//
// s.readMu is held for the duration of the call.
//
// TODO(b/78348848): Support timestamps for stream sockets.
func (s *sock) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
	isPacket := s.isPacketBased()

	readOptions := tcpip.ReadOptions{
		Peek:               peek,
		NeedRemoteAddr:     senderRequested,
		NeedLinkPacketInfo: isPacket,
	}

	// TCP sockets discard the data if MSG_TRUNC is set.
	//
	// This behavior is documented in man 7 tcp:
	// Since version 2.4, Linux supports the use of MSG_TRUNC in the flags
	// argument of recv(2) (and recvmsg(2)). This flag causes the received
	// bytes of data to be discarded, rather than passed back in a
	// caller-supplied buffer.
	var w io.Writer
	var res tcpip.ReadResult
	var err tcpip.Error

	s.readMu.Lock()
	defer s.readMu.Unlock()

	if !isPacket && trunc {
		// Stream socket with MSG_TRUNC: read into a discarding writer that is
		// limited to the size of the destination.
		w = &tcpip.LimitedWriter{
			W: ioutil.Discard,
			N: dst.NumBytes(),
		}
		res, err = s.Endpoint.Read(w, readOptions)
	} else {
		switch s.Endpoint.(type) {
		case *tcp.Endpoint:
			// TCP endpoints read through the socket's reusable readWriter,
			// which is initialized and used under s.mu.
			s.mu.Lock()
			s.readWriter.Init(ctx, dst)
			res, err = s.Endpoint.Read(&s.readWriter, readOptions)
			s.mu.Unlock()
		default:
			res, err = s.Endpoint.Read(dst.Writer(ctx), readOptions)
		}
	}

	// A bad-buffer error is not reported when the destination is empty.
	if _, ok := err.(*tcpip.ErrBadBuffer); ok && dst.NumBytes() == 0 {
		err = nil
	}
	if err != nil {
		return 0, 0, nil, 0, socket.ControlMessages{}, syserr.TranslateNetstackError(err)
	}
	// Set the control message, even if 0 bytes were read.
	s.updateTimestamp(res.ControlMessages)

	if isPacket {
		var addr linux.SockAddr
		var addrLen uint32
		if senderRequested {
			addr, addrLen = socket.ConvertAddress(s.family, res.RemoteAddr)
			// Link-layer addresses additionally carry the protocol and packet
			// type taken from the link packet info of the read.
			switch v := addr.(type) {
			case *linux.SockAddrLink:
				v.Protocol = socket.Htons(uint16(res.LinkPacketInfo.Protocol))
				v.PacketType = toLinuxPacketType(res.LinkPacketInfo.PktType)
			}
		}

		// With MSG_TRUNC the reported length is the full packet size rather
		// than the number of bytes copied.
		msgLen := res.Count
		if trunc {
			msgLen = res.Total
		}

		// Flag truncation whenever the packet was larger than what was copied.
		var flags int
		if res.Total > res.Count {
			flags |= linux.MSG_TRUNC
		}

		return msgLen, flags, addr, addrLen, s.netstackToLinuxControlMessages(res.ControlMessages), nil
	}

	if peek {
		// MSG_TRUNC with MSG_PEEK on a TCP socket returns the
		// amount that could be read, and does not write to buffer.
		if trunc {
			// TCP endpoint does not return the total bytes in buffer as numTotal.
			// We need to query it from socket option.
			// Note: err below shadows the outer tcpip.Error variable.
			rql, err := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption)
			if err != nil {
				return 0, 0, nil, 0, socket.ControlMessages{}, syserr.TranslateNetstackError(err)
			}
			// Report the smaller of the destination size and the queue size.
			msgLen := int(dst.NumBytes())
			if msgLen > rql {
				msgLen = rql
			}
			return msgLen, 0, nil, 0, socket.ControlMessages{}, nil
		}
	} else if n := res.Count; n != 0 {
		// Data was consumed (not merely peeked); tell the endpoint how much.
		s.Endpoint.ModerateRecvBuf(n)
	}

	cmsg := s.netstackToLinuxControlMessages(res.ControlMessages)
	s.fillCmsgInq(&cmsg)
	// err is nil here: the non-nil case returned above.
	return res.Count, 0, nil, 0, cmsg, syserr.TranslateNetstackError(err)
}
  2812  
  2813  func (s *sock) netstackToLinuxControlMessages(cm tcpip.ReceivableControlMessages) socket.ControlMessages {
  2814  	readCM := socket.NewIPControlMessages(s.family, cm)
  2815  	return socket.ControlMessages{
  2816  		IP: socket.IPControlMessages{
  2817  			HasTimestamp:       readCM.HasTimestamp && s.sockOptTimestamp,
  2818  			Timestamp:          readCM.Timestamp,
  2819  			HasInq:             readCM.HasInq,
  2820  			Inq:                readCM.Inq,
  2821  			HasTOS:             readCM.HasTOS,
  2822  			TOS:                readCM.TOS,
  2823  			HasTClass:          readCM.HasTClass,
  2824  			TClass:             readCM.TClass,
  2825  			HasTTL:             readCM.HasTTL,
  2826  			TTL:                readCM.TTL,
  2827  			HasHopLimit:        readCM.HasHopLimit,
  2828  			HopLimit:           readCM.HopLimit,
  2829  			HasIPPacketInfo:    readCM.HasIPPacketInfo,
  2830  			PacketInfo:         readCM.PacketInfo,
  2831  			HasIPv6PacketInfo:  readCM.HasIPv6PacketInfo,
  2832  			IPv6PacketInfo:     readCM.IPv6PacketInfo,
  2833  			OriginalDstAddress: readCM.OriginalDstAddress,
  2834  			SockErr:            readCM.SockErr,
  2835  		},
  2836  	}
  2837  }
  2838  
  2839  func (s *sock) linuxToNetstackControlMessages(cm socket.ControlMessages) tcpip.SendableControlMessages {
  2840  	return tcpip.SendableControlMessages{
  2841  		HasTTL:      cm.IP.HasTTL,
  2842  		TTL:         uint8(cm.IP.TTL),
  2843  		HasHopLimit: cm.IP.HasHopLimit,
  2844  		HopLimit:    uint8(cm.IP.HopLimit),
  2845  	}
  2846  }
  2847  
  2848  // updateTimestamp sets the timestamp for SIOCGSTAMP. It should be called after
  2849  // successfully writing packet data out to userspace.
  2850  //
  2851  // Precondition: s.readMu must be locked.
  2852  func (s *sock) updateTimestamp(cm tcpip.ReceivableControlMessages) {
  2853  	// Save the SIOCGSTAMP timestamp only if SO_TIMESTAMP is disabled.
  2854  	if !s.sockOptTimestamp {
  2855  		s.timestampValid = true
  2856  		s.timestamp = cm.Timestamp
  2857  	}
  2858  }
  2859  
  2860  // dequeueErr is analogous to net/core/skbuff.c:sock_dequeue_err_skb().
  2861  func (s *sock) dequeueErr() *tcpip.SockError {
  2862  	so := s.Endpoint.SocketOptions()
  2863  	err := so.DequeueErr()
  2864  	if err == nil {
  2865  		return nil
  2866  	}
  2867  
  2868  	// Update socket error to reflect ICMP errors in queue.
  2869  	if nextErr := so.PeekErr(); nextErr != nil && nextErr.Cause.Origin().IsICMPErr() {
  2870  		so.SetLastError(nextErr.Err)
  2871  	} else if err.Cause.Origin().IsICMPErr() {
  2872  		so.SetLastError(nil)
  2873  	}
  2874  	return err
  2875  }
  2876  
  2877  // addrFamilyFromNetProto returns the address family identifier for the given
  2878  // network protocol.
  2879  func addrFamilyFromNetProto(net tcpip.NetworkProtocolNumber) int {
  2880  	switch net {
  2881  	case header.IPv4ProtocolNumber:
  2882  		return linux.AF_INET
  2883  	case header.IPv6ProtocolNumber:
  2884  		return linux.AF_INET6
  2885  	default:
  2886  		panic(fmt.Sprintf("invalid net proto for addr family inference: %d", net))
  2887  	}
  2888  }
  2889  
  2890  // recvErr handles MSG_ERRQUEUE for recvmsg(2).
  2891  // This is analogous to net/ipv4/ip_sockglue.c:ip_recv_error().
  2892  func (s *sock) recvErr(t *kernel.Task, dst usermem.IOSequence) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
  2893  	sockErr := s.dequeueErr()
  2894  	if sockErr == nil {
  2895  		return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain
  2896  	}
  2897  	if sockErr.Payload != nil {
  2898  		defer sockErr.Payload.Release()
  2899  	}
  2900  
  2901  	// The payload of the original packet that caused the error is passed as
  2902  	// normal data via msg_iovec.  -- recvmsg(2)
  2903  	msgFlags := linux.MSG_ERRQUEUE
  2904  	if int(dst.NumBytes()) < sockErr.Payload.Size() {
  2905  		msgFlags |= linux.MSG_TRUNC
  2906  	}
  2907  	n, err := dst.CopyOut(t, sockErr.Payload.AsSlice())
  2908  
  2909  	// The original destination address of the datagram that caused the error is
  2910  	// supplied via msg_name.  -- recvmsg(2)
  2911  	dstAddr, dstAddrLen := socket.ConvertAddress(addrFamilyFromNetProto(sockErr.NetProto), sockErr.Dst)
  2912  	cmgs := socket.ControlMessages{IP: socket.NewIPControlMessages(s.family, tcpip.ReceivableControlMessages{SockErr: sockErr})}
  2913  	return n, msgFlags, dstAddr, dstAddrLen, cmgs, syserr.FromError(err)
  2914  }
  2915  
// RecvMsg implements the linux syscall recvmsg(2) for sockets backed by
// tcpip.Endpoint.
//
// flags is inspected for MSG_ERRQUEUE, MSG_TRUNC, MSG_PEEK, MSG_DONTWAIT and
// MSG_WAITALL. A non-blocking read is attempted first; if it would block (or
// MSG_WAITALL is set on a stream socket and dst is not yet full) the call
// waits for readable events, bounded by deadline when haveDeadline is true,
// and retries.
//
// The results are the number of bytes received, the message flags, the sender
// address and its length, the control messages and an error.
func (s *sock) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, _ uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) {
	// MSG_ERRQUEUE reads from the error queue instead of the data queue.
	if flags&linux.MSG_ERRQUEUE != 0 {
		return s.recvErr(t, dst)
	}

	trunc := flags&linux.MSG_TRUNC != 0
	peek := flags&linux.MSG_PEEK != 0
	dontWait := flags&linux.MSG_DONTWAIT != 0
	waitAll := flags&linux.MSG_WAITALL != 0
	if senderRequested && !s.isPacketBased() {
		// Stream sockets ignore the sender address.
		senderRequested = false
	}
	n, msgFlags, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested)

	if s.isPacketBased() && err == syserr.ErrClosedForReceive && flags&linux.MSG_DONTWAIT != 0 {
		// In this situation we should return EAGAIN.
		return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain
	}

	if err != nil && (err != syserr.ErrWouldBlock || dontWait) {
		// Read failed and we should not retry.
		return 0, 0, nil, 0, socket.ControlMessages{}, err
	}

	if err == nil && (dontWait || !waitAll || s.isPacketBased() || int64(n) >= dst.NumBytes()) {
		// We got all the data we need.
		return
	}

	// Don't overwrite any data we received.
	dst = dst.DropFirst(n)

	// We'll have to block. Register for notifications and keep trying to
	// receive all the data.
	e, ch := waiter.NewChannelEntry(waiter.ReadableEvents)
	s.EventRegister(&e)
	defer s.EventUnregister(&e)

	for {
		// rn is the byte count of this attempt; n accumulates the total across
		// all attempts.
		var rn int
		rn, msgFlags, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested)
		n += rn
		if err != nil && err != syserr.ErrWouldBlock {
			// Always stop on errors other than would block as we generally
			// won't be able to get any more data. Eat the error if we got
			// any data.
			if n > 0 {
				err = nil
			}
			return
		}
		if err == nil && (s.isPacketBased() || !waitAll || int64(rn) >= dst.NumBytes()) {
			// We got all the data we need.
			return
		}
		// Advance past what this attempt delivered before waiting for more.
		dst = dst.DropFirst(rn)

		// Note: err below shadows the named result for the scope of the if.
		if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
			// A partial read takes precedence over the wait error.
			if n > 0 {
				return n, msgFlags, senderAddr, senderAddrLen, controlMessages, nil
			}
			// A timed-out wait is reported as ErrTryAgain.
			if linuxerr.Equals(linuxerr.ETIMEDOUT, err) {
				return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain
			}
			return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err)
		}
	}
}
  2987  
  2988  // SendMsg implements the linux syscall sendmsg(2) for sockets backed by
  2989  // tcpip.Endpoint.
  2990  func (s *sock) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) {
  2991  	// Reject Unix control messages.
  2992  	if !controlMessages.Unix.Empty() {
  2993  		return 0, syserr.ErrInvalidArgument
  2994  	}
  2995  
  2996  	var addr *tcpip.FullAddress
  2997  	if len(to) > 0 {
  2998  		addrBuf, family, err := socket.AddressAndFamily(to)
  2999  		if err != nil {
  3000  			return 0, err
  3001  		}
  3002  		if !s.checkFamily(family, false /* exact */) {
  3003  			return 0, syserr.ErrInvalidArgument
  3004  		}
  3005  		addrBuf = s.mapFamily(addrBuf, family)
  3006  
  3007  		addr = &addrBuf
  3008  	}
  3009  
  3010  	opts := tcpip.WriteOptions{
  3011  		To:              addr,
  3012  		More:            flags&linux.MSG_MORE != 0,
  3013  		EndOfRecord:     flags&linux.MSG_EOR != 0,
  3014  		ControlMessages: s.linuxToNetstackControlMessages(controlMessages),
  3015  	}
  3016  
  3017  	r := src.Reader(t)
  3018  	var (
  3019  		total int64
  3020  		entry waiter.Entry
  3021  		ch    <-chan struct{}
  3022  	)
  3023  	for {
  3024  		n, err := s.Endpoint.Write(r, opts)
  3025  		total += n
  3026  		if flags&linux.MSG_DONTWAIT != 0 {
  3027  			return int(total), syserr.TranslateNetstackError(err)
  3028  		}
  3029  		block := true
  3030  		switch err.(type) {
  3031  		case nil:
  3032  			block = total != src.NumBytes()
  3033  		case *tcpip.ErrWouldBlock:
  3034  		default:
  3035  			block = false
  3036  		}
  3037  		if block {
  3038  			if ch == nil {
  3039  				// We'll have to block. Register for notification and keep trying to
  3040  				// send all the data.
  3041  				entry, ch = waiter.NewChannelEntry(waiter.WritableEvents)
  3042  				s.EventRegister(&entry)
  3043  				defer s.EventUnregister(&entry)
  3044  			} else {
  3045  				// Don't wait immediately after registration in case more data
  3046  				// became available between when we last checked and when we setup
  3047  				// the notification.
  3048  				if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
  3049  					if linuxerr.Equals(linuxerr.ETIMEDOUT, err) {
  3050  						return int(total), syserr.ErrTryAgain
  3051  					}
  3052  					// handleIOError will consume errors from t.Block if needed.
  3053  					return int(total), syserr.FromError(err)
  3054  				}
  3055  			}
  3056  			continue
  3057  		}
  3058  		return int(total), syserr.TranslateNetstackError(err)
  3059  	}
  3060  }
  3061  
  3062  // Ioctl implements vfs.FileDescriptionImpl.
  3063  func (s *sock) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) {
  3064  	t := kernel.TaskFromContext(ctx)
  3065  	if t == nil {
  3066  		panic("ioctl(2) may only be called from a task goroutine")
  3067  	}
  3068  
  3069  	// SIOCGSTAMP is implemented by netstack rather than all commonEndpoint
  3070  	// sockets.
  3071  	// TODO(b/78348848): Add a commonEndpoint method to support SIOCGSTAMP.
  3072  	switch args[1].Int() {
  3073  	case linux.SIOCGSTAMP:
  3074  		s.readMu.Lock()
  3075  		defer s.readMu.Unlock()
  3076  		if !s.timestampValid {
  3077  			return 0, linuxerr.ENOENT
  3078  		}
  3079  
  3080  		tv := linux.NsecToTimeval(s.timestamp.UnixNano())
  3081  		_, err := tv.CopyOut(t, args[2].Pointer())
  3082  		return 0, err
  3083  
  3084  	case linux.TIOCINQ:
  3085  		v, terr := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption)
  3086  		if terr != nil {
  3087  			return 0, syserr.TranslateNetstackError(terr).ToError()
  3088  		}
  3089  
  3090  		if v > math.MaxInt32 {
  3091  			v = math.MaxInt32
  3092  		}
  3093  
  3094  		// Copy result to userspace.
  3095  		vP := primitive.Int32(v)
  3096  		_, err := vP.CopyOut(t, args[2].Pointer())
  3097  		return 0, err
  3098  	}
  3099  
  3100  	return Ioctl(ctx, s.Endpoint, uio, sysno, args)
  3101  }
  3102  
  3103  // Ioctl performs a socket ioctl.
  3104  func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) {
  3105  	t := kernel.TaskFromContext(ctx)
  3106  	if t == nil {
  3107  		panic("ioctl(2) may only be called from a task goroutine")
  3108  	}
  3109  
  3110  	switch arg := int(args[1].Int()); arg {
  3111  	case linux.SIOCGIFFLAGS,
  3112  		linux.SIOCGIFADDR,
  3113  		linux.SIOCGIFBRDADDR,
  3114  		linux.SIOCGIFDSTADDR,
  3115  		linux.SIOCGIFHWADDR,
  3116  		linux.SIOCGIFINDEX,
  3117  		linux.SIOCGIFMAP,
  3118  		linux.SIOCGIFMETRIC,
  3119  		linux.SIOCGIFMTU,
  3120  		linux.SIOCGIFNAME,
  3121  		linux.SIOCGIFNETMASK,
  3122  		linux.SIOCGIFTXQLEN,
  3123  		linux.SIOCETHTOOL:
  3124  
  3125  		var ifr linux.IFReq
  3126  		if _, err := ifr.CopyIn(t, args[2].Pointer()); err != nil {
  3127  			return 0, err
  3128  		}
  3129  		if err := interfaceIoctl(ctx, io, arg, &ifr); err != nil {
  3130  			return 0, err.ToError()
  3131  		}
  3132  		_, err := ifr.CopyOut(t, args[2].Pointer())
  3133  		return 0, err
  3134  
  3135  	case linux.SIOCGIFCONF:
  3136  		// Return a list of interface addresses or the buffer size
  3137  		// necessary to hold the list.
  3138  		var ifc linux.IFConf
  3139  		if _, err := ifc.CopyIn(t, args[2].Pointer()); err != nil {
  3140  			return 0, err
  3141  		}
  3142  
  3143  		if err := ifconfIoctl(ctx, t, io, &ifc); err != nil {
  3144  			return 0, err
  3145  		}
  3146  
  3147  		_, err := ifc.CopyOut(t, args[2].Pointer())
  3148  		return 0, err
  3149  
  3150  	case linux.TIOCINQ:
  3151  		v, terr := ep.GetSockOptInt(tcpip.ReceiveQueueSizeOption)
  3152  		if terr != nil {
  3153  			return 0, syserr.TranslateNetstackError(terr).ToError()
  3154  		}
  3155  
  3156  		if v > math.MaxInt32 {
  3157  			v = math.MaxInt32
  3158  		}
  3159  		// Copy result to userspace.
  3160  		vP := primitive.Int32(v)
  3161  		_, err := vP.CopyOut(t, args[2].Pointer())
  3162  		return 0, err
  3163  
  3164  	case linux.TIOCOUTQ:
  3165  		v, terr := ep.GetSockOptInt(tcpip.SendQueueSizeOption)
  3166  		if terr != nil {
  3167  			return 0, syserr.TranslateNetstackError(terr).ToError()
  3168  		}
  3169  
  3170  		if v > math.MaxInt32 {
  3171  			v = math.MaxInt32
  3172  		}
  3173  
  3174  		// Copy result to userspace.
  3175  		vP := primitive.Int32(v)
  3176  		_, err := vP.CopyOut(t, args[2].Pointer())
  3177  		return 0, err
  3178  
  3179  	case linux.SIOCGIFMEM, linux.SIOCGIFPFLAGS, linux.SIOCGMIIPHY, linux.SIOCGMIIREG:
  3180  		// Not supported.
  3181  	}
  3182  
  3183  	return 0, linuxerr.ENOTTY
  3184  }
  3185  
  3186  // interfaceIoctl implements interface requests.
  3187  func interfaceIoctl(ctx context.Context, _ usermem.IO, arg int, ifr *linux.IFReq) *syserr.Error {
  3188  	var (
  3189  		iface inet.Interface
  3190  		index int32
  3191  		found bool
  3192  	)
  3193  
  3194  	// Find the relevant device.
  3195  	stk := inet.StackFromContext(ctx)
  3196  	if stk == nil {
  3197  		return syserr.ErrNoDevice
  3198  	}
  3199  
  3200  	// SIOCGIFNAME uses ifr.ifr_ifindex rather than ifr.ifr_name to
  3201  	// identify a device.
  3202  	if arg == linux.SIOCGIFNAME {
  3203  		// Gets the name of the interface given the interface index
  3204  		// stored in ifr_ifindex.
  3205  		index = int32(hostarch.ByteOrder.Uint32(ifr.Data[:4]))
  3206  		if iface, ok := stk.Interfaces()[index]; ok {
  3207  			ifr.SetName(iface.Name)
  3208  			return nil
  3209  		}
  3210  		return syserr.ErrNoDevice
  3211  	}
  3212  
  3213  	// Find the relevant device.
  3214  	for index, iface = range stk.Interfaces() {
  3215  		if iface.Name == ifr.Name() {
  3216  			found = true
  3217  			break
  3218  		}
  3219  	}
  3220  	if !found {
  3221  		return syserr.ErrNoDevice
  3222  	}
  3223  
  3224  	switch arg {
  3225  	case linux.SIOCGIFINDEX:
  3226  		// Copy out the index to the data.
  3227  		hostarch.ByteOrder.PutUint32(ifr.Data[:], uint32(index))
  3228  
  3229  	case linux.SIOCGIFHWADDR:
  3230  		// Copy the hardware address out.
  3231  		//
  3232  		// Refer: https://linux.die.net/man/7/netdevice
  3233  		// SIOCGIFHWADDR, SIOCSIFHWADDR
  3234  		//
  3235  		// Get or set the hardware address of a device using
  3236  		// ifr_hwaddr. The hardware address is specified in a struct
  3237  		// sockaddr. sa_family contains the ARPHRD_* device type,
  3238  		// sa_data the L2 hardware address starting from byte 0. Setting
  3239  		// the hardware address is a privileged operation.
  3240  		hostarch.ByteOrder.PutUint16(ifr.Data[:], iface.DeviceType)
  3241  		n := copy(ifr.Data[2:], iface.Addr)
  3242  		for i := 2 + n; i < len(ifr.Data); i++ {
  3243  			ifr.Data[i] = 0 // Clear padding.
  3244  		}
  3245  
  3246  	case linux.SIOCGIFFLAGS:
  3247  		f, err := interfaceStatusFlags(stk, iface.Name)
  3248  		if err != nil {
  3249  			return err
  3250  		}
  3251  		// Drop the flags that don't fit in the size that we need to return. This
  3252  		// matches Linux behavior.
  3253  		hostarch.ByteOrder.PutUint16(ifr.Data[:2], uint16(f))
  3254  
  3255  	case linux.SIOCGIFADDR:
  3256  		// Copy the IPv4 address out.
  3257  		for _, addr := range stk.InterfaceAddrs()[index] {
  3258  			// This ioctl is only compatible with AF_INET addresses.
  3259  			if addr.Family != linux.AF_INET {
  3260  				continue
  3261  			}
  3262  			copy(ifr.Data[4:8], addr.Addr)
  3263  			break
  3264  		}
  3265  
  3266  	case linux.SIOCGIFMETRIC:
  3267  		// Gets the metric of the device. As per netdevice(7), this
  3268  		// always just sets ifr_metric to 0.
  3269  		hostarch.ByteOrder.PutUint32(ifr.Data[:4], 0)
  3270  
  3271  	case linux.SIOCGIFMTU:
  3272  		// Gets the MTU of the device.
  3273  		hostarch.ByteOrder.PutUint32(ifr.Data[:4], iface.MTU)
  3274  
  3275  	case linux.SIOCGIFMAP:
  3276  		// Gets the hardware parameters of the device.
  3277  		// TODO(gvisor.dev/issue/505): Implement.
  3278  
  3279  	case linux.SIOCGIFTXQLEN:
  3280  		// Gets the transmit queue length of the device.
  3281  		// TODO(gvisor.dev/issue/505): Implement.
  3282  
  3283  	case linux.SIOCGIFDSTADDR:
  3284  		// Gets the destination address of a point-to-point device.
  3285  		// TODO(gvisor.dev/issue/505): Implement.
  3286  
  3287  	case linux.SIOCGIFBRDADDR:
  3288  		// Gets the broadcast address of a device.
  3289  		// TODO(gvisor.dev/issue/505): Implement.
  3290  
  3291  	case linux.SIOCGIFNETMASK:
  3292  		// Gets the network mask of a device.
  3293  		for _, addr := range stk.InterfaceAddrs()[index] {
  3294  			// This ioctl is only compatible with AF_INET addresses.
  3295  			if addr.Family != linux.AF_INET {
  3296  				continue
  3297  			}
  3298  			// Populate ifr.ifr_netmask (type sockaddr).
  3299  			hostarch.ByteOrder.PutUint16(ifr.Data[0:], uint16(linux.AF_INET))
  3300  			hostarch.ByteOrder.PutUint16(ifr.Data[2:], 0)
  3301  			var mask uint32 = 0xffffffff << (32 - addr.PrefixLen)
  3302  			// Netmask is expected to be returned as a big endian
  3303  			// value.
  3304  			binary.BigEndian.PutUint32(ifr.Data[4:8], mask)
  3305  			break
  3306  		}
  3307  
  3308  	case linux.SIOCETHTOOL:
  3309  		// Stubbed out for now, Ideally we should implement the required
  3310  		// sub-commands for ETHTOOL
  3311  		//
  3312  		// See:
  3313  		// https://github.com/torvalds/linux/blob/aa0c9086b40c17a7ad94425b3b70dd1fdd7497bf/net/core/dev_ioctl.c
  3314  		return syserr.ErrEndpointOperation
  3315  
  3316  	default:
  3317  		// Not a valid call.
  3318  		return syserr.ErrInvalidArgument
  3319  	}
  3320  
  3321  	return nil
  3322  }
  3323  
  3324  // ifconfIoctl populates a struct ifconf for the SIOCGIFCONF ioctl.
  3325  func ifconfIoctl(ctx context.Context, t *kernel.Task, _ usermem.IO, ifc *linux.IFConf) error {
  3326  	// If Ptr is NULL, return the necessary buffer size via Len.
  3327  	// Otherwise, write up to Len bytes starting at Ptr containing ifreq
  3328  	// structs.
  3329  	stk := inet.StackFromContext(ctx)
  3330  	if stk == nil {
  3331  		return syserr.ErrNoDevice.ToError()
  3332  	}
  3333  
  3334  	if ifc.Ptr == 0 {
  3335  		ifc.Len = int32(len(stk.Interfaces())) * int32(linux.SizeOfIFReq)
  3336  		return nil
  3337  	}
  3338  
  3339  	max := ifc.Len
  3340  	ifc.Len = 0
  3341  	for key, ifaceAddrs := range stk.InterfaceAddrs() {
  3342  		iface := stk.Interfaces()[key]
  3343  		for _, ifaceAddr := range ifaceAddrs {
  3344  			// Don't write past the end of the buffer.
  3345  			if ifc.Len+int32(linux.SizeOfIFReq) > max {
  3346  				break
  3347  			}
  3348  			if ifaceAddr.Family != linux.AF_INET {
  3349  				continue
  3350  			}
  3351  
  3352  			// Populate ifr.ifr_addr.
  3353  			ifr := linux.IFReq{}
  3354  			ifr.SetName(iface.Name)
  3355  			hostarch.ByteOrder.PutUint16(ifr.Data[0:2], uint16(ifaceAddr.Family))
  3356  			hostarch.ByteOrder.PutUint16(ifr.Data[2:4], 0)
  3357  			copy(ifr.Data[4:8], ifaceAddr.Addr[:4])
  3358  
  3359  			// Copy the ifr to userspace.
  3360  			dst := uintptr(ifc.Ptr) + uintptr(ifc.Len)
  3361  			ifc.Len += int32(linux.SizeOfIFReq)
  3362  			if _, err := ifr.CopyOut(t, hostarch.Addr(dst)); err != nil {
  3363  				return err
  3364  			}
  3365  		}
  3366  	}
  3367  	return nil
  3368  }
  3369  
  3370  // interfaceStatusFlags returns status flags for an interface in the stack.
  3371  // Flag values and meanings are described in greater detail in netdevice(7) in
  3372  // the SIOCGIFFLAGS section.
  3373  func interfaceStatusFlags(stack inet.Stack, name string) (uint32, *syserr.Error) {
  3374  	// We should only ever be passed a netstack.Stack.
  3375  	epstack, ok := stack.(*Stack)
  3376  	if !ok {
  3377  		return 0, errStackType
  3378  	}
  3379  
  3380  	// Find the NIC corresponding to this interface.
  3381  	for _, info := range epstack.Stack.NICInfo() {
  3382  		if info.Name == name {
  3383  			return nicStateFlagsToLinux(info.Flags), nil
  3384  		}
  3385  	}
  3386  	return 0, syserr.ErrNoDevice
  3387  }
  3388  
  3389  func nicStateFlagsToLinux(f stack.NICStateFlags) uint32 {
  3390  	var rv uint32
  3391  	if f.Up {
  3392  		rv |= linux.IFF_UP | linux.IFF_LOWER_UP
  3393  	}
  3394  	if f.Running {
  3395  		rv |= linux.IFF_RUNNING
  3396  	}
  3397  	if f.Promiscuous {
  3398  		rv |= linux.IFF_PROMISC
  3399  	}
  3400  	if f.Loopback {
  3401  		rv |= linux.IFF_LOOPBACK
  3402  	}
  3403  	return rv
  3404  }
  3405  
  3406  // State implements socket.Socket.State. State translates the internal state
  3407  // returned by netstack to values defined by Linux.
  3408  func (s *sock) State() uint32 {
  3409  	if s.family != linux.AF_INET && s.family != linux.AF_INET6 {
  3410  		// States not implemented for this socket's family.
  3411  		return 0
  3412  	}
  3413  
  3414  	switch {
  3415  	case socket.IsTCP(s):
  3416  		// TCP socket.
  3417  		switch tcp.EndpointState(s.Endpoint.State()) {
  3418  		case tcp.StateEstablished:
  3419  			return linux.TCP_ESTABLISHED
  3420  		case tcp.StateSynSent:
  3421  			return linux.TCP_SYN_SENT
  3422  		case tcp.StateSynRecv:
  3423  			return linux.TCP_SYN_RECV
  3424  		case tcp.StateFinWait1:
  3425  			return linux.TCP_FIN_WAIT1
  3426  		case tcp.StateFinWait2:
  3427  			return linux.TCP_FIN_WAIT2
  3428  		case tcp.StateTimeWait:
  3429  			return linux.TCP_TIME_WAIT
  3430  		case tcp.StateClose, tcp.StateInitial, tcp.StateBound, tcp.StateConnecting, tcp.StateError:
  3431  			return linux.TCP_CLOSE
  3432  		case tcp.StateCloseWait:
  3433  			return linux.TCP_CLOSE_WAIT
  3434  		case tcp.StateLastAck:
  3435  			return linux.TCP_LAST_ACK
  3436  		case tcp.StateListen:
  3437  			return linux.TCP_LISTEN
  3438  		case tcp.StateClosing:
  3439  			return linux.TCP_CLOSING
  3440  		default:
  3441  			// Internal or unknown state.
  3442  			return 0
  3443  		}
  3444  	case socket.IsUDP(s):
  3445  		// UDP socket.
  3446  		switch transport.DatagramEndpointState(s.Endpoint.State()) {
  3447  		case transport.DatagramEndpointStateInitial, transport.DatagramEndpointStateBound, transport.DatagramEndpointStateClosed:
  3448  			return linux.TCP_CLOSE
  3449  		case transport.DatagramEndpointStateConnected:
  3450  			return linux.TCP_ESTABLISHED
  3451  		default:
  3452  			return 0
  3453  		}
  3454  	case socket.IsICMP(s):
  3455  		// We don't support this yet.
  3456  	case socket.IsRaw(s):
  3457  		// We don't support this yet.
  3458  	default:
  3459  		// Unknown transport protocol, how did we make this socket?
  3460  		log.Warningf("Unknown transport protocol for an existing socket: family=%v, type=%v, protocol=%v, internal type %v", s.family, s.skType, s.protocol, reflect.TypeOf(s.Endpoint).Elem())
  3461  		return 0
  3462  	}
  3463  
  3464  	return 0
  3465  }
  3466  
  3467  // Type implements socket.Socket.Type.
  3468  func (s *sock) Type() (family int, skType linux.SockType, protocol int) {
  3469  	return s.family, s.skType, s.protocol
  3470  }
  3471  
// EventRegister implements waiter.Waitable.EventRegister.
//
// It registers e with the socket's Queue and always returns nil.
func (s *sock) EventRegister(e *waiter.Entry) error {
	s.Queue.EventRegister(e)
	return nil
}
  3477  
// EventUnregister implements waiter.Waitable.EventUnregister.
//
// It removes e from the socket's Queue.
func (s *sock) EventUnregister(e *waiter.Entry) {
	s.Queue.EventUnregister(e)
}