github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/socket/netstack/netstack.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package netstack provides an implementation of the socket.Socket interface
    16  // that is backed by a tcpip.Endpoint.
    17  //
    18  // It does not depend on any particular endpoint implementation, and thus can
    19  // be used to expose certain endpoints to the sentry while leaving others out,
    20  // for example, TCP endpoints and Unix-domain endpoints.
    21  //
    22  // Lock ordering: netstack => mm: ioSequenceReadWriter copies user memory inside
    23  // tcpip.Endpoint.Write(). Netstack is allowed to (and does) hold locks during
    24  // this operation.
    25  package netstack
    26  
    27  import (
    28  	"bytes"
    29  	"encoding/binary"
    30  	"fmt"
    31  	"io"
    32  	"io/ioutil"
    33  	"math"
    34  	"reflect"
    35  	"time"
    36  
    37  	"golang.org/x/sys/unix"
    38  	"google.golang.org/protobuf/proto"
    39  	"github.com/metacubex/gvisor/pkg/abi/linux"
    40  	"github.com/metacubex/gvisor/pkg/abi/linux/errno"
    41  	"github.com/metacubex/gvisor/pkg/context"
    42  	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
    43  	"github.com/metacubex/gvisor/pkg/eventchannel"
    44  	"github.com/metacubex/gvisor/pkg/hostarch"
    45  	"github.com/metacubex/gvisor/pkg/log"
    46  	"github.com/metacubex/gvisor/pkg/marshal"
    47  	"github.com/metacubex/gvisor/pkg/marshal/primitive"
    48  	"github.com/metacubex/gvisor/pkg/metric"
    49  	"github.com/metacubex/gvisor/pkg/sentry/arch"
    50  	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/sockfs"
    51  	"github.com/metacubex/gvisor/pkg/sentry/inet"
    52  	"github.com/metacubex/gvisor/pkg/sentry/kernel"
    53  	"github.com/metacubex/gvisor/pkg/sentry/kernel/auth"
    54  	ktime "github.com/metacubex/gvisor/pkg/sentry/kernel/time"
    55  	"github.com/metacubex/gvisor/pkg/sentry/socket"
    56  	"github.com/metacubex/gvisor/pkg/sentry/socket/netfilter"
    57  	epb "github.com/metacubex/gvisor/pkg/sentry/socket/netstack/events_go_proto"
    58  	"github.com/metacubex/gvisor/pkg/sentry/vfs"
    59  	"github.com/metacubex/gvisor/pkg/sync"
    60  	"github.com/metacubex/gvisor/pkg/syserr"
    61  	"github.com/metacubex/gvisor/pkg/tcpip"
    62  	"github.com/metacubex/gvisor/pkg/tcpip/header"
    63  	"github.com/metacubex/gvisor/pkg/tcpip/stack"
    64  	"github.com/metacubex/gvisor/pkg/tcpip/transport"
    65  	"github.com/metacubex/gvisor/pkg/tcpip/transport/tcp"
    66  	"github.com/metacubex/gvisor/pkg/usermem"
    67  	"github.com/metacubex/gvisor/pkg/waiter"
    68  )
    69  
// bitsPerUint32 is the number of bits in a uint32.
const bitsPerUint32 = 32
    71  
    72  // statCounterValue returns a function usable as callback function when defining a gVisor Sentry
    73  // metric that contains the value counted by the StatCounter.
    74  // This avoids a dependency loop in the tcpip package.
    75  func statCounterValue(cm *tcpip.StatCounter) func(...*metric.FieldValue) uint64 {
    76  	return func(...*metric.FieldValue) uint64 {
    77  		return cm.Value()
    78  	}
    79  }
    80  
    81  func mustCreateMetric(name, description string) *tcpip.StatCounter {
    82  	var cm tcpip.StatCounter
    83  	metric.MustRegisterCustomUint64Metric(name, true /* cumulative */, false /* sync */, description, statCounterValue(&cm))
    84  	return &cm
    85  }
    86  
    87  func mustCreateGauge(name, description string) *tcpip.StatCounter {
    88  	var cm tcpip.StatCounter
    89  	metric.MustRegisterCustomUint64Metric(name, false /* cumulative */, false /* sync */, description, statCounterValue(&cm))
    90  	return &cm
    91  }
    92  
    93  // Metrics contains metrics exported by netstack.
    94  var Metrics = tcpip.Stats{
    95  	DroppedPackets: mustCreateMetric("/netstack/dropped_packets", "Number of packets dropped at the transport layer."),
    96  	NICs: tcpip.NICStats{
    97  		MalformedL4RcvdPackets: mustCreateMetric("/netstack/nic/malformed_l4_received_packets", "Number of packets received that failed L4 header parsing."),
    98  		Tx: tcpip.NICPacketStats{
    99  			Packets: mustCreateMetric("/netstack/nic/tx/packets", "Number of packets transmitted."),
   100  			Bytes:   mustCreateMetric("/netstack/nic/tx/bytes", "Number of bytes transmitted."),
   101  		},
   102  		TxPacketsDroppedNoBufferSpace: mustCreateMetric("/netstack/nic/tx_packets_dropped_no_buffer_space", "Number of TX packets dropped as a result of no buffer space errors."),
   103  		Rx: tcpip.NICPacketStats{
   104  			Packets: mustCreateMetric("/netstack/nic/rx/packets", "Number of packets received."),
   105  			Bytes:   mustCreateMetric("/netstack/nic/rx/bytes", "Number of bytes received."),
   106  		},
   107  		DisabledRx: tcpip.NICPacketStats{
   108  			Packets: mustCreateMetric("/netstack/nic/disabled_rx/packets", "Number of packets received on disabled NICs."),
   109  			Bytes:   mustCreateMetric("/netstack/nic/disabled_rx/bytes", "Number of bytes received on disabled NICs."),
   110  		},
   111  		Neighbor: tcpip.NICNeighborStats{
   112  			UnreachableEntryLookups:                    mustCreateMetric("/netstack/nic/neighbor/unreachable_entry_loopups", "Number of lookups performed on a neighbor entry in Unreachable state."),
   113  			DroppedConfirmationForNoninitiatedNeighbor: mustCreateMetric("/netstack/nic/neighbor/dropped_confirmation_for_noninitiated_neighbor", "Number of advertisements received that don't match an entry in the neighbor cache."),
   114  			DroppedInvalidLinkAddressConfirmations:     mustCreateMetric("/netstack/nic/neighbor/dropped_invalid_link_address_confirmations", "Number of advertisements dropped because they have empty source link-layer addresses"),
   115  		},
   116  	},
   117  	ICMP: tcpip.ICMPStats{
   118  		V4: tcpip.ICMPv4Stats{
   119  			PacketsSent: tcpip.ICMPv4SentPacketStats{
   120  				ICMPv4PacketStats: tcpip.ICMPv4PacketStats{
   121  					EchoRequest:    mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_request", "Number of ICMPv4 echo request packets sent."),
   122  					EchoReply:      mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_reply", "Number of ICMPv4 echo reply packets sent."),
   123  					DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_sent/dst_unreachable", "Number of ICMPv4 destination unreachable packets sent."),
   124  					SrcQuench:      mustCreateMetric("/netstack/icmp/v4/packets_sent/src_quench", "Number of ICMPv4 source quench packets sent."),
   125  					Redirect:       mustCreateMetric("/netstack/icmp/v4/packets_sent/redirect", "Number of ICMPv4 redirect packets sent."),
   126  					TimeExceeded:   mustCreateMetric("/netstack/icmp/v4/packets_sent/time_exceeded", "Number of ICMPv4 time exceeded packets sent."),
   127  					ParamProblem:   mustCreateMetric("/netstack/icmp/v4/packets_sent/param_problem", "Number of ICMPv4 parameter problem packets sent."),
   128  					Timestamp:      mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp", "Number of ICMPv4 timestamp packets sent."),
   129  					TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp_reply", "Number of ICMPv4 timestamp reply packets sent."),
   130  					InfoRequest:    mustCreateMetric("/netstack/icmp/v4/packets_sent/info_request", "Number of ICMPv4 information request packets sent."),
   131  					InfoReply:      mustCreateMetric("/netstack/icmp/v4/packets_sent/info_reply", "Number of ICMPv4 information reply packets sent."),
   132  				},
   133  				Dropped:     mustCreateMetric("/netstack/icmp/v4/packets_sent/dropped", "Number of ICMPv4 packets dropped due to link layer errors."),
   134  				RateLimited: mustCreateMetric("/netstack/icmp/v4/packets_sent/rate_limited", "Number of ICMPv4 packets dropped due to rate limit being exceeded."),
   135  			},
   136  			PacketsReceived: tcpip.ICMPv4ReceivedPacketStats{
   137  				ICMPv4PacketStats: tcpip.ICMPv4PacketStats{
   138  					EchoRequest:    mustCreateMetric("/netstack/icmp/v4/packets_received/echo_request", "Number of ICMPv4 echo request packets received."),
   139  					EchoReply:      mustCreateMetric("/netstack/icmp/v4/packets_received/echo_reply", "Number of ICMPv4 echo reply packets received."),
   140  					DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_received/dst_unreachable", "Number of ICMPv4 destination unreachable packets received."),
   141  					SrcQuench:      mustCreateMetric("/netstack/icmp/v4/packets_received/src_quench", "Number of ICMPv4 source quench packets received."),
   142  					Redirect:       mustCreateMetric("/netstack/icmp/v4/packets_received/redirect", "Number of ICMPv4 redirect packets received."),
   143  					TimeExceeded:   mustCreateMetric("/netstack/icmp/v4/packets_received/time_exceeded", "Number of ICMPv4 time exceeded packets received."),
   144  					ParamProblem:   mustCreateMetric("/netstack/icmp/v4/packets_received/param_problem", "Number of ICMPv4 parameter problem packets received."),
   145  					Timestamp:      mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp", "Number of ICMPv4 timestamp packets received."),
   146  					TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp_reply", "Number of ICMPv4 timestamp reply packets received."),
   147  					InfoRequest:    mustCreateMetric("/netstack/icmp/v4/packets_received/info_request", "Number of ICMPv4 information request packets received."),
   148  					InfoReply:      mustCreateMetric("/netstack/icmp/v4/packets_received/info_reply", "Number of ICMPv4 information reply packets received."),
   149  				},
   150  				Invalid: mustCreateMetric("/netstack/icmp/v4/packets_received/invalid", "Number of ICMPv4 packets received that the transport layer could not parse."),
   151  			},
   152  		},
   153  		V6: tcpip.ICMPv6Stats{
   154  			PacketsSent: tcpip.ICMPv6SentPacketStats{
   155  				ICMPv6PacketStats: tcpip.ICMPv6PacketStats{
   156  					EchoRequest:             mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_request", "Number of ICMPv6 echo request packets sent."),
   157  					EchoReply:               mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_reply", "Number of ICMPv6 echo reply packets sent."),
   158  					DstUnreachable:          mustCreateMetric("/netstack/icmp/v6/packets_sent/dst_unreachable", "Number of ICMPv6 destination unreachable packets sent."),
   159  					PacketTooBig:            mustCreateMetric("/netstack/icmp/v6/packets_sent/packet_too_big", "Number of ICMPv6 packet too big packets sent."),
   160  					TimeExceeded:            mustCreateMetric("/netstack/icmp/v6/packets_sent/time_exceeded", "Number of ICMPv6 time exceeded packets sent."),
   161  					ParamProblem:            mustCreateMetric("/netstack/icmp/v6/packets_sent/param_problem", "Number of ICMPv6 parameter problem packets sent."),
   162  					RouterSolicit:           mustCreateMetric("/netstack/icmp/v6/packets_sent/router_solicit", "Number of ICMPv6 router solicit packets sent."),
   163  					RouterAdvert:            mustCreateMetric("/netstack/icmp/v6/packets_sent/router_advert", "Number of ICMPv6 router advert packets sent."),
   164  					NeighborSolicit:         mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_solicit", "Number of ICMPv6 neighbor solicit packets sent."),
   165  					NeighborAdvert:          mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_advert", "Number of ICMPv6 neighbor advert packets sent."),
   166  					RedirectMsg:             mustCreateMetric("/netstack/icmp/v6/packets_sent/redirect_msg", "Number of ICMPv6 redirect message packets sent."),
   167  					MulticastListenerQuery:  mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_query", "Number of ICMPv6 multicast listener query packets sent."),
   168  					MulticastListenerReport: mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_report", "Number of ICMPv6 multicast listener report packets sent."),
   169  					MulticastListenerDone:   mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_done", "Number of ICMPv6 multicast listener done packets sent."),
   170  				},
   171  				Dropped:     mustCreateMetric("/netstack/icmp/v6/packets_sent/dropped", "Number of ICMPv6 packets dropped due to link layer errors."),
   172  				RateLimited: mustCreateMetric("/netstack/icmp/v6/packets_sent/rate_limited", "Number of ICMPv6 packets dropped due to rate limit being exceeded."),
   173  			},
   174  			PacketsReceived: tcpip.ICMPv6ReceivedPacketStats{
   175  				ICMPv6PacketStats: tcpip.ICMPv6PacketStats{
   176  					EchoRequest:             mustCreateMetric("/netstack/icmp/v6/packets_received/echo_request", "Number of ICMPv6 echo request packets received."),
   177  					EchoReply:               mustCreateMetric("/netstack/icmp/v6/packets_received/echo_reply", "Number of ICMPv6 echo reply packets received."),
   178  					DstUnreachable:          mustCreateMetric("/netstack/icmp/v6/packets_received/dst_unreachable", "Number of ICMPv6 destination unreachable packets received."),
   179  					PacketTooBig:            mustCreateMetric("/netstack/icmp/v6/packets_received/packet_too_big", "Number of ICMPv6 packet too big packets received."),
   180  					TimeExceeded:            mustCreateMetric("/netstack/icmp/v6/packets_received/time_exceeded", "Number of ICMPv6 time exceeded packets received."),
   181  					ParamProblem:            mustCreateMetric("/netstack/icmp/v6/packets_received/param_problem", "Number of ICMPv6 parameter problem packets received."),
   182  					RouterSolicit:           mustCreateMetric("/netstack/icmp/v6/packets_received/router_solicit", "Number of ICMPv6 router solicit packets received."),
   183  					RouterAdvert:            mustCreateMetric("/netstack/icmp/v6/packets_received/router_advert", "Number of ICMPv6 router advert packets received."),
   184  					NeighborSolicit:         mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_solicit", "Number of ICMPv6 neighbor solicit packets received."),
   185  					NeighborAdvert:          mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_advert", "Number of ICMPv6 neighbor advert packets received."),
   186  					RedirectMsg:             mustCreateMetric("/netstack/icmp/v6/packets_received/redirect_msg", "Number of ICMPv6 redirect message packets received."),
   187  					MulticastListenerQuery:  mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_query", "Number of ICMPv6 multicast listener query packets received."),
   188  					MulticastListenerReport: mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_report", "Number of ICMPv6 multicast listener report packets sent."),
   189  					MulticastListenerDone:   mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_done", "Number of ICMPv6 multicast listener done packets sent."),
   190  				},
   191  				Unrecognized:                   mustCreateMetric("/netstack/icmp/v6/packets_received/unrecognized", "Number of ICMPv6 packets received that the transport layer does not know how to parse."),
   192  				Invalid:                        mustCreateMetric("/netstack/icmp/v6/packets_received/invalid", "Number of ICMPv6 packets received that the transport layer could not parse."),
   193  				RouterOnlyPacketsDroppedByHost: mustCreateMetric("/netstack/icmp/v6/packets_received/router_only_packets_dropped_by_host", "Number of ICMPv6 packets dropped due to being router-specific packets."),
   194  			},
   195  		},
   196  	},
   197  	IGMP: tcpip.IGMPStats{
   198  		PacketsSent: tcpip.IGMPSentPacketStats{
   199  			IGMPPacketStats: tcpip.IGMPPacketStats{
   200  				MembershipQuery:    mustCreateMetric("/netstack/igmp/packets_sent/membership_query", "Number of IGMP Membership Query messages sent."),
   201  				V1MembershipReport: mustCreateMetric("/netstack/igmp/packets_sent/v1_membership_report", "Number of IGMPv1 Membership Report messages sent."),
   202  				V2MembershipReport: mustCreateMetric("/netstack/igmp/packets_sent/v2_membership_report", "Number of IGMPv2 Membership Report messages sent."),
   203  				LeaveGroup:         mustCreateMetric("/netstack/igmp/packets_sent/leave_group", "Number of IGMP Leave Group messages sent."),
   204  			},
   205  			Dropped: mustCreateMetric("/netstack/igmp/packets_sent/dropped", "Number of IGMP packets dropped due to link layer errors."),
   206  		},
   207  		PacketsReceived: tcpip.IGMPReceivedPacketStats{
   208  			IGMPPacketStats: tcpip.IGMPPacketStats{
   209  				MembershipQuery:    mustCreateMetric("/netstack/igmp/packets_received/membership_query", "Number of IGMP Membership Query messages received."),
   210  				V1MembershipReport: mustCreateMetric("/netstack/igmp/packets_received/v1_membership_report", "Number of IGMPv1 Membership Report messages received."),
   211  				V2MembershipReport: mustCreateMetric("/netstack/igmp/packets_received/v2_membership_report", "Number of IGMPv2 Membership Report messages received."),
   212  				LeaveGroup:         mustCreateMetric("/netstack/igmp/packets_received/leave_group", "Number of IGMP Leave Group messages received."),
   213  			},
   214  			Invalid:        mustCreateMetric("/netstack/igmp/packets_received/invalid", "Number of IGMP packets received that could not be parsed."),
   215  			ChecksumErrors: mustCreateMetric("/netstack/igmp/packets_received/checksum_errors", "Number of received IGMP packets with bad checksums."),
   216  			Unrecognized:   mustCreateMetric("/netstack/igmp/packets_received/unrecognized", "Number of unrecognized IGMP packets received."),
   217  		},
   218  	},
   219  	IP: tcpip.IPStats{
   220  		PacketsReceived:                     mustCreateMetric("/netstack/ip/packets_received", "Number of IP packets received from the link layer in nic.DeliverNetworkPacket."),
   221  		DisabledPacketsReceived:             mustCreateMetric("/netstack/ip/disabled_packets_received", "Number of IP packets received from the link layer when the IP layer is disabled."),
   222  		InvalidDestinationAddressesReceived: mustCreateMetric("/netstack/ip/invalid_addresses_received", "Number of IP packets received with an unknown or invalid destination address."),
   223  		InvalidSourceAddressesReceived:      mustCreateMetric("/netstack/ip/invalid_source_addresses_received", "Number of IP packets received with an unknown or invalid source address."),
   224  		PacketsDelivered:                    mustCreateMetric("/netstack/ip/packets_delivered", "Number of incoming IP packets that are successfully delivered to the transport layer via HandlePacket."),
   225  		PacketsSent:                         mustCreateMetric("/netstack/ip/packets_sent", "Number of IP packets sent via WritePacket."),
   226  		OutgoingPacketErrors:                mustCreateMetric("/netstack/ip/outgoing_packet_errors", "Number of IP packets which failed to write to a link-layer endpoint."),
   227  		MalformedPacketsReceived:            mustCreateMetric("/netstack/ip/malformed_packets_received", "Number of IP packets which failed IP header validation checks."),
   228  		MalformedFragmentsReceived:          mustCreateMetric("/netstack/ip/malformed_fragments_received", "Number of IP fragments which failed IP fragment validation checks."),
   229  		IPTablesPreroutingDropped:           mustCreateMetric("/netstack/ip/iptables/prerouting_dropped", "Number of IP packets dropped in the Prerouting chain."),
   230  		IPTablesInputDropped:                mustCreateMetric("/netstack/ip/iptables/input_dropped", "Number of IP packets dropped in the Input chain."),
   231  		IPTablesOutputDropped:               mustCreateMetric("/netstack/ip/iptables/output_dropped", "Number of IP packets dropped in the Output chain."),
   232  		OptionTimestampReceived:             mustCreateMetric("/netstack/ip/options/timestamp_received", "Number of timestamp options found in received IP packets."),
   233  		OptionRecordRouteReceived:           mustCreateMetric("/netstack/ip/options/record_route_received", "Number of record route options found in received IP packets."),
   234  		OptionRouterAlertReceived:           mustCreateMetric("/netstack/ip/options/router_alert_received", "Number of router alert options found in received IP packets."),
   235  		OptionUnknownReceived:               mustCreateMetric("/netstack/ip/options/unknown_received", "Number of unknown options found in received IP packets."),
   236  		Forwarding: tcpip.IPForwardingStats{
   237  			Unrouteable:            mustCreateMetric("/netstack/ip/forwarding/unrouteable", "Number of IP packets received which couldn't be routed and thus were not forwarded."),
   238  			ExhaustedTTL:           mustCreateMetric("/netstack/ip/forwarding/exhausted_ttl", "Number of IP packets received which could not be forwarded due to an exhausted TTL."),
   239  			LinkLocalSource:        mustCreateMetric("/netstack/ip/forwarding/link_local_source_address", "Number of IP packets received which could not be forwarded due to a link-local source address."),
   240  			LinkLocalDestination:   mustCreateMetric("/netstack/ip/forwarding/link_local_destination_address", "Number of IP packets received which could not be forwarded due to a link-local destination address."),
   241  			ExtensionHeaderProblem: mustCreateMetric("/netstack/ip/forwarding/extension_header_problem", "Number of IP packets received which could not be forwarded due to a problem processing their IPv6 extension headers."),
   242  			PacketTooBig:           mustCreateMetric("/netstack/ip/forwarding/packet_too_big", "Number of IP packets received which could not be forwarded because they could not fit within the outgoing MTU."),
   243  			HostUnreachable:        mustCreateMetric("/netstack/ip/forwarding/host_unreachable", "Number of IP packets received which could not be forwarded due to unresolvable next hop."),
   244  			Errors:                 mustCreateMetric("/netstack/ip/forwarding/errors", "Number of IP packets which couldn't be forwarded."),
   245  		},
   246  	},
   247  	ARP: tcpip.ARPStats{
   248  		PacketsReceived:                                 mustCreateMetric("/netstack/arp/packets_received", "Number of ARP packets received from the link layer."),
   249  		DisabledPacketsReceived:                         mustCreateMetric("/netstack/arp/disabled_packets_received", "Number of ARP packets received from the link layer when the ARP layer is disabled."),
   250  		MalformedPacketsReceived:                        mustCreateMetric("/netstack/arp/malformed_packets_received", "Number of ARP packets which failed ARP header validation checks."),
   251  		RequestsReceived:                                mustCreateMetric("/netstack/arp/requests_received", "Number of ARP requests received."),
   252  		RequestsReceivedUnknownTargetAddress:            mustCreateMetric("/netstack/arp/requests_received_unknown_addr", "Number of ARP requests received with an unknown target address."),
   253  		OutgoingRequestInterfaceHasNoLocalAddressErrors: mustCreateMetric("/netstack/arp/outgoing_requests_iface_has_no_addr", "Number of failed attempts to send an ARP request with an interface that has no network address."),
   254  		OutgoingRequestBadLocalAddressErrors:            mustCreateMetric("/netstack/arp/outgoing_requests_invalid_local_addr", "Number of failed attempts to send an ARP request with a provided local address that is invalid."),
   255  		OutgoingRequestsDropped:                         mustCreateMetric("/netstack/arp/outgoing_requests_dropped", "Number of ARP requests which failed to write to a link-layer endpoint."),
   256  		OutgoingRequestsSent:                            mustCreateMetric("/netstack/arp/outgoing_requests_sent", "Number of ARP requests sent."),
   257  		RepliesReceived:                                 mustCreateMetric("/netstack/arp/replies_received", "Number of ARP replies received."),
   258  		OutgoingRepliesDropped:                          mustCreateMetric("/netstack/arp/outgoing_replies_dropped", "Number of ARP replies which failed to write to a link-layer endpoint."),
   259  		OutgoingRepliesSent:                             mustCreateMetric("/netstack/arp/outgoing_replies_sent", "Number of ARP replies sent."),
   260  	},
   261  	TCP: tcpip.TCPStats{
   262  		ActiveConnectionOpenings:           mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."),
   263  		PassiveConnectionOpenings:          mustCreateMetric("/netstack/tcp/passive_connection_openings", "Number of connections opened successfully via Listen."),
   264  		CurrentEstablished:                 mustCreateGauge("/netstack/tcp/current_established", "Number of connections in ESTABLISHED state now."),
   265  		CurrentConnected:                   mustCreateGauge("/netstack/tcp/current_open", "Number of connections that are in connected state."),
   266  		EstablishedResets:                  mustCreateMetric("/netstack/tcp/established_resets", "Number of times TCP connections have made a direct transition to the CLOSED state from either the ESTABLISHED state or the CLOSE-WAIT state"),
   267  		EstablishedClosed:                  mustCreateMetric("/netstack/tcp/established_closed", "Number of times established TCP connections made a transition to CLOSED state."),
   268  		EstablishedTimedout:                mustCreateMetric("/netstack/tcp/established_timedout", "Number of times  an established connection was reset because of keep-alive time out."),
   269  		ListenOverflowSynDrop:              mustCreateMetric("/netstack/tcp/listen_overflow_syn_drop", "Number of times the listen queue overflowed and a SYN was dropped."),
   270  		ListenOverflowAckDrop:              mustCreateMetric("/netstack/tcp/listen_overflow_ack_drop", "Number of times the listen queue overflowed and the final ACK in the handshake was dropped."),
   271  		ListenOverflowSynCookieSent:        mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_sent", "Number of times a SYN cookie was sent."),
   272  		ListenOverflowSynCookieRcvd:        mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_rcvd", "Number of times a SYN cookie was received."),
   273  		ListenOverflowInvalidSynCookieRcvd: mustCreateMetric("/netstack/tcp/listen_overflow_invalid_syn_cookie_rcvd", "Number of times an invalid SYN cookie was received."),
   274  		FailedConnectionAttempts:           mustCreateMetric("/netstack/tcp/failed_connection_attempts", "Number of calls to Connect or Listen (active and passive openings, respectively) that end in an error."),
   275  		ValidSegmentsReceived:              mustCreateMetric("/netstack/tcp/valid_segments_received", "Number of TCP segments received that the transport layer successfully parsed."),
   276  		InvalidSegmentsReceived:            mustCreateMetric("/netstack/tcp/invalid_segments_received", "Number of TCP segments received that the transport layer could not parse."),
   277  		SegmentsSent:                       mustCreateMetric("/netstack/tcp/segments_sent", "Number of TCP segments sent."),
   278  		SegmentSendErrors:                  mustCreateMetric("/netstack/tcp/segment_send_errors", "Number of TCP segments failed to be sent."),
   279  		ResetsSent:                         mustCreateMetric("/netstack/tcp/resets_sent", "Number of TCP resets sent."),
   280  		ResetsReceived:                     mustCreateMetric("/netstack/tcp/resets_received", "Number of TCP resets received."),
   281  		Retransmits:                        mustCreateMetric("/netstack/tcp/retransmits", "Number of TCP segments retransmitted."),
   282  		FastRecovery:                       mustCreateMetric("/netstack/tcp/fast_recovery", "Number of times fast recovery was used to recover from packet loss."),
   283  		SACKRecovery:                       mustCreateMetric("/netstack/tcp/sack_recovery", "Number of times SACK recovery was used to recover from packet loss."),
   284  		TLPRecovery:                        mustCreateMetric("/netstack/tcp/tlp_recovery", "Number of times tail loss probe triggers recovery from tail loss."),
   285  		SlowStartRetransmits:               mustCreateMetric("/netstack/tcp/slow_start_retransmits", "Number of segments retransmitted in slow start mode."),
   286  		FastRetransmit:                     mustCreateMetric("/netstack/tcp/fast_retransmit", "Number of TCP segments which were fast retransmitted."),
   287  		Timeouts:                           mustCreateMetric("/netstack/tcp/timeouts", "Number of times RTO expired."),
   288  		ChecksumErrors:                     mustCreateMetric("/netstack/tcp/checksum_errors", "Number of segments dropped due to bad checksums."),
   289  		FailedPortReservations:             mustCreateMetric("/netstack/tcp/failed_port_reservations", "Number of time TCP failed to reserve a port."),
   290  		SegmentsAckedWithDSACK:             mustCreateMetric("/netstack/tcp/segments_acked_with_dsack", "Number of segments for which DSACK was received."),
   291  		SpuriousRecovery:                   mustCreateMetric("/netstack/tcp/spurious_recovery", "Number of times the connection entered loss recovery spuriously."),
   292  		SpuriousRTORecovery:                mustCreateMetric("/netstack/tcp/spurious_rto_recovery", "Number of times the connection entered RTO spuriously."),
   293  		ForwardMaxInFlightDrop:             mustCreateMetric("/netstack/tcp/forward_max_in_flight_drop", "Number of connection requests dropped due to exceeding in-flight limit."),
   294  	},
   295  	UDP: tcpip.UDPStats{
   296  		PacketsReceived:          mustCreateMetric("/netstack/udp/packets_received", "Number of UDP datagrams received via HandlePacket."),
   297  		UnknownPortErrors:        mustCreateMetric("/netstack/udp/unknown_port_errors", "Number of incoming UDP datagrams dropped because they did not have a known destination port."),
   298  		ReceiveBufferErrors:      mustCreateMetric("/netstack/udp/receive_buffer_errors", "Number of incoming UDP datagrams dropped due to the receiving buffer being in an invalid state."),
   299  		MalformedPacketsReceived: mustCreateMetric("/netstack/udp/malformed_packets_received", "Number of incoming UDP datagrams dropped due to the UDP header being in a malformed state."),
   300  		PacketsSent:              mustCreateMetric("/netstack/udp/packets_sent", "Number of UDP datagrams sent."),
   301  		PacketSendErrors:         mustCreateMetric("/netstack/udp/packet_send_errors", "Number of UDP datagrams failed to be sent."),
   302  		ChecksumErrors:           mustCreateMetric("/netstack/udp/checksum_errors", "Number of UDP datagrams dropped due to bad checksums."),
   303  	},
   304  }
   305  
// DefaultTTL is Linux's default TTL. All network protocols in all stacks used
// with this package must have this value set as their default TTL.
const DefaultTTL = 64

// sizeOfInt32 is the size of an int32 in bytes.
const sizeOfInt32 int = 4

// errStackType is the EINVAL error used when a netstack.Stack was expected
// but a different stack implementation was received.
var errStackType = syserr.New("expected but did not receive a netstack.Stack", errno.EINVAL)
   313  
// commonEndpoint represents the intersection of a tcpip.Endpoint and a
// transport.Endpoint, i.e. the methods that both endpoint kinds provide.
type commonEndpoint interface {
	// Readiness implements tcpip.Endpoint.Readiness and
	// transport.Endpoint.Readiness.
	Readiness(mask waiter.EventMask) waiter.EventMask

	// SetSockOpt implements tcpip.Endpoint.SetSockOpt and
	// transport.Endpoint.SetSockOpt.
	SetSockOpt(tcpip.SettableSocketOption) tcpip.Error

	// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt and
	// transport.Endpoint.SetSockOptInt.
	SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error

	// GetSockOpt implements tcpip.Endpoint.GetSockOpt and
	// transport.Endpoint.GetSockOpt.
	GetSockOpt(tcpip.GettableSocketOption) tcpip.Error

	// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt and
	// transport.Endpoint.GetSockOptInt.
	GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error)

	// State returns a socket's lifecycle state. The returned value is
	// protocol-specific and is primarily used for diagnostics.
	State() uint32

	// LastError implements tcpip.Endpoint.LastError and
	// transport.Endpoint.LastError.
	LastError() tcpip.Error

	// SocketOptions implements tcpip.Endpoint.SocketOptions and
	// transport.Endpoint.SocketOptions.
	SocketOptions() *tcpip.SocketOptions
}
   349  
// sock encapsulates all the state needed to represent a network stack
// endpoint in the kernel context.
//
// +stateify savable
type sock struct {
	// vfsfd is the file description that New initializes and hands back to
	// the caller.
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.DentryMetadataFileDescriptionImpl
	vfs.LockFD
	socket.SendReceiveTimeout
	// Queue is the waiter queue supplied to New.
	*waiter.Queue

	// family, skType and protocol record the values the socket was created
	// with; Endpoint is the tcpip endpoint backing the socket.
	family   int
	Endpoint tcpip.Endpoint
	skType   linux.SockType
	protocol int

	// namespace is the network namespace of the creating task. New takes a
	// reference on it and Release drops that reference.
	namespace *inet.Namespace

	// readMu protects access to the below fields.
	readMu sync.Mutex `state:"nosave"`

	// sockOptTimestamp corresponds to SO_TIMESTAMP. When true, timestamps
	// of returned messages can be returned via control messages. When
	// false, the same timestamp is instead stored and can be read via the
	// SIOCGSTAMP ioctl. It is protected by readMu. See socket(7).
	sockOptTimestamp bool
	// timestampValid indicates whether timestamp for SIOCGSTAMP has been
	// set. It is protected by readMu.
	timestampValid bool
	// timestamp holds the timestamp to use with SIOCGSTAMP. It is only
	// valid when timestampValid is true. It is protected by readMu.
	timestamp time.Time `state:".(int64)"`

	// TODO(b/153685824): Move this to SocketOptions.
	// sockOptInq corresponds to TCP_INQ. GetSockOpt and SetSockOpt access it
	// with readMu held.
	sockOptInq bool
}
   388  
// Compile-time check that *sock satisfies socket.Socket.
var _ = socket.Socket(&sock{})
   390  
   391  // New creates a new endpoint socket.
   392  func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*vfs.FileDescription, *syserr.Error) {
   393  	if skType == linux.SOCK_STREAM {
   394  		endpoint.SocketOptions().SetDelayOption(true)
   395  	}
   396  
   397  	mnt := t.Kernel().SocketMount()
   398  	d := sockfs.NewDentry(t, mnt)
   399  	defer d.DecRef(t)
   400  
   401  	namespace := t.NetworkNamespace()
   402  	s := &sock{
   403  		Queue:     queue,
   404  		family:    family,
   405  		Endpoint:  endpoint,
   406  		skType:    skType,
   407  		protocol:  protocol,
   408  		namespace: namespace,
   409  	}
   410  	s.LockFD.Init(&vfs.FileLocks{})
   411  	vfsfd := &s.vfsfd
   412  	if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{
   413  		DenyPRead:         true,
   414  		DenyPWrite:        true,
   415  		UseDentryMetadata: true,
   416  	}); err != nil {
   417  		return nil, syserr.FromError(err)
   418  	}
   419  	namespace.IncRef()
   420  	return vfsfd, nil
   421  }
   422  
   423  // Release implements vfs.FileDescriptionImpl.Release.
   424  func (s *sock) Release(ctx context.Context) {
   425  	kernel.KernelFromContext(ctx).DeleteSocket(&s.vfsfd)
   426  	e, ch := waiter.NewChannelEntry(waiter.EventHUp | waiter.EventErr)
   427  	s.EventRegister(&e)
   428  	defer s.EventUnregister(&e)
   429  
   430  	s.Endpoint.Close()
   431  
   432  	// SO_LINGER option is valid only for TCP. For other socket types
   433  	// return after endpoint close.
   434  	if family, skType, _ := s.Type(); skType == linux.SOCK_STREAM && (family == linux.AF_INET || family == linux.AF_INET6) {
   435  		v := s.Endpoint.SocketOptions().GetLinger()
   436  		// The case for zero timeout is handled in tcp endpoint close function.
   437  		// Close is blocked until either:
   438  		// 1. The endpoint state is not in any of the states: FIN-WAIT1,
   439  		// CLOSING and LAST_ACK.
   440  		// 2. Timeout is reached.
   441  		if v.Enabled && v.Timeout != 0 {
   442  			t := kernel.TaskFromContext(ctx)
   443  			start := t.Kernel().MonotonicClock().Now()
   444  			deadline := start.Add(v.Timeout)
   445  			_ = t.BlockWithDeadline(ch, true, deadline)
   446  		}
   447  	}
   448  	s.namespace.DecRef(ctx)
   449  }
   450  
   451  // Epollable implements FileDescriptionImpl.Epollable.
   452  func (s *sock) Epollable() bool {
   453  	return true
   454  }
   455  
   456  // Read implements vfs.FileDescriptionImpl.
   457  func (s *sock) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
   458  	// All flags other than RWF_NOWAIT should be ignored.
   459  	// TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT.
   460  	if opts.Flags != 0 {
   461  		return 0, linuxerr.EOPNOTSUPP
   462  	}
   463  
   464  	if dst.NumBytes() == 0 {
   465  		return 0, nil
   466  	}
   467  	n, _, _, _, _, err := s.nonBlockingRead(ctx, dst, false, false, false)
   468  	if err == syserr.ErrWouldBlock {
   469  		return int64(n), linuxerr.ErrWouldBlock
   470  	}
   471  	if err != nil {
   472  		return 0, err.ToError()
   473  	}
   474  	return int64(n), nil
   475  }
   476  
   477  // Write implements vfs.FileDescriptionImpl.
   478  func (s *sock) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
   479  	// All flags other than RWF_NOWAIT should be ignored.
   480  	// TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT.
   481  	if opts.Flags != 0 {
   482  		return 0, linuxerr.EOPNOTSUPP
   483  	}
   484  
   485  	r := src.Reader(ctx)
   486  	n, err := s.Endpoint.Write(r, tcpip.WriteOptions{})
   487  	if _, ok := err.(*tcpip.ErrWouldBlock); ok {
   488  		return 0, linuxerr.ErrWouldBlock
   489  	}
   490  	if err != nil {
   491  		return 0, syserr.TranslateNetstackError(err).ToError()
   492  	}
   493  
   494  	if n < src.NumBytes() {
   495  		return n, linuxerr.ErrWouldBlock
   496  	}
   497  
   498  	return n, nil
   499  }
   500  
   501  // Accept implements the linux syscall accept(2) for sockets backed by
   502  // tcpip.Endpoint.
   503  func (s *sock) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
   504  	// Issue the accept request to get the new endpoint.
   505  	var peerAddr *tcpip.FullAddress
   506  	if peerRequested {
   507  		peerAddr = &tcpip.FullAddress{}
   508  	}
   509  	ep, wq, terr := s.Endpoint.Accept(peerAddr)
   510  	if terr != nil {
   511  		if _, ok := terr.(*tcpip.ErrWouldBlock); !ok || !blocking {
   512  			return 0, nil, 0, syserr.TranslateNetstackError(terr)
   513  		}
   514  
   515  		var err *syserr.Error
   516  		ep, wq, err = s.blockingAccept(t, peerAddr)
   517  		if err != nil {
   518  			return 0, nil, 0, err
   519  		}
   520  	}
   521  
   522  	ns, err := New(t, s.family, s.skType, s.protocol, wq, ep)
   523  	if err != nil {
   524  		return 0, nil, 0, err
   525  	}
   526  	defer ns.DecRef(t)
   527  
   528  	if err := ns.SetStatusFlags(t, t.Credentials(), uint32(flags&linux.SOCK_NONBLOCK)); err != nil {
   529  		return 0, nil, 0, syserr.FromError(err)
   530  	}
   531  
   532  	var addr linux.SockAddr
   533  	var addrLen uint32
   534  	if peerAddr != nil {
   535  		// Get address of the peer and write it to peer slice.
   536  		addr, addrLen = socket.ConvertAddress(s.family, *peerAddr)
   537  	}
   538  
   539  	fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{
   540  		CloseOnExec: flags&linux.SOCK_CLOEXEC != 0,
   541  	})
   542  
   543  	t.Kernel().RecordSocket(ns)
   544  
   545  	return fd, addr, addrLen, syserr.FromError(e)
   546  }
   547  
   548  // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
   549  // tcpip.Endpoint.
   550  func (s *sock) GetSockOpt(t *kernel.Task, level, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
   551  	// TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is
   552  	// implemented specifically for netstack.Socket rather than
   553  	// commonEndpoint. commonEndpoint should be extended to support socket
   554  	// options where the implementation is not shared, as unix sockets need
   555  	// their own support for SO_TIMESTAMP.
   556  	if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP {
   557  		if outLen < sizeOfInt32 {
   558  			return nil, syserr.ErrInvalidArgument
   559  		}
   560  		val := primitive.Int32(0)
   561  		s.readMu.Lock()
   562  		defer s.readMu.Unlock()
   563  		if s.sockOptTimestamp {
   564  			val = 1
   565  		}
   566  		return &val, nil
   567  	}
   568  	if level == linux.SOL_TCP && name == linux.TCP_INQ {
   569  		if outLen < sizeOfInt32 {
   570  			return nil, syserr.ErrInvalidArgument
   571  		}
   572  		val := primitive.Int32(0)
   573  		s.readMu.Lock()
   574  		defer s.readMu.Unlock()
   575  		if s.sockOptInq {
   576  			val = 1
   577  		}
   578  		return &val, nil
   579  	}
   580  
   581  	return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outPtr, outLen)
   582  }
   583  
   584  // SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by
   585  // tcpip.Endpoint.
   586  func (s *sock) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error {
   587  	// TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is
   588  	// implemented specifically for netstack.Socket rather than
   589  	// commonEndpoint. commonEndpoint should be extended to support socket
   590  	// options where the implementation is not shared, as unix sockets need
   591  	// their own support for SO_TIMESTAMP.
   592  	if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP {
   593  		if len(optVal) < sizeOfInt32 {
   594  			return syserr.ErrInvalidArgument
   595  		}
   596  		s.readMu.Lock()
   597  		defer s.readMu.Unlock()
   598  		s.sockOptTimestamp = hostarch.ByteOrder.Uint32(optVal) != 0
   599  		return nil
   600  	}
   601  	if level == linux.SOL_TCP && name == linux.TCP_INQ {
   602  		if len(optVal) < sizeOfInt32 {
   603  			return syserr.ErrInvalidArgument
   604  		}
   605  		s.readMu.Lock()
   606  		defer s.readMu.Unlock()
   607  		s.sockOptInq = hostarch.ByteOrder.Uint32(optVal) != 0
   608  		return nil
   609  	}
   610  
   611  	return SetSockOpt(t, s, s.Endpoint, level, name, optVal)
   612  }
   613  
   614  var sockAddrInetSize = (*linux.SockAddrInet)(nil).SizeBytes()
   615  var sockAddrInet6Size = (*linux.SockAddrInet6)(nil).SizeBytes()
   616  var sockAddrLinkSize = (*linux.SockAddrLink)(nil).SizeBytes()
   617  
   618  // minSockAddrLen returns the minimum length in bytes of a socket address for
   619  // the socket's family.
   620  func (s *sock) minSockAddrLen() int {
   621  	const addressFamilySize = 2
   622  
   623  	switch s.family {
   624  	case linux.AF_UNIX:
   625  		return addressFamilySize
   626  	case linux.AF_INET:
   627  		return sockAddrInetSize
   628  	case linux.AF_INET6:
   629  		return sockAddrInet6Size
   630  	case linux.AF_PACKET:
   631  		return sockAddrLinkSize
   632  	case linux.AF_UNSPEC:
   633  		return addressFamilySize
   634  	default:
   635  		panic(fmt.Sprintf("s.family unrecognized = %d", s.family))
   636  	}
   637  }
   638  
   639  func (s *sock) isPacketBased() bool {
   640  	return s.skType == linux.SOCK_DGRAM || s.skType == linux.SOCK_SEQPACKET || s.skType == linux.SOCK_RDM || s.skType == linux.SOCK_RAW
   641  }
   642  
// Readiness returns a mask of ready events for socket s.
//
// The query is forwarded to the underlying endpoint and its result is returned
// unchanged.
func (s *sock) Readiness(mask waiter.EventMask) waiter.EventMask {
	return s.Endpoint.Readiness(mask)
}
   647  
   648  // checkFamily returns true iff the specified address family may be used with
   649  // the socket.
   650  //
   651  // If exact is true, then the specified address family must be an exact match
   652  // with the socket's family.
   653  func (s *sock) checkFamily(family uint16, exact bool) bool {
   654  	if family == uint16(s.family) {
   655  		return true
   656  	}
   657  	if !exact && family == linux.AF_INET && s.family == linux.AF_INET6 {
   658  		if !s.Endpoint.SocketOptions().GetV6Only() {
   659  			return true
   660  		}
   661  	}
   662  	return false
   663  }
   664  
   665  // mapFamily maps the AF_INET ANY address to the IPv4-mapped IPv6 ANY if the
   666  // receiver's family is AF_INET6.
   667  //
   668  // This is a hack to work around the fact that both IPv4 and IPv6 ANY are
   669  // represented by the empty string.
   670  //
   671  // TODO(gvisor.dev/issue/1556): remove this function.
   672  func (s *sock) mapFamily(addr tcpip.FullAddress, family uint16) tcpip.FullAddress {
   673  	if addr.Addr.BitLen() == 0 && s.family == linux.AF_INET6 && family == linux.AF_INET {
   674  		addr.Addr = tcpip.AddrFrom16([16]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00})
   675  	}
   676  	return addr
   677  }
   678  
   679  // Connect implements the linux syscall connect(2) for sockets backed by
   680  // tpcip.Endpoint.
   681  func (s *sock) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
   682  	addr, family, err := socket.AddressAndFamily(sockaddr)
   683  	if err != nil {
   684  		return err
   685  	}
   686  
   687  	if family == linux.AF_UNSPEC {
   688  		err := s.Endpoint.Disconnect()
   689  		if _, ok := err.(*tcpip.ErrNotSupported); ok {
   690  			return syserr.ErrAddressFamilyNotSupported
   691  		}
   692  		return syserr.TranslateNetstackError(err)
   693  	}
   694  
   695  	if !s.checkFamily(family, false /* exact */) {
   696  		return syserr.ErrInvalidArgument
   697  	}
   698  	addr = s.mapFamily(addr, family)
   699  
   700  	// Always return right away in the non-blocking case.
   701  	if !blocking {
   702  		return syserr.TranslateNetstackError(s.Endpoint.Connect(addr))
   703  	}
   704  
   705  	// Register for notification when the endpoint becomes writable, then
   706  	// initiate the connection.
   707  	e, ch := waiter.NewChannelEntry(waiter.WritableEvents)
   708  	s.EventRegister(&e)
   709  	defer s.EventUnregister(&e)
   710  
   711  	switch err := s.Endpoint.Connect(addr); err.(type) {
   712  	case *tcpip.ErrConnectStarted, *tcpip.ErrAlreadyConnecting:
   713  	case *tcpip.ErrNoPortAvailable:
   714  		if (s.family == unix.AF_INET || s.family == unix.AF_INET6) && s.skType == linux.SOCK_STREAM {
   715  			// TCP unlike UDP returns EADDRNOTAVAIL when it can't
   716  			// find an available local ephemeral port.
   717  			return syserr.ErrAddressNotAvailable
   718  		}
   719  		return syserr.TranslateNetstackError(err)
   720  	default:
   721  		return syserr.TranslateNetstackError(err)
   722  	}
   723  
   724  	// It's pending, so we have to wait for a notification, and fetch the
   725  	// result once the wait completes.
   726  	if err := t.Block(ch); err != nil {
   727  		return syserr.FromError(err)
   728  	}
   729  
   730  	// Call Connect() again after blocking to find connect's result.
   731  	return syserr.TranslateNetstackError(s.Endpoint.Connect(addr))
   732  }
   733  
   734  // Bind implements the linux syscall bind(2) for sockets backed by
   735  // tcpip.Endpoint.
   736  func (s *sock) Bind(_ *kernel.Task, sockaddr []byte) *syserr.Error {
   737  	if len(sockaddr) < 2 {
   738  		return syserr.ErrInvalidArgument
   739  	}
   740  
   741  	family := hostarch.ByteOrder.Uint16(sockaddr)
   742  	var addr tcpip.FullAddress
   743  
   744  	// Bind for AF_PACKET requires only family, protocol and ifindex.
   745  	// In function AddressAndFamily, we check the address length which is
   746  	// not needed for AF_PACKET bind.
   747  	if family == linux.AF_PACKET {
   748  		var a linux.SockAddrLink
   749  		if len(sockaddr) < sockAddrLinkSize {
   750  			return syserr.ErrInvalidArgument
   751  		}
   752  		a.UnmarshalBytes(sockaddr)
   753  
   754  		addr = tcpip.FullAddress{
   755  			NIC: tcpip.NICID(a.InterfaceIndex),
   756  			Addr: tcpip.AddrFrom16Slice(append(
   757  				a.HardwareAddr[:header.EthernetAddressSize],
   758  				[]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}...,
   759  			)),
   760  			Port: socket.Ntohs(a.Protocol),
   761  		}
   762  	} else {
   763  		if s.minSockAddrLen() > len(sockaddr) {
   764  			return syserr.ErrInvalidArgument
   765  		}
   766  
   767  		var err *syserr.Error
   768  		addr, family, err = socket.AddressAndFamily(sockaddr)
   769  		if err != nil {
   770  			return err
   771  		}
   772  
   773  		if !s.checkFamily(family, true /* exact */) {
   774  			return syserr.ErrAddressFamilyNotSupported
   775  		}
   776  
   777  		addr = s.mapFamily(addr, family)
   778  	}
   779  
   780  	// Issue the bind request to the endpoint.
   781  	err := s.Endpoint.Bind(addr)
   782  	if _, ok := err.(*tcpip.ErrNoPortAvailable); ok {
   783  		// Bind always returns EADDRINUSE irrespective of if the specified port was
   784  		// already bound or if an ephemeral port was requested but none were
   785  		// available.
   786  		//
   787  		// *tcpip.ErrNoPortAvailable is mapped to EAGAIN in syserr package because
   788  		// UDP connect returns EAGAIN on ephemeral port exhaustion.
   789  		//
   790  		// TCP connect returns EADDRNOTAVAIL on ephemeral port exhaustion.
   791  		err = &tcpip.ErrPortInUse{}
   792  	}
   793  
   794  	return syserr.TranslateNetstackError(err)
   795  }
   796  
   797  // Listen implements the linux syscall listen(2) for sockets backed by
   798  // tcpip.Endpoint.
   799  func (s *sock) Listen(_ *kernel.Task, backlog int) *syserr.Error {
   800  	if err := s.Endpoint.Listen(backlog); err != nil {
   801  		return syserr.TranslateNetstackError(err)
   802  	}
   803  	if !socket.IsTCP(s) {
   804  		return nil
   805  	}
   806  
   807  	// Emit SentryTCPListenEvent with the bound port for tcp sockets.
   808  	addr, err := s.Endpoint.GetLocalAddress()
   809  	if err != nil {
   810  		panic(fmt.Sprintf("GetLocalAddress failed for tcp socket: %s", err))
   811  	}
   812  	eventchannel.Emit(&epb.SentryTcpListenEvent{
   813  		Port: proto.Int32(int32(addr.Port)),
   814  	})
   815  	return nil
   816  }
   817  
   818  // blockingAccept implements a blocking version of accept(2), that is, if no
   819  // connections are ready to be accept, it will block until one becomes ready.
   820  func (s *sock) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) {
   821  	// Register for notifications.
   822  	e, ch := waiter.NewChannelEntry(waiter.ReadableEvents)
   823  	s.EventRegister(&e)
   824  	defer s.EventUnregister(&e)
   825  
   826  	// Try to accept the connection again; if it fails, then wait until we
   827  	// get a notification.
   828  	for {
   829  		ep, wq, err := s.Endpoint.Accept(peerAddr)
   830  		if _, ok := err.(*tcpip.ErrWouldBlock); !ok {
   831  			return ep, wq, syserr.TranslateNetstackError(err)
   832  		}
   833  
   834  		if err := t.Block(ch); err != nil {
   835  			return nil, nil, syserr.FromError(err)
   836  		}
   837  	}
   838  }
   839  
   840  // ConvertShutdown converts Linux shutdown flags into tcpip shutdown flags.
   841  func ConvertShutdown(how int) (tcpip.ShutdownFlags, *syserr.Error) {
   842  	var f tcpip.ShutdownFlags
   843  	switch how {
   844  	case linux.SHUT_RD:
   845  		f = tcpip.ShutdownRead
   846  	case linux.SHUT_WR:
   847  		f = tcpip.ShutdownWrite
   848  	case linux.SHUT_RDWR:
   849  		f = tcpip.ShutdownRead | tcpip.ShutdownWrite
   850  	default:
   851  		return 0, syserr.ErrInvalidArgument
   852  	}
   853  	return f, nil
   854  }
   855  
   856  // Shutdown implements the linux syscall shutdown(2) for sockets backed by
   857  // tcpip.Endpoint.
   858  func (s *sock) Shutdown(_ *kernel.Task, how int) *syserr.Error {
   859  	f, err := ConvertShutdown(how)
   860  	if err != nil {
   861  		return err
   862  	}
   863  
   864  	// Issue shutdown request.
   865  	return syserr.TranslateNetstackError(s.Endpoint.Shutdown(f))
   866  }
   867  
   868  // GetSockOpt can be used to implement the linux syscall getsockopt(2) for
   869  // sockets backed by a commonEndpoint.
   870  func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType linux.SockType, level, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
   871  	switch level {
   872  	case linux.SOL_SOCKET:
   873  		return getSockOptSocket(t, s, ep, family, skType, name, outLen)
   874  
   875  	case linux.SOL_TCP:
   876  		return getSockOptTCP(t, s, ep, name, outLen)
   877  
   878  	case linux.SOL_IPV6:
   879  		return getSockOptIPv6(t, s, ep, name, outPtr, outLen)
   880  
   881  	case linux.SOL_IP:
   882  		return getSockOptIP(t, s, ep, name, outPtr, outLen, family)
   883  
   884  	case linux.SOL_ICMPV6:
   885  		return getSockOptICMPv6(t, s, ep, name, outLen)
   886  
   887  	case linux.SOL_UDP,
   888  		linux.SOL_RAW,
   889  		linux.SOL_PACKET:
   890  		// Not supported.
   891  	}
   892  
   893  	return nil, syserr.ErrProtocolNotAvailable
   894  }
   895  
   896  func boolToInt32(v bool) int32 {
   897  	if v {
   898  		return 1
   899  	}
   900  	return 0
   901  }
   902  
   903  // getSockOptSocket implements GetSockOpt when level is SOL_SOCKET.
   904  func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, _ linux.SockType, name, outLen int) (marshal.Marshallable, *syserr.Error) {
   905  	// TODO(b/124056281): Stop rejecting short optLen values in getsockopt.
   906  	switch name {
   907  	case linux.SO_ERROR:
   908  		if outLen < sizeOfInt32 {
   909  			return nil, syserr.ErrInvalidArgument
   910  		}
   911  
   912  		// Get the last error and convert it.
   913  		err := ep.SocketOptions().GetLastError()
   914  		if err == nil {
   915  			optP := primitive.Int32(0)
   916  			return &optP, nil
   917  		}
   918  
   919  		optP := primitive.Int32(syserr.TranslateNetstackError(err).ToLinux())
   920  		return &optP, nil
   921  
   922  	case linux.SO_PEERCRED:
   923  		if family != linux.AF_UNIX || outLen < unix.SizeofUcred {
   924  			return nil, syserr.ErrInvalidArgument
   925  		}
   926  
   927  		tcred := t.Credentials()
   928  		creds := linux.ControlMessageCredentials{
   929  			PID: int32(t.ThreadGroup().ID()),
   930  			UID: uint32(tcred.EffectiveKUID.In(tcred.UserNamespace).OrOverflow()),
   931  			GID: uint32(tcred.EffectiveKGID.In(tcred.UserNamespace).OrOverflow()),
   932  		}
   933  		return &creds, nil
   934  
   935  	case linux.SO_PASSCRED:
   936  		if outLen < sizeOfInt32 {
   937  			return nil, syserr.ErrInvalidArgument
   938  		}
   939  
   940  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetPassCred()))
   941  		return &v, nil
   942  
   943  	case linux.SO_SNDBUF:
   944  		if outLen < sizeOfInt32 {
   945  			return nil, syserr.ErrInvalidArgument
   946  		}
   947  
   948  		size := ep.SocketOptions().GetSendBufferSize()
   949  
   950  		if size > math.MaxInt32 {
   951  			size = math.MaxInt32
   952  		}
   953  
   954  		sizeP := primitive.Int32(size)
   955  		return &sizeP, nil
   956  
   957  	case linux.SO_RCVBUF:
   958  		if outLen < sizeOfInt32 {
   959  			return nil, syserr.ErrInvalidArgument
   960  		}
   961  
   962  		size := ep.SocketOptions().GetReceiveBufferSize()
   963  
   964  		if size > math.MaxInt32 {
   965  			size = math.MaxInt32
   966  		}
   967  
   968  		sizeP := primitive.Int32(size)
   969  		return &sizeP, nil
   970  
   971  	case linux.SO_REUSEADDR:
   972  		if outLen < sizeOfInt32 {
   973  			return nil, syserr.ErrInvalidArgument
   974  		}
   975  
   976  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReuseAddress()))
   977  		return &v, nil
   978  
   979  	case linux.SO_REUSEPORT:
   980  		if outLen < sizeOfInt32 {
   981  			return nil, syserr.ErrInvalidArgument
   982  		}
   983  
   984  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReusePort()))
   985  		return &v, nil
   986  
   987  	case linux.SO_BINDTODEVICE:
   988  		v := ep.SocketOptions().GetBindToDevice()
   989  		if v == 0 {
   990  			var b primitive.ByteSlice
   991  			return &b, nil
   992  		}
   993  		if outLen < linux.IFNAMSIZ {
   994  			return nil, syserr.ErrInvalidArgument
   995  		}
   996  		s := t.NetworkContext()
   997  		if s == nil {
   998  			return nil, syserr.ErrNoDevice
   999  		}
  1000  		nic, ok := s.Interfaces()[int32(v)]
  1001  		if !ok {
  1002  			// The NICID no longer indicates a valid interface, probably because that
  1003  			// interface was removed.
  1004  			return nil, syserr.ErrUnknownDevice
  1005  		}
  1006  
  1007  		name := primitive.ByteSlice(append([]byte(nic.Name), 0))
  1008  		return &name, nil
  1009  
  1010  	case linux.SO_BROADCAST:
  1011  		if outLen < sizeOfInt32 {
  1012  			return nil, syserr.ErrInvalidArgument
  1013  		}
  1014  
  1015  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetBroadcast()))
  1016  		return &v, nil
  1017  
  1018  	case linux.SO_KEEPALIVE:
  1019  		if outLen < sizeOfInt32 {
  1020  			return nil, syserr.ErrInvalidArgument
  1021  		}
  1022  
  1023  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetKeepAlive()))
  1024  		return &v, nil
  1025  
  1026  	case linux.SO_LINGER:
  1027  		if outLen < linux.SizeOfLinger {
  1028  			return nil, syserr.ErrInvalidArgument
  1029  		}
  1030  
  1031  		var linger linux.Linger
  1032  		v := ep.SocketOptions().GetLinger()
  1033  
  1034  		if v.Enabled {
  1035  			linger.OnOff = 1
  1036  		}
  1037  		linger.Linger = int32(v.Timeout.Seconds())
  1038  		return &linger, nil
  1039  
  1040  	case linux.SO_SNDTIMEO:
  1041  		// TODO(igudger): Linux allows shorter lengths for partial results.
  1042  		if outLen < linux.SizeOfTimeval {
  1043  			return nil, syserr.ErrInvalidArgument
  1044  		}
  1045  
  1046  		sendTimeout := linux.NsecToTimeval(s.SendTimeout())
  1047  		return &sendTimeout, nil
  1048  
  1049  	case linux.SO_RCVTIMEO:
  1050  		// TODO(igudger): Linux allows shorter lengths for partial results.
  1051  		if outLen < linux.SizeOfTimeval {
  1052  			return nil, syserr.ErrInvalidArgument
  1053  		}
  1054  
  1055  		recvTimeout := linux.NsecToTimeval(s.RecvTimeout())
  1056  		return &recvTimeout, nil
  1057  
  1058  	case linux.SO_OOBINLINE:
  1059  		if outLen < sizeOfInt32 {
  1060  			return nil, syserr.ErrInvalidArgument
  1061  		}
  1062  
  1063  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetOutOfBandInline()))
  1064  		return &v, nil
  1065  
  1066  	case linux.SO_NO_CHECK:
  1067  		if outLen < sizeOfInt32 {
  1068  			return nil, syserr.ErrInvalidArgument
  1069  		}
  1070  
  1071  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetNoChecksum()))
  1072  		return &v, nil
  1073  
  1074  	case linux.SO_ACCEPTCONN:
  1075  		if outLen < sizeOfInt32 {
  1076  			return nil, syserr.ErrInvalidArgument
  1077  		}
  1078  
  1079  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetAcceptConn()))
  1080  		return &v, nil
  1081  
  1082  	case linux.SO_RCVLOWAT:
  1083  		if outLen < sizeOfInt32 {
  1084  			return nil, syserr.ErrInvalidArgument
  1085  		}
  1086  
  1087  		v := primitive.Int32(ep.SocketOptions().GetRcvlowat())
  1088  		return &v, nil
  1089  	}
  1090  	return nil, syserr.ErrProtocolNotAvailable
  1091  }
  1092  
  1093  // getSockOptTCP implements GetSockOpt when level is SOL_TCP.
  1094  func getSockOptTCP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name, outLen int) (marshal.Marshallable, *syserr.Error) {
  1095  	if !socket.IsTCP(s) {
  1096  		return nil, syserr.ErrUnknownProtocolOption
  1097  	}
  1098  
  1099  	switch name {
  1100  	case linux.TCP_NODELAY:
  1101  		if outLen < sizeOfInt32 {
  1102  			return nil, syserr.ErrInvalidArgument
  1103  		}
  1104  
  1105  		v := primitive.Int32(boolToInt32(!ep.SocketOptions().GetDelayOption()))
  1106  		return &v, nil
  1107  
  1108  	case linux.TCP_CORK:
  1109  		if outLen < sizeOfInt32 {
  1110  			return nil, syserr.ErrInvalidArgument
  1111  		}
  1112  
  1113  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetCorkOption()))
  1114  		return &v, nil
  1115  
  1116  	case linux.TCP_QUICKACK:
  1117  		if outLen < sizeOfInt32 {
  1118  			return nil, syserr.ErrInvalidArgument
  1119  		}
  1120  
  1121  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetQuickAck()))
  1122  		return &v, nil
  1123  
  1124  	case linux.TCP_MAXSEG:
  1125  		if outLen < sizeOfInt32 {
  1126  			return nil, syserr.ErrInvalidArgument
  1127  		}
  1128  
  1129  		v, err := ep.GetSockOptInt(tcpip.MaxSegOption)
  1130  		if err != nil {
  1131  			return nil, syserr.TranslateNetstackError(err)
  1132  		}
  1133  		vP := primitive.Int32(v)
  1134  		return &vP, nil
  1135  
  1136  	case linux.TCP_KEEPIDLE:
  1137  		if outLen < sizeOfInt32 {
  1138  			return nil, syserr.ErrInvalidArgument
  1139  		}
  1140  
  1141  		var v tcpip.KeepaliveIdleOption
  1142  		if err := ep.GetSockOpt(&v); err != nil {
  1143  			return nil, syserr.TranslateNetstackError(err)
  1144  		}
  1145  		keepAliveIdle := primitive.Int32(time.Duration(v) / time.Second)
  1146  		return &keepAliveIdle, nil
  1147  
  1148  	case linux.TCP_KEEPINTVL:
  1149  		if outLen < sizeOfInt32 {
  1150  			return nil, syserr.ErrInvalidArgument
  1151  		}
  1152  
  1153  		var v tcpip.KeepaliveIntervalOption
  1154  		if err := ep.GetSockOpt(&v); err != nil {
  1155  			return nil, syserr.TranslateNetstackError(err)
  1156  		}
  1157  		keepAliveInterval := primitive.Int32(time.Duration(v) / time.Second)
  1158  		return &keepAliveInterval, nil
  1159  
  1160  	case linux.TCP_KEEPCNT:
  1161  		if outLen < sizeOfInt32 {
  1162  			return nil, syserr.ErrInvalidArgument
  1163  		}
  1164  
  1165  		v, err := ep.GetSockOptInt(tcpip.KeepaliveCountOption)
  1166  		if err != nil {
  1167  			return nil, syserr.TranslateNetstackError(err)
  1168  		}
  1169  		vP := primitive.Int32(v)
  1170  		return &vP, nil
  1171  
  1172  	case linux.TCP_USER_TIMEOUT:
  1173  		if outLen < sizeOfInt32 {
  1174  			return nil, syserr.ErrInvalidArgument
  1175  		}
  1176  
  1177  		var v tcpip.TCPUserTimeoutOption
  1178  		if err := ep.GetSockOpt(&v); err != nil {
  1179  			return nil, syserr.TranslateNetstackError(err)
  1180  		}
  1181  		tcpUserTimeout := primitive.Int32(time.Duration(v) / time.Millisecond)
  1182  		return &tcpUserTimeout, nil
  1183  
  1184  	case linux.TCP_INFO:
  1185  		var v tcpip.TCPInfoOption
  1186  		if err := ep.GetSockOpt(&v); err != nil {
  1187  			return nil, syserr.TranslateNetstackError(err)
  1188  		}
  1189  
  1190  		// TODO(b/64800844): Translate fields once they are added to
  1191  		// tcpip.TCPInfoOption.
  1192  		info := linux.TCPInfo{
  1193  			State:       uint8(v.State),
  1194  			RTO:         uint32(v.RTO / time.Microsecond),
  1195  			RTT:         uint32(v.RTT / time.Microsecond),
  1196  			RTTVar:      uint32(v.RTTVar / time.Microsecond),
  1197  			SndSsthresh: v.SndSsthresh,
  1198  			SndCwnd:     v.SndCwnd,
  1199  		}
  1200  		switch v.CcState {
  1201  		case tcpip.RTORecovery:
  1202  			info.CaState = linux.TCP_CA_Loss
  1203  		case tcpip.FastRecovery, tcpip.SACKRecovery:
  1204  			info.CaState = linux.TCP_CA_Recovery
  1205  		case tcpip.Disorder:
  1206  			info.CaState = linux.TCP_CA_Disorder
  1207  		case tcpip.Open:
  1208  			info.CaState = linux.TCP_CA_Open
  1209  		}
  1210  
  1211  		// In netstack reorderSeen is updated only when RACK is enabled.
  1212  		// We only track whether the reordering is seen, which is
  1213  		// different than Linux where reorderSeen is not specific to
  1214  		// RACK and is incremented when a reordering event is seen.
  1215  		if v.ReorderSeen {
  1216  			info.ReordSeen = 1
  1217  		}
  1218  
  1219  		// Linux truncates the output binary to outLen.
  1220  		buf := t.CopyScratchBuffer(info.SizeBytes())
  1221  		info.MarshalUnsafe(buf)
  1222  		if len(buf) > outLen {
  1223  			buf = buf[:outLen]
  1224  		}
  1225  		bufP := primitive.ByteSlice(buf)
  1226  		return &bufP, nil
  1227  
  1228  	case linux.TCP_CC_INFO,
  1229  		linux.TCP_NOTSENT_LOWAT,
  1230  		linux.TCP_ZEROCOPY_RECEIVE:
  1231  
  1232  		// Not supported.
  1233  
  1234  	case linux.TCP_CONGESTION:
  1235  		if outLen <= 0 {
  1236  			return nil, syserr.ErrInvalidArgument
  1237  		}
  1238  
  1239  		var v tcpip.CongestionControlOption
  1240  		if err := ep.GetSockOpt(&v); err != nil {
  1241  			return nil, syserr.TranslateNetstackError(err)
  1242  		}
  1243  
  1244  		// We match linux behaviour here where it returns the lower of
  1245  		// TCP_CA_NAME_MAX bytes or the value of the option length.
  1246  		//
  1247  		// This is Linux's net/tcp.h TCP_CA_NAME_MAX.
  1248  		const tcpCANameMax = 16
  1249  
  1250  		toCopy := tcpCANameMax
  1251  		if outLen < tcpCANameMax {
  1252  			toCopy = outLen
  1253  		}
  1254  		b := make([]byte, toCopy)
  1255  		copy(b, v)
  1256  
  1257  		bP := primitive.ByteSlice(b)
  1258  		return &bP, nil
  1259  
  1260  	case linux.TCP_LINGER2:
  1261  		if outLen < sizeOfInt32 {
  1262  			return nil, syserr.ErrInvalidArgument
  1263  		}
  1264  
  1265  		var v tcpip.TCPLingerTimeoutOption
  1266  		if err := ep.GetSockOpt(&v); err != nil {
  1267  			return nil, syserr.TranslateNetstackError(err)
  1268  		}
  1269  		var lingerTimeout primitive.Int32
  1270  		if v >= 0 {
  1271  			lingerTimeout = primitive.Int32(time.Duration(v) / time.Second)
  1272  		} else {
  1273  			lingerTimeout = -1
  1274  		}
  1275  		return &lingerTimeout, nil
  1276  
  1277  	case linux.TCP_DEFER_ACCEPT:
  1278  		if outLen < sizeOfInt32 {
  1279  			return nil, syserr.ErrInvalidArgument
  1280  		}
  1281  
  1282  		var v tcpip.TCPDeferAcceptOption
  1283  		if err := ep.GetSockOpt(&v); err != nil {
  1284  			return nil, syserr.TranslateNetstackError(err)
  1285  		}
  1286  
  1287  		tcpDeferAccept := primitive.Int32(time.Duration(v) / time.Second)
  1288  		return &tcpDeferAccept, nil
  1289  
  1290  	case linux.TCP_SYNCNT:
  1291  		if outLen < sizeOfInt32 {
  1292  			return nil, syserr.ErrInvalidArgument
  1293  		}
  1294  
  1295  		v, err := ep.GetSockOptInt(tcpip.TCPSynCountOption)
  1296  		if err != nil {
  1297  			return nil, syserr.TranslateNetstackError(err)
  1298  		}
  1299  		vP := primitive.Int32(v)
  1300  		return &vP, nil
  1301  
  1302  	case linux.TCP_WINDOW_CLAMP:
  1303  		if outLen < sizeOfInt32 {
  1304  			return nil, syserr.ErrInvalidArgument
  1305  		}
  1306  
  1307  		v, err := ep.GetSockOptInt(tcpip.TCPWindowClampOption)
  1308  		if err != nil {
  1309  			return nil, syserr.TranslateNetstackError(err)
  1310  		}
  1311  		vP := primitive.Int32(v)
  1312  		return &vP, nil
  1313  	}
  1314  	return nil, syserr.ErrProtocolNotAvailable
  1315  }
  1316  
  1317  func getSockOptICMPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, outLen int) (marshal.Marshallable, *syserr.Error) {
  1318  	if _, ok := ep.(tcpip.Endpoint); !ok {
  1319  		log.Warningf("SOL_ICMPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
  1320  		return nil, syserr.ErrUnknownProtocolOption
  1321  	}
  1322  
  1323  	if family, _, _ := s.Type(); family != linux.AF_INET6 {
  1324  		return nil, syserr.ErrNotSupported
  1325  	}
  1326  
  1327  	switch name {
  1328  	case linux.ICMPV6_FILTER:
  1329  		var v tcpip.ICMPv6Filter
  1330  		if err := ep.GetSockOpt(&v); err != nil {
  1331  			return nil, syserr.TranslateNetstackError(err)
  1332  		}
  1333  
  1334  		filter := linux.ICMP6Filter{Filter: v.DenyType}
  1335  
  1336  		// Linux truncates the output to outLen.
  1337  		buf := t.CopyScratchBuffer(filter.SizeBytes())
  1338  		filter.MarshalUnsafe(buf)
  1339  		if len(buf) > outLen {
  1340  			buf = buf[:outLen]
  1341  		}
  1342  		bufP := primitive.ByteSlice(buf)
  1343  		return &bufP, nil
  1344  	}
  1345  	return nil, syserr.ErrProtocolNotAvailable
  1346  }
  1347  
  1348  func defaultTTL(t *kernel.Task, network tcpip.NetworkProtocolNumber) (primitive.Int32, tcpip.Error) {
  1349  	var opt tcpip.DefaultTTLOption
  1350  	stack := inet.StackFromContext(t)
  1351  	if err := stack.(*Stack).Stack.NetworkProtocolOption(network, &opt); err != nil {
  1352  		return 0, err
  1353  	}
  1354  	return primitive.Int32(opt), nil
  1355  }
  1356  
  1357  // getSockOptIPv6 implements GetSockOpt when level is SOL_IPV6.
  1358  func getSockOptIPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
  1359  	if _, ok := ep.(tcpip.Endpoint); !ok {
  1360  		log.Warningf("SOL_IPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
  1361  		return nil, syserr.ErrUnknownProtocolOption
  1362  	}
  1363  
  1364  	family, skType, _ := s.Type()
  1365  	if family != linux.AF_INET6 {
  1366  		return nil, syserr.ErrNotSupported
  1367  	}
  1368  
  1369  	switch name {
  1370  	case linux.IPV6_CHECKSUM:
  1371  		if outLen < sizeOfInt32 {
  1372  			return nil, syserr.ErrInvalidArgument
  1373  		}
  1374  
  1375  		v, err := ep.GetSockOptInt(tcpip.IPv6Checksum)
  1376  		if err != nil {
  1377  			return nil, syserr.TranslateNetstackError(err)
  1378  		}
  1379  
  1380  		vP := primitive.Int32(v)
  1381  		return &vP, nil
  1382  
  1383  	case linux.IPV6_V6ONLY:
  1384  		if outLen < sizeOfInt32 {
  1385  			return nil, syserr.ErrInvalidArgument
  1386  		}
  1387  
  1388  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetV6Only()))
  1389  		return &v, nil
  1390  
  1391  	case linux.IPV6_UNICAST_HOPS:
  1392  		if outLen < sizeOfInt32 {
  1393  			return nil, syserr.ErrInvalidArgument
  1394  		}
  1395  
  1396  		v, err := ep.GetSockOptInt(tcpip.IPv6HopLimitOption)
  1397  		if err != nil {
  1398  			return nil, syserr.TranslateNetstackError(err)
  1399  		}
  1400  
  1401  		// Fill in the default value, if needed.
  1402  		vP := primitive.Int32(v)
  1403  		if vP == -1 {
  1404  			vP, err = defaultTTL(t, header.IPv6ProtocolNumber)
  1405  			if err != nil {
  1406  				return nil, syserr.TranslateNetstackError(err)
  1407  			}
  1408  		}
  1409  
  1410  		return &vP, nil
  1411  
  1412  	case linux.IPV6_RECVHOPLIMIT:
  1413  		if outLen < sizeOfInt32 {
  1414  			return nil, syserr.ErrInvalidArgument
  1415  		}
  1416  
  1417  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveHopLimit()))
  1418  		return &v, nil
  1419  
  1420  	case linux.IPV6_PATHMTU:
  1421  		// Not supported.
  1422  
  1423  	case linux.IPV6_TCLASS:
  1424  		// Length handling for parity with Linux.
  1425  		if outLen == 0 {
  1426  			var b primitive.ByteSlice
  1427  			return &b, nil
  1428  		}
  1429  		v, err := ep.GetSockOptInt(tcpip.IPv6TrafficClassOption)
  1430  		if err != nil {
  1431  			return nil, syserr.TranslateNetstackError(err)
  1432  		}
  1433  
  1434  		uintv := primitive.Uint32(v)
  1435  		// Linux truncates the output binary to outLen.
  1436  		ib := t.CopyScratchBuffer(uintv.SizeBytes())
  1437  		uintv.MarshalUnsafe(ib)
  1438  		// Handle cases where outLen is lesser than sizeOfInt32.
  1439  		if len(ib) > outLen {
  1440  			ib = ib[:outLen]
  1441  		}
  1442  		ibP := primitive.ByteSlice(ib)
  1443  		return &ibP, nil
  1444  
  1445  	case linux.IPV6_RECVTCLASS:
  1446  		if outLen < sizeOfInt32 {
  1447  			return nil, syserr.ErrInvalidArgument
  1448  		}
  1449  
  1450  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTClass()))
  1451  		return &v, nil
  1452  	case linux.IPV6_RECVERR:
  1453  		if outLen < sizeOfInt32 {
  1454  			return nil, syserr.ErrInvalidArgument
  1455  		}
  1456  
  1457  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetIPv6RecvError()))
  1458  		return &v, nil
  1459  
  1460  	case linux.IPV6_RECVORIGDSTADDR:
  1461  		if outLen < sizeOfInt32 {
  1462  			return nil, syserr.ErrInvalidArgument
  1463  		}
  1464  
  1465  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveOriginalDstAddress()))
  1466  		return &v, nil
  1467  
  1468  	case linux.IPV6_RECVPKTINFO:
  1469  		if outLen < sizeOfInt32 {
  1470  			return nil, syserr.ErrInvalidArgument
  1471  		}
  1472  
  1473  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetIPv6ReceivePacketInfo()))
  1474  		return &v, nil
  1475  
  1476  	case linux.IP6T_ORIGINAL_DST:
  1477  		if outLen < sockAddrInet6Size {
  1478  			return nil, syserr.ErrInvalidArgument
  1479  		}
  1480  
  1481  		var v tcpip.OriginalDestinationOption
  1482  		if err := ep.GetSockOpt(&v); err != nil {
  1483  			return nil, syserr.TranslateNetstackError(err)
  1484  		}
  1485  
  1486  		a, _ := socket.ConvertAddress(linux.AF_INET6, tcpip.FullAddress(v))
  1487  		return a.(*linux.SockAddrInet6), nil
  1488  
  1489  	case linux.IP6T_SO_GET_INFO:
  1490  		if outLen < linux.SizeOfIPTGetinfo {
  1491  			return nil, syserr.ErrInvalidArgument
  1492  		}
  1493  
  1494  		// Only valid for raw IPv6 sockets.
  1495  		if skType != linux.SOCK_RAW {
  1496  			return nil, syserr.ErrProtocolNotAvailable
  1497  		}
  1498  
  1499  		stk := inet.StackFromContext(t)
  1500  		if stk == nil {
  1501  			return nil, syserr.ErrNoDevice
  1502  		}
  1503  		info, err := netfilter.GetInfo(t, stk.(*Stack).Stack, outPtr, true)
  1504  		if err != nil {
  1505  			return nil, err
  1506  		}
  1507  		return &info, nil
  1508  
  1509  	case linux.IP6T_SO_GET_ENTRIES:
  1510  		// IPTGetEntries is reused for IPv6.
  1511  		if outLen < linux.SizeOfIPTGetEntries {
  1512  			return nil, syserr.ErrInvalidArgument
  1513  		}
  1514  		// Only valid for raw IPv6 sockets.
  1515  		if skType != linux.SOCK_RAW {
  1516  			return nil, syserr.ErrProtocolNotAvailable
  1517  		}
  1518  
  1519  		stk := inet.StackFromContext(t)
  1520  		if stk == nil {
  1521  			return nil, syserr.ErrNoDevice
  1522  		}
  1523  		entries, err := netfilter.GetEntries6(t, stk.(*Stack).Stack, outPtr, outLen)
  1524  		if err != nil {
  1525  			return nil, err
  1526  		}
  1527  		return &entries, nil
  1528  
  1529  	case linux.IP6T_SO_GET_REVISION_TARGET:
  1530  		if outLen < linux.SizeOfXTGetRevision {
  1531  			return nil, syserr.ErrInvalidArgument
  1532  		}
  1533  
  1534  		// Only valid for raw IPv6 sockets.
  1535  		if skType != linux.SOCK_RAW {
  1536  			return nil, syserr.ErrProtocolNotAvailable
  1537  		}
  1538  
  1539  		stk := inet.StackFromContext(t)
  1540  		if stk == nil {
  1541  			return nil, syserr.ErrNoDevice
  1542  		}
  1543  		ret, err := netfilter.TargetRevision(t, outPtr, header.IPv6ProtocolNumber)
  1544  		if err != nil {
  1545  			return nil, err
  1546  		}
  1547  		return &ret, nil
  1548  	}
  1549  	return nil, syserr.ErrProtocolNotAvailable
  1550  }
  1551  
  1552  // getSockOptIP implements GetSockOpt when level is SOL_IP.
  1553  func getSockOptIP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, outPtr hostarch.Addr, outLen int, _ int) (marshal.Marshallable, *syserr.Error) {
  1554  	if _, ok := ep.(tcpip.Endpoint); !ok {
  1555  		log.Warningf("SOL_IP options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
  1556  		return nil, syserr.ErrUnknownProtocolOption
  1557  	}
  1558  
  1559  	switch name {
  1560  	case linux.IP_TTL:
  1561  		if outLen < sizeOfInt32 {
  1562  			return nil, syserr.ErrInvalidArgument
  1563  		}
  1564  
  1565  		v, err := ep.GetSockOptInt(tcpip.IPv4TTLOption)
  1566  		if err != nil {
  1567  			return nil, syserr.TranslateNetstackError(err)
  1568  		}
  1569  
  1570  		// Fill in the default value, if needed.
  1571  		vP := primitive.Int32(v)
  1572  		if vP == 0 {
  1573  			vP, err = defaultTTL(t, header.IPv4ProtocolNumber)
  1574  			if err != nil {
  1575  				return nil, syserr.TranslateNetstackError(err)
  1576  			}
  1577  		}
  1578  
  1579  		return &vP, nil
  1580  
  1581  	case linux.IP_RECVTTL:
  1582  		if outLen < sizeOfInt32 {
  1583  			return nil, syserr.ErrInvalidArgument
  1584  		}
  1585  
  1586  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTTL()))
  1587  		return &v, nil
  1588  
  1589  	case linux.IP_MULTICAST_TTL:
  1590  		if outLen < sizeOfInt32 {
  1591  			return nil, syserr.ErrInvalidArgument
  1592  		}
  1593  
  1594  		v, err := ep.GetSockOptInt(tcpip.MulticastTTLOption)
  1595  		if err != nil {
  1596  			return nil, syserr.TranslateNetstackError(err)
  1597  		}
  1598  
  1599  		vP := primitive.Int32(v)
  1600  		return &vP, nil
  1601  
  1602  	case linux.IP_MULTICAST_IF:
  1603  		if outLen < len(linux.InetAddr{}) {
  1604  			return nil, syserr.ErrInvalidArgument
  1605  		}
  1606  
  1607  		var v tcpip.MulticastInterfaceOption
  1608  		if err := ep.GetSockOpt(&v); err != nil {
  1609  			return nil, syserr.TranslateNetstackError(err)
  1610  		}
  1611  
  1612  		a, _ := socket.ConvertAddress(linux.AF_INET, tcpip.FullAddress{Addr: v.InterfaceAddr})
  1613  
  1614  		return &a.(*linux.SockAddrInet).Addr, nil
  1615  
  1616  	case linux.IP_MULTICAST_LOOP:
  1617  		if outLen < sizeOfInt32 {
  1618  			return nil, syserr.ErrInvalidArgument
  1619  		}
  1620  
  1621  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetMulticastLoop()))
  1622  		return &v, nil
  1623  
  1624  	case linux.IP_TOS:
  1625  		// Length handling for parity with Linux.
  1626  		if outLen == 0 {
  1627  			var b primitive.ByteSlice
  1628  			return &b, nil
  1629  		}
  1630  		v, err := ep.GetSockOptInt(tcpip.IPv4TOSOption)
  1631  		if err != nil {
  1632  			return nil, syserr.TranslateNetstackError(err)
  1633  		}
  1634  		if outLen < sizeOfInt32 {
  1635  			vP := primitive.Uint8(v)
  1636  			return &vP, nil
  1637  		}
  1638  		vP := primitive.Int32(v)
  1639  		return &vP, nil
  1640  
  1641  	case linux.IP_RECVTOS:
  1642  		if outLen < sizeOfInt32 {
  1643  			return nil, syserr.ErrInvalidArgument
  1644  		}
  1645  
  1646  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTOS()))
  1647  		return &v, nil
  1648  
  1649  	case linux.IP_RECVERR:
  1650  		if outLen < sizeOfInt32 {
  1651  			return nil, syserr.ErrInvalidArgument
  1652  		}
  1653  
  1654  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetIPv4RecvError()))
  1655  		return &v, nil
  1656  
  1657  	case linux.IP_PKTINFO:
  1658  		if outLen < sizeOfInt32 {
  1659  			return nil, syserr.ErrInvalidArgument
  1660  		}
  1661  
  1662  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceivePacketInfo()))
  1663  		return &v, nil
  1664  
  1665  	case linux.IP_HDRINCL:
  1666  		if outLen < sizeOfInt32 {
  1667  			return nil, syserr.ErrInvalidArgument
  1668  		}
  1669  
  1670  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetHeaderIncluded()))
  1671  		return &v, nil
  1672  
  1673  	case linux.IP_RECVORIGDSTADDR:
  1674  		if outLen < sizeOfInt32 {
  1675  			return nil, syserr.ErrInvalidArgument
  1676  		}
  1677  
  1678  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveOriginalDstAddress()))
  1679  		return &v, nil
  1680  
  1681  	case linux.SO_ORIGINAL_DST:
  1682  		if outLen < sockAddrInetSize {
  1683  			return nil, syserr.ErrInvalidArgument
  1684  		}
  1685  
  1686  		var v tcpip.OriginalDestinationOption
  1687  		if err := ep.GetSockOpt(&v); err != nil {
  1688  			return nil, syserr.TranslateNetstackError(err)
  1689  		}
  1690  
  1691  		a, _ := socket.ConvertAddress(linux.AF_INET, tcpip.FullAddress(v))
  1692  		return a.(*linux.SockAddrInet), nil
  1693  
  1694  	case linux.IPT_SO_GET_INFO:
  1695  		if outLen < linux.SizeOfIPTGetinfo {
  1696  			return nil, syserr.ErrInvalidArgument
  1697  		}
  1698  
  1699  		// Only valid for raw IPv4 sockets.
  1700  		if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
  1701  			return nil, syserr.ErrProtocolNotAvailable
  1702  		}
  1703  
  1704  		stk := inet.StackFromContext(t)
  1705  		if stk == nil {
  1706  			return nil, syserr.ErrNoDevice
  1707  		}
  1708  		info, err := netfilter.GetInfo(t, stk.(*Stack).Stack, outPtr, false)
  1709  		if err != nil {
  1710  			return nil, err
  1711  		}
  1712  		return &info, nil
  1713  
  1714  	case linux.IPT_SO_GET_ENTRIES:
  1715  		if outLen < linux.SizeOfIPTGetEntries {
  1716  			return nil, syserr.ErrInvalidArgument
  1717  		}
  1718  
  1719  		// Only valid for raw IPv4 sockets.
  1720  		if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
  1721  			return nil, syserr.ErrProtocolNotAvailable
  1722  		}
  1723  
  1724  		stk := inet.StackFromContext(t)
  1725  		if stk == nil {
  1726  			return nil, syserr.ErrNoDevice
  1727  		}
  1728  		entries, err := netfilter.GetEntries4(t, stk.(*Stack).Stack, outPtr, outLen)
  1729  		if err != nil {
  1730  			return nil, err
  1731  		}
  1732  		return &entries, nil
  1733  
  1734  	case linux.IPT_SO_GET_REVISION_TARGET:
  1735  		if outLen < linux.SizeOfXTGetRevision {
  1736  			return nil, syserr.ErrInvalidArgument
  1737  		}
  1738  
  1739  		// Only valid for raw IPv4 sockets.
  1740  		if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
  1741  			return nil, syserr.ErrProtocolNotAvailable
  1742  		}
  1743  
  1744  		stk := inet.StackFromContext(t)
  1745  		if stk == nil {
  1746  			return nil, syserr.ErrNoDevice
  1747  		}
  1748  		ret, err := netfilter.TargetRevision(t, outPtr, header.IPv4ProtocolNumber)
  1749  		if err != nil {
  1750  			return nil, err
  1751  		}
  1752  		return &ret, nil
  1753  	}
  1754  	return nil, syserr.ErrProtocolNotAvailable
  1755  }
  1756  
  1757  // SetSockOpt can be used to implement the linux syscall setsockopt(2) for
  1758  // sockets backed by a commonEndpoint.
  1759  func SetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, level int, name int, optVal []byte) *syserr.Error {
  1760  	switch level {
  1761  	case linux.SOL_SOCKET:
  1762  		return setSockOptSocket(t, s, ep, name, optVal)
  1763  
  1764  	case linux.SOL_TCP:
  1765  		return setSockOptTCP(t, s, ep, name, optVal)
  1766  
  1767  	case linux.SOL_ICMPV6:
  1768  		return setSockOptICMPv6(t, s, ep, name, optVal)
  1769  
  1770  	case linux.SOL_IPV6:
  1771  		return setSockOptIPv6(t, s, ep, name, optVal)
  1772  
  1773  	case linux.SOL_IP:
  1774  		return setSockOptIP(t, s, ep, name, optVal)
  1775  
  1776  	case linux.SOL_PACKET:
  1777  		// gVisor doesn't support any SOL_PACKET options just return not
  1778  		// supported. Returning nil here will result in tcpdump thinking AF_PACKET
  1779  		// features are supported and proceed to use them and break.
  1780  		return syserr.ErrProtocolNotAvailable
  1781  
  1782  	case linux.SOL_UDP,
  1783  		linux.SOL_RAW:
  1784  		// Not supported.
  1785  	}
  1786  
  1787  	return nil
  1788  }
  1789  
  1790  func clampBufSize(newSz, min, max int64, ignoreMax bool) int64 {
  1791  	// packetOverheadFactor is used to multiply the value provided by the user on
  1792  	// a setsockopt(2) for setting the send/receive buffer sizes sockets.
  1793  	const packetOverheadFactor = 2
  1794  
  1795  	if !ignoreMax && newSz > max {
  1796  		newSz = max
  1797  	}
  1798  
  1799  	if newSz < math.MaxInt32/packetOverheadFactor {
  1800  		newSz *= packetOverheadFactor
  1801  		if newSz < min {
  1802  			newSz = min
  1803  		}
  1804  	} else {
  1805  		newSz = math.MaxInt32
  1806  	}
  1807  	return newSz
  1808  }
  1809  
  1810  // setSockOptSocket implements SetSockOpt when level is SOL_SOCKET.
  1811  func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
  1812  	switch name {
  1813  	case linux.SO_SNDBUF:
  1814  		if len(optVal) < sizeOfInt32 {
  1815  			return syserr.ErrInvalidArgument
  1816  		}
  1817  
  1818  		v := hostarch.ByteOrder.Uint32(optVal)
  1819  		min, max := ep.SocketOptions().SendBufferLimits()
  1820  		clamped := clampBufSize(int64(v), min, max, false /* ignoreMax */)
  1821  		ep.SocketOptions().SetSendBufferSize(clamped, true /* notify */)
  1822  		return nil
  1823  
  1824  	case linux.SO_RCVBUF:
  1825  		if len(optVal) < sizeOfInt32 {
  1826  			return syserr.ErrInvalidArgument
  1827  		}
  1828  
  1829  		v := hostarch.ByteOrder.Uint32(optVal)
  1830  		min, max := ep.SocketOptions().ReceiveBufferLimits()
  1831  		clamped := clampBufSize(int64(v), min, max, false /* ignoreMax */)
  1832  		ep.SocketOptions().SetReceiveBufferSize(clamped, true /* notify */)
  1833  		return nil
  1834  
  1835  	case linux.SO_RCVBUFFORCE:
  1836  		if len(optVal) < sizeOfInt32 {
  1837  			return syserr.ErrInvalidArgument
  1838  		}
  1839  
  1840  		if creds := auth.CredentialsFromContext(t); !creds.HasCapability(linux.CAP_NET_ADMIN) {
  1841  			return syserr.ErrNotPermitted
  1842  		}
  1843  
  1844  		v := hostarch.ByteOrder.Uint32(optVal)
  1845  		min, max := ep.SocketOptions().ReceiveBufferLimits()
  1846  		clamped := clampBufSize(int64(v), min, max, true /* ignoreMax */)
  1847  		ep.SocketOptions().SetReceiveBufferSize(clamped, true /* notify */)
  1848  		return nil
  1849  
  1850  	case linux.SO_REUSEADDR:
  1851  		if len(optVal) < sizeOfInt32 {
  1852  			return syserr.ErrInvalidArgument
  1853  		}
  1854  
  1855  		v := hostarch.ByteOrder.Uint32(optVal)
  1856  		ep.SocketOptions().SetReuseAddress(v != 0)
  1857  		return nil
  1858  
  1859  	case linux.SO_REUSEPORT:
  1860  		if len(optVal) < sizeOfInt32 {
  1861  			return syserr.ErrInvalidArgument
  1862  		}
  1863  
  1864  		v := hostarch.ByteOrder.Uint32(optVal)
  1865  		ep.SocketOptions().SetReusePort(v != 0)
  1866  		return nil
  1867  
  1868  	case linux.SO_BINDTODEVICE:
  1869  		n := bytes.IndexByte(optVal, 0)
  1870  		if n == -1 {
  1871  			n = len(optVal)
  1872  		}
  1873  		name := string(optVal[:n])
  1874  		if name == "" {
  1875  			return syserr.TranslateNetstackError(ep.SocketOptions().SetBindToDevice(0))
  1876  		}
  1877  		s := t.NetworkContext()
  1878  		if s == nil {
  1879  			return syserr.ErrNoDevice
  1880  		}
  1881  		for nicID, nic := range s.Interfaces() {
  1882  			if nic.Name == name {
  1883  				return syserr.TranslateNetstackError(ep.SocketOptions().SetBindToDevice(nicID))
  1884  			}
  1885  		}
  1886  		return syserr.ErrUnknownDevice
  1887  
  1888  	case linux.SO_BROADCAST:
  1889  		if len(optVal) < sizeOfInt32 {
  1890  			return syserr.ErrInvalidArgument
  1891  		}
  1892  
  1893  		v := hostarch.ByteOrder.Uint32(optVal)
  1894  		ep.SocketOptions().SetBroadcast(v != 0)
  1895  		return nil
  1896  
  1897  	case linux.SO_PASSCRED:
  1898  		if len(optVal) < sizeOfInt32 {
  1899  			return syserr.ErrInvalidArgument
  1900  		}
  1901  
  1902  		v := hostarch.ByteOrder.Uint32(optVal)
  1903  		ep.SocketOptions().SetPassCred(v != 0)
  1904  		return nil
  1905  
  1906  	case linux.SO_KEEPALIVE:
  1907  		if len(optVal) < sizeOfInt32 {
  1908  			return syserr.ErrInvalidArgument
  1909  		}
  1910  
  1911  		v := hostarch.ByteOrder.Uint32(optVal)
  1912  		ep.SocketOptions().SetKeepAlive(v != 0)
  1913  		return nil
  1914  
  1915  	case linux.SO_SNDTIMEO:
  1916  		if len(optVal) < linux.SizeOfTimeval {
  1917  			return syserr.ErrInvalidArgument
  1918  		}
  1919  
  1920  		var v linux.Timeval
  1921  		v.UnmarshalBytes(optVal)
  1922  		if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) {
  1923  			return syserr.ErrDomain
  1924  		}
  1925  		s.SetSendTimeout(v.ToNsecCapped())
  1926  		return nil
  1927  
  1928  	case linux.SO_RCVTIMEO:
  1929  		if len(optVal) < linux.SizeOfTimeval {
  1930  			return syserr.ErrInvalidArgument
  1931  		}
  1932  
  1933  		var v linux.Timeval
  1934  		v.UnmarshalBytes(optVal)
  1935  		if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) {
  1936  			return syserr.ErrDomain
  1937  		}
  1938  		s.SetRecvTimeout(v.ToNsecCapped())
  1939  		return nil
  1940  
  1941  	case linux.SO_OOBINLINE:
  1942  		if len(optVal) < sizeOfInt32 {
  1943  			return syserr.ErrInvalidArgument
  1944  		}
  1945  
  1946  		v := hostarch.ByteOrder.Uint32(optVal)
  1947  		ep.SocketOptions().SetOutOfBandInline(v != 0)
  1948  		return nil
  1949  
  1950  	case linux.SO_NO_CHECK:
  1951  		if len(optVal) < sizeOfInt32 {
  1952  			return syserr.ErrInvalidArgument
  1953  		}
  1954  
  1955  		v := hostarch.ByteOrder.Uint32(optVal)
  1956  		ep.SocketOptions().SetNoChecksum(v != 0)
  1957  		return nil
  1958  
  1959  	case linux.SO_LINGER:
  1960  		if len(optVal) < linux.SizeOfLinger {
  1961  			return syserr.ErrInvalidArgument
  1962  		}
  1963  
  1964  		var v linux.Linger
  1965  		v.UnmarshalBytes(optVal)
  1966  
  1967  		ep.SocketOptions().SetLinger(tcpip.LingerOption{
  1968  			Enabled: v.OnOff != 0,
  1969  			Timeout: time.Second * time.Duration(v.Linger),
  1970  		})
  1971  		return nil
  1972  
  1973  	case linux.SO_DETACH_FILTER:
  1974  		// optval is ignored.
  1975  		var v tcpip.SocketDetachFilterOption
  1976  		return syserr.TranslateNetstackError(ep.SetSockOpt(&v))
  1977  
  1978  	// TODO(b/226603727): Add support for SO_RCVLOWAT option. For now, only
  1979  	// the unsupported syscall message is removed.
  1980  	case linux.SO_RCVLOWAT:
  1981  		if len(optVal) < sizeOfInt32 {
  1982  			return syserr.ErrInvalidArgument
  1983  		}
  1984  
  1985  		v := hostarch.ByteOrder.Uint32(optVal)
  1986  		ep.SocketOptions().SetRcvlowat(int32(v))
  1987  		return nil
  1988  	}
  1989  
  1990  	return nil
  1991  }
  1992  
  1993  // setSockOptTCP implements SetSockOpt when level is SOL_TCP.
  1994  func setSockOptTCP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
  1995  	if !socket.IsTCP(s) {
  1996  		return syserr.ErrUnknownProtocolOption
  1997  	}
  1998  
  1999  	switch name {
  2000  	case linux.TCP_NODELAY:
  2001  		if len(optVal) < sizeOfInt32 {
  2002  			return syserr.ErrInvalidArgument
  2003  		}
  2004  
  2005  		v := hostarch.ByteOrder.Uint32(optVal)
  2006  		ep.SocketOptions().SetDelayOption(v == 0)
  2007  		return nil
  2008  
  2009  	case linux.TCP_CORK:
  2010  		if len(optVal) < sizeOfInt32 {
  2011  			return syserr.ErrInvalidArgument
  2012  		}
  2013  
  2014  		v := hostarch.ByteOrder.Uint32(optVal)
  2015  		ep.SocketOptions().SetCorkOption(v != 0)
  2016  		return nil
  2017  
  2018  	case linux.TCP_QUICKACK:
  2019  		if len(optVal) < sizeOfInt32 {
  2020  			return syserr.ErrInvalidArgument
  2021  		}
  2022  
  2023  		v := hostarch.ByteOrder.Uint32(optVal)
  2024  		ep.SocketOptions().SetQuickAck(v != 0)
  2025  		return nil
  2026  
  2027  	case linux.TCP_MAXSEG:
  2028  		if len(optVal) < sizeOfInt32 {
  2029  			return syserr.ErrInvalidArgument
  2030  		}
  2031  
  2032  		v := hostarch.ByteOrder.Uint32(optVal)
  2033  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MaxSegOption, int(v)))
  2034  
  2035  	case linux.TCP_KEEPIDLE:
  2036  		if len(optVal) < sizeOfInt32 {
  2037  			return syserr.ErrInvalidArgument
  2038  		}
  2039  
  2040  		v := hostarch.ByteOrder.Uint32(optVal)
  2041  		if v < 1 || v > linux.MAX_TCP_KEEPIDLE {
  2042  			return syserr.ErrInvalidArgument
  2043  		}
  2044  		opt := tcpip.KeepaliveIdleOption(time.Second * time.Duration(v))
  2045  		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
  2046  
  2047  	case linux.TCP_KEEPINTVL:
  2048  		if len(optVal) < sizeOfInt32 {
  2049  			return syserr.ErrInvalidArgument
  2050  		}
  2051  
  2052  		v := hostarch.ByteOrder.Uint32(optVal)
  2053  		if v < 1 || v > linux.MAX_TCP_KEEPINTVL {
  2054  			return syserr.ErrInvalidArgument
  2055  		}
  2056  		opt := tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v))
  2057  		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
  2058  
  2059  	case linux.TCP_KEEPCNT:
  2060  		if len(optVal) < sizeOfInt32 {
  2061  			return syserr.ErrInvalidArgument
  2062  		}
  2063  
  2064  		v := hostarch.ByteOrder.Uint32(optVal)
  2065  		if v < 1 || v > linux.MAX_TCP_KEEPCNT {
  2066  			return syserr.ErrInvalidArgument
  2067  		}
  2068  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.KeepaliveCountOption, int(v)))
  2069  
  2070  	case linux.TCP_USER_TIMEOUT:
  2071  		if len(optVal) < sizeOfInt32 {
  2072  			return syserr.ErrInvalidArgument
  2073  		}
  2074  
  2075  		v := int32(hostarch.ByteOrder.Uint32(optVal))
  2076  		if v < 0 {
  2077  			return syserr.ErrInvalidArgument
  2078  		}
  2079  		opt := tcpip.TCPUserTimeoutOption(time.Millisecond * time.Duration(v))
  2080  		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
  2081  
  2082  	case linux.TCP_CONGESTION:
  2083  		v := tcpip.CongestionControlOption(optVal)
  2084  		if err := ep.SetSockOpt(&v); err != nil {
  2085  			return syserr.TranslateNetstackError(err)
  2086  		}
  2087  		return nil
  2088  
  2089  	case linux.TCP_LINGER2:
  2090  		if len(optVal) < sizeOfInt32 {
  2091  			return syserr.ErrInvalidArgument
  2092  		}
  2093  
  2094  		v := int32(hostarch.ByteOrder.Uint32(optVal))
  2095  		opt := tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v))
  2096  		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
  2097  
  2098  	case linux.TCP_DEFER_ACCEPT:
  2099  		if len(optVal) < sizeOfInt32 {
  2100  			return syserr.ErrInvalidArgument
  2101  		}
  2102  		v := int32(hostarch.ByteOrder.Uint32(optVal))
  2103  		if v < 0 {
  2104  			v = 0
  2105  		}
  2106  		opt := tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v))
  2107  		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
  2108  
  2109  	case linux.TCP_SYNCNT:
  2110  		if len(optVal) < sizeOfInt32 {
  2111  			return syserr.ErrInvalidArgument
  2112  		}
  2113  		v := hostarch.ByteOrder.Uint32(optVal)
  2114  
  2115  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPSynCountOption, int(v)))
  2116  
  2117  	case linux.TCP_WINDOW_CLAMP:
  2118  		if len(optVal) < sizeOfInt32 {
  2119  			return syserr.ErrInvalidArgument
  2120  		}
  2121  		v := hostarch.ByteOrder.Uint32(optVal)
  2122  
  2123  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPWindowClampOption, int(v)))
  2124  
  2125  	case linux.TCP_REPAIR_OPTIONS:
  2126  		// Not supported.
  2127  	}
  2128  
  2129  	return nil
  2130  }
  2131  
  2132  func setSockOptICMPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
  2133  	if _, ok := ep.(tcpip.Endpoint); !ok {
  2134  		log.Warningf("SOL_ICMPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
  2135  		return syserr.ErrUnknownProtocolOption
  2136  	}
  2137  
  2138  	if family, _, _ := s.Type(); family != linux.AF_INET6 {
  2139  		return syserr.ErrUnknownProtocolOption
  2140  	}
  2141  
  2142  	switch name {
  2143  	case linux.ICMPV6_FILTER:
  2144  		var req linux.ICMP6Filter
  2145  		if len(optVal) < req.SizeBytes() {
  2146  			return syserr.ErrInvalidArgument
  2147  		}
  2148  
  2149  		req.UnmarshalUnsafe(optVal)
  2150  		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.ICMPv6Filter{DenyType: req.Filter}))
  2151  	}
  2152  
  2153  	return nil
  2154  }
  2155  
  2156  // setSockOptIPv6 implements SetSockOpt when level is SOL_IPV6.
  2157  func setSockOptIPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
  2158  	if _, ok := ep.(tcpip.Endpoint); !ok {
  2159  		log.Warningf("SOL_IPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
  2160  		return syserr.ErrUnknownProtocolOption
  2161  	}
  2162  
  2163  	family, _, _ := s.Type()
  2164  	if family != linux.AF_INET6 {
  2165  		return syserr.ErrUnknownProtocolOption
  2166  	}
  2167  
  2168  	switch name {
  2169  	case linux.IPV6_CHECKSUM:
  2170  		if len(optVal) < sizeOfInt32 {
  2171  			return syserr.ErrInvalidArgument
  2172  		}
  2173  
  2174  		// int may not be 32-bits so we cast the uint32 to an int32 before casting
  2175  		// to an int.
  2176  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6Checksum, int(int32(hostarch.ByteOrder.Uint32(optVal)))))
  2177  
  2178  	case linux.IPV6_V6ONLY:
  2179  		if len(optVal) < sizeOfInt32 {
  2180  			return syserr.ErrInvalidArgument
  2181  		}
  2182  
  2183  		if socket.IsTCP(s) && tcp.EndpointState(ep.State()) != tcp.StateInitial {
  2184  			return syserr.ErrInvalidEndpointState
  2185  		} else if socket.IsUDP(s) && transport.DatagramEndpointState(ep.State()) != transport.DatagramEndpointStateInitial {
  2186  			return syserr.ErrInvalidEndpointState
  2187  		}
  2188  
  2189  		v := hostarch.ByteOrder.Uint32(optVal)
  2190  		ep.SocketOptions().SetV6Only(v != 0)
  2191  		return nil
  2192  
  2193  	case linux.IPV6_ADD_MEMBERSHIP:
  2194  		req, err := copyInMulticastV6Request(optVal)
  2195  		if err != nil {
  2196  			return err
  2197  		}
  2198  
  2199  		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.AddMembershipOption{
  2200  			NIC:           tcpip.NICID(req.InterfaceIndex),
  2201  			MulticastAddr: tcpip.AddrFrom16(req.MulticastAddr),
  2202  		}))
  2203  
  2204  	case linux.IPV6_DROP_MEMBERSHIP:
  2205  		req, err := copyInMulticastV6Request(optVal)
  2206  		if err != nil {
  2207  			return err
  2208  		}
  2209  
  2210  		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.RemoveMembershipOption{
  2211  			NIC:           tcpip.NICID(req.InterfaceIndex),
  2212  			MulticastAddr: tcpip.AddrFrom16(req.MulticastAddr),
  2213  		}))
  2214  
  2215  	case linux.IPV6_IPSEC_POLICY,
  2216  		linux.IPV6_JOIN_ANYCAST,
  2217  		linux.IPV6_LEAVE_ANYCAST,
  2218  		// TODO(b/148887420): Add support for IPV6_PKTINFO.
  2219  		linux.IPV6_PKTINFO,
  2220  		linux.IPV6_ROUTER_ALERT,
  2221  		linux.IPV6_XFRM_POLICY,
  2222  		linux.MCAST_BLOCK_SOURCE,
  2223  		linux.MCAST_JOIN_GROUP,
  2224  		linux.MCAST_JOIN_SOURCE_GROUP,
  2225  		linux.MCAST_LEAVE_GROUP,
  2226  		linux.MCAST_LEAVE_SOURCE_GROUP,
  2227  		linux.MCAST_UNBLOCK_SOURCE:
  2228  		// Not supported.
  2229  
  2230  	case linux.IPV6_RECVORIGDSTADDR:
  2231  		if len(optVal) < sizeOfInt32 {
  2232  			return syserr.ErrInvalidArgument
  2233  		}
  2234  		v := int32(hostarch.ByteOrder.Uint32(optVal))
  2235  
  2236  		ep.SocketOptions().SetReceiveOriginalDstAddress(v != 0)
  2237  		return nil
  2238  
  2239  	case linux.IPV6_RECVPKTINFO:
  2240  		if len(optVal) < sizeOfInt32 {
  2241  			return syserr.ErrInvalidArgument
  2242  		}
  2243  		v := int32(hostarch.ByteOrder.Uint32(optVal))
  2244  
  2245  		ep.SocketOptions().SetIPv6ReceivePacketInfo(v != 0)
  2246  		return nil
  2247  
  2248  	case linux.IPV6_UNICAST_HOPS:
  2249  		if len(optVal) < sizeOfInt32 {
  2250  			return syserr.ErrInvalidArgument
  2251  		}
  2252  		v := int32(hostarch.ByteOrder.Uint32(optVal))
  2253  		if v < -1 || v > 255 {
  2254  			return syserr.ErrInvalidArgument
  2255  		}
  2256  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6HopLimitOption, int(v)))
  2257  
  2258  	case linux.IPV6_RECVHOPLIMIT:
  2259  		v, err := parseIntOrChar(optVal)
  2260  		if err != nil {
  2261  			return err
  2262  		}
  2263  
  2264  		ep.SocketOptions().SetReceiveHopLimit(v != 0)
  2265  		return nil
  2266  
  2267  	case linux.IPV6_TCLASS:
  2268  		if len(optVal) < sizeOfInt32 {
  2269  			return syserr.ErrInvalidArgument
  2270  		}
  2271  		v := int32(hostarch.ByteOrder.Uint32(optVal))
  2272  		if v < -1 || v > 255 {
  2273  			return syserr.ErrInvalidArgument
  2274  		}
  2275  		if v == -1 {
  2276  			v = 0
  2277  		}
  2278  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6TrafficClassOption, int(v)))
  2279  
  2280  	case linux.IPV6_RECVTCLASS:
  2281  		v, err := parseIntOrChar(optVal)
  2282  		if err != nil {
  2283  			return err
  2284  		}
  2285  
  2286  		ep.SocketOptions().SetReceiveTClass(v != 0)
  2287  		return nil
  2288  	case linux.IPV6_RECVERR:
  2289  		if len(optVal) == 0 {
  2290  			return nil
  2291  		}
  2292  		v, err := parseIntOrChar(optVal)
  2293  		if err != nil {
  2294  			return err
  2295  		}
  2296  		ep.SocketOptions().SetIPv6RecvError(v != 0)
  2297  		return nil
  2298  
  2299  	case linux.IP6T_SO_SET_REPLACE:
  2300  		if len(optVal) < linux.SizeOfIP6TReplace {
  2301  			return syserr.ErrInvalidArgument
  2302  		}
  2303  
  2304  		// Only valid for raw IPv6 sockets.
  2305  		if !socket.IsRaw(s) {
  2306  			return syserr.ErrProtocolNotAvailable
  2307  		}
  2308  
  2309  		stk := inet.StackFromContext(t)
  2310  		if stk == nil {
  2311  			return syserr.ErrNoDevice
  2312  		}
  2313  		// Stack must be a netstack stack.
  2314  		return netfilter.SetEntries(t.Credentials().UserNamespace, stk.(*Stack).Stack, optVal, true)
  2315  
  2316  	case linux.IP6T_SO_SET_ADD_COUNTERS:
  2317  		log.Infof("IP6T_SO_SET_ADD_COUNTERS is not supported")
  2318  		return nil
  2319  	}
  2320  
  2321  	return nil
  2322  }
  2323  
// Sizes of the multicast request structures as reported by their SizeBytes
// methods, computed once at package initialization. The copyIn* helpers below
// compare them against the length of a setsockopt value to decide which
// structure, if any, the value holds.
var (
	inetMulticastRequestSize        = (*linux.InetMulticastRequest)(nil).SizeBytes()
	inetMulticastRequestWithNICSize = (*linux.InetMulticastRequestWithNIC)(nil).SizeBytes()
	inet6MulticastRequestSize       = (*linux.Inet6MulticastRequest)(nil).SizeBytes()
)
  2329  
  2330  // copyInMulticastRequest copies in a variable-size multicast request. The
  2331  // kernel determines which structure was passed by its length. IP_MULTICAST_IF
  2332  // supports ip_mreqn, ip_mreq and in_addr, while IP_ADD_MEMBERSHIP and
  2333  // IP_DROP_MEMBERSHIP only support ip_mreqn and ip_mreq. To handle this,
  2334  // allowAddr controls whether in_addr is accepted or rejected.
  2335  func copyInMulticastRequest(optVal []byte, allowAddr bool) (linux.InetMulticastRequestWithNIC, *syserr.Error) {
  2336  	if len(optVal) < len(linux.InetAddr{}) {
  2337  		return linux.InetMulticastRequestWithNIC{}, syserr.ErrInvalidArgument
  2338  	}
  2339  
  2340  	if len(optVal) < inetMulticastRequestSize {
  2341  		if !allowAddr {
  2342  			return linux.InetMulticastRequestWithNIC{}, syserr.ErrInvalidArgument
  2343  		}
  2344  
  2345  		var req linux.InetMulticastRequestWithNIC
  2346  		copy(req.InterfaceAddr[:], optVal)
  2347  		return req, nil
  2348  	}
  2349  
  2350  	if len(optVal) >= inetMulticastRequestWithNICSize {
  2351  		var req linux.InetMulticastRequestWithNIC
  2352  		req.UnmarshalUnsafe(optVal)
  2353  		return req, nil
  2354  	}
  2355  
  2356  	var req linux.InetMulticastRequestWithNIC
  2357  	req.InetMulticastRequest.UnmarshalUnsafe(optVal)
  2358  	return req, nil
  2359  }
  2360  
  2361  func copyInMulticastV6Request(optVal []byte) (linux.Inet6MulticastRequest, *syserr.Error) {
  2362  	if len(optVal) < inet6MulticastRequestSize {
  2363  		return linux.Inet6MulticastRequest{}, syserr.ErrInvalidArgument
  2364  	}
  2365  
  2366  	var req linux.Inet6MulticastRequest
  2367  	req.UnmarshalUnsafe(optVal)
  2368  	return req, nil
  2369  }
  2370  
  2371  // parseIntOrChar copies either a 32-bit int or an 8-bit uint out of buf.
  2372  //
  2373  // net/ipv4/ip_sockglue.c:do_ip_setsockopt does this for its socket options.
  2374  func parseIntOrChar(buf []byte) (int32, *syserr.Error) {
  2375  	if len(buf) == 0 {
  2376  		return 0, syserr.ErrInvalidArgument
  2377  	}
  2378  
  2379  	if len(buf) >= sizeOfInt32 {
  2380  		return int32(hostarch.ByteOrder.Uint32(buf)), nil
  2381  	}
  2382  
  2383  	return int32(buf[0]), nil
  2384  }
  2385  
  2386  // setSockOptIP implements SetSockOpt when level is SOL_IP.
  2387  func setSockOptIP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
  2388  	if _, ok := ep.(tcpip.Endpoint); !ok {
  2389  		log.Warningf("SOL_IP options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
  2390  		return syserr.ErrUnknownProtocolOption
  2391  	}
  2392  
  2393  	switch name {
  2394  	case linux.IP_MULTICAST_TTL:
  2395  		v, err := parseIntOrChar(optVal)
  2396  		if err != nil {
  2397  			return err
  2398  		}
  2399  
  2400  		if v == -1 {
  2401  			// Linux translates -1 to 1.
  2402  			v = 1
  2403  		}
  2404  		if v < 0 || v > 255 {
  2405  			return syserr.ErrInvalidArgument
  2406  		}
  2407  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MulticastTTLOption, int(v)))
  2408  
  2409  	case linux.IP_ADD_MEMBERSHIP:
  2410  		req, err := copyInMulticastRequest(optVal, false /* allowAddr */)
  2411  		if err != nil {
  2412  			return err
  2413  		}
  2414  
  2415  		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.AddMembershipOption{
  2416  			NIC: tcpip.NICID(req.InterfaceIndex),
  2417  			// TODO(igudger): Change AddMembership to use the standard
  2418  			// any address representation.
  2419  			InterfaceAddr: tcpip.AddrFrom4(req.InterfaceAddr),
  2420  			MulticastAddr: tcpip.AddrFrom4(req.MulticastAddr),
  2421  		}))
  2422  
  2423  	case linux.IP_DROP_MEMBERSHIP:
  2424  		req, err := copyInMulticastRequest(optVal, false /* allowAddr */)
  2425  		if err != nil {
  2426  			return err
  2427  		}
  2428  
  2429  		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.RemoveMembershipOption{
  2430  			NIC: tcpip.NICID(req.InterfaceIndex),
  2431  			// TODO(igudger): Change DropMembership to use the standard
  2432  			// any address representation.
  2433  			InterfaceAddr: tcpip.AddrFrom4(req.InterfaceAddr),
  2434  			MulticastAddr: tcpip.AddrFrom4(req.MulticastAddr),
  2435  		}))
  2436  
  2437  	case linux.IP_MULTICAST_IF:
  2438  		req, err := copyInMulticastRequest(optVal, true /* allowAddr */)
  2439  		if err != nil {
  2440  			return err
  2441  		}
  2442  
  2443  		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.MulticastInterfaceOption{
  2444  			NIC:           tcpip.NICID(req.InterfaceIndex),
  2445  			InterfaceAddr: socket.BytesToIPAddress(req.InterfaceAddr[:]),
  2446  		}))
  2447  
  2448  	case linux.IP_MULTICAST_LOOP:
  2449  		v, err := parseIntOrChar(optVal)
  2450  		if err != nil {
  2451  			return err
  2452  		}
  2453  
  2454  		ep.SocketOptions().SetMulticastLoop(v != 0)
  2455  		return nil
  2456  
  2457  	case linux.MCAST_JOIN_GROUP:
  2458  		// FIXME(b/124219304): Implement MCAST_JOIN_GROUP.
  2459  		return syserr.ErrInvalidArgument
  2460  
  2461  	case linux.IP_TTL:
  2462  		v, err := parseIntOrChar(optVal)
  2463  		if err != nil {
  2464  			return err
  2465  		}
  2466  
  2467  		// -1 means default TTL.
  2468  		if v == -1 {
  2469  			v = 0
  2470  		} else if v < 1 || v > 255 {
  2471  			return syserr.ErrInvalidArgument
  2472  		}
  2473  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv4TTLOption, int(v)))
  2474  
  2475  	case linux.IP_RECVTTL:
  2476  		v, err := parseIntOrChar(optVal)
  2477  		if err != nil {
  2478  			return err
  2479  		}
  2480  		ep.SocketOptions().SetReceiveTTL(v != 0)
  2481  		return nil
  2482  
  2483  	case linux.IP_TOS:
  2484  		if len(optVal) == 0 {
  2485  			return nil
  2486  		}
  2487  		v, err := parseIntOrChar(optVal)
  2488  		if err != nil {
  2489  			return err
  2490  		}
  2491  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv4TOSOption, int(v)))
  2492  
  2493  	case linux.IP_RECVTOS:
  2494  		v, err := parseIntOrChar(optVal)
  2495  		if err != nil {
  2496  			return err
  2497  		}
  2498  		ep.SocketOptions().SetReceiveTOS(v != 0)
  2499  		return nil
  2500  
  2501  	case linux.IP_RECVERR:
  2502  		if len(optVal) == 0 {
  2503  			return nil
  2504  		}
  2505  		v, err := parseIntOrChar(optVal)
  2506  		if err != nil {
  2507  			return err
  2508  		}
  2509  		ep.SocketOptions().SetIPv4RecvError(v != 0)
  2510  		return nil
  2511  
  2512  	case linux.IP_PKTINFO:
  2513  		if len(optVal) == 0 {
  2514  			return nil
  2515  		}
  2516  		v, err := parseIntOrChar(optVal)
  2517  		if err != nil {
  2518  			return err
  2519  		}
  2520  		ep.SocketOptions().SetReceivePacketInfo(v != 0)
  2521  		return nil
  2522  
  2523  	case linux.IP_HDRINCL:
  2524  		if len(optVal) == 0 {
  2525  			return nil
  2526  		}
  2527  		v, err := parseIntOrChar(optVal)
  2528  		if err != nil {
  2529  			return err
  2530  		}
  2531  		ep.SocketOptions().SetHeaderIncluded(v != 0)
  2532  		return nil
  2533  
  2534  	case linux.IP_RECVORIGDSTADDR:
  2535  		if len(optVal) == 0 {
  2536  			return nil
  2537  		}
  2538  		v, err := parseIntOrChar(optVal)
  2539  		if err != nil {
  2540  			return err
  2541  		}
  2542  
  2543  		ep.SocketOptions().SetReceiveOriginalDstAddress(v != 0)
  2544  		return nil
  2545  
  2546  	case linux.IPT_SO_SET_REPLACE:
  2547  		if len(optVal) < linux.SizeOfIPTReplace {
  2548  			return syserr.ErrInvalidArgument
  2549  		}
  2550  
  2551  		// Only valid for raw IPv4 sockets.
  2552  		if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
  2553  			return syserr.ErrProtocolNotAvailable
  2554  		}
  2555  
  2556  		stk := inet.StackFromContext(t)
  2557  		if stk == nil {
  2558  			return syserr.ErrNoDevice
  2559  		}
  2560  		// Stack must be a netstack stack.
  2561  		return netfilter.SetEntries(t.Credentials().UserNamespace, stk.(*Stack).Stack, optVal, false)
  2562  
  2563  	case linux.IPT_SO_SET_ADD_COUNTERS:
  2564  		log.Infof("IPT_SO_SET_ADD_COUNTERS is not supported")
  2565  		return nil
  2566  
  2567  	case linux.IP_ADD_SOURCE_MEMBERSHIP,
  2568  		linux.IP_BIND_ADDRESS_NO_PORT,
  2569  		linux.IP_BLOCK_SOURCE,
  2570  		linux.IP_CHECKSUM,
  2571  		linux.IP_DROP_SOURCE_MEMBERSHIP,
  2572  		linux.IP_FREEBIND,
  2573  		linux.IP_IPSEC_POLICY,
  2574  		linux.IP_MINTTL,
  2575  		linux.IP_MSFILTER,
  2576  		linux.IP_MTU_DISCOVER,
  2577  		linux.IP_MULTICAST_ALL,
  2578  		linux.IP_NODEFRAG,
  2579  		linux.IP_OPTIONS,
  2580  		linux.IP_PASSSEC,
  2581  		linux.IP_RECVFRAGSIZE,
  2582  		linux.IP_RECVOPTS,
  2583  		linux.IP_RETOPTS,
  2584  		linux.IP_TRANSPARENT,
  2585  		linux.IP_UNBLOCK_SOURCE,
  2586  		linux.IP_UNICAST_IF,
  2587  		linux.IP_XFRM_POLICY,
  2588  		linux.MCAST_BLOCK_SOURCE,
  2589  		linux.MCAST_JOIN_SOURCE_GROUP,
  2590  		linux.MCAST_LEAVE_GROUP,
  2591  		linux.MCAST_LEAVE_SOURCE_GROUP,
  2592  		linux.MCAST_MSFILTER,
  2593  		linux.MCAST_UNBLOCK_SOURCE:
  2594  		// Not supported.
  2595  	}
  2596  
  2597  	return nil
  2598  }
  2599  
  2600  // GetSockName implements the linux syscall getsockname(2) for sockets backed by
  2601  // tcpip.Endpoint.
  2602  func (s *sock) GetSockName(*kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
  2603  	addr, err := s.Endpoint.GetLocalAddress()
  2604  	if err != nil {
  2605  		return nil, 0, syserr.TranslateNetstackError(err)
  2606  	}
  2607  
  2608  	a, l := socket.ConvertAddress(s.family, addr)
  2609  	return a, l, nil
  2610  }
  2611  
  2612  // GetPeerName implements the linux syscall getpeername(2) for sockets backed by
  2613  // tcpip.Endpoint.
  2614  func (s *sock) GetPeerName(*kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
  2615  	addr, err := s.Endpoint.GetRemoteAddress()
  2616  	if err != nil {
  2617  		return nil, 0, syserr.TranslateNetstackError(err)
  2618  	}
  2619  
  2620  	a, l := socket.ConvertAddress(s.family, addr)
  2621  	return a, l, nil
  2622  }
  2623  
  2624  func (s *sock) fillCmsgInq(cmsg *socket.ControlMessages) {
  2625  	if !s.sockOptInq {
  2626  		return
  2627  	}
  2628  	rcvBufUsed, err := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption)
  2629  	if err != nil {
  2630  		return
  2631  	}
  2632  	cmsg.IP.HasInq = true
  2633  	cmsg.IP.Inq = int32(rcvBufUsed)
  2634  }
  2635  
  2636  func toLinuxPacketType(pktType tcpip.PacketType) uint8 {
  2637  	switch pktType {
  2638  	case tcpip.PacketHost:
  2639  		return linux.PACKET_HOST
  2640  	case tcpip.PacketOtherHost:
  2641  		return linux.PACKET_OTHERHOST
  2642  	case tcpip.PacketOutgoing:
  2643  		return linux.PACKET_OUTGOING
  2644  	case tcpip.PacketBroadcast:
  2645  		return linux.PACKET_BROADCAST
  2646  	case tcpip.PacketMulticast:
  2647  		return linux.PACKET_MULTICAST
  2648  	default:
  2649  		panic(fmt.Sprintf("unknown packet type: %d", pktType))
  2650  	}
  2651  }
  2652  
  2653  // nonBlockingRead issues a non-blocking read.
  2654  //
  2655  // TODO(b/78348848): Support timestamps for stream sockets.
  2656  func (s *sock) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
  2657  	isPacket := s.isPacketBased()
  2658  
  2659  	readOptions := tcpip.ReadOptions{
  2660  		Peek:               peek,
  2661  		NeedRemoteAddr:     senderRequested,
  2662  		NeedLinkPacketInfo: isPacket,
  2663  	}
  2664  
  2665  	// TCP sockets discard the data if MSG_TRUNC is set.
  2666  	//
  2667  	// This behavior is documented in man 7 tcp:
  2668  	// Since version 2.4, Linux supports the use of MSG_TRUNC in the flags
  2669  	// argument of recv(2) (and recvmsg(2)). This flag causes the received
  2670  	// bytes of data to be discarded, rather than passed back in a
  2671  	// caller-supplied  buffer.
  2672  	var w io.Writer
  2673  	if !isPacket && trunc {
  2674  		w = &tcpip.LimitedWriter{
  2675  			W: ioutil.Discard,
  2676  			N: dst.NumBytes(),
  2677  		}
  2678  	} else {
  2679  		w = dst.Writer(ctx)
  2680  	}
  2681  
  2682  	s.readMu.Lock()
  2683  	defer s.readMu.Unlock()
  2684  
  2685  	res, err := s.Endpoint.Read(w, readOptions)
  2686  	if _, ok := err.(*tcpip.ErrBadBuffer); ok && dst.NumBytes() == 0 {
  2687  		err = nil
  2688  	}
  2689  	if err != nil {
  2690  		return 0, 0, nil, 0, socket.ControlMessages{}, syserr.TranslateNetstackError(err)
  2691  	}
  2692  	// Set the control message, even if 0 bytes were read.
  2693  	s.updateTimestamp(res.ControlMessages)
  2694  
  2695  	if isPacket {
  2696  		var addr linux.SockAddr
  2697  		var addrLen uint32
  2698  		if senderRequested {
  2699  			addr, addrLen = socket.ConvertAddress(s.family, res.RemoteAddr)
  2700  			switch v := addr.(type) {
  2701  			case *linux.SockAddrLink:
  2702  				v.Protocol = socket.Htons(uint16(res.LinkPacketInfo.Protocol))
  2703  				v.PacketType = toLinuxPacketType(res.LinkPacketInfo.PktType)
  2704  			}
  2705  		}
  2706  
  2707  		msgLen := res.Count
  2708  		if trunc {
  2709  			msgLen = res.Total
  2710  		}
  2711  
  2712  		var flags int
  2713  		if res.Total > res.Count {
  2714  			flags |= linux.MSG_TRUNC
  2715  		}
  2716  
  2717  		return msgLen, flags, addr, addrLen, s.netstackToLinuxControlMessages(res.ControlMessages), nil
  2718  	}
  2719  
  2720  	if peek {
  2721  		// MSG_TRUNC with MSG_PEEK on a TCP socket returns the
  2722  		// amount that could be read, and does not write to buffer.
  2723  		if trunc {
  2724  			// TCP endpoint does not return the total bytes in buffer as numTotal.
  2725  			// We need to query it from socket option.
  2726  			rql, err := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption)
  2727  			if err != nil {
  2728  				return 0, 0, nil, 0, socket.ControlMessages{}, syserr.TranslateNetstackError(err)
  2729  			}
  2730  			msgLen := int(dst.NumBytes())
  2731  			if msgLen > rql {
  2732  				msgLen = rql
  2733  			}
  2734  			return msgLen, 0, nil, 0, socket.ControlMessages{}, nil
  2735  		}
  2736  	} else if n := res.Count; n != 0 {
  2737  		s.Endpoint.ModerateRecvBuf(n)
  2738  	}
  2739  
  2740  	cmsg := s.netstackToLinuxControlMessages(res.ControlMessages)
  2741  	s.fillCmsgInq(&cmsg)
  2742  	return res.Count, 0, nil, 0, cmsg, syserr.TranslateNetstackError(err)
  2743  }
  2744  
  2745  func (s *sock) netstackToLinuxControlMessages(cm tcpip.ReceivableControlMessages) socket.ControlMessages {
  2746  	readCM := socket.NewIPControlMessages(s.family, cm)
  2747  	return socket.ControlMessages{
  2748  		IP: socket.IPControlMessages{
  2749  			HasTimestamp:       readCM.HasTimestamp && s.sockOptTimestamp,
  2750  			Timestamp:          readCM.Timestamp,
  2751  			HasInq:             readCM.HasInq,
  2752  			Inq:                readCM.Inq,
  2753  			HasTOS:             readCM.HasTOS,
  2754  			TOS:                readCM.TOS,
  2755  			HasTClass:          readCM.HasTClass,
  2756  			TClass:             readCM.TClass,
  2757  			HasTTL:             readCM.HasTTL,
  2758  			TTL:                readCM.TTL,
  2759  			HasHopLimit:        readCM.HasHopLimit,
  2760  			HopLimit:           readCM.HopLimit,
  2761  			HasIPPacketInfo:    readCM.HasIPPacketInfo,
  2762  			PacketInfo:         readCM.PacketInfo,
  2763  			HasIPv6PacketInfo:  readCM.HasIPv6PacketInfo,
  2764  			IPv6PacketInfo:     readCM.IPv6PacketInfo,
  2765  			OriginalDstAddress: readCM.OriginalDstAddress,
  2766  			SockErr:            readCM.SockErr,
  2767  		},
  2768  	}
  2769  }
  2770  
  2771  func (s *sock) linuxToNetstackControlMessages(cm socket.ControlMessages) tcpip.SendableControlMessages {
  2772  	return tcpip.SendableControlMessages{
  2773  		HasTTL:      cm.IP.HasTTL,
  2774  		TTL:         uint8(cm.IP.TTL),
  2775  		HasHopLimit: cm.IP.HasHopLimit,
  2776  		HopLimit:    uint8(cm.IP.HopLimit),
  2777  	}
  2778  }
  2779  
  2780  // updateTimestamp sets the timestamp for SIOCGSTAMP. It should be called after
  2781  // successfully writing packet data out to userspace.
  2782  //
  2783  // Precondition: s.readMu must be locked.
  2784  func (s *sock) updateTimestamp(cm tcpip.ReceivableControlMessages) {
  2785  	// Save the SIOCGSTAMP timestamp only if SO_TIMESTAMP is disabled.
  2786  	if !s.sockOptTimestamp {
  2787  		s.timestampValid = true
  2788  		s.timestamp = cm.Timestamp
  2789  	}
  2790  }
  2791  
  2792  // dequeueErr is analogous to net/core/skbuff.c:sock_dequeue_err_skb().
  2793  func (s *sock) dequeueErr() *tcpip.SockError {
  2794  	so := s.Endpoint.SocketOptions()
  2795  	err := so.DequeueErr()
  2796  	if err == nil {
  2797  		return nil
  2798  	}
  2799  
  2800  	// Update socket error to reflect ICMP errors in queue.
  2801  	if nextErr := so.PeekErr(); nextErr != nil && nextErr.Cause.Origin().IsICMPErr() {
  2802  		so.SetLastError(nextErr.Err)
  2803  	} else if err.Cause.Origin().IsICMPErr() {
  2804  		so.SetLastError(nil)
  2805  	}
  2806  	return err
  2807  }
  2808  
  2809  // addrFamilyFromNetProto returns the address family identifier for the given
  2810  // network protocol.
  2811  func addrFamilyFromNetProto(net tcpip.NetworkProtocolNumber) int {
  2812  	switch net {
  2813  	case header.IPv4ProtocolNumber:
  2814  		return linux.AF_INET
  2815  	case header.IPv6ProtocolNumber:
  2816  		return linux.AF_INET6
  2817  	default:
  2818  		panic(fmt.Sprintf("invalid net proto for addr family inference: %d", net))
  2819  	}
  2820  }
  2821  
  2822  // recvErr handles MSG_ERRQUEUE for recvmsg(2).
  2823  // This is analogous to net/ipv4/ip_sockglue.c:ip_recv_error().
  2824  func (s *sock) recvErr(t *kernel.Task, dst usermem.IOSequence) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
  2825  	sockErr := s.dequeueErr()
  2826  	if sockErr == nil {
  2827  		return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain
  2828  	}
  2829  	if sockErr.Payload != nil {
  2830  		defer sockErr.Payload.Release()
  2831  	}
  2832  
  2833  	// The payload of the original packet that caused the error is passed as
  2834  	// normal data via msg_iovec.  -- recvmsg(2)
  2835  	msgFlags := linux.MSG_ERRQUEUE
  2836  	if int(dst.NumBytes()) < sockErr.Payload.Size() {
  2837  		msgFlags |= linux.MSG_TRUNC
  2838  	}
  2839  	n, err := dst.CopyOut(t, sockErr.Payload.AsSlice())
  2840  
  2841  	// The original destination address of the datagram that caused the error is
  2842  	// supplied via msg_name.  -- recvmsg(2)
  2843  	dstAddr, dstAddrLen := socket.ConvertAddress(addrFamilyFromNetProto(sockErr.NetProto), sockErr.Dst)
  2844  	cmgs := socket.ControlMessages{IP: socket.NewIPControlMessages(s.family, tcpip.ReceivableControlMessages{SockErr: sockErr})}
  2845  	return n, msgFlags, dstAddr, dstAddrLen, cmgs, syserr.FromError(err)
  2846  }
  2847  
  2848  // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by
  2849  // tcpip.Endpoint.
  2850  func (s *sock) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, _ uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) {
  2851  	if flags&linux.MSG_ERRQUEUE != 0 {
  2852  		return s.recvErr(t, dst)
  2853  	}
  2854  
  2855  	trunc := flags&linux.MSG_TRUNC != 0
  2856  	peek := flags&linux.MSG_PEEK != 0
  2857  	dontWait := flags&linux.MSG_DONTWAIT != 0
  2858  	waitAll := flags&linux.MSG_WAITALL != 0
  2859  	if senderRequested && !s.isPacketBased() {
  2860  		// Stream sockets ignore the sender address.
  2861  		senderRequested = false
  2862  	}
  2863  	n, msgFlags, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested)
  2864  
  2865  	if s.isPacketBased() && err == syserr.ErrClosedForReceive && flags&linux.MSG_DONTWAIT != 0 {
  2866  		// In this situation we should return EAGAIN.
  2867  		return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain
  2868  	}
  2869  
  2870  	if err != nil && (err != syserr.ErrWouldBlock || dontWait) {
  2871  		// Read failed and we should not retry.
  2872  		return 0, 0, nil, 0, socket.ControlMessages{}, err
  2873  	}
  2874  
  2875  	if err == nil && (dontWait || !waitAll || s.isPacketBased() || int64(n) >= dst.NumBytes()) {
  2876  		// We got all the data we need.
  2877  		return
  2878  	}
  2879  
  2880  	// Don't overwrite any data we received.
  2881  	dst = dst.DropFirst(n)
  2882  
  2883  	// We'll have to block. Register for notifications and keep trying to
  2884  	// send all the data.
  2885  	e, ch := waiter.NewChannelEntry(waiter.ReadableEvents)
  2886  	s.EventRegister(&e)
  2887  	defer s.EventUnregister(&e)
  2888  
  2889  	for {
  2890  		var rn int
  2891  		rn, msgFlags, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested)
  2892  		n += rn
  2893  		if err != nil && err != syserr.ErrWouldBlock {
  2894  			// Always stop on errors other than would block as we generally
  2895  			// won't be able to get any more data. Eat the error if we got
  2896  			// any data.
  2897  			if n > 0 {
  2898  				err = nil
  2899  			}
  2900  			return
  2901  		}
  2902  		if err == nil && (s.isPacketBased() || !waitAll || int64(rn) >= dst.NumBytes()) {
  2903  			// We got all the data we need.
  2904  			return
  2905  		}
  2906  		dst = dst.DropFirst(rn)
  2907  
  2908  		if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
  2909  			if n > 0 {
  2910  				return n, msgFlags, senderAddr, senderAddrLen, controlMessages, nil
  2911  			}
  2912  			if linuxerr.Equals(linuxerr.ETIMEDOUT, err) {
  2913  				return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain
  2914  			}
  2915  			return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err)
  2916  		}
  2917  	}
  2918  }
  2919  
  2920  // SendMsg implements the linux syscall sendmsg(2) for sockets backed by
  2921  // tcpip.Endpoint.
  2922  func (s *sock) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) {
  2923  	// Reject Unix control messages.
  2924  	if !controlMessages.Unix.Empty() {
  2925  		return 0, syserr.ErrInvalidArgument
  2926  	}
  2927  
  2928  	var addr *tcpip.FullAddress
  2929  	if len(to) > 0 {
  2930  		addrBuf, family, err := socket.AddressAndFamily(to)
  2931  		if err != nil {
  2932  			return 0, err
  2933  		}
  2934  		if !s.checkFamily(family, false /* exact */) {
  2935  			return 0, syserr.ErrInvalidArgument
  2936  		}
  2937  		addrBuf = s.mapFamily(addrBuf, family)
  2938  
  2939  		addr = &addrBuf
  2940  	}
  2941  
  2942  	opts := tcpip.WriteOptions{
  2943  		To:              addr,
  2944  		More:            flags&linux.MSG_MORE != 0,
  2945  		EndOfRecord:     flags&linux.MSG_EOR != 0,
  2946  		ControlMessages: s.linuxToNetstackControlMessages(controlMessages),
  2947  	}
  2948  
  2949  	r := src.Reader(t)
  2950  	var (
  2951  		total int64
  2952  		entry waiter.Entry
  2953  		ch    <-chan struct{}
  2954  	)
  2955  	for {
  2956  		n, err := s.Endpoint.Write(r, opts)
  2957  		total += n
  2958  		if flags&linux.MSG_DONTWAIT != 0 {
  2959  			return int(total), syserr.TranslateNetstackError(err)
  2960  		}
  2961  		block := true
  2962  		switch err.(type) {
  2963  		case nil:
  2964  			block = total != src.NumBytes()
  2965  		case *tcpip.ErrWouldBlock:
  2966  		default:
  2967  			block = false
  2968  		}
  2969  		if block {
  2970  			if ch == nil {
  2971  				// We'll have to block. Register for notification and keep trying to
  2972  				// send all the data.
  2973  				entry, ch = waiter.NewChannelEntry(waiter.WritableEvents)
  2974  				s.EventRegister(&entry)
  2975  				defer s.EventUnregister(&entry)
  2976  			} else {
  2977  				// Don't wait immediately after registration in case more data
  2978  				// became available between when we last checked and when we setup
  2979  				// the notification.
  2980  				if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
  2981  					if linuxerr.Equals(linuxerr.ETIMEDOUT, err) {
  2982  						return int(total), syserr.ErrTryAgain
  2983  					}
  2984  					// handleIOError will consume errors from t.Block if needed.
  2985  					return int(total), syserr.FromError(err)
  2986  				}
  2987  			}
  2988  			continue
  2989  		}
  2990  		return int(total), syserr.TranslateNetstackError(err)
  2991  	}
  2992  }
  2993  
  2994  // Ioctl implements vfs.FileDescriptionImpl.
  2995  func (s *sock) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) {
  2996  	t := kernel.TaskFromContext(ctx)
  2997  	if t == nil {
  2998  		panic("ioctl(2) may only be called from a task goroutine")
  2999  	}
  3000  
  3001  	// SIOCGSTAMP is implemented by netstack rather than all commonEndpoint
  3002  	// sockets.
  3003  	// TODO(b/78348848): Add a commonEndpoint method to support SIOCGSTAMP.
  3004  	switch args[1].Int() {
  3005  	case linux.SIOCGSTAMP:
  3006  		s.readMu.Lock()
  3007  		defer s.readMu.Unlock()
  3008  		if !s.timestampValid {
  3009  			return 0, linuxerr.ENOENT
  3010  		}
  3011  
  3012  		tv := linux.NsecToTimeval(s.timestamp.UnixNano())
  3013  		_, err := tv.CopyOut(t, args[2].Pointer())
  3014  		return 0, err
  3015  
  3016  	case linux.TIOCINQ:
  3017  		v, terr := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption)
  3018  		if terr != nil {
  3019  			return 0, syserr.TranslateNetstackError(terr).ToError()
  3020  		}
  3021  
  3022  		if v > math.MaxInt32 {
  3023  			v = math.MaxInt32
  3024  		}
  3025  
  3026  		// Copy result to userspace.
  3027  		vP := primitive.Int32(v)
  3028  		_, err := vP.CopyOut(t, args[2].Pointer())
  3029  		return 0, err
  3030  	}
  3031  
  3032  	return Ioctl(ctx, s.Endpoint, uio, sysno, args)
  3033  }
  3034  
  3035  // Ioctl performs a socket ioctl.
  3036  func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) {
  3037  	t := kernel.TaskFromContext(ctx)
  3038  	if t == nil {
  3039  		panic("ioctl(2) may only be called from a task goroutine")
  3040  	}
  3041  
  3042  	switch arg := int(args[1].Int()); arg {
  3043  	case linux.SIOCGIFFLAGS,
  3044  		linux.SIOCGIFADDR,
  3045  		linux.SIOCGIFBRDADDR,
  3046  		linux.SIOCGIFDSTADDR,
  3047  		linux.SIOCGIFHWADDR,
  3048  		linux.SIOCGIFINDEX,
  3049  		linux.SIOCGIFMAP,
  3050  		linux.SIOCGIFMETRIC,
  3051  		linux.SIOCGIFMTU,
  3052  		linux.SIOCGIFNAME,
  3053  		linux.SIOCGIFNETMASK,
  3054  		linux.SIOCGIFTXQLEN,
  3055  		linux.SIOCETHTOOL:
  3056  
  3057  		var ifr linux.IFReq
  3058  		if _, err := ifr.CopyIn(t, args[2].Pointer()); err != nil {
  3059  			return 0, err
  3060  		}
  3061  		if err := interfaceIoctl(ctx, io, arg, &ifr); err != nil {
  3062  			return 0, err.ToError()
  3063  		}
  3064  		_, err := ifr.CopyOut(t, args[2].Pointer())
  3065  		return 0, err
  3066  
  3067  	case linux.SIOCGIFCONF:
  3068  		// Return a list of interface addresses or the buffer size
  3069  		// necessary to hold the list.
  3070  		var ifc linux.IFConf
  3071  		if _, err := ifc.CopyIn(t, args[2].Pointer()); err != nil {
  3072  			return 0, err
  3073  		}
  3074  
  3075  		if err := ifconfIoctl(ctx, t, io, &ifc); err != nil {
  3076  			return 0, err
  3077  		}
  3078  
  3079  		_, err := ifc.CopyOut(t, args[2].Pointer())
  3080  		return 0, err
  3081  
  3082  	case linux.TIOCINQ:
  3083  		v, terr := ep.GetSockOptInt(tcpip.ReceiveQueueSizeOption)
  3084  		if terr != nil {
  3085  			return 0, syserr.TranslateNetstackError(terr).ToError()
  3086  		}
  3087  
  3088  		if v > math.MaxInt32 {
  3089  			v = math.MaxInt32
  3090  		}
  3091  		// Copy result to userspace.
  3092  		vP := primitive.Int32(v)
  3093  		_, err := vP.CopyOut(t, args[2].Pointer())
  3094  		return 0, err
  3095  
  3096  	case linux.TIOCOUTQ:
  3097  		v, terr := ep.GetSockOptInt(tcpip.SendQueueSizeOption)
  3098  		if terr != nil {
  3099  			return 0, syserr.TranslateNetstackError(terr).ToError()
  3100  		}
  3101  
  3102  		if v > math.MaxInt32 {
  3103  			v = math.MaxInt32
  3104  		}
  3105  
  3106  		// Copy result to userspace.
  3107  		vP := primitive.Int32(v)
  3108  		_, err := vP.CopyOut(t, args[2].Pointer())
  3109  		return 0, err
  3110  
  3111  	case linux.SIOCGIFMEM, linux.SIOCGIFPFLAGS, linux.SIOCGMIIPHY, linux.SIOCGMIIREG:
  3112  		// Not supported.
  3113  	}
  3114  
  3115  	return 0, linuxerr.ENOTTY
  3116  }
  3117  
  3118  // interfaceIoctl implements interface requests.
  3119  func interfaceIoctl(ctx context.Context, _ usermem.IO, arg int, ifr *linux.IFReq) *syserr.Error {
  3120  	var (
  3121  		iface inet.Interface
  3122  		index int32
  3123  		found bool
  3124  	)
  3125  
  3126  	// Find the relevant device.
  3127  	stk := inet.StackFromContext(ctx)
  3128  	if stk == nil {
  3129  		return syserr.ErrNoDevice
  3130  	}
  3131  
  3132  	// SIOCGIFNAME uses ifr.ifr_ifindex rather than ifr.ifr_name to
  3133  	// identify a device.
  3134  	if arg == linux.SIOCGIFNAME {
  3135  		// Gets the name of the interface given the interface index
  3136  		// stored in ifr_ifindex.
  3137  		index = int32(hostarch.ByteOrder.Uint32(ifr.Data[:4]))
  3138  		if iface, ok := stk.Interfaces()[index]; ok {
  3139  			ifr.SetName(iface.Name)
  3140  			return nil
  3141  		}
  3142  		return syserr.ErrNoDevice
  3143  	}
  3144  
  3145  	// Find the relevant device.
  3146  	for index, iface = range stk.Interfaces() {
  3147  		if iface.Name == ifr.Name() {
  3148  			found = true
  3149  			break
  3150  		}
  3151  	}
  3152  	if !found {
  3153  		return syserr.ErrNoDevice
  3154  	}
  3155  
  3156  	switch arg {
  3157  	case linux.SIOCGIFINDEX:
  3158  		// Copy out the index to the data.
  3159  		hostarch.ByteOrder.PutUint32(ifr.Data[:], uint32(index))
  3160  
  3161  	case linux.SIOCGIFHWADDR:
  3162  		// Copy the hardware address out.
  3163  		//
  3164  		// Refer: https://linux.die.net/man/7/netdevice
  3165  		// SIOCGIFHWADDR, SIOCSIFHWADDR
  3166  		//
  3167  		// Get or set the hardware address of a device using
  3168  		// ifr_hwaddr. The hardware address is specified in a struct
  3169  		// sockaddr. sa_family contains the ARPHRD_* device type,
  3170  		// sa_data the L2 hardware address starting from byte 0. Setting
  3171  		// the hardware address is a privileged operation.
  3172  		hostarch.ByteOrder.PutUint16(ifr.Data[:], iface.DeviceType)
  3173  		n := copy(ifr.Data[2:], iface.Addr)
  3174  		for i := 2 + n; i < len(ifr.Data); i++ {
  3175  			ifr.Data[i] = 0 // Clear padding.
  3176  		}
  3177  
  3178  	case linux.SIOCGIFFLAGS:
  3179  		f, err := interfaceStatusFlags(stk, iface.Name)
  3180  		if err != nil {
  3181  			return err
  3182  		}
  3183  		// Drop the flags that don't fit in the size that we need to return. This
  3184  		// matches Linux behavior.
  3185  		hostarch.ByteOrder.PutUint16(ifr.Data[:2], uint16(f))
  3186  
  3187  	case linux.SIOCGIFADDR:
  3188  		// Copy the IPv4 address out.
  3189  		for _, addr := range stk.InterfaceAddrs()[index] {
  3190  			// This ioctl is only compatible with AF_INET addresses.
  3191  			if addr.Family != linux.AF_INET {
  3192  				continue
  3193  			}
  3194  			copy(ifr.Data[4:8], addr.Addr)
  3195  			break
  3196  		}
  3197  
  3198  	case linux.SIOCGIFMETRIC:
  3199  		// Gets the metric of the device. As per netdevice(7), this
  3200  		// always just sets ifr_metric to 0.
  3201  		hostarch.ByteOrder.PutUint32(ifr.Data[:4], 0)
  3202  
  3203  	case linux.SIOCGIFMTU:
  3204  		// Gets the MTU of the device.
  3205  		hostarch.ByteOrder.PutUint32(ifr.Data[:4], iface.MTU)
  3206  
  3207  	case linux.SIOCGIFMAP:
  3208  		// Gets the hardware parameters of the device.
  3209  		// TODO(gvisor.dev/issue/505): Implement.
  3210  
  3211  	case linux.SIOCGIFTXQLEN:
  3212  		// Gets the transmit queue length of the device.
  3213  		// TODO(gvisor.dev/issue/505): Implement.
  3214  
  3215  	case linux.SIOCGIFDSTADDR:
  3216  		// Gets the destination address of a point-to-point device.
  3217  		// TODO(gvisor.dev/issue/505): Implement.
  3218  
  3219  	case linux.SIOCGIFBRDADDR:
  3220  		// Gets the broadcast address of a device.
  3221  		// TODO(gvisor.dev/issue/505): Implement.
  3222  
  3223  	case linux.SIOCGIFNETMASK:
  3224  		// Gets the network mask of a device.
  3225  		for _, addr := range stk.InterfaceAddrs()[index] {
  3226  			// This ioctl is only compatible with AF_INET addresses.
  3227  			if addr.Family != linux.AF_INET {
  3228  				continue
  3229  			}
  3230  			// Populate ifr.ifr_netmask (type sockaddr).
  3231  			hostarch.ByteOrder.PutUint16(ifr.Data[0:], uint16(linux.AF_INET))
  3232  			hostarch.ByteOrder.PutUint16(ifr.Data[2:], 0)
  3233  			var mask uint32 = 0xffffffff << (32 - addr.PrefixLen)
  3234  			// Netmask is expected to be returned as a big endian
  3235  			// value.
  3236  			binary.BigEndian.PutUint32(ifr.Data[4:8], mask)
  3237  			break
  3238  		}
  3239  
  3240  	case linux.SIOCETHTOOL:
  3241  		// Stubbed out for now, Ideally we should implement the required
  3242  		// sub-commands for ETHTOOL
  3243  		//
  3244  		// See:
  3245  		// https://github.com/torvalds/linux/blob/aa0c9086b40c17a7ad94425b3b70dd1fdd7497bf/net/core/dev_ioctl.c
  3246  		return syserr.ErrEndpointOperation
  3247  
  3248  	default:
  3249  		// Not a valid call.
  3250  		return syserr.ErrInvalidArgument
  3251  	}
  3252  
  3253  	return nil
  3254  }
  3255  
  3256  // ifconfIoctl populates a struct ifconf for the SIOCGIFCONF ioctl.
  3257  func ifconfIoctl(ctx context.Context, t *kernel.Task, _ usermem.IO, ifc *linux.IFConf) error {
  3258  	// If Ptr is NULL, return the necessary buffer size via Len.
  3259  	// Otherwise, write up to Len bytes starting at Ptr containing ifreq
  3260  	// structs.
  3261  	stk := inet.StackFromContext(ctx)
  3262  	if stk == nil {
  3263  		return syserr.ErrNoDevice.ToError()
  3264  	}
  3265  
  3266  	if ifc.Ptr == 0 {
  3267  		ifc.Len = int32(len(stk.Interfaces())) * int32(linux.SizeOfIFReq)
  3268  		return nil
  3269  	}
  3270  
  3271  	max := ifc.Len
  3272  	ifc.Len = 0
  3273  	for key, ifaceAddrs := range stk.InterfaceAddrs() {
  3274  		iface := stk.Interfaces()[key]
  3275  		for _, ifaceAddr := range ifaceAddrs {
  3276  			// Don't write past the end of the buffer.
  3277  			if ifc.Len+int32(linux.SizeOfIFReq) > max {
  3278  				break
  3279  			}
  3280  			if ifaceAddr.Family != linux.AF_INET {
  3281  				continue
  3282  			}
  3283  
  3284  			// Populate ifr.ifr_addr.
  3285  			ifr := linux.IFReq{}
  3286  			ifr.SetName(iface.Name)
  3287  			hostarch.ByteOrder.PutUint16(ifr.Data[0:2], uint16(ifaceAddr.Family))
  3288  			hostarch.ByteOrder.PutUint16(ifr.Data[2:4], 0)
  3289  			copy(ifr.Data[4:8], ifaceAddr.Addr[:4])
  3290  
  3291  			// Copy the ifr to userspace.
  3292  			dst := uintptr(ifc.Ptr) + uintptr(ifc.Len)
  3293  			ifc.Len += int32(linux.SizeOfIFReq)
  3294  			if _, err := ifr.CopyOut(t, hostarch.Addr(dst)); err != nil {
  3295  				return err
  3296  			}
  3297  		}
  3298  	}
  3299  	return nil
  3300  }
  3301  
  3302  // interfaceStatusFlags returns status flags for an interface in the stack.
  3303  // Flag values and meanings are described in greater detail in netdevice(7) in
  3304  // the SIOCGIFFLAGS section.
  3305  func interfaceStatusFlags(stack inet.Stack, name string) (uint32, *syserr.Error) {
  3306  	// We should only ever be passed a netstack.Stack.
  3307  	epstack, ok := stack.(*Stack)
  3308  	if !ok {
  3309  		return 0, errStackType
  3310  	}
  3311  
  3312  	// Find the NIC corresponding to this interface.
  3313  	for _, info := range epstack.Stack.NICInfo() {
  3314  		if info.Name == name {
  3315  			return nicStateFlagsToLinux(info.Flags), nil
  3316  		}
  3317  	}
  3318  	return 0, syserr.ErrNoDevice
  3319  }
  3320  
  3321  func nicStateFlagsToLinux(f stack.NICStateFlags) uint32 {
  3322  	var rv uint32
  3323  	if f.Up {
  3324  		rv |= linux.IFF_UP | linux.IFF_LOWER_UP
  3325  	}
  3326  	if f.Running {
  3327  		rv |= linux.IFF_RUNNING
  3328  	}
  3329  	if f.Promiscuous {
  3330  		rv |= linux.IFF_PROMISC
  3331  	}
  3332  	if f.Loopback {
  3333  		rv |= linux.IFF_LOOPBACK
  3334  	}
  3335  	return rv
  3336  }
  3337  
  3338  // State implements socket.Socket.State. State translates the internal state
  3339  // returned by netstack to values defined by Linux.
  3340  func (s *sock) State() uint32 {
  3341  	if s.family != linux.AF_INET && s.family != linux.AF_INET6 {
  3342  		// States not implemented for this socket's family.
  3343  		return 0
  3344  	}
  3345  
  3346  	switch {
  3347  	case socket.IsTCP(s):
  3348  		// TCP socket.
  3349  		switch tcp.EndpointState(s.Endpoint.State()) {
  3350  		case tcp.StateEstablished:
  3351  			return linux.TCP_ESTABLISHED
  3352  		case tcp.StateSynSent:
  3353  			return linux.TCP_SYN_SENT
  3354  		case tcp.StateSynRecv:
  3355  			return linux.TCP_SYN_RECV
  3356  		case tcp.StateFinWait1:
  3357  			return linux.TCP_FIN_WAIT1
  3358  		case tcp.StateFinWait2:
  3359  			return linux.TCP_FIN_WAIT2
  3360  		case tcp.StateTimeWait:
  3361  			return linux.TCP_TIME_WAIT
  3362  		case tcp.StateClose, tcp.StateInitial, tcp.StateBound, tcp.StateConnecting, tcp.StateError:
  3363  			return linux.TCP_CLOSE
  3364  		case tcp.StateCloseWait:
  3365  			return linux.TCP_CLOSE_WAIT
  3366  		case tcp.StateLastAck:
  3367  			return linux.TCP_LAST_ACK
  3368  		case tcp.StateListen:
  3369  			return linux.TCP_LISTEN
  3370  		case tcp.StateClosing:
  3371  			return linux.TCP_CLOSING
  3372  		default:
  3373  			// Internal or unknown state.
  3374  			return 0
  3375  		}
  3376  	case socket.IsUDP(s):
  3377  		// UDP socket.
  3378  		switch transport.DatagramEndpointState(s.Endpoint.State()) {
  3379  		case transport.DatagramEndpointStateInitial, transport.DatagramEndpointStateBound, transport.DatagramEndpointStateClosed:
  3380  			return linux.TCP_CLOSE
  3381  		case transport.DatagramEndpointStateConnected:
  3382  			return linux.TCP_ESTABLISHED
  3383  		default:
  3384  			return 0
  3385  		}
  3386  	case socket.IsICMP(s):
  3387  		// TODO(b/112063468): Export states for ICMP sockets.
  3388  	case socket.IsRaw(s):
  3389  		// TODO(b/112063468): Export states for raw sockets.
  3390  	default:
  3391  		// Unknown transport protocol, how did we make this socket?
  3392  		log.Warningf("Unknown transport protocol for an existing socket: family=%v, type=%v, protocol=%v, internal type %v", s.family, s.skType, s.protocol, reflect.TypeOf(s.Endpoint).Elem())
  3393  		return 0
  3394  	}
  3395  
  3396  	return 0
  3397  }
  3398  
  3399  // Type implements socket.Socket.Type.
  3400  func (s *sock) Type() (family int, skType linux.SockType, protocol int) {
  3401  	return s.family, s.skType, s.protocol
  3402  }
  3403  
  3404  // EventRegister implements waiter.Waitable.
  3405  func (s *sock) EventRegister(e *waiter.Entry) error {
  3406  	s.Queue.EventRegister(e)
  3407  	return nil
  3408  }
  3409  
  3410  // EventUnregister implements waiter.Waitable.EventUnregister.
  3411  func (s *sock) EventUnregister(e *waiter.Entry) {
  3412  	s.Queue.EventUnregister(e)
  3413  }