github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/socket/netstack/netstack.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package netstack provides an implementation of the socket.Socket interface
    16  // that is backed by a tcpip.Endpoint.
    17  //
    18  // It does not depend on any particular endpoint implementation, and thus can
    19  // be used to expose certain endpoints to the sentry while leaving others out,
    20  // for example, TCP endpoints and Unix-domain endpoints.
    21  //
    22  // Lock ordering: netstack => mm: ioSequenceReadWriter copies user memory inside
    23  // tcpip.Endpoint.Write(). Netstack is allowed to (and does) hold locks during
    24  // this operation.
    25  package netstack
    26  
    27  import (
    28  	"bytes"
    29  	"encoding/binary"
    30  	"fmt"
    31  	"io"
    32  	"io/ioutil"
    33  	"math"
    34  	"reflect"
    35  	"time"
    36  
    37  	"golang.org/x/sys/unix"
    38  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
    39  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux/errno"
    40  	"github.com/nicocha30/gvisor-ligolo/pkg/context"
    41  	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
    42  	"github.com/nicocha30/gvisor-ligolo/pkg/hostarch"
    43  	"github.com/nicocha30/gvisor-ligolo/pkg/log"
    44  	"github.com/nicocha30/gvisor-ligolo/pkg/marshal"
    45  	"github.com/nicocha30/gvisor-ligolo/pkg/marshal/primitive"
    46  	"github.com/nicocha30/gvisor-ligolo/pkg/metric"
    47  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/arch"
    48  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/sockfs"
    49  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/inet"
    50  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel"
    51  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth"
    52  	ktime "github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/time"
    53  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/socket"
    54  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/socket/netfilter"
    55  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs"
    56  	"github.com/nicocha30/gvisor-ligolo/pkg/sync"
    57  	"github.com/nicocha30/gvisor-ligolo/pkg/syserr"
    58  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip"
    59  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/header"
    60  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/stack"
    61  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/transport"
    62  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip/transport/tcp"
    63  	"github.com/nicocha30/gvisor-ligolo/pkg/usermem"
    64  	"github.com/nicocha30/gvisor-ligolo/pkg/waiter"
    65  )
    66  
// bitsPerUint32 is the width of a uint32 in bits.
const bitsPerUint32 = 32
    68  
    69  // statCounterValue returns a function usable as callback function when defining a gVisor Sentry
    70  // metric that contains the value counted by the StatCounter.
    71  // This avoids a dependency loop in the tcpip package.
    72  func statCounterValue(cm *tcpip.StatCounter) func(...*metric.FieldValue) uint64 {
    73  	return func(...*metric.FieldValue) uint64 {
    74  		return cm.Value()
    75  	}
    76  }
    77  
    78  func mustCreateMetric(name, description string) *tcpip.StatCounter {
    79  	var cm tcpip.StatCounter
    80  	metric.MustRegisterCustomUint64Metric(name, true /* cumulative */, false /* sync */, description, statCounterValue(&cm))
    81  	return &cm
    82  }
    83  
    84  func mustCreateGauge(name, description string) *tcpip.StatCounter {
    85  	var cm tcpip.StatCounter
    86  	metric.MustRegisterCustomUint64Metric(name, false /* cumulative */, false /* sync */, description, statCounterValue(&cm))
    87  	return &cm
    88  }
    89  
    90  // Metrics contains metrics exported by netstack.
    91  var Metrics = tcpip.Stats{
    92  	DroppedPackets: mustCreateMetric("/netstack/dropped_packets", "Number of packets dropped at the transport layer."),
    93  	NICs: tcpip.NICStats{
    94  		MalformedL4RcvdPackets: mustCreateMetric("/netstack/nic/malformed_l4_received_packets", "Number of packets received that failed L4 header parsing."),
    95  		Tx: tcpip.NICPacketStats{
    96  			Packets: mustCreateMetric("/netstack/nic/tx/packets", "Number of packets transmitted."),
    97  			Bytes:   mustCreateMetric("/netstack/nic/tx/bytes", "Number of bytes transmitted."),
    98  		},
    99  		TxPacketsDroppedNoBufferSpace: mustCreateMetric("/netstack/nic/tx_packets_dropped_no_buffer_space", "Number of TX packets dropped as a result of no buffer space errors."),
   100  		Rx: tcpip.NICPacketStats{
   101  			Packets: mustCreateMetric("/netstack/nic/rx/packets", "Number of packets received."),
   102  			Bytes:   mustCreateMetric("/netstack/nic/rx/bytes", "Number of bytes received."),
   103  		},
   104  		DisabledRx: tcpip.NICPacketStats{
   105  			Packets: mustCreateMetric("/netstack/nic/disabled_rx/packets", "Number of packets received on disabled NICs."),
   106  			Bytes:   mustCreateMetric("/netstack/nic/disabled_rx/bytes", "Number of bytes received on disabled NICs."),
   107  		},
   108  		Neighbor: tcpip.NICNeighborStats{
   109  			UnreachableEntryLookups:                    mustCreateMetric("/netstack/nic/neighbor/unreachable_entry_loopups", "Number of lookups performed on a neighbor entry in Unreachable state."),
   110  			DroppedConfirmationForNoninitiatedNeighbor: mustCreateMetric("/netstack/nic/neighbor/dropped_confirmation_for_noninitiated_neighbor", "Number of advertisements received that don't match an entry in the neighbor cache."),
   111  			DroppedInvalidLinkAddressConfirmations:     mustCreateMetric("/netstack/nic/neighbor/dropped_invalid_link_address_confirmations", "Number of advertisements dropped because they have empty source link-layer addresses"),
   112  		},
   113  	},
   114  	ICMP: tcpip.ICMPStats{
   115  		V4: tcpip.ICMPv4Stats{
   116  			PacketsSent: tcpip.ICMPv4SentPacketStats{
   117  				ICMPv4PacketStats: tcpip.ICMPv4PacketStats{
   118  					EchoRequest:    mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_request", "Number of ICMPv4 echo request packets sent."),
   119  					EchoReply:      mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_reply", "Number of ICMPv4 echo reply packets sent."),
   120  					DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_sent/dst_unreachable", "Number of ICMPv4 destination unreachable packets sent."),
   121  					SrcQuench:      mustCreateMetric("/netstack/icmp/v4/packets_sent/src_quench", "Number of ICMPv4 source quench packets sent."),
   122  					Redirect:       mustCreateMetric("/netstack/icmp/v4/packets_sent/redirect", "Number of ICMPv4 redirect packets sent."),
   123  					TimeExceeded:   mustCreateMetric("/netstack/icmp/v4/packets_sent/time_exceeded", "Number of ICMPv4 time exceeded packets sent."),
   124  					ParamProblem:   mustCreateMetric("/netstack/icmp/v4/packets_sent/param_problem", "Number of ICMPv4 parameter problem packets sent."),
   125  					Timestamp:      mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp", "Number of ICMPv4 timestamp packets sent."),
   126  					TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp_reply", "Number of ICMPv4 timestamp reply packets sent."),
   127  					InfoRequest:    mustCreateMetric("/netstack/icmp/v4/packets_sent/info_request", "Number of ICMPv4 information request packets sent."),
   128  					InfoReply:      mustCreateMetric("/netstack/icmp/v4/packets_sent/info_reply", "Number of ICMPv4 information reply packets sent."),
   129  				},
   130  				Dropped:     mustCreateMetric("/netstack/icmp/v4/packets_sent/dropped", "Number of ICMPv4 packets dropped due to link layer errors."),
   131  				RateLimited: mustCreateMetric("/netstack/icmp/v4/packets_sent/rate_limited", "Number of ICMPv4 packets dropped due to rate limit being exceeded."),
   132  			},
   133  			PacketsReceived: tcpip.ICMPv4ReceivedPacketStats{
   134  				ICMPv4PacketStats: tcpip.ICMPv4PacketStats{
   135  					EchoRequest:    mustCreateMetric("/netstack/icmp/v4/packets_received/echo_request", "Number of ICMPv4 echo request packets received."),
   136  					EchoReply:      mustCreateMetric("/netstack/icmp/v4/packets_received/echo_reply", "Number of ICMPv4 echo reply packets received."),
   137  					DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_received/dst_unreachable", "Number of ICMPv4 destination unreachable packets received."),
   138  					SrcQuench:      mustCreateMetric("/netstack/icmp/v4/packets_received/src_quench", "Number of ICMPv4 source quench packets received."),
   139  					Redirect:       mustCreateMetric("/netstack/icmp/v4/packets_received/redirect", "Number of ICMPv4 redirect packets received."),
   140  					TimeExceeded:   mustCreateMetric("/netstack/icmp/v4/packets_received/time_exceeded", "Number of ICMPv4 time exceeded packets received."),
   141  					ParamProblem:   mustCreateMetric("/netstack/icmp/v4/packets_received/param_problem", "Number of ICMPv4 parameter problem packets received."),
   142  					Timestamp:      mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp", "Number of ICMPv4 timestamp packets received."),
   143  					TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp_reply", "Number of ICMPv4 timestamp reply packets received."),
   144  					InfoRequest:    mustCreateMetric("/netstack/icmp/v4/packets_received/info_request", "Number of ICMPv4 information request packets received."),
   145  					InfoReply:      mustCreateMetric("/netstack/icmp/v4/packets_received/info_reply", "Number of ICMPv4 information reply packets received."),
   146  				},
   147  				Invalid: mustCreateMetric("/netstack/icmp/v4/packets_received/invalid", "Number of ICMPv4 packets received that the transport layer could not parse."),
   148  			},
   149  		},
   150  		V6: tcpip.ICMPv6Stats{
   151  			PacketsSent: tcpip.ICMPv6SentPacketStats{
   152  				ICMPv6PacketStats: tcpip.ICMPv6PacketStats{
   153  					EchoRequest:             mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_request", "Number of ICMPv6 echo request packets sent."),
   154  					EchoReply:               mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_reply", "Number of ICMPv6 echo reply packets sent."),
   155  					DstUnreachable:          mustCreateMetric("/netstack/icmp/v6/packets_sent/dst_unreachable", "Number of ICMPv6 destination unreachable packets sent."),
   156  					PacketTooBig:            mustCreateMetric("/netstack/icmp/v6/packets_sent/packet_too_big", "Number of ICMPv6 packet too big packets sent."),
   157  					TimeExceeded:            mustCreateMetric("/netstack/icmp/v6/packets_sent/time_exceeded", "Number of ICMPv6 time exceeded packets sent."),
   158  					ParamProblem:            mustCreateMetric("/netstack/icmp/v6/packets_sent/param_problem", "Number of ICMPv6 parameter problem packets sent."),
   159  					RouterSolicit:           mustCreateMetric("/netstack/icmp/v6/packets_sent/router_solicit", "Number of ICMPv6 router solicit packets sent."),
   160  					RouterAdvert:            mustCreateMetric("/netstack/icmp/v6/packets_sent/router_advert", "Number of ICMPv6 router advert packets sent."),
   161  					NeighborSolicit:         mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_solicit", "Number of ICMPv6 neighbor solicit packets sent."),
   162  					NeighborAdvert:          mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_advert", "Number of ICMPv6 neighbor advert packets sent."),
   163  					RedirectMsg:             mustCreateMetric("/netstack/icmp/v6/packets_sent/redirect_msg", "Number of ICMPv6 redirect message packets sent."),
   164  					MulticastListenerQuery:  mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_query", "Number of ICMPv6 multicast listener query packets sent."),
   165  					MulticastListenerReport: mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_report", "Number of ICMPv6 multicast listener report packets sent."),
   166  					MulticastListenerDone:   mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_done", "Number of ICMPv6 multicast listener done packets sent."),
   167  				},
   168  				Dropped:     mustCreateMetric("/netstack/icmp/v6/packets_sent/dropped", "Number of ICMPv6 packets dropped due to link layer errors."),
   169  				RateLimited: mustCreateMetric("/netstack/icmp/v6/packets_sent/rate_limited", "Number of ICMPv6 packets dropped due to rate limit being exceeded."),
   170  			},
   171  			PacketsReceived: tcpip.ICMPv6ReceivedPacketStats{
   172  				ICMPv6PacketStats: tcpip.ICMPv6PacketStats{
   173  					EchoRequest:             mustCreateMetric("/netstack/icmp/v6/packets_received/echo_request", "Number of ICMPv6 echo request packets received."),
   174  					EchoReply:               mustCreateMetric("/netstack/icmp/v6/packets_received/echo_reply", "Number of ICMPv6 echo reply packets received."),
   175  					DstUnreachable:          mustCreateMetric("/netstack/icmp/v6/packets_received/dst_unreachable", "Number of ICMPv6 destination unreachable packets received."),
   176  					PacketTooBig:            mustCreateMetric("/netstack/icmp/v6/packets_received/packet_too_big", "Number of ICMPv6 packet too big packets received."),
   177  					TimeExceeded:            mustCreateMetric("/netstack/icmp/v6/packets_received/time_exceeded", "Number of ICMPv6 time exceeded packets received."),
   178  					ParamProblem:            mustCreateMetric("/netstack/icmp/v6/packets_received/param_problem", "Number of ICMPv6 parameter problem packets received."),
   179  					RouterSolicit:           mustCreateMetric("/netstack/icmp/v6/packets_received/router_solicit", "Number of ICMPv6 router solicit packets received."),
   180  					RouterAdvert:            mustCreateMetric("/netstack/icmp/v6/packets_received/router_advert", "Number of ICMPv6 router advert packets received."),
   181  					NeighborSolicit:         mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_solicit", "Number of ICMPv6 neighbor solicit packets received."),
   182  					NeighborAdvert:          mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_advert", "Number of ICMPv6 neighbor advert packets received."),
   183  					RedirectMsg:             mustCreateMetric("/netstack/icmp/v6/packets_received/redirect_msg", "Number of ICMPv6 redirect message packets received."),
   184  					MulticastListenerQuery:  mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_query", "Number of ICMPv6 multicast listener query packets received."),
   185  					MulticastListenerReport: mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_report", "Number of ICMPv6 multicast listener report packets sent."),
   186  					MulticastListenerDone:   mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_done", "Number of ICMPv6 multicast listener done packets sent."),
   187  				},
   188  				Unrecognized:                   mustCreateMetric("/netstack/icmp/v6/packets_received/unrecognized", "Number of ICMPv6 packets received that the transport layer does not know how to parse."),
   189  				Invalid:                        mustCreateMetric("/netstack/icmp/v6/packets_received/invalid", "Number of ICMPv6 packets received that the transport layer could not parse."),
   190  				RouterOnlyPacketsDroppedByHost: mustCreateMetric("/netstack/icmp/v6/packets_received/router_only_packets_dropped_by_host", "Number of ICMPv6 packets dropped due to being router-specific packets."),
   191  			},
   192  		},
   193  	},
   194  	IGMP: tcpip.IGMPStats{
   195  		PacketsSent: tcpip.IGMPSentPacketStats{
   196  			IGMPPacketStats: tcpip.IGMPPacketStats{
   197  				MembershipQuery:    mustCreateMetric("/netstack/igmp/packets_sent/membership_query", "Number of IGMP Membership Query messages sent."),
   198  				V1MembershipReport: mustCreateMetric("/netstack/igmp/packets_sent/v1_membership_report", "Number of IGMPv1 Membership Report messages sent."),
   199  				V2MembershipReport: mustCreateMetric("/netstack/igmp/packets_sent/v2_membership_report", "Number of IGMPv2 Membership Report messages sent."),
   200  				LeaveGroup:         mustCreateMetric("/netstack/igmp/packets_sent/leave_group", "Number of IGMP Leave Group messages sent."),
   201  			},
   202  			Dropped: mustCreateMetric("/netstack/igmp/packets_sent/dropped", "Number of IGMP packets dropped due to link layer errors."),
   203  		},
   204  		PacketsReceived: tcpip.IGMPReceivedPacketStats{
   205  			IGMPPacketStats: tcpip.IGMPPacketStats{
   206  				MembershipQuery:    mustCreateMetric("/netstack/igmp/packets_received/membership_query", "Number of IGMP Membership Query messages received."),
   207  				V1MembershipReport: mustCreateMetric("/netstack/igmp/packets_received/v1_membership_report", "Number of IGMPv1 Membership Report messages received."),
   208  				V2MembershipReport: mustCreateMetric("/netstack/igmp/packets_received/v2_membership_report", "Number of IGMPv2 Membership Report messages received."),
   209  				LeaveGroup:         mustCreateMetric("/netstack/igmp/packets_received/leave_group", "Number of IGMP Leave Group messages received."),
   210  			},
   211  			Invalid:        mustCreateMetric("/netstack/igmp/packets_received/invalid", "Number of IGMP packets received that could not be parsed."),
   212  			ChecksumErrors: mustCreateMetric("/netstack/igmp/packets_received/checksum_errors", "Number of received IGMP packets with bad checksums."),
   213  			Unrecognized:   mustCreateMetric("/netstack/igmp/packets_received/unrecognized", "Number of unrecognized IGMP packets received."),
   214  		},
   215  	},
   216  	IP: tcpip.IPStats{
   217  		PacketsReceived:                     mustCreateMetric("/netstack/ip/packets_received", "Number of IP packets received from the link layer in nic.DeliverNetworkPacket."),
   218  		DisabledPacketsReceived:             mustCreateMetric("/netstack/ip/disabled_packets_received", "Number of IP packets received from the link layer when the IP layer is disabled."),
   219  		InvalidDestinationAddressesReceived: mustCreateMetric("/netstack/ip/invalid_addresses_received", "Number of IP packets received with an unknown or invalid destination address."),
   220  		InvalidSourceAddressesReceived:      mustCreateMetric("/netstack/ip/invalid_source_addresses_received", "Number of IP packets received with an unknown or invalid source address."),
   221  		PacketsDelivered:                    mustCreateMetric("/netstack/ip/packets_delivered", "Number of incoming IP packets that are successfully delivered to the transport layer via HandlePacket."),
   222  		PacketsSent:                         mustCreateMetric("/netstack/ip/packets_sent", "Number of IP packets sent via WritePacket."),
   223  		OutgoingPacketErrors:                mustCreateMetric("/netstack/ip/outgoing_packet_errors", "Number of IP packets which failed to write to a link-layer endpoint."),
   224  		MalformedPacketsReceived:            mustCreateMetric("/netstack/ip/malformed_packets_received", "Number of IP packets which failed IP header validation checks."),
   225  		MalformedFragmentsReceived:          mustCreateMetric("/netstack/ip/malformed_fragments_received", "Number of IP fragments which failed IP fragment validation checks."),
   226  		IPTablesPreroutingDropped:           mustCreateMetric("/netstack/ip/iptables/prerouting_dropped", "Number of IP packets dropped in the Prerouting chain."),
   227  		IPTablesInputDropped:                mustCreateMetric("/netstack/ip/iptables/input_dropped", "Number of IP packets dropped in the Input chain."),
   228  		IPTablesOutputDropped:               mustCreateMetric("/netstack/ip/iptables/output_dropped", "Number of IP packets dropped in the Output chain."),
   229  		OptionTimestampReceived:             mustCreateMetric("/netstack/ip/options/timestamp_received", "Number of timestamp options found in received IP packets."),
   230  		OptionRecordRouteReceived:           mustCreateMetric("/netstack/ip/options/record_route_received", "Number of record route options found in received IP packets."),
   231  		OptionRouterAlertReceived:           mustCreateMetric("/netstack/ip/options/router_alert_received", "Number of router alert options found in received IP packets."),
   232  		OptionUnknownReceived:               mustCreateMetric("/netstack/ip/options/unknown_received", "Number of unknown options found in received IP packets."),
   233  		Forwarding: tcpip.IPForwardingStats{
   234  			Unrouteable:            mustCreateMetric("/netstack/ip/forwarding/unrouteable", "Number of IP packets received which couldn't be routed and thus were not forwarded."),
   235  			ExhaustedTTL:           mustCreateMetric("/netstack/ip/forwarding/exhausted_ttl", "Number of IP packets received which could not be forwarded due to an exhausted TTL."),
   236  			LinkLocalSource:        mustCreateMetric("/netstack/ip/forwarding/link_local_source_address", "Number of IP packets received which could not be forwarded due to a link-local source address."),
   237  			LinkLocalDestination:   mustCreateMetric("/netstack/ip/forwarding/link_local_destination_address", "Number of IP packets received which could not be forwarded due to a link-local destination address."),
   238  			ExtensionHeaderProblem: mustCreateMetric("/netstack/ip/forwarding/extension_header_problem", "Number of IP packets received which could not be forwarded due to a problem processing their IPv6 extension headers."),
   239  			PacketTooBig:           mustCreateMetric("/netstack/ip/forwarding/packet_too_big", "Number of IP packets received which could not be forwarded because they could not fit within the outgoing MTU."),
   240  			HostUnreachable:        mustCreateMetric("/netstack/ip/forwarding/host_unreachable", "Number of IP packets received which could not be forwarded due to unresolvable next hop."),
   241  			Errors:                 mustCreateMetric("/netstack/ip/forwarding/errors", "Number of IP packets which couldn't be forwarded."),
   242  		},
   243  	},
   244  	ARP: tcpip.ARPStats{
   245  		PacketsReceived:                                 mustCreateMetric("/netstack/arp/packets_received", "Number of ARP packets received from the link layer."),
   246  		DisabledPacketsReceived:                         mustCreateMetric("/netstack/arp/disabled_packets_received", "Number of ARP packets received from the link layer when the ARP layer is disabled."),
   247  		MalformedPacketsReceived:                        mustCreateMetric("/netstack/arp/malformed_packets_received", "Number of ARP packets which failed ARP header validation checks."),
   248  		RequestsReceived:                                mustCreateMetric("/netstack/arp/requests_received", "Number of ARP requests received."),
   249  		RequestsReceivedUnknownTargetAddress:            mustCreateMetric("/netstack/arp/requests_received_unknown_addr", "Number of ARP requests received with an unknown target address."),
   250  		OutgoingRequestInterfaceHasNoLocalAddressErrors: mustCreateMetric("/netstack/arp/outgoing_requests_iface_has_no_addr", "Number of failed attempts to send an ARP request with an interface that has no network address."),
   251  		OutgoingRequestBadLocalAddressErrors:            mustCreateMetric("/netstack/arp/outgoing_requests_invalid_local_addr", "Number of failed attempts to send an ARP request with a provided local address that is invalid."),
   252  		OutgoingRequestsDropped:                         mustCreateMetric("/netstack/arp/outgoing_requests_dropped", "Number of ARP requests which failed to write to a link-layer endpoint."),
   253  		OutgoingRequestsSent:                            mustCreateMetric("/netstack/arp/outgoing_requests_sent", "Number of ARP requests sent."),
   254  		RepliesReceived:                                 mustCreateMetric("/netstack/arp/replies_received", "Number of ARP replies received."),
   255  		OutgoingRepliesDropped:                          mustCreateMetric("/netstack/arp/outgoing_replies_dropped", "Number of ARP replies which failed to write to a link-layer endpoint."),
   256  		OutgoingRepliesSent:                             mustCreateMetric("/netstack/arp/outgoing_replies_sent", "Number of ARP replies sent."),
   257  	},
   258  	TCP: tcpip.TCPStats{
   259  		ActiveConnectionOpenings:           mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."),
   260  		PassiveConnectionOpenings:          mustCreateMetric("/netstack/tcp/passive_connection_openings", "Number of connections opened successfully via Listen."),
   261  		CurrentEstablished:                 mustCreateGauge("/netstack/tcp/current_established", "Number of connections in ESTABLISHED state now."),
   262  		CurrentConnected:                   mustCreateGauge("/netstack/tcp/current_open", "Number of connections that are in connected state."),
   263  		EstablishedResets:                  mustCreateMetric("/netstack/tcp/established_resets", "Number of times TCP connections have made a direct transition to the CLOSED state from either the ESTABLISHED state or the CLOSE-WAIT state"),
   264  		EstablishedClosed:                  mustCreateMetric("/netstack/tcp/established_closed", "Number of times established TCP connections made a transition to CLOSED state."),
   265  		EstablishedTimedout:                mustCreateMetric("/netstack/tcp/established_timedout", "Number of times  an established connection was reset because of keep-alive time out."),
   266  		ListenOverflowSynDrop:              mustCreateMetric("/netstack/tcp/listen_overflow_syn_drop", "Number of times the listen queue overflowed and a SYN was dropped."),
   267  		ListenOverflowAckDrop:              mustCreateMetric("/netstack/tcp/listen_overflow_ack_drop", "Number of times the listen queue overflowed and the final ACK in the handshake was dropped."),
   268  		ListenOverflowSynCookieSent:        mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_sent", "Number of times a SYN cookie was sent."),
   269  		ListenOverflowSynCookieRcvd:        mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_rcvd", "Number of times a SYN cookie was received."),
   270  		ListenOverflowInvalidSynCookieRcvd: mustCreateMetric("/netstack/tcp/listen_overflow_invalid_syn_cookie_rcvd", "Number of times an invalid SYN cookie was received."),
   271  		FailedConnectionAttempts:           mustCreateMetric("/netstack/tcp/failed_connection_attempts", "Number of calls to Connect or Listen (active and passive openings, respectively) that end in an error."),
   272  		ValidSegmentsReceived:              mustCreateMetric("/netstack/tcp/valid_segments_received", "Number of TCP segments received that the transport layer successfully parsed."),
   273  		InvalidSegmentsReceived:            mustCreateMetric("/netstack/tcp/invalid_segments_received", "Number of TCP segments received that the transport layer could not parse."),
   274  		SegmentsSent:                       mustCreateMetric("/netstack/tcp/segments_sent", "Number of TCP segments sent."),
   275  		SegmentSendErrors:                  mustCreateMetric("/netstack/tcp/segment_send_errors", "Number of TCP segments failed to be sent."),
   276  		ResetsSent:                         mustCreateMetric("/netstack/tcp/resets_sent", "Number of TCP resets sent."),
   277  		ResetsReceived:                     mustCreateMetric("/netstack/tcp/resets_received", "Number of TCP resets received."),
   278  		Retransmits:                        mustCreateMetric("/netstack/tcp/retransmits", "Number of TCP segments retransmitted."),
   279  		FastRecovery:                       mustCreateMetric("/netstack/tcp/fast_recovery", "Number of times fast recovery was used to recover from packet loss."),
   280  		SACKRecovery:                       mustCreateMetric("/netstack/tcp/sack_recovery", "Number of times SACK recovery was used to recover from packet loss."),
   281  		TLPRecovery:                        mustCreateMetric("/netstack/tcp/tlp_recovery", "Number of times tail loss probe triggers recovery from tail loss."),
   282  		SlowStartRetransmits:               mustCreateMetric("/netstack/tcp/slow_start_retransmits", "Number of segments retransmitted in slow start mode."),
   283  		FastRetransmit:                     mustCreateMetric("/netstack/tcp/fast_retransmit", "Number of TCP segments which were fast retransmitted."),
   284  		Timeouts:                           mustCreateMetric("/netstack/tcp/timeouts", "Number of times RTO expired."),
   285  		ChecksumErrors:                     mustCreateMetric("/netstack/tcp/checksum_errors", "Number of segments dropped due to bad checksums."),
   286  		FailedPortReservations:             mustCreateMetric("/netstack/tcp/failed_port_reservations", "Number of time TCP failed to reserve a port."),
   287  		SegmentsAckedWithDSACK:             mustCreateMetric("/netstack/tcp/segments_acked_with_dsack", "Number of segments for which DSACK was received."),
   288  		SpuriousRecovery:                   mustCreateMetric("/netstack/tcp/spurious_recovery", "Number of times the connection entered loss recovery spuriously."),
   289  		SpuriousRTORecovery:                mustCreateMetric("/netstack/tcp/spurious_rto_recovery", "Number of times the connection entered RTO spuriously."),
   290  		ForwardMaxInFlightDrop:             mustCreateMetric("/netstack/tcp/forward_max_in_flight_drop", "Number of connection requests dropped due to exceeding in-flight limit."),
   291  	},
   292  	UDP: tcpip.UDPStats{
   293  		PacketsReceived:          mustCreateMetric("/netstack/udp/packets_received", "Number of UDP datagrams received via HandlePacket."),
   294  		UnknownPortErrors:        mustCreateMetric("/netstack/udp/unknown_port_errors", "Number of incoming UDP datagrams dropped because they did not have a known destination port."),
   295  		ReceiveBufferErrors:      mustCreateMetric("/netstack/udp/receive_buffer_errors", "Number of incoming UDP datagrams dropped due to the receiving buffer being in an invalid state."),
   296  		MalformedPacketsReceived: mustCreateMetric("/netstack/udp/malformed_packets_received", "Number of incoming UDP datagrams dropped due to the UDP header being in a malformed state."),
   297  		PacketsSent:              mustCreateMetric("/netstack/udp/packets_sent", "Number of UDP datagrams sent."),
   298  		PacketSendErrors:         mustCreateMetric("/netstack/udp/packet_send_errors", "Number of UDP datagrams failed to be sent."),
   299  		ChecksumErrors:           mustCreateMetric("/netstack/udp/checksum_errors", "Number of UDP datagrams dropped due to bad checksums."),
   300  	},
   301  }
   302  
   303  // DefaultTTL is linux's default TTL. All network protocols in all stacks used
   304  // with this package must have this value set as their default TTL.
   305  const DefaultTTL = 64
   306  
   307  const sizeOfInt32 int = 4
   308  
// errStackType is the syserr error, tied to errno.EINVAL, whose message says
// that a netstack.Stack was expected but not received.
// NOTE(review): presumably returned when the stack handed to this package is
// not a netstack Stack; confirm against the call sites.
var errStackType = syserr.New("expected but did not receive a netstack.Stack", errno.EINVAL)
   310  
   311  // commonEndpoint represents the intersection of a tcpip.Endpoint and a
   312  // transport.Endpoint.
   313  type commonEndpoint interface {
   314  	// Readiness implements tcpip.Endpoint.Readiness and
   315  	// transport.Endpoint.Readiness.
   316  	Readiness(mask waiter.EventMask) waiter.EventMask
   317  
   318  	// SetSockOpt implements tcpip.Endpoint.SetSockOpt and
   319  	// transport.Endpoint.SetSockOpt.
   320  	SetSockOpt(tcpip.SettableSocketOption) tcpip.Error
   321  
   322  	// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt and
   323  	// transport.Endpoint.SetSockOptInt.
   324  	SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error
   325  
   326  	// GetSockOpt implements tcpip.Endpoint.GetSockOpt and
   327  	// transport.Endpoint.GetSockOpt.
   328  	GetSockOpt(tcpip.GettableSocketOption) tcpip.Error
   329  
   330  	// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt and
   331  	// transport.Endpoint.GetSockOpt.
   332  	GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error)
   333  
   334  	// State returns a socket's lifecycle state. The returned value is
   335  	// protocol-specific and is primarily used for diagnostics.
   336  	State() uint32
   337  
   338  	// LastError implements tcpip.Endpoint.LastError and
   339  	// transport.Endpoint.LastError.
   340  	LastError() tcpip.Error
   341  
   342  	// SocketOptions implements tcpip.Endpoint.SocketOptions and
   343  	// transport.Endpoint.SocketOptions.
   344  	SocketOptions() *tcpip.SocketOptions
   345  }
   346  
// sock encapsulates all the state needed to represent a network stack
// endpoint in the kernel context.
//
// It embeds the VFS file description helpers, the send/receive timeout state
// and the waiter queue that events for the endpoint are delivered on.
//
// +stateify savable
type sock struct {
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.DentryMetadataFileDescriptionImpl
	vfs.LockFD
	socket.SendReceiveTimeout
	*waiter.Queue

	// family, skType and protocol are the values the socket was created
	// with; Endpoint is the underlying transport endpoint.
	family   int
	Endpoint tcpip.Endpoint
	skType   linux.SockType
	protocol int

	// namespace is the network namespace of the creating task. New takes a
	// reference on it and Release drops that reference.
	namespace *inet.Namespace

	// readMu protects access to the below fields.
	readMu sync.Mutex `state:"nosave"`

	// sockOptTimestamp corresponds to SO_TIMESTAMP. When true, timestamps
	// of returned messages can be returned via control messages. When
	// false, the same timestamp is instead stored and can be read via the
	// SIOCGSTAMP ioctl. It is protected by readMu. See socket(7).
	sockOptTimestamp bool
	// timestampValid indicates whether timestamp for SIOCGSTAMP has been
	// set. It is protected by readMu.
	timestampValid bool
	// timestamp holds the timestamp to use with SIOCGSTAMP. It is only
	// valid when timestampValid is true. It is protected by readMu.
	timestamp time.Time `state:".(int64)"`

	// TODO(b/153685824): Move this to SocketOptions.
	// sockOptInq corresponds to TCP_INQ. GetSockOpt and SetSockOpt access
	// it while holding readMu.
	sockOptInq bool
}
   385  
   386  var _ = socket.Socket(&sock{})
   387  
   388  // New creates a new endpoint socket.
   389  func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*vfs.FileDescription, *syserr.Error) {
   390  	if skType == linux.SOCK_STREAM {
   391  		endpoint.SocketOptions().SetDelayOption(true)
   392  	}
   393  
   394  	mnt := t.Kernel().SocketMount()
   395  	d := sockfs.NewDentry(t, mnt)
   396  	defer d.DecRef(t)
   397  
   398  	namespace := t.NetworkNamespace()
   399  	s := &sock{
   400  		Queue:     queue,
   401  		family:    family,
   402  		Endpoint:  endpoint,
   403  		skType:    skType,
   404  		protocol:  protocol,
   405  		namespace: namespace,
   406  	}
   407  	s.LockFD.Init(&vfs.FileLocks{})
   408  	vfsfd := &s.vfsfd
   409  	if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{
   410  		DenyPRead:         true,
   411  		DenyPWrite:        true,
   412  		UseDentryMetadata: true,
   413  	}); err != nil {
   414  		return nil, syserr.FromError(err)
   415  	}
   416  	namespace.IncRef()
   417  	return vfsfd, nil
   418  }
   419  
   420  // Release implements vfs.FileDescriptionImpl.Release.
   421  func (s *sock) Release(ctx context.Context) {
   422  	kernel.KernelFromContext(ctx).DeleteSocket(&s.vfsfd)
   423  	e, ch := waiter.NewChannelEntry(waiter.EventHUp | waiter.EventErr)
   424  	s.EventRegister(&e)
   425  	defer s.EventUnregister(&e)
   426  
   427  	s.Endpoint.Close()
   428  
   429  	// SO_LINGER option is valid only for TCP. For other socket types
   430  	// return after endpoint close.
   431  	if family, skType, _ := s.Type(); skType == linux.SOCK_STREAM && (family == linux.AF_INET || family == linux.AF_INET6) {
   432  		v := s.Endpoint.SocketOptions().GetLinger()
   433  		// The case for zero timeout is handled in tcp endpoint close function.
   434  		// Close is blocked until either:
   435  		// 1. The endpoint state is not in any of the states: FIN-WAIT1,
   436  		// CLOSING and LAST_ACK.
   437  		// 2. Timeout is reached.
   438  		if v.Enabled && v.Timeout != 0 {
   439  			t := kernel.TaskFromContext(ctx)
   440  			start := t.Kernel().MonotonicClock().Now()
   441  			deadline := start.Add(v.Timeout)
   442  			_ = t.BlockWithDeadline(ch, true, deadline)
   443  		}
   444  	}
   445  	s.namespace.DecRef(ctx)
   446  }
   447  
// Epollable implements FileDescriptionImpl.Epollable.
//
// It always returns true for sock.
func (s *sock) Epollable() bool {
	return true
}
   452  
   453  // Read implements vfs.FileDescriptionImpl.
   454  func (s *sock) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
   455  	// All flags other than RWF_NOWAIT should be ignored.
   456  	// TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT.
   457  	if opts.Flags != 0 {
   458  		return 0, linuxerr.EOPNOTSUPP
   459  	}
   460  
   461  	if dst.NumBytes() == 0 {
   462  		return 0, nil
   463  	}
   464  	n, _, _, _, _, err := s.nonBlockingRead(ctx, dst, false, false, false)
   465  	if err == syserr.ErrWouldBlock {
   466  		return int64(n), linuxerr.ErrWouldBlock
   467  	}
   468  	if err != nil {
   469  		return 0, err.ToError()
   470  	}
   471  	return int64(n), nil
   472  }
   473  
   474  // Write implements vfs.FileDescriptionImpl.
   475  func (s *sock) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
   476  	// All flags other than RWF_NOWAIT should be ignored.
   477  	// TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT.
   478  	if opts.Flags != 0 {
   479  		return 0, linuxerr.EOPNOTSUPP
   480  	}
   481  
   482  	r := src.Reader(ctx)
   483  	n, err := s.Endpoint.Write(r, tcpip.WriteOptions{})
   484  	if _, ok := err.(*tcpip.ErrWouldBlock); ok {
   485  		return 0, linuxerr.ErrWouldBlock
   486  	}
   487  	if err != nil {
   488  		return 0, syserr.TranslateNetstackError(err).ToError()
   489  	}
   490  
   491  	if n < src.NumBytes() {
   492  		return n, linuxerr.ErrWouldBlock
   493  	}
   494  
   495  	return n, nil
   496  }
   497  
   498  // Accept implements the linux syscall accept(2) for sockets backed by
   499  // tcpip.Endpoint.
   500  func (s *sock) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
   501  	// Issue the accept request to get the new endpoint.
   502  	var peerAddr *tcpip.FullAddress
   503  	if peerRequested {
   504  		peerAddr = &tcpip.FullAddress{}
   505  	}
   506  	ep, wq, terr := s.Endpoint.Accept(peerAddr)
   507  	if terr != nil {
   508  		if _, ok := terr.(*tcpip.ErrWouldBlock); !ok || !blocking {
   509  			return 0, nil, 0, syserr.TranslateNetstackError(terr)
   510  		}
   511  
   512  		var err *syserr.Error
   513  		ep, wq, err = s.blockingAccept(t, peerAddr)
   514  		if err != nil {
   515  			return 0, nil, 0, err
   516  		}
   517  	}
   518  
   519  	ns, err := New(t, s.family, s.skType, s.protocol, wq, ep)
   520  	if err != nil {
   521  		return 0, nil, 0, err
   522  	}
   523  	defer ns.DecRef(t)
   524  
   525  	if err := ns.SetStatusFlags(t, t.Credentials(), uint32(flags&linux.SOCK_NONBLOCK)); err != nil {
   526  		return 0, nil, 0, syserr.FromError(err)
   527  	}
   528  
   529  	var addr linux.SockAddr
   530  	var addrLen uint32
   531  	if peerAddr != nil {
   532  		// Get address of the peer and write it to peer slice.
   533  		addr, addrLen = socket.ConvertAddress(s.family, *peerAddr)
   534  	}
   535  
   536  	fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{
   537  		CloseOnExec: flags&linux.SOCK_CLOEXEC != 0,
   538  	})
   539  
   540  	t.Kernel().RecordSocket(ns)
   541  
   542  	return fd, addr, addrLen, syserr.FromError(e)
   543  }
   544  
   545  // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
   546  // tcpip.Endpoint.
   547  func (s *sock) GetSockOpt(t *kernel.Task, level, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
   548  	// TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is
   549  	// implemented specifically for netstack.Socket rather than
   550  	// commonEndpoint. commonEndpoint should be extended to support socket
   551  	// options where the implementation is not shared, as unix sockets need
   552  	// their own support for SO_TIMESTAMP.
   553  	if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP {
   554  		if outLen < sizeOfInt32 {
   555  			return nil, syserr.ErrInvalidArgument
   556  		}
   557  		val := primitive.Int32(0)
   558  		s.readMu.Lock()
   559  		defer s.readMu.Unlock()
   560  		if s.sockOptTimestamp {
   561  			val = 1
   562  		}
   563  		return &val, nil
   564  	}
   565  	if level == linux.SOL_TCP && name == linux.TCP_INQ {
   566  		if outLen < sizeOfInt32 {
   567  			return nil, syserr.ErrInvalidArgument
   568  		}
   569  		val := primitive.Int32(0)
   570  		s.readMu.Lock()
   571  		defer s.readMu.Unlock()
   572  		if s.sockOptInq {
   573  			val = 1
   574  		}
   575  		return &val, nil
   576  	}
   577  
   578  	return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outPtr, outLen)
   579  }
   580  
   581  // SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by
   582  // tcpip.Endpoint.
   583  func (s *sock) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error {
   584  	// TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is
   585  	// implemented specifically for netstack.Socket rather than
   586  	// commonEndpoint. commonEndpoint should be extended to support socket
   587  	// options where the implementation is not shared, as unix sockets need
   588  	// their own support for SO_TIMESTAMP.
   589  	if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP {
   590  		if len(optVal) < sizeOfInt32 {
   591  			return syserr.ErrInvalidArgument
   592  		}
   593  		s.readMu.Lock()
   594  		defer s.readMu.Unlock()
   595  		s.sockOptTimestamp = hostarch.ByteOrder.Uint32(optVal) != 0
   596  		return nil
   597  	}
   598  	if level == linux.SOL_TCP && name == linux.TCP_INQ {
   599  		if len(optVal) < sizeOfInt32 {
   600  			return syserr.ErrInvalidArgument
   601  		}
   602  		s.readMu.Lock()
   603  		defer s.readMu.Unlock()
   604  		s.sockOptInq = hostarch.ByteOrder.Uint32(optVal) != 0
   605  		return nil
   606  	}
   607  
   608  	return SetSockOpt(t, s, s.Endpoint, level, name, optVal)
   609  }
   610  
   611  var sockAddrInetSize = (*linux.SockAddrInet)(nil).SizeBytes()
   612  var sockAddrInet6Size = (*linux.SockAddrInet6)(nil).SizeBytes()
   613  var sockAddrLinkSize = (*linux.SockAddrLink)(nil).SizeBytes()
   614  
   615  // minSockAddrLen returns the minimum length in bytes of a socket address for
   616  // the socket's family.
   617  func (s *sock) minSockAddrLen() int {
   618  	const addressFamilySize = 2
   619  
   620  	switch s.family {
   621  	case linux.AF_UNIX:
   622  		return addressFamilySize
   623  	case linux.AF_INET:
   624  		return sockAddrInetSize
   625  	case linux.AF_INET6:
   626  		return sockAddrInet6Size
   627  	case linux.AF_PACKET:
   628  		return sockAddrLinkSize
   629  	case linux.AF_UNSPEC:
   630  		return addressFamilySize
   631  	default:
   632  		panic(fmt.Sprintf("s.family unrecognized = %d", s.family))
   633  	}
   634  }
   635  
   636  func (s *sock) isPacketBased() bool {
   637  	return s.skType == linux.SOCK_DGRAM || s.skType == linux.SOCK_SEQPACKET || s.skType == linux.SOCK_RDM || s.skType == linux.SOCK_RAW
   638  }
   639  
// Readiness returns a mask of ready events for socket s.
//
// The query is forwarded unchanged to the underlying endpoint.
func (s *sock) Readiness(mask waiter.EventMask) waiter.EventMask {
	return s.Endpoint.Readiness(mask)
}
   644  
   645  // checkFamily returns true iff the specified address family may be used with
   646  // the socket.
   647  //
   648  // If exact is true, then the specified address family must be an exact match
   649  // with the socket's family.
   650  func (s *sock) checkFamily(family uint16, exact bool) bool {
   651  	if family == uint16(s.family) {
   652  		return true
   653  	}
   654  	if !exact && family == linux.AF_INET && s.family == linux.AF_INET6 {
   655  		if !s.Endpoint.SocketOptions().GetV6Only() {
   656  			return true
   657  		}
   658  	}
   659  	return false
   660  }
   661  
   662  // mapFamily maps the AF_INET ANY address to the IPv4-mapped IPv6 ANY if the
   663  // receiver's family is AF_INET6.
   664  //
   665  // This is a hack to work around the fact that both IPv4 and IPv6 ANY are
   666  // represented by the empty string.
   667  //
   668  // TODO(gvisor.dev/issue/1556): remove this function.
   669  func (s *sock) mapFamily(addr tcpip.FullAddress, family uint16) tcpip.FullAddress {
   670  	if addr.Addr.BitLen() == 0 && s.family == linux.AF_INET6 && family == linux.AF_INET {
   671  		addr.Addr = tcpip.AddrFrom16([16]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00})
   672  	}
   673  	return addr
   674  }
   675  
   676  // Connect implements the linux syscall connect(2) for sockets backed by
   677  // tpcip.Endpoint.
   678  func (s *sock) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
   679  	addr, family, err := socket.AddressAndFamily(sockaddr)
   680  	if err != nil {
   681  		return err
   682  	}
   683  
   684  	if family == linux.AF_UNSPEC {
   685  		err := s.Endpoint.Disconnect()
   686  		if _, ok := err.(*tcpip.ErrNotSupported); ok {
   687  			return syserr.ErrAddressFamilyNotSupported
   688  		}
   689  		return syserr.TranslateNetstackError(err)
   690  	}
   691  
   692  	if !s.checkFamily(family, false /* exact */) {
   693  		return syserr.ErrInvalidArgument
   694  	}
   695  	addr = s.mapFamily(addr, family)
   696  
   697  	// Always return right away in the non-blocking case.
   698  	if !blocking {
   699  		return syserr.TranslateNetstackError(s.Endpoint.Connect(addr))
   700  	}
   701  
   702  	// Register for notification when the endpoint becomes writable, then
   703  	// initiate the connection.
   704  	e, ch := waiter.NewChannelEntry(waiter.WritableEvents)
   705  	s.EventRegister(&e)
   706  	defer s.EventUnregister(&e)
   707  
   708  	switch err := s.Endpoint.Connect(addr); err.(type) {
   709  	case *tcpip.ErrConnectStarted, *tcpip.ErrAlreadyConnecting:
   710  	case *tcpip.ErrNoPortAvailable:
   711  		if (s.family == unix.AF_INET || s.family == unix.AF_INET6) && s.skType == linux.SOCK_STREAM {
   712  			// TCP unlike UDP returns EADDRNOTAVAIL when it can't
   713  			// find an available local ephemeral port.
   714  			return syserr.ErrAddressNotAvailable
   715  		}
   716  		return syserr.TranslateNetstackError(err)
   717  	default:
   718  		return syserr.TranslateNetstackError(err)
   719  	}
   720  
   721  	// It's pending, so we have to wait for a notification, and fetch the
   722  	// result once the wait completes.
   723  	if err := t.Block(ch); err != nil {
   724  		return syserr.FromError(err)
   725  	}
   726  
   727  	// Call Connect() again after blocking to find connect's result.
   728  	return syserr.TranslateNetstackError(s.Endpoint.Connect(addr))
   729  }
   730  
   731  // Bind implements the linux syscall bind(2) for sockets backed by
   732  // tcpip.Endpoint.
   733  func (s *sock) Bind(_ *kernel.Task, sockaddr []byte) *syserr.Error {
   734  	if len(sockaddr) < 2 {
   735  		return syserr.ErrInvalidArgument
   736  	}
   737  
   738  	family := hostarch.ByteOrder.Uint16(sockaddr)
   739  	var addr tcpip.FullAddress
   740  
   741  	// Bind for AF_PACKET requires only family, protocol and ifindex.
   742  	// In function AddressAndFamily, we check the address length which is
   743  	// not needed for AF_PACKET bind.
   744  	if family == linux.AF_PACKET {
   745  		var a linux.SockAddrLink
   746  		if len(sockaddr) < sockAddrLinkSize {
   747  			return syserr.ErrInvalidArgument
   748  		}
   749  		a.UnmarshalBytes(sockaddr)
   750  
   751  		addr = tcpip.FullAddress{
   752  			NIC: tcpip.NICID(a.InterfaceIndex),
   753  			Addr: tcpip.AddrFrom16Slice(append(
   754  				a.HardwareAddr[:header.EthernetAddressSize],
   755  				[]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}...,
   756  			)),
   757  			Port: socket.Ntohs(a.Protocol),
   758  		}
   759  	} else {
   760  		if s.minSockAddrLen() > len(sockaddr) {
   761  			return syserr.ErrInvalidArgument
   762  		}
   763  
   764  		var err *syserr.Error
   765  		addr, family, err = socket.AddressAndFamily(sockaddr)
   766  		if err != nil {
   767  			return err
   768  		}
   769  
   770  		if !s.checkFamily(family, true /* exact */) {
   771  			return syserr.ErrAddressFamilyNotSupported
   772  		}
   773  
   774  		addr = s.mapFamily(addr, family)
   775  	}
   776  
   777  	// Issue the bind request to the endpoint.
   778  	err := s.Endpoint.Bind(addr)
   779  	if _, ok := err.(*tcpip.ErrNoPortAvailable); ok {
   780  		// Bind always returns EADDRINUSE irrespective of if the specified port was
   781  		// already bound or if an ephemeral port was requested but none were
   782  		// available.
   783  		//
   784  		// *tcpip.ErrNoPortAvailable is mapped to EAGAIN in syserr package because
   785  		// UDP connect returns EAGAIN on ephemeral port exhaustion.
   786  		//
   787  		// TCP connect returns EADDRNOTAVAIL on ephemeral port exhaustion.
   788  		err = &tcpip.ErrPortInUse{}
   789  	}
   790  
   791  	return syserr.TranslateNetstackError(err)
   792  }
   793  
// Listen implements the linux syscall listen(2) for sockets backed by
// tcpip.Endpoint.
//
// backlog is passed through to the endpoint and the endpoint's error is
// translated into a syserr.Error.
func (s *sock) Listen(_ *kernel.Task, backlog int) *syserr.Error {
	return syserr.TranslateNetstackError(s.Endpoint.Listen(backlog))
}
   799  
   800  // blockingAccept implements a blocking version of accept(2), that is, if no
   801  // connections are ready to be accept, it will block until one becomes ready.
   802  func (s *sock) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) {
   803  	// Register for notifications.
   804  	e, ch := waiter.NewChannelEntry(waiter.ReadableEvents)
   805  	s.EventRegister(&e)
   806  	defer s.EventUnregister(&e)
   807  
   808  	// Try to accept the connection again; if it fails, then wait until we
   809  	// get a notification.
   810  	for {
   811  		ep, wq, err := s.Endpoint.Accept(peerAddr)
   812  		if _, ok := err.(*tcpip.ErrWouldBlock); !ok {
   813  			return ep, wq, syserr.TranslateNetstackError(err)
   814  		}
   815  
   816  		if err := t.Block(ch); err != nil {
   817  			return nil, nil, syserr.FromError(err)
   818  		}
   819  	}
   820  }
   821  
   822  // ConvertShutdown converts Linux shutdown flags into tcpip shutdown flags.
   823  func ConvertShutdown(how int) (tcpip.ShutdownFlags, *syserr.Error) {
   824  	var f tcpip.ShutdownFlags
   825  	switch how {
   826  	case linux.SHUT_RD:
   827  		f = tcpip.ShutdownRead
   828  	case linux.SHUT_WR:
   829  		f = tcpip.ShutdownWrite
   830  	case linux.SHUT_RDWR:
   831  		f = tcpip.ShutdownRead | tcpip.ShutdownWrite
   832  	default:
   833  		return 0, syserr.ErrInvalidArgument
   834  	}
   835  	return f, nil
   836  }
   837  
   838  // Shutdown implements the linux syscall shutdown(2) for sockets backed by
   839  // tcpip.Endpoint.
   840  func (s *sock) Shutdown(_ *kernel.Task, how int) *syserr.Error {
   841  	f, err := ConvertShutdown(how)
   842  	if err != nil {
   843  		return err
   844  	}
   845  
   846  	// Issue shutdown request.
   847  	return syserr.TranslateNetstackError(s.Endpoint.Shutdown(f))
   848  }
   849  
   850  // GetSockOpt can be used to implement the linux syscall getsockopt(2) for
   851  // sockets backed by a commonEndpoint.
   852  func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType linux.SockType, level, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
   853  	switch level {
   854  	case linux.SOL_SOCKET:
   855  		return getSockOptSocket(t, s, ep, family, skType, name, outLen)
   856  
   857  	case linux.SOL_TCP:
   858  		return getSockOptTCP(t, s, ep, name, outLen)
   859  
   860  	case linux.SOL_IPV6:
   861  		return getSockOptIPv6(t, s, ep, name, outPtr, outLen)
   862  
   863  	case linux.SOL_IP:
   864  		return getSockOptIP(t, s, ep, name, outPtr, outLen, family)
   865  
   866  	case linux.SOL_ICMPV6:
   867  		return getSockOptICMPv6(t, s, ep, name, outLen)
   868  
   869  	case linux.SOL_UDP,
   870  		linux.SOL_RAW,
   871  		linux.SOL_PACKET:
   872  		// Not supported.
   873  	}
   874  
   875  	return nil, syserr.ErrProtocolNotAvailable
   876  }
   877  
   878  func boolToInt32(v bool) int32 {
   879  	if v {
   880  		return 1
   881  	}
   882  	return 0
   883  }
   884  
// getSockOptSocket implements GetSockOpt when level is SOL_SOCKET.
//
// t is the calling task, s the socket (used for its send/receive timeouts
// and for the TCP check in SO_ACCEPTCONN), ep the endpoint whose options are
// read, family the socket's address family, name the option and outLen the
// size of the caller's buffer.
//
// It returns syserr.ErrInvalidArgument when outLen is too small for the
// requested option and syserr.ErrProtocolNotAvailable for option names that
// are not handled here.
func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, _ linux.SockType, name, outLen int) (marshal.Marshallable, *syserr.Error) {
	// TODO(b/124056281): Stop rejecting short optLen values in getsockopt.
	switch name {
	case linux.SO_ERROR:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		// Get the last error and convert it.
		err := ep.SocketOptions().GetLastError()
		if err == nil {
			optP := primitive.Int32(0)
			return &optP, nil
		}

		optP := primitive.Int32(syserr.TranslateNetstackError(err).ToLinux())
		return &optP, nil

	case linux.SO_PEERCRED:
		// Only answered for AF_UNIX sockets.
		if family != linux.AF_UNIX || outLen < unix.SizeofUcred {
			return nil, syserr.ErrInvalidArgument
		}

		// The reported values come from the calling task: its thread group
		// ID and its effective UID/GID mapped into its user namespace.
		tcred := t.Credentials()
		creds := linux.ControlMessageCredentials{
			PID: int32(t.ThreadGroup().ID()),
			UID: uint32(tcred.EffectiveKUID.In(tcred.UserNamespace).OrOverflow()),
			GID: uint32(tcred.EffectiveKGID.In(tcred.UserNamespace).OrOverflow()),
		}
		return &creds, nil

	case linux.SO_PASSCRED:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetPassCred()))
		return &v, nil

	case linux.SO_SNDBUF:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		size := ep.SocketOptions().GetSendBufferSize()

		// Clamp so the value fits in the int32 returned to the caller.
		if size > math.MaxInt32 {
			size = math.MaxInt32
		}

		sizeP := primitive.Int32(size)
		return &sizeP, nil

	case linux.SO_RCVBUF:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		size := ep.SocketOptions().GetReceiveBufferSize()

		// Clamp so the value fits in the int32 returned to the caller.
		if size > math.MaxInt32 {
			size = math.MaxInt32
		}

		sizeP := primitive.Int32(size)
		return &sizeP, nil

	case linux.SO_REUSEADDR:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReuseAddress()))
		return &v, nil

	case linux.SO_REUSEPORT:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReusePort()))
		return &v, nil

	case linux.SO_BINDTODEVICE:
		v := ep.SocketOptions().GetBindToDevice()
		// A zero value yields an empty result, before outLen is checked.
		if v == 0 {
			var b primitive.ByteSlice
			return &b, nil
		}
		if outLen < linux.IFNAMSIZ {
			return nil, syserr.ErrInvalidArgument
		}
		// Note: s and name below shadow the function parameters for the
		// remainder of this case.
		s := t.NetworkContext()
		if s == nil {
			return nil, syserr.ErrNoDevice
		}
		nic, ok := s.Interfaces()[int32(v)]
		if !ok {
			// The NICID no longer indicates a valid interface, probably because that
			// interface was removed.
			return nil, syserr.ErrUnknownDevice
		}

		// The interface name is returned with a trailing NUL byte.
		name := primitive.ByteSlice(append([]byte(nic.Name), 0))
		return &name, nil

	case linux.SO_BROADCAST:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetBroadcast()))
		return &v, nil

	case linux.SO_KEEPALIVE:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetKeepAlive()))
		return &v, nil

	case linux.SO_LINGER:
		if outLen < linux.SizeOfLinger {
			return nil, syserr.ErrInvalidArgument
		}

		var linger linux.Linger
		v := ep.SocketOptions().GetLinger()

		if v.Enabled {
			linger.OnOff = 1
		}
		// The timeout is reported in seconds; the int32 conversion drops
		// any fractional part.
		linger.Linger = int32(v.Timeout.Seconds())
		return &linger, nil

	case linux.SO_SNDTIMEO:
		// TODO(igudger): Linux allows shorter lengths for partial results.
		if outLen < linux.SizeOfTimeval {
			return nil, syserr.ErrInvalidArgument
		}

		sendTimeout := linux.NsecToTimeval(s.SendTimeout())
		return &sendTimeout, nil

	case linux.SO_RCVTIMEO:
		// TODO(igudger): Linux allows shorter lengths for partial results.
		if outLen < linux.SizeOfTimeval {
			return nil, syserr.ErrInvalidArgument
		}

		recvTimeout := linux.NsecToTimeval(s.RecvTimeout())
		return &recvTimeout, nil

	case linux.SO_OOBINLINE:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetOutOfBandInline()))
		return &v, nil

	case linux.SO_NO_CHECK:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetNoChecksum()))
		return &v, nil

	case linux.SO_ACCEPTCONN:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		// This option is only viable for TCP endpoints; any other socket
		// reports 0.
		var v bool
		if socket.IsTCP(s) {
			v = tcp.EndpointState(ep.State()) == tcp.StateListen
		}
		vP := primitive.Int32(boolToInt32(v))
		return &vP, nil

	case linux.SO_RCVLOWAT:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		v := primitive.Int32(ep.SocketOptions().GetRcvlowat())
		return &v, nil
	}
	// Any option name not handled above.
	return nil, syserr.ErrProtocolNotAvailable
}
  1079  
  1080  // getSockOptTCP implements GetSockOpt when level is SOL_TCP.
  1081  func getSockOptTCP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name, outLen int) (marshal.Marshallable, *syserr.Error) {
  1082  	if !socket.IsTCP(s) {
  1083  		return nil, syserr.ErrUnknownProtocolOption
  1084  	}
  1085  
  1086  	switch name {
  1087  	case linux.TCP_NODELAY:
  1088  		if outLen < sizeOfInt32 {
  1089  			return nil, syserr.ErrInvalidArgument
  1090  		}
  1091  
  1092  		v := primitive.Int32(boolToInt32(!ep.SocketOptions().GetDelayOption()))
  1093  		return &v, nil
  1094  
  1095  	case linux.TCP_CORK:
  1096  		if outLen < sizeOfInt32 {
  1097  			return nil, syserr.ErrInvalidArgument
  1098  		}
  1099  
  1100  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetCorkOption()))
  1101  		return &v, nil
  1102  
  1103  	case linux.TCP_QUICKACK:
  1104  		if outLen < sizeOfInt32 {
  1105  			return nil, syserr.ErrInvalidArgument
  1106  		}
  1107  
  1108  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetQuickAck()))
  1109  		return &v, nil
  1110  
  1111  	case linux.TCP_MAXSEG:
  1112  		if outLen < sizeOfInt32 {
  1113  			return nil, syserr.ErrInvalidArgument
  1114  		}
  1115  
  1116  		v, err := ep.GetSockOptInt(tcpip.MaxSegOption)
  1117  		if err != nil {
  1118  			return nil, syserr.TranslateNetstackError(err)
  1119  		}
  1120  		vP := primitive.Int32(v)
  1121  		return &vP, nil
  1122  
  1123  	case linux.TCP_KEEPIDLE:
  1124  		if outLen < sizeOfInt32 {
  1125  			return nil, syserr.ErrInvalidArgument
  1126  		}
  1127  
  1128  		var v tcpip.KeepaliveIdleOption
  1129  		if err := ep.GetSockOpt(&v); err != nil {
  1130  			return nil, syserr.TranslateNetstackError(err)
  1131  		}
  1132  		keepAliveIdle := primitive.Int32(time.Duration(v) / time.Second)
  1133  		return &keepAliveIdle, nil
  1134  
  1135  	case linux.TCP_KEEPINTVL:
  1136  		if outLen < sizeOfInt32 {
  1137  			return nil, syserr.ErrInvalidArgument
  1138  		}
  1139  
  1140  		var v tcpip.KeepaliveIntervalOption
  1141  		if err := ep.GetSockOpt(&v); err != nil {
  1142  			return nil, syserr.TranslateNetstackError(err)
  1143  		}
  1144  		keepAliveInterval := primitive.Int32(time.Duration(v) / time.Second)
  1145  		return &keepAliveInterval, nil
  1146  
  1147  	case linux.TCP_KEEPCNT:
  1148  		if outLen < sizeOfInt32 {
  1149  			return nil, syserr.ErrInvalidArgument
  1150  		}
  1151  
  1152  		v, err := ep.GetSockOptInt(tcpip.KeepaliveCountOption)
  1153  		if err != nil {
  1154  			return nil, syserr.TranslateNetstackError(err)
  1155  		}
  1156  		vP := primitive.Int32(v)
  1157  		return &vP, nil
  1158  
  1159  	case linux.TCP_USER_TIMEOUT:
  1160  		if outLen < sizeOfInt32 {
  1161  			return nil, syserr.ErrInvalidArgument
  1162  		}
  1163  
  1164  		var v tcpip.TCPUserTimeoutOption
  1165  		if err := ep.GetSockOpt(&v); err != nil {
  1166  			return nil, syserr.TranslateNetstackError(err)
  1167  		}
  1168  		tcpUserTimeout := primitive.Int32(time.Duration(v) / time.Millisecond)
  1169  		return &tcpUserTimeout, nil
  1170  
  1171  	case linux.TCP_INFO:
  1172  		var v tcpip.TCPInfoOption
  1173  		if err := ep.GetSockOpt(&v); err != nil {
  1174  			return nil, syserr.TranslateNetstackError(err)
  1175  		}
  1176  
  1177  		// TODO(b/64800844): Translate fields once they are added to
  1178  		// tcpip.TCPInfoOption.
  1179  		info := linux.TCPInfo{
  1180  			State:       uint8(v.State),
  1181  			RTO:         uint32(v.RTO / time.Microsecond),
  1182  			RTT:         uint32(v.RTT / time.Microsecond),
  1183  			RTTVar:      uint32(v.RTTVar / time.Microsecond),
  1184  			SndSsthresh: v.SndSsthresh,
  1185  			SndCwnd:     v.SndCwnd,
  1186  		}
  1187  		switch v.CcState {
  1188  		case tcpip.RTORecovery:
  1189  			info.CaState = linux.TCP_CA_Loss
  1190  		case tcpip.FastRecovery, tcpip.SACKRecovery:
  1191  			info.CaState = linux.TCP_CA_Recovery
  1192  		case tcpip.Disorder:
  1193  			info.CaState = linux.TCP_CA_Disorder
  1194  		case tcpip.Open:
  1195  			info.CaState = linux.TCP_CA_Open
  1196  		}
  1197  
  1198  		// In netstack reorderSeen is updated only when RACK is enabled.
  1199  		// We only track whether the reordering is seen, which is
  1200  		// different than Linux where reorderSeen is not specific to
  1201  		// RACK and is incremented when a reordering event is seen.
  1202  		if v.ReorderSeen {
  1203  			info.ReordSeen = 1
  1204  		}
  1205  
  1206  		// Linux truncates the output binary to outLen.
  1207  		buf := t.CopyScratchBuffer(info.SizeBytes())
  1208  		info.MarshalUnsafe(buf)
  1209  		if len(buf) > outLen {
  1210  			buf = buf[:outLen]
  1211  		}
  1212  		bufP := primitive.ByteSlice(buf)
  1213  		return &bufP, nil
  1214  
  1215  	case linux.TCP_CC_INFO,
  1216  		linux.TCP_NOTSENT_LOWAT,
  1217  		linux.TCP_ZEROCOPY_RECEIVE:
  1218  
  1219  		// Not supported.
  1220  
  1221  	case linux.TCP_CONGESTION:
  1222  		if outLen <= 0 {
  1223  			return nil, syserr.ErrInvalidArgument
  1224  		}
  1225  
  1226  		var v tcpip.CongestionControlOption
  1227  		if err := ep.GetSockOpt(&v); err != nil {
  1228  			return nil, syserr.TranslateNetstackError(err)
  1229  		}
  1230  
  1231  		// We match linux behaviour here where it returns the lower of
  1232  		// TCP_CA_NAME_MAX bytes or the value of the option length.
  1233  		//
  1234  		// This is Linux's net/tcp.h TCP_CA_NAME_MAX.
  1235  		const tcpCANameMax = 16
  1236  
  1237  		toCopy := tcpCANameMax
  1238  		if outLen < tcpCANameMax {
  1239  			toCopy = outLen
  1240  		}
  1241  		b := make([]byte, toCopy)
  1242  		copy(b, v)
  1243  
  1244  		bP := primitive.ByteSlice(b)
  1245  		return &bP, nil
  1246  
  1247  	case linux.TCP_LINGER2:
  1248  		if outLen < sizeOfInt32 {
  1249  			return nil, syserr.ErrInvalidArgument
  1250  		}
  1251  
  1252  		var v tcpip.TCPLingerTimeoutOption
  1253  		if err := ep.GetSockOpt(&v); err != nil {
  1254  			return nil, syserr.TranslateNetstackError(err)
  1255  		}
  1256  		var lingerTimeout primitive.Int32
  1257  		if v >= 0 {
  1258  			lingerTimeout = primitive.Int32(time.Duration(v) / time.Second)
  1259  		} else {
  1260  			lingerTimeout = -1
  1261  		}
  1262  		return &lingerTimeout, nil
  1263  
  1264  	case linux.TCP_DEFER_ACCEPT:
  1265  		if outLen < sizeOfInt32 {
  1266  			return nil, syserr.ErrInvalidArgument
  1267  		}
  1268  
  1269  		var v tcpip.TCPDeferAcceptOption
  1270  		if err := ep.GetSockOpt(&v); err != nil {
  1271  			return nil, syserr.TranslateNetstackError(err)
  1272  		}
  1273  
  1274  		tcpDeferAccept := primitive.Int32(time.Duration(v) / time.Second)
  1275  		return &tcpDeferAccept, nil
  1276  
  1277  	case linux.TCP_SYNCNT:
  1278  		if outLen < sizeOfInt32 {
  1279  			return nil, syserr.ErrInvalidArgument
  1280  		}
  1281  
  1282  		v, err := ep.GetSockOptInt(tcpip.TCPSynCountOption)
  1283  		if err != nil {
  1284  			return nil, syserr.TranslateNetstackError(err)
  1285  		}
  1286  		vP := primitive.Int32(v)
  1287  		return &vP, nil
  1288  
  1289  	case linux.TCP_WINDOW_CLAMP:
  1290  		if outLen < sizeOfInt32 {
  1291  			return nil, syserr.ErrInvalidArgument
  1292  		}
  1293  
  1294  		v, err := ep.GetSockOptInt(tcpip.TCPWindowClampOption)
  1295  		if err != nil {
  1296  			return nil, syserr.TranslateNetstackError(err)
  1297  		}
  1298  		vP := primitive.Int32(v)
  1299  		return &vP, nil
  1300  	}
  1301  	return nil, syserr.ErrProtocolNotAvailable
  1302  }
  1303  
  1304  func getSockOptICMPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, outLen int) (marshal.Marshallable, *syserr.Error) {
  1305  	if _, ok := ep.(tcpip.Endpoint); !ok {
  1306  		log.Warningf("SOL_ICMPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
  1307  		return nil, syserr.ErrUnknownProtocolOption
  1308  	}
  1309  
  1310  	if family, _, _ := s.Type(); family != linux.AF_INET6 {
  1311  		return nil, syserr.ErrNotSupported
  1312  	}
  1313  
  1314  	switch name {
  1315  	case linux.ICMPV6_FILTER:
  1316  		var v tcpip.ICMPv6Filter
  1317  		if err := ep.GetSockOpt(&v); err != nil {
  1318  			return nil, syserr.TranslateNetstackError(err)
  1319  		}
  1320  
  1321  		filter := linux.ICMP6Filter{Filter: v.DenyType}
  1322  
  1323  		// Linux truncates the output to outLen.
  1324  		buf := t.CopyScratchBuffer(filter.SizeBytes())
  1325  		filter.MarshalUnsafe(buf)
  1326  		if len(buf) > outLen {
  1327  			buf = buf[:outLen]
  1328  		}
  1329  		bufP := primitive.ByteSlice(buf)
  1330  		return &bufP, nil
  1331  	}
  1332  	return nil, syserr.ErrProtocolNotAvailable
  1333  }
  1334  
  1335  func defaultTTL(t *kernel.Task, network tcpip.NetworkProtocolNumber) (primitive.Int32, tcpip.Error) {
  1336  	var opt tcpip.DefaultTTLOption
  1337  	stack := inet.StackFromContext(t)
  1338  	if err := stack.(*Stack).Stack.NetworkProtocolOption(network, &opt); err != nil {
  1339  		return 0, err
  1340  	}
  1341  	return primitive.Int32(opt), nil
  1342  }
  1343  
  1344  // getSockOptIPv6 implements GetSockOpt when level is SOL_IPV6.
  1345  func getSockOptIPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
  1346  	if _, ok := ep.(tcpip.Endpoint); !ok {
  1347  		log.Warningf("SOL_IPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
  1348  		return nil, syserr.ErrUnknownProtocolOption
  1349  	}
  1350  
  1351  	family, skType, _ := s.Type()
  1352  	if family != linux.AF_INET6 {
  1353  		return nil, syserr.ErrNotSupported
  1354  	}
  1355  
  1356  	switch name {
  1357  	case linux.IPV6_CHECKSUM:
  1358  		if outLen < sizeOfInt32 {
  1359  			return nil, syserr.ErrInvalidArgument
  1360  		}
  1361  
  1362  		v, err := ep.GetSockOptInt(tcpip.IPv6Checksum)
  1363  		if err != nil {
  1364  			return nil, syserr.TranslateNetstackError(err)
  1365  		}
  1366  
  1367  		vP := primitive.Int32(v)
  1368  		return &vP, nil
  1369  
  1370  	case linux.IPV6_V6ONLY:
  1371  		if outLen < sizeOfInt32 {
  1372  			return nil, syserr.ErrInvalidArgument
  1373  		}
  1374  
  1375  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetV6Only()))
  1376  		return &v, nil
  1377  
  1378  	case linux.IPV6_UNICAST_HOPS:
  1379  		if outLen < sizeOfInt32 {
  1380  			return nil, syserr.ErrInvalidArgument
  1381  		}
  1382  
  1383  		v, err := ep.GetSockOptInt(tcpip.IPv6HopLimitOption)
  1384  		if err != nil {
  1385  			return nil, syserr.TranslateNetstackError(err)
  1386  		}
  1387  
  1388  		// Fill in the default value, if needed.
  1389  		vP := primitive.Int32(v)
  1390  		if vP == -1 {
  1391  			vP, err = defaultTTL(t, header.IPv6ProtocolNumber)
  1392  			if err != nil {
  1393  				return nil, syserr.TranslateNetstackError(err)
  1394  			}
  1395  		}
  1396  
  1397  		return &vP, nil
  1398  
  1399  	case linux.IPV6_RECVHOPLIMIT:
  1400  		if outLen < sizeOfInt32 {
  1401  			return nil, syserr.ErrInvalidArgument
  1402  		}
  1403  
  1404  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveHopLimit()))
  1405  		return &v, nil
  1406  
  1407  	case linux.IPV6_PATHMTU:
  1408  		// Not supported.
  1409  
  1410  	case linux.IPV6_TCLASS:
  1411  		// Length handling for parity with Linux.
  1412  		if outLen == 0 {
  1413  			var b primitive.ByteSlice
  1414  			return &b, nil
  1415  		}
  1416  		v, err := ep.GetSockOptInt(tcpip.IPv6TrafficClassOption)
  1417  		if err != nil {
  1418  			return nil, syserr.TranslateNetstackError(err)
  1419  		}
  1420  
  1421  		uintv := primitive.Uint32(v)
  1422  		// Linux truncates the output binary to outLen.
  1423  		ib := t.CopyScratchBuffer(uintv.SizeBytes())
  1424  		uintv.MarshalUnsafe(ib)
  1425  		// Handle cases where outLen is lesser than sizeOfInt32.
  1426  		if len(ib) > outLen {
  1427  			ib = ib[:outLen]
  1428  		}
  1429  		ibP := primitive.ByteSlice(ib)
  1430  		return &ibP, nil
  1431  
  1432  	case linux.IPV6_RECVTCLASS:
  1433  		if outLen < sizeOfInt32 {
  1434  			return nil, syserr.ErrInvalidArgument
  1435  		}
  1436  
  1437  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTClass()))
  1438  		return &v, nil
  1439  	case linux.IPV6_RECVERR:
  1440  		if outLen < sizeOfInt32 {
  1441  			return nil, syserr.ErrInvalidArgument
  1442  		}
  1443  
  1444  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetIPv6RecvError()))
  1445  		return &v, nil
  1446  
  1447  	case linux.IPV6_RECVORIGDSTADDR:
  1448  		if outLen < sizeOfInt32 {
  1449  			return nil, syserr.ErrInvalidArgument
  1450  		}
  1451  
  1452  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveOriginalDstAddress()))
  1453  		return &v, nil
  1454  
  1455  	case linux.IPV6_RECVPKTINFO:
  1456  		if outLen < sizeOfInt32 {
  1457  			return nil, syserr.ErrInvalidArgument
  1458  		}
  1459  
  1460  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetIPv6ReceivePacketInfo()))
  1461  		return &v, nil
  1462  
  1463  	case linux.IP6T_ORIGINAL_DST:
  1464  		if outLen < sockAddrInet6Size {
  1465  			return nil, syserr.ErrInvalidArgument
  1466  		}
  1467  
  1468  		var v tcpip.OriginalDestinationOption
  1469  		if err := ep.GetSockOpt(&v); err != nil {
  1470  			return nil, syserr.TranslateNetstackError(err)
  1471  		}
  1472  
  1473  		a, _ := socket.ConvertAddress(linux.AF_INET6, tcpip.FullAddress(v))
  1474  		return a.(*linux.SockAddrInet6), nil
  1475  
  1476  	case linux.IP6T_SO_GET_INFO:
  1477  		if outLen < linux.SizeOfIPTGetinfo {
  1478  			return nil, syserr.ErrInvalidArgument
  1479  		}
  1480  
  1481  		// Only valid for raw IPv6 sockets.
  1482  		if skType != linux.SOCK_RAW {
  1483  			return nil, syserr.ErrProtocolNotAvailable
  1484  		}
  1485  
  1486  		stk := inet.StackFromContext(t)
  1487  		if stk == nil {
  1488  			return nil, syserr.ErrNoDevice
  1489  		}
  1490  		info, err := netfilter.GetInfo(t, stk.(*Stack).Stack, outPtr, true)
  1491  		if err != nil {
  1492  			return nil, err
  1493  		}
  1494  		return &info, nil
  1495  
  1496  	case linux.IP6T_SO_GET_ENTRIES:
  1497  		// IPTGetEntries is reused for IPv6.
  1498  		if outLen < linux.SizeOfIPTGetEntries {
  1499  			return nil, syserr.ErrInvalidArgument
  1500  		}
  1501  		// Only valid for raw IPv6 sockets.
  1502  		if skType != linux.SOCK_RAW {
  1503  			return nil, syserr.ErrProtocolNotAvailable
  1504  		}
  1505  
  1506  		stk := inet.StackFromContext(t)
  1507  		if stk == nil {
  1508  			return nil, syserr.ErrNoDevice
  1509  		}
  1510  		entries, err := netfilter.GetEntries6(t, stk.(*Stack).Stack, outPtr, outLen)
  1511  		if err != nil {
  1512  			return nil, err
  1513  		}
  1514  		return &entries, nil
  1515  
  1516  	case linux.IP6T_SO_GET_REVISION_TARGET:
  1517  		if outLen < linux.SizeOfXTGetRevision {
  1518  			return nil, syserr.ErrInvalidArgument
  1519  		}
  1520  
  1521  		// Only valid for raw IPv6 sockets.
  1522  		if skType != linux.SOCK_RAW {
  1523  			return nil, syserr.ErrProtocolNotAvailable
  1524  		}
  1525  
  1526  		stk := inet.StackFromContext(t)
  1527  		if stk == nil {
  1528  			return nil, syserr.ErrNoDevice
  1529  		}
  1530  		ret, err := netfilter.TargetRevision(t, outPtr, header.IPv6ProtocolNumber)
  1531  		if err != nil {
  1532  			return nil, err
  1533  		}
  1534  		return &ret, nil
  1535  	}
  1536  	return nil, syserr.ErrProtocolNotAvailable
  1537  }
  1538  
  1539  // getSockOptIP implements GetSockOpt when level is SOL_IP.
  1540  func getSockOptIP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, outPtr hostarch.Addr, outLen int, _ int) (marshal.Marshallable, *syserr.Error) {
  1541  	if _, ok := ep.(tcpip.Endpoint); !ok {
  1542  		log.Warningf("SOL_IP options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
  1543  		return nil, syserr.ErrUnknownProtocolOption
  1544  	}
  1545  
  1546  	switch name {
  1547  	case linux.IP_TTL:
  1548  		if outLen < sizeOfInt32 {
  1549  			return nil, syserr.ErrInvalidArgument
  1550  		}
  1551  
  1552  		v, err := ep.GetSockOptInt(tcpip.IPv4TTLOption)
  1553  		if err != nil {
  1554  			return nil, syserr.TranslateNetstackError(err)
  1555  		}
  1556  
  1557  		// Fill in the default value, if needed.
  1558  		vP := primitive.Int32(v)
  1559  		if vP == 0 {
  1560  			vP, err = defaultTTL(t, header.IPv4ProtocolNumber)
  1561  			if err != nil {
  1562  				return nil, syserr.TranslateNetstackError(err)
  1563  			}
  1564  		}
  1565  
  1566  		return &vP, nil
  1567  
  1568  	case linux.IP_RECVTTL:
  1569  		if outLen < sizeOfInt32 {
  1570  			return nil, syserr.ErrInvalidArgument
  1571  		}
  1572  
  1573  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTTL()))
  1574  		return &v, nil
  1575  
  1576  	case linux.IP_MULTICAST_TTL:
  1577  		if outLen < sizeOfInt32 {
  1578  			return nil, syserr.ErrInvalidArgument
  1579  		}
  1580  
  1581  		v, err := ep.GetSockOptInt(tcpip.MulticastTTLOption)
  1582  		if err != nil {
  1583  			return nil, syserr.TranslateNetstackError(err)
  1584  		}
  1585  
  1586  		vP := primitive.Int32(v)
  1587  		return &vP, nil
  1588  
  1589  	case linux.IP_MULTICAST_IF:
  1590  		if outLen < len(linux.InetAddr{}) {
  1591  			return nil, syserr.ErrInvalidArgument
  1592  		}
  1593  
  1594  		var v tcpip.MulticastInterfaceOption
  1595  		if err := ep.GetSockOpt(&v); err != nil {
  1596  			return nil, syserr.TranslateNetstackError(err)
  1597  		}
  1598  
  1599  		a, _ := socket.ConvertAddress(linux.AF_INET, tcpip.FullAddress{Addr: v.InterfaceAddr})
  1600  
  1601  		return &a.(*linux.SockAddrInet).Addr, nil
  1602  
  1603  	case linux.IP_MULTICAST_LOOP:
  1604  		if outLen < sizeOfInt32 {
  1605  			return nil, syserr.ErrInvalidArgument
  1606  		}
  1607  
  1608  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetMulticastLoop()))
  1609  		return &v, nil
  1610  
  1611  	case linux.IP_TOS:
  1612  		// Length handling for parity with Linux.
  1613  		if outLen == 0 {
  1614  			var b primitive.ByteSlice
  1615  			return &b, nil
  1616  		}
  1617  		v, err := ep.GetSockOptInt(tcpip.IPv4TOSOption)
  1618  		if err != nil {
  1619  			return nil, syserr.TranslateNetstackError(err)
  1620  		}
  1621  		if outLen < sizeOfInt32 {
  1622  			vP := primitive.Uint8(v)
  1623  			return &vP, nil
  1624  		}
  1625  		vP := primitive.Int32(v)
  1626  		return &vP, nil
  1627  
  1628  	case linux.IP_RECVTOS:
  1629  		if outLen < sizeOfInt32 {
  1630  			return nil, syserr.ErrInvalidArgument
  1631  		}
  1632  
  1633  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTOS()))
  1634  		return &v, nil
  1635  
  1636  	case linux.IP_RECVERR:
  1637  		if outLen < sizeOfInt32 {
  1638  			return nil, syserr.ErrInvalidArgument
  1639  		}
  1640  
  1641  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetIPv4RecvError()))
  1642  		return &v, nil
  1643  
  1644  	case linux.IP_PKTINFO:
  1645  		if outLen < sizeOfInt32 {
  1646  			return nil, syserr.ErrInvalidArgument
  1647  		}
  1648  
  1649  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceivePacketInfo()))
  1650  		return &v, nil
  1651  
  1652  	case linux.IP_HDRINCL:
  1653  		if outLen < sizeOfInt32 {
  1654  			return nil, syserr.ErrInvalidArgument
  1655  		}
  1656  
  1657  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetHeaderIncluded()))
  1658  		return &v, nil
  1659  
  1660  	case linux.IP_RECVORIGDSTADDR:
  1661  		if outLen < sizeOfInt32 {
  1662  			return nil, syserr.ErrInvalidArgument
  1663  		}
  1664  
  1665  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveOriginalDstAddress()))
  1666  		return &v, nil
  1667  
  1668  	case linux.SO_ORIGINAL_DST:
  1669  		if outLen < sockAddrInetSize {
  1670  			return nil, syserr.ErrInvalidArgument
  1671  		}
  1672  
  1673  		var v tcpip.OriginalDestinationOption
  1674  		if err := ep.GetSockOpt(&v); err != nil {
  1675  			return nil, syserr.TranslateNetstackError(err)
  1676  		}
  1677  
  1678  		a, _ := socket.ConvertAddress(linux.AF_INET, tcpip.FullAddress(v))
  1679  		return a.(*linux.SockAddrInet), nil
  1680  
  1681  	case linux.IPT_SO_GET_INFO:
  1682  		if outLen < linux.SizeOfIPTGetinfo {
  1683  			return nil, syserr.ErrInvalidArgument
  1684  		}
  1685  
  1686  		// Only valid for raw IPv4 sockets.
  1687  		if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
  1688  			return nil, syserr.ErrProtocolNotAvailable
  1689  		}
  1690  
  1691  		stk := inet.StackFromContext(t)
  1692  		if stk == nil {
  1693  			return nil, syserr.ErrNoDevice
  1694  		}
  1695  		info, err := netfilter.GetInfo(t, stk.(*Stack).Stack, outPtr, false)
  1696  		if err != nil {
  1697  			return nil, err
  1698  		}
  1699  		return &info, nil
  1700  
  1701  	case linux.IPT_SO_GET_ENTRIES:
  1702  		if outLen < linux.SizeOfIPTGetEntries {
  1703  			return nil, syserr.ErrInvalidArgument
  1704  		}
  1705  
  1706  		// Only valid for raw IPv4 sockets.
  1707  		if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
  1708  			return nil, syserr.ErrProtocolNotAvailable
  1709  		}
  1710  
  1711  		stk := inet.StackFromContext(t)
  1712  		if stk == nil {
  1713  			return nil, syserr.ErrNoDevice
  1714  		}
  1715  		entries, err := netfilter.GetEntries4(t, stk.(*Stack).Stack, outPtr, outLen)
  1716  		if err != nil {
  1717  			return nil, err
  1718  		}
  1719  		return &entries, nil
  1720  
  1721  	case linux.IPT_SO_GET_REVISION_TARGET:
  1722  		if outLen < linux.SizeOfXTGetRevision {
  1723  			return nil, syserr.ErrInvalidArgument
  1724  		}
  1725  
  1726  		// Only valid for raw IPv4 sockets.
  1727  		if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
  1728  			return nil, syserr.ErrProtocolNotAvailable
  1729  		}
  1730  
  1731  		stk := inet.StackFromContext(t)
  1732  		if stk == nil {
  1733  			return nil, syserr.ErrNoDevice
  1734  		}
  1735  		ret, err := netfilter.TargetRevision(t, outPtr, header.IPv4ProtocolNumber)
  1736  		if err != nil {
  1737  			return nil, err
  1738  		}
  1739  		return &ret, nil
  1740  	}
  1741  	return nil, syserr.ErrProtocolNotAvailable
  1742  }
  1743  
  1744  // SetSockOpt can be used to implement the linux syscall setsockopt(2) for
  1745  // sockets backed by a commonEndpoint.
  1746  func SetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, level int, name int, optVal []byte) *syserr.Error {
  1747  	switch level {
  1748  	case linux.SOL_SOCKET:
  1749  		return setSockOptSocket(t, s, ep, name, optVal)
  1750  
  1751  	case linux.SOL_TCP:
  1752  		return setSockOptTCP(t, s, ep, name, optVal)
  1753  
  1754  	case linux.SOL_ICMPV6:
  1755  		return setSockOptICMPv6(t, s, ep, name, optVal)
  1756  
  1757  	case linux.SOL_IPV6:
  1758  		return setSockOptIPv6(t, s, ep, name, optVal)
  1759  
  1760  	case linux.SOL_IP:
  1761  		return setSockOptIP(t, s, ep, name, optVal)
  1762  
  1763  	case linux.SOL_PACKET:
  1764  		// gVisor doesn't support any SOL_PACKET options just return not
  1765  		// supported. Returning nil here will result in tcpdump thinking AF_PACKET
  1766  		// features are supported and proceed to use them and break.
  1767  		return syserr.ErrProtocolNotAvailable
  1768  
  1769  	case linux.SOL_UDP,
  1770  		linux.SOL_RAW:
  1771  		// Not supported.
  1772  	}
  1773  
  1774  	return nil
  1775  }
  1776  
  1777  func clampBufSize(newSz, min, max int64, ignoreMax bool) int64 {
  1778  	// packetOverheadFactor is used to multiply the value provided by the user on
  1779  	// a setsockopt(2) for setting the send/receive buffer sizes sockets.
  1780  	const packetOverheadFactor = 2
  1781  
  1782  	if !ignoreMax && newSz > max {
  1783  		newSz = max
  1784  	}
  1785  
  1786  	if newSz < math.MaxInt32/packetOverheadFactor {
  1787  		newSz *= packetOverheadFactor
  1788  		if newSz < min {
  1789  			newSz = min
  1790  		}
  1791  	} else {
  1792  		newSz = math.MaxInt32
  1793  	}
  1794  	return newSz
  1795  }
  1796  
  1797  // setSockOptSocket implements SetSockOpt when level is SOL_SOCKET.
  1798  func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
  1799  	switch name {
  1800  	case linux.SO_SNDBUF:
  1801  		if len(optVal) < sizeOfInt32 {
  1802  			return syserr.ErrInvalidArgument
  1803  		}
  1804  
  1805  		v := hostarch.ByteOrder.Uint32(optVal)
  1806  		min, max := ep.SocketOptions().SendBufferLimits()
  1807  		clamped := clampBufSize(int64(v), min, max, false /* ignoreMax */)
  1808  		ep.SocketOptions().SetSendBufferSize(clamped, true /* notify */)
  1809  		return nil
  1810  
  1811  	case linux.SO_RCVBUF:
  1812  		if len(optVal) < sizeOfInt32 {
  1813  			return syserr.ErrInvalidArgument
  1814  		}
  1815  
  1816  		v := hostarch.ByteOrder.Uint32(optVal)
  1817  		min, max := ep.SocketOptions().ReceiveBufferLimits()
  1818  		clamped := clampBufSize(int64(v), min, max, false /* ignoreMax */)
  1819  		ep.SocketOptions().SetReceiveBufferSize(clamped, true /* notify */)
  1820  		return nil
  1821  
  1822  	case linux.SO_RCVBUFFORCE:
  1823  		if len(optVal) < sizeOfInt32 {
  1824  			return syserr.ErrInvalidArgument
  1825  		}
  1826  
  1827  		if creds := auth.CredentialsFromContext(t); !creds.HasCapability(linux.CAP_NET_ADMIN) {
  1828  			return syserr.ErrNotPermitted
  1829  		}
  1830  
  1831  		v := hostarch.ByteOrder.Uint32(optVal)
  1832  		min, max := ep.SocketOptions().ReceiveBufferLimits()
  1833  		clamped := clampBufSize(int64(v), min, max, true /* ignoreMax */)
  1834  		ep.SocketOptions().SetReceiveBufferSize(clamped, true /* notify */)
  1835  		return nil
  1836  
  1837  	case linux.SO_REUSEADDR:
  1838  		if len(optVal) < sizeOfInt32 {
  1839  			return syserr.ErrInvalidArgument
  1840  		}
  1841  
  1842  		v := hostarch.ByteOrder.Uint32(optVal)
  1843  		ep.SocketOptions().SetReuseAddress(v != 0)
  1844  		return nil
  1845  
  1846  	case linux.SO_REUSEPORT:
  1847  		if len(optVal) < sizeOfInt32 {
  1848  			return syserr.ErrInvalidArgument
  1849  		}
  1850  
  1851  		v := hostarch.ByteOrder.Uint32(optVal)
  1852  		ep.SocketOptions().SetReusePort(v != 0)
  1853  		return nil
  1854  
  1855  	case linux.SO_BINDTODEVICE:
  1856  		n := bytes.IndexByte(optVal, 0)
  1857  		if n == -1 {
  1858  			n = len(optVal)
  1859  		}
  1860  		name := string(optVal[:n])
  1861  		if name == "" {
  1862  			return syserr.TranslateNetstackError(ep.SocketOptions().SetBindToDevice(0))
  1863  		}
  1864  		s := t.NetworkContext()
  1865  		if s == nil {
  1866  			return syserr.ErrNoDevice
  1867  		}
  1868  		for nicID, nic := range s.Interfaces() {
  1869  			if nic.Name == name {
  1870  				return syserr.TranslateNetstackError(ep.SocketOptions().SetBindToDevice(nicID))
  1871  			}
  1872  		}
  1873  		return syserr.ErrUnknownDevice
  1874  
  1875  	case linux.SO_BROADCAST:
  1876  		if len(optVal) < sizeOfInt32 {
  1877  			return syserr.ErrInvalidArgument
  1878  		}
  1879  
  1880  		v := hostarch.ByteOrder.Uint32(optVal)
  1881  		ep.SocketOptions().SetBroadcast(v != 0)
  1882  		return nil
  1883  
  1884  	case linux.SO_PASSCRED:
  1885  		if len(optVal) < sizeOfInt32 {
  1886  			return syserr.ErrInvalidArgument
  1887  		}
  1888  
  1889  		v := hostarch.ByteOrder.Uint32(optVal)
  1890  		ep.SocketOptions().SetPassCred(v != 0)
  1891  		return nil
  1892  
  1893  	case linux.SO_KEEPALIVE:
  1894  		if len(optVal) < sizeOfInt32 {
  1895  			return syserr.ErrInvalidArgument
  1896  		}
  1897  
  1898  		v := hostarch.ByteOrder.Uint32(optVal)
  1899  		ep.SocketOptions().SetKeepAlive(v != 0)
  1900  		return nil
  1901  
  1902  	case linux.SO_SNDTIMEO:
  1903  		if len(optVal) < linux.SizeOfTimeval {
  1904  			return syserr.ErrInvalidArgument
  1905  		}
  1906  
  1907  		var v linux.Timeval
  1908  		v.UnmarshalBytes(optVal)
  1909  		if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) {
  1910  			return syserr.ErrDomain
  1911  		}
  1912  		s.SetSendTimeout(v.ToNsecCapped())
  1913  		return nil
  1914  
  1915  	case linux.SO_RCVTIMEO:
  1916  		if len(optVal) < linux.SizeOfTimeval {
  1917  			return syserr.ErrInvalidArgument
  1918  		}
  1919  
  1920  		var v linux.Timeval
  1921  		v.UnmarshalBytes(optVal)
  1922  		if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) {
  1923  			return syserr.ErrDomain
  1924  		}
  1925  		s.SetRecvTimeout(v.ToNsecCapped())
  1926  		return nil
  1927  
  1928  	case linux.SO_OOBINLINE:
  1929  		if len(optVal) < sizeOfInt32 {
  1930  			return syserr.ErrInvalidArgument
  1931  		}
  1932  
  1933  		v := hostarch.ByteOrder.Uint32(optVal)
  1934  		ep.SocketOptions().SetOutOfBandInline(v != 0)
  1935  		return nil
  1936  
  1937  	case linux.SO_NO_CHECK:
  1938  		if len(optVal) < sizeOfInt32 {
  1939  			return syserr.ErrInvalidArgument
  1940  		}
  1941  
  1942  		v := hostarch.ByteOrder.Uint32(optVal)
  1943  		ep.SocketOptions().SetNoChecksum(v != 0)
  1944  		return nil
  1945  
  1946  	case linux.SO_LINGER:
  1947  		if len(optVal) < linux.SizeOfLinger {
  1948  			return syserr.ErrInvalidArgument
  1949  		}
  1950  
  1951  		var v linux.Linger
  1952  		v.UnmarshalBytes(optVal)
  1953  
  1954  		ep.SocketOptions().SetLinger(tcpip.LingerOption{
  1955  			Enabled: v.OnOff != 0,
  1956  			Timeout: time.Second * time.Duration(v.Linger),
  1957  		})
  1958  		return nil
  1959  
  1960  	case linux.SO_DETACH_FILTER:
  1961  		// optval is ignored.
  1962  		var v tcpip.SocketDetachFilterOption
  1963  		return syserr.TranslateNetstackError(ep.SetSockOpt(&v))
  1964  
  1965  	// TODO(b/226603727): Add support for SO_RCVLOWAT option. For now, only
  1966  	// the unsupported syscall message is removed.
  1967  	case linux.SO_RCVLOWAT:
  1968  		if len(optVal) < sizeOfInt32 {
  1969  			return syserr.ErrInvalidArgument
  1970  		}
  1971  
  1972  		v := hostarch.ByteOrder.Uint32(optVal)
  1973  		ep.SocketOptions().SetRcvlowat(int32(v))
  1974  		return nil
  1975  	}
  1976  
  1977  	return nil
  1978  }
  1979  
  1980  // setSockOptTCP implements SetSockOpt when level is SOL_TCP.
  1981  func setSockOptTCP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
  1982  	if !socket.IsTCP(s) {
  1983  		return syserr.ErrUnknownProtocolOption
  1984  	}
  1985  
  1986  	switch name {
  1987  	case linux.TCP_NODELAY:
  1988  		if len(optVal) < sizeOfInt32 {
  1989  			return syserr.ErrInvalidArgument
  1990  		}
  1991  
  1992  		v := hostarch.ByteOrder.Uint32(optVal)
  1993  		ep.SocketOptions().SetDelayOption(v == 0)
  1994  		return nil
  1995  
  1996  	case linux.TCP_CORK:
  1997  		if len(optVal) < sizeOfInt32 {
  1998  			return syserr.ErrInvalidArgument
  1999  		}
  2000  
  2001  		v := hostarch.ByteOrder.Uint32(optVal)
  2002  		ep.SocketOptions().SetCorkOption(v != 0)
  2003  		return nil
  2004  
  2005  	case linux.TCP_QUICKACK:
  2006  		if len(optVal) < sizeOfInt32 {
  2007  			return syserr.ErrInvalidArgument
  2008  		}
  2009  
  2010  		v := hostarch.ByteOrder.Uint32(optVal)
  2011  		ep.SocketOptions().SetQuickAck(v != 0)
  2012  		return nil
  2013  
  2014  	case linux.TCP_MAXSEG:
  2015  		if len(optVal) < sizeOfInt32 {
  2016  			return syserr.ErrInvalidArgument
  2017  		}
  2018  
  2019  		v := hostarch.ByteOrder.Uint32(optVal)
  2020  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MaxSegOption, int(v)))
  2021  
  2022  	case linux.TCP_KEEPIDLE:
  2023  		if len(optVal) < sizeOfInt32 {
  2024  			return syserr.ErrInvalidArgument
  2025  		}
  2026  
  2027  		v := hostarch.ByteOrder.Uint32(optVal)
  2028  		if v < 1 || v > linux.MAX_TCP_KEEPIDLE {
  2029  			return syserr.ErrInvalidArgument
  2030  		}
  2031  		opt := tcpip.KeepaliveIdleOption(time.Second * time.Duration(v))
  2032  		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
  2033  
  2034  	case linux.TCP_KEEPINTVL:
  2035  		if len(optVal) < sizeOfInt32 {
  2036  			return syserr.ErrInvalidArgument
  2037  		}
  2038  
  2039  		v := hostarch.ByteOrder.Uint32(optVal)
  2040  		if v < 1 || v > linux.MAX_TCP_KEEPINTVL {
  2041  			return syserr.ErrInvalidArgument
  2042  		}
  2043  		opt := tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v))
  2044  		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
  2045  
  2046  	case linux.TCP_KEEPCNT:
  2047  		if len(optVal) < sizeOfInt32 {
  2048  			return syserr.ErrInvalidArgument
  2049  		}
  2050  
  2051  		v := hostarch.ByteOrder.Uint32(optVal)
  2052  		if v < 1 || v > linux.MAX_TCP_KEEPCNT {
  2053  			return syserr.ErrInvalidArgument
  2054  		}
  2055  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.KeepaliveCountOption, int(v)))
  2056  
  2057  	case linux.TCP_USER_TIMEOUT:
  2058  		if len(optVal) < sizeOfInt32 {
  2059  			return syserr.ErrInvalidArgument
  2060  		}
  2061  
  2062  		v := int32(hostarch.ByteOrder.Uint32(optVal))
  2063  		if v < 0 {
  2064  			return syserr.ErrInvalidArgument
  2065  		}
  2066  		opt := tcpip.TCPUserTimeoutOption(time.Millisecond * time.Duration(v))
  2067  		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
  2068  
  2069  	case linux.TCP_CONGESTION:
  2070  		v := tcpip.CongestionControlOption(optVal)
  2071  		if err := ep.SetSockOpt(&v); err != nil {
  2072  			return syserr.TranslateNetstackError(err)
  2073  		}
  2074  		return nil
  2075  
  2076  	case linux.TCP_LINGER2:
  2077  		if len(optVal) < sizeOfInt32 {
  2078  			return syserr.ErrInvalidArgument
  2079  		}
  2080  
  2081  		v := int32(hostarch.ByteOrder.Uint32(optVal))
  2082  		opt := tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v))
  2083  		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
  2084  
  2085  	case linux.TCP_DEFER_ACCEPT:
  2086  		if len(optVal) < sizeOfInt32 {
  2087  			return syserr.ErrInvalidArgument
  2088  		}
  2089  		v := int32(hostarch.ByteOrder.Uint32(optVal))
  2090  		if v < 0 {
  2091  			v = 0
  2092  		}
  2093  		opt := tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v))
  2094  		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
  2095  
  2096  	case linux.TCP_SYNCNT:
  2097  		if len(optVal) < sizeOfInt32 {
  2098  			return syserr.ErrInvalidArgument
  2099  		}
  2100  		v := hostarch.ByteOrder.Uint32(optVal)
  2101  
  2102  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPSynCountOption, int(v)))
  2103  
  2104  	case linux.TCP_WINDOW_CLAMP:
  2105  		if len(optVal) < sizeOfInt32 {
  2106  			return syserr.ErrInvalidArgument
  2107  		}
  2108  		v := hostarch.ByteOrder.Uint32(optVal)
  2109  
  2110  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPWindowClampOption, int(v)))
  2111  
  2112  	case linux.TCP_REPAIR_OPTIONS:
  2113  		// Not supported.
  2114  	}
  2115  
  2116  	return nil
  2117  }
  2118  
  2119  func setSockOptICMPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
  2120  	if _, ok := ep.(tcpip.Endpoint); !ok {
  2121  		log.Warningf("SOL_ICMPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
  2122  		return syserr.ErrUnknownProtocolOption
  2123  	}
  2124  
  2125  	if family, _, _ := s.Type(); family != linux.AF_INET6 {
  2126  		return syserr.ErrUnknownProtocolOption
  2127  	}
  2128  
  2129  	switch name {
  2130  	case linux.ICMPV6_FILTER:
  2131  		var req linux.ICMP6Filter
  2132  		if len(optVal) < req.SizeBytes() {
  2133  			return syserr.ErrInvalidArgument
  2134  		}
  2135  
  2136  		req.UnmarshalUnsafe(optVal)
  2137  		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.ICMPv6Filter{DenyType: req.Filter}))
  2138  	}
  2139  
  2140  	return nil
  2141  }
  2142  
// setSockOptIPv6 implements SetSockOpt when level is SOL_IPV6.
//
// The endpoint must be a tcpip.Endpoint and the socket family must be
// AF_INET6; otherwise ErrUnknownProtocolOption is returned. Option names that
// are listed below as unsupported, or that are not listed at all, are accepted
// without effect and nil is returned.
func setSockOptIPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
	if _, ok := ep.(tcpip.Endpoint); !ok {
		log.Warningf("SOL_IPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
		return syserr.ErrUnknownProtocolOption
	}

	// SOL_IPV6 options are only accepted on AF_INET6 sockets.
	family, _, _ := s.Type()
	if family != linux.AF_INET6 {
		return syserr.ErrUnknownProtocolOption
	}

	switch name {
	case linux.IPV6_CHECKSUM:
		if len(optVal) < sizeOfInt32 {
			return syserr.ErrInvalidArgument
		}

		// int may not be 32-bits so we cast the uint32 to an int32 before casting
		// to an int.
		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6Checksum, int(int32(hostarch.ByteOrder.Uint32(optVal)))))

	case linux.IPV6_V6ONLY:
		if len(optVal) < sizeOfInt32 {
			return syserr.ErrInvalidArgument
		}

		// The option may only be changed while a TCP or UDP endpoint is still
		// in its initial state.
		if socket.IsTCP(s) && tcp.EndpointState(ep.State()) != tcp.StateInitial {
			return syserr.ErrInvalidEndpointState
		} else if socket.IsUDP(s) && transport.DatagramEndpointState(ep.State()) != transport.DatagramEndpointStateInitial {
			return syserr.ErrInvalidEndpointState
		}

		// Any non-zero value enables the option.
		v := hostarch.ByteOrder.Uint32(optVal)
		ep.SocketOptions().SetV6Only(v != 0)
		return nil

	case linux.IPV6_ADD_MEMBERSHIP:
		req, err := copyInMulticastV6Request(optVal)
		if err != nil {
			return err
		}

		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.AddMembershipOption{
			NIC:           tcpip.NICID(req.InterfaceIndex),
			MulticastAddr: tcpip.AddrFrom16(req.MulticastAddr),
		}))

	case linux.IPV6_DROP_MEMBERSHIP:
		req, err := copyInMulticastV6Request(optVal)
		if err != nil {
			return err
		}

		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.RemoveMembershipOption{
			NIC:           tcpip.NICID(req.InterfaceIndex),
			MulticastAddr: tcpip.AddrFrom16(req.MulticastAddr),
		}))

	case linux.IPV6_IPSEC_POLICY,
		linux.IPV6_JOIN_ANYCAST,
		linux.IPV6_LEAVE_ANYCAST,
		// TODO(b/148887420): Add support for IPV6_PKTINFO.
		linux.IPV6_PKTINFO,
		linux.IPV6_ROUTER_ALERT,
		linux.IPV6_XFRM_POLICY,
		linux.MCAST_BLOCK_SOURCE,
		linux.MCAST_JOIN_GROUP,
		linux.MCAST_JOIN_SOURCE_GROUP,
		linux.MCAST_LEAVE_GROUP,
		linux.MCAST_LEAVE_SOURCE_GROUP,
		linux.MCAST_UNBLOCK_SOURCE:
		// Not supported. Control leaves the switch and nil is returned.

	case linux.IPV6_RECVORIGDSTADDR:
		if len(optVal) < sizeOfInt32 {
			return syserr.ErrInvalidArgument
		}
		v := int32(hostarch.ByteOrder.Uint32(optVal))

		ep.SocketOptions().SetReceiveOriginalDstAddress(v != 0)
		return nil

	case linux.IPV6_RECVPKTINFO:
		if len(optVal) < sizeOfInt32 {
			return syserr.ErrInvalidArgument
		}
		v := int32(hostarch.ByteOrder.Uint32(optVal))

		ep.SocketOptions().SetIPv6ReceivePacketInfo(v != 0)
		return nil

	case linux.IPV6_UNICAST_HOPS:
		if len(optVal) < sizeOfInt32 {
			return syserr.ErrInvalidArgument
		}
		// Only values in [-1, 255] are accepted; -1 is forwarded to the
		// endpoint unchanged.
		v := int32(hostarch.ByteOrder.Uint32(optVal))
		if v < -1 || v > 255 {
			return syserr.ErrInvalidArgument
		}
		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6HopLimitOption, int(v)))

	case linux.IPV6_RECVHOPLIMIT:
		v, err := parseIntOrChar(optVal)
		if err != nil {
			return err
		}

		ep.SocketOptions().SetReceiveHopLimit(v != 0)
		return nil

	case linux.IPV6_TCLASS:
		if len(optVal) < sizeOfInt32 {
			return syserr.ErrInvalidArgument
		}
		// Only values in [-1, 255] are accepted; -1 is stored as 0.
		v := int32(hostarch.ByteOrder.Uint32(optVal))
		if v < -1 || v > 255 {
			return syserr.ErrInvalidArgument
		}
		if v == -1 {
			v = 0
		}
		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6TrafficClassOption, int(v)))

	case linux.IPV6_RECVTCLASS:
		v, err := parseIntOrChar(optVal)
		if err != nil {
			return err
		}

		ep.SocketOptions().SetReceiveTClass(v != 0)
		return nil
	case linux.IPV6_RECVERR:
		// An empty value is accepted and leaves the option unchanged.
		if len(optVal) == 0 {
			return nil
		}
		v, err := parseIntOrChar(optVal)
		if err != nil {
			return err
		}
		ep.SocketOptions().SetIPv6RecvError(v != 0)
		return nil

	case linux.IP6T_SO_SET_REPLACE:
		if len(optVal) < linux.SizeOfIP6TReplace {
			return syserr.ErrInvalidArgument
		}

		// Only valid for raw IPv6 sockets.
		if !socket.IsRaw(s) {
			return syserr.ErrProtocolNotAvailable
		}

		stk := inet.StackFromContext(t)
		if stk == nil {
			return syserr.ErrNoDevice
		}
		// Stack must be a netstack stack. The type assertion below is
		// unchecked and panics for any other stack implementation.
		return netfilter.SetEntries(t, stk.(*Stack).Stack, optVal, true)

	case linux.IP6T_SO_SET_ADD_COUNTERS:
		log.Infof("IP6T_SO_SET_ADD_COUNTERS is not supported")
		return nil
	}

	// Unsupported and unrecognized options are accepted without effect.
	return nil
}
  2310  
// Sizes, as reported by SizeBytes, of the multicast request structures that
// may be passed to setsockopt(2). They are used to validate option lengths and
// to tell the IPv4 request variants apart.
var (
	inetMulticastRequestSize        = (*linux.InetMulticastRequest)(nil).SizeBytes()
	inetMulticastRequestWithNICSize = (*linux.InetMulticastRequestWithNIC)(nil).SizeBytes()
	inet6MulticastRequestSize       = (*linux.Inet6MulticastRequest)(nil).SizeBytes()
)
  2316  
  2317  // copyInMulticastRequest copies in a variable-size multicast request. The
  2318  // kernel determines which structure was passed by its length. IP_MULTICAST_IF
  2319  // supports ip_mreqn, ip_mreq and in_addr, while IP_ADD_MEMBERSHIP and
  2320  // IP_DROP_MEMBERSHIP only support ip_mreqn and ip_mreq. To handle this,
  2321  // allowAddr controls whether in_addr is accepted or rejected.
  2322  func copyInMulticastRequest(optVal []byte, allowAddr bool) (linux.InetMulticastRequestWithNIC, *syserr.Error) {
  2323  	if len(optVal) < len(linux.InetAddr{}) {
  2324  		return linux.InetMulticastRequestWithNIC{}, syserr.ErrInvalidArgument
  2325  	}
  2326  
  2327  	if len(optVal) < inetMulticastRequestSize {
  2328  		if !allowAddr {
  2329  			return linux.InetMulticastRequestWithNIC{}, syserr.ErrInvalidArgument
  2330  		}
  2331  
  2332  		var req linux.InetMulticastRequestWithNIC
  2333  		copy(req.InterfaceAddr[:], optVal)
  2334  		return req, nil
  2335  	}
  2336  
  2337  	if len(optVal) >= inetMulticastRequestWithNICSize {
  2338  		var req linux.InetMulticastRequestWithNIC
  2339  		req.UnmarshalUnsafe(optVal)
  2340  		return req, nil
  2341  	}
  2342  
  2343  	var req linux.InetMulticastRequestWithNIC
  2344  	req.InetMulticastRequest.UnmarshalUnsafe(optVal)
  2345  	return req, nil
  2346  }
  2347  
  2348  func copyInMulticastV6Request(optVal []byte) (linux.Inet6MulticastRequest, *syserr.Error) {
  2349  	if len(optVal) < inet6MulticastRequestSize {
  2350  		return linux.Inet6MulticastRequest{}, syserr.ErrInvalidArgument
  2351  	}
  2352  
  2353  	var req linux.Inet6MulticastRequest
  2354  	req.UnmarshalUnsafe(optVal)
  2355  	return req, nil
  2356  }
  2357  
  2358  // parseIntOrChar copies either a 32-bit int or an 8-bit uint out of buf.
  2359  //
  2360  // net/ipv4/ip_sockglue.c:do_ip_setsockopt does this for its socket options.
  2361  func parseIntOrChar(buf []byte) (int32, *syserr.Error) {
  2362  	if len(buf) == 0 {
  2363  		return 0, syserr.ErrInvalidArgument
  2364  	}
  2365  
  2366  	if len(buf) >= sizeOfInt32 {
  2367  		return int32(hostarch.ByteOrder.Uint32(buf)), nil
  2368  	}
  2369  
  2370  	return int32(buf[0]), nil
  2371  }
  2372  
// setSockOptIP implements SetSockOpt when level is SOL_IP.
//
// The endpoint must be a tcpip.Endpoint; otherwise ErrUnknownProtocolOption is
// returned. Option names that are listed below as unsupported, or that are not
// listed at all, are accepted without effect and nil is returned.
func setSockOptIP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
	if _, ok := ep.(tcpip.Endpoint); !ok {
		log.Warningf("SOL_IP options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
		return syserr.ErrUnknownProtocolOption
	}

	switch name {
	case linux.IP_MULTICAST_TTL:
		v, err := parseIntOrChar(optVal)
		if err != nil {
			return err
		}

		if v == -1 {
			// Linux translates -1 to 1.
			v = 1
		}
		// After the translation above, only values in [0, 255] are accepted.
		if v < 0 || v > 255 {
			return syserr.ErrInvalidArgument
		}
		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MulticastTTLOption, int(v)))

	case linux.IP_ADD_MEMBERSHIP:
		// A bare in_addr is not an acceptable request for this option.
		req, err := copyInMulticastRequest(optVal, false /* allowAddr */)
		if err != nil {
			return err
		}

		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.AddMembershipOption{
			NIC: tcpip.NICID(req.InterfaceIndex),
			// TODO(igudger): Change AddMembership to use the standard
			// any address representation.
			InterfaceAddr: tcpip.AddrFrom4(req.InterfaceAddr),
			MulticastAddr: tcpip.AddrFrom4(req.MulticastAddr),
		}))

	case linux.IP_DROP_MEMBERSHIP:
		// A bare in_addr is not an acceptable request for this option.
		req, err := copyInMulticastRequest(optVal, false /* allowAddr */)
		if err != nil {
			return err
		}

		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.RemoveMembershipOption{
			NIC: tcpip.NICID(req.InterfaceIndex),
			// TODO(igudger): Change DropMembership to use the standard
			// any address representation.
			InterfaceAddr: tcpip.AddrFrom4(req.InterfaceAddr),
			MulticastAddr: tcpip.AddrFrom4(req.MulticastAddr),
		}))

	case linux.IP_MULTICAST_IF:
		// Unlike the membership options, a bare in_addr is accepted here.
		req, err := copyInMulticastRequest(optVal, true /* allowAddr */)
		if err != nil {
			return err
		}

		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.MulticastInterfaceOption{
			NIC:           tcpip.NICID(req.InterfaceIndex),
			InterfaceAddr: socket.BytesToIPAddress(req.InterfaceAddr[:]),
		}))

	case linux.IP_MULTICAST_LOOP:
		v, err := parseIntOrChar(optVal)
		if err != nil {
			return err
		}

		ep.SocketOptions().SetMulticastLoop(v != 0)
		return nil

	case linux.MCAST_JOIN_GROUP:
		// FIXME(b/124219304): Implement MCAST_JOIN_GROUP.
		return syserr.ErrInvalidArgument

	case linux.IP_TTL:
		v, err := parseIntOrChar(optVal)
		if err != nil {
			return err
		}

		// -1 means default TTL. It is passed to the endpoint as 0; any other
		// value must lie in [1, 255].
		if v == -1 {
			v = 0
		} else if v < 1 || v > 255 {
			return syserr.ErrInvalidArgument
		}
		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv4TTLOption, int(v)))

	case linux.IP_RECVTTL:
		v, err := parseIntOrChar(optVal)
		if err != nil {
			return err
		}
		ep.SocketOptions().SetReceiveTTL(v != 0)
		return nil

	case linux.IP_TOS:
		// An empty value is accepted and leaves the option unchanged.
		if len(optVal) == 0 {
			return nil
		}
		v, err := parseIntOrChar(optVal)
		if err != nil {
			return err
		}
		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv4TOSOption, int(v)))

	case linux.IP_RECVTOS:
		v, err := parseIntOrChar(optVal)
		if err != nil {
			return err
		}
		ep.SocketOptions().SetReceiveTOS(v != 0)
		return nil

	case linux.IP_RECVERR:
		// An empty value is accepted and leaves the option unchanged.
		if len(optVal) == 0 {
			return nil
		}
		v, err := parseIntOrChar(optVal)
		if err != nil {
			return err
		}
		ep.SocketOptions().SetIPv4RecvError(v != 0)
		return nil

	case linux.IP_PKTINFO:
		// An empty value is accepted and leaves the option unchanged.
		if len(optVal) == 0 {
			return nil
		}
		v, err := parseIntOrChar(optVal)
		if err != nil {
			return err
		}
		ep.SocketOptions().SetReceivePacketInfo(v != 0)
		return nil

	case linux.IP_HDRINCL:
		// An empty value is accepted and leaves the option unchanged.
		if len(optVal) == 0 {
			return nil
		}
		v, err := parseIntOrChar(optVal)
		if err != nil {
			return err
		}
		ep.SocketOptions().SetHeaderIncluded(v != 0)
		return nil

	case linux.IP_RECVORIGDSTADDR:
		// An empty value is accepted and leaves the option unchanged.
		if len(optVal) == 0 {
			return nil
		}
		v, err := parseIntOrChar(optVal)
		if err != nil {
			return err
		}

		ep.SocketOptions().SetReceiveOriginalDstAddress(v != 0)
		return nil

	case linux.IPT_SO_SET_REPLACE:
		if len(optVal) < linux.SizeOfIPTReplace {
			return syserr.ErrInvalidArgument
		}

		// Only valid for raw IPv4 sockets.
		if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
			return syserr.ErrProtocolNotAvailable
		}

		stk := inet.StackFromContext(t)
		if stk == nil {
			return syserr.ErrNoDevice
		}
		// Stack must be a netstack stack. The type assertion below is
		// unchecked and panics for any other stack implementation.
		return netfilter.SetEntries(t, stk.(*Stack).Stack, optVal, false)

	case linux.IPT_SO_SET_ADD_COUNTERS:
		log.Infof("IPT_SO_SET_ADD_COUNTERS is not supported")
		return nil

	case linux.IP_ADD_SOURCE_MEMBERSHIP,
		linux.IP_BIND_ADDRESS_NO_PORT,
		linux.IP_BLOCK_SOURCE,
		linux.IP_CHECKSUM,
		linux.IP_DROP_SOURCE_MEMBERSHIP,
		linux.IP_FREEBIND,
		linux.IP_IPSEC_POLICY,
		linux.IP_MINTTL,
		linux.IP_MSFILTER,
		linux.IP_MTU_DISCOVER,
		linux.IP_MULTICAST_ALL,
		linux.IP_NODEFRAG,
		linux.IP_OPTIONS,
		linux.IP_PASSSEC,
		linux.IP_RECVFRAGSIZE,
		linux.IP_RECVOPTS,
		linux.IP_RETOPTS,
		linux.IP_TRANSPARENT,
		linux.IP_UNBLOCK_SOURCE,
		linux.IP_UNICAST_IF,
		linux.IP_XFRM_POLICY,
		linux.MCAST_BLOCK_SOURCE,
		linux.MCAST_JOIN_SOURCE_GROUP,
		linux.MCAST_LEAVE_GROUP,
		linux.MCAST_LEAVE_SOURCE_GROUP,
		linux.MCAST_MSFILTER,
		linux.MCAST_UNBLOCK_SOURCE:
		// Not supported. Control leaves the switch and nil is returned.
	}

	// Unsupported and unrecognized options are accepted without effect.
	return nil
}
  2586  
  2587  // GetSockName implements the linux syscall getsockname(2) for sockets backed by
  2588  // tcpip.Endpoint.
  2589  func (s *sock) GetSockName(*kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
  2590  	addr, err := s.Endpoint.GetLocalAddress()
  2591  	if err != nil {
  2592  		return nil, 0, syserr.TranslateNetstackError(err)
  2593  	}
  2594  
  2595  	a, l := socket.ConvertAddress(s.family, addr)
  2596  	return a, l, nil
  2597  }
  2598  
  2599  // GetPeerName implements the linux syscall getpeername(2) for sockets backed by
  2600  // tcpip.Endpoint.
  2601  func (s *sock) GetPeerName(*kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
  2602  	addr, err := s.Endpoint.GetRemoteAddress()
  2603  	if err != nil {
  2604  		return nil, 0, syserr.TranslateNetstackError(err)
  2605  	}
  2606  
  2607  	a, l := socket.ConvertAddress(s.family, addr)
  2608  	return a, l, nil
  2609  }
  2610  
  2611  func (s *sock) fillCmsgInq(cmsg *socket.ControlMessages) {
  2612  	if !s.sockOptInq {
  2613  		return
  2614  	}
  2615  	rcvBufUsed, err := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption)
  2616  	if err != nil {
  2617  		return
  2618  	}
  2619  	cmsg.IP.HasInq = true
  2620  	cmsg.IP.Inq = int32(rcvBufUsed)
  2621  }
  2622  
  2623  func toLinuxPacketType(pktType tcpip.PacketType) uint8 {
  2624  	switch pktType {
  2625  	case tcpip.PacketHost:
  2626  		return linux.PACKET_HOST
  2627  	case tcpip.PacketOtherHost:
  2628  		return linux.PACKET_OTHERHOST
  2629  	case tcpip.PacketOutgoing:
  2630  		return linux.PACKET_OUTGOING
  2631  	case tcpip.PacketBroadcast:
  2632  		return linux.PACKET_BROADCAST
  2633  	case tcpip.PacketMulticast:
  2634  		return linux.PACKET_MULTICAST
  2635  	default:
  2636  		panic(fmt.Sprintf("unknown packet type: %d", pktType))
  2637  	}
  2638  }
  2639  
// nonBlockingRead issues a non-blocking read.
//
// The results are, in order: the message length, the message flags, the sender
// address and its length (only populated for packet-based sockets when
// senderRequested is set), the control messages, and the error.
//
// s.readMu is held while the endpoint is read and while the result is
// processed.
//
// TODO(b/78348848): Support timestamps for stream sockets.
func (s *sock) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
	isPacket := s.isPacketBased()

	readOptions := tcpip.ReadOptions{
		Peek:               peek,
		NeedRemoteAddr:     senderRequested,
		NeedLinkPacketInfo: isPacket,
	}

	// TCP sockets discard the data if MSG_TRUNC is set.
	//
	// This behavior is documented in man 7 tcp:
	// Since version 2.4, Linux supports the use of MSG_TRUNC in the flags
	// argument of recv(2) (and recvmsg(2)). This flag causes the received
	// bytes of data to be discarded, rather than passed back in a
	// caller-supplied  buffer.
	var w io.Writer
	if !isPacket && trunc {
		// Discard the data, but never more than dst could have held.
		w = &tcpip.LimitedWriter{
			W: ioutil.Discard,
			N: dst.NumBytes(),
		}
	} else {
		w = dst.Writer(ctx)
	}

	s.readMu.Lock()
	defer s.readMu.Unlock()

	res, err := s.Endpoint.Read(w, readOptions)
	// A bad-buffer error is ignored when the destination is zero-length.
	if _, ok := err.(*tcpip.ErrBadBuffer); ok && dst.NumBytes() == 0 {
		err = nil
	}
	if err != nil {
		return 0, 0, nil, 0, socket.ControlMessages{}, syserr.TranslateNetstackError(err)
	}
	// Set the control message, even if 0 bytes were read.
	s.updateTimestamp(res.ControlMessages)

	if isPacket {
		var addr linux.SockAddr
		var addrLen uint32
		if senderRequested {
			addr, addrLen = socket.ConvertAddress(s.family, res.RemoteAddr)
			// Link-layer addresses additionally carry the protocol and
			// packet type taken from the link packet info.
			switch v := addr.(type) {
			case *linux.SockAddrLink:
				v.Protocol = socket.Htons(uint16(res.LinkPacketInfo.Protocol))
				v.PacketType = toLinuxPacketType(res.LinkPacketInfo.PktType)
			}
		}

		// With MSG_TRUNC the full message length is reported rather than
		// the number of bytes actually copied.
		msgLen := res.Count
		if trunc {
			msgLen = res.Total
		}

		// Signal truncation when the message was larger than what was
		// copied.
		var flags int
		if res.Total > res.Count {
			flags |= linux.MSG_TRUNC
		}

		return msgLen, flags, addr, addrLen, s.netstackToLinuxControlMessages(res.ControlMessages), nil
	}

	if peek {
		// MSG_TRUNC with MSG_PEEK on a TCP socket returns the
		// amount that could be read, and does not write to buffer.
		if trunc {
			// TCP endpoint does not return the total bytes in buffer as numTotal.
			// We need to query it from socket option.
			rql, err := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption)
			if err != nil {
				return 0, 0, nil, 0, socket.ControlMessages{}, syserr.TranslateNetstackError(err)
			}
			// Report the smaller of the buffer size and the queued bytes.
			msgLen := int(dst.NumBytes())
			if msgLen > rql {
				msgLen = rql
			}
			return msgLen, 0, nil, 0, socket.ControlMessages{}, nil
		}
	} else if n := res.Count; n != 0 {
		// Data was consumed (not merely peeked); report the amount to the
		// endpoint's receive buffer moderation.
		s.Endpoint.ModerateRecvBuf(n)
	}

	cmsg := s.netstackToLinuxControlMessages(res.ControlMessages)
	s.fillCmsgInq(&cmsg)
	// err is necessarily nil here: every non-nil error returned above.
	return res.Count, 0, nil, 0, cmsg, syserr.TranslateNetstackError(err)
}
  2731  
  2732  func (s *sock) netstackToLinuxControlMessages(cm tcpip.ReceivableControlMessages) socket.ControlMessages {
  2733  	readCM := socket.NewIPControlMessages(s.family, cm)
  2734  	return socket.ControlMessages{
  2735  		IP: socket.IPControlMessages{
  2736  			HasTimestamp:       readCM.HasTimestamp && s.sockOptTimestamp,
  2737  			Timestamp:          readCM.Timestamp,
  2738  			HasInq:             readCM.HasInq,
  2739  			Inq:                readCM.Inq,
  2740  			HasTOS:             readCM.HasTOS,
  2741  			TOS:                readCM.TOS,
  2742  			HasTClass:          readCM.HasTClass,
  2743  			TClass:             readCM.TClass,
  2744  			HasTTL:             readCM.HasTTL,
  2745  			TTL:                readCM.TTL,
  2746  			HasHopLimit:        readCM.HasHopLimit,
  2747  			HopLimit:           readCM.HopLimit,
  2748  			HasIPPacketInfo:    readCM.HasIPPacketInfo,
  2749  			PacketInfo:         readCM.PacketInfo,
  2750  			HasIPv6PacketInfo:  readCM.HasIPv6PacketInfo,
  2751  			IPv6PacketInfo:     readCM.IPv6PacketInfo,
  2752  			OriginalDstAddress: readCM.OriginalDstAddress,
  2753  			SockErr:            readCM.SockErr,
  2754  		},
  2755  	}
  2756  }
  2757  
  2758  func (s *sock) linuxToNetstackControlMessages(cm socket.ControlMessages) tcpip.SendableControlMessages {
  2759  	return tcpip.SendableControlMessages{
  2760  		HasTTL:      cm.IP.HasTTL,
  2761  		TTL:         uint8(cm.IP.TTL),
  2762  		HasHopLimit: cm.IP.HasHopLimit,
  2763  		HopLimit:    uint8(cm.IP.HopLimit),
  2764  	}
  2765  }
  2766  
  2767  // updateTimestamp sets the timestamp for SIOCGSTAMP. It should be called after
  2768  // successfully writing packet data out to userspace.
  2769  //
  2770  // Precondition: s.readMu must be locked.
  2771  func (s *sock) updateTimestamp(cm tcpip.ReceivableControlMessages) {
  2772  	// Save the SIOCGSTAMP timestamp only if SO_TIMESTAMP is disabled.
  2773  	if !s.sockOptTimestamp {
  2774  		s.timestampValid = true
  2775  		s.timestamp = cm.Timestamp
  2776  	}
  2777  }
  2778  
  2779  // dequeueErr is analogous to net/core/skbuff.c:sock_dequeue_err_skb().
  2780  func (s *sock) dequeueErr() *tcpip.SockError {
  2781  	so := s.Endpoint.SocketOptions()
  2782  	err := so.DequeueErr()
  2783  	if err == nil {
  2784  		return nil
  2785  	}
  2786  
  2787  	// Update socket error to reflect ICMP errors in queue.
  2788  	if nextErr := so.PeekErr(); nextErr != nil && nextErr.Cause.Origin().IsICMPErr() {
  2789  		so.SetLastError(nextErr.Err)
  2790  	} else if err.Cause.Origin().IsICMPErr() {
  2791  		so.SetLastError(nil)
  2792  	}
  2793  	return err
  2794  }
  2795  
  2796  // addrFamilyFromNetProto returns the address family identifier for the given
  2797  // network protocol.
  2798  func addrFamilyFromNetProto(net tcpip.NetworkProtocolNumber) int {
  2799  	switch net {
  2800  	case header.IPv4ProtocolNumber:
  2801  		return linux.AF_INET
  2802  	case header.IPv6ProtocolNumber:
  2803  		return linux.AF_INET6
  2804  	default:
  2805  		panic(fmt.Sprintf("invalid net proto for addr family inference: %d", net))
  2806  	}
  2807  }
  2808  
  2809  // recvErr handles MSG_ERRQUEUE for recvmsg(2).
  2810  // This is analogous to net/ipv4/ip_sockglue.c:ip_recv_error().
  2811  func (s *sock) recvErr(t *kernel.Task, dst usermem.IOSequence) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
  2812  	sockErr := s.dequeueErr()
  2813  	if sockErr == nil {
  2814  		return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain
  2815  	}
  2816  	if sockErr.Payload != nil {
  2817  		defer sockErr.Payload.Release()
  2818  	}
  2819  
  2820  	// The payload of the original packet that caused the error is passed as
  2821  	// normal data via msg_iovec.  -- recvmsg(2)
  2822  	msgFlags := linux.MSG_ERRQUEUE
  2823  	if int(dst.NumBytes()) < sockErr.Payload.Size() {
  2824  		msgFlags |= linux.MSG_TRUNC
  2825  	}
  2826  	n, err := dst.CopyOut(t, sockErr.Payload.AsSlice())
  2827  
  2828  	// The original destination address of the datagram that caused the error is
  2829  	// supplied via msg_name.  -- recvmsg(2)
  2830  	dstAddr, dstAddrLen := socket.ConvertAddress(addrFamilyFromNetProto(sockErr.NetProto), sockErr.Dst)
  2831  	cmgs := socket.ControlMessages{IP: socket.NewIPControlMessages(s.family, tcpip.ReceivableControlMessages{SockErr: sockErr})}
  2832  	return n, msgFlags, dstAddr, dstAddrLen, cmgs, syserr.FromError(err)
  2833  }
  2834  
// RecvMsg implements the linux syscall recvmsg(2) for sockets backed by
// tcpip.Endpoint.
//
// MSG_ERRQUEUE requests are served from the socket error queue. Otherwise a
// non-blocking read is attempted first; if it would block (and MSG_DONTWAIT is
// not set), or if MSG_WAITALL is set on a stream socket and dst is not yet
// full, the call blocks until the socket becomes readable, subject to the
// optional deadline, and retries.
//
// The results are named; the bare return statements below return their
// current values.
func (s *sock) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, _ uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) {
	if flags&linux.MSG_ERRQUEUE != 0 {
		return s.recvErr(t, dst)
	}

	trunc := flags&linux.MSG_TRUNC != 0
	peek := flags&linux.MSG_PEEK != 0
	dontWait := flags&linux.MSG_DONTWAIT != 0
	waitAll := flags&linux.MSG_WAITALL != 0
	if senderRequested && !s.isPacketBased() {
		// Stream sockets ignore the sender address.
		senderRequested = false
	}
	n, msgFlags, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested)

	if s.isPacketBased() && err == syserr.ErrClosedForReceive && flags&linux.MSG_DONTWAIT != 0 {
		// In this situation we should return EAGAIN.
		return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain
	}

	if err != nil && (err != syserr.ErrWouldBlock || dontWait) {
		// Read failed and we should not retry.
		return 0, 0, nil, 0, socket.ControlMessages{}, err
	}

	if err == nil && (dontWait || !waitAll || s.isPacketBased() || int64(n) >= dst.NumBytes()) {
		// We got all the data we need.
		return
	}

	// Don't overwrite any data we received.
	dst = dst.DropFirst(n)

	// We'll have to block. Register for notifications and keep trying to
	// receive all the data.
	e, ch := waiter.NewChannelEntry(waiter.ReadableEvents)
	s.EventRegister(&e)
	defer s.EventUnregister(&e)

	for {
		// Read before blocking so that data which arrived before the waiter
		// entry was registered is not missed.
		var rn int
		rn, msgFlags, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested)
		n += rn
		if err != nil && err != syserr.ErrWouldBlock {
			// Always stop on errors other than would block as we generally
			// won't be able to get any more data. Eat the error if we got
			// any data.
			if n > 0 {
				err = nil
			}
			return
		}
		if err == nil && (s.isPacketBased() || !waitAll || int64(rn) >= dst.NumBytes()) {
			// We got all the data we need.
			return
		}
		// Advance past what this iteration read so the next read appends.
		dst = dst.DropFirst(rn)

		if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
			// The wait failed. Data received so far takes precedence over
			// the error.
			if n > 0 {
				return n, msgFlags, senderAddr, senderAddrLen, controlMessages, nil
			}
			// A timeout with no data is reported as ErrTryAgain.
			if linuxerr.Equals(linuxerr.ETIMEDOUT, err) {
				return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain
			}
			return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err)
		}
	}
}
  2906  
// SendMsg implements the linux syscall sendmsg(2) for sockets backed by
// tcpip.Endpoint.
//
// to, when non-empty, is the encoded destination address. It returns the total
// number of bytes written together with any error. Unless MSG_DONTWAIT is set,
// the call keeps writing, blocking for writability subject to the optional
// deadline, until all of src has been written or a write fails with an error
// other than would-block.
func (s *sock) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) {
	// Reject Unix control messages.
	if !controlMessages.Unix.Empty() {
		return 0, syserr.ErrInvalidArgument
	}

	// Decode the optional destination address and check it against the
	// socket's family.
	var addr *tcpip.FullAddress
	if len(to) > 0 {
		addrBuf, family, err := socket.AddressAndFamily(to)
		if err != nil {
			return 0, err
		}
		if !s.checkFamily(family, false /* exact */) {
			return 0, syserr.ErrInvalidArgument
		}
		addrBuf = s.mapFamily(addrBuf, family)

		addr = &addrBuf
	}

	opts := tcpip.WriteOptions{
		To:              addr,
		More:            flags&linux.MSG_MORE != 0,
		EndOfRecord:     flags&linux.MSG_EOR != 0,
		ControlMessages: s.linuxToNetstackControlMessages(controlMessages),
	}

	r := src.Reader(t)
	var (
		total int64
		entry waiter.Entry
		ch    <-chan struct{}
	)
	for {
		n, err := s.Endpoint.Write(r, opts)
		total += n
		// Non-blocking sends report the outcome of the first write attempt.
		if flags&linux.MSG_DONTWAIT != 0 {
			return int(total), syserr.TranslateNetstackError(err)
		}
		// Decide whether to retry: after a successful write only if data
		// remains, after a would-block error always, and never after any
		// other error.
		block := true
		switch err.(type) {
		case nil:
			block = total != src.NumBytes()
		case *tcpip.ErrWouldBlock:
		default:
			block = false
		}
		if block {
			if ch == nil {
				// We'll have to block. Register for notification and keep trying to
				// send all the data.
				entry, ch = waiter.NewChannelEntry(waiter.WritableEvents)
				s.EventRegister(&entry)
				defer s.EventUnregister(&entry)
			} else {
				// Don't wait immediately after registration in case more data
				// became available between when we last checked and when we setup
				// the notification.
				if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
					// A timeout is reported as ErrTryAgain along with the
					// bytes written so far.
					if linuxerr.Equals(linuxerr.ETIMEDOUT, err) {
						return int(total), syserr.ErrTryAgain
					}
					// handleIOError will consume errors from t.Block if needed.
					return int(total), syserr.FromError(err)
				}
			}
			continue
		}
		return int(total), syserr.TranslateNetstackError(err)
	}
}
  2980  
  2981  // Ioctl implements vfs.FileDescriptionImpl.
  2982  func (s *sock) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) {
  2983  	t := kernel.TaskFromContext(ctx)
  2984  	if t == nil {
  2985  		panic("ioctl(2) may only be called from a task goroutine")
  2986  	}
  2987  
  2988  	// SIOCGSTAMP is implemented by netstack rather than all commonEndpoint
  2989  	// sockets.
  2990  	// TODO(b/78348848): Add a commonEndpoint method to support SIOCGSTAMP.
  2991  	switch args[1].Int() {
  2992  	case linux.SIOCGSTAMP:
  2993  		s.readMu.Lock()
  2994  		defer s.readMu.Unlock()
  2995  		if !s.timestampValid {
  2996  			return 0, linuxerr.ENOENT
  2997  		}
  2998  
  2999  		tv := linux.NsecToTimeval(s.timestamp.UnixNano())
  3000  		_, err := tv.CopyOut(t, args[2].Pointer())
  3001  		return 0, err
  3002  
  3003  	case linux.TIOCINQ:
  3004  		v, terr := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption)
  3005  		if terr != nil {
  3006  			return 0, syserr.TranslateNetstackError(terr).ToError()
  3007  		}
  3008  
  3009  		if v > math.MaxInt32 {
  3010  			v = math.MaxInt32
  3011  		}
  3012  
  3013  		// Copy result to userspace.
  3014  		vP := primitive.Int32(v)
  3015  		_, err := vP.CopyOut(t, args[2].Pointer())
  3016  		return 0, err
  3017  	}
  3018  
  3019  	return Ioctl(ctx, s.Endpoint, uio, sysno, args)
  3020  }
  3021  
  3022  // Ioctl performs a socket ioctl.
  3023  func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) {
  3024  	t := kernel.TaskFromContext(ctx)
  3025  	if t == nil {
  3026  		panic("ioctl(2) may only be called from a task goroutine")
  3027  	}
  3028  
  3029  	switch arg := int(args[1].Int()); arg {
  3030  	case linux.SIOCGIFFLAGS,
  3031  		linux.SIOCGIFADDR,
  3032  		linux.SIOCGIFBRDADDR,
  3033  		linux.SIOCGIFDSTADDR,
  3034  		linux.SIOCGIFHWADDR,
  3035  		linux.SIOCGIFINDEX,
  3036  		linux.SIOCGIFMAP,
  3037  		linux.SIOCGIFMETRIC,
  3038  		linux.SIOCGIFMTU,
  3039  		linux.SIOCGIFNAME,
  3040  		linux.SIOCGIFNETMASK,
  3041  		linux.SIOCGIFTXQLEN,
  3042  		linux.SIOCETHTOOL:
  3043  
  3044  		var ifr linux.IFReq
  3045  		if _, err := ifr.CopyIn(t, args[2].Pointer()); err != nil {
  3046  			return 0, err
  3047  		}
  3048  		if err := interfaceIoctl(ctx, io, arg, &ifr); err != nil {
  3049  			return 0, err.ToError()
  3050  		}
  3051  		_, err := ifr.CopyOut(t, args[2].Pointer())
  3052  		return 0, err
  3053  
  3054  	case linux.SIOCGIFCONF:
  3055  		// Return a list of interface addresses or the buffer size
  3056  		// necessary to hold the list.
  3057  		var ifc linux.IFConf
  3058  		if _, err := ifc.CopyIn(t, args[2].Pointer()); err != nil {
  3059  			return 0, err
  3060  		}
  3061  
  3062  		if err := ifconfIoctl(ctx, t, io, &ifc); err != nil {
  3063  			return 0, err
  3064  		}
  3065  
  3066  		_, err := ifc.CopyOut(t, args[2].Pointer())
  3067  		return 0, err
  3068  
  3069  	case linux.TIOCINQ:
  3070  		v, terr := ep.GetSockOptInt(tcpip.ReceiveQueueSizeOption)
  3071  		if terr != nil {
  3072  			return 0, syserr.TranslateNetstackError(terr).ToError()
  3073  		}
  3074  
  3075  		if v > math.MaxInt32 {
  3076  			v = math.MaxInt32
  3077  		}
  3078  		// Copy result to userspace.
  3079  		vP := primitive.Int32(v)
  3080  		_, err := vP.CopyOut(t, args[2].Pointer())
  3081  		return 0, err
  3082  
  3083  	case linux.TIOCOUTQ:
  3084  		v, terr := ep.GetSockOptInt(tcpip.SendQueueSizeOption)
  3085  		if terr != nil {
  3086  			return 0, syserr.TranslateNetstackError(terr).ToError()
  3087  		}
  3088  
  3089  		if v > math.MaxInt32 {
  3090  			v = math.MaxInt32
  3091  		}
  3092  
  3093  		// Copy result to userspace.
  3094  		vP := primitive.Int32(v)
  3095  		_, err := vP.CopyOut(t, args[2].Pointer())
  3096  		return 0, err
  3097  
  3098  	case linux.SIOCGIFMEM, linux.SIOCGIFPFLAGS, linux.SIOCGMIIPHY, linux.SIOCGMIIREG:
  3099  		// Not supported.
  3100  	}
  3101  
  3102  	return 0, linuxerr.ENOTTY
  3103  }
  3104  
  3105  // interfaceIoctl implements interface requests.
  3106  func interfaceIoctl(ctx context.Context, _ usermem.IO, arg int, ifr *linux.IFReq) *syserr.Error {
  3107  	var (
  3108  		iface inet.Interface
  3109  		index int32
  3110  		found bool
  3111  	)
  3112  
  3113  	// Find the relevant device.
  3114  	stk := inet.StackFromContext(ctx)
  3115  	if stk == nil {
  3116  		return syserr.ErrNoDevice
  3117  	}
  3118  
  3119  	// SIOCGIFNAME uses ifr.ifr_ifindex rather than ifr.ifr_name to
  3120  	// identify a device.
  3121  	if arg == linux.SIOCGIFNAME {
  3122  		// Gets the name of the interface given the interface index
  3123  		// stored in ifr_ifindex.
  3124  		index = int32(hostarch.ByteOrder.Uint32(ifr.Data[:4]))
  3125  		if iface, ok := stk.Interfaces()[index]; ok {
  3126  			ifr.SetName(iface.Name)
  3127  			return nil
  3128  		}
  3129  		return syserr.ErrNoDevice
  3130  	}
  3131  
  3132  	// Find the relevant device.
  3133  	for index, iface = range stk.Interfaces() {
  3134  		if iface.Name == ifr.Name() {
  3135  			found = true
  3136  			break
  3137  		}
  3138  	}
  3139  	if !found {
  3140  		return syserr.ErrNoDevice
  3141  	}
  3142  
  3143  	switch arg {
  3144  	case linux.SIOCGIFINDEX:
  3145  		// Copy out the index to the data.
  3146  		hostarch.ByteOrder.PutUint32(ifr.Data[:], uint32(index))
  3147  
  3148  	case linux.SIOCGIFHWADDR:
  3149  		// Copy the hardware address out.
  3150  		//
  3151  		// Refer: https://linux.die.net/man/7/netdevice
  3152  		// SIOCGIFHWADDR, SIOCSIFHWADDR
  3153  		//
  3154  		// Get or set the hardware address of a device using
  3155  		// ifr_hwaddr. The hardware address is specified in a struct
  3156  		// sockaddr. sa_family contains the ARPHRD_* device type,
  3157  		// sa_data the L2 hardware address starting from byte 0. Setting
  3158  		// the hardware address is a privileged operation.
  3159  		hostarch.ByteOrder.PutUint16(ifr.Data[:], iface.DeviceType)
  3160  		n := copy(ifr.Data[2:], iface.Addr)
  3161  		for i := 2 + n; i < len(ifr.Data); i++ {
  3162  			ifr.Data[i] = 0 // Clear padding.
  3163  		}
  3164  
  3165  	case linux.SIOCGIFFLAGS:
  3166  		f, err := interfaceStatusFlags(stk, iface.Name)
  3167  		if err != nil {
  3168  			return err
  3169  		}
  3170  		// Drop the flags that don't fit in the size that we need to return. This
  3171  		// matches Linux behavior.
  3172  		hostarch.ByteOrder.PutUint16(ifr.Data[:2], uint16(f))
  3173  
  3174  	case linux.SIOCGIFADDR:
  3175  		// Copy the IPv4 address out.
  3176  		for _, addr := range stk.InterfaceAddrs()[index] {
  3177  			// This ioctl is only compatible with AF_INET addresses.
  3178  			if addr.Family != linux.AF_INET {
  3179  				continue
  3180  			}
  3181  			copy(ifr.Data[4:8], addr.Addr)
  3182  			break
  3183  		}
  3184  
  3185  	case linux.SIOCGIFMETRIC:
  3186  		// Gets the metric of the device. As per netdevice(7), this
  3187  		// always just sets ifr_metric to 0.
  3188  		hostarch.ByteOrder.PutUint32(ifr.Data[:4], 0)
  3189  
  3190  	case linux.SIOCGIFMTU:
  3191  		// Gets the MTU of the device.
  3192  		hostarch.ByteOrder.PutUint32(ifr.Data[:4], iface.MTU)
  3193  
  3194  	case linux.SIOCGIFMAP:
  3195  		// Gets the hardware parameters of the device.
  3196  		// TODO(gvisor.dev/issue/505): Implement.
  3197  
  3198  	case linux.SIOCGIFTXQLEN:
  3199  		// Gets the transmit queue length of the device.
  3200  		// TODO(gvisor.dev/issue/505): Implement.
  3201  
  3202  	case linux.SIOCGIFDSTADDR:
  3203  		// Gets the destination address of a point-to-point device.
  3204  		// TODO(gvisor.dev/issue/505): Implement.
  3205  
  3206  	case linux.SIOCGIFBRDADDR:
  3207  		// Gets the broadcast address of a device.
  3208  		// TODO(gvisor.dev/issue/505): Implement.
  3209  
  3210  	case linux.SIOCGIFNETMASK:
  3211  		// Gets the network mask of a device.
  3212  		for _, addr := range stk.InterfaceAddrs()[index] {
  3213  			// This ioctl is only compatible with AF_INET addresses.
  3214  			if addr.Family != linux.AF_INET {
  3215  				continue
  3216  			}
  3217  			// Populate ifr.ifr_netmask (type sockaddr).
  3218  			hostarch.ByteOrder.PutUint16(ifr.Data[0:], uint16(linux.AF_INET))
  3219  			hostarch.ByteOrder.PutUint16(ifr.Data[2:], 0)
  3220  			var mask uint32 = 0xffffffff << (32 - addr.PrefixLen)
  3221  			// Netmask is expected to be returned as a big endian
  3222  			// value.
  3223  			binary.BigEndian.PutUint32(ifr.Data[4:8], mask)
  3224  			break
  3225  		}
  3226  
  3227  	case linux.SIOCETHTOOL:
  3228  		// Stubbed out for now, Ideally we should implement the required
  3229  		// sub-commands for ETHTOOL
  3230  		//
  3231  		// See:
  3232  		// https://github.com/torvalds/linux/blob/aa0c9086b40c17a7ad94425b3b70dd1fdd7497bf/net/core/dev_ioctl.c
  3233  		return syserr.ErrEndpointOperation
  3234  
  3235  	default:
  3236  		// Not a valid call.
  3237  		return syserr.ErrInvalidArgument
  3238  	}
  3239  
  3240  	return nil
  3241  }
  3242  
  3243  // ifconfIoctl populates a struct ifconf for the SIOCGIFCONF ioctl.
  3244  func ifconfIoctl(ctx context.Context, t *kernel.Task, _ usermem.IO, ifc *linux.IFConf) error {
  3245  	// If Ptr is NULL, return the necessary buffer size via Len.
  3246  	// Otherwise, write up to Len bytes starting at Ptr containing ifreq
  3247  	// structs.
  3248  	stk := inet.StackFromContext(ctx)
  3249  	if stk == nil {
  3250  		return syserr.ErrNoDevice.ToError()
  3251  	}
  3252  
  3253  	if ifc.Ptr == 0 {
  3254  		ifc.Len = int32(len(stk.Interfaces())) * int32(linux.SizeOfIFReq)
  3255  		return nil
  3256  	}
  3257  
  3258  	max := ifc.Len
  3259  	ifc.Len = 0
  3260  	for key, ifaceAddrs := range stk.InterfaceAddrs() {
  3261  		iface := stk.Interfaces()[key]
  3262  		for _, ifaceAddr := range ifaceAddrs {
  3263  			// Don't write past the end of the buffer.
  3264  			if ifc.Len+int32(linux.SizeOfIFReq) > max {
  3265  				break
  3266  			}
  3267  			if ifaceAddr.Family != linux.AF_INET {
  3268  				continue
  3269  			}
  3270  
  3271  			// Populate ifr.ifr_addr.
  3272  			ifr := linux.IFReq{}
  3273  			ifr.SetName(iface.Name)
  3274  			hostarch.ByteOrder.PutUint16(ifr.Data[0:2], uint16(ifaceAddr.Family))
  3275  			hostarch.ByteOrder.PutUint16(ifr.Data[2:4], 0)
  3276  			copy(ifr.Data[4:8], ifaceAddr.Addr[:4])
  3277  
  3278  			// Copy the ifr to userspace.
  3279  			dst := uintptr(ifc.Ptr) + uintptr(ifc.Len)
  3280  			ifc.Len += int32(linux.SizeOfIFReq)
  3281  			if _, err := ifr.CopyOut(t, hostarch.Addr(dst)); err != nil {
  3282  				return err
  3283  			}
  3284  		}
  3285  	}
  3286  	return nil
  3287  }
  3288  
  3289  // interfaceStatusFlags returns status flags for an interface in the stack.
  3290  // Flag values and meanings are described in greater detail in netdevice(7) in
  3291  // the SIOCGIFFLAGS section.
  3292  func interfaceStatusFlags(stack inet.Stack, name string) (uint32, *syserr.Error) {
  3293  	// We should only ever be passed a netstack.Stack.
  3294  	epstack, ok := stack.(*Stack)
  3295  	if !ok {
  3296  		return 0, errStackType
  3297  	}
  3298  
  3299  	// Find the NIC corresponding to this interface.
  3300  	for _, info := range epstack.Stack.NICInfo() {
  3301  		if info.Name == name {
  3302  			return nicStateFlagsToLinux(info.Flags), nil
  3303  		}
  3304  	}
  3305  	return 0, syserr.ErrNoDevice
  3306  }
  3307  
  3308  func nicStateFlagsToLinux(f stack.NICStateFlags) uint32 {
  3309  	var rv uint32
  3310  	if f.Up {
  3311  		rv |= linux.IFF_UP | linux.IFF_LOWER_UP
  3312  	}
  3313  	if f.Running {
  3314  		rv |= linux.IFF_RUNNING
  3315  	}
  3316  	if f.Promiscuous {
  3317  		rv |= linux.IFF_PROMISC
  3318  	}
  3319  	if f.Loopback {
  3320  		rv |= linux.IFF_LOOPBACK
  3321  	}
  3322  	return rv
  3323  }
  3324  
  3325  // State implements socket.Socket.State. State translates the internal state
  3326  // returned by netstack to values defined by Linux.
  3327  func (s *sock) State() uint32 {
  3328  	if s.family != linux.AF_INET && s.family != linux.AF_INET6 {
  3329  		// States not implemented for this socket's family.
  3330  		return 0
  3331  	}
  3332  
  3333  	switch {
  3334  	case socket.IsTCP(s):
  3335  		// TCP socket.
  3336  		switch tcp.EndpointState(s.Endpoint.State()) {
  3337  		case tcp.StateEstablished:
  3338  			return linux.TCP_ESTABLISHED
  3339  		case tcp.StateSynSent:
  3340  			return linux.TCP_SYN_SENT
  3341  		case tcp.StateSynRecv:
  3342  			return linux.TCP_SYN_RECV
  3343  		case tcp.StateFinWait1:
  3344  			return linux.TCP_FIN_WAIT1
  3345  		case tcp.StateFinWait2:
  3346  			return linux.TCP_FIN_WAIT2
  3347  		case tcp.StateTimeWait:
  3348  			return linux.TCP_TIME_WAIT
  3349  		case tcp.StateClose, tcp.StateInitial, tcp.StateBound, tcp.StateConnecting, tcp.StateError:
  3350  			return linux.TCP_CLOSE
  3351  		case tcp.StateCloseWait:
  3352  			return linux.TCP_CLOSE_WAIT
  3353  		case tcp.StateLastAck:
  3354  			return linux.TCP_LAST_ACK
  3355  		case tcp.StateListen:
  3356  			return linux.TCP_LISTEN
  3357  		case tcp.StateClosing:
  3358  			return linux.TCP_CLOSING
  3359  		default:
  3360  			// Internal or unknown state.
  3361  			return 0
  3362  		}
  3363  	case socket.IsUDP(s):
  3364  		// UDP socket.
  3365  		switch transport.DatagramEndpointState(s.Endpoint.State()) {
  3366  		case transport.DatagramEndpointStateInitial, transport.DatagramEndpointStateBound, transport.DatagramEndpointStateClosed:
  3367  			return linux.TCP_CLOSE
  3368  		case transport.DatagramEndpointStateConnected:
  3369  			return linux.TCP_ESTABLISHED
  3370  		default:
  3371  			return 0
  3372  		}
  3373  	case socket.IsICMP(s):
  3374  		// TODO(b/112063468): Export states for ICMP sockets.
  3375  	case socket.IsRaw(s):
  3376  		// TODO(b/112063468): Export states for raw sockets.
  3377  	default:
  3378  		// Unknown transport protocol, how did we make this socket?
  3379  		log.Warningf("Unknown transport protocol for an existing socket: family=%v, type=%v, protocol=%v, internal type %v", s.family, s.skType, s.protocol, reflect.TypeOf(s.Endpoint).Elem())
  3380  		return 0
  3381  	}
  3382  
  3383  	return 0
  3384  }
  3385  
  3386  // Type implements socket.Socket.Type.
  3387  func (s *sock) Type() (family int, skType linux.SockType, protocol int) {
  3388  	return s.family, s.skType, s.protocol
  3389  }
  3390  
// EventRegister implements waiter.Waitable.EventRegister.
//
// The entry is handed to the socket's Queue; the returned error is always nil.
func (s *sock) EventRegister(e *waiter.Entry) error {
	s.Queue.EventRegister(e)
	return nil
}
  3396  
// EventUnregister implements waiter.Waitable.EventUnregister.
//
// The entry is passed to the socket's Queue for removal.
func (s *sock) EventUnregister(e *waiter.Entry) {
	s.Queue.EventUnregister(e)
}