github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/socket/netstack/netstack.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package netstack provides an implementation of the socket.Socket interface
    16  // that is backed by a tcpip.Endpoint.
    17  //
    18  // It does not depend on any particular endpoint implementation, and thus can
    19  // be used to expose certain endpoints to the sentry while leaving others out,
    20  // for example, TCP endpoints and Unix-domain endpoints.
    21  //
    22  // Lock ordering: netstack => mm: ioSequenceReadWriter copies user memory inside
    23  // tcpip.Endpoint.Write(). Netstack is allowed to (and does) hold locks during
    24  // this operation.
    25  package netstack
    26  
    27  import (
    28  	"bytes"
    29  	"encoding/binary"
    30  	"fmt"
    31  	"io"
    32  	"io/ioutil"
    33  	"math"
    34  	"reflect"
    35  	"time"
    36  
    37  	"golang.org/x/sys/unix"
    38  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    39  	"github.com/SagerNet/gvisor/pkg/abi/linux/errno"
    40  	"github.com/SagerNet/gvisor/pkg/context"
    41  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    42  	"github.com/SagerNet/gvisor/pkg/hostarch"
    43  	"github.com/SagerNet/gvisor/pkg/log"
    44  	"github.com/SagerNet/gvisor/pkg/marshal"
    45  	"github.com/SagerNet/gvisor/pkg/marshal/primitive"
    46  	"github.com/SagerNet/gvisor/pkg/metric"
    47  	"github.com/SagerNet/gvisor/pkg/sentry/arch"
    48  	"github.com/SagerNet/gvisor/pkg/sentry/fs"
    49  	"github.com/SagerNet/gvisor/pkg/sentry/fs/fsutil"
    50  	"github.com/SagerNet/gvisor/pkg/sentry/inet"
    51  	"github.com/SagerNet/gvisor/pkg/sentry/kernel"
    52  	ktime "github.com/SagerNet/gvisor/pkg/sentry/kernel/time"
    53  	"github.com/SagerNet/gvisor/pkg/sentry/socket"
    54  	"github.com/SagerNet/gvisor/pkg/sentry/socket/netfilter"
    55  	"github.com/SagerNet/gvisor/pkg/sentry/unimpl"
    56  	"github.com/SagerNet/gvisor/pkg/sync"
    57  	"github.com/SagerNet/gvisor/pkg/syserr"
    58  	"github.com/SagerNet/gvisor/pkg/syserror"
    59  	"github.com/SagerNet/gvisor/pkg/tcpip"
    60  	"github.com/SagerNet/gvisor/pkg/tcpip/header"
    61  	"github.com/SagerNet/gvisor/pkg/tcpip/stack"
    62  	"github.com/SagerNet/gvisor/pkg/tcpip/transport/tcp"
    63  	"github.com/SagerNet/gvisor/pkg/tcpip/transport/udp"
    64  	"github.com/SagerNet/gvisor/pkg/usermem"
    65  	"github.com/SagerNet/gvisor/pkg/waiter"
    66  )
    67  
    68  func mustCreateMetric(name, description string) *tcpip.StatCounter {
    69  	var cm tcpip.StatCounter
    70  	metric.MustRegisterCustomUint64Metric(name, true /* cumulative */, false /* sync */, description, cm.Value)
    71  	return &cm
    72  }
    73  
    74  func mustCreateGauge(name, description string) *tcpip.StatCounter {
    75  	var cm tcpip.StatCounter
    76  	metric.MustRegisterCustomUint64Metric(name, false /* cumulative */, false /* sync */, description, cm.Value)
    77  	return &cm
    78  }
    79  
    80  // Metrics contains metrics exported by netstack.
    81  var Metrics = tcpip.Stats{
    82  	DroppedPackets: mustCreateMetric("/netstack/dropped_packets", "Number of packets dropped at the transport layer."),
    83  	NICs: tcpip.NICStats{
    84  		UnknownL3ProtocolRcvdPackets: mustCreateMetric("/netstack/nic/unknown_l3_protocol_received_packets", "Number of packets received that were for an unknown or unsupported L3 protocol."),
    85  		UnknownL4ProtocolRcvdPackets: mustCreateMetric("/netstack/nic/unknown_l4_protocol_received_packets", "Number of packets received that were for an unknown or unsupported L4 protocol."),
    86  		MalformedL4RcvdPackets:       mustCreateMetric("/netstack/nic/malformed_l4_received_packets", "Number of packets received that failed L4 header parsing."),
    87  		Tx: tcpip.NICPacketStats{
    88  			Packets: mustCreateMetric("/netstack/nic/tx/packets", "Number of packets transmitted."),
    89  			Bytes:   mustCreateMetric("/netstack/nic/tx/bytes", "Number of bytes transmitted."),
    90  		},
    91  		Rx: tcpip.NICPacketStats{
    92  			Packets: mustCreateMetric("/netstack/nic/rx/packets", "Number of packets received."),
    93  			Bytes:   mustCreateMetric("/netstack/nic/rx/bytes", "Number of bytes received."),
    94  		},
    95  		DisabledRx: tcpip.NICPacketStats{
    96  			Packets: mustCreateMetric("/netstack/nic/disabled_rx/packets", "Number of packets received on disabled NICs."),
    97  			Bytes:   mustCreateMetric("/netstack/nic/disabled_rx/bytes", "Number of bytes received on disabled NICs."),
    98  		},
    99  		Neighbor: tcpip.NICNeighborStats{
   100  			UnreachableEntryLookups: mustCreateMetric("/netstack/nic/neighbor/unreachable_entry_loopups", "Number of lookups performed on a neighbor entry in Unreachable state."),
   101  		},
   102  	},
   103  	ICMP: tcpip.ICMPStats{
   104  		V4: tcpip.ICMPv4Stats{
   105  			PacketsSent: tcpip.ICMPv4SentPacketStats{
   106  				ICMPv4PacketStats: tcpip.ICMPv4PacketStats{
   107  					EchoRequest:    mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_request", "Number of ICMPv4 echo request packets sent."),
   108  					EchoReply:      mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_reply", "Number of ICMPv4 echo reply packets sent."),
   109  					DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_sent/dst_unreachable", "Number of ICMPv4 destination unreachable packets sent."),
   110  					SrcQuench:      mustCreateMetric("/netstack/icmp/v4/packets_sent/src_quench", "Number of ICMPv4 source quench packets sent."),
   111  					Redirect:       mustCreateMetric("/netstack/icmp/v4/packets_sent/redirect", "Number of ICMPv4 redirect packets sent."),
   112  					TimeExceeded:   mustCreateMetric("/netstack/icmp/v4/packets_sent/time_exceeded", "Number of ICMPv4 time exceeded packets sent."),
   113  					ParamProblem:   mustCreateMetric("/netstack/icmp/v4/packets_sent/param_problem", "Number of ICMPv4 parameter problem packets sent."),
   114  					Timestamp:      mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp", "Number of ICMPv4 timestamp packets sent."),
   115  					TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp_reply", "Number of ICMPv4 timestamp reply packets sent."),
   116  					InfoRequest:    mustCreateMetric("/netstack/icmp/v4/packets_sent/info_request", "Number of ICMPv4 information request packets sent."),
   117  					InfoReply:      mustCreateMetric("/netstack/icmp/v4/packets_sent/info_reply", "Number of ICMPv4 information reply packets sent."),
   118  				},
   119  				Dropped:     mustCreateMetric("/netstack/icmp/v4/packets_sent/dropped", "Number of ICMPv4 packets dropped due to link layer errors."),
   120  				RateLimited: mustCreateMetric("/netstack/icmp/v4/packets_sent/rate_limited", "Number of ICMPv4 packets dropped due to rate limit being exceeded."),
   121  			},
   122  			PacketsReceived: tcpip.ICMPv4ReceivedPacketStats{
   123  				ICMPv4PacketStats: tcpip.ICMPv4PacketStats{
   124  					EchoRequest:    mustCreateMetric("/netstack/icmp/v4/packets_received/echo_request", "Number of ICMPv4 echo request packets received."),
   125  					EchoReply:      mustCreateMetric("/netstack/icmp/v4/packets_received/echo_reply", "Number of ICMPv4 echo reply packets received."),
   126  					DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_received/dst_unreachable", "Number of ICMPv4 destination unreachable packets received."),
   127  					SrcQuench:      mustCreateMetric("/netstack/icmp/v4/packets_received/src_quench", "Number of ICMPv4 source quench packets received."),
   128  					Redirect:       mustCreateMetric("/netstack/icmp/v4/packets_received/redirect", "Number of ICMPv4 redirect packets received."),
   129  					TimeExceeded:   mustCreateMetric("/netstack/icmp/v4/packets_received/time_exceeded", "Number of ICMPv4 time exceeded packets received."),
   130  					ParamProblem:   mustCreateMetric("/netstack/icmp/v4/packets_received/param_problem", "Number of ICMPv4 parameter problem packets received."),
   131  					Timestamp:      mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp", "Number of ICMPv4 timestamp packets received."),
   132  					TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp_reply", "Number of ICMPv4 timestamp reply packets received."),
   133  					InfoRequest:    mustCreateMetric("/netstack/icmp/v4/packets_received/info_request", "Number of ICMPv4 information request packets received."),
   134  					InfoReply:      mustCreateMetric("/netstack/icmp/v4/packets_received/info_reply", "Number of ICMPv4 information reply packets received."),
   135  				},
   136  				Invalid: mustCreateMetric("/netstack/icmp/v4/packets_received/invalid", "Number of ICMPv4 packets received that the transport layer could not parse."),
   137  			},
   138  		},
   139  		V6: tcpip.ICMPv6Stats{
   140  			PacketsSent: tcpip.ICMPv6SentPacketStats{
   141  				ICMPv6PacketStats: tcpip.ICMPv6PacketStats{
   142  					EchoRequest:             mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_request", "Number of ICMPv6 echo request packets sent."),
   143  					EchoReply:               mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_reply", "Number of ICMPv6 echo reply packets sent."),
   144  					DstUnreachable:          mustCreateMetric("/netstack/icmp/v6/packets_sent/dst_unreachable", "Number of ICMPv6 destination unreachable packets sent."),
   145  					PacketTooBig:            mustCreateMetric("/netstack/icmp/v6/packets_sent/packet_too_big", "Number of ICMPv6 packet too big packets sent."),
   146  					TimeExceeded:            mustCreateMetric("/netstack/icmp/v6/packets_sent/time_exceeded", "Number of ICMPv6 time exceeded packets sent."),
   147  					ParamProblem:            mustCreateMetric("/netstack/icmp/v6/packets_sent/param_problem", "Number of ICMPv6 parameter problem packets sent."),
   148  					RouterSolicit:           mustCreateMetric("/netstack/icmp/v6/packets_sent/router_solicit", "Number of ICMPv6 router solicit packets sent."),
   149  					RouterAdvert:            mustCreateMetric("/netstack/icmp/v6/packets_sent/router_advert", "Number of ICMPv6 router advert packets sent."),
   150  					NeighborSolicit:         mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_solicit", "Number of ICMPv6 neighbor solicit packets sent."),
   151  					NeighborAdvert:          mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_advert", "Number of ICMPv6 neighbor advert packets sent."),
   152  					RedirectMsg:             mustCreateMetric("/netstack/icmp/v6/packets_sent/redirect_msg", "Number of ICMPv6 redirect message packets sent."),
   153  					MulticastListenerQuery:  mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_query", "Number of ICMPv6 multicast listener query packets sent."),
   154  					MulticastListenerReport: mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_report", "Number of ICMPv6 multicast listener report packets sent."),
   155  					MulticastListenerDone:   mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_done", "Number of ICMPv6 multicast listener done packets sent."),
   156  				},
   157  				Dropped:     mustCreateMetric("/netstack/icmp/v6/packets_sent/dropped", "Number of ICMPv6 packets dropped due to link layer errors."),
   158  				RateLimited: mustCreateMetric("/netstack/icmp/v6/packets_sent/rate_limited", "Number of ICMPv6 packets dropped due to rate limit being exceeded."),
   159  			},
   160  			PacketsReceived: tcpip.ICMPv6ReceivedPacketStats{
   161  				ICMPv6PacketStats: tcpip.ICMPv6PacketStats{
   162  					EchoRequest:             mustCreateMetric("/netstack/icmp/v6/packets_received/echo_request", "Number of ICMPv6 echo request packets received."),
   163  					EchoReply:               mustCreateMetric("/netstack/icmp/v6/packets_received/echo_reply", "Number of ICMPv6 echo reply packets received."),
   164  					DstUnreachable:          mustCreateMetric("/netstack/icmp/v6/packets_received/dst_unreachable", "Number of ICMPv6 destination unreachable packets received."),
   165  					PacketTooBig:            mustCreateMetric("/netstack/icmp/v6/packets_received/packet_too_big", "Number of ICMPv6 packet too big packets received."),
   166  					TimeExceeded:            mustCreateMetric("/netstack/icmp/v6/packets_received/time_exceeded", "Number of ICMPv6 time exceeded packets received."),
   167  					ParamProblem:            mustCreateMetric("/netstack/icmp/v6/packets_received/param_problem", "Number of ICMPv6 parameter problem packets received."),
   168  					RouterSolicit:           mustCreateMetric("/netstack/icmp/v6/packets_received/router_solicit", "Number of ICMPv6 router solicit packets received."),
   169  					RouterAdvert:            mustCreateMetric("/netstack/icmp/v6/packets_received/router_advert", "Number of ICMPv6 router advert packets received."),
   170  					NeighborSolicit:         mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_solicit", "Number of ICMPv6 neighbor solicit packets received."),
   171  					NeighborAdvert:          mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_advert", "Number of ICMPv6 neighbor advert packets received."),
   172  					RedirectMsg:             mustCreateMetric("/netstack/icmp/v6/packets_received/redirect_msg", "Number of ICMPv6 redirect message packets received."),
   173  					MulticastListenerQuery:  mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_query", "Number of ICMPv6 multicast listener query packets received."),
   174  					MulticastListenerReport: mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_report", "Number of ICMPv6 multicast listener report packets sent."),
   175  					MulticastListenerDone:   mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_done", "Number of ICMPv6 multicast listener done packets sent."),
   176  				},
   177  				Unrecognized:                   mustCreateMetric("/netstack/icmp/v6/packets_received/unrecognized", "Number of ICMPv6 packets received that the transport layer does not know how to parse."),
   178  				Invalid:                        mustCreateMetric("/netstack/icmp/v6/packets_received/invalid", "Number of ICMPv6 packets received that the transport layer could not parse."),
   179  				RouterOnlyPacketsDroppedByHost: mustCreateMetric("/netstack/icmp/v6/packets_received/router_only_packets_dropped_by_host", "Number of ICMPv6 packets dropped due to being router-specific packets."),
   180  			},
   181  		},
   182  	},
   183  	IGMP: tcpip.IGMPStats{
   184  		PacketsSent: tcpip.IGMPSentPacketStats{
   185  			IGMPPacketStats: tcpip.IGMPPacketStats{
   186  				MembershipQuery:    mustCreateMetric("/netstack/igmp/packets_sent/membership_query", "Number of IGMP Membership Query messages sent."),
   187  				V1MembershipReport: mustCreateMetric("/netstack/igmp/packets_sent/v1_membership_report", "Number of IGMPv1 Membership Report messages sent."),
   188  				V2MembershipReport: mustCreateMetric("/netstack/igmp/packets_sent/v2_membership_report", "Number of IGMPv2 Membership Report messages sent."),
   189  				LeaveGroup:         mustCreateMetric("/netstack/igmp/packets_sent/leave_group", "Number of IGMP Leave Group messages sent."),
   190  			},
   191  			Dropped: mustCreateMetric("/netstack/igmp/packets_sent/dropped", "Number of IGMP packets dropped due to link layer errors."),
   192  		},
   193  		PacketsReceived: tcpip.IGMPReceivedPacketStats{
   194  			IGMPPacketStats: tcpip.IGMPPacketStats{
   195  				MembershipQuery:    mustCreateMetric("/netstack/igmp/packets_received/membership_query", "Number of IGMP Membership Query messages received."),
   196  				V1MembershipReport: mustCreateMetric("/netstack/igmp/packets_received/v1_membership_report", "Number of IGMPv1 Membership Report messages received."),
   197  				V2MembershipReport: mustCreateMetric("/netstack/igmp/packets_received/v2_membership_report", "Number of IGMPv2 Membership Report messages received."),
   198  				LeaveGroup:         mustCreateMetric("/netstack/igmp/packets_received/leave_group", "Number of IGMP Leave Group messages received."),
   199  			},
   200  			Invalid:        mustCreateMetric("/netstack/igmp/packets_received/invalid", "Number of IGMP packets received that could not be parsed."),
   201  			ChecksumErrors: mustCreateMetric("/netstack/igmp/packets_received/checksum_errors", "Number of received IGMP packets with bad checksums."),
   202  			Unrecognized:   mustCreateMetric("/netstack/igmp/packets_received/unrecognized", "Number of unrecognized IGMP packets received."),
   203  		},
   204  	},
   205  	IP: tcpip.IPStats{
   206  		PacketsReceived:                     mustCreateMetric("/netstack/ip/packets_received", "Number of IP packets received from the link layer in nic.DeliverNetworkPacket."),
   207  		DisabledPacketsReceived:             mustCreateMetric("/netstack/ip/disabled_packets_received", "Number of IP packets received from the link layer when the IP layer is disabled."),
   208  		InvalidDestinationAddressesReceived: mustCreateMetric("/netstack/ip/invalid_addresses_received", "Number of IP packets received with an unknown or invalid destination address."),
   209  		InvalidSourceAddressesReceived:      mustCreateMetric("/netstack/ip/invalid_source_addresses_received", "Number of IP packets received with an unknown or invalid source address."),
   210  		PacketsDelivered:                    mustCreateMetric("/netstack/ip/packets_delivered", "Number of incoming IP packets that are successfully delivered to the transport layer via HandlePacket."),
   211  		PacketsSent:                         mustCreateMetric("/netstack/ip/packets_sent", "Number of IP packets sent via WritePacket."),
   212  		OutgoingPacketErrors:                mustCreateMetric("/netstack/ip/outgoing_packet_errors", "Number of IP packets which failed to write to a link-layer endpoint."),
   213  		MalformedPacketsReceived:            mustCreateMetric("/netstack/ip/malformed_packets_received", "Number of IP packets which failed IP header validation checks."),
   214  		MalformedFragmentsReceived:          mustCreateMetric("/netstack/ip/malformed_fragments_received", "Number of IP fragments which failed IP fragment validation checks."),
   215  		IPTablesPreroutingDropped:           mustCreateMetric("/netstack/ip/iptables/prerouting_dropped", "Number of IP packets dropped in the Prerouting chain."),
   216  		IPTablesInputDropped:                mustCreateMetric("/netstack/ip/iptables/input_dropped", "Number of IP packets dropped in the Input chain."),
   217  		IPTablesOutputDropped:               mustCreateMetric("/netstack/ip/iptables/output_dropped", "Number of IP packets dropped in the Output chain."),
   218  		OptionTimestampReceived:             mustCreateMetric("/netstack/ip/options/timestamp_received", "Number of timestamp options found in received IP packets."),
   219  		OptionRecordRouteReceived:           mustCreateMetric("/netstack/ip/options/record_route_received", "Number of record route options found in received IP packets."),
   220  		OptionRouterAlertReceived:           mustCreateMetric("/netstack/ip/options/router_alert_received", "Number of router alert options found in received IP packets."),
   221  		OptionUnknownReceived:               mustCreateMetric("/netstack/ip/options/unknown_received", "Number of unknown options found in received IP packets."),
   222  		Forwarding: tcpip.IPForwardingStats{
   223  			Unrouteable:            mustCreateMetric("/netstack/ip/forwarding/unrouteable", "Number of IP packets received which couldn't be routed and thus were not forwarded."),
   224  			ExhaustedTTL:           mustCreateMetric("/netstack/ip/forwarding/exhausted_ttl", "Number of IP packets received which could not be forwarded due to an exhausted TTL."),
   225  			LinkLocalSource:        mustCreateMetric("/netstack/ip/forwarding/link_local_source_address", "Number of IP packets received which could not be forwarded due to a link-local source address."),
   226  			LinkLocalDestination:   mustCreateMetric("/netstack/ip/forwarding/link_local_destination_address", "Number of IP packets received which could not be forwarded due to a link-local destination address."),
   227  			ExtensionHeaderProblem: mustCreateMetric("/netstack/ip/forwarding/extension_header_problem", "Number of IP packets received which could not be forwarded due to a problem processing their IPv6 extension headers."),
   228  			PacketTooBig:           mustCreateMetric("/netstack/ip/forwarding/packet_too_big", "Number of IP packets received which could not be forwarded because they could not fit within the outgoing MTU."),
   229  			HostUnreachable:        mustCreateMetric("/netstack/ip/forwarding/host_unreachable", "Number of IP packets received which could not be forwarded due to unresolvable next hop."),
   230  			Errors:                 mustCreateMetric("/netstack/ip/forwarding/errors", "Number of IP packets which couldn't be forwarded."),
   231  		},
   232  	},
   233  	ARP: tcpip.ARPStats{
   234  		PacketsReceived:                                 mustCreateMetric("/netstack/arp/packets_received", "Number of ARP packets received from the link layer."),
   235  		DisabledPacketsReceived:                         mustCreateMetric("/netstack/arp/disabled_packets_received", "Number of ARP packets received from the link layer when the ARP layer is disabled."),
   236  		MalformedPacketsReceived:                        mustCreateMetric("/netstack/arp/malformed_packets_received", "Number of ARP packets which failed ARP header validation checks."),
   237  		RequestsReceived:                                mustCreateMetric("/netstack/arp/requests_received", "Number of ARP requests received."),
   238  		RequestsReceivedUnknownTargetAddress:            mustCreateMetric("/netstack/arp/requests_received_unknown_addr", "Number of ARP requests received with an unknown target address."),
   239  		OutgoingRequestInterfaceHasNoLocalAddressErrors: mustCreateMetric("/netstack/arp/outgoing_requests_iface_has_no_addr", "Number of failed attempts to send an ARP request with an interface that has no network address."),
   240  		OutgoingRequestBadLocalAddressErrors:            mustCreateMetric("/netstack/arp/outgoing_requests_invalid_local_addr", "Number of failed attempts to send an ARP request with a provided local address that is invalid."),
   241  		OutgoingRequestsDropped:                         mustCreateMetric("/netstack/arp/outgoing_requests_dropped", "Number of ARP requests which failed to write to a link-layer endpoint."),
   242  		OutgoingRequestsSent:                            mustCreateMetric("/netstack/arp/outgoing_requests_sent", "Number of ARP requests sent."),
   243  		RepliesReceived:                                 mustCreateMetric("/netstack/arp/replies_received", "Number of ARP replies received."),
   244  		OutgoingRepliesDropped:                          mustCreateMetric("/netstack/arp/outgoing_replies_dropped", "Number of ARP replies which failed to write to a link-layer endpoint."),
   245  		OutgoingRepliesSent:                             mustCreateMetric("/netstack/arp/outgoing_replies_sent", "Number of ARP replies sent."),
   246  	},
   247  	TCP: tcpip.TCPStats{
   248  		ActiveConnectionOpenings:           mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."),
   249  		PassiveConnectionOpenings:          mustCreateMetric("/netstack/tcp/passive_connection_openings", "Number of connections opened successfully via Listen."),
   250  		CurrentEstablished:                 mustCreateGauge("/netstack/tcp/current_established", "Number of connections in ESTABLISHED state now."),
   251  		CurrentConnected:                   mustCreateGauge("/netstack/tcp/current_open", "Number of connections that are in connected state."),
   252  		EstablishedResets:                  mustCreateMetric("/netstack/tcp/established_resets", "Number of times TCP connections have made a direct transition to the CLOSED state from either the ESTABLISHED state or the CLOSE-WAIT state"),
   253  		EstablishedClosed:                  mustCreateMetric("/netstack/tcp/established_closed", "Number of times established TCP connections made a transition to CLOSED state."),
   254  		EstablishedTimedout:                mustCreateMetric("/netstack/tcp/established_timedout", "Number of times  an established connection was reset because of keep-alive time out."),
   255  		ListenOverflowSynDrop:              mustCreateMetric("/netstack/tcp/listen_overflow_syn_drop", "Number of times the listen queue overflowed and a SYN was dropped."),
   256  		ListenOverflowAckDrop:              mustCreateMetric("/netstack/tcp/listen_overflow_ack_drop", "Number of times the listen queue overflowed and the final ACK in the handshake was dropped."),
   257  		ListenOverflowSynCookieSent:        mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_sent", "Number of times a SYN cookie was sent."),
   258  		ListenOverflowSynCookieRcvd:        mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_rcvd", "Number of times a SYN cookie was received."),
   259  		ListenOverflowInvalidSynCookieRcvd: mustCreateMetric("/netstack/tcp/listen_overflow_invalid_syn_cookie_rcvd", "Number of times an invalid SYN cookie was received."),
   260  		FailedConnectionAttempts:           mustCreateMetric("/netstack/tcp/failed_connection_attempts", "Number of calls to Connect or Listen (active and passive openings, respectively) that end in an error."),
   261  		ValidSegmentsReceived:              mustCreateMetric("/netstack/tcp/valid_segments_received", "Number of TCP segments received that the transport layer successfully parsed."),
   262  		InvalidSegmentsReceived:            mustCreateMetric("/netstack/tcp/invalid_segments_received", "Number of TCP segments received that the transport layer could not parse."),
   263  		SegmentsSent:                       mustCreateMetric("/netstack/tcp/segments_sent", "Number of TCP segments sent."),
   264  		SegmentSendErrors:                  mustCreateMetric("/netstack/tcp/segment_send_errors", "Number of TCP segments failed to be sent."),
   265  		ResetsSent:                         mustCreateMetric("/netstack/tcp/resets_sent", "Number of TCP resets sent."),
   266  		ResetsReceived:                     mustCreateMetric("/netstack/tcp/resets_received", "Number of TCP resets received."),
   267  		Retransmits:                        mustCreateMetric("/netstack/tcp/retransmits", "Number of TCP segments retransmitted."),
   268  		FastRecovery:                       mustCreateMetric("/netstack/tcp/fast_recovery", "Number of times fast recovery was used to recover from packet loss."),
   269  		SACKRecovery:                       mustCreateMetric("/netstack/tcp/sack_recovery", "Number of times SACK recovery was used to recover from packet loss."),
   270  		TLPRecovery:                        mustCreateMetric("/netstack/tcp/tlp_recovery", "Number of times tail loss probe triggers recovery from tail loss."),
   271  		SlowStartRetransmits:               mustCreateMetric("/netstack/tcp/slow_start_retransmits", "Number of segments retransmitted in slow start mode."),
   272  		FastRetransmit:                     mustCreateMetric("/netstack/tcp/fast_retransmit", "Number of TCP segments which were fast retransmitted."),
   273  		Timeouts:                           mustCreateMetric("/netstack/tcp/timeouts", "Number of times RTO expired."),
   274  		ChecksumErrors:                     mustCreateMetric("/netstack/tcp/checksum_errors", "Number of segments dropped due to bad checksums."),
   275  		FailedPortReservations:             mustCreateMetric("/netstack/tcp/failed_port_reservations", "Number of time TCP failed to reserve a port."),
   276  	},
   277  	UDP: tcpip.UDPStats{
   278  		PacketsReceived:          mustCreateMetric("/netstack/udp/packets_received", "Number of UDP datagrams received via HandlePacket."),
   279  		UnknownPortErrors:        mustCreateMetric("/netstack/udp/unknown_port_errors", "Number of incoming UDP datagrams dropped because they did not have a known destination port."),
   280  		ReceiveBufferErrors:      mustCreateMetric("/netstack/udp/receive_buffer_errors", "Number of incoming UDP datagrams dropped due to the receiving buffer being in an invalid state."),
   281  		MalformedPacketsReceived: mustCreateMetric("/netstack/udp/malformed_packets_received", "Number of incoming UDP datagrams dropped due to the UDP header being in a malformed state."),
   282  		PacketsSent:              mustCreateMetric("/netstack/udp/packets_sent", "Number of UDP datagrams sent."),
   283  		PacketSendErrors:         mustCreateMetric("/netstack/udp/packet_send_errors", "Number of UDP datagrams failed to be sent."),
   284  		ChecksumErrors:           mustCreateMetric("/netstack/udp/checksum_errors", "Number of UDP datagrams dropped due to bad checksums."),
   285  	},
   286  }
   287  
   288  // DefaultTTL is linux's default TTL. All network protocols in all stacks used
   289  // with this package must have this value set as their default TTL.
   290  const DefaultTTL = 64
   291  
   292  const sizeOfInt32 int = 4
   293  
// errStackType is the error, created with errno.EINVAL, that reports that a
// netstack.Stack was expected but something else was received.
var errStackType = syserr.New("expected but did not receive a netstack.Stack", errno.EINVAL)
   295  
// commonEndpoint represents the intersection of a tcpip.Endpoint and a
// transport.Endpoint.
//
// The socket option helpers in this file (for example GetSockOpt) accept a
// commonEndpoint rather than a tcpip.Endpoint so that they only rely on this
// shared method set.
type commonEndpoint interface {
	// GetLocalAddress implements tcpip.Endpoint.GetLocalAddress and
	// transport.Endpoint.GetLocalAddress.
	GetLocalAddress() (tcpip.FullAddress, tcpip.Error)

	// GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress and
	// transport.Endpoint.GetRemoteAddress.
	GetRemoteAddress() (tcpip.FullAddress, tcpip.Error)

	// Readiness implements tcpip.Endpoint.Readiness and
	// transport.Endpoint.Readiness.
	Readiness(mask waiter.EventMask) waiter.EventMask

	// SetSockOpt implements tcpip.Endpoint.SetSockOpt and
	// transport.Endpoint.SetSockOpt.
	SetSockOpt(tcpip.SettableSocketOption) tcpip.Error

	// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt and
	// transport.Endpoint.SetSockOptInt.
	SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error

	// GetSockOpt implements tcpip.Endpoint.GetSockOpt and
	// transport.Endpoint.GetSockOpt.
	GetSockOpt(tcpip.GettableSocketOption) tcpip.Error

	// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt and
	// transport.Endpoint.GetSockOptInt.
	GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error)

	// State returns a socket's lifecycle state. The returned value is
	// protocol-specific and is primarily used for diagnostics.
	State() uint32

	// LastError implements tcpip.Endpoint.LastError and
	// transport.Endpoint.LastError.
	LastError() tcpip.Error

	// SocketOptions implements tcpip.Endpoint.SocketOptions and
	// transport.Endpoint.SocketOptions.
	SocketOptions() *tcpip.SocketOptions
}
   339  
   340  // LINT.IfChange
   341  
// SocketOperations encapsulates all the state needed to represent a network stack
// endpoint in the kernel context.
//
// +stateify savable
type SocketOperations struct {
	// The embedded fsutil types are tagged `state:"nosave"`.
	// NOTE(review): presumably they are stateless helpers that fill in the
	// file operations a socket does not customize; confirm against fsutil.
	fsutil.FilePipeSeek             `state:"nosave"`
	fsutil.FileNotDirReaddir        `state:"nosave"`
	fsutil.FileNoopFlush            `state:"nosave"`
	fsutil.FileNoFsync              `state:"nosave"`
	fsutil.FileNoMMap               `state:"nosave"`
	fsutil.FileUseInodeUnstableAttr `state:"nosave"`

	// socketOpsCommon holds the socket operations common to VFS1 and VFS2.
	socketOpsCommon
}
   356  
// socketOpsCommon contains the socket operations common to VFS1 and VFS2.
//
// +stateify savable
type socketOpsCommon struct {
	socket.SendReceiveTimeout
	*waiter.Queue

	// family, skType and protocol are the values the socket was created
	// with (see New). Endpoint is the tcpip.Endpoint that backs the socket.
	family   int
	Endpoint tcpip.Endpoint
	skType   linux.SockType
	protocol int

	// readMu protects access to the below fields.
	readMu sync.Mutex `state:"nosave"`

	// sockOptTimestamp corresponds to SO_TIMESTAMP. When true, timestamps
	// of returned messages can be returned via control messages. When
	// false, the same timestamp is instead stored and can be read via the
	// SIOCGSTAMP ioctl. It is protected by readMu. See socket(7).
	sockOptTimestamp bool
	// timestampValid indicates whether timestamp for SIOCGSTAMP has been
	// set. It is protected by readMu.
	timestampValid bool
	// timestampNS holds the timestamp to use with SIOCGSTAMP. It is only
	// valid when timestampValid is true. It is protected by readMu.
	timestampNS int64

	// TODO(b/153685824): Move this to SocketOptions.
	// sockOptInq corresponds to TCP_INQ. SocketOperations.GetSockOpt reads
	// it while holding readMu.
	sockOptInq bool
}
   388  
   389  // New creates a new endpoint socket.
   390  func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*fs.File, *syserr.Error) {
   391  	if skType == linux.SOCK_STREAM {
   392  		endpoint.SocketOptions().SetDelayOption(true)
   393  	}
   394  
   395  	dirent := socket.NewDirent(t, netstackDevice)
   396  	defer dirent.DecRef(t)
   397  	return fs.NewFile(t, dirent, fs.FileFlags{Read: true, Write: true, NonSeekable: true}, &SocketOperations{
   398  		socketOpsCommon: socketOpsCommon{
   399  			Queue:    queue,
   400  			family:   family,
   401  			Endpoint: endpoint,
   402  			skType:   skType,
   403  			protocol: protocol,
   404  		},
   405  	}), nil
   406  }
   407  
   408  var sockAddrInetSize = (*linux.SockAddrInet)(nil).SizeBytes()
   409  var sockAddrInet6Size = (*linux.SockAddrInet6)(nil).SizeBytes()
   410  var sockAddrLinkSize = (*linux.SockAddrLink)(nil).SizeBytes()
   411  
   412  // bytesToIPAddress converts an IPv4 or IPv6 address from the user to the
   413  // netstack representation taking any addresses into account.
   414  func bytesToIPAddress(addr []byte) tcpip.Address {
   415  	if bytes.Equal(addr, make([]byte, 4)) || bytes.Equal(addr, make([]byte, 16)) {
   416  		return ""
   417  	}
   418  	return tcpip.Address(addr)
   419  }
   420  
   421  func (s *socketOpsCommon) isPacketBased() bool {
   422  	return s.skType == linux.SOCK_DGRAM || s.skType == linux.SOCK_SEQPACKET || s.skType == linux.SOCK_RDM || s.skType == linux.SOCK_RAW
   423  }
   424  
   425  // Release implements fs.FileOperations.Release.
   426  func (s *socketOpsCommon) Release(ctx context.Context) {
   427  	e, ch := waiter.NewChannelEntry(nil)
   428  	s.EventRegister(&e, waiter.EventHUp|waiter.EventErr)
   429  	defer s.EventUnregister(&e)
   430  
   431  	s.Endpoint.Close()
   432  
   433  	// SO_LINGER option is valid only for TCP. For other socket types
   434  	// return after endpoint close.
   435  	if family, skType, _ := s.Type(); skType != linux.SOCK_STREAM || (family != linux.AF_INET && family != linux.AF_INET6) {
   436  		return
   437  	}
   438  
   439  	v := s.Endpoint.SocketOptions().GetLinger()
   440  	// The case for zero timeout is handled in tcp endpoint close function.
   441  	// Close is blocked until either:
   442  	// 1. The endpoint state is not in any of the states: FIN-WAIT1,
   443  	// CLOSING and LAST_ACK.
   444  	// 2. Timeout is reached.
   445  	if v.Enabled && v.Timeout != 0 {
   446  		t := kernel.TaskFromContext(ctx)
   447  		start := t.Kernel().MonotonicClock().Now()
   448  		deadline := start.Add(v.Timeout)
   449  		t.BlockWithDeadline(ch, true, deadline)
   450  	}
   451  }
   452  
   453  // Read implements fs.FileOperations.Read.
   454  func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) {
   455  	if dst.NumBytes() == 0 {
   456  		return 0, nil
   457  	}
   458  	n, _, _, _, _, err := s.nonBlockingRead(ctx, dst, false, false, false)
   459  	if err == syserr.ErrWouldBlock {
   460  		return int64(n), syserror.ErrWouldBlock
   461  	}
   462  	if err != nil {
   463  		return 0, err.ToError()
   464  	}
   465  	return int64(n), nil
   466  }
   467  
   468  // WriteTo implements fs.FileOperations.WriteTo.
   469  func (s *SocketOperations) WriteTo(ctx context.Context, _ *fs.File, dst io.Writer, count int64, dup bool) (int64, error) {
   470  	s.readMu.Lock()
   471  	defer s.readMu.Unlock()
   472  
   473  	w := tcpip.LimitedWriter{
   474  		W: dst,
   475  		N: count,
   476  	}
   477  
   478  	// This may return a blocking error.
   479  	res, err := s.Endpoint.Read(&w, tcpip.ReadOptions{
   480  		Peek: dup,
   481  	})
   482  	if err != nil {
   483  		return 0, syserr.TranslateNetstackError(err).ToError()
   484  	}
   485  	return int64(res.Count), nil
   486  }
   487  
   488  // Write implements fs.FileOperations.Write.
   489  func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) {
   490  	r := src.Reader(ctx)
   491  	n, err := s.Endpoint.Write(r, tcpip.WriteOptions{})
   492  	if _, ok := err.(*tcpip.ErrWouldBlock); ok {
   493  		return 0, syserror.ErrWouldBlock
   494  	}
   495  	if err != nil {
   496  		return 0, syserr.TranslateNetstackError(err).ToError()
   497  	}
   498  
   499  	if n < src.NumBytes() {
   500  		return n, syserror.ErrWouldBlock
   501  	}
   502  
   503  	return n, nil
   504  }
   505  
// Compile-time check that *limitedPayloader implements tcpip.Payloader.
var _ tcpip.Payloader = (*limitedPayloader)(nil)
   507  
   508  type limitedPayloader struct {
   509  	inner io.LimitedReader
   510  	err   error
   511  }
   512  
   513  func (l *limitedPayloader) Read(p []byte) (int, error) {
   514  	n, err := l.inner.Read(p)
   515  	l.err = err
   516  	return n, err
   517  }
   518  
   519  func (l *limitedPayloader) Len() int {
   520  	return int(l.inner.N)
   521  }
   522  
   523  // ReadFrom implements fs.FileOperations.ReadFrom.
   524  func (s *SocketOperations) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader, count int64) (int64, error) {
   525  	f := limitedPayloader{
   526  		inner: io.LimitedReader{
   527  			R: r,
   528  			N: count,
   529  		},
   530  	}
   531  	n, err := s.Endpoint.Write(&f, tcpip.WriteOptions{
   532  		// Reads may be destructive but should be very fast,
   533  		// so we can't release the lock while copying data.
   534  		Atomic: true,
   535  	})
   536  	if _, ok := err.(*tcpip.ErrBadBuffer); ok {
   537  		return n, f.err
   538  	}
   539  	return n, syserr.TranslateNetstackError(err).ToError()
   540  }
   541  
// Readiness returns a mask of ready events for socket s.
//
// The query is forwarded unchanged to the backing endpoint's Readiness.
func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask {
	return s.Endpoint.Readiness(mask)
}
   546  
   547  func (s *socketOpsCommon) checkFamily(family uint16, exact bool) *syserr.Error {
   548  	if family == uint16(s.family) {
   549  		return nil
   550  	}
   551  	if !exact && family == linux.AF_INET && s.family == linux.AF_INET6 {
   552  		if !s.Endpoint.SocketOptions().GetV6Only() {
   553  			return nil
   554  		}
   555  	}
   556  	return syserr.ErrInvalidArgument
   557  }
   558  
   559  // mapFamily maps the AF_INET ANY address to the IPv4-mapped IPv6 ANY if the
   560  // receiver's family is AF_INET6.
   561  //
   562  // This is a hack to work around the fact that both IPv4 and IPv6 ANY are
   563  // represented by the empty string.
   564  //
   565  // TODO(github.com/SagerNet/issue/1556): remove this function.
   566  func (s *socketOpsCommon) mapFamily(addr tcpip.FullAddress, family uint16) tcpip.FullAddress {
   567  	if len(addr.Addr) == 0 && s.family == linux.AF_INET6 && family == linux.AF_INET {
   568  		addr.Addr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x00\x00"
   569  	}
   570  	return addr
   571  }
   572  
   573  // Connect implements the linux syscall connect(2) for sockets backed by
   574  // tpcip.Endpoint.
   575  func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
   576  	addr, family, err := socket.AddressAndFamily(sockaddr)
   577  	if err != nil {
   578  		return err
   579  	}
   580  
   581  	if family == linux.AF_UNSPEC {
   582  		err := s.Endpoint.Disconnect()
   583  		if _, ok := err.(*tcpip.ErrNotSupported); ok {
   584  			return syserr.ErrAddressFamilyNotSupported
   585  		}
   586  		return syserr.TranslateNetstackError(err)
   587  	}
   588  
   589  	if err := s.checkFamily(family, false /* exact */); err != nil {
   590  		return err
   591  	}
   592  	addr = s.mapFamily(addr, family)
   593  
   594  	// Always return right away in the non-blocking case.
   595  	if !blocking {
   596  		return syserr.TranslateNetstackError(s.Endpoint.Connect(addr))
   597  	}
   598  
   599  	// Register for notification when the endpoint becomes writable, then
   600  	// initiate the connection.
   601  	e, ch := waiter.NewChannelEntry(nil)
   602  	s.EventRegister(&e, waiter.WritableEvents)
   603  	defer s.EventUnregister(&e)
   604  
   605  	switch err := s.Endpoint.Connect(addr); err.(type) {
   606  	case *tcpip.ErrConnectStarted, *tcpip.ErrAlreadyConnecting:
   607  	case *tcpip.ErrNoPortAvailable:
   608  		if (s.family == unix.AF_INET || s.family == unix.AF_INET6) && s.skType == linux.SOCK_STREAM {
   609  			// TCP unlike UDP returns EADDRNOTAVAIL when it can't
   610  			// find an available local ephemeral port.
   611  			return syserr.ErrAddressNotAvailable
   612  		}
   613  		return syserr.TranslateNetstackError(err)
   614  	default:
   615  		return syserr.TranslateNetstackError(err)
   616  	}
   617  
   618  	// It's pending, so we have to wait for a notification, and fetch the
   619  	// result once the wait completes.
   620  	if err := t.Block(ch); err != nil {
   621  		return syserr.FromError(err)
   622  	}
   623  
   624  	// Call Connect() again after blocking to find connect's result.
   625  	return syserr.TranslateNetstackError(s.Endpoint.Connect(addr))
   626  }
   627  
   628  // Bind implements the linux syscall bind(2) for sockets backed by
   629  // tcpip.Endpoint.
   630  func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
   631  	if len(sockaddr) < 2 {
   632  		return syserr.ErrInvalidArgument
   633  	}
   634  
   635  	family := hostarch.ByteOrder.Uint16(sockaddr)
   636  	var addr tcpip.FullAddress
   637  
   638  	// Bind for AF_PACKET requires only family, protocol and ifindex.
   639  	// In function AddressAndFamily, we check the address length which is
   640  	// not needed for AF_PACKET bind.
   641  	if family == linux.AF_PACKET {
   642  		var a linux.SockAddrLink
   643  		if len(sockaddr) < sockAddrLinkSize {
   644  			return syserr.ErrInvalidArgument
   645  		}
   646  		a.UnmarshalBytes(sockaddr[:sockAddrLinkSize])
   647  
   648  		if a.Protocol != uint16(s.protocol) {
   649  			return syserr.ErrInvalidArgument
   650  		}
   651  
   652  		addr = tcpip.FullAddress{
   653  			NIC:  tcpip.NICID(a.InterfaceIndex),
   654  			Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]),
   655  		}
   656  	} else {
   657  		var err *syserr.Error
   658  		addr, family, err = socket.AddressAndFamily(sockaddr)
   659  		if err != nil {
   660  			return err
   661  		}
   662  
   663  		if err = s.checkFamily(family, true /* exact */); err != nil {
   664  			return err
   665  		}
   666  
   667  		addr = s.mapFamily(addr, family)
   668  	}
   669  
   670  	// Issue the bind request to the endpoint.
   671  	err := s.Endpoint.Bind(addr)
   672  	if _, ok := err.(*tcpip.ErrNoPortAvailable); ok {
   673  		// Bind always returns EADDRINUSE irrespective of if the specified port was
   674  		// already bound or if an ephemeral port was requested but none were
   675  		// available.
   676  		//
   677  		// *tcpip.ErrNoPortAvailable is mapped to EAGAIN in syserr package because
   678  		// UDP connect returns EAGAIN on ephemeral port exhaustion.
   679  		//
   680  		// TCP connect returns EADDRNOTAVAIL on ephemeral port exhaustion.
   681  		err = &tcpip.ErrPortInUse{}
   682  	}
   683  
   684  	return syserr.TranslateNetstackError(err)
   685  }
   686  
// Listen implements the linux syscall listen(2) for sockets backed by
// tcpip.Endpoint.
//
// backlog is passed straight to the endpoint's Listen; the resulting netstack
// error is translated into a syserr.Error.
func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error {
	return syserr.TranslateNetstackError(s.Endpoint.Listen(backlog))
}
   692  
   693  // blockingAccept implements a blocking version of accept(2), that is, if no
   694  // connections are ready to be accept, it will block until one becomes ready.
   695  func (s *socketOpsCommon) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) {
   696  	// Register for notifications.
   697  	e, ch := waiter.NewChannelEntry(nil)
   698  	s.EventRegister(&e, waiter.ReadableEvents)
   699  	defer s.EventUnregister(&e)
   700  
   701  	// Try to accept the connection again; if it fails, then wait until we
   702  	// get a notification.
   703  	for {
   704  		ep, wq, err := s.Endpoint.Accept(peerAddr)
   705  		if _, ok := err.(*tcpip.ErrWouldBlock); !ok {
   706  			return ep, wq, syserr.TranslateNetstackError(err)
   707  		}
   708  
   709  		if err := t.Block(ch); err != nil {
   710  			return nil, nil, syserr.FromError(err)
   711  		}
   712  	}
   713  }
   714  
   715  // Accept implements the linux syscall accept(2) for sockets backed by
   716  // tcpip.Endpoint.
   717  func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
   718  	var peerAddr *tcpip.FullAddress
   719  	if peerRequested {
   720  		peerAddr = &tcpip.FullAddress{}
   721  	}
   722  	ep, wq, terr := s.Endpoint.Accept(peerAddr)
   723  	if terr != nil {
   724  		if _, ok := terr.(*tcpip.ErrWouldBlock); !ok || !blocking {
   725  			return 0, nil, 0, syserr.TranslateNetstackError(terr)
   726  		}
   727  
   728  		var err *syserr.Error
   729  		ep, wq, err = s.blockingAccept(t, peerAddr)
   730  		if err != nil {
   731  			return 0, nil, 0, err
   732  		}
   733  	}
   734  
   735  	ns, err := New(t, s.family, s.skType, s.protocol, wq, ep)
   736  	if err != nil {
   737  		return 0, nil, 0, err
   738  	}
   739  	defer ns.DecRef(t)
   740  
   741  	if flags&linux.SOCK_NONBLOCK != 0 {
   742  		flags := ns.Flags()
   743  		flags.NonBlocking = true
   744  		ns.SetFlags(flags.Settable())
   745  	}
   746  
   747  	var addr linux.SockAddr
   748  	var addrLen uint32
   749  	if peerAddr != nil {
   750  		addr, addrLen = socket.ConvertAddress(s.family, *peerAddr)
   751  	}
   752  
   753  	fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{
   754  		CloseOnExec: flags&linux.SOCK_CLOEXEC != 0,
   755  	})
   756  
   757  	t.Kernel().RecordSocket(ns)
   758  
   759  	return fd, addr, addrLen, syserr.FromError(e)
   760  }
   761  
   762  // ConvertShutdown converts Linux shutdown flags into tcpip shutdown flags.
   763  func ConvertShutdown(how int) (tcpip.ShutdownFlags, *syserr.Error) {
   764  	var f tcpip.ShutdownFlags
   765  	switch how {
   766  	case linux.SHUT_RD:
   767  		f = tcpip.ShutdownRead
   768  	case linux.SHUT_WR:
   769  		f = tcpip.ShutdownWrite
   770  	case linux.SHUT_RDWR:
   771  		f = tcpip.ShutdownRead | tcpip.ShutdownWrite
   772  	default:
   773  		return 0, syserr.ErrInvalidArgument
   774  	}
   775  	return f, nil
   776  }
   777  
   778  // Shutdown implements the linux syscall shutdown(2) for sockets backed by
   779  // tcpip.Endpoint.
   780  func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error {
   781  	f, err := ConvertShutdown(how)
   782  	if err != nil {
   783  		return err
   784  	}
   785  
   786  	// Issue shutdown request.
   787  	return syserr.TranslateNetstackError(s.Endpoint.Shutdown(f))
   788  }
   789  
   790  // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
   791  // tcpip.Endpoint.
   792  func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
   793  	// TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is
   794  	// implemented specifically for netstack.SocketOperations rather than
   795  	// commonEndpoint. commonEndpoint should be extended to support socket
   796  	// options where the implementation is not shared, as unix sockets need
   797  	// their own support for SO_TIMESTAMP.
   798  	if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP {
   799  		if outLen < sizeOfInt32 {
   800  			return nil, syserr.ErrInvalidArgument
   801  		}
   802  		val := primitive.Int32(0)
   803  		s.readMu.Lock()
   804  		defer s.readMu.Unlock()
   805  		if s.sockOptTimestamp {
   806  			val = 1
   807  		}
   808  		return &val, nil
   809  	}
   810  	if level == linux.SOL_TCP && name == linux.TCP_INQ {
   811  		if outLen < sizeOfInt32 {
   812  			return nil, syserr.ErrInvalidArgument
   813  		}
   814  		val := primitive.Int32(0)
   815  		s.readMu.Lock()
   816  		defer s.readMu.Unlock()
   817  		if s.sockOptInq {
   818  			val = 1
   819  		}
   820  		return &val, nil
   821  	}
   822  
   823  	return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outPtr, outLen)
   824  }
   825  
   826  // GetSockOpt can be used to implement the linux syscall getsockopt(2) for
   827  // sockets backed by a commonEndpoint.
   828  func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, level, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
   829  	switch level {
   830  	case linux.SOL_SOCKET:
   831  		return getSockOptSocket(t, s, ep, family, skType, name, outLen)
   832  
   833  	case linux.SOL_TCP:
   834  		return getSockOptTCP(t, s, ep, name, outLen)
   835  
   836  	case linux.SOL_IPV6:
   837  		return getSockOptIPv6(t, s, ep, name, outPtr, outLen)
   838  
   839  	case linux.SOL_IP:
   840  		return getSockOptIP(t, s, ep, name, outPtr, outLen, family)
   841  
   842  	case linux.SOL_UDP,
   843  		linux.SOL_ICMPV6,
   844  		linux.SOL_RAW,
   845  		linux.SOL_PACKET:
   846  
   847  		t.Kernel().EmitUnimplementedEvent(t)
   848  	}
   849  
   850  	return nil, syserr.ErrProtocolNotAvailable
   851  }
   852  
   853  func boolToInt32(v bool) int32 {
   854  	if v {
   855  		return 1
   856  	}
   857  	return 0
   858  }
   859  
   860  // getSockOptSocket implements GetSockOpt when level is SOL_SOCKET.
   861  func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (marshal.Marshallable, *syserr.Error) {
   862  	// TODO(b/124056281): Stop rejecting short optLen values in getsockopt.
   863  	switch name {
   864  	case linux.SO_ERROR:
   865  		if outLen < sizeOfInt32 {
   866  			return nil, syserr.ErrInvalidArgument
   867  		}
   868  
   869  		// Get the last error and convert it.
   870  		err := ep.SocketOptions().GetLastError()
   871  		if err == nil {
   872  			optP := primitive.Int32(0)
   873  			return &optP, nil
   874  		}
   875  
   876  		optP := primitive.Int32(syserr.TranslateNetstackError(err).ToLinux())
   877  		return &optP, nil
   878  
   879  	case linux.SO_PEERCRED:
   880  		if family != linux.AF_UNIX || outLen < unix.SizeofUcred {
   881  			return nil, syserr.ErrInvalidArgument
   882  		}
   883  
   884  		tcred := t.Credentials()
   885  		creds := linux.ControlMessageCredentials{
   886  			PID: int32(t.ThreadGroup().ID()),
   887  			UID: uint32(tcred.EffectiveKUID.In(tcred.UserNamespace).OrOverflow()),
   888  			GID: uint32(tcred.EffectiveKGID.In(tcred.UserNamespace).OrOverflow()),
   889  		}
   890  		return &creds, nil
   891  
   892  	case linux.SO_PASSCRED:
   893  		if outLen < sizeOfInt32 {
   894  			return nil, syserr.ErrInvalidArgument
   895  		}
   896  
   897  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetPassCred()))
   898  		return &v, nil
   899  
   900  	case linux.SO_SNDBUF:
   901  		if outLen < sizeOfInt32 {
   902  			return nil, syserr.ErrInvalidArgument
   903  		}
   904  
   905  		size := ep.SocketOptions().GetSendBufferSize()
   906  
   907  		if size > math.MaxInt32 {
   908  			size = math.MaxInt32
   909  		}
   910  
   911  		sizeP := primitive.Int32(size)
   912  		return &sizeP, nil
   913  
   914  	case linux.SO_RCVBUF:
   915  		if outLen < sizeOfInt32 {
   916  			return nil, syserr.ErrInvalidArgument
   917  		}
   918  
   919  		size := ep.SocketOptions().GetReceiveBufferSize()
   920  
   921  		if size > math.MaxInt32 {
   922  			size = math.MaxInt32
   923  		}
   924  
   925  		sizeP := primitive.Int32(size)
   926  		return &sizeP, nil
   927  
   928  	case linux.SO_REUSEADDR:
   929  		if outLen < sizeOfInt32 {
   930  			return nil, syserr.ErrInvalidArgument
   931  		}
   932  
   933  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReuseAddress()))
   934  		return &v, nil
   935  
   936  	case linux.SO_REUSEPORT:
   937  		if outLen < sizeOfInt32 {
   938  			return nil, syserr.ErrInvalidArgument
   939  		}
   940  
   941  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReusePort()))
   942  		return &v, nil
   943  
   944  	case linux.SO_BINDTODEVICE:
   945  		v := ep.SocketOptions().GetBindToDevice()
   946  		if v == 0 {
   947  			var b primitive.ByteSlice
   948  			return &b, nil
   949  		}
   950  		if outLen < linux.IFNAMSIZ {
   951  			return nil, syserr.ErrInvalidArgument
   952  		}
   953  		s := t.NetworkContext()
   954  		if s == nil {
   955  			return nil, syserr.ErrNoDevice
   956  		}
   957  		nic, ok := s.Interfaces()[int32(v)]
   958  		if !ok {
   959  			// The NICID no longer indicates a valid interface, probably because that
   960  			// interface was removed.
   961  			return nil, syserr.ErrUnknownDevice
   962  		}
   963  
   964  		name := primitive.ByteSlice(append([]byte(nic.Name), 0))
   965  		return &name, nil
   966  
   967  	case linux.SO_BROADCAST:
   968  		if outLen < sizeOfInt32 {
   969  			return nil, syserr.ErrInvalidArgument
   970  		}
   971  
   972  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetBroadcast()))
   973  		return &v, nil
   974  
   975  	case linux.SO_KEEPALIVE:
   976  		if outLen < sizeOfInt32 {
   977  			return nil, syserr.ErrInvalidArgument
   978  		}
   979  
   980  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetKeepAlive()))
   981  		return &v, nil
   982  
   983  	case linux.SO_LINGER:
   984  		if outLen < linux.SizeOfLinger {
   985  			return nil, syserr.ErrInvalidArgument
   986  		}
   987  
   988  		var linger linux.Linger
   989  		v := ep.SocketOptions().GetLinger()
   990  
   991  		if v.Enabled {
   992  			linger.OnOff = 1
   993  		}
   994  		linger.Linger = int32(v.Timeout.Seconds())
   995  		return &linger, nil
   996  
   997  	case linux.SO_SNDTIMEO:
   998  		// TODO(igudger): Linux allows shorter lengths for partial results.
   999  		if outLen < linux.SizeOfTimeval {
  1000  			return nil, syserr.ErrInvalidArgument
  1001  		}
  1002  
  1003  		sendTimeout := linux.NsecToTimeval(s.SendTimeout())
  1004  		return &sendTimeout, nil
  1005  
  1006  	case linux.SO_RCVTIMEO:
  1007  		// TODO(igudger): Linux allows shorter lengths for partial results.
  1008  		if outLen < linux.SizeOfTimeval {
  1009  			return nil, syserr.ErrInvalidArgument
  1010  		}
  1011  
  1012  		recvTimeout := linux.NsecToTimeval(s.RecvTimeout())
  1013  		return &recvTimeout, nil
  1014  
  1015  	case linux.SO_OOBINLINE:
  1016  		if outLen < sizeOfInt32 {
  1017  			return nil, syserr.ErrInvalidArgument
  1018  		}
  1019  
  1020  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetOutOfBandInline()))
  1021  		return &v, nil
  1022  
  1023  	case linux.SO_NO_CHECK:
  1024  		if outLen < sizeOfInt32 {
  1025  			return nil, syserr.ErrInvalidArgument
  1026  		}
  1027  
  1028  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetNoChecksum()))
  1029  		return &v, nil
  1030  
  1031  	case linux.SO_ACCEPTCONN:
  1032  		if outLen < sizeOfInt32 {
  1033  			return nil, syserr.ErrInvalidArgument
  1034  		}
  1035  
  1036  		// This option is only viable for TCP endpoints.
  1037  		var v bool
  1038  		if _, skType, skProto := s.Type(); isTCPSocket(skType, skProto) {
  1039  			v = tcp.EndpointState(ep.State()) == tcp.StateListen
  1040  		}
  1041  		vP := primitive.Int32(boolToInt32(v))
  1042  		return &vP, nil
  1043  
  1044  	default:
  1045  		socket.GetSockOptEmitUnimplementedEvent(t, name)
  1046  	}
  1047  	return nil, syserr.ErrProtocolNotAvailable
  1048  }
  1049  
  1050  // getSockOptTCP implements GetSockOpt when level is SOL_TCP.
  1051  func getSockOptTCP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name, outLen int) (marshal.Marshallable, *syserr.Error) {
  1052  	if _, skType, skProto := s.Type(); !isTCPSocket(skType, skProto) {
  1053  		log.Warningf("SOL_TCP options are only supported on TCP sockets: skType, skProto = %v, %d", skType, skProto)
  1054  		return nil, syserr.ErrUnknownProtocolOption
  1055  	}
  1056  
  1057  	switch name {
  1058  	case linux.TCP_NODELAY:
  1059  		if outLen < sizeOfInt32 {
  1060  			return nil, syserr.ErrInvalidArgument
  1061  		}
  1062  
  1063  		v := primitive.Int32(boolToInt32(!ep.SocketOptions().GetDelayOption()))
  1064  		return &v, nil
  1065  
  1066  	case linux.TCP_CORK:
  1067  		if outLen < sizeOfInt32 {
  1068  			return nil, syserr.ErrInvalidArgument
  1069  		}
  1070  
  1071  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetCorkOption()))
  1072  		return &v, nil
  1073  
  1074  	case linux.TCP_QUICKACK:
  1075  		if outLen < sizeOfInt32 {
  1076  			return nil, syserr.ErrInvalidArgument
  1077  		}
  1078  
  1079  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetQuickAck()))
  1080  		return &v, nil
  1081  
  1082  	case linux.TCP_MAXSEG:
  1083  		if outLen < sizeOfInt32 {
  1084  			return nil, syserr.ErrInvalidArgument
  1085  		}
  1086  
  1087  		v, err := ep.GetSockOptInt(tcpip.MaxSegOption)
  1088  		if err != nil {
  1089  			return nil, syserr.TranslateNetstackError(err)
  1090  		}
  1091  		vP := primitive.Int32(v)
  1092  		return &vP, nil
  1093  
  1094  	case linux.TCP_KEEPIDLE:
  1095  		if outLen < sizeOfInt32 {
  1096  			return nil, syserr.ErrInvalidArgument
  1097  		}
  1098  
  1099  		var v tcpip.KeepaliveIdleOption
  1100  		if err := ep.GetSockOpt(&v); err != nil {
  1101  			return nil, syserr.TranslateNetstackError(err)
  1102  		}
  1103  		keepAliveIdle := primitive.Int32(time.Duration(v) / time.Second)
  1104  		return &keepAliveIdle, nil
  1105  
  1106  	case linux.TCP_KEEPINTVL:
  1107  		if outLen < sizeOfInt32 {
  1108  			return nil, syserr.ErrInvalidArgument
  1109  		}
  1110  
  1111  		var v tcpip.KeepaliveIntervalOption
  1112  		if err := ep.GetSockOpt(&v); err != nil {
  1113  			return nil, syserr.TranslateNetstackError(err)
  1114  		}
  1115  		keepAliveInterval := primitive.Int32(time.Duration(v) / time.Second)
  1116  		return &keepAliveInterval, nil
  1117  
  1118  	case linux.TCP_KEEPCNT:
  1119  		if outLen < sizeOfInt32 {
  1120  			return nil, syserr.ErrInvalidArgument
  1121  		}
  1122  
  1123  		v, err := ep.GetSockOptInt(tcpip.KeepaliveCountOption)
  1124  		if err != nil {
  1125  			return nil, syserr.TranslateNetstackError(err)
  1126  		}
  1127  		vP := primitive.Int32(v)
  1128  		return &vP, nil
  1129  
  1130  	case linux.TCP_USER_TIMEOUT:
  1131  		if outLen < sizeOfInt32 {
  1132  			return nil, syserr.ErrInvalidArgument
  1133  		}
  1134  
  1135  		var v tcpip.TCPUserTimeoutOption
  1136  		if err := ep.GetSockOpt(&v); err != nil {
  1137  			return nil, syserr.TranslateNetstackError(err)
  1138  		}
  1139  		tcpUserTimeout := primitive.Int32(time.Duration(v) / time.Millisecond)
  1140  		return &tcpUserTimeout, nil
  1141  
  1142  	case linux.TCP_INFO:
  1143  		var v tcpip.TCPInfoOption
  1144  		if err := ep.GetSockOpt(&v); err != nil {
  1145  			return nil, syserr.TranslateNetstackError(err)
  1146  		}
  1147  
  1148  		// TODO(b/64800844): Translate fields once they are added to
  1149  		// tcpip.TCPInfoOption.
  1150  		info := linux.TCPInfo{
  1151  			State:       uint8(v.State),
  1152  			RTO:         uint32(v.RTO / time.Microsecond),
  1153  			RTT:         uint32(v.RTT / time.Microsecond),
  1154  			RTTVar:      uint32(v.RTTVar / time.Microsecond),
  1155  			SndSsthresh: v.SndSsthresh,
  1156  			SndCwnd:     v.SndCwnd,
  1157  		}
  1158  		switch v.CcState {
  1159  		case tcpip.RTORecovery:
  1160  			info.CaState = linux.TCP_CA_Loss
  1161  		case tcpip.FastRecovery, tcpip.SACKRecovery:
  1162  			info.CaState = linux.TCP_CA_Recovery
  1163  		case tcpip.Disorder:
  1164  			info.CaState = linux.TCP_CA_Disorder
  1165  		case tcpip.Open:
  1166  			info.CaState = linux.TCP_CA_Open
  1167  		}
  1168  
  1169  		// In netstack reorderSeen is updated only when RACK is enabled.
  1170  		// We only track whether the reordering is seen, which is
  1171  		// different than Linux where reorderSeen is not specific to
  1172  		// RACK and is incremented when a reordering event is seen.
  1173  		if v.ReorderSeen {
  1174  			info.ReordSeen = 1
  1175  		}
  1176  
  1177  		// Linux truncates the output binary to outLen.
  1178  		buf := t.CopyScratchBuffer(info.SizeBytes())
  1179  		info.MarshalUnsafe(buf)
  1180  		if len(buf) > outLen {
  1181  			buf = buf[:outLen]
  1182  		}
  1183  		bufP := primitive.ByteSlice(buf)
  1184  		return &bufP, nil
  1185  
  1186  	case linux.TCP_CC_INFO,
  1187  		linux.TCP_NOTSENT_LOWAT,
  1188  		linux.TCP_ZEROCOPY_RECEIVE:
  1189  
  1190  		t.Kernel().EmitUnimplementedEvent(t)
  1191  
  1192  	case linux.TCP_CONGESTION:
  1193  		if outLen <= 0 {
  1194  			return nil, syserr.ErrInvalidArgument
  1195  		}
  1196  
  1197  		var v tcpip.CongestionControlOption
  1198  		if err := ep.GetSockOpt(&v); err != nil {
  1199  			return nil, syserr.TranslateNetstackError(err)
  1200  		}
  1201  
  1202  		// We match linux behaviour here where it returns the lower of
  1203  		// TCP_CA_NAME_MAX bytes or the value of the option length.
  1204  		//
  1205  		// This is Linux's net/tcp.h TCP_CA_NAME_MAX.
  1206  		const tcpCANameMax = 16
  1207  
  1208  		toCopy := tcpCANameMax
  1209  		if outLen < tcpCANameMax {
  1210  			toCopy = outLen
  1211  		}
  1212  		b := make([]byte, toCopy)
  1213  		copy(b, v)
  1214  
  1215  		bP := primitive.ByteSlice(b)
  1216  		return &bP, nil
  1217  
  1218  	case linux.TCP_LINGER2:
  1219  		if outLen < sizeOfInt32 {
  1220  			return nil, syserr.ErrInvalidArgument
  1221  		}
  1222  
  1223  		var v tcpip.TCPLingerTimeoutOption
  1224  		if err := ep.GetSockOpt(&v); err != nil {
  1225  			return nil, syserr.TranslateNetstackError(err)
  1226  		}
  1227  		var lingerTimeout primitive.Int32
  1228  		if v >= 0 {
  1229  			lingerTimeout = primitive.Int32(time.Duration(v) / time.Second)
  1230  		} else {
  1231  			lingerTimeout = -1
  1232  		}
  1233  		return &lingerTimeout, nil
  1234  
  1235  	case linux.TCP_DEFER_ACCEPT:
  1236  		if outLen < sizeOfInt32 {
  1237  			return nil, syserr.ErrInvalidArgument
  1238  		}
  1239  
  1240  		var v tcpip.TCPDeferAcceptOption
  1241  		if err := ep.GetSockOpt(&v); err != nil {
  1242  			return nil, syserr.TranslateNetstackError(err)
  1243  		}
  1244  
  1245  		tcpDeferAccept := primitive.Int32(time.Duration(v) / time.Second)
  1246  		return &tcpDeferAccept, nil
  1247  
  1248  	case linux.TCP_SYNCNT:
  1249  		if outLen < sizeOfInt32 {
  1250  			return nil, syserr.ErrInvalidArgument
  1251  		}
  1252  
  1253  		v, err := ep.GetSockOptInt(tcpip.TCPSynCountOption)
  1254  		if err != nil {
  1255  			return nil, syserr.TranslateNetstackError(err)
  1256  		}
  1257  		vP := primitive.Int32(v)
  1258  		return &vP, nil
  1259  
  1260  	case linux.TCP_WINDOW_CLAMP:
  1261  		if outLen < sizeOfInt32 {
  1262  			return nil, syserr.ErrInvalidArgument
  1263  		}
  1264  
  1265  		v, err := ep.GetSockOptInt(tcpip.TCPWindowClampOption)
  1266  		if err != nil {
  1267  			return nil, syserr.TranslateNetstackError(err)
  1268  		}
  1269  		vP := primitive.Int32(v)
  1270  		return &vP, nil
  1271  	default:
  1272  		emitUnimplementedEventTCP(t, name)
  1273  	}
  1274  	return nil, syserr.ErrProtocolNotAvailable
  1275  }
  1276  
  1277  // getSockOptIPv6 implements GetSockOpt when level is SOL_IPV6.
  1278  func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
  1279  	if _, ok := ep.(tcpip.Endpoint); !ok {
  1280  		log.Warningf("SOL_IPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
  1281  		return nil, syserr.ErrUnknownProtocolOption
  1282  	}
  1283  
  1284  	family, skType, _ := s.Type()
  1285  	if family != linux.AF_INET6 {
  1286  		return nil, syserr.ErrUnknownProtocolOption
  1287  	}
  1288  
  1289  	switch name {
  1290  	case linux.IPV6_V6ONLY:
  1291  		if outLen < sizeOfInt32 {
  1292  			return nil, syserr.ErrInvalidArgument
  1293  		}
  1294  
  1295  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetV6Only()))
  1296  		return &v, nil
  1297  
  1298  	case linux.IPV6_PATHMTU:
  1299  		t.Kernel().EmitUnimplementedEvent(t)
  1300  
  1301  	case linux.IPV6_TCLASS:
  1302  		// Length handling for parity with Linux.
  1303  		if outLen == 0 {
  1304  			var b primitive.ByteSlice
  1305  			return &b, nil
  1306  		}
  1307  		v, err := ep.GetSockOptInt(tcpip.IPv6TrafficClassOption)
  1308  		if err != nil {
  1309  			return nil, syserr.TranslateNetstackError(err)
  1310  		}
  1311  
  1312  		uintv := primitive.Uint32(v)
  1313  		// Linux truncates the output binary to outLen.
  1314  		ib := t.CopyScratchBuffer(uintv.SizeBytes())
  1315  		uintv.MarshalUnsafe(ib)
  1316  		// Handle cases where outLen is lesser than sizeOfInt32.
  1317  		if len(ib) > outLen {
  1318  			ib = ib[:outLen]
  1319  		}
  1320  		ibP := primitive.ByteSlice(ib)
  1321  		return &ibP, nil
  1322  
  1323  	case linux.IPV6_RECVTCLASS:
  1324  		if outLen < sizeOfInt32 {
  1325  			return nil, syserr.ErrInvalidArgument
  1326  		}
  1327  
  1328  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTClass()))
  1329  		return &v, nil
  1330  	case linux.IPV6_RECVERR:
  1331  		if outLen < sizeOfInt32 {
  1332  			return nil, syserr.ErrInvalidArgument
  1333  		}
  1334  
  1335  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetRecvError()))
  1336  		return &v, nil
  1337  
  1338  	case linux.IPV6_RECVORIGDSTADDR:
  1339  		if outLen < sizeOfInt32 {
  1340  			return nil, syserr.ErrInvalidArgument
  1341  		}
  1342  
  1343  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveOriginalDstAddress()))
  1344  		return &v, nil
  1345  
  1346  	case linux.IP6T_ORIGINAL_DST:
  1347  		if outLen < sockAddrInet6Size {
  1348  			return nil, syserr.ErrInvalidArgument
  1349  		}
  1350  
  1351  		var v tcpip.OriginalDestinationOption
  1352  		if err := ep.GetSockOpt(&v); err != nil {
  1353  			return nil, syserr.TranslateNetstackError(err)
  1354  		}
  1355  
  1356  		a, _ := socket.ConvertAddress(linux.AF_INET6, tcpip.FullAddress(v))
  1357  		return a.(*linux.SockAddrInet6), nil
  1358  
  1359  	case linux.IP6T_SO_GET_INFO:
  1360  		if outLen < linux.SizeOfIPTGetinfo {
  1361  			return nil, syserr.ErrInvalidArgument
  1362  		}
  1363  
  1364  		// Only valid for raw IPv6 sockets.
  1365  		if skType != linux.SOCK_RAW {
  1366  			return nil, syserr.ErrProtocolNotAvailable
  1367  		}
  1368  
  1369  		stack := inet.StackFromContext(t)
  1370  		if stack == nil {
  1371  			return nil, syserr.ErrNoDevice
  1372  		}
  1373  		info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr, true)
  1374  		if err != nil {
  1375  			return nil, err
  1376  		}
  1377  		return &info, nil
  1378  
  1379  	case linux.IP6T_SO_GET_ENTRIES:
  1380  		// IPTGetEntries is reused for IPv6.
  1381  		if outLen < linux.SizeOfIPTGetEntries {
  1382  			return nil, syserr.ErrInvalidArgument
  1383  		}
  1384  		// Only valid for raw IPv6 sockets.
  1385  		if skType != linux.SOCK_RAW {
  1386  			return nil, syserr.ErrProtocolNotAvailable
  1387  		}
  1388  
  1389  		stack := inet.StackFromContext(t)
  1390  		if stack == nil {
  1391  			return nil, syserr.ErrNoDevice
  1392  		}
  1393  		entries, err := netfilter.GetEntries6(t, stack.(*Stack).Stack, outPtr, outLen)
  1394  		if err != nil {
  1395  			return nil, err
  1396  		}
  1397  		return &entries, nil
  1398  
  1399  	case linux.IP6T_SO_GET_REVISION_TARGET:
  1400  		if outLen < linux.SizeOfXTGetRevision {
  1401  			return nil, syserr.ErrInvalidArgument
  1402  		}
  1403  
  1404  		// Only valid for raw IPv6 sockets.
  1405  		if skType != linux.SOCK_RAW {
  1406  			return nil, syserr.ErrProtocolNotAvailable
  1407  		}
  1408  
  1409  		stack := inet.StackFromContext(t)
  1410  		if stack == nil {
  1411  			return nil, syserr.ErrNoDevice
  1412  		}
  1413  		ret, err := netfilter.TargetRevision(t, outPtr, header.IPv6ProtocolNumber)
  1414  		if err != nil {
  1415  			return nil, err
  1416  		}
  1417  		return &ret, nil
  1418  
  1419  	default:
  1420  		emitUnimplementedEventIPv6(t, name)
  1421  	}
  1422  	return nil, syserr.ErrProtocolNotAvailable
  1423  }
  1424  
  1425  // getSockOptIP implements GetSockOpt when level is SOL_IP.
  1426  func getSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, outPtr hostarch.Addr, outLen int, family int) (marshal.Marshallable, *syserr.Error) {
  1427  	if _, ok := ep.(tcpip.Endpoint); !ok {
  1428  		log.Warningf("SOL_IP options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
  1429  		return nil, syserr.ErrUnknownProtocolOption
  1430  	}
  1431  
  1432  	switch name {
  1433  	case linux.IP_TTL:
  1434  		if outLen < sizeOfInt32 {
  1435  			return nil, syserr.ErrInvalidArgument
  1436  		}
  1437  
  1438  		v, err := ep.GetSockOptInt(tcpip.TTLOption)
  1439  		if err != nil {
  1440  			return nil, syserr.TranslateNetstackError(err)
  1441  		}
  1442  
  1443  		// Fill in the default value, if needed.
  1444  		vP := primitive.Int32(v)
  1445  		if vP == 0 {
  1446  			vP = DefaultTTL
  1447  		}
  1448  
  1449  		return &vP, nil
  1450  
  1451  	case linux.IP_MULTICAST_TTL:
  1452  		if outLen < sizeOfInt32 {
  1453  			return nil, syserr.ErrInvalidArgument
  1454  		}
  1455  
  1456  		v, err := ep.GetSockOptInt(tcpip.MulticastTTLOption)
  1457  		if err != nil {
  1458  			return nil, syserr.TranslateNetstackError(err)
  1459  		}
  1460  
  1461  		vP := primitive.Int32(v)
  1462  		return &vP, nil
  1463  
  1464  	case linux.IP_MULTICAST_IF:
  1465  		if outLen < len(linux.InetAddr{}) {
  1466  			return nil, syserr.ErrInvalidArgument
  1467  		}
  1468  
  1469  		var v tcpip.MulticastInterfaceOption
  1470  		if err := ep.GetSockOpt(&v); err != nil {
  1471  			return nil, syserr.TranslateNetstackError(err)
  1472  		}
  1473  
  1474  		a, _ := socket.ConvertAddress(linux.AF_INET, tcpip.FullAddress{Addr: v.InterfaceAddr})
  1475  
  1476  		return &a.(*linux.SockAddrInet).Addr, nil
  1477  
  1478  	case linux.IP_MULTICAST_LOOP:
  1479  		if outLen < sizeOfInt32 {
  1480  			return nil, syserr.ErrInvalidArgument
  1481  		}
  1482  
  1483  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetMulticastLoop()))
  1484  		return &v, nil
  1485  
  1486  	case linux.IP_TOS:
  1487  		// Length handling for parity with Linux.
  1488  		if outLen == 0 {
  1489  			var b primitive.ByteSlice
  1490  			return &b, nil
  1491  		}
  1492  		v, err := ep.GetSockOptInt(tcpip.IPv4TOSOption)
  1493  		if err != nil {
  1494  			return nil, syserr.TranslateNetstackError(err)
  1495  		}
  1496  		if outLen < sizeOfInt32 {
  1497  			vP := primitive.Uint8(v)
  1498  			return &vP, nil
  1499  		}
  1500  		vP := primitive.Int32(v)
  1501  		return &vP, nil
  1502  
  1503  	case linux.IP_RECVTOS:
  1504  		if outLen < sizeOfInt32 {
  1505  			return nil, syserr.ErrInvalidArgument
  1506  		}
  1507  
  1508  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTOS()))
  1509  		return &v, nil
  1510  
  1511  	case linux.IP_RECVERR:
  1512  		if outLen < sizeOfInt32 {
  1513  			return nil, syserr.ErrInvalidArgument
  1514  		}
  1515  
  1516  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetRecvError()))
  1517  		return &v, nil
  1518  
  1519  	case linux.IP_PKTINFO:
  1520  		if outLen < sizeOfInt32 {
  1521  			return nil, syserr.ErrInvalidArgument
  1522  		}
  1523  
  1524  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceivePacketInfo()))
  1525  		return &v, nil
  1526  
  1527  	case linux.IP_HDRINCL:
  1528  		if outLen < sizeOfInt32 {
  1529  			return nil, syserr.ErrInvalidArgument
  1530  		}
  1531  
  1532  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetHeaderIncluded()))
  1533  		return &v, nil
  1534  
  1535  	case linux.IP_RECVORIGDSTADDR:
  1536  		if outLen < sizeOfInt32 {
  1537  			return nil, syserr.ErrInvalidArgument
  1538  		}
  1539  
  1540  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveOriginalDstAddress()))
  1541  		return &v, nil
  1542  
  1543  	case linux.SO_ORIGINAL_DST:
  1544  		if outLen < sockAddrInetSize {
  1545  			return nil, syserr.ErrInvalidArgument
  1546  		}
  1547  
  1548  		var v tcpip.OriginalDestinationOption
  1549  		if err := ep.GetSockOpt(&v); err != nil {
  1550  			return nil, syserr.TranslateNetstackError(err)
  1551  		}
  1552  
  1553  		a, _ := socket.ConvertAddress(linux.AF_INET, tcpip.FullAddress(v))
  1554  		return a.(*linux.SockAddrInet), nil
  1555  
  1556  	case linux.IPT_SO_GET_INFO:
  1557  		if outLen < linux.SizeOfIPTGetinfo {
  1558  			return nil, syserr.ErrInvalidArgument
  1559  		}
  1560  
  1561  		// Only valid for raw IPv4 sockets.
  1562  		if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
  1563  			return nil, syserr.ErrProtocolNotAvailable
  1564  		}
  1565  
  1566  		stack := inet.StackFromContext(t)
  1567  		if stack == nil {
  1568  			return nil, syserr.ErrNoDevice
  1569  		}
  1570  		info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr, false)
  1571  		if err != nil {
  1572  			return nil, err
  1573  		}
  1574  		return &info, nil
  1575  
  1576  	case linux.IPT_SO_GET_ENTRIES:
  1577  		if outLen < linux.SizeOfIPTGetEntries {
  1578  			return nil, syserr.ErrInvalidArgument
  1579  		}
  1580  
  1581  		// Only valid for raw IPv4 sockets.
  1582  		if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
  1583  			return nil, syserr.ErrProtocolNotAvailable
  1584  		}
  1585  
  1586  		stack := inet.StackFromContext(t)
  1587  		if stack == nil {
  1588  			return nil, syserr.ErrNoDevice
  1589  		}
  1590  		entries, err := netfilter.GetEntries4(t, stack.(*Stack).Stack, outPtr, outLen)
  1591  		if err != nil {
  1592  			return nil, err
  1593  		}
  1594  		return &entries, nil
  1595  
  1596  	case linux.IPT_SO_GET_REVISION_TARGET:
  1597  		if outLen < linux.SizeOfXTGetRevision {
  1598  			return nil, syserr.ErrInvalidArgument
  1599  		}
  1600  
  1601  		// Only valid for raw IPv4 sockets.
  1602  		if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
  1603  			return nil, syserr.ErrProtocolNotAvailable
  1604  		}
  1605  
  1606  		stack := inet.StackFromContext(t)
  1607  		if stack == nil {
  1608  			return nil, syserr.ErrNoDevice
  1609  		}
  1610  		ret, err := netfilter.TargetRevision(t, outPtr, header.IPv4ProtocolNumber)
  1611  		if err != nil {
  1612  			return nil, err
  1613  		}
  1614  		return &ret, nil
  1615  
  1616  	default:
  1617  		emitUnimplementedEventIP(t, name)
  1618  	}
  1619  	return nil, syserr.ErrProtocolNotAvailable
  1620  }
  1621  
  1622  // SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by
  1623  // tcpip.Endpoint.
  1624  func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error {
  1625  	// TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is
  1626  	// implemented specifically for netstack.SocketOperations rather than
  1627  	// commonEndpoint. commonEndpoint should be extended to support socket
  1628  	// options where the implementation is not shared, as unix sockets need
  1629  	// their own support for SO_TIMESTAMP.
  1630  	if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP {
  1631  		if len(optVal) < sizeOfInt32 {
  1632  			return syserr.ErrInvalidArgument
  1633  		}
  1634  		s.readMu.Lock()
  1635  		defer s.readMu.Unlock()
  1636  		s.sockOptTimestamp = hostarch.ByteOrder.Uint32(optVal) != 0
  1637  		return nil
  1638  	}
  1639  	if level == linux.SOL_TCP && name == linux.TCP_INQ {
  1640  		if len(optVal) < sizeOfInt32 {
  1641  			return syserr.ErrInvalidArgument
  1642  		}
  1643  		s.readMu.Lock()
  1644  		defer s.readMu.Unlock()
  1645  		s.sockOptInq = hostarch.ByteOrder.Uint32(optVal) != 0
  1646  		return nil
  1647  	}
  1648  
  1649  	return SetSockOpt(t, s, s.Endpoint, level, name, optVal)
  1650  }
  1651  
  1652  // SetSockOpt can be used to implement the linux syscall setsockopt(2) for
  1653  // sockets backed by a commonEndpoint.
  1654  func SetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, level int, name int, optVal []byte) *syserr.Error {
  1655  	switch level {
  1656  	case linux.SOL_SOCKET:
  1657  		return setSockOptSocket(t, s, ep, name, optVal)
  1658  
  1659  	case linux.SOL_TCP:
  1660  		return setSockOptTCP(t, s, ep, name, optVal)
  1661  
  1662  	case linux.SOL_IPV6:
  1663  		return setSockOptIPv6(t, s, ep, name, optVal)
  1664  
  1665  	case linux.SOL_IP:
  1666  		return setSockOptIP(t, s, ep, name, optVal)
  1667  
  1668  	case linux.SOL_PACKET:
  1669  		// gVisor doesn't support any SOL_PACKET options just return not
  1670  		// supported. Returning nil here will result in tcpdump thinking AF_PACKET
  1671  		// features are supported and proceed to use them and break.
  1672  		t.Kernel().EmitUnimplementedEvent(t)
  1673  		return syserr.ErrProtocolNotAvailable
  1674  
  1675  	case linux.SOL_UDP,
  1676  		linux.SOL_ICMPV6,
  1677  		linux.SOL_RAW:
  1678  
  1679  		t.Kernel().EmitUnimplementedEvent(t)
  1680  	}
  1681  
  1682  	return nil
  1683  }
  1684  
  1685  // setSockOptSocket implements SetSockOpt when level is SOL_SOCKET.
  1686  func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
  1687  	switch name {
  1688  	case linux.SO_SNDBUF:
  1689  		if len(optVal) < sizeOfInt32 {
  1690  			return syserr.ErrInvalidArgument
  1691  		}
  1692  
  1693  		v := hostarch.ByteOrder.Uint32(optVal)
  1694  		ep.SocketOptions().SetSendBufferSize(int64(v), true /* notify */)
  1695  		return nil
  1696  
  1697  	case linux.SO_RCVBUF:
  1698  		if len(optVal) < sizeOfInt32 {
  1699  			return syserr.ErrInvalidArgument
  1700  		}
  1701  
  1702  		v := hostarch.ByteOrder.Uint32(optVal)
  1703  		ep.SocketOptions().SetReceiveBufferSize(int64(v), true /* notify */)
  1704  		return nil
  1705  
  1706  	case linux.SO_REUSEADDR:
  1707  		if len(optVal) < sizeOfInt32 {
  1708  			return syserr.ErrInvalidArgument
  1709  		}
  1710  
  1711  		v := hostarch.ByteOrder.Uint32(optVal)
  1712  		ep.SocketOptions().SetReuseAddress(v != 0)
  1713  		return nil
  1714  
  1715  	case linux.SO_REUSEPORT:
  1716  		if len(optVal) < sizeOfInt32 {
  1717  			return syserr.ErrInvalidArgument
  1718  		}
  1719  
  1720  		v := hostarch.ByteOrder.Uint32(optVal)
  1721  		ep.SocketOptions().SetReusePort(v != 0)
  1722  		return nil
  1723  
  1724  	case linux.SO_BINDTODEVICE:
  1725  		n := bytes.IndexByte(optVal, 0)
  1726  		if n == -1 {
  1727  			n = len(optVal)
  1728  		}
  1729  		name := string(optVal[:n])
  1730  		if name == "" {
  1731  			return syserr.TranslateNetstackError(ep.SocketOptions().SetBindToDevice(0))
  1732  		}
  1733  		s := t.NetworkContext()
  1734  		if s == nil {
  1735  			return syserr.ErrNoDevice
  1736  		}
  1737  		for nicID, nic := range s.Interfaces() {
  1738  			if nic.Name == name {
  1739  				return syserr.TranslateNetstackError(ep.SocketOptions().SetBindToDevice(nicID))
  1740  			}
  1741  		}
  1742  		return syserr.ErrUnknownDevice
  1743  
  1744  	case linux.SO_BROADCAST:
  1745  		if len(optVal) < sizeOfInt32 {
  1746  			return syserr.ErrInvalidArgument
  1747  		}
  1748  
  1749  		v := hostarch.ByteOrder.Uint32(optVal)
  1750  		ep.SocketOptions().SetBroadcast(v != 0)
  1751  		return nil
  1752  
  1753  	case linux.SO_PASSCRED:
  1754  		if len(optVal) < sizeOfInt32 {
  1755  			return syserr.ErrInvalidArgument
  1756  		}
  1757  
  1758  		v := hostarch.ByteOrder.Uint32(optVal)
  1759  		ep.SocketOptions().SetPassCred(v != 0)
  1760  		return nil
  1761  
  1762  	case linux.SO_KEEPALIVE:
  1763  		if len(optVal) < sizeOfInt32 {
  1764  			return syserr.ErrInvalidArgument
  1765  		}
  1766  
  1767  		v := hostarch.ByteOrder.Uint32(optVal)
  1768  		ep.SocketOptions().SetKeepAlive(v != 0)
  1769  		return nil
  1770  
  1771  	case linux.SO_SNDTIMEO:
  1772  		if len(optVal) < linux.SizeOfTimeval {
  1773  			return syserr.ErrInvalidArgument
  1774  		}
  1775  
  1776  		var v linux.Timeval
  1777  		v.UnmarshalBytes(optVal[:linux.SizeOfTimeval])
  1778  		if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) {
  1779  			return syserr.ErrDomain
  1780  		}
  1781  		s.SetSendTimeout(v.ToNsecCapped())
  1782  		return nil
  1783  
  1784  	case linux.SO_RCVTIMEO:
  1785  		if len(optVal) < linux.SizeOfTimeval {
  1786  			return syserr.ErrInvalidArgument
  1787  		}
  1788  
  1789  		var v linux.Timeval
  1790  		v.UnmarshalBytes(optVal[:linux.SizeOfTimeval])
  1791  		if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) {
  1792  			return syserr.ErrDomain
  1793  		}
  1794  		s.SetRecvTimeout(v.ToNsecCapped())
  1795  		return nil
  1796  
  1797  	case linux.SO_OOBINLINE:
  1798  		if len(optVal) < sizeOfInt32 {
  1799  			return syserr.ErrInvalidArgument
  1800  		}
  1801  
  1802  		v := hostarch.ByteOrder.Uint32(optVal)
  1803  		ep.SocketOptions().SetOutOfBandInline(v != 0)
  1804  		return nil
  1805  
  1806  	case linux.SO_NO_CHECK:
  1807  		if len(optVal) < sizeOfInt32 {
  1808  			return syserr.ErrInvalidArgument
  1809  		}
  1810  
  1811  		v := hostarch.ByteOrder.Uint32(optVal)
  1812  		ep.SocketOptions().SetNoChecksum(v != 0)
  1813  		return nil
  1814  
  1815  	case linux.SO_LINGER:
  1816  		if len(optVal) < linux.SizeOfLinger {
  1817  			return syserr.ErrInvalidArgument
  1818  		}
  1819  
  1820  		var v linux.Linger
  1821  		v.UnmarshalBytes(optVal[:linux.SizeOfLinger])
  1822  
  1823  		if v != (linux.Linger{}) {
  1824  			socket.SetSockOptEmitUnimplementedEvent(t, name)
  1825  		}
  1826  
  1827  		ep.SocketOptions().SetLinger(tcpip.LingerOption{
  1828  			Enabled: v.OnOff != 0,
  1829  			Timeout: time.Second * time.Duration(v.Linger),
  1830  		})
  1831  		return nil
  1832  
  1833  	case linux.SO_DETACH_FILTER:
  1834  		// optval is ignored.
  1835  		var v tcpip.SocketDetachFilterOption
  1836  		return syserr.TranslateNetstackError(ep.SetSockOpt(&v))
  1837  
  1838  	default:
  1839  		socket.SetSockOptEmitUnimplementedEvent(t, name)
  1840  	}
  1841  
  1842  	return nil
  1843  }
  1844  
  1845  // setSockOptTCP implements SetSockOpt when level is SOL_TCP.
  1846  func setSockOptTCP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
  1847  	if _, skType, skProto := s.Type(); !isTCPSocket(skType, skProto) {
  1848  		log.Warningf("SOL_TCP options are only supported on TCP sockets: skType, skProto = %v, %d", skType, skProto)
  1849  		return syserr.ErrUnknownProtocolOption
  1850  	}
  1851  
  1852  	switch name {
  1853  	case linux.TCP_NODELAY:
  1854  		if len(optVal) < sizeOfInt32 {
  1855  			return syserr.ErrInvalidArgument
  1856  		}
  1857  
  1858  		v := hostarch.ByteOrder.Uint32(optVal)
  1859  		ep.SocketOptions().SetDelayOption(v == 0)
  1860  		return nil
  1861  
  1862  	case linux.TCP_CORK:
  1863  		if len(optVal) < sizeOfInt32 {
  1864  			return syserr.ErrInvalidArgument
  1865  		}
  1866  
  1867  		v := hostarch.ByteOrder.Uint32(optVal)
  1868  		ep.SocketOptions().SetCorkOption(v != 0)
  1869  		return nil
  1870  
  1871  	case linux.TCP_QUICKACK:
  1872  		if len(optVal) < sizeOfInt32 {
  1873  			return syserr.ErrInvalidArgument
  1874  		}
  1875  
  1876  		v := hostarch.ByteOrder.Uint32(optVal)
  1877  		ep.SocketOptions().SetQuickAck(v != 0)
  1878  		return nil
  1879  
  1880  	case linux.TCP_MAXSEG:
  1881  		if len(optVal) < sizeOfInt32 {
  1882  			return syserr.ErrInvalidArgument
  1883  		}
  1884  
  1885  		v := hostarch.ByteOrder.Uint32(optVal)
  1886  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MaxSegOption, int(v)))
  1887  
  1888  	case linux.TCP_KEEPIDLE:
  1889  		if len(optVal) < sizeOfInt32 {
  1890  			return syserr.ErrInvalidArgument
  1891  		}
  1892  
  1893  		v := hostarch.ByteOrder.Uint32(optVal)
  1894  		if v < 1 || v > linux.MAX_TCP_KEEPIDLE {
  1895  			return syserr.ErrInvalidArgument
  1896  		}
  1897  		opt := tcpip.KeepaliveIdleOption(time.Second * time.Duration(v))
  1898  		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
  1899  
  1900  	case linux.TCP_KEEPINTVL:
  1901  		if len(optVal) < sizeOfInt32 {
  1902  			return syserr.ErrInvalidArgument
  1903  		}
  1904  
  1905  		v := hostarch.ByteOrder.Uint32(optVal)
  1906  		if v < 1 || v > linux.MAX_TCP_KEEPINTVL {
  1907  			return syserr.ErrInvalidArgument
  1908  		}
  1909  		opt := tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v))
  1910  		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
  1911  
  1912  	case linux.TCP_KEEPCNT:
  1913  		if len(optVal) < sizeOfInt32 {
  1914  			return syserr.ErrInvalidArgument
  1915  		}
  1916  
  1917  		v := hostarch.ByteOrder.Uint32(optVal)
  1918  		if v < 1 || v > linux.MAX_TCP_KEEPCNT {
  1919  			return syserr.ErrInvalidArgument
  1920  		}
  1921  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.KeepaliveCountOption, int(v)))
  1922  
  1923  	case linux.TCP_USER_TIMEOUT:
  1924  		if len(optVal) < sizeOfInt32 {
  1925  			return syserr.ErrInvalidArgument
  1926  		}
  1927  
  1928  		v := int32(hostarch.ByteOrder.Uint32(optVal))
  1929  		if v < 0 {
  1930  			return syserr.ErrInvalidArgument
  1931  		}
  1932  		opt := tcpip.TCPUserTimeoutOption(time.Millisecond * time.Duration(v))
  1933  		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
  1934  
  1935  	case linux.TCP_CONGESTION:
  1936  		v := tcpip.CongestionControlOption(optVal)
  1937  		if err := ep.SetSockOpt(&v); err != nil {
  1938  			return syserr.TranslateNetstackError(err)
  1939  		}
  1940  		return nil
  1941  
  1942  	case linux.TCP_LINGER2:
  1943  		if len(optVal) < sizeOfInt32 {
  1944  			return syserr.ErrInvalidArgument
  1945  		}
  1946  
  1947  		v := int32(hostarch.ByteOrder.Uint32(optVal))
  1948  		opt := tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v))
  1949  		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
  1950  
  1951  	case linux.TCP_DEFER_ACCEPT:
  1952  		if len(optVal) < sizeOfInt32 {
  1953  			return syserr.ErrInvalidArgument
  1954  		}
  1955  		v := int32(hostarch.ByteOrder.Uint32(optVal))
  1956  		if v < 0 {
  1957  			v = 0
  1958  		}
  1959  		opt := tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v))
  1960  		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
  1961  
  1962  	case linux.TCP_SYNCNT:
  1963  		if len(optVal) < sizeOfInt32 {
  1964  			return syserr.ErrInvalidArgument
  1965  		}
  1966  		v := hostarch.ByteOrder.Uint32(optVal)
  1967  
  1968  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPSynCountOption, int(v)))
  1969  
  1970  	case linux.TCP_WINDOW_CLAMP:
  1971  		if len(optVal) < sizeOfInt32 {
  1972  			return syserr.ErrInvalidArgument
  1973  		}
  1974  		v := hostarch.ByteOrder.Uint32(optVal)
  1975  
  1976  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPWindowClampOption, int(v)))
  1977  
  1978  	case linux.TCP_REPAIR_OPTIONS:
  1979  		t.Kernel().EmitUnimplementedEvent(t)
  1980  
  1981  	default:
  1982  		emitUnimplementedEventTCP(t, name)
  1983  	}
  1984  
  1985  	return nil
  1986  }
  1987  
  1988  // setSockOptIPv6 implements SetSockOpt when level is SOL_IPV6.
  1989  func setSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
  1990  	if _, ok := ep.(tcpip.Endpoint); !ok {
  1991  		log.Warningf("SOL_IPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
  1992  		return syserr.ErrUnknownProtocolOption
  1993  	}
  1994  
  1995  	family, skType, skProto := s.Type()
  1996  	if family != linux.AF_INET6 {
  1997  		return syserr.ErrUnknownProtocolOption
  1998  	}
  1999  
  2000  	switch name {
  2001  	case linux.IPV6_V6ONLY:
  2002  		if len(optVal) < sizeOfInt32 {
  2003  			return syserr.ErrInvalidArgument
  2004  		}
  2005  
  2006  		if isTCPSocket(skType, skProto) && tcp.EndpointState(ep.State()) != tcp.StateInitial {
  2007  			return syserr.ErrInvalidEndpointState
  2008  		} else if isUDPSocket(skType, skProto) && udp.EndpointState(ep.State()) != udp.StateInitial {
  2009  			return syserr.ErrInvalidEndpointState
  2010  		}
  2011  
  2012  		v := hostarch.ByteOrder.Uint32(optVal)
  2013  		ep.SocketOptions().SetV6Only(v != 0)
  2014  		return nil
  2015  
  2016  	case linux.IPV6_ADD_MEMBERSHIP:
  2017  		req, err := copyInMulticastV6Request(optVal)
  2018  		if err != nil {
  2019  			return err
  2020  		}
  2021  
  2022  		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.AddMembershipOption{
  2023  			NIC:           tcpip.NICID(req.InterfaceIndex),
  2024  			MulticastAddr: tcpip.Address(req.MulticastAddr[:]),
  2025  		}))
  2026  
  2027  	case linux.IPV6_DROP_MEMBERSHIP:
  2028  		req, err := copyInMulticastV6Request(optVal)
  2029  		if err != nil {
  2030  			return err
  2031  		}
  2032  
  2033  		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.RemoveMembershipOption{
  2034  			NIC:           tcpip.NICID(req.InterfaceIndex),
  2035  			MulticastAddr: tcpip.Address(req.MulticastAddr[:]),
  2036  		}))
  2037  
  2038  	case linux.IPV6_IPSEC_POLICY,
  2039  		linux.IPV6_JOIN_ANYCAST,
  2040  		linux.IPV6_LEAVE_ANYCAST,
  2041  		// TODO(b/148887420): Add support for IPV6_PKTINFO.
  2042  		linux.IPV6_PKTINFO,
  2043  		linux.IPV6_ROUTER_ALERT,
  2044  		linux.IPV6_XFRM_POLICY,
  2045  		linux.MCAST_BLOCK_SOURCE,
  2046  		linux.MCAST_JOIN_GROUP,
  2047  		linux.MCAST_JOIN_SOURCE_GROUP,
  2048  		linux.MCAST_LEAVE_GROUP,
  2049  		linux.MCAST_LEAVE_SOURCE_GROUP,
  2050  		linux.MCAST_UNBLOCK_SOURCE:
  2051  
  2052  		t.Kernel().EmitUnimplementedEvent(t)
  2053  
  2054  	case linux.IPV6_RECVORIGDSTADDR:
  2055  		if len(optVal) < sizeOfInt32 {
  2056  			return syserr.ErrInvalidArgument
  2057  		}
  2058  		v := int32(hostarch.ByteOrder.Uint32(optVal))
  2059  
  2060  		ep.SocketOptions().SetReceiveOriginalDstAddress(v != 0)
  2061  		return nil
  2062  
  2063  	case linux.IPV6_TCLASS:
  2064  		if len(optVal) < sizeOfInt32 {
  2065  			return syserr.ErrInvalidArgument
  2066  		}
  2067  		v := int32(hostarch.ByteOrder.Uint32(optVal))
  2068  		if v < -1 || v > 255 {
  2069  			return syserr.ErrInvalidArgument
  2070  		}
  2071  		if v == -1 {
  2072  			v = 0
  2073  		}
  2074  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6TrafficClassOption, int(v)))
  2075  
  2076  	case linux.IPV6_RECVTCLASS:
  2077  		v, err := parseIntOrChar(optVal)
  2078  		if err != nil {
  2079  			return err
  2080  		}
  2081  
  2082  		ep.SocketOptions().SetReceiveTClass(v != 0)
  2083  		return nil
  2084  	case linux.IPV6_RECVERR:
  2085  		if len(optVal) == 0 {
  2086  			return nil
  2087  		}
  2088  		v, err := parseIntOrChar(optVal)
  2089  		if err != nil {
  2090  			return err
  2091  		}
  2092  		ep.SocketOptions().SetRecvError(v != 0)
  2093  		return nil
  2094  
  2095  	case linux.IP6T_SO_SET_REPLACE:
  2096  		if len(optVal) < linux.SizeOfIP6TReplace {
  2097  			return syserr.ErrInvalidArgument
  2098  		}
  2099  
  2100  		// Only valid for raw IPv6 sockets.
  2101  		if skType != linux.SOCK_RAW {
  2102  			return syserr.ErrProtocolNotAvailable
  2103  		}
  2104  
  2105  		stack := inet.StackFromContext(t)
  2106  		if stack == nil {
  2107  			return syserr.ErrNoDevice
  2108  		}
  2109  		// Stack must be a netstack stack.
  2110  		return netfilter.SetEntries(t, stack.(*Stack).Stack, optVal, true)
  2111  
  2112  	case linux.IP6T_SO_SET_ADD_COUNTERS:
  2113  		log.Infof("IP6T_SO_SET_ADD_COUNTERS is not supported")
  2114  		return nil
  2115  
  2116  	default:
  2117  		emitUnimplementedEventIPv6(t, name)
  2118  	}
  2119  
  2120  	return nil
  2121  }
  2122  
  2123  var (
  2124  	inetMulticastRequestSize        = (*linux.InetMulticastRequest)(nil).SizeBytes()
  2125  	inetMulticastRequestWithNICSize = (*linux.InetMulticastRequestWithNIC)(nil).SizeBytes()
  2126  	inet6MulticastRequestSize       = (*linux.Inet6MulticastRequest)(nil).SizeBytes()
  2127  )
  2128  
  2129  // copyInMulticastRequest copies in a variable-size multicast request. The
  2130  // kernel determines which structure was passed by its length. IP_MULTICAST_IF
  2131  // supports ip_mreqn, ip_mreq and in_addr, while IP_ADD_MEMBERSHIP and
  2132  // IP_DROP_MEMBERSHIP only support ip_mreqn and ip_mreq. To handle this,
  2133  // allowAddr controls whether in_addr is accepted or rejected.
  2134  func copyInMulticastRequest(optVal []byte, allowAddr bool) (linux.InetMulticastRequestWithNIC, *syserr.Error) {
  2135  	if len(optVal) < len(linux.InetAddr{}) {
  2136  		return linux.InetMulticastRequestWithNIC{}, syserr.ErrInvalidArgument
  2137  	}
  2138  
  2139  	if len(optVal) < inetMulticastRequestSize {
  2140  		if !allowAddr {
  2141  			return linux.InetMulticastRequestWithNIC{}, syserr.ErrInvalidArgument
  2142  		}
  2143  
  2144  		var req linux.InetMulticastRequestWithNIC
  2145  		copy(req.InterfaceAddr[:], optVal)
  2146  		return req, nil
  2147  	}
  2148  
  2149  	if len(optVal) >= inetMulticastRequestWithNICSize {
  2150  		var req linux.InetMulticastRequestWithNIC
  2151  		req.UnmarshalUnsafe(optVal[:inetMulticastRequestWithNICSize])
  2152  		return req, nil
  2153  	}
  2154  
  2155  	var req linux.InetMulticastRequestWithNIC
  2156  	req.InetMulticastRequest.UnmarshalUnsafe(optVal[:inetMulticastRequestSize])
  2157  	return req, nil
  2158  }
  2159  
  2160  func copyInMulticastV6Request(optVal []byte) (linux.Inet6MulticastRequest, *syserr.Error) {
  2161  	if len(optVal) < inet6MulticastRequestSize {
  2162  		return linux.Inet6MulticastRequest{}, syserr.ErrInvalidArgument
  2163  	}
  2164  
  2165  	var req linux.Inet6MulticastRequest
  2166  	req.UnmarshalUnsafe(optVal[:inet6MulticastRequestSize])
  2167  	return req, nil
  2168  }
  2169  
  2170  // parseIntOrChar copies either a 32-bit int or an 8-bit uint out of buf.
  2171  //
  2172  // net/ipv4/ip_sockglue.c:do_ip_setsockopt does this for its socket options.
  2173  func parseIntOrChar(buf []byte) (int32, *syserr.Error) {
  2174  	if len(buf) == 0 {
  2175  		return 0, syserr.ErrInvalidArgument
  2176  	}
  2177  
  2178  	if len(buf) >= sizeOfInt32 {
  2179  		return int32(hostarch.ByteOrder.Uint32(buf)), nil
  2180  	}
  2181  
  2182  	return int32(buf[0]), nil
  2183  }
  2184  
// setSockOptIP implements SetSockOpt when level is SOL_IP.
//
// ep must also implement tcpip.Endpoint; otherwise the call is logged and
// ErrUnknownProtocolOption is returned. name selects the option and optVal
// holds the raw option value supplied by the caller.
//
// Options that are recognized but not implemented emit an unimplemented event
// and, like options that are not recognized at all, report success (nil),
// with the exception of MCAST_JOIN_GROUP which returns ErrInvalidArgument.
func setSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
	if _, ok := ep.(tcpip.Endpoint); !ok {
		log.Warningf("SOL_IP options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
		return syserr.ErrUnknownProtocolOption
	}

	switch name {
	case linux.IP_MULTICAST_TTL:
		v, err := parseIntOrChar(optVal)
		if err != nil {
			return err
		}

		if v == -1 {
			// Linux translates -1 to 1.
			v = 1
		}
		// Anything else outside [0, 255] is invalid.
		if v < 0 || v > 255 {
			return syserr.ErrInvalidArgument
		}
		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MulticastTTLOption, int(v)))

	case linux.IP_ADD_MEMBERSHIP:
		// A bare in_addr is not accepted for membership changes.
		req, err := copyInMulticastRequest(optVal, false /* allowAddr */)
		if err != nil {
			return err
		}

		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.AddMembershipOption{
			NIC: tcpip.NICID(req.InterfaceIndex),
			// TODO(igudger): Change AddMembership to use the standard
			// any address representation.
			InterfaceAddr: tcpip.Address(req.InterfaceAddr[:]),
			MulticastAddr: tcpip.Address(req.MulticastAddr[:]),
		}))

	case linux.IP_DROP_MEMBERSHIP:
		// A bare in_addr is not accepted for membership changes.
		req, err := copyInMulticastRequest(optVal, false /* allowAddr */)
		if err != nil {
			return err
		}

		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.RemoveMembershipOption{
			NIC: tcpip.NICID(req.InterfaceIndex),
			// TODO(igudger): Change DropMembership to use the standard
			// any address representation.
			InterfaceAddr: tcpip.Address(req.InterfaceAddr[:]),
			MulticastAddr: tcpip.Address(req.MulticastAddr[:]),
		}))

	case linux.IP_MULTICAST_IF:
		// Unlike the membership options, a bare in_addr is accepted here.
		req, err := copyInMulticastRequest(optVal, true /* allowAddr */)
		if err != nil {
			return err
		}

		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.MulticastInterfaceOption{
			NIC:           tcpip.NICID(req.InterfaceIndex),
			InterfaceAddr: socket.BytesToIPAddress(req.InterfaceAddr[:]),
		}))

	case linux.IP_MULTICAST_LOOP:
		v, err := parseIntOrChar(optVal)
		if err != nil {
			return err
		}

		// Any non-zero value enables the option.
		ep.SocketOptions().SetMulticastLoop(v != 0)
		return nil

	case linux.MCAST_JOIN_GROUP:
		// FIXME(b/124219304): Implement MCAST_JOIN_GROUP.
		t.Kernel().EmitUnimplementedEvent(t)
		return syserr.ErrInvalidArgument

	case linux.IP_TTL:
		v, err := parseIntOrChar(optVal)
		if err != nil {
			return err
		}

		// -1 means default TTL.
		if v == -1 {
			// The default is passed to the endpoint as 0.
			v = 0
		} else if v < 1 || v > 255 {
			return syserr.ErrInvalidArgument
		}
		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TTLOption, int(v)))

	case linux.IP_TOS:
		// An empty option value is accepted and leaves the setting untouched.
		if len(optVal) == 0 {
			return nil
		}
		v, err := parseIntOrChar(optVal)
		if err != nil {
			return err
		}
		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv4TOSOption, int(v)))

	case linux.IP_RECVTOS:
		// Unlike the boolean options below, an empty option value is not
		// special-cased here, so parseIntOrChar rejects it.
		v, err := parseIntOrChar(optVal)
		if err != nil {
			return err
		}
		ep.SocketOptions().SetReceiveTOS(v != 0)
		return nil

	case linux.IP_RECVERR:
		// An empty option value is accepted and leaves the setting untouched.
		if len(optVal) == 0 {
			return nil
		}
		v, err := parseIntOrChar(optVal)
		if err != nil {
			return err
		}
		ep.SocketOptions().SetRecvError(v != 0)
		return nil

	case linux.IP_PKTINFO:
		// An empty option value is accepted and leaves the setting untouched.
		if len(optVal) == 0 {
			return nil
		}
		v, err := parseIntOrChar(optVal)
		if err != nil {
			return err
		}
		ep.SocketOptions().SetReceivePacketInfo(v != 0)
		return nil

	case linux.IP_HDRINCL:
		// An empty option value is accepted and leaves the setting untouched.
		if len(optVal) == 0 {
			return nil
		}
		v, err := parseIntOrChar(optVal)
		if err != nil {
			return err
		}
		ep.SocketOptions().SetHeaderIncluded(v != 0)
		return nil

	case linux.IP_RECVORIGDSTADDR:
		// An empty option value is accepted and leaves the setting untouched.
		if len(optVal) == 0 {
			return nil
		}
		v, err := parseIntOrChar(optVal)
		if err != nil {
			return err
		}

		ep.SocketOptions().SetReceiveOriginalDstAddress(v != 0)
		return nil

	case linux.IPT_SO_SET_REPLACE:
		// The value must be large enough to hold the fixed-size header.
		if len(optVal) < linux.SizeOfIPTReplace {
			return syserr.ErrInvalidArgument
		}

		// Only valid for raw IPv4 sockets.
		if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
			return syserr.ErrProtocolNotAvailable
		}

		stack := inet.StackFromContext(t)
		if stack == nil {
			return syserr.ErrNoDevice
		}
		// Stack must be a netstack stack.
		// The final argument is false here; the SOL_IPV6 handler passes true.
		return netfilter.SetEntries(t, stack.(*Stack).Stack, optVal, false)

	case linux.IPT_SO_SET_ADD_COUNTERS:
		// Logged and reported as success without doing anything.
		log.Infof("IPT_SO_SET_ADD_COUNTERS is not supported")
		return nil

	// Known options without an implementation: emit an unimplemented event
	// and fall through to the successful return below.
	case linux.IP_ADD_SOURCE_MEMBERSHIP,
		linux.IP_BIND_ADDRESS_NO_PORT,
		linux.IP_BLOCK_SOURCE,
		linux.IP_CHECKSUM,
		linux.IP_DROP_SOURCE_MEMBERSHIP,
		linux.IP_FREEBIND,
		linux.IP_IPSEC_POLICY,
		linux.IP_MINTTL,
		linux.IP_MSFILTER,
		linux.IP_MTU_DISCOVER,
		linux.IP_MULTICAST_ALL,
		linux.IP_NODEFRAG,
		linux.IP_OPTIONS,
		linux.IP_PASSSEC,
		linux.IP_RECVFRAGSIZE,
		linux.IP_RECVOPTS,
		linux.IP_RECVTTL,
		linux.IP_RETOPTS,
		linux.IP_TRANSPARENT,
		linux.IP_UNBLOCK_SOURCE,
		linux.IP_UNICAST_IF,
		linux.IP_XFRM_POLICY,
		linux.MCAST_BLOCK_SOURCE,
		linux.MCAST_JOIN_SOURCE_GROUP,
		linux.MCAST_LEAVE_GROUP,
		linux.MCAST_LEAVE_SOURCE_GROUP,
		linux.MCAST_MSFILTER,
		linux.MCAST_UNBLOCK_SOURCE:

		t.Kernel().EmitUnimplementedEvent(t)
	}

	// Unimplemented and unrecognized options both report success.
	return nil
}
  2393  
  2394  // emitUnimplementedEventTCP emits unimplemented event if name is valid. This
  2395  // function contains names that are common between Get and SetSockOpt when
  2396  // level is SOL_TCP.
  2397  func emitUnimplementedEventTCP(t *kernel.Task, name int) {
  2398  	switch name {
  2399  	case linux.TCP_CONGESTION,
  2400  		linux.TCP_CORK,
  2401  		linux.TCP_FASTOPEN,
  2402  		linux.TCP_FASTOPEN_CONNECT,
  2403  		linux.TCP_FASTOPEN_KEY,
  2404  		linux.TCP_FASTOPEN_NO_COOKIE,
  2405  		linux.TCP_QUEUE_SEQ,
  2406  		linux.TCP_REPAIR,
  2407  		linux.TCP_REPAIR_QUEUE,
  2408  		linux.TCP_REPAIR_WINDOW,
  2409  		linux.TCP_SAVED_SYN,
  2410  		linux.TCP_SAVE_SYN,
  2411  		linux.TCP_THIN_DUPACK,
  2412  		linux.TCP_THIN_LINEAR_TIMEOUTS,
  2413  		linux.TCP_TIMESTAMP,
  2414  		linux.TCP_ULP:
  2415  
  2416  		t.Kernel().EmitUnimplementedEvent(t)
  2417  	}
  2418  }
  2419  
  2420  // emitUnimplementedEventIPv6 emits unimplemented event if name is valid. It
  2421  // contains names that are common between Get and SetSockOpt when level is
  2422  // SOL_IPV6.
  2423  func emitUnimplementedEventIPv6(t *kernel.Task, name int) {
  2424  	switch name {
  2425  	case linux.IPV6_2292DSTOPTS,
  2426  		linux.IPV6_2292HOPLIMIT,
  2427  		linux.IPV6_2292HOPOPTS,
  2428  		linux.IPV6_2292PKTINFO,
  2429  		linux.IPV6_2292PKTOPTIONS,
  2430  		linux.IPV6_2292RTHDR,
  2431  		linux.IPV6_ADDR_PREFERENCES,
  2432  		linux.IPV6_AUTOFLOWLABEL,
  2433  		linux.IPV6_DONTFRAG,
  2434  		linux.IPV6_DSTOPTS,
  2435  		linux.IPV6_FLOWINFO,
  2436  		linux.IPV6_FLOWINFO_SEND,
  2437  		linux.IPV6_FLOWLABEL_MGR,
  2438  		linux.IPV6_FREEBIND,
  2439  		linux.IPV6_HOPOPTS,
  2440  		linux.IPV6_MINHOPCOUNT,
  2441  		linux.IPV6_MTU,
  2442  		linux.IPV6_MTU_DISCOVER,
  2443  		linux.IPV6_MULTICAST_ALL,
  2444  		linux.IPV6_MULTICAST_HOPS,
  2445  		linux.IPV6_MULTICAST_IF,
  2446  		linux.IPV6_MULTICAST_LOOP,
  2447  		linux.IPV6_RECVDSTOPTS,
  2448  		linux.IPV6_RECVFRAGSIZE,
  2449  		linux.IPV6_RECVHOPLIMIT,
  2450  		linux.IPV6_RECVHOPOPTS,
  2451  		linux.IPV6_RECVPATHMTU,
  2452  		linux.IPV6_RECVPKTINFO,
  2453  		linux.IPV6_RECVRTHDR,
  2454  		linux.IPV6_RTHDR,
  2455  		linux.IPV6_RTHDRDSTOPTS,
  2456  		linux.IPV6_TCLASS,
  2457  		linux.IPV6_TRANSPARENT,
  2458  		linux.IPV6_UNICAST_HOPS,
  2459  		linux.IPV6_UNICAST_IF,
  2460  		linux.MCAST_MSFILTER,
  2461  		linux.IPV6_ADDRFORM:
  2462  
  2463  		t.Kernel().EmitUnimplementedEvent(t)
  2464  	}
  2465  }
  2466  
  2467  // emitUnimplementedEventIP emits unimplemented event if name is valid. It
  2468  // contains names that are common between Get and SetSockOpt when level is
  2469  // SOL_IP.
  2470  func emitUnimplementedEventIP(t *kernel.Task, name int) {
  2471  	switch name {
  2472  	case linux.IP_TOS,
  2473  		linux.IP_TTL,
  2474  		linux.IP_OPTIONS,
  2475  		linux.IP_ROUTER_ALERT,
  2476  		linux.IP_RECVOPTS,
  2477  		linux.IP_RETOPTS,
  2478  		linux.IP_PKTINFO,
  2479  		linux.IP_PKTOPTIONS,
  2480  		linux.IP_MTU_DISCOVER,
  2481  		linux.IP_RECVTTL,
  2482  		linux.IP_RECVTOS,
  2483  		linux.IP_MTU,
  2484  		linux.IP_FREEBIND,
  2485  		linux.IP_IPSEC_POLICY,
  2486  		linux.IP_XFRM_POLICY,
  2487  		linux.IP_PASSSEC,
  2488  		linux.IP_TRANSPARENT,
  2489  		linux.IP_ORIGDSTADDR,
  2490  		linux.IP_MINTTL,
  2491  		linux.IP_NODEFRAG,
  2492  		linux.IP_CHECKSUM,
  2493  		linux.IP_BIND_ADDRESS_NO_PORT,
  2494  		linux.IP_RECVFRAGSIZE,
  2495  		linux.IP_MULTICAST_IF,
  2496  		linux.IP_MULTICAST_TTL,
  2497  		linux.IP_MULTICAST_LOOP,
  2498  		linux.IP_ADD_MEMBERSHIP,
  2499  		linux.IP_DROP_MEMBERSHIP,
  2500  		linux.IP_UNBLOCK_SOURCE,
  2501  		linux.IP_BLOCK_SOURCE,
  2502  		linux.IP_ADD_SOURCE_MEMBERSHIP,
  2503  		linux.IP_DROP_SOURCE_MEMBERSHIP,
  2504  		linux.IP_MSFILTER,
  2505  		linux.MCAST_JOIN_GROUP,
  2506  		linux.MCAST_BLOCK_SOURCE,
  2507  		linux.MCAST_UNBLOCK_SOURCE,
  2508  		linux.MCAST_LEAVE_GROUP,
  2509  		linux.MCAST_JOIN_SOURCE_GROUP,
  2510  		linux.MCAST_LEAVE_SOURCE_GROUP,
  2511  		linux.MCAST_MSFILTER,
  2512  		linux.IP_MULTICAST_ALL,
  2513  		linux.IP_UNICAST_IF:
  2514  
  2515  		t.Kernel().EmitUnimplementedEvent(t)
  2516  	}
  2517  }
  2518  
  2519  // GetSockName implements the linux syscall getsockname(2) for sockets backed by
  2520  // tcpip.Endpoint.
  2521  func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
  2522  	addr, err := s.Endpoint.GetLocalAddress()
  2523  	if err != nil {
  2524  		return nil, 0, syserr.TranslateNetstackError(err)
  2525  	}
  2526  
  2527  	a, l := socket.ConvertAddress(s.family, addr)
  2528  	return a, l, nil
  2529  }
  2530  
  2531  // GetPeerName implements the linux syscall getpeername(2) for sockets backed by
  2532  // tcpip.Endpoint.
  2533  func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
  2534  	addr, err := s.Endpoint.GetRemoteAddress()
  2535  	if err != nil {
  2536  		return nil, 0, syserr.TranslateNetstackError(err)
  2537  	}
  2538  
  2539  	a, l := socket.ConvertAddress(s.family, addr)
  2540  	return a, l, nil
  2541  }
  2542  
  2543  func (s *socketOpsCommon) fillCmsgInq(cmsg *socket.ControlMessages) {
  2544  	if !s.sockOptInq {
  2545  		return
  2546  	}
  2547  	rcvBufUsed, err := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption)
  2548  	if err != nil {
  2549  		return
  2550  	}
  2551  	cmsg.IP.HasInq = true
  2552  	cmsg.IP.Inq = int32(rcvBufUsed)
  2553  }
  2554  
  2555  func toLinuxPacketType(pktType tcpip.PacketType) uint8 {
  2556  	switch pktType {
  2557  	case tcpip.PacketHost:
  2558  		return linux.PACKET_HOST
  2559  	case tcpip.PacketOtherHost:
  2560  		return linux.PACKET_OTHERHOST
  2561  	case tcpip.PacketOutgoing:
  2562  		return linux.PACKET_OUTGOING
  2563  	case tcpip.PacketBroadcast:
  2564  		return linux.PACKET_BROADCAST
  2565  	case tcpip.PacketMulticast:
  2566  		return linux.PACKET_MULTICAST
  2567  	default:
  2568  		panic(fmt.Sprintf("unknown packet type: %d", pktType))
  2569  	}
  2570  }
  2571  
// nonBlockingRead issues a non-blocking read.
//
// dst receives the data. peek is passed to the endpoint as ReadOptions.Peek,
// trunc reflects MSG_TRUNC, and senderRequested asks the endpoint for the
// remote address.
//
// The results are, in order: the message length, the message flags, the
// sender address and its length (only set for packet-based sockets when
// senderRequested is true), the control messages, and an error. On error all
// other results are zero values.
//
// s.readMu is held for the duration of the endpoint read and the follow-up
// bookkeeping.
//
// TODO(b/78348848): Support timestamps for stream sockets.
func (s *socketOpsCommon) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
	isPacket := s.isPacketBased()

	readOptions := tcpip.ReadOptions{
		Peek:               peek,
		NeedRemoteAddr:     senderRequested,
		NeedLinkPacketInfo: isPacket,
	}

	// TCP sockets discard the data if MSG_TRUNC is set.
	//
	// This behavior is documented in man 7 tcp:
	// Since version 2.4, Linux supports the use of MSG_TRUNC in the flags
	// argument of recv(2) (and recvmsg(2)). This flag causes the received
	// bytes of data to be discarded, rather than passed back in a
	// caller-supplied  buffer.
	var w io.Writer
	if !isPacket && trunc {
		// Discard the data, but accept no more than dst could have held.
		w = &tcpip.LimitedWriter{
			W: ioutil.Discard,
			N: dst.NumBytes(),
		}
	} else {
		w = dst.Writer(ctx)
	}

	s.readMu.Lock()
	defer s.readMu.Unlock()

	res, err := s.Endpoint.Read(w, readOptions)
	// A bad-buffer error is not a failure when the destination is empty.
	if _, ok := err.(*tcpip.ErrBadBuffer); ok && dst.NumBytes() == 0 {
		err = nil
	}
	if err != nil {
		return 0, 0, nil, 0, socket.ControlMessages{}, syserr.TranslateNetstackError(err)
	}
	// Set the control message, even if 0 bytes were read.
	s.updateTimestamp(res.ControlMessages)

	if isPacket {
		var addr linux.SockAddr
		var addrLen uint32
		if senderRequested {
			addr, addrLen = socket.ConvertAddress(s.family, res.RemoteAddr)
			// Link-layer addresses additionally carry the protocol and
			// packet type reported by the endpoint.
			switch v := addr.(type) {
			case *linux.SockAddrLink:
				v.Protocol = socket.Htons(uint16(res.LinkPacketInfo.Protocol))
				v.PacketType = toLinuxPacketType(res.LinkPacketInfo.PktType)
			}
		}

		// With MSG_TRUNC the full message length is reported rather than
		// the number of bytes actually copied.
		msgLen := res.Count
		if trunc {
			msgLen = res.Total
		}

		// Flag the result when the message did not fit.
		var flags int
		if res.Total > res.Count {
			flags |= linux.MSG_TRUNC
		}

		return msgLen, flags, addr, addrLen, s.controlMessages(res.ControlMessages), nil
	}

	if peek {
		// MSG_TRUNC with MSG_PEEK on a TCP socket returns the
		// amount that could be read, and does not write to buffer.
		if trunc {
			// TCP endpoint does not return the total bytes in buffer as numTotal.
			// We need to query it from socket option.
			rql, err := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption)
			if err != nil {
				return 0, 0, nil, 0, socket.ControlMessages{}, syserr.TranslateNetstackError(err)
			}
			// Report the smaller of the buffer size and the queued bytes.
			msgLen := int(dst.NumBytes())
			if msgLen > rql {
				msgLen = rql
			}
			return msgLen, 0, nil, 0, socket.ControlMessages{}, nil
		}
	} else if n := res.Count; n != 0 {
		// Data was consumed (not peeked): tell the endpoint how much.
		s.Endpoint.ModerateRecvBuf(n)
	}

	cmsg := s.controlMessages(res.ControlMessages)
	s.fillCmsgInq(&cmsg)
	// err is nil here: any read error returned above.
	return res.Count, 0, nil, 0, cmsg, syserr.TranslateNetstackError(err)
}
  2663  
  2664  func (s *socketOpsCommon) controlMessages(cm tcpip.ControlMessages) socket.ControlMessages {
  2665  	readCM := socket.NewIPControlMessages(s.family, cm)
  2666  	return socket.ControlMessages{
  2667  		IP: socket.IPControlMessages{
  2668  			HasTimestamp:       readCM.HasTimestamp && s.sockOptTimestamp,
  2669  			Timestamp:          readCM.Timestamp,
  2670  			HasInq:             readCM.HasInq,
  2671  			Inq:                readCM.Inq,
  2672  			HasTOS:             readCM.HasTOS,
  2673  			TOS:                readCM.TOS,
  2674  			HasTClass:          readCM.HasTClass,
  2675  			TClass:             readCM.TClass,
  2676  			HasIPPacketInfo:    readCM.HasIPPacketInfo,
  2677  			PacketInfo:         readCM.PacketInfo,
  2678  			OriginalDstAddress: readCM.OriginalDstAddress,
  2679  			SockErr:            readCM.SockErr,
  2680  		},
  2681  	}
  2682  }
  2683  
  2684  // updateTimestamp sets the timestamp for SIOCGSTAMP. It should be called after
  2685  // successfully writing packet data out to userspace.
  2686  //
  2687  // Precondition: s.readMu must be locked.
  2688  func (s *socketOpsCommon) updateTimestamp(cm tcpip.ControlMessages) {
  2689  	// Save the SIOCGSTAMP timestamp only if SO_TIMESTAMP is disabled.
  2690  	if !s.sockOptTimestamp {
  2691  		s.timestampValid = true
  2692  		s.timestampNS = cm.Timestamp
  2693  	}
  2694  }
  2695  
  2696  // dequeueErr is analogous to net/core/skbuff.c:sock_dequeue_err_skb().
  2697  func (s *socketOpsCommon) dequeueErr() *tcpip.SockError {
  2698  	so := s.Endpoint.SocketOptions()
  2699  	err := so.DequeueErr()
  2700  	if err == nil {
  2701  		return nil
  2702  	}
  2703  
  2704  	// Update socket error to reflect ICMP errors in queue.
  2705  	if nextErr := so.PeekErr(); nextErr != nil && nextErr.Cause.Origin().IsICMPErr() {
  2706  		so.SetLastError(nextErr.Err)
  2707  	} else if err.Cause.Origin().IsICMPErr() {
  2708  		so.SetLastError(nil)
  2709  	}
  2710  	return err
  2711  }
  2712  
  2713  // addrFamilyFromNetProto returns the address family identifier for the given
  2714  // network protocol.
  2715  func addrFamilyFromNetProto(net tcpip.NetworkProtocolNumber) int {
  2716  	switch net {
  2717  	case header.IPv4ProtocolNumber:
  2718  		return linux.AF_INET
  2719  	case header.IPv6ProtocolNumber:
  2720  		return linux.AF_INET6
  2721  	default:
  2722  		panic(fmt.Sprintf("invalid net proto for addr family inference: %d", net))
  2723  	}
  2724  }
  2725  
  2726  // recvErr handles MSG_ERRQUEUE for recvmsg(2).
  2727  // This is analogous to net/ipv4/ip_sockglue.c:ip_recv_error().
  2728  func (s *socketOpsCommon) recvErr(t *kernel.Task, dst usermem.IOSequence) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
  2729  	sockErr := s.dequeueErr()
  2730  	if sockErr == nil {
  2731  		return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain
  2732  	}
  2733  
  2734  	// The payload of the original packet that caused the error is passed as
  2735  	// normal data via msg_iovec.  -- recvmsg(2)
  2736  	msgFlags := linux.MSG_ERRQUEUE
  2737  	if int(dst.NumBytes()) < len(sockErr.Payload) {
  2738  		msgFlags |= linux.MSG_TRUNC
  2739  	}
  2740  	n, err := dst.CopyOut(t, sockErr.Payload)
  2741  
  2742  	// The original destination address of the datagram that caused the error is
  2743  	// supplied via msg_name.  -- recvmsg(2)
  2744  	dstAddr, dstAddrLen := socket.ConvertAddress(addrFamilyFromNetProto(sockErr.NetProto), sockErr.Dst)
  2745  	cmgs := socket.ControlMessages{IP: socket.NewIPControlMessages(s.family, tcpip.ControlMessages{SockErr: sockErr})}
  2746  	return n, msgFlags, dstAddr, dstAddrLen, cmgs, syserr.FromError(err)
  2747  }
  2748  
// RecvMsg implements the linux syscall recvmsg(2) for sockets backed by
// tcpip.Endpoint.
//
// flags carries the MSG_* flags. MSG_ERRQUEUE is served from the socket error
// queue via recvErr; otherwise a non-blocking read is attempted first and the
// call only blocks when that read would block, or when MSG_WAITALL is set on
// a stream socket and dst is not yet full. haveDeadline and deadline bound
// the time spent blocking. senderRequested is ignored for sockets that are
// not packet-based. controlDataLen is not used by this implementation.
//
// The named results are the number of bytes received, the message flags, the
// sender address and its length, the control messages, and an error.
func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) {
	if flags&linux.MSG_ERRQUEUE != 0 {
		return s.recvErr(t, dst)
	}

	trunc := flags&linux.MSG_TRUNC != 0
	peek := flags&linux.MSG_PEEK != 0
	dontWait := flags&linux.MSG_DONTWAIT != 0
	waitAll := flags&linux.MSG_WAITALL != 0
	if senderRequested && !s.isPacketBased() {
		// Stream sockets ignore the sender address.
		senderRequested = false
	}
	// First attempt: never blocks.
	n, msgFlags, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested)

	if s.isPacketBased() && err == syserr.ErrClosedForReceive && flags&linux.MSG_DONTWAIT != 0 {
		// In this situation we should return EAGAIN.
		return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain
	}

	if err != nil && (err != syserr.ErrWouldBlock || dontWait) {
		// Read failed and we should not retry.
		return 0, 0, nil, 0, socket.ControlMessages{}, err
	}

	if err == nil && (dontWait || !waitAll || s.isPacketBased() || int64(n) >= dst.NumBytes()) {
		// We got all the data we need.
		return
	}

	// Don't overwrite any data we received.
	dst = dst.DropFirst(n)

	// We'll have to block. Register for notifications and keep trying to
	// receive all the data.
	e, ch := waiter.NewChannelEntry(nil)
	s.EventRegister(&e, waiter.ReadableEvents)
	defer s.EventUnregister(&e)

	for {
		// rn is the amount read by this attempt; n accumulates the total.
		var rn int
		rn, msgFlags, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested)
		n += rn
		if err != nil && err != syserr.ErrWouldBlock {
			// Always stop on errors other than would block as we generally
			// won't be able to get any more data. Eat the error if we got
			// any data.
			if n > 0 {
				err = nil
			}
			return
		}
		if err == nil && (s.isPacketBased() || !waitAll || int64(rn) >= dst.NumBytes()) {
			// We got all the data we need.
			return
		}
		// Advance past what this attempt read before waiting for more.
		dst = dst.DropFirst(rn)

		if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
			// Blocking was interrupted or timed out. Data already received
			// takes precedence over the error.
			if n > 0 {
				return n, msgFlags, senderAddr, senderAddrLen, controlMessages, nil
			}
			// A timeout with no data is reported as EAGAIN.
			if linuxerr.Equals(linuxerr.ETIMEDOUT, err) {
				return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain
			}
			return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err)
		}
	}
}
  2820  
// SendMsg implements the linux syscall sendmsg(2) for sockets backed by
// tcpip.Endpoint.
//
// src supplies the data and to, when non-empty, is the encoded destination
// address. flags carries the MSG_* flags: MSG_MORE and MSG_EOR are forwarded
// to the endpoint as write options, and MSG_DONTWAIT makes the call return
// after a single write attempt. haveDeadline and deadline bound the time
// spent blocking. Unix control messages are rejected with ErrInvalidArgument.
//
// It returns the total number of bytes written and an error, if any.
func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) {
	// Reject Unix control messages.
	if !controlMessages.Unix.Empty() {
		return 0, syserr.ErrInvalidArgument
	}

	// addr stays nil when no destination was supplied.
	var addr *tcpip.FullAddress
	if len(to) > 0 {
		addrBuf, family, err := socket.AddressAndFamily(to)
		if err != nil {
			return 0, err
		}
		if err := s.checkFamily(family, false /* exact */); err != nil {
			return 0, err
		}
		addrBuf = s.mapFamily(addrBuf, family)

		addr = &addrBuf
	}

	opts := tcpip.WriteOptions{
		To:          addr,
		More:        flags&linux.MSG_MORE != 0,
		EndOfRecord: flags&linux.MSG_EOR != 0,
	}

	r := src.Reader(t)
	var (
		total int64
		entry waiter.Entry
		ch    <-chan struct{}
	)
	for {
		n, err := s.Endpoint.Write(r, opts)
		total += n
		// Non-blocking callers get the result of the first attempt as-is.
		if flags&linux.MSG_DONTWAIT != 0 {
			return int(total), syserr.TranslateNetstackError(err)
		}
		// Decide whether to retry: on success only while data remains, on
		// would-block always, and on any other error never.
		block := true
		switch err.(type) {
		case nil:
			block = total != src.NumBytes()
		case *tcpip.ErrWouldBlock:
		default:
			block = false
		}
		if block {
			if ch == nil {
				// We'll have to block. Register for notification and keep trying to
				// send all the data.
				entry, ch = waiter.NewChannelEntry(nil)
				s.EventRegister(&entry, waiter.WritableEvents)
				defer s.EventUnregister(&entry)
			} else {
				// Don't wait immediately after registration in case more data
				// became available between when we last checked and when we setup
				// the notification.
				// Only iterations after the one that registered reach this wait.
				if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
					if linuxerr.Equals(linuxerr.ETIMEDOUT, err) {
						return int(total), syserr.ErrTryAgain
					}
					// handleIOError will consume errors from t.Block if needed.
					return int(total), syserr.FromError(err)
				}
			}
			continue
		}
		return int(total), syserr.TranslateNetstackError(err)
	}
}
  2893  
// Ioctl implements fs.FileOperations.Ioctl.
//
// The request is forwarded unchanged to socketOpsCommon.ioctl; the *fs.File
// argument is not used.
func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
	return s.socketOpsCommon.ioctl(ctx, io, args)
}
  2898  
  2899  func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
  2900  	t := kernel.TaskFromContext(ctx)
  2901  	if t == nil {
  2902  		panic("ioctl(2) may only be called from a task goroutine")
  2903  	}
  2904  
  2905  	// SIOCGSTAMP is implemented by netstack rather than all commonEndpoint
  2906  	// sockets.
  2907  	// TODO(b/78348848): Add a commonEndpoint method to support SIOCGSTAMP.
  2908  	switch args[1].Int() {
  2909  	case linux.SIOCGSTAMP:
  2910  		s.readMu.Lock()
  2911  		defer s.readMu.Unlock()
  2912  		if !s.timestampValid {
  2913  			return 0, syserror.ENOENT
  2914  		}
  2915  
  2916  		tv := linux.NsecToTimeval(s.timestampNS)
  2917  		_, err := tv.CopyOut(t, args[2].Pointer())
  2918  		return 0, err
  2919  
  2920  	case linux.TIOCINQ:
  2921  		v, terr := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption)
  2922  		if terr != nil {
  2923  			return 0, syserr.TranslateNetstackError(terr).ToError()
  2924  		}
  2925  
  2926  		if v > math.MaxInt32 {
  2927  			v = math.MaxInt32
  2928  		}
  2929  
  2930  		// Copy result to userspace.
  2931  		vP := primitive.Int32(v)
  2932  		_, err := vP.CopyOut(t, args[2].Pointer())
  2933  		return 0, err
  2934  	}
  2935  
  2936  	return Ioctl(ctx, s.Endpoint, io, args)
  2937  }
  2938  
  2939  // Ioctl performs a socket ioctl.
  2940  func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
  2941  	t := kernel.TaskFromContext(ctx)
  2942  	if t == nil {
  2943  		panic("ioctl(2) may only be called from a task goroutine")
  2944  	}
  2945  
  2946  	switch arg := int(args[1].Int()); arg {
  2947  	case linux.SIOCGIFFLAGS,
  2948  		linux.SIOCGIFADDR,
  2949  		linux.SIOCGIFBRDADDR,
  2950  		linux.SIOCGIFDSTADDR,
  2951  		linux.SIOCGIFHWADDR,
  2952  		linux.SIOCGIFINDEX,
  2953  		linux.SIOCGIFMAP,
  2954  		linux.SIOCGIFMETRIC,
  2955  		linux.SIOCGIFMTU,
  2956  		linux.SIOCGIFNAME,
  2957  		linux.SIOCGIFNETMASK,
  2958  		linux.SIOCGIFTXQLEN,
  2959  		linux.SIOCETHTOOL:
  2960  
  2961  		var ifr linux.IFReq
  2962  		if _, err := ifr.CopyIn(t, args[2].Pointer()); err != nil {
  2963  			return 0, err
  2964  		}
  2965  		if err := interfaceIoctl(ctx, io, arg, &ifr); err != nil {
  2966  			return 0, err.ToError()
  2967  		}
  2968  		_, err := ifr.CopyOut(t, args[2].Pointer())
  2969  		return 0, err
  2970  
  2971  	case linux.SIOCGIFCONF:
  2972  		// Return a list of interface addresses or the buffer size
  2973  		// necessary to hold the list.
  2974  		var ifc linux.IFConf
  2975  		if _, err := ifc.CopyIn(t, args[2].Pointer()); err != nil {
  2976  			return 0, err
  2977  		}
  2978  
  2979  		if err := ifconfIoctl(ctx, t, io, &ifc); err != nil {
  2980  			return 0, err
  2981  		}
  2982  
  2983  		_, err := ifc.CopyOut(t, args[2].Pointer())
  2984  		return 0, err
  2985  
  2986  	case linux.TIOCINQ:
  2987  		v, terr := ep.GetSockOptInt(tcpip.ReceiveQueueSizeOption)
  2988  		if terr != nil {
  2989  			return 0, syserr.TranslateNetstackError(terr).ToError()
  2990  		}
  2991  
  2992  		if v > math.MaxInt32 {
  2993  			v = math.MaxInt32
  2994  		}
  2995  		// Copy result to userspace.
  2996  		vP := primitive.Int32(v)
  2997  		_, err := vP.CopyOut(t, args[2].Pointer())
  2998  		return 0, err
  2999  
  3000  	case linux.TIOCOUTQ:
  3001  		v, terr := ep.GetSockOptInt(tcpip.SendQueueSizeOption)
  3002  		if terr != nil {
  3003  			return 0, syserr.TranslateNetstackError(terr).ToError()
  3004  		}
  3005  
  3006  		if v > math.MaxInt32 {
  3007  			v = math.MaxInt32
  3008  		}
  3009  
  3010  		// Copy result to userspace.
  3011  		vP := primitive.Int32(v)
  3012  		_, err := vP.CopyOut(t, args[2].Pointer())
  3013  		return 0, err
  3014  
  3015  	case linux.SIOCGIFMEM, linux.SIOCGIFPFLAGS, linux.SIOCGMIIPHY, linux.SIOCGMIIREG:
  3016  		unimpl.EmitUnimplementedEvent(ctx)
  3017  	}
  3018  
  3019  	return 0, syserror.ENOTTY
  3020  }
  3021  
  3022  // interfaceIoctl implements interface requests.
  3023  func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFReq) *syserr.Error {
  3024  	var (
  3025  		iface inet.Interface
  3026  		index int32
  3027  		found bool
  3028  	)
  3029  
  3030  	// Find the relevant device.
  3031  	stack := inet.StackFromContext(ctx)
  3032  	if stack == nil {
  3033  		return syserr.ErrNoDevice
  3034  	}
  3035  
  3036  	// SIOCGIFNAME uses ifr.ifr_ifindex rather than ifr.ifr_name to
  3037  	// identify a device.
  3038  	if arg == linux.SIOCGIFNAME {
  3039  		// Gets the name of the interface given the interface index
  3040  		// stored in ifr_ifindex.
  3041  		index = int32(hostarch.ByteOrder.Uint32(ifr.Data[:4]))
  3042  		if iface, ok := stack.Interfaces()[index]; ok {
  3043  			ifr.SetName(iface.Name)
  3044  			return nil
  3045  		}
  3046  		return syserr.ErrNoDevice
  3047  	}
  3048  
  3049  	// Find the relevant device.
  3050  	for index, iface = range stack.Interfaces() {
  3051  		if iface.Name == ifr.Name() {
  3052  			found = true
  3053  			break
  3054  		}
  3055  	}
  3056  	if !found {
  3057  		return syserr.ErrNoDevice
  3058  	}
  3059  
  3060  	switch arg {
  3061  	case linux.SIOCGIFINDEX:
  3062  		// Copy out the index to the data.
  3063  		hostarch.ByteOrder.PutUint32(ifr.Data[:], uint32(index))
  3064  
  3065  	case linux.SIOCGIFHWADDR:
  3066  		// Copy the hardware address out.
  3067  		//
  3068  		// Refer: https://linux.die.net/man/7/netdevice
  3069  		// SIOCGIFHWADDR, SIOCSIFHWADDR
  3070  		//
  3071  		// Get or set the hardware address of a device using
  3072  		// ifr_hwaddr. The hardware address is specified in a struct
  3073  		// sockaddr. sa_family contains the ARPHRD_* device type,
  3074  		// sa_data the L2 hardware address starting from byte 0. Setting
  3075  		// the hardware address is a privileged operation.
  3076  		hostarch.ByteOrder.PutUint16(ifr.Data[:], iface.DeviceType)
  3077  		n := copy(ifr.Data[2:], iface.Addr)
  3078  		for i := 2 + n; i < len(ifr.Data); i++ {
  3079  			ifr.Data[i] = 0 // Clear padding.
  3080  		}
  3081  
  3082  	case linux.SIOCGIFFLAGS:
  3083  		f, err := interfaceStatusFlags(stack, iface.Name)
  3084  		if err != nil {
  3085  			return err
  3086  		}
  3087  		// Drop the flags that don't fit in the size that we need to return. This
  3088  		// matches Linux behavior.
  3089  		hostarch.ByteOrder.PutUint16(ifr.Data[:2], uint16(f))
  3090  
  3091  	case linux.SIOCGIFADDR:
  3092  		// Copy the IPv4 address out.
  3093  		for _, addr := range stack.InterfaceAddrs()[index] {
  3094  			// This ioctl is only compatible with AF_INET addresses.
  3095  			if addr.Family != linux.AF_INET {
  3096  				continue
  3097  			}
  3098  			copy(ifr.Data[4:8], addr.Addr)
  3099  			break
  3100  		}
  3101  
  3102  	case linux.SIOCGIFMETRIC:
  3103  		// Gets the metric of the device. As per netdevice(7), this
  3104  		// always just sets ifr_metric to 0.
  3105  		hostarch.ByteOrder.PutUint32(ifr.Data[:4], 0)
  3106  
  3107  	case linux.SIOCGIFMTU:
  3108  		// Gets the MTU of the device.
  3109  		hostarch.ByteOrder.PutUint32(ifr.Data[:4], iface.MTU)
  3110  
  3111  	case linux.SIOCGIFMAP:
  3112  		// Gets the hardware parameters of the device.
  3113  		// TODO(github.com/SagerNet/issue/505): Implement.
  3114  
  3115  	case linux.SIOCGIFTXQLEN:
  3116  		// Gets the transmit queue length of the device.
  3117  		// TODO(github.com/SagerNet/issue/505): Implement.
  3118  
  3119  	case linux.SIOCGIFDSTADDR:
  3120  		// Gets the destination address of a point-to-point device.
  3121  		// TODO(github.com/SagerNet/issue/505): Implement.
  3122  
  3123  	case linux.SIOCGIFBRDADDR:
  3124  		// Gets the broadcast address of a device.
  3125  		// TODO(github.com/SagerNet/issue/505): Implement.
  3126  
  3127  	case linux.SIOCGIFNETMASK:
  3128  		// Gets the network mask of a device.
  3129  		for _, addr := range stack.InterfaceAddrs()[index] {
  3130  			// This ioctl is only compatible with AF_INET addresses.
  3131  			if addr.Family != linux.AF_INET {
  3132  				continue
  3133  			}
  3134  			// Populate ifr.ifr_netmask (type sockaddr).
  3135  			hostarch.ByteOrder.PutUint16(ifr.Data[0:], uint16(linux.AF_INET))
  3136  			hostarch.ByteOrder.PutUint16(ifr.Data[2:], 0)
  3137  			var mask uint32 = 0xffffffff << (32 - addr.PrefixLen)
  3138  			// Netmask is expected to be returned as a big endian
  3139  			// value.
  3140  			binary.BigEndian.PutUint32(ifr.Data[4:8], mask)
  3141  			break
  3142  		}
  3143  
  3144  	case linux.SIOCETHTOOL:
  3145  		// Stubbed out for now, Ideally we should implement the required
  3146  		// sub-commands for ETHTOOL
  3147  		//
  3148  		// See:
  3149  		// https://github.com/torvalds/linux/blob/aa0c9086b40c17a7ad94425b3b70dd1fdd7497bf/net/core/dev_ioctl.c
  3150  		return syserr.ErrEndpointOperation
  3151  
  3152  	default:
  3153  		// Not a valid call.
  3154  		return syserr.ErrInvalidArgument
  3155  	}
  3156  
  3157  	return nil
  3158  }
  3159  
  3160  // ifconfIoctl populates a struct ifconf for the SIOCGIFCONF ioctl.
  3161  func ifconfIoctl(ctx context.Context, t *kernel.Task, io usermem.IO, ifc *linux.IFConf) error {
  3162  	// If Ptr is NULL, return the necessary buffer size via Len.
  3163  	// Otherwise, write up to Len bytes starting at Ptr containing ifreq
  3164  	// structs.
  3165  	stack := inet.StackFromContext(ctx)
  3166  	if stack == nil {
  3167  		return syserr.ErrNoDevice.ToError()
  3168  	}
  3169  
  3170  	if ifc.Ptr == 0 {
  3171  		ifc.Len = int32(len(stack.Interfaces())) * int32(linux.SizeOfIFReq)
  3172  		return nil
  3173  	}
  3174  
  3175  	max := ifc.Len
  3176  	ifc.Len = 0
  3177  	for key, ifaceAddrs := range stack.InterfaceAddrs() {
  3178  		iface := stack.Interfaces()[key]
  3179  		for _, ifaceAddr := range ifaceAddrs {
  3180  			// Don't write past the end of the buffer.
  3181  			if ifc.Len+int32(linux.SizeOfIFReq) > max {
  3182  				break
  3183  			}
  3184  			if ifaceAddr.Family != linux.AF_INET {
  3185  				continue
  3186  			}
  3187  
  3188  			// Populate ifr.ifr_addr.
  3189  			ifr := linux.IFReq{}
  3190  			ifr.SetName(iface.Name)
  3191  			hostarch.ByteOrder.PutUint16(ifr.Data[0:2], uint16(ifaceAddr.Family))
  3192  			hostarch.ByteOrder.PutUint16(ifr.Data[2:4], 0)
  3193  			copy(ifr.Data[4:8], ifaceAddr.Addr[:4])
  3194  
  3195  			// Copy the ifr to userspace.
  3196  			dst := uintptr(ifc.Ptr) + uintptr(ifc.Len)
  3197  			ifc.Len += int32(linux.SizeOfIFReq)
  3198  			if _, err := ifr.CopyOut(t, hostarch.Addr(dst)); err != nil {
  3199  				return err
  3200  			}
  3201  		}
  3202  	}
  3203  	return nil
  3204  }
  3205  
  3206  // interfaceStatusFlags returns status flags for an interface in the stack.
  3207  // Flag values and meanings are described in greater detail in netdevice(7) in
  3208  // the SIOCGIFFLAGS section.
  3209  func interfaceStatusFlags(stack inet.Stack, name string) (uint32, *syserr.Error) {
  3210  	// We should only ever be passed a netstack.Stack.
  3211  	epstack, ok := stack.(*Stack)
  3212  	if !ok {
  3213  		return 0, errStackType
  3214  	}
  3215  
  3216  	// Find the NIC corresponding to this interface.
  3217  	for _, info := range epstack.Stack.NICInfo() {
  3218  		if info.Name == name {
  3219  			return nicStateFlagsToLinux(info.Flags), nil
  3220  		}
  3221  	}
  3222  	return 0, syserr.ErrNoDevice
  3223  }
  3224  
  3225  func nicStateFlagsToLinux(f stack.NICStateFlags) uint32 {
  3226  	var rv uint32
  3227  	if f.Up {
  3228  		rv |= linux.IFF_UP | linux.IFF_LOWER_UP
  3229  	}
  3230  	if f.Running {
  3231  		rv |= linux.IFF_RUNNING
  3232  	}
  3233  	if f.Promiscuous {
  3234  		rv |= linux.IFF_PROMISC
  3235  	}
  3236  	if f.Loopback {
  3237  		rv |= linux.IFF_LOOPBACK
  3238  	}
  3239  	return rv
  3240  }
  3241  
  3242  func isTCPSocket(skType linux.SockType, skProto int) bool {
  3243  	return skType == linux.SOCK_STREAM && (skProto == 0 || skProto == unix.IPPROTO_TCP)
  3244  }
  3245  
  3246  func isUDPSocket(skType linux.SockType, skProto int) bool {
  3247  	return skType == linux.SOCK_DGRAM && (skProto == 0 || skProto == unix.IPPROTO_UDP)
  3248  }
  3249  
  3250  func isICMPSocket(skType linux.SockType, skProto int) bool {
  3251  	return skType == linux.SOCK_DGRAM && (skProto == unix.IPPROTO_ICMP || skProto == unix.IPPROTO_ICMPV6)
  3252  }
  3253  
  3254  // State implements socket.Socket.State. State translates the internal state
  3255  // returned by netstack to values defined by Linux.
  3256  func (s *socketOpsCommon) State() uint32 {
  3257  	if s.family != linux.AF_INET && s.family != linux.AF_INET6 {
  3258  		// States not implemented for this socket's family.
  3259  		return 0
  3260  	}
  3261  
  3262  	switch {
  3263  	case isTCPSocket(s.skType, s.protocol):
  3264  		// TCP socket.
  3265  		switch tcp.EndpointState(s.Endpoint.State()) {
  3266  		case tcp.StateEstablished:
  3267  			return linux.TCP_ESTABLISHED
  3268  		case tcp.StateSynSent:
  3269  			return linux.TCP_SYN_SENT
  3270  		case tcp.StateSynRecv:
  3271  			return linux.TCP_SYN_RECV
  3272  		case tcp.StateFinWait1:
  3273  			return linux.TCP_FIN_WAIT1
  3274  		case tcp.StateFinWait2:
  3275  			return linux.TCP_FIN_WAIT2
  3276  		case tcp.StateTimeWait:
  3277  			return linux.TCP_TIME_WAIT
  3278  		case tcp.StateClose, tcp.StateInitial, tcp.StateBound, tcp.StateConnecting, tcp.StateError:
  3279  			return linux.TCP_CLOSE
  3280  		case tcp.StateCloseWait:
  3281  			return linux.TCP_CLOSE_WAIT
  3282  		case tcp.StateLastAck:
  3283  			return linux.TCP_LAST_ACK
  3284  		case tcp.StateListen:
  3285  			return linux.TCP_LISTEN
  3286  		case tcp.StateClosing:
  3287  			return linux.TCP_CLOSING
  3288  		default:
  3289  			// Internal or unknown state.
  3290  			return 0
  3291  		}
  3292  	case isUDPSocket(s.skType, s.protocol):
  3293  		// UDP socket.
  3294  		switch udp.EndpointState(s.Endpoint.State()) {
  3295  		case udp.StateInitial, udp.StateBound, udp.StateClosed:
  3296  			return linux.TCP_CLOSE
  3297  		case udp.StateConnected:
  3298  			return linux.TCP_ESTABLISHED
  3299  		default:
  3300  			return 0
  3301  		}
  3302  	case isICMPSocket(s.skType, s.protocol):
  3303  		// TODO(b/112063468): Export states for ICMP sockets.
  3304  	case s.skType == linux.SOCK_RAW:
  3305  		// TODO(b/112063468): Export states for raw sockets.
  3306  	default:
  3307  		// Unknown transport protocol, how did we make this socket?
  3308  		log.Warningf("Unknown transport protocol for an existing socket: family=%v, type=%v, protocol=%v, internal type %v", s.family, s.skType, s.protocol, reflect.TypeOf(s.Endpoint).Elem())
  3309  		return 0
  3310  	}
  3311  
  3312  	return 0
  3313  }
  3314  
  3315  // Type implements socket.Socket.Type.
  3316  func (s *socketOpsCommon) Type() (family int, skType linux.SockType, protocol int) {
  3317  	return s.family, s.skType, s.protocol
  3318  }
  3319  
  3320  // LINT.ThenChange(./netstack_vfs2.go)