github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/socket/netstack/netstack.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package netstack provides an implementation of the socket.Socket interface
    16  // that is backed by a tcpip.Endpoint.
    17  //
    18  // It does not depend on any particular endpoint implementation, and thus can
    19  // be used to expose certain endpoints to the sentry while leaving others out,
    20  // for example, TCP endpoints and Unix-domain endpoints.
    21  //
    22  // Lock ordering: netstack => mm: ioSequenceReadWriter copies user memory inside
    23  // tcpip.Endpoint.Write(). Netstack is allowed to (and does) hold locks during
    24  // this operation.
    25  package netstack
    26  
    27  import (
    28  	"bytes"
    29  	"encoding/binary"
    30  	"fmt"
    31  	"io"
    32  	"io/ioutil"
    33  	"math"
    34  	"reflect"
    35  	"time"
    36  
    37  	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
    38  	"github.com/MerlinKodo/gvisor/pkg/abi/linux/errno"
    39  	"github.com/MerlinKodo/gvisor/pkg/context"
    40  	"github.com/MerlinKodo/gvisor/pkg/errors/linuxerr"
    41  	"github.com/MerlinKodo/gvisor/pkg/eventchannel"
    42  	"github.com/MerlinKodo/gvisor/pkg/hostarch"
    43  	"github.com/MerlinKodo/gvisor/pkg/log"
    44  	"github.com/MerlinKodo/gvisor/pkg/marshal"
    45  	"github.com/MerlinKodo/gvisor/pkg/marshal/primitive"
    46  	"github.com/MerlinKodo/gvisor/pkg/metric"
    47  	"github.com/MerlinKodo/gvisor/pkg/sentry/arch"
    48  	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/sockfs"
    49  	"github.com/MerlinKodo/gvisor/pkg/sentry/inet"
    50  	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel"
    51  	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/auth"
    52  	ktime "github.com/MerlinKodo/gvisor/pkg/sentry/kernel/time"
    53  	"github.com/MerlinKodo/gvisor/pkg/sentry/socket"
    54  	"github.com/MerlinKodo/gvisor/pkg/sentry/socket/netfilter"
    55  	epb "github.com/MerlinKodo/gvisor/pkg/sentry/socket/netstack/events_go_proto"
    56  	"github.com/MerlinKodo/gvisor/pkg/sentry/vfs"
    57  	"github.com/MerlinKodo/gvisor/pkg/sync"
    58  	"github.com/MerlinKodo/gvisor/pkg/syserr"
    59  	"github.com/MerlinKodo/gvisor/pkg/tcpip"
    60  	"github.com/MerlinKodo/gvisor/pkg/tcpip/header"
    61  	"github.com/MerlinKodo/gvisor/pkg/tcpip/stack"
    62  	"github.com/MerlinKodo/gvisor/pkg/tcpip/transport"
    63  	"github.com/MerlinKodo/gvisor/pkg/tcpip/transport/tcp"
    64  	"github.com/MerlinKodo/gvisor/pkg/usermem"
    65  	"github.com/MerlinKodo/gvisor/pkg/waiter"
    66  	"golang.org/x/sys/unix"
    67  	"google.golang.org/protobuf/proto"
    68  )
    69  
// bitsPerUint32 is the number of bits in a uint32.
const bitsPerUint32 = 32
    71  
    72  // statCounterValue returns a function usable as callback function when defining a gVisor Sentry
    73  // metric that contains the value counted by the StatCounter.
    74  // This avoids a dependency loop in the tcpip package.
    75  func statCounterValue(cm *tcpip.StatCounter) func(...*metric.FieldValue) uint64 {
    76  	return func(...*metric.FieldValue) uint64 {
    77  		return cm.Value()
    78  	}
    79  }
    80  
    81  func mustCreateMetric(name, description string) *tcpip.StatCounter {
    82  	var cm tcpip.StatCounter
    83  	metric.MustRegisterCustomUint64Metric(name, true /* cumulative */, false /* sync */, description, statCounterValue(&cm))
    84  	return &cm
    85  }
    86  
    87  func mustCreateGauge(name, description string) *tcpip.StatCounter {
    88  	var cm tcpip.StatCounter
    89  	metric.MustRegisterCustomUint64Metric(name, false /* cumulative */, false /* sync */, description, statCounterValue(&cm))
    90  	return &cm
    91  }
    92  
    93  // Metrics contains metrics exported by netstack.
    94  var Metrics = tcpip.Stats{
    95  	DroppedPackets: mustCreateMetric("/netstack/dropped_packets", "Number of packets dropped at the transport layer."),
    96  	NICs: tcpip.NICStats{
    97  		MalformedL4RcvdPackets: mustCreateMetric("/netstack/nic/malformed_l4_received_packets", "Number of packets received that failed L4 header parsing."),
    98  		Tx: tcpip.NICPacketStats{
    99  			Packets: mustCreateMetric("/netstack/nic/tx/packets", "Number of packets transmitted."),
   100  			Bytes:   mustCreateMetric("/netstack/nic/tx/bytes", "Number of bytes transmitted."),
   101  		},
   102  		TxPacketsDroppedNoBufferSpace: mustCreateMetric("/netstack/nic/tx_packets_dropped_no_buffer_space", "Number of TX packets dropped as a result of no buffer space errors."),
   103  		Rx: tcpip.NICPacketStats{
   104  			Packets: mustCreateMetric("/netstack/nic/rx/packets", "Number of packets received."),
   105  			Bytes:   mustCreateMetric("/netstack/nic/rx/bytes", "Number of bytes received."),
   106  		},
   107  		DisabledRx: tcpip.NICPacketStats{
   108  			Packets: mustCreateMetric("/netstack/nic/disabled_rx/packets", "Number of packets received on disabled NICs."),
   109  			Bytes:   mustCreateMetric("/netstack/nic/disabled_rx/bytes", "Number of bytes received on disabled NICs."),
   110  		},
   111  		Neighbor: tcpip.NICNeighborStats{
   112  			UnreachableEntryLookups:                    mustCreateMetric("/netstack/nic/neighbor/unreachable_entry_loopups", "Number of lookups performed on a neighbor entry in Unreachable state."),
   113  			DroppedConfirmationForNoninitiatedNeighbor: mustCreateMetric("/netstack/nic/neighbor/dropped_confirmation_for_noninitiated_neighbor", "Number of advertisements received that don't match an entry in the neighbor cache."),
   114  			DroppedInvalidLinkAddressConfirmations:     mustCreateMetric("/netstack/nic/neighbor/dropped_invalid_link_address_confirmations", "Number of advertisements dropped because they have empty source link-layer addresses"),
   115  		},
   116  	},
   117  	ICMP: tcpip.ICMPStats{
   118  		V4: tcpip.ICMPv4Stats{
   119  			PacketsSent: tcpip.ICMPv4SentPacketStats{
   120  				ICMPv4PacketStats: tcpip.ICMPv4PacketStats{
   121  					EchoRequest:    mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_request", "Number of ICMPv4 echo request packets sent."),
   122  					EchoReply:      mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_reply", "Number of ICMPv4 echo reply packets sent."),
   123  					DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_sent/dst_unreachable", "Number of ICMPv4 destination unreachable packets sent."),
   124  					SrcQuench:      mustCreateMetric("/netstack/icmp/v4/packets_sent/src_quench", "Number of ICMPv4 source quench packets sent."),
   125  					Redirect:       mustCreateMetric("/netstack/icmp/v4/packets_sent/redirect", "Number of ICMPv4 redirect packets sent."),
   126  					TimeExceeded:   mustCreateMetric("/netstack/icmp/v4/packets_sent/time_exceeded", "Number of ICMPv4 time exceeded packets sent."),
   127  					ParamProblem:   mustCreateMetric("/netstack/icmp/v4/packets_sent/param_problem", "Number of ICMPv4 parameter problem packets sent."),
   128  					Timestamp:      mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp", "Number of ICMPv4 timestamp packets sent."),
   129  					TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp_reply", "Number of ICMPv4 timestamp reply packets sent."),
   130  					InfoRequest:    mustCreateMetric("/netstack/icmp/v4/packets_sent/info_request", "Number of ICMPv4 information request packets sent."),
   131  					InfoReply:      mustCreateMetric("/netstack/icmp/v4/packets_sent/info_reply", "Number of ICMPv4 information reply packets sent."),
   132  				},
   133  				Dropped:     mustCreateMetric("/netstack/icmp/v4/packets_sent/dropped", "Number of ICMPv4 packets dropped due to link layer errors."),
   134  				RateLimited: mustCreateMetric("/netstack/icmp/v4/packets_sent/rate_limited", "Number of ICMPv4 packets dropped due to rate limit being exceeded."),
   135  			},
   136  			PacketsReceived: tcpip.ICMPv4ReceivedPacketStats{
   137  				ICMPv4PacketStats: tcpip.ICMPv4PacketStats{
   138  					EchoRequest:    mustCreateMetric("/netstack/icmp/v4/packets_received/echo_request", "Number of ICMPv4 echo request packets received."),
   139  					EchoReply:      mustCreateMetric("/netstack/icmp/v4/packets_received/echo_reply", "Number of ICMPv4 echo reply packets received."),
   140  					DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_received/dst_unreachable", "Number of ICMPv4 destination unreachable packets received."),
   141  					SrcQuench:      mustCreateMetric("/netstack/icmp/v4/packets_received/src_quench", "Number of ICMPv4 source quench packets received."),
   142  					Redirect:       mustCreateMetric("/netstack/icmp/v4/packets_received/redirect", "Number of ICMPv4 redirect packets received."),
   143  					TimeExceeded:   mustCreateMetric("/netstack/icmp/v4/packets_received/time_exceeded", "Number of ICMPv4 time exceeded packets received."),
   144  					ParamProblem:   mustCreateMetric("/netstack/icmp/v4/packets_received/param_problem", "Number of ICMPv4 parameter problem packets received."),
   145  					Timestamp:      mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp", "Number of ICMPv4 timestamp packets received."),
   146  					TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp_reply", "Number of ICMPv4 timestamp reply packets received."),
   147  					InfoRequest:    mustCreateMetric("/netstack/icmp/v4/packets_received/info_request", "Number of ICMPv4 information request packets received."),
   148  					InfoReply:      mustCreateMetric("/netstack/icmp/v4/packets_received/info_reply", "Number of ICMPv4 information reply packets received."),
   149  				},
   150  				Invalid: mustCreateMetric("/netstack/icmp/v4/packets_received/invalid", "Number of ICMPv4 packets received that the transport layer could not parse."),
   151  			},
   152  		},
   153  		V6: tcpip.ICMPv6Stats{
   154  			PacketsSent: tcpip.ICMPv6SentPacketStats{
   155  				ICMPv6PacketStats: tcpip.ICMPv6PacketStats{
   156  					EchoRequest:             mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_request", "Number of ICMPv6 echo request packets sent."),
   157  					EchoReply:               mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_reply", "Number of ICMPv6 echo reply packets sent."),
   158  					DstUnreachable:          mustCreateMetric("/netstack/icmp/v6/packets_sent/dst_unreachable", "Number of ICMPv6 destination unreachable packets sent."),
   159  					PacketTooBig:            mustCreateMetric("/netstack/icmp/v6/packets_sent/packet_too_big", "Number of ICMPv6 packet too big packets sent."),
   160  					TimeExceeded:            mustCreateMetric("/netstack/icmp/v6/packets_sent/time_exceeded", "Number of ICMPv6 time exceeded packets sent."),
   161  					ParamProblem:            mustCreateMetric("/netstack/icmp/v6/packets_sent/param_problem", "Number of ICMPv6 parameter problem packets sent."),
   162  					RouterSolicit:           mustCreateMetric("/netstack/icmp/v6/packets_sent/router_solicit", "Number of ICMPv6 router solicit packets sent."),
   163  					RouterAdvert:            mustCreateMetric("/netstack/icmp/v6/packets_sent/router_advert", "Number of ICMPv6 router advert packets sent."),
   164  					NeighborSolicit:         mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_solicit", "Number of ICMPv6 neighbor solicit packets sent."),
   165  					NeighborAdvert:          mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_advert", "Number of ICMPv6 neighbor advert packets sent."),
   166  					RedirectMsg:             mustCreateMetric("/netstack/icmp/v6/packets_sent/redirect_msg", "Number of ICMPv6 redirect message packets sent."),
   167  					MulticastListenerQuery:  mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_query", "Number of ICMPv6 multicast listener query packets sent."),
   168  					MulticastListenerReport: mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_report", "Number of ICMPv6 multicast listener report packets sent."),
   169  					MulticastListenerDone:   mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_done", "Number of ICMPv6 multicast listener done packets sent."),
   170  				},
   171  				Dropped:     mustCreateMetric("/netstack/icmp/v6/packets_sent/dropped", "Number of ICMPv6 packets dropped due to link layer errors."),
   172  				RateLimited: mustCreateMetric("/netstack/icmp/v6/packets_sent/rate_limited", "Number of ICMPv6 packets dropped due to rate limit being exceeded."),
   173  			},
   174  			PacketsReceived: tcpip.ICMPv6ReceivedPacketStats{
   175  				ICMPv6PacketStats: tcpip.ICMPv6PacketStats{
   176  					EchoRequest:             mustCreateMetric("/netstack/icmp/v6/packets_received/echo_request", "Number of ICMPv6 echo request packets received."),
   177  					EchoReply:               mustCreateMetric("/netstack/icmp/v6/packets_received/echo_reply", "Number of ICMPv6 echo reply packets received."),
   178  					DstUnreachable:          mustCreateMetric("/netstack/icmp/v6/packets_received/dst_unreachable", "Number of ICMPv6 destination unreachable packets received."),
   179  					PacketTooBig:            mustCreateMetric("/netstack/icmp/v6/packets_received/packet_too_big", "Number of ICMPv6 packet too big packets received."),
   180  					TimeExceeded:            mustCreateMetric("/netstack/icmp/v6/packets_received/time_exceeded", "Number of ICMPv6 time exceeded packets received."),
   181  					ParamProblem:            mustCreateMetric("/netstack/icmp/v6/packets_received/param_problem", "Number of ICMPv6 parameter problem packets received."),
   182  					RouterSolicit:           mustCreateMetric("/netstack/icmp/v6/packets_received/router_solicit", "Number of ICMPv6 router solicit packets received."),
   183  					RouterAdvert:            mustCreateMetric("/netstack/icmp/v6/packets_received/router_advert", "Number of ICMPv6 router advert packets received."),
   184  					NeighborSolicit:         mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_solicit", "Number of ICMPv6 neighbor solicit packets received."),
   185  					NeighborAdvert:          mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_advert", "Number of ICMPv6 neighbor advert packets received."),
   186  					RedirectMsg:             mustCreateMetric("/netstack/icmp/v6/packets_received/redirect_msg", "Number of ICMPv6 redirect message packets received."),
   187  					MulticastListenerQuery:  mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_query", "Number of ICMPv6 multicast listener query packets received."),
   188  					MulticastListenerReport: mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_report", "Number of ICMPv6 multicast listener report packets sent."),
   189  					MulticastListenerDone:   mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_done", "Number of ICMPv6 multicast listener done packets sent."),
   190  				},
   191  				Unrecognized:                   mustCreateMetric("/netstack/icmp/v6/packets_received/unrecognized", "Number of ICMPv6 packets received that the transport layer does not know how to parse."),
   192  				Invalid:                        mustCreateMetric("/netstack/icmp/v6/packets_received/invalid", "Number of ICMPv6 packets received that the transport layer could not parse."),
   193  				RouterOnlyPacketsDroppedByHost: mustCreateMetric("/netstack/icmp/v6/packets_received/router_only_packets_dropped_by_host", "Number of ICMPv6 packets dropped due to being router-specific packets."),
   194  			},
   195  		},
   196  	},
   197  	IGMP: tcpip.IGMPStats{
   198  		PacketsSent: tcpip.IGMPSentPacketStats{
   199  			IGMPPacketStats: tcpip.IGMPPacketStats{
   200  				MembershipQuery:    mustCreateMetric("/netstack/igmp/packets_sent/membership_query", "Number of IGMP Membership Query messages sent."),
   201  				V1MembershipReport: mustCreateMetric("/netstack/igmp/packets_sent/v1_membership_report", "Number of IGMPv1 Membership Report messages sent."),
   202  				V2MembershipReport: mustCreateMetric("/netstack/igmp/packets_sent/v2_membership_report", "Number of IGMPv2 Membership Report messages sent."),
   203  				LeaveGroup:         mustCreateMetric("/netstack/igmp/packets_sent/leave_group", "Number of IGMP Leave Group messages sent."),
   204  			},
   205  			Dropped: mustCreateMetric("/netstack/igmp/packets_sent/dropped", "Number of IGMP packets dropped due to link layer errors."),
   206  		},
   207  		PacketsReceived: tcpip.IGMPReceivedPacketStats{
   208  			IGMPPacketStats: tcpip.IGMPPacketStats{
   209  				MembershipQuery:    mustCreateMetric("/netstack/igmp/packets_received/membership_query", "Number of IGMP Membership Query messages received."),
   210  				V1MembershipReport: mustCreateMetric("/netstack/igmp/packets_received/v1_membership_report", "Number of IGMPv1 Membership Report messages received."),
   211  				V2MembershipReport: mustCreateMetric("/netstack/igmp/packets_received/v2_membership_report", "Number of IGMPv2 Membership Report messages received."),
   212  				LeaveGroup:         mustCreateMetric("/netstack/igmp/packets_received/leave_group", "Number of IGMP Leave Group messages received."),
   213  			},
   214  			Invalid:        mustCreateMetric("/netstack/igmp/packets_received/invalid", "Number of IGMP packets received that could not be parsed."),
   215  			ChecksumErrors: mustCreateMetric("/netstack/igmp/packets_received/checksum_errors", "Number of received IGMP packets with bad checksums."),
   216  			Unrecognized:   mustCreateMetric("/netstack/igmp/packets_received/unrecognized", "Number of unrecognized IGMP packets received."),
   217  		},
   218  	},
   219  	IP: tcpip.IPStats{
   220  		PacketsReceived:                     mustCreateMetric("/netstack/ip/packets_received", "Number of IP packets received from the link layer in nic.DeliverNetworkPacket."),
   221  		DisabledPacketsReceived:             mustCreateMetric("/netstack/ip/disabled_packets_received", "Number of IP packets received from the link layer when the IP layer is disabled."),
   222  		InvalidDestinationAddressesReceived: mustCreateMetric("/netstack/ip/invalid_addresses_received", "Number of IP packets received with an unknown or invalid destination address."),
   223  		InvalidSourceAddressesReceived:      mustCreateMetric("/netstack/ip/invalid_source_addresses_received", "Number of IP packets received with an unknown or invalid source address."),
   224  		PacketsDelivered:                    mustCreateMetric("/netstack/ip/packets_delivered", "Number of incoming IP packets that are successfully delivered to the transport layer via HandlePacket."),
   225  		PacketsSent:                         mustCreateMetric("/netstack/ip/packets_sent", "Number of IP packets sent via WritePacket."),
   226  		OutgoingPacketErrors:                mustCreateMetric("/netstack/ip/outgoing_packet_errors", "Number of IP packets which failed to write to a link-layer endpoint."),
   227  		MalformedPacketsReceived:            mustCreateMetric("/netstack/ip/malformed_packets_received", "Number of IP packets which failed IP header validation checks."),
   228  		MalformedFragmentsReceived:          mustCreateMetric("/netstack/ip/malformed_fragments_received", "Number of IP fragments which failed IP fragment validation checks."),
   229  		IPTablesPreroutingDropped:           mustCreateMetric("/netstack/ip/iptables/prerouting_dropped", "Number of IP packets dropped in the Prerouting chain."),
   230  		IPTablesInputDropped:                mustCreateMetric("/netstack/ip/iptables/input_dropped", "Number of IP packets dropped in the Input chain."),
   231  		IPTablesOutputDropped:               mustCreateMetric("/netstack/ip/iptables/output_dropped", "Number of IP packets dropped in the Output chain."),
   232  		OptionTimestampReceived:             mustCreateMetric("/netstack/ip/options/timestamp_received", "Number of timestamp options found in received IP packets."),
   233  		OptionRecordRouteReceived:           mustCreateMetric("/netstack/ip/options/record_route_received", "Number of record route options found in received IP packets."),
   234  		OptionRouterAlertReceived:           mustCreateMetric("/netstack/ip/options/router_alert_received", "Number of router alert options found in received IP packets."),
   235  		OptionUnknownReceived:               mustCreateMetric("/netstack/ip/options/unknown_received", "Number of unknown options found in received IP packets."),
   236  		Forwarding: tcpip.IPForwardingStats{
   237  			Unrouteable:            mustCreateMetric("/netstack/ip/forwarding/unrouteable", "Number of IP packets received which couldn't be routed and thus were not forwarded."),
   238  			ExhaustedTTL:           mustCreateMetric("/netstack/ip/forwarding/exhausted_ttl", "Number of IP packets received which could not be forwarded due to an exhausted TTL."),
   239  			LinkLocalSource:        mustCreateMetric("/netstack/ip/forwarding/link_local_source_address", "Number of IP packets received which could not be forwarded due to a link-local source address."),
   240  			LinkLocalDestination:   mustCreateMetric("/netstack/ip/forwarding/link_local_destination_address", "Number of IP packets received which could not be forwarded due to a link-local destination address."),
   241  			ExtensionHeaderProblem: mustCreateMetric("/netstack/ip/forwarding/extension_header_problem", "Number of IP packets received which could not be forwarded due to a problem processing their IPv6 extension headers."),
   242  			PacketTooBig:           mustCreateMetric("/netstack/ip/forwarding/packet_too_big", "Number of IP packets received which could not be forwarded because they could not fit within the outgoing MTU."),
   243  			HostUnreachable:        mustCreateMetric("/netstack/ip/forwarding/host_unreachable", "Number of IP packets received which could not be forwarded due to unresolvable next hop."),
   244  			Errors:                 mustCreateMetric("/netstack/ip/forwarding/errors", "Number of IP packets which couldn't be forwarded."),
   245  		},
   246  	},
   247  	ARP: tcpip.ARPStats{
   248  		PacketsReceived:                                 mustCreateMetric("/netstack/arp/packets_received", "Number of ARP packets received from the link layer."),
   249  		DisabledPacketsReceived:                         mustCreateMetric("/netstack/arp/disabled_packets_received", "Number of ARP packets received from the link layer when the ARP layer is disabled."),
   250  		MalformedPacketsReceived:                        mustCreateMetric("/netstack/arp/malformed_packets_received", "Number of ARP packets which failed ARP header validation checks."),
   251  		RequestsReceived:                                mustCreateMetric("/netstack/arp/requests_received", "Number of ARP requests received."),
   252  		RequestsReceivedUnknownTargetAddress:            mustCreateMetric("/netstack/arp/requests_received_unknown_addr", "Number of ARP requests received with an unknown target address."),
   253  		OutgoingRequestInterfaceHasNoLocalAddressErrors: mustCreateMetric("/netstack/arp/outgoing_requests_iface_has_no_addr", "Number of failed attempts to send an ARP request with an interface that has no network address."),
   254  		OutgoingRequestBadLocalAddressErrors:            mustCreateMetric("/netstack/arp/outgoing_requests_invalid_local_addr", "Number of failed attempts to send an ARP request with a provided local address that is invalid."),
   255  		OutgoingRequestsDropped:                         mustCreateMetric("/netstack/arp/outgoing_requests_dropped", "Number of ARP requests which failed to write to a link-layer endpoint."),
   256  		OutgoingRequestsSent:                            mustCreateMetric("/netstack/arp/outgoing_requests_sent", "Number of ARP requests sent."),
   257  		RepliesReceived:                                 mustCreateMetric("/netstack/arp/replies_received", "Number of ARP replies received."),
   258  		OutgoingRepliesDropped:                          mustCreateMetric("/netstack/arp/outgoing_replies_dropped", "Number of ARP replies which failed to write to a link-layer endpoint."),
   259  		OutgoingRepliesSent:                             mustCreateMetric("/netstack/arp/outgoing_replies_sent", "Number of ARP replies sent."),
   260  	},
   261  	TCP: tcpip.TCPStats{
   262  		ActiveConnectionOpenings:           mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."),
   263  		PassiveConnectionOpenings:          mustCreateMetric("/netstack/tcp/passive_connection_openings", "Number of connections opened successfully via Listen."),
   264  		CurrentEstablished:                 mustCreateGauge("/netstack/tcp/current_established", "Number of connections in ESTABLISHED state now."),
   265  		CurrentConnected:                   mustCreateGauge("/netstack/tcp/current_open", "Number of connections that are in connected state."),
   266  		EstablishedResets:                  mustCreateMetric("/netstack/tcp/established_resets", "Number of times TCP connections have made a direct transition to the CLOSED state from either the ESTABLISHED state or the CLOSE-WAIT state"),
   267  		EstablishedClosed:                  mustCreateMetric("/netstack/tcp/established_closed", "Number of times established TCP connections made a transition to CLOSED state."),
   268  		EstablishedTimedout:                mustCreateMetric("/netstack/tcp/established_timedout", "Number of times  an established connection was reset because of keep-alive time out."),
   269  		ListenOverflowSynDrop:              mustCreateMetric("/netstack/tcp/listen_overflow_syn_drop", "Number of times the listen queue overflowed and a SYN was dropped."),
   270  		ListenOverflowAckDrop:              mustCreateMetric("/netstack/tcp/listen_overflow_ack_drop", "Number of times the listen queue overflowed and the final ACK in the handshake was dropped."),
   271  		ListenOverflowSynCookieSent:        mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_sent", "Number of times a SYN cookie was sent."),
   272  		ListenOverflowSynCookieRcvd:        mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_rcvd", "Number of times a SYN cookie was received."),
   273  		ListenOverflowInvalidSynCookieRcvd: mustCreateMetric("/netstack/tcp/listen_overflow_invalid_syn_cookie_rcvd", "Number of times an invalid SYN cookie was received."),
   274  		FailedConnectionAttempts:           mustCreateMetric("/netstack/tcp/failed_connection_attempts", "Number of calls to Connect or Listen (active and passive openings, respectively) that end in an error."),
   275  		ValidSegmentsReceived:              mustCreateMetric("/netstack/tcp/valid_segments_received", "Number of TCP segments received that the transport layer successfully parsed."),
   276  		InvalidSegmentsReceived:            mustCreateMetric("/netstack/tcp/invalid_segments_received", "Number of TCP segments received that the transport layer could not parse."),
   277  		SegmentsSent:                       mustCreateMetric("/netstack/tcp/segments_sent", "Number of TCP segments sent."),
   278  		SegmentSendErrors:                  mustCreateMetric("/netstack/tcp/segment_send_errors", "Number of TCP segments failed to be sent."),
   279  		ResetsSent:                         mustCreateMetric("/netstack/tcp/resets_sent", "Number of TCP resets sent."),
   280  		ResetsReceived:                     mustCreateMetric("/netstack/tcp/resets_received", "Number of TCP resets received."),
   281  		Retransmits:                        mustCreateMetric("/netstack/tcp/retransmits", "Number of TCP segments retransmitted."),
   282  		FastRecovery:                       mustCreateMetric("/netstack/tcp/fast_recovery", "Number of times fast recovery was used to recover from packet loss."),
   283  		SACKRecovery:                       mustCreateMetric("/netstack/tcp/sack_recovery", "Number of times SACK recovery was used to recover from packet loss."),
   284  		TLPRecovery:                        mustCreateMetric("/netstack/tcp/tlp_recovery", "Number of times tail loss probe triggers recovery from tail loss."),
   285  		SlowStartRetransmits:               mustCreateMetric("/netstack/tcp/slow_start_retransmits", "Number of segments retransmitted in slow start mode."),
   286  		FastRetransmit:                     mustCreateMetric("/netstack/tcp/fast_retransmit", "Number of TCP segments which were fast retransmitted."),
   287  		Timeouts:                           mustCreateMetric("/netstack/tcp/timeouts", "Number of times RTO expired."),
   288  		ChecksumErrors:                     mustCreateMetric("/netstack/tcp/checksum_errors", "Number of segments dropped due to bad checksums."),
   289  		FailedPortReservations:             mustCreateMetric("/netstack/tcp/failed_port_reservations", "Number of time TCP failed to reserve a port."),
   290  		SegmentsAckedWithDSACK:             mustCreateMetric("/netstack/tcp/segments_acked_with_dsack", "Number of segments for which DSACK was received."),
   291  		SpuriousRecovery:                   mustCreateMetric("/netstack/tcp/spurious_recovery", "Number of times the connection entered loss recovery spuriously."),
   292  		SpuriousRTORecovery:                mustCreateMetric("/netstack/tcp/spurious_rto_recovery", "Number of times the connection entered RTO spuriously."),
   293  		ForwardMaxInFlightDrop:             mustCreateMetric("/netstack/tcp/forward_max_in_flight_drop", "Number of connection requests dropped due to exceeding in-flight limit."),
   294  	},
   295  	UDP: tcpip.UDPStats{
   296  		PacketsReceived:          mustCreateMetric("/netstack/udp/packets_received", "Number of UDP datagrams received via HandlePacket."),
   297  		UnknownPortErrors:        mustCreateMetric("/netstack/udp/unknown_port_errors", "Number of incoming UDP datagrams dropped because they did not have a known destination port."),
   298  		ReceiveBufferErrors:      mustCreateMetric("/netstack/udp/receive_buffer_errors", "Number of incoming UDP datagrams dropped due to the receiving buffer being in an invalid state."),
   299  		MalformedPacketsReceived: mustCreateMetric("/netstack/udp/malformed_packets_received", "Number of incoming UDP datagrams dropped due to the UDP header being in a malformed state."),
   300  		PacketsSent:              mustCreateMetric("/netstack/udp/packets_sent", "Number of UDP datagrams sent."),
   301  		PacketSendErrors:         mustCreateMetric("/netstack/udp/packet_send_errors", "Number of UDP datagrams failed to be sent."),
   302  		ChecksumErrors:           mustCreateMetric("/netstack/udp/checksum_errors", "Number of UDP datagrams dropped due to bad checksums."),
   303  	},
   304  }
   305  
   306  // DefaultTTL is linux's default TTL. All network protocols in all stacks used
   307  // with this package must have this value set as their default TTL.
   308  const DefaultTTL = 64
   309  
   310  const sizeOfInt32 int = 4
   311  
// errStackType is an EINVAL-backed syserr error whose message states that a
// netstack.Stack was expected but not received.
var errStackType = syserr.New("expected but did not receive a netstack.Stack", errno.EINVAL)
   313  
// commonEndpoint represents the intersection of a tcpip.Endpoint and a
// transport.Endpoint.
//
// The package-level GetSockOpt helper accepts this interface rather than a
// concrete endpoint type.
type commonEndpoint interface {
	// Readiness implements tcpip.Endpoint.Readiness and
	// transport.Endpoint.Readiness.
	Readiness(mask waiter.EventMask) waiter.EventMask

	// SetSockOpt implements tcpip.Endpoint.SetSockOpt and
	// transport.Endpoint.SetSockOpt.
	SetSockOpt(tcpip.SettableSocketOption) tcpip.Error

	// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt and
	// transport.Endpoint.SetSockOptInt.
	SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error

	// GetSockOpt implements tcpip.Endpoint.GetSockOpt and
	// transport.Endpoint.GetSockOpt.
	GetSockOpt(tcpip.GettableSocketOption) tcpip.Error

	// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt and
	// transport.Endpoint.GetSockOptInt.
	GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error)

	// State returns a socket's lifecycle state. The returned value is
	// protocol-specific and is primarily used for diagnostics.
	State() uint32

	// LastError implements tcpip.Endpoint.LastError and
	// transport.Endpoint.LastError.
	LastError() tcpip.Error

	// SocketOptions implements tcpip.Endpoint.SocketOptions and
	// transport.Endpoint.SocketOptions.
	SocketOptions() *tcpip.SocketOptions
}
   349  
// sock encapsulates all the state needed to represent a network stack
// endpoint in the kernel context.
//
// +stateify savable
type sock struct {
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.DentryMetadataFileDescriptionImpl
	vfs.LockFD
	socket.SendReceiveTimeout
	*waiter.Queue

	// family, skType and protocol are the values the socket was created
	// with (see New). Endpoint is the tcpip.Endpoint backing the socket.
	family   int
	Endpoint tcpip.Endpoint
	skType   linux.SockType
	protocol int

	// namespace is the network namespace of the task that created the
	// socket. New takes a reference on it and Release drops it.
	namespace *inet.Namespace

	// readMu protects access to the below fields.
	readMu sync.Mutex `state:"nosave"`

	// sockOptTimestamp corresponds to SO_TIMESTAMP. When true, timestamps
	// of returned messages can be returned via control messages. When
	// false, the same timestamp is instead stored and can be read via the
	// SIOCGSTAMP ioctl. It is protected by readMu. See socket(7).
	sockOptTimestamp bool
	// timestampValid indicates whether timestamp for SIOCGSTAMP has been
	// set. It is protected by readMu.
	timestampValid bool
	// timestamp holds the timestamp to use with SIOCGSTAMP. It is only
	// valid when timestampValid is true. It is protected by readMu.
	timestamp time.Time `state:".(int64)"`

	// TODO(b/153685824): Move this to SocketOptions.
	// sockOptInq corresponds to TCP_INQ. GetSockOpt and SetSockOpt access
	// it with readMu held.
	sockOptInq bool
}

// Compile-time check that *sock can be used as a socket.Socket.
var _ = socket.Socket(&sock{})
   390  
   391  // New creates a new endpoint socket.
   392  func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*vfs.FileDescription, *syserr.Error) {
   393  	if skType == linux.SOCK_STREAM {
   394  		endpoint.SocketOptions().SetDelayOption(true)
   395  	}
   396  
   397  	mnt := t.Kernel().SocketMount()
   398  	d := sockfs.NewDentry(t, mnt)
   399  	defer d.DecRef(t)
   400  
   401  	namespace := t.NetworkNamespace()
   402  	s := &sock{
   403  		Queue:     queue,
   404  		family:    family,
   405  		Endpoint:  endpoint,
   406  		skType:    skType,
   407  		protocol:  protocol,
   408  		namespace: namespace,
   409  	}
   410  	s.LockFD.Init(&vfs.FileLocks{})
   411  	vfsfd := &s.vfsfd
   412  	if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{
   413  		DenyPRead:         true,
   414  		DenyPWrite:        true,
   415  		UseDentryMetadata: true,
   416  	}); err != nil {
   417  		return nil, syserr.FromError(err)
   418  	}
   419  	namespace.IncRef()
   420  	return vfsfd, nil
   421  }
   422  
   423  // Release implements vfs.FileDescriptionImpl.Release.
   424  func (s *sock) Release(ctx context.Context) {
   425  	kernel.KernelFromContext(ctx).DeleteSocket(&s.vfsfd)
   426  	e, ch := waiter.NewChannelEntry(waiter.EventHUp | waiter.EventErr)
   427  	s.EventRegister(&e)
   428  	defer s.EventUnregister(&e)
   429  
   430  	s.Endpoint.Close()
   431  
   432  	// SO_LINGER option is valid only for TCP. For other socket types
   433  	// return after endpoint close.
   434  	if family, skType, _ := s.Type(); skType == linux.SOCK_STREAM && (family == linux.AF_INET || family == linux.AF_INET6) {
   435  		v := s.Endpoint.SocketOptions().GetLinger()
   436  		// The case for zero timeout is handled in tcp endpoint close function.
   437  		// Close is blocked until either:
   438  		// 1. The endpoint state is not in any of the states: FIN-WAIT1,
   439  		// CLOSING and LAST_ACK.
   440  		// 2. Timeout is reached.
   441  		if v.Enabled && v.Timeout != 0 {
   442  			t := kernel.TaskFromContext(ctx)
   443  			start := t.Kernel().MonotonicClock().Now()
   444  			deadline := start.Add(v.Timeout)
   445  			_ = t.BlockWithDeadline(ch, true, deadline)
   446  		}
   447  	}
   448  	s.namespace.DecRef(ctx)
   449  }
   450  
// Epollable implements vfs.FileDescriptionImpl.Epollable. It always reports
// true for sockets of this type.
func (s *sock) Epollable() bool {
	return true
}
   455  
   456  // Read implements vfs.FileDescriptionImpl.
   457  func (s *sock) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
   458  	// All flags other than RWF_NOWAIT should be ignored.
   459  	// TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT.
   460  	if opts.Flags != 0 {
   461  		return 0, linuxerr.EOPNOTSUPP
   462  	}
   463  
   464  	if dst.NumBytes() == 0 {
   465  		return 0, nil
   466  	}
   467  	n, _, _, _, _, err := s.nonBlockingRead(ctx, dst, false, false, false)
   468  	if err == syserr.ErrWouldBlock {
   469  		return int64(n), linuxerr.ErrWouldBlock
   470  	}
   471  	if err != nil {
   472  		return 0, err.ToError()
   473  	}
   474  	return int64(n), nil
   475  }
   476  
   477  // Write implements vfs.FileDescriptionImpl.
   478  func (s *sock) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
   479  	// All flags other than RWF_NOWAIT should be ignored.
   480  	// TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT.
   481  	if opts.Flags != 0 {
   482  		return 0, linuxerr.EOPNOTSUPP
   483  	}
   484  
   485  	r := src.Reader(ctx)
   486  	n, err := s.Endpoint.Write(r, tcpip.WriteOptions{})
   487  	if _, ok := err.(*tcpip.ErrWouldBlock); ok {
   488  		return 0, linuxerr.ErrWouldBlock
   489  	}
   490  	if err != nil {
   491  		return 0, syserr.TranslateNetstackError(err).ToError()
   492  	}
   493  
   494  	if n < src.NumBytes() {
   495  		return n, linuxerr.ErrWouldBlock
   496  	}
   497  
   498  	return n, nil
   499  }
   500  
   501  // Accept implements the linux syscall accept(2) for sockets backed by
   502  // tcpip.Endpoint.
   503  func (s *sock) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
   504  	// Issue the accept request to get the new endpoint.
   505  	var peerAddr *tcpip.FullAddress
   506  	if peerRequested {
   507  		peerAddr = &tcpip.FullAddress{}
   508  	}
   509  	ep, wq, terr := s.Endpoint.Accept(peerAddr)
   510  	if terr != nil {
   511  		if _, ok := terr.(*tcpip.ErrWouldBlock); !ok || !blocking {
   512  			return 0, nil, 0, syserr.TranslateNetstackError(terr)
   513  		}
   514  
   515  		var err *syserr.Error
   516  		ep, wq, err = s.blockingAccept(t, peerAddr)
   517  		if err != nil {
   518  			return 0, nil, 0, err
   519  		}
   520  	}
   521  
   522  	ns, err := New(t, s.family, s.skType, s.protocol, wq, ep)
   523  	if err != nil {
   524  		return 0, nil, 0, err
   525  	}
   526  	defer ns.DecRef(t)
   527  
   528  	if err := ns.SetStatusFlags(t, t.Credentials(), uint32(flags&linux.SOCK_NONBLOCK)); err != nil {
   529  		return 0, nil, 0, syserr.FromError(err)
   530  	}
   531  
   532  	var addr linux.SockAddr
   533  	var addrLen uint32
   534  	if peerAddr != nil {
   535  		// Get address of the peer and write it to peer slice.
   536  		addr, addrLen = socket.ConvertAddress(s.family, *peerAddr)
   537  	}
   538  
   539  	fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{
   540  		CloseOnExec: flags&linux.SOCK_CLOEXEC != 0,
   541  	})
   542  
   543  	t.Kernel().RecordSocket(ns)
   544  
   545  	return fd, addr, addrLen, syserr.FromError(e)
   546  }
   547  
   548  // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
   549  // tcpip.Endpoint.
   550  func (s *sock) GetSockOpt(t *kernel.Task, level, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
   551  	// TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is
   552  	// implemented specifically for netstack.Socket rather than
   553  	// commonEndpoint. commonEndpoint should be extended to support socket
   554  	// options where the implementation is not shared, as unix sockets need
   555  	// their own support for SO_TIMESTAMP.
   556  	if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP {
   557  		if outLen < sizeOfInt32 {
   558  			return nil, syserr.ErrInvalidArgument
   559  		}
   560  		val := primitive.Int32(0)
   561  		s.readMu.Lock()
   562  		defer s.readMu.Unlock()
   563  		if s.sockOptTimestamp {
   564  			val = 1
   565  		}
   566  		return &val, nil
   567  	}
   568  	if level == linux.SOL_TCP && name == linux.TCP_INQ {
   569  		if outLen < sizeOfInt32 {
   570  			return nil, syserr.ErrInvalidArgument
   571  		}
   572  		val := primitive.Int32(0)
   573  		s.readMu.Lock()
   574  		defer s.readMu.Unlock()
   575  		if s.sockOptInq {
   576  			val = 1
   577  		}
   578  		return &val, nil
   579  	}
   580  
   581  	return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outPtr, outLen)
   582  }
   583  
   584  // SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by
   585  // tcpip.Endpoint.
   586  func (s *sock) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error {
   587  	// TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is
   588  	// implemented specifically for netstack.Socket rather than
   589  	// commonEndpoint. commonEndpoint should be extended to support socket
   590  	// options where the implementation is not shared, as unix sockets need
   591  	// their own support for SO_TIMESTAMP.
   592  	if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP {
   593  		if len(optVal) < sizeOfInt32 {
   594  			return syserr.ErrInvalidArgument
   595  		}
   596  		s.readMu.Lock()
   597  		defer s.readMu.Unlock()
   598  		s.sockOptTimestamp = hostarch.ByteOrder.Uint32(optVal) != 0
   599  		return nil
   600  	}
   601  	if level == linux.SOL_TCP && name == linux.TCP_INQ {
   602  		if len(optVal) < sizeOfInt32 {
   603  			return syserr.ErrInvalidArgument
   604  		}
   605  		s.readMu.Lock()
   606  		defer s.readMu.Unlock()
   607  		s.sockOptInq = hostarch.ByteOrder.Uint32(optVal) != 0
   608  		return nil
   609  	}
   610  
   611  	return SetSockOpt(t, s, s.Endpoint, level, name, optVal)
   612  }
   613  
// sockAddrInetSize, sockAddrInet6Size and sockAddrLinkSize cache the sizes
// reported by SizeBytes for linux.SockAddrInet, linux.SockAddrInet6 and
// linux.SockAddrLink. They are used as minimum sockaddr lengths below.
var sockAddrInetSize = (*linux.SockAddrInet)(nil).SizeBytes()
var sockAddrInet6Size = (*linux.SockAddrInet6)(nil).SizeBytes()
var sockAddrLinkSize = (*linux.SockAddrLink)(nil).SizeBytes()
   617  
   618  // minSockAddrLen returns the minimum length in bytes of a socket address for
   619  // the socket's family.
   620  func (s *sock) minSockAddrLen() int {
   621  	const addressFamilySize = 2
   622  
   623  	switch s.family {
   624  	case linux.AF_UNIX:
   625  		return addressFamilySize
   626  	case linux.AF_INET:
   627  		return sockAddrInetSize
   628  	case linux.AF_INET6:
   629  		return sockAddrInet6Size
   630  	case linux.AF_PACKET:
   631  		return sockAddrLinkSize
   632  	case linux.AF_UNSPEC:
   633  		return addressFamilySize
   634  	default:
   635  		panic(fmt.Sprintf("s.family unrecognized = %d", s.family))
   636  	}
   637  }
   638  
   639  func (s *sock) isPacketBased() bool {
   640  	return s.skType == linux.SOCK_DGRAM || s.skType == linux.SOCK_SEQPACKET || s.skType == linux.SOCK_RDM || s.skType == linux.SOCK_RAW
   641  }
   642  
// Readiness returns a mask of ready events for socket s. The query is
// delegated unchanged to the underlying endpoint.
func (s *sock) Readiness(mask waiter.EventMask) waiter.EventMask {
	return s.Endpoint.Readiness(mask)
}
   647  
   648  // checkFamily returns true iff the specified address family may be used with
   649  // the socket.
   650  //
   651  // If exact is true, then the specified address family must be an exact match
   652  // with the socket's family.
   653  func (s *sock) checkFamily(family uint16, exact bool) bool {
   654  	if family == uint16(s.family) {
   655  		return true
   656  	}
   657  	if !exact && family == linux.AF_INET && s.family == linux.AF_INET6 {
   658  		if !s.Endpoint.SocketOptions().GetV6Only() {
   659  			return true
   660  		}
   661  	}
   662  	return false
   663  }
   664  
   665  // mapFamily maps the AF_INET ANY address to the IPv4-mapped IPv6 ANY if the
   666  // receiver's family is AF_INET6.
   667  //
   668  // This is a hack to work around the fact that both IPv4 and IPv6 ANY are
   669  // represented by the empty string.
   670  //
   671  // TODO(gvisor.dev/issue/1556): remove this function.
   672  func (s *sock) mapFamily(addr tcpip.FullAddress, family uint16) tcpip.FullAddress {
   673  	if addr.Addr.BitLen() == 0 && s.family == linux.AF_INET6 && family == linux.AF_INET {
   674  		addr.Addr = tcpip.AddrFrom16([16]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00})
   675  	}
   676  	return addr
   677  }
   678  
   679  // Connect implements the linux syscall connect(2) for sockets backed by
   680  // tpcip.Endpoint.
   681  func (s *sock) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
   682  	addr, family, err := socket.AddressAndFamily(sockaddr)
   683  	if err != nil {
   684  		return err
   685  	}
   686  
   687  	if family == linux.AF_UNSPEC {
   688  		err := s.Endpoint.Disconnect()
   689  		if _, ok := err.(*tcpip.ErrNotSupported); ok {
   690  			return syserr.ErrAddressFamilyNotSupported
   691  		}
   692  		return syserr.TranslateNetstackError(err)
   693  	}
   694  
   695  	if !s.checkFamily(family, false /* exact */) {
   696  		return syserr.ErrInvalidArgument
   697  	}
   698  	addr = s.mapFamily(addr, family)
   699  
   700  	// Always return right away in the non-blocking case.
   701  	if !blocking {
   702  		return syserr.TranslateNetstackError(s.Endpoint.Connect(addr))
   703  	}
   704  
   705  	// Register for notification when the endpoint becomes writable, then
   706  	// initiate the connection.
   707  	e, ch := waiter.NewChannelEntry(waiter.WritableEvents)
   708  	s.EventRegister(&e)
   709  	defer s.EventUnregister(&e)
   710  
   711  	switch err := s.Endpoint.Connect(addr); err.(type) {
   712  	case *tcpip.ErrConnectStarted, *tcpip.ErrAlreadyConnecting:
   713  	case *tcpip.ErrNoPortAvailable:
   714  		if (s.family == unix.AF_INET || s.family == unix.AF_INET6) && s.skType == linux.SOCK_STREAM {
   715  			// TCP unlike UDP returns EADDRNOTAVAIL when it can't
   716  			// find an available local ephemeral port.
   717  			return syserr.ErrAddressNotAvailable
   718  		}
   719  		return syserr.TranslateNetstackError(err)
   720  	default:
   721  		return syserr.TranslateNetstackError(err)
   722  	}
   723  
   724  	// It's pending, so we have to wait for a notification, and fetch the
   725  	// result once the wait completes.
   726  	if err := t.Block(ch); err != nil {
   727  		return syserr.FromError(err)
   728  	}
   729  
   730  	// Call Connect() again after blocking to find connect's result.
   731  	return syserr.TranslateNetstackError(s.Endpoint.Connect(addr))
   732  }
   733  
   734  // Bind implements the linux syscall bind(2) for sockets backed by
   735  // tcpip.Endpoint.
   736  func (s *sock) Bind(_ *kernel.Task, sockaddr []byte) *syserr.Error {
   737  	if len(sockaddr) < 2 {
   738  		return syserr.ErrInvalidArgument
   739  	}
   740  
   741  	family := hostarch.ByteOrder.Uint16(sockaddr)
   742  	var addr tcpip.FullAddress
   743  
   744  	// Bind for AF_PACKET requires only family, protocol and ifindex.
   745  	// In function AddressAndFamily, we check the address length which is
   746  	// not needed for AF_PACKET bind.
   747  	if family == linux.AF_PACKET {
   748  		var a linux.SockAddrLink
   749  		if len(sockaddr) < sockAddrLinkSize {
   750  			return syserr.ErrInvalidArgument
   751  		}
   752  		a.UnmarshalBytes(sockaddr)
   753  
   754  		addr = tcpip.FullAddress{
   755  			NIC: tcpip.NICID(a.InterfaceIndex),
   756  			Addr: tcpip.AddrFrom16Slice(append(
   757  				a.HardwareAddr[:header.EthernetAddressSize],
   758  				[]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}...,
   759  			)),
   760  			Port: socket.Ntohs(a.Protocol),
   761  		}
   762  	} else {
   763  		if s.minSockAddrLen() > len(sockaddr) {
   764  			return syserr.ErrInvalidArgument
   765  		}
   766  
   767  		var err *syserr.Error
   768  		addr, family, err = socket.AddressAndFamily(sockaddr)
   769  		if err != nil {
   770  			return err
   771  		}
   772  
   773  		if !s.checkFamily(family, true /* exact */) {
   774  			return syserr.ErrAddressFamilyNotSupported
   775  		}
   776  
   777  		addr = s.mapFamily(addr, family)
   778  	}
   779  
   780  	// Issue the bind request to the endpoint.
   781  	err := s.Endpoint.Bind(addr)
   782  	if _, ok := err.(*tcpip.ErrNoPortAvailable); ok {
   783  		// Bind always returns EADDRINUSE irrespective of if the specified port was
   784  		// already bound or if an ephemeral port was requested but none were
   785  		// available.
   786  		//
   787  		// *tcpip.ErrNoPortAvailable is mapped to EAGAIN in syserr package because
   788  		// UDP connect returns EAGAIN on ephemeral port exhaustion.
   789  		//
   790  		// TCP connect returns EADDRNOTAVAIL on ephemeral port exhaustion.
   791  		err = &tcpip.ErrPortInUse{}
   792  	}
   793  
   794  	return syserr.TranslateNetstackError(err)
   795  }
   796  
   797  // Listen implements the linux syscall listen(2) for sockets backed by
   798  // tcpip.Endpoint.
   799  func (s *sock) Listen(_ *kernel.Task, backlog int) *syserr.Error {
   800  	if err := s.Endpoint.Listen(backlog); err != nil {
   801  		return syserr.TranslateNetstackError(err)
   802  	}
   803  	if !socket.IsTCP(s) {
   804  		return nil
   805  	}
   806  
   807  	// Emit SentryTCPListenEvent with the bound port for tcp sockets.
   808  	addr, err := s.Endpoint.GetLocalAddress()
   809  	if err != nil {
   810  		panic(fmt.Sprintf("GetLocalAddress failed for tcp socket: %s", err))
   811  	}
   812  	eventchannel.Emit(&epb.SentryTcpListenEvent{
   813  		Port: proto.Int32(int32(addr.Port)),
   814  	})
   815  	return nil
   816  }
   817  
   818  // blockingAccept implements a blocking version of accept(2), that is, if no
   819  // connections are ready to be accept, it will block until one becomes ready.
   820  func (s *sock) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) {
   821  	// Register for notifications.
   822  	e, ch := waiter.NewChannelEntry(waiter.ReadableEvents)
   823  	s.EventRegister(&e)
   824  	defer s.EventUnregister(&e)
   825  
   826  	// Try to accept the connection again; if it fails, then wait until we
   827  	// get a notification.
   828  	for {
   829  		ep, wq, err := s.Endpoint.Accept(peerAddr)
   830  		if _, ok := err.(*tcpip.ErrWouldBlock); !ok {
   831  			return ep, wq, syserr.TranslateNetstackError(err)
   832  		}
   833  
   834  		if err := t.Block(ch); err != nil {
   835  			return nil, nil, syserr.FromError(err)
   836  		}
   837  	}
   838  }
   839  
   840  // ConvertShutdown converts Linux shutdown flags into tcpip shutdown flags.
   841  func ConvertShutdown(how int) (tcpip.ShutdownFlags, *syserr.Error) {
   842  	var f tcpip.ShutdownFlags
   843  	switch how {
   844  	case linux.SHUT_RD:
   845  		f = tcpip.ShutdownRead
   846  	case linux.SHUT_WR:
   847  		f = tcpip.ShutdownWrite
   848  	case linux.SHUT_RDWR:
   849  		f = tcpip.ShutdownRead | tcpip.ShutdownWrite
   850  	default:
   851  		return 0, syserr.ErrInvalidArgument
   852  	}
   853  	return f, nil
   854  }
   855  
   856  // Shutdown implements the linux syscall shutdown(2) for sockets backed by
   857  // tcpip.Endpoint.
   858  func (s *sock) Shutdown(_ *kernel.Task, how int) *syserr.Error {
   859  	f, err := ConvertShutdown(how)
   860  	if err != nil {
   861  		return err
   862  	}
   863  
   864  	// Issue shutdown request.
   865  	return syserr.TranslateNetstackError(s.Endpoint.Shutdown(f))
   866  }
   867  
   868  // GetSockOpt can be used to implement the linux syscall getsockopt(2) for
   869  // sockets backed by a commonEndpoint.
   870  func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType linux.SockType, level, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
   871  	switch level {
   872  	case linux.SOL_SOCKET:
   873  		return getSockOptSocket(t, s, ep, family, skType, name, outLen)
   874  
   875  	case linux.SOL_TCP:
   876  		return getSockOptTCP(t, s, ep, name, outLen)
   877  
   878  	case linux.SOL_IPV6:
   879  		return getSockOptIPv6(t, s, ep, name, outPtr, outLen)
   880  
   881  	case linux.SOL_IP:
   882  		return getSockOptIP(t, s, ep, name, outPtr, outLen, family)
   883  
   884  	case linux.SOL_ICMPV6:
   885  		return getSockOptICMPv6(t, s, ep, name, outLen)
   886  
   887  	case linux.SOL_UDP,
   888  		linux.SOL_RAW,
   889  		linux.SOL_PACKET:
   890  		// Not supported.
   891  	}
   892  
   893  	return nil, syserr.ErrProtocolNotAvailable
   894  }
   895  
   896  func boolToInt32(v bool) int32 {
   897  	if v {
   898  		return 1
   899  	}
   900  	return 0
   901  }
   902  
// getSockOptSocket implements GetSockOpt when level is SOL_SOCKET.
//
// Each supported option first checks that outLen is large enough for the
// value being returned (SO_BINDTODEVICE checks only when a device is bound)
// and reports ErrInvalidArgument otherwise. Option names that are not
// handled here yield ErrProtocolNotAvailable. The socket type parameter is
// unused.
func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, _ linux.SockType, name, outLen int) (marshal.Marshallable, *syserr.Error) {
	// TODO(b/124056281): Stop rejecting short optLen values in getsockopt.
	switch name {
	case linux.SO_ERROR:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		// Get the last error and convert it.
		err := ep.SocketOptions().GetLastError()
		if err == nil {
			optP := primitive.Int32(0)
			return &optP, nil
		}

		optP := primitive.Int32(syserr.TranslateNetstackError(err).ToLinux())
		return &optP, nil

	case linux.SO_PEERCRED:
		// Only AF_UNIX sockets are accepted for this option.
		if family != linux.AF_UNIX || outLen < unix.SizeofUcred {
			return nil, syserr.ErrInvalidArgument
		}

		// The credentials reported are those of the calling task t.
		tcred := t.Credentials()
		creds := linux.ControlMessageCredentials{
			PID: int32(t.ThreadGroup().ID()),
			UID: uint32(tcred.EffectiveKUID.In(tcred.UserNamespace).OrOverflow()),
			GID: uint32(tcred.EffectiveKGID.In(tcred.UserNamespace).OrOverflow()),
		}
		return &creds, nil

	case linux.SO_PASSCRED:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetPassCred()))
		return &v, nil

	case linux.SO_SNDBUF:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		size := ep.SocketOptions().GetSendBufferSize()

		// Clamp so the value fits in the 32-bit result.
		if size > math.MaxInt32 {
			size = math.MaxInt32
		}

		sizeP := primitive.Int32(size)
		return &sizeP, nil

	case linux.SO_RCVBUF:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		size := ep.SocketOptions().GetReceiveBufferSize()

		// Clamp so the value fits in the 32-bit result.
		if size > math.MaxInt32 {
			size = math.MaxInt32
		}

		sizeP := primitive.Int32(size)
		return &sizeP, nil

	case linux.SO_REUSEADDR:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReuseAddress()))
		return &v, nil

	case linux.SO_REUSEPORT:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReusePort()))
		return &v, nil

	case linux.SO_BINDTODEVICE:
		v := ep.SocketOptions().GetBindToDevice()
		// A zero value is reported as an empty result, regardless of outLen.
		if v == 0 {
			var b primitive.ByteSlice
			return &b, nil
		}
		if outLen < linux.IFNAMSIZ {
			return nil, syserr.ErrInvalidArgument
		}
		// NOTE: within this case s shadows the socket parameter and refers
		// to the task's network context instead.
		s := t.NetworkContext()
		if s == nil {
			return nil, syserr.ErrNoDevice
		}
		nic, ok := s.Interfaces()[int32(v)]
		if !ok {
			// The NICID no longer indicates a valid interface, probably because that
			// interface was removed.
			return nil, syserr.ErrUnknownDevice
		}

		// The interface name is returned NUL-terminated. NOTE: this name
		// shadows the option-name parameter for the rest of the case.
		name := primitive.ByteSlice(append([]byte(nic.Name), 0))
		return &name, nil

	case linux.SO_BROADCAST:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetBroadcast()))
		return &v, nil

	case linux.SO_KEEPALIVE:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetKeepAlive()))
		return &v, nil

	case linux.SO_LINGER:
		if outLen < linux.SizeOfLinger {
			return nil, syserr.ErrInvalidArgument
		}

		var linger linux.Linger
		v := ep.SocketOptions().GetLinger()

		if v.Enabled {
			linger.OnOff = 1
		}
		// The timeout is reported in whole seconds; the conversion to int32
		// discards any fractional part.
		linger.Linger = int32(v.Timeout.Seconds())
		return &linger, nil

	case linux.SO_SNDTIMEO:
		// TODO(igudger): Linux allows shorter lengths for partial results.
		if outLen < linux.SizeOfTimeval {
			return nil, syserr.ErrInvalidArgument
		}

		sendTimeout := linux.NsecToTimeval(s.SendTimeout())
		return &sendTimeout, nil

	case linux.SO_RCVTIMEO:
		// TODO(igudger): Linux allows shorter lengths for partial results.
		if outLen < linux.SizeOfTimeval {
			return nil, syserr.ErrInvalidArgument
		}

		recvTimeout := linux.NsecToTimeval(s.RecvTimeout())
		return &recvTimeout, nil

	case linux.SO_OOBINLINE:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetOutOfBandInline()))
		return &v, nil

	case linux.SO_NO_CHECK:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetNoChecksum()))
		return &v, nil

	case linux.SO_ACCEPTCONN:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		// This option is only viable for TCP endpoints. Non-TCP sockets
		// always report 0.
		var v bool
		if socket.IsTCP(s) {
			v = tcp.EndpointState(ep.State()) == tcp.StateListen
		}
		vP := primitive.Int32(boolToInt32(v))
		return &vP, nil

	case linux.SO_RCVLOWAT:
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		v := primitive.Int32(ep.SocketOptions().GetRcvlowat())
		return &v, nil
	}
	// Option name not handled at the SOL_SOCKET level.
	return nil, syserr.ErrProtocolNotAvailable
}
  1097  
  1098  // getSockOptTCP implements GetSockOpt when level is SOL_TCP.
  1099  func getSockOptTCP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name, outLen int) (marshal.Marshallable, *syserr.Error) {
  1100  	if !socket.IsTCP(s) {
  1101  		return nil, syserr.ErrUnknownProtocolOption
  1102  	}
  1103  
  1104  	switch name {
  1105  	case linux.TCP_NODELAY:
  1106  		if outLen < sizeOfInt32 {
  1107  			return nil, syserr.ErrInvalidArgument
  1108  		}
  1109  
  1110  		v := primitive.Int32(boolToInt32(!ep.SocketOptions().GetDelayOption()))
  1111  		return &v, nil
  1112  
  1113  	case linux.TCP_CORK:
  1114  		if outLen < sizeOfInt32 {
  1115  			return nil, syserr.ErrInvalidArgument
  1116  		}
  1117  
  1118  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetCorkOption()))
  1119  		return &v, nil
  1120  
  1121  	case linux.TCP_QUICKACK:
  1122  		if outLen < sizeOfInt32 {
  1123  			return nil, syserr.ErrInvalidArgument
  1124  		}
  1125  
  1126  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetQuickAck()))
  1127  		return &v, nil
  1128  
  1129  	case linux.TCP_MAXSEG:
  1130  		if outLen < sizeOfInt32 {
  1131  			return nil, syserr.ErrInvalidArgument
  1132  		}
  1133  
  1134  		v, err := ep.GetSockOptInt(tcpip.MaxSegOption)
  1135  		if err != nil {
  1136  			return nil, syserr.TranslateNetstackError(err)
  1137  		}
  1138  		vP := primitive.Int32(v)
  1139  		return &vP, nil
  1140  
  1141  	case linux.TCP_KEEPIDLE:
  1142  		if outLen < sizeOfInt32 {
  1143  			return nil, syserr.ErrInvalidArgument
  1144  		}
  1145  
  1146  		var v tcpip.KeepaliveIdleOption
  1147  		if err := ep.GetSockOpt(&v); err != nil {
  1148  			return nil, syserr.TranslateNetstackError(err)
  1149  		}
  1150  		keepAliveIdle := primitive.Int32(time.Duration(v) / time.Second)
  1151  		return &keepAliveIdle, nil
  1152  
  1153  	case linux.TCP_KEEPINTVL:
  1154  		if outLen < sizeOfInt32 {
  1155  			return nil, syserr.ErrInvalidArgument
  1156  		}
  1157  
  1158  		var v tcpip.KeepaliveIntervalOption
  1159  		if err := ep.GetSockOpt(&v); err != nil {
  1160  			return nil, syserr.TranslateNetstackError(err)
  1161  		}
  1162  		keepAliveInterval := primitive.Int32(time.Duration(v) / time.Second)
  1163  		return &keepAliveInterval, nil
  1164  
  1165  	case linux.TCP_KEEPCNT:
  1166  		if outLen < sizeOfInt32 {
  1167  			return nil, syserr.ErrInvalidArgument
  1168  		}
  1169  
  1170  		v, err := ep.GetSockOptInt(tcpip.KeepaliveCountOption)
  1171  		if err != nil {
  1172  			return nil, syserr.TranslateNetstackError(err)
  1173  		}
  1174  		vP := primitive.Int32(v)
  1175  		return &vP, nil
  1176  
  1177  	case linux.TCP_USER_TIMEOUT:
  1178  		if outLen < sizeOfInt32 {
  1179  			return nil, syserr.ErrInvalidArgument
  1180  		}
  1181  
  1182  		var v tcpip.TCPUserTimeoutOption
  1183  		if err := ep.GetSockOpt(&v); err != nil {
  1184  			return nil, syserr.TranslateNetstackError(err)
  1185  		}
  1186  		tcpUserTimeout := primitive.Int32(time.Duration(v) / time.Millisecond)
  1187  		return &tcpUserTimeout, nil
  1188  
  1189  	case linux.TCP_INFO:
  1190  		var v tcpip.TCPInfoOption
  1191  		if err := ep.GetSockOpt(&v); err != nil {
  1192  			return nil, syserr.TranslateNetstackError(err)
  1193  		}
  1194  
  1195  		// TODO(b/64800844): Translate fields once they are added to
  1196  		// tcpip.TCPInfoOption.
  1197  		info := linux.TCPInfo{
  1198  			State:       uint8(v.State),
  1199  			RTO:         uint32(v.RTO / time.Microsecond),
  1200  			RTT:         uint32(v.RTT / time.Microsecond),
  1201  			RTTVar:      uint32(v.RTTVar / time.Microsecond),
  1202  			SndSsthresh: v.SndSsthresh,
  1203  			SndCwnd:     v.SndCwnd,
  1204  		}
  1205  		switch v.CcState {
  1206  		case tcpip.RTORecovery:
  1207  			info.CaState = linux.TCP_CA_Loss
  1208  		case tcpip.FastRecovery, tcpip.SACKRecovery:
  1209  			info.CaState = linux.TCP_CA_Recovery
  1210  		case tcpip.Disorder:
  1211  			info.CaState = linux.TCP_CA_Disorder
  1212  		case tcpip.Open:
  1213  			info.CaState = linux.TCP_CA_Open
  1214  		}
  1215  
  1216  		// In netstack reorderSeen is updated only when RACK is enabled.
  1217  		// We only track whether the reordering is seen, which is
  1218  		// different than Linux where reorderSeen is not specific to
  1219  		// RACK and is incremented when a reordering event is seen.
  1220  		if v.ReorderSeen {
  1221  			info.ReordSeen = 1
  1222  		}
  1223  
  1224  		// Linux truncates the output binary to outLen.
  1225  		buf := t.CopyScratchBuffer(info.SizeBytes())
  1226  		info.MarshalUnsafe(buf)
  1227  		if len(buf) > outLen {
  1228  			buf = buf[:outLen]
  1229  		}
  1230  		bufP := primitive.ByteSlice(buf)
  1231  		return &bufP, nil
  1232  
  1233  	case linux.TCP_CC_INFO,
  1234  		linux.TCP_NOTSENT_LOWAT,
  1235  		linux.TCP_ZEROCOPY_RECEIVE:
  1236  
  1237  		// Not supported.
  1238  
  1239  	case linux.TCP_CONGESTION:
  1240  		if outLen <= 0 {
  1241  			return nil, syserr.ErrInvalidArgument
  1242  		}
  1243  
  1244  		var v tcpip.CongestionControlOption
  1245  		if err := ep.GetSockOpt(&v); err != nil {
  1246  			return nil, syserr.TranslateNetstackError(err)
  1247  		}
  1248  
  1249  		// We match linux behaviour here where it returns the lower of
  1250  		// TCP_CA_NAME_MAX bytes or the value of the option length.
  1251  		//
  1252  		// This is Linux's net/tcp.h TCP_CA_NAME_MAX.
  1253  		const tcpCANameMax = 16
  1254  
  1255  		toCopy := tcpCANameMax
  1256  		if outLen < tcpCANameMax {
  1257  			toCopy = outLen
  1258  		}
  1259  		b := make([]byte, toCopy)
  1260  		copy(b, v)
  1261  
  1262  		bP := primitive.ByteSlice(b)
  1263  		return &bP, nil
  1264  
  1265  	case linux.TCP_LINGER2:
  1266  		if outLen < sizeOfInt32 {
  1267  			return nil, syserr.ErrInvalidArgument
  1268  		}
  1269  
  1270  		var v tcpip.TCPLingerTimeoutOption
  1271  		if err := ep.GetSockOpt(&v); err != nil {
  1272  			return nil, syserr.TranslateNetstackError(err)
  1273  		}
  1274  		var lingerTimeout primitive.Int32
  1275  		if v >= 0 {
  1276  			lingerTimeout = primitive.Int32(time.Duration(v) / time.Second)
  1277  		} else {
  1278  			lingerTimeout = -1
  1279  		}
  1280  		return &lingerTimeout, nil
  1281  
  1282  	case linux.TCP_DEFER_ACCEPT:
  1283  		if outLen < sizeOfInt32 {
  1284  			return nil, syserr.ErrInvalidArgument
  1285  		}
  1286  
  1287  		var v tcpip.TCPDeferAcceptOption
  1288  		if err := ep.GetSockOpt(&v); err != nil {
  1289  			return nil, syserr.TranslateNetstackError(err)
  1290  		}
  1291  
  1292  		tcpDeferAccept := primitive.Int32(time.Duration(v) / time.Second)
  1293  		return &tcpDeferAccept, nil
  1294  
  1295  	case linux.TCP_SYNCNT:
  1296  		if outLen < sizeOfInt32 {
  1297  			return nil, syserr.ErrInvalidArgument
  1298  		}
  1299  
  1300  		v, err := ep.GetSockOptInt(tcpip.TCPSynCountOption)
  1301  		if err != nil {
  1302  			return nil, syserr.TranslateNetstackError(err)
  1303  		}
  1304  		vP := primitive.Int32(v)
  1305  		return &vP, nil
  1306  
  1307  	case linux.TCP_WINDOW_CLAMP:
  1308  		if outLen < sizeOfInt32 {
  1309  			return nil, syserr.ErrInvalidArgument
  1310  		}
  1311  
  1312  		v, err := ep.GetSockOptInt(tcpip.TCPWindowClampOption)
  1313  		if err != nil {
  1314  			return nil, syserr.TranslateNetstackError(err)
  1315  		}
  1316  		vP := primitive.Int32(v)
  1317  		return &vP, nil
  1318  	}
  1319  	return nil, syserr.ErrProtocolNotAvailable
  1320  }
  1321  
  1322  func getSockOptICMPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, outLen int) (marshal.Marshallable, *syserr.Error) {
  1323  	if _, ok := ep.(tcpip.Endpoint); !ok {
  1324  		log.Warningf("SOL_ICMPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
  1325  		return nil, syserr.ErrUnknownProtocolOption
  1326  	}
  1327  
  1328  	if family, _, _ := s.Type(); family != linux.AF_INET6 {
  1329  		return nil, syserr.ErrNotSupported
  1330  	}
  1331  
  1332  	switch name {
  1333  	case linux.ICMPV6_FILTER:
  1334  		var v tcpip.ICMPv6Filter
  1335  		if err := ep.GetSockOpt(&v); err != nil {
  1336  			return nil, syserr.TranslateNetstackError(err)
  1337  		}
  1338  
  1339  		filter := linux.ICMP6Filter{Filter: v.DenyType}
  1340  
  1341  		// Linux truncates the output to outLen.
  1342  		buf := t.CopyScratchBuffer(filter.SizeBytes())
  1343  		filter.MarshalUnsafe(buf)
  1344  		if len(buf) > outLen {
  1345  			buf = buf[:outLen]
  1346  		}
  1347  		bufP := primitive.ByteSlice(buf)
  1348  		return &bufP, nil
  1349  	}
  1350  	return nil, syserr.ErrProtocolNotAvailable
  1351  }
  1352  
  1353  func defaultTTL(t *kernel.Task, network tcpip.NetworkProtocolNumber) (primitive.Int32, tcpip.Error) {
  1354  	var opt tcpip.DefaultTTLOption
  1355  	stack := inet.StackFromContext(t)
  1356  	if err := stack.(*Stack).Stack.NetworkProtocolOption(network, &opt); err != nil {
  1357  		return 0, err
  1358  	}
  1359  	return primitive.Int32(opt), nil
  1360  }
  1361  
  1362  // getSockOptIPv6 implements GetSockOpt when level is SOL_IPV6.
  1363  func getSockOptIPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
  1364  	if _, ok := ep.(tcpip.Endpoint); !ok {
  1365  		log.Warningf("SOL_IPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
  1366  		return nil, syserr.ErrUnknownProtocolOption
  1367  	}
  1368  
  1369  	family, skType, _ := s.Type()
  1370  	if family != linux.AF_INET6 {
  1371  		return nil, syserr.ErrNotSupported
  1372  	}
  1373  
  1374  	switch name {
  1375  	case linux.IPV6_CHECKSUM:
  1376  		if outLen < sizeOfInt32 {
  1377  			return nil, syserr.ErrInvalidArgument
  1378  		}
  1379  
  1380  		v, err := ep.GetSockOptInt(tcpip.IPv6Checksum)
  1381  		if err != nil {
  1382  			return nil, syserr.TranslateNetstackError(err)
  1383  		}
  1384  
  1385  		vP := primitive.Int32(v)
  1386  		return &vP, nil
  1387  
  1388  	case linux.IPV6_V6ONLY:
  1389  		if outLen < sizeOfInt32 {
  1390  			return nil, syserr.ErrInvalidArgument
  1391  		}
  1392  
  1393  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetV6Only()))
  1394  		return &v, nil
  1395  
  1396  	case linux.IPV6_UNICAST_HOPS:
  1397  		if outLen < sizeOfInt32 {
  1398  			return nil, syserr.ErrInvalidArgument
  1399  		}
  1400  
  1401  		v, err := ep.GetSockOptInt(tcpip.IPv6HopLimitOption)
  1402  		if err != nil {
  1403  			return nil, syserr.TranslateNetstackError(err)
  1404  		}
  1405  
  1406  		// Fill in the default value, if needed.
  1407  		vP := primitive.Int32(v)
  1408  		if vP == -1 {
  1409  			vP, err = defaultTTL(t, header.IPv6ProtocolNumber)
  1410  			if err != nil {
  1411  				return nil, syserr.TranslateNetstackError(err)
  1412  			}
  1413  		}
  1414  
  1415  		return &vP, nil
  1416  
  1417  	case linux.IPV6_RECVHOPLIMIT:
  1418  		if outLen < sizeOfInt32 {
  1419  			return nil, syserr.ErrInvalidArgument
  1420  		}
  1421  
  1422  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveHopLimit()))
  1423  		return &v, nil
  1424  
  1425  	case linux.IPV6_PATHMTU:
  1426  		// Not supported.
  1427  
  1428  	case linux.IPV6_TCLASS:
  1429  		// Length handling for parity with Linux.
  1430  		if outLen == 0 {
  1431  			var b primitive.ByteSlice
  1432  			return &b, nil
  1433  		}
  1434  		v, err := ep.GetSockOptInt(tcpip.IPv6TrafficClassOption)
  1435  		if err != nil {
  1436  			return nil, syserr.TranslateNetstackError(err)
  1437  		}
  1438  
  1439  		uintv := primitive.Uint32(v)
  1440  		// Linux truncates the output binary to outLen.
  1441  		ib := t.CopyScratchBuffer(uintv.SizeBytes())
  1442  		uintv.MarshalUnsafe(ib)
  1443  		// Handle cases where outLen is lesser than sizeOfInt32.
  1444  		if len(ib) > outLen {
  1445  			ib = ib[:outLen]
  1446  		}
  1447  		ibP := primitive.ByteSlice(ib)
  1448  		return &ibP, nil
  1449  
  1450  	case linux.IPV6_RECVTCLASS:
  1451  		if outLen < sizeOfInt32 {
  1452  			return nil, syserr.ErrInvalidArgument
  1453  		}
  1454  
  1455  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTClass()))
  1456  		return &v, nil
  1457  	case linux.IPV6_RECVERR:
  1458  		if outLen < sizeOfInt32 {
  1459  			return nil, syserr.ErrInvalidArgument
  1460  		}
  1461  
  1462  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetIPv6RecvError()))
  1463  		return &v, nil
  1464  
  1465  	case linux.IPV6_RECVORIGDSTADDR:
  1466  		if outLen < sizeOfInt32 {
  1467  			return nil, syserr.ErrInvalidArgument
  1468  		}
  1469  
  1470  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveOriginalDstAddress()))
  1471  		return &v, nil
  1472  
  1473  	case linux.IPV6_RECVPKTINFO:
  1474  		if outLen < sizeOfInt32 {
  1475  			return nil, syserr.ErrInvalidArgument
  1476  		}
  1477  
  1478  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetIPv6ReceivePacketInfo()))
  1479  		return &v, nil
  1480  
  1481  	case linux.IP6T_ORIGINAL_DST:
  1482  		if outLen < sockAddrInet6Size {
  1483  			return nil, syserr.ErrInvalidArgument
  1484  		}
  1485  
  1486  		var v tcpip.OriginalDestinationOption
  1487  		if err := ep.GetSockOpt(&v); err != nil {
  1488  			return nil, syserr.TranslateNetstackError(err)
  1489  		}
  1490  
  1491  		a, _ := socket.ConvertAddress(linux.AF_INET6, tcpip.FullAddress(v))
  1492  		return a.(*linux.SockAddrInet6), nil
  1493  
  1494  	case linux.IP6T_SO_GET_INFO:
  1495  		if outLen < linux.SizeOfIPTGetinfo {
  1496  			return nil, syserr.ErrInvalidArgument
  1497  		}
  1498  
  1499  		// Only valid for raw IPv6 sockets.
  1500  		if skType != linux.SOCK_RAW {
  1501  			return nil, syserr.ErrProtocolNotAvailable
  1502  		}
  1503  
  1504  		stk := inet.StackFromContext(t)
  1505  		if stk == nil {
  1506  			return nil, syserr.ErrNoDevice
  1507  		}
  1508  		info, err := netfilter.GetInfo(t, stk.(*Stack).Stack, outPtr, true)
  1509  		if err != nil {
  1510  			return nil, err
  1511  		}
  1512  		return &info, nil
  1513  
  1514  	case linux.IP6T_SO_GET_ENTRIES:
  1515  		// IPTGetEntries is reused for IPv6.
  1516  		if outLen < linux.SizeOfIPTGetEntries {
  1517  			return nil, syserr.ErrInvalidArgument
  1518  		}
  1519  		// Only valid for raw IPv6 sockets.
  1520  		if skType != linux.SOCK_RAW {
  1521  			return nil, syserr.ErrProtocolNotAvailable
  1522  		}
  1523  
  1524  		stk := inet.StackFromContext(t)
  1525  		if stk == nil {
  1526  			return nil, syserr.ErrNoDevice
  1527  		}
  1528  		entries, err := netfilter.GetEntries6(t, stk.(*Stack).Stack, outPtr, outLen)
  1529  		if err != nil {
  1530  			return nil, err
  1531  		}
  1532  		return &entries, nil
  1533  
  1534  	case linux.IP6T_SO_GET_REVISION_TARGET:
  1535  		if outLen < linux.SizeOfXTGetRevision {
  1536  			return nil, syserr.ErrInvalidArgument
  1537  		}
  1538  
  1539  		// Only valid for raw IPv6 sockets.
  1540  		if skType != linux.SOCK_RAW {
  1541  			return nil, syserr.ErrProtocolNotAvailable
  1542  		}
  1543  
  1544  		stk := inet.StackFromContext(t)
  1545  		if stk == nil {
  1546  			return nil, syserr.ErrNoDevice
  1547  		}
  1548  		ret, err := netfilter.TargetRevision(t, outPtr, header.IPv6ProtocolNumber)
  1549  		if err != nil {
  1550  			return nil, err
  1551  		}
  1552  		return &ret, nil
  1553  	}
  1554  	return nil, syserr.ErrProtocolNotAvailable
  1555  }
  1556  
  1557  // getSockOptIP implements GetSockOpt when level is SOL_IP.
  1558  func getSockOptIP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, outPtr hostarch.Addr, outLen int, _ int) (marshal.Marshallable, *syserr.Error) {
  1559  	if _, ok := ep.(tcpip.Endpoint); !ok {
  1560  		log.Warningf("SOL_IP options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
  1561  		return nil, syserr.ErrUnknownProtocolOption
  1562  	}
  1563  
  1564  	switch name {
  1565  	case linux.IP_TTL:
  1566  		if outLen < sizeOfInt32 {
  1567  			return nil, syserr.ErrInvalidArgument
  1568  		}
  1569  
  1570  		v, err := ep.GetSockOptInt(tcpip.IPv4TTLOption)
  1571  		if err != nil {
  1572  			return nil, syserr.TranslateNetstackError(err)
  1573  		}
  1574  
  1575  		// Fill in the default value, if needed.
  1576  		vP := primitive.Int32(v)
  1577  		if vP == 0 {
  1578  			vP, err = defaultTTL(t, header.IPv4ProtocolNumber)
  1579  			if err != nil {
  1580  				return nil, syserr.TranslateNetstackError(err)
  1581  			}
  1582  		}
  1583  
  1584  		return &vP, nil
  1585  
  1586  	case linux.IP_RECVTTL:
  1587  		if outLen < sizeOfInt32 {
  1588  			return nil, syserr.ErrInvalidArgument
  1589  		}
  1590  
  1591  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTTL()))
  1592  		return &v, nil
  1593  
  1594  	case linux.IP_MULTICAST_TTL:
  1595  		if outLen < sizeOfInt32 {
  1596  			return nil, syserr.ErrInvalidArgument
  1597  		}
  1598  
  1599  		v, err := ep.GetSockOptInt(tcpip.MulticastTTLOption)
  1600  		if err != nil {
  1601  			return nil, syserr.TranslateNetstackError(err)
  1602  		}
  1603  
  1604  		vP := primitive.Int32(v)
  1605  		return &vP, nil
  1606  
  1607  	case linux.IP_MULTICAST_IF:
  1608  		if outLen < len(linux.InetAddr{}) {
  1609  			return nil, syserr.ErrInvalidArgument
  1610  		}
  1611  
  1612  		var v tcpip.MulticastInterfaceOption
  1613  		if err := ep.GetSockOpt(&v); err != nil {
  1614  			return nil, syserr.TranslateNetstackError(err)
  1615  		}
  1616  
  1617  		a, _ := socket.ConvertAddress(linux.AF_INET, tcpip.FullAddress{Addr: v.InterfaceAddr})
  1618  
  1619  		return &a.(*linux.SockAddrInet).Addr, nil
  1620  
  1621  	case linux.IP_MULTICAST_LOOP:
  1622  		if outLen < sizeOfInt32 {
  1623  			return nil, syserr.ErrInvalidArgument
  1624  		}
  1625  
  1626  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetMulticastLoop()))
  1627  		return &v, nil
  1628  
  1629  	case linux.IP_TOS:
  1630  		// Length handling for parity with Linux.
  1631  		if outLen == 0 {
  1632  			var b primitive.ByteSlice
  1633  			return &b, nil
  1634  		}
  1635  		v, err := ep.GetSockOptInt(tcpip.IPv4TOSOption)
  1636  		if err != nil {
  1637  			return nil, syserr.TranslateNetstackError(err)
  1638  		}
  1639  		if outLen < sizeOfInt32 {
  1640  			vP := primitive.Uint8(v)
  1641  			return &vP, nil
  1642  		}
  1643  		vP := primitive.Int32(v)
  1644  		return &vP, nil
  1645  
  1646  	case linux.IP_RECVTOS:
  1647  		if outLen < sizeOfInt32 {
  1648  			return nil, syserr.ErrInvalidArgument
  1649  		}
  1650  
  1651  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTOS()))
  1652  		return &v, nil
  1653  
  1654  	case linux.IP_RECVERR:
  1655  		if outLen < sizeOfInt32 {
  1656  			return nil, syserr.ErrInvalidArgument
  1657  		}
  1658  
  1659  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetIPv4RecvError()))
  1660  		return &v, nil
  1661  
  1662  	case linux.IP_PKTINFO:
  1663  		if outLen < sizeOfInt32 {
  1664  			return nil, syserr.ErrInvalidArgument
  1665  		}
  1666  
  1667  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceivePacketInfo()))
  1668  		return &v, nil
  1669  
  1670  	case linux.IP_HDRINCL:
  1671  		if outLen < sizeOfInt32 {
  1672  			return nil, syserr.ErrInvalidArgument
  1673  		}
  1674  
  1675  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetHeaderIncluded()))
  1676  		return &v, nil
  1677  
  1678  	case linux.IP_RECVORIGDSTADDR:
  1679  		if outLen < sizeOfInt32 {
  1680  			return nil, syserr.ErrInvalidArgument
  1681  		}
  1682  
  1683  		v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveOriginalDstAddress()))
  1684  		return &v, nil
  1685  
  1686  	case linux.SO_ORIGINAL_DST:
  1687  		if outLen < sockAddrInetSize {
  1688  			return nil, syserr.ErrInvalidArgument
  1689  		}
  1690  
  1691  		var v tcpip.OriginalDestinationOption
  1692  		if err := ep.GetSockOpt(&v); err != nil {
  1693  			return nil, syserr.TranslateNetstackError(err)
  1694  		}
  1695  
  1696  		a, _ := socket.ConvertAddress(linux.AF_INET, tcpip.FullAddress(v))
  1697  		return a.(*linux.SockAddrInet), nil
  1698  
  1699  	case linux.IPT_SO_GET_INFO:
  1700  		if outLen < linux.SizeOfIPTGetinfo {
  1701  			return nil, syserr.ErrInvalidArgument
  1702  		}
  1703  
  1704  		// Only valid for raw IPv4 sockets.
  1705  		if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
  1706  			return nil, syserr.ErrProtocolNotAvailable
  1707  		}
  1708  
  1709  		stk := inet.StackFromContext(t)
  1710  		if stk == nil {
  1711  			return nil, syserr.ErrNoDevice
  1712  		}
  1713  		info, err := netfilter.GetInfo(t, stk.(*Stack).Stack, outPtr, false)
  1714  		if err != nil {
  1715  			return nil, err
  1716  		}
  1717  		return &info, nil
  1718  
  1719  	case linux.IPT_SO_GET_ENTRIES:
  1720  		if outLen < linux.SizeOfIPTGetEntries {
  1721  			return nil, syserr.ErrInvalidArgument
  1722  		}
  1723  
  1724  		// Only valid for raw IPv4 sockets.
  1725  		if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
  1726  			return nil, syserr.ErrProtocolNotAvailable
  1727  		}
  1728  
  1729  		stk := inet.StackFromContext(t)
  1730  		if stk == nil {
  1731  			return nil, syserr.ErrNoDevice
  1732  		}
  1733  		entries, err := netfilter.GetEntries4(t, stk.(*Stack).Stack, outPtr, outLen)
  1734  		if err != nil {
  1735  			return nil, err
  1736  		}
  1737  		return &entries, nil
  1738  
  1739  	case linux.IPT_SO_GET_REVISION_TARGET:
  1740  		if outLen < linux.SizeOfXTGetRevision {
  1741  			return nil, syserr.ErrInvalidArgument
  1742  		}
  1743  
  1744  		// Only valid for raw IPv4 sockets.
  1745  		if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
  1746  			return nil, syserr.ErrProtocolNotAvailable
  1747  		}
  1748  
  1749  		stk := inet.StackFromContext(t)
  1750  		if stk == nil {
  1751  			return nil, syserr.ErrNoDevice
  1752  		}
  1753  		ret, err := netfilter.TargetRevision(t, outPtr, header.IPv4ProtocolNumber)
  1754  		if err != nil {
  1755  			return nil, err
  1756  		}
  1757  		return &ret, nil
  1758  	}
  1759  	return nil, syserr.ErrProtocolNotAvailable
  1760  }
  1761  
  1762  // SetSockOpt can be used to implement the linux syscall setsockopt(2) for
  1763  // sockets backed by a commonEndpoint.
  1764  func SetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, level int, name int, optVal []byte) *syserr.Error {
  1765  	switch level {
  1766  	case linux.SOL_SOCKET:
  1767  		return setSockOptSocket(t, s, ep, name, optVal)
  1768  
  1769  	case linux.SOL_TCP:
  1770  		return setSockOptTCP(t, s, ep, name, optVal)
  1771  
  1772  	case linux.SOL_ICMPV6:
  1773  		return setSockOptICMPv6(t, s, ep, name, optVal)
  1774  
  1775  	case linux.SOL_IPV6:
  1776  		return setSockOptIPv6(t, s, ep, name, optVal)
  1777  
  1778  	case linux.SOL_IP:
  1779  		return setSockOptIP(t, s, ep, name, optVal)
  1780  
  1781  	case linux.SOL_PACKET:
  1782  		// gVisor doesn't support any SOL_PACKET options just return not
  1783  		// supported. Returning nil here will result in tcpdump thinking AF_PACKET
  1784  		// features are supported and proceed to use them and break.
  1785  		return syserr.ErrProtocolNotAvailable
  1786  
  1787  	case linux.SOL_UDP,
  1788  		linux.SOL_RAW:
  1789  		// Not supported.
  1790  	}
  1791  
  1792  	return nil
  1793  }
  1794  
  1795  func clampBufSize(newSz, min, max int64, ignoreMax bool) int64 {
  1796  	// packetOverheadFactor is used to multiply the value provided by the user on
  1797  	// a setsockopt(2) for setting the send/receive buffer sizes sockets.
  1798  	const packetOverheadFactor = 2
  1799  
  1800  	if !ignoreMax && newSz > max {
  1801  		newSz = max
  1802  	}
  1803  
  1804  	if newSz < math.MaxInt32/packetOverheadFactor {
  1805  		newSz *= packetOverheadFactor
  1806  		if newSz < min {
  1807  			newSz = min
  1808  		}
  1809  	} else {
  1810  		newSz = math.MaxInt32
  1811  	}
  1812  	return newSz
  1813  }
  1814  
  1815  // setSockOptSocket implements SetSockOpt when level is SOL_SOCKET.
  1816  func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
  1817  	switch name {
  1818  	case linux.SO_SNDBUF:
  1819  		if len(optVal) < sizeOfInt32 {
  1820  			return syserr.ErrInvalidArgument
  1821  		}
  1822  
  1823  		v := hostarch.ByteOrder.Uint32(optVal)
  1824  		min, max := ep.SocketOptions().SendBufferLimits()
  1825  		clamped := clampBufSize(int64(v), min, max, false /* ignoreMax */)
  1826  		ep.SocketOptions().SetSendBufferSize(clamped, true /* notify */)
  1827  		return nil
  1828  
  1829  	case linux.SO_RCVBUF:
  1830  		if len(optVal) < sizeOfInt32 {
  1831  			return syserr.ErrInvalidArgument
  1832  		}
  1833  
  1834  		v := hostarch.ByteOrder.Uint32(optVal)
  1835  		min, max := ep.SocketOptions().ReceiveBufferLimits()
  1836  		clamped := clampBufSize(int64(v), min, max, false /* ignoreMax */)
  1837  		ep.SocketOptions().SetReceiveBufferSize(clamped, true /* notify */)
  1838  		return nil
  1839  
  1840  	case linux.SO_RCVBUFFORCE:
  1841  		if len(optVal) < sizeOfInt32 {
  1842  			return syserr.ErrInvalidArgument
  1843  		}
  1844  
  1845  		if creds := auth.CredentialsFromContext(t); !creds.HasCapability(linux.CAP_NET_ADMIN) {
  1846  			return syserr.ErrNotPermitted
  1847  		}
  1848  
  1849  		v := hostarch.ByteOrder.Uint32(optVal)
  1850  		min, max := ep.SocketOptions().ReceiveBufferLimits()
  1851  		clamped := clampBufSize(int64(v), min, max, true /* ignoreMax */)
  1852  		ep.SocketOptions().SetReceiveBufferSize(clamped, true /* notify */)
  1853  		return nil
  1854  
  1855  	case linux.SO_REUSEADDR:
  1856  		if len(optVal) < sizeOfInt32 {
  1857  			return syserr.ErrInvalidArgument
  1858  		}
  1859  
  1860  		v := hostarch.ByteOrder.Uint32(optVal)
  1861  		ep.SocketOptions().SetReuseAddress(v != 0)
  1862  		return nil
  1863  
  1864  	case linux.SO_REUSEPORT:
  1865  		if len(optVal) < sizeOfInt32 {
  1866  			return syserr.ErrInvalidArgument
  1867  		}
  1868  
  1869  		v := hostarch.ByteOrder.Uint32(optVal)
  1870  		ep.SocketOptions().SetReusePort(v != 0)
  1871  		return nil
  1872  
  1873  	case linux.SO_BINDTODEVICE:
  1874  		n := bytes.IndexByte(optVal, 0)
  1875  		if n == -1 {
  1876  			n = len(optVal)
  1877  		}
  1878  		name := string(optVal[:n])
  1879  		if name == "" {
  1880  			return syserr.TranslateNetstackError(ep.SocketOptions().SetBindToDevice(0))
  1881  		}
  1882  		s := t.NetworkContext()
  1883  		if s == nil {
  1884  			return syserr.ErrNoDevice
  1885  		}
  1886  		for nicID, nic := range s.Interfaces() {
  1887  			if nic.Name == name {
  1888  				return syserr.TranslateNetstackError(ep.SocketOptions().SetBindToDevice(nicID))
  1889  			}
  1890  		}
  1891  		return syserr.ErrUnknownDevice
  1892  
  1893  	case linux.SO_BROADCAST:
  1894  		if len(optVal) < sizeOfInt32 {
  1895  			return syserr.ErrInvalidArgument
  1896  		}
  1897  
  1898  		v := hostarch.ByteOrder.Uint32(optVal)
  1899  		ep.SocketOptions().SetBroadcast(v != 0)
  1900  		return nil
  1901  
  1902  	case linux.SO_PASSCRED:
  1903  		if len(optVal) < sizeOfInt32 {
  1904  			return syserr.ErrInvalidArgument
  1905  		}
  1906  
  1907  		v := hostarch.ByteOrder.Uint32(optVal)
  1908  		ep.SocketOptions().SetPassCred(v != 0)
  1909  		return nil
  1910  
  1911  	case linux.SO_KEEPALIVE:
  1912  		if len(optVal) < sizeOfInt32 {
  1913  			return syserr.ErrInvalidArgument
  1914  		}
  1915  
  1916  		v := hostarch.ByteOrder.Uint32(optVal)
  1917  		ep.SocketOptions().SetKeepAlive(v != 0)
  1918  		return nil
  1919  
  1920  	case linux.SO_SNDTIMEO:
  1921  		if len(optVal) < linux.SizeOfTimeval {
  1922  			return syserr.ErrInvalidArgument
  1923  		}
  1924  
  1925  		var v linux.Timeval
  1926  		v.UnmarshalBytes(optVal)
  1927  		if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) {
  1928  			return syserr.ErrDomain
  1929  		}
  1930  		s.SetSendTimeout(v.ToNsecCapped())
  1931  		return nil
  1932  
  1933  	case linux.SO_RCVTIMEO:
  1934  		if len(optVal) < linux.SizeOfTimeval {
  1935  			return syserr.ErrInvalidArgument
  1936  		}
  1937  
  1938  		var v linux.Timeval
  1939  		v.UnmarshalBytes(optVal)
  1940  		if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) {
  1941  			return syserr.ErrDomain
  1942  		}
  1943  		s.SetRecvTimeout(v.ToNsecCapped())
  1944  		return nil
  1945  
  1946  	case linux.SO_OOBINLINE:
  1947  		if len(optVal) < sizeOfInt32 {
  1948  			return syserr.ErrInvalidArgument
  1949  		}
  1950  
  1951  		v := hostarch.ByteOrder.Uint32(optVal)
  1952  		ep.SocketOptions().SetOutOfBandInline(v != 0)
  1953  		return nil
  1954  
  1955  	case linux.SO_NO_CHECK:
  1956  		if len(optVal) < sizeOfInt32 {
  1957  			return syserr.ErrInvalidArgument
  1958  		}
  1959  
  1960  		v := hostarch.ByteOrder.Uint32(optVal)
  1961  		ep.SocketOptions().SetNoChecksum(v != 0)
  1962  		return nil
  1963  
  1964  	case linux.SO_LINGER:
  1965  		if len(optVal) < linux.SizeOfLinger {
  1966  			return syserr.ErrInvalidArgument
  1967  		}
  1968  
  1969  		var v linux.Linger
  1970  		v.UnmarshalBytes(optVal)
  1971  
  1972  		ep.SocketOptions().SetLinger(tcpip.LingerOption{
  1973  			Enabled: v.OnOff != 0,
  1974  			Timeout: time.Second * time.Duration(v.Linger),
  1975  		})
  1976  		return nil
  1977  
  1978  	case linux.SO_DETACH_FILTER:
  1979  		// optval is ignored.
  1980  		var v tcpip.SocketDetachFilterOption
  1981  		return syserr.TranslateNetstackError(ep.SetSockOpt(&v))
  1982  
  1983  	// TODO(b/226603727): Add support for SO_RCVLOWAT option. For now, only
  1984  	// the unsupported syscall message is removed.
  1985  	case linux.SO_RCVLOWAT:
  1986  		if len(optVal) < sizeOfInt32 {
  1987  			return syserr.ErrInvalidArgument
  1988  		}
  1989  
  1990  		v := hostarch.ByteOrder.Uint32(optVal)
  1991  		ep.SocketOptions().SetRcvlowat(int32(v))
  1992  		return nil
  1993  	}
  1994  
  1995  	return nil
  1996  }
  1997  
  1998  // setSockOptTCP implements SetSockOpt when level is SOL_TCP.
  1999  func setSockOptTCP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
  2000  	if !socket.IsTCP(s) {
  2001  		return syserr.ErrUnknownProtocolOption
  2002  	}
  2003  
  2004  	switch name {
  2005  	case linux.TCP_NODELAY:
  2006  		if len(optVal) < sizeOfInt32 {
  2007  			return syserr.ErrInvalidArgument
  2008  		}
  2009  
  2010  		v := hostarch.ByteOrder.Uint32(optVal)
  2011  		ep.SocketOptions().SetDelayOption(v == 0)
  2012  		return nil
  2013  
  2014  	case linux.TCP_CORK:
  2015  		if len(optVal) < sizeOfInt32 {
  2016  			return syserr.ErrInvalidArgument
  2017  		}
  2018  
  2019  		v := hostarch.ByteOrder.Uint32(optVal)
  2020  		ep.SocketOptions().SetCorkOption(v != 0)
  2021  		return nil
  2022  
  2023  	case linux.TCP_QUICKACK:
  2024  		if len(optVal) < sizeOfInt32 {
  2025  			return syserr.ErrInvalidArgument
  2026  		}
  2027  
  2028  		v := hostarch.ByteOrder.Uint32(optVal)
  2029  		ep.SocketOptions().SetQuickAck(v != 0)
  2030  		return nil
  2031  
  2032  	case linux.TCP_MAXSEG:
  2033  		if len(optVal) < sizeOfInt32 {
  2034  			return syserr.ErrInvalidArgument
  2035  		}
  2036  
  2037  		v := hostarch.ByteOrder.Uint32(optVal)
  2038  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MaxSegOption, int(v)))
  2039  
  2040  	case linux.TCP_KEEPIDLE:
  2041  		if len(optVal) < sizeOfInt32 {
  2042  			return syserr.ErrInvalidArgument
  2043  		}
  2044  
  2045  		v := hostarch.ByteOrder.Uint32(optVal)
  2046  		if v < 1 || v > linux.MAX_TCP_KEEPIDLE {
  2047  			return syserr.ErrInvalidArgument
  2048  		}
  2049  		opt := tcpip.KeepaliveIdleOption(time.Second * time.Duration(v))
  2050  		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
  2051  
  2052  	case linux.TCP_KEEPINTVL:
  2053  		if len(optVal) < sizeOfInt32 {
  2054  			return syserr.ErrInvalidArgument
  2055  		}
  2056  
  2057  		v := hostarch.ByteOrder.Uint32(optVal)
  2058  		if v < 1 || v > linux.MAX_TCP_KEEPINTVL {
  2059  			return syserr.ErrInvalidArgument
  2060  		}
  2061  		opt := tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v))
  2062  		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
  2063  
  2064  	case linux.TCP_KEEPCNT:
  2065  		if len(optVal) < sizeOfInt32 {
  2066  			return syserr.ErrInvalidArgument
  2067  		}
  2068  
  2069  		v := hostarch.ByteOrder.Uint32(optVal)
  2070  		if v < 1 || v > linux.MAX_TCP_KEEPCNT {
  2071  			return syserr.ErrInvalidArgument
  2072  		}
  2073  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.KeepaliveCountOption, int(v)))
  2074  
  2075  	case linux.TCP_USER_TIMEOUT:
  2076  		if len(optVal) < sizeOfInt32 {
  2077  			return syserr.ErrInvalidArgument
  2078  		}
  2079  
  2080  		v := int32(hostarch.ByteOrder.Uint32(optVal))
  2081  		if v < 0 {
  2082  			return syserr.ErrInvalidArgument
  2083  		}
  2084  		opt := tcpip.TCPUserTimeoutOption(time.Millisecond * time.Duration(v))
  2085  		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
  2086  
  2087  	case linux.TCP_CONGESTION:
  2088  		v := tcpip.CongestionControlOption(optVal)
  2089  		if err := ep.SetSockOpt(&v); err != nil {
  2090  			return syserr.TranslateNetstackError(err)
  2091  		}
  2092  		return nil
  2093  
  2094  	case linux.TCP_LINGER2:
  2095  		if len(optVal) < sizeOfInt32 {
  2096  			return syserr.ErrInvalidArgument
  2097  		}
  2098  
  2099  		v := int32(hostarch.ByteOrder.Uint32(optVal))
  2100  		opt := tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v))
  2101  		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
  2102  
  2103  	case linux.TCP_DEFER_ACCEPT:
  2104  		if len(optVal) < sizeOfInt32 {
  2105  			return syserr.ErrInvalidArgument
  2106  		}
  2107  		v := int32(hostarch.ByteOrder.Uint32(optVal))
  2108  		if v < 0 {
  2109  			v = 0
  2110  		}
  2111  		opt := tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v))
  2112  		return syserr.TranslateNetstackError(ep.SetSockOpt(&opt))
  2113  
  2114  	case linux.TCP_SYNCNT:
  2115  		if len(optVal) < sizeOfInt32 {
  2116  			return syserr.ErrInvalidArgument
  2117  		}
  2118  		v := hostarch.ByteOrder.Uint32(optVal)
  2119  
  2120  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPSynCountOption, int(v)))
  2121  
  2122  	case linux.TCP_WINDOW_CLAMP:
  2123  		if len(optVal) < sizeOfInt32 {
  2124  			return syserr.ErrInvalidArgument
  2125  		}
  2126  		v := hostarch.ByteOrder.Uint32(optVal)
  2127  
  2128  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPWindowClampOption, int(v)))
  2129  
  2130  	case linux.TCP_REPAIR_OPTIONS:
  2131  		// Not supported.
  2132  	}
  2133  
  2134  	return nil
  2135  }
  2136  
  2137  func setSockOptICMPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
  2138  	if _, ok := ep.(tcpip.Endpoint); !ok {
  2139  		log.Warningf("SOL_ICMPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
  2140  		return syserr.ErrUnknownProtocolOption
  2141  	}
  2142  
  2143  	if family, _, _ := s.Type(); family != linux.AF_INET6 {
  2144  		return syserr.ErrUnknownProtocolOption
  2145  	}
  2146  
  2147  	switch name {
  2148  	case linux.ICMPV6_FILTER:
  2149  		var req linux.ICMP6Filter
  2150  		if len(optVal) < req.SizeBytes() {
  2151  			return syserr.ErrInvalidArgument
  2152  		}
  2153  
  2154  		req.UnmarshalUnsafe(optVal)
  2155  		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.ICMPv6Filter{DenyType: req.Filter}))
  2156  	}
  2157  
  2158  	return nil
  2159  }
  2160  
  2161  // setSockOptIPv6 implements SetSockOpt when level is SOL_IPV6.
  2162  func setSockOptIPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
  2163  	if _, ok := ep.(tcpip.Endpoint); !ok {
  2164  		log.Warningf("SOL_IPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
  2165  		return syserr.ErrUnknownProtocolOption
  2166  	}
  2167  
  2168  	family, _, _ := s.Type()
  2169  	if family != linux.AF_INET6 {
  2170  		return syserr.ErrUnknownProtocolOption
  2171  	}
  2172  
  2173  	switch name {
  2174  	case linux.IPV6_CHECKSUM:
  2175  		if len(optVal) < sizeOfInt32 {
  2176  			return syserr.ErrInvalidArgument
  2177  		}
  2178  
  2179  		// int may not be 32-bits so we cast the uint32 to an int32 before casting
  2180  		// to an int.
  2181  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6Checksum, int(int32(hostarch.ByteOrder.Uint32(optVal)))))
  2182  
  2183  	case linux.IPV6_V6ONLY:
  2184  		if len(optVal) < sizeOfInt32 {
  2185  			return syserr.ErrInvalidArgument
  2186  		}
  2187  
  2188  		if socket.IsTCP(s) && tcp.EndpointState(ep.State()) != tcp.StateInitial {
  2189  			return syserr.ErrInvalidEndpointState
  2190  		} else if socket.IsUDP(s) && transport.DatagramEndpointState(ep.State()) != transport.DatagramEndpointStateInitial {
  2191  			return syserr.ErrInvalidEndpointState
  2192  		}
  2193  
  2194  		v := hostarch.ByteOrder.Uint32(optVal)
  2195  		ep.SocketOptions().SetV6Only(v != 0)
  2196  		return nil
  2197  
  2198  	case linux.IPV6_ADD_MEMBERSHIP:
  2199  		req, err := copyInMulticastV6Request(optVal)
  2200  		if err != nil {
  2201  			return err
  2202  		}
  2203  
  2204  		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.AddMembershipOption{
  2205  			NIC:           tcpip.NICID(req.InterfaceIndex),
  2206  			MulticastAddr: tcpip.AddrFrom16(req.MulticastAddr),
  2207  		}))
  2208  
  2209  	case linux.IPV6_DROP_MEMBERSHIP:
  2210  		req, err := copyInMulticastV6Request(optVal)
  2211  		if err != nil {
  2212  			return err
  2213  		}
  2214  
  2215  		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.RemoveMembershipOption{
  2216  			NIC:           tcpip.NICID(req.InterfaceIndex),
  2217  			MulticastAddr: tcpip.AddrFrom16(req.MulticastAddr),
  2218  		}))
  2219  
  2220  	case linux.IPV6_IPSEC_POLICY,
  2221  		linux.IPV6_JOIN_ANYCAST,
  2222  		linux.IPV6_LEAVE_ANYCAST,
  2223  		// TODO(b/148887420): Add support for IPV6_PKTINFO.
  2224  		linux.IPV6_PKTINFO,
  2225  		linux.IPV6_ROUTER_ALERT,
  2226  		linux.IPV6_XFRM_POLICY,
  2227  		linux.MCAST_BLOCK_SOURCE,
  2228  		linux.MCAST_JOIN_GROUP,
  2229  		linux.MCAST_JOIN_SOURCE_GROUP,
  2230  		linux.MCAST_LEAVE_GROUP,
  2231  		linux.MCAST_LEAVE_SOURCE_GROUP,
  2232  		linux.MCAST_UNBLOCK_SOURCE:
  2233  		// Not supported.
  2234  
  2235  	case linux.IPV6_RECVORIGDSTADDR:
  2236  		if len(optVal) < sizeOfInt32 {
  2237  			return syserr.ErrInvalidArgument
  2238  		}
  2239  		v := int32(hostarch.ByteOrder.Uint32(optVal))
  2240  
  2241  		ep.SocketOptions().SetReceiveOriginalDstAddress(v != 0)
  2242  		return nil
  2243  
  2244  	case linux.IPV6_RECVPKTINFO:
  2245  		if len(optVal) < sizeOfInt32 {
  2246  			return syserr.ErrInvalidArgument
  2247  		}
  2248  		v := int32(hostarch.ByteOrder.Uint32(optVal))
  2249  
  2250  		ep.SocketOptions().SetIPv6ReceivePacketInfo(v != 0)
  2251  		return nil
  2252  
  2253  	case linux.IPV6_UNICAST_HOPS:
  2254  		if len(optVal) < sizeOfInt32 {
  2255  			return syserr.ErrInvalidArgument
  2256  		}
  2257  		v := int32(hostarch.ByteOrder.Uint32(optVal))
  2258  		if v < -1 || v > 255 {
  2259  			return syserr.ErrInvalidArgument
  2260  		}
  2261  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6HopLimitOption, int(v)))
  2262  
  2263  	case linux.IPV6_RECVHOPLIMIT:
  2264  		v, err := parseIntOrChar(optVal)
  2265  		if err != nil {
  2266  			return err
  2267  		}
  2268  
  2269  		ep.SocketOptions().SetReceiveHopLimit(v != 0)
  2270  		return nil
  2271  
  2272  	case linux.IPV6_TCLASS:
  2273  		if len(optVal) < sizeOfInt32 {
  2274  			return syserr.ErrInvalidArgument
  2275  		}
  2276  		v := int32(hostarch.ByteOrder.Uint32(optVal))
  2277  		if v < -1 || v > 255 {
  2278  			return syserr.ErrInvalidArgument
  2279  		}
  2280  		if v == -1 {
  2281  			v = 0
  2282  		}
  2283  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6TrafficClassOption, int(v)))
  2284  
  2285  	case linux.IPV6_RECVTCLASS:
  2286  		v, err := parseIntOrChar(optVal)
  2287  		if err != nil {
  2288  			return err
  2289  		}
  2290  
  2291  		ep.SocketOptions().SetReceiveTClass(v != 0)
  2292  		return nil
  2293  	case linux.IPV6_RECVERR:
  2294  		if len(optVal) == 0 {
  2295  			return nil
  2296  		}
  2297  		v, err := parseIntOrChar(optVal)
  2298  		if err != nil {
  2299  			return err
  2300  		}
  2301  		ep.SocketOptions().SetIPv6RecvError(v != 0)
  2302  		return nil
  2303  
  2304  	case linux.IP6T_SO_SET_REPLACE:
  2305  		if len(optVal) < linux.SizeOfIP6TReplace {
  2306  			return syserr.ErrInvalidArgument
  2307  		}
  2308  
  2309  		// Only valid for raw IPv6 sockets.
  2310  		if !socket.IsRaw(s) {
  2311  			return syserr.ErrProtocolNotAvailable
  2312  		}
  2313  
  2314  		stk := inet.StackFromContext(t)
  2315  		if stk == nil {
  2316  			return syserr.ErrNoDevice
  2317  		}
  2318  		// Stack must be a netstack stack.
  2319  		return netfilter.SetEntries(t, stk.(*Stack).Stack, optVal, true)
  2320  
  2321  	case linux.IP6T_SO_SET_ADD_COUNTERS:
  2322  		log.Infof("IP6T_SO_SET_ADD_COUNTERS is not supported")
  2323  		return nil
  2324  	}
  2325  
  2326  	return nil
  2327  }
  2328  
  2329  var (
  2330  	inetMulticastRequestSize        = (*linux.InetMulticastRequest)(nil).SizeBytes()
  2331  	inetMulticastRequestWithNICSize = (*linux.InetMulticastRequestWithNIC)(nil).SizeBytes()
  2332  	inet6MulticastRequestSize       = (*linux.Inet6MulticastRequest)(nil).SizeBytes()
  2333  )
  2334  
  2335  // copyInMulticastRequest copies in a variable-size multicast request. The
  2336  // kernel determines which structure was passed by its length. IP_MULTICAST_IF
  2337  // supports ip_mreqn, ip_mreq and in_addr, while IP_ADD_MEMBERSHIP and
  2338  // IP_DROP_MEMBERSHIP only support ip_mreqn and ip_mreq. To handle this,
  2339  // allowAddr controls whether in_addr is accepted or rejected.
  2340  func copyInMulticastRequest(optVal []byte, allowAddr bool) (linux.InetMulticastRequestWithNIC, *syserr.Error) {
  2341  	if len(optVal) < len(linux.InetAddr{}) {
  2342  		return linux.InetMulticastRequestWithNIC{}, syserr.ErrInvalidArgument
  2343  	}
  2344  
  2345  	if len(optVal) < inetMulticastRequestSize {
  2346  		if !allowAddr {
  2347  			return linux.InetMulticastRequestWithNIC{}, syserr.ErrInvalidArgument
  2348  		}
  2349  
  2350  		var req linux.InetMulticastRequestWithNIC
  2351  		copy(req.InterfaceAddr[:], optVal)
  2352  		return req, nil
  2353  	}
  2354  
  2355  	if len(optVal) >= inetMulticastRequestWithNICSize {
  2356  		var req linux.InetMulticastRequestWithNIC
  2357  		req.UnmarshalUnsafe(optVal)
  2358  		return req, nil
  2359  	}
  2360  
  2361  	var req linux.InetMulticastRequestWithNIC
  2362  	req.InetMulticastRequest.UnmarshalUnsafe(optVal)
  2363  	return req, nil
  2364  }
  2365  
  2366  func copyInMulticastV6Request(optVal []byte) (linux.Inet6MulticastRequest, *syserr.Error) {
  2367  	if len(optVal) < inet6MulticastRequestSize {
  2368  		return linux.Inet6MulticastRequest{}, syserr.ErrInvalidArgument
  2369  	}
  2370  
  2371  	var req linux.Inet6MulticastRequest
  2372  	req.UnmarshalUnsafe(optVal)
  2373  	return req, nil
  2374  }
  2375  
  2376  // parseIntOrChar copies either a 32-bit int or an 8-bit uint out of buf.
  2377  //
  2378  // net/ipv4/ip_sockglue.c:do_ip_setsockopt does this for its socket options.
  2379  func parseIntOrChar(buf []byte) (int32, *syserr.Error) {
  2380  	if len(buf) == 0 {
  2381  		return 0, syserr.ErrInvalidArgument
  2382  	}
  2383  
  2384  	if len(buf) >= sizeOfInt32 {
  2385  		return int32(hostarch.ByteOrder.Uint32(buf)), nil
  2386  	}
  2387  
  2388  	return int32(buf[0]), nil
  2389  }
  2390  
  2391  // setSockOptIP implements SetSockOpt when level is SOL_IP.
  2392  func setSockOptIP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error {
  2393  	if _, ok := ep.(tcpip.Endpoint); !ok {
  2394  		log.Warningf("SOL_IP options not supported on endpoints other than tcpip.Endpoint: option = %d", name)
  2395  		return syserr.ErrUnknownProtocolOption
  2396  	}
  2397  
  2398  	switch name {
  2399  	case linux.IP_MULTICAST_TTL:
  2400  		v, err := parseIntOrChar(optVal)
  2401  		if err != nil {
  2402  			return err
  2403  		}
  2404  
  2405  		if v == -1 {
  2406  			// Linux translates -1 to 1.
  2407  			v = 1
  2408  		}
  2409  		if v < 0 || v > 255 {
  2410  			return syserr.ErrInvalidArgument
  2411  		}
  2412  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MulticastTTLOption, int(v)))
  2413  
  2414  	case linux.IP_ADD_MEMBERSHIP:
  2415  		req, err := copyInMulticastRequest(optVal, false /* allowAddr */)
  2416  		if err != nil {
  2417  			return err
  2418  		}
  2419  
  2420  		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.AddMembershipOption{
  2421  			NIC: tcpip.NICID(req.InterfaceIndex),
  2422  			// TODO(igudger): Change AddMembership to use the standard
  2423  			// any address representation.
  2424  			InterfaceAddr: tcpip.AddrFrom4(req.InterfaceAddr),
  2425  			MulticastAddr: tcpip.AddrFrom4(req.MulticastAddr),
  2426  		}))
  2427  
  2428  	case linux.IP_DROP_MEMBERSHIP:
  2429  		req, err := copyInMulticastRequest(optVal, false /* allowAddr */)
  2430  		if err != nil {
  2431  			return err
  2432  		}
  2433  
  2434  		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.RemoveMembershipOption{
  2435  			NIC: tcpip.NICID(req.InterfaceIndex),
  2436  			// TODO(igudger): Change DropMembership to use the standard
  2437  			// any address representation.
  2438  			InterfaceAddr: tcpip.AddrFrom4(req.InterfaceAddr),
  2439  			MulticastAddr: tcpip.AddrFrom4(req.MulticastAddr),
  2440  		}))
  2441  
  2442  	case linux.IP_MULTICAST_IF:
  2443  		req, err := copyInMulticastRequest(optVal, true /* allowAddr */)
  2444  		if err != nil {
  2445  			return err
  2446  		}
  2447  
  2448  		return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.MulticastInterfaceOption{
  2449  			NIC:           tcpip.NICID(req.InterfaceIndex),
  2450  			InterfaceAddr: socket.BytesToIPAddress(req.InterfaceAddr[:]),
  2451  		}))
  2452  
  2453  	case linux.IP_MULTICAST_LOOP:
  2454  		v, err := parseIntOrChar(optVal)
  2455  		if err != nil {
  2456  			return err
  2457  		}
  2458  
  2459  		ep.SocketOptions().SetMulticastLoop(v != 0)
  2460  		return nil
  2461  
  2462  	case linux.MCAST_JOIN_GROUP:
  2463  		// FIXME(b/124219304): Implement MCAST_JOIN_GROUP.
  2464  		return syserr.ErrInvalidArgument
  2465  
  2466  	case linux.IP_TTL:
  2467  		v, err := parseIntOrChar(optVal)
  2468  		if err != nil {
  2469  			return err
  2470  		}
  2471  
  2472  		// -1 means default TTL.
  2473  		if v == -1 {
  2474  			v = 0
  2475  		} else if v < 1 || v > 255 {
  2476  			return syserr.ErrInvalidArgument
  2477  		}
  2478  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv4TTLOption, int(v)))
  2479  
  2480  	case linux.IP_RECVTTL:
  2481  		v, err := parseIntOrChar(optVal)
  2482  		if err != nil {
  2483  			return err
  2484  		}
  2485  		ep.SocketOptions().SetReceiveTTL(v != 0)
  2486  		return nil
  2487  
  2488  	case linux.IP_TOS:
  2489  		if len(optVal) == 0 {
  2490  			return nil
  2491  		}
  2492  		v, err := parseIntOrChar(optVal)
  2493  		if err != nil {
  2494  			return err
  2495  		}
  2496  		return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv4TOSOption, int(v)))
  2497  
  2498  	case linux.IP_RECVTOS:
  2499  		v, err := parseIntOrChar(optVal)
  2500  		if err != nil {
  2501  			return err
  2502  		}
  2503  		ep.SocketOptions().SetReceiveTOS(v != 0)
  2504  		return nil
  2505  
  2506  	case linux.IP_RECVERR:
  2507  		if len(optVal) == 0 {
  2508  			return nil
  2509  		}
  2510  		v, err := parseIntOrChar(optVal)
  2511  		if err != nil {
  2512  			return err
  2513  		}
  2514  		ep.SocketOptions().SetIPv4RecvError(v != 0)
  2515  		return nil
  2516  
  2517  	case linux.IP_PKTINFO:
  2518  		if len(optVal) == 0 {
  2519  			return nil
  2520  		}
  2521  		v, err := parseIntOrChar(optVal)
  2522  		if err != nil {
  2523  			return err
  2524  		}
  2525  		ep.SocketOptions().SetReceivePacketInfo(v != 0)
  2526  		return nil
  2527  
  2528  	case linux.IP_HDRINCL:
  2529  		if len(optVal) == 0 {
  2530  			return nil
  2531  		}
  2532  		v, err := parseIntOrChar(optVal)
  2533  		if err != nil {
  2534  			return err
  2535  		}
  2536  		ep.SocketOptions().SetHeaderIncluded(v != 0)
  2537  		return nil
  2538  
  2539  	case linux.IP_RECVORIGDSTADDR:
  2540  		if len(optVal) == 0 {
  2541  			return nil
  2542  		}
  2543  		v, err := parseIntOrChar(optVal)
  2544  		if err != nil {
  2545  			return err
  2546  		}
  2547  
  2548  		ep.SocketOptions().SetReceiveOriginalDstAddress(v != 0)
  2549  		return nil
  2550  
  2551  	case linux.IPT_SO_SET_REPLACE:
  2552  		if len(optVal) < linux.SizeOfIPTReplace {
  2553  			return syserr.ErrInvalidArgument
  2554  		}
  2555  
  2556  		// Only valid for raw IPv4 sockets.
  2557  		if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW {
  2558  			return syserr.ErrProtocolNotAvailable
  2559  		}
  2560  
  2561  		stk := inet.StackFromContext(t)
  2562  		if stk == nil {
  2563  			return syserr.ErrNoDevice
  2564  		}
  2565  		// Stack must be a netstack stack.
  2566  		return netfilter.SetEntries(t, stk.(*Stack).Stack, optVal, false)
  2567  
  2568  	case linux.IPT_SO_SET_ADD_COUNTERS:
  2569  		log.Infof("IPT_SO_SET_ADD_COUNTERS is not supported")
  2570  		return nil
  2571  
  2572  	case linux.IP_ADD_SOURCE_MEMBERSHIP,
  2573  		linux.IP_BIND_ADDRESS_NO_PORT,
  2574  		linux.IP_BLOCK_SOURCE,
  2575  		linux.IP_CHECKSUM,
  2576  		linux.IP_DROP_SOURCE_MEMBERSHIP,
  2577  		linux.IP_FREEBIND,
  2578  		linux.IP_IPSEC_POLICY,
  2579  		linux.IP_MINTTL,
  2580  		linux.IP_MSFILTER,
  2581  		linux.IP_MTU_DISCOVER,
  2582  		linux.IP_MULTICAST_ALL,
  2583  		linux.IP_NODEFRAG,
  2584  		linux.IP_OPTIONS,
  2585  		linux.IP_PASSSEC,
  2586  		linux.IP_RECVFRAGSIZE,
  2587  		linux.IP_RECVOPTS,
  2588  		linux.IP_RETOPTS,
  2589  		linux.IP_TRANSPARENT,
  2590  		linux.IP_UNBLOCK_SOURCE,
  2591  		linux.IP_UNICAST_IF,
  2592  		linux.IP_XFRM_POLICY,
  2593  		linux.MCAST_BLOCK_SOURCE,
  2594  		linux.MCAST_JOIN_SOURCE_GROUP,
  2595  		linux.MCAST_LEAVE_GROUP,
  2596  		linux.MCAST_LEAVE_SOURCE_GROUP,
  2597  		linux.MCAST_MSFILTER,
  2598  		linux.MCAST_UNBLOCK_SOURCE:
  2599  		// Not supported.
  2600  	}
  2601  
  2602  	return nil
  2603  }
  2604  
  2605  // GetSockName implements the linux syscall getsockname(2) for sockets backed by
  2606  // tcpip.Endpoint.
  2607  func (s *sock) GetSockName(*kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
  2608  	addr, err := s.Endpoint.GetLocalAddress()
  2609  	if err != nil {
  2610  		return nil, 0, syserr.TranslateNetstackError(err)
  2611  	}
  2612  
  2613  	a, l := socket.ConvertAddress(s.family, addr)
  2614  	return a, l, nil
  2615  }
  2616  
  2617  // GetPeerName implements the linux syscall getpeername(2) for sockets backed by
  2618  // tcpip.Endpoint.
  2619  func (s *sock) GetPeerName(*kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
  2620  	addr, err := s.Endpoint.GetRemoteAddress()
  2621  	if err != nil {
  2622  		return nil, 0, syserr.TranslateNetstackError(err)
  2623  	}
  2624  
  2625  	a, l := socket.ConvertAddress(s.family, addr)
  2626  	return a, l, nil
  2627  }
  2628  
  2629  func (s *sock) fillCmsgInq(cmsg *socket.ControlMessages) {
  2630  	if !s.sockOptInq {
  2631  		return
  2632  	}
  2633  	rcvBufUsed, err := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption)
  2634  	if err != nil {
  2635  		return
  2636  	}
  2637  	cmsg.IP.HasInq = true
  2638  	cmsg.IP.Inq = int32(rcvBufUsed)
  2639  }
  2640  
  2641  func toLinuxPacketType(pktType tcpip.PacketType) uint8 {
  2642  	switch pktType {
  2643  	case tcpip.PacketHost:
  2644  		return linux.PACKET_HOST
  2645  	case tcpip.PacketOtherHost:
  2646  		return linux.PACKET_OTHERHOST
  2647  	case tcpip.PacketOutgoing:
  2648  		return linux.PACKET_OUTGOING
  2649  	case tcpip.PacketBroadcast:
  2650  		return linux.PACKET_BROADCAST
  2651  	case tcpip.PacketMulticast:
  2652  		return linux.PACKET_MULTICAST
  2653  	default:
  2654  		panic(fmt.Sprintf("unknown packet type: %d", pktType))
  2655  	}
  2656  }
  2657  
  2658  // nonBlockingRead issues a non-blocking read.
  2659  //
  2660  // TODO(b/78348848): Support timestamps for stream sockets.
  2661  func (s *sock) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
  2662  	isPacket := s.isPacketBased()
  2663  
  2664  	readOptions := tcpip.ReadOptions{
  2665  		Peek:               peek,
  2666  		NeedRemoteAddr:     senderRequested,
  2667  		NeedLinkPacketInfo: isPacket,
  2668  	}
  2669  
  2670  	// TCP sockets discard the data if MSG_TRUNC is set.
  2671  	//
  2672  	// This behavior is documented in man 7 tcp:
  2673  	// Since version 2.4, Linux supports the use of MSG_TRUNC in the flags
  2674  	// argument of recv(2) (and recvmsg(2)). This flag causes the received
  2675  	// bytes of data to be discarded, rather than passed back in a
  2676  	// caller-supplied  buffer.
  2677  	var w io.Writer
  2678  	if !isPacket && trunc {
  2679  		w = &tcpip.LimitedWriter{
  2680  			W: ioutil.Discard,
  2681  			N: dst.NumBytes(),
  2682  		}
  2683  	} else {
  2684  		w = dst.Writer(ctx)
  2685  	}
  2686  
  2687  	s.readMu.Lock()
  2688  	defer s.readMu.Unlock()
  2689  
  2690  	res, err := s.Endpoint.Read(w, readOptions)
  2691  	if _, ok := err.(*tcpip.ErrBadBuffer); ok && dst.NumBytes() == 0 {
  2692  		err = nil
  2693  	}
  2694  	if err != nil {
  2695  		return 0, 0, nil, 0, socket.ControlMessages{}, syserr.TranslateNetstackError(err)
  2696  	}
  2697  	// Set the control message, even if 0 bytes were read.
  2698  	s.updateTimestamp(res.ControlMessages)
  2699  
  2700  	if isPacket {
  2701  		var addr linux.SockAddr
  2702  		var addrLen uint32
  2703  		if senderRequested {
  2704  			addr, addrLen = socket.ConvertAddress(s.family, res.RemoteAddr)
  2705  			switch v := addr.(type) {
  2706  			case *linux.SockAddrLink:
  2707  				v.Protocol = socket.Htons(uint16(res.LinkPacketInfo.Protocol))
  2708  				v.PacketType = toLinuxPacketType(res.LinkPacketInfo.PktType)
  2709  			}
  2710  		}
  2711  
  2712  		msgLen := res.Count
  2713  		if trunc {
  2714  			msgLen = res.Total
  2715  		}
  2716  
  2717  		var flags int
  2718  		if res.Total > res.Count {
  2719  			flags |= linux.MSG_TRUNC
  2720  		}
  2721  
  2722  		return msgLen, flags, addr, addrLen, s.netstackToLinuxControlMessages(res.ControlMessages), nil
  2723  	}
  2724  
  2725  	if peek {
  2726  		// MSG_TRUNC with MSG_PEEK on a TCP socket returns the
  2727  		// amount that could be read, and does not write to buffer.
  2728  		if trunc {
  2729  			// TCP endpoint does not return the total bytes in buffer as numTotal.
  2730  			// We need to query it from socket option.
  2731  			rql, err := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption)
  2732  			if err != nil {
  2733  				return 0, 0, nil, 0, socket.ControlMessages{}, syserr.TranslateNetstackError(err)
  2734  			}
  2735  			msgLen := int(dst.NumBytes())
  2736  			if msgLen > rql {
  2737  				msgLen = rql
  2738  			}
  2739  			return msgLen, 0, nil, 0, socket.ControlMessages{}, nil
  2740  		}
  2741  	} else if n := res.Count; n != 0 {
  2742  		s.Endpoint.ModerateRecvBuf(n)
  2743  	}
  2744  
  2745  	cmsg := s.netstackToLinuxControlMessages(res.ControlMessages)
  2746  	s.fillCmsgInq(&cmsg)
  2747  	return res.Count, 0, nil, 0, cmsg, syserr.TranslateNetstackError(err)
  2748  }
  2749  
  2750  func (s *sock) netstackToLinuxControlMessages(cm tcpip.ReceivableControlMessages) socket.ControlMessages {
  2751  	readCM := socket.NewIPControlMessages(s.family, cm)
  2752  	return socket.ControlMessages{
  2753  		IP: socket.IPControlMessages{
  2754  			HasTimestamp:       readCM.HasTimestamp && s.sockOptTimestamp,
  2755  			Timestamp:          readCM.Timestamp,
  2756  			HasInq:             readCM.HasInq,
  2757  			Inq:                readCM.Inq,
  2758  			HasTOS:             readCM.HasTOS,
  2759  			TOS:                readCM.TOS,
  2760  			HasTClass:          readCM.HasTClass,
  2761  			TClass:             readCM.TClass,
  2762  			HasTTL:             readCM.HasTTL,
  2763  			TTL:                readCM.TTL,
  2764  			HasHopLimit:        readCM.HasHopLimit,
  2765  			HopLimit:           readCM.HopLimit,
  2766  			HasIPPacketInfo:    readCM.HasIPPacketInfo,
  2767  			PacketInfo:         readCM.PacketInfo,
  2768  			HasIPv6PacketInfo:  readCM.HasIPv6PacketInfo,
  2769  			IPv6PacketInfo:     readCM.IPv6PacketInfo,
  2770  			OriginalDstAddress: readCM.OriginalDstAddress,
  2771  			SockErr:            readCM.SockErr,
  2772  		},
  2773  	}
  2774  }
  2775  
  2776  func (s *sock) linuxToNetstackControlMessages(cm socket.ControlMessages) tcpip.SendableControlMessages {
  2777  	return tcpip.SendableControlMessages{
  2778  		HasTTL:      cm.IP.HasTTL,
  2779  		TTL:         uint8(cm.IP.TTL),
  2780  		HasHopLimit: cm.IP.HasHopLimit,
  2781  		HopLimit:    uint8(cm.IP.HopLimit),
  2782  	}
  2783  }
  2784  
  2785  // updateTimestamp sets the timestamp for SIOCGSTAMP. It should be called after
  2786  // successfully writing packet data out to userspace.
  2787  //
  2788  // Precondition: s.readMu must be locked.
  2789  func (s *sock) updateTimestamp(cm tcpip.ReceivableControlMessages) {
  2790  	// Save the SIOCGSTAMP timestamp only if SO_TIMESTAMP is disabled.
  2791  	if !s.sockOptTimestamp {
  2792  		s.timestampValid = true
  2793  		s.timestamp = cm.Timestamp
  2794  	}
  2795  }
  2796  
  2797  // dequeueErr is analogous to net/core/skbuff.c:sock_dequeue_err_skb().
  2798  func (s *sock) dequeueErr() *tcpip.SockError {
  2799  	so := s.Endpoint.SocketOptions()
  2800  	err := so.DequeueErr()
  2801  	if err == nil {
  2802  		return nil
  2803  	}
  2804  
  2805  	// Update socket error to reflect ICMP errors in queue.
  2806  	if nextErr := so.PeekErr(); nextErr != nil && nextErr.Cause.Origin().IsICMPErr() {
  2807  		so.SetLastError(nextErr.Err)
  2808  	} else if err.Cause.Origin().IsICMPErr() {
  2809  		so.SetLastError(nil)
  2810  	}
  2811  	return err
  2812  }
  2813  
  2814  // addrFamilyFromNetProto returns the address family identifier for the given
  2815  // network protocol.
  2816  func addrFamilyFromNetProto(net tcpip.NetworkProtocolNumber) int {
  2817  	switch net {
  2818  	case header.IPv4ProtocolNumber:
  2819  		return linux.AF_INET
  2820  	case header.IPv6ProtocolNumber:
  2821  		return linux.AF_INET6
  2822  	default:
  2823  		panic(fmt.Sprintf("invalid net proto for addr family inference: %d", net))
  2824  	}
  2825  }
  2826  
  2827  // recvErr handles MSG_ERRQUEUE for recvmsg(2).
  2828  // This is analogous to net/ipv4/ip_sockglue.c:ip_recv_error().
  2829  func (s *sock) recvErr(t *kernel.Task, dst usermem.IOSequence) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
  2830  	sockErr := s.dequeueErr()
  2831  	if sockErr == nil {
  2832  		return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain
  2833  	}
  2834  	if sockErr.Payload != nil {
  2835  		defer sockErr.Payload.Release()
  2836  	}
  2837  
  2838  	// The payload of the original packet that caused the error is passed as
  2839  	// normal data via msg_iovec.  -- recvmsg(2)
  2840  	msgFlags := linux.MSG_ERRQUEUE
  2841  	if int(dst.NumBytes()) < sockErr.Payload.Size() {
  2842  		msgFlags |= linux.MSG_TRUNC
  2843  	}
  2844  	n, err := dst.CopyOut(t, sockErr.Payload.AsSlice())
  2845  
  2846  	// The original destination address of the datagram that caused the error is
  2847  	// supplied via msg_name.  -- recvmsg(2)
  2848  	dstAddr, dstAddrLen := socket.ConvertAddress(addrFamilyFromNetProto(sockErr.NetProto), sockErr.Dst)
  2849  	cmgs := socket.ControlMessages{IP: socket.NewIPControlMessages(s.family, tcpip.ReceivableControlMessages{SockErr: sockErr})}
  2850  	return n, msgFlags, dstAddr, dstAddrLen, cmgs, syserr.FromError(err)
  2851  }
  2852  
  2853  // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by
  2854  // tcpip.Endpoint.
  2855  func (s *sock) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, _ uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) {
  2856  	if flags&linux.MSG_ERRQUEUE != 0 {
  2857  		return s.recvErr(t, dst)
  2858  	}
  2859  
  2860  	trunc := flags&linux.MSG_TRUNC != 0
  2861  	peek := flags&linux.MSG_PEEK != 0
  2862  	dontWait := flags&linux.MSG_DONTWAIT != 0
  2863  	waitAll := flags&linux.MSG_WAITALL != 0
  2864  	if senderRequested && !s.isPacketBased() {
  2865  		// Stream sockets ignore the sender address.
  2866  		senderRequested = false
  2867  	}
  2868  	n, msgFlags, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested)
  2869  
  2870  	if s.isPacketBased() && err == syserr.ErrClosedForReceive && flags&linux.MSG_DONTWAIT != 0 {
  2871  		// In this situation we should return EAGAIN.
  2872  		return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain
  2873  	}
  2874  
  2875  	if err != nil && (err != syserr.ErrWouldBlock || dontWait) {
  2876  		// Read failed and we should not retry.
  2877  		return 0, 0, nil, 0, socket.ControlMessages{}, err
  2878  	}
  2879  
  2880  	if err == nil && (dontWait || !waitAll || s.isPacketBased() || int64(n) >= dst.NumBytes()) {
  2881  		// We got all the data we need.
  2882  		return
  2883  	}
  2884  
  2885  	// Don't overwrite any data we received.
  2886  	dst = dst.DropFirst(n)
  2887  
  2888  	// We'll have to block. Register for notifications and keep trying to
  2889  	// send all the data.
  2890  	e, ch := waiter.NewChannelEntry(waiter.ReadableEvents)
  2891  	s.EventRegister(&e)
  2892  	defer s.EventUnregister(&e)
  2893  
  2894  	for {
  2895  		var rn int
  2896  		rn, msgFlags, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested)
  2897  		n += rn
  2898  		if err != nil && err != syserr.ErrWouldBlock {
  2899  			// Always stop on errors other than would block as we generally
  2900  			// won't be able to get any more data. Eat the error if we got
  2901  			// any data.
  2902  			if n > 0 {
  2903  				err = nil
  2904  			}
  2905  			return
  2906  		}
  2907  		if err == nil && (s.isPacketBased() || !waitAll || int64(rn) >= dst.NumBytes()) {
  2908  			// We got all the data we need.
  2909  			return
  2910  		}
  2911  		dst = dst.DropFirst(rn)
  2912  
  2913  		if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
  2914  			if n > 0 {
  2915  				return n, msgFlags, senderAddr, senderAddrLen, controlMessages, nil
  2916  			}
  2917  			if linuxerr.Equals(linuxerr.ETIMEDOUT, err) {
  2918  				return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain
  2919  			}
  2920  			return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err)
  2921  		}
  2922  	}
  2923  }
  2924  
  2925  // SendMsg implements the linux syscall sendmsg(2) for sockets backed by
  2926  // tcpip.Endpoint.
  2927  func (s *sock) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) {
  2928  	// Reject Unix control messages.
  2929  	if !controlMessages.Unix.Empty() {
  2930  		return 0, syserr.ErrInvalidArgument
  2931  	}
  2932  
  2933  	var addr *tcpip.FullAddress
  2934  	if len(to) > 0 {
  2935  		addrBuf, family, err := socket.AddressAndFamily(to)
  2936  		if err != nil {
  2937  			return 0, err
  2938  		}
  2939  		if !s.checkFamily(family, false /* exact */) {
  2940  			return 0, syserr.ErrInvalidArgument
  2941  		}
  2942  		addrBuf = s.mapFamily(addrBuf, family)
  2943  
  2944  		addr = &addrBuf
  2945  	}
  2946  
  2947  	opts := tcpip.WriteOptions{
  2948  		To:              addr,
  2949  		More:            flags&linux.MSG_MORE != 0,
  2950  		EndOfRecord:     flags&linux.MSG_EOR != 0,
  2951  		ControlMessages: s.linuxToNetstackControlMessages(controlMessages),
  2952  	}
  2953  
  2954  	r := src.Reader(t)
  2955  	var (
  2956  		total int64
  2957  		entry waiter.Entry
  2958  		ch    <-chan struct{}
  2959  	)
  2960  	for {
  2961  		n, err := s.Endpoint.Write(r, opts)
  2962  		total += n
  2963  		if flags&linux.MSG_DONTWAIT != 0 {
  2964  			return int(total), syserr.TranslateNetstackError(err)
  2965  		}
  2966  		block := true
  2967  		switch err.(type) {
  2968  		case nil:
  2969  			block = total != src.NumBytes()
  2970  		case *tcpip.ErrWouldBlock:
  2971  		default:
  2972  			block = false
  2973  		}
  2974  		if block {
  2975  			if ch == nil {
  2976  				// We'll have to block. Register for notification and keep trying to
  2977  				// send all the data.
  2978  				entry, ch = waiter.NewChannelEntry(waiter.WritableEvents)
  2979  				s.EventRegister(&entry)
  2980  				defer s.EventUnregister(&entry)
  2981  			} else {
  2982  				// Don't wait immediately after registration in case more data
  2983  				// became available between when we last checked and when we setup
  2984  				// the notification.
  2985  				if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
  2986  					if linuxerr.Equals(linuxerr.ETIMEDOUT, err) {
  2987  						return int(total), syserr.ErrTryAgain
  2988  					}
  2989  					// handleIOError will consume errors from t.Block if needed.
  2990  					return int(total), syserr.FromError(err)
  2991  				}
  2992  			}
  2993  			continue
  2994  		}
  2995  		return int(total), syserr.TranslateNetstackError(err)
  2996  	}
  2997  }
  2998  
  2999  // Ioctl implements vfs.FileDescriptionImpl.
  3000  func (s *sock) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) {
  3001  	t := kernel.TaskFromContext(ctx)
  3002  	if t == nil {
  3003  		panic("ioctl(2) may only be called from a task goroutine")
  3004  	}
  3005  
  3006  	// SIOCGSTAMP is implemented by netstack rather than all commonEndpoint
  3007  	// sockets.
  3008  	// TODO(b/78348848): Add a commonEndpoint method to support SIOCGSTAMP.
  3009  	switch args[1].Int() {
  3010  	case linux.SIOCGSTAMP:
  3011  		s.readMu.Lock()
  3012  		defer s.readMu.Unlock()
  3013  		if !s.timestampValid {
  3014  			return 0, linuxerr.ENOENT
  3015  		}
  3016  
  3017  		tv := linux.NsecToTimeval(s.timestamp.UnixNano())
  3018  		_, err := tv.CopyOut(t, args[2].Pointer())
  3019  		return 0, err
  3020  
  3021  	case linux.TIOCINQ:
  3022  		v, terr := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption)
  3023  		if terr != nil {
  3024  			return 0, syserr.TranslateNetstackError(terr).ToError()
  3025  		}
  3026  
  3027  		if v > math.MaxInt32 {
  3028  			v = math.MaxInt32
  3029  		}
  3030  
  3031  		// Copy result to userspace.
  3032  		vP := primitive.Int32(v)
  3033  		_, err := vP.CopyOut(t, args[2].Pointer())
  3034  		return 0, err
  3035  	}
  3036  
  3037  	return Ioctl(ctx, s.Endpoint, uio, sysno, args)
  3038  }
  3039  
  3040  // Ioctl performs a socket ioctl.
  3041  func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) {
  3042  	t := kernel.TaskFromContext(ctx)
  3043  	if t == nil {
  3044  		panic("ioctl(2) may only be called from a task goroutine")
  3045  	}
  3046  
  3047  	switch arg := int(args[1].Int()); arg {
  3048  	case linux.SIOCGIFFLAGS,
  3049  		linux.SIOCGIFADDR,
  3050  		linux.SIOCGIFBRDADDR,
  3051  		linux.SIOCGIFDSTADDR,
  3052  		linux.SIOCGIFHWADDR,
  3053  		linux.SIOCGIFINDEX,
  3054  		linux.SIOCGIFMAP,
  3055  		linux.SIOCGIFMETRIC,
  3056  		linux.SIOCGIFMTU,
  3057  		linux.SIOCGIFNAME,
  3058  		linux.SIOCGIFNETMASK,
  3059  		linux.SIOCGIFTXQLEN,
  3060  		linux.SIOCETHTOOL:
  3061  
  3062  		var ifr linux.IFReq
  3063  		if _, err := ifr.CopyIn(t, args[2].Pointer()); err != nil {
  3064  			return 0, err
  3065  		}
  3066  		if err := interfaceIoctl(ctx, io, arg, &ifr); err != nil {
  3067  			return 0, err.ToError()
  3068  		}
  3069  		_, err := ifr.CopyOut(t, args[2].Pointer())
  3070  		return 0, err
  3071  
  3072  	case linux.SIOCGIFCONF:
  3073  		// Return a list of interface addresses or the buffer size
  3074  		// necessary to hold the list.
  3075  		var ifc linux.IFConf
  3076  		if _, err := ifc.CopyIn(t, args[2].Pointer()); err != nil {
  3077  			return 0, err
  3078  		}
  3079  
  3080  		if err := ifconfIoctl(ctx, t, io, &ifc); err != nil {
  3081  			return 0, err
  3082  		}
  3083  
  3084  		_, err := ifc.CopyOut(t, args[2].Pointer())
  3085  		return 0, err
  3086  
  3087  	case linux.TIOCINQ:
  3088  		v, terr := ep.GetSockOptInt(tcpip.ReceiveQueueSizeOption)
  3089  		if terr != nil {
  3090  			return 0, syserr.TranslateNetstackError(terr).ToError()
  3091  		}
  3092  
  3093  		if v > math.MaxInt32 {
  3094  			v = math.MaxInt32
  3095  		}
  3096  		// Copy result to userspace.
  3097  		vP := primitive.Int32(v)
  3098  		_, err := vP.CopyOut(t, args[2].Pointer())
  3099  		return 0, err
  3100  
  3101  	case linux.TIOCOUTQ:
  3102  		v, terr := ep.GetSockOptInt(tcpip.SendQueueSizeOption)
  3103  		if terr != nil {
  3104  			return 0, syserr.TranslateNetstackError(terr).ToError()
  3105  		}
  3106  
  3107  		if v > math.MaxInt32 {
  3108  			v = math.MaxInt32
  3109  		}
  3110  
  3111  		// Copy result to userspace.
  3112  		vP := primitive.Int32(v)
  3113  		_, err := vP.CopyOut(t, args[2].Pointer())
  3114  		return 0, err
  3115  
  3116  	case linux.SIOCGIFMEM, linux.SIOCGIFPFLAGS, linux.SIOCGMIIPHY, linux.SIOCGMIIREG:
  3117  		// Not supported.
  3118  	}
  3119  
  3120  	return 0, linuxerr.ENOTTY
  3121  }
  3122  
  3123  // interfaceIoctl implements interface requests.
  3124  func interfaceIoctl(ctx context.Context, _ usermem.IO, arg int, ifr *linux.IFReq) *syserr.Error {
  3125  	var (
  3126  		iface inet.Interface
  3127  		index int32
  3128  		found bool
  3129  	)
  3130  
  3131  	// Find the relevant device.
  3132  	stk := inet.StackFromContext(ctx)
  3133  	if stk == nil {
  3134  		return syserr.ErrNoDevice
  3135  	}
  3136  
  3137  	// SIOCGIFNAME uses ifr.ifr_ifindex rather than ifr.ifr_name to
  3138  	// identify a device.
  3139  	if arg == linux.SIOCGIFNAME {
  3140  		// Gets the name of the interface given the interface index
  3141  		// stored in ifr_ifindex.
  3142  		index = int32(hostarch.ByteOrder.Uint32(ifr.Data[:4]))
  3143  		if iface, ok := stk.Interfaces()[index]; ok {
  3144  			ifr.SetName(iface.Name)
  3145  			return nil
  3146  		}
  3147  		return syserr.ErrNoDevice
  3148  	}
  3149  
  3150  	// Find the relevant device.
  3151  	for index, iface = range stk.Interfaces() {
  3152  		if iface.Name == ifr.Name() {
  3153  			found = true
  3154  			break
  3155  		}
  3156  	}
  3157  	if !found {
  3158  		return syserr.ErrNoDevice
  3159  	}
  3160  
  3161  	switch arg {
  3162  	case linux.SIOCGIFINDEX:
  3163  		// Copy out the index to the data.
  3164  		hostarch.ByteOrder.PutUint32(ifr.Data[:], uint32(index))
  3165  
  3166  	case linux.SIOCGIFHWADDR:
  3167  		// Copy the hardware address out.
  3168  		//
  3169  		// Refer: https://linux.die.net/man/7/netdevice
  3170  		// SIOCGIFHWADDR, SIOCSIFHWADDR
  3171  		//
  3172  		// Get or set the hardware address of a device using
  3173  		// ifr_hwaddr. The hardware address is specified in a struct
  3174  		// sockaddr. sa_family contains the ARPHRD_* device type,
  3175  		// sa_data the L2 hardware address starting from byte 0. Setting
  3176  		// the hardware address is a privileged operation.
  3177  		hostarch.ByteOrder.PutUint16(ifr.Data[:], iface.DeviceType)
  3178  		n := copy(ifr.Data[2:], iface.Addr)
  3179  		for i := 2 + n; i < len(ifr.Data); i++ {
  3180  			ifr.Data[i] = 0 // Clear padding.
  3181  		}
  3182  
  3183  	case linux.SIOCGIFFLAGS:
  3184  		f, err := interfaceStatusFlags(stk, iface.Name)
  3185  		if err != nil {
  3186  			return err
  3187  		}
  3188  		// Drop the flags that don't fit in the size that we need to return. This
  3189  		// matches Linux behavior.
  3190  		hostarch.ByteOrder.PutUint16(ifr.Data[:2], uint16(f))
  3191  
  3192  	case linux.SIOCGIFADDR:
  3193  		// Copy the IPv4 address out.
  3194  		for _, addr := range stk.InterfaceAddrs()[index] {
  3195  			// This ioctl is only compatible with AF_INET addresses.
  3196  			if addr.Family != linux.AF_INET {
  3197  				continue
  3198  			}
  3199  			copy(ifr.Data[4:8], addr.Addr)
  3200  			break
  3201  		}
  3202  
  3203  	case linux.SIOCGIFMETRIC:
  3204  		// Gets the metric of the device. As per netdevice(7), this
  3205  		// always just sets ifr_metric to 0.
  3206  		hostarch.ByteOrder.PutUint32(ifr.Data[:4], 0)
  3207  
  3208  	case linux.SIOCGIFMTU:
  3209  		// Gets the MTU of the device.
  3210  		hostarch.ByteOrder.PutUint32(ifr.Data[:4], iface.MTU)
  3211  
  3212  	case linux.SIOCGIFMAP:
  3213  		// Gets the hardware parameters of the device.
  3214  		// TODO(gvisor.dev/issue/505): Implement.
  3215  
  3216  	case linux.SIOCGIFTXQLEN:
  3217  		// Gets the transmit queue length of the device.
  3218  		// TODO(gvisor.dev/issue/505): Implement.
  3219  
  3220  	case linux.SIOCGIFDSTADDR:
  3221  		// Gets the destination address of a point-to-point device.
  3222  		// TODO(gvisor.dev/issue/505): Implement.
  3223  
  3224  	case linux.SIOCGIFBRDADDR:
  3225  		// Gets the broadcast address of a device.
  3226  		// TODO(gvisor.dev/issue/505): Implement.
  3227  
  3228  	case linux.SIOCGIFNETMASK:
  3229  		// Gets the network mask of a device.
  3230  		for _, addr := range stk.InterfaceAddrs()[index] {
  3231  			// This ioctl is only compatible with AF_INET addresses.
  3232  			if addr.Family != linux.AF_INET {
  3233  				continue
  3234  			}
  3235  			// Populate ifr.ifr_netmask (type sockaddr).
  3236  			hostarch.ByteOrder.PutUint16(ifr.Data[0:], uint16(linux.AF_INET))
  3237  			hostarch.ByteOrder.PutUint16(ifr.Data[2:], 0)
  3238  			var mask uint32 = 0xffffffff << (32 - addr.PrefixLen)
  3239  			// Netmask is expected to be returned as a big endian
  3240  			// value.
  3241  			binary.BigEndian.PutUint32(ifr.Data[4:8], mask)
  3242  			break
  3243  		}
  3244  
  3245  	case linux.SIOCETHTOOL:
  3246  		// Stubbed out for now, Ideally we should implement the required
  3247  		// sub-commands for ETHTOOL
  3248  		//
  3249  		// See:
  3250  		// https://github.com/torvalds/linux/blob/aa0c9086b40c17a7ad94425b3b70dd1fdd7497bf/net/core/dev_ioctl.c
  3251  		return syserr.ErrEndpointOperation
  3252  
  3253  	default:
  3254  		// Not a valid call.
  3255  		return syserr.ErrInvalidArgument
  3256  	}
  3257  
  3258  	return nil
  3259  }
  3260  
  3261  // ifconfIoctl populates a struct ifconf for the SIOCGIFCONF ioctl.
  3262  func ifconfIoctl(ctx context.Context, t *kernel.Task, _ usermem.IO, ifc *linux.IFConf) error {
  3263  	// If Ptr is NULL, return the necessary buffer size via Len.
  3264  	// Otherwise, write up to Len bytes starting at Ptr containing ifreq
  3265  	// structs.
  3266  	stk := inet.StackFromContext(ctx)
  3267  	if stk == nil {
  3268  		return syserr.ErrNoDevice.ToError()
  3269  	}
  3270  
  3271  	if ifc.Ptr == 0 {
  3272  		ifc.Len = int32(len(stk.Interfaces())) * int32(linux.SizeOfIFReq)
  3273  		return nil
  3274  	}
  3275  
  3276  	max := ifc.Len
  3277  	ifc.Len = 0
  3278  	for key, ifaceAddrs := range stk.InterfaceAddrs() {
  3279  		iface := stk.Interfaces()[key]
  3280  		for _, ifaceAddr := range ifaceAddrs {
  3281  			// Don't write past the end of the buffer.
  3282  			if ifc.Len+int32(linux.SizeOfIFReq) > max {
  3283  				break
  3284  			}
  3285  			if ifaceAddr.Family != linux.AF_INET {
  3286  				continue
  3287  			}
  3288  
  3289  			// Populate ifr.ifr_addr.
  3290  			ifr := linux.IFReq{}
  3291  			ifr.SetName(iface.Name)
  3292  			hostarch.ByteOrder.PutUint16(ifr.Data[0:2], uint16(ifaceAddr.Family))
  3293  			hostarch.ByteOrder.PutUint16(ifr.Data[2:4], 0)
  3294  			copy(ifr.Data[4:8], ifaceAddr.Addr[:4])
  3295  
  3296  			// Copy the ifr to userspace.
  3297  			dst := uintptr(ifc.Ptr) + uintptr(ifc.Len)
  3298  			ifc.Len += int32(linux.SizeOfIFReq)
  3299  			if _, err := ifr.CopyOut(t, hostarch.Addr(dst)); err != nil {
  3300  				return err
  3301  			}
  3302  		}
  3303  	}
  3304  	return nil
  3305  }
  3306  
  3307  // interfaceStatusFlags returns status flags for an interface in the stack.
  3308  // Flag values and meanings are described in greater detail in netdevice(7) in
  3309  // the SIOCGIFFLAGS section.
  3310  func interfaceStatusFlags(stack inet.Stack, name string) (uint32, *syserr.Error) {
  3311  	// We should only ever be passed a netstack.Stack.
  3312  	epstack, ok := stack.(*Stack)
  3313  	if !ok {
  3314  		return 0, errStackType
  3315  	}
  3316  
  3317  	// Find the NIC corresponding to this interface.
  3318  	for _, info := range epstack.Stack.NICInfo() {
  3319  		if info.Name == name {
  3320  			return nicStateFlagsToLinux(info.Flags), nil
  3321  		}
  3322  	}
  3323  	return 0, syserr.ErrNoDevice
  3324  }
  3325  
  3326  func nicStateFlagsToLinux(f stack.NICStateFlags) uint32 {
  3327  	var rv uint32
  3328  	if f.Up {
  3329  		rv |= linux.IFF_UP | linux.IFF_LOWER_UP
  3330  	}
  3331  	if f.Running {
  3332  		rv |= linux.IFF_RUNNING
  3333  	}
  3334  	if f.Promiscuous {
  3335  		rv |= linux.IFF_PROMISC
  3336  	}
  3337  	if f.Loopback {
  3338  		rv |= linux.IFF_LOOPBACK
  3339  	}
  3340  	return rv
  3341  }
  3342  
  3343  // State implements socket.Socket.State. State translates the internal state
  3344  // returned by netstack to values defined by Linux.
  3345  func (s *sock) State() uint32 {
  3346  	if s.family != linux.AF_INET && s.family != linux.AF_INET6 {
  3347  		// States not implemented for this socket's family.
  3348  		return 0
  3349  	}
  3350  
  3351  	switch {
  3352  	case socket.IsTCP(s):
  3353  		// TCP socket.
  3354  		switch tcp.EndpointState(s.Endpoint.State()) {
  3355  		case tcp.StateEstablished:
  3356  			return linux.TCP_ESTABLISHED
  3357  		case tcp.StateSynSent:
  3358  			return linux.TCP_SYN_SENT
  3359  		case tcp.StateSynRecv:
  3360  			return linux.TCP_SYN_RECV
  3361  		case tcp.StateFinWait1:
  3362  			return linux.TCP_FIN_WAIT1
  3363  		case tcp.StateFinWait2:
  3364  			return linux.TCP_FIN_WAIT2
  3365  		case tcp.StateTimeWait:
  3366  			return linux.TCP_TIME_WAIT
  3367  		case tcp.StateClose, tcp.StateInitial, tcp.StateBound, tcp.StateConnecting, tcp.StateError:
  3368  			return linux.TCP_CLOSE
  3369  		case tcp.StateCloseWait:
  3370  			return linux.TCP_CLOSE_WAIT
  3371  		case tcp.StateLastAck:
  3372  			return linux.TCP_LAST_ACK
  3373  		case tcp.StateListen:
  3374  			return linux.TCP_LISTEN
  3375  		case tcp.StateClosing:
  3376  			return linux.TCP_CLOSING
  3377  		default:
  3378  			// Internal or unknown state.
  3379  			return 0
  3380  		}
  3381  	case socket.IsUDP(s):
  3382  		// UDP socket.
  3383  		switch transport.DatagramEndpointState(s.Endpoint.State()) {
  3384  		case transport.DatagramEndpointStateInitial, transport.DatagramEndpointStateBound, transport.DatagramEndpointStateClosed:
  3385  			return linux.TCP_CLOSE
  3386  		case transport.DatagramEndpointStateConnected:
  3387  			return linux.TCP_ESTABLISHED
  3388  		default:
  3389  			return 0
  3390  		}
  3391  	case socket.IsICMP(s):
  3392  		// TODO(b/112063468): Export states for ICMP sockets.
  3393  	case socket.IsRaw(s):
  3394  		// TODO(b/112063468): Export states for raw sockets.
  3395  	default:
  3396  		// Unknown transport protocol, how did we make this socket?
  3397  		log.Warningf("Unknown transport protocol for an existing socket: family=%v, type=%v, protocol=%v, internal type %v", s.family, s.skType, s.protocol, reflect.TypeOf(s.Endpoint).Elem())
  3398  		return 0
  3399  	}
  3400  
  3401  	return 0
  3402  }
  3403  
  3404  // Type implements socket.Socket.Type.
  3405  func (s *sock) Type() (family int, skType linux.SockType, protocol int) {
  3406  	return s.family, s.skType, s.protocol
  3407  }
  3408  
  3409  // EventRegister implements waiter.Waitable.
  3410  func (s *sock) EventRegister(e *waiter.Entry) error {
  3411  	s.Queue.EventRegister(e)
  3412  	return nil
  3413  }
  3414  
  3415  // EventUnregister implements waiter.Waitable.EventUnregister.
  3416  func (s *sock) EventUnregister(e *waiter.Entry) {
  3417  	s.Queue.EventUnregister(e)
  3418  }