gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/socket/netstack/netstack.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package netstack provides an implementation of the socket.Socket interface 16 // that is backed by a tcpip.Endpoint. 17 // 18 // It does not depend on any particular endpoint implementation, and thus can 19 // be used to expose certain endpoints to the sentry while leaving others out, 20 // for example, TCP endpoints and Unix-domain endpoints. 21 // 22 // Lock ordering: netstack => mm: ioSequenceReadWriter copies user memory inside 23 // tcpip.Endpoint.Write(). Netstack is allowed to (and does) hold locks during 24 // this operation. 
25 package netstack 26 27 import ( 28 "bytes" 29 "encoding/binary" 30 "fmt" 31 "io" 32 "io/ioutil" 33 "math" 34 "reflect" 35 "time" 36 37 "golang.org/x/sys/unix" 38 "google.golang.org/protobuf/proto" 39 "gvisor.dev/gvisor/pkg/abi/linux" 40 "gvisor.dev/gvisor/pkg/abi/linux/errno" 41 "gvisor.dev/gvisor/pkg/context" 42 "gvisor.dev/gvisor/pkg/errors/linuxerr" 43 "gvisor.dev/gvisor/pkg/eventchannel" 44 "gvisor.dev/gvisor/pkg/hostarch" 45 "gvisor.dev/gvisor/pkg/log" 46 "gvisor.dev/gvisor/pkg/marshal" 47 "gvisor.dev/gvisor/pkg/marshal/primitive" 48 "gvisor.dev/gvisor/pkg/metric" 49 "gvisor.dev/gvisor/pkg/sentry/arch" 50 "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" 51 "gvisor.dev/gvisor/pkg/sentry/inet" 52 "gvisor.dev/gvisor/pkg/sentry/kernel" 53 "gvisor.dev/gvisor/pkg/sentry/kernel/auth" 54 ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" 55 "gvisor.dev/gvisor/pkg/sentry/socket" 56 "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" 57 epb "gvisor.dev/gvisor/pkg/sentry/socket/netstack/events_go_proto" 58 "gvisor.dev/gvisor/pkg/sentry/vfs" 59 "gvisor.dev/gvisor/pkg/sync" 60 "gvisor.dev/gvisor/pkg/syserr" 61 "gvisor.dev/gvisor/pkg/tcpip" 62 "gvisor.dev/gvisor/pkg/tcpip/header" 63 "gvisor.dev/gvisor/pkg/tcpip/stack" 64 "gvisor.dev/gvisor/pkg/tcpip/transport" 65 "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" 66 "gvisor.dev/gvisor/pkg/usermem" 67 "gvisor.dev/gvisor/pkg/waiter" 68 ) 69 70 const bitsPerUint32 = 32 71 72 // statCounterValue returns a function usable as callback function when defining a gVisor Sentry 73 // metric that contains the value counted by the StatCounter. 74 // This avoids a dependency loop in the tcpip package. 
75 func statCounterValue(cm *tcpip.StatCounter) func(...*metric.FieldValue) uint64 { 76 return func(...*metric.FieldValue) uint64 { 77 return cm.Value() 78 } 79 } 80 81 func mustCreateMetric(name, description string) *tcpip.StatCounter { 82 var cm tcpip.StatCounter 83 metric.MustRegisterCustomUint64Metric(name, true /* cumulative */, false /* sync */, description, statCounterValue(&cm)) 84 return &cm 85 } 86 87 func mustCreateGauge(name, description string) *tcpip.StatCounter { 88 var cm tcpip.StatCounter 89 metric.MustRegisterCustomUint64Metric(name, false /* cumulative */, false /* sync */, description, statCounterValue(&cm)) 90 return &cm 91 } 92 93 // Metrics contains metrics exported by netstack. 94 var Metrics = tcpip.Stats{ 95 DroppedPackets: mustCreateMetric("/netstack/dropped_packets", "Number of packets dropped at the transport layer."), 96 NICs: tcpip.NICStats{ 97 MalformedL4RcvdPackets: mustCreateMetric("/netstack/nic/malformed_l4_received_packets", "Number of packets received that failed L4 header parsing."), 98 Tx: tcpip.NICPacketStats{ 99 Packets: mustCreateMetric("/netstack/nic/tx/packets", "Number of packets transmitted."), 100 Bytes: mustCreateMetric("/netstack/nic/tx/bytes", "Number of bytes transmitted."), 101 }, 102 TxPacketsDroppedNoBufferSpace: mustCreateMetric("/netstack/nic/tx_packets_dropped_no_buffer_space", "Number of TX packets dropped as a result of no buffer space errors."), 103 Rx: tcpip.NICPacketStats{ 104 Packets: mustCreateMetric("/netstack/nic/rx/packets", "Number of packets received."), 105 Bytes: mustCreateMetric("/netstack/nic/rx/bytes", "Number of bytes received."), 106 }, 107 DisabledRx: tcpip.NICPacketStats{ 108 Packets: mustCreateMetric("/netstack/nic/disabled_rx/packets", "Number of packets received on disabled NICs."), 109 Bytes: mustCreateMetric("/netstack/nic/disabled_rx/bytes", "Number of bytes received on disabled NICs."), 110 }, 111 Neighbor: tcpip.NICNeighborStats{ 112 UnreachableEntryLookups: 
mustCreateMetric("/netstack/nic/neighbor/unreachable_entry_loopups", "Number of lookups performed on a neighbor entry in Unreachable state."), 113 DroppedConfirmationForNoninitiatedNeighbor: mustCreateMetric("/netstack/nic/neighbor/dropped_confirmation_for_noninitiated_neighbor", "Number of advertisements received that don't match an entry in the neighbor cache."), 114 DroppedInvalidLinkAddressConfirmations: mustCreateMetric("/netstack/nic/neighbor/dropped_invalid_link_address_confirmations", "Number of advertisements dropped because they have empty source link-layer addresses"), 115 }, 116 }, 117 ICMP: tcpip.ICMPStats{ 118 V4: tcpip.ICMPv4Stats{ 119 PacketsSent: tcpip.ICMPv4SentPacketStats{ 120 ICMPv4PacketStats: tcpip.ICMPv4PacketStats{ 121 EchoRequest: mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_request", "Number of ICMPv4 echo request packets sent."), 122 EchoReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_reply", "Number of ICMPv4 echo reply packets sent."), 123 DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_sent/dst_unreachable", "Number of ICMPv4 destination unreachable packets sent."), 124 SrcQuench: mustCreateMetric("/netstack/icmp/v4/packets_sent/src_quench", "Number of ICMPv4 source quench packets sent."), 125 Redirect: mustCreateMetric("/netstack/icmp/v4/packets_sent/redirect", "Number of ICMPv4 redirect packets sent."), 126 TimeExceeded: mustCreateMetric("/netstack/icmp/v4/packets_sent/time_exceeded", "Number of ICMPv4 time exceeded packets sent."), 127 ParamProblem: mustCreateMetric("/netstack/icmp/v4/packets_sent/param_problem", "Number of ICMPv4 parameter problem packets sent."), 128 Timestamp: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp", "Number of ICMPv4 timestamp packets sent."), 129 TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp_reply", "Number of ICMPv4 timestamp reply packets sent."), 130 InfoRequest: mustCreateMetric("/netstack/icmp/v4/packets_sent/info_request", 
"Number of ICMPv4 information request packets sent."), 131 InfoReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/info_reply", "Number of ICMPv4 information reply packets sent."), 132 }, 133 Dropped: mustCreateMetric("/netstack/icmp/v4/packets_sent/dropped", "Number of ICMPv4 packets dropped due to link layer errors."), 134 RateLimited: mustCreateMetric("/netstack/icmp/v4/packets_sent/rate_limited", "Number of ICMPv4 packets dropped due to rate limit being exceeded."), 135 }, 136 PacketsReceived: tcpip.ICMPv4ReceivedPacketStats{ 137 ICMPv4PacketStats: tcpip.ICMPv4PacketStats{ 138 EchoRequest: mustCreateMetric("/netstack/icmp/v4/packets_received/echo_request", "Number of ICMPv4 echo request packets received."), 139 EchoReply: mustCreateMetric("/netstack/icmp/v4/packets_received/echo_reply", "Number of ICMPv4 echo reply packets received."), 140 DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_received/dst_unreachable", "Number of ICMPv4 destination unreachable packets received."), 141 SrcQuench: mustCreateMetric("/netstack/icmp/v4/packets_received/src_quench", "Number of ICMPv4 source quench packets received."), 142 Redirect: mustCreateMetric("/netstack/icmp/v4/packets_received/redirect", "Number of ICMPv4 redirect packets received."), 143 TimeExceeded: mustCreateMetric("/netstack/icmp/v4/packets_received/time_exceeded", "Number of ICMPv4 time exceeded packets received."), 144 ParamProblem: mustCreateMetric("/netstack/icmp/v4/packets_received/param_problem", "Number of ICMPv4 parameter problem packets received."), 145 Timestamp: mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp", "Number of ICMPv4 timestamp packets received."), 146 TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp_reply", "Number of ICMPv4 timestamp reply packets received."), 147 InfoRequest: mustCreateMetric("/netstack/icmp/v4/packets_received/info_request", "Number of ICMPv4 information request packets received."), 148 InfoReply: 
mustCreateMetric("/netstack/icmp/v4/packets_received/info_reply", "Number of ICMPv4 information reply packets received."), 149 }, 150 Invalid: mustCreateMetric("/netstack/icmp/v4/packets_received/invalid", "Number of ICMPv4 packets received that the transport layer could not parse."), 151 }, 152 }, 153 V6: tcpip.ICMPv6Stats{ 154 PacketsSent: tcpip.ICMPv6SentPacketStats{ 155 ICMPv6PacketStats: tcpip.ICMPv6PacketStats{ 156 EchoRequest: mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_request", "Number of ICMPv6 echo request packets sent."), 157 EchoReply: mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_reply", "Number of ICMPv6 echo reply packets sent."), 158 DstUnreachable: mustCreateMetric("/netstack/icmp/v6/packets_sent/dst_unreachable", "Number of ICMPv6 destination unreachable packets sent."), 159 PacketTooBig: mustCreateMetric("/netstack/icmp/v6/packets_sent/packet_too_big", "Number of ICMPv6 packet too big packets sent."), 160 TimeExceeded: mustCreateMetric("/netstack/icmp/v6/packets_sent/time_exceeded", "Number of ICMPv6 time exceeded packets sent."), 161 ParamProblem: mustCreateMetric("/netstack/icmp/v6/packets_sent/param_problem", "Number of ICMPv6 parameter problem packets sent."), 162 RouterSolicit: mustCreateMetric("/netstack/icmp/v6/packets_sent/router_solicit", "Number of ICMPv6 router solicit packets sent."), 163 RouterAdvert: mustCreateMetric("/netstack/icmp/v6/packets_sent/router_advert", "Number of ICMPv6 router advert packets sent."), 164 NeighborSolicit: mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_solicit", "Number of ICMPv6 neighbor solicit packets sent."), 165 NeighborAdvert: mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_advert", "Number of ICMPv6 neighbor advert packets sent."), 166 RedirectMsg: mustCreateMetric("/netstack/icmp/v6/packets_sent/redirect_msg", "Number of ICMPv6 redirect message packets sent."), 167 MulticastListenerQuery: 
mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_query", "Number of ICMPv6 multicast listener query packets sent."), 168 MulticastListenerReport: mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_report", "Number of ICMPv6 multicast listener report packets sent."), 169 MulticastListenerDone: mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_done", "Number of ICMPv6 multicast listener done packets sent."), 170 }, 171 Dropped: mustCreateMetric("/netstack/icmp/v6/packets_sent/dropped", "Number of ICMPv6 packets dropped due to link layer errors."), 172 RateLimited: mustCreateMetric("/netstack/icmp/v6/packets_sent/rate_limited", "Number of ICMPv6 packets dropped due to rate limit being exceeded."), 173 }, 174 PacketsReceived: tcpip.ICMPv6ReceivedPacketStats{ 175 ICMPv6PacketStats: tcpip.ICMPv6PacketStats{ 176 EchoRequest: mustCreateMetric("/netstack/icmp/v6/packets_received/echo_request", "Number of ICMPv6 echo request packets received."), 177 EchoReply: mustCreateMetric("/netstack/icmp/v6/packets_received/echo_reply", "Number of ICMPv6 echo reply packets received."), 178 DstUnreachable: mustCreateMetric("/netstack/icmp/v6/packets_received/dst_unreachable", "Number of ICMPv6 destination unreachable packets received."), 179 PacketTooBig: mustCreateMetric("/netstack/icmp/v6/packets_received/packet_too_big", "Number of ICMPv6 packet too big packets received."), 180 TimeExceeded: mustCreateMetric("/netstack/icmp/v6/packets_received/time_exceeded", "Number of ICMPv6 time exceeded packets received."), 181 ParamProblem: mustCreateMetric("/netstack/icmp/v6/packets_received/param_problem", "Number of ICMPv6 parameter problem packets received."), 182 RouterSolicit: mustCreateMetric("/netstack/icmp/v6/packets_received/router_solicit", "Number of ICMPv6 router solicit packets received."), 183 RouterAdvert: mustCreateMetric("/netstack/icmp/v6/packets_received/router_advert", "Number of ICMPv6 router advert packets received."), 
184 NeighborSolicit: mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_solicit", "Number of ICMPv6 neighbor solicit packets received."), 185 NeighborAdvert: mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_advert", "Number of ICMPv6 neighbor advert packets received."), 186 RedirectMsg: mustCreateMetric("/netstack/icmp/v6/packets_received/redirect_msg", "Number of ICMPv6 redirect message packets received."), 187 MulticastListenerQuery: mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_query", "Number of ICMPv6 multicast listener query packets received."), 188 MulticastListenerReport: mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_report", "Number of ICMPv6 multicast listener report packets sent."), 189 MulticastListenerDone: mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_done", "Number of ICMPv6 multicast listener done packets sent."), 190 }, 191 Unrecognized: mustCreateMetric("/netstack/icmp/v6/packets_received/unrecognized", "Number of ICMPv6 packets received that the transport layer does not know how to parse."), 192 Invalid: mustCreateMetric("/netstack/icmp/v6/packets_received/invalid", "Number of ICMPv6 packets received that the transport layer could not parse."), 193 RouterOnlyPacketsDroppedByHost: mustCreateMetric("/netstack/icmp/v6/packets_received/router_only_packets_dropped_by_host", "Number of ICMPv6 packets dropped due to being router-specific packets."), 194 }, 195 }, 196 }, 197 IGMP: tcpip.IGMPStats{ 198 PacketsSent: tcpip.IGMPSentPacketStats{ 199 IGMPPacketStats: tcpip.IGMPPacketStats{ 200 MembershipQuery: mustCreateMetric("/netstack/igmp/packets_sent/membership_query", "Number of IGMP Membership Query messages sent."), 201 V1MembershipReport: mustCreateMetric("/netstack/igmp/packets_sent/v1_membership_report", "Number of IGMPv1 Membership Report messages sent."), 202 V2MembershipReport: mustCreateMetric("/netstack/igmp/packets_sent/v2_membership_report", 
"Number of IGMPv2 Membership Report messages sent."), 203 LeaveGroup: mustCreateMetric("/netstack/igmp/packets_sent/leave_group", "Number of IGMP Leave Group messages sent."), 204 }, 205 Dropped: mustCreateMetric("/netstack/igmp/packets_sent/dropped", "Number of IGMP packets dropped due to link layer errors."), 206 }, 207 PacketsReceived: tcpip.IGMPReceivedPacketStats{ 208 IGMPPacketStats: tcpip.IGMPPacketStats{ 209 MembershipQuery: mustCreateMetric("/netstack/igmp/packets_received/membership_query", "Number of IGMP Membership Query messages received."), 210 V1MembershipReport: mustCreateMetric("/netstack/igmp/packets_received/v1_membership_report", "Number of IGMPv1 Membership Report messages received."), 211 V2MembershipReport: mustCreateMetric("/netstack/igmp/packets_received/v2_membership_report", "Number of IGMPv2 Membership Report messages received."), 212 LeaveGroup: mustCreateMetric("/netstack/igmp/packets_received/leave_group", "Number of IGMP Leave Group messages received."), 213 }, 214 Invalid: mustCreateMetric("/netstack/igmp/packets_received/invalid", "Number of IGMP packets received that could not be parsed."), 215 ChecksumErrors: mustCreateMetric("/netstack/igmp/packets_received/checksum_errors", "Number of received IGMP packets with bad checksums."), 216 Unrecognized: mustCreateMetric("/netstack/igmp/packets_received/unrecognized", "Number of unrecognized IGMP packets received."), 217 }, 218 }, 219 IP: tcpip.IPStats{ 220 PacketsReceived: mustCreateMetric("/netstack/ip/packets_received", "Number of IP packets received from the link layer in nic.DeliverNetworkPacket."), 221 DisabledPacketsReceived: mustCreateMetric("/netstack/ip/disabled_packets_received", "Number of IP packets received from the link layer when the IP layer is disabled."), 222 InvalidDestinationAddressesReceived: mustCreateMetric("/netstack/ip/invalid_addresses_received", "Number of IP packets received with an unknown or invalid destination address."), 223 
InvalidSourceAddressesReceived: mustCreateMetric("/netstack/ip/invalid_source_addresses_received", "Number of IP packets received with an unknown or invalid source address."), 224 PacketsDelivered: mustCreateMetric("/netstack/ip/packets_delivered", "Number of incoming IP packets that are successfully delivered to the transport layer via HandlePacket."), 225 PacketsSent: mustCreateMetric("/netstack/ip/packets_sent", "Number of IP packets sent via WritePacket."), 226 OutgoingPacketErrors: mustCreateMetric("/netstack/ip/outgoing_packet_errors", "Number of IP packets which failed to write to a link-layer endpoint."), 227 MalformedPacketsReceived: mustCreateMetric("/netstack/ip/malformed_packets_received", "Number of IP packets which failed IP header validation checks."), 228 MalformedFragmentsReceived: mustCreateMetric("/netstack/ip/malformed_fragments_received", "Number of IP fragments which failed IP fragment validation checks."), 229 IPTablesPreroutingDropped: mustCreateMetric("/netstack/ip/iptables/prerouting_dropped", "Number of IP packets dropped in the Prerouting chain."), 230 IPTablesInputDropped: mustCreateMetric("/netstack/ip/iptables/input_dropped", "Number of IP packets dropped in the Input chain."), 231 IPTablesOutputDropped: mustCreateMetric("/netstack/ip/iptables/output_dropped", "Number of IP packets dropped in the Output chain."), 232 OptionTimestampReceived: mustCreateMetric("/netstack/ip/options/timestamp_received", "Number of timestamp options found in received IP packets."), 233 OptionRecordRouteReceived: mustCreateMetric("/netstack/ip/options/record_route_received", "Number of record route options found in received IP packets."), 234 OptionRouterAlertReceived: mustCreateMetric("/netstack/ip/options/router_alert_received", "Number of router alert options found in received IP packets."), 235 OptionUnknownReceived: mustCreateMetric("/netstack/ip/options/unknown_received", "Number of unknown options found in received IP packets."), 236 Forwarding: 
tcpip.IPForwardingStats{ 237 Unrouteable: mustCreateMetric("/netstack/ip/forwarding/unrouteable", "Number of IP packets received which couldn't be routed and thus were not forwarded."), 238 ExhaustedTTL: mustCreateMetric("/netstack/ip/forwarding/exhausted_ttl", "Number of IP packets received which could not be forwarded due to an exhausted TTL."), 239 LinkLocalSource: mustCreateMetric("/netstack/ip/forwarding/link_local_source_address", "Number of IP packets received which could not be forwarded due to a link-local source address."), 240 LinkLocalDestination: mustCreateMetric("/netstack/ip/forwarding/link_local_destination_address", "Number of IP packets received which could not be forwarded due to a link-local destination address."), 241 ExtensionHeaderProblem: mustCreateMetric("/netstack/ip/forwarding/extension_header_problem", "Number of IP packets received which could not be forwarded due to a problem processing their IPv6 extension headers."), 242 PacketTooBig: mustCreateMetric("/netstack/ip/forwarding/packet_too_big", "Number of IP packets received which could not be forwarded because they could not fit within the outgoing MTU."), 243 HostUnreachable: mustCreateMetric("/netstack/ip/forwarding/host_unreachable", "Number of IP packets received which could not be forwarded due to unresolvable next hop."), 244 Errors: mustCreateMetric("/netstack/ip/forwarding/errors", "Number of IP packets which couldn't be forwarded."), 245 }, 246 }, 247 ARP: tcpip.ARPStats{ 248 PacketsReceived: mustCreateMetric("/netstack/arp/packets_received", "Number of ARP packets received from the link layer."), 249 DisabledPacketsReceived: mustCreateMetric("/netstack/arp/disabled_packets_received", "Number of ARP packets received from the link layer when the ARP layer is disabled."), 250 MalformedPacketsReceived: mustCreateMetric("/netstack/arp/malformed_packets_received", "Number of ARP packets which failed ARP header validation checks."), 251 RequestsReceived: 
mustCreateMetric("/netstack/arp/requests_received", "Number of ARP requests received."), 252 RequestsReceivedUnknownTargetAddress: mustCreateMetric("/netstack/arp/requests_received_unknown_addr", "Number of ARP requests received with an unknown target address."), 253 OutgoingRequestInterfaceHasNoLocalAddressErrors: mustCreateMetric("/netstack/arp/outgoing_requests_iface_has_no_addr", "Number of failed attempts to send an ARP request with an interface that has no network address."), 254 OutgoingRequestBadLocalAddressErrors: mustCreateMetric("/netstack/arp/outgoing_requests_invalid_local_addr", "Number of failed attempts to send an ARP request with a provided local address that is invalid."), 255 OutgoingRequestsDropped: mustCreateMetric("/netstack/arp/outgoing_requests_dropped", "Number of ARP requests which failed to write to a link-layer endpoint."), 256 OutgoingRequestsSent: mustCreateMetric("/netstack/arp/outgoing_requests_sent", "Number of ARP requests sent."), 257 RepliesReceived: mustCreateMetric("/netstack/arp/replies_received", "Number of ARP replies received."), 258 OutgoingRepliesDropped: mustCreateMetric("/netstack/arp/outgoing_replies_dropped", "Number of ARP replies which failed to write to a link-layer endpoint."), 259 OutgoingRepliesSent: mustCreateMetric("/netstack/arp/outgoing_replies_sent", "Number of ARP replies sent."), 260 }, 261 TCP: tcpip.TCPStats{ 262 ActiveConnectionOpenings: mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."), 263 PassiveConnectionOpenings: mustCreateMetric("/netstack/tcp/passive_connection_openings", "Number of connections opened successfully via Listen."), 264 CurrentEstablished: mustCreateGauge("/netstack/tcp/current_established", "Number of connections in ESTABLISHED state now."), 265 CurrentConnected: mustCreateGauge("/netstack/tcp/current_open", "Number of connections that are in connected state."), 266 EstablishedResets: 
mustCreateMetric("/netstack/tcp/established_resets", "Number of times TCP connections have made a direct transition to the CLOSED state from either the ESTABLISHED state or the CLOSE-WAIT state"), 267 EstablishedClosed: mustCreateMetric("/netstack/tcp/established_closed", "Number of times established TCP connections made a transition to CLOSED state."), 268 EstablishedTimedout: mustCreateMetric("/netstack/tcp/established_timedout", "Number of times an established connection was reset because of keep-alive time out."), 269 ListenOverflowSynDrop: mustCreateMetric("/netstack/tcp/listen_overflow_syn_drop", "Number of times the listen queue overflowed and a SYN was dropped."), 270 ListenOverflowAckDrop: mustCreateMetric("/netstack/tcp/listen_overflow_ack_drop", "Number of times the listen queue overflowed and the final ACK in the handshake was dropped."), 271 ListenOverflowSynCookieSent: mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_sent", "Number of times a SYN cookie was sent."), 272 ListenOverflowSynCookieRcvd: mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_rcvd", "Number of times a SYN cookie was received."), 273 ListenOverflowInvalidSynCookieRcvd: mustCreateMetric("/netstack/tcp/listen_overflow_invalid_syn_cookie_rcvd", "Number of times an invalid SYN cookie was received."), 274 FailedConnectionAttempts: mustCreateMetric("/netstack/tcp/failed_connection_attempts", "Number of calls to Connect or Listen (active and passive openings, respectively) that end in an error."), 275 ValidSegmentsReceived: mustCreateMetric("/netstack/tcp/valid_segments_received", "Number of TCP segments received that the transport layer successfully parsed."), 276 InvalidSegmentsReceived: mustCreateMetric("/netstack/tcp/invalid_segments_received", "Number of TCP segments received that the transport layer could not parse."), 277 SegmentsSent: mustCreateMetric("/netstack/tcp/segments_sent", "Number of TCP segments sent."), 278 SegmentSendErrors: 
mustCreateMetric("/netstack/tcp/segment_send_errors", "Number of TCP segments failed to be sent."), 279 ResetsSent: mustCreateMetric("/netstack/tcp/resets_sent", "Number of TCP resets sent."), 280 ResetsReceived: mustCreateMetric("/netstack/tcp/resets_received", "Number of TCP resets received."), 281 Retransmits: mustCreateMetric("/netstack/tcp/retransmits", "Number of TCP segments retransmitted."), 282 FastRecovery: mustCreateMetric("/netstack/tcp/fast_recovery", "Number of times fast recovery was used to recover from packet loss."), 283 SACKRecovery: mustCreateMetric("/netstack/tcp/sack_recovery", "Number of times SACK recovery was used to recover from packet loss."), 284 TLPRecovery: mustCreateMetric("/netstack/tcp/tlp_recovery", "Number of times tail loss probe triggers recovery from tail loss."), 285 SlowStartRetransmits: mustCreateMetric("/netstack/tcp/slow_start_retransmits", "Number of segments retransmitted in slow start mode."), 286 FastRetransmit: mustCreateMetric("/netstack/tcp/fast_retransmit", "Number of TCP segments which were fast retransmitted."), 287 Timeouts: mustCreateMetric("/netstack/tcp/timeouts", "Number of times RTO expired."), 288 ChecksumErrors: mustCreateMetric("/netstack/tcp/checksum_errors", "Number of segments dropped due to bad checksums."), 289 FailedPortReservations: mustCreateMetric("/netstack/tcp/failed_port_reservations", "Number of time TCP failed to reserve a port."), 290 SegmentsAckedWithDSACK: mustCreateMetric("/netstack/tcp/segments_acked_with_dsack", "Number of segments for which DSACK was received."), 291 SpuriousRecovery: mustCreateMetric("/netstack/tcp/spurious_recovery", "Number of times the connection entered loss recovery spuriously."), 292 SpuriousRTORecovery: mustCreateMetric("/netstack/tcp/spurious_rto_recovery", "Number of times the connection entered RTO spuriously."), 293 ForwardMaxInFlightDrop: mustCreateMetric("/netstack/tcp/forward_max_in_flight_drop", "Number of connection requests dropped due to exceeding 
in-flight limit."), 294 }, 295 UDP: tcpip.UDPStats{ 296 PacketsReceived: mustCreateMetric("/netstack/udp/packets_received", "Number of UDP datagrams received via HandlePacket."), 297 UnknownPortErrors: mustCreateMetric("/netstack/udp/unknown_port_errors", "Number of incoming UDP datagrams dropped because they did not have a known destination port."), 298 ReceiveBufferErrors: mustCreateMetric("/netstack/udp/receive_buffer_errors", "Number of incoming UDP datagrams dropped due to the receiving buffer being in an invalid state."), 299 MalformedPacketsReceived: mustCreateMetric("/netstack/udp/malformed_packets_received", "Number of incoming UDP datagrams dropped due to the UDP header being in a malformed state."), 300 PacketsSent: mustCreateMetric("/netstack/udp/packets_sent", "Number of UDP datagrams sent."), 301 PacketSendErrors: mustCreateMetric("/netstack/udp/packet_send_errors", "Number of UDP datagrams failed to be sent."), 302 ChecksumErrors: mustCreateMetric("/netstack/udp/checksum_errors", "Number of UDP datagrams dropped due to bad checksums."), 303 }, 304 } 305 306 // DefaultTTL is linux's default TTL. All network protocols in all stacks used 307 // with this package must have this value set as their default TTL. 308 const DefaultTTL = 64 309 310 const sizeOfInt32 int = 4 311 312 var errStackType = syserr.New("expected but did not receive a netstack.Stack", errno.EINVAL) 313 314 // commonEndpoint represents the intersection of a tcpip.Endpoint and a 315 // transport.Endpoint. 316 type commonEndpoint interface { 317 // Readiness implements tcpip.Endpoint.Readiness and 318 // transport.Endpoint.Readiness. 319 Readiness(mask waiter.EventMask) waiter.EventMask 320 321 // SetSockOpt implements tcpip.Endpoint.SetSockOpt and 322 // transport.Endpoint.SetSockOpt. 323 SetSockOpt(tcpip.SettableSocketOption) tcpip.Error 324 325 // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt and 326 // transport.Endpoint.SetSockOptInt. 
327 SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error 328 329 // GetSockOpt implements tcpip.Endpoint.GetSockOpt and 330 // transport.Endpoint.GetSockOpt. 331 GetSockOpt(tcpip.GettableSocketOption) tcpip.Error 332 333 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt and 334 // transport.Endpoint.GetSockOpt. 335 GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) 336 337 // State returns a socket's lifecycle state. The returned value is 338 // protocol-specific and is primarily used for diagnostics. 339 State() uint32 340 341 // LastError implements tcpip.Endpoint.LastError and 342 // transport.Endpoint.LastError. 343 LastError() tcpip.Error 344 345 // SocketOptions implements tcpip.Endpoint.SocketOptions and 346 // transport.Endpoint.SocketOptions. 347 SocketOptions() *tcpip.SocketOptions 348 } 349 350 // sock encapsulates all the state needed to represent a network stack 351 // endpoint in the kernel context. 352 // 353 // +stateify savable 354 type sock struct { 355 vfsfd vfs.FileDescription 356 vfs.FileDescriptionDefaultImpl 357 vfs.DentryMetadataFileDescriptionImpl 358 vfs.LockFD 359 socket.SendReceiveTimeout 360 *waiter.Queue 361 362 family int 363 Endpoint tcpip.Endpoint 364 skType linux.SockType 365 protocol int 366 367 namespace *inet.Namespace 368 369 mu sync.Mutex `state:"nosave"` 370 // readWriter is an optimization to avoid allocations. 371 // +checklocks:mu 372 readWriter usermem.IOSequenceReadWriter `state:"nosave"` 373 374 // readMu protects access to the below fields. 375 readMu sync.Mutex `state:"nosave"` 376 377 // sockOptTimestamp corresponds to SO_TIMESTAMP. When true, timestamps 378 // of returned messages can be returned via control messages. When 379 // false, the same timestamp is instead stored and can be read via the 380 // SIOCGSTAMP ioctl. It is protected by readMu. See socket(7). 381 sockOptTimestamp bool 382 // timestampValid indicates whether timestamp for SIOCGSTAMP has been 383 // set. It is protected by readMu. 
384 timestampValid bool 385 // timestamp holds the timestamp to use with SIOCTSTAMP. It is only 386 // valid when timestampValid is true. It is protected by readMu. 387 timestamp time.Time `state:".(int64)"` 388 389 // TODO(b/153685824): Move this to SocketOptions. 390 // sockOptInq corresponds to TCP_INQ. 391 sockOptInq bool 392 } 393 394 var _ = socket.Socket(&sock{}) 395 396 // New creates a new endpoint socket. 397 func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*vfs.FileDescription, *syserr.Error) { 398 if skType == linux.SOCK_STREAM { 399 endpoint.SocketOptions().SetDelayOption(true) 400 } 401 402 mnt := t.Kernel().SocketMount() 403 d := sockfs.NewDentry(t, mnt) 404 defer d.DecRef(t) 405 406 namespace := t.NetworkNamespace() 407 s := &sock{ 408 Queue: queue, 409 family: family, 410 Endpoint: endpoint, 411 skType: skType, 412 protocol: protocol, 413 namespace: namespace, 414 } 415 s.LockFD.Init(&vfs.FileLocks{}) 416 vfsfd := &s.vfsfd 417 if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{ 418 DenyPRead: true, 419 DenyPWrite: true, 420 UseDentryMetadata: true, 421 }); err != nil { 422 return nil, syserr.FromError(err) 423 } 424 namespace.IncRef() 425 return vfsfd, nil 426 } 427 428 // Release implements vfs.FileDescriptionImpl.Release. 429 func (s *sock) Release(ctx context.Context) { 430 kernel.KernelFromContext(ctx).DeleteSocket(&s.vfsfd) 431 e, ch := waiter.NewChannelEntry(waiter.EventHUp | waiter.EventErr) 432 s.EventRegister(&e) 433 defer s.EventUnregister(&e) 434 435 s.Endpoint.Close() 436 437 // SO_LINGER option is valid only for TCP. For other socket types 438 // return after endpoint close. 439 if family, skType, _ := s.Type(); skType == linux.SOCK_STREAM && (family == linux.AF_INET || family == linux.AF_INET6) { 440 v := s.Endpoint.SocketOptions().GetLinger() 441 // The case for zero timeout is handled in tcp endpoint close function. 
442 // Close is blocked until either: 443 // 1. The endpoint state is not in any of the states: FIN-WAIT1, 444 // CLOSING and LAST_ACK. 445 // 2. Timeout is reached. 446 if v.Enabled && v.Timeout != 0 { 447 t := kernel.TaskFromContext(ctx) 448 start := t.Kernel().MonotonicClock().Now() 449 deadline := start.Add(v.Timeout) 450 _ = t.BlockWithDeadline(ch, true, deadline) 451 } 452 } 453 s.namespace.DecRef(ctx) 454 } 455 456 // Epollable implements FileDescriptionImpl.Epollable. 457 func (s *sock) Epollable() bool { 458 return true 459 } 460 461 // Read implements vfs.FileDescriptionImpl. 462 func (s *sock) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { 463 // All flags other than RWF_NOWAIT should be ignored. 464 // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. 465 if opts.Flags != 0 { 466 return 0, linuxerr.EOPNOTSUPP 467 } 468 469 if dst.NumBytes() == 0 { 470 return 0, nil 471 } 472 n, _, _, _, _, err := s.nonBlockingRead(ctx, dst, false, false, false) 473 if err == syserr.ErrWouldBlock { 474 return int64(n), linuxerr.ErrWouldBlock 475 } 476 if err != nil { 477 return 0, err.ToError() 478 } 479 return int64(n), nil 480 } 481 482 // Write implements vfs.FileDescriptionImpl. 483 func (s *sock) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { 484 // All flags other than RWF_NOWAIT should be ignored. 485 // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. 
486 if opts.Flags != 0 { 487 return 0, linuxerr.EOPNOTSUPP 488 } 489 490 var n int64 491 var err tcpip.Error 492 switch s.Endpoint.(type) { 493 case *tcp.Endpoint: 494 s.mu.Lock() 495 s.readWriter.Init(ctx, src) 496 n, err = s.Endpoint.Write(&s.readWriter, tcpip.WriteOptions{}) 497 s.mu.Unlock() 498 default: 499 n, err = s.Endpoint.Write(src.Reader(ctx), tcpip.WriteOptions{}) 500 } 501 if _, ok := err.(*tcpip.ErrWouldBlock); ok { 502 return 0, linuxerr.ErrWouldBlock 503 } 504 if err != nil { 505 return 0, syserr.TranslateNetstackError(err).ToError() 506 } 507 508 if n < src.NumBytes() { 509 return n, linuxerr.ErrWouldBlock 510 } 511 512 return n, nil 513 } 514 515 // Accept implements the linux syscall accept(2) for sockets backed by 516 // tcpip.Endpoint. 517 func (s *sock) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { 518 // Issue the accept request to get the new endpoint. 519 var peerAddr *tcpip.FullAddress 520 if peerRequested { 521 peerAddr = &tcpip.FullAddress{} 522 } 523 ep, wq, terr := s.Endpoint.Accept(peerAddr) 524 if terr != nil { 525 if _, ok := terr.(*tcpip.ErrWouldBlock); !ok || !blocking { 526 return 0, nil, 0, syserr.TranslateNetstackError(terr) 527 } 528 529 var err *syserr.Error 530 ep, wq, err = s.blockingAccept(t, peerAddr) 531 if err != nil { 532 return 0, nil, 0, err 533 } 534 } 535 536 ns, err := New(t, s.family, s.skType, s.protocol, wq, ep) 537 if err != nil { 538 return 0, nil, 0, err 539 } 540 defer ns.DecRef(t) 541 542 if err := ns.SetStatusFlags(t, t.Credentials(), uint32(flags&linux.SOCK_NONBLOCK)); err != nil { 543 return 0, nil, 0, syserr.FromError(err) 544 } 545 546 var addr linux.SockAddr 547 var addrLen uint32 548 if peerAddr != nil { 549 // Get address of the peer and write it to peer slice. 
550 addr, addrLen = socket.ConvertAddress(s.family, *peerAddr) 551 } 552 553 fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{ 554 CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, 555 }) 556 557 t.Kernel().RecordSocket(ns) 558 559 return fd, addr, addrLen, syserr.FromError(e) 560 } 561 562 // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by 563 // tcpip.Endpoint. 564 func (s *sock) GetSockOpt(t *kernel.Task, level, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { 565 // TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is 566 // implemented specifically for netstack.Socket rather than 567 // commonEndpoint. commonEndpoint should be extended to support socket 568 // options where the implementation is not shared, as unix sockets need 569 // their own support for SO_TIMESTAMP. 570 if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP { 571 if outLen < sizeOfInt32 { 572 return nil, syserr.ErrInvalidArgument 573 } 574 val := primitive.Int32(0) 575 s.readMu.Lock() 576 defer s.readMu.Unlock() 577 if s.sockOptTimestamp { 578 val = 1 579 } 580 return &val, nil 581 } 582 if level == linux.SOL_TCP && name == linux.TCP_INQ { 583 if outLen < sizeOfInt32 { 584 return nil, syserr.ErrInvalidArgument 585 } 586 val := primitive.Int32(0) 587 s.readMu.Lock() 588 defer s.readMu.Unlock() 589 if s.sockOptInq { 590 val = 1 591 } 592 return &val, nil 593 } 594 595 return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outPtr, outLen) 596 } 597 598 // SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by 599 // tcpip.Endpoint. 600 func (s *sock) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error { 601 // TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is 602 // implemented specifically for netstack.Socket rather than 603 // commonEndpoint. 
commonEndpoint should be extended to support socket 604 // options where the implementation is not shared, as unix sockets need 605 // their own support for SO_TIMESTAMP. 606 if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP { 607 if len(optVal) < sizeOfInt32 { 608 return syserr.ErrInvalidArgument 609 } 610 s.readMu.Lock() 611 defer s.readMu.Unlock() 612 s.sockOptTimestamp = hostarch.ByteOrder.Uint32(optVal) != 0 613 return nil 614 } 615 if level == linux.SOL_TCP && name == linux.TCP_INQ { 616 if len(optVal) < sizeOfInt32 { 617 return syserr.ErrInvalidArgument 618 } 619 s.readMu.Lock() 620 defer s.readMu.Unlock() 621 s.sockOptInq = hostarch.ByteOrder.Uint32(optVal) != 0 622 return nil 623 } 624 625 return SetSockOpt(t, s, s.Endpoint, level, name, optVal) 626 } 627 628 var sockAddrInetSize = (*linux.SockAddrInet)(nil).SizeBytes() 629 var sockAddrInet6Size = (*linux.SockAddrInet6)(nil).SizeBytes() 630 var sockAddrLinkSize = (*linux.SockAddrLink)(nil).SizeBytes() 631 632 // minSockAddrLen returns the minimum length in bytes of a socket address for 633 // the socket's family. 634 func (s *sock) minSockAddrLen() int { 635 const addressFamilySize = 2 636 637 switch s.family { 638 case linux.AF_UNIX: 639 return addressFamilySize 640 case linux.AF_INET: 641 return sockAddrInetSize 642 case linux.AF_INET6: 643 return sockAddrInet6Size 644 case linux.AF_PACKET: 645 return sockAddrLinkSize 646 case linux.AF_UNSPEC: 647 return addressFamilySize 648 default: 649 panic(fmt.Sprintf("s.family unrecognized = %d", s.family)) 650 } 651 } 652 653 func (s *sock) isPacketBased() bool { 654 return s.skType == linux.SOCK_DGRAM || s.skType == linux.SOCK_SEQPACKET || s.skType == linux.SOCK_RDM || s.skType == linux.SOCK_RAW 655 } 656 657 // Readiness returns a mask of ready events for socket s. 
658 func (s *sock) Readiness(mask waiter.EventMask) waiter.EventMask { 659 return s.Endpoint.Readiness(mask) 660 } 661 662 // checkFamily returns true iff the specified address family may be used with 663 // the socket. 664 // 665 // If exact is true, then the specified address family must be an exact match 666 // with the socket's family. 667 func (s *sock) checkFamily(family uint16, exact bool) bool { 668 if family == uint16(s.family) { 669 return true 670 } 671 if !exact && family == linux.AF_INET && s.family == linux.AF_INET6 { 672 if !s.Endpoint.SocketOptions().GetV6Only() { 673 return true 674 } 675 } 676 return false 677 } 678 679 // mapFamily maps the AF_INET ANY address to the IPv4-mapped IPv6 ANY if the 680 // receiver's family is AF_INET6. 681 // 682 // This is a hack to work around the fact that both IPv4 and IPv6 ANY are 683 // represented by the empty string. 684 // 685 // TODO(gvisor.dev/issue/1556): remove this function. 686 func (s *sock) mapFamily(addr tcpip.FullAddress, family uint16) tcpip.FullAddress { 687 if addr.Addr.BitLen() == 0 && s.family == linux.AF_INET6 && family == linux.AF_INET { 688 addr.Addr = tcpip.AddrFrom16([16]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}) 689 } 690 return addr 691 } 692 693 // Connect implements the linux syscall connect(2) for sockets backed by 694 // tpcip.Endpoint. 
695 func (s *sock) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { 696 addr, family, err := socket.AddressAndFamily(sockaddr) 697 if err != nil { 698 return err 699 } 700 701 if family == linux.AF_UNSPEC { 702 err := s.Endpoint.Disconnect() 703 if _, ok := err.(*tcpip.ErrNotSupported); ok { 704 return syserr.ErrAddressFamilyNotSupported 705 } 706 return syserr.TranslateNetstackError(err) 707 } 708 709 if !s.checkFamily(family, false /* exact */) { 710 return syserr.ErrInvalidArgument 711 } 712 addr = s.mapFamily(addr, family) 713 714 // Always return right away in the non-blocking case. 715 if !blocking { 716 return syserr.TranslateNetstackError(s.Endpoint.Connect(addr)) 717 } 718 719 // Register for notification when the endpoint becomes writable, then 720 // initiate the connection. 721 e, ch := waiter.NewChannelEntry(waiter.WritableEvents) 722 s.EventRegister(&e) 723 defer s.EventUnregister(&e) 724 725 switch err := s.Endpoint.Connect(addr); err.(type) { 726 case *tcpip.ErrConnectStarted, *tcpip.ErrAlreadyConnecting: 727 case *tcpip.ErrNoPortAvailable: 728 if (s.family == unix.AF_INET || s.family == unix.AF_INET6) && s.skType == linux.SOCK_STREAM { 729 // TCP unlike UDP returns EADDRNOTAVAIL when it can't 730 // find an available local ephemeral port. 731 return syserr.ErrAddressNotAvailable 732 } 733 return syserr.TranslateNetstackError(err) 734 default: 735 return syserr.TranslateNetstackError(err) 736 } 737 738 // It's pending, so we have to wait for a notification, and fetch the 739 // result once the wait completes. 740 if err := t.Block(ch); err != nil { 741 return syserr.FromError(err) 742 } 743 744 // Call Connect() again after blocking to find connect's result. 745 return syserr.TranslateNetstackError(s.Endpoint.Connect(addr)) 746 } 747 748 // Bind implements the linux syscall bind(2) for sockets backed by 749 // tcpip.Endpoint. 
750 func (s *sock) Bind(_ *kernel.Task, sockaddr []byte) *syserr.Error { 751 if len(sockaddr) < 2 { 752 return syserr.ErrInvalidArgument 753 } 754 755 family := hostarch.ByteOrder.Uint16(sockaddr) 756 var addr tcpip.FullAddress 757 758 // Bind for AF_PACKET requires only family, protocol and ifindex. 759 // In function AddressAndFamily, we check the address length which is 760 // not needed for AF_PACKET bind. 761 if family == linux.AF_PACKET { 762 var a linux.SockAddrLink 763 if len(sockaddr) < sockAddrLinkSize { 764 return syserr.ErrInvalidArgument 765 } 766 a.UnmarshalBytes(sockaddr) 767 768 addr = tcpip.FullAddress{ 769 NIC: tcpip.NICID(a.InterfaceIndex), 770 Addr: tcpip.AddrFrom16Slice(append( 771 a.HardwareAddr[:header.EthernetAddressSize], 772 []byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}..., 773 )), 774 Port: socket.Ntohs(a.Protocol), 775 } 776 } else { 777 if s.minSockAddrLen() > len(sockaddr) { 778 return syserr.ErrInvalidArgument 779 } 780 781 var err *syserr.Error 782 addr, family, err = socket.AddressAndFamily(sockaddr) 783 if err != nil { 784 return err 785 } 786 787 if !s.checkFamily(family, true /* exact */) { 788 return syserr.ErrAddressFamilyNotSupported 789 } 790 791 addr = s.mapFamily(addr, family) 792 } 793 794 // Issue the bind request to the endpoint. 795 err := s.Endpoint.Bind(addr) 796 if _, ok := err.(*tcpip.ErrNoPortAvailable); ok { 797 // Bind always returns EADDRINUSE irrespective of if the specified port was 798 // already bound or if an ephemeral port was requested but none were 799 // available. 800 // 801 // *tcpip.ErrNoPortAvailable is mapped to EAGAIN in syserr package because 802 // UDP connect returns EAGAIN on ephemeral port exhaustion. 803 // 804 // TCP connect returns EADDRNOTAVAIL on ephemeral port exhaustion. 805 err = &tcpip.ErrPortInUse{} 806 } 807 808 return syserr.TranslateNetstackError(err) 809 } 810 811 // Listen implements the linux syscall listen(2) for sockets backed by 812 // tcpip.Endpoint. 
813 func (s *sock) Listen(_ *kernel.Task, backlog int) *syserr.Error { 814 if err := s.Endpoint.Listen(backlog); err != nil { 815 return syserr.TranslateNetstackError(err) 816 } 817 if !socket.IsTCP(s) { 818 return nil 819 } 820 821 // Emit SentryTCPListenEvent with the bound port for tcp sockets. 822 addr, err := s.Endpoint.GetLocalAddress() 823 if err != nil { 824 panic(fmt.Sprintf("GetLocalAddress failed for tcp socket: %s", err)) 825 } 826 eventchannel.Emit(&epb.SentryTcpListenEvent{ 827 Port: proto.Int32(int32(addr.Port)), 828 }) 829 return nil 830 } 831 832 // blockingAccept implements a blocking version of accept(2), that is, if no 833 // connections are ready to be accept, it will block until one becomes ready. 834 func (s *sock) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) { 835 // Register for notifications. 836 e, ch := waiter.NewChannelEntry(waiter.ReadableEvents) 837 s.EventRegister(&e) 838 defer s.EventUnregister(&e) 839 840 // Try to accept the connection again; if it fails, then wait until we 841 // get a notification. 842 for { 843 ep, wq, err := s.Endpoint.Accept(peerAddr) 844 if _, ok := err.(*tcpip.ErrWouldBlock); !ok { 845 return ep, wq, syserr.TranslateNetstackError(err) 846 } 847 848 if err := t.Block(ch); err != nil { 849 return nil, nil, syserr.FromError(err) 850 } 851 } 852 } 853 854 // ConvertShutdown converts Linux shutdown flags into tcpip shutdown flags. 855 func ConvertShutdown(how int) (tcpip.ShutdownFlags, *syserr.Error) { 856 var f tcpip.ShutdownFlags 857 switch how { 858 case linux.SHUT_RD: 859 f = tcpip.ShutdownRead 860 case linux.SHUT_WR: 861 f = tcpip.ShutdownWrite 862 case linux.SHUT_RDWR: 863 f = tcpip.ShutdownRead | tcpip.ShutdownWrite 864 default: 865 return 0, syserr.ErrInvalidArgument 866 } 867 return f, nil 868 } 869 870 // Shutdown implements the linux syscall shutdown(2) for sockets backed by 871 // tcpip.Endpoint. 
872 func (s *sock) Shutdown(_ *kernel.Task, how int) *syserr.Error { 873 f, err := ConvertShutdown(how) 874 if err != nil { 875 return err 876 } 877 878 // Issue shutdown request. 879 return syserr.TranslateNetstackError(s.Endpoint.Shutdown(f)) 880 } 881 882 // GetSockOpt can be used to implement the linux syscall getsockopt(2) for 883 // sockets backed by a commonEndpoint. 884 func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType linux.SockType, level, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { 885 switch level { 886 case linux.SOL_SOCKET: 887 return getSockOptSocket(t, s, ep, family, skType, name, outLen) 888 889 case linux.SOL_TCP: 890 return getSockOptTCP(t, s, ep, name, outLen) 891 892 case linux.SOL_IPV6: 893 return getSockOptIPv6(t, s, ep, name, outPtr, outLen) 894 895 case linux.SOL_IP: 896 return getSockOptIP(t, s, ep, name, outPtr, outLen, family) 897 898 case linux.SOL_ICMPV6: 899 return getSockOptICMPv6(t, s, ep, name, outLen) 900 901 case linux.SOL_UDP, 902 linux.SOL_RAW, 903 linux.SOL_PACKET: 904 // Not supported. 905 } 906 907 return nil, syserr.ErrProtocolNotAvailable 908 } 909 910 func boolToInt32(v bool) int32 { 911 if v { 912 return 1 913 } 914 return 0 915 } 916 917 // getSockOptSocket implements GetSockOpt when level is SOL_SOCKET. 918 func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, _ linux.SockType, name, outLen int) (marshal.Marshallable, *syserr.Error) { 919 // TODO(b/124056281): Stop rejecting short optLen values in getsockopt. 920 switch name { 921 case linux.SO_ERROR: 922 if outLen < sizeOfInt32 { 923 return nil, syserr.ErrInvalidArgument 924 } 925 926 // Get the last error and convert it. 
927 err := ep.SocketOptions().GetLastError() 928 if err == nil { 929 optP := primitive.Int32(0) 930 return &optP, nil 931 } 932 933 optP := primitive.Int32(syserr.TranslateNetstackError(err).ToLinux()) 934 return &optP, nil 935 936 case linux.SO_PEERCRED: 937 if family != linux.AF_UNIX || outLen < unix.SizeofUcred { 938 return nil, syserr.ErrInvalidArgument 939 } 940 941 tcred := t.Credentials() 942 creds := linux.ControlMessageCredentials{ 943 PID: int32(t.ThreadGroup().ID()), 944 UID: uint32(tcred.EffectiveKUID.In(tcred.UserNamespace).OrOverflow()), 945 GID: uint32(tcred.EffectiveKGID.In(tcred.UserNamespace).OrOverflow()), 946 } 947 return &creds, nil 948 949 case linux.SO_PASSCRED: 950 if outLen < sizeOfInt32 { 951 return nil, syserr.ErrInvalidArgument 952 } 953 954 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetPassCred())) 955 return &v, nil 956 957 case linux.SO_SNDBUF: 958 if outLen < sizeOfInt32 { 959 return nil, syserr.ErrInvalidArgument 960 } 961 962 size := ep.SocketOptions().GetSendBufferSize() 963 964 if size > math.MaxInt32 { 965 size = math.MaxInt32 966 } 967 968 sizeP := primitive.Int32(size) 969 return &sizeP, nil 970 971 case linux.SO_RCVBUF: 972 if outLen < sizeOfInt32 { 973 return nil, syserr.ErrInvalidArgument 974 } 975 976 size := ep.SocketOptions().GetReceiveBufferSize() 977 978 if size > math.MaxInt32 { 979 size = math.MaxInt32 980 } 981 982 sizeP := primitive.Int32(size) 983 return &sizeP, nil 984 985 case linux.SO_REUSEADDR: 986 if outLen < sizeOfInt32 { 987 return nil, syserr.ErrInvalidArgument 988 } 989 990 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReuseAddress())) 991 return &v, nil 992 993 case linux.SO_REUSEPORT: 994 if outLen < sizeOfInt32 { 995 return nil, syserr.ErrInvalidArgument 996 } 997 998 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReusePort())) 999 return &v, nil 1000 1001 case linux.SO_BINDTODEVICE: 1002 v := ep.SocketOptions().GetBindToDevice() 1003 if v == 0 { 1004 var b primitive.ByteSlice 
1005 return &b, nil 1006 } 1007 if outLen < linux.IFNAMSIZ { 1008 return nil, syserr.ErrInvalidArgument 1009 } 1010 s := t.NetworkContext() 1011 if s == nil { 1012 return nil, syserr.ErrNoDevice 1013 } 1014 nic, ok := s.Interfaces()[int32(v)] 1015 if !ok { 1016 // The NICID no longer indicates a valid interface, probably because that 1017 // interface was removed. 1018 return nil, syserr.ErrUnknownDevice 1019 } 1020 1021 name := primitive.ByteSlice(append([]byte(nic.Name), 0)) 1022 return &name, nil 1023 1024 case linux.SO_BROADCAST: 1025 if outLen < sizeOfInt32 { 1026 return nil, syserr.ErrInvalidArgument 1027 } 1028 1029 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetBroadcast())) 1030 return &v, nil 1031 1032 case linux.SO_KEEPALIVE: 1033 if outLen < sizeOfInt32 { 1034 return nil, syserr.ErrInvalidArgument 1035 } 1036 1037 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetKeepAlive())) 1038 return &v, nil 1039 1040 case linux.SO_LINGER: 1041 if outLen < linux.SizeOfLinger { 1042 return nil, syserr.ErrInvalidArgument 1043 } 1044 1045 var linger linux.Linger 1046 v := ep.SocketOptions().GetLinger() 1047 1048 if v.Enabled { 1049 linger.OnOff = 1 1050 } 1051 linger.Linger = int32(v.Timeout.Seconds()) 1052 return &linger, nil 1053 1054 case linux.SO_SNDTIMEO: 1055 // TODO(igudger): Linux allows shorter lengths for partial results. 1056 if outLen < linux.SizeOfTimeval { 1057 return nil, syserr.ErrInvalidArgument 1058 } 1059 1060 sendTimeout := linux.NsecToTimeval(s.SendTimeout()) 1061 return &sendTimeout, nil 1062 1063 case linux.SO_RCVTIMEO: 1064 // TODO(igudger): Linux allows shorter lengths for partial results. 
1065 if outLen < linux.SizeOfTimeval { 1066 return nil, syserr.ErrInvalidArgument 1067 } 1068 1069 recvTimeout := linux.NsecToTimeval(s.RecvTimeout()) 1070 return &recvTimeout, nil 1071 1072 case linux.SO_OOBINLINE: 1073 if outLen < sizeOfInt32 { 1074 return nil, syserr.ErrInvalidArgument 1075 } 1076 1077 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetOutOfBandInline())) 1078 return &v, nil 1079 1080 case linux.SO_NO_CHECK: 1081 if outLen < sizeOfInt32 { 1082 return nil, syserr.ErrInvalidArgument 1083 } 1084 1085 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetNoChecksum())) 1086 return &v, nil 1087 1088 case linux.SO_ACCEPTCONN: 1089 if outLen < sizeOfInt32 { 1090 return nil, syserr.ErrInvalidArgument 1091 } 1092 1093 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetAcceptConn())) 1094 return &v, nil 1095 1096 case linux.SO_RCVLOWAT: 1097 if outLen < sizeOfInt32 { 1098 return nil, syserr.ErrInvalidArgument 1099 } 1100 1101 v := primitive.Int32(ep.SocketOptions().GetRcvlowat()) 1102 return &v, nil 1103 } 1104 return nil, syserr.ErrProtocolNotAvailable 1105 } 1106 1107 // getSockOptTCP implements GetSockOpt when level is SOL_TCP. 
1108 func getSockOptTCP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name, outLen int) (marshal.Marshallable, *syserr.Error) { 1109 if !socket.IsTCP(s) { 1110 return nil, syserr.ErrUnknownProtocolOption 1111 } 1112 1113 switch name { 1114 case linux.TCP_NODELAY: 1115 if outLen < sizeOfInt32 { 1116 return nil, syserr.ErrInvalidArgument 1117 } 1118 1119 v := primitive.Int32(boolToInt32(!ep.SocketOptions().GetDelayOption())) 1120 return &v, nil 1121 1122 case linux.TCP_CORK: 1123 if outLen < sizeOfInt32 { 1124 return nil, syserr.ErrInvalidArgument 1125 } 1126 1127 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetCorkOption())) 1128 return &v, nil 1129 1130 case linux.TCP_QUICKACK: 1131 if outLen < sizeOfInt32 { 1132 return nil, syserr.ErrInvalidArgument 1133 } 1134 1135 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetQuickAck())) 1136 return &v, nil 1137 1138 case linux.TCP_MAXSEG: 1139 if outLen < sizeOfInt32 { 1140 return nil, syserr.ErrInvalidArgument 1141 } 1142 1143 v, err := ep.GetSockOptInt(tcpip.MaxSegOption) 1144 if err != nil { 1145 return nil, syserr.TranslateNetstackError(err) 1146 } 1147 vP := primitive.Int32(v) 1148 return &vP, nil 1149 1150 case linux.TCP_KEEPIDLE: 1151 if outLen < sizeOfInt32 { 1152 return nil, syserr.ErrInvalidArgument 1153 } 1154 1155 var v tcpip.KeepaliveIdleOption 1156 if err := ep.GetSockOpt(&v); err != nil { 1157 return nil, syserr.TranslateNetstackError(err) 1158 } 1159 keepAliveIdle := primitive.Int32(time.Duration(v) / time.Second) 1160 return &keepAliveIdle, nil 1161 1162 case linux.TCP_KEEPINTVL: 1163 if outLen < sizeOfInt32 { 1164 return nil, syserr.ErrInvalidArgument 1165 } 1166 1167 var v tcpip.KeepaliveIntervalOption 1168 if err := ep.GetSockOpt(&v); err != nil { 1169 return nil, syserr.TranslateNetstackError(err) 1170 } 1171 keepAliveInterval := primitive.Int32(time.Duration(v) / time.Second) 1172 return &keepAliveInterval, nil 1173 1174 case linux.TCP_KEEPCNT: 1175 if outLen < sizeOfInt32 { 1176 
return nil, syserr.ErrInvalidArgument 1177 } 1178 1179 v, err := ep.GetSockOptInt(tcpip.KeepaliveCountOption) 1180 if err != nil { 1181 return nil, syserr.TranslateNetstackError(err) 1182 } 1183 vP := primitive.Int32(v) 1184 return &vP, nil 1185 1186 case linux.TCP_USER_TIMEOUT: 1187 if outLen < sizeOfInt32 { 1188 return nil, syserr.ErrInvalidArgument 1189 } 1190 1191 var v tcpip.TCPUserTimeoutOption 1192 if err := ep.GetSockOpt(&v); err != nil { 1193 return nil, syserr.TranslateNetstackError(err) 1194 } 1195 tcpUserTimeout := primitive.Int32(time.Duration(v) / time.Millisecond) 1196 return &tcpUserTimeout, nil 1197 1198 case linux.TCP_INFO: 1199 var v tcpip.TCPInfoOption 1200 if err := ep.GetSockOpt(&v); err != nil { 1201 return nil, syserr.TranslateNetstackError(err) 1202 } 1203 1204 info := linux.TCPInfo{ 1205 State: uint8(v.State), 1206 RTO: uint32(v.RTO / time.Microsecond), 1207 RTT: uint32(v.RTT / time.Microsecond), 1208 RTTVar: uint32(v.RTTVar / time.Microsecond), 1209 SndSsthresh: v.SndSsthresh, 1210 SndCwnd: v.SndCwnd, 1211 } 1212 switch v.CcState { 1213 case tcpip.RTORecovery: 1214 info.CaState = linux.TCP_CA_Loss 1215 case tcpip.FastRecovery, tcpip.SACKRecovery: 1216 info.CaState = linux.TCP_CA_Recovery 1217 case tcpip.Disorder: 1218 info.CaState = linux.TCP_CA_Disorder 1219 case tcpip.Open: 1220 info.CaState = linux.TCP_CA_Open 1221 } 1222 1223 // In netstack reorderSeen is updated only when RACK is enabled. 1224 // We only track whether the reordering is seen, which is 1225 // different than Linux where reorderSeen is not specific to 1226 // RACK and is incremented when a reordering event is seen. 1227 if v.ReorderSeen { 1228 info.ReordSeen = 1 1229 } 1230 1231 // Linux truncates the output binary to outLen. 
1232 buf := t.CopyScratchBuffer(info.SizeBytes()) 1233 info.MarshalUnsafe(buf) 1234 if len(buf) > outLen { 1235 buf = buf[:outLen] 1236 } 1237 bufP := primitive.ByteSlice(buf) 1238 return &bufP, nil 1239 1240 case linux.TCP_CC_INFO, 1241 linux.TCP_NOTSENT_LOWAT, 1242 linux.TCP_ZEROCOPY_RECEIVE: 1243 1244 // Not supported. 1245 1246 case linux.TCP_CONGESTION: 1247 if outLen <= 0 { 1248 return nil, syserr.ErrInvalidArgument 1249 } 1250 1251 var v tcpip.CongestionControlOption 1252 if err := ep.GetSockOpt(&v); err != nil { 1253 return nil, syserr.TranslateNetstackError(err) 1254 } 1255 1256 // We match linux behaviour here where it returns the lower of 1257 // TCP_CA_NAME_MAX bytes or the value of the option length. 1258 // 1259 // This is Linux's net/tcp.h TCP_CA_NAME_MAX. 1260 const tcpCANameMax = 16 1261 1262 toCopy := tcpCANameMax 1263 if outLen < tcpCANameMax { 1264 toCopy = outLen 1265 } 1266 b := make([]byte, toCopy) 1267 copy(b, v) 1268 1269 bP := primitive.ByteSlice(b) 1270 return &bP, nil 1271 1272 case linux.TCP_LINGER2: 1273 if outLen < sizeOfInt32 { 1274 return nil, syserr.ErrInvalidArgument 1275 } 1276 1277 var v tcpip.TCPLingerTimeoutOption 1278 if err := ep.GetSockOpt(&v); err != nil { 1279 return nil, syserr.TranslateNetstackError(err) 1280 } 1281 var lingerTimeout primitive.Int32 1282 if v >= 0 { 1283 lingerTimeout = primitive.Int32(time.Duration(v) / time.Second) 1284 } else { 1285 lingerTimeout = -1 1286 } 1287 return &lingerTimeout, nil 1288 1289 case linux.TCP_DEFER_ACCEPT: 1290 if outLen < sizeOfInt32 { 1291 return nil, syserr.ErrInvalidArgument 1292 } 1293 1294 var v tcpip.TCPDeferAcceptOption 1295 if err := ep.GetSockOpt(&v); err != nil { 1296 return nil, syserr.TranslateNetstackError(err) 1297 } 1298 1299 tcpDeferAccept := primitive.Int32(time.Duration(v) / time.Second) 1300 return &tcpDeferAccept, nil 1301 1302 case linux.TCP_SYNCNT: 1303 if outLen < sizeOfInt32 { 1304 return nil, syserr.ErrInvalidArgument 1305 } 1306 1307 v, err := 
ep.GetSockOptInt(tcpip.TCPSynCountOption) 1308 if err != nil { 1309 return nil, syserr.TranslateNetstackError(err) 1310 } 1311 vP := primitive.Int32(v) 1312 return &vP, nil 1313 1314 case linux.TCP_WINDOW_CLAMP: 1315 if outLen < sizeOfInt32 { 1316 return nil, syserr.ErrInvalidArgument 1317 } 1318 1319 v, err := ep.GetSockOptInt(tcpip.TCPWindowClampOption) 1320 if err != nil { 1321 return nil, syserr.TranslateNetstackError(err) 1322 } 1323 vP := primitive.Int32(v) 1324 return &vP, nil 1325 } 1326 return nil, syserr.ErrProtocolNotAvailable 1327 } 1328 1329 func getSockOptICMPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, outLen int) (marshal.Marshallable, *syserr.Error) { 1330 if _, ok := ep.(tcpip.Endpoint); !ok { 1331 log.Warningf("SOL_ICMPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name) 1332 return nil, syserr.ErrUnknownProtocolOption 1333 } 1334 1335 if family, _, _ := s.Type(); family != linux.AF_INET6 { 1336 return nil, syserr.ErrNotSupported 1337 } 1338 1339 switch name { 1340 case linux.ICMPV6_FILTER: 1341 var v tcpip.ICMPv6Filter 1342 if err := ep.GetSockOpt(&v); err != nil { 1343 return nil, syserr.TranslateNetstackError(err) 1344 } 1345 1346 filter := linux.ICMP6Filter{Filter: v.DenyType} 1347 1348 // Linux truncates the output to outLen. 
1349 buf := t.CopyScratchBuffer(filter.SizeBytes()) 1350 filter.MarshalUnsafe(buf) 1351 if len(buf) > outLen { 1352 buf = buf[:outLen] 1353 } 1354 bufP := primitive.ByteSlice(buf) 1355 return &bufP, nil 1356 } 1357 return nil, syserr.ErrProtocolNotAvailable 1358 } 1359 1360 func defaultTTL(t *kernel.Task, network tcpip.NetworkProtocolNumber) (primitive.Int32, tcpip.Error) { 1361 var opt tcpip.DefaultTTLOption 1362 stack := inet.StackFromContext(t) 1363 if err := stack.(*Stack).Stack.NetworkProtocolOption(network, &opt); err != nil { 1364 return 0, err 1365 } 1366 return primitive.Int32(opt), nil 1367 } 1368 1369 // getSockOptIPv6 implements GetSockOpt when level is SOL_IPV6. 1370 func getSockOptIPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { 1371 if _, ok := ep.(tcpip.Endpoint); !ok { 1372 log.Warningf("SOL_IPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name) 1373 return nil, syserr.ErrUnknownProtocolOption 1374 } 1375 1376 family, skType, _ := s.Type() 1377 if family != linux.AF_INET6 { 1378 return nil, syserr.ErrNotSupported 1379 } 1380 1381 switch name { 1382 case linux.IPV6_CHECKSUM: 1383 if outLen < sizeOfInt32 { 1384 return nil, syserr.ErrInvalidArgument 1385 } 1386 1387 v, err := ep.GetSockOptInt(tcpip.IPv6Checksum) 1388 if err != nil { 1389 return nil, syserr.TranslateNetstackError(err) 1390 } 1391 1392 vP := primitive.Int32(v) 1393 return &vP, nil 1394 1395 case linux.IPV6_V6ONLY: 1396 if outLen < sizeOfInt32 { 1397 return nil, syserr.ErrInvalidArgument 1398 } 1399 1400 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetV6Only())) 1401 return &v, nil 1402 1403 case linux.IPV6_UNICAST_HOPS: 1404 if outLen < sizeOfInt32 { 1405 return nil, syserr.ErrInvalidArgument 1406 } 1407 1408 v, err := ep.GetSockOptInt(tcpip.IPv6HopLimitOption) 1409 if err != nil { 1410 return nil, syserr.TranslateNetstackError(err) 1411 } 1412 1413 // Fill 
in the default value, if needed. 1414 vP := primitive.Int32(v) 1415 if vP == -1 { 1416 vP, err = defaultTTL(t, header.IPv6ProtocolNumber) 1417 if err != nil { 1418 return nil, syserr.TranslateNetstackError(err) 1419 } 1420 } 1421 1422 return &vP, nil 1423 1424 case linux.IPV6_RECVHOPLIMIT: 1425 if outLen < sizeOfInt32 { 1426 return nil, syserr.ErrInvalidArgument 1427 } 1428 1429 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveHopLimit())) 1430 return &v, nil 1431 1432 case linux.IPV6_PATHMTU: 1433 // Not supported. 1434 1435 case linux.IPV6_TCLASS: 1436 // Length handling for parity with Linux. 1437 if outLen == 0 { 1438 var b primitive.ByteSlice 1439 return &b, nil 1440 } 1441 v, err := ep.GetSockOptInt(tcpip.IPv6TrafficClassOption) 1442 if err != nil { 1443 return nil, syserr.TranslateNetstackError(err) 1444 } 1445 1446 uintv := primitive.Uint32(v) 1447 // Linux truncates the output binary to outLen. 1448 ib := t.CopyScratchBuffer(uintv.SizeBytes()) 1449 uintv.MarshalUnsafe(ib) 1450 // Handle cases where outLen is lesser than sizeOfInt32. 
1451 if len(ib) > outLen { 1452 ib = ib[:outLen] 1453 } 1454 ibP := primitive.ByteSlice(ib) 1455 return &ibP, nil 1456 1457 case linux.IPV6_RECVTCLASS: 1458 if outLen < sizeOfInt32 { 1459 return nil, syserr.ErrInvalidArgument 1460 } 1461 1462 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTClass())) 1463 return &v, nil 1464 case linux.IPV6_RECVERR: 1465 if outLen < sizeOfInt32 { 1466 return nil, syserr.ErrInvalidArgument 1467 } 1468 1469 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetIPv6RecvError())) 1470 return &v, nil 1471 1472 case linux.IPV6_RECVORIGDSTADDR: 1473 if outLen < sizeOfInt32 { 1474 return nil, syserr.ErrInvalidArgument 1475 } 1476 1477 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveOriginalDstAddress())) 1478 return &v, nil 1479 1480 case linux.IPV6_RECVPKTINFO: 1481 if outLen < sizeOfInt32 { 1482 return nil, syserr.ErrInvalidArgument 1483 } 1484 1485 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetIPv6ReceivePacketInfo())) 1486 return &v, nil 1487 1488 case linux.IP6T_ORIGINAL_DST: 1489 if outLen < sockAddrInet6Size { 1490 return nil, syserr.ErrInvalidArgument 1491 } 1492 1493 var v tcpip.OriginalDestinationOption 1494 if err := ep.GetSockOpt(&v); err != nil { 1495 return nil, syserr.TranslateNetstackError(err) 1496 } 1497 1498 a, _ := socket.ConvertAddress(linux.AF_INET6, tcpip.FullAddress(v)) 1499 return a.(*linux.SockAddrInet6), nil 1500 1501 case linux.IP6T_SO_GET_INFO: 1502 if outLen < linux.SizeOfIPTGetinfo { 1503 return nil, syserr.ErrInvalidArgument 1504 } 1505 1506 // Only valid for raw IPv6 sockets. 
1507 if skType != linux.SOCK_RAW { 1508 return nil, syserr.ErrProtocolNotAvailable 1509 } 1510 1511 stk := inet.StackFromContext(t) 1512 if stk == nil { 1513 return nil, syserr.ErrNoDevice 1514 } 1515 info, err := netfilter.GetInfo(t, stk.(*Stack).Stack, outPtr, true) 1516 if err != nil { 1517 return nil, err 1518 } 1519 return &info, nil 1520 1521 case linux.IP6T_SO_GET_ENTRIES: 1522 // IPTGetEntries is reused for IPv6. 1523 if outLen < linux.SizeOfIPTGetEntries { 1524 return nil, syserr.ErrInvalidArgument 1525 } 1526 // Only valid for raw IPv6 sockets. 1527 if skType != linux.SOCK_RAW { 1528 return nil, syserr.ErrProtocolNotAvailable 1529 } 1530 1531 stk := inet.StackFromContext(t) 1532 if stk == nil { 1533 return nil, syserr.ErrNoDevice 1534 } 1535 entries, err := netfilter.GetEntries6(t, stk.(*Stack).Stack, outPtr, outLen) 1536 if err != nil { 1537 return nil, err 1538 } 1539 return &entries, nil 1540 1541 case linux.IP6T_SO_GET_REVISION_TARGET: 1542 if outLen < linux.SizeOfXTGetRevision { 1543 return nil, syserr.ErrInvalidArgument 1544 } 1545 1546 // Only valid for raw IPv6 sockets. 1547 if skType != linux.SOCK_RAW { 1548 return nil, syserr.ErrProtocolNotAvailable 1549 } 1550 1551 stk := inet.StackFromContext(t) 1552 if stk == nil { 1553 return nil, syserr.ErrNoDevice 1554 } 1555 ret, err := netfilter.TargetRevision(t, outPtr, header.IPv6ProtocolNumber) 1556 if err != nil { 1557 return nil, err 1558 } 1559 return &ret, nil 1560 } 1561 return nil, syserr.ErrProtocolNotAvailable 1562 } 1563 1564 // getSockOptIP implements GetSockOpt when level is SOL_IP. 
1565 func getSockOptIP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, outPtr hostarch.Addr, outLen int, _ int) (marshal.Marshallable, *syserr.Error) { 1566 if _, ok := ep.(tcpip.Endpoint); !ok { 1567 log.Warningf("SOL_IP options not supported on endpoints other than tcpip.Endpoint: option = %d", name) 1568 return nil, syserr.ErrUnknownProtocolOption 1569 } 1570 1571 switch name { 1572 case linux.IP_TTL: 1573 if outLen < sizeOfInt32 { 1574 return nil, syserr.ErrInvalidArgument 1575 } 1576 1577 v, err := ep.GetSockOptInt(tcpip.IPv4TTLOption) 1578 if err != nil { 1579 return nil, syserr.TranslateNetstackError(err) 1580 } 1581 1582 // Fill in the default value, if needed. 1583 vP := primitive.Int32(v) 1584 if vP == 0 { 1585 vP, err = defaultTTL(t, header.IPv4ProtocolNumber) 1586 if err != nil { 1587 return nil, syserr.TranslateNetstackError(err) 1588 } 1589 } 1590 1591 return &vP, nil 1592 1593 case linux.IP_RECVTTL: 1594 if outLen < sizeOfInt32 { 1595 return nil, syserr.ErrInvalidArgument 1596 } 1597 1598 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTTL())) 1599 return &v, nil 1600 1601 case linux.IP_MULTICAST_TTL: 1602 if outLen < sizeOfInt32 { 1603 return nil, syserr.ErrInvalidArgument 1604 } 1605 1606 v, err := ep.GetSockOptInt(tcpip.MulticastTTLOption) 1607 if err != nil { 1608 return nil, syserr.TranslateNetstackError(err) 1609 } 1610 1611 vP := primitive.Int32(v) 1612 return &vP, nil 1613 1614 case linux.IP_MULTICAST_IF: 1615 if outLen < len(linux.InetAddr{}) { 1616 return nil, syserr.ErrInvalidArgument 1617 } 1618 1619 var v tcpip.MulticastInterfaceOption 1620 if err := ep.GetSockOpt(&v); err != nil { 1621 return nil, syserr.TranslateNetstackError(err) 1622 } 1623 1624 a, _ := socket.ConvertAddress(linux.AF_INET, tcpip.FullAddress{Addr: v.InterfaceAddr}) 1625 1626 return &a.(*linux.SockAddrInet).Addr, nil 1627 1628 case linux.IP_MULTICAST_LOOP: 1629 if outLen < sizeOfInt32 { 1630 return nil, syserr.ErrInvalidArgument 1631 } 1632 
1633 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetMulticastLoop())) 1634 return &v, nil 1635 1636 case linux.IP_TOS: 1637 // Length handling for parity with Linux. 1638 if outLen == 0 { 1639 var b primitive.ByteSlice 1640 return &b, nil 1641 } 1642 v, err := ep.GetSockOptInt(tcpip.IPv4TOSOption) 1643 if err != nil { 1644 return nil, syserr.TranslateNetstackError(err) 1645 } 1646 if outLen < sizeOfInt32 { 1647 vP := primitive.Uint8(v) 1648 return &vP, nil 1649 } 1650 vP := primitive.Int32(v) 1651 return &vP, nil 1652 1653 case linux.IP_RECVTOS: 1654 if outLen < sizeOfInt32 { 1655 return nil, syserr.ErrInvalidArgument 1656 } 1657 1658 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTOS())) 1659 return &v, nil 1660 1661 case linux.IP_RECVERR: 1662 if outLen < sizeOfInt32 { 1663 return nil, syserr.ErrInvalidArgument 1664 } 1665 1666 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetIPv4RecvError())) 1667 return &v, nil 1668 1669 case linux.IP_PKTINFO: 1670 if outLen < sizeOfInt32 { 1671 return nil, syserr.ErrInvalidArgument 1672 } 1673 1674 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceivePacketInfo())) 1675 return &v, nil 1676 1677 case linux.IP_HDRINCL: 1678 if outLen < sizeOfInt32 { 1679 return nil, syserr.ErrInvalidArgument 1680 } 1681 1682 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetHeaderIncluded())) 1683 return &v, nil 1684 1685 case linux.IP_RECVORIGDSTADDR: 1686 if outLen < sizeOfInt32 { 1687 return nil, syserr.ErrInvalidArgument 1688 } 1689 1690 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveOriginalDstAddress())) 1691 return &v, nil 1692 1693 case linux.SO_ORIGINAL_DST: 1694 if outLen < sockAddrInetSize { 1695 return nil, syserr.ErrInvalidArgument 1696 } 1697 1698 var v tcpip.OriginalDestinationOption 1699 if err := ep.GetSockOpt(&v); err != nil { 1700 return nil, syserr.TranslateNetstackError(err) 1701 } 1702 1703 a, _ := socket.ConvertAddress(linux.AF_INET, tcpip.FullAddress(v)) 1704 
return a.(*linux.SockAddrInet), nil 1705 1706 case linux.IPT_SO_GET_INFO: 1707 if outLen < linux.SizeOfIPTGetinfo { 1708 return nil, syserr.ErrInvalidArgument 1709 } 1710 1711 // Only valid for raw IPv4 sockets. 1712 if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW { 1713 return nil, syserr.ErrProtocolNotAvailable 1714 } 1715 1716 stk := inet.StackFromContext(t) 1717 if stk == nil { 1718 return nil, syserr.ErrNoDevice 1719 } 1720 info, err := netfilter.GetInfo(t, stk.(*Stack).Stack, outPtr, false) 1721 if err != nil { 1722 return nil, err 1723 } 1724 return &info, nil 1725 1726 case linux.IPT_SO_GET_ENTRIES: 1727 if outLen < linux.SizeOfIPTGetEntries { 1728 return nil, syserr.ErrInvalidArgument 1729 } 1730 1731 // Only valid for raw IPv4 sockets. 1732 if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW { 1733 return nil, syserr.ErrProtocolNotAvailable 1734 } 1735 1736 stk := inet.StackFromContext(t) 1737 if stk == nil { 1738 return nil, syserr.ErrNoDevice 1739 } 1740 entries, err := netfilter.GetEntries4(t, stk.(*Stack).Stack, outPtr, outLen) 1741 if err != nil { 1742 return nil, err 1743 } 1744 return &entries, nil 1745 1746 case linux.IPT_SO_GET_REVISION_TARGET: 1747 if outLen < linux.SizeOfXTGetRevision { 1748 return nil, syserr.ErrInvalidArgument 1749 } 1750 1751 // Only valid for raw IPv4 sockets. 
1752 if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW { 1753 return nil, syserr.ErrProtocolNotAvailable 1754 } 1755 1756 stk := inet.StackFromContext(t) 1757 if stk == nil { 1758 return nil, syserr.ErrNoDevice 1759 } 1760 ret, err := netfilter.TargetRevision(t, outPtr, header.IPv4ProtocolNumber) 1761 if err != nil { 1762 return nil, err 1763 } 1764 return &ret, nil 1765 1766 case linux.IP_MTU_DISCOVER: 1767 if outLen < sizeOfInt32 { 1768 return nil, syserr.ErrInvalidArgument 1769 } 1770 1771 v, err := ep.GetSockOptInt(tcpip.MTUDiscoverOption) 1772 if err != nil { 1773 return nil, syserr.TranslateNetstackError(err) 1774 } 1775 switch tcpip.PMTUDStrategy(v) { 1776 case tcpip.PMTUDiscoveryWant: 1777 v = linux.IP_PMTUDISC_WANT 1778 case tcpip.PMTUDiscoveryDont: 1779 v = linux.IP_PMTUDISC_DONT 1780 case tcpip.PMTUDiscoveryDo: 1781 v = linux.IP_PMTUDISC_DO 1782 case tcpip.PMTUDiscoveryProbe: 1783 v = linux.IP_PMTUDISC_PROBE 1784 default: 1785 panic(fmt.Errorf("unknown PMTUD option: %d", v)) 1786 } 1787 vP := primitive.Int32(v) 1788 return &vP, nil 1789 } 1790 return nil, syserr.ErrProtocolNotAvailable 1791 } 1792 1793 // SetSockOpt can be used to implement the linux syscall setsockopt(2) for 1794 // sockets backed by a commonEndpoint. 1795 func SetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, level int, name int, optVal []byte) *syserr.Error { 1796 switch level { 1797 case linux.SOL_SOCKET: 1798 return setSockOptSocket(t, s, ep, name, optVal) 1799 1800 case linux.SOL_TCP: 1801 return setSockOptTCP(t, s, ep, name, optVal) 1802 1803 case linux.SOL_ICMPV6: 1804 return setSockOptICMPv6(t, s, ep, name, optVal) 1805 1806 case linux.SOL_IPV6: 1807 return setSockOptIPv6(t, s, ep, name, optVal) 1808 1809 case linux.SOL_IP: 1810 return setSockOptIP(t, s, ep, name, optVal) 1811 1812 case linux.SOL_PACKET: 1813 // gVisor doesn't support any SOL_PACKET options just return not 1814 // supported. 
Returning nil here will result in tcpdump thinking AF_PACKET 1815 // features are supported and proceed to use them and break. 1816 return syserr.ErrProtocolNotAvailable 1817 1818 case linux.SOL_UDP, 1819 linux.SOL_RAW: 1820 // Not supported. 1821 } 1822 1823 return nil 1824 } 1825 1826 func clampBufSize(newSz, min, max int64, ignoreMax bool) int64 { 1827 // packetOverheadFactor is used to multiply the value provided by the user on 1828 // a setsockopt(2) for setting the send/receive buffer sizes sockets. 1829 const packetOverheadFactor = 2 1830 1831 if !ignoreMax && newSz > max { 1832 newSz = max 1833 } 1834 1835 if newSz < math.MaxInt32/packetOverheadFactor { 1836 newSz *= packetOverheadFactor 1837 if newSz < min { 1838 newSz = min 1839 } 1840 } else { 1841 newSz = math.MaxInt32 1842 } 1843 return newSz 1844 } 1845 1846 // setSockOptSocket implements SetSockOpt when level is SOL_SOCKET. 1847 func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error { 1848 switch name { 1849 case linux.SO_SNDBUF: 1850 if len(optVal) < sizeOfInt32 { 1851 return syserr.ErrInvalidArgument 1852 } 1853 1854 v := hostarch.ByteOrder.Uint32(optVal) 1855 min, max := ep.SocketOptions().SendBufferLimits() 1856 clamped := clampBufSize(int64(v), min, max, false /* ignoreMax */) 1857 ep.SocketOptions().SetSendBufferSize(clamped, true /* notify */) 1858 return nil 1859 1860 case linux.SO_RCVBUF: 1861 if len(optVal) < sizeOfInt32 { 1862 return syserr.ErrInvalidArgument 1863 } 1864 1865 v := hostarch.ByteOrder.Uint32(optVal) 1866 min, max := ep.SocketOptions().ReceiveBufferLimits() 1867 clamped := clampBufSize(int64(v), min, max, false /* ignoreMax */) 1868 ep.SocketOptions().SetReceiveBufferSize(clamped, true /* notify */) 1869 return nil 1870 1871 case linux.SO_RCVBUFFORCE: 1872 if len(optVal) < sizeOfInt32 { 1873 return syserr.ErrInvalidArgument 1874 } 1875 1876 if creds := auth.CredentialsFromContext(t); 
!creds.HasCapability(linux.CAP_NET_ADMIN) { 1877 return syserr.ErrNotPermitted 1878 } 1879 1880 v := hostarch.ByteOrder.Uint32(optVal) 1881 min, max := ep.SocketOptions().ReceiveBufferLimits() 1882 clamped := clampBufSize(int64(v), min, max, true /* ignoreMax */) 1883 ep.SocketOptions().SetReceiveBufferSize(clamped, true /* notify */) 1884 return nil 1885 1886 case linux.SO_REUSEADDR: 1887 if len(optVal) < sizeOfInt32 { 1888 return syserr.ErrInvalidArgument 1889 } 1890 1891 v := hostarch.ByteOrder.Uint32(optVal) 1892 ep.SocketOptions().SetReuseAddress(v != 0) 1893 return nil 1894 1895 case linux.SO_REUSEPORT: 1896 if len(optVal) < sizeOfInt32 { 1897 return syserr.ErrInvalidArgument 1898 } 1899 1900 v := hostarch.ByteOrder.Uint32(optVal) 1901 ep.SocketOptions().SetReusePort(v != 0) 1902 return nil 1903 1904 case linux.SO_BINDTODEVICE: 1905 n := bytes.IndexByte(optVal, 0) 1906 if n == -1 { 1907 n = len(optVal) 1908 } 1909 name := string(optVal[:n]) 1910 if name == "" { 1911 return syserr.TranslateNetstackError(ep.SocketOptions().SetBindToDevice(0)) 1912 } 1913 s := t.NetworkContext() 1914 if s == nil { 1915 return syserr.ErrNoDevice 1916 } 1917 for nicID, nic := range s.Interfaces() { 1918 if nic.Name == name { 1919 return syserr.TranslateNetstackError(ep.SocketOptions().SetBindToDevice(nicID)) 1920 } 1921 } 1922 return syserr.ErrUnknownDevice 1923 1924 case linux.SO_BROADCAST: 1925 if len(optVal) < sizeOfInt32 { 1926 return syserr.ErrInvalidArgument 1927 } 1928 1929 v := hostarch.ByteOrder.Uint32(optVal) 1930 ep.SocketOptions().SetBroadcast(v != 0) 1931 return nil 1932 1933 case linux.SO_PASSCRED: 1934 if len(optVal) < sizeOfInt32 { 1935 return syserr.ErrInvalidArgument 1936 } 1937 1938 v := hostarch.ByteOrder.Uint32(optVal) 1939 ep.SocketOptions().SetPassCred(v != 0) 1940 return nil 1941 1942 case linux.SO_KEEPALIVE: 1943 if len(optVal) < sizeOfInt32 { 1944 return syserr.ErrInvalidArgument 1945 } 1946 1947 v := hostarch.ByteOrder.Uint32(optVal) 1948 
ep.SocketOptions().SetKeepAlive(v != 0) 1949 return nil 1950 1951 case linux.SO_SNDTIMEO: 1952 if len(optVal) < linux.SizeOfTimeval { 1953 return syserr.ErrInvalidArgument 1954 } 1955 1956 var v linux.Timeval 1957 v.UnmarshalBytes(optVal) 1958 if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) { 1959 return syserr.ErrDomain 1960 } 1961 s.SetSendTimeout(v.ToNsecCapped()) 1962 return nil 1963 1964 case linux.SO_RCVTIMEO: 1965 if len(optVal) < linux.SizeOfTimeval { 1966 return syserr.ErrInvalidArgument 1967 } 1968 1969 var v linux.Timeval 1970 v.UnmarshalBytes(optVal) 1971 if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) { 1972 return syserr.ErrDomain 1973 } 1974 s.SetRecvTimeout(v.ToNsecCapped()) 1975 return nil 1976 1977 case linux.SO_OOBINLINE: 1978 if len(optVal) < sizeOfInt32 { 1979 return syserr.ErrInvalidArgument 1980 } 1981 1982 v := hostarch.ByteOrder.Uint32(optVal) 1983 ep.SocketOptions().SetOutOfBandInline(v != 0) 1984 return nil 1985 1986 case linux.SO_NO_CHECK: 1987 if len(optVal) < sizeOfInt32 { 1988 return syserr.ErrInvalidArgument 1989 } 1990 1991 v := hostarch.ByteOrder.Uint32(optVal) 1992 ep.SocketOptions().SetNoChecksum(v != 0) 1993 return nil 1994 1995 case linux.SO_LINGER: 1996 if len(optVal) < linux.SizeOfLinger { 1997 return syserr.ErrInvalidArgument 1998 } 1999 2000 var v linux.Linger 2001 v.UnmarshalBytes(optVal) 2002 2003 ep.SocketOptions().SetLinger(tcpip.LingerOption{ 2004 Enabled: v.OnOff != 0, 2005 Timeout: time.Second * time.Duration(v.Linger), 2006 }) 2007 return nil 2008 2009 case linux.SO_DETACH_FILTER: 2010 // optval is ignored. 2011 var v tcpip.SocketDetachFilterOption 2012 return syserr.TranslateNetstackError(ep.SetSockOpt(&v)) 2013 2014 // TODO(b/226603727): Add support for SO_RCVLOWAT option. For now, only 2015 // the unsupported syscall message is removed. 
2016 case linux.SO_RCVLOWAT: 2017 if len(optVal) < sizeOfInt32 { 2018 return syserr.ErrInvalidArgument 2019 } 2020 2021 v := hostarch.ByteOrder.Uint32(optVal) 2022 ep.SocketOptions().SetRcvlowat(int32(v)) 2023 return nil 2024 } 2025 2026 return nil 2027 } 2028 2029 // setSockOptTCP implements SetSockOpt when level is SOL_TCP. 2030 func setSockOptTCP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error { 2031 if !socket.IsTCP(s) { 2032 return syserr.ErrUnknownProtocolOption 2033 } 2034 2035 switch name { 2036 case linux.TCP_NODELAY: 2037 if len(optVal) < sizeOfInt32 { 2038 return syserr.ErrInvalidArgument 2039 } 2040 2041 v := hostarch.ByteOrder.Uint32(optVal) 2042 ep.SocketOptions().SetDelayOption(v == 0) 2043 return nil 2044 2045 case linux.TCP_CORK: 2046 if len(optVal) < sizeOfInt32 { 2047 return syserr.ErrInvalidArgument 2048 } 2049 2050 v := hostarch.ByteOrder.Uint32(optVal) 2051 ep.SocketOptions().SetCorkOption(v != 0) 2052 return nil 2053 2054 case linux.TCP_QUICKACK: 2055 if len(optVal) < sizeOfInt32 { 2056 return syserr.ErrInvalidArgument 2057 } 2058 2059 v := hostarch.ByteOrder.Uint32(optVal) 2060 ep.SocketOptions().SetQuickAck(v != 0) 2061 return nil 2062 2063 case linux.TCP_MAXSEG: 2064 if len(optVal) < sizeOfInt32 { 2065 return syserr.ErrInvalidArgument 2066 } 2067 2068 v := hostarch.ByteOrder.Uint32(optVal) 2069 return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MaxSegOption, int(v))) 2070 2071 case linux.TCP_KEEPIDLE: 2072 if len(optVal) < sizeOfInt32 { 2073 return syserr.ErrInvalidArgument 2074 } 2075 2076 v := hostarch.ByteOrder.Uint32(optVal) 2077 if v < 1 || v > linux.MAX_TCP_KEEPIDLE { 2078 return syserr.ErrInvalidArgument 2079 } 2080 opt := tcpip.KeepaliveIdleOption(time.Second * time.Duration(v)) 2081 return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) 2082 2083 case linux.TCP_KEEPINTVL: 2084 if len(optVal) < sizeOfInt32 { 2085 return syserr.ErrInvalidArgument 2086 } 2087 2088 v := 
hostarch.ByteOrder.Uint32(optVal) 2089 if v < 1 || v > linux.MAX_TCP_KEEPINTVL { 2090 return syserr.ErrInvalidArgument 2091 } 2092 opt := tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v)) 2093 return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) 2094 2095 case linux.TCP_KEEPCNT: 2096 if len(optVal) < sizeOfInt32 { 2097 return syserr.ErrInvalidArgument 2098 } 2099 2100 v := hostarch.ByteOrder.Uint32(optVal) 2101 if v < 1 || v > linux.MAX_TCP_KEEPCNT { 2102 return syserr.ErrInvalidArgument 2103 } 2104 return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.KeepaliveCountOption, int(v))) 2105 2106 case linux.TCP_USER_TIMEOUT: 2107 if len(optVal) < sizeOfInt32 { 2108 return syserr.ErrInvalidArgument 2109 } 2110 2111 v := int32(hostarch.ByteOrder.Uint32(optVal)) 2112 if v < 0 { 2113 return syserr.ErrInvalidArgument 2114 } 2115 opt := tcpip.TCPUserTimeoutOption(time.Millisecond * time.Duration(v)) 2116 return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) 2117 2118 case linux.TCP_CONGESTION: 2119 v := tcpip.CongestionControlOption(optVal) 2120 if err := ep.SetSockOpt(&v); err != nil { 2121 return syserr.TranslateNetstackError(err) 2122 } 2123 return nil 2124 2125 case linux.TCP_LINGER2: 2126 if len(optVal) < sizeOfInt32 { 2127 return syserr.ErrInvalidArgument 2128 } 2129 2130 v := int32(hostarch.ByteOrder.Uint32(optVal)) 2131 opt := tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v)) 2132 return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) 2133 2134 case linux.TCP_DEFER_ACCEPT: 2135 if len(optVal) < sizeOfInt32 { 2136 return syserr.ErrInvalidArgument 2137 } 2138 v := int32(hostarch.ByteOrder.Uint32(optVal)) 2139 if v < 0 { 2140 v = 0 2141 } 2142 opt := tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v)) 2143 return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) 2144 2145 case linux.TCP_SYNCNT: 2146 if len(optVal) < sizeOfInt32 { 2147 return syserr.ErrInvalidArgument 2148 } 2149 v := hostarch.ByteOrder.Uint32(optVal) 2150 
2151 return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPSynCountOption, int(v))) 2152 2153 case linux.TCP_WINDOW_CLAMP: 2154 if len(optVal) < sizeOfInt32 { 2155 return syserr.ErrInvalidArgument 2156 } 2157 v := hostarch.ByteOrder.Uint32(optVal) 2158 2159 return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPWindowClampOption, int(v))) 2160 2161 case linux.TCP_REPAIR_OPTIONS: 2162 // Not supported. 2163 } 2164 2165 return nil 2166 } 2167 2168 func setSockOptICMPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error { 2169 if _, ok := ep.(tcpip.Endpoint); !ok { 2170 log.Warningf("SOL_ICMPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name) 2171 return syserr.ErrUnknownProtocolOption 2172 } 2173 2174 if family, _, _ := s.Type(); family != linux.AF_INET6 { 2175 return syserr.ErrUnknownProtocolOption 2176 } 2177 2178 switch name { 2179 case linux.ICMPV6_FILTER: 2180 var req linux.ICMP6Filter 2181 if len(optVal) < req.SizeBytes() { 2182 return syserr.ErrInvalidArgument 2183 } 2184 2185 req.UnmarshalUnsafe(optVal) 2186 return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.ICMPv6Filter{DenyType: req.Filter})) 2187 } 2188 2189 return nil 2190 } 2191 2192 // setSockOptIPv6 implements SetSockOpt when level is SOL_IPV6. 
2193 func setSockOptIPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error { 2194 if _, ok := ep.(tcpip.Endpoint); !ok { 2195 log.Warningf("SOL_IPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name) 2196 return syserr.ErrUnknownProtocolOption 2197 } 2198 2199 family, _, _ := s.Type() 2200 if family != linux.AF_INET6 { 2201 return syserr.ErrUnknownProtocolOption 2202 } 2203 2204 switch name { 2205 case linux.IPV6_CHECKSUM: 2206 if len(optVal) < sizeOfInt32 { 2207 return syserr.ErrInvalidArgument 2208 } 2209 2210 // int may not be 32-bits so we cast the uint32 to an int32 before casting 2211 // to an int. 2212 return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6Checksum, int(int32(hostarch.ByteOrder.Uint32(optVal))))) 2213 2214 case linux.IPV6_V6ONLY: 2215 if len(optVal) < sizeOfInt32 { 2216 return syserr.ErrInvalidArgument 2217 } 2218 2219 if socket.IsTCP(s) && tcp.EndpointState(ep.State()) != tcp.StateInitial { 2220 return syserr.ErrInvalidEndpointState 2221 } else if socket.IsUDP(s) && transport.DatagramEndpointState(ep.State()) != transport.DatagramEndpointStateInitial { 2222 return syserr.ErrInvalidEndpointState 2223 } 2224 2225 v := hostarch.ByteOrder.Uint32(optVal) 2226 ep.SocketOptions().SetV6Only(v != 0) 2227 return nil 2228 2229 case linux.IPV6_ADD_MEMBERSHIP: 2230 req, err := copyInMulticastV6Request(optVal) 2231 if err != nil { 2232 return err 2233 } 2234 2235 return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.AddMembershipOption{ 2236 NIC: tcpip.NICID(req.InterfaceIndex), 2237 MulticastAddr: tcpip.AddrFrom16(req.MulticastAddr), 2238 })) 2239 2240 case linux.IPV6_DROP_MEMBERSHIP: 2241 req, err := copyInMulticastV6Request(optVal) 2242 if err != nil { 2243 return err 2244 } 2245 2246 return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.RemoveMembershipOption{ 2247 NIC: tcpip.NICID(req.InterfaceIndex), 2248 MulticastAddr: tcpip.AddrFrom16(req.MulticastAddr), 2249 
})) 2250 2251 case linux.IPV6_IPSEC_POLICY, 2252 linux.IPV6_JOIN_ANYCAST, 2253 linux.IPV6_LEAVE_ANYCAST, 2254 // TODO(b/148887420): Add support for IPV6_PKTINFO. 2255 linux.IPV6_PKTINFO, 2256 linux.IPV6_ROUTER_ALERT, 2257 linux.IPV6_XFRM_POLICY, 2258 linux.MCAST_BLOCK_SOURCE, 2259 linux.MCAST_JOIN_GROUP, 2260 linux.MCAST_JOIN_SOURCE_GROUP, 2261 linux.MCAST_LEAVE_GROUP, 2262 linux.MCAST_LEAVE_SOURCE_GROUP, 2263 linux.MCAST_UNBLOCK_SOURCE: 2264 // Not supported. 2265 2266 case linux.IPV6_RECVORIGDSTADDR: 2267 if len(optVal) < sizeOfInt32 { 2268 return syserr.ErrInvalidArgument 2269 } 2270 v := int32(hostarch.ByteOrder.Uint32(optVal)) 2271 2272 ep.SocketOptions().SetReceiveOriginalDstAddress(v != 0) 2273 return nil 2274 2275 case linux.IPV6_RECVPKTINFO: 2276 if len(optVal) < sizeOfInt32 { 2277 return syserr.ErrInvalidArgument 2278 } 2279 v := int32(hostarch.ByteOrder.Uint32(optVal)) 2280 2281 ep.SocketOptions().SetIPv6ReceivePacketInfo(v != 0) 2282 return nil 2283 2284 case linux.IPV6_UNICAST_HOPS: 2285 if len(optVal) < sizeOfInt32 { 2286 return syserr.ErrInvalidArgument 2287 } 2288 v := int32(hostarch.ByteOrder.Uint32(optVal)) 2289 if v < -1 || v > 255 { 2290 return syserr.ErrInvalidArgument 2291 } 2292 return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6HopLimitOption, int(v))) 2293 2294 case linux.IPV6_RECVHOPLIMIT: 2295 v, err := parseIntOrChar(optVal) 2296 if err != nil { 2297 return err 2298 } 2299 2300 ep.SocketOptions().SetReceiveHopLimit(v != 0) 2301 return nil 2302 2303 case linux.IPV6_TCLASS: 2304 if len(optVal) < sizeOfInt32 { 2305 return syserr.ErrInvalidArgument 2306 } 2307 v := int32(hostarch.ByteOrder.Uint32(optVal)) 2308 if v < -1 || v > 255 { 2309 return syserr.ErrInvalidArgument 2310 } 2311 if v == -1 { 2312 v = 0 2313 } 2314 return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6TrafficClassOption, int(v))) 2315 2316 case linux.IPV6_RECVTCLASS: 2317 v, err := parseIntOrChar(optVal) 2318 if err != nil { 2319 return err 2320 } 
2321 2322 ep.SocketOptions().SetReceiveTClass(v != 0) 2323 return nil 2324 case linux.IPV6_RECVERR: 2325 if len(optVal) == 0 { 2326 return nil 2327 } 2328 v, err := parseIntOrChar(optVal) 2329 if err != nil { 2330 return err 2331 } 2332 ep.SocketOptions().SetIPv6RecvError(v != 0) 2333 return nil 2334 2335 case linux.IP6T_SO_SET_REPLACE: 2336 if len(optVal) < linux.SizeOfIP6TReplace { 2337 return syserr.ErrInvalidArgument 2338 } 2339 2340 // Only valid for raw IPv6 sockets. 2341 if !socket.IsRaw(s) { 2342 return syserr.ErrProtocolNotAvailable 2343 } 2344 2345 stk := inet.StackFromContext(t) 2346 if stk == nil { 2347 return syserr.ErrNoDevice 2348 } 2349 // Stack must be a netstack stack. 2350 return netfilter.SetEntries(t.Credentials().UserNamespace, stk.(*Stack).Stack, optVal, true) 2351 2352 case linux.IP6T_SO_SET_ADD_COUNTERS: 2353 log.Infof("IP6T_SO_SET_ADD_COUNTERS is not supported") 2354 return nil 2355 } 2356 2357 return nil 2358 } 2359 2360 var ( 2361 inetMulticastRequestSize = (*linux.InetMulticastRequest)(nil).SizeBytes() 2362 inetMulticastRequestWithNICSize = (*linux.InetMulticastRequestWithNIC)(nil).SizeBytes() 2363 inet6MulticastRequestSize = (*linux.Inet6MulticastRequest)(nil).SizeBytes() 2364 ) 2365 2366 // copyInMulticastRequest copies in a variable-size multicast request. The 2367 // kernel determines which structure was passed by its length. IP_MULTICAST_IF 2368 // supports ip_mreqn, ip_mreq and in_addr, while IP_ADD_MEMBERSHIP and 2369 // IP_DROP_MEMBERSHIP only support ip_mreqn and ip_mreq. To handle this, 2370 // allowAddr controls whether in_addr is accepted or rejected. 
2371 func copyInMulticastRequest(optVal []byte, allowAddr bool) (linux.InetMulticastRequestWithNIC, *syserr.Error) { 2372 if len(optVal) < len(linux.InetAddr{}) { 2373 return linux.InetMulticastRequestWithNIC{}, syserr.ErrInvalidArgument 2374 } 2375 2376 if len(optVal) < inetMulticastRequestSize { 2377 if !allowAddr { 2378 return linux.InetMulticastRequestWithNIC{}, syserr.ErrInvalidArgument 2379 } 2380 2381 var req linux.InetMulticastRequestWithNIC 2382 copy(req.InterfaceAddr[:], optVal) 2383 return req, nil 2384 } 2385 2386 if len(optVal) >= inetMulticastRequestWithNICSize { 2387 var req linux.InetMulticastRequestWithNIC 2388 req.UnmarshalUnsafe(optVal) 2389 return req, nil 2390 } 2391 2392 var req linux.InetMulticastRequestWithNIC 2393 req.InetMulticastRequest.UnmarshalUnsafe(optVal) 2394 return req, nil 2395 } 2396 2397 func copyInMulticastV6Request(optVal []byte) (linux.Inet6MulticastRequest, *syserr.Error) { 2398 if len(optVal) < inet6MulticastRequestSize { 2399 return linux.Inet6MulticastRequest{}, syserr.ErrInvalidArgument 2400 } 2401 2402 var req linux.Inet6MulticastRequest 2403 req.UnmarshalUnsafe(optVal) 2404 return req, nil 2405 } 2406 2407 // parseIntOrChar copies either a 32-bit int or an 8-bit uint out of buf. 2408 // 2409 // net/ipv4/ip_sockglue.c:do_ip_setsockopt does this for its socket options. 2410 func parseIntOrChar(buf []byte) (int32, *syserr.Error) { 2411 if len(buf) == 0 { 2412 return 0, syserr.ErrInvalidArgument 2413 } 2414 2415 if len(buf) >= sizeOfInt32 { 2416 return int32(hostarch.ByteOrder.Uint32(buf)), nil 2417 } 2418 2419 return int32(buf[0]), nil 2420 } 2421 2422 // setSockOptIP implements SetSockOpt when level is SOL_IP. 
2423 func setSockOptIP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error { 2424 if _, ok := ep.(tcpip.Endpoint); !ok { 2425 log.Warningf("SOL_IP options not supported on endpoints other than tcpip.Endpoint: option = %d", name) 2426 return syserr.ErrUnknownProtocolOption 2427 } 2428 2429 switch name { 2430 case linux.IP_MULTICAST_TTL: 2431 v, err := parseIntOrChar(optVal) 2432 if err != nil { 2433 return err 2434 } 2435 2436 if v == -1 { 2437 // Linux translates -1 to 1. 2438 v = 1 2439 } 2440 if v < 0 || v > 255 { 2441 return syserr.ErrInvalidArgument 2442 } 2443 return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MulticastTTLOption, int(v))) 2444 2445 case linux.IP_ADD_MEMBERSHIP: 2446 req, err := copyInMulticastRequest(optVal, false /* allowAddr */) 2447 if err != nil { 2448 return err 2449 } 2450 2451 return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.AddMembershipOption{ 2452 NIC: tcpip.NICID(req.InterfaceIndex), 2453 // TODO(igudger): Change AddMembership to use the standard 2454 // any address representation. 2455 InterfaceAddr: tcpip.AddrFrom4(req.InterfaceAddr), 2456 MulticastAddr: tcpip.AddrFrom4(req.MulticastAddr), 2457 })) 2458 2459 case linux.IP_DROP_MEMBERSHIP: 2460 req, err := copyInMulticastRequest(optVal, false /* allowAddr */) 2461 if err != nil { 2462 return err 2463 } 2464 2465 return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.RemoveMembershipOption{ 2466 NIC: tcpip.NICID(req.InterfaceIndex), 2467 // TODO(igudger): Change DropMembership to use the standard 2468 // any address representation. 
2469 InterfaceAddr: tcpip.AddrFrom4(req.InterfaceAddr), 2470 MulticastAddr: tcpip.AddrFrom4(req.MulticastAddr), 2471 })) 2472 2473 case linux.IP_MULTICAST_IF: 2474 req, err := copyInMulticastRequest(optVal, true /* allowAddr */) 2475 if err != nil { 2476 return err 2477 } 2478 2479 return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.MulticastInterfaceOption{ 2480 NIC: tcpip.NICID(req.InterfaceIndex), 2481 InterfaceAddr: socket.BytesToIPAddress(req.InterfaceAddr[:]), 2482 })) 2483 2484 case linux.IP_MULTICAST_LOOP: 2485 v, err := parseIntOrChar(optVal) 2486 if err != nil { 2487 return err 2488 } 2489 2490 ep.SocketOptions().SetMulticastLoop(v != 0) 2491 return nil 2492 2493 case linux.MCAST_JOIN_GROUP: 2494 // FIXME(b/124219304): Implement MCAST_JOIN_GROUP. 2495 return syserr.ErrInvalidArgument 2496 2497 case linux.IP_TTL: 2498 v, err := parseIntOrChar(optVal) 2499 if err != nil { 2500 return err 2501 } 2502 2503 // -1 means default TTL. 2504 if v == -1 { 2505 v = 0 2506 } else if v < 1 || v > 255 { 2507 return syserr.ErrInvalidArgument 2508 } 2509 return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv4TTLOption, int(v))) 2510 2511 case linux.IP_RECVTTL: 2512 v, err := parseIntOrChar(optVal) 2513 if err != nil { 2514 return err 2515 } 2516 ep.SocketOptions().SetReceiveTTL(v != 0) 2517 return nil 2518 2519 case linux.IP_TOS: 2520 if len(optVal) == 0 { 2521 return nil 2522 } 2523 v, err := parseIntOrChar(optVal) 2524 if err != nil { 2525 return err 2526 } 2527 return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv4TOSOption, int(v))) 2528 2529 case linux.IP_RECVTOS: 2530 v, err := parseIntOrChar(optVal) 2531 if err != nil { 2532 return err 2533 } 2534 ep.SocketOptions().SetReceiveTOS(v != 0) 2535 return nil 2536 2537 case linux.IP_RECVERR: 2538 if len(optVal) == 0 { 2539 return nil 2540 } 2541 v, err := parseIntOrChar(optVal) 2542 if err != nil { 2543 return err 2544 } 2545 ep.SocketOptions().SetIPv4RecvError(v != 0) 2546 return nil 2547 2548 
case linux.IP_PKTINFO: 2549 if len(optVal) == 0 { 2550 return nil 2551 } 2552 v, err := parseIntOrChar(optVal) 2553 if err != nil { 2554 return err 2555 } 2556 ep.SocketOptions().SetReceivePacketInfo(v != 0) 2557 return nil 2558 2559 case linux.IP_HDRINCL: 2560 if len(optVal) == 0 { 2561 return nil 2562 } 2563 v, err := parseIntOrChar(optVal) 2564 if err != nil { 2565 return err 2566 } 2567 ep.SocketOptions().SetHeaderIncluded(v != 0) 2568 return nil 2569 2570 case linux.IP_RECVORIGDSTADDR: 2571 if len(optVal) == 0 { 2572 return nil 2573 } 2574 v, err := parseIntOrChar(optVal) 2575 if err != nil { 2576 return err 2577 } 2578 2579 ep.SocketOptions().SetReceiveOriginalDstAddress(v != 0) 2580 return nil 2581 2582 case linux.IPT_SO_SET_REPLACE: 2583 if len(optVal) < linux.SizeOfIPTReplace { 2584 return syserr.ErrInvalidArgument 2585 } 2586 2587 // Only valid for raw IPv4 sockets. 2588 if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW { 2589 return syserr.ErrProtocolNotAvailable 2590 } 2591 2592 stk := inet.StackFromContext(t) 2593 if stk == nil { 2594 return syserr.ErrNoDevice 2595 } 2596 // Stack must be a netstack stack. 
2597 return netfilter.SetEntries(t.Credentials().UserNamespace, stk.(*Stack).Stack, optVal, false) 2598 2599 case linux.IPT_SO_SET_ADD_COUNTERS: 2600 log.Infof("IPT_SO_SET_ADD_COUNTERS is not supported") 2601 return nil 2602 2603 case linux.IP_MTU_DISCOVER: 2604 if len(optVal) == 0 { 2605 return nil 2606 } 2607 v, err := parseIntOrChar(optVal) 2608 if err != nil { 2609 return err 2610 } 2611 switch v { 2612 case linux.IP_PMTUDISC_DONT: 2613 v = int32(tcpip.PMTUDiscoveryDont) 2614 case linux.IP_PMTUDISC_WANT: 2615 v = int32(tcpip.PMTUDiscoveryWant) 2616 case linux.IP_PMTUDISC_DO: 2617 v = int32(tcpip.PMTUDiscoveryDo) 2618 case linux.IP_PMTUDISC_PROBE: 2619 v = int32(tcpip.PMTUDiscoveryProbe) 2620 default: 2621 return syserr.ErrNotSupported 2622 } 2623 return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MTUDiscoverOption, int(v))) 2624 2625 case linux.IP_ADD_SOURCE_MEMBERSHIP, 2626 linux.IP_BIND_ADDRESS_NO_PORT, 2627 linux.IP_BLOCK_SOURCE, 2628 linux.IP_CHECKSUM, 2629 linux.IP_DROP_SOURCE_MEMBERSHIP, 2630 linux.IP_FREEBIND, 2631 linux.IP_IPSEC_POLICY, 2632 linux.IP_MINTTL, 2633 linux.IP_MSFILTER, 2634 linux.IP_MULTICAST_ALL, 2635 linux.IP_NODEFRAG, 2636 linux.IP_OPTIONS, 2637 linux.IP_PASSSEC, 2638 linux.IP_RECVFRAGSIZE, 2639 linux.IP_RECVOPTS, 2640 linux.IP_RETOPTS, 2641 linux.IP_TRANSPARENT, 2642 linux.IP_UNBLOCK_SOURCE, 2643 linux.IP_UNICAST_IF, 2644 linux.IP_XFRM_POLICY, 2645 linux.MCAST_BLOCK_SOURCE, 2646 linux.MCAST_JOIN_SOURCE_GROUP, 2647 linux.MCAST_LEAVE_GROUP, 2648 linux.MCAST_LEAVE_SOURCE_GROUP, 2649 linux.MCAST_MSFILTER, 2650 linux.MCAST_UNBLOCK_SOURCE: 2651 // Not supported. 2652 } 2653 2654 return nil 2655 } 2656 2657 // GetSockName implements the linux syscall getsockname(2) for sockets backed by 2658 // tcpip.Endpoint. 
2659 func (s *sock) GetSockName(*kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { 2660 addr, err := s.Endpoint.GetLocalAddress() 2661 if err != nil { 2662 return nil, 0, syserr.TranslateNetstackError(err) 2663 } 2664 2665 a, l := socket.ConvertAddress(s.family, addr) 2666 return a, l, nil 2667 } 2668 2669 // GetPeerName implements the linux syscall getpeername(2) for sockets backed by 2670 // tcpip.Endpoint. 2671 func (s *sock) GetPeerName(*kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { 2672 addr, err := s.Endpoint.GetRemoteAddress() 2673 if err != nil { 2674 return nil, 0, syserr.TranslateNetstackError(err) 2675 } 2676 2677 a, l := socket.ConvertAddress(s.family, addr) 2678 return a, l, nil 2679 } 2680 2681 func (s *sock) fillCmsgInq(cmsg *socket.ControlMessages) { 2682 if !s.sockOptInq { 2683 return 2684 } 2685 rcvBufUsed, err := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption) 2686 if err != nil { 2687 return 2688 } 2689 cmsg.IP.HasInq = true 2690 cmsg.IP.Inq = int32(rcvBufUsed) 2691 } 2692 2693 func toLinuxPacketType(pktType tcpip.PacketType) uint8 { 2694 switch pktType { 2695 case tcpip.PacketHost: 2696 return linux.PACKET_HOST 2697 case tcpip.PacketOtherHost: 2698 return linux.PACKET_OTHERHOST 2699 case tcpip.PacketOutgoing: 2700 return linux.PACKET_OUTGOING 2701 case tcpip.PacketBroadcast: 2702 return linux.PACKET_BROADCAST 2703 case tcpip.PacketMulticast: 2704 return linux.PACKET_MULTICAST 2705 default: 2706 panic(fmt.Sprintf("unknown packet type: %d", pktType)) 2707 } 2708 } 2709 2710 // nonBlockingRead issues a non-blocking read. 2711 // 2712 // TODO(b/78348848): Support timestamps for stream sockets. 
func (s *sock) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
	// The results are, in order: the byte count reported to the caller, the
	// message flags, the sender address and its length (only filled in for
	// packet-based sockets when senderRequested is set), the control
	// messages, and the error.
	isPacket := s.isPacketBased()

	readOptions := tcpip.ReadOptions{
		Peek:               peek,
		NeedRemoteAddr:     senderRequested,
		NeedLinkPacketInfo: isPacket,
	}

	// TCP sockets discard the data if MSG_TRUNC is set.
	//
	// This behavior is documented in man 7 tcp:
	// Since version 2.4, Linux supports the use of MSG_TRUNC in the flags
	// argument of recv(2) (and recvmsg(2)). This flag causes the received
	// bytes of data to be discarded, rather than passed back in a
	// caller-supplied buffer.
	var w io.Writer
	var res tcpip.ReadResult
	var err tcpip.Error

	// readMu is held for the rest of the function; updateTimestamp below
	// requires it.
	s.readMu.Lock()
	defer s.readMu.Unlock()

	if !isPacket && trunc {
		// Non-packet socket with MSG_TRUNC: read into a discarding writer
		// capped at the size of the caller's buffer, so dst is never written.
		w = &tcpip.LimitedWriter{
			W: ioutil.Discard,
			N: dst.NumBytes(),
		}
		res, err = s.Endpoint.Read(w, readOptions)
	} else {
		switch s.Endpoint.(type) {
		case *tcp.Endpoint:
			// TCP endpoints reuse the socket's readWriter, which is
			// re-initialized and used under s.mu.
			s.mu.Lock()
			s.readWriter.Init(ctx, dst)
			res, err = s.Endpoint.Read(&s.readWriter, readOptions)
			s.mu.Unlock()
		default:
			res, err = s.Endpoint.Read(dst.Writer(ctx), readOptions)
		}
	}

	// A bad-buffer error on a zero-length destination is not treated as a
	// failure.
	if _, ok := err.(*tcpip.ErrBadBuffer); ok && dst.NumBytes() == 0 {
		err = nil
	}
	if err != nil {
		return 0, 0, nil, 0, socket.ControlMessages{}, syserr.TranslateNetstackError(err)
	}
	// Set the control message, even if 0 bytes were read.
	s.updateTimestamp(res.ControlMessages)

	if isPacket {
		var addr linux.SockAddr
		var addrLen uint32
		if senderRequested {
			addr, addrLen = socket.ConvertAddress(s.family, res.RemoteAddr)
			// Link-layer addresses additionally carry the protocol and
			// packet type taken from the link packet info.
			switch v := addr.(type) {
			case *linux.SockAddrLink:
				v.Protocol = socket.Htons(uint16(res.LinkPacketInfo.Protocol))
				v.PacketType = toLinuxPacketType(res.LinkPacketInfo.PktType)
			}
		}

		// With MSG_TRUNC the reported length is the full message length
		// rather than the number of bytes actually copied.
		msgLen := res.Count
		if trunc {
			msgLen = res.Total
		}

		// Report truncation whenever the message was larger than what was
		// copied out.
		var flags int
		if res.Total > res.Count {
			flags |= linux.MSG_TRUNC
		}

		return msgLen, flags, addr, addrLen, s.netstackToLinuxControlMessages(res.ControlMessages), nil
	}

	if peek {
		// MSG_TRUNC with MSG_PEEK on a TCP socket returns the
		// amount that could be read, and does not write to buffer.
		if trunc {
			// TCP endpoint does not return the total bytes in buffer as numTotal.
			// We need to query it from socket option.
			rql, err := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption)
			if err != nil {
				return 0, 0, nil, 0, socket.ControlMessages{}, syserr.TranslateNetstackError(err)
			}
			// The result is the smaller of the buffer size and the queue
			// size.
			msgLen := int(dst.NumBytes())
			if msgLen > rql {
				msgLen = rql
			}
			return msgLen, 0, nil, 0, socket.ControlMessages{}, nil
		}
	} else if n := res.Count; n != 0 {
		// Data was consumed (not peeked): report the consumed byte count to
		// the endpoint via ModerateRecvBuf.
		s.Endpoint.ModerateRecvBuf(n)
	}

	cmsg := s.netstackToLinuxControlMessages(res.ControlMessages)
	s.fillCmsgInq(&cmsg)
	// err is nil on this path, so the translated error is nil as well.
	return res.Count, 0, nil, 0, cmsg, syserr.TranslateNetstackError(err)
}

// netstackToLinuxControlMessages converts the control messages received from
// netstack into socket.ControlMessages. The fields are copied from
// socket.NewIPControlMessages, except that the timestamp is only marked as
// present when s.sockOptTimestamp is set.
func (s *sock) netstackToLinuxControlMessages(cm tcpip.ReceivableControlMessages) socket.ControlMessages {
	readCM := socket.NewIPControlMessages(s.family, cm)
	return socket.ControlMessages{
		IP: socket.IPControlMessages{
			HasTimestamp:       readCM.HasTimestamp && s.sockOptTimestamp,
			Timestamp:          readCM.Timestamp,
			HasInq:             readCM.HasInq,
			Inq:                readCM.Inq,
			HasTOS:             readCM.HasTOS,
			TOS:                readCM.TOS,
			HasTClass:          readCM.HasTClass,
			TClass:             readCM.TClass,
			HasTTL:             readCM.HasTTL,
			TTL:                readCM.TTL,
			HasHopLimit:        readCM.HasHopLimit,
			HopLimit:           readCM.HopLimit,
			HasIPPacketInfo:    readCM.HasIPPacketInfo,
			PacketInfo:         readCM.PacketInfo,
			HasIPv6PacketInfo:  readCM.HasIPv6PacketInfo,
			IPv6PacketInfo:     readCM.IPv6PacketInfo,
			OriginalDstAddress: readCM.OriginalDstAddress,
			SockErr:            readCM.SockErr,
		},
	}
}

// linuxToNetstackControlMessages converts the control messages supplied with
// a send into their netstack form. Only the TTL and hop limit fields are
// carried over; both values are narrowed to uint8.
func (s *sock) linuxToNetstackControlMessages(cm socket.ControlMessages) tcpip.SendableControlMessages {
	return tcpip.SendableControlMessages{
		HasTTL:      cm.IP.HasTTL,
		TTL:         uint8(cm.IP.TTL),
		HasHopLimit: cm.IP.HasHopLimit,
		HopLimit:    uint8(cm.IP.HopLimit),
	}
}

// updateTimestamp sets the timestamp for SIOCGSTAMP. It should be called after
// successfully writing packet data out to userspace.
//
// Precondition: s.readMu must be locked.
func (s *sock) updateTimestamp(cm tcpip.ReceivableControlMessages) {
	// Save the SIOCGSTAMP timestamp only if SO_TIMESTAMP is disabled.
	if !s.sockOptTimestamp {
		s.timestampValid = true
		s.timestamp = cm.Timestamp
	}
}

// dequeueErr is analogous to net/core/skbuff.c:sock_dequeue_err_skb().
//
// It removes and returns the next queued socket error, or nil when the queue
// is empty.
func (s *sock) dequeueErr() *tcpip.SockError {
	so := s.Endpoint.SocketOptions()
	err := so.DequeueErr()
	if err == nil {
		return nil
	}

	// Update socket error to reflect ICMP errors in queue.
	if nextErr := so.PeekErr(); nextErr != nil && nextErr.Cause.Origin().IsICMPErr() {
		// The next queued error is an ICMP error: make it the last error.
		so.SetLastError(nextErr.Err)
	} else if err.Cause.Origin().IsICMPErr() {
		// The dequeued error was an ICMP error and the next queued error,
		// if any, is not: clear the last error.
		so.SetLastError(nil)
	}
	return err
}

// addrFamilyFromNetProto returns the address family identifier for the given
// network protocol.
2879 func addrFamilyFromNetProto(net tcpip.NetworkProtocolNumber) int { 2880 switch net { 2881 case header.IPv4ProtocolNumber: 2882 return linux.AF_INET 2883 case header.IPv6ProtocolNumber: 2884 return linux.AF_INET6 2885 default: 2886 panic(fmt.Sprintf("invalid net proto for addr family inference: %d", net)) 2887 } 2888 } 2889 2890 // recvErr handles MSG_ERRQUEUE for recvmsg(2). 2891 // This is analogous to net/ipv4/ip_sockglue.c:ip_recv_error(). 2892 func (s *sock) recvErr(t *kernel.Task, dst usermem.IOSequence) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { 2893 sockErr := s.dequeueErr() 2894 if sockErr == nil { 2895 return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain 2896 } 2897 if sockErr.Payload != nil { 2898 defer sockErr.Payload.Release() 2899 } 2900 2901 // The payload of the original packet that caused the error is passed as 2902 // normal data via msg_iovec. -- recvmsg(2) 2903 msgFlags := linux.MSG_ERRQUEUE 2904 if int(dst.NumBytes()) < sockErr.Payload.Size() { 2905 msgFlags |= linux.MSG_TRUNC 2906 } 2907 n, err := dst.CopyOut(t, sockErr.Payload.AsSlice()) 2908 2909 // The original destination address of the datagram that caused the error is 2910 // supplied via msg_name. -- recvmsg(2) 2911 dstAddr, dstAddrLen := socket.ConvertAddress(addrFamilyFromNetProto(sockErr.NetProto), sockErr.Dst) 2912 cmgs := socket.ControlMessages{IP: socket.NewIPControlMessages(s.family, tcpip.ReceivableControlMessages{SockErr: sockErr})} 2913 return n, msgFlags, dstAddr, dstAddrLen, cmgs, syserr.FromError(err) 2914 } 2915 2916 // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by 2917 // tcpip.Endpoint. 
func (s *sock) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, _ uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) {
	// MSG_ERRQUEUE reads from the socket error queue instead of the data
	// stream.
	if flags&linux.MSG_ERRQUEUE != 0 {
		return s.recvErr(t, dst)
	}

	trunc := flags&linux.MSG_TRUNC != 0
	peek := flags&linux.MSG_PEEK != 0
	dontWait := flags&linux.MSG_DONTWAIT != 0
	waitAll := flags&linux.MSG_WAITALL != 0
	if senderRequested && !s.isPacketBased() {
		// Stream sockets ignore the sender address.
		senderRequested = false
	}
	// First attempt: try to satisfy the request without blocking.
	n, msgFlags, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested)

	if s.isPacketBased() && err == syserr.ErrClosedForReceive && flags&linux.MSG_DONTWAIT != 0 {
		// In this situation we should return EAGAIN.
		return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain
	}

	if err != nil && (err != syserr.ErrWouldBlock || dontWait) {
		// Read failed and we should not retry.
		return 0, 0, nil, 0, socket.ControlMessages{}, err
	}

	if err == nil && (dontWait || !waitAll || s.isPacketBased() || int64(n) >= dst.NumBytes()) {
		// We got all the data we need.
		return
	}

	// Don't overwrite any data we received.
	dst = dst.DropFirst(n)

	// We'll have to block. Register for notifications and keep trying to
	// receive all the data.
	e, ch := waiter.NewChannelEntry(waiter.ReadableEvents)
	s.EventRegister(&e)
	defer s.EventUnregister(&e)

	for {
		// rn is the byte count of this iteration only; n accumulates the
		// total across all reads.
		var rn int
		rn, msgFlags, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested)
		n += rn
		if err != nil && err != syserr.ErrWouldBlock {
			// Always stop on errors other than would block as we generally
			// won't be able to get any more data. Eat the error if we got
			// any data.
			if n > 0 {
				err = nil
			}
			return
		}
		if err == nil && (s.isPacketBased() || !waitAll || int64(rn) >= dst.NumBytes()) {
			// We got all the data we need.
			return
		}
		// Advance past what this iteration delivered before retrying.
		dst = dst.DropFirst(rn)

		if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
			// The wait failed. Partial data wins over the error.
			if n > 0 {
				return n, msgFlags, senderAddr, senderAddrLen, controlMessages, nil
			}
			// A timeout is reported to the caller as EAGAIN.
			if linuxerr.Equals(linuxerr.ETIMEDOUT, err) {
				return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain
			}
			return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err)
		}
	}
}

// SendMsg implements the linux syscall sendmsg(2) for sockets backed by
// tcpip.Endpoint.
//
// It returns the number of bytes written so far together with the error, if
// any, from the last write or wait.
func (s *sock) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) {
	// Reject Unix control messages.
	if !controlMessages.Unix.Empty() {
		return 0, syserr.ErrInvalidArgument
	}

	// addr stays nil when no destination was supplied.
	var addr *tcpip.FullAddress
	if len(to) > 0 {
		addrBuf, family, err := socket.AddressAndFamily(to)
		if err != nil {
			return 0, err
		}
		if !s.checkFamily(family, false /* exact */) {
			return 0, syserr.ErrInvalidArgument
		}
		addrBuf = s.mapFamily(addrBuf, family)

		addr = &addrBuf
	}

	opts := tcpip.WriteOptions{
		To:              addr,
		More:            flags&linux.MSG_MORE != 0,
		EndOfRecord:     flags&linux.MSG_EOR != 0,
		ControlMessages: s.linuxToNetstackControlMessages(controlMessages),
	}

	r := src.Reader(t)
	var (
		total int64
		entry waiter.Entry
		ch    <-chan struct{}
	)
	for {
		n, err := s.Endpoint.Write(r, opts)
		total += n
		// With MSG_DONTWAIT a single write attempt is made and its result is
		// returned as is.
		if flags&linux.MSG_DONTWAIT != 0 {
			return int(total), syserr.TranslateNetstackError(err)
		}
		// Decide whether to wait and retry: after a successful but partial
		// write, or when the endpoint would block. Any other error ends the
		// loop.
		block := true
		switch err.(type) {
		case nil:
			block = total != src.NumBytes()
		case *tcpip.ErrWouldBlock:
		default:
			block = false
		}
		if block {
			if ch == nil {
				// We'll have to block. Register for notification and keep trying to
				// send all the data.
				entry, ch = waiter.NewChannelEntry(waiter.WritableEvents)
				s.EventRegister(&entry)
				defer s.EventUnregister(&entry)
			} else {
				// Don't wait immediately after registration in case more data
				// became available between when we last checked and when we setup
				// the notification.
				if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
					// A timeout is reported to the caller as EAGAIN.
					if linuxerr.Equals(linuxerr.ETIMEDOUT, err) {
						return int(total), syserr.ErrTryAgain
					}
					// handleIOError will consume errors from t.Block if needed.
					return int(total), syserr.FromError(err)
				}
			}
			continue
		}
		return int(total), syserr.TranslateNetstackError(err)
	}
}

// Ioctl implements vfs.FileDescriptionImpl.
3063 func (s *sock) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { 3064 t := kernel.TaskFromContext(ctx) 3065 if t == nil { 3066 panic("ioctl(2) may only be called from a task goroutine") 3067 } 3068 3069 // SIOCGSTAMP is implemented by netstack rather than all commonEndpoint 3070 // sockets. 3071 // TODO(b/78348848): Add a commonEndpoint method to support SIOCGSTAMP. 3072 switch args[1].Int() { 3073 case linux.SIOCGSTAMP: 3074 s.readMu.Lock() 3075 defer s.readMu.Unlock() 3076 if !s.timestampValid { 3077 return 0, linuxerr.ENOENT 3078 } 3079 3080 tv := linux.NsecToTimeval(s.timestamp.UnixNano()) 3081 _, err := tv.CopyOut(t, args[2].Pointer()) 3082 return 0, err 3083 3084 case linux.TIOCINQ: 3085 v, terr := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption) 3086 if terr != nil { 3087 return 0, syserr.TranslateNetstackError(terr).ToError() 3088 } 3089 3090 if v > math.MaxInt32 { 3091 v = math.MaxInt32 3092 } 3093 3094 // Copy result to userspace. 3095 vP := primitive.Int32(v) 3096 _, err := vP.CopyOut(t, args[2].Pointer()) 3097 return 0, err 3098 } 3099 3100 return Ioctl(ctx, s.Endpoint, uio, sysno, args) 3101 } 3102 3103 // Ioctl performs a socket ioctl. 
3104 func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { 3105 t := kernel.TaskFromContext(ctx) 3106 if t == nil { 3107 panic("ioctl(2) may only be called from a task goroutine") 3108 } 3109 3110 switch arg := int(args[1].Int()); arg { 3111 case linux.SIOCGIFFLAGS, 3112 linux.SIOCGIFADDR, 3113 linux.SIOCGIFBRDADDR, 3114 linux.SIOCGIFDSTADDR, 3115 linux.SIOCGIFHWADDR, 3116 linux.SIOCGIFINDEX, 3117 linux.SIOCGIFMAP, 3118 linux.SIOCGIFMETRIC, 3119 linux.SIOCGIFMTU, 3120 linux.SIOCGIFNAME, 3121 linux.SIOCGIFNETMASK, 3122 linux.SIOCGIFTXQLEN, 3123 linux.SIOCETHTOOL: 3124 3125 var ifr linux.IFReq 3126 if _, err := ifr.CopyIn(t, args[2].Pointer()); err != nil { 3127 return 0, err 3128 } 3129 if err := interfaceIoctl(ctx, io, arg, &ifr); err != nil { 3130 return 0, err.ToError() 3131 } 3132 _, err := ifr.CopyOut(t, args[2].Pointer()) 3133 return 0, err 3134 3135 case linux.SIOCGIFCONF: 3136 // Return a list of interface addresses or the buffer size 3137 // necessary to hold the list. 3138 var ifc linux.IFConf 3139 if _, err := ifc.CopyIn(t, args[2].Pointer()); err != nil { 3140 return 0, err 3141 } 3142 3143 if err := ifconfIoctl(ctx, t, io, &ifc); err != nil { 3144 return 0, err 3145 } 3146 3147 _, err := ifc.CopyOut(t, args[2].Pointer()) 3148 return 0, err 3149 3150 case linux.TIOCINQ: 3151 v, terr := ep.GetSockOptInt(tcpip.ReceiveQueueSizeOption) 3152 if terr != nil { 3153 return 0, syserr.TranslateNetstackError(terr).ToError() 3154 } 3155 3156 if v > math.MaxInt32 { 3157 v = math.MaxInt32 3158 } 3159 // Copy result to userspace. 
3160 vP := primitive.Int32(v) 3161 _, err := vP.CopyOut(t, args[2].Pointer()) 3162 return 0, err 3163 3164 case linux.TIOCOUTQ: 3165 v, terr := ep.GetSockOptInt(tcpip.SendQueueSizeOption) 3166 if terr != nil { 3167 return 0, syserr.TranslateNetstackError(terr).ToError() 3168 } 3169 3170 if v > math.MaxInt32 { 3171 v = math.MaxInt32 3172 } 3173 3174 // Copy result to userspace. 3175 vP := primitive.Int32(v) 3176 _, err := vP.CopyOut(t, args[2].Pointer()) 3177 return 0, err 3178 3179 case linux.SIOCGIFMEM, linux.SIOCGIFPFLAGS, linux.SIOCGMIIPHY, linux.SIOCGMIIREG: 3180 // Not supported. 3181 } 3182 3183 return 0, linuxerr.ENOTTY 3184 } 3185 3186 // interfaceIoctl implements interface requests. 3187 func interfaceIoctl(ctx context.Context, _ usermem.IO, arg int, ifr *linux.IFReq) *syserr.Error { 3188 var ( 3189 iface inet.Interface 3190 index int32 3191 found bool 3192 ) 3193 3194 // Find the relevant device. 3195 stk := inet.StackFromContext(ctx) 3196 if stk == nil { 3197 return syserr.ErrNoDevice 3198 } 3199 3200 // SIOCGIFNAME uses ifr.ifr_ifindex rather than ifr.ifr_name to 3201 // identify a device. 3202 if arg == linux.SIOCGIFNAME { 3203 // Gets the name of the interface given the interface index 3204 // stored in ifr_ifindex. 3205 index = int32(hostarch.ByteOrder.Uint32(ifr.Data[:4])) 3206 if iface, ok := stk.Interfaces()[index]; ok { 3207 ifr.SetName(iface.Name) 3208 return nil 3209 } 3210 return syserr.ErrNoDevice 3211 } 3212 3213 // Find the relevant device. 3214 for index, iface = range stk.Interfaces() { 3215 if iface.Name == ifr.Name() { 3216 found = true 3217 break 3218 } 3219 } 3220 if !found { 3221 return syserr.ErrNoDevice 3222 } 3223 3224 switch arg { 3225 case linux.SIOCGIFINDEX: 3226 // Copy out the index to the data. 3227 hostarch.ByteOrder.PutUint32(ifr.Data[:], uint32(index)) 3228 3229 case linux.SIOCGIFHWADDR: 3230 // Copy the hardware address out. 
3231 // 3232 // Refer: https://linux.die.net/man/7/netdevice 3233 // SIOCGIFHWADDR, SIOCSIFHWADDR 3234 // 3235 // Get or set the hardware address of a device using 3236 // ifr_hwaddr. The hardware address is specified in a struct 3237 // sockaddr. sa_family contains the ARPHRD_* device type, 3238 // sa_data the L2 hardware address starting from byte 0. Setting 3239 // the hardware address is a privileged operation. 3240 hostarch.ByteOrder.PutUint16(ifr.Data[:], iface.DeviceType) 3241 n := copy(ifr.Data[2:], iface.Addr) 3242 for i := 2 + n; i < len(ifr.Data); i++ { 3243 ifr.Data[i] = 0 // Clear padding. 3244 } 3245 3246 case linux.SIOCGIFFLAGS: 3247 f, err := interfaceStatusFlags(stk, iface.Name) 3248 if err != nil { 3249 return err 3250 } 3251 // Drop the flags that don't fit in the size that we need to return. This 3252 // matches Linux behavior. 3253 hostarch.ByteOrder.PutUint16(ifr.Data[:2], uint16(f)) 3254 3255 case linux.SIOCGIFADDR: 3256 // Copy the IPv4 address out. 3257 for _, addr := range stk.InterfaceAddrs()[index] { 3258 // This ioctl is only compatible with AF_INET addresses. 3259 if addr.Family != linux.AF_INET { 3260 continue 3261 } 3262 copy(ifr.Data[4:8], addr.Addr) 3263 break 3264 } 3265 3266 case linux.SIOCGIFMETRIC: 3267 // Gets the metric of the device. As per netdevice(7), this 3268 // always just sets ifr_metric to 0. 3269 hostarch.ByteOrder.PutUint32(ifr.Data[:4], 0) 3270 3271 case linux.SIOCGIFMTU: 3272 // Gets the MTU of the device. 3273 hostarch.ByteOrder.PutUint32(ifr.Data[:4], iface.MTU) 3274 3275 case linux.SIOCGIFMAP: 3276 // Gets the hardware parameters of the device. 3277 // TODO(gvisor.dev/issue/505): Implement. 3278 3279 case linux.SIOCGIFTXQLEN: 3280 // Gets the transmit queue length of the device. 3281 // TODO(gvisor.dev/issue/505): Implement. 3282 3283 case linux.SIOCGIFDSTADDR: 3284 // Gets the destination address of a point-to-point device. 3285 // TODO(gvisor.dev/issue/505): Implement. 
3286 3287 case linux.SIOCGIFBRDADDR: 3288 // Gets the broadcast address of a device. 3289 // TODO(gvisor.dev/issue/505): Implement. 3290 3291 case linux.SIOCGIFNETMASK: 3292 // Gets the network mask of a device. 3293 for _, addr := range stk.InterfaceAddrs()[index] { 3294 // This ioctl is only compatible with AF_INET addresses. 3295 if addr.Family != linux.AF_INET { 3296 continue 3297 } 3298 // Populate ifr.ifr_netmask (type sockaddr). 3299 hostarch.ByteOrder.PutUint16(ifr.Data[0:], uint16(linux.AF_INET)) 3300 hostarch.ByteOrder.PutUint16(ifr.Data[2:], 0) 3301 var mask uint32 = 0xffffffff << (32 - addr.PrefixLen) 3302 // Netmask is expected to be returned as a big endian 3303 // value. 3304 binary.BigEndian.PutUint32(ifr.Data[4:8], mask) 3305 break 3306 } 3307 3308 case linux.SIOCETHTOOL: 3309 // Stubbed out for now, Ideally we should implement the required 3310 // sub-commands for ETHTOOL 3311 // 3312 // See: 3313 // https://github.com/torvalds/linux/blob/aa0c9086b40c17a7ad94425b3b70dd1fdd7497bf/net/core/dev_ioctl.c 3314 return syserr.ErrEndpointOperation 3315 3316 default: 3317 // Not a valid call. 3318 return syserr.ErrInvalidArgument 3319 } 3320 3321 return nil 3322 } 3323 3324 // ifconfIoctl populates a struct ifconf for the SIOCGIFCONF ioctl. 3325 func ifconfIoctl(ctx context.Context, t *kernel.Task, _ usermem.IO, ifc *linux.IFConf) error { 3326 // If Ptr is NULL, return the necessary buffer size via Len. 3327 // Otherwise, write up to Len bytes starting at Ptr containing ifreq 3328 // structs. 3329 stk := inet.StackFromContext(ctx) 3330 if stk == nil { 3331 return syserr.ErrNoDevice.ToError() 3332 } 3333 3334 if ifc.Ptr == 0 { 3335 ifc.Len = int32(len(stk.Interfaces())) * int32(linux.SizeOfIFReq) 3336 return nil 3337 } 3338 3339 max := ifc.Len 3340 ifc.Len = 0 3341 for key, ifaceAddrs := range stk.InterfaceAddrs() { 3342 iface := stk.Interfaces()[key] 3343 for _, ifaceAddr := range ifaceAddrs { 3344 // Don't write past the end of the buffer. 
3345 if ifc.Len+int32(linux.SizeOfIFReq) > max { 3346 break 3347 } 3348 if ifaceAddr.Family != linux.AF_INET { 3349 continue 3350 } 3351 3352 // Populate ifr.ifr_addr. 3353 ifr := linux.IFReq{} 3354 ifr.SetName(iface.Name) 3355 hostarch.ByteOrder.PutUint16(ifr.Data[0:2], uint16(ifaceAddr.Family)) 3356 hostarch.ByteOrder.PutUint16(ifr.Data[2:4], 0) 3357 copy(ifr.Data[4:8], ifaceAddr.Addr[:4]) 3358 3359 // Copy the ifr to userspace. 3360 dst := uintptr(ifc.Ptr) + uintptr(ifc.Len) 3361 ifc.Len += int32(linux.SizeOfIFReq) 3362 if _, err := ifr.CopyOut(t, hostarch.Addr(dst)); err != nil { 3363 return err 3364 } 3365 } 3366 } 3367 return nil 3368 } 3369 3370 // interfaceStatusFlags returns status flags for an interface in the stack. 3371 // Flag values and meanings are described in greater detail in netdevice(7) in 3372 // the SIOCGIFFLAGS section. 3373 func interfaceStatusFlags(stack inet.Stack, name string) (uint32, *syserr.Error) { 3374 // We should only ever be passed a netstack.Stack. 3375 epstack, ok := stack.(*Stack) 3376 if !ok { 3377 return 0, errStackType 3378 } 3379 3380 // Find the NIC corresponding to this interface. 3381 for _, info := range epstack.Stack.NICInfo() { 3382 if info.Name == name { 3383 return nicStateFlagsToLinux(info.Flags), nil 3384 } 3385 } 3386 return 0, syserr.ErrNoDevice 3387 } 3388 3389 func nicStateFlagsToLinux(f stack.NICStateFlags) uint32 { 3390 var rv uint32 3391 if f.Up { 3392 rv |= linux.IFF_UP | linux.IFF_LOWER_UP 3393 } 3394 if f.Running { 3395 rv |= linux.IFF_RUNNING 3396 } 3397 if f.Promiscuous { 3398 rv |= linux.IFF_PROMISC 3399 } 3400 if f.Loopback { 3401 rv |= linux.IFF_LOOPBACK 3402 } 3403 return rv 3404 } 3405 3406 // State implements socket.Socket.State. State translates the internal state 3407 // returned by netstack to values defined by Linux. 3408 func (s *sock) State() uint32 { 3409 if s.family != linux.AF_INET && s.family != linux.AF_INET6 { 3410 // States not implemented for this socket's family. 
3411 return 0 3412 } 3413 3414 switch { 3415 case socket.IsTCP(s): 3416 // TCP socket. 3417 switch tcp.EndpointState(s.Endpoint.State()) { 3418 case tcp.StateEstablished: 3419 return linux.TCP_ESTABLISHED 3420 case tcp.StateSynSent: 3421 return linux.TCP_SYN_SENT 3422 case tcp.StateSynRecv: 3423 return linux.TCP_SYN_RECV 3424 case tcp.StateFinWait1: 3425 return linux.TCP_FIN_WAIT1 3426 case tcp.StateFinWait2: 3427 return linux.TCP_FIN_WAIT2 3428 case tcp.StateTimeWait: 3429 return linux.TCP_TIME_WAIT 3430 case tcp.StateClose, tcp.StateInitial, tcp.StateBound, tcp.StateConnecting, tcp.StateError: 3431 return linux.TCP_CLOSE 3432 case tcp.StateCloseWait: 3433 return linux.TCP_CLOSE_WAIT 3434 case tcp.StateLastAck: 3435 return linux.TCP_LAST_ACK 3436 case tcp.StateListen: 3437 return linux.TCP_LISTEN 3438 case tcp.StateClosing: 3439 return linux.TCP_CLOSING 3440 default: 3441 // Internal or unknown state. 3442 return 0 3443 } 3444 case socket.IsUDP(s): 3445 // UDP socket. 3446 switch transport.DatagramEndpointState(s.Endpoint.State()) { 3447 case transport.DatagramEndpointStateInitial, transport.DatagramEndpointStateBound, transport.DatagramEndpointStateClosed: 3448 return linux.TCP_CLOSE 3449 case transport.DatagramEndpointStateConnected: 3450 return linux.TCP_ESTABLISHED 3451 default: 3452 return 0 3453 } 3454 case socket.IsICMP(s): 3455 // We don't support this yet. 3456 case socket.IsRaw(s): 3457 // We don't support this yet. 3458 default: 3459 // Unknown transport protocol, how did we make this socket? 3460 log.Warningf("Unknown transport protocol for an existing socket: family=%v, type=%v, protocol=%v, internal type %v", s.family, s.skType, s.protocol, reflect.TypeOf(s.Endpoint).Elem()) 3461 return 0 3462 } 3463 3464 return 0 3465 } 3466 3467 // Type implements socket.Socket.Type. 3468 func (s *sock) Type() (family int, skType linux.SockType, protocol int) { 3469 return s.family, s.skType, s.protocol 3470 } 3471 3472 // EventRegister implements waiter.Waitable. 
func (s *sock) EventRegister(e *waiter.Entry) error {
	// Registration is delegated to the socket's waiter queue; this method
	// always returns nil.
	s.Queue.EventRegister(e)
	return nil
}

// EventUnregister implements waiter.Waitable.EventUnregister.
//
// It removes e from the socket's waiter queue.
func (s *sock) EventUnregister(e *waiter.Entry) {
	s.Queue.EventUnregister(e)
}