// github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/socket/netstack/netstack.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package netstack provides an implementation of the socket.Socket interface
// that is backed by a tcpip.Endpoint.
//
// It does not depend on any particular endpoint implementation, and thus can
// be used to expose certain endpoints to the sentry while leaving others out,
// for example, TCP endpoints and Unix-domain endpoints.
//
// Lock ordering: netstack => mm: ioSequenceReadWriter copies user memory inside
// tcpip.Endpoint.Write(). Netstack is allowed to (and does) hold locks during
// this operation.
25 package netstack 26 27 import ( 28 "bytes" 29 "encoding/binary" 30 "fmt" 31 "io" 32 "io/ioutil" 33 "math" 34 "reflect" 35 "time" 36 37 "golang.org/x/sys/unix" 38 "google.golang.org/protobuf/proto" 39 "github.com/metacubex/gvisor/pkg/abi/linux" 40 "github.com/metacubex/gvisor/pkg/abi/linux/errno" 41 "github.com/metacubex/gvisor/pkg/context" 42 "github.com/metacubex/gvisor/pkg/errors/linuxerr" 43 "github.com/metacubex/gvisor/pkg/eventchannel" 44 "github.com/metacubex/gvisor/pkg/hostarch" 45 "github.com/metacubex/gvisor/pkg/log" 46 "github.com/metacubex/gvisor/pkg/marshal" 47 "github.com/metacubex/gvisor/pkg/marshal/primitive" 48 "github.com/metacubex/gvisor/pkg/metric" 49 "github.com/metacubex/gvisor/pkg/sentry/arch" 50 "github.com/metacubex/gvisor/pkg/sentry/fsimpl/sockfs" 51 "github.com/metacubex/gvisor/pkg/sentry/inet" 52 "github.com/metacubex/gvisor/pkg/sentry/kernel" 53 "github.com/metacubex/gvisor/pkg/sentry/kernel/auth" 54 ktime "github.com/metacubex/gvisor/pkg/sentry/kernel/time" 55 "github.com/metacubex/gvisor/pkg/sentry/socket" 56 "github.com/metacubex/gvisor/pkg/sentry/socket/netfilter" 57 epb "github.com/metacubex/gvisor/pkg/sentry/socket/netstack/events_go_proto" 58 "github.com/metacubex/gvisor/pkg/sentry/vfs" 59 "github.com/metacubex/gvisor/pkg/sync" 60 "github.com/metacubex/gvisor/pkg/syserr" 61 "github.com/metacubex/gvisor/pkg/tcpip" 62 "github.com/metacubex/gvisor/pkg/tcpip/header" 63 "github.com/metacubex/gvisor/pkg/tcpip/stack" 64 "github.com/metacubex/gvisor/pkg/tcpip/transport" 65 "github.com/metacubex/gvisor/pkg/tcpip/transport/tcp" 66 "github.com/metacubex/gvisor/pkg/usermem" 67 "github.com/metacubex/gvisor/pkg/waiter" 68 ) 69 70 const bitsPerUint32 = 32 71 72 // statCounterValue returns a function usable as callback function when defining a gVisor Sentry 73 // metric that contains the value counted by the StatCounter. 74 // This avoids a dependency loop in the tcpip package. 
75 func statCounterValue(cm *tcpip.StatCounter) func(...*metric.FieldValue) uint64 { 76 return func(...*metric.FieldValue) uint64 { 77 return cm.Value() 78 } 79 } 80 81 func mustCreateMetric(name, description string) *tcpip.StatCounter { 82 var cm tcpip.StatCounter 83 metric.MustRegisterCustomUint64Metric(name, true /* cumulative */, false /* sync */, description, statCounterValue(&cm)) 84 return &cm 85 } 86 87 func mustCreateGauge(name, description string) *tcpip.StatCounter { 88 var cm tcpip.StatCounter 89 metric.MustRegisterCustomUint64Metric(name, false /* cumulative */, false /* sync */, description, statCounterValue(&cm)) 90 return &cm 91 } 92 93 // Metrics contains metrics exported by netstack. 94 var Metrics = tcpip.Stats{ 95 DroppedPackets: mustCreateMetric("/netstack/dropped_packets", "Number of packets dropped at the transport layer."), 96 NICs: tcpip.NICStats{ 97 MalformedL4RcvdPackets: mustCreateMetric("/netstack/nic/malformed_l4_received_packets", "Number of packets received that failed L4 header parsing."), 98 Tx: tcpip.NICPacketStats{ 99 Packets: mustCreateMetric("/netstack/nic/tx/packets", "Number of packets transmitted."), 100 Bytes: mustCreateMetric("/netstack/nic/tx/bytes", "Number of bytes transmitted."), 101 }, 102 TxPacketsDroppedNoBufferSpace: mustCreateMetric("/netstack/nic/tx_packets_dropped_no_buffer_space", "Number of TX packets dropped as a result of no buffer space errors."), 103 Rx: tcpip.NICPacketStats{ 104 Packets: mustCreateMetric("/netstack/nic/rx/packets", "Number of packets received."), 105 Bytes: mustCreateMetric("/netstack/nic/rx/bytes", "Number of bytes received."), 106 }, 107 DisabledRx: tcpip.NICPacketStats{ 108 Packets: mustCreateMetric("/netstack/nic/disabled_rx/packets", "Number of packets received on disabled NICs."), 109 Bytes: mustCreateMetric("/netstack/nic/disabled_rx/bytes", "Number of bytes received on disabled NICs."), 110 }, 111 Neighbor: tcpip.NICNeighborStats{ 112 UnreachableEntryLookups: 
mustCreateMetric("/netstack/nic/neighbor/unreachable_entry_loopups", "Number of lookups performed on a neighbor entry in Unreachable state."), 113 DroppedConfirmationForNoninitiatedNeighbor: mustCreateMetric("/netstack/nic/neighbor/dropped_confirmation_for_noninitiated_neighbor", "Number of advertisements received that don't match an entry in the neighbor cache."), 114 DroppedInvalidLinkAddressConfirmations: mustCreateMetric("/netstack/nic/neighbor/dropped_invalid_link_address_confirmations", "Number of advertisements dropped because they have empty source link-layer addresses"), 115 }, 116 }, 117 ICMP: tcpip.ICMPStats{ 118 V4: tcpip.ICMPv4Stats{ 119 PacketsSent: tcpip.ICMPv4SentPacketStats{ 120 ICMPv4PacketStats: tcpip.ICMPv4PacketStats{ 121 EchoRequest: mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_request", "Number of ICMPv4 echo request packets sent."), 122 EchoReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_reply", "Number of ICMPv4 echo reply packets sent."), 123 DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_sent/dst_unreachable", "Number of ICMPv4 destination unreachable packets sent."), 124 SrcQuench: mustCreateMetric("/netstack/icmp/v4/packets_sent/src_quench", "Number of ICMPv4 source quench packets sent."), 125 Redirect: mustCreateMetric("/netstack/icmp/v4/packets_sent/redirect", "Number of ICMPv4 redirect packets sent."), 126 TimeExceeded: mustCreateMetric("/netstack/icmp/v4/packets_sent/time_exceeded", "Number of ICMPv4 time exceeded packets sent."), 127 ParamProblem: mustCreateMetric("/netstack/icmp/v4/packets_sent/param_problem", "Number of ICMPv4 parameter problem packets sent."), 128 Timestamp: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp", "Number of ICMPv4 timestamp packets sent."), 129 TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp_reply", "Number of ICMPv4 timestamp reply packets sent."), 130 InfoRequest: mustCreateMetric("/netstack/icmp/v4/packets_sent/info_request", 
"Number of ICMPv4 information request packets sent."), 131 InfoReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/info_reply", "Number of ICMPv4 information reply packets sent."), 132 }, 133 Dropped: mustCreateMetric("/netstack/icmp/v4/packets_sent/dropped", "Number of ICMPv4 packets dropped due to link layer errors."), 134 RateLimited: mustCreateMetric("/netstack/icmp/v4/packets_sent/rate_limited", "Number of ICMPv4 packets dropped due to rate limit being exceeded."), 135 }, 136 PacketsReceived: tcpip.ICMPv4ReceivedPacketStats{ 137 ICMPv4PacketStats: tcpip.ICMPv4PacketStats{ 138 EchoRequest: mustCreateMetric("/netstack/icmp/v4/packets_received/echo_request", "Number of ICMPv4 echo request packets received."), 139 EchoReply: mustCreateMetric("/netstack/icmp/v4/packets_received/echo_reply", "Number of ICMPv4 echo reply packets received."), 140 DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_received/dst_unreachable", "Number of ICMPv4 destination unreachable packets received."), 141 SrcQuench: mustCreateMetric("/netstack/icmp/v4/packets_received/src_quench", "Number of ICMPv4 source quench packets received."), 142 Redirect: mustCreateMetric("/netstack/icmp/v4/packets_received/redirect", "Number of ICMPv4 redirect packets received."), 143 TimeExceeded: mustCreateMetric("/netstack/icmp/v4/packets_received/time_exceeded", "Number of ICMPv4 time exceeded packets received."), 144 ParamProblem: mustCreateMetric("/netstack/icmp/v4/packets_received/param_problem", "Number of ICMPv4 parameter problem packets received."), 145 Timestamp: mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp", "Number of ICMPv4 timestamp packets received."), 146 TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp_reply", "Number of ICMPv4 timestamp reply packets received."), 147 InfoRequest: mustCreateMetric("/netstack/icmp/v4/packets_received/info_request", "Number of ICMPv4 information request packets received."), 148 InfoReply: 
mustCreateMetric("/netstack/icmp/v4/packets_received/info_reply", "Number of ICMPv4 information reply packets received."), 149 }, 150 Invalid: mustCreateMetric("/netstack/icmp/v4/packets_received/invalid", "Number of ICMPv4 packets received that the transport layer could not parse."), 151 }, 152 }, 153 V6: tcpip.ICMPv6Stats{ 154 PacketsSent: tcpip.ICMPv6SentPacketStats{ 155 ICMPv6PacketStats: tcpip.ICMPv6PacketStats{ 156 EchoRequest: mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_request", "Number of ICMPv6 echo request packets sent."), 157 EchoReply: mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_reply", "Number of ICMPv6 echo reply packets sent."), 158 DstUnreachable: mustCreateMetric("/netstack/icmp/v6/packets_sent/dst_unreachable", "Number of ICMPv6 destination unreachable packets sent."), 159 PacketTooBig: mustCreateMetric("/netstack/icmp/v6/packets_sent/packet_too_big", "Number of ICMPv6 packet too big packets sent."), 160 TimeExceeded: mustCreateMetric("/netstack/icmp/v6/packets_sent/time_exceeded", "Number of ICMPv6 time exceeded packets sent."), 161 ParamProblem: mustCreateMetric("/netstack/icmp/v6/packets_sent/param_problem", "Number of ICMPv6 parameter problem packets sent."), 162 RouterSolicit: mustCreateMetric("/netstack/icmp/v6/packets_sent/router_solicit", "Number of ICMPv6 router solicit packets sent."), 163 RouterAdvert: mustCreateMetric("/netstack/icmp/v6/packets_sent/router_advert", "Number of ICMPv6 router advert packets sent."), 164 NeighborSolicit: mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_solicit", "Number of ICMPv6 neighbor solicit packets sent."), 165 NeighborAdvert: mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_advert", "Number of ICMPv6 neighbor advert packets sent."), 166 RedirectMsg: mustCreateMetric("/netstack/icmp/v6/packets_sent/redirect_msg", "Number of ICMPv6 redirect message packets sent."), 167 MulticastListenerQuery: 
mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_query", "Number of ICMPv6 multicast listener query packets sent."), 168 MulticastListenerReport: mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_report", "Number of ICMPv6 multicast listener report packets sent."), 169 MulticastListenerDone: mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_done", "Number of ICMPv6 multicast listener done packets sent."), 170 }, 171 Dropped: mustCreateMetric("/netstack/icmp/v6/packets_sent/dropped", "Number of ICMPv6 packets dropped due to link layer errors."), 172 RateLimited: mustCreateMetric("/netstack/icmp/v6/packets_sent/rate_limited", "Number of ICMPv6 packets dropped due to rate limit being exceeded."), 173 }, 174 PacketsReceived: tcpip.ICMPv6ReceivedPacketStats{ 175 ICMPv6PacketStats: tcpip.ICMPv6PacketStats{ 176 EchoRequest: mustCreateMetric("/netstack/icmp/v6/packets_received/echo_request", "Number of ICMPv6 echo request packets received."), 177 EchoReply: mustCreateMetric("/netstack/icmp/v6/packets_received/echo_reply", "Number of ICMPv6 echo reply packets received."), 178 DstUnreachable: mustCreateMetric("/netstack/icmp/v6/packets_received/dst_unreachable", "Number of ICMPv6 destination unreachable packets received."), 179 PacketTooBig: mustCreateMetric("/netstack/icmp/v6/packets_received/packet_too_big", "Number of ICMPv6 packet too big packets received."), 180 TimeExceeded: mustCreateMetric("/netstack/icmp/v6/packets_received/time_exceeded", "Number of ICMPv6 time exceeded packets received."), 181 ParamProblem: mustCreateMetric("/netstack/icmp/v6/packets_received/param_problem", "Number of ICMPv6 parameter problem packets received."), 182 RouterSolicit: mustCreateMetric("/netstack/icmp/v6/packets_received/router_solicit", "Number of ICMPv6 router solicit packets received."), 183 RouterAdvert: mustCreateMetric("/netstack/icmp/v6/packets_received/router_advert", "Number of ICMPv6 router advert packets received."), 
184 NeighborSolicit: mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_solicit", "Number of ICMPv6 neighbor solicit packets received."), 185 NeighborAdvert: mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_advert", "Number of ICMPv6 neighbor advert packets received."), 186 RedirectMsg: mustCreateMetric("/netstack/icmp/v6/packets_received/redirect_msg", "Number of ICMPv6 redirect message packets received."), 187 MulticastListenerQuery: mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_query", "Number of ICMPv6 multicast listener query packets received."), 188 MulticastListenerReport: mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_report", "Number of ICMPv6 multicast listener report packets sent."), 189 MulticastListenerDone: mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_done", "Number of ICMPv6 multicast listener done packets sent."), 190 }, 191 Unrecognized: mustCreateMetric("/netstack/icmp/v6/packets_received/unrecognized", "Number of ICMPv6 packets received that the transport layer does not know how to parse."), 192 Invalid: mustCreateMetric("/netstack/icmp/v6/packets_received/invalid", "Number of ICMPv6 packets received that the transport layer could not parse."), 193 RouterOnlyPacketsDroppedByHost: mustCreateMetric("/netstack/icmp/v6/packets_received/router_only_packets_dropped_by_host", "Number of ICMPv6 packets dropped due to being router-specific packets."), 194 }, 195 }, 196 }, 197 IGMP: tcpip.IGMPStats{ 198 PacketsSent: tcpip.IGMPSentPacketStats{ 199 IGMPPacketStats: tcpip.IGMPPacketStats{ 200 MembershipQuery: mustCreateMetric("/netstack/igmp/packets_sent/membership_query", "Number of IGMP Membership Query messages sent."), 201 V1MembershipReport: mustCreateMetric("/netstack/igmp/packets_sent/v1_membership_report", "Number of IGMPv1 Membership Report messages sent."), 202 V2MembershipReport: mustCreateMetric("/netstack/igmp/packets_sent/v2_membership_report", 
"Number of IGMPv2 Membership Report messages sent."), 203 LeaveGroup: mustCreateMetric("/netstack/igmp/packets_sent/leave_group", "Number of IGMP Leave Group messages sent."), 204 }, 205 Dropped: mustCreateMetric("/netstack/igmp/packets_sent/dropped", "Number of IGMP packets dropped due to link layer errors."), 206 }, 207 PacketsReceived: tcpip.IGMPReceivedPacketStats{ 208 IGMPPacketStats: tcpip.IGMPPacketStats{ 209 MembershipQuery: mustCreateMetric("/netstack/igmp/packets_received/membership_query", "Number of IGMP Membership Query messages received."), 210 V1MembershipReport: mustCreateMetric("/netstack/igmp/packets_received/v1_membership_report", "Number of IGMPv1 Membership Report messages received."), 211 V2MembershipReport: mustCreateMetric("/netstack/igmp/packets_received/v2_membership_report", "Number of IGMPv2 Membership Report messages received."), 212 LeaveGroup: mustCreateMetric("/netstack/igmp/packets_received/leave_group", "Number of IGMP Leave Group messages received."), 213 }, 214 Invalid: mustCreateMetric("/netstack/igmp/packets_received/invalid", "Number of IGMP packets received that could not be parsed."), 215 ChecksumErrors: mustCreateMetric("/netstack/igmp/packets_received/checksum_errors", "Number of received IGMP packets with bad checksums."), 216 Unrecognized: mustCreateMetric("/netstack/igmp/packets_received/unrecognized", "Number of unrecognized IGMP packets received."), 217 }, 218 }, 219 IP: tcpip.IPStats{ 220 PacketsReceived: mustCreateMetric("/netstack/ip/packets_received", "Number of IP packets received from the link layer in nic.DeliverNetworkPacket."), 221 DisabledPacketsReceived: mustCreateMetric("/netstack/ip/disabled_packets_received", "Number of IP packets received from the link layer when the IP layer is disabled."), 222 InvalidDestinationAddressesReceived: mustCreateMetric("/netstack/ip/invalid_addresses_received", "Number of IP packets received with an unknown or invalid destination address."), 223 
InvalidSourceAddressesReceived: mustCreateMetric("/netstack/ip/invalid_source_addresses_received", "Number of IP packets received with an unknown or invalid source address."), 224 PacketsDelivered: mustCreateMetric("/netstack/ip/packets_delivered", "Number of incoming IP packets that are successfully delivered to the transport layer via HandlePacket."), 225 PacketsSent: mustCreateMetric("/netstack/ip/packets_sent", "Number of IP packets sent via WritePacket."), 226 OutgoingPacketErrors: mustCreateMetric("/netstack/ip/outgoing_packet_errors", "Number of IP packets which failed to write to a link-layer endpoint."), 227 MalformedPacketsReceived: mustCreateMetric("/netstack/ip/malformed_packets_received", "Number of IP packets which failed IP header validation checks."), 228 MalformedFragmentsReceived: mustCreateMetric("/netstack/ip/malformed_fragments_received", "Number of IP fragments which failed IP fragment validation checks."), 229 IPTablesPreroutingDropped: mustCreateMetric("/netstack/ip/iptables/prerouting_dropped", "Number of IP packets dropped in the Prerouting chain."), 230 IPTablesInputDropped: mustCreateMetric("/netstack/ip/iptables/input_dropped", "Number of IP packets dropped in the Input chain."), 231 IPTablesOutputDropped: mustCreateMetric("/netstack/ip/iptables/output_dropped", "Number of IP packets dropped in the Output chain."), 232 OptionTimestampReceived: mustCreateMetric("/netstack/ip/options/timestamp_received", "Number of timestamp options found in received IP packets."), 233 OptionRecordRouteReceived: mustCreateMetric("/netstack/ip/options/record_route_received", "Number of record route options found in received IP packets."), 234 OptionRouterAlertReceived: mustCreateMetric("/netstack/ip/options/router_alert_received", "Number of router alert options found in received IP packets."), 235 OptionUnknownReceived: mustCreateMetric("/netstack/ip/options/unknown_received", "Number of unknown options found in received IP packets."), 236 Forwarding: 
tcpip.IPForwardingStats{ 237 Unrouteable: mustCreateMetric("/netstack/ip/forwarding/unrouteable", "Number of IP packets received which couldn't be routed and thus were not forwarded."), 238 ExhaustedTTL: mustCreateMetric("/netstack/ip/forwarding/exhausted_ttl", "Number of IP packets received which could not be forwarded due to an exhausted TTL."), 239 LinkLocalSource: mustCreateMetric("/netstack/ip/forwarding/link_local_source_address", "Number of IP packets received which could not be forwarded due to a link-local source address."), 240 LinkLocalDestination: mustCreateMetric("/netstack/ip/forwarding/link_local_destination_address", "Number of IP packets received which could not be forwarded due to a link-local destination address."), 241 ExtensionHeaderProblem: mustCreateMetric("/netstack/ip/forwarding/extension_header_problem", "Number of IP packets received which could not be forwarded due to a problem processing their IPv6 extension headers."), 242 PacketTooBig: mustCreateMetric("/netstack/ip/forwarding/packet_too_big", "Number of IP packets received which could not be forwarded because they could not fit within the outgoing MTU."), 243 HostUnreachable: mustCreateMetric("/netstack/ip/forwarding/host_unreachable", "Number of IP packets received which could not be forwarded due to unresolvable next hop."), 244 Errors: mustCreateMetric("/netstack/ip/forwarding/errors", "Number of IP packets which couldn't be forwarded."), 245 }, 246 }, 247 ARP: tcpip.ARPStats{ 248 PacketsReceived: mustCreateMetric("/netstack/arp/packets_received", "Number of ARP packets received from the link layer."), 249 DisabledPacketsReceived: mustCreateMetric("/netstack/arp/disabled_packets_received", "Number of ARP packets received from the link layer when the ARP layer is disabled."), 250 MalformedPacketsReceived: mustCreateMetric("/netstack/arp/malformed_packets_received", "Number of ARP packets which failed ARP header validation checks."), 251 RequestsReceived: 
mustCreateMetric("/netstack/arp/requests_received", "Number of ARP requests received."), 252 RequestsReceivedUnknownTargetAddress: mustCreateMetric("/netstack/arp/requests_received_unknown_addr", "Number of ARP requests received with an unknown target address."), 253 OutgoingRequestInterfaceHasNoLocalAddressErrors: mustCreateMetric("/netstack/arp/outgoing_requests_iface_has_no_addr", "Number of failed attempts to send an ARP request with an interface that has no network address."), 254 OutgoingRequestBadLocalAddressErrors: mustCreateMetric("/netstack/arp/outgoing_requests_invalid_local_addr", "Number of failed attempts to send an ARP request with a provided local address that is invalid."), 255 OutgoingRequestsDropped: mustCreateMetric("/netstack/arp/outgoing_requests_dropped", "Number of ARP requests which failed to write to a link-layer endpoint."), 256 OutgoingRequestsSent: mustCreateMetric("/netstack/arp/outgoing_requests_sent", "Number of ARP requests sent."), 257 RepliesReceived: mustCreateMetric("/netstack/arp/replies_received", "Number of ARP replies received."), 258 OutgoingRepliesDropped: mustCreateMetric("/netstack/arp/outgoing_replies_dropped", "Number of ARP replies which failed to write to a link-layer endpoint."), 259 OutgoingRepliesSent: mustCreateMetric("/netstack/arp/outgoing_replies_sent", "Number of ARP replies sent."), 260 }, 261 TCP: tcpip.TCPStats{ 262 ActiveConnectionOpenings: mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."), 263 PassiveConnectionOpenings: mustCreateMetric("/netstack/tcp/passive_connection_openings", "Number of connections opened successfully via Listen."), 264 CurrentEstablished: mustCreateGauge("/netstack/tcp/current_established", "Number of connections in ESTABLISHED state now."), 265 CurrentConnected: mustCreateGauge("/netstack/tcp/current_open", "Number of connections that are in connected state."), 266 EstablishedResets: 
mustCreateMetric("/netstack/tcp/established_resets", "Number of times TCP connections have made a direct transition to the CLOSED state from either the ESTABLISHED state or the CLOSE-WAIT state"), 267 EstablishedClosed: mustCreateMetric("/netstack/tcp/established_closed", "Number of times established TCP connections made a transition to CLOSED state."), 268 EstablishedTimedout: mustCreateMetric("/netstack/tcp/established_timedout", "Number of times an established connection was reset because of keep-alive time out."), 269 ListenOverflowSynDrop: mustCreateMetric("/netstack/tcp/listen_overflow_syn_drop", "Number of times the listen queue overflowed and a SYN was dropped."), 270 ListenOverflowAckDrop: mustCreateMetric("/netstack/tcp/listen_overflow_ack_drop", "Number of times the listen queue overflowed and the final ACK in the handshake was dropped."), 271 ListenOverflowSynCookieSent: mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_sent", "Number of times a SYN cookie was sent."), 272 ListenOverflowSynCookieRcvd: mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_rcvd", "Number of times a SYN cookie was received."), 273 ListenOverflowInvalidSynCookieRcvd: mustCreateMetric("/netstack/tcp/listen_overflow_invalid_syn_cookie_rcvd", "Number of times an invalid SYN cookie was received."), 274 FailedConnectionAttempts: mustCreateMetric("/netstack/tcp/failed_connection_attempts", "Number of calls to Connect or Listen (active and passive openings, respectively) that end in an error."), 275 ValidSegmentsReceived: mustCreateMetric("/netstack/tcp/valid_segments_received", "Number of TCP segments received that the transport layer successfully parsed."), 276 InvalidSegmentsReceived: mustCreateMetric("/netstack/tcp/invalid_segments_received", "Number of TCP segments received that the transport layer could not parse."), 277 SegmentsSent: mustCreateMetric("/netstack/tcp/segments_sent", "Number of TCP segments sent."), 278 SegmentSendErrors: 
mustCreateMetric("/netstack/tcp/segment_send_errors", "Number of TCP segments failed to be sent."), 279 ResetsSent: mustCreateMetric("/netstack/tcp/resets_sent", "Number of TCP resets sent."), 280 ResetsReceived: mustCreateMetric("/netstack/tcp/resets_received", "Number of TCP resets received."), 281 Retransmits: mustCreateMetric("/netstack/tcp/retransmits", "Number of TCP segments retransmitted."), 282 FastRecovery: mustCreateMetric("/netstack/tcp/fast_recovery", "Number of times fast recovery was used to recover from packet loss."), 283 SACKRecovery: mustCreateMetric("/netstack/tcp/sack_recovery", "Number of times SACK recovery was used to recover from packet loss."), 284 TLPRecovery: mustCreateMetric("/netstack/tcp/tlp_recovery", "Number of times tail loss probe triggers recovery from tail loss."), 285 SlowStartRetransmits: mustCreateMetric("/netstack/tcp/slow_start_retransmits", "Number of segments retransmitted in slow start mode."), 286 FastRetransmit: mustCreateMetric("/netstack/tcp/fast_retransmit", "Number of TCP segments which were fast retransmitted."), 287 Timeouts: mustCreateMetric("/netstack/tcp/timeouts", "Number of times RTO expired."), 288 ChecksumErrors: mustCreateMetric("/netstack/tcp/checksum_errors", "Number of segments dropped due to bad checksums."), 289 FailedPortReservations: mustCreateMetric("/netstack/tcp/failed_port_reservations", "Number of time TCP failed to reserve a port."), 290 SegmentsAckedWithDSACK: mustCreateMetric("/netstack/tcp/segments_acked_with_dsack", "Number of segments for which DSACK was received."), 291 SpuriousRecovery: mustCreateMetric("/netstack/tcp/spurious_recovery", "Number of times the connection entered loss recovery spuriously."), 292 SpuriousRTORecovery: mustCreateMetric("/netstack/tcp/spurious_rto_recovery", "Number of times the connection entered RTO spuriously."), 293 ForwardMaxInFlightDrop: mustCreateMetric("/netstack/tcp/forward_max_in_flight_drop", "Number of connection requests dropped due to exceeding 
in-flight limit."), 294 }, 295 UDP: tcpip.UDPStats{ 296 PacketsReceived: mustCreateMetric("/netstack/udp/packets_received", "Number of UDP datagrams received via HandlePacket."), 297 UnknownPortErrors: mustCreateMetric("/netstack/udp/unknown_port_errors", "Number of incoming UDP datagrams dropped because they did not have a known destination port."), 298 ReceiveBufferErrors: mustCreateMetric("/netstack/udp/receive_buffer_errors", "Number of incoming UDP datagrams dropped due to the receiving buffer being in an invalid state."), 299 MalformedPacketsReceived: mustCreateMetric("/netstack/udp/malformed_packets_received", "Number of incoming UDP datagrams dropped due to the UDP header being in a malformed state."), 300 PacketsSent: mustCreateMetric("/netstack/udp/packets_sent", "Number of UDP datagrams sent."), 301 PacketSendErrors: mustCreateMetric("/netstack/udp/packet_send_errors", "Number of UDP datagrams failed to be sent."), 302 ChecksumErrors: mustCreateMetric("/netstack/udp/checksum_errors", "Number of UDP datagrams dropped due to bad checksums."), 303 }, 304 } 305 306 // DefaultTTL is linux's default TTL. All network protocols in all stacks used 307 // with this package must have this value set as their default TTL. 308 const DefaultTTL = 64 309 310 const sizeOfInt32 int = 4 311 312 var errStackType = syserr.New("expected but did not receive a netstack.Stack", errno.EINVAL) 313 314 // commonEndpoint represents the intersection of a tcpip.Endpoint and a 315 // transport.Endpoint. 316 type commonEndpoint interface { 317 // Readiness implements tcpip.Endpoint.Readiness and 318 // transport.Endpoint.Readiness. 319 Readiness(mask waiter.EventMask) waiter.EventMask 320 321 // SetSockOpt implements tcpip.Endpoint.SetSockOpt and 322 // transport.Endpoint.SetSockOpt. 323 SetSockOpt(tcpip.SettableSocketOption) tcpip.Error 324 325 // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt and 326 // transport.Endpoint.SetSockOptInt. 
327 SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error 328 329 // GetSockOpt implements tcpip.Endpoint.GetSockOpt and 330 // transport.Endpoint.GetSockOpt. 331 GetSockOpt(tcpip.GettableSocketOption) tcpip.Error 332 333 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt and 334 // transport.Endpoint.GetSockOpt. 335 GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) 336 337 // State returns a socket's lifecycle state. The returned value is 338 // protocol-specific and is primarily used for diagnostics. 339 State() uint32 340 341 // LastError implements tcpip.Endpoint.LastError and 342 // transport.Endpoint.LastError. 343 LastError() tcpip.Error 344 345 // SocketOptions implements tcpip.Endpoint.SocketOptions and 346 // transport.Endpoint.SocketOptions. 347 SocketOptions() *tcpip.SocketOptions 348 } 349 350 // sock encapsulates all the state needed to represent a network stack 351 // endpoint in the kernel context. 352 // 353 // +stateify savable 354 type sock struct { 355 vfsfd vfs.FileDescription 356 vfs.FileDescriptionDefaultImpl 357 vfs.DentryMetadataFileDescriptionImpl 358 vfs.LockFD 359 socket.SendReceiveTimeout 360 *waiter.Queue 361 362 family int 363 Endpoint tcpip.Endpoint 364 skType linux.SockType 365 protocol int 366 367 namespace *inet.Namespace 368 369 // readMu protects access to the below fields. 370 readMu sync.Mutex `state:"nosave"` 371 372 // sockOptTimestamp corresponds to SO_TIMESTAMP. When true, timestamps 373 // of returned messages can be returned via control messages. When 374 // false, the same timestamp is instead stored and can be read via the 375 // SIOCGSTAMP ioctl. It is protected by readMu. See socket(7). 376 sockOptTimestamp bool 377 // timestampValid indicates whether timestamp for SIOCGSTAMP has been 378 // set. It is protected by readMu. 379 timestampValid bool 380 // timestamp holds the timestamp to use with SIOCTSTAMP. It is only 381 // valid when timestampValid is true. It is protected by readMu. 
382 timestamp time.Time `state:".(int64)"` 383 384 // TODO(b/153685824): Move this to SocketOptions. 385 // sockOptInq corresponds to TCP_INQ. 386 sockOptInq bool 387 } 388 389 var _ = socket.Socket(&sock{}) 390 391 // New creates a new endpoint socket. 392 func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*vfs.FileDescription, *syserr.Error) { 393 if skType == linux.SOCK_STREAM { 394 endpoint.SocketOptions().SetDelayOption(true) 395 } 396 397 mnt := t.Kernel().SocketMount() 398 d := sockfs.NewDentry(t, mnt) 399 defer d.DecRef(t) 400 401 namespace := t.NetworkNamespace() 402 s := &sock{ 403 Queue: queue, 404 family: family, 405 Endpoint: endpoint, 406 skType: skType, 407 protocol: protocol, 408 namespace: namespace, 409 } 410 s.LockFD.Init(&vfs.FileLocks{}) 411 vfsfd := &s.vfsfd 412 if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{ 413 DenyPRead: true, 414 DenyPWrite: true, 415 UseDentryMetadata: true, 416 }); err != nil { 417 return nil, syserr.FromError(err) 418 } 419 namespace.IncRef() 420 return vfsfd, nil 421 } 422 423 // Release implements vfs.FileDescriptionImpl.Release. 424 func (s *sock) Release(ctx context.Context) { 425 kernel.KernelFromContext(ctx).DeleteSocket(&s.vfsfd) 426 e, ch := waiter.NewChannelEntry(waiter.EventHUp | waiter.EventErr) 427 s.EventRegister(&e) 428 defer s.EventUnregister(&e) 429 430 s.Endpoint.Close() 431 432 // SO_LINGER option is valid only for TCP. For other socket types 433 // return after endpoint close. 434 if family, skType, _ := s.Type(); skType == linux.SOCK_STREAM && (family == linux.AF_INET || family == linux.AF_INET6) { 435 v := s.Endpoint.SocketOptions().GetLinger() 436 // The case for zero timeout is handled in tcp endpoint close function. 437 // Close is blocked until either: 438 // 1. The endpoint state is not in any of the states: FIN-WAIT1, 439 // CLOSING and LAST_ACK. 440 // 2. Timeout is reached. 
441 if v.Enabled && v.Timeout != 0 { 442 t := kernel.TaskFromContext(ctx) 443 start := t.Kernel().MonotonicClock().Now() 444 deadline := start.Add(v.Timeout) 445 _ = t.BlockWithDeadline(ch, true, deadline) 446 } 447 } 448 s.namespace.DecRef(ctx) 449 } 450 451 // Epollable implements FileDescriptionImpl.Epollable. 452 func (s *sock) Epollable() bool { 453 return true 454 } 455 456 // Read implements vfs.FileDescriptionImpl. 457 func (s *sock) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { 458 // All flags other than RWF_NOWAIT should be ignored. 459 // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. 460 if opts.Flags != 0 { 461 return 0, linuxerr.EOPNOTSUPP 462 } 463 464 if dst.NumBytes() == 0 { 465 return 0, nil 466 } 467 n, _, _, _, _, err := s.nonBlockingRead(ctx, dst, false, false, false) 468 if err == syserr.ErrWouldBlock { 469 return int64(n), linuxerr.ErrWouldBlock 470 } 471 if err != nil { 472 return 0, err.ToError() 473 } 474 return int64(n), nil 475 } 476 477 // Write implements vfs.FileDescriptionImpl. 478 func (s *sock) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { 479 // All flags other than RWF_NOWAIT should be ignored. 480 // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. 481 if opts.Flags != 0 { 482 return 0, linuxerr.EOPNOTSUPP 483 } 484 485 r := src.Reader(ctx) 486 n, err := s.Endpoint.Write(r, tcpip.WriteOptions{}) 487 if _, ok := err.(*tcpip.ErrWouldBlock); ok { 488 return 0, linuxerr.ErrWouldBlock 489 } 490 if err != nil { 491 return 0, syserr.TranslateNetstackError(err).ToError() 492 } 493 494 if n < src.NumBytes() { 495 return n, linuxerr.ErrWouldBlock 496 } 497 498 return n, nil 499 } 500 501 // Accept implements the linux syscall accept(2) for sockets backed by 502 // tcpip.Endpoint. 
503 func (s *sock) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { 504 // Issue the accept request to get the new endpoint. 505 var peerAddr *tcpip.FullAddress 506 if peerRequested { 507 peerAddr = &tcpip.FullAddress{} 508 } 509 ep, wq, terr := s.Endpoint.Accept(peerAddr) 510 if terr != nil { 511 if _, ok := terr.(*tcpip.ErrWouldBlock); !ok || !blocking { 512 return 0, nil, 0, syserr.TranslateNetstackError(terr) 513 } 514 515 var err *syserr.Error 516 ep, wq, err = s.blockingAccept(t, peerAddr) 517 if err != nil { 518 return 0, nil, 0, err 519 } 520 } 521 522 ns, err := New(t, s.family, s.skType, s.protocol, wq, ep) 523 if err != nil { 524 return 0, nil, 0, err 525 } 526 defer ns.DecRef(t) 527 528 if err := ns.SetStatusFlags(t, t.Credentials(), uint32(flags&linux.SOCK_NONBLOCK)); err != nil { 529 return 0, nil, 0, syserr.FromError(err) 530 } 531 532 var addr linux.SockAddr 533 var addrLen uint32 534 if peerAddr != nil { 535 // Get address of the peer and write it to peer slice. 536 addr, addrLen = socket.ConvertAddress(s.family, *peerAddr) 537 } 538 539 fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{ 540 CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, 541 }) 542 543 t.Kernel().RecordSocket(ns) 544 545 return fd, addr, addrLen, syserr.FromError(e) 546 } 547 548 // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by 549 // tcpip.Endpoint. 550 func (s *sock) GetSockOpt(t *kernel.Task, level, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { 551 // TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is 552 // implemented specifically for netstack.Socket rather than 553 // commonEndpoint. commonEndpoint should be extended to support socket 554 // options where the implementation is not shared, as unix sockets need 555 // their own support for SO_TIMESTAMP. 
556 if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP { 557 if outLen < sizeOfInt32 { 558 return nil, syserr.ErrInvalidArgument 559 } 560 val := primitive.Int32(0) 561 s.readMu.Lock() 562 defer s.readMu.Unlock() 563 if s.sockOptTimestamp { 564 val = 1 565 } 566 return &val, nil 567 } 568 if level == linux.SOL_TCP && name == linux.TCP_INQ { 569 if outLen < sizeOfInt32 { 570 return nil, syserr.ErrInvalidArgument 571 } 572 val := primitive.Int32(0) 573 s.readMu.Lock() 574 defer s.readMu.Unlock() 575 if s.sockOptInq { 576 val = 1 577 } 578 return &val, nil 579 } 580 581 return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outPtr, outLen) 582 } 583 584 // SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by 585 // tcpip.Endpoint. 586 func (s *sock) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error { 587 // TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is 588 // implemented specifically for netstack.Socket rather than 589 // commonEndpoint. commonEndpoint should be extended to support socket 590 // options where the implementation is not shared, as unix sockets need 591 // their own support for SO_TIMESTAMP. 
592 if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP { 593 if len(optVal) < sizeOfInt32 { 594 return syserr.ErrInvalidArgument 595 } 596 s.readMu.Lock() 597 defer s.readMu.Unlock() 598 s.sockOptTimestamp = hostarch.ByteOrder.Uint32(optVal) != 0 599 return nil 600 } 601 if level == linux.SOL_TCP && name == linux.TCP_INQ { 602 if len(optVal) < sizeOfInt32 { 603 return syserr.ErrInvalidArgument 604 } 605 s.readMu.Lock() 606 defer s.readMu.Unlock() 607 s.sockOptInq = hostarch.ByteOrder.Uint32(optVal) != 0 608 return nil 609 } 610 611 return SetSockOpt(t, s, s.Endpoint, level, name, optVal) 612 } 613 614 var sockAddrInetSize = (*linux.SockAddrInet)(nil).SizeBytes() 615 var sockAddrInet6Size = (*linux.SockAddrInet6)(nil).SizeBytes() 616 var sockAddrLinkSize = (*linux.SockAddrLink)(nil).SizeBytes() 617 618 // minSockAddrLen returns the minimum length in bytes of a socket address for 619 // the socket's family. 620 func (s *sock) minSockAddrLen() int { 621 const addressFamilySize = 2 622 623 switch s.family { 624 case linux.AF_UNIX: 625 return addressFamilySize 626 case linux.AF_INET: 627 return sockAddrInetSize 628 case linux.AF_INET6: 629 return sockAddrInet6Size 630 case linux.AF_PACKET: 631 return sockAddrLinkSize 632 case linux.AF_UNSPEC: 633 return addressFamilySize 634 default: 635 panic(fmt.Sprintf("s.family unrecognized = %d", s.family)) 636 } 637 } 638 639 func (s *sock) isPacketBased() bool { 640 return s.skType == linux.SOCK_DGRAM || s.skType == linux.SOCK_SEQPACKET || s.skType == linux.SOCK_RDM || s.skType == linux.SOCK_RAW 641 } 642 643 // Readiness returns a mask of ready events for socket s. 644 func (s *sock) Readiness(mask waiter.EventMask) waiter.EventMask { 645 return s.Endpoint.Readiness(mask) 646 } 647 648 // checkFamily returns true iff the specified address family may be used with 649 // the socket. 650 // 651 // If exact is true, then the specified address family must be an exact match 652 // with the socket's family. 
653 func (s *sock) checkFamily(family uint16, exact bool) bool { 654 if family == uint16(s.family) { 655 return true 656 } 657 if !exact && family == linux.AF_INET && s.family == linux.AF_INET6 { 658 if !s.Endpoint.SocketOptions().GetV6Only() { 659 return true 660 } 661 } 662 return false 663 } 664 665 // mapFamily maps the AF_INET ANY address to the IPv4-mapped IPv6 ANY if the 666 // receiver's family is AF_INET6. 667 // 668 // This is a hack to work around the fact that both IPv4 and IPv6 ANY are 669 // represented by the empty string. 670 // 671 // TODO(gvisor.dev/issue/1556): remove this function. 672 func (s *sock) mapFamily(addr tcpip.FullAddress, family uint16) tcpip.FullAddress { 673 if addr.Addr.BitLen() == 0 && s.family == linux.AF_INET6 && family == linux.AF_INET { 674 addr.Addr = tcpip.AddrFrom16([16]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}) 675 } 676 return addr 677 } 678 679 // Connect implements the linux syscall connect(2) for sockets backed by 680 // tpcip.Endpoint. 681 func (s *sock) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { 682 addr, family, err := socket.AddressAndFamily(sockaddr) 683 if err != nil { 684 return err 685 } 686 687 if family == linux.AF_UNSPEC { 688 err := s.Endpoint.Disconnect() 689 if _, ok := err.(*tcpip.ErrNotSupported); ok { 690 return syserr.ErrAddressFamilyNotSupported 691 } 692 return syserr.TranslateNetstackError(err) 693 } 694 695 if !s.checkFamily(family, false /* exact */) { 696 return syserr.ErrInvalidArgument 697 } 698 addr = s.mapFamily(addr, family) 699 700 // Always return right away in the non-blocking case. 701 if !blocking { 702 return syserr.TranslateNetstackError(s.Endpoint.Connect(addr)) 703 } 704 705 // Register for notification when the endpoint becomes writable, then 706 // initiate the connection. 
707 e, ch := waiter.NewChannelEntry(waiter.WritableEvents) 708 s.EventRegister(&e) 709 defer s.EventUnregister(&e) 710 711 switch err := s.Endpoint.Connect(addr); err.(type) { 712 case *tcpip.ErrConnectStarted, *tcpip.ErrAlreadyConnecting: 713 case *tcpip.ErrNoPortAvailable: 714 if (s.family == unix.AF_INET || s.family == unix.AF_INET6) && s.skType == linux.SOCK_STREAM { 715 // TCP unlike UDP returns EADDRNOTAVAIL when it can't 716 // find an available local ephemeral port. 717 return syserr.ErrAddressNotAvailable 718 } 719 return syserr.TranslateNetstackError(err) 720 default: 721 return syserr.TranslateNetstackError(err) 722 } 723 724 // It's pending, so we have to wait for a notification, and fetch the 725 // result once the wait completes. 726 if err := t.Block(ch); err != nil { 727 return syserr.FromError(err) 728 } 729 730 // Call Connect() again after blocking to find connect's result. 731 return syserr.TranslateNetstackError(s.Endpoint.Connect(addr)) 732 } 733 734 // Bind implements the linux syscall bind(2) for sockets backed by 735 // tcpip.Endpoint. 736 func (s *sock) Bind(_ *kernel.Task, sockaddr []byte) *syserr.Error { 737 if len(sockaddr) < 2 { 738 return syserr.ErrInvalidArgument 739 } 740 741 family := hostarch.ByteOrder.Uint16(sockaddr) 742 var addr tcpip.FullAddress 743 744 // Bind for AF_PACKET requires only family, protocol and ifindex. 745 // In function AddressAndFamily, we check the address length which is 746 // not needed for AF_PACKET bind. 
747 if family == linux.AF_PACKET { 748 var a linux.SockAddrLink 749 if len(sockaddr) < sockAddrLinkSize { 750 return syserr.ErrInvalidArgument 751 } 752 a.UnmarshalBytes(sockaddr) 753 754 addr = tcpip.FullAddress{ 755 NIC: tcpip.NICID(a.InterfaceIndex), 756 Addr: tcpip.AddrFrom16Slice(append( 757 a.HardwareAddr[:header.EthernetAddressSize], 758 []byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}..., 759 )), 760 Port: socket.Ntohs(a.Protocol), 761 } 762 } else { 763 if s.minSockAddrLen() > len(sockaddr) { 764 return syserr.ErrInvalidArgument 765 } 766 767 var err *syserr.Error 768 addr, family, err = socket.AddressAndFamily(sockaddr) 769 if err != nil { 770 return err 771 } 772 773 if !s.checkFamily(family, true /* exact */) { 774 return syserr.ErrAddressFamilyNotSupported 775 } 776 777 addr = s.mapFamily(addr, family) 778 } 779 780 // Issue the bind request to the endpoint. 781 err := s.Endpoint.Bind(addr) 782 if _, ok := err.(*tcpip.ErrNoPortAvailable); ok { 783 // Bind always returns EADDRINUSE irrespective of if the specified port was 784 // already bound or if an ephemeral port was requested but none were 785 // available. 786 // 787 // *tcpip.ErrNoPortAvailable is mapped to EAGAIN in syserr package because 788 // UDP connect returns EAGAIN on ephemeral port exhaustion. 789 // 790 // TCP connect returns EADDRNOTAVAIL on ephemeral port exhaustion. 791 err = &tcpip.ErrPortInUse{} 792 } 793 794 return syserr.TranslateNetstackError(err) 795 } 796 797 // Listen implements the linux syscall listen(2) for sockets backed by 798 // tcpip.Endpoint. 799 func (s *sock) Listen(_ *kernel.Task, backlog int) *syserr.Error { 800 if err := s.Endpoint.Listen(backlog); err != nil { 801 return syserr.TranslateNetstackError(err) 802 } 803 if !socket.IsTCP(s) { 804 return nil 805 } 806 807 // Emit SentryTCPListenEvent with the bound port for tcp sockets. 
808 addr, err := s.Endpoint.GetLocalAddress() 809 if err != nil { 810 panic(fmt.Sprintf("GetLocalAddress failed for tcp socket: %s", err)) 811 } 812 eventchannel.Emit(&epb.SentryTcpListenEvent{ 813 Port: proto.Int32(int32(addr.Port)), 814 }) 815 return nil 816 } 817 818 // blockingAccept implements a blocking version of accept(2), that is, if no 819 // connections are ready to be accept, it will block until one becomes ready. 820 func (s *sock) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) { 821 // Register for notifications. 822 e, ch := waiter.NewChannelEntry(waiter.ReadableEvents) 823 s.EventRegister(&e) 824 defer s.EventUnregister(&e) 825 826 // Try to accept the connection again; if it fails, then wait until we 827 // get a notification. 828 for { 829 ep, wq, err := s.Endpoint.Accept(peerAddr) 830 if _, ok := err.(*tcpip.ErrWouldBlock); !ok { 831 return ep, wq, syserr.TranslateNetstackError(err) 832 } 833 834 if err := t.Block(ch); err != nil { 835 return nil, nil, syserr.FromError(err) 836 } 837 } 838 } 839 840 // ConvertShutdown converts Linux shutdown flags into tcpip shutdown flags. 841 func ConvertShutdown(how int) (tcpip.ShutdownFlags, *syserr.Error) { 842 var f tcpip.ShutdownFlags 843 switch how { 844 case linux.SHUT_RD: 845 f = tcpip.ShutdownRead 846 case linux.SHUT_WR: 847 f = tcpip.ShutdownWrite 848 case linux.SHUT_RDWR: 849 f = tcpip.ShutdownRead | tcpip.ShutdownWrite 850 default: 851 return 0, syserr.ErrInvalidArgument 852 } 853 return f, nil 854 } 855 856 // Shutdown implements the linux syscall shutdown(2) for sockets backed by 857 // tcpip.Endpoint. 858 func (s *sock) Shutdown(_ *kernel.Task, how int) *syserr.Error { 859 f, err := ConvertShutdown(how) 860 if err != nil { 861 return err 862 } 863 864 // Issue shutdown request. 
865 return syserr.TranslateNetstackError(s.Endpoint.Shutdown(f)) 866 } 867 868 // GetSockOpt can be used to implement the linux syscall getsockopt(2) for 869 // sockets backed by a commonEndpoint. 870 func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType linux.SockType, level, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { 871 switch level { 872 case linux.SOL_SOCKET: 873 return getSockOptSocket(t, s, ep, family, skType, name, outLen) 874 875 case linux.SOL_TCP: 876 return getSockOptTCP(t, s, ep, name, outLen) 877 878 case linux.SOL_IPV6: 879 return getSockOptIPv6(t, s, ep, name, outPtr, outLen) 880 881 case linux.SOL_IP: 882 return getSockOptIP(t, s, ep, name, outPtr, outLen, family) 883 884 case linux.SOL_ICMPV6: 885 return getSockOptICMPv6(t, s, ep, name, outLen) 886 887 case linux.SOL_UDP, 888 linux.SOL_RAW, 889 linux.SOL_PACKET: 890 // Not supported. 891 } 892 893 return nil, syserr.ErrProtocolNotAvailable 894 } 895 896 func boolToInt32(v bool) int32 { 897 if v { 898 return 1 899 } 900 return 0 901 } 902 903 // getSockOptSocket implements GetSockOpt when level is SOL_SOCKET. 904 func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, _ linux.SockType, name, outLen int) (marshal.Marshallable, *syserr.Error) { 905 // TODO(b/124056281): Stop rejecting short optLen values in getsockopt. 906 switch name { 907 case linux.SO_ERROR: 908 if outLen < sizeOfInt32 { 909 return nil, syserr.ErrInvalidArgument 910 } 911 912 // Get the last error and convert it. 
913 err := ep.SocketOptions().GetLastError() 914 if err == nil { 915 optP := primitive.Int32(0) 916 return &optP, nil 917 } 918 919 optP := primitive.Int32(syserr.TranslateNetstackError(err).ToLinux()) 920 return &optP, nil 921 922 case linux.SO_PEERCRED: 923 if family != linux.AF_UNIX || outLen < unix.SizeofUcred { 924 return nil, syserr.ErrInvalidArgument 925 } 926 927 tcred := t.Credentials() 928 creds := linux.ControlMessageCredentials{ 929 PID: int32(t.ThreadGroup().ID()), 930 UID: uint32(tcred.EffectiveKUID.In(tcred.UserNamespace).OrOverflow()), 931 GID: uint32(tcred.EffectiveKGID.In(tcred.UserNamespace).OrOverflow()), 932 } 933 return &creds, nil 934 935 case linux.SO_PASSCRED: 936 if outLen < sizeOfInt32 { 937 return nil, syserr.ErrInvalidArgument 938 } 939 940 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetPassCred())) 941 return &v, nil 942 943 case linux.SO_SNDBUF: 944 if outLen < sizeOfInt32 { 945 return nil, syserr.ErrInvalidArgument 946 } 947 948 size := ep.SocketOptions().GetSendBufferSize() 949 950 if size > math.MaxInt32 { 951 size = math.MaxInt32 952 } 953 954 sizeP := primitive.Int32(size) 955 return &sizeP, nil 956 957 case linux.SO_RCVBUF: 958 if outLen < sizeOfInt32 { 959 return nil, syserr.ErrInvalidArgument 960 } 961 962 size := ep.SocketOptions().GetReceiveBufferSize() 963 964 if size > math.MaxInt32 { 965 size = math.MaxInt32 966 } 967 968 sizeP := primitive.Int32(size) 969 return &sizeP, nil 970 971 case linux.SO_REUSEADDR: 972 if outLen < sizeOfInt32 { 973 return nil, syserr.ErrInvalidArgument 974 } 975 976 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReuseAddress())) 977 return &v, nil 978 979 case linux.SO_REUSEPORT: 980 if outLen < sizeOfInt32 { 981 return nil, syserr.ErrInvalidArgument 982 } 983 984 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReusePort())) 985 return &v, nil 986 987 case linux.SO_BINDTODEVICE: 988 v := ep.SocketOptions().GetBindToDevice() 989 if v == 0 { 990 var b primitive.ByteSlice 991 
return &b, nil 992 } 993 if outLen < linux.IFNAMSIZ { 994 return nil, syserr.ErrInvalidArgument 995 } 996 s := t.NetworkContext() 997 if s == nil { 998 return nil, syserr.ErrNoDevice 999 } 1000 nic, ok := s.Interfaces()[int32(v)] 1001 if !ok { 1002 // The NICID no longer indicates a valid interface, probably because that 1003 // interface was removed. 1004 return nil, syserr.ErrUnknownDevice 1005 } 1006 1007 name := primitive.ByteSlice(append([]byte(nic.Name), 0)) 1008 return &name, nil 1009 1010 case linux.SO_BROADCAST: 1011 if outLen < sizeOfInt32 { 1012 return nil, syserr.ErrInvalidArgument 1013 } 1014 1015 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetBroadcast())) 1016 return &v, nil 1017 1018 case linux.SO_KEEPALIVE: 1019 if outLen < sizeOfInt32 { 1020 return nil, syserr.ErrInvalidArgument 1021 } 1022 1023 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetKeepAlive())) 1024 return &v, nil 1025 1026 case linux.SO_LINGER: 1027 if outLen < linux.SizeOfLinger { 1028 return nil, syserr.ErrInvalidArgument 1029 } 1030 1031 var linger linux.Linger 1032 v := ep.SocketOptions().GetLinger() 1033 1034 if v.Enabled { 1035 linger.OnOff = 1 1036 } 1037 linger.Linger = int32(v.Timeout.Seconds()) 1038 return &linger, nil 1039 1040 case linux.SO_SNDTIMEO: 1041 // TODO(igudger): Linux allows shorter lengths for partial results. 1042 if outLen < linux.SizeOfTimeval { 1043 return nil, syserr.ErrInvalidArgument 1044 } 1045 1046 sendTimeout := linux.NsecToTimeval(s.SendTimeout()) 1047 return &sendTimeout, nil 1048 1049 case linux.SO_RCVTIMEO: 1050 // TODO(igudger): Linux allows shorter lengths for partial results. 
1051 if outLen < linux.SizeOfTimeval { 1052 return nil, syserr.ErrInvalidArgument 1053 } 1054 1055 recvTimeout := linux.NsecToTimeval(s.RecvTimeout()) 1056 return &recvTimeout, nil 1057 1058 case linux.SO_OOBINLINE: 1059 if outLen < sizeOfInt32 { 1060 return nil, syserr.ErrInvalidArgument 1061 } 1062 1063 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetOutOfBandInline())) 1064 return &v, nil 1065 1066 case linux.SO_NO_CHECK: 1067 if outLen < sizeOfInt32 { 1068 return nil, syserr.ErrInvalidArgument 1069 } 1070 1071 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetNoChecksum())) 1072 return &v, nil 1073 1074 case linux.SO_ACCEPTCONN: 1075 if outLen < sizeOfInt32 { 1076 return nil, syserr.ErrInvalidArgument 1077 } 1078 1079 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetAcceptConn())) 1080 return &v, nil 1081 1082 case linux.SO_RCVLOWAT: 1083 if outLen < sizeOfInt32 { 1084 return nil, syserr.ErrInvalidArgument 1085 } 1086 1087 v := primitive.Int32(ep.SocketOptions().GetRcvlowat()) 1088 return &v, nil 1089 } 1090 return nil, syserr.ErrProtocolNotAvailable 1091 } 1092 1093 // getSockOptTCP implements GetSockOpt when level is SOL_TCP. 
1094 func getSockOptTCP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name, outLen int) (marshal.Marshallable, *syserr.Error) { 1095 if !socket.IsTCP(s) { 1096 return nil, syserr.ErrUnknownProtocolOption 1097 } 1098 1099 switch name { 1100 case linux.TCP_NODELAY: 1101 if outLen < sizeOfInt32 { 1102 return nil, syserr.ErrInvalidArgument 1103 } 1104 1105 v := primitive.Int32(boolToInt32(!ep.SocketOptions().GetDelayOption())) 1106 return &v, nil 1107 1108 case linux.TCP_CORK: 1109 if outLen < sizeOfInt32 { 1110 return nil, syserr.ErrInvalidArgument 1111 } 1112 1113 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetCorkOption())) 1114 return &v, nil 1115 1116 case linux.TCP_QUICKACK: 1117 if outLen < sizeOfInt32 { 1118 return nil, syserr.ErrInvalidArgument 1119 } 1120 1121 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetQuickAck())) 1122 return &v, nil 1123 1124 case linux.TCP_MAXSEG: 1125 if outLen < sizeOfInt32 { 1126 return nil, syserr.ErrInvalidArgument 1127 } 1128 1129 v, err := ep.GetSockOptInt(tcpip.MaxSegOption) 1130 if err != nil { 1131 return nil, syserr.TranslateNetstackError(err) 1132 } 1133 vP := primitive.Int32(v) 1134 return &vP, nil 1135 1136 case linux.TCP_KEEPIDLE: 1137 if outLen < sizeOfInt32 { 1138 return nil, syserr.ErrInvalidArgument 1139 } 1140 1141 var v tcpip.KeepaliveIdleOption 1142 if err := ep.GetSockOpt(&v); err != nil { 1143 return nil, syserr.TranslateNetstackError(err) 1144 } 1145 keepAliveIdle := primitive.Int32(time.Duration(v) / time.Second) 1146 return &keepAliveIdle, nil 1147 1148 case linux.TCP_KEEPINTVL: 1149 if outLen < sizeOfInt32 { 1150 return nil, syserr.ErrInvalidArgument 1151 } 1152 1153 var v tcpip.KeepaliveIntervalOption 1154 if err := ep.GetSockOpt(&v); err != nil { 1155 return nil, syserr.TranslateNetstackError(err) 1156 } 1157 keepAliveInterval := primitive.Int32(time.Duration(v) / time.Second) 1158 return &keepAliveInterval, nil 1159 1160 case linux.TCP_KEEPCNT: 1161 if outLen < sizeOfInt32 { 1162 
return nil, syserr.ErrInvalidArgument 1163 } 1164 1165 v, err := ep.GetSockOptInt(tcpip.KeepaliveCountOption) 1166 if err != nil { 1167 return nil, syserr.TranslateNetstackError(err) 1168 } 1169 vP := primitive.Int32(v) 1170 return &vP, nil 1171 1172 case linux.TCP_USER_TIMEOUT: 1173 if outLen < sizeOfInt32 { 1174 return nil, syserr.ErrInvalidArgument 1175 } 1176 1177 var v tcpip.TCPUserTimeoutOption 1178 if err := ep.GetSockOpt(&v); err != nil { 1179 return nil, syserr.TranslateNetstackError(err) 1180 } 1181 tcpUserTimeout := primitive.Int32(time.Duration(v) / time.Millisecond) 1182 return &tcpUserTimeout, nil 1183 1184 case linux.TCP_INFO: 1185 var v tcpip.TCPInfoOption 1186 if err := ep.GetSockOpt(&v); err != nil { 1187 return nil, syserr.TranslateNetstackError(err) 1188 } 1189 1190 // TODO(b/64800844): Translate fields once they are added to 1191 // tcpip.TCPInfoOption. 1192 info := linux.TCPInfo{ 1193 State: uint8(v.State), 1194 RTO: uint32(v.RTO / time.Microsecond), 1195 RTT: uint32(v.RTT / time.Microsecond), 1196 RTTVar: uint32(v.RTTVar / time.Microsecond), 1197 SndSsthresh: v.SndSsthresh, 1198 SndCwnd: v.SndCwnd, 1199 } 1200 switch v.CcState { 1201 case tcpip.RTORecovery: 1202 info.CaState = linux.TCP_CA_Loss 1203 case tcpip.FastRecovery, tcpip.SACKRecovery: 1204 info.CaState = linux.TCP_CA_Recovery 1205 case tcpip.Disorder: 1206 info.CaState = linux.TCP_CA_Disorder 1207 case tcpip.Open: 1208 info.CaState = linux.TCP_CA_Open 1209 } 1210 1211 // In netstack reorderSeen is updated only when RACK is enabled. 1212 // We only track whether the reordering is seen, which is 1213 // different than Linux where reorderSeen is not specific to 1214 // RACK and is incremented when a reordering event is seen. 1215 if v.ReorderSeen { 1216 info.ReordSeen = 1 1217 } 1218 1219 // Linux truncates the output binary to outLen. 
1220 buf := t.CopyScratchBuffer(info.SizeBytes()) 1221 info.MarshalUnsafe(buf) 1222 if len(buf) > outLen { 1223 buf = buf[:outLen] 1224 } 1225 bufP := primitive.ByteSlice(buf) 1226 return &bufP, nil 1227 1228 case linux.TCP_CC_INFO, 1229 linux.TCP_NOTSENT_LOWAT, 1230 linux.TCP_ZEROCOPY_RECEIVE: 1231 1232 // Not supported. 1233 1234 case linux.TCP_CONGESTION: 1235 if outLen <= 0 { 1236 return nil, syserr.ErrInvalidArgument 1237 } 1238 1239 var v tcpip.CongestionControlOption 1240 if err := ep.GetSockOpt(&v); err != nil { 1241 return nil, syserr.TranslateNetstackError(err) 1242 } 1243 1244 // We match linux behaviour here where it returns the lower of 1245 // TCP_CA_NAME_MAX bytes or the value of the option length. 1246 // 1247 // This is Linux's net/tcp.h TCP_CA_NAME_MAX. 1248 const tcpCANameMax = 16 1249 1250 toCopy := tcpCANameMax 1251 if outLen < tcpCANameMax { 1252 toCopy = outLen 1253 } 1254 b := make([]byte, toCopy) 1255 copy(b, v) 1256 1257 bP := primitive.ByteSlice(b) 1258 return &bP, nil 1259 1260 case linux.TCP_LINGER2: 1261 if outLen < sizeOfInt32 { 1262 return nil, syserr.ErrInvalidArgument 1263 } 1264 1265 var v tcpip.TCPLingerTimeoutOption 1266 if err := ep.GetSockOpt(&v); err != nil { 1267 return nil, syserr.TranslateNetstackError(err) 1268 } 1269 var lingerTimeout primitive.Int32 1270 if v >= 0 { 1271 lingerTimeout = primitive.Int32(time.Duration(v) / time.Second) 1272 } else { 1273 lingerTimeout = -1 1274 } 1275 return &lingerTimeout, nil 1276 1277 case linux.TCP_DEFER_ACCEPT: 1278 if outLen < sizeOfInt32 { 1279 return nil, syserr.ErrInvalidArgument 1280 } 1281 1282 var v tcpip.TCPDeferAcceptOption 1283 if err := ep.GetSockOpt(&v); err != nil { 1284 return nil, syserr.TranslateNetstackError(err) 1285 } 1286 1287 tcpDeferAccept := primitive.Int32(time.Duration(v) / time.Second) 1288 return &tcpDeferAccept, nil 1289 1290 case linux.TCP_SYNCNT: 1291 if outLen < sizeOfInt32 { 1292 return nil, syserr.ErrInvalidArgument 1293 } 1294 1295 v, err := 
ep.GetSockOptInt(tcpip.TCPSynCountOption) 1296 if err != nil { 1297 return nil, syserr.TranslateNetstackError(err) 1298 } 1299 vP := primitive.Int32(v) 1300 return &vP, nil 1301 1302 case linux.TCP_WINDOW_CLAMP: 1303 if outLen < sizeOfInt32 { 1304 return nil, syserr.ErrInvalidArgument 1305 } 1306 1307 v, err := ep.GetSockOptInt(tcpip.TCPWindowClampOption) 1308 if err != nil { 1309 return nil, syserr.TranslateNetstackError(err) 1310 } 1311 vP := primitive.Int32(v) 1312 return &vP, nil 1313 } 1314 return nil, syserr.ErrProtocolNotAvailable 1315 } 1316 1317 func getSockOptICMPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, outLen int) (marshal.Marshallable, *syserr.Error) { 1318 if _, ok := ep.(tcpip.Endpoint); !ok { 1319 log.Warningf("SOL_ICMPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name) 1320 return nil, syserr.ErrUnknownProtocolOption 1321 } 1322 1323 if family, _, _ := s.Type(); family != linux.AF_INET6 { 1324 return nil, syserr.ErrNotSupported 1325 } 1326 1327 switch name { 1328 case linux.ICMPV6_FILTER: 1329 var v tcpip.ICMPv6Filter 1330 if err := ep.GetSockOpt(&v); err != nil { 1331 return nil, syserr.TranslateNetstackError(err) 1332 } 1333 1334 filter := linux.ICMP6Filter{Filter: v.DenyType} 1335 1336 // Linux truncates the output to outLen. 
1337 buf := t.CopyScratchBuffer(filter.SizeBytes()) 1338 filter.MarshalUnsafe(buf) 1339 if len(buf) > outLen { 1340 buf = buf[:outLen] 1341 } 1342 bufP := primitive.ByteSlice(buf) 1343 return &bufP, nil 1344 } 1345 return nil, syserr.ErrProtocolNotAvailable 1346 } 1347 1348 func defaultTTL(t *kernel.Task, network tcpip.NetworkProtocolNumber) (primitive.Int32, tcpip.Error) { 1349 var opt tcpip.DefaultTTLOption 1350 stack := inet.StackFromContext(t) 1351 if err := stack.(*Stack).Stack.NetworkProtocolOption(network, &opt); err != nil { 1352 return 0, err 1353 } 1354 return primitive.Int32(opt), nil 1355 } 1356 1357 // getSockOptIPv6 implements GetSockOpt when level is SOL_IPV6. 1358 func getSockOptIPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { 1359 if _, ok := ep.(tcpip.Endpoint); !ok { 1360 log.Warningf("SOL_IPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name) 1361 return nil, syserr.ErrUnknownProtocolOption 1362 } 1363 1364 family, skType, _ := s.Type() 1365 if family != linux.AF_INET6 { 1366 return nil, syserr.ErrNotSupported 1367 } 1368 1369 switch name { 1370 case linux.IPV6_CHECKSUM: 1371 if outLen < sizeOfInt32 { 1372 return nil, syserr.ErrInvalidArgument 1373 } 1374 1375 v, err := ep.GetSockOptInt(tcpip.IPv6Checksum) 1376 if err != nil { 1377 return nil, syserr.TranslateNetstackError(err) 1378 } 1379 1380 vP := primitive.Int32(v) 1381 return &vP, nil 1382 1383 case linux.IPV6_V6ONLY: 1384 if outLen < sizeOfInt32 { 1385 return nil, syserr.ErrInvalidArgument 1386 } 1387 1388 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetV6Only())) 1389 return &v, nil 1390 1391 case linux.IPV6_UNICAST_HOPS: 1392 if outLen < sizeOfInt32 { 1393 return nil, syserr.ErrInvalidArgument 1394 } 1395 1396 v, err := ep.GetSockOptInt(tcpip.IPv6HopLimitOption) 1397 if err != nil { 1398 return nil, syserr.TranslateNetstackError(err) 1399 } 1400 1401 // Fill 
in the default value, if needed. 1402 vP := primitive.Int32(v) 1403 if vP == -1 { 1404 vP, err = defaultTTL(t, header.IPv6ProtocolNumber) 1405 if err != nil { 1406 return nil, syserr.TranslateNetstackError(err) 1407 } 1408 } 1409 1410 return &vP, nil 1411 1412 case linux.IPV6_RECVHOPLIMIT: 1413 if outLen < sizeOfInt32 { 1414 return nil, syserr.ErrInvalidArgument 1415 } 1416 1417 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveHopLimit())) 1418 return &v, nil 1419 1420 case linux.IPV6_PATHMTU: 1421 // Not supported. 1422 1423 case linux.IPV6_TCLASS: 1424 // Length handling for parity with Linux. 1425 if outLen == 0 { 1426 var b primitive.ByteSlice 1427 return &b, nil 1428 } 1429 v, err := ep.GetSockOptInt(tcpip.IPv6TrafficClassOption) 1430 if err != nil { 1431 return nil, syserr.TranslateNetstackError(err) 1432 } 1433 1434 uintv := primitive.Uint32(v) 1435 // Linux truncates the output binary to outLen. 1436 ib := t.CopyScratchBuffer(uintv.SizeBytes()) 1437 uintv.MarshalUnsafe(ib) 1438 // Handle cases where outLen is lesser than sizeOfInt32. 
1439 if len(ib) > outLen { 1440 ib = ib[:outLen] 1441 } 1442 ibP := primitive.ByteSlice(ib) 1443 return &ibP, nil 1444 1445 case linux.IPV6_RECVTCLASS: 1446 if outLen < sizeOfInt32 { 1447 return nil, syserr.ErrInvalidArgument 1448 } 1449 1450 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTClass())) 1451 return &v, nil 1452 case linux.IPV6_RECVERR: 1453 if outLen < sizeOfInt32 { 1454 return nil, syserr.ErrInvalidArgument 1455 } 1456 1457 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetIPv6RecvError())) 1458 return &v, nil 1459 1460 case linux.IPV6_RECVORIGDSTADDR: 1461 if outLen < sizeOfInt32 { 1462 return nil, syserr.ErrInvalidArgument 1463 } 1464 1465 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveOriginalDstAddress())) 1466 return &v, nil 1467 1468 case linux.IPV6_RECVPKTINFO: 1469 if outLen < sizeOfInt32 { 1470 return nil, syserr.ErrInvalidArgument 1471 } 1472 1473 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetIPv6ReceivePacketInfo())) 1474 return &v, nil 1475 1476 case linux.IP6T_ORIGINAL_DST: 1477 if outLen < sockAddrInet6Size { 1478 return nil, syserr.ErrInvalidArgument 1479 } 1480 1481 var v tcpip.OriginalDestinationOption 1482 if err := ep.GetSockOpt(&v); err != nil { 1483 return nil, syserr.TranslateNetstackError(err) 1484 } 1485 1486 a, _ := socket.ConvertAddress(linux.AF_INET6, tcpip.FullAddress(v)) 1487 return a.(*linux.SockAddrInet6), nil 1488 1489 case linux.IP6T_SO_GET_INFO: 1490 if outLen < linux.SizeOfIPTGetinfo { 1491 return nil, syserr.ErrInvalidArgument 1492 } 1493 1494 // Only valid for raw IPv6 sockets. 
1495 if skType != linux.SOCK_RAW { 1496 return nil, syserr.ErrProtocolNotAvailable 1497 } 1498 1499 stk := inet.StackFromContext(t) 1500 if stk == nil { 1501 return nil, syserr.ErrNoDevice 1502 } 1503 info, err := netfilter.GetInfo(t, stk.(*Stack).Stack, outPtr, true) 1504 if err != nil { 1505 return nil, err 1506 } 1507 return &info, nil 1508 1509 case linux.IP6T_SO_GET_ENTRIES: 1510 // IPTGetEntries is reused for IPv6. 1511 if outLen < linux.SizeOfIPTGetEntries { 1512 return nil, syserr.ErrInvalidArgument 1513 } 1514 // Only valid for raw IPv6 sockets. 1515 if skType != linux.SOCK_RAW { 1516 return nil, syserr.ErrProtocolNotAvailable 1517 } 1518 1519 stk := inet.StackFromContext(t) 1520 if stk == nil { 1521 return nil, syserr.ErrNoDevice 1522 } 1523 entries, err := netfilter.GetEntries6(t, stk.(*Stack).Stack, outPtr, outLen) 1524 if err != nil { 1525 return nil, err 1526 } 1527 return &entries, nil 1528 1529 case linux.IP6T_SO_GET_REVISION_TARGET: 1530 if outLen < linux.SizeOfXTGetRevision { 1531 return nil, syserr.ErrInvalidArgument 1532 } 1533 1534 // Only valid for raw IPv6 sockets. 1535 if skType != linux.SOCK_RAW { 1536 return nil, syserr.ErrProtocolNotAvailable 1537 } 1538 1539 stk := inet.StackFromContext(t) 1540 if stk == nil { 1541 return nil, syserr.ErrNoDevice 1542 } 1543 ret, err := netfilter.TargetRevision(t, outPtr, header.IPv6ProtocolNumber) 1544 if err != nil { 1545 return nil, err 1546 } 1547 return &ret, nil 1548 } 1549 return nil, syserr.ErrProtocolNotAvailable 1550 } 1551 1552 // getSockOptIP implements GetSockOpt when level is SOL_IP. 
1553 func getSockOptIP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, outPtr hostarch.Addr, outLen int, _ int) (marshal.Marshallable, *syserr.Error) { 1554 if _, ok := ep.(tcpip.Endpoint); !ok { 1555 log.Warningf("SOL_IP options not supported on endpoints other than tcpip.Endpoint: option = %d", name) 1556 return nil, syserr.ErrUnknownProtocolOption 1557 } 1558 1559 switch name { 1560 case linux.IP_TTL: 1561 if outLen < sizeOfInt32 { 1562 return nil, syserr.ErrInvalidArgument 1563 } 1564 1565 v, err := ep.GetSockOptInt(tcpip.IPv4TTLOption) 1566 if err != nil { 1567 return nil, syserr.TranslateNetstackError(err) 1568 } 1569 1570 // Fill in the default value, if needed. 1571 vP := primitive.Int32(v) 1572 if vP == 0 { 1573 vP, err = defaultTTL(t, header.IPv4ProtocolNumber) 1574 if err != nil { 1575 return nil, syserr.TranslateNetstackError(err) 1576 } 1577 } 1578 1579 return &vP, nil 1580 1581 case linux.IP_RECVTTL: 1582 if outLen < sizeOfInt32 { 1583 return nil, syserr.ErrInvalidArgument 1584 } 1585 1586 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTTL())) 1587 return &v, nil 1588 1589 case linux.IP_MULTICAST_TTL: 1590 if outLen < sizeOfInt32 { 1591 return nil, syserr.ErrInvalidArgument 1592 } 1593 1594 v, err := ep.GetSockOptInt(tcpip.MulticastTTLOption) 1595 if err != nil { 1596 return nil, syserr.TranslateNetstackError(err) 1597 } 1598 1599 vP := primitive.Int32(v) 1600 return &vP, nil 1601 1602 case linux.IP_MULTICAST_IF: 1603 if outLen < len(linux.InetAddr{}) { 1604 return nil, syserr.ErrInvalidArgument 1605 } 1606 1607 var v tcpip.MulticastInterfaceOption 1608 if err := ep.GetSockOpt(&v); err != nil { 1609 return nil, syserr.TranslateNetstackError(err) 1610 } 1611 1612 a, _ := socket.ConvertAddress(linux.AF_INET, tcpip.FullAddress{Addr: v.InterfaceAddr}) 1613 1614 return &a.(*linux.SockAddrInet).Addr, nil 1615 1616 case linux.IP_MULTICAST_LOOP: 1617 if outLen < sizeOfInt32 { 1618 return nil, syserr.ErrInvalidArgument 1619 } 1620 
1621 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetMulticastLoop())) 1622 return &v, nil 1623 1624 case linux.IP_TOS: 1625 // Length handling for parity with Linux. 1626 if outLen == 0 { 1627 var b primitive.ByteSlice 1628 return &b, nil 1629 } 1630 v, err := ep.GetSockOptInt(tcpip.IPv4TOSOption) 1631 if err != nil { 1632 return nil, syserr.TranslateNetstackError(err) 1633 } 1634 if outLen < sizeOfInt32 { 1635 vP := primitive.Uint8(v) 1636 return &vP, nil 1637 } 1638 vP := primitive.Int32(v) 1639 return &vP, nil 1640 1641 case linux.IP_RECVTOS: 1642 if outLen < sizeOfInt32 { 1643 return nil, syserr.ErrInvalidArgument 1644 } 1645 1646 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTOS())) 1647 return &v, nil 1648 1649 case linux.IP_RECVERR: 1650 if outLen < sizeOfInt32 { 1651 return nil, syserr.ErrInvalidArgument 1652 } 1653 1654 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetIPv4RecvError())) 1655 return &v, nil 1656 1657 case linux.IP_PKTINFO: 1658 if outLen < sizeOfInt32 { 1659 return nil, syserr.ErrInvalidArgument 1660 } 1661 1662 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceivePacketInfo())) 1663 return &v, nil 1664 1665 case linux.IP_HDRINCL: 1666 if outLen < sizeOfInt32 { 1667 return nil, syserr.ErrInvalidArgument 1668 } 1669 1670 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetHeaderIncluded())) 1671 return &v, nil 1672 1673 case linux.IP_RECVORIGDSTADDR: 1674 if outLen < sizeOfInt32 { 1675 return nil, syserr.ErrInvalidArgument 1676 } 1677 1678 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveOriginalDstAddress())) 1679 return &v, nil 1680 1681 case linux.SO_ORIGINAL_DST: 1682 if outLen < sockAddrInetSize { 1683 return nil, syserr.ErrInvalidArgument 1684 } 1685 1686 var v tcpip.OriginalDestinationOption 1687 if err := ep.GetSockOpt(&v); err != nil { 1688 return nil, syserr.TranslateNetstackError(err) 1689 } 1690 1691 a, _ := socket.ConvertAddress(linux.AF_INET, tcpip.FullAddress(v)) 1692 
return a.(*linux.SockAddrInet), nil 1693 1694 case linux.IPT_SO_GET_INFO: 1695 if outLen < linux.SizeOfIPTGetinfo { 1696 return nil, syserr.ErrInvalidArgument 1697 } 1698 1699 // Only valid for raw IPv4 sockets. 1700 if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW { 1701 return nil, syserr.ErrProtocolNotAvailable 1702 } 1703 1704 stk := inet.StackFromContext(t) 1705 if stk == nil { 1706 return nil, syserr.ErrNoDevice 1707 } 1708 info, err := netfilter.GetInfo(t, stk.(*Stack).Stack, outPtr, false) 1709 if err != nil { 1710 return nil, err 1711 } 1712 return &info, nil 1713 1714 case linux.IPT_SO_GET_ENTRIES: 1715 if outLen < linux.SizeOfIPTGetEntries { 1716 return nil, syserr.ErrInvalidArgument 1717 } 1718 1719 // Only valid for raw IPv4 sockets. 1720 if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW { 1721 return nil, syserr.ErrProtocolNotAvailable 1722 } 1723 1724 stk := inet.StackFromContext(t) 1725 if stk == nil { 1726 return nil, syserr.ErrNoDevice 1727 } 1728 entries, err := netfilter.GetEntries4(t, stk.(*Stack).Stack, outPtr, outLen) 1729 if err != nil { 1730 return nil, err 1731 } 1732 return &entries, nil 1733 1734 case linux.IPT_SO_GET_REVISION_TARGET: 1735 if outLen < linux.SizeOfXTGetRevision { 1736 return nil, syserr.ErrInvalidArgument 1737 } 1738 1739 // Only valid for raw IPv4 sockets. 1740 if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW { 1741 return nil, syserr.ErrProtocolNotAvailable 1742 } 1743 1744 stk := inet.StackFromContext(t) 1745 if stk == nil { 1746 return nil, syserr.ErrNoDevice 1747 } 1748 ret, err := netfilter.TargetRevision(t, outPtr, header.IPv4ProtocolNumber) 1749 if err != nil { 1750 return nil, err 1751 } 1752 return &ret, nil 1753 } 1754 return nil, syserr.ErrProtocolNotAvailable 1755 } 1756 1757 // SetSockOpt can be used to implement the linux syscall setsockopt(2) for 1758 // sockets backed by a commonEndpoint. 
1759 func SetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, level int, name int, optVal []byte) *syserr.Error { 1760 switch level { 1761 case linux.SOL_SOCKET: 1762 return setSockOptSocket(t, s, ep, name, optVal) 1763 1764 case linux.SOL_TCP: 1765 return setSockOptTCP(t, s, ep, name, optVal) 1766 1767 case linux.SOL_ICMPV6: 1768 return setSockOptICMPv6(t, s, ep, name, optVal) 1769 1770 case linux.SOL_IPV6: 1771 return setSockOptIPv6(t, s, ep, name, optVal) 1772 1773 case linux.SOL_IP: 1774 return setSockOptIP(t, s, ep, name, optVal) 1775 1776 case linux.SOL_PACKET: 1777 // gVisor doesn't support any SOL_PACKET options just return not 1778 // supported. Returning nil here will result in tcpdump thinking AF_PACKET 1779 // features are supported and proceed to use them and break. 1780 return syserr.ErrProtocolNotAvailable 1781 1782 case linux.SOL_UDP, 1783 linux.SOL_RAW: 1784 // Not supported. 1785 } 1786 1787 return nil 1788 } 1789 1790 func clampBufSize(newSz, min, max int64, ignoreMax bool) int64 { 1791 // packetOverheadFactor is used to multiply the value provided by the user on 1792 // a setsockopt(2) for setting the send/receive buffer sizes sockets. 1793 const packetOverheadFactor = 2 1794 1795 if !ignoreMax && newSz > max { 1796 newSz = max 1797 } 1798 1799 if newSz < math.MaxInt32/packetOverheadFactor { 1800 newSz *= packetOverheadFactor 1801 if newSz < min { 1802 newSz = min 1803 } 1804 } else { 1805 newSz = math.MaxInt32 1806 } 1807 return newSz 1808 } 1809 1810 // setSockOptSocket implements SetSockOpt when level is SOL_SOCKET. 
1811 func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error { 1812 switch name { 1813 case linux.SO_SNDBUF: 1814 if len(optVal) < sizeOfInt32 { 1815 return syserr.ErrInvalidArgument 1816 } 1817 1818 v := hostarch.ByteOrder.Uint32(optVal) 1819 min, max := ep.SocketOptions().SendBufferLimits() 1820 clamped := clampBufSize(int64(v), min, max, false /* ignoreMax */) 1821 ep.SocketOptions().SetSendBufferSize(clamped, true /* notify */) 1822 return nil 1823 1824 case linux.SO_RCVBUF: 1825 if len(optVal) < sizeOfInt32 { 1826 return syserr.ErrInvalidArgument 1827 } 1828 1829 v := hostarch.ByteOrder.Uint32(optVal) 1830 min, max := ep.SocketOptions().ReceiveBufferLimits() 1831 clamped := clampBufSize(int64(v), min, max, false /* ignoreMax */) 1832 ep.SocketOptions().SetReceiveBufferSize(clamped, true /* notify */) 1833 return nil 1834 1835 case linux.SO_RCVBUFFORCE: 1836 if len(optVal) < sizeOfInt32 { 1837 return syserr.ErrInvalidArgument 1838 } 1839 1840 if creds := auth.CredentialsFromContext(t); !creds.HasCapability(linux.CAP_NET_ADMIN) { 1841 return syserr.ErrNotPermitted 1842 } 1843 1844 v := hostarch.ByteOrder.Uint32(optVal) 1845 min, max := ep.SocketOptions().ReceiveBufferLimits() 1846 clamped := clampBufSize(int64(v), min, max, true /* ignoreMax */) 1847 ep.SocketOptions().SetReceiveBufferSize(clamped, true /* notify */) 1848 return nil 1849 1850 case linux.SO_REUSEADDR: 1851 if len(optVal) < sizeOfInt32 { 1852 return syserr.ErrInvalidArgument 1853 } 1854 1855 v := hostarch.ByteOrder.Uint32(optVal) 1856 ep.SocketOptions().SetReuseAddress(v != 0) 1857 return nil 1858 1859 case linux.SO_REUSEPORT: 1860 if len(optVal) < sizeOfInt32 { 1861 return syserr.ErrInvalidArgument 1862 } 1863 1864 v := hostarch.ByteOrder.Uint32(optVal) 1865 ep.SocketOptions().SetReusePort(v != 0) 1866 return nil 1867 1868 case linux.SO_BINDTODEVICE: 1869 n := bytes.IndexByte(optVal, 0) 1870 if n == -1 { 1871 n = len(optVal) 1872 } 1873 
name := string(optVal[:n]) 1874 if name == "" { 1875 return syserr.TranslateNetstackError(ep.SocketOptions().SetBindToDevice(0)) 1876 } 1877 s := t.NetworkContext() 1878 if s == nil { 1879 return syserr.ErrNoDevice 1880 } 1881 for nicID, nic := range s.Interfaces() { 1882 if nic.Name == name { 1883 return syserr.TranslateNetstackError(ep.SocketOptions().SetBindToDevice(nicID)) 1884 } 1885 } 1886 return syserr.ErrUnknownDevice 1887 1888 case linux.SO_BROADCAST: 1889 if len(optVal) < sizeOfInt32 { 1890 return syserr.ErrInvalidArgument 1891 } 1892 1893 v := hostarch.ByteOrder.Uint32(optVal) 1894 ep.SocketOptions().SetBroadcast(v != 0) 1895 return nil 1896 1897 case linux.SO_PASSCRED: 1898 if len(optVal) < sizeOfInt32 { 1899 return syserr.ErrInvalidArgument 1900 } 1901 1902 v := hostarch.ByteOrder.Uint32(optVal) 1903 ep.SocketOptions().SetPassCred(v != 0) 1904 return nil 1905 1906 case linux.SO_KEEPALIVE: 1907 if len(optVal) < sizeOfInt32 { 1908 return syserr.ErrInvalidArgument 1909 } 1910 1911 v := hostarch.ByteOrder.Uint32(optVal) 1912 ep.SocketOptions().SetKeepAlive(v != 0) 1913 return nil 1914 1915 case linux.SO_SNDTIMEO: 1916 if len(optVal) < linux.SizeOfTimeval { 1917 return syserr.ErrInvalidArgument 1918 } 1919 1920 var v linux.Timeval 1921 v.UnmarshalBytes(optVal) 1922 if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) { 1923 return syserr.ErrDomain 1924 } 1925 s.SetSendTimeout(v.ToNsecCapped()) 1926 return nil 1927 1928 case linux.SO_RCVTIMEO: 1929 if len(optVal) < linux.SizeOfTimeval { 1930 return syserr.ErrInvalidArgument 1931 } 1932 1933 var v linux.Timeval 1934 v.UnmarshalBytes(optVal) 1935 if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) { 1936 return syserr.ErrDomain 1937 } 1938 s.SetRecvTimeout(v.ToNsecCapped()) 1939 return nil 1940 1941 case linux.SO_OOBINLINE: 1942 if len(optVal) < sizeOfInt32 { 1943 return syserr.ErrInvalidArgument 1944 } 1945 1946 v := hostarch.ByteOrder.Uint32(optVal) 1947 
ep.SocketOptions().SetOutOfBandInline(v != 0) 1948 return nil 1949 1950 case linux.SO_NO_CHECK: 1951 if len(optVal) < sizeOfInt32 { 1952 return syserr.ErrInvalidArgument 1953 } 1954 1955 v := hostarch.ByteOrder.Uint32(optVal) 1956 ep.SocketOptions().SetNoChecksum(v != 0) 1957 return nil 1958 1959 case linux.SO_LINGER: 1960 if len(optVal) < linux.SizeOfLinger { 1961 return syserr.ErrInvalidArgument 1962 } 1963 1964 var v linux.Linger 1965 v.UnmarshalBytes(optVal) 1966 1967 ep.SocketOptions().SetLinger(tcpip.LingerOption{ 1968 Enabled: v.OnOff != 0, 1969 Timeout: time.Second * time.Duration(v.Linger), 1970 }) 1971 return nil 1972 1973 case linux.SO_DETACH_FILTER: 1974 // optval is ignored. 1975 var v tcpip.SocketDetachFilterOption 1976 return syserr.TranslateNetstackError(ep.SetSockOpt(&v)) 1977 1978 // TODO(b/226603727): Add support for SO_RCVLOWAT option. For now, only 1979 // the unsupported syscall message is removed. 1980 case linux.SO_RCVLOWAT: 1981 if len(optVal) < sizeOfInt32 { 1982 return syserr.ErrInvalidArgument 1983 } 1984 1985 v := hostarch.ByteOrder.Uint32(optVal) 1986 ep.SocketOptions().SetRcvlowat(int32(v)) 1987 return nil 1988 } 1989 1990 return nil 1991 } 1992 1993 // setSockOptTCP implements SetSockOpt when level is SOL_TCP. 
1994 func setSockOptTCP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error { 1995 if !socket.IsTCP(s) { 1996 return syserr.ErrUnknownProtocolOption 1997 } 1998 1999 switch name { 2000 case linux.TCP_NODELAY: 2001 if len(optVal) < sizeOfInt32 { 2002 return syserr.ErrInvalidArgument 2003 } 2004 2005 v := hostarch.ByteOrder.Uint32(optVal) 2006 ep.SocketOptions().SetDelayOption(v == 0) 2007 return nil 2008 2009 case linux.TCP_CORK: 2010 if len(optVal) < sizeOfInt32 { 2011 return syserr.ErrInvalidArgument 2012 } 2013 2014 v := hostarch.ByteOrder.Uint32(optVal) 2015 ep.SocketOptions().SetCorkOption(v != 0) 2016 return nil 2017 2018 case linux.TCP_QUICKACK: 2019 if len(optVal) < sizeOfInt32 { 2020 return syserr.ErrInvalidArgument 2021 } 2022 2023 v := hostarch.ByteOrder.Uint32(optVal) 2024 ep.SocketOptions().SetQuickAck(v != 0) 2025 return nil 2026 2027 case linux.TCP_MAXSEG: 2028 if len(optVal) < sizeOfInt32 { 2029 return syserr.ErrInvalidArgument 2030 } 2031 2032 v := hostarch.ByteOrder.Uint32(optVal) 2033 return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MaxSegOption, int(v))) 2034 2035 case linux.TCP_KEEPIDLE: 2036 if len(optVal) < sizeOfInt32 { 2037 return syserr.ErrInvalidArgument 2038 } 2039 2040 v := hostarch.ByteOrder.Uint32(optVal) 2041 if v < 1 || v > linux.MAX_TCP_KEEPIDLE { 2042 return syserr.ErrInvalidArgument 2043 } 2044 opt := tcpip.KeepaliveIdleOption(time.Second * time.Duration(v)) 2045 return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) 2046 2047 case linux.TCP_KEEPINTVL: 2048 if len(optVal) < sizeOfInt32 { 2049 return syserr.ErrInvalidArgument 2050 } 2051 2052 v := hostarch.ByteOrder.Uint32(optVal) 2053 if v < 1 || v > linux.MAX_TCP_KEEPINTVL { 2054 return syserr.ErrInvalidArgument 2055 } 2056 opt := tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v)) 2057 return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) 2058 2059 case linux.TCP_KEEPCNT: 2060 if len(optVal) < sizeOfInt32 { 2061 
return syserr.ErrInvalidArgument 2062 } 2063 2064 v := hostarch.ByteOrder.Uint32(optVal) 2065 if v < 1 || v > linux.MAX_TCP_KEEPCNT { 2066 return syserr.ErrInvalidArgument 2067 } 2068 return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.KeepaliveCountOption, int(v))) 2069 2070 case linux.TCP_USER_TIMEOUT: 2071 if len(optVal) < sizeOfInt32 { 2072 return syserr.ErrInvalidArgument 2073 } 2074 2075 v := int32(hostarch.ByteOrder.Uint32(optVal)) 2076 if v < 0 { 2077 return syserr.ErrInvalidArgument 2078 } 2079 opt := tcpip.TCPUserTimeoutOption(time.Millisecond * time.Duration(v)) 2080 return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) 2081 2082 case linux.TCP_CONGESTION: 2083 v := tcpip.CongestionControlOption(optVal) 2084 if err := ep.SetSockOpt(&v); err != nil { 2085 return syserr.TranslateNetstackError(err) 2086 } 2087 return nil 2088 2089 case linux.TCP_LINGER2: 2090 if len(optVal) < sizeOfInt32 { 2091 return syserr.ErrInvalidArgument 2092 } 2093 2094 v := int32(hostarch.ByteOrder.Uint32(optVal)) 2095 opt := tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v)) 2096 return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) 2097 2098 case linux.TCP_DEFER_ACCEPT: 2099 if len(optVal) < sizeOfInt32 { 2100 return syserr.ErrInvalidArgument 2101 } 2102 v := int32(hostarch.ByteOrder.Uint32(optVal)) 2103 if v < 0 { 2104 v = 0 2105 } 2106 opt := tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v)) 2107 return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) 2108 2109 case linux.TCP_SYNCNT: 2110 if len(optVal) < sizeOfInt32 { 2111 return syserr.ErrInvalidArgument 2112 } 2113 v := hostarch.ByteOrder.Uint32(optVal) 2114 2115 return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPSynCountOption, int(v))) 2116 2117 case linux.TCP_WINDOW_CLAMP: 2118 if len(optVal) < sizeOfInt32 { 2119 return syserr.ErrInvalidArgument 2120 } 2121 v := hostarch.ByteOrder.Uint32(optVal) 2122 2123 return 
syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPWindowClampOption, int(v))) 2124 2125 case linux.TCP_REPAIR_OPTIONS: 2126 // Not supported. 2127 } 2128 2129 return nil 2130 } 2131 2132 func setSockOptICMPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error { 2133 if _, ok := ep.(tcpip.Endpoint); !ok { 2134 log.Warningf("SOL_ICMPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name) 2135 return syserr.ErrUnknownProtocolOption 2136 } 2137 2138 if family, _, _ := s.Type(); family != linux.AF_INET6 { 2139 return syserr.ErrUnknownProtocolOption 2140 } 2141 2142 switch name { 2143 case linux.ICMPV6_FILTER: 2144 var req linux.ICMP6Filter 2145 if len(optVal) < req.SizeBytes() { 2146 return syserr.ErrInvalidArgument 2147 } 2148 2149 req.UnmarshalUnsafe(optVal) 2150 return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.ICMPv6Filter{DenyType: req.Filter})) 2151 } 2152 2153 return nil 2154 } 2155 2156 // setSockOptIPv6 implements SetSockOpt when level is SOL_IPV6. 2157 func setSockOptIPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error { 2158 if _, ok := ep.(tcpip.Endpoint); !ok { 2159 log.Warningf("SOL_IPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name) 2160 return syserr.ErrUnknownProtocolOption 2161 } 2162 2163 family, _, _ := s.Type() 2164 if family != linux.AF_INET6 { 2165 return syserr.ErrUnknownProtocolOption 2166 } 2167 2168 switch name { 2169 case linux.IPV6_CHECKSUM: 2170 if len(optVal) < sizeOfInt32 { 2171 return syserr.ErrInvalidArgument 2172 } 2173 2174 // int may not be 32-bits so we cast the uint32 to an int32 before casting 2175 // to an int. 
2176 return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6Checksum, int(int32(hostarch.ByteOrder.Uint32(optVal))))) 2177 2178 case linux.IPV6_V6ONLY: 2179 if len(optVal) < sizeOfInt32 { 2180 return syserr.ErrInvalidArgument 2181 } 2182 2183 if socket.IsTCP(s) && tcp.EndpointState(ep.State()) != tcp.StateInitial { 2184 return syserr.ErrInvalidEndpointState 2185 } else if socket.IsUDP(s) && transport.DatagramEndpointState(ep.State()) != transport.DatagramEndpointStateInitial { 2186 return syserr.ErrInvalidEndpointState 2187 } 2188 2189 v := hostarch.ByteOrder.Uint32(optVal) 2190 ep.SocketOptions().SetV6Only(v != 0) 2191 return nil 2192 2193 case linux.IPV6_ADD_MEMBERSHIP: 2194 req, err := copyInMulticastV6Request(optVal) 2195 if err != nil { 2196 return err 2197 } 2198 2199 return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.AddMembershipOption{ 2200 NIC: tcpip.NICID(req.InterfaceIndex), 2201 MulticastAddr: tcpip.AddrFrom16(req.MulticastAddr), 2202 })) 2203 2204 case linux.IPV6_DROP_MEMBERSHIP: 2205 req, err := copyInMulticastV6Request(optVal) 2206 if err != nil { 2207 return err 2208 } 2209 2210 return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.RemoveMembershipOption{ 2211 NIC: tcpip.NICID(req.InterfaceIndex), 2212 MulticastAddr: tcpip.AddrFrom16(req.MulticastAddr), 2213 })) 2214 2215 case linux.IPV6_IPSEC_POLICY, 2216 linux.IPV6_JOIN_ANYCAST, 2217 linux.IPV6_LEAVE_ANYCAST, 2218 // TODO(b/148887420): Add support for IPV6_PKTINFO. 2219 linux.IPV6_PKTINFO, 2220 linux.IPV6_ROUTER_ALERT, 2221 linux.IPV6_XFRM_POLICY, 2222 linux.MCAST_BLOCK_SOURCE, 2223 linux.MCAST_JOIN_GROUP, 2224 linux.MCAST_JOIN_SOURCE_GROUP, 2225 linux.MCAST_LEAVE_GROUP, 2226 linux.MCAST_LEAVE_SOURCE_GROUP, 2227 linux.MCAST_UNBLOCK_SOURCE: 2228 // Not supported. 
2229 2230 case linux.IPV6_RECVORIGDSTADDR: 2231 if len(optVal) < sizeOfInt32 { 2232 return syserr.ErrInvalidArgument 2233 } 2234 v := int32(hostarch.ByteOrder.Uint32(optVal)) 2235 2236 ep.SocketOptions().SetReceiveOriginalDstAddress(v != 0) 2237 return nil 2238 2239 case linux.IPV6_RECVPKTINFO: 2240 if len(optVal) < sizeOfInt32 { 2241 return syserr.ErrInvalidArgument 2242 } 2243 v := int32(hostarch.ByteOrder.Uint32(optVal)) 2244 2245 ep.SocketOptions().SetIPv6ReceivePacketInfo(v != 0) 2246 return nil 2247 2248 case linux.IPV6_UNICAST_HOPS: 2249 if len(optVal) < sizeOfInt32 { 2250 return syserr.ErrInvalidArgument 2251 } 2252 v := int32(hostarch.ByteOrder.Uint32(optVal)) 2253 if v < -1 || v > 255 { 2254 return syserr.ErrInvalidArgument 2255 } 2256 return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6HopLimitOption, int(v))) 2257 2258 case linux.IPV6_RECVHOPLIMIT: 2259 v, err := parseIntOrChar(optVal) 2260 if err != nil { 2261 return err 2262 } 2263 2264 ep.SocketOptions().SetReceiveHopLimit(v != 0) 2265 return nil 2266 2267 case linux.IPV6_TCLASS: 2268 if len(optVal) < sizeOfInt32 { 2269 return syserr.ErrInvalidArgument 2270 } 2271 v := int32(hostarch.ByteOrder.Uint32(optVal)) 2272 if v < -1 || v > 255 { 2273 return syserr.ErrInvalidArgument 2274 } 2275 if v == -1 { 2276 v = 0 2277 } 2278 return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6TrafficClassOption, int(v))) 2279 2280 case linux.IPV6_RECVTCLASS: 2281 v, err := parseIntOrChar(optVal) 2282 if err != nil { 2283 return err 2284 } 2285 2286 ep.SocketOptions().SetReceiveTClass(v != 0) 2287 return nil 2288 case linux.IPV6_RECVERR: 2289 if len(optVal) == 0 { 2290 return nil 2291 } 2292 v, err := parseIntOrChar(optVal) 2293 if err != nil { 2294 return err 2295 } 2296 ep.SocketOptions().SetIPv6RecvError(v != 0) 2297 return nil 2298 2299 case linux.IP6T_SO_SET_REPLACE: 2300 if len(optVal) < linux.SizeOfIP6TReplace { 2301 return syserr.ErrInvalidArgument 2302 } 2303 2304 // Only valid for raw 
IPv6 sockets. 2305 if !socket.IsRaw(s) { 2306 return syserr.ErrProtocolNotAvailable 2307 } 2308 2309 stk := inet.StackFromContext(t) 2310 if stk == nil { 2311 return syserr.ErrNoDevice 2312 } 2313 // Stack must be a netstack stack. 2314 return netfilter.SetEntries(t.Credentials().UserNamespace, stk.(*Stack).Stack, optVal, true) 2315 2316 case linux.IP6T_SO_SET_ADD_COUNTERS: 2317 log.Infof("IP6T_SO_SET_ADD_COUNTERS is not supported") 2318 return nil 2319 } 2320 2321 return nil 2322 } 2323 2324 var ( 2325 inetMulticastRequestSize = (*linux.InetMulticastRequest)(nil).SizeBytes() 2326 inetMulticastRequestWithNICSize = (*linux.InetMulticastRequestWithNIC)(nil).SizeBytes() 2327 inet6MulticastRequestSize = (*linux.Inet6MulticastRequest)(nil).SizeBytes() 2328 ) 2329 2330 // copyInMulticastRequest copies in a variable-size multicast request. The 2331 // kernel determines which structure was passed by its length. IP_MULTICAST_IF 2332 // supports ip_mreqn, ip_mreq and in_addr, while IP_ADD_MEMBERSHIP and 2333 // IP_DROP_MEMBERSHIP only support ip_mreqn and ip_mreq. To handle this, 2334 // allowAddr controls whether in_addr is accepted or rejected. 
2335 func copyInMulticastRequest(optVal []byte, allowAddr bool) (linux.InetMulticastRequestWithNIC, *syserr.Error) { 2336 if len(optVal) < len(linux.InetAddr{}) { 2337 return linux.InetMulticastRequestWithNIC{}, syserr.ErrInvalidArgument 2338 } 2339 2340 if len(optVal) < inetMulticastRequestSize { 2341 if !allowAddr { 2342 return linux.InetMulticastRequestWithNIC{}, syserr.ErrInvalidArgument 2343 } 2344 2345 var req linux.InetMulticastRequestWithNIC 2346 copy(req.InterfaceAddr[:], optVal) 2347 return req, nil 2348 } 2349 2350 if len(optVal) >= inetMulticastRequestWithNICSize { 2351 var req linux.InetMulticastRequestWithNIC 2352 req.UnmarshalUnsafe(optVal) 2353 return req, nil 2354 } 2355 2356 var req linux.InetMulticastRequestWithNIC 2357 req.InetMulticastRequest.UnmarshalUnsafe(optVal) 2358 return req, nil 2359 } 2360 2361 func copyInMulticastV6Request(optVal []byte) (linux.Inet6MulticastRequest, *syserr.Error) { 2362 if len(optVal) < inet6MulticastRequestSize { 2363 return linux.Inet6MulticastRequest{}, syserr.ErrInvalidArgument 2364 } 2365 2366 var req linux.Inet6MulticastRequest 2367 req.UnmarshalUnsafe(optVal) 2368 return req, nil 2369 } 2370 2371 // parseIntOrChar copies either a 32-bit int or an 8-bit uint out of buf. 2372 // 2373 // net/ipv4/ip_sockglue.c:do_ip_setsockopt does this for its socket options. 2374 func parseIntOrChar(buf []byte) (int32, *syserr.Error) { 2375 if len(buf) == 0 { 2376 return 0, syserr.ErrInvalidArgument 2377 } 2378 2379 if len(buf) >= sizeOfInt32 { 2380 return int32(hostarch.ByteOrder.Uint32(buf)), nil 2381 } 2382 2383 return int32(buf[0]), nil 2384 } 2385 2386 // setSockOptIP implements SetSockOpt when level is SOL_IP. 
2387 func setSockOptIP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error { 2388 if _, ok := ep.(tcpip.Endpoint); !ok { 2389 log.Warningf("SOL_IP options not supported on endpoints other than tcpip.Endpoint: option = %d", name) 2390 return syserr.ErrUnknownProtocolOption 2391 } 2392 2393 switch name { 2394 case linux.IP_MULTICAST_TTL: 2395 v, err := parseIntOrChar(optVal) 2396 if err != nil { 2397 return err 2398 } 2399 2400 if v == -1 { 2401 // Linux translates -1 to 1. 2402 v = 1 2403 } 2404 if v < 0 || v > 255 { 2405 return syserr.ErrInvalidArgument 2406 } 2407 return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MulticastTTLOption, int(v))) 2408 2409 case linux.IP_ADD_MEMBERSHIP: 2410 req, err := copyInMulticastRequest(optVal, false /* allowAddr */) 2411 if err != nil { 2412 return err 2413 } 2414 2415 return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.AddMembershipOption{ 2416 NIC: tcpip.NICID(req.InterfaceIndex), 2417 // TODO(igudger): Change AddMembership to use the standard 2418 // any address representation. 2419 InterfaceAddr: tcpip.AddrFrom4(req.InterfaceAddr), 2420 MulticastAddr: tcpip.AddrFrom4(req.MulticastAddr), 2421 })) 2422 2423 case linux.IP_DROP_MEMBERSHIP: 2424 req, err := copyInMulticastRequest(optVal, false /* allowAddr */) 2425 if err != nil { 2426 return err 2427 } 2428 2429 return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.RemoveMembershipOption{ 2430 NIC: tcpip.NICID(req.InterfaceIndex), 2431 // TODO(igudger): Change DropMembership to use the standard 2432 // any address representation. 
2433 InterfaceAddr: tcpip.AddrFrom4(req.InterfaceAddr), 2434 MulticastAddr: tcpip.AddrFrom4(req.MulticastAddr), 2435 })) 2436 2437 case linux.IP_MULTICAST_IF: 2438 req, err := copyInMulticastRequest(optVal, true /* allowAddr */) 2439 if err != nil { 2440 return err 2441 } 2442 2443 return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.MulticastInterfaceOption{ 2444 NIC: tcpip.NICID(req.InterfaceIndex), 2445 InterfaceAddr: socket.BytesToIPAddress(req.InterfaceAddr[:]), 2446 })) 2447 2448 case linux.IP_MULTICAST_LOOP: 2449 v, err := parseIntOrChar(optVal) 2450 if err != nil { 2451 return err 2452 } 2453 2454 ep.SocketOptions().SetMulticastLoop(v != 0) 2455 return nil 2456 2457 case linux.MCAST_JOIN_GROUP: 2458 // FIXME(b/124219304): Implement MCAST_JOIN_GROUP. 2459 return syserr.ErrInvalidArgument 2460 2461 case linux.IP_TTL: 2462 v, err := parseIntOrChar(optVal) 2463 if err != nil { 2464 return err 2465 } 2466 2467 // -1 means default TTL. 2468 if v == -1 { 2469 v = 0 2470 } else if v < 1 || v > 255 { 2471 return syserr.ErrInvalidArgument 2472 } 2473 return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv4TTLOption, int(v))) 2474 2475 case linux.IP_RECVTTL: 2476 v, err := parseIntOrChar(optVal) 2477 if err != nil { 2478 return err 2479 } 2480 ep.SocketOptions().SetReceiveTTL(v != 0) 2481 return nil 2482 2483 case linux.IP_TOS: 2484 if len(optVal) == 0 { 2485 return nil 2486 } 2487 v, err := parseIntOrChar(optVal) 2488 if err != nil { 2489 return err 2490 } 2491 return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv4TOSOption, int(v))) 2492 2493 case linux.IP_RECVTOS: 2494 v, err := parseIntOrChar(optVal) 2495 if err != nil { 2496 return err 2497 } 2498 ep.SocketOptions().SetReceiveTOS(v != 0) 2499 return nil 2500 2501 case linux.IP_RECVERR: 2502 if len(optVal) == 0 { 2503 return nil 2504 } 2505 v, err := parseIntOrChar(optVal) 2506 if err != nil { 2507 return err 2508 } 2509 ep.SocketOptions().SetIPv4RecvError(v != 0) 2510 return nil 2511 2512 
case linux.IP_PKTINFO: 2513 if len(optVal) == 0 { 2514 return nil 2515 } 2516 v, err := parseIntOrChar(optVal) 2517 if err != nil { 2518 return err 2519 } 2520 ep.SocketOptions().SetReceivePacketInfo(v != 0) 2521 return nil 2522 2523 case linux.IP_HDRINCL: 2524 if len(optVal) == 0 { 2525 return nil 2526 } 2527 v, err := parseIntOrChar(optVal) 2528 if err != nil { 2529 return err 2530 } 2531 ep.SocketOptions().SetHeaderIncluded(v != 0) 2532 return nil 2533 2534 case linux.IP_RECVORIGDSTADDR: 2535 if len(optVal) == 0 { 2536 return nil 2537 } 2538 v, err := parseIntOrChar(optVal) 2539 if err != nil { 2540 return err 2541 } 2542 2543 ep.SocketOptions().SetReceiveOriginalDstAddress(v != 0) 2544 return nil 2545 2546 case linux.IPT_SO_SET_REPLACE: 2547 if len(optVal) < linux.SizeOfIPTReplace { 2548 return syserr.ErrInvalidArgument 2549 } 2550 2551 // Only valid for raw IPv4 sockets. 2552 if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW { 2553 return syserr.ErrProtocolNotAvailable 2554 } 2555 2556 stk := inet.StackFromContext(t) 2557 if stk == nil { 2558 return syserr.ErrNoDevice 2559 } 2560 // Stack must be a netstack stack. 
2561 return netfilter.SetEntries(t.Credentials().UserNamespace, stk.(*Stack).Stack, optVal, false) 2562 2563 case linux.IPT_SO_SET_ADD_COUNTERS: 2564 log.Infof("IPT_SO_SET_ADD_COUNTERS is not supported") 2565 return nil 2566 2567 case linux.IP_ADD_SOURCE_MEMBERSHIP, 2568 linux.IP_BIND_ADDRESS_NO_PORT, 2569 linux.IP_BLOCK_SOURCE, 2570 linux.IP_CHECKSUM, 2571 linux.IP_DROP_SOURCE_MEMBERSHIP, 2572 linux.IP_FREEBIND, 2573 linux.IP_IPSEC_POLICY, 2574 linux.IP_MINTTL, 2575 linux.IP_MSFILTER, 2576 linux.IP_MTU_DISCOVER, 2577 linux.IP_MULTICAST_ALL, 2578 linux.IP_NODEFRAG, 2579 linux.IP_OPTIONS, 2580 linux.IP_PASSSEC, 2581 linux.IP_RECVFRAGSIZE, 2582 linux.IP_RECVOPTS, 2583 linux.IP_RETOPTS, 2584 linux.IP_TRANSPARENT, 2585 linux.IP_UNBLOCK_SOURCE, 2586 linux.IP_UNICAST_IF, 2587 linux.IP_XFRM_POLICY, 2588 linux.MCAST_BLOCK_SOURCE, 2589 linux.MCAST_JOIN_SOURCE_GROUP, 2590 linux.MCAST_LEAVE_GROUP, 2591 linux.MCAST_LEAVE_SOURCE_GROUP, 2592 linux.MCAST_MSFILTER, 2593 linux.MCAST_UNBLOCK_SOURCE: 2594 // Not supported. 2595 } 2596 2597 return nil 2598 } 2599 2600 // GetSockName implements the linux syscall getsockname(2) for sockets backed by 2601 // tcpip.Endpoint. 2602 func (s *sock) GetSockName(*kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { 2603 addr, err := s.Endpoint.GetLocalAddress() 2604 if err != nil { 2605 return nil, 0, syserr.TranslateNetstackError(err) 2606 } 2607 2608 a, l := socket.ConvertAddress(s.family, addr) 2609 return a, l, nil 2610 } 2611 2612 // GetPeerName implements the linux syscall getpeername(2) for sockets backed by 2613 // tcpip.Endpoint. 
2614 func (s *sock) GetPeerName(*kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { 2615 addr, err := s.Endpoint.GetRemoteAddress() 2616 if err != nil { 2617 return nil, 0, syserr.TranslateNetstackError(err) 2618 } 2619 2620 a, l := socket.ConvertAddress(s.family, addr) 2621 return a, l, nil 2622 } 2623 2624 func (s *sock) fillCmsgInq(cmsg *socket.ControlMessages) { 2625 if !s.sockOptInq { 2626 return 2627 } 2628 rcvBufUsed, err := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption) 2629 if err != nil { 2630 return 2631 } 2632 cmsg.IP.HasInq = true 2633 cmsg.IP.Inq = int32(rcvBufUsed) 2634 } 2635 2636 func toLinuxPacketType(pktType tcpip.PacketType) uint8 { 2637 switch pktType { 2638 case tcpip.PacketHost: 2639 return linux.PACKET_HOST 2640 case tcpip.PacketOtherHost: 2641 return linux.PACKET_OTHERHOST 2642 case tcpip.PacketOutgoing: 2643 return linux.PACKET_OUTGOING 2644 case tcpip.PacketBroadcast: 2645 return linux.PACKET_BROADCAST 2646 case tcpip.PacketMulticast: 2647 return linux.PACKET_MULTICAST 2648 default: 2649 panic(fmt.Sprintf("unknown packet type: %d", pktType)) 2650 } 2651 } 2652 2653 // nonBlockingRead issues a non-blocking read. 2654 // 2655 // TODO(b/78348848): Support timestamps for stream sockets. 2656 func (s *sock) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { 2657 isPacket := s.isPacketBased() 2658 2659 readOptions := tcpip.ReadOptions{ 2660 Peek: peek, 2661 NeedRemoteAddr: senderRequested, 2662 NeedLinkPacketInfo: isPacket, 2663 } 2664 2665 // TCP sockets discard the data if MSG_TRUNC is set. 2666 // 2667 // This behavior is documented in man 7 tcp: 2668 // Since version 2.4, Linux supports the use of MSG_TRUNC in the flags 2669 // argument of recv(2) (and recvmsg(2)). This flag causes the received 2670 // bytes of data to be discarded, rather than passed back in a 2671 // caller-supplied buffer. 
2672 var w io.Writer 2673 if !isPacket && trunc { 2674 w = &tcpip.LimitedWriter{ 2675 W: ioutil.Discard, 2676 N: dst.NumBytes(), 2677 } 2678 } else { 2679 w = dst.Writer(ctx) 2680 } 2681 2682 s.readMu.Lock() 2683 defer s.readMu.Unlock() 2684 2685 res, err := s.Endpoint.Read(w, readOptions) 2686 if _, ok := err.(*tcpip.ErrBadBuffer); ok && dst.NumBytes() == 0 { 2687 err = nil 2688 } 2689 if err != nil { 2690 return 0, 0, nil, 0, socket.ControlMessages{}, syserr.TranslateNetstackError(err) 2691 } 2692 // Set the control message, even if 0 bytes were read. 2693 s.updateTimestamp(res.ControlMessages) 2694 2695 if isPacket { 2696 var addr linux.SockAddr 2697 var addrLen uint32 2698 if senderRequested { 2699 addr, addrLen = socket.ConvertAddress(s.family, res.RemoteAddr) 2700 switch v := addr.(type) { 2701 case *linux.SockAddrLink: 2702 v.Protocol = socket.Htons(uint16(res.LinkPacketInfo.Protocol)) 2703 v.PacketType = toLinuxPacketType(res.LinkPacketInfo.PktType) 2704 } 2705 } 2706 2707 msgLen := res.Count 2708 if trunc { 2709 msgLen = res.Total 2710 } 2711 2712 var flags int 2713 if res.Total > res.Count { 2714 flags |= linux.MSG_TRUNC 2715 } 2716 2717 return msgLen, flags, addr, addrLen, s.netstackToLinuxControlMessages(res.ControlMessages), nil 2718 } 2719 2720 if peek { 2721 // MSG_TRUNC with MSG_PEEK on a TCP socket returns the 2722 // amount that could be read, and does not write to buffer. 2723 if trunc { 2724 // TCP endpoint does not return the total bytes in buffer as numTotal. 2725 // We need to query it from socket option. 
2726 rql, err := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption) 2727 if err != nil { 2728 return 0, 0, nil, 0, socket.ControlMessages{}, syserr.TranslateNetstackError(err) 2729 } 2730 msgLen := int(dst.NumBytes()) 2731 if msgLen > rql { 2732 msgLen = rql 2733 } 2734 return msgLen, 0, nil, 0, socket.ControlMessages{}, nil 2735 } 2736 } else if n := res.Count; n != 0 { 2737 s.Endpoint.ModerateRecvBuf(n) 2738 } 2739 2740 cmsg := s.netstackToLinuxControlMessages(res.ControlMessages) 2741 s.fillCmsgInq(&cmsg) 2742 return res.Count, 0, nil, 0, cmsg, syserr.TranslateNetstackError(err) 2743 } 2744 2745 func (s *sock) netstackToLinuxControlMessages(cm tcpip.ReceivableControlMessages) socket.ControlMessages { 2746 readCM := socket.NewIPControlMessages(s.family, cm) 2747 return socket.ControlMessages{ 2748 IP: socket.IPControlMessages{ 2749 HasTimestamp: readCM.HasTimestamp && s.sockOptTimestamp, 2750 Timestamp: readCM.Timestamp, 2751 HasInq: readCM.HasInq, 2752 Inq: readCM.Inq, 2753 HasTOS: readCM.HasTOS, 2754 TOS: readCM.TOS, 2755 HasTClass: readCM.HasTClass, 2756 TClass: readCM.TClass, 2757 HasTTL: readCM.HasTTL, 2758 TTL: readCM.TTL, 2759 HasHopLimit: readCM.HasHopLimit, 2760 HopLimit: readCM.HopLimit, 2761 HasIPPacketInfo: readCM.HasIPPacketInfo, 2762 PacketInfo: readCM.PacketInfo, 2763 HasIPv6PacketInfo: readCM.HasIPv6PacketInfo, 2764 IPv6PacketInfo: readCM.IPv6PacketInfo, 2765 OriginalDstAddress: readCM.OriginalDstAddress, 2766 SockErr: readCM.SockErr, 2767 }, 2768 } 2769 } 2770 2771 func (s *sock) linuxToNetstackControlMessages(cm socket.ControlMessages) tcpip.SendableControlMessages { 2772 return tcpip.SendableControlMessages{ 2773 HasTTL: cm.IP.HasTTL, 2774 TTL: uint8(cm.IP.TTL), 2775 HasHopLimit: cm.IP.HasHopLimit, 2776 HopLimit: uint8(cm.IP.HopLimit), 2777 } 2778 } 2779 2780 // updateTimestamp sets the timestamp for SIOCGSTAMP. It should be called after 2781 // successfully writing packet data out to userspace. 
2782 // 2783 // Precondition: s.readMu must be locked. 2784 func (s *sock) updateTimestamp(cm tcpip.ReceivableControlMessages) { 2785 // Save the SIOCGSTAMP timestamp only if SO_TIMESTAMP is disabled. 2786 if !s.sockOptTimestamp { 2787 s.timestampValid = true 2788 s.timestamp = cm.Timestamp 2789 } 2790 } 2791 2792 // dequeueErr is analogous to net/core/skbuff.c:sock_dequeue_err_skb(). 2793 func (s *sock) dequeueErr() *tcpip.SockError { 2794 so := s.Endpoint.SocketOptions() 2795 err := so.DequeueErr() 2796 if err == nil { 2797 return nil 2798 } 2799 2800 // Update socket error to reflect ICMP errors in queue. 2801 if nextErr := so.PeekErr(); nextErr != nil && nextErr.Cause.Origin().IsICMPErr() { 2802 so.SetLastError(nextErr.Err) 2803 } else if err.Cause.Origin().IsICMPErr() { 2804 so.SetLastError(nil) 2805 } 2806 return err 2807 } 2808 2809 // addrFamilyFromNetProto returns the address family identifier for the given 2810 // network protocol. 2811 func addrFamilyFromNetProto(net tcpip.NetworkProtocolNumber) int { 2812 switch net { 2813 case header.IPv4ProtocolNumber: 2814 return linux.AF_INET 2815 case header.IPv6ProtocolNumber: 2816 return linux.AF_INET6 2817 default: 2818 panic(fmt.Sprintf("invalid net proto for addr family inference: %d", net)) 2819 } 2820 } 2821 2822 // recvErr handles MSG_ERRQUEUE for recvmsg(2). 2823 // This is analogous to net/ipv4/ip_sockglue.c:ip_recv_error(). 2824 func (s *sock) recvErr(t *kernel.Task, dst usermem.IOSequence) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { 2825 sockErr := s.dequeueErr() 2826 if sockErr == nil { 2827 return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain 2828 } 2829 if sockErr.Payload != nil { 2830 defer sockErr.Payload.Release() 2831 } 2832 2833 // The payload of the original packet that caused the error is passed as 2834 // normal data via msg_iovec. 
-- recvmsg(2) 2835 msgFlags := linux.MSG_ERRQUEUE 2836 if int(dst.NumBytes()) < sockErr.Payload.Size() { 2837 msgFlags |= linux.MSG_TRUNC 2838 } 2839 n, err := dst.CopyOut(t, sockErr.Payload.AsSlice()) 2840 2841 // The original destination address of the datagram that caused the error is 2842 // supplied via msg_name. -- recvmsg(2) 2843 dstAddr, dstAddrLen := socket.ConvertAddress(addrFamilyFromNetProto(sockErr.NetProto), sockErr.Dst) 2844 cmgs := socket.ControlMessages{IP: socket.NewIPControlMessages(s.family, tcpip.ReceivableControlMessages{SockErr: sockErr})} 2845 return n, msgFlags, dstAddr, dstAddrLen, cmgs, syserr.FromError(err) 2846 } 2847 2848 // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by 2849 // tcpip.Endpoint. 2850 func (s *sock) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, _ uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { 2851 if flags&linux.MSG_ERRQUEUE != 0 { 2852 return s.recvErr(t, dst) 2853 } 2854 2855 trunc := flags&linux.MSG_TRUNC != 0 2856 peek := flags&linux.MSG_PEEK != 0 2857 dontWait := flags&linux.MSG_DONTWAIT != 0 2858 waitAll := flags&linux.MSG_WAITALL != 0 2859 if senderRequested && !s.isPacketBased() { 2860 // Stream sockets ignore the sender address. 2861 senderRequested = false 2862 } 2863 n, msgFlags, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested) 2864 2865 if s.isPacketBased() && err == syserr.ErrClosedForReceive && flags&linux.MSG_DONTWAIT != 0 { 2866 // In this situation we should return EAGAIN. 2867 return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain 2868 } 2869 2870 if err != nil && (err != syserr.ErrWouldBlock || dontWait) { 2871 // Read failed and we should not retry. 
2872 return 0, 0, nil, 0, socket.ControlMessages{}, err 2873 } 2874 2875 if err == nil && (dontWait || !waitAll || s.isPacketBased() || int64(n) >= dst.NumBytes()) { 2876 // We got all the data we need. 2877 return 2878 } 2879 2880 // Don't overwrite any data we received. 2881 dst = dst.DropFirst(n) 2882 2883 // We'll have to block. Register for notifications and keep trying to 2884 // send all the data. 2885 e, ch := waiter.NewChannelEntry(waiter.ReadableEvents) 2886 s.EventRegister(&e) 2887 defer s.EventUnregister(&e) 2888 2889 for { 2890 var rn int 2891 rn, msgFlags, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested) 2892 n += rn 2893 if err != nil && err != syserr.ErrWouldBlock { 2894 // Always stop on errors other than would block as we generally 2895 // won't be able to get any more data. Eat the error if we got 2896 // any data. 2897 if n > 0 { 2898 err = nil 2899 } 2900 return 2901 } 2902 if err == nil && (s.isPacketBased() || !waitAll || int64(rn) >= dst.NumBytes()) { 2903 // We got all the data we need. 2904 return 2905 } 2906 dst = dst.DropFirst(rn) 2907 2908 if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { 2909 if n > 0 { 2910 return n, msgFlags, senderAddr, senderAddrLen, controlMessages, nil 2911 } 2912 if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { 2913 return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain 2914 } 2915 return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) 2916 } 2917 } 2918 } 2919 2920 // SendMsg implements the linux syscall sendmsg(2) for sockets backed by 2921 // tcpip.Endpoint. 2922 func (s *sock) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { 2923 // Reject Unix control messages. 
2924 if !controlMessages.Unix.Empty() { 2925 return 0, syserr.ErrInvalidArgument 2926 } 2927 2928 var addr *tcpip.FullAddress 2929 if len(to) > 0 { 2930 addrBuf, family, err := socket.AddressAndFamily(to) 2931 if err != nil { 2932 return 0, err 2933 } 2934 if !s.checkFamily(family, false /* exact */) { 2935 return 0, syserr.ErrInvalidArgument 2936 } 2937 addrBuf = s.mapFamily(addrBuf, family) 2938 2939 addr = &addrBuf 2940 } 2941 2942 opts := tcpip.WriteOptions{ 2943 To: addr, 2944 More: flags&linux.MSG_MORE != 0, 2945 EndOfRecord: flags&linux.MSG_EOR != 0, 2946 ControlMessages: s.linuxToNetstackControlMessages(controlMessages), 2947 } 2948 2949 r := src.Reader(t) 2950 var ( 2951 total int64 2952 entry waiter.Entry 2953 ch <-chan struct{} 2954 ) 2955 for { 2956 n, err := s.Endpoint.Write(r, opts) 2957 total += n 2958 if flags&linux.MSG_DONTWAIT != 0 { 2959 return int(total), syserr.TranslateNetstackError(err) 2960 } 2961 block := true 2962 switch err.(type) { 2963 case nil: 2964 block = total != src.NumBytes() 2965 case *tcpip.ErrWouldBlock: 2966 default: 2967 block = false 2968 } 2969 if block { 2970 if ch == nil { 2971 // We'll have to block. Register for notification and keep trying to 2972 // send all the data. 2973 entry, ch = waiter.NewChannelEntry(waiter.WritableEvents) 2974 s.EventRegister(&entry) 2975 defer s.EventUnregister(&entry) 2976 } else { 2977 // Don't wait immediately after registration in case more data 2978 // became available between when we last checked and when we setup 2979 // the notification. 2980 if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { 2981 if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { 2982 return int(total), syserr.ErrTryAgain 2983 } 2984 // handleIOError will consume errors from t.Block if needed. 2985 return int(total), syserr.FromError(err) 2986 } 2987 } 2988 continue 2989 } 2990 return int(total), syserr.TranslateNetstackError(err) 2991 } 2992 } 2993 2994 // Ioctl implements vfs.FileDescriptionImpl. 
2995 func (s *sock) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { 2996 t := kernel.TaskFromContext(ctx) 2997 if t == nil { 2998 panic("ioctl(2) may only be called from a task goroutine") 2999 } 3000 3001 // SIOCGSTAMP is implemented by netstack rather than all commonEndpoint 3002 // sockets. 3003 // TODO(b/78348848): Add a commonEndpoint method to support SIOCGSTAMP. 3004 switch args[1].Int() { 3005 case linux.SIOCGSTAMP: 3006 s.readMu.Lock() 3007 defer s.readMu.Unlock() 3008 if !s.timestampValid { 3009 return 0, linuxerr.ENOENT 3010 } 3011 3012 tv := linux.NsecToTimeval(s.timestamp.UnixNano()) 3013 _, err := tv.CopyOut(t, args[2].Pointer()) 3014 return 0, err 3015 3016 case linux.TIOCINQ: 3017 v, terr := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption) 3018 if terr != nil { 3019 return 0, syserr.TranslateNetstackError(terr).ToError() 3020 } 3021 3022 if v > math.MaxInt32 { 3023 v = math.MaxInt32 3024 } 3025 3026 // Copy result to userspace. 3027 vP := primitive.Int32(v) 3028 _, err := vP.CopyOut(t, args[2].Pointer()) 3029 return 0, err 3030 } 3031 3032 return Ioctl(ctx, s.Endpoint, uio, sysno, args) 3033 } 3034 3035 // Ioctl performs a socket ioctl. 
3036 func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { 3037 t := kernel.TaskFromContext(ctx) 3038 if t == nil { 3039 panic("ioctl(2) may only be called from a task goroutine") 3040 } 3041 3042 switch arg := int(args[1].Int()); arg { 3043 case linux.SIOCGIFFLAGS, 3044 linux.SIOCGIFADDR, 3045 linux.SIOCGIFBRDADDR, 3046 linux.SIOCGIFDSTADDR, 3047 linux.SIOCGIFHWADDR, 3048 linux.SIOCGIFINDEX, 3049 linux.SIOCGIFMAP, 3050 linux.SIOCGIFMETRIC, 3051 linux.SIOCGIFMTU, 3052 linux.SIOCGIFNAME, 3053 linux.SIOCGIFNETMASK, 3054 linux.SIOCGIFTXQLEN, 3055 linux.SIOCETHTOOL: 3056 3057 var ifr linux.IFReq 3058 if _, err := ifr.CopyIn(t, args[2].Pointer()); err != nil { 3059 return 0, err 3060 } 3061 if err := interfaceIoctl(ctx, io, arg, &ifr); err != nil { 3062 return 0, err.ToError() 3063 } 3064 _, err := ifr.CopyOut(t, args[2].Pointer()) 3065 return 0, err 3066 3067 case linux.SIOCGIFCONF: 3068 // Return a list of interface addresses or the buffer size 3069 // necessary to hold the list. 3070 var ifc linux.IFConf 3071 if _, err := ifc.CopyIn(t, args[2].Pointer()); err != nil { 3072 return 0, err 3073 } 3074 3075 if err := ifconfIoctl(ctx, t, io, &ifc); err != nil { 3076 return 0, err 3077 } 3078 3079 _, err := ifc.CopyOut(t, args[2].Pointer()) 3080 return 0, err 3081 3082 case linux.TIOCINQ: 3083 v, terr := ep.GetSockOptInt(tcpip.ReceiveQueueSizeOption) 3084 if terr != nil { 3085 return 0, syserr.TranslateNetstackError(terr).ToError() 3086 } 3087 3088 if v > math.MaxInt32 { 3089 v = math.MaxInt32 3090 } 3091 // Copy result to userspace. 
3092 vP := primitive.Int32(v) 3093 _, err := vP.CopyOut(t, args[2].Pointer()) 3094 return 0, err 3095 3096 case linux.TIOCOUTQ: 3097 v, terr := ep.GetSockOptInt(tcpip.SendQueueSizeOption) 3098 if terr != nil { 3099 return 0, syserr.TranslateNetstackError(terr).ToError() 3100 } 3101 3102 if v > math.MaxInt32 { 3103 v = math.MaxInt32 3104 } 3105 3106 // Copy result to userspace. 3107 vP := primitive.Int32(v) 3108 _, err := vP.CopyOut(t, args[2].Pointer()) 3109 return 0, err 3110 3111 case linux.SIOCGIFMEM, linux.SIOCGIFPFLAGS, linux.SIOCGMIIPHY, linux.SIOCGMIIREG: 3112 // Not supported. 3113 } 3114 3115 return 0, linuxerr.ENOTTY 3116 } 3117 3118 // interfaceIoctl implements interface requests. 3119 func interfaceIoctl(ctx context.Context, _ usermem.IO, arg int, ifr *linux.IFReq) *syserr.Error { 3120 var ( 3121 iface inet.Interface 3122 index int32 3123 found bool 3124 ) 3125 3126 // Find the relevant device. 3127 stk := inet.StackFromContext(ctx) 3128 if stk == nil { 3129 return syserr.ErrNoDevice 3130 } 3131 3132 // SIOCGIFNAME uses ifr.ifr_ifindex rather than ifr.ifr_name to 3133 // identify a device. 3134 if arg == linux.SIOCGIFNAME { 3135 // Gets the name of the interface given the interface index 3136 // stored in ifr_ifindex. 3137 index = int32(hostarch.ByteOrder.Uint32(ifr.Data[:4])) 3138 if iface, ok := stk.Interfaces()[index]; ok { 3139 ifr.SetName(iface.Name) 3140 return nil 3141 } 3142 return syserr.ErrNoDevice 3143 } 3144 3145 // Find the relevant device. 3146 for index, iface = range stk.Interfaces() { 3147 if iface.Name == ifr.Name() { 3148 found = true 3149 break 3150 } 3151 } 3152 if !found { 3153 return syserr.ErrNoDevice 3154 } 3155 3156 switch arg { 3157 case linux.SIOCGIFINDEX: 3158 // Copy out the index to the data. 3159 hostarch.ByteOrder.PutUint32(ifr.Data[:], uint32(index)) 3160 3161 case linux.SIOCGIFHWADDR: 3162 // Copy the hardware address out. 
3163 // 3164 // Refer: https://linux.die.net/man/7/netdevice 3165 // SIOCGIFHWADDR, SIOCSIFHWADDR 3166 // 3167 // Get or set the hardware address of a device using 3168 // ifr_hwaddr. The hardware address is specified in a struct 3169 // sockaddr. sa_family contains the ARPHRD_* device type, 3170 // sa_data the L2 hardware address starting from byte 0. Setting 3171 // the hardware address is a privileged operation. 3172 hostarch.ByteOrder.PutUint16(ifr.Data[:], iface.DeviceType) 3173 n := copy(ifr.Data[2:], iface.Addr) 3174 for i := 2 + n; i < len(ifr.Data); i++ { 3175 ifr.Data[i] = 0 // Clear padding. 3176 } 3177 3178 case linux.SIOCGIFFLAGS: 3179 f, err := interfaceStatusFlags(stk, iface.Name) 3180 if err != nil { 3181 return err 3182 } 3183 // Drop the flags that don't fit in the size that we need to return. This 3184 // matches Linux behavior. 3185 hostarch.ByteOrder.PutUint16(ifr.Data[:2], uint16(f)) 3186 3187 case linux.SIOCGIFADDR: 3188 // Copy the IPv4 address out. 3189 for _, addr := range stk.InterfaceAddrs()[index] { 3190 // This ioctl is only compatible with AF_INET addresses. 3191 if addr.Family != linux.AF_INET { 3192 continue 3193 } 3194 copy(ifr.Data[4:8], addr.Addr) 3195 break 3196 } 3197 3198 case linux.SIOCGIFMETRIC: 3199 // Gets the metric of the device. As per netdevice(7), this 3200 // always just sets ifr_metric to 0. 3201 hostarch.ByteOrder.PutUint32(ifr.Data[:4], 0) 3202 3203 case linux.SIOCGIFMTU: 3204 // Gets the MTU of the device. 3205 hostarch.ByteOrder.PutUint32(ifr.Data[:4], iface.MTU) 3206 3207 case linux.SIOCGIFMAP: 3208 // Gets the hardware parameters of the device. 3209 // TODO(gvisor.dev/issue/505): Implement. 3210 3211 case linux.SIOCGIFTXQLEN: 3212 // Gets the transmit queue length of the device. 3213 // TODO(gvisor.dev/issue/505): Implement. 3214 3215 case linux.SIOCGIFDSTADDR: 3216 // Gets the destination address of a point-to-point device. 3217 // TODO(gvisor.dev/issue/505): Implement. 
3218 3219 case linux.SIOCGIFBRDADDR: 3220 // Gets the broadcast address of a device. 3221 // TODO(gvisor.dev/issue/505): Implement. 3222 3223 case linux.SIOCGIFNETMASK: 3224 // Gets the network mask of a device. 3225 for _, addr := range stk.InterfaceAddrs()[index] { 3226 // This ioctl is only compatible with AF_INET addresses. 3227 if addr.Family != linux.AF_INET { 3228 continue 3229 } 3230 // Populate ifr.ifr_netmask (type sockaddr). 3231 hostarch.ByteOrder.PutUint16(ifr.Data[0:], uint16(linux.AF_INET)) 3232 hostarch.ByteOrder.PutUint16(ifr.Data[2:], 0) 3233 var mask uint32 = 0xffffffff << (32 - addr.PrefixLen) 3234 // Netmask is expected to be returned as a big endian 3235 // value. 3236 binary.BigEndian.PutUint32(ifr.Data[4:8], mask) 3237 break 3238 } 3239 3240 case linux.SIOCETHTOOL: 3241 // Stubbed out for now, Ideally we should implement the required 3242 // sub-commands for ETHTOOL 3243 // 3244 // See: 3245 // https://github.com/torvalds/linux/blob/aa0c9086b40c17a7ad94425b3b70dd1fdd7497bf/net/core/dev_ioctl.c 3246 return syserr.ErrEndpointOperation 3247 3248 default: 3249 // Not a valid call. 3250 return syserr.ErrInvalidArgument 3251 } 3252 3253 return nil 3254 } 3255 3256 // ifconfIoctl populates a struct ifconf for the SIOCGIFCONF ioctl. 3257 func ifconfIoctl(ctx context.Context, t *kernel.Task, _ usermem.IO, ifc *linux.IFConf) error { 3258 // If Ptr is NULL, return the necessary buffer size via Len. 3259 // Otherwise, write up to Len bytes starting at Ptr containing ifreq 3260 // structs. 3261 stk := inet.StackFromContext(ctx) 3262 if stk == nil { 3263 return syserr.ErrNoDevice.ToError() 3264 } 3265 3266 if ifc.Ptr == 0 { 3267 ifc.Len = int32(len(stk.Interfaces())) * int32(linux.SizeOfIFReq) 3268 return nil 3269 } 3270 3271 max := ifc.Len 3272 ifc.Len = 0 3273 for key, ifaceAddrs := range stk.InterfaceAddrs() { 3274 iface := stk.Interfaces()[key] 3275 for _, ifaceAddr := range ifaceAddrs { 3276 // Don't write past the end of the buffer. 
3277 if ifc.Len+int32(linux.SizeOfIFReq) > max { 3278 break 3279 } 3280 if ifaceAddr.Family != linux.AF_INET { 3281 continue 3282 } 3283 3284 // Populate ifr.ifr_addr. 3285 ifr := linux.IFReq{} 3286 ifr.SetName(iface.Name) 3287 hostarch.ByteOrder.PutUint16(ifr.Data[0:2], uint16(ifaceAddr.Family)) 3288 hostarch.ByteOrder.PutUint16(ifr.Data[2:4], 0) 3289 copy(ifr.Data[4:8], ifaceAddr.Addr[:4]) 3290 3291 // Copy the ifr to userspace. 3292 dst := uintptr(ifc.Ptr) + uintptr(ifc.Len) 3293 ifc.Len += int32(linux.SizeOfIFReq) 3294 if _, err := ifr.CopyOut(t, hostarch.Addr(dst)); err != nil { 3295 return err 3296 } 3297 } 3298 } 3299 return nil 3300 } 3301 3302 // interfaceStatusFlags returns status flags for an interface in the stack. 3303 // Flag values and meanings are described in greater detail in netdevice(7) in 3304 // the SIOCGIFFLAGS section. 3305 func interfaceStatusFlags(stack inet.Stack, name string) (uint32, *syserr.Error) { 3306 // We should only ever be passed a netstack.Stack. 3307 epstack, ok := stack.(*Stack) 3308 if !ok { 3309 return 0, errStackType 3310 } 3311 3312 // Find the NIC corresponding to this interface. 3313 for _, info := range epstack.Stack.NICInfo() { 3314 if info.Name == name { 3315 return nicStateFlagsToLinux(info.Flags), nil 3316 } 3317 } 3318 return 0, syserr.ErrNoDevice 3319 } 3320 3321 func nicStateFlagsToLinux(f stack.NICStateFlags) uint32 { 3322 var rv uint32 3323 if f.Up { 3324 rv |= linux.IFF_UP | linux.IFF_LOWER_UP 3325 } 3326 if f.Running { 3327 rv |= linux.IFF_RUNNING 3328 } 3329 if f.Promiscuous { 3330 rv |= linux.IFF_PROMISC 3331 } 3332 if f.Loopback { 3333 rv |= linux.IFF_LOOPBACK 3334 } 3335 return rv 3336 } 3337 3338 // State implements socket.Socket.State. State translates the internal state 3339 // returned by netstack to values defined by Linux. 3340 func (s *sock) State() uint32 { 3341 if s.family != linux.AF_INET && s.family != linux.AF_INET6 { 3342 // States not implemented for this socket's family. 
3343 return 0 3344 } 3345 3346 switch { 3347 case socket.IsTCP(s): 3348 // TCP socket. 3349 switch tcp.EndpointState(s.Endpoint.State()) { 3350 case tcp.StateEstablished: 3351 return linux.TCP_ESTABLISHED 3352 case tcp.StateSynSent: 3353 return linux.TCP_SYN_SENT 3354 case tcp.StateSynRecv: 3355 return linux.TCP_SYN_RECV 3356 case tcp.StateFinWait1: 3357 return linux.TCP_FIN_WAIT1 3358 case tcp.StateFinWait2: 3359 return linux.TCP_FIN_WAIT2 3360 case tcp.StateTimeWait: 3361 return linux.TCP_TIME_WAIT 3362 case tcp.StateClose, tcp.StateInitial, tcp.StateBound, tcp.StateConnecting, tcp.StateError: 3363 return linux.TCP_CLOSE 3364 case tcp.StateCloseWait: 3365 return linux.TCP_CLOSE_WAIT 3366 case tcp.StateLastAck: 3367 return linux.TCP_LAST_ACK 3368 case tcp.StateListen: 3369 return linux.TCP_LISTEN 3370 case tcp.StateClosing: 3371 return linux.TCP_CLOSING 3372 default: 3373 // Internal or unknown state. 3374 return 0 3375 } 3376 case socket.IsUDP(s): 3377 // UDP socket. 3378 switch transport.DatagramEndpointState(s.Endpoint.State()) { 3379 case transport.DatagramEndpointStateInitial, transport.DatagramEndpointStateBound, transport.DatagramEndpointStateClosed: 3380 return linux.TCP_CLOSE 3381 case transport.DatagramEndpointStateConnected: 3382 return linux.TCP_ESTABLISHED 3383 default: 3384 return 0 3385 } 3386 case socket.IsICMP(s): 3387 // TODO(b/112063468): Export states for ICMP sockets. 3388 case socket.IsRaw(s): 3389 // TODO(b/112063468): Export states for raw sockets. 3390 default: 3391 // Unknown transport protocol, how did we make this socket? 3392 log.Warningf("Unknown transport protocol for an existing socket: family=%v, type=%v, protocol=%v, internal type %v", s.family, s.skType, s.protocol, reflect.TypeOf(s.Endpoint).Elem()) 3393 return 0 3394 } 3395 3396 return 0 3397 } 3398 3399 // Type implements socket.Socket.Type. 
3400 func (s *sock) Type() (family int, skType linux.SockType, protocol int) { 3401 return s.family, s.skType, s.protocol 3402 } 3403 3404 // EventRegister implements waiter.Waitable. 3405 func (s *sock) EventRegister(e *waiter.Entry) error { 3406 s.Queue.EventRegister(e) 3407 return nil 3408 } 3409 3410 // EventUnregister implements waiter.Waitable.EventUnregister. 3411 func (s *sock) EventUnregister(e *waiter.Entry) { 3412 s.Queue.EventUnregister(e) 3413 }