github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/socket/netstack/netstack.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package netstack provides an implementation of the socket.Socket interface 16 // that is backed by a tcpip.Endpoint. 17 // 18 // It does not depend on any particular endpoint implementation, and thus can 19 // be used to expose certain endpoints to the sentry while leaving others out, 20 // for example, TCP endpoints and Unix-domain endpoints. 21 // 22 // Lock ordering: netstack => mm: ioSequenceReadWriter copies user memory inside 23 // tcpip.Endpoint.Write(). Netstack is allowed to (and does) hold locks during 24 // this operation. 
25 package netstack 26 27 import ( 28 "bytes" 29 "encoding/binary" 30 "fmt" 31 "io" 32 "io/ioutil" 33 "math" 34 "reflect" 35 "time" 36 37 "github.com/MerlinKodo/gvisor/pkg/abi/linux" 38 "github.com/MerlinKodo/gvisor/pkg/abi/linux/errno" 39 "github.com/MerlinKodo/gvisor/pkg/context" 40 "github.com/MerlinKodo/gvisor/pkg/errors/linuxerr" 41 "github.com/MerlinKodo/gvisor/pkg/eventchannel" 42 "github.com/MerlinKodo/gvisor/pkg/hostarch" 43 "github.com/MerlinKodo/gvisor/pkg/log" 44 "github.com/MerlinKodo/gvisor/pkg/marshal" 45 "github.com/MerlinKodo/gvisor/pkg/marshal/primitive" 46 "github.com/MerlinKodo/gvisor/pkg/metric" 47 "github.com/MerlinKodo/gvisor/pkg/sentry/arch" 48 "github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/sockfs" 49 "github.com/MerlinKodo/gvisor/pkg/sentry/inet" 50 "github.com/MerlinKodo/gvisor/pkg/sentry/kernel" 51 "github.com/MerlinKodo/gvisor/pkg/sentry/kernel/auth" 52 ktime "github.com/MerlinKodo/gvisor/pkg/sentry/kernel/time" 53 "github.com/MerlinKodo/gvisor/pkg/sentry/socket" 54 "github.com/MerlinKodo/gvisor/pkg/sentry/socket/netfilter" 55 epb "github.com/MerlinKodo/gvisor/pkg/sentry/socket/netstack/events_go_proto" 56 "github.com/MerlinKodo/gvisor/pkg/sentry/vfs" 57 "github.com/MerlinKodo/gvisor/pkg/sync" 58 "github.com/MerlinKodo/gvisor/pkg/syserr" 59 "github.com/MerlinKodo/gvisor/pkg/tcpip" 60 "github.com/MerlinKodo/gvisor/pkg/tcpip/header" 61 "github.com/MerlinKodo/gvisor/pkg/tcpip/stack" 62 "github.com/MerlinKodo/gvisor/pkg/tcpip/transport" 63 "github.com/MerlinKodo/gvisor/pkg/tcpip/transport/tcp" 64 "github.com/MerlinKodo/gvisor/pkg/usermem" 65 "github.com/MerlinKodo/gvisor/pkg/waiter" 66 "golang.org/x/sys/unix" 67 "google.golang.org/protobuf/proto" 68 ) 69 70 const bitsPerUint32 = 32 71 72 // statCounterValue returns a function usable as callback function when defining a gVisor Sentry 73 // metric that contains the value counted by the StatCounter. 74 // This avoids a dependency loop in the tcpip package. 
75 func statCounterValue(cm *tcpip.StatCounter) func(...*metric.FieldValue) uint64 { 76 return func(...*metric.FieldValue) uint64 { 77 return cm.Value() 78 } 79 } 80 81 func mustCreateMetric(name, description string) *tcpip.StatCounter { 82 var cm tcpip.StatCounter 83 metric.MustRegisterCustomUint64Metric(name, true /* cumulative */, false /* sync */, description, statCounterValue(&cm)) 84 return &cm 85 } 86 87 func mustCreateGauge(name, description string) *tcpip.StatCounter { 88 var cm tcpip.StatCounter 89 metric.MustRegisterCustomUint64Metric(name, false /* cumulative */, false /* sync */, description, statCounterValue(&cm)) 90 return &cm 91 } 92 93 // Metrics contains metrics exported by netstack. 94 var Metrics = tcpip.Stats{ 95 DroppedPackets: mustCreateMetric("/netstack/dropped_packets", "Number of packets dropped at the transport layer."), 96 NICs: tcpip.NICStats{ 97 MalformedL4RcvdPackets: mustCreateMetric("/netstack/nic/malformed_l4_received_packets", "Number of packets received that failed L4 header parsing."), 98 Tx: tcpip.NICPacketStats{ 99 Packets: mustCreateMetric("/netstack/nic/tx/packets", "Number of packets transmitted."), 100 Bytes: mustCreateMetric("/netstack/nic/tx/bytes", "Number of bytes transmitted."), 101 }, 102 TxPacketsDroppedNoBufferSpace: mustCreateMetric("/netstack/nic/tx_packets_dropped_no_buffer_space", "Number of TX packets dropped as a result of no buffer space errors."), 103 Rx: tcpip.NICPacketStats{ 104 Packets: mustCreateMetric("/netstack/nic/rx/packets", "Number of packets received."), 105 Bytes: mustCreateMetric("/netstack/nic/rx/bytes", "Number of bytes received."), 106 }, 107 DisabledRx: tcpip.NICPacketStats{ 108 Packets: mustCreateMetric("/netstack/nic/disabled_rx/packets", "Number of packets received on disabled NICs."), 109 Bytes: mustCreateMetric("/netstack/nic/disabled_rx/bytes", "Number of bytes received on disabled NICs."), 110 }, 111 Neighbor: tcpip.NICNeighborStats{ 112 UnreachableEntryLookups: 
mustCreateMetric("/netstack/nic/neighbor/unreachable_entry_loopups", "Number of lookups performed on a neighbor entry in Unreachable state."), 113 DroppedConfirmationForNoninitiatedNeighbor: mustCreateMetric("/netstack/nic/neighbor/dropped_confirmation_for_noninitiated_neighbor", "Number of advertisements received that don't match an entry in the neighbor cache."), 114 DroppedInvalidLinkAddressConfirmations: mustCreateMetric("/netstack/nic/neighbor/dropped_invalid_link_address_confirmations", "Number of advertisements dropped because they have empty source link-layer addresses"), 115 }, 116 }, 117 ICMP: tcpip.ICMPStats{ 118 V4: tcpip.ICMPv4Stats{ 119 PacketsSent: tcpip.ICMPv4SentPacketStats{ 120 ICMPv4PacketStats: tcpip.ICMPv4PacketStats{ 121 EchoRequest: mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_request", "Number of ICMPv4 echo request packets sent."), 122 EchoReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_reply", "Number of ICMPv4 echo reply packets sent."), 123 DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_sent/dst_unreachable", "Number of ICMPv4 destination unreachable packets sent."), 124 SrcQuench: mustCreateMetric("/netstack/icmp/v4/packets_sent/src_quench", "Number of ICMPv4 source quench packets sent."), 125 Redirect: mustCreateMetric("/netstack/icmp/v4/packets_sent/redirect", "Number of ICMPv4 redirect packets sent."), 126 TimeExceeded: mustCreateMetric("/netstack/icmp/v4/packets_sent/time_exceeded", "Number of ICMPv4 time exceeded packets sent."), 127 ParamProblem: mustCreateMetric("/netstack/icmp/v4/packets_sent/param_problem", "Number of ICMPv4 parameter problem packets sent."), 128 Timestamp: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp", "Number of ICMPv4 timestamp packets sent."), 129 TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp_reply", "Number of ICMPv4 timestamp reply packets sent."), 130 InfoRequest: mustCreateMetric("/netstack/icmp/v4/packets_sent/info_request", 
"Number of ICMPv4 information request packets sent."), 131 InfoReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/info_reply", "Number of ICMPv4 information reply packets sent."), 132 }, 133 Dropped: mustCreateMetric("/netstack/icmp/v4/packets_sent/dropped", "Number of ICMPv4 packets dropped due to link layer errors."), 134 RateLimited: mustCreateMetric("/netstack/icmp/v4/packets_sent/rate_limited", "Number of ICMPv4 packets dropped due to rate limit being exceeded."), 135 }, 136 PacketsReceived: tcpip.ICMPv4ReceivedPacketStats{ 137 ICMPv4PacketStats: tcpip.ICMPv4PacketStats{ 138 EchoRequest: mustCreateMetric("/netstack/icmp/v4/packets_received/echo_request", "Number of ICMPv4 echo request packets received."), 139 EchoReply: mustCreateMetric("/netstack/icmp/v4/packets_received/echo_reply", "Number of ICMPv4 echo reply packets received."), 140 DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_received/dst_unreachable", "Number of ICMPv4 destination unreachable packets received."), 141 SrcQuench: mustCreateMetric("/netstack/icmp/v4/packets_received/src_quench", "Number of ICMPv4 source quench packets received."), 142 Redirect: mustCreateMetric("/netstack/icmp/v4/packets_received/redirect", "Number of ICMPv4 redirect packets received."), 143 TimeExceeded: mustCreateMetric("/netstack/icmp/v4/packets_received/time_exceeded", "Number of ICMPv4 time exceeded packets received."), 144 ParamProblem: mustCreateMetric("/netstack/icmp/v4/packets_received/param_problem", "Number of ICMPv4 parameter problem packets received."), 145 Timestamp: mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp", "Number of ICMPv4 timestamp packets received."), 146 TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp_reply", "Number of ICMPv4 timestamp reply packets received."), 147 InfoRequest: mustCreateMetric("/netstack/icmp/v4/packets_received/info_request", "Number of ICMPv4 information request packets received."), 148 InfoReply: 
mustCreateMetric("/netstack/icmp/v4/packets_received/info_reply", "Number of ICMPv4 information reply packets received."), 149 }, 150 Invalid: mustCreateMetric("/netstack/icmp/v4/packets_received/invalid", "Number of ICMPv4 packets received that the transport layer could not parse."), 151 }, 152 }, 153 V6: tcpip.ICMPv6Stats{ 154 PacketsSent: tcpip.ICMPv6SentPacketStats{ 155 ICMPv6PacketStats: tcpip.ICMPv6PacketStats{ 156 EchoRequest: mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_request", "Number of ICMPv6 echo request packets sent."), 157 EchoReply: mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_reply", "Number of ICMPv6 echo reply packets sent."), 158 DstUnreachable: mustCreateMetric("/netstack/icmp/v6/packets_sent/dst_unreachable", "Number of ICMPv6 destination unreachable packets sent."), 159 PacketTooBig: mustCreateMetric("/netstack/icmp/v6/packets_sent/packet_too_big", "Number of ICMPv6 packet too big packets sent."), 160 TimeExceeded: mustCreateMetric("/netstack/icmp/v6/packets_sent/time_exceeded", "Number of ICMPv6 time exceeded packets sent."), 161 ParamProblem: mustCreateMetric("/netstack/icmp/v6/packets_sent/param_problem", "Number of ICMPv6 parameter problem packets sent."), 162 RouterSolicit: mustCreateMetric("/netstack/icmp/v6/packets_sent/router_solicit", "Number of ICMPv6 router solicit packets sent."), 163 RouterAdvert: mustCreateMetric("/netstack/icmp/v6/packets_sent/router_advert", "Number of ICMPv6 router advert packets sent."), 164 NeighborSolicit: mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_solicit", "Number of ICMPv6 neighbor solicit packets sent."), 165 NeighborAdvert: mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_advert", "Number of ICMPv6 neighbor advert packets sent."), 166 RedirectMsg: mustCreateMetric("/netstack/icmp/v6/packets_sent/redirect_msg", "Number of ICMPv6 redirect message packets sent."), 167 MulticastListenerQuery: 
mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_query", "Number of ICMPv6 multicast listener query packets sent."), 168 MulticastListenerReport: mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_report", "Number of ICMPv6 multicast listener report packets sent."), 169 MulticastListenerDone: mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_done", "Number of ICMPv6 multicast listener done packets sent."), 170 }, 171 Dropped: mustCreateMetric("/netstack/icmp/v6/packets_sent/dropped", "Number of ICMPv6 packets dropped due to link layer errors."), 172 RateLimited: mustCreateMetric("/netstack/icmp/v6/packets_sent/rate_limited", "Number of ICMPv6 packets dropped due to rate limit being exceeded."), 173 }, 174 PacketsReceived: tcpip.ICMPv6ReceivedPacketStats{ 175 ICMPv6PacketStats: tcpip.ICMPv6PacketStats{ 176 EchoRequest: mustCreateMetric("/netstack/icmp/v6/packets_received/echo_request", "Number of ICMPv6 echo request packets received."), 177 EchoReply: mustCreateMetric("/netstack/icmp/v6/packets_received/echo_reply", "Number of ICMPv6 echo reply packets received."), 178 DstUnreachable: mustCreateMetric("/netstack/icmp/v6/packets_received/dst_unreachable", "Number of ICMPv6 destination unreachable packets received."), 179 PacketTooBig: mustCreateMetric("/netstack/icmp/v6/packets_received/packet_too_big", "Number of ICMPv6 packet too big packets received."), 180 TimeExceeded: mustCreateMetric("/netstack/icmp/v6/packets_received/time_exceeded", "Number of ICMPv6 time exceeded packets received."), 181 ParamProblem: mustCreateMetric("/netstack/icmp/v6/packets_received/param_problem", "Number of ICMPv6 parameter problem packets received."), 182 RouterSolicit: mustCreateMetric("/netstack/icmp/v6/packets_received/router_solicit", "Number of ICMPv6 router solicit packets received."), 183 RouterAdvert: mustCreateMetric("/netstack/icmp/v6/packets_received/router_advert", "Number of ICMPv6 router advert packets received."), 
184 NeighborSolicit: mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_solicit", "Number of ICMPv6 neighbor solicit packets received."), 185 NeighborAdvert: mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_advert", "Number of ICMPv6 neighbor advert packets received."), 186 RedirectMsg: mustCreateMetric("/netstack/icmp/v6/packets_received/redirect_msg", "Number of ICMPv6 redirect message packets received."), 187 MulticastListenerQuery: mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_query", "Number of ICMPv6 multicast listener query packets received."), 188 MulticastListenerReport: mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_report", "Number of ICMPv6 multicast listener report packets received."), 189 MulticastListenerDone: mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_done", "Number of ICMPv6 multicast listener done packets received."), 190 }, 191 Unrecognized: mustCreateMetric("/netstack/icmp/v6/packets_received/unrecognized", "Number of ICMPv6 packets received that the transport layer does not know how to parse."), 192 Invalid: mustCreateMetric("/netstack/icmp/v6/packets_received/invalid", "Number of ICMPv6 packets received that the transport layer could not parse."), 193 RouterOnlyPacketsDroppedByHost: mustCreateMetric("/netstack/icmp/v6/packets_received/router_only_packets_dropped_by_host", "Number of ICMPv6 packets dropped due to being router-specific packets."), 194 }, 195 }, 196 }, 197 IGMP: tcpip.IGMPStats{ 198 PacketsSent: tcpip.IGMPSentPacketStats{ 199 IGMPPacketStats: tcpip.IGMPPacketStats{ 200 MembershipQuery: mustCreateMetric("/netstack/igmp/packets_sent/membership_query", "Number of IGMP Membership Query messages sent."), 201 V1MembershipReport: mustCreateMetric("/netstack/igmp/packets_sent/v1_membership_report", "Number of IGMPv1 Membership Report messages sent."), 202 V2MembershipReport: mustCreateMetric("/netstack/igmp/packets_sent/v2_membership_report", 
"Number of IGMPv2 Membership Report messages sent."), 203 LeaveGroup: mustCreateMetric("/netstack/igmp/packets_sent/leave_group", "Number of IGMP Leave Group messages sent."), 204 }, 205 Dropped: mustCreateMetric("/netstack/igmp/packets_sent/dropped", "Number of IGMP packets dropped due to link layer errors."), 206 }, 207 PacketsReceived: tcpip.IGMPReceivedPacketStats{ 208 IGMPPacketStats: tcpip.IGMPPacketStats{ 209 MembershipQuery: mustCreateMetric("/netstack/igmp/packets_received/membership_query", "Number of IGMP Membership Query messages received."), 210 V1MembershipReport: mustCreateMetric("/netstack/igmp/packets_received/v1_membership_report", "Number of IGMPv1 Membership Report messages received."), 211 V2MembershipReport: mustCreateMetric("/netstack/igmp/packets_received/v2_membership_report", "Number of IGMPv2 Membership Report messages received."), 212 LeaveGroup: mustCreateMetric("/netstack/igmp/packets_received/leave_group", "Number of IGMP Leave Group messages received."), 213 }, 214 Invalid: mustCreateMetric("/netstack/igmp/packets_received/invalid", "Number of IGMP packets received that could not be parsed."), 215 ChecksumErrors: mustCreateMetric("/netstack/igmp/packets_received/checksum_errors", "Number of received IGMP packets with bad checksums."), 216 Unrecognized: mustCreateMetric("/netstack/igmp/packets_received/unrecognized", "Number of unrecognized IGMP packets received."), 217 }, 218 }, 219 IP: tcpip.IPStats{ 220 PacketsReceived: mustCreateMetric("/netstack/ip/packets_received", "Number of IP packets received from the link layer in nic.DeliverNetworkPacket."), 221 DisabledPacketsReceived: mustCreateMetric("/netstack/ip/disabled_packets_received", "Number of IP packets received from the link layer when the IP layer is disabled."), 222 InvalidDestinationAddressesReceived: mustCreateMetric("/netstack/ip/invalid_addresses_received", "Number of IP packets received with an unknown or invalid destination address."), 223 
InvalidSourceAddressesReceived: mustCreateMetric("/netstack/ip/invalid_source_addresses_received", "Number of IP packets received with an unknown or invalid source address."), 224 PacketsDelivered: mustCreateMetric("/netstack/ip/packets_delivered", "Number of incoming IP packets that are successfully delivered to the transport layer via HandlePacket."), 225 PacketsSent: mustCreateMetric("/netstack/ip/packets_sent", "Number of IP packets sent via WritePacket."), 226 OutgoingPacketErrors: mustCreateMetric("/netstack/ip/outgoing_packet_errors", "Number of IP packets which failed to write to a link-layer endpoint."), 227 MalformedPacketsReceived: mustCreateMetric("/netstack/ip/malformed_packets_received", "Number of IP packets which failed IP header validation checks."), 228 MalformedFragmentsReceived: mustCreateMetric("/netstack/ip/malformed_fragments_received", "Number of IP fragments which failed IP fragment validation checks."), 229 IPTablesPreroutingDropped: mustCreateMetric("/netstack/ip/iptables/prerouting_dropped", "Number of IP packets dropped in the Prerouting chain."), 230 IPTablesInputDropped: mustCreateMetric("/netstack/ip/iptables/input_dropped", "Number of IP packets dropped in the Input chain."), 231 IPTablesOutputDropped: mustCreateMetric("/netstack/ip/iptables/output_dropped", "Number of IP packets dropped in the Output chain."), 232 OptionTimestampReceived: mustCreateMetric("/netstack/ip/options/timestamp_received", "Number of timestamp options found in received IP packets."), 233 OptionRecordRouteReceived: mustCreateMetric("/netstack/ip/options/record_route_received", "Number of record route options found in received IP packets."), 234 OptionRouterAlertReceived: mustCreateMetric("/netstack/ip/options/router_alert_received", "Number of router alert options found in received IP packets."), 235 OptionUnknownReceived: mustCreateMetric("/netstack/ip/options/unknown_received", "Number of unknown options found in received IP packets."), 236 Forwarding: 
tcpip.IPForwardingStats{ 237 Unrouteable: mustCreateMetric("/netstack/ip/forwarding/unrouteable", "Number of IP packets received which couldn't be routed and thus were not forwarded."), 238 ExhaustedTTL: mustCreateMetric("/netstack/ip/forwarding/exhausted_ttl", "Number of IP packets received which could not be forwarded due to an exhausted TTL."), 239 LinkLocalSource: mustCreateMetric("/netstack/ip/forwarding/link_local_source_address", "Number of IP packets received which could not be forwarded due to a link-local source address."), 240 LinkLocalDestination: mustCreateMetric("/netstack/ip/forwarding/link_local_destination_address", "Number of IP packets received which could not be forwarded due to a link-local destination address."), 241 ExtensionHeaderProblem: mustCreateMetric("/netstack/ip/forwarding/extension_header_problem", "Number of IP packets received which could not be forwarded due to a problem processing their IPv6 extension headers."), 242 PacketTooBig: mustCreateMetric("/netstack/ip/forwarding/packet_too_big", "Number of IP packets received which could not be forwarded because they could not fit within the outgoing MTU."), 243 HostUnreachable: mustCreateMetric("/netstack/ip/forwarding/host_unreachable", "Number of IP packets received which could not be forwarded due to unresolvable next hop."), 244 Errors: mustCreateMetric("/netstack/ip/forwarding/errors", "Number of IP packets which couldn't be forwarded."), 245 }, 246 }, 247 ARP: tcpip.ARPStats{ 248 PacketsReceived: mustCreateMetric("/netstack/arp/packets_received", "Number of ARP packets received from the link layer."), 249 DisabledPacketsReceived: mustCreateMetric("/netstack/arp/disabled_packets_received", "Number of ARP packets received from the link layer when the ARP layer is disabled."), 250 MalformedPacketsReceived: mustCreateMetric("/netstack/arp/malformed_packets_received", "Number of ARP packets which failed ARP header validation checks."), 251 RequestsReceived: 
mustCreateMetric("/netstack/arp/requests_received", "Number of ARP requests received."), 252 RequestsReceivedUnknownTargetAddress: mustCreateMetric("/netstack/arp/requests_received_unknown_addr", "Number of ARP requests received with an unknown target address."), 253 OutgoingRequestInterfaceHasNoLocalAddressErrors: mustCreateMetric("/netstack/arp/outgoing_requests_iface_has_no_addr", "Number of failed attempts to send an ARP request with an interface that has no network address."), 254 OutgoingRequestBadLocalAddressErrors: mustCreateMetric("/netstack/arp/outgoing_requests_invalid_local_addr", "Number of failed attempts to send an ARP request with a provided local address that is invalid."), 255 OutgoingRequestsDropped: mustCreateMetric("/netstack/arp/outgoing_requests_dropped", "Number of ARP requests which failed to write to a link-layer endpoint."), 256 OutgoingRequestsSent: mustCreateMetric("/netstack/arp/outgoing_requests_sent", "Number of ARP requests sent."), 257 RepliesReceived: mustCreateMetric("/netstack/arp/replies_received", "Number of ARP replies received."), 258 OutgoingRepliesDropped: mustCreateMetric("/netstack/arp/outgoing_replies_dropped", "Number of ARP replies which failed to write to a link-layer endpoint."), 259 OutgoingRepliesSent: mustCreateMetric("/netstack/arp/outgoing_replies_sent", "Number of ARP replies sent."), 260 }, 261 TCP: tcpip.TCPStats{ 262 ActiveConnectionOpenings: mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."), 263 PassiveConnectionOpenings: mustCreateMetric("/netstack/tcp/passive_connection_openings", "Number of connections opened successfully via Listen."), 264 CurrentEstablished: mustCreateGauge("/netstack/tcp/current_established", "Number of connections in ESTABLISHED state now."), 265 CurrentConnected: mustCreateGauge("/netstack/tcp/current_open", "Number of connections that are in connected state."), 266 EstablishedResets: 
mustCreateMetric("/netstack/tcp/established_resets", "Number of times TCP connections have made a direct transition to the CLOSED state from either the ESTABLISHED state or the CLOSE-WAIT state"), 267 EstablishedClosed: mustCreateMetric("/netstack/tcp/established_closed", "Number of times established TCP connections made a transition to CLOSED state."), 268 EstablishedTimedout: mustCreateMetric("/netstack/tcp/established_timedout", "Number of times an established connection was reset because of keep-alive time out."), 269 ListenOverflowSynDrop: mustCreateMetric("/netstack/tcp/listen_overflow_syn_drop", "Number of times the listen queue overflowed and a SYN was dropped."), 270 ListenOverflowAckDrop: mustCreateMetric("/netstack/tcp/listen_overflow_ack_drop", "Number of times the listen queue overflowed and the final ACK in the handshake was dropped."), 271 ListenOverflowSynCookieSent: mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_sent", "Number of times a SYN cookie was sent."), 272 ListenOverflowSynCookieRcvd: mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_rcvd", "Number of times a SYN cookie was received."), 273 ListenOverflowInvalidSynCookieRcvd: mustCreateMetric("/netstack/tcp/listen_overflow_invalid_syn_cookie_rcvd", "Number of times an invalid SYN cookie was received."), 274 FailedConnectionAttempts: mustCreateMetric("/netstack/tcp/failed_connection_attempts", "Number of calls to Connect or Listen (active and passive openings, respectively) that end in an error."), 275 ValidSegmentsReceived: mustCreateMetric("/netstack/tcp/valid_segments_received", "Number of TCP segments received that the transport layer successfully parsed."), 276 InvalidSegmentsReceived: mustCreateMetric("/netstack/tcp/invalid_segments_received", "Number of TCP segments received that the transport layer could not parse."), 277 SegmentsSent: mustCreateMetric("/netstack/tcp/segments_sent", "Number of TCP segments sent."), 278 SegmentSendErrors: 
mustCreateMetric("/netstack/tcp/segment_send_errors", "Number of TCP segments failed to be sent."), 279 ResetsSent: mustCreateMetric("/netstack/tcp/resets_sent", "Number of TCP resets sent."), 280 ResetsReceived: mustCreateMetric("/netstack/tcp/resets_received", "Number of TCP resets received."), 281 Retransmits: mustCreateMetric("/netstack/tcp/retransmits", "Number of TCP segments retransmitted."), 282 FastRecovery: mustCreateMetric("/netstack/tcp/fast_recovery", "Number of times fast recovery was used to recover from packet loss."), 283 SACKRecovery: mustCreateMetric("/netstack/tcp/sack_recovery", "Number of times SACK recovery was used to recover from packet loss."), 284 TLPRecovery: mustCreateMetric("/netstack/tcp/tlp_recovery", "Number of times tail loss probe triggers recovery from tail loss."), 285 SlowStartRetransmits: mustCreateMetric("/netstack/tcp/slow_start_retransmits", "Number of segments retransmitted in slow start mode."), 286 FastRetransmit: mustCreateMetric("/netstack/tcp/fast_retransmit", "Number of TCP segments which were fast retransmitted."), 287 Timeouts: mustCreateMetric("/netstack/tcp/timeouts", "Number of times RTO expired."), 288 ChecksumErrors: mustCreateMetric("/netstack/tcp/checksum_errors", "Number of segments dropped due to bad checksums."), 289 FailedPortReservations: mustCreateMetric("/netstack/tcp/failed_port_reservations", "Number of time TCP failed to reserve a port."), 290 SegmentsAckedWithDSACK: mustCreateMetric("/netstack/tcp/segments_acked_with_dsack", "Number of segments for which DSACK was received."), 291 SpuriousRecovery: mustCreateMetric("/netstack/tcp/spurious_recovery", "Number of times the connection entered loss recovery spuriously."), 292 SpuriousRTORecovery: mustCreateMetric("/netstack/tcp/spurious_rto_recovery", "Number of times the connection entered RTO spuriously."), 293 ForwardMaxInFlightDrop: mustCreateMetric("/netstack/tcp/forward_max_in_flight_drop", "Number of connection requests dropped due to exceeding 
in-flight limit."),
	},
	UDP: tcpip.UDPStats{
		PacketsReceived:          mustCreateMetric("/netstack/udp/packets_received", "Number of UDP datagrams received via HandlePacket."),
		UnknownPortErrors:        mustCreateMetric("/netstack/udp/unknown_port_errors", "Number of incoming UDP datagrams dropped because they did not have a known destination port."),
		ReceiveBufferErrors:      mustCreateMetric("/netstack/udp/receive_buffer_errors", "Number of incoming UDP datagrams dropped due to the receiving buffer being in an invalid state."),
		MalformedPacketsReceived: mustCreateMetric("/netstack/udp/malformed_packets_received", "Number of incoming UDP datagrams dropped due to the UDP header being in a malformed state."),
		PacketsSent:              mustCreateMetric("/netstack/udp/packets_sent", "Number of UDP datagrams sent."),
		PacketSendErrors:         mustCreateMetric("/netstack/udp/packet_send_errors", "Number of UDP datagrams failed to be sent."),
		ChecksumErrors:           mustCreateMetric("/netstack/udp/checksum_errors", "Number of UDP datagrams dropped due to bad checksums."),
	},
}

// DefaultTTL is linux's default TTL. All network protocols in all stacks used
// with this package must have this value set as their default TTL.
const DefaultTTL = 64

const sizeOfInt32 int = 4

var errStackType = syserr.New("expected but did not receive a netstack.Stack", errno.EINVAL)

// commonEndpoint represents the intersection of a tcpip.Endpoint and a
// transport.Endpoint.
type commonEndpoint interface {
	// Readiness implements tcpip.Endpoint.Readiness and
	// transport.Endpoint.Readiness.
	Readiness(mask waiter.EventMask) waiter.EventMask

	// SetSockOpt implements tcpip.Endpoint.SetSockOpt and
	// transport.Endpoint.SetSockOpt.
	SetSockOpt(tcpip.SettableSocketOption) tcpip.Error

	// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt and
	// transport.Endpoint.SetSockOptInt.
	SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error

	// GetSockOpt implements tcpip.Endpoint.GetSockOpt and
	// transport.Endpoint.GetSockOpt.
	GetSockOpt(tcpip.GettableSocketOption) tcpip.Error

	// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt and
	// transport.Endpoint.GetSockOptInt.
	GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error)

	// State returns a socket's lifecycle state. The returned value is
	// protocol-specific and is primarily used for diagnostics.
	State() uint32

	// LastError implements tcpip.Endpoint.LastError and
	// transport.Endpoint.LastError.
	LastError() tcpip.Error

	// SocketOptions implements tcpip.Endpoint.SocketOptions and
	// transport.Endpoint.SocketOptions.
	SocketOptions() *tcpip.SocketOptions
}

// sock encapsulates all the state needed to represent a network stack
// endpoint in the kernel context.
//
// +stateify savable
type sock struct {
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.DentryMetadataFileDescriptionImpl
	vfs.LockFD
	socket.SendReceiveTimeout
	*waiter.Queue

	family   int
	Endpoint tcpip.Endpoint
	skType   linux.SockType
	protocol int

	namespace *inet.Namespace

	// readMu protects access to the below fields.
	readMu sync.Mutex `state:"nosave"`

	// sockOptTimestamp corresponds to SO_TIMESTAMP. When true, timestamps
	// of returned messages can be returned via control messages. When
	// false, the same timestamp is instead stored and can be read via the
	// SIOCGSTAMP ioctl. It is protected by readMu. See socket(7).
	sockOptTimestamp bool
	// timestampValid indicates whether timestamp for SIOCGSTAMP has been
	// set. It is protected by readMu.
	timestampValid bool
	// timestamp holds the timestamp to use with SIOCGSTAMP. It is only
	// valid when timestampValid is true. It is protected by readMu.
	timestamp time.Time `state:".(int64)"`

	// TODO(b/153685824): Move this to SocketOptions.
	// sockOptInq corresponds to TCP_INQ.
	sockOptInq bool
}

var _ = socket.Socket(&sock{})

// New creates a new endpoint socket.
//
// For SOCK_STREAM sockets the endpoint's delay option is set to true before
// the socket is built. Once the file description has been initialized, a
// reference is taken on the task's network namespace; Release drops it.
func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*vfs.FileDescription, *syserr.Error) {
	if skType == linux.SOCK_STREAM {
		endpoint.SocketOptions().SetDelayOption(true)
	}

	mnt := t.Kernel().SocketMount()
	d := sockfs.NewDentry(t, mnt)
	defer d.DecRef(t)

	namespace := t.NetworkNamespace()
	s := &sock{
		Queue:     queue,
		family:    family,
		Endpoint:  endpoint,
		skType:    skType,
		protocol:  protocol,
		namespace: namespace,
	}
	s.LockFD.Init(&vfs.FileLocks{})
	vfsfd := &s.vfsfd
	if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{
		DenyPRead:         true,
		DenyPWrite:        true,
		UseDentryMetadata: true,
	}); err != nil {
		return nil, syserr.FromError(err)
	}
	// Only take the namespace reference once Init has succeeded, so the
	// failure path above has nothing to undo.
	namespace.IncRef()
	return vfsfd, nil
}

// Release implements vfs.FileDescriptionImpl.Release.
//
// It calls Kernel.DeleteSocket for the file description, closes the endpoint,
// waits out SO_LINGER for AF_INET/AF_INET6 stream sockets when a non-zero
// linger timeout is enabled, and finally drops the network namespace
// reference.
func (s *sock) Release(ctx context.Context) {
	kernel.KernelFromContext(ctx).DeleteSocket(&s.vfsfd)
	e, ch := waiter.NewChannelEntry(waiter.EventHUp | waiter.EventErr)
	s.EventRegister(&e)
	defer s.EventUnregister(&e)

	s.Endpoint.Close()

	// SO_LINGER option is valid only for TCP. For other socket types
	// return after endpoint close.
	if family, skType, _ := s.Type(); skType == linux.SOCK_STREAM && (family == linux.AF_INET || family == linux.AF_INET6) {
		v := s.Endpoint.SocketOptions().GetLinger()
		// The case for zero timeout is handled in tcp endpoint close function.
		// Close is blocked until either:
		// 1. The endpoint state is not in any of the states: FIN-WAIT1,
		// CLOSING and LAST_ACK.
		// 2. Timeout is reached.
		if v.Enabled && v.Timeout != 0 {
			// Lingering blocks the calling task. When ctx carries no task
			// there is nothing to block on, so skip the wait instead of
			// dereferencing a nil task.
			if t := kernel.TaskFromContext(ctx); t != nil {
				start := t.Kernel().MonotonicClock().Now()
				deadline := start.Add(v.Timeout)
				// The result is deliberately ignored: Release proceeds the
				// same way whether the wait ended by event or by timeout.
				_ = t.BlockWithDeadline(ch, true, deadline)
			}
		}
	}
	s.namespace.DecRef(ctx)
}

// Epollable implements FileDescriptionImpl.Epollable.
func (s *sock) Epollable() bool {
	return true
}

// Read implements vfs.FileDescriptionImpl.
//
// Any non-zero opts.Flags is rejected with EOPNOTSUPP and a zero-length dst
// returns (0, nil) without touching the endpoint. Otherwise a single
// non-blocking read is performed; if it would block, the bytes read so far
// are returned together with ErrWouldBlock.
func (s *sock) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
	// All flags other than RWF_NOWAIT should be ignored.
	// TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT.
	if opts.Flags != 0 {
		return 0, linuxerr.EOPNOTSUPP
	}

	if dst.NumBytes() == 0 {
		return 0, nil
	}
	n, _, _, _, _, err := s.nonBlockingRead(ctx, dst, false, false, false)
	if err == syserr.ErrWouldBlock {
		return int64(n), linuxerr.ErrWouldBlock
	}
	if err != nil {
		return 0, err.ToError()
	}
	return int64(n), nil
}

// Write implements vfs.FileDescriptionImpl.
//
// Any non-zero opts.Flags is rejected with EOPNOTSUPP. A write that the
// endpoint accepts only partially returns the number of bytes written
// together with ErrWouldBlock.
func (s *sock) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
	// All flags other than RWF_NOWAIT should be ignored.
	// TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT.
	if opts.Flags != 0 {
		return 0, linuxerr.EOPNOTSUPP
	}

	r := src.Reader(ctx)
	n, err := s.Endpoint.Write(r, tcpip.WriteOptions{})
	if _, ok := err.(*tcpip.ErrWouldBlock); ok {
		return 0, linuxerr.ErrWouldBlock
	}
	if err != nil {
		return 0, syserr.TranslateNetstackError(err).ToError()
	}

	// A short write is reported as "would block" alongside the partial count.
	if n < src.NumBytes() {
		return n, linuxerr.ErrWouldBlock
	}

	return n, nil
}

// Accept implements the linux syscall accept(2) for sockets backed by
// tcpip.Endpoint.
503 func (s *sock) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { 504 // Issue the accept request to get the new endpoint. 505 var peerAddr *tcpip.FullAddress 506 if peerRequested { 507 peerAddr = &tcpip.FullAddress{} 508 } 509 ep, wq, terr := s.Endpoint.Accept(peerAddr) 510 if terr != nil { 511 if _, ok := terr.(*tcpip.ErrWouldBlock); !ok || !blocking { 512 return 0, nil, 0, syserr.TranslateNetstackError(terr) 513 } 514 515 var err *syserr.Error 516 ep, wq, err = s.blockingAccept(t, peerAddr) 517 if err != nil { 518 return 0, nil, 0, err 519 } 520 } 521 522 ns, err := New(t, s.family, s.skType, s.protocol, wq, ep) 523 if err != nil { 524 return 0, nil, 0, err 525 } 526 defer ns.DecRef(t) 527 528 if err := ns.SetStatusFlags(t, t.Credentials(), uint32(flags&linux.SOCK_NONBLOCK)); err != nil { 529 return 0, nil, 0, syserr.FromError(err) 530 } 531 532 var addr linux.SockAddr 533 var addrLen uint32 534 if peerAddr != nil { 535 // Get address of the peer and write it to peer slice. 536 addr, addrLen = socket.ConvertAddress(s.family, *peerAddr) 537 } 538 539 fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{ 540 CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, 541 }) 542 543 t.Kernel().RecordSocket(ns) 544 545 return fd, addr, addrLen, syserr.FromError(e) 546 } 547 548 // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by 549 // tcpip.Endpoint. 550 func (s *sock) GetSockOpt(t *kernel.Task, level, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { 551 // TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is 552 // implemented specifically for netstack.Socket rather than 553 // commonEndpoint. commonEndpoint should be extended to support socket 554 // options where the implementation is not shared, as unix sockets need 555 // their own support for SO_TIMESTAMP. 
556 if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP { 557 if outLen < sizeOfInt32 { 558 return nil, syserr.ErrInvalidArgument 559 } 560 val := primitive.Int32(0) 561 s.readMu.Lock() 562 defer s.readMu.Unlock() 563 if s.sockOptTimestamp { 564 val = 1 565 } 566 return &val, nil 567 } 568 if level == linux.SOL_TCP && name == linux.TCP_INQ { 569 if outLen < sizeOfInt32 { 570 return nil, syserr.ErrInvalidArgument 571 } 572 val := primitive.Int32(0) 573 s.readMu.Lock() 574 defer s.readMu.Unlock() 575 if s.sockOptInq { 576 val = 1 577 } 578 return &val, nil 579 } 580 581 return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outPtr, outLen) 582 } 583 584 // SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by 585 // tcpip.Endpoint. 586 func (s *sock) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error { 587 // TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is 588 // implemented specifically for netstack.Socket rather than 589 // commonEndpoint. commonEndpoint should be extended to support socket 590 // options where the implementation is not shared, as unix sockets need 591 // their own support for SO_TIMESTAMP. 
592 if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP { 593 if len(optVal) < sizeOfInt32 { 594 return syserr.ErrInvalidArgument 595 } 596 s.readMu.Lock() 597 defer s.readMu.Unlock() 598 s.sockOptTimestamp = hostarch.ByteOrder.Uint32(optVal) != 0 599 return nil 600 } 601 if level == linux.SOL_TCP && name == linux.TCP_INQ { 602 if len(optVal) < sizeOfInt32 { 603 return syserr.ErrInvalidArgument 604 } 605 s.readMu.Lock() 606 defer s.readMu.Unlock() 607 s.sockOptInq = hostarch.ByteOrder.Uint32(optVal) != 0 608 return nil 609 } 610 611 return SetSockOpt(t, s, s.Endpoint, level, name, optVal) 612 } 613 614 var sockAddrInetSize = (*linux.SockAddrInet)(nil).SizeBytes() 615 var sockAddrInet6Size = (*linux.SockAddrInet6)(nil).SizeBytes() 616 var sockAddrLinkSize = (*linux.SockAddrLink)(nil).SizeBytes() 617 618 // minSockAddrLen returns the minimum length in bytes of a socket address for 619 // the socket's family. 620 func (s *sock) minSockAddrLen() int { 621 const addressFamilySize = 2 622 623 switch s.family { 624 case linux.AF_UNIX: 625 return addressFamilySize 626 case linux.AF_INET: 627 return sockAddrInetSize 628 case linux.AF_INET6: 629 return sockAddrInet6Size 630 case linux.AF_PACKET: 631 return sockAddrLinkSize 632 case linux.AF_UNSPEC: 633 return addressFamilySize 634 default: 635 panic(fmt.Sprintf("s.family unrecognized = %d", s.family)) 636 } 637 } 638 639 func (s *sock) isPacketBased() bool { 640 return s.skType == linux.SOCK_DGRAM || s.skType == linux.SOCK_SEQPACKET || s.skType == linux.SOCK_RDM || s.skType == linux.SOCK_RAW 641 } 642 643 // Readiness returns a mask of ready events for socket s. 644 func (s *sock) Readiness(mask waiter.EventMask) waiter.EventMask { 645 return s.Endpoint.Readiness(mask) 646 } 647 648 // checkFamily returns true iff the specified address family may be used with 649 // the socket. 650 // 651 // If exact is true, then the specified address family must be an exact match 652 // with the socket's family. 
653 func (s *sock) checkFamily(family uint16, exact bool) bool { 654 if family == uint16(s.family) { 655 return true 656 } 657 if !exact && family == linux.AF_INET && s.family == linux.AF_INET6 { 658 if !s.Endpoint.SocketOptions().GetV6Only() { 659 return true 660 } 661 } 662 return false 663 } 664 665 // mapFamily maps the AF_INET ANY address to the IPv4-mapped IPv6 ANY if the 666 // receiver's family is AF_INET6. 667 // 668 // This is a hack to work around the fact that both IPv4 and IPv6 ANY are 669 // represented by the empty string. 670 // 671 // TODO(gvisor.dev/issue/1556): remove this function. 672 func (s *sock) mapFamily(addr tcpip.FullAddress, family uint16) tcpip.FullAddress { 673 if addr.Addr.BitLen() == 0 && s.family == linux.AF_INET6 && family == linux.AF_INET { 674 addr.Addr = tcpip.AddrFrom16([16]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}) 675 } 676 return addr 677 } 678 679 // Connect implements the linux syscall connect(2) for sockets backed by 680 // tpcip.Endpoint. 681 func (s *sock) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { 682 addr, family, err := socket.AddressAndFamily(sockaddr) 683 if err != nil { 684 return err 685 } 686 687 if family == linux.AF_UNSPEC { 688 err := s.Endpoint.Disconnect() 689 if _, ok := err.(*tcpip.ErrNotSupported); ok { 690 return syserr.ErrAddressFamilyNotSupported 691 } 692 return syserr.TranslateNetstackError(err) 693 } 694 695 if !s.checkFamily(family, false /* exact */) { 696 return syserr.ErrInvalidArgument 697 } 698 addr = s.mapFamily(addr, family) 699 700 // Always return right away in the non-blocking case. 701 if !blocking { 702 return syserr.TranslateNetstackError(s.Endpoint.Connect(addr)) 703 } 704 705 // Register for notification when the endpoint becomes writable, then 706 // initiate the connection. 
707 e, ch := waiter.NewChannelEntry(waiter.WritableEvents) 708 s.EventRegister(&e) 709 defer s.EventUnregister(&e) 710 711 switch err := s.Endpoint.Connect(addr); err.(type) { 712 case *tcpip.ErrConnectStarted, *tcpip.ErrAlreadyConnecting: 713 case *tcpip.ErrNoPortAvailable: 714 if (s.family == unix.AF_INET || s.family == unix.AF_INET6) && s.skType == linux.SOCK_STREAM { 715 // TCP unlike UDP returns EADDRNOTAVAIL when it can't 716 // find an available local ephemeral port. 717 return syserr.ErrAddressNotAvailable 718 } 719 return syserr.TranslateNetstackError(err) 720 default: 721 return syserr.TranslateNetstackError(err) 722 } 723 724 // It's pending, so we have to wait for a notification, and fetch the 725 // result once the wait completes. 726 if err := t.Block(ch); err != nil { 727 return syserr.FromError(err) 728 } 729 730 // Call Connect() again after blocking to find connect's result. 731 return syserr.TranslateNetstackError(s.Endpoint.Connect(addr)) 732 } 733 734 // Bind implements the linux syscall bind(2) for sockets backed by 735 // tcpip.Endpoint. 736 func (s *sock) Bind(_ *kernel.Task, sockaddr []byte) *syserr.Error { 737 if len(sockaddr) < 2 { 738 return syserr.ErrInvalidArgument 739 } 740 741 family := hostarch.ByteOrder.Uint16(sockaddr) 742 var addr tcpip.FullAddress 743 744 // Bind for AF_PACKET requires only family, protocol and ifindex. 745 // In function AddressAndFamily, we check the address length which is 746 // not needed for AF_PACKET bind. 
747 if family == linux.AF_PACKET { 748 var a linux.SockAddrLink 749 if len(sockaddr) < sockAddrLinkSize { 750 return syserr.ErrInvalidArgument 751 } 752 a.UnmarshalBytes(sockaddr) 753 754 addr = tcpip.FullAddress{ 755 NIC: tcpip.NICID(a.InterfaceIndex), 756 Addr: tcpip.AddrFrom16Slice(append( 757 a.HardwareAddr[:header.EthernetAddressSize], 758 []byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}..., 759 )), 760 Port: socket.Ntohs(a.Protocol), 761 } 762 } else { 763 if s.minSockAddrLen() > len(sockaddr) { 764 return syserr.ErrInvalidArgument 765 } 766 767 var err *syserr.Error 768 addr, family, err = socket.AddressAndFamily(sockaddr) 769 if err != nil { 770 return err 771 } 772 773 if !s.checkFamily(family, true /* exact */) { 774 return syserr.ErrAddressFamilyNotSupported 775 } 776 777 addr = s.mapFamily(addr, family) 778 } 779 780 // Issue the bind request to the endpoint. 781 err := s.Endpoint.Bind(addr) 782 if _, ok := err.(*tcpip.ErrNoPortAvailable); ok { 783 // Bind always returns EADDRINUSE irrespective of if the specified port was 784 // already bound or if an ephemeral port was requested but none were 785 // available. 786 // 787 // *tcpip.ErrNoPortAvailable is mapped to EAGAIN in syserr package because 788 // UDP connect returns EAGAIN on ephemeral port exhaustion. 789 // 790 // TCP connect returns EADDRNOTAVAIL on ephemeral port exhaustion. 791 err = &tcpip.ErrPortInUse{} 792 } 793 794 return syserr.TranslateNetstackError(err) 795 } 796 797 // Listen implements the linux syscall listen(2) for sockets backed by 798 // tcpip.Endpoint. 799 func (s *sock) Listen(_ *kernel.Task, backlog int) *syserr.Error { 800 if err := s.Endpoint.Listen(backlog); err != nil { 801 return syserr.TranslateNetstackError(err) 802 } 803 if !socket.IsTCP(s) { 804 return nil 805 } 806 807 // Emit SentryTCPListenEvent with the bound port for tcp sockets. 
808 addr, err := s.Endpoint.GetLocalAddress() 809 if err != nil { 810 panic(fmt.Sprintf("GetLocalAddress failed for tcp socket: %s", err)) 811 } 812 eventchannel.Emit(&epb.SentryTcpListenEvent{ 813 Port: proto.Int32(int32(addr.Port)), 814 }) 815 return nil 816 } 817 818 // blockingAccept implements a blocking version of accept(2), that is, if no 819 // connections are ready to be accept, it will block until one becomes ready. 820 func (s *sock) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) { 821 // Register for notifications. 822 e, ch := waiter.NewChannelEntry(waiter.ReadableEvents) 823 s.EventRegister(&e) 824 defer s.EventUnregister(&e) 825 826 // Try to accept the connection again; if it fails, then wait until we 827 // get a notification. 828 for { 829 ep, wq, err := s.Endpoint.Accept(peerAddr) 830 if _, ok := err.(*tcpip.ErrWouldBlock); !ok { 831 return ep, wq, syserr.TranslateNetstackError(err) 832 } 833 834 if err := t.Block(ch); err != nil { 835 return nil, nil, syserr.FromError(err) 836 } 837 } 838 } 839 840 // ConvertShutdown converts Linux shutdown flags into tcpip shutdown flags. 841 func ConvertShutdown(how int) (tcpip.ShutdownFlags, *syserr.Error) { 842 var f tcpip.ShutdownFlags 843 switch how { 844 case linux.SHUT_RD: 845 f = tcpip.ShutdownRead 846 case linux.SHUT_WR: 847 f = tcpip.ShutdownWrite 848 case linux.SHUT_RDWR: 849 f = tcpip.ShutdownRead | tcpip.ShutdownWrite 850 default: 851 return 0, syserr.ErrInvalidArgument 852 } 853 return f, nil 854 } 855 856 // Shutdown implements the linux syscall shutdown(2) for sockets backed by 857 // tcpip.Endpoint. 858 func (s *sock) Shutdown(_ *kernel.Task, how int) *syserr.Error { 859 f, err := ConvertShutdown(how) 860 if err != nil { 861 return err 862 } 863 864 // Issue shutdown request. 
865 return syserr.TranslateNetstackError(s.Endpoint.Shutdown(f)) 866 } 867 868 // GetSockOpt can be used to implement the linux syscall getsockopt(2) for 869 // sockets backed by a commonEndpoint. 870 func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType linux.SockType, level, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { 871 switch level { 872 case linux.SOL_SOCKET: 873 return getSockOptSocket(t, s, ep, family, skType, name, outLen) 874 875 case linux.SOL_TCP: 876 return getSockOptTCP(t, s, ep, name, outLen) 877 878 case linux.SOL_IPV6: 879 return getSockOptIPv6(t, s, ep, name, outPtr, outLen) 880 881 case linux.SOL_IP: 882 return getSockOptIP(t, s, ep, name, outPtr, outLen, family) 883 884 case linux.SOL_ICMPV6: 885 return getSockOptICMPv6(t, s, ep, name, outLen) 886 887 case linux.SOL_UDP, 888 linux.SOL_RAW, 889 linux.SOL_PACKET: 890 // Not supported. 891 } 892 893 return nil, syserr.ErrProtocolNotAvailable 894 } 895 896 func boolToInt32(v bool) int32 { 897 if v { 898 return 1 899 } 900 return 0 901 } 902 903 // getSockOptSocket implements GetSockOpt when level is SOL_SOCKET. 904 func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, _ linux.SockType, name, outLen int) (marshal.Marshallable, *syserr.Error) { 905 // TODO(b/124056281): Stop rejecting short optLen values in getsockopt. 906 switch name { 907 case linux.SO_ERROR: 908 if outLen < sizeOfInt32 { 909 return nil, syserr.ErrInvalidArgument 910 } 911 912 // Get the last error and convert it. 
913 err := ep.SocketOptions().GetLastError() 914 if err == nil { 915 optP := primitive.Int32(0) 916 return &optP, nil 917 } 918 919 optP := primitive.Int32(syserr.TranslateNetstackError(err).ToLinux()) 920 return &optP, nil 921 922 case linux.SO_PEERCRED: 923 if family != linux.AF_UNIX || outLen < unix.SizeofUcred { 924 return nil, syserr.ErrInvalidArgument 925 } 926 927 tcred := t.Credentials() 928 creds := linux.ControlMessageCredentials{ 929 PID: int32(t.ThreadGroup().ID()), 930 UID: uint32(tcred.EffectiveKUID.In(tcred.UserNamespace).OrOverflow()), 931 GID: uint32(tcred.EffectiveKGID.In(tcred.UserNamespace).OrOverflow()), 932 } 933 return &creds, nil 934 935 case linux.SO_PASSCRED: 936 if outLen < sizeOfInt32 { 937 return nil, syserr.ErrInvalidArgument 938 } 939 940 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetPassCred())) 941 return &v, nil 942 943 case linux.SO_SNDBUF: 944 if outLen < sizeOfInt32 { 945 return nil, syserr.ErrInvalidArgument 946 } 947 948 size := ep.SocketOptions().GetSendBufferSize() 949 950 if size > math.MaxInt32 { 951 size = math.MaxInt32 952 } 953 954 sizeP := primitive.Int32(size) 955 return &sizeP, nil 956 957 case linux.SO_RCVBUF: 958 if outLen < sizeOfInt32 { 959 return nil, syserr.ErrInvalidArgument 960 } 961 962 size := ep.SocketOptions().GetReceiveBufferSize() 963 964 if size > math.MaxInt32 { 965 size = math.MaxInt32 966 } 967 968 sizeP := primitive.Int32(size) 969 return &sizeP, nil 970 971 case linux.SO_REUSEADDR: 972 if outLen < sizeOfInt32 { 973 return nil, syserr.ErrInvalidArgument 974 } 975 976 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReuseAddress())) 977 return &v, nil 978 979 case linux.SO_REUSEPORT: 980 if outLen < sizeOfInt32 { 981 return nil, syserr.ErrInvalidArgument 982 } 983 984 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReusePort())) 985 return &v, nil 986 987 case linux.SO_BINDTODEVICE: 988 v := ep.SocketOptions().GetBindToDevice() 989 if v == 0 { 990 var b primitive.ByteSlice 991 
return &b, nil 992 } 993 if outLen < linux.IFNAMSIZ { 994 return nil, syserr.ErrInvalidArgument 995 } 996 s := t.NetworkContext() 997 if s == nil { 998 return nil, syserr.ErrNoDevice 999 } 1000 nic, ok := s.Interfaces()[int32(v)] 1001 if !ok { 1002 // The NICID no longer indicates a valid interface, probably because that 1003 // interface was removed. 1004 return nil, syserr.ErrUnknownDevice 1005 } 1006 1007 name := primitive.ByteSlice(append([]byte(nic.Name), 0)) 1008 return &name, nil 1009 1010 case linux.SO_BROADCAST: 1011 if outLen < sizeOfInt32 { 1012 return nil, syserr.ErrInvalidArgument 1013 } 1014 1015 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetBroadcast())) 1016 return &v, nil 1017 1018 case linux.SO_KEEPALIVE: 1019 if outLen < sizeOfInt32 { 1020 return nil, syserr.ErrInvalidArgument 1021 } 1022 1023 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetKeepAlive())) 1024 return &v, nil 1025 1026 case linux.SO_LINGER: 1027 if outLen < linux.SizeOfLinger { 1028 return nil, syserr.ErrInvalidArgument 1029 } 1030 1031 var linger linux.Linger 1032 v := ep.SocketOptions().GetLinger() 1033 1034 if v.Enabled { 1035 linger.OnOff = 1 1036 } 1037 linger.Linger = int32(v.Timeout.Seconds()) 1038 return &linger, nil 1039 1040 case linux.SO_SNDTIMEO: 1041 // TODO(igudger): Linux allows shorter lengths for partial results. 1042 if outLen < linux.SizeOfTimeval { 1043 return nil, syserr.ErrInvalidArgument 1044 } 1045 1046 sendTimeout := linux.NsecToTimeval(s.SendTimeout()) 1047 return &sendTimeout, nil 1048 1049 case linux.SO_RCVTIMEO: 1050 // TODO(igudger): Linux allows shorter lengths for partial results. 
1051 if outLen < linux.SizeOfTimeval { 1052 return nil, syserr.ErrInvalidArgument 1053 } 1054 1055 recvTimeout := linux.NsecToTimeval(s.RecvTimeout()) 1056 return &recvTimeout, nil 1057 1058 case linux.SO_OOBINLINE: 1059 if outLen < sizeOfInt32 { 1060 return nil, syserr.ErrInvalidArgument 1061 } 1062 1063 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetOutOfBandInline())) 1064 return &v, nil 1065 1066 case linux.SO_NO_CHECK: 1067 if outLen < sizeOfInt32 { 1068 return nil, syserr.ErrInvalidArgument 1069 } 1070 1071 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetNoChecksum())) 1072 return &v, nil 1073 1074 case linux.SO_ACCEPTCONN: 1075 if outLen < sizeOfInt32 { 1076 return nil, syserr.ErrInvalidArgument 1077 } 1078 1079 // This option is only viable for TCP endpoints. 1080 var v bool 1081 if socket.IsTCP(s) { 1082 v = tcp.EndpointState(ep.State()) == tcp.StateListen 1083 } 1084 vP := primitive.Int32(boolToInt32(v)) 1085 return &vP, nil 1086 1087 case linux.SO_RCVLOWAT: 1088 if outLen < sizeOfInt32 { 1089 return nil, syserr.ErrInvalidArgument 1090 } 1091 1092 v := primitive.Int32(ep.SocketOptions().GetRcvlowat()) 1093 return &v, nil 1094 } 1095 return nil, syserr.ErrProtocolNotAvailable 1096 } 1097 1098 // getSockOptTCP implements GetSockOpt when level is SOL_TCP. 
1099 func getSockOptTCP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name, outLen int) (marshal.Marshallable, *syserr.Error) { 1100 if !socket.IsTCP(s) { 1101 return nil, syserr.ErrUnknownProtocolOption 1102 } 1103 1104 switch name { 1105 case linux.TCP_NODELAY: 1106 if outLen < sizeOfInt32 { 1107 return nil, syserr.ErrInvalidArgument 1108 } 1109 1110 v := primitive.Int32(boolToInt32(!ep.SocketOptions().GetDelayOption())) 1111 return &v, nil 1112 1113 case linux.TCP_CORK: 1114 if outLen < sizeOfInt32 { 1115 return nil, syserr.ErrInvalidArgument 1116 } 1117 1118 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetCorkOption())) 1119 return &v, nil 1120 1121 case linux.TCP_QUICKACK: 1122 if outLen < sizeOfInt32 { 1123 return nil, syserr.ErrInvalidArgument 1124 } 1125 1126 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetQuickAck())) 1127 return &v, nil 1128 1129 case linux.TCP_MAXSEG: 1130 if outLen < sizeOfInt32 { 1131 return nil, syserr.ErrInvalidArgument 1132 } 1133 1134 v, err := ep.GetSockOptInt(tcpip.MaxSegOption) 1135 if err != nil { 1136 return nil, syserr.TranslateNetstackError(err) 1137 } 1138 vP := primitive.Int32(v) 1139 return &vP, nil 1140 1141 case linux.TCP_KEEPIDLE: 1142 if outLen < sizeOfInt32 { 1143 return nil, syserr.ErrInvalidArgument 1144 } 1145 1146 var v tcpip.KeepaliveIdleOption 1147 if err := ep.GetSockOpt(&v); err != nil { 1148 return nil, syserr.TranslateNetstackError(err) 1149 } 1150 keepAliveIdle := primitive.Int32(time.Duration(v) / time.Second) 1151 return &keepAliveIdle, nil 1152 1153 case linux.TCP_KEEPINTVL: 1154 if outLen < sizeOfInt32 { 1155 return nil, syserr.ErrInvalidArgument 1156 } 1157 1158 var v tcpip.KeepaliveIntervalOption 1159 if err := ep.GetSockOpt(&v); err != nil { 1160 return nil, syserr.TranslateNetstackError(err) 1161 } 1162 keepAliveInterval := primitive.Int32(time.Duration(v) / time.Second) 1163 return &keepAliveInterval, nil 1164 1165 case linux.TCP_KEEPCNT: 1166 if outLen < sizeOfInt32 { 1167 
return nil, syserr.ErrInvalidArgument 1168 } 1169 1170 v, err := ep.GetSockOptInt(tcpip.KeepaliveCountOption) 1171 if err != nil { 1172 return nil, syserr.TranslateNetstackError(err) 1173 } 1174 vP := primitive.Int32(v) 1175 return &vP, nil 1176 1177 case linux.TCP_USER_TIMEOUT: 1178 if outLen < sizeOfInt32 { 1179 return nil, syserr.ErrInvalidArgument 1180 } 1181 1182 var v tcpip.TCPUserTimeoutOption 1183 if err := ep.GetSockOpt(&v); err != nil { 1184 return nil, syserr.TranslateNetstackError(err) 1185 } 1186 tcpUserTimeout := primitive.Int32(time.Duration(v) / time.Millisecond) 1187 return &tcpUserTimeout, nil 1188 1189 case linux.TCP_INFO: 1190 var v tcpip.TCPInfoOption 1191 if err := ep.GetSockOpt(&v); err != nil { 1192 return nil, syserr.TranslateNetstackError(err) 1193 } 1194 1195 // TODO(b/64800844): Translate fields once they are added to 1196 // tcpip.TCPInfoOption. 1197 info := linux.TCPInfo{ 1198 State: uint8(v.State), 1199 RTO: uint32(v.RTO / time.Microsecond), 1200 RTT: uint32(v.RTT / time.Microsecond), 1201 RTTVar: uint32(v.RTTVar / time.Microsecond), 1202 SndSsthresh: v.SndSsthresh, 1203 SndCwnd: v.SndCwnd, 1204 } 1205 switch v.CcState { 1206 case tcpip.RTORecovery: 1207 info.CaState = linux.TCP_CA_Loss 1208 case tcpip.FastRecovery, tcpip.SACKRecovery: 1209 info.CaState = linux.TCP_CA_Recovery 1210 case tcpip.Disorder: 1211 info.CaState = linux.TCP_CA_Disorder 1212 case tcpip.Open: 1213 info.CaState = linux.TCP_CA_Open 1214 } 1215 1216 // In netstack reorderSeen is updated only when RACK is enabled. 1217 // We only track whether the reordering is seen, which is 1218 // different than Linux where reorderSeen is not specific to 1219 // RACK and is incremented when a reordering event is seen. 1220 if v.ReorderSeen { 1221 info.ReordSeen = 1 1222 } 1223 1224 // Linux truncates the output binary to outLen. 
1225 buf := t.CopyScratchBuffer(info.SizeBytes()) 1226 info.MarshalUnsafe(buf) 1227 if len(buf) > outLen { 1228 buf = buf[:outLen] 1229 } 1230 bufP := primitive.ByteSlice(buf) 1231 return &bufP, nil 1232 1233 case linux.TCP_CC_INFO, 1234 linux.TCP_NOTSENT_LOWAT, 1235 linux.TCP_ZEROCOPY_RECEIVE: 1236 1237 // Not supported. 1238 1239 case linux.TCP_CONGESTION: 1240 if outLen <= 0 { 1241 return nil, syserr.ErrInvalidArgument 1242 } 1243 1244 var v tcpip.CongestionControlOption 1245 if err := ep.GetSockOpt(&v); err != nil { 1246 return nil, syserr.TranslateNetstackError(err) 1247 } 1248 1249 // We match linux behaviour here where it returns the lower of 1250 // TCP_CA_NAME_MAX bytes or the value of the option length. 1251 // 1252 // This is Linux's net/tcp.h TCP_CA_NAME_MAX. 1253 const tcpCANameMax = 16 1254 1255 toCopy := tcpCANameMax 1256 if outLen < tcpCANameMax { 1257 toCopy = outLen 1258 } 1259 b := make([]byte, toCopy) 1260 copy(b, v) 1261 1262 bP := primitive.ByteSlice(b) 1263 return &bP, nil 1264 1265 case linux.TCP_LINGER2: 1266 if outLen < sizeOfInt32 { 1267 return nil, syserr.ErrInvalidArgument 1268 } 1269 1270 var v tcpip.TCPLingerTimeoutOption 1271 if err := ep.GetSockOpt(&v); err != nil { 1272 return nil, syserr.TranslateNetstackError(err) 1273 } 1274 var lingerTimeout primitive.Int32 1275 if v >= 0 { 1276 lingerTimeout = primitive.Int32(time.Duration(v) / time.Second) 1277 } else { 1278 lingerTimeout = -1 1279 } 1280 return &lingerTimeout, nil 1281 1282 case linux.TCP_DEFER_ACCEPT: 1283 if outLen < sizeOfInt32 { 1284 return nil, syserr.ErrInvalidArgument 1285 } 1286 1287 var v tcpip.TCPDeferAcceptOption 1288 if err := ep.GetSockOpt(&v); err != nil { 1289 return nil, syserr.TranslateNetstackError(err) 1290 } 1291 1292 tcpDeferAccept := primitive.Int32(time.Duration(v) / time.Second) 1293 return &tcpDeferAccept, nil 1294 1295 case linux.TCP_SYNCNT: 1296 if outLen < sizeOfInt32 { 1297 return nil, syserr.ErrInvalidArgument 1298 } 1299 1300 v, err := 
ep.GetSockOptInt(tcpip.TCPSynCountOption) 1301 if err != nil { 1302 return nil, syserr.TranslateNetstackError(err) 1303 } 1304 vP := primitive.Int32(v) 1305 return &vP, nil 1306 1307 case linux.TCP_WINDOW_CLAMP: 1308 if outLen < sizeOfInt32 { 1309 return nil, syserr.ErrInvalidArgument 1310 } 1311 1312 v, err := ep.GetSockOptInt(tcpip.TCPWindowClampOption) 1313 if err != nil { 1314 return nil, syserr.TranslateNetstackError(err) 1315 } 1316 vP := primitive.Int32(v) 1317 return &vP, nil 1318 } 1319 return nil, syserr.ErrProtocolNotAvailable 1320 } 1321 1322 func getSockOptICMPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, outLen int) (marshal.Marshallable, *syserr.Error) { 1323 if _, ok := ep.(tcpip.Endpoint); !ok { 1324 log.Warningf("SOL_ICMPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name) 1325 return nil, syserr.ErrUnknownProtocolOption 1326 } 1327 1328 if family, _, _ := s.Type(); family != linux.AF_INET6 { 1329 return nil, syserr.ErrNotSupported 1330 } 1331 1332 switch name { 1333 case linux.ICMPV6_FILTER: 1334 var v tcpip.ICMPv6Filter 1335 if err := ep.GetSockOpt(&v); err != nil { 1336 return nil, syserr.TranslateNetstackError(err) 1337 } 1338 1339 filter := linux.ICMP6Filter{Filter: v.DenyType} 1340 1341 // Linux truncates the output to outLen. 
1342 buf := t.CopyScratchBuffer(filter.SizeBytes()) 1343 filter.MarshalUnsafe(buf) 1344 if len(buf) > outLen { 1345 buf = buf[:outLen] 1346 } 1347 bufP := primitive.ByteSlice(buf) 1348 return &bufP, nil 1349 } 1350 return nil, syserr.ErrProtocolNotAvailable 1351 } 1352 1353 func defaultTTL(t *kernel.Task, network tcpip.NetworkProtocolNumber) (primitive.Int32, tcpip.Error) { 1354 var opt tcpip.DefaultTTLOption 1355 stack := inet.StackFromContext(t) 1356 if err := stack.(*Stack).Stack.NetworkProtocolOption(network, &opt); err != nil { 1357 return 0, err 1358 } 1359 return primitive.Int32(opt), nil 1360 } 1361 1362 // getSockOptIPv6 implements GetSockOpt when level is SOL_IPV6. 1363 func getSockOptIPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { 1364 if _, ok := ep.(tcpip.Endpoint); !ok { 1365 log.Warningf("SOL_IPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name) 1366 return nil, syserr.ErrUnknownProtocolOption 1367 } 1368 1369 family, skType, _ := s.Type() 1370 if family != linux.AF_INET6 { 1371 return nil, syserr.ErrNotSupported 1372 } 1373 1374 switch name { 1375 case linux.IPV6_CHECKSUM: 1376 if outLen < sizeOfInt32 { 1377 return nil, syserr.ErrInvalidArgument 1378 } 1379 1380 v, err := ep.GetSockOptInt(tcpip.IPv6Checksum) 1381 if err != nil { 1382 return nil, syserr.TranslateNetstackError(err) 1383 } 1384 1385 vP := primitive.Int32(v) 1386 return &vP, nil 1387 1388 case linux.IPV6_V6ONLY: 1389 if outLen < sizeOfInt32 { 1390 return nil, syserr.ErrInvalidArgument 1391 } 1392 1393 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetV6Only())) 1394 return &v, nil 1395 1396 case linux.IPV6_UNICAST_HOPS: 1397 if outLen < sizeOfInt32 { 1398 return nil, syserr.ErrInvalidArgument 1399 } 1400 1401 v, err := ep.GetSockOptInt(tcpip.IPv6HopLimitOption) 1402 if err != nil { 1403 return nil, syserr.TranslateNetstackError(err) 1404 } 1405 1406 // Fill 
in the default value, if needed. 1407 vP := primitive.Int32(v) 1408 if vP == -1 { 1409 vP, err = defaultTTL(t, header.IPv6ProtocolNumber) 1410 if err != nil { 1411 return nil, syserr.TranslateNetstackError(err) 1412 } 1413 } 1414 1415 return &vP, nil 1416 1417 case linux.IPV6_RECVHOPLIMIT: 1418 if outLen < sizeOfInt32 { 1419 return nil, syserr.ErrInvalidArgument 1420 } 1421 1422 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveHopLimit())) 1423 return &v, nil 1424 1425 case linux.IPV6_PATHMTU: 1426 // Not supported. 1427 1428 case linux.IPV6_TCLASS: 1429 // Length handling for parity with Linux. 1430 if outLen == 0 { 1431 var b primitive.ByteSlice 1432 return &b, nil 1433 } 1434 v, err := ep.GetSockOptInt(tcpip.IPv6TrafficClassOption) 1435 if err != nil { 1436 return nil, syserr.TranslateNetstackError(err) 1437 } 1438 1439 uintv := primitive.Uint32(v) 1440 // Linux truncates the output binary to outLen. 1441 ib := t.CopyScratchBuffer(uintv.SizeBytes()) 1442 uintv.MarshalUnsafe(ib) 1443 // Handle cases where outLen is lesser than sizeOfInt32. 
1444 if len(ib) > outLen { 1445 ib = ib[:outLen] 1446 } 1447 ibP := primitive.ByteSlice(ib) 1448 return &ibP, nil 1449 1450 case linux.IPV6_RECVTCLASS: 1451 if outLen < sizeOfInt32 { 1452 return nil, syserr.ErrInvalidArgument 1453 } 1454 1455 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTClass())) 1456 return &v, nil 1457 case linux.IPV6_RECVERR: 1458 if outLen < sizeOfInt32 { 1459 return nil, syserr.ErrInvalidArgument 1460 } 1461 1462 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetIPv6RecvError())) 1463 return &v, nil 1464 1465 case linux.IPV6_RECVORIGDSTADDR: 1466 if outLen < sizeOfInt32 { 1467 return nil, syserr.ErrInvalidArgument 1468 } 1469 1470 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveOriginalDstAddress())) 1471 return &v, nil 1472 1473 case linux.IPV6_RECVPKTINFO: 1474 if outLen < sizeOfInt32 { 1475 return nil, syserr.ErrInvalidArgument 1476 } 1477 1478 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetIPv6ReceivePacketInfo())) 1479 return &v, nil 1480 1481 case linux.IP6T_ORIGINAL_DST: 1482 if outLen < sockAddrInet6Size { 1483 return nil, syserr.ErrInvalidArgument 1484 } 1485 1486 var v tcpip.OriginalDestinationOption 1487 if err := ep.GetSockOpt(&v); err != nil { 1488 return nil, syserr.TranslateNetstackError(err) 1489 } 1490 1491 a, _ := socket.ConvertAddress(linux.AF_INET6, tcpip.FullAddress(v)) 1492 return a.(*linux.SockAddrInet6), nil 1493 1494 case linux.IP6T_SO_GET_INFO: 1495 if outLen < linux.SizeOfIPTGetinfo { 1496 return nil, syserr.ErrInvalidArgument 1497 } 1498 1499 // Only valid for raw IPv6 sockets. 
1500 if skType != linux.SOCK_RAW { 1501 return nil, syserr.ErrProtocolNotAvailable 1502 } 1503 1504 stk := inet.StackFromContext(t) 1505 if stk == nil { 1506 return nil, syserr.ErrNoDevice 1507 } 1508 info, err := netfilter.GetInfo(t, stk.(*Stack).Stack, outPtr, true) 1509 if err != nil { 1510 return nil, err 1511 } 1512 return &info, nil 1513 1514 case linux.IP6T_SO_GET_ENTRIES: 1515 // IPTGetEntries is reused for IPv6. 1516 if outLen < linux.SizeOfIPTGetEntries { 1517 return nil, syserr.ErrInvalidArgument 1518 } 1519 // Only valid for raw IPv6 sockets. 1520 if skType != linux.SOCK_RAW { 1521 return nil, syserr.ErrProtocolNotAvailable 1522 } 1523 1524 stk := inet.StackFromContext(t) 1525 if stk == nil { 1526 return nil, syserr.ErrNoDevice 1527 } 1528 entries, err := netfilter.GetEntries6(t, stk.(*Stack).Stack, outPtr, outLen) 1529 if err != nil { 1530 return nil, err 1531 } 1532 return &entries, nil 1533 1534 case linux.IP6T_SO_GET_REVISION_TARGET: 1535 if outLen < linux.SizeOfXTGetRevision { 1536 return nil, syserr.ErrInvalidArgument 1537 } 1538 1539 // Only valid for raw IPv6 sockets. 1540 if skType != linux.SOCK_RAW { 1541 return nil, syserr.ErrProtocolNotAvailable 1542 } 1543 1544 stk := inet.StackFromContext(t) 1545 if stk == nil { 1546 return nil, syserr.ErrNoDevice 1547 } 1548 ret, err := netfilter.TargetRevision(t, outPtr, header.IPv6ProtocolNumber) 1549 if err != nil { 1550 return nil, err 1551 } 1552 return &ret, nil 1553 } 1554 return nil, syserr.ErrProtocolNotAvailable 1555 } 1556 1557 // getSockOptIP implements GetSockOpt when level is SOL_IP. 
1558 func getSockOptIP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, outPtr hostarch.Addr, outLen int, _ int) (marshal.Marshallable, *syserr.Error) { 1559 if _, ok := ep.(tcpip.Endpoint); !ok { 1560 log.Warningf("SOL_IP options not supported on endpoints other than tcpip.Endpoint: option = %d", name) 1561 return nil, syserr.ErrUnknownProtocolOption 1562 } 1563 1564 switch name { 1565 case linux.IP_TTL: 1566 if outLen < sizeOfInt32 { 1567 return nil, syserr.ErrInvalidArgument 1568 } 1569 1570 v, err := ep.GetSockOptInt(tcpip.IPv4TTLOption) 1571 if err != nil { 1572 return nil, syserr.TranslateNetstackError(err) 1573 } 1574 1575 // Fill in the default value, if needed. 1576 vP := primitive.Int32(v) 1577 if vP == 0 { 1578 vP, err = defaultTTL(t, header.IPv4ProtocolNumber) 1579 if err != nil { 1580 return nil, syserr.TranslateNetstackError(err) 1581 } 1582 } 1583 1584 return &vP, nil 1585 1586 case linux.IP_RECVTTL: 1587 if outLen < sizeOfInt32 { 1588 return nil, syserr.ErrInvalidArgument 1589 } 1590 1591 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTTL())) 1592 return &v, nil 1593 1594 case linux.IP_MULTICAST_TTL: 1595 if outLen < sizeOfInt32 { 1596 return nil, syserr.ErrInvalidArgument 1597 } 1598 1599 v, err := ep.GetSockOptInt(tcpip.MulticastTTLOption) 1600 if err != nil { 1601 return nil, syserr.TranslateNetstackError(err) 1602 } 1603 1604 vP := primitive.Int32(v) 1605 return &vP, nil 1606 1607 case linux.IP_MULTICAST_IF: 1608 if outLen < len(linux.InetAddr{}) { 1609 return nil, syserr.ErrInvalidArgument 1610 } 1611 1612 var v tcpip.MulticastInterfaceOption 1613 if err := ep.GetSockOpt(&v); err != nil { 1614 return nil, syserr.TranslateNetstackError(err) 1615 } 1616 1617 a, _ := socket.ConvertAddress(linux.AF_INET, tcpip.FullAddress{Addr: v.InterfaceAddr}) 1618 1619 return &a.(*linux.SockAddrInet).Addr, nil 1620 1621 case linux.IP_MULTICAST_LOOP: 1622 if outLen < sizeOfInt32 { 1623 return nil, syserr.ErrInvalidArgument 1624 } 1625 
1626 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetMulticastLoop())) 1627 return &v, nil 1628 1629 case linux.IP_TOS: 1630 // Length handling for parity with Linux. 1631 if outLen == 0 { 1632 var b primitive.ByteSlice 1633 return &b, nil 1634 } 1635 v, err := ep.GetSockOptInt(tcpip.IPv4TOSOption) 1636 if err != nil { 1637 return nil, syserr.TranslateNetstackError(err) 1638 } 1639 if outLen < sizeOfInt32 { 1640 vP := primitive.Uint8(v) 1641 return &vP, nil 1642 } 1643 vP := primitive.Int32(v) 1644 return &vP, nil 1645 1646 case linux.IP_RECVTOS: 1647 if outLen < sizeOfInt32 { 1648 return nil, syserr.ErrInvalidArgument 1649 } 1650 1651 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTOS())) 1652 return &v, nil 1653 1654 case linux.IP_RECVERR: 1655 if outLen < sizeOfInt32 { 1656 return nil, syserr.ErrInvalidArgument 1657 } 1658 1659 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetIPv4RecvError())) 1660 return &v, nil 1661 1662 case linux.IP_PKTINFO: 1663 if outLen < sizeOfInt32 { 1664 return nil, syserr.ErrInvalidArgument 1665 } 1666 1667 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceivePacketInfo())) 1668 return &v, nil 1669 1670 case linux.IP_HDRINCL: 1671 if outLen < sizeOfInt32 { 1672 return nil, syserr.ErrInvalidArgument 1673 } 1674 1675 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetHeaderIncluded())) 1676 return &v, nil 1677 1678 case linux.IP_RECVORIGDSTADDR: 1679 if outLen < sizeOfInt32 { 1680 return nil, syserr.ErrInvalidArgument 1681 } 1682 1683 v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveOriginalDstAddress())) 1684 return &v, nil 1685 1686 case linux.SO_ORIGINAL_DST: 1687 if outLen < sockAddrInetSize { 1688 return nil, syserr.ErrInvalidArgument 1689 } 1690 1691 var v tcpip.OriginalDestinationOption 1692 if err := ep.GetSockOpt(&v); err != nil { 1693 return nil, syserr.TranslateNetstackError(err) 1694 } 1695 1696 a, _ := socket.ConvertAddress(linux.AF_INET, tcpip.FullAddress(v)) 1697 
return a.(*linux.SockAddrInet), nil 1698 1699 case linux.IPT_SO_GET_INFO: 1700 if outLen < linux.SizeOfIPTGetinfo { 1701 return nil, syserr.ErrInvalidArgument 1702 } 1703 1704 // Only valid for raw IPv4 sockets. 1705 if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW { 1706 return nil, syserr.ErrProtocolNotAvailable 1707 } 1708 1709 stk := inet.StackFromContext(t) 1710 if stk == nil { 1711 return nil, syserr.ErrNoDevice 1712 } 1713 info, err := netfilter.GetInfo(t, stk.(*Stack).Stack, outPtr, false) 1714 if err != nil { 1715 return nil, err 1716 } 1717 return &info, nil 1718 1719 case linux.IPT_SO_GET_ENTRIES: 1720 if outLen < linux.SizeOfIPTGetEntries { 1721 return nil, syserr.ErrInvalidArgument 1722 } 1723 1724 // Only valid for raw IPv4 sockets. 1725 if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW { 1726 return nil, syserr.ErrProtocolNotAvailable 1727 } 1728 1729 stk := inet.StackFromContext(t) 1730 if stk == nil { 1731 return nil, syserr.ErrNoDevice 1732 } 1733 entries, err := netfilter.GetEntries4(t, stk.(*Stack).Stack, outPtr, outLen) 1734 if err != nil { 1735 return nil, err 1736 } 1737 return &entries, nil 1738 1739 case linux.IPT_SO_GET_REVISION_TARGET: 1740 if outLen < linux.SizeOfXTGetRevision { 1741 return nil, syserr.ErrInvalidArgument 1742 } 1743 1744 // Only valid for raw IPv4 sockets. 1745 if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW { 1746 return nil, syserr.ErrProtocolNotAvailable 1747 } 1748 1749 stk := inet.StackFromContext(t) 1750 if stk == nil { 1751 return nil, syserr.ErrNoDevice 1752 } 1753 ret, err := netfilter.TargetRevision(t, outPtr, header.IPv4ProtocolNumber) 1754 if err != nil { 1755 return nil, err 1756 } 1757 return &ret, nil 1758 } 1759 return nil, syserr.ErrProtocolNotAvailable 1760 } 1761 1762 // SetSockOpt can be used to implement the linux syscall setsockopt(2) for 1763 // sockets backed by a commonEndpoint. 
1764 func SetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, level int, name int, optVal []byte) *syserr.Error { 1765 switch level { 1766 case linux.SOL_SOCKET: 1767 return setSockOptSocket(t, s, ep, name, optVal) 1768 1769 case linux.SOL_TCP: 1770 return setSockOptTCP(t, s, ep, name, optVal) 1771 1772 case linux.SOL_ICMPV6: 1773 return setSockOptICMPv6(t, s, ep, name, optVal) 1774 1775 case linux.SOL_IPV6: 1776 return setSockOptIPv6(t, s, ep, name, optVal) 1777 1778 case linux.SOL_IP: 1779 return setSockOptIP(t, s, ep, name, optVal) 1780 1781 case linux.SOL_PACKET: 1782 // gVisor doesn't support any SOL_PACKET options just return not 1783 // supported. Returning nil here will result in tcpdump thinking AF_PACKET 1784 // features are supported and proceed to use them and break. 1785 return syserr.ErrProtocolNotAvailable 1786 1787 case linux.SOL_UDP, 1788 linux.SOL_RAW: 1789 // Not supported. 1790 } 1791 1792 return nil 1793 } 1794 1795 func clampBufSize(newSz, min, max int64, ignoreMax bool) int64 { 1796 // packetOverheadFactor is used to multiply the value provided by the user on 1797 // a setsockopt(2) for setting the send/receive buffer sizes sockets. 1798 const packetOverheadFactor = 2 1799 1800 if !ignoreMax && newSz > max { 1801 newSz = max 1802 } 1803 1804 if newSz < math.MaxInt32/packetOverheadFactor { 1805 newSz *= packetOverheadFactor 1806 if newSz < min { 1807 newSz = min 1808 } 1809 } else { 1810 newSz = math.MaxInt32 1811 } 1812 return newSz 1813 } 1814 1815 // setSockOptSocket implements SetSockOpt when level is SOL_SOCKET. 
1816 func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error { 1817 switch name { 1818 case linux.SO_SNDBUF: 1819 if len(optVal) < sizeOfInt32 { 1820 return syserr.ErrInvalidArgument 1821 } 1822 1823 v := hostarch.ByteOrder.Uint32(optVal) 1824 min, max := ep.SocketOptions().SendBufferLimits() 1825 clamped := clampBufSize(int64(v), min, max, false /* ignoreMax */) 1826 ep.SocketOptions().SetSendBufferSize(clamped, true /* notify */) 1827 return nil 1828 1829 case linux.SO_RCVBUF: 1830 if len(optVal) < sizeOfInt32 { 1831 return syserr.ErrInvalidArgument 1832 } 1833 1834 v := hostarch.ByteOrder.Uint32(optVal) 1835 min, max := ep.SocketOptions().ReceiveBufferLimits() 1836 clamped := clampBufSize(int64(v), min, max, false /* ignoreMax */) 1837 ep.SocketOptions().SetReceiveBufferSize(clamped, true /* notify */) 1838 return nil 1839 1840 case linux.SO_RCVBUFFORCE: 1841 if len(optVal) < sizeOfInt32 { 1842 return syserr.ErrInvalidArgument 1843 } 1844 1845 if creds := auth.CredentialsFromContext(t); !creds.HasCapability(linux.CAP_NET_ADMIN) { 1846 return syserr.ErrNotPermitted 1847 } 1848 1849 v := hostarch.ByteOrder.Uint32(optVal) 1850 min, max := ep.SocketOptions().ReceiveBufferLimits() 1851 clamped := clampBufSize(int64(v), min, max, true /* ignoreMax */) 1852 ep.SocketOptions().SetReceiveBufferSize(clamped, true /* notify */) 1853 return nil 1854 1855 case linux.SO_REUSEADDR: 1856 if len(optVal) < sizeOfInt32 { 1857 return syserr.ErrInvalidArgument 1858 } 1859 1860 v := hostarch.ByteOrder.Uint32(optVal) 1861 ep.SocketOptions().SetReuseAddress(v != 0) 1862 return nil 1863 1864 case linux.SO_REUSEPORT: 1865 if len(optVal) < sizeOfInt32 { 1866 return syserr.ErrInvalidArgument 1867 } 1868 1869 v := hostarch.ByteOrder.Uint32(optVal) 1870 ep.SocketOptions().SetReusePort(v != 0) 1871 return nil 1872 1873 case linux.SO_BINDTODEVICE: 1874 n := bytes.IndexByte(optVal, 0) 1875 if n == -1 { 1876 n = len(optVal) 1877 } 1878 
name := string(optVal[:n]) 1879 if name == "" { 1880 return syserr.TranslateNetstackError(ep.SocketOptions().SetBindToDevice(0)) 1881 } 1882 s := t.NetworkContext() 1883 if s == nil { 1884 return syserr.ErrNoDevice 1885 } 1886 for nicID, nic := range s.Interfaces() { 1887 if nic.Name == name { 1888 return syserr.TranslateNetstackError(ep.SocketOptions().SetBindToDevice(nicID)) 1889 } 1890 } 1891 return syserr.ErrUnknownDevice 1892 1893 case linux.SO_BROADCAST: 1894 if len(optVal) < sizeOfInt32 { 1895 return syserr.ErrInvalidArgument 1896 } 1897 1898 v := hostarch.ByteOrder.Uint32(optVal) 1899 ep.SocketOptions().SetBroadcast(v != 0) 1900 return nil 1901 1902 case linux.SO_PASSCRED: 1903 if len(optVal) < sizeOfInt32 { 1904 return syserr.ErrInvalidArgument 1905 } 1906 1907 v := hostarch.ByteOrder.Uint32(optVal) 1908 ep.SocketOptions().SetPassCred(v != 0) 1909 return nil 1910 1911 case linux.SO_KEEPALIVE: 1912 if len(optVal) < sizeOfInt32 { 1913 return syserr.ErrInvalidArgument 1914 } 1915 1916 v := hostarch.ByteOrder.Uint32(optVal) 1917 ep.SocketOptions().SetKeepAlive(v != 0) 1918 return nil 1919 1920 case linux.SO_SNDTIMEO: 1921 if len(optVal) < linux.SizeOfTimeval { 1922 return syserr.ErrInvalidArgument 1923 } 1924 1925 var v linux.Timeval 1926 v.UnmarshalBytes(optVal) 1927 if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) { 1928 return syserr.ErrDomain 1929 } 1930 s.SetSendTimeout(v.ToNsecCapped()) 1931 return nil 1932 1933 case linux.SO_RCVTIMEO: 1934 if len(optVal) < linux.SizeOfTimeval { 1935 return syserr.ErrInvalidArgument 1936 } 1937 1938 var v linux.Timeval 1939 v.UnmarshalBytes(optVal) 1940 if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) { 1941 return syserr.ErrDomain 1942 } 1943 s.SetRecvTimeout(v.ToNsecCapped()) 1944 return nil 1945 1946 case linux.SO_OOBINLINE: 1947 if len(optVal) < sizeOfInt32 { 1948 return syserr.ErrInvalidArgument 1949 } 1950 1951 v := hostarch.ByteOrder.Uint32(optVal) 1952 
ep.SocketOptions().SetOutOfBandInline(v != 0) 1953 return nil 1954 1955 case linux.SO_NO_CHECK: 1956 if len(optVal) < sizeOfInt32 { 1957 return syserr.ErrInvalidArgument 1958 } 1959 1960 v := hostarch.ByteOrder.Uint32(optVal) 1961 ep.SocketOptions().SetNoChecksum(v != 0) 1962 return nil 1963 1964 case linux.SO_LINGER: 1965 if len(optVal) < linux.SizeOfLinger { 1966 return syserr.ErrInvalidArgument 1967 } 1968 1969 var v linux.Linger 1970 v.UnmarshalBytes(optVal) 1971 1972 ep.SocketOptions().SetLinger(tcpip.LingerOption{ 1973 Enabled: v.OnOff != 0, 1974 Timeout: time.Second * time.Duration(v.Linger), 1975 }) 1976 return nil 1977 1978 case linux.SO_DETACH_FILTER: 1979 // optval is ignored. 1980 var v tcpip.SocketDetachFilterOption 1981 return syserr.TranslateNetstackError(ep.SetSockOpt(&v)) 1982 1983 // TODO(b/226603727): Add support for SO_RCVLOWAT option. For now, only 1984 // the unsupported syscall message is removed. 1985 case linux.SO_RCVLOWAT: 1986 if len(optVal) < sizeOfInt32 { 1987 return syserr.ErrInvalidArgument 1988 } 1989 1990 v := hostarch.ByteOrder.Uint32(optVal) 1991 ep.SocketOptions().SetRcvlowat(int32(v)) 1992 return nil 1993 } 1994 1995 return nil 1996 } 1997 1998 // setSockOptTCP implements SetSockOpt when level is SOL_TCP. 
1999 func setSockOptTCP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error { 2000 if !socket.IsTCP(s) { 2001 return syserr.ErrUnknownProtocolOption 2002 } 2003 2004 switch name { 2005 case linux.TCP_NODELAY: 2006 if len(optVal) < sizeOfInt32 { 2007 return syserr.ErrInvalidArgument 2008 } 2009 2010 v := hostarch.ByteOrder.Uint32(optVal) 2011 ep.SocketOptions().SetDelayOption(v == 0) 2012 return nil 2013 2014 case linux.TCP_CORK: 2015 if len(optVal) < sizeOfInt32 { 2016 return syserr.ErrInvalidArgument 2017 } 2018 2019 v := hostarch.ByteOrder.Uint32(optVal) 2020 ep.SocketOptions().SetCorkOption(v != 0) 2021 return nil 2022 2023 case linux.TCP_QUICKACK: 2024 if len(optVal) < sizeOfInt32 { 2025 return syserr.ErrInvalidArgument 2026 } 2027 2028 v := hostarch.ByteOrder.Uint32(optVal) 2029 ep.SocketOptions().SetQuickAck(v != 0) 2030 return nil 2031 2032 case linux.TCP_MAXSEG: 2033 if len(optVal) < sizeOfInt32 { 2034 return syserr.ErrInvalidArgument 2035 } 2036 2037 v := hostarch.ByteOrder.Uint32(optVal) 2038 return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MaxSegOption, int(v))) 2039 2040 case linux.TCP_KEEPIDLE: 2041 if len(optVal) < sizeOfInt32 { 2042 return syserr.ErrInvalidArgument 2043 } 2044 2045 v := hostarch.ByteOrder.Uint32(optVal) 2046 if v < 1 || v > linux.MAX_TCP_KEEPIDLE { 2047 return syserr.ErrInvalidArgument 2048 } 2049 opt := tcpip.KeepaliveIdleOption(time.Second * time.Duration(v)) 2050 return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) 2051 2052 case linux.TCP_KEEPINTVL: 2053 if len(optVal) < sizeOfInt32 { 2054 return syserr.ErrInvalidArgument 2055 } 2056 2057 v := hostarch.ByteOrder.Uint32(optVal) 2058 if v < 1 || v > linux.MAX_TCP_KEEPINTVL { 2059 return syserr.ErrInvalidArgument 2060 } 2061 opt := tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v)) 2062 return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) 2063 2064 case linux.TCP_KEEPCNT: 2065 if len(optVal) < sizeOfInt32 { 2066 
return syserr.ErrInvalidArgument 2067 } 2068 2069 v := hostarch.ByteOrder.Uint32(optVal) 2070 if v < 1 || v > linux.MAX_TCP_KEEPCNT { 2071 return syserr.ErrInvalidArgument 2072 } 2073 return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.KeepaliveCountOption, int(v))) 2074 2075 case linux.TCP_USER_TIMEOUT: 2076 if len(optVal) < sizeOfInt32 { 2077 return syserr.ErrInvalidArgument 2078 } 2079 2080 v := int32(hostarch.ByteOrder.Uint32(optVal)) 2081 if v < 0 { 2082 return syserr.ErrInvalidArgument 2083 } 2084 opt := tcpip.TCPUserTimeoutOption(time.Millisecond * time.Duration(v)) 2085 return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) 2086 2087 case linux.TCP_CONGESTION: 2088 v := tcpip.CongestionControlOption(optVal) 2089 if err := ep.SetSockOpt(&v); err != nil { 2090 return syserr.TranslateNetstackError(err) 2091 } 2092 return nil 2093 2094 case linux.TCP_LINGER2: 2095 if len(optVal) < sizeOfInt32 { 2096 return syserr.ErrInvalidArgument 2097 } 2098 2099 v := int32(hostarch.ByteOrder.Uint32(optVal)) 2100 opt := tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v)) 2101 return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) 2102 2103 case linux.TCP_DEFER_ACCEPT: 2104 if len(optVal) < sizeOfInt32 { 2105 return syserr.ErrInvalidArgument 2106 } 2107 v := int32(hostarch.ByteOrder.Uint32(optVal)) 2108 if v < 0 { 2109 v = 0 2110 } 2111 opt := tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v)) 2112 return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) 2113 2114 case linux.TCP_SYNCNT: 2115 if len(optVal) < sizeOfInt32 { 2116 return syserr.ErrInvalidArgument 2117 } 2118 v := hostarch.ByteOrder.Uint32(optVal) 2119 2120 return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPSynCountOption, int(v))) 2121 2122 case linux.TCP_WINDOW_CLAMP: 2123 if len(optVal) < sizeOfInt32 { 2124 return syserr.ErrInvalidArgument 2125 } 2126 v := hostarch.ByteOrder.Uint32(optVal) 2127 2128 return 
syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPWindowClampOption, int(v))) 2129 2130 case linux.TCP_REPAIR_OPTIONS: 2131 // Not supported. 2132 } 2133 2134 return nil 2135 } 2136 2137 func setSockOptICMPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error { 2138 if _, ok := ep.(tcpip.Endpoint); !ok { 2139 log.Warningf("SOL_ICMPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name) 2140 return syserr.ErrUnknownProtocolOption 2141 } 2142 2143 if family, _, _ := s.Type(); family != linux.AF_INET6 { 2144 return syserr.ErrUnknownProtocolOption 2145 } 2146 2147 switch name { 2148 case linux.ICMPV6_FILTER: 2149 var req linux.ICMP6Filter 2150 if len(optVal) < req.SizeBytes() { 2151 return syserr.ErrInvalidArgument 2152 } 2153 2154 req.UnmarshalUnsafe(optVal) 2155 return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.ICMPv6Filter{DenyType: req.Filter})) 2156 } 2157 2158 return nil 2159 } 2160 2161 // setSockOptIPv6 implements SetSockOpt when level is SOL_IPV6. 2162 func setSockOptIPv6(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error { 2163 if _, ok := ep.(tcpip.Endpoint); !ok { 2164 log.Warningf("SOL_IPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name) 2165 return syserr.ErrUnknownProtocolOption 2166 } 2167 2168 family, _, _ := s.Type() 2169 if family != linux.AF_INET6 { 2170 return syserr.ErrUnknownProtocolOption 2171 } 2172 2173 switch name { 2174 case linux.IPV6_CHECKSUM: 2175 if len(optVal) < sizeOfInt32 { 2176 return syserr.ErrInvalidArgument 2177 } 2178 2179 // int may not be 32-bits so we cast the uint32 to an int32 before casting 2180 // to an int. 
2181 return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6Checksum, int(int32(hostarch.ByteOrder.Uint32(optVal))))) 2182 2183 case linux.IPV6_V6ONLY: 2184 if len(optVal) < sizeOfInt32 { 2185 return syserr.ErrInvalidArgument 2186 } 2187 2188 if socket.IsTCP(s) && tcp.EndpointState(ep.State()) != tcp.StateInitial { 2189 return syserr.ErrInvalidEndpointState 2190 } else if socket.IsUDP(s) && transport.DatagramEndpointState(ep.State()) != transport.DatagramEndpointStateInitial { 2191 return syserr.ErrInvalidEndpointState 2192 } 2193 2194 v := hostarch.ByteOrder.Uint32(optVal) 2195 ep.SocketOptions().SetV6Only(v != 0) 2196 return nil 2197 2198 case linux.IPV6_ADD_MEMBERSHIP: 2199 req, err := copyInMulticastV6Request(optVal) 2200 if err != nil { 2201 return err 2202 } 2203 2204 return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.AddMembershipOption{ 2205 NIC: tcpip.NICID(req.InterfaceIndex), 2206 MulticastAddr: tcpip.AddrFrom16(req.MulticastAddr), 2207 })) 2208 2209 case linux.IPV6_DROP_MEMBERSHIP: 2210 req, err := copyInMulticastV6Request(optVal) 2211 if err != nil { 2212 return err 2213 } 2214 2215 return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.RemoveMembershipOption{ 2216 NIC: tcpip.NICID(req.InterfaceIndex), 2217 MulticastAddr: tcpip.AddrFrom16(req.MulticastAddr), 2218 })) 2219 2220 case linux.IPV6_IPSEC_POLICY, 2221 linux.IPV6_JOIN_ANYCAST, 2222 linux.IPV6_LEAVE_ANYCAST, 2223 // TODO(b/148887420): Add support for IPV6_PKTINFO. 2224 linux.IPV6_PKTINFO, 2225 linux.IPV6_ROUTER_ALERT, 2226 linux.IPV6_XFRM_POLICY, 2227 linux.MCAST_BLOCK_SOURCE, 2228 linux.MCAST_JOIN_GROUP, 2229 linux.MCAST_JOIN_SOURCE_GROUP, 2230 linux.MCAST_LEAVE_GROUP, 2231 linux.MCAST_LEAVE_SOURCE_GROUP, 2232 linux.MCAST_UNBLOCK_SOURCE: 2233 // Not supported. 
2234 2235 case linux.IPV6_RECVORIGDSTADDR: 2236 if len(optVal) < sizeOfInt32 { 2237 return syserr.ErrInvalidArgument 2238 } 2239 v := int32(hostarch.ByteOrder.Uint32(optVal)) 2240 2241 ep.SocketOptions().SetReceiveOriginalDstAddress(v != 0) 2242 return nil 2243 2244 case linux.IPV6_RECVPKTINFO: 2245 if len(optVal) < sizeOfInt32 { 2246 return syserr.ErrInvalidArgument 2247 } 2248 v := int32(hostarch.ByteOrder.Uint32(optVal)) 2249 2250 ep.SocketOptions().SetIPv6ReceivePacketInfo(v != 0) 2251 return nil 2252 2253 case linux.IPV6_UNICAST_HOPS: 2254 if len(optVal) < sizeOfInt32 { 2255 return syserr.ErrInvalidArgument 2256 } 2257 v := int32(hostarch.ByteOrder.Uint32(optVal)) 2258 if v < -1 || v > 255 { 2259 return syserr.ErrInvalidArgument 2260 } 2261 return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6HopLimitOption, int(v))) 2262 2263 case linux.IPV6_RECVHOPLIMIT: 2264 v, err := parseIntOrChar(optVal) 2265 if err != nil { 2266 return err 2267 } 2268 2269 ep.SocketOptions().SetReceiveHopLimit(v != 0) 2270 return nil 2271 2272 case linux.IPV6_TCLASS: 2273 if len(optVal) < sizeOfInt32 { 2274 return syserr.ErrInvalidArgument 2275 } 2276 v := int32(hostarch.ByteOrder.Uint32(optVal)) 2277 if v < -1 || v > 255 { 2278 return syserr.ErrInvalidArgument 2279 } 2280 if v == -1 { 2281 v = 0 2282 } 2283 return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6TrafficClassOption, int(v))) 2284 2285 case linux.IPV6_RECVTCLASS: 2286 v, err := parseIntOrChar(optVal) 2287 if err != nil { 2288 return err 2289 } 2290 2291 ep.SocketOptions().SetReceiveTClass(v != 0) 2292 return nil 2293 case linux.IPV6_RECVERR: 2294 if len(optVal) == 0 { 2295 return nil 2296 } 2297 v, err := parseIntOrChar(optVal) 2298 if err != nil { 2299 return err 2300 } 2301 ep.SocketOptions().SetIPv6RecvError(v != 0) 2302 return nil 2303 2304 case linux.IP6T_SO_SET_REPLACE: 2305 if len(optVal) < linux.SizeOfIP6TReplace { 2306 return syserr.ErrInvalidArgument 2307 } 2308 2309 // Only valid for raw 
IPv6 sockets. 2310 if !socket.IsRaw(s) { 2311 return syserr.ErrProtocolNotAvailable 2312 } 2313 2314 stk := inet.StackFromContext(t) 2315 if stk == nil { 2316 return syserr.ErrNoDevice 2317 } 2318 // Stack must be a netstack stack. 2319 return netfilter.SetEntries(t, stk.(*Stack).Stack, optVal, true) 2320 2321 case linux.IP6T_SO_SET_ADD_COUNTERS: 2322 log.Infof("IP6T_SO_SET_ADD_COUNTERS is not supported") 2323 return nil 2324 } 2325 2326 return nil 2327 } 2328 2329 var ( 2330 inetMulticastRequestSize = (*linux.InetMulticastRequest)(nil).SizeBytes() 2331 inetMulticastRequestWithNICSize = (*linux.InetMulticastRequestWithNIC)(nil).SizeBytes() 2332 inet6MulticastRequestSize = (*linux.Inet6MulticastRequest)(nil).SizeBytes() 2333 ) 2334 2335 // copyInMulticastRequest copies in a variable-size multicast request. The 2336 // kernel determines which structure was passed by its length. IP_MULTICAST_IF 2337 // supports ip_mreqn, ip_mreq and in_addr, while IP_ADD_MEMBERSHIP and 2338 // IP_DROP_MEMBERSHIP only support ip_mreqn and ip_mreq. To handle this, 2339 // allowAddr controls whether in_addr is accepted or rejected. 
2340 func copyInMulticastRequest(optVal []byte, allowAddr bool) (linux.InetMulticastRequestWithNIC, *syserr.Error) { 2341 if len(optVal) < len(linux.InetAddr{}) { 2342 return linux.InetMulticastRequestWithNIC{}, syserr.ErrInvalidArgument 2343 } 2344 2345 if len(optVal) < inetMulticastRequestSize { 2346 if !allowAddr { 2347 return linux.InetMulticastRequestWithNIC{}, syserr.ErrInvalidArgument 2348 } 2349 2350 var req linux.InetMulticastRequestWithNIC 2351 copy(req.InterfaceAddr[:], optVal) 2352 return req, nil 2353 } 2354 2355 if len(optVal) >= inetMulticastRequestWithNICSize { 2356 var req linux.InetMulticastRequestWithNIC 2357 req.UnmarshalUnsafe(optVal) 2358 return req, nil 2359 } 2360 2361 var req linux.InetMulticastRequestWithNIC 2362 req.InetMulticastRequest.UnmarshalUnsafe(optVal) 2363 return req, nil 2364 } 2365 2366 func copyInMulticastV6Request(optVal []byte) (linux.Inet6MulticastRequest, *syserr.Error) { 2367 if len(optVal) < inet6MulticastRequestSize { 2368 return linux.Inet6MulticastRequest{}, syserr.ErrInvalidArgument 2369 } 2370 2371 var req linux.Inet6MulticastRequest 2372 req.UnmarshalUnsafe(optVal) 2373 return req, nil 2374 } 2375 2376 // parseIntOrChar copies either a 32-bit int or an 8-bit uint out of buf. 2377 // 2378 // net/ipv4/ip_sockglue.c:do_ip_setsockopt does this for its socket options. 2379 func parseIntOrChar(buf []byte) (int32, *syserr.Error) { 2380 if len(buf) == 0 { 2381 return 0, syserr.ErrInvalidArgument 2382 } 2383 2384 if len(buf) >= sizeOfInt32 { 2385 return int32(hostarch.ByteOrder.Uint32(buf)), nil 2386 } 2387 2388 return int32(buf[0]), nil 2389 } 2390 2391 // setSockOptIP implements SetSockOpt when level is SOL_IP. 
2392 func setSockOptIP(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error { 2393 if _, ok := ep.(tcpip.Endpoint); !ok { 2394 log.Warningf("SOL_IP options not supported on endpoints other than tcpip.Endpoint: option = %d", name) 2395 return syserr.ErrUnknownProtocolOption 2396 } 2397 2398 switch name { 2399 case linux.IP_MULTICAST_TTL: 2400 v, err := parseIntOrChar(optVal) 2401 if err != nil { 2402 return err 2403 } 2404 2405 if v == -1 { 2406 // Linux translates -1 to 1. 2407 v = 1 2408 } 2409 if v < 0 || v > 255 { 2410 return syserr.ErrInvalidArgument 2411 } 2412 return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MulticastTTLOption, int(v))) 2413 2414 case linux.IP_ADD_MEMBERSHIP: 2415 req, err := copyInMulticastRequest(optVal, false /* allowAddr */) 2416 if err != nil { 2417 return err 2418 } 2419 2420 return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.AddMembershipOption{ 2421 NIC: tcpip.NICID(req.InterfaceIndex), 2422 // TODO(igudger): Change AddMembership to use the standard 2423 // any address representation. 2424 InterfaceAddr: tcpip.AddrFrom4(req.InterfaceAddr), 2425 MulticastAddr: tcpip.AddrFrom4(req.MulticastAddr), 2426 })) 2427 2428 case linux.IP_DROP_MEMBERSHIP: 2429 req, err := copyInMulticastRequest(optVal, false /* allowAddr */) 2430 if err != nil { 2431 return err 2432 } 2433 2434 return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.RemoveMembershipOption{ 2435 NIC: tcpip.NICID(req.InterfaceIndex), 2436 // TODO(igudger): Change DropMembership to use the standard 2437 // any address representation. 
2438 InterfaceAddr: tcpip.AddrFrom4(req.InterfaceAddr), 2439 MulticastAddr: tcpip.AddrFrom4(req.MulticastAddr), 2440 })) 2441 2442 case linux.IP_MULTICAST_IF: 2443 req, err := copyInMulticastRequest(optVal, true /* allowAddr */) 2444 if err != nil { 2445 return err 2446 } 2447 2448 return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.MulticastInterfaceOption{ 2449 NIC: tcpip.NICID(req.InterfaceIndex), 2450 InterfaceAddr: socket.BytesToIPAddress(req.InterfaceAddr[:]), 2451 })) 2452 2453 case linux.IP_MULTICAST_LOOP: 2454 v, err := parseIntOrChar(optVal) 2455 if err != nil { 2456 return err 2457 } 2458 2459 ep.SocketOptions().SetMulticastLoop(v != 0) 2460 return nil 2461 2462 case linux.MCAST_JOIN_GROUP: 2463 // FIXME(b/124219304): Implement MCAST_JOIN_GROUP. 2464 return syserr.ErrInvalidArgument 2465 2466 case linux.IP_TTL: 2467 v, err := parseIntOrChar(optVal) 2468 if err != nil { 2469 return err 2470 } 2471 2472 // -1 means default TTL. 2473 if v == -1 { 2474 v = 0 2475 } else if v < 1 || v > 255 { 2476 return syserr.ErrInvalidArgument 2477 } 2478 return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv4TTLOption, int(v))) 2479 2480 case linux.IP_RECVTTL: 2481 v, err := parseIntOrChar(optVal) 2482 if err != nil { 2483 return err 2484 } 2485 ep.SocketOptions().SetReceiveTTL(v != 0) 2486 return nil 2487 2488 case linux.IP_TOS: 2489 if len(optVal) == 0 { 2490 return nil 2491 } 2492 v, err := parseIntOrChar(optVal) 2493 if err != nil { 2494 return err 2495 } 2496 return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv4TOSOption, int(v))) 2497 2498 case linux.IP_RECVTOS: 2499 v, err := parseIntOrChar(optVal) 2500 if err != nil { 2501 return err 2502 } 2503 ep.SocketOptions().SetReceiveTOS(v != 0) 2504 return nil 2505 2506 case linux.IP_RECVERR: 2507 if len(optVal) == 0 { 2508 return nil 2509 } 2510 v, err := parseIntOrChar(optVal) 2511 if err != nil { 2512 return err 2513 } 2514 ep.SocketOptions().SetIPv4RecvError(v != 0) 2515 return nil 2516 2517 
case linux.IP_PKTINFO: 2518 if len(optVal) == 0 { 2519 return nil 2520 } 2521 v, err := parseIntOrChar(optVal) 2522 if err != nil { 2523 return err 2524 } 2525 ep.SocketOptions().SetReceivePacketInfo(v != 0) 2526 return nil 2527 2528 case linux.IP_HDRINCL: 2529 if len(optVal) == 0 { 2530 return nil 2531 } 2532 v, err := parseIntOrChar(optVal) 2533 if err != nil { 2534 return err 2535 } 2536 ep.SocketOptions().SetHeaderIncluded(v != 0) 2537 return nil 2538 2539 case linux.IP_RECVORIGDSTADDR: 2540 if len(optVal) == 0 { 2541 return nil 2542 } 2543 v, err := parseIntOrChar(optVal) 2544 if err != nil { 2545 return err 2546 } 2547 2548 ep.SocketOptions().SetReceiveOriginalDstAddress(v != 0) 2549 return nil 2550 2551 case linux.IPT_SO_SET_REPLACE: 2552 if len(optVal) < linux.SizeOfIPTReplace { 2553 return syserr.ErrInvalidArgument 2554 } 2555 2556 // Only valid for raw IPv4 sockets. 2557 if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW { 2558 return syserr.ErrProtocolNotAvailable 2559 } 2560 2561 stk := inet.StackFromContext(t) 2562 if stk == nil { 2563 return syserr.ErrNoDevice 2564 } 2565 // Stack must be a netstack stack. 
2566 return netfilter.SetEntries(t, stk.(*Stack).Stack, optVal, false) 2567 2568 case linux.IPT_SO_SET_ADD_COUNTERS: 2569 log.Infof("IPT_SO_SET_ADD_COUNTERS is not supported") 2570 return nil 2571 2572 case linux.IP_ADD_SOURCE_MEMBERSHIP, 2573 linux.IP_BIND_ADDRESS_NO_PORT, 2574 linux.IP_BLOCK_SOURCE, 2575 linux.IP_CHECKSUM, 2576 linux.IP_DROP_SOURCE_MEMBERSHIP, 2577 linux.IP_FREEBIND, 2578 linux.IP_IPSEC_POLICY, 2579 linux.IP_MINTTL, 2580 linux.IP_MSFILTER, 2581 linux.IP_MTU_DISCOVER, 2582 linux.IP_MULTICAST_ALL, 2583 linux.IP_NODEFRAG, 2584 linux.IP_OPTIONS, 2585 linux.IP_PASSSEC, 2586 linux.IP_RECVFRAGSIZE, 2587 linux.IP_RECVOPTS, 2588 linux.IP_RETOPTS, 2589 linux.IP_TRANSPARENT, 2590 linux.IP_UNBLOCK_SOURCE, 2591 linux.IP_UNICAST_IF, 2592 linux.IP_XFRM_POLICY, 2593 linux.MCAST_BLOCK_SOURCE, 2594 linux.MCAST_JOIN_SOURCE_GROUP, 2595 linux.MCAST_LEAVE_GROUP, 2596 linux.MCAST_LEAVE_SOURCE_GROUP, 2597 linux.MCAST_MSFILTER, 2598 linux.MCAST_UNBLOCK_SOURCE: 2599 // Not supported. 2600 } 2601 2602 return nil 2603 } 2604 2605 // GetSockName implements the linux syscall getsockname(2) for sockets backed by 2606 // tcpip.Endpoint. 2607 func (s *sock) GetSockName(*kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { 2608 addr, err := s.Endpoint.GetLocalAddress() 2609 if err != nil { 2610 return nil, 0, syserr.TranslateNetstackError(err) 2611 } 2612 2613 a, l := socket.ConvertAddress(s.family, addr) 2614 return a, l, nil 2615 } 2616 2617 // GetPeerName implements the linux syscall getpeername(2) for sockets backed by 2618 // tcpip.Endpoint. 
2619 func (s *sock) GetPeerName(*kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { 2620 addr, err := s.Endpoint.GetRemoteAddress() 2621 if err != nil { 2622 return nil, 0, syserr.TranslateNetstackError(err) 2623 } 2624 2625 a, l := socket.ConvertAddress(s.family, addr) 2626 return a, l, nil 2627 } 2628 2629 func (s *sock) fillCmsgInq(cmsg *socket.ControlMessages) { 2630 if !s.sockOptInq { 2631 return 2632 } 2633 rcvBufUsed, err := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption) 2634 if err != nil { 2635 return 2636 } 2637 cmsg.IP.HasInq = true 2638 cmsg.IP.Inq = int32(rcvBufUsed) 2639 } 2640 2641 func toLinuxPacketType(pktType tcpip.PacketType) uint8 { 2642 switch pktType { 2643 case tcpip.PacketHost: 2644 return linux.PACKET_HOST 2645 case tcpip.PacketOtherHost: 2646 return linux.PACKET_OTHERHOST 2647 case tcpip.PacketOutgoing: 2648 return linux.PACKET_OUTGOING 2649 case tcpip.PacketBroadcast: 2650 return linux.PACKET_BROADCAST 2651 case tcpip.PacketMulticast: 2652 return linux.PACKET_MULTICAST 2653 default: 2654 panic(fmt.Sprintf("unknown packet type: %d", pktType)) 2655 } 2656 } 2657 2658 // nonBlockingRead issues a non-blocking read. 2659 // 2660 // TODO(b/78348848): Support timestamps for stream sockets. 2661 func (s *sock) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { 2662 isPacket := s.isPacketBased() 2663 2664 readOptions := tcpip.ReadOptions{ 2665 Peek: peek, 2666 NeedRemoteAddr: senderRequested, 2667 NeedLinkPacketInfo: isPacket, 2668 } 2669 2670 // TCP sockets discard the data if MSG_TRUNC is set. 2671 // 2672 // This behavior is documented in man 7 tcp: 2673 // Since version 2.4, Linux supports the use of MSG_TRUNC in the flags 2674 // argument of recv(2) (and recvmsg(2)). This flag causes the received 2675 // bytes of data to be discarded, rather than passed back in a 2676 // caller-supplied buffer. 
2677 var w io.Writer 2678 if !isPacket && trunc { 2679 w = &tcpip.LimitedWriter{ 2680 W: ioutil.Discard, 2681 N: dst.NumBytes(), 2682 } 2683 } else { 2684 w = dst.Writer(ctx) 2685 } 2686 2687 s.readMu.Lock() 2688 defer s.readMu.Unlock() 2689 2690 res, err := s.Endpoint.Read(w, readOptions) 2691 if _, ok := err.(*tcpip.ErrBadBuffer); ok && dst.NumBytes() == 0 { 2692 err = nil 2693 } 2694 if err != nil { 2695 return 0, 0, nil, 0, socket.ControlMessages{}, syserr.TranslateNetstackError(err) 2696 } 2697 // Set the control message, even if 0 bytes were read. 2698 s.updateTimestamp(res.ControlMessages) 2699 2700 if isPacket { 2701 var addr linux.SockAddr 2702 var addrLen uint32 2703 if senderRequested { 2704 addr, addrLen = socket.ConvertAddress(s.family, res.RemoteAddr) 2705 switch v := addr.(type) { 2706 case *linux.SockAddrLink: 2707 v.Protocol = socket.Htons(uint16(res.LinkPacketInfo.Protocol)) 2708 v.PacketType = toLinuxPacketType(res.LinkPacketInfo.PktType) 2709 } 2710 } 2711 2712 msgLen := res.Count 2713 if trunc { 2714 msgLen = res.Total 2715 } 2716 2717 var flags int 2718 if res.Total > res.Count { 2719 flags |= linux.MSG_TRUNC 2720 } 2721 2722 return msgLen, flags, addr, addrLen, s.netstackToLinuxControlMessages(res.ControlMessages), nil 2723 } 2724 2725 if peek { 2726 // MSG_TRUNC with MSG_PEEK on a TCP socket returns the 2727 // amount that could be read, and does not write to buffer. 2728 if trunc { 2729 // TCP endpoint does not return the total bytes in buffer as numTotal. 2730 // We need to query it from socket option. 
2731 rql, err := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption) 2732 if err != nil { 2733 return 0, 0, nil, 0, socket.ControlMessages{}, syserr.TranslateNetstackError(err) 2734 } 2735 msgLen := int(dst.NumBytes()) 2736 if msgLen > rql { 2737 msgLen = rql 2738 } 2739 return msgLen, 0, nil, 0, socket.ControlMessages{}, nil 2740 } 2741 } else if n := res.Count; n != 0 { 2742 s.Endpoint.ModerateRecvBuf(n) 2743 } 2744 2745 cmsg := s.netstackToLinuxControlMessages(res.ControlMessages) 2746 s.fillCmsgInq(&cmsg) 2747 return res.Count, 0, nil, 0, cmsg, syserr.TranslateNetstackError(err) 2748 } 2749 2750 func (s *sock) netstackToLinuxControlMessages(cm tcpip.ReceivableControlMessages) socket.ControlMessages { 2751 readCM := socket.NewIPControlMessages(s.family, cm) 2752 return socket.ControlMessages{ 2753 IP: socket.IPControlMessages{ 2754 HasTimestamp: readCM.HasTimestamp && s.sockOptTimestamp, 2755 Timestamp: readCM.Timestamp, 2756 HasInq: readCM.HasInq, 2757 Inq: readCM.Inq, 2758 HasTOS: readCM.HasTOS, 2759 TOS: readCM.TOS, 2760 HasTClass: readCM.HasTClass, 2761 TClass: readCM.TClass, 2762 HasTTL: readCM.HasTTL, 2763 TTL: readCM.TTL, 2764 HasHopLimit: readCM.HasHopLimit, 2765 HopLimit: readCM.HopLimit, 2766 HasIPPacketInfo: readCM.HasIPPacketInfo, 2767 PacketInfo: readCM.PacketInfo, 2768 HasIPv6PacketInfo: readCM.HasIPv6PacketInfo, 2769 IPv6PacketInfo: readCM.IPv6PacketInfo, 2770 OriginalDstAddress: readCM.OriginalDstAddress, 2771 SockErr: readCM.SockErr, 2772 }, 2773 } 2774 } 2775 2776 func (s *sock) linuxToNetstackControlMessages(cm socket.ControlMessages) tcpip.SendableControlMessages { 2777 return tcpip.SendableControlMessages{ 2778 HasTTL: cm.IP.HasTTL, 2779 TTL: uint8(cm.IP.TTL), 2780 HasHopLimit: cm.IP.HasHopLimit, 2781 HopLimit: uint8(cm.IP.HopLimit), 2782 } 2783 } 2784 2785 // updateTimestamp sets the timestamp for SIOCGSTAMP. It should be called after 2786 // successfully writing packet data out to userspace. 
2787 // 2788 // Precondition: s.readMu must be locked. 2789 func (s *sock) updateTimestamp(cm tcpip.ReceivableControlMessages) { 2790 // Save the SIOCGSTAMP timestamp only if SO_TIMESTAMP is disabled. 2791 if !s.sockOptTimestamp { 2792 s.timestampValid = true 2793 s.timestamp = cm.Timestamp 2794 } 2795 } 2796 2797 // dequeueErr is analogous to net/core/skbuff.c:sock_dequeue_err_skb(). 2798 func (s *sock) dequeueErr() *tcpip.SockError { 2799 so := s.Endpoint.SocketOptions() 2800 err := so.DequeueErr() 2801 if err == nil { 2802 return nil 2803 } 2804 2805 // Update socket error to reflect ICMP errors in queue. 2806 if nextErr := so.PeekErr(); nextErr != nil && nextErr.Cause.Origin().IsICMPErr() { 2807 so.SetLastError(nextErr.Err) 2808 } else if err.Cause.Origin().IsICMPErr() { 2809 so.SetLastError(nil) 2810 } 2811 return err 2812 } 2813 2814 // addrFamilyFromNetProto returns the address family identifier for the given 2815 // network protocol. 2816 func addrFamilyFromNetProto(net tcpip.NetworkProtocolNumber) int { 2817 switch net { 2818 case header.IPv4ProtocolNumber: 2819 return linux.AF_INET 2820 case header.IPv6ProtocolNumber: 2821 return linux.AF_INET6 2822 default: 2823 panic(fmt.Sprintf("invalid net proto for addr family inference: %d", net)) 2824 } 2825 } 2826 2827 // recvErr handles MSG_ERRQUEUE for recvmsg(2). 2828 // This is analogous to net/ipv4/ip_sockglue.c:ip_recv_error(). 2829 func (s *sock) recvErr(t *kernel.Task, dst usermem.IOSequence) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { 2830 sockErr := s.dequeueErr() 2831 if sockErr == nil { 2832 return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain 2833 } 2834 if sockErr.Payload != nil { 2835 defer sockErr.Payload.Release() 2836 } 2837 2838 // The payload of the original packet that caused the error is passed as 2839 // normal data via msg_iovec. 
-- recvmsg(2) 2840 msgFlags := linux.MSG_ERRQUEUE 2841 if int(dst.NumBytes()) < sockErr.Payload.Size() { 2842 msgFlags |= linux.MSG_TRUNC 2843 } 2844 n, err := dst.CopyOut(t, sockErr.Payload.AsSlice()) 2845 2846 // The original destination address of the datagram that caused the error is 2847 // supplied via msg_name. -- recvmsg(2) 2848 dstAddr, dstAddrLen := socket.ConvertAddress(addrFamilyFromNetProto(sockErr.NetProto), sockErr.Dst) 2849 cmgs := socket.ControlMessages{IP: socket.NewIPControlMessages(s.family, tcpip.ReceivableControlMessages{SockErr: sockErr})} 2850 return n, msgFlags, dstAddr, dstAddrLen, cmgs, syserr.FromError(err) 2851 } 2852 2853 // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by 2854 // tcpip.Endpoint. 2855 func (s *sock) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, _ uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { 2856 if flags&linux.MSG_ERRQUEUE != 0 { 2857 return s.recvErr(t, dst) 2858 } 2859 2860 trunc := flags&linux.MSG_TRUNC != 0 2861 peek := flags&linux.MSG_PEEK != 0 2862 dontWait := flags&linux.MSG_DONTWAIT != 0 2863 waitAll := flags&linux.MSG_WAITALL != 0 2864 if senderRequested && !s.isPacketBased() { 2865 // Stream sockets ignore the sender address. 2866 senderRequested = false 2867 } 2868 n, msgFlags, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested) 2869 2870 if s.isPacketBased() && err == syserr.ErrClosedForReceive && flags&linux.MSG_DONTWAIT != 0 { 2871 // In this situation we should return EAGAIN. 2872 return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain 2873 } 2874 2875 if err != nil && (err != syserr.ErrWouldBlock || dontWait) { 2876 // Read failed and we should not retry. 
2877 return 0, 0, nil, 0, socket.ControlMessages{}, err 2878 } 2879 2880 if err == nil && (dontWait || !waitAll || s.isPacketBased() || int64(n) >= dst.NumBytes()) { 2881 // We got all the data we need. 2882 return 2883 } 2884 2885 // Don't overwrite any data we received. 2886 dst = dst.DropFirst(n) 2887 2888 // We'll have to block. Register for notifications and keep trying to 2889 // send all the data. 2890 e, ch := waiter.NewChannelEntry(waiter.ReadableEvents) 2891 s.EventRegister(&e) 2892 defer s.EventUnregister(&e) 2893 2894 for { 2895 var rn int 2896 rn, msgFlags, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested) 2897 n += rn 2898 if err != nil && err != syserr.ErrWouldBlock { 2899 // Always stop on errors other than would block as we generally 2900 // won't be able to get any more data. Eat the error if we got 2901 // any data. 2902 if n > 0 { 2903 err = nil 2904 } 2905 return 2906 } 2907 if err == nil && (s.isPacketBased() || !waitAll || int64(rn) >= dst.NumBytes()) { 2908 // We got all the data we need. 2909 return 2910 } 2911 dst = dst.DropFirst(rn) 2912 2913 if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { 2914 if n > 0 { 2915 return n, msgFlags, senderAddr, senderAddrLen, controlMessages, nil 2916 } 2917 if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { 2918 return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain 2919 } 2920 return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) 2921 } 2922 } 2923 } 2924 2925 // SendMsg implements the linux syscall sendmsg(2) for sockets backed by 2926 // tcpip.Endpoint. 2927 func (s *sock) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { 2928 // Reject Unix control messages. 
2929 if !controlMessages.Unix.Empty() { 2930 return 0, syserr.ErrInvalidArgument 2931 } 2932 2933 var addr *tcpip.FullAddress 2934 if len(to) > 0 { 2935 addrBuf, family, err := socket.AddressAndFamily(to) 2936 if err != nil { 2937 return 0, err 2938 } 2939 if !s.checkFamily(family, false /* exact */) { 2940 return 0, syserr.ErrInvalidArgument 2941 } 2942 addrBuf = s.mapFamily(addrBuf, family) 2943 2944 addr = &addrBuf 2945 } 2946 2947 opts := tcpip.WriteOptions{ 2948 To: addr, 2949 More: flags&linux.MSG_MORE != 0, 2950 EndOfRecord: flags&linux.MSG_EOR != 0, 2951 ControlMessages: s.linuxToNetstackControlMessages(controlMessages), 2952 } 2953 2954 r := src.Reader(t) 2955 var ( 2956 total int64 2957 entry waiter.Entry 2958 ch <-chan struct{} 2959 ) 2960 for { 2961 n, err := s.Endpoint.Write(r, opts) 2962 total += n 2963 if flags&linux.MSG_DONTWAIT != 0 { 2964 return int(total), syserr.TranslateNetstackError(err) 2965 } 2966 block := true 2967 switch err.(type) { 2968 case nil: 2969 block = total != src.NumBytes() 2970 case *tcpip.ErrWouldBlock: 2971 default: 2972 block = false 2973 } 2974 if block { 2975 if ch == nil { 2976 // We'll have to block. Register for notification and keep trying to 2977 // send all the data. 2978 entry, ch = waiter.NewChannelEntry(waiter.WritableEvents) 2979 s.EventRegister(&entry) 2980 defer s.EventUnregister(&entry) 2981 } else { 2982 // Don't wait immediately after registration in case more data 2983 // became available between when we last checked and when we setup 2984 // the notification. 2985 if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { 2986 if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { 2987 return int(total), syserr.ErrTryAgain 2988 } 2989 // handleIOError will consume errors from t.Block if needed. 2990 return int(total), syserr.FromError(err) 2991 } 2992 } 2993 continue 2994 } 2995 return int(total), syserr.TranslateNetstackError(err) 2996 } 2997 } 2998 2999 // Ioctl implements vfs.FileDescriptionImpl. 
3000 func (s *sock) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { 3001 t := kernel.TaskFromContext(ctx) 3002 if t == nil { 3003 panic("ioctl(2) may only be called from a task goroutine") 3004 } 3005 3006 // SIOCGSTAMP is implemented by netstack rather than all commonEndpoint 3007 // sockets. 3008 // TODO(b/78348848): Add a commonEndpoint method to support SIOCGSTAMP. 3009 switch args[1].Int() { 3010 case linux.SIOCGSTAMP: 3011 s.readMu.Lock() 3012 defer s.readMu.Unlock() 3013 if !s.timestampValid { 3014 return 0, linuxerr.ENOENT 3015 } 3016 3017 tv := linux.NsecToTimeval(s.timestamp.UnixNano()) 3018 _, err := tv.CopyOut(t, args[2].Pointer()) 3019 return 0, err 3020 3021 case linux.TIOCINQ: 3022 v, terr := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption) 3023 if terr != nil { 3024 return 0, syserr.TranslateNetstackError(terr).ToError() 3025 } 3026 3027 if v > math.MaxInt32 { 3028 v = math.MaxInt32 3029 } 3030 3031 // Copy result to userspace. 3032 vP := primitive.Int32(v) 3033 _, err := vP.CopyOut(t, args[2].Pointer()) 3034 return 0, err 3035 } 3036 3037 return Ioctl(ctx, s.Endpoint, uio, sysno, args) 3038 } 3039 3040 // Ioctl performs a socket ioctl. 
3041 func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { 3042 t := kernel.TaskFromContext(ctx) 3043 if t == nil { 3044 panic("ioctl(2) may only be called from a task goroutine") 3045 } 3046 3047 switch arg := int(args[1].Int()); arg { 3048 case linux.SIOCGIFFLAGS, 3049 linux.SIOCGIFADDR, 3050 linux.SIOCGIFBRDADDR, 3051 linux.SIOCGIFDSTADDR, 3052 linux.SIOCGIFHWADDR, 3053 linux.SIOCGIFINDEX, 3054 linux.SIOCGIFMAP, 3055 linux.SIOCGIFMETRIC, 3056 linux.SIOCGIFMTU, 3057 linux.SIOCGIFNAME, 3058 linux.SIOCGIFNETMASK, 3059 linux.SIOCGIFTXQLEN, 3060 linux.SIOCETHTOOL: 3061 3062 var ifr linux.IFReq 3063 if _, err := ifr.CopyIn(t, args[2].Pointer()); err != nil { 3064 return 0, err 3065 } 3066 if err := interfaceIoctl(ctx, io, arg, &ifr); err != nil { 3067 return 0, err.ToError() 3068 } 3069 _, err := ifr.CopyOut(t, args[2].Pointer()) 3070 return 0, err 3071 3072 case linux.SIOCGIFCONF: 3073 // Return a list of interface addresses or the buffer size 3074 // necessary to hold the list. 3075 var ifc linux.IFConf 3076 if _, err := ifc.CopyIn(t, args[2].Pointer()); err != nil { 3077 return 0, err 3078 } 3079 3080 if err := ifconfIoctl(ctx, t, io, &ifc); err != nil { 3081 return 0, err 3082 } 3083 3084 _, err := ifc.CopyOut(t, args[2].Pointer()) 3085 return 0, err 3086 3087 case linux.TIOCINQ: 3088 v, terr := ep.GetSockOptInt(tcpip.ReceiveQueueSizeOption) 3089 if terr != nil { 3090 return 0, syserr.TranslateNetstackError(terr).ToError() 3091 } 3092 3093 if v > math.MaxInt32 { 3094 v = math.MaxInt32 3095 } 3096 // Copy result to userspace. 
3097 vP := primitive.Int32(v) 3098 _, err := vP.CopyOut(t, args[2].Pointer()) 3099 return 0, err 3100 3101 case linux.TIOCOUTQ: 3102 v, terr := ep.GetSockOptInt(tcpip.SendQueueSizeOption) 3103 if terr != nil { 3104 return 0, syserr.TranslateNetstackError(terr).ToError() 3105 } 3106 3107 if v > math.MaxInt32 { 3108 v = math.MaxInt32 3109 } 3110 3111 // Copy result to userspace. 3112 vP := primitive.Int32(v) 3113 _, err := vP.CopyOut(t, args[2].Pointer()) 3114 return 0, err 3115 3116 case linux.SIOCGIFMEM, linux.SIOCGIFPFLAGS, linux.SIOCGMIIPHY, linux.SIOCGMIIREG: 3117 // Not supported. 3118 } 3119 3120 return 0, linuxerr.ENOTTY 3121 } 3122 3123 // interfaceIoctl implements interface requests. 3124 func interfaceIoctl(ctx context.Context, _ usermem.IO, arg int, ifr *linux.IFReq) *syserr.Error { 3125 var ( 3126 iface inet.Interface 3127 index int32 3128 found bool 3129 ) 3130 3131 // Find the relevant device. 3132 stk := inet.StackFromContext(ctx) 3133 if stk == nil { 3134 return syserr.ErrNoDevice 3135 } 3136 3137 // SIOCGIFNAME uses ifr.ifr_ifindex rather than ifr.ifr_name to 3138 // identify a device. 3139 if arg == linux.SIOCGIFNAME { 3140 // Gets the name of the interface given the interface index 3141 // stored in ifr_ifindex. 3142 index = int32(hostarch.ByteOrder.Uint32(ifr.Data[:4])) 3143 if iface, ok := stk.Interfaces()[index]; ok { 3144 ifr.SetName(iface.Name) 3145 return nil 3146 } 3147 return syserr.ErrNoDevice 3148 } 3149 3150 // Find the relevant device. 3151 for index, iface = range stk.Interfaces() { 3152 if iface.Name == ifr.Name() { 3153 found = true 3154 break 3155 } 3156 } 3157 if !found { 3158 return syserr.ErrNoDevice 3159 } 3160 3161 switch arg { 3162 case linux.SIOCGIFINDEX: 3163 // Copy out the index to the data. 3164 hostarch.ByteOrder.PutUint32(ifr.Data[:], uint32(index)) 3165 3166 case linux.SIOCGIFHWADDR: 3167 // Copy the hardware address out. 
3168 // 3169 // Refer: https://linux.die.net/man/7/netdevice 3170 // SIOCGIFHWADDR, SIOCSIFHWADDR 3171 // 3172 // Get or set the hardware address of a device using 3173 // ifr_hwaddr. The hardware address is specified in a struct 3174 // sockaddr. sa_family contains the ARPHRD_* device type, 3175 // sa_data the L2 hardware address starting from byte 0. Setting 3176 // the hardware address is a privileged operation. 3177 hostarch.ByteOrder.PutUint16(ifr.Data[:], iface.DeviceType) 3178 n := copy(ifr.Data[2:], iface.Addr) 3179 for i := 2 + n; i < len(ifr.Data); i++ { 3180 ifr.Data[i] = 0 // Clear padding. 3181 } 3182 3183 case linux.SIOCGIFFLAGS: 3184 f, err := interfaceStatusFlags(stk, iface.Name) 3185 if err != nil { 3186 return err 3187 } 3188 // Drop the flags that don't fit in the size that we need to return. This 3189 // matches Linux behavior. 3190 hostarch.ByteOrder.PutUint16(ifr.Data[:2], uint16(f)) 3191 3192 case linux.SIOCGIFADDR: 3193 // Copy the IPv4 address out. 3194 for _, addr := range stk.InterfaceAddrs()[index] { 3195 // This ioctl is only compatible with AF_INET addresses. 3196 if addr.Family != linux.AF_INET { 3197 continue 3198 } 3199 copy(ifr.Data[4:8], addr.Addr) 3200 break 3201 } 3202 3203 case linux.SIOCGIFMETRIC: 3204 // Gets the metric of the device. As per netdevice(7), this 3205 // always just sets ifr_metric to 0. 3206 hostarch.ByteOrder.PutUint32(ifr.Data[:4], 0) 3207 3208 case linux.SIOCGIFMTU: 3209 // Gets the MTU of the device. 3210 hostarch.ByteOrder.PutUint32(ifr.Data[:4], iface.MTU) 3211 3212 case linux.SIOCGIFMAP: 3213 // Gets the hardware parameters of the device. 3214 // TODO(gvisor.dev/issue/505): Implement. 3215 3216 case linux.SIOCGIFTXQLEN: 3217 // Gets the transmit queue length of the device. 3218 // TODO(gvisor.dev/issue/505): Implement. 3219 3220 case linux.SIOCGIFDSTADDR: 3221 // Gets the destination address of a point-to-point device. 3222 // TODO(gvisor.dev/issue/505): Implement. 
3223 3224 case linux.SIOCGIFBRDADDR: 3225 // Gets the broadcast address of a device. 3226 // TODO(gvisor.dev/issue/505): Implement. 3227 3228 case linux.SIOCGIFNETMASK: 3229 // Gets the network mask of a device. 3230 for _, addr := range stk.InterfaceAddrs()[index] { 3231 // This ioctl is only compatible with AF_INET addresses. 3232 if addr.Family != linux.AF_INET { 3233 continue 3234 } 3235 // Populate ifr.ifr_netmask (type sockaddr). 3236 hostarch.ByteOrder.PutUint16(ifr.Data[0:], uint16(linux.AF_INET)) 3237 hostarch.ByteOrder.PutUint16(ifr.Data[2:], 0) 3238 var mask uint32 = 0xffffffff << (32 - addr.PrefixLen) 3239 // Netmask is expected to be returned as a big endian 3240 // value. 3241 binary.BigEndian.PutUint32(ifr.Data[4:8], mask) 3242 break 3243 } 3244 3245 case linux.SIOCETHTOOL: 3246 // Stubbed out for now, Ideally we should implement the required 3247 // sub-commands for ETHTOOL 3248 // 3249 // See: 3250 // https://github.com/torvalds/linux/blob/aa0c9086b40c17a7ad94425b3b70dd1fdd7497bf/net/core/dev_ioctl.c 3251 return syserr.ErrEndpointOperation 3252 3253 default: 3254 // Not a valid call. 3255 return syserr.ErrInvalidArgument 3256 } 3257 3258 return nil 3259 } 3260 3261 // ifconfIoctl populates a struct ifconf for the SIOCGIFCONF ioctl. 3262 func ifconfIoctl(ctx context.Context, t *kernel.Task, _ usermem.IO, ifc *linux.IFConf) error { 3263 // If Ptr is NULL, return the necessary buffer size via Len. 3264 // Otherwise, write up to Len bytes starting at Ptr containing ifreq 3265 // structs. 3266 stk := inet.StackFromContext(ctx) 3267 if stk == nil { 3268 return syserr.ErrNoDevice.ToError() 3269 } 3270 3271 if ifc.Ptr == 0 { 3272 ifc.Len = int32(len(stk.Interfaces())) * int32(linux.SizeOfIFReq) 3273 return nil 3274 } 3275 3276 max := ifc.Len 3277 ifc.Len = 0 3278 for key, ifaceAddrs := range stk.InterfaceAddrs() { 3279 iface := stk.Interfaces()[key] 3280 for _, ifaceAddr := range ifaceAddrs { 3281 // Don't write past the end of the buffer. 
3282 if ifc.Len+int32(linux.SizeOfIFReq) > max { 3283 break 3284 } 3285 if ifaceAddr.Family != linux.AF_INET { 3286 continue 3287 } 3288 3289 // Populate ifr.ifr_addr. 3290 ifr := linux.IFReq{} 3291 ifr.SetName(iface.Name) 3292 hostarch.ByteOrder.PutUint16(ifr.Data[0:2], uint16(ifaceAddr.Family)) 3293 hostarch.ByteOrder.PutUint16(ifr.Data[2:4], 0) 3294 copy(ifr.Data[4:8], ifaceAddr.Addr[:4]) 3295 3296 // Copy the ifr to userspace. 3297 dst := uintptr(ifc.Ptr) + uintptr(ifc.Len) 3298 ifc.Len += int32(linux.SizeOfIFReq) 3299 if _, err := ifr.CopyOut(t, hostarch.Addr(dst)); err != nil { 3300 return err 3301 } 3302 } 3303 } 3304 return nil 3305 } 3306 3307 // interfaceStatusFlags returns status flags for an interface in the stack. 3308 // Flag values and meanings are described in greater detail in netdevice(7) in 3309 // the SIOCGIFFLAGS section. 3310 func interfaceStatusFlags(stack inet.Stack, name string) (uint32, *syserr.Error) { 3311 // We should only ever be passed a netstack.Stack. 3312 epstack, ok := stack.(*Stack) 3313 if !ok { 3314 return 0, errStackType 3315 } 3316 3317 // Find the NIC corresponding to this interface. 3318 for _, info := range epstack.Stack.NICInfo() { 3319 if info.Name == name { 3320 return nicStateFlagsToLinux(info.Flags), nil 3321 } 3322 } 3323 return 0, syserr.ErrNoDevice 3324 } 3325 3326 func nicStateFlagsToLinux(f stack.NICStateFlags) uint32 { 3327 var rv uint32 3328 if f.Up { 3329 rv |= linux.IFF_UP | linux.IFF_LOWER_UP 3330 } 3331 if f.Running { 3332 rv |= linux.IFF_RUNNING 3333 } 3334 if f.Promiscuous { 3335 rv |= linux.IFF_PROMISC 3336 } 3337 if f.Loopback { 3338 rv |= linux.IFF_LOOPBACK 3339 } 3340 return rv 3341 } 3342 3343 // State implements socket.Socket.State. State translates the internal state 3344 // returned by netstack to values defined by Linux. 3345 func (s *sock) State() uint32 { 3346 if s.family != linux.AF_INET && s.family != linux.AF_INET6 { 3347 // States not implemented for this socket's family. 
3348 return 0 3349 } 3350 3351 switch { 3352 case socket.IsTCP(s): 3353 // TCP socket. 3354 switch tcp.EndpointState(s.Endpoint.State()) { 3355 case tcp.StateEstablished: 3356 return linux.TCP_ESTABLISHED 3357 case tcp.StateSynSent: 3358 return linux.TCP_SYN_SENT 3359 case tcp.StateSynRecv: 3360 return linux.TCP_SYN_RECV 3361 case tcp.StateFinWait1: 3362 return linux.TCP_FIN_WAIT1 3363 case tcp.StateFinWait2: 3364 return linux.TCP_FIN_WAIT2 3365 case tcp.StateTimeWait: 3366 return linux.TCP_TIME_WAIT 3367 case tcp.StateClose, tcp.StateInitial, tcp.StateBound, tcp.StateConnecting, tcp.StateError: 3368 return linux.TCP_CLOSE 3369 case tcp.StateCloseWait: 3370 return linux.TCP_CLOSE_WAIT 3371 case tcp.StateLastAck: 3372 return linux.TCP_LAST_ACK 3373 case tcp.StateListen: 3374 return linux.TCP_LISTEN 3375 case tcp.StateClosing: 3376 return linux.TCP_CLOSING 3377 default: 3378 // Internal or unknown state. 3379 return 0 3380 } 3381 case socket.IsUDP(s): 3382 // UDP socket. 3383 switch transport.DatagramEndpointState(s.Endpoint.State()) { 3384 case transport.DatagramEndpointStateInitial, transport.DatagramEndpointStateBound, transport.DatagramEndpointStateClosed: 3385 return linux.TCP_CLOSE 3386 case transport.DatagramEndpointStateConnected: 3387 return linux.TCP_ESTABLISHED 3388 default: 3389 return 0 3390 } 3391 case socket.IsICMP(s): 3392 // TODO(b/112063468): Export states for ICMP sockets. 3393 case socket.IsRaw(s): 3394 // TODO(b/112063468): Export states for raw sockets. 3395 default: 3396 // Unknown transport protocol, how did we make this socket? 3397 log.Warningf("Unknown transport protocol for an existing socket: family=%v, type=%v, protocol=%v, internal type %v", s.family, s.skType, s.protocol, reflect.TypeOf(s.Endpoint).Elem()) 3398 return 0 3399 } 3400 3401 return 0 3402 } 3403 3404 // Type implements socket.Socket.Type. 
3405 func (s *sock) Type() (family int, skType linux.SockType, protocol int) { 3406 return s.family, s.skType, s.protocol 3407 } 3408 3409 // EventRegister implements waiter.Waitable. 3410 func (s *sock) EventRegister(e *waiter.Entry) error { 3411 s.Queue.EventRegister(e) 3412 return nil 3413 } 3414 3415 // EventUnregister implements waiter.Waitable.EventUnregister. 3416 func (s *sock) EventUnregister(e *waiter.Entry) { 3417 s.Queue.EventUnregister(e) 3418 }