// Source: inet.af/netstack@v0.0.0-20220214151720-7585b01ddccf/tcpip/network/ipv4/ipv4.go

// Copyright 2021 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package ipv4 contains the implementation of the ipv4 network protocol.
package ipv4

import (
	"fmt"
	"math"
	"reflect"
	"sync/atomic"
	"time"

	"inet.af/netstack/sync"
	"inet.af/netstack/tcpip"
	"inet.af/netstack/tcpip/buffer"
	"inet.af/netstack/tcpip/header"
	"inet.af/netstack/tcpip/header/parse"
	"inet.af/netstack/tcpip/network/hash"
	"inet.af/netstack/tcpip/network/internal/fragmentation"
	"inet.af/netstack/tcpip/network/internal/ip"
	"inet.af/netstack/tcpip/stack"
)

const (
	// ReassembleTimeout is the time a packet stays in the reassembly
	// system before being evicted.
	// As per RFC 791 section 3.2:
	//   The current recommendation for the initial timer setting is 15 seconds.
	//   This may be changed as experience with this protocol accumulates.
	//
	// Considering that it is an old recommendation, we use the same reassembly
	// timeout that linux defines, which is 30 seconds:
	// https://github.com/torvalds/linux/blob/47ec5303d73ea344e84f46660fff693c57641386/include/net/ip.h#L138
	ReassembleTimeout = 30 * time.Second

	// ProtocolNumber is the ipv4 protocol number.
	ProtocolNumber = header.IPv4ProtocolNumber

	// MaxTotalSize is the maximum size that can be encoded in the 16-bit
	// TotalLength field of the ipv4 header.
	MaxTotalSize = 0xffff

	// DefaultTTL is the default time-to-live value for this endpoint.
	DefaultTTL = 64

	// buckets is the number of identifier buckets.
	buckets = 2048

	// The size of a fragment block, in bytes, as per RFC 791 section 3.1,
	// page 14.
	fragmentblockSize = 8
)

// Values stored in endpoint.forwarding.
const (
	forwardingDisabled = 0
	forwardingEnabled  = 1
)

// ipv4BroadcastAddr is the IPv4 broadcast address together with its prefix. It
// is added to an endpoint when the endpoint is enabled and removed again when
// the endpoint is disabled.
var ipv4BroadcastAddr = header.IPv4Broadcast.WithPrefix()

// Compile-time checks that *endpoint implements the stack interfaces it is
// used as.
var _ stack.LinkResolvableNetworkEndpoint = (*endpoint)(nil)
var _ stack.ForwardingNetworkEndpoint = (*endpoint)(nil)
var _ stack.GroupAddressableEndpoint = (*endpoint)(nil)
var _ stack.AddressableEndpoint = (*endpoint)(nil)
var _ stack.NetworkEndpoint = (*endpoint)(nil)

// endpoint is the IPv4 network endpoint created by NewEndpoint for a NIC.
type endpoint struct {
	// nic is the network interface this endpoint was created for.
	nic stack.NetworkInterface
	// dispatcher receives packets that are delivered to the transport layer.
	dispatcher stack.TransportDispatcher
	// protocol is the IPv4 protocol instance that created this endpoint.
	protocol *protocol
	// stats holds the IP, ICMP and IGMP counters; NewEndpoint initializes them
	// from both the endpoint-local and the stack-wide stats.
	stats sharedStats

	// enabled is set to 1 when the endpoint is enabled and 0 when it is
	// disabled.
	//
	// Must be accessed using atomic operations.
	enabled uint32

	// forwarding is set to forwardingEnabled when the endpoint has forwarding
	// enabled and forwardingDisabled when it is disabled.
	//
	// Must be accessed using atomic operations.
	forwarding uint32

	// mu groups the lock with the address and IGMP state that this file
	// accesses while holding it.
	mu struct {
		sync.RWMutex

		addressableEndpointState stack.AddressableEndpointState
		igmp                     igmpState
	}
}

// HandleLinkResolutionFailure implements stack.LinkResolvableNetworkEndpoint.
//
// For a forwarded packet an ICMP host unreachable error is returned to the
// sender and the forwarding error counters are incremented. For any other
// packet, a copy of the packet is handed to handleControl as a destination
// host unreachable error.
func (e *endpoint) HandleLinkResolutionFailure(pkt *stack.PacketBuffer) {
	// If we are operating as a router, return an ICMP error to the original
	// packet's sender.
	if pkt.NetworkPacketInfo.IsForwardedPacket {
		// TODO(gvisor.dev/issue/6005): Propagate asynchronously generated ICMP
		// errors to local endpoints.
		e.protocol.returnError(&icmpReasonHostUnreachable{}, pkt)
		e.stats.ip.Forwarding.Errors.Increment()
		e.stats.ip.Forwarding.HostUnreachable.Increment()
		return
	}
	// handleControl expects the entire offending packet to be in the packet
	// buffer's data field.
	pkt = stack.NewPacketBuffer(stack.PacketBufferOptions{
		Data: buffer.NewVectorisedView(pkt.Size(), pkt.Views()),
	})
	// Release the reference on the packet buffer created above when done.
	defer pkt.DecRef()
	pkt.NICID = e.nic.ID()
	pkt.NetworkProtocolNumber = ProtocolNumber
	// Use the same control type as an ICMPv4 destination host unreachable error
	// since the host is considered unreachable if we cannot resolve the link
	// address to the next hop.
	e.handleControl(&icmpv4DestinationHostUnreachableSockError{}, pkt)
}

// NewEndpoint creates a new ipv4 endpoint.
//
// The endpoint's address and IGMP state and its stat counters are initialized,
// and the endpoint is registered with the protocol under the NIC's ID before
// it is returned.
func (p *protocol) NewEndpoint(nic stack.NetworkInterface, dispatcher stack.TransportDispatcher) stack.NetworkEndpoint {
	e := &endpoint{
		nic:        nic,
		dispatcher: dispatcher,
		protocol:   p,
	}
	e.mu.Lock()
	e.mu.addressableEndpointState.Init(e)
	e.mu.igmp.init(e)
	e.mu.Unlock()

	tcpip.InitStatCounters(reflect.ValueOf(&e.stats.localStats).Elem())

	// Tie each stat group to both the endpoint-local and the stack-wide stats.
	stackStats := p.stack.Stats()
	e.stats.ip.Init(&e.stats.localStats.IP, &stackStats.IP)
	e.stats.icmp.init(&e.stats.localStats.ICMP, &stackStats.ICMP.V4)
	e.stats.igmp.init(&e.stats.localStats.IGMP, &stackStats.IGMP)

	p.mu.Lock()
	p.mu.eps[nic.ID()] = e
	p.mu.Unlock()

	return e
}

// findEndpointWithAddress returns an endpoint registered with the protocol
// that has addr assigned, or nil if there is none. Temporary addresses are not
// considered.
func (p *protocol) findEndpointWithAddress(addr tcpip.Address) *endpoint {
	p.mu.RLock()
	defer p.mu.RUnlock()

	for _, e := range p.mu.eps {
		if addressEndpoint := e.AcquireAssignedAddress(addr, false /* allowTemp */, stack.NeverPrimaryEndpoint); addressEndpoint != nil {
			// Only the existence of the address matters; drop the reference.
			addressEndpoint.DecRef()
			return e
		}
	}

	return nil
}

// getEndpointForNIC returns the endpoint registered for the NIC with the given
// ID and whether such an endpoint exists.
func (p *protocol) getEndpointForNIC(id tcpip.NICID) (*endpoint, bool) {
	p.mu.RLock()
	defer p.mu.RUnlock()
	ep, ok := p.mu.eps[id]
	return ep, ok
}

// forgetEndpoint removes the endpoint registered for nicID, if any, from the
// protocol's endpoint map.
func (p *protocol) forgetEndpoint(nicID tcpip.NICID) {
	p.mu.Lock()
	defer p.mu.Unlock()
	delete(p.mu.eps, nicID)
}

// Forwarding implements stack.ForwardingNetworkEndpoint.
func (e *endpoint) Forwarding() bool {
	return atomic.LoadUint32(&e.forwarding) == forwardingEnabled
}

// setForwarding sets the forwarding status for the endpoint.
//
// Returns true if the forwarding status was updated.
func (e *endpoint) setForwarding(v bool) bool {
	forwarding := uint32(forwardingDisabled)
	if v {
		forwarding = forwardingEnabled
	}

	// The status was updated iff the previous value differs from the new one.
	return atomic.SwapUint32(&e.forwarding, forwarding) != forwarding
}

// SetForwarding implements stack.ForwardingNetworkEndpoint.
//
// When the forwarding status actually changes, the endpoint joins (when
// enabling) or leaves (when disabling) the all-routers multicast group.
func (e *endpoint) SetForwarding(forwarding bool) {
	e.mu.Lock()
	defer e.mu.Unlock()

	if !e.setForwarding(forwarding) {
		return
	}

	if forwarding {
		// There does not seem to be an RFC requirement for a node to join the all
		// routers multicast address but
		// https://www.iana.org/assignments/multicast-addresses/multicast-addresses.xhtml
		// specifies the address as a group for all routers on a subnet so we join
		// the group here.
		if err := e.joinGroupLocked(header.IPv4AllRoutersGroup); err != nil {
			// joinGroupLocked only returns an error if the group address is not a
			// valid IPv4 multicast address.
			panic(fmt.Sprintf("e.joinGroupLocked(%s): %s", header.IPv4AllRoutersGroup, err))
		}

		return
	}

	switch err := e.leaveGroupLocked(header.IPv4AllRoutersGroup).(type) {
	case nil:
	case *tcpip.ErrBadLocalAddress:
		// The endpoint may have already left the multicast group.
	default:
		panic(fmt.Sprintf("e.leaveGroupLocked(%s): %s", header.IPv4AllRoutersGroup, err))
	}
}

// Enable implements stack.NetworkEndpoint.
235 func (e *endpoint) Enable() tcpip.Error { 236 e.mu.Lock() 237 defer e.mu.Unlock() 238 239 // If the NIC is not enabled, the endpoint can't do anything meaningful so 240 // don't enable the endpoint. 241 if !e.nic.Enabled() { 242 return &tcpip.ErrNotPermitted{} 243 } 244 245 // If the endpoint is already enabled, there is nothing for it to do. 246 if !e.setEnabled(true) { 247 return nil 248 } 249 250 // Create an endpoint to receive broadcast packets on this interface. 251 ep, err := e.mu.addressableEndpointState.AddAndAcquirePermanentAddress(ipv4BroadcastAddr, stack.AddressProperties{PEB: stack.NeverPrimaryEndpoint}) 252 if err != nil { 253 return err 254 } 255 // We have no need for the address endpoint. 256 ep.DecRef() 257 258 // Groups may have been joined while the endpoint was disabled, or the 259 // endpoint may have left groups from the perspective of IGMP when the 260 // endpoint was disabled. Either way, we need to let routers know to 261 // send us multicast traffic. 262 e.mu.igmp.initializeAll() 263 264 // As per RFC 1122 section 3.3.7, all hosts should join the all-hosts 265 // multicast group. Note, the IANA calls the all-hosts multicast group the 266 // all-systems multicast group. 267 if err := e.joinGroupLocked(header.IPv4AllSystems); err != nil { 268 // joinGroupLocked only returns an error if the group address is not a valid 269 // IPv4 multicast address. 270 panic(fmt.Sprintf("e.joinGroupLocked(%s): %s", header.IPv4AllSystems, err)) 271 } 272 273 return nil 274 } 275 276 // Enabled implements stack.NetworkEndpoint. 277 func (e *endpoint) Enabled() bool { 278 return e.nic.Enabled() && e.isEnabled() 279 } 280 281 // isEnabled returns true if the endpoint is enabled, regardless of the 282 // enabled status of the NIC. 283 func (e *endpoint) isEnabled() bool { 284 return atomic.LoadUint32(&e.enabled) == 1 285 } 286 287 // setEnabled sets the enabled status for the endpoint. 288 // 289 // Returns true if the enabled status was updated. 
290 func (e *endpoint) setEnabled(v bool) bool { 291 if v { 292 return atomic.SwapUint32(&e.enabled, 1) == 0 293 } 294 return atomic.SwapUint32(&e.enabled, 0) == 1 295 } 296 297 // Disable implements stack.NetworkEndpoint. 298 func (e *endpoint) Disable() { 299 e.mu.Lock() 300 defer e.mu.Unlock() 301 e.disableLocked() 302 } 303 304 func (e *endpoint) disableLocked() { 305 if !e.isEnabled() { 306 return 307 } 308 309 // The endpoint may have already left the multicast group. 310 switch err := e.leaveGroupLocked(header.IPv4AllSystems).(type) { 311 case nil, *tcpip.ErrBadLocalAddress: 312 default: 313 panic(fmt.Sprintf("unexpected error when leaving group = %s: %s", header.IPv4AllSystems, err)) 314 } 315 316 // Leave groups from the perspective of IGMP so that routers know that 317 // we are no longer interested in the group. 318 e.mu.igmp.softLeaveAll() 319 320 // The address may have already been removed. 321 switch err := e.mu.addressableEndpointState.RemovePermanentAddress(ipv4BroadcastAddr.Address); err.(type) { 322 case nil, *tcpip.ErrBadLocalAddress: 323 default: 324 panic(fmt.Sprintf("unexpected error when removing address = %s: %s", ipv4BroadcastAddr.Address, err)) 325 } 326 327 // Reset the IGMP V1 present flag. 328 // 329 // If the node comes back up on the same network, it will re-learn that it 330 // needs to perform IGMPv1. 331 e.mu.igmp.resetV1Present() 332 333 if !e.setEnabled(false) { 334 panic("should have only done work to disable the endpoint if it was enabled") 335 } 336 } 337 338 // DefaultTTL is the default time-to-live value for this endpoint. 339 func (e *endpoint) DefaultTTL() uint8 { 340 return e.protocol.DefaultTTL() 341 } 342 343 // MTU implements stack.NetworkEndpoint. It returns the link-layer MTU minus the 344 // network layer max header length. 
345 func (e *endpoint) MTU() uint32 { 346 networkMTU, err := calculateNetworkMTU(e.nic.MTU(), header.IPv4MinimumSize) 347 if err != nil { 348 return 0 349 } 350 return networkMTU 351 } 352 353 // MaxHeaderLength returns the maximum length needed by ipv4 headers (and 354 // underlying protocols). 355 func (e *endpoint) MaxHeaderLength() uint16 { 356 return e.nic.MaxHeaderLength() + header.IPv4MaximumHeaderSize 357 } 358 359 // NetworkProtocolNumber implements stack.NetworkEndpoint. 360 func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber { 361 return e.protocol.Number() 362 } 363 364 func (e *endpoint) addIPHeader(srcAddr, dstAddr tcpip.Address, pkt *stack.PacketBuffer, params stack.NetworkHeaderParams, options header.IPv4OptionsSerializer) tcpip.Error { 365 hdrLen := header.IPv4MinimumSize 366 var optLen int 367 if options != nil { 368 optLen = int(options.Length()) 369 } 370 hdrLen += optLen 371 if hdrLen > header.IPv4MaximumHeaderSize { 372 return &tcpip.ErrMessageTooLong{} 373 } 374 ipH := header.IPv4(pkt.NetworkHeader().Push(hdrLen)) 375 length := pkt.Size() 376 if length > math.MaxUint16 { 377 return &tcpip.ErrMessageTooLong{} 378 } 379 // RFC 6864 section 4.3 mandates uniqueness of ID values for non-atomic 380 // datagrams. Since the DF bit is never being set here, all datagrams 381 // are non-atomic and need an ID. 382 id := atomic.AddUint32(&e.protocol.ids[hashRoute(srcAddr, dstAddr, params.Protocol, e.protocol.hashIV)%buckets], 1) 383 ipH.Encode(&header.IPv4Fields{ 384 TotalLength: uint16(length), 385 ID: uint16(id), 386 TTL: params.TTL, 387 TOS: params.TOS, 388 Protocol: uint8(params.Protocol), 389 SrcAddr: srcAddr, 390 DstAddr: dstAddr, 391 Options: options, 392 }) 393 ipH.SetChecksum(^ipH.CalculateChecksum()) 394 pkt.NetworkProtocolNumber = ProtocolNumber 395 return nil 396 } 397 398 // handleFragments fragments pkt and calls the handler function on each 399 // fragment. 
It returns the number of fragments handled and the number of 400 // fragments left to be processed. The IP header must already be present in the 401 // original packet. 402 func (e *endpoint) handleFragments(_ *stack.Route, networkMTU uint32, pkt *stack.PacketBuffer, handler func(*stack.PacketBuffer) tcpip.Error) (int, int, tcpip.Error) { 403 // Round the MTU down to align to 8 bytes. 404 fragmentPayloadSize := networkMTU &^ 7 405 networkHeader := header.IPv4(pkt.NetworkHeader().View()) 406 pf := fragmentation.MakePacketFragmenter(pkt, fragmentPayloadSize, pkt.AvailableHeaderBytes()+len(networkHeader)) 407 408 var n int 409 for { 410 fragPkt, more := buildNextFragment(&pf, networkHeader) 411 if err := handler(fragPkt); err != nil { 412 return n, pf.RemainingFragmentCount() + 1, err 413 } 414 n++ 415 if !more { 416 return n, pf.RemainingFragmentCount(), nil 417 } 418 } 419 } 420 421 // WritePacket writes a packet to the given destination address and protocol. 422 func (e *endpoint) WritePacket(r *stack.Route, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) tcpip.Error { 423 if err := e.addIPHeader(r.LocalAddress(), r.RemoteAddress(), pkt, params, nil /* options */); err != nil { 424 return err 425 } 426 427 // iptables filtering. All packets that reach here are locally 428 // generated. 429 outNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) 430 if ok := e.protocol.stack.IPTables().CheckOutput(pkt, r, outNicName); !ok { 431 // iptables is telling us to drop the packet. 432 e.stats.ip.IPTablesOutputDropped.Increment() 433 return nil 434 } 435 436 // If the packet is manipulated as per NAT Output rules, handle packet 437 // based on destination address and do not send the packet to link 438 // layer. 439 // 440 // We should do this for every packet, rather than only NATted packets, but 441 // removing this check short circuits broadcasts before they are sent out to 442 // other hosts. 
443 if pkt.DNATDone { 444 netHeader := header.IPv4(pkt.NetworkHeader().View()) 445 if ep := e.protocol.findEndpointWithAddress(netHeader.DestinationAddress()); ep != nil { 446 // Since we rewrote the packet but it is being routed back to us, we 447 // can safely assume the checksum is valid. 448 ep.handleLocalPacket(pkt, true /* canSkipRXChecksum */) 449 return nil 450 } 451 } 452 453 return e.writePacket(r, pkt, false /* headerIncluded */) 454 } 455 456 func (e *endpoint) writePacket(r *stack.Route, pkt *stack.PacketBuffer, headerIncluded bool) tcpip.Error { 457 if r.Loop()&stack.PacketLoop != 0 { 458 // If the packet was generated by the stack (not a raw/packet endpoint 459 // where a packet may be written with the header included), then we can 460 // safely assume the checksum is valid. 461 e.handleLocalPacket(pkt, !headerIncluded /* canSkipRXChecksum */) 462 } 463 if r.Loop()&stack.PacketOut == 0 { 464 return nil 465 } 466 467 // Postrouting NAT can only change the source address, and does not alter the 468 // route or outgoing interface of the packet. 469 outNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) 470 if ok := e.protocol.stack.IPTables().CheckPostrouting(pkt, r, e, outNicName); !ok { 471 // iptables is telling us to drop the packet. 472 e.stats.ip.IPTablesPostroutingDropped.Increment() 473 return nil 474 } 475 476 stats := e.stats.ip 477 478 networkMTU, err := calculateNetworkMTU(e.nic.MTU(), uint32(pkt.NetworkHeader().View().Size())) 479 if err != nil { 480 stats.OutgoingPacketErrors.Increment() 481 return err 482 } 483 484 if packetMustBeFragmented(pkt, networkMTU) { 485 h := header.IPv4(pkt.NetworkHeader().View()) 486 if h.Flags()&header.IPv4FlagDontFragment != 0 && pkt.NetworkPacketInfo.IsForwardedPacket { 487 // TODO(gvisor.dev/issue/5919): Handle error condition in which DontFragment 488 // is set but the packet must be fragmented for the non-forwarding case. 
489 return &tcpip.ErrMessageTooLong{} 490 } 491 sent, remain, err := e.handleFragments(r, networkMTU, pkt, func(fragPkt *stack.PacketBuffer) tcpip.Error { 492 // TODO(gvisor.dev/issue/3884): Evaluate whether we want to send each 493 // fragment one by one using WritePacket() (current strategy) or if we 494 // want to create a PacketBufferList from the fragments and feed it to 495 // WritePackets(). It'll be faster but cost more memory. 496 return e.nic.WritePacket(r, ProtocolNumber, fragPkt) 497 }) 498 stats.PacketsSent.IncrementBy(uint64(sent)) 499 stats.OutgoingPacketErrors.IncrementBy(uint64(remain)) 500 return err 501 } 502 503 if err := e.nic.WritePacket(r, ProtocolNumber, pkt); err != nil { 504 stats.OutgoingPacketErrors.Increment() 505 return err 506 } 507 stats.PacketsSent.Increment() 508 return nil 509 } 510 511 // WritePackets implements stack.NetworkEndpoint. 512 func (e *endpoint) WritePackets(r *stack.Route, pkts stack.PacketBufferList, params stack.NetworkHeaderParams) (int, tcpip.Error) { 513 if r.Loop()&stack.PacketLoop != 0 { 514 panic("multiple packets in local loop") 515 } 516 if r.Loop()&stack.PacketOut == 0 { 517 return pkts.Len(), nil 518 } 519 520 stats := e.stats.ip 521 522 for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { 523 if err := e.addIPHeader(r.LocalAddress(), r.RemoteAddress(), pkt, params, nil /* options */); err != nil { 524 return 0, err 525 } 526 527 networkMTU, err := calculateNetworkMTU(e.nic.MTU(), uint32(pkt.NetworkHeader().View().Size())) 528 if err != nil { 529 stats.OutgoingPacketErrors.IncrementBy(uint64(pkts.Len())) 530 return 0, err 531 } 532 533 if packetMustBeFragmented(pkt, networkMTU) { 534 // Keep track of the packet that is about to be fragmented so it can be 535 // removed once the fragmentation is done. 
536 originalPkt := pkt 537 if _, _, err := e.handleFragments(r, networkMTU, pkt, func(fragPkt *stack.PacketBuffer) tcpip.Error { 538 fragPkt.IncRef() 539 // Modify the packet list in place with the new fragments. 540 pkts.InsertAfter(pkt, fragPkt) 541 pkt = fragPkt 542 return nil 543 }); err != nil { 544 panic(fmt.Sprintf("e.handleFragments(_, _, %d, _, _) = %s", networkMTU, err)) 545 } 546 // Remove the packet that was just fragmented and process the rest. 547 pkts.Remove(originalPkt) 548 } 549 } 550 551 outNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) 552 // iptables filtering. All packets that reach here are locally 553 // generated. 554 outputDropped, natPkts := e.protocol.stack.IPTables().CheckOutputPackets(pkts, r, outNicName) 555 stats.IPTablesOutputDropped.IncrementBy(uint64(len(outputDropped))) 556 for pkt := range outputDropped { 557 pkts.Remove(pkt) 558 } 559 560 // The NAT-ed packets may now be destined for us. 561 locallyDelivered := 0 562 for pkt := range natPkts { 563 ep := e.protocol.findEndpointWithAddress(header.IPv4(pkt.NetworkHeader().View()).DestinationAddress()) 564 if ep == nil { 565 // The NAT-ed packet is still destined for some remote node. 566 continue 567 } 568 569 // Do not send the locally destined packet out the NIC. 570 pkts.Remove(pkt) 571 572 // Deliver the packet locally. 573 ep.handleLocalPacket(pkt, true /* canSkipRXChecksum */) 574 locallyDelivered++ 575 576 } 577 578 // We ignore the list of NAT-ed packets here because Postrouting NAT can only 579 // change the source address, and does not alter the route or outgoing 580 // interface of the packet. 581 postroutingDropped, _ := e.protocol.stack.IPTables().CheckPostroutingPackets(pkts, r, e, outNicName) 582 stats.IPTablesPostroutingDropped.IncrementBy(uint64(len(postroutingDropped))) 583 for pkt := range postroutingDropped { 584 pkts.Remove(pkt) 585 } 586 587 // The rest of the packets can be delivered to the NIC as a batch. 
588 pktsLen := pkts.Len() 589 written, err := e.nic.WritePackets(r, pkts, ProtocolNumber) 590 stats.PacketsSent.IncrementBy(uint64(written)) 591 stats.OutgoingPacketErrors.IncrementBy(uint64(pktsLen - written)) 592 593 // Dropped packets aren't errors, so include them in the return value. 594 return locallyDelivered + written + len(outputDropped) + len(postroutingDropped), err 595 } 596 597 // WriteHeaderIncludedPacket implements stack.NetworkEndpoint. 598 func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) tcpip.Error { 599 // The packet already has an IP header, but there are a few required 600 // checks. 601 h, ok := pkt.Data().PullUp(header.IPv4MinimumSize) 602 if !ok { 603 return &tcpip.ErrMalformedHeader{} 604 } 605 606 hdrLen := header.IPv4(h).HeaderLength() 607 if hdrLen < header.IPv4MinimumSize { 608 return &tcpip.ErrMalformedHeader{} 609 } 610 611 h, ok = pkt.Data().PullUp(int(hdrLen)) 612 if !ok { 613 return &tcpip.ErrMalformedHeader{} 614 } 615 ipH := header.IPv4(h) 616 617 // Always set the total length. 618 pktSize := pkt.Data().Size() 619 ipH.SetTotalLength(uint16(pktSize)) 620 621 // Set the source address when zero. 622 if ipH.SourceAddress() == header.IPv4Any { 623 ipH.SetSourceAddress(r.LocalAddress()) 624 } 625 626 // Set the packet ID when zero. 627 if ipH.ID() == 0 { 628 // RFC 6864 section 4.3 mandates uniqueness of ID values for 629 // non-atomic datagrams, so assign an ID to all such datagrams 630 // according to the definition given in RFC 6864 section 4. 631 if ipH.Flags()&header.IPv4FlagDontFragment == 0 || ipH.Flags()&header.IPv4FlagMoreFragments != 0 || ipH.FragmentOffset() > 0 { 632 ipH.SetID(uint16(atomic.AddUint32(&e.protocol.ids[hashRoute(r.LocalAddress(), r.RemoteAddress(), 0 /* protocol */, e.protocol.hashIV)%buckets], 1))) 633 } 634 } 635 636 // Always set the checksum. 
637 ipH.SetChecksum(0) 638 ipH.SetChecksum(^ipH.CalculateChecksum()) 639 640 // Populate the packet buffer's network header and don't allow an invalid 641 // packet to be sent. 642 // 643 // Note that parsing only makes sure that the packet is well formed as per the 644 // wire format. We also want to check if the header's fields are valid before 645 // sending the packet. 646 if !parse.IPv4(pkt) || !header.IPv4(pkt.NetworkHeader().View()).IsValid(pktSize) { 647 return &tcpip.ErrMalformedHeader{} 648 } 649 650 return e.writePacket(r, pkt, true /* headerIncluded */) 651 } 652 653 // forwardPacket attempts to forward a packet to its final destination. 654 func (e *endpoint) forwardPacket(pkt *stack.PacketBuffer) ip.ForwardingError { 655 h := header.IPv4(pkt.NetworkHeader().View()) 656 657 dstAddr := h.DestinationAddress() 658 // As per RFC 3927 section 7, 659 // 660 // A router MUST NOT forward a packet with an IPv4 Link-Local source or 661 // destination address, irrespective of the router's default route 662 // configuration or routes obtained from dynamic routing protocols. 663 // 664 // A router which receives a packet with an IPv4 Link-Local source or 665 // destination address MUST NOT forward the packet. This prevents 666 // forwarding of packets back onto the network segment from which they 667 // originated, or to any other segment. 668 if header.IsV4LinkLocalUnicastAddress(h.SourceAddress()) { 669 return &ip.ErrLinkLocalSourceAddress{} 670 } 671 if header.IsV4LinkLocalUnicastAddress(dstAddr) || header.IsV4LinkLocalMulticastAddress(dstAddr) { 672 return &ip.ErrLinkLocalDestinationAddress{} 673 } 674 675 ttl := h.TTL() 676 if ttl == 0 { 677 // As per RFC 792 page 6, Time Exceeded Message, 678 // 679 // If the gateway processing a datagram finds the time to live field 680 // is zero it must discard the datagram. The gateway may also notify 681 // the source host via the time exceeded message. 
682 // 683 // We return the original error rather than the result of returning 684 // the ICMP packet because the original error is more relevant to 685 // the caller. 686 _ = e.protocol.returnError(&icmpReasonTTLExceeded{}, pkt) 687 return &ip.ErrTTLExceeded{} 688 } 689 690 if opts := h.Options(); len(opts) != 0 { 691 newOpts, _, optProblem := e.processIPOptions(pkt, opts, &optionUsageForward{}) 692 if optProblem != nil { 693 if optProblem.NeedICMP { 694 _ = e.protocol.returnError(&icmpReasonParamProblem{ 695 pointer: optProblem.Pointer, 696 forwarding: true, 697 }, pkt) 698 } 699 return &ip.ErrParameterProblem{} 700 } 701 copied := copy(opts, newOpts) 702 if copied != len(newOpts) { 703 panic(fmt.Sprintf("copied %d bytes of new options, expected %d bytes", copied, len(newOpts))) 704 } 705 // Since in forwarding we handle all options, including copying those we 706 // do not recognise, the options region should remain the same size which 707 // simplifies processing. As we MAY receive a packet with a lot of padded 708 // bytes after the "end of options list" byte, make sure we copy 709 // them as the legal padding value (0). 710 for i := copied; i < len(opts); i++ { 711 // Pad with 0 (EOL). RFC 791 page 23 says "The padding is zero". 712 opts[i] = byte(header.IPv4OptionListEndType) 713 } 714 } 715 716 stk := e.protocol.stack 717 718 // Check if the destination is owned by the stack. 719 if ep := e.protocol.findEndpointWithAddress(dstAddr); ep != nil { 720 inNicName := stk.FindNICNameFromID(e.nic.ID()) 721 outNicName := stk.FindNICNameFromID(ep.nic.ID()) 722 if ok := stk.IPTables().CheckForward(pkt, inNicName, outNicName); !ok { 723 // iptables is telling us to drop the packet. 724 e.stats.ip.IPTablesForwardDropped.Increment() 725 return nil 726 } 727 728 // The packet originally arrived on e so provide its NIC as the input NIC. 
729 ep.handleValidatedPacket(h, pkt, e.nic.Name() /* inNICName */) 730 return nil 731 } 732 733 r, err := stk.FindRoute(0, "", dstAddr, ProtocolNumber, false /* multicastLoop */) 734 switch err.(type) { 735 case nil: 736 case *tcpip.ErrNoRoute, *tcpip.ErrNetworkUnreachable: 737 // We return the original error rather than the result of returning 738 // the ICMP packet because the original error is more relevant to 739 // the caller. 740 _ = e.protocol.returnError(&icmpReasonNetworkUnreachable{}, pkt) 741 return &ip.ErrNoRoute{} 742 default: 743 return &ip.ErrOther{Err: err} 744 } 745 defer r.Release() 746 747 inNicName := stk.FindNICNameFromID(e.nic.ID()) 748 outNicName := stk.FindNICNameFromID(r.NICID()) 749 if ok := stk.IPTables().CheckForward(pkt, inNicName, outNicName); !ok { 750 // iptables is telling us to drop the packet. 751 e.stats.ip.IPTablesForwardDropped.Increment() 752 return nil 753 } 754 755 // We need to do a deep copy of the IP packet because 756 // WriteHeaderIncludedPacket may modify the packet buffer, but we do 757 // not own it. 758 newPkt := pkt.DeepCopyForForwarding(int(r.MaxHeaderLength())) 759 newHdr := header.IPv4(newPkt.NetworkHeader().View()) 760 defer newPkt.DecRef() 761 762 // As per RFC 791 page 30, Time to Live, 763 // 764 // This field must be decreased at each point that the internet header 765 // is processed to reflect the time spent processing the datagram. 766 // Even if no local information is available on the time actually 767 // spent, the field must be decremented by 1. 768 newHdr.SetTTL(ttl - 1) 769 // We perform a full checksum as we may have updated options above. The IP 770 // header is relatively small so this is not expected to be an expensive 771 // operation. 772 newHdr.SetChecksum(0) 773 newHdr.SetChecksum(^newHdr.CalculateChecksum()) 774 775 forwardToEp, ok := e.protocol.getEndpointForNIC(r.NICID()) 776 if !ok { 777 // The interface was removed after we obtained the route. 
778 return &ip.ErrOther{Err: &tcpip.ErrUnknownDevice{}} 779 } 780 781 switch err := forwardToEp.writePacket(r, newPkt, true /* headerIncluded */); err.(type) { 782 case nil: 783 return nil 784 case *tcpip.ErrMessageTooLong: 785 // As per RFC 792, page 4, Destination Unreachable: 786 // 787 // Another case is when a datagram must be fragmented to be forwarded by a 788 // gateway yet the Don't Fragment flag is on. In this case the gateway must 789 // discard the datagram and may return a destination unreachable message. 790 // 791 // WriteHeaderIncludedPacket checks for the presence of the Don't Fragment bit 792 // while sending the packet and returns this error iff fragmentation is 793 // necessary and the bit is also set. 794 _ = e.protocol.returnError(&icmpReasonFragmentationNeeded{}, pkt) 795 return &ip.ErrMessageTooLong{} 796 default: 797 return &ip.ErrOther{Err: err} 798 } 799 } 800 801 // HandlePacket is called by the link layer when new ipv4 packets arrive for 802 // this endpoint. 
803 func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) { 804 stats := e.stats.ip 805 806 stats.PacketsReceived.Increment() 807 808 if !e.isEnabled() { 809 stats.DisabledPacketsReceived.Increment() 810 return 811 } 812 813 h, ok := e.protocol.parseAndValidate(pkt) 814 if !ok { 815 stats.MalformedPacketsReceived.Increment() 816 return 817 } 818 819 if !e.nic.IsLoopback() { 820 if !e.protocol.options.AllowExternalLoopbackTraffic { 821 if header.IsV4LoopbackAddress(h.SourceAddress()) { 822 stats.InvalidSourceAddressesReceived.Increment() 823 return 824 } 825 826 if header.IsV4LoopbackAddress(h.DestinationAddress()) { 827 stats.InvalidDestinationAddressesReceived.Increment() 828 return 829 } 830 } 831 832 if e.protocol.stack.HandleLocal() { 833 addressEndpoint := e.AcquireAssignedAddress(header.IPv4(pkt.NetworkHeader().View()).SourceAddress(), e.nic.Promiscuous(), stack.CanBePrimaryEndpoint) 834 if addressEndpoint != nil { 835 addressEndpoint.DecRef() 836 837 // The source address is one of our own, so we never should have gotten 838 // a packet like this unless HandleLocal is false or our NIC is the 839 // loopback interface. 840 stats.InvalidSourceAddressesReceived.Increment() 841 return 842 } 843 } 844 845 // Loopback traffic skips the prerouting chain. 846 inNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) 847 if ok := e.protocol.stack.IPTables().CheckPrerouting(pkt, e, inNicName); !ok { 848 // iptables is telling us to drop the packet. 849 stats.IPTablesPreroutingDropped.Increment() 850 return 851 } 852 } 853 854 e.handleValidatedPacket(h, pkt, e.nic.Name() /* inNICName */) 855 } 856 857 // handleLocalPacket is like HandlePacket except it does not perform the 858 // prerouting iptables hook or check for loopback traffic that originated from 859 // outside of the netstack (i.e. martian loopback packets). 
func (e *endpoint) handleLocalPacket(pkt *stack.PacketBuffer, canSkipRXChecksum bool) {
	stats := e.stats.ip
	stats.PacketsReceived.Increment()

	// Work on an inbound clone so the caller's packet is left untouched; the
	// clone's reference is released when this function returns.
	pkt = pkt.CloneToInbound()
	defer pkt.DecRef()
	pkt.RXTransportChecksumValidated = canSkipRXChecksum

	h, ok := e.protocol.parseAndValidate(pkt)
	if !ok {
		stats.MalformedPacketsReceived.Increment()
		return
	}

	e.handleValidatedPacket(h, pkt, e.nic.Name() /* inNICName */)
}

// handleValidatedPacket processes a packet whose IPv4 header h has already
// been parsed and validated by the caller.
//
// In order, it: hands unfragmented packets to raw sockets, rejects invalid
// source addresses, forwards (or drops) packets that are not addressed to this
// endpoint, runs the iptables input hook using inNICName, reassembles
// fragments, processes IP options, and finally delivers the packet to ICMP,
// IGMP or the transport dispatcher.
func (e *endpoint) handleValidatedPacket(h header.IPv4, pkt *stack.PacketBuffer, inNICName string) {
	pkt.NICID = e.nic.ID()

	// Raw socket packets are delivered based solely on the transport protocol
	// number. We only require that the packet be valid IPv4, and that they not
	// be fragmented.
	if !h.More() && h.FragmentOffset() == 0 {
		e.dispatcher.DeliverRawPacket(h.TransportProtocol(), pkt)
	}

	stats := e.stats
	stats.ip.ValidPacketsReceived.Increment()

	srcAddr := h.SourceAddress()
	dstAddr := h.DestinationAddress()

	// As per RFC 1122 section 3.2.1.3:
	//   When a host sends any datagram, the IP source address MUST
	//   be one of its own IP addresses (but not a broadcast or
	//   multicast address).
	if srcAddr == header.IPv4Broadcast || header.IsV4MulticastAddress(srcAddr) {
		stats.ip.InvalidSourceAddressesReceived.Increment()
		return
	}
	// Make sure the source address is not a subnet-local broadcast address.
	if addressEndpoint := e.AcquireAssignedAddress(srcAddr, false /* createTemp */, stack.NeverPrimaryEndpoint); addressEndpoint != nil {
		subnet := addressEndpoint.Subnet()
		// The subnet was copied out above, so the reference can be dropped
		// before it is inspected.
		addressEndpoint.DecRef()
		if subnet.IsBroadcast(srcAddr) {
			stats.ip.InvalidSourceAddressesReceived.Increment()
			return
		}
	}

	// Before we do any processing, note if the packet was received as some
	// sort of broadcast. The destination address should be an address we own
	// or a group we joined.
	if addressEndpoint := e.AcquireAssignedAddress(dstAddr, e.nic.Promiscuous(), stack.CanBePrimaryEndpoint); addressEndpoint != nil {
		subnet := addressEndpoint.AddressWithPrefix().Subnet()
		addressEndpoint.DecRef()
		pkt.NetworkPacketInfo.LocalAddressBroadcast = subnet.IsBroadcast(dstAddr) || dstAddr == header.IPv4Broadcast
	} else if !e.IsInGroup(dstAddr) {
		// The packet is neither for one of our addresses nor for a group we
		// joined: forward it if forwarding is enabled, otherwise drop it.
		if !e.Forwarding() {
			stats.ip.InvalidDestinationAddressesReceived.Increment()
			return
		}
		switch err := e.forwardPacket(pkt); err.(type) {
		case nil:
			return
		case *ip.ErrLinkLocalSourceAddress:
			stats.ip.Forwarding.LinkLocalSource.Increment()
		case *ip.ErrLinkLocalDestinationAddress:
			stats.ip.Forwarding.LinkLocalDestination.Increment()
		case *ip.ErrTTLExceeded:
			stats.ip.Forwarding.ExhaustedTTL.Increment()
		case *ip.ErrNoRoute:
			stats.ip.Forwarding.Unrouteable.Increment()
		case *ip.ErrParameterProblem:
			stats.ip.MalformedPacketsReceived.Increment()
		case *ip.ErrMessageTooLong:
			stats.ip.Forwarding.PacketTooBig.Increment()
		default:
			panic(fmt.Sprintf("unexpected error %s while trying to forward packet: %#v", err, pkt))
		}
		// Every recognised forwarding failure is also counted in the
		// aggregate forwarding error counter.
		stats.ip.Forwarding.Errors.Increment()
		return
	}

	// iptables filtering. All packets that reach here are intended for
	// this machine and will not be forwarded.
	if ok := e.protocol.stack.IPTables().CheckInput(pkt, inNICName); !ok {
		// iptables is telling us to drop the packet.
		stats.ip.IPTablesInputDropped.Increment()
		return
	}

	if h.More() || h.FragmentOffset() != 0 {
		if pkt.Data().Size()+pkt.TransportHeader().View().Size() == 0 {
			// Drop the packet as it's marked as a fragment but has
			// no payload.
			stats.ip.MalformedPacketsReceived.Increment()
			stats.ip.MalformedFragmentsReceived.Increment()
			return
		}
		if opts := h.Options(); len(opts) != 0 {
			// If there are options we need to check them before we do assembly
			// or we could be assembling errant packets. However we do not change the
			// options as that could lead to double processing later.
			if _, _, optProblem := e.processIPOptions(pkt, opts, &optionUsageVerify{}); optProblem != nil {
				if optProblem.NeedICMP {
					// The result of sending the ICMP error is deliberately
					// discarded.
					_ = e.protocol.returnError(&icmpReasonParamProblem{
						pointer: optProblem.Pointer,
					}, pkt)
					e.stats.ip.MalformedPacketsReceived.Increment()
				}
				return
			}
		}
		// The packet is a fragment, let's try to reassemble it.
		start := h.FragmentOffset()
		// Drop the fragment if the size of the reassembled payload would exceed the
		// maximum payload size.
		//
		// Note that this addition doesn't overflow even on 32bit architecture
		// because pkt.Data().Size() should not exceed 65535 (the max IP datagram
		// size). Otherwise the packet would've been rejected as invalid before
		// reaching here.
		if int(start)+pkt.Data().Size() > header.IPv4MaximumPayloadSize {
			stats.ip.MalformedPacketsReceived.Increment()
			stats.ip.MalformedFragmentsReceived.Increment()
			return
		}

		proto := h.Protocol()
		resPkt, transProtoNum, ready, err := e.protocol.fragmentation.Process(
			// As per RFC 791 section 2.3, the identification value is unique
			// for a source-destination pair and protocol.
			fragmentation.FragmentID{
				Source:      h.SourceAddress(),
				Destination: h.DestinationAddress(),
				ID:          uint32(h.ID()),
				Protocol:    proto,
			},
			start,
			// Offset of the last payload byte carried by this fragment.
			start+uint16(pkt.Data().Size())-1,
			h.More(),
			proto,
			pkt,
		)
		if err != nil {
			stats.ip.MalformedPacketsReceived.Increment()
			stats.ip.MalformedFragmentsReceived.Increment()
			return
		}
		if !ready {
			// More fragments are needed before the packet can be delivered.
			return
		}
		// From here on, operate on the reassembled packet and its header.
		pkt = resPkt
		h = header.IPv4(pkt.NetworkHeader().View())

		// The reassembler doesn't take care of fixing up the header, so we need
		// to do it here.
		h.SetTotalLength(uint16(pkt.Data().Size() + len(h)))
		h.SetFlagsFragmentOffset(0, 0)

		e.protocol.parseTransport(pkt, tcpip.TransportProtocolNumber(transProtoNum))

		// Now that the packet is reassembled, it can be sent to raw sockets.
		e.dispatcher.DeliverRawPacket(h.TransportProtocol(), pkt)
	}
	stats.ip.PacketsDelivered.Increment()

	p := h.TransportProtocol()
	if p == header.ICMPv4ProtocolNumber {
		// TODO(gvisor.dev/issues/3810): when we sort out ICMP and transport
		// headers, the setting of the transport number here should be
		// unnecessary and removed.
		pkt.TransportProtocolNumber = p
		e.handleICMP(pkt)
		return
	}
	// ICMP handles options itself but do it here for all remaining destinations.
	var hasRouterAlertOption bool
	if opts := h.Options(); len(opts) != 0 {
		newOpts, processedOpts, optProblem := e.processIPOptions(pkt, opts, &optionUsageReceive{})
		if optProblem != nil {
			if optProblem.NeedICMP {
				// The result of sending the ICMP error is deliberately
				// discarded.
				_ = e.protocol.returnError(&icmpReasonParamProblem{
					pointer: optProblem.Pointer,
				}, pkt)
				stats.ip.MalformedPacketsReceived.Increment()
			}
			return
		}
		hasRouterAlertOption = processedOpts.routerAlert
		// Overwrite the options in the packet's header with the processed set.
		copied := copy(opts, newOpts)
		if copied != len(newOpts) {
			panic(fmt.Sprintf("copied %d bytes of new options, expected %d bytes", copied, len(newOpts)))
		}
		for i := copied; i < len(opts); i++ {
			// Pad with 0 (EOL). RFC 791 page 23 says "The padding is zero".
			opts[i] = byte(header.IPv4OptionListEndType)
		}
	}
	if p == header.IGMPProtocolNumber {
		// IGMP state lives under e.mu, so the handler runs with it held.
		e.mu.Lock()
		e.mu.igmp.handleIGMP(pkt, hasRouterAlertOption)
		e.mu.Unlock()
		return
	}

	switch res := e.dispatcher.DeliverTransportPacket(p, pkt); res {
	case stack.TransportPacketHandled:
	case stack.TransportPacketDestinationPortUnreachable:
		// As per RFC 1122 section 3.2.2.1, a host SHOULD generate Destination
		// Unreachable messages with code:
		//   3 (Port Unreachable), when the designated transport protocol
		//   (e.g., UDP) is unable to demultiplex the datagram but has no
		//   protocol mechanism to inform the sender.
		_ = e.protocol.returnError(&icmpReasonPortUnreachable{}, pkt)
	case stack.TransportPacketProtocolUnreachable:
		// As per RFC 1122 section 3.2.2.1, a host SHOULD generate Destination
		// Unreachable messages with code:
		//   2 (Protocol Unreachable), when the designated transport protocol
		//   is not supported.
		_ = e.protocol.returnError(&icmpReasonProtoUnreachable{}, pkt)
	default:
		panic(fmt.Sprintf("unrecognized result from DeliverTransportPacket = %d", res))
	}
}

// Close cleans up resources associated with the endpoint.
//
// The endpoint is disabled and its addressable state cleaned up while holding
// e.mu; the protocol is told to forget the endpoint after the lock is released.
func (e *endpoint) Close() {
	e.mu.Lock()
	e.disableLocked()
	e.mu.addressableEndpointState.Cleanup()
	e.mu.Unlock()

	e.protocol.forgetEndpoint(e.nic.ID())
}

// AddAndAcquirePermanentAddress implements stack.AddressableEndpoint.
//
// When the address is added successfully, any queued IGMP reports are sent.
func (e *endpoint) AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, properties stack.AddressProperties) (stack.AddressEndpoint, tcpip.Error) {
	e.mu.RLock()
	defer e.mu.RUnlock()

	ep, err := e.mu.addressableEndpointState.AddAndAcquirePermanentAddress(addr, properties)
	if err == nil {
		e.mu.igmp.sendQueuedReports()
	}
	return ep, err
}

// RemovePermanentAddress implements stack.AddressableEndpoint.
func (e *endpoint) RemovePermanentAddress(addr tcpip.Address) tcpip.Error {
	e.mu.RLock()
	defer e.mu.RUnlock()
	return e.mu.addressableEndpointState.RemovePermanentAddress(addr)
}

// MainAddress implements stack.AddressableEndpoint.
func (e *endpoint) MainAddress() tcpip.AddressWithPrefix {
	e.mu.RLock()
	defer e.mu.RUnlock()
	return e.mu.addressableEndpointState.MainAddress()
}

// AcquireAssignedAddress implements stack.AddressableEndpoint.
1125 func (e *endpoint) AcquireAssignedAddress(localAddr tcpip.Address, allowTemp bool, tempPEB stack.PrimaryEndpointBehavior) stack.AddressEndpoint { 1126 e.mu.RLock() 1127 defer e.mu.RUnlock() 1128 1129 loopback := e.nic.IsLoopback() 1130 return e.mu.addressableEndpointState.AcquireAssignedAddressOrMatching(localAddr, func(addressEndpoint stack.AddressEndpoint) bool { 1131 subnet := addressEndpoint.Subnet() 1132 // IPv4 has a notion of a subnet broadcast address and considers the 1133 // loopback interface bound to an address's whole subnet (on linux). 1134 return subnet.IsBroadcast(localAddr) || (loopback && subnet.Contains(localAddr)) 1135 }, allowTemp, tempPEB) 1136 } 1137 1138 // AcquireOutgoingPrimaryAddress implements stack.AddressableEndpoint. 1139 func (e *endpoint) AcquireOutgoingPrimaryAddress(remoteAddr tcpip.Address, allowExpired bool) stack.AddressEndpoint { 1140 e.mu.RLock() 1141 defer e.mu.RUnlock() 1142 return e.acquireOutgoingPrimaryAddressRLocked(remoteAddr, allowExpired) 1143 } 1144 1145 // acquireOutgoingPrimaryAddressRLocked is like AcquireOutgoingPrimaryAddress 1146 // but with locking requirements 1147 // 1148 // Precondition: igmp.ep.mu must be read locked. 1149 func (e *endpoint) acquireOutgoingPrimaryAddressRLocked(remoteAddr tcpip.Address, allowExpired bool) stack.AddressEndpoint { 1150 return e.mu.addressableEndpointState.AcquireOutgoingPrimaryAddress(remoteAddr, allowExpired) 1151 } 1152 1153 // PrimaryAddresses implements stack.AddressableEndpoint. 1154 func (e *endpoint) PrimaryAddresses() []tcpip.AddressWithPrefix { 1155 e.mu.RLock() 1156 defer e.mu.RUnlock() 1157 return e.mu.addressableEndpointState.PrimaryAddresses() 1158 } 1159 1160 // PermanentAddresses implements stack.AddressableEndpoint. 
1161 func (e *endpoint) PermanentAddresses() []tcpip.AddressWithPrefix { 1162 e.mu.RLock() 1163 defer e.mu.RUnlock() 1164 return e.mu.addressableEndpointState.PermanentAddresses() 1165 } 1166 1167 // JoinGroup implements stack.GroupAddressableEndpoint. 1168 func (e *endpoint) JoinGroup(addr tcpip.Address) tcpip.Error { 1169 e.mu.Lock() 1170 defer e.mu.Unlock() 1171 return e.joinGroupLocked(addr) 1172 } 1173 1174 // joinGroupLocked is like JoinGroup but with locking requirements. 1175 // 1176 // Precondition: e.mu must be locked. 1177 func (e *endpoint) joinGroupLocked(addr tcpip.Address) tcpip.Error { 1178 if !header.IsV4MulticastAddress(addr) { 1179 return &tcpip.ErrBadAddress{} 1180 } 1181 1182 e.mu.igmp.joinGroup(addr) 1183 return nil 1184 } 1185 1186 // LeaveGroup implements stack.GroupAddressableEndpoint. 1187 func (e *endpoint) LeaveGroup(addr tcpip.Address) tcpip.Error { 1188 e.mu.Lock() 1189 defer e.mu.Unlock() 1190 return e.leaveGroupLocked(addr) 1191 } 1192 1193 // leaveGroupLocked is like LeaveGroup but with locking requirements. 1194 // 1195 // Precondition: e.mu must be locked. 1196 func (e *endpoint) leaveGroupLocked(addr tcpip.Address) tcpip.Error { 1197 return e.mu.igmp.leaveGroup(addr) 1198 } 1199 1200 // IsInGroup implements stack.GroupAddressableEndpoint. 1201 func (e *endpoint) IsInGroup(addr tcpip.Address) bool { 1202 e.mu.RLock() 1203 defer e.mu.RUnlock() 1204 return e.mu.igmp.isInGroup(addr) 1205 } 1206 1207 // Stats implements stack.NetworkEndpoint. 1208 func (e *endpoint) Stats() stack.NetworkEndpointStats { 1209 return &e.stats.localStats 1210 } 1211 1212 var _ stack.NetworkProtocol = (*protocol)(nil) 1213 var _ fragmentation.TimeoutHandler = (*protocol)(nil) 1214 1215 type protocol struct { 1216 stack *stack.Stack 1217 1218 mu struct { 1219 sync.RWMutex 1220 1221 // eps is keyed by NICID to allow protocol methods to retrieve an endpoint 1222 // when handling a packet, by looking at which NIC handled the packet. 
1223 eps map[tcpip.NICID]*endpoint 1224 1225 // ICMP types for which the stack's global rate limiting must apply. 1226 icmpRateLimitedTypes map[header.ICMPv4Type]struct{} 1227 } 1228 1229 // defaultTTL is the current default TTL for the protocol. Only the 1230 // uint8 portion of it is meaningful. 1231 // 1232 // Must be accessed using atomic operations. 1233 defaultTTL uint32 1234 1235 ids []uint32 1236 hashIV uint32 1237 1238 fragmentation *fragmentation.Fragmentation 1239 1240 options Options 1241 } 1242 1243 // Number returns the ipv4 protocol number. 1244 func (p *protocol) Number() tcpip.NetworkProtocolNumber { 1245 return ProtocolNumber 1246 } 1247 1248 // MinimumPacketSize returns the minimum valid ipv4 packet size. 1249 func (p *protocol) MinimumPacketSize() int { 1250 return header.IPv4MinimumSize 1251 } 1252 1253 // ParseAddresses implements stack.NetworkProtocol. 1254 func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) { 1255 h := header.IPv4(v) 1256 return h.SourceAddress(), h.DestinationAddress() 1257 } 1258 1259 // SetOption implements stack.NetworkProtocol. 1260 func (p *protocol) SetOption(option tcpip.SettableNetworkProtocolOption) tcpip.Error { 1261 switch v := option.(type) { 1262 case *tcpip.DefaultTTLOption: 1263 p.SetDefaultTTL(uint8(*v)) 1264 return nil 1265 default: 1266 return &tcpip.ErrUnknownProtocolOption{} 1267 } 1268 } 1269 1270 // Option implements stack.NetworkProtocol. 1271 func (p *protocol) Option(option tcpip.GettableNetworkProtocolOption) tcpip.Error { 1272 switch v := option.(type) { 1273 case *tcpip.DefaultTTLOption: 1274 *v = tcpip.DefaultTTLOption(p.DefaultTTL()) 1275 return nil 1276 default: 1277 return &tcpip.ErrUnknownProtocolOption{} 1278 } 1279 } 1280 1281 // SetDefaultTTL sets the default TTL for endpoints created with this protocol. 
1282 func (p *protocol) SetDefaultTTL(ttl uint8) { 1283 atomic.StoreUint32(&p.defaultTTL, uint32(ttl)) 1284 } 1285 1286 // DefaultTTL returns the default TTL for endpoints created with this protocol. 1287 func (p *protocol) DefaultTTL() uint8 { 1288 return uint8(atomic.LoadUint32(&p.defaultTTL)) 1289 } 1290 1291 // Close implements stack.TransportProtocol. 1292 func (*protocol) Close() {} 1293 1294 // Wait implements stack.TransportProtocol. 1295 func (*protocol) Wait() {} 1296 1297 // parseAndValidate parses the packet (including its transport layer header) and 1298 // returns the parsed IP header. 1299 // 1300 // Returns true if the IP header was successfully parsed. 1301 func (p *protocol) parseAndValidate(pkt *stack.PacketBuffer) (header.IPv4, bool) { 1302 transProtoNum, hasTransportHdr, ok := p.Parse(pkt) 1303 if !ok { 1304 return nil, false 1305 } 1306 1307 h := header.IPv4(pkt.NetworkHeader().View()) 1308 // Do not include the link header's size when calculating the size of the IP 1309 // packet. 1310 if !h.IsValid(pkt.Size() - pkt.LinkHeader().View().Size()) { 1311 return nil, false 1312 } 1313 1314 if !h.IsChecksumValid() { 1315 return nil, false 1316 } 1317 1318 if hasTransportHdr { 1319 p.parseTransport(pkt, transProtoNum) 1320 } 1321 1322 return h, true 1323 } 1324 1325 func (p *protocol) parseTransport(pkt *stack.PacketBuffer, transProtoNum tcpip.TransportProtocolNumber) { 1326 if transProtoNum == header.ICMPv4ProtocolNumber { 1327 // The transport layer will handle transport layer parsing errors. 1328 _ = parse.ICMPv4(pkt) 1329 return 1330 } 1331 1332 switch err := p.stack.ParsePacketBufferTransport(transProtoNum, pkt); err { 1333 case stack.ParsedOK: 1334 case stack.UnknownTransportProtocol, stack.TransportLayerParseError: 1335 // The transport layer will handle unknown protocols and transport layer 1336 // parsing errors. 
1337 default: 1338 panic(fmt.Sprintf("unexpected error parsing transport header = %d", err)) 1339 } 1340 } 1341 1342 // Parse implements stack.NetworkProtocol. 1343 func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) { 1344 if ok := parse.IPv4(pkt); !ok { 1345 return 0, false, false 1346 } 1347 1348 ipHdr := header.IPv4(pkt.NetworkHeader().View()) 1349 return ipHdr.TransportProtocol(), !ipHdr.More() && ipHdr.FragmentOffset() == 0, true 1350 } 1351 1352 // allowICMPReply reports whether an ICMP reply with provided type and code may 1353 // be sent following the rate mask options and global ICMP rate limiter. 1354 func (p *protocol) allowICMPReply(icmpType header.ICMPv4Type, code header.ICMPv4Code) bool { 1355 // Mimic linux and never rate limit for PMTU discovery. 1356 // https://github.com/torvalds/linux/blob/9e9fb7655ed585da8f468e29221f0ba194a5f613/net/ipv4/icmp.c#L288 1357 if icmpType == header.ICMPv4DstUnreachable && code == header.ICMPv4FragmentationNeeded { 1358 return true 1359 } 1360 p.mu.RLock() 1361 defer p.mu.RUnlock() 1362 1363 if _, ok := p.mu.icmpRateLimitedTypes[icmpType]; ok { 1364 return p.stack.AllowICMPMessage() 1365 } 1366 return true 1367 } 1368 1369 // calculateNetworkMTU calculates the network-layer payload MTU based on the 1370 // link-layer payload mtu. 1371 func calculateNetworkMTU(linkMTU, networkHeaderSize uint32) (uint32, tcpip.Error) { 1372 if linkMTU < header.IPv4MinimumMTU { 1373 return 0, &tcpip.ErrInvalidEndpointState{} 1374 } 1375 1376 // As per RFC 791 section 3.1, an IPv4 header cannot exceed 60 bytes in 1377 // length: 1378 // The maximal internet header is 60 octets, and a typical internet header 1379 // is 20 octets, allowing a margin for headers of higher level protocols. 
1380 if networkHeaderSize > header.IPv4MaximumHeaderSize { 1381 return 0, &tcpip.ErrMalformedHeader{} 1382 } 1383 1384 networkMTU := linkMTU 1385 if networkMTU > MaxTotalSize { 1386 networkMTU = MaxTotalSize 1387 } 1388 1389 return networkMTU - networkHeaderSize, nil 1390 } 1391 1392 func packetMustBeFragmented(pkt *stack.PacketBuffer, networkMTU uint32) bool { 1393 payload := pkt.TransportHeader().View().Size() + pkt.Data().Size() 1394 return pkt.GSOOptions.Type == stack.GSONone && uint32(payload) > networkMTU 1395 } 1396 1397 // addressToUint32 translates an IPv4 address into its little endian uint32 1398 // representation. 1399 // 1400 // This function does the same thing as binary.LittleEndian.Uint32 but operates 1401 // on a tcpip.Address (a string) without the need to convert it to a byte slice, 1402 // which would cause an allocation. 1403 func addressToUint32(addr tcpip.Address) uint32 { 1404 _ = addr[3] // bounds check hint to compiler 1405 return uint32(addr[0]) | uint32(addr[1])<<8 | uint32(addr[2])<<16 | uint32(addr[3])<<24 1406 } 1407 1408 // hashRoute calculates a hash value for the given source/destination pair using 1409 // the addresses, transport protocol number and a 32-bit number to generate the 1410 // hash. 1411 func hashRoute(srcAddr, dstAddr tcpip.Address, protocol tcpip.TransportProtocolNumber, hashIV uint32) uint32 { 1412 a := addressToUint32(srcAddr) 1413 b := addressToUint32(dstAddr) 1414 return hash.Hash3Words(a, b, uint32(protocol), hashIV) 1415 } 1416 1417 // Options holds options to configure a new protocol. 1418 type Options struct { 1419 // IGMP holds options for IGMP. 1420 IGMP IGMPOptions 1421 1422 // AllowExternalLoopbackTraffic indicates that inbound loopback packets (i.e. 1423 // martian loopback packets) should be accepted. 1424 AllowExternalLoopbackTraffic bool 1425 } 1426 1427 // NewProtocolWithOptions returns an IPv4 network protocol. 
1428 func NewProtocolWithOptions(opts Options) stack.NetworkProtocolFactory { 1429 ids := make([]uint32, buckets) 1430 1431 // Randomly initialize hashIV and the ids. 1432 r := hash.RandN32(1 + buckets) 1433 for i := range ids { 1434 ids[i] = r[i] 1435 } 1436 hashIV := r[buckets] 1437 1438 return func(s *stack.Stack) stack.NetworkProtocol { 1439 p := &protocol{ 1440 stack: s, 1441 ids: ids, 1442 hashIV: hashIV, 1443 defaultTTL: DefaultTTL, 1444 options: opts, 1445 } 1446 p.fragmentation = fragmentation.NewFragmentation(fragmentblockSize, fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, ReassembleTimeout, s.Clock(), p) 1447 p.mu.eps = make(map[tcpip.NICID]*endpoint) 1448 // Set ICMP rate limiting to Linux defaults. 1449 // See https://man7.org/linux/man-pages/man7/icmp.7.html. 1450 p.mu.icmpRateLimitedTypes = map[header.ICMPv4Type]struct{}{ 1451 header.ICMPv4DstUnreachable: struct{}{}, 1452 header.ICMPv4SrcQuench: struct{}{}, 1453 header.ICMPv4TimeExceeded: struct{}{}, 1454 header.ICMPv4ParamProblem: struct{}{}, 1455 } 1456 return p 1457 } 1458 } 1459 1460 // NewProtocol is equivalent to NewProtocolWithOptions with an empty Options. 
1461 func NewProtocol(s *stack.Stack) stack.NetworkProtocol { 1462 return NewProtocolWithOptions(Options{})(s) 1463 } 1464 1465 func buildNextFragment(pf *fragmentation.PacketFragmenter, originalIPHeader header.IPv4) (*stack.PacketBuffer, bool) { 1466 fragPkt, offset, copied, more := pf.BuildNextFragment() 1467 fragPkt.NetworkProtocolNumber = ProtocolNumber 1468 1469 originalIPHeaderLength := len(originalIPHeader) 1470 nextFragIPHeader := header.IPv4(fragPkt.NetworkHeader().Push(originalIPHeaderLength)) 1471 fragPkt.NetworkProtocolNumber = ProtocolNumber 1472 1473 if copied := copy(nextFragIPHeader, originalIPHeader); copied != len(originalIPHeader) { 1474 panic(fmt.Sprintf("wrong number of bytes copied into fragmentIPHeaders: got = %d, want = %d", copied, originalIPHeaderLength)) 1475 } 1476 1477 flags := originalIPHeader.Flags() 1478 if more { 1479 flags |= header.IPv4FlagMoreFragments 1480 } 1481 nextFragIPHeader.SetFlagsFragmentOffset(flags, uint16(offset)) 1482 nextFragIPHeader.SetTotalLength(uint16(nextFragIPHeader.HeaderLength()) + uint16(copied)) 1483 nextFragIPHeader.SetChecksum(0) 1484 nextFragIPHeader.SetChecksum(^nextFragIPHeader.CalculateChecksum()) 1485 1486 return fragPkt, more 1487 } 1488 1489 // optionAction describes possible actions that may be taken on an option 1490 // while processing it. 1491 type optionAction uint8 1492 1493 const ( 1494 // optionRemove says that the option should not be in the output option set. 1495 optionRemove optionAction = iota 1496 1497 // optionProcess says that the option should be fully processed. 1498 optionProcess 1499 1500 // optionVerify says the option should be checked and passed unchanged. 1501 optionVerify 1502 1503 // optionPass says to pass the output set without checking. 1504 optionPass 1505 ) 1506 1507 // optionActions list what to do for each option in a given scenario. 1508 type optionActions struct { 1509 // timestamp controls what to do with a Timestamp option. 
1510 timestamp optionAction 1511 1512 // recordRoute controls what to do with a Record Route option. 1513 recordRoute optionAction 1514 1515 // routerAlert controls what to do with a Router Alert option. 1516 routerAlert optionAction 1517 1518 // unknown controls what to do with an unknown option. 1519 unknown optionAction 1520 } 1521 1522 // optionsUsage specifies the ways options may be operated upon for a given 1523 // scenario during packet processing. 1524 type optionsUsage interface { 1525 actions() optionActions 1526 } 1527 1528 // optionUsageVerify implements optionsUsage for when we just want to check 1529 // fragments. Don't change anything, just check and reject if bad. No 1530 // replacement options are generated. 1531 type optionUsageVerify struct{} 1532 1533 // actions implements optionsUsage. 1534 func (*optionUsageVerify) actions() optionActions { 1535 return optionActions{ 1536 timestamp: optionVerify, 1537 recordRoute: optionVerify, 1538 routerAlert: optionVerify, 1539 unknown: optionRemove, 1540 } 1541 } 1542 1543 // optionUsageReceive implements optionsUsage for packets we will pass 1544 // to the transport layer (with the exception of Echo requests). 1545 type optionUsageReceive struct{} 1546 1547 // actions implements optionsUsage. 1548 func (*optionUsageReceive) actions() optionActions { 1549 return optionActions{ 1550 timestamp: optionProcess, 1551 recordRoute: optionProcess, 1552 routerAlert: optionVerify, 1553 unknown: optionPass, 1554 } 1555 } 1556 1557 // optionUsageForward implements optionsUsage for packets about to be forwarded. 1558 // All options are passed on regardless of whether we recognise them, however 1559 // we do process the Timestamp and Record Route options. 1560 type optionUsageForward struct{} 1561 1562 // actions implements optionsUsage. 
func (*optionUsageForward) actions() optionActions {
	return optionActions{
		timestamp:   optionProcess,
		recordRoute: optionProcess,
		routerAlert: optionVerify,
		unknown:     optionPass,
	}
}

// optionUsageEcho implements optionsUsage for echo packet processing.
// Only Timestamp and RecordRoute are processed and sent back.
type optionUsageEcho struct{}

// actions implements optionsUsage.
func (*optionUsageEcho) actions() optionActions {
	return optionActions{
		timestamp:   optionProcess,
		recordRoute: optionProcess,
		routerAlert: optionVerify,
		unknown:     optionRemove,
	}
}

// handleTimestamp does any required processing on a Timestamp option
// in place.
//
// tsOpt is validated (flags, pointer, room for another entry) and may be
// modified: its overflow counter is incremented when the data area is already
// full, and a timestamp is recorded using localAddress and clock when usage
// asks for the option to be processed.
//
// It returns nil when the option is acceptable, otherwise a parameter problem
// whose Pointer identifies the offending byte of the option.
func handleTimestamp(tsOpt header.IPv4OptionTimestamp, localAddress tcpip.Address, clock tcpip.Clock, usage optionsUsage) *header.IPv4OptParameterProblem {
	flags := tsOpt.Flags()
	// entrySize is the size of one entry in the data area; it depends on
	// whether entries carry an address alongside the timestamp.
	var entrySize uint8
	switch flags {
	case header.IPv4OptionTimestampOnlyFlag:
		entrySize = header.IPv4OptionTimestampSize
	case
		header.IPv4OptionTimestampWithIPFlag,
		header.IPv4OptionTimestampWithPredefinedIPFlag:
		entrySize = header.IPv4OptionTimestampWithAddrSize
	default:
		// Unrecognised flag value.
		return &header.IPv4OptParameterProblem{
			Pointer:  header.IPv4OptTSOFLWAndFLGOffset,
			NeedICMP: true,
		}
	}

	pointer := tsOpt.Pointer()
	// RFC 791 page 22 states: "The smallest legal value is 5."
	// Since the pointer is 1 based, and the header is 4 bytes long the
	// pointer must point beyond the header therefore 4 or less is bad.
	if pointer <= header.IPv4OptionTimestampHdrLength {
		return &header.IPv4OptParameterProblem{
			Pointer:  header.IPv4OptTSPointerOffset,
			NeedICMP: true,
		}
	}
	// To simplify processing below, base further work on the array of timestamps
	// beyond the header, rather than on the whole option. Also to aid
	// calculations set 'nextSlot' to be 0 based as in the packet it is 1 based.
	nextSlot := pointer - (header.IPv4OptionTimestampHdrLength + 1)
	optLen := tsOpt.Size()
	dataLength := optLen - header.IPv4OptionTimestampHdrLength

	// In the section below, we verify the pointer, length and overflow counter
	// fields of the option. The distinction is in which byte you return as being
	// in error in the ICMP packet. Offsets 1 (length), 2 (pointer)
	// or 3 (overflowed counter).
	//
	// The following RFC sections cover this section:
	//
	// RFC 791 (page 22):
	//    If there is some room but not enough room for a full timestamp
	//    to be inserted, or the overflow count itself overflows, the
	//    original datagram is considered to be in error and is discarded.
	//    In either case an ICMP parameter problem message may be sent to
	//    the source host [3].
	//
	// You can get this situation in two ways. Firstly if the data area is not
	// a multiple of the entry size or secondly, if the pointer is not at a
	// multiple of the entry size. The wording of the RFC suggests that
	// this is not an error until you actually run out of space.
	if pointer > optLen {
		// RFC 791 (page 22) says we should switch to using the overflow count.
		//    If the timestamp data area is already full (the pointer exceeds
		//    the length) the datagram is forwarded without inserting the
		//    timestamp, but the overflow count is incremented by one.
		if flags == header.IPv4OptionTimestampWithPredefinedIPFlag {
			// By definition we have nothing to do.
			return nil
		}

		// A non-zero result means the counter was incremented without
		// overflowing.
		if tsOpt.IncOverflow() != 0 {
			return nil
		}
		// The overflow count is also full.
		return &header.IPv4OptParameterProblem{
			Pointer:  header.IPv4OptTSOFLWAndFLGOffset,
			NeedICMP: true,
		}
	}
	if nextSlot+entrySize > dataLength {
		// The data area isn't full but there isn't room for a new entry.
		// Either Length or Pointer could be bad.
		//
		// The `if false` block below is deliberately unreachable: it records
		// the stricter check that Linux compatibility prevents us from using.
		if false {
			// We must select Pointer for Linux compatibility, even if
			// only the length is bad.
			// The Linux code is at (in October 2020)
			// https://github.com/torvalds/linux/blob/bbf5c979011a099af5dc76498918ed7df445635b/net/ipv4/ip_options.c#L367-L370
			//		if (optptr[2]+3 > optlen) {
			//			pp_ptr = optptr + 2;
			//			goto error;
			//		}
			// which doesn't distinguish between which of optptr[2] or optlen
			// is wrong, but just arbitrarily decides on optptr+2.
			if dataLength%entrySize != 0 {
				// The Data section size should be a multiple of the expected
				// timestamp entry size.
				return &header.IPv4OptParameterProblem{
					Pointer:  header.IPv4OptionLengthOffset,
					NeedICMP: false,
				}
			}
			// If the size is OK, the pointer must be corrupted.
		}
		return &header.IPv4OptParameterProblem{
			Pointer:  header.IPv4OptTSPointerOffset,
			NeedICMP: true,
		}
	}

	// The option is well formed; only write a timestamp when the usage asks
	// for full processing (as opposed to verification only).
	if usage.actions().timestamp == optionProcess {
		tsOpt.UpdateTimestamp(localAddress, clock)
	}
	return nil
}

// handleRecordRoute checks and processes a Record route option. It is much
// like the timestamp type 1 option, but without timestamps. The passed in
// address is stored in the option in the correct spot if possible.
func handleRecordRoute(rrOpt header.IPv4OptionRecordRoute, localAddress tcpip.Address, usage optionsUsage) *header.IPv4OptParameterProblem {
	optlen := rrOpt.Size()

	// The option must be large enough to hold its header plus at least one
	// address.
	if optlen < header.IPv4AddressSize+header.IPv4OptionRecordRouteHdrLength {
		return &header.IPv4OptParameterProblem{
			Pointer:  header.IPv4OptionLengthOffset,
			NeedICMP: true,
		}
	}

	pointer := rrOpt.Pointer()
	// RFC 791 page 20 states:
	//      The pointer is relative to this option, and the
	//      smallest legal value for the pointer is 4.
	// Since the pointer is 1 based, and the header is 3 bytes long the
	// pointer must point beyond the header therefore 3 or less is bad.
	if pointer <= header.IPv4OptionRecordRouteHdrLength {
		return &header.IPv4OptParameterProblem{
			Pointer:  header.IPv4OptRRPointerOffset,
			NeedICMP: true,
		}
	}

	// RFC 791 page 21 says
	//       If the route data area is already full (the pointer exceeds the
	//       length) the datagram is forwarded without inserting the address
	//       into the recorded route. If there is some room but not enough
	//       room for a full address to be inserted, the original datagram is
	//       considered to be in error and is discarded.  In either case an
	//       ICMP parameter problem message may be sent to the source
	//       host.
	// The use of the words "In either case" suggests that a 'full' RR option
	// could generate an ICMP at every hop after it fills up. We chose to not
	// do this (as do most implementations). It is probable that the inclusion
	// of these words is a copy/paste error from the timestamp option where
	// there are two failure reasons given.
	if pointer > optlen {
		return nil
	}

	// The data area isn't full but there isn't room for a new entry.
	// Either Length or Pointer could be bad. We must select Pointer for Linux
	// compatibility, even if only the length is bad. NB. pointer is 1 based.
	if pointer+header.IPv4AddressSize > optlen+1 {
		// The `if false` block below is deliberately unreachable: it records
		// the stricter check that Linux compatibility prevents us from using.
		if false {
			// This is what we would do if we were not being Linux compatible.
			// Check for bad pointer or length value. Must be a multiple of 4 after
			// accounting for the 3 byte header and not within that header.
			// RFC 791, page 20 says:
			//       The pointer is relative to this option, and the
			//       smallest legal value for the pointer is 4.
			//
			//       A recorded route is composed of a series of internet addresses.
			//       Each internet address is 32 bits or 4 octets.
			// Linux skips this test so we must too.  See Linux code at:
			// https://github.com/torvalds/linux/blob/bbf5c979011a099af5dc76498918ed7df445635b/net/ipv4/ip_options.c#L338-L341
			//    if (optptr[2]+3 > optlen) {
			//      pp_ptr = optptr + 2;
			//      goto error;
			//    }
			if (optlen-header.IPv4OptionRecordRouteHdrLength)%header.IPv4AddressSize != 0 {
				// Length is bad, not on integral number of slots.
				return &header.IPv4OptParameterProblem{
					Pointer:  header.IPv4OptionLengthOffset,
					NeedICMP: true,
				}
			}
			// If not length, the fault must be with the pointer.
		}
		return &header.IPv4OptParameterProblem{
			Pointer:  header.IPv4OptRRPointerOffset,
			NeedICMP: true,
		}
	}
	// When the usage only asks for verification, the option is left untouched.
	if usage.actions().recordRoute == optionVerify {
		return nil
	}
	rrOpt.StoreAddress(localAddress)
	return nil
}

// handleRouterAlert performs sanity checks on a Router Alert option.
1780 func handleRouterAlert(raOpt header.IPv4OptionRouterAlert) *header.IPv4OptParameterProblem { 1781 // Only the zero value is acceptable, as per RFC 2113, section 2.1: 1782 // Value: A two octet code with the following values: 1783 // 0 - Router shall examine packet 1784 // 1-65535 - Reserved 1785 if raOpt.Value() != header.IPv4OptionRouterAlertValue { 1786 return &header.IPv4OptParameterProblem{ 1787 Pointer: header.IPv4OptionRouterAlertValueOffset, 1788 NeedICMP: true, 1789 } 1790 } 1791 return nil 1792 } 1793 1794 type optionTracker struct { 1795 timestamp bool 1796 recordRoute bool 1797 routerAlert bool 1798 } 1799 1800 // processIPOptions parses the IPv4 options and produces a new set of options 1801 // suitable for use in the next step of packet processing as informed by usage. 1802 // The original will not be touched. 1803 // 1804 // If there were no errors during parsing, the new set of options is returned as 1805 // a new buffer. 1806 func (e *endpoint) processIPOptions(pkt *stack.PacketBuffer, opts header.IPv4Options, usage optionsUsage) (header.IPv4Options, optionTracker, *header.IPv4OptParameterProblem) { 1807 stats := e.stats.ip 1808 optIter := opts.MakeIterator() 1809 1810 // Except NOP, each option must only appear at most once (RFC 791 section 3.1, 1811 // at the definition of every type). 1812 // Keep track of each option we find to enable duplicate option detection. 1813 var seenOptions [math.MaxUint8 + 1]bool 1814 1815 // TODO(https://gvisor.dev/issue/4586): This will need tweaking when we start 1816 // really forwarding packets as we may need to get two addresses, for rx and 1817 // tx interfaces. We will also have to take usage into account. 
1818 localAddress := e.MainAddress().Address 1819 if len(localAddress) == 0 { 1820 h := header.IPv4(pkt.NetworkHeader().View()) 1821 dstAddr := h.DestinationAddress() 1822 if pkt.NetworkPacketInfo.LocalAddressBroadcast || header.IsV4MulticastAddress(dstAddr) { 1823 return nil, optionTracker{}, &header.IPv4OptParameterProblem{ 1824 NeedICMP: false, 1825 } 1826 } 1827 localAddress = dstAddr 1828 } 1829 1830 var optionsProcessed optionTracker 1831 for { 1832 option, done, optProblem := optIter.Next() 1833 if done || optProblem != nil { 1834 return optIter.Finalize(), optionsProcessed, optProblem 1835 } 1836 optType := option.Type() 1837 if optType == header.IPv4OptionNOPType { 1838 optIter.PushNOPOrEnd(optType) 1839 continue 1840 } 1841 if optType == header.IPv4OptionListEndType { 1842 optIter.PushNOPOrEnd(optType) 1843 return optIter.Finalize(), optionsProcessed, nil 1844 } 1845 1846 // check for repeating options (multiple NOPs are OK) 1847 if seenOptions[optType] { 1848 return nil, optionTracker{}, &header.IPv4OptParameterProblem{ 1849 Pointer: optIter.ErrCursor, 1850 NeedICMP: true, 1851 } 1852 } 1853 seenOptions[optType] = true 1854 1855 optLen, optProblem := func() (int, *header.IPv4OptParameterProblem) { 1856 switch option := option.(type) { 1857 case *header.IPv4OptionTimestamp: 1858 stats.OptionTimestampReceived.Increment() 1859 optionsProcessed.timestamp = true 1860 if usage.actions().timestamp != optionRemove { 1861 clock := e.protocol.stack.Clock() 1862 newBuffer := optIter.InitReplacement(option) 1863 optProblem := handleTimestamp(header.IPv4OptionTimestamp(newBuffer), localAddress, clock, usage) 1864 return len(newBuffer), optProblem 1865 } 1866 1867 case *header.IPv4OptionRecordRoute: 1868 stats.OptionRecordRouteReceived.Increment() 1869 optionsProcessed.recordRoute = true 1870 if usage.actions().recordRoute != optionRemove { 1871 newBuffer := optIter.InitReplacement(option) 1872 optProblem := handleRecordRoute(header.IPv4OptionRecordRoute(newBuffer), 
localAddress, usage) 1873 return len(newBuffer), optProblem 1874 } 1875 1876 case *header.IPv4OptionRouterAlert: 1877 stats.OptionRouterAlertReceived.Increment() 1878 optionsProcessed.routerAlert = true 1879 if usage.actions().routerAlert != optionRemove { 1880 newBuffer := optIter.InitReplacement(option) 1881 optProblem := handleRouterAlert(header.IPv4OptionRouterAlert(newBuffer)) 1882 return len(newBuffer), optProblem 1883 } 1884 1885 default: 1886 stats.OptionUnknownReceived.Increment() 1887 if usage.actions().unknown == optionPass { 1888 return len(optIter.InitReplacement(option)), nil 1889 } 1890 } 1891 return 0, nil 1892 }() 1893 1894 if optProblem != nil { 1895 optProblem.Pointer += optIter.ErrCursor 1896 return nil, optionTracker{}, optProblem 1897 } 1898 optIter.ConsumeBuffer(optLen) 1899 } 1900 }