github.com/forest33/wtun@v0.3.1/tun/tcp_offload_linux.go (about)

     1  /* SPDX-License-Identifier: MIT
     2   *
     3   * Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
     4   */
     5  
     6  package tun
     7  
     8  import (
     9  	"bytes"
    10  	"encoding/binary"
    11  	"errors"
    12  	"io"
    13  	"unsafe"
    14  
    15  	"golang.org/x/sys/unix"
    16  
    17  	"github.com/forest33/wtun/conn"
    18  )
    19  
// tcpFlagsOffset is the byte offset of the flags field within a TCP header.
const tcpFlagsOffset = 13

// TCP flag bit masks as found in the flags byte of the TCP header.
const (
	tcpFlagFIN uint8 = 0x01
	tcpFlagPSH uint8 = 0x08
	tcpFlagACK uint8 = 0x10
)
    27  
// virtioNetHdr is defined in the kernel in include/uapi/linux/virtio_net.h. The
// kernel symbol is virtio_net_hdr. It is prepended to packets exchanged with a
// TUN device that has offloads enabled, describing checksum and segmentation
// metadata. Field order and sizes must exactly match the kernel's C layout, as
// encode/decode copy this struct's memory verbatim.
type virtioNetHdr struct {
	flags      uint8  // e.g. VIRTIO_NET_HDR_F_NEEDS_CSUM
	gsoType    uint8  // e.g. VIRTIO_NET_HDR_GSO_TCPV4 / _TCPV6
	hdrLen     uint16 // combined length of the IP + transport headers
	gsoSize    uint16 // payload size of each segment after splitting
	csumStart  uint16 // offset where checksumming begins (start of transport header)
	csumOffset uint16 // offset from csumStart at which to store the checksum
}
    38  
// decode deserializes the first virtioNetHdrLen bytes of b into v by copying
// them directly over v's memory. It returns io.ErrShortBuffer if b is too
// short to contain a complete header.
func (v *virtioNetHdr) decode(b []byte) error {
	if len(b) < virtioNetHdrLen {
		return io.ErrShortBuffer
	}
	// View v's memory as a byte slice of exactly virtioNetHdrLen bytes and
	// overwrite it with the wire bytes; layout matches the kernel C ABI.
	copy(unsafe.Slice((*byte)(unsafe.Pointer(v)), virtioNetHdrLen), b[:virtioNetHdrLen])
	return nil
}
    46  
// encode serializes v into the first virtioNetHdrLen bytes of b by copying
// v's memory verbatim. It returns io.ErrShortBuffer if b cannot hold a
// complete header.
func (v *virtioNetHdr) encode(b []byte) error {
	if len(b) < virtioNetHdrLen {
		return io.ErrShortBuffer
	}
	// Mirror of decode: copy v's raw memory out into b.
	copy(b[:virtioNetHdrLen], unsafe.Slice((*byte)(unsafe.Pointer(v)), virtioNetHdrLen))
	return nil
}
    54  
const (
	// virtioNetHdrLen is the length in bytes of virtioNetHdr. This matches the
	// shape of the C ABI for its kernel counterpart -- sizeof(virtio_net_hdr).
	// Computed from the Go struct so it cannot drift from the encode/decode
	// copies above.
	virtioNetHdrLen = int(unsafe.Sizeof(virtioNetHdr{}))
)
    60  
// flowKey represents the key for a flow. Addresses are stored in fixed 16-byte
// arrays so the struct is comparable and usable as a map key; IPv4 addresses
// occupy only the first 4 bytes, with the remainder left zero.
type flowKey struct {
	srcAddr, dstAddr [16]byte
	srcPort, dstPort uint16
	rxAck            uint32 // varying ack values should not be coalesced. Treat them as separate flows.
}
    67  
// tcpGROTable holds flow and coalescing information for the purposes of GRO.
// itemsPool recycles the per-flow item slices across reset() calls to avoid
// re-allocating them for every batch of packets.
type tcpGROTable struct {
	itemsByFlow map[flowKey][]tcpGROItem
	itemsPool   [][]tcpGROItem
}
    73  
    74  func newTCPGROTable() *tcpGROTable {
    75  	t := &tcpGROTable{
    76  		itemsByFlow: make(map[flowKey][]tcpGROItem, conn.IdealBatchSize),
    77  		itemsPool:   make([][]tcpGROItem, conn.IdealBatchSize),
    78  	}
    79  	for i := range t.itemsPool {
    80  		t.itemsPool[i] = make([]tcpGROItem, 0, conn.IdealBatchSize)
    81  	}
    82  	return t
    83  }
    84  
    85  func newFlowKey(pkt []byte, srcAddr, dstAddr, tcphOffset int) flowKey {
    86  	key := flowKey{}
    87  	addrSize := dstAddr - srcAddr
    88  	copy(key.srcAddr[:], pkt[srcAddr:dstAddr])
    89  	copy(key.dstAddr[:], pkt[dstAddr:dstAddr+addrSize])
    90  	key.srcPort = binary.BigEndian.Uint16(pkt[tcphOffset:])
    91  	key.dstPort = binary.BigEndian.Uint16(pkt[tcphOffset+2:])
    92  	key.rxAck = binary.BigEndian.Uint32(pkt[tcphOffset+8:])
    93  	return key
    94  }
    95  
    96  // lookupOrInsert looks up a flow for the provided packet and metadata,
    97  // returning the packets found for the flow, or inserting a new one if none
    98  // is found.
    99  func (t *tcpGROTable) lookupOrInsert(pkt []byte, srcAddrOffset, dstAddrOffset, tcphOffset, tcphLen, bufsIndex int) ([]tcpGROItem, bool) {
   100  	key := newFlowKey(pkt, srcAddrOffset, dstAddrOffset, tcphOffset)
   101  	items, ok := t.itemsByFlow[key]
   102  	if ok {
   103  		return items, ok
   104  	}
   105  	// TODO: insert() performs another map lookup. This could be rearranged to avoid.
   106  	t.insert(pkt, srcAddrOffset, dstAddrOffset, tcphOffset, tcphLen, bufsIndex)
   107  	return nil, false
   108  }
   109  
   110  // insert an item in the table for the provided packet and packet metadata.
   111  func (t *tcpGROTable) insert(pkt []byte, srcAddrOffset, dstAddrOffset, tcphOffset, tcphLen, bufsIndex int) {
   112  	key := newFlowKey(pkt, srcAddrOffset, dstAddrOffset, tcphOffset)
   113  	item := tcpGROItem{
   114  		key:       key,
   115  		bufsIndex: uint16(bufsIndex),
   116  		gsoSize:   uint16(len(pkt[tcphOffset+tcphLen:])),
   117  		iphLen:    uint8(tcphOffset),
   118  		tcphLen:   uint8(tcphLen),
   119  		sentSeq:   binary.BigEndian.Uint32(pkt[tcphOffset+4:]),
   120  		pshSet:    pkt[tcphOffset+tcpFlagsOffset]&tcpFlagPSH != 0,
   121  	}
   122  	items, ok := t.itemsByFlow[key]
   123  	if !ok {
   124  		items = t.newItems()
   125  	}
   126  	items = append(items, item)
   127  	t.itemsByFlow[key] = items
   128  }
   129  
   130  func (t *tcpGROTable) updateAt(item tcpGROItem, i int) {
   131  	items, _ := t.itemsByFlow[item.key]
   132  	items[i] = item
   133  }
   134  
   135  func (t *tcpGROTable) deleteAt(key flowKey, i int) {
   136  	items, _ := t.itemsByFlow[key]
   137  	items = append(items[:i], items[i+1:]...)
   138  	t.itemsByFlow[key] = items
   139  }
   140  
// tcpGROItem represents bookkeeping data for a TCP packet during the lifetime
// of a GRO evaluation across a vector of packets. Offsets within the item are
// relative to the start of the IP header (i.e. after any virtio header).
type tcpGROItem struct {
	key       flowKey
	sentSeq   uint32 // the sequence number
	bufsIndex uint16 // the index into the original bufs slice
	numMerged uint16 // the number of packets merged into this item
	gsoSize   uint16 // payload size
	iphLen    uint8  // ip header len
	tcphLen   uint8  // tcp header len
	pshSet    bool   // psh flag is set
}
   153  
   154  func (t *tcpGROTable) newItems() []tcpGROItem {
   155  	var items []tcpGROItem
   156  	items, t.itemsPool = t.itemsPool[len(t.itemsPool)-1], t.itemsPool[:len(t.itemsPool)-1]
   157  	return items
   158  }
   159  
   160  func (t *tcpGROTable) reset() {
   161  	for k, items := range t.itemsByFlow {
   162  		items = items[:0]
   163  		t.itemsPool = append(t.itemsPool, items)
   164  		delete(t.itemsByFlow, k)
   165  	}
   166  }
   167  
// canCoalesce represents the outcome of checking if two TCP packets are
// candidates for coalescing.
type canCoalesce int

const (
	coalescePrepend     canCoalesce = -1 // pkt comes before item in sequence space
	coalesceUnavailable canCoalesce = 0  // the packets cannot be combined
	coalesceAppend      canCoalesce = 1  // pkt follows item in sequence space
)
   177  
   178  // tcpPacketsCanCoalesce evaluates if pkt can be coalesced with the packet
   179  // described by item. This function makes considerations that match the kernel's
   180  // GRO self tests, which can be found in tools/testing/selftests/net/gro.c.
   181  func tcpPacketsCanCoalesce(pkt []byte, iphLen, tcphLen uint8, seq uint32, pshSet bool, gsoSize uint16, item tcpGROItem, bufs [][]byte, bufsOffset int) canCoalesce {
   182  	pktTarget := bufs[item.bufsIndex][bufsOffset:]
   183  	if tcphLen != item.tcphLen {
   184  		// cannot coalesce with unequal tcp options len
   185  		return coalesceUnavailable
   186  	}
   187  	if tcphLen > 20 {
   188  		if !bytes.Equal(pkt[iphLen+20:iphLen+tcphLen], pktTarget[item.iphLen+20:iphLen+tcphLen]) {
   189  			// cannot coalesce with unequal tcp options
   190  			return coalesceUnavailable
   191  		}
   192  	}
   193  	if pkt[0]>>4 == 6 {
   194  		if pkt[0] != pktTarget[0] || pkt[1]>>4 != pktTarget[1]>>4 {
   195  			// cannot coalesce with unequal Traffic class values
   196  			return coalesceUnavailable
   197  		}
   198  		if pkt[7] != pktTarget[7] {
   199  			// cannot coalesce with unequal Hop limit values
   200  			return coalesceUnavailable
   201  		}
   202  	} else {
   203  		if pkt[1] != pktTarget[1] {
   204  			// cannot coalesce with unequal ToS values
   205  			return coalesceUnavailable
   206  		}
   207  		if pkt[6]>>5 != pktTarget[6]>>5 {
   208  			// cannot coalesce with unequal DF or reserved bits. MF is checked
   209  			// further up the stack.
   210  			return coalesceUnavailable
   211  		}
   212  		if pkt[8] != pktTarget[8] {
   213  			// cannot coalesce with unequal TTL values
   214  			return coalesceUnavailable
   215  		}
   216  	}
   217  	// seq adjacency
   218  	lhsLen := item.gsoSize
   219  	lhsLen += item.numMerged * item.gsoSize
   220  	if seq == item.sentSeq+uint32(lhsLen) { // pkt aligns following item from a seq num perspective
   221  		if item.pshSet {
   222  			// We cannot append to a segment that has the PSH flag set, PSH
   223  			// can only be set on the final segment in a reassembled group.
   224  			return coalesceUnavailable
   225  		}
   226  		if len(pktTarget[iphLen+tcphLen:])%int(item.gsoSize) != 0 {
   227  			// A smaller than gsoSize packet has been appended previously.
   228  			// Nothing can come after a smaller packet on the end.
   229  			return coalesceUnavailable
   230  		}
   231  		if gsoSize > item.gsoSize {
   232  			// We cannot have a larger packet following a smaller one.
   233  			return coalesceUnavailable
   234  		}
   235  		return coalesceAppend
   236  	} else if seq+uint32(gsoSize) == item.sentSeq { // pkt aligns in front of item from a seq num perspective
   237  		if pshSet {
   238  			// We cannot prepend with a segment that has the PSH flag set, PSH
   239  			// can only be set on the final segment in a reassembled group.
   240  			return coalesceUnavailable
   241  		}
   242  		if gsoSize < item.gsoSize {
   243  			// We cannot have a larger packet following a smaller one.
   244  			return coalesceUnavailable
   245  		}
   246  		if gsoSize > item.gsoSize && item.numMerged > 0 {
   247  			// There's at least one previous merge, and we're larger than all
   248  			// previous. This would put multiple smaller packets on the end.
   249  			return coalesceUnavailable
   250  		}
   251  		return coalescePrepend
   252  	}
   253  	return coalesceUnavailable
   254  }
   255  
   256  func tcpChecksumValid(pkt []byte, iphLen uint8, isV6 bool) bool {
   257  	srcAddrAt := ipv4SrcAddrOffset
   258  	addrSize := 4
   259  	if isV6 {
   260  		srcAddrAt = ipv6SrcAddrOffset
   261  		addrSize = 16
   262  	}
   263  	tcpTotalLen := uint16(len(pkt) - int(iphLen))
   264  	tcpCSumNoFold := pseudoHeaderChecksumNoFold(unix.IPPROTO_TCP, pkt[srcAddrAt:srcAddrAt+addrSize], pkt[srcAddrAt+addrSize:srcAddrAt+addrSize*2], tcpTotalLen)
   265  	return ^checksum(pkt[iphLen:], tcpCSumNoFold) == 0
   266  }
   267  
// coalesceResult represents the result of attempting to coalesce two TCP
// packets.
type coalesceResult int

const (
	coalesceInsufficientCap coalesceResult = 0 // head buffer lacks capacity for the merge
	coalescePSHEnding       coalesceResult = 1 // cannot prepend a PSH-flagged segment
	coalesceItemInvalidCSum coalesceResult = 2 // existing item failed checksum validation
	coalescePktInvalidCSum  coalesceResult = 3 // incoming packet failed checksum validation
	coalesceSuccess         coalesceResult = 4 // packets were merged
)
   279  
// coalesceTCPPackets attempts to coalesce pkt with the packet described by
// item, returning the outcome. This function may swap bufs elements in the
// event of a prepend as item's bufs index is already being tracked for writing
// to a Device.
//
// On success the merged packet carries a virtio header announcing GSO, its IP
// length fields are rewritten, and the TCP checksum field holds the
// pseudo-header sum for downstream checksum offload to complete.
func coalesceTCPPackets(mode canCoalesce, pkt []byte, pktBuffsIndex int, gsoSize uint16, seq uint32, pshSet bool, item *tcpGROItem, bufs [][]byte, bufsOffset int, isV6 bool) coalesceResult {
	var pktHead []byte // the packet that will end up at the front
	headersLen := item.iphLen + item.tcphLen
	// Total merged length: both packets' bytes minus one copy of the headers.
	coalescedLen := len(bufs[item.bufsIndex][bufsOffset:]) + len(pkt) - int(headersLen)

	// Copy data
	if mode == coalescePrepend {
		pktHead = pkt
		if cap(pkt)-bufsOffset < coalescedLen {
			// We don't want to allocate a new underlying array if capacity is
			// too small.
			return coalesceInsufficientCap
		}
		if pshSet {
			return coalescePSHEnding
		}
		if item.numMerged == 0 {
			// Validate item's checksum lazily: only on its first merge.
			if !tcpChecksumValid(bufs[item.bufsIndex][bufsOffset:], item.iphLen, isV6) {
				return coalesceItemInvalidCSum
			}
		}
		// NOTE(review): pkt is validated using item.iphLen; equal header
		// lengths appear guaranteed since only option-less packets coalesce —
		// confirm.
		if !tcpChecksumValid(pkt, item.iphLen, isV6) {
			return coalescePktInvalidCSum
		}
		// pkt now leads the group, so its sequence number becomes the group's.
		item.sentSeq = seq
		extendBy := coalescedLen - len(pktHead)
		bufs[pktBuffsIndex] = append(bufs[pktBuffsIndex], make([]byte, extendBy)...)
		// Append item's payload (sans headers) after pkt's bytes.
		copy(bufs[pktBuffsIndex][bufsOffset+len(pkt):], bufs[item.bufsIndex][bufsOffset+int(headersLen):])
		// Flip the slice headers in bufs as part of prepend. The index of item
		// is already being tracked for writing.
		bufs[item.bufsIndex], bufs[pktBuffsIndex] = bufs[pktBuffsIndex], bufs[item.bufsIndex]
	} else {
		pktHead = bufs[item.bufsIndex][bufsOffset:]
		if cap(pktHead)-bufsOffset < coalescedLen {
			// We don't want to allocate a new underlying array if capacity is
			// too small.
			return coalesceInsufficientCap
		}
		if item.numMerged == 0 {
			// Validate item's checksum lazily: only on its first merge.
			if !tcpChecksumValid(bufs[item.bufsIndex][bufsOffset:], item.iphLen, isV6) {
				return coalesceItemInvalidCSum
			}
		}
		if !tcpChecksumValid(pkt, item.iphLen, isV6) {
			return coalescePktInvalidCSum
		}
		if pshSet {
			// We are appending a segment with PSH set.
			item.pshSet = pshSet
			pktHead[item.iphLen+tcpFlagsOffset] |= tcpFlagPSH
		}
		extendBy := len(pkt) - int(headersLen)
		bufs[item.bufsIndex] = append(bufs[item.bufsIndex], make([]byte, extendBy)...)
		// Append pkt's payload (sans headers) after item's bytes.
		copy(bufs[item.bufsIndex][bufsOffset+len(pktHead):], pkt[headersLen:])
	}

	if gsoSize > item.gsoSize {
		item.gsoSize = gsoSize
	}
	hdr := virtioNetHdr{
		flags:      unix.VIRTIO_NET_HDR_F_NEEDS_CSUM, // this turns into CHECKSUM_PARTIAL in the skb
		hdrLen:     uint16(headersLen),
		gsoSize:    uint16(item.gsoSize),
		csumStart:  uint16(item.iphLen),
		csumOffset: 16,
	}

	// Recalculate the total len (IPv4) or payload len (IPv6). Recalculate the
	// (IPv4) header checksum.
	if isV6 {
		hdr.gsoType = unix.VIRTIO_NET_HDR_GSO_TCPV6
		binary.BigEndian.PutUint16(pktHead[4:], uint16(coalescedLen)-uint16(item.iphLen)) // set new payload len
	} else {
		hdr.gsoType = unix.VIRTIO_NET_HDR_GSO_TCPV4
		pktHead[10], pktHead[11] = 0, 0                               // clear checksum field
		binary.BigEndian.PutUint16(pktHead[2:], uint16(coalescedLen)) // set new total length
		iphCSum := ^checksum(pktHead[:item.iphLen], 0)                // compute checksum
		binary.BigEndian.PutUint16(pktHead[10:], iphCSum)             // set checksum field
	}
	// NOTE(review): encode error ignored; handleGRO validates
	// offset >= virtioNetHdrLen, so the slice should always be long enough —
	// confirm no other callers exist.
	hdr.encode(bufs[item.bufsIndex][bufsOffset-virtioNetHdrLen:])

	// Calculate the pseudo header checksum and place it at the TCP checksum
	// offset. Downstream checksum offloading will combine this with computation
	// of the tcp header and payload checksum.
	addrLen := 4
	addrOffset := ipv4SrcAddrOffset
	if isV6 {
		addrLen = 16
		addrOffset = ipv6SrcAddrOffset
	}
	srcAddrAt := bufsOffset + addrOffset
	srcAddr := bufs[item.bufsIndex][srcAddrAt : srcAddrAt+addrLen]
	dstAddr := bufs[item.bufsIndex][srcAddrAt+addrLen : srcAddrAt+addrLen*2]
	psum := pseudoHeaderChecksumNoFold(unix.IPPROTO_TCP, srcAddr, dstAddr, uint16(coalescedLen-int(item.iphLen)))
	binary.BigEndian.PutUint16(pktHead[hdr.csumStart+hdr.csumOffset:], checksum([]byte{}, psum))

	item.numMerged++
	return coalesceSuccess
}
   383  
const (
	// ipv4FlagMoreFragments is the MF bit as it appears in byte 6 of the IPv4
	// header (the flags/fragment-offset field).
	ipv4FlagMoreFragments uint8 = 0x20
)

const (
	ipv4SrcAddrOffset = 12 // byte offset of the source address in an IPv4 header
	ipv6SrcAddrOffset = 8  // byte offset of the source address in an IPv6 header
	maxUint16         = 1<<16 - 1
)
   393  
// tcpGRO evaluates the TCP packet at pktI in bufs for coalescing with
// existing packets tracked in table. It will return false when pktI is not
// coalesced, otherwise true. This indicates to the caller if bufs[pktI]
// should be written to the Device.
//
// The packet is first sanity-checked (lengths, fragmentation, flags); only
// pure ACK or PSH+ACK segments with payload are candidates.
func tcpGRO(bufs [][]byte, offset int, pktI int, table *tcpGROTable, isV6 bool) (pktCoalesced bool) {
	pkt := bufs[pktI][offset:]
	if len(pkt) > maxUint16 {
		// A valid IPv4 or IPv6 packet will never exceed this.
		return false
	}
	// IPv4: header length from the IHL nibble; IPv6: fixed 40-byte header.
	iphLen := int((pkt[0] & 0x0F) * 4)
	if isV6 {
		iphLen = 40
		ipv6HPayloadLen := int(binary.BigEndian.Uint16(pkt[4:]))
		if ipv6HPayloadLen != len(pkt)-iphLen {
			return false
		}
	} else {
		// The IPv4 total-length field must agree with the buffer length.
		totalLen := int(binary.BigEndian.Uint16(pkt[2:]))
		if totalLen != len(pkt) {
			return false
		}
	}
	if len(pkt) < iphLen {
		return false
	}
	// TCP header length from the data-offset nibble; must be 20..60 bytes.
	tcphLen := int((pkt[iphLen+12] >> 4) * 4)
	if tcphLen < 20 || tcphLen > 60 {
		return false
	}
	if len(pkt) < iphLen+tcphLen {
		return false
	}
	if !isV6 {
		// Reject any fragment: MF set, or a nonzero fragment offset.
		if pkt[6]&ipv4FlagMoreFragments != 0 || pkt[6]<<3 != 0 || pkt[7] != 0 {
			// no GRO support for fragmented segments for now
			return false
		}
	}
	tcpFlags := pkt[iphLen+tcpFlagsOffset]
	var pshSet bool
	// not a candidate if any non-ACK flags (except PSH+ACK) are set
	if tcpFlags != tcpFlagACK {
		if pkt[iphLen+tcpFlagsOffset] != tcpFlagACK|tcpFlagPSH {
			return false
		}
		pshSet = true
	}
	gsoSize := uint16(len(pkt) - tcphLen - iphLen)
	// not a candidate if payload len is 0
	if gsoSize < 1 {
		return false
	}
	seq := binary.BigEndian.Uint32(pkt[iphLen+4:])
	srcAddrOffset := ipv4SrcAddrOffset
	addrLen := 4
	if isV6 {
		srcAddrOffset = ipv6SrcAddrOffset
		addrLen = 16
	}
	items, existing := table.lookupOrInsert(pkt, srcAddrOffset, srcAddrOffset+addrLen, iphLen, tcphLen, pktI)
	if !existing {
		// First packet for this flow; it was inserted by lookupOrInsert.
		return false
	}
	for i := len(items) - 1; i >= 0; i-- {
		// In the best case of packets arriving in order iterating in reverse is
		// more efficient if there are multiple items for a given flow. This
		// also enables a natural table.deleteAt() in the
		// coalesceItemInvalidCSum case without the need for index tracking.
		// This algorithm makes a best effort to coalesce in the event of
		// unordered packets, where pkt may land anywhere in items from a
		// sequence number perspective, however once an item is inserted into
		// the table it is never compared across other items later.
		item := items[i]
		can := tcpPacketsCanCoalesce(pkt, uint8(iphLen), uint8(tcphLen), seq, pshSet, gsoSize, item, bufs, offset)
		if can != coalesceUnavailable {
			result := coalesceTCPPackets(can, pkt, pktI, gsoSize, seq, pshSet, &item, bufs, offset, isV6)
			switch result {
			case coalesceSuccess:
				table.updateAt(item, i)
				return true
			case coalesceItemInvalidCSum:
				// delete the item with an invalid csum
				table.deleteAt(item.key, i)
			case coalescePktInvalidCSum:
				// no point in inserting an item that we can't coalesce
				return false
			default:
			}
		}
	}
	// failed to coalesce with any other packets; store the item in the flow
	table.insert(pkt, srcAddrOffset, srcAddrOffset+addrLen, iphLen, tcphLen, pktI)
	return false
}
   489  
   490  func isTCP4NoIPOptions(b []byte) bool {
   491  	if len(b) < 40 {
   492  		return false
   493  	}
   494  	if b[0]>>4 != 4 {
   495  		return false
   496  	}
   497  	if b[0]&0x0F != 5 {
   498  		return false
   499  	}
   500  	if b[9] != unix.IPPROTO_TCP {
   501  		return false
   502  	}
   503  	return true
   504  }
   505  
   506  func isTCP6NoEH(b []byte) bool {
   507  	if len(b) < 60 {
   508  		return false
   509  	}
   510  	if b[0]>>4 != 6 {
   511  		return false
   512  	}
   513  	if b[6] != unix.IPPROTO_TCP {
   514  		return false
   515  	}
   516  	return true
   517  }
   518  
   519  // handleGRO evaluates bufs for GRO, and writes the indices of the resulting
   520  // packets into toWrite. toWrite, tcp4Table, and tcp6Table should initially be
   521  // empty (but non-nil), and are passed in to save allocs as the caller may reset
   522  // and recycle them across vectors of packets.
   523  func handleGRO(bufs [][]byte, offset int, tcp4Table, tcp6Table *tcpGROTable, toWrite *[]int) error {
   524  	for i := range bufs {
   525  		if offset < virtioNetHdrLen || offset > len(bufs[i])-1 {
   526  			return errors.New("invalid offset")
   527  		}
   528  		var coalesced bool
   529  		switch {
   530  		case isTCP4NoIPOptions(bufs[i][offset:]): // ipv4 packets w/IP options do not coalesce
   531  			coalesced = tcpGRO(bufs, offset, i, tcp4Table, false)
   532  		case isTCP6NoEH(bufs[i][offset:]): // ipv6 packets w/extension headers do not coalesce
   533  			coalesced = tcpGRO(bufs, offset, i, tcp6Table, true)
   534  		}
   535  		if !coalesced {
   536  			hdr := virtioNetHdr{}
   537  			err := hdr.encode(bufs[i][offset-virtioNetHdrLen:])
   538  			if err != nil {
   539  				return err
   540  			}
   541  			*toWrite = append(*toWrite, i)
   542  		}
   543  	}
   544  	return nil
   545  }
   546  
// tcpTSO splits packets from in into outBuffs, writing the size of each
// element into sizes. It returns the number of buffers populated, and/or an
// error.
//
// in is a single coalesced packet (headers described by hdr) starting at its
// IP header; each output segment is written at outBuffs[i][outOffset:] with
// fully recomputed IP and TCP checksums.
func tcpTSO(in []byte, hdr virtioNetHdr, outBuffs [][]byte, sizes []int, outOffset int) (int, error) {
	iphLen := int(hdr.csumStart)
	srcAddrOffset := ipv6SrcAddrOffset
	addrLen := 16
	if hdr.gsoType == unix.VIRTIO_NET_HDR_GSO_TCPV4 {
		in[10], in[11] = 0, 0 // clear ipv4 header checksum
		srcAddrOffset = ipv4SrcAddrOffset
		addrLen = 4
	}
	tcpCSumAt := int(hdr.csumStart + hdr.csumOffset)
	in[tcpCSumAt], in[tcpCSumAt+1] = 0, 0 // clear tcp checksum
	firstTCPSeqNum := binary.BigEndian.Uint32(in[hdr.csumStart+4:])
	nextSegmentDataAt := int(hdr.hdrLen)
	i := 0
	for ; nextSegmentDataAt < len(in); i++ {
		if i == len(outBuffs) {
			return i - 1, ErrTooManySegments
		}
		// Each segment carries up to gsoSize payload bytes; the final one may
		// be shorter.
		nextSegmentEnd := nextSegmentDataAt + int(hdr.gsoSize)
		if nextSegmentEnd > len(in) {
			nextSegmentEnd = len(in)
		}
		segmentDataLen := nextSegmentEnd - nextSegmentDataAt
		totalLen := int(hdr.hdrLen) + segmentDataLen
		sizes[i] = totalLen
		out := outBuffs[i][outOffset:]

		// IP header: copy then patch per-segment fields.
		copy(out, in[:iphLen])
		if hdr.gsoType == unix.VIRTIO_NET_HDR_GSO_TCPV4 {
			// For IPv4 we are responsible for incrementing the ID field,
			// updating the total len field, and recalculating the header
			// checksum.
			if i > 0 {
				id := binary.BigEndian.Uint16(out[4:])
				id += uint16(i)
				binary.BigEndian.PutUint16(out[4:], id)
			}
			binary.BigEndian.PutUint16(out[2:], uint16(totalLen))
			ipv4CSum := ^checksum(out[:iphLen], 0)
			binary.BigEndian.PutUint16(out[10:], ipv4CSum)
		} else {
			// For IPv6 we are responsible for updating the payload length field.
			binary.BigEndian.PutUint16(out[4:], uint16(totalLen-iphLen))
		}

		// TCP header
		copy(out[hdr.csumStart:hdr.hdrLen], in[hdr.csumStart:hdr.hdrLen])
		// NOTE(review): the gsoSize*i product is computed in uint16; this
		// holds only while the segment offset stays <= 65535, which appears
		// guaranteed by the maxUint16 packet cap on the GRO side — confirm.
		tcpSeq := firstTCPSeqNum + uint32(hdr.gsoSize*uint16(i))
		binary.BigEndian.PutUint32(out[hdr.csumStart+4:], tcpSeq)
		if nextSegmentEnd != len(in) {
			// FIN and PSH should only be set on last segment
			clearFlags := tcpFlagFIN | tcpFlagPSH
			out[hdr.csumStart+tcpFlagsOffset] &^= clearFlags
		}

		// payload
		copy(out[hdr.hdrLen:], in[nextSegmentDataAt:nextSegmentEnd])

		// TCP checksum: pseudo-header sum over this segment's TCP length,
		// folded with the segment's header+payload bytes.
		tcpHLen := int(hdr.hdrLen - hdr.csumStart)
		tcpLenForPseudo := uint16(tcpHLen + segmentDataLen)
		tcpCSumNoFold := pseudoHeaderChecksumNoFold(unix.IPPROTO_TCP, in[srcAddrOffset:srcAddrOffset+addrLen], in[srcAddrOffset+addrLen:srcAddrOffset+addrLen*2], tcpLenForPseudo)
		tcpCSum := ^checksum(out[hdr.csumStart:totalLen], tcpCSumNoFold)
		binary.BigEndian.PutUint16(out[hdr.csumStart+hdr.csumOffset:], tcpCSum)

		nextSegmentDataAt += int(hdr.gsoSize)
	}
	return i, nil
}
   619  
   620  func gsoNoneChecksum(in []byte, cSumStart, cSumOffset uint16) error {
   621  	cSumAt := cSumStart + cSumOffset
   622  	// The initial value at the checksum offset should be summed with the
   623  	// checksum we compute. This is typically the pseudo-header checksum.
   624  	initial := binary.BigEndian.Uint16(in[cSumAt:])
   625  	in[cSumAt], in[cSumAt+1] = 0, 0
   626  	binary.BigEndian.PutUint16(in[cSumAt:], ^checksum(in[cSumStart:], uint64(initial)))
   627  	return nil
   628  }