github.com/koomox/wireguard-go@v0.0.0-20230722134753-17a50b2f22a3/tun/tcp_offload_linux.go (about)

     1  /* SPDX-License-Identifier: MIT
     2   *
     3   * Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
     4   */
     5  
     6  package tun
     7  
     8  import (
     9  	"bytes"
    10  	"encoding/binary"
    11  	"errors"
    12  	"io"
    13  	"unsafe"
    14  
    15  	"golang.org/x/sys/unix"
    16  	"github.com/koomox/wireguard-go/conn"
    17  )
    18  
    19  const tcpFlagsOffset = 13
    20  
    21  const (
    22  	tcpFlagFIN uint8 = 0x01
    23  	tcpFlagPSH uint8 = 0x08
    24  	tcpFlagACK uint8 = 0x10
    25  )
    26  
    27  // virtioNetHdr is defined in the kernel in include/uapi/linux/virtio_net.h. The
    28  // kernel symbol is virtio_net_hdr.
    29  type virtioNetHdr struct {
    30  	flags      uint8
    31  	gsoType    uint8
    32  	hdrLen     uint16
    33  	gsoSize    uint16
    34  	csumStart  uint16
    35  	csumOffset uint16
    36  }
    37  
    38  func (v *virtioNetHdr) decode(b []byte) error {
    39  	if len(b) < virtioNetHdrLen {
    40  		return io.ErrShortBuffer
    41  	}
    42  	copy(unsafe.Slice((*byte)(unsafe.Pointer(v)), virtioNetHdrLen), b[:virtioNetHdrLen])
    43  	return nil
    44  }
    45  
    46  func (v *virtioNetHdr) encode(b []byte) error {
    47  	if len(b) < virtioNetHdrLen {
    48  		return io.ErrShortBuffer
    49  	}
    50  	copy(b[:virtioNetHdrLen], unsafe.Slice((*byte)(unsafe.Pointer(v)), virtioNetHdrLen))
    51  	return nil
    52  }
    53  
    54  const (
    55  	// virtioNetHdrLen is the length in bytes of virtioNetHdr. This matches the
    56  	// shape of the C ABI for its kernel counterpart -- sizeof(virtio_net_hdr).
    57  	virtioNetHdrLen = int(unsafe.Sizeof(virtioNetHdr{}))
    58  )
    59  
    60  // flowKey represents the key for a flow.
    61  type flowKey struct {
    62  	srcAddr, dstAddr [16]byte
    63  	srcPort, dstPort uint16
    64  	rxAck            uint32 // varying ack values should not be coalesced. Treat them as separate flows.
    65  }
    66  
    67  // tcpGROTable holds flow and coalescing information for the purposes of GRO.
    68  type tcpGROTable struct {
    69  	itemsByFlow map[flowKey][]tcpGROItem
    70  	itemsPool   [][]tcpGROItem
    71  }
    72  
    73  func newTCPGROTable() *tcpGROTable {
    74  	t := &tcpGROTable{
    75  		itemsByFlow: make(map[flowKey][]tcpGROItem, conn.IdealBatchSize),
    76  		itemsPool:   make([][]tcpGROItem, conn.IdealBatchSize),
    77  	}
    78  	for i := range t.itemsPool {
    79  		t.itemsPool[i] = make([]tcpGROItem, 0, conn.IdealBatchSize)
    80  	}
    81  	return t
    82  }
    83  
    84  func newFlowKey(pkt []byte, srcAddr, dstAddr, tcphOffset int) flowKey {
    85  	key := flowKey{}
    86  	addrSize := dstAddr - srcAddr
    87  	copy(key.srcAddr[:], pkt[srcAddr:dstAddr])
    88  	copy(key.dstAddr[:], pkt[dstAddr:dstAddr+addrSize])
    89  	key.srcPort = binary.BigEndian.Uint16(pkt[tcphOffset:])
    90  	key.dstPort = binary.BigEndian.Uint16(pkt[tcphOffset+2:])
    91  	key.rxAck = binary.BigEndian.Uint32(pkt[tcphOffset+8:])
    92  	return key
    93  }
    94  
    95  // lookupOrInsert looks up a flow for the provided packet and metadata,
    96  // returning the packets found for the flow, or inserting a new one if none
    97  // is found.
    98  func (t *tcpGROTable) lookupOrInsert(pkt []byte, srcAddrOffset, dstAddrOffset, tcphOffset, tcphLen, bufsIndex int) ([]tcpGROItem, bool) {
    99  	key := newFlowKey(pkt, srcAddrOffset, dstAddrOffset, tcphOffset)
   100  	items, ok := t.itemsByFlow[key]
   101  	if ok {
   102  		return items, ok
   103  	}
   104  	// TODO: insert() performs another map lookup. This could be rearranged to avoid.
   105  	t.insert(pkt, srcAddrOffset, dstAddrOffset, tcphOffset, tcphLen, bufsIndex)
   106  	return nil, false
   107  }
   108  
   109  // insert an item in the table for the provided packet and packet metadata.
   110  func (t *tcpGROTable) insert(pkt []byte, srcAddrOffset, dstAddrOffset, tcphOffset, tcphLen, bufsIndex int) {
   111  	key := newFlowKey(pkt, srcAddrOffset, dstAddrOffset, tcphOffset)
   112  	item := tcpGROItem{
   113  		key:       key,
   114  		bufsIndex: uint16(bufsIndex),
   115  		gsoSize:   uint16(len(pkt[tcphOffset+tcphLen:])),
   116  		iphLen:    uint8(tcphOffset),
   117  		tcphLen:   uint8(tcphLen),
   118  		sentSeq:   binary.BigEndian.Uint32(pkt[tcphOffset+4:]),
   119  		pshSet:    pkt[tcphOffset+tcpFlagsOffset]&tcpFlagPSH != 0,
   120  	}
   121  	items, ok := t.itemsByFlow[key]
   122  	if !ok {
   123  		items = t.newItems()
   124  	}
   125  	items = append(items, item)
   126  	t.itemsByFlow[key] = items
   127  }
   128  
   129  func (t *tcpGROTable) updateAt(item tcpGROItem, i int) {
   130  	items, _ := t.itemsByFlow[item.key]
   131  	items[i] = item
   132  }
   133  
   134  func (t *tcpGROTable) deleteAt(key flowKey, i int) {
   135  	items, _ := t.itemsByFlow[key]
   136  	items = append(items[:i], items[i+1:]...)
   137  	t.itemsByFlow[key] = items
   138  }
   139  
   140  // tcpGROItem represents bookkeeping data for a TCP packet during the lifetime
   141  // of a GRO evaluation across a vector of packets.
   142  type tcpGROItem struct {
   143  	key       flowKey
   144  	sentSeq   uint32 // the sequence number
   145  	bufsIndex uint16 // the index into the original bufs slice
   146  	numMerged uint16 // the number of packets merged into this item
   147  	gsoSize   uint16 // payload size
   148  	iphLen    uint8  // ip header len
   149  	tcphLen   uint8  // tcp header len
   150  	pshSet    bool   // psh flag is set
   151  }
   152  
   153  func (t *tcpGROTable) newItems() []tcpGROItem {
   154  	var items []tcpGROItem
   155  	items, t.itemsPool = t.itemsPool[len(t.itemsPool)-1], t.itemsPool[:len(t.itemsPool)-1]
   156  	return items
   157  }
   158  
   159  func (t *tcpGROTable) reset() {
   160  	for k, items := range t.itemsByFlow {
   161  		items = items[:0]
   162  		t.itemsPool = append(t.itemsPool, items)
   163  		delete(t.itemsByFlow, k)
   164  	}
   165  }
   166  
   167  // canCoalesce represents the outcome of checking if two TCP packets are
   168  // candidates for coalescing.
   169  type canCoalesce int
   170  
   171  const (
   172  	coalescePrepend     canCoalesce = -1
   173  	coalesceUnavailable canCoalesce = 0
   174  	coalesceAppend      canCoalesce = 1
   175  )
   176  
   177  // tcpPacketsCanCoalesce evaluates if pkt can be coalesced with the packet
   178  // described by item. This function makes considerations that match the kernel's
   179  // GRO self tests, which can be found in tools/testing/selftests/net/gro.c.
   180  func tcpPacketsCanCoalesce(pkt []byte, iphLen, tcphLen uint8, seq uint32, pshSet bool, gsoSize uint16, item tcpGROItem, bufs [][]byte, bufsOffset int) canCoalesce {
   181  	pktTarget := bufs[item.bufsIndex][bufsOffset:]
   182  	if tcphLen != item.tcphLen {
   183  		// cannot coalesce with unequal tcp options len
   184  		return coalesceUnavailable
   185  	}
   186  	if tcphLen > 20 {
   187  		if !bytes.Equal(pkt[iphLen+20:iphLen+tcphLen], pktTarget[item.iphLen+20:iphLen+tcphLen]) {
   188  			// cannot coalesce with unequal tcp options
   189  			return coalesceUnavailable
   190  		}
   191  	}
   192  	if pkt[0]>>4 == 6 {
   193  		if pkt[0] != pktTarget[0] || pkt[1]>>4 != pktTarget[1]>>4 {
   194  			// cannot coalesce with unequal Traffic class values
   195  			return coalesceUnavailable
   196  		}
   197  		if pkt[7] != pktTarget[7] {
   198  			// cannot coalesce with unequal Hop limit values
   199  			return coalesceUnavailable
   200  		}
   201  	} else {
   202  		if pkt[1] != pktTarget[1] {
   203  			// cannot coalesce with unequal ToS values
   204  			return coalesceUnavailable
   205  		}
   206  		if pkt[6]>>5 != pktTarget[6]>>5 {
   207  			// cannot coalesce with unequal DF or reserved bits. MF is checked
   208  			// further up the stack.
   209  			return coalesceUnavailable
   210  		}
   211  		if pkt[8] != pktTarget[8] {
   212  			// cannot coalesce with unequal TTL values
   213  			return coalesceUnavailable
   214  		}
   215  	}
   216  	// seq adjacency
   217  	lhsLen := item.gsoSize
   218  	lhsLen += item.numMerged * item.gsoSize
   219  	if seq == item.sentSeq+uint32(lhsLen) { // pkt aligns following item from a seq num perspective
   220  		if item.pshSet {
   221  			// We cannot append to a segment that has the PSH flag set, PSH
   222  			// can only be set on the final segment in a reassembled group.
   223  			return coalesceUnavailable
   224  		}
   225  		if len(pktTarget[iphLen+tcphLen:])%int(item.gsoSize) != 0 {
   226  			// A smaller than gsoSize packet has been appended previously.
   227  			// Nothing can come after a smaller packet on the end.
   228  			return coalesceUnavailable
   229  		}
   230  		if gsoSize > item.gsoSize {
   231  			// We cannot have a larger packet following a smaller one.
   232  			return coalesceUnavailable
   233  		}
   234  		return coalesceAppend
   235  	} else if seq+uint32(gsoSize) == item.sentSeq { // pkt aligns in front of item from a seq num perspective
   236  		if pshSet {
   237  			// We cannot prepend with a segment that has the PSH flag set, PSH
   238  			// can only be set on the final segment in a reassembled group.
   239  			return coalesceUnavailable
   240  		}
   241  		if gsoSize < item.gsoSize {
   242  			// We cannot have a larger packet following a smaller one.
   243  			return coalesceUnavailable
   244  		}
   245  		if gsoSize > item.gsoSize && item.numMerged > 0 {
   246  			// There's at least one previous merge, and we're larger than all
   247  			// previous. This would put multiple smaller packets on the end.
   248  			return coalesceUnavailable
   249  		}
   250  		return coalescePrepend
   251  	}
   252  	return coalesceUnavailable
   253  }
   254  
   255  func tcpChecksumValid(pkt []byte, iphLen uint8, isV6 bool) bool {
   256  	srcAddrAt := ipv4SrcAddrOffset
   257  	addrSize := 4
   258  	if isV6 {
   259  		srcAddrAt = ipv6SrcAddrOffset
   260  		addrSize = 16
   261  	}
   262  	tcpTotalLen := uint16(len(pkt) - int(iphLen))
   263  	tcpCSumNoFold := pseudoHeaderChecksumNoFold(unix.IPPROTO_TCP, pkt[srcAddrAt:srcAddrAt+addrSize], pkt[srcAddrAt+addrSize:srcAddrAt+addrSize*2], tcpTotalLen)
   264  	return ^checksum(pkt[iphLen:], tcpCSumNoFold) == 0
   265  }
   266  
   267  // coalesceResult represents the result of attempting to coalesce two TCP
   268  // packets.
   269  type coalesceResult int
   270  
   271  const (
   272  	coalesceInsufficientCap coalesceResult = 0
   273  	coalescePSHEnding       coalesceResult = 1
   274  	coalesceItemInvalidCSum coalesceResult = 2
   275  	coalescePktInvalidCSum  coalesceResult = 3
   276  	coalesceSuccess         coalesceResult = 4
   277  )
   278  
// coalesceTCPPackets attempts to coalesce pkt with the packet described by
// item, returning the outcome. This function may swap bufs elements in the
// event of a prepend as item's bufs index is already being tracked for writing
// to a Device.
//
// mode selects the side: coalescePrepend places pkt in front of item's packet,
// any other value appends pkt's payload behind it. pktBuffsIndex is the index
// of pkt's buffer in bufs and bufsOffset is the offset inside each buffer at
// which packet data starts; the virtioNetHdr is written into the
// virtioNetHdrLen bytes immediately before that offset. gsoSize, seq and
// pshSet describe pkt's payload length, sequence number and PSH flag.
//
// Every non-success return happens before bufs or item is modified. On
// coalesceSuccess, item is updated through the pointer (sentSeq on prepend,
// pshSet on append, gsoSize, numMerged), the head packet's IP length field
// (and, for IPv4, header checksum) is rewritten, and the TCP checksum field is
// replaced by the folded pseudo-header sum for downstream checksum offload.
func coalesceTCPPackets(mode canCoalesce, pkt []byte, pktBuffsIndex int, gsoSize uint16, seq uint32, pshSet bool, item *tcpGROItem, bufs [][]byte, bufsOffset int, isV6 bool) coalesceResult {
	var pktHead []byte // the packet that will end up at the front
	// Combined IP + TCP header length, computed in uint8 arithmetic.
	headersLen := item.iphLen + item.tcphLen
	// Length of the merged packet: item's packet plus pkt minus one set of
	// headers.
	coalescedLen := len(bufs[item.bufsIndex][bufsOffset:]) + len(pkt) - int(headersLen)

	// Copy data
	if mode == coalescePrepend {
		pktHead = pkt
		if cap(pkt)-bufsOffset < coalescedLen {
			// We don't want to allocate a new underlying array if capacity is
			// too small.
			return coalesceInsufficientCap
		}
		if pshSet {
			return coalescePSHEnding
		}
		// After a successful merge the TCP checksum field of the head packet
		// holds only the pseudo-header sum (see the end of this function), so
		// item's checksum is validated only while it has never been merged.
		if item.numMerged == 0 {
			if !tcpChecksumValid(bufs[item.bufsIndex][bufsOffset:], item.iphLen, isV6) {
				return coalesceItemInvalidCSum
			}
		}
		if !tcpChecksumValid(pkt, item.iphLen, isV6) {
			return coalescePktInvalidCSum
		}
		// pkt becomes the first segment, so the item now starts at pkt's seq.
		item.sentSeq = seq
		extendBy := coalescedLen - len(pktHead)
		// Grow pkt's buffer (within its existing capacity, per the check
		// above) and copy item's payload and anything merged behind it after
		// pkt.
		bufs[pktBuffsIndex] = append(bufs[pktBuffsIndex], make([]byte, extendBy)...)
		copy(bufs[pktBuffsIndex][bufsOffset+len(pkt):], bufs[item.bufsIndex][bufsOffset+int(headersLen):])
		// Flip the slice headers in bufs as part of prepend. The index of item
		// is already being tracked for writing.
		bufs[item.bufsIndex], bufs[pktBuffsIndex] = bufs[pktBuffsIndex], bufs[item.bufsIndex]
	} else {
		pktHead = bufs[item.bufsIndex][bufsOffset:]
		if cap(pktHead)-bufsOffset < coalescedLen {
			// We don't want to allocate a new underlying array if capacity is
			// too small.
			return coalesceInsufficientCap
		}
		// Same reasoning as in the prepend branch: only a never-merged item
		// still carries a full TCP checksum that can be validated.
		if item.numMerged == 0 {
			if !tcpChecksumValid(bufs[item.bufsIndex][bufsOffset:], item.iphLen, isV6) {
				return coalesceItemInvalidCSum
			}
		}
		if !tcpChecksumValid(pkt, item.iphLen, isV6) {
			return coalescePktInvalidCSum
		}
		if pshSet {
			// We are appending a segment with PSH set.
			item.pshSet = pshSet
			pktHead[item.iphLen+tcpFlagsOffset] |= tcpFlagPSH
		}
		extendBy := len(pkt) - int(headersLen)
		// Grow item's buffer and copy pkt's payload (everything after its
		// headers) behind the existing data.
		bufs[item.bufsIndex] = append(bufs[item.bufsIndex], make([]byte, extendBy)...)
		copy(bufs[item.bufsIndex][bufsOffset+len(pktHead):], pkt[headersLen:])
	}

	// The item's gsoSize tracks the largest segment payload seen so far.
	if gsoSize > item.gsoSize {
		item.gsoSize = gsoSize
	}
	hdr := virtioNetHdr{
		flags:      unix.VIRTIO_NET_HDR_F_NEEDS_CSUM, // this turns into CHECKSUM_PARTIAL in the skb
		hdrLen:     uint16(headersLen),
		gsoSize:    uint16(item.gsoSize),
		csumStart:  uint16(item.iphLen),
		csumOffset: 16,
	}

	// Recalculate the total len (IPv4) or payload len (IPv6). Recalculate the
	// (IPv4) header checksum.
	if isV6 {
		hdr.gsoType = unix.VIRTIO_NET_HDR_GSO_TCPV6
		binary.BigEndian.PutUint16(pktHead[4:], uint16(coalescedLen)-uint16(item.iphLen)) // set new payload len
	} else {
		hdr.gsoType = unix.VIRTIO_NET_HDR_GSO_TCPV4
		pktHead[10], pktHead[11] = 0, 0                               // clear checksum field
		binary.BigEndian.PutUint16(pktHead[2:], uint16(coalescedLen)) // set new total length
		iphCSum := ^checksum(pktHead[:item.iphLen], 0)                // compute checksum
		binary.BigEndian.PutUint16(pktHead[10:], iphCSum)             // set checksum field
	}
	// Write the virtio header directly in front of the head packet. The error
	// returned by encode (short buffer) is not checked here.
	hdr.encode(bufs[item.bufsIndex][bufsOffset-virtioNetHdrLen:])

	// Calculate the pseudo header checksum and place it at the TCP checksum
	// offset. Downstream checksum offloading will combine this with computation
	// of the tcp header and payload checksum.
	addrLen := 4
	addrOffset := ipv4SrcAddrOffset
	if isV6 {
		addrLen = 16
		addrOffset = ipv6SrcAddrOffset
	}
	srcAddrAt := bufsOffset + addrOffset
	srcAddr := bufs[item.bufsIndex][srcAddrAt : srcAddrAt+addrLen]
	dstAddr := bufs[item.bufsIndex][srcAddrAt+addrLen : srcAddrAt+addrLen*2]
	psum := pseudoHeaderChecksumNoFold(unix.IPPROTO_TCP, srcAddr, dstAddr, uint16(coalescedLen-int(item.iphLen)))
	// checksum over an empty slice only folds psum down to 16 bits.
	binary.BigEndian.PutUint16(pktHead[hdr.csumStart+hdr.csumOffset:], checksum([]byte{}, psum))

	item.numMerged++
	return coalesceSuccess
}
   382  
   383  const (
   384  	ipv4FlagMoreFragments uint8 = 0x20
   385  )
   386  
   387  const (
   388  	ipv4SrcAddrOffset = 12
   389  	ipv6SrcAddrOffset = 8
   390  	maxUint16         = 1<<16 - 1
   391  )
   392  
// tcpGRO evaluates the TCP packet at pktI in bufs for coalescing with
// existing packets tracked in table. It will return false when pktI is not
// coalesced, otherwise true. This indicates to the caller if bufs[pktI]
// should be written to the Device.
//
// offset is the position within each buffer at which the IP packet starts and
// isV6 selects IPv6 parsing. The function indexes into the packet before all
// length checks are complete; handleGRO only calls it for buffers that passed
// isTCP4NoIPOptions or isTCP6NoEH, which enforce a minimum packet length.
// A packet that is a GRO candidate but could not be merged is recorded in
// table so later packets of the same flow may coalesce with it.
func tcpGRO(bufs [][]byte, offset int, pktI int, table *tcpGROTable, isV6 bool) (pktCoalesced bool) {
	pkt := bufs[pktI][offset:]
	if len(pkt) > maxUint16 {
		// A valid IPv4 or IPv6 packet will never exceed this.
		return false
	}
	// IPv4: header length comes from the IHL nibble (32-bit words). It is
	// overridden with the fixed 40 bytes for IPv6 below.
	iphLen := int((pkt[0] & 0x0F) * 4)
	if isV6 {
		iphLen = 40
		// The IPv6 payload length field must match the bytes actually present.
		ipv6HPayloadLen := int(binary.BigEndian.Uint16(pkt[4:]))
		if ipv6HPayloadLen != len(pkt)-iphLen {
			return false
		}
	} else {
		// The IPv4 total length field must match the bytes actually present.
		totalLen := int(binary.BigEndian.Uint16(pkt[2:]))
		if totalLen != len(pkt) {
			return false
		}
	}
	if len(pkt) < iphLen {
		return false
	}
	// TCP header length from the data offset nibble (32-bit words).
	tcphLen := int((pkt[iphLen+12] >> 4) * 4)
	if tcphLen < 20 || tcphLen > 60 {
		return false
	}
	if len(pkt) < iphLen+tcphLen {
		return false
	}
	if !isV6 {
		// Reject when MF is set or any fragment offset bit is non-zero
		// (pkt[6]<<3 discards the three flag bits of byte 6).
		if pkt[6]&ipv4FlagMoreFragments != 0 || pkt[6]<<3 != 0 || pkt[7] != 0 {
			// no GRO support for fragmented segments for now
			return false
		}
	}
	tcpFlags := pkt[iphLen+tcpFlagsOffset]
	var pshSet bool
	// not a candidate if any non-ACK flags (except PSH+ACK) are set
	if tcpFlags != tcpFlagACK {
		if pkt[iphLen+tcpFlagsOffset] != tcpFlagACK|tcpFlagPSH {
			return false
		}
		pshSet = true
	}
	// Payload length of this packet.
	gsoSize := uint16(len(pkt) - tcphLen - iphLen)
	// not a candidate if payload len is 0
	if gsoSize < 1 {
		return false
	}
	seq := binary.BigEndian.Uint32(pkt[iphLen+4:])
	srcAddrOffset := ipv4SrcAddrOffset
	addrLen := 4
	if isV6 {
		srcAddrOffset = ipv6SrcAddrOffset
		addrLen = 16
	}
	// For an unknown flow lookupOrInsert has already stored pkt as the flow's
	// first item, so there is nothing to coalesce with yet.
	items, existing := table.lookupOrInsert(pkt, srcAddrOffset, srcAddrOffset+addrLen, iphLen, tcphLen, pktI)
	if !existing {
		return false
	}
	for i := len(items) - 1; i >= 0; i-- {
		// In the best case of packets arriving in order iterating in reverse is
		// more efficient if there are multiple items for a given flow. This
		// also enables a natural table.deleteAt() in the
		// coalesceItemInvalidCSum case without the need for index tracking.
		// This algorithm makes a best effort to coalesce in the event of
		// unordered packets, where pkt may land anywhere in items from a
		// sequence number perspective, however once an item is inserted into
		// the table it is never compared across other items later.
		// item is a copy; coalesceTCPPackets mutates it through the pointer
		// and the result is written back with updateAt on success.
		item := items[i]
		can := tcpPacketsCanCoalesce(pkt, uint8(iphLen), uint8(tcphLen), seq, pshSet, gsoSize, item, bufs, offset)
		if can != coalesceUnavailable {
			result := coalesceTCPPackets(can, pkt, pktI, gsoSize, seq, pshSet, &item, bufs, offset, isV6)
			switch result {
			case coalesceSuccess:
				table.updateAt(item, i)
				return true
			case coalesceItemInvalidCSum:
				// delete the item with an invalid csum
				table.deleteAt(item.key, i)
			case coalescePktInvalidCSum:
				// no point in inserting an item that we can't coalesce
				return false
			default:
				// coalesceInsufficientCap / coalescePSHEnding: keep looking at
				// the remaining items.
			}
		}
	}
	// failed to coalesce with any other packets; store the item in the flow
	table.insert(pkt, srcAddrOffset, srcAddrOffset+addrLen, iphLen, tcphLen, pktI)
	return false
}
   488  
   489  func isTCP4NoIPOptions(b []byte) bool {
   490  	if len(b) < 40 {
   491  		return false
   492  	}
   493  	if b[0]>>4 != 4 {
   494  		return false
   495  	}
   496  	if b[0]&0x0F != 5 {
   497  		return false
   498  	}
   499  	if b[9] != unix.IPPROTO_TCP {
   500  		return false
   501  	}
   502  	return true
   503  }
   504  
   505  func isTCP6NoEH(b []byte) bool {
   506  	if len(b) < 60 {
   507  		return false
   508  	}
   509  	if b[0]>>4 != 6 {
   510  		return false
   511  	}
   512  	if b[6] != unix.IPPROTO_TCP {
   513  		return false
   514  	}
   515  	return true
   516  }
   517  
   518  // handleGRO evaluates bufs for GRO, and writes the indices of the resulting
   519  // packets into toWrite. toWrite, tcp4Table, and tcp6Table should initially be
   520  // empty (but non-nil), and are passed in to save allocs as the caller may reset
   521  // and recycle them across vectors of packets.
   522  func handleGRO(bufs [][]byte, offset int, tcp4Table, tcp6Table *tcpGROTable, toWrite *[]int) error {
   523  	for i := range bufs {
   524  		if offset < virtioNetHdrLen || offset > len(bufs[i])-1 {
   525  			return errors.New("invalid offset")
   526  		}
   527  		var coalesced bool
   528  		switch {
   529  		case isTCP4NoIPOptions(bufs[i][offset:]): // ipv4 packets w/IP options do not coalesce
   530  			coalesced = tcpGRO(bufs, offset, i, tcp4Table, false)
   531  		case isTCP6NoEH(bufs[i][offset:]): // ipv6 packets w/extension headers do not coalesce
   532  			coalesced = tcpGRO(bufs, offset, i, tcp6Table, true)
   533  		}
   534  		if !coalesced {
   535  			hdr := virtioNetHdr{}
   536  			err := hdr.encode(bufs[i][offset-virtioNetHdrLen:])
   537  			if err != nil {
   538  				return err
   539  			}
   540  			*toWrite = append(*toWrite, i)
   541  		}
   542  	}
   543  	return nil
   544  }
   545  
   546  // tcpTSO splits packets from in into outBuffs, writing the size of each
   547  // element into sizes. It returns the number of buffers populated, and/or an
   548  // error.
   549  func tcpTSO(in []byte, hdr virtioNetHdr, outBuffs [][]byte, sizes []int, outOffset int) (int, error) {
   550  	iphLen := int(hdr.csumStart)
   551  	srcAddrOffset := ipv6SrcAddrOffset
   552  	addrLen := 16
   553  	if hdr.gsoType == unix.VIRTIO_NET_HDR_GSO_TCPV4 {
   554  		in[10], in[11] = 0, 0 // clear ipv4 header checksum
   555  		srcAddrOffset = ipv4SrcAddrOffset
   556  		addrLen = 4
   557  	}
   558  	tcpCSumAt := int(hdr.csumStart + hdr.csumOffset)
   559  	in[tcpCSumAt], in[tcpCSumAt+1] = 0, 0 // clear tcp checksum
   560  	firstTCPSeqNum := binary.BigEndian.Uint32(in[hdr.csumStart+4:])
   561  	nextSegmentDataAt := int(hdr.hdrLen)
   562  	i := 0
   563  	for ; nextSegmentDataAt < len(in); i++ {
   564  		if i == len(outBuffs) {
   565  			return i - 1, ErrTooManySegments
   566  		}
   567  		nextSegmentEnd := nextSegmentDataAt + int(hdr.gsoSize)
   568  		if nextSegmentEnd > len(in) {
   569  			nextSegmentEnd = len(in)
   570  		}
   571  		segmentDataLen := nextSegmentEnd - nextSegmentDataAt
   572  		totalLen := int(hdr.hdrLen) + segmentDataLen
   573  		sizes[i] = totalLen
   574  		out := outBuffs[i][outOffset:]
   575  
   576  		copy(out, in[:iphLen])
   577  		if hdr.gsoType == unix.VIRTIO_NET_HDR_GSO_TCPV4 {
   578  			// For IPv4 we are responsible for incrementing the ID field,
   579  			// updating the total len field, and recalculating the header
   580  			// checksum.
   581  			if i > 0 {
   582  				id := binary.BigEndian.Uint16(out[4:])
   583  				id += uint16(i)
   584  				binary.BigEndian.PutUint16(out[4:], id)
   585  			}
   586  			binary.BigEndian.PutUint16(out[2:], uint16(totalLen))
   587  			ipv4CSum := ^checksum(out[:iphLen], 0)
   588  			binary.BigEndian.PutUint16(out[10:], ipv4CSum)
   589  		} else {
   590  			// For IPv6 we are responsible for updating the payload length field.
   591  			binary.BigEndian.PutUint16(out[4:], uint16(totalLen-iphLen))
   592  		}
   593  
   594  		// TCP header
   595  		copy(out[hdr.csumStart:hdr.hdrLen], in[hdr.csumStart:hdr.hdrLen])
   596  		tcpSeq := firstTCPSeqNum + uint32(hdr.gsoSize*uint16(i))
   597  		binary.BigEndian.PutUint32(out[hdr.csumStart+4:], tcpSeq)
   598  		if nextSegmentEnd != len(in) {
   599  			// FIN and PSH should only be set on last segment
   600  			clearFlags := tcpFlagFIN | tcpFlagPSH
   601  			out[hdr.csumStart+tcpFlagsOffset] &^= clearFlags
   602  		}
   603  
   604  		// payload
   605  		copy(out[hdr.hdrLen:], in[nextSegmentDataAt:nextSegmentEnd])
   606  
   607  		// TCP checksum
   608  		tcpHLen := int(hdr.hdrLen - hdr.csumStart)
   609  		tcpLenForPseudo := uint16(tcpHLen + segmentDataLen)
   610  		tcpCSumNoFold := pseudoHeaderChecksumNoFold(unix.IPPROTO_TCP, in[srcAddrOffset:srcAddrOffset+addrLen], in[srcAddrOffset+addrLen:srcAddrOffset+addrLen*2], tcpLenForPseudo)
   611  		tcpCSum := ^checksum(out[hdr.csumStart:totalLen], tcpCSumNoFold)
   612  		binary.BigEndian.PutUint16(out[hdr.csumStart+hdr.csumOffset:], tcpCSum)
   613  
   614  		nextSegmentDataAt += int(hdr.gsoSize)
   615  	}
   616  	return i, nil
   617  }
   618  
   619  func gsoNoneChecksum(in []byte, cSumStart, cSumOffset uint16) error {
   620  	cSumAt := cSumStart + cSumOffset
   621  	// The initial value at the checksum offset should be summed with the
   622  	// checksum we compute. This is typically the pseudo-header checksum.
   623  	initial := binary.BigEndian.Uint16(in[cSumAt:])
   624  	in[cSumAt], in[cSumAt+1] = 0, 0
   625  	binary.BigEndian.PutUint16(in[cSumAt:], ^checksum(in[cSumStart:], uint64(initial)))
   626  	return nil
   627  }