gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/tcpip/link/fdbased/packet_dispatchers.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 //go:build linux 16 // +build linux 17 18 package fdbased 19 20 import ( 21 "golang.org/x/sys/unix" 22 "gvisor.dev/gvisor/pkg/buffer" 23 "gvisor.dev/gvisor/pkg/tcpip" 24 "gvisor.dev/gvisor/pkg/tcpip/header" 25 "gvisor.dev/gvisor/pkg/tcpip/link/rawfile" 26 "gvisor.dev/gvisor/pkg/tcpip/link/stopfd" 27 "gvisor.dev/gvisor/pkg/tcpip/stack" 28 "gvisor.dev/gvisor/pkg/tcpip/stack/gro" 29 ) 30 31 // BufConfig defines the shape of the buffer used to read packets from the NIC. 32 var BufConfig = []int{128, 256, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768} 33 34 type iovecBuffer struct { 35 // buffer is the actual buffer that holds the packet contents. Some contents 36 // are reused across calls to pullBuffer if number of requested bytes is 37 // smaller than the number of bytes allocated in the buffer. 38 views []*buffer.View 39 40 // iovecs are initialized with base pointers/len of the corresponding 41 // entries in the views defined above, except when GSO is enabled 42 // (skipsVnetHdr) then the first iovec points to a buffer for the vnet header 43 // which is stripped before the views are passed up the stack for further 44 // processing. 45 iovecs []unix.Iovec 46 47 // sizes is an array of buffer sizes for the underlying views. sizes is 48 // immutable. 49 sizes []int 50 51 // skipsVnetHdr is true if virtioNetHdr is to skipped. 52 skipsVnetHdr bool 53 54 // pulledIndex is the index of the last []byte buffer pulled from the 55 // underlying buffer storage during a call to pullBuffers. It is -1 56 // if no buffer is pulled. 57 pulledIndex int 58 } 59 60 func newIovecBuffer(sizes []int, skipsVnetHdr bool) *iovecBuffer { 61 b := &iovecBuffer{ 62 views: make([]*buffer.View, len(sizes)), 63 sizes: sizes, 64 skipsVnetHdr: skipsVnetHdr, 65 } 66 niov := len(b.views) 67 if b.skipsVnetHdr { 68 niov++ 69 } 70 b.iovecs = make([]unix.Iovec, niov) 71 return b 72 } 73 74 func (b *iovecBuffer) nextIovecs() []unix.Iovec { 75 vnetHdrOff := 0 76 if b.skipsVnetHdr { 77 var vnetHdr [virtioNetHdrSize]byte 78 // The kernel adds virtioNetHdr before each packet, but 79 // we don't use it, so we allocate a buffer for it, 80 // add it in iovecs but don't add it in a view. 81 b.iovecs[0] = unix.Iovec{Base: &vnetHdr[0]} 82 b.iovecs[0].SetLen(virtioNetHdrSize) 83 vnetHdrOff++ 84 } 85 86 for i := range b.views { 87 if b.views[i] != nil { 88 break 89 } 90 v := buffer.NewViewSize(b.sizes[i]) 91 b.views[i] = v 92 b.iovecs[i+vnetHdrOff] = unix.Iovec{Base: v.BasePtr()} 93 b.iovecs[i+vnetHdrOff].SetLen(v.Size()) 94 } 95 return b.iovecs 96 } 97 98 // pullBuffer extracts the enough underlying storage from b.buffer to hold n 99 // bytes. It removes this storage from b.buffer, returns a new buffer 100 // that holds the storage, and updates pulledIndex to indicate which part 101 // of b.buffer's storage must be reallocated during the next call to 102 // nextIovecs. 103 func (b *iovecBuffer) pullBuffer(n int) buffer.Buffer { 104 var views []*buffer.View 105 c := 0 106 if b.skipsVnetHdr { 107 c += virtioNetHdrSize 108 if c >= n { 109 // Nothing in the packet. 110 return buffer.Buffer{} 111 } 112 } 113 // Remove the used views from the buffer. 114 for i, v := range b.views { 115 c += v.Size() 116 if c >= n { 117 b.views[i].CapLength(v.Size() - (c - n)) 118 views = append(views, b.views[:i+1]...) 119 break 120 } 121 } 122 for i := range views { 123 b.views[i] = nil 124 } 125 if b.skipsVnetHdr { 126 // Exclude the size of the vnet header. 127 n -= virtioNetHdrSize 128 } 129 pulled := buffer.Buffer{} 130 for _, v := range views { 131 pulled.Append(v) 132 } 133 pulled.Truncate(int64(n)) 134 return pulled 135 } 136 137 func (b *iovecBuffer) release() { 138 for _, v := range b.views { 139 if v != nil { 140 v.Release() 141 v = nil 142 } 143 } 144 } 145 146 // readVDispatcher uses readv() system call to read inbound packets and 147 // dispatches them. 148 type readVDispatcher struct { 149 stopfd.StopFD 150 // fd is the file descriptor used to send and receive packets. 151 fd int 152 153 // e is the endpoint this dispatcher is attached to. 154 e *endpoint 155 156 // buf is the iovec buffer that contains the packet contents. 157 buf *iovecBuffer 158 159 // mgr is the processor goroutine manager. 160 mgr *processorManager 161 } 162 163 func newReadVDispatcher(fd int, e *endpoint, opts *Options) (linkDispatcher, error) { 164 stopFD, err := stopfd.New() 165 if err != nil { 166 return nil, err 167 } 168 d := &readVDispatcher{ 169 StopFD: stopFD, 170 fd: fd, 171 e: e, 172 } 173 skipsVnetHdr := d.e.gsoKind == stack.HostGSOSupported 174 d.buf = newIovecBuffer(BufConfig, skipsVnetHdr) 175 d.mgr = newProcessorManager(opts, e) 176 d.mgr.start() 177 return d, nil 178 } 179 180 func (d *readVDispatcher) release() { 181 d.buf.release() 182 d.mgr.close() 183 } 184 185 // dispatch reads one packet from the file descriptor and dispatches it. 186 func (d *readVDispatcher) dispatch() (bool, tcpip.Error) { 187 n, err := rawfile.BlockingReadvUntilStopped(d.EFD, d.fd, d.buf.nextIovecs()) 188 if n <= 0 || err != nil { 189 return false, err 190 } 191 192 pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ 193 Payload: d.buf.pullBuffer(n), 194 }) 195 defer pkt.DecRef() 196 197 if d.e.hdrSize > 0 { 198 if !d.e.parseHeader(pkt) { 199 return false, nil 200 } 201 pkt.NetworkProtocolNumber = header.Ethernet(pkt.LinkHeader().Slice()).Type() 202 } 203 d.mgr.queuePacket(pkt, d.e.hdrSize > 0) 204 d.mgr.wakeReady() 205 return true, nil 206 } 207 208 // recvMMsgDispatcher uses the recvmmsg system call to read inbound packets and 209 // dispatches them. 210 type recvMMsgDispatcher struct { 211 stopfd.StopFD 212 // fd is the file descriptor used to send and receive packets. 213 fd int 214 215 // e is the endpoint this dispatcher is attached to. 216 e *endpoint 217 218 // bufs is an array of iovec buffers that contain packet contents. 219 bufs []*iovecBuffer 220 221 // msgHdrs is an array of MMsgHdr objects where each MMsghdr is used to 222 // reference an array of iovecs in the iovecs field defined above. This 223 // array is passed as the parameter to recvmmsg call to retrieve 224 // potentially more than 1 packet per unix. 225 msgHdrs []rawfile.MMsgHdr 226 227 // pkts is reused to avoid allocations. 228 pkts stack.PacketBufferList 229 230 // gro coalesces incoming packets to increase throughput. 231 gro gro.GRO 232 233 // mgr is the processor goroutine manager. 234 mgr *processorManager 235 } 236 237 const ( 238 // MaxMsgsPerRecv is the maximum number of packets we want to retrieve 239 // in a single RecvMMsg call. 240 MaxMsgsPerRecv = 8 241 ) 242 243 func newRecvMMsgDispatcher(fd int, e *endpoint, opts *Options) (linkDispatcher, error) { 244 stopFD, err := stopfd.New() 245 if err != nil { 246 return nil, err 247 } 248 d := &recvMMsgDispatcher{ 249 StopFD: stopFD, 250 fd: fd, 251 e: e, 252 bufs: make([]*iovecBuffer, MaxMsgsPerRecv), 253 msgHdrs: make([]rawfile.MMsgHdr, MaxMsgsPerRecv), 254 } 255 skipsVnetHdr := d.e.gsoKind == stack.HostGSOSupported 256 for i := range d.bufs { 257 d.bufs[i] = newIovecBuffer(BufConfig, skipsVnetHdr) 258 } 259 d.gro.Init(opts.GRO) 260 d.mgr = newProcessorManager(opts, e) 261 d.mgr.start() 262 263 return d, nil 264 } 265 266 func (d *recvMMsgDispatcher) release() { 267 for _, iov := range d.bufs { 268 iov.release() 269 } 270 d.mgr.close() 271 } 272 273 // recvMMsgDispatch reads more than one packet at a time from the file 274 // descriptor and dispatches it. 275 func (d *recvMMsgDispatcher) dispatch() (bool, tcpip.Error) { 276 // Fill message headers. 277 for k := range d.msgHdrs { 278 if d.msgHdrs[k].Msg.Iovlen > 0 { 279 break 280 } 281 iovecs := d.bufs[k].nextIovecs() 282 iovLen := len(iovecs) 283 d.msgHdrs[k].Len = 0 284 d.msgHdrs[k].Msg.Iov = &iovecs[0] 285 d.msgHdrs[k].Msg.SetIovlen(iovLen) 286 } 287 288 nMsgs, err := rawfile.BlockingRecvMMsgUntilStopped(d.EFD, d.fd, d.msgHdrs) 289 if nMsgs == -1 || err != nil { 290 return false, err 291 } 292 293 // Process each of received packets. 294 295 d.e.mu.RLock() 296 dsp := d.e.dispatcher 297 d.e.mu.RUnlock() 298 299 d.gro.Dispatcher = dsp 300 defer d.pkts.Reset() 301 302 for k := 0; k < nMsgs; k++ { 303 n := int(d.msgHdrs[k].Len) 304 pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ 305 Payload: d.bufs[k].pullBuffer(n), 306 }) 307 d.pkts.PushBack(pkt) 308 309 // Mark that this iovec has been processed. 310 d.msgHdrs[k].Msg.Iovlen = 0 311 312 if d.e.hdrSize > 0 { 313 hdr, ok := pkt.LinkHeader().Consume(d.e.hdrSize) 314 if !ok { 315 return false, nil 316 } 317 pkt.NetworkProtocolNumber = header.Ethernet(hdr).Type() 318 } 319 pkt.RXChecksumValidated = d.e.caps&stack.CapabilityRXChecksumOffload != 0 320 d.mgr.queuePacket(pkt, d.e.hdrSize > 0) 321 } 322 d.mgr.wakeReady() 323 324 return true, nil 325 }