github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/tcpip/link/fdbased/packet_dispatchers.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 //go:build linux 16 // +build linux 17 18 package fdbased 19 20 import ( 21 "golang.org/x/sys/unix" 22 "github.com/metacubex/gvisor/pkg/buffer" 23 "github.com/metacubex/gvisor/pkg/tcpip" 24 "github.com/metacubex/gvisor/pkg/tcpip/header" 25 "github.com/metacubex/gvisor/pkg/tcpip/link/rawfile" 26 "github.com/metacubex/gvisor/pkg/tcpip/link/stopfd" 27 "github.com/metacubex/gvisor/pkg/tcpip/stack" 28 ) 29 30 // BufConfig defines the shape of the buffer used to read packets from the NIC. 31 var BufConfig = []int{128, 256, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768} 32 33 type iovecBuffer struct { 34 // buffer is the actual buffer that holds the packet contents. Some contents 35 // are reused across calls to pullBuffer if number of requested bytes is 36 // smaller than the number of bytes allocated in the buffer. 37 views []*buffer.View 38 39 // iovecs are initialized with base pointers/len of the corresponding 40 // entries in the views defined above, except when GSO is enabled 41 // (skipsVnetHdr) then the first iovec points to a buffer for the vnet header 42 // which is stripped before the views are passed up the stack for further 43 // processing. 44 iovecs []unix.Iovec 45 46 // sizes is an array of buffer sizes for the underlying views. sizes is 47 // immutable. 48 sizes []int 49 50 // skipsVnetHdr is true if virtioNetHdr is to skipped. 51 skipsVnetHdr bool 52 53 // pulledIndex is the index of the last []byte buffer pulled from the 54 // underlying buffer storage during a call to pullBuffers. It is -1 55 // if no buffer is pulled. 56 pulledIndex int 57 } 58 59 func newIovecBuffer(sizes []int, skipsVnetHdr bool) *iovecBuffer { 60 b := &iovecBuffer{ 61 views: make([]*buffer.View, len(sizes)), 62 sizes: sizes, 63 skipsVnetHdr: skipsVnetHdr, 64 } 65 niov := len(b.views) 66 if b.skipsVnetHdr { 67 niov++ 68 } 69 b.iovecs = make([]unix.Iovec, niov) 70 return b 71 } 72 73 func (b *iovecBuffer) nextIovecs() []unix.Iovec { 74 vnetHdrOff := 0 75 if b.skipsVnetHdr { 76 var vnetHdr [virtioNetHdrSize]byte 77 // The kernel adds virtioNetHdr before each packet, but 78 // we don't use it, so we allocate a buffer for it, 79 // add it in iovecs but don't add it in a view. 80 b.iovecs[0] = unix.Iovec{Base: &vnetHdr[0]} 81 b.iovecs[0].SetLen(virtioNetHdrSize) 82 vnetHdrOff++ 83 } 84 85 for i := range b.views { 86 if b.views[i] != nil { 87 break 88 } 89 v := buffer.NewViewSize(b.sizes[i]) 90 b.views[i] = v 91 b.iovecs[i+vnetHdrOff] = unix.Iovec{Base: v.BasePtr()} 92 b.iovecs[i+vnetHdrOff].SetLen(v.Size()) 93 } 94 return b.iovecs 95 } 96 97 // pullBuffer extracts the enough underlying storage from b.buffer to hold n 98 // bytes. It removes this storage from b.buffer, returns a new buffer 99 // that holds the storage, and updates pulledIndex to indicate which part 100 // of b.buffer's storage must be reallocated during the next call to 101 // nextIovecs. 102 func (b *iovecBuffer) pullBuffer(n int) buffer.Buffer { 103 var views []*buffer.View 104 c := 0 105 if b.skipsVnetHdr { 106 c += virtioNetHdrSize 107 if c >= n { 108 // Nothing in the packet. 109 return buffer.Buffer{} 110 } 111 } 112 // Remove the used views from the buffer. 113 for i, v := range b.views { 114 c += v.Size() 115 if c >= n { 116 b.views[i].CapLength(v.Size() - (c - n)) 117 views = append(views, b.views[:i+1]...) 118 break 119 } 120 } 121 for i := range views { 122 b.views[i] = nil 123 } 124 if b.skipsVnetHdr { 125 // Exclude the size of the vnet header. 126 n -= virtioNetHdrSize 127 } 128 pulled := buffer.Buffer{} 129 for _, v := range views { 130 pulled.Append(v) 131 } 132 pulled.Truncate(int64(n)) 133 return pulled 134 } 135 136 func (b *iovecBuffer) release() { 137 for _, v := range b.views { 138 if v != nil { 139 v.Release() 140 v = nil 141 } 142 } 143 } 144 145 // readVDispatcher uses readv() system call to read inbound packets and 146 // dispatches them. 147 type readVDispatcher struct { 148 stopfd.StopFD 149 // fd is the file descriptor used to send and receive packets. 150 fd int 151 152 // e is the endpoint this dispatcher is attached to. 153 e *endpoint 154 155 // buf is the iovec buffer that contains the packet contents. 156 buf *iovecBuffer 157 } 158 159 func newReadVDispatcher(fd int, e *endpoint) (linkDispatcher, error) { 160 stopFD, err := stopfd.New() 161 if err != nil { 162 return nil, err 163 } 164 d := &readVDispatcher{ 165 StopFD: stopFD, 166 fd: fd, 167 e: e, 168 } 169 skipsVnetHdr := d.e.gsoKind == stack.HostGSOSupported 170 d.buf = newIovecBuffer(BufConfig, skipsVnetHdr) 171 return d, nil 172 } 173 174 func (d *readVDispatcher) release() { 175 d.buf.release() 176 } 177 178 // dispatch reads one packet from the file descriptor and dispatches it. 179 func (d *readVDispatcher) dispatch() (bool, tcpip.Error) { 180 n, err := rawfile.BlockingReadvUntilStopped(d.EFD, d.fd, d.buf.nextIovecs()) 181 if n <= 0 || err != nil { 182 return false, err 183 } 184 185 pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ 186 Payload: d.buf.pullBuffer(n), 187 }) 188 defer pkt.DecRef() 189 190 var p tcpip.NetworkProtocolNumber 191 if d.e.hdrSize > 0 { 192 if !d.e.parseHeader(pkt) { 193 return false, nil 194 } 195 p = header.Ethernet(pkt.LinkHeader().Slice()).Type() 196 } else { 197 // We don't get any indication of what the packet is, so try to guess 198 // if it's an IPv4 or IPv6 packet. 199 // IP version information is at the first octet, so pulling up 1 byte. 200 h, ok := pkt.Data().PullUp(1) 201 if !ok { 202 return true, nil 203 } 204 switch header.IPVersion(h) { 205 case header.IPv4Version: 206 p = header.IPv4ProtocolNumber 207 case header.IPv6Version: 208 p = header.IPv6ProtocolNumber 209 default: 210 return true, nil 211 } 212 } 213 214 d.e.mu.RLock() 215 dsp := d.e.dispatcher 216 d.e.mu.RUnlock() 217 dsp.DeliverNetworkPacket(p, pkt) 218 219 return true, nil 220 } 221 222 // recvMMsgDispatcher uses the recvmmsg system call to read inbound packets and 223 // dispatches them. 224 type recvMMsgDispatcher struct { 225 stopfd.StopFD 226 // fd is the file descriptor used to send and receive packets. 227 fd int 228 229 // e is the endpoint this dispatcher is attached to. 230 e *endpoint 231 232 // bufs is an array of iovec buffers that contain packet contents. 233 bufs []*iovecBuffer 234 235 // msgHdrs is an array of MMsgHdr objects where each MMsghdr is used to 236 // reference an array of iovecs in the iovecs field defined above. This 237 // array is passed as the parameter to recvmmsg call to retrieve 238 // potentially more than 1 packet per unix. 239 msgHdrs []rawfile.MMsgHdr 240 } 241 242 const ( 243 // MaxMsgsPerRecv is the maximum number of packets we want to retrieve 244 // in a single RecvMMsg call. 245 MaxMsgsPerRecv = 8 246 ) 247 248 func newRecvMMsgDispatcher(fd int, e *endpoint) (linkDispatcher, error) { 249 stopFD, err := stopfd.New() 250 if err != nil { 251 return nil, err 252 } 253 d := &recvMMsgDispatcher{ 254 StopFD: stopFD, 255 fd: fd, 256 e: e, 257 bufs: make([]*iovecBuffer, MaxMsgsPerRecv), 258 msgHdrs: make([]rawfile.MMsgHdr, MaxMsgsPerRecv), 259 } 260 skipsVnetHdr := d.e.gsoKind == stack.HostGSOSupported 261 for i := range d.bufs { 262 d.bufs[i] = newIovecBuffer(BufConfig, skipsVnetHdr) 263 } 264 return d, nil 265 } 266 267 func (d *recvMMsgDispatcher) release() { 268 for _, iov := range d.bufs { 269 iov.release() 270 } 271 } 272 273 // recvMMsgDispatch reads more than one packet at a time from the file 274 // descriptor and dispatches it. 275 func (d *recvMMsgDispatcher) dispatch() (bool, tcpip.Error) { 276 // Fill message headers. 277 for k := range d.msgHdrs { 278 if d.msgHdrs[k].Msg.Iovlen > 0 { 279 break 280 } 281 iovecs := d.bufs[k].nextIovecs() 282 iovLen := len(iovecs) 283 d.msgHdrs[k].Len = 0 284 d.msgHdrs[k].Msg.Iov = &iovecs[0] 285 d.msgHdrs[k].Msg.SetIovlen(iovLen) 286 } 287 288 nMsgs, err := rawfile.BlockingRecvMMsgUntilStopped(d.EFD, d.fd, d.msgHdrs) 289 if nMsgs == -1 || err != nil { 290 return false, err 291 } 292 // Process each of received packets. 293 // Keep a list of packets so we can DecRef outside of the loop. 294 var pkts stack.PacketBufferList 295 296 d.e.mu.RLock() 297 dsp := d.e.dispatcher 298 d.e.mu.RUnlock() 299 300 defer func() { pkts.DecRef() }() 301 for k := 0; k < nMsgs; k++ { 302 n := int(d.msgHdrs[k].Len) 303 pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ 304 Payload: d.bufs[k].pullBuffer(n), 305 }) 306 pkts.PushBack(pkt) 307 308 // Mark that this iovec has been processed. 309 d.msgHdrs[k].Msg.Iovlen = 0 310 311 var p tcpip.NetworkProtocolNumber 312 if d.e.hdrSize > 0 { 313 hdr, ok := pkt.LinkHeader().Consume(d.e.hdrSize) 314 if !ok { 315 return false, nil 316 } 317 p = header.Ethernet(hdr).Type() 318 } else { 319 // We don't get any indication of what the packet is, so try to guess 320 // if it's an IPv4 or IPv6 packet. 321 // IP version information is at the first octet, so pulling up 1 byte. 322 h, ok := pkt.Data().PullUp(1) 323 if !ok { 324 // Skip this packet. 325 continue 326 } 327 switch header.IPVersion(h) { 328 case header.IPv4Version: 329 p = header.IPv4ProtocolNumber 330 case header.IPv6Version: 331 p = header.IPv6ProtocolNumber 332 default: 333 // Skip this packet. 334 continue 335 } 336 } 337 338 dsp.DeliverNetworkPacket(p, pkt) 339 } 340 341 return true, nil 342 }