github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/tcpip/link/sharedmem/sharedmem.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build linux
// +build linux

// Package sharedmem provides the implementation of data-link layer endpoints
// backed by shared memory.
//
// Shared memory endpoints can be used in the networking stack by calling New()
// to create a new endpoint, and then passing it as an argument to
// Stack.CreateNIC().
package sharedmem

import (
	"fmt"

	"github.com/metacubex/gvisor/pkg/atomicbitops"
	"github.com/metacubex/gvisor/pkg/buffer"
	"github.com/metacubex/gvisor/pkg/eventfd"
	"github.com/metacubex/gvisor/pkg/log"
	"github.com/metacubex/gvisor/pkg/sync"
	"github.com/metacubex/gvisor/pkg/tcpip"
	"github.com/metacubex/gvisor/pkg/tcpip/header"
	"github.com/metacubex/gvisor/pkg/tcpip/link/rawfile"
	"github.com/metacubex/gvisor/pkg/tcpip/link/sharedmem/queue"
	"github.com/metacubex/gvisor/pkg/tcpip/stack"
)

// QueueConfig holds all the file descriptors needed to describe a tx or rx
// queue over shared memory. It is used when creating new shared memory
// endpoints to describe tx and rx queues.
type QueueConfig struct {
	// DataFD is a file descriptor for the file that contains the data to
	// be transmitted via this queue. Descriptors contain offsets within
	// this file.
	DataFD int

	// EventFD is a file descriptor for the event that is signaled when
	// data becomes available in this queue.
	EventFD eventfd.Eventfd

	// TxPipeFD is a file descriptor for the tx pipe associated with the
	// queue.
	TxPipeFD int

	// RxPipeFD is a file descriptor for the rx pipe associated with the
	// queue.
	RxPipeFD int

	// SharedDataFD is a file descriptor for the file that contains shared
	// state between the two ends of the queue. This data specifies, for
	// example, whether EventFD signaling is enabled or disabled.
	SharedDataFD int
}

// FDs returns the FDs in the QueueConfig as a slice of ints. It must be used
// in conjunction with QueueConfigFromFDs so that the order of FDs matches
// when the config is reconstructed after being serialized or sent as part of
// a control message.
func (q *QueueConfig) FDs() []int {
	return []int{q.DataFD, q.EventFD.FD(), q.TxPipeFD, q.RxPipeFD, q.SharedDataFD}
}

// QueueConfigFromFDs constructs a QueueConfig out of a slice of ints where
// each entry represents a file descriptor. The FDs in the slice must be in
// the order specified below for the config to be valid. QueueConfig.FDs()
// should be used when the config needs to be serialized or sent as part of a
// control message to ensure the correct order.
func QueueConfigFromFDs(fds []int) (QueueConfig, error) {
	if len(fds) != 5 {
		return QueueConfig{}, fmt.Errorf("insufficient number of fds: len(fds): %d, want: 5", len(fds))
	}
	return QueueConfig{
		DataFD:       fds[0],
		EventFD:      eventfd.Wrap(fds[1]),
		TxPipeFD:     fds[2],
		RxPipeFD:     fds[3],
		SharedDataFD: fds[4],
	}, nil
}
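// exampleQueueConfigRoundTrip is an illustrative sketch, not part of the
// upstream API: it shows the contract documented on FDs and
// QueueConfigFromFDs, namely that the 5-entry slice produced by FDs can be
// serialized or sent as part of a control message and reassembled on the
// other side in the same order.
func exampleQueueConfigRoundTrip(q QueueConfig) (QueueConfig, error) {
	fds := q.FDs() // Always 5 entries, in the order QueueConfigFromFDs expects.
	return QueueConfigFromFDs(fds)
}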
// Options specify the details about the sharedmem endpoint to be created.
type Options struct {
	// MTU is the mtu to use for this endpoint.
	MTU uint32

	// BufferSize is the size of each scatter/gather buffer that will hold packet
	// data.
	//
	// NOTE: This directly determines the number of packets that can be held in
	// the ring buffer at any time. It does not have to be sized to the MTU, as
	// the shared memory queue design allows more than one buffer to be used to
	// make up a given packet.
	BufferSize uint32

	// LinkAddress is the link address for this endpoint (required).
	LinkAddress tcpip.LinkAddress

	// TX is the transmit queue configuration for this shared memory endpoint.
	TX QueueConfig

	// RX is the receive queue configuration for this shared memory endpoint.
	RX QueueConfig

	// PeerFD is the fd for the connected peer which can be used to detect
	// peer disconnects.
	PeerFD int

	// OnClosed is a function that is called when the endpoint is being closed
	// (probably due to the peer going away).
	OnClosed func(err tcpip.Error)

	// TXChecksumOffload, if true, indicates that this endpoint's capability
	// set should include CapabilityTXChecksumOffload.
	TXChecksumOffload bool

	// RXChecksumOffload, if true, indicates that this endpoint's capability
	// set should include CapabilityRXChecksumOffload.
	RXChecksumOffload bool

	// VirtioNetHeaderRequired, if true, indicates that all outbound and
	// inbound packets should carry a virtio header.
	VirtioNetHeaderRequired bool

	// GSOMaxSize is the maximum GSO packet size. It is zero if GSO is
	// disabled. Note that only gVisor GSO is supported, not host GSO.
	GSOMaxSize uint32
}

var _ stack.LinkEndpoint = (*endpoint)(nil)
var _ stack.GSOEndpoint = (*endpoint)(nil)

type endpoint struct {
	// mtu (maximum transmission unit) is the maximum size of a packet.
	// mtu is immutable.
	mtu uint32

	// bufferSize is the size of each individual buffer.
	// bufferSize is immutable.
	bufferSize uint32

	// addr is the local address of this endpoint.
	// addr is immutable.
	addr tcpip.LinkAddress

	// peerFD is an fd to the peer that can be used to detect when the
	// peer is gone.
	// peerFD is immutable.
	peerFD int

	// caps holds the endpoint capabilities.
	caps stack.LinkEndpointCapabilities

	// hdrSize is the size of the link layer header, if any.
	// hdrSize is immutable.
	hdrSize uint32

	// gsoMaxSize is the maximum GSO packet size. It is zero if GSO is
	// disabled. Note that only gVisor GSO is supported, not host GSO.
	// gsoMaxSize is immutable.
	gsoMaxSize uint32

	// virtioNetHeaderRequired, if true, indicates that a virtio header is
	// expected in all inbound/outbound packets.
	virtioNetHeaderRequired bool

	// rx is the receive queue.
	rx rx

	// stopRequested determines whether the worker goroutines should stop.
	stopRequested atomicbitops.Uint32

	// Wait group used to indicate that all workers have stopped.
	completed sync.WaitGroup

	// onClosed is a function to be called when the FD's peer (if any) closes
	// its end of the communication pipe.
	onClosed func(tcpip.Error)

	// mu protects the following fields.
	mu sync.Mutex

	// tx is the transmit queue.
	// +checklocks:mu
	tx tx

	// workerStarted specifies whether the worker goroutine was started.
	// +checklocks:mu
	workerStarted bool
}

// New creates a new shared-memory-based endpoint. Packet data is broken up
// into buffers of "bufferSize" bytes.
//
// In order to release all resources held by the returned endpoint, Close()
// must be called followed by Wait().
func New(opts Options) (stack.LinkEndpoint, error) {
	e := &endpoint{
		mtu:                     opts.MTU,
		bufferSize:              opts.BufferSize,
		addr:                    opts.LinkAddress,
		peerFD:                  opts.PeerFD,
		onClosed:                opts.OnClosed,
		virtioNetHeaderRequired: opts.VirtioNetHeaderRequired,
		gsoMaxSize:              opts.GSOMaxSize,
	}

	if err := e.tx.init(opts.BufferSize, &opts.TX); err != nil {
		return nil, err
	}

	if err := e.rx.init(opts.BufferSize, &opts.RX); err != nil {
		e.tx.cleanup()
		return nil, err
	}

	e.caps = stack.LinkEndpointCapabilities(0)
	if opts.RXChecksumOffload {
		e.caps |= stack.CapabilityRXChecksumOffload
	}

	if opts.TXChecksumOffload {
		e.caps |= stack.CapabilityTXChecksumOffload
	}

	if opts.LinkAddress != "" {
		e.hdrSize = header.EthernetMinimumSize
		e.caps |= stack.CapabilityResolutionRequired
	}

	if opts.VirtioNetHeaderRequired {
		e.hdrSize += header.VirtioNetHeaderSize
	}

	return e, nil
}
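// exampleCreateSharedmemNIC is an illustrative sketch, not part of the
// upstream API: it shows the usage described in the package comment, creating
// an endpoint with New and registering it with Stack.CreateNIC. The Options
// value (MTU, buffer size, link address and the queue FDs) is assumed to have
// been populated by the caller, e.g. from FDs received over a control
// channel. To release the endpoint later, Close() followed by Wait() must be
// called on it, as documented on New.
func exampleCreateSharedmemNIC(s *stack.Stack, nicID tcpip.NICID, opts Options) error {
	ep, err := New(opts)
	if err != nil {
		return err
	}
	if tcpErr := s.CreateNIC(nicID, ep); tcpErr != nil {
		return fmt.Errorf("CreateNIC failed: %s", tcpErr)
	}
	return nil
}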
// Close frees most resources associated with the endpoint. Wait() must be
// called after Close() in order to free the rest.
func (e *endpoint) Close() {
	// Tell the dispatch goroutine to stop, then write to the eventfd so that
	// it wakes up in case it's sleeping.
	e.stopRequested.Store(1)
	e.rx.eventFD.Notify()

	// Cleanup the queues inline if the worker hasn't started yet; we also
	// know it won't start from now on because stopRequested is set to 1.
	e.mu.Lock()
	defer e.mu.Unlock()
	workerPresent := e.workerStarted

	if !workerPresent {
		e.tx.cleanup()
		e.rx.cleanup()
	}
}

// Wait implements stack.LinkEndpoint.Wait. It waits until all workers have
// stopped after a Close() call.
func (e *endpoint) Wait() {
	e.completed.Wait()
	e.rx.eventFD.Close()
}

// Attach implements stack.LinkEndpoint.Attach. It launches the goroutine that
// reads packets from the rx queue.
func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) {
	if dispatcher == nil {
		e.Close()
		return
	}
	e.mu.Lock()
	if !e.workerStarted && e.stopRequested.Load() == 0 {
		e.workerStarted = true
		e.completed.Add(1)

		// Spin up a goroutine to monitor for peer shutdown.
		if e.peerFD >= 0 {
			e.completed.Add(1)
			go func() {
				defer e.completed.Done()
				b := make([]byte, 1)
				// When the sharedmem endpoint is in use, the peerFD is never
				// used for any data transfer, so this Read should only return
				// when the peer is shutting down.
				_, err := rawfile.BlockingRead(e.peerFD, b)
				if e.onClosed != nil {
					e.onClosed(err)
				}
			}()
		}

		// Link endpoints are not savable. When transport endpoints
		// are saved, they stop sending outgoing packets and all
		// incoming packets are rejected.
		go e.dispatchLoop(dispatcher) // S/R-SAFE: see above.
	}
	e.mu.Unlock()
}

// IsAttached implements stack.LinkEndpoint.IsAttached.
func (e *endpoint) IsAttached() bool {
	e.mu.Lock()
	defer e.mu.Unlock()
	return e.workerStarted
}

// MTU implements stack.LinkEndpoint.MTU. It returns the value initialized
// during construction.
func (e *endpoint) MTU() uint32 {
	return e.mtu
}

// Capabilities implements stack.LinkEndpoint.Capabilities.
func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities {
	return e.caps
}

// MaxHeaderLength implements stack.LinkEndpoint.MaxHeaderLength. It returns
// the ethernet frame header size.
func (e *endpoint) MaxHeaderLength() uint16 {
	return uint16(e.hdrSize)
}

// LinkAddress implements stack.LinkEndpoint.LinkAddress. It returns the local
// link address.
func (e *endpoint) LinkAddress() tcpip.LinkAddress {
	return e.addr
}

// AddHeader implements stack.LinkEndpoint.AddHeader.
func (e *endpoint) AddHeader(pkt *stack.PacketBuffer) {
	// Add the ethernet header if needed.
	if len(e.addr) == 0 {
		return
	}

	eth := header.Ethernet(pkt.LinkHeader().Push(header.EthernetMinimumSize))
	eth.Encode(&header.EthernetFields{
		SrcAddr: pkt.EgressRoute.LocalLinkAddress,
		DstAddr: pkt.EgressRoute.RemoteLinkAddress,
		Type:    pkt.NetworkProtocolNumber,
	})
}

func (e *endpoint) parseHeader(pkt *stack.PacketBuffer) bool {
	_, ok := pkt.LinkHeader().Consume(header.EthernetMinimumSize)
	return ok
}

// ParseHeader implements stack.LinkEndpoint.ParseHeader.
func (e *endpoint) ParseHeader(pkt *stack.PacketBuffer) bool {
	// Parse the ethernet header if needed.
	if len(e.addr) == 0 {
		return true
	}

	return e.parseHeader(pkt)
}

func (e *endpoint) AddVirtioNetHeader(pkt *stack.PacketBuffer) {
	virtio := header.VirtioNetHeader(pkt.VirtioNetHeader().Push(header.VirtioNetHeaderSize))
	virtio.Encode(&header.VirtioNetHeaderFields{})
}

// +checklocks:e.mu
func (e *endpoint) writePacketLocked(r stack.RouteInfo, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) tcpip.Error {
	if e.virtioNetHeaderRequired {
		e.AddVirtioNetHeader(pkt)
	}

	// Transmit the packet.
	b := pkt.ToBuffer()
	defer b.Release()
	ok := e.tx.transmit(b)
	if !ok {
		return &tcpip.ErrWouldBlock{}
	}

	return nil
}

// WritePackets implements stack.LinkEndpoint.WritePackets.
func (e *endpoint) WritePackets(pkts stack.PacketBufferList) (int, tcpip.Error) {
	n := 0
	var err tcpip.Error
	e.mu.Lock()
	defer e.mu.Unlock()
	for _, pkt := range pkts.AsSlice() {
		if err = e.writePacketLocked(pkt.EgressRoute, pkt.NetworkProtocolNumber, pkt); err != nil {
			break
		}
		n++
	}
	// WritePackets never returns an error if it successfully transmitted at
	// least one packet.
	if err != nil && n == 0 {
		return 0, err
	}
	e.tx.notify()
	return n, nil
}

// dispatchLoop reads packets from the rx queue in a loop and dispatches them
// to the network stack.
func (e *endpoint) dispatchLoop(d stack.NetworkDispatcher) {
	// Post the initial set of buffers.
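	// The number of buffers posted here is capped both by the queue's
	// posted-buffers limit and by how many bufferSize-sized chunks fit in
	// the shared data region, so each posted buffer maps to a distinct
	// slice of e.rx.data.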
	limit := e.rx.q.PostedBuffersLimit()
	if l := uint64(len(e.rx.data)) / uint64(e.bufferSize); limit > l {
		limit = l
	}
	for i := uint64(0); i < limit; i++ {
		b := queue.RxBuffer{
			Offset: i * uint64(e.bufferSize),
			Size:   e.bufferSize,
			ID:     i,
		}
		if !e.rx.q.PostBuffers([]queue.RxBuffer{b}) {
			log.Warningf("Unable to post %v-th buffer", i)
		}
	}

	// Read in a loop until a stop is requested.
	var rxb []queue.RxBuffer
	for e.stopRequested.Load() == 0 {
		var n uint32
		rxb, n = e.rx.postAndReceive(rxb, &e.stopRequested)

		// Copy data from the shared area to its own buffer, then
		// prepare to repost the buffer.
		v := buffer.NewView(int(n))
		v.Grow(int(n))
		offset := uint32(0)
		for i := range rxb {
			v.WriteAt(e.rx.data[rxb[i].Offset:][:rxb[i].Size], int(offset))
			offset += rxb[i].Size

			rxb[i].Size = e.bufferSize
		}

		pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
			Payload: buffer.MakeWithView(v),
		})

		if e.virtioNetHeaderRequired {
			_, ok := pkt.VirtioNetHeader().Consume(header.VirtioNetHeaderSize)
			if !ok {
				pkt.DecRef()
				continue
			}
		}

		var proto tcpip.NetworkProtocolNumber
		if len(e.addr) != 0 {
			if !e.parseHeader(pkt) {
				pkt.DecRef()
				continue
			}
			proto = header.Ethernet(pkt.LinkHeader().Slice()).Type()
		} else {
			// We don't get any indication of what the packet is, so try to
			// guess if it's an IPv4 or IPv6 packet.
			// The IP version information is in the first octet, so pull up
			// 1 byte.
			h, ok := pkt.Data().PullUp(1)
			if !ok {
				pkt.DecRef()
				continue
			}
			switch header.IPVersion(h) {
			case header.IPv4Version:
				proto = header.IPv4ProtocolNumber
			case header.IPv6Version:
				proto = header.IPv6ProtocolNumber
			default:
				pkt.DecRef()
				continue
			}
		}

		// Send the packet up the stack.
		d.DeliverNetworkPacket(proto, pkt)
		pkt.DecRef()
	}

	e.mu.Lock()
	defer e.mu.Unlock()

	// Clean state.
	e.tx.cleanup()
	e.rx.cleanup()

	e.completed.Done()
}

// ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType.
func (*endpoint) ARPHardwareType() header.ARPHardwareType {
	return header.ARPHardwareEther
}

// GSOMaxSize implements stack.GSOEndpoint.
func (e *endpoint) GSOMaxSize() uint32 {
	return e.gsoMaxSize
}

// SupportedGSO implements stack.GSOEndpoint.
func (e *endpoint) SupportedGSO() stack.SupportedGSO {
	return stack.GvisorGSOSupported
}
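// guessNetworkProtocol is an illustrative sketch, not part of the upstream
// API: it restates the heuristic dispatchLoop uses when the endpoint has no
// link address and therefore no ethernet header to consult. The IP version
// lives in the high nibble of the first octet, so a single byte is enough to
// pick the protocol; anything else is dropped.
func guessNetworkProtocol(firstOctet byte) (tcpip.NetworkProtocolNumber, bool) {
	switch header.IPVersion([]byte{firstOctet}) {
	case header.IPv4Version:
		return header.IPv4ProtocolNumber, true
	case header.IPv6Version:
		return header.IPv6ProtocolNumber, true
	default:
		return 0, false
	}
}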