github.com/sagernet/gvisor@v0.0.0-20240428053021-e691de28565f/pkg/tcpip/network/internal/fragmentation/fragmentation.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package fragmentation contains the implementation of IP fragmentation.
// It is based on RFC 791, RFC 815 and RFC 8200.
package fragmentation

import (
	"errors"
	"fmt"
	"time"

	"github.com/sagernet/gvisor/pkg/buffer"
	"github.com/sagernet/gvisor/pkg/log"
	"github.com/sagernet/gvisor/pkg/sync"
	"github.com/sagernet/gvisor/pkg/tcpip"
	"github.com/sagernet/gvisor/pkg/tcpip/stack"
)

const (
	// HighFragThreshold is the threshold at which we start trimming old
	// fragmented packets. Linux uses a default value of 4 MB. See
	// net.ipv4.ipfrag_high_thresh for more information.
	HighFragThreshold = 4 << 20 // 4MB

	// LowFragThreshold is the level we trim memory usage down to, by dropping
	// older fragmented packets, once HighFragThreshold is exceeded. It's
	// important to keep enough room for newer packets to be reassembled, so
	// this needs to be sufficiently lower than HighFragThreshold. Linux uses a
	// default value of 3 MB. See net.ipv4.ipfrag_low_thresh for more
	// information.
	LowFragThreshold = 3 << 20 // 3MB

	// minBlockSize is the minimum block size for fragments.
	minBlockSize = 1
)

var (
	// ErrInvalidArgs indicates to the caller that an invalid argument was
	// provided.
	ErrInvalidArgs = errors.New("invalid args")

	// ErrFragmentOverlap indicates that, during reassembly, a fragment overlaps
	// with another one.
	ErrFragmentOverlap = errors.New("overlapping fragments")

	// ErrFragmentConflict indicates that, during reassembly, some fragments are
	// in conflict with one another.
	ErrFragmentConflict = errors.New("conflicting fragments")
)

// FragmentID is the identifier for a fragment.
type FragmentID struct {
	// Source is the source address of the fragment.
	Source tcpip.Address

	// Destination is the destination address of the fragment.
	Destination tcpip.Address

	// ID is the identification value of the fragment.
	//
	// This is a uint32 because IPv6 uses a 32-bit identification value.
	ID uint32

	// The protocol for the packet.
	Protocol uint8
}

// Fragmentation is the main structure that other modules
// of the stack should use to implement IP Fragmentation.
type Fragmentation struct {
	mu             sync.Mutex
	highLimit      int
	lowLimit       int
	reassemblers   map[FragmentID]*reassembler
	rList          reassemblerList
	memSize        int
	timeout        time.Duration
	blockSize      uint16
	clock          tcpip.Clock
	releaseJob     *tcpip.Job
	timeoutHandler TimeoutHandler
}

// TimeoutHandler is consulted if a packet reassembly has timed out.
type TimeoutHandler interface {
	// OnReassemblyTimeout will be called with the first fragment (or nil, if the
	// first fragment has not been received) of a packet whose reassembly has
	// timed out.
	OnReassemblyTimeout(pkt *stack.PacketBuffer)
}
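
// A minimal sketch of a TimeoutHandler implementation, assuming only what the
// interface above and release() below guarantee: pkt may be nil if the first
// fragment never arrived, and the caller keeps ownership of pkt (release()
// DecRefs it after OnReassemblyTimeout returns), so a handler must IncRef the
// packet before retaining it. The type and field names are hypothetical, and
// atomic.Uint64 comes from the standard sync/atomic package.
//
//	type reassemblyTimeoutLogger struct {
//		timeouts atomic.Uint64
//	}
//
//	func (l *reassemblyTimeoutLogger) OnReassemblyTimeout(pkt *stack.PacketBuffer) {
//		l.timeouts.Add(1)
//		if pkt == nil {
//			// The first fragment was never received.
//			return
//		}
//		// Inspect pkt here; call pkt.IncRef() before keeping a reference
//		// beyond this call, because the caller DecRefs its own reference
//		// afterwards.
//	}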

// NewFragmentation creates a new Fragmentation.
//
// blockSize specifies the fragment block size, in bytes.
//
// highMemoryLimit specifies the limit on the memory consumed by the fragments
// stored by Fragmentation (overhead of internal data structures is not
// accounted for). Fragments are dropped when the limit is reached.
//
// lowMemoryLimit specifies the level that memory consumption is reduced to, by
// dropping old fragments, once highMemoryLimit is exceeded.
//
// reassemblingTimeout specifies the maximum time allowed to reassemble a packet.
// Reassemblers that exceed this timeout are released by a job scheduled on
// clock (see releaseReassemblersLocked).
func NewFragmentation(blockSize uint16, highMemoryLimit, lowMemoryLimit int, reassemblingTimeout time.Duration, clock tcpip.Clock, timeoutHandler TimeoutHandler) *Fragmentation {
	if lowMemoryLimit >= highMemoryLimit {
		lowMemoryLimit = highMemoryLimit
	}

	if lowMemoryLimit < 0 {
		lowMemoryLimit = 0
	}

	if blockSize < minBlockSize {
		blockSize = minBlockSize
	}

	f := &Fragmentation{
		reassemblers:   make(map[FragmentID]*reassembler),
		highLimit:      highMemoryLimit,
		lowLimit:       lowMemoryLimit,
		timeout:        reassemblingTimeout,
		blockSize:      blockSize,
		clock:          clock,
		timeoutHandler: timeoutHandler,
	}
	f.releaseJob = tcpip.NewJob(f.clock, &f.mu, f.releaseReassemblersLocked)

	return f
}
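
// A minimal construction sketch from a caller's point of view, assuming the
// caller already has a tcpip.Clock (here named clock); the 30-second timeout
// and the absence of a TimeoutHandler are illustrative choices, not values
// mandated by this package:
//
//	const fragmentBlockSize = 8 // IPv4/IPv6 fragment offsets are in 8-byte units.
//
//	f := fragmentation.NewFragmentation(
//		fragmentBlockSize,
//		fragmentation.HighFragThreshold, // start evicting above this many bytes...
//		fragmentation.LowFragThreshold,  // ...and keep evicting down to this level
//		30*time.Second,                  // reassembly timeout (illustrative value)
//		clock,                           // any tcpip.Clock implementation
//		nil,                             // no TimeoutHandler
//	)
//	defer f.Release()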

// Process processes an incoming fragment belonging to an ID and returns a
// complete packet and its protocol number when all the packets belonging to
// that ID have been received.
//
// [first, last] is the range of the fragment bytes.
//
// first must be a multiple of the block size f is configured with. The size
// of the fragment data must be a multiple of the block size, unless there are
// no fragments following this fragment (more set to false).
//
// proto is the protocol number marked in the fragment being processed. It has
// to be given here outside of the FragmentID struct because IPv6 should not use
// the protocol to identify a fragment.
func (f *Fragmentation) Process(
	id FragmentID, first, last uint16, more bool, proto uint8, pkt *stack.PacketBuffer) (
	*stack.PacketBuffer, uint8, bool, error) {
	if first > last {
		return nil, 0, false, fmt.Errorf("first=%d is greater than last=%d: %w", first, last, ErrInvalidArgs)
	}

	if first%f.blockSize != 0 {
		return nil, 0, false, fmt.Errorf("first=%d is not a multiple of block size=%d: %w", first, f.blockSize, ErrInvalidArgs)
	}

	fragmentSize := last - first + 1
	if more && fragmentSize%f.blockSize != 0 {
		return nil, 0, false, fmt.Errorf("fragment size=%d bytes is not a multiple of block size=%d on non-final fragment: %w", fragmentSize, f.blockSize, ErrInvalidArgs)
	}

	if l := pkt.Data().Size(); l != int(fragmentSize) {
		return nil, 0, false, fmt.Errorf("got fragment size=%d bytes not equal to the expected fragment size=%d bytes (first=%d last=%d): %w", l, fragmentSize, first, last, ErrInvalidArgs)
	}

	f.mu.Lock()
	if f.reassemblers == nil {
		f.mu.Unlock()
		return nil, 0, false, fmt.Errorf("Release() called before fragmentation processing could finish")
	}

	r, ok := f.reassemblers[id]
	if !ok {
		r = newReassembler(id, f.clock)
		f.reassemblers[id] = r
		wasEmpty := f.rList.Empty()
		f.rList.PushFront(r)
		if wasEmpty {
			// If we have just pushed a first reassembler into an empty list, we
			// should kickstart the release job. The release job will keep
			// rescheduling itself until the list becomes empty.
			f.releaseReassemblersLocked()
		}
	}
	f.mu.Unlock()

	resPkt, firstFragmentProto, done, memConsumed, err := r.process(first, last, more, proto, pkt)
	if err != nil {
		// We probably got an invalid sequence of fragments. Just
		// discard the reassembler and move on.
		f.mu.Lock()
		f.release(r, false /* timedOut */)
		f.mu.Unlock()
		return nil, 0, false, fmt.Errorf("fragmentation processing error: %w", err)
	}
	f.mu.Lock()
	f.memSize += memConsumed
	if done {
		f.release(r, false /* timedOut */)
	}
	// Evict reassemblers if we are consuming more memory than highLimit until
	// we reach lowLimit.
	if f.memSize > f.highLimit {
		for f.memSize > f.lowLimit {
			tail := f.rList.Back()
			if tail == nil {
				break
			}
			f.release(tail, false /* timedOut */)
		}
	}
	f.mu.Unlock()
	return resPkt, firstFragmentProto, done, nil
}
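
// A minimal reassembly sketch, assuming each incoming fragment has already
// been parsed into its FragmentID, byte range [first, last], more flag,
// protocol number and a *stack.PacketBuffer whose Data() holds exactly the
// fragment payload; id, first, last, more, proto and pkt below are
// placeholders for that parsing step, and the caller is assumed to take
// ownership of the returned packet:
//
//	resPkt, firstProto, done, err := f.Process(id, first, last, more, proto, pkt)
//	if err != nil {
//		// Malformed or conflicting fragment; the reassembly state for this
//		// ID has been discarded.
//		return
//	}
//	if !done {
//		// More fragments for this ID are still outstanding.
//		return
//	}
//	// resPkt holds the reassembled payload and firstProto is the protocol
//	// number carried by the first fragment; hand resPkt to the upper layers
//	// and DecRef it once they are done with it.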

// Release releases all underlying resources.
func (f *Fragmentation) Release() {
	f.mu.Lock()
	defer f.mu.Unlock()
	for _, r := range f.reassemblers {
		f.release(r, false /* timedOut */)
	}
	f.reassemblers = nil
}

func (f *Fragmentation) release(r *reassembler, timedOut bool) {
	// Before releasing a fragment we need to check if r is already marked as done.
	// Otherwise, we would delete it twice.
	if r.checkDoneOrMark() {
		return
	}

	delete(f.reassemblers, r.id)
	f.rList.Remove(r)
	f.memSize -= r.memSize
	if f.memSize < 0 {
		log.Warningf("memory counter < 0 (%d), this is an accounting bug that requires investigation", f.memSize)
		f.memSize = 0
	}

	if h := f.timeoutHandler; timedOut && h != nil {
		h.OnReassemblyTimeout(r.pkt)
	}
	if r.pkt != nil {
		r.pkt.DecRef()
		r.pkt = nil
	}
	for _, h := range r.holes {
		if h.pkt != nil {
			h.pkt.DecRef()
			h.pkt = nil
		}
	}
	r.holes = nil
}

// releaseReassemblersLocked releases already-expired reassemblers, then
// schedules the job to call back itself for the remaining reassemblers if
// any. This function must be called with f.mu locked.
func (f *Fragmentation) releaseReassemblersLocked() {
	now := f.clock.NowMonotonic()
	for {
		// The reassembler at the end of the list is the oldest.
		r := f.rList.Back()
		if r == nil {
			// The list is empty.
			break
		}
		elapsed := now.Sub(r.createdAt)
		if f.timeout > elapsed {
			// If the oldest reassembler has not expired, schedule the release
			// job so that this function is called back when it has expired.
			f.releaseJob.Schedule(f.timeout - elapsed)
			break
		}
		// If the oldest reassembler has already expired, release it.
		f.release(r, true /* timedOut */)
	}
}

// PacketFragmenter is the book-keeping struct for packet fragmentation.
type PacketFragmenter struct {
	transportHeader    []byte
	data               buffer.Buffer
	reserve            int
	fragmentPayloadLen int
	fragmentCount      int
	currentFragment    int
	fragmentOffset     int
}

// MakePacketFragmenter prepares the struct needed for packet fragmentation.
//
// pkt is the packet to be fragmented.
//
// fragmentPayloadLen is the maximum number of bytes of fragmentable data a
// fragment can have.
//
// reserve is the number of bytes that should be reserved for the headers in
// each generated fragment.
func MakePacketFragmenter(pkt *stack.PacketBuffer, fragmentPayloadLen uint32, reserve int) PacketFragmenter {
	// As per RFC 8200 Section 4.5, some IPv6 extension headers should not be
	// repeated in each fragment. However, we do not currently support any
	// headers of that kind, so the following computation is valid for both
	// IPv4 and IPv6.
	// TODO(gvisor.dev/issue/3912): Once Authentication or ESP Headers are
	// supported for outbound packets, the fragmentable data should not include
	// these headers.
	var fragmentableData buffer.Buffer
	fragmentableData.Append(pkt.TransportHeader().View())
	pktBuf := pkt.Data().ToBuffer()
	fragmentableData.Merge(&pktBuf)
	fragmentCount := (uint32(fragmentableData.Size()) + fragmentPayloadLen - 1) / fragmentPayloadLen

	return PacketFragmenter{
		data:               fragmentableData,
		reserve:            reserve,
		fragmentPayloadLen: int(fragmentPayloadLen),
		fragmentCount:      int(fragmentCount),
	}
}

// BuildNextFragment returns a packet with the payload of the next fragment,
// along with the fragment's offset, the number of bytes copied and a boolean
// indicating if there are more fragments left or not. If this function is
// called again after it indicated that no more fragments were left, it will
// panic.
//
// Note that the returned packet will not have its network and link headers
// populated, but space for them will be reserved. The transport header will be
// stored in the packet's data.
func (pf *PacketFragmenter) BuildNextFragment() (*stack.PacketBuffer, int, int, bool) {
	if pf.currentFragment >= pf.fragmentCount {
		panic("BuildNextFragment should not be called again after the last fragment was returned")
	}

	fragPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
		ReserveHeaderBytes: pf.reserve,
	})

	// Copy data for the fragment.
	copied := fragPkt.Data().ReadFrom(&pf.data, pf.fragmentPayloadLen)

	offset := pf.fragmentOffset
	pf.fragmentOffset += copied
	pf.currentFragment++
	more := pf.currentFragment != pf.fragmentCount

	return fragPkt, offset, copied, more
}

// RemainingFragmentCount returns the number of fragments left to be built.
func (pf *PacketFragmenter) RemainingFragmentCount() int {
	return pf.fragmentCount - pf.currentFragment
}

// Release frees resources owned by the packet fragmenter.
func (pf *PacketFragmenter) Release() {
	pf.data.Release()
}
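
// A minimal fragmentation sketch from a caller's point of view, assuming pkt
// is a fully formed outbound *stack.PacketBuffer and that fragmentPayloadLen
// (payload bytes per fragment) and headerLen (reserved header bytes) have been
// computed by the caller; writeFragment is a placeholder for whatever the
// caller does to fill in the network header and send each fragment:
//
//	pf := fragmentation.MakePacketFragmenter(pkt, fragmentPayloadLen, headerLen)
//	defer pf.Release()
//
//	for {
//		fragPkt, offset, copied, more := pf.BuildNextFragment()
//		// fragPkt has headerLen bytes reserved for the caller's headers; its
//		// data holds copied bytes of the original payload starting at offset.
//		err := writeFragment(fragPkt, offset, copied, more)
//		fragPkt.DecRef()
//		if err != nil || !more {
//			break
//		}
//	}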