inet.af/netstack@v0.0.0-20220214151720-7585b01ddccf/tcpip/network/internal/fragmentation/fragmentation.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package fragmentation contains the implementation of IP fragmentation. 16 // It is based on RFC 791, RFC 815 and RFC 8200. 17 package fragmentation 18 19 import ( 20 "errors" 21 "fmt" 22 "log" 23 "time" 24 25 "inet.af/netstack/sync" 26 "inet.af/netstack/tcpip" 27 "inet.af/netstack/tcpip/buffer" 28 "inet.af/netstack/tcpip/stack" 29 ) 30 31 const ( 32 // HighFragThreshold is the threshold at which we start trimming old 33 // fragmented packets. Linux uses a default value of 4 MB. See 34 // net.ipv4.ipfrag_high_thresh for more information. 35 HighFragThreshold = 4 << 20 // 4MB 36 37 // LowFragThreshold is the threshold we reach to when we start dropping 38 // older fragmented packets. It's important that we keep enough room for newer 39 // packets to be re-assembled. Hence, this needs to be lower than 40 // HighFragThreshold enough. Linux uses a default value of 3 MB. See 41 // net.ipv4.ipfrag_low_thresh for more information. 42 LowFragThreshold = 3 << 20 // 3MB 43 44 // minBlockSize is the minimum block size for fragments. 45 minBlockSize = 1 46 ) 47 48 var ( 49 // ErrInvalidArgs indicates to the caller that an invalid argument was 50 // provided. 51 ErrInvalidArgs = errors.New("invalid args") 52 53 // ErrFragmentOverlap indicates that, during reassembly, a fragment overlaps 54 // with another one. 55 ErrFragmentOverlap = errors.New("overlapping fragments") 56 57 // ErrFragmentConflict indicates that, during reassembly, some fragments are 58 // in conflict with one another. 59 ErrFragmentConflict = errors.New("conflicting fragments") 60 ) 61 62 // FragmentID is the identifier for a fragment. 63 type FragmentID struct { 64 // Source is the source address of the fragment. 65 Source tcpip.Address 66 67 // Destination is the destination address of the fragment. 68 Destination tcpip.Address 69 70 // ID is the identification value of the fragment. 71 // 72 // This is a uint32 because IPv6 uses a 32-bit identification value. 73 ID uint32 74 75 // The protocol for the packet. 76 Protocol uint8 77 } 78 79 // Fragmentation is the main structure that other modules 80 // of the stack should use to implement IP Fragmentation. 81 type Fragmentation struct { 82 mu sync.Mutex 83 highLimit int 84 lowLimit int 85 reassemblers map[FragmentID]*reassembler 86 rList reassemblerList 87 memSize int 88 timeout time.Duration 89 blockSize uint16 90 clock tcpip.Clock 91 releaseJob *tcpip.Job 92 timeoutHandler TimeoutHandler 93 } 94 95 // TimeoutHandler is consulted if a packet reassembly has timed out. 96 type TimeoutHandler interface { 97 // OnReassemblyTimeout will be called with the first fragment (or nil, if the 98 // first fragment has not been received) of a packet whose reassembly has 99 // timed out. 100 OnReassemblyTimeout(pkt *stack.PacketBuffer) 101 } 102 103 // NewFragmentation creates a new Fragmentation. 104 // 105 // blockSize specifies the fragment block size, in bytes. 106 // 107 // highMemoryLimit specifies the limit on the memory consumed 108 // by the fragments stored by Fragmentation (overhead of internal data-structures 109 // is not accounted). Fragments are dropped when the limit is reached. 110 // 111 // lowMemoryLimit specifies the limit on which we will reach by dropping 112 // fragments after reaching highMemoryLimit. 113 // 114 // reassemblingTimeout specifies the maximum time allowed to reassemble a packet. 115 // Fragments are lazily evicted only when a new a packet with an 116 // already existing fragmentation-id arrives after the timeout. 117 func NewFragmentation(blockSize uint16, highMemoryLimit, lowMemoryLimit int, reassemblingTimeout time.Duration, clock tcpip.Clock, timeoutHandler TimeoutHandler) *Fragmentation { 118 if lowMemoryLimit >= highMemoryLimit { 119 lowMemoryLimit = highMemoryLimit 120 } 121 122 if lowMemoryLimit < 0 { 123 lowMemoryLimit = 0 124 } 125 126 if blockSize < minBlockSize { 127 blockSize = minBlockSize 128 } 129 130 f := &Fragmentation{ 131 reassemblers: make(map[FragmentID]*reassembler), 132 highLimit: highMemoryLimit, 133 lowLimit: lowMemoryLimit, 134 timeout: reassemblingTimeout, 135 blockSize: blockSize, 136 clock: clock, 137 timeoutHandler: timeoutHandler, 138 } 139 f.releaseJob = tcpip.NewJob(f.clock, &f.mu, f.releaseReassemblersLocked) 140 141 return f 142 } 143 144 // Process processes an incoming fragment belonging to an ID and returns a 145 // complete packet and its protocol number when all the packets belonging to 146 // that ID have been received. 147 // 148 // [first, last] is the range of the fragment bytes. 149 // 150 // first must be a multiple of the block size f is configured with. The size 151 // of the fragment data must be a multiple of the block size, unless there are 152 // no fragments following this fragment (more set to false). 153 // 154 // proto is the protocol number marked in the fragment being processed. It has 155 // to be given here outside of the FragmentID struct because IPv6 should not use 156 // the protocol to identify a fragment. 157 func (f *Fragmentation) Process( 158 id FragmentID, first, last uint16, more bool, proto uint8, pkt *stack.PacketBuffer) ( 159 *stack.PacketBuffer, uint8, bool, error) { 160 if first > last { 161 return nil, 0, false, fmt.Errorf("first=%d is greater than last=%d: %w", first, last, ErrInvalidArgs) 162 } 163 164 if first%f.blockSize != 0 { 165 return nil, 0, false, fmt.Errorf("first=%d is not a multiple of block size=%d: %w", first, f.blockSize, ErrInvalidArgs) 166 } 167 168 fragmentSize := last - first + 1 169 if more && fragmentSize%f.blockSize != 0 { 170 return nil, 0, false, fmt.Errorf("fragment size=%d bytes is not a multiple of block size=%d on non-final fragment: %w", fragmentSize, f.blockSize, ErrInvalidArgs) 171 } 172 173 if l := pkt.Data().Size(); l != int(fragmentSize) { 174 return nil, 0, false, fmt.Errorf("got fragment size=%d bytes not equal to the expected fragment size=%d bytes (first=%d last=%d): %w", l, fragmentSize, first, last, ErrInvalidArgs) 175 } 176 177 f.mu.Lock() 178 r, ok := f.reassemblers[id] 179 if !ok { 180 r = newReassembler(id, f.clock) 181 f.reassemblers[id] = r 182 wasEmpty := f.rList.Empty() 183 f.rList.PushFront(r) 184 if wasEmpty { 185 // If we have just pushed a first reassembler into an empty list, we 186 // should kickstart the release job. The release job will keep 187 // rescheduling itself until the list becomes empty. 188 f.releaseReassemblersLocked() 189 } 190 } 191 f.mu.Unlock() 192 193 resPkt, firstFragmentProto, done, memConsumed, err := r.process(first, last, more, proto, pkt) 194 if err != nil { 195 // We probably got an invalid sequence of fragments. Just 196 // discard the reassembler and move on. 197 f.mu.Lock() 198 f.release(r, false /* timedOut */) 199 f.mu.Unlock() 200 return nil, 0, false, fmt.Errorf("fragmentation processing error: %w", err) 201 } 202 f.mu.Lock() 203 f.memSize += memConsumed 204 if done { 205 f.release(r, false /* timedOut */) 206 } 207 // Evict reassemblers if we are consuming more memory than highLimit until 208 // we reach lowLimit. 209 if f.memSize > f.highLimit { 210 for f.memSize > f.lowLimit { 211 tail := f.rList.Back() 212 if tail == nil { 213 break 214 } 215 f.release(tail, false /* timedOut */) 216 } 217 } 218 f.mu.Unlock() 219 return resPkt, firstFragmentProto, done, nil 220 } 221 222 func (f *Fragmentation) release(r *reassembler, timedOut bool) { 223 // Before releasing a fragment we need to check if r is already marked as done. 224 // Otherwise, we would delete it twice. 225 if r.checkDoneOrMark() { 226 return 227 } 228 229 delete(f.reassemblers, r.id) 230 f.rList.Remove(r) 231 f.memSize -= r.memSize 232 if f.memSize < 0 { 233 log.Printf("memory counter < 0 (%d), this is an accounting bug that requires investigation", f.memSize) 234 f.memSize = 0 235 } 236 237 if h := f.timeoutHandler; timedOut && h != nil { 238 h.OnReassemblyTimeout(r.pkt) 239 } 240 } 241 242 // releaseReassemblersLocked releases already-expired reassemblers, then 243 // schedules the job to call back itself for the remaining reassemblers if 244 // any. This function must be called with f.mu locked. 245 func (f *Fragmentation) releaseReassemblersLocked() { 246 now := f.clock.NowMonotonic() 247 for { 248 // The reassembler at the end of the list is the oldest. 249 r := f.rList.Back() 250 if r == nil { 251 // The list is empty. 252 break 253 } 254 elapsed := now.Sub(r.createdAt) 255 if f.timeout > elapsed { 256 // If the oldest reassembler has not expired, schedule the release 257 // job so that this function is called back when it has expired. 258 f.releaseJob.Schedule(f.timeout - elapsed) 259 break 260 } 261 // If the oldest reassembler has already expired, release it. 262 f.release(r, true /* timedOut*/) 263 } 264 } 265 266 // PacketFragmenter is the book-keeping struct for packet fragmentation. 267 type PacketFragmenter struct { 268 transportHeader buffer.View 269 data buffer.VectorisedView 270 reserve int 271 fragmentPayloadLen int 272 fragmentCount int 273 currentFragment int 274 fragmentOffset int 275 } 276 277 // MakePacketFragmenter prepares the struct needed for packet fragmentation. 278 // 279 // pkt is the packet to be fragmented. 280 // 281 // fragmentPayloadLen is the maximum number of bytes of fragmentable data a fragment can 282 // have. 283 // 284 // reserve is the number of bytes that should be reserved for the headers in 285 // each generated fragment. 286 func MakePacketFragmenter(pkt *stack.PacketBuffer, fragmentPayloadLen uint32, reserve int) PacketFragmenter { 287 // As per RFC 8200 Section 4.5, some IPv6 extension headers should not be 288 // repeated in each fragment. However we do not currently support any header 289 // of that kind yet, so the following computation is valid for both IPv4 and 290 // IPv6. 291 // TODO(gvisor.dev/issue/3912): Once Authentication or ESP Headers are 292 // supported for outbound packets, the fragmentable data should not include 293 // these headers. 294 var fragmentableData buffer.VectorisedView 295 fragmentableData.AppendView(pkt.TransportHeader().View()) 296 fragmentableData.Append(pkt.Data().ExtractVV()) 297 fragmentCount := (uint32(fragmentableData.Size()) + fragmentPayloadLen - 1) / fragmentPayloadLen 298 299 return PacketFragmenter{ 300 data: fragmentableData, 301 reserve: reserve, 302 fragmentPayloadLen: int(fragmentPayloadLen), 303 fragmentCount: int(fragmentCount), 304 } 305 } 306 307 // BuildNextFragment returns a packet with the payload of the next fragment, 308 // along with the fragment's offset, the number of bytes copied and a boolean 309 // indicating if there are more fragments left or not. If this function is 310 // called again after it indicated that no more fragments were left, it will 311 // panic. 312 // 313 // Note that the returned packet will not have its network and link headers 314 // populated, but space for them will be reserved. The transport header will be 315 // stored in the packet's data. 316 func (pf *PacketFragmenter) BuildNextFragment() (*stack.PacketBuffer, int, int, bool) { 317 if pf.currentFragment >= pf.fragmentCount { 318 panic("BuildNextFragment should not be called again after the last fragment was returned") 319 } 320 321 fragPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ 322 ReserveHeaderBytes: pf.reserve, 323 }) 324 325 // Copy data for the fragment. 326 copied := fragPkt.Data().ReadFromVV(&pf.data, pf.fragmentPayloadLen) 327 328 offset := pf.fragmentOffset 329 pf.fragmentOffset += copied 330 pf.currentFragment++ 331 more := pf.currentFragment != pf.fragmentCount 332 333 return fragPkt, offset, copied, more 334 } 335 336 // RemainingFragmentCount returns the number of fragments left to be built. 337 func (pf *PacketFragmenter) RemainingFragmentCount() int { 338 return pf.fragmentCount - pf.currentFragment 339 }