github.com/ethersphere/bee/v2@v2.2.0/pkg/file/joiner/joiner.go (about) 1 // Copyright 2020 The Swarm Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package joiner provides implementations of the file.Joiner interface 6 package joiner 7 8 import ( 9 "context" 10 "errors" 11 "io" 12 "sync" 13 "sync/atomic" 14 15 "github.com/ethersphere/bee/v2/pkg/bmt" 16 "github.com/ethersphere/bee/v2/pkg/encryption" 17 "github.com/ethersphere/bee/v2/pkg/encryption/store" 18 "github.com/ethersphere/bee/v2/pkg/file" 19 "github.com/ethersphere/bee/v2/pkg/file/redundancy" 20 "github.com/ethersphere/bee/v2/pkg/file/redundancy/getter" 21 "github.com/ethersphere/bee/v2/pkg/replicas" 22 storage "github.com/ethersphere/bee/v2/pkg/storage" 23 "github.com/ethersphere/bee/v2/pkg/swarm" 24 "golang.org/x/sync/errgroup" 25 ) 26 27 type joiner struct { 28 addr swarm.Address 29 rootData []byte 30 span int64 31 off int64 32 refLength int 33 rootParity int 34 maxBranching int // maximum branching in an intermediate chunk 35 36 ctx context.Context 37 decoders *decoderCache 38 chunkToSpan func(data []byte) (redundancy.Level, int64) // returns parity and span value from chunkData 39 } 40 41 // decoderCache is cache of decoders for intermediate chunks 42 type decoderCache struct { 43 fetcher storage.Getter // network retrieval interface to fetch chunks 44 putter storage.Putter // interface to local storage to save reconstructed chunks 45 mu sync.Mutex // mutex to protect cache 46 cache map[string]storage.Getter // map from chunk address to RS getter 47 config getter.Config // getter configuration 48 } 49 50 // NewDecoderCache creates a new decoder cache 51 func NewDecoderCache(g storage.Getter, p storage.Putter, conf getter.Config) *decoderCache { 52 return &decoderCache{ 53 fetcher: g, 54 putter: p, 55 cache: make(map[string]storage.Getter), 56 config: conf, 57 } 58 } 59 60 func fingerprint(addrs []swarm.Address) string { 61 h := swarm.NewHasher() 62 for _, addr := range addrs { 63 _, _ = h.Write(addr.Bytes()) 64 } 65 return string(h.Sum(nil)) 66 } 67 68 // GetOrCreate returns a decoder for the given chunk address 69 func (g *decoderCache) GetOrCreate(addrs []swarm.Address, shardCnt int) storage.Getter { 70 71 // since a recovery decoder is not allowed, simply return the underlying netstore 72 if g.config.Strict && g.config.Strategy == getter.NONE { 73 return g.fetcher 74 } 75 76 if len(addrs) == shardCnt { 77 return g.fetcher 78 } 79 80 key := fingerprint(addrs) 81 g.mu.Lock() 82 defer g.mu.Unlock() 83 d, ok := g.cache[key] 84 if ok { 85 if d == nil { 86 return g.fetcher 87 } 88 return d 89 } 90 remove := func(err error) { 91 g.mu.Lock() 92 defer g.mu.Unlock() 93 if err != nil { 94 // signals that a new getter is needed to reattempt to recover the data 95 delete(g.cache, key) 96 } else { 97 // signals that the chunks were fetched/recovered/cached so a future getter is not needed 98 g.cache[key] = nil 99 } 100 } 101 d = getter.New(addrs, shardCnt, g.fetcher, g.putter, remove, g.config) 102 g.cache[key] = d 103 return d 104 } 105 106 // New creates a new Joiner. A Joiner provides Read, Seek and Size functionalities. 107 func New(ctx context.Context, g storage.Getter, putter storage.Putter, address swarm.Address) (file.Joiner, int64, error) { 108 // retrieve the root chunk to read the total data length the be retrieved 109 rLevel := redundancy.GetLevelFromContext(ctx) 110 rootChunkGetter := store.New(g) 111 if rLevel != redundancy.NONE { 112 rootChunkGetter = store.New(replicas.NewGetter(g, rLevel)) 113 } 114 rootChunk, err := rootChunkGetter.Get(ctx, address) 115 if err != nil { 116 return nil, 0, err 117 } 118 119 chunkData := rootChunk.Data() 120 rootData := chunkData[swarm.SpanSize:] 121 refLength := len(address.Bytes()) 122 encryption := refLength == encryption.ReferenceSize 123 rLevel, span := chunkToSpan(chunkData) 124 rootParity := 0 125 maxBranching := swarm.ChunkSize / refLength 126 spanFn := func(data []byte) (redundancy.Level, int64) { 127 return 0, int64(bmt.LengthFromSpan(data[:swarm.SpanSize])) 128 } 129 conf, err := getter.NewConfigFromContext(ctx, getter.DefaultConfig) 130 if err != nil { 131 return nil, 0, err 132 } 133 // override stuff if root chunk has redundancy 134 if rLevel != redundancy.NONE { 135 _, parities := file.ReferenceCount(uint64(span), rLevel, encryption) 136 rootParity = parities 137 138 spanFn = chunkToSpan 139 if encryption { 140 maxBranching = rLevel.GetMaxEncShards() 141 } else { 142 maxBranching = rLevel.GetMaxShards() 143 } 144 } else { 145 // if root chunk has no redundancy, strategy is ignored and set to DATA and strict is set to true 146 conf.Strategy = getter.DATA 147 conf.Strict = true 148 } 149 150 j := &joiner{ 151 addr: rootChunk.Address(), 152 refLength: refLength, 153 ctx: ctx, 154 decoders: NewDecoderCache(g, putter, conf), 155 span: span, 156 rootData: rootData, 157 rootParity: rootParity, 158 maxBranching: maxBranching, 159 chunkToSpan: spanFn, 160 } 161 162 return j, span, nil 163 } 164 165 // Read is called by the consumer to retrieve the joined data. 166 // It must be called with a buffer equal to the maximum chunk size. 167 func (j *joiner) Read(b []byte) (n int, err error) { 168 read, err := j.ReadAt(b, j.off) 169 if err != nil && !errors.Is(err, io.EOF) { 170 return read, err 171 } 172 173 j.off += int64(read) 174 return read, err 175 } 176 177 func (j *joiner) ReadAt(buffer []byte, off int64) (read int, err error) { 178 // since offset is int64 and swarm spans are uint64 it means we cannot seek beyond int64 max value 179 if off >= j.span { 180 return 0, io.EOF 181 } 182 183 readLen := int64(cap(buffer)) 184 if readLen > j.span-off { 185 readLen = j.span - off 186 } 187 var bytesRead int64 188 var eg errgroup.Group 189 j.readAtOffset(buffer, j.rootData, 0, j.span, off, 0, readLen, &bytesRead, j.rootParity, &eg) 190 191 err = eg.Wait() 192 if err != nil { 193 return 0, err 194 } 195 196 return int(atomic.LoadInt64(&bytesRead)), nil 197 } 198 199 var ErrMalformedTrie = errors.New("malformed tree") 200 201 func (j *joiner) readAtOffset( 202 b, data []byte, 203 cur, subTrieSize, off, bufferOffset, bytesToRead int64, 204 bytesRead *int64, 205 parity int, 206 eg *errgroup.Group, 207 ) { 208 // we are at a leaf data chunk 209 if subTrieSize <= int64(len(data)) { 210 dataOffsetStart := off - cur 211 dataOffsetEnd := dataOffsetStart + bytesToRead 212 213 if lenDataToCopy := int64(len(data)) - dataOffsetStart; bytesToRead > lenDataToCopy { 214 dataOffsetEnd = dataOffsetStart + lenDataToCopy 215 } 216 217 bs := data[dataOffsetStart:dataOffsetEnd] 218 n := copy(b[bufferOffset:bufferOffset+int64(len(bs))], bs) 219 atomic.AddInt64(bytesRead, int64(n)) 220 return 221 } 222 pSize, err := file.ChunkPayloadSize(data) 223 if err != nil { 224 eg.Go(func() error { 225 return err 226 }) 227 return 228 } 229 230 addrs, shardCnt := file.ChunkAddresses(data[:pSize], parity, j.refLength) 231 g := store.New(j.decoders.GetOrCreate(addrs, shardCnt)) 232 for cursor := 0; cursor < len(data); cursor += j.refLength { 233 if bytesToRead == 0 { 234 break 235 } 236 237 // fast forward the cursor 238 sec := j.subtrieSection(cursor, pSize, parity, subTrieSize) 239 if cur+sec <= off { 240 cur += sec 241 continue 242 } 243 244 // if we are here it means that we are within the bounds of the data we need to read 245 addr := swarm.NewAddress(data[cursor : cursor+j.refLength]) 246 247 subtrieSpan := sec 248 subtrieSpanLimit := sec 249 250 currentReadSize := subtrieSpan - (off - cur) // the size of the subtrie, minus the offset from the start of the trie 251 252 // upper bound alignments 253 if currentReadSize > bytesToRead { 254 currentReadSize = bytesToRead 255 } 256 if currentReadSize > subtrieSpan { 257 currentReadSize = subtrieSpan 258 } 259 260 func(address swarm.Address, b []byte, cur, subTrieSize, off, bufferOffset, bytesToRead, subtrieSpanLimit int64) { 261 eg.Go(func() error { 262 ch, err := g.Get(j.ctx, addr) 263 if err != nil { 264 return err 265 } 266 267 chunkData := ch.Data()[8:] 268 subtrieLevel, subtrieSpan := j.chunkToSpan(ch.Data()) 269 _, subtrieParity := file.ReferenceCount(uint64(subtrieSpan), subtrieLevel, j.refLength == encryption.ReferenceSize) 270 271 if subtrieSpan > subtrieSpanLimit { 272 return ErrMalformedTrie 273 } 274 275 j.readAtOffset(b, chunkData, cur, subtrieSpan, off, bufferOffset, currentReadSize, bytesRead, subtrieParity, eg) 276 return nil 277 }) 278 }(addr, b, cur, subtrieSpan, off, bufferOffset, currentReadSize, subtrieSpanLimit) 279 280 bufferOffset += currentReadSize 281 bytesToRead -= currentReadSize 282 cur += subtrieSpan 283 off = cur 284 } 285 } 286 287 // getShards returns the effective reference number respective to the intermediate chunk payload length and its parities 288 func (j *joiner) getShards(payloadSize, parities int) int { 289 return (payloadSize - parities*swarm.HashSize) / j.refLength 290 } 291 292 // brute-forces the subtrie size for each of the sections in this intermediate chunk 293 func (j *joiner) subtrieSection(startIdx, payloadSize, parities int, subtrieSize int64) int64 { 294 // assume we have a trie of size `y` then we can assume that all of 295 // the forks except for the last one on the right are of equal size 296 // this is due to how the splitter wraps levels. 297 // so for the branches on the left, we can assume that 298 // y = (refs - 1) * x + l 299 // where y is the size of the subtrie, refs are the number of references 300 // x is constant (the brute forced value) and l is the size of the last subtrie 301 var ( 302 refs = int64(j.getShards(payloadSize, parities)) // how many effective references in the intermediate chunk 303 branching = int64(j.maxBranching) // branching factor is chunkSize divided by reference length 304 branchSize = int64(swarm.ChunkSize) 305 ) 306 for { 307 whatsLeft := subtrieSize - (branchSize * (refs - 1)) 308 if whatsLeft <= branchSize { 309 break 310 } 311 branchSize *= branching 312 } 313 314 // handle last branch edge case 315 if startIdx == int(refs-1)*j.refLength { 316 return subtrieSize - (refs-1)*branchSize 317 } 318 return branchSize 319 } 320 321 var errWhence = errors.New("seek: invalid whence") 322 var errOffset = errors.New("seek: invalid offset") 323 324 func (j *joiner) Seek(offset int64, whence int) (int64, error) { 325 switch whence { 326 case 0: 327 offset += 0 328 case 1: 329 offset += j.off 330 case 2: 331 332 offset = j.span - offset 333 if offset < 0 { 334 return 0, io.EOF 335 } 336 default: 337 return 0, errWhence 338 } 339 340 if offset < 0 { 341 return 0, errOffset 342 } 343 if offset > j.span { 344 return 0, io.EOF 345 } 346 j.off = offset 347 return offset, nil 348 349 } 350 351 func (j *joiner) IterateChunkAddresses(fn swarm.AddressIterFunc) error { 352 // report root address 353 err := fn(j.addr) 354 if err != nil { 355 return err 356 } 357 358 return j.processChunkAddresses(j.ctx, fn, j.rootData, j.span, j.rootParity) 359 } 360 361 func (j *joiner) processChunkAddresses(ctx context.Context, fn swarm.AddressIterFunc, data []byte, subTrieSize int64, parity int) error { 362 // we are at a leaf data chunk 363 if subTrieSize <= int64(len(data)) { 364 return nil 365 } 366 367 select { 368 case <-ctx.Done(): 369 return ctx.Err() 370 default: 371 } 372 373 eg, ectx := errgroup.WithContext(ctx) 374 375 var wg sync.WaitGroup 376 377 eSize, err := file.ChunkPayloadSize(data) 378 if err != nil { 379 return err 380 } 381 addrs, shardCnt := file.ChunkAddresses(data[:eSize], parity, j.refLength) 382 g := store.New(j.decoders.GetOrCreate(addrs, shardCnt)) 383 for i, addr := range addrs { 384 if err := fn(addr); err != nil { 385 return err 386 } 387 cursor := i * swarm.HashSize 388 if j.refLength == encryption.ReferenceSize { 389 cursor += swarm.HashSize * min(i, shardCnt) 390 } 391 sec := j.subtrieSection(cursor, eSize, parity, subTrieSize) 392 if sec <= swarm.ChunkSize { 393 continue 394 } 395 396 wg.Add(1) 397 eg.Go(func() error { 398 defer wg.Done() 399 400 if j.refLength == encryption.ReferenceSize && i < shardCnt { 401 addr = swarm.NewAddress(data[cursor : cursor+swarm.HashSize*2]) 402 } 403 ch, err := g.Get(ectx, addr) 404 if err != nil { 405 return err 406 } 407 408 chunkData := ch.Data()[8:] 409 subtrieLevel, subtrieSpan := j.chunkToSpan(ch.Data()) 410 _, parities := file.ReferenceCount(uint64(subtrieSpan), subtrieLevel, j.refLength != swarm.HashSize) 411 412 return j.processChunkAddresses(ectx, fn, chunkData, subtrieSpan, parities) 413 }) 414 415 wg.Wait() 416 } 417 418 return eg.Wait() 419 } 420 421 func (j *joiner) Size() int64 { 422 return j.span 423 } 424 425 // chunkToSpan returns redundancy level and span value 426 // in the types that the package uses 427 func chunkToSpan(data []byte) (redundancy.Level, int64) { 428 level, spanBytes := redundancy.DecodeSpan(data[:swarm.SpanSize]) 429 return level, int64(bmt.LengthFromSpan(spanBytes)) 430 }