github.com/ethersphere/bee/v2@v2.2.0/pkg/file/joiner/joiner.go

// Copyright 2020 The Swarm Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package joiner provides implementations of the file.Joiner interface
package joiner

import (
	"context"
	"errors"
	"io"
	"sync"
	"sync/atomic"

	"github.com/ethersphere/bee/v2/pkg/bmt"
	"github.com/ethersphere/bee/v2/pkg/encryption"
	"github.com/ethersphere/bee/v2/pkg/encryption/store"
	"github.com/ethersphere/bee/v2/pkg/file"
	"github.com/ethersphere/bee/v2/pkg/file/redundancy"
	"github.com/ethersphere/bee/v2/pkg/file/redundancy/getter"
	"github.com/ethersphere/bee/v2/pkg/replicas"
	storage "github.com/ethersphere/bee/v2/pkg/storage"
	"github.com/ethersphere/bee/v2/pkg/swarm"
	"golang.org/x/sync/errgroup"
)

type joiner struct {
	addr         swarm.Address
	rootData     []byte
	span         int64
	off          int64
	refLength    int
	rootParity   int
	maxBranching int // maximum branching in an intermediate chunk

	ctx         context.Context
	decoders    *decoderCache
	chunkToSpan func(data []byte) (redundancy.Level, int64) // returns the redundancy level and span value from chunkData
}

// decoderCache is a cache of decoders for intermediate chunks
type decoderCache struct {
	fetcher storage.Getter            // network retrieval interface to fetch chunks
	putter  storage.Putter            // interface to local storage to save reconstructed chunks
	mu      sync.Mutex                // mutex to protect cache
	cache   map[string]storage.Getter // map from chunk address to RS getter
	config  getter.Config             // getter configuration
}

// NewDecoderCache creates a new decoder cache
func NewDecoderCache(g storage.Getter, p storage.Putter, conf getter.Config) *decoderCache {
	return &decoderCache{
		fetcher: g,
		putter:  p,
		cache:   make(map[string]storage.Getter),
		config:  conf,
	}
}
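
// A minimal usage sketch (not part of the upstream file): a decoder cache is
// typically built once per joiner from a network Getter, a local Putter and a
// redundancy getter.Config. The netGetter and localPutter names below are
// assumed to come from the caller's storage setup.
//
//	conf := getter.DefaultConfig
//	cache := NewDecoderCache(netGetter, localPutter, conf)
//	_ = cache // handed to the joiner, which calls GetOrCreate per intermediate chunk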

// fingerprint computes a stable cache key for a set of chunk addresses by
// hashing their concatenated bytes.
func fingerprint(addrs []swarm.Address) string {
	h := swarm.NewHasher()
	for _, addr := range addrs {
		_, _ = h.Write(addr.Bytes())
	}
	return string(h.Sum(nil))
}

// GetOrCreate returns a decoder for the given chunk addresses
func (g *decoderCache) GetOrCreate(addrs []swarm.Address, shardCnt int) storage.Getter {
	// since a recovery decoder is not allowed, simply return the underlying netstore
	if g.config.Strict && g.config.Strategy == getter.NONE {
		return g.fetcher
	}

	// no parity shards were encoded for this chunk, so there is nothing to decode
	if len(addrs) == shardCnt {
		return g.fetcher
	}

	key := fingerprint(addrs)
	g.mu.Lock()
	defer g.mu.Unlock()
	d, ok := g.cache[key]
	if ok {
		if d == nil {
			return g.fetcher
		}
		return d
	}
	remove := func(err error) {
		g.mu.Lock()
		defer g.mu.Unlock()
		if err != nil {
			// signals that a new getter is needed to reattempt to recover the data
			delete(g.cache, key)
		} else {
			// signals that the chunks were fetched/recovered/cached so a future getter is not needed
			g.cache[key] = nil
		}
	}
	d = getter.New(addrs, shardCnt, g.fetcher, g.putter, remove, g.config)
	g.cache[key] = d
	return d
}
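
// A hedged sketch of how the joiner consumes the cache (illustrative only):
// the children of an intermediate chunk are listed with file.ChunkAddresses
// and the returned Getter transparently performs Reed-Solomon recovery when
// parity shards are present. data (the chunk payload truncated to its
// effective size), parity and refLength are assumed to describe an already
// fetched intermediate chunk.
//
//	addrs, shardCnt := file.ChunkAddresses(data, parity, refLength)
//	g := cache.GetOrCreate(addrs, shardCnt)
//	ch, err := g.Get(ctx, addrs[0]) // fetches, or reconstructs from parities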

// New creates a new Joiner. A Joiner provides Read, Seek and Size functionalities.
func New(ctx context.Context, g storage.Getter, putter storage.Putter, address swarm.Address) (file.Joiner, int64, error) {
	// retrieve the root chunk to read the total length of the data to be retrieved
	rLevel := redundancy.GetLevelFromContext(ctx)
	rootChunkGetter := store.New(g)
	if rLevel != redundancy.NONE {
		rootChunkGetter = store.New(replicas.NewGetter(g, rLevel))
	}
	rootChunk, err := rootChunkGetter.Get(ctx, address)
	if err != nil {
		return nil, 0, err
	}

	chunkData := rootChunk.Data()
	rootData := chunkData[swarm.SpanSize:]
	refLength := len(address.Bytes())
	encryption := refLength == encryption.ReferenceSize
	rLevel, span := chunkToSpan(chunkData)
	rootParity := 0
	maxBranching := swarm.ChunkSize / refLength
	spanFn := func(data []byte) (redundancy.Level, int64) {
		return 0, int64(bmt.LengthFromSpan(data[:swarm.SpanSize]))
	}
	conf, err := getter.NewConfigFromContext(ctx, getter.DefaultConfig)
	if err != nil {
		return nil, 0, err
	}
	// override the defaults above if the root chunk has redundancy
	if rLevel != redundancy.NONE {
		_, parities := file.ReferenceCount(uint64(span), rLevel, encryption)
		rootParity = parities

		spanFn = chunkToSpan
		if encryption {
			maxBranching = rLevel.GetMaxEncShards()
		} else {
			maxBranching = rLevel.GetMaxShards()
		}
	} else {
		// if the root chunk has no redundancy, strategy is ignored and set to DATA and strict is set to true
		conf.Strategy = getter.DATA
		conf.Strict = true
	}

	j := &joiner{
		addr:         rootChunk.Address(),
		refLength:    refLength,
		ctx:          ctx,
		decoders:     NewDecoderCache(g, putter, conf),
		span:         span,
		rootData:     rootData,
		rootParity:   rootParity,
		maxBranching: maxBranching,
		chunkToSpan:  spanFn,
	}

	return j, span, nil
}
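
// A minimal usage sketch (illustrative, not part of the upstream file): a
// joiner implements io.Reader and io.Seeker, so a file can be read back in
// full with io.ReadAll. The g, p and rootAddr values are assumed to come from
// the caller's storage setup.
//
//	func readWholeFile(ctx context.Context, g storage.Getter, p storage.Putter, rootAddr swarm.Address) ([]byte, error) {
//		j, _, err := New(ctx, g, p, rootAddr)
//		if err != nil {
//			return nil, err
//		}
//		return io.ReadAll(j) // the second return value of New is the total span in bytes
//	}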

// Read is called by the consumer to retrieve the joined data.
// It reads up to cap(b) bytes starting at the current offset and advances the
// offset by the number of bytes read.
func (j *joiner) Read(b []byte) (n int, err error) {
	read, err := j.ReadAt(b, j.off)
	if err != nil && !errors.Is(err, io.EOF) {
		return read, err
	}

	j.off += int64(read)
	return read, err
}

func (j *joiner) ReadAt(buffer []byte, off int64) (read int, err error) {
	// the offset is an int64 while swarm spans are uint64, so we cannot seek beyond the int64 max value
	if off >= j.span {
		return 0, io.EOF
	}

	readLen := int64(cap(buffer))
	if readLen > j.span-off {
		readLen = j.span - off
	}
	var bytesRead int64
	var eg errgroup.Group
	j.readAtOffset(buffer, j.rootData, 0, j.span, off, 0, readLen, &bytesRead, j.rootParity, &eg)

	err = eg.Wait()
	if err != nil {
		return 0, err
	}

	return int(atomic.LoadInt64(&bytesRead)), nil
}
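
// A hedged sketch of random access through ReadAt (illustrative only): read
// 4096 bytes starting at byte offset 100_000 of the joined file. j is assumed
// to be a joiner returned by New.
//
//	buf := make([]byte, 4096)
//	n, err := j.ReadAt(buf, 100_000)
//	if err != nil && !errors.Is(err, io.EOF) {
//		// handle retrieval error
//	}
//	payload := buf[:n]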

var ErrMalformedTrie = errors.New("malformed tree")

func (j *joiner) readAtOffset(
	b, data []byte,
	cur, subTrieSize, off, bufferOffset, bytesToRead int64,
	bytesRead *int64,
	parity int,
	eg *errgroup.Group,
) {
	// we are at a leaf data chunk
	if subTrieSize <= int64(len(data)) {
		dataOffsetStart := off - cur
		dataOffsetEnd := dataOffsetStart + bytesToRead

		if lenDataToCopy := int64(len(data)) - dataOffsetStart; bytesToRead > lenDataToCopy {
			dataOffsetEnd = dataOffsetStart + lenDataToCopy
		}

		bs := data[dataOffsetStart:dataOffsetEnd]
		n := copy(b[bufferOffset:bufferOffset+int64(len(bs))], bs)
		atomic.AddInt64(bytesRead, int64(n))
		return
	}
	pSize, err := file.ChunkPayloadSize(data)
	if err != nil {
		eg.Go(func() error {
			return err
		})
		return
	}

	addrs, shardCnt := file.ChunkAddresses(data[:pSize], parity, j.refLength)
	g := store.New(j.decoders.GetOrCreate(addrs, shardCnt))
	for cursor := 0; cursor < len(data); cursor += j.refLength {
		if bytesToRead == 0 {
			break
		}

		// fast forward the cursor
		sec := j.subtrieSection(cursor, pSize, parity, subTrieSize)
		if cur+sec <= off {
			cur += sec
			continue
		}

		// if we are here it means that we are within the bounds of the data we need to read
		addr := swarm.NewAddress(data[cursor : cursor+j.refLength])

		subtrieSpan := sec
		subtrieSpanLimit := sec

		currentReadSize := subtrieSpan - (off - cur) // the size of the subtrie, minus the offset from the start of the trie

		// upper bound alignments
		if currentReadSize > bytesToRead {
			currentReadSize = bytesToRead
		}
		if currentReadSize > subtrieSpan {
			currentReadSize = subtrieSpan
		}

		func(address swarm.Address, b []byte, cur, subTrieSize, off, bufferOffset, bytesToRead, subtrieSpanLimit int64) {
			eg.Go(func() error {
				ch, err := g.Get(j.ctx, addr)
				if err != nil {
					return err
				}

				chunkData := ch.Data()[8:] // skip the 8-byte span prefix
				subtrieLevel, subtrieSpan := j.chunkToSpan(ch.Data())
				_, subtrieParity := file.ReferenceCount(uint64(subtrieSpan), subtrieLevel, j.refLength == encryption.ReferenceSize)

				if subtrieSpan > subtrieSpanLimit {
					return ErrMalformedTrie
				}

				j.readAtOffset(b, chunkData, cur, subtrieSpan, off, bufferOffset, currentReadSize, bytesRead, subtrieParity, eg)
				return nil
			})
		}(addr, b, cur, subtrieSpan, off, bufferOffset, currentReadSize, subtrieSpanLimit)

		bufferOffset += currentReadSize
		bytesToRead -= currentReadSize
		cur += subtrieSpan
		off = cur
	}
}

// getShards returns the effective number of references in an intermediate chunk,
// given its payload length and number of parities
func (j *joiner) getShards(payloadSize, parities int) int {
	return (payloadSize - parities*swarm.HashSize) / j.refLength
}

// brute-forces the subtrie size for each of the sections in this intermediate chunk
func (j *joiner) subtrieSection(startIdx, payloadSize, parities int, subtrieSize int64) int64 {
	// assume we have a trie of size `y` then we can assume that all of
	// the forks except for the last one on the right are of equal size
	// this is due to how the splitter wraps levels.
	// so for the branches on the left, we can assume that
	// y = (refs - 1) * x + l
	// where y is the size of the subtrie, refs are the number of references
	// x is constant (the brute forced value) and l is the size of the last subtrie
	var (
		refs       = int64(j.getShards(payloadSize, parities)) // how many effective references in the intermediate chunk
		branching  = int64(j.maxBranching)                     // branching factor is chunkSize divided by reference length
		branchSize = int64(swarm.ChunkSize)
	)
	for {
		whatsLeft := subtrieSize - (branchSize * (refs - 1))
		if whatsLeft <= branchSize {
			break
		}
		branchSize *= branching
	}

	// handle last branch edge case
	if startIdx == int(refs-1)*j.refLength {
		return subtrieSize - (refs-1)*branchSize
	}
	return branchSize
}
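
// A worked example of the brute force above (illustrative only): for an
// unencrypted file of 1,000,000 bytes without redundancy, the root chunk holds
// 2 references and branching is 4096/32 = 128. Starting from branchSize = 4096,
// whatsLeft = 1,000,000 - 4096*1 = 995,904 > 4096, so branchSize grows to
// 4096*128 = 524,288; then whatsLeft = 475,712 <= 524,288 and the loop stops.
// Every section except the last therefore spans 524,288 bytes, and the last
// section spans 1,000,000 - 524,288 = 475,712 bytes.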

var errWhence = errors.New("seek: invalid whence")
var errOffset = errors.New("seek: invalid offset")

func (j *joiner) Seek(offset int64, whence int) (int64, error) {
	switch whence {
	case 0:
		// seek relative to the origin of the file
	case 1:
		// seek relative to the current offset
		offset += j.off
	case 2:
		// seek relative to the end: offset is interpreted as a distance back from the end of the span
		offset = j.span - offset
		if offset < 0 {
			return 0, io.EOF
		}
	default:
		return 0, errWhence
	}

	if offset < 0 {
		return 0, errOffset
	}
	if offset > j.span {
		return 0, io.EOF
	}
	j.off = offset
	return offset, nil
}
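
// A short usage sketch (illustrative only): position the reader at an absolute
// offset and continue reading sequentially from there. j is assumed to be a
// joiner returned by New.
//
//	if _, err := j.Seek(4096, 0); err != nil { // whence 0: absolute offset
//		// handle seek error
//	}
//	buf := make([]byte, 1024)
//	n, _ := j.Read(buf) // reads bytes 4096..4096+n of the joined data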

func (j *joiner) IterateChunkAddresses(fn swarm.AddressIterFunc) error {
	// report root address
	err := fn(j.addr)
	if err != nil {
		return err
	}

	return j.processChunkAddresses(j.ctx, fn, j.rootData, j.span, j.rootParity)
}
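
// A hedged usage sketch (illustrative only): collect every chunk address that
// makes up the file, starting with the root. j is assumed to be a joiner
// returned by New.
//
//	var addrs []swarm.Address
//	err := j.IterateChunkAddresses(func(addr swarm.Address) error {
//		addrs = append(addrs, addr)
//		return nil
//	})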

func (j *joiner) processChunkAddresses(ctx context.Context, fn swarm.AddressIterFunc, data []byte, subTrieSize int64, parity int) error {
	// we are at a leaf data chunk
	if subTrieSize <= int64(len(data)) {
		return nil
	}

	select {
	case <-ctx.Done():
		return ctx.Err()
	default:
	}

	eg, ectx := errgroup.WithContext(ctx)

	var wg sync.WaitGroup

	eSize, err := file.ChunkPayloadSize(data)
	if err != nil {
		return err
	}
	addrs, shardCnt := file.ChunkAddresses(data[:eSize], parity, j.refLength)
	g := store.New(j.decoders.GetOrCreate(addrs, shardCnt))
	for i, addr := range addrs {
		if err := fn(addr); err != nil {
			return err
		}
		cursor := i * swarm.HashSize
		if j.refLength == encryption.ReferenceSize {
			cursor += swarm.HashSize * min(i, shardCnt)
		}
		sec := j.subtrieSection(cursor, eSize, parity, subTrieSize)
		if sec <= swarm.ChunkSize {
			continue
		}

		wg.Add(1)
		eg.Go(func() error {
			defer wg.Done()

			if j.refLength == encryption.ReferenceSize && i < shardCnt {
				addr = swarm.NewAddress(data[cursor : cursor+swarm.HashSize*2])
			}
			ch, err := g.Get(ectx, addr)
			if err != nil {
				return err
			}

			chunkData := ch.Data()[8:] // skip the 8-byte span prefix
			subtrieLevel, subtrieSpan := j.chunkToSpan(ch.Data())
			_, parities := file.ReferenceCount(uint64(subtrieSpan), subtrieLevel, j.refLength != swarm.HashSize)

			return j.processChunkAddresses(ectx, fn, chunkData, subtrieSpan, parities)
		})

		wg.Wait()
	}

	return eg.Wait()
}

func (j *joiner) Size() int64 {
	return j.span
}

// chunkToSpan returns redundancy level and span value
// in the types that the package uses
func chunkToSpan(data []byte) (redundancy.Level, int64) {
	level, spanBytes := redundancy.DecodeSpan(data[:swarm.SpanSize])
	return level, int64(bmt.LengthFromSpan(spanBytes))
}