github.com/FUSIONFoundation/efsn@v3.6.2-0.20200916075423-dbb5dd5d2cc7+incompatible/swarm/storage/chunker.go

// Copyright 2016 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
package storage

import (
	"context"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"sync"

	"github.com/FusionFoundation/efsn/metrics"
	ch "github.com/FusionFoundation/efsn/swarm/chunk"
	"github.com/FusionFoundation/efsn/swarm/log"
	"github.com/FusionFoundation/efsn/swarm/spancontext"
	opentracing "github.com/opentracing/opentracing-go"
	olog "github.com/opentracing/opentracing-go/log"
)

/*
The distributed storage implemented in this package requires fixed-size chunks of content.

Chunker is the interface to a component that is responsible for disassembling and assembling larger data.

TreeChunker implements a Chunker based on a tree structure defined as follows:

1 Each node in the tree, including the root and other branching nodes, is stored as a chunk.

2 Branching nodes encode data content that includes the size of the data slice covered by the node's entire subtree, as well as the hash keys of all its children:
data_{i} := size(subtree_{i}) || key_{j} || key_{j+1} .... || key_{j+n-1}

3 Leaf nodes encode an actual subslice of the input data.

4 If the data size is not more than the maximum chunk size, the data is stored in a single chunk
  key = hash(int64(size) + data)

5 If the data size is more than chunkSize*branches^l, but not more than chunkSize*branches^(l+1), the data vector is split into slices of chunkSize*branches^l length (except the last one).
  key = hash(int64(size) + key(slice0) + key(slice1) + ...)

The underlying hash function is configurable.
*/

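// The depth/size relation in rules 4 and 5 can be sketched as follows. This is
// an illustrative helper, not part of the chunker API: it returns the number
// of branching levels needed for a given data size, given the configured
// chunk size and branch count (e.g. 4096 and 128).
func exampleTreeDepth(dataSize, chunkSize, branches int64) int {
	depth := 0
	// lowest depth such that chunkSize*branches^depth >= dataSize
	for treeSize := chunkSize; treeSize < dataSize; treeSize *= branches {
		depth++
	}
	return depth
}
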
/*
Tree chunker is a concrete implementation of data chunking.
This chunker works in a simple way: it builds a tree out of the document so that each node either represents a chunk of real data or a branching non-leaf node of the tree. In particular, each such non-leaf chunk is a concatenation of the hashes of its respective children. This scheme simultaneously guarantees data integrity as well as self addressing. Abstract nodes are transparent since their encoded size component is strictly greater than the maximum chunk data size, since they encode a subtree.

If all is well, it is possible to implement this by simply composing readers, so that no extra allocation or buffering is necessary for the data splitting and joining. This means that in principle there can be direct IO between: memory, file system, network socket (a bzz peer's storage request is read from the socket). In practice there may be a need for several stages of internal buffering.
The hashing itself does use extra copies and allocations, though, since it needs them.
*/

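// Chunks throughout this file share one layout: an 8-byte little-endian size
// prefix followed by the payload (raw data for leaf chunks, child hashes for
// branching chunks). A minimal sketch of the leaf encoding, mirroring what
// split() builds below (illustrative helper, not used by the chunker):
func exampleLeafChunk(payload []byte) []byte {
	chunk := make([]byte, 8+len(payload))
	binary.LittleEndian.PutUint64(chunk[:8], uint64(len(payload)))
	copy(chunk[8:], payload)
	return chunk
}
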
var (
	errAppendOppNotSupported = errors.New("append operation not supported")
)

type ChunkerParams struct {
	chunkSize int64
	hashSize  int64
}

type SplitterParams struct {
	ChunkerParams
	reader io.Reader
	putter Putter
	addr   Address
}

type TreeSplitterParams struct {
	SplitterParams
	size int64
}

type JoinerParams struct {
	ChunkerParams
	addr   Address
	getter Getter
	// TODO: there is a bug, so depth can only be 0 today, see: https://github.com/ethersphere/go-ethereum/issues/344
	depth int
	ctx   context.Context
}

type TreeChunker struct {
	ctx context.Context

	branches int64
	hashFunc SwarmHasher
	dataSize int64
	data     io.Reader
	// calculated
	addr        Address
	depth       int
	hashSize    int64        // self.hashFunc.New().Size()
	chunkSize   int64        // hashSize * branches
	workerCount int64        // the number of worker routines used
	workerLock  sync.RWMutex // lock for the worker count
	jobC        chan *hashJob
	wg          *sync.WaitGroup
	putter      Putter
	getter      Getter
	errC        chan error
	quitC       chan bool
}

/*
	Join reconstructs the original content based on a root key.
	When joining, the caller is returned a lazy SectionReader (a LazyChunkReader),
	which is seekable and implements on-demand fetching of chunks as and where it is read.
	New chunks to retrieve come from the getter, which the caller provides.
	If an error is encountered during joining, it appears as a reader error.
	As a result, partial reads from a document are possible even if other parts
	are corrupt or lost.
	The chunks are not meant to be validated by the chunker when joining. This
	is because it is left to the DPA to decide which sources are trusted.
*/
func TreeJoin(ctx context.Context, addr Address, getter Getter, depth int) *LazyChunkReader {
	jp := &JoinerParams{
		ChunkerParams: ChunkerParams{
			chunkSize: ch.DefaultSize,
			hashSize:  int64(len(addr)),
		},
		addr:   addr,
		getter: getter,
		depth:  depth,
		ctx:    ctx,
	}

	return NewTreeJoiner(jp).Join(ctx)
}

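// A typical use of TreeJoin, assuming a Getter implementation `getter` and a
// root address `addr` obtained from an earlier split (illustrative sketch;
// depth is 0, see the TODO on JoinerParams):
//
//	reader := TreeJoin(ctx, addr, getter, 0)
//	buf := make([]byte, 4096)
//	n, err := reader.ReadAt(buf, 0)
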
/*
	When splitting, data is given as a SectionReader, and the key is a hashSize-long byte slice (Key); the root hash of the entire content will fill this once processing finishes.
	New chunks to store are stored using the putter, which the caller provides.
*/
func TreeSplit(ctx context.Context, data io.Reader, size int64, putter Putter) (k Address, wait func(context.Context) error, err error) {
	tsp := &TreeSplitterParams{
		SplitterParams: SplitterParams{
			ChunkerParams: ChunkerParams{
				chunkSize: ch.DefaultSize,
				hashSize:  putter.RefSize(),
			},
			reader: data,
			putter: putter,
		},
		size: size,
	}
	return NewTreeSplitter(tsp).Split(ctx)
}

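// A typical use of TreeSplit, assuming a Putter implementation `putter`
// (illustrative sketch; `bytes` would need importing):
//
//	data := []byte("hello swarm")
//	addr, wait, err := TreeSplit(ctx, bytes.NewReader(data), int64(len(data)), putter)
//	if err == nil {
//		err = wait(ctx)
//	}
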
func NewTreeJoiner(params *JoinerParams) *TreeChunker {
	tc := &TreeChunker{}
	tc.hashSize = params.hashSize
	tc.branches = params.chunkSize / params.hashSize
	tc.addr = params.addr
	tc.getter = params.getter
	tc.depth = params.depth
	tc.chunkSize = params.chunkSize
	tc.workerCount = 0
	tc.jobC = make(chan *hashJob, 2*ChunkProcessors)
	tc.wg = &sync.WaitGroup{}
	tc.errC = make(chan error)
	tc.quitC = make(chan bool)

	tc.ctx = params.ctx

	return tc
}

func NewTreeSplitter(params *TreeSplitterParams) *TreeChunker {
	tc := &TreeChunker{}
	tc.data = params.reader
	tc.dataSize = params.size
	tc.hashSize = params.hashSize
	tc.branches = params.chunkSize / params.hashSize
	tc.addr = params.addr
	tc.chunkSize = params.chunkSize
	tc.putter = params.putter
	tc.workerCount = 0
	tc.jobC = make(chan *hashJob, 2*ChunkProcessors)
	tc.wg = &sync.WaitGroup{}
	tc.errC = make(chan error)
	tc.quitC = make(chan bool)

	return tc
}

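// hashJob describes one chunk waiting to be stored: a worker hashes and puts
// the chunk, copies the resulting hash into key, and releases parentWg.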
type hashJob struct {
	key      Address
	chunk    []byte
	size     int64
	parentWg *sync.WaitGroup
}

func (tc *TreeChunker) incrementWorkerCount() {
	tc.workerLock.Lock()
	defer tc.workerLock.Unlock()
	tc.workerCount++
}

func (tc *TreeChunker) getWorkerCount() int64 {
	tc.workerLock.RLock()
	defer tc.workerLock.RUnlock()
	return tc.workerCount
}

func (tc *TreeChunker) decrementWorkerCount() {
	tc.workerLock.Lock()
	defer tc.workerLock.Unlock()
	tc.workerCount--
}

func (tc *TreeChunker) Split(ctx context.Context) (k Address, wait func(context.Context) error, err error) {
	if tc.chunkSize <= 0 {
		panic("chunker must be initialised")
	}

	tc.runWorker(ctx)

	depth := 0
	treeSize := tc.chunkSize

	// take the lowest depth such that chunkSize*branches^depth >= dataSize:
	// a power series that finds the order of magnitude of the data size in base branches, i.e. the number of levels of branching in the resulting tree.
	for ; treeSize < tc.dataSize; treeSize *= tc.branches {
		depth++
	}

	key := make([]byte, tc.hashSize)
	// this waitgroup member is released after the root hash is calculated
	tc.wg.Add(1)
	// launch the actual recursive function, passing the waitgroups
	go tc.split(ctx, depth, treeSize/tc.branches, key, tc.dataSize, tc.wg)

	// closes the internal error channel once all subprocesses in the workgroup have finished
	go func() {
		// waiting for all threads to finish
		tc.wg.Wait()
		close(tc.errC)
	}()

	defer close(tc.quitC)
	defer tc.putter.Close()
	select {
	case err := <-tc.errC:
		if err != nil {
			return nil, nil, err
		}
	case <-ctx.Done():
		return nil, nil, ctx.Err()
	}

	return key, tc.putter.Wait, nil
}

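// split recursively chops the data into a tree of chunks. depth is the number
// of branching levels still to descend and treeSize the maximum data size
// covered by one child subtree; the chunk's hash is written into addr by a
// worker once the chunk has been stored.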
func (tc *TreeChunker) split(ctx context.Context, depth int, treeSize int64, addr Address, size int64, parentWg *sync.WaitGroup) {

	for depth > 0 && size < treeSize {
		treeSize /= tc.branches
		depth--
	}

	if depth == 0 {
		// leaf nodes -> content chunks
		chunkData := make([]byte, size+8)
		binary.LittleEndian.PutUint64(chunkData[0:8], uint64(size))
		var readBytes int64
		for readBytes < size {
			n, err := tc.data.Read(chunkData[8+readBytes:])
			readBytes += int64(n)
			if err != nil && !(err == io.EOF && readBytes == size) {
				tc.errC <- err
				return
			}
		}
		select {
		case tc.jobC <- &hashJob{addr, chunkData, size, parentWg}:
		case <-tc.quitC:
		}
		return
	}
	// depth > 0
	// intermediate chunk containing child node hashes
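	// branchCnt is the ceiling of size/treeSize: one child per treeSize-sized slice of the data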
	branchCnt := (size + treeSize - 1) / treeSize

	var chunk = make([]byte, branchCnt*tc.hashSize+8)
	var pos, i int64

	binary.LittleEndian.PutUint64(chunk[0:8], uint64(size))

	childrenWg := &sync.WaitGroup{}
	var secSize int64
	for i < branchCnt {
		// the last item can have shorter data
		if size-pos < treeSize {
			secSize = size - pos
		} else {
			secSize = treeSize
		}
		// the hash of that data
		subTreeAddress := chunk[8+i*tc.hashSize : 8+(i+1)*tc.hashSize]

		childrenWg.Add(1)
		tc.split(ctx, depth-1, treeSize/tc.branches, subTreeAddress, secSize, childrenWg)

		i++
		pos += treeSize
	}
	// wait for all the children to complete calculating their hashes and copying them onto sections of the chunk
	childrenWg.Wait()

	worker := tc.getWorkerCount()
	if int64(len(tc.jobC)) > worker && worker < ChunkProcessors {
		tc.runWorker(ctx)
	}
	select {
	case tc.jobC <- &hashJob{addr, chunk, size, parentWg}:
	case <-tc.quitC:
	}
}

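// runWorker starts a goroutine that consumes hash jobs from jobC, stores each
// chunk via the putter and copies the resulting hash into the job's key,
// until the job channel is closed or the chunker quits.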
func (tc *TreeChunker) runWorker(ctx context.Context) {
	tc.incrementWorkerCount()
	go func() {
		defer tc.decrementWorkerCount()
		for {
			select {
			case job, ok := <-tc.jobC:
				if !ok {
					return
				}

				h, err := tc.putter.Put(ctx, job.chunk)
				if err != nil {
					tc.errC <- err
					return
				}
				copy(job.key, h)
				job.parentWg.Done()
			case <-tc.quitC:
				return
			}
		}
	}()
}

func (tc *TreeChunker) Append() (Address, func(), error) {
	return nil, nil, errAppendOppNotSupported
}

// LazyChunkReader implements LazySectionReader
type LazyChunkReader struct {
	ctx       context.Context
	addr      Address // root address
	chunkData ChunkData
	off       int64 // offset
	chunkSize int64 // inherited from chunker
	branches  int64 // inherited from chunker
	hashSize  int64 // inherited from chunker
	depth     int
	getter    Getter
}

func (tc *TreeChunker) Join(ctx context.Context) *LazyChunkReader {
	return &LazyChunkReader{
		addr:      tc.addr,
		chunkSize: tc.chunkSize,
		branches:  tc.branches,
		hashSize:  tc.hashSize,
		depth:     tc.depth,
		getter:    tc.getter,
		ctx:       tc.ctx,
	}
}

func (r *LazyChunkReader) Context() context.Context {
	return r.ctx
}

// Size is meant to be called on the LazySectionReader
func (r *LazyChunkReader) Size(ctx context.Context, quitC chan bool) (n int64, err error) {
	metrics.GetOrRegisterCounter("lazychunkreader.size", nil).Inc(1)

	var sp opentracing.Span
	var cctx context.Context
	cctx, sp = spancontext.StartSpan(
		ctx,
		"lcr.size")
	defer sp.Finish()

	log.Debug("lazychunkreader.size", "addr", r.addr)
	if r.chunkData == nil {
		chunkData, err := r.getter.Get(cctx, Reference(r.addr))
		if err != nil {
			return 0, err
		}
		r.chunkData = chunkData
		s := r.chunkData.Size()
		log.Debug("lazychunkreader.size", "key", r.addr, "size", s)
		// Size() returns a uint64 read from the chunk header; a value that
		// overflows int64 indicates a corrupt size field.
		if int64(s) < 0 {
			return 0, errors.New("corrupt size")
		}
		return int64(s), nil
	}
	s := r.chunkData.Size()
	log.Debug("lazychunkreader.size", "key", r.addr, "size", s)

	return int64(s), nil
}

// ReadAt can be called numerous times; concurrent reads are allowed.
// Size() needs to be called synchronously on the LazyChunkReader first.
func (r *LazyChunkReader) ReadAt(b []byte, off int64) (read int, err error) {
	metrics.GetOrRegisterCounter("lazychunkreader.readat", nil).Inc(1)

	var sp opentracing.Span
	var cctx context.Context
	cctx, sp = spancontext.StartSpan(
		r.ctx,
		"lcr.read")
	defer sp.Finish()

	defer func() {
		sp.LogFields(
			olog.Int("off", int(off)),
			olog.Int("read", read))
	}()

	// this is correct: a swarm doc cannot be zero length, so no EOF is expected
	if len(b) == 0 {
		return 0, nil
	}
	quitC := make(chan bool)
	size, err := r.Size(cctx, quitC)
	if err != nil {
		log.Debug("lazychunkreader.readat.size", "size", size, "err", err)
		return 0, err
	}

	errC := make(chan error)

	// calculate depth and max treeSize
	var treeSize int64
	var depth int
	treeSize = r.chunkSize
	for ; treeSize < size; treeSize *= r.branches {
		depth++
	}
	wg := sync.WaitGroup{}
	length := int64(len(b))
	for d := 0; d < r.depth; d++ {
		off *= r.chunkSize
		length *= r.chunkSize
	}
	wg.Add(1)
	go r.join(b, off, off+length, depth, treeSize/r.branches, r.chunkData, &wg, errC, quitC)
	go func() {
		wg.Wait()
		close(errC)
	}()

	err = <-errC
	if err != nil {
		log.Debug("lazychunkreader.readat.errc", "err", err)
		close(quitC)
		return 0, err
	}
	if off+int64(len(b)) >= size {
		log.Debug("lazychunkreader.readat.return at end", "size", size, "off", off)
		return int(size - off), io.EOF
	}
	log.Debug("lazychunkreader.readat.errc", "buff", len(b))
	return len(b), nil
}

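// join recursively descends the chunk tree rooted at chunkData, copying the
// requested byte range [off, eoff) into b. Child chunks are fetched
// concurrently through the getter; any error is reported on errC.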
func (r *LazyChunkReader) join(b []byte, off int64, eoff int64, depth int, treeSize int64, chunkData ChunkData, parentWg *sync.WaitGroup, errC chan error, quitC chan bool) {
	defer parentWg.Done()
	// find the appropriate block level
	for chunkData.Size() < uint64(treeSize) && depth > r.depth {
		treeSize /= r.branches
		depth--
	}

	// leaf chunk found
	if depth == r.depth {
		extra := 8 + eoff - int64(len(chunkData))
		if extra > 0 {
			eoff -= extra
		}
		copy(b, chunkData[8+off:8+eoff])
		return // simply give back the chunk's reader for content chunks
	}

	// subtree
	start := off / treeSize
	end := (eoff + treeSize - 1) / treeSize

	// the last non-leaf chunk can be shorter than the default chunk size; let's not read it further than its end
	currentBranches := int64(len(chunkData)-8) / r.hashSize
	if end > currentBranches {
		end = currentBranches
	}

	wg := &sync.WaitGroup{}
	defer wg.Wait()
	for i := start; i < end; i++ {
		soff := i * treeSize
		roff := soff
		seoff := soff + treeSize

		if soff < off {
			soff = off
		}
		if seoff > eoff {
			seoff = eoff
		}
		if depth > 1 {
			wg.Wait()
		}
		wg.Add(1)
		go func(j int64) {
			childAddress := chunkData[8+j*r.hashSize : 8+(j+1)*r.hashSize]
			chunkData, err := r.getter.Get(r.ctx, Reference(childAddress))
			if err != nil {
				log.Debug("lazychunkreader.join", "key", fmt.Sprintf("%x", childAddress), "err", err)
				select {
				case errC <- fmt.Errorf("chunk %v-%v not found; key: %s", off, off+treeSize, fmt.Sprintf("%x", childAddress)):
				case <-quitC:
				}
				return
			}
			if l := len(chunkData); l < 9 {
				select {
				case errC <- fmt.Errorf("chunk %v-%v incomplete; key: %s, data length %v", off, off+treeSize, fmt.Sprintf("%x", childAddress), l):
				case <-quitC:
				}
				return
			}
			if soff < off {
				soff = off
			}
			r.join(b[soff-off:seoff-off], soff-roff, seoff-roff, depth-1, treeSize/r.branches, chunkData, wg, errC, quitC)
		}(i)
	}
}

// Read keeps a cursor, so it cannot be called simultaneously; see ReadAt
func (r *LazyChunkReader) Read(b []byte) (read int, err error) {
	log.Debug("lazychunkreader.read", "key", r.addr)
	metrics.GetOrRegisterCounter("lazychunkreader.read", nil).Inc(1)

	read, err = r.ReadAt(b, r.off)
	if err != nil && err != io.EOF {
		log.Debug("lazychunkreader.readat", "read", read, "err", err)
		metrics.GetOrRegisterCounter("lazychunkreader.read.err", nil).Inc(1)
	}

	metrics.GetOrRegisterCounter("lazychunkreader.read.bytes", nil).Inc(int64(read))

	r.off += int64(read)
	return read, err
}

// completely analogous to the standard SectionReader implementation
var errWhence = errors.New("Seek: invalid whence")
var errOffset = errors.New("Seek: invalid offset")

func (r *LazyChunkReader) Seek(offset int64, whence int) (int64, error) {
	log.Debug("lazychunkreader.seek", "key", r.addr, "offset", offset)
	switch whence {
	default:
		return 0, errWhence
	case io.SeekStart:
		// offset is already absolute
	case io.SeekCurrent:
		offset += r.off
	case io.SeekEnd:
		if r.chunkData == nil { // seeking from the end requires the root chunk for the size; call Size first
			_, err := r.Size(context.TODO(), nil)
			if err != nil {
				return 0, fmt.Errorf("can't get size: %v", err)
			}
		}
		offset += int64(r.chunkData.Size())
	}

	if offset < 0 {
		return 0, errOffset
	}
	r.off = offset
	return offset, nil
}
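
// Illustrative use of Seek to read the tail of a document (sketch; `reader`
// is a *LazyChunkReader obtained from TreeJoin):
//
//	tail := make([]byte, 16)
//	if _, err := reader.Seek(-16, io.SeekEnd); err == nil {
//		_, _ = reader.Read(tail)
//	}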