
     1  // Copyright 2018 The Energi Core Authors
     2  // Copyright 2016 The go-ethereum Authors
     3  // This file is part of the Energi Core library.
     4  //
     5  // The Energi Core library is free software: you can redistribute it and/or modify
     6  // it under the terms of the GNU Lesser General Public License as published by
     7  // the Free Software Foundation, either version 3 of the License, or
     8  // (at your option) any later version.
     9  //
    10  // The Energi Core library is distributed in the hope that it will be useful,
    11  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    13  // GNU Lesser General Public License for more details.
    14  //
    15  // You should have received a copy of the GNU Lesser General Public License
    16  // along with the Energi Core library. If not, see <>.
    18  package storage
    20  import (
    21  	"context"
    22  	"encoding/binary"
    23  	"errors"
    24  	"fmt"
    25  	"io"
    26  	"sync"
    27  	"time"
    29  	""
    30  	ch ""
    31  	""
    32  	""
    33  	opentracing ""
    34  	olog ""
    35  )
    37  /*
     38  The distributed storage implemented in this package requires fixed-size chunks of content.
    40  Chunker is the interface to a component that is responsible for disassembling and assembling larger data.
    42  TreeChunker implements a Chunker based on a tree structure defined as follows:
    44  1 each node in the tree including the root and other branching nodes are stored as a chunk.
    46  2 branching nodes encode data contents that includes the size of the dataslice covered by its entire subtree under the node as well as the hash keys of all its children :
    47  data_{i} := size(subtree_{i}) || key_{j} || key_{j+1} .... || key_{j+n-1}
    49  3 Leaf nodes encode an actual subslice of the input data.
    51  4 if data size is not more than maximum chunksize, the data is stored in a single chunk
    52    key = hash(int64(size) + data)
    54  5 if data size is more than chunksize*branches^l, but no more than chunksize*
    55    branches^(l+1), the data vector is split into slices of chunksize*
    56    branches^l length (except the last one).
    57    key = hash(int64(size) + key(slice0) + key(slice1) + ...)
    59   The underlying hash function is configurable
    60  */
    62  /*
    63  Tree chunker is a concrete implementation of data chunking.
     64  This chunker works in a simple way: it builds a tree out of the document so that each node represents either a chunk of real data or a chunk of data representing a branching non-leaf node of the tree. In particular, each such non-leaf chunk is a concatenation of the hashes of its respective children. This scheme simultaneously guarantees data integrity as well as self-addressing. Abstract nodes are transparent since their represented size component is strictly greater than their maximum data size, since they encode a subtree.
    66  If all is well it is possible to implement this by simply composing readers so that no extra allocation or buffering is necessary for the data splitting and joining. This means that in principle there can be direct IO between : memory, file system, network socket (bzz peers storage request is read from the socket). In practice there may be need for several stages of internal buffering.
    67  The hashing itself does use extra copies and allocation though, since it does need it.
    68  */
    70  type ChunkerParams struct {
    71  	chunkSize int64
    72  	hashSize  int64
    73  }
    75  type SplitterParams struct {
    76  	ChunkerParams
    77  	reader io.Reader
    78  	putter Putter
    79  	addr   Address
    80  }
    82  type TreeSplitterParams struct {
    83  	SplitterParams
    84  	size int64
    85  }
    87  type JoinerParams struct {
    88  	ChunkerParams
    89  	addr   Address
    90  	getter Getter
    91  	// TODO: there is a bug, so depth can only be 0 today, see:
    92  	depth int
    93  	ctx   context.Context
    94  }
    96  type TreeChunker struct {
    97  	ctx context.Context
    99  	branches int64
   100  	dataSize int64
   101  	data     io.Reader
   102  	// calculated
   103  	addr        Address
   104  	depth       int
   105  	hashSize    int64        // self.hashFunc.New().Size()
   106  	chunkSize   int64        // hashSize* branches
   107  	workerCount int64        // the number of worker routines used
   108  	workerLock  sync.RWMutex // lock for the worker count
   109  	jobC        chan *hashJob
   110  	wg          *sync.WaitGroup
   111  	putter      Putter
   112  	getter      Getter
   113  	errC        chan error
   114  	quitC       chan bool
   115  }
   117  /*
   118  	Join reconstructs original content based on a root key.
   119  	When joining, the caller gets returned a Lazy SectionReader, which is
   120  	seekable and implements on-demand fetching of chunks as and where it is read.
   121  	New chunks to retrieve are coming from the getter, which the caller provides.
   122  	If an error is encountered during joining, it appears as a reader error.
   123  	The SectionReader.
   124  	As a result, partial reads from a document are possible even if other parts
   125  	are corrupt or lost.
   126  	The chunks are not meant to be validated by the chunker when joining. This
   127  	is because it is left to the DPA to decide which sources are trusted.
   128  */
   129  func TreeJoin(ctx context.Context, addr Address, getter Getter, depth int) *LazyChunkReader {
   130  	jp := &JoinerParams{
   131  		ChunkerParams: ChunkerParams{
   132  			chunkSize: ch.DefaultSize,
   133  			hashSize:  int64(len(addr)),
   134  		},
   135  		addr:   addr,
   136  		getter: getter,
   137  		depth:  depth,
   138  		ctx:    ctx,
   139  	}
   141  	return NewTreeJoiner(jp).Join(ctx)
   142  }
   144  /*
   145  	When splitting, data is given as a SectionReader, and the key is a hashSize long byte slice (Key), the root hash of the entire content will fill this once processing finishes.
   146  	New chunks to store are store using the putter which the caller provides.
   147  */
   148  func TreeSplit(ctx context.Context, data io.Reader, size int64, putter Putter) (k Address, wait func(context.Context) error, err error) {
   149  	tsp := &TreeSplitterParams{
   150  		SplitterParams: SplitterParams{
   151  			ChunkerParams: ChunkerParams{
   152  				chunkSize: ch.DefaultSize,
   153  				hashSize:  putter.RefSize(),
   154  			},
   155  			reader: data,
   156  			putter: putter,
   157  		},
   158  		size: size,
   159  	}
   160  	return NewTreeSplitter(tsp).Split(ctx)
   161  }
   163  func NewTreeJoiner(params *JoinerParams) *TreeChunker {
   164  	tc := &TreeChunker{}
   165  	tc.hashSize = params.hashSize
   166  	tc.branches = params.chunkSize / params.hashSize
   167  	tc.addr = params.addr
   168  	tc.getter = params.getter
   169  	tc.depth = params.depth
   170  	tc.chunkSize = params.chunkSize
   171  	tc.workerCount = 0
   172  	tc.jobC = make(chan *hashJob, 2*ChunkProcessors)
   173  	tc.wg = &sync.WaitGroup{}
   174  	tc.errC = make(chan error)
   175  	tc.quitC = make(chan bool)
   177  	tc.ctx = params.ctx
   179  	return tc
   180  }
   182  func NewTreeSplitter(params *TreeSplitterParams) *TreeChunker {
   183  	tc := &TreeChunker{}
   184 = params.reader
   185  	tc.dataSize = params.size
   186  	tc.hashSize = params.hashSize
   187  	tc.branches = params.chunkSize / params.hashSize
   188  	tc.addr = params.addr
   189  	tc.chunkSize = params.chunkSize
   190  	tc.putter = params.putter
   191  	tc.workerCount = 0
   192  	tc.jobC = make(chan *hashJob, 2*ChunkProcessors)
   193  	tc.wg = &sync.WaitGroup{}
   194  	tc.errC = make(chan error)
   195  	tc.quitC = make(chan bool)
   197  	return tc
   198  }
   200  type hashJob struct {
   201  	key      Address
   202  	chunk    []byte
   203  	size     int64
   204  	parentWg *sync.WaitGroup
   205  }
   207  func (tc *TreeChunker) incrementWorkerCount() {
   208  	tc.workerLock.Lock()
   209  	defer tc.workerLock.Unlock()
   210  	tc.workerCount += 1
   211  }
   213  func (tc *TreeChunker) getWorkerCount() int64 {
   214  	tc.workerLock.RLock()
   215  	defer tc.workerLock.RUnlock()
   216  	return tc.workerCount
   217  }
   219  func (tc *TreeChunker) decrementWorkerCount() {
   220  	tc.workerLock.Lock()
   221  	defer tc.workerLock.Unlock()
   222  	tc.workerCount -= 1
   223  }
   225  func (tc *TreeChunker) Split(ctx context.Context) (k Address, wait func(context.Context) error, err error) {
   226  	if tc.chunkSize <= 0 {
   227  		panic("chunker must be initialised")
   228  	}
   230  	tc.runWorker(ctx)
   232  	depth := 0
   233  	treeSize := tc.chunkSize
   235  	// takes lowest depth such that chunksize*HashCount^(depth+1) > size
   236  	// power series, will find the order of magnitude of the data size in base hashCount or numbers of levels of branching in the resulting tree.
   237  	for ; treeSize < tc.dataSize; treeSize *= tc.branches {
   238  		depth++
   239  	}
   241  	key := make([]byte, tc.hashSize)
   242  	// this waitgroup member is released after the root hash is calculated
   243  	tc.wg.Add(1)
   244  	//launch actual recursive function passing the waitgroups
   245  	go tc.split(ctx, depth, treeSize/tc.branches, key, tc.dataSize, tc.wg)
   247  	// closes internal error channel if all subprocesses in the workgroup finished
   248  	go func() {
   249  		// waiting for all threads to finish
   250  		tc.wg.Wait()
   251  		close(tc.errC)
   252  	}()
   254  	defer close(tc.quitC)
   255  	defer tc.putter.Close()
   256  	select {
   257  	case err := <-tc.errC:
   258  		if err != nil {
   259  			return nil, nil, err
   260  		}
   261  	case <-ctx.Done():
   262  		return nil, nil, ctx.Err()
   263  	}
   265  	return key, tc.putter.Wait, nil
   266  }
   268  func (tc *TreeChunker) split(ctx context.Context, depth int, treeSize int64, addr Address, size int64, parentWg *sync.WaitGroup) {
   270  	//
   272  	for depth > 0 && size < treeSize {
   273  		treeSize /= tc.branches
   274  		depth--
   275  	}
   277  	if depth == 0 {
   278  		// leaf nodes -> content chunks
   279  		chunkData := make([]byte, size+8)
   280  		binary.LittleEndian.PutUint64(chunkData[0:8], uint64(size))
   281  		var readBytes int64
   282  		for readBytes < size {
   283  			n, err :=[8+readBytes:])
   284  			readBytes += int64(n)
   285  			if err != nil && !(err == io.EOF && readBytes == size) {
   286  				tc.errC <- err
   287  				return
   288  			}
   289  		}
   290  		select {
   291  		case tc.jobC <- &hashJob{addr, chunkData, size, parentWg}:
   292  		case <-tc.quitC:
   293  		}
   294  		return
   295  	}
   296  	// dept > 0
   297  	// intermediate chunk containing child nodes hashes
   298  	branchCnt := (size + treeSize - 1) / treeSize
   300  	var chunk = make([]byte, branchCnt*tc.hashSize+8)
   301  	var pos, i int64
   303  	binary.LittleEndian.PutUint64(chunk[0:8], uint64(size))
   305  	childrenWg := &sync.WaitGroup{}
   306  	var secSize int64
   307  	for i < branchCnt {
   308  		// the last item can have shorter data
   309  		if size-pos < treeSize {
   310  			secSize = size - pos
   311  		} else {
   312  			secSize = treeSize
   313  		}
   314  		// the hash of that data
   315  		subTreeAddress := chunk[8+i*tc.hashSize : 8+(i+1)*tc.hashSize]
   317  		childrenWg.Add(1)
   318  		tc.split(ctx, depth-1, treeSize/tc.branches, subTreeAddress, secSize, childrenWg)
   320  		i++
   321  		pos += treeSize
   322  	}
   323  	// wait for all the children to complete calculating their hashes and copying them onto sections of the chunk
   324  	// parentWg.Add(1)
   325  	// go func() {
   326  	childrenWg.Wait()
   328  	worker := tc.getWorkerCount()
   329  	if int64(len(tc.jobC)) > worker && worker < ChunkProcessors {
   330  		tc.runWorker(ctx)
   332  	}
   333  	select {
   334  	case tc.jobC <- &hashJob{addr, chunk, size, parentWg}:
   335  	case <-tc.quitC:
   336  	}
   337  }
   339  func (tc *TreeChunker) runWorker(ctx context.Context) {
   340  	tc.incrementWorkerCount()
   341  	go func() {
   342  		defer tc.decrementWorkerCount()
   343  		for {
   344  			select {
   346  			case job, ok := <-tc.jobC:
   347  				if !ok {
   348  					return
   349  				}
   351  				h, err := tc.putter.Put(ctx, job.chunk)
   352  				if err != nil {
   353  					tc.errC <- err
   354  					return
   355  				}
   356  				copy(job.key, h)
   357  				job.parentWg.Done()
   358  			case <-tc.quitC:
   359  				return
   360  			}
   361  		}
   362  	}()
   363  }
   365  // LazyChunkReader implements LazySectionReader
   366  type LazyChunkReader struct {
   367  	ctx       context.Context
   368  	addr      Address // root address
   369  	chunkData ChunkData
   370  	off       int64 // offset
   371  	chunkSize int64 // inherit from chunker
   372  	branches  int64 // inherit from chunker
   373  	hashSize  int64 // inherit from chunker
   374  	depth     int
   375  	getter    Getter
   376  }
   378  func (tc *TreeChunker) Join(ctx context.Context) *LazyChunkReader {
   379  	return &LazyChunkReader{
   380  		addr:      tc.addr,
   381  		chunkSize: tc.chunkSize,
   382  		branches:  tc.branches,
   383  		hashSize:  tc.hashSize,
   384  		depth:     tc.depth,
   385  		getter:    tc.getter,
   386  		ctx:       tc.ctx,
   387  	}
   388  }
   390  func (r *LazyChunkReader) Context() context.Context {
   391  	return r.ctx
   392  }
   394  // Size is meant to be called on the LazySectionReader
   395  func (r *LazyChunkReader) Size(ctx context.Context, quitC chan bool) (n int64, err error) {
   396  	metrics.GetOrRegisterCounter("lazychunkreader.size", nil).Inc(1)
   398  	var sp opentracing.Span
   399  	var cctx context.Context
   400  	cctx, sp = spancontext.StartSpan(
   401  		ctx,
   402  		"lcr.size")
   403  	defer sp.Finish()
   405  	log.Debug("lazychunkreader.size", "addr", r.addr)
   406  	if r.chunkData == nil {
   407  		startTime := time.Now()
   408  		chunkData, err := r.getter.Get(cctx, Reference(r.addr))
   409  		if err != nil {
   410  			metrics.GetOrRegisterResettingTimer("lcr.getter.get.err", nil).UpdateSince(startTime)
   411  			return 0, err
   412  		}
   413  		metrics.GetOrRegisterResettingTimer("lcr.getter.get", nil).UpdateSince(startTime)
   414  		r.chunkData = chunkData
   415  	}
   417  	s := r.chunkData.Size()
   418  	log.Debug("lazychunkreader.size", "key", r.addr, "size", s)
   420  	return int64(s), nil
   421  }
   423  // read at can be called numerous times
   424  // concurrent reads are allowed
   425  // Size() needs to be called synchronously on the LazyChunkReader first
   426  func (r *LazyChunkReader) ReadAt(b []byte, off int64) (read int, err error) {
   427  	metrics.GetOrRegisterCounter("lazychunkreader.readat", nil).Inc(1)
   429  	var sp opentracing.Span
   430  	var cctx context.Context
   431  	cctx, sp = spancontext.StartSpan(
   432  		r.ctx,
   433  		"")
   434  	defer sp.Finish()
   436  	defer func() {
   437  		sp.LogFields(
   438  			olog.Int("off", int(off)),
   439  			olog.Int("read", read))
   440  	}()
   442  	// this is correct, a swarm doc cannot be zero length, so no EOF is expected
   443  	if len(b) == 0 {
   444  		return 0, nil
   445  	}
   446  	quitC := make(chan bool)
   447  	size, err := r.Size(cctx, quitC)
   448  	if err != nil {
   449  		log.Debug("lazychunkreader.readat.size", "size", size, "err", err)
   450  		return 0, err
   451  	}
   453  	errC := make(chan error)
   455  	// }
   456  	var treeSize int64
   457  	var depth int
   458  	// calculate depth and max treeSize
   459  	treeSize = r.chunkSize
   460  	for ; treeSize < size; treeSize *= r.branches {
   461  		depth++
   462  	}
   463  	wg := sync.WaitGroup{}
   464  	length := int64(len(b))
   465  	for d := 0; d < r.depth; d++ {
   466  		off *= r.chunkSize
   467  		length *= r.chunkSize
   468  	}
   469  	wg.Add(1)
   470  	go r.join(cctx, b, off, off+length, depth, treeSize/r.branches, r.chunkData, &wg, errC, quitC)
   471  	go func() {
   472  		wg.Wait()
   473  		close(errC)
   474  	}()
   476  	err = <-errC
   477  	if err != nil {
   478  		log.Debug("lazychunkreader.readat.errc", "err", err)
   479  		close(quitC)
   480  		return 0, err
   481  	}
   482  	if off+int64(len(b)) >= size {
   483  		log.Debug("lazychunkreader.readat.return at end", "size", size, "off", off)
   484  		return int(size - off), io.EOF
   485  	}
   486  	log.Debug("lazychunkreader.readat.errc", "buff", len(b))
   487  	return len(b), nil
   488  }
   490  func (r *LazyChunkReader) join(ctx context.Context, b []byte, off int64, eoff int64, depth int, treeSize int64, chunkData ChunkData, parentWg *sync.WaitGroup, errC chan error, quitC chan bool) {
   491  	defer parentWg.Done()
   492  	// find appropriate block level
   493  	for chunkData.Size() < uint64(treeSize) && depth > r.depth {
   494  		treeSize /= r.branches
   495  		depth--
   496  	}
   498  	// leaf chunk found
   499  	if depth == r.depth {
   500  		extra := 8 + eoff - int64(len(chunkData))
   501  		if extra > 0 {
   502  			eoff -= extra
   503  		}
   504  		copy(b, chunkData[8+off:8+eoff])
   505  		return // simply give back the chunks reader for content chunks
   506  	}
   508  	// subtree
   509  	start := off / treeSize
   510  	end := (eoff + treeSize - 1) / treeSize
   512  	// last non-leaf chunk can be shorter than default chunk size, let's not read it further then its end
   513  	currentBranches := int64(len(chunkData)-8) / r.hashSize
   514  	if end > currentBranches {
   515  		end = currentBranches
   516  	}
   518  	wg := &sync.WaitGroup{}
   519  	defer wg.Wait()
   520  	for i := start; i < end; i++ {
   521  		soff := i * treeSize
   522  		roff := soff
   523  		seoff := soff + treeSize
   525  		if soff < off {
   526  			soff = off
   527  		}
   528  		if seoff > eoff {
   529  			seoff = eoff
   530  		}
   531  		if depth > 1 {
   532  			wg.Wait()
   533  		}
   534  		wg.Add(1)
   535  		go func(j int64) {
   536  			childAddress := chunkData[8+j*r.hashSize : 8+(j+1)*r.hashSize]
   537  			startTime := time.Now()
   538  			chunkData, err := r.getter.Get(ctx, Reference(childAddress))
   539  			if err != nil {
   540  				metrics.GetOrRegisterResettingTimer("lcr.getter.get.err", nil).UpdateSince(startTime)
   541  				log.Debug("lazychunkreader.join", "key", fmt.Sprintf("%x", childAddress), "err", err)
   542  				select {
   543  				case errC <- fmt.Errorf("chunk %v-%v not found; key: %s", off, off+treeSize, fmt.Sprintf("%x", childAddress)):
   544  				case <-quitC:
   545  				}
   546  				return
   547  			}
   548  			metrics.GetOrRegisterResettingTimer("lcr.getter.get", nil).UpdateSince(startTime)
   549  			if l := len(chunkData); l < 9 {
   550  				select {
   551  				case errC <- fmt.Errorf("chunk %v-%v incomplete; key: %s, data length %v", off, off+treeSize, fmt.Sprintf("%x", childAddress), l):
   552  				case <-quitC:
   553  				}
   554  				return
   555  			}
   556  			if soff < off {
   557  				soff = off
   558  			}
   559  			r.join(ctx, b[soff-off:seoff-off], soff-roff, seoff-roff, depth-1, treeSize/r.branches, chunkData, wg, errC, quitC)
   560  		}(i)
   561  	} //for
   562  }
   564  // Read keeps a cursor so cannot be called simulateously, see ReadAt
   565  func (r *LazyChunkReader) Read(b []byte) (read int, err error) {
   566  	log.Debug("", "key", r.addr)
   567  	metrics.GetOrRegisterCounter("", nil).Inc(1)
   569  	read, err = r.ReadAt(b,
   570  	if err != nil && err != io.EOF {
   571  		log.Debug("lazychunkreader.readat", "read", read, "err", err)
   572  		metrics.GetOrRegisterCounter("", nil).Inc(1)
   573  	}
   575  	metrics.GetOrRegisterCounter("", nil).Inc(int64(read))
   577 += int64(read)
   578  	return read, err
   579  }
   581  // completely analogous to standard SectionReader implementation
   582  var errWhence = errors.New("Seek: invalid whence")
   583  var errOffset = errors.New("Seek: invalid offset")
   585  func (r *LazyChunkReader) Seek(offset int64, whence int) (int64, error) {
   586  	cctx, sp := spancontext.StartSpan(
   587  		r.ctx,
   588  		"")
   589  	defer sp.Finish()
   591  	log.Debug("", "key", r.addr, "offset", offset)
   592  	switch whence {
   593  	default:
   594  		return 0, errWhence
   595  	case 0:
   596  		offset += 0
   597  	case 1:
   598  		offset +=
   599  	case 2:
   601  		if r.chunkData == nil { //seek from the end requires rootchunk for size. call Size first
   602  			_, err := r.Size(cctx, nil)
   603  			if err != nil {
   604  				return 0, fmt.Errorf("can't get size: %v", err)
   605  			}
   606  		}
   607  		offset += int64(r.chunkData.Size())
   608  	}
   610  	if offset < 0 {
   611  		return 0, errOffset
   612  	}
   613 = offset
   614  	return offset, nil
   615  }