github.com/xxRanger/go-ethereum@v1.8.23/swarm/storage/chunker.go (about)

     1  // Copyright 2016 The go-ethereum Authors
     2  // This file is part of the go-ethereum library.
     3  //
     4  // The go-ethereum library is free software: you can redistribute it and/or modify
     5  // it under the terms of the GNU Lesser General Public License as published by
     6  // the Free Software Foundation, either version 3 of the License, or
     7  // (at your option) any later version.
     8  //
     9  // The go-ethereum library is distributed in the hope that it will be useful,
    10  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    11  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    12  // GNU Lesser General Public License for more details.
    13  //
    14  // You should have received a copy of the GNU Lesser General Public License
    15  // along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
    16  package storage
    17  
    18  import (
    19  	"context"
    20  	"encoding/binary"
    21  	"errors"
    22  	"fmt"
    23  	"io"
    24  	"sync"
    25  	"time"
    26  
    27  	"github.com/ethereum/go-ethereum/metrics"
    28  	ch "github.com/ethereum/go-ethereum/swarm/chunk"
    29  	"github.com/ethereum/go-ethereum/swarm/log"
    30  	"github.com/ethereum/go-ethereum/swarm/spancontext"
    31  	opentracing "github.com/opentracing/opentracing-go"
    32  	olog "github.com/opentracing/opentracing-go/log"
    33  )
    34  
    35  /*
    36  The distributed storage implemented in this package requires fixed-size chunks of content.
    37  
    38  Chunker is the interface to a component that is responsible for disassembling and assembling larger data.
    39  
    40  TreeChunker implements a Chunker based on a tree structure defined as follows:
    41  
    42  1 each node in the tree including the root and other branching nodes are stored as a chunk.
    43  
    44  2 branching nodes encode data contents that includes the size of the dataslice covered by its entire subtree under the node as well as the hash keys of all its children :
    45  data_{i} := size(subtree_{i}) || key_{j} || key_{j+1} .... || key_{j+n-1}
    46  
    47  3 Leaf nodes encode an actual subslice of the input data.
    48  
    49  4 if data size is not more than maximum chunksize, the data is stored in a single chunk
    50    key = hash(int64(size) + data)
    51  
    52  5 if data size is more than chunksize*branches^l, but no more than chunksize*
    53    branches^(l+1), the data vector is split into slices of chunksize*
    54    branches^l length (except the last one).
    55    key = hash(int64(size) + key(slice0) + key(slice1) + ...)
    56  
    57   The underlying hash function is configurable
    58  */
    59  
    60  /*
    61  Tree chunker is a concrete implementation of data chunking.
    62  This chunker works in a simple way: it builds a tree out of the document so that each node either represents a chunk of real data or a chunk of data representing a branching non-leaf node of the tree. In particular, each such non-leaf chunk is a concatenation of the hashes of its respective children. This scheme simultaneously guarantees data integrity as well as self addressing. Abstract nodes are transparent since their represented size component is strictly greater than their maximum data size, since they encode a subtree.
    63  
    64  If all is well it is possible to implement this by simply composing readers so that no extra allocation or buffering is necessary for the data splitting and joining. This means that in principle there can be direct IO between : memory, file system, network socket (bzz peers storage request is read from the socket). In practice there may be need for several stages of internal buffering.
    65  The hashing itself does use extra copies and allocation though, since it does need it.
    66  */
    67  
    68  type ChunkerParams struct {
    69  	chunkSize int64
    70  	hashSize  int64
    71  }
    72  
// SplitterParams extends ChunkerParams with the inputs of a split:
// reader is the source of the data, putter receives the produced chunks,
// and addr is copied into the TreeChunker by NewTreeSplitter.
type SplitterParams struct {
	ChunkerParams
	reader io.Reader
	putter Putter
	addr   Address
}
    79  
// TreeSplitterParams is the configuration consumed by NewTreeSplitter:
// the generic SplitterParams plus size, the number of bytes to read from
// the reader.
type TreeSplitterParams struct {
	SplitterParams
	size int64
}
    84  
// JoinerParams is the configuration consumed by NewTreeJoiner: the chunk
// and hash sizes, the root address to join from, the getter used to fetch
// chunks, the depth at which joining stops, and the context stored on the
// resulting reader.
type JoinerParams struct {
	ChunkerParams
	addr   Address
	getter Getter
	// TODO: there is a bug, so depth can only be 0 today, see: https://github.com/ethersphere/go-ethereum/issues/344
	depth int
	ctx   context.Context
}
    93  
// TreeChunker carries the state of a split (see Split) and the parameters
// that Join hands over to the LazyChunkReader it creates. Instances are
// built by NewTreeSplitter or NewTreeJoiner; each constructor fills only the
// fields relevant to its operation.
type TreeChunker struct {
	ctx context.Context

	branches int64
	dataSize int64
	data     io.Reader
	// calculated
	addr        Address
	depth       int
	hashSize    int64        // size in bytes of a chunk key
	chunkSize   int64        // chunk size; branches is chunkSize / hashSize
	workerCount int64        // the number of worker routines used
	workerLock  sync.RWMutex // lock for the worker count
	jobC        chan *hashJob
	wg          *sync.WaitGroup
	putter      Putter
	getter      Getter
	errC        chan error
	quitC       chan bool
}
   114  
   115  /*
   116  	Join reconstructs original content based on a root key.
   117  	When joining, the caller gets returned a Lazy SectionReader, which is
   118  	seekable and implements on-demand fetching of chunks as and where it is read.
   119  	New chunks to retrieve are coming from the getter, which the caller provides.
   120  	If an error is encountered during joining, it appears as a reader error.
   121  	The SectionReader.
   122  	As a result, partial reads from a document are possible even if other parts
   123  	are corrupt or lost.
   124  	The chunks are not meant to be validated by the chunker when joining. This
   125  	is because it is left to the DPA to decide which sources are trusted.
   126  */
   127  func TreeJoin(ctx context.Context, addr Address, getter Getter, depth int) *LazyChunkReader {
   128  	jp := &JoinerParams{
   129  		ChunkerParams: ChunkerParams{
   130  			chunkSize: ch.DefaultSize,
   131  			hashSize:  int64(len(addr)),
   132  		},
   133  		addr:   addr,
   134  		getter: getter,
   135  		depth:  depth,
   136  		ctx:    ctx,
   137  	}
   138  
   139  	return NewTreeJoiner(jp).Join(ctx)
   140  }
   141  
   142  /*
   143  	When splitting, data is given as a SectionReader, and the key is a hashSize long byte slice (Key), the root hash of the entire content will fill this once processing finishes.
   144  	New chunks to store are store using the putter which the caller provides.
   145  */
   146  func TreeSplit(ctx context.Context, data io.Reader, size int64, putter Putter) (k Address, wait func(context.Context) error, err error) {
   147  	tsp := &TreeSplitterParams{
   148  		SplitterParams: SplitterParams{
   149  			ChunkerParams: ChunkerParams{
   150  				chunkSize: ch.DefaultSize,
   151  				hashSize:  putter.RefSize(),
   152  			},
   153  			reader: data,
   154  			putter: putter,
   155  		},
   156  		size: size,
   157  	}
   158  	return NewTreeSplitter(tsp).Split(ctx)
   159  }
   160  
   161  func NewTreeJoiner(params *JoinerParams) *TreeChunker {
   162  	tc := &TreeChunker{}
   163  	tc.hashSize = params.hashSize
   164  	tc.branches = params.chunkSize / params.hashSize
   165  	tc.addr = params.addr
   166  	tc.getter = params.getter
   167  	tc.depth = params.depth
   168  	tc.chunkSize = params.chunkSize
   169  	tc.workerCount = 0
   170  	tc.jobC = make(chan *hashJob, 2*ChunkProcessors)
   171  	tc.wg = &sync.WaitGroup{}
   172  	tc.errC = make(chan error)
   173  	tc.quitC = make(chan bool)
   174  
   175  	tc.ctx = params.ctx
   176  
   177  	return tc
   178  }
   179  
   180  func NewTreeSplitter(params *TreeSplitterParams) *TreeChunker {
   181  	tc := &TreeChunker{}
   182  	tc.data = params.reader
   183  	tc.dataSize = params.size
   184  	tc.hashSize = params.hashSize
   185  	tc.branches = params.chunkSize / params.hashSize
   186  	tc.addr = params.addr
   187  	tc.chunkSize = params.chunkSize
   188  	tc.putter = params.putter
   189  	tc.workerCount = 0
   190  	tc.jobC = make(chan *hashJob, 2*ChunkProcessors)
   191  	tc.wg = &sync.WaitGroup{}
   192  	tc.errC = make(chan error)
   193  	tc.quitC = make(chan bool)
   194  
   195  	return tc
   196  }
   197  
// hashJob is one unit of work for the workers started by runWorker: chunk is
// stored through the putter, the returned reference is copied into key, and
// parentWg is signalled afterwards.
//
// NOTE: split constructs hashJob with positional (unkeyed) literals, so the
// field order here must not change.
type hashJob struct {
	key      Address
	chunk    []byte
	size     int64
	parentWg *sync.WaitGroup
}
   204  
   205  func (tc *TreeChunker) incrementWorkerCount() {
   206  	tc.workerLock.Lock()
   207  	defer tc.workerLock.Unlock()
   208  	tc.workerCount += 1
   209  }
   210  
   211  func (tc *TreeChunker) getWorkerCount() int64 {
   212  	tc.workerLock.RLock()
   213  	defer tc.workerLock.RUnlock()
   214  	return tc.workerCount
   215  }
   216  
   217  func (tc *TreeChunker) decrementWorkerCount() {
   218  	tc.workerLock.Lock()
   219  	defer tc.workerLock.Unlock()
   220  	tc.workerCount -= 1
   221  }
   222  
   223  func (tc *TreeChunker) Split(ctx context.Context) (k Address, wait func(context.Context) error, err error) {
   224  	if tc.chunkSize <= 0 {
   225  		panic("chunker must be initialised")
   226  	}
   227  
   228  	tc.runWorker(ctx)
   229  
   230  	depth := 0
   231  	treeSize := tc.chunkSize
   232  
   233  	// takes lowest depth such that chunksize*HashCount^(depth+1) > size
   234  	// power series, will find the order of magnitude of the data size in base hashCount or numbers of levels of branching in the resulting tree.
   235  	for ; treeSize < tc.dataSize; treeSize *= tc.branches {
   236  		depth++
   237  	}
   238  
   239  	key := make([]byte, tc.hashSize)
   240  	// this waitgroup member is released after the root hash is calculated
   241  	tc.wg.Add(1)
   242  	//launch actual recursive function passing the waitgroups
   243  	go tc.split(ctx, depth, treeSize/tc.branches, key, tc.dataSize, tc.wg)
   244  
   245  	// closes internal error channel if all subprocesses in the workgroup finished
   246  	go func() {
   247  		// waiting for all threads to finish
   248  		tc.wg.Wait()
   249  		close(tc.errC)
   250  	}()
   251  
   252  	defer close(tc.quitC)
   253  	defer tc.putter.Close()
   254  	select {
   255  	case err := <-tc.errC:
   256  		if err != nil {
   257  			return nil, nil, err
   258  		}
   259  	case <-ctx.Done():
   260  		return nil, nil, ctx.Err()
   261  	}
   262  
   263  	return key, tc.putter.Wait, nil
   264  }
   265  
   266  func (tc *TreeChunker) split(ctx context.Context, depth int, treeSize int64, addr Address, size int64, parentWg *sync.WaitGroup) {
   267  
   268  	//
   269  
   270  	for depth > 0 && size < treeSize {
   271  		treeSize /= tc.branches
   272  		depth--
   273  	}
   274  
   275  	if depth == 0 {
   276  		// leaf nodes -> content chunks
   277  		chunkData := make([]byte, size+8)
   278  		binary.LittleEndian.PutUint64(chunkData[0:8], uint64(size))
   279  		var readBytes int64
   280  		for readBytes < size {
   281  			n, err := tc.data.Read(chunkData[8+readBytes:])
   282  			readBytes += int64(n)
   283  			if err != nil && !(err == io.EOF && readBytes == size) {
   284  				tc.errC <- err
   285  				return
   286  			}
   287  		}
   288  		select {
   289  		case tc.jobC <- &hashJob{addr, chunkData, size, parentWg}:
   290  		case <-tc.quitC:
   291  		}
   292  		return
   293  	}
   294  	// dept > 0
   295  	// intermediate chunk containing child nodes hashes
   296  	branchCnt := (size + treeSize - 1) / treeSize
   297  
   298  	var chunk = make([]byte, branchCnt*tc.hashSize+8)
   299  	var pos, i int64
   300  
   301  	binary.LittleEndian.PutUint64(chunk[0:8], uint64(size))
   302  
   303  	childrenWg := &sync.WaitGroup{}
   304  	var secSize int64
   305  	for i < branchCnt {
   306  		// the last item can have shorter data
   307  		if size-pos < treeSize {
   308  			secSize = size - pos
   309  		} else {
   310  			secSize = treeSize
   311  		}
   312  		// the hash of that data
   313  		subTreeAddress := chunk[8+i*tc.hashSize : 8+(i+1)*tc.hashSize]
   314  
   315  		childrenWg.Add(1)
   316  		tc.split(ctx, depth-1, treeSize/tc.branches, subTreeAddress, secSize, childrenWg)
   317  
   318  		i++
   319  		pos += treeSize
   320  	}
   321  	// wait for all the children to complete calculating their hashes and copying them onto sections of the chunk
   322  	// parentWg.Add(1)
   323  	// go func() {
   324  	childrenWg.Wait()
   325  
   326  	worker := tc.getWorkerCount()
   327  	if int64(len(tc.jobC)) > worker && worker < ChunkProcessors {
   328  		tc.runWorker(ctx)
   329  
   330  	}
   331  	select {
   332  	case tc.jobC <- &hashJob{addr, chunk, size, parentWg}:
   333  	case <-tc.quitC:
   334  	}
   335  }
   336  
   337  func (tc *TreeChunker) runWorker(ctx context.Context) {
   338  	tc.incrementWorkerCount()
   339  	go func() {
   340  		defer tc.decrementWorkerCount()
   341  		for {
   342  			select {
   343  
   344  			case job, ok := <-tc.jobC:
   345  				if !ok {
   346  					return
   347  				}
   348  
   349  				h, err := tc.putter.Put(ctx, job.chunk)
   350  				if err != nil {
   351  					tc.errC <- err
   352  					return
   353  				}
   354  				copy(job.key, h)
   355  				job.parentWg.Done()
   356  			case <-tc.quitC:
   357  				return
   358  			}
   359  		}
   360  	}()
   361  }
   362  
// LazyChunkReader implements LazySectionReader.
// It reads the content stored under addr, fetching chunks through getter
// when they are needed. The root chunk is cached in chunkData once Size has
// fetched it; off is the cursor used by Read and Seek.
type LazyChunkReader struct {
	ctx       context.Context
	addr      Address // root address
	chunkData ChunkData
	off       int64 // offset
	chunkSize int64 // inherit from chunker
	branches  int64 // inherit from chunker
	hashSize  int64 // inherit from chunker
	depth     int
	getter    Getter
}
   375  
   376  func (tc *TreeChunker) Join(ctx context.Context) *LazyChunkReader {
   377  	return &LazyChunkReader{
   378  		addr:      tc.addr,
   379  		chunkSize: tc.chunkSize,
   380  		branches:  tc.branches,
   381  		hashSize:  tc.hashSize,
   382  		depth:     tc.depth,
   383  		getter:    tc.getter,
   384  		ctx:       tc.ctx,
   385  	}
   386  }
   387  
// Context returns the context stored on the reader.
func (r *LazyChunkReader) Context() context.Context {
	return r.ctx
}
   391  
   392  // Size is meant to be called on the LazySectionReader
   393  func (r *LazyChunkReader) Size(ctx context.Context, quitC chan bool) (n int64, err error) {
   394  	metrics.GetOrRegisterCounter("lazychunkreader.size", nil).Inc(1)
   395  
   396  	var sp opentracing.Span
   397  	var cctx context.Context
   398  	cctx, sp = spancontext.StartSpan(
   399  		ctx,
   400  		"lcr.size")
   401  	defer sp.Finish()
   402  
   403  	log.Debug("lazychunkreader.size", "addr", r.addr)
   404  	if r.chunkData == nil {
   405  		startTime := time.Now()
   406  		chunkData, err := r.getter.Get(cctx, Reference(r.addr))
   407  		if err != nil {
   408  			metrics.GetOrRegisterResettingTimer("lcr.getter.get.err", nil).UpdateSince(startTime)
   409  			return 0, err
   410  		}
   411  		metrics.GetOrRegisterResettingTimer("lcr.getter.get", nil).UpdateSince(startTime)
   412  		r.chunkData = chunkData
   413  	}
   414  
   415  	s := r.chunkData.Size()
   416  	log.Debug("lazychunkreader.size", "key", r.addr, "size", s)
   417  
   418  	return int64(s), nil
   419  }
   420  
   421  // read at can be called numerous times
   422  // concurrent reads are allowed
   423  // Size() needs to be called synchronously on the LazyChunkReader first
   424  func (r *LazyChunkReader) ReadAt(b []byte, off int64) (read int, err error) {
   425  	metrics.GetOrRegisterCounter("lazychunkreader.readat", nil).Inc(1)
   426  
   427  	var sp opentracing.Span
   428  	var cctx context.Context
   429  	cctx, sp = spancontext.StartSpan(
   430  		r.ctx,
   431  		"lcr.read")
   432  	defer sp.Finish()
   433  
   434  	defer func() {
   435  		sp.LogFields(
   436  			olog.Int("off", int(off)),
   437  			olog.Int("read", read))
   438  	}()
   439  
   440  	// this is correct, a swarm doc cannot be zero length, so no EOF is expected
   441  	if len(b) == 0 {
   442  		return 0, nil
   443  	}
   444  	quitC := make(chan bool)
   445  	size, err := r.Size(cctx, quitC)
   446  	if err != nil {
   447  		log.Debug("lazychunkreader.readat.size", "size", size, "err", err)
   448  		return 0, err
   449  	}
   450  
   451  	errC := make(chan error)
   452  
   453  	// }
   454  	var treeSize int64
   455  	var depth int
   456  	// calculate depth and max treeSize
   457  	treeSize = r.chunkSize
   458  	for ; treeSize < size; treeSize *= r.branches {
   459  		depth++
   460  	}
   461  	wg := sync.WaitGroup{}
   462  	length := int64(len(b))
   463  	for d := 0; d < r.depth; d++ {
   464  		off *= r.chunkSize
   465  		length *= r.chunkSize
   466  	}
   467  	wg.Add(1)
   468  	go r.join(cctx, b, off, off+length, depth, treeSize/r.branches, r.chunkData, &wg, errC, quitC)
   469  	go func() {
   470  		wg.Wait()
   471  		close(errC)
   472  	}()
   473  
   474  	err = <-errC
   475  	if err != nil {
   476  		log.Debug("lazychunkreader.readat.errc", "err", err)
   477  		close(quitC)
   478  		return 0, err
   479  	}
   480  	if off+int64(len(b)) >= size {
   481  		log.Debug("lazychunkreader.readat.return at end", "size", size, "off", off)
   482  		return int(size - off), io.EOF
   483  	}
   484  	log.Debug("lazychunkreader.readat.errc", "buff", len(b))
   485  	return len(b), nil
   486  }
   487  
   488  func (r *LazyChunkReader) join(ctx context.Context, b []byte, off int64, eoff int64, depth int, treeSize int64, chunkData ChunkData, parentWg *sync.WaitGroup, errC chan error, quitC chan bool) {
   489  	defer parentWg.Done()
   490  	// find appropriate block level
   491  	for chunkData.Size() < uint64(treeSize) && depth > r.depth {
   492  		treeSize /= r.branches
   493  		depth--
   494  	}
   495  
   496  	// leaf chunk found
   497  	if depth == r.depth {
   498  		extra := 8 + eoff - int64(len(chunkData))
   499  		if extra > 0 {
   500  			eoff -= extra
   501  		}
   502  		copy(b, chunkData[8+off:8+eoff])
   503  		return // simply give back the chunks reader for content chunks
   504  	}
   505  
   506  	// subtree
   507  	start := off / treeSize
   508  	end := (eoff + treeSize - 1) / treeSize
   509  
   510  	// last non-leaf chunk can be shorter than default chunk size, let's not read it further then its end
   511  	currentBranches := int64(len(chunkData)-8) / r.hashSize
   512  	if end > currentBranches {
   513  		end = currentBranches
   514  	}
   515  
   516  	wg := &sync.WaitGroup{}
   517  	defer wg.Wait()
   518  	for i := start; i < end; i++ {
   519  		soff := i * treeSize
   520  		roff := soff
   521  		seoff := soff + treeSize
   522  
   523  		if soff < off {
   524  			soff = off
   525  		}
   526  		if seoff > eoff {
   527  			seoff = eoff
   528  		}
   529  		if depth > 1 {
   530  			wg.Wait()
   531  		}
   532  		wg.Add(1)
   533  		go func(j int64) {
   534  			childAddress := chunkData[8+j*r.hashSize : 8+(j+1)*r.hashSize]
   535  			startTime := time.Now()
   536  			chunkData, err := r.getter.Get(ctx, Reference(childAddress))
   537  			if err != nil {
   538  				metrics.GetOrRegisterResettingTimer("lcr.getter.get.err", nil).UpdateSince(startTime)
   539  				log.Debug("lazychunkreader.join", "key", fmt.Sprintf("%x", childAddress), "err", err)
   540  				select {
   541  				case errC <- fmt.Errorf("chunk %v-%v not found; key: %s", off, off+treeSize, fmt.Sprintf("%x", childAddress)):
   542  				case <-quitC:
   543  				}
   544  				return
   545  			}
   546  			metrics.GetOrRegisterResettingTimer("lcr.getter.get", nil).UpdateSince(startTime)
   547  			if l := len(chunkData); l < 9 {
   548  				select {
   549  				case errC <- fmt.Errorf("chunk %v-%v incomplete; key: %s, data length %v", off, off+treeSize, fmt.Sprintf("%x", childAddress), l):
   550  				case <-quitC:
   551  				}
   552  				return
   553  			}
   554  			if soff < off {
   555  				soff = off
   556  			}
   557  			r.join(ctx, b[soff-off:seoff-off], soff-roff, seoff-roff, depth-1, treeSize/r.branches, chunkData, wg, errC, quitC)
   558  		}(i)
   559  	} //for
   560  }
   561  
   562  // Read keeps a cursor so cannot be called simulateously, see ReadAt
   563  func (r *LazyChunkReader) Read(b []byte) (read int, err error) {
   564  	log.Debug("lazychunkreader.read", "key", r.addr)
   565  	metrics.GetOrRegisterCounter("lazychunkreader.read", nil).Inc(1)
   566  
   567  	read, err = r.ReadAt(b, r.off)
   568  	if err != nil && err != io.EOF {
   569  		log.Debug("lazychunkreader.readat", "read", read, "err", err)
   570  		metrics.GetOrRegisterCounter("lazychunkreader.read.err", nil).Inc(1)
   571  	}
   572  
   573  	metrics.GetOrRegisterCounter("lazychunkreader.read.bytes", nil).Inc(int64(read))
   574  
   575  	r.off += int64(read)
   576  	return read, err
   577  }
   578  
   579  // completely analogous to standard SectionReader implementation
   580  var errWhence = errors.New("Seek: invalid whence")
   581  var errOffset = errors.New("Seek: invalid offset")
   582  
   583  func (r *LazyChunkReader) Seek(offset int64, whence int) (int64, error) {
   584  	cctx, sp := spancontext.StartSpan(
   585  		r.ctx,
   586  		"lcr.seek")
   587  	defer sp.Finish()
   588  
   589  	log.Debug("lazychunkreader.seek", "key", r.addr, "offset", offset)
   590  	switch whence {
   591  	default:
   592  		return 0, errWhence
   593  	case 0:
   594  		offset += 0
   595  	case 1:
   596  		offset += r.off
   597  	case 2:
   598  
   599  		if r.chunkData == nil { //seek from the end requires rootchunk for size. call Size first
   600  			_, err := r.Size(cctx, nil)
   601  			if err != nil {
   602  				return 0, fmt.Errorf("can't get size: %v", err)
   603  			}
   604  		}
   605  		offset += int64(r.chunkData.Size())
   606  	}
   607  
   608  	if offset < 0 {
   609  		return 0, errOffset
   610  	}
   611  	r.off = offset
   612  	return offset, nil
   613  }