github.com/daragao/go-ethereum@v1.8.14-0.20180809141559-45eaef243198/swarm/storage/chunker.go

github.com/daragao/go-ethereum@v1.8.14-0.20180809141559-45eaef243198/swarm/storage/chunker.go (about)

     1  // Copyright 2016 The go-ethereum Authors
     2  // This file is part of the go-ethereum library.
     3  //
     4  // The go-ethereum library is free software: you can redistribute it and/or modify
     5  // it under the terms of the GNU Lesser General Public License as published by
     6  // the Free Software Foundation, either version 3 of the License, or
     7  // (at your option) any later version.
     8  //
     9  // The go-ethereum library is distributed in the hope that it will be useful,
    10  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    11  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    12  // GNU Lesser General Public License for more details.
    13  //
    14  // You should have received a copy of the GNU Lesser General Public License
    15  // along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
    16  package storage
    17  
    18  import (
    19  	"context"
    20  	"encoding/binary"
    21  	"errors"
    22  	"fmt"
    23  	"io"
    24  	"sync"
    25  	"time"
    26  
    27  	"github.com/ethereum/go-ethereum/metrics"
    28  	"github.com/ethereum/go-ethereum/swarm/log"
    29  	"github.com/ethereum/go-ethereum/swarm/spancontext"
    30  	opentracing "github.com/opentracing/opentracing-go"
    31  	olog "github.com/opentracing/opentracing-go/log"
    32  )
    33  
    34  /*
    35  The distributed storage implemented in this package requires fixed-size chunks of content.
    36  
    37  Chunker is the interface to a component that is responsible for disassembling and assembling larger data.
    38  
    39  TreeChunker implements a Chunker based on a tree structure defined as follows:
    40  
    41  1 each node in the tree including the root and other branching nodes are stored as a chunk.
    42  
    43  2 branching nodes encode data contents that includes the size of the dataslice covered by its entire subtree under the node as well as the hash keys of all its children :
    44  data_{i} := size(subtree_{i}) || key_{j} || key_{j+1} .... || key_{j+n-1}
    45  
    46  3 Leaf nodes encode an actual subslice of the input data.
    47  
    48  4 if data size is not more than maximum chunksize, the data is stored in a single chunk
    49    key = hash(int64(size) + data)
    50  
    51  5 if data size is more than chunksize*branches^l, but no more than chunksize*
    52    branches^(l+1), the data vector is split into slices of chunksize*
    53    branches^l length (except the last one).
    54    key = hash(int64(size) + key(slice0) + key(slice1) + ...)
    55  
    56   The underlying hash function is configurable
    57  */
    58  
    59  /*
    60  Tree chunker is a concrete implementation of data chunking.
    61  This chunker works in a simple way: it builds a tree out of the document so that each node represents either a chunk of real data or a chunk of data representing a branching non-leaf node of the tree. In particular, each such non-leaf chunk is a concatenation of the hashes of its respective children. This scheme simultaneously guarantees data integrity as well as self addressing. Abstract nodes are transparent since their represented size component is strictly greater than their maximum data size, since they encode a subtree.
    62  
    63  If all is well it is possible to implement this by simply composing readers so that no extra allocation or buffering is necessary for the data splitting and joining. This means that in principle there can be direct IO between : memory, file system, network socket (bzz peers storage request is read from the socket). In practice there may be need for several stages of internal buffering.
    64  The hashing itself does use extra copies and allocation though, since it does need it.
    65  */
    66  
// Errors returned by TreeChunker:
//   - errAppendOppNotSuported is what Append always returns.
//   - errOperationTimedOut is what Split returns when splitTimeout elapses
//     before the root hash is available.
var (
	errAppendOppNotSuported = errors.New("Append operation not supported")
	errOperationTimedOut    = errors.New("operation timed out")
)
    71  
const (
	// DefaultChunkSize is the chunk size that TreeJoin and TreeSplit
	// configure their chunker with.
	DefaultChunkSize int64 = 4096
)
    75  
// ChunkerParams carries the sizing parameters shared by splitting and joining.
// The number of branches per intermediate chunk is derived from them as
// chunkSize / hashSize.
type ChunkerParams struct {
	chunkSize int64 // size of a full chunk's payload
	hashSize  int64 // length of one chunk reference
}
    80  
// SplitterParams extends ChunkerParams with what a splitter needs.
type SplitterParams struct {
	ChunkerParams
	reader io.Reader // source of the data to split
	putter Putter    // receives every chunk produced by the split
	addr   Address   // copied into TreeChunker.addr by NewTreeSplitter
}
    87  
// TreeSplitterParams is the parameter set accepted by NewTreeSplitter.
type TreeSplitterParams struct {
	SplitterParams
	size int64 // total number of bytes to read from the reader
}
    92  
// JoinerParams is the parameter set accepted by NewTreeJoiner.
type JoinerParams struct {
	ChunkerParams
	addr   Address // root address of the content to join
	getter Getter  // used to fetch chunks by reference
	// TODO: there is a bug, so depth can only be 0 today, see: https://github.com/ethersphere/go-ethereum/issues/344
	depth int
	ctx   context.Context // stored on the chunker and handed to the reader built by Join
}
   101  
// TreeChunker holds the state used by the split and join operations in this
// file. NewTreeSplitter and NewTreeJoiner each populate the subset of fields
// their operation needs.
type TreeChunker struct {
	ctx context.Context // passed to putter.Put by the workers and to the reader built by Join

	branches int64 // chunkSize / hashSize: child references per intermediate chunk
	hashFunc SwarmHasher
	dataSize int64     // total size of the data to split
	data     io.Reader // source of the data to split
	// calculated
	addr        Address
	depth       int
	hashSize    int64        // length of one chunk reference
	chunkSize   int64        // hashSize* branches
	workerCount int64        // the number of worker routines used
	workerLock  sync.RWMutex // lock for the worker count
	jobC        chan *hashJob // chunks waiting to be stored by a worker
	wg          *sync.WaitGroup // released once the root chunk has been stored
	putter      Putter
	getter      Getter
	errC        chan error // first failure of a split; closed when wg is released
	quitC       chan bool  // closed when Split returns, telling workers to stop
}
   123  
   124  /*
   125  	Join reconstructs original content based on a root key.
   126  	When joining, the caller gets returned a Lazy SectionReader, which is
   127  	seekable and implements on-demand fetching of chunks as and where it is read.
   128  	New chunks to retrieve are coming from the getter, which the caller provides.
   129  	If an error is encountered during joining, it appears as a reader error.
   130  	The SectionReader.
   131  	As a result, partial reads from a document are possible even if other parts
   132  	are corrupt or lost.
   133  	The chunks are not meant to be validated by the chunker when joining. This
   134  	is because it is left to the DPA to decide which sources are trusted.
   135  */
   136  func TreeJoin(ctx context.Context, addr Address, getter Getter, depth int) *LazyChunkReader {
   137  	jp := &JoinerParams{
   138  		ChunkerParams: ChunkerParams{
   139  			chunkSize: DefaultChunkSize,
   140  			hashSize:  int64(len(addr)),
   141  		},
   142  		addr:   addr,
   143  		getter: getter,
   144  		depth:  depth,
   145  		ctx:    ctx,
   146  	}
   147  
   148  	return NewTreeJoiner(jp).Join(ctx)
   149  }
   150  
   151  /*
   152  	When splitting, data is given as a SectionReader, and the key is a hashSize long byte slice (Key), the root hash of the entire content will fill this once processing finishes.
   153  	New chunks to store are store using the putter which the caller provides.
   154  */
   155  func TreeSplit(ctx context.Context, data io.Reader, size int64, putter Putter) (k Address, wait func(context.Context) error, err error) {
   156  	tsp := &TreeSplitterParams{
   157  		SplitterParams: SplitterParams{
   158  			ChunkerParams: ChunkerParams{
   159  				chunkSize: DefaultChunkSize,
   160  				hashSize:  putter.RefSize(),
   161  			},
   162  			reader: data,
   163  			putter: putter,
   164  		},
   165  		size: size,
   166  	}
   167  	return NewTreeSplitter(tsp).Split(ctx)
   168  }
   169  
   170  func NewTreeJoiner(params *JoinerParams) *TreeChunker {
   171  	tc := &TreeChunker{}
   172  	tc.hashSize = params.hashSize
   173  	tc.branches = params.chunkSize / params.hashSize
   174  	tc.addr = params.addr
   175  	tc.getter = params.getter
   176  	tc.depth = params.depth
   177  	tc.chunkSize = params.chunkSize
   178  	tc.workerCount = 0
   179  	tc.jobC = make(chan *hashJob, 2*ChunkProcessors)
   180  	tc.wg = &sync.WaitGroup{}
   181  	tc.errC = make(chan error)
   182  	tc.quitC = make(chan bool)
   183  
   184  	tc.ctx = params.ctx
   185  
   186  	return tc
   187  }
   188  
   189  func NewTreeSplitter(params *TreeSplitterParams) *TreeChunker {
   190  	tc := &TreeChunker{}
   191  	tc.data = params.reader
   192  	tc.dataSize = params.size
   193  	tc.hashSize = params.hashSize
   194  	tc.branches = params.chunkSize / params.hashSize
   195  	tc.addr = params.addr
   196  	tc.chunkSize = params.chunkSize
   197  	tc.putter = params.putter
   198  	tc.workerCount = 0
   199  	tc.jobC = make(chan *hashJob, 2*ChunkProcessors)
   200  	tc.wg = &sync.WaitGroup{}
   201  	tc.errC = make(chan error)
   202  	tc.quitC = make(chan bool)
   203  
   204  	return tc
   205  }
   206  
   207  // String() for pretty printing
   208  func (c *Chunk) String() string {
   209  	return fmt.Sprintf("Key: %v TreeSize: %v Chunksize: %v", c.Addr.Log(), c.Size, len(c.SData))
   210  }
   211  
// hashJob is one unit of work handed to a worker started by runWorker.
type hashJob struct {
	key      Address         // destination the worker copies the stored chunk's reference into
	chunk    []byte          // chunk bytes passed to putter.Put
	size     int64           // data size the chunk was built for
	parentWg *sync.WaitGroup // Done is called on it once key has been filled in
}
   218  
   219  func (tc *TreeChunker) incrementWorkerCount() {
   220  	tc.workerLock.Lock()
   221  	defer tc.workerLock.Unlock()
   222  	tc.workerCount += 1
   223  }
   224  
   225  func (tc *TreeChunker) getWorkerCount() int64 {
   226  	tc.workerLock.RLock()
   227  	defer tc.workerLock.RUnlock()
   228  	return tc.workerCount
   229  }
   230  
   231  func (tc *TreeChunker) decrementWorkerCount() {
   232  	tc.workerLock.Lock()
   233  	defer tc.workerLock.Unlock()
   234  	tc.workerCount -= 1
   235  }
   236  
   237  func (tc *TreeChunker) Split(ctx context.Context) (k Address, wait func(context.Context) error, err error) {
   238  	if tc.chunkSize <= 0 {
   239  		panic("chunker must be initialised")
   240  	}
   241  
   242  	tc.runWorker()
   243  
   244  	depth := 0
   245  	treeSize := tc.chunkSize
   246  
   247  	// takes lowest depth such that chunksize*HashCount^(depth+1) > size
   248  	// power series, will find the order of magnitude of the data size in base hashCount or numbers of levels of branching in the resulting tree.
   249  	for ; treeSize < tc.dataSize; treeSize *= tc.branches {
   250  		depth++
   251  	}
   252  
   253  	key := make([]byte, tc.hashSize)
   254  	// this waitgroup member is released after the root hash is calculated
   255  	tc.wg.Add(1)
   256  	//launch actual recursive function passing the waitgroups
   257  	go tc.split(depth, treeSize/tc.branches, key, tc.dataSize, tc.wg)
   258  
   259  	// closes internal error channel if all subprocesses in the workgroup finished
   260  	go func() {
   261  		// waiting for all threads to finish
   262  		tc.wg.Wait()
   263  		close(tc.errC)
   264  	}()
   265  
   266  	defer close(tc.quitC)
   267  	defer tc.putter.Close()
   268  	select {
   269  	case err := <-tc.errC:
   270  		if err != nil {
   271  			return nil, nil, err
   272  		}
   273  	case <-time.NewTimer(splitTimeout).C:
   274  		return nil, nil, errOperationTimedOut
   275  	}
   276  
   277  	return key, tc.putter.Wait, nil
   278  }
   279  
   280  func (tc *TreeChunker) split(depth int, treeSize int64, addr Address, size int64, parentWg *sync.WaitGroup) {
   281  
   282  	//
   283  
   284  	for depth > 0 && size < treeSize {
   285  		treeSize /= tc.branches
   286  		depth--
   287  	}
   288  
   289  	if depth == 0 {
   290  		// leaf nodes -> content chunks
   291  		chunkData := make([]byte, size+8)
   292  		binary.LittleEndian.PutUint64(chunkData[0:8], uint64(size))
   293  		var readBytes int64
   294  		for readBytes < size {
   295  			n, err := tc.data.Read(chunkData[8+readBytes:])
   296  			readBytes += int64(n)
   297  			if err != nil && !(err == io.EOF && readBytes == size) {
   298  				tc.errC <- err
   299  				return
   300  			}
   301  		}
   302  		select {
   303  		case tc.jobC <- &hashJob{addr, chunkData, size, parentWg}:
   304  		case <-tc.quitC:
   305  		}
   306  		return
   307  	}
   308  	// dept > 0
   309  	// intermediate chunk containing child nodes hashes
   310  	branchCnt := (size + treeSize - 1) / treeSize
   311  
   312  	var chunk = make([]byte, branchCnt*tc.hashSize+8)
   313  	var pos, i int64
   314  
   315  	binary.LittleEndian.PutUint64(chunk[0:8], uint64(size))
   316  
   317  	childrenWg := &sync.WaitGroup{}
   318  	var secSize int64
   319  	for i < branchCnt {
   320  		// the last item can have shorter data
   321  		if size-pos < treeSize {
   322  			secSize = size - pos
   323  		} else {
   324  			secSize = treeSize
   325  		}
   326  		// the hash of that data
   327  		subTreeKey := chunk[8+i*tc.hashSize : 8+(i+1)*tc.hashSize]
   328  
   329  		childrenWg.Add(1)
   330  		tc.split(depth-1, treeSize/tc.branches, subTreeKey, secSize, childrenWg)
   331  
   332  		i++
   333  		pos += treeSize
   334  	}
   335  	// wait for all the children to complete calculating their hashes and copying them onto sections of the chunk
   336  	// parentWg.Add(1)
   337  	// go func() {
   338  	childrenWg.Wait()
   339  
   340  	worker := tc.getWorkerCount()
   341  	if int64(len(tc.jobC)) > worker && worker < ChunkProcessors {
   342  		tc.runWorker()
   343  
   344  	}
   345  	select {
   346  	case tc.jobC <- &hashJob{addr, chunk, size, parentWg}:
   347  	case <-tc.quitC:
   348  	}
   349  }
   350  
   351  func (tc *TreeChunker) runWorker() {
   352  	tc.incrementWorkerCount()
   353  	go func() {
   354  		defer tc.decrementWorkerCount()
   355  		for {
   356  			select {
   357  
   358  			case job, ok := <-tc.jobC:
   359  				if !ok {
   360  					return
   361  				}
   362  
   363  				h, err := tc.putter.Put(tc.ctx, job.chunk)
   364  				if err != nil {
   365  					tc.errC <- err
   366  					return
   367  				}
   368  				copy(job.key, h)
   369  				job.parentWg.Done()
   370  			case <-tc.quitC:
   371  				return
   372  			}
   373  		}
   374  	}()
   375  }
   376  
// Append is not implemented by TreeChunker; it always returns
// errAppendOppNotSuported.
func (tc *TreeChunker) Append() (Address, func(), error) {
	return nil, nil, errAppendOppNotSuported
}
   380  
// LazyChunkReader implements LazySectionReader
type LazyChunkReader struct {
	Ctx       context.Context // context used by ReadAt for tracing and chunk retrieval
	key       Address // root key
	chunkData ChunkData // root chunk, fetched and cached by the first call to Size
	off       int64 // offset
	chunkSize int64 // inherit from chunker
	branches  int64 // inherit from chunker
	hashSize  int64 // inherit from chunker
	depth     int
	getter    Getter // used to fetch chunks by reference
}
   393  
   394  func (tc *TreeChunker) Join(ctx context.Context) *LazyChunkReader {
   395  	return &LazyChunkReader{
   396  		key:       tc.addr,
   397  		chunkSize: tc.chunkSize,
   398  		branches:  tc.branches,
   399  		hashSize:  tc.hashSize,
   400  		depth:     tc.depth,
   401  		getter:    tc.getter,
   402  		Ctx:       tc.ctx,
   403  	}
   404  }
   405  
// Context returns the context stored on the reader.
func (r *LazyChunkReader) Context() context.Context {
	return r.Ctx
}
   409  
   410  // Size is meant to be called on the LazySectionReader
   411  func (r *LazyChunkReader) Size(ctx context.Context, quitC chan bool) (n int64, err error) {
   412  	metrics.GetOrRegisterCounter("lazychunkreader.size", nil).Inc(1)
   413  
   414  	var sp opentracing.Span
   415  	var cctx context.Context
   416  	cctx, sp = spancontext.StartSpan(
   417  		ctx,
   418  		"lcr.size")
   419  	defer sp.Finish()
   420  
   421  	log.Debug("lazychunkreader.size", "key", r.key)
   422  	if r.chunkData == nil {
   423  		chunkData, err := r.getter.Get(cctx, Reference(r.key))
   424  		if err != nil {
   425  			return 0, err
   426  		}
   427  		if chunkData == nil {
   428  			select {
   429  			case <-quitC:
   430  				return 0, errors.New("aborted")
   431  			default:
   432  				return 0, fmt.Errorf("root chunk not found for %v", r.key.Hex())
   433  			}
   434  		}
   435  		r.chunkData = chunkData
   436  	}
   437  	return r.chunkData.Size(), nil
   438  }
   439  
   440  // read at can be called numerous times
   441  // concurrent reads are allowed
   442  // Size() needs to be called synchronously on the LazyChunkReader first
   443  func (r *LazyChunkReader) ReadAt(b []byte, off int64) (read int, err error) {
   444  	metrics.GetOrRegisterCounter("lazychunkreader.readat", nil).Inc(1)
   445  
   446  	var sp opentracing.Span
   447  	var cctx context.Context
   448  	cctx, sp = spancontext.StartSpan(
   449  		r.Ctx,
   450  		"lcr.read")
   451  	defer sp.Finish()
   452  
   453  	defer func() {
   454  		sp.LogFields(
   455  			olog.Int("off", int(off)),
   456  			olog.Int("read", read))
   457  	}()
   458  
   459  	// this is correct, a swarm doc cannot be zero length, so no EOF is expected
   460  	if len(b) == 0 {
   461  		return 0, nil
   462  	}
   463  	quitC := make(chan bool)
   464  	size, err := r.Size(cctx, quitC)
   465  	if err != nil {
   466  		log.Error("lazychunkreader.readat.size", "size", size, "err", err)
   467  		return 0, err
   468  	}
   469  
   470  	errC := make(chan error)
   471  
   472  	// }
   473  	var treeSize int64
   474  	var depth int
   475  	// calculate depth and max treeSize
   476  	treeSize = r.chunkSize
   477  	for ; treeSize < size; treeSize *= r.branches {
   478  		depth++
   479  	}
   480  	wg := sync.WaitGroup{}
   481  	length := int64(len(b))
   482  	for d := 0; d < r.depth; d++ {
   483  		off *= r.chunkSize
   484  		length *= r.chunkSize
   485  	}
   486  	wg.Add(1)
   487  	go r.join(cctx, b, off, off+length, depth, treeSize/r.branches, r.chunkData, &wg, errC, quitC)
   488  	go func() {
   489  		wg.Wait()
   490  		close(errC)
   491  	}()
   492  
   493  	err = <-errC
   494  	if err != nil {
   495  		log.Error("lazychunkreader.readat.errc", "err", err)
   496  		close(quitC)
   497  		return 0, err
   498  	}
   499  	if off+int64(len(b)) >= size {
   500  		return int(size - off), io.EOF
   501  	}
   502  	return len(b), nil
   503  }
   504  
   505  func (r *LazyChunkReader) join(ctx context.Context, b []byte, off int64, eoff int64, depth int, treeSize int64, chunkData ChunkData, parentWg *sync.WaitGroup, errC chan error, quitC chan bool) {
   506  	defer parentWg.Done()
   507  	// find appropriate block level
   508  	for chunkData.Size() < treeSize && depth > r.depth {
   509  		treeSize /= r.branches
   510  		depth--
   511  	}
   512  
   513  	// leaf chunk found
   514  	if depth == r.depth {
   515  		extra := 8 + eoff - int64(len(chunkData))
   516  		if extra > 0 {
   517  			eoff -= extra
   518  		}
   519  		copy(b, chunkData[8+off:8+eoff])
   520  		return // simply give back the chunks reader for content chunks
   521  	}
   522  
   523  	// subtree
   524  	start := off / treeSize
   525  	end := (eoff + treeSize - 1) / treeSize
   526  
   527  	// last non-leaf chunk can be shorter than default chunk size, let's not read it further then its end
   528  	currentBranches := int64(len(chunkData)-8) / r.hashSize
   529  	if end > currentBranches {
   530  		end = currentBranches
   531  	}
   532  
   533  	wg := &sync.WaitGroup{}
   534  	defer wg.Wait()
   535  	for i := start; i < end; i++ {
   536  		soff := i * treeSize
   537  		roff := soff
   538  		seoff := soff + treeSize
   539  
   540  		if soff < off {
   541  			soff = off
   542  		}
   543  		if seoff > eoff {
   544  			seoff = eoff
   545  		}
   546  		if depth > 1 {
   547  			wg.Wait()
   548  		}
   549  		wg.Add(1)
   550  		go func(j int64) {
   551  			childKey := chunkData[8+j*r.hashSize : 8+(j+1)*r.hashSize]
   552  			chunkData, err := r.getter.Get(ctx, Reference(childKey))
   553  			if err != nil {
   554  				log.Error("lazychunkreader.join", "key", fmt.Sprintf("%x", childKey), "err", err)
   555  				select {
   556  				case errC <- fmt.Errorf("chunk %v-%v not found; key: %s", off, off+treeSize, fmt.Sprintf("%x", childKey)):
   557  				case <-quitC:
   558  				}
   559  				return
   560  			}
   561  			if l := len(chunkData); l < 9 {
   562  				select {
   563  				case errC <- fmt.Errorf("chunk %v-%v incomplete; key: %s, data length %v", off, off+treeSize, fmt.Sprintf("%x", childKey), l):
   564  				case <-quitC:
   565  				}
   566  				return
   567  			}
   568  			if soff < off {
   569  				soff = off
   570  			}
   571  			r.join(ctx, b[soff-off:seoff-off], soff-roff, seoff-roff, depth-1, treeSize/r.branches, chunkData, wg, errC, quitC)
   572  		}(i)
   573  	} //for
   574  }
   575  
   576  // Read keeps a cursor so cannot be called simulateously, see ReadAt
   577  func (r *LazyChunkReader) Read(b []byte) (read int, err error) {
   578  	log.Debug("lazychunkreader.read", "key", r.key)
   579  	metrics.GetOrRegisterCounter("lazychunkreader.read", nil).Inc(1)
   580  
   581  	read, err = r.ReadAt(b, r.off)
   582  	if err != nil && err != io.EOF {
   583  		log.Error("lazychunkreader.readat", "read", read, "err", err)
   584  		metrics.GetOrRegisterCounter("lazychunkreader.read.err", nil).Inc(1)
   585  	}
   586  
   587  	metrics.GetOrRegisterCounter("lazychunkreader.read.bytes", nil).Inc(int64(read))
   588  
   589  	r.off += int64(read)
   590  	return
   591  }
   592  
// Errors returned by LazyChunkReader.Seek, completely analogous to the
// standard SectionReader implementation: errWhence for an unknown whence
// value, errOffset for a resulting offset below zero.
var errWhence = errors.New("Seek: invalid whence")
var errOffset = errors.New("Seek: invalid offset")
   596  
   597  func (r *LazyChunkReader) Seek(offset int64, whence int) (int64, error) {
   598  	log.Debug("lazychunkreader.seek", "key", r.key, "offset", offset)
   599  	switch whence {
   600  	default:
   601  		return 0, errWhence
   602  	case 0:
   603  		offset += 0
   604  	case 1:
   605  		offset += r.off
   606  	case 2:
   607  		if r.chunkData == nil { //seek from the end requires rootchunk for size. call Size first
   608  			_, err := r.Size(context.TODO(), nil)
   609  			if err != nil {
   610  				return 0, fmt.Errorf("can't get size: %v", err)
   611  			}
   612  		}
   613  		offset += r.chunkData.Size()
   614  	}
   615  
   616  	if offset < 0 {
   617  		return 0, errOffset
   618  	}
   619  	r.off = offset
   620  	return offset, nil
   621  }