github.com/divan/go-ethereum@v1.8.14-0.20180820134928-1de9ada4016d/swarm/storage/chunker.go (about)

     1  // Copyright 2016 The go-ethereum Authors
     2  // This file is part of the go-ethereum library.
     3  //
     4  // The go-ethereum library is free software: you can redistribute it and/or modify
     5  // it under the terms of the GNU Lesser General Public License as published by
     6  // the Free Software Foundation, either version 3 of the License, or
     7  // (at your option) any later version.
     8  //
     9  // The go-ethereum library is distributed in the hope that it will be useful,
    10  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    11  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    12  // GNU Lesser General Public License for more details.
    13  //
    14  // You should have received a copy of the GNU Lesser General Public License
    15  // along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
    16  package storage
    17  
    18  import (
    19  	"context"
    20  	"encoding/binary"
    21  	"errors"
    22  	"fmt"
    23  	"io"
    24  	"sync"
    25  	"time"
    26  
    27  	"github.com/ethereum/go-ethereum/metrics"
    28  	"github.com/ethereum/go-ethereum/swarm/chunk"
    29  	"github.com/ethereum/go-ethereum/swarm/log"
    30  	"github.com/ethereum/go-ethereum/swarm/spancontext"
    31  	opentracing "github.com/opentracing/opentracing-go"
    32  	olog "github.com/opentracing/opentracing-go/log"
    33  )
    34  
    35  /*
    36  The distributed storage implemented in this package requires fixed-size chunks of content.
    37  
    38  Chunker is the interface to a component that is responsible for disassembling and assembling larger data.
    39  
    40  TreeChunker implements a Chunker based on a tree structure defined as follows:
    41  
    42  1 each node in the tree including the root and other branching nodes are stored as a chunk.
    43  
    44  2 branching nodes encode data contents that includes the size of the dataslice covered by its entire subtree under the node as well as the hash keys of all its children :
    45  data_{i} := size(subtree_{i}) || key_{j} || key_{j+1} .... || key_{j+n-1}
    46  
    47  3 Leaf nodes encode an actual subslice of the input data.
    48  
    49  4 if data size is not more than maximum chunksize, the data is stored in a single chunk
    50    key = hash(int64(size) + data)
    51  
    52  5 if data size is more than chunksize*branches^l, but no more than chunksize*
    53    branches^(l+1), the data vector is split into slices of chunksize*
    54    branches^l length (except the last one).
    55    key = hash(int64(size) + key(slice0) + key(slice1) + ...)
    56  
    57   The underlying hash function is configurable
    58  */
    59  
    60  /*
    61  Tree chunker is a concrete implementation of data chunking.
    62  This chunker works in a simple way: it builds a tree out of the document so that each node represents either a chunk of real data or a chunk of data representing a branching non-leaf node of the tree. In particular, each such non-leaf chunk is a concatenation of the hashes of its respective children. This scheme simultaneously guarantees data integrity as well as self addressing. Abstract nodes are transparent since their represented size component is strictly greater than their maximum data size, since they encode a subtree.
    63  
    64  If all is well it is possible to implement this by simply composing readers so that no extra allocation or buffering is necessary for the data splitting and joining. This means that in principle there can be direct IO between : memory, file system, network socket (bzz peers storage request is read from the socket). In practice there may be need for several stages of internal buffering.
    65  The hashing itself does use extra copies and allocation though, since it does need it.
    66  */
    67  
// Sentinel errors returned by the tree chunker.
var (
	// errAppendOppNotSuported is returned by TreeChunker.Append.
	errAppendOppNotSuported = errors.New("Append operation not supported")
	// errOperationTimedOut is returned by TreeChunker.Split when no result
	// arrives before the splitTimeout timer fires.
	errOperationTimedOut    = errors.New("operation timed out")
)
    72  
// ChunkerParams holds the two size parameters shared by splitting and joining.
// The constructors derive the branching factor as chunkSize / hashSize.
type ChunkerParams struct {
	// chunkSize is the chunk size in bytes (TreeSplit and TreeJoin use
	// chunk.DefaultSize).
	chunkSize int64
	// hashSize is the length in bytes of a chunk reference.
	hashSize  int64
}
    77  
// SplitterParams bundles the inputs of a split operation.
type SplitterParams struct {
	ChunkerParams
	// reader is the source the data to split is read from.
	reader io.Reader
	// putter stores the chunks produced while splitting.
	putter Putter
	// addr is copied into TreeChunker.addr by NewTreeSplitter.
	addr   Address
}
    84  
// TreeSplitterParams extends SplitterParams with the total length of the input.
type TreeSplitterParams struct {
	SplitterParams
	// size is the number of bytes the splitter reads from the reader.
	size int64
}
    89  
// JoinerParams bundles the inputs of a join operation.
type JoinerParams struct {
	ChunkerParams
	// addr is the root address of the content to reassemble.
	addr   Address
	// getter retrieves chunk data by reference.
	getter Getter
	// depth is the tree level at which the joiner stops descending and treats
	// a chunk as a leaf.
	// TODO: there is a bug, so depth can only be 0 today, see: https://github.com/ethersphere/go-ethereum/issues/344
	depth int
	// ctx is stored on the TreeChunker and handed to the LazyChunkReader.
	ctx   context.Context
}
    98  
// TreeChunker holds the state used by both the tree splitter and the tree
// joiner. NewTreeSplitter and NewTreeJoiner each fill in the subset of fields
// that their operation needs.
type TreeChunker struct {
	// ctx is set by NewTreeJoiner from the joiner params; the split workers
	// pass it to putter.Put.
	ctx context.Context

	// branches is the branching factor, chunkSize / hashSize.
	branches int64
	// NOTE(review): hashFunc is not assigned by either constructor visible in
	// this file.
	hashFunc SwarmHasher
	// dataSize is the total number of bytes to split.
	dataSize int64
	// data is the reader the split input is consumed from.
	data     io.Reader
	// calculated
	addr        Address
	depth       int
	hashSize    int64        // length of a chunk reference in bytes
	chunkSize   int64        // hashSize * branches
	workerCount int64        // the number of worker routines used
	workerLock  sync.RWMutex // lock for the worker count
	// jobC queues chunks waiting to be stored by a worker.
	jobC        chan *hashJob
	// wg is released once the root chunk has been stored.
	wg          *sync.WaitGroup
	putter      Putter
	getter      Getter
	// errC carries errors from split and the workers to Split; it is closed
	// once wg is done.
	errC        chan error
	// quitC is closed when Split returns, telling workers and pending sends
	// to stop.
	quitC       chan bool
}
   120  
   121  /*
   122  	Join reconstructs original content based on a root key.
   123  	When joining, the caller gets returned a Lazy SectionReader, which is
   124  	seekable and implements on-demand fetching of chunks as and where it is read.
   125  	New chunks to retrieve are coming from the getter, which the caller provides.
   126  	If an error is encountered during joining, it appears as a reader error.
   127  	The SectionReader.
   128  	As a result, partial reads from a document are possible even if other parts
   129  	are corrupt or lost.
   130  	The chunks are not meant to be validated by the chunker when joining. This
   131  	is because it is left to the DPA to decide which sources are trusted.
   132  */
   133  func TreeJoin(ctx context.Context, addr Address, getter Getter, depth int) *LazyChunkReader {
   134  	jp := &JoinerParams{
   135  		ChunkerParams: ChunkerParams{
   136  			chunkSize: chunk.DefaultSize,
   137  			hashSize:  int64(len(addr)),
   138  		},
   139  		addr:   addr,
   140  		getter: getter,
   141  		depth:  depth,
   142  		ctx:    ctx,
   143  	}
   144  
   145  	return NewTreeJoiner(jp).Join(ctx)
   146  }
   147  
   148  /*
   149  	When splitting, data is given as a SectionReader, and the key is a hashSize long byte slice (Key), the root hash of the entire content will fill this once processing finishes.
   150  	New chunks to store are store using the putter which the caller provides.
   151  */
   152  func TreeSplit(ctx context.Context, data io.Reader, size int64, putter Putter) (k Address, wait func(context.Context) error, err error) {
   153  	tsp := &TreeSplitterParams{
   154  		SplitterParams: SplitterParams{
   155  			ChunkerParams: ChunkerParams{
   156  				chunkSize: chunk.DefaultSize,
   157  				hashSize:  putter.RefSize(),
   158  			},
   159  			reader: data,
   160  			putter: putter,
   161  		},
   162  		size: size,
   163  	}
   164  	return NewTreeSplitter(tsp).Split(ctx)
   165  }
   166  
   167  func NewTreeJoiner(params *JoinerParams) *TreeChunker {
   168  	tc := &TreeChunker{}
   169  	tc.hashSize = params.hashSize
   170  	tc.branches = params.chunkSize / params.hashSize
   171  	tc.addr = params.addr
   172  	tc.getter = params.getter
   173  	tc.depth = params.depth
   174  	tc.chunkSize = params.chunkSize
   175  	tc.workerCount = 0
   176  	tc.jobC = make(chan *hashJob, 2*ChunkProcessors)
   177  	tc.wg = &sync.WaitGroup{}
   178  	tc.errC = make(chan error)
   179  	tc.quitC = make(chan bool)
   180  
   181  	tc.ctx = params.ctx
   182  
   183  	return tc
   184  }
   185  
   186  func NewTreeSplitter(params *TreeSplitterParams) *TreeChunker {
   187  	tc := &TreeChunker{}
   188  	tc.data = params.reader
   189  	tc.dataSize = params.size
   190  	tc.hashSize = params.hashSize
   191  	tc.branches = params.chunkSize / params.hashSize
   192  	tc.addr = params.addr
   193  	tc.chunkSize = params.chunkSize
   194  	tc.putter = params.putter
   195  	tc.workerCount = 0
   196  	tc.jobC = make(chan *hashJob, 2*ChunkProcessors)
   197  	tc.wg = &sync.WaitGroup{}
   198  	tc.errC = make(chan error)
   199  	tc.quitC = make(chan bool)
   200  
   201  	return tc
   202  }
   203  
   204  // String() for pretty printing
   205  func (c *Chunk) String() string {
   206  	return fmt.Sprintf("Key: %v TreeSize: %v Chunksize: %v", c.Addr.Log(), c.Size, len(c.SData))
   207  }
   208  
// hashJob describes one chunk queued on TreeChunker.jobC for a worker to store.
type hashJob struct {
	// key is the destination the worker copies the chunk's reference into.
	key      Address
	// chunk is the chunk data handed to putter.Put.
	chunk    []byte
	// size is the number of content bytes the chunk covers.
	size     int64
	// parentWg is marked Done once the reference has been copied into key.
	parentWg *sync.WaitGroup
}
   215  
   216  func (tc *TreeChunker) incrementWorkerCount() {
   217  	tc.workerLock.Lock()
   218  	defer tc.workerLock.Unlock()
   219  	tc.workerCount += 1
   220  }
   221  
   222  func (tc *TreeChunker) getWorkerCount() int64 {
   223  	tc.workerLock.RLock()
   224  	defer tc.workerLock.RUnlock()
   225  	return tc.workerCount
   226  }
   227  
   228  func (tc *TreeChunker) decrementWorkerCount() {
   229  	tc.workerLock.Lock()
   230  	defer tc.workerLock.Unlock()
   231  	tc.workerCount -= 1
   232  }
   233  
   234  func (tc *TreeChunker) Split(ctx context.Context) (k Address, wait func(context.Context) error, err error) {
   235  	if tc.chunkSize <= 0 {
   236  		panic("chunker must be initialised")
   237  	}
   238  
   239  	tc.runWorker()
   240  
   241  	depth := 0
   242  	treeSize := tc.chunkSize
   243  
   244  	// takes lowest depth such that chunksize*HashCount^(depth+1) > size
   245  	// power series, will find the order of magnitude of the data size in base hashCount or numbers of levels of branching in the resulting tree.
   246  	for ; treeSize < tc.dataSize; treeSize *= tc.branches {
   247  		depth++
   248  	}
   249  
   250  	key := make([]byte, tc.hashSize)
   251  	// this waitgroup member is released after the root hash is calculated
   252  	tc.wg.Add(1)
   253  	//launch actual recursive function passing the waitgroups
   254  	go tc.split(depth, treeSize/tc.branches, key, tc.dataSize, tc.wg)
   255  
   256  	// closes internal error channel if all subprocesses in the workgroup finished
   257  	go func() {
   258  		// waiting for all threads to finish
   259  		tc.wg.Wait()
   260  		close(tc.errC)
   261  	}()
   262  
   263  	defer close(tc.quitC)
   264  	defer tc.putter.Close()
   265  	select {
   266  	case err := <-tc.errC:
   267  		if err != nil {
   268  			return nil, nil, err
   269  		}
   270  	case <-time.NewTimer(splitTimeout).C:
   271  		return nil, nil, errOperationTimedOut
   272  	}
   273  
   274  	return key, tc.putter.Wait, nil
   275  }
   276  
   277  func (tc *TreeChunker) split(depth int, treeSize int64, addr Address, size int64, parentWg *sync.WaitGroup) {
   278  
   279  	//
   280  
   281  	for depth > 0 && size < treeSize {
   282  		treeSize /= tc.branches
   283  		depth--
   284  	}
   285  
   286  	if depth == 0 {
   287  		// leaf nodes -> content chunks
   288  		chunkData := make([]byte, size+8)
   289  		binary.LittleEndian.PutUint64(chunkData[0:8], uint64(size))
   290  		var readBytes int64
   291  		for readBytes < size {
   292  			n, err := tc.data.Read(chunkData[8+readBytes:])
   293  			readBytes += int64(n)
   294  			if err != nil && !(err == io.EOF && readBytes == size) {
   295  				tc.errC <- err
   296  				return
   297  			}
   298  		}
   299  		select {
   300  		case tc.jobC <- &hashJob{addr, chunkData, size, parentWg}:
   301  		case <-tc.quitC:
   302  		}
   303  		return
   304  	}
   305  	// dept > 0
   306  	// intermediate chunk containing child nodes hashes
   307  	branchCnt := (size + treeSize - 1) / treeSize
   308  
   309  	var chunk = make([]byte, branchCnt*tc.hashSize+8)
   310  	var pos, i int64
   311  
   312  	binary.LittleEndian.PutUint64(chunk[0:8], uint64(size))
   313  
   314  	childrenWg := &sync.WaitGroup{}
   315  	var secSize int64
   316  	for i < branchCnt {
   317  		// the last item can have shorter data
   318  		if size-pos < treeSize {
   319  			secSize = size - pos
   320  		} else {
   321  			secSize = treeSize
   322  		}
   323  		// the hash of that data
   324  		subTreeKey := chunk[8+i*tc.hashSize : 8+(i+1)*tc.hashSize]
   325  
   326  		childrenWg.Add(1)
   327  		tc.split(depth-1, treeSize/tc.branches, subTreeKey, secSize, childrenWg)
   328  
   329  		i++
   330  		pos += treeSize
   331  	}
   332  	// wait for all the children to complete calculating their hashes and copying them onto sections of the chunk
   333  	// parentWg.Add(1)
   334  	// go func() {
   335  	childrenWg.Wait()
   336  
   337  	worker := tc.getWorkerCount()
   338  	if int64(len(tc.jobC)) > worker && worker < ChunkProcessors {
   339  		tc.runWorker()
   340  
   341  	}
   342  	select {
   343  	case tc.jobC <- &hashJob{addr, chunk, size, parentWg}:
   344  	case <-tc.quitC:
   345  	}
   346  }
   347  
   348  func (tc *TreeChunker) runWorker() {
   349  	tc.incrementWorkerCount()
   350  	go func() {
   351  		defer tc.decrementWorkerCount()
   352  		for {
   353  			select {
   354  
   355  			case job, ok := <-tc.jobC:
   356  				if !ok {
   357  					return
   358  				}
   359  
   360  				h, err := tc.putter.Put(tc.ctx, job.chunk)
   361  				if err != nil {
   362  					tc.errC <- err
   363  					return
   364  				}
   365  				copy(job.key, h)
   366  				job.parentWg.Done()
   367  			case <-tc.quitC:
   368  				return
   369  			}
   370  		}
   371  	}()
   372  }
   373  
   374  func (tc *TreeChunker) Append() (Address, func(), error) {
   375  	return nil, nil, errAppendOppNotSuported
   376  }
   377  
// LazyChunkReader implements LazySectionReader. It fetches chunks through its
// getter on demand while the content is being read.
type LazyChunkReader struct {
	// Ctx is the context used to start the tracing span in ReadAt and returned
	// by Context.
	Ctx       context.Context
	key       Address // root key
	// chunkData caches the root chunk's data; it is populated by Size.
	chunkData ChunkData
	off       int64 // offset of the cursor used by Read and Seek
	chunkSize int64 // inherit from chunker
	branches  int64 // inherit from chunker
	hashSize  int64 // inherit from chunker
	// depth is the tree level at which join treats a chunk as a leaf.
	depth     int
	// getter retrieves chunk data by reference.
	getter    Getter
}
   390  
   391  func (tc *TreeChunker) Join(ctx context.Context) *LazyChunkReader {
   392  	return &LazyChunkReader{
   393  		key:       tc.addr,
   394  		chunkSize: tc.chunkSize,
   395  		branches:  tc.branches,
   396  		hashSize:  tc.hashSize,
   397  		depth:     tc.depth,
   398  		getter:    tc.getter,
   399  		Ctx:       tc.ctx,
   400  	}
   401  }
   402  
   403  func (r *LazyChunkReader) Context() context.Context {
   404  	return r.Ctx
   405  }
   406  
   407  // Size is meant to be called on the LazySectionReader
   408  func (r *LazyChunkReader) Size(ctx context.Context, quitC chan bool) (n int64, err error) {
   409  	metrics.GetOrRegisterCounter("lazychunkreader.size", nil).Inc(1)
   410  
   411  	var sp opentracing.Span
   412  	var cctx context.Context
   413  	cctx, sp = spancontext.StartSpan(
   414  		ctx,
   415  		"lcr.size")
   416  	defer sp.Finish()
   417  
   418  	log.Debug("lazychunkreader.size", "key", r.key)
   419  	if r.chunkData == nil {
   420  		chunkData, err := r.getter.Get(cctx, Reference(r.key))
   421  		if err != nil {
   422  			return 0, err
   423  		}
   424  		if chunkData == nil {
   425  			select {
   426  			case <-quitC:
   427  				return 0, errors.New("aborted")
   428  			default:
   429  				return 0, fmt.Errorf("root chunk not found for %v", r.key.Hex())
   430  			}
   431  		}
   432  		r.chunkData = chunkData
   433  	}
   434  	return r.chunkData.Size(), nil
   435  }
   436  
   437  // read at can be called numerous times
   438  // concurrent reads are allowed
   439  // Size() needs to be called synchronously on the LazyChunkReader first
   440  func (r *LazyChunkReader) ReadAt(b []byte, off int64) (read int, err error) {
   441  	metrics.GetOrRegisterCounter("lazychunkreader.readat", nil).Inc(1)
   442  
   443  	var sp opentracing.Span
   444  	var cctx context.Context
   445  	cctx, sp = spancontext.StartSpan(
   446  		r.Ctx,
   447  		"lcr.read")
   448  	defer sp.Finish()
   449  
   450  	defer func() {
   451  		sp.LogFields(
   452  			olog.Int("off", int(off)),
   453  			olog.Int("read", read))
   454  	}()
   455  
   456  	// this is correct, a swarm doc cannot be zero length, so no EOF is expected
   457  	if len(b) == 0 {
   458  		return 0, nil
   459  	}
   460  	quitC := make(chan bool)
   461  	size, err := r.Size(cctx, quitC)
   462  	if err != nil {
   463  		log.Error("lazychunkreader.readat.size", "size", size, "err", err)
   464  		return 0, err
   465  	}
   466  
   467  	errC := make(chan error)
   468  
   469  	// }
   470  	var treeSize int64
   471  	var depth int
   472  	// calculate depth and max treeSize
   473  	treeSize = r.chunkSize
   474  	for ; treeSize < size; treeSize *= r.branches {
   475  		depth++
   476  	}
   477  	wg := sync.WaitGroup{}
   478  	length := int64(len(b))
   479  	for d := 0; d < r.depth; d++ {
   480  		off *= r.chunkSize
   481  		length *= r.chunkSize
   482  	}
   483  	wg.Add(1)
   484  	go r.join(cctx, b, off, off+length, depth, treeSize/r.branches, r.chunkData, &wg, errC, quitC)
   485  	go func() {
   486  		wg.Wait()
   487  		close(errC)
   488  	}()
   489  
   490  	err = <-errC
   491  	if err != nil {
   492  		log.Error("lazychunkreader.readat.errc", "err", err)
   493  		close(quitC)
   494  		return 0, err
   495  	}
   496  	if off+int64(len(b)) >= size {
   497  		return int(size - off), io.EOF
   498  	}
   499  	return len(b), nil
   500  }
   501  
   502  func (r *LazyChunkReader) join(ctx context.Context, b []byte, off int64, eoff int64, depth int, treeSize int64, chunkData ChunkData, parentWg *sync.WaitGroup, errC chan error, quitC chan bool) {
   503  	defer parentWg.Done()
   504  	// find appropriate block level
   505  	for chunkData.Size() < treeSize && depth > r.depth {
   506  		treeSize /= r.branches
   507  		depth--
   508  	}
   509  
   510  	// leaf chunk found
   511  	if depth == r.depth {
   512  		extra := 8 + eoff - int64(len(chunkData))
   513  		if extra > 0 {
   514  			eoff -= extra
   515  		}
   516  		copy(b, chunkData[8+off:8+eoff])
   517  		return // simply give back the chunks reader for content chunks
   518  	}
   519  
   520  	// subtree
   521  	start := off / treeSize
   522  	end := (eoff + treeSize - 1) / treeSize
   523  
   524  	// last non-leaf chunk can be shorter than default chunk size, let's not read it further then its end
   525  	currentBranches := int64(len(chunkData)-8) / r.hashSize
   526  	if end > currentBranches {
   527  		end = currentBranches
   528  	}
   529  
   530  	wg := &sync.WaitGroup{}
   531  	defer wg.Wait()
   532  	for i := start; i < end; i++ {
   533  		soff := i * treeSize
   534  		roff := soff
   535  		seoff := soff + treeSize
   536  
   537  		if soff < off {
   538  			soff = off
   539  		}
   540  		if seoff > eoff {
   541  			seoff = eoff
   542  		}
   543  		if depth > 1 {
   544  			wg.Wait()
   545  		}
   546  		wg.Add(1)
   547  		go func(j int64) {
   548  			childKey := chunkData[8+j*r.hashSize : 8+(j+1)*r.hashSize]
   549  			chunkData, err := r.getter.Get(ctx, Reference(childKey))
   550  			if err != nil {
   551  				log.Error("lazychunkreader.join", "key", fmt.Sprintf("%x", childKey), "err", err)
   552  				select {
   553  				case errC <- fmt.Errorf("chunk %v-%v not found; key: %s", off, off+treeSize, fmt.Sprintf("%x", childKey)):
   554  				case <-quitC:
   555  				}
   556  				return
   557  			}
   558  			if l := len(chunkData); l < 9 {
   559  				select {
   560  				case errC <- fmt.Errorf("chunk %v-%v incomplete; key: %s, data length %v", off, off+treeSize, fmt.Sprintf("%x", childKey), l):
   561  				case <-quitC:
   562  				}
   563  				return
   564  			}
   565  			if soff < off {
   566  				soff = off
   567  			}
   568  			r.join(ctx, b[soff-off:seoff-off], soff-roff, seoff-roff, depth-1, treeSize/r.branches, chunkData, wg, errC, quitC)
   569  		}(i)
   570  	} //for
   571  }
   572  
   573  // Read keeps a cursor so cannot be called simulateously, see ReadAt
   574  func (r *LazyChunkReader) Read(b []byte) (read int, err error) {
   575  	log.Debug("lazychunkreader.read", "key", r.key)
   576  	metrics.GetOrRegisterCounter("lazychunkreader.read", nil).Inc(1)
   577  
   578  	read, err = r.ReadAt(b, r.off)
   579  	if err != nil && err != io.EOF {
   580  		log.Error("lazychunkreader.readat", "read", read, "err", err)
   581  		metrics.GetOrRegisterCounter("lazychunkreader.read.err", nil).Inc(1)
   582  	}
   583  
   584  	metrics.GetOrRegisterCounter("lazychunkreader.read.bytes", nil).Inc(int64(read))
   585  
   586  	r.off += int64(read)
   587  	return
   588  }
   589  
   590  // completely analogous to standard SectionReader implementation
   591  var errWhence = errors.New("Seek: invalid whence")
   592  var errOffset = errors.New("Seek: invalid offset")
   593  
   594  func (r *LazyChunkReader) Seek(offset int64, whence int) (int64, error) {
   595  	log.Debug("lazychunkreader.seek", "key", r.key, "offset", offset)
   596  	switch whence {
   597  	default:
   598  		return 0, errWhence
   599  	case 0:
   600  		offset += 0
   601  	case 1:
   602  		offset += r.off
   603  	case 2:
   604  		if r.chunkData == nil { //seek from the end requires rootchunk for size. call Size first
   605  			_, err := r.Size(context.TODO(), nil)
   606  			if err != nil {
   607  				return 0, fmt.Errorf("can't get size: %v", err)
   608  			}
   609  		}
   610  		offset += r.chunkData.Size()
   611  	}
   612  
   613  	if offset < 0 {
   614  		return 0, errOffset
   615  	}
   616  	r.off = offset
   617  	return offset, nil
   618  }