github.com/insight-chain/inb-go@v1.1.3-0.20191221022159-da049980ae38/swarm/storage/chunker.go (about)

     1  // Copyright 2016 The go-ethereum Authors
     2  // This file is part of the go-ethereum library.
     3  //
     4  // The go-ethereum library is free software: you can redistribute it and/or modify
     5  // it under the terms of the GNU Lesser General Public License as published by
     6  // the Free Software Foundation, either version 3 of the License, or
     7  // (at your option) any later version.
     8  //
     9  // The go-ethereum library is distributed in the hope that it will be useful,
    10  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    11  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    12  // GNU Lesser General Public License for more details.
    13  //
    14  // You should have received a copy of the GNU Lesser General Public License
    15  // along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
    16  package storage
    17  
    18  import (
    19  	"context"
    20  	"encoding/binary"
    21  	"errors"
    22  	"fmt"
    23  	"io"
    24  	"sync"
    25  	"time"
    26  
    27  	"github.com/insight-chain/inb-go/metrics"
    28  	ch "github.com/insight-chain/inb-go/swarm/chunk"
    29  	"github.com/insight-chain/inb-go/swarm/log"
    30  	"github.com/insight-chain/inb-go/swarm/spancontext"
    31  	opentracing "github.com/opentracing/opentracing-go"
    32  	olog "github.com/opentracing/opentracing-go/log"
    33  )
    34  
    35  /*
    36  The distributed storage implemented in this package requires fixed-size chunks of content.
    37  
    38  Chunker is the interface to a component that is responsible for disassembling and assembling larger data.
    39  
    40  TreeChunker implements a Chunker based on a tree structure defined as follows:
    41  
    42  1 each node in the tree including the root and other branching nodes are stored as a chunk.
    43  
    44  2 branching nodes encode data contents that includes the size of the dataslice covered by its entire subtree under the node as well as the hash keys of all its children :
    45  data_{i} := size(subtree_{i}) || key_{j} || key_{j+1} .... || key_{j+n-1}
    46  
    47  3 Leaf nodes encode an actual subslice of the input data.
    48  
    49  4 if data size is not more than maximum chunksize, the data is stored in a single chunk
    50    key = hash(int64(size) + data)
    51  
    52  5 if data size is more than chunksize*branches^l, but no more than chunksize*
    53    branches^(l+1), the data vector is split into slices of chunksize*
    54    branches^l length (except the last one).
    55    key = hash(int64(size) + key(slice0) + key(slice1) + ...)
    56  
    57   The underlying hash function is configurable
    58  */
    59  
    60  /*
    61  Tree chunker is a concrete implementation of data chunking.
    62  This chunker works in a simple way: it builds a tree out of the document so that each node represents either a chunk of real data or a chunk of data representing a branching non-leaf node of the tree. In particular, each such non-leaf chunk is a concatenation of the hashes of its respective children. This scheme simultaneously guarantees data integrity as well as self addressing. Abstract nodes are transparent since their represented size component is strictly greater than their maximum data size, since they encode a subtree.
    63  
    64  If all is well it is possible to implement this by simply composing readers so that no extra allocation or buffering is necessary for the data splitting and joining. This means that in principle there can be direct IO between : memory, file system, network socket (bzz peers storage request is read from the socket). In practice there may be need for several stages of internal buffering.
    65  The hashing itself does use extra copies and allocation though, since it does need it.
    66  */
    67  
    68  var (
    69  	errAppendOppNotSuported = errors.New("Append operation not supported")
    70  )
    71  
    72  type ChunkerParams struct {
    73  	chunkSize int64
    74  	hashSize  int64
    75  }
    76  
// SplitterParams extends ChunkerParams with the input and output of a split
// operation.
type SplitterParams struct {
	ChunkerParams
	// reader is the source of the data to split; NewTreeSplitter stores it as
	// TreeChunker.data.
	reader io.Reader
	// putter receives the chunks produced while splitting.
	putter Putter
	// addr is copied to TreeChunker.addr by NewTreeSplitter.
	addr Address
}
    83  
// TreeSplitterParams is the argument of NewTreeSplitter: the generic splitter
// parameters plus the total size of the data to split.
type TreeSplitterParams struct {
	SplitterParams
	// size is copied to TreeChunker.dataSize by NewTreeSplitter.
	size int64
}
    88  
// JoinerParams is the argument of NewTreeJoiner.
type JoinerParams struct {
	ChunkerParams
	// addr is the root address of the content to join.
	addr Address
	// getter is used to fetch chunks by reference while reading.
	getter Getter
	// TODO: there is a bug, so depth can only be 0 today, see: https://github.com/ethersphere/go-ethereum/issues/344
	depth int
	// ctx is stored on the TreeChunker and handed to the LazyChunkReader
	// created by Join.
	ctx context.Context
}
    97  
// TreeChunker holds the state of a single split or join operation. It is
// created by NewTreeSplitter (for Split) or NewTreeJoiner (for Join).
type TreeChunker struct {
	// ctx is set by NewTreeJoiner only and is passed on to the reader
	// returned by Join.
	ctx context.Context

	// branches is the number of child references per intermediate chunk
	// (chunkSize / hashSize).
	branches int64
	// hashFunc is not assigned by the constructors in this file.
	hashFunc SwarmHasher
	// dataSize is the total number of bytes to split.
	dataSize int64
	// data is the reader the split consumes.
	data io.Reader
	// calculated
	addr        Address
	depth       int
	hashSize    int64        // size in bytes of one child reference
	chunkSize   int64        // hashSize * branches
	workerCount int64        // the number of worker routines used
	workerLock  sync.RWMutex // lock for the worker count
	// jobC carries chunks waiting to be stored by a worker.
	jobC chan *hashJob
	// wg is released once the root chunk of a split has been stored.
	wg *sync.WaitGroup
	// putter stores chunks during Split; getter fetches them during Join.
	putter Putter
	getter Getter
	// errC reports the first error of a split; Split closes it (via a helper
	// goroutine) when wg is done.
	errC chan error
	// quitC is closed when Split returns, telling workers to stop.
	quitC chan bool
}
   119  
   120  /*
   121  	Join reconstructs original content based on a root key.
   122  	When joining, the caller gets returned a Lazy SectionReader, which is
   123  	seekable and implements on-demand fetching of chunks as and where it is read.
   124  	New chunks to retrieve are coming from the getter, which the caller provides.
   125  	If an error is encountered during joining, it appears as a reader error.
   126  	The SectionReader.
   127  	As a result, partial reads from a document are possible even if other parts
   128  	are corrupt or lost.
   129  	The chunks are not meant to be validated by the chunker when joining. This
   130  	is because it is left to the DPA to decide which sources are trusted.
   131  */
   132  func TreeJoin(ctx context.Context, addr Address, getter Getter, depth int) *LazyChunkReader {
   133  	jp := &JoinerParams{
   134  		ChunkerParams: ChunkerParams{
   135  			chunkSize: ch.DefaultSize,
   136  			hashSize:  int64(len(addr)),
   137  		},
   138  		addr:   addr,
   139  		getter: getter,
   140  		depth:  depth,
   141  		ctx:    ctx,
   142  	}
   143  
   144  	return NewTreeJoiner(jp).Join(ctx)
   145  }
   146  
   147  /*
   148  	When splitting, data is given as a SectionReader, and the key is a hashSize long byte slice (Key), the root hash of the entire content will fill this once processing finishes.
   149  	New chunks to store are store using the putter which the caller provides.
   150  */
   151  func TreeSplit(ctx context.Context, data io.Reader, size int64, putter Putter) (k Address, wait func(context.Context) error, err error) {
   152  	tsp := &TreeSplitterParams{
   153  		SplitterParams: SplitterParams{
   154  			ChunkerParams: ChunkerParams{
   155  				chunkSize: ch.DefaultSize,
   156  				hashSize:  putter.RefSize(),
   157  			},
   158  			reader: data,
   159  			putter: putter,
   160  		},
   161  		size: size,
   162  	}
   163  	return NewTreeSplitter(tsp).Split(ctx)
   164  }
   165  
   166  func NewTreeJoiner(params *JoinerParams) *TreeChunker {
   167  	tc := &TreeChunker{}
   168  	tc.hashSize = params.hashSize
   169  	tc.branches = params.chunkSize / params.hashSize
   170  	tc.addr = params.addr
   171  	tc.getter = params.getter
   172  	tc.depth = params.depth
   173  	tc.chunkSize = params.chunkSize
   174  	tc.workerCount = 0
   175  	tc.jobC = make(chan *hashJob, 2*ChunkProcessors)
   176  	tc.wg = &sync.WaitGroup{}
   177  	tc.errC = make(chan error)
   178  	tc.quitC = make(chan bool)
   179  
   180  	tc.ctx = params.ctx
   181  
   182  	return tc
   183  }
   184  
   185  func NewTreeSplitter(params *TreeSplitterParams) *TreeChunker {
   186  	tc := &TreeChunker{}
   187  	tc.data = params.reader
   188  	tc.dataSize = params.size
   189  	tc.hashSize = params.hashSize
   190  	tc.branches = params.chunkSize / params.hashSize
   191  	tc.addr = params.addr
   192  	tc.chunkSize = params.chunkSize
   193  	tc.putter = params.putter
   194  	tc.workerCount = 0
   195  	tc.jobC = make(chan *hashJob, 2*ChunkProcessors)
   196  	tc.wg = &sync.WaitGroup{}
   197  	tc.errC = make(chan error)
   198  	tc.quitC = make(chan bool)
   199  
   200  	return tc
   201  }
   202  
// hashJob is one unit of work for a worker started by runWorker: a chunk to
// store and the place to write the resulting reference.
//
// The field order matters: split builds values with positional literals.
type hashJob struct {
	// key is the destination the worker copies the reference returned by
	// putter.Put into.
	key Address
	// chunk is the serialised chunk handed to putter.Put.
	chunk []byte
	// size is the data size covered by the chunk.
	size int64
	// parentWg is signalled with Done once the chunk has been stored.
	parentWg *sync.WaitGroup
}
   209  
   210  func (tc *TreeChunker) incrementWorkerCount() {
   211  	tc.workerLock.Lock()
   212  	defer tc.workerLock.Unlock()
   213  	tc.workerCount += 1
   214  }
   215  
   216  func (tc *TreeChunker) getWorkerCount() int64 {
   217  	tc.workerLock.RLock()
   218  	defer tc.workerLock.RUnlock()
   219  	return tc.workerCount
   220  }
   221  
   222  func (tc *TreeChunker) decrementWorkerCount() {
   223  	tc.workerLock.Lock()
   224  	defer tc.workerLock.Unlock()
   225  	tc.workerCount -= 1
   226  }
   227  
   228  func (tc *TreeChunker) Split(ctx context.Context) (k Address, wait func(context.Context) error, err error) {
   229  	if tc.chunkSize <= 0 {
   230  		panic("chunker must be initialised")
   231  	}
   232  
   233  	tc.runWorker(ctx)
   234  
   235  	depth := 0
   236  	treeSize := tc.chunkSize
   237  
   238  	// takes lowest depth such that chunksize*HashCount^(depth+1) > size
   239  	// power series, will find the order of magnitude of the data size in base hashCount or numbers of levels of branching in the resulting tree.
   240  	for ; treeSize < tc.dataSize; treeSize *= tc.branches {
   241  		depth++
   242  	}
   243  
   244  	key := make([]byte, tc.hashSize)
   245  	// this waitgroup member is released after the root hash is calculated
   246  	tc.wg.Add(1)
   247  	//launch actual recursive function passing the waitgroups
   248  	go tc.split(ctx, depth, treeSize/tc.branches, key, tc.dataSize, tc.wg)
   249  
   250  	// closes internal error channel if all subprocesses in the workgroup finished
   251  	go func() {
   252  		// waiting for all threads to finish
   253  		tc.wg.Wait()
   254  		close(tc.errC)
   255  	}()
   256  
   257  	defer close(tc.quitC)
   258  	defer tc.putter.Close()
   259  	select {
   260  	case err := <-tc.errC:
   261  		if err != nil {
   262  			return nil, nil, err
   263  		}
   264  	case <-ctx.Done():
   265  		return nil, nil, ctx.Err()
   266  	}
   267  
   268  	return key, tc.putter.Wait, nil
   269  }
   270  
   271  func (tc *TreeChunker) split(ctx context.Context, depth int, treeSize int64, addr Address, size int64, parentWg *sync.WaitGroup) {
   272  
   273  	//
   274  
   275  	for depth > 0 && size < treeSize {
   276  		treeSize /= tc.branches
   277  		depth--
   278  	}
   279  
   280  	if depth == 0 {
   281  		// leaf nodes -> content chunks
   282  		chunkData := make([]byte, size+8)
   283  		binary.LittleEndian.PutUint64(chunkData[0:8], uint64(size))
   284  		var readBytes int64
   285  		for readBytes < size {
   286  			n, err := tc.data.Read(chunkData[8+readBytes:])
   287  			readBytes += int64(n)
   288  			if err != nil && !(err == io.EOF && readBytes == size) {
   289  				tc.errC <- err
   290  				return
   291  			}
   292  		}
   293  		select {
   294  		case tc.jobC <- &hashJob{addr, chunkData, size, parentWg}:
   295  		case <-tc.quitC:
   296  		}
   297  		return
   298  	}
   299  	// dept > 0
   300  	// intermediate chunk containing child nodes hashes
   301  	branchCnt := (size + treeSize - 1) / treeSize
   302  
   303  	var chunk = make([]byte, branchCnt*tc.hashSize+8)
   304  	var pos, i int64
   305  
   306  	binary.LittleEndian.PutUint64(chunk[0:8], uint64(size))
   307  
   308  	childrenWg := &sync.WaitGroup{}
   309  	var secSize int64
   310  	for i < branchCnt {
   311  		// the last item can have shorter data
   312  		if size-pos < treeSize {
   313  			secSize = size - pos
   314  		} else {
   315  			secSize = treeSize
   316  		}
   317  		// the hash of that data
   318  		subTreeAddress := chunk[8+i*tc.hashSize : 8+(i+1)*tc.hashSize]
   319  
   320  		childrenWg.Add(1)
   321  		tc.split(ctx, depth-1, treeSize/tc.branches, subTreeAddress, secSize, childrenWg)
   322  
   323  		i++
   324  		pos += treeSize
   325  	}
   326  	// wait for all the children to complete calculating their hashes and copying them onto sections of the chunk
   327  	// parentWg.Add(1)
   328  	// go func() {
   329  	childrenWg.Wait()
   330  
   331  	worker := tc.getWorkerCount()
   332  	if int64(len(tc.jobC)) > worker && worker < ChunkProcessors {
   333  		tc.runWorker(ctx)
   334  
   335  	}
   336  	select {
   337  	case tc.jobC <- &hashJob{addr, chunk, size, parentWg}:
   338  	case <-tc.quitC:
   339  	}
   340  }
   341  
   342  func (tc *TreeChunker) runWorker(ctx context.Context) {
   343  	tc.incrementWorkerCount()
   344  	go func() {
   345  		defer tc.decrementWorkerCount()
   346  		for {
   347  			select {
   348  
   349  			case job, ok := <-tc.jobC:
   350  				if !ok {
   351  					return
   352  				}
   353  
   354  				h, err := tc.putter.Put(ctx, job.chunk)
   355  				if err != nil {
   356  					tc.errC <- err
   357  					return
   358  				}
   359  				copy(job.key, h)
   360  				job.parentWg.Done()
   361  			case <-tc.quitC:
   362  				return
   363  			}
   364  		}
   365  	}()
   366  }
   367  
// Append is not implemented by TreeChunker; it always returns
// errAppendOppNotSuported together with a nil address and a nil function.
func (tc *TreeChunker) Append() (Address, func(), error) {
	return nil, nil, errAppendOppNotSuported
}
   371  
// LazyChunkReader implements LazySectionReader
// It is created by TreeChunker.Join and fetches chunks through its getter
// only when they are needed.
type LazyChunkReader struct {
	ctx       context.Context // context returned by Context and used by ReadAt
	addr      Address         // root address
	chunkData ChunkData       // root chunk, cached by the first call to Size
	off       int64           // offset
	chunkSize int64           // inherit from chunker
	branches  int64           // inherit from chunker
	hashSize  int64           // inherit from chunker
	depth     int             // inherit from chunker; level treated as leaf level by join
	getter    Getter          // used to fetch chunks by reference
}
   384  
   385  func (tc *TreeChunker) Join(ctx context.Context) *LazyChunkReader {
   386  	return &LazyChunkReader{
   387  		addr:      tc.addr,
   388  		chunkSize: tc.chunkSize,
   389  		branches:  tc.branches,
   390  		hashSize:  tc.hashSize,
   391  		depth:     tc.depth,
   392  		getter:    tc.getter,
   393  		ctx:       tc.ctx,
   394  	}
   395  }
   396  
// Context returns the context the reader was created with.
func (r *LazyChunkReader) Context() context.Context {
	return r.ctx
}
   400  
   401  // Size is meant to be called on the LazySectionReader
   402  func (r *LazyChunkReader) Size(ctx context.Context, quitC chan bool) (n int64, err error) {
   403  	metrics.GetOrRegisterCounter("lazychunkreader.size", nil).Inc(1)
   404  
   405  	var sp opentracing.Span
   406  	var cctx context.Context
   407  	cctx, sp = spancontext.StartSpan(
   408  		ctx,
   409  		"lcr.size")
   410  	defer sp.Finish()
   411  
   412  	log.Debug("lazychunkreader.size", "addr", r.addr)
   413  	if r.chunkData == nil {
   414  
   415  		startTime := time.Now()
   416  		chunkData, err := r.getter.Get(cctx, Reference(r.addr))
   417  		if err != nil {
   418  			metrics.GetOrRegisterResettingTimer("lcr.getter.get.err", nil).UpdateSince(startTime)
   419  			return 0, err
   420  		}
   421  		metrics.GetOrRegisterResettingTimer("lcr.getter.get", nil).UpdateSince(startTime)
   422  		r.chunkData = chunkData
   423  		s := r.chunkData.Size()
   424  		log.Debug("lazychunkreader.size", "key", r.addr, "size", s)
   425  		if s < 0 {
   426  			return 0, errors.New("corrupt size")
   427  		}
   428  		return int64(s), nil
   429  	}
   430  	s := r.chunkData.Size()
   431  	log.Debug("lazychunkreader.size", "key", r.addr, "size", s)
   432  
   433  	return int64(s), nil
   434  }
   435  
   436  // read at can be called numerous times
   437  // concurrent reads are allowed
   438  // Size() needs to be called synchronously on the LazyChunkReader first
   439  func (r *LazyChunkReader) ReadAt(b []byte, off int64) (read int, err error) {
   440  	metrics.GetOrRegisterCounter("lazychunkreader.readat", nil).Inc(1)
   441  
   442  	var sp opentracing.Span
   443  	var cctx context.Context
   444  	cctx, sp = spancontext.StartSpan(
   445  		r.ctx,
   446  		"lcr.read")
   447  	defer sp.Finish()
   448  
   449  	defer func() {
   450  		sp.LogFields(
   451  			olog.Int("off", int(off)),
   452  			olog.Int("read", read))
   453  	}()
   454  
   455  	// this is correct, a swarm doc cannot be zero length, so no EOF is expected
   456  	if len(b) == 0 {
   457  		return 0, nil
   458  	}
   459  	quitC := make(chan bool)
   460  	size, err := r.Size(cctx, quitC)
   461  	if err != nil {
   462  		log.Debug("lazychunkreader.readat.size", "size", size, "err", err)
   463  		return 0, err
   464  	}
   465  
   466  	errC := make(chan error)
   467  
   468  	// }
   469  	var treeSize int64
   470  	var depth int
   471  	// calculate depth and max treeSize
   472  	treeSize = r.chunkSize
   473  	for ; treeSize < size; treeSize *= r.branches {
   474  		depth++
   475  	}
   476  	wg := sync.WaitGroup{}
   477  	length := int64(len(b))
   478  	for d := 0; d < r.depth; d++ {
   479  		off *= r.chunkSize
   480  		length *= r.chunkSize
   481  	}
   482  	wg.Add(1)
   483  	go r.join(b, off, off+length, depth, treeSize/r.branches, r.chunkData, &wg, errC, quitC)
   484  	go func() {
   485  		wg.Wait()
   486  		close(errC)
   487  	}()
   488  
   489  	err = <-errC
   490  	if err != nil {
   491  		log.Debug("lazychunkreader.readat.errc", "err", err)
   492  		close(quitC)
   493  		return 0, err
   494  	}
   495  	if off+int64(len(b)) >= size {
   496  		log.Debug("lazychunkreader.readat.return at end", "size", size, "off", off)
   497  		return int(size - off), io.EOF
   498  	}
   499  	log.Debug("lazychunkreader.readat.errc", "buff", len(b))
   500  	return len(b), nil
   501  }
   502  
   503  func (r *LazyChunkReader) join(b []byte, off int64, eoff int64, depth int, treeSize int64, chunkData ChunkData, parentWg *sync.WaitGroup, errC chan error, quitC chan bool) {
   504  	defer parentWg.Done()
   505  	// find appropriate block level
   506  	for chunkData.Size() < uint64(treeSize) && depth > r.depth {
   507  		treeSize /= r.branches
   508  		depth--
   509  	}
   510  
   511  	// leaf chunk found
   512  	if depth == r.depth {
   513  		extra := 8 + eoff - int64(len(chunkData))
   514  		if extra > 0 {
   515  			eoff -= extra
   516  		}
   517  		copy(b, chunkData[8+off:8+eoff])
   518  		return // simply give back the chunks reader for content chunks
   519  	}
   520  
   521  	// subtree
   522  	start := off / treeSize
   523  	end := (eoff + treeSize - 1) / treeSize
   524  
   525  	// last non-leaf chunk can be shorter than default chunk size, let's not read it further then its end
   526  	currentBranches := int64(len(chunkData)-8) / r.hashSize
   527  	if end > currentBranches {
   528  		end = currentBranches
   529  	}
   530  
   531  	wg := &sync.WaitGroup{}
   532  	defer wg.Wait()
   533  	for i := start; i < end; i++ {
   534  		soff := i * treeSize
   535  		roff := soff
   536  		seoff := soff + treeSize
   537  
   538  		if soff < off {
   539  			soff = off
   540  		}
   541  		if seoff > eoff {
   542  			seoff = eoff
   543  		}
   544  		if depth > 1 {
   545  			wg.Wait()
   546  		}
   547  		wg.Add(1)
   548  		go func(j int64) {
   549  			childAddress := chunkData[8+j*r.hashSize : 8+(j+1)*r.hashSize]
   550  			startTime := time.Now()
   551  			chunkData, err := r.getter.Get(r.ctx, Reference(childAddress))
   552  			if err != nil {
   553  				metrics.GetOrRegisterResettingTimer("lcr.getter.get.err", nil).UpdateSince(startTime)
   554  				log.Debug("lazychunkreader.join", "key", fmt.Sprintf("%x", childAddress), "err", err)
   555  				select {
   556  				case errC <- fmt.Errorf("chunk %v-%v not found; key: %s", off, off+treeSize, fmt.Sprintf("%x", childAddress)):
   557  				case <-quitC:
   558  				}
   559  				return
   560  			}
   561  			metrics.GetOrRegisterResettingTimer("lcr.getter.get", nil).UpdateSince(startTime)
   562  			if l := len(chunkData); l < 9 {
   563  				select {
   564  				case errC <- fmt.Errorf("chunk %v-%v incomplete; key: %s, data length %v", off, off+treeSize, fmt.Sprintf("%x", childAddress), l):
   565  				case <-quitC:
   566  				}
   567  				return
   568  			}
   569  			if soff < off {
   570  				soff = off
   571  			}
   572  			r.join(b[soff-off:seoff-off], soff-roff, seoff-roff, depth-1, treeSize/r.branches, chunkData, wg, errC, quitC)
   573  		}(i)
   574  	} //for
   575  }
   576  
   577  // Read keeps a cursor so cannot be called simulateously, see ReadAt
   578  func (r *LazyChunkReader) Read(b []byte) (read int, err error) {
   579  	log.Debug("lazychunkreader.read", "key", r.addr)
   580  	metrics.GetOrRegisterCounter("lazychunkreader.read", nil).Inc(1)
   581  
   582  	read, err = r.ReadAt(b, r.off)
   583  	if err != nil && err != io.EOF {
   584  		log.Debug("lazychunkreader.readat", "read", read, "err", err)
   585  		metrics.GetOrRegisterCounter("lazychunkreader.read.err", nil).Inc(1)
   586  	}
   587  
   588  	metrics.GetOrRegisterCounter("lazychunkreader.read.bytes", nil).Inc(int64(read))
   589  
   590  	r.off += int64(read)
   591  	return read, err
   592  }
   593  
   594  // completely analogous to standard SectionReader implementation
   595  var errWhence = errors.New("Seek: invalid whence")
   596  var errOffset = errors.New("Seek: invalid offset")
   597  
   598  func (r *LazyChunkReader) Seek(offset int64, whence int) (int64, error) {
   599  	log.Debug("lazychunkreader.seek", "key", r.addr, "offset", offset)
   600  	switch whence {
   601  	default:
   602  		return 0, errWhence
   603  	case 0:
   604  		offset += 0
   605  	case 1:
   606  		offset += r.off
   607  	case 2:
   608  		if r.chunkData == nil { //seek from the end requires rootchunk for size. call Size first
   609  			_, err := r.Size(context.TODO(), nil)
   610  			if err != nil {
   611  				return 0, fmt.Errorf("can't get size: %v", err)
   612  			}
   613  		}
   614  		offset += int64(r.chunkData.Size())
   615  	}
   616  
   617  	if offset < 0 {
   618  		return 0, errOffset
   619  	}
   620  	r.off = offset
   621  	return offset, nil
   622  }