github.com/sberex/go-sberex@v1.8.2-0.20181113200658-ed96ac38f7d7/swarm/storage/chunker.go

// This file is part of the go-sberex library. The go-sberex library is
// free software: you can redistribute it and/or modify it under the terms
// of the GNU Lesser General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// The go-sberex library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
// General Public License <http://www.gnu.org/licenses/> for more details.

package storage

import (
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"sync"
	"time"

	"github.com/Sberex/go-sberex/metrics"
)

/*
The distributed storage implemented in this package requires fixed-size chunks of content.

Chunker is the interface to a component that is responsible for disassembling and reassembling larger data.

TreeChunker implements a Chunker based on a tree structure defined as follows:

1 every node in the tree, including the root and other branching nodes, is stored as a chunk.

2 branching nodes encode data that includes the size of the data slice covered by the node's entire subtree as well as the hash keys of all its children:
data_{i} := size(subtree_{i}) || key_{j} || key_{j+1} .... || key_{j+n-1}

3 leaf nodes encode an actual subslice of the input data.

4 if the data size is not more than the maximum chunk size, the data is stored in a single chunk:
  key = hash(int64(size) + data)

5 if the data size is more than chunksize*branches^l but not more than chunksize*
  branches^(l+1), the data vector is split into slices of chunksize*
  branches^l length (except the last one):
  key = hash(int64(size) + key(slice0) + key(slice1) + ...)

The underlying hash function is configurable.
*/
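
/*
Worked example (a minimal sketch, assuming a 32-byte hash and 128 branches, so
chunkSize = 32*128 = 4096 bytes; these values are illustrative, not mandated by
this file): a 1 MB input does not fit into chunkSize*branches = 512 KiB but does
fit into chunkSize*branches^2 = 64 MiB, so by rule 5 the tree has two branching
levels. The root chunk then covers two subtrees:

	data := size(1000000) || key_0 || key_1    // two subtrees of up to 512 KiB each

and each key_i is itself the hash of a branching chunk over up to 128 leaf
chunks of 4096 bytes each.
*/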

/*
Tree chunker is a concrete implementation of data chunking.
This chunker works in a simple way: it builds a tree out of the document so that each node either represents a chunk of real data or a branching non-leaf node of the tree. In particular, each such non-leaf chunk is a concatenation of the hashes of its respective children. This scheme simultaneously guarantees data integrity as well as self-addressing. Abstract nodes are transparent since their represented size component is strictly greater than their maximum data size, since they encode a subtree.

If all is well, it is possible to implement this by simply composing readers, so that no extra allocation or buffering is necessary for the data splitting and joining. This means that in principle there can be direct IO between memory, the file system and network sockets (a bzz peer's storage request is read from the socket). In practice there may be a need for several stages of internal buffering.
The hashing itself does use extra copies and allocation, though, since it needs to.
*/

var (
	errAppendOpNotSupported = errors.New("append operation not supported")
	errOperationTimedOut    = errors.New("operation timed out")
)

// metrics variables
var (
	newChunkCounter = metrics.NewRegisteredCounter("storage.chunks.new", nil)
)

type TreeChunker struct {
	branches int64
	hashFunc SwarmHasher
	// calculated
	hashSize    int64        // self.hashFunc.New().Size()
	chunkSize   int64        // hashSize * branches
	workerCount int64        // the number of worker routines used
	workerLock  sync.RWMutex // lock for the worker count
}

func NewTreeChunker(params *ChunkerParams) (self *TreeChunker) {
	self = &TreeChunker{}
	self.hashFunc = MakeHashFunc(params.Hash)
	self.branches = params.Branches
	self.hashSize = int64(self.hashFunc().Size())
	self.chunkSize = self.hashSize * self.branches
	self.workerCount = 0

	return
}
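
// Usage sketch (illustrative; the "SHA3" hash name and the branch count are
// assumptions, not values mandated by this file):
//
//	chunker := NewTreeChunker(&ChunkerParams{Hash: "SHA3", Branches: 128})
//	// chunker.chunkSize is now hashSize*branches, e.g. 32*128 = 4096 bytes.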

// func (self *TreeChunker) KeySize() int64 {
// 	return self.hashSize
// }

// String() for pretty printing
func (self *Chunk) String() string {
	return fmt.Sprintf("Key: %v TreeSize: %v Chunksize: %v", self.Key.Log(), self.Size, len(self.SData))
}

type hashJob struct {
	key      Key
	chunk    []byte
	size     int64
	parentWg *sync.WaitGroup
}

func (self *TreeChunker) incrementWorkerCount() {
	self.workerLock.Lock()
	defer self.workerLock.Unlock()
	self.workerCount++
}

func (self *TreeChunker) getWorkerCount() int64 {
	self.workerLock.RLock()
	defer self.workerLock.RUnlock()
	return self.workerCount
}

func (self *TreeChunker) decrementWorkerCount() {
	self.workerLock.Lock()
	defer self.workerLock.Unlock()
	self.workerCount--
}

func (self *TreeChunker) Split(data io.Reader, size int64, chunkC chan *Chunk, swg, wwg *sync.WaitGroup) (Key, error) {
	if self.chunkSize <= 0 {
		panic("chunker must be initialised")
	}

	jobC := make(chan *hashJob, 2*ChunkProcessors)
	wg := &sync.WaitGroup{}
	errC := make(chan error)
	quitC := make(chan bool)

	// wwg is the workers waitgroup; it keeps track of hash workers spawned by this Split call
	if wwg != nil {
		wwg.Add(1)
	}

	self.incrementWorkerCount()
	go self.hashWorker(jobC, chunkC, errC, quitC, swg, wwg)

	depth := 0
	treeSize := self.chunkSize

	// take the lowest depth such that chunksize*branches^(depth+1) > size;
	// this power series finds the order of magnitude of the data size in base branches,
	// i.e. the number of levels of branching in the resulting tree
	for ; treeSize < size; treeSize *= self.branches {
		depth++
	}

	key := make([]byte, self.hashFunc().Size())
	// this waitgroup member is released after the root hash is calculated
	wg.Add(1)
	// launch the actual recursive function, passing the waitgroups
	go self.split(depth, treeSize/self.branches, key, data, size, jobC, chunkC, errC, quitC, wg, swg, wwg)

	// closes the internal error channel once all subprocesses in the workgroup have finished
	go func() {
		// waiting for all threads to finish
		wg.Wait()
		// if the storage waitgroup is non-nil, we wait for storage to finish too
		if swg != nil {
			swg.Wait()
		}
		close(errC)
	}()

	defer close(quitC)
	select {
	case err := <-errC:
		if err != nil {
			return nil, err
		}
	case <-time.NewTimer(splitTimeout).C:
		return nil, errOperationTimedOut
	}

	return key, nil
}
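
// Usage sketch for Split (illustrative; the consumer draining chunkC is an
// assumption, standing in for whatever chunk store is wired up; it is expected
// to release the storage waitgroup once each chunk is persisted):
//
//	chunkC := make(chan *Chunk)
//	swg := &sync.WaitGroup{}
//	go func() {
//		for chunk := range chunkC {
//			// persist chunk.SData under chunk.Key, then signal storage completion
//			chunk.wg.Done()
//		}
//	}()
//	key, err := chunker.Split(bytes.NewReader(doc), int64(len(doc)), chunkC, swg, nil)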

func (self *TreeChunker) split(depth int, treeSize int64, key Key, data io.Reader, size int64, jobC chan *hashJob, chunkC chan *Chunk, errC chan error, quitC chan bool, parentWg, swg, wwg *sync.WaitGroup) {

	// find the appropriate block level for this subtree
	for depth > 0 && size < treeSize {
		treeSize /= self.branches
		depth--
	}

	if depth == 0 {
		// leaf nodes -> content chunks
		chunkData := make([]byte, size+8)
		binary.LittleEndian.PutUint64(chunkData[0:8], uint64(size))
		var readBytes int64
		for readBytes < size {
			n, err := data.Read(chunkData[8+readBytes:])
			readBytes += int64(n)
			if err != nil && !(err == io.EOF && readBytes == size) {
				errC <- err
				return
			}
		}
		select {
		case jobC <- &hashJob{key, chunkData, size, parentWg}:
		case <-quitC:
		}
		return
	}
	// depth > 0
	// intermediate chunk containing the hashes of its child nodes
	branchCnt := (size + treeSize - 1) / treeSize

	var chunk = make([]byte, branchCnt*self.hashSize+8)
	var pos, i int64

	binary.LittleEndian.PutUint64(chunk[0:8], uint64(size))

	childrenWg := &sync.WaitGroup{}
	var secSize int64
	for i < branchCnt {
		// the last item can have shorter data
		if size-pos < treeSize {
			secSize = size - pos
		} else {
			secSize = treeSize
		}
		// the hash of that data
		subTreeKey := chunk[8+i*self.hashSize : 8+(i+1)*self.hashSize]

		childrenWg.Add(1)
		self.split(depth-1, treeSize/self.branches, subTreeKey, data, secSize, jobC, chunkC, errC, quitC, childrenWg, swg, wwg)

		i++
		pos += treeSize
	}
	// wait for all the children to complete calculating their hashes and copying them onto their sections of the chunk
	childrenWg.Wait()

	// spawn an extra hash worker if jobs are queuing up and we are still below the worker limit
	worker := self.getWorkerCount()
	if int64(len(jobC)) > worker && worker < ChunkProcessors {
		if wwg != nil {
			wwg.Add(1)
		}
		self.incrementWorkerCount()
		go self.hashWorker(jobC, chunkC, errC, quitC, swg, wwg)
	}
	select {
	case jobC <- &hashJob{key, chunk, size, parentWg}:
	case <-quitC:
	}
}

func (self *TreeChunker) hashWorker(jobC chan *hashJob, chunkC chan *Chunk, errC chan error, quitC chan bool, swg, wwg *sync.WaitGroup) {
	defer self.decrementWorkerCount()

	hasher := self.hashFunc()
	if wwg != nil {
		defer wwg.Done()
	}
	for {
		select {

		case job, ok := <-jobC:
			if !ok {
				return
			}
			// now that the chunk contains the hashes of its children, hash the chunk itself
			self.hashChunk(hasher, job, chunkC, swg)
		case <-quitC:
			return
		}
	}
}

// The TreeChunker's own hash hashes together
// - the size (of the subtree encoded in the chunk)
// - the chunk, i.e. the contents read from the input reader
func (self *TreeChunker) hashChunk(hasher SwarmHash, job *hashJob, chunkC chan *Chunk, swg *sync.WaitGroup) {
	hasher.ResetWithLength(job.chunk[:8]) // 8 bytes of length
	hasher.Write(job.chunk[8:])           // minus the 8-byte length prefix
	h := hasher.Sum(nil)

	newChunk := &Chunk{
		Key:   h,
		SData: job.chunk,
		Size:  job.size,
		wg:    swg,
	}

	// report the hash of this chunk one level up (the key corresponds to the proper subslice of the parent chunk)
	copy(job.key, h)
	// send off the new chunk to storage
	if chunkC != nil {
		if swg != nil {
			swg.Add(1)
		}
	}
	job.parentWg.Done()

	if chunkC != nil {
		// NOTE: this increases the chunk count even if the local node already has this chunk;
		// on file upload the node will increase this counter even if the same file has already been uploaded.
		// So it should be evaluated whether this counter is worth keeping,
		// and/or whether it is better to track when the chunk is actually Put to the local database
		// (which raises the question of whether to distinguish between a completely new chunk being created
		// and a chunk being put to the local DB; for chunk tracking that distinction may be worth making).
		newChunkCounter.Inc(1)
		chunkC <- newChunk
	}
}
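
// The resulting chunk layout is an 8-byte little-endian size prefix followed
// by the payload (raw data for leaf chunks, concatenated child keys for
// branching chunks). A minimal decoding sketch:
//
//	size := int64(binary.LittleEndian.Uint64(chunk.SData[0:8]))
//	payload := chunk.SData[8:]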

func (self *TreeChunker) Append(key Key, data io.Reader, chunkC chan *Chunk, swg, wwg *sync.WaitGroup) (Key, error) {
	return nil, errAppendOpNotSupported
}

// LazyChunkReader implements LazySectionReader
type LazyChunkReader struct {
	key       Key         // root key
	chunkC    chan *Chunk // chunk channel to send retrieve requests on
	chunk     *Chunk      // root chunk, fetched lazily (its prefix holds the size of the entire document)
	off       int64       // offset
	chunkSize int64       // inherited from the chunker
	branches  int64       // inherited from the chunker
	hashSize  int64       // inherited from the chunker
}

// Join implements the Joiner interface
func (self *TreeChunker) Join(key Key, chunkC chan *Chunk) LazySectionReader {
	return &LazyChunkReader{
		key:       key,
		chunkC:    chunkC,
		chunkSize: self.chunkSize,
		branches:  self.branches,
		hashSize:  self.hashSize,
	}
}
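
// Usage sketch for Join (illustrative; the goroutine serving chunkC is an
// assumption, standing in for the actual retrieval layer):
//
//	reader := chunker.Join(key, chunkC)
//	size, _ := reader.Size(nil)
//	buf := make([]byte, size)
//	n, err := reader.ReadAt(buf, 0)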

// Size is meant to be called on the LazySectionReader
func (self *LazyChunkReader) Size(quitC chan bool) (n int64, err error) {
	if self.chunk != nil {
		return self.chunk.Size, nil
	}
	chunk := retrieve(self.key, self.chunkC, quitC)
	if chunk == nil {
		select {
		case <-quitC:
			return 0, errors.New("aborted")
		default:
			return 0, fmt.Errorf("root chunk not found for %v", self.key.Hex())
		}
	}
	self.chunk = chunk
	return chunk.Size, nil
}

// ReadAt can be called numerous times
// concurrent reads are allowed
// Size() needs to be called synchronously on the LazyChunkReader first
func (self *LazyChunkReader) ReadAt(b []byte, off int64) (read int, err error) {
	// this is correct, a swarm doc cannot be zero length, so no EOF is expected
	if len(b) == 0 {
		return 0, nil
	}
	quitC := make(chan bool)
	size, err := self.Size(quitC)
	if err != nil {
		return 0, err
	}

	errC := make(chan error)

	// calculate depth and max treeSize
	var treeSize int64
	var depth int
	treeSize = self.chunkSize
	for ; treeSize < size; treeSize *= self.branches {
		depth++
	}
	wg := sync.WaitGroup{}
	wg.Add(1)
	go self.join(b, off, off+int64(len(b)), depth, treeSize/self.branches, self.chunk, &wg, errC, quitC)
	go func() {
		wg.Wait()
		close(errC)
	}()

	err = <-errC
	if err != nil {
		close(quitC)

		return 0, err
	}
	if off+int64(len(b)) >= size {
		return len(b), io.EOF
	}
	return len(b), nil
}

func (self *LazyChunkReader) join(b []byte, off int64, eoff int64, depth int, treeSize int64, chunk *Chunk, parentWg *sync.WaitGroup, errC chan error, quitC chan bool) {
	defer parentWg.Done()

	// find the appropriate block level
	for chunk.Size < treeSize && depth > 0 {
		treeSize /= self.branches
		depth--
	}

	// leaf chunk found
	if depth == 0 {
		extra := 8 + eoff - int64(len(chunk.SData))
		if extra > 0 {
			eoff -= extra
		}
		copy(b, chunk.SData[8+off:8+eoff])
		return // simply give back the chunk's contents for content chunks
	}

	// subtree
	start := off / treeSize
	end := (eoff + treeSize - 1) / treeSize

	wg := &sync.WaitGroup{}
	defer wg.Wait()

	for i := start; i < end; i++ {
		soff := i * treeSize
		roff := soff
		seoff := soff + treeSize

		if soff < off {
			soff = off
		}
		if seoff > eoff {
			seoff = eoff
		}
		if depth > 1 {
			wg.Wait()
		}
		wg.Add(1)
		go func(j int64) {
			childKey := chunk.SData[8+j*self.hashSize : 8+(j+1)*self.hashSize]
			chunk := retrieve(childKey, self.chunkC, quitC)
			if chunk == nil {
				select {
				case errC <- fmt.Errorf("chunk %v-%v not found", off, off+treeSize):
				case <-quitC:
				}
				return
			}
			if soff < off {
				soff = off
			}
			self.join(b[soff-off:seoff-off], soff-roff, seoff-roff, depth-1, treeSize/self.branches, chunk, wg, errC, quitC)
		}(i)
	}
}

// retrieve is a helper method that submits a retrieval request for a key to a queue (DPA) and
// blocks until the chunk times out or arrives;
// it aborts if quitC is readable
func retrieve(key Key, chunkC chan *Chunk, quitC chan bool) *Chunk {
	chunk := &Chunk{
		Key: key,
		C:   make(chan bool), // closing this channel signals data delivery
	}
	// submit the chunk for retrieval
	select {
	case chunkC <- chunk: // submit retrieval request, someone should be listening on the other side (or we will time out globally)
	case <-quitC:
		return nil
	}
	// waiting for the chunk retrieval
	select {
	case <-quitC:
		// this is how we control process leakage (quitC is closed once join is finished, after a timeout)
		return nil
	case <-chunk.C: // bells are ringing, the data has been delivered
	}
	if len(chunk.SData) == 0 {
		return nil
	}
	return chunk
}
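
// Responder-side sketch for the chunkC protocol (illustrative; the store and
// its Get method are assumptions, standing in for the actual retrieval layer):
//
//	go func() {
//		for req := range chunkC {
//			req.SData = store.Get(req.Key) // nil or empty on failure
//			close(req.C)                   // signal delivery (or failure)
//		}
//	}()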

// Read keeps a cursor so it cannot be called simultaneously, see ReadAt
func (self *LazyChunkReader) Read(b []byte) (read int, err error) {
	read, err = self.ReadAt(b, self.off)

	self.off += int64(read)
	return
}

// Seek is completely analogous to the standard SectionReader implementation
var errWhence = errors.New("Seek: invalid whence")
var errOffset = errors.New("Seek: invalid offset")

func (s *LazyChunkReader) Seek(offset int64, whence int) (int64, error) {
	switch whence {
	default:
		return 0, errWhence
	case 0:
		// offset is relative to the start; keep it as-is
	case 1:
		offset += s.off
	case 2:
		if s.chunk == nil { // seeking from the end requires the root chunk for the size; call Size first
			_, err := s.Size(nil)
			if err != nil {
				return 0, fmt.Errorf("can't get size: %v", err)
			}
		}
		offset += s.chunk.Size
	}

	if offset < 0 {
		return 0, errOffset
	}
	s.off = offset
	return offset, nil
}
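
// Seek usage sketch (illustrative): reading the last 10 bytes of a document.
// whence follows the io.Seeker convention (0 = start, 1 = current, 2 = end);
// the root chunk is fetched lazily when seeking from the end.
//
//	pos, err := reader.Seek(-10, 2)
//	tail := make([]byte, 10)
//	n, err := reader.Read(tail)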