github.com/sberex/go-sberex@v1.8.2-0.20181113200658-ed96ac38f7d7/bmt/bmt.go

// This file is part of the go-sberex library. The go-sberex library is
// free software: you can redistribute it and/or modify it under the terms
// of the GNU Lesser General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// The go-sberex library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
// General Public License <http://www.gnu.org/licenses/> for more details.

// Package bmt provides a binary merkle tree implementation
package bmt

import (
	"fmt"
	"hash"
	"io"
	"strings"
	"sync"
	"sync/atomic"
)

/*
Binary Merkle Tree Hash is a hash function over arbitrary data chunks of limited size.
It is defined as the root hash of the binary merkle tree built over fixed size segments
of the underlying chunk using any base hash function (e.g., keccak256 SHA3).

It is used as the chunk hash function in swarm, which in turn is the basis for the
128-branching swarm hash http://swarm-guide.readthedocs.io/en/latest/architecture.html#swarm-hash

The BMT is optimal for providing compact inclusion proofs, i.e. proving that a
segment is a substring of a chunk starting at a particular offset.
The size of the underlying segments is fixed at 32 bytes (called the resolution
of the BMT hash): this is the EVM word size, optimal for on-chain BMT verification
as well as the hash size used for inclusion proofs in the merkle tree of the swarm hash.

Two implementations are provided:

* RefHasher is optimized for code simplicity and meant as a reference implementation
* Hasher is optimized for speed, taking advantage of concurrency with a minimalistic
  control structure to coordinate the concurrent routines.
  It implements the ChunkHash interface as well as the standard Go hash.Hash interface.

*/
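
// A minimal usage sketch (a hedged example: sha3.NewKeccak256 from the
// go-sberex crypto/sha3 package stands in for any base hash constructor;
// any func() hash.Hash works):
//
//	hasher := func() hash.Hash { return sha3.NewKeccak256() }
//	pool := NewTreePool(hasher, DefaultSegmentCount, DefaultPoolSize)
//	h := New(pool)
//	h.Reset()          // required before writing
//	h.Write(data)      // up to SegmentSize*SegmentCount bytes
//	root := h.Sum(nil) // BMT root hash of data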

const (
	// DefaultSegmentCount is the maximum number of segments of the underlying chunk
	DefaultSegmentCount = 128 // Should be equal to storage.DefaultBranches
	// DefaultPoolSize is the maximum number of bmt trees used by the hashers, i.e.
	// the maximum number of concurrent BMT hashing operations performed by the same hasher
	DefaultPoolSize = 8
)

// BaseHasher is a hash.Hash constructor function used for the base hash of the BMT.
type BaseHasher func() hash.Hash

// Hasher is a reusable hasher for fixed maximum size chunks representing a BMT.
// It implements the hash.Hash interface and
// reuses a pool of Trees for amortised memory allocation and resource control.
// It supports order-agnostic concurrent segment writes
// as well as sequential read and write,
// but cannot be called concurrently on more than one chunk.
// It can be further appended to after Sum.
// Reset gives back the Tree to the pool and is guaranteed to leave
// the tree and itself in a state reusable for hashing a new chunk.
type Hasher struct {
	pool        *TreePool   // BMT resource pool
	bmt         *Tree       // prebuilt BMT resource for flow control and proofs
	blocksize   int         // segment size (size of hash) also for hash.Hash
	count       int         // segment count
	size        int         // for hash.Hash, same as hash size
	cur         int         // cursor position for rightmost currently open chunk
	segment     []byte      // the rightmost open segment (not complete)
	depth       int         // index of last level
	result      chan []byte // result channel
	hash        []byte      // to record the result
	max         int32       // max segments for SegmentWriter interface
	blockLength []byte      // the block length that needs to be added in Sum
}

// New creates a reusable Hasher
// implementing the hash.Hash interface.
// It pulls a new Tree from a resource pool for hashing each chunk.
func New(p *TreePool) *Hasher {
	return &Hasher{
		pool:      p,
		depth:     depth(p.SegmentCount),
		size:      p.SegmentSize,
		blocksize: p.SegmentSize,
		count:     p.SegmentCount,
		result:    make(chan []byte),
	}
}

// Node is a reusable segment hasher representing a node in a BMT.
// It allows for continued writes after a Sum
// and is left in a completely reusable state after Reset.
type Node struct {
	level, index int   // position of node for information/logging only
	initial      bool  // whether the node is the first node of its level (index 0)
	root         bool  // whether the node is root to a smaller BMT
	isLeft       bool  // whether it is left side of the parent double segment
	unbalanced   bool  // indicates if a node has only the left segment
	parent       *Node // BMT connections
	state        int32 // atomic increment implements a concurrent boolean toggle
	left, right  []byte
}

// NewNode is a constructor for segment hasher nodes in the BMT
func NewNode(level, index int, parent *Node) *Node {
	return &Node{
		parent:  parent,
		level:   level,
		index:   index,
		initial: index == 0,
		isLeft:  index%2 == 0,
	}
}

// TreePool provides a pool of Trees used as resources by Hasher.
// A Tree popped from the pool is guaranteed to have a clean state
// for hashing a new chunk.
// Hasher Reset releases the Tree back to the pool.
type TreePool struct {
	lock         sync.Mutex
	c            chan *Tree
	hasher       BaseHasher
	SegmentSize  int
	SegmentCount int
	Capacity     int
	count        int
}

// NewTreePool creates a Tree pool with hasher, segment size, segment count and capacity.
// On Reserve it reuses free Trees or creates a new one if capacity is not reached.
func NewTreePool(hasher BaseHasher, segmentCount, capacity int) *TreePool {
	return &TreePool{
		c:            make(chan *Tree, capacity),
		hasher:       hasher,
		SegmentSize:  hasher().Size(),
		SegmentCount: segmentCount,
		Capacity:     capacity,
	}
}
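
// For example, NewTreePool(hasher, 128, 8) yields a pool whose Trees each
// cover 128 segments of hasher().Size() bytes (32 for keccak256), and which
// holds at most 8 Trees, bounding concurrent hashing operations per pool.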

// Drain drains the pool until it has no more than n resources
func (self *TreePool) Drain(n int) {
	self.lock.Lock()
	defer self.lock.Unlock()
	for len(self.c) > n {
		<-self.c
		self.count--
	}
}

// Reserve blocks until it returns an available Tree;
// it reuses free Trees or creates a new one if capacity is not reached
func (self *TreePool) Reserve() *Tree {
	self.lock.Lock()
	defer self.lock.Unlock()
	var t *Tree
	if self.count == self.Capacity {
		return <-self.c
	}
	select {
	case t = <-self.c:
	default:
		t = NewTree(self.hasher, self.SegmentSize, self.SegmentCount)
		self.count++
	}
	return t
}

// Release gives back a Tree to the pool.
// The Tree is guaranteed to be in a reusable state
// and does not need locking.
func (self *TreePool) Release(t *Tree) {
	self.c <- t // can never fail but...
}

// Tree is a reusable control structure representing a BMT
// organised in a binary tree.
// Hasher uses a TreePool to pick one for each chunk hash;
// the Tree is 'locked' while not in the pool.
type Tree struct {
	leaves []*Node
}

// Draw draws the BMT (badly)
func (self *Tree) Draw(hash []byte, d int) string {
	var left, right []string
	var anc []*Node
	for i, n := range self.leaves {
		left = append(left, fmt.Sprintf("%v", hashstr(n.left)))
		if i%2 == 0 {
			anc = append(anc, n.parent)
		}
		right = append(right, fmt.Sprintf("%v", hashstr(n.right)))
	}
	anc = self.leaves
	var hashes [][]string
	for l := 0; len(anc) > 0; l++ {
		var nodes []*Node
		hash := []string{""}
		for i, n := range anc {
			hash = append(hash, fmt.Sprintf("%v|%v", hashstr(n.left), hashstr(n.right)))
			if i%2 == 0 && n.parent != nil {
				nodes = append(nodes, n.parent)
			}
		}
		hash = append(hash, "")
		hashes = append(hashes, hash)
		anc = nodes
	}
	hashes = append(hashes, []string{"", fmt.Sprintf("%v", hashstr(hash)), ""})
	total := 60
	del := "                             "
	var rows []string
	for i := len(hashes) - 1; i >= 0; i-- {
		var textlen int
		hash := hashes[i]
		for _, s := range hash {
			textlen += len(s)
		}
		if total < textlen {
			total = textlen + len(hash)
		}
		delsize := (total - textlen) / (len(hash) - 1)
		if delsize > len(del) {
			delsize = len(del)
		}
		row := fmt.Sprintf("%v: %v", len(hashes)-i-1, strings.Join(hash, del[:delsize]))
		rows = append(rows, row)
	}
	rows = append(rows, strings.Join(left, "  "))
	rows = append(rows, strings.Join(right, "  "))
	return strings.Join(rows, "\n") + "\n"
}

// NewTree initialises the Tree by building up the nodes of a BMT:
// segment size is stipulated to be the size of the hash;
// segmentCount needs to be a positive integer and does not need to be
// a power of two and can even be an odd number;
// segmentSize * segmentCount determines the maximum chunk size
// hashed using the tree
func NewTree(hasher BaseHasher, segmentSize, segmentCount int) *Tree {
	n := NewNode(0, 0, nil)
	n.root = true
	prevlevel := []*Node{n}
	// iterate over levels and create 2^level nodes on each
	level := 1
	count := 2
	for d := 1; d <= depth(segmentCount); d++ {
		nodes := make([]*Node, count)
		for i := 0; i < len(nodes); i++ {
			parent := prevlevel[i/2]
			t := NewNode(level, i, parent)
			nodes[i] = t
		}
		prevlevel = nodes
		level++
		count *= 2
	}
	// the nodes of the last level built become the leaves (the datanode level)
	return &Tree{
		leaves: prevlevel,
	}
}
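
// For example, NewTree(hasher, 32, 128) builds depth(128) == 6 levels below
// the root, so the tree has 2^6 = 64 leaf nodes, each holding a pair of
// 32-byte segments: enough for a 4096-byte chunk.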

// methods needed by hash.Hash

// Size returns the digest size
func (self *Hasher) Size() int {
	return self.size
}

// BlockSize returns the block size
func (self *Hasher) BlockSize() int {
	return self.blocksize
}

// Sum returns the hash of the buffer.
// The hash.Hash interface Sum method appends the byte slice to the underlying
// data before it calculates and returns the hash of the chunk.
func (self *Hasher) Sum(b []byte) (r []byte) {
	t := self.bmt
	i := self.cur
	n := t.leaves[i]
	j := i
	// must run strictly before all nodes calculate
	// datanodes are guaranteed to have a parent
	if len(self.segment) > self.size && i > 0 && n.parent != nil {
		n = n.parent
	} else {
		i *= 2
	}
	d := self.finalise(n, i)
	self.writeSegment(j, self.segment, d)
	c := <-self.result
	self.releaseTree()

	// sha3(length + BMT(pure_chunk))
	if self.blockLength == nil {
		return c
	}
	res := self.pool.hasher()
	res.Reset()
	res.Write(self.blockLength)
	res.Write(c)
	return res.Sum(nil)
}

// Hasher implements the SwarmHash interface

// Hash waits for the hasher result and returns it;
// the caller must call this on a BMT Hasher being written to
func (self *Hasher) Hash() []byte {
	return <-self.result
}

// Hasher implements the io.Writer interface

// Write fills the buffer to hash;
// with every full segment completed it launches a hasher goroutine
// that shoots up the BMT
func (self *Hasher) Write(b []byte) (int, error) {
	l := len(b)
	if l <= 0 {
		return 0, nil
	}
	s := self.segment
	i := self.cur
	count := (self.count + 1) / 2
	need := self.count*self.size - self.cur*2*self.size
	size := self.size
	if need > size {
		size *= 2
	}
	if l < need {
		need = l
	}
	// calculate the missing bytes to complete the current open segment
	rest := size - len(s)
	if need < rest {
		rest = need
	}
	s = append(s, b[:rest]...)
	need -= rest
	// read full segments and the last possibly partial segment
	for need > 0 && i < count-1 {
		// push all finished chunks we read
		self.writeSegment(i, s, self.depth)
		need -= size
		if need < 0 {
			size += need
		}
		s = b[rest : rest+size]
		rest += size
		i++
	}
	self.segment = s
	self.cur = i
	// otherwise, we can assume len(s) == 0, so the whole buffer is read and the chunk is not yet full
	return l, nil
}

// Hasher implements the io.ReaderFrom interface

// ReadFrom reads from io.Reader and appends to the data to hash using Write;
// it reads until the chunk to hash reaches maximum length or the reader reaches EOF.
// The caller must Reset the hasher prior to the call.
func (self *Hasher) ReadFrom(r io.Reader) (m int64, err error) {
	bufsize := self.size*self.count - self.size*self.cur - len(self.segment)
	buf := make([]byte, bufsize)
	var read int
	for {
		var n int
		n, err = r.Read(buf)
		read += n
		if err == io.EOF || read == len(buf) {
			hash := self.Sum(buf[:n])
			if read == len(buf) {
				err = NewEOC(hash)
			}
			break
		}
		if err != nil {
			break
		}
		n, err = self.Write(buf[:n])
		if err != nil {
			break
		}
	}
	return int64(read), err
}
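
// A hedged usage sketch for ReadFrom (bytes.NewReader and the variable names
// below are illustrative, not part of this package):
//
//	h.Reset()
//	_, err := h.ReadFrom(bytes.NewReader(data))
//	if eoc, ok := err.(*EOC); ok {
//		root := eoc.Hash // chunk filled to capacity; BMT hash already computed
//		_ = root
//	}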

// Reset needs to be called before writing to the hasher
func (self *Hasher) Reset() {
	self.getTree()
	self.blockLength = nil
}

// Hasher implements the SwarmHash interface

// ResetWithLength needs to be called before writing to the hasher;
// the argument is supposed to be the byte slice binary representation of
// the length of the data subsumed under the hash
func (self *Hasher) ResetWithLength(l []byte) {
	self.Reset()
	self.blockLength = l
}
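
// For example, swarm-style callers typically encode the subsumed data length
// as an 8-byte little-endian integer (an assumption about the caller, not
// enforced here):
//
//	span := make([]byte, 8)
//	binary.LittleEndian.PutUint64(span, uint64(dataLength))
//	h.ResetWithLength(span)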

// releaseTree gives back the Tree to the pool whereby it unlocks;
// it resets tree, segment and cursor
func (self *Hasher) releaseTree() {
	if self.bmt != nil {
		n := self.bmt.leaves[self.cur]
		for ; n != nil; n = n.parent {
			n.unbalanced = false
			if n.parent != nil {
				n.root = false
			}
		}
		self.pool.Release(self.bmt)
		self.bmt = nil
	}
	self.cur = 0
	self.segment = nil
}

// writeSegment hashes segment data s for leaf i in its own goroutine:
// more than one segment's worth of data (a double segment) is hashed and
// its digest pushed to the leaf's parent, while a single or incomplete
// segment is written into the leaf itself via run
func (self *Hasher) writeSegment(i int, s []byte, d int) {
	h := self.pool.hasher()
	n := self.bmt.leaves[i]

	if len(s) > self.size && n.parent != nil {
		go func() {
			h.Reset()
			h.Write(s)
			s = h.Sum(nil)

			if n.root {
				self.result <- s
				return
			}
			self.run(n.parent, h, d, n.index, s)
		}()
		return
	}
	go self.run(n, h, d, i*2, s)
}

// run writes the hash or segment s into node n on the left or right side
// (depending on the parity of i) and climbs toward the root, terminating
// early when it is the first of the two siblings to arrive (toggle), or
// delivering the root hash to the result channel
func (self *Hasher) run(n *Node, h hash.Hash, d int, i int, s []byte) {
	isLeft := i%2 == 0
	for {
		if isLeft {
			n.left = s
		} else {
			n.right = s
		}
		if !n.unbalanced && n.toggle() {
			return
		}
		if !n.unbalanced || !isLeft || i == 0 && d == 0 {
			h.Reset()
			h.Write(n.left)
			h.Write(n.right)
			s = h.Sum(nil)
		} else {
			s = append(n.left, n.right...)
		}

		self.hash = s
		if n.root {
			self.result <- s
			return
		}

		isLeft = n.isLeft
		n = n.parent
		i++
	}
}

// getTree obtains a BMT resource by reserving one from the pool
func (self *Hasher) getTree() *Tree {
	if self.bmt != nil {
		return self.bmt
	}
	t := self.pool.Reserve()
	self.bmt = t
	return t
}

// toggle is an atomic bool toggle implementing a concurrent reusable 2-state object:
// atomic AddInt32 with %2 implements the toggle;
// it returns true if the toggler just put it in the active/waiting state
func (self *Node) toggle() bool {
	return atomic.AddInt32(&self.state, 1)%2 == 1
}
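
// For example, when a node's two children both deliver their results, the
// first caller of toggle gets true and its goroutine terminates, while the
// second gets false and carries on hashing the completed pair up the tree
// (see run above).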

// hashstr renders at most the first 4 bytes of b in hex, for Draw/logging
func hashstr(b []byte) string {
	end := len(b)
	if end > 4 {
		end = 4
	}
	return fmt.Sprintf("%x", b[:end])
}

// depth returns the number of tree levels above the leaf nodes: the smallest
// d such that 2^d leaves, each covering two segments, can hold n segments
func depth(n int) (d int) {
	for l := (n - 1) / 2; l > 0; l /= 2 {
		d++
	}
	return d
}
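
// For example, depth(128) == 6: the loop halves l through 63, 31, 15, 7, 3, 1,
// so a 128-segment chunk is covered by 2^6 = 64 leaf nodes holding two
// segments each.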

// finalise follows the zigzag path on the tree belonging
// to the final data segment
func (self *Hasher) finalise(n *Node, i int) (d int) {
	isLeft := i%2 == 0
	for {
		// when the final segment's path is going via left segments,
		// the incoming data is pushed to the parent upon pulling the left;
		// we do not need to toggle the state since this condition is
		// detectable
		n.unbalanced = isLeft
		n.right = nil
		if n.initial {
			n.root = true
			return d
		}
		isLeft = n.isLeft
		n = n.parent
		d++
	}
}

// EOC (end of chunk) implements the error interface
type EOC struct {
	Hash []byte // read the hash of the chunk off the error
}

// Error returns the error string
func (self *EOC) Error() string {
	return fmt.Sprintf("hasher limit reached, chunk hash: %x", self.Hash)
}

// NewEOC creates a new end-of-chunk error carrying the hash
func NewEOC(hash []byte) *EOC {
	return &EOC{hash}
}