github.com/aquanetwork/aquachain@v1.7.8/opt/bmt/bmt.go (about)

     1  // Copyright 2017 The aquachain Authors
     2  // This file is part of the aquachain library.
     3  //
     4  // The aquachain library is free software: you can redistribute it and/or modify
     5  // it under the terms of the GNU Lesser General Public License as published by
     6  // the Free Software Foundation, either version 3 of the License, or
     7  // (at your option) any later version.
     8  //
     9  // The aquachain library is distributed in the hope that it will be useful,
    10  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    11  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    12  // GNU Lesser General Public License for more details.
    13  //
    14  // You should have received a copy of the GNU Lesser General Public License
    15  // along with the aquachain library. If not, see <http://www.gnu.org/licenses/>.
    16  
    17  // Package bmt provides a binary merkle tree implementation
    18  package bmt
    19  
    20  import (
    21  	"fmt"
    22  	"hash"
    23  	"io"
    24  	"strings"
    25  	"sync"
    26  	"sync/atomic"
    27  )
    28  
    29  /*
    30  Binary Merkle Tree Hash is a hash function over arbitrary datachunks of limited size
    31  It is defined as the root hash of the binary merkle tree built over fixed size segments
    32  of the underlying chunk using any base hash function (e.g keccak 256 SHA3)
    33  
    34  It is used as the chunk hash function in swarm which in turn is the basis for the
    35  128 branching swarm hash http://swarm-guide.readthedocs.io/en/latest/architecture.html#swarm-hash
    36  
    37  The BMT is optimal for providing compact inclusion proofs, i.e. prove that a
    38  segment is a substring of a chunk starting at a particular offset
    39  The size of the underlying segments is fixed at 32 bytes (called the resolution
    40  of the BMT hash), the EVM word size to optimize for on-chain BMT verification
    41  as well as the hash size optimal for inclusion proofs in the merkle tree of the swarm hash.
    42  
    43  Two implementations are provided:
    44  
    45  * RefHasher is optimized for code simplicity and meant as a reference implementation
    46  * Hasher is optimized for speed taking advantage of concurrency with minimalistic
    47    control structure to coordinate the concurrent routines
    48    It implements the ChunkHash interface as well as the go standard hash.Hash interface
    49  
    50  */
    51  
const (
	// DefaultSegmentCount is the maximum number of segments of the underlying chunk
	DefaultSegmentCount = 128 // Should be equal to storage.DefaultBranches
	// DefaultPoolSize is the maximum number of bmt trees used by the hashers, i.e.,
	// the maximum number of concurrent BMT hashing operations performed by the same hasher
	DefaultPoolSize = 8
)
    59  
// BaseHasher is a hash.Hash constructor function used for the base hash of the BMT.
type BaseHasher func() hash.Hash
    62  
// Hasher a reusable hasher for fixed maximum size chunks representing a BMT
// implements the hash.Hash interface
// reuse pool of Tree-s for amortised memory allocation and resource control
// supports order-agnostic concurrent segment writes
// as well as sequential read and write
// can not be called concurrently on more than one chunk
// can be further appended after Sum
// Reset gives back the Tree to the pool and guaranteed to leave
// the tree and itself in a state reusable for hashing a new chunk
type Hasher struct {
	pool        *TreePool   // BMT resource pool
	bmt         *Tree       // prebuilt BMT resource for flowcontrol and proofs
	blocksize   int         // segment size (size of hash) also for hash.Hash
	count       int         // segment count
	size        int         // for hash.Hash same as hashsize
	cur         int         // cursor position for rightmost currently open chunk
	segment     []byte      // the rightmost open segment (not complete)
	depth       int         // index of last level
	result      chan []byte // result channel
	hash        []byte      // to record the result
	blockLength []byte      // the block length that needs to be added in Sum
}
    85  
    86  // New creates a reusable Hasher
    87  // implements the hash.Hash interface
    88  // pulls a new Tree from a resource pool for hashing each chunk
    89  func New(p *TreePool) *Hasher {
    90  	return &Hasher{
    91  		pool:      p,
    92  		depth:     depth(p.SegmentCount),
    93  		size:      p.SegmentSize,
    94  		blocksize: p.SegmentSize,
    95  		count:     p.SegmentCount,
    96  		result:    make(chan []byte),
    97  	}
    98  }
    99  
// Node is a reusable segment hasher representing a node in a BMT
// it allows for continued writes after a Sum
// and is left in completely reusable state after Reset
type Node struct {
	level, index int   // position of node for information/logging only
	initial      bool  // first and last node
	root         bool  // whether the node is root to a smaller BMT
	isLeft       bool  // whether it is left side of the parent double segment
	unbalanced   bool  // indicates if a node has only the left segment
	parent       *Node // BMT connections
	state        int32 // atomic increment impl concurrent boolean toggle
	left, right  []byte
}
   113  
   114  // NewNode constructor for segment hasher nodes in the BMT
   115  func NewNode(level, index int, parent *Node) *Node {
   116  	return &Node{
   117  		parent:  parent,
   118  		level:   level,
   119  		index:   index,
   120  		initial: index == 0,
   121  		isLeft:  index%2 == 0,
   122  	}
   123  }
   124  
// TreePool provides a pool of Trees used as resources by Hasher
// a Tree popped from the pool is guaranteed to have clean state
// for hashing a new chunk
// Hasher Reset releases the Tree to the pool
type TreePool struct {
	lock         sync.Mutex // guards count
	c            chan *Tree // buffered channel holding the free Trees
	hasher       BaseHasher // base hash constructor used to build new Trees
	SegmentSize  int        // size of a segment (size of the base hash)
	SegmentCount int        // number of segments in a chunk
	Capacity     int        // maximum number of Trees ever created
	count        int        // number of Trees created so far
}
   138  
   139  // NewTreePool creates a Tree pool with hasher, segment size, segment count and capacity
   140  // on GetTree it reuses free Trees or creates a new one if size is not reached
   141  func NewTreePool(hasher BaseHasher, segmentCount, capacity int) *TreePool {
   142  	return &TreePool{
   143  		c:            make(chan *Tree, capacity),
   144  		hasher:       hasher,
   145  		SegmentSize:  hasher().Size(),
   146  		SegmentCount: segmentCount,
   147  		Capacity:     capacity,
   148  	}
   149  }
   150  
   151  // Drain drains the pool uptil it has no more than n resources
   152  func (self *TreePool) Drain(n int) {
   153  	self.lock.Lock()
   154  	defer self.lock.Unlock()
   155  	for len(self.c) > n {
   156  		<-self.c
   157  		self.count--
   158  	}
   159  }
   160  
   161  // Reserve is blocking until it returns an available Tree
   162  // it reuses free Trees or creates a new one if size is not reached
   163  func (self *TreePool) Reserve() *Tree {
   164  	self.lock.Lock()
   165  	defer self.lock.Unlock()
   166  	var t *Tree
   167  	if self.count == self.Capacity {
   168  		return <-self.c
   169  	}
   170  	select {
   171  	case t = <-self.c:
   172  	default:
   173  		t = NewTree(self.hasher, self.SegmentSize, self.SegmentCount)
   174  		self.count++
   175  	}
   176  	return t
   177  }
   178  
// Release gives back a Tree to the pool.
// This Tree is guaranteed to be in reusable state
// does not need locking
// NOTE(review): the send presumably never blocks because at most
// Capacity trees are ever created (see Reserve) and the channel has
// Capacity slots — confirm
func (self *TreePool) Release(t *Tree) {
	self.c <- t // can never fail but...
}
   185  
// Tree is a reusable control structure representing a BMT
// organised in a binary tree
// Hasher uses a TreePool to pick one for each chunk hash
// the Tree is 'locked' while not in the pool
type Tree struct {
	leaves []*Node // nodes of the last (data) level; ancestors reachable via Node.parent
}
   193  
// Draw draws the BMT (badly)
// renders the node hashes level by level into a rough ASCII diagram;
// debugging aid only
// NOTE(review): the d parameter is never used in the body — confirm
func (self *Tree) Draw(hash []byte, d int) string {
	var left, right []string
	var anc []*Node
	for i, n := range self.leaves {
		left = append(left, fmt.Sprintf("%v", hashstr(n.left)))
		if i%2 == 0 {
			anc = append(anc, n.parent)
		}
		right = append(right, fmt.Sprintf("%v", hashstr(n.right)))
	}
	// NOTE(review): this discards the parents collected in the loop above
	// and restarts the level walk from the leaves — confirm intended
	anc = self.leaves
	var hashes [][]string
	for l := 0; len(anc) > 0; l++ {
		var nodes []*Node
		hash := []string{""}
		for i, n := range anc {
			hash = append(hash, fmt.Sprintf("%v|%v", hashstr(n.left), hashstr(n.right)))
			if i%2 == 0 && n.parent != nil {
				nodes = append(nodes, n.parent)
			}
		}
		hash = append(hash, "")
		hashes = append(hashes, hash)
		anc = nodes
	}
	// top row: the externally supplied root hash
	hashes = append(hashes, []string{"", fmt.Sprintf("%v", hashstr(hash)), ""})
	total := 60
	del := "                             "
	var rows []string
	// render rows top-down, spreading each row's entries across the width
	for i := len(hashes) - 1; i >= 0; i-- {
		var textlen int
		hash := hashes[i]
		for _, s := range hash {
			textlen += len(s)
		}
		if total < textlen {
			total = textlen + len(hash)
		}
		delsize := (total - textlen) / (len(hash) - 1)
		if delsize > len(del) {
			delsize = len(del)
		}
		row := fmt.Sprintf("%v: %v", len(hashes)-i-1, strings.Join(hash, del[:delsize]))
		rows = append(rows, row)

	}
	rows = append(rows, strings.Join(left, "  "))
	rows = append(rows, strings.Join(right, "  "))
	return strings.Join(rows, "\n") + "\n"
}
   245  
   246  // NewTree initialises the Tree by building up the nodes of a BMT
   247  // segment size is stipulated to be the size of the hash
   248  // segmentCount needs to be positive integer and does not need to be
   249  // a power of two and can even be an odd number
   250  // segmentSize * segmentCount determines the maximum chunk size
   251  // hashed using the tree
   252  func NewTree(hasher BaseHasher, segmentSize, segmentCount int) *Tree {
   253  	n := NewNode(0, 0, nil)
   254  	n.root = true
   255  	prevlevel := []*Node{n}
   256  	// iterate over levels and creates 2^level nodes
   257  	level := 1
   258  	count := 2
   259  	for d := 1; d <= depth(segmentCount); d++ {
   260  		nodes := make([]*Node, count)
   261  		for i := 0; i < len(nodes); i++ {
   262  			parent := prevlevel[i/2]
   263  			t := NewNode(level, i, parent)
   264  			nodes[i] = t
   265  		}
   266  		prevlevel = nodes
   267  		level++
   268  		count *= 2
   269  	}
   270  	// the datanode level is the nodes on the last level where
   271  	return &Tree{
   272  		leaves: prevlevel,
   273  	}
   274  }
   275  
// methods needed by hash.Hash

// Size returns the size
// of the resulting hash in bytes (hash.Hash interface)
func (self *Hasher) Size() int {
	return self.size
}
   282  
// BlockSize returns the block size
// (the segment size) in bytes (hash.Hash interface)
func (self *Hasher) BlockSize() int {
	return self.blocksize
}
   287  
// Sum returns the hash of the buffer
// hash.Hash interface Sum method appends the byte slice to the underlying
// data before it calculates and returns the hash of the chunk
// NOTE(review): the argument b is never read below; the hash covers only
// the data previously written — this deviates from the standard
// hash.Hash contract, confirm callers rely on it
func (self *Hasher) Sum(b []byte) (r []byte) {
	t := self.bmt
	i := self.cur
	n := t.leaves[i]
	j := i
	// must run strictly before all nodes calculate
	// datanodes are guaranteed to have a parent
	if len(self.segment) > self.size && i > 0 && n.parent != nil {
		n = n.parent
	} else {
		i *= 2
	}
	// mark the zigzag path of the final segment and obtain its depth
	d := self.finalise(n, i)
	// push the last (possibly partial) segment up the tree
	self.writeSegment(j, self.segment, d)
	// wait for the root hash produced by the hasher goroutines
	c := <-self.result
	self.releaseTree()

	// sha3(length + BMT(pure_chunk))
	if self.blockLength == nil {
		return c
	}
	res := self.pool.hasher()
	res.Reset()
	res.Write(self.blockLength)
	res.Write(c)
	return res.Sum(nil)
}
   318  
// Hasher implements the SwarmHash interface

// Hash waits for the hasher result and returns it
// caller must call this on a BMT Hasher being written to
func (self *Hasher) Hash() []byte {
	return <-self.result
}
   326  
// Hasher implements the io.Writer interface

// Write fills the buffer to hash
// with every full segment complete launches a hasher go routine
// that shoots up the BMT
func (self *Hasher) Write(b []byte) (int, error) {
	l := len(b)
	if l <= 0 {
		return 0, nil
	}
	s := self.segment
	i := self.cur
	// each leaf takes a double segment (left+right), so the effective
	// leaf count is half the segment count, rounded up
	count := (self.count + 1) / 2
	// bytes still needed to fill the whole chunk from the cursor position
	need := self.count*self.size - self.cur*2*self.size
	size := self.size
	if need > size {
		size *= 2
	}
	if l < need {
		need = l
	}
	// calculate missing bit to complete current open segment
	rest := size - len(s)
	if need < rest {
		rest = need
	}
	s = append(s, b[:rest]...)
	need -= rest
	// read full segments and the last possibly partial segment
	for need > 0 && i < count-1 {
		// push all finished chunks we read
		self.writeSegment(i, s, self.depth)
		need -= size
		if need < 0 {
			size += need
		}
		// NOTE(review): s aliases the caller's buffer b here; b must not be
		// mutated until hashing completes — confirm against callers
		s = b[rest : rest+size]
		rest += size
		i++
	}
	self.segment = s
	self.cur = i
	// otherwise, we can assume len(s) == 0, so all buffer is read and chunk is not yet full
	return l, nil
}
   372  
// Hasher implements the io.ReaderFrom interface

// ReadFrom reads from io.Reader and appends to the data to hash using Write
// it reads so that chunk to hash is maximum length or reader reaches EOF
// caller must Reset the hasher prior to call
func (self *Hasher) ReadFrom(r io.Reader) (m int64, err error) {
	// capacity left for a maximum-length chunk given what is already written
	bufsize := self.size*self.count - self.size*self.cur - len(self.segment)
	buf := make([]byte, bufsize)
	var read int
	for {
		var n int
		n, err = r.Read(buf)
		read += n
		if err == io.EOF || read == len(buf) {
			hash := self.Sum(buf[:n])
			if read == len(buf) {
				// chunk is full: signal end-of-chunk carrying the hash
				err = NewEOC(hash)
			}
			break
		}
		if err != nil {
			break
		}
		// NOTE(review): every iteration reads into buf starting at index 0,
		// while Write may retain subslices of buf; a subsequent Read would
		// overwrite retained data — confirm multi-read behavior
		n, err = self.Write(buf[:n])
		if err != nil {
			break
		}
	}
	return int64(read), err
}
   403  
// Reset needs to be called before writing to the hasher
// it reserves a Tree from the pool (if not already held) and clears
// the recorded block length
func (self *Hasher) Reset() {
	self.getTree()
	self.blockLength = nil
}
   409  
// Hasher implements the SwarmHash interface

// ResetWithLength needs to be called before writing to the hasher
// the argument is supposed to be the byte slice binary representation of
// the length of the data subsumed under the hash
func (self *Hasher) ResetWithLength(l []byte) {
	self.Reset()
	self.blockLength = l

}
   420  
// Release gives back the Tree to the pool whereby it unlocks
// it resets tree, segment and index
func (self *Hasher) releaseTree() {
	if self.bmt != nil {
		// walk up from the last open leaf, clearing the flags that
		// finalise set on the final segment's path
		n := self.bmt.leaves[self.cur]
		for ; n != nil; n = n.parent {
			n.unbalanced = false
			if n.parent != nil {
				// only the true top node keeps its root flag
				n.root = false
			}
		}
		self.pool.Release(self.bmt)
		self.bmt = nil

	}
	self.cur = 0
	self.segment = nil
}
   439  
// writeSegment pushes the i-th segment into the BMT asynchronously.
// A complete double segment (left+right) is hashed first and the digest
// propagated to the parent; otherwise the raw segment is processed on
// the leaf node itself. d is the depth to pass on to run.
func (self *Hasher) writeSegment(i int, s []byte, d int) {
	h := self.pool.hasher()
	n := self.bmt.leaves[i]

	if len(s) > self.size && n.parent != nil {
		go func() {
			// complete double segment: hash it, then continue upwards
			h.Reset()
			h.Write(s)
			s = h.Sum(nil)

			if n.root {
				self.result <- s
				return
			}
			self.run(n.parent, h, d, n.index, s)
		}()
		return
	}
	// single (possibly partial) segment: process on the leaf node itself
	go self.run(n, h, d, i*2, s)
}
   460  
// run pushes a value up the BMT starting at node n.
// Each balanced node is visited by two writers (its left and right
// children); the first arrival parks its data via toggle and returns,
// the second combines both sides and continues towards the root.
// i gives the starting child index (parity selects left/right), d the
// depth computed by finalise.
func (self *Hasher) run(n *Node, h hash.Hash, d int, i int, s []byte) {
	isLeft := i%2 == 0
	for {
		// deposit this side's data on the node
		if isLeft {
			n.left = s
		} else {
			n.right = s
		}
		// on balanced nodes the first writer to toggle parks and leaves
		// completion to the second writer
		if !n.unbalanced && n.toggle() {
			return
		}
		if !n.unbalanced || !isLeft || i == 0 && d == 0 {
			// combine both children into this node's value
			h.Reset()
			h.Write(n.left)
			h.Write(n.right)
			s = h.Sum(nil)

		} else {
			// unbalanced left side: pass the concatenation upwards unhashed
			s = append(n.left, n.right...)
		}

		self.hash = s
		if n.root {
			self.result <- s
			return
		}

		// continue from the parent, taking this node's side
		isLeft = n.isLeft
		n = n.parent
		i++
	}
}
   493  
   494  // getTree obtains a BMT resource by reserving one from the pool
   495  func (self *Hasher) getTree() *Tree {
   496  	if self.bmt != nil {
   497  		return self.bmt
   498  	}
   499  	t := self.pool.Reserve()
   500  	self.bmt = t
   501  	return t
   502  }
   503  
   504  // atomic bool toggle implementing a concurrent reusable 2-state object
   505  // atomic addint with %2 implements atomic bool toggle
   506  // it returns true if the toggler just put it in the active/waiting state
   507  func (self *Node) toggle() bool {
   508  	return atomic.AddInt32(&self.state, 1)%2 == 1
   509  }
   510  
   511  func hashstr(b []byte) string {
   512  	end := len(b)
   513  	if end > 4 {
   514  		end = 4
   515  	}
   516  	return fmt.Sprintf("%x", b[:end])
   517  }
   518  
   519  func depth(n int) (d int) {
   520  	for l := (n - 1) / 2; l > 0; l /= 2 {
   521  		d++
   522  	}
   523  	return d
   524  }
   525  
// finalise is following the zigzags on the tree belonging
// to the final datasegment, marking each node on the path and
// returning the number of levels climbed to reach the root
func (self *Hasher) finalise(n *Node, i int) (d int) {
	isLeft := i%2 == 0
	for {
		// when the final segment's path is going via left segments
		// the incoming data is pushed to the parent upon pulling the left
		// we do not need to toggle the state since this condition is
		// detectable
		n.unbalanced = isLeft
		n.right = nil
		if n.initial {
			n.root = true
			return d
		}
		isLeft = n.isLeft
		n = n.parent
		d++
	}
}
   546  
   547  // EOC (end of chunk) implements the error interface
   548  type EOC struct {
   549  	Hash []byte // read the hash of the chunk off the error
   550  }
   551  
   552  // Error returns the error string
   553  func (self *EOC) Error() string {
   554  	return fmt.Sprintf("hasher limit reached, chunk hash: %x", self.Hash)
   555  }
   556  
   557  // NewEOC creates new end of chunk error with the hash
   558  func NewEOC(hash []byte) *EOC {
   559  	return &EOC{hash}
   560  }