github.com/vuuihc/gocedar@v0.1.0/cedar.go (about)

     1  // Copyright 2016 Evans. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package cedar
     6  
     7  import (
     8  	"os"
     9  	"unsafe"
    10  )
    11  
const (
	// maxMemStep is a size in bytes (1 << 30) consulted by addBlock when the
	// storage is full: while capacity*sizeof(Node) does not exceed it the
	// capacity is doubled; once it does, the capacity grows by
	// maxMemStep/sizeof(Node) nodes per step instead.
	maxMemStep = 1 << 30
)
    15  
// NInfo stores the information about the trie: for each node, the labels
// that link it into its parent's chain of children.
type NInfo struct {
	// child is the label of the first child of this node and sibling is the
	// label of the next child of this node's parent (see pushSibling and
	// popSibling). The chain-walking loops in this file stop at a zero label.
	sibling, child byte // uint8
}
    20  
// Node contains the array of `base` and `check` as specified in the paper:
// "An efficient implementation of trie structures"
// https://dl.acm.org/citation.cfm?id=146691
//
// While a slot is free, baseV and check hold the negated indices of the
// previous and next free slot of its block's empty ring (see New, addBlock
// and pushENode). Once popENode hands the slot out, check holds the index of
// the parent node.
type Node struct {
	baseV, check int // int32
}
    27  
    28  func (n *Node) base(reduced ...bool) int {
    29  	if !isReduced(reduced...) {
    30  		return n.baseV
    31  	}
    32  
    33  	return -(n.baseV + 1)
    34  }
    35  
    36  // Block stores the linked-list pointers and the stats info for blocks.
    37  //
    38  // Because of type conversion, this version all int16 and int32 uses int,
    39  // witch will be optimized in the next version.
    40  type Block struct {
    41  	prev   int // int32   // previous block's index, 3 bytes width
    42  	next   int // next block's index, 3 bytes width
    43  	num    int // the number of slots that is free, the range is 0-256
    44  	reject int // a heuristic number to make the search for free space faster...
    45  	trial  int // the number of times this block has been probed by `find_places` for the free block.
    46  	eHead  int // the index of the first empty elemenet in this block
    47  }
    48  
    49  func (b *Block) init() {
    50  	b.num = 256    // each of block has 256 free slots at the beginning
    51  	b.reject = 257 // initially every block need to be fully iterated through so that we can reject it to be unusable.
    52  }
    53  
// Cedar holds all of the information about double array trie.
//
// The bookkeeping fields used by the methods in this file (Reduced, capacity,
// size, ordered, maxTrial, reject, blocksHeadFull/Closed/Open, useMMap and
// LoadSize) are not declared on Cedar itself; they are promoted from the
// embedded *MetaInfo, which is declared elsewhere.
type Cedar struct {
	mmap *MMap // consulted by addBlock to grow the storage when useMMap is set
	*MetaInfo

	// Reduced option the reduced trie
	// Reduced bool

	array  []Node  // storing the `base` and `check` info from the original paper.
	nInfos []NInfo // child/sibling labels, indexed by the same slot index as array
	blocks []Block // one entry per 256 consecutive slots of array (index = slot >> 8)
	// reject [257]int

	// blocksHeadFull   int // the index of the first 'Full' block, 0 means no 'Full' block
	// blocksHeadClosed int // the index of the first 'Closed' block, 0 means no ' Closed' block
	// blocksHeadOpen   int // the index of the first 'Open' block, 0 means no 'Open' block

	// capacity int
	// size     int
	// ordered  bool
	// maxTrial int // the parameter for cedar, it could be tuned for more, but the default is 1.
}
    76  
const (
	// ValLimit is the cedar value limit: the largest value representable by
	// int. In reduced mode popENode and listN store it in baseV of a freshly
	// initialised node.
	ValLimit = int(^uint(0) >> 1)
	// NoVal is the sentinel meaning "no value". It is not referenced in this
	// part of the file.
	NoVal = -1
)
    83  
    84  // type PrefixIter struct {
    85  // }
    86  
// Options configures the trie created by New.
type Options struct {
	// Reduced is passed through isReduced and stored as the trie's Reduced
	// flag, which selects the reduced-trie encoding of node bases.
	Reduced bool
	// UseMMap makes New set up its storage through NewMMap/InitData instead
	// of allocating plain slices.
	UseMMap bool
	// MMapPath is the path handed to NewMMap. When UseMMap is set and the
	// path is empty, New replaces it with os.TempDir().
	MMapPath string
}
    92  
    93  // New initialize the Cedar for further use
    94  func New(opt *Options) *Cedar {
    95  	cd := &Cedar{}
    96  	if opt.UseMMap {
    97  		if len(opt.MMapPath) == 0 {
    98  			opt.MMapPath = os.TempDir()
    99  		}
   100  		mmap := NewMMap(opt.MMapPath)
   101  		mmap.InitData(cd)
   102  		cd.useMMap = true
   103  	} else {
   104  		cd.MetaInfo = &MetaInfo{}
   105  		cd.array = make([]Node, 256)
   106  		cd.nInfos = make([]NInfo, 256)
   107  		cd.blocks = make([]Block, 1)
   108  	}
   109  	if cd.LoadSize > 0 { // if there is data in mmap, do not need init meta
   110  		return cd
   111  	}
   112  	cd.Reduced = isReduced(opt.Reduced)
   113  	cd.capacity = 256
   114  	cd.size = 256
   115  	cd.ordered = true
   116  	cd.maxTrial = 1
   117  
   118  	if !cd.Reduced {
   119  		cd.array[0] = Node{baseV: 0, check: -1}
   120  	} else {
   121  		cd.array[0] = Node{baseV: -1, check: -1}
   122  	}
   123  	// make `baseV` point to the previous element, and make `check` point to the next element
   124  	for i := 1; i < 256; i++ {
   125  		cd.array[i] = Node{baseV: -(i - 1), check: -(i + 1)}
   126  	}
   127  	// make them link as a cyclic doubly-linked list
   128  	cd.array[1].baseV = -255
   129  	cd.array[255].check = -1
   130  
   131  	cd.blocks[0].eHead = 1
   132  	cd.blocks[0].init()
   133  
   134  	for i := 0; i <= 256; i++ {
   135  		cd.reject[i] = i + 1
   136  	}
   137  
   138  	return cd
   139  }
   140  
   141  // follow To move in the trie by following the `label`, and insert the node if the node is not there,
   142  // it is used by the `update` to populate the trie.
   143  func (cd *Cedar) follow(from int, label byte) (to int) {
   144  	base := cd.array[from].base(cd.Reduced)
   145  
   146  	// the node is not there
   147  	to = base ^ int(label)
   148  	if base < 0 || cd.array[to].check < 0 {
   149  		// allocate a e node
   150  		to = cd.popENode(base, from, label)
   151  		branch := to ^ int(label)
   152  
   153  		// maintain the info in ninfo
   154  		cd.pushSibling(from, branch, label, base >= 0)
   155  		return
   156  	}
   157  
   158  	// the node is already there and the ownership is not `from`,
   159  	// therefore a conflict.
   160  	if cd.array[to].check != from {
   161  		// call `resolve` to relocate.
   162  		to = cd.resolve(from, base, label)
   163  	}
   164  
   165  	return
   166  }
   167  
   168  // Mark an edge `e` as used in a trie node.
   169  // pop empty node from block; never transfer the special block (idx = 0)
   170  // nolint
   171  func (cd *Cedar) popENode(base, from int, label byte) int {
   172  	e := base ^ int(label)
   173  	if base < 0 {
   174  		e = cd.findPlace()
   175  	}
   176  
   177  	idx := e >> 8
   178  	arr := &cd.array[e]
   179  
   180  	b := &cd.blocks[idx]
   181  	b.num--
   182  	// move the block at idx to the correct linked-list depending the free slots it still have.
   183  	if b.num == 0 {
   184  		if idx != 0 {
   185  			// Closed to Full
   186  			cd.transferBlock(idx, &cd.blocksHeadClosed, &cd.blocksHeadFull)
   187  		}
   188  	} else {
   189  		// release empty node from empty ring
   190  		cd.array[-arr.baseV].check = arr.check
   191  		cd.array[-arr.check].baseV = arr.baseV
   192  
   193  		if e == b.eHead {
   194  			b.eHead = -arr.check
   195  		}
   196  
   197  		if idx != 0 && b.num == 1 && b.trial != cd.maxTrial {
   198  			// Open to Closed
   199  			cd.transferBlock(idx, &cd.blocksHeadOpen, &cd.blocksHeadClosed)
   200  		}
   201  	}
   202  
   203  	// initialize the released node
   204  	if !cd.Reduced {
   205  		if label != 0 {
   206  			cd.array[e].baseV = -1
   207  		} else {
   208  			cd.array[e].baseV = 0
   209  		}
   210  		cd.array[e].check = from
   211  		if base < 0 {
   212  			cd.array[from].baseV = e ^ int(label)
   213  		}
   214  
   215  		return e
   216  	}
   217  
   218  	cd.array[e].baseV = ValLimit
   219  	cd.array[e].check = from
   220  	if base < 0 {
   221  		cd.array[from].baseV = -(e ^ int(label)) - 1
   222  	}
   223  
   224  	return e
   225  }
   226  
   227  // Mark an edge `e` as free in a trie node.
   228  // push empty node into empty ring
   229  // nolint
   230  func (cd *Cedar) pushENode(e int) {
   231  	idx := e >> 8
   232  	b := &cd.blocks[idx]
   233  	b.num++
   234  
   235  	if b.num == 1 {
   236  		b.eHead = e
   237  		cd.array[e] = Node{baseV: -e, check: -e}
   238  
   239  		if idx != 0 {
   240  			// Move the block from 'Full' to 'Closed' since it has one free slot now.
   241  			cd.transferBlock(idx, &cd.blocksHeadFull, &cd.blocksHeadClosed)
   242  		}
   243  	} else {
   244  		prev := b.eHead
   245  		next := -cd.array[prev].check
   246  
   247  		// Insert to the edge immediately after the e_head
   248  		cd.array[e] = Node{baseV: -prev, check: -next}
   249  
   250  		cd.array[prev].check = -e
   251  		cd.array[next].baseV = -e
   252  
   253  		// Move the block from 'Closed' to 'Open' since it has more than one free slot now.
   254  		if b.num == 2 || b.trial == cd.maxTrial {
   255  			if idx != 0 {
   256  				// Closed to Open
   257  				cd.transferBlock(idx, &cd.blocksHeadClosed, &cd.blocksHeadOpen)
   258  			}
   259  		}
   260  
   261  		// Reset the trial stats
   262  		b.trial = 0
   263  	}
   264  
   265  	if b.reject < cd.reject[b.num] {
   266  		b.reject = cd.reject[b.num]
   267  	}
   268  	// reset ninfo; no child, no sibling
   269  	cd.nInfos[e] = NInfo{}
   270  }
   271  
   272  // push the `label` into the sibling chain
   273  // to from's child
   274  func (cd *Cedar) pushSibling(from, base int, label byte, hasChild bool) {
   275  	c := &cd.nInfos[from].child
   276  	keepOrder := *c == 0
   277  	if cd.ordered {
   278  		keepOrder = label > *c
   279  	}
   280  
   281  	if hasChild && keepOrder {
   282  		c = &cd.nInfos[base^int(*c)].sibling
   283  		for cd.ordered && *c != 0 && *c < label {
   284  			c = &cd.nInfos[base^int(*c)].sibling
   285  		}
   286  	}
   287  	cd.nInfos[base^int(label)].sibling = *c
   288  	*c = label
   289  }
   290  
   291  // remove the `label` from the sibling chain.
   292  func (cd *Cedar) popSibling(from, base int, label byte) {
   293  	c := &cd.nInfos[from].child
   294  	for *c != label {
   295  		c = &cd.nInfos[base^int(*c)].sibling
   296  	}
   297  	*c = cd.nInfos[base^int(*c)].sibling
   298  }
   299  
   300  // Loop through the siblings to see which one reached the end first, which means
   301  // it is the one with smaller in children size, and we should try ti relocate the smaller one.
   302  // check whether to replace branching w/ the newly added node
   303  func (cd *Cedar) consult(baseN, baseP int, cN, cP byte) bool {
   304  	cN = cd.nInfos[baseN^int(cN)].sibling
   305  	cP = cd.nInfos[baseP^int(cP)].sibling
   306  
   307  	for cN != 0 && cP != 0 {
   308  		cN = cd.nInfos[baseN^int(cN)].sibling
   309  		cP = cd.nInfos[baseP^int(cP)].sibling
   310  	}
   311  
   312  	return cP != 0
   313  }
   314  
   315  // Collect the list of the children, and push the label as well if it is not terminal node.
   316  // enumerate (equal to or more than one) child nodes
   317  func (cd *Cedar) setChild(base int, c, label byte, flag bool) []byte {
   318  	child := make([]byte, 0, 257)
   319  	// 0: terminal
   320  	if c == 0 {
   321  		child = append(child, c)
   322  		c = cd.nInfos[base^int(c)].sibling
   323  	}
   324  
   325  	if cd.ordered {
   326  		for c != 0 && c <= label {
   327  			child = append(child, c)
   328  			c = cd.nInfos[base^int(c)].sibling
   329  		}
   330  	}
   331  
   332  	if flag {
   333  		child = append(child, label)
   334  	}
   335  
   336  	for c != 0 {
   337  		child = append(child, c)
   338  		c = cd.nInfos[base^int(c)].sibling
   339  	}
   340  
   341  	return child
   342  }
   343  
   344  // For the case where only one free slot is needed
   345  func (cd *Cedar) findPlace() int {
   346  	if cd.blocksHeadClosed != 0 {
   347  		return cd.blocks[cd.blocksHeadClosed].eHead
   348  	}
   349  
   350  	if cd.blocksHeadOpen != 0 {
   351  		return cd.blocks[cd.blocksHeadOpen].eHead
   352  	}
   353  
   354  	// the block is not enough, resize it and allocate it.
   355  	return cd.addBlock() << 8
   356  }
   357  
   358  // For the case where multiple free slots are needed.
   359  func (cd *Cedar) findPlaces(child []byte) int {
   360  	idx := cd.blocksHeadOpen
   361  	// still have available 'Open' blocks.
   362  	if idx != 0 {
   363  		e := cd.listIdx(idx, child)
   364  		if e > 0 {
   365  			return e
   366  		}
   367  	}
   368  
   369  	return cd.addBlock() << 8
   370  }
   371  
   372  func (cd *Cedar) listIdx(idx int, child []byte) int {
   373  	n := len(child)
   374  	bo := cd.blocks[cd.blocksHeadOpen].prev
   375  
   376  	// only proceed if the free slots are more than the number of children. Also, we
   377  	// save the minimal number of attempts to fail in the `reject`, it only worths to
   378  	// try out this block if the number of children is less than that number.
   379  	for {
   380  		b := &cd.blocks[idx]
   381  		if b.num >= n && n < b.reject {
   382  			e := cd.listEHead(b, child)
   383  			if e > 0 {
   384  				return e
   385  			}
   386  		}
   387  
   388  		// we broke out of the loop, that means we failed. We save the information in
   389  		// `reject` for future pruning.
   390  		b.reject = n
   391  		if b.reject < cd.reject[b.num] {
   392  			// put this stats into the global array of information as well.
   393  			cd.reject[b.num] = b.reject
   394  		}
   395  
   396  		idxN := b.next
   397  		b.trial++
   398  		// move this block to the 'Closed' block list since it has reached the max_trial
   399  		if b.trial == cd.maxTrial {
   400  			cd.transferBlock(idx, &cd.blocksHeadOpen, &cd.blocksHeadClosed)
   401  		}
   402  
   403  		// we have finsihed one round of this cyclic doubly-linked-list.
   404  		if idx == bo {
   405  			break
   406  		}
   407  		// going to the next in this linked list group
   408  		idx = idxN
   409  	}
   410  
   411  	return 0
   412  }
   413  
   414  func (cd *Cedar) listEHead(b *Block, child []byte) int {
   415  	for e := b.eHead; ; {
   416  		base := e ^ int(child[0])
   417  		// iterate through the children to see if they are available: (check < 0)
   418  		for i := 0; cd.array[base^int(child[i])].check < 0; i++ {
   419  			if i == len(child)-1 {
   420  				// we have found the available block.
   421  				b.eHead = e
   422  				return e
   423  			}
   424  		}
   425  
   426  		// save the next free block's information in `check`
   427  		e = -cd.array[e].check
   428  		if e == b.eHead {
   429  			break
   430  		}
   431  	}
   432  
   433  	return 0
   434  }
   435  
   436  // resolve the conflict by moving one of the the nodes to a free block.
   437  // resolve conflict on base_n ^ label_n = base_p ^ label_p
   438  func (cd *Cedar) resolve(fromN, baseN int, labelN byte) int {
   439  	toPn := baseN ^ int(labelN)
   440  
   441  	// the `base` and `from` for the conflicting one.
   442  	fromP := cd.array[toPn].check
   443  	baseP := cd.array[fromP].base(cd.Reduced)
   444  
   445  	// whether to replace siblings of newly added
   446  	flag := cd.consult(
   447  		baseN, baseP,
   448  		cd.nInfos[fromN].child,
   449  		cd.nInfos[fromP].child,
   450  	)
   451  
   452  	// collect the list of children for the block that we are going to relocate.
   453  	var children []byte
   454  	if flag {
   455  		children = cd.setChild(baseN, cd.nInfos[fromN].child, labelN, true)
   456  	} else {
   457  		children = cd.setChild(baseP, cd.nInfos[fromP].child, 255, false)
   458  	}
   459  
   460  	// decide which algorithm to allocate free block depending on the number of children
   461  	// we have.
   462  	base := 0
   463  	if len(children) == 1 {
   464  		base = cd.findPlace()
   465  	} else {
   466  		base = cd.findPlaces(children)
   467  	}
   468  	base ^= int(children[0])
   469  
   470  	var from, nbase int
   471  	if flag {
   472  		from = fromN
   473  		nbase = baseN
   474  	} else {
   475  		from = fromP
   476  		nbase = baseP
   477  	}
   478  
   479  	if flag && children[0] == labelN {
   480  		cd.nInfos[from].child = labelN
   481  	}
   482  
   483  	// #[cfg(feature != "reduced-trie")]
   484  	if !cd.Reduced {
   485  		cd.array[from].baseV = base
   486  	} else {
   487  		cd.array[from].baseV = -base - 1
   488  	}
   489  	base, labelN, toPn = cd.listN(base, from, nbase, fromN, toPn,
   490  		labelN, children, flag)
   491  
   492  	// return the position that is free now.
   493  	if flag {
   494  		return base ^ int(labelN)
   495  	}
   496  
   497  	return toPn
   498  }
   499  
// listN does the actual relocation for resolve: every label in children is
// moved from its old slot (nbase ^ label) to its new slot (base ^ label)
// under the parent `from`.
//
// flag is true when the family of the newly added edge (fromN, labelN) is the
// one being moved; toPn is the contested slot. base, labelN and toPn are
// returned unchanged.
func (cd *Cedar) listN(base, from, nbase, fromN, toPn int,
	labelN byte, children []byte, flag bool) (int, byte, int) {
	// the actual work for relocating the children
	for i := 0; i < len(children); i++ {
		// `to` is the new slot, `newTo` the slot the child is moved away from.
		to := cd.popENode(base, from, children[i])
		newTo := nbase ^ int(children[i])

		// Rebuild the sibling chain in the order given by children.
		if i == len(children)-1 {
			cd.nInfos[to].sibling = 0
		} else {
			cd.nInfos[to].sibling = children[i+1]
		}

		// new node has no children
		if flag && newTo == toPn {
			continue
		}

		arr := &cd.array[to]
		arrs := &cd.array[newTo]
		arr.baseV = arrs.baseV

		// Decide from the copied base, per encoding, whether the moved node
		// has children of its own; label 0 is the terminal and never has any.
		condition := false
		if !cd.Reduced {
			condition = arr.baseV > 0 && children[i] != 0
		} else {
			condition = arr.baseV < 0 && children[i] != 0
		}

		if condition {
			// this node has children, fix their check
			c := cd.nInfos[newTo].child
			cd.nInfos[to].child = c
			cd.array[arr.base(cd.Reduced)^int(c)].check = to

			c = cd.nInfos[arr.base(cd.Reduced)^int(c)].sibling
			for c != 0 {
				cd.array[arr.base(cd.Reduced)^int(c)].check = to
				c = cd.nInfos[arr.base(cd.Reduced)^int(c)].sibling
			}
		}

		// the parent node is moved
		if !flag && newTo == fromN {
			fromN = to
		}

		// Every vacated slot except the contested one goes back to the ring
		// of empty slots.
		if flag || newTo != toPn {
			cd.pushENode(newTo)
			continue
		}

		// The contested slot is reused at once for the new edge
		// (fromN, labelN) instead of being freed.
		cd.pushSibling(fromN, toPn^int(labelN), labelN, true)
		cd.nInfos[newTo].child = 0

		if !cd.Reduced {
			if labelN != 0 {
				arrs.baseV = -1
			} else {
				arrs.baseV = 0
			}
		} else {
			arrs.baseV = ValLimit
		}
		arrs.check = fromN

	}

	return base, labelN, toPn
}
   571  
   572  // pop a block at idx from the linked-list of type `from`, specially handled if it is the last
   573  // one in the linked-list.
   574  func (cd *Cedar) popBlock(idx int, from *int, last bool) {
   575  	if last {
   576  		*from = 0
   577  		return
   578  	}
   579  
   580  	b := &cd.blocks[idx]
   581  	cd.blocks[b.prev].next = b.next
   582  	cd.blocks[b.next].prev = b.prev
   583  	if idx == *from {
   584  		*from = b.next
   585  	}
   586  }
   587  
   588  // return the block at idx to the linked-list of `to`, specially handled
   589  // if the linked-list is empty
   590  func (cd *Cedar) pushBlock(idx int, to *int, empty bool) {
   591  	b := &cd.blocks[idx]
   592  	if empty {
   593  		*to, b.prev, b.next = idx, idx, idx
   594  		return
   595  	}
   596  
   597  	tailTo := &cd.blocks[*to].prev
   598  	b.prev = *tailTo
   599  	b.next = *to
   600  	*to, *tailTo, cd.blocks[*tailTo].next = idx, idx, idx
   601  }
   602  
   603  // Reallocate more spaces so that we have more free blocks.
   604  func (cd *Cedar) addBlock() int {
   605  	if cd.size == cd.capacity {
   606  		if cd.capacity*int(unsafe.Sizeof(Node{})) > maxMemStep {
   607  			cd.capacity += maxMemStep / int(unsafe.Sizeof(Node{}))
   608  		} else {
   609  			cd.capacity += cd.capacity
   610  		}
   611  		if cd.useMMap {
   612  			cd.mmap.AddBlock(cd, cd.capacity)
   613  		} else {
   614  			array := cd.array
   615  			cd.array = make([]Node, cd.capacity)
   616  			copy(cd.array, array)
   617  
   618  			nInfos := cd.nInfos
   619  			cd.nInfos = make([]NInfo, cd.capacity)
   620  			copy(cd.nInfos, nInfos)
   621  
   622  			blocks := cd.blocks
   623  			cd.blocks = make([]Block, cd.capacity>>8)
   624  			copy(cd.blocks, blocks)
   625  		}
   626  
   627  	}
   628  
   629  	cd.blocks[cd.size>>8].init()
   630  	cd.blocks[cd.size>>8].eHead = cd.size
   631  
   632  	// make it a doubley linked list
   633  	cd.array[cd.size] = Node{baseV: -(cd.size + 255), check: -(cd.size + 1)}
   634  	for i := cd.size + 1; i < cd.size+255; i++ {
   635  		cd.array[i] = Node{baseV: -(i - 1), check: -(i + 1)}
   636  	}
   637  	cd.array[cd.size+255] = Node{baseV: -(cd.size + 254), check: -cd.size}
   638  
   639  	// append to block Open
   640  	cd.pushBlock(cd.size>>8, &cd.blocksHeadOpen, cd.blocksHeadOpen == 0)
   641  	cd.size += 256
   642  	return cd.size>>8 - 1
   643  }
   644  
   645  // transfer the block at idx from the linked-list of `from` to the linked-list of `to`,
   646  // specially handle the case where the destination linked-list is empty.
   647  func (cd *Cedar) transferBlock(idx int, from, to *int) {
   648  	b := cd.blocks[idx]
   649  	cd.popBlock(idx, from, idx == b.next) // b.next it's the last one if the next points to itself
   650  	cd.pushBlock(idx, to, *to == 0 && b.num != 0)
   651  }