// github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/util/fst/builder.go

package fst

import (
	"fmt"
	"github.com/balzaczyy/golucene/core/util"
)

/*
Builds a minimal FST (maps an []int term to an arbitrary output) from
pre-sorted terms with outputs. The FST becomes an FSA if you use
NoOutputs. The FST is written on-the-fly into a compact serialized
format byte array, which can be saved to / loaded from a Directory or
used directly for traversal. The FST is always finite (no cycles).

NOTE: the algorithm is described at
http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.24.3698

FSTs larger than 2.1GB are now possible (as of Lucene 4.2). FSTs
containing more than 2.1B nodes are also now possible, however they
cannot be packed.
*/
type Builder struct {
	dedupHash *NodeHash
	fst       *FST
	NO_OUTPUT interface{}

	// simplistic pruning: we prune node (and all following nodes) if
	// less than this number of terms go through it:
	minSuffixCount1 int

	// better pruning: we prune node (and all following nodes) if the
	// prior node has less than this number of terms go through it:
	minSuffixCount2 int

	doShareNonSingletonNodes bool
	shareMaxTailLength       int

	lastInput *util.IntsRefBuilder

	// for packing
	doPackFST               bool
	acceptableOverheadRatio float32

	// current frontier
	frontier []*UnCompiledNode
}

/*
Instantiates an FST/FSA builder with all the possible tuning and
construction tweaks. Read parameter documentation carefully.

...
*/
func NewBuilder(inputType InputType, minSuffixCount1, minSuffixCount2 int,
	doShareSuffix, doShareNonSingletonNodes bool, shareMaxTailLength int,
	outputs Outputs, doPackFST bool,
	acceptableOverheadRatio float32, allowArrayArcs bool, bytesPageBits int) *Builder {

	fst := newFST(inputType, outputs, doPackFST, acceptableOverheadRatio, allowArrayArcs, bytesPageBits)
	f := make([]*UnCompiledNode, 10)
	ans := &Builder{
		minSuffixCount1:          minSuffixCount1,
		minSuffixCount2:          minSuffixCount2,
		doShareNonSingletonNodes: doShareNonSingletonNodes,
		shareMaxTailLength:       shareMaxTailLength,
		doPackFST:                doPackFST,
		acceptableOverheadRatio:  acceptableOverheadRatio,
		fst:                      fst,
		NO_OUTPUT:                outputs.NoOutput(),
		frontier:                 f,
		lastInput:                util.NewIntsRefBuilder(),
	}
	if doShareSuffix {
		ans.dedupHash = newNodeHash(fst, fst.bytes.reverseReaderAllowSingle(false))
	}
	for i := range f {
		f[i] = NewUnCompiledNode(ans, i)
	}
	return ans
}

func (b *Builder) compileNode(nodeIn *UnCompiledNode, tailLength int) (*CompiledNode, error) {
	var node int64
	var err error
	if b.dedupHash != nil &&
		(b.doShareNonSingletonNodes || nodeIn.NumArcs <= 1) &&
		tailLength <= b.shareMaxTailLength {
		if nodeIn.NumArcs == 0 {
			node, err = b.fst.addNode(nodeIn)
		} else {
			node, err = b.dedupHash.add(nodeIn)
		}
	} else {
		node, err = b.fst.addNode(nodeIn)
	}
	if err != nil {
		return nil, err
	}
	assert(node != -2)

	nodeIn.Clear()

	return &CompiledNode{node}, nil
}
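// viaDedupHash is an illustrative sketch, not part of the original port: it
// restates the predicate compileNode applies above when deciding whether a
// frozen node is looked up in the suffix-sharing dedupHash instead of being
// written straight to the FST. Note that arc-less nodes are always written
// directly, even when suffix sharing is enabled.
func (b *Builder) viaDedupHash(nodeIn *UnCompiledNode, tailLength int) bool {
	return b.dedupHash != nil &&
		(b.doShareNonSingletonNodes || nodeIn.NumArcs <= 1) &&
		tailLength <= b.shareMaxTailLength &&
		nodeIn.NumArcs > 0
}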
func (b *Builder) freezeTail(prefixLenPlus1 int) error {
	// fmt.Printf("  compileTail %v\n", prefixLenPlus1)
	downTo := prefixLenPlus1
	if downTo < 1 {
		downTo = 1
	}
	for idx := b.lastInput.Length(); idx >= downTo; idx-- {
		doPrune := false
		doCompile := false

		node := b.frontier[idx]
		parent := b.frontier[idx-1]

		if node.InputCount < int64(b.minSuffixCount1) {
			doPrune = true
			doCompile = true
		} else if idx > prefixLenPlus1 {
			// prune if parent's inputCount is less than suffixMinCount2
			if parent.InputCount < int64(b.minSuffixCount2) ||
				b.minSuffixCount2 == 1 && parent.InputCount == 1 && idx > 1 {
				// my parent, about to be compiled, doesn't make the cut, so
				// I'm definitely pruned

				// if minSuffixCount2 is 1, we keep only up
				// until the 'distinguished edge', ie we keep only the
				// 'divergent' part of the FST. if my parent, about to be
				// compiled, has inputCount 1 then we are already past the
				// distinguished edge. NOTE: this only works if
				// the FST outputs are not "compressible" (simple
				// ords ARE compressible).
				doPrune = true
			} else {
				// my parent, about to be compiled, does make the cut, so
				// I'm definitely not pruned
				doPrune = false
			}
			doCompile = true
		} else {
			// if pruning is disabled (count is 0) we can always compile current node
			doCompile = b.minSuffixCount2 == 0
		}

		// fmt.Printf("    label=%c idx=%v inputCount=%v doCompile=%v doPrune=%v\n",
		// 	b.lastInput.At(idx-1), idx, b.frontier[idx].InputCount, doCompile, doPrune)
		if node.InputCount < int64(b.minSuffixCount2) ||
			(b.minSuffixCount2 == 1 && node.InputCount == 1 && idx > 1) {
			// drop all arcs
			panic("not implemented yet")
		}

		if doPrune {
			// this node doesn't make it -- deref it
			node.Clear()
			parent.deleteLast(b.lastInput.At(idx-1), node)
		} else {

			if b.minSuffixCount2 != 0 {
				if err := b.compileAllTargets(node, b.lastInput.Length()-idx); err != nil {
					return err
				}
			}
			nextFinalOutput := node.output

			// we "fake" the node as being final if it has no outgoing arcs;
			// in theory we could leave it as non-final (the FST can
			// represent this), but FSTEnum, Util, etc., have trouble w/
			// non-final dead-end states:
			isFinal := node.IsFinal || node.NumArcs == 0

			if doCompile {
				// this node makes it and we now compile it. first, compile
				// any targets that were previously undecided:
				label := b.lastInput.At(idx - 1)
				node, err := b.compileNode(node, 1+b.lastInput.Length()-idx)
				if err != nil {
					return err
				}
				parent.replaceLast(label, node, nextFinalOutput, isFinal)
			} else {
				panic("not implemented yet")
			}
		}
	}
	return nil
}
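// sharedPrefixLen is an illustrative sketch, not used by the builder itself:
// it mirrors the prefix comparison performed in Add below and returns how many
// leading labels of the previous input and the current input agree. Add then
// calls freezeTail(sharedPrefixLen+1), which compiles (and possibly prunes)
// every frontier node past that shared prefix, since those nodes can no longer
// change once inputs arrive in sorted order.
func sharedPrefixLen(last *util.IntsRefBuilder, input *util.IntsRef) int {
	n := last.Length()
	if input.Length < n {
		n = input.Length
	}
	pos := 0
	for pos < n && last.At(pos) == input.Ints[input.Offset+pos] {
		pos++
	}
	return pos
}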
/*
It's OK to add the same input twice in a row with different outputs,
as long as outputs implements the merge method. Note that input is fully
consumed after this method returns (so the caller is free to reuse it),
but output is not. So if your outputs are mutable (e.g.
ByteSequenceOutputs or IntSequenceOutputs) then you cannot reuse them
across calls.
*/
func (b *Builder) Add(input *util.IntsRef, output interface{}) error {
	// { // debug
	// 	bytes := make([]byte, input.Length)
	// 	for i, _ := range bytes {
	// 		bytes[i] = byte(input.Ints[i])
	// 	}
	// 	if output == NO_OUTPUT {
	// 		fmt.Printf("\nFST ADD: input=%v %v\n", string(bytes), bytes)
	// 	} else {
	// 		panic("not implemented yet")
	// 		// fmt.Printf("\nFST ADD: input=%v %v output=%v", string(bytes), bytes, b.fst.outputs.outputToString(output))
	// 	}
	// }

	// de-dup NO_OUTPUT since it must be a singleton:
	if output == NO_OUTPUT {
		output = NO_OUTPUT
	}

	assert2(b.lastInput.Length() == 0 || !input.Less(b.lastInput.Get()),
		"inputs are added out of order, lastInput=%v vs input=%v",
		b.lastInput.Get(), input)

	if input.Length == 0 {
		// empty input: only allowed as first input. We have to special
		// case this because the packed FST format cannot represent the
		// empty input since 'finalness' is stored on the incoming arc,
		// not on the node
		b.frontier[0].InputCount++
		b.frontier[0].IsFinal = true
		b.fst.setEmptyOutput(output)
		return nil
	}

	// compare shared prefix length
	pos1 := 0
	pos2 := input.Offset
	pos1Stop := b.lastInput.Length()
	if input.Length < pos1Stop {
		pos1Stop = input.Length
	}
	for {
		b.frontier[pos1].InputCount++
		if pos1 >= pos1Stop || b.lastInput.At(pos1) != input.Ints[pos2] {
			break
		}
		pos1++
		pos2++
	}
	prefixLenPlus1 := pos1 + 1

	if len(b.frontier) < input.Length+1 {
		next := make([]*UnCompiledNode, util.Oversize(input.Length+1, util.NUM_BYTES_OBJECT_REF))
		copy(next, b.frontier)
		for idx := len(b.frontier); idx < len(next); idx++ {
			next[idx] = NewUnCompiledNode(b, idx)
		}
		b.frontier = next
	}

	// minimize/compile states from previous input's orphaned suffix
	err := b.freezeTail(prefixLenPlus1)
	if err != nil {
		return err
	}

	// init tail states for current input
	for idx := prefixLenPlus1; idx <= input.Length; idx++ {
		b.frontier[idx-1].addArc(input.Ints[input.Offset+idx-1], b.frontier[idx])
		b.frontier[idx].InputCount++
	}

	lastNode := b.frontier[input.Length]
	if b.lastInput.Length() != input.Length || prefixLenPlus1 != input.Length+1 {
		lastNode.IsFinal = true
		lastNode.output = b.NO_OUTPUT
	}

	// push conflicting outputs forward, only as far as needed
	for idx := 1; idx < prefixLenPlus1; idx++ {
		node := b.frontier[idx]
		parentNode := b.frontier[idx-1]

		lastOutput := parentNode.lastOutput(input.Ints[input.Offset+idx-1])

		var commonOutputPrefix interface{}
		var wordSuffix interface{}

		if lastOutput != b.NO_OUTPUT {
			commonOutputPrefix = b.fst.outputs.Common(output, lastOutput)
			wordSuffix = b.fst.outputs.Subtract(lastOutput, commonOutputPrefix)
			parentNode.setLastOutput(input.Ints[input.Offset+idx-1], commonOutputPrefix)
			node.prependOutput(wordSuffix)
		} else {
			commonOutputPrefix = NO_OUTPUT
		}

		output = b.fst.outputs.Subtract(output, commonOutputPrefix)
	}

	if b.lastInput.Length() == input.Length && prefixLenPlus1 == 1+input.Length {
		// same input more than 1 time in a row, mapping to multiple outputs
		panic("not implemented yet")
	} else {
		// this new arc is private to this new input; set its arc output
		// to the leftover output:
		b.frontier[prefixLenPlus1-1].setLastOutput(input.At(prefixLenPlus1-1), output)
	}

	// save last input
	b.lastInput.CopyInts(input)
	return nil
}
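// addTermSketch is an illustrative convenience wrapper, not part of the
// original port: it wraps a plain []int term in a util.IntsRef (using only the
// Ints/Offset/Length fields Add already relies on) and hands it to Add. As
// with Add itself, callers must supply terms in sorted order.
func (b *Builder) addTermSketch(term []int, output interface{}) error {
	return b.Add(&util.IntsRef{Ints: term, Length: len(term)}, output)
}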
func assert(ok bool) {
	assert2(ok, "assert fail")
}

func assert2(ok bool, msg string, args ...interface{}) {
	if !ok {
		panic(fmt.Sprintf(msg, args...))
	}
}

/*
Returns the final FST. NOTE: this will return nil if nothing is accepted
by the FST.
*/
func (b *Builder) Finish() (*FST, error) {
	root := b.frontier[0]

	// minimize nodes in the last word's suffix
	err := b.freezeTail(0)
	if err != nil {
		return nil, err
	}
	if root.InputCount < int64(b.minSuffixCount1) ||
		root.InputCount < int64(b.minSuffixCount2) || root.NumArcs == 0 {
		if b.fst.emptyOutput == nil {
			return nil, nil
		} else if b.minSuffixCount1 > 0 || b.minSuffixCount2 > 0 {
			// empty string got pruned
			return nil, nil
		}
	} else {
		if b.minSuffixCount2 != 0 {
			err = b.compileAllTargets(root, b.lastInput.Length())
			if err != nil {
				return nil, err
			}
		}
	}
	d, err := b.compileNode(root, b.lastInput.Length())
	if err != nil {
		return nil, err
	}
	err = b.fst.finish(d.node)
	if err != nil {
		return nil, err
	}

	if b.doPackFST {
		n := b.fst.NodeCount() / 4
		if n < 10 {
			n = 10
		}
		return b.fst.pack(3, int(n), b.acceptableOverheadRatio)
	}
	return b.fst, nil
}

func (b *Builder) compileAllTargets(node *UnCompiledNode, tailLength int) error {
	panic("not implemented yet")
}
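// buildFromSortedTerms is a minimal usage sketch, not part of the original
// port: it shows the expected call pattern documented on Add -- feed pre-sorted
// terms (each paired with its output) to Add one by one, then call Finish to
// obtain the FST (which may be nil if nothing was accepted). The caller is
// assumed to have constructed the Builder and converted its terms to
// util.IntsRef elsewhere.
func buildFromSortedTerms(b *Builder, terms []*util.IntsRef, outputs []interface{}) (*FST, error) {
	for i, term := range terms {
		if err := b.Add(term, outputs[i]); err != nil {
			return nil, err
		}
	}
	return b.Finish()
}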
/* Expert: holds a pending (seen but not yet serialized) arc */
type builderArc struct {
	label           int // really an "unsigned" byte
	Target          Node
	isFinal         bool
	output          interface{}
	nextFinalOutput interface{}
}

/*
NOTE: not many instances of Node or CompiledNode are in memory while
the FST is being built; it's only the current "frontier":
*/
type Node interface {
	isCompiled() bool
}

type CompiledNode struct {
	node int64
}

func (n *CompiledNode) isCompiled() bool { return true }

/* Expert: holds a pending (seen but not yet serialized) Node. */
type UnCompiledNode struct {
	owner      *Builder
	NumArcs    int
	Arcs       []*builderArc
	output     interface{}
	IsFinal    bool
	InputCount int64

	// This node's depth, starting from the automaton root.
	depth int
}

func NewUnCompiledNode(owner *Builder, depth int) *UnCompiledNode {
	return &UnCompiledNode{
		owner:  owner,
		Arcs:   []*builderArc{new(builderArc)},
		output: owner.NO_OUTPUT,
		depth:  depth,
	}
}

func (n *UnCompiledNode) isCompiled() bool { return false }

func (n *UnCompiledNode) Clear() {
	n.NumArcs = 0
	n.IsFinal = false
	n.output = n.owner.NO_OUTPUT
	n.InputCount = 0

	// we don't clear the depth here because it never changes
	// for nodes on the frontier (even when reused).
}

func (n *UnCompiledNode) lastOutput(labelToMatch int) interface{} {
	assert(n.NumArcs > 0)
	assert(n.Arcs[n.NumArcs-1].label == labelToMatch)
	return n.Arcs[n.NumArcs-1].output
}

func (n *UnCompiledNode) addArc(label int, target Node) {
	assert(label >= 0)
	if n.NumArcs != 0 {
		assert2(label > n.Arcs[n.NumArcs-1].label,
			"arc[-1].label=%v new label=%v numArcs=%v",
			n.Arcs[n.NumArcs-1].label, label, n.NumArcs)
	}
	if n.NumArcs == len(n.Arcs) {
		newArcs := make([]*builderArc, util.Oversize(n.NumArcs+1, util.NUM_BYTES_OBJECT_REF))
		copy(newArcs, n.Arcs)
		for arcIdx := n.NumArcs; arcIdx < len(newArcs); arcIdx++ {
			newArcs[arcIdx] = new(builderArc)
		}
		n.Arcs = newArcs
	}
	arc := n.Arcs[n.NumArcs]
	n.NumArcs++
	arc.label = label
	arc.Target = target
	arc.output = n.owner.NO_OUTPUT
	arc.nextFinalOutput = n.owner.NO_OUTPUT
	arc.isFinal = false
}

func (n *UnCompiledNode) replaceLast(labelToMatch int, target Node, nextFinalOutput interface{}, isFinal bool) {
	assert(n.NumArcs > 0)
	arc := n.Arcs[n.NumArcs-1]
	assert2(arc.label == labelToMatch, "arc.label=%v vs %v", arc.label, labelToMatch)
	arc.Target = target
	arc.nextFinalOutput = nextFinalOutput
	arc.isFinal = isFinal
}

func (n *UnCompiledNode) deleteLast(label int, target Node) {
	assert(n.NumArcs > 0)
	assert(label == n.Arcs[n.NumArcs-1].label)
	assert(target == n.Arcs[n.NumArcs-1].Target)
	n.NumArcs--
}

func (n *UnCompiledNode) setLastOutput(labelToMatch int, newOutput interface{}) {
	assert(n.NumArcs > 0)
	arc := n.Arcs[n.NumArcs-1]
	assert(arc.label == labelToMatch)
	arc.output = newOutput
}

/* pushes an output prefix forward onto all arcs */
func (n *UnCompiledNode) prependOutput(outputPrefix interface{}) {
	for arcIdx := 0; arcIdx < n.NumArcs; arcIdx++ {
		n.Arcs[arcIdx].output = n.owner.fst.outputs.Add(outputPrefix, n.Arcs[arcIdx].output)
	}

	if n.IsFinal {
		n.output = n.owner.fst.outputs.Add(outputPrefix, n.output)
	}
}
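// splitArcOutputSketch is an illustrative sketch, not part of the original
// port: it shows, in isolation, the output-pushing step that Add performs for
// each arc along the shared prefix. The shared portion of the existing arc
// output stays on the parent arc (via setLastOutput), while the remainder is
// pushed forward onto the child's arcs with prependOutput; the same Outputs
// operations (Common and Subtract) used by Add are assumed here.
func splitArcOutputSketch(outputs Outputs, arcOutput, newOutput interface{}) (commonPrefix, arcRemainder interface{}) {
	// commonPrefix stays on the parent arc; arcRemainder gets prepended to
	// the child node's outgoing arcs (and to its final output, if final).
	commonPrefix = outputs.Common(newOutput, arcOutput)
	arcRemainder = outputs.Subtract(arcOutput, commonPrefix)
	return commonPrefix, arcRemainder
}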