// github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/util/fst/builder.go

package fst

import (
	"fmt"
	"github.com/balzaczyy/golucene/core/util"
)

/*
Builds a minimal FST (maps an []int term to an arbitrary output) from
pre-sorted terms with outputs. The FST becomes an FSA if you use
NoOutputs. The FST is written on-the-fly into a compact serialized
format byte array, which can be saved to / loaded from a Directory or
used directly for traversal. The FST is always finite (no cycles).

NOTE: the algorithm is described at
http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.24.3698

FSTs larger than 2.1GB are now possible (as of Lucene 4.2). FSTs
containing more than 2.1B nodes are also now possible, however they
cannot be packed.
*/
type Builder struct {
	dedupHash *NodeHash
	fst       *FST
	NO_OUTPUT interface{}

	// simplistic pruning: we prune node (and all following nodes) if
	// less than this number of terms go through it:
	minSuffixCount1 int

	// better pruning: we prune node (and all following nodes) if the
	// prior node has less than this number of terms go through it:
	minSuffixCount2 int

	doShareNonSingletonNodes bool
	shareMaxTailLength       int

	lastInput *util.IntsRefBuilder

	// for packing
	doPackFST               bool
	acceptableOverheadRatio float32

	// current frontier
	frontier []*UnCompiledNode
}

/*
Instantiates an FST/FSA builder with all the possible tuning and
construction tweaks. Read parameter documentation carefully.

...
*/
func NewBuilder(inputType InputType, minSuffixCount1, minSuffixCount2 int,
	doShareSuffix, doShareNonSingletonNodes bool, shareMaxTailLength int,
	outputs Outputs, doPackFST bool,
	acceptableOverheadRatio float32, allowArrayArcs bool, bytesPageBits int) *Builder {

	fst := newFST(inputType, outputs, doPackFST, acceptableOverheadRatio, allowArrayArcs, bytesPageBits)
	f := make([]*UnCompiledNode, 10)
	ans := &Builder{
		minSuffixCount1:          minSuffixCount1,
		minSuffixCount2:          minSuffixCount2,
		doShareNonSingletonNodes: doShareNonSingletonNodes,
		shareMaxTailLength:       shareMaxTailLength,
		doPackFST:                doPackFST,
		acceptableOverheadRatio:  acceptableOverheadRatio,
		fst:                      fst,
		NO_OUTPUT:                outputs.NoOutput(),
		frontier:                 f,
		lastInput:                util.NewIntsRefBuilder(),
	}
	if doShareSuffix {
		ans.dedupHash = newNodeHash(fst, fst.bytes.reverseReaderAllowSingle(false))
	}
	for i := range f {
		f[i] = NewUnCompiledNode(ans, i)
	}
	return ans
}

func (b *Builder) compileNode(nodeIn *UnCompiledNode, tailLength int) (*CompiledNode, error) {
	var node int64
	var err error
	if b.dedupHash != nil &&
		(b.doShareNonSingletonNodes || nodeIn.NumArcs <= 1) &&
		tailLength <= b.shareMaxTailLength {
		if nodeIn.NumArcs == 0 {
			node, err = b.fst.addNode(nodeIn)
		} else {
			node, err = b.dedupHash.add(nodeIn)
		}
	} else {
		node, err = b.fst.addNode(nodeIn)
	}
	if err != nil {
		return nil, err
	}
	assert(node != -2)

	nodeIn.Clear()

	return &CompiledNode{node}, nil
}
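// viaDedupHash is an illustrative sketch, not part of the original port: it
// restates the predicate compileNode applies above when deciding whether a
// frozen node is looked up in the suffix-sharing dedupHash instead of being
// written straight to the FST. Note that arc-less nodes are always written
// directly, even when suffix sharing is enabled.
func (b *Builder) viaDedupHash(nodeIn *UnCompiledNode, tailLength int) bool {
	return b.dedupHash != nil &&
		(b.doShareNonSingletonNodes || nodeIn.NumArcs <= 1) &&
		tailLength <= b.shareMaxTailLength &&
		nodeIn.NumArcs > 0
}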
func (b *Builder) freezeTail(prefixLenPlus1 int) error {
	// fmt.Printf("  compileTail %v\n", prefixLenPlus1)
	downTo := prefixLenPlus1
	if downTo < 1 {
		downTo = 1
	}
	for idx := b.lastInput.Length(); idx >= downTo; idx-- {
		doPrune := false
		doCompile := false

		node := b.frontier[idx]
		parent := b.frontier[idx-1]

		if node.InputCount < int64(b.minSuffixCount1) {
			doPrune = true
			doCompile = true
		} else if idx > prefixLenPlus1 {
			// prune if parent's inputCount is less than suffixMinCount2
			if parent.InputCount < int64(b.minSuffixCount2) ||
				b.minSuffixCount2 == 1 && parent.InputCount == 1 && idx > 1 {
				// my parent, about to be compiled, doesn't make the cut, so
				// I'm definitely pruned

				// if minSuffixCount2 is 1, we keep only up
				// until the 'distinguished edge', ie we keep only the
				// 'divergent' part of the FST. if my parent, about to be
				// compiled, has inputCount 1 then we are already past the
				// distinguished edge. NOTE: this only works if
				// the FST outputs are not "compressible" (simple
				// ords ARE compressible).
				doPrune = true
			} else {
				// my parent, about to be compiled, does make the cut, so
				// I'm definitely not pruned
				doPrune = false
			}
			doCompile = true
		} else {
			// if pruning is disabled (count is 0) we can always compile current node
			doCompile = b.minSuffixCount2 == 0
		}

		// fmt.Printf("    label=%c idx=%v inputCount=%v doCompile=%v doPrune=%v\n",
		// 	b.lastInput.At(idx-1), idx, b.frontier[idx].InputCount, doCompile, doPrune)
		if node.InputCount < int64(b.minSuffixCount2) ||
			(b.minSuffixCount2 == 1 && node.InputCount == 1 && idx > 1) {
			// drop all arcs
			panic("not implemented yet")
		}

		if doPrune {
			// this node doesn't make it -- deref it
			node.Clear()
			parent.deleteLast(b.lastInput.At(idx-1), node)
		} else {

			if b.minSuffixCount2 != 0 {
				if err := b.compileAllTargets(node, b.lastInput.Length()-idx); err != nil {
					return err
				}
			}
			nextFinalOutput := node.output

			// we "fake" the node as being final if it has no outgoing arcs;
			// in theory we could leave it as non-final (the FST can
			// represent this), but FSTEnum, Util, etc., have trouble w/
			// non-final dead-end states:
			isFinal := node.IsFinal || node.NumArcs == 0

			if doCompile {
				// this node makes it and we now compile it. first, compile
				// any targets that were previously undecided:
				label := b.lastInput.At(idx - 1)
				node, err := b.compileNode(node, 1+b.lastInput.Length()-idx)
				if err != nil {
					return err
				}
				parent.replaceLast(label, node, nextFinalOutput, isFinal)
			} else {
				panic("not implemented yet")
			}
		}
	}
	return nil
}
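// sharedPrefixLen is an illustrative sketch, not used by the builder itself:
// it mirrors the prefix comparison performed in Add below and returns how many
// leading labels of the previous input and the current input agree. Add then
// calls freezeTail(sharedPrefixLen+1), which compiles (and possibly prunes)
// every frontier node past that shared prefix, since those nodes can no longer
// change once inputs arrive in sorted order.
func sharedPrefixLen(last *util.IntsRefBuilder, input *util.IntsRef) int {
	n := last.Length()
	if input.Length < n {
		n = input.Length
	}
	pos := 0
	for pos < n && last.At(pos) == input.Ints[input.Offset+pos] {
		pos++
	}
	return pos
}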
/*
It's OK to add the same input twice in a row with different outputs,
as long as outputs implements the merge method. Note that input is fully
consumed after this method returns (so the caller is free to reuse it),
but output is not. So if your outputs are mutable (e.g.
ByteSequenceOutputs or IntSequenceOutputs) then you cannot reuse them
across calls.
*/
func (b *Builder) Add(input *util.IntsRef, output interface{}) error {
	// { // debug
	// 	bytes := make([]byte, input.Length)
	// 	for i, _ := range bytes {
	// 		bytes[i] = byte(input.Ints[i])
	// 	}
	// 	if output == NO_OUTPUT {
	// 		fmt.Printf("\nFST ADD: input=%v %v\n", string(bytes), bytes)
	// 	} else {
	// 		panic("not implemented yet")
	// 		// fmt.Printf("\nFST ADD: input=%v %v output=%v", string(bytes), bytes, b.fst.outputs.outputToString(output))
	// 	}
	// }

	// de-dup NO_OUTPUT since it must be a singleton:
	if output == NO_OUTPUT {
		output = NO_OUTPUT
	}

	assert2(b.lastInput.Length() == 0 || !input.Less(b.lastInput.Get()),
		"inputs are added out of order, lastInput=%v vs input=%v",
		b.lastInput.Get(), input)

	if input.Length == 0 {
		// empty input: only allowed as first input. We have to special
		// case this because the packed FST format cannot represent the
		// empty input since 'finalness' is stored on the incoming arc,
		// not on the node
		b.frontier[0].InputCount++
		b.frontier[0].IsFinal = true
		b.fst.setEmptyOutput(output)
		return nil
	}

	// compare shared prefix length
	pos1 := 0
	pos2 := input.Offset
	pos1Stop := b.lastInput.Length()
	if input.Length < pos1Stop {
		pos1Stop = input.Length
	}
	for {
		b.frontier[pos1].InputCount++
		if pos1 >= pos1Stop || b.lastInput.At(pos1) != input.Ints[pos2] {
			break
		}
		pos1++
		pos2++
	}
	prefixLenPlus1 := pos1 + 1

	if len(b.frontier) < input.Length+1 {
		next := make([]*UnCompiledNode, util.Oversize(input.Length+1, util.NUM_BYTES_OBJECT_REF))
		copy(next, b.frontier)
		for idx := len(b.frontier); idx < len(next); idx++ {
			next[idx] = NewUnCompiledNode(b, idx)
		}
		b.frontier = next
	}

	// minimize/compile states from previous input's orphaned suffix
	err := b.freezeTail(prefixLenPlus1)
	if err != nil {
		return err
	}

	// init tail states for current input
	for idx := prefixLenPlus1; idx <= input.Length; idx++ {
		b.frontier[idx-1].addArc(input.Ints[input.Offset+idx-1], b.frontier[idx])
		b.frontier[idx].InputCount++
	}

	lastNode := b.frontier[input.Length]
	if b.lastInput.Length() != input.Length || prefixLenPlus1 != input.Length+1 {
		lastNode.IsFinal = true
		lastNode.output = b.NO_OUTPUT
	}

	// push conflicting outputs forward, only as far as needed
	for idx := 1; idx < prefixLenPlus1; idx++ {
		node := b.frontier[idx]
		parentNode := b.frontier[idx-1]

		lastOutput := parentNode.lastOutput(input.Ints[input.Offset+idx-1])

		var commonOutputPrefix interface{}
		var wordSuffix interface{}

		if lastOutput != b.NO_OUTPUT {
			commonOutputPrefix = b.fst.outputs.Common(output, lastOutput)
			wordSuffix = b.fst.outputs.Subtract(lastOutput, commonOutputPrefix)
			parentNode.setLastOutput(input.Ints[input.Offset+idx-1], commonOutputPrefix)
			node.prependOutput(wordSuffix)
		} else {
			commonOutputPrefix = NO_OUTPUT
		}

		output = b.fst.outputs.Subtract(output, commonOutputPrefix)
	}

	if b.lastInput.Length() == input.Length && prefixLenPlus1 == 1+input.Length {
		// same input more than 1 time in a row, mapping to multiple outputs
		panic("not implemented yet")
	} else {
		// this new arc is private to this new input; set its arc output
		// to the leftover output:
		b.frontier[prefixLenPlus1-1].setLastOutput(input.At(prefixLenPlus1-1), output)
	}

	// save last input
	b.lastInput.CopyInts(input)
	return nil
}
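// addTermSketch is an illustrative convenience wrapper, not part of the
// original port: it wraps a plain []int term in a util.IntsRef (using only the
// Ints/Offset/Length fields Add already relies on) and hands it to Add. As
// with Add itself, callers must supply terms in sorted order.
func (b *Builder) addTermSketch(term []int, output interface{}) error {
	return b.Add(&util.IntsRef{Ints: term, Length: len(term)}, output)
}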
func assert(ok bool) {
	assert2(ok, "assert fail")
}

func assert2(ok bool, msg string, args ...interface{}) {
	if !ok {
		panic(fmt.Sprintf(msg, args...))
	}
}

/*
Returns the final FST. NOTE: this will return nil if nothing is accepted
by the FST.
*/
func (b *Builder) Finish() (*FST, error) {
	root := b.frontier[0]

	// minimize nodes in the last word's suffix
	err := b.freezeTail(0)
	if err != nil {
		return nil, err
	}
	if root.InputCount < int64(b.minSuffixCount1) ||
		root.InputCount < int64(b.minSuffixCount2) || root.NumArcs == 0 {
		if b.fst.emptyOutput == nil {
			return nil, nil
		} else if b.minSuffixCount1 > 0 || b.minSuffixCount2 > 0 {
			// empty string got pruned
			return nil, nil
		}
	} else {
		if b.minSuffixCount2 != 0 {
			err = b.compileAllTargets(root, b.lastInput.Length())
			if err != nil {
				return nil, err
			}
		}
	}
	d, err := b.compileNode(root, b.lastInput.Length())
	if err != nil {
		return nil, err
	}
	err = b.fst.finish(d.node)
	if err != nil {
		return nil, err
	}

	if b.doPackFST {
		n := b.fst.NodeCount() / 4
		if n < 10 {
			n = 10
		}
		return b.fst.pack(3, int(n), b.acceptableOverheadRatio)
	}
	return b.fst, nil
}

func (b *Builder) compileAllTargets(node *UnCompiledNode, tailLength int) error {
	panic("not implemented yet")
}
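// buildFromSortedTerms is a minimal usage sketch, not part of the original
// port: it shows the expected call pattern documented on Add -- feed pre-sorted
// terms (each paired with its output) to Add one by one, then call Finish to
// obtain the FST (which may be nil if nothing was accepted). The caller is
// assumed to have constructed the Builder and converted its terms to
// util.IntsRef elsewhere.
func buildFromSortedTerms(b *Builder, terms []*util.IntsRef, outputs []interface{}) (*FST, error) {
	for i, term := range terms {
		if err := b.Add(term, outputs[i]); err != nil {
			return nil, err
		}
	}
	return b.Finish()
}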
/* Expert: holds a pending (seen but not yet serialized) arc */
type builderArc struct {
	label           int // really an "unsigned" byte
	Target          Node
	isFinal         bool
	output          interface{}
	nextFinalOutput interface{}
}

/*
NOTE: not many instances of Node or CompiledNode are in memory while
the FST is being built; it's only the current "frontier":
*/
type Node interface {
	isCompiled() bool
}

type CompiledNode struct {
	node int64
}

func (n *CompiledNode) isCompiled() bool { return true }

/* Expert: holds a pending (seen but not yet serialized) Node. */
type UnCompiledNode struct {
	owner      *Builder
	NumArcs    int
	Arcs       []*builderArc
	output     interface{}
	IsFinal    bool
	InputCount int64

	// This node's depth, starting from the automaton root.
	depth int
}

func NewUnCompiledNode(owner *Builder, depth int) *UnCompiledNode {
	return &UnCompiledNode{
		owner:  owner,
		Arcs:   []*builderArc{new(builderArc)},
		output: owner.NO_OUTPUT,
		depth:  depth,
	}
}

func (n *UnCompiledNode) isCompiled() bool { return false }

func (n *UnCompiledNode) Clear() {
	n.NumArcs = 0
	n.IsFinal = false
	n.output = n.owner.NO_OUTPUT
	n.InputCount = 0

	// we don't clear the depth here because it never changes
	// for nodes on the frontier (even when reused).
}

func (n *UnCompiledNode) lastOutput(labelToMatch int) interface{} {
	assert(n.NumArcs > 0)
	assert(n.Arcs[n.NumArcs-1].label == labelToMatch)
	return n.Arcs[n.NumArcs-1].output
}

func (n *UnCompiledNode) addArc(label int, target Node) {
	assert(label >= 0)
	if n.NumArcs != 0 {
		assert2(label > n.Arcs[n.NumArcs-1].label,
			"arc[-1].label=%v new label=%v numArcs=%v",
			n.Arcs[n.NumArcs-1].label, label, n.NumArcs)
	}
	if n.NumArcs == len(n.Arcs) {
		newArcs := make([]*builderArc, util.Oversize(n.NumArcs+1, util.NUM_BYTES_OBJECT_REF))
		copy(newArcs, n.Arcs)
		for arcIdx := n.NumArcs; arcIdx < len(newArcs); arcIdx++ {
			newArcs[arcIdx] = new(builderArc)
		}
		n.Arcs = newArcs
	}
	arc := n.Arcs[n.NumArcs]
	n.NumArcs++
	arc.label = label
	arc.Target = target
	arc.output = n.owner.NO_OUTPUT
	arc.nextFinalOutput = n.owner.NO_OUTPUT
	arc.isFinal = false
}

func (n *UnCompiledNode) replaceLast(labelToMatch int, target Node, nextFinalOutput interface{}, isFinal bool) {
	assert(n.NumArcs > 0)
	arc := n.Arcs[n.NumArcs-1]
	assert2(arc.label == labelToMatch, "arc.label=%v vs %v", arc.label, labelToMatch)
	arc.Target = target
	arc.nextFinalOutput = nextFinalOutput
	arc.isFinal = isFinal
}

func (n *UnCompiledNode) deleteLast(label int, target Node) {
	assert(n.NumArcs > 0)
	assert(label == n.Arcs[n.NumArcs-1].label)
	assert(target == n.Arcs[n.NumArcs-1].Target)
	n.NumArcs--
}

func (n *UnCompiledNode) setLastOutput(labelToMatch int, newOutput interface{}) {
	assert(n.NumArcs > 0)
	arc := n.Arcs[n.NumArcs-1]
	assert(arc.label == labelToMatch)
	arc.output = newOutput
}

/* pushes an output prefix forward onto all arcs */
func (n *UnCompiledNode) prependOutput(outputPrefix interface{}) {
	for arcIdx := 0; arcIdx < n.NumArcs; arcIdx++ {
		n.Arcs[arcIdx].output = n.owner.fst.outputs.Add(outputPrefix, n.Arcs[arcIdx].output)
	}

	if n.IsFinal {
		n.output = n.owner.fst.outputs.Add(outputPrefix, n.output)
	}
}
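// splitArcOutputSketch is an illustrative sketch, not part of the original
// port: it shows, in isolation, the output-pushing step that Add performs for
// each arc along the shared prefix. The shared portion of the existing arc
// output stays on the parent arc (via setLastOutput), while the remainder is
// pushed forward onto the child's arcs with prependOutput; the same Outputs
// operations (Common and Subtract) used by Add are assumed here.
func splitArcOutputSketch(outputs Outputs, arcOutput, newOutput interface{}) (commonPrefix, arcRemainder interface{}) {
	// commonPrefix stays on the parent arc; arcRemainder gets prepended to
	// the child node's outgoing arcs (and to its final output, if final).
	commonPrefix = outputs.Common(newOutput, arcOutput)
	arcRemainder = outputs.Subtract(arcOutput, commonPrefix)
	return commonPrefix, arcRemainder
}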