github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/util/fst/fst.go (about) 1 package fst 2 3 import ( 4 "bytes" 5 "fmt" 6 "github.com/balzaczyy/golucene/core/codec" 7 "github.com/balzaczyy/golucene/core/store" 8 "github.com/balzaczyy/golucene/core/util" 9 "github.com/balzaczyy/golucene/core/util/packed" 10 "math" 11 "reflect" 12 ) 13 14 // util/fst/FST.java 15 16 var ARC_SHALLOW_RAM_BYTES_USED = util.ShallowSizeOfInstance(reflect.TypeOf(Arc{})) 17 18 type InputType int 19 20 const ( 21 INPUT_TYPE_BYTE1 = 1 22 INPUT_TYPE_BYTE2 = 2 23 INPUT_TYPE_BYTE4 = 3 24 ) 25 26 const ( 27 FST_BIT_FINAL_ARC = byte(1 << 0) 28 FST_BIT_LAST_ARC = byte(1 << 1) 29 FST_BIT_TARGET_NEXT = byte(1 << 2) 30 FST_BIT_STOP_NODE = byte(1 << 3) 31 FST_BIT_ARC_HAS_OUTPUT = byte(1 << 4) 32 FST_BIT_ARC_HAS_FINAL_OUTPUT = byte(1 << 5) 33 FST_BIT_TARGET_DELTA = byte(1 << 6) 34 FST_ARCS_AS_FIXED_ARRAY = FST_BIT_ARC_HAS_FINAL_OUTPUT 35 36 FIXED_ARRAY_SHALLOW_DISTANCE = 3 // 0 => only root node 37 FIXED_ARRAY_NUM_ARCS_SHALLOW = 5 38 FIXED_ARRAY_NUM_ARCS_DEEP = 10 39 40 FST_FILE_FORMAT_NAME = "FST" 41 FST_VERSION_PACKED = 3 42 FST_VERSION_VINT_TARGET = 4 43 44 VERSION_CURRENT = FST_VERSION_VINT_TARGET 45 46 FST_FINAL_END_NODE = -1 47 FST_NON_FINAL_END_NODE = 0 48 49 /** If arc has this label then that arc is final/accepted */ 50 FST_END_LABEL = -1 51 52 FST_DEFAULT_MAX_BLOCK_BITS = 28 // 30 for 64 bit int 53 ) 54 55 // Represents a single arc 56 type Arc struct { 57 Label int 58 Output interface{} 59 node int64 // from node 60 target int64 // to node 61 flags byte 62 NextFinalOutput interface{} 63 nextArc int64 64 posArcsStart int64 65 bytesPerArc int 66 arcIdx int 67 numArcs int 68 } 69 70 func (arc *Arc) copyFrom(other *Arc) *Arc { 71 arc.node = other.node 72 arc.Label = other.Label 73 arc.target = other.target 74 arc.flags = other.flags 75 arc.Output = other.Output 76 arc.NextFinalOutput = other.NextFinalOutput 77 arc.nextArc = other.nextArc 78 arc.bytesPerArc = other.bytesPerArc 79 if other.bytesPerArc != 0 { 80 arc.posArcsStart = other.posArcsStart 81 arc.arcIdx = other.arcIdx 82 arc.numArcs = other.numArcs 83 } 84 return arc 85 } 86 87 func (arc *Arc) flag(flag byte) bool { 88 return hasFlag(arc.flags, flag) 89 } 90 91 func (arc *Arc) isLast() bool { 92 return arc.flag(FST_BIT_LAST_ARC) 93 } 94 95 func (arc *Arc) IsFinal() bool { 96 return arc.flag(FST_BIT_FINAL_ARC) 97 } 98 99 func (arc *Arc) String() string { 100 var b bytes.Buffer 101 fmt.Fprintf(&b, "node=%v target=%v label=%v", arc.node, arc.target, util.ItoHex(int64(arc.Label))) 102 if arc.flag(FST_BIT_FINAL_ARC) { 103 fmt.Fprintf(&b, " final") 104 } 105 if arc.flag(FST_BIT_LAST_ARC) { 106 fmt.Fprintf(&b, " last") 107 } 108 if arc.flag(FST_BIT_TARGET_NEXT) { 109 fmt.Fprintf(&b, " targetNext") 110 } 111 if arc.flag(FST_BIT_STOP_NODE) { 112 fmt.Fprintf(&b, " stop") 113 } 114 if arc.flag(FST_BIT_ARC_HAS_OUTPUT) { 115 fmt.Fprintf(&b, " output=%v", arc.Output) 116 } 117 if arc.flag(FST_BIT_ARC_HAS_FINAL_OUTPUT) { 118 fmt.Fprintf(&b, " nextFinalOutput=%v", arc.NextFinalOutput) 119 } 120 if arc.bytesPerArc != 0 { 121 fmt.Fprintf(&b, " arcArray(idx=%v of %v)", arc.arcIdx, arc.numArcs) 122 } 123 return b.String() 124 } 125 126 func hasFlag(flags, bit byte) bool { 127 return (flags & bit) != 0 128 } 129 130 type FST struct { 131 inputType InputType 132 bytesPerArc []int 133 // if non-null, this FST accepts the empty string and 134 // produces this output 135 emptyOutput interface{} 136 137 bytes *BytesStore 138 139 startNode int64 140 141 outputs Outputs 142 143 lastFrozenNode int64 144 145 NO_OUTPUT interface{} 146 147 nodeCount int64 148 arcCount int64 149 arcWithOutputCount int64 150 151 packed bool 152 nodeRefToAddress packed.PackedIntsReader 153 154 allowArrayArcs bool 155 156 cachedRootArcs []*Arc 157 assertingCachedRootArcs []*Arc // only set wit assert 158 159 version int32 160 161 nodeAddress *packed.GrowableWriter 162 163 // TODO: we could be smarter here, and prune periodically as we go; 164 // high in-count nodes will "usually" become clear early on: 165 inCounts *packed.GrowableWriter 166 167 cachedArcsBytesUsed int 168 } 169 170 /* Make a new empty FST, for building; Builder invokes this ctor */ 171 func newFST(inputType InputType, outputs Outputs, willPackFST bool, 172 acceptableOverheadRatio float32, allowArrayArcs bool, 173 bytesPageBits int) *FST { 174 bytes := newBytesStoreFromBits(uint32(bytesPageBits)) 175 // pad: ensure no node gets address 0 which is reserved to mean 176 // the stop state w/ no arcs 177 bytes.WriteByte(0) 178 ans := &FST{ 179 inputType: inputType, 180 outputs: outputs, 181 allowArrayArcs: allowArrayArcs, 182 version: VERSION_CURRENT, 183 bytes: bytes, 184 NO_OUTPUT: outputs.NoOutput(), 185 startNode: -1, 186 } 187 if willPackFST { 188 ans.nodeAddress = packed.NewGrowableWriter(15, 8, acceptableOverheadRatio) 189 ans.inCounts = packed.NewGrowableWriter(1, 8, acceptableOverheadRatio) 190 } 191 return ans 192 } 193 194 func LoadFST(in util.DataInput, outputs Outputs) (fst *FST, err error) { 195 return loadFST3(in, outputs, FST_DEFAULT_MAX_BLOCK_BITS) 196 } 197 198 /** Load a previously saved FST; maxBlockBits allows you to 199 * control the size of the byte[] pages used to hold the FST bytes. */ 200 func loadFST3(in util.DataInput, outputs Outputs, maxBlockBits uint32) (fst *FST, err error) { 201 // log.Printf("Loading FST from %v and output to %v...", in, outputs) 202 // defer func() { 203 // if err != nil { 204 // log.Print("Failed to load FST.") 205 // } 206 // }() 207 fst = &FST{outputs: outputs, startNode: -1} 208 209 if maxBlockBits < 1 || maxBlockBits > 30 { 210 panic(fmt.Sprintf("maxBlockBits should 1..30; got %v", maxBlockBits)) 211 } 212 213 // NOTE: only reads most recent format; we don't have 214 // back-compat promise for FSTs (they are experimental): 215 fst.version, err = codec.CheckHeader(in, FST_FILE_FORMAT_NAME, FST_VERSION_PACKED, FST_VERSION_VINT_TARGET) 216 if err != nil { 217 return nil, err 218 } 219 if b, err := in.ReadByte(); err == nil { 220 fst.packed = (b == 1) 221 } else { 222 return nil, err 223 } 224 if b, err := in.ReadByte(); err == nil { 225 if b == 1 { 226 // accepts empty string 227 // 1 KB blocks: 228 emptyBytes := newBytesStoreFromBits(10) 229 if numBytes, err := in.ReadVInt(); err == nil { 230 // log.Printf("Number of bytes: %v", numBytes) 231 emptyBytes.CopyBytes(in, int64(numBytes)) 232 233 // De-serialize empty-string output: 234 var reader BytesReader 235 if fst.packed { 236 // log.Printf("Forward reader.") 237 reader = emptyBytes.forwardReader() 238 } else { 239 // log.Printf("Reverse reader.") 240 reader = emptyBytes.reverseReader() 241 // NoOutputs uses 0 bytes when writing its output, 242 // so we have to check here else BytesStore gets 243 // angry: 244 if numBytes > 0 { 245 reader.setPosition(int64(numBytes - 1)) 246 } 247 } 248 // log.Printf("Reading final output from %v to %v...\n", reader, outputs) 249 fst.emptyOutput, err = outputs.ReadFinalOutput(reader) 250 } 251 } // else emptyOutput = nil 252 } 253 if err != nil { 254 return nil, err 255 } 256 257 if t, err := in.ReadByte(); err == nil { 258 switch t { 259 case 0: 260 fst.inputType = INPUT_TYPE_BYTE1 261 case 1: 262 fst.inputType = INPUT_TYPE_BYTE2 263 case 2: 264 fst.inputType = INPUT_TYPE_BYTE4 265 default: 266 panic(fmt.Sprintf("invalid input type %v", t)) 267 } 268 } else { 269 return nil, err 270 } 271 272 if fst.packed { 273 if fst.nodeRefToAddress, err = packed.NewPackedReader(in); err != nil { 274 return nil, err 275 } 276 } // else nodeRefToAddress = nil 277 278 if fst.startNode, err = in.ReadVLong(); err == nil { 279 if fst.nodeCount, err = in.ReadVLong(); err == nil { 280 if fst.arcCount, err = in.ReadVLong(); err == nil { 281 if fst.arcWithOutputCount, err = in.ReadVLong(); err == nil { 282 if numBytes, err := in.ReadVLong(); err == nil { 283 if fst.bytes, err = newBytesStoreFromInput(in, numBytes, 1<<maxBlockBits); err == nil { 284 fst.NO_OUTPUT = outputs.NoOutput() 285 286 err = fst.cacheRootArcs() 287 288 // NOTE: bogus because this is only used during 289 // building; we need to break out mutable FST from 290 // immutable 291 // fst.allowArrayArcs = false 292 } 293 } 294 } 295 } 296 } 297 } 298 return fst, err 299 } 300 301 func (t *FST) ramBytesUsed(arcs []*Arc) int64 { 302 var size int64 303 if arcs != nil { 304 size += util.ShallowSizeOf(arcs) 305 for _, arc := range arcs { 306 if arc != nil { 307 size += ARC_SHALLOW_RAM_BYTES_USED 308 if arc.Output != nil && arc.Output != t.outputs.NoOutput() { 309 size += t.outputs.ramBytesUsed(arc.Output) 310 } 311 if arc.NextFinalOutput != nil && arc.NextFinalOutput != t.outputs.NoOutput() { 312 size += t.outputs.ramBytesUsed(arc.NextFinalOutput) 313 } 314 } 315 } 316 } 317 return size 318 } 319 320 func (t *FST) finish(newStartNode int64) error { 321 assert2(t.startNode == -1, "already finished") 322 if newStartNode == FST_FINAL_END_NODE && t.emptyOutput != nil { 323 newStartNode = 0 324 } 325 t.startNode = newStartNode 326 t.bytes.finish() 327 328 return t.cacheRootArcs() 329 } 330 331 func (t *FST) getNodeAddress(node int64) int64 { 332 if t.nodeAddress != nil { // Deref 333 return t.nodeAddress.Get(int(node)) 334 } else { // Straight 335 return node 336 } 337 } 338 339 func (t *FST) cacheRootArcs() error { 340 t.cachedRootArcs = make([]*Arc, 0x80) 341 t.readRootArcs(t.cachedRootArcs) 342 t.cachedArcsBytesUsed += int(t.ramBytesUsed(t.cachedRootArcs)) 343 344 if err := t.setAssertingRootArcs(t.cachedRootArcs); err != nil { 345 return err 346 } 347 t.assertRootArcs() 348 return nil 349 } 350 351 func (t *FST) readRootArcs(arcs []*Arc) (err error) { 352 arc := &Arc{} 353 t.FirstArc(arc) 354 in := t.BytesReader() 355 if targetHasArcs(arc) { 356 _, err = t.readFirstRealTargetArc(arc.target, arc, in) 357 for err == nil { 358 if arc.Label == FST_END_LABEL { 359 panic("assert fail") 360 } 361 if arc.Label >= len(t.cachedRootArcs) { 362 break 363 } 364 arcs[arc.Label] = (&Arc{}).copyFrom(arc) 365 if arc.isLast() { 366 break 367 } 368 _, err = t.readNextRealArc(arc, in) 369 } 370 } 371 return err 372 } 373 374 func (t *FST) setAssertingRootArcs(arcs []*Arc) error { 375 t.assertingCachedRootArcs = make([]*Arc, len(arcs)) 376 err := t.readRootArcs(t.assertingCachedRootArcs) 377 if err == nil { 378 t.cachedArcsBytesUsed *= 2 379 } 380 return err 381 } 382 383 func (t *FST) assertRootArcs() { 384 if t.cachedRootArcs == nil || t.assertingCachedRootArcs == nil { 385 panic("assert fail") 386 } 387 for i, v := range t.assertingCachedRootArcs { 388 root := t.cachedRootArcs[i] 389 asserting := v 390 if root != nil { 391 assert(root.arcIdx == asserting.arcIdx) 392 assert(root.bytesPerArc == asserting.bytesPerArc) 393 assert(root.flags == asserting.flags) 394 assert(root.Label == asserting.Label) 395 assert(root.nextArc == asserting.nextArc) 396 assert2(equals(root.NextFinalOutput, asserting.NextFinalOutput), 397 "%v != %v", root.NextFinalOutput, asserting.NextFinalOutput) 398 assert(root.node == asserting.node) 399 assert(root.numArcs == asserting.numArcs) 400 assert(equals(root.Output, asserting.Output)) 401 assert(root.posArcsStart == asserting.posArcsStart) 402 assert(root.target == asserting.target) 403 } else { 404 assert(asserting == nil) 405 } 406 } 407 } 408 409 // Since Go doesn't has Java's Object.equals() method, 410 // I have to implement my own. 411 func equals(a, b interface{}) bool { 412 sameType := reflect.TypeOf(a) == reflect.TypeOf(b) 413 if _, ok := a.([]byte); ok { 414 if _, ok := b.([]byte); !ok { 415 // panic(fmt.Sprintf("incomparable type: %v vs %v", a, b)) 416 return false 417 } 418 b1 := a.([]byte) 419 b2 := b.([]byte) 420 if len(b1) != len(b2) { 421 return false 422 } 423 for i := 0; i < len(b1) && i < len(b2); i++ { 424 if b1[i] != b2[i] { 425 return false 426 } 427 } 428 return true 429 } else if _, ok := a.(int64); ok { 430 if _, ok := b.(int64); !ok { 431 // panic(fmt.Sprintf("incomparable type: %v vs %v", a, b)) 432 return false 433 } 434 return a.(int64) == b.(int64) 435 } else if a == nil && b == nil { 436 return true 437 } else if sameType && a == b { 438 return true 439 } 440 return false 441 } 442 443 func CompareFSTValue(a, b interface{}) bool { 444 return equals(a, b) 445 } 446 447 func (t *FST) EmptyOutput() interface{} { 448 return t.emptyOutput 449 } 450 451 // L493 452 func (t *FST) setEmptyOutput(v interface{}) { 453 if t.emptyOutput != nil { 454 t.emptyOutput = t.outputs.merge(t.emptyOutput, v) 455 } else { 456 t.emptyOutput = v 457 } 458 } 459 460 func (t *FST) Save(out util.DataOutput) error { 461 assert2(t.startNode != -1, "call finish first") 462 assert2(t.nodeAddress == nil, "cannot save an FST pre-packaged FST; it must first be packed") 463 _, ok := t.nodeRefToAddress.(packed.Mutable) 464 assert2(!t.packed || ok, "cannot save a FST which has been loaded from disk ") 465 err := codec.WriteHeader(out, FST_FILE_FORMAT_NAME, VERSION_CURRENT) 466 if err == nil && t.packed { 467 err = out.WriteByte(1) 468 } else { 469 err = out.WriteByte(0) 470 } 471 // TODO: really we should encode this as an arc, arriving 472 // to the root node, instead of special casing here: 473 if err == nil && t.emptyOutput != nil { 474 // accepts empty string 475 err = out.WriteByte(1) 476 477 if err == nil { 478 // serialize empty-string output: 479 ros := store.NewRAMOutputStreamBuffer() 480 err = t.outputs.writeFinalOutput(t.emptyOutput, ros) 481 482 if err == nil { 483 emptyOutputBytes := make([]byte, ros.FilePointer()) 484 err = ros.WriteToBytes(emptyOutputBytes) 485 486 length := len(emptyOutputBytes) 487 if err == nil && !t.packed { 488 // reverse 489 stopAt := length / 2 490 for upto := 0; upto < stopAt; upto++ { 491 emptyOutputBytes[upto], emptyOutputBytes[length-upto-1] = 492 emptyOutputBytes[length-upto-1], emptyOutputBytes[upto] 493 } 494 } 495 if err == nil { 496 err = out.WriteVInt(int32(length)) 497 if err == nil { 498 err = out.WriteBytes(emptyOutputBytes) 499 } 500 } 501 } 502 } 503 } else if err == nil { 504 err = out.WriteByte(0) 505 } 506 if err != nil { 507 return err 508 } 509 510 var tb byte 511 switch int(t.inputType) { 512 case INPUT_TYPE_BYTE1: 513 tb = 0 514 case INPUT_TYPE_BYTE2: 515 tb = 1 516 default: 517 tb = 2 518 } 519 err = out.WriteByte(tb) 520 if err == nil && t.packed { 521 err = t.nodeRefToAddress.(packed.Mutable).Save(out) 522 } 523 if err != nil { 524 return err 525 } 526 527 err = out.WriteVLong(t.startNode) 528 if err == nil { 529 err = out.WriteVLong(t.nodeCount) 530 if err == nil { 531 err = out.WriteVLong(t.arcCount) 532 if err == nil { 533 err = out.WriteVLong(t.arcWithOutputCount) 534 if err == nil { 535 err = out.WriteVLong(t.bytes.position()) 536 if err == nil { 537 err = t.bytes.writeTo(out) 538 } 539 } 540 } 541 } 542 } 543 return err 544 } 545 546 func (t *FST) writeLabel(out util.DataOutput, v int) error { 547 assert2(v >= 0, "v=%v", v) 548 if t.inputType == INPUT_TYPE_BYTE1 { 549 assert2(v <= 255, "v=%v", v) 550 return out.WriteByte(byte(v)) 551 } else if t.inputType == INPUT_TYPE_BYTE2 { 552 panic("not implemented yet") 553 } else { 554 panic("not implemented yet") 555 } 556 } 557 558 func (t *FST) readLabel(in util.DataInput) (v int, err error) { 559 switch t.inputType { 560 case INPUT_TYPE_BYTE1: // Unsigned byte 561 if b, err := in.ReadByte(); err == nil { 562 v = int(b) 563 } 564 case INPUT_TYPE_BYTE2: // Unsigned short 565 if s, err := in.ReadShort(); err == nil { 566 v = int(s) 567 } 568 default: 569 v, err = AsInt(in.ReadVInt()) 570 } 571 return v, err 572 } 573 574 func targetHasArcs(arc *Arc) bool { 575 return arc.target > 0 576 } 577 578 /* Serializes new node by appending its bytes to the end of the current []byte */ 579 func (t *FST) addNode(nodeIn *UnCompiledNode) (int64, error) { 580 // fmt.Printf("FST.addNode pos=%v numArcs=%v\n", t.bytes.position(), nodeIn.NumArcs) 581 if nodeIn.NumArcs == 0 { 582 if nodeIn.IsFinal { 583 return FST_FINAL_END_NODE, nil 584 } 585 return FST_NON_FINAL_END_NODE, nil 586 } 587 588 startAddress := t.bytes.position() 589 // fmt.Printf(" startAddr=%v\n", startAddress) 590 591 doFixedArray := t.shouldExpand(nodeIn) 592 if doFixedArray { 593 // fmt.Println(" fixedArray") 594 if len(t.bytesPerArc) < nodeIn.NumArcs { 595 t.bytesPerArc = make([]int, util.Oversize(nodeIn.NumArcs, 1)) 596 } 597 } 598 599 t.arcCount += int64(nodeIn.NumArcs) 600 601 lastArc := nodeIn.NumArcs - 1 602 603 lastArcStart := t.bytes.position() 604 maxBytesPerArc := 0 605 for arcIdx := 0; arcIdx < nodeIn.NumArcs; arcIdx++ { 606 arc := nodeIn.Arcs[arcIdx] 607 target := arc.Target.(*CompiledNode) 608 flags := byte(0) 609 // fmt.Printf(" arc %v label=%v -> target=%v\n", arcIdx, arc.label, target.node) 610 611 if arcIdx == lastArc { 612 flags += FST_BIT_LAST_ARC 613 } 614 615 if t.lastFrozenNode == target.node && !doFixedArray { 616 flags += FST_BIT_TARGET_NEXT 617 } 618 619 if arc.isFinal { 620 flags += FST_BIT_FINAL_ARC 621 if arc.nextFinalOutput != NO_OUTPUT { 622 flags += FST_BIT_ARC_HAS_FINAL_OUTPUT 623 } 624 } else { 625 assert(arc.nextFinalOutput == NO_OUTPUT) 626 } 627 628 targetHasArcs := target.node > 0 629 630 if !targetHasArcs { 631 flags += FST_BIT_STOP_NODE 632 } else if t.inCounts != nil { 633 panic("not implemented yet") 634 } 635 636 if arc.output != NO_OUTPUT { 637 flags += FST_BIT_ARC_HAS_OUTPUT 638 } 639 640 t.bytes.WriteByte(flags) 641 var err error 642 if err = t.writeLabel(t.bytes, arc.label); err != nil { 643 return 0, err 644 } 645 646 // fmt.Printf(" write arc: label=%c flags=%v target=%v pos=%v output=%v\n", 647 // rune(arc.label), flags, target.node, t.bytes.position(), 648 // t.outputs.outputToString(arc.output)) 649 650 if arc.output != NO_OUTPUT { 651 if err = t.outputs.Write(arc.output, t.bytes); err != nil { 652 return 0, err 653 } 654 // fmt.Println(" write output") 655 t.arcWithOutputCount++ 656 } 657 658 if arc.nextFinalOutput != NO_OUTPUT { 659 // fmt.Println(" write final output") 660 if err = t.outputs.writeFinalOutput(arc.nextFinalOutput, t.bytes); err != nil { 661 return 0, err 662 } 663 } 664 665 if targetHasArcs && (flags&FST_BIT_TARGET_NEXT) == 0 { 666 assert(target.node > 0) 667 // fmt.Println(" write target") 668 if err = t.bytes.WriteVLong(target.node); err != nil { 669 return 0, err 670 } 671 } 672 673 // just write the arcs "like normal" on first pass, but record 674 // how many bytes each one took, and max byte size: 675 if doFixedArray { 676 t.bytesPerArc[arcIdx] = int(t.bytes.position() - lastArcStart) 677 lastArcStart = t.bytes.position() 678 if t.bytesPerArc[arcIdx] > maxBytesPerArc { 679 maxBytesPerArc = t.bytesPerArc[arcIdx] 680 } 681 } 682 } 683 684 if doFixedArray { 685 MAX_HEADER_SIZE := 11 // header(byte) + numArcs(vint) + numBytes(vint) 686 assert(maxBytesPerArc > 0) 687 // 2nd pass just "expands" all arcs to take up a fixed byte size 688 // create the header 689 header := make([]byte, MAX_HEADER_SIZE) 690 bad := store.NewByteArrayDataOutput(header) 691 // write a "false" first arc: 692 bad.WriteByte(FST_ARCS_AS_FIXED_ARRAY) 693 bad.WriteVInt(int32(nodeIn.NumArcs)) 694 bad.WriteVInt(int32(maxBytesPerArc)) 695 headerLen := bad.Position() 696 697 fixedArrayStart := startAddress + int64(headerLen) 698 699 // expand the arcs in place, backwards 700 srcPos := t.bytes.position() 701 destPos := fixedArrayStart + int64(nodeIn.NumArcs)*int64(maxBytesPerArc) 702 assert(destPos >= srcPos) 703 if destPos > srcPos { 704 t.bytes.skipBytes(int(destPos - srcPos)) 705 for arcIdx := nodeIn.NumArcs - 1; arcIdx >= 0; arcIdx-- { 706 destPos -= int64(maxBytesPerArc) 707 srcPos -= int64(t.bytesPerArc[arcIdx]) 708 if srcPos != destPos { 709 assert2(destPos > srcPos, 710 "destPos=%v srcPos=%v arcIdx=%v maxBytesPerArc=%v bytesPerArc[arcIdx]=%v nodeIn.numArcs=%v", 711 destPos, srcPos, arcIdx, maxBytesPerArc, t.bytesPerArc[arcIdx], nodeIn.NumArcs) 712 t.bytes.copyBytesInside(srcPos, destPos, t.bytesPerArc[arcIdx]) 713 } 714 } 715 } 716 717 // now write the header 718 t.bytes.writeBytesAt(startAddress, header[:headerLen]) 719 } 720 721 thisNodeAddress := t.bytes.position() - 1 722 723 t.bytes.reverse(startAddress, thisNodeAddress) 724 725 // PackedInts uses int as the index, so we cannot handle > 2.1B 726 // nodes when packing: 727 assert2(t.nodeAddress == nil || t.nodeCount < math.MaxInt32, 728 "cannot create a packed FST with more than 2.1 billion nodes") 729 730 t.nodeCount++ 731 var node int64 732 if t.nodeAddress != nil { 733 panic("not implemented yet") 734 } else { 735 node = thisNodeAddress 736 } 737 t.lastFrozenNode = node 738 739 // fmt.Printf(" ret node=%v address=%v nodeAddress=%v", 740 // node, thisNodeAddress, t.nodeAddress) 741 return node, nil 742 } 743 744 func (t *FST) FirstArc(arc *Arc) *Arc { 745 if t.emptyOutput != nil { 746 arc.flags = FST_BIT_FINAL_ARC | FST_BIT_LAST_ARC 747 arc.NextFinalOutput = t.emptyOutput 748 if t.emptyOutput != NO_OUTPUT { 749 arc.flags |= FST_BIT_ARC_HAS_FINAL_OUTPUT 750 } 751 } else { 752 arc.flags = FST_BIT_LAST_ARC 753 arc.NextFinalOutput = t.NO_OUTPUT 754 } 755 arc.Output = t.NO_OUTPUT 756 757 // If there are no nodes, ie, the FST only accepts the 758 // empty string, then startNode is 0 759 arc.target = t.startNode 760 return arc 761 } 762 763 func (t *FST) readUnpackedNodeTarget(in BytesReader) (target int64, err error) { 764 if t.version < FST_VERSION_VINT_TARGET { 765 return AsInt64(in.ReadInt()) 766 } 767 return in.ReadVLong() 768 } 769 770 func AsInt(n int32, err error) (n2 int, err2 error) { 771 return int(n), err 772 } 773 774 func AsInt64(n int32, err error) (n2 int64, err2 error) { 775 return int64(n), err 776 } 777 778 func (t *FST) readFirstTargetArc(follow, arc *Arc, in BytesReader) (*Arc, error) { 779 if follow.IsFinal() { 780 // insert "fake" final first arc: 781 arc.Label = FST_END_LABEL 782 arc.Output = follow.NextFinalOutput 783 arc.flags = FST_BIT_FINAL_ARC 784 if follow.target <= 0 { 785 arc.flags |= FST_BIT_LAST_ARC 786 } else { 787 arc.node = follow.target 788 // NOTE: nextArc is a node (not an address!) in this case: 789 arc.nextArc = follow.target 790 } 791 arc.target = FST_FINAL_END_NODE 792 return arc, nil 793 } 794 return t.readFirstRealTargetArc(follow.target, arc, in) 795 } 796 797 func (t *FST) readFirstRealTargetArc(node int64, arc *Arc, in BytesReader) (ans *Arc, err error) { 798 address := t.getNodeAddress(node) 799 in.setPosition(address) 800 arc.node = node 801 802 flag, err := in.ReadByte() 803 if err != nil { 804 return nil, err 805 } 806 if flag == FST_ARCS_AS_FIXED_ARRAY { 807 // this is first arc in a fixed-array 808 arc.numArcs, err = AsInt(in.ReadVInt()) 809 if err != nil { 810 return nil, err 811 } 812 if t.packed || t.version >= FST_VERSION_VINT_TARGET { 813 arc.bytesPerArc, err = AsInt(in.ReadVInt()) 814 } else { 815 arc.bytesPerArc, err = AsInt(in.ReadInt()) 816 } 817 if err != nil { 818 return nil, err 819 } 820 arc.arcIdx = -1 821 pos := in.getPosition() 822 arc.nextArc, arc.posArcsStart = pos, pos 823 } else { 824 // arc.flags = b 825 arc.nextArc = address 826 arc.bytesPerArc = 0 827 } 828 829 return t.readNextRealArc(arc, in) 830 } 831 832 func (t *FST) readNextArc(arc *Arc, in BytesReader) (*Arc, error) { 833 if arc.Label == FST_END_LABEL { 834 // this was a fake inserted "final" arc 835 assert2(arc.nextArc > 0, "cannot readNextArc when arc.isLast()=true") 836 return t.readFirstRealTargetArc(arc.nextArc, arc, in) 837 } else { 838 return t.readNextRealArc(arc, in) 839 } 840 } 841 842 /** Never returns null, but you should never call this if 843 * arc.isLast() is true. */ 844 func (t *FST) readNextRealArc(arc *Arc, in BytesReader) (ans *Arc, err error) { 845 // TODO: can't assert this because we call from readFirstArc 846 // assert !flag(arc.flags, BIT_LAST_ARC); 847 848 // this is a continuing arc in a fixed array 849 if arc.bytesPerArc != 0 { // arcs are at fixed entries 850 arc.arcIdx++ 851 // assert arc.arcIdx < arc.numArcs 852 in.setPosition(arc.posArcsStart) 853 in.skipBytes(int64(arc.arcIdx * arc.bytesPerArc)) 854 } else { // arcs are packed 855 in.setPosition(arc.nextArc) 856 } 857 if arc.flags, err = in.ReadByte(); err == nil { 858 arc.Label, err = t.readLabel(in) 859 } 860 if err != nil { 861 return nil, err 862 } 863 864 if arc.flag(FST_BIT_ARC_HAS_OUTPUT) { 865 arc.Output, err = t.outputs.Read(in) 866 if err != nil { 867 return nil, err 868 } 869 } else { 870 arc.Output = t.outputs.NoOutput() 871 } 872 873 if arc.flag(FST_BIT_ARC_HAS_FINAL_OUTPUT) { 874 arc.NextFinalOutput, err = t.outputs.ReadFinalOutput(in) 875 if err != nil { 876 return nil, err 877 } 878 } else { 879 arc.NextFinalOutput = t.outputs.NoOutput() 880 } 881 882 if arc.flag(FST_BIT_STOP_NODE) { 883 if arc.flag(FST_BIT_FINAL_ARC) { 884 arc.target = FST_FINAL_END_NODE 885 } else { 886 arc.target = FST_NON_FINAL_END_NODE 887 } 888 arc.nextArc = in.getPosition() 889 } else if arc.flag(FST_BIT_TARGET_NEXT) { 890 arc.nextArc = in.getPosition() 891 // TODO: would be nice to make this lazy -- maybe 892 // caller doesn't need the target and is scanning arcs... 893 if t.nodeAddress == nil { 894 if !arc.flag(FST_BIT_LAST_ARC) { 895 if arc.bytesPerArc == 0 { // must scan 896 t.seekToNextNode(in) 897 } else { 898 in.setPosition(arc.posArcsStart) 899 in.skipBytes(int64(arc.bytesPerArc * arc.numArcs)) 900 } 901 } 902 arc.target = in.getPosition() 903 } else { 904 arc.target = arc.node - 1 905 // assert arc.target > 0 906 } 907 } else { 908 if t.packed { 909 pos := in.getPosition() 910 code, err := in.ReadVLong() 911 if err != nil { 912 return nil, err 913 } 914 if arc.flag(FST_BIT_TARGET_DELTA) { // Address is delta-coded from current address: 915 arc.target = pos + code 916 } else if code < int64(t.nodeRefToAddress.Size()) { // Deref 917 arc.target = t.nodeRefToAddress.Get(int(code)) 918 } else { // Absolute 919 arc.target = code 920 } 921 } else { 922 arc.target, err = t.readUnpackedNodeTarget(in) 923 if err != nil { 924 return nil, err 925 } 926 } 927 arc.nextArc = in.getPosition() 928 } 929 return arc, nil 930 } 931 932 // TODO: could we somehow [partially] tableize arc lookups 933 // look automaton? 934 935 /** Finds an arc leaving the incoming arc, replacing the arc in place. 936 * This returns null if the arc was not found, else the incoming arc. */ 937 func (t *FST) FindTargetArc(labelToMatch int, follow *Arc, arc *Arc, in BytesReader) (target *Arc, err error) { 938 if labelToMatch == FST_END_LABEL { 939 if follow.IsFinal() { 940 if follow.target <= 0 { 941 arc.flags = FST_BIT_LAST_ARC 942 } else { 943 arc.flags = 0 944 // NOTE: nextArc is a node (not an address!) in this case: 945 arc.nextArc = follow.target 946 arc.node = follow.target 947 } 948 arc.Output = follow.NextFinalOutput 949 arc.Label = FST_END_LABEL 950 return arc, nil 951 } else { 952 return nil, nil 953 } 954 } 955 956 // Short-circuit if this arc is in the root arc cache: 957 if follow.target == t.startNode && labelToMatch < len(t.cachedRootArcs) { 958 // LUCENE-5152: detect tricky cases where caller 959 // modified previously returned cached root-arcs: 960 t.assertRootArcs() 961 if result := t.cachedRootArcs[labelToMatch]; result != nil { 962 arc.copyFrom(result) 963 return arc, nil 964 } 965 return nil, nil 966 } 967 968 if !targetHasArcs(follow) { 969 return nil, nil 970 } 971 972 in.setPosition(t.getNodeAddress(follow.target)) 973 974 arc.node = follow.target 975 976 // log.Printf("fta label=%v", labelToMatch) 977 978 b, err := in.ReadByte() 979 if err != nil { 980 return nil, err 981 } 982 if b == FST_ARCS_AS_FIXED_ARRAY { 983 // Arcs are full array; do binary search: 984 arc.numArcs, err = AsInt(in.ReadVInt()) 985 if err != nil { 986 return nil, err 987 } 988 if t.packed || t.version >= FST_VERSION_VINT_TARGET { 989 arc.bytesPerArc, err = AsInt(in.ReadVInt()) 990 if err != nil { 991 return nil, err 992 } 993 } else { 994 arc.bytesPerArc, err = AsInt(in.ReadInt()) 995 if err != nil { 996 return nil, err 997 } 998 } 999 arc.posArcsStart = in.getPosition() 1000 for low, high := 0, arc.numArcs-1; low < high; { 1001 // log.Println(" cycle") 1002 mid := int(uint(low+high) / 2) 1003 in.setPosition(arc.posArcsStart) 1004 in.skipBytes(int64(arc.bytesPerArc*mid) + 1) 1005 midLabel, err := t.readLabel(in) 1006 if err != nil { 1007 return nil, err 1008 } 1009 cmp := midLabel - labelToMatch 1010 if cmp < 0 { 1011 low = mid + 1 1012 } else if cmp > 0 { 1013 high = mid - 1 1014 } else { 1015 arc.arcIdx = mid - 1 1016 // log.Println(" found!") 1017 return t.readNextRealArc(arc, in) 1018 } 1019 } 1020 1021 return nil, nil 1022 } 1023 1024 // Linear scan 1025 1026 if _, err = t.readFirstRealTargetArc(follow.target, arc, in); err != nil { 1027 return nil, err 1028 } 1029 1030 for { 1031 //System.out.println(" non-bs cycle"); 1032 // TODO: we should fix this code to not have to create 1033 // object for the output of every arc we scan... only 1034 // for the matching arc, if found 1035 if arc.Label == labelToMatch { 1036 //System.out.println(" found!"); 1037 return arc, nil 1038 } else if arc.Label > labelToMatch { 1039 return nil, nil 1040 } else if arc.isLast() { 1041 return nil, nil 1042 } else { 1043 if _, err = t.readNextRealArc(arc, in); err != nil { 1044 return nil, err 1045 } 1046 } 1047 } 1048 } 1049 1050 func (t *FST) seekToNextNode(in BytesReader) error { 1051 var err error 1052 var flags byte 1053 for { 1054 if flags, err = in.ReadByte(); err == nil { 1055 _, err = t.readLabel(in) 1056 } 1057 if err != nil { 1058 return err 1059 } 1060 1061 if hasFlag(flags, FST_BIT_ARC_HAS_OUTPUT) { 1062 if err = t.outputs.SkipOutput(in); err != nil { 1063 return err 1064 } 1065 } 1066 1067 if hasFlag(flags, FST_BIT_ARC_HAS_FINAL_OUTPUT) { 1068 if err = t.outputs.SkipFinalOutput(in); err != nil { 1069 return err 1070 } 1071 } 1072 1073 if !hasFlag(flags, FST_BIT_STOP_NODE) && !hasFlag(flags, FST_BIT_TARGET_NEXT) { 1074 if t.packed { 1075 _, err = in.ReadVLong() 1076 } else { 1077 _, err = t.readUnpackedNodeTarget(in) 1078 } 1079 if err != nil { 1080 return err 1081 } 1082 } 1083 1084 if hasFlag(flags, FST_BIT_LAST_ARC) { 1085 return nil 1086 } 1087 } 1088 } 1089 1090 func (t *FST) NodeCount() int64 { 1091 return t.nodeCount + 1 1092 } 1093 1094 func (t *FST) shouldExpand(node *UnCompiledNode) bool { 1095 return t.allowArrayArcs && 1096 (node.depth <= FIXED_ARRAY_SHALLOW_DISTANCE && node.NumArcs >= FIXED_ARRAY_NUM_ARCS_SHALLOW || 1097 node.NumArcs >= FIXED_ARRAY_NUM_ARCS_DEEP) 1098 } 1099 1100 func (t *FST) BytesReader() BytesReader { 1101 if t.packed { 1102 return t.bytes.forwardReader() 1103 } 1104 return t.bytes.reverseReader() 1105 } 1106 1107 type RandomAccess interface { 1108 getPosition() int64 1109 setPosition(pos int64) 1110 reversed() bool 1111 skipBytes(count int64) 1112 } 1113 1114 type BytesReader interface { 1115 // *util.DataInputImpl 1116 util.DataInput 1117 RandomAccess 1118 } 1119 1120 // L1464 1121 /* 1122 Expert: creates an FST by packing this one. This process requires 1123 substantial additional RAM (currently up to ~8 bytes per node 1124 depending on acceptableOverheadRatio), but then should produce a 1125 smaller FST. 1126 1127 The implementation of this method uses ideas from 1128 <a target="_blank" href="http://www.cs.put.poznan.pl/dweiss/site/publications/download/fsacomp.pdf">Smaller Representation of Finite State Automata</a> 1129 which describes techniques to reduce the size of a FST. However, this 1130 is not a strict implementation of the algorithms described in this 1131 paper. 1132 */ 1133 func (t *FST) pack(minInCountDeref, maxDerefNodes int, 1134 acceptableOverheadRatio float32) (*FST, error) { 1135 panic("not implemented yet") 1136 }