github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/util/fst/fst.go (about)

     1  package fst
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"github.com/balzaczyy/golucene/core/codec"
     7  	"github.com/balzaczyy/golucene/core/store"
     8  	"github.com/balzaczyy/golucene/core/util"
     9  	"github.com/balzaczyy/golucene/core/util/packed"
    10  	"math"
    11  	"reflect"
    12  )
    13  
    14  // util/fst/FST.java
    15  
// ARC_SHALLOW_RAM_BYTES_USED is the shallow size of a single Arc value as
// computed by util.ShallowSizeOfInstance. FST.ramBytesUsed adds it once for
// every non-nil arc it is asked to account for.
var ARC_SHALLOW_RAM_BYTES_USED = util.ShallowSizeOfInstance(reflect.TypeOf(Arc{}))
    17  
// InputType identifies how arc labels are encoded inside the FST bytes.
type InputType int

// Label encodings understood by readLabel: BYTE1 labels are read with
// ReadByte, BYTE2 labels with ReadShort and anything else with ReadVInt.
// These are the in-memory values; loadFST3 and Save map them to and from
// the on-disk bytes 0, 1 and 2.
const (
	INPUT_TYPE_BYTE1 = 1
	INPUT_TYPE_BYTE2 = 2
	INPUT_TYPE_BYTE4 = 3
)
    25  
const (
	// Per-arc flag bits, stored in the first byte of every serialized arc.

	// The arc is final (see Arc.IsFinal).
	FST_BIT_FINAL_ARC = byte(1 << 0)
	// The arc is the last one leaving its node (see Arc.isLast).
	FST_BIT_LAST_ARC = byte(1 << 1)
	// The target is not written; addNode sets this when the target is the
	// most recently frozen node and the node is not a fixed array.
	FST_BIT_TARGET_NEXT = byte(1 << 2)
	// The target node has no arcs.
	FST_BIT_STOP_NODE = byte(1 << 3)
	// An output follows the label.
	FST_BIT_ARC_HAS_OUTPUT = byte(1 << 4)
	// A final output follows the (optional) output.
	FST_BIT_ARC_HAS_FINAL_OUTPUT = byte(1 << 5)
	// Packed FSTs only: the target is delta-coded from the current position.
	FST_BIT_TARGET_DELTA = byte(1 << 6)
	// Written as the first byte of a node whose arcs are laid out as a
	// fixed-size array; it deliberately reuses the HAS_FINAL_OUTPUT bit.
	FST_ARCS_AS_FIXED_ARRAY = FST_BIT_ARC_HAS_FINAL_OUTPUT

	// Thresholds for choosing the fixed-array layout.
	// NOTE(review): presumably consumed by shouldExpand, which is not
	// visible in this part of the file - confirm.
	FIXED_ARRAY_SHALLOW_DISTANCE = 3 // 0 => only root node
	FIXED_ARRAY_NUM_ARCS_SHALLOW = 5
	FIXED_ARRAY_NUM_ARCS_DEEP    = 10

	// Codec header name and the range of versions loadFST3 accepts.
	FST_FILE_FORMAT_NAME    = "FST"
	FST_VERSION_PACKED      = 3
	FST_VERSION_VINT_TARGET = 4

	// Version written by Save and assigned to FSTs created by newFST.
	VERSION_CURRENT = FST_VERSION_VINT_TARGET

	// Pseudo node ids addNode returns for nodes that have no arcs.
	FST_FINAL_END_NODE     = -1
	FST_NON_FINAL_END_NODE = 0

	// If an arc has this label then that arc is final/accepted.
	FST_END_LABEL = -1

	// Block size (in bits) LoadFST passes to loadFST3.
	FST_DEFAULT_MAX_BLOCK_BITS = 28 // 30 for 64 bit int
)
    54  
// Arc represents a single arc of the FST. Instances are reused as mutable
// cursors: the read* methods of FST overwrite the arc passed to them.
type Arc struct {
	// Label of the arc, or FST_END_LABEL for the synthetic "final" arc.
	Label int
	// Output attached to the arc (the FST's no-output value when absent).
	Output interface{}
	node   int64 // from node
	target int64 // to node
	// Bit set of FST_BIT_* values read from the serialized arc.
	flags byte
	// Final output attached to the arc (the no-output value when absent).
	NextFinalOutput interface{}
	// Position of the next arc to read. For the synthetic final arc this
	// holds a node rather than an address (see readFirstTargetArc).
	nextArc int64
	// The remaining fields are only meaningful for fixed-array nodes,
	// i.e. when bytesPerArc != 0.
	posArcsStart int64 // position of the first arc in the array
	bytesPerArc  int   // size of each array slot; 0 means arcs are not a fixed array
	arcIdx       int   // index of this arc within the array
	numArcs      int   // number of arcs in the array
}
    69  
    70  func (arc *Arc) copyFrom(other *Arc) *Arc {
    71  	arc.node = other.node
    72  	arc.Label = other.Label
    73  	arc.target = other.target
    74  	arc.flags = other.flags
    75  	arc.Output = other.Output
    76  	arc.NextFinalOutput = other.NextFinalOutput
    77  	arc.nextArc = other.nextArc
    78  	arc.bytesPerArc = other.bytesPerArc
    79  	if other.bytesPerArc != 0 {
    80  		arc.posArcsStart = other.posArcsStart
    81  		arc.arcIdx = other.arcIdx
    82  		arc.numArcs = other.numArcs
    83  	}
    84  	return arc
    85  }
    86  
    87  func (arc *Arc) flag(flag byte) bool {
    88  	return hasFlag(arc.flags, flag)
    89  }
    90  
    91  func (arc *Arc) isLast() bool {
    92  	return arc.flag(FST_BIT_LAST_ARC)
    93  }
    94  
    95  func (arc *Arc) IsFinal() bool {
    96  	return arc.flag(FST_BIT_FINAL_ARC)
    97  }
    98  
    99  func (arc *Arc) String() string {
   100  	var b bytes.Buffer
   101  	fmt.Fprintf(&b, "node=%v target=%v label=%v", arc.node, arc.target, util.ItoHex(int64(arc.Label)))
   102  	if arc.flag(FST_BIT_FINAL_ARC) {
   103  		fmt.Fprintf(&b, " final")
   104  	}
   105  	if arc.flag(FST_BIT_LAST_ARC) {
   106  		fmt.Fprintf(&b, " last")
   107  	}
   108  	if arc.flag(FST_BIT_TARGET_NEXT) {
   109  		fmt.Fprintf(&b, " targetNext")
   110  	}
   111  	if arc.flag(FST_BIT_STOP_NODE) {
   112  		fmt.Fprintf(&b, " stop")
   113  	}
   114  	if arc.flag(FST_BIT_ARC_HAS_OUTPUT) {
   115  		fmt.Fprintf(&b, " output=%v", arc.Output)
   116  	}
   117  	if arc.flag(FST_BIT_ARC_HAS_FINAL_OUTPUT) {
   118  		fmt.Fprintf(&b, " nextFinalOutput=%v", arc.NextFinalOutput)
   119  	}
   120  	if arc.bytesPerArc != 0 {
   121  		fmt.Fprintf(&b, " arcArray(idx=%v of %v)", arc.arcIdx, arc.numArcs)
   122  	}
   123  	return b.String()
   124  }
   125  
   126  func hasFlag(flags, bit byte) bool {
   127  	return (flags & bit) != 0
   128  }
   129  
// FST holds both the state needed while building a finite state
// transducer (newFST/addNode/finish) and the state of one loaded from a
// DataInput (loadFST3).
type FST struct {
	// Encoding of arc labels (see readLabel/writeLabel).
	inputType InputType
	// Scratch buffer addNode uses to record the size of each arc while
	// laying a node out as a fixed array.
	bytesPerArc []int
	// if non-null, this FST accepts the empty string and
	// produces this output
	emptyOutput interface{}

	// Serialized nodes and arcs.
	bytes *BytesStore

	// Root node; -1 until finish has been called or the FST is loaded.
	startNode int64

	// Codec used to read, write and merge outputs.
	outputs Outputs

	// Node most recently returned by addNode.
	lastFrozenNode int64

	// Cached result of outputs.NoOutput().
	NO_OUTPUT interface{}

	// Statistics maintained by addNode and persisted by Save.
	nodeCount          int64
	arcCount           int64
	arcWithOutputCount int64

	// Whether the serialized form is packed; read from the file header.
	packed bool
	// Node reference -> address table, only loaded for packed FSTs.
	nodeRefToAddress packed.PackedIntsReader

	// Value handed to newFST; not read in the visible part of this file.
	allowArrayArcs bool

	// Arcs leaving the root, indexed by label (labels below 0x80 only).
	cachedRootArcs []*Arc
	// Second, independently read copy of cachedRootArcs that
	// assertRootArcs compares against to detect modified cache entries.
	assertingCachedRootArcs []*Arc // only set with assert

	// Format version, from the codec header or VERSION_CURRENT.
	version int32

	// Only allocated by newFST when the FST will be packed.
	nodeAddress *packed.GrowableWriter

	// TODO: we could be smarter here, and prune periodically as we go;
	// high in-count nodes will "usually" become clear early on:
	inCounts *packed.GrowableWriter

	// RAM attributed to the cached root arcs (see cacheRootArcs).
	cachedArcsBytesUsed int
}
   169  
   170  /* Make a new empty FST, for building; Builder invokes this ctor */
   171  func newFST(inputType InputType, outputs Outputs, willPackFST bool,
   172  	acceptableOverheadRatio float32, allowArrayArcs bool,
   173  	bytesPageBits int) *FST {
   174  	bytes := newBytesStoreFromBits(uint32(bytesPageBits))
   175  	// pad: ensure no node gets address 0 which is reserved to mean
   176  	// the stop state w/ no arcs
   177  	bytes.WriteByte(0)
   178  	ans := &FST{
   179  		inputType:      inputType,
   180  		outputs:        outputs,
   181  		allowArrayArcs: allowArrayArcs,
   182  		version:        VERSION_CURRENT,
   183  		bytes:          bytes,
   184  		NO_OUTPUT:      outputs.NoOutput(),
   185  		startNode:      -1,
   186  	}
   187  	if willPackFST {
   188  		ans.nodeAddress = packed.NewGrowableWriter(15, 8, acceptableOverheadRatio)
   189  		ans.inCounts = packed.NewGrowableWriter(1, 8, acceptableOverheadRatio)
   190  	}
   191  	return ans
   192  }
   193  
   194  func LoadFST(in util.DataInput, outputs Outputs) (fst *FST, err error) {
   195  	return loadFST3(in, outputs, FST_DEFAULT_MAX_BLOCK_BITS)
   196  }
   197  
   198  /** Load a previously saved FST; maxBlockBits allows you to
   199   *  control the size of the byte[] pages used to hold the FST bytes. */
   200  func loadFST3(in util.DataInput, outputs Outputs, maxBlockBits uint32) (fst *FST, err error) {
   201  	// log.Printf("Loading FST from %v and output to %v...", in, outputs)
   202  	// defer func() {
   203  	// 	if err != nil {
   204  	// 		log.Print("Failed to load FST.")
   205  	// 	}
   206  	// }()
   207  	fst = &FST{outputs: outputs, startNode: -1}
   208  
   209  	if maxBlockBits < 1 || maxBlockBits > 30 {
   210  		panic(fmt.Sprintf("maxBlockBits should 1..30; got %v", maxBlockBits))
   211  	}
   212  
   213  	// NOTE: only reads most recent format; we don't have
   214  	// back-compat promise for FSTs (they are experimental):
   215  	fst.version, err = codec.CheckHeader(in, FST_FILE_FORMAT_NAME, FST_VERSION_PACKED, FST_VERSION_VINT_TARGET)
   216  	if err != nil {
   217  		return nil, err
   218  	}
   219  	if b, err := in.ReadByte(); err == nil {
   220  		fst.packed = (b == 1)
   221  	} else {
   222  		return nil, err
   223  	}
   224  	if b, err := in.ReadByte(); err == nil {
   225  		if b == 1 {
   226  			// accepts empty string
   227  			// 1 KB blocks:
   228  			emptyBytes := newBytesStoreFromBits(10)
   229  			if numBytes, err := in.ReadVInt(); err == nil {
   230  				// log.Printf("Number of bytes: %v", numBytes)
   231  				emptyBytes.CopyBytes(in, int64(numBytes))
   232  
   233  				// De-serialize empty-string output:
   234  				var reader BytesReader
   235  				if fst.packed {
   236  					// log.Printf("Forward reader.")
   237  					reader = emptyBytes.forwardReader()
   238  				} else {
   239  					// log.Printf("Reverse reader.")
   240  					reader = emptyBytes.reverseReader()
   241  					// NoOutputs uses 0 bytes when writing its output,
   242  					// so we have to check here else BytesStore gets
   243  					// angry:
   244  					if numBytes > 0 {
   245  						reader.setPosition(int64(numBytes - 1))
   246  					}
   247  				}
   248  				// log.Printf("Reading final output from %v to %v...\n", reader, outputs)
   249  				fst.emptyOutput, err = outputs.ReadFinalOutput(reader)
   250  			}
   251  		} // else emptyOutput = nil
   252  	}
   253  	if err != nil {
   254  		return nil, err
   255  	}
   256  
   257  	if t, err := in.ReadByte(); err == nil {
   258  		switch t {
   259  		case 0:
   260  			fst.inputType = INPUT_TYPE_BYTE1
   261  		case 1:
   262  			fst.inputType = INPUT_TYPE_BYTE2
   263  		case 2:
   264  			fst.inputType = INPUT_TYPE_BYTE4
   265  		default:
   266  			panic(fmt.Sprintf("invalid input type %v", t))
   267  		}
   268  	} else {
   269  		return nil, err
   270  	}
   271  
   272  	if fst.packed {
   273  		if fst.nodeRefToAddress, err = packed.NewPackedReader(in); err != nil {
   274  			return nil, err
   275  		}
   276  	} // else nodeRefToAddress = nil
   277  
   278  	if fst.startNode, err = in.ReadVLong(); err == nil {
   279  		if fst.nodeCount, err = in.ReadVLong(); err == nil {
   280  			if fst.arcCount, err = in.ReadVLong(); err == nil {
   281  				if fst.arcWithOutputCount, err = in.ReadVLong(); err == nil {
   282  					if numBytes, err := in.ReadVLong(); err == nil {
   283  						if fst.bytes, err = newBytesStoreFromInput(in, numBytes, 1<<maxBlockBits); err == nil {
   284  							fst.NO_OUTPUT = outputs.NoOutput()
   285  
   286  							err = fst.cacheRootArcs()
   287  
   288  							// NOTE: bogus because this is only used during
   289  							// building; we need to break out mutable FST from
   290  							// immutable
   291  							// fst.allowArrayArcs = false
   292  						}
   293  					}
   294  				}
   295  			}
   296  		}
   297  	}
   298  	return fst, err
   299  }
   300  
   301  func (t *FST) ramBytesUsed(arcs []*Arc) int64 {
   302  	var size int64
   303  	if arcs != nil {
   304  		size += util.ShallowSizeOf(arcs)
   305  		for _, arc := range arcs {
   306  			if arc != nil {
   307  				size += ARC_SHALLOW_RAM_BYTES_USED
   308  				if arc.Output != nil && arc.Output != t.outputs.NoOutput() {
   309  					size += t.outputs.ramBytesUsed(arc.Output)
   310  				}
   311  				if arc.NextFinalOutput != nil && arc.NextFinalOutput != t.outputs.NoOutput() {
   312  					size += t.outputs.ramBytesUsed(arc.NextFinalOutput)
   313  				}
   314  			}
   315  		}
   316  	}
   317  	return size
   318  }
   319  
   320  func (t *FST) finish(newStartNode int64) error {
   321  	assert2(t.startNode == -1, "already finished")
   322  	if newStartNode == FST_FINAL_END_NODE && t.emptyOutput != nil {
   323  		newStartNode = 0
   324  	}
   325  	t.startNode = newStartNode
   326  	t.bytes.finish()
   327  
   328  	return t.cacheRootArcs()
   329  }
   330  
   331  func (t *FST) getNodeAddress(node int64) int64 {
   332  	if t.nodeAddress != nil { // Deref
   333  		return t.nodeAddress.Get(int(node))
   334  	} else { // Straight
   335  		return node
   336  	}
   337  }
   338  
   339  func (t *FST) cacheRootArcs() error {
   340  	t.cachedRootArcs = make([]*Arc, 0x80)
   341  	t.readRootArcs(t.cachedRootArcs)
   342  	t.cachedArcsBytesUsed += int(t.ramBytesUsed(t.cachedRootArcs))
   343  
   344  	if err := t.setAssertingRootArcs(t.cachedRootArcs); err != nil {
   345  		return err
   346  	}
   347  	t.assertRootArcs()
   348  	return nil
   349  }
   350  
   351  func (t *FST) readRootArcs(arcs []*Arc) (err error) {
   352  	arc := &Arc{}
   353  	t.FirstArc(arc)
   354  	in := t.BytesReader()
   355  	if targetHasArcs(arc) {
   356  		_, err = t.readFirstRealTargetArc(arc.target, arc, in)
   357  		for err == nil {
   358  			if arc.Label == FST_END_LABEL {
   359  				panic("assert fail")
   360  			}
   361  			if arc.Label >= len(t.cachedRootArcs) {
   362  				break
   363  			}
   364  			arcs[arc.Label] = (&Arc{}).copyFrom(arc)
   365  			if arc.isLast() {
   366  				break
   367  			}
   368  			_, err = t.readNextRealArc(arc, in)
   369  		}
   370  	}
   371  	return err
   372  }
   373  
   374  func (t *FST) setAssertingRootArcs(arcs []*Arc) error {
   375  	t.assertingCachedRootArcs = make([]*Arc, len(arcs))
   376  	err := t.readRootArcs(t.assertingCachedRootArcs)
   377  	if err == nil {
   378  		t.cachedArcsBytesUsed *= 2
   379  	}
   380  	return err
   381  }
   382  
   383  func (t *FST) assertRootArcs() {
   384  	if t.cachedRootArcs == nil || t.assertingCachedRootArcs == nil {
   385  		panic("assert fail")
   386  	}
   387  	for i, v := range t.assertingCachedRootArcs {
   388  		root := t.cachedRootArcs[i]
   389  		asserting := v
   390  		if root != nil {
   391  			assert(root.arcIdx == asserting.arcIdx)
   392  			assert(root.bytesPerArc == asserting.bytesPerArc)
   393  			assert(root.flags == asserting.flags)
   394  			assert(root.Label == asserting.Label)
   395  			assert(root.nextArc == asserting.nextArc)
   396  			assert2(equals(root.NextFinalOutput, asserting.NextFinalOutput),
   397  				"%v != %v", root.NextFinalOutput, asserting.NextFinalOutput)
   398  			assert(root.node == asserting.node)
   399  			assert(root.numArcs == asserting.numArcs)
   400  			assert(equals(root.Output, asserting.Output))
   401  			assert(root.posArcsStart == asserting.posArcsStart)
   402  			assert(root.target == asserting.target)
   403  		} else {
   404  			assert(asserting == nil)
   405  		}
   406  	}
   407  }
   408  
   409  // Since Go doesn't has Java's Object.equals() method,
   410  // I have to implement my own.
   411  func equals(a, b interface{}) bool {
   412  	sameType := reflect.TypeOf(a) == reflect.TypeOf(b)
   413  	if _, ok := a.([]byte); ok {
   414  		if _, ok := b.([]byte); !ok {
   415  			// panic(fmt.Sprintf("incomparable type: %v vs %v", a, b))
   416  			return false
   417  		}
   418  		b1 := a.([]byte)
   419  		b2 := b.([]byte)
   420  		if len(b1) != len(b2) {
   421  			return false
   422  		}
   423  		for i := 0; i < len(b1) && i < len(b2); i++ {
   424  			if b1[i] != b2[i] {
   425  				return false
   426  			}
   427  		}
   428  		return true
   429  	} else if _, ok := a.(int64); ok {
   430  		if _, ok := b.(int64); !ok {
   431  			// panic(fmt.Sprintf("incomparable type: %v vs %v", a, b))
   432  			return false
   433  		}
   434  		return a.(int64) == b.(int64)
   435  	} else if a == nil && b == nil {
   436  		return true
   437  	} else if sameType && a == b {
   438  		return true
   439  	}
   440  	return false
   441  }
   442  
   443  func CompareFSTValue(a, b interface{}) bool {
   444  	return equals(a, b)
   445  }
   446  
   447  func (t *FST) EmptyOutput() interface{} {
   448  	return t.emptyOutput
   449  }
   450  
   451  // L493
   452  func (t *FST) setEmptyOutput(v interface{}) {
   453  	if t.emptyOutput != nil {
   454  		t.emptyOutput = t.outputs.merge(t.emptyOutput, v)
   455  	} else {
   456  		t.emptyOutput = v
   457  	}
   458  }
   459  
   460  func (t *FST) Save(out util.DataOutput) error {
   461  	assert2(t.startNode != -1, "call finish first")
   462  	assert2(t.nodeAddress == nil, "cannot save an FST pre-packaged FST; it must first be packed")
   463  	_, ok := t.nodeRefToAddress.(packed.Mutable)
   464  	assert2(!t.packed || ok, "cannot save a FST which has been loaded from disk ")
   465  	err := codec.WriteHeader(out, FST_FILE_FORMAT_NAME, VERSION_CURRENT)
   466  	if err == nil && t.packed {
   467  		err = out.WriteByte(1)
   468  	} else {
   469  		err = out.WriteByte(0)
   470  	}
   471  	// TODO: really we should encode this as an arc, arriving
   472  	// to the root node, instead of special casing here:
   473  	if err == nil && t.emptyOutput != nil {
   474  		// accepts empty string
   475  		err = out.WriteByte(1)
   476  
   477  		if err == nil {
   478  			// serialize empty-string output:
   479  			ros := store.NewRAMOutputStreamBuffer()
   480  			err = t.outputs.writeFinalOutput(t.emptyOutput, ros)
   481  
   482  			if err == nil {
   483  				emptyOutputBytes := make([]byte, ros.FilePointer())
   484  				err = ros.WriteToBytes(emptyOutputBytes)
   485  
   486  				length := len(emptyOutputBytes)
   487  				if err == nil && !t.packed {
   488  					// reverse
   489  					stopAt := length / 2
   490  					for upto := 0; upto < stopAt; upto++ {
   491  						emptyOutputBytes[upto], emptyOutputBytes[length-upto-1] =
   492  							emptyOutputBytes[length-upto-1], emptyOutputBytes[upto]
   493  					}
   494  				}
   495  				if err == nil {
   496  					err = out.WriteVInt(int32(length))
   497  					if err == nil {
   498  						err = out.WriteBytes(emptyOutputBytes)
   499  					}
   500  				}
   501  			}
   502  		}
   503  	} else if err == nil {
   504  		err = out.WriteByte(0)
   505  	}
   506  	if err != nil {
   507  		return err
   508  	}
   509  
   510  	var tb byte
   511  	switch int(t.inputType) {
   512  	case INPUT_TYPE_BYTE1:
   513  		tb = 0
   514  	case INPUT_TYPE_BYTE2:
   515  		tb = 1
   516  	default:
   517  		tb = 2
   518  	}
   519  	err = out.WriteByte(tb)
   520  	if err == nil && t.packed {
   521  		err = t.nodeRefToAddress.(packed.Mutable).Save(out)
   522  	}
   523  	if err != nil {
   524  		return err
   525  	}
   526  
   527  	err = out.WriteVLong(t.startNode)
   528  	if err == nil {
   529  		err = out.WriteVLong(t.nodeCount)
   530  		if err == nil {
   531  			err = out.WriteVLong(t.arcCount)
   532  			if err == nil {
   533  				err = out.WriteVLong(t.arcWithOutputCount)
   534  				if err == nil {
   535  					err = out.WriteVLong(t.bytes.position())
   536  					if err == nil {
   537  						err = t.bytes.writeTo(out)
   538  					}
   539  				}
   540  			}
   541  		}
   542  	}
   543  	return err
   544  }
   545  
   546  func (t *FST) writeLabel(out util.DataOutput, v int) error {
   547  	assert2(v >= 0, "v=%v", v)
   548  	if t.inputType == INPUT_TYPE_BYTE1 {
   549  		assert2(v <= 255, "v=%v", v)
   550  		return out.WriteByte(byte(v))
   551  	} else if t.inputType == INPUT_TYPE_BYTE2 {
   552  		panic("not implemented yet")
   553  	} else {
   554  		panic("not implemented yet")
   555  	}
   556  }
   557  
   558  func (t *FST) readLabel(in util.DataInput) (v int, err error) {
   559  	switch t.inputType {
   560  	case INPUT_TYPE_BYTE1: // Unsigned byte
   561  		if b, err := in.ReadByte(); err == nil {
   562  			v = int(b)
   563  		}
   564  	case INPUT_TYPE_BYTE2: // Unsigned short
   565  		if s, err := in.ReadShort(); err == nil {
   566  			v = int(s)
   567  		}
   568  	default:
   569  		v, err = AsInt(in.ReadVInt())
   570  	}
   571  	return v, err
   572  }
   573  
   574  func targetHasArcs(arc *Arc) bool {
   575  	return arc.target > 0
   576  }
   577  
// addNode serializes a new node by appending its bytes to the end of the
// current byte store and returns the node's id.
//
// A node without arcs is not written at all: FST_FINAL_END_NODE or
// FST_NON_FINAL_END_NODE is returned instead. Otherwise every arc is
// written as flags, label, optional output, optional final output and
// optional target. If shouldExpand selects the fixed-array layout, the
// arcs are then spread out in place so that each occupies the size of the
// largest arc, and a header is written in front of them. Finally the
// node's bytes are reversed and the address of its last byte is returned
// as the node id.
//
// Building a packed FST (nodeAddress/inCounts set) is not implemented and
// panics.
//
// NOTE(review): the comparisons below use the package-level NO_OUTPUT
// rather than t.NO_OUTPUT - confirm both always refer to the same value.
func (t *FST) addNode(nodeIn *UnCompiledNode) (int64, error) {
	// fmt.Printf("FST.addNode pos=%v numArcs=%v\n", t.bytes.position(), nodeIn.NumArcs)
	if nodeIn.NumArcs == 0 {
		if nodeIn.IsFinal {
			return FST_FINAL_END_NODE, nil
		}
		return FST_NON_FINAL_END_NODE, nil
	}

	startAddress := t.bytes.position()
	// fmt.Printf("  startAddr=%v\n", startAddress)

	doFixedArray := t.shouldExpand(nodeIn)
	if doFixedArray {
		// fmt.Println("  fixedArray")
		// make sure the scratch buffer can hold one size per arc
		if len(t.bytesPerArc) < nodeIn.NumArcs {
			t.bytesPerArc = make([]int, util.Oversize(nodeIn.NumArcs, 1))
		}
	}

	t.arcCount += int64(nodeIn.NumArcs)

	lastArc := nodeIn.NumArcs - 1

	lastArcStart := t.bytes.position()
	maxBytesPerArc := 0
	for arcIdx := 0; arcIdx < nodeIn.NumArcs; arcIdx++ {
		arc := nodeIn.Arcs[arcIdx]
		target := arc.Target.(*CompiledNode)
		flags := byte(0)
		// fmt.Printf("  arc %v label=%v -> target=%v\n", arcIdx, arc.label, target.node)

		if arcIdx == lastArc {
			flags += FST_BIT_LAST_ARC
		}

		// the target can be left out when it is the node frozen just
		// before this one (never done for fixed-array nodes)
		if t.lastFrozenNode == target.node && !doFixedArray {
			flags += FST_BIT_TARGET_NEXT
		}

		if arc.isFinal {
			flags += FST_BIT_FINAL_ARC
			if arc.nextFinalOutput != NO_OUTPUT {
				flags += FST_BIT_ARC_HAS_FINAL_OUTPUT
			}
		} else {
			assert(arc.nextFinalOutput == NO_OUTPUT)
		}

		targetHasArcs := target.node > 0

		if !targetHasArcs {
			flags += FST_BIT_STOP_NODE
		} else if t.inCounts != nil {
			panic("not implemented yet")
		}

		if arc.output != NO_OUTPUT {
			flags += FST_BIT_ARC_HAS_OUTPUT
		}

		t.bytes.WriteByte(flags)
		var err error
		if err = t.writeLabel(t.bytes, arc.label); err != nil {
			return 0, err
		}

		// fmt.Printf("  write arc: label=%c flags=%v target=%v pos=%v output=%v\n",
		// 	rune(arc.label), flags, target.node, t.bytes.position(),
		// 	t.outputs.outputToString(arc.output))

		if arc.output != NO_OUTPUT {
			if err = t.outputs.Write(arc.output, t.bytes); err != nil {
				return 0, err
			}
			// fmt.Println("    write output")
			t.arcWithOutputCount++
		}

		if arc.nextFinalOutput != NO_OUTPUT {
			// fmt.Println("    write final output")
			if err = t.outputs.writeFinalOutput(arc.nextFinalOutput, t.bytes); err != nil {
				return 0, err
			}
		}

		// the target is only written when it is a real node and was not
		// elided via FST_BIT_TARGET_NEXT
		if targetHasArcs && (flags&FST_BIT_TARGET_NEXT) == 0 {
			assert(target.node > 0)
			// fmt.Println("    write target")
			if err = t.bytes.WriteVLong(target.node); err != nil {
				return 0, err
			}
		}

		// just write the arcs "like normal" on first pass, but record
		// how many bytes each one took, and max byte size:
		if doFixedArray {
			t.bytesPerArc[arcIdx] = int(t.bytes.position() - lastArcStart)
			lastArcStart = t.bytes.position()
			if t.bytesPerArc[arcIdx] > maxBytesPerArc {
				maxBytesPerArc = t.bytesPerArc[arcIdx]
			}
		}
	}

	if doFixedArray {
		MAX_HEADER_SIZE := 11 // header(byte) + numArcs(vint) + numBytes(vint)
		assert(maxBytesPerArc > 0)
		// 2nd pass just "expands" all arcs to take up a fixed byte size
		// create the header
		header := make([]byte, MAX_HEADER_SIZE)
		bad := store.NewByteArrayDataOutput(header)
		// write a "false" first arc:
		bad.WriteByte(FST_ARCS_AS_FIXED_ARRAY)
		bad.WriteVInt(int32(nodeIn.NumArcs))
		bad.WriteVInt(int32(maxBytesPerArc))
		headerLen := bad.Position()

		fixedArrayStart := startAddress + int64(headerLen)

		// expand the arcs in place, backwards
		srcPos := t.bytes.position()
		destPos := fixedArrayStart + int64(nodeIn.NumArcs)*int64(maxBytesPerArc)
		assert(destPos >= srcPos)
		if destPos > srcPos {
			t.bytes.skipBytes(int(destPos - srcPos))
			// walk from the last arc to the first so that no arc is
			// overwritten before it has been moved
			for arcIdx := nodeIn.NumArcs - 1; arcIdx >= 0; arcIdx-- {
				destPos -= int64(maxBytesPerArc)
				srcPos -= int64(t.bytesPerArc[arcIdx])
				if srcPos != destPos {
					assert2(destPos > srcPos,
						"destPos=%v srcPos=%v arcIdx=%v maxBytesPerArc=%v bytesPerArc[arcIdx]=%v nodeIn.numArcs=%v",
						destPos, srcPos, arcIdx, maxBytesPerArc, t.bytesPerArc[arcIdx], nodeIn.NumArcs)
					t.bytes.copyBytesInside(srcPos, destPos, t.bytesPerArc[arcIdx])
				}
			}
		}

		// now write the header
		t.bytes.writeBytesAt(startAddress, header[:headerLen])
	}

	// address of the node's last byte
	thisNodeAddress := t.bytes.position() - 1

	t.bytes.reverse(startAddress, thisNodeAddress)

	// PackedInts uses int as the index, so we cannot handle > 2.1B
	// nodes when packing:
	assert2(t.nodeAddress == nil || t.nodeCount < math.MaxInt32,
		"cannot create a packed FST with more than 2.1 billion nodes")

	t.nodeCount++
	var node int64
	if t.nodeAddress != nil {
		panic("not implemented yet")
	} else {
		node = thisNodeAddress
	}
	t.lastFrozenNode = node

	// fmt.Printf("  ret node=%v address=%v nodeAddress=%v",
	// 	node, thisNodeAddress, t.nodeAddress)
	return node, nil
}
   743  
   744  func (t *FST) FirstArc(arc *Arc) *Arc {
   745  	if t.emptyOutput != nil {
   746  		arc.flags = FST_BIT_FINAL_ARC | FST_BIT_LAST_ARC
   747  		arc.NextFinalOutput = t.emptyOutput
   748  		if t.emptyOutput != NO_OUTPUT {
   749  			arc.flags |= FST_BIT_ARC_HAS_FINAL_OUTPUT
   750  		}
   751  	} else {
   752  		arc.flags = FST_BIT_LAST_ARC
   753  		arc.NextFinalOutput = t.NO_OUTPUT
   754  	}
   755  	arc.Output = t.NO_OUTPUT
   756  
   757  	// If there are no nodes, ie, the FST only accepts the
   758  	// empty string, then startNode is 0
   759  	arc.target = t.startNode
   760  	return arc
   761  }
   762  
   763  func (t *FST) readUnpackedNodeTarget(in BytesReader) (target int64, err error) {
   764  	if t.version < FST_VERSION_VINT_TARGET {
   765  		return AsInt64(in.ReadInt())
   766  	}
   767  	return in.ReadVLong()
   768  }
   769  
   770  func AsInt(n int32, err error) (n2 int, err2 error) {
   771  	return int(n), err
   772  }
   773  
   774  func AsInt64(n int32, err error) (n2 int64, err2 error) {
   775  	return int64(n), err
   776  }
   777  
   778  func (t *FST) readFirstTargetArc(follow, arc *Arc, in BytesReader) (*Arc, error) {
   779  	if follow.IsFinal() {
   780  		// insert "fake" final first arc:
   781  		arc.Label = FST_END_LABEL
   782  		arc.Output = follow.NextFinalOutput
   783  		arc.flags = FST_BIT_FINAL_ARC
   784  		if follow.target <= 0 {
   785  			arc.flags |= FST_BIT_LAST_ARC
   786  		} else {
   787  			arc.node = follow.target
   788  			// NOTE: nextArc is a node (not an address!) in this case:
   789  			arc.nextArc = follow.target
   790  		}
   791  		arc.target = FST_FINAL_END_NODE
   792  		return arc, nil
   793  	}
   794  	return t.readFirstRealTargetArc(follow.target, arc, in)
   795  }
   796  
   797  func (t *FST) readFirstRealTargetArc(node int64, arc *Arc, in BytesReader) (ans *Arc, err error) {
   798  	address := t.getNodeAddress(node)
   799  	in.setPosition(address)
   800  	arc.node = node
   801  
   802  	flag, err := in.ReadByte()
   803  	if err != nil {
   804  		return nil, err
   805  	}
   806  	if flag == FST_ARCS_AS_FIXED_ARRAY {
   807  		// this is first arc in a fixed-array
   808  		arc.numArcs, err = AsInt(in.ReadVInt())
   809  		if err != nil {
   810  			return nil, err
   811  		}
   812  		if t.packed || t.version >= FST_VERSION_VINT_TARGET {
   813  			arc.bytesPerArc, err = AsInt(in.ReadVInt())
   814  		} else {
   815  			arc.bytesPerArc, err = AsInt(in.ReadInt())
   816  		}
   817  		if err != nil {
   818  			return nil, err
   819  		}
   820  		arc.arcIdx = -1
   821  		pos := in.getPosition()
   822  		arc.nextArc, arc.posArcsStart = pos, pos
   823  	} else {
   824  		// arc.flags = b
   825  		arc.nextArc = address
   826  		arc.bytesPerArc = 0
   827  	}
   828  
   829  	return t.readNextRealArc(arc, in)
   830  }
   831  
   832  func (t *FST) readNextArc(arc *Arc, in BytesReader) (*Arc, error) {
   833  	if arc.Label == FST_END_LABEL {
   834  		// this was a fake inserted "final" arc
   835  		assert2(arc.nextArc > 0, "cannot readNextArc when arc.isLast()=true")
   836  		return t.readFirstRealTargetArc(arc.nextArc, arc, in)
   837  	} else {
   838  		return t.readNextRealArc(arc, in)
   839  	}
   840  }
   841  
// readNextRealArc reads the next serialized arc of the current node into
// arc and returns it; on a read error it returns nil and the error. You
// should never call this if arc.isLast() is true.
//
// The reader is first positioned on the arc (by slot index for fixed-array
// nodes, by arc.nextArc otherwise); then flags, label, optional output and
// optional final output are read, and finally the target and the position
// of the following arc are resolved according to the flags.
func (t *FST) readNextRealArc(arc *Arc, in BytesReader) (ans *Arc, err error) {
	// TODO: can't assert this because we call from readFirstArc
	// assert !flag(arc.flags, BIT_LAST_ARC);

	// this is a continuing arc in a fixed array
	if arc.bytesPerArc != 0 { // arcs are at fixed entries
		arc.arcIdx++
		// assert arc.arcIdx < arc.numArcs
		in.setPosition(arc.posArcsStart)
		in.skipBytes(int64(arc.arcIdx * arc.bytesPerArc))
	} else { // arcs are packed
		in.setPosition(arc.nextArc)
	}
	if arc.flags, err = in.ReadByte(); err == nil {
		arc.Label, err = t.readLabel(in)
	}
	if err != nil {
		return nil, err
	}

	if arc.flag(FST_BIT_ARC_HAS_OUTPUT) {
		arc.Output, err = t.outputs.Read(in)
		if err != nil {
			return nil, err
		}
	} else {
		arc.Output = t.outputs.NoOutput()
	}

	if arc.flag(FST_BIT_ARC_HAS_FINAL_OUTPUT) {
		arc.NextFinalOutput, err = t.outputs.ReadFinalOutput(in)
		if err != nil {
			return nil, err
		}
	} else {
		arc.NextFinalOutput = t.outputs.NoOutput()
	}

	if arc.flag(FST_BIT_STOP_NODE) {
		// no target is stored; it is one of the two end-node markers
		if arc.flag(FST_BIT_FINAL_ARC) {
			arc.target = FST_FINAL_END_NODE
		} else {
			arc.target = FST_NON_FINAL_END_NODE
		}
		arc.nextArc = in.getPosition()
	} else if arc.flag(FST_BIT_TARGET_NEXT) {
		// no target is stored; it is derived from the reader position
		// (or from the node number when a nodeAddress table exists)
		arc.nextArc = in.getPosition()
		// TODO: would be nice to make this lazy -- maybe
		// caller doesn't need the target and is scanning arcs...
		if t.nodeAddress == nil {
			if !arc.flag(FST_BIT_LAST_ARC) {
				if arc.bytesPerArc == 0 { // must scan
					t.seekToNextNode(in)
				} else {
					// fixed array: jump past all slots of this node
					in.setPosition(arc.posArcsStart)
					in.skipBytes(int64(arc.bytesPerArc * arc.numArcs))
				}
			}
			arc.target = in.getPosition()
		} else {
			arc.target = arc.node - 1
			// assert arc.target > 0
		}
	} else {
		if t.packed {
			pos := in.getPosition()
			code, err := in.ReadVLong()
			if err != nil {
				return nil, err
			}
			if arc.flag(FST_BIT_TARGET_DELTA) { // Address is delta-coded from current address:
				arc.target = pos + code
			} else if code < int64(t.nodeRefToAddress.Size()) { // Deref
				arc.target = t.nodeRefToAddress.Get(int(code))
			} else { // Absolute
				arc.target = code
			}
		} else {
			arc.target, err = t.readUnpackedNodeTarget(in)
			if err != nil {
				return nil, err
			}
		}
		arc.nextArc = in.getPosition()
	}
	return arc, nil
}
   931  
// TODO: could we somehow [partially] tableize arc lookups
// like automaton?
   934  
   935  /** Finds an arc leaving the incoming arc, replacing the arc in place.
   936   *  This returns null if the arc was not found, else the incoming arc. */
   937  func (t *FST) FindTargetArc(labelToMatch int, follow *Arc, arc *Arc, in BytesReader) (target *Arc, err error) {
   938  	if labelToMatch == FST_END_LABEL {
   939  		if follow.IsFinal() {
   940  			if follow.target <= 0 {
   941  				arc.flags = FST_BIT_LAST_ARC
   942  			} else {
   943  				arc.flags = 0
   944  				// NOTE: nextArc is a node (not an address!) in this case:
   945  				arc.nextArc = follow.target
   946  				arc.node = follow.target
   947  			}
   948  			arc.Output = follow.NextFinalOutput
   949  			arc.Label = FST_END_LABEL
   950  			return arc, nil
   951  		} else {
   952  			return nil, nil
   953  		}
   954  	}
   955  
   956  	// Short-circuit if this arc is in the root arc cache:
   957  	if follow.target == t.startNode && labelToMatch < len(t.cachedRootArcs) {
   958  		// LUCENE-5152: detect tricky cases where caller
   959  		// modified previously returned cached root-arcs:
   960  		t.assertRootArcs()
   961  		if result := t.cachedRootArcs[labelToMatch]; result != nil {
   962  			arc.copyFrom(result)
   963  			return arc, nil
   964  		}
   965  		return nil, nil
   966  	}
   967  
   968  	if !targetHasArcs(follow) {
   969  		return nil, nil
   970  	}
   971  
   972  	in.setPosition(t.getNodeAddress(follow.target))
   973  
   974  	arc.node = follow.target
   975  
   976  	// log.Printf("fta label=%v", labelToMatch)
   977  
   978  	b, err := in.ReadByte()
   979  	if err != nil {
   980  		return nil, err
   981  	}
   982  	if b == FST_ARCS_AS_FIXED_ARRAY {
   983  		// Arcs are full array; do binary search:
   984  		arc.numArcs, err = AsInt(in.ReadVInt())
   985  		if err != nil {
   986  			return nil, err
   987  		}
   988  		if t.packed || t.version >= FST_VERSION_VINT_TARGET {
   989  			arc.bytesPerArc, err = AsInt(in.ReadVInt())
   990  			if err != nil {
   991  				return nil, err
   992  			}
   993  		} else {
   994  			arc.bytesPerArc, err = AsInt(in.ReadInt())
   995  			if err != nil {
   996  				return nil, err
   997  			}
   998  		}
   999  		arc.posArcsStart = in.getPosition()
  1000  		for low, high := 0, arc.numArcs-1; low < high; {
  1001  			// log.Println("    cycle")
  1002  			mid := int(uint(low+high) / 2)
  1003  			in.setPosition(arc.posArcsStart)
  1004  			in.skipBytes(int64(arc.bytesPerArc*mid) + 1)
  1005  			midLabel, err := t.readLabel(in)
  1006  			if err != nil {
  1007  				return nil, err
  1008  			}
  1009  			cmp := midLabel - labelToMatch
  1010  			if cmp < 0 {
  1011  				low = mid + 1
  1012  			} else if cmp > 0 {
  1013  				high = mid - 1
  1014  			} else {
  1015  				arc.arcIdx = mid - 1
  1016  				// log.Println("    found!")
  1017  				return t.readNextRealArc(arc, in)
  1018  			}
  1019  		}
  1020  
  1021  		return nil, nil
  1022  	}
  1023  
  1024  	// Linear scan
  1025  
  1026  	if _, err = t.readFirstRealTargetArc(follow.target, arc, in); err != nil {
  1027  		return nil, err
  1028  	}
  1029  
  1030  	for {
  1031  		//System.out.println("  non-bs cycle");
  1032  		// TODO: we should fix this code to not have to create
  1033  		// object for the output of every arc we scan... only
  1034  		// for the matching arc, if found
  1035  		if arc.Label == labelToMatch {
  1036  			//System.out.println("    found!");
  1037  			return arc, nil
  1038  		} else if arc.Label > labelToMatch {
  1039  			return nil, nil
  1040  		} else if arc.isLast() {
  1041  			return nil, nil
  1042  		} else {
  1043  			if _, err = t.readNextRealArc(arc, in); err != nil {
  1044  				return nil, err
  1045  			}
  1046  		}
  1047  	}
  1048  }
  1049  
  1050  func (t *FST) seekToNextNode(in BytesReader) error {
  1051  	var err error
  1052  	var flags byte
  1053  	for {
  1054  		if flags, err = in.ReadByte(); err == nil {
  1055  			_, err = t.readLabel(in)
  1056  		}
  1057  		if err != nil {
  1058  			return err
  1059  		}
  1060  
  1061  		if hasFlag(flags, FST_BIT_ARC_HAS_OUTPUT) {
  1062  			if err = t.outputs.SkipOutput(in); err != nil {
  1063  				return err
  1064  			}
  1065  		}
  1066  
  1067  		if hasFlag(flags, FST_BIT_ARC_HAS_FINAL_OUTPUT) {
  1068  			if err = t.outputs.SkipFinalOutput(in); err != nil {
  1069  				return err
  1070  			}
  1071  		}
  1072  
  1073  		if !hasFlag(flags, FST_BIT_STOP_NODE) && !hasFlag(flags, FST_BIT_TARGET_NEXT) {
  1074  			if t.packed {
  1075  				_, err = in.ReadVLong()
  1076  			} else {
  1077  				_, err = t.readUnpackedNodeTarget(in)
  1078  			}
  1079  			if err != nil {
  1080  				return err
  1081  			}
  1082  		}
  1083  
  1084  		if hasFlag(flags, FST_BIT_LAST_ARC) {
  1085  			return nil
  1086  		}
  1087  	}
  1088  }
  1089  
  1090  func (t *FST) NodeCount() int64 {
  1091  	return t.nodeCount + 1
  1092  }
  1093  
  1094  func (t *FST) shouldExpand(node *UnCompiledNode) bool {
  1095  	return t.allowArrayArcs &&
  1096  		(node.depth <= FIXED_ARRAY_SHALLOW_DISTANCE && node.NumArcs >= FIXED_ARRAY_NUM_ARCS_SHALLOW ||
  1097  			node.NumArcs >= FIXED_ARRAY_NUM_ARCS_DEEP)
  1098  }
  1099  
  1100  func (t *FST) BytesReader() BytesReader {
  1101  	if t.packed {
  1102  		return t.bytes.forwardReader()
  1103  	}
  1104  	return t.bytes.reverseReader()
  1105  }
  1106  
  1107  type RandomAccess interface {
  1108  	getPosition() int64
  1109  	setPosition(pos int64)
  1110  	reversed() bool
  1111  	skipBytes(count int64)
  1112  }
  1113  
  1114  type BytesReader interface {
  1115  	// *util.DataInputImpl
  1116  	util.DataInput
  1117  	RandomAccess
  1118  }
  1119  
  1120  // L1464
  1121  /*
  1122  Expert: creates an FST by packing this one. This process requires
  1123  substantial additional RAM (currently up to ~8 bytes per node
  1124  depending on acceptableOverheadRatio), but then should produce a
  1125  smaller FST.
  1126  
  1127  The implementation of this method uses ideas from
  1128  <a target="_blank" href="http://www.cs.put.poznan.pl/dweiss/site/publications/download/fsacomp.pdf">Smaller Representation of Finite State Automata</a>
  1129  which describes techniques to reduce the size of a FST. However, this
  1130  is not a strict implementation of the algorithms described in this
  1131  paper.
  1132  */
  1133  func (t *FST) pack(minInCountDeref, maxDerefNodes int,
  1134  	acceptableOverheadRatio float32) (*FST, error) {
  1135  	panic("not implemented yet")
  1136  }