github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/index/merge.go

package index

import (
	"fmt"
	. "github.com/balzaczyy/golucene/core/codec/spi"
	// "github.com/balzaczyy/golucene/core/util"
	"io"
	"math"
	"sort"
	"sync"
)

// index/MergeScheduler.java

/*
Expert: IndexWriter uses an instance implementing this interface to
execute the merges selected by a MergePolicy. The default
MergeScheduler is ConcurrentMergeScheduler.

Implementers of subclasses should make sure that Clone() returns an
independent instance, able to work with any IndexWriter instance.
*/
type MergeScheduler interface {
	io.Closer
	Merge(*IndexWriter, MergeTrigger, bool) error
}
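
// The sketch below illustrates the contract: a scheduler is handed the
// IndexWriter and drains its pending merges. It is a minimal, hypothetical
// implementation (essentially SerialMergeScheduler below without the
// locking), shown only as an example of satisfying the interface.
type exampleDrainingScheduler struct{}

func (s *exampleDrainingScheduler) Merge(w *IndexWriter,
	trigger MergeTrigger, newMergesFound bool) error {
	// Pull pending merges one at a time and run them on the calling goroutine.
	for merge := w.nextMerge(); merge != nil; merge = w.nextMerge() {
		if err := w.merge(merge); err != nil {
			return err
		}
	}
	return nil
}

func (s *exampleDrainingScheduler) Close() error { return nil }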

// index/MergeState.java

// Recording units of work when merging segments.
type CheckAbort interface {
	// Records the fact that roughly units amount of work have been
	// done since this method was last called. When adding
	// time-consuming code into SegmentMerger, you should test
	// different values for units to ensure that the time in between
	// calls to merge.checkAborted is up to ~1 second.
	work(float64) error
}

/* If you use this: IW.close(false) cannot abort your merge! */
type CheckAbortNone int

func (ca CheckAbortNone) work(units float64) error { return nil } // do nothing
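
// A hypothetical sketch of how merging code would report progress through
// CheckAbort: after each chunk of copying, call work() with a rough unit
// count so that an abort is noticed within about a second. checkAbort,
// docChunks, and copyChunk are assumed names for this illustration, not
// APIs from this package.
//
//	for _, chunk := range docChunks {
//		copyChunk(chunk) // time-consuming merge work
//		if err := checkAbort.work(float64(len(chunk))); err != nil {
//			return err // the merge was aborted
//		}
//	}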

// index/SerialMergeScheduler.java

// A MergeScheduler that simply does each merge sequentially, using
// the current thread.
type SerialMergeScheduler struct {
	sync.Locker
}

func NewSerialMergeScheduler() *SerialMergeScheduler {
	return &SerialMergeScheduler{&sync.Mutex{}}
}

func (ms *SerialMergeScheduler) Merge(writer *IndexWriter,
	trigger MergeTrigger, newMergesFound bool) (err error) {
	ms.Lock() // synchronized
	defer ms.Unlock()

	for merge := writer.nextMerge(); merge != nil && err == nil; merge = writer.nextMerge() {
		err = writer.merge(merge)
	}
	return
}

// func (ms *SerialMergeScheduler) Clone() MergeScheduler {
// 	return NewSerialMergeScheduler()
// }

func (ms *SerialMergeScheduler) Close() error { return nil }
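
// Usage sketch (hedged; `writer` is an assumed *IndexWriter obtained
// elsewhere):
//
//	ms := NewSerialMergeScheduler()
//	defer ms.Close()
//	// Runs any pending merges on the calling goroutine, one at a time.
//	err := ms.Merge(writer, MERGE_TRIGGER_EXPLICIT, true)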

// index/MergePolicy.java

// Default max segment size in order to use compound file system.
// Set to maxInt64.
const DEFAULT_MAX_CFS_SEGMENT_SIZE = math.MaxInt64

/*
Expert: a MergePolicy determines the sequence of primitive merge
operations.

Whenever the segments in an index have been altered by IndexWriter,
either through the addition of a newly flushed segment, the addition
of many segments from addIndexes* calls, or a previous merge that may
now need to cascade, IndexWriter invokes findMerges() to give the
MergePolicy a chance to pick merges that are now required. This
method returns a MergeSpecification instance describing the set of
merges that should be done, or nil if no merges are necessary. When
IndexWriter.forceMerge() is called, it calls findForcedMerges() and
the MergePolicy should then return the necessary merges.

Note that the policy can return more than one merge at a time. In
this case, if the writer is using SerialMergeScheduler, the merges
will be run sequentially, but if it is using ConcurrentMergeScheduler
they will be run concurrently.

The default MergePolicy is TieredMergePolicy.
*/
type MergePolicy interface {
	SetNoCFSRatio(noCFSRatio float64)
	SetMaxCFSSegmentSizeMB(v float64)
	MergeSpecifier
}

type MergePolicyImplSPI interface {
	// Return the byte size of the provided SegmentCommitInfo,
	// pro-rated by percentage of non-deleted documents if
	// SetCalibrateSizeByDeletes() is set.
	Size(*SegmentCommitInfo, *IndexWriter) (int64, error)
}

type MergePolicyImpl struct {
	self    MergeSpecifier
	SizeSPI MergePolicyImplSPI
	// If the size of the merged segment exceeds this ratio of the
	// total index size, then it will remain in non-compound format.
	noCFSRatio float64
	// If the size of the merged segment exceeds this value, then it
	// will not use the compound file format.
	maxCFSSegmentSize float64
}

type MergeSpecifier interface {
	// Determine what set of merge operations are now necessary on the
	// index. IndexWriter calls this whenever there is a change to the
	// segments. This call is always synchronized on the IndexWriter
	// instance so only one thread at a time will call this method.
	FindMerges(MergeTrigger, *SegmentInfos, *IndexWriter) (MergeSpecification, error)
	// Determine what set of merge operations is necessary in order to
	// merge to <= the specified segment count. IndexWriter calls this
	// when its forceMerge() method is called. This call is always
	// synchronized on the IndexWriter instance so only one thread at a
	// time will call this method.
	FindForcedMerges(*SegmentInfos, int,
		map[*SegmentCommitInfo]bool, *IndexWriter) (MergeSpecification, error)
	// Determine what set of merge operations is necessary in order to
	// expunge all deletes from the index.
	// FindForcedDeletesMerges(segmentinfos *SegmentInfos) (spec MergeSpecification, err error)
}
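
// A minimal, hypothetical policy satisfying MergeSpecifier: it never
// selects merges, so nil specs are returned everywhere. It reuses the
// embedding pattern that TieredMergePolicy and LogMergePolicy use below;
// the type itself is only an illustration, not part of the Lucene port.
type exampleNoMergePolicy struct {
	*MergePolicyImpl
}

func newExampleNoMergePolicy() *exampleNoMergePolicy {
	res := new(exampleNoMergePolicy)
	res.MergePolicyImpl = NewDefaultMergePolicyImpl(res)
	return res
}

// A nil MergeSpecification tells IndexWriter that no merges are necessary.
func (p *exampleNoMergePolicy) FindMerges(trigger MergeTrigger,
	infos *SegmentInfos, w *IndexWriter) (MergeSpecification, error) {
	return nil, nil
}

func (p *exampleNoMergePolicy) FindForcedMerges(infos *SegmentInfos,
	maxSegmentCount int, segmentsToMerge map[*SegmentCommitInfo]bool,
	w *IndexWriter) (MergeSpecification, error) {
	return nil, nil
}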

/*
Creates a new merge policy instance. Note that if you intend to use
it without passing it to IndexWriter, you should call SetIndexWriter().
*/
func NewDefaultMergePolicyImpl(self MergeSpecifier) *MergePolicyImpl {
	return newMergePolicyImpl(self, DEFAULT_NO_CFS_RATIO, DEFAULT_MAX_CFS_SEGMENT_SIZE)
}

/*
Creates a new merge policy instance with the given defaults for
noCFSRatio and maxCFSSegmentSize. This ctor should be used by
subclasses that use different defaults than the MergePolicy.
*/
func newMergePolicyImpl(self MergeSpecifier, defaultNoCFSRatio, defaultMaxCFSSegmentSize float64) *MergePolicyImpl {
	ans := &MergePolicyImpl{
		self:              self,
		noCFSRatio:        defaultNoCFSRatio,
		maxCFSSegmentSize: defaultMaxCFSSegmentSize,
	}
	ans.SizeSPI = ans
	return ans
}

func (mp *MergePolicyImpl) Size(info *SegmentCommitInfo, w *IndexWriter) (n int64, err error) {
	byteSize, err := info.SizeInBytes()
	if err != nil {
		return 0, err
	}
	docCount := info.Info.DocCount()
	if docCount <= 0 {
		return byteSize, nil
	}

	delCount := w.readerPool.numDeletedDocs(info)
	delRatio := float32(delCount) / float32(docCount)
	assert(delRatio <= 1)
	return int64(float32(byteSize) * (1 - delRatio)), nil
}
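
// Worked example: a segment of 100 MB with 1000 documents, 250 of which
// are deleted, has delRatio = 250/1000 = 0.25, so its pro-rated size is
// 100 MB * (1 - 0.25) = 75 MB. The numbers are illustrative only.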

/*
Returns true if this single info is already fully merged (has no
pending deletes, is in the same dir as the writer, and matches the
current compound file setting).
*/
func (mp *MergePolicyImpl) isMerged(infos *SegmentInfos,
	info *SegmentCommitInfo, w *IndexWriter) bool {
	panic("not implemented yet")
	// The code below is unreachable until the compound-file check is
	// ported; it is kept as a sketch of the intended logic.
	assert(w != nil)
	hasDeletions := w.readerPool.numDeletedDocs(info) > 0
	return !hasDeletions &&
		!info.Info.HasSeparateNorms() &&
		info.Info.Dir == w.directory &&
		(mp.noCFSRatio > 0 && mp.noCFSRatio < 1 || mp.maxCFSSegmentSize < math.MaxInt64)
}

/*
If a merged segment will be more than this percentage of the total
size of the index, leave the segment as non-compound file even if
compound file is enabled. Set to 1.0 to always use CFS regardless of
merge size.
*/
func (mp *MergePolicyImpl) SetNoCFSRatio(noCFSRatio float64) {
	assert2(noCFSRatio >= 0 && noCFSRatio <= 1, fmt.Sprintf(
		"noCFSRatio must be 0.0 to 1.0 inclusive; got %v", noCFSRatio))
	mp.noCFSRatio = noCFSRatio
}

/*
If a merged segment will be more than this value, leave the segment
as non-compound file even if compound file is enabled. Set this to
math.Inf(1) (default) and noCFSRatio to 1.0 to always use CFS
regardless of merge size.
*/
func (mp *MergePolicyImpl) SetMaxCFSSegmentSizeMB(v float64) {
	assert2(v >= 0, fmt.Sprintf("maxCFSSegmentSizeMB must be >=0 (got %v)", v))
	v *= 1024 * 1024
	if v > float64(math.MaxInt64) {
		mp.maxCFSSegmentSize = math.MaxInt64
	} else {
		mp.maxCFSSegmentSize = v
	}
}
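
// For instance, to always write compound files regardless of merge size
// (mirroring the doc comments above; `mp` is an assumed MergePolicy value):
//
//	mp.SetNoCFSRatio(1.0)
//	mp.SetMaxCFSSegmentSizeMB(math.Inf(1))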

// Passed to MergePolicy.FindMerges(MergeTrigger, SegmentInfos) to
// indicate the event that triggered the merge.
type MergeTrigger int

const (
	// Merge was triggered by a segment flush.
	MERGE_TRIGGER_SEGMENT_FLUSH = MergeTrigger(1)
	// Merge was triggered by a full flush. Full flushes can be caused
	// by a commit, NRT reader reopen, or a close call on the IndexWriter.
	MERGE_TRIGGER_FULL_FLUSH = MergeTrigger(2)
	/* Merge has been triggered explicitly by the user. */
	MERGE_TRIGGER_EXPLICIT = MergeTrigger(3)
	/* Merge was triggered by a successfully finished merge. */
	MERGE_FINISHED = MergeTrigger(4)
	// Merge was triggered by a closing IndexWriter.
	MERGE_CLOSING = MergeTrigger(5)
)

func MergeTriggerName(trigger MergeTrigger) string {
	switch int(trigger) {
	case 1:
		return "SEGMENT_FLUSH"
	case 2:
		return "FULL_FLUSH"
	case 3:
		return "EXPLICIT"
	case 4:
		return "MERGE_FINISHED"
	case 5:
		return "CLOSING"
	}
	panic(fmt.Sprintf("Invalid merge trigger: %v", trigger))
}

/*
OneMerge provides the information necessary to perform an individual
primitive merge operation, resulting in a single new segment. The
merge spec includes the subset of segments to be merged as well as
whether the new segment should use the compound file format.
*/
type OneMerge struct {
	sync.Locker

	registerDone   bool // used by MergeControl
	maxNumSegments int

	// Segments to be merged.
	segments []*SegmentCommitInfo

	// Total number of documents in segments to be merged, not
	// accounting for deletions.
	totalDocCount int
	aborted       bool
}

func NewOneMerge(segments []*SegmentCommitInfo) *OneMerge {
	assert2(len(segments) > 0, "segments must include at least one segment")
	// Clone the list, as the input list may be based off the original
	// SegmentInfos and may be modified.
	segments2 := make([]*SegmentCommitInfo, len(segments))
	copy(segments2, segments)
	count := 0
	for _, info := range segments {
		count += info.Info.DocCount()
	}
	return &OneMerge{
		Locker:         &sync.Mutex{}, // without this, abort() would panic on a nil Locker
		maxNumSegments: -1,
		segments:       segments2,
		totalDocCount:  count,
	}
}

func (m *OneMerge) abort() {
	m.Lock()
	defer m.Unlock()
	m.aborted = true
}

/*
A MergeSpecification instance provides the information necessary to
perform multiple merges. It simply contains a list of OneMerge
instances.
*/
type MergeSpecification []*OneMerge
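
// Sketch of how a policy typically assembles a specification (the
// candidate segment slices are hypothetical):
//
//	var spec MergeSpecification
//	spec = append(spec, NewOneMerge(candidates1))
//	spec = append(spec, NewOneMerge(candidates2))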

/*
Thrown when a merge was explicitly aborted because IndexWriter.close()
was called with false. Normally this error is privately caught and
suppressed by IndexWriter.
*/
type MergeAbortedError string

func (err MergeAbortedError) Error() string {
	return string(err)
}

// index/TieredMergePolicy.java

// Default noCFSRatio. If a merge's size is >= 10% of the index, then
// we disable compound file for it.
const DEFAULT_NO_CFS_RATIO = 0.1

/*
Merges segments of approximately equal size, subject to an allowed
number of segments per tier. This is similar to LogByteSizeMergePolicy,
except this merge policy is able to merge non-adjacent segments, and
separates how many segments are merged at once (SetMaxMergeAtOnce())
from how many segments are allowed per tier (SetSegmentsPerTier()).
This merge policy also does not over-merge (i.e. cascade merges).

For normal merging, this policy first computes a "budget" of how many
segments are allowed to be in the index. If the index is over-budget,
then the policy sorts segments by decreasing size (pro-rating by
percent deletes), and then finds the least-cost merge. Merge cost is
measured by a combination of the "skew" of the merge (size of the
largest segment divided by the smallest segment), total merge size,
and percent deletes reclaimed, so that merges with lower skew, smaller
size, and those reclaiming more deletes are favored.

If a merge will produce a segment that's larger than SetMaxMergedSegmentMB(),
then the policy will merge fewer segments (down to 1 at once, if that
one has deletions) to keep the segment size under budget.

NOTE: this policy freely merges non-adjacent segments; if this is a
problem, use LogMergePolicy.

NOTE: This policy always merges by byte size of the segments, always
pro-rates by percent deletes, and does not apply any maximum segment
size during forceMerge (unlike LogByteSizeMergePolicy).
*/
type TieredMergePolicy struct {
	*MergePolicyImpl

	maxMergeAtOnce         int
	maxMergedSegmentBytes  int64
	maxMergeAtOnceExplicit int

	floorSegmentBytes           int64
	segsPerTier                 float64
	forceMergeDeletesPctAllowed float64
	reclaimDeletesWeight        float64
}

func NewTieredMergePolicy() *TieredMergePolicy {
	res := &TieredMergePolicy{
		maxMergeAtOnce:              10,
		maxMergedSegmentBytes:       5 * 1024 * 1024 * 1024,
		maxMergeAtOnceExplicit:      30,
		floorSegmentBytes:           2 * 1024 * 1024,
		segsPerTier:                 10,
		forceMergeDeletesPctAllowed: 10,
		reclaimDeletesWeight:        2,
	}
	res.MergePolicyImpl = newMergePolicyImpl(res, DEFAULT_NO_CFS_RATIO, DEFAULT_MAX_CFS_SEGMENT_SIZE)
	return res
}
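
// Example configuration via the fluent setters defined below; the chosen
// values simply restate the defaults:
//
//	tmp := NewTieredMergePolicy().
//		SetMaxMergeAtOnce(10).
//		SetSegmentsPerTier(10).
//		SetMaxMergedSegmentMB(5 * 1024)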

/*
Maximum number of segments to be merged at a time during "normal"
merging. For explicit merging (e.g., forceMerge or forceMergeDeletes
was called), see SetMaxMergeAtOnceExplicit(). Default is 10.
*/
func (tmp *TieredMergePolicy) SetMaxMergeAtOnce(v int) *TieredMergePolicy {
	assert2(v >= 2, fmt.Sprintf("maxMergeAtOnce must be >= 2 (got %v)", v))
	tmp.maxMergeAtOnce = v
	return tmp
}

/*
Maximum number of segments to be merged at a time, during forceMerge
or forceMergeDeletes. Default is 30.
*/
func (tmp *TieredMergePolicy) SetMaxMergeAtOnceExplicit(v int) *TieredMergePolicy {
	assert2(v >= 2, fmt.Sprintf("maxMergeAtOnceExplicit must be >= 2 (got %v)", v))
	tmp.maxMergeAtOnceExplicit = v
	return tmp
}

/*
Maximum sized segment to produce during normal merging. This setting
is approximate: the estimate of the merged segment size is made by
summing the sizes of the to-be-merged segments (compensating for
percent deleted docs). Default is 5 GB.
*/
func (tmp *TieredMergePolicy) SetMaxMergedSegmentMB(v float64) *TieredMergePolicy {
	assert2(v >= 0, fmt.Sprintf("maxMergedSegmentMB must be >= 0 (got %v)", v))
	v *= 1024 * 1024
	tmp.maxMergedSegmentBytes = math.MaxInt64
	if v < math.MaxInt64 {
		tmp.maxMergedSegmentBytes = int64(v)
	}
	return tmp
}

/*
Controls how aggressively merges that reclaim more deletions are
favored. Higher values will more aggressively target merges that
reclaim deletions, but be careful not to go so high that way too much
merging takes place; a value of 3.0 is probably nearly too high. A
value of 0.0 means deletions don't impact merge selection.
*/
func (tmp *TieredMergePolicy) SetReclaimDeletesWeight(v float64) *TieredMergePolicy {
	assert2(v >= 0, fmt.Sprintf("reclaimDeletesWeight must be >= 0 (got %v)", v))
	tmp.reclaimDeletesWeight = v
	return tmp
}

/*
Segments smaller than this are "rounded up" to this size, i.e. treated
as equal (floor) size for merge selection. This is to prevent frequent
flushing of tiny segments from allowing a long tail in the index.
Default is 2 MB.
*/
func (tmp *TieredMergePolicy) SetFloorSegmentMB(v float64) *TieredMergePolicy {
	assert2(v > 0, fmt.Sprintf("floorSegmentMB must be > 0 (got %v)", v))
	v *= 1024 * 1024
	tmp.floorSegmentBytes = math.MaxInt64
	if v < math.MaxInt64 {
		tmp.floorSegmentBytes = int64(v)
	}
	return tmp
}

/*
When forceMergeDeletes is called, we only merge away a segment if its
delete percentage is over this threshold. Default is 10%.
*/
func (tmp *TieredMergePolicy) SetForceMergeDeletesPctAllowed(v float64) *TieredMergePolicy {
	assert2(v >= 0 && v <= 100, fmt.Sprintf("forceMergeDeletesPctAllowed must be between 0 and 100 inclusive (got %v)", v))
	tmp.forceMergeDeletesPctAllowed = v
	return tmp
}

/*
Sets the allowed number of segments per tier. Smaller values mean
more merging but fewer segments.

NOTE: this value should be >= SetMaxMergeAtOnce(), otherwise you'll
force too much merging to occur.
*/
func (tmp *TieredMergePolicy) SetSegmentsPerTier(v float64) *TieredMergePolicy {
	assert2(v >= 2, fmt.Sprintf("segmentsPerTier must be >= 2 (got %v)", v))
	tmp.segsPerTier = v
	return tmp
}

type BySizeDescendingSegments struct {
	values []*SegmentCommitInfo
	writer *IndexWriter
	spi    MergePolicyImplSPI
}

func (a *BySizeDescendingSegments) Len() int      { return len(a.values) }
func (a *BySizeDescendingSegments) Swap(i, j int) { a.values[i], a.values[j] = a.values[j], a.values[i] }
func (a *BySizeDescendingSegments) Less(i, j int) bool {
	var err error
	var sz1, sz2 int64
	sz1, err = a.spi.Size(a.values[i], a.writer)
	assert(err == nil)
	sz2, err = a.spi.Size(a.values[j], a.writer)
	assert(err == nil)
	if sz1 != sz2 {
		// Larger segments sort first, matching the type's name; the
		// too-big-segment scan in FindMerges relies on this order.
		return sz1 > sz2
	}
	return a.values[i].Info.Name < a.values[j].Info.Name
}

type MergeScore interface{}

func (tmp *TieredMergePolicy) FindMerges(mergeTrigger MergeTrigger,
	infos *SegmentInfos, w *IndexWriter) (spec MergeSpecification, err error) {

	if tmp.verbose(w) {
		tmp.message(w, "findMerges: %v segments", len(infos.Segments))
	}
	if len(infos.Segments) == 0 {
		return nil, nil
	}
	merging := w.MergingSegments()
	toBeMerged := make(map[*SegmentCommitInfo]bool)

	infosSorted := make([]*SegmentCommitInfo, len(infos.Segments))
	copy(infosSorted, infos.Segments)
	sort.Sort(&BySizeDescendingSegments{infosSorted, w, tmp})

	// Compute total index bytes & print details about the index
	totIndexBytes := int64(0)
	minSegmentBytes := int64(math.MaxInt64)
	for _, info := range infosSorted {
		var segBytes int64
		if segBytes, err = tmp.Size(info, w); err != nil {
			return
		}
		if tmp.verbose(w) {
			var extra string
			if _, ok := merging[info]; ok {
				extra = " [merging]"
			}
			if segBytes >= tmp.maxMergedSegmentBytes/2 {
				extra += " [skip: too large]"
			} else if segBytes < tmp.floorSegmentBytes {
				extra += " [floored]"
			}
			tmp.message(w, "  seg=%v size=%v MB%v",
				w.readerPool.segmentToString(info),
				fmt.Sprintf("%.3f", float32(segBytes)/1024/1024), extra)
		}

		if segBytes < minSegmentBytes {
			minSegmentBytes = segBytes
		}
		// Accumulate total byte size
		totIndexBytes += segBytes
	}

	// If we have too-large segments, grace them out of the maxSegmentCount:
	tooBigCount := 0
	for tooBigCount < len(infosSorted) {
		var n int64
		if n, err = tmp.Size(infosSorted[tooBigCount], w); err != nil {
			return nil, err
		}
		if n < tmp.maxMergedSegmentBytes/2 {
			break
		}
		totIndexBytes -= n
		tooBigCount++
	}

	minSegmentBytes = tmp.floorSize(minSegmentBytes)

	// Compute max allowed segs in the index
	levelSize := minSegmentBytes
	bytesLeft := totIndexBytes
	allowedSegCount := float64(0)
	for {
		if segCountLevel := float64(bytesLeft) / float64(levelSize); segCountLevel < tmp.segsPerTier {
			allowedSegCount += math.Ceil(segCountLevel)
			break
		}
		allowedSegCount += tmp.segsPerTier
		bytesLeft -= int64(tmp.segsPerTier * float64(levelSize))
		levelSize *= int64(tmp.maxMergeAtOnce)
	}
	allowedSegCountInt := int(allowedSegCount)
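
	// Worked example of the budget above (illustrative numbers): with
	// minSegmentBytes floored to 2 MB, totIndexBytes = 200 MB,
	// segsPerTier = 10, and maxMergeAtOnce = 10, the first tier allows
	// 10 segments of 2 MB (bytesLeft drops to 180 MB, levelSize grows
	// to 20 MB); the remaining 180 MB / 20 MB = 9 < segsPerTier closes
	// the loop, giving allowedSegCount = 10 + ceil(9) = 19.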

	// Cycle to possibly select more than one merge
	for {
		mergingBytes := int64(0)

		// Gather eligible segments for merging, i.e. segments not
		// already being merged and not already picked (by a prior
		// iteration of this loop) for merging:
		var eligible []*SegmentCommitInfo
		for _, info := range infosSorted[tooBigCount:] {
			if _, ok := merging[info]; ok {
				var n int64
				if n, err = info.SizeInBytes(); err != nil {
					return
				}
				mergingBytes += n
			} else if _, ok := toBeMerged[info]; !ok {
				eligible = append(eligible, info)
			}
		}

		// maxMergeIsRunning := mergingBytes >= tmp.maxMergedSegmentBytes

		if tmp.verbose(w) {
			tmp.message(w,
				"  allowedSegmentCount=%v vs count=%v (eligible count=%v) tooBigCount=%v",
				allowedSegCountInt, len(infosSorted), len(eligible), tooBigCount)
		}

		if len(eligible) == 0 {
			return // spec is nil
		}

		if len(eligible) > allowedSegCountInt {

			// OK we are over budget -- find best merge!
			// var bestScore MergeScore
			var best []*SegmentCommitInfo
			// var bestTooLarge bool
			// var bestMergeBytes int64

			// Consider all merge starts (note <=, so a full window at
			// the tail is still considered):
			for startIdx := 0; startIdx <= len(eligible)-tmp.maxMergeAtOnce; startIdx++ {
				var totAfterMergesBytes int64
				var candidate []*SegmentCommitInfo
				// var hitTooLarge bool
				for idx := startIdx; idx < len(eligible) && len(candidate) < tmp.maxMergeAtOnce; idx++ {
					info := eligible[idx]
					var segBytes int64
					if segBytes, err = tmp.Size(info, w); err != nil {
						return nil, err
					}

					if totAfterMergesBytes+segBytes > tmp.maxMergedSegmentBytes {
						panic("not implemented yet")
					}
					panic("not implemented yet")
				}

				panic("not implemented yet")
			}

			if best != nil {
				panic("not implemented yet")
			} else {
				return spec, nil
			}
		} else {
			return
		}
	}
}

func (tmp *TieredMergePolicy) FindForcedMerges(infos *SegmentInfos,
	maxSegmentCount int, segmentsToMerge map[*SegmentCommitInfo]bool,
	w *IndexWriter) (MergeSpecification, error) {
	panic("not implemented yet")
}

// Returns bytes, but never less than the configured floor segment size.
func (tmp *TieredMergePolicy) floorSize(bytes int64) int64 {
	if bytes > tmp.floorSegmentBytes {
		return bytes
	}
	return tmp.floorSegmentBytes
}

func (tmp *TieredMergePolicy) verbose(w *IndexWriter) bool {
	return w != nil && w.infoStream.IsEnabled("TMP")
}

func (tmp *TieredMergePolicy) message(w *IndexWriter, message string, args ...interface{}) {
	w.infoStream.Message("TMP", message, args...)
}

func (tmp *TieredMergePolicy) String() string {
	return fmt.Sprintf("[TieredMergePolicy: maxMergeAtOnce=%v, maxMergeAtOnceExplicit=%v, maxMergedSegmentMB=%v, floorSegmentMB=%v, forceMergeDeletesPctAllowed=%v, segmentsPerTier=%v, maxCFSSegmentSizeMB=%v, noCFSRatio=%v]",
		tmp.maxMergeAtOnce, tmp.maxMergeAtOnceExplicit, tmp.maxMergedSegmentBytes/1024/1024,
		tmp.floorSegmentBytes/1024/1024, tmp.forceMergeDeletesPctAllowed, tmp.segsPerTier,
		tmp.maxCFSSegmentSize/1024/1024, tmp.noCFSRatio)
}

// index/LogMergePolicy.java

/*
Defines the allowed range of log(size) for each level. A level is
computed by taking the max segment log size, minus LEVEL_LOG_SPAN,
and finding all segments falling within that range.
*/
const LEVEL_LOG_SPAN = 0.75

// Default merge factor, which is how many segments are merged at a time
const DEFAULT_MERGE_FACTOR = 10

/*
This class implements a MergePolicy that tries to merge segments into
levels of exponentially increasing size, where each level has fewer
segments than the value of the merge factor. Whenever extra segments
(beyond the merge factor upper bound) are encountered, all segments
within the level are merged. You can get or set the merge factor
using MergeFactor() and SetMergeFactor() respectively.

This class is abstract and requires a subclass to define the Size()
method, which specifies how a segment's size is determined.
LogDocMergePolicy is one subclass that measures size by document
count in the segment. LogByteSizeMergePolicy is another subclass that
measures size as the total byte size of the file(s) for the segment.
*/
type LogMergePolicy struct {
	*MergePolicyImpl

	// How many segments to merge at a time.
	mergeFactor int
	// Any segments whose size is smaller than this value will be
	// rounded up to this value. This ensures that tiny segments are
	// aggressively merged.
	minMergeSize int64
	// If the size of a segment exceeds this value then it will never
	// be merged.
	maxMergeSize int64
	// Although the core MPs set it explicitly, we must default it in
	// case someone out there wrote their own LMP ...
	// If the size of a segment exceeds this value then it will never
	// be merged during ForceMerge()
	maxMergeSizeForForcedMerge int64
	// If true, we pro-rate a segment's size by the percentage of
	// non-deleted documents.
	calibrateSizeByDeletes bool
}

func NewLogMergePolicy(min, max int64) *LogMergePolicy {
	res := &LogMergePolicy{
		mergeFactor:                DEFAULT_MERGE_FACTOR,
		minMergeSize:               min,
		maxMergeSize:               max,
		maxMergeSizeForForcedMerge: math.MaxInt64,
		calibrateSizeByDeletes:     true,
	}
	res.MergePolicyImpl = newMergePolicyImpl(res, DEFAULT_NO_CFS_RATIO, DEFAULT_MAX_CFS_SEGMENT_SIZE)
	return res
}

// Returns true if LMP is enabled in IndexWriter's InfoStream.
func (mp *LogMergePolicy) verbose(w *IndexWriter) bool {
	return w != nil && w.infoStream.IsEnabled("LMP")
}

// Print a debug message to IndexWriter's infoStream.
func (mp *LogMergePolicy) message(message string, w *IndexWriter) {
	if mp.verbose(w) {
		w.infoStream.Message("LMP", message)
	}
}

/*
Determines how often segment indices are merged by AddDocument(). With
smaller values, less RAM is used while indexing, and searches are
faster, but indexing speed is slower. With larger values, more RAM is
used during indexing, and while searches are slower, indexing is
faster. Thus larger values (> 10) are best for batch index creation,
and smaller values (< 10) for indices that are interactively
maintained.
*/
func (mp *LogMergePolicy) SetMergeFactor(mergeFactor int) {
	assert2(mergeFactor >= 2, "mergeFactor cannot be less than 2")
	mp.mergeFactor = mergeFactor
}

// Sets whether the segment size should be calibrated by the number
// of deletes when choosing segments to merge.
func (mp *LogMergePolicy) SetCalibrateSizeByDeletes(calibrateSizeByDeletes bool) {
	mp.calibrateSizeByDeletes = calibrateSizeByDeletes
}

/*
Return the number of documents in the provided SegmentCommitInfo,
pro-rated by percentage of non-deleted documents if
SetCalibrateSizeByDeletes() is set.
*/
func (mp *LogMergePolicy) sizeDocs(info *SegmentCommitInfo, w *IndexWriter) (n int64, err error) {
	infoDocCount := info.Info.DocCount()
	if mp.calibrateSizeByDeletes {
		delCount := w.readerPool.numDeletedDocs(info)
		assert(delCount <= infoDocCount)
		return int64(infoDocCount - delCount), nil
	}
	return int64(infoDocCount), nil
}
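
// For example, a segment with 1000 documents of which 250 are deleted
// sizes as 750 docs when calibrateSizeByDeletes is true, and as 1000
// docs otherwise (numbers are illustrative).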

/*
Return the byte size of the provided SegmentCommitInfo, pro-rated
by percentage of non-deleted documents if SetCalibrateSizeByDeletes()
is set.
*/
func (mp *LogMergePolicy) sizeBytes(info *SegmentCommitInfo, w *IndexWriter) (n int64, err error) {
	if mp.calibrateSizeByDeletes {
		return mp.MergePolicyImpl.Size(info, w)
	}
	return info.SizeInBytes()
}

/*
Returns true if the number of segments eligible for merging is less
than or equal to the specified maxNumSegments.
*/
func (mp *LogMergePolicy) isMergedBy(infos *SegmentInfos,
	maxNumSegments int, segmentsToMerge map[*SegmentCommitInfo]bool,
	w *IndexWriter) bool {
	panic("not implemented yet")
}

func (mp *LogMergePolicy) FindForcedMerges(infos *SegmentInfos,
	maxSegmentCount int, segmentsToMerge map[*SegmentCommitInfo]bool,
	w *IndexWriter) (MergeSpecification, error) {
	panic("not implemented yet")
}

type SegmentInfoAndLevel struct {
	info  *SegmentCommitInfo
	level float32
	index int
}

type SegmentInfoAndLevels []SegmentInfoAndLevel

func (ss SegmentInfoAndLevels) Len() int           { return len(ss) }
func (ss SegmentInfoAndLevels) Swap(i, j int)      { ss[i], ss[j] = ss[j], ss[i] }
func (ss SegmentInfoAndLevels) Less(i, j int) bool { return ss[i].level < ss[j].level }

/*
Checks if any merges are now necessary and returns a MergeSpecification
if so. A merge is necessary when there are more than SetMergeFactor()
segments at a given level. When multiple levels have too many
segments, this method will return multiple merges, allowing the
MergeScheduler to use concurrency.
*/
func (mp *LogMergePolicy) FindMerges(mergeTrigger MergeTrigger,
	infos *SegmentInfos, w *IndexWriter) (spec MergeSpecification, err error) {
	numSegments := len(infos.Segments)
	mp.message(fmt.Sprintf("findMerges: %v segments", numSegments), w)

	// Compute levels, which is just log (base mergeFactor) of the size
	// of each segment
	levels := make([]*SegmentInfoAndLevel, 0)
	norm := math.Log(float64(mp.mergeFactor))

	mergingSegments := w.mergingSegments

	for i, info := range infos.Segments {
		// Route through SizeSPI so that subclass sizing (doc count for
		// LogDocMergePolicy, bytes for LogByteSizeMergePolicy) applies.
		size, err := mp.SizeSPI.Size(info, w)
		if err != nil {
			return nil, err
		}

		// Floor tiny segments
		if size < 1 {
			size = 1
		}

		infoLevel := &SegmentInfoAndLevel{info, float32(math.Log(float64(size)) / norm), i}
		levels = append(levels, infoLevel)

		if mp.verbose(w) {
			segBytes, err := mp.sizeBytes(info, w)
			if err != nil {
				return nil, err
			}
			var extra string
			if _, ok := mergingSegments[info]; ok {
				extra = " [merging]"
			}
			if size >= mp.maxMergeSize {
				extra = fmt.Sprintf("%v [skip: too large]", extra)
			}
			mp.message(fmt.Sprintf("seg=%v level=%v size=%.3f MB%v",
				w.readerPool.segmentToString(info),
				infoLevel.level,
				float64(segBytes)/1024/1024,
				extra), w)
		}
	}

	var levelFloor float32 = 0
	if mp.minMergeSize > 0 {
		levelFloor = float32(math.Log(float64(mp.minMergeSize)) / norm)
	}

	// Now, we quantize the log values into levels. The first level is
	// any segment whose log size is within LEVEL_LOG_SPAN of the max
	// size, or which has such a segment "to the right". Then, we find
	// the max of all other segments and use that to define the next
	// level of segments, etc.
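
	// Worked example (illustrative): with mergeFactor=10, norm=ln(10),
	// a 1 GB segment has level log10(2^30) ≈ 9.03; with LEVEL_LOG_SPAN
	// of 0.75, the top level then spans every segment whose level is
	// >= 9.03 - 0.75 ≈ 8.28, i.e. roughly 190 MB and up.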

	numMergeableSegments := len(levels)

	for start := 0; start < numMergeableSegments; {
		// Find max level of all segments not already quantized.
		maxLevel := levels[start].level
		for i := 1 + start; i < numMergeableSegments; i++ {
			level := levels[i].level
			if level > maxLevel {
				maxLevel = level
			}
		}

		// Now search backwards for the rightmost segment that falls into
		// this level:
		var levelBottom float32
		if maxLevel <= levelFloor {
			// All remaining segments fall into the min level
			levelBottom = -1
		} else {
			levelBottom = float32(float64(maxLevel) - LEVEL_LOG_SPAN)

			// Force a boundary at the level floor
			if levelBottom < levelFloor && maxLevel >= levelFloor {
				levelBottom = levelFloor
			}
		}

		upto := numMergeableSegments - 1
		for upto >= start {
			if levels[upto].level >= levelBottom {
				break
			}
			upto--
		}
		mp.message(fmt.Sprintf("  level %v to %v: %v segments",
			levelBottom, maxLevel, 1+upto-start), w)

		// Finally, record all merges that are viable at this level:
		end := start + mp.mergeFactor
		for end <= 1+upto {
			panic("not implemented yet")
		}

		start = 1 + upto
	}

	return
}

func (mp *LogMergePolicy) String() string {
	panic("not implemented yet")
}

// index/LogDocMergePolicy.java

// Default minimum segment size.
const DEFAULT_MIN_MERGE_DOCS = 1000

/*
This is a LogMergePolicy that measures the size of a segment as the
number of documents (not taking deletions into account).
*/
type LogDocMergePolicy struct {
	*LogMergePolicy
}

func NewLogDocMergePolicy() *LogMergePolicy {
	ans := &LogDocMergePolicy{
		LogMergePolicy: NewLogMergePolicy(DEFAULT_MIN_MERGE_DOCS, math.MaxInt64),
	}
	// maxMergeSize (and maxMergeSizeForForcedMerge) are never used by
	// LogDocMergePolicy; set them to math.MaxInt64 to disable them.
	ans.maxMergeSizeForForcedMerge = math.MaxInt64
	ans.SizeSPI = ans
	return ans.LogMergePolicy
}

func (p *LogDocMergePolicy) Size(info *SegmentCommitInfo, w *IndexWriter) (int64, error) {
	return p.sizeDocs(info, w)
}

// index/LogByteSizeMergePolicy.java

// Default minimum segment size.
var DEFAULT_MIN_MERGE_MB = 1.6

// Default maximum segment size. A segment of this size or larger
// will never be merged.
const DEFAULT_MAX_MERGE_MB = 2048

// Default maximum segment size. A segment of this size or larger
// will never be merged during forceMerge.
var DEFAULT_MAX_MERGE_MB_FOR_FORCED_MERGE int64 = math.MaxInt64

// This is a LogMergePolicy that measures the size of a segment as the
// total byte size of the segment's files.
type LogByteSizeMergePolicy struct {
	*LogMergePolicy
}

func NewLogByteSizeMergePolicy() *LogMergePolicy {
	ans := &LogByteSizeMergePolicy{
		LogMergePolicy: NewLogMergePolicy(int64(DEFAULT_MIN_MERGE_MB*1024*1024),
			int64(DEFAULT_MAX_MERGE_MB*1024*1024)),
	}
	// DEFAULT_MAX_MERGE_MB_FOR_FORCED_MERGE is math.MaxInt64 MB, so
	// multiplying it by 1024*1024 would overflow int64; clamp to
	// math.MaxInt64, which effectively disables the limit.
	ans.maxMergeSizeForForcedMerge = math.MaxInt64
	ans.SizeSPI = ans
	return ans.LogMergePolicy
}
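
// Usage sketch: both concrete policies hand back the embedded
// *LogMergePolicy, so tuning goes through its setters:
//
//	lmp := NewLogByteSizeMergePolicy()
//	lmp.SetMergeFactor(10) // the default; shown for illustration
//	lmp.SetCalibrateSizeByDeletes(true)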

func (p *LogByteSizeMergePolicy) Size(info *SegmentCommitInfo, w *IndexWriter) (int64, error) {
	return p.sizeBytes(info, w)
}