// Source: github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/index/merge.go

package index

import (
	"fmt"
	. "github.com/balzaczyy/golucene/core/codec/spi"
	// "github.com/balzaczyy/golucene/core/util"
	"io"
	"math"
	"sort"
	"sync"
)

// index/MergeScheduler.java

/*
Expert: IndexWriter uses an instance implementing this interface to
execute the merges selected by a MergePolicy. The default
MergeScheduler is ConcurrentMergeScheduler.

Implementers of sub-classes should make sure that Clone() returns an
independent instance able to work with any IndexWriter instance.
*/
type MergeScheduler interface {
	io.Closer
	Merge(*IndexWriter, MergeTrigger, bool) error
}

// index/MergeState.java

// Recording units of work when merging segments.
type CheckAbort interface {
	// Records the fact that roughly units amount of work have been
	// done since this method was last called. When adding
	// time-consuming code into SegmentMerger, you should test
	// different values for units to ensure that the time in between
	// calls to merge.checkAborted is up to ~ 1 second.
	work(float64) error
}

/* If you use this: IW.close(false) cannot abort your merge! */
type CheckAbortNone int

// work ignores the reported units and always returns nil.
func (ca CheckAbortNone) work(units float64) error { return nil } // do nothing

// index/SerialMergeScheduler.java

// A MergeScheduler that simply does each merge sequentially, using
// the current thread.
49 type SerialMergeScheduler struct { 50 sync.Locker 51 } 52 53 func NewSerialMergeScheduler() *SerialMergeScheduler { 54 return &SerialMergeScheduler{&sync.Mutex{}} 55 } 56 57 func (ms *SerialMergeScheduler) Merge(writer *IndexWriter, 58 trigger MergeTrigger, newMergesFound bool) (err error) { 59 ms.Lock() // synchronized 60 defer ms.Unlock() 61 62 for merge := writer.nextMerge(); merge != nil && err == nil; merge = writer.nextMerge() { 63 err = writer.merge(merge) 64 } 65 return 66 } 67 68 // func (ms *SerialMergeScheduler) Clone() MergeScheduler { 69 // return NewSerialMergeScheduler() 70 // } 71 72 func (ms *SerialMergeScheduler) Close() error { return nil } 73 74 // index/MergePolicy.java 75 76 // Default max segment size in order to use compound file system. 77 // Set to maxInt64. 78 const DEFAULT_MAX_CFS_SEGMENT_SIZE = math.MaxInt64 79 80 /* 81 Expert: a MergePolicy determines the sequence of primitive merge 82 operations. 83 84 Whenever the segments in an index have been altered by IndexWriter, 85 either the addition of a newly flushed segment, addition of many 86 segments from addIndexes* calls, or a previous merge that may now 87 seed to cascade, IndexWriter invokes findMerges() to give the 88 MergePolicy a chance to pick merges that are now required. This 89 method returns a MergeSpecification instance describing the set of 90 merges that should be done, or nil if no merges are necessary. When 91 IndexWriter.forceMerge() is called, it calls findForcedMerges() and 92 the MergePolicy should then return the necessary merges. 93 94 Note that the policy can return more than one merge at a time. In 95 this case, if the writer is using SerialMergeScheduler, the merges 96 will be run sequentially but if it is using ConcurrentMergeScheduler 97 they will be run concurrently. 98 99 The default MergePolicy is TieredMergePolicy. 
*/
type MergePolicy interface {
	SetNoCFSRatio(noCFSRatio float64)
	SetMaxCFSSegmentSizeMB(v float64)
	MergeSpecifier
}

// MergePolicyImplSPI is the size-measuring hook held in
// MergePolicyImpl.SizeSPI.
type MergePolicyImplSPI interface {
	// Return the byte size of the provided SegmentCommitInfo,
	// pro-rated by percentage of non-deleted documents if
	// SetCalibrateSizeByDeletes() is set.
	Size(*SegmentCommitInfo, *IndexWriter) (int64, error)
}

// MergePolicyImpl holds the compound-file settings shared by the
// merge policies in this file.
type MergePolicyImpl struct {
	self    MergeSpecifier
	SizeSPI MergePolicyImplSPI
	// If the size of the merge segment exceeds this ratio of the total
	// index size then it will remain in non-compound format.
	noCFSRatio float64
	// If the size of the merged segment exceeds this value then it
	// will not use compound file format.
	maxCFSSegmentSize float64
}

type MergeSpecifier interface {
	// Determine what set of merge operations are now necessary on the
	// index. IndexWriter calls this whenever there is a change to the
	// segments. This call is always synchronized on the IndexWriter
	// instance so only one thread at a time will call this method.
	FindMerges(MergeTrigger, *SegmentInfos, *IndexWriter) (MergeSpecification, error)
	// Determine what set of merge operations is necessary in order to
	// merge to <= the specified segment count. IndexWriter calls this
	// when its forceMerge() method is called. This call is always
	// synchronized on the IndexWriter instance so only one thread at a
	// time will call this method.
	FindForcedMerges(*SegmentInfos, int,
		map[*SegmentCommitInfo]bool, *IndexWriter) (MergeSpecification, error)
	// Determine what set of merge operations is necessary in order to
	// expunge all deletes from the index.
	// FindForcedDeletesMerges(segmentinfos *SegmentInfos) (spec MergeSpecification, err error)
}

/*
Creates a new merge policy instance.
Note that if you intend to use
it without passing it to IndexWriter, you should call SetIndexWriter()
*/
func NewDefaultMergePolicyImpl(self MergeSpecifier) *MergePolicyImpl {
	return newMergePolicyImpl(self, DEFAULT_NO_CFS_RATIO, DEFAULT_MAX_CFS_SEGMENT_SIZE)
}

/*
Create a new merge policy instance with default settings for noCFSRatio
and maxCFSSegmentSize. This ctor should be used by subclasses using
different defaults than the MergePolicy.

The returned instance is its own SizeSPI until a caller replaces it.
*/
func newMergePolicyImpl(self MergeSpecifier, defaultNoCFSRatio, defaultMaxCFSSegmentSize float64) *MergePolicyImpl {
	ans := &MergePolicyImpl{
		self:              self,
		noCFSRatio:        defaultNoCFSRatio,
		maxCFSSegmentSize: defaultMaxCFSSegmentSize,
	}
	ans.SizeSPI = ans
	return ans
}

/*
Size returns the byte size of info scaled by the fraction of its
documents that are not deleted (deletion count taken from the
writer's reader pool). If the segment reports no documents, the raw
byte size is returned unchanged. An error from SizeInBytes is
returned as is, with a zero size.
*/
func (mp *MergePolicyImpl) Size(info *SegmentCommitInfo, w *IndexWriter) (n int64, err error) {
	byteSize, err := info.SizeInBytes()
	if err != nil {
		return 0, err
	}
	docCount := info.Info.DocCount()
	if docCount <= 0 {
		return byteSize, nil
	}

	delCount := w.readerPool.numDeletedDocs(info)
	// Scale in float64: float32 keeps only 24 bits of mantissa, which
	// rounds any size above ~16 MB even when nothing is deleted.
	delRatio := float64(delCount) / float64(docCount)
	assert(delRatio <= 1)
	return int64(float64(byteSize) * (1 - delRatio)), nil
}

/*
Returns true if this single info is already fully merged (has no
pending deletes, is in the same dir as the writer, and matches the
current compound file setting)

Not implemented yet: the call always panics. The unfinished draft
that used to sit (unreachably) after the panic is kept below for
reference:

	assert(w != nil)
	hasDeletions := w.readerPool.numDeletedDocs(info) > 0
	return !hasDeletions &&
		!info.Info.HasSeparateNorms() &&
		info.Info.Dir == w.directory &&
		(mp.noCFSRatio > 0 && mp.noCFSRatio < 1 || mp.maxCFSSegmentSize < math.MaxInt64)
*/
func (mp *MergePolicyImpl) isMerged(infos *SegmentInfos,
	info *SegmentCommitInfo, w *IndexWriter) bool {
	panic("not implemented yet")
}

/*
If a merged segment will be more than this percentage of the total
size of the index, leave the
segment as non-compound file even if
compound file is enabled. Set to 1.0 to always use CFS regardless of
merge size.
*/
func (mp *MergePolicyImpl) SetNoCFSRatio(noCFSRatio float64) {
	assert2(noCFSRatio >= 0 && noCFSRatio <= 1, fmt.Sprintf(
		"noCFSRatio must be 0.0 to 1.0 inclusive; got %v", noCFSRatio))
	mp.noCFSRatio = noCFSRatio
}

/*
If a merged segment will be more than this value, leave the segment
as non-compound file even if compound file is enabled. Set this to
math.Inf(1) (default) and noCFSRatio to 1.0 to always use CFS
regardless of merge size.

v is given in megabytes; the stored byte value is capped at
math.MaxInt64.
*/
func (mp *MergePolicyImpl) SetMaxCFSSegmentSizeMB(v float64) {
	assert2(v >= 0, fmt.Sprintf("maxCFSSegmentSizeMB must be >=0 (got %v)", v))
	v *= 1024 * 1024
	if v > float64(math.MaxInt64) {
		mp.maxCFSSegmentSize = math.MaxInt64
	} else {
		mp.maxCFSSegmentSize = v
	}
}

// Passed to MergePolicy.FindMerges(MergeTrigger, SegmentInfos) to
// indicate the event that triggered the merge
type MergeTrigger int

const (
	// Merge was triggered by a segment flush.
	MERGE_TRIGGER_SEGMENT_FLUSH = MergeTrigger(1)
	// Merge was triggered by a full flush. Full flushes can be caused
	// by a commit, NRT reader reopen or close call on the index writer
	MERGE_TRIGGER_FULL_FLUSH = MergeTrigger(2)
	/* Merge has been triggered explicitly by the user. */
	MERGE_TRIGGER_EXPLICIT = MergeTrigger(3)
	/* Merge was triggered by a successfully finished merge. */
	MERGE_FINISHED = MergeTrigger(4)
	// Merge was triggered by a closing IndexWriter.
	MERGE_CLOSING = MergeTrigger(5)
)

// MergeTriggerName returns the symbolic name of trigger. It panics
// for a value that is not one of the declared MergeTrigger constants.
func MergeTriggerName(trigger MergeTrigger) string {
	switch trigger {
	case MERGE_TRIGGER_SEGMENT_FLUSH:
		return "SEGMENT_FLUSH"
	case MERGE_TRIGGER_FULL_FLUSH:
		return "FULL_FLUSH"
	case MERGE_TRIGGER_EXPLICIT:
		return "EXPLICIT"
	case MERGE_FINISHED:
		return "MERGE_FINISHED"
	case MERGE_CLOSING:
		return "CLOSING"
	}
	panic(fmt.Sprintf("Invalid merge trigger: %v", trigger))
}

/*
OneMerge provides the information necessary to perform an individual
primitive merge operation, resulting in a single new segment. The
merge spec includes the subset of segments to be merged as well as
whether the new segment should use the compound file format.
*/
type OneMerge struct {
	// Locker guards the aborted flag (see abort).
	sync.Locker

	registerDone   bool // used by MergeControl
	maxNumSegments int

	// Segments to be merged.
	segments []*SegmentCommitInfo

	// Total number of documents in segments to be merged, not
	// accounting for deletions.
	totalDocCount int
	aborted       bool
}

// NewOneMerge builds a merge over a private copy of segments and
// records their combined document count. maxNumSegments starts at -1.
// segments must not be empty.
func NewOneMerge(segments []*SegmentCommitInfo) *OneMerge {
	assert2(len(segments) > 0, "segments must include at least one segment")
	// clone the list, as the in list may be based off original SegmentInfos and may be modified
	segments2 := make([]*SegmentCommitInfo, len(segments))
	copy(segments2, segments)
	count := 0
	for _, info := range segments {
		count += info.Info.DocCount()
	}
	return &OneMerge{
		// The embedded Locker must be set: left nil, the first call to
		// abort() would panic on m.Lock().
		Locker:         &sync.Mutex{},
		maxNumSegments: -1,
		segments:       segments2,
		totalDocCount:  count,
	}
}

// abort marks the merge as aborted while holding the merge's lock.
func (m *OneMerge) abort() {
	m.Lock()
	defer m.Unlock()
	m.aborted = true
}

/*
A MergeSpecification instance provides the information necessary to
perform multiple merges. It simply contains a list of OneMerge
instances.
*/
type MergeSpecification []*OneMerge

/*
Thrown when a merge was explicitly aborted because IndexWriter.close()
was called with false.
Normally this error is privately caught and
suppressed by IndexWriter.
*/
type MergeAbortedError string

// Error returns the error's text.
func (err MergeAbortedError) Error() string {
	return string(err)
}

// index/TieredMergePolicy.java

// Default noCFSRatio. If a merge's size is >= 10% of the index, then
// we disable compound file for it.
const DEFAULT_NO_CFS_RATIO = 0.1

/*
Merges segments of approximately equal size, subject to an allowed
number of segments per tier. This is similar to LogByteSizeMergePolicy,
except this merge policy is able to merge non-adjacent segments, and
separates how many segments are merged at once (SetMaxMergeAtOnce())
from how many segments are allowed per tier (SetSegmentsPerTier()).
This merge policy also does not over-merge (i.e. cascade merges).

For normal merging, this policy first computes a "budget" of how many
segments are allowed to be in the index. If the index is over-budget,
then the policy sorts segments by decreasing size (pro-rating by
percent deletes), and then finds the least-cost merge. Merge cost is
measured by a combination of the "skew" of the merge (size of largest
segment divided by smallest segment), total merge size and percent
deletes reclaimed, so that merges with lower skew, smaller size and
those reclaiming more deletes, are favored.

If a merge will produce a segment that's larger than SetMaxMergedSegmentMB(),
then the policy will merge fewer segments (down to 1 at once, if that
one has deletions) to keep the segment size under budget.

NOTE: this policy freely merges non-adjacent segments; if this is a
problem, use LogMergePolicy.

NOTE: This policy always merges by byte size of the segments, always
pro-rates by percent deletes, and does not apply any maximum segment
size during forceMerge (unlike LogByteSizeMergePolicy).
354 */ 355 type TieredMergePolicy struct { 356 *MergePolicyImpl 357 358 maxMergeAtOnce int 359 maxMergedSegmentBytes int64 360 maxMergeAtOnceExplicit int 361 362 floorSegmentBytes int64 363 segsPerTier float64 364 forceMergeDeletesPctAllowed float64 365 reclaimDeletesWeight float64 366 } 367 368 func NewTieredMergePolicy() *TieredMergePolicy { 369 res := &TieredMergePolicy{ 370 maxMergeAtOnce: 10, 371 maxMergedSegmentBytes: 5 * 1024 * 1024 * 1024, 372 maxMergeAtOnceExplicit: 30, 373 floorSegmentBytes: 2 * 1024 * 1024, 374 segsPerTier: 10, 375 forceMergeDeletesPctAllowed: 10, 376 reclaimDeletesWeight: 2, 377 } 378 res.MergePolicyImpl = newMergePolicyImpl(res, DEFAULT_NO_CFS_RATIO, DEFAULT_MAX_CFS_SEGMENT_SIZE) 379 return res 380 } 381 382 /* 383 Maximum number of segments to be merged at a time during "normal" 384 merging. For explicit merging (e.g., forceMerge or forceMergeDeletes 385 was called), see SetMaxMergeAtonceExplicit(). Default is 10. 386 */ 387 func (tmp *TieredMergePolicy) SetMaxMergeAtOnce(v int) *TieredMergePolicy { 388 assert2(v >= 2, fmt.Sprintf("maxMergeAtonce must be > 1 (got %v)", v)) 389 tmp.maxMergeAtOnce = v 390 return tmp 391 } 392 393 /* 394 Maximum number of segments to be merged at a time, during forceMerge 395 or forceMergeDeletes. Default is 30. 396 */ 397 func (tmp *TieredMergePolicy) SetMaxMergeAtOnceExplicit(v int) *TieredMergePolicy { 398 assert2(v >= 2, fmt.Sprintf("maxMergeAtonceExplicit must be > 1 (got %v)", v)) 399 tmp.maxMergeAtOnceExplicit = v 400 return tmp 401 } 402 403 /* 404 Maximum sized segment to produce during normal merging. This setting 405 is approximate: the estimate of the merged segment size is made by 406 summing sizes of to-be-merged segments(compensating for percent 407 deleted docs). Default is 5 GB. 
408 */ 409 func (tmp *TieredMergePolicy) SetMaxMergedSegmentMB(v float64) *TieredMergePolicy { 410 assert2(v >= 0, fmt.Sprintf("maxMergedSegmentMB must be >= 0 (got %v)", v)) 411 v *= 1024 * 1024 412 tmp.maxMergedSegmentBytes = math.MaxInt64 413 if v < math.MaxInt64 { 414 tmp.maxMergedSegmentBytes = int64(v) 415 } 416 return tmp 417 } 418 419 /* 420 Controls how aggressively merges that reclaim more deletions are 421 favored. Higher values will more aggresively target merges that 422 reclaim deletions, but be careful not to go so high that way too much 423 merging takes place; a value of 3.0 is probably nearly too high. A 424 value of 0.0 means deletions don't impact merge selection. 425 */ 426 func (tmp *TieredMergePolicy) SetReclaimDeletesWeight(v float64) *TieredMergePolicy { 427 assert2(v >= 0, fmt.Sprintf("reclaimDeletesWeight must be >= 0 (got %v)", v)) 428 tmp.reclaimDeletesWeight = v 429 return tmp 430 } 431 432 /* 433 Segments smaller than this are "rounded up" to this size, ie treated 434 as equal (floor) size for merge selection. This is to prevent 435 frequent flushing of tiny segments from allowing a long tail in the 436 index. Default is 2 MB. 437 */ 438 func (tmp *TieredMergePolicy) SetFloorSegmentMB(v float64) *TieredMergePolicy { 439 assert2(v > 0, fmt.Sprintf("floorSegmentMB must be > 0 (got %v)", v)) 440 v *= 1024 * 1024 441 tmp.floorSegmentBytes = math.MaxInt64 442 if v < math.MaxInt64 { 443 tmp.floorSegmentBytes = int64(v) 444 } 445 return tmp 446 } 447 448 /* 449 When forceMergeDeletes is called, we only merge away a segment if its 450 delete percentage is over this threshold. Default is 10%. 451 */ 452 func (tmp *TieredMergePolicy) SetForceMergeDeletesPctAllowed(v float64) *TieredMergePolicy { 453 assert2(v >= 0 && v <= 100, fmt.Sprintf("forceMergeDeletesPctAllowed must be between 0 and 100 inclusive (got %v)", v)) 454 tmp.forceMergeDeletesPctAllowed = v 455 return tmp 456 } 457 458 /* 459 Sets the allowed number of segments per tier. 
Smaller values mean
more merging but fewer segments.

NOTE: this value should be >= the SetMaxMergeAtOnce otherwise you'll
force too much merging to occur.
*/
func (tmp *TieredMergePolicy) SetSegmentsPerTier(v float64) *TieredMergePolicy {
	assert2(v >= 2, fmt.Sprintf("segmentsPerTier must be >= 2 (got %v)", v))
	tmp.segsPerTier = v
	return tmp
}

// BySizeDescendingSegments is a sort.Interface that orders segments
// from largest to smallest as measured by spi.Size, breaking ties by
// ascending segment name.
type BySizeDescendingSegments struct {
	values []*SegmentCommitInfo
	writer *IndexWriter
	spi    MergePolicyImplSPI
}

func (a *BySizeDescendingSegments) Len() int      { return len(a.values) }
func (a *BySizeDescendingSegments) Swap(i, j int) { a.values[i], a.values[j] = a.values[j], a.values[i] }

// Less reports whether element i sorts before element j: the larger
// segment first, equal sizes by name. A Size error trips an assert,
// because sort.Interface has no way to report it.
func (a *BySizeDescendingSegments) Less(i, j int) bool {
	var err error
	var sz1, sz2 int64
	sz1, err = a.spi.Size(a.values[i], a.writer)
	assert(err == nil)
	sz2, err = a.spi.Size(a.values[j], a.writer)
	assert(err == nil)
	if sz1 != sz2 {
		// Descending: FindMerges scans from the front for too-large
		// segments and relies on the biggest ones coming first.
		return sz1 > sz2
	}
	return a.values[i].Info.Name < a.values[j].Info.Name
}

type MergeScore interface{}

/*
FindMerges sizes up the index and decides whether it holds more
eligible segments than its tiered budget allows.

It sorts a copy of the segments by descending size, totals their
bytes, sets aside leading segments of at least half of
maxMergedSegmentBytes, and derives the allowed segment count from
segsPerTier and maxMergeAtOnce. It returns a nil specification when
there are no segments, no eligible segments, or the index is within
budget.

Selecting the actual merge is not implemented yet: the candidate loop
panics with "niy" as soon as it is entered.
*/
func (tmp *TieredMergePolicy) FindMerges(mergeTrigger MergeTrigger,
	infos *SegmentInfos, w *IndexWriter) (spec MergeSpecification, err error) {

	if tmp.verbose(w) {
		tmp.message(w, "findMerges: %v segments", len(infos.Segments))
	}
	if len(infos.Segments) == 0 {
		return nil, nil
	}
	merging := w.MergingSegments()
	toBeMerged := make(map[*SegmentCommitInfo]bool)

	infosSorted := make([]*SegmentCommitInfo, len(infos.Segments))
	copy(infosSorted, infos.Segments)
	sort.Sort(&BySizeDescendingSegments{infosSorted, w, tmp})

	// Compute total index bytes & print details about the index
	totIndexBytes := int64(0)
	minSegmentBytes := int64(math.MaxInt64)
	for _, info := range infosSorted {
		var segBytes int64
		if segBytes, err = tmp.Size(info, w); err != nil {
			return nil, err
		}
		if tmp.verbose(w) {
			var extra string
			if _, ok := merging[info]; ok {
				extra = " [merging]"
			}
			if segBytes >= tmp.maxMergedSegmentBytes/2 {
				extra += " [skip: too large]"
			} else if segBytes < tmp.floorSegmentBytes {
				// Only segments below the floor are actually floored.
				extra += " [floored]"
			}
			tmp.message(w, " seg=%v size=%v MB%v",
				w.readerPool.segmentToString(info),
				fmt.Sprintf("%.3f", float32(segBytes)/1024/1024), extra)
		}

		if segBytes < minSegmentBytes {
			minSegmentBytes = segBytes
		}
		// Accum total byte size
		totIndexBytes += segBytes
	}

	// If we have too-large segments, grace them out of the maxSegmentCount:
	tooBigCount := 0
	for tooBigCount < len(infosSorted) {
		var n int64
		if n, err = tmp.Size(infosSorted[tooBigCount], w); err != nil {
			return nil, err
		}
		if n < tmp.maxMergedSegmentBytes/2 {
			break
		}
		totIndexBytes -= n
		tooBigCount++
	}

	minSegmentBytes = tmp.floorSize(minSegmentBytes)

	// Compute max allowed segs in the index
	levelSize := minSegmentBytes
	bytesLeft := totIndexBytes
	allowedSegCount := float64(0)
	for {
		if segCountLevel := float64(bytesLeft) / float64(levelSize); segCountLevel < tmp.segsPerTier {
			allowedSegCount += math.Ceil(segCountLevel)
			break
		}
		allowedSegCount += tmp.segsPerTier
		bytesLeft -= int64(tmp.segsPerTier * float64(levelSize))
		levelSize *= int64(tmp.maxMergeAtOnce)
	}
	allowedSegCountInt := int(allowedSegCount)

	// Cycle to possibly select more than one merge
	for {
		mergingBytes := int64(0)

		// Gather eligible segments for merging, ie segments not already
		// being merged and not already picked (by prior iteration of
		// this loop) for merging:
		var eligible []*SegmentCommitInfo
		for _, info := range infosSorted[tooBigCount:] {
			if _, ok := merging[info]; ok {
				var n int64
				if n, err = info.SizeInBytes(); err != nil {
					return nil, err
				}
				mergingBytes += n
			} else if _, ok := toBeMerged[info]; !ok {
				eligible = append(eligible, info)
			}
		}

		// maxMergeIsRunning := mergingBytes >= tmp.maxMergedSegmentBytes

		if tmp.verbose(w) {
			// Exactly one argument per verb; an extra one would show up
			// in the message as "%!(EXTRA ...)".
			tmp.message(w,
				" allowedSegmentCount=%v vs count=%v (eligible count=%v) tooBigCount=%v",
				allowedSegCountInt, len(infosSorted), len(eligible), tooBigCount)
		}

		if len(eligible) == 0 {
			return // spec is nil
		}

		if len(eligible) > allowedSegCountInt {

			// OK we are over budget -- find best merge!
			// var bestScore MergeScore
			var best []*SegmentCommitInfo
			// var bestTooLarge bool
			// var bestMergeBytes int64

			// Consider all merge starts:
			// NOTE(review): upstream Lucene appears to use "<=" for this
			// bound; confirm when the loop body gets implemented.
			for startIdx := 0; startIdx < len(eligible)-tmp.maxMergeAtOnce; startIdx++ {
				var totAfterMergesBytes int64
				var candidate []*SegmentCommitInfo
				// var hitTooLarge bool
				for idx := startIdx; idx < len(eligible) && len(candidate) < tmp.maxMergeAtOnce; idx++ {
					info := eligible[idx]
					var segBytes int64
					if segBytes, err = tmp.Size(info, w); err != nil {
						return nil, err
					}

					if totAfterMergesBytes+segBytes > tmp.maxMergedSegmentBytes {
						panic("niy")
					}
					panic("niy")
				}

				panic("niy")
			}

			if best != nil {
				panic("NIY")
			} else {
				return spec, nil
			}
		} else {
			return
		}
	}
}

// FindForcedMerges is not implemented yet; it always panics.
func (tmp *TieredMergePolicy) FindForcedMerges(infos *SegmentInfos,
	maxSegmentCount int, segmentsToMerge map[*SegmentCommitInfo]bool,
	w *IndexWriter) (MergeSpecification, error) {
	panic("not implemented yet")
}

// floorSize returns bytes, raised to floorSegmentBytes if smaller.
func (tmp *TieredMergePolicy) floorSize(bytes int64) int64 {
	if bytes > tmp.floorSegmentBytes {
		return bytes
	}
	return tmp.floorSegmentBytes
}

// verbose reports whether w is non-nil and its infoStream has the
// "TMP" component enabled.
func (tmp *TieredMergePolicy) verbose(w *IndexWriter) bool {
	return w != nil && w.infoStream.IsEnabled("TMP")
}

// message writes a formatted "TMP" message to w's infoStream. It does
// not check w; callers guard it with verbose(w).
func (tmp *TieredMergePolicy) message(w *IndexWriter, message string, args ...interface{}) {
	w.infoStream.Message("TMP", message, args...)
}

// String lists the policy's settings, with byte limits shown in MB.
func (tmp *TieredMergePolicy) String() string {
	return fmt.Sprintf("[TieredMergePolicy: maxMergeAtOnce=%v, maxMergeAtOnceExplicit=%v, maxMergedSegmentMB=%v, floorSegmentMB=%v, forceMergeDeletesPctAllowed=%v, segmentPerTier=%v, maxCFSSegmentSizeMB=%v, noCFSRatio=%v",
		tmp.maxMergeAtOnce, tmp.maxMergeAtOnceExplicit, tmp.maxMergedSegmentBytes/1024/1024,
		tmp.floorSegmentBytes/1024/1024, tmp.forceMergeDeletesPctAllowed, tmp.segsPerTier,
		tmp.maxCFSSegmentSize/1024/1024, tmp.noCFSRatio)
}

// index/LogMergePolicy.java

/*
Defines the allowed range of log(size) for each level. A level is
computed by taking the max segment log size, minus LEVEL_LOG_SPAN,
and finding all segments falling within that range.
*/
const LEVEL_LOG_SPAN = 0.75

// Default merge factor, which is how many segments are merged at a time
const DEFAULT_MERGE_FACTOR = 10

/*
This class implements a MergePolicy that tries to merge segments into
levels of exponentially increasing size, where each level has fewer
segments than the value of the merge factor. Whenever extra segments
(beyond the merge factor upper bound) are encountered, all segments
within the level are merged. You can get or set the merge factor
using MergeFactor() and SetMergeFactor() respectively.

This class is abstract and requires a subclass to define the Size()
method which specifies how a segment's size is determined.
LogDocMergePolicy is one subclass that measures size by document
count in the segment. LogByteSizeMergePolicy is another subclass that
measures size as the total byte size of the file(s) for the segment.
*/
type LogMergePolicy struct {
	*MergePolicyImpl

	// How many segments to merge at a time.
	mergeFactor int
	// Any segments whose size is smaller than this value will be
	// rounded up to this value. This ensures that tiny segments are
	// aggressively merged.
	minMergeSize int64
	// If the size of a segment exceeds this value then it will never
	// be merged.
	maxMergeSize int64
	// Although the core MPs set it explicitly, we must default in case
	// someone out there wrote their own LMP ...
	// If the size of a segment exceeds this value then it will never
	// be merged during ForceMerge()
	maxMergeSizeForForcedMerge int64
	// If true, we pro-rate a segment's size by the percentage of
	// non-deleted documents.
	calibrateSizeByDeletes bool
}

// NewLogMergePolicy returns a policy with the given min and max merge
// sizes, the default merge factor, no forced-merge size cap, and
// size calibration by deletes switched on.
func NewLogMergePolicy(min, max int64) *LogMergePolicy {
	res := &LogMergePolicy{
		mergeFactor:                DEFAULT_MERGE_FACTOR,
		minMergeSize:               min,
		maxMergeSize:               max,
		maxMergeSizeForForcedMerge: math.MaxInt64,
		calibrateSizeByDeletes:     true,
	}
	res.MergePolicyImpl = newMergePolicyImpl(res, DEFAULT_NO_CFS_RATIO, DEFAULT_MAX_CFS_SEGMENT_SIZE)
	return res
}

// Returns true if LMP is enabled in IndexWriter's InfoStream.
func (mp *LogMergePolicy) verbose(w *IndexWriter) bool {
	return w != nil && w.infoStream.IsEnabled("LMP")
}

// Print a debug message to IndexWriter's infoStream.
func (mp *LogMergePolicy) message(message string, w *IndexWriter) {
	if mp.verbose(w) {
		w.infoStream.Message("LMP", message)
	}
}

/*
Determines how often segment indices are merged by AddDocument(). With
smaller values, less RAM is used while indexing, and searches are
faster, but indexing speed is slower. With larger values, more RAM is
used during indexing, and while searches are slower, indexing is
faster. Thus larger values (> 10) are best for batch index creation,
and smaller values (< 10) for indices that are interactively
maintained.
751 */ 752 func (mp *LogMergePolicy) SetMergeFactor(mergeFactor int) { 753 assert2(mergeFactor >= 2, "mergeFactor cannot be less than 2") 754 mp.mergeFactor = mergeFactor 755 } 756 757 // Sets whether the segment size should be calibrated by the number 758 // of delets when choosing segments to merge 759 func (mp *LogMergePolicy) SetCalbrateSizeByDeletes(calibrateSizeByDeletes bool) { 760 mp.calibrateSizeByDeletes = calibrateSizeByDeletes 761 } 762 763 /* 764 Return the number of documents in the provided SegmentCommitInfo, 765 pro-rated by percentage of non-deleted documents if 766 SetCalibrateSizeByDeletes() is set. 767 */ 768 func (mp *LogMergePolicy) sizeDocs(info *SegmentCommitInfo, w *IndexWriter) (n int64, err error) { 769 infoDocCount := info.Info.DocCount() 770 if mp.calibrateSizeByDeletes { 771 delCount := w.readerPool.numDeletedDocs(info) 772 assert(delCount <= infoDocCount) 773 return int64(infoDocCount - delCount), nil 774 } 775 return int64(infoDocCount), nil 776 } 777 778 /* 779 Return the byte size of the provided SegmentCommitInfo, pro-rated 780 by percentage of non-deleted documents if SetCalibratedSizeByDeletes() 781 is set. 782 */ 783 func (mp *LogMergePolicy) sizeBytes(info *SegmentCommitInfo, w *IndexWriter) (n int64, err error) { 784 if mp.calibrateSizeByDeletes { 785 return mp.MergePolicyImpl.Size(info, w) 786 } 787 return info.SizeInBytes() 788 } 789 790 /* 791 Returns true if the number of segments eligible for merging is less 792 than or equal to the specified maxNumSegments. 

Not implemented yet: the call always panics.
*/
func (mp *LogMergePolicy) isMergedBy(infos *SegmentInfos,
	maxNumSegments int, segmentsToMerge map[*SegmentCommitInfo]bool,
	w *IndexWriter) bool {
	panic("not implemented yet")
}

// FindForcedMerges is not implemented yet; it always panics.
func (mp *LogMergePolicy) FindForcedMerges(infos *SegmentInfos,
	maxSegmentCount int, segmentsToMerge map[*SegmentCommitInfo]bool,
	w *IndexWriter) (MergeSpecification, error) {
	panic("not implemented yet")
}

// SegmentInfoAndLevel pairs a segment with its level and an index.
// FindMerges fills level with log(size)/log(mergeFactor) and index
// with the segment's position in SegmentInfos.
type SegmentInfoAndLevel struct {
	info  *SegmentCommitInfo
	level float32
	index int
}

// SegmentInfoAndLevels implements sort.Interface, ordering by
// ascending level.
type SegmentInfoAndLevels []SegmentInfoAndLevel

func (ss SegmentInfoAndLevels) Len() int           { return len(ss) }
func (ss SegmentInfoAndLevels) Swap(i, j int)      { ss[i], ss[j] = ss[j], ss[i] }
func (ss SegmentInfoAndLevels) Less(i, j int) bool { return ss[i].level < ss[j].level }

/*
Checks if any merges are now necessary and returns a MergeSpecification
if so. A merge is necessary when there are more than SetMergeFactor()
segments at a given level. When multiple levels have too many
segments, this method will return multiple merges, allowing the
MergeScheduler to use concurrency.
824 */ 825 func (mp *LogMergePolicy) FindMerges(mergeTrigger MergeTrigger, 826 infos *SegmentInfos, w *IndexWriter) (spec MergeSpecification, err error) { 827 numSegments := len(infos.Segments) 828 mp.message(fmt.Sprintf("findMerges: %v segments", numSegments), w) 829 830 // Compute levels, whic is just log (base mergeFactor) of the size 831 // of each segment 832 levels := make([]*SegmentInfoAndLevel, 0) 833 norm := math.Log(float64(mp.mergeFactor)) 834 835 mergingSegments := w.mergingSegments 836 837 for i, info := range infos.Segments { 838 size, err := mp.Size(info, w) 839 if err != nil { 840 return nil, err 841 } 842 843 // Floor tiny segments 844 if size < 1 { 845 size = 1 846 } 847 848 infoLevel := &SegmentInfoAndLevel{info, float32(math.Log(float64(size)) / norm), i} 849 levels = append(levels, infoLevel) 850 851 if mp.verbose(w) { 852 segBytes, err := mp.sizeBytes(info, w) 853 if err != nil { 854 return nil, err 855 } 856 var extra string 857 if _, ok := mergingSegments[info]; ok { 858 extra = " [merging]" 859 } 860 if size >= mp.maxMergeSize { 861 extra = fmt.Sprintf("%v [skip: too large]", extra) 862 } 863 mp.message(fmt.Sprintf("seg=%v level=%v size=%.3f MB%v", 864 w.readerPool.segmentToString(info), 865 infoLevel.level, 866 segBytes/1024/1024, 867 extra), w) 868 } 869 } 870 871 var levelFloor float32 = 0 872 if mp.minMergeSize > 0 { 873 levelFloor = float32(math.Log(float64(mp.minMergeSize)) / float64(norm)) 874 } 875 876 // Now, we quantize the log values into levfels. The first level is 877 // any segment whose log size is within LEVEL_LOG_SPAN of the max 878 // size, or, who has such as segment "to the right". Then, we find 879 // the max of all other segments and use that to define the next 880 // level segment, etc. 881 882 numMergeableSegments := len(levels) 883 884 for start := 0; start < numMergeableSegments; { 885 // Find max level of all segments not already quantized. 
886 maxLevel := levels[start].level 887 for i := 1 + start; i < numMergeableSegments; i++ { 888 level := levels[i].level 889 if level > maxLevel { 890 maxLevel = level 891 } 892 } 893 894 // Now search backwards for the rightmost segment that falls into 895 // this level: 896 var levelBottom float32 897 if maxLevel <= levelFloor { 898 // All remaining segments fall into the min level 899 levelBottom = -1 900 } else { 901 levelBottom = float32(float64(maxLevel) - LEVEL_LOG_SPAN) 902 903 // Force a boundary at the level floor 904 if levelBottom < levelFloor && maxLevel >= levelFloor { 905 levelBottom = levelFloor 906 } 907 } 908 909 upto := numMergeableSegments - 1 910 for upto >= start { 911 if levels[upto].level >= levelBottom { 912 break 913 } 914 upto-- 915 } 916 mp.message(fmt.Sprintf(" level %v to %v: %v segments", 917 levelBottom, maxLevel, 1+upto-start), w) 918 919 // Finally, record all merges that are viable at this level: 920 end := start + mp.mergeFactor 921 for end <= 1+upto { 922 panic("not implemented yet") 923 } 924 925 start = 1 + upto 926 } 927 928 return 929 } 930 931 func (mp *LogMergePolicy) String() string { 932 panic("not implemented yet") 933 } 934 935 // index/LogDocMergePolicy.java 936 937 // Default minimum segment size. 938 const DEFAULT_MIN_MERGE_DOCS = 1000 939 940 /* 941 This is a LogMergePolicy that measures size of a segment as the 942 number of documents (not taking deletions into account). 
943 */ 944 type LogDocMergePolicy struct { 945 *LogMergePolicy 946 } 947 948 func NewLogDocMergePolicy() *LogMergePolicy { 949 ans := &LogDocMergePolicy{ 950 LogMergePolicy: NewLogMergePolicy(DEFAULT_MIN_MERGE_DOCS, math.MaxInt64), 951 } 952 // maxMergeSize(ForForcedMerge) are never used by LogDocMergePolicy; 953 // set it to math.MaxInt64 to disable it 954 ans.maxMergeSizeForForcedMerge = math.MaxInt64 955 ans.SizeSPI = ans 956 return ans.LogMergePolicy 957 } 958 959 func (p *LogDocMergePolicy) Size(info *SegmentCommitInfo, w *IndexWriter) (int64, error) { 960 return p.sizeDocs(info, w) 961 } 962 963 // index/LogByteSizeMergePolicy.java 964 965 // Default minimum segment size. 966 var DEFAULT_MIN_MERGE_MB = 1.6 967 968 // Default maximum segment size. A segment of this size or larger 969 // will never be merged. 970 const DEFAULT_MAX_MERGE_MB = 2048 971 972 // Default maximum segment size. A segment of this size or larger 973 // will never be merged during forceMerge. 974 var DEFAULT_MAX_MERGE_MB_FOR_FORCED_MERGE int64 = math.MaxInt64 975 976 // this is a LogMergePolicy that measures size of a segment as the 977 // total byte size of the segment's files. 978 type LogByteSizeMergePolicy struct { 979 *LogMergePolicy 980 } 981 982 func NewLogByteSizeMergePolicy() *LogMergePolicy { 983 ans := &LogByteSizeMergePolicy{ 984 LogMergePolicy: NewLogMergePolicy(int64(DEFAULT_MIN_MERGE_MB*1024*1024), 985 int64(DEFAULT_MAX_MERGE_MB*1024*1024)), 986 } 987 ans.maxMergeSizeForForcedMerge = int64(DEFAULT_MAX_MERGE_MB_FOR_FORCED_MERGE * 1024 * 1024) 988 ans.SizeSPI = ans 989 return ans.LogMergePolicy 990 } 991 992 func (p *LogByteSizeMergePolicy) Size(info *SegmentCommitInfo, w *IndexWriter) (int64, error) { 993 return p.sizeBytes(info, w) 994 }