github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/index/fileDeleter.go (about) 1 package index 2 3 import ( 4 "errors" 5 "fmt" 6 "github.com/balzaczyy/golucene/core/index/model" 7 "github.com/balzaczyy/golucene/core/store" 8 "github.com/balzaczyy/golucene/core/util" 9 "math" 10 "os" 11 "reflect" 12 "strconv" 13 "strings" 14 "time" 15 ) 16 17 // index/IndexFileDeleter.java 18 19 const VERBOSE_REF_COUNT = false 20 21 /* 22 This class keeps track of each SegmentInfos instance that is still 23 "live", either because it corresponds to a segments_N file in the 24 Directory (a "commit", i.e. a commited egmentInfos) or because it's 25 an in-memory SegmentInfos that a writer is actively updating but has 26 not yet committed. This class uses simple reference counting to map 27 the live SegmentInfos instances to individual files in the Directory. 28 29 The same directory file maybe referenced by more than one IndexCommit, 30 i.e. more than one SegmentInfos. Therefore we count how many commits 31 reference each file. When all the commits referencing a certain file 32 have been deleted, the refcount for that file becomes zero, and the 33 file is deleted. 34 35 A separate deletion policy interface (IndexDeletionPolicy) is 36 consulted on creation (onInit) and once per commit (onCommit), to 37 decide when a commit should be removed. 38 39 It is the business of the IndexDeletionPolicy to choose when to 40 delete commit points. The actual mechanics of file deletion, retrying, 41 etc, derived from the deletion of commit points is the business of 42 the IndexFileDeleter. 43 44 The current default deletion policy is KeepOnlyLastCommitDeletionPolicy, 45 which removes all prior commits when a new commit has completed. 46 This matches the bahavior before 2.2. 47 48 Note that you must hold the write.lock before instantiating this 49 class. It opens segments_N file(s) directly with no retry logic. 50 */ 51 type IndexFileDeleter struct { 52 // Files that we tried to delete but failed (likely because they 53 // are open and we are running on Windows), so we will retry them 54 // again later: 55 deletable map[string]bool 56 // Reference count for all files in the index. 57 // Counts how many existing commits reference a file. 58 refCounts map[string]*RefCount 59 60 // Holds all commits (Segments_N) current in the index. This will 61 // have just 1 commit if you are using the default delete policy ( 62 // KeepOnlyLastCommitDeletionPolicy). Other policies may leave 63 // commit points live for longer in which case this list would be 64 // longer than 1: 65 commits []IndexCommit 66 67 // Holds files we had incref'd from the previous non-commit checkpoint: 68 lastFiles []string 69 70 // Commits that the IndexDeletionPolicy have decided to delete: 71 commitsToDelete []*CommitPoint 72 73 infoStream util.InfoStream 74 directory store.Directory 75 policy IndexDeletionPolicy 76 77 startingCommitDeleted bool 78 lastSegmentInfos *SegmentInfos 79 80 writer *IndexWriter 81 } 82 83 /* 84 Initialize the deleter: find all previous commits in the Directory, 85 incref the files they reference, call the policy to let it delete 86 commits. This will remove any files not referenced by any of the 87 commits. 88 */ 89 func newIndexFileDeleter(directory store.Directory, policy IndexDeletionPolicy, 90 segmentInfos *SegmentInfos, infoStream util.InfoStream, writer *IndexWriter, 91 initialIndexExists bool) (*IndexFileDeleter, error) { 92 93 assert(writer != nil) 94 95 currentSegmentsFile := segmentInfos.SegmentsFileName() 96 if infoStream.IsEnabled("IFD") { 97 infoStream.Message("IFD", "init: current segments file is '%v'; deletionPolicy=%v", 98 currentSegmentsFile, reflect.TypeOf(policy).Name()) 99 } 100 101 fd := &IndexFileDeleter{ 102 infoStream: infoStream, 103 writer: writer, 104 policy: policy, 105 directory: directory, 106 refCounts: make(map[string]*RefCount), 107 } 108 109 // First pass: walk the files and initialize our ref counts: 110 currentGen := segmentInfos.generation 111 112 var currentCommitPoint *CommitPoint 113 var files []string 114 files, err := directory.ListAll() 115 if _, ok := err.(*store.NoSuchDirectoryError); ok { 116 // it means the directory is empty, so ignore it 117 files = make([]string, 0) 118 } else if err != nil { 119 return nil, err 120 } 121 122 if currentSegmentsFile != "" { 123 m := model.CODEC_FILE_PATTERN 124 for _, filename := range files { 125 if !strings.HasSuffix(filename, WRITE_LOCK_NAME) && 126 filename != INDEX_FILENAME_SEGMENTS_GEN && 127 (m.MatchString(filename) || strings.HasPrefix(filename, util.SEGMENTS)) { 128 129 // Add this file to refCounts with initial count 0: 130 fd.refCount(filename) 131 132 if strings.HasPrefix(filename, util.SEGMENTS) { 133 // This is a commit (segments or segments_N), and it's 134 // valid (<= the max gen). Load it, then incref all files 135 // it refers to: 136 if infoStream.IsEnabled("IFD") { 137 infoStream.Message("IFD", "init: load commit '%v'", filename) 138 } 139 sis := &SegmentInfos{} 140 err := sis.Read(directory, filename) 141 if os.IsNotExist(err) { 142 // LUCENE-948: on NFS (and maybe others), if 143 // you have writers switching back and forth 144 // between machines, it's very likely that the 145 // dir listing will be stale and will claim a 146 // file segments_X exists when in fact it 147 // doesn't. So, we catch this and handle it 148 // as if the file does not exist 149 if infoStream.IsEnabled("IFD") { 150 infoStream.Message("IFD", 151 "init: hit FileNotFoundException when loading commit '%v'; skipping this commit point", 152 filename) 153 } 154 sis = nil 155 } else if err != nil { 156 if GenerationFromSegmentsFileName(filename) <= currentGen { 157 length, _ := directory.FileLength(filename) 158 if length > 0 { 159 return nil, err 160 } 161 } 162 // Most likely we are opening an index that has an 163 // aborted "future" commit, so suppress exc in this case 164 sis = nil 165 } else { // sis != nil 166 commitPoint := newCommitPoint(fd.commitsToDelete, directory, sis) 167 if sis.generation == segmentInfos.generation { 168 currentCommitPoint = commitPoint 169 } 170 fd.commits = append(fd.commits, commitPoint) 171 fd.incRef(sis, true) 172 173 if fd.lastSegmentInfos == nil || sis.generation > fd.lastSegmentInfos.generation { 174 fd.lastSegmentInfos = sis 175 } 176 } 177 } 178 } 179 } 180 } 181 182 if currentCommitPoint == nil && currentSegmentsFile != "" && initialIndexExists { 183 // We did not in fact see the segments_N file corresponding to 184 // the segmentInfos that was passed in. Yet, it must exist, 185 // because our caller holds the write lock. This can happen when 186 // the directory listing was stale (e.g. when index accessed via 187 // NFS client with stale directory listing cache). So we try now 188 // to explicitly open this commit point: 189 sis := &SegmentInfos{} 190 err := sis.Read(directory, currentSegmentsFile) 191 if err != nil { 192 return nil, errors.New(fmt.Sprintf("failed to locate current segments_N file '%v'", 193 currentSegmentsFile)) 194 } 195 if infoStream.IsEnabled("IFD") { 196 infoStream.Message("IFD", "forced open of current segments file %v", 197 segmentInfos.SegmentsFileName()) 198 } 199 currentCommitPoint = newCommitPoint(fd.commitsToDelete, directory, sis) 200 fd.commits = append(fd.commits, currentCommitPoint) 201 fd.incRef(sis, true) 202 } 203 204 // We keep commits list in sorted order (oldest to newest): 205 util.TimSort(IndexCommits(fd.commits)) 206 207 // refCounts only includes "normal" filenames (does not include segments.gen, write.lock) 208 files = nil 209 for k, _ := range fd.refCounts { 210 files = append(files, k) 211 } 212 inflateGens(segmentInfos, files, fd.infoStream) 213 214 // Now delete anyting with ref count at 0. These are presumably 215 // abandoned files e.g. due to crash of IndexWriter. 216 for filename, rc := range fd.refCounts { 217 if rc.count == 0 { 218 if infoStream.IsEnabled("IFD") { 219 infoStream.Message("IFD", "init: removing unreferenced file '%v'", 220 filename) 221 } 222 fd.deleteFile(filename) 223 } 224 } 225 226 // Finally, give policy a chance to remove things on startup: 227 err = policy.onInit(fd.commits) 228 if err != nil { 229 return nil, err 230 } 231 232 // Always protect the incoming segmentInfos since sometime it may 233 // not be the most recent commit 234 err = fd.checkpoint(segmentInfos, false) 235 if err != nil { 236 return nil, err 237 } 238 239 fd.startingCommitDeleted = (currentCommitPoint != nil && currentCommitPoint.IsDeleted()) 240 241 fd.deleteCommits() 242 243 return fd, nil 244 } 245 246 /* 247 Set all gens beyond what we currently see in the directory, to avoid 248 double-write in cases where the previous IndexWriter did not 249 gracefully close/rollback (e.g. os/machine crashed or lost power). 250 */ 251 func inflateGens(infos *SegmentInfos, files []string, infoStream util.InfoStream) { 252 var maxSegmentGen int64 = math.MinInt64 253 var maxSegmentName int64 = math.MinInt32 254 255 // Confusingly, this is the union of liveDocs, field infos, doc 256 // values (and maybe others, in the future) gens. THis is somewhat 257 // messy, since it means DV updates will suddenly write to the next 258 // gen after live docs' gen, for example, but we don't have the 259 // APIs to ask the codec which file is which: 260 maxPerSegmentGen := make(map[string]int64) 261 262 for _, filename := range files { 263 if filename == INDEX_FILENAME_SEGMENTS_GEN || filename == WRITE_LOCK_NAME { 264 // do nothing 265 } else if strings.HasPrefix(filename, INDEX_FILENAME_SEGMENTS) { 266 if n := GenerationFromSegmentsFileName(filename); n > maxSegmentGen { 267 maxSegmentGen = n 268 } 269 } else { 270 segmentName := util.ParseSegmentName(filename) 271 assert2(strings.HasPrefix(segmentName, "_"), "file=%v", filename) 272 273 n, err := strconv.ParseInt(segmentName[1:], 36, 64) 274 assert(err == nil) 275 if n > maxSegmentName { 276 maxSegmentName = n 277 } 278 279 curGen := maxPerSegmentGen[segmentName] // or zero if not exists 280 if n := util.ParseGeneration(filename); n > curGen { 281 curGen = n 282 } 283 284 maxPerSegmentGen[segmentName] = curGen 285 } 286 } 287 } 288 289 func (fd *IndexFileDeleter) ensureOpen() { 290 fd.writer.ClosingControl.ensureOpen(false) 291 // since we allow 'closing' state, we must still check this, we 292 // could be closing because we hit unexpected error 293 assert2(fd.writer.tragedy == nil, 294 "refusing to delete any files: this IndexWriter hit an unrecoverable error\n%v", 295 fd.writer.tragedy) 296 } 297 298 /* 299 Remove the CommitPoint(s) in the commitsToDelete list by decRef'ing 300 all files from each SegmentInfos. 301 */ 302 func (fd *IndexFileDeleter) deleteCommits() { 303 if size := len(fd.commitsToDelete); size > 0 { 304 // First decref all files that had been referred to by the 305 // now-deleted commits: 306 for _, commit := range fd.commitsToDelete { 307 if fd.infoStream.IsEnabled("IFD") { 308 fd.infoStream.Message("IFD", "deleteCommits: now decRef commit '%v'", 309 commit.segmentsFileName) 310 } 311 fd.decRefFiles(commit.files) 312 } 313 fd.commitsToDelete = nil 314 315 // Now compact commits to remove deleted ones (preserving the sort): 316 var writeTo = 0 317 for readFrom, commit := range fd.commits { 318 if !commit.IsDeleted() && readFrom != writeTo { 319 fd.commits[writeTo] = commit 320 writeTo++ 321 } 322 } 323 for i, _ := range fd.commits[writeTo:] { 324 fd.commits[i] = nil 325 } 326 fd.commits = fd.commits[:writeTo] 327 } 328 } 329 330 /* 331 Writer calls this when it has hit an error and had to roll back, to 332 tell us that there may now be unreferenced files in the filesystem. 333 So we re-list the filesystem and delete such files. If segmentName is 334 non-empty, we only delete files correspoding to that segment. 335 */ 336 func (fd *IndexFileDeleter) refresh(segmentName string) error { 337 // assert locked() 338 339 var prefix1, prefix2 string 340 if segmentName != "" { 341 prefix1 = segmentName + "." 342 prefix2 = segmentName + "_" 343 } 344 345 m := model.CODEC_FILE_PATTERN 346 files, err := fd.directory.ListAll() 347 if err != nil { 348 return err 349 } 350 for _, filename := range files { 351 _, hasRef := fd.refCounts[filename] 352 if (segmentName == "" || strings.HasPrefix(filename, prefix1) || 353 strings.HasPrefix(filename, prefix2)) && 354 !strings.HasSuffix(filename, WRITE_LOCK_NAME) && 355 !hasRef && filename != INDEX_FILENAME_SEGMENTS_GEN && 356 (m.MatchString(filename) || strings.HasPrefix(filename, INDEX_FILENAME_SEGMENTS)) { 357 358 // Unreferenced file, so remove it 359 if fd.infoStream.IsEnabled("IFD") { 360 fd.infoStream.Message("IFD", 361 "refresh [prefix=%v]: removing newly created unreferenced file '%v'", 362 segmentName, filename) 363 } 364 fd.deleteFile(filename) 365 } 366 } 367 return nil 368 } 369 370 func (fd *IndexFileDeleter) refreshList() error { 371 // set to nil so that we regenerate the list of pending files; 372 // else we can accumulate some file more than once 373 fd.deletable = nil 374 return fd.refresh("") 375 } 376 377 func (fd *IndexFileDeleter) Close() error { 378 // DecRef old files from the last checkpoint, if any: 379 // assert locked() 380 if len(fd.lastFiles) > 0 { 381 fd.decRefFiles(fd.lastFiles) 382 fd.lastFiles = nil 383 } 384 fd.deletePendingFiles() 385 return nil 386 } 387 388 func (fd *IndexFileDeleter) deletePendingFiles() { 389 // assert locked() 390 if fd.deletable != nil { 391 oldDeletable := fd.deletable 392 fd.deletable = nil 393 for filename, _ := range oldDeletable { 394 if fd.infoStream.IsEnabled("IFD") { 395 fd.infoStream.Message("IFD", "delete pending file %v", filename) 396 } 397 rc, ok := fd.refCounts[filename] 398 assert2(!ok || rc.count <= 0, 399 // LUCENE-5904: should never happen! This means we are about to pending-delete a referenced index file 400 "filename=%v is in pending delete list but also has refCount=%v", 401 filename, rc.count) 402 fd.deleteFile(filename) 403 } 404 } 405 } 406 407 /* 408 For definition of "check point" see IndexWriter comments: 409 "Clarification: Check Points (and commits)". 410 411 Writer calls this when it has made a "consistent change" to the index, 412 meaning new files are written to the index the in-memory SegmentInfos 413 have been modified to point to those files. 414 415 This may or may not be a commit (sgments_N may or may not have been 416 written). 417 418 We simply incref the files referenced by the new SegmentInfos and 419 decref the files we had previously seen (if any). 420 421 If this is a commit, we also call the policy to give it a chance to 422 remove other commits. If any commits are removed, we decref their 423 files as well. 424 */ 425 func (fd *IndexFileDeleter) checkpoint(segmentInfos *SegmentInfos, isCommit bool) error { 426 // asset locked() 427 start := time.Now() 428 defer func() { 429 if fd.infoStream.IsEnabled("IFD") { 430 elapsed := time.Now().Sub(start) 431 fd.infoStream.Message("IFD", "%v to checkpoint", elapsed) 432 } 433 }() 434 if fd.infoStream.IsEnabled("IFD") { 435 fd.infoStream.Message("IFD", "now checkpoint '%v' [%v segments; isCommit = %v]", 436 fd.writer.readerPool.segmentsToString(fd.writer._toLiveInfos(segmentInfos).Segments), 437 len(segmentInfos.Segments), isCommit) 438 } 439 440 // Try again now to delete any previously un-deletable files ( 441 // because they were in use, on Windows): 442 fd.deletePendingFiles() 443 444 // Incref the files: 445 fd.incRef(segmentInfos, isCommit) 446 447 if isCommit { 448 // Append to our commits list: 449 fd.commits = append(fd.commits, newCommitPoint(fd.commitsToDelete, fd.directory, segmentInfos)) 450 451 // Tell policy so it can remove commits: 452 err := fd.policy.onCommit(fd.commits) 453 if err != nil { 454 return err 455 } 456 457 // Decref files for commits that were deleted by the policy: 458 fd.deleteCommits() 459 } else { 460 // DecRef old files from the last checkpoint, if any: 461 fd.decRefFiles(fd.lastFiles) 462 fd.lastFiles = nil 463 464 // Save files so we can decr on next checkpoint/commit: 465 fd.lastFiles = append(fd.lastFiles, segmentInfos.files(fd.directory, false)...) 466 } 467 return nil 468 } 469 470 func (del *IndexFileDeleter) incRef(segmentInfos *SegmentInfos, isCommit bool) { 471 // assert locked() 472 // If this is a commit point, also incRef the segments_N file: 473 files := segmentInfos.files(del.directory, isCommit) 474 for _, filename := range files { 475 del.incRefFile(filename) 476 } 477 } 478 479 func (del *IndexFileDeleter) incRefFiles(files []string) { 480 // assert locked 481 for _, file := range files { 482 del.incRefFile(file) 483 } 484 } 485 486 func (del *IndexFileDeleter) incRefFile(filename string) { 487 // assert locked 488 rc := del.refCount(filename) 489 if del.infoStream.IsEnabled("IFD") && VERBOSE_REF_COUNT { 490 del.infoStream.Message("IFD", " IncRef '%v': pre-incr count is %v", 491 filename, rc.count) 492 } 493 rc.incRef() 494 } 495 496 func (fd *IndexFileDeleter) decRefFiles(files []string) { 497 // assert locked() 498 for _, file := range files { 499 fd.decRefFile(file) 500 } 501 } 502 503 func (fd *IndexFileDeleter) decRefFilesWhileSuppressingError(files []string) { 504 for _, file := range files { 505 fd.decRefFileWhileSuppressingError(file) 506 } 507 } 508 509 func (fd *IndexFileDeleter) decRefFile(filename string) { 510 //assert locked() 511 rc := fd.refCount(filename) 512 if fd.infoStream.IsEnabled("IFD") && VERBOSE_REF_COUNT { 513 fd.infoStream.Message("IFD", " DecRef '%v': pre-decr count is %v", 514 filename, rc.count) 515 } 516 if rc.decRef() == 0 { 517 // This file is no longer referenced by any past commit points 518 // nor by the in-memory SegmentInfos: 519 fd.deleteFile(filename) 520 delete(fd.refCounts, filename) 521 } 522 } 523 524 func (fd *IndexFileDeleter) decRefFileWhileSuppressingError(file string) { 525 defer func() { 526 recover() 527 }() 528 fd.decRefFile(file) 529 } 530 531 func (del *IndexFileDeleter) decRefInfos(infos *SegmentInfos) { 532 del.decRefFiles(infos.files(del.directory, false)) 533 } 534 535 // 529 536 func (del *IndexFileDeleter) exists(filename string) bool { 537 if v, ok := del.refCounts[filename]; ok { 538 return v.count > 0 539 } 540 return false 541 } 542 543 func (del *IndexFileDeleter) refCount(filename string) *RefCount { 544 // assert Thread.holdsLock(del.writer) TODO GoLucene doesn't have this capability 545 rc, ok := del.refCounts[filename] 546 if !ok { 547 rc = newRefCount(filename) 548 // we should never incRef a file we are already wanting to delete 549 assert2(del.deletable == nil || !del.deletable[filename], 550 "file '%v' cannot be incRef'd: it's already pending delete", 551 filename) 552 del.refCounts[filename] = rc 553 } 554 return rc 555 } 556 557 /* 558 Deletes the specified files, but only if they are new (have not yet 559 been incref'd). 560 */ 561 func (fd *IndexFileDeleter) deleteNewFiles(files []string) { 562 // assert locked 563 for _, filename := range files { 564 // NOTE: it's very unusual yet possible for the 565 // refCount to be present and 0: it can happen if you 566 // open IW on a crashed index, and it removes a bunch 567 // of unref'd files, and then you add new docs / do 568 // merging, and it reuses that segment name. 569 // TestCrash.testCrashAfterReopen can hit this: 570 if rf, ok := fd.refCounts[filename]; !ok || rf.count == 0 { 571 if fd.infoStream.IsEnabled("IFD") { 572 fd.infoStream.Message("IFD", "delete new file '%v'", filename) 573 } 574 fd.deleteFile(filename) 575 } 576 } 577 } 578 579 func (del *IndexFileDeleter) deleteFile(filename string) { 580 //assert locked() 581 del.ensureOpen() 582 if del.infoStream.IsEnabled("IFD") { 583 del.infoStream.Message("IFD", "delete '%v'", filename) 584 } 585 err := del.directory.DeleteFile(filename) 586 if err != nil { // if delete fails 587 if del.directory.FileExists(filename) { 588 // Some operating systems (e.g. Windows) don't 589 // permit a file to be deleted while it is opened 590 // for read (e.g. by another process or thread). So 591 // we assume that when a delete fails it is because 592 // the file is open in another process, and queue 593 // the file for subsequent deletion. 594 if del.infoStream.IsEnabled("IFD") { 595 del.infoStream.Message("IFD", 596 "unable to remove file '%v': %v; will re-try later.", 597 filename, err) 598 } 599 if del.deletable == nil { 600 del.deletable = make(map[string]bool) 601 } 602 del.deletable[filename] = true 603 } 604 } 605 } 606 607 /* 608 Tracks the reference count for a single index file: 609 */ 610 type RefCount struct { 611 // filename used only for better assert error messages 612 filename string 613 initDone bool 614 count int 615 } 616 617 func newRefCount(filename string) *RefCount { 618 return &RefCount{filename: filename} 619 } 620 621 func (rf *RefCount) incRef() int { 622 if !rf.initDone { 623 rf.initDone = true 624 } else { 625 assert2(rf.count > 0, fmt.Sprintf("RefCount is 0 pre-increment for file %v", rf.filename)) 626 } 627 rf.count++ 628 return rf.count 629 } 630 631 func (rf *RefCount) decRef() int { 632 assert2(rf.count > 0, fmt.Sprintf("RefCount is 0 pre-decrement for file %v", rf.filename)) 633 rf.count-- 634 return rf.count 635 } 636 637 /* 638 Holds details for each commit point. This class is also passed to the 639 deletion policy. Note: this class has a natural ordering that is 640 inconsistent with equals. 641 */ 642 type CommitPoint struct { 643 files []string 644 segmentsFileName string 645 deleted bool 646 directory store.Directory 647 commitsToDelete []*CommitPoint 648 generation int64 649 userData map[string]string 650 segmentCount int 651 } 652 653 func newCommitPoint(commitsToDelete []*CommitPoint, directory store.Directory, 654 segmentInfos *SegmentInfos) *CommitPoint { 655 return &CommitPoint{ 656 directory: directory, 657 commitsToDelete: commitsToDelete, 658 userData: segmentInfos.userData, 659 segmentsFileName: segmentInfos.SegmentsFileName(), 660 generation: segmentInfos.generation, 661 files: segmentInfos.files(directory, true), 662 segmentCount: len(segmentInfos.Segments), 663 } 664 } 665 666 func (cp *CommitPoint) String() string { 667 return fmt.Sprintf("IndexFileDeleter.CommitPoint(%v)", cp.segmentsFileName) 668 } 669 670 func (cp *CommitPoint) SegmentCount() int { 671 return cp.segmentCount 672 } 673 674 func (cp *CommitPoint) SegmentsFileName() string { 675 return cp.segmentsFileName 676 } 677 678 func (cp *CommitPoint) FileNames() []string { 679 return cp.files 680 } 681 682 func (cp *CommitPoint) Directory() store.Directory { 683 return cp.directory 684 } 685 686 func (cp *CommitPoint) Generation() int64 { 687 return cp.generation 688 } 689 690 func (cp *CommitPoint) UserData() map[string]string { 691 return cp.userData 692 } 693 694 func (cp *CommitPoint) Delete() { 695 if !cp.deleted { 696 cp.deleted = true 697 cp.commitsToDelete = append(cp.commitsToDelete, cp) 698 } 699 } 700 701 func (cp *CommitPoint) IsDeleted() bool { 702 return cp.deleted 703 }