github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/index/checkIndex.go (about)

     1  package index
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	. "github.com/balzaczyy/golucene/core/codec/spi"
     7  	"github.com/balzaczyy/golucene/core/store"
     8  	"github.com/balzaczyy/golucene/core/util"
     9  	"io"
    10  	"runtime/debug"
    11  	"strconv"
    12  )
    13  
    14  // index/CheckIndex.java
    15  
/* Returned from CheckIndex() detailing the health and status of the index. */
type CheckIndexStatus struct {
	// True if no problems were found with the index.
	Clean bool

	// True if we were unable to locate and load the segments_N file.
	MissingSegments bool

	// True if we were unable to open the segments_N file.
	cantOpenSegments bool

	// True if we were unable to read the version number from the segments_N file.
	missingSegmentVersion bool

	// Name of latest segments_N file in the index.
	segmentsFilename string

	// Number of segments in the index.
	numSegments int

	// True if the index was created with a newer version of Lucene than the CheckIndex tool.
	toolOutOfDate bool

	// List of SegmentInfoStatus instances, detailing status of each segment.
	segmentInfos []*SegmentInfoStatus

	// Directory the index is in.
	dir store.Directory

	// SegmentInfos instance containing only segments that had no
	// problems (this is used with the fixIndex() method to repair the
	// index).
	newSegments *SegmentInfos

	// How many documents will be lost to bad segments.
	totLoseDocCount int

	// How many bad segments were found.
	numBadSegments int

	// Whether the SegmentInfos.counter is greater than any of the segments' names.
	validCounter bool

	// The greatest segment name (parsed from its base-36 suffix).
	maxSegmentName int

	// Holds the userData of the last commit in the index.
	userData map[string]string
}
    65  
/* Holds the status of each segment in the index. */
type SegmentInfoStatus struct {
	// Name of the segment.
	name string

	// Codec used to read this segment.
	codec Codec

	// Document count (does not take deletions into account).
	docCount int

	// True if segment is in compound file format.
	compound bool

	// Number of files referenced by this segment.
	numFiles int

	// Net size (MB) of the files referenced by this segment.
	sizeMB float64

	// True if this segment has pending deletions.
	hasDeletions bool

	// Current deletions generation.
	deletionsGen int64

	// Number of deleted documents.
	numDeleted int

	// True if we were able to open a SegmentReader on this segment.
	openReaderPassed bool

	// Number of fields in this segment.
	numFields int

	// Map that includes certain debugging details that IndexWriter records into each segment it creates.
	diagnostics map[string]string

	// Status for testing of field norms (nil if field norms could not be tested).
	fieldNormStatus *FieldNormStatus

	// Status for testing of indexed terms (nil if indexed terms could not be tested).
	termIndexStatus *TermIndexStatus

	// Status for testing of stored fields (nil if stored fields could not be tested).
	storedFieldStatus *StoredFieldStatus

	// Status for testing term vectors (nil if term vectors could not be tested).
	termVectorStatus *TermVectorStatus

	// Status for testing of DocValues (nil if DocValues could not be tested).
	docValuesStatus *DocValuesStatus
}
   119  
// FieldNormStatus holds the result of the field-norms check for one
// segment; err is non-nil if the check failed.
type FieldNormStatus struct {
	err error
}
   123  
// TermIndexStatus holds the result of the indexed-terms (postings)
// check for one segment; err is non-nil if the check failed.
type TermIndexStatus struct {
	err error
}
   127  
// StoredFieldStatus holds the result of the stored-fields check for
// one segment; err is non-nil if the check failed.
type StoredFieldStatus struct {
	err error
}
   131  
// TermVectorStatus holds the result of the term-vectors check for one
// segment; err is non-nil if the check failed.
type TermVectorStatus struct {
	err error
}
   135  
// DocValuesStatus holds the result of the DocValues check for one
// segment; err is non-nil if the check failed.
type DocValuesStatus struct {
	err error
}
   139  
   140  /*
   141  Basic tool and API to check the health of an index and write a new
   142  segments file that removes reference to problematic segments.
   143  
   144  As this tool checks every byte in the index, on a large index it can
   145  take a long time to run.
   146  */
   147  type CheckIndex struct {
   148  	infoStream            io.Writer
   149  	dir                   store.Directory
   150  	crossCheckTermVectors bool
   151  	failFast              bool
   152  }
   153  
   154  func NewCheckIndex(dir store.Directory, crossCheckTermVectors bool, infoStream io.Writer) *CheckIndex {
   155  	return &CheckIndex{
   156  		infoStream: infoStream,
   157  		dir:        dir,
   158  		crossCheckTermVectors: crossCheckTermVectors,
   159  	}
   160  }
   161  
   162  func (ch *CheckIndex) msg(msg string, args ...interface{}) {
   163  	fmt.Fprintf(ch.infoStream, msg, args...)
   164  	fmt.Fprintln(ch.infoStream)
   165  }
   166  
   167  /*
   168  Returns a Status instance detailing the state of the index.
   169  
   170  As this method checks every byte in the specified segments, on a
   171  large index it can take quite a long time to run.
   172  
   173  WARNING: make sure you only call this when the index is not opened
   174  by any writer.
   175  */
   176  func (ch *CheckIndex) CheckIndex(onlySegments []string) *CheckIndexStatus {
   177  	sis := &SegmentInfos{}
   178  	result := &CheckIndexStatus{
   179  		dir: ch.dir,
   180  	}
   181  	err := sis.ReadAll(ch.dir)
   182  	if err != nil {
   183  		if ch.failFast {
   184  			panic("niy")
   185  		}
   186  		fmt.Fprintln(ch.infoStream, "ERROR: could not read any segments file in directory")
   187  		debug.PrintStack()
   188  		result.MissingSegments = true
   189  		return result
   190  	}
   191  
   192  	// find the oldest and newest segment versions
   193  	var oldest util.Version
   194  	var newest util.Version
   195  	var oldSegs string
   196  	for _, si := range sis.Segments {
   197  		if version := si.Info.Version(); len(version) != 0 {
   198  			if len(oldest) == 0 || !version.OnOrAfter(oldest) {
   199  				oldest = version
   200  			}
   201  			if len(newest) == 0 || version.OnOrAfter(newest) {
   202  				newest = version
   203  			}
   204  		} else {
   205  			// pre-3.1 segment
   206  			oldSegs = "pre-3.1"
   207  		}
   208  	}
   209  
   210  	numSegments := len(sis.Segments)
   211  	segmentsFilename := sis.SegmentsFileName()
   212  	// note: we only read the format byte (required preamble) here!
   213  	input, err := ch.dir.OpenInput(segmentsFilename, store.IO_CONTEXT_READONCE)
   214  	if err != nil {
   215  		if ch.failFast {
   216  			panic("niy")
   217  		}
   218  		fmt.Fprintln(ch.infoStream, "ERROR: could not open segments file in directory")
   219  		debug.PrintStack()
   220  		result.cantOpenSegments = true
   221  		return result
   222  	}
   223  	defer input.Close() // ignore error
   224  
   225  	_, err = input.ReadInt()
   226  	if err != nil {
   227  		if ch.failFast {
   228  			panic("niy")
   229  		}
   230  		fmt.Fprintln(ch.infoStream, "ERROR: could not read segment file version in directory")
   231  		debug.PrintStack()
   232  		result.missingSegmentVersion = true
   233  		return result
   234  	}
   235  
   236  	var sFormat string
   237  	var skip = false
   238  
   239  	result.segmentsFilename = segmentsFilename
   240  	result.numSegments = numSegments
   241  	result.userData = sis.userData
   242  	var userDataStr string
   243  	if len(sis.userData) > 0 {
   244  		userDataStr = fmt.Sprintf(" userData=%v", sis.userData)
   245  	}
   246  
   247  	var versionStr string
   248  	if oldSegs != "" {
   249  		if len(newest) != 0 {
   250  			versionStr = fmt.Sprintf("versions=[%v .. %v]", oldSegs, newest)
   251  		} else {
   252  			versionStr = fmt.Sprintf("version=%v", oldSegs)
   253  		}
   254  	} else if len(newest) != 0 { // implies oldest is set
   255  		if newest.Equals(oldest) {
   256  			versionStr = fmt.Sprintf("version=%v", oldest)
   257  		} else {
   258  			versionStr = fmt.Sprintf("versions=[%v .. %v]", oldest, newest)
   259  		}
   260  	}
   261  
   262  	ch.msg("Segments file=%v numSegments=%v %v format=%v%v",
   263  		segmentsFilename, numSegments, versionStr, sFormat, userDataStr)
   264  
   265  	names := make(map[string]bool)
   266  	if onlySegments != nil {
   267  		for _, name := range onlySegments {
   268  			names[name] = true
   269  		}
   270  		panic("not implemented yet")
   271  	}
   272  
   273  	if skip {
   274  		ch.msg(
   275  			"\nERROR: this index appears to be created by a newer version of Lucene than this tool was compiled on; please re-compile this tool on the matching version of Lucene; exiting")
   276  		result.toolOutOfDate = true
   277  		return result
   278  	}
   279  
   280  	result.newSegments = sis.Clone()
   281  	result.newSegments.Clear()
   282  	result.maxSegmentName = -1
   283  
   284  	for i, info := range sis.Segments {
   285  		segmentName, err := strconv.ParseInt(info.Info.Name[1:], 36, 32)
   286  		if err != nil {
   287  			panic(err) // impossible
   288  		}
   289  		if int(segmentName) > result.maxSegmentName {
   290  			result.maxSegmentName = int(segmentName)
   291  		}
   292  		if _, ok := names[info.Info.Name]; !ok {
   293  			continue
   294  		}
   295  		segInfoStat := new(SegmentInfoStatus)
   296  		result.segmentInfos = append(result.segmentInfos, segInfoStat)
   297  		infoDocCount := info.Info.DocCount()
   298  		ch.msg("  %v of %v: name=%v docCount=%v ",
   299  			1+i, numSegments, info.Info.Name, infoDocCount)
   300  		segInfoStat.name = info.Info.Name
   301  		segInfoStat.docCount = infoDocCount
   302  
   303  		version := info.Info.Version()
   304  		if infoDocCount <= 0 && version.OnOrAfter(util.VERSION_45) {
   305  			panic(fmt.Sprintf("illegal number of documents: maxDoc=%v", infoDocCount))
   306  		}
   307  
   308  		toLoseDocCount := infoDocCount
   309  		err = func() error {
   310  			assert2(len(version) != 0, "pre 4.0 is not supported yet")
   311  			ch.msg("    version=%v", version)
   312  			codec := info.Info.Codec().(Codec)
   313  			ch.msg("    codec = %v", codec)
   314  			segInfoStat.codec = codec
   315  			ch.msg("    compound = %v", info.Info.IsCompoundFile())
   316  			segInfoStat.compound = info.Info.IsCompoundFile()
   317  			ch.msg("    numFiles = %v", len(info.Files()))
   318  			segInfoStat.numFiles = len(info.Files())
   319  			n, err := info.SizeInBytes()
   320  			if err != nil {
   321  				return err
   322  			}
   323  			segInfoStat.sizeMB = float64(n) / (1024 * 1024)
   324  			if v := info.Info.Attribute("Lucene3xSegmentInfoFormat.dsoffset"); v == "" {
   325  				// don't print size in bytes if it's a 3.0 segment iwht shared docstores
   326  				ch.msg("    size (MB) = %v", segInfoStat.sizeMB)
   327  			}
   328  
   329  			diagnostics := info.Info.Diagnostics()
   330  			segInfoStat.diagnostics = diagnostics
   331  			if len(diagnostics) > 0 {
   332  				ch.msg("    diagnostics = %v", diagnostics)
   333  			}
   334  
   335  			atts := info.Info.Attributes()
   336  			if len(atts) > 0 {
   337  				ch.msg("    attributes = %v", atts)
   338  			}
   339  
   340  			panic("not implemented yet")
   341  
   342  			if !info.HasDeletions() {
   343  				ch.msg("    no deletions")
   344  				segInfoStat.hasDeletions = false
   345  			} else {
   346  				ch.msg("     has deletions [delGen = %v]", info.DelGen())
   347  				segInfoStat.hasDeletions = true
   348  				segInfoStat.deletionsGen = info.DelGen()
   349  			}
   350  
   351  			ch.msg("    test: open reader.........")
   352  			reader, err := NewSegmentReader(info, DEFAULT_TERMS_INDEX_DIVISOR, store.IO_CONTEXT_DEFAULT)
   353  			if err != nil {
   354  				return err
   355  			}
   356  			defer reader.Close()
   357  
   358  			segInfoStat.openReaderPassed = true
   359  
   360  			numDocs := reader.NumDocs()
   361  			toLoseDocCount = numDocs
   362  			if reader.hasDeletions() {
   363  				if n := infoDocCount - info.DelCount(); n != reader.NumDocs() {
   364  					return errors.New(fmt.Sprintf(
   365  						"delete count mismatch: info=%v vs reader=%v",
   366  						n, reader.NumDocs()))
   367  				}
   368  				if n := infoDocCount - reader.NumDocs(); n > reader.MaxDoc() {
   369  					return errors.New(fmt.Sprintf(
   370  						"too many deleted docs: maxDoc()=%v vs del count=%v",
   371  						reader.MaxDoc(), n))
   372  				}
   373  				if n := infoDocCount - numDocs; n != info.DelCount() {
   374  					return errors.New(fmt.Sprintf(
   375  						"delete count mismatch: info=%v vs reader=%v",
   376  						info.DelCount(), n))
   377  				}
   378  				liveDocs := reader.LiveDocs()
   379  				if liveDocs == nil {
   380  					return errors.New("segment should have deletions, but liveDocs is nil")
   381  				} else {
   382  					var numLive = 0
   383  					for j := 0; j < liveDocs.Length(); j++ {
   384  						if liveDocs.At(j) {
   385  							numLive++
   386  						}
   387  					}
   388  					if numLive != numDocs {
   389  						return errors.New(fmt.Sprintf(
   390  							"liveDocs count mismatch: info=%v, vs bits=%v",
   391  							numDocs, numLive))
   392  					}
   393  				}
   394  
   395  				segInfoStat.numDeleted = infoDocCount - numDocs
   396  				ch.msg("OK [%v deleted docs]", segInfoStat.numDeleted)
   397  			} else {
   398  				if info.DelCount() != 0 {
   399  					return errors.New(fmt.Sprintf(
   400  						"delete count mismatch: info=%v vs reader=%v",
   401  						info.DelCount(), infoDocCount-numDocs))
   402  				}
   403  				liveDocs := reader.LiveDocs()
   404  				if liveDocs != nil {
   405  					// it's ok for it to be non-nil here, as long as none are set right?
   406  					for j := 0; j < liveDocs.Length(); j++ {
   407  						if !liveDocs.At(j) {
   408  							return errors.New(fmt.Sprintf(
   409  								"liveDocs mismatch: info says no deletions but doc %v is deleted.", j))
   410  						}
   411  					}
   412  				}
   413  				ch.msg("OK")
   414  			}
   415  			if reader.MaxDoc() != infoDocCount {
   416  				return errors.New(fmt.Sprintf(
   417  					"SegmentReader.maxDoc() %v != SegmentInfos.docCount %v",
   418  					reader.MaxDoc(), infoDocCount))
   419  			}
   420  
   421  			// Test getFieldInfos()
   422  			ch.msg("    test: fields..............")
   423  			fieldInfos := reader.FieldInfos()
   424  			ch.msg("OK [%v fields]", fieldInfos.Size())
   425  			segInfoStat.numFields = fieldInfos.Size()
   426  
   427  			segInfoStat.fieldNormStatus = ch.testFieldNorms(reader)
   428  			segInfoStat.termIndexStatus = ch.testPostings(reader)
   429  			segInfoStat.storedFieldStatus = ch.testStoredFields(reader)
   430  			segInfoStat.termVectorStatus = ch.testTermVectors(reader)
   431  			segInfoStat.docValuesStatus = ch.testDocValues(reader)
   432  
   433  			// Rethrow the first error we encountered
   434  			// This will cause stats for failed segments to be incremented properly
   435  			if segInfoStat.fieldNormStatus.err != nil {
   436  				return errors.New("Field Norm test failed")
   437  			} else if segInfoStat.termIndexStatus.err != nil {
   438  				return errors.New("Term Index test failed")
   439  			} else if segInfoStat.storedFieldStatus.err != nil {
   440  				return errors.New("Stored Field test failed")
   441  			} else if segInfoStat.termVectorStatus.err != nil {
   442  				return errors.New("Term Vector test failed")
   443  			} else if segInfoStat.docValuesStatus.err != nil {
   444  				return errors.New("DocValues test failed")
   445  			}
   446  
   447  			ch.msg("")
   448  			return nil
   449  		}()
   450  		if err != nil {
   451  			if ch.failFast {
   452  				panic("niy")
   453  			}
   454  			ch.msg("FAILED")
   455  			comment := "fixIndex() would remove reference to this segment"
   456  			ch.msg("    WARNING: %v; full error:", comment)
   457  			ch.msg(string(debug.Stack()))
   458  			ch.msg("")
   459  			result.totLoseDocCount += toLoseDocCount
   460  			result.numBadSegments++
   461  		} else {
   462  			// Keeper
   463  			result.newSegments.Segments = append(result.newSegments.Segments, info.Clone())
   464  		}
   465  	}
   466  
   467  	if result.numBadSegments == 0 {
   468  		result.Clean = true
   469  	} else {
   470  		ch.msg(
   471  			"WARNING: %v broken segments (containing %v documents) detected",
   472  			result.numBadSegments, result.totLoseDocCount)
   473  	}
   474  
   475  	result.validCounter = result.maxSegmentName < sis.counter
   476  	if !result.validCounter {
   477  		result.Clean = false
   478  		result.newSegments.counter = result.maxSegmentName + 1
   479  		ch.msg(
   480  			"ERROR: Next segment name counter %v is not greater than max segment name %v",
   481  			sis.counter, result.maxSegmentName)
   482  	}
   483  
   484  	if result.Clean {
   485  		ch.msg("No problems were detected with this index.\n")
   486  	}
   487  
   488  	return result
   489  }
   490  
// testFieldNorms checks the field norms of a segment.
// TODO: not implemented yet; currently always panics.
func (ch *CheckIndex) testFieldNorms(reader AtomicReader) *FieldNormStatus {
	panic("not implemented yet")
}
   494  
// testPostings checks the indexed terms (postings) of a segment.
// TODO: not implemented yet; currently always panics.
func (ch *CheckIndex) testPostings(reader AtomicReader) *TermIndexStatus {
	panic("not implemented yet")
}
   498  
// testStoredFields checks the stored fields of a segment.
// TODO: not implemented yet; currently always panics.
func (ch *CheckIndex) testStoredFields(reader AtomicReader) *StoredFieldStatus {
	panic("not implemented yet")
}
   502  
// testDocValues checks the DocValues of a segment.
// TODO: not implemented yet; currently always panics.
func (ch *CheckIndex) testDocValues(reader AtomicReader) *DocValuesStatus {
	panic("not implemented yet")
}
   506  
// testTermVectors checks the term vectors of a segment.
// TODO: not implemented yet; currently always panics.
func (ch *CheckIndex) testTermVectors(reader AtomicReader) *TermVectorStatus {
	panic("not implemented yet")
}