package index

import (
	"errors"
	"fmt"
	. "github.com/balzaczyy/golucene/core/codec/spi"
	"github.com/balzaczyy/golucene/core/store"
	"github.com/balzaczyy/golucene/core/util"
	"io"
	"runtime/debug"
	"strconv"
)

// index/CheckIndex.java

/* Returned from checkIndex() detailing the health and status of the index */
type CheckIndexStatus struct {
	// True if no problems found with the index.
	Clean bool

	// True if we were unable to locate and load the segments_N file.
	MissingSegments bool

	// True if we were unable to open the segments_N file.
	cantOpenSegments bool

	// True if we were unable to read the version number from the segments_N file.
	missingSegmentVersion bool

	// Name of latest segments_N file in the index.
	segmentsFilename string

	// Number of segments in the index.
	numSegments int

	// True if the index was created with a newer version of Lucene than the CheckIndex tool.
	toolOutOfDate bool

	// List of SegmentInfoStatus instances, detailing status of each segment.
	segmentInfos []*SegmentInfoStatus

	// Directory index is in.
	dir store.Directory

	// SegmentInfos instance containing only segments that had no
	// problems (this is used with the fixIndex() method to repair the
	// index).
	newSegments *SegmentInfos

	// How many documents will be lost to bad segments.
	totLoseDocCount int

	// How many bad segments were found.
	numBadSegments int

	// Whether the SegmentInfos.counter is greater than any of the segments' names.
	validCounter bool

	// The greatest segment name.
	maxSegmentName int

	// Holds the userData of the last commit in the index.
	userData map[string]string
}

/* Holds the status of each segment in the index.
 */
type SegmentInfoStatus struct {
	// Name of the segment.
	name string

	// Codec used to read this segment.
	codec Codec

	// Document count (does not take deletions into account).
	docCount int

	// True if segment is compound file format.
	compound bool

	// Number of files referenced by this segment.
	numFiles int

	// Net size (MB) of the files referenced by this segment.
	sizeMB float64

	// True if this segment has pending deletions.
	hasDeletions bool

	// Current deletions generation.
	deletionsGen int64

	// Number of deleted documents.
	numDeleted int

	// True if we were able to open an AtomicReader on this segment.
	openReaderPassed bool

	// Number of fields in this segment.
	numFields int

	// Map that includes certain debugging details that IW records into
	// each segment it creates.
	diagnostics map[string]string

	// Status for testing of field norms (nil if field norms could not be tested).
	fieldNormStatus *FieldNormStatus

	// Status for testing of indexed terms (nil if indexed terms could not be tested).
	termIndexStatus *TermIndexStatus

	// Status for testing of stored fields (nil if stored fields could not be tested).
	storedFieldStatus *StoredFieldStatus

	// Status for testing term vectors (nil if term vectors could not be tested).
	termVectorStatus *TermVectorStatus

	// Status for testing of DocValues (nil if DocValues could not be tested).
	docValuesStatus *DocValuesStatus
}

// Status from testing field norms; err is non-nil when the check failed.
type FieldNormStatus struct {
	err error
}

// Status from testing the term index; err is non-nil when the check failed.
type TermIndexStatus struct {
	err error
}

// Status from testing stored fields; err is non-nil when the check failed.
type StoredFieldStatus struct {
	err error
}

// Status from testing term vectors; err is non-nil when the check failed.
type TermVectorStatus struct {
	err error
}

// Status from testing DocValues; err is non-nil when the check failed.
type DocValuesStatus struct {
	err error
}

/*
Basic tool and API to check the health of an index and write a new
segments file that removes reference to problematic segments.
143 144 As this tool checks every byte in the index, on a large index it can 145 take a long time to run. 146 */ 147 type CheckIndex struct { 148 infoStream io.Writer 149 dir store.Directory 150 crossCheckTermVectors bool 151 failFast bool 152 } 153 154 func NewCheckIndex(dir store.Directory, crossCheckTermVectors bool, infoStream io.Writer) *CheckIndex { 155 return &CheckIndex{ 156 infoStream: infoStream, 157 dir: dir, 158 crossCheckTermVectors: crossCheckTermVectors, 159 } 160 } 161 162 func (ch *CheckIndex) msg(msg string, args ...interface{}) { 163 fmt.Fprintf(ch.infoStream, msg, args...) 164 fmt.Fprintln(ch.infoStream) 165 } 166 167 /* 168 Returns a Status instance detailing the state of the index. 169 170 As this method checks every byte in the specified segments, on a 171 large index it can take quite a long time to run. 172 173 WARNING: make sure you only call this when the index is not opened 174 by any writer. 175 */ 176 func (ch *CheckIndex) CheckIndex(onlySegments []string) *CheckIndexStatus { 177 sis := &SegmentInfos{} 178 result := &CheckIndexStatus{ 179 dir: ch.dir, 180 } 181 err := sis.ReadAll(ch.dir) 182 if err != nil { 183 if ch.failFast { 184 panic("niy") 185 } 186 fmt.Fprintln(ch.infoStream, "ERROR: could not read any segments file in directory") 187 debug.PrintStack() 188 result.MissingSegments = true 189 return result 190 } 191 192 // find the oldest and newest segment versions 193 var oldest util.Version 194 var newest util.Version 195 var oldSegs string 196 for _, si := range sis.Segments { 197 if version := si.Info.Version(); len(version) != 0 { 198 if len(oldest) == 0 || !version.OnOrAfter(oldest) { 199 oldest = version 200 } 201 if len(newest) == 0 || version.OnOrAfter(newest) { 202 newest = version 203 } 204 } else { 205 // pre-3.1 segment 206 oldSegs = "pre-3.1" 207 } 208 } 209 210 numSegments := len(sis.Segments) 211 segmentsFilename := sis.SegmentsFileName() 212 // note: we only read the format byte (required preamble) here! 
213 input, err := ch.dir.OpenInput(segmentsFilename, store.IO_CONTEXT_READONCE) 214 if err != nil { 215 if ch.failFast { 216 panic("niy") 217 } 218 fmt.Fprintln(ch.infoStream, "ERROR: could not open segments file in directory") 219 debug.PrintStack() 220 result.cantOpenSegments = true 221 return result 222 } 223 defer input.Close() // ignore error 224 225 _, err = input.ReadInt() 226 if err != nil { 227 if ch.failFast { 228 panic("niy") 229 } 230 fmt.Fprintln(ch.infoStream, "ERROR: could not read segment file version in directory") 231 debug.PrintStack() 232 result.missingSegmentVersion = true 233 return result 234 } 235 236 var sFormat string 237 var skip = false 238 239 result.segmentsFilename = segmentsFilename 240 result.numSegments = numSegments 241 result.userData = sis.userData 242 var userDataStr string 243 if len(sis.userData) > 0 { 244 userDataStr = fmt.Sprintf(" userData=%v", sis.userData) 245 } 246 247 var versionStr string 248 if oldSegs != "" { 249 if len(newest) != 0 { 250 versionStr = fmt.Sprintf("versions=[%v .. %v]", oldSegs, newest) 251 } else { 252 versionStr = fmt.Sprintf("version=%v", oldSegs) 253 } 254 } else if len(newest) != 0 { // implies oldest is set 255 if newest.Equals(oldest) { 256 versionStr = fmt.Sprintf("version=%v", oldest) 257 } else { 258 versionStr = fmt.Sprintf("versions=[%v .. 
%v]", oldest, newest) 259 } 260 } 261 262 ch.msg("Segments file=%v numSegments=%v %v format=%v%v", 263 segmentsFilename, numSegments, versionStr, sFormat, userDataStr) 264 265 names := make(map[string]bool) 266 if onlySegments != nil { 267 for _, name := range onlySegments { 268 names[name] = true 269 } 270 panic("not implemented yet") 271 } 272 273 if skip { 274 ch.msg( 275 "\nERROR: this index appears to be created by a newer version of Lucene than this tool was compiled on; please re-compile this tool on the matching version of Lucene; exiting") 276 result.toolOutOfDate = true 277 return result 278 } 279 280 result.newSegments = sis.Clone() 281 result.newSegments.Clear() 282 result.maxSegmentName = -1 283 284 for i, info := range sis.Segments { 285 segmentName, err := strconv.ParseInt(info.Info.Name[1:], 36, 32) 286 if err != nil { 287 panic(err) // impossible 288 } 289 if int(segmentName) > result.maxSegmentName { 290 result.maxSegmentName = int(segmentName) 291 } 292 if _, ok := names[info.Info.Name]; !ok { 293 continue 294 } 295 segInfoStat := new(SegmentInfoStatus) 296 result.segmentInfos = append(result.segmentInfos, segInfoStat) 297 infoDocCount := info.Info.DocCount() 298 ch.msg(" %v of %v: name=%v docCount=%v ", 299 1+i, numSegments, info.Info.Name, infoDocCount) 300 segInfoStat.name = info.Info.Name 301 segInfoStat.docCount = infoDocCount 302 303 version := info.Info.Version() 304 if infoDocCount <= 0 && version.OnOrAfter(util.VERSION_45) { 305 panic(fmt.Sprintf("illegal number of documents: maxDoc=%v", infoDocCount)) 306 } 307 308 toLoseDocCount := infoDocCount 309 err = func() error { 310 assert2(len(version) != 0, "pre 4.0 is not supported yet") 311 ch.msg(" version=%v", version) 312 codec := info.Info.Codec().(Codec) 313 ch.msg(" codec = %v", codec) 314 segInfoStat.codec = codec 315 ch.msg(" compound = %v", info.Info.IsCompoundFile()) 316 segInfoStat.compound = info.Info.IsCompoundFile() 317 ch.msg(" numFiles = %v", len(info.Files())) 318 
segInfoStat.numFiles = len(info.Files()) 319 n, err := info.SizeInBytes() 320 if err != nil { 321 return err 322 } 323 segInfoStat.sizeMB = float64(n) / (1024 * 1024) 324 if v := info.Info.Attribute("Lucene3xSegmentInfoFormat.dsoffset"); v == "" { 325 // don't print size in bytes if it's a 3.0 segment iwht shared docstores 326 ch.msg(" size (MB) = %v", segInfoStat.sizeMB) 327 } 328 329 diagnostics := info.Info.Diagnostics() 330 segInfoStat.diagnostics = diagnostics 331 if len(diagnostics) > 0 { 332 ch.msg(" diagnostics = %v", diagnostics) 333 } 334 335 atts := info.Info.Attributes() 336 if len(atts) > 0 { 337 ch.msg(" attributes = %v", atts) 338 } 339 340 panic("not implemented yet") 341 342 if !info.HasDeletions() { 343 ch.msg(" no deletions") 344 segInfoStat.hasDeletions = false 345 } else { 346 ch.msg(" has deletions [delGen = %v]", info.DelGen()) 347 segInfoStat.hasDeletions = true 348 segInfoStat.deletionsGen = info.DelGen() 349 } 350 351 ch.msg(" test: open reader.........") 352 reader, err := NewSegmentReader(info, DEFAULT_TERMS_INDEX_DIVISOR, store.IO_CONTEXT_DEFAULT) 353 if err != nil { 354 return err 355 } 356 defer reader.Close() 357 358 segInfoStat.openReaderPassed = true 359 360 numDocs := reader.NumDocs() 361 toLoseDocCount = numDocs 362 if reader.hasDeletions() { 363 if n := infoDocCount - info.DelCount(); n != reader.NumDocs() { 364 return errors.New(fmt.Sprintf( 365 "delete count mismatch: info=%v vs reader=%v", 366 n, reader.NumDocs())) 367 } 368 if n := infoDocCount - reader.NumDocs(); n > reader.MaxDoc() { 369 return errors.New(fmt.Sprintf( 370 "too many deleted docs: maxDoc()=%v vs del count=%v", 371 reader.MaxDoc(), n)) 372 } 373 if n := infoDocCount - numDocs; n != info.DelCount() { 374 return errors.New(fmt.Sprintf( 375 "delete count mismatch: info=%v vs reader=%v", 376 info.DelCount(), n)) 377 } 378 liveDocs := reader.LiveDocs() 379 if liveDocs == nil { 380 return errors.New("segment should have deletions, but liveDocs is nil") 381 } else { 
382 var numLive = 0 383 for j := 0; j < liveDocs.Length(); j++ { 384 if liveDocs.At(j) { 385 numLive++ 386 } 387 } 388 if numLive != numDocs { 389 return errors.New(fmt.Sprintf( 390 "liveDocs count mismatch: info=%v, vs bits=%v", 391 numDocs, numLive)) 392 } 393 } 394 395 segInfoStat.numDeleted = infoDocCount - numDocs 396 ch.msg("OK [%v deleted docs]", segInfoStat.numDeleted) 397 } else { 398 if info.DelCount() != 0 { 399 return errors.New(fmt.Sprintf( 400 "delete count mismatch: info=%v vs reader=%v", 401 info.DelCount(), infoDocCount-numDocs)) 402 } 403 liveDocs := reader.LiveDocs() 404 if liveDocs != nil { 405 // it's ok for it to be non-nil here, as long as none are set right? 406 for j := 0; j < liveDocs.Length(); j++ { 407 if !liveDocs.At(j) { 408 return errors.New(fmt.Sprintf( 409 "liveDocs mismatch: info says no deletions but doc %v is deleted.", j)) 410 } 411 } 412 } 413 ch.msg("OK") 414 } 415 if reader.MaxDoc() != infoDocCount { 416 return errors.New(fmt.Sprintf( 417 "SegmentReader.maxDoc() %v != SegmentInfos.docCount %v", 418 reader.MaxDoc(), infoDocCount)) 419 } 420 421 // Test getFieldInfos() 422 ch.msg(" test: fields..............") 423 fieldInfos := reader.FieldInfos() 424 ch.msg("OK [%v fields]", fieldInfos.Size()) 425 segInfoStat.numFields = fieldInfos.Size() 426 427 segInfoStat.fieldNormStatus = ch.testFieldNorms(reader) 428 segInfoStat.termIndexStatus = ch.testPostings(reader) 429 segInfoStat.storedFieldStatus = ch.testStoredFields(reader) 430 segInfoStat.termVectorStatus = ch.testTermVectors(reader) 431 segInfoStat.docValuesStatus = ch.testDocValues(reader) 432 433 // Rethrow the first error we encountered 434 // This will cause stats for failed segments to be incremented properly 435 if segInfoStat.fieldNormStatus.err != nil { 436 return errors.New("Field Norm test failed") 437 } else if segInfoStat.termIndexStatus.err != nil { 438 return errors.New("Term Index test failed") 439 } else if segInfoStat.storedFieldStatus.err != nil { 440 return 
errors.New("Stored Field test failed") 441 } else if segInfoStat.termVectorStatus.err != nil { 442 return errors.New("Term Vector test failed") 443 } else if segInfoStat.docValuesStatus.err != nil { 444 return errors.New("DocValues test failed") 445 } 446 447 ch.msg("") 448 return nil 449 }() 450 if err != nil { 451 if ch.failFast { 452 panic("niy") 453 } 454 ch.msg("FAILED") 455 comment := "fixIndex() would remove reference to this segment" 456 ch.msg(" WARNING: %v; full error:", comment) 457 ch.msg(string(debug.Stack())) 458 ch.msg("") 459 result.totLoseDocCount += toLoseDocCount 460 result.numBadSegments++ 461 } else { 462 // Keeper 463 result.newSegments.Segments = append(result.newSegments.Segments, info.Clone()) 464 } 465 } 466 467 if result.numBadSegments == 0 { 468 result.Clean = true 469 } else { 470 ch.msg( 471 "WARNING: %v broken segments (containing %v documents) detected", 472 result.numBadSegments, result.totLoseDocCount) 473 } 474 475 result.validCounter = result.maxSegmentName < sis.counter 476 if !result.validCounter { 477 result.Clean = false 478 result.newSegments.counter = result.maxSegmentName + 1 479 ch.msg( 480 "ERROR: Next segment name counter %v is not greater than max segment name %v", 481 sis.counter, result.maxSegmentName) 482 } 483 484 if result.Clean { 485 ch.msg("No problems were detected with this index.\n") 486 } 487 488 return result 489 } 490 491 func (ch *CheckIndex) testFieldNorms(reader AtomicReader) *FieldNormStatus { 492 panic("not implemented yet") 493 } 494 495 func (ch *CheckIndex) testPostings(reader AtomicReader) *TermIndexStatus { 496 panic("not implemented yet") 497 } 498 499 func (ch *CheckIndex) testStoredFields(reader AtomicReader) *StoredFieldStatus { 500 panic("not implemented yet") 501 } 502 503 func (ch *CheckIndex) testDocValues(reader AtomicReader) *DocValuesStatus { 504 panic("not implemented yet") 505 } 506 507 func (ch *CheckIndex) testTermVectors(reader AtomicReader) *TermVectorStatus { 508 panic("not 
implemented yet") 509 }