github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/index/defaultIndexingChain.go (about) 1 package index 2 3 import ( 4 "fmt" 5 "github.com/balzaczyy/golucene/core/analysis" 6 . "github.com/balzaczyy/golucene/core/codec/spi" 7 . "github.com/balzaczyy/golucene/core/index/model" 8 "github.com/balzaczyy/golucene/core/store" 9 "github.com/balzaczyy/golucene/core/util" 10 ) 11 12 /* Default general purpose indexing chain, which handles indexing all types of fields */ 13 type DefaultIndexingChain struct { 14 bytesUsed util.Counter 15 docState *docState 16 docWriter *DocumentsWriterPerThread 17 fieldInfos *FieldInfosBuilder 18 19 // Writes postings and term vectors: 20 termsHash TermsHash 21 22 storedFieldsWriter StoredFieldsWriter // lazy init 23 lastStoredDocId int 24 25 fieldHash []*PerField 26 hashMask int 27 28 totalFieldCount int 29 nextFieldGen int64 30 31 // Holds fields seen in each document 32 fields []*PerField 33 } 34 35 func newDefaultIndexingChain(docWriter *DocumentsWriterPerThread) *DefaultIndexingChain { 36 termVectorsWriter := newTermVectorsConsumer(docWriter) 37 return &DefaultIndexingChain{ 38 docWriter: docWriter, 39 fieldInfos: docWriter.fieldInfos, 40 docState: docWriter.docState, 41 bytesUsed: docWriter._bytesUsed, 42 termsHash: newFreqProxTermsWriter(docWriter, termVectorsWriter), 43 fieldHash: make([]*PerField, 2), 44 hashMask: 1, 45 fields: make([]*PerField, 1), 46 } 47 } 48 49 // TODO: can we remove this lazy-init / make cleaner / do it another way...? 50 func (c *DefaultIndexingChain) initStoredFieldsWriter() (err error) { 51 if c.storedFieldsWriter == nil { 52 assert(c != nil) 53 assert(c.docWriter != nil) 54 assert(c.docWriter.codec != nil) 55 assert(c.docWriter.codec.StoredFieldsFormat() != nil) 56 c.storedFieldsWriter, err = c.docWriter.codec.StoredFieldsFormat().FieldsWriter( 57 c.docWriter.directory, c.docWriter.segmentInfo, store.IO_CONTEXT_DEFAULT) 58 } 59 return 60 } 61 62 func (c *DefaultIndexingChain) flush(state *SegmentWriteState) (err error) { 63 // NOTE: caller (DWPT) handles aborting on any error from this method 64 65 numDocs := state.SegmentInfo.DocCount() 66 if err = c.writeNorms(state); err != nil { 67 return 68 } 69 if err = c.writeDocValues(state); err != nil { 70 return 71 } 72 73 // it's possible all docs hit non-aboritng errors... 74 if err = c.initStoredFieldsWriter(); err != nil { 75 return 76 } 77 if err = c.fillStoredFields(numDocs); err != nil { 78 return 79 } 80 if err = c.storedFieldsWriter.Finish(state.FieldInfos, numDocs); err != nil { 81 return 82 } 83 if err = c.storedFieldsWriter.Close(); err != nil { 84 return 85 } 86 87 fieldsToFlush := make(map[string]TermsHashPerField) 88 for _, perField := range c.fieldHash { 89 for perField != nil { 90 if perField.invertState != nil { 91 fieldsToFlush[perField.fieldInfo.Name] = perField.termsHashPerField 92 } 93 perField = perField.next 94 } 95 } 96 97 if err = c.termsHash.flush(fieldsToFlush, state); err != nil { 98 return 99 } 100 101 // important to save after asking consumer to flush so consumer can 102 // alter the FieldInfo* if necessary. E.g., FreqProxTermsWriter does 103 // this with FieldInfo.storePayload. 104 infosWriter := c.docWriter.codec.FieldInfosFormat().FieldInfosWriter() 105 return infosWriter(state.Directory, state.SegmentInfo.Name, "", state.FieldInfos, store.IO_CONTEXT_DEFAULT) 106 } 107 108 /* Writes all buffered doc values (called from flush()) */ 109 func (c *DefaultIndexingChain) writeDocValues(state *SegmentWriteState) (err error) { 110 docCount := state.SegmentInfo.DocCount() 111 var dvConsumer DocValuesConsumer 112 var success = false 113 if success { 114 err = util.Close(dvConsumer) 115 } else { 116 util.CloseWhileSuppressingError(dvConsumer) 117 } 118 119 for _, perField := range c.fieldHash { 120 for perField != nil { 121 if perField.docValuesWriter != nil { 122 if dvConsumer == nil { 123 // lazy init 124 fmt := state.SegmentInfo.Codec().(Codec).DocValuesFormat() 125 if dvConsumer, err = fmt.FieldsConsumer(state); err != nil { 126 return 127 } 128 } 129 130 perField.docValuesWriter.finish(docCount) 131 if err = perField.docValuesWriter.flush(state, dvConsumer); err != nil { 132 return 133 } 134 perField.docValuesWriter = nil 135 } 136 perField = perField.next 137 } 138 } 139 140 success = true 141 return nil 142 } 143 144 /* 145 Catch up for all docs before us that had no stored fields, or hit 146 non-aborting errors before writing stored fields. 147 */ 148 func (c *DefaultIndexingChain) fillStoredFields(docId int) (err error) { 149 for err == nil && c.lastStoredDocId < docId { 150 err = c.startStoredFields() 151 if err == nil { 152 err = c.finishStoredFields() 153 } 154 } 155 return 156 } 157 158 func (c *DefaultIndexingChain) writeNorms(state *SegmentWriteState) (err error) { 159 var success = false 160 var normsConsumer DocValuesConsumer 161 defer func() { 162 if success { 163 err = util.Close(normsConsumer) 164 } else { 165 util.CloseWhileSuppressingError(normsConsumer) 166 } 167 }() 168 169 if state.FieldInfos.HasNorms { 170 normsFormat := state.SegmentInfo.Codec().(Codec).NormsFormat() 171 assert(normsFormat != nil) 172 if normsConsumer, err = normsFormat.NormsConsumer(state); err != nil { 173 return 174 } 175 176 for _, fi := range state.FieldInfos.Values { 177 perField := c.perField(fi.Name) 178 assert(perField != nil) 179 180 // we must check the final value of omitNorms for the FieldInfo: 181 // it could have changed for this field since the first time we 182 // added it. 183 if !fi.OmitsNorms() { 184 if perField.norms != nil { 185 perField.norms.finish(state.SegmentInfo.DocCount()) 186 if err = perField.norms.flush(state, normsConsumer); err != nil { 187 return 188 } 189 assert(fi.NormType() == DOC_VALUES_TYPE_NUMERIC) 190 } else if fi.IsIndexed() { 191 assert2(fi.NormType() == 0, "got %v; field=%v", fi.NormType(), fi.Name) 192 } 193 } 194 } 195 } 196 success = true 197 return nil 198 } 199 200 func (c *DefaultIndexingChain) abort() { 201 // E.g. close any open files in the stored fields writer: 202 if c.storedFieldsWriter != nil { 203 c.storedFieldsWriter.Abort() // ignore error 204 } 205 206 // E.g. close any open files in the term vectors writer: 207 c.termsHash.abort() 208 209 for i, _ := range c.fieldHash { 210 c.fieldHash[i] = nil 211 } 212 } 213 214 func (c *DefaultIndexingChain) rehash() { 215 newHashSize := 2 * len(c.fieldHash) 216 assert(newHashSize > len(c.fieldHash)) 217 218 newHashArray := make([]*PerField, newHashSize) 219 220 // rehash 221 newHashMask := newHashSize - 1 222 for _, fp0 := range c.fieldHash { 223 for fp0 != nil { 224 hashPos2 := util.Hashstr(fp0.fieldInfo.Name) & newHashMask 225 fp0.next, newHashArray[hashPos2], fp0 = 226 newHashArray[hashPos2], fp0, fp0.next 227 } 228 } 229 230 c.fieldHash = newHashArray 231 c.hashMask = newHashMask 232 } 233 234 /* Calls StoredFieldsWriter.startDocument, aborting the segment if it hits any error. */ 235 func (c *DefaultIndexingChain) startStoredFields() (err error) { 236 var success = false 237 defer func() { 238 if !success { 239 c.docWriter.setAborting() 240 } 241 }() 242 243 if err = c.initStoredFieldsWriter(); err != nil { 244 return 245 } 246 if err = c.storedFieldsWriter.StartDocument(); err != nil { 247 return 248 } 249 success = true 250 251 c.lastStoredDocId++ 252 return nil 253 } 254 255 /* Calls StoredFieldsWriter.finishDocument(), aborting the segment if it hits any error. */ 256 func (c *DefaultIndexingChain) finishStoredFields() error { 257 var success = false 258 defer func() { 259 if !success { 260 c.docWriter.setAborting() 261 } 262 }() 263 if err := c.storedFieldsWriter.FinishDocument(); err != nil { 264 return err 265 } 266 success = true 267 return nil 268 } 269 270 func (c *DefaultIndexingChain) processDocument() (err error) { 271 // How many indexed field names we've seen (collapses multiple 272 // field instances by the same name): 273 fieldCount := 0 274 275 fieldGen := c.nextFieldGen 276 c.nextFieldGen++ 277 278 // NOTE: we need to passes here, in case there are multi-valued 279 // fields, because we must process all instances of a given field 280 // at once, since the anlayzer is free to reuse TOkenStream across 281 // fields (i.e., we cannot have more than one TokenStream running 282 // "at once"): 283 284 c.termsHash.startDocument() 285 286 if err = c.fillStoredFields(c.docState.docID); err != nil { 287 return 288 } 289 if err = c.startStoredFields(); err != nil { 290 return 291 } 292 293 if err = func() error { 294 defer func() { 295 if !c.docWriter.aborting { 296 // Finish each indexed field name seen in the document: 297 for _, field := range c.fields[:fieldCount] { 298 err = mergeError(err, field.finish()) 299 } 300 err = mergeError(err, c.finishStoredFields()) 301 } 302 }() 303 304 for _, field := range c.docState.doc { 305 if fieldCount, err = c.processField(field, fieldGen, fieldCount); err != nil { 306 return err 307 } 308 } 309 return nil 310 }(); err != nil { 311 return 312 } 313 314 var success = false 315 defer func() { 316 if !success { 317 // Must abort, on the possibility that on-disk term vectors are now corrupt: 318 c.docWriter.setAborting() 319 } 320 }() 321 322 if err = c.termsHash.finishDocument(); err != nil { 323 return 324 } 325 success = true 326 return nil 327 } 328 329 func (c *DefaultIndexingChain) processField(field IndexableField, 330 fieldGen int64, fieldCount int) (int, error) { 331 332 var fieldName string = field.Name() 333 var fieldType IndexableFieldType = field.FieldType() 334 var fp *PerField 335 336 // Invert indexed fields: 337 if fieldType.Indexed() { 338 339 // if the field omits norms, the boost cannot be indexed. 340 if fieldType.OmitNorms() && field.Boost() != 1 { 341 panic(fmt.Sprintf( 342 "You cannot set an index-time boost: norms are omitted for field '%v'", 343 fieldName)) 344 } 345 346 fp = c.getOrAddField(fieldName, fieldType, true) 347 first := fp.fieldGen != fieldGen 348 if err := fp.invert(field, first); err != nil { 349 return 0, err 350 } 351 352 if first { 353 c.fields[fieldCount] = fp 354 fieldCount++ 355 fp.fieldGen = fieldGen 356 } 357 } else { 358 panic("not implemented yet") 359 } 360 361 // Add stored fields: 362 if fieldType.Stored() { 363 if fp == nil { 364 panic("not implemented yet") 365 } 366 if fieldType.Stored() { 367 if err := func() error { 368 var success = false 369 defer func() { 370 if !success { 371 c.docWriter.setAborting() 372 } 373 }() 374 375 if err := c.storedFieldsWriter.WriteField(fp.fieldInfo, field); err != nil { 376 return err 377 } 378 success = true 379 return nil 380 }(); err != nil { 381 return 0, err 382 } 383 } 384 } 385 386 if dvType := fieldType.DocValueType(); int(dvType) != 0 { 387 if fp == nil { 388 panic("not implemented yet") 389 } 390 panic("not implemented yet") 391 } 392 393 return fieldCount, nil 394 } 395 396 /* 397 Returns a previously created PerField, or nil if this field name 398 wasn't seen yet. 399 */ 400 func (c *DefaultIndexingChain) perField(name string) *PerField { 401 hashPos := util.Hashstr(name) & c.hashMask 402 fp := c.fieldHash[hashPos] 403 for fp != nil && fp.fieldInfo.Name != name { 404 fp = fp.next 405 } 406 return fp 407 } 408 409 func (c *DefaultIndexingChain) getOrAddField(name string, 410 fieldType IndexableFieldType, invert bool) *PerField { 411 412 // Make sure we have a PerField allocated 413 hashPos := util.Hashstr(name) & c.hashMask 414 fp := c.fieldHash[hashPos] 415 for fp != nil && fp.fieldInfo.Name != name && fp != fp.next { 416 fp = fp.next 417 } 418 419 if fp == nil { 420 // First time we are seeing this field in this segment 421 422 fi := c.fieldInfos.AddOrUpdate(name, fieldType) 423 424 fp = newPerField(c, fi, invert) 425 fp.next = c.fieldHash[hashPos] 426 c.fieldHash[hashPos] = fp 427 c.totalFieldCount++ 428 429 // At most 50% load factor: 430 if c.totalFieldCount >= len(c.fieldHash)/2 { 431 c.rehash() 432 } 433 434 if c.totalFieldCount > len(c.fields) { 435 newFields := make([]*PerField, util.Oversize(c.totalFieldCount, util.NUM_BYTES_OBJECT_REF)) 436 copy(newFields, c.fields) 437 c.fields = newFields 438 } 439 440 } else { 441 fp.fieldInfo.Update(fieldType) 442 443 if invert && fp.invertState == nil { 444 fp.setInvertState() 445 } 446 } 447 448 return fp 449 } 450 451 type PerField struct { 452 *DefaultIndexingChain // acess at least docState, termsHash. 453 454 fieldInfo *FieldInfo 455 similarity Similarity 456 457 invertState *FieldInvertState 458 termsHashPerField TermsHashPerField 459 460 // non-nil if this field ever had doc values in this segment: 461 docValuesWriter DocValuesWriter 462 463 // We use this to know when a PerField is seen for the first time 464 // in the current document. 465 fieldGen int64 466 467 // Used by the hash table 468 next *PerField 469 470 // Lazy init'd: 471 norms *NumericDocValuesWriter 472 473 // reused 474 tokenStream analysis.TokenStream 475 } 476 477 func newPerField(parent *DefaultIndexingChain, 478 fieldInfo *FieldInfo, invert bool) *PerField { 479 480 ans := &PerField{ 481 DefaultIndexingChain: parent, 482 fieldInfo: fieldInfo, 483 similarity: parent.docState.similarity, 484 fieldGen: -1, 485 } 486 if invert { 487 ans.setInvertState() 488 } 489 return ans 490 } 491 492 func (f *PerField) setInvertState() { 493 f.invertState = newFieldInvertState(f.fieldInfo.Name) 494 f.termsHashPerField = f.termsHash.addField(f.invertState, f.fieldInfo) 495 } 496 497 func (f *PerField) finish() error { 498 if !f.fieldInfo.OmitsNorms() { 499 if f.norms == nil { 500 f.fieldInfo.SetNormValueType(DOC_VALUES_TYPE_NUMERIC) 501 f.norms = newNumericDocValuesWriter(f.fieldInfo, f.docState.docWriter._bytesUsed, false) 502 } 503 f.norms.addValue(f.docState.docID, f.similarity.ComputeNorm(f.invertState)) 504 } 505 return f.termsHashPerField.finish() 506 } 507 508 /* 509 Inverts one field for one document; first is true if this is the 510 first time we are seeing this field name in this document. 511 */ 512 func (f *PerField) invert(field IndexableField, first bool) error { 513 if first { 514 // first time we're seeing this field (indexed) in this document: 515 f.invertState.reset() 516 } 517 518 fieldType := field.FieldType() 519 520 analyzed := fieldType.Tokenized() && f.docState.analyzer != nil 521 522 if err := func() (err error) { 523 // only bother checking offsets if something will consume them 524 // TODO: after we fix analyzers, also check if termVectorOffsets will be indexed. 525 checkOffsets := fieldType.IndexOptions() == INDEX_OPT_DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS 526 527 // To assist people in tracking down problems in analysis components, 528 // we wish to write the field name to the infostream when we fail. 529 // We expect some caller to eventually deal with the real error, so 530 // we don't want any error handling, but rather a deferred function 531 // that takes note of the problem. 532 aborting := false 533 succeededInProcessingField := false 534 defer func() { 535 if err != nil { 536 if _, ok := err.(util.MaxBytesLengthExceededError); ok { 537 aborting = false 538 prefix := make([]byte, 30) 539 bigTerm := f.invertState.termAttribute.BytesRef() 540 copy(prefix, bigTerm.ToBytes()[:30]) // keep at most 30 characters 541 if f.docState.infoStream.IsEnabled("IW") { 542 f.docState.infoStream.Message("IW", 543 "ERROR: Document contains at least one immense term in field='%v' "+ 544 "(whose UTF8 encoding is longer than the max length %v), "+ 545 "all of which were skipped. Please correct the analyzer to not produce such terms. "+ 546 "The prefix of the first immense term is: '%v...', original message: %v", 547 f.fieldInfo.Name, MAX_TERM_LENGTH_UTF8, string(prefix), err) 548 } 549 } 550 } 551 if !succeededInProcessingField && aborting { 552 f.docState.docWriter.setAborting() 553 } 554 555 if !succeededInProcessingField && f.docState.infoStream.IsEnabled("DW") { 556 f.docState.infoStream.Message("DW", 557 "An error was returned while processing field %v", 558 f.fieldInfo.Name) 559 } 560 }() 561 562 var stream analysis.TokenStream 563 stream, err = field.TokenStream(f.docState.analyzer, f.tokenStream) 564 if err != nil { 565 return err 566 } 567 defer stream.Close() 568 569 f.tokenStream = stream 570 // reset the TokenStream to the first token 571 if err = stream.Reset(); err != nil { 572 return err 573 } 574 575 f.invertState.setAttributeSource(stream.Attributes()) 576 577 f.termsHashPerField.start(field, first) 578 579 for { 580 var ok bool 581 if ok, err = stream.IncrementToken(); err != nil { 582 return err 583 } 584 if !ok { 585 break 586 } 587 588 // if we hit an error in stream.next below (which is fairly 589 // common, e.g. if analyzer chokes on a given document), then 590 // it's non-aborting and (above) this one document will be 591 // marked as deleted, but still consume a docId 592 593 posIncr := f.invertState.posIncrAttribute.PositionIncrement() 594 if f.invertState.position += posIncr; f.invertState.position < f.invertState.lastPosition { 595 assert2(posIncr != 0, 596 "first position increment must be > 0 (got 0) for field '%v'", 597 field.Name) 598 panic(fmt.Sprintf( 599 "position increments (and gaps) must be >= 0 (got %v) for field '%v'", 600 posIncr, field.Name)) 601 } 602 f.invertState.lastPosition = f.invertState.position 603 if posIncr == 0 { 604 f.invertState.numOverlap++ 605 } 606 607 if checkOffsets { 608 startOffset := f.invertState.offset + f.invertState.offsetAttribute.StartOffset() 609 endOffset := f.invertState.offset + f.invertState.offsetAttribute.EndOffset() 610 assert2(startOffset >= f.invertState.lastStartOffset && startOffset <= endOffset, 611 "startOffset must be non-negative, "+ 612 "and endOffset must be >= startOffset, "+ 613 "and offsets must not go backwards "+ 614 "startOffset=%v,endOffset=%v,lastStartOffset=%v for field '%v'", 615 startOffset, endOffset, f.invertState.lastStartOffset, field.Name) 616 f.invertState.lastStartOffset = startOffset 617 } 618 619 // fmt.Printf(" term=%v\n", f.invertState.termAttribute) 620 621 // if we hit an error in here, we abort all buffered documents 622 // since the last flush, on the likelihood that the internal 623 // state of the terms hash is now corrupt and should not be 624 // flushed to a new segment: 625 aborting = true 626 if err = f.termsHashPerField.add(); err != nil { 627 return err 628 } 629 aborting = false 630 631 f.invertState.length++ 632 } 633 634 // trigger streams to perform end-of-stream operations 635 if err = stream.End(); err != nil { 636 return err 637 } 638 639 // TODO: maybe add some safety? then again, it's already checked 640 // when we come back arond to the field... 641 f.invertState.position += f.invertState.posIncrAttribute.PositionIncrement() 642 f.invertState.offset += f.invertState.offsetAttribute.EndOffset() 643 644 // if there is an error coming through, we don't set this to true here: 645 succeededInProcessingField = true 646 return nil 647 }(); err != nil { 648 return err 649 } 650 651 if analyzed { 652 f.invertState.position += f.docState.analyzer.PositionIncrementGap(f.fieldInfo.Name) 653 f.invertState.offset += f.docState.analyzer.OffsetGap(f.fieldInfo.Name) 654 } 655 656 f.invertState.boost *= field.Boost() 657 return nil 658 }